pax_global_header 0000666 0000000 0000000 00000000064 15162412645 0014520 g ustar 00root root 0000000 0000000 52 comment=41c726586cfef55de0c8474923d2734f1529d223
qlustered-deepdiff-41c7265/ 0000775 0000000 0000000 00000000000 15162412645 0015551 5 ustar 00root root 0000000 0000000 qlustered-deepdiff-41c7265/.bumpversion.cfg 0000664 0000000 0000000 00000000451 15162412645 0020661 0 ustar 00root root 0000000 0000000 [bumpversion]
current_version = 9.0.0
commit = False
tag = False
tag_name = {new_version}
[bumpversion:file:pyproject.toml]
[bumpversion:file:README.md]
[bumpversion:file:CITATION.cff]
[bumpversion:file:docs/index.rst]
[bumpversion:file:docs/conf.py]
[bumpversion:file:deepdiff/__init__.py]
qlustered-deepdiff-41c7265/.coveragerc 0000664 0000000 0000000 00000000121 15162412645 0017664 0 ustar 00root root 0000000 0000000 [report]
omit =
*/python?.?/*
*/site-packages/nose/*
*__init__*
qlustered-deepdiff-41c7265/.direnvrc.example 0000664 0000000 0000000 00000000632 15162412645 0021021 0 ustar 00root root 0000000 0000000 function load_venv () {
# Body of load_venv: activate the virtualenv whose directory name is given as
# the first argument ($1), looked up under $HOME/.venvs.
# Note: ACTUAL_VENV_PATH is not declared `local`, so it remains set in the
# calling shell after the function returns.
ACTUAL_VENV_PATH="$HOME/.venvs/$1"
# Activate only when both the venv directory and its activate script exist.
if [ -d "$ACTUAL_VENV_PATH" ] && [ -f "$ACTUAL_VENV_PATH/bin/activate" ]; then
echo "direnv: Activating $ACTUAL_VENV_PATH..."
source "$ACTUAL_VENV_PATH/bin/activate"
# Export the same path as UV_PROJECT_ENVIRONMENT
# (presumably so that uv operates on this venv -- confirm against the uv docs).
export UV_PROJECT_ENVIRONMENT="$ACTUAL_VENV_PATH"
else
# Nothing is activated in this branch; only a diagnostic message is printed.
echo "direnv: Virtual environment at $ACTUAL_VENV_PATH not found or is incomplete."
fi
}
qlustered-deepdiff-41c7265/.envrc.example 0000664 0000000 0000000 00000000017 15162412645 0020317 0 ustar 00root root 0000000 0000000 load_venv deep
qlustered-deepdiff-41c7265/.github/ 0000775 0000000 0000000 00000000000 15162412645 0017111 5 ustar 00root root 0000000 0000000 qlustered-deepdiff-41c7265/.github/FUNDING.yml 0000664 0000000 0000000 00000000043 15162412645 0020723 0 ustar 00root root 0000000 0000000 github: [seperman]
ko_fi: seperman
qlustered-deepdiff-41c7265/.github/ISSUE_TEMPLATE/ 0000775 0000000 0000000 00000000000 15162412645 0021274 5 ustar 00root root 0000000 0000000 qlustered-deepdiff-41c7265/.github/ISSUE_TEMPLATE/bug_report.md 0000664 0000000 0000000 00000001342 15162412645 0023766 0 ustar 00root root 0000000 0000000 ---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''
---
Please check out the [F.A.Q](https://zepworks.com/deepdiff/current/faq.html) page before creating a bug ticket to make sure it is not already addressed.
**Describe the bug**
A clear and concise description of what the bug is.
**To Reproduce**
Steps to reproduce the behavior
**Expected behavior**
A clear and concise description of what you expected to happen.
**OS, DeepDiff version and Python version (please complete the following information):**
- OS: [e.g. Ubuntu]
- Version [e.g. 20LTS]
- Python Version [e.g. 3.10.12]
- DeepDiff Version [e.g. 5.8.0]
**Additional context**
Add any other context about the problem here.
qlustered-deepdiff-41c7265/.github/ISSUE_TEMPLATE/feature_request.md 0000664 0000000 0000000 00000001123 15162412645 0025016 0 ustar 00root root 0000000 0000000 ---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''
---
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
**Additional context**
Add any other context or screenshots about the feature request here.
qlustered-deepdiff-41c7265/.github/workflows/ 0000775 0000000 0000000 00000000000 15162412645 0021146 5 ustar 00root root 0000000 0000000 qlustered-deepdiff-41c7265/.github/workflows/main.yaml 0000664 0000000 0000000 00000003300 15162412645 0022752 0 ustar 00root root 0000000 0000000 name: CI
# Triggers: pushes and pull requests, each filtered to the master and dev branches.
on:
push: { branches: [master, dev] }
pull_request: { branches: [master, dev] }
jobs:
build:
runs-on: ubuntu-latest
env:
# NOTE(review): DEFAULT_PYTHON is not referenced by any step in this workflow;
# the step conditions below compare against the literal '3.14' instead.
DEFAULT_PYTHON: '3.14'
strategy:
matrix:
# The job is expanded once per Python version listed here.
python-version: ['3.10','3.11','3.12','3.13','3.14']
architecture: ['x64']
steps:
- uses: actions/checkout@v3
# Install the matrix Python version, with pip caching keyed on pyproject.toml.
- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: ${{ matrix.architecture }}
cache: pip
cache-dependency-path: pyproject.toml
# nox is pinned to an exact version; every later step invokes it.
- name: Install nox
run: pip install nox==2025.5.1
# Lint runs only in the 3.14 job. The first flake8 call is restricted to the
# E9,F63,F7,F82 checks; the second passes --exit-zero, so it reports
# statistics without failing the step.
- name: Lint with flake8
if: ${{ matrix.python-version == '3.14' }}
run: |
nox -s flake8 -- deepdiff --count --select=E9,F63,F7,F82 --show-source --statistics
nox -s flake8 -- deepdiff --count --exit-zero --max-complexity=26 --max-line-length=250 --statistics
# Every version other than 3.14 runs the tests without coverage collection.
- name: Test with pytest (no coverage)
if: ${{ matrix.python-version != '3.14' }}
run: |
nox -s pytest-${{ matrix.python-version }} -- --benchmark-disable tests/
# The 3.14 job additionally writes an XML coverage report for the deepdiff
# package and passes --runslow.
- name: Test with pytest (+ coverage)
if: ${{ matrix.python-version == '3.14' }}
run: |
nox -s pytest-${{ matrix.python-version }} -- \
--benchmark-disable \
--cov-report=xml \
--cov=deepdiff \
tests/ --runslow
# Upload coverage.xml from the 3.14 job only.
# NOTE(review): env_vars names OS and PYTHON, but this workflow does not define
# environment variables with those names -- confirm this is intended.
- name: Upload coverage
if: ${{ matrix.python-version == '3.14' }}
uses: codecov/codecov-action@v4
with:
token: ${{ secrets.CODECOV_TOKEN }}
file: coverage.xml
env_vars: OS,PYTHON
fail_ci_if_error: true
qlustered-deepdiff-41c7265/.gitignore 0000664 0000000 0000000 00000001540 15162412645 0017541 0 ustar 00root root 0000000 0000000 # Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
.pytest_cache/
# C extensions
*.so
# Distribution / packaging
.Python
env/
.venv
build/
develop-eggs/
dist/
downloads/
eggs/
lib/
lib64/
parts/
sdist/
var/
no_upload/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml
# Translations
*.mo
*.pot
# Django stuff:
*.log
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# OS-specific spam
.DS_Store
# Editor / IDE files
*.swp
.idea/
.~lock*
.python-version*
temp*
# env file
.env
pyrightconfig.json
# direnv file
.envrc
qlustered-deepdiff-41c7265/AGENTS.md 0000777 0000000 0000000 00000000000 15162412645 0020326 2CLAUDE.md ustar 00root root 0000000 0000000 qlustered-deepdiff-41c7265/AUTHORS.md 0000664 0000000 0000000 00000020160 15162412645 0017217 0 ustar 00root root 0000000 0000000 # Authors
Authors in order of the timeline of their contributions:
- [Sep Dehpour (Seperman)](http://www.zepworks.com)
- [Victor Hahn Castell](http://hahncastell.de) for the tree view and major contributions:
- [nfvs](https://github.com/nfvs) for Travis-CI setup script.
- [brbsix](https://github.com/brbsix) for initial Py3 porting.
- [WangFenjin](https://github.com/WangFenjin) for unicode support.
- [timoilya](https://github.com/timoilya) for comparing list of sets when ignoring order.
- [Bernhard10](https://github.com/Bernhard10) for significant digits comparison.
- [b-jazz](https://github.com/b-jazz) for PEP257 cleanup, Standardize on full names, fixing line endings.
- [finnhughes](https://github.com/finnhughes) for fixing __slots__
- [moloney](https://github.com/moloney) for Unicode vs. Bytes default
- [serv-inc](https://github.com/serv-inc) for adding help(deepdiff)
- [movermeyer](https://github.com/movermeyer) for updating docs
- [maxrothman](https://github.com/maxrothman) for search in inherited class attributes
- [maxrothman](https://github.com/maxrothman) for search for types/objects
- [MartyHub](https://github.com/MartyHub) for exclude regex paths
- [sreecodeslayer](https://github.com/sreecodeslayer) for DeepSearch match_string
- Brian Maissy [brianmaissy](https://github.com/brianmaissy) for weakref fix, enum tests
- Bartosz Borowik [boba-2](https://github.com/boba-2) for Exclude types fix when ignoring order
- Brian Maissy [brianmaissy](https://github.com/brianmaissy) for fixing classes which inherit from classes with slots didn't have all of their slots compared
- Juan Soler [Soleronline](https://github.com/Soleronline) for adding ignore_type_number
- [mthaddon](https://github.com/mthaddon) for adding timedelta diffing support
- [Necrophagos](https://github.com/Necrophagos) for Hashing of the number 1 vs. True
- [gaal-dev](https://github.com/gaal-dev) for adding exclude_obj_callback
- Ivan Piskunov [van-ess0](https://github.com/van-ess0) for deprecation warning enhancement.
- Michał Karaś [MKaras93](https://github.com/MKaras93) for the pretty view
- Christian Kothe [chkothe](https://github.com/chkothe) for the basic support for diffing numpy arrays
- [Timothy](https://github.com/timson) for truncate_datetime
- [d0b3rm4n](https://github.com/d0b3rm4n) for bugfix to not apply format to non numbers.
- [MyrikLD](https://github.com/MyrikLD) for Bug Fix NoneType in ignore type groups
- Stian Jensen [stianjensen](https://github.com/stianjensen) for improving ignoring of NoneType in diff
- Florian Klien [flowolf](https://github.com/flowolf) for adding math_epsilon
- Tim Klein [timjklein36](https://github.com/timjklein36) for retaining the order of multiple dictionary items added via Delta.
- Wilhelm Schürmann [wbsch](https://github.com/wbsch) for fixing the typo with yml files.
- [lyz-code](https://github.com/lyz-code) for adding support for regular expressions in DeepSearch and strict_checking feature in DeepSearch.
- [dtorres-sf](https://github.com/dtorres-sf) for adding the option for custom compare function
- Tony Wang [Tony-Wang](https://github.com/Tony-Wang) for bugfix: verbose_level==0 should disable values_changes.
- Sun Ao [eggachecat](https://github.com/eggachecat) for adding custom operators.
- Sun Ao [eggachecat](https://github.com/eggachecat) for adding ignore_order_func.
- [SlavaSkvortsov](https://github.com/SlavaSkvortsov) for fixing unprocessed key error.
- Håvard Thom [havardthom](https://github.com/havardthom) for adding UUID support.
- Dhanvantari Tilak [Dhanvantari](https://github.com/Dhanvantari) for Bug-Fix: `TypeError in _get_numbers_distance() when ignore_order = True`.
- Yael Mintz [yaelmi3](https://github.com/yaelmi3) for detailed pretty print when verbose_level=2.
- Mikhail Khviyuzov [mskhviyu](https://github.com/mskhviyu) for Exclude obj callback strict.
- [dtorres-sf](https://github.com/dtorres-sf) for the fix for diffing using iterable_compare_func with nested objects.
- [Enric Pou](https://github.com/epou) for bug fix of ValueError when using Decimal 0.x
- [Uwe Fladrich](https://github.com/uwefladrich) for fixing bug when diff'ing non-sequence iterables
- [Michal Ozery-Flato](https://github.com/michalozeryflato) for setting equal_nan=ignore_nan_inequality in the call for np.array_equal
- [martin-kokos](https://github.com/martin-kokos) for using Pytest's tmp_path fixture instead of /tmp/
- Håvard Thom [havardthom](https://github.com/havardthom) for adding include_obj_callback and include_obj_callback_strict.
- [Noam Gottlieb](https://github.com/noamgot) for fixing a corner case where numpy's `np.float32` nans are not ignored when using `ignore_nan_equality`.
- [maggelus](https://github.com/maggelus) for the bugfix deephash for paths.
- [maggelus](https://github.com/maggelus) for the bugfix deephash compiled regex.
- [martin-kokos](https://github.com/martin-kokos) for fixing the tests dependent on toml.
- [kor4ik](https://github.com/kor4ik) for the bugfix for `include_paths` for nested dictionaries.
- [martin-kokos](https://github.com/martin-kokos) for using tomli and tomli-w for dealing with toml files.
- [Alex Sauer-Budge](https://github.com/amsb) for the bugfix for `datetime.date`.
- [William Jamieson](https://github.com/WilliamJamieson) for [NumPy 2.0 compatibility](https://github.com/seperman/deepdiff/pull/422)
- [Leo Sin](https://github.com/leoslf) for Supporting Python 3.12 in the build process
- [sf-tcalhoun](https://github.com/sf-tcalhoun) for fixing "Instantiating a Delta with a flat_dict_list unexpectedly mutates the flat_dict_list"
- [dtorres-sf](https://github.com/dtorres-sf) for fixing iterable moved items when iterable_compare_func is used.
- [Florian Finkernagel](https://github.com/TyberiusPrime) for pandas and polars support.
- Mathis Chenuet [artemisart](https://github.com/artemisart) for fixing slots classes comparison and PR review.
- Sherjeel Shabih [sherjeelshabih](https://github.com/sherjeelshabih) for fixing the issue where the key deep_distance is not returned when both compared items are equal #510
- [Aaron D. Marasco](https://github.com/AaronDMarasco) for adding `prefix` option to `pretty()`
- [Juergen Skrotzky](https://github.com/Jorgen-VikingGod) for adding empty `py.typed`
- [Mate Valko](https://github.com/vmatt) for fixing the issue so we lower only if clean_key is instance of str via #504
- [jlaba](https://github.com/jlaba) for fixing #493 include_paths, when only certain keys are included via #499
- [Doron Behar](https://github.com/doronbehar) for fixing DeepHash for numpy booleans via #496
- [Aaron D. Marasco](https://github.com/AaronDMarasco) for adding print() options which allows a user-defined string (or callback function) to prefix every output when using the pretty() call.
- [David Hotham](https://github.com/dimbleby) for relaxing orderly-set dependency via #486
- [dtorres-sf](https://github.com/dtorres-sf) for the fix for moving nested tables when using iterable_compare_func.
- [Jim Cipar](https://github.com/jcipar) for the fix recursion depth limit when hashing numpy.datetime64
- [Enji Cooper](https://github.com/ngie-eign) for converting legacy setuptools use to pyproject.toml
- [Diogo Correia](https://github.com/diogotcorreia) for reporting security vulnerability in Delta and DeepDiff that could allow remote code execution.
- [am-periphery](https://github.com/am-periphery) for reporting CVE-2026-33155: denial-of-service via crafted pickle payloads triggering massive memory allocation.
- [echan5](https://github.com/echan5) for adding callable `group_by` support.
- [yannrouillard](https://github.com/yannrouillard) for fixing colored view display when all list items are removed.
- [tpvasconcelos](https://github.com/tpvasconcelos) for fixing `__slots__` handling for objects with `__getattr__`.
- [devin13cox](https://github.com/devin13cox) for always using t1 path for reporting.
- [vitalis89](https://github.com/vitalis89) for fixing `ignore_keys` issue in `detailed__dict__`.
- [ljames8](https://github.com/ljames8) for fixing logarithmic similarity type hint.
- [srini047](https://github.com/srini047) for fixing README typo.
- [Nagato-Yuzuru](https://github.com/Nagato-Yuzuru) for colored view tests.
- [akshat62](https://github.com/akshat62) for adding Fraction numeric support.
qlustered-deepdiff-41c7265/CHANGELOG.md 0000664 0000000 0000000 00000042265 15162412645 0017373 0 ustar 00root root 0000000 0000000 # DeepDiff Change log
- v9-0-0
- migration note:
- `to_dict()` and `to_json()` now accept a `verbose_level` parameter and always return a usable text-view dict. When the original view is `'tree'`, they default to `verbose_level=2` for full detail. The old `view_override` parameter is removed. To get the previous results, you will need to pass the explicit verbose_level to `to_json` and `to_dict` if you are using the tree view.
- Dropping support for Python 3.9
- Support for python 3.14
- Added support for callable `group_by` thanks to [echan5](https://github.com/echan5)
- Added `FlatDeltaDict` TypedDict for `to_flat_dicts` return type
- Fixed colored view display when all list items are removed thanks to [yannrouillard](https://github.com/yannrouillard)
- Fixed `hasattr()` swallowing `AttributeError` in `__slots__` handling for objects with `__getattr__` thanks to [tpvasconcelos](https://github.com/tpvasconcelos)
- Fixed `ignore_order=True` missing int-vs-float type changes
- Always use t1 path for reporting thanks to [devin13cox](https://github.com/devin13cox)
- Fixed `_convert_oversized_ints` failing on NamedTuples
- Fixed orjson `TypeError` for integers exceeding 64-bit range
- Fixed parameter bug in `to_flat_dicts` where `include_action_in_path` and `report_type_changes` were not being passed through
- Fixed `ignore_keys` issue in `detailed__dict__` thanks to [vitalis89](https://github.com/vitalis89)
- Fixed logarithmic similarity type hint thanks to [ljames8](https://github.com/ljames8)
- Added `Fraction` numeric support thanks to [akshat62](https://github.com/akshat62)
- v8-6-2
- Security fix (CVE-2026-33155): Prevent denial-of-service via crafted pickle payloads that trigger massive memory allocation through the REDUCE opcode. Size-sensitive callables like `bytes()` and `bytearray()` are now wrapped to reject allocations exceeding 128 MB.
- v8-6-1
- Patched security vulnerability in the Delta class which was vulnerable to class pollution via its constructor, and when combined with a gadget available in DeepDiff itself, it could lead to Denial of Service and Remote Code Execution (via insecure Pickle deserialization).
- v8-6-0
- Added Colored View thanks to @mauvilsa
- Added support for applying deltas to NamedTuple thanks to @paulsc
- Fixed test_delta.py with Python 3.14 thanks to @Romain-Geissler-1A
- Added python property serialization to json
- Added ip address serialization
- Switched to UV from pip
- Added Claude.md
- Added uuid hashing thanks to @akshat62
- Added `ignore_uuid_types` flag to DeepDiff to avoid type reports when comparing UUID and string.
- Added comprehensive type hints across the codebase (multiple commits for better type safety)
- Added support for memoryview serialization
- Added support for bytes serialization (non-UTF8 compatible)
- Fixed bug where group_by with numbers would leak type info into group path reports
- Fixed bug in `_get_clean_to_keys_mapping` without explicit significant digits
- Added support for python dict key serialization
- Enhanced support for IP address serialization with safe module imports
- Added development tooling improvements (pyright config, .envrc example)
- Updated documentation and development instructions
- v8-5-0
- Updating deprecated pydantic calls
- Switching to pyproject.toml
- Fix for moving nested tables when using iterable_compare_func.
- Fix recursion depth limit when hashing numpy.datetime64
- Moving from legacy setuptools use to pyproject.toml
- v8-4-2
- fixes the type hints for the base
- fixes summarize so if json dumps fails, we can still get a repr of the results
- adds ipaddress support
- v8-4-1
- Adding BaseOperatorPlus base class for custom operators
- default_timezone can be passed now to set your default timezone to something other than UTC.
- New summarization algorithm that produces valid json
- Better type hint support
- Breaking change in DeepHash where we raise Exception instead of logging if we can't hash a value.
- Added the log_stacktrace parameter to DeepDiff. When True, it will log the stacktrace along with the error.
- v8-3-0
- Fixed some static typing issues
- Added the summarize module for better repr of nested values
- v8-2-0
- Small optimizations so we don't load functions that are not needed
- Updated the minimum version of Orderly-set
- Normalize all datetimes into UTC. Assume timezone naive datetimes are UTC.
- v8-1-0
- Removing deprecated lines from setup.py
- Added `prefix` option to `pretty()`
- Fixes hashing of numpy boolean values.
- Fixes __slots__ comparison when the attribute doesn't exist.
- Relaxing orderly-set reqs
- Added Python 3.13 support
- Only lower if clean_key is instance of str #504
- Fixes issue where the key deep_distance is not returned when both compared items are equal #510
- Fixes exclude_paths fails to work in certain cases
- exclude_paths fails to work #509
- Fixes to_json() method chokes on standard json.dumps() kwargs such as sort_keys
- to_dict() method chokes on standard json.dumps() kwargs #490
- Fixes accessing the affected_root_keys property on the diff object returned by DeepDiff fails when one of the dicts is empty
- Fixes accessing the affected_root_keys property on the diff object returned by DeepDiff fails when one of the dicts is empty #508
- v8-0-1
- Bugfix. Numpy should be optional.
- v8-0-0
- With the introduction of `threshold_to_diff_deeper`, the values returned are different than in previous versions of DeepDiff. You can still get the older values by setting `threshold_to_diff_deeper=0`. However to signify that enough has changed in this release that the users need to update the parameters passed to DeepDiff, we will be doing a major version update.
- [x] `use_enum_value=True` makes it so when diffing enum, we use the enum's value. It makes it so comparing an enum to a string or any other value is not reported as a type change.
- [x] `threshold_to_diff_deeper=float` is a number between 0 and 1. When comparing dictionaries that have a small intersection of keys, we will report the dictionary as a `new_value` instead of reporting individual keys changed. If you set it to zero, you get the same results as DeepDiff 7.0.1 and earlier, which means this feature is disabled. The new default is 0.33 which means if less than one third of keys between dictionaries intersect, report it as a new object.
- [x] Deprecated `ordered-set` and switched to `orderly-set`. The `ordered-set` package was not being maintained anymore and starting Python 3.6, there were better options for sets that are ordered. I forked one of the new implementations, modified it, and published it as `orderly-set`.
- [x] Added `use_log_scale:bool` and `log_scale_similarity_threshold:float`. They can be used to ignore small changes in numbers by comparing their differences in logarithmic space. This is different than ignoring the difference based on significant digits.
- [x] json serialization of reversed lists.
- [x] Fix for iterable moved items when `iterable_compare_func` is used.
- [x] Pandas and Polars support
- v7-0-1
- Fixes the translation between Difflib opcodes and Delta flat rows.
- v7-0-0
- When verbose=2, return `new_path` when the `path` and `new_path` are different (for example when ignore_order=True and the index of items have changed).
- Dropping support for Python 3.7
- Introducing serialize to flat rows for delta objects.
- fixes the issue with hashing `datetime.date` objects where it treated them as numbers instead of dates (fixes #445).
- upgrading orjson to the latest version
- Fix for bug when diffing two lists with ignore_order and providing compare_func
- Fixes "Wrong diff on list of strings" #438
- Supporting Python 3.12 in the build process by [Leo Sin](https://github.com/leoslf)
- Fixes "Instantiating a Delta with a flat_dict_list unexpectedly mutates the flat_dict_list" #457 by [sf-tcalhoun](https://github.com/sf-tcalhoun)
- Fixes "Error on Delta With None Key and Removed Item from List" #441
- Fixes "Error when comparing two nested dicts with 2 added fields" #450
- Fixes "Error when subtracting Delta from a dictionary" #443
- v6-7-1
- Support for subtracting delta objects when iterable_compare_func is used.
- Better handling of force adding a delta to an object.
- Fix for [`Can't compare dicts with both single and double quotes in keys`](https://github.com/seperman/deepdiff/issues/430)
- Updated docs for Inconsistent Behavior with math_epsilon and ignore_order = True
- v6-7-0
- Delta can be subtracted from other objects now.
- verify_symmetry is deprecated. Use bidirectional instead.
- always_include_values flag in Delta can be enabled to include values in the delta for every change.
- Fix for Delta.__add__ breaks with esoteric dict keys.
- You can load a delta from the list of flat dictionaries.
- v6-6-1
- Fix for [DeepDiff raises decimal exception when using significant digits](https://github.com/seperman/deepdiff/issues/426)
- Introducing group_by_sort_key
- Adding group_by 2D. For example `group_by=['last_name', 'zip_code']`
- v6-6-0
- Numpy 2.0 support
- Adding [Delta.to_flat_dicts](https://zepworks.com/deepdiff/current/serialization.html#delta-serialize-to-flat-dictionaries)
- v6-5-0
- Adding [`parse_path`](https://github.com/seperman/deepdiff/pull/419)
- v6-4-1
- Bugfix: Keep Numpy Optional
- v6-4-0
- [Add Ignore List Order Option to DeepHash](https://github.com/seperman/deepdiff/pull/403) by
[Bobby Morck](https://github.com/bmorck)
- [pyyaml to 6.0.1 to fix cython build problems](https://github.com/seperman/deepdiff/pull/406) by [Robert Bo Davis](https://github.com/robert-bo-davis)
- [Precompiled regex simple diff](https://github.com/seperman/deepdiff/pull/413) by [cohml](https://github.com/cohml)
- New flag: `zip_ordered_iterables` for forcing iterable items to be compared one by one.
- v6-3-1
- Bugfix deephash for paths by [maggelus](https://github.com/maggelus)
- Bugfix deephash compiled regex [maggelus](https://github.com/maggelus)
- Fix tests dependent on toml by [martin-kokos](https://github.com/martin-kokos)
- Bugfix for `include_paths` for nested dictionaries by [kor4ik](https://github.com/kor4ik)
- Use tomli and tomli-w for dealing with toml files by [martin-kokos](https://github.com/martin-kokos)
- Bugfix for `datetime.date` by [Alex Sauer-Budge](https://github.com/amsb)
- v6-3-0
- `PrefixOrSuffixOperator`: This operator will skip strings that are suffix or prefix of each other.
- `include_obj_callback` and `include_obj_callback_strict` are added by [Håvard Thom](https://github.com/havardthom).
- Fixed a corner case where numpy's `np.float32` nans are not ignored when using `ignore_nan_equality` by [Noam Gottlieb](https://github.com/noamgot)
- `orjson` becomes optional again.
- Fix for `ignore_type_in_groups` with numeric values so it does not report number changes when the number types are different.
- v6-2-3
- Switching to Orjson for serialization to improve the performance.
- Setting `equal_nan=ignore_nan_inequality` in the call for `np.array_equal`
- Using Pytest's tmp_path fixture instead of `/tmp/`
- v6-2-2
- Enum test fix for python 3.11
- Adding support for dateutils rrules
- v6-2-1
- Removed the print statements.
- v6-2-0
- Major improvement in the diff report for lists when items are all hashable and the order of items is important.
- v6-1-0
- DeepDiff.affected_paths can be used to get the list of all paths where a change, addition, or deletion was reported for.
- DeepDiff.affected_root_keys can be used to get the list of all root keys where a change, addition, or deletion was reported for.
- Bugfix: ValueError when using Decimal 0.x #339 by [Enric Pou](https://github.com/epou)
- Serialization of UUID
- v6-0-0
- [Exclude obj callback strict](https://github.com/seperman/deepdiff/pull/320/files) parameter is added to DeepDiff by Mikhail Khviyuzov [mskhviyu](https://github.com/mskhviyu).
- A fix for diffing using `iterable_compare_func` with nested objects by [dtorres-sf](https://github.com/dtorres-sf) who originally contributed this feature.
- v5-7-0:
- https://github.com/seperman/deepdiff/pull/284 Bug-Fix: TypeError in _get_numbers_distance() when ignore_order = True by @Dhanvantari
- https://github.com/seperman/deepdiff/pull/280 Add support for UUIDs by @havardthom
- Major bug in delta when it comes to iterable items added or removed is investigated by @uwefladrich and resolved by @seperman
- v5-6-0: Adding custom operators, and ignore_order_func. Bugfix: verbose_level==0 should disable values_changes. Bugfix: unprocessed key error.
- v5-5-0: adding iterable_compare_func for DeepDiff, adding output_format of list for path() in tree view.
- v5-4-0: adding strict_checking for numbers in DeepSearch.
- v5-3-0: add support for regular expressions in DeepSearch.
- v5-2-3: Retaining the order of multiple dictionary items added via Delta. Fixed the typo with yml files in deep cli. Fixing Grep RecursionError when using a non UTF-8 character. Allowing kwargs to be passed to to_json method.
- v5-2-2: Fixed Delta serialization when None type is present.
- v5-2-0: Removed Murmur3 as the preferred hashing method. Using SHA256 by default now. Added commandline for deepdiff. Added group_by. Added math_epsilon. Improved ignoring of NoneType.
- v5-0-2: Bug Fix NoneType in ignore type groups https://github.com/seperman/deepdiff/issues/207
- v5-0-1: Bug fix to not apply format to non numbers.
- v5-0-0: Introducing the Delta object, Improving Numpy support, Fixing tuples comparison when ignore_order=True, Dramatically improving the results when ignore_order=True by running in passes, Introducing pretty print view, deep_distance, purge, progress logging, cache and truncate_datetime.
- v4-3-3: Adds support for datetime.time
- v4-3-2: Deprecation Warning Enhancement
- v4-3-1: Fixing the issue with exclude_path and hash calculations when dictionaries were inside iterables. https://github.com/seperman/deepdiff/issues/174
- v4-3-0: adding exclude_obj_callback
- v4-2-0: .json property is finally removed. Fix for Py3.10. Dropping support for EOL Python 3.4. Ignoring private keys when calculating hashes. For example __init__ is not a part of hash calculation anymore. Fix for #166 Problem with comparing lists, with a boolean as an element.
- v4-0-9: Fixing the bug for hashing custom unhashable objects
- v4-0-8: Adding ignore_nan_inequality for float('nan')
- v4-0-7: Hashing of the number 1 vs. True
- v4-0-6: found a tiny bug in Python formatting of numbers in scientific notation. Added a workaround.
- v4-0-5: Fixing number diffing. Adding number_format_notation and number_to_string_func.
- v4-0-4: Adding ignore_string_case and ignore_type_subclasses
- v4-0-3: Adding versionbump tool for release
- v4-0-2: Fixing installation issue where rst files are missing.
- v4-0-1: Fixing installation Tarball missing requirements.txt . DeepDiff v4+ should not show up as pip installable for Py2. Making Murmur3 installation optional.
- v4-0-0: Ending Python 2 support, Adding more functionalities and documentation for DeepHash. Switching to Pytest for testing. Switching to Murmur3 128bit for hashing. Fixing classes which inherit from classes with slots didn't have all of their slots compared. Renaming ContentHash to DeepHash. Adding exclude by path and regex path to DeepHash. Adding ignore_type_in_groups. Adding match_string to DeepSearch. Adding Timedelta object diffing.
- v3-5-0: Exclude regex path
- v3-3-0: Searching for objects and class attributes
- v3-2-2: Adding help(deepdiff)
- v3-2-1: Fixing hash of None
- v3-2-0: Adding grep for search: object | grep(item)
- v3-1-3: Unicode vs. Bytes default fix
- v3-1-2: NotPresent Fix when item is added or removed.
- v3-1-1: Bug fix when item value is None (#58)
- v3-1-0: Serialization to/from json
- v3-0-0: Introducing Tree View
- v2-5-3: Bug fix on logging for content hash.
- v2-5-2: Bug fixes on content hash.
- v2-5-0: Adding ContentHash module to fix ignore_order once and for all.
- v2-1-0: Adding Deep Search. Now you can search for item in an object.
- v2-0-0: Exclusion patterns better coverage. Updating docs.
- v1-8-0: Exclusion patterns.
- v1-7-0: Deep Set comparison.
- v1-6-0: Unifying key names. i.e newvalue is new_value now. For backward compatibility, newvalue still works.
- v1-5-0: Fixing ignore order containers with unordered items. Adding significant digits when comparing decimals. Changes property is deprecated.
- v1-1-0: Changing Set, Dictionary and Object Attribute Add/Removal to be reported as Set instead of List. Adding Pypy compatibility.
- v1-0-2: Checking for ImmutableMapping type instead of dict
- v1-0-1: Better ignore order support
- v1-0-0: Restructuring output to make it more useful. This is NOT backward compatible.
- v0-6-1: Fixing iterables with unhashable when order is ignored
- v0-6-0: Adding unicode support
- v0-5-9: Adding decimal support
- v0-5-8: Adding ignore order of unhashables support
- v0-5-7: Adding ignore order support
- v0-5-6: Adding slots support
- v0-5-5: Adding loop detection
qlustered-deepdiff-41c7265/CITATION.cff 0000664 0000000 0000000 00000000433 15162412645 0017443 0 ustar 00root root 0000000 0000000 cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
- family-names: "Dehpour"
given-names: "Sep"
orcid: "https://orcid.org/0009-0009-5828-4345"
title: "DeepDiff"
version: 9.0.0
date-released: 2026
url: "https://github.com/seperman/deepdiff"
qlustered-deepdiff-41c7265/CLAUDE.md 0000664 0000000 0000000 00000007276 15162412645 0017044 0 ustar 00root root 0000000 0000000 # CLAUDE.md
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
## Project Overview
DeepDiff is a Python library for deep comparison, searching, and hashing of Python objects. It provides:
- **DeepDiff**: Deep difference detection between objects
- **DeepSearch**: Search for objects within other objects
- **DeepHash**: Content-based hashing for any object
- **Delta**: Git-like diff objects that can be applied to other objects
- **CLI**: Command-line interface via `deep` command
## Development Commands
### Setup
```bash
# Install with all development dependencies
uv pip install -e ".[cli,coverage,dev,docs,static,test]"
# OR using uv sync (recommended)
uv sync --all-extras
```
**Virtual Environment**: Activate with `source ~/.venvs/deep/bin/activate` before running tests or Python commands
### Testing
```bash
# Run tests with coverage
source ~/.venvs/deep/bin/activate && pytest --cov=deepdiff --cov-report term-missing
# Run tests including slow ones
source ~/.venvs/deep/bin/activate && pytest --cov=deepdiff --runslow
# Run single test file
source ~/.venvs/deep/bin/activate && pytest tests/test_diff_text.py
# Run tests across multiple Python versions. No need to use this unless getting ready for creating a new build
source ~/.venvs/deep/bin/activate && nox -s pytest
```
### **Type Checking with Pyright:**
Always use this pattern for type checking:
```bash
source ~/.venvs/deep/bin/activate && pyright {file_path}
```
Examples:
- `source ~/.venvs/deep/bin/activate && pyright deepdiff/diff.py` - Type check specific file
- `source ~/.venvs/deep/bin/activate && pyright deepdiff/` - Type check entire module
- `source ~/.venvs/deep/bin/activate && pyright .` - Type check entire repo
### Common Pitfalls to Avoid
1. **Forgetting Virtual Environment**: ALWAYS activate venv before ANY Python command:
```bash
source ~/.venvs/deep/bin/activate
```
2. **Running pytest without venv**: This will cause import errors. Always use:
```bash
source ~/.venvs/deep/bin/activate && pytest
```
3. **Running module commands without venv**: Commands like `capi run`, `cettings shell`, etc. all require venv to be activated first
4. **Using `pip` instead of `uv pip`**: This project uses `uv` for package management. Always use `uv pip` instead of `pip`.
### Slow quality checks only to run before creating a build
```bash
# Linting (max line length: 120)
nox -s flake8
# Type checking
nox -s mypy
# Run all quality checks
nox
```
## Architecture
### Core Structure
- **deepdiff/diff.py**: Main DeepDiff implementation (most complex component)
- **deepdiff/deephash.py**: DeepHash functionality
- **deepdiff/base.py**: Shared base classes and functionality
- **deepdiff/model.py**: Core data structures and result objects
- **deepdiff/helper.py**: Utility functions and type definitions
- **deepdiff/delta.py**: Delta objects for applying changes
### Key Patterns
- **Inheritance**: `Base` class provides common functionality with mixins
- **Result Objects**: Different result formats (`ResultDict`, `TreeResult`, `TextResult`)
- **Path Navigation**: Consistent path notation for nested object access
- **Performance**: LRU caching and numpy array optimization
### Testing
- Located in `/tests/` directory
- Organized by functionality (e.g., `test_diff_text.py`, `test_hash.py`)
- Aims for ~100% test coverage
- Uses pytest with comprehensive fixtures
## Development Notes
- **Python Support**: 3.10+ and PyPy3
- **Main Branch**: `master` (PRs typically go to `dev` branch)
- **Build System**: Modern `pyproject.toml` with `flit_core`
- **Dependencies**: Core dependency is `orderly-set>=5.4.1,<6`
- **CLI Tool**: Available as `deep` command after installation with `[cli]` extra
qlustered-deepdiff-41c7265/LICENSE 0000664 0000000 0000000 00000002166 15162412645 0016563 0 ustar 00root root 0000000 0000000 The MIT License (MIT)
Copyright (c) 2014 - 2026 Sep Dehpour (Seperman) and contributors
getqluster.com
zepworks.com
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
qlustered-deepdiff-41c7265/MANIFEST.in 0000664 0000000 0000000 00000000712 15162412645 0017307 0 ustar 00root root 0000000 0000000 include LICENSE
include AUTHORS
include CHANGELOG
include *.rst
include deepdiff/*.rst
include *.txt
include *.sh
include pytest.ini
include *.py
exclude uv.lock
recursive-include docs/ *.rst
recursive-include docs/ *.png
recursive-include tests *.csv
recursive-include tests *.json
recursive-include tests *.pickle
recursive-include tests *.py
recursive-include tests *.toml
recursive-include tests *.yaml
global-exclude __pycache__
global-exclude *.py[co]
qlustered-deepdiff-41c7265/README.md 0000664 0000000 0000000 00000012007 15162412645 0017030 0 ustar 00root root 0000000 0000000 # DeepDiff v 9.0.0



[](https://github.com/seperman/deepdiff/actions)
[](https://codecov.io/gh/seperman/deepdiff)
**DeepDiff is now part of [Qluster](/qluster).**
*If you're building workflows around data validation and correction, [Qluster](/qluster) gives your team a structured way to manage rules, review failures, approve fixes, and reuse decisions—without building the entire system from scratch.*
## Modules
- [DeepDiff](https://zepworks.com/deepdiff/current/diff.html): Deep Difference of dictionaries, iterables, strings, and ANY other object.
- [DeepSearch](https://zepworks.com/deepdiff/current/dsearch.html): Search for objects within other objects.
- [DeepHash](https://zepworks.com/deepdiff/current/deephash.html): Hash any object based on their content.
- [Delta](https://zepworks.com/deepdiff/current/delta.html): Store the difference of objects and apply them to other objects.
- [Extract](https://zepworks.com/deepdiff/current/extract.html): Extract an item from a nested Python object using its path.
- [commandline](https://zepworks.com/deepdiff/current/commandline.html): Use DeepDiff from commandline.
Tested on Python 3.10+ and PyPy3.
- **[Documentation](https://zepworks.com/deepdiff/9.0.0/)**
## What is new?
Please check the [ChangeLog](CHANGELOG.md) file for the detailed information.
DeepDiff 9-0-0
- migration note:
- `to_dict()` and `to_json()` now accept a `verbose_level` parameter and always return a usable text-view dict. When the original view is `'tree'`, they default to `verbose_level=2` for full detail. The old `view_override` parameter is removed. To get the previous results, you will need to pass the explicit verbose_level to `to_json` and `to_dict` if you are using the tree view.
- Dropping support for Python 3.9
- Support for python 3.14
- Added support for callable `group_by` thanks to @echan5
- Added `FlatDeltaDict` TypedDict for `to_flat_dicts` return type
- Fixed colored view display when all list items are removed thanks to @yannrouillard
- Fixed `hasattr()` swallowing `AttributeError` in `__slots__` handling for objects with `__getattr__` thanks to @tpvasconcelos
- Fixed `ignore_order=True` missing int-vs-float type changes
- Always use t1 path for reporting thanks to @devin13cox
- Fixed `_convert_oversized_ints` failing on NamedTuples
- Fixed orjson `TypeError` for integers exceeding 64-bit range
- Fixed parameter bug in `to_flat_dicts` where `include_action_in_path` and `report_type_changes` were not being passed through
- Fixed `ignore_keys` issue in `detailed__dict__` thanks to @vitalis89
- Fixed logarithmic similarity type hint thanks to @ljames8
- Added `Fraction` numeric support thanks to @akshat62
## Installation
### Install from PyPi:
`pip install deepdiff`
If you want to use DeepDiff from commandline:
`pip install "deepdiff[cli]"`
If you want to improve the performance of DeepDiff with certain functionalities such as improved json serialization:
`pip install "deepdiff[optimize]"`
Install optional packages:
- [yaml](https://pypi.org/project/PyYAML/)
- [tomli](https://pypi.org/project/tomli/) (python 3.10 and older) and [tomli-w](https://pypi.org/project/tomli-w/) for writing
- [clevercsv](https://pypi.org/project/clevercsv/) for more robust CSV parsing
- [orjson](https://pypi.org/project/orjson/) for speed and memory optimized parsing
- [pydantic](https://pypi.org/project/pydantic/)
# Documentation
# ChangeLog
Please take a look at the [CHANGELOG](CHANGELOG.md) file.
# Survey
:mega: **Please fill out our [fast 5-question survey](https://forms.gle/E6qXexcgjoKnSzjB8)** so that we can learn how & why you use DeepDiff, and what improvements we should make. Thank you! :dancers:
# Local dev
1. Clone the repo
2. Switch to the dev branch
3. Create your own branch
4. Install dependencies
- Method 1: Use [`uv`](https://github.com/astral-sh/uv) to install the dependencies: `uv sync --all-extras`.
- Method 2: Use pip: `pip install -e ".[cli,coverage,dev,docs,static,test]"`
5. Build `uv build`
# Contribute
1. Please make your PR against the dev branch
2. Please make sure that your PR has tests. Since DeepDiff is used in many sensitive data driven projects, we strive to maintain around 100% test coverage on the code.
Please run `pytest --cov=deepdiff --runslow` to see the coverage report. Note that the `--runslow` flag will run some slow tests too. In most cases you only want to run the fast tests, so you won't add the `--runslow` flag.
Or to see a more user friendly version, please run: `pytest --cov=deepdiff --cov-report term-missing --runslow`.
Thank you!
# Authors
Please take a look at the [AUTHORS](AUTHORS.md) file.
qlustered-deepdiff-41c7265/conftest.py 0000664 0000000 0000000 00000007224 15162412645 0017755 0 ustar 00root root 0000000 0000000 import sys
import os
import json
import pytest
# Put the tests/ directory (next to this file) on sys.path so that modules
# inside it can be imported by name.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'tests')))
# Directory, relative to this file, that the JSON-loading fixtures read from.
FIXTURES_DIR = os.path.join(os.path.dirname(__file__), 'tests/fixtures/')
def pytest_addoption(parser):
    """Register the ``--runslow`` boolean command line flag (off by default)."""
    runslow_settings = dict(action="store_true", default=False, help="run slow tests")
    parser.addoption("--runslow", **runslow_settings)
def pytest_configure(config):
    """Declare the ``slow`` marker by adding a line to the ``markers`` ini setting."""
    slow_marker_line = "slow: mark test as slow to run"
    config.addinivalue_line("markers", slow_marker_line)
def pytest_collection_modifyitems(config, items):
    """Attach a skip marker to every collected item tagged ``slow``.

    Nothing is changed when the ``--runslow`` option is set.
    """
    run_slow_requested = config.getoption("--runslow")
    if not run_slow_requested:
        # --runslow was not given in cli: slow tests get skipped
        skip_marker = pytest.mark.skip(reason="need --runslow option to run")
        for collected in items:
            if "slow" not in collected.keywords:
                continue
            collected.add_marker(skip_marker)
@pytest.fixture(scope='class')
def nested_a_t1():
    """Class-scoped fixture: parsed contents of ``nested_a_t1.json`` in FIXTURES_DIR."""
    fixture_path = os.path.join(FIXTURES_DIR, 'nested_a_t1.json')
    with open(fixture_path) as fixture_file:
        return json.load(fixture_file)
@pytest.fixture(scope='class')
def nested_a_t2():
    """Class-scoped fixture: parsed contents of ``nested_a_t2.json`` in FIXTURES_DIR."""
    fixture_path = os.path.join(FIXTURES_DIR, 'nested_a_t2.json')
    with open(fixture_path) as fixture_file:
        return json.load(fixture_file)
@pytest.fixture(scope='class')
def nested_a_result():
    """Class-scoped fixture: parsed contents of ``nested_a_result.json`` in FIXTURES_DIR."""
    fixture_path = os.path.join(FIXTURES_DIR, 'nested_a_result.json')
    with open(fixture_path) as fixture_file:
        return json.load(fixture_file)
@pytest.fixture(scope='function')
def compounds():
    """Function-scoped fixture: parsed contents of ``compounds.json`` in FIXTURES_DIR."""
    fixture_path = os.path.join(FIXTURES_DIR, 'compounds.json')
    with open(fixture_path) as fixture_file:
        return json.load(fixture_file)
@pytest.fixture(scope='class')
def nested_a_affected_paths():
    """Class-scoped fixture: a fixed set of ``root[...]`` path strings."""
    affected_paths = {
        'root[0][0][2][0][1]', 'root[0][1][1][1][5]', 'root[0][2][1]',
        'root[1][1][2][0][1]', 'root[1][2][0]', 'root[1][2][0][1][5]',
        'root[1][0][2][2][3]', 'root[0][0][1][0][0]', 'root[0][1][0][2][3]',
        'root[0][3][0][2][3]', 'root[0][3][1][0][2]', 'root[1][1][1][0][0]',
        'root[1][0][1][2][1]', 'root[1][0][2][1][2]', 'root[1][3][0][2][3]',
        'root[1][3][1][0][2]', 'root[1][2][0][2]', 'root[1][0][2][0][1]',
        'root[0][3][2][0][1]', 'root[0][3][2][1][0]', 'root[1][3][1][1]',
        'root[1][2][1][1][0]', 'root[1][2][1][0]', 'root[1][0][0][0][2]',
        'root[1][3][2][1][0]', 'root[1][0][0][1][1]', 'root[0][1][2][0]',
        'root[0][1][2][1][0]', 'root[0][2][0][1][2]', 'root[1][3][0][1]',
        'root[0][3][1][1]', 'root[1][2][0][0][2]', 'root[1][3][2][0][1]',
        'root[1][0][1][0]', 'root[1][2][0][0][0]', 'root[1][0][0][0][1]',
        'root[1][3][2][2][2]', 'root[0][1][1][2][1]', 'root[0][1][1][2][2]',
        'root[0][2][0][0][2]', 'root[0][2][0][0][3]', 'root[0][3][1][2][1]',
        'root[0][3][1][2][2]', 'root[1][2][1][2][3]', 'root[1][0][0][1][2]',
        'root[1][0][0][2][1]', 'root[1][3][1][2][1]', 'root[1][3][1][2][2]'
    }
    return affected_paths
@pytest.fixture(scope='class')
def nested_b_t1():
    """Class-scoped fixture: parsed contents of ``nested_b_t1.json`` in FIXTURES_DIR."""
    fixture_path = os.path.join(FIXTURES_DIR, 'nested_b_t1.json')
    with open(fixture_path) as fixture_file:
        return json.load(fixture_file)
@pytest.fixture(scope='class')
def nested_b_t2():
    """Class-scoped fixture: parsed contents of ``nested_b_t2.json`` in FIXTURES_DIR."""
    fixture_path = os.path.join(FIXTURES_DIR, 'nested_b_t2.json')
    with open(fixture_path) as fixture_file:
        return json.load(fixture_file)
@pytest.fixture(scope='class')
def nested_b_result():
    """Class-scoped fixture: parsed contents of ``nested_b_result.json`` in FIXTURES_DIR."""
    fixture_path = os.path.join(FIXTURES_DIR, 'nested_b_result.json')
    with open(fixture_path) as fixture_file:
        return json.load(fixture_file)
@pytest.fixture(scope='class')
def compare_func_t1():
    """Class-scoped fixture: parsed contents of ``compare_func_t1.json`` in FIXTURES_DIR."""
    fixture_path = os.path.join(FIXTURES_DIR, 'compare_func_t1.json')
    with open(fixture_path) as fixture_file:
        return json.load(fixture_file)
@pytest.fixture(scope='class')
def compare_func_t2():
    """Class-scoped fixture: parsed contents of ``compare_func_t2.json`` in FIXTURES_DIR."""
    fixture_path = os.path.join(FIXTURES_DIR, 'compare_func_t2.json')
    with open(fixture_path) as fixture_file:
        return json.load(fixture_file)
@pytest.fixture(scope='class')
def compare_func_result1():
    """Class-scoped fixture: parsed contents of ``compare_func_result1.json`` in FIXTURES_DIR."""
    fixture_path = os.path.join(FIXTURES_DIR, 'compare_func_result1.json')
    with open(fixture_path) as fixture_file:
        return json.load(fixture_file)
qlustered-deepdiff-41c7265/deepdiff/ 0000775 0000000 0000000 00000000000 15162412645 0017317 5 ustar 00root root 0000000 0000000 qlustered-deepdiff-41c7265/deepdiff/__init__.py 0000664 0000000 0000000 00000000736 15162412645 0021436 0 ustar 00root root 0000000 0000000 """This module offers the DeepDiff, DeepSearch, grep, Delta and DeepHash classes."""
# flake8: noqa
__version__ = '9.0.0'
import logging
# Only when this module is executed as the main program: configure root
# logging with a "time level message" line format.
if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s %(levelname)8s %(message)s')
from .diff import DeepDiff as DeepDiff
from .search import DeepSearch as DeepSearch, grep as grep
from .deephash import DeepHash as DeepHash
from .delta import Delta as Delta
from .path import extract as extract, parse_path as parse_path
qlustered-deepdiff-41c7265/deepdiff/anyset.py 0000664 0000000 0000000 00000003673 15162412645 0021205 0 ustar 00root root 0000000 0000000 from deepdiff.deephash import DeepHash
from deepdiff.helper import dict_, SetOrdered
class AnySet:
    """
    Any object can be in this set whether hashable or not.
    Note that the current implementation has memory leak and keeps
    traces of objects in itself even after popping.
    However once the AnySet object is deleted, all those traces will be gone too.

    Hashable items are stored in ``self._set``. Items that raise ``TypeError``
    when added there are stored in ``self._hash_to_objects``, keyed by the
    value DeepHash reports for them. ``self._hashes`` is the hash cache that
    is passed to every DeepHash call.
    """

    def __init__(self, items=None):
        """Create the set, optionally adding every element of ``items``."""
        self._set = SetOrdered()
        self._hashes = dict_()
        self._hash_to_objects = dict_()
        if items:
            for item in items:
                self.add(item)

    def add(self, item):
        """Add ``item``; an item that cannot go into the plain set is keyed by its DeepHash."""
        try:
            self._set.add(item)
        except TypeError:
            hashes_obj = DeepHash(item, hashes=self._hashes)
            hash_ = hashes_obj[item]
            # Keep the first object seen for a given hash.
            if hash_ not in self._hash_to_objects:
                self._hash_to_objects[hash_] = item

    def __contains__(self, item):
        """Membership test, falling back to the DeepHash lookup on ``TypeError``."""
        try:
            result = item in self._set
        except TypeError:
            hashes_obj = DeepHash(item, hashes=self._hashes)
            hash_ = hashes_obj[item]
            result = hash_ in self._hash_to_objects
        return result

    def pop(self):
        """Remove and return an item, draining the plain set before the hash-keyed items.

        When both containers are empty, the ``next(iter(...))`` call below
        raises ``StopIteration``.
        """
        if self._set:
            return self._set.pop()
        else:
            return self._hash_to_objects.pop(next(iter(self._hash_to_objects)))

    def __eq__(self, other):
        """Compare against another AnySet or against a ``(set_part, hashes_to_objs_part)`` pair.

        Any other operand yields ``NotImplemented`` instead of raising from a
        failed unpacking.
        """
        if isinstance(other, AnySet):
            return (self._set == other._set and self._hash_to_objects == other._hash_to_objects)
        try:
            set_part, hashes_to_objs_part = other
        except (TypeError, ValueError):
            # Not iterable, or not exactly two elements: let Python decide.
            return NotImplemented
        return (self._set == set_part and self._hash_to_objects == hashes_to_objs_part)

    # Kept for backward compatibility; this is a plain alias of __eq__.
    __req__ = __eq__

    def __repr__(self):
        return "< AnySet {}, {} >".format(self._set, self._hash_to_objects)

    __str__ = __repr__

    def __len__(self):
        """Total number of stored items across both containers."""
        return len(self._set) + len(self._hash_to_objects)

    def __iter__(self):
        """Yield the plain-set items first, then the hash-keyed items."""
        for item in self._set:
            yield item
        for item in self._hash_to_objects.values():
            yield item

    def __bool__(self):
        return bool(self._set or self._hash_to_objects)
qlustered-deepdiff-41c7265/deepdiff/base.py 0000664 0000000 0000000 00000005265 15162412645 0020613 0 ustar 00root root 0000000 0000000 import uuid
from typing import List, Optional, Union, Tuple, Any, Type
from deepdiff.helper import strings, numbers, SetOrdered
DEFAULT_SIGNIFICANT_DIGITS_WHEN_IGNORE_NUMERIC_TYPES = 12
TYPE_STABILIZATION_MSG = 'Unable to stabilize the Numpy array {} due to {}. Please set ignore_order=False.'
class Base:
    """Shared helpers that normalise the numeric-precision and type-group options."""
    numbers = numbers
    strings = strings

    def get_significant_digits(self, significant_digits: Optional[int], ignore_numeric_type_changes: bool) -> Optional[int]:
        """Validate ``significant_digits`` and fill in a default when it is None.

        Returns the value unchanged when it is a non-negative integer. When it
        is None, returns DEFAULT_SIGNIFICANT_DIGITS_WHEN_IGNORE_NUMERIC_TYPES
        if ``ignore_numeric_type_changes`` is truthy, otherwise None.

        Raises:
            ValueError: if ``significant_digits`` is negative.
        """
        if significant_digits is None:
            if ignore_numeric_type_changes:
                return DEFAULT_SIGNIFICANT_DIGITS_WHEN_IGNORE_NUMERIC_TYPES
            return None
        if significant_digits < 0:
            raise ValueError(
                "significant_digits must be None or a non-negative integer")
        return significant_digits

    def get_ignore_types_in_groups(self,
                                   ignore_type_in_groups: Optional[Union[List[Any], Tuple[Any, ...]]],
                                   ignore_string_type_changes: bool,
                                   ignore_numeric_type_changes: bool,
                                   ignore_type_subclasses: bool,
                                   ignore_uuid_types: bool = False) -> List[Union[SetOrdered, Tuple[Type[Any], ...]]]:
        """Build the list of type groups from the user-supplied options.

        ``ignore_type_in_groups`` may be a single group (its first element is
        a type) or a sequence of groups. Members that are not types are
        replaced by their type. Groups for strings, numbers and UUID/str are
        appended when the matching flag is set and the group is not already
        present. Groups are returned as SetOrdered when
        ``ignore_type_subclasses`` is truthy and as tuples otherwise.
        """
        if not ignore_type_in_groups:
            raw_groups = []
        elif isinstance(ignore_type_in_groups[0], type):
            # A flat collection of types was given: treat it as one group.
            raw_groups = [ignore_type_in_groups]
        else:
            raw_groups = ignore_type_in_groups

        groups = []
        for raw_group in raw_groups:
            normalized = SetOrdered()
            for member in raw_group:
                # Instances (None included) stand in for their own type.
                normalized.add(member if isinstance(member, type) else type(member))
            groups.append(normalized)

        if ignore_string_type_changes and self.strings not in groups:
            groups.append(SetOrdered(self.strings))

        if ignore_numeric_type_changes and self.numbers not in groups:
            groups.append(SetOrdered(self.numbers))

        if ignore_uuid_types:
            # Create a group containing both UUID and str types
            uuid_and_str = SetOrdered([uuid.UUID, str])
            if uuid_and_str not in groups:
                groups.append(uuid_and_str)

        if ignore_type_subclasses:
            return groups
        # is_instance method needs tuples. When we look for subclasses, we need them to be tuples
        return [tuple(group) for group in groups]
qlustered-deepdiff-41c7265/deepdiff/colored_view.py 0000664 0000000 0000000 00000013575 15162412645 0022365 0 ustar 00root root 0000000 0000000 import json
import os
from ast import literal_eval
from importlib.util import find_spec
from typing import Any, Dict
from deepdiff.model import TextResult, TreeResult
if os.name == "nt" and find_spec("colorama"):
import colorama
colorama.init()
# ANSI color codes
RED = '\033[31m'
GREEN = '\033[32m'
RESET = '\033[0m'
class ColoredView:
    """A view that shows JSON with color-coded differences.

    ``str(view)`` renders ``t2`` as pretty-printed JSON-like text. Changed
    values are shown as red old value -> green new value, added items in
    green and removed items in red. With ``compact`` set, dicts and lists
    that have no differences under them are collapsed to ``{...}`` / ``[...]``.
    """

    def __init__(self, t2: Any, tree_result: TreeResult, compact: bool = False):
        self.t2 = t2
        self.tree = tree_result
        self.compact = compact
        # Paths of removed list items that _colorize_json must not treat as
        # diffs of the t2 item at the same path. Created here so that
        # _colorize_json also works when called directly; __str__ resets it
        # before every rendering.
        self._colorize_skip_paths = set()
        self.diff_paths = self._collect_diff_paths()

    def _collect_diff_paths(self) -> Dict[str, tuple]:
        """Collect all paths that have differences and their types.

        Returns a mapping of path -> ``(kind, old_value, new_value)`` where
        ``kind`` is ``"changed"``, ``"added"`` or ``"removed"``.
        """
        text_result = TextResult(tree_results=self.tree, verbose_level=2)
        diff_paths = {}
        for diff_type, items in text_result.items():
            if not items:
                continue
            try:
                iter(items)
            except TypeError:
                # Non-iterable report entries carry no per-path items.
                continue
            for path, item in items.items():
                if diff_type in ("values_changed", "type_changes"):
                    # Prefer the reported new path when there is one.
                    changed_path = item.get("new_path") or path
                    diff_paths[changed_path] = ("changed", item["old_value"], item["new_value"])
                elif diff_type in ("dictionary_item_added", "iterable_item_added", "set_item_added"):
                    diff_paths[path] = ("added", None, item)
                elif diff_type in ("dictionary_item_removed", "iterable_item_removed", "set_item_removed"):
                    diff_paths[path] = ("removed", item, None)
        return diff_paths

    def _format_value(self, value: Any) -> str:
        """Format a value for display."""
        # bool is tested before the generic fallback so it prints as true/false.
        if isinstance(value, bool):
            return 'true' if value else 'false'
        elif isinstance(value, str):
            return f'"{value}"'
        elif isinstance(value, (dict, list, tuple)):
            return json.dumps(value)
        else:
            return str(value)

    def _get_path_removed(self, path: str) -> dict:
        """Get all removed items for a given path.

        Only direct children of ``path`` are returned, keyed by the evaluated
        key or index taken from the child's path.
        """
        removed = {}
        for key, value in self.diff_paths.items():
            if value[0] == 'removed' and key.startswith(path + "["):
                key_suffix = key[len(path):]
                # Exactly one bracket pair means a direct child of ``path``.
                if key_suffix.count("[") == 1 and key_suffix.endswith("]"):
                    removed[literal_eval(key_suffix[1:-1])] = value[1]
        return removed

    def _has_differences(self, path_prefix: str) -> bool:
        """Check if a path prefix has any differences under it."""
        return any(diff_path.startswith(path_prefix + "[") for diff_path in self.diff_paths)

    def _colorize_json(self, obj: Any, path: str = 'root', indent: int = 0) -> str:
        """Recursively colorize JSON based on differences, with pretty-printing."""
        INDENT = ' '
        current_indent = INDENT * indent
        next_indent = INDENT * (indent + 1)

        if path in self.diff_paths and path not in self._colorize_skip_paths:
            diff_type, old, new = self.diff_paths[path]
            if diff_type == 'changed':
                return f"{RED}{self._format_value(old)}{RESET} -> {GREEN}{self._format_value(new)}{RESET}"
            elif diff_type == 'added':
                return f"{GREEN}{self._format_value(new)}{RESET}"
            elif diff_type == 'removed':
                return f"{RED}{self._format_value(old)}{RESET}"

        if isinstance(obj, (dict, list)) and self.compact and not self._has_differences(path):
            return '{...}' if isinstance(obj, dict) else '[...]'

        if isinstance(obj, dict):
            if not obj:
                return '{}'
            items = []
            for key, value in obj.items():
                new_path = f"{path}['{key}']" if isinstance(key, str) else f"{path}[{key}]"
                if new_path in self.diff_paths and self.diff_paths[new_path][0] == 'added':
                    # Colorize both key and value for added fields
                    items.append(f'{next_indent}{GREEN}"{key}": {self._colorize_json(value, new_path, indent + 1)}{RESET}')
                else:
                    items.append(f'{next_indent}"{key}": {self._colorize_json(value, new_path, indent + 1)}')
            # Removed keys are absent from obj; list them after the existing ones.
            for key, value in self._get_path_removed(path).items():
                new_path = f"{path}['{key}']" if isinstance(key, str) else f"{path}[{key}]"
                items.append(f'{next_indent}{RED}"{key}": {self._colorize_json(value, new_path, indent + 1)}{RESET}')
            return '{\n' + ',\n'.join(items) + f'\n{current_indent}' + '}'

        elif isinstance(obj, (list, tuple)):
            removed_map = self._get_path_removed(path)
            if not obj and not removed_map:
                return '[]'

            # The removed paths must not be matched against the items of obj
            # that now sit at the same index.
            for index in removed_map:
                self._colorize_skip_paths.add(f"{path}[{index}]")

            items = []
            remove_index = 0
            for index, value in enumerate(obj):
                # Emit removed items whose index equals the running position
                # before the current item.
                while remove_index == next(iter(removed_map), None):
                    items.append(f'{next_indent}{RED}{self._format_value(removed_map.pop(remove_index))}{RESET}')
                    remove_index += 1
                items.append(f'{next_indent}{self._colorize_json(value, f"{path}[{index}]", indent + 1)}')
                remove_index += 1
            # Whatever is still in removed_map goes at the end.
            for value in removed_map.values():
                items.append(f'{next_indent}{RED}{self._format_value(value)}{RESET}')
            return '[\n' + ',\n'.join(items) + f'\n{current_indent}' + ']'
        else:
            return self._format_value(obj)

    def __str__(self) -> str:
        """Return the colorized, pretty-printed JSON string."""
        self._colorize_skip_paths = set()
        return self._colorize_json(self.t2)

    def __iter__(self):
        """Make the view iterable by yielding the tree results."""
        yield from self.tree.items()
qlustered-deepdiff-41c7265/deepdiff/commands.py 0000664 0000000 0000000 00000024223 15162412645 0021475 0 ustar 00root root 0000000 0000000 import click
import sys
from decimal import Decimal
from pprint import pprint
from deepdiff.diff import (
DeepDiff,
CUTOFF_DISTANCE_FOR_PAIRS_DEFAULT,
CUTOFF_INTERSECTION_FOR_PAIRS_DEFAULT,
logger
)
from deepdiff import Delta, DeepSearch, extract as deep_extract
from deepdiff.serialization import load_path_content, save_content_to_path
try:
import orjson
except ImportError:
orjson = None
# Root click group. The diff, patch, grep and extract functions are
# registered on it through @cli.command().
@click.group()
def cli():
    """A simple command line tool."""
    pass # pragma: no cover.
@cli.command()
@click.argument('t1', type=click.Path(exists=True, resolve_path=True))
@click.argument('t2', type=click.Path(exists=True, resolve_path=True))
@click.option('--cutoff-distance-for-pairs', required=False, default=CUTOFF_DISTANCE_FOR_PAIRS_DEFAULT, type=float, show_default=True)
@click.option('--cutoff-intersection-for-pairs', required=False, default=CUTOFF_INTERSECTION_FOR_PAIRS_DEFAULT, type=float, show_default=True)
@click.option('--cache-size', required=False, default=0, type=int, show_default=True)
@click.option('--cache-tuning-sample-size', required=False, default=0, type=int, show_default=True)
@click.option('--cache-purge-level', required=False, default=1, type=click.IntRange(0, 2), show_default=True)
@click.option('--create-patch', is_flag=True, show_default=True)
@click.option('--exclude-paths', required=False, type=str, show_default=False, multiple=True)
@click.option('--exclude-regex-paths', required=False, type=str, show_default=False, multiple=True)
@click.option('--math-epsilon', required=False, type=Decimal, show_default=False)
@click.option('--get-deep-distance', is_flag=True, show_default=True)
@click.option('--group-by', required=False, type=str, show_default=False, multiple=False)
@click.option('--ignore-order', is_flag=True, show_default=True)
@click.option('--ignore-string-type-changes', is_flag=True, show_default=True)
@click.option('--ignore-numeric-type-changes', is_flag=True, show_default=True)
@click.option('--ignore-type-subclasses', is_flag=True, show_default=True)
@click.option('--ignore-string-case', is_flag=True, show_default=True)
@click.option('--ignore-nan-inequality', is_flag=True, show_default=True)
@click.option('--include-private-variables', is_flag=True, show_default=True)
@click.option('--log-frequency-in-sec', required=False, default=0, type=int, show_default=True)
@click.option('--max-passes', required=False, default=10000000, type=int, show_default=True)
@click.option('--max_diffs', required=False, default=None, type=int, show_default=True)
@click.option('--threshold-to-diff-deeper', required=False, default=0.33, type=float, show_default=False)
@click.option('--number-format-notation', required=False, type=click.Choice(['f', 'e'], case_sensitive=True), show_default=True, default="f")
@click.option('--progress-logger', required=False, type=click.Choice(['info', 'error'], case_sensitive=True), show_default=True, default="info")
@click.option('--report-repetition', is_flag=True, show_default=True)
@click.option('--significant-digits', required=False, default=None, type=int, show_default=True)
@click.option('--truncate-datetime', required=False, type=click.Choice(['second', 'minute', 'hour', 'day'], case_sensitive=True), show_default=True, default=None)
@click.option('--verbose-level', required=False, default=1, type=click.IntRange(0, 2), show_default=True)
@click.option('--view', required=False, type=click.Choice(['tree', 'colored', 'colored_compact'], case_sensitive=True), show_default=True, default='tree')
@click.option('--debug', is_flag=True, show_default=False)
def diff(
    *args, **kwargs
):
    """
    Deep Diff Commandline
    Deep Difference of content in files.
    It can read csv, tsv, json, yaml, and toml files.
    T1 and T2 are the path to the files to be compared with each other.
    """
    # NOTE: the docstring above is the help text click shows to the user.
    #
    # Flow: translate CLI-only options into DeepDiff keyword arguments, load
    # both files, run DeepDiff, then either write a serialized Delta to stdout
    # (--create-patch) or print the diff. With --debug, errors are re-raised
    # with their traceback instead of being turned into an exit message.
    debug = kwargs.pop('debug')
    kwargs['ignore_private_variables'] = not kwargs.pop('include_private_variables')
    kwargs['progress_logger'] = logger.info if kwargs['progress_logger'] == 'info' else logger.error
    create_patch = kwargs.pop('create_patch')
    t1_path = kwargs.pop("t1")
    t2_path = kwargs.pop("t2")
    # The text after the last dot of each path is passed on as the file type.
    t1_extension = t1_path.split('.')[-1]
    t2_extension = t2_path.split('.')[-1]
    # Remember the requested view before it may be dropped from kwargs, so the
    # output branch at the end never has to index a missing key.
    view = kwargs.get("view")
    if "view" in kwargs and kwargs["view"] is None:
        kwargs.pop("view")

    for name, t_path, t_extension in [('t1', t1_path, t1_extension), ('t2', t2_path, t2_extension)]:
        try:
            kwargs[name] = load_path_content(t_path, file_type=t_extension)
        except Exception as e: # pragma: no cover.
            if debug: # pragma: no cover.
                raise # pragma: no cover.
            else: # pragma: no cover.
                sys.exit(str(f"Error when loading {name}: {e}")) # pragma: no cover.

    # if (t1_extension != t2_extension):
    # Rows loaded from csv/tsv files are converted to plain dicts.
    if t1_extension in {'csv', 'tsv'}:
        kwargs['t1'] = [dict(i) for i in kwargs['t1']]
    if t2_extension in {'csv', 'tsv'}:
        kwargs['t2'] = [dict(i) for i in kwargs['t2']]

    if create_patch:
        # Disabling logging progress since it will leak into stdout
        kwargs['log_frequency_in_sec'] = 0

    try:
        diff = DeepDiff(**kwargs)
    except Exception as e: # pragma: no cover. No need to test this.
        if debug: # pragma: no cover.
            raise # pragma: no cover.
        else: # pragma: no cover.
            sys.exit(str(e)) # pragma: no cover. No need to test this.

    if create_patch:
        try:
            delta = Delta(diff)
        except Exception as e: # pragma: no cover.
            if debug: # pragma: no cover.
                raise # pragma: no cover.
            else: # pragma: no cover.
                sys.exit(f"Error when loading the patch (aka delta): {e}") # pragma: no cover.

        # printing into stdout
        sys.stdout.buffer.write(delta.dumps())
    else:
        try:
            if view in {'colored', 'colored_compact'}:
                print(diff)
            else:
                print(diff.to_json(indent=2))
        except Exception:
            # Best effort: fall back to pretty-printing the diff object.
            pprint(diff, indent=2)
@cli.command()
@click.argument('path', type=click.Path(exists=True, resolve_path=True))
@click.argument('delta_path', type=click.Path(exists=True, resolve_path=True))
@click.option('--backup', '-b', is_flag=True, show_default=True)
@click.option('--raise-errors', is_flag=True, show_default=True)
@click.option('--debug', is_flag=True, show_default=False)
def patch(
    path, delta_path, backup, raise_errors, debug
):
    """
    Deep Patch Commandline
    Patches a file based on the information in a delta file.
    The delta file can be created by the deep diff command and
    passing the --create-patch argument.
    Deep Patch is similar to Linux's patch command.
    The difference is that it is made for patching data.
    It can read csv, tsv, json, yaml, and toml files.
    """
    # NOTE: the docstring above is the help text click shows to the user.
    #
    # Flow: load the delta, load the target file, apply the delta to the
    # content and write the result back to the same path. With --debug, every
    # failure is re-raised with its traceback instead of becoming an exit
    # message.
    try:
        delta = Delta(delta_path=delta_path, raise_errors=raise_errors)
    except Exception as e: # pragma: no cover.
        if debug: # pragma: no cover.
            raise # pragma: no cover.
        else: # pragma: no cover.
            sys.exit(str(f"Error when loading the patch (aka delta) {delta_path}: {e}")) # pragma: no cover.

    # The text after the last dot of the path is passed on as the file type.
    extension = path.split('.')[-1]

    try:
        content = load_path_content(path, file_type=extension)
    except Exception as e: # pragma: no cover.
        if debug: # pragma: no cover.
            raise # pragma: no cover.
        else: # pragma: no cover.
            sys.exit(str(f"Error when loading {path}: {e}")) # pragma: no cover.

    result = delta + content

    try:
        save_content_to_path(result, path, file_type=extension, keep_backup=backup)
    except Exception as e: # pragma: no cover.
        if debug: # pragma: no cover.
            raise # pragma: no cover.
        else: # pragma: no cover.
            sys.exit(str(f"Error when saving {path}: {e}")) # pragma: no cover.
@cli.command()
@click.argument('item', required=True, type=str)
@click.argument('path', type=click.Path(exists=True, resolve_path=True))
@click.option('--ignore-case', '-i', is_flag=True, show_default=True)
@click.option('--exact-match', is_flag=True, show_default=True)
@click.option('--exclude-paths', required=False, type=str, show_default=False, multiple=True)
@click.option('--exclude-regex-paths', required=False, type=str, show_default=False, multiple=True)
@click.option('--verbose-level', required=False, default=1, type=click.IntRange(0, 2), show_default=True)
@click.option('--debug', is_flag=True, show_default=False)
def grep(item, path, debug, **kwargs):
    """
    Deep Grep Commandline
    Grep through the contents of a file and find the path to the item.
    It can read csv, tsv, json, yaml, and toml files.
    """
    # NOTE: the docstring above is the help text click shows to the user.
    #
    # Rename the CLI flags to the keyword names handed to DeepSearch.
    ignore_case = kwargs.pop('ignore_case')
    kwargs['case_sensitive'] = not ignore_case
    exact_match = kwargs.pop('exact_match')
    kwargs['match_string'] = exact_match

    try:
        loaded = load_path_content(path)
    except Exception as e: # pragma: no cover.
        if not debug: # pragma: no cover.
            sys.exit(str(f"Error when loading {path}: {e}")) # pragma: no cover.
        raise # pragma: no cover.

    try:
        matches = DeepSearch(loaded, item, **kwargs)
    except Exception as e: # pragma: no cover.
        if not debug: # pragma: no cover.
            sys.exit(str(f"Error when running deep search on {path}: {e}")) # pragma: no cover.
        raise # pragma: no cover.

    pprint(matches, indent=2)
@cli.command()
@click.argument('path_inside', required=True, type=str)
@click.argument('path', type=click.Path(exists=True, resolve_path=True))
@click.option('--debug', is_flag=True, show_default=False)
def extract(path_inside, path, debug):
    """
    Deep Extract Commandline
    Extract an item from a file based on the path that is passed.
    It can read csv, tsv, json, yaml, and toml files.
    """
    # Load the file; --debug re-raises, otherwise exit with a short message.
    try:
        content = load_path_content(path)
    except Exception as e:  # pragma: no cover.
        if debug:  # pragma: no cover.
            raise  # pragma: no cover.
        else:  # pragma: no cover.
            sys.exit(str(f"Error when loading {path}: {e}"))  # pragma: no cover.

    # Pull out the item addressed by path_inside.
    try:
        result = deep_extract(content, path_inside)
    except Exception as e:  # pragma: no cover.
        if debug:  # pragma: no cover.
            raise  # pragma: no cover.
        else:  # pragma: no cover.
            # The message used to say "deep search" (copied from grep); this
            # command runs deep_extract, so name the right operation.
            sys.exit(str(f"Error when running deep extract on {path}: {e}"))  # pragma: no cover.
    pprint(result, indent=2)
qlustered-deepdiff-41c7265/deepdiff/deephash.py 0000664 0000000 0000000 00000074627 15162412645 0021472 0 ustar 00root root 0000000 0000000 #!/usr/bin/env python
import logging
import datetime
import uuid
from typing import Union, Optional, Any, List, TYPE_CHECKING, Dict, Tuple, Set, Callable, Generator
from collections.abc import Iterable, MutableMapping
from collections import defaultdict
from hashlib import sha1, sha256
from pathlib import Path
from enum import Enum
import re
from deepdiff.helper import (strings, numbers, only_numbers, times, unprocessed, not_hashed, add_to_frozen_set,
convert_item_or_items_into_set_else_none, get_doc, ipranges,
convert_item_or_items_into_compiled_regexes_else_none,
get_id, type_is_subclass_of_type_group, type_in_type_group,
number_to_string, datetime_normalize, KEY_TO_VAL_STR,
get_truncate_datetime, dict_, add_root_to_paths, PydanticBaseModel)
from deepdiff.base import Base
if TYPE_CHECKING:
from pytz.tzinfo import BaseTzInfo
import numpy as np
# Type aliases for better readability
# Plain values that can be used directly as dictionary keys.
HashableType = Union[str, int, float, bytes, bool, tuple, frozenset, type(None)]
HashResult = Union[str, Any]  # Can be string hash or unprocessed marker
HashTuple = Tuple[HashResult, int]  # (hash_result, count)
HashesDict = Dict[Any, Union[HashTuple, List[Any]]]  # Special case for UNPROCESSED_KEY
# One path or a collection of paths (accepted by exclude_paths / include_paths).
PathType = Union[str, List[str], Set[str]]
# One regex (source string or compiled pattern) or a list of them (accepted by exclude_regex_paths).
RegexType = Union[str, re.Pattern[str], List[Union[str, re.Pattern[str]]]]
NumberToStringFunc = Callable[..., str]  # More flexible for different number_to_string implementations
try:
import pandas
except ImportError:
pandas = False # type: ignore
try:
import polars
except ImportError:
polars = False # type: ignore
try:
import numpy as np
booleanTypes: Tuple[type, ...] = (bool, np.bool_) # type: ignore
except ImportError:
booleanTypes = (bool,) # type: ignore
logger: logging.Logger = logging.getLogger(__name__)
# Sentinel key of the hashes dict under which DeepHash collects the objects it could not hash.
UNPROCESSED_KEY: object = object()
# Shared empty default for the ``parents_ids`` parameters.
EMPTY_FROZENSET: frozenset = frozenset()
# Path-suffix templates indexed by the ``print_as_attribute`` flag: False -> '[%s]', True -> '.%s'.
INDEX_VS_ATTRIBUTE: Tuple[str, str] = ('[%s]', '.%s')
# Text of the KeyError raised when a looked-up object has no entry in the hashes dict.
HASH_LOOKUP_ERR_MSG: str = '{} is not one of the hashed items.'
def sha256hex(obj: Union[str, bytes]) -> str:
    """Return the hexadecimal SHA-256 digest of ``obj``.

    A ``str`` is encoded as UTF-8 before hashing; anything else is handed to
    ``sha256`` unchanged.
    """
    payload = obj.encode('utf-8') if isinstance(obj, str) else obj
    digest = sha256(payload)
    return digest.hexdigest()
def sha1hex(obj: Union[str, bytes]) -> str:
    """Return the hexadecimal SHA-1 digest of ``obj``.

    A ``str`` is encoded as UTF-8 before hashing; anything else is handed to
    ``sha1`` unchanged.
    """
    payload = obj.encode('utf-8') if isinstance(obj, str) else obj
    digest = sha1(payload)
    return digest.hexdigest()
# Hasher used by combine_hashes_lists and by DeepHash when no ``hasher`` argument is given.
default_hasher: Callable[[Union[str, bytes]], str] = sha256hex
def combine_hashes_lists(items: List[List[str]], prefix: Union[str, bytes]) -> str:
    """
    Fold several lists of hashes into a single hash string.

    Every inner list is sorted first so the order of the hashes inside one
    list does not influence the result; the order of the lists themselves
    still does. Items are passed through ``str`` so both integer hashes
    (murmur3, no longer used) and string hashes (sha256) are accepted.

    Returns ``prefix`` (decoded from UTF-8 when given as bytes) followed by
    the ``default_hasher`` digest of the combined material.
    """
    label = prefix.decode('utf-8') if isinstance(prefix, bytes) else prefix
    # One '--'-terminated chunk per inner list.
    chunks = [
        (''.join(str(single_hash) for single_hash in sorted(group)) + '--').encode('utf-8')
        for group in items
    ]
    return label + str(default_hasher(b''.join(chunks)))
class BoolObj(Enum):
    """Stand-in keys for ``True`` and ``False`` in the hashes dict.

    DeepHash swaps booleans for these members before hashing or looking them
    up (see ``_prep_bool`` and ``_getitem``).
    """
    TRUE = 1
    FALSE = 0
def prepare_string_for_hashing(
    obj: Union[str, bytes, memoryview],
    ignore_string_type_changes: bool = False,
    ignore_string_case: bool = False,
    encodings: Optional[List[str]] = None,
    ignore_encoding_errors: bool = False,
) -> str:
    """
    Normalise a string-like value into the text that gets hashed.

    - ``memoryview`` is converted to bytes, and bytes are decoded with the
      first entry of ``encodings`` (default ``['utf-8']``) that works.
    - Unless ``ignore_string_type_changes`` is set, the result is tagged with
      the name of the type that was originally passed in.
    - With ``ignore_string_case`` the result is lower-cased.

    Raises:
        UnicodeDecodeError: when no encoding can decode the bytes and
            ``ignore_encoding_errors`` is False. The reason is extended with
            a snippet of the data and a hint on how to proceed.
    """
    type_name = obj.__class__.__name__
    # https://docs.python.org/3/library/codecs.html#codecs.decode
    decode_errors = 'ignore' if ignore_encoding_errors else 'strict'

    if isinstance(obj, memoryview):
        obj = obj.tobytes()

    if isinstance(obj, bytes):
        candidates = ['utf-8'] if encodings is None else encodings
        last_error = None
        for codec in candidates:
            try:
                obj = obj.decode(codec, errors=decode_errors)
            except UnicodeDecodeError as exc:
                last_error = exc
            else:
                break
        else:
            # No candidate worked; ``obj`` is still bytes at this point.
            if last_error is not None:
                readable = obj.decode('utf-8', errors='ignore')  # type: ignore
                start = max(last_error.start - 20, 0)
                start_prefix = '...' if start > 0 else ''
                end = last_error.end + 20
                end_suffix = '...'
                if end >= len(obj):
                    end = len(obj)
                    end_suffix = ''
                raise UnicodeDecodeError(
                    last_error.encoding,
                    last_error.object,
                    last_error.start,
                    last_error.end,
                    f"{last_error.reason} in '{start_prefix}{readable[start:end]}{end_suffix}'. Please either pass ignore_encoding_errors=True or pass the encoding via encodings=['utf-8', '...']."
                ) from None

    if not ignore_string_type_changes:
        obj = KEY_TO_VAL_STR.format(type_name, obj)
    if ignore_string_case:
        obj = obj.lower()
    return str(obj)
doc = get_doc('deephash_doc.rst')
class DeepHash(Base):
    __doc__ = doc
    # Class attributes
    # These are annotations only; each one is assigned per instance in __init__.
    # Cache of computed results, keyed by the hashed object (or its id when it is unhashable).
    hashes: Dict[Any, Any]
    # exclude_types converted to a tuple so it can be passed to isinstance.
    exclude_types_tuple: Tuple[type, ...]
    ignore_repetition: bool
    exclude_paths: Optional[Set[str]]
    include_paths: Optional[Set[str]]
    exclude_regex_paths: Optional[List[re.Pattern[str]]]
    hasher: Callable[[Union[str, bytes]], str]
    use_enum_value: bool
    default_timezone: Union[datetime.timezone, "BaseTzInfo"]
    significant_digits: Optional[int]
    truncate_datetime: Optional[str]
    number_format_notation: str
    ignore_type_in_groups: Any
    ignore_string_type_changes: bool
    ignore_numeric_type_changes: bool
    ignore_string_case: bool
    exclude_obj_callback: Optional[Callable[[Any, str], bool]]
    # When False, _hash stores the prepared text instead of running the hasher over it.
    apply_hash: bool
    type_check_func: Callable[[type, Any], bool]
    number_to_string: Any
    ignore_private_variables: bool
    encodings: Optional[List[str]]
    ignore_encoding_errors: bool
    ignore_iterable_order: bool
    custom_operators: Optional[List[Any]]
def __init__(self,
             obj: Any,
             *,
             apply_hash: bool = True,
             custom_operators: Optional[List[Any]] = None,
             default_timezone: Union[datetime.timezone, "BaseTzInfo"] = datetime.timezone.utc,
             encodings: Optional[List[str]] = None,
             exclude_obj_callback: Optional[Callable[[Any, str], bool]] = None,
             exclude_paths: Optional[PathType] = None,
             exclude_regex_paths: Optional[RegexType] = None,
             exclude_types: Optional[Union[List[type], Set[type], Tuple[type, ...]]] = None,
             hasher: Optional[Callable[[Union[str, bytes]], str]] = None,
             hashes: Optional[Union[Dict[Any, Any], "DeepHash"]] = None,
             ignore_encoding_errors: bool = False,
             ignore_iterable_order: bool = True,
             ignore_numeric_type_changes: bool = False,
             ignore_private_variables: bool = True,
             ignore_repetition: bool = True,
             ignore_string_case: bool = False,
             ignore_string_type_changes: bool = False,
             ignore_type_in_groups: Any = None,
             ignore_type_subclasses: bool = False,
             ignore_uuid_types: bool = False,
             include_paths: Optional[PathType] = None,
             number_format_notation: str = "f",
             number_to_string_func: Optional[NumberToStringFunc] = None,
             parent: str = "root",
             significant_digits: Optional[int] = None,
             truncate_datetime: Optional[str] = None,
             use_enum_value: bool = False,
             **kwargs) -> None:
    """Store the options, then hash ``obj`` into ``self.hashes``.

    ``hashes`` may be an existing mapping or another DeepHash whose cache is
    then reused (and extended) instead of starting from an empty one.

    Raises:
        ValueError: if any keyword argument other than the ones in the
            signature is passed.
    """
    if kwargs:
        # The message now separates every name properly and lists all accepted
        # parameters (exclude_obj_callback, ignore_iterable_order and
        # custom_operators were missing before).
        raise ValueError(
            ("The following parameter(s) are not valid: %s\n"
             "The valid parameters are obj, hashes, exclude_types, significant_digits, truncate_datetime, "
             "exclude_paths, include_paths, exclude_regex_paths, exclude_obj_callback, hasher, ignore_repetition, "
             "number_format_notation, apply_hash, ignore_type_in_groups, ignore_string_type_changes, "
             "ignore_numeric_type_changes, ignore_type_subclasses, ignore_string_case, ignore_uuid_types, "
             "number_to_string_func, ignore_private_variables, ignore_iterable_order, custom_operators, "
             "parent, use_enum_value, default_timezone, "
             "encodings, ignore_encoding_errors") % ', '.join(kwargs.keys()))
    if isinstance(hashes, MutableMapping):
        self.hashes = hashes
    elif isinstance(hashes, DeepHash):
        self.hashes = hashes.hashes
    else:
        self.hashes = dict_()
    exclude_types = set() if exclude_types is None else set(exclude_types)
    self.exclude_types_tuple = tuple(exclude_types)  # we need tuple for checking isinstance
    self.ignore_repetition = ignore_repetition
    self.exclude_paths = add_root_to_paths(convert_item_or_items_into_set_else_none(exclude_paths))
    self.include_paths = add_root_to_paths(convert_item_or_items_into_set_else_none(include_paths))
    self.exclude_regex_paths = convert_item_or_items_into_compiled_regexes_else_none(exclude_regex_paths)
    self.hasher = default_hasher if hasher is None else hasher
    # Start every run with an empty list of unprocessed items, even on a reused cache.
    self.hashes[UNPROCESSED_KEY] = []  # type: ignore
    self.use_enum_value = use_enum_value
    self.default_timezone = default_timezone
    self.significant_digits = self.get_significant_digits(significant_digits, ignore_numeric_type_changes)
    self.truncate_datetime = get_truncate_datetime(truncate_datetime)
    self.number_format_notation = number_format_notation
    self.ignore_type_in_groups = self.get_ignore_types_in_groups(
        ignore_type_in_groups=ignore_type_in_groups,
        ignore_string_type_changes=ignore_string_type_changes,
        ignore_numeric_type_changes=ignore_numeric_type_changes,
        ignore_type_subclasses=ignore_type_subclasses,
        ignore_uuid_types=ignore_uuid_types,
    )
    self.ignore_string_type_changes = ignore_string_type_changes
    self.ignore_numeric_type_changes = ignore_numeric_type_changes
    self.ignore_string_case = ignore_string_case
    self.exclude_obj_callback = exclude_obj_callback
    # makes the hash return constant size result if true
    # the only time it should be set to False is when
    # testing the individual hash functions for different types of objects.
    self.apply_hash = apply_hash
    self.type_check_func = type_in_type_group if ignore_type_subclasses else type_is_subclass_of_type_group
    # self.type_check_func = type_is_subclass_of_type_group if ignore_type_subclasses else type_in_type_group
    self.number_to_string = number_to_string_func or number_to_string
    self.ignore_private_variables = ignore_private_variables
    self.encodings = encodings
    self.ignore_encoding_errors = ignore_encoding_errors
    self.ignore_iterable_order = ignore_iterable_order
    self.custom_operators = custom_operators

    # Seed parents_ids with the root object's id so references back to it are skipped.
    self._hash(obj, parent=parent, parents_ids=frozenset({get_id(obj)}))

    # Keep the unprocessed list only when it is non-empty, and warn about it.
    if self.hashes[UNPROCESSED_KEY]:
        logger.warning("Can not hash the following items: {}.".format(self.hashes[UNPROCESSED_KEY]))
    else:
        del self.hashes[UNPROCESSED_KEY]
# Expose the module-level hash helpers as attributes of the class.
sha256hex: Callable[[Union[str, bytes]], str] = sha256hex
sha1hex: Callable[[Union[str, bytes]], str] = sha1hex
def __getitem__(self, obj: Any, extract_index: Optional[int] = 0) -> Any:
return self._getitem(self.hashes, obj, extract_index=extract_index,
use_enum_value=self.use_enum_value,
ignore_numeric_type_changes=self.ignore_numeric_type_changes)
@staticmethod
def _get_slots_dict(obj: Any) -> Dict[str, Any]:
"""Get a dict of initialized slot attributes.
Uses object.__getattribute__ to check each slot directly, bypassing
__getattr__. For uninitialized slots on classes that define __getattr__,
falls back to getattr — letting it raise if the object is truly broken.
"""
result = {}
has_getattr = hasattr(type(obj), '__getattr__')
for slot in obj.__slots__:
try:
result[slot] = object.__getattribute__(obj, slot)
except AttributeError:
if has_getattr:
# The slot isn't initialized, but the class defines __getattr__.
# Try the normal getattr to let __getattr__ provide a value or
# raise — if it raises, we propagate to fail the strategy.
result[slot] = getattr(obj, slot)
return result
@staticmethod
def _getitem(hashes: Dict[Any, Any], obj: Any, extract_index: Optional[int] = 0,
             use_enum_value: bool = False, ignore_numeric_type_changes: bool = False) -> Any:
    """
    Fetch the stored entry for ``obj`` from ``hashes``.

    extract_index is zero for hash and 1 for count and None to get them both.
    It defaults to zero so that, for backward compatibility, only the hash is returned.

    Raises KeyError when neither the object's key nor its id is in ``hashes``.
    """
    # Booleans and (optionally) enums are stored under substitute keys.
    if obj is True:
        key = BoolObj.TRUE
    elif obj is False:
        key = BoolObj.FALSE
    elif use_enum_value and isinstance(obj, Enum):
        key = obj.value
    else:
        key = obj
    key = DeepHash._make_hash_key_for_lookup(key, ignore_numeric_type_changes=ignore_numeric_type_changes)

    try:
        entry = hashes[key]
    except (TypeError, KeyError):
        # Unhashable or unknown key: fall back to the entry stored under the object's id.
        fallback_key = get_id(obj)
        try:
            entry = hashes[fallback_key]
        except KeyError:
            raise KeyError(HASH_LOOKUP_ERR_MSG.format(obj)) from None

    # The unprocessed entry is a plain list, so it is always returned whole.
    if obj is UNPROCESSED_KEY or extract_index is None:
        return entry
    return entry[extract_index]
def __contains__(self, obj: Any) -> bool:
    """Report whether a result is stored for ``obj``.

    The key is normalised the same way ``_getitem`` does it, so ``obj in dh``
    agrees with ``dh[obj]``: booleans map to their ``BoolObj`` member and,
    when ``use_enum_value`` is set, enums map to their value. Previously these
    objects were reported as missing even though ``dh[obj]`` succeeded.
    If the key is not found (or is unhashable), the object's id is tried.
    """
    if obj is True:
        key = BoolObj.TRUE
    elif obj is False:
        key = BoolObj.FALSE
    elif self.use_enum_value and isinstance(obj, Enum):
        key = obj.value
    else:
        key = obj
    key = self._make_hash_key(key)
    try:
        if key in self.hashes:
            return True
    except (TypeError, KeyError):
        # Unhashable key: only the id-based lookup below can succeed.
        pass
    return get_id(obj) in self.hashes
def get(self, key: Any, default: Any = None, extract_index: Optional[int] = 0) -> Any:
    """
    Get method for the hashes dictionary.
    It can extract the hash for a given key that is already calculated when extract_index=0
    or the count of items that went to building the object when extract_index=1.

    Returns ``default`` when ``key`` has no stored result.
    """
    # use_enum_value is forwarded as well, so that get() resolves enum keys
    # exactly like __getitem__ does; it used to be left at its False default.
    return self.get_key(self.hashes, key, default=default, extract_index=extract_index,
                        use_enum_value=self.use_enum_value,
                        ignore_numeric_type_changes=self.ignore_numeric_type_changes)
@staticmethod
def get_key(hashes: Dict[Any, Any], key: Any, default: Any = None, extract_index: Optional[int] = 0,
            use_enum_value: bool = False, ignore_numeric_type_changes: bool = False) -> Any:
    """
    Dictionary-style ``get`` over a hashes dict.

    Returns the hash for ``key`` when extract_index=0, or the count of items
    that went into building the object when extract_index=1. When the lookup
    raises KeyError, ``default`` is returned instead.
    """
    try:
        return DeepHash._getitem(
            hashes,
            key,
            extract_index=extract_index,
            use_enum_value=use_enum_value,
            ignore_numeric_type_changes=ignore_numeric_type_changes,
        )
    except KeyError:
        return default
@staticmethod
def _unwrap_hash_key(key: Any) -> Any:
"""Unwrap a (type, value) hash key back to the original value for public API."""
if isinstance(key, tuple) and len(key) == 2 and isinstance(key[0], type) and isinstance(key[1], only_numbers):
return key[1]
return key
def _get_objects_to_hashes_dict(self, extract_index: Optional[int] = 0) -> Dict[Any, Any]:
    """
    Build a dictionary of object -> hash (extract_index=0) or
    object -> count of items that went into it (extract_index=1).

    Keys are unwrapped to their public form. The entry stored under
    UNPROCESSED_KEY is copied over whole.
    """
    mapping = dict_()
    for raw_key, entry in self.hashes.items():
        public_key = self._unwrap_hash_key(raw_key)
        mapping[public_key] = entry if public_key is UNPROCESSED_KEY else entry[extract_index]
    return mapping
def __eq__(self, other: Any) -> bool:
    """Compare against another DeepHash or against a plain mapping.

    Two DeepHash objects are equal when their full caches are equal; anything
    else is compared with the object -> hash view (counts left out).
    """
    if not isinstance(other, DeepHash):
        # We only care about the hashes
        return self._get_objects_to_hashes_dict() == other
    return self.hashes == other.hashes

# NOTE(review): __req__ is not a name Python looks up for reflected equality — confirm it is still needed.
__req__ = __eq__
def __repr__(self) -> str:
    """
    Summarised view of the object -> hash mapping (at most 500 characters).
    Counts are left out because they are hidden everywhere else and would be confusing here.
    """
    from deepdiff.summarize import summarize
    hashes_only = self._get_objects_to_hashes_dict(extract_index=0)
    return summarize(hashes_only, max_length=500)
def __str__(self) -> str:
return str(self._get_objects_to_hashes_dict(extract_index=0))
def __bool__(self) -> bool:
return bool(self.hashes)
def keys(self) -> Any:
    """Return a list of the cached keys in their public (unwrapped) form."""
    unwrap = self._unwrap_hash_key
    return [unwrap(raw_key) for raw_key in self.hashes.keys()]
def values(self) -> Generator[Any, None, None]:
    """Yield the first element of every cache entry (the hash, not its count)."""
    return (entry[0] for entry in self.hashes.values())
def items(self) -> Generator[Tuple[Any, Any], None, None]:
    """Yield ``(public_key, hash)`` pairs; counts are left out."""
    return (
        (self._unwrap_hash_key(raw_key), entry[0])
        for raw_key, entry in self.hashes.items()
    )
def _prep_obj(self, obj: Any, parent: str, parents_ids: frozenset = EMPTY_FROZENSET, is_namedtuple: bool = False, is_pydantic_object: bool = False) -> HashTuple:
    """Prepare an arbitrary object by converting it to a dict and hashing that.

    Several ways of getting a dict out of the object are tried in order; a
    strategy that raises AttributeError is skipped. If none works, the object
    is recorded under UNPROCESSED_KEY and ``(unprocessed, 0)`` is returned.

    Returns:
        ``(result, counts)`` where ``result`` is the ``_prep_dict`` output
        prefixed with ``nt`` for named tuples and ``obj`` otherwise.
    """
    original_type = type(obj) if not isinstance(obj, type) else obj

    obj_to_dict_strategies = []
    if is_namedtuple:
        obj_to_dict_strategies.append(lambda o: o._asdict())
    elif is_pydantic_object:
        # Drop the "model_fields_set" entry by *name*. The filter used to
        # compare each value against that string, which dropped any field whose
        # value happened to equal it and evaluated ``!=`` on arbitrary field
        # values (array-like values do not yield a plain bool there).
        obj_to_dict_strategies.append(lambda o: {k: v for (k, v) in o.__dict__.items() if k != "model_fields_set"})
    else:
        obj_to_dict_strategies.append(lambda o: o.__dict__)

    # Second strategy, used when the first one raises AttributeError.
    if hasattr(obj, "__slots__"):
        obj_to_dict_strategies.append(lambda o: DeepHash._get_slots_dict(o))
    else:
        import inspect
        obj_to_dict_strategies.append(lambda o: dict(inspect.getmembers(o, lambda m: not inspect.isroutine(m))))

    for get_dict in obj_to_dict_strategies:
        try:
            d = get_dict(obj)
            break
        except AttributeError:
            pass
    else:
        # Every strategy failed.
        self.hashes[UNPROCESSED_KEY].append(obj)  # type: ignore
        return (unprocessed, 0)
    obj = d

    result, counts = self._prep_dict(obj, parent=parent, parents_ids=parents_ids,
                                     print_as_attribute=True, original_type=original_type)
    result = "nt{}".format(result) if is_namedtuple else "obj{}".format(result)
    return result, counts
def _skip_this(self, obj: Any, parent: str) -> bool:
skip = False
if self.exclude_paths and parent in self.exclude_paths:
skip = True
if self.include_paths and parent != 'root':
if parent not in self.include_paths:
skip = True
for prefix in self.include_paths:
if parent.startswith(prefix):
skip = False
break
elif self.exclude_regex_paths and any(
[exclude_regex_path.search(parent) for exclude_regex_path in self.exclude_regex_paths]): # type: ignore
skip = True
elif self.exclude_types_tuple and isinstance(obj, self.exclude_types_tuple):
skip = True
elif self.exclude_obj_callback and self.exclude_obj_callback(obj, parent):
skip = True
return skip
def _prep_dict(self, obj: Union[Dict[Any, Any], MutableMapping], parent: str, parents_ids: frozenset = EMPTY_FROZENSET, print_as_attribute: bool = False, original_type: Optional[type] = None) -> HashTuple:
    """Prepare a mapping (or an object's attribute dict) for hashing.

    Every key and every value is hashed through ``_hash``; the ``key:value``
    pairs are sorted and joined so the entry order does not matter.

    Args:
        obj: the mapping to process.
        parent: path of ``obj``, used to build the path of each entry.
        parents_ids: ids of the containers above; a value whose id is in
            this set is skipped.
        print_as_attribute: build entry paths as ``parent.key`` instead of
            ``parent[key]`` and label the result with a type name instead of
            ``dict``.
        original_type: type used for the label when ``print_as_attribute``
            is set; defaults to ``type(obj)``.

    Returns:
        ``("<type>:{<pairs>}", counts)``.
    """
    result = []
    counts = 1

    key_text = "%s{}".format(INDEX_VS_ATTRIBUTE[print_as_attribute])
    for key, item in obj.items():
        # Every entry is counted, including the ones skipped below.
        counts += 1
        # ignore private variables
        if self.ignore_private_variables and isinstance(key, str) and key.startswith('__'):
            continue
        # String keys are quoted in index-style paths only.
        key_formatted = "'%s'" % key if not print_as_attribute and isinstance(key, strings) else key
        key_in_report = key_text % (parent, key_formatted)

        key_hash, _ = self._hash(key, parent=key_in_report, parents_ids=parents_ids)
        # A falsy key hash (e.g. the key was skipped by _hash) drops the whole entry.
        if not key_hash:
            continue
        item_id = get_id(item)
        if (parents_ids and item_id in parents_ids) or self._skip_this(item, parent=key_in_report):
            continue
        parents_ids_added = add_to_frozen_set(parents_ids, item_id)
        hashed, count = self._hash(item, parent=key_in_report, parents_ids=parents_ids_added)
        hashed = KEY_TO_VAL_STR.format(key_hash, hashed)
        result.append(hashed)
        counts += count

    result.sort()
    result = ';'.join(result)
    if print_as_attribute:
        type_ = original_type or type(obj)
        type_str = type_.__name__
        # A type that belongs to an ignore group is labelled with the names of the whole group.
        for type_group in self.ignore_type_in_groups:
            if self.type_check_func(type_, type_group):
                type_str = ','.join(map(lambda x: x.__name__, type_group))
                break
    else:
        type_str = 'dict'
    return "{}:{{{}}}".format(type_str, result), counts
def _prep_iterable(self, obj: Iterable[Any], parent: str, parents_ids: frozenset = EMPTY_FROZENSET) -> HashTuple:
    """Prepare an iterable for hashing.

    Each element is hashed through ``_hash`` under the path ``parent[i]``.
    Elements that ``_skip_this`` rejects, or whose id is in ``parents_ids``,
    are left out. With ``ignore_repetition`` only the distinct element hashes
    are kept; otherwise every hash carries its repetition count as
    ``hash|count``. With ``ignore_iterable_order`` the parts are sorted.

    Returns:
        ``(text, counts)`` where ``text`` is the type name of ``obj`` paired
        with the comma-joined parts.
    """
    tally = defaultdict(int)
    total = 1

    for index, element in enumerate(obj):
        child_path = "{}[{}]".format(parent, index)
        if self._skip_this(element, parent=child_path):
            continue

        element_id = get_id(element)
        if parents_ids and element_id in parents_ids:
            continue

        child_parents = add_to_frozen_set(parents_ids, element_id)
        child_hash, child_count = self._hash(element, parent=child_path, parents_ids=child_parents)
        # counting repetitions
        tally[child_hash] += 1
        total += child_count

    if self.ignore_repetition:
        # str() so that non-string results can still be joined.
        parts = [str(seen_hash) for seen_hash in tally]
    else:
        parts = ['{}|{}'.format(seen_hash, times_seen) for seen_hash, times_seen in tally.items()]

    if self.ignore_iterable_order:
        parts.sort()

    text = KEY_TO_VAL_STR.format(type(obj).__name__, ','.join(parts))
    return text, total
def _prep_bool(self, obj: bool) -> BoolObj:
    """Map a truthy value to ``BoolObj.TRUE`` and a falsy one to ``BoolObj.FALSE``."""
    if obj:
        return BoolObj.TRUE
    return BoolObj.FALSE
def _prep_path(self, obj: Path) -> str:
    """Pair the path's class name with the path itself via ``KEY_TO_VAL_STR``."""
    class_name = obj.__class__.__name__
    return KEY_TO_VAL_STR.format(class_name, obj)
def _prep_number(self, obj: Union[int, float, complex]) -> str:
    """Prepare a number for hashing.

    The label is the number's class name, or the literal ``number`` when
    ``ignore_numeric_type_changes`` is set. When ``significant_digits`` is
    set, the value is first rendered through ``self.number_to_string``.
    """
    if self.ignore_numeric_type_changes:
        label = "number"
    else:
        label = obj.__class__.__name__

    value = obj
    if self.significant_digits is not None:
        value = self.number_to_string(
            obj,
            significant_digits=self.significant_digits,
            number_format_notation=self.number_format_notation,
        )  # type: ignore
    return KEY_TO_VAL_STR.format(label, value)
def _prep_ipranges(self, obj) -> str:
    """Pair the label ``iprange`` with the string form of ``obj``."""
    return KEY_TO_VAL_STR.format('iprange', str(obj))
def _prep_datetime(self, obj: datetime.datetime) -> str:
    """Pair the label ``datetime`` with the value returned by ``datetime_normalize``.

    The normalisation receives this instance's ``truncate_datetime`` and
    ``default_timezone`` settings.
    """
    normalized = datetime_normalize(
        self.truncate_datetime,
        obj,
        default_timezone=self.default_timezone,
    )
    return KEY_TO_VAL_STR.format('datetime', normalized)
def _prep_date(self, obj: datetime.date) -> str:
    """Pair the label ``datetime`` with the date, without any normalisation."""
    # The label is deliberately 'datetime' for dates too.
    label = 'datetime'
    return KEY_TO_VAL_STR.format(label, obj)
def _prep_tuple(self, obj: tuple, parent: str, parents_ids: frozenset) -> HashTuple:
    """Prepare a tuple for hashing.

    A tuple that has an ``_asdict`` attribute is assumed to be a named tuple
    and is processed as an object; any other tuple is processed as a plain
    iterable.
    """
    if hasattr(obj, '_asdict'):
        # We assume it is a namedtuple then
        result, counts = self._prep_obj(obj, parent, parents_ids=parents_ids, is_namedtuple=True)
    else:
        # It must be a normal tuple
        result, counts = self._prep_iterable(obj=obj, parent=parent, parents_ids=parents_ids)
    return result, counts
def _make_hash_key(self, obj: Any) -> Any:
"""
Create a key for the hashes dict that distinguishes numeric types.
In Python, 1 == 1.0 and hash(1) == hash(1.0), so int and float values
collide as dict keys. When ignore_numeric_type_changes is False, we wrap
numeric objects as (type, value) tuples so that each type gets its own
cache entry and its own hash.
"""
if not self.ignore_numeric_type_changes and isinstance(obj, only_numbers):
return (type(obj), obj)
return obj
@staticmethod
def _make_hash_key_for_lookup(obj: Any, ignore_numeric_type_changes: bool = False) -> Any:
"""Static version of _make_hash_key for use in static accessor methods."""
if not ignore_numeric_type_changes and isinstance(obj, only_numbers):
return (type(obj), obj)
return obj
def _hash(self, obj: Any, parent: str, parents_ids: frozenset = EMPTY_FROZENSET) -> HashTuple:
    """The main hash method

    Computes (or fetches from the cache) the result for ``obj`` and stores it
    in ``self.hashes``.

    Args:
        obj: the object to hash.
        parent: path of ``obj``, used for the skip rules and passed to custom operators.
        parents_ids: ids of the containers above ``obj``; forwarded to the
            container helpers.

    Returns:
        ``(result, counts)``. ``(None, 0)`` when the object is skipped.

    Raises:
        NotImplementedError: if a custom operator has no
            ``normalize_value_for_hashing`` method.
    """
    counts = 1
    # Let every custom operator rewrite the value before it is hashed.
    if self.custom_operators is not None:
        for operator in self.custom_operators:
            func = getattr(operator, 'normalize_value_for_hashing', None)
            if func is None:
                raise NotImplementedError(f"{operator.__class__.__name__} needs to define a normalize_value_for_hashing method to be compatible with ignore_order=True or iterable_compare_func.".format(operator))
            else:
                obj = func(parent, obj)

    # Booleans are replaced by their BoolObj member; enums optionally by their value.
    # In the enum branch ``result`` is only assigned further down, by the cache hit or the dispatch chain.
    if isinstance(obj, booleanTypes):
        obj = self._prep_bool(obj)
        result = None
    elif self.use_enum_value and isinstance(obj, Enum):
        obj = obj.value
    else:
        result = not_hashed
    hash_key = self._make_hash_key(obj)
    # Cache lookup; a TypeError here means the key is unhashable.
    try:
        result, counts = self.hashes[hash_key]
    except (TypeError, KeyError):
        pass
    else:
        return result, counts

    if self._skip_this(obj, parent):
        return None, 0

    # Dispatch on the type of the object; the first matching branch wins.
    elif obj is None:
        result = 'NONE'

    elif isinstance(obj, strings):
        result = prepare_string_for_hashing(
            obj,
            ignore_string_type_changes=self.ignore_string_type_changes,
            ignore_string_case=self.ignore_string_case,
            encodings=self.encodings,
            ignore_encoding_errors=self.ignore_encoding_errors,
        )

    elif isinstance(obj, Path):
        result = self._prep_path(obj)

    elif isinstance(obj, times):
        result = self._prep_datetime(obj)  # type: ignore

    elif isinstance(obj, datetime.date):
        result = self._prep_date(obj)

    elif isinstance(obj, numbers):  # type: ignore
        result = self._prep_number(obj)

    elif isinstance(obj, ipranges):
        result = self._prep_ipranges(obj)

    elif isinstance(obj, uuid.UUID):
        # Handle UUID objects (including uuid6.UUID) by using their integer value
        result = str(obj.int)

    elif isinstance(obj, MutableMapping):
        result, counts = self._prep_dict(obj=obj, parent=parent, parents_ids=parents_ids)

    elif isinstance(obj, tuple):
        result, counts = self._prep_tuple(obj=obj, parent=parent, parents_ids=parents_ids)

    elif (pandas and isinstance(obj, pandas.DataFrame)):  # type: ignore
        # A DataFrame is hashed as the sequence: dtypes, index, then its columns.
        def gen():  # type: ignore
            yield ('dtype', obj.dtypes)  # type: ignore
            yield ('index', obj.index)  # type: ignore
            yield from obj.items()  # type: ignore # which contains (column name, series tuples)
        result, counts = self._prep_iterable(obj=gen(), parent=parent, parents_ids=parents_ids)
    elif (polars and isinstance(obj, polars.DataFrame)):  # type: ignore
        # A polars DataFrame is hashed as: column names, schema items, then rows.
        def gen():
            yield from obj.columns  # type: ignore
            yield from list(obj.schema.items())  # type: ignore
            yield from obj.rows()  # type: ignore
        result, counts = self._prep_iterable(obj=gen(), parent=parent, parents_ids=parents_ids)

    elif isinstance(obj, Iterable):
        result, counts = self._prep_iterable(obj=obj, parent=parent, parents_ids=parents_ids)

    elif obj == BoolObj.TRUE or obj == BoolObj.FALSE:
        result = 'bool:true' if obj is BoolObj.TRUE else 'bool:false'
    elif isinstance(obj, PydanticBaseModel):
        result, counts = self._prep_obj(obj=obj, parent=parent, parents_ids=parents_ids, is_pydantic_object=True)
    else:
        result, counts = self._prep_obj(obj=obj, parent=parent, parents_ids=parents_ids)

    if result is not_hashed:  # pragma: no cover
        self.hashes[UNPROCESSED_KEY].append(obj)  # type: ignore

    elif result is unprocessed:
        pass

    elif self.apply_hash:
        # Strings were already cleaned by the dispatch above; everything else is cleaned here.
        if isinstance(obj, strings):
            result_cleaned = result
        else:
            result_cleaned = prepare_string_for_hashing(
                str(result), ignore_string_type_changes=self.ignore_string_type_changes,
                ignore_string_case=self.ignore_string_case)
        result = self.hasher(result_cleaned)

    # It is important to keep the hash of all objects.
    # The hashes will be later used for comparing the objects.
    # Object to hash when possible otherwise ObjectID to hash
    try:
        self.hashes[hash_key] = (result, counts)
    except TypeError:
        obj_id = get_id(obj)
        self.hashes[obj_id] = (result, counts)

    return result, counts
# When the module is executed directly, run the doctests found in its docstrings.
if __name__ == "__main__":  # pragma: no cover
    import doctest
    doctest.testmod()
qlustered-deepdiff-41c7265/deepdiff/delta.py 0000664 0000000 0000000 00000163273 15162412645 0020776 0 ustar 00root root 0000000 0000000 import copy
import logging
from typing import List, Dict, IO, Callable, Set, Union, Optional, Any, cast
from functools import partial, cmp_to_key
from collections.abc import Mapping
from copy import deepcopy
from deepdiff import DeepDiff
from deepdiff.serialization import pickle_load, pickle_dump
from deepdiff.helper import (
strings, numbers,
np_ndarray, np_array_factory, numpy_dtypes, get_doc,
not_found, numpy_dtype_string_to_type, dict_,
Opcode, FlatDeltaRow, FlatDeltaDict, UnkownValueCode, FlatDataAction,
OPCODE_TAG_TO_FLAT_DATA_ACTION,
FLAT_DATA_ACTION_TO_OPCODE_TAG,
SetOrdered,
)
from deepdiff.path import (
_path_to_elements, _get_nested_obj, _get_nested_obj_and_force,
GET, GETATTR, check_elem, parse_path, stringify_path,
)
from deepdiff.anyset import AnySet
from deepdiff.summarize import summarize
logger = logging.getLogger(__name__)
# Message templates used by Delta; the ``{}`` slots are filled with ``str.format``.
VERIFICATION_MSG = 'Expected the old value for {} to be {} but it is {}. Error found on: {}. You may want to set force=True, especially if this delta is created by passing flat_rows_list or flat_dict_list'
ELEM_NOT_FOUND_TO_ADD_MSG = 'Key or index of {} is not found for {} for setting operation.'
# Was "... for {} from to type {} ..." — the stray "from" is removed.
TYPE_CHANGE_FAIL_MSG = 'Unable to do the type change for {} to type {} due to {}'
VERIFY_BIDIRECTIONAL_MSG = ('You have applied the delta to an object that has '
                            'different values than the original object the delta was made from.')
FAIL_TO_REMOVE_ITEM_IGNORE_ORDER_MSG = 'Failed to remove index[{}] on {}. It was expected to be {} but got {}'
DELTA_NUMPY_OPERATOR_OVERRIDE_MSG = (
    'A numpy ndarray is most likely being added to a delta. '
    'Due to Numpy override the + operator, you can only do: delta + ndarray '
    'and NOT ndarray + delta')
BINIARY_MODE_NEEDED_MSG = "Please open the file in the binary mode and pass to Delta by passing 'b' in open(..., 'b'): {}"
DELTA_AT_LEAST_ONE_ARG_NEEDED = 'At least one of the diff, delta_path or delta_file arguments need to be passed.'
INVALID_ACTION_WHEN_CALLING_GET_ELEM = 'invalid action of {} when calling _get_elem_and_compare_to_old_value'
INVALID_ACTION_WHEN_CALLING_SIMPLE_SET_ELEM = 'invalid action of {} when calling _simple_set_elem_value'
# Was a copy of the SET_ELEM text; now names the delete helper this constant is for.
# NOTE(review): the method name is inferred from the constant name — confirm against the caller.
INVALID_ACTION_WHEN_CALLING_SIMPLE_DELETE_ELEM = 'invalid action of {} when calling _simple_delete_elem'
UNABLE_TO_GET_ITEM_MSG = 'Unable to get the item at {}: {}'
UNABLE_TO_GET_PATH_MSG = 'Unable to get the item at {}'
INDEXES_NOT_FOUND_WHEN_IGNORE_ORDER = 'Delta added to an incompatible object. Unable to add the following items at the specific indexes. {}'
NUMPY_TO_LIST = 'NUMPY_TO_LIST'
NOT_VALID_NUMPY_TYPE = "{} is not a valid numpy type."
doc = get_doc('delta.rst')
class DeltaError(ValueError):
    """Error raised for Delta-specific failures (a ``ValueError`` subclass)."""
class DeltaNumpyOperatorOverrideError(ValueError):
    """Error raised by ``Delta.__add__`` for the numpy operator-override case (a ``ValueError`` subclass)."""
class Delta:
__doc__ = doc
def __init__(
    self,
    diff: Union[DeepDiff, Mapping, str, bytes, None]=None,
    delta_path: Optional[str]=None,
    delta_file: Optional[IO]=None,
    delta_diff: Optional[dict]=None,
    flat_dict_list: Optional[List[Dict]]=None,
    flat_rows_list: Optional[List[FlatDeltaRow]]=None,
    deserializer: Callable=pickle_load,
    log_errors: bool=True,
    mutate: bool=False,
    raise_errors: bool=False,
    safe_to_import: Optional[Set[str]]=None,
    serializer: Callable=pickle_dump,
    verify_symmetry: Optional[bool]=None,
    bidirectional: bool=False,
    always_include_values: bool=False,
    iterable_compare_func_was_used: Optional[bool]=None,
    force: bool=False,
    fill: Any=not_found,
):
    """Build a delta from exactly one source.

    The sources are tried in this order: ``diff`` (a DeepDiff, a mapping, or
    serialized content), ``delta_path``, ``delta_diff``, ``delta_file``,
    ``flat_dict_list``, ``flat_rows_list``.

    Raises:
        ValueError: if no source is given, if ``diff`` is of an unsupported
            type, or if ``delta_file`` was opened in text mode and cannot be
            read.
    """
    # NOTE(review): the default deserializer is pickle-based; make sure deltas
    # from untrusted sources are only loaded with a restrictive safe_to_import.
    # for pickle deserializer:
    if hasattr(deserializer, '__code__') and 'safe_to_import' in set(deserializer.__code__.co_varnames):
        _deserializer = deserializer
    else:
        # Wrap deserializers that do not accept safe_to_import, and rebuild
        # the Opcode objects they return as plain dictionaries.
        def _deserializer(obj, safe_to_import=None):
            result = deserializer(obj)
            if result.get('_iterable_opcodes'):
                _iterable_opcodes = {}
                for path, op_codes in result['_iterable_opcodes'].items():
                    _iterable_opcodes[path] = []
                    for op_code in op_codes:
                        _iterable_opcodes[path].append(
                            Opcode(
                                **op_code
                            )
                        )
                result['_iterable_opcodes'] = _iterable_opcodes
            return result

    self._reversed_diff = None

    if verify_symmetry is not None:
        logger.warning(
            "DeepDiff Deprecation: use bidirectional instead of verify_symmetry parameter."
        )
        bidirectional = verify_symmetry

    self.bidirectional = bidirectional
    if bidirectional:
        self.always_include_values = True  # We need to include the values in bidirectional deltas
    else:
        self.always_include_values = always_include_values

    if diff is not None:
        if isinstance(diff, DeepDiff):
            self.diff = diff._to_delta_dict(directed=not bidirectional, always_include_values=self.always_include_values)
        elif isinstance(diff, Mapping):
            self.diff = diff
        elif isinstance(diff, strings):
            self.diff = _deserializer(diff, safe_to_import=safe_to_import)
        else:
            # Without this branch self.diff was never set and the constructor
            # failed further down with an unrelated AttributeError.
            raise ValueError(
                'The diff argument must be a DeepDiff object, a mapping or serialized content, not {}.'.format(
                    type(diff).__name__))
    elif delta_path:
        with open(delta_path, 'rb') as the_file:
            content = the_file.read()
        self.diff = _deserializer(content, safe_to_import=safe_to_import)
    elif delta_diff:
        self.diff = delta_diff
    elif delta_file:
        try:
            content = delta_file.read()
        except UnicodeDecodeError as e:
            raise ValueError(BINIARY_MODE_NEEDED_MSG.format(e)) from None
        self.diff = _deserializer(content, safe_to_import=safe_to_import)
    elif flat_dict_list:
        # Use copy to preserve original value of flat_dict_list in calling module
        self.diff = self._from_flat_dicts(copy.deepcopy(flat_dict_list))
    elif flat_rows_list:
        self.diff = self._from_flat_rows(copy.deepcopy(flat_rows_list))
    else:
        raise ValueError(DELTA_AT_LEAST_ONE_ARG_NEEDED)

    self.mutate = mutate
    self.raise_errors = raise_errors
    self.log_errors = log_errors
    self._numpy_paths = self.diff.get('_numpy_paths', False)
    # When we create the delta from a list of flat dictionaries, details such as iterable_compare_func_was_used get lost.
    # That's why we allow iterable_compare_func_was_used to be explicitly set.
    self._iterable_compare_func_was_used = self.diff.get('_iterable_compare_func_was_used', iterable_compare_func_was_used)
    self.serializer = serializer
    self.deserializer = deserializer
    self.force = force
    self.fill = fill
    # The path resolver depends on whether missing elements may be forced.
    if force:
        self.get_nested_obj = _get_nested_obj_and_force
    else:
        self.get_nested_obj = _get_nested_obj
    self.reset()
def __repr__(self):
    """
    Return a short, human-readable representation of this delta.

    The diff is condensed by ``summarize`` (called with ``max_length=100``)
    and wrapped in a ``<Delta: ...>`` marker. The previous implementation
    formatted into an empty string and therefore always returned ``""``.
    """
    return "<Delta: {}>".format(summarize(self.diff, max_length=100))
def reset(self):
    """
    Discard the per-application bookkeeping.

    ``post_process_paths_to_convert`` is replaced by a brand new, empty
    ``dict_()`` so that conversions recorded while applying the delta to one
    object are not carried over to the next application.
    """
    fresh_registry = dict_()
    self.post_process_paths_to_convert = fresh_registry
def __add__(self, other):
    """
    Apply this delta to ``other`` and return the patched object.

    ``other`` is patched in place when ``self.mutate`` is truthy, otherwise a
    deep copy is patched. The object being worked on is kept in ``self.root``
    for the duration of the call and the attribute is removed again before
    returning.

    Raises DeltaNumpyOperatorOverrideError when ``other`` is a number and the
    delta carries numpy paths.
    """
    if isinstance(other, numbers) and self._numpy_paths:  # type: ignore
        raise DeltaNumpyOperatorOverrideError(DELTA_NUMPY_OPERATOR_OVERRIDE_MSG)
    self.root = other if self.mutate else deepcopy(other)
    # The order of the steps matters.
    # NOTE: the remove iterable action needs to happen BEFORE
    # all the other iterables to match the reverse of order of operations in DeepDiff
    pipeline = (
        self._do_pre_process,
        self._do_values_changed,
        self._do_set_item_added,
        self._do_set_item_removed,
        self._do_type_changes,
        self._do_iterable_opcodes,
        self._do_iterable_item_removed,
        self._do_iterable_item_added,
        self._do_ignore_order,
        self._do_dictionary_item_added,
        self._do_dictionary_item_removed,
        self._do_attribute_added,
        self._do_attribute_removed,
        self._do_post_process,
    )
    for step in pipeline:
        step()
    patched = self.root
    # Drop our reference to the patched object before handing it back.
    del self.root
    self.reset()
    return patched
__radd__ = __add__
def __rsub__(self, other):
if self._reversed_diff is None:
self._reversed_diff = self._get_reverse_diff()
self.diff, self._reversed_diff = self._reversed_diff, self.diff
result = self.__add__(other)
self.diff, self._reversed_diff = self._reversed_diff, self.diff
return result
def _raise_or_log(self, msg, level='error'):
    """
    Report a problem according to the delta's error policy.

    When ``self.log_errors`` is truthy the message is sent to the module
    logger using the method named by ``level``. When ``self.raise_errors`` is
    truthy a DeltaError carrying the message is raised afterwards.
    Both, one or neither may happen.
    """
    if self.log_errors:
        emit = getattr(logger, level)
        emit(msg)
    if self.raise_errors:
        raise DeltaError(msg)
def _do_verify_changes(self, path, expected_old_value, current_old_value):
    """
    For bidirectional deltas, report when the value found in the object
    differs from the old value recorded in the delta.

    ``path`` may be a string or a sequence of path elements; the latter is
    turned into a string with ``stringify_path`` for the message.
    Nothing happens for non-bidirectional deltas or when the values match.
    """
    mismatch = self.bidirectional and expected_old_value != current_old_value
    if not mismatch:
        return
    path_str = path if isinstance(path, str) else stringify_path(path, root_element=('', GETATTR))
    message = VERIFICATION_MSG.format(
        path_str, expected_old_value, current_old_value, VERIFY_BIDIRECTIONAL_MSG)
    self._raise_or_log(message)
def _get_elem_and_compare_to_old_value(
    self,
    obj,
    path_for_err_reporting,
    expected_old_value,
    elem=None,
    action=None,
    forced_old_value=None,
    next_element=None,
):
    """
    Read ``obj[elem]`` (action GET) or ``getattr(obj, elem)`` (action GETATTR).

    Returns the value found. When the lookup fails:

    - with ``self.force`` set, a placeholder is stored at that position and
      returned. The placeholder is ``forced_old_value`` when given, otherwise
      ``{}`` if ``next_element`` is None or a string and ``[]`` otherwise.
      For a list ``obj`` the placeholder is assigned at ``elem`` when that is
      an int below ``len(obj)`` and appended otherwise.
    - without ``self.force``, the failure is reported through
      ``_raise_or_log`` and ``not_found`` is returned.

    ``not_found`` is also returned (after reporting) when ``check_elem``
    rejects ``elem`` with a ValueError.

    Raises DeltaError for an action that is neither GET nor GETATTR.
    """
    try:
        check_elem(elem)
    except ValueError as error:
        self._raise_or_log(UNABLE_TO_GET_ITEM_MSG.format(path_for_err_reporting, error))
        return not_found
    try:
        if action == GET:
            current_old_value = obj[elem]
        elif action == GETATTR:
            current_old_value = getattr(obj, elem)  # type: ignore
        else:
            raise DeltaError(INVALID_ACTION_WHEN_CALLING_GET_ELEM.format(action))
    except (KeyError, IndexError, AttributeError, TypeError) as e:
        if self.force:
            if forced_old_value is None:
                # Guess the container type from the key that will be used next.
                if next_element is None or isinstance(next_element, str):
                    _forced_old_value = {}
                else:
                    _forced_old_value = []
            else:
                _forced_old_value = forced_old_value
            if action == GET:
                if isinstance(obj, list):
                    if isinstance(elem, int) and elem < len(obj):
                        obj[elem] = _forced_old_value
                    else:
                        obj.append(_forced_old_value)
                else:
                    obj[elem] = _forced_old_value
            elif action == GETATTR:
                setattr(obj, elem, _forced_old_value)  # type: ignore
            return _forced_old_value
        current_old_value = not_found
        if isinstance(path_for_err_reporting, (list, tuple)):
            # Path elements may hold non-string keys (e.g. integer indexes);
            # str.join only accepts strings, so convert each one explicitly.
            path_for_err_reporting = '.'.join([str(i[0]) for i in path_for_err_reporting])
        if self.bidirectional:
            self._raise_or_log(VERIFICATION_MSG.format(
                path_for_err_reporting,
                expected_old_value, current_old_value, e))
        else:
            self._raise_or_log(UNABLE_TO_GET_PATH_MSG.format(
                path_for_err_reporting))
    return current_old_value
def _simple_set_elem_value(self, obj, path_for_err_reporting, elem=None, value=None, action=None):
    """
    Store ``value`` directly on ``obj``: ``obj[elem] = value`` for action GET,
    ``setattr(obj, elem, value)`` for action GETATTR.

    An IndexError on item assignment is handled as follows:
    - ``elem == len(obj)``: the value is appended.
    - ``elem > len(obj)`` and ``self.fill`` is not ``not_found``: the gap is
      padded with ``self.fill`` (or, when it is callable, with the result of
      ``self.fill(obj, value, path_for_err_reporting)``) and the value is
      then appended.
    - otherwise the problem is reported via ``_raise_or_log``.

    KeyError, IndexError, AttributeError and TypeError raised along the way
    are reported via ``_raise_or_log``. Any other action raises DeltaError.
    """
    try:
        if action == GET:
            try:
                obj[elem] = value
            except IndexError:
                size = len(obj)
                if elem == size:
                    obj.append(value)
                elif self.fill is not not_found and elem > size:
                    # Pad the list up to the requested index, then add the value.
                    while len(obj) < elem:
                        filler = (
                            self.fill(obj, value, path_for_err_reporting)
                            if callable(self.fill) else self.fill
                        )
                        obj.append(filler)
                    obj.append(value)
                else:
                    self._raise_or_log(ELEM_NOT_FOUND_TO_ADD_MSG.format(elem, path_for_err_reporting))
        elif action == GETATTR:
            setattr(obj, elem, value)  # type: ignore
        else:
            raise DeltaError(INVALID_ACTION_WHEN_CALLING_SIMPLE_SET_ELEM.format(action))
    except (KeyError, IndexError, AttributeError, TypeError) as e:
        failure = 'Failed to set {} due to {}'.format(path_for_err_reporting, e)
        self._raise_or_log(failure)
def _coerce_obj(self, parent, obj, path, parent_to_obj_elem,
parent_to_obj_action, elements, to_type, from_type):
"""
Coerce obj and mark it in post_process_paths_to_convert for later to be converted back.
Also reassign it to its parent to replace the old object.
"""
self.post_process_paths_to_convert[elements[:-1]] = {'old_type': to_type, 'new_type': from_type}
# If this function is going to ever be used to convert numpy arrays, uncomment these lines:
# if from_type is np_ndarray:
# obj = obj.tolist()
# else:
obj = to_type(obj)
if parent:
# Making sure that the object is re-instated inside the parent especially if it was immutable
# and we had to turn it into a mutable one. In such cases the object has a new id.
self._simple_set_elem_value(obj=parent, path_for_err_reporting=path, elem=parent_to_obj_elem,
value=obj, action=parent_to_obj_action)
return obj
def _set_new_value(self, parent, parent_to_obj_elem, parent_to_obj_action,
                   obj, elements, path, elem, action, new_value):
    """
    Store ``new_value`` at ``elem`` of ``obj``, first making ``obj`` mutable
    when needed.

    - Named tuples (tuples exposing ``_fields`` and ``_replace``): for action
      GETATTR a modified copy is built with ``_replace``; the (possibly new)
      tuple is written back into a truthy ``parent`` and the method returns.
    - Plain tuples are converted to a list through ``_coerce_obj``, which
      also schedules the conversion back to a tuple.
    - With ``self.force``, an empty list that is addressed with a key other
      than 0 is swapped for an empty dict and stored in the parent.
    """
    if isinstance(obj, tuple):
        looks_like_namedtuple = hasattr(obj, '_fields') and hasattr(obj, '_replace')
        if looks_like_namedtuple:
            if action == GETATTR:
                obj = obj._replace(**{elem: new_value})
            if parent:
                self._simple_set_elem_value(
                    obj=parent, path_for_err_reporting=path,
                    elem=parent_to_obj_elem, value=obj,
                    action=parent_to_obj_action)
            return
        # Regular tuple - it is converted back to a tuple during post-processing.
        obj = self._coerce_obj(
            parent, obj, path, parent_to_obj_elem,
            parent_to_obj_action, elements,
            to_type=list, from_type=tuple)
    forced_empty_list = elem != 0 and self.force and isinstance(obj, list) and len(obj) == 0
    if forced_empty_list:
        # it must have been a dictionary
        obj = {}
        self._simple_set_elem_value(
            obj=parent, path_for_err_reporting=path, elem=parent_to_obj_elem,
            value=obj, action=parent_to_obj_action)
    self._simple_set_elem_value(
        obj=obj, path_for_err_reporting=path, elem=elem,
        value=new_value, action=action)
def _simple_delete_elem(self, obj, path_for_err_reporting, elem=None, action=None):
    """
    Delete the element directly on an object.

    Action GET performs ``del obj[elem]``; action GETATTR removes ``elem``
    from ``obj.__dict__``. Any other action raises DeltaError.

    KeyError, IndexError, AttributeError and TypeError are reported through
    ``_raise_or_log``. TypeError is included (as in ``_simple_set_elem_value``)
    so that deleting from an object that does not support item deletion is
    handled by the delta's error policy instead of escaping as a raw
    exception. The reported message now says "delete" rather than "set".
    """
    try:
        if action == GET:
            del obj[elem]
        elif action == GETATTR:
            del obj.__dict__[elem]
        else:
            raise DeltaError(INVALID_ACTION_WHEN_CALLING_SIMPLE_DELETE_ELEM.format(action))
    except (KeyError, IndexError, AttributeError, TypeError) as e:
        self._raise_or_log('Failed to delete {} due to {}'.format(path_for_err_reporting, e))
def _del_elem(self, parent, parent_to_obj_elem, parent_to_obj_action,
obj, elements, path, elem, action):
"""
Delete the element value on an object and if necessary convert the object to the proper mutable type
"""
obj_is_new = False
if isinstance(obj, tuple):
# convert this object back to a tuple later
self.post_process_paths_to_convert[elements[:-1]] = {'old_type': list, 'new_type': tuple}
obj = list(obj)
obj_is_new = True
self._simple_delete_elem(obj=obj, path_for_err_reporting=path, elem=elem, action=action)
if obj_is_new and parent:
# Making sure that the object is re-instated inside the parent especially if it was immutable
# and we had to turn it into a mutable one. In such cases the object has a new id.
self._simple_set_elem_value(obj=parent, path_for_err_reporting=path, elem=parent_to_obj_elem,
value=obj, action=parent_to_obj_action)
def _do_iterable_item_added(self):
iterable_item_added = self.diff.get('iterable_item_added', {})
iterable_item_moved = self.diff.get('iterable_item_moved')
# First we need to create a placeholder for moved items.
# This will then get replaced below after we go through added items.
# Without this items can get double added because moved store the new_value and does not need item_added replayed
if iterable_item_moved:
added_dict = {v["new_path"]: None for k, v in iterable_item_moved.items()}
iterable_item_added.update(added_dict)
if iterable_item_added:
self._do_item_added(iterable_item_added, insert=True)
if iterable_item_moved:
added_dict = {v["new_path"]: v["value"] for k, v in iterable_item_moved.items()}
self._do_item_added(added_dict, insert=False)
def _do_dictionary_item_added(self):
dictionary_item_added = self.diff.get('dictionary_item_added')
if dictionary_item_added:
self._do_item_added(dictionary_item_added, sort=False)
def _do_attribute_added(self):
attribute_added = self.diff.get('attribute_added')
if attribute_added:
self._do_item_added(attribute_added)
@staticmethod
def _sort_key_for_item_added(path_and_value):
    """
    Sort key for ``(path, value)`` pairs: the list of keys along the path.

    The path is parsed with ``_path_to_elements`` into pairs such as
    ``[(4.3, 'GET'), ('b', 'GETATTR'), ('a3', 'GET')]``; only the first member
    of each pair (the key itself, not how it is accessed) is kept.
    """
    path = path_and_value[0]
    parsed = _path_to_elements(path)
    return [element[0] for element in parsed]
@staticmethod
def _sort_comparison(left, right):
    """
    ``cmp``-style comparison of two ``(path, value)`` pairs by their paths.

    We use sort comparison instead of _sort_key_for_item_added when we run into
    comparing element types that can not be compared with each other, such as
    None to None. Or integer to string.

    Returns -1, 1 or 0. The paths are first compared as whole lists. If that
    raises TypeError they are compared element by element: elements of
    different types, and None elements, are compared through their string
    form; pairs that still cannot be ordered are skipped.

    The previous condition ``type(l_elem) in None`` raised TypeError for every
    pair of same-typed elements (membership tests against None are invalid),
    which aborted the fallback. It is replaced by an explicit None check.
    """
    # Example elements: [(4.3, 'GET'), ('b', 'GETATTR'), ('a3', 'GET')]
    # We only care about the values in the elements not how to get the values.
    left_path = [i[0] for i in _path_to_elements(left[0], root_element=None)]
    right_path = [i[0] for i in _path_to_elements(right[0], root_element=None)]
    try:
        if left_path < right_path:
            return -1
        elif left_path > right_path:
            return 1
        else:
            return 0
    except TypeError:
        # zip() stops at the shorter path, so only the common prefix is compared.
        for l_elem, r_elem in zip(left_path, right_path):
            if type(l_elem) != type(r_elem) or l_elem is None:
                l_elem = str(l_elem)
                r_elem = str(r_elem)
            try:
                if l_elem < r_elem:
                    return -1
                elif l_elem > r_elem:
                    return 1
            except TypeError:
                continue
    return 0
def _do_item_added(self, items, sort=True, insert=False):
if sort:
# sorting items by their path so that the items with smaller index
# are applied first (unless `sort` is `False` so that order of
# added items is retained, e.g. for dicts).
try:
items = sorted(items.items(), key=self._sort_key_for_item_added)
except TypeError:
items = sorted(items.items(), key=cmp_to_key(self._sort_comparison))
else:
items = items.items()
for path, new_value in items:
elem_and_details = self._get_elements_and_details(path)
if elem_and_details:
elements, parent, parent_to_obj_elem, parent_to_obj_action, obj, elem, action = elem_and_details
else:
continue # pragma: no cover. Due to cPython peephole optimizer, this line doesn't get covered. https://github.com/nedbat/coveragepy/issues/198
# Insert is only true for iterables, make sure it is a valid index.
if(insert and elem < len(obj)): # type: ignore
obj.insert(elem, None) # type: ignore
self._set_new_value(parent, parent_to_obj_elem, parent_to_obj_action,
obj, elements, path, elem, action, new_value)
def _do_values_changed(self):
values_changed = self.diff.get('values_changed')
if values_changed:
self._do_values_or_type_changed(values_changed)
def _do_type_changes(self):
type_changes = self.diff.get('type_changes')
if type_changes:
self._do_values_or_type_changed(type_changes, is_type_change=True)
def _do_post_process(self):
if self.post_process_paths_to_convert:
# Example: We had converted some object to be mutable and now we are converting them back to be immutable.
# We don't need to check the change because it is not really a change that was part of the original diff.
self._do_values_or_type_changed(self.post_process_paths_to_convert, is_type_change=True, verify_changes=False)
def _do_pre_process(self):
    """
    Prepare numpy arrays for item additions/removals.

    Only runs when the delta has numpy paths and contains
    ``iterable_item_added`` or ``iterable_item_removed``. Every numpy path is
    scheduled for an ndarray -> list conversion up front, and, when its dtype
    string can be resolved by ``numpy_dtype_string_to_type``, for the
    list -> dtype conversion during post-processing. An unresolvable dtype is
    reported via ``_raise_or_log``; the up-front conversion stays scheduled.
    """
    if not (self._numpy_paths and ('iterable_item_added' in self.diff or 'iterable_item_removed' in self.diff)):
        return
    to_list_conversions = dict_()
    for path, dtype_name in self._numpy_paths.items():  # type: ignore
        to_list_conversions[path] = {'old_type': np_ndarray, 'new_type': list}
        try:
            resolved_type = numpy_dtype_string_to_type(dtype_name)
        except Exception as e:
            self._raise_or_log(NOT_VALID_NUMPY_TYPE.format(e))
            continue  # pragma: no cover
        self.post_process_paths_to_convert[path] = {'old_type': list, 'new_type': resolved_type}
    if to_list_conversions:
        self._do_values_or_type_changed(to_list_conversions, is_type_change=True)
def _get_elements_and_details(self, path):
    """
    Resolve ``path`` into everything needed to modify its last element.

    Returns the tuple ``(elements, parent, parent_to_obj_elem,
    parent_to_obj_action, obj, elem, action)`` where ``obj`` is the container
    holding the last element and ``parent`` is the container holding ``obj``.
    For a path with a single element there is no parent: the three parent
    fields are None and ``obj`` is the delta itself.

    Returns None when resolving fails (the failure is reported through
    ``_raise_or_log``) or when ``obj`` could not be found.
    """
    try:
        elements = _path_to_elements(path)
        if len(elements) > 1:
            # elements[:-2] is always shorter than elements here, so the keys
            # of the last two elements are always available as hints.
            ancestors = elements[:-2]
            container_key = elements[-2][0]
            leaf_key = elements[-1][0]
            parent = self.get_nested_obj(obj=self, elements=ancestors, next_element=container_key)
            parent_to_obj_elem, parent_to_obj_action = elements[-2]
            obj = self._get_elem_and_compare_to_old_value(
                obj=parent, path_for_err_reporting=path, expected_old_value=None,
                elem=parent_to_obj_elem, action=parent_to_obj_action, next_element=leaf_key)  # type: ignore
        else:
            parent = parent_to_obj_elem = parent_to_obj_action = None
            obj = self
        elem, action = elements[-1]  # type: ignore
        check_elem(elem)
    except Exception as e:
        self._raise_or_log(UNABLE_TO_GET_ITEM_MSG.format(path, e))
        return None
    if obj is not_found:
        return None
    return elements, parent, parent_to_obj_elem, parent_to_obj_action, obj, elem, action
def _do_values_or_type_changed(self, changes, is_type_change=False, verify_changes=True):
    """
    Apply value changes or type changes.

    ``changes`` maps a path to a dict that may contain ``new_value``,
    ``old_value``, ``new_type`` and ``new_path``.

    - ``is_type_change``: when an entry has no ``new_value``, the new value is
      derived by converting the current value with ``new_type``
      (``np_array_factory`` for numpy dtypes). A failed conversion is
      reported and the entry skipped.
    - ``verify_changes``: after setting the value, ``_do_verify_changes`` is
      called with the expected and the current old value.

    The failure message for a failed conversion uses the value that was
    already read (``current_old_value``). Re-reading it with ``obj[elem]``
    raised a second exception inside the handler for attribute paths.
    """
    compare_func_was_used = self.diff.get('_iterable_compare_func_was_used', False)
    for path, value in changes.items():
        # When iterable_compare_func is used, DiffLevel.path() inverts use_t2 for
        # moved items (see model.py DiffLevel.path). This means dict keys here are
        # actually t2 paths and new_path holds the t1 path. Apply at t1 so we
        # don't access indices that don't exist yet or modify the wrong item.
        apply_path = value['new_path'] if (compare_func_was_used and value.get('new_path')) else path
        elem_and_details = self._get_elements_and_details(apply_path)
        if elem_and_details:
            elements, parent, parent_to_obj_elem, parent_to_obj_action, obj, elem, action = elem_and_details
        else:
            continue  # pragma: no cover. Due to cPython peephole optimizer, this line doesn't get covered. https://github.com/nedbat/coveragepy/issues/198
        expected_old_value = value.get('old_value', not_found)
        current_old_value = self._get_elem_and_compare_to_old_value(
            obj=obj, path_for_err_reporting=path, expected_old_value=expected_old_value, elem=elem, action=action)
        if current_old_value is not_found:
            continue  # pragma: no cover. I have not been able to write a test for this case. But we should still check for it.
        # With type change if we could have originally converted the type from old_value
        # to new_value just by applying the class of the new_value, then we might not include the new_value
        # in the delta dictionary. That is defined in Model.DeltaResult._from_tree_type_changes
        if is_type_change and 'new_value' not in value:
            try:
                new_type = value['new_type']
                # in case of Numpy we pass the ndarray plus the dtype in a tuple
                if new_type in numpy_dtypes:
                    new_value = np_array_factory(current_old_value, new_type)
                else:
                    new_value = new_type(current_old_value)
            except Exception as e:
                # Report the value we already hold; indexing obj again could fail.
                self._raise_or_log(TYPE_CHANGE_FAIL_MSG.format(current_old_value, value.get('new_type', 'unknown'), e))
                continue
        else:
            new_value = value['new_value']
        self._set_new_value(parent, parent_to_obj_elem, parent_to_obj_action,
                            obj, elements, path, elem, action, new_value)
        if verify_changes:
            self._do_verify_changes(path, expected_old_value, current_old_value)
def _do_item_removed(self, items):
    """
    Remove every ``path -> expected old value`` entry of ``items`` from the
    object being patched.

    Entries are processed in reverse path order so that a bigger index is
    deleted before a smaller one. If the value at the recorded position is
    missing or different, and the container is a list and no iterable compare
    function was used, the closest position holding the expected value is
    deleted instead. Entries that cannot be located are skipped.
    """
    try:
        ordered = sorted(items.items(), key=self._sort_key_for_item_added, reverse=True)
    except TypeError:
        ordered = sorted(items.items(), key=cmp_to_key(self._sort_comparison), reverse=True)
    for path, expected_old_value in ordered:
        details = self._get_elements_and_details(path)
        if not details:
            continue  # pragma: no cover
        elements, parent, parent_to_obj_elem, parent_to_obj_action, obj, elem, action = details
        current_old_value = not_found
        try:
            if action == GET:
                current_old_value = obj[elem]  # type: ignore
            elif action == GETATTR:
                current_old_value = getattr(obj, elem)
            needs_search = current_old_value != expected_old_value
        except (KeyError, IndexError, AttributeError, TypeError):
            needs_search = True
        if needs_search and isinstance(obj, list) and not self._iterable_compare_func_was_used:
            # May come back as None when the value is nowhere in the list.
            elem = self._find_closest_iterable_element_for_index(obj, elem, expected_old_value)
            if elem is not None:
                current_old_value = expected_old_value
        if current_old_value is not_found or elem is None:
            continue
        self._del_elem(
            parent, parent_to_obj_elem, parent_to_obj_action,
            obj, elements, path, elem, action)
        self._do_verify_changes(path, expected_old_value, current_old_value)
def _find_closest_iterable_element_for_index(self, obj, elem, expected_old_value):
closest_elem = None
closest_distance = float('inf')
for index, value in enumerate(obj):
dist = abs(index - elem)
if dist > closest_distance:
break
if value == expected_old_value and dist < closest_distance:
closest_elem = index
closest_distance = dist
return closest_elem
def _do_iterable_opcodes(self):
_iterable_opcodes = self.diff.get('_iterable_opcodes', {})
if _iterable_opcodes:
for path, opcodes in _iterable_opcodes.items():
transformed = []
# elements = _path_to_elements(path)
elem_and_details = self._get_elements_and_details(path)
if elem_and_details:
elements, parent, parent_to_obj_elem, parent_to_obj_action, obj, elem, action = elem_and_details
if parent is None:
parent = self
obj = self.root
parent_to_obj_elem = 'root'
parent_to_obj_action = GETATTR
else:
continue # pragma: no cover. Due to cPython peephole optimizer, this line doesn't get covered. https://github.com/nedbat/coveragepy/issues/198
# import pytest; pytest.set_trace()
obj = self.get_nested_obj(obj=self, elements=elements)
is_obj_tuple = isinstance(obj, tuple)
for opcode in opcodes:
if opcode.tag == 'replace':
# Replace items in list a[i1:i2] with b[j1:j2]
transformed.extend(opcode.new_values)
elif opcode.tag == 'delete':
# Delete items from list a[i1:i2], so we do nothing here
continue
elif opcode.tag == 'insert':
# Insert items from list b[j1:j2] into the new list
transformed.extend(opcode.new_values)
elif opcode.tag == 'equal':
# Items are the same in both lists, so we add them to the result
transformed.extend(obj[opcode.t1_from_index:opcode.t1_to_index]) # type: ignore
if is_obj_tuple:
obj = tuple(obj) # type: ignore
# Making sure that the object is re-instated inside the parent especially if it was immutable
# and we had to turn it into a mutable one. In such cases the object has a new id.
self._simple_set_elem_value(obj=parent, path_for_err_reporting=path, elem=parent_to_obj_elem,
value=obj, action=parent_to_obj_action)
else:
obj[:] = transformed # type: ignore
# obj = self.get_nested_obj(obj=self, elements=elements)
# for
def _do_iterable_item_removed(self):
iterable_item_removed = self.diff.get('iterable_item_removed', {})
iterable_item_moved = self.diff.get('iterable_item_moved')
if iterable_item_moved:
# These will get added back during items_added
removed_dict = {k: v["value"] for k, v in iterable_item_moved.items()}
iterable_item_removed.update(removed_dict)
if iterable_item_removed:
self._do_item_removed(iterable_item_removed)
def _do_dictionary_item_removed(self):
dictionary_item_removed = self.diff.get('dictionary_item_removed')
if dictionary_item_removed:
self._do_item_removed(dictionary_item_removed)
def _do_attribute_removed(self):
attribute_removed = self.diff.get('attribute_removed')
if attribute_removed:
self._do_item_removed(attribute_removed)
def _do_set_item_added(self):
items = self.diff.get('set_item_added')
if items:
self._do_set_or_frozenset_item(items, func='union')
def _do_set_item_removed(self):
items = self.diff.get('set_item_removed')
if items:
self._do_set_or_frozenset_item(items, func='difference')
def _do_set_or_frozenset_item(self, items, func):
    """
    Apply ``func`` ('union' or 'difference') to the set found at each path.

    ``items`` maps a path to the collection of values to add or remove. The
    result of ``getattr(current_set, func)(values)`` replaces the set in its
    parent. When the parent is a named tuple a modified copy of the parent is
    created with ``_replace`` and becomes the new root (this does not work
    for nested objects).

    When the set cannot be located the lookup returns ``not_found`` after
    reporting the problem; such entries are skipped instead of crashing with
    an AttributeError on the sentinel.
    """
    for path, value in items.items():
        elements = _path_to_elements(path)
        parent = self.get_nested_obj(obj=self, elements=elements[:-1])
        elem, action = elements[-1]
        obj = self._get_elem_and_compare_to_old_value(
            parent, path_for_err_reporting=path, expected_old_value=None, elem=elem, action=action, forced_old_value=set())
        if obj is not_found:
            # Already reported by the lookup; nothing to operate on.
            continue
        new_value = getattr(obj, func)(value)
        if hasattr(parent, '_fields') and hasattr(parent, '_replace'):
            # Handle parent NamedTuple by creating a new instance with _replace(). Will not work with nested objects.
            new_parent = parent._replace(**{elem: new_value})
            self.root = new_parent
        else:
            self._simple_set_elem_value(parent, path_for_err_reporting=path, elem=elem, value=new_value, action=action)
def _do_ignore_order_get_old(self, obj, remove_indexes_per_path, fixed_indexes_values, path_for_err_reporting):
"""
A generator that gets the old values in an iterable when the order was supposed to be ignored.
"""
old_obj_index = -1
max_len = len(obj) - 1
while old_obj_index < max_len:
old_obj_index += 1
current_old_obj = obj[old_obj_index]
if current_old_obj in fixed_indexes_values:
continue
if old_obj_index in remove_indexes_per_path:
expected_obj_to_delete = remove_indexes_per_path.pop(old_obj_index)
if current_old_obj == expected_obj_to_delete:
continue
else:
self._raise_or_log(FAIL_TO_REMOVE_ITEM_IGNORE_ORDER_MSG.format(
old_obj_index, path_for_err_reporting, expected_obj_to_delete, current_old_obj))
yield current_old_obj
def _do_ignore_order(self):
    """
    Rebuild iterables that were diffed with the order ignored.

    Example of the data this works on:

        't1': [5, 1, 1, 1, 6],
        't2': [7, 1, 1, 1, 8],

        'iterable_items_added_at_indexes': {
            'root': {
                0: 7,
                4: 8
            }
        },
        'iterable_items_removed_at_indexes': {
            'root': {
                4: 6,
                0: 5
            }
        }

    The new iterable is filled position by position: a position listed in the
    added-at-indexes map receives that value, any other position receives the
    next surviving old item. Once the old items are used up, the remaining
    added values are reported and appended.
    """
    added_at_indexes = self.diff.get('iterable_items_added_at_indexes', dict_())
    removed_at_indexes = self.diff.get('iterable_items_removed_at_indexes', dict_())
    paths = SetOrdered(added_at_indexes.keys()) | SetOrdered(removed_at_indexes.keys())
    for path in paths:  # type: ignore
        # In the case of ignore_order reports, we are pointing to the container object.
        # Thus we add a [0] to the elements so we can get the required objects and discard what we don't need.
        details = self._get_elements_and_details("{}[0]".format(path))
        if not details:
            continue  # pragma: no cover
        _, parent, parent_to_obj_elem, parent_to_obj_action, obj, _, _ = details
        # Work on copies: entries are popped below and the delta must not change.
        pending_additions = added_at_indexes.get(path, dict_()).copy()
        pending_removals = removed_at_indexes.get(path, dict_()).copy()
        added_values = AnySet(pending_additions.values())
        rebuilt = []
        # Numpy's NdArray does not like the bool function.
        has_old_items = obj.size > 0 if isinstance(obj, np_ndarray) else bool(obj)
        old_items = self._do_ignore_order_get_old(
            obj, pending_removals, added_values, path_for_err_reporting=path)
        while has_old_items or pending_additions:
            position = len(rebuilt)
            if position in pending_additions:
                rebuilt.append(pending_additions.pop(position))
            elif has_old_items:
                try:
                    carried_over = next(old_items)
                except StopIteration:
                    has_old_items = False
                else:
                    rebuilt.append(carried_over)
            else:
                # No old items left and no value for this position:
                # report it and take the first remaining added value.
                self._raise_or_log(INDEXES_NOT_FOUND_WHEN_IGNORE_ORDER.format(pending_additions))
                leftover_key = next(iter(pending_additions))
                rebuilt.append(pending_additions.pop(leftover_key))
        if isinstance(obj, tuple):
            rebuilt = tuple(rebuilt)
        # The rebuilt iterable is a new object, so it must be re-instated inside the parent.
        self._simple_set_elem_value(
            obj=parent, path_for_err_reporting=path, elem=parent_to_obj_elem,
            value=rebuilt, action=parent_to_obj_action)
def _get_reverse_diff(self):
if not self.bidirectional:
raise ValueError('Please recreate the delta with bidirectional=True')
SIMPLE_ACTION_TO_REVERSE = {
'iterable_item_added': 'iterable_item_removed',
'iterable_items_added_at_indexes': 'iterable_items_removed_at_indexes',
'attribute_added': 'attribute_removed',
'set_item_added': 'set_item_removed',
'dictionary_item_added': 'dictionary_item_removed',
}
# Adding the reverse of the dictionary
for key in list(SIMPLE_ACTION_TO_REVERSE.keys()):
SIMPLE_ACTION_TO_REVERSE[SIMPLE_ACTION_TO_REVERSE[key]] = key
r_diff = {}
for action, info in self.diff.items():
reverse_action = SIMPLE_ACTION_TO_REVERSE.get(action)
if reverse_action:
r_diff[reverse_action] = info
elif action == 'values_changed':
r_diff[action] = {}
for path, path_info in info.items():
reverse_path = path_info['new_path'] if path_info.get('new_path') else path
r_diff[action][reverse_path] = {
'new_value': path_info['old_value'], 'old_value': path_info['new_value']
}
elif action == 'type_changes':
r_diff[action] = {}
for path, path_info in info.items():
reverse_path = path_info['new_path'] if path_info.get('new_path') else path
r_diff[action][reverse_path] = {
'old_type': path_info['new_type'], 'new_type': path_info['old_type'],
}
if 'new_value' in path_info:
r_diff[action][reverse_path]['old_value'] = path_info['new_value']
if 'old_value' in path_info:
r_diff[action][reverse_path]['new_value'] = path_info['old_value']
elif action == 'iterable_item_moved':
r_diff[action] = {}
for path, path_info in info.items():
old_path = path_info['new_path']
r_diff[action][old_path] = {
'new_path': path, 'value': path_info['value'],
}
elif action == '_iterable_opcodes':
r_diff[action] = {}
for path, op_codes in info.items():
r_diff[action][path] = []
for op_code in op_codes:
tag = op_code.tag
tag = {'delete': 'insert', 'insert': 'delete'}.get(tag, tag)
new_op_code = Opcode(
tag=tag,
t1_from_index=op_code.t2_from_index,
t1_to_index=op_code.t2_to_index,
t2_from_index=op_code.t1_from_index,
t2_to_index=op_code.t1_to_index,
new_values=op_code.old_values,
old_values=op_code.new_values,
)
r_diff[action][path].append(new_op_code)
return r_diff
def dump(self, file):
    """
    Dump into file object

    Small optimization: a serializer that accepts a ``file_obj`` parameter
    (such as the internal pickle serializer) is handed the file object and
    writes to it directly. Any other serializer is expected to return the
    serialized content from ``self.serializer(self.diff)``, which is then
    written to ``file``.

    The parameters are inspected with ``inspect.signature`` rather than
    ``__code__.co_varnames``: the latter does not exist on callables such as
    ``functools.partial`` objects or instances defining ``__call__``, and it
    also lists local variables, not only parameters. When the signature
    cannot be determined the serializer is treated as not accepting
    ``file_obj``.
    """
    import inspect

    try:
        accepts_file_obj = 'file_obj' in inspect.signature(self.serializer).parameters
    except (TypeError, ValueError):
        # No introspectable signature (e.g. some builtins).
        accepts_file_obj = False
    if accepts_file_obj:
        self.serializer(self.diff, file_obj=file)
    else:
        file.write(self.dumps())
def dumps(self):
    """
    Serialize the delta's diff with ``self.serializer`` and return the
    result, instead of writing it to a file.
    """
    serialize = self.serializer
    return serialize(self.diff)
def to_dict(self):
    """
    Return the diff as a new plain ``dict``.

    The copy is shallow: the top-level mapping is new, its values are the
    same objects the delta holds.
    """
    plain_copy = dict(self.diff)
    return plain_copy
def _flatten_iterable_opcodes(self, _parse_path):
    """
    Convert the ``_iterable_opcodes`` section into a list of FlatDeltaRow.

    One row is produced per opcode, in the stored order. ``_parse_path`` is
    applied to each path; the action comes from
    ``OPCODE_TAG_TO_FLAT_DATA_ACTION`` and the types are those of the
    opcode's new and old values.
    """
    return [
        FlatDeltaRow(
            path=_parse_path(path),
            action=OPCODE_TAG_TO_FLAT_DATA_ACTION[op_code.tag],
            value=op_code.new_values,
            old_value=op_code.old_values,
            type=type(op_code.new_values),
            old_type=type(op_code.old_values),
            new_path=None,
            t1_from_index=op_code.t1_from_index,
            t1_to_index=op_code.t1_to_index,
            t2_from_index=op_code.t2_from_index,
            t2_to_index=op_code.t2_to_index,
        )
        for path, op_codes in self.diff['_iterable_opcodes'].items()
        for op_code in op_codes
    ]
@staticmethod
def _get_flat_row(action, info, _parse_path, keys_and_funcs, report_type_changes=True):
    """
    Generator yielding one FlatDeltaRow per ``path -> details`` entry of ``info``.

    ``keys_and_funcs`` is an iterable of ``(key, new_key, func)``: when
    ``key`` is present in the details its value is copied to the row under
    ``new_key``, passed through ``func`` first when ``func`` is truthy.

    With ``report_type_changes`` the row's ``type`` / ``old_type`` are filled
    in from ``value`` / ``old_value`` when they were not provided.
    """
    for path, details in info.items():
        row = {'path': _parse_path(path), 'action': action}
        for key, new_key, func in keys_and_funcs:
            if key not in details:
                continue
            row[new_key] = func(details[key]) if func else details[key]
        if report_type_changes:
            for value_field, type_field in (('value', 'type'), ('old_value', 'old_type')):
                if value_field in row and type_field not in row:
                    row[type_field] = type(row[value_field])
        yield FlatDeltaRow(**row)
@staticmethod
def _from_flat_rows(flat_rows_list: List[FlatDeltaRow]):
    """
    Build the delta's diff from FlatDeltaRow named tuples.

    Each row is turned into a dict with ``_asdict()`` (lazily) and the result
    is handed to ``Delta._from_flat_dicts``.
    """
    rows_as_dicts = (row._asdict() for row in flat_rows_list)
    return Delta._from_flat_dicts(rows_as_dicts)
@staticmethod
def _from_flat_dicts(flat_dict_list):
    """
    Create the delta's diff object from the flat_dict_list

    Each flat dict must provide a truthy 'action' and a 'path' that is not
    None; otherwise ValueError is raised. 'value', 'old_value', 'new_path',
    'type', 'old_type' and the t1/t2 index keys are read when relevant for
    the action.

    Returns a dict keyed by action (opcode actions are collected under
    '_iterable_opcodes') whose values map string paths to the details of the
    change.

    Note that for the unordered_iterable_item_* actions the last element of
    'path' is popped off, which modifies the path list of the flat dict that
    was passed in.
    """
    result = {}
    # Flat action names that are stored under a different key in the diff.
    FLATTENING_NEW_ACTION_MAP = {
        'unordered_iterable_item_added': 'iterable_items_added_at_indexes',
        'unordered_iterable_item_removed': 'iterable_items_removed_at_indexes',
    }
    for flat_dict in flat_dict_list:
        index = None
        action = flat_dict.get("action")
        path = flat_dict.get("path")
        value = flat_dict.get('value')
        new_path = flat_dict.get('new_path')
        # UnkownValueCode marks "no old value was given" (None may be a real value).
        old_value = flat_dict.get('old_value', UnkownValueCode)
        if not action:
            raise ValueError("Flat dict need to include the 'action'.")
        if path is None:
            raise ValueError("Flat dict need to include the 'path'.")
        if action in FLATTENING_NEW_ACTION_MAP:
            action = FLATTENING_NEW_ACTION_MAP[action]
            # The last path element is the index inside the container;
            # the remaining path points at the container itself.
            index = path.pop()
        # Attribute actions are rooted with attribute access, everything else with item access.
        if action in {
            FlatDataAction.attribute_added,
            FlatDataAction.attribute_removed,
        }:
            root_element = ('root', GETATTR)
        else:
            root_element = ('root', GET)
        if isinstance(path, str):
            path_str = path
        else:
            path_str = stringify_path(path, root_element=root_element)  # We need the string path
        # new_path is only kept when it is set and differs from path.
        if new_path and new_path != path:
            new_path = stringify_path(new_path, root_element=root_element)
        else:
            new_path = None
        if action not in result:
            result[action] = {}
        if action in {
            'iterable_items_added_at_indexes',
            'iterable_items_removed_at_indexes',
        }:
            # Stored as {container path: {index: value}}.
            if path_str not in result[action]:
                result[action][path_str] = {}
            result[action][path_str][index] = value
        elif action in {
            FlatDataAction.set_item_added,
            FlatDataAction.set_item_removed
        }:
            # Stored as {set path: set of values}.
            if path_str not in result[action]:
                result[action][path_str] = set()
            result[action][path_str].add(value)
        elif action in {
            FlatDataAction.dictionary_item_added,
            FlatDataAction.dictionary_item_removed,
            FlatDataAction.attribute_removed,
            FlatDataAction.attribute_added,
            FlatDataAction.iterable_item_added,
            FlatDataAction.iterable_item_removed,
        }:
            # Stored as {path: value}.
            result[action][path_str] = value
        elif action == 'values_changed':
            if old_value == UnkownValueCode:
                result[action][path_str] = {'new_value': value}
            else:
                result[action][path_str] = {'new_value': value, 'old_value': old_value}
        elif action == 'type_changes':
            type_ = flat_dict.get('type', UnkownValueCode)
            old_type = flat_dict.get('old_type', UnkownValueCode)
            result[action][path_str] = {'new_value': value}
            # Only copy over the details that were actually provided.
            for elem, elem_value in [
                ('new_type', type_),
                ('old_type', old_type),
                ('old_value', old_value),
            ]:
                if elem_value != UnkownValueCode:
                    result[action][path_str][elem] = elem_value
        elif action == FlatDataAction.iterable_item_moved:
            result[action][path_str] = {'value': value}
        elif action in {
            FlatDataAction.iterable_items_inserted,
            FlatDataAction.iterable_items_deleted,
            FlatDataAction.iterable_items_replaced,
            FlatDataAction.iterable_items_equal,
        }:
            # Opcode rows are collected per path, in the order they arrive.
            if '_iterable_opcodes' not in result:
                result['_iterable_opcodes'] = {}
            if path_str not in result['_iterable_opcodes']:
                result['_iterable_opcodes'][path_str] = []
            result['_iterable_opcodes'][path_str].append(
                Opcode(
                    tag=FLAT_DATA_ACTION_TO_OPCODE_TAG[action],  # type: ignore
                    t1_from_index=flat_dict.get('t1_from_index'),
                    t1_to_index=flat_dict.get('t1_to_index'),
                    t2_from_index=flat_dict.get('t2_from_index'),
                    t2_to_index=flat_dict.get('t2_to_index'),
                    new_values=flat_dict.get('value'),
                    old_values=flat_dict.get('old_value'),
                )
            )
        # NOTE(review): this assumes the entry stored for path_str is a dict,
        # which holds for values_changed, type_changes and iterable_item_moved
        # above; confirm new_path is never supplied for the other actions.
        if new_path:
            result[action][path_str]['new_path'] = new_path
    return result
def to_flat_dicts(self, include_action_in_path=False, report_type_changes=True) -> List[FlatDeltaDict]:
    """
    Flatten the delta into a list of plain dictionaries, one per change.

    This is the dictionary flavour of to_flat_rows: every row produced by
    to_flat_rows is converted with its _asdict() method.

    For example:
        {'iterable_item_added': {'root[3]': 5, 'root[2]': 3}}

    Becomes:
        [
            {'path': [3], 'value': 5, 'action': 'iterable_item_added'},
            {'path': [2], 'value': 3, 'action': 'iterable_item_added'},
        ]

    **Parameters**

    include_action_in_path : Boolean, default=False
        When False, DeepDiff paths such as root[3].attribute1 are translated into [3, 'attribute1'].
        When True, each path element also carries the action needed to retrieve it:
        [(3, 'GET'), ('attribute1', 'GETATTR')]
        Note that this "action" only describes the "path" output. It is different from the
        'action' field reported for each flat dictionary.

    report_type_changes : Boolean, default=True
        If False, type changes are not reported as such; they are reported as value changes instead.

    Example:
        t1 = {"a": None}
        t2 = {"a": 1}
        dump = Delta(DeepDiff(t1, t2)).dumps()
        delta = Delta(dump)
        assert t2 == delta + t1

        flat_result = delta.to_flat_dicts()
        flat_expected = [{'path': ['a'], 'action': 'type_changes', 'value': 1, 'new_type': int, 'old_type': type(None)}]
        assert flat_expected == flat_result

        flat_result2 = delta.to_flat_dicts(report_type_changes=False)
        flat_expected2 = [{'path': ['a'], 'action': 'values_changed', 'value': 1}]

    **List of actions**

    These are the actions that a flat dictionary can carry:

        iterable_item_added
        iterable_item_removed
        iterable_item_moved
        values_changed
        type_changes
        set_item_added
        set_item_removed
        dictionary_item_added
        dictionary_item_removed
        attribute_added
        attribute_removed
    """
    flat_rows = self.to_flat_rows(
        include_action_in_path=include_action_in_path,
        report_type_changes=report_type_changes,
    )
    as_dicts = [flat_row._asdict() for flat_row in flat_rows]
    return cast(List[FlatDeltaDict], as_dicts)
def to_flat_rows(self, include_action_in_path=False, report_type_changes=True) -> List[FlatDeltaRow]:
    """
    Just like to_flat_dicts but returns FlatDeltaRow Named Tuples

    **Parameters**

    include_action_in_path : Boolean, default=False
        When True, paths are parsed with include_actions=True so every path element
        also carries the action needed to retrieve it.

    report_type_changes : Boolean, default=True
        When True, each row also carries type information.
        When False, 'type_changes' entries are reported as 'values_changed' rows.

    **Raises**

    ValueError when report_type_changes is False and the delta was not created
    with always_include_values=True.

    Calling this method does not modify self.diff, so repeated calls give the same rows.
    """
    result = []
    if include_action_in_path:
        _parse_path = partial(parse_path, include_actions=True)
    else:
        _parse_path = parse_path
    if report_type_changes:
        keys_and_funcs = [
            ('value', 'value', None),
            ('new_value', 'value', None),
            ('old_value', 'old_value', None),
            ('new_type', 'type', None),
            ('old_type', 'old_type', None),
            ('new_path', 'new_path', _parse_path),
        ]
    else:
        if not self.always_include_values:
            raise ValueError(
                "When converting to flat dictionaries, if report_type_changes=False and there are type changes, "
                "you must set the always_include_values=True at the delta object creation. Otherwise there is nothing to include."
            )
        keys_and_funcs = [
            ('value', 'value', None),
            ('new_value', 'value', None),
            ('old_value', 'old_value', None),
            ('new_path', 'new_path', _parse_path),
        ]

    FLATTENING_NEW_ACTION_MAP = {
        'iterable_items_added_at_indexes': 'unordered_iterable_item_added',
        'iterable_items_removed_at_indexes': 'unordered_iterable_item_removed',
    }

    def _build_row(path, value, row_action):
        # The type of the value is only attached when type changes are reported.
        if report_type_changes:
            return FlatDeltaRow(path=path, value=value, action=row_action, type=type(value))  # type: ignore
        return FlatDeltaRow(path=path, value=value, action=row_action)  # type: ignore

    for action, info in self.diff.items():
        if action == '_iterable_opcodes':
            result.extend(self._flatten_iterable_opcodes(_parse_path=_parse_path))
            continue
        if action.startswith('_'):
            continue
        if action in FLATTENING_NEW_ACTION_MAP:
            new_action = FLATTENING_NEW_ACTION_MAP[action]
            for path, index_to_value in info.items():
                path = _parse_path(path)
                for index, value in index_to_value.items():
                    # Each index gets its own copy of the parent path.
                    path2 = path.copy()
                    if include_action_in_path:
                        path2.append((index, 'GET'))  # type: ignore
                    else:
                        path2.append(index)
                    result.append(_build_row(path2, value, new_action))
        elif action in {'set_item_added', 'set_item_removed'}:
            for path, values in info.items():
                path = _parse_path(path)
                for value in values:
                    result.append(_build_row(path, value, action))
        elif action == 'dictionary_item_added':
            for path, value in info.items():
                path = _parse_path(path)
                # The action is decided per item. Reassigning the outer `action` here would
                # leak the override into every following path of this loop.
                row_action = action
                if isinstance(value, dict) and len(value) == 1:
                    new_key = next(iter(value))
                    path.append(new_key)
                    value = value[new_key]
                elif isinstance(value, (list, tuple)) and len(value) == 1:
                    value = value[0]
                    path.append(0)  # type: ignore
                    row_action = 'iterable_item_added'
                elif isinstance(value, set) and len(value) == 1:
                    # Read the single element without pop(): the set belongs to self.diff
                    # and must not be emptied by a read-only conversion.
                    value = next(iter(value))
                    row_action = 'set_item_added'
                result.append(_build_row(path, value, row_action))
        elif action in {
            'dictionary_item_removed', 'iterable_item_added',
            'iterable_item_removed', 'attribute_removed', 'attribute_added'
        }:
            for path, value in info.items():
                path = _parse_path(path)
                result.append(_build_row(path, value, action))
        else:
            # 'type_changes' is downgraded to 'values_changed' when type changes are not reported.
            # Every remaining action goes through the generic row builder unchanged.
            if action == 'type_changes' and not report_type_changes:
                action = 'values_changed'
            result.extend(self._get_flat_row(
                action=action,
                info=info,
                _parse_path=_parse_path,
                keys_and_funcs=keys_and_funcs,
                report_type_changes=report_type_changes,
            ))
    return result
if __name__ == "__main__":  # pragma: no cover
    # Run the docstring examples of this module when it is executed as a script.
    from doctest import testmod
    testmod()
qlustered-deepdiff-41c7265/deepdiff/diff.py 0000775 0000000 0000000 00000274767 15162412645 0020633 0 ustar 00root root 0000000 0000000 #!/usr/bin/env python
# In order to run the docstrings:
# python3 -m deepdiff.diff
# You might need to run it many times since dictionaries come in different orders
# every time you run the docstrings.
# However the docstring expects it in a specific order in order to pass!
import difflib
import logging
import types
import datetime
import uuid
from enum import Enum
from copy import deepcopy
from math import isclose as is_close
from typing import List, Dict, Callable, Union, Any, Pattern, Tuple, Optional, Set, FrozenSet, TYPE_CHECKING, Protocol, Literal
from collections.abc import Mapping, Iterable, Sequence
from collections import defaultdict
from inspect import getmembers
from itertools import zip_longest
from functools import lru_cache
from deepdiff.helper import (strings, bytes_type, numbers, uuids, ListItemRemovedOrAdded, notpresent,
IndexedHash, unprocessed, add_to_frozen_set, basic_types,
convert_item_or_items_into_set_else_none, get_type,
convert_item_or_items_into_compiled_regexes_else_none,
type_is_subclass_of_type_group, type_in_type_group, get_doc,
number_to_string, datetime_normalize, KEY_TO_VAL_STR, booleans,
np_ndarray, np_floating, get_numpy_ndarray_rows, RepeatedTimer,
TEXT_VIEW, TREE_VIEW, DELTA_VIEW, COLORED_VIEW, COLORED_COMPACT_VIEW,
detailed__dict__, add_root_to_paths,
np, get_truncate_datetime, dict_, CannotCompare, ENUM_INCLUDE_KEYS,
PydanticBaseModel, Opcode, SetOrdered, ipranges)
from deepdiff.serialization import SerializationMixin
from deepdiff.distance import DistanceMixin, logarithmic_similarity
from deepdiff.model import (
RemapDict, ResultDict, TextResult, TreeResult, DiffLevel,
DictRelationship, AttributeRelationship, REPORT_KEYS,
SubscriptableIterableRelationship, NonSubscriptableIterableRelationship,
SetRelationship, NumpyArrayRelationship, CUSTOM_FIELD,
FORCE_DEFAULT,
)
from deepdiff.deephash import DeepHash, combine_hashes_lists
from deepdiff.base import Base
from deepdiff.lfucache import LFUCache, DummyLFU
from deepdiff.colored_view import ColoredView
if TYPE_CHECKING:
from pytz.tzinfo import BaseTzInfo
# Module-level logger; also the default target of DeepDiff's progress_logger.
logger = logging.getLogger(__name__)

# Message templates. The {} placeholder is meant to receive the limit that was reached.
MAX_PASSES_REACHED_MSG = (
    'DeepDiff has reached the max number of passes of {}. '
    'You can possibly get more accurate results by increasing the max_passes parameter.')
MAX_DIFFS_REACHED_MSG = (
    'DeepDiff has reached the max number of diffs of {}. '
    'You can possibly get more accurate results by increasing the max_diffs parameter.')

# IndexedHash wrapping the `notpresent` sentinel at index 0.
# NOTE(review): presumably a placeholder for a missing item when pairing hashed items — confirm at the usage site.
notpresent_indexed = IndexedHash(indexes=[0], item=notpresent)

# Text loaded through get_doc; assigned to DeepDiff.__doc__.
doc = get_doc('diff_doc.rst')

# Template used by _report_progress: duration, pass count, diff count.
PROGRESS_MSG = "DeepDiff {} seconds in progress. Pass #{}, Diff #{}"
def _report_progress(_stats: Dict[str, Any], progress_logger: Callable[[str], None], duration: float) -> None:
    """
    Emit one progress line through progress_logger.

    The line is PROGRESS_MSG filled with the elapsed duration and the current
    pass and diff counters read from _stats.
    """
    passes_so_far = _stats[PASSES_COUNT]
    diffs_so_far = _stats[DIFF_COUNT]
    message = PROGRESS_MSG.format(duration, passes_so_far, diffs_so_far)
    progress_logger(message)
# Keys of the DeepDiff._stats dictionary.
DISTANCE_CACHE_HIT_COUNT = 'DISTANCE CACHE HIT COUNT'
DIFF_COUNT = 'DIFF COUNT'
PASSES_COUNT = 'PASSES COUNT'
MAX_PASS_LIMIT_REACHED = 'MAX PASS LIMIT REACHED'
MAX_DIFF_LIMIT_REACHED = 'MAX DIFF LIMIT REACHED'
DISTANCE_CACHE_ENABLED = 'DISTANCE CACHE ENABLED'
PREVIOUS_DIFF_COUNT = 'PREVIOUS DIFF COUNT'
PREVIOUS_DISTANCE_CACHE_HIT_COUNT = 'PREVIOUS DISTANCE CACHE HIT COUNT'

# Error message texts.
CANT_FIND_NUMPY_MSG = 'Unable to import numpy. This must be a bug in DeepDiff since a numpy array is detected.'
INVALID_VIEW_MSG = "view parameter must be one of 'text', 'tree', 'delta', 'colored' or 'colored_compact'. But {} was passed."
CUTOFF_RANGE_ERROR_MSG = 'cutoff_distance_for_pairs needs to be a positive float max 1.'
VERBOSE_LEVEL_RANGE_MSG = 'verbose_level should be 0, 1, or 2.'
PURGE_LEVEL_RANGE_MSG = 'cache_purge_level should be 0, 1, or 2.'

# Key under which the root DeepDiff stores cache_tuning_sample_size * 10 in its shared parameters.
_ENABLE_CACHE_EVERY_X_DIFF = '_ENABLE_CACHE_EVERY_X_DIFF'

# Keys ignored when a pydantic object is converted to a dictionary in _diff_obj.
model_fields_set = frozenset(["model_fields_set"])

# What is the threshold to consider 2 items to be pairs. Only used when ignore_order = True.
CUTOFF_DISTANCE_FOR_PAIRS_DEFAULT = 0.3

# What is the threshold to calculate pairs of items between 2 iterables.
# For example 2 iterables that have nothing in common, do not need their pairs to be calculated.
CUTOFF_INTERSECTION_FOR_PAIRS_DEFAULT = 0.7

# Names of the DeepDiff parameters that _get_deephash_params copies into the DeepHash parameters.
DEEPHASH_PARAM_KEYS = (
    'exclude_types',
    'exclude_paths',
    'include_paths',
    'exclude_regex_paths',
    'hasher',
    'significant_digits',
    'number_format_notation',
    'ignore_string_type_changes',
    'ignore_numeric_type_changes',
    'ignore_uuid_types',
    'use_enum_value',
    'ignore_type_in_groups',
    'ignore_type_subclasses',
    'ignore_string_case',
    'exclude_obj_callback',
    'ignore_private_variables',
    'encodings',
    'ignore_encoding_errors',
    'default_timezone',
    'custom_operators',
)
class DeepDiffProtocol(Protocol):
    """
    Structural description of attributes exposed by a DeepDiff object.

    DeepDiff lists this protocol among its base classes.
    NOTE(review): presumably this lets the mixins be type-checked against these attributes — confirm.
    """
    t1: Any  # first object of the comparison
    t2: Any  # second object of the comparison
    cutoff_distance_for_pairs: float
    use_log_scale: bool
    log_scale_similarity_threshold: float
    view: str
    math_epsilon: Optional[float]
class DeepDiff(ResultDict, SerializationMixin, DistanceMixin, DeepDiffProtocol, Base):
__doc__ = doc
CACHE_AUTO_ADJUST_THRESHOLD = 0.25
def __init__(self,
             t1: Any,
             t2: Any,
             _original_type: Optional[Any]=None,
             cache_purge_level: int=1,
             cache_size: int=0,
             cache_tuning_sample_size: int=0,
             custom_operators: Optional[List[Any]] =None,
             cutoff_distance_for_pairs: float=CUTOFF_DISTANCE_FOR_PAIRS_DEFAULT,
             cutoff_intersection_for_pairs: float=CUTOFF_INTERSECTION_FOR_PAIRS_DEFAULT,
             default_timezone:Union[datetime.timezone, "BaseTzInfo"]=datetime.timezone.utc,
             encodings: Optional[List[str]]=None,
             exclude_obj_callback: Optional[Callable]=None,
             exclude_obj_callback_strict: Optional[Callable]=None,
             exclude_paths: Union[str, List[str], Set[str], FrozenSet[str], None]=None,
             exclude_regex_paths: Union[str, List[str], Pattern[str], List[Pattern[str]], None]=None,
             exclude_types: Optional[List[type]]=None,
             get_deep_distance: bool=False,
             group_by: Union[str, Tuple[str, str], Callable, None]=None,
             group_by_sort_key: Union[str, Callable, None]=None,
             hasher: Optional[Callable]=None,
             hashes: Optional[Dict[Any, Any]]=None,
             ignore_encoding_errors: bool=False,
             ignore_nan_inequality: bool=False,
             ignore_numeric_type_changes: bool=False,
             ignore_order: bool=False,
             ignore_order_func: Optional[Callable]=None,
             ignore_private_variables: bool=True,
             ignore_string_case: bool=False,
             ignore_string_type_changes: bool=False,
             ignore_type_in_groups: Optional[List[Tuple[Any, ...]]]=None,
             ignore_type_subclasses: bool=False,
             ignore_uuid_types: bool=False,
             include_obj_callback: Optional[Callable]=None,
             include_obj_callback_strict: Optional[Callable]=None,
             include_paths: Union[str, List[str], None]=None,
             iterable_compare_func: Optional[Callable]=None,
             log_frequency_in_sec: int=0,
             log_scale_similarity_threshold: float=0.1,
             log_stacktrace: bool=False,
             math_epsilon: Optional[float]=None,
             max_diffs: Optional[int]=None,
             max_passes: int=10000000,
             number_format_notation: Literal["f", "e"]="f",
             number_to_string_func: Optional[Callable]=None,
             progress_logger: Callable[[str], None]=logger.info,
             report_repetition: bool=False,
             significant_digits: Optional[int]=None,
             threshold_to_diff_deeper: float = 0.33,
             truncate_datetime: Optional[str]=None,
             use_enum_value: bool=False,
             use_log_scale: bool=False,
             verbose_level: int=1,
             view: str=TEXT_VIEW,
             zip_ordered_iterables: bool=False,
             _parameters: Optional[Dict[str, Any]]=None,
             _shared_parameters: Optional[Dict[str, Any]]=None,
             **kwargs):
    """
    Compare t1 and t2 and populate this object with the result in the requested view.

    When _parameters is given, the already cleaned parameters of a previous pass are
    loaded as-is and the normalization of the individual keyword arguments is skipped.
    When _shared_parameters is given, this instance is a non-root pass and reuses the
    shared state of the root (hashes, stats, distance cache, ...).

    Raises ValueError for unknown keyword arguments, for a verbose_level or
    cache_purge_level outside of 0, 1, 2 and for a cutoff_distance_for_pairs outside of [0, 1].
    """
    super().__init__()
    if kwargs:
        raise ValueError((
            "The following parameter(s) are not valid: %s\n"
            "The valid parameters are ignore_order, report_repetition, significant_digits, "
            "number_format_notation, exclude_paths, include_paths, exclude_types, exclude_regex_paths, ignore_type_in_groups, "
            "ignore_string_type_changes, ignore_numeric_type_changes, ignore_type_subclasses, ignore_uuid_types, truncate_datetime, "
            "ignore_private_variables, ignore_nan_inequality, number_to_string_func, verbose_level, "
            "view, hasher, hashes, max_passes, max_diffs, zip_ordered_iterables, "
            "cutoff_distance_for_pairs, cutoff_intersection_for_pairs, log_frequency_in_sec, cache_size, "
            "cache_tuning_sample_size, get_deep_distance, group_by, group_by_sort_key, cache_purge_level, log_stacktrace, "
            "math_epsilon, iterable_compare_func, use_enum_value, _original_type, threshold_to_diff_deeper, default_timezone, "
            "ignore_order_func, custom_operators, encodings, ignore_encoding_errors, use_log_scale, log_scale_similarity_threshold, "
            "_parameters and _shared_parameters.") % ', '.join(kwargs.keys()))

    if _parameters:
        self.__dict__.update(_parameters)
    else:
        self.custom_operators = custom_operators or []
        self.ignore_order = ignore_order

        self.ignore_order_func = ignore_order_func

        ignore_type_in_groups = ignore_type_in_groups or []
        if numbers == ignore_type_in_groups or numbers in ignore_type_in_groups:
            ignore_numeric_type_changes = True
        self.ignore_numeric_type_changes = ignore_numeric_type_changes
        if strings == ignore_type_in_groups or strings in ignore_type_in_groups:
            ignore_string_type_changes = True
        # Handle ignore_uuid_types - check if uuid+str group is already in ignore_type_in_groups
        uuid_str_group = (uuids[0], str)
        if uuid_str_group == ignore_type_in_groups or uuid_str_group in ignore_type_in_groups:
            ignore_uuid_types = True
        self.ignore_uuid_types = ignore_uuid_types
        self.use_enum_value = use_enum_value
        self.log_scale_similarity_threshold = log_scale_similarity_threshold
        self.use_log_scale = use_log_scale
        self.default_timezone = default_timezone
        self.log_stacktrace = log_stacktrace
        self.threshold_to_diff_deeper = threshold_to_diff_deeper
        self.ignore_string_type_changes = ignore_string_type_changes
        self.ignore_type_in_groups = self.get_ignore_types_in_groups(
            ignore_type_in_groups=ignore_type_in_groups,
            ignore_string_type_changes=ignore_string_type_changes,
            ignore_numeric_type_changes=ignore_numeric_type_changes,
            ignore_type_subclasses=ignore_type_subclasses,
            ignore_uuid_types=ignore_uuid_types)
        self.report_repetition = report_repetition
        self.exclude_paths = add_root_to_paths(convert_item_or_items_into_set_else_none(exclude_paths))
        self.include_paths = add_root_to_paths(convert_item_or_items_into_set_else_none(include_paths))
        self.exclude_regex_paths = convert_item_or_items_into_compiled_regexes_else_none(exclude_regex_paths)
        self.exclude_types = set(exclude_types) if exclude_types else None
        self.exclude_types_tuple = tuple(exclude_types) if exclude_types else None  # we need tuple for checking isinstance
        self.ignore_type_subclasses = ignore_type_subclasses
        self.type_check_func = type_in_type_group if ignore_type_subclasses else type_is_subclass_of_type_group
        self.ignore_string_case = ignore_string_case
        self.exclude_obj_callback = exclude_obj_callback
        self.exclude_obj_callback_strict = exclude_obj_callback_strict
        self.include_obj_callback = include_obj_callback
        self.include_obj_callback_strict = include_obj_callback_strict
        self.number_to_string = number_to_string_func or number_to_string
        self.iterable_compare_func = iterable_compare_func
        self.zip_ordered_iterables = zip_ordered_iterables
        self.ignore_private_variables = ignore_private_variables
        self.ignore_nan_inequality = ignore_nan_inequality
        self.hasher = hasher
        self.cache_tuning_sample_size = cache_tuning_sample_size
        self.group_by = group_by
        if callable(group_by_sort_key):
            self.group_by_sort_key = group_by_sort_key
        elif group_by_sort_key:
            def _group_by_sort_key(x):
                return x[group_by_sort_key]
            self.group_by_sort_key = _group_by_sort_key
        else:
            self.group_by_sort_key = None
        self.encodings = encodings
        self.ignore_encoding_errors = ignore_encoding_errors

        self.significant_digits = self.get_significant_digits(significant_digits, ignore_numeric_type_changes)
        self.math_epsilon = math_epsilon
        if self.math_epsilon is not None and self.ignore_order:
            logger.warning("math_epsilon in conjunction with ignore_order=True is only used for flat object comparisons. Custom math_epsilon will not have an effect when comparing nested objects.")
        self.truncate_datetime = get_truncate_datetime(truncate_datetime)
        self.number_format_notation = number_format_notation
        if verbose_level in {0, 1, 2}:
            self.verbose_level = verbose_level
        else:
            raise ValueError(VERBOSE_LEVEL_RANGE_MSG)
        if cache_purge_level not in {0, 1, 2}:
            raise ValueError(PURGE_LEVEL_RANGE_MSG)
        self.view = view
        # Setting up the cache for dynamic programming. One dictionary per instance of root of DeepDiff running.
        self.max_passes = max_passes
        self.max_diffs = max_diffs
        self.cutoff_distance_for_pairs = float(cutoff_distance_for_pairs)
        self.cutoff_intersection_for_pairs = float(cutoff_intersection_for_pairs)
        if self.cutoff_distance_for_pairs < 0 or self.cutoff_distance_for_pairs > 1:
            raise ValueError(CUTOFF_RANGE_ERROR_MSG)
        # _Parameters are the clean _parameters to initialize DeepDiff with so we avoid all the above
        # cleaning functionalities when running DeepDiff recursively.
        # However DeepHash has its own set of _parameters that are slightly different than DeepDIff.
        # DeepDiff _parameters are transformed to DeepHash _parameters via _get_deephash_params method.
        self.progress_logger = progress_logger
        self.cache_size = cache_size
        # Snapshot of the attributes set so far. Attributes assigned after this line are not part of it.
        _parameters = self.__dict__.copy()
        _parameters['group_by'] = None  # overwriting since these parameters will be passed on to other passes.

    # Passes built from _parameters receive the default log_stacktrace argument, so the
    # stored attribute (copied from the root's parameters) decides which logger method is used.
    # The argument is only the fallback when the attribute is absent.
    if getattr(self, 'log_stacktrace', log_stacktrace):
        self.log_err = logger.exception
    else:
        self.log_err = logger.error

    # Non-Root
    if _shared_parameters:
        self.is_root = False
        self._shared_parameters = _shared_parameters
        self.__dict__.update(_shared_parameters)
        # We are in some pass other than root
        progress_timer = None
    # Root
    else:
        self.is_root = True
        # Caching the DeepDiff results for dynamic programming
        self._distance_cache = LFUCache(cache_size) if cache_size else DummyLFU()
        self._stats = {
            PASSES_COUNT: 0,
            DIFF_COUNT: 0,
            DISTANCE_CACHE_HIT_COUNT: 0,
            PREVIOUS_DIFF_COUNT: 0,
            PREVIOUS_DISTANCE_CACHE_HIT_COUNT: 0,
            MAX_PASS_LIMIT_REACHED: False,
            MAX_DIFF_LIMIT_REACHED: False,
            DISTANCE_CACHE_ENABLED: bool(cache_size),
        }
        self.hashes = dict_() if hashes is None else hashes
        self._numpy_paths = dict_()  # if _numpy_paths is None else _numpy_paths
        self.group_by_keys = set()  # Track keys that originated from group_by operations
        self._shared_parameters = {
            'hashes': self.hashes,
            '_stats': self._stats,
            '_distance_cache': self._distance_cache,
            'group_by_keys': self.group_by_keys,
            '_numpy_paths': self._numpy_paths,
            _ENABLE_CACHE_EVERY_X_DIFF: self.cache_tuning_sample_size * 10,
        }
        if log_frequency_in_sec:
            # Creating a progress log reporter that runs in a separate thread every log_frequency_in_sec seconds.
            progress_timer = RepeatedTimer(log_frequency_in_sec, _report_progress, self._stats, progress_logger)
        else:
            progress_timer = None

    self._parameters = _parameters
    self.deephash_parameters = self._get_deephash_params()
    self.tree = TreeResult()
    self._iterable_opcodes = {}
    if group_by and self.is_root:
        # Grouping is best effort: when either side can not be grouped, both sides are compared ungrouped.
        try:
            original_t1 = t1
            t1 = self._group_iterable_to_dict(t1, group_by, item_name='t1')
        except (KeyError, ValueError):
            pass
        else:
            try:
                t2 = self._group_iterable_to_dict(t2, group_by, item_name='t2')
            except (KeyError, ValueError):
                t1 = original_t1

    self.t1 = t1
    self.t2 = t2

    try:
        root = DiffLevel(t1, t2, verbose_level=self.verbose_level)
        # _original_type is only used to pass the original type of the data. Currently only used for numpy arrays.
        # The reason is that we convert the numpy array to python list and then later for distance calculations
        # we convert only the the last dimension of it into numpy arrays.
        self._diff(root, parents_ids=frozenset({id(t1)}), _original_type=_original_type)

        if get_deep_distance and view in {TEXT_VIEW, TREE_VIEW}:
            self.tree['deep_distance'] = self._get_rough_distance()

        self.tree.remove_empty_keys()
        view_results = self._get_view_results(self.view)
        if isinstance(view_results, ColoredView):
            self.update(view_results.tree)
            self._colored_view = view_results
        else:
            self.update(view_results)
    finally:
        # The root pass drops its working state, whether or not the diff succeeded.
        if self.is_root:
            if cache_purge_level:
                del self._distance_cache
                del self.hashes
            del self._shared_parameters
            del self._parameters
            for key in (PREVIOUS_DIFF_COUNT, PREVIOUS_DISTANCE_CACHE_HIT_COUNT,
                        DISTANCE_CACHE_ENABLED):
                del self._stats[key]
            if progress_timer:
                duration = progress_timer.stop()
                self._stats['DURATION SEC'] = duration
                logger.info('stats {}'.format(self.get_stats()))
            if cache_purge_level == 2:
                self.__dict__.clear()
def _get_deephash_params(self):
    """
    Build the keyword arguments handed over to DeepHash.

    Every key of DEEPHASH_PARAM_KEYS is copied from self._parameters. Then
    'ignore_repetition' (the inverse of report_repetition) and
    'number_to_string_func' are added.
    """
    source = self._parameters
    deephash_params = {}
    for name in DEEPHASH_PARAM_KEYS:
        deephash_params[name] = source[name]
    deephash_params['ignore_repetition'] = not self.report_repetition
    deephash_params['number_to_string_func'] = self.number_to_string
    return deephash_params
def _report_result(self, report_type, change_level, local_tree=None):
"""
Add a detected change to the reference-style result dictionary.
report_type will be added to level.
(We'll create the text-style report from there later.)
:param report_type: A well defined string key describing the type of change.
Examples: "set_item_added", "values_changed"
:param change_level: A DiffLevel object describing the objects in question in their
before-change and after-change object structure.
:local_tree: None
"""
if not self._skip_this(change_level):
change_level.report_type = report_type
tree = self.tree if local_tree is None else local_tree
tree[report_type].add(change_level)
def custom_report_result(self, report_type, level, extra_info=None):
    """
    Record a custom change in the tree-style result.

    Unless _skip_this rejects the level, report_type is stored on the level,
    extra_info is stored in level.additional under CUSTOM_FIELD, and the level
    is added to the report_type bucket of self.tree.
    (The text-style report is created from the tree later.)

    :param report_type: A well defined string key describing the type of change.
                        Examples: "set_item_added", "values_changed"
    :param level: A DiffLevel object describing the objects in question in their
                  before-change and after-change object structure.
    :param extra_info: A dict that describes this result

    :rtype: None
    """
    if self._skip_this(level):
        return
    level.report_type = report_type
    level.additional[CUSTOM_FIELD] = extra_info
    bucket = self.tree[report_type]
    bucket.add(level)
@staticmethod
def _dict_from_slots(object: Any) -> Dict[str, Any]:
def unmangle(attribute: str) -> str:
if attribute.startswith('__') and attribute != '__weakref__':
return '_{type}{attribute}'.format(
type=type(object).__name__,
attribute=attribute
)
return attribute
all_slots = []
if isinstance(object, type):
mro = object.__mro__ # pragma: no cover. I have not been able to write a test for this case. But we still check for it.
else:
mro = object.__class__.__mro__
for type_in_mro in mro:
slots = getattr(type_in_mro, '__slots__', None)
if slots:
if isinstance(slots, strings):
all_slots.append(slots)
else:
all_slots.extend(slots)
return {i: getattr(object, key) for i in all_slots if hasattr(object, key := unmangle(i))}
def _diff_enum(self, level: Any, parents_ids: FrozenSet[int]=frozenset(), local_tree: Optional[Any]=None) -> None:
    """
    Difference of 2 enum members.

    Both sides are converted with detailed__dict__, restricted to ENUM_INCLUDE_KEYS,
    and then compared as attributes through _diff_dict.
    """
    left_view, right_view = (
        detailed__dict__(side, include_keys=ENUM_INCLUDE_KEYS)
        for side in (level.t1, level.t2)
    )
    comparison_options = dict(
        print_as_attribute=True,
        override=True,
        override_t1=left_view,
        override_t2=right_view,
        local_tree=local_tree,
    )
    self._diff_dict(level, parents_ids, **comparison_options)
def _diff_obj(self, level: Any, parents_ids: FrozenSet[int]=frozenset(), is_namedtuple: bool=False, local_tree: Optional[Any]=None, is_pydantic_object: bool=False) -> None:
"""Difference of 2 objects"""
processing_error = False
t1: Optional[Dict[str, Any]] = None
t2: Optional[Dict[str, Any]] = None
try:
if is_namedtuple:
t1 = level.t1._asdict()
t2 = level.t2._asdict()
elif is_pydantic_object:
t1 = detailed__dict__(level.t1, ignore_private_variables=self.ignore_private_variables, ignore_keys=model_fields_set)
t2 = detailed__dict__(level.t2, ignore_private_variables=self.ignore_private_variables, ignore_keys=model_fields_set)
elif all('__dict__' in dir(t) for t in level):
t1 = detailed__dict__(level.t1, ignore_private_variables=self.ignore_private_variables)
t2 = detailed__dict__(level.t2, ignore_private_variables=self.ignore_private_variables)
elif all('__slots__' in dir(t) for t in level):
t1 = self._dict_from_slots(level.t1)
t2 = self._dict_from_slots(level.t2)
else:
t1 = {k: v for k, v in getmembers(level.t1) if not callable(v)}
t2 = {k: v for k, v in getmembers(level.t2) if not callable(v)}
except AttributeError:
processing_error = True
if processing_error is True or t1 is None or t2 is None:
self._report_result('unprocessed', level, local_tree=local_tree)
return
self._diff_dict(
level,
parents_ids,
print_as_attribute=True,
override=True,
override_t1=t1,
override_t2=t2,
local_tree=local_tree,
)
def _skip_this(self, level: Any) -> bool:
"""
Check whether this comparison should be skipped because one of the objects to compare meets exclusion criteria.
:rtype: bool
"""
level_path = level.path()
skip = False
if self.exclude_paths and level_path in self.exclude_paths:
skip = True
if self.include_paths and level_path != 'root':
if level_path not in self.include_paths:
skip = True
for prefix in self.include_paths:
if prefix in level_path or level_path in prefix:
skip = False
break
elif self.exclude_regex_paths and any(
[exclude_regex_path.search(level_path) for exclude_regex_path in self.exclude_regex_paths]):
skip = True
elif self.exclude_types_tuple and \
(isinstance(level.t1, self.exclude_types_tuple) or isinstance(level.t2, self.exclude_types_tuple)):
skip = True
elif self.exclude_obj_callback and \
(self.exclude_obj_callback(level.t1, level_path) or self.exclude_obj_callback(level.t2, level_path)):
skip = True
elif self.exclude_obj_callback_strict and \
(self.exclude_obj_callback_strict(level.t1, level_path) and
self.exclude_obj_callback_strict(level.t2, level_path)):
skip = True
elif self.include_obj_callback and level_path != 'root':
skip = True
if (self.include_obj_callback(level.t1, level_path) or self.include_obj_callback(level.t2, level_path)):
skip = False
elif self.include_obj_callback_strict and level_path != 'root':
skip = True
if (self.include_obj_callback_strict(level.t1, level_path) and
self.include_obj_callback_strict(level.t2, level_path)):
skip = False
return skip
def _skip_this_key(self, level: Any, key: Any) -> bool:
# if include_paths is not set, than treet every path as included
if self.include_paths is None:
return False
if "{}['{}']".format(level.path(), key) in self.include_paths:
return False
if level.path() in self.include_paths:
# matches e.g. level+key root['foo']['bar']['veg'] include_paths ["root['foo']['bar']"]
return False
for prefix in self.include_paths:
if "{}['{}']".format(level.path(), key) in prefix:
# matches as long the prefix is longer than this object key
# eg.: level+key root['foo']['bar'] matches prefix root['foo']['bar'] from include paths
# level+key root['foo'] matches prefix root['foo']['bar'] from include_paths
# level+key root['foo']['bar'] DOES NOT match root['foo'] from include_paths This needs to be handled afterwards
return False
# check if a higher level is included as a whole (=without any sublevels specified)
# matches e.g. level+key root['foo']['bar']['veg'] include_paths ["root['foo']"]
# but does not match, if it is level+key root['foo']['bar']['veg'] include_paths ["root['foo']['bar']['fruits']"]
up = level.up
while up is not None:
if up.path() in self.include_paths:
return False
up = up.up
return True
def _get_clean_to_keys_mapping(self, keys: Any, level: Any) -> Dict[Any, Any]:
    """
    Get a dictionary of cleaned value of keys to the keys themselves.
    This is mainly used to transform the keys when the type changes of keys should be ignored.

    Cleaning rules, first match wins:
    - bytes and memoryview keys are decoded as utf-8 when ignore_string_type_changes is set
    - Enum keys are replaced by their value when use_enum_value is set
    - number keys are converted with number_to_string when significant_digits is set, and are
      then prefixed with their type name ("number" when ignore_numeric_type_changes is set),
      unless the key originated from a group_by operation
    - every other key is kept as is
    String results are lower-cased when ignore_string_case is set.

    When two keys clean to the same value, the first one is kept and a warning is logged.

    TODO: needs also some key conversion for groups of types other than the built-in strings and numbers.
    """
    result = dict_()
    for key in keys:
        if self.ignore_string_type_changes and isinstance(key, bytes):
            clean_key = key.decode('utf-8')
        elif self.ignore_string_type_changes and isinstance(key, memoryview):
            clean_key = key.tobytes().decode('utf-8')
        elif self.use_enum_value and isinstance(key, Enum):
            clean_key = key.value
        elif isinstance(key, numbers):
            if self.significant_digits is None:
                clean_key = key
            else:
                clean_key = self.number_to_string(key, significant_digits=self.significant_digits,
                                                  number_format_notation=self.number_format_notation)  # type: ignore
            # Skip type prefixing for keys that originated from group_by operations
            from_group_by = hasattr(self, 'group_by_keys') and key in self.group_by_keys
            if not from_group_by:
                type_ = "number" if self.ignore_numeric_type_changes else key.__class__.__name__
                clean_key = KEY_TO_VAL_STR.format(type_, clean_key)
        else:
            clean_key = key
        if self.ignore_string_case and isinstance(clean_key, str):
            clean_key = clean_key.lower()
        if clean_key in result:
            logger.warning(('{} and {} in {} become the same key when ignore_string_type_changes '
                            'or ignore_numeric_type_changes are set to be true.').format(
                                key, result[clean_key], level.path()))
        else:
            result[clean_key] = key
    return result
def _diff_dict(
    self,
    level: Any,
    parents_ids: FrozenSet[int]=frozenset([]),
    print_as_attribute: bool=False,
    override: bool=False,
    override_t1: Optional[Any]=None,
    override_t2: Optional[Any]=None,
    local_tree: Optional[Any]=None,
) -> None:
    """
    Difference of 2 dictionaries.

    Keys only present in t2 are reported as added, keys only present in t1 as removed,
    and for keys present on both sides the values are diffed one level deeper.

    When `override` is set, `override_t1` / `override_t2` are compared instead of
    `level.t1` / `level.t2` (used for preprocessed custom objects and named tuples)
    while the reported chain still hangs off `level`.
    When `print_as_attribute` is set, results are reported as attribute changes
    instead of dictionary item changes.
    """
    # For special stuff like custom objects and named tuples we receive preprocessed
    # t1 and t2 but must not spoil the chain (=level) with it.
    t1, t2 = (override_t1, override_t2) if override else (level.t1, level.t2)

    if print_as_attribute:
        item_added_key, item_removed_key = "attribute_added", "attribute_removed"
        rel_class = AttributeRelationship
    else:
        item_added_key, item_removed_key = "dictionary_item_added", "dictionary_item_removed"
        rel_class = DictRelationship

    skip_private = self.ignore_private_variables

    def is_compared(candidate):
        # Dunder-prefixed string keys are dropped first when private variables are ignored,
        # then the generic include/exclude rules are consulted.
        if skip_private and isinstance(candidate, str) and candidate.startswith('__'):
            return False
        return not self._skip_this_key(level, candidate)

    t1_keys = SetOrdered([candidate for candidate in t1 if is_compared(candidate)])
    t2_keys = SetOrdered([candidate for candidate in t2 if is_compared(candidate)])

    t1_clean_to_keys = t2_clean_to_keys = None
    if self.ignore_string_type_changes or self.ignore_numeric_type_changes or self.ignore_string_case:
        # From here on the key sets hold *cleaned* keys; the mappings lead back to the originals.
        t1_clean_to_keys = self._get_clean_to_keys_mapping(keys=t1_keys, level=level)
        t2_clean_to_keys = self._get_clean_to_keys_mapping(keys=t2_keys, level=level)
        t1_keys = SetOrdered(t1_clean_to_keys.keys())
        t2_keys = SetOrdered(t2_clean_to_keys.keys())

    t_keys_intersect = t2_keys & t1_keys
    t_keys_added = t2_keys - t_keys_intersect
    t_keys_removed = t1_keys - t_keys_intersect

    if self.threshold_to_diff_deeper:
        if self.exclude_paths:
            t_keys_union = {f"{level.path()}[{key!r}]" for key in (t2_keys | t1_keys)}
            t_keys_union -= self.exclude_paths
            t_keys_union_len = len(t_keys_union)
        else:
            t_keys_union_len = len(t2_keys | t1_keys)
        # Too few shared keys: report the whole dictionary as changed instead of going deeper.
        if t_keys_union_len > 1 and len(t_keys_intersect) / t_keys_union_len < self.threshold_to_diff_deeper:
            self._report_result('values_changed', level, local_tree=local_tree)
            return

    for clean_key in t_keys_added:
        if self._count_diff() is StopIteration:
            return
        original_key = t2_clean_to_keys[clean_key] if t2_clean_to_keys else clean_key
        change_level = level.branch_deeper(
            notpresent,
            t2[original_key],
            child_relationship_class=rel_class,
            child_relationship_param=original_key,
            child_relationship_param2=original_key,
        )
        self._report_result(item_added_key, change_level, local_tree=local_tree)

    for clean_key in t_keys_removed:
        if self._count_diff() is StopIteration:
            return  # pragma: no cover. This is already covered for addition.
        original_key = t1_clean_to_keys[clean_key] if t1_clean_to_keys else clean_key
        change_level = level.branch_deeper(
            t1[original_key],
            notpresent,
            child_relationship_class=rel_class,
            child_relationship_param=original_key,
            child_relationship_param2=original_key,
        )
        self._report_result(item_removed_key, change_level, local_tree=local_tree)

    for shared_key in t_keys_intersect:  # key present in both dicts - need to compare values
        if self._count_diff() is StopIteration:
            return  # pragma: no cover. This is already covered for addition.
        key1 = t1_clean_to_keys[shared_key] if t1_clean_to_keys else shared_key
        key2 = t2_clean_to_keys[shared_key] if t2_clean_to_keys else shared_key
        item_id = id(t1[key1])
        # An item that is already one of its own ancestors would recurse forever.
        if parents_ids and item_id in parents_ids:
            continue
        parents_ids_added = add_to_frozen_set(parents_ids, item_id)

        # Go one level deeper. Note the relationship is parameterised with the shared
        # (possibly cleaned) key, not with key1 / key2.
        next_level = level.branch_deeper(
            t1[key1],
            t2[key2],
            child_relationship_class=rel_class,
            child_relationship_param=shared_key,
            child_relationship_param2=shared_key,
        )
        self._diff(next_level, parents_ids_added, local_tree=local_tree)
def _diff_set(self, level: Any, local_tree: Optional[Any]=None) -> None:
"""Difference of sets"""
t1_hashtable = self._create_hashtable(level, 't1')
t2_hashtable = self._create_hashtable(level, 't2')
t1_hashes = set(t1_hashtable.keys())
t2_hashes = set(t2_hashtable.keys())
hashes_added = t2_hashes - t1_hashes
hashes_removed = t1_hashes - t2_hashes
items_added = [t2_hashtable[i].item for i in hashes_added]
items_removed = [t1_hashtable[i].item for i in hashes_removed]
for item in items_added:
if self._count_diff() is StopIteration:
return # pragma: no cover. This is already covered for addition.
change_level = level.branch_deeper(
notpresent, item, child_relationship_class=SetRelationship)
self._report_result('set_item_added', change_level, local_tree=local_tree)
for item in items_removed:
if self._count_diff() is StopIteration:
return # pragma: no cover. This is already covered for addition.
change_level = level.branch_deeper(
item, notpresent, child_relationship_class=SetRelationship)
self._report_result('set_item_removed', change_level, local_tree=local_tree)
@staticmethod
def _iterables_subscriptable(t1: Any, t2: Any) -> bool:
try:
if getattr(t1, '__getitem__') and getattr(t2, '__getitem__'):
return True
else: # pragma: no cover
return False # should never happen
except AttributeError:
return False
def _diff_iterable(self, level: Any, parents_ids: FrozenSet[int]=frozenset(), _original_type: Optional[type]=None, local_tree: Optional[Any]=None) -> None:
"""Difference of iterables"""
if (self.ignore_order_func and self.ignore_order_func(level)) or self.ignore_order:
self._diff_iterable_with_deephash(level, parents_ids, _original_type=_original_type, local_tree=local_tree)
else:
self._diff_iterable_in_order(level, parents_ids, _original_type=_original_type, local_tree=local_tree)
def _compare_in_order(
    self, level,
    t1_from_index=None, t1_to_index=None,
    t2_from_index=None, t2_to_index=None
) -> List[Tuple[Tuple[int, int], Tuple[Any, Any]]]:
    """
    Default compare if `iterable_compare_func` is not provided.
    This will compare in sequence order.

    Items of t1 and t2 are zipped position by position; the shorter side is padded
    with ListItemRemovedOrAdded. When `t1_from_index` is given, only the slices
    t1[t1_from_index:t1_to_index] and t2[t2_from_index:t2_to_index] are paired and
    the reported indexes are shifted back to positions in the full iterables.
    """
    if t1_from_index is None:
        t1_items, t2_items = level.t1, level.t2
        t1_offset = t2_offset = 0
    else:
        t1_items = level.t1[t1_from_index:t1_to_index]
        t2_items = level.t2[t2_from_index:t2_to_index]
        t1_offset, t2_offset = t1_from_index, t2_from_index

    zipped = zip_longest(t1_items, t2_items, fillvalue=ListItemRemovedOrAdded)
    return [
        ((position + t1_offset, position + t2_offset), (x, y))
        for position, (x, y) in enumerate(zipped)
    ]
def _get_matching_pairs(
self, level,
t1_from_index=None, t1_to_index=None,
t2_from_index=None, t2_to_index=None
) -> List[Tuple[Tuple[int, int], Tuple[Any, Any]]]:
"""
Given a level get matching pairs. This returns list of two tuples in the form:
[
(t1 index, t2 index), (t1 item, t2 item)
]
This will compare using the passed in `iterable_compare_func` if available.
Default it to compare in order
"""
if self.iterable_compare_func is None:
# Match in order if there is no compare function provided
return self._compare_in_order(
level,
t1_from_index=t1_from_index, t1_to_index=t1_to_index,
t2_from_index=t2_from_index, t2_to_index=t2_to_index,
)
try:
matches = []
y_matched = set()
y_index_matched = set()
for i, x in enumerate(level.t1):
x_found = False
for j, y in enumerate(level.t2):
if(j in y_index_matched):
# This ensures a one-to-one relationship of matches from t1 to t2.
# If y this index in t2 has already been matched to another x
# it cannot have another match, so just continue.
continue
if(self.iterable_compare_func(x, y, level)):
deep_hash = DeepHash(y,
hashes=self.hashes,
apply_hash=True,
**self.deephash_parameters,
)
y_index_matched.add(j)
y_matched.add(deep_hash[y])
matches.append(((i, j), (x, y)))
x_found = True
break
if(not x_found):
matches.append(((i, -1), (x, ListItemRemovedOrAdded)))
for j, y in enumerate(level.t2):
deep_hash = DeepHash(y,
hashes=self.hashes,
apply_hash=True,
**self.deephash_parameters,
)
if(deep_hash[y] not in y_matched):
matches.append(((-1, j), (ListItemRemovedOrAdded, y)))
return matches
except CannotCompare:
return self._compare_in_order(
level,
t1_from_index=t1_from_index, t1_to_index=t1_to_index,
t2_from_index=t2_from_index, t2_to_index=t2_to_index
)
def _diff_iterable_in_order(self, level, parents_ids=frozenset(), _original_type=None, local_tree=None):
    """
    Diff two iterables while respecting item order.

    Two strategies exist: a difflib based one (only for Sequences whose items are all
    basic hashable types, when zip_ordered_iterables is off and no iterable_compare_func
    is set) and the pair-by-pair comparison. When difflib is applicable and yields more
    than one report type, the pair-by-pair comparison is run as well and its result is
    used unless it produces more report types than the difflib one.
    """
    # We're handling both subscriptable and non-subscriptable iterables. Which one is it?
    child_relationship_class = (
        SubscriptableIterableRelationship
        if self._iterables_subscriptable(level.t1, level.t2)
        else NonSubscriptableIterableRelationship
    )
    shared_kwargs = dict(
        parents_ids=parents_ids,
        _original_type=_original_type,
        child_relationship_class=child_relationship_class,
    )

    difflib_applicable = (
        not self.zip_ordered_iterables
        and isinstance(level.t1, Sequence)
        and isinstance(level.t2, Sequence)
        and self._all_values_basic_hashable(level.t1)
        and self._all_values_basic_hashable(level.t2)
        and self.iterable_compare_func is None
    )
    if not difflib_applicable:
        self._diff_by_forming_pairs_and_comparing_one_by_one(
            level, local_tree=local_tree, **shared_kwargs)
        return

    chosen_tree = difflib_tree = TreeResult()
    opcodes_with_values = self._diff_ordered_iterable_by_difflib(
        level, local_tree=difflib_tree, **shared_kwargs)

    # Sometimes DeepDiff's old pair-by-pair iterable diff does a better job than difflib.
    if len(difflib_tree) > 1:
        pairwise_tree = TreeResult()
        self._diff_by_forming_pairs_and_comparing_one_by_one(
            level, local_tree=pairwise_tree, **shared_kwargs)
        if len(difflib_tree) >= len(pairwise_tree):
            chosen_tree = pairwise_tree
        else:
            # The difflib result is kept, so its opcodes are remembered for this path.
            self._iterable_opcodes[level.path(force=FORCE_DEFAULT)] = opcodes_with_values

    # NOTE(review): results of this branch are merged into self.tree, not into local_tree.
    for report_type, levels in chosen_tree.items():
        if levels:
            self.tree[report_type] |= levels
def _all_values_basic_hashable(self, iterable: Iterable[Any]) -> bool:
"""
Are all items basic hashable types?
Or there are custom types too?
"""
# We don't want to exhaust a generator
if isinstance(iterable, types.GeneratorType):
return False
for item in iterable:
if not isinstance(item, basic_types):
return False
return True
def _diff_by_forming_pairs_and_comparing_one_by_one(
self, level, local_tree, parents_ids=frozenset(),
_original_type=None, child_relationship_class=None,
t1_from_index=None, t1_to_index=None,
t2_from_index=None, t2_to_index=None,
):
for (i, j), (x, y) in self._get_matching_pairs(
level,
t1_from_index=t1_from_index, t1_to_index=t1_to_index,
t2_from_index=t2_from_index, t2_to_index=t2_to_index
):
if self._count_diff() is StopIteration:
return # pragma: no cover. This is already covered for addition.
reference_param1 = i
reference_param2 = j
if y is ListItemRemovedOrAdded: # item removed completely
change_level = level.branch_deeper(
x,
notpresent,
child_relationship_class=child_relationship_class,
child_relationship_param=reference_param1,
child_relationship_param2=reference_param2,
)
self._report_result('iterable_item_removed', change_level, local_tree=local_tree)
elif x is ListItemRemovedOrAdded: # new item added
change_level = level.branch_deeper(
notpresent,
y,
child_relationship_class=child_relationship_class,
child_relationship_param=reference_param1,
child_relationship_param2=reference_param2,
)
self._report_result('iterable_item_added', change_level, local_tree=local_tree)
else: # check if item value has changed
if (i != j and ((x == y) or self.iterable_compare_func)):
# Item moved
change_level = level.branch_deeper(
x,
y,
child_relationship_class=child_relationship_class,
child_relationship_param=reference_param1,
child_relationship_param2=reference_param2
)
self._report_result('iterable_item_moved', change_level, local_tree=local_tree)
if self.iterable_compare_func:
# Mark additional context denoting that we have moved an item.
# This will allow for correctly setting paths relative to t2 when using an iterable_compare_func
level.additional["moved"] = True
else:
continue
item_id = id(x)
if parents_ids and item_id in parents_ids:
continue
parents_ids_added = add_to_frozen_set(parents_ids, item_id)
# Go one level deeper
next_level = level.branch_deeper(
x,
y,
child_relationship_class=child_relationship_class,
child_relationship_param=reference_param1,
child_relationship_param2=reference_param2
)
self._diff(next_level, parents_ids_added, local_tree=local_tree)
def _diff_ordered_iterable_by_difflib(
self, level, local_tree, parents_ids=frozenset(), _original_type=None, child_relationship_class=None,
):
seq = difflib.SequenceMatcher(isjunk=None, a=level.t1, b=level.t2, autojunk=False)
opcodes = seq.get_opcodes()
opcodes_with_values = []
# TODO: this logic should be revisted so we detect reverse operations
# like when a replacement happens at index X and a reverse replacement happens at index Y
# in those cases we have a "iterable_item_moved" operation.
for tag, t1_from_index, t1_to_index, t2_from_index, t2_to_index in opcodes:
if tag == 'equal':
opcodes_with_values.append(Opcode(
tag, t1_from_index, t1_to_index, t2_from_index, t2_to_index,
))
continue
# print('{:7} t1[{}:{}] --> t2[{}:{}] {!r:>8} --> {!r}'.format(
# tag, t1_from_index, t1_to_index, t2_from_index, t2_to_index, level.t1[t1_from_index:t1_to_index], level.t2[t2_from_index:t2_to_index]))
opcodes_with_values.append(Opcode(
tag, t1_from_index, t1_to_index, t2_from_index, t2_to_index,
old_values = level.t1[t1_from_index: t1_to_index],
new_values = level.t2[t2_from_index: t2_to_index],
))
if tag == 'replace':
self._diff_by_forming_pairs_and_comparing_one_by_one(
level, local_tree=local_tree, parents_ids=parents_ids,
_original_type=_original_type, child_relationship_class=child_relationship_class,
t1_from_index=t1_from_index, t1_to_index=t1_to_index,
t2_from_index=t2_from_index, t2_to_index=t2_to_index,
)
elif tag == 'delete':
for index, x in enumerate(level.t1[t1_from_index:t1_to_index]):
change_level = level.branch_deeper(
x,
notpresent,
child_relationship_class=child_relationship_class,
child_relationship_param=index + t1_from_index,
child_relationship_param2=index + t1_from_index,
)
self._report_result('iterable_item_removed', change_level, local_tree=local_tree)
elif tag == 'insert':
for index, y in enumerate(level.t2[t2_from_index:t2_to_index]):
change_level = level.branch_deeper(
notpresent,
y,
child_relationship_class=child_relationship_class,
child_relationship_param=index + t2_from_index,
child_relationship_param2=index + t2_from_index,
)
self._report_result('iterable_item_added', change_level, local_tree=local_tree)
return opcodes_with_values
def _diff_str(self, level, local_tree=None):
"""Compare strings"""
if self.ignore_string_case:
level.t1 = level.t1.lower()
level.t2 = level.t2.lower()
if type(level.t1) == type(level.t2) and level.t1 == level.t2: # NOQA
return
# do we add a diff for convenience?
do_diff = True
t1_str = level.t1
t2_str = level.t2
if isinstance(level.t1, memoryview):
try:
t1_str = level.t1.tobytes().decode('ascii')
except UnicodeDecodeError:
do_diff = False
elif isinstance(level.t1, bytes_type):
try:
t1_str = level.t1.decode('ascii')
except UnicodeDecodeError:
do_diff = False
if isinstance(level.t2, memoryview):
try:
t2_str = level.t2.tobytes().decode('ascii')
except UnicodeDecodeError:
do_diff = False
elif isinstance(level.t2, bytes_type):
try:
t2_str = level.t2.decode('ascii')
except UnicodeDecodeError:
do_diff = False
if isinstance(level.t1, Enum):
t1_str = level.t1.value
if isinstance(level.t2, Enum):
t2_str = level.t2.value
if t1_str == t2_str:
return
if do_diff:
if '\n' in t1_str or isinstance(t2_str, str) and '\n' in t2_str:
diff = difflib.unified_diff(
t1_str.splitlines(), t2_str.splitlines(), lineterm='')
diff = list(diff)
if diff:
level.additional['diff'] = '\n'.join(diff)
self._report_result('values_changed', level, local_tree=local_tree)
def _diff_tuple(self, level, parents_ids, local_tree=None):
# Checking to see if it has _fields. Which probably means it is a named
# tuple.
try:
level.t1._asdict
# It must be a normal tuple
except AttributeError:
self._diff_iterable(level, parents_ids, local_tree=local_tree)
# We assume it is a namedtuple then
else:
self._diff_obj(level, parents_ids, is_namedtuple=True, local_tree=local_tree)
def _add_hash(self, hashes, item_hash, item, i):
if item_hash in hashes:
hashes[item_hash].indexes.append(i)
else:
hashes[item_hash] = IndexedHash(indexes=[i], item=item)
def _create_hashtable(self, level, t):
    """
    Create hashtable of {item_hash: (indexes, item)} for the iterable stored on
    `level` under the attribute named by `t` ('t1' or 't2').

    Items for which no hash is available are left out of the table.
    """
    iterable = getattr(level, t)
    hashtable = dict_()

    for index, element in enumerate(iterable):
        try:
            element_path = "{}[{}]".format(level.path(), index)
            # Note: in the DeepDiff we only calculate the hash of items when we have to.
            # So self.hashes does not include hashes of all objects in t1 and t2.
            # It only includes the ones needed when comparing iterables.
            # The self.hashes dictionary gets shared between different runs of DeepHash
            # So that any object that is already calculated to have a hash is not re-calculated.
            hashed = DeepHash(
                element,
                hashes=self.hashes,
                parent=element_path,
                apply_hash=True,
                **self.deephash_parameters,
            )
        except UnicodeDecodeError as err:
            # Enrich the error with the location before letting it propagate.
            err.reason = f"Can not produce a hash for {level.path()}: {err.reason}"
            raise
        except NotImplementedError:
            raise

        try:
            element_hash = hashed[element]
        except KeyError:
            # No hash was produced for this element; it is not counted.
            continue

        if element_hash is unprocessed:  # pragma: no cover
            self.log_err("Item %s was not processed while hashing "
                         "thus not counting this object." %
                         level.path())
            continue

        self._add_hash(hashes=hashtable, item_hash=element_hash, item=element, i=index)

    # Also we hash the iterables themselves too so that we can later create cache keys from those hashes.
    DeepHash(
        iterable,
        hashes=self.hashes,
        parent=level.path(),
        apply_hash=True,
        **self.deephash_parameters,
    )
    return hashtable
@staticmethod
@lru_cache(maxsize=2028)
def _get_distance_cache_key(added_hash, removed_hash):
key1, key2 = (added_hash, removed_hash) if added_hash > removed_hash else (removed_hash, added_hash)
if isinstance(key1, int):
# If the hash function produces integers we convert them to hex values.
# This was used when the default hash function was Murmur3 128bit which produces integers.
key1 = hex(key1).encode('utf-8')
key2 = hex(key2).encode('utf-8')
elif isinstance(key1, str):
key1 = key1.encode('utf-8')
key2 = key2.encode('utf-8')
return key1 + b'--' + key2 + b'dc'
def _get_rough_distance_of_hashed_objs(
        self, added_hash, removed_hash, added_hash_obj, removed_hash_obj, _original_type=None):
    """
    Return the rough distance between a removed item and an added item.

    We need the rough distance between the 2 objects to see if they qualify to be
    pairs or not. When the distance cache is enabled the value is looked up there
    first and stored there after it had to be computed.
    """
    distance = None
    cache_key = None

    if self._stats[DISTANCE_CACHE_ENABLED]:
        cache_key = self._get_distance_cache_key(added_hash, removed_hash)
        if cache_key in self._distance_cache:
            self._stats[DISTANCE_CACHE_HIT_COUNT] += 1
            distance = self._distance_cache.get(cache_key)

    if distance is not None:
        return distance

    # We can only cache the rough distance and not the actual diff result for reuse.
    # The reason is that we have modified the parameters explicitly so they are different and can't
    # be used for diff reporting
    nested_diff = DeepDiff(
        removed_hash_obj.item, added_hash_obj.item,
        _parameters=self._parameters,
        _shared_parameters=self._shared_parameters,
        view=DELTA_VIEW,
        _original_type=_original_type,
        iterable_compare_func=self.iterable_compare_func,
    )
    distance = nested_diff._get_rough_distance()
    if cache_key and self._stats[DISTANCE_CACHE_ENABLED]:
        self._distance_cache.set(cache_key, value=distance)
    return distance
def _get_most_in_common_pairs_in_iterables(
        self, hashes_added, hashes_removed, t1_hashtable, t2_hashtable, parents_ids, _original_type):
    """
    Get the closest pairs between items that are removed and items that are added.

    returns a dictionary of hashes that are closest to each other.
    The dictionary is going to be symmetrical so any key will be a value too and otherwise.

    Note that due to the current reporting structure in DeepDiff, we don't compare an item that
    was added to an item that is in both t1 and t2.

    For example

        [{1, 2}, {4, 5, 6}]
        [{1, 2}, {1, 2, 3}]

    is only compared between {4, 5, 6} and {1, 2, 3} even though technically {1, 2, 3} is
    just one item different than {1, 2}

    Perhaps in future we can have a report key that is item duplicated and modified instead of just added.
    """
    cache_key = None
    if self._stats[DISTANCE_CACHE_ENABLED]:
        cache_key = combine_hashes_lists(items=[hashes_added, hashes_removed], prefix='pairs_cache')
        if cache_key in self._distance_cache:
            return self._distance_cache.get(cache_key).copy()

    # A dictionary of added hashes to distances and each distance to an ordered set of removed hashes.
    # It tells us about the distance of each added object from the removed objects.
    # And the objects with the same distances are grouped together in an ordered set.
    candidates_by_added_hash = defaultdict(lambda: defaultdict(SetOrdered))
    pairs = dict_()

    pre_calced_distances = None
    if hashes_added and hashes_removed and np and len(hashes_added) > 1 and len(hashes_removed) > 1:
        # pre-calculates distances ONLY for 1D arrays whether an _original_type
        # was explicitly passed or a homogeneous array is detected.
        # Numpy is needed for this optimization.
        pre_calced_distances = self._precalculate_numpy_arrays_distance(
            hashes_added, hashes_removed, t1_hashtable, t2_hashtable, _original_type)

    # A custom compare function, when present, takes over the pre-calculation.
    if hashes_added and hashes_removed \
            and self.iterable_compare_func \
            and len(hashes_added) > 0 and len(hashes_removed) > 0:
        pre_calced_distances = self._precalculate_distance_by_custom_compare_func(
            hashes_added, hashes_removed, t1_hashtable, t2_hashtable, _original_type)

    for added_hash in hashes_added:
        for removed_hash in hashes_removed:
            added_hash_obj = t2_hashtable[added_hash]
            removed_hash_obj = t1_hashtable[removed_hash]

            # Loop is detected
            if id(removed_hash_obj.item) in parents_ids:
                continue

            distance = None
            if pre_calced_distances:
                distance = pre_calced_distances.get("{}--{}".format(added_hash, removed_hash))
            if distance is None:
                distance = self._get_rough_distance_of_hashed_objs(
                    added_hash, removed_hash, added_hash_obj, removed_hash_obj, _original_type)
            # Discard potential pairs that are too far.
            if distance >= self.cutoff_distance_for_pairs:
                continue
            candidates_by_added_hash[added_hash][distance].add(removed_hash)

    used_hashes = set()

    # Regroup: for every distance, which added hashes have a candidate at that distance.
    added_hashes_by_distance = defaultdict(SetOrdered)
    for from_hash, to_hashes_by_distance in candidates_by_added_hash.items():
        for dist in to_hashes_by_distance:
            added_hashes_by_distance[dist].add(from_hash)

    # Hand out pairs starting from the smallest distance.
    for dist in sorted(added_hashes_by_distance.keys()):
        from_hashes = added_hashes_by_distance[dist]
        while from_hashes:
            from_hash = from_hashes.pop()
            if from_hash not in used_hashes:
                to_hashes = candidates_by_added_hash[from_hash][dist]
                while to_hashes:
                    to_hash = to_hashes.pop()
                    if to_hash not in used_hashes:
                        used_hashes.add(from_hash)
                        used_hashes.add(to_hash)
                        pairs[from_hash] = to_hash

    # Make the mapping symmetrical.
    inverse_pairs = {to_hash: from_hash for from_hash, to_hash in pairs.items()}
    pairs.update(inverse_pairs)
    if cache_key and self._stats[DISTANCE_CACHE_ENABLED]:
        self._distance_cache.set(cache_key, value=pairs)
    return pairs.copy()
def _diff_iterable_with_deephash(self, level, parents_ids, _original_type=None, local_tree=None):
    """
    Diff of hashable or unhashable iterables. Only used when ignoring the order.

    Items of t1 and t2 are matched by their deep hash. Hashes only present in t2 are
    "added", hashes only present in t1 are "removed". When pairing is enabled, an added
    and a removed item that are close enough to each other are treated as one modified
    item and diffed one level deeper instead of being reported as added + removed.

    With report_repetition set, every index of a repeated item is reported and a change
    in the number of repetitions of an unchanged item yields 'repetition_change'.
    Otherwise only the first index of each item is used.
    """
    full_t1_hashtable = self._create_hashtable(level, 't1')
    full_t2_hashtable = self._create_hashtable(level, 't2')
    t1_hashes = SetOrdered(full_t1_hashtable.keys())
    t2_hashes = SetOrdered(full_t2_hashtable.keys())
    hashes_added = t2_hashes - t1_hashes
    hashes_removed = t1_hashes - t2_hashes

    # Deciding whether to calculate pairs or not.
    # The +1 keeps the ratio defined when both hashtables are empty.
    changed_ratio = (len(hashes_added) + len(hashes_removed)) / (len(full_t1_hashtable) + len(full_t2_hashtable) + 1)
    get_pairs = not (changed_ratio > self.cutoff_intersection_for_pairs)

    # reduce the size of hashtables
    if self.report_repetition:
        t1_hashtable = full_t1_hashtable
        t2_hashtable = full_t2_hashtable
    else:
        t1_hashtable = {k: v for k, v in full_t1_hashtable.items() if k in hashes_removed}
        t2_hashtable = {k: v for k, v in full_t2_hashtable.items() if k in hashes_added}

    if self._stats[PASSES_COUNT] < self.max_passes and get_pairs:
        self._stats[PASSES_COUNT] += 1
        pairs = self._get_most_in_common_pairs_in_iterables(
            hashes_added, hashes_removed, t1_hashtable, t2_hashtable, parents_ids, _original_type)
    elif get_pairs:
        # Pairing was wanted but the pass budget is used up: warn once, then go on without pairs.
        if not self._stats[MAX_PASS_LIMIT_REACHED]:
            self._stats[MAX_PASS_LIMIT_REACHED] = True
            logger.warning(MAX_PASSES_REACHED_MSG.format(self.max_passes))
        pairs = dict_()
    else:
        pairs = dict_()

    def get_other_pair(hash_value, in_t1=True):
        """
        Gets the other paired indexed hash item to the hash_value in the pairs dictionary
        in_t1: are we looking for the other pair in t1 or t2?
        """
        if in_t1:
            hashtable = t1_hashtable
            the_other_hashes = hashes_removed
        else:
            hashtable = t2_hashtable
            the_other_hashes = hashes_added
        other = pairs.pop(hash_value, notpresent)
        if other is notpresent:
            other = notpresent_indexed
        else:
            # The pairs are symmetrical.
            # removing the other direction of pair
            # so it does not get used.
            del pairs[other]
            the_other_hashes.remove(other)
            other = hashtable[other]
        return other

    if self.report_repetition:
        for hash_value in hashes_added:
            if self._count_diff() is StopIteration:
                return  # pragma: no cover. This is already covered for addition (when report_repetition=False).
            other = get_other_pair(hash_value)
            item_id = id(other.item)
            indexes = t2_hashtable[hash_value].indexes if other.item is notpresent else other.indexes
            # When we report repetitions, we want the child_relationship_param2 only if there is no repetition.
            # Because when there is a repetition, we report it in a different way (iterable_items_added_at_indexes for example).
            # When there is no repetition, we want child_relationship_param2 so that we report the "new_path" correctly.
            if len(t2_hashtable[hash_value].indexes) == 1:
                index2 = t2_hashtable[hash_value].indexes[0]
            else:
                index2 = None
            for i in indexes:
                change_level = level.branch_deeper(
                    other.item,
                    t2_hashtable[hash_value].item,
                    child_relationship_class=SubscriptableIterableRelationship,
                    child_relationship_param=i,
                    child_relationship_param2=index2,
                )
                if other.item is notpresent:
                    self._report_result('iterable_item_added', change_level, local_tree=local_tree)
                else:
                    parents_ids_added = add_to_frozen_set(parents_ids, item_id)
                    self._diff(change_level, parents_ids_added, local_tree=local_tree)
        for hash_value in hashes_removed:
            if self._count_diff() is StopIteration:
                return  # pragma: no cover. This is already covered for addition.
            other = get_other_pair(hash_value, in_t1=False)
            item_id = id(other.item)
            # When we report repetitions, we want the child_relationship_param2 only if there is no repetition.
            # Because when there is a repetition, we report it in a different way (iterable_items_added_at_indexes for example).
            # When there is no repetition, we want child_relationship_param2 so that we report the "new_path" correctly.
            # The number of indexes is what is compared to 1 here, not the list of indexes itself.
            if other.item is notpresent or len(other.indexes) > 1:
                index2 = None
            else:
                index2 = other.indexes[0]
            for i in t1_hashtable[hash_value].indexes:
                change_level = level.branch_deeper(
                    t1_hashtable[hash_value].item,
                    other.item,
                    child_relationship_class=SubscriptableIterableRelationship,
                    child_relationship_param=i,
                    child_relationship_param2=index2,
                )
                if other.item is notpresent:
                    self._report_result('iterable_item_removed', change_level, local_tree=local_tree)
                else:
                    # I was not able to make a test case for the following 2 lines since the cases end up
                    # getting resolved above in the hashes_added calcs. However I am leaving these 2 lines
                    # in case things change in future.
                    parents_ids_added = add_to_frozen_set(parents_ids, item_id)  # pragma: no cover.
                    self._diff(change_level, parents_ids_added, local_tree=local_tree)  # pragma: no cover.

        items_intersect = t2_hashes.intersection(t1_hashes)

        for hash_value in items_intersect:
            t1_indexes = t1_hashtable[hash_value].indexes
            t2_indexes = t2_hashtable[hash_value].indexes
            t1_indexes_len = len(t1_indexes)
            t2_indexes_len = len(t2_indexes)
            if t1_indexes_len != t2_indexes_len:  # this is a repetition change!
                # create "change" entry, keep current level untouched to handle further changes
                repetition_change_level = level.branch_deeper(
                    t1_hashtable[hash_value].item,
                    t2_hashtable[hash_value].item,  # nb: those are equal!
                    child_relationship_class=SubscriptableIterableRelationship,
                    child_relationship_param=t1_hashtable[hash_value]
                    .indexes[0])
                repetition_change_level.additional['repetition'] = RemapDict(
                    old_repeat=t1_indexes_len,
                    new_repeat=t2_indexes_len,
                    old_indexes=t1_indexes,
                    new_indexes=t2_indexes)
                self._report_result('repetition_change',
                                    repetition_change_level, local_tree=local_tree)

    else:
        for hash_value in hashes_added:
            if self._count_diff() is StopIteration:
                return
            other = get_other_pair(hash_value)
            item_id = id(other.item)
            index = t2_hashtable[hash_value].indexes[0] if other.item is notpresent else other.indexes[0]
            index2 = t2_hashtable[hash_value].indexes[0]
            change_level = level.branch_deeper(
                other.item,
                t2_hashtable[hash_value].item,
                child_relationship_class=SubscriptableIterableRelationship,
                child_relationship_param=index,
                child_relationship_param2=index2,
            )
            if other.item is notpresent:
                self._report_result('iterable_item_added', change_level, local_tree=local_tree)
            else:
                parents_ids_added = add_to_frozen_set(parents_ids, item_id)
                self._diff(change_level, parents_ids_added, local_tree=local_tree)

        for hash_value in hashes_removed:
            if self._count_diff() is StopIteration:
                return  # pragma: no cover. This is already covered for addition.
            other = get_other_pair(hash_value, in_t1=False)
            item_id = id(other.item)
            index = t1_hashtable[hash_value].indexes[0]
            index2 = t1_hashtable[hash_value].indexes[0] if other.item is notpresent else other.indexes[0]
            change_level = level.branch_deeper(
                t1_hashtable[hash_value].item,
                other.item,
                child_relationship_class=SubscriptableIterableRelationship,
                child_relationship_param=index,
                child_relationship_param2=index2,
            )
            if other.item is notpresent:
                self._report_result('iterable_item_removed', change_level, local_tree=local_tree)
            else:
                # Just like the case when report_repetition = True, these lines never run currently.
                # However they will stay here in case things change in future.
                parents_ids_added = add_to_frozen_set(parents_ids, item_id)  # pragma: no cover.
                self._diff(change_level, parents_ids_added, local_tree=local_tree)  # pragma: no cover.
def _diff_booleans(self, level, local_tree=None):
if level.t1 != level.t2:
self._report_result('values_changed', level, local_tree=local_tree)
def _diff_numbers(self, level, local_tree=None, report_type_change=True):
"""Diff Numbers"""
if report_type_change:
t1_type = "number" if self.ignore_numeric_type_changes else level.t1.__class__.__name__
t2_type = "number" if self.ignore_numeric_type_changes else level.t2.__class__.__name__
else:
t1_type = t2_type = ''
if self.use_log_scale:
if not logarithmic_similarity(level.t1, level.t2, threshold=self.log_scale_similarity_threshold):
self._report_result('values_changed', level, local_tree=local_tree)
elif self.math_epsilon is not None:
if not is_close(level.t1, level.t2, abs_tol=self.math_epsilon):
self._report_result('values_changed', level, local_tree=local_tree)
elif self.significant_digits is None:
if level.t1 != level.t2:
self._report_result('values_changed', level, local_tree=local_tree)
else:
# Bernhard10: I use string formatting for comparison, to be consistent with usecases where
# data is read from files that were previously written from python and
# to be consistent with on-screen representation of numbers.
# Other options would be abs(t1-t2)<10**-self.significant_digits
# or math.is_close (python3.5+)
# Note that abs(3.25-3.251) = 0.0009999999999998899 < 0.001
# Note also that "{:.3f}".format(1.1135) = 1.113, but "{:.3f}".format(1.11351) = 1.114
# For Decimals, format seems to round 2.5 to 2 and 3.5 to 4 (to closest even number)
t1_s = self.number_to_string(level.t1,
significant_digits=self.significant_digits,
number_format_notation=self.number_format_notation) # type: ignore
t2_s = self.number_to_string(level.t2,
significant_digits=self.significant_digits,
number_format_notation=self.number_format_notation) # type: ignore
t1_s = KEY_TO_VAL_STR.format(t1_type, t1_s)
t2_s = KEY_TO_VAL_STR.format(t2_type, t2_s)
if t1_s != t2_s:
self._report_result('values_changed', level, local_tree=local_tree)
def _diff_ipranges(self, level, local_tree=None):
"""Diff IP ranges"""
if str(level.t1) != str(level.t2):
self._report_result('values_changed', level, local_tree=local_tree)
def _diff_datetime(self, level, local_tree=None):
    """
    Diff two datetimes.

    Both sides are first passed through ``datetime_normalize`` (with ``self.truncate_datetime``
    and ``self.default_timezone``) and written back onto ``level``; a ``values_changed`` entry
    is reported when the normalized values still differ.
    """
    tz_options = {'default_timezone': self.default_timezone}
    level.t1 = datetime_normalize(self.truncate_datetime, level.t1, **tz_options)
    level.t2 = datetime_normalize(self.truncate_datetime, level.t2, **tz_options)
    differs = level.t1 != level.t2
    if differs:
        self._report_result('values_changed', level, local_tree=local_tree)
def _diff_time(self, level, local_tree=None):
"""Diff DateTimes"""
if self.truncate_datetime:
level.t1 = datetime_normalize(self.truncate_datetime, level.t1, default_timezone=self.default_timezone)
level.t2 = datetime_normalize(self.truncate_datetime, level.t2, default_timezone=self.default_timezone)
if level.t1 != level.t2:
self._report_result('values_changed', level, local_tree=local_tree)
def _diff_uuids(self, level, local_tree=None):
"""Diff UUIDs"""
if level.t1.int != level.t2.int:
self._report_result('values_changed', level, local_tree=local_tree)
def _diff_numpy_array(self, level, parents_ids=frozenset(), local_tree=None):
    """
    Diff numpy arrays.

    The array type name of ``level.t2`` is recorded in ``self._numpy_paths`` under the
    level's path. When order matters, a cheap whole-array equality check is tried first
    and the method returns early if the arrays are equal. Otherwise the arrays are handed
    to the iterable diffing methods: arrays of different shape are converted to lists,
    one dimensional arrays are diffed directly, and arrays with more dimensions are either
    converted to lists (ignore order) or diffed row by row.
    """
    level_path = level.path()
    if level_path not in self._numpy_paths:
        self._numpy_paths[level_path] = get_type(level.t2).__name__
    if np is None:
        # This line should never be run. If it is ever called means the type check detected a numpy array
        # which means numpy module needs to be available. So np can't be None.
        raise ImportError(CANT_FIND_NUMPY_MSG)  # pragma: no cover

    if (self.ignore_order_func and not self.ignore_order_func(level)) or not self.ignore_order:
        # fast checks
        if self.significant_digits is None:
            if np.array_equal(level.t1, level.t2, equal_nan=self.ignore_nan_inequality):
                return  # all good
        else:
            try:
                np.testing.assert_almost_equal(level.t1, level.t2, decimal=self.significant_digits)
            except TypeError:
                # assert_almost_equal can not handle these arrays: fall back to exact
                # equality and actually use its result so equal arrays exit early.
                if np.array_equal(level.t1, level.t2, equal_nan=self.ignore_nan_inequality):
                    return  # all good
            except AssertionError:
                pass    # do detailed checking below
            else:
                return  # all good

    # compare array meta-data
    _original_type = level.t1.dtype
    if level.t1.shape != level.t2.shape:
        # arrays are converted to python lists so that certain features of DeepDiff can apply on them easier.
        # They will be converted back to Numpy at their final dimension.
        level.t1 = level.t1.tolist()
        level.t2 = level.t2.tolist()
        self._diff_iterable(level, parents_ids, _original_type=_original_type, local_tree=local_tree)
    else:
        # metadata same -- the difference is in the content
        shape = level.t1.shape
        dimensions = len(shape)
        if dimensions == 1:
            self._diff_iterable(level, parents_ids, _original_type=_original_type, local_tree=local_tree)
        elif (self.ignore_order_func and self.ignore_order_func(level)) or self.ignore_order:
            # arrays are converted to python lists so that certain features of DeepDiff can apply on them easier.
            # They will be converted back to Numpy at their final dimension.
            level.t1 = level.t1.tolist()
            level.t2 = level.t2.tolist()
            self._diff_iterable_with_deephash(level, parents_ids, _original_type=_original_type, local_tree=local_tree)
        else:
            # Walk both arrays row by row and diff each pair of rows in order.
            for (t1_path, t1_row), (t2_path, t2_row) in zip(
                    get_numpy_ndarray_rows(level.t1, shape),
                    get_numpy_ndarray_rows(level.t2, shape)):
                new_level = level.branch_deeper(
                    t1_row,
                    t2_row,
                    child_relationship_class=NumpyArrayRelationship,
                    child_relationship_param=t1_path,
                    child_relationship_param2=t2_path,
                )
                self._diff_iterable_in_order(new_level, parents_ids, _original_type=_original_type, local_tree=local_tree)
def _diff_types(self, level, local_tree=None):
"""Diff types"""
level.report_type = 'type_changes'
self._report_result('type_changes', level, local_tree=local_tree)
def _count_diff(self):
    """
    Count one more diff operation.

    Returns the ``StopIteration`` class (it is returned, not raised) once the number of
    counted diffs exceeds ``self.max_diffs``; a warning is logged the first time that
    happens. Otherwise the counter is incremented, the cache auto tuning is run when both
    ``cache_size`` and ``cache_tuning_sample_size`` are set, and ``None`` is returned.
    """
    limit = self.max_diffs
    if limit is not None and self._stats[DIFF_COUNT] > limit:
        already_flagged = self._stats[MAX_DIFF_LIMIT_REACHED]
        if not already_flagged:
            self._stats[MAX_DIFF_LIMIT_REACHED] = True
            logger.warning(MAX_DIFFS_REACHED_MSG.format(self.max_diffs))
        return StopIteration

    self._stats[DIFF_COUNT] += 1
    tuning_enabled = self.cache_size and self.cache_tuning_sample_size
    if tuning_enabled:
        self._auto_tune_cache()
def _auto_tune_cache(self):
take_sample = (self._stats[DIFF_COUNT] % self.cache_tuning_sample_size == 0)
if self.cache_tuning_sample_size:
if self._stats[DISTANCE_CACHE_ENABLED]:
if take_sample:
self._auto_off_cache()
# Turn on the cache once in a while
elif self._stats[DIFF_COUNT] % self._shared_parameters[_ENABLE_CACHE_EVERY_X_DIFF] == 0:
self.progress_logger('Re-enabling the distance and level caches.')
# decreasing the sampling frequency
self._shared_parameters[_ENABLE_CACHE_EVERY_X_DIFF] *= 10
self._stats[DISTANCE_CACHE_ENABLED] = True
if take_sample:
for key in (PREVIOUS_DIFF_COUNT, PREVIOUS_DISTANCE_CACHE_HIT_COUNT):
self._stats[key] = self._stats[key[9:]]
def _auto_off_cache(self):
    """
    Disable the distance cache when it is not being hit often enough.

    The ratio of new cache hits to new diffs since the previous sample is compared
    against ``self.CACHE_AUTO_ADJUST_THRESHOLD``; below it the cache is switched off.
    """
    stats = self._stats
    if not stats[DISTANCE_CACHE_ENABLED]:
        return
    new_hits = stats[DISTANCE_CACHE_HIT_COUNT] - stats['PREVIOUS {}'.format(DISTANCE_CACHE_HIT_COUNT)]
    new_diffs = stats[DIFF_COUNT] - stats[PREVIOUS_DIFF_COUNT]
    angle = new_hits / new_diffs
    if angle < self.CACHE_AUTO_ADJUST_THRESHOLD:
        stats[DISTANCE_CACHE_ENABLED] = False
        self.progress_logger('Due to minimal cache hits, {} is disabled.'.format('distance cache'))
def _use_custom_operator(self, level):
"""
For each level we check all custom operators.
If any one of them was a match for the level, we run the diff of the operator.
If the operator returned True, the operator must have decided these objects should not
be compared anymore. It might have already reported their results.
In that case the report will appear in the final results of this diff.
Otherwise basically the 2 objects in the level are being omitted from the results.
"""
for operator in self.custom_operators:
if operator.match(level):
prevent_default = operator.give_up_diffing(level=level, diff_instance=self)
if prevent_default:
return True
return False
def _diff(self, level, parents_ids=frozenset(), _original_type=None, local_tree=None):
    """
    The main diff method

    It counts the diff, gives custom operators and the skip rules a chance to stop,
    handles type differences between ``level.t1`` and ``level.t2`` and then dispatches
    to the type specific ``_diff_*`` method based on the type of ``level.t1``.

    **parameters**

    level: the tree level or tree node
    parents_ids: the ids of all the parent objects in the tree from the current node.
    _original_type: If the objects had an original type that was different than what currently exists in the level.t1 and t2
    local_tree: passed through to every reporting and ``_diff_*`` call made from here.
    """
    if self._count_diff() is StopIteration:
        return

    if self._use_custom_operator(level):
        return

    if level.t1 is level.t2:
        return

    if self._skip_this(level):
        return

    report_type_change = True
    if get_type(level.t1) != get_type(level.t2):
        # Types that share one of the ignore_type_in_groups are not a type change.
        for type_group in self.ignore_type_in_groups:
            if self.type_check_func(level.t1, type_group) and self.type_check_func(level.t2, type_group):
                report_type_change = False
                break
        # With use_enum_value the enum members are replaced by their values before comparing.
        if self.use_enum_value and isinstance(level.t1, Enum):
            level.t1 = level.t1.value
            report_type_change = False
        if self.use_enum_value and isinstance(level.t2, Enum):
            level.t2 = level.t2.value
            report_type_change = False
        if report_type_change:
            self._diff_types(level, local_tree=local_tree)
            return
        # This is an edge case where t1=None or t2=None and None is in the ignore type group.
        if level.t1 is None or level.t2 is None:
            self._report_result('values_changed', level, local_tree=local_tree)
            return

    if self.ignore_nan_inequality and isinstance(level.t1, (float, np_floating)) and str(level.t1) == str(level.t2) == 'nan':
        return

    if isinstance(level.t1, booleans):
        self._diff_booleans(level, local_tree=local_tree)

    elif isinstance(level.t1, strings):
        # Special handling when comparing string with UUID and ignore_uuid_types is True
        if self.ignore_uuid_types and isinstance(level.t2, uuids):
            try:
                # Convert string to UUID for comparison
                t1_uuid = uuid.UUID(level.t1)
                if t1_uuid.int != level.t2.int:
                    self._report_result('values_changed', level, local_tree=local_tree)
            except (ValueError, AttributeError):
                # If string is not a valid UUID, report as changed
                self._report_result('values_changed', level, local_tree=local_tree)
        else:
            self._diff_str(level, local_tree=local_tree)

    elif isinstance(level.t1, datetime.datetime):
        self._diff_datetime(level, local_tree=local_tree)

    elif isinstance(level.t1, ipranges):
        self._diff_ipranges(level, local_tree=local_tree)

    elif isinstance(level.t1, (datetime.date, datetime.timedelta, datetime.time)):
        self._diff_time(level, local_tree=local_tree)

    elif isinstance(level.t1, uuids):
        # Special handling when comparing UUID with string and ignore_uuid_types is True
        if self.ignore_uuid_types and isinstance(level.t2, str):
            try:
                # Convert string to UUID for comparison
                t2_uuid = uuid.UUID(level.t2)
                if level.t1.int != t2_uuid.int:
                    self._report_result('values_changed', level, local_tree=local_tree)
            except (ValueError, AttributeError):
                # If string is not a valid UUID, report as changed
                self._report_result('values_changed', level, local_tree=local_tree)
        else:
            self._diff_uuids(level, local_tree=local_tree)

    elif isinstance(level.t1, numbers):
        self._diff_numbers(level, local_tree=local_tree, report_type_change=report_type_change)

    elif isinstance(level.t1, Mapping):
        self._diff_dict(level, parents_ids, local_tree=local_tree)

    elif isinstance(level.t1, tuple):
        self._diff_tuple(level, parents_ids, local_tree=local_tree)

    elif isinstance(level.t1, (set, frozenset, SetOrdered)):
        self._diff_set(level, local_tree=local_tree)

    elif isinstance(level.t1, np_ndarray):
        self._diff_numpy_array(level, parents_ids, local_tree=local_tree)

    elif isinstance(level.t1, PydanticBaseModel):
        self._diff_obj(level, parents_ids, local_tree=local_tree, is_pydantic_object=True)

    elif isinstance(level.t1, Iterable):
        self._diff_iterable(level, parents_ids, _original_type=_original_type, local_tree=local_tree)

    elif isinstance(level.t1, Enum):
        self._diff_enum(level, parents_ids, local_tree=local_tree)

    else:
        # Forward local_tree like every other branch so that differences of plain
        # objects are reported into the tree the caller asked for.
        self._diff_obj(level, parents_ids, local_tree=local_tree)
def _get_view_results(self, view, verbose_level=None):
    """
    Build the result object for the requested ``view``.

    Unless ``report_repetition`` is set, matching add/remove pairs in the tree are first
    turned into value changes. ``verbose_level`` only affects the text view and falls
    back to ``self.verbose_level`` when it is ``None``.

    Raises ``ValueError`` for an unknown view.
    """
    tree = self.tree
    if not self.report_repetition:  # and self.is_root:
        tree.mutual_add_removes_to_become_value_changes()

    if view == TREE_VIEW:
        return tree
    if view == TEXT_VIEW:
        if verbose_level is not None:
            effective_verbose_level = verbose_level
        else:
            effective_verbose_level = self.verbose_level
        text_result = TextResult(tree_results=self.tree, verbose_level=effective_verbose_level)
        text_result.remove_empty_keys()
        return text_result
    if view == DELTA_VIEW:
        return self._to_delta_dict(report_repetition_required=False)
    if view == COLORED_VIEW:
        return ColoredView(t2=self.t2, tree_result=self.tree, compact=False)
    if view == COLORED_COMPACT_VIEW:
        return ColoredView(t2=self.t2, tree_result=self.tree, compact=True)
    raise ValueError(INVALID_VIEW_MSG.format(view))
@staticmethod
def _get_key_for_group_by(row, group_by, item_name):
"""
Get the key value to group a row by, using the specified group_by parameter.
Example
>>> row = {'first': 'John', 'middle': 'Joe', 'last': 'Smith'}
>>> DeepDiff._get_key_for_group_by(row, 'first', 't1')
'John'
>>> nested_row = {'id': 123, 'demographics': {'names': {'first': 'John', 'middle': 'Joe', 'last': 'Smith'}}}
>>> group_by = lambda x: x['demographics']['names']['first']
>>> DeepDiff._get_key_for_group_by(nested_row, group_by, 't1')
'John'
Args:
row (dict): The dictionary (row) to extract the group by key from.
group_by (str or callable): The key name or function to call to get to the key value to group by.
item_name (str): The name of the item, used for error messages.
Returns:
str: The key value to group by.
Raises:
KeyError: If the specified key is not found in the row.
"""
try:
if callable(group_by):
return group_by(row)
return row.pop(group_by)
except KeyError:
logger.error("Unable to group {} by {}. The key is missing in {}".format(item_name, group_by, row))
raise
def _group_iterable_to_dict(self, item, group_by, item_name):
"""
Convert a list of dictionaries into a dictionary of dictionaries
where the key is the value of the group_by key in each dictionary.
"""
group_by_level2 = None
if isinstance(group_by, (list, tuple)):
group_by_level1 = group_by[0]
if len(group_by) > 1:
group_by_level2 = group_by[1]
else:
group_by_level1 = group_by
if isinstance(item, Iterable) and not isinstance(item, Mapping):
result = {}
item_copy = deepcopy(item)
for row in item_copy:
if isinstance(row, Mapping):
key1 = self._get_key_for_group_by(row, group_by_level1, item_name)
# Track keys created by group_by to avoid type prefixing later
if hasattr(self, 'group_by_keys'):
self.group_by_keys.add(key1)
if group_by_level2:
key2 = self._get_key_for_group_by(row, group_by_level2, item_name)
# Track level 2 keys as well
if hasattr(self, 'group_by_keys'):
self.group_by_keys.add(key2)
if key1 not in result:
result[key1] = {}
if self.group_by_sort_key:
if key2 not in result[key1]:
result[key1][key2] = []
result_key1_key2 = result[key1][key2]
if row not in result_key1_key2:
result_key1_key2.append(row)
else:
result[key1][key2] = row
else:
if self.group_by_sort_key:
if key1 not in result:
result[key1] = []
if row not in result[key1]:
result[key1].append(row)
else:
result[key1] = row
else:
msg = "Unable to group {} by {} since the item {} is not a dictionary.".format(item_name, group_by_level1, row)
logger.error(msg)
raise ValueError(msg)
if self.group_by_sort_key:
if group_by_level2:
for key1, row1 in result.items():
for key2, row in row1.items():
row.sort(key=self.group_by_sort_key)
else:
for key, row in result.items():
row.sort(key=self.group_by_sort_key)
return result
msg = "Unable to group {} by {}".format(item_name, group_by)
logger.error(msg)
raise ValueError(msg)
def get_stats(self):
    """
    Return the internal statistics collected during the DeepDiff run.

    The stats object itself (``self._stats``) is returned, not a copy.
    """
    stats = self._stats
    return stats
@property
def affected_paths(self):
    """
    Get the paths that were affected, whether a value was changed or items were
    added or removed. The paths are collected into a ``SetOrdered``.

    Example
        >>> from pprint import pprint
        >>> t1 = {1: 1, 2: 2, 3: [3], 4: 4}
        >>> t2 = {1: 1, 2: 4, 3: [3, 4], 5: 5, 6: 6}
        >>> ddiff = DeepDiff(t1, t2)
        >>> pprint(ddiff, indent=4)
        {   'dictionary_item_added': ['root[5]', 'root[6]'],
            'dictionary_item_removed': ['root[4]'],
            'iterable_item_added': {'root[3][1]': 4},
            'values_changed': {'root[2]': {'new_value': 4, 'old_value': 2}}}
        >>> sorted(ddiff.affected_paths)
        ['root[2]', 'root[3][1]', 'root[4]', 'root[5]', 'root[6]']
        >>> sorted(ddiff.affected_root_keys)
        [2, 3, 4, 5, 6]
    """
    paths = SetOrdered()
    for report_key in REPORT_KEYS:
        reported = self.get(report_key)
        if not reported:
            continue
        # A report is either already a set of paths or a mapping keyed by path.
        paths |= reported if isinstance(reported, SetOrdered) else SetOrdered(reported.keys())
    return paths
@property
def affected_root_keys(self):
    """
    Get the root keys that were affected, whether a value was changed or items were
    added or removed. The keys are read from the tree results via each item's
    ``get_root_key()`` and collected into a ``SetOrdered``.

    Example
        >>> from pprint import pprint
        >>> t1 = {1: 1, 2: 2, 3: [3], 4: 4}
        >>> t2 = {1: 1, 2: 4, 3: [3, 4], 5: 5, 6: 6}
        >>> ddiff = DeepDiff(t1, t2)
        >>> pprint(ddiff, indent=4)
        {   'dictionary_item_added': ['root[5]', 'root[6]'],
            'dictionary_item_removed': ['root[4]'],
            'iterable_item_added': {'root[3][1]': 4},
            'values_changed': {'root[2]': {'new_value': 4, 'old_value': 2}}}
        >>> sorted(ddiff.affected_paths)
        ['root[2]', 'root[3][1]', 'root[4]', 'root[5]', 'root[6]']
        >>> sorted(ddiff.affected_root_keys)
        [2, 3, 4, 5, 6]
    """
    root_keys = SetOrdered()
    for report_key in REPORT_KEYS:
        reported = self.tree.get(report_key)
        if not reported:
            continue
        tree_items = reported if isinstance(reported, SetOrdered) else reported.keys()
        for tree_item in tree_items:
            root_key = tree_item.get_root_key()
            # Items that have no root key yield the ``notpresent`` sentinel and are left out.
            if root_key is not notpresent:
                root_keys.add(root_key)
    return root_keys
def __str__(self):
    """Use the colored view's string form when a colored view is active, else the parent's."""
    use_colored = hasattr(self, '_colored_view') and self.view in {COLORED_VIEW, COLORED_COMPACT_VIEW}
    if not use_colored:
        return super().__str__()
    return str(self._colored_view)
if __name__ == "__main__": # pragma: no cover
import doctest
doctest.testmod()
qlustered-deepdiff-41c7265/deepdiff/distance.py 0000664 0000000 0000000 00000033614 15162412645 0021472 0 ustar 00root root 0000000 0000000 import math
import datetime
from typing import TYPE_CHECKING, Callable, Protocol, Any, Union, Optional
from deepdiff.deephash import DeepHash
from deepdiff.helper import (
DELTA_VIEW, numbers, strings, add_to_frozen_set, not_found, only_numbers, np, np_float64, time_to_seconds,
cartesian_product_numpy, np_ndarray, np_array_factory, get_homogeneous_numpy_compatible_type_of_seq, dict_,
CannotCompare, NumberType)
from collections.abc import Mapping, Iterable
if TYPE_CHECKING:
    # Only needed by static type checkers, so it is not imported at runtime.
    from deepdiff.diff import DeepDiffProtocol

    class DistanceProtocol(DeepDiffProtocol, Protocol):
        # Attributes and methods that the DistanceMixin methods access on ``self``.
        hashes: dict
        deephash_parameters: dict
        ignore_numeric_type_changes: bool
        iterable_compare_func: Optional[Callable]
        math_epsilon: Optional[float]
        cutoff_distance_for_pairs: float

        def __get_item_rough_length(self, item, parent: str = "root") -> float:
            ...

        def _to_delta_dict(
            self,
            directed: bool = True,
            report_repetition_required: bool = True,
            always_include_values: bool = False,
        ) -> dict:
            ...

        def __calculate_item_deephash(self, item: Any) -> None:
            ...
DISTANCE_CALCS_NEEDS_CACHE = "Distance calculation can not happen once the cache is purged. Try with _cache='keep'"


class DistanceMixin:
    """Distance related helpers that are mixed into the diff class."""

    def _get_rough_distance(self: "DistanceProtocol"):
        """
        Gives a numeric value for the distance of t1 and t2 based on how many operations are needed to convert
        one to the other.

        This is a similar concept to the Levenshtein Edit Distance but for the structured data and it is designed
        to be between 0 and 1.

        A distance of zero means the objects are equal and a distance of 1 is very far.

        Note: The distance calculation formula is subject to change in future. Use the distance results only as a
        way of comparing the distances of pairs of items with other pairs rather than an absolute distance
        such as the one provided by Levenshtein edit distance.

        Info: The current algorithm is based on the number of operations that are needed to convert t1 to t2 divided
        by the number of items that make up t1 and t2.
        """
        # Numbers, datetimes, dates, timedeltas and times have a dedicated distance function.
        numeric_distance = get_numeric_types_distance(
            self.t1, self.t2, max_=self.cutoff_distance_for_pairs, use_log_scale=self.use_log_scale, log_scale_similarity_threshold=self.log_scale_similarity_threshold)
        if numeric_distance is not not_found:
            return numeric_distance

        if self.view == DELTA_VIEW:
            item = self
        else:
            item = self._to_delta_dict(report_repetition_required=False)
        diff_length = _get_item_length(item)
        if diff_length == 0:
            return 0

        t1_len = self.__get_item_rough_length(self.t1)
        t2_len = self.__get_item_rough_length(self.t2)
        total_length = t1_len + t2_len
        return diff_length / total_length

    def __get_item_rough_length(self: "DistanceProtocol", item, parent='root'):
        """
        Get the rough length of an item.
        It is used as a part of calculating the rough distance between objects.

        The length is looked up in ``self.hashes``; when it is missing, the DeepHash of the
        item is calculated into ``self.hashes`` and the lookup is repeated.

        **parameters**

        item: The item to calculate the rough length for
        parent: It is only used for DeepHash reporting purposes. Not really useful here.
        """
        if not hasattr(self, 'hashes'):
            raise RuntimeError(DISTANCE_CALCS_NEEDS_CACHE)

        def lookup_length():
            return DeepHash.get_key(self.hashes, key=item, default=None, extract_index=1,
                                    ignore_numeric_type_changes=self.ignore_numeric_type_changes)

        length = lookup_length()
        if length is None:
            self.__calculate_item_deephash(item)
            length = lookup_length()
        return length

    def __calculate_item_deephash(self: "DistanceProtocol", item: Any) -> None:
        """Run DeepHash on ``item`` so that its results are stored in ``self.hashes``."""
        DeepHash(
            item,
            hashes=self.hashes,
            parent='root',
            apply_hash=True,
            **self.deephash_parameters,
        )

    def _precalculate_distance_by_custom_compare_func(
            self: "DistanceProtocol", hashes_added, hashes_removed, t1_hashtable, t2_hashtable, _original_type):
        """
        Pre-calculate the distances of all added/removed pairs with ``iterable_compare_func``.

        Pairs for which the compare function raises ``CannotCompare`` are left out.
        The keys of the returned dictionary are "<added hash>--<removed hash>".
        """
        pre_calced_distances = dict_()
        for added_hash in hashes_added:
            added_item = t2_hashtable[added_hash].item
            for removed_hash in hashes_removed:
                try:
                    is_close_distance = self.iterable_compare_func(added_item, t1_hashtable[removed_hash].item)
                except CannotCompare:
                    continue
                # an arbitrary small distance if math_epsilon is not defined
                distance = (self.math_epsilon or 0.000001) if is_close_distance else 1
                pre_calced_distances["{}--{}".format(added_hash, removed_hash)] = distance
        return pre_calced_distances

    def _precalculate_numpy_arrays_distance(
            self: "DistanceProtocol", hashes_added, hashes_removed, t1_hashtable, t2_hashtable, _original_type):
        """
        Pre-calculate the distances of all added/removed pairs in one numpy operation.

        Returns ``None`` when the first added item is itself an array or a list, or when no
        common numpy compatible type can be found for the items. Otherwise returns a
        dictionary keyed by "<added hash>--<removed hash>".
        """
        # We only want to deal with 1D arrays.
        first_added_item = t2_hashtable[next(iter(hashes_added))].item
        if isinstance(first_added_item, (np_ndarray, list)):
            return

        pre_calced_distances = dict_()
        added = [t2_hashtable[k].item for k in hashes_added]
        removed = [t1_hashtable[k].item for k in hashes_removed]

        if _original_type is None:
            added_numpy_compatible_type = get_homogeneous_numpy_compatible_type_of_seq(added)
            removed_numpy_compatible_type = get_homogeneous_numpy_compatible_type_of_seq(removed)
            if added_numpy_compatible_type and added_numpy_compatible_type == removed_numpy_compatible_type:
                _original_type = added_numpy_compatible_type
        if _original_type is None:
            return

        added = np_array_factory(added, dtype=_original_type)
        removed = np_array_factory(removed, dtype=_original_type)

        pairs_transposed = cartesian_product_numpy(added, removed).T
        distances = _get_numpy_array_distance(
            pairs_transposed[0], pairs_transposed[1],
            max_=self.cutoff_distance_for_pairs,
            use_log_scale=self.use_log_scale,
            log_scale_similarity_threshold=self.log_scale_similarity_threshold,
        )

        # The distances are read in the same nested order as the hashes are iterated here.
        pair_keys = (
            "{}--{}".format(added_hash, removed_hash)
            for added_hash in hashes_added
            for removed_hash in hashes_removed
        )
        for index, pair_key in enumerate(pair_keys):
            pre_calced_distances[pair_key] = distances[index]
        return pre_calced_distances
def _get_item_length(item, parents_ids=frozenset([])):
    """
    Get the number of operations in a diff object.
    It is designed mainly for the delta view output
    but can be used with other dictionary types of view outputs too.

    Numbers, strings and classes count as 1. Mappings and other iterables are the sum
    of the lengths of their members, where members whose id is already in
    ``parents_ids`` are left out. For any other object that has a ``__dict__``, the
    lengths of the entries yielded by iterating that ``__dict__`` are summed.
    """
    if isinstance(item, Mapping):
        total = 0
        for key, subitem in item.items():
            # dedupe the repetition report so the number of times items have shown up does not affect the distance.
            if key in {'iterable_items_added_at_indexes', 'iterable_items_removed_at_indexes'}:
                deduped = dict_()
                for path_, indexes_to_items in subitem.items():
                    seen_value_ids = set()
                    unique_items = dict_()
                    for index, value in indexes_to_items.items():
                        value_id = id(value)
                        if value_id in seen_value_ids:
                            continue
                        seen_value_ids.add(value_id)
                        unique_items[index] = value
                    deduped[path_] = unique_items
                subitem = deduped

            # internal keys such as _numpy_paths should not count towards the distance.
            # old_type and old_value are metadata about the previous state, not additional operations.
            if isinstance(key, strings) and (
                    key.startswith('_') or key in ('deep_distance', 'new_path', 'old_type', 'old_value')):
                continue

            subitem_id = id(subitem)
            if parents_ids and subitem_id in parents_ids:
                continue
            total += _get_item_length(subitem, add_to_frozen_set(parents_ids, subitem_id))
        return total

    if isinstance(item, numbers) or isinstance(item, strings):
        return 1

    if isinstance(item, Iterable):
        total = 0
        for subitem in item:
            subitem_id = id(subitem)
            if parents_ids and subitem_id in parents_ids:
                continue
            total += _get_item_length(subitem, add_to_frozen_set(parents_ids, subitem_id))
        return total

    if isinstance(item, type):  # it is a class
        return 1

    total = 0
    if hasattr(item, '__dict__'):
        for subitem in item.__dict__:
            total += _get_item_length(subitem, add_to_frozen_set(parents_ids, id(subitem)))
    return total
def _get_numbers_distance(num1, num2, max_=1, use_log_scale=False, log_scale_similarity_threshold=0.1):
"""
Get the distance of 2 numbers. The output is a number between 0 to the max.
The reason is the
When max is returned means the 2 numbers are really far, and 0 means they are equal.
"""
if num1 == num2:
return 0
if use_log_scale:
distance = logarithmic_distance(num1, num2)
if distance < 0:
return 0
return distance
if not isinstance(num1, float):
num1 = float(num1)
if not isinstance(num2, float):
num2 = float(num2)
# Since we have a default cutoff of 0.3 distance when
# getting the pairs of items during the ingore_order=True
# calculations, we need to make the divisor of comparison very big
# so that any 2 numbers can be chosen as pairs.
divisor = (num1 + num2) / max_
if divisor == 0:
return max_
try:
return min(max_, abs((num1 - num2) / divisor))
except Exception: # pragma: no cover. I don't think this line will ever run but doesn't hurt to leave it.
return max_ # pragma: no cover
def _numpy_div(a, b, replace_inf_with=1):
    """
    Divide ``a`` by ``b`` element-wise as float64.

    Positions where ``b`` is zero keep ``replace_inf_with`` instead of being divided,
    and positions where ``a`` equals ``b`` are set to 0.
    """
    # Pre-filled output: np.divide only writes the positions selected by ``where``.
    prefilled = np.full(shape=a.shape, fill_value=replace_inf_with, dtype=np_float64)
    divisor_is_nonzero = b != 0
    quotient = np.divide(a, b, out=prefilled, where=divisor_is_nonzero, dtype=np_float64)
    # wherever 2 numbers are the same, make sure the distance is zero. This is mainly for 0 divided by zero.
    quotient[a == b] = 0
    return quotient
# To deal with numbers close to zero
MATH_LOG_OFFSET = 1e-10


def numpy_apply_log_keep_sign(array, offset=MATH_LOG_OFFSET):
    """
    Return ``log(abs(array) + offset)`` element-wise, carrying the sign of each
    original element.
    """
    magnitudes = np.abs(array) + offset
    return np.copysign(np.log(magnitudes), array)
def logarithmic_similarity(a: NumberType, b: NumberType, threshold: float=0.1) -> bool:
    """
    Tell whether the logarithmic distance of ``a`` and ``b`` is below ``threshold``.

    A threshold of 0.1 translates to about 10.5% difference.
    A threshold of 0.5 translates to about 65% difference.
    A threshold of 0.05 translates to about 5.1% difference.
    """
    distance = logarithmic_distance(a, b)
    return distance < threshold
def logarithmic_distance(a: NumberType, b: NumberType) -> float:
    """
    Return the absolute difference of the signed logarithms of ``a`` and ``b``.

    Each number is converted to float, the logarithm of its absolute value plus
    ``MATH_LOG_OFFSET`` is taken, and the sign of the number is applied to that logarithm.
    """
    operands = (float(a), float(b))
    signed_logs = [
        math.copysign(math.log(abs(value) + MATH_LOG_OFFSET), value)
        for value in operands
    ]
    return abs(signed_logs[0] - signed_logs[1])
def _get_numpy_array_distance(num1, num2, max_=1, use_log_scale=False, log_scale_similarity_threshold=0.1):
    """
    Get the element-wise distance of 2 arrays of numbers.

    Each output is a number between 0 and ``max_``: ``max_`` means the 2 numbers are
    really far apart and 0 means they are equal. With ``use_log_scale`` the signed
    logarithms of the inputs are compared instead, and distances below
    ``log_scale_similarity_threshold`` are set to 0.
    """
    if use_log_scale:
        num1, num2 = numpy_apply_log_keep_sign(num1), numpy_apply_log_keep_sign(num2)

    # Since we have a default cutoff of 0.3 distance when
    # getting the pairs of items during the ignore_order=True
    # calculations, we need to make the divisor of comparison very big
    # so that any 2 numbers can be chosen as pairs.
    divisor = (num1 + num2) / max_
    ratio = _numpy_div((num1 - num2), divisor, replace_inf_with=max_)
    distance_array = np.clip(np.absolute(ratio), 0, max_)

    if use_log_scale:
        within_threshold = distance_array < log_scale_similarity_threshold
        distance_array[within_threshold] = 0
    return distance_array
def _get_datetime_distance(date1, date2, max_, use_log_scale, log_scale_similarity_threshold):
    """
    Distance of 2 datetimes, calculated on their ``timestamp()`` values.

    ``use_log_scale`` and ``log_scale_similarity_threshold`` are accepted but not forwarded.
    """
    first_timestamp = date1.timestamp()
    second_timestamp = date2.timestamp()
    return _get_numbers_distance(first_timestamp, second_timestamp, max_)
def _get_date_distance(date1, date2, max_, use_log_scale, log_scale_similarity_threshold):
    """
    Distance of 2 dates, calculated on their ``toordinal()`` values.

    ``use_log_scale`` and ``log_scale_similarity_threshold`` are accepted but not forwarded.
    """
    first_ordinal = date1.toordinal()
    second_ordinal = date2.toordinal()
    return _get_numbers_distance(first_ordinal, second_ordinal, max_)
def _get_timedelta_distance(timedelta1, timedelta2, max_, use_log_scale, log_scale_similarity_threshold):
    """
    Distance of 2 timedeltas, calculated on their ``total_seconds()`` values.

    ``use_log_scale`` and ``log_scale_similarity_threshold`` are accepted but not forwarded.
    """
    first_seconds = timedelta1.total_seconds()
    second_seconds = timedelta2.total_seconds()
    return _get_numbers_distance(first_seconds, second_seconds, max_)
def _get_time_distance(time1, time2, max_, use_log_scale, log_scale_similarity_threshold):
    """
    Distance of 2 times, calculated on their ``time_to_seconds`` values.

    ``use_log_scale`` and ``log_scale_similarity_threshold`` are accepted but not forwarded.
    """
    first_seconds = time_to_seconds(time1)
    second_seconds = time_to_seconds(time2)
    return _get_numbers_distance(first_seconds, second_seconds, max_)
TYPES_TO_DIST_FUNC = [
(only_numbers, _get_numbers_distance),
(datetime.datetime, _get_datetime_distance),
(datetime.date, _get_date_distance),
(datetime.timedelta, _get_timedelta_distance),
(datetime.time, _get_time_distance),
]
def get_numeric_types_distance(num1, num2, max_, use_log_scale=False, log_scale_similarity_threshold=0.1):
    """
    Return the distance between two values of the same supported type.

    TYPES_TO_DIST_FUNC is scanned in order and the distance function of the
    first entry whose type matches both num1 and num2 is called with
    (num1, num2, max_, use_log_scale, log_scale_similarity_threshold).

    Returns not_found when no entry matches both operands.
    """
    for candidate_type, distance_func in TYPES_TO_DIST_FUNC:
        # Both operands have to be instances of the same table entry.
        if not isinstance(num1, candidate_type) or not isinstance(num2, candidate_type):
            continue
        return distance_func(num1, num2, max_, use_log_scale, log_scale_similarity_threshold)
    return not_found
qlustered-deepdiff-41c7265/deepdiff/docstrings/ 0000775 0000000 0000000 00000000000 15162412645 0021476 5 ustar 00root root 0000000 0000000 qlustered-deepdiff-41c7265/deepdiff/docstrings/authors.rst 0000664 0000000 0000000 00000021656 15162412645 0023727 0 ustar 00root root 0000000 0000000 :doc:`/index`
Authors
=======
Authors in order of the timeline of their contributions:
- `Sep Dehpour (Seperman)`_
- `Victor Hahn Castell`_ for the tree view and major contributions:
- `nfvs`_ for Travis-CI setup script.
- `brbsix`_ for initial Py3 porting.
- `WangFenjin`_ for unicode support.
- `timoilya`_ for comparing list of sets when ignoring order.
- `Bernhard10`_ for significant digits comparison.
- `b-jazz`_ for PEP257 cleanup, Standardize on full names, fixing line
endings.
- `finnhughes`_ for fixing ``__slots__``
- `moloney`_ for Unicode vs. Bytes default
- `serv-inc`_ for adding help(deepdiff)
- `movermeyer`_ for updating docs
- `maxrothman`_ for search in inherited class attributes
- `maxrothman`_ for search for types/objects
- `MartyHub`_ for exclude regex paths
- `sreecodeslayer`_ for DeepSearch match_string
- Brian Maissy `brianmaissy`_ for weakref fix, enum tests
- Bartosz Borowik `boba-2`_ for Exclude types fix when ignoring order
- Brian Maissy `brianmaissy `__ for
fixing classes which inherit from classes with slots didn’t have all
of their slots compared
- Juan Soler `Soleronline`_ for adding ignore_type_number
- `mthaddon`_ for adding timedelta diffing support
- `Necrophagos`_ for Hashing of the number 1 vs. True
- `gaal-dev`_ for adding exclude_obj_callback
- Ivan Piskunov `van-ess0`_ for deprecation warning enhancement.
- Michał Karaś `MKaras93`_ for the pretty view
- Christian Kothe `chkothe`_ for the basic support for diffing numpy
arrays
- `Timothy`_ for truncate_datetime
- `d0b3rm4n`_ for bugfix to not apply format to non numbers.
- `MyrikLD`_ for Bug Fix NoneType in ignore type groups
- Stian Jensen `stianjensen`_ for improving ignoring of NoneType in
diff
- Florian Klien `flowolf`_ for adding math_epsilon
- Tim Klein `timjklein36`_ for retaining the order of multiple
dictionary items added via Delta.
- Wilhelm Schürmann `wbsch`_ for fixing the typo with yml files.
- `lyz-code`_ for adding support for regular expressions in DeepSearch
and strict_checking feature in DeepSearch.
- `dtorres-sf`_ for adding the option for custom compare function
- Tony Wang `Tony-Wang`_ for bugfix: verbose_level==0 should disable
values_changes.
- Sun Ao `eggachecat`_ for adding custom operators.
- Sun Ao `eggachecat`_ for adding ignore_order_func.
- `SlavaSkvortsov`_ for fixing unprocessed key error.
- Håvard Thom `havardthom`_ for adding UUID support.
- Dhanvantari Tilak `Dhanvantari`_ for Bug-Fix:
``TypeError in _get_numbers_distance() when ignore_order = True``.
- Yael Mintz `yaelmi3`_ for detailed pretty print when verbose_level=2.
- Mikhail Khviyuzov `mskhviyu`_ for Exclude obj callback strict.
- `dtorres-sf`_ for the fix for diffing using iterable_compare_func with nested objects.
- `Enric Pou `__ for bug fix of ValueError
when using Decimal 0.x
- `Uwe Fladrich `__ for fixing bug when diff'ing non-sequence iterables
- `Michal Ozery-Flato `__ for
setting equal_nan=ignore_nan_inequality in the call for
np.array_equal
- `martin-kokos `__ for using Pytest’s
tmp_path fixture instead of /tmp/
- Håvard Thom `havardthom `__ for adding
include_obj_callback and include_obj_callback_strict.
- `Noam Gottlieb `__ for fixing a corner
case where numpy’s ``np.float32`` nans are not ignored when using
``ignore_nan_equality``.
- `maggelus `__ for the bugfix deephash
for paths.
- `maggelus `__ for the bugfix deephash
compiled regex.
- `martin-kokos `__ for fixing the
tests dependent on toml.
- `kor4ik `__ for the bugfix for
``include_paths`` for nested dictionaries.
- `martin-kokos `__ for using tomli
and tomli-w for dealing with toml files.
- `Alex Sauer-Budge `__ for the bugfix for
``datetime.date``.
- `William Jamieson `__ for `NumPy 2.0 compatibility `__
- `Leo Sin `__ for Supporting Python 3.12 in
the build process
- `sf-tcalhoun `__ for fixing
“Instantiating a Delta with a flat_dict_list unexpectedly mutates the
flat_dict_list”
- `dtorres-sf `__ for fixing iterable
moved items when iterable_compare_func is used.
- `Florian Finkernagel `__ for pandas
and polars support.
- Mathis Chenuet `artemisart `__ for
fixing slots classes comparison and PR review.
- Sherjeel Shabih `sherjeelshabih `__
for fixing the issue where the key deep_distance is not returned when
both compared items are equal #510
- `Juergen Skrotzky `__ for adding
empty ``py.typed``
- `Mate Valko `__ for fixing the issue so we
lower only if clean_key is instance of str via #504
- `jlaba `__ for fixing #493 include_paths,
when only certain keys are included via #499
- `Doron Behar `__ for fixing DeepHash
for numpy booleans via #496
- `Aaron D. Marasco `__ for adding
print() options which allows a user-defined string (or callback
function) to prefix every output when using the pretty() call.
- `David Hotham `__ for relaxing
orderly-set dependency via #486
- `dtorres-sf `__ for the fix for moving
nested tables when using iterable_compare_func.
- `Jim Cipar `__ for the fix recursion depth
limit when hashing numpy.datetime64
- `Enji Cooper `__ for converting legacy
setuptools use to pyproject.toml
- `Diogo Correia `__ for reporting security vulnerability in Delta and DeepDiff that could allow remote code execution.
- `am-periphery `__ for reporting CVE-2026-33155: denial-of-service via crafted pickle payloads triggering massive memory allocation.
- `echan5 `__ for adding callable ``group_by`` support.
- `yannrouillard `__ for fixing colored view display when all list items are removed.
- `tpvasconcelos `__ for fixing ``__slots__`` handling for objects with ``__getattr__``.
- `devin13cox `__ for always using t1 path for reporting.
- `vitalis89 `__ for fixing ``ignore_keys`` issue in ``detailed__dict__``.
- `ljames8 `__ for fixing logarithmic similarity type hint.
- `srini047 `__ for fixing README typo.
- `Nagato-Yuzuru `__ for colored view tests.
- `akshat62 `__ for adding Fraction numeric support.
.. _Sep Dehpour (Seperman): http://www.zepworks.com
.. _Victor Hahn Castell: http://hahncastell.de
.. _nfvs: https://github.com/nfvs
.. _brbsix: https://github.com/brbsix
.. _WangFenjin: https://github.com/WangFenjin
.. _timoilya: https://github.com/timoilya
.. _Bernhard10: https://github.com/Bernhard10
.. _b-jazz: https://github.com/b-jazz
.. _finnhughes: https://github.com/finnhughes
.. _moloney: https://github.com/moloney
.. _serv-inc: https://github.com/serv-inc
.. _movermeyer: https://github.com/movermeyer
.. _maxrothman: https://github.com/maxrothman
.. _MartyHub: https://github.com/MartyHub
.. _sreecodeslayer: https://github.com/sreecodeslayer
.. _brianmaissy: https://github.com/brianmaissy
.. _boba-2: https://github.com/boba-2
.. _Soleronline: https://github.com/Soleronline
.. _mthaddon: https://github.com/mthaddon
.. _Necrophagos: https://github.com/Necrophagos
.. _gaal-dev: https://github.com/gaal-dev
.. _van-ess0: https://github.com/van-ess0
.. _MKaras93: https://github.com/MKaras93
.. _chkothe: https://github.com/chkothe
.. _Timothy: https://github.com/timson
.. _d0b3rm4n: https://github.com/d0b3rm4n
.. _MyrikLD: https://github.com/MyrikLD
.. _stianjensen: https://github.com/stianjensen
.. _flowolf: https://github.com/flowolf
.. _timjklein36: https://github.com/timjklein36
.. _wbsch: https://github.com/wbsch
.. _lyz-code: https://github.com/lyz-code
.. _dtorres-sf: https://github.com/dtorres-sf
.. _Tony-Wang: https://github.com/Tony-Wang
.. _eggachecat: https://github.com/eggachecat
.. _SlavaSkvortsov: https://github.com/SlavaSkvortsov
.. _havardthom: https://github.com/havardthom
.. _Dhanvantari: https://github.com/Dhanvantari
.. _yaelmi3: https://github.com/yaelmi3
.. _mskhviyu: https://github.com/mskhviyu
Thank you for contributing to DeepDiff!
Back to :doc:`/index`
qlustered-deepdiff-41c7265/deepdiff/docstrings/basics.rst 0000664 0000000 0000000 00000034421 15162412645 0023500 0 ustar 00root root 0000000 0000000 :doc:`/index`
Basics
======
Importing
>>> from deepdiff import DeepDiff
>>> from pprint import pprint
Same object returns empty
>>> t1 = {1:1, 2:2, 3:3}
>>> t2 = t1
>>> print(DeepDiff(t1, t2))
{}
Type of an item has changed
>>> t1 = {1:1, 2:2, 3:3}
>>> t2 = {1:1, 2:"2", 3:3}
>>> pprint(DeepDiff(t1, t2), indent=2)
{ 'type_changes': { 'root[2]': { 'new_type': <class 'str'>,
'new_value': '2',
'old_type': <class 'int'>,
'old_value': 2}}}
Value of an item has changed
>>> t1 = {1:1, 2:2, 3:3}
>>> t2 = {1:1, 2:4, 3:3}
>>> pprint(DeepDiff(t1, t2, verbose_level=0), indent=2)
{'values_changed': {'root[2]': {'new_value': 4, 'old_value': 2}}}
Item added and/or removed
>>> t1 = {1:1, 3:3, 4:4}
>>> t2 = {1:1, 3:3, 5:5, 6:6}
>>> ddiff = DeepDiff(t1, t2)
>>> pprint (ddiff)
{'dictionary_item_added': [root[5], root[6]],
'dictionary_item_removed': [root[4]]}
Set verbose level to 2 in order to see the added or removed items with their values
>>> t1 = {1:1, 3:3, 4:4}
>>> t2 = {1:1, 3:3, 5:5, 6:6}
>>> ddiff = DeepDiff(t1, t2, verbose_level=2)
>>> pprint(ddiff, indent=2)
{ 'dictionary_item_added': {'root[5]': 5, 'root[6]': 6},
'dictionary_item_removed': {'root[4]': 4}}
Setting verbose level to 2 includes new_path when the path has changed for a report between t1 and t2:
>>> t1 = [1, 3]
>>> t2 = [3, 2]
>>>
>>>
>>> diff = DeepDiff(t1, t2, ignore_order=True, verbose_level=2)
>>> pprint(diff)
{'values_changed': {'root[0]': {'new_path': 'root[1]',
'new_value': 2,
'old_value': 1}}}
String difference
>>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":"world"}}
>>> t2 = {1:1, 2:4, 3:3, 4:{"a":"hello", "b":"world!"}}
>>> ddiff = DeepDiff(t1, t2)
>>> pprint (ddiff, indent = 2)
{ 'values_changed': { 'root[2]': {'new_value': 4, 'old_value': 2},
"root[4]['b']": { 'new_value': 'world!',
'old_value': 'world'}}}
String difference 2
>>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":"world!\nGoodbye!\n1\n2\nEnd"}}
>>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":"world\n1\n2\nEnd"}}
>>> ddiff = DeepDiff(t1, t2)
>>> pprint (ddiff, indent = 2)
{ 'values_changed': { "root[4]['b']": { 'diff': '--- \n'
'+++ \n'
'@@ -1,5 +1,4 @@\n'
'-world!\n'
'-Goodbye!\n'
'+world\n'
' 1\n'
' 2\n'
' End',
'new_value': 'world\n1\n2\nEnd',
'old_value': 'world!\n'
'Goodbye!\n'
'1\n'
'2\n'
'End'}}}
>>>
>>> print (ddiff['values_changed']["root[4]['b']"]["diff"])
---
+++
@@ -1,5 +1,4 @@
-world!
-Goodbye!
+world
1
2
End
List difference
>>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, 3, 4]}}
>>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2]}}
>>> ddiff = DeepDiff(t1, t2)
>>> pprint (ddiff, indent = 2)
{'iterable_item_removed': {"root[4]['b'][2]": 3, "root[4]['b'][3]": 4}}
List that contains dictionary:
>>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, {1:1, 2:2}]}}
>>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, {1:3}]}}
>>> ddiff = DeepDiff(t1, t2)
>>> pprint (ddiff, indent = 2)
{ 'dictionary_item_removed': [root[4]['b'][2][2]],
'values_changed': {"root[4]['b'][2][1]": {'new_value': 3, 'old_value': 1}}}
Sets:
>>> t1 = {1, 2, 8}
>>> t2 = {1, 2, 3, 5}
>>> ddiff = DeepDiff(t1, t2)
>>> pprint(ddiff)
{'set_item_added': [root[3], root[5]], 'set_item_removed': [root[8]]}
Named Tuples:
>>> from collections import namedtuple
>>> Point = namedtuple('Point', ['x', 'y'])
>>> t1 = Point(x=11, y=22)
>>> t2 = Point(x=11, y=23)
>>> pprint (DeepDiff(t1, t2))
{'values_changed': {'root.y': {'new_value': 23, 'old_value': 22}}}
Custom objects:
>>> class ClassA(object):
... a = 1
... def __init__(self, b):
... self.b = b
...
>>> t1 = ClassA(1)
>>> t2 = ClassA(2)
>>>
>>> pprint(DeepDiff(t1, t2))
{'values_changed': {'root.b': {'new_value': 2, 'old_value': 1}}}
Object attribute added:
>>> t2.c = "new attribute"
>>> pprint(DeepDiff(t1, t2))
{'attribute_added': [root.c],
'values_changed': {'root.b': {'new_value': 2, 'old_value': 1}}}
Datetime
DeepDiff converts all datetimes into UTC. If a datetime is timezone naive, we assume it is in UTC too.
That is different than what Python does. Python assumes your timezone naive datetime is in your local timezone.
>>> from deepdiff import DeepDiff
>>> from datetime import datetime, timezone
>>> d1 = datetime(2020, 8, 31, 13, 14, 1)
>>> d2 = datetime(2020, 8, 31, 13, 14, 1, tzinfo=timezone.utc)
>>> d1 == d2
False
>>> DeepDiff(d1, d2)
{}
.. note::
All the examples above use the default :ref:`text_view_label`.
If you want traversing functionality in the results, use the :ref:`tree_view_label`.
You just need to set view='tree' to get it in tree form.
.. _group_by_label:
Group By
--------
group_by can be used when dealing with the list of dictionaries. It converts them from lists to a single dictionary with the key defined by group_by. The common use case is when reading data from a flat CSV, and the primary key is one of the columns in the CSV. We want to use the primary key instead of the CSV row number to group the rows. The group_by can do 2D group_by by passing a list of 2 keys. It is also possible to have a callable group_by, which can be used to access keys in more nested data structures.
For example:
>>> [
... {'id': 'AA', 'name': 'Joe', 'last_name': 'Nobody'},
... {'id': 'BB', 'name': 'James', 'last_name': 'Blue'},
... {'id': 'CC', 'name': 'Mike', 'last_name': 'Apple'},
... ]
Becomes:
>>> t1 = {
... 'AA': {'name': 'Joe', 'last_name': 'Nobody'},
... 'BB': {'name': 'James', 'last_name': 'Blue'},
... 'CC': {'name': 'Mike', 'last_name': 'Apple'},
... }
With that in mind, let's take a look at the following:
>>> from deepdiff import DeepDiff
>>> t1 = [
... {'id': 'AA', 'name': 'Joe', 'last_name': 'Nobody'},
... {'id': 'BB', 'name': 'James', 'last_name': 'Blue'},
... {'id': 'CC', 'name': 'Mike', 'last_name': 'Apple'},
... ]
>>>
>>> t2 = [
... {'id': 'AA', 'name': 'Joe', 'last_name': 'Nobody'},
... {'id': 'BB', 'name': 'James', 'last_name': 'Brown'},
... {'id': 'CC', 'name': 'Mike', 'last_name': 'Apple'},
... ]
>>>
>>> DeepDiff(t1, t2)
{'values_changed': {"root[1]['last_name']": {'new_value': 'Brown', 'old_value': 'Blue'}}}
Now we use group_by='id':
>>> DeepDiff(t1, t2, group_by='id')
{'values_changed': {"root['BB']['last_name']": {'new_value': 'Brown', 'old_value': 'Blue'}}}
.. note::
group_by actually changes the structure of the t1 and t2. You can see this by using the tree view:
>>> diff = DeepDiff(t1, t2, group_by='id', view='tree')
>>> diff
{'values_changed': [<root['BB']['last_name'] t1:'Blue', t2:'Brown'>]}
>>> diff['values_changed'][0]
>>> diff['values_changed'][0].up
>>> diff['values_changed'][0].up.up
>>> diff['values_changed'][0].up.up.t1
{'AA': {'name': 'Joe', 'last_name': 'Nobody'}, 'BB': {'name': 'James', 'last_name': 'Blue'}, 'CC': {'name': 'Mike', 'last_name': 'Apple'}}
2D Example:
>>> from pprint import pprint
>>> from deepdiff import DeepDiff
>>>
>>> t1 = [
... {'id': 'AA', 'name': 'Joe', 'last_name': 'Nobody'},
... {'id': 'BB', 'name': 'James', 'last_name': 'Blue'},
... {'id': 'BB', 'name': 'Jimmy', 'last_name': 'Red'},
... {'id': 'CC', 'name': 'Mike', 'last_name': 'Apple'},
... ]
>>>
>>> t2 = [
... {'id': 'AA', 'name': 'Joe', 'last_name': 'Nobody'},
... {'id': 'BB', 'name': 'James', 'last_name': 'Brown'},
... {'id': 'CC', 'name': 'Mike', 'last_name': 'Apple'},
... ]
>>>
>>> diff = DeepDiff(t1, t2, group_by=['id', 'name'])
>>> pprint(diff)
{'dictionary_item_removed': [root['BB']['Jimmy']],
'values_changed': {"root['BB']['James']['last_name']": {'new_value': 'Brown',
'old_value': 'Blue'}}}
Callable group_by Example:
>>> from deepdiff import DeepDiff
>>>
>>> t1 = [
... {'id': 'AA', 'demographics': {'names': {'first': 'Joe', 'middle': 'John', 'last': 'Nobody'}}},
... {'id': 'BB', 'demographics': {'names': {'first': 'James', 'middle': 'Joyce', 'last': 'Blue'}}},
... {'id': 'CC', 'demographics': {'names': {'first': 'Mike', 'middle': 'Mark', 'last': 'Apple'}}},
... ]
>>>
>>> t2 = [
... {'id': 'AA', 'demographics': {'names': {'first': 'Joe', 'middle': 'John', 'last': 'Nobody'}}},
... {'id': 'BB', 'demographics': {'names': {'first': 'James', 'middle': 'Joyce', 'last': 'Brown'}}},
... {'id': 'CC', 'demographics': {'names': {'first': 'Mike', 'middle': 'Charles', 'last': 'Apple'}}},
... ]
>>>
>>> diff = DeepDiff(t1, t2, group_by=lambda x: x['demographics']['names']['first'])
>>> pprint(diff)
{'values_changed': {"root['James']['demographics']['names']['last']": {'new_value': 'Brown',
'old_value': 'Blue'},
"root['Mike']['demographics']['names']['middle']": {'new_value': 'Charles',
'old_value': 'Mark'}}}
.. _group_by_sort_key_label:
Group By - Sort Key
-------------------
group_by_sort_key is used to define how dictionaries are sorted if multiple ones fall under one group. When this parameter is used, group_by converts the lists of dictionaries into a dictionary of keys to lists of dictionaries. Then, group_by_sort_key is used to sort the dictionaries within each list.
For example, there are duplicate id values. If we only use group_by='id', one of the dictionaries with id of 'BB' will overwrite the other. However, if we also set group_by_sort_key='name', we keep both dictionaries with the id of 'BB'.
Example:
>>> [{'id': 'AA', 'int_id': 2, 'last_name': 'Nobody', 'name': 'Joe'},
... {'id': 'BB', 'int_id': 20, 'last_name': 'Blue', 'name': 'James'},
... {'id': 'BB', 'int_id': 3, 'last_name': 'Red', 'name': 'Jimmy'},
... {'id': 'CC', 'int_id': 4, 'last_name': 'Apple', 'name': 'Mike'}]
Becomes:
>>> {'AA': [{'int_id': 2, 'last_name': 'Nobody', 'name': 'Joe'}],
... 'BB': [{'int_id': 20, 'last_name': 'Blue', 'name': 'James'},
... {'int_id': 3, 'last_name': 'Red', 'name': 'Jimmy'}],
... 'CC': [{'int_id': 4, 'last_name': 'Apple', 'name': 'Mike'}]}
Example of using group_by_sort_key
>>> t1 = [
... {'id': 'AA', 'name': 'Joe', 'last_name': 'Nobody', 'int_id': 2},
... {'id': 'BB', 'name': 'James', 'last_name': 'Blue', 'int_id': 20},
... {'id': 'BB', 'name': 'Jimmy', 'last_name': 'Red', 'int_id': 3},
... {'id': 'CC', 'name': 'Mike', 'last_name': 'Apple', 'int_id': 4},
... ]
>>>
>>> t2 = [
... {'id': 'AA', 'name': 'Joe', 'last_name': 'Nobody', 'int_id': 2},
... {'id': 'BB', 'name': 'James', 'last_name': 'Brown', 'int_id': 20},
... {'id': 'CC', 'name': 'Mike', 'last_name': 'Apple', 'int_id': 4},
... ]
>>>
>>> diff = DeepDiff(t1, t2, group_by='id', group_by_sort_key='name')
>>>
>>> pprint(diff)
{'iterable_item_removed': {"root['BB'][1]": {'int_id': 3,
'last_name': 'Red',
'name': 'Jimmy'}},
'values_changed': {"root['BB'][0]['last_name']": {'new_value': 'Brown',
'old_value': 'Blue'}}}
.. _default_timezone_label:
Default Time Zone
-----------------
default_timezone defines the default timezone. If a datetime is timezone naive, which means it doesn't have a timezone, we assume the datetime is in this timezone. Also any datetime that has a timezone will be converted to this timezone so the datetimes can be compared properly all in the same timezone. Note that Python's default behavior assumes the default timezone is your local timezone. DeepDiff's default is UTC, not your local time zone.
Note that if we change the default_timezone, the output timezone changes accordingly
>>> from deepdiff import DeepDiff
>>> import pytz
>>> from datetime import date, datetime, time, timezone
>>> dt_utc = datetime(2025, 2, 3, 12, 0, 0, tzinfo=pytz.utc) # UTC timezone
>>> dt_utc2 = datetime(2025, 2, 3, 11, 0, 0, tzinfo=pytz.utc) # UTC timezone
>>> dt_ny = dt_utc.astimezone(pytz.timezone('America/New_York'))
>>> dt_ny2 = dt_utc2.astimezone(pytz.timezone('America/New_York'))
>>> diff = DeepDiff(dt_ny, dt_ny2)
>>> diff
{'values_changed': {'root': {'new_value': datetime.datetime(2025, 2, 3, 11, 0, tzinfo=datetime.timezone.utc), 'old_value': datetime.datetime(2025, 2, 3, 12, 0, tzinfo=datetime.timezone.utc)}}}
>>> diff2 = DeepDiff(dt_ny, dt_ny2, default_timezone=pytz.timezone('America/New_York'))
>>> diff2
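{'values_changed': {'root': {'new_value': datetime.datetime(2025, 2, 3, 6, 0, tzinfo=<DstTzInfo 'America/New_York' EST-1 day, 19:00:00 STD>), 'old_value': datetime.datetime(2025, 2, 3, 7, 0, tzinfo=<DstTzInfo 'America/New_York' EST-1 day, 19:00:00 STD>)}}}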
Back to :doc:`/index`
qlustered-deepdiff-41c7265/deepdiff/docstrings/changelog.rst 0000664 0000000 0000000 00000043333 15162412645 0024165 0 ustar 00root root 0000000 0000000 :doc:`/index`
Changelog
=========
DeepDiff Changelog
- v9-0-0
- migration note:
- `to_dict()` and `to_json()` now accept a `verbose_level` parameter and always return a usable text-view dict. When the original view is `'tree'`, they default to `verbose_level=2` for full detail. The old `view_override` parameter is removed. To get the previous results, you will need to pass the explicit verbose_level to `to_json` and `to_dict` if you are using the tree view.
- Dropping support for Python 3.9
- Support for python 3.14
- Added support for callable ``group_by`` thanks to `echan5 `__
- Added ``FlatDeltaDict`` TypedDict for ``to_flat_dicts`` return type
- Fixed colored view display when all list items are removed thanks to `yannrouillard `__
- Fixed ``hasattr()`` swallowing ``AttributeError`` in ``__slots__`` handling for objects with ``__getattr__`` thanks to `tpvasconcelos `__
- Fixed ``ignore_order=True`` missing int-vs-float type changes
- Always use t1 path for reporting thanks to `devin13cox `__
- Fixed ``_convert_oversized_ints`` failing on NamedTuples
- Fixed orjson ``TypeError`` for integers exceeding 64-bit range
- Fixed parameter bug in ``to_flat_dicts`` where ``include_action_in_path`` and ``report_type_changes`` were not being passed through
- Fixed ``ignore_keys`` issue in ``detailed__dict__`` thanks to `vitalis89 `__
- Fixed logarithmic similarity type hint thanks to `ljames8 `__
- Added ``Fraction`` numeric support thanks to `akshat62 `__
- v8-6-2
- Security fix (CVE-2026-33155): Prevent denial-of-service via crafted pickle payloads that trigger massive memory allocation through the REDUCE opcode. Size-sensitive callables like ``bytes()`` and ``bytearray()`` are now wrapped to reject allocations exceeding 128 MB.
- v8-6-1
- Patched security vulnerability in the Delta class which was vulnerable to class pollution via its constructor, and when combined with a gadget available in DeepDiff itself, it could lead to Denial of Service and Remote Code Execution (via insecure Pickle deserialization).
- v8-6-0
- Added Colored View thanks to @mauvilsa
- Added support for applying deltas to NamedTuple thanks to @paulsc
- Fixed test_delta.py with Python 3.14 thanks to @Romain-Geissler-1A
- Added python property serialization to json
- Added ip address serialization
- Switched to UV from pip
- Added Claude.md
- Added uuid hashing thanks to @akshat62
- Added ``ignore_uuid_types`` flag to DeepDiff to avoid type reports
when comparing UUID and string.
- Added comprehensive type hints across the codebase (multiple commits
for better type safety)
- Added support for memoryview serialization
- Added support for bytes serialization (non-UTF8 compatible)
- Fixed bug where group_by with numbers would leak type info into group
path reports
- Fixed bug in ``_get_clean_to_keys_mapping`` without explicit
significant digits
- Added support for python dict key serialization
- Enhanced support for IP address serialization with safe module imports
- Added development tooling improvements (pyright config, .envrc
example)
- Updated documentation and development instructions
- v8-5-0
- Updating deprecated pydantic calls
- Switching to pyproject.toml
- Fix for moving nested tables when using iterable_compare_func by dtorres-sf.
- Fix recursion depth limit when hashing numpy.datetime64
- Moving from legacy setuptools use to pyproject.toml
- v8-4-2
- fixes the type hints for the base
- fixes summarize so if json dumps fails, we can still get a repr of the results
- adds ipaddress support
- v8-4-1
- Adding BaseOperatorPlus base class for custom operators
- default_timezone can be passed now to set your default timezone to something other than UTC.
- New summarization algorithm that produces valid json
- Better type hint support
- Breaking change in DeepHash where we raise Exception instead of logging if we can't hash a value.
- Added the log_stacktrace parameter to DeepDiff. When True, it will log the stacktrace along with the error.
- v8-3-0
- Fixed some static typing issues
- Added the summarize module for better repr of nested values
- v8-2-0
- Small optimizations so we don't load functions that are not needed
- Updated the minimum version of Orderly-set
- Normalize all datetimes into UTC. Assume timezone naive datetimes are UTC.
- v8-1-0
- Removing deprecated lines from setup.py
- Added ``prefix`` option to ``pretty()``
- Fixes hashing of numpy boolean values.
- Fixes ``__slots__`` comparison when the attribute doesn’t exist.
- Relaxing orderly-set reqs
- Added Python 3.13 support
- Only lower if clean_key is instance of str
- Fixes issue where the key deep_distance is not returned when both
compared items are equal
- Fixes exclude_paths fails to work in certain cases
- exclude_paths fails to work
- Fixes to_json() method chokes on standard json.dumps() kwargs such as
sort_keys
- to_dict() method chokes on standard json.dumps() kwargs
- Fixes accessing the affected_root_keys property on the diff object
returned by DeepDiff fails when one of the dicts is empty
- Fixes accessing the affected_root_keys property on the
diff object returned by DeepDiff fails when one of the dicts is empty
- v8-0-1
- Bugfix. Numpy should be optional.
- v8-0-0
- With the introduction of `threshold_to_diff_deeper`, the values returned are different than in previous versions of DeepDiff. You can still get the older values by setting `threshold_to_diff_deeper=0`. However to signify that enough has changed in this release that the users need to update the parameters passed to DeepDiff, we will be doing a major version update.
- `use_enum_value=True` makes it so when diffing enum, we use the enum's value. It makes it so comparing an enum to a string or any other value is not reported as a type change.
- `threshold_to_diff_deeper=float` is a number between 0 and 1. When comparing dictionaries that have a small intersection of keys, we will report the dictionary as a `new_value` instead of reporting individual keys changed. If you set it to zero, you get the same results as DeepDiff 7.0.1 and earlier, which means this feature is disabled. The new default is 0.33 which means if less than one third of keys between dictionaries intersect, report it as a new object.
- Deprecated `ordered-set` and switched to `orderly-set`. The `ordered-set` package was not being maintained anymore and starting Python 3.6, there were better options for sets that are ordered. I forked one of the new implementations, modified it, and published it as `orderly-set`.
- Added `use_log_scale:bool` and `log_scale_similarity_threshold:float`. They can be used to ignore small changes in numbers by comparing their differences in logarithmic space. This is different than ignoring the difference based on significant digits.
- json serialization of reversed lists.
- Fix for iterable moved items when `iterable_compare_func` is used.
- Pandas and Polars support
- v7-0-1
- Fixes the translation between Difflib opcodes and Delta flat rows.
- v7-0-0
- When verbose=2, return ``new_path`` when the ``path`` and
``new_path`` are different (for example when ignore_order=True and
the index of items have changed).
- Dropping support for Python 3.7
- Introducing serialize to flat rows for delta objects.
- fixes the issue with hashing ``datetime.date`` objects where it
treated them as numbers instead of dates (fixes #445).
- upgrading orjson to the latest version
- Fix for bug when diffing two lists with ignore_order and providing
compare_func
- Fixes “Wrong diff on list of strings” #438
- Supporting Python 3.12 in the build process by `Leo
Sin `__
- Fixes “Instantiating a Delta with a flat_dict_list unexpectedly
mutates the flat_dict_list” #457 by
`sf-tcalhoun `__
- Fixes “Error on Delta With None Key and Removed Item from List”
#441
- Fixes “Error when comparing two nested dicts with 2 added fields”
#450
- Fixes “Error when subtracting Delta from a dictionary” #443
- v6-7-1
- Support for subtracting delta objects when iterable_compare_func
is used.
- Better handling of force adding a delta to an object.
- Fix for
```Can't compare dicts with both single and double quotes in keys`` `__
- Updated docs for Inconsistent Behavior with math_epsilon and
ignore_order = True
- v6-7-0
- Delta can be subtracted from other objects now.
- verify_symmetry is deprecated. Use bidirectional instead.
- always_include_values flag in Delta can be enabled to include
values in the delta for every change.
- Fix for Delta.\__add\_\_ breaks with esoteric dict keys.
- v6-6-1
- Fix for `DeepDiff raises decimal exception when using significant
digits `__
- Introducing group_by_sort_key
- Adding group_by 2D. For example
``group_by=['last_name', 'zip_code']``
- v6-6-0
- Numpy 2.0 support
- Adding
`Delta.to_flat_dicts `__
- v6-5-0
- Adding
```parse_path`` `__
- v6-4-1
- Bugfix: Keep Numpy Optional
- v6-4-0
- `Add Ignore List Order Option to
DeepHash `__ by
`Bobby Morck `__
- `pyyaml to 6.0.1 to fix cython build
problems `__ by
`Robert Bo Davis `__
- `Precompiled regex simple
diff `__ by
`cohml `__
- New flag: ``zip_ordered_iterables`` for forcing iterable items to
be compared one by one.
- v6-3-1
- Bugfix deephash for paths by
`maggelus `__
- Bugfix deephash compiled regex
`maggelus `__
- Fix tests dependent on toml by
`martin-kokos `__
- Bugfix for ``include_paths`` for nested dictionaries by
`kor4ik `__
- Use tomli and tomli-w for dealing with toml files by
`martin-kokos `__
- Bugfix for ``datetime.date`` by `Alex
Sauer-Budge `__
- v6-3-0
- ``PrefixOrSuffixOperator``: This operator will skip strings that
are suffix or prefix of each other.
- ``include_obj_callback`` and ``include_obj_callback_strict`` are
added by `Håvard Thom `__.
- Fixed a corner case where numpy’s ``np.float32`` nans are not
ignored when using ``ignore_nan_equality`` by `Noam
Gottlieb `__
- ``orjson`` becomes optional again.
- Fix for ``ignore_type_in_groups`` with numeric values so it does
not report number changes when the number types are different.
- v6-2-3
- Switching to Orjson for serialization to improve the performance.
- Setting ``equal_nan=ignore_nan_inequality`` in the call for
``np.array_equal``
- Using Pytest’s tmp_path fixture instead of ``/tmp/``
- v6-2-2
- Enum test fix for python 3.11
- Adding support for dateutils rrules
- v6-2-1
- Removed the print statements.
- v6-2-0
- Major improvement in the diff report for lists when items are all
hashable and the order of items is important.
- v6-1-0
- DeepDiff.affected_paths can be used to get the list of all paths
where a change, addition, or deletion was reported for.
- DeepDiff.affected_root_keys can be used to get the list of all
root keys where a change, addition, or deletion was reported for.
- Bugfix: ValueError when using Decimal 0.x #339 by `Enric
Pou `__
- Serialization of UUID
- v6-0-0
- `Exclude obj callback
strict `__
parameter is added to DeepDiff by Mikhail Khviyuzov
`mskhviyu `__.
- A fix for diffing using ``iterable_compare_func`` with nested
objects by `dtorres-sf `__ who
originally contributed this feature.
- v5-7-0:
- https://github.com/seperman/deepdiff/pull/284 Bug-Fix: TypeError
in \_get_numbers_distance() when ignore_order = True by
@Dhanvantari
- https://github.com/seperman/deepdiff/pull/280 Add support for
UUIDs by @havardthom
- Major bug in delta when it comes to iterable items added or
removed is investigated by @uwefladrich and resolved by @seperman
- v5-6-0: Adding custom operators, and ignore_order_func. Bugfix: verbose_level==0 should disable values_changes. Bugfix: unprocessed key error.
- v5-5-0: adding iterable_compare_func for DeepDiff, adding output_format of list for path() in tree view.
- v5-4-0: adding strict_checking for numbers in DeepSearch.
- v5-3-0: add support for regular expressions in DeepSearch.
- v5-2-3: Retaining the order of multiple dictionary items added via Delta. Fixed the typo with yml files in deep cli. Fixing Grep RecursionError when using non UTF-8 characters. Allowing kwargs to be passed to to_json method.
- v5-2-2: Fixed Delta serialization when None type is present.
- v5-2-0: Removed Murmur3 as the preferred hashing method. Using SHA256 by default now. Added commandline for deepdiff. Added group_by. Added math_epsilon. Improved ignoring of NoneType.
- v5-0-2: Bug Fix NoneType in ignore type groups https://github.com/seperman/deepdiff/issues/207
- v5-0-1: Bug fix to not apply format to non numbers.
- v5-0-0: Introducing the Delta object, Improving Numpy support, Fixing tuples comparison when ignore_order=True, Dramatically improving the results when ignore_order=True by running in passes, Introducing pretty print view, deep_distance, purge, progress logging, cache and truncate_datetime.
- v4-3-3: Adds support for datetime.time
- v4-3-2: Deprecation Warning Enhancement
- v4-3-1: Fixing the issue with exclude_path and hash calculations when dictionaries were inside iterables. https://github.com/seperman/deepdiff/issues/174
- v4-3-0: adding exclude_obj_callback
- v4-2-0: .json property is finally removed. Fix for Py3.10. Dropping support for EOL Python 3.4. Ignoring private keys when calculating hashes. For example __init__ is not a part of hash calculation anymore. Fix for #166 Problem with comparing lists, with a boolean as element.
- v4-1-0: .json property is finally removed.
- v4-0-9: Fixing the bug for hashing custom unhashable objects
- v4-0-8: Adding ignore_nan_inequality for float('nan')
- v4-0-7: Hashing of the number 1 vs. True
- v4-0-6: found a tiny bug in Python formatting of numbers in scientific notation. Added a workaround.
- v4-0-5: Fixing number diffing. Adding number_format_notation and number_to_string_func.
- v4-0-4: Adding ignore_string_case and ignore_type_subclasses
- v4-0-3: Adding versionbump tool for release
- v4-0-2: Fixing installation issue where rst files are missing.
- v4-0-1: Fixing installation Tarball missing requirements.txt . DeepDiff v4+ should not show up as pip installable for Py2. Making Murmur3 installation optional.
- v4-0-0: Ending Python 2 support, Adding more functionalities and documentation for DeepHash. Switching to Pytest for testing. Switching to Murmur3 128bit for hashing. Fixing classes which inherit from classes with slots didn't have all of their slots compared. Renaming ContentHash to DeepHash. Adding exclude by path and regex path to DeepHash. Adding ignore_type_in_groups. Adding match_string to DeepSearch. Adding Timedelta object diffing.
- v3-5-0: Exclude regex path
- v3-3-0: Searching for objects and class attributes
- v3-2-2: Adding help(deepdiff)
- v3-2-1: Fixing hash of None
- v3-2-0: Adding grep for search: object | grep(item)
- v3-1-3: Unicode vs. Bytes default fix
- v3-1-2: NotPresent Fix when item is added or removed.
- v3-1-1: Bug fix when item value is None (#58)
- v3-1-0: Serialization to/from json
- v3-0-0: Introducing Tree View
- v2-5-3: Bug fix on logging for content hash.
- v2-5-2: Bug fixes on content hash.
- v2-5-0: Adding ContentHash module to fix ignore_order once and for all.
- v2-1-0: Adding Deep Search. Now you can search for item in an object.
- v2-0-0: Exclusion patterns better coverage. Updating docs.
- v1-8-0: Exclusion patterns.
- v1-7-0: Deep Set comparison.
- v1-6-0: Unifying key names. i.e newvalue is new_value now. For backward compatibility, newvalue still works.
- v1-5-0: Fixing ignore order containers with unordered items. Adding significant digits when comparing decimals. Changes property is deprecated.
- v1-1-0: Changing Set, Dictionary and Object Attribute Add/Removal to be reported as Set instead of List. Adding Pypy compatibility.
- v1-0-2: Checking for ImmutableMapping type instead of dict
- v1-0-1: Better ignore order support
- v1-0-0: Restructuring output to make it more useful. This is NOT backward compatible.
- v0-6-1: Fixing iterables with unhashable when order is ignored
- v0-6-0: Adding unicode support
- v0-5-9: Adding decimal support
- v0-5-8: Adding ignore order for unhashables support
- v0-5-7: Adding ignore order support
- v0-5-6: Adding slots support
- v0-5-5: Adding loop detection
Back to :doc:`/index`
qlustered-deepdiff-41c7265/deepdiff/docstrings/colored_view.rst 0000664 0000000 0000000 00000005766 15162412645 0024727 0 ustar 00root root 0000000 0000000 .. _colored_view_label:
Colored View
============
The `ColoredView` feature in `deepdiff` provides a human-readable, color-coded JSON output of the
differences between two objects. This feature is particularly useful for visualizing changes in a
clear and intuitive manner.
- **Color-Coded Differences:**
- **Added Elements:** Shown in green.
- **Removed Elements:** Shown in red.
- **Changed Elements:** The old value is shown in red, and the new value is shown in green.
Usage
-----
To use the `ColoredView`, simply pass the `COLORED_VIEW` option to the `DeepDiff` function:
.. code-block:: python
from deepdiff import DeepDiff
from deepdiff.helper import COLORED_VIEW
t1 = {"name": "John", "age": 30, "scores": [1, 2, 3], "address": {"city": "New York", "zip": "10001"}}
t2 = {"name": "John", "age": 31, "scores": [1, 2, 4], "address": {"city": "Boston", "zip": "10001"}, "new": "value"}
diff = DeepDiff(t1, t2, view=COLORED_VIEW)
print(diff)
Or from command line:
.. code-block:: bash
deep diff --view colored t1.json t2.json
The output will look something like this:
.. raw:: html
Colored Compact View
--------------------
For a more concise output, especially with deeply nested objects where many parts are unchanged,
the `ColoredView` with the compact option can be used. This view is similar but collapses
unchanged nested dictionaries to `{...}` and unchanged lists/tuples to `[...]`. To use the compact
option do:
.. code-block:: python
from deepdiff import DeepDiff
from deepdiff.helper import COLORED_COMPACT_VIEW
t1 = {"name": "John", "age": 30, "scores": [1, 2, 3], "address": {"city": "New York", "zip": "10001"}}
t2 = {"name": "John", "age": 31, "scores": [1, 2, 4], "address": {"city": "New York", "zip": "10001"}, "new": "value"}
diff = DeepDiff(t1, t2, view=COLORED_COMPACT_VIEW)
print(diff)
Or from command line:
.. code-block:: bash
deep diff --view colored_compact t1.json t2.json
The output will look something like this:
.. raw:: html
qlustered-deepdiff-41c7265/deepdiff/docstrings/commandline.rst 0000664 0000000 0000000 00000020276 15162412645 0024525 0 ustar 00root root 0000000 0000000 :doc:`/index`
Command Line
============
`New in DeepDiff 5.2.0`
DeepDiff provides a command line interface to a subset of the functionality that it provides through its Python API.
The commands are:
- :ref:`deep_diff_command`
- :ref:`deep_grep_command`
- :ref:`deep_extract_command`
- :ref:`deep_patch_command`
.. _deep_diff_command:
deep diff command
-----------------
Run
.. code:: bash
$ deep diff
to get the options:
.. code-block:: bash
$ deep diff --help
Usage: deep diff [OPTIONS] T1 T2
Deep Diff Commandline
Deep Difference of content in files.
It can read csv, tsv, json, yaml, and toml files.
T1 and T2 are the path to the files to be compared with each other.
Options:
--cutoff-distance-for-pairs FLOAT
[default: 0.3]
--cutoff-intersection-for-pairs FLOAT
[default: 0.7]
--cache-size INTEGER [default: 0]
--cache-tuning-sample-size INTEGER
[default: 0]
--cache-purge-level INTEGER RANGE
[default: 1]
--create-patch [default: False]
--exclude-paths TEXT
--exclude-regex-paths TEXT
--math-epsilon DECIMAL
--get-deep-distance [default: False]
--group-by TEXT
--ignore-order [default: False]
--ignore-string-type-changes [default: False]
--ignore-numeric-type-changes [default: False]
--ignore-type-subclasses [default: False]
--ignore-string-case [default: False]
--ignore-nan-inequality [default: False]
--include-private-variables [default: False]
--log-frequency-in-sec INTEGER [default: 0]
--max-passes INTEGER [default: 10000000]
--max_diffs INTEGER
--number-format-notation [f|e] [default: f]
--progress-logger [info|error] [default: info]
--report-repetition [default: False]
--significant-digits INTEGER
--truncate-datetime [second|minute|hour|day]
--verbose-level INTEGER RANGE [default: 1]
--view [-|colored|colored_compact]
[default: -]
Format for displaying differences.
--help Show this message and exit.
Example usage:
Let's imagine we have t1.csv and t2.csv:
.. csv-table:: t1.csv
:file: ../tests/fixtures/t1.csv
:header-rows: 1
.. csv-table:: t2.csv
:file: ../tests/fixtures/t2.csv
:header-rows: 1
We can run:
.. code-block:: bash
$ deep diff t1.csv t2.csv --ignore-order
{'values_changed': {"root[2]['zip']": {'new_value': 90002, 'old_value': 90001}}}
As you can see here the path to the item that is being changed is `root[2]['zip']` which is ok but
what if we assume last names are unique and group by last_name?
.. code-block:: bash
$ deep diff t1.csv t2.csv --ignore-order --group-by last_name
{ 'values_changed': { "root['Molotov']['zip']": { 'new_value': 90002,
'old_value': 90001}}}
The path is perhaps more readable now: `root['Molotov']['zip']`. It is more clear that the zip code of Molotov has changed.
.. Note::
The parameters in the deep diff commandline are a subset of those in :ref:`deepdiff_label` 's Python API.
To output in a specific format, for example the colored compact view (see :doc:`colored_view` for output details):
.. code-block:: bash
$ deep diff t1.json t2.json --view colored_compact
.. _deep_grep_command:
deep grep command
-----------------
Run
.. code:: bash
$ deep grep
to get the options:
.. code-block:: bash
$ deep grep --help
Usage: deep grep [OPTIONS] ITEM PATH
Deep Grep Commandline
Grep through the contents of a file and find the path to the item.
It can read csv, tsv, json, yaml, and toml files.
Options:
-i, --ignore-case [default: False]
--exact-match [default: False]
--exclude-paths TEXT
--exclude-regex-paths TEXT
--verbose-level INTEGER RANGE [default: 1]
--help Show this message and exit.
.. csv-table:: t1.csv
:file: ../tests/fixtures/t1.csv
:header-rows: 1
.. code-block:: bash
$ deep grep --ignore-case james t1.csv
{'matched_values': ["root[2]['first_name']"]}
.. _deep_extract_command:
deep extract command
--------------------
Run
.. code:: bash
$ deep extract
to get the options:
.. code-block:: bash
$ deep extract --help
Usage: deep extract [OPTIONS] PATH_INSIDE PATH
Deep Extract Commandline
Extract an item from a file based on the path that is passed. It can read
csv, tsv, json, yaml, and toml files.
Options:
--help Show this message and exit.
.. csv-table:: t1.csv
:file: ../tests/fixtures/t1.csv
:header-rows: 1
.. code-block:: bash
$ deep extract "root[2]['first_name']" t1.csv
'James'
.. _deep_patch_command:
deep patch command
------------------
Run
.. code:: bash
$ deep patch --help
to get the options:
.. code-block:: text
$ deep patch --help
Usage: deep patch [OPTIONS] PATH DELTA_PATH
Deep Patch Commandline
Patches a file based on the information in a delta file. The delta file
can be created by the deep diff command and passing the --create-patch
argument.
Deep Patch is similar to Linux's patch command. The difference is that it
is made for patching data. It can read csv, tsv, json, yaml, and toml
files.
Options:
-b, --backup [default: False]
--raise-errors [default: False]
--help Show this message and exit.
Imagine if we have the following files:
.. csv-table:: t1.csv
:file: ../tests/fixtures/t1.csv
:header-rows: 1
.. csv-table:: t2.csv
:file: ../tests/fixtures/t2.csv
:header-rows: 1
First we need to create a "delta" file which represents the difference between the 2 files.
.. code-block:: bash
$ deep diff t1.csv t2.csv --ignore-order
{'values_changed': {"root[2]['zip']": {'new_value': 90002, 'old_value': 90001}}}
We create the delta by using the deep diff command and passing the `--create-patch` argument.
However since we are using `--ignore-order`, `deep diff` will ask us to also use `--report-repetition`:
.. code-block:: bash
deep diff t1.csv t2.csv --ignore-order --report-repetition --create-patch
=}values_changed}root[2]['zip']} new_valueJ_sss.%
Note that the delta is not human readable. It is meant for us to pass it into a file:
.. code-block:: bash
deep diff t1.csv t2.csv --ignore-order --report-repetition --create-patch > patch1.pickle
Now this delta file is ready to be applied by the `deep patch` command to any json, csv, toml or yaml file!
It is expecting the structure of the file to be similar to the one in the csv file though.
Let's look at this yaml file:
`another.yaml`
.. code-block:: yaml
---
-
first_name: Joe
last_name: Nobody
zip: 90011
-
first_name: Jack
last_name: Doit
zip: 22222
-
first_name: Sara
last_name: Stanley
zip: 11111
All that our delta knows is that `root[2]['zip']` has changed to `90002`.
Let's apply the delta:
.. code-block:: bash
deep patch --backup another.yaml patch1.pickle --raise-errors
And looking at the `another.yaml` file, the zip code is indeed updated!
.. code-block:: yaml
- first_name: Joe
last_name: Nobody
zip: 90011
- first_name: Jack
last_name: Doit
zip: 22222
- first_name: Sara
last_name: Stanley
zip: 90002
As you can see the formatting of the yaml file is changed.
This is due to the fact that DeepDiff loads the file into a Python dictionary, modifies it and then writes it back to disk.
During this operation, the file loses its original formatting.
.. note::
The deep patch command only provides a subset of what DeepDiff's :ref:`delta_label`'s Python API provides.
The deep patch command is minimalistic and is designed to have a similar interface to Linux's patch command
rather than DeepDiff's :ref:`delta_label`.
Back to :doc:`/index`
qlustered-deepdiff-41c7265/deepdiff/docstrings/custom.rst 0000664 0000000 0000000 00000035761 15162412645 0023556 0 ustar 00root root 0000000 0000000 :doc:`/index`
Customized Diff
===============
.. _iterable_compare_func_label:
Iterable Compare Func
---------------------
New in DeepDiff 5.5.0
There are times that we want to guide DeepDiff as to what items to compare with other items. In such cases we can pass an `iterable_compare_func`: a function that DeepDiff calls to compare two items. The function takes three parameters (x, y, level) and should return `True` if it is a match, `False` if it is not a match or raise `CannotCompare` if it is unable to compare the two.
For example take the following objects.
We will first diff them as they are, and then define a compare_func that takes 3 parameters: x, y and level.
>>> from deepdiff import DeepDiff
>>> from deepdiff.helper import CannotCompare
>>>
>>> t1 = [
... {
... 'id': 1,
... 'value': [1]
... },
... {
... 'id': 2,
... 'value': [7, 8, 1]
... },
... {
... 'id': 3,
... 'value': [7, 8],
... },
... ]
>>>
>>> t2 = [
... {
... 'id': 2,
... 'value': [7, 8]
... },
... {
... 'id': 3,
... 'value': [7, 8, 1],
... },
... {
... 'id': 1,
... 'value': [1]
... },
... ]
>>>
>>> DeepDiff(t1, t2)
{'values_changed': {"root[0]['id']": {'new_value': 2, 'old_value': 1}, "root[0]['value'][0]": {'new_value': 7, 'old_value': 1}, "root[1]['id']": {'new_value': 3, 'old_value': 2}, "root[2]['id']": {'new_value': 1, 'old_value': 3}, "root[2]['value'][0]": {'new_value': 1, 'old_value': 7}}, 'iterable_item_added': {"root[0]['value'][1]": 8}, 'iterable_item_removed': {"root[2]['value'][1]": 8}}
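In the diff above, items are compared with each other based on their position. With the compare_func below, items with the same ids are compared with each other and, as you can see, the results are different.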
>>> def compare_func(x, y, level=None):
... try:
... return x['id'] == y['id']
... except Exception:
... raise CannotCompare() from None
...
>>> DeepDiff(t1, t2, iterable_compare_func=compare_func)
{'iterable_item_added': {"root[2]['value'][2]": 1}, 'iterable_item_removed': {"root[1]['value'][2]": 1}}
If we set the verbose_level=2, we can see more details.
>>> DeepDiff(t1, t2, iterable_compare_func=compare_func, verbose_level=2)
{'iterable_item_added': {"root[2]['value'][2]": 1}, 'iterable_item_removed': {"root[1]['value'][2]": 1}, 'iterable_item_moved': {'root[0]': {'new_path': 'root[2]', 'value': {'id': 1, 'value': [1]}}, 'root[1]': {'new_path': 'root[0]', 'value': {'id': 2, 'value': [7, 8]}}, 'root[2]': {'new_path': 'root[1]', 'value': {'id': 3, 'value': [7, 8, 1]}}}}
We can also use the level parameter. Levels are explained in the :ref:`tree_view_label`.
For example you could use the level object to further determine whether the 2 objects should be matched or not.
>>> t1 = {
... 'path1': [],
... 'path2': [
... {
... 'id': 1,
... 'value': [1]
... },
... {
... 'id': 2,
... 'value': [7, 8, 1]
... },
... ]
... }
>>>
>>> t2 = {
... 'path1': [{'pizza'}],
... 'path2': [
... {
... 'id': 2,
... 'value': [7, 8, 1]
... },
... {
... 'id': 1,
... 'value': [1, 2]
... },
... ]
... }
>>>
>>>
>>> def compare_func2(x, y, level):
... if (not isinstance(x, dict) or not isinstance(y, dict)):
... raise CannotCompare
... if(level.path() == "root['path2']"):
... if (x["id"] == y["id"]):
... return True
... return False
...
>>>
>>> DeepDiff(t1, t2, iterable_compare_func=compare_func2)
{'iterable_item_added': {"root['path1'][0]": {'pizza'}, "root['path2'][0]['value'][1]": 2}}
.. note::
The level parameter of the iterable_compare_func is only used when ignore_order=False which is the default value for ignore_order.
.. _custom_operators_label:
Custom Operators
----------------
Whether two objects are different or not largely depends on the context. For example, apples and bananas are the same
if you are considering whether they are fruits or not.
In that case, you can pass a *custom_operators* for the job.
Custom operators give you a lot of power. In the following examples, we explore various use cases such as:
- Making DeepDiff report the L2 Distance of items
- Only include specific paths in diffing
- Making DeepDiff stop diffing once we find the first diff.
You can use one of the predefined custom operators that come with DeepDiff. Or you can define one yourself.
Built-In Custom Operators
-------------------------
.. _prefix_or_suffix_operator_label:
PrefixOrSuffixOperator
......................
This operator will skip strings that are suffix or prefix of each other.
For example when this operator is used, the two strings of "joe" and "joe's car" will not be reported as different.
>>> from deepdiff import DeepDiff
>>> from deepdiff.operator import PrefixOrSuffixOperator
>>> t1 = {
... "key1": ["foo", "bar's food", "jack", "joe"]
... }
>>> t2 = {
... "key1": ["foo", "bar", "jill", "joe'car"]
... }
>>>
>>> DeepDiff(t1, t2)
{'values_changed': {"root['key1'][1]": {'new_value': 'bar', 'old_value': "bar's food"}, "root['key1'][2]": {'new_value': 'jill', 'old_value': 'jack'}, "root['key1'][3]": {'new_value': "joe'car", 'old_value': 'joe'}}}
>>> DeepDiff(t1, t2, custom_operators=[
... PrefixOrSuffixOperator()
... ])
>>>
{'values_changed': {"root['key1'][2]": {'new_value': 'jill', 'old_value': 'jack'}}}
Define A Custom Operator
------------------------
To define a custom operator, you just need to inherit *BaseOperator* or *BaseOperatorPlus*.
- *BaseOperatorPlus* is our new base operator that can be subclassed and provides the structure to build any custom operator.
- *BaseOperator* is our older base class for creating custom operators. It was designed mainly for simple string based regex comparison.
Base Operator Plus
..................
*BaseOperatorPlus* is our new base operator that can be subclassed and provides the structure to build any custom operator.
.. code-block:: python
class BaseOperatorPlus(metaclass=ABCMeta):
@abstractmethod
def match(self, level) -> bool:
"""
Given a level which includes t1 and t2 in the tree view, is this operator a good match to compare t1 and t2?
If yes, we will run the give_up_diffing to compare t1 and t2 for this level.
"""
pass
@abstractmethod
def give_up_diffing(self, level, diff_instance: "DeepDiff") -> bool:
"""
Given a level which includes t1 and t2 in the tree view, and the "distance" between l1 and l2.
do we consider t1 and t2 to be equal or not. The distance is a number between zero to one and is calculated by DeepDiff to measure how similar objects are.
"""
@abstractmethod
def normalize_value_for_hashing(self, parent: Any, obj: Any) -> Any:
"""
You can use this function to normalize values for ignore_order=True
For example, you may want to turn all the words to be lowercase. Then you return obj.lower()
"""
pass
**Example 1: We don't care about the exact GUID values. As long as pairs of strings match GUID regex, we want them to be considered as equals**
>>> import re
... from typing import Any
... from deepdiff import DeepDiff
... from deepdiff.operator import BaseOperatorPlus
...
...
... d1 = {
... "Name": "SUB_OBJECT_FILES",
... "Values": {
... "Value": [
... "{f254498b-b752-4f35-bef5-6f1844b61eb7}",
... "{7fb2a550-1849-45c0-b273-9aa5e4eb9f2b}",
... "{a9cbecc0-21dc-49ce-8b2c-d36352dae139}"
... ]
... }
... }
...
... d2 = {
... "Name": "SUB_OBJECT_FILES",
... "Values": {
... "Value": [
... "{e5d18917-1a2c-4abe-b601-8ec002629953}",
... "{ea71ba1f-1339-4fae-bc28-a9ce9b8a8c67}",
... "{66bb6192-9cd2-4074-8be1-f2ac52877c70}",
... ]
... }
... }
...
...
... class RemoveGUIDsOperator(BaseOperatorPlus):
... _pattern = r"[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}"
... _substitute = "guid"
...
... def match(self, level) -> bool:
... return isinstance(level.t1, str) and isinstance(level.t2, str)
...
... @classmethod
... def _remove_pattern(cls, t: str):
... return re.sub(cls._pattern, cls._substitute, t)
...
... def give_up_diffing(self, level, diff_instance):
... t1 = self._remove_pattern(level.t1)
... t2 = self._remove_pattern(level.t2)
... return t1 == t2
...
... def normalize_value_for_hashing(self, parent: Any, obj: Any) -> Any:
... """
... Used for ignore_order=True
... """
... if isinstance(obj, str):
... return self._remove_pattern(obj)
... return obj
...
...
... operator = RemoveGUIDsOperator()
...
>>> diff1 = DeepDiff(d1, d2, custom_operators=[operator], log_stacktrace=True)
... diff1
{}
>>> diff2 = DeepDiff(d1, d2, ignore_order=True, custom_operators=[operator], log_stacktrace=True)
... diff2
{}
Base Operator
.............
*BaseOperator* is our older base class for creating custom operators. It was designed mainly for simple string based regex comparison.
.. code-block:: python
class BaseOperator:
def __init__(self, regex_paths:Optional[List[str]]=None, types:Optional[List[type]]=None):
if regex_paths:
self.regex_paths = convert_item_or_items_into_compiled_regexes_else_none(regex_paths)
else:
self.regex_paths = None
self.types = types
def match(self, level) -> bool:
if self.regex_paths:
for pattern in self.regex_paths:
matched = re.search(pattern, level.path()) is not None
if matched:
return True
if self.types:
for type_ in self.types:
if isinstance(level.t1, type_) and isinstance(level.t2, type_):
return True
return False
def give_up_diffing(self, level, diff_instance) -> bool:
raise NotImplementedError('Please implement the diff function.')
**Example 2: An operator that uses the L2 distance as the diff criterion and reports the distance**
>>> import math
>>>
>>> from typing import List
>>> from deepdiff import DeepDiff
>>> from deepdiff.operator import BaseOperator
>>>
>>>
>>> class L2DistanceDifferWithPreventDefault(BaseOperator):
... def __init__(self, regex_paths: List[str], distance_threshold: float):
... super().__init__(regex_paths)
... self.distance_threshold = distance_threshold
... def _l2_distance(self, c1, c2):
... return math.sqrt(
... (c1["x"] - c2["x"]) ** 2 + (c1["y"] - c2["y"]) ** 2
... )
... def give_up_diffing(self, level, diff_instance):
... l2_distance = self._l2_distance(level.t1, level.t2)
... if l2_distance > self.distance_threshold:
... diff_instance.custom_report_result('distance_too_far', level, {
... "l2_distance": l2_distance
... })
... return True
...
>>>
>>> t1 = {
... "coordinates": [
... {"x": 5, "y": 5},
... {"x": 8, "y": 8}
... ]
... }
>>>
>>> t2 = {
... "coordinates": [
... {"x": 6, "y": 6},
... {"x": 88, "y": 88}
... ]
... }
>>> DeepDiff(t1, t2, custom_operators=[L2DistanceDifferWithPreventDefault(
... ["^root\\['coordinates'\\]\\[\\d+\\]$"],
... 1
... )])
{'distance_too_far': {"root['coordinates'][0]": {'l2_distance': 1.4142135623730951}, "root['coordinates'][1]": {'l2_distance': 113.13708498984761}}}
**Example 3: If the objects are subclasses of a certain type, only compare them if their list attributes are not equal sets**
>>> class CustomClass:
... def __init__(self, d: dict, l: list):
... self.dict = d
... self.dict['list'] = l
...
>>>
>>> custom1 = CustomClass(d=dict(a=1, b=2), l=[1, 2, 3])
>>> custom2 = CustomClass(d=dict(c=3, d=4), l=[1, 2, 3, 2])
>>> custom3 = CustomClass(d=dict(a=1, b=2), l=[1, 2, 3, 4])
>>>
>>>
>>> class ListMatchOperator(BaseOperator):
... def give_up_diffing(self, level, diff_instance):
... if set(level.t1.dict['list']) == set(level.t2.dict['list']):
... return True
...
>>>
>>> DeepDiff(custom1, custom2, custom_operators=[
... ListMatchOperator(types=[CustomClass])
... ])
{}
>>>
>>>
>>> DeepDiff(custom2, custom3, custom_operators=[
... ListMatchOperator(types=[CustomClass])
... ])
{'dictionary_item_added': [root.dict['a'], root.dict['b']], 'dictionary_item_removed': [root.dict['c'], root.dict['d']], 'values_changed': {"root.dict['list'][3]": {'new_value': 4, 'old_value': 2}}}
>>>
**Example 4: Only diff certain paths**
>>> from deepdiff import DeepDiff
>>> class MyOperator:
... def __init__(self, include_paths):
... self.include_paths = include_paths
... def match(self, level) -> bool:
... return True
... def give_up_diffing(self, level, diff_instance) -> bool:
... return level.path() not in self.include_paths
...
>>>
>>> t1 = {'a': [10, 11], 'b': [20, 21], 'c': [30, 31]}
>>> t2 = {'a': [10, 22], 'b': [20, 33], 'c': [30, 44]}
>>>
>>> DeepDiff(t1, t2, custom_operators=[
... MyOperator(include_paths="root['a'][1]")
... ])
{'values_changed': {"root['a'][1]": {'new_value': 22, 'old_value': 11}}}
**Example 5: Give up further diffing once the first diff is found**
Sometimes all you care about is that there is a difference between 2 objects and not all the details of what exactly is different.
In that case you may want to stop diffing as soon as the first diff is found.
>>> from deepdiff import DeepDiff
>>> class MyOperator:
... def match(self, level) -> bool:
... return True
... def give_up_diffing(self, level, diff_instance) -> bool:
... return any(diff_instance.tree.values())
...
>>> t1 = [[1, 2], [3, 4], [5, 6]]
>>> t2 = [[1, 3], [3, 5], [5, 7]]
>>>
>>> DeepDiff(t1, t2, custom_operators=[
... MyOperator()
... ])
{'values_changed': {'root[0][1]': {'new_value': 3, 'old_value': 2}}}
Back to :doc:`/index`
qlustered-deepdiff-41c7265/deepdiff/docstrings/deep_distance.rst 0000664 0000000 0000000 00000014453 15162412645 0025026 0 ustar 00root root 0000000 0000000 :doc:`/index`
.. _deep_distance_label:
Deep Distance
=============
Deep Distance is the distance between 2 objects. It is a floating point number between 0 and 1. Deep Distance in concept is inspired by `Levenshtein Edit Distance <https://en.wikipedia.org/wiki/Levenshtein_distance>`_.
At its core, the Deep Distance is the number of operations needed to convert one object to the other divided by the sum of the sizes of the 2 objects capped at 1. Note that unlike Levenshtein Distance, the Deep Distance is based on the number of operations and NOT the “minimum” number of operations to convert one object to the other. The number is highly dependent on the granularity of the diff results. And the granularity is controlled by the parameters passed to DeepDiff.
.. _get_deep_distance_label:
Get Deep Distance
-----------------
get_deep_distance: Boolean, default = False
get_deep_distance will get you the deep distance between objects. The distance is a number between 0 and 1 where zero means there is no diff between the 2 objects and 1 means they are very different. Note that this number should only be used to compare the similarity of 2 objects and nothing more. The algorithm for calculating this number may or may not change in the future releases of DeepDiff.
The value of Deep Distance will show up in the result diff object's deep_distance key.
>>> from deepdiff import DeepDiff
>>> DeepDiff(10.0, 10.1, get_deep_distance=True)
{'values_changed': {'root': {'new_value': 10.1, 'old_value': 10.0}}, 'deep_distance': 0.0014925373134328302}
>>> DeepDiff(10.0, 100.1, get_deep_distance=True)
{'values_changed': {'root': {'new_value': 100.1, 'old_value': 10.0}}, 'deep_distance': 0.24550408719346048}
>>> DeepDiff(10.0, 1000.1, get_deep_distance=True)
{'values_changed': {'root': {'new_value': 1000.1, 'old_value': 10.0}}, 'deep_distance': 0.29405999405999406}
>>> DeepDiff([1], [1], get_deep_distance=True)
{}
>>> DeepDiff([1], [1, 2], get_deep_distance=True)
{'iterable_item_added': {'root[1]': 2}, 'deep_distance': 0.2}
>>> DeepDiff([1], [1, 2, 3], get_deep_distance=True)
{'iterable_item_added': {'root[1]': 2, 'root[2]': 3}, 'deep_distance': 0.3333333333333333}
>>> DeepDiff([[2, 1]], [[1, 2, 3]], ignore_order=True, get_deep_distance=True)
{'iterable_item_added': {'root[0][2]': 3}, 'deep_distance': 0.1111111111111111}
.. _distance_and_diff_granularity_label:
Distance And Diff Granularity
-----------------------------
.. note::
The Deep Distance of objects is highly dependent on the diff object that is produced. A diff object that is more granular will give a more accurate Deep Distance value too.
Let's use the following 2 deeply nested objects as an example. If you ignore the order of items, they are very similar and only differ in a few elements.
We will run 2 diffs and ask for the deep distance. The only difference between the below 2 diffs is that in the first one the :ref:`cutoff_intersection_for_pairs_label` is not passed so the default value of 0.7 is used while in the other one cutoff_intersection_for_pairs=1 is used which forces extra pass calculations.
>>> from pprint import pprint
>>> t1 = [
... {
... "key3": [[[[[[[[[[1, 2, 4, 5]]], [[[8, 7, 3, 5]]]]]]]]]],
... "key4": [7, 8]
... },
... {
... "key5": "val5",
... "key6": "val6"
... }
... ]
>>>
>>> t2 = [
... {
... "key5": "CHANGE",
... "key6": "val6"
... },
... {
... "key3": [[[[[[[[[[1, 3, 5, 4]]], [[[8, 8, 1, 5]]]]]]]]]],
... "key4": [7, 8]
... }
... ]
We don't pass cutoff_intersection_for_pairs in the first diff.
>>> diff1=DeepDiff(t1, t2, ignore_order=True, cache_size=5000, get_deep_distance=True)
>>> pprint(diff1)
{'deep_distance': 0.36363636363636365,
'values_changed': {'root[0]': {'new_value': {'key5': 'CHANGE', 'key6': 'val6'},
'old_value': {'key3': [[[[[[[[[[1, 2, 4, 5]]],
[[[8,
7,
3,
5]]]]]]]]]],
'key4': [7, 8]}},
'root[1]': {'new_value': {'key3': [[[[[[[[[[1, 3, 5, 4]]],
[[[8,
8,
1,
5]]]]]]]]]],
'key4': [7, 8]},
'old_value': {'key5': 'val5', 'key6': 'val6'}}}}
Note that the stats show that only 5 sets of objects were compared with each other according to the DIFF COUNT:
>>> diff1.get_stats()
{'PASSES COUNT': 0, 'DIFF COUNT': 5, 'DISTANCE CACHE HIT COUNT': 0, 'MAX PASS LIMIT REACHED': False, 'MAX DIFF LIMIT REACHED': False}
Let's pass cutoff_intersection_for_pairs=1 to enforce pass calculations. As you can see the results are way more granular and the deep distance value is way more accurate now.
>>> diff2=DeepDiff(t1, t2, ignore_order=True, cache_size=5000, cutoff_intersection_for_pairs=1, get_deep_distance=True)
>>> from pprint import pprint
>>> pprint(diff2)
{'deep_distance': 0.06060606060606061,
'iterable_item_removed': {"root[0]['key3'][0][0][0][0][0][0][1][0][0][1]": 7},
'values_changed': {"root[0]['key3'][0][0][0][0][0][0][0][0][0][1]": {'new_value': 3,
'old_value': 2},
"root[0]['key3'][0][0][0][0][0][0][1][0][0][2]": {'new_value': 1,
'old_value': 3},
"root[1]['key5']": {'new_value': 'CHANGE',
'old_value': 'val5'}}}
As you can see, far more calculations have now happened behind the scenes. Instead of only 5 sets of items being compared with each other, we have 306 sets of items that are compared with each other in 110 passes.
>>> diff2.get_stats()
{'PASSES COUNT': 110, 'DIFF COUNT': 306, 'DISTANCE CACHE HIT COUNT': 0, 'MAX PASS LIMIT REACHED': False, 'MAX DIFF LIMIT REACHED': False}
Back to :doc:`/index`
qlustered-deepdiff-41c7265/deepdiff/docstrings/deephash.rst 0000664 0000000 0000000 00000000240 15162412645 0024005 0 ustar 00root root 0000000 0000000 :doc:`/index`
DeepHash
========
.. toctree::
:maxdepth: 3
.. automodule:: deepdiff.deephash
.. autoclass:: DeepHash
:members:
Back to :doc:`/index`
qlustered-deepdiff-41c7265/deepdiff/docstrings/deephash_doc.rst 0000664 0000000 0000000 00000043021 15162412645 0024636 0 ustar 00root root 0000000 0000000 :orphan:
**DeepHash**
DeepHash calculates the hash of objects based on their contents in a deterministic way.
This way 2 objects with the same content should have the same hash.
The main usage of DeepHash is to calculate the hash of otherwise unhashable objects.
For example you can use DeepHash to calculate the hash of a set or a dictionary!
At the core of it, DeepHash is a deterministic serialization of your object into a string so it
can be passed to a hash function. By default it uses SHA256. You have the option to pass any other hashing function to be used instead.
**Import**
>>> from deepdiff import DeepHash
**Parameters**
obj : any object, The object to be hashed based on its content.
apply_hash: Boolean, default = True
DeepHash at its core is doing deterministic serialization of objects into strings.
Then it hashes the string.
The only time you want the apply_hash to be False is if you want to know what
the string representation of your object is BEFORE it gets hashed.
exclude_types: list, default = None
List of object types to exclude from hashing.
exclude_paths: list, default = None
List of paths to exclude from the report. If only one item, you can pass it as a string instead of a list containing only one path.
include_paths: list, default = None
List of the only paths to include in the report. If only one item, you can pass it as a string.
exclude_regex_paths: list, default = None
List of string regex paths or compiled regex paths objects to exclude from the report. If only one item, you can pass it as a string instead of a list containing only one regex path.
exclude_obj_callback
function, default = None
A function that takes the object and its path and returns a Boolean. If True is returned, the object is excluded from the results, otherwise it is included.
This is to give the user a higher level of control than one can achieve via exclude_paths, exclude_regex_paths or other means.
encodings: List, default = None
Character encodings to iterate through when we convert bytes into strings. You may want to pass an explicit list of encodings in your objects if you start getting UnicodeDecodeError from DeepHash. Also check out ignore_encoding_errors if you can get away with ignoring these errors and don't want to bother with an explicit list of encodings but it will come at the price of slightly less accuracy of the final results. Example: encodings=["utf-8", "latin-1"]
hashes: dictionary, default = empty dictionary
A dictionary of {object or object id: object hash} to start with.
Any object that is encountered and it is already in the hashes dictionary or its id is in the hashes dictionary,
will re-use the hash that is provided by this dictionary instead of re-calculating
its hash. This is typically used when you have a series of objects to be hashed and there might be repeats of the same object.
hasher: function. default = DeepHash.sha256hex
hasher is the hashing function. The default is DeepHash.sha256hex.
But you can pass another hash function to it if you want.
For example a cryptographic hash function or Python's builtin hash function.
All it needs is a function that takes the input in string format and returns the hash.
You can use it by passing: hasher=hash for Python's builtin hash.
The following alternative is already provided:
- hasher=DeepHash.sha1hex
Note that prior to DeepDiff 5.2, Murmur3 was the default hash function.
But Murmur3 is removed from DeepDiff dependencies since then.
ignore_repetition: Boolean, default = True
If repetitions in an iterable should cause the hash of iterable to be different.
Note that the deepdiff diffing functionality keeps this at the default value at all times.
But if you are using DeepHash directly, you can set this parameter.
ignore_type_in_groups
Ignore type changes between members of groups of types. For example if you want to ignore type changes between float and decimals etc. Note that this is a more granular feature. Most of the times the shortcuts provided to you are enough.
The shortcuts are ignore_string_type_changes which by default is False and ignore_numeric_type_changes which is by default False. You can read more about those shortcuts in this page. ignore_type_in_groups gives you more control compared to the shortcuts.
For example let's say you specifically want type changes between the str and bytes datatypes to be ignored. Then you have a couple of options:
1. Set ignore_string_type_changes=True.
2. Set ignore_type_in_groups=[(str, bytes)]. Here you are saying if we detect one type to be str and the other one bytes, do not report them as type change. It is exactly the same as passing ignore_type_in_groups=[DeepDiff.strings] or ignore_type_in_groups=DeepDiff.strings .
Now what if you also want typeA and typeB to be ignored when comparing against each other?
1. ignore_type_in_groups=[DeepDiff.strings, (typeA, typeB)]
2. or ignore_type_in_groups=[(str, bytes), (typeA, typeB)]
ignore_string_type_changes: Boolean, default = False
string type conversions should not affect the hash output when this is set to True.
For example "Hello" and b"Hello" should produce the same hash.
By setting it to True, both the string and bytes of hello return the same hash.
ignore_numeric_type_changes: Boolean, default = False
numeric type conversions should not affect the hash output when this is set to True.
For example 10, 10.0 and Decimal(10) should produce the same hash.
When ignore_numeric_type_changes is set to True, all numbers are converted
to strings with the precision of significant_digits parameter and number_format_notation notation.
If no significant_digits is passed by the user, a default value of 12 is used.
ignore_type_subclasses
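By default, the subclasses of a type listed in ignore_type_in_groups are treated as part of that type group. Use ignore_type_subclasses=True so that only exact type matches count and the subclasses are no longer treated as members of the group.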
ignore_string_case
Whether to be case-sensitive or not when comparing strings. By setting ignore_string_case=True, strings will be compared case-insensitively.
ignore_private_variables: Boolean, default = True
Whether to exclude the private variables in the calculations or not. It only affects variables that start with double underscores (__).
ignore_encoding_errors: Boolean, default = False
If you want to get away with UnicodeDecodeError without passing explicit character encodings, set this option to True. If you want to make sure the encoding is done properly, keep this as False and instead pass an explicit list of character encodings to be considered via the encodings parameter.
ignore_iterable_order: Boolean, default = True
If order of items in an iterable should not cause the hash of the iterable to be different.
number_format_notation : string, default="f"
number_format_notation is what defines the meaning of significant digits. The default value of "f" means the digits AFTER the decimal point. "f" stands for fixed point. The other option is "e" which stands for exponent notation or scientific notation.
significant_digits : int >= 0, default=None
By default the significant_digits compares only that many digits AFTER the decimal point. However you can override that by setting the number_format_notation="e" which will make it mean the digits in scientific notation.
Important: This will affect ANY number comparison when it is set.
Note: If ignore_numeric_type_changes is set to True and you have left significant_digits to the default of None, it gets automatically set to 12. The reason is that normally when numbers from 2 different types are compared, instead of comparing the values, we only report the type change. However when ignore_numeric_type_changes=True, in order to compare numbers from different types to each other, we need to convert them all into strings. The significant_digits will be used to make sure we accurately convert all the numbers into strings in order to report the changes between them.
Internally it uses "{:.Xf}".format(Your Number) to compare numbers where X=significant_digits when the number_format_notation is left as the default of "f" meaning fixed point.
Note that "{:.3f}".format(1.1135) = 1.113, but "{:.3f}".format(1.11351) = 1.114
For Decimals, Python's format rounds 2.5 to 2 and 3.5 to 4 (to the closest even number)
When you set the number_format_notation="e", we use "{:.Xe}".format(Your Number) where X=significant_digits.
truncate_datetime: string, default = None
Can be one of 'second', 'minute', 'hour' or 'day'. Datetime objects are truncated to this precision before they are hashed.
**Returns**
A dictionary of {item: item hash}.
If your object is nested, it will build hashes of all the objects it contains too.
.. note::
DeepHash output is not like conventional hash functions. It is a dictionary of object IDs to their hashes. This happens because DeepHash calculates the hash of the object and any other objects found within the object in a recursive manner. If you only need the hash of the object you are passing, all you need to do is:
>>> from deepdiff import DeepHash
>>> obj = {1: 2, 'a': 'b'}
>>> DeepHash(obj)[obj] # doctest: +SKIP
**Examples**
Let's say you have a dictionary object.
>>> from deepdiff import DeepHash
>>> obj = {1: 2, 'a': 'b'}
If you try to hash it:
>>> hash(obj)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
TypeError: unhashable type: 'dict'
But with DeepHash:
>>> from deepdiff import DeepHash
>>> obj = {1: 2, 'a': 'b'}
>>> DeepHash(obj) # doctest: +SKIP
So what is exactly the hash of obj in this case?
DeepHash is calculating the hash of the obj and any other object that obj contains.
The output of DeepHash is a dictionary of object IDs to their hashes.
In order to get the hash of obj itself, you need to use the object (or the id of object) to get its hash:
>>> hashes = DeepHash(obj)
>>> hashes[obj]
'bf5478de322aa033da36bf3bcf9f0599e13a520773f50c6eb9f2487377a7929b'
Which you can write as:
>>> hashes = DeepHash(obj)[obj]
At first it might seem weird why DeepHash(obj)[obj] but remember that DeepHash(obj) is a dictionary of hashes of all other objects that obj contains too.
If you prefer to use another hashing algorithm, you can pass it using the hasher parameter.
If you do a deep copy of the obj, it should still give you the same hash:
>>> from copy import deepcopy
>>> obj2 = deepcopy(obj)
>>> DeepHash(obj2)[obj2]
'bf5478de322aa033da36bf3bcf9f0599e13a520773f50c6eb9f2487377a7929b'
Note that by default DeepHash will include string type differences. So if your strings were bytes:
>>> obj3 = {1: 2, b'a': b'b'}
>>> DeepHash(obj3)[obj3]
'71db3231177d49f78b52a356ca206e6179417b681604d00ed703a077049e3300'
But if you want the same hash if string types are different, set ignore_string_type_changes to True:
>>> DeepHash(obj3, ignore_string_type_changes=True)[obj3]
'e60c2befb84be625037c75e1e26d0bfc85a0ffc1f3cde9500f68f6eac55e5ad6'
ignore_numeric_type_changes is by default False too.
>>> from decimal import Decimal
>>> obj1 = {4:10}
>>> obj2 = {4.0: Decimal(10.0)}
>>> DeepHash(obj1)[4] == DeepHash(obj2)[4.0]
False
But by setting it to True, we can get the same hash.
>>> DeepHash(obj1, ignore_numeric_type_changes=True)[4] == DeepHash(obj2, ignore_numeric_type_changes=True)[4.0]
True
number_format_notation: String, default = "f"
number_format_notation is what defines the meaning of significant digits. The default value of "f" means the digits AFTER the decimal point. "f" stands for fixed point. The other option is "e" which stands for exponent notation or scientific notation.
ignore_string_type_changes: Boolean, default = False
By setting it to True, both the string and bytes of hello return the same hash.
>>> DeepHash(b'hello', ignore_string_type_changes=True)[b'hello']
'2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824'
>>> DeepHash('hello', ignore_string_type_changes=True)['hello']
'2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824'
ignore_numeric_type_changes: Boolean, default = False
For example if significant_digits=5, 1.1, Decimal(1.1) are both converted to 1.10000
That way they both produce the same hash.
>>> t1 = {1: 1, 2: 2.22}
>>> DeepHash(t1)[1]
'c1800a30c736483f13615542e7096f7973631fef8ca935ee1ed9f35fb06fd44e'
>>> DeepHash(t1, ignore_numeric_type_changes=True)[1] == DeepHash(t1, ignore_numeric_type_changes=True)[1.0]
True
You can pass a list of tuples or list of lists if you have various type groups. When t1 and t2 both fall under one of these type groups, the type change will be ignored. DeepDiff already comes with 2 groups: DeepDiff.strings and DeepDiff.numbers . If you want to pass both:
>>> from deepdiff import DeepDiff
>>> ignore_type_in_groups = [DeepDiff.strings, DeepDiff.numbers]
ignore_type_in_groups example with custom objects:
>>> class Burrito:
... bread = 'flour'
... def __init__(self):
... self.spicy = True
...
>>>
>>> class Taco:
... bread = 'flour'
... def __init__(self):
... self.spicy = True
...
>>>
>>> burrito = Burrito()
>>> taco = Taco()
>>>
>>> burritos = [burrito]
>>> tacos = [taco]
>>>
>>> d1 = DeepHash(burritos, ignore_type_in_groups=[(Taco, Burrito)])
>>> d2 = DeepHash(tacos, ignore_type_in_groups=[(Taco, Burrito)])
>>> d1[burrito] == d2[taco]
True
ignore_type_subclasses
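By default, the subclasses of a type listed in ignore_type_in_groups are treated as part of that type group. Use ignore_type_subclasses=True so that only exact type matches count and the subclasses are no longer treated as members of the group.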
>>> from deepdiff import DeepHash
>>>
>>> class ClassB:
... def __init__(self, x):
... self.x = x
... def __repr__(self):
... return "obj b"
...
>>>
>>> class ClassC(ClassB):
... def __repr__(self):
... return "obj c"
...
>>> obj_b = ClassB(1)
>>> obj_c = ClassC(1)
>>>
>>> # By default, subclasses are considered part of the type group.
... # ignore_type_in_groups=[(ClassB, )] matches ClassC too since it's a subclass.
... hashes_b = DeepHash(obj_b, ignore_type_in_groups=[(ClassB, )])
>>> hashes_c = DeepHash(obj_c, ignore_type_in_groups=[(ClassB, )])
>>> hashes_b[obj_b] == hashes_c[obj_c]
True
>>>
>>> # With ignore_type_subclasses=True, only exact type matches count.
... # ClassC no longer matches (ClassB, ) group, so hashes differ.
... hashes_b = DeepHash(obj_b, ignore_type_in_groups=[(ClassB, )], ignore_type_subclasses=True)
>>> hashes_c = DeepHash(obj_c, ignore_type_in_groups=[(ClassB, )], ignore_type_subclasses=True)
>>> hashes_b[obj_b] != hashes_c[obj_c]
True
ignore_string_case
Whether to be case-sensitive or not when comparing strings. By setting ignore_string_case=True, strings will be compared case-insensitively.
>>> from deepdiff import DeepHash
>>> DeepHash('hello')['hello'] == DeepHash('heLLO')['heLLO']
False
>>> DeepHash('hello', ignore_string_case=True)['hello'] == DeepHash('heLLO', ignore_string_case=True)['heLLO']
True
exclude_obj_callback
function, default = None
A function that takes the object and its path and returns a Boolean. If True is returned, the object is excluded from the results, otherwise it is included.
This is to give the user a higher level of control than one can achieve via exclude_paths, exclude_regex_paths or other means.
>>> def exclude_obj_callback(obj, path):
... return True if isinstance(obj, str) and obj in ('x', 'y') else False
...
>>> dic1 = {"x": 1, "y": 2, "z": 3}
>>> t1 = [dic1]
>>> t1_hash = DeepHash(t1, exclude_obj_callback=exclude_obj_callback)
>>>
>>> dic2 = {"z": 3}
>>> t2 = [dic2]
>>> t2_hash = DeepHash(t2, exclude_obj_callback=exclude_obj_callback)
>>>
>>> t1_hash[t1] == t2_hash[t2]
True
number_format_notation : string, default="f"
When numbers are converted to the string, you have the choices between "f" as fixed point and "e" as scientific notation:
>>> t1=10002
>>> t2=10004
>>> t1_hash = DeepHash(t1, significant_digits=3, number_format_notation="f")
>>> t2_hash = DeepHash(t2, significant_digits=3, number_format_notation="f")
>>>
>>> t1_hash[t1] == t2_hash[t2]
False
>>>
>>>
>>> # Now we use the scientific notation
... t1_hash = DeepHash(t1, significant_digits=3, number_format_notation="e")
>>> t2_hash = DeepHash(t2, significant_digits=3, number_format_notation="e")
>>>
>>> t1_hash[t1] == t2_hash[t2]
True
Defining your own number_to_string_func
Let's say you want the hash of numbers below 100 to be the same for some reason.
>>> from deepdiff import DeepHash
>>> from deepdiff.helper import number_to_string
>>> def custom_number_to_string(number, *args, **kwargs):
... number = 100 if number < 100 else number
... return number_to_string(number, *args, **kwargs)
...
>>> t1 = [10, 12, 100000]
>>> t2 = [50, 63, 100021]
>>> t1_hash = DeepHash(t1, significant_digits=3, number_format_notation="e", number_to_string_func=custom_number_to_string)
>>> t2_hash = DeepHash(t2, significant_digits=3, number_format_notation="e", number_to_string_func=custom_number_to_string)
>>> t1_hash[t1] == t2_hash[t2]
True
So both lists produced the same hash thanks to the low significant digits for 100000 vs 100021 and also the custom_number_to_string that converted all numbers below 100 to be 100!
qlustered-deepdiff-41c7265/deepdiff/docstrings/delta.rst 0000664 0000000 0000000 00000052147 15162412645 0023332 0 ustar 00root root 0000000 0000000 .. _delta_label:
Delta
=====
DeepDiff Delta is a directed delta that when applied to t1 can yield t2 where delta is the difference between t1 and t2.
Delta objects are like git commits but for structured data.
You can convert the diff results into Delta objects, store the deltas, and later apply to other objects.
.. note::
If you plan to generate Delta objects from the DeepDiff result, and ignore_order=True, you need to also set the report_repetition=True.
**Parameters**
diff : Delta dictionary, Delta dump payload or a DeepDiff object, default=None.
:ref:`delta_diff_label` is the content to be loaded.
delta_path : String, default=None.
:ref:`delta_path_label` is the local path to the delta dump file to be loaded
delta_file : File Object, default=None.
:ref:`delta_file_label` is the file object containing the delta data.
delta_diff : Delta diff, default=None.
This is a slightly different diff than the output of DeepDiff. When Delta object is initiated from the DeepDiff output, it transforms the diff into a slightly different structure that is more suitable for delta. You can find that object via delta.diff.
It is the same object that is serialized when you create a delta dump. If you already have the delta_diff object, you can pass it to Delta via the delta_diff parameter.
flat_dict_list : List of flat dictionaries, default=None,
:ref:`flat_dict_list_label` can be used to load the delta object from a list of flat dictionaries.
.. note::
You need to pass only one of the diff, delta_path, or delta_file parameters.
deserializer : Deserializer function, default=pickle_load
:ref:`delta_deserializer_label` is the function to deserialize the delta content. The default is the pickle_load function that comes with DeepDiff.
serializer : Serializer function, default=pickle_dump
:ref:`delta_serializer_label` is the function to serialize the delta content into a format that can be stored. The default is the pickle_dump function that comes with DeepDiff.
log_errors : Boolean, default=True
Whether to log the errors or not when applying the delta object.
raise_errors : Boolean, default=False
:ref:`raise_errors_label`
Whether to raise errors or not when applying a delta object.
mutate : Boolean, default=False.
:ref:`delta_mutate_label` defines whether to mutate the original object when adding the delta to it or not.
Note that this parameter is not always successful in mutating. For example if your original object
is an immutable type such as a frozenset or a tuple, mutation will not succeed.
Hence it is recommended to keep this parameter as the default value of False unless you are sure
that you do not have immutable objects. There is a small overhead of doing deepcopy on the original
object when mutate=False. If performance is a concern and modifying the original object is not a big deal,
set the mutate=True but always reassign the output back to the original object.
safe_to_import : Set, default=None.
:ref:`delta_safe_to_import_label` is a set of modules that needs to be explicitly white listed to be loaded
Example: {'mymodule.MyClass', 'decimal.Decimal'}
Note that this set will be added to the basic set of modules that are already white listed.
The set of what is already white listed can be found in deepdiff.serialization.SAFE_TO_IMPORT
bidirectional : Boolean, default=False
:ref:`delta_verify_symmetry_label` is used to verify that the original value of items are the same as when the delta was created. Note that in order for this option to work, the delta object will need to store more data and thus the size of the object will increase. Let's say that the diff object says root[0] changed value from X to Y. If you create the delta with the default value of bidirectional=False, then what delta will store is root[0] = Y. And if this delta was applied to an object that has any root[0] value, it will still set the root[0] to Y. However if bidirectional=True, then the delta object will store also that the original value of root[0] was X and if you try to apply the delta to an object that has root[0] of any value other than X, it will notify you.
force : Boolean, default=False
:ref:`delta_force_label` is used to force apply a delta to objects that have a different structure than what the delta was originally created from.
always_include_values : Boolean, default=False
:ref:`always_include_values_label` is used to make sure the delta object includes the values that were changed. Sometimes Delta tries to be efficient and does not include the values when it can get away with it. By setting this parameter to True, you ensure that the Delta object will include the values.
.. _delta_fill:
fill : Any, default=No Fill
This is only relevant if `force` is set. This parameter only applies when force is set and trying to fill an existing array. If the index of the array being applied is larger than the length of the array this value will be used to fill empty spaces of the array to extend it in order to add the new value. If this parameter is not set, the items will get dropped and the array not extended. If this parameter is set with a callable function, it will get called each time a fill item is needed. It will be provided with three arguments: first argument is the array being filled, second argument is the value that is being added to the array, the third argument is the path that is being added.
Example function: `def fill(obj, value, path): return "Camry" if "car" in path else None`
**Returns**
A delta object that can be added to t1 to recreate t2.
Delta objects can contain the following vocabulary:
iterable_item_added
iterable_item_moved
iterable_item_removed
set_item_added
set_item_removed
dictionary_item_added
dictionary_item_removed
attribute_added
attribute_removed
type_changes
values_changed
iterable_items_added_at_indexes
iterable_items_removed_at_indexes
.. _delta_diff_label:
Diff to load in Delta
---------------------
diff : Delta dictionary, Delta dump payload or a DeepDiff object, default=None.
diff is the content to be loaded.
>>> from deepdiff import DeepDiff, Delta
>>> from pprint import pprint
>>>
>>> t1 = [1, 2, 3]
>>> t2 = ['a', 2, 3, 4]
>>> diff = DeepDiff(t1, t2)
>>> diff
{'type_changes': {'root[0]': {'old_type': <class 'int'>, 'new_type': <class 'str'>, 'old_value': 1, 'new_value': 'a'}}, 'iterable_item_added': {'root[3]': 4}}
>>> delta = Delta(diff)
>>> delta # doctest: +SKIP
Applying the delta object to t1 will yield t2:
>>> t1 + delta
['a', 2, 3, 4]
>>> t1 + delta == t2
True
If we want to subtract a delta, we need to create a bidirectional delta:
>>> delta = Delta(diff, bidirectional=True)
>>> t2 - delta
[1, 2, 3]
>>> t2 - delta == t1
True
Now let's dump the delta object so we can store it.
>>> dump = delta.dumps()
>>>
>>> dump # doctest: +SKIP
The dumps() function gives us the serialized content of the delta in the form of bytes. We could store it however we want. Or we could use the dump(file_object) to write the dump to the file_object instead. But before we try the dump(file_object) method, let's create a new Delta object and reapply it to t1 and see if we still get t2:
>>> delta2 = Delta(dump)
>>> t1 + delta2 == t2
True
>>>
.. _delta_path_label:
Delta Path parameter
--------------------
Ok now we can try the dump(file_object). It does what you expect:
>>> with open('/tmp/delta1', 'wb') as dump_file:
... delta.dump(dump_file)
...
And we use the delta_path parameter to load the delta
>>> delta3 = Delta(delta_path='/tmp/delta1')
It still gives us the same result when applied.
>>> t1 + delta3 == t2
True
.. _delta_file_label:
Delta File parameter
--------------------
You can also pass a file object containing the delta dump:
>>> with open('/tmp/delta1', 'rb') as dump_file:
... delta4 = Delta(delta_file=dump_file)
...
>>> t1 + delta4 == t2
True
.. _flat_dict_list_label:
Flat Dict List
--------------
You can create a delta object from the list of flat dictionaries that are produced via :ref:`to_flat_dicts_label`. Read more on :ref:`delta_from_flat_dicts_label`.
.. _flat_rows_list_label:
Flat Rows List
--------------
You can create a delta object from the list of flat rows that are produced via :ref:`delta_to_flat_rows_label`. Read more on :ref:`delta_to_flat_rows_label`.
.. _delta_deserializer_label:
Delta Deserializer
------------------
DeepDiff by default uses a restricted Python pickle function to deserialize the Delta dumps. Read more about :ref:`delta_dump_safety_label`.
The user of Delta can decide to switch the serializer and deserializer to their custom ones. The serializer and deserializer parameters can be used exactly for that reason. The best way to come up with your own serializer and deserializer is to take a look at the pickle_dump and pickle_load functions in DeepDiff's serialization module.
.. _delta_json_deserializer_label:
Json Deserializer for Delta
```````````````````````````
If all you deal with are Json serializable objects, you can use json for serialization.
>>> from deepdiff import DeepDiff, Delta
>>> from deepdiff.serialization import json_dumps, json_loads
>>> t1 = {"a": 1}
>>> t2 = {"a": 2}
>>>
>>> diff = DeepDiff(t1, t2)
>>> delta = Delta(diff, serializer=json_dumps)
>>> dump = delta.dumps()
>>> dump
'{"values_changed":{"root[\'a\']":{"new_value":2}}}'
>>> delta_reloaded = Delta(dump, deserializer=json_loads)
>>> t2 == delta_reloaded + t1
True
.. note::
Json is very limited and easily you can get to deltas that are not json serializable. You will probably want to extend the Python's Json serializer to support your needs.
>>> import json
>>> t1 = {"a": 1}
>>> t2 = {"a": None}
>>> diff = DeepDiff(t1, t2)
>>> diff
{'type_changes': {"root['a']": {'old_type': <class 'int'>, 'new_type': <class 'NoneType'>, 'old_value': 1, 'new_value': None}}}
>>> Delta(diff, serializer=json.dumps) # doctest: +SKIP
>>> delta = Delta(diff, serializer=json.dumps)
>>> dump = delta.dumps() # doctest: +ELLIPSIS
Traceback (most recent call last):
...
TypeError: Object of type type is not JSON serializable...
.. _delta_serializer_label:
Delta Serializer
----------------
DeepDiff uses pickle to serialize delta objects by default. Please take a look at the :ref:`delta_deserializer_label` for more information.
.. _to_flat_dicts_label:
Delta Serialize To Flat Dictionaries
------------------------------------
Read about :ref:`delta_to_flat_dicts_label`
.. _delta_dump_safety_label:
Delta Dump Safety
-----------------
Delta by default uses Python's pickle to serialize and deserialize. While the unrestricted use of pickle is not safe, as noted in pickle's documentation, DeepDiff's Delta is written with extra care to restrict the globals and hence mitigate this security risk.
In fact only a few Python object types are allowed by default. The user of DeepDiff can pass additional types using the :ref:`delta_safe_to_import_label` to allow further object types that need to be allowed.
.. _delta_mutate_label:
Delta Mutate parameter
----------------------
mutate : Boolean, default=False.
delta_mutate defines whether to mutate the original object when adding the delta to it or not.
Note that this parameter is not always successful in mutating. For example if your original object
is an immutable type such as a frozenset or a tuple, mutation will not succeed.
Hence it is recommended to keep this parameter as the default value of False unless you are sure
that you do not have immutable objects. There is a small overhead of doing deepcopy on the original
object when mutate=False. If performance is a concern and modifying the original object is not a big deal,
set the mutate=True but always reassign the output back to the original object.
For example:
>>> t1 = [1, 2, [3, 5, 6]]
>>> t2 = [2, 3, [3, 6, 8]]
>>> diff = DeepDiff(t1, t2, ignore_order=True, report_repetition=True)
>>> diff
{'values_changed': {'root[0]': {'new_value': 3, 'old_value': 1}, 'root[2][1]': {'new_value': 8, 'old_value': 5}}}
>>> delta = Delta(diff)
>>> delta # doctest: +SKIP
Note that we can apply delta to objects different than the original objects they were made from:
>>> t3 = ["a", 2, [3, "b", "c"]]
>>> t3 + delta
[3, 2, [3, 8, 'c']]
If we check t3, it is still the same as the original value of t3:
>>> t3
['a', 2, [3, 'b', 'c']]
Now let's make the delta with mutate=True
>>> delta2 = Delta(diff, mutate=True)
>>> t3 + delta2
[3, 2, [3, 8, 'c']]
>>> t3
[3, 2, [3, 8, 'c']]
Applying the delta to t3 mutated the t3 itself in this case!
.. _delta_and_numpy_label:
Delta and Numpy
---------------
>>> from deepdiff import DeepDiff, Delta
>>> import numpy as np
>>> t1 = np.array([1, 2, 3, 5])
>>> t2 = np.array([2, 2, 7, 5])
>>> diff = DeepDiff(t1, t2)
>>> diff
{'values_changed': {'root[0]': {'new_value': np.int64(2), 'old_value': np.int64(1)}, 'root[2]': {'new_value': np.int64(7), 'old_value': np.int64(3)}}}
>>> delta = Delta(diff)
.. note::
When applying delta to Numpy arrays, make sure to put the delta object first and the numpy array second. This is because Numpy array overrides the + operator and thus DeepDiff's Delta won't be able to be applied.
>>> t1 + delta
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
raise DeltaNumpyOperatorOverrideError(DELTA_NUMPY_OPERATOR_OVERRIDE_MSG)
deepdiff.delta.DeltaNumpyOperatorOverrideError: A numpy ndarray is most likely being added to a delta. Due to Numpy override the + operator, you can only do: delta + ndarray and NOT ndarray + delta
Let's put the delta first then:
>>> delta + t1
array([2, 2, 7, 5])
>>> delta + t2 == t2
array([ True, True, True, True])
.. note::
You can apply a delta that was created from normal Python objects to Numpy arrays. But it is not recommended.
.. _raise_errors_label:
Delta Raise Errors parameter
----------------------------
raise_errors : Boolean, default=False
Whether to raise errors or not when applying a delta object.
>>> from deepdiff import DeepDiff, Delta
>>> t1 = [1, 2, [3, 5, 6]]
>>> t2 = [2, 3, [3, 6, 8]]
>>> diff = DeepDiff(t1, t2, ignore_order=True, report_repetition=True)
>>> delta = Delta(diff, raise_errors=False)
Now let's apply the delta to a very different object:
>>> t3 = [1, 2, 3, 5]
>>> t4 = t3 + delta # doctest: +SKIP
We get a log message that it was unable to get the item at root[2][1]. We get the message since by default log_errors=True
Let's see what t4 is now:
>>> t4 # doctest: +SKIP
[3, 2, 3, 5]
So the delta was partially applied on t3.
Now let's set the raise_errors=True
>>> delta2 = Delta(diff, raise_errors=True)
>>>
>>> t3 + delta2 # doctest: +ELLIPSIS
Traceback (most recent call last):
...
deepdiff.delta.DeltaError: Unable to get the item at root[2][1]
.. _delta_safe_to_import_label:
Delta Safe To Import parameter
------------------------------
safe_to_import : Set, default=None.
safe_to_import is a set of modules that needs to be explicitly white listed to be loaded
Example: {'mymodule.MyClass', 'decimal.Decimal'}
Note that this set will be added to the basic set of modules that are already white listed.
As noted in :ref:`delta_dump_safety_label` and :ref:`delta_deserializer_label`, DeepDiff's Delta takes safety very seriously and thus limits the globals that can be deserialized when importing. However on occasions that you need a specific type (class) that needs to be used in delta objects, you need to pass it to the Delta via safe_to_import parameter.
The set of what is already white listed can be found in deepdiff.serialization.SAFE_TO_IMPORT
At the time of writing this document, this list consists of:
>>> from deepdiff.serialization import SAFE_TO_IMPORT
>>> from pprint import pprint
>>> pprint(SAFE_TO_IMPORT) # doctest: +SKIP
frozenset({'builtins.None',
'builtins.bin',
'builtins.bool',
...})
If you want to pass any other argument to safe_to_import, you will need to put the full path to the type as it appears in the sys.modules
For example let's say you have a package called mypackage and it has a module called mymodule. If you check the sys.modules, the address to this module must be mypackage.mymodule. In order for Delta to be able to serialize this object via pickle, first of all it has to be `picklable <https://docs.python.org/3/library/pickle.html#what-can-be-pickled-and-unpickled>`_.
>>> diff = DeepDiff(t1, t2)
>>> delta = Delta(diff)
>>> dump = delta.dumps()
The dump at this point is serialized via Pickle and can be written to disc if needed.
Later when you want to load this dump, by default Delta will block you from importing anything that is NOT in deepdiff.serialization.SAFE_TO_IMPORT . In fact it will show you this error message when trying to load this dump:
deepdiff.serialization.ForbiddenModule: Module 'builtins.type' is forbidden. You need to explicitly pass it by passing a safe_to_import parameter
In order to let Delta know that this specific module is safe to import, you will need to pass it to Delta during loading of this dump:
>>> delta = Delta(dump, safe_to_import={'mypackage.mymodule'})
.. note ::
If you pass a custom deserializer to Delta, DeepDiff will pass safe_to_import parameter to the custom deserializer if that deserializer takes safe_to_import as a parameter in its definition.
For example if you just use json.loads as deserializer, the safe_to_import items won't be passed to it since json.loads does not have such a parameter.
.. _delta_verify_symmetry_label:
Delta Verify Symmetry parameter
-------------------------------
bidirectional : Boolean, default=False
bidirectional is used to include all the required information so that we can use the delta object both for addition and subtraction. It will also check that the object you are adding the delta to has the same values as the original object that the delta was created from.
It complains if the object is not what it expected to be.
>>> from deepdiff import DeepDiff, Delta
>>> t1 = [1]
>>> t2 = [2]
>>> t3 = [3]
>>>
>>> diff = DeepDiff(t1, t2)
>>>
>>> delta2 = Delta(diff, raise_errors=False, bidirectional=True)
>>> t4 = delta2 + t3 # doctest: +SKIP
>>> t4 # doctest: +SKIP
[2]
And if you had set raise_errors=True, then it would have raised the error in addition to logging it.
.. _delta_force_label:
Delta Force
-----------
force : Boolean, default=False
force is used to force apply a delta to objects that have a different structure than what the delta was originally created from.
>>> from deepdiff import DeepDiff, Delta
>>> t1 = {
... 'x': {
... 'y': [1, 2, 3]
... },
... 'q': {
... 'r': 'abc',
... }
... }
>>>
>>> t2 = {
... 'x': {
... 'y': [1, 2, 3, 4]
... },
... 'q': {
... 'r': 'abc',
... 't': 0.5,
... }
... }
>>>
>>> diff = DeepDiff(t1, t2)
>>> diff
{'dictionary_item_added': ["root['q']['t']"], 'iterable_item_added': {"root['x']['y'][3]": 4}}
>>> delta = Delta(diff)
>>> {} + delta # doctest: +SKIP
{}
Once we set the force to be True
>>> delta = Delta(diff, force=True)
>>> {} + delta
{'x': {'y': {3: 4}}, 'q': {'t': 0.5}}
Notice that the force attribute does not know the original object at ['x']['y'] was supposed to be a list, so it assumes it was a dictionary.
.. _always_include_values_label:
Always Include Values
---------------------
always_include_values is used to make sure the delta object includes the values that were changed. Sometimes Delta tries to be efficient and does not include the values when it can get away with it. By setting this parameter to True, you ensure that the Delta object will include the values.
For example, when the type of an object changes, if we can easily convert from one type to the other, the Delta object does not include the values:
>>> from deepdiff import DeepDiff, Delta
>>> diff = DeepDiff(t1=[1, 2], t2=[1, '2'])
>>> diff
{'type_changes': {'root[1]': {'old_type': <class 'int'>, 'new_type': <class 'str'>, 'old_value': 2, 'new_value': '2'}}}
>>> delta=Delta(diff)
>>> delta # doctest: +SKIP
As you can see the delta object does not include the values that were changed. Now let's pass always_include_values=True:
>>> delta=Delta(diff, always_include_values=True)
>>> delta.diff
{'type_changes': {'root[1]': {'old_type': <class 'int'>, 'new_type': <class 'str'>, 'new_value': '2'}}}
If we want to make sure the old values stay with delta, we pass bidirectional=True. By doing so we can also use the delta object to subtract from other objects.
>>> delta=Delta(diff, always_include_values=True, bidirectional=True)
>>> delta.diff
{'type_changes': {'root[1]': {'old_type': <class 'int'>, 'new_type': <class 'str'>, 'old_value': 2, 'new_value': '2'}}}
qlustered-deepdiff-41c7265/deepdiff/docstrings/diff.rst 0000664 0000000 0000000 00000000551 15162412645 0023141 0 ustar 00root root 0000000 0000000 :doc:`/index`
.. _deepdiff_label:
DeepDiff
========
.. automodule:: deepdiff.diff
.. autoclass:: DeepDiff
:members:
.. toctree::
:maxdepth: 3
basics
custom
deep_distance
exclude_paths
ignore_order
ignore_types_or_values
numbers
optimizations
other
serialization
stats
troubleshoot
view
Back to :doc:`/index`
qlustered-deepdiff-41c7265/deepdiff/docstrings/diff_doc.rst 0000664 0000000 0000000 00000041471 15162412645 0023774 0 ustar 00root root 0000000 0000000 :orphan:
.. |qluster_link| raw:: html
Qluster
.. admonition:: DeepDiff is now part of |qluster_link|.
*If you're building workflows around data validation and correction,* `Qluster `__ *gives your team a structured way to manage rules, review failures, approve fixes, and reuse decisions—without building the entire system from scratch.*
DeepDiff Module
===============
Deep Difference of dictionaries, iterables, strings and almost any other object.
It will recursively look for all the changes.
**Parameters**
t1 : A dictionary, list, string or any python object that has __dict__ or __slots__
This is the first item to be compared to the second item
t2 : dictionary, list, string or almost any python object that has __dict__ or __slots__
The second item is to be compared to the first one
cutoff_distance_for_pairs : 1 >= float >= 0, default=0.3
:ref:`cutoff_distance_for_pairs_label` What is the threshold to consider 2 items as pairs.
Note that it is only used when ignore_order = True.
cutoff_intersection_for_pairs : 1 >= float >= 0, default=0.7
:ref:`cutoff_intersection_for_pairs_label` What is the threshold to calculate pairs of items between 2 iterables.
For example 2 iterables that have nothing in common, do not need their pairs to be calculated.
Note that it is only used when ignore_order = True.
cache_size : int >= 0, default=0
:ref:`cache_size_label` Cache size to be used to improve the performance. A cache size of zero means it is disabled.
Using the cache_size can dramatically improve the diff performance especially for the nested objects at the cost of more memory usage.
cache_purge_level: int, 0, 1, or 2. default=1
:ref:`cache_purge_level` defines what objects in DeepDiff should be deleted to free the memory once the diff object is calculated. If this value is set to zero, most of the functionality of the diff object is removed and the most memory is released. A value of 1 preserves all the functionalities of the diff object. A value of 2 also preserves the cache and hashes that were calculated during the diff calculations. In most cases the user does not need to have those objects remained in the diff unless for investigation purposes.
cache_tuning_sample_size : int >= 0, default = 0
:ref:`cache_tuning_sample_size_label` This is an experimental feature. It works hand in hand with the :ref:`cache_size_label`. When cache_tuning_sample_size is set to anything above zero, it will sample the cache usage with the passed sample size and decide whether to use the cache or not. It will also turn the cache back on occasionally during the diffing process. This option can be useful if you are not sure if you need any cache or not. However you will gain much better performance with keeping this parameter zero and running your diff with different cache sizes and benchmarking to find the optimal cache size.
custom_operators : BaseOperator subclasses, default = None
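:ref:`custom_operators_label` Whether two objects are different or not largely depends on the context. For example, apples and bananas are the same if you are considering whether they are fruits or not. In that case, you can pass *custom_operators* for the job.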
default_timezone : datetime.timezone subclasses or pytz datetimes, default = datetime.timezone.utc
:ref:`default_timezone_label` defines the default timezone. If a datetime is timezone naive, which means it doesn't have a timezone, we assume the datetime is in this timezone. Also any datetime that has a timezone will be converted to this timezone so the datetimes can be compared properly all in the same timezone. Note that Python's default behavior assumes the default timezone is your local timezone. DeepDiff's default is UTC, not your local time zone.
encodings: List, default = None
:ref:`encodings_label` Character encodings to iterate through when we convert bytes into strings. You may want to pass an explicit list of encodings in your objects if you start getting UnicodeDecodeError from DeepHash. Also check out :ref:`ignore_encoding_errors_label` if you can get away with ignoring these errors and don't want to bother with an explicit list of encodings but it will come at the price of slightly less accuracy of the final results. Example: encodings=["utf-8", "latin-1"]
exclude_paths: list, default = None
:ref:`exclude_paths_label`
List of paths to exclude from the report. If only one item, you can pass it as a string.
exclude_regex_paths: list, default = None
:ref:`exclude_regex_paths_label`
List of string regex paths or compiled regex paths objects to exclude from the report. If only one item, you can pass it as a string or regex compiled object.
exclude_types: list, default = None
:ref:`exclude_types_label`
List of object types to exclude from the report.
exclude_obj_callback: function, default = None
:ref:`exclude_obj_callback_label`
A function that takes the object and its path and returns a Boolean. If True is returned, the object is excluded from the results, otherwise it is included.
This is to give the user a higher level of control than one can achieve via exclude_paths, exclude_regex_paths or other means.
exclude_obj_callback_strict: function, default = None
:ref:`exclude_obj_callback_strict_label`
A function that works the same way as exclude_obj_callback, but excludes elements from the result only if the function returns True for both elements.
include_paths: list, default = None
:ref:`include_paths_label`
List of the only paths to include in the report. If only one item is in the list, you can pass it as a string.
include_obj_callback: function, default = None
:ref:`include_obj_callback_label`
A function that takes the object and its path and returns a Boolean. If True is returned, the object is included in the results, otherwise it is excluded.
This is to give the user a higher level of control than one can achieve via include_paths.
include_obj_callback_strict: function, default = None
:ref:`include_obj_callback_strict_label`
A function that works the same way as include_obj_callback, but includes elements in the result only if the function returns True for both elements.
get_deep_distance: Boolean, default = False
:ref:`get_deep_distance_label` will get you the deep distance between objects. The distance is a number between 0 and 1 where zero means there is no diff between the 2 objects and 1 means they are very different. Note that this number should only be used to compare the similarity of 2 objects and nothing more. The algorithm for calculating this number may or may not change in the future releases of DeepDiff.
group_by: String or a list of size 2, default=None
:ref:`group_by_label` can be used when dealing with the list of dictionaries. It converts them from lists to a single dictionary with the key defined by group_by. The common use case is when reading data from a flat CSV, and the primary key is one of the columns in the CSV. We want to use the primary key instead of the CSV row number to group the rows. The group_by can do 2D group_by by passing a list of 2 keys.
group_by_sort_key: String or a function
:ref:`group_by_sort_key_label` is used to define how dictionaries are sorted if multiple ones fall under one group. When this parameter is used, group_by converts the lists of dictionaries into a dictionary of keys to lists of dictionaries. Then, :ref:`group_by_sort_key_label` is used to sort the dictionaries within each of those lists.
hasher: default = DeepHash.sha256hex
Hash function to be used. If you don't want SHA256, you can use your own hash function
by passing hasher=hash. This is for advanced usage and normally you don't need to modify it.
ignore_order : Boolean, default=False
:ref:`ignore_order_label` ignores order of elements when comparing iterables (lists)
Normally ignore_order does not report duplicates and repetition changes.
In order to report repetitions, set report_repetition=True in addition to ignore_order=True
ignore_order_func : Function, default=None
:ref:`ignore_order_func_label` Sometimes a single *ignore_order* parameter is not enough to do a diff job,
you can use *ignore_order_func* to determine whether the order of certain paths should be ignored
ignore_string_type_changes: Boolean, default = False
:ref:`ignore_string_type_changes_label`
Whether to ignore string type changes or not. For example b"Hello" vs. "Hello" are considered the same if ignore_string_type_changes is set to True.
ignore_numeric_type_changes: Boolean, default = False
:ref:`ignore_numeric_type_changes_label`
Whether to ignore numeric type changes or not. For example 10 vs. 10.0 are considered the same if ignore_numeric_type_changes is set to True.
ignore_type_in_groups: Tuple or List of Tuples, default = None
:ref:`ignore_type_in_groups_label`
ignores types when t1 and t2 are both within the same type group.
ignore_type_subclasses: Boolean, default = False
:ref:`ignore_type_subclasses_label`
ignore type (class) changes when dealing with the subclasses of classes that were marked to be ignored.
.. Note::
ignore_type_subclasses was incorrectly doing the reverse of its job up until DeepDiff 6.7.1
Please make sure to flip it in your use cases, when upgrading from older versions to 7.0.0 or above.
ignore_uuid_types: Boolean, default = False
:ref:`ignore_uuid_types_label`
Whether to ignore UUID vs string type differences when comparing. When set to True, comparing a UUID object with its string representation will not report as a type change.
ignore_string_case: Boolean, default = False
:ref:`ignore_string_case_label`
Whether to be case-sensitive or not when comparing strings. By setting ignore_string_case=True, strings will be compared case-insensitively.
ignore_nan_inequality: Boolean, default = False
:ref:`ignore_nan_inequality_label`
Whether to ignore float('nan') inequality in Python.
ignore_private_variables: Boolean, default = True
:ref:`ignore_private_variables_label`
Whether to exclude the private variables in the calculations or not. It only affects variables that start with double underscores (__).
ignore_encoding_errors: Boolean, default = False
:ref:`ignore_encoding_errors_label` If you want to get away with UnicodeDecodeError without passing explicit character encodings, set this option to True. If you want to make sure the encoding is done properly, keep this as False and instead pass an explicit list of character encodings to be considered via the :ref:`encodings_label` parameter.
zip_ordered_iterables: Boolean, default = False
:ref:`zip_ordered_iterables_label`:
When comparing ordered iterables such as lists, DeepDiff tries to find the smallest difference between the two iterables to report. That means that items in the two lists are not paired individually in the order of appearance in the iterables. Sometimes, that is not the desired behavior. Set this flag to True to make DeepDiff pair and compare the items in the iterables in the order they appear.
iterable_compare_func:
:ref:`iterable_compare_func_label`:
There are times that we want to guide DeepDiff as to what items to compare with other items. In such cases we can pass an iterable_compare_func: a function that compares two items. The function takes three parameters (x, y, level) and should return True if it is a match, False if it is not a match or raise CannotCompare if it is unable to compare the two.
log_frequency_in_sec: Integer, default = 0
:ref:`log_frequency_in_sec_label`
How often to log the progress. The default of 0 means logging progress is disabled.
If you set it to 20, it will log every 20 seconds. This is useful only when running DeepDiff
on massive objects that will take a while to run. If you are only dealing with small objects, keep it at 0 to disable progress logging.
log_scale_similarity_threshold: float, default = 0.1
:ref:`use_log_scale_label` along with :ref:`log_scale_similarity_threshold_label` can be used to ignore small changes in numbers by comparing their differences in logarithmic space. This is different than ignoring the difference based on significant digits.
log_stacktrace: Boolean, default = False
If True, we log the stacktrace when logging errors. Otherwise we only log the error message.
max_passes: Integer, default = 10000000
:ref:`max_passes_label` defines the maximum number of passes to run on objects to pinpoint what exactly is different. This is only used when ignore_order=True. A new pass is started each time 2 iterables are compared in a way that every single item that is different from the first one is compared to every single item that is different in the second iterable.
max_diffs: Integer, default = None
:ref:`max_diffs_label` defines the maximum number of diffs to run on objects to pinpoint what exactly is different. This is only used when ignore_order=True
math_epsilon: Decimal, default = None
:ref:`math_epsilon_label` uses Python's built-in math.isclose. It defines a tolerance value which is passed to math.isclose(). Any numbers that are within the tolerance will not report as being different. Any numbers outside of that tolerance will show up as different.
number_format_notation : string, default="f"
:ref:`number_format_notation_label` is what defines the meaning of significant digits. The default value of "f" means the digits AFTER the decimal point. "f" stands for fixed point. The other option is "e" which stands for exponent notation or scientific notation.
number_to_string_func : function, default=None
:ref:`number_to_string_func_label` is an advanced feature to give the user the full control into overriding how numbers are converted to strings for comparison. The default function is defined in https://github.com/seperman/deepdiff/blob/master/deepdiff/helper.py and is called number_to_string. You can define your own function to do that.
progress_logger: log function, default = logger.info
:ref:`progress_logger_label` defines what logging function to use specifically for progress reporting. This function is only used when progress logging is enabled which happens by setting log_frequency_in_sec to anything above zero.
report_repetition : Boolean, default=False
:ref:`report_repetition_label` reports repetitions when set True
It only works when ignore_order is set to True too.
significant_digits : int >= 0, default=None
:ref:`significant_digits_label` defines the number of digits AFTER the decimal point to be used in the comparison. However you can override that by setting the number_format_notation="e" which will make it mean the digits in scientific notation.
truncate_datetime: string, default = None
:ref:`truncate_datetime_label` can take one of the values 'second', 'minute', 'hour' or 'day'. Datetime objects are truncated to that precision before they are hashed.
threshold_to_diff_deeper: float, default = 0.33
:ref:`threshold_to_diff_deeper_label` is a number between 0 and 1. When comparing dictionaries that have a small intersection of keys, we will report the dictionary as a new_value instead of reporting individual keys changed. If you set it to zero, you get the same results as DeepDiff 7.0.1 and earlier, which means this feature is disabled. The new default is 0.33 which means if less than one third of keys between dictionaries intersect, report it as a new object.
use_enum_value: Boolean, default=False
:ref:`use_enum_value_label` makes it so when diffing enum, we use the enum's value. It makes it so comparing an enum to a string or any other value is not reported as a type change.
use_log_scale: Boolean, default=False
:ref:`use_log_scale_label` along with :ref:`log_scale_similarity_threshold_label` can be used to ignore small changes in numbers by comparing their differences in logarithmic space. This is different than ignoring the difference based on significant digits.
verbose_level: 2 >= int >= 0, default = 1
Higher verbose level shows you more details.
For example verbose level 1 shows what dictionary items are added or removed.
And verbose level 2 shows the value of the items that are added or removed too.
view: string, default = text
:ref:`view_label`
Views are different "formats" of results. Each view comes with its own features.
The choices are text (the default) and tree.
The text view is the original format of the results.
The tree view allows you to traverse through the tree of results. So you can traverse through the tree and see what items were compared to what.
**Returns**
A DeepDiff object that has already calculated the difference of the 2 items. The format of the object is chosen by the view parameter.
**Supported data types**
int, string, unicode, dictionary, list, tuple, set, frozenset, OrderedDict, NamedTuple, Numpy, custom objects and more!
.. Note::
|:mega:| **Please fill out our** `fast 5-question survey `__ so that we can learn how & why you use DeepDiff, and what improvements we should make. Thank you! |:dancers:|
qlustered-deepdiff-41c7265/deepdiff/docstrings/dsearch.rst 0000664 0000000 0000000 00000000336 15162412645 0023643 0 ustar 00root root 0000000 0000000 :doc:`/index`
.. _deepsearch_label:
DeepSearch
==========
.. toctree::
:maxdepth: 3
.. automodule:: deepdiff.search
.. autoclass:: grep
:members:
.. autoclass:: DeepSearch
:members:
Back to :doc:`/index`
qlustered-deepdiff-41c7265/deepdiff/docstrings/exclude_paths.rst 0000664 0000000 0000000 00000007014 15162412645 0025062 0 ustar 00root root 0000000 0000000 :doc:`/index`
.. _exclude_paths_label:
Exclude Paths
=============
Exclude part of your object tree from comparison.
Use exclude_paths and pass a set or list of paths to exclude. If only one item is being passed, then just put it there as a string. No need to pass it as a list then.
Example
>>> t1 = {"for life": "vegan", "ingredients": ["no meat", "no eggs", "no dairy"]}
>>> t2 = {"for life": "vegan", "ingredients": ["veggies", "tofu", "soy sauce"]}
>>> print (DeepDiff(t1, t2, exclude_paths="root['ingredients']")) # one item pass it as a string
{}
>>> print (DeepDiff(t1, t2, exclude_paths=["root['ingredients']", "root['ingredients2']"])) # multiple items pass as a list or a set.
{}
Also for root keys you don't have to pass as "root['key']". You can instead just pass the key:
Example
>>> t1 = {"for life": "vegan", "ingredients": ["no meat", "no eggs", "no dairy"]}
>>> t2 = {"for life": "vegan", "ingredients": ["veggies", "tofu", "soy sauce"]}
>>> print (DeepDiff(t1, t2, exclude_paths="ingredients")) # one item pass it as a string
{}
>>> print (DeepDiff(t1, t2, exclude_paths=["ingredients", "ingredients2"])) # multiple items pass as a list or a set.
{}
.. _include_paths_label:
Include Paths
=============
Only include this part of your object tree in the comparison.
Use include_paths and pass a set or list of paths to limit diffing to only those paths. If only one item is being passed, just put it there as a string—no need to pass it as a list then.
Example
>>> t1 = {"for life": "vegan", "ingredients": ["no meat", "no eggs", "no dairy"]}
>>> t2 = {"for life": "vegan", "ingredients": ["veggies", "tofu", "soy sauce"]}
>>> print (DeepDiff(t1, t2, include_paths="root['for life']")) # one item pass it as a string
{}
>>> print (DeepDiff(t1, t2, include_paths=["for life", "ingredients2"])) # multiple items pass as a list or a set and you don't need to pass the full path when dealing with root keys. So instead of "root['for life']" you can pass "for life"
{}
When passing include_paths, all the children of that path will be included too.
Example
>>> t1 = {
... "foo": {"bar": "potato"},
... "ingredients": ["no meat", "no eggs", "no dairy"]
... }
>>> t2 = {
... "foo": {"bar": "banana"},
... "ingredients": ["bread", "cheese"]
... }
>>> DeepDiff(t1, t2, include_paths="foo")
{'values_changed': {"root['foo']['bar']": {'new_value': 'banana', 'old_value': 'potato'}}}
.. _exclude_regex_paths_label:
Exclude Regex Paths
-------------------
You can also exclude using regular expressions by using `exclude_regex_paths` and pass a set or list of path regexes to exclude. The items in the list could be raw regex strings or compiled regex objects.
>>> import re
>>> t1 = [{'a': 1, 'b': 2}, {'c': 4, 'b': 5}]
>>> t2 = [{'a': 1, 'b': 3}, {'c': 4, 'b': 5}]
>>> print(DeepDiff(t1, t2, exclude_regex_paths=r"root\[\d+\]\['b'\]"))
{}
>>> exclude_path = re.compile(r"root\[\d+\]\['b'\]")
>>> print(DeepDiff(t1, t2, exclude_regex_paths=[exclude_path]))
{}
example 2:
>>> t1 = {'a': [1, 2, [3, {'foo1': 'bar'}]]}
>>> t2 = {'a': [1, 2, [3, {'foo2': 'bar'}]]}
>>> DeepDiff(t1, t2, exclude_regex_paths=r"\['foo.'\]") # since it is one item in exclude_regex_paths, you don't have to put it in a list or a set.
{}
Tip: DeepDiff is using re.search on the path. So if you want to force it to match from the beginning of the path, add `^` to the beginning of regex.
Back to :doc:`/index`
qlustered-deepdiff-41c7265/deepdiff/docstrings/extract.rst 0000664 0000000 0000000 00000000205 15162412645 0023677 0 ustar 00root root 0000000 0000000 :doc:`/index`
.. _extract_label:
Extract
=======
.. automodule:: deepdiff.path
.. autofunction:: extract
Back to :doc:`/index`
qlustered-deepdiff-41c7265/deepdiff/docstrings/faq.rst 0000664 0000000 0000000 00000015355 15162412645 0023010 0 ustar 00root root 0000000 0000000 :doc:`/index`
F.A.Q
=====
.. |qluster_link| raw:: html
Qluster
.. admonition:: DeepDiff is now part of |qluster_link|.
*If you're building workflows around data validation and correction,* `Qluster `__ *gives your team a structured way to manage rules, review failures, approve fixes, and reuse decisions—without building the entire system from scratch.*
.. Note::
|:mega:| **Please fill out our** `fast 5-question survey `__ so that we can learn how & why you use DeepDiff, and what improvements we should make. Thank you! |:dancers:|
Q: DeepDiff report is not precise when ignore_order=True
--------------------------------------------------------
>>> from deepdiff import DeepDiff
>>> from pprint import pprint
>>> t1 = [
... {
... "key": "some/pathto/customers/foo/",
... "flags": 0,
... "value": ""
... },
... {
... "key": "some/pathto/customers/foo/account_number",
... "flags": 0,
... "value": "somevalue1"
... }
... ]
>>>
>>> t2 = [
... {
... "key": "some/pathto/customers/foo/account_number",
... "flags": 0,
... "value": "somevalue2"
... },
... {
... "key": "some/pathto/customers/foo/",
... "flags": 0,
... "value": "new"
... }
... ]
>>>
>>> pprint(DeepDiff(t1, t2))
{'values_changed': {"root[0]['key']": {'new_value': 'some/pathto/customers/foo/account_number',
'old_value': 'some/pathto/customers/foo/'},
"root[0]['value']": {'new_value': 'somevalue2',
'old_value': ''},
"root[1]['key']": {'new_value': 'some/pathto/customers/foo/',
'old_value': 'some/pathto/customers/foo/account_number'},
"root[1]['value']": {'new_value': 'new',
'old_value': 'somevalue1'}}}
**Answer**
This is explained in :ref:`cutoff_distance_for_pairs_label` and :ref:`cutoff_intersection_for_pairs_label`
Bump up these 2 parameters to 1 and you get what you want:
>>> pprint(DeepDiff(t1, t2, ignore_order=True, cutoff_distance_for_pairs=1, cutoff_intersection_for_pairs=1))
{'values_changed': {"root[0]['value']": {'new_value': 'new', 'old_value': ''},
"root[1]['value']": {'new_value': 'somevalue2',
'old_value': 'somevalue1'}}}
Q: The report of changes in a nested dictionary is too granular
---------------------------------------------------------------
**Answer**
Use :ref:`threshold_to_diff_deeper_label`
>>> from deepdiff import DeepDiff
>>> t1 = {"veggie": "carrots"}
>>> t2 = {"meat": "carrots"}
>>>
>>> DeepDiff(t1, t2, threshold_to_diff_deeper=0)
{'dictionary_item_added': ["root['meat']"], 'dictionary_item_removed': ["root['veggie']"]}
>>> DeepDiff(t1, t2, threshold_to_diff_deeper=0.33)
{'values_changed': {'root': {'new_value': {'meat': 'carrots'}, 'old_value': {'veggie': 'carrots'}}}}
Q: TypeError: Object of type type is not JSON serializable
----------------------------------------------------------
I'm trying to serialize the DeepDiff results into json and I'm getting the TypeError.
>>> diff=DeepDiff(1, "a")
>>> diff
{'type_changes': {'root': {'old_type': <class 'int'>, 'new_type': <class 'str'>, 'old_value': 1, 'new_value': 'a'}}}
>>> json.dumps(diff)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File ".../json/__init__.py", line 231, in dumps
return _default_encoder.encode(obj)
File ".../json/encoder.py", line 199, in encode
chunks = self.iterencode(o, _one_shot=True)
File ".../json/encoder.py", line 257, in iterencode
return _iterencode(o, 0)
File ".../json/encoder.py", line 179, in default
raise TypeError(f'Object of type {o.__class__.__name__} '
TypeError: Object of type type is not JSON serializable
**Answer**
In order to serialize DeepDiff results into json, use to_json()
>>> diff.to_json()
'{"type_changes": {"root": {"old_type": "int", "new_type": "str", "old_value": 1, "new_value": "a"}}}'
Q: How do I parse DeepDiff result paths?
----------------------------------------
**Answer**
Use parse_path:
>>> from deepdiff import parse_path
>>> parse_path("root[1][2]['age']")
[1, 2, 'age']
>>> parse_path("root[1][2]['age']", include_actions=True)
[{'element': 1, 'action': 'GET'}, {'element': 2, 'action': 'GET'}, {'element': 'age', 'action': 'GET'}]
>>>
>>> parse_path("root['joe'].age")
['joe', 'age']
>>> parse_path("root['joe'].age", include_actions=True)
[{'element': 'joe', 'action': 'GET'}, {'element': 'age', 'action': 'GETATTR'}]
Or use the tree view so you can use path(output_format='list'):
>>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, 3, 4]}}
>>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2]}}
>>> ddiff = DeepDiff(t1, t2, view='tree')
>>> ddiff
{'iterable_item_removed': [<root[4]['b'][2] t1:3, t2:not present>, <root[4]['b'][3] t1:4, t2:not present>]}
>>> # Note that the iterable_item_removed is a set. In this case it has 2 items in it.
>>> # One way to get one item from the set is to convert it to a list
>>> # And then get the first item of the list:
>>> removed = list(ddiff['iterable_item_removed'])[0]
>>> removed
<root[4]['b'][2] t1:3, t2:not present>
>>>
>>> parent = removed.up
>>> parent
<root[4]['b'] t1:[1, 2, 3, 4], t2:[1, 2]>
>>> parent.path() # gives you the string representation of the path
"root[4]['b']"
>>> parent.path(output_format='list') # gives you the list of keys and attributes that make up the path
[4, 'b']
Q: Why are my datetimes reported in UTC?
----------------------------------------
**Answer**
DeepDiff converts all datetimes into UTC. If a datetime is timezone naive, we assume it is in UTC too.
That is different than what Python does. Python assumes your timezone naive datetime is in your local timezone. However, you can override it to any other time zone such as your :ref:`default_timezone_label`.
>>> from deepdiff import DeepDiff
>>> from datetime import datetime, timezone
>>> d1 = datetime(2020, 8, 31, 13, 14, 1)
>>> d2 = datetime(2020, 8, 31, 13, 14, 1, tzinfo=timezone.utc)
>>> d1 == d2
False
>>> DeepDiff(d1, d2)
{}
>>> d3 = d2.astimezone(pytz.timezone('America/New_York'))
>>> DeepDiff(d1, d3)
{}
>>> d1 == d3
False
---------
Back to :doc:`/index`
qlustered-deepdiff-41c7265/deepdiff/docstrings/ignore_order.rst 0000664 0000000 0000000 00000034010 15162412645 0024704 0 ustar 00root root 0000000 0000000 :doc:`/index`
.. _ignore_order_label:
Ignore Order
============
DeepDiff by default compares objects in the order that it iterates through them in iterables.
In other words if you have 2 lists, then the first item of the lists are compared to each other, then the 2nd items and so on.
That allows DeepDiff to run in linear time.
However, there are often times when you don't care about the order in which the items have appeared.
In such cases DeepDiff needs to do way more work in order to find the differences.
There are a couple of parameters provided to give you full control over how order is ignored.
List difference with ignore_order=False which is the default:
>>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, 3]}}
>>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 3, 2, 3]}}
>>> ddiff = DeepDiff(t1, t2)
>>> pprint (ddiff, indent = 2)
{ 'iterable_item_added': {"root[4]['b'][3]": 3},
'values_changed': { "root[4]['b'][1]": {'new_value': 3, 'old_value': 2},
"root[4]['b'][2]": {'new_value': 2, 'old_value': 3}}}
Ignore Order
------------
List difference ignoring order or duplicates: (with the same dictionaries as above)
>>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, 3]}}
>>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 3, 2, 3]}}
>>> ddiff = DeepDiff(t1, t2, ignore_order=True)
>>> print (ddiff)
{}
.. _ignore_order_func_label:
Dynamic Ignore Order
--------------------
Sometimes a single *ignore_order* parameter is not enough to do the diff job.
In that case you can use *ignore_order_func* to determine whether the order of certain paths should be ignored.
List difference ignoring order with *ignore_order_func*
>>> t1 = {"set": [1,2,3], "list": [1,2,3]}
>>> t2 = {"set": [3,2,1], "list": [3,2,1]}
>>> ddiff = DeepDiff(t1, t2, ignore_order_func=lambda level: "set" in level.path())
>>> print (ddiff)
{ 'values_changed': { "root['list'][0]": {'new_value': 3, 'old_value': 1},
"root['list'][2]": {'new_value': 1, 'old_value': 3}}}
Ignoring order when a certain word is in the path
>>> from deepdiff import DeepDiff
>>> t1 = {'a': [1, 2], 'b': [3, 4]}
>>> t2 = {'a': [2, 1], 'b': [4, 3]}
>>> DeepDiff(t1, t2, ignore_order=True)
{}
>>> def ignore_order_func(level):
... return 'a' in level.path()
...
>>> DeepDiff(t1, t2, ignore_order=True, ignore_order_func=ignore_order_func)
{'values_changed': {"root['b'][0]": {'new_value': 4, 'old_value': 3}, "root['b'][1]": {'new_value': 3, 'old_value': 4}}}
.. _report_repetition_label:
Reporting Repetitions
---------------------
List difference ignoring order and reporting repetitions:
>>> from deepdiff import DeepDiff
>>> from pprint import pprint
>>> t1 = [1, 3, 1, 4]
>>> t2 = [4, 4, 1]
>>> ddiff = DeepDiff(t1, t2, ignore_order=True, report_repetition=True)
>>> pprint(ddiff, indent=2)
{ 'iterable_item_removed': {'root[1]': 3},
'repetition_change': { 'root[0]': { 'new_indexes': [2],
'new_repeat': 1,
'old_indexes': [0, 2],
'old_repeat': 2,
'value': 1},
'root[3]': { 'new_indexes': [0, 1],
'new_repeat': 2,
'old_indexes': [3],
'old_repeat': 1,
'value': 4}}}
.. _max_passes_label:
Max Passes
----------
max_passes: Integer, default = 10000000
Maximum number of passes to run on objects to pin point what exactly is different. This is only used when ignore_order=True
If you have deeply nested objects, DeepDiff needs to run multiple passes in order to pin point the difference.
That can dramatically increase the time spent to find the difference.
You can control the maximum number of passes that can be run via the max_passes parameter.
.. note::
The definition of pass is whenever 2 iterable objects are being compared with each other and deepdiff decides to compare every single element of one iterable with every single element of the other iterable.
Refer to :ref:`cutoff_distance_for_pairs_label` and :ref:`cutoff_intersection_for_pairs_label` for more info on how DeepDiff decides to start a new pass.
Max Passes Example
>>> from pprint import pprint
>>> from deepdiff import DeepDiff
>>>
>>> t1 = [
... {
... 'key3': [[[[[1, 2, 4, 5]]]]],
... 'key4': [7, 8],
... },
... {
... 'key5': 'val5',
... 'key6': 'val6',
... },
... ]
>>>
>>> t2 = [
... {
... 'key5': 'CHANGE',
... 'key6': 'val6',
... },
... {
... 'key3': [[[[[1, 3, 5, 4]]]]],
... 'key4': [7, 8],
... },
... ]
>>>
>>> for max_passes in (1, 2, 62, 65):
... diff = DeepDiff(t1, t2, ignore_order=True, max_passes=max_passes, verbose_level=2)
... print('-\n----- Max Passes = {} -----'.format(max_passes))
... pprint(diff)
...
DeepDiff has reached the max number of passes of 1. You can possibly get more accurate results by increasing the max_passes parameter.
-
----- Max Passes = 1 -----
{'values_changed': {'root[0]': {'new_value': {'key5': 'CHANGE', 'key6': 'val6'},
'old_value': {'key3': [[[[[1, 2, 4, 5]]]]],
'key4': [7, 8]}},
'root[1]': {'new_value': {'key3': [[[[[1, 3, 5, 4]]]]],
'key4': [7, 8]},
'old_value': {'key5': 'val5', 'key6': 'val6'}}}}
DeepDiff has reached the max number of passes of 2. You can possibly get more accurate results by increasing the max_passes parameter.
-
----- Max Passes = 2 -----
{'values_changed': {"root[0]['key3'][0]": {'new_value': [[[[1, 3, 5, 4]]]],
'old_value': [[[[1, 2, 4, 5]]]]},
"root[1]['key5']": {'new_value': 'CHANGE',
'old_value': 'val5'}}}
DeepDiff has reached the max number of passes of 62. You can possibly get more accurate results by increasing the max_passes parameter.
-
----- Max Passes = 62 -----
{'values_changed': {"root[0]['key3'][0][0][0][0]": {'new_value': [1, 3, 5, 4],
'old_value': [1, 2, 4, 5]},
"root[1]['key5']": {'new_value': 'CHANGE',
'old_value': 'val5'}}}
DeepDiff has reached the max number of passes of 65. You can possibly get more accurate results by increasing the max_passes parameter.
-
----- Max Passes = 65 -----
{'values_changed': {"root[0]['key3'][0][0][0][0][1]": {'new_value': 3,
'old_value': 2},
"root[1]['key5']": {'new_value': 'CHANGE',
'old_value': 'val5'}}}
.. note::
If there are potential passes left to be run and the max_passes value is reached, DeepDiff will issue a warning.
However the most accurate result might have already been found when there are still potential passes left to be run.
For example in the above example at max_passes=64, DeepDiff finds the optimal result however it has one more pass
to go before it has run all the potential passes. Hence just for the sake of example we are using max_passes=65
as an example of a number that doesn't issue warnings.
.. note::
Also take a look at :ref:`max_passes_label`
.. _cutoff_distance_for_pairs_label:
Cutoff Distance For Pairs
-------------------------
cutoff_distance_for_pairs : 1 >= float >= 0, default=0.3
What is the threshold to consider 2 items as potential pairs.
Note that it is only used when ignore_order = True.
cutoff_distance_for_pairs in combination with :ref:`cutoff_intersection_for_pairs_label` are the parameters that decide whether 2 objects are paired with each other during the ignore_order=True algorithm or not. Note that these parameters are mainly used for nested iterables.
For example by going from the default of cutoff_distance_for_pairs=0.3 to 0.1, we have essentially disallowed the 1.0 and 20.0 to be paired with each other. As you can see, DeepDiff has calculated the :ref:`deep_distance_label` of 1.0 and 20.0 to be around 0.27. Since that is way above the cutoff_distance_for_pairs of 0.1, the 2 items are not paired. As a result the lists containing the 2 numbers are directly compared with each other:
>>> from deepdiff import DeepDiff
>>> t1 = [[1.0]]
>>> t2 = [[20.0]]
>>> DeepDiff(t1, t2, ignore_order=True, cutoff_distance_for_pairs=0.3)
{'values_changed': {'root[0][0]': {'new_value': 20.0, 'old_value': 1.0}}}
>>> DeepDiff(t1, t2, ignore_order=True, cutoff_distance_for_pairs=0.1)
{'values_changed': {'root[0]': {'new_value': [20.0], 'old_value': [1.0]}}}
>>> DeepDiff(1.0, 20.0, get_deep_distance=True)
{'values_changed': {'root': {'new_value': 20.0, 'old_value': 1.0}}, 'deep_distance': 0.2714285714285714}
.. _cutoff_intersection_for_pairs_label:
Cutoff Intersection For Pairs
-----------------------------
cutoff_intersection_for_pairs : 1 >= float >= 0, default=0.7
What is the threshold to calculate pairs of items between 2 iterables.
For example 2 iterables that have nothing in common, do not need their pairs to be calculated.
Note that it is only used when ignore_order = True.
Behind the scenes DeepDiff takes the :ref:`deep_distance_label` of objects when running ignore_order=True.
The distance is between zero and 1.
A distance of zero means the items are equal. A distance of 1 means they are 100% different.
When comparing iterables, the cutoff_intersection_for_pairs is used to decide whether to compare every single item in each iterable
with every single item in the other iterable or not. If the distance between the 2 iterables is equal or bigger than the
cutoff_intersection_for_pairs, then the 2 iterables items are only compared as added or removed items and NOT modified items.
However, if the distance between 2 iterables is below the cutoff, every single item from each iterable will be compared to every
single item from the other iterable to find the closest "pair" of each item.
.. note::
The process of comparing every item to the other is very expensive so :ref:`cutoff_intersection_for_pairs_label` in combination with :ref:`cutoff_distance_for_pairs_label` is used to give acceptable results with much higher speed.
With a low cutoff_intersection_for_pairs, the 2 iterables below will be considered too
far off from each other to get the individual pairs of items.
So numbers that are only related to each other via their positions in the lists
and not their values are paired together in the results.
>>> t1 = [1.0, 2.0, 3.0, 4.0, 5.0]
>>> t2 = [5.0, 3.01, 1.2, 2.01, 4.0]
>>>
>>> DeepDiff(t1, t2, ignore_order=True, cutoff_intersection_for_pairs=0.1)
{'values_changed': {'root[1]': {'new_value': 3.01, 'old_value': 2.0}, 'root[2]': {'new_value': 1.2, 'old_value': 3.0}}, 'iterable_item_added': {'root[3]': 2.01}, 'iterable_item_removed': {'root[0]': 1.0}}
With the cutoff_intersection_for_pairs of 0.7 (which is the default value),
the 2 iterables will be considered close enough to get pairs of items between the 2.
So 2.0 and 2.01 are paired together for example.
>>> t1 = [1.0, 2.0, 3.0, 4.0, 5.0]
>>> t2 = [5.0, 3.01, 1.2, 2.01, 4.0]
>>>
>>> DeepDiff(t1, t2, ignore_order=True, cutoff_intersection_for_pairs=0.7)
{'values_changed': {'root[2]': {'new_value': 3.01, 'old_value': 3.0}, 'root[0]': {'new_value': 1.2, 'old_value': 1.0}, 'root[1]': {'new_value': 2.01, 'old_value': 2.0}}}
As an example of how much this parameter can affect the results in deeply nested objects, please take a look at :ref:`distance_and_diff_granularity_label`.
.. _iterable_compare_func_label2:
Iterable Compare Func
---------------------
New in DeepDiff 5.5.0
There are times that we want to guide DeepDiff as to what items to compare with other items. In such cases we can pass an `iterable_compare_func` that takes a function pointer to compare two items. The function takes three parameters (x, y, level) and should return `True` if it is a match, `False` if it is not a match or raise `CannotCompare` if it is unable to compare the two.
For example take the following objects:
>>> from deepdiff import DeepDiff
>>> from deepdiff.helper import CannotCompare
>>>
>>> t1 = [
... {
... 'id': 1,
... 'value': [1]
... },
... {
... 'id': 2,
... 'value': [7, 8, 1]
... },
... {
... 'id': 3,
... 'value': [7, 8],
... },
... ]
>>>
>>> t2 = [
... {
... 'id': 2,
... 'value': [7, 8]
... },
... {
... 'id': 3,
... 'value': [7, 8, 1],
... },
... {
... 'id': 1,
... 'value': [1]
... },
... ]
>>>
>>> DeepDiff(t1, t2, ignore_order=True)
{'values_changed': {"root[2]['id']": {'new_value': 2, 'old_value': 3}, "root[1]['id']": {'new_value': 3, 'old_value': 2}}}
Now let's define a compare_func that takes 3 parameters: x, y and level.
>>> def compare_func(x, y, level=None):
... try:
... return x['id'] == y['id']
... except Exception:
... raise CannotCompare() from None
...
>>> DeepDiff(t1, t2, ignore_order=True, iterable_compare_func=compare_func)
{'iterable_item_added': {"root[2]['value'][2]": 1}, 'iterable_item_removed': {"root[1]['value'][2]": 1}}
As you can see the results are different. Now items with the same ids are compared with each other.
.. note::
The level parameter of the iterable_compare_func is only used when ignore_order=False.
Back to :doc:`/index`
qlustered-deepdiff-41c7265/deepdiff/docstrings/ignore_types_or_values.rst 0000664 0000000 0000000 00000043502 15162412645 0027022 0 ustar 00root root 0000000 0000000 :doc:`/index`
Ignore Types Or Values
======================
DeepDiff provides numerous functionalities for the user to be able to define what paths, item types etc. to be included or ignored during the diffing process.
As an example, you may have a type change in your objects:
Type change
>>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, 3]}}
>>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":"world\n\n\nEnd"}}
>>> ddiff = DeepDiff(t1, t2)
>>> pprint (ddiff, indent = 2)
{ 'type_changes': { "root[4]['b']": { 'new_type': <class 'str'>,
'new_value': 'world\n\n\nEnd',
'old_type': <class 'list'>,
'old_value': [1, 2, 3]}}}
And if you don't care about the value of items that have changed type, you can set verbose level to 0
>>> t1 = {1:1, 2:2, 3:3}
>>> t2 = {1:1, 2:"2", 3:3}
>>> pprint(DeepDiff(t1, t2, verbose_level=0), indent=2)
{ 'type_changes': { 'root[2]': { 'new_type': <class 'str'>,
'old_type': <class 'int'>}}}
But what if you did not care about the integer becoming a string with the same value? What if you didn't want 2 -> "2" to be considered a type or value change? Throughout this page you will find different examples of functionalities that can help you achieve what you want.
.. _exclude_types_label:
Exclude Types
-------------
exclude_types: list, default = None
List of object types to exclude from the report.
Exclude certain types from comparison:
>>> l1 = logging.getLogger("test")
>>> l2 = logging.getLogger("test2")
>>> t1 = {"log": l1, 2: 1337}
>>> t2 = {"log": l2, 2: 1337}
>>> print(DeepDiff(t1, t2, exclude_types={logging.Logger}))
{}
.. _ignore_string_type_changes_label:
Ignore String Type Changes
--------------------------
ignore_string_type_changes: Boolean, default = False
Whether to ignore string type changes or not. For example b"Hello" vs. "Hello" are considered the same if ignore_string_type_changes is set to True.
>>> DeepDiff(b'hello', 'hello', ignore_string_type_changes=True)
{}
>>> DeepDiff(b'hello', 'hello')
{'type_changes': {'root': {'old_type': <class 'bytes'>, 'new_type': <class 'str'>, 'old_value': b'hello', 'new_value': 'hello'}}}
.. _ignore_numeric_type_changes_label:
Ignore Numeric Type Changes
---------------------------
ignore_numeric_type_changes: Boolean, default = False
Whether to ignore numeric type changes or not. For example 10 vs. 10.0 are considered the same if ignore_numeric_type_changes is set to True.
Example with Decimal
>>> from decimal import Decimal
>>> from deepdiff import DeepDiff
>>>
>>> t1 = Decimal('10.01')
>>> t2 = 10.01
>>>
>>> DeepDiff(t1, t2)
{'type_changes': {'root': {'old_type': <class 'decimal.Decimal'>, 'new_type': <class 'float'>, 'old_value': Decimal('10.01'), 'new_value': 10.01}}}
>>> DeepDiff(t1, t2, ignore_numeric_type_changes=True)
{}
Note that this parameter only works for comparing numbers with numbers. If you compare a number to a string value of the number, this parameter does not solve your problem:
Example with Fraction
>>> from fractions import Fraction
>>> from deepdiff import DeepDiff
>>>
>>> t1 = Fraction(1, 2)
>>> t2 = 0.5
>>>
>>> DeepDiff(t1, t2)
{'type_changes': {'root': {'old_type': <class 'fractions.Fraction'>, 'new_type': <class 'float'>, 'old_value': Fraction(1, 2), 'new_value': 0.5}}}
>>> DeepDiff(t1, t2, ignore_numeric_type_changes=True)
{}
Example:
>>> t1 = Decimal('10.01')
>>> t2 = "10.01"
>>>
>>> DeepDiff(t1, t2, ignore_numeric_type_changes=True)
{'type_changes': {'root': {'old_type': <class 'decimal.Decimal'>, 'new_type': <class 'str'>, 'old_value': Decimal('10.01'), 'new_value': '10.01'}}}
If you face repeated patterns of comparing numbers to string values of numbers, you will want to preprocess your input to convert the strings into numbers before feeding it into DeepDiff.
.. _ignore_type_in_groups_label:
Ignore Type In Groups
---------------------
ignore_type_in_groups: Tuple or List of Tuples, default = None
Ignore type changes between members of groups of types. For example if you want to ignore type changes between float and decimals etc. Note that this is a more granular feature. While this feature is production ready for strings and numbers, it is still experimental with other custom lists of types. Hence it is recommended to use the shortcuts provided to you which are :ref:`ignore_string_type_changes_label` and :ref:`ignore_numeric_type_changes_label` unless you have a specific need beyond those 2 cases and you need to define your own ignore_type_in_groups.
For example lets say you have specifically str and byte datatypes to be ignored for type changes. Then you have a couple of options:
1. Set ignore_string_type_changes=True.
2. Or set ignore_type_in_groups=[(str, bytes)]. Here you are saying if we detect one type to be str and the other one bytes, do not report them as type change. It is exactly as passing ignore_type_in_groups=[DeepDiff.strings] or ignore_type_in_groups=DeepDiff.strings .
Now what if you want also typeA and typeB to be ignored when comparing against each other?
1. ignore_type_in_groups=[DeepDiff.strings, (typeA, typeB)]
2. or ignore_type_in_groups=[(str, bytes), (typeA, typeB)]
Note: The example below shows you how to use this feature. For enum types, however, you can just use :ref:`use_enum_value_label`
Example: Ignore Enum to string comparison
>>> from deepdiff import DeepDiff
>>> from enum import Enum
>>> class MyEnum1(Enum):
... book = "book"
... cake = "cake"
...
>>> DeepDiff("book", MyEnum1.book)
{'type_changes': {'root': {'old_type': <class 'str'>, 'new_type': <enum 'MyEnum1'>, 'old_value': 'book', 'new_value': <MyEnum1.book: 'book'>}}}
>>> DeepDiff("book", MyEnum1.book, ignore_type_in_groups=[(Enum, str)])
{}
Example: Ignore Type Number - Dictionary that contains float and integer. Note that this is exactly the same as passing ignore_numeric_type_changes=True.
>>> from deepdiff import DeepDiff
>>> from pprint import pprint
>>> t1 = {1: 1, 2: 2.22}
>>> t2 = {1: 1.0, 2: 2.22}
>>> ddiff = DeepDiff(t1, t2)
>>> pprint(ddiff, indent=2)
{ 'type_changes': { 'root[1]': { 'new_type': <class 'float'>,
'new_value': 1.0,
'old_type': <class 'int'>,
'old_value': 1}}}
>>> ddiff = DeepDiff(t1, t2, ignore_type_in_groups=DeepDiff.numbers)
>>> pprint(ddiff, indent=2)
{}
Example: Ignore Type Number - List that contains float and integer. Note that this is exactly the same as passing ignore_numeric_type_changes=True.
>>> from deepdiff import DeepDiff
>>> from pprint import pprint
>>> t1 = [1, 2, 3]
>>> t2 = [1.0, 2.0, 3.0]
>>> ddiff = DeepDiff(t1, t2)
>>> pprint(ddiff, indent=2)
{ 'type_changes': { 'root[0]': { 'new_type': <class 'float'>,
'new_value': 1.0,
'old_type': <class 'int'>,
'old_value': 1},
'root[1]': { 'new_type': <class 'float'>,
'new_value': 2.0,
'old_type': <class 'int'>,
'old_value': 2},
'root[2]': { 'new_type': <class 'float'>,
'new_value': 3.0,
'old_type': <class 'int'>,
'old_value': 3}}}
>>> ddiff = DeepDiff(t1, t2, ignore_type_in_groups=DeepDiff.numbers)
>>> pprint(ddiff, indent=2)
{}
You can pass a list of tuples or list of lists if you have various type groups. When t1 and t2 both fall under one of these type groups, the type change will be ignored. DeepDiff already comes with 2 groups: DeepDiff.strings and DeepDiff.numbers . If you want to pass both:
>>> ignore_type_in_groups = [DeepDiff.strings, DeepDiff.numbers]
ignore_type_in_groups example with custom objects:
>>> class Burrito:
... bread = 'flour'
... def __init__(self):
... self.spicy = True
...
>>>
>>> class Taco:
... bread = 'flour'
... def __init__(self):
... self.spicy = True
...
>>>
>>> burrito = Burrito()
>>> taco = Taco()
>>>
>>> burritos = [burrito]
>>> tacos = [taco]
>>>
>>> DeepDiff(burritos, tacos, ignore_type_in_groups=[(Taco, Burrito)], ignore_order=True)
{}
.. note::
You can pass a list of tuples of types to ignore_type_in_groups or you can put actual values in the tuples and ignore_type_in_groups will extract the type from them. The example below has used (1, 1.0) instead of (int, float).
Ignoring string to None comparison:
>>> from deepdiff import DeepDiff
>>> import datetime
>>>
>>> t1 = [1, 2, 3, 'a', None]
>>> t2 = [1.0, 2.0, 3.3, b'a', 'hello']
>>> DeepDiff(t1, t2, ignore_type_in_groups=[(1, 1.0), (None, str, bytes)])
{'values_changed': {'root[2]': {'new_value': 3.3, 'old_value': 3}}}
>>>
Ignoring datetime to string comparison
>>> now = datetime.datetime(2020, 5, 5)
>>> t1 = [1, 2, 3, 'a', now]
>>> t2 = [1, 2, 3, 'a', 'now']
>>> DeepDiff(t1, t2, ignore_type_in_groups=[(str, bytes, datetime.datetime)])
{'values_changed': {'root[4]': {'new_value': 'now', 'old_value': datetime.datetime(2020, 5, 5, 0, 0)}}}
.. _ignore_type_subclasses_label:
Ignore Type Subclasses
----------------------
ignore_type_subclasses: Boolean, default = False
Use ignore_type_subclasses=True so when ignoring type (class), the subclasses of that class are ignored too.
.. Note::
ignore_type_subclasses was incorrectly doing the reverse of its job up until DeepDiff 6.7.1
Please make sure to flip it in your use cases, when upgrading from older versions to 7.0.0 or above.
>>> from deepdiff import DeepDiff
>>> class ClassA:
... def __init__(self, x, y):
... self.x = x
... self.y = y
...
>>> class ClassB:
... def __init__(self, x):
... self.x = x
...
>>> class ClassC(ClassB):
... pass
...
>>> obj_a = ClassA(1, 2)
>>> obj_c = ClassC(3)
>>>
>>> DeepDiff(obj_a, obj_c, ignore_type_in_groups=[(ClassA, ClassB)], ignore_type_subclasses=True)
{'type_changes': {'root': {'old_type': <class '__main__.ClassA'>, 'new_type': <class '__main__.ClassC'>, 'old_value': <__main__.ClassA object at 0x10076a2e8>, 'new_value': <__main__.ClassC object at 0x10082f630>}}}
>>>
>>> DeepDiff(obj_a, obj_c, ignore_type_in_groups=[(ClassA, ClassB)], ignore_type_subclasses=False)
{'values_changed': {'root.x': {'new_value': 3, 'old_value': 1}}, 'attribute_removed': [root.y]}
.. _ignore_uuid_types_label:
Ignore UUID Types
------------------
ignore_uuid_types: Boolean, default = False
Whether to ignore UUID vs string type differences when comparing. When set to True, comparing a UUID object with its string representation will not report as a type change.
Without ignore_uuid_types:
>>> import uuid
>>> from deepdiff import DeepDiff
>>> test_uuid = uuid.UUID('12345678-1234-5678-1234-567812345678')
>>> uuid_str = '12345678-1234-5678-1234-567812345678'
>>> DeepDiff(test_uuid, uuid_str)
{'type_changes': {'root': {'old_type': <class 'uuid.UUID'>, 'new_type': <class 'str'>, 'old_value': UUID('12345678-1234-5678-1234-567812345678'), 'new_value': '12345678-1234-5678-1234-567812345678'}}}
With ignore_uuid_types=True:
>>> DeepDiff(test_uuid, uuid_str, ignore_uuid_types=True)
{}
This works in both directions:
>>> DeepDiff(uuid_str, test_uuid, ignore_uuid_types=True)
{}
The parameter works with nested structures like dictionaries and lists:
>>> dict1 = {'id': test_uuid, 'name': 'test'}
>>> dict2 = {'id': uuid_str, 'name': 'test'}
>>> DeepDiff(dict1, dict2, ignore_uuid_types=True)
{}
Note that if the UUID and string represent different values, it will still report as a value change:
>>> different_uuid = uuid.UUID('87654321-4321-8765-4321-876543218765')
>>> DeepDiff(different_uuid, uuid_str, ignore_uuid_types=True)
{'values_changed': {'root': {'old_value': UUID('87654321-4321-8765-4321-876543218765'), 'new_value': '12345678-1234-5678-1234-567812345678'}}}
This parameter can be combined with other ignore flags:
>>> data1 = {'id': test_uuid, 'name': 'TEST', 'count': 42}
>>> data2 = {'id': uuid_str, 'name': 'test', 'count': 42.0}
>>> DeepDiff(data1, data2, ignore_uuid_types=True, ignore_string_case=True, ignore_numeric_type_changes=True)
{}
.. _ignore_string_case_label:
Ignore String Case
------------------
ignore_string_case: Boolean, default = False
Whether to ignore case when comparing strings. By setting ignore_string_case=True, strings will be compared case-insensitively.
>>> DeepDiff(t1='Hello', t2='heLLO')
{'values_changed': {'root': {'new_value': 'heLLO', 'old_value': 'Hello'}}}
>>> DeepDiff(t1='Hello', t2='heLLO', ignore_string_case=True)
{}
Ignore Nan Inequality
---------------------
ignore_nan_inequality: Boolean, default = False
Read more at :ref:`ignore_nan_inequality_label`
Whether to ignore float('nan') inequality in Python.
.. _ignore_private_variables_label:
Ignore Private Variables
------------------------
ignore_private_variables: Boolean, default = True
Whether to exclude the private variables in the calculations or not. It only affects variables that start with double underscores (__).
.. _exclude_obj_callback_label:
Exclude Obj Callback
--------------------
exclude_obj_callback: function, default = None
A function that takes the object and its path and returns a Boolean. If True is returned, the object is excluded from the results, otherwise it is included.
This is to give the user a higher level of control than one can achieve via exclude_paths, exclude_regex_paths or other means.
>>> def exclude_obj_callback(obj, path):
... return True if "skip" in path or isinstance(obj, int) else False
...
>>> t1 = {"x": 10, "y": "b", "z": "c", "skip_1": 0}
>>> t2 = {"x": 12, "y": "b", "z": "c", "skip_2": 0}
>>> DeepDiff(t1, t2, exclude_obj_callback=exclude_obj_callback)
{}
.. _exclude_obj_callback_strict_label:
Exclude Obj Callback Strict
---------------------------
exclude_obj_callback_strict: function, default = None
A function that works the same way as exclude_obj_callback, but excludes elements from the result only if the function returns True for both elements
>>> def exclude_obj_callback_strict(obj, path):
... return True if isinstance(obj, int) and obj > 10 else False
...
>>> t1 = {"x": 10, "y": "b", "z": "c"}
>>> t2 = {"x": 12, "y": "b", "z": "c"}
>>> DeepDiff(t1, t2, exclude_obj_callback=exclude_obj_callback_strict)
{}
>>> DeepDiff(t1, t2, exclude_obj_callback_strict=exclude_obj_callback_strict)
{'values_changed': {"root['x']": {'new_value': 12, 'old_value': 10}}}
.. _include_obj_callback_label:
Include Obj Callback
--------------------
include_obj_callback: function, default = None
A function that takes the object and its path and returns a Boolean. If True is returned, the object is included in the results, otherwise it is excluded.
This is to give the user a higher level of control than one can achieve via include_paths.
>>> def include_obj_callback(obj, path):
... return True if "include" in path or isinstance(obj, int) else False
...
>>> t1 = {"x": 10, "y": "b", "z": "c", "include_me": "a"}
>>> t2 = {"x": 10, "y": "b", "z": "c", "include_me": "b"}
>>> DeepDiff(t1, t2, include_obj_callback=include_obj_callback)
{'values_changed': {"root['include_me']": {'new_value': 'b', 'old_value': 'a'}}}
.. _include_obj_callback_strict_label:
Include Obj Callback Strict
---------------------------
include_obj_callback_strict: function, default = None
A function that works the same way as include_obj_callback, but includes elements in the result only if the function returns True for both elements.
>>> def include_obj_callback_strict(obj, path):
... return True if isinstance(obj, int) and obj > 10 else False
...
>>> t1 = {"x": 10, "y": "b", "z": "c"}
>>> t2 = {"x": 12, "y": "b", "z": "c"}
>>> DeepDiff(t1, t2, include_obj_callback=include_obj_callback_strict)
{'values_changed': {"root['x']": {'new_value': 12, 'old_value': 10}}}
>>> DeepDiff(t1, t2, include_obj_callback_strict=include_obj_callback_strict)
{}
.. _truncate_datetime_label:
Truncate Datetime
-----------------
truncate_datetime: string, default = None
truncate_datetime can take one of the values 'second', 'minute', 'hour' or 'day'. Datetime objects are truncated to that unit before they are hashed.
>>> import datetime
>>> from deepdiff import DeepDiff
>>> d1 = {'a': datetime.datetime(2020, 5, 17, 22, 15, 34, 913070)}
>>> d2 = {'a': datetime.datetime(2020, 5, 17, 22, 15, 39, 296583)}
>>> DeepDiff(d1, d2, truncate_datetime='minute')
{}
.. _use_enum_value_label:
Use Enum Value
--------------
use_enum_value: Boolean, default=False
Makes it so when diffing enum, we use the enum's value. It makes it so comparing an enum to a string or any other value is not reported as a type change.
>>> from enum import Enum
>>> from deepdiff import DeepDiff
>>>
>>> class MyEnum2(str, Enum):
... book = "book"
... cake = "cake"
...
>>> DeepDiff("book", MyEnum2.book)
{'type_changes': {'root': {'old_type': <class 'str'>, 'new_type': <enum 'MyEnum2'>, 'old_value': 'book', 'new_value': <MyEnum2.book: 'book'>}}}
>>> DeepDiff("book", MyEnum2.book, use_enum_value=True)
{}
Back to :doc:`/index`
qlustered-deepdiff-41c7265/deepdiff/docstrings/index.rst 0000664 0000000 0000000 00000012065 15162412645 0023343 0 ustar 00root root 0000000 0000000 .. DeepDiff documentation master file, created by
sphinx-quickstart on Mon Jul 20 06:06:44 2015.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
DeepDiff 9.0.0 documentation!
=============================
.. |qluster_link| raw:: html
Qluster
.. admonition:: DeepDiff is now part of |qluster_link|.
*If you're building workflows around data validation and correction,* `Qluster `__ *gives your team a structured way to manage rules, review failures, approve fixes, and reuse decisions—without building the entire system from scratch.*
*******
Modules
*******
The DeepDiff library includes the following modules:
- **DeepDiff** For Deep Difference of 2 objects. :doc:`/diff`
It returns the deep difference of python objects. It can also be used to take the distance between objects. :doc:`/deep_distance`
- **DeepSearch** Search for objects within other objects. :doc:`/dsearch`
- **DeepHash** Hash any object based on their content even if they are not "hashable" in Python's eyes. :doc:`/deephash`
- **Delta** Delta of objects that can be applied to other objects. Imagine git commits but for structured data. :doc:`/delta`
- **Extract** For extracting a path from an object :doc:`/extract`
- **Commandline** Most of the above functionality is also available via the commandline module :doc:`/commandline`
***********
What Is New
***********
DeepDiff 9-0-0
--------------
- migration note:
- `to_dict()` and `to_json()` now accept a `verbose_level` parameter and always return a usable text-view dict. When the original view is `'tree'`, they default to `verbose_level=2` for full detail. The old `view_override` parameter is removed. To get the previous results, you will need to pass the explicit verbose_level to `to_json` and `to_dict` if you are using the tree view.
- Dropping support for Python 3.9
- Support for python 3.14
- Added support for callable ``group_by`` thanks to `echan5 `__
- Added ``FlatDeltaDict`` TypedDict for ``to_flat_dicts`` return type
- Fixed colored view display when all list items are removed thanks to `yannrouillard `__
- Fixed ``hasattr()`` swallowing ``AttributeError`` in ``__slots__`` handling for objects with ``__getattr__`` thanks to `tpvasconcelos `__
- Fixed ``ignore_order=True`` missing int-vs-float type changes
- Always use t1 path for reporting thanks to `devin13cox `__
- Fixed ``_convert_oversized_ints`` failing on NamedTuples
- Fixed orjson ``TypeError`` for integers exceeding 64-bit range
- Fixed parameter bug in ``to_flat_dicts`` where ``include_action_in_path`` and ``report_type_changes`` were not being passed through
- Fixed ``ignore_keys`` issue in ``detailed__dict__`` thanks to `vitalis89 `__
- Fixed logarithmic similarity type hint thanks to `ljames8 `__
- Added ``Fraction`` numeric support thanks to `akshat62 `__
*********
Tutorials
*********
Tutorials can be found on `Zepworks blog `_
************
Installation
************
Install from PyPi::
pip install deepdiff
If you want to use DeepDiff from commandline::
pip install "deepdiff[cli]"
If you want to improve the performance of DeepDiff with certain processes such as json serialization::
pip install "deepdiff[optimize]"
Read about DeepDiff optimizations at :ref:`optimizations_label`
Importing
---------
.. code:: python
>>> from deepdiff import DeepDiff # For Deep Difference of 2 objects
>>> from deepdiff import grep, DeepSearch # For finding if item exists in an object
>>> from deepdiff import DeepHash # For hashing objects based on their contents
>>> from deepdiff import Delta # For creating delta of objects that can be applied later to other objects.
>>> from deepdiff import extract # For extracting a path from an object
.. note::
if you want to use DeepDiff via commandline, make sure to run::
pip install "deepdiff[cli]"
Then you can access the commands via:
- DeepDiff
.. code:: bash
$ deep diff --help
- Delta
.. code:: bash
$ deep patch --help
- grep
.. code:: bash
$ deep grep --help
- extract
.. code:: bash
$ deep extract --help
Supported data types
--------------------
int, string, unicode, dictionary, list, tuple, set, frozenset, OrderedDict, NamedTuple, Numpy, custom objects and more!
References
==========
.. toctree::
:maxdepth: 4
diff
dsearch
deephash
delta
extract
colored_view
commandline
changelog
authors
faq
support
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
qlustered-deepdiff-41c7265/deepdiff/docstrings/numbers.rst 0000664 0000000 0000000 00000022215 15162412645 0023705 0 ustar 00root root 0000000 0000000 :doc:`/index`
Numbers
=======
When dealing with numbers, DeepDiff provides the following functionalities:
.. _significant_digits_label:
Significant Digits
------------------
significant_digits : int >= 0, default=None
significant_digits defines the number of digits AFTER the decimal point to be used in the comparison. However you can override that by setting the number_format_notation="e" which will make it mean the digits in scientific notation.
.. note::
Setting significant_digits will affect ANY number comparison.
If ignore_numeric_type_changes is set to True and you have left significant_digits to the default of None, it gets automatically set to 55. The reason is that normally when numbers from 2 different types are compared, instead of comparing the values, we only report the type change. However when ignore_numeric_type_changes=True, in order to compare numbers from different types to each other, we need to convert them all into strings. The significant_digits will be used to make sure we accurately convert all the numbers into strings in order to report the changes between them.
.. note::
significant_digits by default uses "{:.Xf}".format(Your Number) behind the scene to compare numbers where X=significant_digits when the number_format_notation is left as the default of "f" meaning fixed point.
As a side note, please pay attention that adding digits to your floating point can result in small differences in the results. For example:
"{:.3f}".format(1.1135) = 1.113, but "{:.3f}".format(1.11351) = 1.114
For Decimals, Python's format rounds 2.5 to 2 and 3.5 to 4 (to the closest even number)
.. note::
To override what significant digits mean and switch it to scientific notation, use number_format_notation="e"
Behind the scene that switches DeepDiff to use "{:.Xe}".format(Your Number) where X=significant_digits.
**Examples:**
Approximate decimals comparison (Significant digits after the point):
>>> t1 = Decimal('1.52')
>>> t2 = Decimal('1.57')
>>> DeepDiff(t1, t2, significant_digits=0)
{}
>>> DeepDiff(t1, t2, significant_digits=1)
{'values_changed': {'root': {'new_value': Decimal('1.57'), 'old_value': Decimal('1.52')}}}
Approximate fractions comparison (Significant digits after the point):
>>> from fractions import Fraction
>>> t1 = Fraction(22, 7) # 3.142857...
>>> t2 = Fraction(355, 113) # 3.141592...
>>> DeepDiff(t1, t2, significant_digits=2)
{}
>>> DeepDiff(t1, t2, significant_digits=3)
{'values_changed': {'root': {'new_value': Fraction(355, 113), 'old_value': Fraction(22, 7)}}}
Approximate float comparison (Significant digits after the point):
>>> t1 = [ 1.1129, 1.3359 ]
>>> t2 = [ 1.113, 1.3362 ]
>>> pprint(DeepDiff(t1, t2, significant_digits=3))
{}
>>> pprint(DeepDiff(t1, t2))
{'values_changed': {'root[0]': {'new_value': 1.113, 'old_value': 1.1129},
'root[1]': {'new_value': 1.3362, 'old_value': 1.3359}}}
>>> pprint(DeepDiff(1.23*10**20, 1.24*10**20, significant_digits=1))
{'values_changed': {'root': {'new_value': 1.24e+20, 'old_value': 1.23e+20}}}
.. _number_format_notation_label:
Number Format Notation
----------------------
number_format_notation : string, default="f"
number_format_notation is what defines the meaning of significant digits. The default value of "f" means the digits AFTER the decimal point. "f" stands for fixed point. The other option is "e" which stands for exponent notation or scientific notation.
**Examples:**
Approximate number comparison (significant_digits after the decimal point in scientific notation)
>>> DeepDiff(1024, 1020, significant_digits=2, number_format_notation="f") # default is "f"
{'values_changed': {'root': {'new_value': 1020, 'old_value': 1024}}}
>>> DeepDiff(1024, 1020, significant_digits=2, number_format_notation="e")
{}
.. _number_to_string_func_label:
Number To String Function
-------------------------
number_to_string_func : function, default=None
In many cases DeepDiff converts numbers to strings in order to compare them. For example when ignore_order=True, when significant digits parameter is defined or when the ignore_numeric_type_changes=True.
In its simplest form, the number_to_string_func is "{:.Xf}".format(Your Number) where X is the significant digits and the number_format_notation is left as the default of "f" meaning fixed point.
The number_to_string_func parameter gives the user the full control into overriding how numbers are converted to strings for comparison. The default function is defined in https://github.com/seperman/deepdiff/blob/master/deepdiff/helper.py and is called number_to_string. You can define your own custom function instead of the default one in the helper module.
Defining your own number_to_string_func
Let's say you want the number comparison to happen only for numbers above 100 for some reason.
>>> from deepdiff import DeepDiff
>>> from deepdiff.helper import number_to_string
>>> def custom_number_to_string(number, *args, **kwargs):
... number = 100 if number < 100 else number
... return number_to_string(number, *args, **kwargs)
...
>>> t1 = [10, 12, 100000]
>>> t2 = [50, 63, 100021]
>>> DeepDiff(t1, t2, significant_digits=3, number_format_notation="e")
{'values_changed': {'root[0]': {'new_value': 50, 'old_value': 10}, 'root[1]': {'new_value': 63, 'old_value': 12}}}
>>>
>>> DeepDiff(t1, t2, significant_digits=3, number_format_notation="e",
... number_to_string_func=custom_number_to_string)
{}
Ignore Numeric Type Changes
---------------------------
ignore_numeric_type_changes: Boolean, default = False
read more at :ref:`ignore_numeric_type_changes_label`
.. _ignore_nan_inequality_label:
Ignore Nan Inequality
---------------------
ignore_nan_inequality: Boolean, default = False
Whether to ignore float('nan') inequality in Python. Note that this is a cPython "feature". Some versions of Pypy3 have nan==nan where in cPython nan!=nan
>>> float('nan') == float('nan')
False
>>> DeepDiff(float('nan'), float('nan'))
{'values_changed': {'root': {'new_value': nan, 'old_value': nan}}}
>>> DeepDiff(float('nan'), float('nan'), ignore_nan_inequality=True)
{}
.. _math_epsilon_label:
Math Epsilon
------------
math_epsilon: Decimal, default = None
math_epsilon uses Python's built-in math.isclose(). It defines a tolerance value which is passed to math.isclose(). Any numbers that are within the tolerance will not report as being different. Any numbers outside of that tolerance will show up as different.
For example, for some sensor data, derived and computed values must lie in a certain range. It does not matter that they are off by e.g. 1e-5.
To check for that, the math core module provides the valuable isclose() function. It evaluates how close two numbers are to each other, with reference to an epsilon (abs_tol). This is superior to the format function, as it evaluates the mathematical representation and not the string representation.
Example with Decimal:
>>> from decimal import Decimal
>>> d1 = {"a": Decimal("7.175")}
>>> d2 = {"a": Decimal("7.174")}
>>> DeepDiff(d1, d2, math_epsilon=0.01)
{}
Example with Fraction:
>>> from fractions import Fraction
>>> d1 = {"a": Fraction(7175, 1000)}
>>> d2 = {"a": Fraction(7174, 1000)}
>>> DeepDiff(d1, d2, math_epsilon=0.01)
{}
.. note::
math_epsilon cannot currently handle the hashing of values, which is done when :ref:`ignore_order_label` is True.
.. _use_log_scale_label:
Use Log Scale
-------------
use_log_scale: Boolean, default=False
use_log_scale along with :ref:`log_scale_similarity_threshold_label` can be used to ignore small changes in numbers by comparing their differences in logarithmic space. This is different than ignoring the difference based on significant digits.
>>> from deepdiff import DeepDiff
>>> t1 = {'foo': 110, 'bar': 306}
>>> t2 = {'foo': 140, 'bar': 298}
>>>
>>> DeepDiff(t1, t2)
{'values_changed': {"root['foo']": {'new_value': 140, 'old_value': 110}, "root['bar']": {'new_value': 298, 'old_value': 306}}}
>>> DeepDiff(t1, t2, use_log_scale=True, log_scale_similarity_threshold=0.01)
{'values_changed': {"root['foo']": {'new_value': 140, 'old_value': 110}, "root['bar']": {'new_value': 298, 'old_value': 306}}}
>>> DeepDiff(t1, t2, use_log_scale=True, log_scale_similarity_threshold=0.1)
{'values_changed': {"root['foo']": {'new_value': 140, 'old_value': 110}}}
>>> DeepDiff(t1, t2, use_log_scale=True, log_scale_similarity_threshold=0.3)
{}
.. _log_scale_similarity_threshold_label:
Log Scale Similarity Threshold
------------------------------
log_scale_similarity_threshold: float, default = 0.1
:ref:`use_log_scale_label` along with log_scale_similarity_threshold can be used to ignore small changes in numbers by comparing their differences in logarithmic space. This is different than ignoring the difference based on significant digits. See the example above.
Performance Improvement of Numbers diffing
------------------------------------------
Take a look at :ref:`diffing_numbers_optimizations_label`
Back to :doc:`/index`
qlustered-deepdiff-41c7265/deepdiff/docstrings/optimizations.rst 0000664 0000000 0000000 00000033634 15162412645 0025152 0 ustar 00root root 0000000 0000000 :doc:`/index`
.. _optimizations_label:
Optimizations
=============
If you are dealing with large nested objects and ignore_order=True, chances are DeepDiff takes a while to calculate the diff. Here are some tips that may help you with optimizations and progress report.
Optimized Libraries
-------------------
If you dump DeepDiff or Delta objects as json, you can improve the performance by installing orjson.
DeepDiff will automatically use orjson instead of Python's built-in json library to do json serialization.
pip install "deepdiff[optimize]"
Max Passes
----------
:ref:`max_passes_label` comes with the default of 10000000.
If you don't need to exactly pinpoint the difference and you can get away with getting a less granular report, you can reduce the number of passes. It is recommended to get a diff of your objects with the default max_passes and take a look at the stats by running :ref:`get_stats_label` before deciding to reduce this number. In many cases reducing this number does not yield faster results.
A new pass is started each time 2 iterables are compared in a way that every single item that is different from the first one is compared to every single item that is different in the second iterable.
.. _max_diffs_label:
Max Diffs
---------
max_diffs: Integer, default = None
max_diffs defines the maximum number of diffs to run on objects to pinpoint what exactly is different. This is only used when ignore_order=True. Every time 2 individual items are compared a diff is counted. The default value of None means there is no limit in the number of diffs that will take place. Any positive integer can make DeepDiff stop doing the calculations upon reaching that max_diffs count.
You can run diffs and then :ref:`get_stats_label` to see how many diffs and passes have happened.
>>> from deepdiff import DeepDiff
>>> diff=DeepDiff(1, 2)
>>> diff
{'values_changed': {'root': {'new_value': 2, 'old_value': 1}}}
>>> diff.get_stats()
{'PASSES COUNT': 0, 'DIFF COUNT': 1, 'DISTANCE CACHE HIT COUNT': 0, 'MAX PASS LIMIT REACHED': False, 'MAX DIFF LIMIT REACHED': False}
>>> diff=DeepDiff([[1,2]], [[2,3,1]])
>>> diff.get_stats()
{'PASSES COUNT': 0, 'DIFF COUNT': 8, 'DISTANCE CACHE HIT COUNT': 0, 'MAX PASS LIMIT REACHED': False, 'MAX DIFF LIMIT REACHED': False}
>>> diff=DeepDiff([[1,2]], [[2,3,1]], ignore_order=True)
>>> diff.get_stats()
{'PASSES COUNT': 3, 'DIFF COUNT': 6, 'DISTANCE CACHE HIT COUNT': 0, 'MAX PASS LIMIT REACHED': False, 'MAX DIFF LIMIT REACHED': False}
.. note::
Compare :ref:`max_diffs_label` with :ref:`max_passes_label`
.. _cache_size_label:
Cache Size
----------
cache_size : int >= 0, default=0
Cache size to be used to improve the performance. A cache size of zero means it is disabled.
Using the cache_size can dramatically improve the diff performance especially for the nested objects at the cost of more memory usage. However if the cache hit rate is very low, having a cache actually reduces the performance.
Cache Examples
--------------
For example lets take a look at the performance of the benchmark_deeply_nested_a in the `DeepDiff-Benchmark repo `_ .
No Cache
^^^^^^^^
With the no cache option we have the following stats:
{'PASSES COUNT': 11234, 'DIFF COUNT': 107060, 'DISTANCE CACHE HIT COUNT': 0, 'MAX PASS LIMIT REACHED': False, 'MAX DIFF LIMIT REACHED': False, 'DURATION SEC': 10}
Yes it has taken 10 seconds to do the diff!
.. figure:: _static/benchmark_deeply_nested_a__3.8__ignore_order=True__cache_size=0__cache_tuning_sample_size=0__cutoff_intersection_for_pairs=1.png
:alt: cache_size=0
cache_size=0
Cache Size 500
^^^^^^^^^^^^^^
With a cache size of 500, we are doing the same diff in 2.5 seconds! And the memory usage has not changed. It is still hovering around 100Mb.
{'PASSES COUNT': 3960, 'DIFF COUNT': 19469, 'DISTANCE CACHE HIT COUNT': 11847, 'MAX PASS LIMIT REACHED': False, 'MAX DIFF LIMIT REACHED': False, 'DURATION SEC': 2}
As you can see the number of passes and diff counts have gone down and instead the distance cache hit count has gone up.
.. figure:: _static/benchmark_deeply_nested_a__3.8__ignore_order=True__cache_size=500__cache_tuning_sample_size=0__cutoff_intersection_for_pairs=1.png
:alt: cache_size=500
cache_size=500
Cache Size 500 and Cache Tuning Sample Size 500
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
With a cache size of 500, we set the :ref:`cache_tuning_sample_size_label` to be 500 too. And we have a slight improvement. We are doing the same diff in 2 seconds now. And the memory usage has not changed. It is still hovering around 100Mb.
{'PASSES COUNT': 3960, 'DIFF COUNT': 19469, 'DISTANCE CACHE HIT COUNT': 11847, 'MAX PASS LIMIT REACHED': False, 'MAX DIFF LIMIT REACHED': False, 'DURATION SEC': 2}
As you can see in this case none of the stats have changed compared to the previous stats.
.. figure:: _static/benchmark_deeply_nested_a__3.8__ignore_order=True__cache_size=500__cache_tuning_sample_size=500__cutoff_intersection_for_pairs=1.png
:alt: cache_size=500 cache_tuning_sample_size=500
cache_size=500 cache_tuning_sample_size=500
Cache Size of 5000
^^^^^^^^^^^^^^^^^^
Let's pay a little attention to our stats. Particularly to 'DISTANCE CACHE HIT COUNT': 11847 and the fact that the memory usage has not changed so far. What if we bump the cache_size to 5000 and disable cache_tuning_sample_size?
{'PASSES COUNT': 1486, 'DIFF COUNT': 6637, 'DISTANCE CACHE HIT COUNT': 3440, 'MAX PASS LIMIT REACHED': False, 'MAX DIFF LIMIT REACHED': False, 'DURATION SEC': 0}
We get the result calculated below 1 second! And the memory usage is only slightly above 100Mb.
.. figure:: _static/benchmark_deeply_nested_a__3.8__ignore_order=True__cache_size=5000__cache_tuning_sample_size=0__cutoff_intersection_for_pairs=1.png
:alt: cache_size=5000
cache_size=5000
.. _cache_tuning_sample_size_label:
Cache Tuning Sample Size
------------------------
cache_tuning_sample_size : int >= 0, default = 0
cache_tuning_sample_size is an experimental feature. It works hand in hand with the :ref:`cache_size_label`. When cache_tuning_sample_size is set to anything above zero, it will sample the cache usage with the passed sample size and decide whether to use the cache or not. And will turn it back on occasionally during the diffing process. This option can be useful if you are not sure if you need any cache or not. However you will gain much better performance with keeping this parameter zero and running your diff with different cache sizes and benchmarking to find the optimal cache size.
.. note::
A good start with cache_tuning_sample_size is to set it to the size of your cache.
.. _diffing_numbers_optimizations_label:
Optimizations for Diffing Numbers
---------------------------------
If you are diffing lists of python numbers, you could get performance improvement just by installing numpy. DeepDiff will use Numpy to improve the performance behind the scene.
For example lets take a look at the performance of the benchmark_array_no_numpy vs. benchmark_numpy_array in the `DeepDiff-Benchmark repo `_.
In this specific test, we have 2 lists of numbers that have nothing in common: `mat1 `_ and `mat2 `_ .
No Cache and No Numpy
^^^^^^^^^^^^^^^^^^^^^
With the no cache option and no Numpy installed we have the following stats:
{'PASSES COUNT': 1, 'DIFF COUNT': 439944, 'DISTANCE CACHE HIT COUNT': 0, 'MAX PASS LIMIT REACHED': False, 'MAX DIFF LIMIT REACHED': False, 'DURATION SEC': 30}
Yes it has taken 30 seconds to do the diff!
.. figure:: _static/benchmark_array_no_numpy__3.8__ignore_order=True__cache_size=0__cache_tuning_sample_size=0__cutoff_intersection_for_pairs=1.png
:alt: cache_size=0 and no Numpy
cache_size=0 and no Numpy
Cache Size 10000 and No Numpy
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
What if we increase the cache size to 10000?
{'PASSES COUNT': 1, 'DIFF COUNT': 439944, 'DISTANCE CACHE HIT COUNT': 0, 'MAX PASS LIMIT REACHED': False, 'MAX DIFF LIMIT REACHED': False, 'DURATION SEC': 35}
Not only it didn't help, it increased the diff time by 15%!!
Worse, if you look at the stats you see that the cache hit count is zero. This has happened since the 2 lists of items have nothing in common and hence caching the results does not improve the performance.
.. figure:: _static/benchmark_array_no_numpy__3.8__ignore_order=True__cache_size=10000__cache_tuning_sample_size=0__cutoff_intersection_for_pairs=1.png
:alt: cache_size=10000 and no Numpy
cache_size=10000 and no Numpy
No Cache and Numpy
^^^^^^^^^^^^^^^^^^
Let's install Numpy now. Set the cache_size=0 and run the diff again.
Yay, the same diff is done in 5 seconds!
{'PASSES COUNT': 1, 'DIFF COUNT': 1348, 'DISTANCE CACHE HIT COUNT': 0, 'MAX PASS LIMIT REACHED': False, 'MAX DIFF LIMIT REACHED': False, 'DURATION SEC': 5}
As you can see the memory usage has gone up from around 500Mb to around 630Mb.
.. figure:: _static/benchmark_numpy_array__3.8__ignore_order=True__cache_size=0__cache_tuning_sample_size=0__cutoff_intersection_for_pairs=1.png
:alt: Numpy but no cache
Numpy but no cache
Pypy
----
If you are diffing big blobs of data that do not mainly include numbers, you may gain some performance improvement by running DeepDiff on Pypy3 instead of cPython.
For example lets take a look at the performance of the benchmark_big_jsons in the `DeepDiff-Benchmark repo `_.
First we will run it on cPython 3.8:
It takes around 17.5 seconds and 40Mb of memory:
.. figure:: _static/benchmark_big_jsons__3.8__ignore_order=True__cache_size=0__cache_tuning_sample_size=0__max_diffs=300000__max_passes=40000__cutoff_intersection_for_pairs=1.png
:alt: Nested blob of text diffed in Python3.8
Nested blob of text diffed in Python3.8
And then we run it in Pypy3.6-7.3.0. It takes 12 seconds now but around 110Mb of memory.
.. figure:: _static/benchmark_big_jsons__pypy3.6__ignore_order=True__cache_size=0__cache_tuning_sample_size=0__max_diffs=300000__max_passes=40000__cutoff_intersection_for_pairs=1.png
:alt: Nested blob of text diffed in Pypy3.6-7.3.0
Nested blob of text diffed in Pypy3.6-7.3.0
.. note::
Note that if you are diffing numbers, and have Numpy installed as recommended, cPython will have a better performance than Pypy. But if you are diffing blobs of mixed strings and some numbers, Pypy will have a better CPU performance and worse memory usage.
Cutoff Intersection For Pairs
-----------------------------
:ref:`cutoff_intersection_for_pairs_label` which is only used when ignore_order=True can have a huge effect on the granularity of the results and the performance. A value of zero essentially stops DeepDiff from doing passes while a value of 1 forces DeepDiff to do passes on iterables even when they are very different. Running passes is an expensive operation.
As an example of how much this parameter can affect the results in deeply nested objects, please take a look at :ref:`distance_and_diff_granularity_label`.
.. _cache_purge_level:
Cache Purge Level
-----------------
cache_purge_level: int, 0, 1, or 2. default=1
cache_purge_level defines what objects in DeepDiff should be deleted to free the memory once the diff object is calculated. If this value is set to zero, most of the functionality of the diff object is removed and the most memory is released. A value of 1 preserves all the functionalities of the diff object. A value of 2 also preserves the cache and hashes that were calculated during the diff calculations. In most cases the user does not need those objects to remain in the diff unless for investigation purposes.
.. _zip_ordered_iterables_label:
Zip Ordered Iterables
---------------------
zip_ordered_iterables: Boolean, default = False
When comparing ordered iterables such as lists, DeepDiff tries to find the smallest difference between the two iterables to report. That means that items in the two lists are not paired individually in the order of appearance in the iterables. Sometimes, that is not the desired behavior. Set this flag to True to make DeepDiff pair and compare the items in the iterables in the order they appear.
>>> from pprint import pprint
>>> from deepdiff import DeepDiff
>>> t1 = ["a", "b", "d", "e"]
>>> t2 = ["a", "b", "c", "d", "e"]
>>> DeepDiff(t1, t2)
{'iterable_item_added': {'root[2]': 'c'}}
When this flag is set to True and ignore_order=False, diffing will be faster.
>>> diff=DeepDiff(t1, t2, zip_ordered_iterables=True)
>>> pprint(diff)
{'iterable_item_added': {'root[4]': 'e'},
'values_changed': {'root[2]': {'new_value': 'c', 'old_value': 'd'},
'root[3]': {'new_value': 'd', 'old_value': 'e'}}}
.. _threshold_to_diff_deeper_label:
Threshold To Diff Deeper
------------------------
threshold_to_diff_deeper: float, default = 0.33
threshold_to_diff_deeper is a number between 0 and 1. When comparing dictionaries that have a small intersection of keys, we will report the dictionary as a new_value instead of reporting individual keys changed. If you set it to zero, you get the same results as DeepDiff 7.0.1 and earlier, which means this feature is disabled. The new default is 0.33 which means if less than one third of keys between dictionaries intersect, report it as a new object.
>>> from deepdiff import DeepDiff
>>> t1 = {"veggie": "carrots"}
>>> t2 = {"meat": "carrots"}
>>>
>>> DeepDiff(t1, t2, threshold_to_diff_deeper=0)
{'dictionary_item_added': ["root['meat']"], 'dictionary_item_removed': ["root['veggie']"]}
>>> DeepDiff(t1, t2, threshold_to_diff_deeper=0.33)
{'values_changed': {'root': {'new_value': {'meat': 'carrots'}, 'old_value': {'veggie': 'carrots'}}}}
Back to :doc:`/index`
qlustered-deepdiff-41c7265/deepdiff/docstrings/other.rst 0000664 0000000 0000000 00000006217 15162412645 0023357 0 ustar 00root root 0000000 0000000 :doc:`/index`
Other Parameters
================
.. _encodings_label:
Encodings
---------
encodings: List, default = None
Character encodings to iterate through when we convert bytes into strings. You may want to pass an explicit list of encodings in your objects if you start getting UnicodeDecodeError from DeepHash. Also check out :ref:`ignore_encoding_errors_label` if you can get away with ignoring these errors and don't want to bother with an explicit list of encodings but it will come at the price of slightly less accuracy of the final results. Example: encodings=["utf-8", "latin-1"]
The reason the decoding of bytes to string is needed is that when `ignore_order = True` we calculate the hash of the objects in order to facilitate in diffing them. In order to calculate the hash, we serialize all objects into strings. During the serialization we may encounter issues with character encodings.
**Examples:**
Comparing bytes that have non UTF-8 encoding:
>>> from deepdiff import DeepDiff
>>> item = b"\xbc cup of flour"
>>> DeepDiff([b'foo'], [item], ignore_order=True)
Traceback (most recent call last):
raise UnicodeDecodeError(
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xbc in position 0: Can not produce a hash for root: invalid start byte in 'p of flo...'. Please either pass ignore_encoding_errors=True or pass the encoding via encodings=['utf-8', '...'].
Let's try to pass both 'utf-8' and 'latin-1' as encodings to be tried:
>>> DeepDiff([b'foo'], [item], encodings=['utf-8', 'latin-1'], ignore_order=True)
{'values_changed': {'root[0]': {'new_value': b'\xbc cup of flour', 'old_value': b'foo'}}}
.. _ignore_encoding_errors_label:
Ignore Encoding Errors
----------------------
ignore_encoding_errors: Boolean, default = False
If you want to get away with UnicodeDecodeError without passing explicit character encodings, set this option to True. If you want to make sure the encoding is done properly, keep this as False and instead pass an explicit list of character encodings to be considered via the encodings parameter.
We can generally get the same results as the above example if we just pass `ignore_encoding_errors=True`. However it comes at the cost of less accuracy of the results.
>>> DeepDiff([b'foo'], [b"\xbc cup of flour"], ignore_encoding_errors=True, ignore_order=True)
{'values_changed': {'root[0]': {'new_value': b'\xbc cup of flour', 'old_value': b'foo'}}}
For example if we replace `foo` with ` cup of flour`, we have bytes that are only different in the problematic character. Ignoring that character means DeepDiff will consider these 2 strings to be equal since their hash becomes the same. Note that we only hash items when `ignore_order=True`.
>>> DeepDiff([b" cup of flour"], [b"\xbc cup of flour"], ignore_encoding_errors=True, ignore_order=True)
{}
But if we had passed the proper encoding, it would have detected that these 2 bytes are different:
>>> DeepDiff([b" cup of flour"], [b"\xbc cup of flour"], encodings=['latin-1'], ignore_order=True)
{'values_changed': {'root[0]': {'new_value': b'\xbc cup of flour', 'old_value': b' cup of flour'}}}
Back to :doc:`/index`
qlustered-deepdiff-41c7265/deepdiff/docstrings/search_doc.rst 0000664 0000000 0000000 00000005200 15162412645 0024317 0 ustar 00root root 0000000 0000000 :orphan:
grep is a more user friendly interface for DeepSearch. It takes exactly the same arguments as DeepSearch except that you pipe the object into it instead of passing it as a parameter.
It works just like grep in linux shell!
**Parameters**
item : The item to search for
verbose_level : int >= 0, default = 1.
Verbose level one shows the paths of found items.
Verbose level 2 shows the path and value of the found items.
exclude_paths: list, default = None.
List of paths to exclude from the report.
exclude_types: list, default = None.
List of object types to exclude from the report.
case_sensitive: Boolean, default = False
match_string: Boolean, default = False
If True, the value of the object or its children have to exactly match the item.
If False, the value of the item can be a part of the value of the object or its children
use_regexp: Boolean, default = False
strict_checking: Boolean, default = True
If True, it will check the type of the object to match, so when searching for '1234',
it will NOT match the int 1234. Currently this only affects the numeric values searching.
**Examples**
Importing
>>> from deepdiff import grep
>>> from pprint import pprint
Search in list for string
>>> obj = ["long somewhere", "string", 0, "somewhere great!"]
>>> item = "somewhere"
>>> ds = obj | grep(item)
>>> print(ds)
{'matched_values': ['root[0]', 'root[3]']}
Search in nested data for string
>>> obj = ["something somewhere", {"long": "somewhere", "string": 2, 0: 0, "somewhere": "around"}]
>>> item = "somewhere"
>>> ds = obj | grep(item, verbose_level=2)
>>> pprint(ds, indent=2)
{ 'matched_paths': {"root[1]['somewhere']": 'around'},
'matched_values': { 'root[0]': 'something somewhere',
"root[1]['long']": 'somewhere'}}
You can also use regular expressions
>>> obj = ["something here", {"long": "somewhere", "someone": 2, 0: 0, "somewhere": "around"}]
>>> ds = obj | grep("some.*", use_regexp=True)
>>> pprint(ds, indent=2)
{ 'matched_paths': ["root[1]['someone']", "root[1]['somewhere']"],
'matched_values': ['root[0]', "root[1]['long']"]}
Change strict_checking to False to match numbers in strings and vice versa:
>>> obj = {"long": "somewhere", "num": 1123456, 0: 0, "somewhere": "around"}
>>> item = "1234"
>>> result = {"matched_values": {"root['num']"}}
>>> ds = obj | grep(item, verbose_level=1, use_regexp=True)
>>> pprint(ds)
{}
>>>
>>> ds = obj | grep(item, verbose_level=1, use_regexp=True, strict_checking=False)
>>> pprint(ds)
{'matched_values': ["root['num']"]}
qlustered-deepdiff-41c7265/deepdiff/docstrings/serialization.rst 0000664 0000000 0000000 00000027060 15162412645 0025112 0 ustar 00root root 0000000 0000000 :doc:`/index`
.. _serialization_label:
Serialization
=============
.. _to_dict_label:
To Dict
-------
In order to convert the DeepDiff object into a normal Python dictionary, use the to_dict() method.
The result is always a text-view dictionary regardless of the original view used to create the DeepDiff object.
**Parameters**
verbose_level: int, default=None
Override the verbose_level for the serialized output.
When None, the behavior depends on the original view:
- If the original view is 'text', the verbose_level from DeepDiff initialization is used.
- If the original view is 'tree', verbose_level=2 is used to provide the most detailed output.
Valid values are 0, 1, or 2.
Example:
>>> t1 = {1: 1, 2: 2, 3: 3, 4: {"a": "hello", "b": [1, 2, 3]}}
>>> t2 = {1: 1, 2: 2, 3: 3, 4: {"a": "hello", "b": "world\n\n\nEnd"}}
>>> ddiff = DeepDiff(t1, t2)
>>> ddiff.to_dict()
{'type_changes': {"root[4]['b']": {'old_type': <class 'list'>, 'new_type': <class 'str'>, 'old_value': [1, 2, 3], 'new_value': 'world\n\n\nEnd'}}}
When the original view is 'tree', to_dict() defaults to verbose_level=2 for the most detailed output:
Example:
>>> t1 = {1: 1, 2: 2, 3: 3, 4: {"a": "hello", "b": [1, 2, 3]}}
>>> t2 = {1: 1, 2: 2, 3: 3, 4: {"a": "hello", "b": "world\n\n\nEnd"}}
>>> ddiff = DeepDiff(t1, t2, view='tree')
>>> ddiff.to_dict()
{'type_changes': {"root[4]['b']": {'old_type': <class 'list'>, 'new_type': <class 'str'>, 'old_value': [1, 2, 3], 'new_value': 'world\n\n\nEnd'}}}
You can also override the verbose_level:
Example:
>>> ddiff = DeepDiff(t1, t2, view='tree')
>>> ddiff.to_dict(verbose_level=0)
{'type_changes': {"root[4]['b']": {'old_type': <class 'list'>, 'new_type': <class 'str'>}}}
.. _to_json_label:
To Json
-------
Dump json of the text view.
In order to do safe json serialization, use the to_json() method.
**Parameters**
default_mapping : dictionary(optional), a dictionary of mapping of different types to json types.
by default DeepDiff converts certain data types. For example Decimals into floats so they can be exported into json.
If you have an object type that the json serializer cannot serialize, please pass the appropriate type
conversion through this dictionary.
verbose_level: int, default=None
Override the verbose_level for the serialized output. Same behavior as to_dict().
kwargs: Any other kwargs you pass will be passed on to Python's json.dumps()
Example 1 Serialize custom objects:
>>> class A:
... pass
...
>>> class B:
... pass
...
>>> t1 = A()
>>> t2 = B()
>>> ddiff = DeepDiff(t1, t2)
>>> ddiff.to_json()
TypeError: We do not know how to convert <__main__.A object at 0x10648> of type <class '__main__.A'> for json serialization. Please pass the default_mapping parameter with proper mapping of the object to a basic python type.
>>> default_mapping = {A: lambda x: 'obj A', B: lambda x: 'obj B'}
>>> ddiff.to_json(default_mapping=default_mapping)
'{"type_changes": {"root": {"old_type": "A", "new_type": "B", "old_value": "obj A", "new_value": "obj B"}}}'
Example 2:
>>> t1 = {1: 1, 2: 2, 3: 3, 4: {"a": "hello", "b": [1, 2, 3]}}
>>> t2 = {1: 1, 2: 2, 3: 3, 4: {"a": "hello", "b": "world\n\n\nEnd"}}
>>> ddiff = DeepDiff(t1, t2, view='tree')
>>> ddiff.to_json()
'{"type_changes": {"root[4][\'b\']": {"old_type": "list", "new_type": "str", "old_value": [1, 2, 3], "new_value": "world\\n\\n\\nEnd"}}}'
.. _to_json_pickle_label:
To Json Pickle
--------------
If you want the original DeepDiff object to be serialized with all the bells and whistles, you can use the to_json_pickle() and from_json_pickle() in order to serialize and deserialize its results into json. Note that json_pickle is unsafe and json pickle dumps from untrusted sources should never be loaded. It is recommended not to use this serialization unless you have to.
.. note::
You need to install the `jsonpickle <https://github.com/jsonpickle/jsonpickle>`_ package to use the to_json_pickle() method.
Serialize and then deserialize back to deepdiff
>>> t1 = {1: 1, 2: 2, 3: 3}
>>> t2 = {1: 1, 2: "2", 3: 3}
>>> ddiff = DeepDiff(t1, t2)
>>> jsoned = ddiff.to_json_pickle()
>>> jsoned
'{"type_changes": {"root[2]": {"new_type": {"py/type": "builtins.str"}, "new_value": "2", "old_type": {"py/type": "builtins.int"}, "old_value": 2}}}'
>>> ddiff_new = DeepDiff.from_json_pickle(jsoned)
>>> ddiff == ddiff_new
True
.. _from_json_pickle_label:
From Json Pickle
----------------
Load the diff object from the json pickle dump.
Take a look at the above :ref:`to_json_pickle_label` for an example.
.. _delta_to_flat_rows_label:
Delta Serialize To Flat Rows
----------------------------
Sometimes, it is desired to serialize a :ref:`delta_label` object to a list of flat rows. For example, to store them in relational databases. In that case, you can use the Delta.to_flat_rows to achieve the desired outcome. The rows are named tuples and can be converted to dictionaries using `._asdict()`
>>> from pprint import pprint
>>> from deepdiff import DeepDiff, Delta
>>> t1 = {"key1": "value1"}
>>> t2 = {"field2": {"key2": "value2"}}
>>> diff = DeepDiff(t1, t2, verbose_level=2)
>>> pprint(diff, indent=2)
{ 'dictionary_item_added': {"root['field2']": {'key2': 'value2'}},
'dictionary_item_removed': {"root['key1']": 'value1'}}
>>> delta = Delta(diff, bidirectional=True)
>>> flat_rows = delta.to_flat_rows()
>>> pprint(flat_rows, indent=2)
[ FlatDeltaRow(path=['field2', 'key2'], action='dictionary_item_added', value='value2'),
FlatDeltaRow(path=['key1'], action='dictionary_item_removed', value='value1')]
.. note::
When converting a delta to flat rows, nested dictionaries that have single keys in them are flattened too.
Notice that the diff object says
{ 'dictionary_item_added': {"root['field2']": {'key2': 'value2'}}
but the flat row is:
FlatDeltaRow(path=['field2', 'key2'], action='dictionary_item_added', value='value2')
That means, when you recreate the delta from the flat rows, you need to set force=True to apply the delta:
>>> t1 + delta == t2
True
>>> t2 - delta == t1
True
>>> delta2 = Delta(flat_rows_list=flat_rows, bidirectional=True)
>>> t1 + delta2 == t2
Expected the old value for root['field2']['key2'] to be None but it is not found. Error found on: 'field2'
False. You may want to set force=True, especially if this delta is created by passing flat_rows_list or flat_dict_list
>>> t1 + delta
{'field2': {'key2': 'value2'}}
>>> t1 + delta2
{}
>>> delta2 = Delta(flat_rows_list=flat_rows, bidirectional=True, force=True) # We need to set force=True
>>> t1 + delta2
{'field2': {'key2': 'value2'}}
>>>
Flat Row Specs:
class FlatDataAction(str, enum.Enum):
values_changed = 'values_changed'
type_changes = 'type_changes'
set_item_added = 'set_item_added'
set_item_removed = 'set_item_removed'
dictionary_item_added = 'dictionary_item_added'
dictionary_item_removed = 'dictionary_item_removed'
iterable_item_added = 'iterable_item_added'
iterable_item_removed = 'iterable_item_removed'
iterable_item_moved = 'iterable_item_moved'
iterable_items_inserted = 'iterable_items_inserted' # opcode
iterable_items_deleted = 'iterable_items_deleted' # opcode
iterable_items_replaced = 'iterable_items_replaced' # opcode
iterable_items_equal = 'iterable_items_equal' # opcode
attribute_removed = 'attribute_removed'
attribute_added = 'attribute_added'
unordered_iterable_item_added = 'unordered_iterable_item_added'
unordered_iterable_item_removed = 'unordered_iterable_item_removed'
UnkownValueCode = 'unknown___'
class FlatDeltaRow(NamedTuple):
path: List
action: FlatDataAction
value: Optional[Any] = UnkownValueCode
old_value: Optional[Any] = UnkownValueCode
type: Optional[Any] = UnkownValueCode
old_type: Optional[Any] = UnkownValueCode
new_path: Optional[List] = None
t1_from_index: Optional[int] = None
t1_to_index: Optional[int] = None
t2_from_index: Optional[int] = None
t2_to_index: Optional[int] = None
.. _delta_to_flat_dicts_label:
Delta Serialize To Flat Dictionaries
------------------------------------
Sometimes, it is desired to serialize a :ref:`delta_label` object to a list of flat dictionaries. For example, to store them in relational databases. In that case, you can use the Delta.to_flat_dicts to achieve the desired outcome.
Since None is a valid value, we use a special hard-coded string to signify "unknown": 'unknown___'
.. note::
Many new keys are added to the flat dicts in DeepDiff 7.0.0
You may want to use :ref:`delta_to_flat_rows_label` instead of flat dicts.
For example:
>>> from pprint import pprint
>>> from deepdiff import DeepDiff, Delta
>>> t1 = {"key1": "value1"}
>>> t2 = {"field2": {"key2": "value2"}}
>>> diff = DeepDiff(t1, t2, verbose_level=2)
>>> pprint(diff, indent=2)
{ 'dictionary_item_added': {"root['field2']": {'key2': 'value2'}},
'dictionary_item_removed': {"root['key1']": 'value1'}}
>>> delta = Delta(diff, bidirectional=True)
>>> flat_dicts = delta.to_flat_dicts()
>>> pprint(flat_dicts, indent=2)
[ { 'action': 'dictionary_item_added',
'new_path': None,
'old_type': 'unknown___',
'old_value': 'unknown___',
'path': ['field2', 'key2'],
't1_from_index': None,
't1_to_index': None,
't2_from_index': None,
't2_to_index': None,
'type': 'unknown___',
'value': 'value2'},
{ 'action': 'dictionary_item_removed',
'new_path': None,
'old_type': 'unknown___',
'old_value': 'unknown___',
'path': ['key1'],
't1_from_index': None,
't1_to_index': None,
't2_from_index': None,
't2_to_index': None,
'type': 'unknown___',
'value': 'value1'}]
Example 2:
>>> t3 = ["A", "B"]
>>> t4 = ["A", "B", "C", "D"]
>>> diff = DeepDiff(t3, t4, verbose_level=2)
>>> pprint(diff, indent=2)
{'iterable_item_added': {'root[2]': 'C', 'root[3]': 'D'}}
>>>
>>> delta = Delta(diff, bidirectional=True)
>>> flat_dicts = delta.to_flat_dicts()
>>> pprint(flat_dicts, indent=2)
[ { 'action': 'iterable_item_added',
'new_path': None,
'old_type': 'unknown___',
'old_value': 'unknown___',
'path': [2],
't1_from_index': None,
't1_to_index': None,
't2_from_index': None,
't2_to_index': None,
'type': 'unknown___',
'value': 'C'},
{ 'action': 'iterable_item_added',
'new_path': None,
'old_type': 'unknown___',
'old_value': 'unknown___',
'path': [3],
't1_from_index': None,
't1_to_index': None,
't2_from_index': None,
't2_to_index': None,
'type': 'unknown___',
'value': 'D'}]
.. _delta_from_flat_dicts_label:
Delta Load From Flat Dictionaries
------------------------------------
>>> from deepdiff import DeepDiff, Delta
>>> t3 = ["A", "B"]
>>> t4 = ["A", "B", "C", "D"]
>>> diff = DeepDiff(t3, t4, verbose_level=2)
>>> delta = Delta(diff, bidirectional=True)
>>> flat_dicts = delta.to_flat_dicts()
>>>
>>> delta2 = Delta(flat_dict_list=flat_dicts)
>>> t3 + delta2 == t4
True
Back to :doc:`/index`
qlustered-deepdiff-41c7265/deepdiff/docstrings/stats.rst 0000664 0000000 0000000 00000005646 15162412645 0023401 0 ustar 00root root 0000000 0000000 :doc:`/index`
.. _stats_n_logging_label:
Stats and Logging
=================
.. _log_frequency_in_sec_label:
Log Frequency In Sec
--------------------
log_frequency_in_sec: Integer, default = 0
How often to log the progress. The default of 0 means logging progress is disabled.
If you set it to 20, it will log every 20 seconds. This is useful only when running DeepDiff
on massive objects that will take a while to run. If you are only dealing with small objects, keep it at 0 to disable progress logging.
For example we have run a diff on 2 nested objects that took 2 seconds to get the results. By passing the log_frequency_in_sec=1, we get the following in the logs:
>>> DeepDiff(t1, t2, log_frequency_in_sec=1)
INFO:deepdiff.diff:DeepDiff 1 seconds in progress. Pass #1634, Diff #8005
INFO:deepdiff.diff:DeepDiff 2 seconds in progress. Pass #3319, Diff #16148
INFO:deepdiff.diff:stats {'PASSES COUNT': 3960, 'DIFF COUNT': 19469, 'DISTANCE CACHE HIT COUNT': 11847, 'MAX PASS LIMIT REACHED': False, 'MAX DIFF LIMIT REACHED': False, 'DURATION SEC': 2}
.. note::
The default python logger will omit the info logs. You can either set the logging filter to include info logs or pass a different logger via :ref:`progress_logger_label`
>>> import logging
>>> logging.basicConfig(level=logging.INFO)
.. _progress_logger_label:
Progress Logger
---------------
progress_logger: log function, default = logger.info
What logging function to use specifically for progress reporting. This function is only used when progress logging is enabled
by setting log_frequency_in_sec to anything above zero. The function that is passed as the progress_logger needs to be thread safe.
For example you can pass progress_logger=logger.warning to the example above and everything is logged as warning level:
>>> DeepDiff(t1, t2, log_frequency_in_sec=1, progress_logger=logger.warning)
WARNING:deepdiff.diff:DeepDiff 1 seconds in progress. Pass #1634, Diff #8005
WARNING:deepdiff.diff:DeepDiff 2 seconds in progress. Pass #3319, Diff #16148
WARNING:deepdiff.diff:stats {'PASSES COUNT': 3960, 'DIFF COUNT': 19469, 'DISTANCE CACHE HIT COUNT': 11847, 'MAX PASS LIMIT REACHED': False, 'MAX DIFF LIMIT REACHED': False, 'DURATION SEC': 2}
.. _get_stats_label:
Get Stats
---------
You can run the get_stats() method on a diff object to get some stats on the object.
For example:
>>> from pprint import pprint
>>> from deepdiff import DeepDiff
>>>
>>> t1 = [
... [1, 2, 3, 9], [9, 8, 5, 9]
... ]
>>>
>>> t2 = [
... [1, 2, 4, 10], [4, 2, 5]
... ]
>>>
>>> diff = DeepDiff(t1, t2, ignore_order=True, cache_size=5000, cutoff_intersection_for_pairs=1)
>>> pprint(diff.get_stats())
{'DIFF COUNT': 37,
'DISTANCE CACHE HIT COUNT': 0,
'MAX DIFF LIMIT REACHED': False,
'MAX PASS LIMIT REACHED': False,
'PASSES COUNT': 7}
Back to :doc:`/index`
qlustered-deepdiff-41c7265/deepdiff/docstrings/support.rst 0000664 0000000 0000000 00000001360 15162412645 0023744 0 ustar 00root root 0000000 0000000 :doc:`/index`
Support
=======
.. |qluster_link| raw:: html
Qluster
.. admonition:: DeepDiff is now part of |qluster_link|.
*If you're building workflows around data validation and correction,* `Qluster `__ *gives your team a structured way to manage rules, review failures, approve fixes, and reuse decisions—without building the entire system from scratch.*
Thank you for using DeepDiff!
If you find a bug, please create a ticket on our `GitHub repo `__
We are **available for consulting** if you need immediate help or custom implementations of DeepDiff. You can reach us via filling up `this form `__
Back to :doc:`/index`
qlustered-deepdiff-41c7265/deepdiff/docstrings/troubleshoot.rst 0000664 0000000 0000000 00000001152 15162412645 0024760 0 ustar 00root root 0000000 0000000 :doc:`/index`
.. _troubleshoot_label:
Troubleshoot
============
Murmur3 Installation
~~~~~~~~~~~~~~~~~~~~
NOTE: Murmur3 was removed from DeepDiff 5.2.0
If you are running into this issue, you are using an older version of DeepDiff.
`Failed to build mmh3 when installing DeepDiff`
DeepDiff prefers to use Murmur3 for hashing. However you have to manually install murmur3 by running: `pip install mmh3`
On MacOS Mojave, some users experience difficulty when installing Murmur3.
The problem can be solved by running:
`xcode-select --install`
And then running
`pip install mmh3`
Back to :doc:`/index`
qlustered-deepdiff-41c7265/deepdiff/docstrings/view.rst 0000664 0000000 0000000 00000035021 15162412645 0023203 0 ustar 00root root 0000000 0000000 :doc:`/index`
.. _view_label:
View
====
You have the options of text view and tree view.
The main difference is that the tree view has the capabilities to traverse the objects to see what objects were compared to what other objects.
While the view options decide the format of the output that is mostly machine readable, regardless of the view you choose, you can get a more human readable output by using the pretty() method.
DeepDiff also offers other specialized views such as the :doc:`colored_view` (which includes a compact variant) and :doc:`delta` view for specific use cases.
.. _text_view_label:
Text View
---------
Text view is the default view of DeepDiff. It is simpler than tree view.
Example of using the text view.
>>> from decimal import Decimal
>>> from deepdiff import DeepDiff
>>> t1 = {1:1, 3:3, 4:4}
>>> t2 = {1:1, 3:3, 5:5, 6:6}
>>> ddiff = DeepDiff(t1, t2)
>>> print(ddiff)
{'dictionary_item_added': [root[5], root[6]], 'dictionary_item_removed': [root[4]]}
So for example ddiff['dictionary_item_added'] is a set of string results. That's why this view is called the text view.
You can get this view by default or by passing `view='text'`.
.. _tree_view_label:
Tree View
---------
The tree view provides you with tree objects that you can traverse through to find
the parents of the objects that are diffed and the actual objects that are being diffed.
This view is very useful when dealing with nested objects.
Note that tree view always returns results in the form of Python sets.
You can traverse through the tree elements!
.. note::
The Tree view is just a different representation of the diffed data.
Behind the scenes, DeepDiff creates the tree view first and then converts it to textual
representation for the text view.
**Tree View Interface**
.. code:: text
+---------------------------------------------------------------+
| |
| parent(t1) parent node parent(t2) |----level
| + ^ + |
+------|--------------------------|---------------------|-------+
| | | up |
| Child | | | ChildRelationship
| Relationship | | |
| down | | |
+------|----------------------|-------------------------|-------+
| v v v |
| child(t1) child node child(t2) |----level
| |
+---------------------------------------------------------------+
:up: Move up to the parent node aka parent level
:down: Move down to the child node aka child level
:path(): Get the path to the current node in string representation, path(output_format='list') gives you the path in list representation. path(use_t2=True) gives you the path to t2.
:t1: The first item in the current node that is being diffed
:t2: The second item in the current node that is being diffed
:additional: Additional information about the node i.e. repetition
:repetition: Shortcut to get the repetition report
The tree view allows you to have more than a mere textual representation of the diffed objects.
It gives you the actual objects (t1, t2) throughout the tree of parents and children.
**Examples for Tree View**
.. note::
Set view='tree' in order to get the results in tree view.
Value of an item has changed (Tree View)
>>> from deepdiff import DeepDiff
>>> from pprint import pprint
>>> t1 = {1:1, 2:2, 3:3}
>>> t2 = {1:1, 2:4, 3:3}
>>> ddiff_verbose0 = DeepDiff(t1, t2, verbose_level=0, view='tree')
>>> ddiff_verbose0
{'values_changed': [<root[2]>]}
>>>
>>> ddiff_verbose1 = DeepDiff(t1, t2, verbose_level=1, view='tree')
>>> ddiff_verbose1
{'values_changed': [<root[2] t1:2, t2:4>]}
>>> set_of_values_changed = ddiff_verbose1['values_changed']
>>> # since set_of_values_changed includes only one item in a set
>>> # in order to get that one item we can:
>>> (changed,) = set_of_values_changed
>>> changed # Another way to get this is to do: changed=list(set_of_values_changed)[0]
>>> changed.t1
2
>>> changed.t2
4
>>> # You can traverse through the tree, get to the parents!
>>> changed.up
List difference (Tree View)
>>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, 3, 4]}}
>>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2]}}
>>> ddiff = DeepDiff(t1, t2, view='tree')
>>> ddiff
{'iterable_item_removed': [, ]}
>>> # Note that the iterable_item_removed is a set. In this case it has 2 items in it.
>>> # One way to get one item from the set is to convert it to a list
>>> # And then get the first item of the list:
>>> removed = list(ddiff['iterable_item_removed'])[0]
>>> removed
>>>
>>> parent = removed.up
>>> parent
>>> parent.path() # gives you the string representation of the path
"root[4]['b']"
>>> parent.path(output_format='list') # gives you the list of keys and attributes that make up the path
[4, 'b']
>>> parent.t1
[1, 2, 3, 4]
>>> parent.t2
[1, 2]
>>> parent.up
>>> parent.up.up
>>> parent.up.up.t1
{1: 1, 2: 2, 3: 3, 4: {'a': 'hello', 'b': [1, 2, 3, 4]}}
>>> parent.up.up.t1 == t1 # It is holding the original t1 that we passed to DeepDiff
True
List difference 2 (Tree View)
>>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, 3]}}
>>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 3, 2, 3]}}
>>> ddiff = DeepDiff(t1, t2, view='tree')
>>> pprint(ddiff, indent = 2)
{ 'iterable_item_added': [],
'values_changed': [, ]}
>>>
>>> # Note that iterable_item_added is a set with one item.
>>> # So in order to get that one item from it, we can do:
>>>
>>> (added,) = ddiff['iterable_item_added']
>>> added
>>> added.up.up
>>> added.up.up.path()
'root[4]'
>>> added.up.up.path(output_format='list') # gives you the list of keys and attributes that make up the path
[4]
>>> added.up.up.down
>>>
>>> # going up twice and then down twice gives you the same node in the tree:
>>> added.up.up.down.down == added
True
List difference ignoring order but reporting repetitions (Tree View)
>>> t1 = [1, 3, 1, 4]
>>> t2 = [4, 4, 1]
>>> ddiff = DeepDiff(t1, t2, ignore_order=True, report_repetition=True, view='tree')
>>> pprint(ddiff, indent=2)
{ 'iterable_item_removed': [],
'repetition_change': [, ]}
>>>
>>> # repetition_change is a set with 2 items.
>>> # in order to get those 2 items, we can do the following.
>>> # or we can convert the set to list and get the list items.
>>> # or we can iterate through the set items
>>>
>>> (repeat1, repeat2) = ddiff['repetition_change']
>>> repeat1 # the default verbosity is set to 1.
>>> # The actual data regarding the repetitions can be found in the repetition attribute:
>>> repeat1.repetition
{'old_repeat': 1, 'new_repeat': 2, 'old_indexes': [3], 'new_indexes': [0, 1]}
>>>
>>> # If you change the verbosity, you will see less:
>>> ddiff = DeepDiff(t1, t2, ignore_order=True, report_repetition=True, view='tree', verbose_level=0)
>>> ddiff
{'repetition_change': [, ], 'iterable_item_removed': []}
>>> (repeat1, repeat2) = ddiff['repetition_change']
>>> repeat1
>>>
>>> # But the verbosity level does not change the actual report object.
>>> # It only changes the textual representation of the object. We get the actual object here:
>>> repeat1.repetition
{'old_repeat': 1, 'new_repeat': 2, 'old_indexes': [3], 'new_indexes': [0, 1]}
>>> repeat1.t1
4
>>> repeat1.t2
4
>>> repeat1.up
List that contains dictionary (Tree View)
>>> t1 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, {1:1, 2:2}]}}
>>> t2 = {1:1, 2:2, 3:3, 4:{"a":"hello", "b":[1, 2, {1:3}]}}
>>> ddiff = DeepDiff(t1, t2, view='tree')
>>> pprint (ddiff, indent = 2)
{ 'dictionary_item_removed': [],
'values_changed': []}
Sets (Tree View):
>>> t1 = {1, 2, 8}
>>> t2 = {1, 2, 3, 5}
>>> ddiff = DeepDiff(t1, t2, view='tree')
>>> print(ddiff)
{'set_item_removed': [], 'set_item_added': [, ]}
>>> # grabbing one item from set_item_removed set which has one item only
>>> (item,) = ddiff['set_item_removed']
>>> item.up
>>> item.up.t1 == t1
True
Named Tuples (Tree View):
>>> from collections import namedtuple
>>> Point = namedtuple('Point', ['x', 'y'])
>>> t1 = Point(x=11, y=22)
>>> t2 = Point(x=11, y=23)
>>> print(DeepDiff(t1, t2, view='tree'))
{'values_changed': []}
Custom objects (Tree View):
>>> class ClassA(object):
... a = 1
... def __init__(self, b):
... self.b = b
...
>>> t1 = ClassA(1)
>>> t2 = ClassA(2)
>>>
>>> print(DeepDiff(t1, t2, view='tree'))
{'values_changed': []}
Object attribute added (Tree View):
>>> t2.c = "new attribute"
>>> pprint(DeepDiff(t1, t2, view='tree'))
{'attribute_added': [],
'values_changed': []}
Approximate decimals comparison (Significant digits after the point) (Tree View):
>>> t1 = Decimal('1.52')
>>> t2 = Decimal('1.57')
>>> DeepDiff(t1, t2, significant_digits=0, view='tree')
{}
>>> ddiff = DeepDiff(t1, t2, significant_digits=1, view='tree')
>>> ddiff
{'values_changed': []}
>>> (change1,) = ddiff['values_changed']
>>> change1
>>> change1.t1
Decimal('1.52')
>>> change1.t2
Decimal('1.57')
>>> change1.path()
'root'
Approximate float comparison (Significant digits after the point) (Tree View):
>>> t1 = [ 1.1129, 1.3359 ]
>>> t2 = [ 1.113, 1.3362 ]
>>> ddiff = DeepDiff(t1, t2, significant_digits=3, view='tree')
>>> ddiff
{}
>>> ddiff = DeepDiff(t1, t2, view='tree')
>>> pprint(ddiff, indent=2)
{ 'values_changed': [, ]}
>>> ddiff = DeepDiff(1.23*10**20, 1.24*10**20, significant_digits=1, view='tree')
>>> ddiff
{'values_changed': []}
pretty() method
---------------
Use the pretty method for human readable output. This is regardless of what view you have used to generate the results.
>>> from deepdiff import DeepDiff
>>> t1={1,2,4}
>>> t2={2,3}
>>> print(DeepDiff(t1, t2).pretty())
Item root[3] added to set.
Item root[4] removed from set.
Item root[1] removed from set.
The pretty method has an optional parameter ``prefix`` that allows a prefix string before every output line (*e.g.* for logging):
>>> from deepdiff import DeepDiff
>>> t1={1,2,4}
>>> t2={2,3}
>>> print(DeepDiff(t1, t2).pretty(prefix='Diff: '))
Diff: Item root[3] added to set.
Diff: Item root[4] removed from set.
Diff: Item root[1] removed from set.
The ``prefix`` may also be a callable function. This function must accept ``**kwargs``; as of this version, the only parameter is ``diff`` but the signature allows for future expansion.
The ``diff`` given will be the ``DeepDiff`` that ``pretty`` was called on; this allows interesting capabilities such as:
.. code:: python
>>> from deepdiff import DeepDiff
>>> t1={1,2,4}
>>> t2={2,3}
>>> def callback(**kwargs):
... """Helper function using a hidden variable on the diff that tracks which count prints next"""
... kwargs['diff']._diff_count = 1 + getattr(kwargs['diff'], '_diff_count', 0)
... return f"Diff #{kwargs['diff']._diff_count}: "
...
>>> print(DeepDiff(t1, t2).pretty(prefix=callback))
Diff #1: Item root[3] added to set.
Diff #2: Item root[4] removed from set.
Diff #3: Item root[1] removed from set.
Text view vs. Tree view vs. pretty() method
-----------------------------------------------
Views are just different formats of the results. Each comes with its own set of features. At the end of the day the user can choose the right format based on the use case.
- The text view is the default format of the results. It is the format that is the most suitable if you don't need to know the traversal history of the objects being compared.
- The tree view allows you to traverse back and forth through the tree and see what objects were compared to what other objects.
- The pretty() method is not a view. All the views are dictionaries. The pretty() method spits out a string output of what has changed and is designed to be human readable.
For example
>>> from deepdiff import DeepDiff
>>> t1={1,2,4}
>>> t2={2,3}
Text view (default)
>>> DeepDiff(t1, t2) # same as view='text'
{'set_item_removed': [root[4], root[1]], 'set_item_added': [root[3]]}
Tree view
>>> tree = DeepDiff(t1, t2, view='tree')
>>> tree
{'set_item_removed': [, ], 'set_item_added': []}
>>> tree['set_item_added'][0]
>>> tree['set_item_added'][0].t2
3
Pretty method. Regardless of what view was used, you can use the "pretty()" method to get a human readable output.
>>> print(DeepDiff(t1, t2).pretty())
Item root[3] added to set.
Item root[4] removed from set.
Item root[1] removed from set.
Back to :doc:`/index`
qlustered-deepdiff-41c7265/deepdiff/helper.py 0000664 0000000 0000000 00000066725 15162412645 0021170 0 ustar 00root root 0000000 0000000 import sys
import re
import os
import datetime
import uuid
import logging
import warnings
import string
import time
import enum
import ipaddress
from typing import NamedTuple, Any, List, Optional, Dict, Union, TYPE_CHECKING, Tuple, Iterable, Iterator, Set, FrozenSet, Callable, Pattern, Type, TypeVar, Generic, Literal, overload, TypedDict
from collections.abc import Mapping, Sequence, Generator
from ast import literal_eval
from decimal import Decimal, localcontext, InvalidOperation as InvalidDecimalOperation
from fractions import Fraction
from itertools import repeat
from orderly_set import StableSetEq as SetOrderedBase # median: 1.0867 s for cache test, 5.63s for all tests
from threading import Timer
if TYPE_CHECKING:
from pytz.tzinfo import BaseTzInfo
class np_type:
    # Stand-in type: every np_* alias in this module is bound to it when
    # numpy cannot be imported.
    ...
class pydantic_base_model_type:
    # Stand-in type: PydanticBaseModel is bound to it when pydantic cannot
    # be imported.
    ...
class SetOrdered(SetOrderedBase):
    # Same container as SetOrderedBase; only the repr is customised so that
    # it prints like a plain list of its items.

    def __repr__(self) -> str:
        as_list = list(self)
        return str(as_list)
# Optional numpy support.
# When numpy is importable, each np_* name below is bound to the matching numpy
# type. When it is not, `np` is None and every np_* type alias is bound to the
# `np_type` placeholder class so later code can still reference the names.
try:
    import numpy as np
except ImportError:  # pragma: no cover. The case without Numpy is tested locally only.
    np = None  # pragma: no cover.
    # Note: in this branch np_array_factory is a plain string, not a callable.
    np_array_factory = 'numpy not available'  # pragma: no cover.
    np_ndarray = np_type  # pragma: no cover.
    np_bool_ = np_type  # pragma: no cover.
    np_int8 = np_type  # pragma: no cover.
    np_int16 = np_type  # pragma: no cover.
    np_int32 = np_type  # pragma: no cover.
    np_int64 = np_type  # pragma: no cover.
    np_uint8 = np_type  # pragma: no cover.
    np_uint16 = np_type  # pragma: no cover.
    np_uint32 = np_type  # pragma: no cover.
    np_uint64 = np_type  # pragma: no cover.
    np_intp = np_type  # pragma: no cover.
    np_uintp = np_type  # pragma: no cover.
    np_float32 = np_type  # pragma: no cover.
    np_float64 = np_type  # pragma: no cover.
    np_double = np_type  # pragma: no cover.
    np_floating = np_type  # pragma: no cover.
    np_complex64 = np_type  # pragma: no cover.
    np_complex128 = np_type  # pragma: no cover.
    np_cdouble = np_type  # pragma: no cover.
    np_complexfloating = np_type  # pragma: no cover.
    np_datetime64 = np_type  # pragma: no cover.
else:
    # numpy is available: bind the real types.
    np_array_factory = np.array
    np_ndarray = np.ndarray
    np_bool_ = np.bool_
    np_int8 = np.int8
    np_int16 = np.int16
    np_int32 = np.int32
    np_int64 = np.int64
    np_uint8 = np.uint8
    np_uint16 = np.uint16
    np_uint32 = np.uint32
    np_uint64 = np.uint64
    np_intp = np.intp
    np_uintp = np.uintp
    np_float32 = np.float32
    np_float64 = np.float64
    np_double = np.double  # np.float_ is an alias for np.double and is being removed by NumPy 2.0
    np_floating = np.floating
    np_complex64 = np.complex64
    np_complex128 = np.complex128
    np_cdouble = np.cdouble  # np.complex_ is an alias for np.cdouble and is being removed by NumPy 2.0
    np_complexfloating = np.complexfloating
    np_datetime64 = np.datetime64
# Numeric numpy types (integers, floats and complex). Without numpy every
# entry is the np_type placeholder.
numpy_numbers: Tuple[Type[Any], ...] = (
    np_int8, np_int16, np_int32, np_int64, np_uint8,
    np_uint16, np_uint32, np_uint64, np_intp, np_uintp,
    np_float32, np_float64, np_double, np_floating, np_complex64,
    np_complex128, np_cdouble,)

# Complex-valued numpy types only.
numpy_complex_numbers: Tuple[Type[Any], ...] = (
    np_complexfloating, np_complex64, np_complex128, np_cdouble,
)

# All numpy scalar types handled here: the numbers above plus bool and datetime64.
numpy_dtypes: Set[Type[Any]] = set(numpy_numbers)
numpy_dtypes.add(np_bool_)  # type: ignore
numpy_dtypes.add(np_datetime64)  # type: ignore

# Lookup from a type's __name__ to the type itself, built from numpy_dtypes.
numpy_dtype_str_to_type: Dict[str, Type[Any]] = {
    item.__name__: item for item in numpy_dtypes
}
# Optional pydantic support: fall back to a placeholder class when pydantic
# is not installed.
try:
    from pydantic.main import BaseModel as PydanticBaseModel  # type: ignore
except ImportError:
    PydanticBaseModel = pydantic_base_model_type


logger = logging.getLogger(__name__)

# Interpreter version information captured once at import time.
py_major_version = sys.version_info.major
py_minor_version = sys.version_info.minor

# NOTE(review): this is built from the string "<major>.<minor>", so Python 3.10
# becomes Decimal("3.10"), which compares numerically lower than Decimal("3.9").
# Ordering comparisons against this value are unreliable - confirm how callers use it.
py_current_version: Decimal = Decimal("{}.{}".format(py_major_version, py_minor_version))

py2 = py_major_version == 2
py3 = py_major_version == 3
py4 = py_major_version == 4


# The ASCII digit characters '0'-'9'.
NUMERICS: FrozenSet[str] = frozenset(string.digits)
class EnumBase(str, enum.Enum):
    # String-valued enum base whose repr and str are driven by the member
    # name rather than the member value.

    def __repr__(self) -> str:
        """
        Return the member name wrapped in single quotes, so the value can be
        copied straight out of a debugger session such as ipdb.
        """
        return "'{}'".format(self.name)

    def __str__(self) -> str:
        """Return the bare member name."""
        return self.name
def _int_or_zero(value: str) -> int:
"""
Tries to extract some number from a string.
12c becomes 12
"""
try:
return int(value)
except Exception:
result = []
for char in value:
if char in NUMERICS:
result.append(char)
if result:
return int(''.join(result))
return 0
def get_semvar_as_integer(version: str) -> int:
    """
    Collapse a dotted version string into a single integer, giving each of
    the first three components three decimal digits:

    '1.23.5' becomes 1023005

    Components beyond the third are ignored and missing ones count as 0.
    """
    pieces = version.split('.')[:3]
    # Pad short versions such as '1.2' up to three components.
    pieces += ['0'] * (3 - len(pieces))
    total = 0
    for piece in pieces:
        total = total * 1000 + _int_or_zero(piece)
    return total
# we used to use OrderedDictPlus when dictionaries in Python were not ordered.
dict_ = dict

# Interpreter guards, evaluated at import time. Python 4 is treated as
# Python 3; Python 2 terminates the process via sys.exit.
if py4:
    logger.warning('Python 4 is not supported yet. Switching logic to Python 3.')  # pragma: no cover
    py3 = True  # pragma: no cover

if py2:  # pragma: no cover
    sys.exit('Python 2 is not supported anymore. The last version of DeepDiff that supported Py2 was 3.3.0')

# True when running Python 3 on an interpreter that exposes sys.pypy_translation_info.
pypy3 = py3 and hasattr(sys, "pypy_translation_info")


# NOTE(review): this terminates the importing process (SystemExit) when an
# installed numpy is older than 1.19.0.
if np and get_semvar_as_integer(np.__version__) < 1019000:
    sys.exit('The minimum required Numpy version is 1.19.0. Please upgrade your Numpy package.')

# Type tuples below are concatenated into `basic_types` at the end of this section.
strings: Tuple[Type[str], Type[bytes], Type[memoryview]] = (str, bytes, memoryview)  # str, bytes and memoryview are grouped together as "strings"
unicode_type = str
bytes_type = bytes
only_complex_number: Tuple[Type[Any], ...] = (complex,) + numpy_complex_numbers
only_numbers: Tuple[Type[Any], ...] = (int, float, complex, Decimal, Fraction) + numpy_numbers
datetimes: Tuple[Type[Any], ...] = (datetime.datetime, datetime.date, datetime.timedelta, datetime.time, np_datetime64)
ipranges: Tuple[Type[Any], ...] = (ipaddress.IPv4Interface, ipaddress.IPv6Interface, ipaddress.IPv4Network, ipaddress.IPv6Network, ipaddress.IPv4Address, ipaddress.IPv6Address)
uuids: Tuple[Type[uuid.UUID]] = (uuid.UUID, )
# Subset of `datetimes` without date and timedelta.
times: Tuple[Type[Any], ...] = (datetime.datetime, datetime.time, np_datetime64)
# Note: `numbers` includes the date/time types as well as the numeric ones.
numbers: Tuple[Type[Any], ...] = only_numbers + datetimes

# Type alias for use in type annotations
NumberType = Union[int, float, complex, Decimal, Fraction, datetime.datetime, datetime.date, datetime.timedelta, datetime.time, Any]
booleans: Tuple[Type[bool], Type[Any]] = (bool, np_bool_)

# Union of the tuples above plus NoneType. `ipranges` is not part of it.
basic_types: Tuple[Type[Any], ...] = strings + numbers + uuids + booleans + (type(None), )
class IndexedHash(NamedTuple):
    """Named pair of a list of indexes and a single item."""
    # NOTE(review): presumably the positions at which `item` occurs in an iterable — confirm against callers.
    indexes: List[Any]
    # The item the indexes refer to.
    item: Any
# Absolute directory containing this module; get_doc() resolves docstring files relative to it.
current_dir = os.path.dirname(os.path.abspath(__file__))
# Prefix prepended by get_id() so object ids are not bare integers.
ID_PREFIX = '!>*id'
# Two-slot format template, "<key>:<value>".
KEY_TO_VAL_STR = "{}:{}"
# Names of the supported result views.
TREE_VIEW = 'tree'
TEXT_VIEW = 'text'
DELTA_VIEW = '_delta'
COLORED_VIEW = 'colored'
COLORED_COMPACT_VIEW = 'colored_compact'
# NOTE(review): presumably the attribute names reported when inspecting Enum members — confirm against callers.
ENUM_INCLUDE_KEYS: List[str] = ['__objclass__', 'name', 'value']
def short_repr(item: Any, max_length: int = 15) -> str:
    """
    Return ``repr(item)``, abbreviated when it is longer than ``max_length``.

    An abbreviated result keeps the first ``max_length - 3`` characters,
    then ``...``, then the final character of the full representation
    (typically the closing quote or bracket).
    """
    text = repr(item)
    if len(text) <= max_length:
        return text
    head = text[:max_length - 3]
    closing_char = text[-1]
    return f'{head}...{closing_char}'
class ListItemRemovedOrAdded:  # pragma: no cover
    """Marker class with no behaviour of its own: a class of conditions to be checked."""
class OtherTypes:
    """Base class for marker objects; instances render as ``Error: <ClassName>``."""

    def __repr__(self) -> str:
        class_name = self.__class__.__name__
        return f"Error: {class_name}"  # pragma: no cover

    # str() and repr() share the same implementation.
    __str__ = __repr__
class Skipped(OtherTypes):
    """Marker type; it adds nothing to OtherTypes and exists so its instances render as ``Error: Skipped``."""
    pass
class Unprocessed(OtherTypes):
    """Marker type; it adds nothing to OtherTypes and exists so its instances render as ``Error: Unprocessed``."""
    pass
class NotHashed(OtherTypes):
    """Marker type; it adds nothing to OtherTypes and exists so its instances render as ``Error: NotHashed``."""
    pass
class NotPresent:  # pragma: no cover
    """
    Placeholder used in a change tree to indicate that a previously existing
    object has been removed -- or will only be added in the future.

    None was previously used for this purpose, but that caused problems when
    users actually added and removed None, hence this dedicated type.
    """
    def __repr__(self) -> str:
        return 'not present'  # pragma: no cover
    # str() and repr() share the same implementation.
    __str__ = __repr__
class CannotCompare(Exception):
    """Raised when two items cannot be compared in the compare function."""
# Module-level instances of the marker classes defined above.
unprocessed = Unprocessed()
skipped = Skipped()
not_hashed = NotHashed()
notpresent = NotPresent()
# Disabling remapping from old to new keys since the mapping is deprecated.
# RemapDict is therefore just the plain dict alias; the old implementation is kept commented out below.
RemapDict = dict_
# class RemapDict(dict_):
# """
# DISABLED
# Remap Dictionary.
# For keys that have a new, longer name, remap the old key to the new key.
# Other keys that don't have a new name are handled as before.
# """
# def __getitem__(self, old_key):
# new_key = EXPANDED_KEY_MAP.get(old_key, old_key)
# if new_key != old_key:
# logger.warning(
# "DeepDiff Deprecation: %s is renamed to %s. Please start using "
# "the new unified naming convention.", old_key, new_key)
# if new_key in self:
# return self.get(new_key)
# else: # pragma: no cover
# raise KeyError(new_key)
class indexed_set(set):
    """
    A set subclass intended to let you get an item by index.

    NOTE(review): the class body defines no ``__getitem__``, so as written
    ``indexed_set()[0]`` raises ``TypeError`` exactly like a plain ``set``;
    the class currently behaves as a distinct type with ordinary set behaviour.

    >>> a = indexed_set()
    >>> a.add(10)
    >>> a.add(20)
    >>> sorted(a)
    [10, 20]
    """
def add_to_frozen_set(parents_ids: FrozenSet[int], item_id: int) -> FrozenSet[int]:
    """Return a new set holding everything in ``parents_ids`` plus ``item_id``; the input is left untouched."""
    return parents_ids.union((item_id,))
def convert_item_or_items_into_set_else_none(items: Union[str, Iterable[str], None]) -> Optional[Set[str]]:
    """
    Normalise ``items`` into a set.

    A single string becomes a one-element set, any other non-empty iterable is
    converted with ``set()``, and any falsy input (None, '', empty iterable)
    yields None.
    """
    if not items:
        return None
    if isinstance(items, str):
        return {items}
    return set(items)
def add_root_to_paths(paths: Optional[Iterable[str]]) -> Optional[SetOrdered]:
    """
    Expand user-supplied paths into the ``root``-prefixed variations they may match.

    Users sometimes pass ``[key]`` instead of ``root[key]``. For every path that
    does not already start with ``root`` the plausible spellings are added:

    - all digits: ``root['<path>']`` and ``root[<path>]``
    - starts with a digit: ``root['<path>']`` only
    - otherwise: ``root.<path>`` and ``root['<path>']``

    Returns None when ``paths`` is None, otherwise a SetOrdered of candidates.
    """
    if paths is None:
        return None
    result = SetOrdered()
    for path in paths:
        if path.startswith('root'):
            candidates = (path,)
        elif path.isdigit():
            # Could be either a string key or an integer index.
            candidates = (f"root['{path}']", f"root[{path}]")
        elif path[0].isdigit():
            # Cannot be an attribute name, so only the string-key form applies.
            candidates = (f"root['{path}']",)
        else:
            candidates = (f"root.{path}", f"root['{path}']")
        for candidate in candidates:
            result.add(candidate)
    return result
# Concrete type of a compiled regular expression, used for isinstance checks.
RE_COMPILED_TYPE = type(re.compile(''))
def convert_item_or_items_into_compiled_regexes_else_none(items: Union[str, Pattern[str], Iterable[Union[str, Pattern[str]]], None]) -> Optional[List[Pattern[str]]]:
    """
    Normalise ``items`` into a list of compiled regular expressions.

    Accepts a single pattern (string or already compiled) or an iterable of
    them. Strings are compiled with ``re.compile``; compiled patterns are kept
    as they are. Any falsy input yields None.
    """
    if not items:
        return None
    if isinstance(items, (str, RE_COMPILED_TYPE)):
        candidates = [items]  # type: ignore
    else:
        candidates = list(items)  # type: ignore
    compiled = []
    for candidate in candidates:
        if not isinstance(candidate, RE_COMPILED_TYPE):
            candidate = re.compile(candidate)
        compiled.append(candidate)
    return compiled
def get_id(obj: Any) -> str:
    """
    Return ``id(obj)`` as a string prefixed with ID_PREFIX, so the result is
    not a bare integer and is less likely to collide with other keys.
    """
    return f"{ID_PREFIX}{id(obj)}"
def get_type(obj: Any) -> Type[Any]:
    """
    Return the type of ``obj``.

    For a numpy ndarray the element type (``obj.dtype.type``) is returned.
    When ``obj`` is itself a class whose type is exactly ``type``, the class
    is returned unchanged.
    """
    if isinstance(obj, np_ndarray):
        return obj.dtype.type  # type: ignore
    if type(obj) is type:
        return obj
    return type(obj)
def numpy_dtype_string_to_type(dtype_str: str) -> Type[Any]:
    """
    Look up ``dtype_str`` in the module-level ``numpy_dtype_str_to_type`` mapping.

    Raises KeyError when the string is not a key of that mapping.
    """
    return numpy_dtype_str_to_type[dtype_str]
def type_in_type_group(item: Any, type_group: Tuple[Type[Any], ...]) -> bool:
    """Return True when ``get_type(item)`` is a member of ``type_group`` (membership, not subclassing)."""
    item_type = get_type(item)
    return item_type in type_group
def type_is_subclass_of_type_group(item: Any, type_group: Tuple[Type[Any], ...]) -> bool:
    """
    Return True when ``item`` belongs to ``type_group`` in any of three ways:
    it is an instance of one of the types, it is a class that subclasses one
    of them, or its ``get_type`` result is listed in the group.
    """
    if isinstance(item, type_group):
        return True
    if isinstance(item, type) and issubclass(item, type_group):
        return True
    return type_in_type_group(item, type_group)
def get_doc(doc_filename: str) -> str:
    """
    Read ``docstrings/<doc_filename>`` located next to this module.

    The first ``:orphan:`` marker followed by a blank line is stripped from the
    text. Any failure results in a fallback message pointing at the online
    documentation instead of raising.
    """
    try:
        doc_path = os.path.join(current_dir, 'docstrings', doc_filename)
        with open(doc_path, 'r') as doc_file:
            content = doc_file.read()
        return content.replace(':orphan:\n\n', '', 1)
    except Exception:  # pragma: no cover
        return 'Failed to load the docstrings. Please visit: https://zepworks.com/deepdiff/current/'  # pragma: no cover
# Maps a notation letter to a template that, once %-interpolated with the number of
# digits, becomes a str.format spec (e.g. "f" with 2 digits -> '{:.2f}').
number_formatting: Dict[str, str] = {
    "f": r'{:.%sf}',
    "e": r'{:.%se}',
}
def number_to_string(number: Any, significant_digits: int, number_format_notation: Literal['f', 'e'] = 'f') -> Any:
    """
    Convert numbers to string considering significant digits.

    **Parameters**

    number: the value to convert. Values that are not instances of the
        module-level ``numbers`` type group are returned unchanged.
    significant_digits: number of digits kept after the decimal point.
    number_format_notation: 'f' for fixed point or 'e' for exponent notation.

    **Returns**

    The formatted string, or ``number`` itself when it is not a number.

    **Raises**

    ValueError: when ``number_format_notation`` is neither 'f' nor 'e'.
    """
    try:
        using = number_formatting[number_format_notation]
    except KeyError:
        raise ValueError("number_format_notation got invalid value of {}. The valid values are 'f' and 'e'".format(number_format_notation)) from None

    if not isinstance(number, numbers):  # type: ignore
        return number
    elif isinstance(number, Decimal):
        with localcontext() as ctx:
            # Precision = number of integer digits + significant_digits
            # Using number//1 to get the integer part of the number
            ctx.prec = len(str(abs(number // 1))) + significant_digits
            try:
                number = number.quantize(Decimal('0.' + '0' * significant_digits))
            except InvalidDecimalOperation:
                # Sometimes rounding up causes a higher precision to be needed for the quantize operation
                # For example '999.99999999' will become '1000.000000' after quantize
                ctx.prec += 1
                number = number.quantize(Decimal('0.' + '0' * significant_digits))
    elif isinstance(number, Fraction):
        # Convert Fraction to float so that string formatting works on Python < 3.12
        number = round(float(number), significant_digits)
        if significant_digits == 0:
            number = int(number)
    elif isinstance(number, only_complex_number):  # type: ignore
        # Case for complex numbers: round each part separately, then rebuild the value.
        real = number_to_string(
            number=number.real,  # type: ignore
            significant_digits=significant_digits,
            number_format_notation=number_format_notation
        )
        imag = number_to_string(
            number=number.imag,  # type: ignore
            significant_digits=significant_digits,
            number_format_notation=number_format_notation
        )
        # A negative imaginary part already carries its own '-' sign. Unconditionally
        # inserting '+' would produce e.g. "1.00+-2.00j", which the complex
        # constructor rejects as a malformed string.
        sign = '' if str(imag).startswith('-') else '+'
        number = number.__class__("{real}{sign}{imag}j".format(real=real, sign=sign, imag=imag))  # type: ignore
    else:
        number = round(number=number, ndigits=significant_digits)  # type: ignore

        if significant_digits == 0:
            number = int(number)  # type: ignore

    if number == 0.0:
        # Special case for 0: "-0.xx" should compare equal to "0.xx"
        number = abs(number)  # type: ignore

    # Cast number to string
    result = (using % significant_digits).format(number)
    # https://bugs.python.org/issue36622
    if number_format_notation == 'e':
        # Removing leading 0 for exponential part.
        result = re.sub(
            pattern=r'(?<=e(\+|\-))0(?=\d)+',
            repl=r'',
            string=result
        )
    return result
class DeepDiffDeprecationWarning(DeprecationWarning):
    """DeepDiff-specific warning category; use it instead of the built-in DeprecationWarning."""
def cartesian_product(a: Iterable[Tuple[Any, ...]], b: Iterable[Any]) -> Iterator[Tuple[Any, ...]]:
    """
    Lazily yield every tuple of ``a`` extended by every element of ``b``.

    **parameters**

    a: iterable of tuples (the prefixes)
    b: iterable whose elements are appended, one at a time, to each prefix;
       it is iterated once per prefix, so it must be re-iterable
    """
    for prefix in a:
        yield from (prefix + (element,) for element in b)
def cartesian_product_of_shape(dimentions: Iterable[int], result: Optional[Tuple[Tuple[Any, ...], ...]] = None) -> Iterator[Tuple[Any, ...]]:
    """
    Iterate over every index tuple addressable within the given dimensions.

    Each dimension ``d`` contributes ``range(d)`` as one coordinate. This is
    mainly used to traverse Numpy ndarrays, whose dimensions are given by
    ``ndarray.shape``. ``result`` may seed the product with existing prefixes;
    by default the product starts from a single empty tuple.
    """
    combos = ((),) if result is None else result  # default: a tuple with an empty tuple
    for size in dimentions:
        combos = tuple(cartesian_product(combos, range(size)))
    return iter(combos)
def get_numpy_ndarray_rows(obj: Any, shape: Optional[Tuple[int, ...]] = None) -> Generator[Tuple[Tuple[int, ...], Any], None, None]:
    """
    Walk a multi dimensional numpy array row by row.

    Yields ``(index_path, row)`` pairs, where ``index_path`` is the tuple of
    indexes over every dimension except the last one and ``row`` is what
    remains after indexing ``obj`` with each of them in turn. ``shape``
    defaults to ``obj.shape``.
    """
    if shape is None:
        shape = obj.shape  # type: ignore
    # Every dimension except the last identifies a row.
    leading_dims = shape[:-1] if shape else ()
    for index_path in cartesian_product_of_shape(leading_dims):
        row = obj
        for axis_index in index_path:
            row = row[axis_index]
        yield index_path, row
class _NotFound:
    """Sentinel type whose instances never compare equal to anything, including themselves."""
    def __eq__(self, other: Any) -> bool:
        return False
    # NOTE(review): `__req__` is not a special method name recognised by Python, so this
    # alias is only reachable when called explicitly. Also note that defining __eq__
    # without __hash__ makes instances unhashable.
    __req__ = __eq__
    def __repr__(self) -> str:
        return 'not found'
    # str() and repr() share the same implementation.
    __str__ = __repr__
# Shared sentinel instance of _NotFound.
not_found = _NotFound()
# Show each distinct DeepDiffDeprecationWarning only once per process.
warnings.simplefilter('once', DeepDiffDeprecationWarning)
class RepeatedTimer:
    """
    Threaded Repeated Timer by MestreLion
    https://stackoverflow.com/a/38317060/1497443

    Calls ``function(*args, **kwargs)`` every ``interval`` seconds on a
    ``threading.Timer`` until ``stop()`` is called. A ``duration`` keyword
    (whole seconds elapsed since construction) is refreshed in ``kwargs``
    each time the timer is (re)armed. The timer starts on construction.
    """

    def __init__(self, interval: float, function: Callable[..., Any], *args: Any, **kwargs: Any) -> None:
        self._timer = None
        self.interval = interval
        self.function = function
        self.args = args
        self.start_time = time.time()
        self.kwargs = kwargs
        self.is_running = False
        self.start()

    def _get_duration_sec(self) -> int:
        """Whole seconds elapsed since the timer was constructed."""
        elapsed = time.time() - self.start_time
        return int(elapsed)

    def _run(self) -> None:
        """Timer callback: re-arm the timer first, then invoke the user function."""
        self.is_running = False
        self.start()
        self.function(*self.args, **self.kwargs)

    def start(self) -> None:
        """Refresh the ``duration`` kwarg and arm a new Timer unless one is already pending."""
        self.kwargs.update(duration=self._get_duration_sec())
        if self.is_running:
            return
        self._timer = Timer(self.interval, self._run)
        self._timer.start()
        self.is_running = True

    def stop(self) -> int:
        """Cancel the pending timer, if any, and return the elapsed whole seconds."""
        elapsed_seconds = self._get_duration_sec()
        if self._timer is not None:
            self._timer.cancel()
        self.is_running = False
        return elapsed_seconds
def _eval_decimal(params: str) -> Decimal:
    """Build a Decimal from the text found between ``Decimal(`` and ``)``."""
    return Decimal(params)
def _eval_datetime(params: str) -> datetime.datetime:
params_with_parens = f'({params})'
params_tuple = literal_eval(params_with_parens)
return datetime.datetime(*params_tuple) # type: ignore
def _eval_date(params: str) -> datetime.date:
params_with_parens = f'({params})'
params_tuple = literal_eval(params_with_parens)
return datetime.date(*params_tuple) # type: ignore
# (prefix, suffix, converter) triples consulted by literal_eval_extended when plain
# literal_eval fails: a string wrapped in prefix/suffix has its inner text handed to the converter.
LITERAL_EVAL_PRE_PROCESS: List[Tuple[str, str, Callable[[str], Any]]] = [
    ('Decimal(', ')', _eval_decimal),
    ('datetime.datetime(', ')', _eval_datetime),
    ('datetime.date(', ')', _eval_date),
]
def literal_eval_extended(item: str) -> Any:
    """
    An extended version of literal_eval.

    Plain ``literal_eval`` is tried first. When it raises SyntaxError or
    ValueError, the entries of LITERAL_EVAL_PRE_PROCESS are checked in order
    and the first one whose prefix and suffix wrap ``item`` converts the inner
    text. If none matches, the original exception is re-raised.
    """
    try:
        return literal_eval(item)
    except (SyntaxError, ValueError):
        for prefix, suffix, converter in LITERAL_EVAL_PRE_PROCESS:
            if not (item.startswith(prefix) and item.endswith(suffix)):
                continue
            # Extracting and removing extra quotes so for example "Decimal('10.1')" becomes "'10.1'" and then '10.1'
            inner = item[len(prefix): -len(suffix)]
            return converter(inner.strip('\'\"'))
        raise
def time_to_seconds(t: datetime.time) -> int:
    """Return the whole seconds since midnight for ``t``; microseconds and tzinfo are not considered."""
    return t.hour * 3600 + t.minute * 60 + t.second
def datetime_normalize(
    truncate_datetime:Union[str, None],
    obj:Union[datetime.datetime, datetime.time],
    default_timezone: Union[
        datetime.timezone, "BaseTzInfo"
    ] = datetime.timezone.utc,
) -> Any:
    """
    Normalise a datetime or time value.

    When ``truncate_datetime`` is 'second', 'minute', 'hour' or 'day', every
    field finer than that unit is zeroed first. Afterwards a datetime is
    converted to ``default_timezone`` if it is timezone-aware, or simply
    labelled with it if naive; a time is converted to seconds since midnight.
    Any other object is returned as is.
    """
    if truncate_datetime:
        # Each unit maps to the fields that must be reset to zero.
        truncations = (
            ('second', {'microsecond': 0}),
            ('minute', {'second': 0, 'microsecond': 0}),
            ('hour', {'minute': 0, 'second': 0, 'microsecond': 0}),
            ('day', {'hour': 0, 'minute': 0, 'second': 0, 'microsecond': 0}),
        )
        for unit, zeroed_fields in truncations:
            if truncate_datetime == unit:
                obj = obj.replace(**zeroed_fields)
                break
    if isinstance(obj, datetime.datetime):
        if has_timezone(obj):
            obj = obj.astimezone(default_timezone)
        else:
            obj = obj.replace(tzinfo=default_timezone)
        return obj
    if isinstance(obj, datetime.time):
        return time_to_seconds(obj)
    return obj
def has_timezone(dt: datetime.datetime) -> bool:
    """
    Tell whether a datetime object is truly timezone-aware.

    Having a non-None ``tzinfo`` is not enough: some tzinfo objects exist but
    provide no meaningful UTC offset. The datetime only counts as aware when
    ``dt.tzinfo.utcoffset(dt)`` also returns something other than None.
    """
    tz = dt.tzinfo
    if tz is None:
        return False
    return tz.utcoffset(dt) is not None
def get_truncate_datetime(truncate_datetime: Union[str, None]) -> Union[str, None]:
    """
    Validate the truncate_datetime value and return it unchanged.

    Raises ValueError unless the value is None, 'second', 'minute', 'hour'
    or 'day'.
    """
    allowed_values = {None, 'second', 'minute', 'hour', 'day'}
    if truncate_datetime in allowed_values:
        return truncate_datetime
    raise ValueError("truncate_datetime must be second, minute, hour or day")
def cartesian_product_numpy(*arrays: Any) -> Any:
    """
    Cartesian product of Numpy arrays by Paul Panzer
    https://stackoverflow.com/a/49445693/1497443

    Returns a 2D array with one row per combination and one column per input
    array; the dtype is the common ``np.result_type`` of the inputs.
    """
    count = len(arrays)
    common_dtype = np.result_type(*arrays)  # type: ignore
    # One leading axis selecting the input array, then one axis per input array.
    grid = np.empty((count, *map(len, arrays)), dtype=common_dtype)  # type: ignore
    selector = slice(None), *repeat(None, count)
    for position, values in enumerate(arrays):
        # Trailing None entries add new axes so `values` broadcasts along its own axis.
        grid[position, ...] = values[selector[:count - position]]
    return grid.reshape(count, -1).T
def diff_numpy_array(A: Any, B: Any) -> Any:
    """
    Numpy Array A - B

    Return the items of A that are not in B, keeping A's order.
    By Divakar
    https://stackoverflow.com/a/52417967/1497443
    """
    present_in_b = np.isin(A, B)  # type: ignore
    return A[~present_in_b]
# Numpy element type used for a homogeneous sequence of each Python type.
# Note that Decimal maps to np_float64, the same entry as float.
PYTHON_TYPE_TO_NUMPY_TYPE: Dict[Type[Any], Type[Any]] = {
    int: np_int64,
    float: np_float64,
    Decimal: np_float64
}
def get_homogeneous_numpy_compatible_type_of_seq(seq: Sequence[Any]) -> Union[Type[Any], Literal[False]]:
    """
    Return with the numpy dtype if the array can be converted to a non-object numpy array.
    Originally written by mgilson https://stackoverflow.com/a/13252348/1497443
    This is the modified version.

    The sequence qualifies only when every element has exactly the same type
    (subclasses such as bool do not count) and that type is int, float or
    Decimal. In that case the matching entry of PYTHON_TYPE_TO_NUMPY_TYPE is
    returned; otherwise, including for an empty sequence, False is returned.
    """
    iseq = iter(seq)
    try:
        first_type = type(next(iseq))
    except StopIteration:
        # An empty sequence has no element type to report; without this guard
        # the StopIteration would leak out to the caller.
        return False
    if first_type not in {int, float, Decimal}:
        return False
    if not all(type(x) is first_type for x in iseq):
        return False
    return PYTHON_TYPE_TO_NUMPY_TYPE.get(first_type, False)
def detailed__dict__(obj: Any, ignore_private_variables: bool = True, ignore_keys: FrozenSet[str] = frozenset(), include_keys: Optional[List[str]] = None) -> Dict[str, Any]:
    """
    Get the detailed dictionary of an object.
    This is used so we retrieve object properties too.

    With ``include_keys`` only those attributes are looked up; attributes that
    cannot be read are skipped and callables are left out (except
    ``__objclass__``, kept for backward compatibility).

    Without ``include_keys`` the result starts from a shallow copy of
    ``obj.__dict__`` minus ignored keys, and is then completed with every
    non-callable attribute listed by ``dir(obj)``. When
    ``ignore_private_variables`` is set, names starting with a double
    underscore are dropped; name-mangled ``_<Class>__x`` attributes already in
    ``__dict__`` are kept but are not added again from ``dir(obj)``.
    """
    if include_keys:
        selected = {}
        for key in include_keys:
            try:
                value = getattr(obj, key)
            except Exception:
                continue
            # We don't want to compare functions, however for backward compatibility, __objclass__ needs to be reported.
            if key == '__objclass__' or not callable(value):
                selected[key] = value
        return selected

    result = obj.__dict__.copy()  # A shallow copy
    private_var_prefix = f"_{obj.__class__.__name__}__"  # The semi private variables in Python get this prefix

    def _drop_from_dict(name):
        # Ignored explicitly, or a dunder name that is not a mangled private variable.
        if name in ignore_keys:
            return True
        return ignore_private_variables and name.startswith('__') and not name.startswith(private_var_prefix)

    for key in [name for name in obj.__dict__ if _drop_from_dict(name)]:
        del result[key]

    if isinstance(obj, PydanticBaseModel):
        # For pydantic models the attribute is read from the class, not the instance.
        def getter(instance, name):
            return getattr(type(instance), name)
    else:
        getter = getattr

    for key in dir(obj):
        if key in result or key in ignore_keys:
            continue
        if ignore_private_variables and (key.startswith('__') or key.startswith(private_var_prefix)):
            continue
        value = getter(obj, key)
        if not callable(value):
            result[key] = value
    return result
def named_tuple_repr(self: NamedTuple) -> str:
    """
    Compact repr for named tuples: ``ClassName(field=value, ...)``.

    Fields that declare a default and still hold it are omitted; fields
    without a default are always shown.
    """
    defaults = self._field_defaults
    shown = [
        f"{field}={value!r}"
        for field, value in self._asdict().items()
        # Only include fields that do not have their default value
        if field not in defaults or value != defaults[field]
    ]
    return f"{self.__class__.__name__}({', '.join(shown)})"
class OpcodeTag(EnumBase):
    """Tags naming the kind of edit an opcode describes: insert, delete, equal or replace."""
    insert = 'insert'
    delete = 'delete'
    equal = 'equal'
    replace = 'replace'  # type: ignore
    # swapped = 'swapped'  # in the future we should support reporting of items swapped with each other
class Opcode(NamedTuple):
    """
    One edit operation relating an index range of t1 to an index range of t2.

    NOTE(review): the from/to pairs look like slice bounds in the style of
    difflib opcodes — confirm whether the "to" index is exclusive.
    """
    # Kind of operation; presumably one of the OpcodeTag values.
    tag: str
    t1_from_index: int
    t1_to_index: int
    t2_from_index: int
    t2_to_index: int
    # Optional payloads; both default to None.
    old_values: Optional[List[Any]] = None
    new_values: Optional[List[Any]] = None
    # Compact repr that leaves out fields still holding their default value.
    __repr__ = __str__ = named_tuple_repr
class FlatDataAction(EnumBase):
    """
    Action names used in flat delta rows.

    Each member's value equals its name. Members marked "opcode" are the ones
    mapped from OpcodeTag by OPCODE_TAG_TO_FLAT_DATA_ACTION.
    """
    values_changed = 'values_changed'
    type_changes = 'type_changes'
    set_item_added = 'set_item_added'
    set_item_removed = 'set_item_removed'
    dictionary_item_added = 'dictionary_item_added'
    dictionary_item_removed = 'dictionary_item_removed'
    iterable_item_added = 'iterable_item_added'
    iterable_item_removed = 'iterable_item_removed'
    iterable_item_moved = 'iterable_item_moved'
    iterable_items_inserted = 'iterable_items_inserted'  # opcode
    iterable_items_deleted = 'iterable_items_deleted'  # opcode
    iterable_items_replaced = 'iterable_items_replaced'  # opcode
    iterable_items_equal = 'iterable_items_equal'  # opcode
    attribute_removed = 'attribute_removed'
    attribute_added = 'attribute_added'
    unordered_iterable_item_added = 'unordered_iterable_item_added'
    unordered_iterable_item_removed = 'unordered_iterable_item_removed'
    initiated = "initiated"
# Translation from an opcode tag to the flat-data action that represents it.
OPCODE_TAG_TO_FLAT_DATA_ACTION = {
    OpcodeTag.insert: FlatDataAction.iterable_items_inserted,
    OpcodeTag.delete: FlatDataAction.iterable_items_deleted,
    OpcodeTag.replace: FlatDataAction.iterable_items_replaced,
    OpcodeTag.equal: FlatDataAction.iterable_items_equal,
}
# Inverse of the mapping above (flat-data action -> opcode tag).
FLAT_DATA_ACTION_TO_OPCODE_TAG = {v: i for i, v in OPCODE_TAG_TO_FLAT_DATA_ACTION.items()}
# Sentinel default for FlatDeltaRow fields, distinguishing "not provided" from a real None.
# (The misspelled name is part of the public interface and is kept as is.)
UnkownValueCode: str = 'unknown___'
class FlatDeltaRow(NamedTuple):
    """
    One row of a flattened delta: a path, the action applied there, and
    optional details.

    The value/type fields default to the UnkownValueCode sentinel rather than
    None; the remaining optional fields default to None.
    """
    path: List
    action: FlatDataAction
    value: Optional[Any] = UnkownValueCode
    old_value: Optional[Any] = UnkownValueCode
    type: Optional[Any] = UnkownValueCode
    old_type: Optional[Any] = UnkownValueCode
    new_path: Optional[List] = None
    # Index ranges; NOTE(review): presumably only meaningful for opcode-style actions — confirm.
    t1_from_index: Optional[int] = None
    t1_to_index: Optional[int] = None
    t2_from_index: Optional[int] = None
    t2_to_index: Optional[int] = None
    # Compact repr that leaves out fields still holding their default value.
    __repr__ = __str__ = named_tuple_repr
class _FlatDeltaDictRequired(TypedDict):
    """Keys that every FlatDeltaDict must contain."""
    path: List
    action: FlatDataAction
class FlatDeltaDict(_FlatDeltaDictRequired, total=False):
    """
    Dictionary counterpart of FlatDeltaRow: ``path`` and ``action`` are
    required (inherited), every key declared here is optional (total=False).
    """
    value: Optional[Any]
    old_value: Optional[Any]
    type: Optional[Any]
    old_type: Optional[Any]
    new_path: Optional[List]
    t1_from_index: Optional[int]
    t1_to_index: Optional[int]
    t2_from_index: Optional[int]
    t2_to_index: Optional[int]
# Recursive type alias describing JSON-compatible values (uses the forward reference "JSON").
JSON = Union[Dict[str, str], List[str], List[int], Dict[str, "JSON"], List["JSON"], str, int, float, bool, None]
class SummaryNodeType(EnumBase):
    """Node kinds: dict, list or leaf. NOTE(review): presumably used when summarising nested data — confirm against callers."""
    dict = 'dict'
    list = 'list'
    leaf = 'leaf'
qlustered-deepdiff-41c7265/deepdiff/lfucache.py 0000664 0000000 0000000 00000015711 15162412645 0021450 0 ustar 00root root 0000000 0000000 """
LFU cache Written by Shane Wang
https://medium.com/@epicshane/a-python-implementation-of-lfu-least-frequently-used-cache-with-o-1-time-complexity-e16b34a3c49b
https://github.com/luxigner/lfu_cache
Modified by Sep Dehpour
"""
from collections import defaultdict
from threading import Lock
from statistics import mean
from deepdiff.helper import not_found, dict_, SetOrdered
class CacheNode:
    """
    A cached entry, stored as a node of the doubly linked list owned by a FreqNode.

    When ``report_type`` is truthy the content is a ``defaultdict(SetOrdered)``
    with ``value`` added under that report type; otherwise the content is the
    value itself.
    """

    def __init__(self, key, report_type, value, freq_node, pre, nxt):
        self.key = key
        if report_type:
            content = defaultdict(SetOrdered)
            content[report_type].add(value)
        else:
            content = value
        self.content = content
        self.freq_node = freq_node
        self.pre = pre  # previous CacheNode
        self.nxt = nxt  # next CacheNode

    def free_myself(self):
        """Unlink this node from its FreqNode's list and clear all of its own links."""
        owner = self.freq_node
        if owner.cache_head == owner.cache_tail:  # type: ignore
            # Single node in the list: the list becomes empty.
            owner.cache_head = owner.cache_tail = None  # type: ignore
        elif owner.cache_head == self:  # type: ignore
            self.nxt.pre = None  # type: ignore
            owner.cache_head = self.nxt  # type: ignore
        elif owner.cache_tail == self:  # type: ignore
            self.pre.nxt = None  # type: ignore
            owner.cache_tail = self.pre  # type: ignore
        else:
            # Somewhere in the middle: connect the neighbours to each other.
            self.pre.nxt = self.nxt  # type: ignore
            self.nxt.pre = self.pre  # type: ignore
        self.pre = None
        self.nxt = None
        self.freq_node = None
class FreqNode:
    """
    A node of the frequency list. It holds a doubly linked list of the cache
    nodes sharing the frequency ``freq`` (``cache_head`` .. ``cache_tail``) and
    links to the neighbouring frequency nodes (``pre`` / ``nxt``).
    """

    def __init__(self, freq, pre, nxt):
        self.freq = freq
        self.pre = pre  # previous FreqNode
        self.nxt = nxt  # next FreqNode
        self.cache_head = None  # CacheNode head under this FreqNode
        self.cache_tail = None  # CacheNode tail under this FreqNode

    def count_caches(self):
        """Return 0, 1 or the string '2+' depending on how many cache nodes are attached."""
        if self.cache_head is None and self.cache_tail is None:
            return 0
        if self.cache_head == self.cache_tail:
            return 1
        return '2+'

    def remove(self):
        """Unlink this node from the frequency list, clear its links and return ``(pre, nxt)``."""
        before, after = self.pre, self.nxt
        if before is not None:
            before.nxt = after
        if after is not None:
            after.pre = before
        self.pre = self.nxt = self.cache_head = self.cache_tail = None
        return (before, after)

    def pop_head_cache(self):
        """Detach and return the head cache node, or None when no cache node is attached."""
        head = self.cache_head
        if head is None and self.cache_tail is None:
            return None
        if head == self.cache_tail:
            # Only one node: the list becomes empty.
            self.cache_head = self.cache_tail = None
            return head
        successor = head.nxt  # type: ignore
        successor.pre = None  # type: ignore
        self.cache_head = successor
        return head

    def append_cache_to_tail(self, cache_node):
        """Take ownership of ``cache_node`` and append it after the current tail."""
        cache_node.freq_node = self
        if self.cache_head is None and self.cache_tail is None:
            self.cache_head = self.cache_tail = cache_node
            return
        tail = self.cache_tail
        cache_node.pre = tail
        cache_node.nxt = None
        tail.nxt = cache_node  # type: ignore
        self.cache_tail = cache_node

    def insert_after_me(self, freq_node):
        """Link ``freq_node`` into the frequency list directly after this node."""
        follower = self.nxt
        freq_node.pre = self
        freq_node.nxt = follower
        if follower is not None:
            follower.pre = freq_node
        self.nxt = freq_node

    def insert_before_me(self, freq_node):
        """Link ``freq_node`` into the frequency list directly before this node."""
        predecessor = self.pre
        if predecessor is not None:
            predecessor.nxt = freq_node
        freq_node.pre = predecessor
        freq_node.nxt = self
        self.pre = freq_node
class LFUCache:
    """
    Least-frequently-used cache.

    ``self.cache`` maps each key to its CacheNode. Cache nodes hang off
    FreqNode objects, which form a linked list starting at
    ``self.freq_link_head``. ``get`` and ``set`` run under ``self.lock``;
    the remaining methods do not acquire it themselves.
    """
    def __init__(self, capacity):
        """Create an empty cache; raises ValueError unless ``capacity`` is positive."""
        self.cache = dict_()  # {key: cache_node}
        if capacity <= 0:
            raise ValueError('Capacity of LFUCache needs to be positive.')  # pragma: no cover.
        self.capacity = capacity
        self.freq_link_head = None
        self.lock = Lock()
    def get(self, key):
        """Return the content stored for ``key`` and bump its frequency, or ``not_found`` if absent."""
        with self.lock:
            if key in self.cache:
                cache_node = self.cache[key]
                freq_node = cache_node.freq_node
                content = cache_node.content
                self.move_forward(cache_node, freq_node)
                return content
            else:
                return not_found
    def set(self, key, report_type=None, value=None):
        """
        Store ``value`` under ``key``.

        For an existing key the content is updated in place (added under
        ``report_type`` when given, replaced otherwise) without changing its
        frequency. For a new key one entry is evicted first when the cache is
        at capacity.
        """
        with self.lock:
            if key in self.cache:
                cache_node = self.cache[key]
                if report_type:
                    cache_node.content[report_type].add(value)
                else:
                    cache_node.content = value
            else:
                if len(self.cache) >= self.capacity:
                    self.dump_cache()
                self.create_cache_node(key, report_type, value)
    def __contains__(self, key):
        return key in self.cache
    def move_forward(self, cache_node, freq_node):
        """Move ``cache_node`` from ``freq_node`` to the node for frequency ``freq + 1``, creating it if needed."""
        if freq_node.nxt is None or freq_node.nxt.freq != freq_node.freq + 1:
            target_freq_node = FreqNode(freq_node.freq + 1, None, None)
            target_empty = True
        else:
            target_freq_node = freq_node.nxt
            target_empty = False
        cache_node.free_myself()
        target_freq_node.append_cache_to_tail(cache_node)
        # A freshly created frequency node still has to be linked into the list.
        if target_empty:
            freq_node.insert_after_me(target_freq_node)
        # Drop the old frequency node once it holds no cache nodes any more.
        if freq_node.count_caches() == 0:
            if self.freq_link_head == freq_node:
                self.freq_link_head = target_freq_node
            freq_node.remove()
    def dump_cache(self):
        """Evict one entry: the head cache node of the first frequency node."""
        head_freq_node = self.freq_link_head
        self.cache.pop(head_freq_node.cache_head.key)  # type: ignore
        head_freq_node.pop_head_cache()  # type: ignore
        if head_freq_node.count_caches() == 0:  # type: ignore
            self.freq_link_head = head_freq_node.nxt  # type: ignore
            head_freq_node.remove()  # type: ignore
    def create_cache_node(self, key, report_type, value):
        """Create a CacheNode for ``key`` and attach it to the frequency-0 node, creating that node if needed."""
        cache_node = CacheNode(
            key=key, report_type=report_type,
            value=value, freq_node=None, pre=None, nxt=None)
        self.cache[key] = cache_node
        if self.freq_link_head is None or self.freq_link_head.freq != 0:
            new_freq_node = FreqNode(0, None, None)
            new_freq_node.append_cache_to_tail(cache_node)
            if self.freq_link_head is not None:
                self.freq_link_head.insert_before_me(new_freq_node)
            self.freq_link_head = new_freq_node
        else:
            self.freq_link_head.append_cache_to_tail(cache_node)
    def get_sorted_cache_keys(self):
        """Return ``(key, frequency)`` pairs sorted by descending frequency."""
        result = [(i, freq.freq_node.freq) for i, freq in self.cache.items()]
        result.sort(key=lambda x: -x[1])
        return result
    def get_average_frequency(self):
        """Return the mean frequency over all cached entries (``statistics.mean`` raises on an empty cache)."""
        return mean(freq.freq_node.freq for freq in self.cache.values())
class DummyLFU:
    """No-op stand-in exposing the ``set`` / ``get`` / ``in`` interface of LFUCache; it stores nothing."""
    def __init__(self, *args, **kwargs):
        pass
    # set() and get() reuse the no-op initializer: they accept any arguments and return None.
    set = __init__
    get = __init__
    def __contains__(self, key):
        return False
qlustered-deepdiff-41c7265/deepdiff/model.py 0000664 0000000 0000000 00000126205 15162412645 0020777 0 ustar 00root root 0000000 0000000 import logging
from collections.abc import Mapping
from copy import copy
from typing import Any, Dict, List, Optional, Set, Union, Literal, Type, TYPE_CHECKING
from deepdiff.helper import (
RemapDict, strings, notpresent, get_type, numpy_numbers, np, literal_eval_extended,
dict_, SetOrdered)
from deepdiff.path import stringify_element
if TYPE_CHECKING:
from deepdiff.diff import DeepDiff
logger = logging.getLogger(__name__)
# Value passed as `force=` to path() calls throughout this module when building reports.
FORCE_DEFAULT: Literal['fake'] = 'fake'
# Maps each direction name to its opposite.
UP_DOWN: Dict[str, str] = {'up': 'down', 'down': 'up'}
# Built-in report types. TreeResult pre-creates an entry for each of them, and
# TextResult treats any other key holding a SetOrdered as a custom report.
REPORT_KEYS: Set[str] = {
    "type_changes",
    "dictionary_item_added",
    "dictionary_item_removed",
    "values_changed",
    "unprocessed",
    "iterable_item_added",
    "iterable_item_removed",
    "iterable_item_moved",
    "attribute_added",
    "attribute_removed",
    "set_item_removed",
    "set_item_added",
    "repetition_change",
}
# Key under which a level's `additional` dict carries the payload of custom reports.
CUSTOM_FIELD: str = "__internal:custom:extra_info"
class DoesNotExist(Exception):
    """Exception type with no behaviour beyond Exception itself."""
class ResultDict(RemapDict):
    """Dictionary base class shared by the result containers."""

    def remove_empty_keys(self) -> None:
        """
        Remove empty keys from this object. Should always be called after the result is final.

        A key counts as empty when its value is falsy and is not an int
        (so integer values, including 0, are always kept).
        :return:
        """
        # Collect first: deleting while iterating over items() is not allowed.
        keys_to_drop = [key for key, value in self.items() if not (isinstance(value, int) or value)]
        for key in keys_to_drop:
            del self[key]
class TreeResult(ResultDict):
    """
    Tree-view result container: maps each report type to a SetOrdered of levels.

    Every key in REPORT_KEYS is pre-created as an empty SetOrdered, and
    subscripting (``result[key]``) creates a missing entry on the fly.
    """

    def __init__(self) -> None:
        for key in REPORT_KEYS:
            self[key] = SetOrdered()

    def mutual_add_removes_to_become_value_changes(self) -> None:
        """
        There might be the same paths reported in the results as removed and added.
        In such cases they should be reported as value_changes.

        Note that this function mutates the tree in ways that causes issues when report_repetition=True
        and should be avoided in that case.

        This function should only be run on the Tree Result.
        """
        iterable_item_added = self.get('iterable_item_added')
        iterable_item_removed = self.get('iterable_item_removed')
        if iterable_item_added is not None and iterable_item_removed is not None:
            added_paths = {i.path(): i for i in iterable_item_added}
            removed_paths = {i.path(): i for i in iterable_item_removed}
            # Walk the removed paths in their stored order so the resulting
            # 'values_changed' order does not depend on set/hash ordering.
            mutual_paths = [path for path in removed_paths if path in added_paths]

            # Only create the container when there is something to move. dict.get is
            # used on purpose: subscripting would go through __getitem__, which
            # creates the key as a side effect even when no change is recorded.
            if mutual_paths and self.get('values_changed') is None:
                self['values_changed'] = SetOrdered()
            for path in mutual_paths:
                level_before = removed_paths[path]
                iterable_item_removed.remove(level_before)
                level_after = added_paths[path]
                iterable_item_added.remove(level_after)
                # Merge the pair into a single level carrying both the old and new value.
                level_before.t2 = level_after.t2
                self['values_changed'].add(level_before)  # type: ignore
                level_before.report_type = 'values_changed'
        if 'iterable_item_removed' in self and not iterable_item_removed:
            del self['iterable_item_removed']
        if 'iterable_item_added' in self and not iterable_item_added:
            del self['iterable_item_added']

    def __getitem__(self, item: str) -> SetOrdered:
        """Return the SetOrdered stored under ``item``, creating it when missing or None."""
        if item not in self:
            self[item] = SetOrdered()
        result = self.get(item)
        if result is None:
            result = SetOrdered()
            self[item] = result
        return result

    def __len__(self) -> int:
        """Total number of reported levels: SetOrdered sizes plus one for each int value."""
        length = 0
        for value in self.values():
            if isinstance(value, SetOrdered):
                length += len(value)
            elif isinstance(value, int):
                length += 1
        return length
class TextResult(ResultDict):
ADD_QUOTES_TO_STRINGS: bool = True
def __init__(self, tree_results: Optional['TreeResult'] = None, verbose_level: int = 1) -> None:
self.verbose_level = verbose_level
# TODO: centralize keys
self.update({
"type_changes": dict_(),
"dictionary_item_added": self.__set_or_dict(),
"dictionary_item_removed": self.__set_or_dict(),
"values_changed": dict_(),
"unprocessed": [],
"iterable_item_added": dict_(),
"iterable_item_removed": dict_(),
"iterable_item_moved": dict_(),
"attribute_added": self.__set_or_dict(),
"attribute_removed": self.__set_or_dict(),
"set_item_removed": SetOrdered(),
"set_item_added": SetOrdered(),
"repetition_change": dict_()
})
if tree_results:
self._from_tree_results(tree_results)
def __set_or_dict(self) -> Union[Dict[str, Any], SetOrdered]:
return {} if self.verbose_level >= 2 else SetOrdered()
def _from_tree_results(self, tree: 'TreeResult') -> None:
"""
Populate this object by parsing an existing reference-style result dictionary.
:param tree: A TreeResult
:return:
"""
self._from_tree_type_changes(tree)
self._from_tree_default(tree, 'dictionary_item_added')
self._from_tree_default(tree, 'dictionary_item_removed')
self._from_tree_value_changed(tree)
self._from_tree_unprocessed(tree)
self._from_tree_default(tree, 'iterable_item_added')
self._from_tree_default(tree, 'iterable_item_removed')
self._from_tree_iterable_item_moved(tree)
self._from_tree_default(tree, 'attribute_added')
self._from_tree_default(tree, 'attribute_removed')
self._from_tree_set_item_removed(tree)
self._from_tree_set_item_added(tree)
self._from_tree_repetition_change(tree)
self._from_tree_deep_distance(tree)
self._from_tree_custom_results(tree)
def _from_tree_default(self, tree: 'TreeResult', report_type: str, ignore_if_in_iterable_opcodes: bool = False) -> None:
if report_type in tree:
for change in tree[report_type]: # report each change
# When we convert from diff to delta result, we care more about opcodes than iterable_item_added or removed
if (
ignore_if_in_iterable_opcodes
and report_type in {"iterable_item_added", "iterable_item_removed"}
and change.up.path(force=FORCE_DEFAULT) in self["_iterable_opcodes"]
):
continue
# determine change direction (added or removed)
# Report t2 (the new one) whenever possible.
# In cases where t2 doesn't exist (i.e. stuff removed), report t1.
if change.t2 is not notpresent:
item = change.t2
else:
item = change.t1
# do the reporting
report = self[report_type]
if isinstance(report, SetOrdered):
report.add(change.path(force=FORCE_DEFAULT))
elif isinstance(report, dict):
report[change.path(force=FORCE_DEFAULT)] = item
elif isinstance(report, list): # pragma: no cover
# we don't actually have any of those right now, but just in case
report.append(change.path(force=FORCE_DEFAULT))
else: # pragma: no cover
# should never happen
raise TypeError("Cannot handle {} report container type.".
format(report))
def _from_tree_type_changes(self, tree):
if 'type_changes' in tree:
for change in tree['type_changes']:
path = change.path(force=FORCE_DEFAULT)
if type(change.t1) is type:
include_values = False
old_type = change.t1
new_type = change.t2
else:
include_values = True
old_type = get_type(change.t1)
new_type = get_type(change.t2)
remap_dict = RemapDict({
'old_type': old_type,
'new_type': new_type,
})
if self.verbose_level > 1:
new_path = change.path(use_t2=True, force=FORCE_DEFAULT)
if path != new_path:
remap_dict['new_path'] = new_path
self['type_changes'][path] = remap_dict
if self.verbose_level and include_values:
remap_dict.update(old_value=change.t1, new_value=change.t2)
def _from_tree_value_changed(self, tree):
if 'values_changed' in tree and self.verbose_level > 0:
for change in tree['values_changed']:
path = change.path(force=FORCE_DEFAULT)
the_changed = {'new_value': change.t2, 'old_value': change.t1}
if self.verbose_level > 1:
new_path = change.path(use_t2=True, force=FORCE_DEFAULT)
if path != new_path:
the_changed['new_path'] = new_path
self['values_changed'][path] = the_changed
if 'diff' in change.additional:
the_changed.update({'diff': change.additional['diff']})
def _from_tree_iterable_item_moved(self, tree):
if 'iterable_item_moved' in tree and self.verbose_level > 1:
for change in tree['iterable_item_moved']:
the_changed = {'new_path': change.path(use_t2=True, reporting_move=True), 'value': change.t2}
self['iterable_item_moved'][change.path(
force=FORCE_DEFAULT, use_t2=False, reporting_move=True)] = the_changed
def _from_tree_unprocessed(self, tree):
if 'unprocessed' in tree:
for change in tree['unprocessed']:
self['unprocessed'].append("{}: {} and {}".format(change.path(
force=FORCE_DEFAULT), change.t1, change.t2))
def _from_tree_set_item_added_or_removed(self, tree, key):
if key in tree:
set_item_info = self[key]
is_dict = isinstance(set_item_info, Mapping)
for change in tree[key]:
path = change.up.path(
) # we want't the set's path, the added item is not directly accessible
item = change.t2 if key == 'set_item_added' else change.t1
if self.ADD_QUOTES_TO_STRINGS and isinstance(item, strings):
item = "'%s'" % item
if is_dict:
if path not in set_item_info:
set_item_info[path] = set() # type: ignore
set_item_info[path].add(item)
else:
set_item_info.add("{}[{}]".format(path, str(item)))
# this syntax is rather peculiar, but it's DeepDiff 2.x compatible)
def _from_tree_set_item_added(self, tree):
self._from_tree_set_item_added_or_removed(tree, key='set_item_added')
def _from_tree_set_item_removed(self, tree):
self._from_tree_set_item_added_or_removed(tree, key='set_item_removed')
def _from_tree_repetition_change(self, tree):
    """
    Report repetition changes, keyed by path.

    Each entry is a RemapDict built from the level's additional['repetition'] data,
    extended with the repeated object (t1) under 'value'.
    """
    if 'repetition_change' not in tree:
        return
    for level in tree['repetition_change']:
        location = level.path(force=FORCE_DEFAULT)
        self['repetition_change'][location] = RemapDict(
            level.additional['repetition']
        )
        # Look the stored entry up again, exactly as stored, before adding the value.
        self['repetition_change'][location]['value'] = level.t1
def _from_tree_deep_distance(self, tree):
if 'deep_distance' in tree:
self['deep_distance'] = tree['deep_distance']
def _from_tree_custom_results(self, tree):
    """
    Copy custom (non-standard) report types from tree into self.

    A tree entry is treated as custom when its key is not in REPORT_KEYS and its value
    is a SetOrdered. For each such key, self[key] becomes a dict mapping every level's
    path to that level's additional[CUSTOM_FIELD] data (an empty dict when absent).
    """
    for report_key, levels in tree.items():
        if report_key in REPORT_KEYS or not isinstance(levels, SetOrdered):
            continue
        # levels is a collection of DiffLevel objects
        custom_report = {}
        for level in levels:
            payload = level.additional.get(CUSTOM_FIELD, {})
            custom_report[level.path(force=FORCE_DEFAULT)] = payload
        self[report_key] = custom_report
class DeltaResult(TextResult):
ADD_QUOTES_TO_STRINGS: bool = False
def __init__(self, tree_results: Optional['TreeResult'] = None, ignore_order: Optional[bool] = None, always_include_values: bool = False, _iterable_opcodes: Optional[Dict[str, Any]] = None) -> None:
    """
    Set up the empty delta report and optionally fill it from a tree result.

    :param tree_results: tree result to convert; skipped when falsy
    :param ignore_order: stored on the instance; selects how iterable additions and
                         removals are reported by _from_tree_results
    :param always_include_values: stored on the instance; read when reporting type changes
    :param _iterable_opcodes: stored under '_iterable_opcodes' (an empty dict when falsy)
    """
    self.ignore_order = ignore_order
    self.always_include_values = always_include_values
    report_names = (
        "type_changes",
        "dictionary_item_added",
        "dictionary_item_removed",
        "values_changed",
        "iterable_item_added",
        "iterable_item_removed",
        "iterable_item_moved",
        "attribute_added",
        "attribute_removed",
        "set_item_removed",
        "set_item_added",
        "iterable_items_added_at_indexes",
        "iterable_items_removed_at_indexes",
    )
    initial = {name: dict_() for name in report_names}
    initial["_iterable_opcodes"] = _iterable_opcodes or {}
    self.update(initial)

    if tree_results:
        self._from_tree_results(tree_results)
def _from_tree_results(self, tree):
"""
Populate this object by parsing an existing reference-style result dictionary.
:param tree: A TreeResult
:return:
"""
self._from_tree_type_changes(tree)
self._from_tree_default(tree, 'dictionary_item_added')
self._from_tree_default(tree, 'dictionary_item_removed')
self._from_tree_value_changed(tree)
if self.ignore_order:
self._from_tree_iterable_item_added_or_removed(
tree, 'iterable_item_added', delta_report_key='iterable_items_added_at_indexes')
self._from_tree_iterable_item_added_or_removed(
tree, 'iterable_item_removed', delta_report_key='iterable_items_removed_at_indexes')
else:
self._from_tree_default(tree, 'iterable_item_added', ignore_if_in_iterable_opcodes=True)
self._from_tree_default(tree, 'iterable_item_removed', ignore_if_in_iterable_opcodes=True)
self._from_tree_iterable_item_moved(tree)
self._from_tree_default(tree, 'attribute_added')
self._from_tree_default(tree, 'attribute_removed')
self._from_tree_set_item_removed(tree)
self._from_tree_set_item_added(tree)
self._from_tree_repetition_change(tree)
def _from_tree_iterable_item_added_or_removed(self, tree, report_type, delta_report_key):
    """
    Report added or removed iterable items per parent path and index.

    :param tree: the tree result to read from
    :param report_type: key to read in tree, e.g. 'iterable_item_added'
    :param delta_report_key: key in self that receives {parent_path: {param: item}}
    """
    if report_type not in tree:
        return
    for level in tree[report_type]:
        # Report t2 (the new object) whenever it exists; for removals t2 is
        # notpresent, so report t1 instead.
        reported = level.t1 if level.t2 is notpresent else level.t2

        parent_path, param, _ = level.path(force=FORCE_DEFAULT, get_parent_too=True)
        try:
            items_for_parent = self[delta_report_key][parent_path]
        except KeyError:
            items_for_parent = dict_()
            self[delta_report_key][parent_path] = items_for_parent
        items_for_parent[param] = reported
def _from_tree_type_changes(self, tree):
    """
    Report type changes into self['type_changes'], keyed by the reported path.

    Every entry holds old_type and new_type, plus new_path when the t2-based path
    differs. old_value/new_value are added only when they are needed to reproduce
    the change (see include_values below) or when self.always_include_values is set.
    """
    if 'type_changes' in tree:
        for change in tree['type_changes']:
            include_values = None
            if type(change.t1) is type:
                # t1 is itself a class: report t1/t2 as the types and leave the values out.
                include_values = False
                old_type = change.t1
                new_type = change.t2
            else:
                old_type = get_type(change.t1)
                new_type = get_type(change.t2)
                include_values = True
                try:
                    if new_type in numpy_numbers:
                        new_t1 = change.t1.astype(new_type)
                        include_values = not np.array_equal(new_t1, change.t2)
                    else:
                        new_t1 = new_type(change.t1)
                        # If simply applying the type from one value converts it to the other value,
                        # there is no need to include the actual values in the delta.
                        include_values = new_t1 != change.t2
                except Exception:
                    # The conversion attempt is best-effort: on any failure
                    # include_values keeps the value it had before the failing statement.
                    pass

            path = change.path(force=FORCE_DEFAULT)
            new_path = change.path(use_t2=True, force=FORCE_DEFAULT)
            remap_dict = RemapDict({
                'old_type': old_type,
                'new_type': new_type,
            })
            if path != new_path:
                remap_dict['new_path'] = new_path
            self['type_changes'][path] = remap_dict
            if include_values or self.always_include_values:
                remap_dict.update(old_value=change.t1, new_value=change.t2)
def _from_tree_value_changed(self, tree):
    """
    Report changed values into self['values_changed'], keyed by the reported path.

    Every entry holds new_value and old_value, plus new_path when the t2-based path
    differs from the reported one. Unlike the verbose_level-gated variant in the text
    report, this one has no verbosity check.
    """
    if 'values_changed' not in tree:
        return
    for level in tree['values_changed']:
        t1_path = level.path(force=FORCE_DEFAULT)
        t2_path = level.path(use_t2=True, force=FORCE_DEFAULT)
        entry = {'new_value': level.t2, 'old_value': level.t1}
        if t1_path != t2_path:
            entry['new_path'] = t2_path
        self['values_changed'][t1_path] = entry
        # The difflib result in level.additional['diff'] is deliberately not stored.
        # Storing it instead of new_value would also require the Delta object to
        # know how to apply it.
def _from_tree_repetition_change(self, tree):
    """
    Express repetition changes as items added at indexes.

    For each level, the repeated object (t1) is recorded under every index listed in
    the repetition data's 'new_indexes', grouped by the parent path inside
    self['iterable_items_added_at_indexes'].
    """
    if 'repetition_change' not in tree:
        return
    for level in tree['repetition_change']:
        parent_path, _, _ = level.path(get_parent_too=True)
        repetition = RemapDict(level.additional['repetition'])
        repeated = level.t1
        try:
            items_for_parent = self['iterable_items_added_at_indexes'][parent_path]
        except KeyError:
            items_for_parent = dict_()
            self['iterable_items_added_at_indexes'][parent_path] = items_for_parent
        for position in repetition['new_indexes']:
            items_for_parent[position] = repeated
def _from_tree_iterable_item_moved(self, tree):
    """
    Report moved iterable items, keyed by their t1-side path.

    A move is skipped when the path of its parent level is already a key in
    self['_iterable_opcodes'].
    """
    if 'iterable_item_moved' not in tree:
        return
    for level in tree['iterable_item_moved']:
        container_path = level.up.path(force=FORCE_DEFAULT, reporting_move=True)
        if container_path in self["_iterable_opcodes"]:
            continue
        entry = {
            'new_path': level.path(use_t2=True, reporting_move=True),
            'value': level.t2,
        }
        old_path = level.path(force=FORCE_DEFAULT, reporting_move=True)
        self['iterable_item_moved'][old_path] = entry
class DiffLevel:
"""
An object of this class represents a single object-tree-level in a reported change.
A double-linked list of these object describes a single change on all of its levels.
Looking at the tree of all changes, a list of those objects represents a single path through the tree
(which is just fancy for "a change").
This is the result object class for object reference style reports.
Example:
>>> t1 = {2: 2, 4: 44}
>>> t2 = {2: "b", 5: 55}
>>> ddiff = DeepDiff(t1, t2, view='tree')
>>> ddiff
{'dictionary_item_added': {},
'dictionary_item_removed': {},
'type_changes': {}}
Graph:
↑up ↑up
| |
| ChildRelationship | ChildRelationship
| |
↓down ↓down
.path() = 'root[5]' .path() = 'root[4]'
Note that the 2 top level DiffLevel objects are 2 different objects even though
they are essentially talking about the same diff operation.
A ChildRelationship object describing the relationship between t1 and its child object,
where t1's child object equals down.t1.
Think about it like a graph:
+---------------------------------------------------------------+
| |
| parent difflevel parent |
| + ^ + |
+------|--------------------------|---------------------|-------+
| | | up |
| Child | | | ChildRelationship
| Relationship | | |
| down | | |
+------|----------------------|-------------------------|-------+
| v v v |
| child difflevel child |
| |
+---------------------------------------------------------------+
The child_rel example:
# dictionary_item_removed is a set so in order to get an item from it:
>>> (difflevel,) = ddiff['dictionary_item_removed']
>>> difflevel.up.t1_child_rel
>>> (difflevel,) = ddiff['dictionary_item_added']
>>> difflevel
>>> difflevel.up
>>>
>>> difflevel.up
# t1 didn't exist
>>> difflevel.up.t1_child_rel
# t2 is added
>>> difflevel.up.t2_child_rel
"""
def __init__(self,
             t1: Any,
             t2: Any,
             down: Optional['DiffLevel'] = None,
             up: Optional['DiffLevel'] = None,
             report_type: Optional[str] = None,
             child_rel1: Optional['ChildRelationship'] = None,
             child_rel2: Optional['ChildRelationship'] = None,
             additional: Optional[Dict[str, Any]] = None,
             verbose_level: int = 1) -> None:
    """
    Store the two compared objects of this level and its links to neighbouring levels.

    :param t1: The current-level object in the left hand tree
    :param t2: The current-level object in the right hand tree
    :param down: DiffLevel describing this change one level deeper down the object tree
    :param up: DiffLevel describing this change one level further up the object tree
    :param report_type: String describing the type of change,
                        e.g. "set_item_added", "values_changed"
    :param child_rel1: Stored as t1_child_rel: the relationship between t1 and its
                       child object, where t1's child object equals down.t1
    :param child_rel2: Stored as t2_child_rel: the same for t2
    :param additional: Extra information for some change types; a fresh dict is
                       created when None is given
    :param verbose_level: Stored on the instance and read by __repr__
    """
    # The two sides of the comparison at this level.
    self.t1 = t1
    self.t2 = t2

    # Neighbouring levels of the doubly linked list describing this change.
    self.down = down
    self.up = up

    self.report_type = report_type

    # Additional information stored for some types of changes. Currently used for:
    # - values_changed: for multi-line strings, a textual diff as additional['diff'].
    # - repetition_change: additional['repetition'], e.g.
    #   {'old_repeat': 2, 'new_repeat': 1, 'old_indexes': [0, 2], 'new_indexes': [2]}
    # Note: the signature default must not be {}: that would be one shared dict object.
    if additional is None:
        self.additional = dict_()
    else:
        self.additional = additional

    # The user supplied ChildRelationship objects for t1 and t2.
    # If a relationship is representable as a string, its get_param_repr() gives a
    # python parsable partial such as "[2]" or ".my_attribute".
    self.t1_child_rel = child_rel1
    self.t2_child_rel = child_rel2

    # Cache of .path() results, for performance
    self._path = dict_()

    self.verbose_level = verbose_level
def __repr__(self) -> str:
if self.verbose_level:
from deepdiff.summarize import summarize
if self.additional:
additional_repr = summarize(self.additional, max_length=35)
result = "<{} {}>".format(self.path(), additional_repr)
else:
t1_repr = summarize(self.t1, max_length=35)
t2_repr = summarize(self.t2, max_length=35)
result = "<{} t1:{}, t2:{}>".format(self.path(), t1_repr, t2_repr)
else:
result = "<{}>".format(self.path())
return result
def __setattr__(self, key: str, value: Any) -> None:
    """
    Store the attribute and keep the up/down links of the linked list symmetric.

    When key is one of the UP_DOWN keys and value is not None, the opposite link on
    value is pointed back at this level.
    """
    self.__dict__[key] = value
    if value is not None and key in UP_DOWN:
        back_link = UP_DOWN[key]
        value.__dict__[back_link] = self
def __iter__(self) -> Any:
yield self.t1
yield self.t2
@property
def repetition(self) -> Dict[str, Any]:
    """The 'repetition' entry of this level's additional data (KeyError when absent)."""
    extra = self.additional
    return extra['repetition']
def auto_generate_child_rel(self, klass: Type['ChildRelationship'], param: Any, param2: Optional[Any] = None) -> None:
    """
    Auto-populate self.t1_child_rel and self.t2_child_rel.
    This requires self.down to be another valid DiffLevel object.

    A side's relationship is only created when that side of self.down is present.

    :param klass: A ChildRelationship subclass describing the kind of parent-child relationship,
                  e.g. DictRelationship.
    :param param: A ChildRelationship subclass-dependent parameter describing how to get from parent to child,
                  e.g. the key in a dict
    :param param2: Parameter to use for the t2 side instead of param; param is used when this is None
    """
    child_level = self.down
    if child_level.t1 is not notpresent:  # type: ignore
        self.t1_child_rel = ChildRelationship.create(
            klass=klass, parent=self.t1, child=child_level.t1, param=param)  # type: ignore
    if child_level.t2 is not notpresent:  # type: ignore
        t2_param = param if param2 is None else param2
        self.t2_child_rel = ChildRelationship.create(
            klass=klass, parent=self.t2, child=child_level.t2, param=t2_param)  # type: ignore
@property
def all_up(self) -> 'DiffLevel':
    """
    Get the root object of this comparison.
    (Follows the up attribute for as long as it is truthy.)
    :rtype: DiffLevel
    """
    current = self
    parent = current.up
    while parent:
        current = parent
        parent = current.up
    return current
@property
def all_down(self) -> 'DiffLevel':
    """
    Get the leaf object of this comparison.
    (Follows the down attribute for as long as it is truthy.)
    :rtype: DiffLevel
    """
    current = self
    child = current.down
    while child:
        current = child
        child = current.down
    return current
@staticmethod
def _format_result(root: str, result: Optional[str]) -> Optional[str]:
return None if result is None else "{}{}".format(root, result)
def get_root_key(self, use_t2: bool = False) -> Any:
    """
    Get the path's root key value for this change.

    For example if the path to the element that is reported to have a change in value
    is root['X'][0] then get_root_key should return 'X'.

    :param use_t2: Read only the t2 relationship of the root level; by default the t1
                   relationship is preferred and the t2 one is the fallback.
    :return: The param of the chosen relationship, or notpresent when there is none.
    """
    top = self.all_up
    if use_t2:
        first_rel = top.t2_child_rel
    else:
        first_rel = top.t1_child_rel or top.t2_child_rel

    if not first_rel:
        return notpresent
    return first_rel.param
def path(self, root: str = "root", force: Optional[str] = None, get_parent_too: bool = False, use_t2: bool = False, output_format: Literal['str', 'list'] = 'str', reporting_move: bool = False) -> Any:
    """
    A python syntax string describing how to descend to this level, assuming the top level object is called root.
    Returns None if the path is not representable as a string.
    This might be the case for example if there are sets involved (because then there's not path at all) or because
    custom objects used as dictionary keys (then there is a path but it's not representable).
    Example: root['ingredients'][0]

    Note: We will follow the left side of the comparison branch, i.e. using the t1's to build the path.
    Using t1 or t2 should make no difference at all, except for the last step of a child-added/removed relationship.
    If it does in any other case, your comparison path is corrupt.

    **Parameters**

    :param root: The result string shall start with this var name
    :param force: Bends the meaning of "no string representation".
                  If None:
                    Will strictly return Python-parsable expressions. The result those yield will compare
                    equal to the objects in question.
                  If 'yes':
                    Will return a path including '(unrepresentable)' in place of non string-representable parts.
                  If 'fake':
                    Will try to produce an output optimized for readability.
                    This will pretend all iterables are subscriptable, for example.
    :param get_parent_too: For the 'str' format, return a tuple of
                           (parent path, last param, full path) instead of just the full path.
    :param use_t2: Prefer the t2 side relationships when building the path.
    :param output_format: The format of the output. The options are 'str' which is the default and produces a
                          string representation of the path or 'list' to produce a list of keys and attributes
                          that produce the path.
    :param reporting_move: This should be set to true if and only if we are reporting on iterable_item_moved.
                           All other cases should leave this set to False.
    """
    # TODO: We could optimize this by building on top of self.up's path if it is cached there

    # reporting_move decides which side is followed at levels flagged as "moved",
    # so it has to be part of the cache key. Without it, a path computed for a move
    # report would be handed back for a regular report (and vice versa).
    cache_key = "{}{}{}{}{}".format(force, get_parent_too, use_t2, output_format, reporting_move)
    if cache_key in self._path:
        cached = self._path[cache_key]
        if get_parent_too:
            parent, param, result = cached
            return (self._format_result(root, parent), param, self._format_result(root, result))
        else:
            return self._format_result(root, cached)

    if output_format == 'str':
        result = parent = param = ""
    else:
        result = []

    level = self.all_up  # start at the root

    # traverse all levels of this relationship
    while level and level is not self:
        # get this level's relationship object
        if level.additional.get("moved") and not reporting_move:
            # To ensure we can properly replay items such as values_changed in items that may have moved, we
            # need to make sure that all paths are reported relative to t2 if a level has reported a move.
            # If we are reporting a move, the path is already correct and does not need to be swapped.
            # Additional context of "moved" is only ever set if using iterable_compare_func and a move has taken place.
            level_use_t2 = not use_t2
        else:
            level_use_t2 = use_t2

        if level_use_t2:
            next_rel = level.t2_child_rel or level.t1_child_rel
        else:
            next_rel = level.t1_child_rel or level.t2_child_rel  # next relationship object to get a formatted param from

        # t1 and t2 both are empty
        if next_rel is None:
            break

        # Build path for this level
        if output_format == 'str':
            item = next_rel.get_param_repr(force)
            if item:
                parent = result
                param = next_rel.param
                result += item
            else:
                # it seems this path is not representable as a string
                result = None
                break
        elif output_format == 'list':
            result.append(next_rel.param)  # type: ignore

        # Prepare processing next level
        level = level.down

    # Only string results are cached; list results are rebuilt on every call.
    if output_format == 'str':
        if get_parent_too:
            self._path[cache_key] = (parent, param, result)  # type: ignore
            output = (self._format_result(root, parent), param, self._format_result(root, result))  # type: ignore
        else:
            self._path[cache_key] = result
            output = self._format_result(root, result) if isinstance(result, (str, type(None))) else None
    else:
        output = result
    return output
def create_deeper(self,
                  new_t1: Any,
                  new_t2: Any,
                  child_relationship_class: Type['ChildRelationship'],
                  child_relationship_param: Optional[Any] = None,
                  child_relationship_param2: Optional[Any] = None,
                  report_type: Optional[str] = None) -> 'DiffLevel':
    """
    Start a new comparison level below the current leaf and link the two together.

    The new level inherits this level's verbose_level. The child relationships of the
    former leaf are then generated from the given relationship class and params.

    :rtype: DiffLevel
    :return: New level
    """
    leaf = self.all_down
    deeper = DiffLevel(
        new_t1, new_t2, down=None, up=leaf, report_type=report_type, verbose_level=self.verbose_level)
    leaf.down = deeper
    leaf.auto_generate_child_rel(
        klass=child_relationship_class,
        param=child_relationship_param,
        param2=child_relationship_param2)
    return deeper
def branch_deeper(self,
                  new_t1: Any,
                  new_t2: Any,
                  child_relationship_class: Type['ChildRelationship'],
                  child_relationship_param: Optional[Any] = None,
                  child_relationship_param2: Optional[Any] = None,
                  report_type: Optional[str] = None) -> 'DiffLevel':
    """
    Branch this comparison: leave this comparison line untouched and extend a copy of it
    by one level instead.

    :rtype: DiffLevel
    :return: New level in new comparison line
    """
    return self.copy().create_deeper(
        new_t1,
        new_t2,
        child_relationship_class,
        child_relationship_param,
        child_relationship_param2,
        report_type,
    )
def copy(self) -> 'DiffLevel':
    """
    Get a copy of this comparison line, built level by level starting from the root.

    Each level object and its additional dict are copied with copy(), and fresh
    ChildRelationship objects are created between the copied levels.
    :return: The leaf ("downmost") object of the copy.
    """
    orig = self.all_up
    result = copy(orig)  # copy top level

    while orig is not None:
        # give the copied level its own additional dict
        result.additional = copy(orig.additional)

        if orig.down is not None:  # copy and create references to the following level
            # copy following level
            result.down = copy(orig.down)

            # re-create the relationships so that they reference the copied levels' objects
            if orig.t1_child_rel is not None:
                result.t1_child_rel = ChildRelationship.create(
                    klass=orig.t1_child_rel.__class__,
                    parent=result.t1,
                    child=result.down.t1,
                    param=orig.t1_child_rel.param)
            if orig.t2_child_rel is not None:
                result.t2_child_rel = ChildRelationship.create(
                    klass=orig.t2_child_rel.__class__,
                    parent=result.t2,
                    child=result.down.t2,
                    param=orig.t2_child_rel.param)

        # descend to next level
        orig = orig.down
        # on the last level result.down is None, so result stays on the copied leaf
        if result.down is not None:
            result = result.down
    return result
class ChildRelationship:
"""
Describes the relationship between a container object (the "parent") and the contained
"child" object.
"""
# Format to be used for representing param.
# E.g. for a dict, this turns a formatted param "42" into "[42]".
param_repr_format: Optional[str] = None
# This is a hook allowing subclasses to manipulate param strings.
# :param string: Input string
# :return: Manipulated string, as appropriate in this context.
quote_str: Optional[str] = None
@staticmethod
def create(klass: Type['ChildRelationship'], parent: Any, child: Any, param: Optional[Any] = None) -> 'ChildRelationship':
    """
    Instantiate klass for the given parent, child and param.

    :raises TypeError: when klass is not a subclass of ChildRelationship
    """
    if issubclass(klass, ChildRelationship):
        return klass(parent, child, param)
    raise TypeError
def __init__(self, parent: Any, child: Any, param: Optional[Any] = None) -> None:
# The parent object of this relationship, e.g. a dict
self.parent = parent
# The child object of this relationship, e.g. a value in a dict
self.child = child
# A subclass-dependent parameter describing how to get from parent to child, e.g. the key in a dict
self.param = param
def __repr__(self) -> str:
    """Show the class name with summarized parent, child and param."""
    from deepdiff.summarize import summarize

    parent_text = summarize(self.parent, max_length=35)
    child_text = summarize(self.child, max_length=35)
    param_text = summarize(self.param, max_length=15)
    return "<{} parent:{}, child:{}, param:{}>".format(
        self.__class__.__name__, parent_text, child_text, param_text)
def get_param_repr(self, force: Optional[str] = None) -> Optional[str]:
    """
    Returns a formatted, python parsable string describing this relationship,
    or None if the relationship is not representable as a string.
    This string can be appended to the parent's name.

    This base implementation simply forwards to stringify_param. Subclasses representing
    a relationship that cannot be expressed as a string override this method.

    Examples: "[2]", ".attribute", "['mykey']"

    :param force: Bends the meaning of "no string representation".
                  If None:
                    Will strictly return partials of Python-parsable expressions. The result those yield will compare
                    equal to the objects in question.
                  If 'yes':
                    Will return a formatted param including '(unrepresentable)' instead of the non string-representable part.
    """
    representation = self.stringify_param(force)
    return representation
def stringify_param(self, force: Optional[str] = None) -> Optional[str]:
    """
    Convert param to a string. Return None if there is no string representation.

    This is called by get_param_repr()

    :param force: Bends the meaning of "no string representation".
                  If None:
                    Will strictly return Python-parsable expressions. The result those yield will compare
                    equal to the objects in question.
                  If 'yes':
                    Will return '(unrepresentable)' instead of None if there is no string representation
                  NOTE(review): force is not read anywhere in this method body; the
                  behaviour described for it is not implemented here.

    TODO: stringify_param has issues with params that when converted to string via repr,
    it is not straightforward to turn them back into the original object.
    Although repr is meant to be able to reconstruct the original object but for complex objects, repr
    often does not recreate the original object.
    Perhaps we should log that the repr reconstruction failed so the user is aware.
    """
    param = self.param
    if isinstance(param, strings):
        result = stringify_element(param, quote_str=self.quote_str)
    elif isinstance(param, tuple):  # Currently only for numpy ndarrays
        # Joined with '][' so that wrapping in "[{}]" below yields e.g. "[0][1]".
        result = ']['.join(map(repr, param))
    elif hasattr(param, '__dataclass_fields__'):
        # Dataclass instances are rendered as ClassName(field=value,...).
        attrs_to_values = [f"{key}={value}" for key, value in [(i, getattr(param, i)) for i in param.__dataclass_fields__]]  # type: ignore
        result = f"{param.__class__.__name__}({','.join(attrs_to_values)})"
    else:
        candidate = repr(param)
        try:
            resurrected = literal_eval_extended(candidate)
            # Note: This will miss string-representable custom objects.
            # However, the only alternative I can currently think of is using eval() which is inherently dangerous.
        except (SyntaxError, ValueError) as err:
            logger.error(
                f'stringify_param was not able to get a proper repr for "{param}". '
                "This object will be reported as None. Add instructions for this object to DeepDiff's "
                f"helper.literal_eval_extended to make it work properly: {err}")
            result = None
        else:
            # Only accept the repr when evaluating it gives back an equal object.
            result = candidate if resurrected == param else None

    # An empty or None result is returned as is. Anything else is wrapped with the
    # class's param_repr_format, or replaced by ':' when the class defines no format.
    if result:
        result = ':' if self.param_repr_format is None else self.param_repr_format.format(result)

    return result
class DictRelationship(ChildRelationship):
    """Relationship whose param is rendered in square brackets; quote_str asks for single quotes around string params."""
    param_repr_format: Optional[str] = "[{}]"
    quote_str: Optional[str] = "'{}'"
class NumpyArrayRelationship(ChildRelationship):
    """Relationship whose param is rendered in square brackets, with no quote format for string params."""
    param_repr_format: Optional[str] = "[{}]"
    quote_str: Optional[str] = None
class SubscriptableIterableRelationship(DictRelationship):
    """Uses DictRelationship's formatting unchanged."""
    pass
class InaccessibleRelationship(ChildRelationship):
    """Base class for SetRelationship and NonSubscriptableIterableRelationship; adds nothing to ChildRelationship itself."""
    pass
# there is no random access to set elements
class SetRelationship(InaccessibleRelationship):
    """Relationship between a set and one of its members."""
    pass
class NonSubscriptableIterableRelationship(InaccessibleRelationship):
    """Relationship whose param only gets a string form when the caller forces one."""

    param_repr_format: Optional[str] = "[{}]"

    def get_param_repr(self, force: Optional[str] = None) -> Optional[str]:
        """
        Return a representation only when one is forced.

        :param force: 'yes' gives the "(unrepresentable)" placeholder; 'fake' gives
                      the stringified param provided the param is truthy; anything
                      else gives None.
        """
        if force == 'yes':
            return "(unrepresentable)"
        if force == 'fake' and self.param:
            return self.stringify_param()
        return None
class AttributeRelationship(ChildRelationship):
    """Relationship whose param is rendered with a leading dot, e.g. ".attribute"."""
    param_repr_format: Optional[str] = ".{}"
qlustered-deepdiff-41c7265/deepdiff/operator.py 0000664 0000000 0000000 00000004670 15162412645 0021533 0 ustar 00root root 0000000 0000000 import re
from typing import Any, Optional, List, TYPE_CHECKING
from abc import ABCMeta, abstractmethod
from deepdiff.helper import convert_item_or_items_into_compiled_regexes_else_none
if TYPE_CHECKING:
from deepdiff import DeepDiff
class BaseOperatorPlus(metaclass=ABCMeta):
    """Abstract interface of a custom operator: all three methods must be implemented."""

    @abstractmethod
    def match(self, level) -> bool:
        """
        Decide whether this operator should handle the given level.

        The level includes t1 and t2 in the tree view. If this returns True,
        give_up_diffing is run to compare t1 and t2 for this level.
        """

    @abstractmethod
    def give_up_diffing(self, level, diff_instance: "DeepDiff") -> bool:
        """
        Given a level which includes t1 and t2 in the tree view, and the "distance" between t1 and t2,
        do we consider t1 and t2 to be equal or not. The distance is a number between zero to one and is
        calculated by DeepDiff to measure how similar objects are.
        """

    @abstractmethod
    def normalize_value_for_hashing(self, parent: Any, obj: Any) -> Any:
        """
        Normalize a value before it is hashed for ignore_order=True.

        For example, to compare words case-insensitively, return obj.lower().
        """
class BaseOperator:
    """
    Base class for custom operators selected by path patterns and/or by types.

    Subclasses must implement give_up_diffing.
    """

    def __init__(self, regex_paths:Optional[List[str]]=None, types:Optional[List[type]]=None):
        """
        :param regex_paths: Patterns matched against a level's path; compiled through
                            convert_item_or_items_into_compiled_regexes_else_none.
        :param types: Types that both t1 and t2 of a level must be instances of.
        """
        if regex_paths:
            self.regex_paths = convert_item_or_items_into_compiled_regexes_else_none(regex_paths)
        else:
            self.regex_paths = None
        self.types = types

    def match(self, level) -> bool:
        """
        Return True when the level's path matches one of the patterns, or when both
        t1 and t2 are instances of one of the configured types.
        """
        if self.regex_paths:
            path = level.path()
            # level.path() is None when the path has no string representation.
            # Such a level cannot match a pattern, and handing None to re.search
            # would raise TypeError, so only the type check applies to it.
            if path is not None and any(
                    re.search(pattern, path) is not None for pattern in self.regex_paths):
                return True
        if self.types:
            for type_ in self.types:
                if isinstance(level.t1, type_) and isinstance(level.t2, type_):
                    return True
        return False

    def give_up_diffing(self, level, diff_instance) -> bool:
        """Must be overridden by subclasses; this base version always raises."""
        raise NotImplementedError('Please implement the diff function.')
class PrefixOrSuffixOperator:
    """Operator that stops diffing two non-empty strings when one starts with the other."""

    def match(self, level) -> bool:
        """
        Return True only when t1 and t2 are both non-empty strings.

        The type check comes first so that objects whose truth value is ambiguous or
        raises are never truth-tested, and the result is always a real bool rather
        than one of the operands.
        """
        t1 = level.t1
        t2 = level.t2
        return isinstance(t1, str) and isinstance(t2, str) and bool(t1) and bool(t2)

    def give_up_diffing(self, level, diff_instance) -> bool:
        """
        Return True when either string starts with the other.

        Note that despite the class name only prefixes are checked here.
        """
        t1 = level.t1
        t2 = level.t2
        return t1.startswith(t2) or t2.startswith(t1)
qlustered-deepdiff-41c7265/deepdiff/path.py 0000664 0000000 0000000 00000024533 15162412645 0020634 0 ustar 00root root 0000000 0000000 import logging
from ast import literal_eval
from functools import lru_cache
logger = logging.getLogger(__name__)
GETATTR = 'GETATTR'
GET = 'GET'
class PathExtractionError(ValueError):
    """ValueError subclass. NOTE(review): presumably raised when a path cannot be extracted; no raise site is visible in this part of the file."""
    pass
class RootCanNotBeModified(ValueError):
    """ValueError subclass. NOTE(review): presumably raised on an attempt to modify the root object; no raise site is visible in this part of the file."""
    pass
def _add_to_elements(elements, elem, inside):
    """
    Append (element, action) for one raw path token to elements.

    Empty tokens and tokens starting with a double underscore (private items) are
    ignored. A token holding the escape marker or a backslash is kept as text. Any
    other token is evaluated as a Python literal and kept as text only when that
    fails. Text that is wrapped in a matching pair of quotes loses those quotes.
    The action is GETATTR when inside is '.', and GET otherwise.
    """
    if not elem or elem.startswith('__'):
        return

    if '𝆺𝅥𝅯' in elem or '\\' in elem:
        keep_as_text = True
    else:
        try:
            elem = literal_eval(elem)
        except (ValueError, SyntaxError):
            keep_as_text = True
        else:
            keep_as_text = False

    if keep_as_text and elem[0] == elem[-1] and elem[0] in {'"', "'"}:
        elem = elem[1: -1]

    if inside == '.':
        action = GETATTR
    else:
        action = GET
    elements.append((elem, action))
DEFAULT_FIRST_ELEMENT = ('root', GETATTR)
@lru_cache(maxsize=1024 * 128)
def _path_to_elements(path, root_element=DEFAULT_FIRST_ELEMENT):
    """
    Given a path, it extracts the elements that form the path and their relevant most likely retrieval action.

    A path that is already a tuple or a list is returned unchanged. Otherwise the
    result is a tuple of (element, action) pairs, starting with root_element when
    that is truthy.

        >>> from deepdiff.path import _path_to_elements
        >>> path = "root[4.3].b['a3']"
        >>> _path_to_elements(path, root_element=None)
        ((4.3, 'GET'), ('b', 'GETATTR'), ('a3', 'GET'))
    """
    if isinstance(path, (tuple, list)):
        return path
    elements = []
    if root_element:
        elements.append(root_element)
    elem = ''  # characters collected for the token currently being read
    inside = False  # False, '[' or '.': the kind of accessor currently being read
    prev_char = None
    # Drops the first four characters; they are assumed to be the literal "root".
    path = path[4:]  # removing "root from the beginning"
    brackets = []
    inside_quotes = False
    quote_used = ''
    for char in path:
        if prev_char == '𝆺𝅥𝅯':
            # The previous character was the escape marker: take this one literally.
            elem += char
        elif char in {'"', "'"}:
            elem += char
            # If we are inside and the quote is not what we expected, the quote is not closing
            if not(inside_quotes and quote_used != char):
                inside_quotes = not inside_quotes
                if inside_quotes:
                    quote_used = char
                else:
                    # The closing quote completes a token.
                    _add_to_elements(elements, elem, inside)
                    elem = ''
                    quote_used = ''
        elif inside_quotes:
            elem += char
        elif char == '[':
            if inside == '.':
                # A bracket ends the attribute name that was being read.
                _add_to_elements(elements, elem, inside)
                inside = '['
                elem = ''
            # we are already inside. The bracket is a part of the word.
            elif inside == '[':
                elem += char
            else:
                inside = '['
                brackets.append('[')
                elem = ''
        elif char == '.':
            if inside == '[':
                # A dot within brackets belongs to the token, e.g. the float 4.3.
                elem += char
            elif inside == '.':
                _add_to_elements(elements, elem, inside)
                elem = ''
            else:
                inside = '.'
                elem = ''
        elif char == ']':
            if brackets and brackets[-1] == '[':
                brackets.pop()
            if brackets:
                elem += char
            else:
                _add_to_elements(elements, elem, inside)
                elem = ''
                inside = False
        else:
            elem += char
        prev_char = char
    # Flush a trailing token, e.g. an attribute name at the end of the path.
    if elem:
        _add_to_elements(elements, elem, inside)
    return tuple(elements)
def _get_nested_obj(obj, elements, next_element=None):
    """
    Walk obj along elements and return the object that is reached.

    Each element is validated with check_elem, then followed by subscription for GET
    or by getattr for GETATTR. Any other action leaves the current object unchanged.
    next_element is accepted but not used here.
    """
    current = obj
    for elem, action in elements:
        check_elem(elem)
        if action == GET:
            current = current[elem]
        elif action == GETATTR:
            current = getattr(current, elem)
    return current
def _guess_type(elements, elem, index, next_element):
# If we are not at the last elements
if index < len(elements) - 1:
# We assume it is a nested dictionary not a nested list
return {}
if isinstance(next_element, int):
return []
return {}
def check_elem(elem):
    """
    Reject dunder names as path elements.

    :raises ValueError: when elem is a string that both starts and ends with a double underscore
    """
    if not isinstance(elem, str):
        return
    if elem.startswith("__") and elem.endswith("__"):
        raise ValueError("traversing dunder attributes is not allowed")
def _get_nested_obj_and_force(obj, elements, next_element=None):
    """
    Walk obj along elements like _get_nested_obj, creating missing containers on the way.

    For GET steps:
    - a missing key gets a container guessed by _guess_type;
    - an int index at or beyond the end of a list pads the list with None up to that
      index and stores a guessed container at the index;
    - any other IndexError on an empty list reached through a previous element
      replaces that list with a dictionary holding the element.
    GETATTR steps use getattr and create nothing.

    :param obj: the object to descend into; it may be modified in place
    :param elements: iterable of (element, action) pairs
    :param next_element: the element that will follow this path; passed to _guess_type
    :return: the object reached at the end of the path
    """
    prev_elem = None
    prev_action = None
    prev_obj = obj
    for index, (elem, action) in enumerate(elements):
        check_elem(elem)
        _prev_obj = obj
        if action == GET:
            try:
                obj = obj[elem]
                prev_obj = _prev_obj
            except KeyError:
                obj[elem] = _guess_type(elements, elem, index, next_element)
                obj = obj[elem]
                prev_obj = _prev_obj
            except IndexError:
                if isinstance(obj, list) and isinstance(elem, int) and elem >= len(obj):
                    # Pad so that the appended container lands exactly at index elem.
                    obj.extend([None] * (elem - len(obj)))
                    # next_element is an argument of _guess_type, not of append.
                    obj.append(_guess_type(elements, elem, index, next_element))
                    obj = obj[-1]
                    prev_obj = _prev_obj
                elif isinstance(obj, list) and len(obj) == 0 and prev_elem:
                    # We ran into an empty list that should have been a dictionary
                    # We need to change it from an empty list to a dictionary
                    obj = {elem: _guess_type(elements, elem, index, next_element)}
                    if prev_action == GET:
                        prev_obj[prev_elem] = obj
                    else:
                        setattr(prev_obj, prev_elem, obj)
                    obj = obj[elem]
        elif action == GETATTR:
            obj = getattr(obj, elem)
            prev_obj = _prev_obj
        prev_elem = elem
        prev_action = action
    return obj
def extract(obj, path):
    """
    Get the item from obj based on path.

    Example:

        >>> from deepdiff import extract
        >>> obj = {1: [{'2': 'b'}, 3], 2: [4, 5]}
        >>> path = "root[1][0]['2']"
        >>> extract(obj, path)
        'b'

    Note that you can use extract in conjunction with DeepDiff results
    or even with the search and :ref:`deepsearch_label` modules. For example:

        >>> from deepdiff import grep
        >>> obj = {1: [{'2': 'b'}, 3], 2: [4, 5]}
        >>> result = obj | grep(5)
        >>> result
        {'matched_values': ['root[2][1]']}
        >>> result['matched_values'][0]
        'root[2][1]'
        >>> path = result['matched_values'][0]
        >>> extract(obj, path)
        5

    .. note::
        Even if DeepDiff gives you a path to an item in a set, sets do not support
        indexing in Python, so trying to extract that item from a set raises an error.
        If you want to be able to get items from sets, use the SetOrdered module
        to generate the sets.
        In fact DeepDiff uses SetOrdered as a dependency.

        >>> from deepdiff import grep, extract
        >>> obj = {"a", "b"}
        >>> obj | grep("b")
        Set item detected in the path.'set' objects do NOT support indexing. But DeepSearch will still report a path.
        {'matched_values': SetOrdered(['root[0]'])}
        >>> extract(obj, 'root[0]')
        Traceback (most recent call last):
          File "<stdin>", line 1, in <module>
          File "deepdiff/deepdiff/path.py", line 126, in extract
            return _get_nested_obj(obj, elements)
          File "deepdiff/deepdiff/path.py", line 84, in _get_nested_obj
            obj = obj[elem]
        TypeError: 'set' object is not subscriptable
        >>> from orderly_set import SetOrdered
        >>> obj = SetOrdered(["a", "b"])
        >>> extract(obj, 'root[0]')
        'a'
    """
    return _get_nested_obj(obj, _path_to_elements(path, root_element=None))
def parse_path(path, root_element=DEFAULT_FIRST_ELEMENT, include_actions=False):
    """
    Parse a path string into a machine readable list.

    **Parameters**

    path : A string
        The path string such as "root[1][2]['age']"

    root_element: string, default='root'
        What the root is called in the path.

    include_actions: boolean, default=False
        If True, we return the action required to retrieve the item at each
        element of the path.

    **Returns**

    A list of path elements, or, when include_actions is not False, a list of
    ``{'element': ..., 'action': ...}`` dictionaries.

    **Examples**

        >>> from deepdiff import parse_path
        >>> parse_path("root[1][2]['age']")
        [1, 2, 'age']
        >>> parse_path("root[1][2]['age']", include_actions=True)
        [{'element': 1, 'action': 'GET'}, {'element': 2, 'action': 'GET'}, {'element': 'age', 'action': 'GET'}]
        >>>
        >>> parse_path("root['joe'].age")
        ['joe', 'age']
        >>> parse_path("root['joe'].age", include_actions=True)
        [{'element': 'joe', 'action': 'GET'}, {'element': 'age', 'action': 'GETATTR'}]
    """
    parsed = iter(_path_to_elements(path, root_element=root_element))
    if root_element:
        # The first parsed entry is the root itself; callers only want what follows it.
        next(parsed)
    if include_actions is not False:
        return [{'element': pair[0], 'action': pair[1]} for pair in parsed]
    return [pair[0] for pair in parsed]
def stringify_element(param, quote_str=None):
    """
    Quote a string path element so it can be embedded in a path string.

    - Contains both quote kinds and no quote_str given: every quote character
      is prefixed with a marker character and the whole thing is wrapped in
      double quotes.
    - Contains a single quote: wrapped in double quotes.
    - Contains a double quote: wrapped in single quotes.
    - Otherwise: returned as is, or formatted through quote_str when given.
    """
    contains_single = "'" in param
    contains_double = '"' in param

    if contains_single and contains_double and not quote_str:
        marked = ''.join(
            ('𝆺𝅥𝅯' + char) if char in {'"', "'"} else char
            for char in param
        )
        return '"' + marked + '"'
    if contains_single:
        return f'"{param}"'
    if contains_double:
        return f"'{param}'"
    if quote_str is None:
        return param
    return quote_str.format(param)
def stringify_path(path, root_element=DEFAULT_FIRST_ELEMENT, quote_str="'{}'"):
    """
    Gets the path as a string.

    For example [1, 2, 'age'] should become
    root[1][2]['age']

    **Parameters**

    path : a sequence of elements, or of (element, action) pairs where action
        is GET or GETATTR. The input sequence is never modified.

    root_element : a pair whose first item is the text of the root and whose
        second item is the action used to reach the first path element.

    quote_str : format string used to quote string elements reached via GET.

    **Returns**

    The path as a string. An empty path yields just the root text.
    """
    if not path:
        return root_element[0]
    result = [root_element[0]]

    # Detect whether the elements already come paired with their action.
    has_actions = False
    try:
        if path[0][1] in {GET, GETATTR}:
            has_actions = True
    except (KeyError, IndexError, TypeError):
        pass

    if has_actions:
        # Work on a copy: the first entry is rewritten below and the caller's
        # sequence must not be modified (it may also be an immutable tuple).
        path = list(path)
    else:
        path = [(i, GET) for i in path]

    # The action for the first element might be a GET or GETATTR.
    # We update the action based on the root_element.
    path[0] = (path[0][0], root_element[1])

    for element, action in path:
        if isinstance(element, str) and action == GET:
            element = stringify_element(element, quote_str)
        if action == GET:
            result.append(f"[{element}]")
        else:
            result.append(f".{element}")
    return ''.join(result)
qlustered-deepdiff-41c7265/deepdiff/py.typed 0000664 0000000 0000000 00000000000 15162412645 0021004 0 ustar 00root root 0000000 0000000 qlustered-deepdiff-41c7265/deepdiff/search.py 0000664 0000000 0000000 00000033570 15162412645 0021146 0 ustar 00root root 0000000 0000000 #!/usr/bin/env python
import re
from collections.abc import MutableMapping, Iterable
from typing import Any, Dict, FrozenSet, List, Pattern, Set, Union, Tuple
from deepdiff.helper import SetOrdered
import logging
from deepdiff.helper import (
strings, numbers, add_to_frozen_set, get_doc, dict_, RE_COMPILED_TYPE, ipranges
)
logger = logging.getLogger(__name__)
doc = get_doc('search_doc.rst')
class DeepSearch(Dict[str, Union[Dict[str, Any], SetOrdered, List[str]]]):
    r"""
    **DeepSearch**

    Deep Search inside objects to find the item matching your criteria.

    **Parameters**

    obj : The object to search within

    item : The item to search for

    verbose_level : int >= 0, default = 1.
        Verbose level one shows the paths of found items.
        Verbose level 2 shows the path and value of the found items.

    exclude_paths: list, default = None.
        List of paths to exclude from the report.

    exclude_regex_paths: list, default = None.
        List of regular expressions (strings or compiled patterns); any path
        matched by one of them is excluded from the report.

    exclude_types: list, default = None.
        List of object types to exclude from the report.
        Objects of these types are not searched.

    case_sensitive: Boolean, default = False

    match_string: Boolean, default = False
        If True, the value of the object or its children have to exactly match the item.
        If False, the value of the item can be a part of the value of the object or its children

    use_regexp: Boolean, default = False

    strict_checking: Boolean, default = True
        If True, it will check the type of the object to match, so when searching for '1234',
        it will NOT match the int 1234. Currently this only affects the numeric values searching.

    **Returns**

        A DeepSearch object that has the matched paths and matched values.

    **Supported data types**

    int, string, unicode, dictionary, list, tuple, set, frozenset, OrderedDict, NamedTuple and custom objects!

    **Examples**

    Importing
        >>> from deepdiff import DeepSearch
        >>> from pprint import pprint

    Search in list for string
        >>> obj = ["long somewhere", "string", 0, "somewhere great!"]
        >>> item = "somewhere"
        >>> ds = DeepSearch(obj, item, verbose_level=2)
        >>> print(ds)
        {'matched_values': {'root[0]': 'long somewhere', 'root[3]': 'somewhere great!'}}

    Search in nested data for string
        >>> obj = ["something somewhere", {"long": "somewhere", "string": 2, 0: 0, "somewhere": "around"}]
        >>> item = "somewhere"
        >>> ds = DeepSearch(obj, item, verbose_level=2)
        >>> pprint(ds, indent=2)
        { 'matched_paths': {"root[1]['somewhere']": 'around'},
          'matched_values': { 'root[0]': 'something somewhere',
                              "root[1]['long']": 'somewhere'}}

    """

    # Number of "set item detected" warnings emitted so far; the warning is
    # only logged while this is below 10.
    warning_num: int = 0

    def __init__(self,
                 obj: Any,
                 item: Any,
                 exclude_paths: Union[SetOrdered, Set[str], List[str]] = SetOrdered(),
                 exclude_regex_paths: Union[SetOrdered, Set[Union[str, Pattern[str]]], List[Union[str, Pattern[str]]]] = SetOrdered(),
                 exclude_types: Union[SetOrdered, Set[type], List[type]] = SetOrdered(),
                 verbose_level: int = 1,
                 case_sensitive: bool = False,
                 match_string: bool = False,
                 use_regexp: bool = False,
                 strict_checking: bool = True,
                 **kwargs: Any) -> None:
        """Configure the search, run it over ``obj`` and drop empty result keys.

        Raises ValueError for unknown keyword arguments and TypeError when
        ``use_regexp`` is set but ``item`` can not be compiled as a regex.
        """
        if kwargs:
            raise ValueError((
                "The following parameter(s) are not valid: %s\n"
                "The valid parameters are obj, item, exclude_paths, exclude_types,\n"
                "case_sensitive, match_string and verbose_level."
            ) % ', '.join(kwargs.keys()))

        self.obj: Any = obj
        # Case-insensitive matching only makes sense for string-like items.
        self.case_sensitive: bool = case_sensitive if isinstance(item, strings) else True
        item = item if self.case_sensitive else (item.lower() if isinstance(item, str) else item)
        self.exclude_paths: SetOrdered = SetOrdered(exclude_paths)
        self.exclude_regex_paths: List[Pattern[str]] = [re.compile(exclude_regex_path) for exclude_regex_path in exclude_regex_paths]
        self.exclude_types: SetOrdered = SetOrdered(exclude_types)
        self.exclude_types_tuple: tuple[type, ...] = tuple(
            exclude_types)  # we need tuple for checking isinstance
        self.verbose_level: int = verbose_level
        self.update(
            matched_paths=self.__set_or_dict(),
            matched_values=self.__set_or_dict(),
            unprocessed=[])
        # Type narrowing for mypy/pyright
        self.matched_paths: Union[Dict[str, Any], SetOrdered]
        self.matched_values: Union[Dict[str, Any], SetOrdered]
        self.unprocessed: List[str]
        self.use_regexp: bool = use_regexp
        if not strict_checking and (isinstance(item, numbers) or isinstance(item, ipranges)):
            # Non-strict mode compares numbers and ip ranges through their string form.
            item = str(item)
        if self.use_regexp:
            try:
                item = re.compile(item)
            except TypeError as e:
                raise TypeError(f"The passed item of {item} is not usable for regex: {e}") from None
        self.strict_checking: bool = strict_checking

        # Cases where user wants to match exact string item
        self.match_string: bool = match_string

        self.__search(obj, item, parents_ids=frozenset({id(obj)}))

        # Only keep the report keys that actually contain something.
        empty_keys = [k for k, v in self.items() if not v]

        for k in empty_keys:
            del self[k]

    def __set_or_dict(self) -> Union[Dict[str, Any], SetOrdered]:
        """Return the report container: a dict for verbose_level >= 2, else a SetOrdered."""
        return dict_() if self.verbose_level >= 2 else SetOrdered()

    def __report(self, report_key: str, key: str, value: Any) -> None:
        """Record a match under ``report_key``: path -> value when verbose, path only otherwise."""
        if self.verbose_level >= 2:
            report_dict = self[report_key]
            if isinstance(report_dict, dict):
                report_dict[key] = value
        else:
            report_set = self[report_key]
            if isinstance(report_set, SetOrdered):
                report_set.add(key)

    def __search_obj(self,
                     obj: Any,
                     item: Any,
                     parent: str,
                     parents_ids: FrozenSet[int] = frozenset(),
                     is_namedtuple: bool = False) -> None:
        """Search objects"""
        found = False
        if obj == item:
            found = True
            # We report the match but also continue inside the match to see if there are
            # further matches inside the `looped` object.
            self.__report(report_key='matched_values', key=parent, value=obj)

        try:
            if is_namedtuple:
                obj = obj._asdict()
            else:
                # Skip magic methods. Slightly hacky, but unless people are defining
                # new magic methods they want to search, it should work fine.
                obj = {i: getattr(obj, i) for i in dir(obj)
                       if not (i.startswith('__') and i.endswith('__'))}
        except AttributeError:
            try:
                obj = {i: getattr(obj, i) for i in obj.__slots__}
            except AttributeError:
                if not found:
                    unprocessed = self.get('unprocessed', [])
                    if isinstance(unprocessed, list):
                        unprocessed.append("%s" % parent)

                return

        self.__search_dict(
            obj, item, parent, parents_ids, print_as_attribute=True)

    def __skip_this(self, obj: Any, parent: str) -> bool:
        """Return True when ``obj`` at path ``parent`` is excluded by path, regex or type."""
        if parent in self.exclude_paths:
            return True
        if self.exclude_regex_paths and any(
                exclude_regex_path.search(parent) for exclude_regex_path in self.exclude_regex_paths):
            return True
        return isinstance(obj, self.exclude_types_tuple)

    def __search_dict(self,
                      obj: Union[Dict[Any, Any], MutableMapping[Any, Any]],
                      item: Any,
                      parent: str,
                      parents_ids: FrozenSet[int] = frozenset(),
                      print_as_attribute: bool = False) -> None:
        """Search dictionaries"""
        if print_as_attribute:
            parent_text = "%s.%s"
        else:
            parent_text = "%s[%s]"

        obj_keys = SetOrdered(obj.keys())

        for item_key in obj_keys:
            if not print_as_attribute and isinstance(item_key, strings):
                item_key_str = "'%s'" % item_key
            else:
                item_key_str = item_key

            obj_child = obj[item_key]

            item_id = id(obj_child)

            # Do not descend into an object that is already one of our ancestors.
            if parents_ids and item_id in parents_ids:
                continue

            parents_ids_added = add_to_frozen_set(parents_ids, item_id)

            new_parent = parent_text % (parent, item_key_str)
            new_parent_cased = new_parent if self.case_sensitive else new_parent.lower()

            str_item = str(item)
            if (self.match_string and str_item == new_parent_cased) or\
               (not self.match_string and str_item in new_parent_cased) or\
               (self.use_regexp and item.search(new_parent_cased)):
                self.__report(
                    report_key='matched_paths',
                    key=new_parent,
                    value=obj_child)

            self.__search(
                obj_child,
                item,
                parent=new_parent,
                parents_ids=parents_ids_added)

    def __search_iterable(self,
                          obj: Iterable[Any],
                          item: Any,
                          parent: str = "root",
                          parents_ids: FrozenSet[int] = frozenset()) -> None:
        """Search iterables except dictionaries, sets and strings."""
        for i, thing in enumerate(obj):
            new_parent = "{}[{}]".format(parent, i)
            if self.__skip_this(thing, parent=new_parent):
                continue

            if self.case_sensitive or not isinstance(thing, strings):
                thing_cased = thing
            else:
                thing_cased = thing.lower() if isinstance(thing, str) else thing

            if not self.use_regexp and thing_cased == item:
                self.__report(
                    report_key='matched_values', key=new_parent, value=thing)
            else:
                item_id = id(thing)
                if parents_ids and item_id in parents_ids:
                    continue
                parents_ids_added = add_to_frozen_set(parents_ids, item_id)
                self.__search(thing, item, "%s[%s]" %
                              (parent, i), parents_ids_added)

    def __search_str(self, obj: Union[str, bytes, memoryview], item: Union[str, bytes, memoryview, Pattern[str]], parent: str) -> None:
        """Compare strings"""
        obj_text = obj if self.case_sensitive else (obj.lower() if isinstance(obj, str) else obj)

        is_matched = False
        if self.use_regexp and isinstance(item, type(re.compile(''))):
            is_matched = bool(item.search(str(obj_text)))
        elif (self.match_string and str(item) == str(obj_text)) or (not self.match_string and str(item) in str(obj_text)):
            is_matched = True
        if is_matched:
            self.__report(report_key='matched_values', key=parent, value=obj)

    def __search_numbers(self, obj: Any, item: Any, parent: str) -> None:
        """Report a numeric ``obj`` that equals ``item``, or, when not strict, matches its string form or regex."""
        if (
            item == obj or (
                not self.strict_checking and (
                    item == str(obj) or (
                        self.use_regexp and item.search(str(obj))
                    )
                )
            )
        ):
            self.__report(report_key='matched_values', key=parent, value=obj)

    def __search_tuple(self, obj: Tuple[Any, ...], item: Any, parent: str, parents_ids: FrozenSet[int]) -> None:
        """Search a tuple as an iterable, or as an object when it looks like a namedtuple."""
        # Having `_asdict` probably means it is a named tuple.
        if hasattr(obj, '_asdict'):
            self.__search_obj(
                obj, item, parent, parents_ids, is_namedtuple=True)
        else:
            # It must be a normal tuple
            self.__search_iterable(obj, item, parent, parents_ids)

    def __search(self, obj: Any, item: Any, parent: str = "root", parents_ids: FrozenSet[int] = frozenset()) -> None:
        """The main search method"""
        # Exclusion is decided by the object being visited and its path, the
        # same way __search_iterable decides it for each of its elements.
        if self.__skip_this(obj, parent):
            return

        elif isinstance(obj, strings) and isinstance(item, (strings, RE_COMPILED_TYPE)):
            self.__search_str(obj, item, parent)

        elif isinstance(obj, strings) and isinstance(item, numbers):
            return

        elif isinstance(obj, ipranges):
            self.__search_str(str(obj), item, parent)

        elif isinstance(obj, numbers):
            self.__search_numbers(obj, item, parent)

        elif isinstance(obj, MutableMapping):
            self.__search_dict(obj, item, parent, parents_ids)

        elif isinstance(obj, tuple):
            self.__search_tuple(obj, item, parent, parents_ids)

        elif isinstance(obj, (set, frozenset)):
            if self.warning_num < 10:
                logger.warning(
                    "Set item detected in the path."
                    "'set' objects do NOT support indexing. But DeepSearch will still report a path."
                )
                self.warning_num += 1
            self.__search_iterable(obj, item, parent, parents_ids)

        elif isinstance(obj, Iterable) and not isinstance(obj, strings):
            self.__search_iterable(obj, item, parent, parents_ids)

        else:
            self.__search_obj(obj, item, parent, parents_ids)
class grep:
    # The user-facing documentation is loaded from search_doc.rst into `doc`.
    __doc__ = doc

    def __init__(self, item: Any, **kwargs: Any) -> None:
        """Remember what to search for and the options to hand to DeepSearch."""
        self.item: Any = item
        self.kwargs: Dict[str, Any] = kwargs

    def __ror__(self, other: Any) -> "DeepSearch":
        """Support ``obj | grep(item)``: search ``other`` for the stored item."""
        return DeepSearch(obj=other, item=self.item, **self.kwargs)
if __name__ == "__main__":  # pragma: no cover
    # When run as a script, execute the doctests embedded in this module's docstrings.
    import doctest
    doctest.testmod()
qlustered-deepdiff-41c7265/deepdiff/serialization.py 0000664 0000000 0000000 00000073102 15162412645 0022551 0 ustar 00root root 0000000 0000000 import pickle
import sys
import io
import os
import json
import uuid
import logging
import re # NOQA
import builtins # NOQA
import datetime # NOQA
import decimal # NOQA
import orderly_set # NOQA
import collections # NOQA
import fractions
import ipaddress
import base64
from copy import deepcopy, copy
from functools import partial
from collections.abc import Mapping, KeysView
from typing import (
Callable, Optional, Union,
overload, Literal, Any,
)
from deepdiff.helper import (
strings,
get_type,
TEXT_VIEW,
TREE_VIEW,
np_float32,
np_float64,
np_int32,
np_int64,
np_ndarray,
Opcode,
SetOrdered,
pydantic_base_model_type,
PydanticBaseModel,
NotPresent,
ipranges,
)
from deepdiff.model import DeltaResult
try:
import orjson
except ImportError: # pragma: no cover.
orjson = None
logger = logging.getLogger(__name__)
class UnsupportedFormatErr(TypeError):
    """Raised when a file type/extension is not one of the supported serialization formats."""
# The type of None. _RestrictedPickler compares objects against it by identity.
NONE_TYPE = type(None)
CSV_HEADER_MAX_CHUNK_SIZE = 2048  # The chunk needs to be big enough that covers a couple of rows of data.
# Message templates formatted with "module.name" by _RestrictedUnpickler.find_class.
MODULE_NOT_FOUND_MSG = 'DeepDiff Delta did not find {} in your modules. Please make sure it is already imported.'
FORBIDDEN_MODULE_MSG = "Module '{}' is forbidden. You need to explicitly pass it by passing a safe_to_import parameter"
# Error messages for delta creation; DELTA_ERROR_WHEN_GROUP_BY and
# DELTA_IGNORE_ORDER_NEEDS_REPETITION_REPORT are raised by SerializationMixin._to_delta_dict.
DELTA_IGNORE_ORDER_NEEDS_REPETITION_REPORT = 'report_repetition must be set to True when ignore_order is True to create the delta object.'
DELTA_ERROR_WHEN_GROUP_BY = 'Delta can not be made when group_by is used since the structure of data is modified from the original form.'
# Allow-list of "module.name" strings that _RestrictedUnpickler.find_class is
# willing to resolve while unpickling. Anything not listed here (or passed
# explicitly through safe_to_import) is rejected with ForbiddenModule.
SAFE_TO_IMPORT = frozenset({
    'builtins.range',
    'builtins.complex',
    'builtins.set',
    'builtins.frozenset',
    'builtins.slice',
    'builtins.str',
    'builtins.bytes',
    'builtins.list',
    'builtins.tuple',
    'builtins.int',
    'builtins.float',
    'builtins.dict',
    'builtins.bool',
    'builtins.bin',
    'builtins.None',
    'datetime.datetime',
    'datetime.time',
    'datetime.timedelta',
    'decimal.Decimal',
    'fractions.Fraction',
    'uuid.UUID',
    'orderly_set.sets.OrderedSet',
    'orderly_set.sets.OrderlySet',
    'orderly_set.sets.StableSetEq',
    'deepdiff.helper.SetOrdered',
    'collections.namedtuple',
    'collections.OrderedDict',
    're.Pattern',
    'deepdiff.helper.Opcode',
    'ipaddress.IPv4Interface',
    'ipaddress.IPv6Interface',
    'ipaddress.IPv4Network',
    'ipaddress.IPv6Network',
    'ipaddress.IPv4Address',
    'ipaddress.IPv6Address',
    'collections.abc.KeysView',
})
# Maps a type name (as a string) to the Python object used for it.
# NOTE(review): the consumers of this table are outside this part of the file;
# presumably it is used to turn serialized type names back into types - confirm.
# Not every entry maps a name to the type of the same name: 'None' and
# 'NoneType' map to None, 'iprange' maps to str and 'KeysView' maps to list,
# and 'bin' maps to the builtin function bin rather than to a type.
TYPE_STR_TO_TYPE = {
    'range': range,
    'complex': complex,
    'set': set,
    'frozenset': frozenset,
    'slice': slice,
    'str': str,
    'bytes': bytes,
    'list': list,
    'tuple': tuple,
    'int': int,
    'float': float,
    'dict': dict,
    'bool': bool,
    'bin': bin,
    'None': None,
    'NoneType': None,
    'datetime': datetime.datetime,
    'time': datetime.time,
    'timedelta': datetime.timedelta,
    'Decimal': decimal.Decimal,
    'SetOrdered': SetOrdered,
    'namedtuple': collections.namedtuple,
    'OrderedDict': collections.OrderedDict,
    'Pattern': re.Pattern,
    'iprange': str,
    'IPv4Address': ipaddress.IPv4Address,
    'IPv6Address': ipaddress.IPv6Address,
    'KeysView': list,
}
class ModuleNotFoundError(ImportError):
    """
    Raised when the module is not found in sys.modules.

    Note that within this module the name shadows the builtin exception of
    the same name.
    """
class ForbiddenModule(ImportError):
    """
    Raised when a module is not explicitly allowed to be imported.
    """
class SerializationMixin:
    """Serialization helpers (json, dict, delta dict, pretty text) for diff result objects."""

    def to_json_pickle(self):
        """
        :ref:`to_json_pickle_label`
        Get the json pickle of the diff object. Unless you need all the attributes and
        functionality of DeepDiff, running to_json() is the safer option than json pickle.

        Returns None (after logging an error) when jsonpickle is not installed.
        """
        try:
            import jsonpickle
            return jsonpickle.encode(self.copy())  # type: ignore
        except ImportError:  # pragma: no cover. Json pickle is getting deprecated.
            logger.error('jsonpickle library needs to be installed in order to run to_json_pickle')  # pragma: no cover. Json pickle is getting deprecated.
            return None

    @classmethod
    def from_json_pickle(cls, value):
        """
        :ref:`from_json_pickle_label`
        Load DeepDiff object with all the bells and whistles from the json pickle dump.
        Note that json pickle dump comes from to_json_pickle

        Returns None (after logging an error) when jsonpickle is not installed.
        """
        try:
            import jsonpickle
            return jsonpickle.decode(value)
        except ImportError:  # pragma: no cover. Json pickle is getting deprecated.
            logger.error('jsonpickle library needs to be installed in order to run from_json_pickle')  # pragma: no cover. Json pickle is getting deprecated.
            return None

    def to_json(self, default_mapping: Optional[dict]=None, force_use_builtin_json=False, verbose_level: Optional[int]=None, **kwargs):
        """
        Dump json of the text view.

        **Parameters**

        default_mapping : dictionary(optional), a dictionary of mapping of different types to json types.

        by default DeepDiff converts certain data types. For example Decimals into floats so they can be exported into json.
        If you have a certain object type that the json serializer can not serialize it, please pass the appropriate type
        conversion through this dictionary.

        force_use_builtin_json: Boolean, default = False
            When True, we use Python's builtin Json library for serialization,
            even if Orjson is installed.

        verbose_level: int, default=None
            Override the verbose_level for the serialized output. See to_dict() for details.

        kwargs: Any other kwargs you pass will be passed on to Python's json.dumps()

        **Example**

        Serialize custom objects
            >>> class A:
            ...     pass
            ...
            >>> class B:
            ...     pass
            ...
            >>> t1 = A()
            >>> t2 = B()
            >>> ddiff = DeepDiff(t1, t2)
            >>> ddiff.to_json()
            TypeError: We do not know how to convert ... for json serialization. Please pass the default_mapping parameter with proper mapping of the object to a basic python type.

            >>> default_mapping = {A: lambda x: 'obj A', B: lambda x: 'obj B'}
            >>> ddiff.to_json(default_mapping=default_mapping)
            '{"type_changes": {"root": {"old_type": "A", "new_type": "B", "old_value": "obj A", "new_value": "obj B"}}}'
        """
        return json_dumps(
            self.to_dict(verbose_level=verbose_level),
            default_mapping=default_mapping,
            force_use_builtin_json=force_use_builtin_json,
            **kwargs,
        )

    def to_dict(self, verbose_level: Optional[int]=None) -> dict:
        """
        Convert the result to a python dictionary.

        **Parameters**

        verbose_level: int, default=None
            Override the verbose_level for the serialized output.
            When None, the behavior depends on the original view:
            - If the original view is 'text', the verbose_level from DeepDiff initialization is used.
            - If the original view is 'tree', verbose_level=2 is used to provide the most detailed output.
            Valid values are 0, 1, or 2.

        **Raises**

        ValueError when verbose_level is given but is not 0, 1 or 2.
        """
        if verbose_level is None:
            # No override: the tree view always gets the most detailed text output.
            verbose_level = 2 if self.view == TREE_VIEW else self.verbose_level  # type: ignore
        elif verbose_level not in {0, 1, 2}:
            raise ValueError('verbose_level should be 0, 1, or 2.')

        text_view = self._get_view_results(TEXT_VIEW, verbose_level=verbose_level)  # type: ignore
        return dict(text_view)

    def _to_delta_dict(
        self,
        directed: bool = True,
        report_repetition_required: bool = True,
        always_include_values: bool = False,
    ) -> dict:
        """
        Dump to a dictionary suitable for delta usage.
        Unlike to_dict, this is not dependent on the original view that the user chose to create the diff.

        **Parameters**

        directed : Boolean, default=True, whether to create a directional delta dictionary or a symmetrical

        Note that in the current implementation the symmetrical delta (non-directional) is ONLY used for verifying that
        the delta is being applied to the exact same values as what was used to generate the delta and has
        no other usages.

        If this option is set as True, then the dictionary will not have the "old_value" in the output.
        Otherwise it will have the "old_value". "old_value" is the value of the item in t1.

        If delta = Delta(DeepDiff(t1, t2)) then
        t1 + delta == t2

        Note that the items in t1 + delta might have slightly different order of items than t2 if ignore_order
        was set to be True in the diff object.

        **Raises**

        ValueError when group_by was used, or when ignore_order is set without
        report_repetition while report_repetition_required is True.
        """
        if self.group_by is not None:  # type: ignore
            raise ValueError(DELTA_ERROR_WHEN_GROUP_BY)

        if directed and not always_include_values:
            # Rebuild every opcode from the index fields and new_values only.
            _iterable_opcodes = {
                path: [
                    Opcode(
                        tag=op_code.tag,
                        t1_from_index=op_code.t1_from_index,
                        t1_to_index=op_code.t1_to_index,
                        t2_from_index=op_code.t2_from_index,
                        t2_to_index=op_code.t2_to_index,
                        new_values=op_code.new_values,
                    )
                    for op_code in op_codes
                ]
                for path, op_codes in self._iterable_opcodes.items()  # type: ignore
            }
        else:
            _iterable_opcodes = self._iterable_opcodes  # type: ignore

        result = DeltaResult(
            tree_results=self.tree,  # type: ignore
            ignore_order=self.ignore_order,  # type: ignore
            always_include_values=always_include_values,
            _iterable_opcodes=_iterable_opcodes,
        )
        result.remove_empty_keys()

        if report_repetition_required and self.ignore_order and not self.report_repetition:  # type: ignore
            raise ValueError(DELTA_IGNORE_ORDER_NEEDS_REPETITION_REPORT)

        if directed:
            # A directed delta does not carry the value from t1.
            for _report_key, report_value in result.items():
                if not isinstance(report_value, Mapping):
                    continue
                for _path, value in report_value.items():
                    if isinstance(value, Mapping) and 'old_value' in value:
                        del value['old_value']  # type: ignore

        if self._numpy_paths:  # type: ignore
            # Note that keys that start with '_' are considered internal to DeepDiff
            # and will be omitted when counting distance. (Look inside the distance module.)
            result['_numpy_paths'] = self._numpy_paths  # type: ignore

        if self.iterable_compare_func:  # type: ignore
            result['_iterable_compare_func_was_used'] = True

        return deepcopy(dict(result))

    def pretty(self, prefix: Optional[Union[str, Callable]]=None):
        """
        The pretty human readable string output for the diff object
        regardless of what view was used to generate the diff.

        prefix can be a callable or a string or None. A callable is invoked
        as ``prefix(diff=self)`` for every output line.

        Example:
            >>> t1={1,2,4}
            >>> t2={2,3}
            >>> print(DeepDiff(t1, t2).pretty())
            Item root[3] added to set.
            Item root[4] removed from set.
            Item root[1] removed from set.
        """
        if prefix is None:
            prefix = ''

        lines = []
        # sorting keys to guarantee constant order across python versions.
        for key in sorted(self.tree.keys()):  # type: ignore
            lines.extend(pretty_print_diff(item_key) for item_key in self.tree[key])  # type: ignore

        if callable(prefix):
            return "\n".join(f"{prefix(diff=self)}{line}" for line in lines)
        return "\n".join(f"{prefix}{line}" for line in lines)
# Maximum size allowed for integer arguments to constructors that allocate
# memory proportional to the argument (e.g. bytes(n), bytearray(n)).
# This prevents denial-of-service via crafted pickle payloads. (CVE-2026-33155)
_MAX_ALLOC_SIZE = 128 * 1024 * 1024 # 128 MB
# Callables where an integer argument directly controls memory allocation size.
_SIZE_SENSITIVE_CALLABLES = frozenset({bytes, bytearray})
class _SafeConstructor:
"""Wraps a type constructor to prevent excessive memory allocation via the REDUCE opcode."""
__slots__ = ('_wrapped',)
def __init__(self, wrapped):
self._wrapped = wrapped
def __call__(self, *args, **kwargs):
for arg in args:
if isinstance(arg, int) and arg > _MAX_ALLOC_SIZE:
raise pickle.UnpicklingError(
"Refusing to create {}() with size {}: "
"exceeds the maximum allowed size of {} bytes. "
"This could be a denial-of-service attack payload.".format(
self._wrapped.__name__, arg, _MAX_ALLOC_SIZE
)
)
return self._wrapped(*args, **kwargs)
class _RestrictedUnpickler(pickle.Unpickler):
    """Unpickler that only resolves globals found in an allow-list.

    The allow-list is SAFE_TO_IMPORT plus whatever is passed through the
    ``safe_to_import`` keyword argument (a single "module.name" string or an
    iterable of them). All other arguments go to pickle.Unpickler.
    """

    def __init__(self, *args, **kwargs):
        self.safe_to_import = kwargs.pop('safe_to_import', None)
        if self.safe_to_import:
            if isinstance(self.safe_to_import, strings):
                # A single name: wrap it instead of iterating over its characters.
                self.safe_to_import = set([self.safe_to_import])
            elif isinstance(self.safe_to_import, (set, frozenset)):
                pass
            else:
                self.safe_to_import = set(self.safe_to_import)
            self.safe_to_import = self.safe_to_import | SAFE_TO_IMPORT
        else:
            self.safe_to_import = SAFE_TO_IMPORT
        super().__init__(*args, **kwargs)

    def find_class(self, module, name):
        """Resolve "module.name" if it is allowed.

        Raises ModuleNotFoundError when the module is allowed but is not in
        sys.modules, and ForbiddenModule for anything not in the allow-list.
        """
        # Only allow safe classes from self.safe_to_import.
        module_dot_class = '{}.{}'.format(module, name)
        if module_dot_class in self.safe_to_import:
            try:
                module_obj = sys.modules[module]
            except KeyError:
                raise ModuleNotFoundError(MODULE_NOT_FOUND_MSG.format(module_dot_class)) from None
            cls = getattr(module_obj, name)
            # Wrap size-sensitive callables to prevent DoS via large allocations
            if cls in _SIZE_SENSITIVE_CALLABLES:
                return _SafeConstructor(cls)
            return cls
        # Forbid everything else.
        raise ForbiddenModule(FORBIDDEN_MODULE_MSG.format(module_dot_class)) from None

    def persistent_load(self, pid):
        """Resolve the persistent id that _RestrictedPickler emits for the type of None.

        Raises pickle.UnpicklingError for any other persistent id instead of
        silently turning it into None.
        """
        if pid == "<>":
            return type(None)
        raise pickle.UnpicklingError("unsupported persistent id encountered: {!r}".format(pid))
class _RestrictedPickler(pickle.Pickler):
    """Pickler that stores the type of None as a persistent id."""

    def persistent_id(self, obj):
        """Return the persistent id for the type of None; None means "pickle as usual"."""
        return "<>" if obj is NONE_TYPE else None  # NOQA
def pickle_dump(obj, file_obj=None, protocol=4):
    """
    **pickle_dump**
    Dumps the obj into pickled content.

    **Parameters**

    obj : Any python object

    file_obj : (Optional) A file object to dump the contents into

    protocol : The pickle protocol to use, default 4.

    **Returns**

    If file_obj is passed the return value will be None. It will write the object's pickle contents into the file.
    However if no file_obj is passed, then it will return the pickle serialization of the obj in the form of bytes.
    """
    # Decide by identity, not truthiness: a file-like object that happens to
    # be falsy must still be written to.
    if file_obj is not None:
        _RestrictedPickler(file_obj, protocol=protocol, fix_imports=False).dump(obj)
        return None

    buffer = io.BytesIO()
    _RestrictedPickler(buffer, protocol=protocol, fix_imports=False).dump(obj)
    return buffer.getvalue()
def pickle_load(content=None, file_obj=None, safe_to_import=None):
    """
    **pickle_load**
    Load the pickled content. content should be a bytes object.

    **Parameters**

    content : Bytes of pickled object. A str is encoded as utf-8 first.

    file_obj : A file object to load the content from. Only used when no content is given.

    safe_to_import : A set of modules that needs to be explicitly allowed to be loaded.
        Example: {'mymodule.MyClass', 'decimal.Decimal'}
        Note that this set will be added to the basic set of modules that are already allowed.
        The set of what is already allowed can be found in deepdiff.serialization.SAFE_TO_IMPORT

    **Returns**

        The unpickled object, for example a delta that can be added to t1 to recreate t2.

    **Raises**

        ValueError when neither content nor file_obj is provided.

    **Examples**

    Importing
        >>> from deepdiff import DeepDiff, Delta
        >>> from pprint import pprint
    """
    if not (content or file_obj):
        raise ValueError('Please either pass the content or the file_obj to pickle_load.')
    if isinstance(content, str):
        content = content.encode('utf-8')
    # Content, when present, takes precedence over file_obj.
    source = io.BytesIO(content) if content else file_obj
    unpickler = _RestrictedUnpickler(source, safe_to_import=safe_to_import)
    return unpickler.load()
def _get_pretty_form_text(verbose_level):
pretty_form_texts = {
"type_changes": "Type of {diff_path} changed from {type_t1} to {type_t2} and value changed from {val_t1} to {val_t2}.",
"values_changed": "Value of {diff_path} changed from {val_t1} to {val_t2}.",
"dictionary_item_added": "Item {diff_path} added to dictionary.",
"dictionary_item_removed": "Item {diff_path} removed from dictionary.",
"iterable_item_added": "Item {diff_path} added to iterable.",
"iterable_item_removed": "Item {diff_path} removed from iterable.",
"attribute_added": "Attribute {diff_path} added.",
"attribute_removed": "Attribute {diff_path} removed.",
"set_item_added": "Item root[{val_t2}] added to set.",
"set_item_removed": "Item root[{val_t1}] removed from set.",
"repetition_change": "Repetition change for item {diff_path}.",
}
if verbose_level == 2:
pretty_form_texts.update(
{
"dictionary_item_added": "Item {diff_path} ({val_t2}) added to dictionary.",
"dictionary_item_removed": "Item {diff_path} ({val_t1}) removed from dictionary.",
"iterable_item_added": "Item {diff_path} ({val_t2}) added to iterable.",
"iterable_item_removed": "Item {diff_path} ({val_t1}) removed from iterable.",
"attribute_added": "Attribute {diff_path} ({val_t2}) added.",
"attribute_removed": "Attribute {diff_path} ({val_t1}) removed.",
}
)
return pretty_form_texts
def pretty_print_diff(diff):
    """
    Render one diff item as a human readable sentence.

    The sentence template is picked by diff.report_type (an unknown report
    type yields an empty string). Values whose type name is "str" are wrapped
    in double quotes.
    """
    type_t1 = get_type(diff.t1).__name__
    type_t2 = get_type(diff.t2).__name__

    if type_t1 == "str":
        val_t1 = '"{}"'.format(str(diff.t1))
    else:
        val_t1 = str(diff.t1)
    if type_t2 == "str":
        val_t2 = '"{}"'.format(str(diff.t2))
    else:
        val_t2 = str(diff.t2)

    diff_path = diff.path(root='root')
    template = _get_pretty_form_text(diff.verbose_level).get(diff.report_type, "")
    return template.format(
        diff_path=diff_path,
        type_t1=type_t1,
        type_t2=type_t2,
        val_t1=val_t1,
        val_t2=val_t2)
def load_path_content(path, file_type=None):
    """
    Loads and deserializes the content of the path.

    **Parameters**

    path : path of the file to read.

    file_type : one of json, yaml, yml, toml, pickle, csv or tsv.
        When None, the text after the last dot of the path is used.

    **Returns**

    The deserialized content. For csv/tsv this is a list of row dictionaries
    in which string values that look like int, float or complex numbers have
    been converted to numbers.

    **Raises**

    ImportError when the library needed for yaml or toml is missing.
    UnsupportedFormatErr when the file type is not supported.
    """
    if file_type is None:
        file_type = path.split('.')[-1]
    if file_type == 'json':
        with open(path, 'r') as the_file:
            content = json_loads(the_file.read())
    elif file_type in {'yaml', 'yml'}:
        try:
            import yaml
        except ImportError:  # pragma: no cover.
            raise ImportError('Pyyaml needs to be installed.') from None  # pragma: no cover.
        with open(path, 'r') as the_file:
            content = yaml.safe_load(the_file)
    elif file_type == 'toml':
        try:
            if sys.version_info >= (3, 11):
                import tomllib as tomli
            else:
                import tomli
        except ImportError:  # pragma: no cover.
            raise ImportError('On python<=3.10 tomli needs to be installed.') from None  # pragma: no cover.
        with open(path, 'rb') as the_file:
            content = tomli.load(the_file)
    elif file_type == 'pickle':
        with open(path, 'rb') as the_file:
            content = the_file.read()
            # pickle_load goes through the restricted unpickler and its allow-list.
            content = pickle_load(content)
    elif file_type in {'csv', 'tsv'}:
        try:
            import clevercsv  # type: ignore
            content = clevercsv.read_dicts(path)
        except ImportError:  # pragma: no cover.
            import csv
            # newline='' lets the csv module handle newlines embedded in quoted fields.
            with open(path, 'r', newline='') as the_file:
                content = list(csv.DictReader(the_file))

        if not content:
            logger.info(f"NOTE: CSV content was empty in {path}")

        # Everything in csv is string but we try to automatically convert any numbers we find
        for row in content:
            for key, value in row.items():
                # Rows shorter or longer than the header yield None or list
                # values; only strings can be stripped and converted.
                if not isinstance(value, str):
                    continue
                value = value.strip()
                for type_ in [int, float, complex]:
                    try:
                        value = type_(value)
                    except Exception:
                        pass
                    else:
                        row[key] = value
                        break
    else:
        raise UnsupportedFormatErr(f'Only json, yaml, toml, csv, tsv and pickle are supported.\n'
                                   f' The {file_type} extension is not known.')
    return content
def save_content_to_path(content, path, file_type=None, keep_backup=True):
    """
    Saves and serializes the content of the path.

    An existing file at ``path`` is first moved to ``<path>.bak``. If saving
    fails the original file is put back and the exception is re-raised.

    **Parameters**

    content : the object to serialize.

    path : path of the file to write.

    file_type : one of json, yaml, yml, toml, pickle, csv or tsv.
        When None, the text after the last dot of the path is used,
        the same way load_path_content infers it.

    keep_backup : when False, the backup file is deleted after a successful save.
    """
    if file_type is None:
        file_type = path.split('.')[-1]

    backup_path = f"{path}.bak"
    had_original = os.path.exists(path)
    if had_original:
        # os.replace overwrites a stale backup on every platform.
        os.replace(path, backup_path)

    try:
        _save_content(
            content=content, path=path,
            file_type=file_type, keep_backup=keep_backup)
    except Exception:
        if had_original:
            os.replace(backup_path, path)
        elif os.path.exists(path):
            # There was nothing to restore; do not leave a partial file behind.
            os.remove(path)
        raise
    else:
        if had_original and not keep_backup:
            os.remove(backup_path)
def _save_content(content, path, file_type, keep_backup=True):
if file_type == 'json':
with open(path, 'w') as the_file:
content = json_dumps(content)
the_file.write(content) # type: ignore
elif file_type in {'yaml', 'yml'}:
try:
import yaml
except ImportError: # pragma: no cover.
raise ImportError('Pyyaml needs to be installed.') from None # pragma: no cover.
with open(path, 'w') as the_file:
content = yaml.safe_dump(content, stream=the_file)
elif file_type == 'toml':
try:
import tomli_w
except ImportError: # pragma: no cover.
raise ImportError('Tomli-w needs to be installed.') from None # pragma: no cover.
with open(path, 'wb') as the_file:
content = tomli_w.dump(content, the_file)
elif file_type == 'pickle':
with open(path, 'wb') as the_file:
content = pickle_dump(content, file_obj=the_file)
elif file_type in {'csv', 'tsv'}:
try:
import clevercsv # type: ignore
dict_writer = clevercsv.DictWriter
except ImportError: # pragma: no cover.
import csv
dict_writer = csv.DictWriter
with open(path, 'w', newline='') as csvfile:
fieldnames = list(content[0].keys())
writer = dict_writer(csvfile, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(content)
else:
raise UnsupportedFormatErr('Only json, yaml, toml, csv, tsv and pickle are supported.\n'
f' The {file_type} extension is not known.')
return content
def _serialize_decimal(value):
if value.as_tuple().exponent == 0:
return int(value)
else:
return float(value)
def _serialize_fraction(value):
if value.denominator == 1:
return value.numerator
else:
return float(value)
def _serialize_tuple(value):
if hasattr(value, '_asdict'): # namedtuple
return value._asdict()
return value
def _serialize_bytes(value):
"""
Serialize bytes to JSON-compatible format.
First tries UTF-8 decoding for backward compatibility.
Falls back to base64 encoding for binary data.
"""
try:
return value.decode('utf-8')
except UnicodeDecodeError:
return base64.b64encode(value).decode('ascii')
# Maps a type to the callable that converts its instances into something JSON can serialize.
# The converter built by json_convertor_default() walks this dict in insertion order and uses
# the first entry whose type matches via isinstance(), so the order of entries is significant.
JSON_CONVERTOR = {
    decimal.Decimal: _serialize_decimal,
    fractions.Fraction: _serialize_fraction,
    SetOrdered: list,
    orderly_set.StableSetEq: list,
    set: list,
    type: lambda x: x.__name__,
    bytes: _serialize_bytes,
    datetime.datetime: lambda x: x.isoformat(),
    uuid.UUID: lambda x: str(x),
    np_float32: float,
    np_float64: float,
    np_int32: int,
    np_int64: int,
    np_ndarray: lambda x: x.tolist(),
    tuple: _serialize_tuple,
    Mapping: dict,
    NotPresent: str,
    ipranges: str,
    memoryview: lambda x: x.tobytes(),
    KeysView: list,
}

# NOTE(review): presumably pydantic_base_model_type is the stand-in used when pydantic is not
# installed, so this entry is only registered when the real BaseModel is available -- confirm.
if PydanticBaseModel is not pydantic_base_model_type:
    JSON_CONVERTOR[PydanticBaseModel] = lambda x: x.model_dump()
def json_convertor_default(default_mapping=None):
    """
    Build the ``default`` callable handed to the JSON serializer.

    ``default_mapping`` (type -> converter) is layered on top of a copy of
    JSON_CONVERTOR when given; otherwise JSON_CONVERTOR itself is used.

    The returned function tries, in order: the type mapping (first isinstance
    match wins), list reverse-iterators, public @property values found along
    the class MRO, public entries of the instance ``__dict__``. It raises
    TypeError when none of these apply.
    """
    if default_mapping:
        type_to_converter = dict(JSON_CONVERTOR)
        type_to_converter.update(default_mapping)
    else:
        type_to_converter = JSON_CONVERTOR

    def _convertor(obj):
        # 1) explicit type mapping
        for known_type, convert in type_to_converter.items():
            if isinstance(obj, known_type):
                return convert(obj)

        obj_class = obj.__class__

        # 2) reversed(list) yields a list_reverseiterator; materialize a copy of it
        if obj_class.__name__ == 'list_reverseiterator':
            return list(copy(obj))

        # 3) public @property values declared on the class or any of its bases
        property_values = {}
        for klass in obj_class.__mro__:
            for attr_name, attr in klass.__dict__.items():
                if not isinstance(attr, property) or attr_name.startswith('_'):
                    continue
                try:
                    property_values[attr_name] = getattr(obj, attr_name)
                except Exception:
                    # a property that raises is simply left out
                    pass
        if property_values:
            return property_values

        # 4) public instance attributes
        if hasattr(obj, '__dict__'):
            public_attrs = {}
            for attr_name, attr_value in vars(obj).items():
                if not attr_name.startswith('_'):
                    public_attrs[attr_name] = attr_value
            return public_attrs

        # 5) nothing worked
        raise TypeError(
            f"Don't know how to JSON-serialize {obj!r} "
            f"(type {type(obj).__name__}); "
            "consider adding it to default_mapping."
        )

    return _convertor
class JSONDecoder(json.JSONDecoder):
    """
    json.JSONDecoder that installs its own ``object_hook``.

    Decoded objects containing both 'old_type' and 'new_type' get those two
    values looked up in TYPE_STR_TO_TYPE; names without an entry are kept as-is.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(object_hook=self.object_hook, *args, **kwargs)

    def object_hook(self, obj):  # type: ignore
        has_both_types = 'old_type' in obj and 'new_type' in obj
        if not has_both_types:
            return obj
        for type_key in ('old_type', 'new_type'):
            type_name = obj[type_key]
            obj[type_key] = TYPE_STR_TO_TYPE.get(type_name, type_name)
        return obj
# Typing-only overloads for json_dumps: the return type depends on ``return_bytes``.
# Type checkers use the first overload that matches, so the bytes-returning
# variants must come before the general str-returning one; otherwise a call such
# as json_dumps(x, return_bytes=True) is swallowed by ``**kwargs`` and typed as str.

# return_bytes=True passed positionally.
@overload
def json_dumps(
    item: Any,
    default_mapping: Optional[dict],
    force_use_builtin_json: bool,
    return_bytes: Literal[True],
    **kwargs,
) -> bytes:
    ...


# return_bytes=True passed by keyword, with the earlier parameters optional.
@overload
def json_dumps(
    item: Any,
    default_mapping: Optional[dict] = ...,
    force_use_builtin_json: bool = ...,
    *,
    return_bytes: Literal[True],
    **kwargs,
) -> bytes:
    ...


# Everything else (return_bytes omitted or False) returns str.
@overload
def json_dumps(
    item: Any,
    default_mapping: Optional[dict] = ...,
    force_use_builtin_json: bool = ...,
    return_bytes: Literal[False] = ...,
    **kwargs,
) -> str:
    ...
_INT64_MAX = 9223372036854775807
_INT64_MIN = -9223372036854775808
def _convert_oversized_ints(obj):
"""Recursively convert integers exceeding 64-bit range to strings.
orjson cannot serialize integers outside the signed 64-bit range."""
if isinstance(obj, bool):
return obj
if isinstance(obj, int) and (obj > _INT64_MAX or obj < _INT64_MIN):
return str(obj)
if isinstance(obj, dict):
return {k: _convert_oversized_ints(v) for k, v in obj.items()}
if isinstance(obj, (list, tuple)):
converted = [_convert_oversized_ints(v) for v in obj]
if hasattr(obj, '_fields'):
# NamedTuple: reconstruct using keyword arguments
return type(obj)(**dict(zip(obj._fields, converted)))
return type(obj)(converted)
return obj
def json_dumps(
    item: Any,
    default_mapping:Optional[dict]=None,
    force_use_builtin_json: bool = False,
    return_bytes: bool = False,
    **kwargs,
) -> Union[str, bytes]:
    """
    Dump json with extra details that are not normally json serializable

    parameters
    ----------

    default_mapping: dict or None, default = None
        Extra type -> converter entries passed on to json_convertor_default.

    force_use_builtin_json: Boolean, default = False
        When True, we use Python's builtin Json library for serialization,
        even if Orjson is installed.

    return_bytes: Boolean, default = False
        When True the result is bytes, otherwise str.

    Remaining keyword arguments are forwarded to the serializer in use. With
    orjson, ``indent`` only switches on 2-space indentation and ``sort_keys``
    is rejected with a TypeError.
    """
    if not orjson or force_use_builtin_json:
        # Builtin json path.
        text = json.dumps(
            item,
            default=json_convertor_default(default_mapping=default_mapping),
            **kwargs)
        return text.encode(encoding='utf-8') if return_bytes else text

    # orjson path.
    indent = kwargs.pop('indent', None)
    options = orjson.OPT_NON_STR_KEYS | orjson.OPT_SERIALIZE_NUMPY
    if indent:
        options |= orjson.OPT_INDENT_2
    kwargs['option'] = options
    if 'sort_keys' in kwargs:
        raise TypeError(
            "orjson does not accept the sort_keys parameter. "
            "If you need to pass sort_keys, set force_use_builtin_json=True "
            "to use Python's built-in json library instead of orjson.")

    def _orjson_dump(payload):
        return orjson.dumps(
            payload,
            default=json_convertor_default(default_mapping=default_mapping),
            **kwargs)

    try:
        raw = _orjson_dump(item)
    except TypeError as exc:
        if 'Integer exceeds 64-bit range' not in str(exc):
            raise
        # Retry once with the oversized integers turned into strings.
        item = _convert_oversized_ints(item)
        raw = _orjson_dump(item)

    if return_bytes:
        return raw
    return raw.decode(encoding='utf-8')
# json.loads with this module's JSONDecoder pre-bound as the decoder class.
json_loads = partial(json.loads, cls=JSONDecoder)
qlustered-deepdiff-41c7265/deepdiff/summarize.py 0000664 0000000 0000000 00000012773 15162412645 0021717 0 ustar 00root root 0000000 0000000 from typing import Tuple
from deepdiff.helper import JSON, SummaryNodeType
from deepdiff.serialization import json_dumps
def _truncate(s: str, max_len: int) -> str:
"""
Truncate string s to max_len characters.
If possible, keep the first (max_len-5) characters, then '...' then the last 2 characters.
"""
if len(s) <= max_len:
return s
if max_len <= 5:
return s[:max_len]
return s[:max_len - 5] + "..." + s[-2:]
# Recursively calculate the weight of every node in a JSON-like tree.
def calculate_weights(node):
    """
    Return ``(weight, structure)`` for ``node``.

    ``structure`` is ``(SummaryNodeType, info)``:
      * dict -> info maps each key to (key_weight, value_weight, value_structure);
        a key's weight is ``len(key)``, or 1 when the key has no length.
      * list -> info is a list of (0, item_weight, item_structure).
      * leaf -> info is the value itself.

    Leaf weights: length of a str, number of characters of an int, number of
    characters of a float rounded to 2 places, 1 for None, 0 for anything else.
    """
    if isinstance(node, dict):
        dict_weight = 0
        per_key = {}
        for key, value in node.items():
            try:
                key_weight = len(key)
            except TypeError:
                key_weight = 1
            value_weight, value_structure = calculate_weights(value)
            dict_weight += key_weight + value_weight
            per_key[key] = (key_weight, value_weight, value_structure)
        return dict_weight, (SummaryNodeType.dict, per_key)

    if isinstance(node, list):
        # List indexes carry no weight of their own.
        per_item = [(0, *calculate_weights(item)) for item in node]
        list_weight = sum(item_weight for _, item_weight, _ in per_item)
        return list_weight, (SummaryNodeType.list, per_item)

    if isinstance(node, str):
        leaf_weight = len(node)
    elif isinstance(node, int):
        leaf_weight = len(str(node))
    elif isinstance(node, float):
        leaf_weight = len(str(round(node, 2)))
    elif node is None:
        leaf_weight = 1
    else:
        leaf_weight = 0
    return leaf_weight, (SummaryNodeType.leaf, node)
# Balanced shrinking (simplified): trim a weighted tree so that it fits a weight budget,
# capping every branch at a fraction (balance_threshold) of its parent's budget.
def shrink_tree_balanced(node_structure, max_weight: int, balance_threshold: float) -> Tuple[JSON, float]:
    """
    Shrink a weighted tree, as produced by calculate_weights, to a weight budget.

    Parameters
    ----------
    node_structure: ``(SummaryNodeType, info)`` tuple from calculate_weights.
    max_weight: Weight budget for this node.
    balance_threshold: Largest share of ``max_weight`` a single child may use.

    Returns
    -------
    ``(shrunk_value, weight)``. ``(None, 0)`` means nothing could be kept.
    Children are visited heaviest first, so list items may be reordered, and a
    truncated list gets a trailing "..." marker.

    Note: a child that shrinks to None is dropped by its parent, which also
    drops genuine None leaves.
    """
    node_type, node_info = node_structure

    if node_type is SummaryNodeType.leaf:
        leaf_value = node_info
        leaf_weight, _ = calculate_weights(leaf_value)
        if leaf_weight <= max_weight:
            return leaf_value, leaf_weight
        else:
            if isinstance(leaf_value, str):
                truncated_value = _truncate(leaf_value, max_weight)
                return truncated_value, len(truncated_value)
            elif isinstance(leaf_value, (int, float)):
                # Cut the textual form, then keep it numeric when it still parses.
                leaf_str = str(leaf_value)
                truncated_str = leaf_str[:max_weight]
                try:
                    return int(truncated_str), len(truncated_str)
                except Exception:
                    try:
                        return float(truncated_str), len(truncated_str)
                    except Exception:
                        return truncated_str, len(truncated_str)
            elif leaf_value is None:
                return None, 1 if max_weight >= 1 else 0

    elif node_type is SummaryNodeType.dict:
        shrunk_dict = {}
        total_weight = 0
        # Heaviest branches (key weight + value weight) first.
        sorted_children = sorted(node_info.items(), key=lambda x: x[1][0] + x[1][1], reverse=True)

        for k, (edge_w, _, child_struct) in sorted_children:
            allowed_branch_weight = min(max_weight * balance_threshold, max_weight - total_weight)
            if allowed_branch_weight <= edge_w:
                # Not even the key fits in this branch's share.
                continue

            remaining_weight = int(allowed_branch_weight - edge_w)
            shrunk_child, shrunk_weight = shrink_tree_balanced(child_struct, remaining_weight, balance_threshold)
            if shrunk_child is not None:
                # Keep the key as-is. The former k[:edge_w] was a no-op for sized
                # keys (edge_w == len(k)) and raised TypeError for keys that
                # cannot be sliced, e.g. ints, which calculate_weights accepts.
                shrunk_dict[k] = shrunk_child
                total_weight += edge_w + shrunk_weight

            if total_weight >= max_weight:
                break
        if not shrunk_dict:
            return None, 0

        return shrunk_dict, total_weight

    elif node_type is SummaryNodeType.list:
        shrunk_list = []
        total_weight = 0
        # Heaviest items first.
        sorted_children = sorted(node_info, key=lambda x: x[0] + x[1], reverse=True)
        for edge_w, _, child_struct in sorted_children:
            allowed_branch_weight = int(min(max_weight * balance_threshold, max_weight - total_weight))
            shrunk_child, shrunk_weight = shrink_tree_balanced(child_struct, allowed_branch_weight, balance_threshold)
            if shrunk_child is not None:
                shrunk_list.append(shrunk_child)
                total_weight += shrunk_weight
            if total_weight >= max_weight - 1:
                # Budget (almost) used up: mark the list as truncated and stop.
                shrunk_list.append("...")
                break
        if not shrunk_list:
            return None, 0
        return shrunk_list, total_weight

    # Unknown node type, or an oversized leaf of a type that cannot be shortened.
    return None, 0
def greedy_tree_summarization_balanced(json_data: JSON, max_weight: int, balance_threshold=0.6) -> JSON:
    """
    Return ``json_data`` unchanged when its total weight fits ``max_weight``,
    otherwise the tree shrunk by shrink_tree_balanced.
    """
    data_weight, weighted_tree = calculate_weights(json_data)
    fits_budget = data_weight <= max_weight
    if not fits_budget:
        summary, _ = shrink_tree_balanced(weighted_tree, max_weight, balance_threshold)
        return summary
    return json_data
def summarize(data: JSON, max_length:int=200, balance_threshold:float=0.6) -> str:
    """
    Return a JSON string summarizing ``data`` within a weight budget of
    ``max_length``. If summarizing or serializing raises, ``str(data)`` is
    returned instead.
    """
    try:
        summary_tree = greedy_tree_summarization_balanced(data, max_length, balance_threshold)
        return json_dumps(summary_tree)
    except Exception:
        # Best effort: fall back to the plain string form.
        return str(data)
qlustered-deepdiff-41c7265/docs/ 0000775 0000000 0000000 00000000000 15162412645 0016501 5 ustar 00root root 0000000 0000000 qlustered-deepdiff-41c7265/docs/AGENTS.md 0000777 0000000 0000000 00000000000 15162412645 0021256 2CLAUDE.md ustar 00root root 0000000 0000000 qlustered-deepdiff-41c7265/docs/CLAUDE.md 0000664 0000000 0000000 00000004010 15162412645 0017753 0 ustar 00root root 0000000 0000000 # Docs CLAUDE.md
## Building Documentation
```bash
cd docs && source ~/.venvs/deep/bin/activate && python buildme.py
```
The build output goes to `~/Workspace/sites/blog/public/deepdiff/<version>/` (configured via `docs/.env`).
To verify changes, check the generated HTML, e.g. `~/Workspace/sites/blog/public/deepdiff/8.7.0/faq.html`.
The Sphinx doctree cache is stored in `/tmp/sphinx_doctree`. The build script clears it each run. If you get permission errors on that directory, ask the user to `rm -rf /tmp/sphinx_doctree` first.
## Theme
Uses the **Furo** Sphinx theme. Key customizations:
- **Font**: Open Sans, loaded via `_static/custom.css` and set in `conf.py` via `light_css_variables` / `dark_css_variables`
- **Footer**: Custom `_templates/page.html` overrides the default footer to remove Sphinx/Furo credit while keeping the copyright notice
- **GA4**: Google Analytics tag (`G-KVVHD37BKD`) is injected via `_templates/page.html` in the `extrahead` block
- **Pygments**: Uses Furo's default syntax highlighting (no explicit `pygments_style` set)
## Symlinked Docstrings
Some RST files in `docs/` (e.g., `diff_doc.rst`, `deephash_doc.rst`, `search_doc.rst`) are symlinks to `deepdiff/docstrings/`. The files need to exist in both places:
- **`deepdiff/docstrings/`** — So they're included in the generated wheel. `flit_core` (our build system) only packages files under the `deepdiff/` directory, and these are loaded at runtime by `get_doc()` in `helper.py` to serve as Python docstrings.
- **`docs/`** — So Sphinx can find and build them as documentation pages.
These files have a `:orphan:` directive on line 1 (needed by Sphinx to suppress toctree warnings). `get_doc()` strips it at runtime so it doesn't appear in the Python docstrings.
## File Structure
- `conf.py` — Sphinx configuration
- `buildme.py` — Build script (reads `.env` for `BUILD_PATH` and `DOC_VERSION`)
- `_templates/page.html` — Extends `furo/page.html` for GA4 and custom footer
- `_static/custom.css` — Loads Open Sans font from Google Fonts
qlustered-deepdiff-41c7265/docs/Makefile 0000664 0000000 0000000 00000016644 15162412645 0020154 0 ustar 00root root 0000000 0000000 # Makefile for Sphinx documentation
#
# You can set these variables from the command line.
# SPHINXOPTS: extra options appended to every sphinx-build invocation.
SPHINXOPTS =
# SPHINXBUILD: the sphinx-build executable to run.
SPHINXBUILD = sphinx-build
# PAPER: selects PAPEROPT_a4 or PAPEROPT_letter when set to a4 or letter.
PAPER =
# BUILDDIR: root directory for all build output.
BUILDDIR = _build
# User-friendly check for sphinx-build.
# `command -v` is the POSIX way to look up a program, and comparing against 0
# treats every failure status (not only 1) as "not found".
ifneq ($(shell command -v $(SPHINXBUILD) >/dev/null 2>&1; echo $$?),0)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif
# Internal variables.
# PAPEROPT_<size> holds the LaTeX paper-size option chosen through $(PAPER).
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
# Options shared by all builders: doctree cache directory, paper size, user options, source dir (.).
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# Every target in this file is a command, not a file. The list now also covers
# buildme, applehelp, latexpdfja, texinfo, info, xml and pseudoxml, which were
# missing and would silently stop working if a file of the same name existed.
.PHONY: help buildme clean html dirhtml singlehtml pickle json htmlhelp qthelp \
        applehelp devhelp epub latex latexpdf latexpdfja text man texinfo info \
        changes linkcheck doctest coverage gettext xml pseudoxml
# Print the list of available targets.
# The usage line names the <target> placeholder again; it had been lost.
help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo " buildme echos what to run to do live builds."
	@echo " html to make standalone HTML files"
	@echo " dirhtml to make HTML files named index.html in directories"
	@echo " singlehtml to make a single large HTML file"
	@echo " pickle to make pickle files"
	@echo " json to make JSON files"
	@echo " htmlhelp to make HTML files and a HTML help project"
	@echo " qthelp to make HTML files and a qthelp project"
	@echo " applehelp to make an Apple Help Book"
	@echo " devhelp to make HTML files and a Devhelp project"
	@echo " epub to make an epub"
	@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo " latexpdf to make LaTeX files and run them through pdflatex"
	@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
	@echo " text to make text files"
	@echo " man to make manual pages"
	@echo " texinfo to make Texinfo files"
	@echo " info to make Texinfo files and run them through makeinfo"
	@echo " gettext to make PO message catalogs"
	@echo " changes to make an overview of all changed/added/deprecated items"
	@echo " xml to make Docutils-native XML files"
	@echo " pseudoxml to make pseudoxml-XML files for display purposes"
	@echo " linkcheck to check all external links for integrity"
	@echo " doctest to run all doctests embedded in the documentation (if enabled)"
	@echo " coverage to run coverage check of the documentation (if enabled)"
# Only prints a reminder of how to run the build script.
buildme:
	@echo 'Please make sure the .env is pointing to the right path for the build. Then run ./buildme.py'
# Remove everything below $(BUILDDIR).
# The guard aborts when BUILDDIR is empty (e.g. `make BUILDDIR= clean`), which
# would otherwise expand the recipe to `rm -rf /*`.
clean:
	$(if $(strip $(BUILDDIR)),,$(error BUILDDIR is empty; refusing to run 'rm -rf /*'))
	rm -rf $(BUILDDIR)/*
# Standalone HTML pages. $@ is the target name, used as builder and output directory.
html:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
# HTML pages named index.html in per-page directories.
dirhtml:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
# One single large HTML page.
singlehtml:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
# Pickle files for further processing.
pickle:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished; now you can process the pickle files."
# JSON files for further processing.
json:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished; now you can process the JSON files."
# HTML files plus an HTML Help project.
htmlhelp:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/htmlhelp."
# HTML files plus a qthelp project.
qthelp:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/DeepDiff.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/DeepDiff.qhc"
# Apple Help Book.
applehelp:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
	@echo "N.B. You won't be able to view it unless you put it in" \
	      "~/Library/Documentation/Help or install it in your application" \
	      "bundle."
# HTML files plus a Devhelp project. $$HOME reaches the shell as $HOME.
devhelp:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished."
	@echo "To view the help file:"
	@echo "# mkdir -p $$HOME/.local/share/devhelp/DeepDiff"
	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/DeepDiff"
	@echo "# devhelp"
# EPUB e-book.
epub:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
# LaTeX sources only; see latexpdf for the PDF.
latex:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
	@echo "Run \`make' in that directory to run these through (pdf)latex" \
	      "(use \`make latexpdf' here to do that automatically)."
# Build the LaTeX sources, then run the generated Makefile's all-pdf target.
latexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through pdflatex..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
# Build the LaTeX sources, then run the generated Makefile's all-pdf-ja target.
latexpdfja:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through platex and dvipdfmx..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
# Plain-text files.
text:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The text files are in $(BUILDDIR)/text."
# Manual pages.
man:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
# Texinfo sources only; see info for the Info files.
texinfo:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
	@echo "Run \`make' in that directory to run these through makeinfo" \
	      "(use \`make info' here to do that automatically)."
# Build the Texinfo sources, then run the generated Makefile's info target.
# The sub-make is invoked through $(MAKE), as latexpdf already does, so that
# flags such as -j and -n are passed on to it.
info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
# PO message catalogs. Uses I18NSPHINXOPTS and writes to the locale directory.
gettext:
	$(SPHINXBUILD) -b $@ $(I18NSPHINXOPTS) $(BUILDDIR)/locale
	@echo
	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
# Overview of changed/added/deprecated items.
changes:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "The overview file is in $(BUILDDIR)/changes."
# Check all external links.
linkcheck:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Link check complete; look for any errors in the above output " \
	      "or in $(BUILDDIR)/linkcheck/output.txt."
# Run the doctests embedded in the documentation.
doctest:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo "Testing of doctests in the sources finished, look at the " \
	      "results in $(BUILDDIR)/doctest/output.txt."
# Run the documentation coverage check.
coverage:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo "Testing of coverage in the sources finished, look at the " \
	      "results in $(BUILDDIR)/coverage/python.txt."
# Docutils-native XML files.
xml:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
# Pseudo-XML files for display purposes.
pseudoxml:
	$(SPHINXBUILD) -b $@ $(ALLSPHINXOPTS) $(BUILDDIR)/$@
	@echo
	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
qlustered-deepdiff-41c7265/docs/_static/ 0000775 0000000 0000000 00000000000 15162412645 0020127 5 ustar 00root root 0000000 0000000 qlustered-deepdiff-41c7265/docs/_static/Qluster_square_grey_optimized.svg 0000664 0000000 0000000 00000003711 15162412645 0027003 0 ustar 00root root 0000000 0000000
1603fa08fed7b5f4cdb19d5fea57a83cf5160f84.paxheader 0000666 0000000 0000000 00000000261 15162412645 0020746 x ustar 00root root 0000000 0000000 177 path=qlustered-deepdiff-41c7265/docs/_static/benchmark_array_no_numpy__3.8__ignore_order=True__cache_size=0__cache_tuning_sample_size=0__cutoff_intersection_for_pairs=1.png
1603fa08fed7b5f4cdb19d5fea57a83cf5160f84.data 0000664 0000000 0000000 00000100613 15162412645 0017606 0 ustar 00root root 0000000 0000000 PNG
IHDR 5 sBIT|d pHYs a a?i 8tEXtSoftware matplotlib version3.2.1, http://matplotlib.org/: IDATxy|LdYDHk
کҪB]Z\[[JUUTinUi-M*"$UuE
A$]213,$1[z1sΜy̙g9Q!n8XDDDDdYDDDDv0 @""""; HDDDdg a $"""3DDDDv0 @""""; HDDDdg a $"""3DDDDv0 @""""; HDDDdg a $"""3DDDDv0 @""""; HDDDdg a $"""3DDDDv0 @""""; HDDDdg a $"""3DDDDv0 @""""; HDDDdg a $"""3DDDDv0 @""""; HDDDdg qv2Zcǎ!((DDDuVEff&:t