pax_global_header00006660000000000000000000000064144324416110014512gustar00rootroot0000000000000052 comment=86435155c4de9857cfae9023a22733de8b5e8c3e xmldiff-2.6.3/000077500000000000000000000000001443244161100131535ustar00rootroot00000000000000xmldiff-2.6.3/.coveragerc000066400000000000000000000001121443244161100152660ustar00rootroot00000000000000[run] source = xmldiff omit = tests* xmldiff/*diff_match_patch*.py xmldiff-2.6.3/.coveralls.yml000066400000000000000000000000561443244161100157470ustar00rootroot00000000000000repo_token: b6n8XfzoDtrPQvXCSA3GkvIKvYEkx2nEb xmldiff-2.6.3/.github/000077500000000000000000000000001443244161100145135ustar00rootroot00000000000000xmldiff-2.6.3/.github/workflows/000077500000000000000000000000001443244161100165505ustar00rootroot00000000000000xmldiff-2.6.3/.github/workflows/lint.yml000066400000000000000000000010431443244161100202370ustar00rootroot00000000000000# Run the linting suite for the xmldiff package # Based on https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions name: Lint package on: [pull_request, push] env: FORCE_COLOR: 1 jobs: build: name: Run pre-commit runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v3 - name: Set up Python uses: actions/setup-python@v4 with: python-version: '3.x' - name: Run pre-commit hooks uses: pre-commit/action@v3.0.0 xmldiff-2.6.3/.github/workflows/test.yml000066400000000000000000000022031443244161100202470ustar00rootroot00000000000000# Runs the unit tests for the xmldiff package # Based on https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions name: Test package on: [pull_request, push] env: FORCE_COLOR: 1 jobs: build: name: Run package tests runs-on: ${{ matrix.os }} strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] python-version: ['3.7', '3.11', 'pypy-3.7'] exclude: - os: windows-latest python-version: '3.7' - os: windows-latest python-version: 'pypy-3.7' include: - os: 
windows-latest python-version: '3.8' steps: - name: Checkout uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Upgrade pip run: python -m pip install --upgrade pip - name: Install package run: pip install -e .[devenv] - name: Run tests run: python -bb -X dev -W ignore::UserWarning:setuptools.dist -m unittest xmldiff-2.6.3/.gitignore000066400000000000000000000001461443244161100151440ustar00rootroot00000000000000*.pyc __pycache__ *.bak *.egg-info *.wp? .eggs .tox .coverage htmlcov build docs/build venv build distxmldiff-2.6.3/.pre-commit-config.yaml000066400000000000000000000006271443244161100174410ustar00rootroot00000000000000repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.4.0 hooks: - id: check-yaml - id: trailing-whitespace - repo: https://github.com/psf/black rev: 22.12.0 hooks: - id: black - repo: https://github.com/pycqa/flake8 rev: 6.0.0 hooks: - id: flake8 - repo: https://github.com/regebro/pyroma rev: '4.1' hooks: - id: pyroma xmldiff-2.6.3/.travis.yml000066400000000000000000000010661443244161100152670ustar00rootroot00000000000000language: python dist: xenial os: linux jobs: fast_finish: true include: - python: 3.6 env: MATRIX=py36 - python: 3.7 dist: bionic env: MATRIX=py37 - python: 3.11 env: MATRIX=py311 - python: pypy3 env: MATRIX=pypy3 before_install: - if [ $MATRIX != pypy3 ]; then pip install black; fi - pip install coverage coveralls flake8 sphinx sphinx-argparse install: - pip install . script: - make flake - coverage run setup.py test - make -C docs doctest - make -C docs html after_success: - coveralls cache: pip: true xmldiff-2.6.3/CHANGES.rst000066400000000000000000000065571443244161100147720ustar00rootroot00000000000000Changes ======= 2.6.3 (2023-05-21) ------------------ - And there was a namespace bug in the patch as well. 
#118 2.6.2 (2023-05-21) ------------------ - Solved an error in the xmlformatter when using default namespaces. #89 2.6.1 (2023-04-05) ------------------ - #108: Fixed an error that happens if using namespaces like ns0 or ns1. 2.6 (2023-04-03) ---------------- - Added `InsertNamespace` and `DeleteNamespace` actions for better handling of changing namespaces. Should improve any "Unknown namespace prefix" errors. Changing the URI of a a namespace prefix is not supported, and will raise an error. 2.6b1 (2023-01-12) ------------------ - Used geometric mean for the node_ratio, for better handling of simple nodes. - Added an experimental --best-match method that is slower, but generate smaller diffs when you have many nodes that are similar. - The -F argument now also affects the --fast-match stage. 2.5 (2023-01-11) ---------------- - Make it possible to adjust the attributes considered when comparing nodes. - Python versions 3.7 to 3.11 are now supported. - Improved node matching method, that puts more emphasis similarities than differences when weighing attributes vs children. - Added a parameter to return error code 1 when there are differences between the files - Added a parameter for ignoring attributes in comparison. - Solved a bug in xmlpatch in certain namespace situations. - Added a --diff-encoding parameter to xmlpatch, to support diff-files that are not in your system default encoding. 2.4 (2019-10-09) ---------------- - Added an option to pass pairs of (element, attr) as unique attributes for tree matching. Exposed this option on the command line, too. 2.3 (2019-02-27) ---------------- - Added a simple ``xmlpatch`` command and API. 
- Multiple updates to documentation and code style 2.2 (2018-10-12) ---------------- - A workaround for dealing with top level comments and the xml formatter 2.1 (2018-10-03) ---------------- - Changed the substitution unicode character area to use the Private Use Area in BMP(0), to support narrow Python builds - Added --unique-attributes argument. 2.1b1 (2018-10-01) ------------------ - Added options for faster node comparisons. The "middle" option is now default, it had very few changes in matches, but is much faster. - Implemented a Fast Match algorithm for even faster diffing. - Speed improvements through caching - Fixed a bug where MoveNode actions sometimes was in the wrong order - Added an InsertComment action, as comments require different handling, so it's easier to deal with them this way. You can still use DeleteNode and UpdateTextIn for them with no special handling. - When renaming tags the XMLFormatter will mark them with "diff:rename" instead of making a new tag and deleting the old. - Tags will now be moved first, and updated and renamed later, as the new tag name or attributes might not be valid in the old location. 2.0 (2018-09-25) ---------------- - A complete, bottom-up, pure-python rewrite - New easy API - 100% test coverage - New output formats: - A new default output format with new actions - A format intended to be parseable by anyone parsing the old format. - XML with changes marked though tags and attributes - xmldiff 2.0 is significantly slower than xmldiff 0.6 or 1.0, the emphasis so far is on correctness, not speed. 
xmldiff-2.6.3/LICENSE.txt000066400000000000000000000020501443244161100147730ustar00rootroot00000000000000Copyright (c) 2018 Xmldiff Contributors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
xmldiff-2.6.3/MANIFEST.in000066400000000000000000000006311443244161100147110ustar00rootroot00000000000000include *.rst include *.txt include *.yml include .coveragerc include Makefile include docs/requirements.txt recursive-include tests *.py recursive-include tests *.xml recursive-include tests *.html recursive-include tests *.diff recursive-include docs *.bat recursive-include docs *.py recursive-include docs *.rst recursive-include docs *.xslt recursive-include docs Makefile recursive-exclude docs/build * xmldiff-2.6.3/Makefile000066400000000000000000000020421443244161100146110ustar00rootroot00000000000000root_dir := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) bin_dir := $(root_dir)/ve/bin dfm_source_2 := "https://raw.githubusercontent.com/google/diff-match-patch/master/python2/diff_match_patch.py" dfm_source_3 := "https://raw.githubusercontent.com/google/diff-match-patch/master/python3/diff_match_patch.py" all: check coverage # The fullrelease script is a part of zest.releaser, which is the last # package installed, so if it exists, the devenv is installed. devenv: ve/bin/fullrelease ve/bin/fullrelease: virtualenv $(root_dir)/ve --python python3 $(bin_dir)/pip install -e .[devenv] check: devenv $(bin_dir)/black xmldiff tests $(bin_dir)/flake8 xmldiff tests $(bin_dir)/pyroma -d . coverage: devenv $(bin_dir)/coverage run -m unittest $(bin_dir)/coverage html $(bin_dir)/coverage report test: devenv $(bin_dir)/python -bb -X dev -W ignore::UserWarning:setuptools.dist -m unittest --verbose release: devenv $(bin_dir)/fullrelease update-diff-match-patch: wget $(dfm_source_3) -O $(root_dir)/xmldiff/diff_match_patch.py xmldiff-2.6.3/README.rst000066400000000000000000000064051443244161100146470ustar00rootroot00000000000000xmldiff ======== .. image:: https://travis-ci.org/Shoobx/xmldiff.svg?branch=master :target: https://travis-ci.org/Shoobx/xmldiff .. 
image:: https://coveralls.io/repos/github/Shoobx/xmldiff/badge.svg :target: https://coveralls.io/github/Shoobx/xmldiff ``xmldiff`` is a library and a command-line utility for making diffs out of XML. This may seem like something that doesn't need a dedicated utility, but change detection in hierarchical data is very different from change detection in flat data. XML type formats are also not only used for computer readable data, it is also often used as a format for hierarchical data that can be rendered into human readable formats. A traditional diff on such a format would tell you line by line the differences, but this would not be be readable by a human. ``xmldiff`` provides tools to make human readable diffs in those situations. Full documentation is on `xmldiff.readthedocs.io `_ ``xmldiff`` is still under rapid development, and no guarantees are done that the output of one version will be the same as the output of any previous version. Quick usage ----------- ``xmldiff`` is both a command-line tool and a Python library. To use it from the command-line, just run ``xmldiff`` with two input files:: $ xmldiff file1.xml file2.xml There is also a command to patch a file with the output from the ``xmldiff`` command:: $ xmlpatch file.diff file1.xml There is a simple API for using ``xmldiff`` as a library:: from lxml import etree from xmldiff import main, formatting diff = main.diff_files('file1.xml', 'file2.xml', formatter=formatting.XMLFormatter()) There is also a method ``diff_trees()`` that take two lxml trees, and a method ``diff_texts()`` that will take strings containing XML. Similarly, there is ``patch_file()`` ``patch_text()`` and ``patch_tree()``:: result = main.patch_file('file.diff', 'file1.xml') Changes from ``xmldiff`` 0.6/1.x -------------------------------- * A complete, ground up, pure-Python rewrite * Easier to maintain, the code is less complex and more Pythonic, and uses more custom classes instead of just nesting lists and dicts. 
* Fixes the problems with certain large files and solves the memory leaks. * A nice, easy to use Python API for using it as a library. * Adds support for showing the diffs in different formats, mainly one where differences are marked up in the XML, useful for making human readable diffs. These formats can show text differences in a semantically meaningful way. * An output format compatible with 0.6/1.x is also available. * 2.0 is currently significantly slower than ``xmldiff`` 0.6/1.x, but this will change in the future. Currently we make no effort to make ``xmldiff`` 2.0 fast, we concentrate on making it correct and usable. Contributors ------------ * Lennart Regebro, regebro@gmail.com (main author) * Stephan Richter, srichter@shoobx.com * Albertas Agejevas, alga@shoobx.com * Greg Kempe, greg@laws.africa * Filip Demski, glamhoth@protonmail.com The diff algorithm is based on "`Change Detection in Hierarchically Structured Information `_", and the text diff is using Google's ``diff_match_patch`` algorithm. xmldiff-2.6.3/README.txt000066400000000000000000000002171443244161100146510ustar00rootroot00000000000000See README.rst for general information See LICENSE.txt for Licensing information. See docs/source/contributing.rst for development information xmldiff-2.6.3/docs/000077500000000000000000000000001443244161100141035ustar00rootroot00000000000000xmldiff-2.6.3/docs/Makefile000066400000000000000000000167651443244161100155620ustar00rootroot00000000000000# Makefile for Sphinx documentation # # You can set these variables from the command-line. SPHINXOPTS = SPHINXBUILD = sphinx-build PAPER = BUILDDIR = build # User-friendly check for sphinx-build ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. 
Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) endif # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source # the i18n builder cannot share the environment and doctrees with the others I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source .PHONY: help help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " dirhtml to make HTML files named index.html in directories" @echo " singlehtml to make a single large HTML file" @echo " pickle to make pickle files" @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" @echo " applehelp to make an Apple Help Book" @echo " devhelp to make HTML files and a Devhelp project" @echo " epub to make an epub" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " latexpdf to make LaTeX files and run them through pdflatex" @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" @echo " text to make text files" @echo " man to make manual pages" @echo " texinfo to make Texinfo files" @echo " info to make Texinfo files and run them through makeinfo" @echo " gettext to make PO message catalogs" @echo " changes to make an overview of all changed/added/deprecated items" @echo " xml to make Docutils-native XML files" @echo " pseudoxml to make pseudoxml-XML files for display purposes" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" @echo " coverage to run coverage check of the documentation (if enabled)" .PHONY: clean clean: rm -rf $(BUILDDIR)/* .PHONY: html html: $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo 
"Build finished. The HTML pages are in $(BUILDDIR)/html." .PHONY: dirhtml dirhtml: $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." .PHONY: singlehtml singlehtml: $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml @echo @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." .PHONY: pickle pickle: $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @echo "Build finished; now you can process the pickle files." .PHONY: json json: $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json @echo @echo "Build finished; now you can process the JSON files." .PHONY: htmlhelp htmlhelp: $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp @echo @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in $(BUILDDIR)/htmlhelp." .PHONY: qthelp qthelp: $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp @echo @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/xmldiff.qhcp" @echo "To view the help file:" @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/xmldiff.qhc" .PHONY: applehelp applehelp: $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp @echo @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." @echo "N.B. You won't be able to view it unless you put it in" \ "~/Library/Documentation/Help or install it in your application" \ "bundle." .PHONY: devhelp devhelp: $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp @echo @echo "Build finished." @echo "To view the help file:" @echo "# mkdir -p $$HOME/.local/share/devhelp/xmldiff" @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/xmldiff" @echo "# devhelp" .PHONY: epub epub: $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub @echo @echo "Build finished. 
The epub file is in $(BUILDDIR)/epub." .PHONY: latex latex: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." @echo "Run \`make' in that directory to run these through (pdf)latex" \ "(use \`make latexpdf' here to do that automatically)." .PHONY: latexpdf latexpdf: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through pdflatex..." $(MAKE) -C $(BUILDDIR)/latex all-pdf @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." .PHONY: latexpdfja latexpdfja: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through platex and dvipdfmx..." $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." .PHONY: text text: $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text @echo @echo "Build finished. The text files are in $(BUILDDIR)/text." .PHONY: man man: $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man @echo @echo "Build finished. The manual pages are in $(BUILDDIR)/man." .PHONY: texinfo texinfo: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." @echo "Run \`make' in that directory to run these through makeinfo" \ "(use \`make info' here to do that automatically)." .PHONY: info info: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo "Running Texinfo files through makeinfo..." make -C $(BUILDDIR)/texinfo info @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." .PHONY: gettext gettext: $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale @echo @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." .PHONY: changes changes: $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." 
.PHONY: linkcheck linkcheck: $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." .PHONY: doctest doctest: $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." .PHONY: coverage coverage: $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage @echo "Testing of coverage in the sources finished, look at the " \ "results in $(BUILDDIR)/coverage/python.txt." .PHONY: xml xml: $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml @echo @echo "Build finished. The XML files are in $(BUILDDIR)/xml." .PHONY: pseudoxml pseudoxml: $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml @echo @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." xmldiff-2.6.3/docs/make.bat000066400000000000000000000161341443244161100155150ustar00rootroot00000000000000@ECHO OFF REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set BUILDDIR=build set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source set I18NSPHINXOPTS=%SPHINXOPTS% source if NOT "%PAPER%" == "" ( set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% ) if "%1" == "" goto help if "%1" == "help" ( :help echo.Please use `make ^` where ^ is one of echo. html to make standalone HTML files echo. dirhtml to make HTML files named index.html in directories echo. singlehtml to make a single large HTML file echo. pickle to make pickle files echo. json to make JSON files echo. htmlhelp to make HTML files and a HTML help project echo. qthelp to make HTML files and a qthelp project echo. devhelp to make HTML files and a Devhelp project echo. epub to make an epub echo. 
latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter echo. text to make text files echo. man to make manual pages echo. texinfo to make Texinfo files echo. gettext to make PO message catalogs echo. changes to make an overview over all changed/added/deprecated items echo. xml to make Docutils-native XML files echo. pseudoxml to make pseudoxml-XML files for display purposes echo. linkcheck to check all external links for integrity echo. doctest to run all doctests embedded in the documentation if enabled echo. coverage to run coverage check of the documentation if enabled goto end ) if "%1" == "clean" ( for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i del /q /s %BUILDDIR%\* goto end ) REM Check if sphinx-build is available and fallback to Python version if any %SPHINXBUILD% 1>NUL 2>NUL if errorlevel 9009 goto sphinx_python goto sphinx_ok :sphinx_python set SPHINXBUILD=python -m sphinx.__init__ %SPHINXBUILD% 2> nul if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo. echo.If you don't have Sphinx installed, grab it from echo.http://sphinx-doc.org/ exit /b 1 ) :sphinx_ok if "%1" == "html" ( %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/html. goto end ) if "%1" == "dirhtml" ( %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. goto end ) if "%1" == "singlehtml" ( %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 
goto end ) if "%1" == "pickle" ( %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can process the pickle files. goto end ) if "%1" == "json" ( %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can process the JSON files. goto end ) if "%1" == "htmlhelp" ( %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run HTML Help Workshop with the ^ .hhp project file in %BUILDDIR%/htmlhelp. goto end ) if "%1" == "qthelp" ( %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run "qcollectiongenerator" with the ^ .qhcp project file in %BUILDDIR%/qthelp, like this: echo.^> qcollectiongenerator %BUILDDIR%\qthelp\xmldiff.qhcp echo.To view the help file: echo.^> assistant -collectionFile %BUILDDIR%\qthelp\xmldiff.ghc goto end ) if "%1" == "devhelp" ( %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp if errorlevel 1 exit /b 1 echo. echo.Build finished. goto end ) if "%1" == "epub" ( %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub if errorlevel 1 exit /b 1 echo. echo.Build finished. The epub file is in %BUILDDIR%/epub. goto end ) if "%1" == "latex" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex if errorlevel 1 exit /b 1 echo. echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. goto end ) if "%1" == "latexpdf" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex cd %BUILDDIR%/latex make all-pdf cd %~dp0 echo. echo.Build finished; the PDF files are in %BUILDDIR%/latex. goto end ) if "%1" == "latexpdfja" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex cd %BUILDDIR%/latex make all-pdf-ja cd %~dp0 echo. echo.Build finished; the PDF files are in %BUILDDIR%/latex. 
goto end ) if "%1" == "text" ( %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text if errorlevel 1 exit /b 1 echo. echo.Build finished. The text files are in %BUILDDIR%/text. goto end ) if "%1" == "man" ( %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man if errorlevel 1 exit /b 1 echo. echo.Build finished. The manual pages are in %BUILDDIR%/man. goto end ) if "%1" == "texinfo" ( %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo if errorlevel 1 exit /b 1 echo. echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. goto end ) if "%1" == "gettext" ( %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale if errorlevel 1 exit /b 1 echo. echo.Build finished. The message catalogs are in %BUILDDIR%/locale. goto end ) if "%1" == "changes" ( %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes if errorlevel 1 exit /b 1 echo. echo.The overview file is in %BUILDDIR%/changes. goto end ) if "%1" == "linkcheck" ( %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck if errorlevel 1 exit /b 1 echo. echo.Link check complete; look for any errors in the above output ^ or in %BUILDDIR%/linkcheck/output.txt. goto end ) if "%1" == "doctest" ( %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest if errorlevel 1 exit /b 1 echo. echo.Testing of doctests in the sources finished, look at the ^ results in %BUILDDIR%/doctest/output.txt. goto end ) if "%1" == "coverage" ( %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage if errorlevel 1 exit /b 1 echo. echo.Testing of coverage in the sources finished, look at the ^ results in %BUILDDIR%/coverage/python.txt. goto end ) if "%1" == "xml" ( %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml if errorlevel 1 exit /b 1 echo. echo.Build finished. The XML files are in %BUILDDIR%/xml. goto end ) if "%1" == "pseudoxml" ( %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml if errorlevel 1 exit /b 1 echo. echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 
goto end ) :end xmldiff-2.6.3/docs/requirements.txt000066400000000000000000000000271443244161100173660ustar00rootroot00000000000000sphinx-argparse==0.2.2 xmldiff-2.6.3/docs/source/000077500000000000000000000000001443244161100154035ustar00rootroot00000000000000xmldiff-2.6.3/docs/source/advanced.rst000066400000000000000000000211171443244161100177040ustar00rootroot00000000000000Advanced Usage ============== Diffing Formatted Text ---------------------- You can write your own formatter that understands your XML format, and therefore can apply some intelligence to the format. One common use case for this is to have more intelligent text handling. The standard formatters will treat any text as just a value, and the resulting diff will simply replace one value with another: .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> from xmldiff import main, formatting >>> left = '

Old Content

' >>> right = '

New Content

' >>> main.diff_texts(left, right) [UpdateTextIn(node='/body/p[1]', text='New Content')] The ``xml`` formatter will set tags around the text marking it as inserted or deleted: .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> formatter=formatting.XMLFormatter() >>> >>> left = '

Old Content

' >>> right = '

New Content

' >>> result = main.diff_texts(left, right, formatter=formatter) >>> print(result)

OldNew Content

But if your XML format contains text with formats, the output can in some cases be less than useful, especially in the case where formatting is added: .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> left = '

My Fine Content

' >>> right = '

My Fine Content

' >>> result = main.diff_texts(left, right, formatter=formatter) >>> print(result)

My Fine Content

My Fine Content

Notice how the the whole text was inserted with formatting, and the whole unformatted text was deleted. The XMLFormatter supports a better handling of text with the ``text_tags`` and ``formatting_tags`` parameters. Here is a simple and incomplete example with some common HTML tags: .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> formatter=formatting.XMLFormatter( ... text_tags=('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'), ... formatting_tags=('b', 'u', 'i', 'strike', 'em', 'super', ... 'sup', 'sub', 'link', 'a', 'span')) >>> result = main.diff_texts(left, right, formatter=formatter) >>> print(result)

My Fine Content

This gives a result that flags the ```` tag as new formatting. This more compact output is much more useful and easier to transform into a visual output. Making a Visual Diff -------------------- XML and HTML views will of course ignore all these ``diff:`` tags and attributes. What we want with the HTML output above is to transform the ``diff:insert-formatting`` attribute into something that will make the change visible. We can achieve that by applying XSLT before the ``render()`` method in the formatter. This requires subclassing the formatter: .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> import lxml.etree >>> XSLT = u''' ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ''' >>> XSLT_TEMPLATE = lxml.etree.fromstring(XSLT) >>> class HTMLFormatter(formatting.XMLFormatter): ... def render(self, result): ... transform = lxml.etree.XSLT(XSLT_TEMPLATE) ... result = transform(result) ... return super(HTMLFormatter, self).render(result) The XSLT template above of course only handles a few cases, like inserted formatting and insert and delete tags (used below). A more complete XSLT file is included `here `_. Now use that formatter in the diffing: .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> formatter = HTMLFormatter( ... text_tags=('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'), ... formatting_tags=('b', 'u', 'i', 'strike', 'em', 'super', ... 'sup', 'sub', 'link', 'a', 'span')) >>> result = main.diff_texts(left, right, formatter=formatter) >>> print(result)

My Fine Content

You can then add into your CSS files classes that make inserted text green, deleted text red with an overstrike, and formatting changes could for example be blue. This makes it easy to see what has been changed in a HTML document. Performance Options ------------------- The performance options available will not just change the performance, but can also change the result. The result will not necessarily be worse, it will just be less accurate. In some cases the less accurate result might actually be preferrable. As an example we take the following HTML codes: .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> left = u""" ...

The First paragraph

...

A Second paragraph

...

Last paragraph

... """ >>> right = u""" ...

Last paragraph

...

A Second paragraph

...

The First paragraph

... """ >>> result = main.diff_texts(left, right) >>> result [MoveNode(node='/html/body/p[1]', target='/html/body[1]', position=2), MoveNode(node='/html/body/p[1]', target='/html/body[1]', position=1)] We here see that the differ finds that two paragraphs needs to be moved. Don't be confused that it says ``p[1]`` in both cases. That just means to move the first paragraph, and in the second case that first paragraph has already been moved and is now last. If we format that diff to XML with the XMLFormatter, we get output that marks these paragraphs as deleted and then inserted later. .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> formatter = HTMLFormatter( ... normalize=formatting.WS_BOTH) >>> result = main.diff_texts(left, right, formatter=formatter) >>> print(result)

The First paragraph

A Second paragraph

Last paragraph

A Second paragraph

The First paragraph

Let's try diffing the same HTML with the fast match algorithm: .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> result = main.diff_texts(left, right, ... diff_options={'fast_match': True}) >>> result [UpdateTextIn(node='/html/body/p[1]', text='Last paragraph'), UpdateTextIn(node='/html/body/p[3]', text='The First paragraph')] Now we instead got two update actions. This means the resulting HTML is quite different: .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> result = main.diff_texts(left, right, ... diff_options={'fast_match': True}, ... formatter=formatter) >>> print(result)

The FirLast paragraph

A Second paragraph

LaThe First paragraph

The texts are updated instead of deleting and then reinserting the whole paragraphs. This makes the visual output more readable. Also note that the XSLT in this case replaced the ```` and ```` tags with ```` and ```` tags. This is a contrived example, though. If you are using ``xmldiff`` to generate a visual diff, you have to experiment with performance flags to find the best combination of speed and output for your case. xmldiff-2.6.3/docs/source/api.rst000066400000000000000000000433041443244161100167120ustar00rootroot00000000000000Python API ========== Main diffing API ---------------- Using ``xmldiff`` from Python is very easy, you just import and call one of the three main API methods. .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> from xmldiff import main >>> main.diff_files("../tests/test_data/insert-node.left.html", ... "../tests/test_data/insert-node.right.html", ... diff_options={'F': 0.5, 'ratio_mode': 'fast'}) [UpdateTextIn(node='/body/div[1]', text=None), InsertNode(target='/body/div[1]', tag='p', position=0), UpdateTextIn(node='/body/div/p[1]', text='Simple text')] Which one you choose depends on if the XML is contained in files, text strings or ``lxml`` trees. * ``xmldiff.main.diff_files()`` takes as input paths to files, or file streams. * ``xmldiff.main.diff_texts()`` takes as input Unicode strings. * ``xmldiff.main.diff_trees()`` takes as input lxml trees. The arguments to these functions are the same: Parameters .......... ``left``: The "left", "old" or "from" XML. The diff will show the changes to transform this XML to the "right" XML. ``right``: The "right", "new" or "target" XML. ``check``: Return error code 1 if there are any differences between the files. ``diff_options``: A dictionary containing options that will be passed into the ``Differ()``: ``F``: A value between 0 and 1 that determines how similar two XML nodes must be to match as the same in both trees. Defaults to ``0.5``. 
A higher value requires a smaller difference between two nodes for them to match. Set the value high, and you will see more nodes inserted and deleted instead of being updated. Set the value low, and you will get more updates instead of inserts and deletes. ``uniqueattrs``: A list of XML node attributes that will uniquely identify a node. See `Unique Attributes`_ for more info. Defaults to ``['{http://www.w3.org/XML/1998/namespace}id']``. ``ratio_mode``: The ``ratio_mode`` determines how accurately the similarity between two nodes is calculated. The choices are ``'accurate'``, ``'fast'`` and ``'faster'``. Defaults to ``'fast'``. Using ``'faster'`` often results in less optimal edit scripts, in other words, you will have more actions to achieve the same result. Using ``'accurate'`` will be significantly slower, especially if your nodes have long texts or many attributes. ``ignored_attrs``: A list of XML node attributes that will be ignored in comparison. ``fast_match``: By default ``xmldiff`` will compare each node from one tree with all nodes from the other tree. It will then pick the one node that matches best as the match, if that match passes the match threshold ``F`` (see above). If fast_match is true ``xmldiff`` will first make a faster run, trying to find chains of matching nodes, during which any match better than ``F`` will count. This significantly cuts down on the time to match nodes, but means that the matches are no longer the best match, only "good enough" matches. ``formatter``: The formatter to use, see `Using Formatters`_. If no formatter is specified the function will return a list of edit actions, see `The Edit Script`_. Result ...... If no formatter is specified the diff functions will return a list of actions. Such a list is called an Edit Script and contains all changes needed to transform the "left" XML into the "right" XML. If a formatter is specified that formatter determines the result. 
The included formatters, ``diff``, ``xml``, and ``old`` all return a Unicode string. ``xmldiff`` is still under rapid development, and no guarantees are made that the output of one version will be the same as the output of any previous version. The actions of the edit script can be in a different order or replaced by equivalent actions depending on the version of ``xmldiff``, but if the Edit Script does not correctly transform one XML tree into another, that is regarded as a bug. This means that the output of the ``xml`` format also may change from version to version. There is no "correct" solution to how that output should look, as the same change can be represented in several different ways. Unique Attributes ----------------- The ``uniqueattrs`` argument is a list of strings or ``(tag, attribute)`` tuples specifying attributes that uniquely identify a node in the document. This is used by the differ when trying to match nodes. If one node in the left tree has this attribute, the node in the right tree with the same value for that attribute will match, regardless of other attributes, child nodes or text content. Respectively, if the values of the attribute on the nodes in question are different, or if only one of the nodes has this attribute, the nodes will not match regardless of their structural similarity. In case the attribute is a tuple, the attribute match applies only if both nodes have the given tag. The default is ``['{http://www.w3.org/XML/1998/namespace}id']``, which is the ``xml:id`` attribute. But if your document has other unique identifiers, you can pass them in instead. If you for some reason do not want the differ to look at the ``xml:id`` attribute, pass in an empty list. Using Formatters ---------------- By default the diff functions will return an edit script, but if you pass in a formatter the result will be whatever that formatter returns. The three included formatters all return Unicode strings. 
All formatters take two arguments: :``normalize``: This argument determines whitespace normalizing. It can be one of the following values, all defined in ``xmldiff.formatting``: :``WS_NONE``: No normalizing :``WS_TAGS``: Normalize whitespace between tags :``WS_TEXT``: Normalize whitespace in text tags (only used by the ``XMLFormatter``). :``WS_BOTH``: Both ``WS_TAGS`` and ``WS_TEXT``. :``pretty_print``: This argument determines if the output should be compact (``False``) or readable (``True``). Only the ``XMLFormatter`` currently uses this parameter, but it's useful enough that it was included in the ``BaseFormatter`` class, so that all subsequent formatters may use it. DiffFormatter ............. .. py:class:: xmldiff.formatting.DiffFormatter(normalize=WS_TAGS, pretty_print=False) This formatter is the one used when you specify ``-f diff`` on the command line. It will return a string with the edit script printed out, one action per line. Each line is enclosed in brackets and consists of a string describing the action, and the actions arguments. This is the output format of xmldiff 0.6/1.x, however, the actions and arguments are not the same, so the output is not compatible. .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> from xmldiff import formatting >>> formatter = formatting.DiffFormatter() >>> print(main.diff_files("../tests/test_data/insert-node.left.html", ... "../tests/test_data/insert-node.right.html", ... formatter=formatter)) [update-text, /body/div[1], null] [insert, /body/div[1], p, 0] [update-text, /body/div/p[1], "Simple text"] XmlDiffFormatter ................ .. py:class:: xmldiff.formatting.XmlDiffFormatter(normalize=WS_TAGS, pretty_print=False) This formatter works like the DiffFormatter, but the output format is different and more similar to the ``xmldiff`` output in versions 0.x and 1.x. .. 
doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> from xmldiff import formatting >>> formatter = formatting.XmlDiffFormatter(normalize=formatting.WS_NONE) >>> print(main.diff_files("../tests/test_data/insert-node.left.html", ... "../tests/test_data/insert-node.right.html", ... formatter=formatter)) [update, /body/div[1]/text()[1], "\n "] [insert-first, /body/div[1],

] [update, /body/div/p[1]/text()[1], "Simple text"] [update, /body/div/p[1]/text()[2], "\n "] XMLFormatter ............ .. py:class:: xmldiff.formatting.XMLFormatter(normalize=WS_NONE, pretty_print=True, text_tags=(), formatting_tags=()) :param text_tags: A list of XML tags that contain human readable text, ex ``('para', 'li')`` :param formatting_tags: A list of XML tags that are tags that change text formatting, ex ``('strong', 'i', 'u' )`` This formatter returns XML with tags describing the changes. These tags are designed so they easily can be changed into something that will render nicely, for example with XSLT replacing the tags with the format you need. .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> from xmldiff import formatting >>> formatter = formatting.XMLFormatter(normalize=formatting.WS_BOTH) >>> print(main.diff_files("../tests/test_data/insert-node.left.html", ... "../tests/test_data/insert-node.right.html", ... formatter=formatter))

Simple text

The Edit Script --------------- The default result of the diffing methods is to return an edit script, which is a list of Python objects called edit actions. Those actions tell you how to turn the "left" tree into the "right" tree. ``xmldiff`` has twelve different actions. These specify one or two nodes in the XML tree, called ``node`` or ``target``. They are specified with an XPATH expression that will uniquely identify the node. The other arguments vary depending on the action. ``InsertNode(target, tag, position)`` ...................................... The ``InsertNode`` action means that the node specified in ``target`` needs a new subnode. ``tag`` specifies which tag that node should have. The ``position`` argument specifies which position the new node should have, ``0`` means that the new node will be inserted as the first child of the target. Note that this is different from XPATH, where the first node is ``1``. This is for ease of use, since Python is zero-indexed. Example: .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> left = 'Content' >>> right = 'Content' >>> main.diff_texts(left, right) [InsertNode(target='/document[1]', tag='newnode', position=1)] ``DeleteNode(node)`` .................... The ``DeleteNode`` action means that the node specified in ``node`` should be deleted. Example: .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> left = 'Content' >>> right = '' >>> main.diff_texts(left, right) [DeleteNode(node='/document/node[1]')] ``MoveNode(node, target, position)`` .................................... The ``MoveNode`` action means that the node specified in ``node`` should be moved to be a child under the target node. The ``position`` argument specifies which position it should have, ``0`` means that the new node will be inserted as the first child of the target. Note that this is different from XPATH, where the first node is ``1``. This is for ease of use, since Python is zero-indexed. 
If the move is within the same parent, the position can be ambiguous. If you have a child that is in position 1, but should be moved to position 3, that position does not include the node being moved, but signifies the position the node should end up at after the move. When implementing a ``MoveNode()`` it is therefore easiest to remove the node from the parent first, and then insert it at the given position. Example: .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> left = 'Content' >>> right = 'Content' >>> main.diff_texts(left, right) [MoveNode(node='/document/node[1]', target='/document[1]', position=1)] ``InsertAttrib(node, name, value)`` ..................................... The ``InsertAttrib`` action means that the node specified in ``node`` should get a new attribute. The ``name`` and ``value`` arguments specify the name and value of that attribute. Example: .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> left = '' >>> right = '' >>> main.diff_texts(left, right) [InsertAttrib(node='/document[1]', name='newattr', value='newvalue')] ``DeleteAttrib(node, name)`` ............................ The ``DeleteAttrib`` action means that an attribute of the node specified in ``node`` should be deleted. The ``name`` argument specifies which attribute. Example: .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> left = '' >>> right = '' >>> main.diff_texts(left, right) [DeleteAttrib(node='/document[1]', name='newattr')] ``RenameAttrib(node, oldname, newname)`` ........................................ The ``RenameAttrib`` action means that an attribute of the node specified in ``node`` should be renamed. The ``oldname`` and ``newname`` arguments specify which attribute and its new name. Example: .. 
doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> left = '' >>> right = '' >>> main.diff_texts(left, right) [RenameAttrib(node='/document[1]', oldname='attrib', newname='newattrib')] ``UpdateAttrib(node, name, value)`` ................................... The ``UpdateAttrib`` action means that an attribute of the node specified in ``node`` should get a new value. The ``name`` and ``value`` arguments specify which attribute and its new value. Example: .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> left = '' >>> right = '' >>> main.diff_texts(left, right) [UpdateAttrib(node='/document[1]', name='attrib', value='newvalue')] ``UpdateTextIn(node, text)`` ............................ The ``UpdateTextIn`` action means that the text content of the node specified in ``node`` should get a new value. The ``text`` argument specifies the new value of that text. Example: .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> left = 'Content' >>> right = 'New Content' >>> main.diff_texts(left, right) [UpdateTextIn(node='/document/node[1]', text='New Content')] ``UpdateTextAfter(node, text)`` ............................... The ``UpdateTextAfter`` action means that the text that trails the node specified in ``node`` should get a new value. The ``text`` argument specifies the new value of that text. Example: .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> left = 'Content' >>> right = 'ContentTrailing text' >>> main.diff_texts(left, right) [UpdateTextAfter(node='/document/node[1]', text='Trailing text')] ``InsertComment(target, position, text)`` ......................................... Since comments don't have a tag, the normal ``InsertNode()`` action doesn't work nicely with a comment. Therefore comments get their own insert action. Just like ``InsertNode()`` it takes a target node and a position. It naturally has no tag but instead has a text argument, as all comments have text and nothing else. 
``UpdateTextIn()`` and ``DeleteNode()`` work as normal for comments. Example: .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> left = 'Content' >>> right = 'Content' >>> main.diff_texts(left, right) [InsertComment(target='/document[1]', position=0, text=' A comment ')] ``InsertNamespace(prefix, uri)`` ................................ Adds a new namespace to the XML document. You need to have this before adding a node that uses a namespace that is not in the original XML tree. Example: .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> left = '' >>> right = '' >>> main.diff_texts(left, right) [InsertNamespace(prefix='new', uri='http://theuri')] ``DeleteNamespace(prefix)`` ................................ Removes a namespace from the XML document. You don't need to handle this, strictly speaking, nothing will break if there is an unused namespace, but ``xmldiff`` will return this action. Example: .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> left = '' >>> right = '' >>> main.diff_texts(left, right) [DeleteNamespace(prefix='new')] The patching API ---------------- There is also an API to patch files using the diff output: .. doctest:: :options: -ELLIPSIS, +NORMALIZE_WHITESPACE >>> from xmldiff import main >>> print(main.patch_file("../tests/test_data/insert-node.diff", ... "../tests/test_data/insert-node.left.html"))

Simple text

Along the same lines as the diffing API, there are three methods: * ``xmldiff.main.patch_file()`` takes as input paths to files, or file streams, and returns a string with the resulting XML. * ``xmldiff.main.patch_text()`` takes as input Unicode strings, and returns a string with the resulting XML. * ``xmldiff.main.patch_tree()`` takes as input one edit script, (i.e. a list of actions, see above) and one ``lxml`` tree, and returns a patched ``lxml`` tree. They all return a string with the patched XML tree. There are currently no configuration parameters for these commands. xmldiff-2.6.3/docs/source/commandline.rst000066400000000000000000000062471443244161100204320ustar00rootroot00000000000000Command-line Usage ================== ``xmldiff`` is both a command-line tool and a Python library. To use it from the command-line, just run ``xmldiff`` with two input files: .. code-block:: bash $ xmldiff file1.xml file2.xml There are a few extra options to modify the output, but be aware that not all of the combinations are meaningful, so don't be surprised if you add one and nothing happens. Options ------- .. argparse:: :module: xmldiff.main :func: make_parser :prog: xmldiff :nodescription: Formatters ---------- You can select different output formats with ``xmldiff``, but beware that some formatters may assume certain things about the type of XML. The included formatters are generic and will work for any type of XML, but may not give you a useful output. If you are using ``xmldiff`` as a library, you can create your own formatters that are suited for your particular usage of XML. The ``diff`` formatter is default and will output a list of edit actions. The ``xml`` formatter will output XML with differences marked up by tags using the ``diff`` namespace. The ``old`` formatter is a formatter that gives a list of edit actions in a format similar to ``xmldiff`` 0.6 or 1.0. 
Whitespace Handling ------------------- Formatters are also responsible for whitespace handling, both in parsing and in output. By default ``xmldiff`` will strip all whitespace that is between tags, as opposed to inside tags. That whitespace isn't a part of any data and can be ignored. So this XML structure: .. code-block:: xml Will be seen as the same document as this: .. code-block:: xml Because the whitespace is between the tags. However, this structure is different, since the whitespace there occurs inside a tag: .. code-block:: xml By default the ``xml`` formatter will normalize this whitespace. You can turn that off with the ``--keep-whitespace`` argument. Pretty Printing --------------- The term "pretty printing" refers to making an output a bit more human readable by structuring it with whitespace. In the case of XML this means inserting ignorable whitespace into the XML, yes, the same in-between whitespace that is ignored by ``xmldiff`` when detecting changes between two files. ``xmldiff``'s ``xml`` formatter understands the ``--pretty-print`` argument and will insert whitespace to make the output more readable. For example, an XML output that would normally look like this: Some contentThis is some simple text with formatting. Will with the ``--pretty-print`` argument look like this: .. code-block:: xml Some content This is some simple text with formatting. This means you can actually use ``xmldiff`` to reformat XML, by using the ``xml`` formatter and passing in the same XML file twice:: $ xmldiff -f xml -p uglyfile.xml uglyfile.xml However, if you keep whitespace with ``--keep-whitespace`` or ``-w``, no reformatting will be done. xmldiff-2.6.3/docs/source/conf.py000066400000000000000000000221711443244161100167050ustar00rootroot00000000000000# # xmldiff documentation build configuration file, created by # sphinx-quickstart on Tue Sep 4 12:07:12 2018. # # This file is execfile()d with the current directory set to its # containing dir. 
# # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. # import sys # import os # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # sys.path.insert(0, os.path.abspath('.')) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. # needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ "sphinx.ext.doctest", "sphinx.ext.coverage", # "sphinxarg.ext", ] # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # source_suffix = ['.rst', '.md'] source_suffix = ".rst" # The encoding of source files. # source_encoding = 'utf-8-sig' # The master toctree document. master_doc = "index" # General information about the project. project = "xmldiff" copyright = "2018, Lennart Regebro" author = "Lennart Regebro" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. # version = u'2.0' # The full version, including alpha/beta/rc tags. # release = u'2.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command-line for these cases. 
language = "en" # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: # today = '' # Else, today_fmt is used as the format for a strftime call. # today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. exclude_patterns = [] # The reST default role (used for this markup: `text`) to use for all # documents. # default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. # add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). # add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. # show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. # modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. # keep_warnings = False # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. html_theme = "alabaster" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. # html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". # html_title = None # A shorter title for the navigation bar. Default is the same as html_title. 
# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. # html_logo = None # The name of an image file (relative to this directory) to use as a favicon of # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. # html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ["static"] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. # html_extra_path = [] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. # html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. # html_use_smartypants = True # Custom sidebar templates, maps document names to template names. # html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. # html_additional_pages = {} # If false, no module index is generated. # html_domain_indices = True # If false, no index is generated. # html_use_index = True # If true, the index is split into individual pages for each letter. # html_split_index = False # If true, links to the reST sources are added to the pages. # html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. # html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. # html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. 
The value of this option must be the # base URL from which the finished HTML is served. # html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). # html_file_suffix = None # Language to be used for generating the HTML full-text search index. # Sphinx supports the following languages: # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' # html_search_language = 'en' # A dictionary with options for the search language support, empty by default. # Now only 'ja' uses this config value # html_search_options = {'type': 'default'} # The name of a javascript file (relative to the configuration directory) that # implements a search results scorer. If empty, the default will be used. # html_search_scorer = 'scorer.js' # Output file base name for HTML help builder. htmlhelp_basename = "xmldiffdoc" # -- Options for LaTeX output --------------------------------------------- latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # 'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). # 'pointsize': '10pt', # Additional stuff for the LaTeX preamble. # 'preamble': '', # Latex figure (float) alignment # 'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ (master_doc, "xmldiff.tex", "xmldiff Documentation", "Lennart Regebro", "manual"), ] # The name of an image file (relative to this directory) to place at the top of # the title page. # latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. # latex_use_parts = False # If true, show page references after internal links. # latex_show_pagerefs = False # If true, show URL addresses after external links. # latex_show_urls = False # Documents to append as an appendix to all manuals. 
# latex_appendices = [] # If false, no module index is generated. # latex_domain_indices = True # -- Options for manual page output --------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [(master_doc, "xmldiff", "xmldiff Documentation", [author], 1)] # If true, show URL addresses after external links. # man_show_urls = False # -- Options for Texinfo output ------------------------------------------- # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ ( master_doc, "xmldiff", "xmldiff Documentation", author, "xmldiff", "One line description of project.", "Miscellaneous", ), ] # Documents to append as an appendix to all manuals. # texinfo_appendices = [] # If false, no module index is generated. # texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. # texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. # texinfo_no_detailmenu = False xmldiff-2.6.3/docs/source/contributing.rst000066400000000000000000000101571443244161100206500ustar00rootroot00000000000000Contributing to ``xmldiff`` =========================== ``xmldiff`` welcomes your help. Replies and responses may be slow, but don't despair, we will get to you, we will answer your questions and we will review your pull requests, but nobody has "Maintain ``xmldiff``" as their job description, so it may take a long time. That's open source. There are some extremely complex issues deep down in ``xmldiff``, but don't let that scare you away, there's easy things to do as well. Setting Up a Development Environment ------------------------------------ To set up a development environment you need a github account, git, and of course Python with pip installed. 
You need to clone the repository, and install the development dependencies:: git clone git@github.com:Shoobx/xmldiff.git cd xmldiff make devenv You should now be able to test your setup by running a few ``make`` commands:: make test make check or just:: make Which will run the syntax and style checkers and the test suite with coverage. These should both pass with no errors, and then you are set! Testing ------- ``xmldiff``'s tests are written using ``unittest`` and are discoverable by most test runners. There is also a ``test`` target in the make file. The following test runners/commands are known to work: * ``make test`` * ``python setup.py test`` * ``pytest`` (if you have pytest installed) * ``python -m unittest`` There is no support for ``tox`` to run tests under different Python versions. This is because Travis will run all supported versions on pull requests in any case, and having yet another list of supported Python versions to maintain seems unnecessary. You can either create your own tox.ini file, or you can install `Spiny `_, which doesn't require any extra configuration in the normal case, and will run the tests on all versions that are defined as supported in ``setup.py``. Pull Requests ------------- Even if you have write permissions to the repository we discourage pushing changes to master. Make a branch and a pull request, and we'll merge that. Your pull requests should: * Add a test that fails before the change is made * Keep test coverage at 100% * Include a description of the change in ``CHANGES.txt`` * Add yourself to the contributors list in ``README.txt`` if you aren't already there. Code Quality and Conventions ---------------------------- ``xmldiff`` aims to have 100% test coverage. You run a coverage report with ``make coverage``. 
This will generate a HTML coverage report in ``htmlcov/index.html`` We run flake8 as a part of all Travis test runs, the correct way to run it is ``make flake``, as this includes only the files that should be covered. Documentation ------------- The documentation is written with ``sphinx``. It and any other files using the ReStructuredText format, such as README's etc, are using a one line per sub-sentence structure. This is so that adding one word to a paragraph will not cause several lines of changes, as that will make any pull request harder to read. That means that every sentence and most commas should be followed by a new line, except in cases where this obviously do not make sense, for example when using commas to separate things you list. As a result of this there is no limits on line length, but if a line becomes very long you might consider rewriting it to make it more understandable. You generate the documentation with a make command:: cd docs make html The documentation is hosted on `Read the Docs `_, the official URL is https://readthedocs.org/projects/xmldiff/. Implementation Details ---------------------- ``xmldiff`` is based on `"Change Detection in Hierarchically StructuredS Information" `_ by Sudarshan S. Chawathe, Anand Rajaraman, Hector Garcia-Molina, and Jennifer Widom, 1995. It's not necessary to read and understand that paper in all it's details to help with ``xmldiff``, but if you want to improve the actual diffing algorithm it is certainly helpful. I hope to extend this section with an overview of how this library does it's thing. xmldiff-2.6.3/docs/source/index.rst000066400000000000000000000015121443244161100172430ustar00rootroot00000000000000xmldiff ======= ``xmldiff`` is a library and a command-line utility for making diffs out of XML. This may seem like something that doesn't need a dedicated utility, but change detection in hierarchical data is very different from change detection in flat data. 
XML type formats are also not only used for computer readable data, it is also often used as a format for hierarchical data that can be rendered into human readable formats. A traditional diff on such a format would tell you line by line the differences, but this would not be be readable by a human. This library provides tools to make human readable diffs in those situations. Contents: .. toctree:: :maxdepth: 2 installation commandline api advanced contributing Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` xmldiff-2.6.3/docs/source/installation.rst000066400000000000000000000014341443244161100206400ustar00rootroot00000000000000Installation ============ ``xmldiff`` is a standard Python package and can be installed in all the ways Python packages normally can be installed. The most common way is to use ``pip``:: pip install xmldiff You can also download the latest version from `The Cheeseshop a.k.a PyPI `_, unpack it with you favourite unpacking tool and then run:: python setup.py install That's it, ``xmldiff`` should now be available for you to use. Several Unix distributions also include ``xmldiff`` so you can install it with your distributions package manager. Be aware that currently most distribute an earlier version, typically 0.6.10, which is very different from 2.x, which this documentation is written for. You can check this by running ``xmldiff --version``. 
xmldiff-2.6.3/docs/source/static/000077500000000000000000000000001443244161100166725ustar00rootroot00000000000000xmldiff-2.6.3/docs/source/static/htmlformatter.xslt000066400000000000000000000110471443244161100225010ustar00rootroot00000000000000 xmldiff-2.6.3/setup.cfg000066400000000000000000000031551443244161100150000ustar00rootroot00000000000000[metadata] name = xmldiff version = 2.6.3 description = Creates diffs of XML files long_description = file: README.rst, CHANGES.rst classifiers = Development Status :: 4 - Beta Intended Audience :: Developers Intended Audience :: End Users/Desktop Topic :: Text Processing :: Markup :: XML License :: OSI Approved :: MIT License Operating System :: OS Independent Programming Language :: Python Programming Language :: Python :: 3 Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.11 Programming Language :: Python :: 3 :: Only Programming Language :: Python :: Implementation :: PyPy keywords = xml, html, diff author = Lennart Regebro author_email = lregebro@shoobx.com url = https://github.com/Shoobx/xmldiff license = MIT project_urls = Source Code = https://github.com/Shoobx/xmldiff [options] python_requires = >=3.7 zip_safe = True include_package_data = True packages = find: package_dir = = . install_requires = setuptools lxml>=3.1.0 [options.packages.find] where = . 
exclude = doc tests [options.extras_require] devenv = black coverage flake8 zest.releaser[recommended] [options.entry_points] console_scripts = xmldiff = xmldiff.main:diff_command xmlpatch = xmldiff.main:patch_command [flake8] max-line-length=120 exclude = xmldiff/*diff_match_patch*.py [tool:pytest] testpaths = tests [check-manifest] ignore = .pre-commit-config.yaml [zest.releaser] create-wheel = yes xmldiff-2.6.3/setup.py000066400000000000000000000000461443244161100146650ustar00rootroot00000000000000from setuptools import setup setup() xmldiff-2.6.3/tests/000077500000000000000000000000001443244161100143155ustar00rootroot00000000000000xmldiff-2.6.3/tests/__init__.py000066400000000000000000000000671443244161100164310ustar00rootroot00000000000000# Make the tests a module, so we they are discoverable xmldiff-2.6.3/tests/test_data/000077500000000000000000000000001443244161100162655ustar00rootroot00000000000000xmldiff-2.6.3/tests/test_data/all_actions.expected.xml000066400000000000000000000015611443244161100231020ustar00rootroot00000000000000 A bit of contained textModifiedThis is outside a tagNew tail content Here we have some text. And something else Here we have some text. My last tag xmldiff-2.6.3/tests/test_data/all_actions.left.xml000066400000000000000000000006561443244161100222370ustar00rootroot00000000000000 A bit of contained text This is outside a tag Here we have some text. And something else My last tag xmldiff-2.6.3/tests/test_data/all_actions.right.xml000066400000000000000000000006351443244161100224170ustar00rootroot00000000000000 Modified New tail content And something else Here we have some text. 
xmldiff-2.6.3/tests/test_data/bom_1.xml000066400000000000000000000002061443244161100200020ustar00rootroot00000000000000ο»Ώ xmldiff-2.6.3/tests/test_data/bom_2.xml000066400000000000000000000002061443244161100200030ustar00rootroot00000000000000ο»Ώ xmldiff-2.6.3/tests/test_data/complex-text-update.expected.html000066400000000000000000000012671443244161100246720ustar00rootroot00000000000000

Let's see. This is some simple text demonstrating the features of the human text differ. This feature attempts to make changelog nice & readable for humans.
The human text differ uses sentences as its first order matching. Let's see.It should handle unknown tags & such just fine.

xmldiff-2.6.3/tests/test_data/complex-text-update.left.html000066400000000000000000000005151443244161100240160ustar00rootroot00000000000000

This is some simple text demonstrating the features of the human text differ. This feature attempts to make changelog nice & readable for humans. The human text differ uses sentences as its first order matching. Let's see.

xmldiff-2.6.3/tests/test_data/complex-text-update.right.html000066400000000000000000000006271443244161100242050ustar00rootroot00000000000000

Let's see. This is some simple text demonstrating the features of the human text differ. This feature attempts to make changelog nice & readable for humans.
The human text differ uses sentences as its first order matching. It should handle unknown tags & such just fine.

xmldiff-2.6.3/tests/test_data/example.expected.html000066400000000000000000000005571443244161100224150ustar00rootroot00000000000000 <diff:insert>HTML </diff:insert>Example<diff:delete> HTML</diff:delete>

My First Heading

My first paragraph has changed.

xmldiff-2.6.3/tests/test_data/example.left.html000066400000000000000000000002211443244161100215320ustar00rootroot00000000000000 Example HTML

My First Heading

My first paragraph.

xmldiff-2.6.3/tests/test_data/example.right.html000066400000000000000000000002361443244161100217230ustar00rootroot00000000000000 HTML Example

My First Heading

My paragraph has changed.

xmldiff-2.6.3/tests/test_data/insert-node.diff000066400000000000000000000002341443244161100213450ustar00rootroot00000000000000[update-text, /body/div[1], "\n "] [insert, /body/div[1], p, 0] [update-text, /body/div/p[1], "Simple text"] [update-text-after, /body/div/p[1], "\n "] xmldiff-2.6.3/tests/test_data/insert-node.expected.html000066400000000000000000000001751443244161100232050ustar00rootroot00000000000000

Simple text

xmldiff-2.6.3/tests/test_data/insert-node.left.html000066400000000000000000000000501443244161100223260ustar00rootroot00000000000000
xmldiff-2.6.3/tests/test_data/insert-node.right.html000066400000000000000000000000771443244161100225220ustar00rootroot00000000000000

Simple text

xmldiff-2.6.3/tests/test_data/namespace.expected.xml000066400000000000000000000004211443244161100225400ustar00rootroot00000000000000 lxml doesn't handle default namespaces in xpSo we need to work around thath xmldiff-2.6.3/tests/test_data/namespace.left.xml000066400000000000000000000001671443244161100217000ustar00rootroot00000000000000 lxml doesn't handle default namespaces in xpath xmldiff-2.6.3/tests/test_data/namespace.right.xml000066400000000000000000000001461443244161100220600ustar00rootroot00000000000000 So we need to work around that xmldiff-2.6.3/tests/test_data/rmldoc.expected.xml000066400000000000000000000446141443244161100221000ustar00rootroot00000000000000 xmldiff-2.6.3/tests/test_data/rmldoc.left.xml000066400000000000000000000420231443244161100212210ustar00rootroot00000000000000 xmldiff-2.6.3/tests/test_data/rmldoc.right.xml000066400000000000000000000427661443244161100214220ustar00rootroot00000000000000 xmldiff-2.6.3/tests/test_data/sbt_template.expected.xml000066400000000000000000000027121443244161100232740ustar00rootroot00000000000000 xmldiff-2.6.3/tests/test_data/sbt_template.left.xml000066400000000000000000000016531443244161100224300ustar00rootroot00000000000000 xmldiff-2.6.3/tests/test_data/sbt_template.right.xml000066400000000000000000000023151443244161100226070ustar00rootroot00000000000000 xmldiff-2.6.3/tests/test_diff.py000066400000000000000000001603251443244161100166450ustar00rootroot00000000000000import os import unittest from lxml import etree from xmldiff import utils from xmldiff.diff import Differ from xmldiff.actions import ( UpdateTextIn, InsertNode, MoveNode, DeleteNode, UpdateAttrib, InsertAttrib, RenameAttrib, DeleteAttrib, UpdateTextAfter, RenameNode, InsertComment, ) from .testing import compare_elements def dedent(string): """Remove the maximum common indent of the lines making up the string.""" lines = string.splitlines() indent = min(len(line) - len(line.lstrip()) for line in lines if line) return "\n".join(line[indent:] if line 
else line for line in lines) class APITests(unittest.TestCase): left = "

Text

More

" right = "

Tokst

More

" lefttree = etree.fromstring(left) righttree = etree.fromstring(right) differ = Differ() def test_set_trees(self): # Passing in just one parameter causes an error: with self.assertRaises(TypeError): self.differ.set_trees(self.lefttree, None) # Passing in something that isn't iterable also cause errors... with self.assertRaises(TypeError): self.differ.set_trees(object(), self.righttree) # This is the way: self.differ.set_trees(self.lefttree, self.righttree) def test_match(self): # Passing in just one parameter causes an error: with self.assertRaises(TypeError): self.differ.match(self.lefttree, None) # Passing in something that isn't iterable also cause errors... with self.assertRaises(TypeError): self.differ.match(object(), self.righttree) # This is the way: res1 = self.differ.match(self.lefttree, self.righttree) lpath = self.differ.left.getroottree().getpath rpath = self.differ.right.getroottree().getpath res1x = [(lpath(x[0]), rpath(x[1]), x[2]) for x in res1] # Or, you can use set_trees: self.differ.set_trees(self.lefttree, self.righttree) res2 = self.differ.match() lpath = self.differ.left.getroottree().getpath rpath = self.differ.right.getroottree().getpath res2x = [(lpath(x[0]), rpath(x[1]), x[2]) for x in res2] # The match sequences should be the same, of course: self.assertEqual(res1x, res2x) # But importantly, they are not the same object, meaning the # matching was redone. self.assertIsNot(res1, res2) # However, if we call match() a second time without setting # new sequences, we'll get a cached result: self.assertIs(self.differ.match(), res2) def test_diff(self): # Passing in just one parameter causes an error: with self.assertRaises(TypeError): list(self.differ.diff(self.lefttree, None)) # Passing in something that isn't iterable also cause errors... 
with self.assertRaises(TypeError): list(self.differ.diff(object(), self.righttree)) # This is the way: res1 = list(self.differ.diff(self.lefttree, self.righttree)) # Or, you can use set_trees() or match() # We need to reparse self.lefttree, since after the diffing they # are equal. self.lefttree = etree.fromstring(self.left) self.differ.set_trees(self.lefttree, self.righttree) res2 = list(self.differ.diff()) # The match sequences should be the same, of course: self.assertEqual(res1, res2) # But importantly, they are not the same object, meaning the # matching was redone. self.assertIsNot(res1, res2) # There is no caching of diff(), so running it again means another # diffing. self.assertIsNot(list(self.differ.diff()), res2) class NodeRatioTests(unittest.TestCase): def test_compare_equal(self): xml = """
First paragraph
Last paragraph
""" tree = etree.fromstring(xml) differ = Differ() differ.set_trees(tree, tree) differ.match() # Every node in these trees should get a 1.0 leaf_ratio, # and if it has children, 1.0 child_ration, else None for left, right in zip( utils.post_order_traverse(differ.left), utils.post_order_traverse(differ.right), ): self.assertEqual(differ.leaf_ratio(left, right), 1.0) if left.getchildren(): self.assertEqual(differ.child_ratio(left, right), 1.0) else: self.assertIsNone(differ.child_ratio(left, right)) def test_compare_different_leafs(self): left = """
This doesn't match at all
First paragraph
Last paragraph
""" right = """
Completely different from before
Another paragraph
Last paragraph
""" lefttree = etree.fromstring(left) righttree = etree.fromstring(right) differ = Differ() # Make some choice comparisons here # These node are exactly the same left = lefttree.xpath("/document/story/section[3]/para")[0] right = righttree.xpath("/document/story/section[3]/para")[0] self.assertEqual(differ.leaf_ratio(left, right), 1.0) # These nodes have slightly different text, but no children left = lefttree.xpath("/document/story/section[2]/para")[0] right = righttree.xpath("/document/story/section[2]/para")[0] self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.75) # These nodes should not be very similar left = lefttree.xpath("/document/story/section[1]/para")[0] right = righttree.xpath("/document/story/section[1]/para")[0] self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.45614035087719) def test_compare_different_nodes(self): left = """
First paragraph Second paragraph
Third paragraph
Last paragraph
""" right = """
First paragraph
Second paragraph Third paragraph
Last paragraph
""" differ = Differ() differ.set_trees(etree.fromstring(left), etree.fromstring(right)) differ.match() # Make some choice comparisons here. leaf_ratio will always be 1.0, # as these leafs have the same attributes and no text, even though # attributes may be in different order. left = differ.left.xpath("/document/story/section[1]")[0] right = differ.right.xpath("/document/story/section[1]")[0] self.assertEqual(differ.leaf_ratio(left, right), 1.0) # Only one of two matches: self.assertEqual(differ.child_ratio(left, right), 0.5) left = differ.left.xpath("/document/story/section[2]")[0] right = differ.right.xpath("/document/story/section[2]")[0] self.assertEqual(differ.leaf_ratio(left, right), 1.0) # Only one of two matches: self.assertEqual(differ.child_ratio(left, right), 0.5) # These nodes should not be very similar left = differ.left.xpath("/document/story/section[3]")[0] right = differ.right.xpath("/document/story/section[3]")[0] self.assertEqual(differ.leaf_ratio(left, right), 1.0) self.assertEqual(differ.child_ratio(left, right), 1.0) def test_compare_with_xmlid(self): left = """
First paragraph This is the second paragraph
Det tredje stycket
Last paragraph
""" right = """
First paragraph
This is the second Det tredje stycket
Last paragraph
""" differ = Differ() differ.set_trees(etree.fromstring(left), etree.fromstring(right)) differ.match() # Make some choice comparisons here. left = differ.left.xpath("/document/story/section[1]")[0] right = differ.right.xpath("/document/story/section[1]")[0] # These are very similar self.assertEqual(differ.leaf_ratio(left, right), 0.9) # And one out of two children in common self.assertEqual(differ.child_ratio(left, right), 0.5) # But different id's, hence 0 as match self.assertEqual(differ.node_ratio(left, right), 0) # Here's the ones with the same id: left = differ.left.xpath("/document/story/section[1]")[0] right = differ.right.xpath("/document/story/section[2]")[0] # Only one out of two children in common self.assertEqual(differ.child_ratio(left, right), 0.5) # But same id's, hence 1 as match self.assertEqual(differ.node_ratio(left, right), 1.0) # The last ones are completely similar, but only one # has an xml:id, so they do not match. left = differ.left.xpath("/document/story/section[3]")[0] right = differ.right.xpath("/document/story/section[3]")[0] self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.81818181818) self.assertEqual(differ.child_ratio(left, right), 1.0) self.assertEqual(differ.node_ratio(left, right), 0) def test_compare_with_uniqueattrs(self): # `uniqueattrs` can be pairs of (tag, attribute) as well as just string # attributes. left = dedent( """\
First paragraph This is the second paragraph
Det tredje stycket
Last paragraph
""" ) right = dedent( """\
First paragraph
This is the second Det tredje stycket
Last paragraph
First paragraph This is the second paragraph
""" ) differ = Differ( uniqueattrs=[ ("section", "name"), "{http://www.w3.org/XML/1998/namespace}id", ] ) differ.set_trees(etree.fromstring(left), etree.fromstring(right)) differ.match() # Make some choice comparisons here. left = differ.left.xpath("/document/story/section[1]")[0] right = differ.right.xpath("/document/story/section[1]")[0] # These are very similar self.assertEqual(differ.leaf_ratio(left, right), 0.90625) # And one out of two children in common self.assertEqual(differ.child_ratio(left, right), 0.5) # But different names, hence 0 as match self.assertEqual(differ.node_ratio(left, right), 0) # Here's the ones with the same tag and name attribute: left = differ.left.xpath("/document/story/section[1]")[0] right = differ.right.xpath("/document/story/section[2]")[0] # Only one out of two children in common self.assertEqual(differ.child_ratio(left, right), 0) # But same id's, hence 1 as match self.assertEqual(differ.node_ratio(left, right), 1.0) # The last ones are completely similar, but only one # has an name, so they do not match. left = differ.left.xpath("/document/story/section[3]")[0] right = differ.right.xpath("/document/story/section[3]")[0] self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.78260869565) self.assertEqual(differ.child_ratio(left, right), 1.0) self.assertEqual(differ.node_ratio(left, right), 0) # Now these are structurally similar, have the same name, but # one of them is not a section, so the uniqueattr does not match left = differ.left.xpath("/document/story/section[1]")[0] right = differ.right.xpath("/document/story/subsection[1]")[0] self.assertAlmostEqual(differ.leaf_ratio(left, right), 1.0) self.assertEqual(differ.child_ratio(left, right), 0.5) self.assertAlmostEqual(differ.node_ratio(left, right), 0.7905694150420949) def test_compare_node_rename(self): left = """ First paragraph Second paragraph Third paragraph """ right = """
First paragraph
Second paragraph
A different text
""" differ = Differ() differ.set_trees(etree.fromstring(left), etree.fromstring(right)) differ.match() # Make some choice comparisons here. left = differ.left.xpath("/document/para[1]")[0] right = differ.right.xpath("/document/section[1]")[0] # These have different tags, but should still match self.assertEqual(differ.leaf_ratio(left, right), 1.0) # These have different tags, and different attribute value, # but still similar enough left = differ.left.xpath("/document/para[2]")[0] right = differ.right.xpath("/document/section[2]")[0] # These have different tags, but should still match self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.76190476190476) # These have different tags, and different attribute value, # but still similar enough left = differ.left.xpath("/document/para[3]")[0] right = differ.right.xpath("/document/section[3]")[0] # These are too different self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.45161290322580) def test_compare_namespaces(self): left = """ First paragraph """ right = """ First paragraph """ differ = Differ() differ.set_trees(etree.fromstring(left), etree.fromstring(right)) differ.match() # Make some choice comparisons here. 
left = differ.left.xpath( "/document/foo:para[1]", namespaces={"foo": "someuri"} )[0] right = differ.right.xpath( "/document/foo:para[1]", namespaces={"foo": "otheruri"} )[0] # These have different namespaces, but should still match self.assertEqual(differ.leaf_ratio(left, right), 1.0) def test_different_ratio_modes(self): node1 = etree.Element("para") node1.text = "This doesn't match at all" node2 = etree.Element("para") node2.text = "It's completely different" node3 = etree.Element("para") node3.text = "Completely different from before" # These texts are very different differ = Differ(ratio_mode="accurate") self.assertAlmostEqual(differ.leaf_ratio(node1, node2), 0.24) # However, the quick_ratio doesn't catch that, and think they match differ = Differ(ratio_mode="fast") self.assertAlmostEqual(differ.leaf_ratio(node1, node2), 0.64) # It still realizes these sentences are different, though. differ = Differ(ratio_mode="fast") self.assertAlmostEqual(differ.leaf_ratio(node1, node3), 0.4561403508) # Faster thinks the first two are the same! differ = Differ(ratio_mode="faster") self.assertAlmostEqual(differ.leaf_ratio(node1, node2), 1.0) # And that the third is almost the same differ = Differ(ratio_mode="faster") self.assertAlmostEqual(differ.leaf_ratio(node1, node3), 0.8771929824) # Invalid modes raise error: with self.assertRaises(ValueError): differ = Differ(ratio_mode="allezlebleus") class MatchTests(unittest.TestCase): def _match(self, left, right): left_tree = etree.fromstring(left) right_tree = etree.fromstring(right) differ = Differ() differ.set_trees(left_tree, right_tree) matches = differ.match() lpath = differ.left.getroottree().getpath rpath = differ.right.getroottree().getpath return [(lpath(item[0]), rpath(item[1])) for item in matches] def test_same_tree(self): xml = """
First paragraph
Last paragraph
""" result = self._match(xml, xml) nodes = list(utils.post_order_traverse(etree.fromstring(xml))) # Everything matches self.assertEqual(len(result), len(nodes)) def test_no_xml_id_match(self): # Here we insert a section first, but because they contain numbering # it's easy to match section 1 in left with section 2 in right, # though it should be detected as an insert. # If the number of similar attributes are few it works fine, the # differing content of the ref="3" section means it's detected to # be an insert. left = """
First paragraph
Last paragraph
""" # We even detect that the first section is an insert without # xmlid, but that's less reliable. right = """
New paragraph
First paragraph
Last paragraph
""" result = self._match(left, right) self.assertEqual( result, [ ("/document/story/section[1]/para", "/document/story/section[2]/para"), ("/document/story/section[1]", "/document/story/section[2]"), ("/document/story/section[2]/para", "/document/story/section[3]/para"), ("/document/story/section[2]", "/document/story/section[3]"), ("/document/story", "/document/story"), ("/document", "/document"), ], ) def test_with_xmlid(self): # This first section contains attributes that are similar (and longer # than the content text. That would trick the matcher into matching # the oldfirst and the newfirst section to match, except that we # this time also have xml:id's, and they trump everything else! left = """
First paragraph
Second paragraph
Last paragraph
""" # We even detect that the first section is an insert without # xmlid, but that's less reliable. right = """
New paragraph
First paragraph
Second paragraph
Last paragraph
""" result = self._match(left, right) self.assertEqual( result, [ ("/document/story/section[1]/para", "/document/story/section[2]/para"), ("/document/story/section[1]", "/document/story/section[2]"), ("/document/story/section[2]/para", "/document/story/section[3]/para"), ("/document/story/section[2]", "/document/story/section[3]"), ("/document/story/section[3]/para", "/document/story/section[4]/para"), ("/document/story/section[3]", "/document/story/section[4]"), ("/document/story", "/document/story"), ("/document", "/document"), ], ) def test_change_attribs(self): left = """
First
Last
""" right = """
First
Last
""" # It matches everything straight, which means the attrib changes # should become updates, which makes sense. result = self._match(left, right) self.assertEqual( result, [ ("/document/story/section[1]/para", "/document/story/section[1]/para"), ("/document/story/section[1]", "/document/story/section[1]"), ("/document/story/section[2]/para", "/document/story/section[2]/para"), ("/document/story/section[2]", "/document/story/section[2]"), ("/document/story", "/document/story"), ("/document", "/document"), ], ) def test_move_paragraph(self): left = """
First paragraph Second paragraph
Last paragraph
""" right = """
First paragraph
Second paragraph Last paragraph
""" result = self._match(left, right) self.assertEqual( result, [ ( "/document/story/section[1]/para[1]", "/document/story/section[1]/para", ), ( "/document/story/section[1]/para[2]", "/document/story/section[2]/para[1]", ), ("/document/story/section[1]", "/document/story/section[1]"), ( "/document/story/section[2]/para", "/document/story/section[2]/para[2]", ), ("/document/story/section[2]", "/document/story/section[2]"), ("/document/story", "/document/story"), ("/document", "/document"), ], ) def test_match_complex_text(self): left = """ Consultant shall not indemnify and hold Company, its affiliates and their respective directors, officers, agents and employees harmless from and against all claims, demands, losses, damages and judgments, including court costs and attorneys' fees, arising out of or based upon (a) any claim that the Services provided hereunder or, any related Intellectual Property Rights or the exercise of any rights in or to any Company-Related Development or Pre-Existing Development or related Intellectual Property Rights infringe on, constitute a misappropriation of the subject matter of, or otherwise violate any patent, copyright, trade secret, trademark or other proprietary right of any person or breaches any person's contractual rights; This is strange, but true. 
""" right = """ Consultant shall not indemnify and hold Company, its affiliates and their respective directors, officers, agents and employees harmless from and against all claims, demands, losses, excluding court costs and attorneys' fees, arising out of or based upon (a) any claim that the Services provided hereunder or, any related Intellectual Property Rights or the exercise of any rights in or to any Company-Related Development or Pre-Existing Development or related Intellectual Property Rights infringe on, constitute a misappropriation of the subject matter of, or otherwise violate any patent, copyright, trade secret, trademark or other proprietary right of any person or breaches any person's contractual rights; This is very strange, but true. """ result = self._match(left, right) self.assertEqual( result, [ ("/wrap/para/b", "/wrap/para/b"), ("/wrap/para", "/wrap/para"), ("/wrap", "/wrap"), ], ) def test_match_insert_node(self): left = """ """ right = """

Inserted Node

""" result = self._match(left, right) self.assertEqual( result, [ ("/document/story", "/document/story"), ("/document", "/document"), ], ) def test_entirely_different(self): left = """ """ right = """

Inserted Node

""" result = self._match(left, right) self.assertEqual( result, [ ("/document", "/document"), ], ) class BestFastMatchTests(unittest.TestCase): def _match(self, left, right, fast_match=False, best_match=False): left_tree = etree.fromstring(left) right_tree = etree.fromstring(right) differ = Differ(fast_match=fast_match, best_match=best_match) differ.set_trees(left_tree, right_tree) matches = differ.match() lpath = differ.left.getroottree().getpath rpath = differ.right.getroottree().getpath return [(lpath(item[0]), rpath(item[1])) for item in matches] def test_move_paragraph(self): left = """
First paragraph Second paragraph
Last paragraph
""" right = """
First paragraph
Second paragraph Last paragraph
""" # Same matches as the non-fast match test, but the matches are # a different order. slow_result = sorted(self._match(left, right)) fast_result = sorted(self._match(left, right, fast_match=True)) best_result = sorted(self._match(left, right, best_match=True)) self.assertEqual(slow_result, fast_result) self.assertEqual(slow_result, best_result) def test_move_children(self): # Here the paragraphs are all so similar that that each paragraph # will match any other. left = """
First paragraph Second paragraph Last paragraph
""" right = """
Second paragraph Last paragraph First paragraph
""" # The slow match will match the nodes that match *best*, so it will # find that paragraphs have moved around. slow_result = sorted(self._match(left, right, False)) self.assertEqual( slow_result, [ ("/document", "/document"), ("/document/story", "/document/story"), ("/document/story/section", "/document/story/section"), ("/document/story/section/para[1]", "/document/story/section/para[3]"), ("/document/story/section/para[2]", "/document/story/section/para[1]"), ("/document/story/section/para[3]", "/document/story/section/para[2]"), ], ) # But the fast match will just pick any that matches. fast_result = sorted(self._match(left, right, True)) self.assertEqual( fast_result, [ ("/document", "/document"), ("/document/story", "/document/story"), ("/document/story/section", "/document/story/section"), ("/document/story/section/para[1]", "/document/story/section/para[1]"), ("/document/story/section/para[2]", "/document/story/section/para[2]"), ("/document/story/section/para[3]", "/document/story/section/para[3]"), ], ) # Best should be as good as slow (but slower) best_result = sorted(self._match(left, right, best_match=True)) self.assertEqual(best_result, slow_result) def test_delete_node(self): # If you have a list of similar nodes, and delete one, that # confuses both the standard and the fast algorithm: left = """ """ right = """ """ slow_result = sorted(self._match(left, right)) fast_result = sorted(self._match(left, right, fast_match=True)) best_result = sorted(self._match(left, right, best_match=True)) self.assertEqual( slow_result, [ ("/root", "/root"), ("/root/node[1]", "/root/node[1]"), ("/root/node[2]", "/root/node[2]"), ("/root/node[3]", "/root/node[3]"), ("/root/node[4]", "/root/node[4]"), ], ) self.assertEqual(fast_result, slow_result) self.assertEqual( best_result, [ ("/root", "/root"), ("/root/node[1]", "/root/node[1]"), ("/root/node[2]", "/root/node[2]"), ("/root/node[4]", "/root/node[3]"), ("/root/node[5]", "/root/node[4]"), ], ) class 
UpdateNodeTests(unittest.TestCase): """Testing only the update phase of the diffing""" def _match(self, left, right): left_tree = etree.fromstring(left) right_tree = etree.fromstring(right) differ = Differ() differ.set_trees(left_tree, right_tree) matches = differ.match() steps = [] for left, right, m in matches: steps.extend(differ.update_node_attr(left, right)) steps.extend(differ.update_node_text(left, right)) return steps def test_same_tree(self): xml = """
First paragraph
Last paragraph
""" result = self._match(xml, xml) # Everything matches self.assertEqual(result, []) def test_attribute_changes(self): left = ( """The contained textAnd a tail!""" ) right = ( """The new textAlso a tail!""" ) result = self._match(left, right) self.assertEqual( result, [ UpdateAttrib("/root/node[1]", "attr2", "uhhuh"), RenameAttrib("/root/node[1]", "attr1", "attr4"), InsertAttrib("/root/node[1]", "attr5", "new"), DeleteAttrib("/root/node[1]", "attr0"), UpdateTextIn("/root/node[1]", "The new text"), UpdateTextAfter("/root/node[1]", "Also a tail!"), ], ) class AlignChildrenTests(unittest.TestCase): """Testing only the align phase of the diffing""" def _align(self, left, right): left_tree = etree.fromstring(left) right_tree = etree.fromstring(right) differ = Differ() differ.set_trees(left_tree, right_tree) matches = differ.match() steps = [] for left, right, m in matches: steps.extend(differ.align_children(left, right)) return steps def test_same_tree(self): xml = """
First paragraph
Last paragraph
""" result = self._align(xml, xml) # Everything matches self.assertEqual(result, []) def test_move_paragraph(self): left = """
First paragraph Second paragraph
Last paragraph
""" right = """
First paragraph
Second paragraph Last paragraph
""" result = self._align(left, right) # Everything matches self.assertEqual(result, []) def test_move_children(self): left = """
First paragraph Second paragraph Last paragraph
""" right = """
Second paragraph Last paragraph First paragraph
""" result = self._align(left, right) self.assertEqual( result, [ MoveNode( "/document/story/section/para[1]", "/document/story/section[1]", 2 ) ], ) class DiffTests(unittest.TestCase): """Testing only the align phase of the diffing""" def _diff(self, left, right): parser = etree.XMLParser(remove_blank_text=True) left_tree = etree.fromstring(left, parser) right_tree = etree.fromstring(right, parser) differ = Differ() differ.set_trees(left_tree, right_tree) editscript = list(differ.diff()) compare_elements(differ.left, differ.right) return editscript def test_process(self): left = """
First paragraph Second paragraph Third paragraph
Delete it
""" right = """
First paragraph Second paragraph
Third paragraph Fourth paragraph
""" result = self._diff(left, right) self.assertEqual( result, [ InsertNode("/document/story[1]", "section", 1), InsertAttrib("/document/story/section[2]", "ref", "4"), InsertAttrib("/document/story/section[2]", "single-ref", "4"), MoveNode( "/document/story/section[1]/para[3]", "/document/story/section[2]", 0, ), InsertNode("/document/story/section[2]", "para", 1), UpdateTextIn("/document/story/section[2]/para[2]", "Fourth paragraph"), DeleteNode("/document/story/deleteme/para[1]"), DeleteNode("/document/story/deleteme[1]"), ], ) def test_needs_align(self): left = "

1

2

3

4

" right = "

2

4

1

3

" result = self._diff(left, right) self.assertEqual( result, [ MoveNode("/root/n[1]", "/root[1]", 1), MoveNode("/root/n[2]/p[2]", "/root/n[1]", 0), ], ) def test_no_root_match(self): left = ( '

1

2

3

' "

4

" ) right = "

2

4

1

3

" result = self._diff(left, right) self.assertEqual( result, [ DeleteAttrib("/root[1]", "attr"), MoveNode("/root/root/n[2]", "/root[1]", 0), MoveNode("/root/root/n[1]", "/root[1]", 1), MoveNode("/root/n[2]/p[2]", "/root/n[1]", 0), DeleteNode("/root/root[1]"), ], ) def test_rmldoc(self): here = os.path.split(__file__)[0] lfile = os.path.join(here, "test_data", "rmldoc.left.xml") rfile = os.path.join(here, "test_data", "rmldoc.right.xml") with open(lfile, encoding="utf8") as infile: left = infile.read() with open(rfile, encoding="utf8") as infile: right = infile.read() result = self._diff(left, right) self.assertEqual( result, [ InsertNode( "/document/story[1]", "{http://namespaces.shoobx.com/application}section", 4, ), InsertAttrib("/document/story/app:section[4]", "hidden", "false"), InsertAttrib("/document/story/app:section[4]", "name", "sign"), InsertAttrib("/document/story/app:section[4]", "ref", "3"), InsertAttrib("/document/story/app:section[4]", "removed", "false"), InsertAttrib("/document/story/app:section[4]", "single-ref", "3"), InsertAttrib( "/document/story/app:section[4]", "title", "Signing Bonus" ), UpdateAttrib("/document/story/app:section[5]", "ref", "4"), UpdateAttrib("/document/story/app:section[5]", "single-ref", "4"), UpdateAttrib("/document/story/app:section[6]", "ref", "5"), UpdateAttrib("/document/story/app:section[6]", "single-ref", "5"), UpdateAttrib("/document/story/app:section[7]", "ref", "6"), UpdateAttrib("/document/story/app:section[7]", "single-ref", "6"), UpdateAttrib("/document/story/app:section[8]", "ref", "7"), UpdateAttrib("/document/story/app:section[8]", "single-ref", "7"), UpdateAttrib("/document/story/app:section[9]", "ref", "8"), UpdateAttrib("/document/story/app:section[9]", "single-ref", "8"), UpdateAttrib("/document/story/app:section[10]", "ref", "9"), UpdateAttrib("/document/story/app:section[10]", "single-ref", "9"), UpdateAttrib("/document/story/app:section[11]", "ref", "10"), 
UpdateAttrib("/document/story/app:section[11]", "single-ref", "10"), UpdateAttrib("/document/story/app:section[12]", "ref", "11"), UpdateAttrib("/document/story/app:section[12]", "single-ref", "11"), UpdateAttrib("/document/story/app:section[14]", "ref", "12"), UpdateAttrib("/document/story/app:section[14]", "single-ref", "12"), InsertNode( "/document/story/app:section[4]", "{http://namespaces.shoobx.com/application}term", 0, ), InsertAttrib( "/document/story/app:section[4]/app:term[1]", "name", "sign_bonus" ), InsertAttrib("/document/story/app:section[4]/app:term[1]", "set", "ol"), InsertNode("/document/story/app:section[4]", "para", 1), UpdateTextIn( "/document/story/app:section[1]/para[2]/" "app:placeholder[1]", "consectetur", ), InsertNode( "/document/story/app:section[4]/para[1]", "{http://namespaces.shoobx.com/application}ref", 0, ), InsertAttrib( "/document/story/app:section[4]/para/app:ref[1]", "name", "sign" ), InsertAttrib( "/document/story/app:section[4]/para/app:ref[1]", "{http://namespaces.shoobx.com/preview}body", "", ), UpdateTextIn("/document/story/app:section[4]/para/app:ref[1]", "3"), UpdateTextAfter("/document/story/app:section[4]/para/app:ref[1]", "eu"), InsertNode("/document/story/app:section[4]/para[1]", "u", 1), UpdateTextAfter( "/document/story/app:section[4]/para/u[1]", "ntum augue.\n\nAliquam nec tortor diam. Ph", ), InsertNode( "/document/story/app:section[4]/para[1]", "{http://namespaces.shoobx.com/application}placeholder", 2, ), InsertAttrib( "/document/story/app:section[4]/para/app:placeholder[1]", "field", "ol.sign_bonus_include_amt", ), InsertAttrib( "/document/story/app:section[4]/para/app:placeholder[1]", "missing", "Signing Bonus Amount", ), UpdateTextAfter( "/document/story/app:section[4]/para/app:placeholder[1]", "asellus congue accumsan tempor. 
Donec vel risus se", ), UpdateTextIn("/document/story/app:section[5]/para/app:ref[1]", "4"), UpdateTextIn("/document/story/app:section[6]/para/app:ref[1]", "5"), UpdateTextIn("/document/story/app:section[7]/para/app:ref[1]", "6"), UpdateTextIn("/document/story/app:section[8]/para/app:ref[1]", "7"), UpdateTextIn("/document/story/app:section[9]/para/app:ref[1]", "8"), UpdateTextIn("/document/story/app:section[10]/para/app:ref[1]", "9"), UpdateTextIn("/document/story/app:section[11]/para/app:ref[1]", "10"), UpdateTextIn("/document/story/app:section[12]/para/app:ref[1]", "11"), InsertNode("/document/story/app:section[4]/para/u[1]", "b", 0), UpdateTextIn( "/document/story/app:section[4]/para/u/b[1]", "ger nec ferme" ), ], ) def test_sbt_template(self): here = os.path.split(__file__)[0] lfile = os.path.join(here, "test_data", "sbt_template.left.xml") rfile = os.path.join(here, "test_data", "sbt_template.right.xml") with open(lfile, encoding="utf8") as infile: left = infile.read() with open(rfile, encoding="utf8") as infile: right = infile.read() result = self._diff(left, right) bm_bm_bm = "/metal:block/metal:block/metal:block" self.assertEqual( result, [ UpdateAttrib( bm_bm_bm + "/app:section[1]", "hidden", "advisor.payment_type == 'none'", ), UpdateAttrib( bm_bm_bm + "/app:section/tal:if[1]", "condition", "python: advisor.payment_type == 'stock_award'", ), InsertNode( bm_bm_bm + "/app:section[1]", "{http://xml.zope.org/namespaces/tal}if", 1, ), InsertAttrib( bm_bm_bm + "/app:section/tal:if[2]", "condition", "python: advisor.payment_type == 'cash'", ), InsertNode( bm_bm_bm + "/app:section[1]", "{http://xml.zope.org/namespaces/tal}if", 2, ), InsertAttrib( bm_bm_bm + "/app:section/tal:if[3]", "condition", "python: advisor.payment_type == 'stock_award_and_cash'", ), InsertNode(bm_bm_bm + "/app:section/tal:if[1]", "para", 0), UpdateTextIn( bm_bm_bm + "/app:section/tal:if[1]/para[1]", "\n A " ), InsertNode(bm_bm_bm + "/app:section/tal:if[2]", "para", 0), UpdateTextIn( 
bm_bm_bm + "/app:section/tal:if[2]/para[1]", "\n More text for diffing purposes\n ", ), InsertNode(bm_bm_bm + "/app:section/tal:if[3]", "para", 0), UpdateTextIn( bm_bm_bm + "/app:section/tal:if[3]/para[1]", "\n Lorem hipster ipso facto\n ", ), InsertNode(bm_bm_bm + "/app:section/tal:if[1]/para[1]", "i", 0), UpdateTextIn(bm_bm_bm + "/app:section/tal:if[1]/para[1]/i[1]", "whole"), UpdateTextAfter( bm_bm_bm + "/app:section/tal:if[1]/para[1]/i[1]", " load of formatted text and ", ), InsertNode(bm_bm_bm + "/app:section/tal:if[1]/para[1]", "br", 1), UpdateTextAfter( bm_bm_bm + "/app:section/tal:if[1]/para[1]/br[1]", " other stuff.\n ", ), DeleteNode(bm_bm_bm + "/app:section/tal:if[1]/para[2]/b[1]"), DeleteNode(bm_bm_bm + "/app:section/tal:if[1]/para[2]"), ], ) def test_namespace(self): # Test changing nodes and attributes with namespaces left = """ Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque feugiat metus quam. Suspendisse potenti. Vestibulum quis ornare felis, ac elementum sem. Second paragraph Third paragraph Paragraph to tweak the matching of the section node By making many matching children Until the node matches properly. """ right = """ Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque feugiat metus quam. Suspendisse potenti. Vestibulum quis ornare felis, ac elementum sem. Second paragraph Third paragraph Paragraph to tweak the matching of the section node By making many matching children Until the node matches properly. """ result = self._diff(left, right) self.assertEqual( result, [ RenameNode("/document/story/app:section/foo:para[1]", "{someuri}para"), InsertAttrib( "/document/story/app:section/app:para[3]", "{someuri}attrib", "value", ), ], ) def test_multiple_tag_deletes(self): left = """
  • One
  • Two
  • Three
""" right = """ """ result = self._diff(left, right) self.assertEqual( result, [ UpdateTextIn("/document/story[1]", "\n "), DeleteNode("/document/story/ul/li[3]"), DeleteNode("/document/story/ul/li[2]"), DeleteNode("/document/story/ul/li[1]"), DeleteNode("/document/story/ul[1]"), ], ) def test_insert_comment(self): left = "Something" right = "Something" result = self._diff(left, right) self.assertEqual(result, [InsertComment("/doc[1]", 0, " New comment! ")]) def test_issue_21_default_namespaces(self): # When you have a default namespace you get "*" instead of the # expected "tag" in the XPath. This is how libxml does it, # and they say it has to be like that, so we document it. left = 'old' right = 'new' result = self._diff(left, right) self.assertEqual(result[0].node, "/*[1]") def test_ignore_attribute(self): # this differ ignores the attribute 'skip' when diffing class IgnoringDiffer(Differ): def node_attribs(self, node): if "skip" in node.attrib: attribs = dict(node.attrib) del attribs["skip"] return attribs return node.attrib left = 'text' right = 'text' parser = etree.XMLParser(remove_blank_text=True) left_tree = etree.fromstring(left, parser) right_tree = etree.fromstring(right, parser) differ = IgnoringDiffer() differ.set_trees(left_tree, right_tree) editscript = list(differ.diff()) self.assertEqual(editscript, []) def test_compare_with_ignore_attrs(self): left = dedent( """\ """ ) right = dedent( """\
""" ) parser = etree.XMLParser(remove_blank_text=True) left_tree = etree.fromstring(left, parser) right_tree = etree.fromstring(right, parser) differ = Differ(ignored_attrs=["uuid"]) differ.set_trees(left_tree, right_tree) editscript = list(differ.diff()) self.assertEqual(editscript, []) xmldiff-2.6.3/tests/test_formatting.py000066400000000000000000000525571443244161100201160ustar00rootroot00000000000000import os import sys import unittest from lxml import etree from xmldiff import formatting, main, actions from .testing import generate_filebased_cases START = 'This is a tag with \ue006formatted\ue005 text.

", ) replacer.undo_element(element) self.assertEqual(etree.tounicode(element), text) # Non formatting tags get replaced with content text = "

This is a tag with formatted text.

" element = etree.fromstring(text) replacer.do_element(element) result = etree.tounicode(element) self.assertEqual(result, "

This is a tag with \ue007 text.

") # Single formatting tags still get two placeholders. text = "

This is a with text.

" element = etree.fromstring(text) replacer.do_element(element) result = etree.tounicode(element) self.assertEqual(result, "

This is a \ue009\ue008 with \ue00a text.

") def test_do_undo_element(self): replacer = formatting.PlaceholderMaker(["p"], ["b"]) # Formatting tags get replaced, and the content remains text = "

This a tag with formatted text.

" element = etree.fromstring(text) replacer.do_element(element) self.assertEqual( element.text, "This \ue005 a \ue006 with \ue008formatted" "\ue007 text." ) replacer.undo_element(element) result = etree.tounicode(element) self.assertEqual(result, text) def test_do_undo_element_double_format(self): replacer = formatting.PlaceholderMaker(["p"], ["b", "u"]) # Formatting tags get replaced, and the content remains text = "

This is doubly formatted text.

" element = etree.fromstring(text) replacer.do_element(element) self.assertEqual( element.text, "This is \ue006doubly \ue008formatted\ue007" "\ue005 text." ) replacer.undo_element(element) result = etree.tounicode(element) self.assertEqual(result, text) def test_rml_bug(self): etree.register_namespace(formatting.DIFF_PREFIX, formatting.DIFF_NS) before_diff = """
4. At Will Employment .\u201cText\u201d
""" tree = etree.fromstring(before_diff) replacer = formatting.PlaceholderMaker( text_tags=("para",), formatting_tags=( "b", "u", "i", ), ) replacer.do_tree(tree) after_diff = """
\ue005. \ue007\ue009At Will Employment\ue008\ue006 .\u201cNew Text\u201d
""" # The diff formatting will find some text to insert. delete_attrib = "{%s}delete-format" % formatting.DIFF_NS replacer.placeholder2tag["\ue006"].element.attrib[delete_attrib] = "" replacer.placeholder2tag["\ue007"].element.attrib[delete_attrib] = "" tree = etree.fromstring(after_diff) replacer.undo_tree(tree) result = etree.tounicode(tree) expected = """
4. At Will Employment .\u201cNew Text\u201d
""" self.assertEqual(result, expected) def test_placeholder_overflow(self): # PY3: This whole test is Python 2 support. # Test what happens when we have more than 6400 placeholders, # by patching the placeholder: try: orig_start = formatting.PLACEHOLDER_START # This is the last character of the Private use area formatting.PLACEHOLDER_START = 0xF8FF replacer = formatting.PlaceholderMaker(["p"], ["b"]) # Formatting tags get replaced, and the content remains text = "

This a tag with some text.

" element = etree.fromstring(text) replacer.do_element(element) # self.assertEqual( element.text, "This \uf904 a \uf905 with \uf907some" "\uf906 text." ) try: # If this is a wide build, also test what happens if we # get over 8192 substitutions, and overflow the 2-byte code. # (On narrow builds this will give an error) formatting.PLACEHOLDER_START = 0xFFFF replacer = formatting.PlaceholderMaker(["p"], ["b"]) # Formatting tags get replaced, and the content remains text = "

This a tag with some text.

" element = etree.fromstring(text) replacer.do_element(element) # This should raise an error on a narrow build self.assertEqual( element.text, "This \U00010004 a \U00010005 with \U00010007some" "\U00010006 text.", ) except ValueError: if sys.maxunicode > 0x10000: # This is a wide build, we should NOT get an error raise finally: # Set it back formatting.PLACEHOLDER_START = orig_start class XMLFormatTests(unittest.TestCase): def _format_test(self, left, action, expected): formatter = formatting.XMLFormatter(pretty_print=False) result = formatter.format([action], etree.fromstring(left)) self.assertEqual(result, expected) def test_incorrect_xpaths(self): left = 'Text' expected = START + ' diff:delete-attr="a">Text' + END with self.assertRaises(ValueError): action = actions.DeleteAttrib("/document/node", "a") self._format_test(left, action, expected) with self.assertRaises(ValueError): action = actions.DeleteAttrib("/document/ummagumma", "a") self._format_test(left, action, expected) def test_del_attr(self): left = 'Text' action = actions.DeleteAttrib("/document/node", "a") expected = START + ' diff:delete-attr="a">Text' + END self._format_test(left, action, expected) def test_del_node(self): left = 'Text' action = actions.DeleteNode("/document/node") expected = START + ' attr="val" diff:delete="">Text' + END self._format_test(left, action, expected) def test_del_text(self): left = 'Text' action = actions.UpdateTextIn("/document/node", None) expected = START + ' attr="val">Text' + END self._format_test(left, action, expected) def test_insert_attr(self): left = "We need more text" action = actions.InsertAttrib("/document/node", "attr", "val") expected = START + ' attr="val" diff:add-attr="attr">' "We need more text" + END self._format_test(left, action, expected) def test_insert_node(self): left = "" action = actions.InsertNode("/document", "node", 0) expected = START + ' diff:insert=""/>
' self._format_test(left, action, expected) def test_move_attr(self): # The library currently only uses move attr for when attributes are # renamed: left = 'Text' action = actions.RenameAttrib("/document/node", "attr", "bottr") expected = START + ' bottr="val" diff:rename-attr="attr:bottr"' ">Text" + END self._format_test(left, action, expected) def test_move_node(self): # Move 1 down left = '' action = actions.MoveNode("/document/node[1]", "/document", 1) expected = ( START + ' id="1" diff:delete=""/>' ) self._format_test(left, action, expected) # Move 2 up (same result, different diff) left = '' action = actions.MoveNode("/document/node[2]", "/document", 0) expected = ( START + ' id="2" diff:insert=""/>' ) self._format_test(left, action, expected) def test_rename_node(self): left = "ContentTail" action = actions.RenameNode("/document/node[1]/para[1]", "newtag") expected = START + '>Content' "Tail" + END self._format_test(left, action, expected) def test_update_attr(self): left = '' action = actions.UpdateAttrib("/document/node", "attr", "newval") expected = START + ' attr="newval" diff:update-attr="attr:val"/>' "" self._format_test(left, action, expected) def test_update_text_in(self): left = '' action = actions.UpdateTextIn("/document/node", "Text") expected = START + ' attr="val">Text' + END self._format_test(left, action, expected) left = "This is a bit of text, right" + END action = actions.UpdateTextIn("/document/node", "Also a bit of text, rick") expected = ( START + ">This is" "Also a bit of text, right" "ck" + END ) self._format_test(left, action, expected) def test_update_text_after_1(self): left = "" action = actions.UpdateTextAfter("/document/node[1]", "Text") expected = START + "/>Text" "" self._format_test(left, action, expected) def test_update_text_after_2(self): left = "This is a bit of text, right" action = actions.UpdateTextAfter("/document/node", "Also a bit of text, rick") expected = ( START + "/>This is" "Also a bit of text, ri" "ghtck" ) 
self._format_test(left, action, expected) class DiffFormatTests(unittest.TestCase): def _format_test(self, action, expected): formatter = formatting.DiffFormatter() result = formatter.format([action], None) self.assertEqual(result, expected) def test_del_attr(self): action = actions.DeleteAttrib("/document/node", "a") expected = "[delete-attribute, /document/node, a]" self._format_test(action, expected) def test_del_node(self): action = actions.DeleteNode("/document/node") expected = "[delete, /document/node]" self._format_test(action, expected) def test_del_text(self): action = actions.UpdateTextIn("/document/node", None) expected = "[update-text, /document/node, null]" self._format_test(action, expected) def test_insert_attr(self): action = actions.InsertAttrib("/document/node", "attr", "val") expected = '[insert-attribute, /document/node, attr, "val"]' self._format_test(action, expected) def test_insert_node(self): action = actions.InsertNode("/document", "node", 0) expected = "[insert, /document, node, 0]" self._format_test(action, expected) def test_rename_attr(self): action = actions.RenameAttrib("/document/node", "attr", "bottr") expected = "[rename-attribute, /document/node, attr, bottr]" self._format_test(action, expected) def test_move_node(self): # Move 1 down action = actions.MoveNode("/document/node[1]", "/document", 1) expected = "[move, /document/node[1], /document, 1]" self._format_test(action, expected) # Move 2 up (same result, different diff) action = actions.MoveNode("/document/node[2]", "/document", 0) expected = "[move, /document/node[2], /document, 0]" self._format_test(action, expected) def test_rename_node(self): # Move 1 down action = actions.RenameNode("/document/node[1]", "newtag") expected = "[rename, /document/node[1], newtag]" self._format_test(action, expected) # Move 2 up (same result, different diff) action = actions.MoveNode("/document/node[2]", "/document", 0) expected = "[move, /document/node[2], /document, 0]" 
self._format_test(action, expected) def test_update_attr(self): action = actions.UpdateAttrib("/document/node", "attr", "newval") expected = '[update-attribute, /document/node, attr, "newval"]' self._format_test(action, expected) def test_update_text_in(self): action = actions.UpdateTextIn("/document/node", "Text") expected = '[update-text, /document/node, "Text"]' self._format_test(action, expected) action = actions.UpdateTextIn("/document/node", 'Also a bit of text, "rick"') expected = "[update-text, /document/node, " '"Also a bit of text, \\"rick\\""]' self._format_test(action, expected) def test_update_text_after_1(self): action = actions.UpdateTextAfter("/document/node[1]", "Text") expected = '[update-text-after, /document/node[1], "Text"]' self._format_test(action, expected) def test_update_text_after_2(self): action = actions.UpdateTextAfter("/document/node", "Also a bit of text, rick") expected = "[update-text-after, /document/node, " '"Also a bit of text, rick"]' self._format_test(action, expected) def test_insert_comment(self): action = actions.InsertComment("/document/node", 2, "Commentary") expected = '[insert-comment, /document/node, 2, "Commentary"]' self._format_test(action, expected) class XmlDiffFormatTests(unittest.TestCase): # RenameAttr and MoveNode requires an orig_tree, so they # are not tested in the _format_test tests, but in the # all_actions test, which uses test_data files. 
def _format_test(self, action, expected): formatter = formatting.XmlDiffFormatter() result = formatter.format([action], None) self.assertEqual(result, expected) def test_del_attr(self): action = actions.DeleteAttrib("/document/node", "a") expected = "[remove, /document/node/@a]" self._format_test(action, expected) def test_del_node(self): action = actions.DeleteNode("/document/node") expected = "[remove, /document/node]" self._format_test(action, expected) def test_del_text(self): action = actions.UpdateTextIn("/document/node", None) expected = "[update, /document/node/text()[1], null]" self._format_test(action, expected) def test_insert_attr(self): action = actions.InsertAttrib("/document/node", "attr", "val") expected = "[insert, /document/node, \n<@attr>\nval\n]" self._format_test(action, expected) def test_insert_node(self): action = actions.InsertNode("/document", "node", 0) expected = "[insert-first, /document, \n]" self._format_test(action, expected) def test_rename_node(self): # Move 1 down action = actions.RenameNode("/document/node[1]", "newtag") expected = "[rename, /document/node[1], newtag]" self._format_test(action, expected) # Move 2 up (same result, different diff) action = actions.MoveNode("/document/node[2]", "/document", 0) expected = "[move-first, /document/node[2], /document]" self._format_test(action, expected) def test_update_attr(self): action = actions.UpdateAttrib("/document/node", "attr", "newval") expected = '[update, /document/node/@attr, "newval"]' self._format_test(action, expected) def test_update_text_in(self): action = actions.UpdateTextIn("/document/node", "Text") expected = '[update, /document/node/text()[1], "Text"]' self._format_test(action, expected) action = actions.UpdateTextIn("/document/node", 'Also a bit of text, "rick"') expected = ( "[update, /document/node/text()[1], " '"Also a bit of text, \\"rick\\""]' ) self._format_test(action, expected) def test_update_text_after_1(self): action = 
actions.UpdateTextAfter("/document/node[1]", "Text") expected = '[update, /document/node[1]/text()[2], "Text"]' self._format_test(action, expected) def test_update_text_after_2(self): action = actions.UpdateTextAfter("/document/node", "Also a bit of text, rick") expected = "[update, /document/node/text()[2], " '"Also a bit of text, rick"]' self._format_test(action, expected) def test_all_actions(self): here = os.path.split(__file__)[0] lfile = os.path.join(here, "test_data", "all_actions.left.xml") rfile = os.path.join(here, "test_data", "all_actions.right.xml") formatter = formatting.XmlDiffFormatter() result = main.diff_files(lfile, rfile, formatter=formatter) expected = ( "[insert-namespace, space, http://namespaces.shoobx.com/outerspace]\n" "[delete-namespace, name]\n" "[move-after, /document/node[2], /document/tag[1]]\n" "[insert-comment, /document[1], 0, Insert a new comment ]\n" '[update, /document/node[1]/@name, "was updated"]\n' "[remove, /document/node[1]/@attribute]\n" "[insert, /document/node[1], \n" "<@newtribute>\n" "renamed\n" "]\n" "[insert, /document/node[1], \n" "<@this>\n" "is new\n" "]\n" "[remove, /document/node[1]/@attr]\n" '[update, /document/node[1]/text()[1], "\\n Modified\\n "]\n' '[update, /document/node[1]/text()[2], "\\n ' 'New tail content\\n "]\n' "[rename, /document/node[2], nod]\n" "[rename, /document/name:space[1], {http://namespaces.shoobx.com/outerspace}name]\n" '[update, /document/space:name[1]/text()[2], "\\n "]\n' "[remove, /document/tail[1]]" ) self.assertEqual(result, expected) class FormatterFileTests(unittest.TestCase): formatter = None # Override this maxDiff = None def process(self, left, right): return main.diff_files(left, right, formatter=self.formatter) class XMLFormatterFileTests(FormatterFileTests): # The XMLFormatter has no text or formatting tags, so formatter = formatting.XMLFormatter( pretty_print=False, normalize=formatting.WS_TEXT ) # Also test the bits that handle text tags: class 
HTMLFormatterFileTests(FormatterFileTests): # We use a few tags for the placeholder tests. #
is intentionally left out, to test an edge case # with empty non-formatting tags in text. formatter = formatting.XMLFormatter( normalize=formatting.WS_BOTH, pretty_print=True, text_tags=("p", "h1", "h2", "h3", "h4", "h5", "h6", "li"), formatting_tags=( "b", "u", "i", "strike", "em", "super", "sup", "sub", "link", "a", "span", ), ) # Add tests that use no placeholder replacement (ie plain XML) data_dir = os.path.join(os.path.dirname(__file__), "test_data") generate_filebased_cases(data_dir, XMLFormatterFileTests) # Add tests that use placeholder replacement (ie HTML) data_dir = os.path.join(os.path.dirname(__file__), "test_data") generate_filebased_cases(data_dir, HTMLFormatterFileTests, suffix="html") xmldiff-2.6.3/tests/test_main.py000066400000000000000000000150701443244161100166550ustar00rootroot00000000000000import io import os import sys import unittest from lxml import etree from xmldiff import main, formatting CURDIR = os.path.split(__file__)[0] LEFT_FILE = os.path.join(CURDIR, "test_data", "rmldoc.left.xml") RIGHT_FILE = os.path.join(CURDIR, "test_data", "rmldoc.right.xml") EXPECTED_FILE = os.path.join(CURDIR, "test_data", "rmldoc.expected.xml") class MainAPITests(unittest.TestCase): def test_api_diff_files(self): # diff_files can take filenames result1 = main.diff_files(LEFT_FILE, RIGHT_FILE) # Or open file streams: with open(LEFT_FILE, "rb") as linfile: with open(RIGHT_FILE, "rb") as rinfile: result2 = main.diff_files(linfile, rinfile) self.assertEqual(result1, result2) # Give something else, and it fails: with self.assertRaises(IOError): main.diff_files("", "") def test_api_diff_texts(self): # diff_text can take bytes with open(LEFT_FILE, "rb") as linfile: with open(RIGHT_FILE, "rb") as rinfile: left = linfile.read() right = rinfile.read() result1 = main.diff_texts(left, right) # And unicode result2 = main.diff_texts(left.decode("utf8"), right.decode("utf8")) self.assertEqual(result1, result2) with open(LEFT_FILE, "rb") as infile: with open(RIGHT_FILE, 
"rb") as infile: # Give something else, and it fails: with self.assertRaises(ValueError): main.diff_texts(infile, infile) def test_api_diff_trees(self): # diff_tree can take ElementEtrees left = etree.parse(LEFT_FILE) right = etree.parse(RIGHT_FILE) result1 = main.diff_trees(left, right) # And Elements result2 = main.diff_trees(left.getroot(), right.getroot()) self.assertEqual(result1, result2) # Give something else, and it fails: with self.assertRaises(TypeError): main.diff_trees(LEFT_FILE, RIGHT_FILE) def test_api_diff_files_with_formatter(self): formatter = formatting.XMLFormatter() # diff_files can take filenames result = main.diff_files(LEFT_FILE, RIGHT_FILE, formatter=formatter) # This formatter will insert a diff namespace: self.assertIn('xmlns:diff="http://namespaces.shoobx.com/diff"', result) class MainCLITests(unittest.TestCase): def call_run(self, args, command=main.diff_command): output = io.StringIO() errors = io.StringIO() stdout = sys.stdout stderr = sys.stderr try: sys.stdout = output sys.stderr = errors command(args) finally: sys.stdout = stdout sys.stderr = stderr return output.getvalue(), errors.getvalue() def test_diff_cli_no_args(self): with self.assertRaises(SystemExit): stdout, stderr = self.call_run([]) def test_diff_cli_simple(self): curdir = os.path.dirname(__file__) filepath = os.path.join(curdir, "test_data") file1 = os.path.join(filepath, "insert-node.left.html") file2 = os.path.join(filepath, "insert-node.right.html") output, errors = self.call_run([file1, file2]) self.assertEqual(len(output.splitlines()), 3) # This should default to the diff formatter: self.assertEqual(output[0], "[") def test_diff_cli_BOM(self): """Test comparison of files encoded with UTF-8 prepended by Byte Order Mark""" curdir = os.path.dirname(__file__) filepath = os.path.join(curdir, "test_data") file1 = os.path.join(filepath, "bom_1.xml") file2 = os.path.join(filepath, "bom_2.xml") output, errors = self.call_run([file1, file2]) 
self.assertEqual(len(output.splitlines()), 1) # This should default to the diff formatter: self.assertEqual(output[0], "[") def test_diff_cli_args(self): curdir = os.path.dirname(__file__) filepath = os.path.join(curdir, "test_data") file1 = os.path.join(filepath, "example.left.html") file2 = os.path.join(filepath, "example.right.html") # Select a formatter: output, errors = self.call_run([file1, file2, "--formatter", "xml"]) # It gives a very compact output self.assertEqual(len(output.splitlines()), 2) # Now it's XML self.assertEqual(output[0], "<") # Don't strip the whitespace keeps the formatting from the source: output, errors = self.call_run( [file1, file2, "--keep-whitespace", "--formatter", "xml"] ) self.assertEqual(len(output.splitlines()), 13) # And stripping and pretty printing gives a longer readable output output, errors = self.call_run( [file1, file2, "--pretty-print", "--formatter", "xml"] ) self.assertEqual(len(output.splitlines()), 11) # The default output gives six lines for six actions output, errors = self.call_run([file1, file2, "--ratio-mode", "fast"]) self.assertEqual(len(output.splitlines()), 6) # 'fast' is default, so it's the same output output2, errors = self.call_run([file1, file2, "--ratio-mode", "fast"]) self.assertEqual(output, output2) # Accurate is the same in this case, although sometimes it isn't output2, errors = self.call_run([file1, file2, "--ratio-mode", "accurate"]) self.assertEqual(output, output2) # But "faster" gives nine actions instead of six output, errors = self.call_run([file1, file2, "--ratio-mode", "faster"]) self.assertEqual(len(output.splitlines()), 9) # You can specify unique attributes: output, errors = self.call_run( [file1, file2, "--unique-attributes", "id,foo,frotz"] ) self.assertEqual(len(output.splitlines()), 6) # Or none output, errors = self.call_run([file1, file2, "--unique-attributes"]) self.assertEqual(len(output.splitlines()), 6) def test_patch_cli_simple(self): curdir = os.path.dirname(__file__) 
filepath = os.path.join(curdir, "test_data") patchfile = os.path.join(filepath, "insert-node.diff") xmlfile = os.path.join(filepath, "insert-node.left.html") output, errors = self.call_run([patchfile, xmlfile], command=main.patch_command) expectedfile = os.path.join(filepath, "insert-node.right.html") with open(expectedfile) as f: expected = f.read() self.assertEqual(output, expected) xmldiff-2.6.3/tests/test_patch.py000066400000000000000000000201471443244161100170310ustar00rootroot00000000000000import os import unittest from lxml import etree from xmldiff.formatting import DiffFormatter, WS_NONE from xmldiff.main import diff_trees, diff_texts, patch_text, patch_file from xmldiff.patch import Patcher, DiffParser from xmldiff.actions import ( UpdateTextIn, InsertNode, MoveNode, DeleteNode, UpdateAttrib, InsertAttrib, RenameAttrib, DeleteAttrib, UpdateTextAfter, RenameNode, InsertComment, ) from .testing import compare_elements class PatcherTests(unittest.TestCase): patcher = Patcher() def _test(self, start, action, end): tree = etree.fromstring(start) self.patcher.handle_action(action, tree) self.assertEqual(etree.tounicode(tree), end) def test_delete_node(self): self._test("", DeleteNode("/root/deleteme"), "") def test_insert_node(self): self._test( "", InsertNode("/root/anode", "newnode", 0), "", ) def test_rename_node(self): self._test( "", RenameNode("/root/oldname", "newname"), "", ) def test_move_node(self): self._test( "", MoveNode("/root/anode/moveme", "/root", 1), "", ) def test_update_text_in(self): self._test( "", UpdateTextIn("/root/anode", "New text"), "New text", ) def test_update_text_after(self): self._test( "", UpdateTextAfter("/root/anode", "New text"), "New text", ) def test_update_attrib(self): self._test( '', UpdateAttrib("/root/anode", "attrib", "newvalue"), '', ) def test_delete_attrib(self): self._test( '', DeleteAttrib("/root/anode", "attrib"), "", ) def test_insert_attrib(self): self._test( "", InsertAttrib("/root/anode", "attrib", 
"value"), '', ) def test_rename_attrib(self): self._test( '', RenameAttrib("/root/anode", "oldname", "newname"), '', ) def test_insert_comment(self): self._test( "", InsertComment("/root", 1, "This is a new comment"), "", ) class DiffPatch(unittest.TestCase): def test_diff_patch(self): here = os.path.split(__file__)[0] lfile = os.path.join(here, "test_data", "all_actions.left.xml") rfile = os.path.join(here, "test_data", "all_actions.right.xml") left = etree.parse(lfile) right = etree.parse(rfile) diff = diff_trees(left, right) result = Patcher().patch(diff, left) # This example has top level comments, and lxml doesn't deal well # with that, so the trees are not EXACTLY the same, the trailing # top level comment differs, but that's OK. compare_elements(result, right.getroot()) def test_diff_default_namespace(self): here = os.path.split(__file__)[0] lfile = os.path.join(here, "test_data", "namespace.left.xml") rfile = os.path.join(here, "test_data", "namespace.right.xml") left = etree.parse(lfile) right = etree.parse(rfile) diff = diff_trees(left, right) result = Patcher().patch(diff, left) # This example has top level comments, and lxml doesn't deal well # with that, so the trees are not EXACTLY the same, the trailing # top level comment differs, but that's OK. 
compare_elements(result, right.getroot()) TEST_DIFF = """[delete, node] [insert, target, tag, 0] [rename, node, tag] [move, node, target, 0] [update-text, node, "text"] [update-text-after, node, "text"] [update-attribute, node, name, "value"] [delete-attribute, node, name] [insert-attribute, node, name, "value"] [rename-attribute, node, oldname, newname] [insert-comment, target, 0, "text"] """ class ParserTests(unittest.TestCase): def test_make_action(self): parser = DiffParser() self.assertEqual(parser.make_action("[delete, node]"), DeleteNode("node")) self.assertEqual( parser.make_action("[insert, target, tag, 0]"), InsertNode("target", "tag", 0), ) self.assertEqual( parser.make_action("[rename, node, tag]"), RenameNode("node", "tag") ) self.assertEqual( parser.make_action("[move, node, target, 0]"), MoveNode("node", "target", 0) ) self.assertEqual( parser.make_action('[update-text, node, "text"]'), UpdateTextIn("node", "text"), ) self.assertEqual( parser.make_action('[update-text-after, node, "text"]'), UpdateTextAfter("node", "text"), ) self.assertEqual( parser.make_action('[update-attribute, node, name, "value"]'), UpdateAttrib("node", "name", "value"), ) self.assertEqual( parser.make_action("[delete-attribute, node, name]"), DeleteAttrib("node", "name"), ) self.assertEqual( parser.make_action('[insert-attribute, node, name, "value"]'), InsertAttrib("node", "name", "value"), ) self.assertEqual( parser.make_action("[rename-attribute, node, oldname, newname]"), RenameAttrib("node", "oldname", "newname"), ) self.assertEqual( parser.make_action('[insert-comment, target, 0, "text"]'), InsertComment("target", 0, "text"), ) def test_parse(self): parser = DiffParser() actions = list(parser.parse(TEST_DIFF)) self.assertEqual(len(actions), len(TEST_DIFF.splitlines())) def test_parse_broken(self): # Testing incorrect patch files parser = DiffParser() # Empty file, nothing happens actions = list(parser.parse("")) self.assertEqual(actions, []) # Not a diff raises error 
with self.assertRaises(ValueError): actions = list(parser.parse("Not a diff")) # It should handle lines that have been broken, say in an email actions = list(parser.parse('[insert-comment, target,\n 0, "text"]')) self.assertEqual(actions, [InsertComment("target", 0, "text")]) # It should not handle broken files with self.assertRaises(ValueError): actions = list(parser.parse("[insert-comment, target,\n")) def test_diff_patch(self): here = os.path.split(__file__)[0] lfile = os.path.join(here, "test_data", "all_actions.left.xml") rfile = os.path.join(here, "test_data", "all_actions.right.xml") with open(lfile) as f: left = f.read() with open(rfile) as f: right = f.read() diff = diff_texts(left, right, formatter=DiffFormatter(normalize=WS_NONE)) result = patch_text(diff, left) compare_elements(etree.fromstring(result), etree.fromstring(right)) def test_patch_stream(self): here = os.path.join(os.path.split(__file__)[0], "test_data") xmlfile = os.path.join(here, "insert-node.left.html") patchfile = os.path.join(here, "insert-node.diff") result = patch_file(patchfile, xmlfile) expectedfile = os.path.join(here, "insert-node.right.html") with open(expectedfile) as f: expected = f.read() # lxml.etree.parse() will strip ending whitespace self.assertEqual(result, expected.rstrip()) xmldiff-2.6.3/tests/test_utils.py000066400000000000000000000115541443244161100170740ustar00rootroot00000000000000import unittest from lxml import etree from xmldiff import utils class TraverseTests(unittest.TestCase): def test_post_order(self): xml = """
First paragraph
Last paragraph
""" root = etree.fromstring(xml) tree = root.getroottree() res = [tree.getpath(x) for x in utils.post_order_traverse(root)] self.assertEqual( res, [ "/document/story/section[1]/para", "/document/story/section[1]", "/document/story/section[2]/para", "/document/story/section[2]", "/document/story", "/document", ], ) def test_reverse_post_order(self): xml = """
First paragraph
Last paragraph
""" root = etree.fromstring(xml) tree = root.getroottree() res = [tree.getpath(x) for x in utils.reverse_post_order_traverse(root)] self.assertEqual( res, [ "/document/story/section[2]/para", "/document/story/section[2]", "/document/story/section[1]/para", "/document/story/section[1]", "/document/story", "/document", ], ) def test_breadth_first(self): xml = """
First paragraph Second paragraph
Third paragraph Fourth paragraph
Fifth paragraph
""" root = etree.fromstring(xml) tree = root.getroottree() res = [tree.getpath(x) for x in utils.breadth_first_traverse(root)] self.assertEqual( res, [ "/document", "/document/story[1]", "/document/story[2]", "/document/story[1]/section[1]", "/document/story[1]/section[2]", "/document/story[2]/section", "/document/story[1]/section[1]/para[1]", "/document/story[1]/section[1]/para[2]", "/document/story[1]/section[2]/para[1]", "/document/story[1]/section[2]/para[2]", "/document/story[2]/section/para", "/document/story[1]/section[1]/para[1]/i", "/document/story[1]/section[2]/para[2]/b", ], ) class LongestCommonSubsequenceTests(unittest.TestCase): def _diff(self, left, right, result): res = [] for x, y in utils.longest_common_subsequence(left, right): self.assertEqual(left[x], right[y]) res.append(left[x]) self.assertEqual("".join(res), result) def test_lcs(self): self._diff("ABCDEF", "ABCDEF", "ABCDEF") self._diff("ABCDEF", "GHIJKL", "") self._diff("ABCDEF", "ACDQRB", "ACD") self._diff("CXCDEFX", "CDEFX", "CDEFX") self._diff("HUMAN", "CHIMPANZEE", "HMAN") self._diff("ABCDEF", "A", "A") self._diff("123AAAAAAAAA", "123BBBBBBBBB", "123") self._diff("AAAAAAAAA123", "BBBBBBBBB123", "123") self._diff("ABCDE1", "1FGHIJK", "1") # There are several correct options here, make sure that doesn't # confuse it, we want just one, and don't care which. self._diff("HORSEBACK", "SNOWFLAKE", "SAK") # Empty sequences: self._diff("", "", "") class MakeAsciiTreeTests(unittest.TestCase): def test_make_ascii_tree(self): xml = """
First paragraph
Last paragraph
""" root = etree.fromstring(xml) tree = utils.make_ascii_tree(root) self.assertEqual( tree, " document \n story \n section \n para (delete)\n" " section \n para \n diff:insert ", ) xmldiff-2.6.3/tests/testing.py000066400000000000000000000033021443244161100163420ustar00rootroot00000000000000import os def make_case_function(left_filename): right_filename = left_filename.replace(".left.", ".right.") expected_filename = left_filename.replace(".left.", ".expected.") def test(self): with open(expected_filename, encoding="utf8") as input_file: expected_xml = input_file.read() try: result_xml = self.process(left_filename, right_filename) except Exception as err: if ".err" not in left_filename: raise result_xml = f"{err.__class__.__name__}: {err}" self.assertEqual(expected_xml.strip(), result_xml.strip()) return test def generate_filebased_cases(data_dir, test_class, suffix="xml", ignore=()): for left_filename in os.listdir(data_dir): if not left_filename.endswith(".left." + suffix): continue if left_filename in ignore: continue left_filename = os.path.join(data_dir, left_filename) test_function = make_case_function(left_filename) function_name = os.path.split(left_filename)[-1].replace(".", "-") test_name = "test_" + function_name setattr(test_class, test_name, test_function) def compare_elements(left, right): path = left.getroottree().getpath(left) assert left.text == right.text, "Texts differ: %s" % path assert left.tail == right.tail, "Tails differ: %s" % path assert left.attrib == right.attrib, "Attributes differ: %s" % path # We intentionally do NOT compare namespaces, they are allowed to differ assert len(left) == len(right), "Children differ: %s" % path for litem, ritem in zip(left.getchildren(), right.getchildren()): compare_elements(litem, ritem) 
xmldiff-2.6.3/xmldiff/000077500000000000000000000000001443244161100146045ustar00rootroot00000000000000xmldiff-2.6.3/xmldiff/__init__.py000066400000000000000000000000001443244161100167030ustar00rootroot00000000000000xmldiff-2.6.3/xmldiff/actions.py000066400000000000000000000015141443244161100166170ustar00rootroot00000000000000from collections import namedtuple # The edit script actions used in xmldiff DeleteNode = namedtuple("DeleteNode", "node") InsertNode = namedtuple("InsertNode", "target tag position") RenameNode = namedtuple("RenameNode", "node tag") MoveNode = namedtuple("MoveNode", "node target position") UpdateTextIn = namedtuple("UpdateTextIn", "node text") UpdateTextAfter = namedtuple("UpdateTextAfter", "node text") UpdateAttrib = namedtuple("UpdateAttrib", "node name value") DeleteAttrib = namedtuple("DeleteAttrib", "node name") InsertAttrib = namedtuple("InsertAttrib", "node name value") RenameAttrib = namedtuple("RenameAttrib", "node oldname newname") InsertComment = namedtuple("InsertComment", "target position text") InsertNamespace = namedtuple("InsertNamespace", "prefix uri") DeleteNamespace = namedtuple("DeleteNamespace", "prefix") xmldiff-2.6.3/xmldiff/diff.py000066400000000000000000000472731443244161100161030ustar00rootroot00000000000000from copy import deepcopy from difflib import SequenceMatcher from lxml import etree from math import sqrt from xmldiff import utils, actions class Differ: def __init__( self, F=None, uniqueattrs=None, ratio_mode="fast", fast_match=False, best_match=False, ignored_attrs=[], ): # The minimum similarity between two nodes to consider them equal if F is None: F = 0.5 self.F = F # uniqueattrs is a list of attributes or (tag, attribute) pairs # that uniquely identifies a node inside a document. Defaults # to 'xml:id'. 
        if uniqueattrs is None:
            # Default: the W3C xml:id attribute (Clark notation) is the
            # only attribute treated as a unique node identifier.
            uniqueattrs = ["{http://www.w3.org/XML/1998/namespace}id"]
        self.uniqueattrs = uniqueattrs
        self.fast_match = fast_match
        self.best_match = best_match
        # Avoid recreating this for every node
        self._sequencematcher = SequenceMatcher()
        # Pick which SequenceMatcher ratio method to use; per difflib,
        # quick_ratio/real_quick_ratio are faster upper bounds of ratio.
        if ratio_mode == "fast":
            self._sequence_ratio = self._sequencematcher.quick_ratio
        elif ratio_mode == "accurate":
            self._sequence_ratio = self._sequencematcher.ratio
        elif ratio_mode == "faster":
            self._sequence_ratio = self._sequencematcher.real_quick_ratio
        else:
            raise ValueError("Unknown ratio_mode '%s'" % ratio_mode)
        self.ignored_attrs = ignored_attrs
        self.clear()

    def clear(self):
        """Reset all per-diff state so this instance can be reused."""
        # Use None for all values, as markings that they aren't done yet.
        self.left = None
        self.right = None
        self._matches = None
        self._l2rmap = None
        self._r2lmap = None
        self._inorder = None
        # Well, except the text cache, it's used by the ratio tests,
        # so we set that to a dict so the tests work.
        self._text_cache = {}

    def set_trees(self, left, right):
        """Store the two trees to compare, clearing any earlier state.

        Accepts lxml ElementTrees or Elements; raises TypeError otherwise.
        """
        self.clear()
        # Make sure we were passed two lxml elements:
        if isinstance(left, etree._ElementTree):
            left = left.getroot()
        if isinstance(right, etree._ElementTree):
            right = right.getroot()
        if not (etree.iselement(left) and etree.iselement(right)):
            raise TypeError(
                "The 'left' and 'right' parameters must be " "lxml Elements."
            )
        # Left gets modified as a part of the diff, deepcopy it first.
        self.left = deepcopy(left)
        self.right = right

    def append_match(self, lnode, rnode, max_match):
        """Record that lnode (left tree) is matched with rnode (right tree)."""
        self._matches.append((lnode, rnode, max_match))
        # Both lookup maps are keyed on id(), i.e. node identity.
        self._l2rmap[id(lnode)] = rnode
        self._r2lmap[id(rnode)] = lnode

    def match(self, left=None, right=None):
        """Compute and cache the node matching between the two trees.

        Returns a list of (left_node, right_node, similarity) tuples.
        """
        # This is not a generator, because the diff() functions needs
        # _l2rmap and _r2lmap, so if match() was a generator, then
        # diff() would have to first do list(self.match()) without storing
        # the result, and that would be silly.
        # Nothing in this library is actually using the resulting list of
        # matches match() returns, but it may be useful for somebody that
        # actually do not want a diff, but only a list of matches.
        # It also makes testing the match function easier.

        if left is not None or right is not None:
            self.set_trees(left, right)

        if self._matches is not None:
            # We already matched these sequences, use the cache
            return self._matches

        # Initialize the caches:
        self._matches = []
        self._l2rmap = {}
        self._r2lmap = {}
        self._inorder = set()
        self._text_cache = {}

        # Generate the node lists
        lnodes = list(utils.post_order_traverse(self.left))
        rnodes = list(utils.post_order_traverse(self.right))

        # TODO: If the roots do not match, we should create new roots, and
        # have the old roots be children of the new roots, but let's skip
        # that for now, we don't need it. That's strictly a part of the
        # insert phase, but hey, even the paper defining the phases
        # ignores the phases, so...

        # For now, just make sure the roots are matched, we do that by
        # removing them from the lists of nodes, so it can't match, and add
        # them back last.
        lnodes.remove(self.left)
        rnodes.remove(self.right)

        if self.fast_match:
            # First find matches with longest_common_subsequence:
            matches = list(
                utils.longest_common_subsequence(
                    lnodes, rnodes, lambda x, y: self.node_ratio(x, y) >= self.F
                )
            )

            # Add the matches (I prefer this from start to finish):
            for left_match, right_match in matches:
                self.append_match(lnodes[left_match], rnodes[right_match], None)

            # Then remove the nodes (needs to be done backwards):
            for left_match, right_match in reversed(matches):
                lnodes.pop(left_match)
                rnodes.pop(right_match)

        elif self.best_match:
            unmatched_lnodes = []
            # First find all nodes that match perfectly
            for lnode in lnodes:
                max_match = 0
                match_node = None

                for rnode in rnodes:
                    match = self.node_ratio(lnode, rnode)
                    if match == 1.0:
                        self.append_match(lnode, rnode, 1.0)
                        rnodes.remove(rnode)
                        break

                    if match > max_match:
                        match_node = rnode
                        max_match = match
                else:
                    # for-else: no perfect match found for lnode; remember
                    # its best candidate for the second pass below.
                    unmatched_lnodes.append((lnode, match_node, max_match))
                    # unmatched_lnodes.append(lnode)

            lnodes = []
            for lnode, rnode, max_match in unmatched_lnodes:
                if max_match >= self.F and rnode in rnodes:
                    self.append_match(lnode, rnode, max_match)
                else:
                    lnodes.append(lnode)

        # Fallback pass: runs over whatever is left in lnodes (all nodes
        # unless fast_match/best_match consumed some above).
        for lnode in lnodes:
            max_match = 0
            match_node = None

            for rnode in rnodes:
                match = self.node_ratio(lnode, rnode)
                if match > max_match:
                    match_node = rnode
                    max_match = match

                # Try to shortcut for nodes that are not only equal but also
                # in the same place in the tree
                if match == 1.0:
                    # This is a total match, break here
                    break

            if max_match >= self.F:
                self.append_match(lnode, match_node, max_match)

                # We don't want to check nodes that already are matched
                if match_node is not None:
                    rnodes.remove(match_node)

        # Match the roots
        self.append_match(self.left, self.right, 1.0)
        return self._matches

    def node_ratio(self, left, right):
        """Return a 0..1 similarity score for two nodes."""
        if left.tag is etree.Comment or right.tag is etree.Comment:
            if left.tag is etree.Comment and right.tag is etree.Comment:
                # comments
                self._sequencematcher.set_seqs(left.text, right.text)
                return \
self._sequence_ratio() # One is a comment the other is not: return 0 for attr in self.uniqueattrs: if not isinstance(attr, str): # If it's actually a sequence of (tag, attr), the tags must # match first. tag, attr = attr if tag != left.tag or tag != right.tag: continue if attr in left.attrib or attr in right.attrib: # One of the nodes have a unique attribute, we check only that. # If only one node has it, it means they are not the same. return int(left.attrib.get(attr) == right.attrib.get(attr)) match = self.leaf_ratio(left, right) child_ratio = self.child_ratio(left, right) if child_ratio is not None: match = sqrt((match**2 + child_ratio**2) / 2) return match def node_text(self, node): if node in self._text_cache: return self._text_cache[node] # Get the texts and the tag as a start texts = node.xpath("text()") # Then add attributes and values for tag, value in sorted(self.node_attribs(node).items()): if tag[0] == "{": tag = tag.split( "}", )[-1] texts.append(f"{tag}:{value}") # Finally make one string, useful to see how similar two nodes are text = " ".join(texts).strip() result = utils.cleanup_whitespace(text) self._text_cache[node] = result return result def node_attribs(self, node): """Return a dict of attributes to consider for this node.""" attribs = dict(node.attrib) for key in self.ignored_attrs: attribs.pop(key, None) return attribs def leaf_ratio(self, left, right): # How similar two nodes are, with no consideration of their children # We use a simple ratio here, I tried Levenshtein distances # but that took a 100 times longer. 
        ltext = self.node_text(left)
        rtext = self.node_text(right)

        self._sequencematcher.set_seqs(ltext, rtext)
        return self._sequence_ratio()

    def child_ratio(self, left, right):
        """Return the fraction of matched children, or None for two leaves."""
        # How similar the children of two nodes are
        left_children = left.getchildren()
        right_children = right.getchildren()
        if not left_children and not right_children:
            return None

        count = 0
        child_count = max((len(left_children), len(right_children)))
        for lchild in left_children:
            for rchild in right_children:
                if self._l2rmap.get(id(lchild)) is rchild:
                    count += 1
                    # Each right child can only be counted once.
                    right_children.remove(rchild)
                    break
        return count / child_count

    def update_node_tag(self, left, right):
        """Yield a RenameNode action if the tags differ; patches left."""
        if left.tag != right.tag:
            left_xpath = utils.getpath(left)
            yield actions.RenameNode(left_xpath, right.tag)
            left.tag = right.tag

    def update_node_attr(self, left, right):
        """Yield attribute update/rename/insert/delete actions.

        The left node is patched in place as the actions are generated.
        """
        left_xpath = utils.getpath(left)

        # Update: Look for differences in attributes
        left_keys = set(self.node_attribs(left).keys())
        right_keys = set(self.node_attribs(right).keys())
        new_keys = right_keys.difference(left_keys)
        removed_keys = left_keys.difference(right_keys)
        common_keys = left_keys.intersection(right_keys)

        # We sort the attributes to get a consistent order in the edit script.
        # That's only so we can do testing in a reasonable way...
        for key in sorted(common_keys):
            if left.attrib[key] != right.attrib[key]:
                yield actions.UpdateAttrib(left_xpath, key, right.attrib[key])
                left.attrib[key] = right.attrib[key]

        # Align: Not needed here, we don't care about the order of
        # attributes.

        # Move: Check if any of the new attributes have the same value
        # as the removed attributes.
        # If they do, it's actually
        # a renaming, and a move is one action instead of remove + insert
        newattrmap = {v: k for (k, v) in right.attrib.items() if k in new_keys}

        for lk in sorted(removed_keys):
            value = left.attrib[lk]
            if value in newattrmap:
                rk = newattrmap[value]
                yield actions.RenameAttrib(left_xpath, lk, rk)
                # Remove from list of new attributes
                new_keys.remove(rk)
                # Delete used attribute from map of attributes
                del newattrmap[value]
                # Update left node
                left.attrib[rk] = value
                del left.attrib[lk]

        # Insert: Find new attributes
        for key in sorted(new_keys):
            yield actions.InsertAttrib(left_xpath, key, right.attrib[key])
            left.attrib[key] = right.attrib[key]

        # Delete: remove removed attributes
        for key in sorted(removed_keys):
            if key not in left.attrib:
                # This was already moved
                continue
            yield actions.DeleteAttrib(left_xpath, key)
            del left.attrib[key]

    def update_node_text(self, left, right):
        """Yield text and tail update actions; patches the left node."""
        left_xpath = utils.getpath(left)

        if left.text != right.text:
            yield actions.UpdateTextIn(left_xpath, right.text)
            left.text = right.text

        if left.tail != right.tail:
            yield actions.UpdateTextAfter(left_xpath, right.tail)
            left.tail = right.tail

    def find_pos(self, node):
        """Return the insert position in the left tree for node's partner.

        `node` is a node in the right tree; the position is derived from
        the nearest preceding sibling that is already marked "in order".
        """
        parent = node.getparent()
        # The paper here first checks if the child is the first child in
        # order, but I am entirely unable to actually make that happen, and
        # if it does, the "else:" will catch that case anyway, and it also
        # deals with the case of no child being in order.

        # Find the last sibling before the child that is in order
        i = parent.index(node)
        while i >= 1:
            i -= 1
            sibling = parent[i]
            if sibling in self._inorder:
                # That's it
                break
        else:
            # while-else: no previous sibling in order.
            return 0

        # Now find the partner of this in the left tree
        sibling_match = self._r2lmap[id(sibling)]
        node_match = self._r2lmap.get(id(node))

        i = 0
        for child in sibling_match.getparent().getchildren():
            if child is node_match:
                # Don't count the node we're looking for.
continue if child in self._inorder or child not in self._l2rmap: # Count nodes that are in order, or will be deleted: i += 1 if child is sibling_match: # We found the position! break return i def align_children(self, left, right): lchildren = [ c for c in left.getchildren() if (id(c) in self._l2rmap and self._l2rmap[id(c)].getparent() is right) ] rchildren = [ c for c in right.getchildren() if (id(c) in self._r2lmap and self._r2lmap[id(c)].getparent() is left) ] if not lchildren or not rchildren: # Nothing to align return lcs = utils.longest_common_subsequence( lchildren, rchildren, lambda x, y: self._l2rmap[id(x)] is y ) for x, y in lcs: # Mark these as in order self._inorder.add(lchildren[x]) self._inorder.add(rchildren[y]) # Go over those children that are not in order: for lchild in lchildren: if lchild in self._inorder: # Already aligned continue rchild = self._l2rmap[id(lchild)] right_pos = self.find_pos(rchild) rtarget = rchild.getparent() ltarget = self._r2lmap[id(rtarget)] yield actions.MoveNode( utils.getpath(lchild), utils.getpath(ltarget), right_pos ) # Do the actual move: left.remove(lchild) ltarget.insert(right_pos, lchild) # Mark the nodes as in order self._inorder.add(lchild) self._inorder.add(rchild) def diff(self, left=None, right=None): # Make sure the matching is done first, diff() needs the l2r/r2l maps. 
if not self._matches: self.match(left, right) # First, deal with namespaces: rnsmap = self.right.nsmap lnsmap = self.left.nsmap for k, v in rnsmap.items(): # Make sure it's registered: if k is not None and not utils.RESERVED_NS.match(k): etree.register_namespace(k, v) if k not in lnsmap: yield actions.InsertNamespace(k, v) elif lnsmap[k] != v: raise RuntimeError( "Sorry, we do not support changing the URI of namespaces in xmldiff" ) for k, v in lnsmap.items(): # Make sure it's registered: if k is not None and not utils.RESERVED_NS.match(k): etree.register_namespace(k, v) if k not in rnsmap: yield actions.DeleteNamespace(k) # The paper talks about the five phases, and then does four of them # in one phase, in a different order that described. This # implementation in turn differs in order yet again. ltree = self.left.getroottree() for rnode in utils.breadth_first_traverse(self.right): # (a) rparent = rnode.getparent() ltarget = self._r2lmap.get(id(rparent)) # (b) Insert if id(rnode) not in self._r2lmap: # (i) pos = self.find_pos(rnode) # (ii) if rnode.tag is etree.Comment: yield actions.InsertComment( utils.getpath(ltarget, ltree), pos, rnode.text ) lnode = etree.Comment(rnode.text) else: yield actions.InsertNode( utils.getpath(ltarget, ltree), rnode.tag, pos ) lnode = ltarget.makeelement(rnode.tag) # (iii) self.append_match(lnode, rnode, 1.0) ltarget.insert(pos, lnode) self._inorder.add(lnode) self._inorder.add(rnode) # And then we update attributes. This is different from the # paper, because the paper assumes nodes only has labels and # values. Nodes also has texts, we do them later. yield from self.update_node_attr(lnode, rnode) # (c) else: # Normally there is a check that rnode isn't a root, # but that's perhaps only because comparing valueless # roots is pointless, but in an elementtree we have no such # thing as a valueless root anyway. 
# (i) lnode = self._r2lmap[id(rnode)] # (iii) Move lparent = lnode.getparent() if ltarget is not lparent: pos = self.find_pos(rnode) yield actions.MoveNode( utils.getpath(lnode, ltree), utils.getpath(ltarget, ltree), pos ) # Move the node from current parent to target lparent.remove(lnode) ltarget.insert(pos, lnode) self._inorder.add(lnode) self._inorder.add(rnode) # Rename yield from self.update_node_tag(lnode, rnode) # (ii) Update # XXX If they are exactly equal, we can skip this, # maybe store match results in a cache? yield from self.update_node_attr(lnode, rnode) # (d) Align yield from self.align_children(lnode, rnode) # And lastly, we update all node texts. We do this after # aligning children, because when you generate an XML diff # from this, that XML diff update generates more children, # confusing later inserts or deletes. lnode = self._r2lmap[id(rnode)] yield from self.update_node_text(lnode, rnode) for lnode in utils.reverse_post_order_traverse(self.left): if id(lnode) not in self._l2rmap: # No match yield actions.DeleteNode(utils.getpath(lnode, ltree)) lnode.getparent().remove(lnode) xmldiff-2.6.3/xmldiff/diff_match_patch.py000066400000000000000000002350121443244161100204240ustar00rootroot00000000000000#!/usr/bin/python3 """Diff Match and Patch Copyright 2018 The diff-match-patch Authors. https://github.com/google/diff-match-patch Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ """Functions for diff, match and patch. Computes the difference between two texts to create a patch. 
Applies the patch onto another text, allowing for errors. """ __author__ = "fraser@google.com (Neil Fraser)" import re import sys import time import urllib.parse class diff_match_patch: """Class containing the diff, match and patch methods. Also contains the behaviour settings. """ def __init__(self): """Inits a diff_match_patch object with default settings. Redefine these in your program to override the defaults. """ # Number of seconds to map a diff before giving up (0 for infinity). self.Diff_Timeout = 1.0 # Cost of an empty edit operation in terms of edit characters. self.Diff_EditCost = 4 # At what point is no match declared (0.0 = perfection, 1.0 = very loose). self.Match_Threshold = 0.5 # How far to search for a match (0 = exact location, 1000+ = broad match). # A match this many characters away from the expected location will add # 1.0 to the score (0.0 is a perfect match). self.Match_Distance = 1000 # When deleting a large block of text (over ~64 characters), how close do # the contents have to be to match the expected contents. (0.0 = perfection, # 1.0 = very loose). Note that Match_Threshold controls how closely the # end points of a delete need to match. self.Patch_DeleteThreshold = 0.5 # Chunk size for context length. self.Patch_Margin = 4 # The number of bits in an int. # Python has no maximum, thus to disable patch splitting set to 0. # However to avoid long patches in certain pathological cases, use 32. # Multiple short patches (using native ints) are much faster than long ones. self.Match_MaxBits = 32 # DIFF FUNCTIONS # The data structure representing a diff is an array of tuples: # [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")] # which means: delete "Hello", add "Goodbye" and keep " world." DIFF_DELETE = -1 DIFF_INSERT = 1 DIFF_EQUAL = 0 def diff_main(self, text1, text2, checklines=True, deadline=None): """Find the differences between two texts. 
Simplifies the problem by stripping any common prefix or suffix off the texts before diffing. Args: text1: Old string to be diffed. text2: New string to be diffed. checklines: Optional speedup flag. If present and false, then don't run a line-level diff first to identify the changed areas. Defaults to true, which does a faster, slightly less optimal diff. deadline: Optional time when the diff should be complete by. Used internally for recursive calls. Users should set DiffTimeout instead. Returns: Array of changes. """ # Set a deadline by which time the diff must be complete. if deadline == None: # Unlike in most languages, Python counts time in seconds. if self.Diff_Timeout <= 0: deadline = sys.maxsize else: deadline = time.time() + self.Diff_Timeout # Check for null inputs. if text1 == None or text2 == None: raise ValueError("Null inputs. (diff_main)") # Check for equality (speedup). if text1 == text2: if text1: return [(self.DIFF_EQUAL, text1)] return [] # Trim off common prefix (speedup). commonlength = self.diff_commonPrefix(text1, text2) commonprefix = text1[:commonlength] text1 = text1[commonlength:] text2 = text2[commonlength:] # Trim off common suffix (speedup). commonlength = self.diff_commonSuffix(text1, text2) if commonlength == 0: commonsuffix = "" else: commonsuffix = text1[-commonlength:] text1 = text1[:-commonlength] text2 = text2[:-commonlength] # Compute the diff on the middle block. diffs = self.diff_compute(text1, text2, checklines, deadline) # Restore the prefix and suffix. if commonprefix: diffs[:0] = [(self.DIFF_EQUAL, commonprefix)] if commonsuffix: diffs.append((self.DIFF_EQUAL, commonsuffix)) self.diff_cleanupMerge(diffs) return diffs def diff_compute(self, text1, text2, checklines, deadline): """Find the differences between two texts. Assumes that the texts do not have any common prefix or suffix. Args: text1: Old string to be diffed. text2: New string to be diffed. checklines: Speedup flag. 
If false, then don't run a line-level diff first to identify the changed areas. If true, then run a faster, slightly less optimal diff. deadline: Time when the diff should be complete by. Returns: Array of changes. """ if not text1: # Just add some text (speedup). return [(self.DIFF_INSERT, text2)] if not text2: # Just delete some text (speedup). return [(self.DIFF_DELETE, text1)] if len(text1) > len(text2): (longtext, shorttext) = (text1, text2) else: (shorttext, longtext) = (text1, text2) i = longtext.find(shorttext) if i != -1: # Shorter text is inside the longer text (speedup). diffs = [ (self.DIFF_INSERT, longtext[:i]), (self.DIFF_EQUAL, shorttext), (self.DIFF_INSERT, longtext[i + len(shorttext) :]), ] # Swap insertions for deletions if diff is reversed. if len(text1) > len(text2): diffs[0] = (self.DIFF_DELETE, diffs[0][1]) diffs[2] = (self.DIFF_DELETE, diffs[2][1]) return diffs if len(shorttext) == 1: # Single character string. # After the previous speedup, the character can't be an equality. return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)] # Check to see if the problem can be split in two. hm = self.diff_halfMatch(text1, text2) if hm: # A half-match was found, sort out the return data. (text1_a, text1_b, text2_a, text2_b, mid_common) = hm # Send both pairs off for separate processing. diffs_a = self.diff_main(text1_a, text2_a, checklines, deadline) diffs_b = self.diff_main(text1_b, text2_b, checklines, deadline) # Merge the results. return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b if checklines and len(text1) > 100 and len(text2) > 100: return self.diff_lineMode(text1, text2, deadline) return self.diff_bisect(text1, text2, deadline) def diff_lineMode(self, text1, text2, deadline): """Do a quick line-level diff on both strings, then rediff the parts for greater accuracy. This speedup can produce non-minimal diffs. Args: text1: Old string to be diffed. text2: New string to be diffed. deadline: Time when the diff should be complete by. 
Returns: Array of changes. """ # Scan the text on a line-by-line basis first. (text1, text2, linearray) = self.diff_linesToChars(text1, text2) diffs = self.diff_main(text1, text2, False, deadline) # Convert the diff back to original text. self.diff_charsToLines(diffs, linearray) # Eliminate freak matches (e.g. blank lines) self.diff_cleanupSemantic(diffs) # Rediff any replacement blocks, this time character-by-character. # Add a dummy entry at the end. diffs.append((self.DIFF_EQUAL, "")) pointer = 0 count_delete = 0 count_insert = 0 text_delete = "" text_insert = "" while pointer < len(diffs): if diffs[pointer][0] == self.DIFF_INSERT: count_insert += 1 text_insert += diffs[pointer][1] elif diffs[pointer][0] == self.DIFF_DELETE: count_delete += 1 text_delete += diffs[pointer][1] elif diffs[pointer][0] == self.DIFF_EQUAL: # Upon reaching an equality, check for prior redundancies. if count_delete >= 1 and count_insert >= 1: # Delete the offending records and add the merged ones. subDiff = self.diff_main(text_delete, text_insert, False, deadline) diffs[pointer - count_delete - count_insert : pointer] = subDiff pointer = pointer - count_delete - count_insert + len(subDiff) count_insert = 0 count_delete = 0 text_delete = "" text_insert = "" pointer += 1 diffs.pop() # Remove the dummy entry at the end. return diffs def diff_bisect(self, text1, text2, deadline): """Find the 'middle snake' of a diff, split the problem in two and return the recursively constructed diff. See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations. Args: text1: Old string to be diffed. text2: New string to be diffed. deadline: Time at which to bail if not yet complete. Returns: Array of diff tuples. """ # Cache the text lengths to prevent multiple calls. 
text1_length = len(text1) text2_length = len(text2) max_d = (text1_length + text2_length + 1) // 2 v_offset = max_d v_length = 2 * max_d v1 = [-1] * v_length v1[v_offset + 1] = 0 v2 = v1[:] delta = text1_length - text2_length # If the total number of characters is odd, then the front path will # collide with the reverse path. front = delta % 2 != 0 # Offsets for start and end of k loop. # Prevents mapping of space beyond the grid. k1start = 0 k1end = 0 k2start = 0 k2end = 0 for d in range(max_d): # Bail out if deadline is reached. if time.time() > deadline: break # Walk the front path one step. for k1 in range(-d + k1start, d + 1 - k1end, 2): k1_offset = v_offset + k1 if k1 == -d or (k1 != d and v1[k1_offset - 1] < v1[k1_offset + 1]): x1 = v1[k1_offset + 1] else: x1 = v1[k1_offset - 1] + 1 y1 = x1 - k1 while ( x1 < text1_length and y1 < text2_length and text1[x1] == text2[y1] ): x1 += 1 y1 += 1 v1[k1_offset] = x1 if x1 > text1_length: # Ran off the right of the graph. k1end += 2 elif y1 > text2_length: # Ran off the bottom of the graph. k1start += 2 elif front: k2_offset = v_offset + delta - k1 if k2_offset >= 0 and k2_offset < v_length and v2[k2_offset] != -1: # Mirror x2 onto top-left coordinate system. x2 = text1_length - v2[k2_offset] if x1 >= x2: # Overlap detected. return self.diff_bisectSplit(text1, text2, x1, y1, deadline) # Walk the reverse path one step. for k2 in range(-d + k2start, d + 1 - k2end, 2): k2_offset = v_offset + k2 if k2 == -d or (k2 != d and v2[k2_offset - 1] < v2[k2_offset + 1]): x2 = v2[k2_offset + 1] else: x2 = v2[k2_offset - 1] + 1 y2 = x2 - k2 while ( x2 < text1_length and y2 < text2_length and text1[-x2 - 1] == text2[-y2 - 1] ): x2 += 1 y2 += 1 v2[k2_offset] = x2 if x2 > text1_length: # Ran off the left of the graph. k2end += 2 elif y2 > text2_length: # Ran off the top of the graph. 
k2start += 2 elif not front: k1_offset = v_offset + delta - k2 if k1_offset >= 0 and k1_offset < v_length and v1[k1_offset] != -1: x1 = v1[k1_offset] y1 = v_offset + x1 - k1_offset # Mirror x2 onto top-left coordinate system. x2 = text1_length - x2 if x1 >= x2: # Overlap detected. return self.diff_bisectSplit(text1, text2, x1, y1, deadline) # Diff took too long and hit the deadline or # number of diffs equals number of characters, no commonality at all. return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)] def diff_bisectSplit(self, text1, text2, x, y, deadline): """Given the location of the 'middle snake', split the diff in two parts and recurse. Args: text1: Old string to be diffed. text2: New string to be diffed. x: Index of split point in text1. y: Index of split point in text2. deadline: Time at which to bail if not yet complete. Returns: Array of diff tuples. """ text1a = text1[:x] text2a = text2[:y] text1b = text1[x:] text2b = text2[y:] # Compute both diffs serially. diffs = self.diff_main(text1a, text2a, False, deadline) diffsb = self.diff_main(text1b, text2b, False, deadline) return diffs + diffsb def diff_linesToChars(self, text1, text2): """Split two texts into an array of strings. Reduce the texts to a string of hashes where each Unicode character represents one line. Args: text1: First string. text2: Second string. Returns: Three element tuple, containing the encoded text1, the encoded text2 and the array of unique strings. The zeroth element of the array of unique strings is intentionally blank. """ lineArray = [] # e.g. lineArray[4] == "Hello\n" lineHash = {} # e.g. lineHash["Hello\n"] == 4 # "\x00" is a valid character, but various debuggers don't like it. # So we'll insert a junk entry to avoid generating a null character. lineArray.append("") def diff_linesToCharsMunge(text): """Split a text into an array of strings. Reduce the texts to a string of hashes where each Unicode character represents one line. 
Modifies linearray and linehash through being a closure. Args: text: String to encode. Returns: Encoded string. """ chars = [] # Walk the text, pulling out a substring for each line. # text.split('\n') would would temporarily double our memory footprint. # Modifying text would create many large strings to garbage collect. lineStart = 0 lineEnd = -1 while lineEnd < len(text) - 1: lineEnd = text.find("\n", lineStart) if lineEnd == -1: lineEnd = len(text) - 1 line = text[lineStart : lineEnd + 1] if line in lineHash: chars.append(chr(lineHash[line])) else: if len(lineArray) == maxLines: # Bail out at 1114111 because chr(1114112) throws. line = text[lineStart:] lineEnd = len(text) lineArray.append(line) lineHash[line] = len(lineArray) - 1 chars.append(chr(len(lineArray) - 1)) lineStart = lineEnd + 1 return "".join(chars) # Allocate 2/3rds of the space for text1, the rest for text2. maxLines = 666666 chars1 = diff_linesToCharsMunge(text1) maxLines = 1114111 chars2 = diff_linesToCharsMunge(text2) return (chars1, chars2, lineArray) def diff_charsToLines(self, diffs, lineArray): """Rehydrate the text in a diff from a string of line hashes to real lines of text. Args: diffs: Array of diff tuples. lineArray: Array of unique strings. """ for i in range(len(diffs)): text = [] for char in diffs[i][1]: text.append(lineArray[ord(char)]) diffs[i] = (diffs[i][0], "".join(text)) def diff_commonPrefix(self, text1, text2): """Determine the common prefix of two strings. Args: text1: First string. text2: Second string. Returns: The number of characters common to the start of each string. """ # Quick check for common null cases. if not text1 or not text2 or text1[0] != text2[0]: return 0 # Binary search. 
# Performance analysis: https://neil.fraser.name/news/2007/10/09/ pointermin = 0 pointermax = min(len(text1), len(text2)) pointermid = pointermax pointerstart = 0 while pointermin < pointermid: if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]: pointermin = pointermid pointerstart = pointermin else: pointermax = pointermid pointermid = (pointermax - pointermin) // 2 + pointermin return pointermid def diff_commonSuffix(self, text1, text2): """Determine the common suffix of two strings. Args: text1: First string. text2: Second string. Returns: The number of characters common to the end of each string. """ # Quick check for common null cases. if not text1 or not text2 or text1[-1] != text2[-1]: return 0 # Binary search. # Performance analysis: https://neil.fraser.name/news/2007/10/09/ pointermin = 0 pointermax = min(len(text1), len(text2)) pointermid = pointermax pointerend = 0 while pointermin < pointermid: if ( text1[-pointermid : len(text1) - pointerend] == text2[-pointermid : len(text2) - pointerend] ): pointermin = pointermid pointerend = pointermin else: pointermax = pointermid pointermid = (pointermax - pointermin) // 2 + pointermin return pointermid def diff_commonOverlap(self, text1, text2): """Determine if the suffix of one string is the prefix of another. Args: text1 First string. text2 Second string. Returns: The number of characters common to the end of the first string and the start of the second string. """ # Cache the text lengths to prevent multiple calls. text1_length = len(text1) text2_length = len(text2) # Eliminate the null case. if text1_length == 0 or text2_length == 0: return 0 # Truncate the longer string. if text1_length > text2_length: text1 = text1[-text2_length:] elif text1_length < text2_length: text2 = text2[:text1_length] text_length = min(text1_length, text2_length) # Quick check for the worst case. 
if text1 == text2: return text_length # Start by looking for a single character match # and increase length until no match is found. # Performance analysis: https://neil.fraser.name/news/2010/11/04/ best = 0 length = 1 while True: pattern = text1[-length:] found = text2.find(pattern) if found == -1: return best length += found if found == 0 or text1[-length:] == text2[:length]: best = length length += 1 def diff_halfMatch(self, text1, text2): """Do the two texts share a substring which is at least half the length of the longer text? This speedup can produce non-minimal diffs. Args: text1: First string. text2: Second string. Returns: Five element Array, containing the prefix of text1, the suffix of text1, the prefix of text2, the suffix of text2 and the common middle. Or None if there was no match. """ if self.Diff_Timeout <= 0: # Don't risk returning a non-optimal diff if we have unlimited time. return None if len(text1) > len(text2): (longtext, shorttext) = (text1, text2) else: (shorttext, longtext) = (text1, text2) if len(longtext) < 4 or len(shorttext) * 2 < len(longtext): return None # Pointless. def diff_halfMatchI(longtext, shorttext, i): """Does a substring of shorttext exist within longtext such that the substring is at least half the length of longtext? Closure, but does not reference any external variables. Args: longtext: Longer string. shorttext: Shorter string. i: Start index of quarter length substring within longtext. Returns: Five element Array, containing the prefix of longtext, the suffix of longtext, the prefix of shorttext, the suffix of shorttext and the common middle. Or None if there was no match. 
""" seed = longtext[i : i + len(longtext) // 4] best_common = "" j = shorttext.find(seed) while j != -1: prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:]) suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j]) if len(best_common) < suffixLength + prefixLength: best_common = ( shorttext[j - suffixLength : j] + shorttext[j : j + prefixLength] ) best_longtext_a = longtext[: i - suffixLength] best_longtext_b = longtext[i + prefixLength :] best_shorttext_a = shorttext[: j - suffixLength] best_shorttext_b = shorttext[j + prefixLength :] j = shorttext.find(seed, j + 1) if len(best_common) * 2 >= len(longtext): return ( best_longtext_a, best_longtext_b, best_shorttext_a, best_shorttext_b, best_common, ) else: return None # First check if the second quarter is the seed for a half-match. hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) // 4) # Check again based on the third quarter. hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) // 2) if not hm1 and not hm2: return None elif not hm2: hm = hm1 elif not hm1: hm = hm2 else: # Both matched. Select the longest. if len(hm1[4]) > len(hm2[4]): hm = hm1 else: hm = hm2 # A half-match was found, sort out the return data. if len(text1) > len(text2): (text1_a, text1_b, text2_a, text2_b, mid_common) = hm else: (text2_a, text2_b, text1_a, text1_b, mid_common) = hm return (text1_a, text1_b, text2_a, text2_b, mid_common) def diff_cleanupSemantic(self, diffs): """Reduce the number of edits by eliminating semantically trivial equalities. Args: diffs: Array of diff tuples. """ changes = False equalities = [] # Stack of indices where equalities are found. lastEquality = None # Always equal to diffs[equalities[-1]][1] pointer = 0 # Index of current position. # Number of chars that changed prior to the equality. length_insertions1, length_deletions1 = 0, 0 # Number of chars that changed after the equality. 
length_insertions2, length_deletions2 = 0, 0 while pointer < len(diffs): if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found. equalities.append(pointer) length_insertions1, length_insertions2 = length_insertions2, 0 length_deletions1, length_deletions2 = length_deletions2, 0 lastEquality = diffs[pointer][1] else: # An insertion or deletion. if diffs[pointer][0] == self.DIFF_INSERT: length_insertions2 += len(diffs[pointer][1]) else: length_deletions2 += len(diffs[pointer][1]) # Eliminate an equality that is smaller or equal to the edits on both # sides of it. if ( lastEquality and ( len(lastEquality) <= max(length_insertions1, length_deletions1) ) and ( len(lastEquality) <= max(length_insertions2, length_deletions2) ) ): # Duplicate record. diffs.insert(equalities[-1], (self.DIFF_DELETE, lastEquality)) # Change second copy to insert. diffs[equalities[-1] + 1] = ( self.DIFF_INSERT, diffs[equalities[-1] + 1][1], ) # Throw away the equality we just deleted. equalities.pop() # Throw away the previous equality (it needs to be reevaluated). if len(equalities): equalities.pop() if len(equalities): pointer = equalities[-1] else: pointer = -1 # Reset the counters. length_insertions1, length_deletions1 = 0, 0 length_insertions2, length_deletions2 = 0, 0 lastEquality = None changes = True pointer += 1 # Normalize the diff. if changes: self.diff_cleanupMerge(diffs) self.diff_cleanupSemanticLossless(diffs) # Find any overlaps between deletions and insertions. # e.g: abcxxxxxxdef # -> abcxxxdef # e.g: xxxabcdefxxx # -> defxxxabc # Only extract an overlap if it is as big as the edit ahead or behind it. 
pointer = 1 while pointer < len(diffs): if ( diffs[pointer - 1][0] == self.DIFF_DELETE and diffs[pointer][0] == self.DIFF_INSERT ): deletion = diffs[pointer - 1][1] insertion = diffs[pointer][1] overlap_length1 = self.diff_commonOverlap(deletion, insertion) overlap_length2 = self.diff_commonOverlap(insertion, deletion) if overlap_length1 >= overlap_length2: if ( overlap_length1 >= len(deletion) / 2.0 or overlap_length1 >= len(insertion) / 2.0 ): # Overlap found. Insert an equality and trim the surrounding edits. diffs.insert( pointer, (self.DIFF_EQUAL, insertion[:overlap_length1]) ) diffs[pointer - 1] = ( self.DIFF_DELETE, deletion[: len(deletion) - overlap_length1], ) diffs[pointer + 1] = ( self.DIFF_INSERT, insertion[overlap_length1:], ) pointer += 1 else: if ( overlap_length2 >= len(deletion) / 2.0 or overlap_length2 >= len(insertion) / 2.0 ): # Reverse overlap found. # Insert an equality and swap and trim the surrounding edits. diffs.insert( pointer, (self.DIFF_EQUAL, deletion[:overlap_length2]) ) diffs[pointer - 1] = ( self.DIFF_INSERT, insertion[: len(insertion) - overlap_length2], ) diffs[pointer + 1] = ( self.DIFF_DELETE, deletion[overlap_length2:], ) pointer += 1 pointer += 1 pointer += 1 def diff_cleanupSemanticLossless(self, diffs): """Look for single edits surrounded on both sides by equalities which can be shifted sideways to align the edit to a word boundary. e.g: The cat came. -> The cat came. Args: diffs: Array of diff tuples. """ def diff_cleanupSemanticScore(one, two): """Given two strings, compute a score representing whether the internal boundary falls on logical boundaries. Scores range from 6 (best) to 0 (worst). Closure, but does not reference any external variables. Args: one: First string. two: Second string. Returns: The score. """ if not one or not two: # Edges are the best. return 6 # Each port of this function behaves slightly differently due to # subtle differences in each language's definition of things like # 'whitespace'. 
Since this function's purpose is largely cosmetic, # the choice has been made to use each language's native features # rather than force total conformity. char1 = one[-1] char2 = two[0] nonAlphaNumeric1 = not char1.isalnum() nonAlphaNumeric2 = not char2.isalnum() whitespace1 = nonAlphaNumeric1 and char1.isspace() whitespace2 = nonAlphaNumeric2 and char2.isspace() lineBreak1 = whitespace1 and (char1 == "\r" or char1 == "\n") lineBreak2 = whitespace2 and (char2 == "\r" or char2 == "\n") blankLine1 = lineBreak1 and self.BLANKLINEEND.search(one) blankLine2 = lineBreak2 and self.BLANKLINESTART.match(two) if blankLine1 or blankLine2: # Five points for blank lines. return 5 elif lineBreak1 or lineBreak2: # Four points for line breaks. return 4 elif nonAlphaNumeric1 and not whitespace1 and whitespace2: # Three points for end of sentences. return 3 elif whitespace1 or whitespace2: # Two points for whitespace. return 2 elif nonAlphaNumeric1 or nonAlphaNumeric2: # One point for non-alphanumeric. return 1 return 0 pointer = 1 # Intentionally ignore the first and last element (don't need checking). while pointer < len(diffs) - 1: if ( diffs[pointer - 1][0] == self.DIFF_EQUAL and diffs[pointer + 1][0] == self.DIFF_EQUAL ): # This is a single edit surrounded by equalities. equality1 = diffs[pointer - 1][1] edit = diffs[pointer][1] equality2 = diffs[pointer + 1][1] # First, shift the edit as far left as possible. commonOffset = self.diff_commonSuffix(equality1, edit) if commonOffset: commonString = edit[-commonOffset:] equality1 = equality1[:-commonOffset] edit = commonString + edit[:-commonOffset] equality2 = commonString + equality2 # Second, step character by character right, looking for the best fit. 
bestEquality1 = equality1 bestEdit = edit bestEquality2 = equality2 bestScore = diff_cleanupSemanticScore( equality1, edit ) + diff_cleanupSemanticScore(edit, equality2) while edit and equality2 and edit[0] == equality2[0]: equality1 += edit[0] edit = edit[1:] + equality2[0] equality2 = equality2[1:] score = diff_cleanupSemanticScore( equality1, edit ) + diff_cleanupSemanticScore(edit, equality2) # The >= encourages trailing rather than leading whitespace on edits. if score >= bestScore: bestScore = score bestEquality1 = equality1 bestEdit = edit bestEquality2 = equality2 if diffs[pointer - 1][1] != bestEquality1: # We have an improvement, save it back to the diff. if bestEquality1: diffs[pointer - 1] = (diffs[pointer - 1][0], bestEquality1) else: del diffs[pointer - 1] pointer -= 1 diffs[pointer] = (diffs[pointer][0], bestEdit) if bestEquality2: diffs[pointer + 1] = (diffs[pointer + 1][0], bestEquality2) else: del diffs[pointer + 1] pointer -= 1 pointer += 1 # Define some regex patterns for matching boundaries. BLANKLINEEND = re.compile(r"\n\r?\n$") BLANKLINESTART = re.compile(r"^\r?\n\r?\n") def diff_cleanupEfficiency(self, diffs): """Reduce the number of edits by eliminating operationally trivial equalities. Args: diffs: Array of diff tuples. """ changes = False equalities = [] # Stack of indices where equalities are found. lastEquality = None # Always equal to diffs[equalities[-1]][1] pointer = 0 # Index of current position. pre_ins = False # Is there an insertion operation before the last equality. pre_del = False # Is there a deletion operation before the last equality. post_ins = False # Is there an insertion operation after the last equality. post_del = False # Is there a deletion operation after the last equality. while pointer < len(diffs): if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found. if len(diffs[pointer][1]) < self.Diff_EditCost and ( post_ins or post_del ): # Candidate found. 
equalities.append(pointer) pre_ins = post_ins pre_del = post_del lastEquality = diffs[pointer][1] else: # Not a candidate, and can never become one. equalities = [] lastEquality = None post_ins = post_del = False else: # An insertion or deletion. if diffs[pointer][0] == self.DIFF_DELETE: post_del = True else: post_ins = True # Five types to be split: # ABXYCD # AXCD # ABXC # A
XCD # ABXC if lastEquality and ( (pre_ins and pre_del and post_ins and post_del) or ( (len(lastEquality) < self.Diff_EditCost / 2) and (pre_ins + pre_del + post_ins + post_del) == 3 ) ): # Duplicate record. diffs.insert(equalities[-1], (self.DIFF_DELETE, lastEquality)) # Change second copy to insert. diffs[equalities[-1] + 1] = ( self.DIFF_INSERT, diffs[equalities[-1] + 1][1], ) equalities.pop() # Throw away the equality we just deleted. lastEquality = None if pre_ins and pre_del: # No changes made which could affect previous entry, keep going. post_ins = post_del = True equalities = [] else: if len(equalities): equalities.pop() # Throw away the previous equality. if len(equalities): pointer = equalities[-1] else: pointer = -1 post_ins = post_del = False changes = True pointer += 1 if changes: self.diff_cleanupMerge(diffs) def diff_cleanupMerge(self, diffs): """Reorder and merge like edit sections. Merge equalities. Any edit section can move as long as it doesn't cross an equality. Args: diffs: Array of diff tuples. """ diffs.append((self.DIFF_EQUAL, "")) # Add a dummy entry at the end. pointer = 0 count_delete = 0 count_insert = 0 text_delete = "" text_insert = "" while pointer < len(diffs): if diffs[pointer][0] == self.DIFF_INSERT: count_insert += 1 text_insert += diffs[pointer][1] pointer += 1 elif diffs[pointer][0] == self.DIFF_DELETE: count_delete += 1 text_delete += diffs[pointer][1] pointer += 1 elif diffs[pointer][0] == self.DIFF_EQUAL: # Upon reaching an equality, check for prior redundancies. if count_delete + count_insert > 1: if count_delete != 0 and count_insert != 0: # Factor out any common prefixies. 
commonlength = self.diff_commonPrefix(text_insert, text_delete) if commonlength != 0: x = pointer - count_delete - count_insert - 1 if x >= 0 and diffs[x][0] == self.DIFF_EQUAL: diffs[x] = ( diffs[x][0], diffs[x][1] + text_insert[:commonlength], ) else: diffs.insert( 0, (self.DIFF_EQUAL, text_insert[:commonlength]) ) pointer += 1 text_insert = text_insert[commonlength:] text_delete = text_delete[commonlength:] # Factor out any common suffixies. commonlength = self.diff_commonSuffix(text_insert, text_delete) if commonlength != 0: diffs[pointer] = ( diffs[pointer][0], text_insert[-commonlength:] + diffs[pointer][1], ) text_insert = text_insert[:-commonlength] text_delete = text_delete[:-commonlength] # Delete the offending records and add the merged ones. new_ops = [] if len(text_delete) != 0: new_ops.append((self.DIFF_DELETE, text_delete)) if len(text_insert) != 0: new_ops.append((self.DIFF_INSERT, text_insert)) pointer -= count_delete + count_insert diffs[pointer : pointer + count_delete + count_insert] = new_ops pointer += len(new_ops) + 1 elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL: # Merge this equality with the previous one. diffs[pointer - 1] = ( diffs[pointer - 1][0], diffs[pointer - 1][1] + diffs[pointer][1], ) del diffs[pointer] else: pointer += 1 count_insert = 0 count_delete = 0 text_delete = "" text_insert = "" if diffs[-1][1] == "": diffs.pop() # Remove the dummy entry at the end. # Second pass: look for single edits surrounded on both sides by equalities # which can be shifted sideways to eliminate an equality. # e.g: ABAC -> ABAC changes = False pointer = 1 # Intentionally ignore the first and last element (don't need checking). while pointer < len(diffs) - 1: if ( diffs[pointer - 1][0] == self.DIFF_EQUAL and diffs[pointer + 1][0] == self.DIFF_EQUAL ): # This is a single edit surrounded by equalities. if diffs[pointer][1].endswith(diffs[pointer - 1][1]): # Shift the edit over the previous equality. 
if diffs[pointer - 1][1] != "": diffs[pointer] = ( diffs[pointer][0], diffs[pointer - 1][1] + diffs[pointer][1][: -len(diffs[pointer - 1][1])], ) diffs[pointer + 1] = ( diffs[pointer + 1][0], diffs[pointer - 1][1] + diffs[pointer + 1][1], ) del diffs[pointer - 1] changes = True elif diffs[pointer][1].startswith(diffs[pointer + 1][1]): # Shift the edit over the next equality. diffs[pointer - 1] = ( diffs[pointer - 1][0], diffs[pointer - 1][1] + diffs[pointer + 1][1], ) diffs[pointer] = ( diffs[pointer][0], diffs[pointer][1][len(diffs[pointer + 1][1]) :] + diffs[pointer + 1][1], ) del diffs[pointer + 1] changes = True pointer += 1 # If shifts were made, the diff needs reordering and another shift sweep. if changes: self.diff_cleanupMerge(diffs) def diff_xIndex(self, diffs, loc): """loc is a location in text1, compute and return the equivalent location in text2. e.g. "The cat" vs "The big cat", 1->1, 5->8 Args: diffs: Array of diff tuples. loc: Location within text1. Returns: Location within text2. """ chars1 = 0 chars2 = 0 last_chars1 = 0 last_chars2 = 0 for x in range(len(diffs)): (op, text) = diffs[x] if op != self.DIFF_INSERT: # Equality or deletion. chars1 += len(text) if op != self.DIFF_DELETE: # Equality or insertion. chars2 += len(text) if chars1 > loc: # Overshot the location. break last_chars1 = chars1 last_chars2 = chars2 if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE: # The location was deleted. return last_chars2 # Add the remaining len(character). return last_chars2 + (loc - last_chars1) def diff_prettyHtml(self, diffs): """Convert a diff array into a pretty HTML report. Args: diffs: Array of diff tuples. Returns: HTML representation. """ html = [] for (op, data) in diffs: text = ( data.replace("&", "&") .replace("<", "<") .replace(">", ">") .replace("\n", "¶
") ) if op == self.DIFF_INSERT: html.append('%s' % text) elif op == self.DIFF_DELETE: html.append('%s' % text) elif op == self.DIFF_EQUAL: html.append("%s" % text) return "".join(html) def diff_text1(self, diffs): """Compute and return the source text (all equalities and deletions). Args: diffs: Array of diff tuples. Returns: Source text. """ text = [] for (op, data) in diffs: if op != self.DIFF_INSERT: text.append(data) return "".join(text) def diff_text2(self, diffs): """Compute and return the destination text (all equalities and insertions). Args: diffs: Array of diff tuples. Returns: Destination text. """ text = [] for (op, data) in diffs: if op != self.DIFF_DELETE: text.append(data) return "".join(text) def diff_levenshtein(self, diffs): """Compute the Levenshtein distance; the number of inserted, deleted or substituted characters. Args: diffs: Array of diff tuples. Returns: Number of changes. """ levenshtein = 0 insertions = 0 deletions = 0 for (op, data) in diffs: if op == self.DIFF_INSERT: insertions += len(data) elif op == self.DIFF_DELETE: deletions += len(data) elif op == self.DIFF_EQUAL: # A deletion and an insertion is one substitution. levenshtein += max(insertions, deletions) insertions = 0 deletions = 0 levenshtein += max(insertions, deletions) return levenshtein def diff_toDelta(self, diffs): """Crush the diff into an encoded string which describes the operations required to transform text1 into text2. E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'. Operations are tab-separated. Inserted text is escaped using %xx notation. Args: diffs: Array of diff tuples. Returns: Delta text. """ text = [] for (op, data) in diffs: if op == self.DIFF_INSERT: # High ascii will raise UnicodeDecodeError. Use Unicode instead. 
data = data.encode("utf-8") text.append("+" + urllib.parse.quote(data, "!~*'();/?:@&=+$,# ")) elif op == self.DIFF_DELETE: text.append("-%d" % len(data)) elif op == self.DIFF_EQUAL: text.append("=%d" % len(data)) return "\t".join(text) def diff_fromDelta(self, text1, delta): """Given the original text1, and an encoded string which describes the operations required to transform text1 into text2, compute the full diff. Args: text1: Source string for the diff. delta: Delta text. Returns: Array of diff tuples. Raises: ValueError: If invalid input. """ diffs = [] pointer = 0 # Cursor in text1 tokens = delta.split("\t") for token in tokens: if token == "": # Blank tokens are ok (from a trailing \t). continue # Each token begins with a one character parameter which specifies the # operation of this token (delete, insert, equality). param = token[1:] if token[0] == "+": param = urllib.parse.unquote(param) diffs.append((self.DIFF_INSERT, param)) elif token[0] == "-" or token[0] == "=": try: n = int(param) except ValueError: raise ValueError("Invalid number in diff_fromDelta: " + param) if n < 0: raise ValueError("Negative number in diff_fromDelta: " + param) text = text1[pointer : pointer + n] pointer += n if token[0] == "=": diffs.append((self.DIFF_EQUAL, text)) else: diffs.append((self.DIFF_DELETE, text)) else: # Anything else is an error. raise ValueError( "Invalid diff operation in diff_fromDelta: " + token[0] ) if pointer != len(text1): raise ValueError( "Delta length (%d) does not equal source text length (%d)." % (pointer, len(text1)) ) return diffs # MATCH FUNCTIONS def match_main(self, text, pattern, loc): """Locate the best instance of 'pattern' in 'text' near 'loc'. Args: text: The text to search. pattern: The pattern to search for. loc: The location to search around. Returns: Best match index or -1. """ # Check for null inputs. if text == None or pattern == None: raise ValueError("Null inputs. 
(match_main)") loc = max(0, min(loc, len(text))) if text == pattern: # Shortcut (potentially not guaranteed by the algorithm) return 0 elif not text: # Nothing to match. return -1 elif text[loc : loc + len(pattern)] == pattern: # Perfect match at the perfect spot! (Includes case of null pattern) return loc else: # Do a fuzzy compare. match = self.match_bitap(text, pattern, loc) return match def match_bitap(self, text, pattern, loc): """Locate the best instance of 'pattern' in 'text' near 'loc' using the Bitap algorithm. Args: text: The text to search. pattern: The pattern to search for. loc: The location to search around. Returns: Best match index or -1. """ # Python doesn't have a maxint limit, so ignore this check. # if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits: # raise ValueError("Pattern too long for this application.") # Initialise the alphabet. s = self.match_alphabet(pattern) def match_bitapScore(e, x): """Compute and return the score for a match with e errors and x location. Accesses loc and pattern through being a closure. Args: e: Number of errors in match. x: Location of match. Returns: Overall score for match (0.0 = good, 1.0 = bad). """ accuracy = float(e) / len(pattern) proximity = abs(loc - x) if not self.Match_Distance: # Dodge divide by zero error. return proximity and 1.0 or accuracy return accuracy + (proximity / float(self.Match_Distance)) # Highest score beyond which we give up. score_threshold = self.Match_Threshold # Is there a nearby exact match? (speedup) best_loc = text.find(pattern, loc) if best_loc != -1: score_threshold = min(match_bitapScore(0, best_loc), score_threshold) # What about in the other direction? (speedup) best_loc = text.rfind(pattern, loc + len(pattern)) if best_loc != -1: score_threshold = min(match_bitapScore(0, best_loc), score_threshold) # Initialise the bit arrays. 
matchmask = 1 << (len(pattern) - 1) best_loc = -1 bin_max = len(pattern) + len(text) # Empty initialization added to appease pychecker. last_rd = None for d in range(len(pattern)): # Scan for the best match each iteration allows for one more error. # Run a binary search to determine how far from 'loc' we can stray at # this error level. bin_min = 0 bin_mid = bin_max while bin_min < bin_mid: if match_bitapScore(d, loc + bin_mid) <= score_threshold: bin_min = bin_mid else: bin_max = bin_mid bin_mid = (bin_max - bin_min) // 2 + bin_min # Use the result from this iteration as the maximum for the next. bin_max = bin_mid start = max(1, loc - bin_mid + 1) finish = min(loc + bin_mid, len(text)) + len(pattern) rd = [0] * (finish + 2) rd[finish + 1] = (1 << d) - 1 for j in range(finish, start - 1, -1): if len(text) <= j - 1: # Out of range. charMatch = 0 else: charMatch = s.get(text[j - 1], 0) if d == 0: # First pass: exact match. rd[j] = ((rd[j + 1] << 1) | 1) & charMatch else: # Subsequent passes: fuzzy match. rd[j] = ( (((rd[j + 1] << 1) | 1) & charMatch) | (((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1] ) if rd[j] & matchmask: score = match_bitapScore(d, j - 1) # This match will almost certainly be better than any existing match. # But check anyway. if score <= score_threshold: # Told you so. score_threshold = score best_loc = j - 1 if best_loc > loc: # When passing loc, don't exceed our current distance from loc. start = max(1, 2 * loc - best_loc) else: # Already passed loc, downhill from here on in. break # No hope for a (better) match at greater error levels. if match_bitapScore(d + 1, loc) > score_threshold: break last_rd = rd return best_loc def match_alphabet(self, pattern): """Initialise the alphabet for the Bitap algorithm. Args: pattern: The text to encode. Returns: Hash of character locations. 
""" s = {} for char in pattern: s[char] = 0 for i in range(len(pattern)): s[pattern[i]] |= 1 << (len(pattern) - i - 1) return s # PATCH FUNCTIONS def patch_addContext(self, patch, text): """Increase the context until it is unique, but don't let the pattern expand beyond Match_MaxBits. Args: patch: The patch to grow. text: Source text. """ if len(text) == 0: return pattern = text[patch.start2 : patch.start2 + patch.length1] padding = 0 # Look for the first and last matches of pattern in text. If two different # matches are found, increase the pattern length. while text.find(pattern) != text.rfind(pattern) and ( self.Match_MaxBits == 0 or len(pattern) < self.Match_MaxBits - self.Patch_Margin - self.Patch_Margin ): padding += self.Patch_Margin pattern = text[ max(0, patch.start2 - padding) : patch.start2 + patch.length1 + padding ] # Add one chunk for good luck. padding += self.Patch_Margin # Add the prefix. prefix = text[max(0, patch.start2 - padding) : patch.start2] if prefix: patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)] # Add the suffix. suffix = text[ patch.start2 + patch.length1 : patch.start2 + patch.length1 + padding ] if suffix: patch.diffs.append((self.DIFF_EQUAL, suffix)) # Roll back the start points. patch.start1 -= len(prefix) patch.start2 -= len(prefix) # Extend lengths. patch.length1 += len(prefix) + len(suffix) patch.length2 += len(prefix) + len(suffix) def patch_make(self, a, b=None, c=None): """Compute a list of patches to turn text1 into text2. Use diffs if provided, otherwise compute it ourselves. There are four ways to call this function, depending on what data is available to the caller: Method 1: a = text1, b = text2 Method 2: a = diffs Method 3 (optimal): a = text1, b = diffs Method 4 (deprecated, use method 3): a = text1, b = text2, c = diffs Args: a: text1 (methods 1,3,4) or Array of diff tuples for text1 to text2 (method 2). b: text2 (methods 1,4) or Array of diff tuples for text1 to text2 (method 3) or undefined (method 2). 
c: Array of diff tuples for text1 to text2 (method 4) or undefined (methods 1,2,3). Returns: Array of Patch objects. """ text1 = None diffs = None if isinstance(a, str) and isinstance(b, str) and c is None: # Method 1: text1, text2 # Compute diffs from text1 and text2. text1 = a diffs = self.diff_main(text1, b, True) if len(diffs) > 2: self.diff_cleanupSemantic(diffs) self.diff_cleanupEfficiency(diffs) elif isinstance(a, list) and b is None and c is None: # Method 2: diffs # Compute text1 from diffs. diffs = a text1 = self.diff_text1(diffs) elif isinstance(a, str) and isinstance(b, list) and c is None: # Method 3: text1, diffs text1 = a diffs = b elif isinstance(a, str) and isinstance(b, str) and isinstance(c, list): # Method 4: text1, text2, diffs # text2 is not used. text1 = a diffs = c else: raise ValueError("Unknown call format to patch_make.") if not diffs: return [] # Get rid of the None case. patches = [] patch = patch_obj() char_count1 = 0 # Number of characters into the text1 string. char_count2 = 0 # Number of characters into the text2 string. prepatch_text = text1 # Recreate the patches to determine context info. postpatch_text = text1 for x in range(len(diffs)): (diff_type, diff_text) = diffs[x] if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL: # A new patch starts here. patch.start1 = char_count1 patch.start2 = char_count2 if diff_type == self.DIFF_INSERT: # Insertion patch.diffs.append(diffs[x]) patch.length2 += len(diff_text) postpatch_text = ( postpatch_text[:char_count2] + diff_text + postpatch_text[char_count2:] ) elif diff_type == self.DIFF_DELETE: # Deletion. patch.length1 += len(diff_text) patch.diffs.append(diffs[x]) postpatch_text = ( postpatch_text[:char_count2] + postpatch_text[char_count2 + len(diff_text) :] ) elif ( diff_type == self.DIFF_EQUAL and len(diff_text) <= 2 * self.Patch_Margin and len(patch.diffs) != 0 and len(diffs) != x + 1 ): # Small equality inside a patch. 
patch.diffs.append(diffs[x]) patch.length1 += len(diff_text) patch.length2 += len(diff_text) if diff_type == self.DIFF_EQUAL and len(diff_text) >= 2 * self.Patch_Margin: # Time for a new patch. if len(patch.diffs) != 0: self.patch_addContext(patch, prepatch_text) patches.append(patch) patch = patch_obj() # Unlike Unidiff, our patch lists have a rolling context. # https://github.com/google/diff-match-patch/wiki/Unidiff # Update prepatch text & pos to reflect the application of the # just completed patch. prepatch_text = postpatch_text char_count1 = char_count2 # Update the current character count. if diff_type != self.DIFF_INSERT: char_count1 += len(diff_text) if diff_type != self.DIFF_DELETE: char_count2 += len(diff_text) # Pick up the leftover patch if not empty. if len(patch.diffs) != 0: self.patch_addContext(patch, prepatch_text) patches.append(patch) return patches def patch_deepCopy(self, patches): """Given an array of patches, return another array that is identical. Args: patches: Array of Patch objects. Returns: Array of Patch objects. """ patchesCopy = [] for patch in patches: patchCopy = patch_obj() # No need to deep copy the tuples since they are immutable. patchCopy.diffs = patch.diffs[:] patchCopy.start1 = patch.start1 patchCopy.start2 = patch.start2 patchCopy.length1 = patch.length1 patchCopy.length2 = patch.length2 patchesCopy.append(patchCopy) return patchesCopy def patch_apply(self, patches, text): """Merge a set of patches onto the text. Return a patched text, as well as a list of true/false values indicating which patches were applied. Args: patches: Array of Patch objects. text: Old text. Returns: Two element Array, containing the new text and an array of boolean values. """ if not patches: return (text, []) # Deep copy the patches so that no changes are made to originals. 
patches = self.patch_deepCopy(patches) nullPadding = self.patch_addPadding(patches) text = nullPadding + text + nullPadding self.patch_splitMax(patches) # delta keeps track of the offset between the expected and actual location # of the previous patch. If there are patches expected at positions 10 and # 20, but the first patch was found at 12, delta is 2 and the second patch # has an effective expected position of 22. delta = 0 results = [] for patch in patches: expected_loc = patch.start2 + delta text1 = self.diff_text1(patch.diffs) end_loc = -1 if len(text1) > self.Match_MaxBits: # patch_splitMax will only provide an oversized pattern in the case of # a monster delete. start_loc = self.match_main( text, text1[: self.Match_MaxBits], expected_loc ) if start_loc != -1: end_loc = self.match_main( text, text1[-self.Match_MaxBits :], expected_loc + len(text1) - self.Match_MaxBits, ) if end_loc == -1 or start_loc >= end_loc: # Can't find valid trailing context. Drop this patch. start_loc = -1 else: start_loc = self.match_main(text, text1, expected_loc) if start_loc == -1: # No match found. :( results.append(False) # Subtract the delta for this failed patch from subsequent patches. delta -= patch.length2 - patch.length1 else: # Found a match. :) results.append(True) delta = start_loc - expected_loc if end_loc == -1: text2 = text[start_loc : start_loc + len(text1)] else: text2 = text[start_loc : end_loc + self.Match_MaxBits] if text1 == text2: # Perfect match, just shove the replacement text in. text = ( text[:start_loc] + self.diff_text2(patch.diffs) + text[start_loc + len(text1) :] ) else: # Imperfect match. # Run a diff to get a framework of equivalent indices. diffs = self.diff_main(text1, text2, False) if ( len(text1) > self.Match_MaxBits and self.diff_levenshtein(diffs) / float(len(text1)) > self.Patch_DeleteThreshold ): # The end points match, but the content is unacceptably bad. 
results[-1] = False else: self.diff_cleanupSemanticLossless(diffs) index1 = 0 for (op, data) in patch.diffs: if op != self.DIFF_EQUAL: index2 = self.diff_xIndex(diffs, index1) if op == self.DIFF_INSERT: # Insertion text = ( text[: start_loc + index2] + data + text[start_loc + index2 :] ) elif op == self.DIFF_DELETE: # Deletion text = ( text[: start_loc + index2] + text[ start_loc + self.diff_xIndex(diffs, index1 + len(data)) : ] ) if op != self.DIFF_DELETE: index1 += len(data) # Strip the padding off. text = text[len(nullPadding) : -len(nullPadding)] return (text, results) def patch_addPadding(self, patches): """Add some padding on text start and end so that edges can match something. Intended to be called only from within patch_apply. Args: patches: Array of Patch objects. Returns: The padding string added to each side. """ paddingLength = self.Patch_Margin nullPadding = "" for x in range(1, paddingLength + 1): nullPadding += chr(x) # Bump all the patches forward. for patch in patches: patch.start1 += paddingLength patch.start2 += paddingLength # Add some padding on start of first diff. patch = patches[0] diffs = patch.diffs if not diffs or diffs[0][0] != self.DIFF_EQUAL: # Add nullPadding equality. diffs.insert(0, (self.DIFF_EQUAL, nullPadding)) patch.start1 -= paddingLength # Should be 0. patch.start2 -= paddingLength # Should be 0. patch.length1 += paddingLength patch.length2 += paddingLength elif paddingLength > len(diffs[0][1]): # Grow first equality. extraLength = paddingLength - len(diffs[0][1]) newText = nullPadding[len(diffs[0][1]) :] + diffs[0][1] diffs[0] = (diffs[0][0], newText) patch.start1 -= extraLength patch.start2 -= extraLength patch.length1 += extraLength patch.length2 += extraLength # Add some padding on end of last diff. patch = patches[-1] diffs = patch.diffs if not diffs or diffs[-1][0] != self.DIFF_EQUAL: # Add nullPadding equality. 
diffs.append((self.DIFF_EQUAL, nullPadding)) patch.length1 += paddingLength patch.length2 += paddingLength elif paddingLength > len(diffs[-1][1]): # Grow last equality. extraLength = paddingLength - len(diffs[-1][1]) newText = diffs[-1][1] + nullPadding[:extraLength] diffs[-1] = (diffs[-1][0], newText) patch.length1 += extraLength patch.length2 += extraLength return nullPadding def patch_splitMax(self, patches): """Look through the patches and break up any which are longer than the maximum limit of the match algorithm. Intended to be called only from within patch_apply. Args: patches: Array of Patch objects. """ patch_size = self.Match_MaxBits if patch_size == 0: # Python has the option of not splitting strings due to its ability # to handle integers of arbitrary precision. return for x in range(len(patches)): if patches[x].length1 <= patch_size: continue bigpatch = patches[x] # Remove the big old patch. del patches[x] x -= 1 start1 = bigpatch.start1 start2 = bigpatch.start2 precontext = "" while len(bigpatch.diffs) != 0: # Create one of several smaller patches. patch = patch_obj() empty = True patch.start1 = start1 - len(precontext) patch.start2 = start2 - len(precontext) if precontext: patch.length1 = patch.length2 = len(precontext) patch.diffs.append((self.DIFF_EQUAL, precontext)) while ( len(bigpatch.diffs) != 0 and patch.length1 < patch_size - self.Patch_Margin ): (diff_type, diff_text) = bigpatch.diffs[0] if diff_type == self.DIFF_INSERT: # Insertions are harmless. patch.length2 += len(diff_text) start2 += len(diff_text) patch.diffs.append(bigpatch.diffs.pop(0)) empty = False elif ( diff_type == self.DIFF_DELETE and len(patch.diffs) == 1 and patch.diffs[0][0] == self.DIFF_EQUAL and len(diff_text) > 2 * patch_size ): # This is a large deletion. Let it pass in one chunk. patch.length1 += len(diff_text) start1 += len(diff_text) empty = False patch.diffs.append((diff_type, diff_text)) del bigpatch.diffs[0] else: # Deletion or equality. 
Only take as much as we can stomach. diff_text = diff_text[ : patch_size - patch.length1 - self.Patch_Margin ] patch.length1 += len(diff_text) start1 += len(diff_text) if diff_type == self.DIFF_EQUAL: patch.length2 += len(diff_text) start2 += len(diff_text) else: empty = False patch.diffs.append((diff_type, diff_text)) if diff_text == bigpatch.diffs[0][1]: del bigpatch.diffs[0] else: bigpatch.diffs[0] = ( bigpatch.diffs[0][0], bigpatch.diffs[0][1][len(diff_text) :], ) # Compute the head context for the next patch. precontext = self.diff_text2(patch.diffs) precontext = precontext[-self.Patch_Margin :] # Append the end context for this patch. postcontext = self.diff_text1(bigpatch.diffs)[: self.Patch_Margin] if postcontext: patch.length1 += len(postcontext) patch.length2 += len(postcontext) if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL: patch.diffs[-1] = ( self.DIFF_EQUAL, patch.diffs[-1][1] + postcontext, ) else: patch.diffs.append((self.DIFF_EQUAL, postcontext)) if not empty: x += 1 patches.insert(x, patch) def patch_toText(self, patches): """Take a list of patches and return a textual representation. Args: patches: Array of Patch objects. Returns: Text representation of patches. """ text = [] for patch in patches: text.append(str(patch)) return "".join(text) def patch_fromText(self, textline): """Parse a textual representation of patches and return a list of patch objects. Args: textline: Text representation of patches. Returns: Array of Patch objects. Raises: ValueError: If invalid input. 
""" patches = [] if not textline: return patches text = textline.split("\n") while len(text) != 0: m = re.match(r"^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0]) if not m: raise ValueError("Invalid patch string: " + text[0]) patch = patch_obj() patches.append(patch) patch.start1 = int(m.group(1)) if m.group(2) == "": patch.start1 -= 1 patch.length1 = 1 elif m.group(2) == "0": patch.length1 = 0 else: patch.start1 -= 1 patch.length1 = int(m.group(2)) patch.start2 = int(m.group(3)) if m.group(4) == "": patch.start2 -= 1 patch.length2 = 1 elif m.group(4) == "0": patch.length2 = 0 else: patch.start2 -= 1 patch.length2 = int(m.group(4)) del text[0] while len(text) != 0: if text[0]: sign = text[0][0] else: sign = "" line = urllib.parse.unquote(text[0][1:]) if sign == "+": # Insertion. patch.diffs.append((self.DIFF_INSERT, line)) elif sign == "-": # Deletion. patch.diffs.append((self.DIFF_DELETE, line)) elif sign == " ": # Minor equality. patch.diffs.append((self.DIFF_EQUAL, line)) elif sign == "@": # Start of next patch. break elif sign == "": # Blank line? Whatever. pass else: # WTF? raise ValueError("Invalid patch mode: '%s'\n%s" % (sign, line)) del text[0] return patches class patch_obj: """Class representing one patch operation.""" def __init__(self): """Initializes with an empty list of diffs.""" self.diffs = [] self.start1 = None self.start2 = None self.length1 = 0 self.length2 = 0 def __str__(self): """Emulate GNU diff's format. Header: @@ -382,8 +481,9 @@ Indices are printed as 1-based, not 0-based. Returns: The GNU diff string. 
""" if self.length1 == 0: coords1 = str(self.start1) + ",0" elif self.length1 == 1: coords1 = str(self.start1 + 1) else: coords1 = str(self.start1 + 1) + "," + str(self.length1) if self.length2 == 0: coords2 = str(self.start2) + ",0" elif self.length2 == 1: coords2 = str(self.start2 + 1) else: coords2 = str(self.start2 + 1) + "," + str(self.length2) text = ["@@ -", coords1, " +", coords2, " @@\n"] # Escape the body of the patch with %xx notation. for (op, data) in self.diffs: if op == diff_match_patch.DIFF_INSERT: text.append("+") elif op == diff_match_patch.DIFF_DELETE: text.append("-") elif op == diff_match_patch.DIFF_EQUAL: text.append(" ") # High ascii will raise UnicodeDecodeError. Use Unicode instead. data = data.encode("utf-8") text.append(urllib.parse.quote(data, "!~*'();/?:@&=+$,# ") + "\n") return "".join(text) xmldiff-2.6.3/xmldiff/formatting.py000066400000000000000000000745651443244161100173510ustar00rootroot00000000000000import json import re from collections import namedtuple from copy import deepcopy from lxml import etree from xmldiff.diff_match_patch import diff_match_patch from xmldiff import utils DIFF_NS = "http://namespaces.shoobx.com/diff" DIFF_PREFIX = "diff" INSERT_NAME = "{%s}insert" % DIFF_NS DELETE_NAME = "{%s}delete" % DIFF_NS RENAME_NAME = "{%s}rename" % DIFF_NS # Flags for whitespace handling in the text aware formatters: WS_BOTH = 3 # Normalize ignorable whitespace and text whitespace WS_TEXT = 2 # Normalize whitespace only inside text tags WS_TAGS = 1 # Delete ignorable whitespace (between tags) WS_NONE = 0 # Preserve all whitespace # Placeholder tag type T_OPEN = 0 T_CLOSE = 1 T_SINGLE = 2 # This is the start of the BMP(0) private use area. # If you end up having more than 6400 different tags inside text tags # this will bleed over to non private use area, but that's highly # unlikely. 
However, once we have dropped support for Python versions # that have narrow builds, we can change this to 0xf00000, which is # the start of two 64,000 private use blocks. # PY3: Once Python 2.7 support is dropped we should change this to 0xf00000 PLACEHOLDER_START = 0xE000 # These Bases can be abstract baseclasses, but it's a pain to support # Python 2.7 in that case, because there is no abc.ABC. Right now this # is just a description of the API. class BaseFormatter: def __init__(self, normalize=WS_TAGS, pretty_print=False): """Formatters must as a minimum have a normalize parameter This is used by the main API to decide is whitespace between the tags should be stripped (the remove_blank_text flag in lxml) and if tags that are known texts tags should be normalized before comparing. String content in non-text tags will not be normalized with the included formatters. pretty_print is used to choose between a compact and a pretty output. This is currently only used by the XML and HTML formatters. Formatters may of course have more options than these, but these two are the ones that can be set from the command-line. """ def prepare(self, left_tree, right_tree): """Allows the formatter to prepare the trees before diffing That preparing may need some "unpreparing", but it's then done by the formatters format() method, and is not a part of the public interface.""" def format(self, diff, orig_tree): """Formats the diff and returns a unicode string A formatter that returns XML with diff markup will need the original tree available to do it's job, so there is an orig_tree parameter, but it may be ignored by differs that don't need it. """ PlaceholderEntry = namedtuple("PlaceholderEntry", "element ttype close_ph") class PlaceholderMaker: """Replace tags with unicode placeholders This class searches for certain tags in an XML tree and replaces them with unicode placeholders. 
The idea is to replace structured content (in this case XML elements) with unicode characters which then participate in the regular text diffing algorithm. This makes text diffing easier and faster. The code can then unreplace the unicode placeholders with the tags. """ def __init__(self, text_tags=(), formatting_tags=()): self.text_tags = text_tags self.formatting_tags = formatting_tags self.placeholder2tag = {} self.tag2placeholder = {} self.placeholder = PLACEHOLDER_START insert_elem = etree.Element(INSERT_NAME) insert_close = self.get_placeholder(insert_elem, T_CLOSE, None) insert_open = self.get_placeholder(insert_elem, T_OPEN, insert_close) delete_elem = etree.Element(DELETE_NAME) delete_close = self.get_placeholder(delete_elem, T_CLOSE, None) delete_open = self.get_placeholder(delete_elem, T_OPEN, delete_close) self.diff_tags = { "insert": (insert_open, insert_close), "delete": (delete_open, delete_close), } def get_placeholder(self, element, ttype, close_ph): tag = etree.tounicode(element) ph = self.tag2placeholder.get((tag, ttype, close_ph)) if ph is not None: return ph self.placeholder += 1 ph = chr(self.placeholder) self.placeholder2tag[ph] = PlaceholderEntry(element, ttype, close_ph) self.tag2placeholder[tag, ttype, close_ph] = ph return ph def is_placeholder(self, char): return len(char) == 1 and char in self.placeholder2tag def is_formatting(self, element): return element.tag in self.formatting_tags def do_element(self, element): for child in element: # Resolve all formatting text by allowing the inside text to # participate in the text diffing. 
tail = child.tail or "" child.tail = "" new_text = element.text or "" if self.is_formatting(child): ph_close = self.get_placeholder(child, T_CLOSE, None) ph_open = self.get_placeholder(child, T_OPEN, ph_close) # If it's known text formatting tags, do this hierarchically self.do_element(child) text = child.text or "" child.text = "" # Stick the placeholder in instead of the start and end tags: element.text = new_text + ph_open + text + ph_close + tail else: ph_single = self.get_placeholder(child, T_SINGLE, None) # Replace the whole tag including content: element.text = new_text + ph_single + tail # Remove the element from the tree now that we have inserted a # placeholder. element.remove(child) def do_tree(self, tree): if self.text_tags: for elem in tree.xpath("//" + "|//".join(self.text_tags)): self.do_element(elem) def split_string(self, text): regexp = "([%s])" % "".join(self.placeholder2tag) return re.split(regexp, text, flags=re.MULTILINE) def undo_string(self, text): result = etree.Element("wrap") element = None segments = self.split_string(text) while segments: seg = segments.pop(0) if not seg: continue # Segments can be either plain string or placeholders. if self.is_placeholder(seg): entry = self.placeholder2tag[seg] element = deepcopy(entry.element) # Is this a open/close segment? 
if entry.ttype == T_OPEN: # Yup next_seg = segments.pop(0) new_text = "" while next_seg != entry.close_ph: new_text += next_seg next_seg = segments.pop(0) element.text = new_text or None element.tail = None self.undo_element(element) result.append(element) else: if element is not None: element.tail = element.tail or "" + seg else: result.text = result.text or "" + seg return result def undo_element(self, elem): if self.placeholder2tag: if elem.text: index = 0 content = self.undo_string(elem.text) if elem.text != content.text: # Placeholders was replaced elem.text = content.text for child in content: self.undo_element(child) elem.insert(index, child) index += 1 for child in elem: self.undo_element(child) if elem.tail: content = self.undo_string(elem.tail) if elem.tail != content.text: # Placeholders was replaced elem.tail = content.text parent = elem.getparent() index = parent.index(elem) + 1 for child in content: self.undo_element(child) parent.insert(index, child) index += 1 def undo_tree(self, tree): self.undo_element(tree) def mark_diff(self, ph, action): entry = self.placeholder2tag[ph] if entry.ttype == T_CLOSE: # Close tag, nothing to mark return ph # Mark the tag as having a diff-action. We do need to # make a copy of it and get a new placeholder: elem = entry.element elem = deepcopy(elem) if self.is_formatting(elem): # Formatting element, add a diff attribute action += "-formatting" elem.attrib[f"{{{DIFF_NS}}}{action}"] = "" # And make a new placeholder for this new entry: return self.get_placeholder(elem, entry.ttype, entry.close_ph) def wrap_diff(self, text, action): open_ph, close_ph = self.diff_tags[action] return open_ph + text + close_ph class XMLFormatter(BaseFormatter): """A formatter that also replaces formatting tags with unicode characters The idea of this differ is to replace structured content (in this case XML elements) with unicode characters which then participate in the regular text diffing algorithm. This is done in the prepare() step. 
Each identical XML element will get a unique unicode character. If the node is changed for any reason, a new unicode character is assigned to the node. This allows identity detection of structured content between the two text versions while still allowing customization during diffing time, such as marking a new formatting node. The latter feature allows for granular style change detection independently of text changes. In order for the algorithm to not go crazy and convert entire XML documents to text (though that is perfectly doable), a few rules have been defined. - The `textTags` attribute lists all the XML nodes by name which can contain text. All XML nodes within those text nodes are converted to unicode placeholders. If you want better control over which parts of your XML document are considered text, you can simply override the ``insert_placeholders(tree)`` function. It is purposefully kept small to allow easy subclassing. - By default, all tags inside text tags are treated as immutable units. That means the node itself including its entire sub-structure is assigned one unicode character. - The ``formattingTags`` attribute is used to specify tags that format the text. For these tags, the opening and closing tags receive unique unicode characters, allowing for sub-structure change detection and formatting changes. During the diff markup phase, formatting notes are annotated to mark them as inserted or deleted allowing for markup specific to those formatting changes. The diffed version of the structural tree is passed into the ``finalize(tree)`` method to convert all the placeholders back into structural content before formatting. The ``normalize`` parameter decides how to normalize whitespace. WS_TEXT normalizes only inside text_tags, WS_TAGS will remove ignorable whitespace between tags, WS_BOTH do both, and WS_NONE will preserve all whitespace. 
""" def __init__( self, normalize=WS_NONE, pretty_print=True, text_tags=(), formatting_tags=() ): # Mapping from placeholders -> structural content and vice versa. self.normalize = normalize self.pretty_print = pretty_print self.text_tags = text_tags self.formatting_tags = formatting_tags self.placeholderer = PlaceholderMaker( text_tags=text_tags, formatting_tags=formatting_tags ) def prepare(self, left_tree, right_tree): """prepare() is run on the trees before diffing This is so the formatter can apply magic before diffing.""" # We don't want to diff comments: self._remove_comments(left_tree) self._remove_comments(right_tree) self.placeholderer.do_tree(left_tree) self.placeholderer.do_tree(right_tree) def finalize(self, result_tree): """finalize() is run on the resulting tree before returning it This is so the formatter cab apply magic after diffing.""" self.placeholderer.undo_tree(result_tree) def format(self, diff, orig_tree, differ=None): # Make a new tree, both because we want to add the diff namespace # and also because we don't want to modify the original tree. result = deepcopy(orig_tree) if isinstance(result, etree._ElementTree): root = result.getroot() else: root = result self._nsmap = [(DIFF_PREFIX, DIFF_NS)] etree.register_namespace(DIFF_PREFIX, DIFF_NS) for action in diff: self.handle_action(action, root) self.finalize(root) etree.cleanup_namespaces(result, top_nsmap=dict(self._nsmap)) return self.render(result) def render(self, result): return etree.tounicode(result, pretty_print=self.pretty_print) def handle_action(self, action, result): action_type = type(action) method = getattr(self, "_handle_" + action_type.__name__) method(action, result) def _remove_comments(self, tree): comments = tree.xpath("//comment()") for element in comments: parent = element.getparent() if parent is None: # We can't remove top level comments, but they won't # be iterated over anyway, so we just skip them. 
continue parent.remove(element) def _xpath(self, node, xpath): # This method finds an element with xpath and makes sure that # one and exactly one element is found. This is to protect against # formatting a diff on the wrong tree, or against using ambiguous # edit script xpaths. # First, make a namespace map that uses the left tree's URI's: nsmap = dict(self._nsmap) nsmap.update(node.nsmap) if xpath[0] == "/": root = True xpath = xpath[1:] else: root = False if "/" in xpath: path, rest = xpath.split("/", 1) else: path = xpath rest = "" if "[" in path: path, index = path[:-1].split("[") index = int(index) - 1 multiple = False else: index = 0 multiple = True if root: path = "/" + path matches = [] if None in nsmap: del nsmap[None] for match in node.xpath(path, namespaces=nsmap): # Skip nodes that have been deleted if DELETE_NAME not in match.attrib: matches.append(match) if index >= len(matches): raise ValueError( "xpath {}[{}] not found at {}.".format( path, index + 1, utils.getpath(node) ) ) if len(matches) > 1 and multiple: raise ValueError( "Multiple nodes found for xpath {} at {}.".format( path, utils.getpath(node) ) ) match = matches[index] if rest: return self._xpath(match, rest) return match def _extend_diff_attr(self, node, action, value): diffattr = f"{{{DIFF_NS}}}{action}-attr" oldvalue = node.attrib.get(diffattr, "") if oldvalue: value = oldvalue + ";" + value node.attrib[diffattr] = value def _delete_attrib(self, node, name): del node.attrib[name] self._extend_diff_attr(node, "delete", name) def _handle_DeleteAttrib(self, action, tree): node = self._xpath(tree, action.node) self._delete_attrib(node, action.name) def _delete_node(self, node): node.attrib[DELETE_NAME] = "" def _handle_DeleteNode(self, action, tree): node = self._xpath(tree, action.node) self._delete_node(node) def _insert_attrib(self, node, name, value): node.attrib[name] = value self._extend_diff_attr(node, "add", name) def _handle_InsertAttrib(self, action, tree): node = 
self._xpath(tree, action.node) self._insert_attrib(node, action.name, action.value) def _insert_node(self, target, node, position): node.attrib[INSERT_NAME] = "" target.insert(position, node) def _get_real_insert_position(self, target, position): # Find the real position: pos = 0 offset = 0 for child in target.getchildren(): if DELETE_NAME in child.attrib: offset += 1 else: pos += 1 if pos > position: # We found the right offset break # Real position return position + offset def _handle_InsertNode(self, action, tree): # Insert node as a child. However, position is the position in the # new tree, and the diff tree may have deleted children, so we must # adjust the position for that. target = self._xpath(tree, action.target) position = self._get_real_insert_position(target, action.position) new_node = target.makeelement(action.tag, nsmap=target.nsmap) self._insert_node(target, new_node, position) def _rename_attrib(self, node, oldname, newname): node.attrib[newname] = node.attrib[oldname] del node.attrib[oldname] self._extend_diff_attr(node, "rename", f"{oldname}:{newname}") def _handle_RenameAttrib(self, action, tree): node = self._xpath(tree, action.node) self._rename_attrib(node, action.oldname, action.newname) def _handle_MoveNode(self, action, tree): node = self._xpath(tree, action.node) inserted = deepcopy(node) target = self._xpath(tree, action.target) self._delete_node(node) position = self._get_real_insert_position(target, action.position) self._insert_node(target, inserted, position) def _handle_RenameNode(self, action, tree): node = self._xpath(tree, action.node) node.attrib[RENAME_NAME] = node.tag node.tag = action.tag def _update_attrib(self, node, name, value): oldval = node.attrib[name] node.attrib[name] = value self._extend_diff_attr(node, "update", f"{name}:{oldval}") def _handle_UpdateAttrib(self, action, tree): node = self._xpath(tree, action.node) self._update_attrib(node, action.name, action.value) def _realign_placeholders(self, diff): # Since 
the differ always deletes first and insert second, # placeholders that represent XML open and close tags will get # misaligned. This method will fix that order. new_diff = [] # Diff list with proper tree structure. stack = [] # Current node path. def _stack_pop(): return stack.pop() if stack else (None, None) for op, text in diff: segments = self.placeholderer.split_string(text) for seg in segments: if not seg: continue # There is nothing to do for regular text. if not self.placeholderer.is_placeholder(seg): new_diff.append((op, seg)) continue # Handle all structural replacement elements. entry = self.placeholderer.placeholder2tag[seg] if entry.ttype == T_SINGLE: # There is nothing to do for singletons since they are # fully self-contained. new_diff.append((op, seg)) continue elif entry.ttype == T_OPEN: # Opening tags are added to the stack, so we know what # needs to be closed when. We are assuming that tags are # opened in the desired order. stack.append((op, entry)) new_diff.append((op, seg)) continue elif entry.ttype == T_CLOSE: # Due to the nature of the text diffing algorithm, closing # tags can be out of order. But since we know what we need # to close, we simply glean at the stack to know what # needs to be closed before the requested node closure can # happen. stack_op, stack_entry = _stack_pop() while stack_entry is not None and stack_entry.close_ph != seg: new_diff.append((stack_op, stack_entry.close_ph)) stack_op, stack_entry = _stack_pop() # Stephan: We have situations where the opening tag # remains in place but the closing text moves from on # position to another. In those cases, we will have two # closing tags for one opening one. Since we want to # prefer the new version over the old in terms of # formatting, we ignore the deletion and close the tag # where it was inserted. 
# Lennart: I could not make any case that made # stack_op > op, so I removed the handling, and # put in an assert if stack_entry is not None: assert stack_op <= op new_diff.append((op, seg)) return new_diff def _make_diff_tags(self, left_value, right_value, node, target=None): if bool(self.normalize & WS_TEXT): left_value = utils.cleanup_whitespace(left_value or "").strip() right_value = utils.cleanup_whitespace(right_value or "").strip() text_diff = diff_match_patch() diff = text_diff.diff_main(left_value or "", right_value or "") text_diff.diff_cleanupSemantic(diff) diff = self._realign_placeholders(diff) cur_child = None if target is None: target = node else: cur_child = node for op, text in diff: if op == 0: if cur_child is None: node.text = (node.text or "") + text else: cur_child.tail = (cur_child.tail or "") + text continue if op == -1: action = "delete" elif op == 1: action = "insert" if self.placeholderer.is_placeholder(text): ph = self.placeholderer.mark_diff(text, action) if cur_child is None: node.text = (node.text or "") + ph else: new_text = self.placeholderer.wrap_diff(text, action) if cur_child is None: node.text = (node.text or "") + new_text else: cur_child.tail = (cur_child.tail or "") + new_text def _handle_UpdateTextIn(self, action, tree): node = self._xpath(tree, action.node) if INSERT_NAME in node.attrib: # The whole node is already marked as inserted, # we don't need to diff-wrap the text. 
node.text = action.text return node left_value = node.text right_value = action.text node.text = None self._make_diff_tags(left_value, right_value, node) return node def _handle_UpdateTextAfter(self, action, tree): node = self._xpath(tree, action.node) left_value = node.tail right_value = action.text node.tail = None self._make_diff_tags(left_value, right_value, node, node.getparent()) return node def _handle_InsertNamespace(self, action, tree): # There is no way to mark this so it's visible, so we'll just update the tree self._nsmap.append((action.prefix, action.uri)) def _handle_DeleteNamespace(self, action, tree): # This will be handled by the namespace cleanup pass # There is no InsertComment handler, as this formatter removes all comments class DiffFormatter(BaseFormatter): def __init__(self, normalize=WS_TAGS, pretty_print=False): self.normalize = normalize # No pretty print support, nothing to be pretty about # Nothing to prepare or finalize (one-liners for code coverage) def prepare(self, left, right): return def finalize(self, left, right): return def format(self, diff, orig_tree): # This Formatter don't need the left tree, but the XMLFormatter # does, so the parameter is required. 
res = "\n".join(self._format_action(action) for action in diff) return res def _format_action( self, action, ): return "[%s]" % self.handle_action(action) def handle_action(self, action): action_type = type(action) method = getattr(self, "_handle_" + action_type.__name__) return ", ".join(method(action)) def _handle_DeleteAttrib(self, action): return "delete-attribute", action.node, action.name def _handle_DeleteNode(self, action): return "delete", action.node def _handle_InsertAttrib(self, action): return ("insert-attribute", action.node, action.name, json.dumps(action.value)) def _handle_InsertNode(self, action): return "insert", action.target, action.tag, str(action.position) def _handle_RenameAttrib(self, action): return ("rename-attribute", action.node, action.oldname, action.newname) def _handle_MoveNode(self, action): return "move", action.node, action.target, str(action.position) def _handle_UpdateAttrib(self, action): return ("update-attribute", action.node, action.name, json.dumps(action.value)) def _handle_UpdateTextIn(self, action): return "update-text", action.node, json.dumps(action.text) def _handle_UpdateTextAfter(self, action): return "update-text-after", action.node, json.dumps(action.text) def _handle_RenameNode(self, action): return "rename", action.node, action.tag def _handle_InsertComment(self, action): return ( "insert-comment", action.target, str(action.position), json.dumps(action.text), ) def _handle_InsertNamespace(self, action): return ( "insert-namespace", action.prefix, action.uri, ) def _handle_DeleteNamespace(self, action): return ( "delete-namespace", action.prefix, ) class XmlDiffFormatter(BaseFormatter): """A formatter for an output trying to be xmldiff 0.6 compatible""" def __init__(self, normalize=WS_TAGS, pretty_print=False): self.normalize = normalize # No pretty print support, nothing to be pretty about # Nothing to prepare or finalize (one-liners for code coverage) def prepare(self, left, right): return def finalize(self, 
left, right): return def format(self, diff, orig_tree): # This Formatter don't need the left tree, but the XMLFormatter # does, so the parameter is required. actions = [] for action in diff: actions.extend(self.handle_action(action, orig_tree)) res = "\n".join(self._format_action(action) for action in actions) return res def _format_action(self, action): return "[%s]" % ", ".join(action) def handle_action(self, action, orig_tree): action_type = type(action) method = getattr(self, "_handle_" + action_type.__name__) yield from method(action, orig_tree) def _handle_DeleteAttrib(self, action, orig_tree): yield "remove", f"{action.node}/@{action.name}" def _handle_DeleteNode(self, action, orig_tree): yield "remove", action.node def _handle_InsertAttrib(self, action, orig_tree): value_text = "\n<@{0}>\n{1}\n".format(action.name, action.value) yield "insert", action.node, value_text def _handle_InsertNode(self, action, orig_tree): if action.position == 0: yield "insert-first", action.target, "\n<%s/>" % action.tag return sibling = orig_tree.xpath(action.target)[0][action.position - 1] yield "insert-after", utils.getpath(sibling), "\n<%s/>" % action.tag def _handle_RenameAttrib(self, action, orig_tree): node = orig_tree.xpath(action.node)[0] value = node.attrib[action.oldname] value_text = "\n<@{0}>\n{1}\n".format(action.newname, value) yield "remove", f"{action.node}/@{action.oldname}" yield "insert", action.node, value_text def _handle_MoveNode(self, action, orig_tree): if action.position == 0: yield "move-first", action.node, action.target return node = orig_tree.xpath(action.node)[0] target = orig_tree.xpath(action.target)[0] # Get the position of the previous sibling position = action.position - 1 if node.getparent() is target: # Moving to a new lower position in the same target, # adjust previous sibling position: if target.index(node) <= position: position += 1 sibling = target[position] yield "move-after", action.node, utils.getpath(sibling) def 
_handle_UpdateAttrib(self, action, orig_tree): yield ( "update", f"{action.node}/@{action.name}", json.dumps(action.value), ) def _handle_UpdateTextIn(self, action, orig_tree): yield "update", action.node + "/text()[1]", json.dumps(action.text) def _handle_UpdateTextAfter(self, action, orig_tree): yield "update", action.node + "/text()[2]", json.dumps(action.text) def _handle_RenameNode(self, action, orig_tree): yield "rename", action.node, action.tag def _handle_InsertComment(self, action, orig_tree): yield "insert-comment", action.target, str(action.position), action.text def _handle_InsertNamespace(self, action, orig_tree): yield "insert-namespace", action.prefix, action.uri def _handle_DeleteNamespace(self, action, orig_tree): yield "delete-namespace", action.prefix xmldiff-2.6.3/xmldiff/main.py000066400000000000000000000167241443244161100161140ustar00rootroot00000000000000"""All major API points and command-line tools""" import pkg_resources from argparse import ArgumentParser, ArgumentTypeError from lxml import etree from xmldiff import diff, formatting, patch __version__ = pkg_resources.require("xmldiff")[0].version FORMATTERS = { "diff": formatting.DiffFormatter, "xml": formatting.XMLFormatter, "old": formatting.XmlDiffFormatter, } def diff_trees(left, right, diff_options=None, formatter=None): """Takes two lxml root elements or element trees""" if formatter is not None: formatter.prepare(left, right) if diff_options is None: diff_options = {} differ = diff.Differ(**diff_options) diffs = differ.diff(left, right) if formatter is None: return list(diffs) return formatter.format(diffs, left) def _diff(parse_method, left, right, diff_options=None, formatter=None): normalize = bool(getattr(formatter, "normalize", 1) & formatting.WS_TAGS) parser = etree.XMLParser(remove_blank_text=normalize) left_tree = parse_method(left, parser) right_tree = parse_method(right, parser) return diff_trees( left_tree, right_tree, diff_options=diff_options, formatter=formatter ) def 
def diff_texts(left, right, diff_options=None, formatter=None):
    """Takes two Unicode strings containing XML"""
    return _diff(
        etree.fromstring, left, right, diff_options=diff_options, formatter=formatter
    )


def diff_files(left, right, diff_options=None, formatter=None):
    """Takes two filenames or streams, and diffs the XML in those files"""
    return _diff(
        etree.parse, left, right, diff_options=diff_options, formatter=formatter
    )


def validate_F(arg):
    """Type function for argparse - a float within some predefined bounds

    Raises ArgumentTypeError when the value is not a float in (0, 1].
    """
    try:
        F = float(arg)
    except ValueError:
        raise ArgumentTypeError("Must be a floating point number")
    if F <= 0:
        raise ArgumentTypeError("F can not be zero or lower")
    if F > 1:
        raise ArgumentTypeError("F can not be above 1")
    return F


def make_diff_parser():
    """Build the argparse parser for the xmldiff command-line tool."""
    parser = ArgumentParser(
        description="Create a diff for two XML files.", add_help=False
    )
    parser.add_argument("file1", type=str, help="The first input file.")
    parser.add_argument("file2", type=str, help="The second input file.")
    parser.add_argument(
        "-h", "--help", action="help", help="Show this help message and exit."
    )
    parser.add_argument(
        "-v",
        "--version",
        action="version",
        help="Display version and exit.",
        version="xmldiff %s" % __version__,
    )
    parser.add_argument(
        "--check",
        action="store_true",
        help="Return error code 1 if there are any differences between the files.",
    )
    parser.add_argument(
        "-f",
        "--formatter",
        default="diff",
        choices=list(FORMATTERS.keys()),
        help="Formatter selection.",
    )
    parser.add_argument(
        "-w",
        "--keep-whitespace",
        action="store_true",
        help="Do not strip ignorable whitespace.",
    )
    parser.add_argument(
        "-p",
        "--pretty-print",
        action="store_true",
        help="Try to make XML output more readable.",
    )
    parser.add_argument(
        "-F",
        type=validate_F,
        help="A value between 0 and 1 that determines how "
        "similar nodes must be to match.",
    )
    parser.add_argument(
        "--unique-attributes",
        type=str,
        nargs="?",
        default="{http://www.w3.org/XML/1998/namespace}id",
        help="A comma separated list of attributes "
        "that uniquely identify a node. Can be empty. "
        "Unique attributes for certain elements can "
        "be specified in the format {NS}element@attr.",
    )
    parser.add_argument(
        "--ratio-mode",
        default="fast",
        choices={"accurate", "fast", "faster"},
        help="Choose the node comparison optimization.",
    )
    match_group = parser.add_mutually_exclusive_group()
    match_group.add_argument(
        "--fast-match", action="store_true", help="A faster, less optimal match run."
    )
    match_group.add_argument(
        "--best-match",
        action="store_true",
        help="A slower, two-stage match run that may result in smaller diffs. "
        "(Experimental)",
    )
    parser.add_argument(
        "--ignored-attributes",
        type=str,
        nargs="?",
        help="A comma separated list of attributes "
        "that should be ignored in comparison.",
    )
    return parser


def _parse_uniqueattrs(uniqueattrs):
    # "attr" stays a string; "{NS}element@attr" becomes a [element, attr] pair.
    if uniqueattrs is None:
        return []
    return [
        attr if "@" not in attr else attr.split("@", 1)
        for attr in uniqueattrs.split(",")
    ]


def _parse_ignored_attrs(ignored_attrs):
    if ignored_attrs is None:
        return []
    # str.split already returns a list; the comprehension wrapper was redundant.
    return ignored_attrs.split(",")


def diff_command(args=None):
    """Entry point for the xmldiff command; returns 1 on --check with diffs."""
    parser = make_diff_parser()
    args = parser.parse_args(args=args)

    if args.keep_whitespace:
        normalize = formatting.WS_NONE
    else:
        normalize = formatting.WS_BOTH

    formatter = FORMATTERS[args.formatter](
        normalize=normalize, pretty_print=args.pretty_print
    )
    diff_options = {
        "ignored_attrs": _parse_ignored_attrs(args.ignored_attributes),
        "ratio_mode": args.ratio_mode,
        "F": args.F,
        "fast_match": args.fast_match,
        "best_match": args.best_match,
        "uniqueattrs": _parse_uniqueattrs(args.unique_attributes),
    }
    result = diff_files(
        args.file1, args.file2, diff_options=diff_options, formatter=formatter
    )
    print(result)

    if args.check and len(result) > 0:
        return 1


def patch_tree(actions, tree):
    """Takes an lxml root element or element tree, and a list of actions"""
    patcher = patch.Patcher()
    return patcher.patch(actions, tree)


def patch_text(actions, tree):
    """Takes a string with XML and a string with actions"""
    tree = etree.fromstring(tree)
    actions = patch.DiffParser().parse(actions)
    tree = patch_tree(actions, tree)
    return etree.tounicode(tree)


def patch_file(actions, tree, diff_encoding=None):
    """Takes two filenames or streams, one with XML the other a diff"""
    tree = etree.parse(tree)
    if isinstance(actions, str):
        # It's a string, so it's a filename
        with open(actions, "rt", encoding=diff_encoding) as f:
            actions = f.read()
    else:
        # We assume it's a stream
        actions = actions.read()
    actions = patch.DiffParser().parse(actions)
    tree = patch_tree(actions, tree)
    return etree.tounicode(tree)
make_patch_parser(): parser = ArgumentParser( description="Patch an XML file with an xmldiff", add_help=False ) parser.add_argument("patchfile", type=str, help="An xmldiff diff file.") parser.add_argument("xmlfile", type=str, help="An unpatched XML file.") parser.add_argument( "-h", "--help", action="help", help="Show this help message and exit." ) parser.add_argument( "-v", "--version", action="version", help="Display version and exit.", version="xmldiff %s" % __version__, ) parser.add_argument( "--diff-encoding", help="The encoding used for the diff file, eg UTF-8 or UTF-16, etc.", ) return parser def patch_command(args=None): parser = make_patch_parser() args = parser.parse_args(args=args) result = patch_file(args.patchfile, args.xmlfile, args.diff_encoding) print(result) xmldiff-2.6.3/xmldiff/patch.py000066400000000000000000000132711443244161100162610ustar00rootroot00000000000000from copy import deepcopy from csv import reader from json import loads from lxml import etree from xmldiff import actions class Patcher: @property def nsmap(self): return getattr(self, "_nsmap", {}) def patch(self, actions, tree): if isinstance(tree, etree._ElementTree): tree = tree.getroot() # Save the namespace: self._nsmap = tree.nsmap if None in self._nsmap: del self._nsmap[None] # Copy the tree so we don't modify the original result = deepcopy(tree) for action in actions: self.handle_action(action, result) return result def handle_action(self, action, tree): action_type = type(action) method = getattr(self, "_handle_" + action_type.__name__) method(action, tree) def _handle_DeleteNode(self, action, tree): node = tree.xpath(action.node, namespaces=self.nsmap)[0] node.getparent().remove(node) def _handle_InsertNode(self, action, tree): target = tree.xpath(action.target, namespaces=self.nsmap)[0] node = target.makeelement(action.tag) target.insert(action.position, node) def _handle_RenameNode(self, action, tree): tree.xpath(action.node, namespaces=self.nsmap)[0].tag = action.tag def 
_handle_MoveNode(self, action, tree): node = tree.xpath(action.node, namespaces=self.nsmap)[0] node.getparent().remove(node) target = tree.xpath(action.target)[0] target.insert(action.position, node) def _handle_UpdateTextIn(self, action, tree): tree.xpath(action.node, namespaces=self.nsmap)[0].text = action.text def _handle_UpdateTextAfter(self, action, tree): tree.xpath(action.node, namespaces=self.nsmap)[0].tail = action.text def _handle_UpdateAttrib(self, action, tree): node = tree.xpath(action.node, namespaces=self.nsmap)[0] # This should not be used to insert new attributes. assert action.name in node.attrib node.attrib[action.name] = action.value def _handle_DeleteAttrib(self, action, tree): del tree.xpath(action.node, namespaces=self.nsmap)[0].attrib[action.name] def _handle_InsertAttrib(self, action, tree): node = tree.xpath(action.node, namespaces=self.nsmap)[0] # This should not be used to update existing attributes. assert action.name not in node.attrib node.attrib[action.name] = action.value def _handle_RenameAttrib(self, action, tree): node = tree.xpath(action.node, namespaces=self.nsmap)[0] assert action.oldname in node.attrib assert action.newname not in node.attrib node.attrib[action.newname] = node.attrib[action.oldname] del node.attrib[action.oldname] def _handle_InsertComment(self, action, tree): target = tree.xpath(action.target)[0] target.insert(action.position, etree.Comment(action.text)) def _handle_InsertNamespace(self, action, tree): self.nsmap[action.prefix] = action.uri def _handle_DeleteNamespace(self, action, tree): # Nothing needs to be done, it will be handled by cleanup pass class DiffParser: """Makes a text diff into a list of actions""" def parse(self, diff): incomplete = "" for line in diff.splitlines(): line = incomplete + line if line[0] != "[": # All actions should start with "[" raise ValueError("Unknown diff format") if line[-1] != "]": # This line has been broken into several lines incomplete = line continue # OK, we found 
an action incomplete = "" yield self.make_action(line) if incomplete: raise ValueError("Diff ended unexpectedly") def make_action(self, line): # Remove brackets line = line[1:-1] # Split the line on commas (ignoring commas in quoted strings) and # strip extraneous spaces. The first is the action, the rest params. parts = [x.strip() for x in next(reader([line]))] action = parts[0] params = parts[1:] # Get the method, and return the result of calling it method = getattr(self, "_handle_" + action.replace("-", "_")) return method(*params) def _handle_delete(self, node): return actions.DeleteNode(node) def _handle_insert(self, target, tag, position): return actions.InsertNode(target, tag, int(position)) def _handle_rename(self, node, tag): return actions.RenameNode(node, tag) def _handle_move(self, node, target, position): return actions.MoveNode(node, target, int(position)) def _handle_update_text(self, node, text): return actions.UpdateTextIn(node, loads(text)) def _handle_update_text_after(self, node, text): return actions.UpdateTextAfter(node, loads(text)) def _handle_update_attribute(self, node, name, value): return actions.UpdateAttrib(node, name, loads(value)) def _handle_delete_attribute(self, node, name): return actions.DeleteAttrib(node, name) def _handle_insert_attribute(self, node, name, value): return actions.InsertAttrib(node, name, loads(value)) def _handle_rename_attribute(self, node, oldname, newname): return actions.RenameAttrib(node, oldname, newname) def _handle_insert_comment(self, target, position, text): return actions.InsertComment(target, int(position), loads(text)) def _handle_insert_namespace(self, prefix, uri): return actions.InsertNamespace(prefix, uri) def _handle_delete_namespace(self, prefix): return actions.DeleteNamespace(prefix) xmldiff-2.6.3/xmldiff/utils.py000066400000000000000000000102711443244161100163170ustar00rootroot00000000000000import re from operator import eq # This namespace is reserved for lxml internal use, which only # 
# means we get an error if we try registering it:
RESERVED_NS = re.compile(r"ns\d+", flags=re.ASCII)


def post_order_traverse(node):
    """Yield node and all descendants, children before their parent."""
    # Direct iteration replaces the deprecated lxml getchildren().
    for child in node:
        yield from post_order_traverse(child)
    yield node


def reverse_post_order_traverse(node):
    """Yield node and all descendants, children (reversed) before parent."""
    for child in reversed(list(node)):
        yield from reverse_post_order_traverse(child)
    yield node


def breadth_first_traverse(node):
    """Yield node and all descendants, level by level."""
    # First yield the root node
    queue = [node]
    while queue:
        item = queue.pop(0)
        yield item
        queue.extend(item)


# LCS from Myers: An O(ND) Difference Algorithm and Its Variations. This
# implementation uses Chris Marchetti's technique of only keeping the history
# per dpath, and not per node, so it should be vastly less memory intensive.
# It also skips any items that are equal in the beginning and end, speeding
# up the search, and using even less memory.
def longest_common_subsequence(left_sequence, right_sequence, eqfn=eq):
    """Return index pairs (left_i, right_i) of a longest common subsequence.

    eqfn is the equality predicate used to compare items (defaults to ==).
    """
    start = 0
    lend = lslen = len(left_sequence)
    rend = rslen = len(right_sequence)

    # Trim off the matching items at the beginning
    while (
        start < lend
        and start < rend
        and eqfn(left_sequence[start], right_sequence[start])
    ):
        start += 1

    # trim off the matching items at the end
    while (
        start < lend
        and start < rend
        and eqfn(left_sequence[lend - 1], right_sequence[rend - 1])
    ):
        lend -= 1
        rend -= 1

    left = left_sequence[start:lend]
    right = right_sequence[start:rend]

    lmax = len(left)
    rmax = len(right)
    furthest = {1: (0, [])}

    if not lmax + rmax:
        # The sequences are equal
        r = range(lslen)
        return zip(r, r)

    for d in range(0, lmax + rmax + 1):
        for k in range(-d, d + 1, 2):
            if k == -d or (k != d and furthest[k - 1][0] < furthest[k + 1][0]):
                # Go down
                old_x, history = furthest[k + 1]
                x = old_x
            else:
                # Go left
                old_x, history = furthest[k - 1]
                x = old_x + 1

            # Copy the history
            history = history[:]
            y = x - k

            while x < lmax and y < rmax and eqfn(left[x], right[y]):
                # We found a match
                history.append((x + start, y + start))
                x += 1
                y += 1

            if x >= lmax and y >= rmax:
                # This is the best match
                return (
                    [(e, e) for e in range(start)]
                    + history
                    + list(zip(range(lend, lslen), range(rend, rslen)))
                )
            else:
                furthest[k] = (x, history)


WHITESPACE = re.compile("\\s+", flags=re.MULTILINE)


def cleanup_whitespace(text):
    """Collapse every run of whitespace in text to a single space."""
    return WHITESPACE.sub(" ", text)


def getpath(element, tree=None):
    """Return the element's XPath, always with an explicit [n] index."""
    if tree is None:
        tree = element.getroottree()
    xpath = tree.getpath(element)
    if xpath[-1] != "]":
        # The path is unique without specifying a count. However, we always
        # want that count, so we add [1].
        xpath = xpath + "[1]"
    return xpath


# The remainder of the functions here are helpful when debugging.
# They aren't documented, nor very well tested.


def _make_ascii_tree(element, indent=""):
    from xmldiff.formatting import DIFF_NS  # Avoid circular imports

    diffns = "{%s}" % DIFF_NS
    if element.prefix:
        name = "{}:{}".format(element.prefix, element.tag.split("}")[1])
    else:
        name = element.tag
    diff_attrs = [attr for attr in element.attrib if attr.startswith(diffns)]
    if diff_attrs:
        diff = "(%s)" % ", ".join(attr.split("}")[1] for attr in diff_attrs)
    else:
        diff = ""
    result = [" ".join((indent, name, diff))]
    indent = " " + indent
    for child in element:
        result.extend(_make_ascii_tree(child, indent))
    return result


def make_ascii_tree(element):
    result = _make_ascii_tree(element)
    return "\n".join(result)