pax_global_header00006660000000000000000000000064144505674450014530gustar00rootroot0000000000000052 comment=e0937eb9af3f4dbecf3ca35d37af0e7e6cd55c07 html-sanitizer-2.2/000077500000000000000000000000001445056744500143455ustar00rootroot00000000000000html-sanitizer-2.2/.editorconfig000066400000000000000000000003101445056744500170140ustar00rootroot00000000000000# top-most EditorConfig file root = true [*] end_of_line = lf insert_final_newline = true charset = utf-8 trim_trailing_whitespace = true indent_style = space indent_size = 2 [*.py] indent_size = 4 html-sanitizer-2.2/.github/000077500000000000000000000000001445056744500157055ustar00rootroot00000000000000html-sanitizer-2.2/.github/workflows/000077500000000000000000000000001445056744500177425ustar00rootroot00000000000000html-sanitizer-2.2/.github/workflows/test.yml000066400000000000000000000013261445056744500214460ustar00rootroot00000000000000name: Tests on: push: branches: [main] pull_request: branches: [main] jobs: build: runs-on: ubuntu-latest strategy: matrix: python-version: ["3.8", "3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip python -m pip install beautifulsoup4 lxml if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Run tests run: | python -m unittest discover -v html-sanitizer-2.2/.gitignore000066400000000000000000000001411445056744500163310ustar00rootroot00000000000000*.py? *~ *.sw? .DS_Store ._* /MANIFEST _build build dist .eggs *.egg-info .tox .coverage htmlcov html-sanitizer-2.2/.pre-commit-config.yaml000066400000000000000000000023051445056744500206260ustar00rootroot00000000000000exclude: ".yarn/|yarn.lock|\\.min\\.(css|js)$" repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.4.0 hooks: - id: check-added-large-files - id: check-builtin-literals - id: check-executables-have-shebangs - id: check-merge-conflict - id: check-toml - id: check-yaml - id: detect-private-key - id: end-of-file-fixer - id: mixed-line-ending - id: trailing-whitespace - repo: https://github.com/adamchainz/django-upgrade rev: 1.13.0 hooks: - id: django-upgrade args: [--target-version, "3.2"] - repo: https://github.com/charliermarsh/ruff-pre-commit rev: "v0.0.272" hooks: - id: ruff - repo: https://github.com/psf/black rev: 23.3.0 hooks: - id: black - repo: https://github.com/pre-commit/mirrors-prettier rev: v3.0.0-alpha.9-for-vscode hooks: - id: prettier args: [--list-different, --no-semi] exclude: "^conf/|.*\\.html$" - repo: https://github.com/tox-dev/pyproject-fmt rev: 0.11.2 hooks: - id: pyproject-fmt - repo: https://github.com/abravalheri/validate-pyproject rev: v0.13 hooks: - id: validate-pyproject html-sanitizer-2.2/CHANGELOG.rst000066400000000000000000000132321445056744500163670ustar00rootroot00000000000000========== Change log ========== Next version ============ 2.2 (2023-07-03) ================ - Changed ``keep_normalized_whitespace`` to preserve whitespace at the tail of tags, not just between tags. - Changed the parameters of ``normalize_whitespace_in_text_or_tail`` to be keyword-only. 2.1 (2023-06-29) ================ - Added a test for a type of misconfiguration. - Changed the sanitizer configuration validation to not allow unexpected data types in ``tags``, ``empty``, ``separate``, ``whitespace`` and ``attributes``. 2.0 (2023-06-28) ================ - Raised the minimum Python version to 3.7. Added Python 3.10, 3.11. - Raised the minimum lxml version to the current 4.9.1. - Switched from Travis CI to GitHub actions. Added Python 3.9 to the CI matrix. - Renamed the main branch to main. - Switched to a declarative setup. - Fixed a whitespace dependency in the testsuite. - Switched to hatchling and ruff. - Made behavior-altering arguments to ``normalize_overall_whitespace`` keyword-only. `1.9`_ (2020-01-20) =================== - Added Python 3.8 to the CI matrix. - Be able to keep the ``bar", "foobar")], sanitizer=Sanitizer( { "tags": {"impossible tag"}, "attributes": {}, "empty": set(), "separate": set(), } ), ) # allow style tag but no style attribute self.run_tests( [ ( "foobar", "foobar", ), ('

bla

', "

bla

"), ], sanitizer=Sanitizer( { "tags": {"h2", "style"}, "attributes": {}, "empty": set(), "separate": set(), } ), ) # allow style tag and style attribute self.run_tests( [ ( "foobar", "foobar", ), ( '

bla

', '

bla

', ), ], sanitizer=Sanitizer( { "tags": {"h2", "style"}, "attributes": {"h2": {"style"}}, "empty": set(), "separate": set(), } ), ) def test_billion_laughs(self): before = """\ ]> &lol9; """ after = """\ <!ELEMENT lolz (#PCDATA)> <!ENTITY lol1 "&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;"> <!ENTITY lol2 "&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;"> <!ENTITY lol3 "&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;"> <!ENTITY lol4 "&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;"> <!ENTITY lol5 "&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;"> <!ENTITY lol6 "&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;"> <!ENTITY lol7 "&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;&lol6;"> <!ENTITY lol8 "&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;&lol7;"> <!ENTITY lol9 "&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;&lol8;"> ]> &lol9; """ external_entities = """\ ]>&xxe; """ self.run_tests( [ (before, after), ( external_entities, """<!ENTITY xxe SYSTEM "file:///dev/random" >]>&xxe;""", ), ], strip=True, ) def test_data_attributes(self): sanitizer = Sanitizer( { "tags": ["span"], "empty": (), "separate": (), "attributes": {"span": {"data-title"}}, } ) entries = ( ( 'Content', 'Content', ), ( 'Content', 'Content', ), ) self.run_tests(entries, sanitizer=sanitizer) def test_entities(self): self.run_tests( [ ("‘", "\u2018"), ], ) def test_invalid_attributes(self): with self.assertRaisesRegex(TypeError, "Expected a set but got"): Sanitizer({"attributes": {"p": ("class")}}) with self.assertRaisesRegex(TypeError, "Expected a set but got"): Sanitizer({"tags": "blub"}) with self.assertRaisesRegex(TypeError, 'Tags in "empty", but not allowed:'): Sanitizer({"tags": {"blub"}}) html-sanitizer-2.2/pyproject.toml000066400000000000000000000040471445056744500172660ustar00rootroot00000000000000[build-system] build-backend = "hatchling.build" requires = [ "hatchling", ] [project] name = "html-sanitizer" description = "HTML sanitizer" readme = "README.rst" license = {text = "BSD-3-Clause"} authors = [ { name = "Matthias Kestenholz", email = "mk@feinheit.ch" }, ] requires-python = ">=3.8" classifiers = [ "Development Status :: 5 - Production/Stable", "Environment :: Web Environment", "Framework :: Django", "Intended Audience :: Developers", "License :: OSI Approved :: BSD License", "Operating System :: OS Independent", "Programming Language :: Python", "Programming Language :: Python :: 3 :: Only", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Topic :: Internet :: WWW/HTTP :: Dynamic Content", "Topic :: Software Development", ] dynamic = [ "version", ] dependencies = [ "beautifulsoup4", "lxml>=4.9.1", ] [project.urls] Homepage = "https://github.com/matthiask/html-sanitizer/" [tool.hatch.version] path = "html_sanitizer/__init__.py" [tool.ruff] extend-select = [ # pyflakes, pycodestyle "F", "E", "W", # mmcabe "C90", # isort "I", # pep8-naming "N", # pyupgrade "UP", # flake8-2020 "YTT", # flake8-boolean-trap "FBT", # flake8-bugbear "B", # flake8-comprehensions "C4", # flake8-django "DJ", # flake8-logging-format "G", # flake8-pie "PIE", # flake8-simplify "SIM", # flake8-tidy-imports "TID", # flake8-gettext "INT", # pygrep-hooks "PGH", # pylint "PL", # unused noqa "RUF100", ] extend-ignore = [ # Allow zip() without strict= "B905", # No line length errors "E501", ] fix = true show-fixes = true target-version = "py38" [tool.ruff.isort] combine-as-imports = true lines-after-imports = 2 [tool.ruff.mccabe] max-complexity = 15 [tool.ruff.per-file-ignores] "*/migrat*/*" = [ # Allow using PascalCase model names in migrations "N806", # Ignore the fact that migration files are invalid module names "N999", ] html-sanitizer-2.2/tox.ini000066400000000000000000000002731445056744500156620ustar00rootroot00000000000000[testenv] deps = wheel lxml beautifulsoup4 coverage changedir = {toxinidir} skip_install = true commands = coverage run -m unittest discover -v coverage report -m