python-charset-normalizer-3.4.2/.coveragerc

[run]
source =
charset_normalizer
# Needed for Python 3.11 and lower
disable_warnings = no-sysmon
[paths]
source =
src/charset_normalizer
*/charset_normalizer
*\charset_normalizer
[report]
omit =
src/charset_normalizer/__main__.py
exclude_lines =
except ModuleNotFoundError:
except ImportError:
pass
import
raise NotImplementedError
.* # Platform-specific.*
.*:.* # Python \d.*
.* # Abstract
.* # Defensive:
if (?:typing.)?TYPE_CHECKING:
^\s*?\.\.\.\s*$
python-charset-normalizer-3.4.2/.github/CODEOWNERS

# Restrict all files related to deploying to
# require lead maintainer approval.
.github/workflows/ @Ousret
.github/CODEOWNERS @Ousret
src/charset_normalizer/ @Ousret
pyproject.toml @Ousret
tests/ @Ousret
data/ @Ousret
python-charset-normalizer-3.4.2/.github/FUNDING.yml

# These are supported funding model platforms
tidelift: pypi/charset-normalizer
github:
- Ousret
python-charset-normalizer-3.4.2/.github/ISSUE_TEMPLATE/bug_report.md

---
name: Bug report
about: Create a report to help us fix something bad like an exception
title: "[BUG]"
labels: bug, help wanted
assignees: ''
---
**Describe the bug**
A clear and concise description of what the bug/exception is.
**To Reproduce**
Give us the target text file. Host it somewhere with untouched encoding.
**Expected behavior**
A clear and concise description of what you expected to happen.
**Logs**
If applicable, add console outputs to help explain your problem.
**Desktop (please complete the following information):**
- OS: [e.g. Linux, Windows or Mac]
- Python version [e.g. 3.5]
- Package version [e.g. 2.0.0]
**Additional context**
Add any other context about the problem here.
python-charset-normalizer-3.4.2/.github/ISSUE_TEMPLATE/feature_request.md

---
name: Feature request
about: Suggest an idea for this project
title: "[Proposal]"
labels: enhancement
assignees: ''
---
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
**Additional context**
Add any other context or screenshots about the feature request here.
python-charset-normalizer-3.4.2/.github/ISSUE_TEMPLATE/wrong_charset.md

---
name: Wrong charset / Detection issue
about: Create a report to help us improve the detection mechanism
title: "[DETECTION]"
labels: help wanted, detection
assignees: ''
---
**Notice**
I hereby announce that my raw input is not:
- Too small (<= 32 characters), as I know that ANY charset detector heavily depends on content
- Encoded in a deprecated/abandoned encoding that is not even supported by my interpreter
**Provide the file**
An accessible way of retrieving the file concerned. Host it somewhere with untouched encoding.
**Verbose output**
Using the CLI, run `normalizer -v ./my-file.txt` and paste the result here.
```
(venv) >normalizer -v ./data/sample.1.ar.srt
2021-05-21 08:38:44,050 | DEBUG | ascii does not fit given bytes sequence at ALL. 'ascii' codec can't decode byte 0xca in position 54: ordinal not in range(128)
2021-05-21 08:38:44,051 | DEBUG | big5 does not fit given bytes sequence at ALL. 'big5' codec can't decode byte 0xc9 in position 60: illegal multibyte sequence
2021-05-21 08:38:44,051 | DEBUG | big5hkscs does not fit given bytes sequence at ALL. 'big5hkscs' codec can't decode byte 0xc9 in position 60: illegal multibyte sequence
....
```
**Expected encoding**
A clear and concise description of what you expected as the encoding. Any additional details about how the current guess is wrong
are very much appreciated.
**Desktop (please complete the following information):**
- OS: [e.g. Linux, Windows or Mac]
- Python version [e.g. 3.5]
- Package version [e.g. 2.0.0]
**Additional context**
Add any other context about the problem here.
python-charset-normalizer-3.4.2/.github/dependabot.yml

# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
version: 2
updates:
- package-ecosystem: "pip" # See documentation for possible values
directory: "/" # Location of package manifests
schedule:
interval: "monthly"
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "monthly"
python-charset-normalizer-3.4.2/.github/workflows/cd.yml

name: Continuous Delivery
on:
workflow_dispatch:
release:
types:
- created
permissions:
contents: read
jobs:
pre_flight_check:
name: Preflight Checks
uses: ./.github/workflows/ci.yml
universal-wheel:
name: Build Universal Wheel
runs-on: ubuntu-latest
needs:
- pre_flight_check
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3'
- name: Update pip, install build
run: |
python -m pip install --upgrade pip
python -m pip install build
- name: Build Wheel
env:
CHARSET_NORMALIZER_USE_MYPYC: '0'
run: python -m build
- name: Upload artifacts
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
with:
name: dist-universal
path: dist
build-wheels:
name: Build wheels on ${{ matrix.os }} ${{ matrix.qemu }}
runs-on: ${{ matrix.os }}
needs: pre_flight_check
strategy:
matrix:
os: [ ubuntu-22.04, windows-latest, macos-13 ]
qemu: [ '' ]
include:
# Split ubuntu job for the sake of speed-up
- os: ubuntu-latest
qemu: aarch64
- os: ubuntu-latest
qemu: ppc64le
- os: ubuntu-latest
qemu: s390x
steps:
- name: Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
submodules: true
- name: Set up QEMU
if: ${{ matrix.qemu }}
uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0
with:
platforms: all
id: qemu
- name: Prepare emulation
run: |
if [[ -n "${{ matrix.qemu }}" ]]; then
# Build emulated architectures only if QEMU is set,
# use default "auto" otherwise
echo "CIBW_ARCHS_LINUX=${{ matrix.qemu }}" >> $GITHUB_ENV
fi
shell: bash
- name: Setup Python
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
- name: Build wheels
uses: pypa/cibuildwheel@faf86a6ed7efa889faf6996aa23820831055001a # v2.23.3
env:
CIBW_BUILD_FRONTEND: build
CIBW_ARCHS_MACOS: universal2
CIBW_ENVIRONMENT: CHARSET_NORMALIZER_USE_MYPYC='1'
CIBW_TEST_REQUIRES: pytest
CIBW_TEST_COMMAND: pytest -c {package} {package}/tests
CIBW_SKIP: pp*
- name: Upload artifacts
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
with:
name: dist-${{ matrix.os }}-${{ matrix.qemu }}
path: ./wheelhouse/*.whl
checksum:
name: Compute hashes
runs-on: ubuntu-latest
needs:
- build-wheels
- universal-wheel
outputs:
hashes: ${{ steps.compute.outputs.hashes }}
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Download distributions
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093
with:
pattern: dist*
merge-multiple: true
path: dist
- name: Collected dists
run: |
tree dist
- name: Generate hashes
id: compute # needs.checksum.outputs.hashes
working-directory: ./dist
run: echo "hashes=$(sha256sum * | base64 -w0)" >> $GITHUB_OUTPUT
provenance:
needs: checksum
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
permissions:
actions: read
id-token: write
contents: write
with:
base64-subjects: ${{ needs.checksum.outputs.hashes }}
upload-assets: true
compile-generator: true
deploy:
name: 🚀 Deploy to PyPi
runs-on: ubuntu-latest
if: startsWith(github.ref, 'refs/tags/')
permissions:
id-token: write
contents: write
needs: provenance
environment:
name: pypi
url: https://pypi.org/project/charset-normalizer/
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Download distributions
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093
with:
pattern: dist*
merge-multiple: true
path: dist
- name: Collected dists
run: |
tree dist
- name: Publish package distributions to PyPI
uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # release/v1
- name: Upload dists to GitHub Release
env:
GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
run: |
gh release upload ${{ github.ref_name }} dist/* --repo ${{ github.repository }}
python-charset-normalizer-3.4.2/.github/workflows/ci.yml

name: Continuous Integration
on:
workflow_call:
pull_request:
push:
branches:
- master
permissions:
contents: read
jobs:
lint:
name: 🎨 Linters
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3'
- name: Install nox
run: python -m pip install nox
- name: Pre-commit checks
run: nox -s lint
tests:
name: ✅ Tests
runs-on: ubuntu-22.04
strategy:
fail-fast: false
matrix:
python-version:
- "3.7"
- "3.8"
- "3.9"
- "3.10"
- "3.11"
- "3.12"
- "3.13"
- "3.14"
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: ${{ matrix.python-version }}
allow-prereleases: true
- name: Install dependencies
run: python -m pip install nox
- name: Run tests
run: nox -s test-${{ matrix.python-version }}
- name: "Upload artifact"
uses: "actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02"
with:
name: coverage-data-${{ matrix.python-version }}
path: ".coverage.*"
include-hidden-files: true
if-no-files-found: error
detection_coverage:
needs:
- tests
name: 📈 Detection Coverage
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3'
- name: Install dependencies
run: python -m pip install nox
- name: Coverage WITH preemptive
run: nox -s coverage -- --coverage 97 --with-preemptive
- name: Coverage WITHOUT preemptive
run: nox -s coverage -- --coverage 95
- name: "Upload artifact"
uses: "actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02"
with:
name: coverage-data-using-internal-coverage
path: ".coverage.*"
include-hidden-files: true
if-no-files-found: error
integration_test:
needs:
- tests
name: 🔗 Integration Tests
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
downstream_project:
- niquests
- requests
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3'
- name: Install dependencies
run: pip install nox
- name: Integration Tests with Requests
run: nox -s downstream_${{ matrix.downstream_project }}
chardet_bc:
name: ⏪ Chardet Backward-Compatibility Test
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3'
- name: Install dependencies
run: pip install nox
- name: BC Coverage
run: nox -s backward_compatibility -- --coverage 80
mypyc_test:
name: ⚡ MypyC Tests
needs:
- tests
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
python-version:
- "3.8"
- "3.9"
- "3.10"
- "3.11"
- "3.12"
- "3.13"
os: [ ubuntu-latest, macos-latest, windows-latest ]
include:
- python-version: "3.7"
os: ubuntu-22.04
- python-version: "3.7"
os: macos-13
- python-version: "3.7"
os: windows-latest
env:
PYTHONIOENCODING: utf8 # only needed for Windows (console IO output encoding)
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: ${{ matrix.python-version }}
allow-prereleases: true
- name: Install nox
run: pip install nox
- name: Run tests with mypyc enabled
run: nox -s test_mypyc-${{ matrix.python-version }}
- name: "Upload artifact"
uses: "actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02"
with:
name: coverage-data-mypyc-${{ matrix.os }}-${{ matrix.python-version }}
path: ".coverage.*"
include-hidden-files: true
if-no-files-found: error
coverage:
if: always()
runs-on: "ubuntu-latest"
needs:
- tests
- mypyc_test
- detection_coverage
steps:
- name: "Checkout repository"
uses: "actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683"
- name: "Setup Python"
uses: "actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065"
with:
python-version: "3.x"
- name: "Install coverage"
run: "python -m pip install --upgrade coverage"
- name: "Download artifact"
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093
with:
pattern: coverage-data*
merge-multiple: true
- name: "Combine & check coverage"
run: |
python -m coverage combine
python -m coverage html --skip-covered --skip-empty
python -m coverage report --ignore-errors --show-missing --fail-under=92
- name: "Upload report"
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
with:
name: coverage-report
path: htmlcov
performance:
name: ⚡ Performance Test
runs-on: ubuntu-latest
needs: coverage
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3'
- name: Install dependencies
run: pip install nox
- name: Performance Measurement
run: nox -s performance
python-charset-normalizer-3.4.2/.github/workflows/codeql.yml

# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
#
# ******** NOTE ********
# We have attempted to detect the languages in your repository. Please check
# the `language` matrix defined below to confirm you have the correct set of
# supported CodeQL languages.
#
name: "CodeQL"
permissions:
contents: read
on:
push:
branches: [ "master", "2.1.x" ]
pull_request:
branches: [ "master", "2.1.x" ]
schedule:
- cron: '39 1 * * 6'
jobs:
analyze:
name: Analyze
runs-on: ubuntu-latest
permissions:
actions: read
contents: read
security-events: write
strategy:
fail-fast: false
matrix:
language: [ 'python' ]
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@28deaeda66b76a05916b6923827895f2b14ab387 # v3.28.16
with:
languages: ${{ matrix.language }}
# Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java).
# If this step fails, then you should remove it and run the build manually (see below)
- name: Autobuild
uses: github/codeql-action/autobuild@28deaeda66b76a05916b6923827895f2b14ab387 # v3.28.16
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@28deaeda66b76a05916b6923827895f2b14ab387 # v3.28.16
with:
category: "/language:${{matrix.language}}"
python-charset-normalizer-3.4.2/.github/workflows/scorecards.yml

# This workflow uses actions that are not certified by GitHub. They are provided
# by a third-party and are governed by separate terms of service, privacy
# policy, and support documentation.
name: Scorecard supply-chain security
on:
# For Branch-Protection check. Only the default branch is supported. See
# https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
branch_protection_rule:
# To guarantee Maintained check is occasionally updated. See
# https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
schedule:
- cron: '20 7 * * 2'
push:
branches: ["master"]
# Declare default permissions as read only.
permissions: read-all
jobs:
analysis:
name: Scorecard analysis
runs-on: ubuntu-latest
permissions:
# Needed to upload the results to code-scanning dashboard.
security-events: write
# Needed to publish results and get a badge (see publish_results below).
id-token: write
contents: read
actions: read
steps:
- name: "Checkout code"
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
persist-credentials: false
- name: "Run analysis"
uses: ossf/scorecard-action@f49aabe0b5af0936a0987cfb85d86b75731b0186 # v2.4.1
with:
results_file: results.sarif
results_format: sarif
# (Optional) "write" PAT token. Uncomment the `repo_token` line below if:
# - you want to enable the Branch-Protection check on a *public* repository, or
# - you are installing Scorecards on a *private* repository
# To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat.
# repo_token: ${{ secrets.SCORECARD_TOKEN }}
# Public repositories:
# - Publish results to OpenSSF REST API for easy access by consumers
# - Allows the repository to include the Scorecard badge.
# - See https://github.com/ossf/scorecard-action#publishing-results.
# For private repositories:
# - `publish_results` will always be set to `false`, regardless
# of the value entered here.
publish_results: true
# Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
# format to the repository Actions tab.
- name: "Upload artifact"
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: SARIF file
path: results.sarif
retention-days: 5
# Upload the results to GitHub's code scanning dashboard.
- name: "Upload to code-scanning"
uses: github/codeql-action/upload-sarif@28deaeda66b76a05916b6923827895f2b14ab387 # v3.28.16
with:
sarif_file: results.sarif
python-charset-normalizer-3.4.2/.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
.idea/
char-dataset/
python-charset-normalizer-3.4.2/.pre-commit-config.yaml

exclude: 'docs/|data/|tests/'
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: check-yaml
- id: debug-statements
- id: end-of-file-fixer
- id: trailing-whitespace
- repo: https://github.com/asottile/pyupgrade
rev: v3.19.1
hooks:
- id: pyupgrade
args: [ --py37-plus, --keep-runtime-typing ]
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.9.1
hooks:
# Run the linter.
- id: ruff
args: [ --fix ]
# Run the formatter.
- id: ruff-format
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.14.1
hooks:
- id: mypy
args: [ --check-untyped-defs ]
exclude: 'tests/|noxfile.py|setup.py|bin/'
python-charset-normalizer-3.4.2/.readthedocs.yaml

version: 2
build:
os: ubuntu-22.04
tools:
python: "3.10"
# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: docs/conf.py
# If using Sphinx, optionally build your docs in additional formats such as PDF
# formats:
# - pdf
# Optionally declare the Python requirements required to build your docs
python:
install:
- requirements: docs/requirements.txt
python-charset-normalizer-3.4.2/CHANGELOG.md

# Changelog
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [3.4.2](https://github.com/Ousret/charset_normalizer/compare/3.4.1...3.4.2) (2025-05-02)
### Fixed
- Addressed the DeprecationWarning in our CLI regarding `argparse.FileType` by backporting the target class into the package. (#591)
- Improved the overall reliability of the detector with CJK Ideographs. (#605) (#587)
### Changed
- Optional mypyc compilation upgraded to version 1.15 for Python >= 3.8
## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...3.4.1) (2024-12-24)
### Changed
- Project metadata are now stored using `pyproject.toml` instead of `setup.cfg` using setuptools as the build backend.
- Enforce annotation delayed loading for a simpler and consistent types in the project.
- Optional mypyc compilation upgraded to version 1.14 for Python >= 3.8
### Added
- pre-commit configuration.
- noxfile.
### Removed
- `build-requirements.txt` as per using `pyproject.toml` native build configuration.
- `bin/integration.py` and `bin/serve.py` in favor of downstream integration test (see noxfile).
- `setup.cfg` in favor of `pyproject.toml` metadata configuration.
- Unused `utils.range_scan` function.
### Fixed
- Converting content to Unicode bytes may insert `utf_8` instead of preferred `utf-8`. (#572)
- Deprecation warning "'count' is passed as positional argument" when converting to Unicode bytes on Python 3.13+
## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)
### Added
- Argument `--no-preemptive` in the CLI to prevent the detector to search for hints.
- Support for Python 3.13 (#512)
### Fixed
- Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything else than a CharsetMatch.
- Improved the general reliability of the detector based on user feedbacks. (#520) (#509) (#498) (#407) (#537)
- Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)
## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
### Fixed
- Unintentional memory usage regression when using a large payload that matches several encodings (#376)
- Regression on some detection case showcased in the documentation (#371)
### Added
- Noise (md) probe that identify malformed arabic representation due to the presence of letters in isolated form (credit to my wife)
## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
### Changed
- Optional mypyc compilation upgraded to version 1.6.1 for Python >= 3.8
- Improved the general detection reliability based on reports from the community
## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
### Added
- Allow to execute the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer`
- Support for 9 forgotten encoding that are supported by Python but unlisted in `encoding.aliases` as they have no alias (#323)
### Removed
- (internal) Redundant utils.is_ascii function and unused function is_private_use_only
- (internal) charset_normalizer.assets is moved inside charset_normalizer.constant
### Changed
- (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
### Fixed
- Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
### Changed
- Typehint for function `from_path` no longer enforce `PathLike` as its first argument
- Minor improvement over the global detection reliability
### Added
- Introduce function `is_binary` that relies on main capabilities, and optimized to detect binaries
- Propagate `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp` that allow a deeper control over the detection (default True)
- Explicit support for Python 3.12
### Fixed
- Edge case detection failure where a file would contain 'very-long' camel cased word (Issue #289)
## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
### Added
- Argument `should_rename_legacy` for legacy function `detect` and disregard any new arguments without errors (PR #262)
### Removed
- Support for Python 3.6 (PR #260)
### Changed
- Optional speedup provided by mypy/c 1.0.1
## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...3.0.1) (2022-11-18)
### Fixed
- Multi-bytes cutter/chunk generator did not always cut correctly (PR #233)
### Changed
- Speedup provided by mypy/c 0.990 on Python >= 3.7
## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20)
### Added
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
### Changed
- Build with static metadata using 'build' frontend
- Make the language detection stricter
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
### Fixed
- CLI with opt --normalize fail when using full path for files
- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
- Sphinx warnings when generating the documentation
### Removed
- Coherence detector no longer return 'Simple English' instead return 'English'
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
- Breaking: Method `first()` and `best()` from CharsetMatch
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
- Breaking: Top-level function `normalize`
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
- Support for the backport `unicodedata2`
## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
### Added
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
### Changed
- Build with static metadata using 'build' frontend
- Make the language detection stricter
### Fixed
- CLI with opt --normalize fail when using full path for files
- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
### Removed
- Coherence detector no longer return 'Simple English' instead return 'English'
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
### Added
- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
### Removed
- Breaking: Method `first()` and `best()` from CharsetMatch
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
### Fixed
- Sphinx warnings when generating the documentation
## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
### Changed
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
### Removed
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
- Breaking: Top-level function `normalize`
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
- Support for the backport `unicodedata2`
## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
### Deprecated
- Function `normalize` scheduled for removal in 3.0
### Changed
- Removed useless call to decode in fn is_unprintable (#206)
### Fixed
- Third-party library (i18n xgettext) crashing not recognizing utf_8 (PEP 263) with underscore from [@aleksandernovikov](https://github.com/aleksandernovikov) (#204)
## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
### Added
- Output the Unicode table version when running the CLI with `--version` (PR #194)
### Changed
- Re-use decoded buffer for single byte character sets from [@nijel](https://github.com/nijel) (PR #175)
- Fixing some performance bottlenecks from [@deedy5](https://github.com/deedy5) (PR #183)
### Fixed
- Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175)
- CLI default threshold aligned with the API threshold from [@oleksandr-kuzmenko](https://github.com/oleksandr-kuzmenko) (PR #181)
### Removed
- Support for Python 3.5 (PR #192)
### Deprecated
- Use of backport unicodedata from `unicodedata2` as Python is quickly catching up, scheduled for removal in 3.0 (PR #194)
## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
### Fixed
- ASCII miss-detection on rare cases (PR #170)
## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
### Added
- Explicit support for Python 3.11 (PR #164)
### Changed
- The logging behavior have been completely reviewed, now using only TRACE and DEBUG levels (PR #163 #165)
## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)
### Fixed
- Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
### Changed
- Skipping the language-detection (CD) on ASCII (PR #155)
## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
### Changed
- Moderating the logging impact (since 2.0.8) for specific environments (PR #147)
### Fixed
- Wrong logging level applied when setting kwarg `explain` to True (PR #146)
## [2.0.8](https://github.com/Ousret/charset_normalizer/compare/2.0.7...2.0.8) (2021-11-24)
### Changed
- Improvement over Vietnamese detection (PR #126)
- MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
- Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
- call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
- Code style as refactored by Sourcery-AI (PR #131)
- Minor adjustment on the MD around european words (PR #133)
- Remove and replace SRTs from assets / tests (PR #139)
- Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
- Setting kwarg `explain` to True will add provisionally (bounded to function lifespan) a specific stream handler (PR #135)
### Fixed
- Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
- Avoid using too insignificant chunk (PR #137)
### Added
- Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)
- Add `CHANGELOG.md` entries, format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (PR #141)
## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
### Added
- Add support for Kazakh (Cyrillic) language detection (PR #109)
### Changed
- Further, improve inferring the language from a given single-byte code page (PR #112)
- Vainly trying to leverage PEP263 when PEP3120 is not supported (PR #116)
- Refactoring for potential performance improvements in loops from [@adbar](https://github.com/adbar) (PR #113)
- Various detection improvement (MD+CD) (PR #117)
### Removed
- Remove redundant logging entry about detected language(s) (PR #115)
### Fixed
- Fix a minor inconsistency between Python 3.5 and other versions regarding language detection (PR #117 #102)
## [2.0.6](https://github.com/Ousret/charset_normalizer/compare/2.0.5...2.0.6) (2021-09-18)
### Fixed
- Unforeseen regression with the loss of the backward-compatibility with some older minor of Python 3.5.x (PR #100)
- Fix CLI crash when using --minimal output in certain cases (PR #103)
### Changed
- Minor improvement to the detection efficiency (less than 1%) (PR #106 #101)
## [2.0.5](https://github.com/Ousret/charset_normalizer/compare/2.0.4...2.0.5) (2021-09-14)
### Changed
- The project now comply with: flake8, mypy, isort and black to ensure a better overall quality (PR #81)
- The BC-support with v1.x was improved, the old staticmethods are restored (PR #82)
- The Unicode detection is slightly improved (PR #93)
- Add syntax sugar \_\_bool\_\_ for results CharsetMatches list-container (PR #91)
### Removed
- The project no longer raise warning on tiny content given for detection, will be simply logged as warning instead (PR #92)
### Fixed
- In some rare case, the chunks extractor could cut in the middle of a multi-byte character and could mislead the mess detection (PR #95)
- Some rare 'space' characters could trip up the UnprintablePlugin/Mess detection (PR #96)
- The MANIFEST.in was not exhaustive (PR #78)
## [2.0.4](https://github.com/Ousret/charset_normalizer/compare/2.0.3...2.0.4) (2021-07-30)
### Fixed
- The CLI no longer raise an unexpected exception when no encoding has been found (PR #70)
- Fix accessing the 'alphabets' property when the payload contains surrogate characters (PR #68)
- The logger could mislead (explain=True) on detected languages and the impact of one MBCS match (PR #72)
- Submatch factoring could be wrong in rare edge cases (PR #72)
- Multiple files given to the CLI were ignored when publishing results to STDOUT. (After the first path) (PR #72)
- Fix line endings from CRLF to LF for certain project files (PR #67)
### Changed
- Adjust the MD to lower the sensitivity, thus improving the global detection reliability (PR #69 #76)
- Allow fallback on specified encoding if any (PR #71)
## [2.0.3](https://github.com/Ousret/charset_normalizer/compare/2.0.2...2.0.3) (2021-07-16)
### Changed
- Part of the detection mechanism has been improved to be less sensitive, resulting in more accurate detection results. Especially ASCII. (PR #63)
- According to the community wishes, the detection will fall back on ASCII or UTF-8 in a last-resort case. (PR #64)
## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
### Fixed
- Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
### Changed
- Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
## [2.0.1](https://github.com/Ousret/charset_normalizer/compare/2.0.0...2.0.1) (2021-07-13)
### Fixed
- Make it work where there isn't a filesystem available, dropping assets frequencies.json. Report from [@sethmlarson](https://github.com/sethmlarson). (PR #55)
- Using explain=False permanently disable the verbose output in the current runtime (PR #47)
- One log entry (language target preemptive) was not show in logs when using explain=True (PR #47)
- Fix undesired exception (ValueError) on getitem of instance CharsetMatches (PR #52)
### Changed
- Public function normalize default args values were not aligned with from_bytes (PR #53)
### Added
- You may now use charset aliases in cp_isolation and cp_exclusion arguments (PR #47)
## [2.0.0](https://github.com/Ousret/charset_normalizer/compare/1.4.1...2.0.0) (2021-07-02)
### Changed
- 4x to 5 times faster than the previous 1.4.0 release. At least 2x faster than Chardet.
- Accent has been made on UTF-8 detection, should perform rather instantaneous.
- The backward compatibility with Chardet has been greatly improved. The legacy detect function returns an identical charset name whenever possible.
- The detection mechanism has been slightly improved, now Turkish content is detected correctly (most of the time)
- The program has been rewritten to ease the readability and maintainability. (+Using static typing)+
- utf_7 detection has been reinstated.
### Removed
- This package no longer require anything when used with Python 3.5 (Dropped cached_property)
- Removed support for these languages: Catalan, Esperanto, Kazakh, Baque, Volapük, Azeri, Galician, Nynorsk, Macedonian, and Serbocroatian.
- The exception hook on UnicodeDecodeError has been removed.
### Deprecated
- Methods coherence_non_latin, w_counter, chaos_secondary_pass of the class CharsetMatch are now deprecated and scheduled for removal in v3.0
### Fixed
- The CLI output used the relative path of the file(s). Should be absolute.
## [1.4.1](https://github.com/Ousret/charset_normalizer/compare/1.4.0...1.4.1) (2021-05-28)
### Fixed
- Logger configuration/usage no longer conflict with others (PR #44)
## [1.4.0](https://github.com/Ousret/charset_normalizer/compare/1.3.9...1.4.0) (2021-05-21)
### Removed
- Using standard logging instead of using the package loguru.
- Dropping nose test framework in favor of the maintained pytest.
- Choose to not use dragonmapper package to help with gibberish Chinese/CJK text.
- Require cached_property only for Python 3.5 due to constraint. Dropping for every other interpreter version.
- Stop support for UTF-7 that does not contain a SIG.
- Dropping PrettyTable, replaced with pure JSON output in CLI.
### Fixed
- BOM marker in a CharsetNormalizerMatch instance could be False in rare cases even if obviously present. Due to the sub-match factoring process.
- Not searching properly for the BOM when trying utf32/16 parent codec.
### Changed
- Improving the package final size by compressing frequencies.json.
- Huge improvement over the largest payloads.
### Added
- CLI now produces JSON consumable output.
- Return ASCII if given sequences fit. Given reasonable confidence.
## [1.3.9](https://github.com/Ousret/charset_normalizer/compare/1.3.8...1.3.9) (2021-05-13)
### Fixed
- In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload (PR #40)
## [1.3.8](https://github.com/Ousret/charset_normalizer/compare/1.3.7...1.3.8) (2021-05-12)
### Fixed
- Empty given payload for detection may cause an exception if trying to access the `alphabets` property. (PR #39)
## [1.3.7](https://github.com/Ousret/charset_normalizer/compare/1.3.6...1.3.7) (2021-05-12)
### Fixed
- The legacy detect function should return UTF-8-SIG if sig is present in the payload. (PR #38)
## [1.3.6](https://github.com/Ousret/charset_normalizer/compare/1.3.5...1.3.6) (2021-02-09)
### Changed
- Amend the previous release to allow prettytable 2.0 (PR #35)
## [1.3.5](https://github.com/Ousret/charset_normalizer/compare/1.3.4...1.3.5) (2021-02-08)
### Fixed
- Fix error while using the package with a python pre-release interpreter (PR #33)
### Changed
- Dependencies refactoring, constraints revised.
### Added
- Add python 3.9 and 3.10 to the supported interpreters
python-charset-normalizer-3.4.2/CODE_OF_CONDUCT.md

# Contributor Covenant Code of Conduct
## Our Pledge
In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to making participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, sex characteristics, gender identity and expression,
level of experience, education, socio-economic status, nationality, personal
appearance, race, religion, or sexual identity and orientation.
## Our Standards
Examples of behavior that contributes to creating a positive environment
include:
* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members
Examples of unacceptable behavior by participants include:
* The use of sexualized language or imagery and unwelcome sexual attention or
advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Our Responsibilities
Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.
Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.
## Scope
This Code of Conduct applies both within project spaces and in public spaces
when an individual is representing the project or its community. Examples of
representing a project or community include using an official project e-mail
address, posting via an official social media account, or acting as an appointed
representative at an online or offline event. Representation of a project may be
further defined and clarified by project maintainers.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at ahmed.tahri@cloudnursery.dev. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.
Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
[homepage]: https://www.contributor-covenant.org
For answers to common questions about this code of conduct, see
https://www.contributor-covenant.org/faq
python-charset-normalizer-3.4.2/CONTRIBUTING.md

# Contribution Guidelines
If you’re reading this, you’re probably interested in contributing to Charset Normalizer.
Thank you very much! Open source projects live and die based on the support they receive from others,
and the fact that you’re even considering contributing to this project is very generous of you.
## Questions
The GitHub issue tracker is for *bug reports* and *feature requests*.
Questions are allowed only when no answer is provided in the docs.
## Good Bug Reports
Please be aware of the following things when filing bug reports:
1. Avoid raising duplicate issues. *Please* use the GitHub issue search feature
to check whether your bug report or feature request has been mentioned in
the past. Duplicate bug reports and feature requests are a huge maintenance
burden on the limited resources of the project. If it is clear from your
report that you would have struggled to find the original, that's ok, but
if searching for a selection of words in your issue title would have found
the duplicate then the issue will likely be closed extremely abruptly.
2. When filing bug reports about exceptions or tracebacks, please include the
*complete* traceback. Partial tracebacks, or just the exception text, are
not helpful. Issues that do not contain complete tracebacks may be closed
without warning.
3. Make sure you provide a suitable amount of information to work with. This
means you should provide:
- Guidance on **how to reproduce the issue**. Ideally, this should be a
*small* code sample that can be run immediately by the maintainers.
Failing that, let us know what you're doing, how often it happens, what
environment you're using, etc. Be thorough: it prevents us needing to ask
further questions.
- Tell us **what you expected to happen**. When we run your example code,
what are we expecting to happen? What does "success" look like for your
code?
- Tell us **what actually happens**. It's not helpful for you to say "it
doesn't work" or "it fails". Tell us *how* it fails: do you get an
exception? A None answer? How was the actual result
different from your expected result?
- Tell us **what version of Charset Normalizer you're using**, and
**how you installed it**. Different versions of Charset Normalizer behave
differently and have different bugs.
If you do not provide all of these things, it will take us much longer to
fix your problem. If we ask you to clarify these, and you never respond, we
will close your issue without fixing it.
## What PR are we accepting?
Mostly anything, from cosmetic changes to detection-mechanism improvements, on the sole condition that you do not break
backward compatibility.
## What PR may be doomed?
- Add support for a Python unsupported charset/encoding
> If you looked carefully at the project, you would see that it aims to be generic whenever possible. So adding a specific prober is out of the question.
- Of course, if the CI/CD are failing
> Getting the discussion started often means doing the minimum effort to get it green! (Be reassured, maintainers will look into it, given a reasonable amount of time)
- Submitting a PR without any description OR viable commit description
> This is obvious; maintainers need to understand as quickly as possible what you are trying to submit without putting in too much effort.
## How to run tests locally?
It is essential that you run the mandatory checks prior to any submission.
```shell
pip install nox
nox -s test
nox -s lint
nox -s coverage
```
python-charset-normalizer-3.4.2/LICENSE

MIT License
Copyright (c) 2025 TAHRI Ahmed R.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
python-charset-normalizer-3.4.2/MANIFEST.in

include LICENSE README.md CHANGELOG.md src/charset_normalizer/py.typed dev-requirements.txt SECURITY.md noxfile.py
recursive-include data *.md
recursive-include data *.txt
recursive-include docs *
recursive-include tests *
python-charset-normalizer-3.4.2/README.md
# Charset Detection, for Everyone 👋

> The Real First Universal Charset Detector

*Featured Packages · In other language (unofficial port - by the community)*
> A library that helps you read text from an unknown charset encoding.
> Motivated by `chardet`, I'm trying to resolve the issue by taking a new approach.
> All IANA character set names for which the Python core library provides codecs are supported.
>>>>> 👉 Try Me Online Now, Then Adopt Me 👈 <<<<<
This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
|--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:|
| `Fast` | ❌ | ✅ | ✅ |
| `Universal**` | ❌ | ✅ | ❌ |
| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
| `Native Python` | ✅ | ✅ | ❌ |
| `Detect spoken language` | ❌ | ✅ | N/A |
| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
| `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB |
| `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |

*\*\* : They clearly use encoding-specific code, even if it covers most of the encodings in use*
## ⚡ Performance
This package offers better performance than its counterpart Chardet. Here are some numbers.
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
|-----------------------------------------------|:--------:|:------------------:|:------------------:|
| [chardet](https://github.com/chardet/chardet) | 86 % | 63 ms | 16 file/sec |
| charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
| Package | 99th percentile | 95th percentile | 50th percentile |
|-----------------------------------------------|:---------------:|:---------------:|:---------------:|
| [chardet](https://github.com/chardet/chardet) | 265 ms | 71 ms | 7 ms |
| charset-normalizer | 100 ms | 50 ms | 5 ms |
_updated as of december 2024 using CPython 3.12_
Chardet's performance on larger files (1MB+) is very poor. Expect a huge difference on large payloads.
> Stats are generated using 400+ files with default parameters. For more details on the files used, see the GHA workflows.
> And yes, these results might change at any time. The dataset can be updated to include more files.
> The actual delays heavily depend on your CPU capabilities. The factors should remain the same.
> Keep in mind that the stats are generous and that Chardet's accuracy versus ours is measured using Chardet's initial capability
> (e.g. supported encodings). Challenge them if you want.
## ✨ Installation
Using pip:
```sh
pip install charset-normalizer -U
```
## 🚀 Basic Usage
### CLI
This package comes with a CLI.
```
usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
file [file ...]
The Real First Universal Charset Detector. Discover originating encoding used
on text file. Normalize text to unicode.
positional arguments:
files File(s) to be analysed
optional arguments:
-h, --help show this help message and exit
-v, --verbose Display complementary information about file if any.
Stdout will contain logs about the detection process.
-a, --with-alternative
Output complementary possibilities if any. Top-level
JSON WILL be a list.
-n, --normalize Permit to normalize input file. If not set, program
does not write anything.
-m, --minimal Only output the charset detected to STDOUT. Disabling
JSON output.
-r, --replace Replace file when trying to normalize it instead of
creating a new one.
-f, --force Replace file without asking if you are sure, use this
flag with caution.
-t THRESHOLD, --threshold THRESHOLD
Define a custom maximum amount of chaos allowed in
decoded content. 0. <= chaos <= 1.
--version Show version information and exit.
```
```bash
normalizer ./data/sample.1.fr.srt
```
or
```bash
python -m charset_normalizer ./data/sample.1.fr.srt
```
🎉 Since version 1.4.0, the CLI produces an easily usable stdout result in JSON format.
```json
{
"path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
"encoding": "cp1252",
"encoding_aliases": [
"1252",
"windows_1252"
],
"alternative_encodings": [
"cp1254",
"cp1256",
"cp1258",
"iso8859_14",
"iso8859_15",
"iso8859_16",
"iso8859_3",
"iso8859_9",
"latin_1",
"mbcs"
],
"language": "French",
"alphabets": [
"Basic Latin",
"Latin-1 Supplement"
],
"has_sig_or_bom": false,
"chaos": 0.149,
"coherence": 97.152,
"unicode_path": null,
"is_preferred": true
}
```
### Python
*Just print out normalized text*
```python
from charset_normalizer import from_path
results = from_path('./my_subtitle.srt')
print(str(results.best()))
```
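Beyond printing the decoded text, the returned match object exposes the guessed metadata. A minimal sketch, using the documented `CharsetMatch` attributes (the sample file path is hypothetical):

```python
from charset_normalizer import from_path

best_guess = from_path('./my_subtitle.srt').best()

if best_guess is not None:
    print(best_guess.encoding)  # e.g. 'cp1252' — the guessed charset
    print(best_guess.language)  # e.g. 'French' — the most probable language
    print(str(best_guess))      # the payload decoded to a Unicode string
```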
*Upgrade your code without effort*
```python
from charset_normalizer import detect
```
The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) BC result possible.
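For instance, a minimal sketch of the drop-in usage (the returned mapping mirrors chardet's `encoding`, `language` and `confidence` keys; the exact values depend on your input):

```python
from charset_normalizer import detect

# Same call shape as chardet.detect(): give it raw bytes, get a dict back.
raw = "Comment ça va ? Très bien, merci.".encode("cp1252")
print(detect(raw))
# → a dict such as {'encoding': ..., 'language': ..., 'confidence': ...}
```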
See the docs for advanced usage: [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
## 😇 Why
When I started using Chardet, I noticed that it was not suited to my expectations, and I wanted to propose a
reliable alternative using a completely different method. Also! I never back down on a good challenge!
I **don't care** about the **originating charset** encoding, because **two different tables** can
produce **two identical rendered strings.**
What I want is to get readable text, the best I can.
In a way, **I'm brute-forcing text decoding.** How cool is that? 😎
Don't confuse the package **ftfy** with charset-normalizer or chardet. ftfy's goal is to repair broken Unicode strings, whereas charset-normalizer's is to convert a raw file in an unknown encoding to Unicode.
## 🍰 How
- Discard all charset encoding tables that could not fit the binary content.
- Measure the noise, or mess, once the content is opened (by chunks) with a corresponding charset encoding.
- Extract matches with the lowest mess detected.
- Additionally, we measure coherence / probe for a language.
**Wait a minute**, what are noise/mess and coherence according to **YOU?**
*Noise:* I opened hundreds of text files, **written by humans**, with the wrong encoding table. **I observed**, then
**I established** some ground rules about **what is obvious** when **it seems like** a mess (i.e. defining noise in rendered text).
I know that my interpretation of what is noise is probably incomplete; feel free to contribute in order to
improve or rewrite it.
*Coherence:* For each language there is on Earth, we have computed ranked letter-appearance frequencies (the best we can). So I thought
that this intel is worth something here. I use those records against the decoded text to check if I can detect intelligent design.
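Both measures are exposed on each match, so you can see them on a concrete payload. A small illustration; the sample text is arbitrary, the attribute names mirror the CLI JSON fields shown earlier, and the exact figures (and even the reported code page label) will vary:
```python
from charset_normalizer import from_bytes

payload = (
    "Образованието трябва да бъде насочено към цялостно развитие "
    "на човешката личност и засилване на уважението към правата на човека."
).encode("cp1251")

best_guess = from_bytes(payload).best()
if best_guess is not None:
    print(best_guess.encoding)   # likely cp1251 or an equivalent Cyrillic code page
    print(best_guess.chaos)      # the noise measure described above (lower is better)
    print(best_guess.coherence)  # the coherence measure (higher means a language was recognized)
```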
## ⚡ Known limitations
- Language detection is unreliable when the text contains two or more languages sharing identical letters (e.g. HTML (English tags) + Turkish content (sharing Latin characters)).
- Every charset detector heavily depends on sufficient content. In common cases, do not bother running detection on very tiny content.
## ⚠️ About Python EOLs
**If you are running:**
- Python >=2.7,<3.5: Unsupported
- Python 3.5: charset-normalizer < 2.1
- Python 3.6: charset-normalizer < 3.1
- Python 3.7: charset-normalizer < 4.0
Upgrade your Python interpreter as soon as possible.
## 👤 Contributing
Contributions, issues and feature requests are very much welcome.
Feel free to check the [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
## 📝 License
Copyright © [Ahmed TAHRI @Ousret](https://github.com/Ousret).
This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
Character frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
## 💼 For Enterprise
Professional support for charset-normalizer is available as part of the [Tidelift
Subscription][1]. Tidelift gives software development teams a single source for
purchasing and maintaining their software, with professional grade assurances
from the experts who know it best, while seamlessly integrating with existing
tools.
[1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
[](https://www.bestpractices.dev/projects/7297)
python-charset-normalizer-3.4.2/SECURITY.md 0000664 0000000 0000000 00000000311 15005045421 0020430 0 ustar 00root root 0000000 0000000 # Security Disclosures
To report a security vulnerability, please use the [Tidelift security contact](https://tidelift.com/security).
Tidelift will coordinate the fix and disclosure with maintainers.
python-charset-normalizer-3.4.2/bin/ 0000775 0000000 0000000 00000000000 15005045421 0017414 5 ustar 00root root 0000000 0000000 python-charset-normalizer-3.4.2/bin/bc.py 0000664 0000000 0000000 00000006174 15005045421 0020362 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import argparse
from glob import glob
from os.path import isdir
from sys import argv
from chardet import detect as chardet_detect
from charset_normalizer import detect as tbt_detect
from charset_normalizer.utils import iana_name
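# Backward-compatibility checker: runs both chardet and charset-normalizer over every file of the
# char-dataset corpus (expected to be cloned at ./char-dataset) and reports an estimated BC ratio.
# calc_equivalence() measures the share of identical characters when decoding with both verdicts.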
def calc_equivalence(content: bytes, cp_a: str, cp_b: str):
try:
str_a = content.decode(cp_a)
str_b = content.decode(cp_b)
except UnicodeDecodeError:
return 0.0
character_count = len(str_a)
diff_character_count = sum(chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b))
return 1.0 - (diff_character_count / character_count)
def cli_bc(arguments: list[str]):
parser = argparse.ArgumentParser(
description="BC script checker for Charset-Normalizer with Chardet"
)
parser.add_argument(
"-c",
"--coverage",
action="store",
default=85,
type=int,
dest="coverage",
help="Define the minimum acceptable coverage to succeed",
)
args = parser.parse_args(arguments)
if not isdir("./char-dataset"):
print(
"This script require https://github.com/Ousret/char-dataset to be cloned on package root directory"
)
exit(1)
success_count = 0
total_count = 0
for tbt_path in sorted(glob("./char-dataset/**/*.*")):
total_count += 1
with open(tbt_path, "rb") as fp:
content = fp.read()
chardet_result = chardet_detect(content)
chardet_encoding = chardet_result["encoding"]
charset_normalizer_result = tbt_detect(content)
charset_normalizer_encoding = charset_normalizer_result["encoding"]
if [chardet_encoding, charset_normalizer_encoding].count(None) == 1:
print(
f"⚡⚡ '{tbt_path}' (BC-Break) New('{charset_normalizer_encoding}') vs Legacy('{chardet_encoding}')"
)
continue
if charset_normalizer_encoding == chardet_encoding:
success_count += 1
print(f"✅✅ '{tbt_path}' (BC)")
continue
if (chardet_encoding is None and charset_normalizer_encoding is None) or (
iana_name(chardet_encoding, False)
== iana_name(charset_normalizer_encoding, False)
):
success_count += 1
print(f"✅✅ '{tbt_path}' (BC)")
continue
calc_eq = calc_equivalence(
content, chardet_encoding, charset_normalizer_encoding
)
if calc_eq >= 0.98:
success_count += 1
print(
f"️✅ ️'{tbt_path}' (got '{charset_normalizer_encoding}' but "
f"eq {chardet_encoding} WITH {round(calc_eq * 100.0, 3)} %)"
)
continue
print(
f"⚡⚡ '{tbt_path}' (BC-Break) New('{charset_normalizer_encoding}') vs Legacy('{chardet_encoding}')"
)
success_ratio = round(success_count / total_count, 2) * 100.0
print(f"Total EST BC = {success_ratio} % ({success_count} / {total_count} files)")
return 0 if success_ratio >= args.coverage else 1
if __name__ == "__main__":
exit(cli_bc(argv[1:]))
python-charset-normalizer-3.4.2/bin/coverage.py 0000664 0000000 0000000 00000005574 15005045421 0021574 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import argparse
from glob import glob
from os import sep
from os.path import isdir
from sys import argv
from charset_normalizer import __version__, from_path
from charset_normalizer.utils import iana_name
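# Detection-coverage checker: runs charset-normalizer over the char-dataset corpus (cloned at
# ./char-dataset), where each file's parent directory names the expected encoding, and fails
# if the estimated coverage drops below the requested threshold.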
def calc_equivalence(content: bytes, cp_a: str, cp_b: str):
str_a = content.decode(cp_a)
str_b = content.decode(cp_b)
character_count = len(str_a)
diff_character_count = sum(chr_a != chr_b for chr_a, chr_b in zip(str_a, str_b))
return 1.0 - (diff_character_count / character_count)
def cli_coverage(arguments: list[str]):
parser = argparse.ArgumentParser(
description="Embedded detection success coverage script checker for Charset-Normalizer"
)
parser.add_argument(
"-p",
"--with-preemptive",
action="store_true",
default=False,
dest="preemptive",
help="Enable the preemptive scan behaviour during coverage check",
)
parser.add_argument(
"-c",
"--coverage",
action="store",
default=90,
type=int,
dest="coverage",
help="Define the minimum acceptable coverage to succeed",
)
args = parser.parse_args(arguments)
if not isdir("./char-dataset"):
print(
"This script require https://github.com/Ousret/char-dataset to be cloned on package root directory"
)
exit(1)
print(f"> using charset-normalizer {__version__}")
success_count = 0
total_count = 0
for tbt_path in sorted(glob("./char-dataset/**/*.*")):
expected_encoding = tbt_path.split(sep)[-2]
total_count += 1
results = from_path(tbt_path, preemptive_behaviour=args.preemptive)
if expected_encoding == "None" and len(results) == 0:
print(f"✅✅ '{tbt_path}'")
success_count += 1
continue
if len(results) == 0:
print(f"⚡⚡ '{tbt_path}' (nothing)")
continue
result = results.best()
if (
expected_encoding in result.could_be_from_charset
or iana_name(expected_encoding) in result.could_be_from_charset
):
print(f"✅✅ '{tbt_path}'")
success_count += 1
continue
calc_eq = calc_equivalence(result.raw, expected_encoding, result.encoding)
if calc_eq >= 0.98:
success_count += 1
print(
f"️✅ ️'{tbt_path}' (got '{result.encoding}' but equivalence {round(calc_eq * 100.0, 3)} %)"
)
continue
print(f"⚡ '{tbt_path}' (got '{result.encoding}')")
success_ratio = round(success_count / total_count, 2) * 100.0
print(
f"Total EST coverage = {success_ratio} % ({success_count} / {total_count} files)"
)
return 0 if success_ratio >= args.coverage else 1
if __name__ == "__main__":
exit(cli_coverage(argv[1:]))
python-charset-normalizer-3.4.2/bin/performance.py 0000664 0000000 0000000 00000012724 15005045421 0022275 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import argparse
from glob import glob
from math import ceil
from os.path import isdir
from statistics import mean, stdev
from sys import argv
from time import perf_counter_ns
from chardet import detect as chardet_detect
from charset_normalizer import detect
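# Performance comparison: benchmarks chardet against charset-normalizer on the char-dataset corpus,
# prints per-file timings plus percentile statistics, and exits non-zero unless charset-normalizer
# is faster both on average and at the 99th percentile.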
def calc_percentile(data, percentile):
n = len(data)
p = n * percentile / 100
sorted_data = sorted(data)
return sorted_data[int(p)] if p.is_integer() else sorted_data[int(ceil(p)) - 1]
def performance_compare(arguments):
parser = argparse.ArgumentParser(
description="Performance CI/CD check for Charset-Normalizer"
)
parser.add_argument(
"-s",
"--size-increase",
action="store",
default=1,
type=int,
dest="size_coeff",
help="Apply artificial size increase to challenge the detection mechanism further",
)
args = parser.parse_args(arguments)
if not isdir("./char-dataset"):
print(
"This script require https://github.com/Ousret/char-dataset to be cloned on package root directory"
)
exit(1)
chardet_results = []
charset_normalizer_results = []
file_list = sorted(glob("./char-dataset/**/*.*"))
total_files = len(file_list)
for idx, tbt_path in enumerate(file_list):
with open(tbt_path, "rb") as fp:
content = fp.read() * args.size_coeff
before = perf_counter_ns()
chardet_detect(content)
chardet_time = round((perf_counter_ns() - before) / 1000000000, 5)
chardet_results.append(chardet_time)
before = perf_counter_ns()
detect(content)
charset_normalizer_time = round((perf_counter_ns() - before) / 1000000000, 5)
charset_normalizer_results.append(charset_normalizer_time)
charset_normalizer_time = charset_normalizer_time or 0.000005
cn_faster = (chardet_time / charset_normalizer_time) * 100 - 100
print(
f"{idx + 1:>3}/{total_files} {tbt_path:<82} C:{chardet_time:.5f} "
f"CN:{charset_normalizer_time:.5f} {cn_faster:.1f} %"
)
# Print the top 10 rows with the slowest execution time
print(
f"\n{'-' * 102}\nTop 10 rows with the slowest execution time of charset_normalizer:\n"
)
sorted_results = sorted(
enumerate(charset_normalizer_results), key=lambda x: x[1], reverse=True
)
for idx, time in sorted_results[:10]:
tbt_path = file_list[idx]
print(f"{idx + 1:>3}/{total_files} {tbt_path:<82} CN:{time:.5f}")
# Print charset normalizer statistics
min_time = min(charset_normalizer_results)
max_time = max(charset_normalizer_results)
stdev_time = stdev(charset_normalizer_results)
mean_time = mean(charset_normalizer_results)
cv = (stdev_time / mean_time) * 100 # Coefficient of variation
print(f"\n{'-' * 102}\nCharset Normalizer statistics:\n")
print(f"Minimum Execution Time: {min_time:.5f} seconds")
print(f"Maximum Execution Time: {max_time:.5f} seconds")
print(f"Mean Execution Time: {mean_time:.5f} seconds")
print(f"Standard Deviation: {stdev_time:.5f} seconds")
print(f"Coefficient of Variation (CV): {cv:.1f} %")
# Print comparison statistics for chardet and charset normalizer
chardet_avg_delay = round(mean(chardet_results) * 1000)
chardet_99p = round(calc_percentile(chardet_results, 99) * 1000)
chardet_95p = round(calc_percentile(chardet_results, 95) * 1000)
chardet_50p = round(calc_percentile(chardet_results, 50) * 1000)
charset_normalizer_avg_delay = round(mean(charset_normalizer_results) * 1000)
charset_normalizer_99p = round(
calc_percentile(charset_normalizer_results, 99) * 1000
)
charset_normalizer_95p = round(
calc_percentile(charset_normalizer_results, 95) * 1000
)
charset_normalizer_50p = round(
calc_percentile(charset_normalizer_results, 50) * 1000
)
# mypyc can offer performance ~1ms in the 50p. When eq to 0 assume 1 due to imprecise nature of this test.
if charset_normalizer_50p == 0:
charset_normalizer_50p = 1
print(f"\n{'-' * 102}\nCharset Normalizer vs Chardet statistics:\n")
print("------------------------------")
print("--> Chardet Conclusions")
print(" --> Avg: " + str(chardet_avg_delay) + "ms")
print(" --> 99th: " + str(chardet_99p) + "ms")
print(" --> 95th: " + str(chardet_95p) + "ms")
print(" --> 50th: " + str(chardet_50p) + "ms")
print("------------------------------")
print("--> Charset-Normalizer Conclusions")
print(" --> Avg: " + str(charset_normalizer_avg_delay) + "ms")
print(" --> 99th: " + str(charset_normalizer_99p) + "ms")
print(" --> 95th: " + str(charset_normalizer_95p) + "ms")
print(" --> 50th: " + str(charset_normalizer_50p) + "ms")
print("------------------------------")
print("--> Charset-Normalizer / Chardet: Performance Сomparison")
print(
" --> Avg: x"
+ str(round(chardet_avg_delay / charset_normalizer_avg_delay, 2))
)
print(" --> 99th: x" + str(round(chardet_99p / charset_normalizer_99p, 2)))
print(" --> 95th: x" + str(round(chardet_95p / charset_normalizer_95p, 2)))
print(" --> 50th: x" + str(round(chardet_50p / charset_normalizer_50p, 2)))
return (
0
if chardet_avg_delay > charset_normalizer_avg_delay
and chardet_99p > charset_normalizer_99p
else 1
)
if __name__ == "__main__":
exit(performance_compare(argv[1:]))
python-charset-normalizer-3.4.2/data/ 0000775 0000000 0000000 00000000000 15005045421 0017555 5 ustar 00root root 0000000 0000000 python-charset-normalizer-3.4.2/data/NOTICE.md 0000664 0000000 0000000 00000000752 15005045421 0021064 0 ustar 00root root 0000000 0000000 Included and Redistributed Files
---------------------------------
17 files are included in the source distribution tar. They are used to verify the standard functions of
this library. They are mandatory to run `pytest` but not required to make the lib usable after install.
They DO NOT guarantee that the detection-coverage will not regress.
Those are EITHER pulled from Wikipedia _(CC-BY-SA)_ OR from a public domain archive.
You SHALL NOT modify any of those files without explicit approval.
python-charset-normalizer-3.4.2/data/sample-arabic-1.txt 0000664 0000000 0000000 00000001612 15005045421 0023154 0 ustar 00root root 0000000 0000000 " " " " : ( ) ( ) ( ).
: " " " " " " ߡ .
( : ) ( : ). ( : Morroch)
python-charset-normalizer-3.4.2/data/sample-arabic.txt 0000664 0000000 0000000 00000003145 15005045421 0023021 0 ustar 00root root 0000000 0000000 بالموازاة مع ذلك وللإشارة إلى المنطقة المغاربية بشكل عام، كان المؤرخون العرب في القرون الوسطى يستعملون لفظ "بلاد المغرب" بينما الأوروبيون يستعملون لفظ "الساحل البربري" للدلالة على ثلاثة أقاليم: المغرب الأدنى (إفريقية أو تونس الحالية)، المغرب الأوسط (الجزائر الحالية)، المغرب الأقصى (المملكة المغربية الحالية).
أحيانًا كان يُشار للبلاد بتسمية مرتبطة بعاصمتها: كـ "موريطنية الطنجية" التي كانت عاصمتها طنجة وكذا "مملكة مراكش" و"مملكة فاس" نسبة إلى عواصمها المعروفة آنذاك، وكانت الظهائر والمعاهدات الدولية يوقّعها سلاطين المغرب تارة باسم سلطان مراكش وتارة باسم سلطان فاس.
تمت الإشارة للبلاد لاحقًا باسم المغرب الأقصى باللغة العربية حيث اعتَقد الناس في العالم القديم أن الشمس تشرق من اليابان (باللغة الصينية نيهون: مكان شروق الشمس) وتغرب في المملكة المغربية (باللغة العربية المغرب: مكان غروب الشمس). بينما اشتَقت البلاد اسمها في اللغات الأوروبية من الكلمة اللاتينية مرك (باللغة اللاتينية: Morroch) وهي تصحيف
python-charset-normalizer-3.4.2/data/sample-bulgarian.txt 0000775 0000000 0000000 00000004226 15005045421 0023550 0 ustar 00root root 0000000 0000000 Член 26
1. Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно, поне що се отнася до началното и основното образование. Hачалното образование трябва да бъде задължително. Tехническото и професионалното образование трябва да бъдат общодостъпни, а висшето образование трябва да бъде еднакво достъпно за всички на основата на техните способности.
2. Oбразованието трябва да бъде насочено към цялостно развитие на човешката личност и заcилване на уважението към правата на човека и основните свободи. Tо трябва да съдейства за разбирателството, тъпримостта и приятелството между всички народи, расови или религиозни групи, както и за осъществяване дейността на Oрганизацията на Oбединените нации за поддържане на мира.
3. Pодителите имат право, с приоритет, да избират вида образование, което да получат техните деца.
Член 27
1. Bсеки човек има право свободно да участва в културния живот на обществото, да се наслаждава на изкуствата, да участва в научния напредък и да се ползва от неговите достижения.
2. Bсеки човек има право на закрила на моралните и материалните си интереси, които са резултат от каквото и да е научно, литературно или художествено произведение, на което той е автор. python-charset-normalizer-3.4.2/data/sample-chinese.txt 0000664 0000000 0000000 00000001347 15005045421 0023220 0 ustar 00root root 0000000 0000000 j]Wikipedia^̡A¦FѤUB|BHӡAѦʬjCl@̡AC|]C
FG~QGܤ@AΤv~GQEAUyGʤQAXOCʸUءFGQjKA^LGʸUCxDѤUӦ@ӦFNUAXBHġ@Aj_jC
@AXjոܺCmlnJuAô]vAmnJuAl]vAHXANôl]C´AHڥA]C
joAPDݵoFpsDBХBBӾǡByBwCئhiw\שsF\֡AѱoPġA_ɨaHAMhҸչAKGC
ZAҾڪ̡AꭲۥѤɳ\iijAGiۥѼsǤѤUաC
娥l~CiAoGd@ʤKQEC
commons:
nHMTA@ɡJjC
python-charset-normalizer-3.4.2/data/sample-english.bom.txt 0000775 0000000 0000000 00000001533 15005045421 0024007 0 ustar 00root root 0000000 0000000 1
00:00:06,500 --> 00:00:09,000
About 2 months ago I found myself on
the comment section of YouTube
2
00:00:11,000 --> 00:00:17,000
And I was commenting,
unfortunately I was commenting,
on a video about the famous Ayn Rand
3
00:00:19,000 --> 00:00:24,000
And I
posted underneath against
this woman's tirades,
against what is essentially
the human race.
4
00:00:25,000 --> 00:00:31,000
that, this monetary system seems to have no point, seems to actually hinder people
5
00:00:31,000 --> 00:00:36,000
and hinder progress, and one of the responses I got, I didn't answer it, was:
6
00:00:37,000 --> 00:00:43,000
what actually money creates is an incentive to invent the new items, that's the driving force behind it
7
00:00:43,000 --> 00:00:50,000
So what I thought I do is instead if answering on a YouTube comment is organize a global awareness day
python-charset-normalizer-3.4.2/data/sample-french-1.txt 0000664 0000000 0000000 00000006263 15005045421 0023207 0 ustar 00root root 0000000 0000000 JEAN-BAPTISTE POQUELIN MOLIRE
N LE 15 JANVIER 1622, MORT LE 17 FVRIER 1673
Quel est le plus grand des crivains de mon rgne? demandait Louis XIV
Boileau.--Sire, c'est Molire.
Non-seulement Despraux ne se trompait pas, mais de tous les crivains
que la France a produits, sans excepter Voltaire lui-mme, imprgn de
l'esprit anglais par son sjour Londres, c'est incontestablement
Molire ou Poquelin qui reproduit avec l'exactitude la plus vive et la
plus complte le fond du gnie franais.
En raison de cette identit de son gnie avec le ntre, il exera sur
l'poque subsquente, sur le dix-huitime sicle, sur l'poque mme o
nous crivons, la plus active, la plus redoutable influence. Tout ce
qu'il a voulu dtruire est en ruine. Les types qu'il a crs ne peuvent
mourir. Le sens de la vie pratique, qu'il a recommand d'aprs Gassendi,
a fini par l'emporter sur les ides qui imposaient la socit
franaise. Il n'y a pas de superstition qu'il n'ait attaque, pas de
crdulit qu'il n'ait saisie corps corps pour la terrasser, pas de
formule qu'il ne se soit efforc de dtruire. A-t-il, comme l'exprime si
bien Swift, _dchir l'toffe avec la doublure_? l'histoire le dira. Ce
qui est certain, c'est que l'lve de Lucrce, le protg de Louis XIV,
poursuivait un but dtermin vers lequel il a march d'un pas ferme,
obstin, tantt foulant aux pieds les obstacles, tantt les tournant
avec adresse. Le sujet de _Tartuffe_ est dans Lucrce; Lucrce
appartient ce vers, vritable devise de Molire:
_Et religionis..... nodos solvere curo[1]._
La puissance de Molire sur les esprits a t telle, qu'une lgende
inexacte, calomnieuse de son vivant, romanesque aprs sa mort, s'est
forme autour de cette gloire populaire. Il est un mythe comme Jules
Csar et Apollon.
[1] Ce que je veux, c'est rompre les entraves qui nous enchanent
(_religionis.... quod religat_).
Dates, vnements, ralits, souvenirs, sont venus se confondre dans un
inextricable chaos o la figure de Molire a disparu. Tous les vices
jusqu' l'ivrognerie, jusqu' l'inceste et au vol, lui furent imputs de
son vivant. Les vertus les plus thres lui furent attribues par les
prtres de son culte. Homme d'action, sans cesse en face du public, du
roi ou de sa troupe, occup de son gouvernement et de la cration de ses
uvres, il n'a laiss aucune trace de sa propre vie, aucun document
biographique, peine une lettre. Les pamphlets pour et contre lui
composaient dj une bibliothque, lorsqu'un couteur aux portes, nomm
Grimarest, collecteur d'anas, aimant l'exagration des rcits et
incapable de critique, prtendit, trente-deux ans aprs la mort du
comdien populaire, raconter et expliquer sa vie. Vers la mme poque,
une comdienne, ce que l'on croit du moins, force de se rfugier en
Hollande, jetait dans un libelle les souvenirs de coulisse qu'elle avait
pu recueillir sur l'intrieur du mnage de Molire et de sa femme. Enfin
quelques dtails authentiques, sems dans l'dition de ses uvres
publie par Lagrange en 1682, compltent l'ensemble des documents
comtemporains qui ont servi de base cette lgende de Molire,
excellente consulter, mais qu'il est bon de soumettre l'examen le
plus scrupuleux.
python-charset-normalizer-3.4.2/data/sample-french.txt 0000664 0000000 0000000 00000006457 15005045421 0023056 0 ustar 00root root 0000000 0000000 JEAN-BAPTISTE POQUELIN MOLIÈRE
NÉ LE 15 JANVIER 1622, MORT LE 17 FÉVRIER 1673
«Quel est le plus grand des écrivains de mon règne? demandait Louis XIV
à Boileau.--Sire, c'est Molière.»
Non-seulement Despréaux ne se trompait pas, mais de tous les écrivains
que la France a produits, sans excepter Voltaire lui-même, imprégné de
l'esprit anglais par son séjour à Londres, c'est incontestablement
Molière ou Poquelin qui reproduit avec l'exactitude la plus vive et la
plus complète le fond du génie français.
En raison de cette identité de son génie avec le nôtre, il exerça sur
l'époque subséquente, sur le dix-huitième siècle, sur l'époque même où
nous écrivons, la plus active, la plus redoutable influence. Tout ce
qu'il a voulu détruire est en ruine. Les types qu'il a créés ne peuvent
mourir. Le sens de la vie pratique, qu'il a recommandé d'après Gassendi,
a fini par l'emporter sur les idées qui imposaient à la société
française. Il n'y a pas de superstition qu'il n'ait attaquée, pas de
crédulité qu'il n'ait saisie corps à corps pour la terrasser, pas de
formule qu'il ne se soit efforcé de détruire. A-t-il, comme l'exprime si
bien Swift, _déchiré l'étoffe avec la doublure_? l'histoire le dira. Ce
qui est certain, c'est que l'élève de Lucrèce, le protégé de Louis XIV,
poursuivait un but déterminé vers lequel il a marché d'un pas ferme,
obstiné, tantôt foulant aux pieds les obstacles, tantôt les tournant
avec adresse. Le sujet de _Tartuffe_ est dans Lucrèce; à Lucrèce
appartient ce vers, véritable devise de Molière:
_Et religionis..... nodos solvere curo[1]._
La puissance de Molière sur les esprits a été telle, qu'une légende
inexacte, calomnieuse de son vivant, romanesque après sa mort, s'est
formée autour de cette gloire populaire. Il est un mythe comme Jules
César et Apollon.
[1] Ce que je veux, c'est rompre les entraves qui nous enchaînent
(_religionis.... quod religat_).
Dates, événements, réalités, souvenirs, sont venus se confondre dans un
inextricable chaos où la figure de Molière a disparu. Tous les vices
jusqu'à l'ivrognerie, jusqu'à l'inceste et au vol, lui furent imputés de
son vivant. Les vertus les plus éthérées lui furent attribuées par les
prêtres de son culte. Homme d'action, sans cesse en face du public, du
roi ou de sa troupe, occupé de son gouvernement et de la création de ses
œuvres, il n'a laissé aucune trace de sa propre vie, aucun document
biographique, à peine une lettre. Les pamphlets pour et contre lui
composaient déjà une bibliothèque, lorsqu'un écouteur aux portes, nommé
Grimarest, collecteur d'anas, aimant l'exagération des récits et
incapable de critique, prétendit, trente-deux ans après la mort du
comédien populaire, raconter et expliquer sa vie. Vers la même époque,
une comédienne, à ce que l'on croit du moins, forcée de se réfugier en
Hollande, jetait dans un libelle les souvenirs de coulisse qu'elle avait
pu recueillir sur l'intérieur du ménage de Molière et de sa femme. Enfin
quelques détails authentiques, semés dans l'édition de ses œuvres
publiée par Lagrange en 1682, complètent l'ensemble des documents
comtemporains qui ont servi de base à cette légende de Molière,
excellente à consulter, mais qu'il est bon de soumettre à l'examen le
plus scrupuleux.
python-charset-normalizer-3.4.2/data/sample-greek-2.txt 0000664 0000000 0000000 00000001072 15005045421 0023031 0 ustar 00root root 0000000 0000000 12 , . , , . - , , , , , . 20 .
python-charset-normalizer-3.4.2/data/sample-greek.txt 0000664 0000000 0000000 00000001072 15005045421 0022672 0 ustar 00root root 0000000 0000000 12 , . , , . - , , , , , . 20 .
python-charset-normalizer-3.4.2/data/sample-hebrew-2.txt 0000664 0000000 0000000 00000000524 15005045421 0023211 0 ustar 00root root 0000000 0000000 . , . (), , . (), ().
python-charset-normalizer-3.4.2/data/sample-hebrew-3.txt 0000775 0000000 0000000 00000000524 15005045421 0023215 0 ustar 00root root 0000000 0000000 . , . (), , . (), ().
python-charset-normalizer-3.4.2/data/sample-korean.txt 0000664 0000000 0000000 00000000603 15005045421 0023053 0 ustar 00root root 0000000 0000000 ڵ 絵 ż ƴ, ΰμ ߱Ѵ. 20 Ŀ , 縯 ȯ ̴ ũλ ̳ ǰ ִ. ѹα ѱ ( ؿ) ũλ Ͽ, Ұϰ ִ.
python-charset-normalizer-3.4.2/data/sample-polish.txt 0000664 0000000 0000000 00000013267 15005045421 0023104 0 ustar 00root root 0000000 0000000 "source";"target"
"REF.-2";"POLISH"
"KW-P00-01";"SYSTEM VIDEODOMOFONOWY MEET"
"KW-P00-02";"URZĄDZENIE"
"KW-P00-03";"OGÓLNE"
"KW-P00-04";"SIEĆ"
"KW-P00-05";"KD"
"KW-P00-06";"ROZP. TWARZY."
"KW-P00-07";"KAMERY IP"
"KW-P00-08";"SIP"
"KW-P00-09";"SIP TRUNK"
"KW-P00-10";"PRZEKIEROWANIA"
"KW-P00-11";"ZAAWANSOWANE"
"KW-P00-12";"KOD PIN"
"KW-P00-13";"WECHAT QR"
"KW-P00-14";"PRZYWRACAĆ"
"KW-P00-16";"WINDA"
"KW-P01-01";"INFORMACJE O URZĄDZENIU"
"KW-P01-02";"PANEL VIDEO FOOBAR KIN"
"KW-P01-03";"FIRMWARE: V02.10"
"KW-P01-04";"URZĄDZENIE: PANEL BLOKOWY-CYFROWY 001-02"
"KW-P01-05";"URZĄDZENIE: PANEL BLOKOWY PRZYCISKI 020-02"
"KW-P01-06";"URZĄDZENIE: PANEL GŁÓWNY 01"
"KW-P01-07";"URZĄDZENIE: PANEL 1W 006-0102-01"
"KW-P01-08";"NUMER SERYJNY:"
"KW-P01-09";"MAC:"
"KW-P01-10";"IP:"
"KW-P01-11";"COPYRIGHT © FOOBAR "
"KW-P01-12";"www.example.com"
"KW-P02-01";"USTAWIENIA GŁÓWNE"
"KW-P02-02";"TYP:"
"KW-P02-03";"PANEL GŁÓWNY"
"KW-P02-04";"CYFROWY P. BLOKOWY"
"KW-P02-05";"P. BLOK. PRZYCISKI"
"KW-P02-06";"PANEL 1NR"
"KW-P02-07";"BLOK:"
"KW-P02-08";"LOKAL:"
"KW-P02-09";"MONIT WYŚWIETLACZA:"
"KW-P02-10";"THIS INTERFACE IS NOT ENABLED"
"KW-P02-11";"NUMER PANELU:"
"KW-P02-12";"NAZWA URZĄDZENIA:"
"KW-P02-13";"(≤16 ZNAKÓW)"
"KW-P02-14";"JĘZYK:"
"KW-P02-15";"ENGLISH"
"KW-P02-16";"中文"
"KW-P02-17";"ESPAÑOL"
"KW-P02-18";"РУССКИЙ"
"KW-P02-19";"DEUTSCH"
"KW-P02-20";"TÜRKÇE"
"KW-P02-21";"POLSKI"
"KW-P02-22";"עברית"
"KW-P02-23";"FRANÇAIS"
"KW-P02-24";"فارسی"
"KW-P02-25";"GŁOŚNOŚĆ PANELU:"
"KW-P02-26";"JASNOŚĆ"
"KW-P02-27";"ROZDZIELCZOŚĆ VIDEO:"
"KW-P02-28";"TRYB PRZEKIEROWANIA SIP:"
"KW-P02-29";"SEKWENCYJNE"
"KW-P02-30";"JEDNOCZESNE"
"KW-P02-31";"PORTIER:"
"KW-P02-32";"PORTIERNIA 1:"
"KW-P02-33";"PORTIERNIA 2:"
"KW-P02-34";"USTAW. DATY I CZASU"
"KW-P02-35";"FORMAT DATY:"
"KW-P02-36";"DATA:"
"KW-P02-37";"CZAS:"
"KW-P02-38";"STREFA CZASOWA:"
"KW-P02-39";"ZAPISZ"
"KW-P02-40";"BŁĘDNE DANE"
"KW-P02-41";"KLAWIATURA ALFANUM.:"
"KW-P02-42";"KOMUNIKAT OTWARCIA DRZWI:"
"KW-P02-43";"WYGASZACZ EKRANU:"
"KW-P02-44";"WSPARCIE:"
"KW-P02-45";"OCZEKIWANIE"
"KW-P02-46";"POŁĄCZENIE"
"KW-P02-47";"WSPARCIE"
"KW-P02-48";"lista"
"KW-P02-49";"DST:"
"KW-P02-57";"TŁO:"
"KW-P02-58";"CIEMNE"
"KW-P02-59";"JASNE"
"KW-P02-60";"IMPORT"
"KW-P02-61";"EKSPORT"
"KW-P02-62";"USUŃ"
"KW-P02-63";"WYBIERZ PRAWIDŁOWY PLIK PNG"
"KW-P02-64";"IMPORTUJ"
"KW-P02-65";"WYSYŁANIE ZAKOŃCZONE"
"KW-P02-66";"BRAK OBRAZU"
"KW-P02-67";"USUNIĘTE"
"KW-P02-68";"BŁĄD USUWANIA"
"KW-P03-01";"USTAWIENIA SIECI"
"KW-P03-02";"IP:"
"KW-P03-03";"MASKA:"
"KW-P03-04";"BRAMA:"
"KW-P03-05";"DNS:"
"KW-P03-06";"SOFTWARE IP:"
"KW-P03-07";"SW. PIN:"
"KW-P03-08";"ZAPISZ"
"KW-P04-01";"USTAWIENIA KONTROLI DOSTĘPU"
"KW-P04-02";"PRZYCISK EGRESS:"
"KW-P04-03";"CZAS ELEKTROZACZEPU:"
"KW-P04-04";"CZAS KONTAKTRONU:"
"KW-P04-05";"REF.1491 4 RELAY:"
"KW-P04-06";"CZAS ELEKTROZACZEPU:"
"KW-P04-07";"CZAS KONTAKTRONU:"
"KW-P04-08";"KARTA ADMINISTRATORA:"
"KW-P04-09";"ROZBRAJANIE KARTĄ:"
"KW-P04-10";"MONITY KART:"
"KW-P04-11";"KOD GOŚCIA:"
"KW-P04-12";"KOD DOSTĘPU:"
"KW-P04-13";"#1"
"KW-P04-14";"#2"
"KW-P04-15";"#3"
"KW-P04-16";"#4"
"KW-P04-17";"ALARM DRZWI"
"KW-P04-18";"GWAŁTOWNY ALARM OTWARCIA"
"KW-P04-19";"WIEGAND:"
"KW-P04-20";"BURST"
"KW-P04-21";"26-BIT"
"KW-P04-22";"FACILITY:"
"KW-P04-24";"ZAPISZ"
"KW-P04-25";"WYŁĄCZONY"
"KW-P04-26";"REF.1490 2 RELAY:"
"KW-P04-27";"KOD QR:"
"KW-P04-28";"WIEGAND:"
"KW-P04-29";"26-BIT"
"KW-P04-30";"34-BIT"
"KW-P04-31";"KOD MIEJSCA:"
"KW-P04-32";"AUTO AKTYWACJA:"
"KW-P04-33";"BŁĘDNE DANE"
"KW-P05-01";"ROZPOZNAWANIE TWARZY"
"KW-P05-02";"ROZPOZNAWANIE TWARZY:"
"KW-P05-04";"MODEL:"
"KW-P05-05";"Wykrycie obecności:"
"KW-P05-06";"WŁĄCZONY"
"KW-P05-07";"WYŁĄCZONY"
"KW-P05-08";"PODOBIEŃSTWO:"
"KW-P05-09";"NISKIE"
"KW-P05-10";"ŚREDNIE"
"KW-P05-11";"WYSOKIE"
"KW-P05-12";"ZAPISZ"
"KW-P06-01";"USTAWIENIA KAMER IP"
"KW-P06-02";"ILOŚĆ KAMER:"
"KW-P06-03";"KAMERA"
"KW-P06-04";"URL:"
"KW-P06-05";"ZAPISZ"
"KW-P07-01";"USTAWIENIA SIP"
"KW-P07-02";"WŁĄCZ SIP:"
"KW-P07-03";"SPRAWDŹ STATUS SIP"
"KW-P07-04";"SIP ZAREJESTROWANY"
"KW-P07-05";"BŁĄD REJESTRACJI SIP"
"KW-P07-06";"SERWER SIP:"
"KW-P07-07";"DOMENA:"
"KW-P07-08";"OUTBOUND:"
"KW-P07-09";"STUN IP:"
"KW-P07-10";"PORT STUN:"
"KW-P07-11";"H.264:"
"KW-P07-12";"UŻYTKOWNIK SIP:"
"KW-P07-13";"HASŁO SIP:"
"KW-P07-14";"CZAS ROZMOWY:"
"KW-P07-15";"CZAS DZWONIENIA:"
"KW-P07-16";"ZAPISZ"
"KW-P08-01";"USTAWIENIA SIP TRUNK"
"KW-P08-02";"WŁĄCZ SIP TRUNK:"
"KW-P08-03";"URL:"
"KW-P08-04";"ZAPISZ"
"KW-P09-01";"USTAWIENIA PRZEKIEROWAŃ"
"KW-P09-02";"IMPORT"
"KW-P09-03";"EKSPORT"
"KW-P09-04";"APARTAMENT"
"KW-P09-05";"NUMER"
"KW-P10-01";"USTAWIENIA ZAAWANSOWANE"
"KW-P10-02";"SZYBKIE WYBIERANIE:"
"KW-P10-03";"URL:"
"KW-P10-04";"ONU:"
"KW-P10-05";"MAPOWANIE POŁĄCZEŃ:"
"KW-P10-06";"BIAŁA LISTA:"
"KW-P10-07";"Lista telefoniczna:"
"KW-P10-08";"IMPORT"
"KW-P10-09";"EKSPORT"
"KW-P10-10";"IMPORTUJ"
"KW-P10-11";"WYSYŁANIE ZAKOŃCZONE"
"KW-P10-12";"UŻYJ WŁAŚCIWEGO PLIKU CSV."
"KW-P10-13";"OK"
"KW-P10-14";"ZAPISZ"
"KW-P11-01";"USTAWIENIA KODU PIN"
"KW-P11-02";"OBECNY PIN:"
"KW-P11-03";"NOWY PIN:"
"KW-P11-04";"POTWIERDŹ PIN:"
"KW-P11-05";"ZAPISZ"
"KW-P12-01";"WECHAT QR"
"KW-P12-02";"WŁĄCZ"
"KW-P12-03";"UUID:"
"KW-P12-04";"HASŁO:"
"KW-P12-05";"SERWER:"
"KW-P12-06";"WŁĄCZ CZYTNIK QR:"
"KW-P12-07";"STATUS:"
"KW-P12-08";"REJESTRACJA POMYŚLNIE"
"KW-P12-09";"REJESTRACJA NIE POWIODŁA SIĘ"
"KW-P12-10";"ZAPISZ"
"KW-P13-01";"PRZYWRACAĆ"
"KW-P13-02";"PRZYWRÓCIĆ USTAWIENIA FABRYCZNE"
"KW-P13-03";"POTWIERDZAĆ PRZYWRÓĆ USTAWIENIA FABRYCZNE?"
"KW-P13-04";"URZĄDZENIE REBOOT"
python-charset-normalizer-3.4.2/data/sample-russian-2.txt 0000664 0000000 0000000 00000004241 15005045421 0023421 0 ustar 00root root 0000000 0000000 В гимназии он не был в числе первых учеников (исключение составляли математика и латынь). Укоренившаяся система механического заучивания материала учащимися (которая, как он считал, наносит вред самому духу учёбы и творческому мышлению), а также авторитарное отношение учителей к ученикам вызывало у Альберта Эйнштейна неприятие, поэтому он часто вступал в споры со своими преподавателями.
После окончательного разорения отца семейства в 1894 году Эйнштейны переехали из Мюнхена в итальянский город Павию, близ Милана. Сам Альберт оставался в Мюнхене ещё некоторое время, чтобы окончить все шесть классов гимназии. Так и не получив аттестата зрелости, в 1895 году он присоединился к своей семье в Милане.
Осенью 1895 г. Альберт Эйнштейн прибыл в Швейцарию, чтобы сдать вступительные экзамены в Высшее техническое училище (Политехникум) в Цюрихе и стать преподавателем физики. Блестяще проявив себя на экзамене по математике, он в то же время провалил экзамены по ботанике и французскому языку, что не позволило ему поступить в Цюрихский Политехникум. Однако директор училища посоветовал молодому человеку поступить в выпускной класс школы в Аарау (Швейцария), чтобы получить аттестат и повторить поступление.
python-charset-normalizer-3.4.2/data/sample-russian-3.txt 0000664 0000000 0000000 00000005776 15005045421 0023440 0 ustar 00root root 0000000 0000000 Москва́ (произношение (инф.)) — столица России, город федерального значения, административный центр Центрального федерального округа и центр Московской области, в состав которой не входит[6]. Крупнейший по численности населения город России и её субъект — 12 655 050[3] человек (2021), самый населённый из городов, полностью расположенных в Европе, занимает 22 место среди городов мира по численности населения[7], крупнейший русскоязычный город в мире. Центр Московской городской агломерации.
Историческая столица Великого княжества Московского, Русского царства, Российской империи (в 1728—1732 годах[8][9][10][11]), Советской России и СССР. Город-герой. В Москве находятся федеральные органы государственной власти Российской Федерации (за исключением Конституционного суда), посольства иностранных государств, штаб-квартиры большинства крупнейших российских коммерческих организаций и общественных объединений.
Расположена на западе России, на реке Москве в центре Восточно-Европейской равнины, в междуречье Оки и Волги. Как субъект федерации, Москва граничит с Московской и Калужской областями.
Москва — популярный туристический центр России. Кремль, Красная площадь, Новодевичий монастырь и Церковь Вознесения в Коломенском входят в список объектов всемирного наследия ЮНЕСКО[12]. Она является важнейшим транспортным узлом: город обслуживают 6 аэропортов, 10 железнодорожных вокзалов, 3 речных порта (имеется речное сообщение с морями бассейнов Атлантического и Северного Ледовитого океанов). С 1935 года в Москве работает метрополитен. Москва — спортивный центр страны. В 1980 году в Москве прошли XXII летние Олимпийские игры, а в 2018 город стал одним из хозяев чемпионата мира по футболу. python-charset-normalizer-3.4.2/data/sample-russian.txt 0000664 0000000 0000000 00000002273 15005045421 0023265 0 ustar 00root root 0000000 0000000 ( ). (, , ), , .
1894 , . , . , 1895 .
1895 . , () . , , . (), .
python-charset-normalizer-3.4.2/data/sample-spanish.txt 0000775 0000000 0000000 00000016161 15005045421 0023252 0 ustar 00root root 0000000 0000000 La creación
1 En el principio creó Dios los cielos y la tierra. 2 Y la tierra estaba sin orden y vacía[a], y las tinieblas cubrían la superficie[b] del abismo, y el Espíritu de Dios se movía sobre la superficie[c] de las aguas. 3 Entonces dijo Dios: Sea la luz. Y hubo luz. 4 Y vio Dios que la luz era buena; y separó Dios la luz de las tinieblas. 5 Y llamó Dios a la luz día, y a las tinieblas llamó noche. Y fue la tarde y fue la mañana: un día.
6 Entonces dijo Dios: Haya expansión[d] en medio de las aguas, y separe las aguas de las aguas. 7 E hizo Dios la expansión, y separó las aguas que estaban debajo de la expansión de las aguas que estaban sobre la expansión. Y fue así. 8 Y llamó Dios a la expansión cielos. Y fue la tarde y fue la mañana: el segundo día.
9 Entonces dijo Dios: Júntense en un lugar las aguas que están debajo de los cielos, y que aparezca lo seco. Y fue así. 10 Y llamó Dios a lo seco tierra, y al conjunto de las aguas llamó mares. Y vio Dios que era bueno. 11 Y dijo Dios: Produzca la tierra vegetación[e]: hierbas[f] que den semilla, y árboles frutales que den fruto sobre la tierra según su género[g], con su semilla en él. Y fue así. 12 Y produjo la tierra vegetación[h]: hierbas[i] que dan semilla según su género, y árboles que dan fruto con su semilla en él, según su género. Y vio Dios que era bueno. 13 Y fue la tarde y fue la mañana: el tercer día.
14 Entonces dijo Dios: Haya lumbreras[j] en la expansión de los cielos para separar el día de la noche, y sean para señales y para estaciones y para días y para años; 15 y sean por luminarias en la expansión de los cielos para alumbrar sobre la tierra. Y fue así. 16 E hizo Dios las dos grandes lumbreras[k], la lumbrera[l] mayor para dominio del día y la lumbrera[m] menor para dominio de la noche; hizo también las estrellas. 17 Y Dios las puso en la expansión de los cielos para alumbrar sobre la tierra, 18 y para dominar en el día y en la noche, y para separar la luz de las tinieblas. Y vio Dios que era bueno. 19 Y fue la tarde y fue la mañana: el cuarto día.
20 Entonces dijo Dios: Llénense[n] las aguas de multitudes de seres vivientes, y vuelen las aves sobre la tierra en la abierta[o] expansión de los cielos. 21 Y creó Dios los grandes monstruos marinos y todo ser viviente que se mueve, de los cuales están llenas[p] las aguas según su género, y toda ave[q] según su género. Y vio Dios que era bueno. 22 Y Dios los bendijo, diciendo: Sed fecundos y multiplicaos, y llenad las aguas en los mares, y multiplíquense las aves en la tierra. 23 Y fue la tarde y fue la mañana: el quinto día.
24 Entonces dijo Dios: Produzca la tierra seres vivientes según su género: ganados, reptiles y bestias de la tierra según su género. Y fue así. 25 E hizo Dios las bestias de la tierra según su género, y el ganado según su género, y todo lo que se arrastra sobre la tierra según su género. Y vio Dios que era bueno.
Creación del hombre y de la mujer
26 Y dijo Dios: Hagamos al hombre a nuestra imagen, conforme a nuestra semejanza; y ejerza[r] dominio sobre los peces del mar, sobre las aves del cielo, sobre los ganados, sobre toda la tierra, y sobre todo reptil que se arrastra sobre la tierra. 27 Creó, pues, Dios al hombre a imagen suya, a imagen de Dios lo creó; varón y hembra los creó. 28 Y los bendijo Dios y les dijo[s]: Sed fecundos y multiplicaos, y llenad la tierra y sojuzgadla; ejerced dominio sobre los peces del mar, sobre las aves del cielo y sobre todo ser viviente que se mueve[t] sobre la tierra. 29 Y dijo Dios: He aquí, yo os he dado toda planta que da semilla que hay en la superficie[u] de toda la tierra, y todo árbol que tiene fruto[v] que da semilla; esto os servirá de[w] alimento. 30 Y a toda bestia de la tierra, a toda ave de los cielos y a todo lo que se mueve[x] sobre la tierra, y que tiene vida[y], les he dado toda planta verde para alimento. Y fue así. 31 Y vio Dios todo lo que había hecho, y he aquí que era bueno en gran manera. Y fue la tarde y fue la mañana: el sexto día.
Así fueron acabados los cielos y la tierra y todas sus huestes. 2 Y en el séptimo día completó Dios la[a] obra que había hecho, y reposó en el día séptimo de toda la[b] obra que había hecho. 3 Y bendijo Dios el séptimo día y lo santificó, porque en él reposó de toda la[c] obra que El[d] había creado y hecho[e].
El huerto del Edén
4 Estos son los orígenes[f] de los cielos y de la tierra cuando fueron creados, el día en que el Señor Dios hizo la tierra y los cielos. 5 Y aún no había ningún arbusto del campo en la tierra, ni había aún brotado ninguna planta[g] del campo, porque el Señor Dios no había enviado lluvia sobre la tierra, ni había hombre para labrar[h] la tierra. 6 Pero se levantaba de la tierra un vapor[i] que regaba toda la superficie[j] del suelo. 7 Entonces el Señor Dios formó al hombre del polvo de la tierra, y sopló en su nariz el aliento de vida; y fue el hombre un ser[k] viviente. 8 Y plantó el Señor Dios un huerto hacia el oriente, en Edén; y puso allí al hombre que había formado. 9 Y el Señor Dios hizo brotar de la tierra todo árbol agradable a la vista y bueno para comer; asimismo, en medio del huerto, el árbol de la vida y el árbol del conocimiento[l] del bien y del mal.
10 Y del Edén salía un río para regar el huerto, y de allí se dividía y se convertía en otros cuatro ríos[m]. 11 El nombre del primero es Pisón; éste es el que rodea toda la tierra de Havila, donde hay oro. 12 El oro de aquella tierra es bueno; allí hay bedelio y ónice. 13 Y el nombre del segundo río es Gihón; éste es el que rodea la tierra de Cus. 14 Y el nombre del tercer río es Tigris[n]; éste es el que corre[o] al oriente de Asiria. Y el cuarto río es el Eufrates[p]. 15 Entonces el Señor Dios tomó al hombre y lo puso en el huerto del Edén, para que lo cultivara y lo cuidara. 16 Y ordenó el Señor Dios al hombre, diciendo: De todo árbol del huerto podrás comer, 17 pero del árbol del conocimiento[q] del bien y del mal no comerás[r], porque el día que de él comas, ciertamente morirás.
Formación de la mujer
18 Y el Señor Dios dijo: No es bueno que el hombre esté solo; le haré una ayuda idónea[s]. 19 Y el Señor Dios formó de la tierra todo animal del campo y toda ave del cielo, y los trajo al hombre para ver cómo los llamaría; y como el hombre llamó a cada ser viviente, ése fue su nombre. 20 Y el hombre puso nombre a todo ganado y a las aves del cielo y a toda bestia del campo, mas para Adán[t] no se encontró una ayuda que fuera idónea para él[u]. 21 Entonces el Señor Dios hizo caer un sueño profundo sobre el hombre, y éste se durmió; y Dios tomó una de sus costillas, y cerró la carne en ese lugar. 22 Y de la costilla que el Señor Dios había tomado del hombre, formó[v] una mujer y la trajo al hombre. 23 Y el hombre dijo:
Esta es ahora hueso de mis huesos,
y carne de mi carne;
ella[w] será llamada mujer[x],
porque del hombre[y] fue tomada.
24 Por tanto el hombre dejará a su padre y a su madre y se unirá a su mujer, y serán una sola carne. 25 Y estaban ambos desnudos, el hombre y su mujer, y no se avergonzaban. python-charset-normalizer-3.4.2/data/sample-turkish.txt 0000664 0000000 0000000 00000003460 15005045421 0023271 0 ustar 00root root 0000000 0000000 stanbul, Trkiye'nin en kalabalk, iktisadi ve kltrel adan en nemli
ehri.[2][3][4] ktisadi byklk adan dnyada 34., nfus asndan
belediye snrlar gz nne alnarak yaplan sralamaya gre Avrupa'da
birinci srada gelir.[5][6]
stanbul Trkiye'nin kuzeybatsnda, Marmara kys ve Boazii boyunca,
Hali'i de evreleyecek ekilde kurulmutur.[7] stanbul ktalararas bir
ehir olup, Avrupa'daki blmne Avrupa Yakas veya Rumeli Yakas,
Asya'daki blmne ise Anadolu Yakas denir. Tarihte ilk olarak taraf
Marmara Denizi, Boazii ve Hali'in sard bir yarm ada zerinde kurulan
stanbul'un batdaki snrn stanbul Surlar oluturmaktayd. Gelime ve
byme srecinde surlarn her seferinde daha batya ilerletilerek ina
edilmesiyle 4 defa geniletilen ehrin [8] 39 ilesi vardr.[9] Snrlar
ierisinde ise bykehir belediyesi ile birlikte toplam 40 belediye
bulunmaktadr.
Dnyann en eski ehirlerinden biri olan stanbul, M.S. 330 - 395 yllar
arasnda Roma mparatorluu, 395 - 1204 ile 1261 - 1453 yllar arasnda
Dou Roma mparatorluu, 1204 - 1261 arasnda Latin mparatorluu ve son
olarak 1453 - 1922 yllar arasnda Osmanl mparatorluu'na bakentlik
yapt.[10] Ayrca, hilafetin Osmanl mparatorluu'na getii 1517'den,
kaldrld 1924'e kadar, stanbul slamiyet'in de merkezi oldu.[11]
1453 ylnda fetihten sonra, kent Osmanl mparatorluu'nun drdnc
bakenti ilan edilidi ve Kostantiniyye Osmanl mparatorluu tarafndan
kentin resmi ad olarak kullanld ve 1923 ylnda Osmanl
mparatorluunun kne kadar, ou zaman bu ad kullanmda
kald. rnein Osmanl mparatorluu ve mahkemeleri, Kostantiniyye'de
yaymlanan resmi belgelerin kaynan belirtmek iin, "be-Makam-
Dar's-Saltanat- Kostantiniyyet'l-Mahrust'l-Mahmiyye" gibi balklar
kullanlrd.[17]
python-charset-normalizer-3.4.2/dev-requirements.txt 0000664 0000000 0000000 00000000046 15005045421 0022704 0 ustar 00root root 0000000 0000000 coverage>=7.2.7,<7.9
pytest>=7.4.4,<9
python-charset-normalizer-3.4.2/docs/ 0000775 0000000 0000000 00000000000 15005045421 0017574 5 ustar 00root root 0000000 0000000 python-charset-normalizer-3.4.2/docs/Makefile 0000775 0000000 0000000 00000001152 15005045421 0021236 0 ustar 00root root 0000000 0000000 # Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = python -msphinx
SPHINXPROJ = Charset Normalizer
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) python-charset-normalizer-3.4.2/docs/api.rst 0000664 0000000 0000000 00000004642 15005045421 0021105 0 ustar 00root root 0000000 0000000 .. _api:
Developer Interfaces
====================
.. module:: charset_normalizer
Main Interfaces
---------------
Those functions are publicly exposed and are protected through our BC guarantee.
.. autofunction:: from_bytes
.. autofunction:: from_fp
.. autofunction:: from_path
.. autofunction:: is_binary
.. autoclass:: charset_normalizer.models.CharsetMatches
:inherited-members:
.. autoclass:: charset_normalizer.models.CharsetMatch
:inherited-members:
.. autofunction:: detect
.. autofunction:: charset_normalizer.utils.set_logging_handler
Mess Detector
-------------
.. autofunction:: charset_normalizer.md.mess_ratio
This library allows you to extend the capabilities of the mess detector by extending the
class `MessDetectorPlugin`.
.. autoclass:: charset_normalizer.md.MessDetectorPlugin
:inherited-members:
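A rough sketch of a custom plugin follows. The member names used here (``eligible``, ``feed``, ``reset`` and the ``ratio`` property) are assumptions based on the built-in detectors in ``md.py``; double-check them against the source before relying on this, and note that how (or whether) subclasses are picked up automatically by ``mess_ratio`` is not covered here.
::
    from charset_normalizer.md import MessDetectorPlugin

    class ReplacementCharacterPlugin(MessDetectorPlugin):
        """Illustrative only: treat U+FFFD replacement characters as mess."""
        def __init__(self) -> None:
            self._character_count = 0
            self._suspicious_count = 0
        def eligible(self, character: str) -> bool:
            return True  # inspect every character
        def feed(self, character: str) -> None:
            self._character_count += 1
            if character == "\ufffd":
                self._suspicious_count += 1
        def reset(self) -> None:
            self._character_count = 0
            self._suspicious_count = 0
        @property
        def ratio(self) -> float:
            if self._character_count == 0:
                return 0.0
            return self._suspicious_count / self._character_count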
.. autofunction:: charset_normalizer.md.is_suspiciously_successive_range
Coherence Detector
------------------
.. autofunction:: charset_normalizer.cd.coherence_ratio
Utilities
---------
Some reusable functions used across the project. We do not guarantee the BC in this area.
.. autofunction:: charset_normalizer.utils.is_accentuated
.. autofunction:: charset_normalizer.utils.remove_accent
.. autofunction:: charset_normalizer.utils.unicode_range
.. autofunction:: charset_normalizer.utils.is_latin
.. autofunction:: charset_normalizer.utils.is_punctuation
.. autofunction:: charset_normalizer.utils.is_symbol
.. autofunction:: charset_normalizer.utils.is_emoticon
.. autofunction:: charset_normalizer.utils.is_separator
.. autofunction:: charset_normalizer.utils.is_case_variable
.. autofunction:: charset_normalizer.utils.is_cjk
.. autofunction:: charset_normalizer.utils.is_hiragana
.. autofunction:: charset_normalizer.utils.is_katakana
.. autofunction:: charset_normalizer.utils.is_hangul
.. autofunction:: charset_normalizer.utils.is_thai
.. autofunction:: charset_normalizer.utils.is_unicode_range_secondary
.. autofunction:: charset_normalizer.utils.any_specified_encoding
.. autofunction:: charset_normalizer.utils.is_multi_byte_encoding
.. autofunction:: charset_normalizer.utils.identify_sig_or_bom
.. autofunction:: charset_normalizer.utils.should_strip_sig_or_bom
.. autofunction:: charset_normalizer.utils.iana_name
.. autofunction:: charset_normalizer.utils.range_scan
.. autofunction:: charset_normalizer.utils.is_cp_similar
.. class:: os.PathLike
.. class:: typing.BinaryIO
python-charset-normalizer-3.4.2/docs/community/ 0000775 0000000 0000000 00000000000 15005045421 0021620 5 ustar 00root root 0000000 0000000 python-charset-normalizer-3.4.2/docs/community/faq.rst 0000664 0000000 0000000 00000007135 15005045421 0023127 0 ustar 00root root 0000000 0000000 Frequently asked questions
===========================
Is UTF-8 everywhere already?
----------------------------
Not really; that is a dangerous assumption. Looking at https://w3techs.com/technologies/overview/character_encoding it may
seem like encoding detection is a thing of the past, but it is not. Based solely on 33k websites, you will find
3.4k responses without a predefined encoding, and 1.8k of those were not UTF-8, merely half!
This statistic (w3techs) does not offer any weighting, so one should not read it as
"I have a 97 % chance of hitting UTF-8 content on HTML content".
(2021 Top 1000 sites from 80 countries in the world according to Data for SEO) https://github.com/potiuk/test-charset-normalizer
First of all, neither requests, chardet nor charset-normalizer is dedicated to HTML content.
Detection concerns every text document, such as SubRip subtitle files for instance. And in my own experience, I have never had
a single database using full UTF-8; many translated subtitles are from another era and were never updated.
It is very hard to find any stats at all regarding this matter. Users' usages can be very dispersed, so making
assumptions is unwise.
The real debate is whether detection is an HTTP client's concern or not. That is more complicated and not my field.
Some individuals keep insisting that the *whole* Internet is UTF-8 ready. They are absolutely wrong and very Europe- and North America-centered.
In my humble experience, countries around the world are at very different stages of this evolution. And the Internet is not just about HTML content.
Producing a truly thorough analysis of this would be a daunting task.
Should I bother using detection?
--------------------------------
As a last resort, yes. You should use well-established standards, e.g. a predefined encoding, at all times.
When you are left with no clue, you may use the detector to produce a usable output as fast as possible.
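A typical pattern is to honour any declared encoding first and to fall back to detection only when that fails. A minimal sketch; the ``declared`` argument is hypothetical, standing in for whatever metadata you may have:
::
    from __future__ import annotations
    from charset_normalizer import from_bytes

    def to_text(payload: bytes, declared: str | None = None) -> str:
        # Trust the declared encoding when there is one and it actually decodes.
        if declared:
            try:
                return payload.decode(declared)
            except (LookupError, UnicodeDecodeError):
                pass
        # Last resort: let charset-normalizer pick the most plausible decoding.
        best_guess = from_bytes(payload).best()
        if best_guess is not None:
            return str(best_guess)
        return payload.decode("utf-8", errors="replace")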
Is it backward-compatible with Chardet?
---------------------------------------
If you use the legacy `detect` function,
then this change is mostly backward-compatible, with one caveat:
- This new library supports way more code pages (x3) than its counterpart Chardet.
- Based on the 30-ish charsets that Chardet supports, expect roughly 80% backward-compatible results.
We do not guarantee this exact BC percentage over time. It may vary, but not by much.
Isn't it the same as Chardet?
-----------------------------
The objective is the same: provide you with the best answer (charset) we can, given any sequence of bytes.
The method actually differs.
We do not "train" anything to build a probe for a specific encoding. In addition to detecting languages (intelligent
design) with some rudimentary statistics (character frequency ordering), we built a mess detector to assist the language
detection.
Any code page supported by your CPython is supported by charset-normalizer! It is that simple: no need to update the
library. It is as generic as we could make it.
I can't build standalone executable
-----------------------------------
If you are using ``pyinstaller``, ``py2exe`` or the like, you may be encountering this error or something close to it:
ModuleNotFoundError: No module named 'charset_normalizer.md__mypyc'
Why?
- Your package manager picked up an optimized (for speed purposes) wheel that matches your architecture and operating system.
- Finally, the module ``charset_normalizer.md__mypyc`` is imported via binaries and can't be seen using your tool.
How to remedy?
If your bundler program supports it, set up a hook that implicitly imports the hidden module.
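For example, with PyInstaller one common way (shown here as an illustration; ``your_script.py`` is a placeholder) is to declare it as a hidden import:
::
    pyinstaller --hidden-import charset_normalizer.md__mypyc your_script.py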
Otherwise, follow the guide on how to install the vanilla version of this package. (Section: *Optional speedup extension*)
python-charset-normalizer-3.4.2/docs/community/featured.rst 0000664 0000000 0000000 00000004157 15005045421 0024160 0 ustar 00root root 0000000 0000000 Featured projects
=================
Did you like how ``charset-normalizer`` performs? And its quality?
You may be interested in those other projects maintained by the same authors.
We aim to serve the open-source community the best and as inclusively as we can, no matter
your level or opinions.
Niquests
--------
Started as a simple thought: IE 11 has built-in HTTP/2 support while Requests 2.32 does not!
Most of our programs that interact with HTTP servers are built with ``requests`` and
we aren't likely to switch without a substantial effort.
We just might die at any moment, with no notice whatsoever, knowing that, as Python developers,
we never interacted with an HTTP/2 over TCP or HTTP/3 over QUIC capable server in 2023...
.. image:: https://dabuttonfactory.com/button.png?t=Get+Niquests+Now&f=Ubuntu-Bold&ts=26&tc=fff&hp=45&vp=20&c=11&bgt=unicolored&bgc=15d798&be=1
:target: https://github.com/jawah/niquests
It is a fork of ``requests`` and no breaking changes are to be expected. We made sure that
your migration is effortless and safe.
httpie-next
-----------
Easy solutions are cool; let us introduce you to HTTPie, but with built-in support
for HTTP/2 and HTTP/3.
It is made available as a plugin, with no effort required besides installing said plugin.
Enjoy HTTPie refreshed!
.. image:: https://dabuttonfactory.com/button.png?t=Get+HTTPie-Next+Now&f=Ubuntu-Bold&ts=26&tc=fff&hp=45&vp=20&c=11&bgt=unicolored&bgc=15d798&be=1
:target: https://github.com/Ousret/httpie-next
Wassima
-------
Did you ever wonder what it would feel like to leave the headache of root CAs (certificate authorities) behind?
Well, starting today, you may use your operating system's trusted root CAs to verify
peer certificates with the utmost comfort.
It is enabled by default in Niquests, but you can use that awesome feature by itself.
.. image:: https://dabuttonfactory.com/button.png?t=OS+root+CAs+for+Python&f=Ubuntu-Bold&ts=26&tc=fff&hp=45&vp=20&c=11&bgt=unicolored&bgc=15d798&be=1
:target: https://github.com/jawah/wassima
The solution is universal and serves (almost) every possible case.
You may remove the certifi package; let it rest in peace.
python-charset-normalizer-3.4.2/docs/community/speedup.rst 0000664 0000000 0000000 00000002760 15005045421 0024024 0 ustar 00root root 0000000 0000000 Optional speedup extension
==========================
Why?
----
charset-normalizer will always remain pure Python, meaning that an environment without any build capabilities will
run this program without any additional requirements.
Nonetheless, starting from version 3.0 we introduced, and publish, some platform-specific wheels that include a
pre-built extension.
Most of the time is spent in the module `md.py` so we decided to "compile it" using Mypyc.
(1) It does not require a separate code base
(2) Our project code base is rather simple and lightweight
(3) Mypyc is robust enough today
(4) Four times faster!
How?
----
If your platform and/or architecture is not served by this swift optimization, you may compile it easily yourself.
Follow these instructions (provided you have the necessary toolchain installed):
::
export CHARSET_NORMALIZER_USE_MYPYC=1
pip install mypy build wheel
pip install charset-normalizer --no-binary :all:
How not to?
-----------
You may install charset-normalizer without the speedups by directly using the universal wheel
(most likely hosted on PyPI or any valid mirror you use) with ``--no-binary``.
E.g. when installing ``requests`` and you don't want to use the ``charset-normalizer`` speedups, you can do:
::
pip install requests --no-binary charset-normalizer
When installing `charset-normalizer` by itself, you can also pass ``:all:`` as the specifier to ``--no-binary``.
::
pip install charset-normalizer --no-binary :all:
python-charset-normalizer-3.4.2/docs/community/why_migrate.rst 0000664 0000000 0000000 00000002104 15005045421 0024666 0 ustar 00root root 0000000 0000000 Why should I migrate to Charset-Normalizer?
===========================================
There are so many reasons to migrate your current project. Here are some of them:
- Remove ANY license ambiguity/restriction for projects bundling Chardet (even indirectly).
- X10 faster than Chardet on average, X6 faster in 99% of cases, AND it supports 3 times more encodings.
- Never returns an encoding that is not suited to the given decoder. E.g. you will never get a UnicodeDecodeError!
- Actively maintained, open to contributors.
- Has the backward-compatible function ``detect`` that comes from Chardet.
- Truly detects the language used in the text.
- It is, for the first time, really universal, as there is no specific probe per charset!
- The package size is X2~X4 lower than Chardet's (5.0)! (Depends on your arch)
- Proposes many more options/public kwargs to tweak the detection as you see fit!
- Uses static typing to ease your development.
- Detects Unicode content better than Chardet or cChardet does.
And much more..! What are you waiting for? Upgrade now and give us feedback (even if negative).
python-charset-normalizer-3.4.2/docs/conf.py 0000775 0000000 0000000 00000011760 15005045421 0021103 0 ustar 00root root 0000000 0000000 #!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# charset-normalizer documentation build configuration file, created by
# sphinx-quickstart on Fri Jun 16 04:30:35 2017.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
import sys
import os
sys.path.insert(0, os.path.abspath(".."))
import charset_normalizer
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.doctest',
'sphinx.ext.intersphinx',
'sphinx.ext.todo',
'sphinx.ext.coverage',
'sphinx.ext.mathjax',
'sphinx.ext.ifconfig',
'sphinx.ext.viewcode',
'sphinx.ext.githubpages',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
# source_suffix = '.rst'
source_parsers = {}
source_suffix = ['.rst',]
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = 'charset_normalizer'
copyright = '2023, Ahmed TAHRI'
author = 'Ahmed TAHRI'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = charset_normalizer.__version__
# The full version, including alpha/beta/rc tags.
release = charset_normalizer.__version__
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = "en"
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This patterns also effect to html_static_path and html_extra_path
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False
# -- Options for HTML output ----------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'furo'
html_theme_path = []
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = []
# -- Options for HTMLHelp output ------------------------------------------
# Output file base name for HTML help builder.
htmlhelp_basename = 'charset-normalizer-doc'
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',
# Latex figure (float) alignment
#
# 'figure_align': 'htbp',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, 'charset-normalizer.tex', 'Charset Normalizer Documentation',
'Ahmed TAHRI', 'manual'),
]
# -- Options for manual page output ---------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
(master_doc, 'charset-normalizer', 'Charset Normalizer Documentation',
[author], 1)
]
# -- Options for Texinfo output -------------------------------------------
texinfo_documents = [
(master_doc, 'Charset Normalizer', 'Charset Normalizer Documentation',
author, 'charset-normalizer', '🔎 Like Chardet. 🚀 Package for encoding & language detection. Charset detection.',
'Miscellaneous'),
]
python-charset-normalizer-3.4.2/docs/index.rst 0000775 0000000 0000000 00000004447 15005045421 0021451 0 ustar 00root root 0000000 0000000 ===================
Charset Normalizer
===================
Overview
========
A library that helps you read text from an unknown charset encoding.
This project is motivated by chardet; I'm trying to resolve the issue by taking another approach.
All IANA character set names for which the Python core library provides codecs are supported.
It aims to be as generic as possible.
.. image:: https://repository-images.githubusercontent.com/200259335/d3da9600-dedc-11e9-83e8-081f597505df
:width: 500px
:alt: CLI Charset Normalizer
:align: right
It is released under MIT license, see LICENSE for more
details. Be aware that no warranty of any kind is provided with this package.
Copyright (C) 2025 Ahmed TAHRI
Introduction
============
This library aims to assist you in finding the encoding that best suits your content.
It **DOES NOT** try to uncover the originating encoding, in fact this program does not care about it.
By originating we mean the one that was precisely used to encode a text file.
Precisely ::
my_byte_str = 'Bonjour, je suis à la recherche d\'une aide sur les étoiles'.encode('cp1252')
We **ARE NOT** looking for cp1252 **BUT FOR** ``Bonjour, je suis à la recherche d'une aide sur les étoiles``.
Because of this ::
my_byte_str.decode('cp1252') == my_byte_str.decode('cp1256') == my_byte_str.decode('cp1258') == my_byte_str.decode('iso8859_14')
# Print True !
There is no wrong answer when decoding ``my_byte_str``; each of those code pages yields the exact same result.
This is where this library differs from others: there is no specific probe per encoding table.
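To make that concrete, here is a minimal sketch (reusing ``my_byte_str`` defined above) showing that the library hands you back the decoded text along with one plausible, suitable encoding:
::

    from charset_normalizer import from_bytes

    best_guess = from_bytes(my_byte_str).best()

    # One suitable code page among several equally valid ones
    print(best_guess.encoding)
    # The decoded text we were actually after
    print(str(best_guess))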
Features
========
- Encoding detection on a fp (file pointer), bytes or PathLike.
- Transpose any encoded content to Unicode, as best we can.
- Detect spoken language in text.
- Ship with a great CLI.
- Also, detect binaries.
Start Guide
-----------
.. toctree::
:maxdepth: 2
user/support
user/getstarted
user/advanced_search
user/handling_result
user/miscellaneous
user/cli
Community Guide
---------------
.. toctree::
:maxdepth: 2
community/speedup
community/faq
community/why_migrate
community/featured
Developer Guide
---------------
.. toctree::
:maxdepth: 3
api
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
python-charset-normalizer-3.4.2/docs/make.bat 0000664 0000000 0000000 00000001421 15005045421 0021177 0 ustar 00root root 0000000 0000000 @ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=build
set SPHINXPROJ=charset_normalizer
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
:end
popd
python-charset-normalizer-3.4.2/docs/requirements.txt 0000775 0000000 0000000 00000000014 15005045421 0023056 0 ustar 00root root 0000000 0000000 Sphinx
furo
python-charset-normalizer-3.4.2/docs/user/ 0000775 0000000 0000000 00000000000 15005045421 0020552 5 ustar 00root root 0000000 0000000 python-charset-normalizer-3.4.2/docs/user/advanced_search.rst 0000664 0000000 0000000 00000005344 15005045421 0024404 0 ustar 00root root 0000000 0000000 Advanced Search
===============
Charset Normalizer method ``from_bytes``, ``from_fp`` and ``from_path`` provide some
optional parameters that can be tweaked.
As follow ::
from charset_normalizer import from_bytes
my_byte_str = 'Bсеки човек има право на образование.'.encode('cp1251')
results = from_bytes(
my_byte_str,
steps=10, # Number of steps/block to extract from my_byte_str
chunk_size=512, # Set block size of each extraction
threshold=0.2, # Maximum amount of chaos allowed on first pass
cp_isolation=None, # Finite list of encoding to use when searching for a match
cp_exclusion=None, # Finite list of encoding to avoid when searching for a match
preemptive_behaviour=True, # Determine if we should look into my_byte_str (ASCII-Mode) for pre-defined encoding
explain=False, # Print on screen what is happening when searching for a match
language_threshold=0.1 # Minimum coherence ratio / language ratio match accepted
)
Using CharsetMatches
------------------------------
Here, ``results`` is a ``CharsetMatches`` object. It behaves like a list but does not implement all related methods.
Initially, it is sorted. Calling ``best()`` is sufficient to extract the most probable result.
.. autoclass:: charset_normalizer.CharsetMatches
:members:
List behaviour
--------------
As said earlier, a ``CharsetMatches`` object behaves like a list.
::
# Call len on results also work
if not results:
print('No match for your sequence')
# Iterate over results like a list
for match in results:
print(match.encoding, 'can decode properly your sequence using', match.alphabets, 'and language', match.language)
# Using index to access results
if results:
print(str(results[0]))
Using best()
------------
As said above, a ``CharsetMatches`` object behaves like a list and it is sorted by default after getting results from
``from_bytes``, ``from_fp`` or ``from_path``.
Using ``best()`` returns the most probable result, the first entry of the list (i.e. index 0).
It returns a ``CharsetMatch`` object, or None if there is no result inside it.
::
result = results.best()
Calling first()
---------------
The very same thing as calling the method ``best()``.
Class aliases
-------------
``CharsetMatches`` is also known as ``CharsetDetector``, ``CharsetDoctor`` and ``CharsetNormalizerMatches``.
It is useful if you prefer short class names.
Verbose output
--------------
You may want to understand why a specific encoding was not picked by charset_normalizer. All you have to do is pass
``explain=True`` when using the methods ``from_bytes``, ``from_fp`` or ``from_path``.
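For instance, a minimal sketch reusing ``my_byte_str`` from the example above:
::

    from charset_normalizer import from_bytes

    # explain=True temporarily attaches a StreamHandler so every internal
    # decision taken during the detection is printed out.
    results = from_bytes(my_byte_str, explain=True)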
python-charset-normalizer-3.4.2/docs/user/cli.rst 0000664 0000000 0000000 00000007162 15005045421 0022061 0 ustar 00root root 0000000 0000000 Command Line Interface
======================
charset-normalizer ships with a CLI that should be available as `normalizer`.
This is a great tool to fully exploit the detector capabilities without having to write Python code.
Possible use cases:
#. Quickly discover the probable originating charset of a file.
#. Quickly convert a non-Unicode file to Unicode.
#. Debug the charset-detector.
Down below, we will guide you through some basic examples.
Arguments
---------
You may simply invoke `normalizer -h` (with the h(elp) flag) to understand the basics.
::
usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
file [file ...]
The Real First Universal Charset Detector. Discover originating encoding used
on text file. Normalize text to unicode.
positional arguments:
files File(s) to be analysed
optional arguments:
-h, --help show this help message and exit
-v, --verbose Display complementary information about file if any.
Stdout will contain logs about the detection process.
-a, --with-alternative
Output complementary possibilities if any. Top-level
JSON WILL be a list.
-n, --normalize Permit to normalize input file. If not set, program
does not write anything.
-m, --minimal Only output the charset detected to STDOUT. Disabling
JSON output.
-r, --replace Replace file when trying to normalize it instead of
creating a new one.
-f, --force Replace file without asking if you are sure, use this
flag with caution.
-t THRESHOLD, --threshold THRESHOLD
Define a custom maximum amount of chaos allowed in
decoded content. 0. <= chaos <= 1.
--version Show version information and exit.
.. code:: bash
normalizer ./data/sample.1.fr.srt
You may also run the command line interface using:
.. code:: bash
python -m charset_normalizer ./data/sample.1.fr.srt
Main JSON Output
----------------
🎉 Since version 1.4.0 the CLI produces an easily usable stdout result in
JSON format.
.. code:: json
{
"path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
"encoding": "cp1252",
"encoding_aliases": [
"1252",
"windows_1252"
],
"alternative_encodings": [
"cp1254",
"cp1256",
"cp1258",
"iso8859_14",
"iso8859_15",
"iso8859_16",
"iso8859_3",
"iso8859_9",
"latin_1",
"mbcs"
],
"language": "French",
"alphabets": [
"Basic Latin",
"Latin-1 Supplement"
],
"has_sig_or_bom": false,
"chaos": 0.149,
"coherence": 97.152,
"unicode_path": null,
"is_preferred": true
}
I recommend the `jq` command line tool to easily parse and exploit specific data from the produced JSON.
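For example, assuming the sample file used earlier and that `jq` is installed, extracting just the detected encoding could look like this:

.. code:: bash

    normalizer ./data/sample.1.fr.srt | jq -r .encoding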
Multiple File Input
-------------------
It is possible to give multiple files to the CLI. It will produce a list instead of an object at the top level.
When using `-m` (minimal output), it will instead print one result (encoding) per line.
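A minimal sketch, assuming the sample files referenced elsewhere in this documentation:

.. code:: bash

    # One detected encoding per line, in the same order as the input files
    normalizer -m ./data/sample.1.fr.srt ./data/sample.1.ar.srt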
Unicode Conversion
------------------
If you desire to convert any file to Unicode you will need to append the flag `-n`. It will produce another file;
it won't replace the original by default.
The newly created file path will be declared in `unicode_path` (JSON output).
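For instance, a minimal sketch that produces a normalized (Unicode) copy of the sample file next to the original:

.. code:: bash

    normalizer -n ./data/sample.1.fr.srt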
python-charset-normalizer-3.4.2/docs/user/getstarted.rst 0000664 0000000 0000000 00000003321 15005045421 0023451 0 ustar 00root root 0000000 0000000 Installation
============
This installs a package that can be used from Python (``import charset_normalizer``).
To install for all users on the system, administrator rights (root) may be required.
Using PIP
---------
Charset Normalizer can be installed from pip::
pip install charset-normalizer
You may retrieve the latest unicodedata backport as follows::
pip install charset-normalizer[unicode_backport]
From git via master
-----------------------
You can install from dev-master branch using git::
git clone https://github.com/Ousret/charset_normalizer.git
cd charset_normalizer/
python setup.py install
Basic Usage
===========
The new way
-----------
You may want to get right to it. ::
from charset_normalizer import from_bytes, from_path
# Any raw bytes payload will do; reusing the example used elsewhere in these docs
my_byte_str = 'Bсеки човек има право на образование.'.encode('cp1251')
# This is going to print out your sequence once properly decoded
print(
str(
from_bytes(
my_byte_str
).best()
)
)
# You could also want the same from a file
print(
str(
from_path(
'./data/sample.1.ar.srt'
).best()
)
)
Backward compatibility
----------------------
If you are used to Python chardet, we provide the very same ``detect()`` method as chardet.
This function is mostly backward-compatible with Chardet; the migration should be painless.
::
from charset_normalizer import detect
# This will behave exactly the same as python chardet
result = detect(my_byte_str)
if result['encoding'] is not None:
print('got', result['encoding'], 'as detected encoding')
You may upgrade your code with ease:
search and replace (CTRL + R) ``from chardet import detect`` with ``from charset_normalizer import detect``.
python-charset-normalizer-3.4.2/docs/user/handling_result.rst 0000664 0000000 0000000 00000001123 15005045421 0024463 0 ustar 00root root 0000000 0000000 ================
Handling Result
================
When initiating a search on a buffer, bytes, or file, you can assign the return value and fully exploit it.
::
from charset_normalizer import from_bytes
my_byte_str = 'Bсеки човек има право на образование.'.encode('cp1251')
# Assign return value so we can fully exploit result
result = from_bytes(
my_byte_str
).best()
print(result.encoding) # cp1251
Using CharsetMatch
----------------------------
Here, ``result`` is a ``CharsetMatch`` object or ``None``.
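A few of its most useful attributes, as a minimal sketch continuing the example above (the full member list is documented below):
::

    if result is not None:
        print(result.encoding)   # e.g. cp1251
        print(result.language)   # most probable language of the payload
        print(result.alphabets)  # unicode ranges observed in the decoded text
        print(str(result))       # the decoded payload itself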
.. autoclass:: charset_normalizer.CharsetMatch
:members:
python-charset-normalizer-3.4.2/docs/user/miscellaneous.rst 0000664 0000000 0000000 00000003753 15005045421 0024157 0 ustar 00root root 0000000 0000000 ==============
Miscellaneous
==============
Convert to str
--------------
Any ``CharsetMatch`` object can be transformed into an exploitable ``str`` variable.
::
from charset_normalizer import from_bytes
my_byte_str = 'Bсеки човек има право на образование.'.encode('cp1251')
# Assign return value so we can fully exploit result
result = from_bytes(
my_byte_str
).best()
# This should print 'Bсеки човек има право на образование.'
print(str(result))
Logging
-------
Prior to version 2.0.11 you may have encountered some unexpected logs in your streams.
Something along the line of:
::
... | WARNING | override steps (5) and chunk_size (512) as content does not fit (465 byte(s) given) parameters.
... | INFO | ascii passed initial chaos probing. Mean measured chaos is 0.000000 %
... | INFO | ascii should target any language(s) of ['Latin Based']
It is most likely because you altered the root getLogger instance. The package has its own logic behind logging and
a good reason for it. See https://docs.python.org/3/howto/logging.html to learn the basics.
If you are looking to silence and/or drastically reduce the amount of logs, please upgrade to the latest version
available for `charset-normalizer` using your package manager or with `pip install charset-normalizer -U`.
The latest version no longer produces any entry greater than `DEBUG`.
On `DEBUG` only one entry will be observed, and it is about the detection result.
The other log entries are emitted at `Level 5`, commonly known as the TRACE level, which we do
not register globally.
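If you prefer to filter things out on your side instead, here is a minimal sketch using nothing but the standard ``logging`` module (``charset_normalizer`` is the logger name the package uses):
::

    import logging

    # Discard everything below WARNING emitted by the library, including its
    # single DEBUG entry about the detection result and the level 5 (TRACE) entries.
    logging.getLogger("charset_normalizer").setLevel(logging.WARNING)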
Detect binaries
---------------
This package offers a neat way to detect files that can be considered 'binaries',
meaning files that are not likely to be text files.
::
from charset_normalizer import is_binary
# It can receive both a path or bytes or even a file pointer.
result = is_binary("./my-file.ext")
# This should print 'True' or 'False'
print(result)
python-charset-normalizer-3.4.2/docs/user/support.rst 0000664 0000000 0000000 00000015617 15005045421 0023032 0 ustar 00root root 0000000 0000000 =================
Support
=================
**If you are running:**
- Python >=2.7,<3.5: Unsupported
- Python 3.5: charset-normalizer < 2.1
- Python 3.6: charset-normalizer < 3.1
Upgrade your Python interpreter as soon as possible.
-------------------
Supported Encodings
-------------------
Here is a list of supported encodings and languages as of the latest update. This list
may change depending on your Python version.
Charset Normalizer is able to detect any of those encodings. This list is NOT static and depends heavily on what your
current cPython version is shipped with. See https://docs.python.org/3/library/codecs.html#standard-encodings
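As a quick illustration (standard library only, not part of this package), you can check whether a given codec is shipped with your interpreter:
::

    import codecs

    try:
        codecs.lookup("cp1125")  # any name or alias from the table below
        print("available in this interpreter")
    except LookupError:
        print("not shipped with this interpreter")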
=============== ===============================================================================================================================
IANA Code Page Aliases
=============== ===============================================================================================================================
ascii 646, ansi_x3.4_1968, ansi_x3_4_1968, ansi_x3.4_1986, cp367, csascii, ibm367, iso646_us, iso_646.irv_1991, iso_ir_6, us, us_ascii
big5 big5_tw, csbig5, x_mac_trad_chinese
big5hkscs big5_hkscs, hkscs
cp037 037, csibm037, ebcdic_cp_ca, ebcdic_cp_nl, ebcdic_cp_us, ebcdic_cp_wt, ibm037, ibm039
cp1026 1026, csibm1026, ibm1026
cp1125 1125, ibm1125, cp866u, ruscii
cp1140 1140, ibm1140
cp1250 1250, windows_1250
cp1251 1251, windows_1251
cp1252 1252, windows_1252
cp1253 1253, windows_1253
cp1254 1254, windows_1254
cp1255 1255, windows_1255
cp1256 1256, windows_1256
cp1257 1257, windows_1257
cp1258 1258, windows_1258
cp273 273, ibm273, csibm273
cp424 424, csibm424, ebcdic_cp_he, ibm424
cp437 437, cspc8codepage437, ibm437
cp500 500, csibm500, ebcdic_cp_be, ebcdic_cp_ch, ibm500
cp775 775, cspc775baltic, ibm775
cp850 850, cspc850multilingual, ibm850
cp852 852, cspcp852, ibm852
cp855 855, csibm855, ibm855
cp857 857, csibm857, ibm857
cp858 858, csibm858, ibm858
cp860 860, csibm860, ibm860
cp861 861, cp_is, csibm861, ibm861
cp862 862, cspc862latinhebrew, ibm862
cp863 863, csibm863, ibm863
cp864 864, csibm864, ibm864
cp865 865, csibm865, ibm865
cp866 866, csibm866, ibm866
cp869 869, cp_gr, csibm869, ibm869
cp932 932, ms932, mskanji, ms_kanji
cp949 949, ms949, uhc
cp950 950, ms950
euc_jis_2004 jisx0213, eucjis2004, euc_jis2004
euc_jisx0213 eucjisx0213
euc_jp eucjp, ujis, u_jis
euc_kr euckr, korean, ksc5601, ks_c_5601, ks_c_5601_1987, ksx1001, ks_x_1001, x_mac_korean
gb18030 gb18030_2000
gb2312 chinese, csiso58gb231280, euc_cn, euccn, eucgb2312_cn, gb2312_1980, gb2312_80, iso_ir_58, x_mac_simp_chinese
gbk 936, cp936, ms936
hp_roman8 roman8, r8, csHPRoman8
hz hzgb, hz_gb, hz_gb_2312
iso2022_jp csiso2022jp, iso2022jp, iso_2022_jp
iso2022_jp_1 iso2022jp_1, iso_2022_jp_1
iso2022_jp_2 iso2022jp_2, iso_2022_jp_2
iso2022_jp_3 iso2022jp_3, iso_2022_jp_3
iso2022_jp_ext iso2022jp_ext, iso_2022_jp_ext
iso2022_kr csiso2022kr, iso2022kr, iso_2022_kr
iso8859_10 csisolatin6, iso_8859_10, iso_8859_10_1992, iso_ir_157, l6, latin6
iso8859_11 thai, iso_8859_11, iso_8859_11_2001
iso8859_13 iso_8859_13, l7, latin7
iso8859_14 iso_8859_14, iso_8859_14_1998, iso_celtic, iso_ir_199, l8, latin8
iso8859_15 iso_8859_15, l9, latin9
iso8859_16 iso_8859_16, iso_8859_16_2001, iso_ir_226, l10, latin10
iso8859_2 csisolatin2, iso_8859_2, iso_8859_2_1987, iso_ir_101, l2, latin2
iso8859_3 csisolatin3, iso_8859_3, iso_8859_3_1988, iso_ir_109, l3, latin3
iso8859_4 csisolatin4, iso_8859_4, iso_8859_4_1988, iso_ir_110, l4, latin4
iso8859_5 csisolatincyrillic, cyrillic, iso_8859_5, iso_8859_5_1988, iso_ir_144
iso8859_6 arabic, asmo_708, csisolatinarabic, ecma_114, iso_8859_6, iso_8859_6_1987, iso_ir_127
iso8859_7 csisolatingreek, ecma_118, elot_928, greek, greek8, iso_8859_7, iso_8859_7_1987, iso_ir_126
iso8859_8 csisolatinhebrew, hebrew, iso_8859_8, iso_8859_8_1988, iso_ir_138
iso8859_9 csisolatin5, iso_8859_9, iso_8859_9_1989, iso_ir_148, l5, latin5
iso2022_jp_2004 iso_2022_jp_2004, iso2022jp_2004
johab cp1361, ms1361
koi8_r cskoi8r
kz1048 kz_1048, rk1048, strk1048_2002
latin_1 8859, cp819, csisolatin1, ibm819, iso8859, iso8859_1, iso_8859_1, iso_8859_1_1987, iso_ir_100, l1, latin, latin1
mac_cyrillic maccyrillic
mac_greek macgreek
mac_iceland maciceland
mac_latin2 maccentraleurope, maclatin2
mac_roman macintosh, macroman
mac_turkish macturkish
ptcp154 csptcp154, pt154, cp154, cyrillic_asian
shift_jis csshiftjis, shiftjis, sjis, s_jis, x_mac_japanese
shift_jis_2004 shiftjis2004, sjis_2004, s_jis_2004
shift_jisx0213 shiftjisx0213, sjisx0213, s_jisx0213
tis_620 tis620, tis_620_0, tis_620_2529_0, tis_620_2529_1, iso_ir_166
utf_16 u16, utf16
utf_16_be unicodebigunmarked, utf_16be
utf_16_le unicodelittleunmarked, utf_16le
utf_32 u32, utf32
utf_32_be utf_32be
utf_32_le utf_32le
utf_8 u8, utf, utf8, utf8_ucs2, utf8_ucs4 (+utf_8_sig)
utf_7* u7, unicode-1-1-utf-7
cp720 N.A.
cp737 N.A.
cp856 N.A.
cp874 N.A.
cp875 N.A.
cp1006 N.A.
koi8_r N.A.
koi8_t N.A.
koi8_u N.A.
=============== ===============================================================================================================================
*: Only if a SIG/mark is found.
-------------------
Supported Languages
-------------------
These languages can be detected inside your content. All of them are specified in ./charset_normalizer/assets/__init__.py.
| English,
| German,
| French,
| Dutch,
| Italian,
| Polish,
| Spanish,
| Russian,
| Japanese,
| Portuguese,
| Swedish,
| Chinese,
| Ukrainian,
| Norwegian,
| Finnish,
| Vietnamese,
| Czech,
| Hungarian,
| Korean,
| Indonesian,
| Turkish,
| Romanian,
| Farsi,
| Arabic,
| Danish,
| Serbian,
| Lithuanian,
| Slovene,
| Slovak,
| Malay,
| Hebrew,
| Bulgarian,
| Croatian,
| Hindi,
| Estonian,
| Thai,
| Greek,
| Tamil.
----------------------------
Incomplete Sequence / Stream
----------------------------
It is not (yet) officially supported. If you feed an incomplete byte sequence (e.g. a truncated multi-byte sequence), the detector will
most likely fail to return a proper result.
If you are purposely feeding only part of your payload for performance concerns, you may stop doing so, as this package is fairly optimized.
We are working on a dedicated way to handle streams.
python-charset-normalizer-3.4.2/noxfile.py 0000664 0000000 0000000 00000014343 15005045421 0020667 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import os
import shutil
import nox
def test_impl(
session: nox.Session,
use_mypyc: bool = False,
):
# Install deps and the package itself.
session.install("-U", "pip", "setuptools", silent=False)
session.install("-r", "dev-requirements.txt", silent=False)
session.install(
".",
silent=False,
env={"CHARSET_NORMALIZER_USE_MYPYC": "1" if use_mypyc else "0"},
)
# Show the pip version.
session.run("pip", "--version")
# Print the Python version.
session.run("python", "--version")
# Show charset-normalizer cli info
session.run("normalizer", "--version")
# Inspired from https://hynek.me/articles/ditch-codecov-python/
# We use parallel mode and then combine in a later CI step
session.run(
"python",
"-m",
"coverage",
"run",
"--parallel-mode",
"-m",
"pytest",
"-v",
"-ra",
f"--color={'yes' if 'GITHUB_ACTIONS' in os.environ else 'auto'}",
"--tb=native",
"--durations=10",
"--strict-config",
"--strict-markers",
*(session.posargs or ("tests/",)),
env={
"PYTHONWARNINGS": "always::DeprecationWarning",
"COVERAGE_CORE": "sysmon",
},
)
@nox.session(
python=["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "3.14", "pypy"]
)
def test(session: nox.Session) -> None:
test_impl(session)
@nox.session(python=["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13"])
def test_mypyc(session: nox.Session) -> None:
test_impl(session, True)
def git_clone(session: nox.Session, git_url: str) -> None:
"""We either clone the target repository or if already exist
simply reset the state and pull.
"""
expected_directory = git_url.split("/")[-1]
if expected_directory.endswith(".git"):
expected_directory = expected_directory[:-4]
if not os.path.isdir(expected_directory):
session.run("git", "clone", "--depth", "1", git_url, external=True)
else:
session.run(
"git", "-C", expected_directory, "reset", "--hard", "HEAD", external=True
)
session.run("git", "-C", expected_directory, "pull", external=True)
@nox.session()
def backward_compatibility(session: nox.Session) -> None:
git_clone(session, "https://github.com/ousret/char-dataset")
# Install deps and the package itself.
session.install("-U", "pip", "setuptools", silent=False)
session.install("-r", "dev-requirements.txt", silent=False)
session.install(".", silent=False)
session.install("chardet")
session.run(
"python",
"bin/bc.py",
*(session.posargs or ("--coverage=85",)),
)
@nox.session()
def coverage(session: nox.Session) -> None:
git_clone(session, "https://github.com/ousret/char-dataset")
# Install deps and the package itself.
session.install("-U", "pip", "setuptools", silent=False)
session.install("-r", "dev-requirements.txt", silent=False)
session.install(".", silent=False)
# Show the pip version.
session.run("pip", "--version")
# Print the Python version.
session.run("python", "--version")
# Show charset-normalizer cli info
session.run("normalizer", "--version")
session.run(
"python",
"-m",
"coverage",
"run",
"--parallel-mode",
"bin/coverage.py",
*(session.posargs or ("--coverage=90", "--with-preemptive")),
)
@nox.session()
def performance(session: nox.Session) -> None:
git_clone(session, "https://github.com/ousret/char-dataset")
# Install deps and the package itself.
session.install("-U", "pip", "setuptools", silent=False)
session.install("-r", "dev-requirements.txt", silent=False)
session.install("chardet")
session.install(".", silent=False, env={"CHARSET_NORMALIZER_USE_MYPYC": "1"})
session.run(
"python",
"bin/performance.py",
*(session.posargs or ()),
)
@nox.session()
def downstream_niquests(session: nox.Session) -> None:
root = os.getcwd()
tmp_dir = session.create_tmp()
session.cd(tmp_dir)
git_clone(session, "https://github.com/jawah/niquests")
session.chdir("niquests")
session.run("git", "rev-parse", "HEAD", external=True)
session.install(".[socks]", silent=False)
session.install("-r", "requirements-dev.txt", silent=False)
session.cd(root)
session.install(".", silent=False)
session.cd(f"{tmp_dir}/niquests")
session.run(
"python",
"-c",
"import charset_normalizer; print(charset_normalizer.__version__)",
)
session.run(
"python",
"-m",
"pytest",
"-v",
f"--color={'yes' if 'GITHUB_ACTIONS' in os.environ else 'auto'}",
*(session.posargs or ("tests/",)),
env={"NIQUESTS_STRICT_OCSP": "1"},
)
@nox.session()
def downstream_requests(session: nox.Session) -> None:
root = os.getcwd()
tmp_dir = session.create_tmp()
session.cd(tmp_dir)
git_clone(session, "https://github.com/psf/requests")
session.chdir("requests")
session.run("git", "rev-parse", "HEAD", external=True)
session.install(".[socks]", silent=False)
session.install("-r", "requirements-dev.txt", silent=False)
session.cd(root)
session.install(".", silent=False)
session.cd(f"{tmp_dir}/requests")
session.run(
"python",
"-c",
"import charset_normalizer; print(charset_normalizer.__version__)",
)
session.run(
"python",
"-m",
"pytest",
"-v",
f"--color={'yes' if 'GITHUB_ACTIONS' in os.environ else 'auto'}",
*(session.posargs or ("tests/",)),
)
@nox.session()
def format(session: nox.Session) -> None:
"""Run code formatters."""
lint(session)
@nox.session
def lint(session: nox.Session) -> None:
session.install("pre-commit")
session.run("pre-commit", "run", "--all-files")
@nox.session
def docs(session: nox.Session) -> None:
session.install("-r", "docs/requirements.txt")
session.install(".")
session.chdir("docs")
if os.path.exists("_build"):
shutil.rmtree("_build")
session.run("sphinx-build", "-b", "html", "-W", ".", "_build/html")
python-charset-normalizer-3.4.2/pyproject.toml 0000664 0000000 0000000 00000005306 15005045421 0021564 0 ustar 00root root 0000000 0000000 [build-system]
requires = ["setuptools", "setuptools-scm", "mypy>=1.4.1,<=1.15.0"]
build-backend = "setuptools.build_meta"
[project]
name = "charset-normalizer"
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
license = {text = "MIT"}
keywords = ["encoding", "charset", "charset-detector", "detector", "normalization", "unicode", "chardet", "detect"]
authors = [
{name = "Ahmed R. TAHRI", email="tahri.ahmed@proton.me"},
]
maintainers = [
{name = "Ahmed R. TAHRI", email="tahri.ahmed@proton.me"},
]
classifiers = [
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
"Topic :: Text Processing :: Linguistic",
"Topic :: Utilities",
"Typing :: Typed",
]
requires-python = ">=3.7"
dynamic = ["version", "readme"]
[project.optional-dependencies]
unicode_backport = []
[tool.setuptools]
package-dir = {"" = "src"}
packages = ["charset_normalizer", "charset_normalizer.cli", ]
[tool.setuptools.dynamic]
version = {attr = "charset_normalizer.__version__"}
readme = {file = ["README.md", "CHANGELOG.md", "LICENSE"], content-type = "text/markdown"}
[project.scripts]
normalizer = "charset_normalizer:cli.cli_detect"
[project.urls]
"Changelog" = "https://github.com/jawah/charset_normalizer/blob/master/CHANGELOG.md"
"Documentation" = "https://charset-normalizer.readthedocs.io/"
"Code" = "https://github.com/jawah/charset_normalizer"
"Issue tracker" = "https://github.com/jawah/charset_normalizer/issues"
[tool.pytest.ini_options]
log_level = "DEBUG"
filterwarnings = [
"error",
]
[tool.isort]
profile = "black"
add_imports = "from __future__ import annotations"
[tool.mypy]
check_untyped_defs = true
disallow_any_generics = true
disallow_incomplete_defs = true
disallow_subclassing_any = true
disallow_untyped_calls = true
disallow_untyped_decorators = true
disallow_untyped_defs = true
no_implicit_optional = true
no_implicit_reexport = true
show_error_codes = true
strict_equality = true
warn_redundant_casts = true
warn_return_any = true
warn_unused_configs = true
warn_unused_ignores = false
python-charset-normalizer-3.4.2/setup.py 0000664 0000000 0000000 00000001131 15005045421 0020352 0 ustar 00root root 0000000 0000000 #!/usr/bin/env python
from __future__ import annotations
import os
import sys
from setuptools import setup
USE_MYPYC = False
if len(sys.argv) > 1 and sys.argv[1] == "--use-mypyc":
sys.argv.pop(1)
USE_MYPYC = True
elif os.getenv("CHARSET_NORMALIZER_USE_MYPYC", None) == "1":
USE_MYPYC = True
if USE_MYPYC:
from mypyc.build import mypycify
MYPYC_MODULES = mypycify(
[
"src/charset_normalizer/md.py",
],
debug_level="0",
opt_level="3",
)
else:
MYPYC_MODULES = None
setup(name="charset-normalizer", ext_modules=MYPYC_MODULES)
python-charset-normalizer-3.4.2/src/ 0000775 0000000 0000000 00000000000 15005045421 0017433 5 ustar 00root root 0000000 0000000 python-charset-normalizer-3.4.2/src/charset_normalizer/ 0000775 0000000 0000000 00000000000 15005045421 0023326 5 ustar 00root root 0000000 0000000 python-charset-normalizer-3.4.2/src/charset_normalizer/__init__.py 0000664 0000000 0000000 00000003066 15005045421 0025444 0 ustar 00root root 0000000 0000000 """
Charset-Normalizer
~~~~~~~~~~~~~~
The Real First Universal Charset Detector.
A library that helps you read text from an unknown charset encoding.
Motivated by chardet, this package is trying to resolve the issue by taking a new approach.
All IANA character set names for which the Python core library provides codecs are supported.
Basic usage:
>>> from charset_normalizer import from_bytes
>>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
>>> best_guess = results.best()
>>> str(best_guess)
'Bсеки човек има право на образование. Oбразованието!'
Other methods and usages are available - see the full documentation
at <https://charset-normalizer.readthedocs.io/>.
:copyright: (c) 2021 by Ahmed TAHRI
:license: MIT, see LICENSE for more details.
"""
from __future__ import annotations
import logging
from .api import from_bytes, from_fp, from_path, is_binary
from .legacy import detect
from .models import CharsetMatch, CharsetMatches
from .utils import set_logging_handler
from .version import VERSION, __version__
__all__ = (
"from_fp",
"from_path",
"from_bytes",
"is_binary",
"detect",
"CharsetMatch",
"CharsetMatches",
"__version__",
"VERSION",
"set_logging_handler",
)
# Attach a NullHandler to the top level logger by default
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
python-charset-normalizer-3.4.2/src/charset_normalizer/__main__.py 0000664 0000000 0000000 00000000155 15005045421 0025421 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from .cli import cli_detect
if __name__ == "__main__":
cli_detect()
python-charset-normalizer-3.4.2/src/charset_normalizer/api.py 0000664 0000000 0000000 00000054131 15005045421 0024455 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import logging
from os import PathLike
from typing import BinaryIO
from .cd import (
coherence_ratio,
encoding_languages,
mb_encoding_languages,
merge_coherence_ratios,
)
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
from .md import mess_ratio
from .models import CharsetMatch, CharsetMatches
from .utils import (
any_specified_encoding,
cut_sequence_chunks,
iana_name,
identify_sig_or_bom,
is_cp_similar,
is_multi_byte_encoding,
should_strip_sig_or_bom,
)
logger = logging.getLogger("charset_normalizer")
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
)
def from_bytes(
sequences: bytes | bytearray,
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.2,
cp_isolation: list[str] | None = None,
cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = True,
) -> CharsetMatches:
"""
Given a raw bytes sequence, return the best possibles charset usable to render str objects.
If there is no results, it is a strong indicator that the source is binary/not text.
By default, the process will extract 5 blocks of 512 bytes each to assess the mess and coherence of a given sequence.
And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.
The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritizes a particular code page
but never takes it for granted. It can improve the performance.
You may want to focus your attention on some code pages and/or exclude others, use cp_isolation and cp_exclusion for that
purpose.
This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
By default the library does not set up any handler other than the NullHandler, if you choose to set the 'explain'
toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
Custom logging format and handler can be set manually.
"""
if not isinstance(sequences, (bytearray, bytes)):
raise TypeError(
"Expected object of type bytes or bytearray, got: {}".format(
type(sequences)
)
)
if explain:
previous_logger_level: int = logger.level
logger.addHandler(explain_handler)
logger.setLevel(TRACE)
length: int = len(sequences)
if length == 0:
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
if explain: # Defensive: ensure exit path clean handler
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level or logging.WARNING)
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
if cp_isolation is not None:
logger.log(
TRACE,
"cp_isolation is set. use this flag for debugging purpose. "
"limited list of encoding allowed : %s.",
", ".join(cp_isolation),
)
cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
else:
cp_isolation = []
if cp_exclusion is not None:
logger.log(
TRACE,
"cp_exclusion is set. use this flag for debugging purpose. "
"limited list of encoding excluded : %s.",
", ".join(cp_exclusion),
)
cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
else:
cp_exclusion = []
if length <= (chunk_size * steps):
logger.log(
TRACE,
"override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
steps,
chunk_size,
length,
)
steps = 1
chunk_size = length
if steps > 1 and length / steps < chunk_size:
chunk_size = int(length / steps)
is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
if is_too_small_sequence:
logger.log(
TRACE,
"Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
length
),
)
elif is_too_large_sequence:
logger.log(
TRACE,
"Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
length
),
)
prioritized_encodings: list[str] = []
specified_encoding: str | None = (
any_specified_encoding(sequences) if preemptive_behaviour else None
)
if specified_encoding is not None:
prioritized_encodings.append(specified_encoding)
logger.log(
TRACE,
"Detected declarative mark in sequence. Priority +1 given for %s.",
specified_encoding,
)
tested: set[str] = set()
tested_but_hard_failure: list[str] = []
tested_but_soft_failure: list[str] = []
fallback_ascii: CharsetMatch | None = None
fallback_u8: CharsetMatch | None = None
fallback_specified: CharsetMatch | None = None
results: CharsetMatches = CharsetMatches()
early_stop_results: CharsetMatches = CharsetMatches()
sig_encoding, sig_payload = identify_sig_or_bom(sequences)
if sig_encoding is not None:
prioritized_encodings.append(sig_encoding)
logger.log(
TRACE,
"Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
len(sig_payload),
sig_encoding,
)
prioritized_encodings.append("ascii")
if "utf_8" not in prioritized_encodings:
prioritized_encodings.append("utf_8")
for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
if cp_isolation and encoding_iana not in cp_isolation:
continue
if cp_exclusion and encoding_iana in cp_exclusion:
continue
if encoding_iana in tested:
continue
tested.add(encoding_iana)
decoded_payload: str | None = None
bom_or_sig_available: bool = sig_encoding == encoding_iana
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
encoding_iana
)
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
logger.log(
TRACE,
"Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
encoding_iana,
)
continue
if encoding_iana in {"utf_7"} and not bom_or_sig_available:
logger.log(
TRACE,
"Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
encoding_iana,
)
continue
try:
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
except (ModuleNotFoundError, ImportError):
logger.log(
TRACE,
"Encoding %s does not provide an IncrementalDecoder",
encoding_iana,
)
continue
try:
if is_too_large_sequence and is_multi_byte_decoder is False:
str(
(
sequences[: int(50e4)]
if strip_sig_or_bom is False
else sequences[len(sig_payload) : int(50e4)]
),
encoding=encoding_iana,
)
else:
decoded_payload = str(
(
sequences
if strip_sig_or_bom is False
else sequences[len(sig_payload) :]
),
encoding=encoding_iana,
)
except (UnicodeDecodeError, LookupError) as e:
if not isinstance(e, LookupError):
logger.log(
TRACE,
"Code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
tested_but_hard_failure.append(encoding_iana)
continue
similar_soft_failure_test: bool = False
for encoding_soft_failed in tested_but_soft_failure:
if is_cp_similar(encoding_iana, encoding_soft_failed):
similar_soft_failure_test = True
break
if similar_soft_failure_test:
logger.log(
TRACE,
"%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
encoding_iana,
encoding_soft_failed,
)
continue
r_ = range(
0 if not bom_or_sig_available else len(sig_payload),
length,
int(length / steps),
)
multi_byte_bonus: bool = (
is_multi_byte_decoder
and decoded_payload is not None
and len(decoded_payload) < length
)
if multi_byte_bonus:
logger.log(
TRACE,
"Code page %s is a multi byte encoding table and it appear that at least one character "
"was encoded using n-bytes.",
encoding_iana,
)
max_chunk_gave_up: int = int(len(r_) / 4)
max_chunk_gave_up = max(max_chunk_gave_up, 2)
early_stop_count: int = 0
lazy_str_hard_failure = False
md_chunks: list[str] = []
md_ratios = []
try:
for chunk in cut_sequence_chunks(
sequences,
encoding_iana,
r_,
chunk_size,
bom_or_sig_available,
strip_sig_or_bom,
sig_payload,
is_multi_byte_decoder,
decoded_payload,
):
md_chunks.append(chunk)
md_ratios.append(
mess_ratio(
chunk,
threshold,
explain is True and 1 <= len(cp_isolation) <= 2,
)
)
if md_ratios[-1] >= threshold:
early_stop_count += 1
if (early_stop_count >= max_chunk_gave_up) or (
bom_or_sig_available and strip_sig_or_bom is False
):
break
except (
UnicodeDecodeError
) as e: # Lazy str loading may have missed something there
logger.log(
TRACE,
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
early_stop_count = max_chunk_gave_up
lazy_str_hard_failure = True
# We might want to check the sequence again with the whole content
# Only if initial MD tests passes
if (
not lazy_str_hard_failure
and is_too_large_sequence
and not is_multi_byte_decoder
):
try:
sequences[int(50e3) :].decode(encoding_iana, errors="strict")
except UnicodeDecodeError as e:
logger.log(
TRACE,
"LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
tested_but_hard_failure.append(encoding_iana)
continue
mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
tested_but_soft_failure.append(encoding_iana)
logger.log(
TRACE,
"%s was excluded because of initial chaos probing. Gave up %i time(s). "
"Computed mean chaos is %f %%.",
encoding_iana,
early_stop_count,
round(mean_mess_ratio * 100, ndigits=3),
)
# Preparing those fallbacks in case we got nothing.
if (
enable_fallback
and encoding_iana in ["ascii", "utf_8", specified_encoding]
and not lazy_str_hard_failure
):
fallback_entry = CharsetMatch(
sequences,
encoding_iana,
threshold,
False,
[],
decoded_payload,
preemptive_declaration=specified_encoding,
)
if encoding_iana == specified_encoding:
fallback_specified = fallback_entry
elif encoding_iana == "ascii":
fallback_ascii = fallback_entry
else:
fallback_u8 = fallback_entry
continue
logger.log(
TRACE,
"%s passed initial chaos probing. Mean measured chaos is %f %%",
encoding_iana,
round(mean_mess_ratio * 100, ndigits=3),
)
if not is_multi_byte_decoder:
target_languages: list[str] = encoding_languages(encoding_iana)
else:
target_languages = mb_encoding_languages(encoding_iana)
if target_languages:
logger.log(
TRACE,
"{} should target any language(s) of {}".format(
encoding_iana, str(target_languages)
),
)
cd_ratios = []
# We shall skip the CD when its about ASCII
# Most of the time its not relevant to run "language-detection" on it.
if encoding_iana != "ascii":
for chunk in md_chunks:
chunk_languages = coherence_ratio(
chunk,
language_threshold,
",".join(target_languages) if target_languages else None,
)
cd_ratios.append(chunk_languages)
cd_ratios_merged = merge_coherence_ratios(cd_ratios)
if cd_ratios_merged:
logger.log(
TRACE,
"We detected language {} using {}".format(
cd_ratios_merged, encoding_iana
),
)
current_match = CharsetMatch(
sequences,
encoding_iana,
mean_mess_ratio,
bom_or_sig_available,
cd_ratios_merged,
(
decoded_payload
if (
is_too_large_sequence is False
or encoding_iana in [specified_encoding, "ascii", "utf_8"]
)
else None
),
preemptive_declaration=specified_encoding,
)
results.append(current_match)
if (
encoding_iana in [specified_encoding, "ascii", "utf_8"]
and mean_mess_ratio < 0.1
):
# If md says nothing to worry about, then... stop immediately!
if mean_mess_ratio == 0.0:
logger.debug(
"Encoding detection: %s is most likely the one.",
current_match.encoding,
)
if explain: # Defensive: ensure exit path clean handler
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([current_match])
early_stop_results.append(current_match)
if (
len(early_stop_results)
and (specified_encoding is None or specified_encoding in tested)
and "ascii" in tested
and "utf_8" in tested
):
probable_result: CharsetMatch = early_stop_results.best() # type: ignore[assignment]
logger.debug(
"Encoding detection: %s is most likely the one.",
probable_result.encoding,
)
if explain: # Defensive: ensure exit path clean handler
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([probable_result])
if encoding_iana == sig_encoding:
logger.debug(
"Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
"the beginning of the sequence.",
encoding_iana,
)
if explain: # Defensive: ensure exit path clean handler
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([results[encoding_iana]])
if len(results) == 0:
if fallback_u8 or fallback_ascii or fallback_specified:
logger.log(
TRACE,
"Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
)
if fallback_specified:
logger.debug(
"Encoding detection: %s will be used as a fallback match",
fallback_specified.encoding,
)
results.append(fallback_specified)
elif (
(fallback_u8 and fallback_ascii is None)
or (
fallback_u8
and fallback_ascii
and fallback_u8.fingerprint != fallback_ascii.fingerprint
)
or (fallback_u8 is not None)
):
logger.debug("Encoding detection: utf_8 will be used as a fallback match")
results.append(fallback_u8)
elif fallback_ascii:
logger.debug("Encoding detection: ascii will be used as a fallback match")
results.append(fallback_ascii)
if results:
logger.debug(
"Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
results.best().encoding, # type: ignore
len(results) - 1,
)
else:
logger.debug("Encoding detection: Unable to determine any suitable charset.")
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return results
def from_fp(
fp: BinaryIO,
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: list[str] | None = None,
cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = True,
) -> CharsetMatches:
"""
Same thing as the function from_bytes but using a file pointer that is already ready.
Will not close the file pointer.
"""
return from_bytes(
fp.read(),
steps,
chunk_size,
threshold,
cp_isolation,
cp_exclusion,
preemptive_behaviour,
explain,
language_threshold,
enable_fallback,
)
def from_path(
path: str | bytes | PathLike, # type: ignore[type-arg]
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: list[str] | None = None,
cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = True,
) -> CharsetMatches:
"""
Same thing as the function from_bytes but with one extra step: opening and reading the given file path in binary mode.
Can raise IOError.
"""
with open(path, "rb") as fp:
return from_fp(
fp,
steps,
chunk_size,
threshold,
cp_isolation,
cp_exclusion,
preemptive_behaviour,
explain,
language_threshold,
enable_fallback,
)
def is_binary(
fp_or_path_or_payload: PathLike | str | BinaryIO | bytes, # type: ignore[type-arg]
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: list[str] | None = None,
cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = False,
) -> bool:
"""
Detect if the given input (file, bytes, or path) points to a binary file, i.e. not a string.
Based on the same main heuristic algorithms and default kwargs, with the sole exception that fallback matches
are disabled, making it stricter with content that is ASCII-compatible but unlikely to be a string.
"""
if isinstance(fp_or_path_or_payload, (str, PathLike)):
guesses = from_path(
fp_or_path_or_payload,
steps=steps,
chunk_size=chunk_size,
threshold=threshold,
cp_isolation=cp_isolation,
cp_exclusion=cp_exclusion,
preemptive_behaviour=preemptive_behaviour,
explain=explain,
language_threshold=language_threshold,
enable_fallback=enable_fallback,
)
elif isinstance(
fp_or_path_or_payload,
(
bytes,
bytearray,
),
):
guesses = from_bytes(
fp_or_path_or_payload,
steps=steps,
chunk_size=chunk_size,
threshold=threshold,
cp_isolation=cp_isolation,
cp_exclusion=cp_exclusion,
preemptive_behaviour=preemptive_behaviour,
explain=explain,
language_threshold=language_threshold,
enable_fallback=enable_fallback,
)
else:
guesses = from_fp(
fp_or_path_or_payload,
steps=steps,
chunk_size=chunk_size,
threshold=threshold,
cp_isolation=cp_isolation,
cp_exclusion=cp_exclusion,
preemptive_behaviour=preemptive_behaviour,
explain=explain,
language_threshold=language_threshold,
enable_fallback=enable_fallback,
)
return not guesses
python-charset-normalizer-3.4.2/src/charset_normalizer/cd.py 0000664 0000000 0000000 00000030352 15005045421 0024271 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import importlib
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Counter as TypeCounter
from .constant import (
FREQUENCIES,
KO_NAMES,
LANGUAGE_SUPPORTED_COUNT,
TOO_SMALL_SEQUENCE,
ZH_NAMES,
)
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
is_accentuated,
is_latin,
is_multi_byte_encoding,
is_unicode_range_secondary,
unicode_range,
)
def encoding_unicode_range(iana_name: str) -> list[str]:
"""
Return associated unicode ranges in a single byte code page.
"""
if is_multi_byte_encoding(iana_name):
raise OSError("Function not supported on multi-byte code page")
decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder
p: IncrementalDecoder = decoder(errors="ignore")
seen_ranges: dict[str, int] = {}
character_count: int = 0
for i in range(0x40, 0xFF):
chunk: str = p.decode(bytes([i]))
if chunk:
character_range: str | None = unicode_range(chunk)
if character_range is None:
continue
if is_unicode_range_secondary(character_range) is False:
if character_range not in seen_ranges:
seen_ranges[character_range] = 0
seen_ranges[character_range] += 1
character_count += 1
return sorted(
[
character_range
for character_range in seen_ranges
if seen_ranges[character_range] / character_count >= 0.15
]
)
def unicode_range_languages(primary_range: str) -> list[str]:
"""
Return inferred languages used with a unicode range.
"""
languages: list[str] = []
for language, characters in FREQUENCIES.items():
for character in characters:
if unicode_range(character) == primary_range:
languages.append(language)
break
return languages
@lru_cache()
def encoding_languages(iana_name: str) -> list[str]:
"""
Single-byte encoding language association. Some code pages are heavily linked to particular language(s).
This function does the correspondence.
"""
unicode_ranges: list[str] = encoding_unicode_range(iana_name)
primary_range: str | None = None
for specified_range in unicode_ranges:
if "Latin" not in specified_range:
primary_range = specified_range
break
if primary_range is None:
return ["Latin Based"]
return unicode_range_languages(primary_range)
@lru_cache()
def mb_encoding_languages(iana_name: str) -> list[str]:
"""
Multi-byte encoding language association. Some code pages are heavily linked to particular language(s).
This function does the correspondence.
"""
if (
iana_name.startswith("shift_")
or iana_name.startswith("iso2022_jp")
or iana_name.startswith("euc_j")
or iana_name == "cp932"
):
return ["Japanese"]
if iana_name.startswith("gb") or iana_name in ZH_NAMES:
return ["Chinese"]
if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
return ["Korean"]
return []
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> tuple[bool, bool]:
"""
Determine the main aspects of a supported language: whether it contains accents and whether it is pure Latin.
"""
target_have_accents: bool = False
target_pure_latin: bool = True
for character in FREQUENCIES[language]:
if not target_have_accents and is_accentuated(character):
target_have_accents = True
if target_pure_latin and is_latin(character) is False:
target_pure_latin = False
return target_have_accents, target_pure_latin
def alphabet_languages(
characters: list[str], ignore_non_latin: bool = False
) -> list[str]:
"""
Return the languages associated with the given characters.
"""
languages: list[tuple[str, float]] = []
source_have_accents = any(is_accentuated(character) for character in characters)
for language, language_characters in FREQUENCIES.items():
target_have_accents, target_pure_latin = get_target_features(language)
if ignore_non_latin and target_pure_latin is False:
continue
if target_have_accents is False and source_have_accents:
continue
character_count: int = len(language_characters)
character_match_count: int = len(
[c for c in language_characters if c in characters]
)
ratio: float = character_match_count / character_count
if ratio >= 0.2:
languages.append((language, ratio))
languages = sorted(languages, key=lambda x: x[1], reverse=True)
return [compatible_language[0] for compatible_language in languages]
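# Illustrative usage (hedged sketch): feeding the most common Latin letters should surface
# several Latin-based languages, best coverage first. Alternative labels carrying the
# em dash (e.g. "English—") may appear as well; exact ordering depends on FREQUENCIES.
#
#     >>> alphabet_languages(["e", "a", "t", "i", "o", "n", "s", "r"])
#     ['English', 'English—', ...]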
def characters_popularity_compare(
language: str, ordered_characters: list[str]
) -> float:
"""
Determine whether an ordered character list (by occurrence, from most frequent to rarest) matches a particular language.
The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
Beware that this function is not strict on the match, in order to ease the detection. (Meaning a close match counts as 1.)
"""
if language not in FREQUENCIES:
raise ValueError(f"{language} not available")
character_approved_count: int = 0
FREQUENCIES_language_set = set(FREQUENCIES[language])
ordered_characters_count: int = len(ordered_characters)
target_language_characters_count: int = len(FREQUENCIES[language])
large_alphabet: bool = target_language_characters_count > 26
for character, character_rank in zip(
ordered_characters, range(0, ordered_characters_count)
):
if character not in FREQUENCIES_language_set:
continue
character_rank_in_language: int = FREQUENCIES[language].index(character)
expected_projection_ratio: float = (
target_language_characters_count / ordered_characters_count
)
character_rank_projection: int = int(character_rank * expected_projection_ratio)
if (
large_alphabet is False
and abs(character_rank_projection - character_rank_in_language) > 4
):
continue
if (
large_alphabet is True
and abs(character_rank_projection - character_rank_in_language)
< target_language_characters_count / 3
):
character_approved_count += 1
continue
characters_before_source: list[str] = FREQUENCIES[language][
0:character_rank_in_language
]
characters_after_source: list[str] = FREQUENCIES[language][
character_rank_in_language:
]
characters_before: list[str] = ordered_characters[0:character_rank]
characters_after: list[str] = ordered_characters[character_rank:]
before_match_count: int = len(
set(characters_before) & set(characters_before_source)
)
after_match_count: int = len(
set(characters_after) & set(characters_after_source)
)
if len(characters_before_source) == 0 and before_match_count <= 4:
character_approved_count += 1
continue
if len(characters_after_source) == 0 and after_match_count <= 4:
character_approved_count += 1
continue
if (
before_match_count / len(characters_before_source) >= 0.4
or after_match_count / len(characters_after_source) >= 0.4
):
character_approved_count += 1
continue
return character_approved_count / len(ordered_characters)
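# Illustrative check (derivable from the logic above): comparing a language's own frequency
# table against itself yields a perfect fit, since every character rank projects exactly
# onto its rank in FREQUENCIES.
#
#     >>> characters_popularity_compare("English", FREQUENCIES["English"])
#     1.0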
def alpha_unicode_split(decoded_sequence: str) -> list[str]:
"""
Given a decoded text sequence, return a list of str: one entry per Unicode range / alphabet.
E.g. a text containing English/Latin with a bit of Hebrew will return two items in the resulting list;
one containing the Latin letters and the other the Hebrew ones.
"""
layers: dict[str, str] = {}
for character in decoded_sequence:
if character.isalpha() is False:
continue
character_range: str | None = unicode_range(character)
if character_range is None:
continue
layer_target_range: str | None = None
for discovered_range in layers:
if (
is_suspiciously_successive_range(discovered_range, character_range)
is False
):
layer_target_range = discovered_range
break
if layer_target_range is None:
layer_target_range = character_range
if layer_target_range not in layers:
layers[layer_target_range] = character.lower()
continue
layers[layer_target_range] += character.lower()
return list(layers.values())
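# Illustrative usage (hedged sketch): Latin and Hebrew letters belong to ranges considered
# suspicious when adjacent, so they end up in separate layers; non-alphabetic characters
# (spaces, digits, punctuation) are dropped and letters are lowercased.
#
#     >>> alpha_unicode_split("Hello שלום")
#     ['hello', 'שלום']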
def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
"""
This function merges results previously produced by the function coherence_ratio.
The return type is the same as that of coherence_ratio.
"""
per_language_ratios: dict[str, list[float]] = {}
for result in results:
for sub_result in result:
language, ratio = sub_result
if language not in per_language_ratios:
per_language_ratios[language] = [ratio]
continue
per_language_ratios[language].append(ratio)
merge = [
(
language,
round(
sum(per_language_ratios[language]) / len(per_language_ratios[language]),
4,
),
)
for language in per_language_ratios
]
return sorted(merge, key=lambda x: x[1], reverse=True)
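# Illustrative usage (derivable from the averaging above): ratios reported for the same
# language across layers are averaged, then sorted best-first.
#
#     >>> merge_coherence_ratios([[("English", 0.9)], [("English", 0.7), ("French", 0.5)]])
#     [('English', 0.8), ('French', 0.5)]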
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
"""
We shall NOT return "English—" in CoherenceMatches because it is an alternative
of "English". This function only keeps the best match and removes the em-dash from it.
"""
index_results: dict[str, list[float]] = dict()
for result in results:
language, ratio = result
no_em_name: str = language.replace("—", "")
if no_em_name not in index_results:
index_results[no_em_name] = []
index_results[no_em_name].append(ratio)
if any(len(index_results[e]) > 1 for e in index_results):
filtered_results: CoherenceMatches = []
for language in index_results:
filtered_results.append((language, max(index_results[language])))
return filtered_results
return results
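# Illustrative usage (derivable from the logic above): alternative labels collapse into
# their base label, keeping only the best ratio.
#
#     >>> filter_alt_coherence_matches([("English", 0.75), ("English—", 0.9)])
#     [('English', 0.9)]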
@lru_cache(maxsize=2048)
def coherence_ratio(
decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
) -> CoherenceMatches:
"""
Detect ANY language that can be identified in the given sequence. The sequence will be analysed by layers.
A layer = Character extraction by alphabets/ranges.
"""
results: list[tuple[str, float]] = []
ignore_non_latin: bool = False
sufficient_match_count: int = 0
lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
if "Latin Based" in lg_inclusion_list:
ignore_non_latin = True
lg_inclusion_list.remove("Latin Based")
for layer in alpha_unicode_split(decoded_sequence):
sequence_frequencies: TypeCounter[str] = Counter(layer)
most_common = sequence_frequencies.most_common()
character_count: int = sum(o for c, o in most_common)
if character_count <= TOO_SMALL_SEQUENCE:
continue
popular_character_ordered: list[str] = [c for c, o in most_common]
for language in lg_inclusion_list or alphabet_languages(
popular_character_ordered, ignore_non_latin
):
ratio: float = characters_popularity_compare(
language, popular_character_ordered
)
if ratio < threshold:
continue
elif ratio >= 0.8:
sufficient_match_count += 1
results.append((language, round(ratio, 4)))
if sufficient_match_count >= 3:
break
return sorted(
filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
)
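# Illustrative usage (a hedged sketch; actual ratios depend on FREQUENCIES and on the text
# length, and a layer with fewer than TOO_SMALL_SEQUENCE alphabetical characters yields no
# result at all):
#
#     >>> coherence_ratio("Bonjour tout le monde, comment allez-vous en ce beau matin ?")
#     [('French', ...), ...]   # French is expected near the top for a long enough French text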
python-charset-normalizer-3.4.2/src/charset_normalizer/cli/ 0000775 0000000 0000000 00000000000 15005045421 0024075 5 ustar 00root root 0000000 0000000 python-charset-normalizer-3.4.2/src/charset_normalizer/cli/__init__.py 0000664 0000000 0000000 00000000210 15005045421 0026177 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from .__main__ import cli_detect, query_yes_no
__all__ = (
"cli_detect",
"query_yes_no",
)
python-charset-normalizer-3.4.2/src/charset_normalizer/cli/__main__.py 0000664 0000000 0000000 00000030546 15005045421 0026177 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import argparse
import sys
import typing
from json import dumps
from os.path import abspath, basename, dirname, join, realpath
from platform import python_version
from unicodedata import unidata_version
import charset_normalizer.md as md_module
from charset_normalizer import from_fp
from charset_normalizer.models import CliDetectionResult
from charset_normalizer.version import __version__
def query_yes_no(question: str, default: str = "yes") -> bool:
"""Ask a yes/no question via input() and return their answer.
"question" is a string that is presented to the user.
"default" is the presumed answer if the user just hits Enter.
It must be "yes" (the default), "no" or None (meaning
an answer is required of the user).
The "answer" return value is True for "yes" or False for "no".
Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
"""
valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
if default is None:
prompt = " [y/n] "
elif default == "yes":
prompt = " [Y/n] "
elif default == "no":
prompt = " [y/N] "
else:
raise ValueError("invalid default answer: '%s'" % default)
while True:
sys.stdout.write(question + prompt)
choice = input().lower()
if default is not None and choice == "":
return valid[default]
elif choice in valid:
return valid[choice]
else:
sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
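# Illustrative interaction (hedged sketch; requires an interactive stdin):
#
#     >>> query_yes_no("Overwrite the existing file?", default="no")
#     Overwrite the existing file? [y/N]     <- user types "y"
#     True
#
# An empty answer returns the default; anything unrecognized triggers a new prompt.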
class FileType:
"""Factory for creating file object types
Instances of FileType are typically passed as type= arguments to the
ArgumentParser add_argument() method.
Keyword Arguments:
- mode -- A string indicating how the file is to be opened. Accepts the
same values as the builtin open() function.
- bufsize -- The file's desired buffer size. Accepts the same values as
the builtin open() function.
- encoding -- The file's encoding. Accepts the same values as the
builtin open() function.
- errors -- A string indicating how encoding and decoding errors are to
be handled. Accepts the same value as the builtin open() function.
Backported from CPython 3.12
"""
def __init__(
self,
mode: str = "r",
bufsize: int = -1,
encoding: str | None = None,
errors: str | None = None,
):
self._mode = mode
self._bufsize = bufsize
self._encoding = encoding
self._errors = errors
def __call__(self, string: str) -> typing.IO: # type: ignore[type-arg]
# the special argument "-" means sys.std{in,out}
if string == "-":
if "r" in self._mode:
return sys.stdin.buffer if "b" in self._mode else sys.stdin
elif any(c in self._mode for c in "wax"):
return sys.stdout.buffer if "b" in self._mode else sys.stdout
else:
msg = f'argument "-" with mode {self._mode}'
raise ValueError(msg)
# all other arguments are used as file names
try:
return open(string, self._mode, self._bufsize, self._encoding, self._errors)
except OSError as e:
message = f"can't open '{string}': {e}"
raise argparse.ArgumentTypeError(message)
def __repr__(self) -> str:
args = self._mode, self._bufsize
kwargs = [("encoding", self._encoding), ("errors", self._errors)]
args_str = ", ".join(
[repr(arg) for arg in args if arg != -1]
+ [f"{kw}={arg!r}" for kw, arg in kwargs if arg is not None]
)
return f"{type(self).__name__}({args_str})"
def cli_detect(argv: list[str] | None = None) -> int:
"""
CLI assistant using ARGV and ArgumentParser
:param argv:
:return: 0 if everything is fine, anything else means trouble
"""
parser = argparse.ArgumentParser(
description="The Real First Universal Charset Detector. "
"Discover originating encoding used on text file. "
"Normalize text to unicode."
)
parser.add_argument(
"files", type=FileType("rb"), nargs="+", help="File(s) to be analysed"
)
parser.add_argument(
"-v",
"--verbose",
action="store_true",
default=False,
dest="verbose",
help="Display complementary information about file if any. "
"Stdout will contain logs about the detection process.",
)
parser.add_argument(
"-a",
"--with-alternative",
action="store_true",
default=False,
dest="alternatives",
help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
)
parser.add_argument(
"-n",
"--normalize",
action="store_true",
default=False,
dest="normalize",
help="Permit normalizing the input file. If not set, the program does not write anything.",
)
parser.add_argument(
"-m",
"--minimal",
action="store_true",
default=False,
dest="minimal",
help="Only output the charset detected to STDOUT. Disabling JSON output.",
)
parser.add_argument(
"-r",
"--replace",
action="store_true",
default=False,
dest="replace",
help="Replace file when trying to normalize it instead of creating a new one.",
)
parser.add_argument(
"-f",
"--force",
action="store_true",
default=False,
dest="force",
help="Replace file without asking if you are sure, use this flag with caution.",
)
parser.add_argument(
"-i",
"--no-preemptive",
action="store_true",
default=False,
dest="no_preemptive",
help="Disable looking at a charset declaration to hint the detector.",
)
parser.add_argument(
"-t",
"--threshold",
action="store",
default=0.2,
type=float,
dest="threshold",
help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
)
parser.add_argument(
"--version",
action="version",
version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
__version__,
python_version(),
unidata_version,
"OFF" if md_module.__file__.lower().endswith(".py") else "ON",
),
help="Show version information and exit.",
)
args = parser.parse_args(argv)
if args.replace is True and args.normalize is False:
if args.files:
for my_file in args.files:
my_file.close()
print("Use --replace only in conjunction with --normalize.", file=sys.stderr)
return 1
if args.force is True and args.replace is False:
if args.files:
for my_file in args.files:
my_file.close()
print("Use --force only in conjunction with --replace.", file=sys.stderr)
return 1
if args.threshold < 0.0 or args.threshold > 1.0:
if args.files:
for my_file in args.files:
my_file.close()
print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
return 1
x_ = []
for my_file in args.files:
matches = from_fp(
my_file,
threshold=args.threshold,
explain=args.verbose,
preemptive_behaviour=args.no_preemptive is False,
)
best_guess = matches.best()
if best_guess is None:
print(
'Unable to identify originating encoding for "{}". {}'.format(
my_file.name,
(
"Maybe try increasing maximum amount of chaos."
if args.threshold < 1.0
else ""
),
),
file=sys.stderr,
)
x_.append(
CliDetectionResult(
abspath(my_file.name),
None,
[],
[],
"Unknown",
[],
False,
1.0,
0.0,
None,
True,
)
)
else:
x_.append(
CliDetectionResult(
abspath(my_file.name),
best_guess.encoding,
best_guess.encoding_aliases,
[
cp
for cp in best_guess.could_be_from_charset
if cp != best_guess.encoding
],
best_guess.language,
best_guess.alphabets,
best_guess.bom,
best_guess.percent_chaos,
best_guess.percent_coherence,
None,
True,
)
)
if len(matches) > 1 and args.alternatives:
for el in matches:
if el != best_guess:
x_.append(
CliDetectionResult(
abspath(my_file.name),
el.encoding,
el.encoding_aliases,
[
cp
for cp in el.could_be_from_charset
if cp != el.encoding
],
el.language,
el.alphabets,
el.bom,
el.percent_chaos,
el.percent_coherence,
None,
False,
)
)
if args.normalize is True:
if best_guess.encoding.startswith("utf") is True:
print(
'"{}" file does not need to be normalized, as it already came from unicode.'.format(
my_file.name
),
file=sys.stderr,
)
if my_file.closed is False:
my_file.close()
continue
dir_path = dirname(realpath(my_file.name))
file_name = basename(realpath(my_file.name))
o_: list[str] = file_name.split(".")
if args.replace is False:
o_.insert(-1, best_guess.encoding)
if my_file.closed is False:
my_file.close()
elif (
args.force is False
and query_yes_no(
'Are you sure you want to normalize "{}" by replacing it?'.format(
my_file.name
),
"no",
)
is False
):
if my_file.closed is False:
my_file.close()
continue
try:
x_[0].unicode_path = join(dir_path, ".".join(o_))
with open(x_[0].unicode_path, "wb") as fp:
fp.write(best_guess.output())
except OSError as e:
print(str(e), file=sys.stderr)
if my_file.closed is False:
my_file.close()
return 2
if my_file.closed is False:
my_file.close()
if args.minimal is False:
print(
dumps(
[el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
ensure_ascii=True,
indent=4,
)
)
else:
for my_file in args.files:
print(
", ".join(
[
el.encoding or "undefined"
for el in x_
if el.path == abspath(my_file.name)
]
)
)
return 0
if __name__ == "__main__":
cli_detect()
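# Illustrative invocations (hedged sketch). "normalizer" is the console script usually
# installed alongside the package, and "python -m charset_normalizer" goes through the
# package __main__ module; both end up calling cli_detect(). The file path is hypothetical.
#
#     $ normalizer ./my_subset.txt                     # JSON report on stdout
#     $ normalizer -m ./my_subset.txt                  # only the detected charset
#     $ python -m charset_normalizer --normalize ./my_subset.txt
#
# Programmatically, cli_detect(["./my_subset.txt", "--minimal"]) returns 0 on success.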
python-charset-normalizer-3.4.2/src/charset_normalizer/constant.py 0000664 0000000 0000000 00000123331 15005045421 0025534 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
from encodings.aliases import aliases
from re import IGNORECASE
from re import compile as re_compile
# Contains, for each eligible encoding, a single BOM/SIG byte sequence or a list of them
ENCODING_MARKS: dict[str, bytes | list[bytes]] = {
"utf_8": BOM_UTF8,
"utf_7": [
b"\x2b\x2f\x76\x38",
b"\x2b\x2f\x76\x39",
b"\x2b\x2f\x76\x2b",
b"\x2b\x2f\x76\x2f",
b"\x2b\x2f\x76\x38\x2d",
],
"gb18030": b"\x84\x31\x95\x33",
"utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
"utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
}
TOO_SMALL_SEQUENCE: int = 32
TOO_BIG_SEQUENCE: int = int(10e6)
UTF8_MAXIMAL_ALLOCATION: int = 1_112_064
# Up-to-date Unicode ucd/15.0.0
UNICODE_RANGES_COMBINED: dict[str, range] = {
"Control character": range(32),
"Basic Latin": range(32, 128),
"Latin-1 Supplement": range(128, 256),
"Latin Extended-A": range(256, 384),
"Latin Extended-B": range(384, 592),
"IPA Extensions": range(592, 688),
"Spacing Modifier Letters": range(688, 768),
"Combining Diacritical Marks": range(768, 880),
"Greek and Coptic": range(880, 1024),
"Cyrillic": range(1024, 1280),
"Cyrillic Supplement": range(1280, 1328),
"Armenian": range(1328, 1424),
"Hebrew": range(1424, 1536),
"Arabic": range(1536, 1792),
"Syriac": range(1792, 1872),
"Arabic Supplement": range(1872, 1920),
"Thaana": range(1920, 1984),
"NKo": range(1984, 2048),
"Samaritan": range(2048, 2112),
"Mandaic": range(2112, 2144),
"Syriac Supplement": range(2144, 2160),
"Arabic Extended-B": range(2160, 2208),
"Arabic Extended-A": range(2208, 2304),
"Devanagari": range(2304, 2432),
"Bengali": range(2432, 2560),
"Gurmukhi": range(2560, 2688),
"Gujarati": range(2688, 2816),
"Oriya": range(2816, 2944),
"Tamil": range(2944, 3072),
"Telugu": range(3072, 3200),
"Kannada": range(3200, 3328),
"Malayalam": range(3328, 3456),
"Sinhala": range(3456, 3584),
"Thai": range(3584, 3712),
"Lao": range(3712, 3840),
"Tibetan": range(3840, 4096),
"Myanmar": range(4096, 4256),
"Georgian": range(4256, 4352),
"Hangul Jamo": range(4352, 4608),
"Ethiopic": range(4608, 4992),
"Ethiopic Supplement": range(4992, 5024),
"Cherokee": range(5024, 5120),
"Unified Canadian Aboriginal Syllabics": range(5120, 5760),
"Ogham": range(5760, 5792),
"Runic": range(5792, 5888),
"Tagalog": range(5888, 5920),
"Hanunoo": range(5920, 5952),
"Buhid": range(5952, 5984),
"Tagbanwa": range(5984, 6016),
"Khmer": range(6016, 6144),
"Mongolian": range(6144, 6320),
"Unified Canadian Aboriginal Syllabics Extended": range(6320, 6400),
"Limbu": range(6400, 6480),
"Tai Le": range(6480, 6528),
"New Tai Lue": range(6528, 6624),
"Khmer Symbols": range(6624, 6656),
"Buginese": range(6656, 6688),
"Tai Tham": range(6688, 6832),
"Combining Diacritical Marks Extended": range(6832, 6912),
"Balinese": range(6912, 7040),
"Sundanese": range(7040, 7104),
"Batak": range(7104, 7168),
"Lepcha": range(7168, 7248),
"Ol Chiki": range(7248, 7296),
"Cyrillic Extended-C": range(7296, 7312),
"Georgian Extended": range(7312, 7360),
"Sundanese Supplement": range(7360, 7376),
"Vedic Extensions": range(7376, 7424),
"Phonetic Extensions": range(7424, 7552),
"Phonetic Extensions Supplement": range(7552, 7616),
"Combining Diacritical Marks Supplement": range(7616, 7680),
"Latin Extended Additional": range(7680, 7936),
"Greek Extended": range(7936, 8192),
"General Punctuation": range(8192, 8304),
"Superscripts and Subscripts": range(8304, 8352),
"Currency Symbols": range(8352, 8400),
"Combining Diacritical Marks for Symbols": range(8400, 8448),
"Letterlike Symbols": range(8448, 8528),
"Number Forms": range(8528, 8592),
"Arrows": range(8592, 8704),
"Mathematical Operators": range(8704, 8960),
"Miscellaneous Technical": range(8960, 9216),
"Control Pictures": range(9216, 9280),
"Optical Character Recognition": range(9280, 9312),
"Enclosed Alphanumerics": range(9312, 9472),
"Box Drawing": range(9472, 9600),
"Block Elements": range(9600, 9632),
"Geometric Shapes": range(9632, 9728),
"Miscellaneous Symbols": range(9728, 9984),
"Dingbats": range(9984, 10176),
"Miscellaneous Mathematical Symbols-A": range(10176, 10224),
"Supplemental Arrows-A": range(10224, 10240),
"Braille Patterns": range(10240, 10496),
"Supplemental Arrows-B": range(10496, 10624),
"Miscellaneous Mathematical Symbols-B": range(10624, 10752),
"Supplemental Mathematical Operators": range(10752, 11008),
"Miscellaneous Symbols and Arrows": range(11008, 11264),
"Glagolitic": range(11264, 11360),
"Latin Extended-C": range(11360, 11392),
"Coptic": range(11392, 11520),
"Georgian Supplement": range(11520, 11568),
"Tifinagh": range(11568, 11648),
"Ethiopic Extended": range(11648, 11744),
"Cyrillic Extended-A": range(11744, 11776),
"Supplemental Punctuation": range(11776, 11904),
"CJK Radicals Supplement": range(11904, 12032),
"Kangxi Radicals": range(12032, 12256),
"Ideographic Description Characters": range(12272, 12288),
"CJK Symbols and Punctuation": range(12288, 12352),
"Hiragana": range(12352, 12448),
"Katakana": range(12448, 12544),
"Bopomofo": range(12544, 12592),
"Hangul Compatibility Jamo": range(12592, 12688),
"Kanbun": range(12688, 12704),
"Bopomofo Extended": range(12704, 12736),
"CJK Strokes": range(12736, 12784),
"Katakana Phonetic Extensions": range(12784, 12800),
"Enclosed CJK Letters and Months": range(12800, 13056),
"CJK Compatibility": range(13056, 13312),
"CJK Unified Ideographs Extension A": range(13312, 19904),
"Yijing Hexagram Symbols": range(19904, 19968),
"CJK Unified Ideographs": range(19968, 40960),
"Yi Syllables": range(40960, 42128),
"Yi Radicals": range(42128, 42192),
"Lisu": range(42192, 42240),
"Vai": range(42240, 42560),
"Cyrillic Extended-B": range(42560, 42656),
"Bamum": range(42656, 42752),
"Modifier Tone Letters": range(42752, 42784),
"Latin Extended-D": range(42784, 43008),
"Syloti Nagri": range(43008, 43056),
"Common Indic Number Forms": range(43056, 43072),
"Phags-pa": range(43072, 43136),
"Saurashtra": range(43136, 43232),
"Devanagari Extended": range(43232, 43264),
"Kayah Li": range(43264, 43312),
"Rejang": range(43312, 43360),
"Hangul Jamo Extended-A": range(43360, 43392),
"Javanese": range(43392, 43488),
"Myanmar Extended-B": range(43488, 43520),
"Cham": range(43520, 43616),
"Myanmar Extended-A": range(43616, 43648),
"Tai Viet": range(43648, 43744),
"Meetei Mayek Extensions": range(43744, 43776),
"Ethiopic Extended-A": range(43776, 43824),
"Latin Extended-E": range(43824, 43888),
"Cherokee Supplement": range(43888, 43968),
"Meetei Mayek": range(43968, 44032),
"Hangul Syllables": range(44032, 55216),
"Hangul Jamo Extended-B": range(55216, 55296),
"High Surrogates": range(55296, 56192),
"High Private Use Surrogates": range(56192, 56320),
"Low Surrogates": range(56320, 57344),
"Private Use Area": range(57344, 63744),
"CJK Compatibility Ideographs": range(63744, 64256),
"Alphabetic Presentation Forms": range(64256, 64336),
"Arabic Presentation Forms-A": range(64336, 65024),
"Variation Selectors": range(65024, 65040),
"Vertical Forms": range(65040, 65056),
"Combining Half Marks": range(65056, 65072),
"CJK Compatibility Forms": range(65072, 65104),
"Small Form Variants": range(65104, 65136),
"Arabic Presentation Forms-B": range(65136, 65280),
"Halfwidth and Fullwidth Forms": range(65280, 65520),
"Specials": range(65520, 65536),
"Linear B Syllabary": range(65536, 65664),
"Linear B Ideograms": range(65664, 65792),
"Aegean Numbers": range(65792, 65856),
"Ancient Greek Numbers": range(65856, 65936),
"Ancient Symbols": range(65936, 66000),
"Phaistos Disc": range(66000, 66048),
"Lycian": range(66176, 66208),
"Carian": range(66208, 66272),
"Coptic Epact Numbers": range(66272, 66304),
"Old Italic": range(66304, 66352),
"Gothic": range(66352, 66384),
"Old Permic": range(66384, 66432),
"Ugaritic": range(66432, 66464),
"Old Persian": range(66464, 66528),
"Deseret": range(66560, 66640),
"Shavian": range(66640, 66688),
"Osmanya": range(66688, 66736),
"Osage": range(66736, 66816),
"Elbasan": range(66816, 66864),
"Caucasian Albanian": range(66864, 66928),
"Vithkuqi": range(66928, 67008),
"Linear A": range(67072, 67456),
"Latin Extended-F": range(67456, 67520),
"Cypriot Syllabary": range(67584, 67648),
"Imperial Aramaic": range(67648, 67680),
"Palmyrene": range(67680, 67712),
"Nabataean": range(67712, 67760),
"Hatran": range(67808, 67840),
"Phoenician": range(67840, 67872),
"Lydian": range(67872, 67904),
"Meroitic Hieroglyphs": range(67968, 68000),
"Meroitic Cursive": range(68000, 68096),
"Kharoshthi": range(68096, 68192),
"Old South Arabian": range(68192, 68224),
"Old North Arabian": range(68224, 68256),
"Manichaean": range(68288, 68352),
"Avestan": range(68352, 68416),
"Inscriptional Parthian": range(68416, 68448),
"Inscriptional Pahlavi": range(68448, 68480),
"Psalter Pahlavi": range(68480, 68528),
"Old Turkic": range(68608, 68688),
"Old Hungarian": range(68736, 68864),
"Hanifi Rohingya": range(68864, 68928),
"Rumi Numeral Symbols": range(69216, 69248),
"Yezidi": range(69248, 69312),
"Arabic Extended-C": range(69312, 69376),
"Old Sogdian": range(69376, 69424),
"Sogdian": range(69424, 69488),
"Old Uyghur": range(69488, 69552),
"Chorasmian": range(69552, 69600),
"Elymaic": range(69600, 69632),
"Brahmi": range(69632, 69760),
"Kaithi": range(69760, 69840),
"Sora Sompeng": range(69840, 69888),
"Chakma": range(69888, 69968),
"Mahajani": range(69968, 70016),
"Sharada": range(70016, 70112),
"Sinhala Archaic Numbers": range(70112, 70144),
"Khojki": range(70144, 70224),
"Multani": range(70272, 70320),
"Khudawadi": range(70320, 70400),
"Grantha": range(70400, 70528),
"Newa": range(70656, 70784),
"Tirhuta": range(70784, 70880),
"Siddham": range(71040, 71168),
"Modi": range(71168, 71264),
"Mongolian Supplement": range(71264, 71296),
"Takri": range(71296, 71376),
"Ahom": range(71424, 71504),
"Dogra": range(71680, 71760),
"Warang Citi": range(71840, 71936),
"Dives Akuru": range(71936, 72032),
"Nandinagari": range(72096, 72192),
"Zanabazar Square": range(72192, 72272),
"Soyombo": range(72272, 72368),
"Unified Canadian Aboriginal Syllabics Extended-A": range(72368, 72384),
"Pau Cin Hau": range(72384, 72448),
"Devanagari Extended-A": range(72448, 72544),
"Bhaiksuki": range(72704, 72816),
"Marchen": range(72816, 72896),
"Masaram Gondi": range(72960, 73056),
"Gunjala Gondi": range(73056, 73136),
"Makasar": range(73440, 73472),
"Kawi": range(73472, 73568),
"Lisu Supplement": range(73648, 73664),
"Tamil Supplement": range(73664, 73728),
"Cuneiform": range(73728, 74752),
"Cuneiform Numbers and Punctuation": range(74752, 74880),
"Early Dynastic Cuneiform": range(74880, 75088),
"Cypro-Minoan": range(77712, 77824),
"Egyptian Hieroglyphs": range(77824, 78896),
"Egyptian Hieroglyph Format Controls": range(78896, 78944),
"Anatolian Hieroglyphs": range(82944, 83584),
"Bamum Supplement": range(92160, 92736),
"Mro": range(92736, 92784),
"Tangsa": range(92784, 92880),
"Bassa Vah": range(92880, 92928),
"Pahawh Hmong": range(92928, 93072),
"Medefaidrin": range(93760, 93856),
"Miao": range(93952, 94112),
"Ideographic Symbols and Punctuation": range(94176, 94208),
"Tangut": range(94208, 100352),
"Tangut Components": range(100352, 101120),
"Khitan Small Script": range(101120, 101632),
"Tangut Supplement": range(101632, 101760),
"Kana Extended-B": range(110576, 110592),
"Kana Supplement": range(110592, 110848),
"Kana Extended-A": range(110848, 110896),
"Small Kana Extension": range(110896, 110960),
"Nushu": range(110960, 111360),
"Duployan": range(113664, 113824),
"Shorthand Format Controls": range(113824, 113840),
"Znamenny Musical Notation": range(118528, 118736),
"Byzantine Musical Symbols": range(118784, 119040),
"Musical Symbols": range(119040, 119296),
"Ancient Greek Musical Notation": range(119296, 119376),
"Kaktovik Numerals": range(119488, 119520),
"Mayan Numerals": range(119520, 119552),
"Tai Xuan Jing Symbols": range(119552, 119648),
"Counting Rod Numerals": range(119648, 119680),
"Mathematical Alphanumeric Symbols": range(119808, 120832),
"Sutton SignWriting": range(120832, 121520),
"Latin Extended-G": range(122624, 122880),
"Glagolitic Supplement": range(122880, 122928),
"Cyrillic Extended-D": range(122928, 123024),
"Nyiakeng Puachue Hmong": range(123136, 123216),
"Toto": range(123536, 123584),
"Wancho": range(123584, 123648),
"Nag Mundari": range(124112, 124160),
"Ethiopic Extended-B": range(124896, 124928),
"Mende Kikakui": range(124928, 125152),
"Adlam": range(125184, 125280),
"Indic Siyaq Numbers": range(126064, 126144),
"Ottoman Siyaq Numbers": range(126208, 126288),
"Arabic Mathematical Alphabetic Symbols": range(126464, 126720),
"Mahjong Tiles": range(126976, 127024),
"Domino Tiles": range(127024, 127136),
"Playing Cards": range(127136, 127232),
"Enclosed Alphanumeric Supplement": range(127232, 127488),
"Enclosed Ideographic Supplement": range(127488, 127744),
"Miscellaneous Symbols and Pictographs": range(127744, 128512),
"Emoticons range(Emoji)": range(128512, 128592),
"Ornamental Dingbats": range(128592, 128640),
"Transport and Map Symbols": range(128640, 128768),
"Alchemical Symbols": range(128768, 128896),
"Geometric Shapes Extended": range(128896, 129024),
"Supplemental Arrows-C": range(129024, 129280),
"Supplemental Symbols and Pictographs": range(129280, 129536),
"Chess Symbols": range(129536, 129648),
"Symbols and Pictographs Extended-A": range(129648, 129792),
"Symbols for Legacy Computing": range(129792, 130048),
"CJK Unified Ideographs Extension B": range(131072, 173792),
"CJK Unified Ideographs Extension C": range(173824, 177984),
"CJK Unified Ideographs Extension D": range(177984, 178208),
"CJK Unified Ideographs Extension E": range(178208, 183984),
"CJK Unified Ideographs Extension F": range(183984, 191472),
"CJK Compatibility Ideographs Supplement": range(194560, 195104),
"CJK Unified Ideographs Extension G": range(196608, 201552),
"CJK Unified Ideographs Extension H": range(201552, 205744),
"Tags": range(917504, 917632),
"Variation Selectors Supplement": range(917760, 918000),
"Supplementary Private Use Area-A": range(983040, 1048576),
"Supplementary Private Use Area-B": range(1048576, 1114112),
}
UNICODE_SECONDARY_RANGE_KEYWORD: list[str] = [
"Supplement",
"Extended",
"Extensions",
"Modifier",
"Marks",
"Punctuation",
"Symbols",
"Forms",
"Operators",
"Miscellaneous",
"Drawing",
"Block",
"Shapes",
"Supplemental",
"Tags",
]
RE_POSSIBLE_ENCODING_INDICATION = re_compile(
r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
IGNORECASE,
)
IANA_NO_ALIASES = [
"cp720",
"cp737",
"cp856",
"cp874",
"cp875",
"cp1006",
"koi8_r",
"koi8_t",
"koi8_u",
]
IANA_SUPPORTED: list[str] = sorted(
filter(
lambda x: x.endswith("_codec") is False
and x not in {"rot_13", "tactis", "mbcs"},
list(set(aliases.values())) + IANA_NO_ALIASES,
)
)
IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
# Pre-computed code pages that are similar, as determined by the function cp_similarity.
IANA_SUPPORTED_SIMILAR: dict[str, list[str]] = {
"cp037": ["cp1026", "cp1140", "cp273", "cp500"],
"cp1026": ["cp037", "cp1140", "cp273", "cp500"],
"cp1125": ["cp866"],
"cp1140": ["cp037", "cp1026", "cp273", "cp500"],
"cp1250": ["iso8859_2"],
"cp1251": ["kz1048", "ptcp154"],
"cp1252": ["iso8859_15", "iso8859_9", "latin_1"],
"cp1253": ["iso8859_7"],
"cp1254": ["iso8859_15", "iso8859_9", "latin_1"],
"cp1257": ["iso8859_13"],
"cp273": ["cp037", "cp1026", "cp1140", "cp500"],
"cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
"cp500": ["cp037", "cp1026", "cp1140", "cp273"],
"cp850": ["cp437", "cp857", "cp858", "cp865"],
"cp857": ["cp850", "cp858", "cp865"],
"cp858": ["cp437", "cp850", "cp857", "cp865"],
"cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"],
"cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"],
"cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"],
"cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"],
"cp865": ["cp437", "cp850", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863"],
"cp866": ["cp1125"],
"iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"],
"iso8859_11": ["tis_620"],
"iso8859_13": ["cp1257"],
"iso8859_14": [
"iso8859_10",
"iso8859_15",
"iso8859_16",
"iso8859_3",
"iso8859_9",
"latin_1",
],
"iso8859_15": [
"cp1252",
"cp1254",
"iso8859_10",
"iso8859_14",
"iso8859_16",
"iso8859_3",
"iso8859_9",
"latin_1",
],
"iso8859_16": [
"iso8859_14",
"iso8859_15",
"iso8859_2",
"iso8859_3",
"iso8859_9",
"latin_1",
],
"iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"],
"iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"],
"iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"],
"iso8859_7": ["cp1253"],
"iso8859_9": [
"cp1252",
"cp1254",
"cp1258",
"iso8859_10",
"iso8859_14",
"iso8859_15",
"iso8859_16",
"iso8859_3",
"iso8859_4",
"latin_1",
],
"kz1048": ["cp1251", "ptcp154"],
"latin_1": [
"cp1252",
"cp1254",
"cp1258",
"iso8859_10",
"iso8859_14",
"iso8859_15",
"iso8859_16",
"iso8859_3",
"iso8859_4",
"iso8859_9",
],
"mac_iceland": ["mac_roman", "mac_turkish"],
"mac_roman": ["mac_iceland", "mac_turkish"],
"mac_turkish": ["mac_iceland", "mac_roman"],
"ptcp154": ["cp1251", "kz1048"],
"tis_620": ["iso8859_11"],
}
CHARDET_CORRESPONDENCE: dict[str, str] = {
"iso2022_kr": "ISO-2022-KR",
"iso2022_jp": "ISO-2022-JP",
"euc_kr": "EUC-KR",
"tis_620": "TIS-620",
"utf_32": "UTF-32",
"euc_jp": "EUC-JP",
"koi8_r": "KOI8-R",
"iso8859_1": "ISO-8859-1",
"iso8859_2": "ISO-8859-2",
"iso8859_5": "ISO-8859-5",
"iso8859_6": "ISO-8859-6",
"iso8859_7": "ISO-8859-7",
"iso8859_8": "ISO-8859-8",
"utf_16": "UTF-16",
"cp855": "IBM855",
"mac_cyrillic": "MacCyrillic",
"gb2312": "GB2312",
"gb18030": "GB18030",
"cp932": "CP932",
"cp866": "IBM866",
"utf_8": "utf-8",
"utf_8_sig": "UTF-8-SIG",
"shift_jis": "SHIFT_JIS",
"big5": "Big5",
"cp1250": "windows-1250",
"cp1251": "windows-1251",
"cp1252": "Windows-1252",
"cp1253": "windows-1253",
"cp1255": "windows-1255",
"cp1256": "windows-1256",
"cp1254": "Windows-1254",
"cp949": "CP949",
}
COMMON_SAFE_ASCII_CHARACTERS: set[str] = {
"<",
">",
"=",
":",
"/",
"&",
";",
"{",
"}",
"[",
"]",
",",
"|",
'"',
"-",
"(",
")",
}
# Sample character sets — replace with full lists if needed
COMMON_CHINESE_CHARACTERS = "的一是在不了有和人这中大为上个国我以要他时来用们生到作地于出就分对成会可主发年动同工也能下过子说产种面而方后多定行学法所民得经十三之进着等部度家电力里如水化高自二理起小物现实加量都两体制机当使点从业本去把性好应开它合还因由其些然前外天政四日那社义事平形相全表间样与关各重新线内数正心反你明看原又么利比或但质气第向道命此变条只没结解问意建月公无系军很情者最立代想已通并提直题党程展五果料象员革位入常文总次品式活设及管特件长求老头基资边流路级少图山统接知较将组见计别她手角期根论运农指几九区强放决西被干做必战先回则任取据处队南给色光门即保治北造百规热领七海口东导器压志世金增争济阶油思术极交受联什认六共权收证改清己美再采转更单风切打白教速花带安场身车例真务具万每目至达走积示议声报斗完类八离华名确才科张信马节话米整空元况今集温传土许步群广石记需段研界拉林律叫且究观越织装影算低持音众书布复容儿须际商非验连断深难近矿千周委素技备半办青省列习响约支般史感劳便团往酸历市克何除消构府太准精值号率族维划选标写存候毛亲快效斯院查江型眼王按格养易置派层片始却专状育厂京识适属圆包火住调满县局照参红细引听该铁价严龙飞"
COMMON_JAPANESE_CHARACTERS = "日一国年大十二本中長出三時行見月分後前生五間上東四今金九入学高円子外八六下来気小七山話女北午百書先名川千水半男西電校語土木聞食車何南万毎白天母火右読友左休父雨"
COMMON_KOREAN_CHARACTERS = "一二三四五六七八九十百千萬上下左右中人女子大小山川日月火水木金土父母天地國名年時文校學生"
# Combine all into a set
COMMON_CJK_CHARACTERS = set(
"".join(
[
COMMON_CHINESE_CHARACTERS,
COMMON_JAPANESE_CHARACTERS,
COMMON_KOREAN_CHARACTERS,
]
)
)
KO_NAMES: set[str] = {"johab", "cp949", "euc_kr"}
ZH_NAMES: set[str] = {"big5", "cp950", "big5hkscs", "hz"}
# Logging LEVEL below DEBUG
TRACE: int = 5
# Language labels that contain the em dash "—" character
# are to be considered alternative sequences of the original language
FREQUENCIES: dict[str, list[str]] = {
"English": [
"e",
"a",
"t",
"i",
"o",
"n",
"s",
"r",
"h",
"l",
"d",
"c",
"u",
"m",
"f",
"p",
"g",
"w",
"y",
"b",
"v",
"k",
"x",
"j",
"z",
"q",
],
"English—": [
"e",
"a",
"t",
"i",
"o",
"n",
"s",
"r",
"h",
"l",
"d",
"c",
"m",
"u",
"f",
"p",
"g",
"w",
"b",
"y",
"v",
"k",
"j",
"x",
"z",
"q",
],
"German": [
"e",
"n",
"i",
"r",
"s",
"t",
"a",
"d",
"h",
"u",
"l",
"g",
"o",
"c",
"m",
"b",
"f",
"k",
"w",
"z",
"p",
"v",
"ü",
"ä",
"ö",
"j",
],
"French": [
"e",
"a",
"s",
"n",
"i",
"t",
"r",
"l",
"u",
"o",
"d",
"c",
"p",
"m",
"é",
"v",
"g",
"f",
"b",
"h",
"q",
"à",
"x",
"è",
"y",
"j",
],
"Dutch": [
"e",
"n",
"a",
"i",
"r",
"t",
"o",
"d",
"s",
"l",
"g",
"h",
"v",
"m",
"u",
"k",
"c",
"p",
"b",
"w",
"j",
"z",
"f",
"y",
"x",
"ë",
],
"Italian": [
"e",
"i",
"a",
"o",
"n",
"l",
"t",
"r",
"s",
"c",
"d",
"u",
"p",
"m",
"g",
"v",
"f",
"b",
"z",
"h",
"q",
"è",
"à",
"k",
"y",
"ò",
],
"Polish": [
"a",
"i",
"o",
"e",
"n",
"r",
"z",
"w",
"s",
"c",
"t",
"k",
"y",
"d",
"p",
"m",
"u",
"l",
"j",
"ł",
"g",
"b",
"h",
"ą",
"ę",
"ó",
],
"Spanish": [
"e",
"a",
"o",
"n",
"s",
"r",
"i",
"l",
"d",
"t",
"c",
"u",
"m",
"p",
"b",
"g",
"v",
"f",
"y",
"ó",
"h",
"q",
"í",
"j",
"z",
"á",
],
"Russian": [
"о",
"а",
"е",
"и",
"н",
"с",
"т",
"р",
"в",
"л",
"к",
"м",
"д",
"п",
"у",
"г",
"я",
"ы",
"з",
"б",
"й",
"ь",
"ч",
"х",
"ж",
"ц",
],
# Jap-Kanji
"Japanese": [
"人",
"一",
"大",
"亅",
"丁",
"丨",
"竹",
"笑",
"口",
"日",
"今",
"二",
"彳",
"行",
"十",
"土",
"丶",
"寸",
"寺",
"時",
"乙",
"丿",
"乂",
"气",
"気",
"冂",
"巾",
"亠",
"市",
"目",
"儿",
"見",
"八",
"小",
"凵",
"県",
"月",
"彐",
"門",
"間",
"木",
"東",
"山",
"出",
"本",
"中",
"刀",
"分",
"耳",
"又",
"取",
"最",
"言",
"田",
"心",
"思",
"刂",
"前",
"京",
"尹",
"事",
"生",
"厶",
"云",
"会",
"未",
"来",
"白",
"冫",
"楽",
"灬",
"馬",
"尸",
"尺",
"駅",
"明",
"耂",
"者",
"了",
"阝",
"都",
"高",
"卜",
"占",
"厂",
"广",
"店",
"子",
"申",
"奄",
"亻",
"俺",
"上",
"方",
"冖",
"学",
"衣",
"艮",
"食",
"自",
],
# Jap-Katakana
"Japanese—": [
"ー",
"ン",
"ス",
"・",
"ル",
"ト",
"リ",
"イ",
"ア",
"ラ",
"ッ",
"ク",
"ド",
"シ",
"レ",
"ジ",
"タ",
"フ",
"ロ",
"カ",
"テ",
"マ",
"ィ",
"グ",
"バ",
"ム",
"プ",
"オ",
"コ",
"デ",
"ニ",
"ウ",
"メ",
"サ",
"ビ",
"ナ",
"ブ",
"ャ",
"エ",
"ュ",
"チ",
"キ",
"ズ",
"ダ",
"パ",
"ミ",
"ェ",
"ョ",
"ハ",
"セ",
"ベ",
"ガ",
"モ",
"ツ",
"ネ",
"ボ",
"ソ",
"ノ",
"ァ",
"ヴ",
"ワ",
"ポ",
"ペ",
"ピ",
"ケ",
"ゴ",
"ギ",
"ザ",
"ホ",
"ゲ",
"ォ",
"ヤ",
"ヒ",
"ユ",
"ヨ",
"ヘ",
"ゼ",
"ヌ",
"ゥ",
"ゾ",
"ヶ",
"ヂ",
"ヲ",
"ヅ",
"ヵ",
"ヱ",
"ヰ",
"ヮ",
"ヽ",
"゠",
"ヾ",
"ヷ",
"ヿ",
"ヸ",
"ヹ",
"ヺ",
],
# Jap-Hiragana
"Japanese——": [
"の",
"に",
"る",
"た",
"と",
"は",
"し",
"い",
"を",
"で",
"て",
"が",
"な",
"れ",
"か",
"ら",
"さ",
"っ",
"り",
"す",
"あ",
"も",
"こ",
"ま",
"う",
"く",
"よ",
"き",
"ん",
"め",
"お",
"け",
"そ",
"つ",
"だ",
"や",
"え",
"ど",
"わ",
"ち",
"み",
"せ",
"じ",
"ば",
"へ",
"び",
"ず",
"ろ",
"ほ",
"げ",
"む",
"べ",
"ひ",
"ょ",
"ゆ",
"ぶ",
"ご",
"ゃ",
"ね",
"ふ",
"ぐ",
"ぎ",
"ぼ",
"ゅ",
"づ",
"ざ",
"ぞ",
"ぬ",
"ぜ",
"ぱ",
"ぽ",
"ぷ",
"ぴ",
"ぃ",
"ぁ",
"ぇ",
"ぺ",
"ゞ",
"ぢ",
"ぉ",
"ぅ",
"ゐ",
"ゝ",
"ゑ",
"゛",
"゜",
"ゎ",
"ゔ",
"゚",
"ゟ",
"゙",
"ゕ",
"ゖ",
],
"Portuguese": [
"a",
"e",
"o",
"s",
"i",
"r",
"d",
"n",
"t",
"m",
"u",
"c",
"l",
"p",
"g",
"v",
"b",
"f",
"h",
"ã",
"q",
"é",
"ç",
"á",
"z",
"í",
],
"Swedish": [
"e",
"a",
"n",
"r",
"t",
"s",
"i",
"l",
"d",
"o",
"m",
"k",
"g",
"v",
"h",
"f",
"u",
"p",
"ä",
"c",
"b",
"ö",
"å",
"y",
"j",
"x",
],
"Chinese": [
"的",
"一",
"是",
"不",
"了",
"在",
"人",
"有",
"我",
"他",
"这",
"个",
"们",
"中",
"来",
"上",
"大",
"为",
"和",
"国",
"地",
"到",
"以",
"说",
"时",
"要",
"就",
"出",
"会",
"可",
"也",
"你",
"对",
"生",
"能",
"而",
"子",
"那",
"得",
"于",
"着",
"下",
"自",
"之",
"年",
"过",
"发",
"后",
"作",
"里",
"用",
"道",
"行",
"所",
"然",
"家",
"种",
"事",
"成",
"方",
"多",
"经",
"么",
"去",
"法",
"学",
"如",
"都",
"同",
"现",
"当",
"没",
"动",
"面",
"起",
"看",
"定",
"天",
"分",
"还",
"进",
"好",
"小",
"部",
"其",
"些",
"主",
"样",
"理",
"心",
"她",
"本",
"前",
"开",
"但",
"因",
"只",
"从",
"想",
"实",
],
"Ukrainian": [
"о",
"а",
"н",
"і",
"и",
"р",
"в",
"т",
"е",
"с",
"к",
"л",
"у",
"д",
"м",
"п",
"з",
"я",
"ь",
"б",
"г",
"й",
"ч",
"х",
"ц",
"ї",
],
"Norwegian": [
"e",
"r",
"n",
"t",
"a",
"s",
"i",
"o",
"l",
"d",
"g",
"k",
"m",
"v",
"f",
"p",
"u",
"b",
"h",
"å",
"y",
"j",
"ø",
"c",
"æ",
"w",
],
"Finnish": [
"a",
"i",
"n",
"t",
"e",
"s",
"l",
"o",
"u",
"k",
"ä",
"m",
"r",
"v",
"j",
"h",
"p",
"y",
"d",
"ö",
"g",
"c",
"b",
"f",
"w",
"z",
],
"Vietnamese": [
"n",
"h",
"t",
"i",
"c",
"g",
"a",
"o",
"u",
"m",
"l",
"r",
"à",
"đ",
"s",
"e",
"v",
"p",
"b",
"y",
"ư",
"d",
"á",
"k",
"ộ",
"ế",
],
"Czech": [
"o",
"e",
"a",
"n",
"t",
"s",
"i",
"l",
"v",
"r",
"k",
"d",
"u",
"m",
"p",
"í",
"c",
"h",
"z",
"á",
"y",
"j",
"b",
"ě",
"é",
"ř",
],
"Hungarian": [
"e",
"a",
"t",
"l",
"s",
"n",
"k",
"r",
"i",
"o",
"z",
"á",
"é",
"g",
"m",
"b",
"y",
"v",
"d",
"h",
"u",
"p",
"j",
"ö",
"f",
"c",
],
"Korean": [
"이",
"다",
"에",
"의",
"는",
"로",
"하",
"을",
"가",
"고",
"지",
"서",
"한",
"은",
"기",
"으",
"년",
"대",
"사",
"시",
"를",
"리",
"도",
"인",
"스",
"일",
],
"Indonesian": [
"a",
"n",
"e",
"i",
"r",
"t",
"u",
"s",
"d",
"k",
"m",
"l",
"g",
"p",
"b",
"o",
"h",
"y",
"j",
"c",
"w",
"f",
"v",
"z",
"x",
"q",
],
"Turkish": [
"a",
"e",
"i",
"n",
"r",
"l",
"ı",
"k",
"d",
"t",
"s",
"m",
"y",
"u",
"o",
"b",
"ü",
"ş",
"v",
"g",
"z",
"h",
"c",
"p",
"ç",
"ğ",
],
"Romanian": [
"e",
"i",
"a",
"r",
"n",
"t",
"u",
"l",
"o",
"c",
"s",
"d",
"p",
"m",
"ă",
"f",
"v",
"î",
"g",
"b",
"ș",
"ț",
"z",
"h",
"â",
"j",
],
"Farsi": [
"ا",
"ی",
"ر",
"د",
"ن",
"ه",
"و",
"م",
"ت",
"ب",
"س",
"ل",
"ک",
"ش",
"ز",
"ف",
"گ",
"ع",
"خ",
"ق",
"ج",
"آ",
"پ",
"ح",
"ط",
"ص",
],
"Arabic": [
"ا",
"ل",
"ي",
"م",
"و",
"ن",
"ر",
"ت",
"ب",
"ة",
"ع",
"د",
"س",
"ف",
"ه",
"ك",
"ق",
"أ",
"ح",
"ج",
"ش",
"ط",
"ص",
"ى",
"خ",
"إ",
],
"Danish": [
"e",
"r",
"n",
"t",
"a",
"i",
"s",
"d",
"l",
"o",
"g",
"m",
"k",
"f",
"v",
"u",
"b",
"h",
"p",
"å",
"y",
"ø",
"æ",
"c",
"j",
"w",
],
"Serbian": [
"а",
"и",
"о",
"е",
"н",
"р",
"с",
"у",
"т",
"к",
"ј",
"в",
"д",
"м",
"п",
"л",
"г",
"з",
"б",
"a",
"i",
"e",
"o",
"n",
"ц",
"ш",
],
"Lithuanian": [
"i",
"a",
"s",
"o",
"r",
"e",
"t",
"n",
"u",
"k",
"m",
"l",
"p",
"v",
"d",
"j",
"g",
"ė",
"b",
"y",
"ų",
"š",
"ž",
"c",
"ą",
"į",
],
"Slovene": [
"e",
"a",
"i",
"o",
"n",
"r",
"s",
"l",
"t",
"j",
"v",
"k",
"d",
"p",
"m",
"u",
"z",
"b",
"g",
"h",
"č",
"c",
"š",
"ž",
"f",
"y",
],
"Slovak": [
"o",
"a",
"e",
"n",
"i",
"r",
"v",
"t",
"s",
"l",
"k",
"d",
"m",
"p",
"u",
"c",
"h",
"j",
"b",
"z",
"á",
"y",
"ý",
"í",
"č",
"é",
],
"Hebrew": [
"י",
"ו",
"ה",
"ל",
"ר",
"ב",
"ת",
"מ",
"א",
"ש",
"נ",
"ע",
"ם",
"ד",
"ק",
"ח",
"פ",
"ס",
"כ",
"ג",
"ט",
"צ",
"ן",
"ז",
"ך",
],
"Bulgarian": [
"а",
"и",
"о",
"е",
"н",
"т",
"р",
"с",
"в",
"л",
"к",
"д",
"п",
"м",
"з",
"г",
"я",
"ъ",
"у",
"б",
"ч",
"ц",
"й",
"ж",
"щ",
"х",
],
"Croatian": [
"a",
"i",
"o",
"e",
"n",
"r",
"j",
"s",
"t",
"u",
"k",
"l",
"v",
"d",
"m",
"p",
"g",
"z",
"b",
"c",
"č",
"h",
"š",
"ž",
"ć",
"f",
],
"Hindi": [
"क",
"र",
"स",
"न",
"त",
"म",
"ह",
"प",
"य",
"ल",
"व",
"ज",
"द",
"ग",
"ब",
"श",
"ट",
"अ",
"ए",
"थ",
"भ",
"ड",
"च",
"ध",
"ष",
"इ",
],
"Estonian": [
"a",
"i",
"e",
"s",
"t",
"l",
"u",
"n",
"o",
"k",
"r",
"d",
"m",
"v",
"g",
"p",
"j",
"h",
"ä",
"b",
"õ",
"ü",
"f",
"c",
"ö",
"y",
],
"Thai": [
"า",
"น",
"ร",
"อ",
"ก",
"เ",
"ง",
"ม",
"ย",
"ล",
"ว",
"ด",
"ท",
"ส",
"ต",
"ะ",
"ป",
"บ",
"ค",
"ห",
"แ",
"จ",
"พ",
"ช",
"ข",
"ใ",
],
"Greek": [
"α",
"τ",
"ο",
"ι",
"ε",
"ν",
"ρ",
"σ",
"κ",
"η",
"π",
"ς",
"υ",
"μ",
"λ",
"ί",
"ό",
"ά",
"γ",
"έ",
"δ",
"ή",
"ω",
"χ",
"θ",
"ύ",
],
"Tamil": [
"க",
"த",
"ப",
"ட",
"ர",
"ம",
"ல",
"ன",
"வ",
"ற",
"ய",
"ள",
"ச",
"ந",
"இ",
"ண",
"அ",
"ஆ",
"ழ",
"ங",
"எ",
"உ",
"ஒ",
"ஸ",
],
"Kazakh": [
"а",
"ы",
"е",
"н",
"т",
"р",
"л",
"і",
"д",
"с",
"м",
"қ",
"к",
"о",
"б",
"и",
"у",
"ғ",
"ж",
"ң",
"з",
"ш",
"й",
"п",
"г",
"ө",
],
}
LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
python-charset-normalizer-3.4.2/src/charset_normalizer/legacy.py 0000664 0000000 0000000 00000004357 15005045421 0025155 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from typing import TYPE_CHECKING, Any
from warnings import warn
from .api import from_bytes
from .constant import CHARDET_CORRESPONDENCE
# TODO: remove this check when dropping Python 3.7 support
if TYPE_CHECKING:
from typing_extensions import TypedDict
class ResultDict(TypedDict):
encoding: str | None
language: str
confidence: float | None
def detect(
byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> ResultDict:
"""
chardet legacy method
Detect the encoding of the given byte string. It should be mostly backward-compatible.
Encoding names will match chardet's own spelling whenever possible (except for encodings it does not support).
This function is deprecated and should only be used to ease migrating your project; consult the documentation for
further information. Not planned for removal.
:param byte_str: The byte sequence to examine.
:param should_rename_legacy: Should we rename legacy encodings
to their more modern equivalents?
"""
if len(kwargs):
warn(
f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
)
if not isinstance(byte_str, (bytearray, bytes)):
raise TypeError( # pragma: nocover
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
)
if isinstance(byte_str, bytearray):
byte_str = bytes(byte_str)
r = from_bytes(byte_str).best()
encoding = r.encoding if r is not None else None
language = r.language if r is not None and r.language != "Unknown" else ""
confidence = 1.0 - r.chaos if r is not None else None
# Note: CharsetNormalizer does not return 'UTF-8-SIG' since the sig gets stripped in the detection/normalization process,
# but chardet does return 'utf-8-sig' and it is a valid codec name.
if r is not None and encoding == "utf_8" and r.bom:
encoding += "_sig"
if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
encoding = CHARDET_CORRESPONDENCE[encoding]
return {
"encoding": encoding,
"language": language,
"confidence": confidence,
}
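# Illustrative usage (hedged sketch): the return value mirrors chardet's shape; only the
# dictionary keys are guaranteed by ResultDict above, and a valid UTF-8 input containing
# multi-byte sequences is typically reported as "utf-8".
#
#     >>> detect("Ceci est un exemple de texte en français accentué.".encode("utf-8"))
#     {'encoding': 'utf-8', 'language': ..., 'confidence': ...}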
python-charset-normalizer-3.4.2/src/charset_normalizer/md.py 0000664 0000000 0000000 00000047261 15005045421 0024312 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from functools import lru_cache
from logging import getLogger
from .constant import (
COMMON_SAFE_ASCII_CHARACTERS,
TRACE,
UNICODE_SECONDARY_RANGE_KEYWORD,
)
from .utils import (
is_accentuated,
is_arabic,
is_arabic_isolated_form,
is_case_variable,
is_cjk,
is_emoticon,
is_hangul,
is_hiragana,
is_katakana,
is_latin,
is_punctuation,
is_separator,
is_symbol,
is_thai,
is_unprintable,
remove_accent,
unicode_range,
is_cjk_uncommon,
)
class MessDetectorPlugin:
"""
Base abstract class used for mess detection plugins.
All detectors MUST extend and implement given methods.
"""
def eligible(self, character: str) -> bool:
"""
Determine if given character should be fed in.
"""
raise NotImplementedError # pragma: nocover
def feed(self, character: str) -> None:
"""
The main routine to be executed upon character.
Insert the logic by which the text would be considered chaotic.
"""
raise NotImplementedError # pragma: nocover
def reset(self) -> None: # pragma: no cover
"""
Reset the plugin to its initial state.
"""
raise NotImplementedError
@property
def ratio(self) -> float:
"""
Compute the chaos ratio based on what your feed() has seen.
Must NOT be lower than 0.; there is no upper restriction.
"""
raise NotImplementedError # pragma: nocover
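# A minimal custom plugin, sketched as a comment only: mess_ratio() below discovers
# detectors through MessDetectorPlugin.__subclasses__(), so actually defining this class
# here would silently enroll it in every detection run. Names and the 0.3 cut-off are
# illustrative assumptions, not part of the library.
#
#     class TooManyQuestionMarkPlugin(MessDetectorPlugin):
#         def __init__(self) -> None:
#             self._character_count: int = 0
#             self._question_mark_count: int = 0
#
#         def eligible(self, character: str) -> bool:
#             return character.isprintable()
#
#         def feed(self, character: str) -> None:
#             self._character_count += 1
#             if character == "?":
#                 self._question_mark_count += 1
#
#         def reset(self) -> None:
#             self._character_count = 0
#             self._question_mark_count = 0
#
#         @property
#         def ratio(self) -> float:
#             if self._character_count == 0:
#                 return 0.0
#             ratio_of_question_marks = self._question_mark_count / self._character_count
#             return ratio_of_question_marks if ratio_of_question_marks >= 0.3 else 0.0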
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._punctuation_count: int = 0
self._symbol_count: int = 0
self._character_count: int = 0
self._last_printable_char: str | None = None
self._frenzy_symbol_in_word: bool = False
def eligible(self, character: str) -> bool:
return character.isprintable()
def feed(self, character: str) -> None:
self._character_count += 1
if (
character != self._last_printable_char
and character not in COMMON_SAFE_ASCII_CHARACTERS
):
if is_punctuation(character):
self._punctuation_count += 1
elif (
character.isdigit() is False
and is_symbol(character)
and is_emoticon(character) is False
):
self._symbol_count += 2
self._last_printable_char = character
def reset(self) -> None: # Abstract
self._punctuation_count = 0
self._character_count = 0
self._symbol_count = 0
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
ratio_of_punctuation: float = (
self._punctuation_count + self._symbol_count
) / self._character_count
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
class TooManyAccentuatedPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._character_count: int = 0
self._accentuated_count: int = 0
def eligible(self, character: str) -> bool:
return character.isalpha()
def feed(self, character: str) -> None:
self._character_count += 1
if is_accentuated(character):
self._accentuated_count += 1
def reset(self) -> None: # Abstract
self._character_count = 0
self._accentuated_count = 0
@property
def ratio(self) -> float:
if self._character_count < 8:
return 0.0
ratio_of_accentuation: float = self._accentuated_count / self._character_count
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
class UnprintablePlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._unprintable_count: int = 0
self._character_count: int = 0
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
if is_unprintable(character):
self._unprintable_count += 1
self._character_count += 1
def reset(self) -> None: # Abstract
self._unprintable_count = 0
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
return (self._unprintable_count * 8) / self._character_count
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._successive_count: int = 0
self._character_count: int = 0
self._last_latin_character: str | None = None
def eligible(self, character: str) -> bool:
return character.isalpha() and is_latin(character)
def feed(self, character: str) -> None:
self._character_count += 1
if (
self._last_latin_character is not None
and is_accentuated(character)
and is_accentuated(self._last_latin_character)
):
if character.isupper() and self._last_latin_character.isupper():
self._successive_count += 1
# Worse if it's the same char duplicated with a different accent.
if remove_accent(character) == remove_accent(self._last_latin_character):
self._successive_count += 1
self._last_latin_character = character
def reset(self) -> None: # Abstract
self._successive_count = 0
self._character_count = 0
self._last_latin_character = None
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
return (self._successive_count * 2) / self._character_count
class SuspiciousRange(MessDetectorPlugin):
def __init__(self) -> None:
self._suspicious_successive_range_count: int = 0
self._character_count: int = 0
self._last_printable_seen: str | None = None
def eligible(self, character: str) -> bool:
return character.isprintable()
def feed(self, character: str) -> None:
self._character_count += 1
if (
character.isspace()
or is_punctuation(character)
or character in COMMON_SAFE_ASCII_CHARACTERS
):
self._last_printable_seen = None
return
if self._last_printable_seen is None:
self._last_printable_seen = character
return
unicode_range_a: str | None = unicode_range(self._last_printable_seen)
unicode_range_b: str | None = unicode_range(character)
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
self._suspicious_successive_range_count += 1
self._last_printable_seen = character
def reset(self) -> None: # Abstract
self._character_count = 0
self._suspicious_successive_range_count = 0
self._last_printable_seen = None
@property
def ratio(self) -> float:
if self._character_count <= 13:
return 0.0
ratio_of_suspicious_range_usage: float = (
self._suspicious_successive_range_count * 2
) / self._character_count
return ratio_of_suspicious_range_usage
class SuperWeirdWordPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._word_count: int = 0
self._bad_word_count: int = 0
self._foreign_long_count: int = 0
self._is_current_word_bad: bool = False
self._foreign_long_watch: bool = False
self._character_count: int = 0
self._bad_character_count: int = 0
self._buffer: str = ""
self._buffer_accent_count: int = 0
self._buffer_glyph_count: int = 0
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
if character.isalpha():
self._buffer += character
if is_accentuated(character):
self._buffer_accent_count += 1
if (
self._foreign_long_watch is False
and (is_latin(character) is False or is_accentuated(character))
and is_cjk(character) is False
and is_hangul(character) is False
and is_katakana(character) is False
and is_hiragana(character) is False
and is_thai(character) is False
):
self._foreign_long_watch = True
if (
is_cjk(character)
or is_hangul(character)
or is_katakana(character)
or is_hiragana(character)
or is_thai(character)
):
self._buffer_glyph_count += 1
return
if not self._buffer:
return
if (
character.isspace() or is_punctuation(character) or is_separator(character)
) and self._buffer:
self._word_count += 1
buffer_length: int = len(self._buffer)
self._character_count += buffer_length
if buffer_length >= 4:
if self._buffer_accent_count / buffer_length >= 0.5:
self._is_current_word_bad = True
# Words/buffers ending with an uppercase accentuated letter are so rare
# that we consider them all suspicious. Same weight as the foreign_long suspicion.
elif (
is_accentuated(self._buffer[-1])
and self._buffer[-1].isupper()
and all(_.isupper() for _ in self._buffer) is False
):
self._foreign_long_count += 1
self._is_current_word_bad = True
elif self._buffer_glyph_count == 1:
self._is_current_word_bad = True
self._foreign_long_count += 1
if buffer_length >= 24 and self._foreign_long_watch:
camel_case_dst = [
i
for c, i in zip(self._buffer, range(0, buffer_length))
if c.isupper()
]
probable_camel_cased: bool = False
if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
probable_camel_cased = True
if not probable_camel_cased:
self._foreign_long_count += 1
self._is_current_word_bad = True
if self._is_current_word_bad:
self._bad_word_count += 1
self._bad_character_count += len(self._buffer)
self._is_current_word_bad = False
self._foreign_long_watch = False
self._buffer = ""
self._buffer_accent_count = 0
self._buffer_glyph_count = 0
elif (
character not in {"<", ">", "-", "=", "~", "|", "_"}
and character.isdigit() is False
and is_symbol(character)
):
self._is_current_word_bad = True
self._buffer += character
def reset(self) -> None: # Abstract
self._buffer = ""
self._is_current_word_bad = False
self._foreign_long_watch = False
self._bad_word_count = 0
self._word_count = 0
self._character_count = 0
self._bad_character_count = 0
self._foreign_long_count = 0
@property
def ratio(self) -> float:
if self._word_count <= 10 and self._foreign_long_count == 0:
return 0.0
return self._bad_character_count / self._character_count
class CjkUncommonPlugin(MessDetectorPlugin):
"""
Detect messy CJK text that probably means nothing.
"""
def __init__(self) -> None:
self._character_count: int = 0
self._uncommon_count: int = 0
def eligible(self, character: str) -> bool:
return is_cjk(character)
def feed(self, character: str) -> None:
self._character_count += 1
if is_cjk_uncommon(character):
self._uncommon_count += 1
return
def reset(self) -> None: # Abstract
self._character_count = 0
self._uncommon_count = 0
@property
def ratio(self) -> float:
if self._character_count < 8:
return 0.0
uncommon_form_usage: float = self._uncommon_count / self._character_count
# We can be pretty sure it's garbage when uncommon characters are widely
# used; otherwise it could just be Traditional Chinese, for example.
return uncommon_form_usage / 10 if uncommon_form_usage > 0.5 else 0.0
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._buf: bool = False
self._character_count_since_last_sep: int = 0
self._successive_upper_lower_count: int = 0
self._successive_upper_lower_count_final: int = 0
self._character_count: int = 0
self._last_alpha_seen: str | None = None
self._current_ascii_only: bool = True
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
is_concerned = character.isalpha() and is_case_variable(character)
chunk_sep = is_concerned is False
if chunk_sep and self._character_count_since_last_sep > 0:
if (
self._character_count_since_last_sep <= 64
and character.isdigit() is False
and self._current_ascii_only is False
):
self._successive_upper_lower_count_final += (
self._successive_upper_lower_count
)
self._successive_upper_lower_count = 0
self._character_count_since_last_sep = 0
self._last_alpha_seen = None
self._buf = False
self._character_count += 1
self._current_ascii_only = True
return
if self._current_ascii_only is True and character.isascii() is False:
self._current_ascii_only = False
if self._last_alpha_seen is not None:
if (character.isupper() and self._last_alpha_seen.islower()) or (
character.islower() and self._last_alpha_seen.isupper()
):
if self._buf is True:
self._successive_upper_lower_count += 2
self._buf = False
else:
self._buf = True
else:
self._buf = False
self._character_count += 1
self._character_count_since_last_sep += 1
self._last_alpha_seen = character
def reset(self) -> None: # Abstract
self._character_count = 0
self._character_count_since_last_sep = 0
self._successive_upper_lower_count = 0
self._successive_upper_lower_count_final = 0
self._last_alpha_seen = None
self._buf = False
self._current_ascii_only = True
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
return self._successive_upper_lower_count_final / self._character_count
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._character_count: int = 0
self._isolated_form_count: int = 0
def reset(self) -> None: # Abstract
self._character_count = 0
self._isolated_form_count = 0
def eligible(self, character: str) -> bool:
return is_arabic(character)
def feed(self, character: str) -> None:
self._character_count += 1
if is_arabic_isolated_form(character):
self._isolated_form_count += 1
@property
def ratio(self) -> float:
if self._character_count < 8:
return 0.0
isolated_form_usage: float = self._isolated_form_count / self._character_count
return isolated_form_usage
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
unicode_range_a: str | None, unicode_range_b: str | None
) -> bool:
"""
    Determine if two Unicode ranges seen next to each other can be considered suspicious.
"""
if unicode_range_a is None or unicode_range_b is None:
return True
if unicode_range_a == unicode_range_b:
return False
if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
return False
if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
return False
    # Latin characters can be accompanied by a combining diacritical mark,
    # e.g. Vietnamese.
if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
"Combining" in unicode_range_a or "Combining" in unicode_range_b
):
return False
keywords_range_a, keywords_range_b = (
unicode_range_a.split(" "),
unicode_range_b.split(" "),
)
for el in keywords_range_a:
if el in UNICODE_SECONDARY_RANGE_KEYWORD:
continue
if el in keywords_range_b:
return False
# Japanese Exception
range_a_jp_chars, range_b_jp_chars = (
unicode_range_a
in (
"Hiragana",
"Katakana",
),
unicode_range_b in ("Hiragana", "Katakana"),
)
if (range_a_jp_chars or range_b_jp_chars) and (
"CJK" in unicode_range_a or "CJK" in unicode_range_b
):
return False
if range_a_jp_chars and range_b_jp_chars:
return False
if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
return False
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
return False
    # Chinese/Japanese use dedicated ranges for punctuation and/or separators.
if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
unicode_range_a in ["Katakana", "Hiragana"]
and unicode_range_b in ["Katakana", "Hiragana"]
):
if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
return False
if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
return False
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
return False
return True
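# Illustrative sketch (not part of the original module): probes
# is_suspiciously_successive_range() with a few range-name pairs. The range
# names below are assumptions based on common Unicode block naming; the
# function itself only performs substring checks on whatever names it is given.
def _example_suspicious_range_probe() -> list[bool]:
    pairs = [
        ("Basic Latin", "Cyrillic"),  # mixed scripts: expected suspicious
        ("Hiragana", "Katakana"),  # Japanese exception: expected not suspicious
        ("Basic Latin", "Latin-1 Supplement"),  # both Latin: expected not suspicious
    ]
    return [is_suspiciously_successive_range(a, b) for a, b in pairs]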
@lru_cache(maxsize=2048)
def mess_ratio(
decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
"""
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold stops the computation early once it is reached.
"""
detectors: list[MessDetectorPlugin] = [
md_class() for md_class in MessDetectorPlugin.__subclasses__()
]
length: int = len(decoded_sequence) + 1
mean_mess_ratio: float = 0.0
if length < 512:
intermediary_mean_mess_ratio_calc: int = 32
elif length <= 1024:
intermediary_mean_mess_ratio_calc = 64
else:
intermediary_mean_mess_ratio_calc = 128
for character, index in zip(decoded_sequence + "\n", range(length)):
for detector in detectors:
if detector.eligible(character):
detector.feed(character)
if (
index > 0 and index % intermediary_mean_mess_ratio_calc == 0
) or index == length - 1:
mean_mess_ratio = sum(dt.ratio for dt in detectors)
if mean_mess_ratio >= maximum_threshold:
break
if debug:
logger = getLogger("charset_normalizer")
logger.log(
TRACE,
"Mess-detector extended-analysis start. "
f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
f"maximum_threshold={maximum_threshold}",
)
if len(decoded_sequence) > 16:
logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
for dt in detectors:
logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
return round(mean_mess_ratio, 3)
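# Illustrative sketch (not part of the original module): shows how mess_ratio()
# might be called directly on already-decoded text. The noisy sample mirrors a
# test-suite fixture; the clean sentence is an assumption. Only the relative
# ordering of the two scores is the point here.
def _example_mess_ratio_usage() -> None:
    clean_score = mess_ratio("A perfectly ordinary English sentence.")
    noisy_score = mess_ratio("ÇáÚŞáíÉ , ÇáÊäæíã ÇáãÛäÇØíÓí", maximum_threshold=1.0)
    # A clean sentence is expected to score lower than mojibake-looking text.
    assert clean_score <= noisy_score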
python-charset-normalizer-3.4.2/src/charset_normalizer/models.py 0000664 0000000 0000000 00000030152 15005045421 0025164 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
from re import sub
from typing import Any, Iterator, List, Tuple
from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
from .utils import iana_name, is_multi_byte_encoding, unicode_range
class CharsetMatch:
def __init__(
self,
payload: bytes,
guessed_encoding: str,
mean_mess_ratio: float,
has_sig_or_bom: bool,
languages: CoherenceMatches,
decoded_payload: str | None = None,
preemptive_declaration: str | None = None,
):
self._payload: bytes = payload
self._encoding: str = guessed_encoding
self._mean_mess_ratio: float = mean_mess_ratio
self._languages: CoherenceMatches = languages
self._has_sig_or_bom: bool = has_sig_or_bom
self._unicode_ranges: list[str] | None = None
self._leaves: list[CharsetMatch] = []
self._mean_coherence_ratio: float = 0.0
self._output_payload: bytes | None = None
self._output_encoding: str | None = None
self._string: str | None = decoded_payload
self._preemptive_declaration: str | None = preemptive_declaration
def __eq__(self, other: object) -> bool:
if not isinstance(other, CharsetMatch):
if isinstance(other, str):
return iana_name(other) == self.encoding
return False
return self.encoding == other.encoding and self.fingerprint == other.fingerprint
def __lt__(self, other: object) -> bool:
"""
        Implemented to make sorted() usable on a list of CharsetMatch items.
"""
if not isinstance(other, CharsetMatch):
raise ValueError
chaos_difference: float = abs(self.chaos - other.chaos)
coherence_difference: float = abs(self.coherence - other.coherence)
# Below 1% difference --> Use Coherence
if chaos_difference < 0.01 and coherence_difference > 0.02:
return self.coherence > other.coherence
elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte characters as possible.
            # For very large payloads, fall back to the chaos comparison to preserve RAM usage.
if len(self._payload) >= TOO_BIG_SEQUENCE:
return self.chaos < other.chaos
return self.multi_byte_usage > other.multi_byte_usage
return self.chaos < other.chaos
@property
def multi_byte_usage(self) -> float:
return 1.0 - (len(str(self)) / len(self.raw))
def __str__(self) -> str:
# Lazy Str Loading
if self._string is None:
self._string = str(self._payload, self._encoding, "strict")
return self._string
def __repr__(self) -> str:
        return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"
def add_submatch(self, other: CharsetMatch) -> None:
if not isinstance(other, CharsetMatch) or other == self:
raise ValueError(
"Unable to add instance <{}> as a submatch of a CharsetMatch".format(
other.__class__
)
)
other._string = None # Unload RAM usage; dirty trick.
self._leaves.append(other)
@property
def encoding(self) -> str:
return self._encoding
@property
def encoding_aliases(self) -> list[str]:
"""
        Encodings are known by many names; using this could help when searching for IBM855 when it is listed as CP855.
"""
also_known_as: list[str] = []
for u, p in aliases.items():
if self.encoding == u:
also_known_as.append(p)
elif self.encoding == p:
also_known_as.append(u)
return also_known_as
@property
def bom(self) -> bool:
return self._has_sig_or_bom
@property
def byte_order_mark(self) -> bool:
return self._has_sig_or_bom
@property
def languages(self) -> list[str]:
"""
Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. The returned list may be empty even if the 'language' property returns something other than 'Unknown'.
"""
return [e[0] for e in self._languages]
@property
def language(self) -> str:
"""
Most probable language found in decoded sequence. If none were detected or inferred, the property will return
"Unknown".
"""
if not self._languages:
# Trying to infer the language based on the given encoding
            # It's either English or we should not pronounce ourselves in certain cases.
if "ascii" in self.could_be_from_charset:
return "English"
            # doing it here to avoid a circular import
from charset_normalizer.cd import encoding_languages, mb_encoding_languages
languages = (
mb_encoding_languages(self.encoding)
if is_multi_byte_encoding(self.encoding)
else encoding_languages(self.encoding)
)
if len(languages) == 0 or "Latin Based" in languages:
return "Unknown"
return languages[0]
return self._languages[0][0]
@property
def chaos(self) -> float:
return self._mean_mess_ratio
@property
def coherence(self) -> float:
if not self._languages:
return 0.0
return self._languages[0][1]
@property
def percent_chaos(self) -> float:
return round(self.chaos * 100, ndigits=3)
@property
def percent_coherence(self) -> float:
return round(self.coherence * 100, ndigits=3)
@property
def raw(self) -> bytes:
"""
Original untouched bytes.
"""
return self._payload
@property
def submatch(self) -> list[CharsetMatch]:
return self._leaves
@property
def has_submatch(self) -> bool:
return len(self._leaves) > 0
@property
def alphabets(self) -> list[str]:
if self._unicode_ranges is not None:
return self._unicode_ranges
# list detected ranges
detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
# filter and sort
self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
return self._unicode_ranges
@property
def could_be_from_charset(self) -> list[str]:
"""
        The complete list of encodings that output the exact SAME str result and therefore could be the originating
        encoding.
        This list includes the encoding available in the 'encoding' property.
"""
return [self._encoding] + [m.encoding for m in self._leaves]
def output(self, encoding: str = "utf_8") -> bytes:
"""
        Method to get the re-encoded bytes payload using the given target encoding. Defaults to UTF-8.
        Unencodable characters are replaced by the encoder (errors="replace") rather than raising an exception.
"""
if self._output_encoding is None or self._output_encoding != encoding:
self._output_encoding = encoding
decoded_string = str(self)
if (
self._preemptive_declaration is not None
and self._preemptive_declaration.lower()
not in ["utf-8", "utf8", "utf_8"]
):
patched_header = sub(
RE_POSSIBLE_ENCODING_INDICATION,
lambda m: m.string[m.span()[0] : m.span()[1]].replace(
m.groups()[0],
iana_name(self._output_encoding).replace("_", "-"), # type: ignore[arg-type]
),
decoded_string[:8192],
count=1,
)
decoded_string = patched_header + decoded_string[8192:]
self._output_payload = decoded_string.encode(encoding, "replace")
return self._output_payload # type: ignore
@property
def fingerprint(self) -> str:
"""
Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
"""
return sha256(self.output()).hexdigest()
class CharsetMatches:
"""
    Container holding every CharsetMatch item, ordered by default from the most probable to the least probable.
    Acts like a list (iterable) but does not implement all related methods.
"""
def __init__(self, results: list[CharsetMatch] | None = None):
self._results: list[CharsetMatch] = sorted(results) if results else []
def __iter__(self) -> Iterator[CharsetMatch]:
yield from self._results
def __getitem__(self, item: int | str) -> CharsetMatch:
"""
Retrieve a single item either by its position or encoding name (alias may be used here).
Raise KeyError upon invalid index or encoding not present in results.
"""
if isinstance(item, int):
return self._results[item]
if isinstance(item, str):
item = iana_name(item, False)
for result in self._results:
if item in result.could_be_from_charset:
return result
raise KeyError
def __len__(self) -> int:
return len(self._results)
def __bool__(self) -> bool:
return len(self._results) > 0
def append(self, item: CharsetMatch) -> None:
"""
        Insert a single match. It will be inserted at the right position to preserve the ordering.
        Can be inserted as a submatch.
"""
if not isinstance(item, CharsetMatch):
raise ValueError(
"Cannot append instance '{}' to CharsetMatches".format(
str(item.__class__)
)
)
# We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
if len(item.raw) < TOO_BIG_SEQUENCE:
for match in self._results:
if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
match.add_submatch(item)
return
self._results.append(item)
self._results = sorted(self._results)
def best(self) -> CharsetMatch | None:
"""
Simply return the first match. Strict equivalent to matches[0].
"""
if not self._results:
return None
return self._results[0]
def first(self) -> CharsetMatch | None:
"""
Redundant method, call the method best(). Kept for BC reasons.
"""
return self.best()
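# Illustrative sketch (not part of the original module): demonstrates that a
# CharsetMatches container can be indexed by position or by (aliased) encoding
# name, as documented in __getitem__ above. The `match` argument is assumed to
# be an already-constructed CharsetMatch.
def _example_charset_matches_lookup(match: CharsetMatch) -> CharsetMatch:
    matches = CharsetMatches([match])
    assert matches.best() is matches[0]
    # String lookup goes through could_be_from_charset on each result.
    return matches[match.encoding]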
CoherenceMatch = Tuple[str, float]
CoherenceMatches = List[CoherenceMatch]
class CliDetectionResult:
def __init__(
self,
path: str,
encoding: str | None,
encoding_aliases: list[str],
alternative_encodings: list[str],
language: str,
alphabets: list[str],
has_sig_or_bom: bool,
chaos: float,
coherence: float,
unicode_path: str | None,
is_preferred: bool,
):
self.path: str = path
self.unicode_path: str | None = unicode_path
self.encoding: str | None = encoding
self.encoding_aliases: list[str] = encoding_aliases
self.alternative_encodings: list[str] = alternative_encodings
self.language: str = language
self.alphabets: list[str] = alphabets
self.has_sig_or_bom: bool = has_sig_or_bom
self.chaos: float = chaos
self.coherence: float = coherence
self.is_preferred: bool = is_preferred
@property
def __dict__(self) -> dict[str, Any]: # type: ignore
return {
"path": self.path,
"encoding": self.encoding,
"encoding_aliases": self.encoding_aliases,
"alternative_encodings": self.alternative_encodings,
"language": self.language,
"alphabets": self.alphabets,
"has_sig_or_bom": self.has_sig_or_bom,
"chaos": self.chaos,
"coherence": self.coherence,
"unicode_path": self.unicode_path,
"is_preferred": self.is_preferred,
}
def to_json(self) -> str:
return dumps(self.__dict__, ensure_ascii=True, indent=4)
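# Illustrative sketch (not part of the original module): ties the CharsetMatch
# properties documented above to the usual from_bytes() entry point. The import
# is done lazily inside the function purely to avoid a circular import here.
def _example_charset_match_usage() -> None:
    from charset_normalizer import from_bytes  # local import: avoid circularity

    best = from_bytes("Bсеки човек има право на образование.".encode("cp1251")).best()
    if best is not None:
        # chaos/coherence are the two ratios used by __lt__ for ordering.
        print(best.encoding, best.percent_chaos, best.percent_coherence)
        print(best.could_be_from_charset)
        print(best.output(encoding="utf_8")[:32])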
python-charset-normalizer-3.4.2/src/charset_normalizer/py.typed 0000664 0000000 0000000 00000000000 15005045421 0025013 0 ustar 00root root 0000000 0000000 python-charset-normalizer-3.4.2/src/charset_normalizer/utils.py 0000664 0000000 0000000 00000027612 15005045421 0025050 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import importlib
import logging
import unicodedata
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import Generator
from _multibytecodec import ( # type: ignore[import-not-found,import]
MultibyteIncrementalDecoder,
)
from .constant import (
ENCODING_MARKS,
IANA_SUPPORTED_SIMILAR,
RE_POSSIBLE_ENCODING_INDICATION,
UNICODE_RANGES_COMBINED,
UNICODE_SECONDARY_RANGE_KEYWORD,
UTF8_MAXIMAL_ALLOCATION,
COMMON_CJK_CHARACTERS,
)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
try:
description: str = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return (
"WITH GRAVE" in description
or "WITH ACUTE" in description
or "WITH CEDILLA" in description
or "WITH DIAERESIS" in description
or "WITH CIRCUMFLEX" in description
or "WITH TILDE" in description
or "WITH MACRON" in description
or "WITH RING ABOVE" in description
)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
decomposed: str = unicodedata.decomposition(character)
if not decomposed:
return character
codes: list[str] = decomposed.split(" ")
return chr(int(codes[0], 16))
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> str | None:
"""
Retrieve the Unicode range official name from a single character.
"""
character_ord: int = ord(character)
for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
if character_ord in ord_range:
return range_name
return None
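# Illustrative sketch (not part of the original module): unicode_range() maps a
# single character to the name of its Unicode block as listed in
# UNICODE_RANGES_COMBINED, or None when the character is not covered. The sample
# characters below are arbitrary assumptions.
def _example_unicode_range_lookup() -> list[str | None]:
    return [unicode_range(c) for c in ("a", "é", "我", "😀")]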
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
try:
description: str = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return "LATIN" in description
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
character_category: str = unicodedata.category(character)
if "P" in character_category:
return True
character_range: str | None = unicode_range(character)
if character_range is None:
return False
return "Punctuation" in character_range
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
character_category: str = unicodedata.category(character)
if "S" in character_category or "N" in character_category:
return True
character_range: str | None = unicode_range(character)
if character_range is None:
return False
return "Forms" in character_range and character_category != "Lo"
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
character_range: str | None = unicode_range(character)
if character_range is None:
return False
return "Emoticons" in character_range or "Pictographs" in character_range
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
if character.isspace() or character in {"|", "+", "<", ">"}:
return True
character_category: str = unicodedata.category(character)
return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
return character.islower() != character.isupper()
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return "CJK" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return "HIRAGANA" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return "KATAKANA" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return "HANGUL" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return "THAI" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return "ARABIC" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return "ARABIC" in character_name and "ISOLATED FORM" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk_uncommon(character: str) -> bool:
return character not in COMMON_CJK_CHARACTERS
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
return (
character.isspace() is False # includes \n \t \r \v
and character.isprintable() is False
        and character != "\x1a"  # Why? It's the ASCII substitute character.
        and character != "\ufeff"  # bug discovered in Python:
        # the Zero Width No-Break Space, located in Arabic Presentation Forms-B (Unicode 1.1), is not acknowledged as a space.
)
def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> str | None:
"""
    Extract, using an ASCII-only decoder, any specified encoding found in the first n bytes.
"""
if not isinstance(sequence, bytes):
raise TypeError
seq_len: int = len(sequence)
results: list[str] = findall(
RE_POSSIBLE_ENCODING_INDICATION,
sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
)
if len(results) == 0:
return None
for specified_encoding in results:
specified_encoding = specified_encoding.lower().replace("-", "_")
encoding_alias: str
encoding_iana: str
for encoding_alias, encoding_iana in aliases.items():
if encoding_alias == specified_encoding:
return encoding_iana
if encoding_iana == specified_encoding:
return encoding_iana
return None
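# Illustrative sketch (not part of the original module): any_specified_encoding()
# searches the first `search_zone` bytes for a declared charset (XML prolog,
# HTML meta, Python coding cookie, ...). The sample payload is an assumption.
def _example_any_specified_encoding() -> str | None:
    payload = b"# coding: utf-8\nprint('hello')\n"
    # Expected to resolve to the normalized Python codec name, e.g. "utf_8".
    return any_specified_encoding(payload)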
@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
"""
    Verify whether a specific encoding is a multi-byte one based on its IANA name.
"""
return name in {
"utf_8",
"utf_8_sig",
"utf_16",
"utf_16_be",
"utf_16_le",
"utf_32",
"utf_32_le",
"utf_32_be",
"utf_7",
} or issubclass(
importlib.import_module(f"encodings.{name}").IncrementalDecoder,
MultibyteIncrementalDecoder,
)
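# Illustrative sketch (not part of the original module): is_multi_byte_encoding()
# answers from a fixed allow-list first, then falls back to checking whether the
# codec's IncrementalDecoder derives from MultibyteIncrementalDecoder.
def _example_is_multi_byte_encoding() -> None:
    print(is_multi_byte_encoding("utf_8"))  # expected: True (allow-list)
    print(is_multi_byte_encoding("big5"))  # expected: True (CJK multi-byte codec)
    print(is_multi_byte_encoding("latin_1"))  # expected: False (single-byte)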
def identify_sig_or_bom(sequence: bytes) -> tuple[str | None, bytes]:
"""
Identify and extract SIG/BOM in given sequence.
"""
for iana_encoding in ENCODING_MARKS:
marks: bytes | list[bytes] = ENCODING_MARKS[iana_encoding]
if isinstance(marks, bytes):
marks = [marks]
for mark in marks:
if sequence.startswith(mark):
return iana_encoding, mark
return None, b""
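# Illustrative sketch (not part of the original module): identify_sig_or_bom()
# returns the (normalized encoding name, raw mark) pair when the payload starts
# with a known BOM/signature from ENCODING_MARKS, and (None, b"") otherwise.
def _example_identify_sig_or_bom() -> None:
    encoding, mark = identify_sig_or_bom(b"\xef\xbb\xbf" + b"hello")
    print(encoding, mark)  # a UTF-8 signature is expected to be reported here
    print(identify_sig_or_bom(b"hello"))  # expected: (None, b"")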
def should_strip_sig_or_bom(iana_encoding: str) -> bool:
return iana_encoding not in {"utf_16", "utf_32"}
def iana_name(cp_name: str, strict: bool = True) -> str:
    """Returns the Python normalized encoding name (not the official IANA name)."""
cp_name = cp_name.lower().replace("-", "_")
encoding_alias: str
encoding_iana: str
for encoding_alias, encoding_iana in aliases.items():
if cp_name in [encoding_alias, encoding_iana]:
return encoding_iana
if strict:
raise ValueError(f"Unable to retrieve IANA for '{cp_name}'")
return cp_name
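# Illustrative sketch (not part of the original module): iana_name() normalizes
# a codec label to Python's canonical spelling; with strict=False an unknown
# label is returned as-is (lowercased, dashes replaced) instead of raising.
def _example_iana_name() -> None:
    print(iana_name("UTF-8"))  # expected: "utf_8"
    print(iana_name("latin-1"))  # expected: "latin_1" (resolved via aliases)
    print(iana_name("not-a-codec", strict=False))  # falls back to "not_a_codec"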
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
return 0.0
decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder
decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder
id_a: IncrementalDecoder = decoder_a(errors="ignore")
id_b: IncrementalDecoder = decoder_b(errors="ignore")
character_match_count: int = 0
for i in range(255):
to_be_decoded: bytes = bytes([i])
if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
character_match_count += 1
return character_match_count / 254
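# Illustrative sketch (not part of the original module): cp_similarity() decodes
# every byte 0..254 with both codecs and reports the fraction of identical
# results; any multi-byte codec short-circuits to 0.0. Values in the comments
# are expectations mirrored from the test suite, not guarantees.
def _example_cp_similarity() -> None:
    print(cp_similarity("latin_1", "cp1252"))  # close single-byte cousins (>= 0.8)
    print(cp_similarity("latin_1", "cp1251"))  # Latin vs Cyrillic page (< 0.8)
    print(cp_similarity("utf_8", "latin_1"))  # multi-byte involved -> 0.0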
def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
"""
    Determine if two code pages are at least 80% similar. The IANA_SUPPORTED_SIMILAR dict was generated using
    the cp_similarity function.
"""
return (
iana_name_a in IANA_SUPPORTED_SIMILAR
and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
)
def set_logging_handler(
name: str = "charset_normalizer",
level: int = logging.INFO,
format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
logger = logging.getLogger(name)
logger.setLevel(level)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(format_string))
logger.addHandler(handler)
def cut_sequence_chunks(
sequences: bytes,
encoding_iana: str,
offsets: range,
chunk_size: int,
bom_or_sig_available: bool,
strip_sig_or_bom: bool,
sig_payload: bytes,
is_multi_byte_decoder: bool,
decoded_payload: str | None = None,
) -> Generator[str, None, None]:
if decoded_payload and is_multi_byte_decoder is False:
for i in offsets:
chunk = decoded_payload[i : i + chunk_size]
if not chunk:
break
yield chunk
else:
for i in offsets:
chunk_end = i + chunk_size
if chunk_end > len(sequences) + 8:
continue
cut_sequence = sequences[i : i + chunk_size]
if bom_or_sig_available and strip_sig_or_bom is False:
cut_sequence = sig_payload + cut_sequence
chunk = cut_sequence.decode(
encoding_iana,
errors="ignore" if is_multi_byte_decoder else "strict",
)
# multi-byte bad cutting detector and adjustment
# not the cleanest way to perform that fix but clever enough for now.
if is_multi_byte_decoder and i > 0:
chunk_partial_size_chk: int = min(chunk_size, 16)
if (
decoded_payload
and chunk[:chunk_partial_size_chk] not in decoded_payload
):
for j in range(i, i - 4, -1):
cut_sequence = sequences[j:chunk_end]
if bom_or_sig_available and strip_sig_or_bom is False:
cut_sequence = sig_payload + cut_sequence
chunk = cut_sequence.decode(encoding_iana, errors="ignore")
if chunk[:chunk_partial_size_chk] in decoded_payload:
break
yield chunk
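# Illustrative sketch (not part of the original module): shows the call shape of
# cut_sequence_chunks(), which yields decoded windows of `chunk_size` bytes taken
# at each offset, re-attaching the BOM/SIG when one was found but not stripped.
# Every parameter value below is an assumption chosen only for illustration.
def _example_cut_sequence_chunks() -> list[str]:
    payload = "我没有埋怨,磋砣的只是一些时间。".encode("utf_8")
    return list(
        cut_sequence_chunks(
            payload,
            encoding_iana="utf_8",
            offsets=range(0, len(payload), 16),
            chunk_size=16,
            bom_or_sig_available=False,
            strip_sig_or_bom=False,
            sig_payload=b"",
            is_multi_byte_decoder=True,
            decoded_payload=payload.decode("utf_8"),
        )
    )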
python-charset-normalizer-3.4.2/src/charset_normalizer/version.py 0000664 0000000 0000000 00000000163 15005045421 0025365 0 ustar 00root root 0000000 0000000 """
Expose version
"""
from __future__ import annotations
__version__ = "3.4.2"
VERSION = __version__.split(".")
python-charset-normalizer-3.4.2/tests/ 0000775 0000000 0000000 00000000000 15005045421 0020006 5 ustar 00root root 0000000 0000000 python-charset-normalizer-3.4.2/tests/__init__.py 0000664 0000000 0000000 00000000000 15005045421 0022105 0 ustar 00root root 0000000 0000000 python-charset-normalizer-3.4.2/tests/test_base_detection.py 0000664 0000000 0000000 00000012747 15005045421 0024402 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import pytest
from charset_normalizer.api import from_bytes
from charset_normalizer.models import CharsetMatches
def test_empty():
best_guess = from_bytes(b"").best()
assert best_guess is not None, "Empty bytes payload SHOULD NOT return None"
assert (
best_guess.encoding == "utf_8"
), "Empty bytes payload SHOULD be guessed as UTF-8 (arbitrary)"
assert len(best_guess.alphabets) == 0, ""
def test_bool_matches():
guesses_not_empty = from_bytes(b"")
guesses_empty = CharsetMatches([])
assert (
bool(guesses_not_empty) is True
), "Bool behaviour of CharsetMatches altered, should be True"
assert (
bool(guesses_empty) is False
), "Bool behaviour of CharsetMatches altered, should be False"
@pytest.mark.parametrize(
"payload, expected_encoding",
[
(b"\xfe\xff", "utf_16"),
("\uFEFF".encode("gb18030"), "gb18030"),
(b"\xef\xbb\xbf", "utf_8"),
("".encode("utf_32"), "utf_32"),
],
)
def test_empty_but_with_bom_or_sig(payload, expected_encoding):
best_guess = from_bytes(payload).best()
assert best_guess is not None, "Empty detection but with SIG/BOM has failed!"
assert (
best_guess.encoding == expected_encoding
), "Empty detection but with SIG/BOM is wrongly detected!"
assert (
best_guess.raw == payload
), "The RAW property should contain the original payload given for detection."
assert best_guess.byte_order_mark is True, "The BOM/SIG property should return True"
assert str(best_guess) == "", "The cast to str SHOULD be empty"
@pytest.mark.parametrize(
"payload, expected_encoding",
[
(
("\uFEFF" + "我没有埋怨,磋砣的只是一些时间。").encode("gb18030"),
"gb18030",
),
(
"我没有埋怨,磋砣的只是一些时间。".encode("utf_32"),
"utf_32",
),
(
"我没有埋怨,磋砣的只是一些时间。".encode("utf_8_sig"),
"utf_8",
),
],
)
def test_content_with_bom_or_sig(payload, expected_encoding):
best_guess = from_bytes(payload).best()
assert best_guess is not None, "Detection but with SIG/BOM has failed!"
assert (
best_guess.encoding == expected_encoding
), "Detection but with SIG/BOM is wrongly detected!"
assert best_guess.byte_order_mark is True, "The BOM/SIG property should return True"
@pytest.mark.parametrize(
"payload",
[
b"AbAdZ pOoooOlDl mmlDoDkA lldDkeEkddA mpAlkDF",
b"g4UsPJdfzNkGW2jwmKDGDilKGKYtpF2X.mx3MaTWL1tL7CNn5U7DeCcodKX7S3lwwJPKNjBT8etY",
b'{"token": "g4UsPJdfzNkGW2jwmKDGDilKGKYtpF2X.mx3MaTWL1tL7CNn5U7DeCcodKX7S3lwwJPKNjBT8etY"}',
b"81f4ab054b39cb0e12701e734077d84264308f5fc79494fc5f159fa2ebc07b73c8cc0e98e009664a20986706f90146e8eefcb929ce1f74a8eab21369fdc70198",
b"{}",
],
)
def test_obviously_ascii_content(payload):
best_guess = from_bytes(payload).best()
assert best_guess is not None, "Dead-simple ASCII detection has failed!"
assert (
best_guess.encoding == "ascii"
), "Dead-simple ASCII detection is wrongly detected!"
@pytest.mark.parametrize(
"payload",
[
"\u020d\x1b".encode(),
"h\xe9llo world!\n".encode(),
"我没有埋怨,磋砣的只是一些时间。".encode(),
"Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно, поне що се отнася до началното и основното образование.".encode(),
"Bсеки човек има право на образование.".encode(),
"(° ͜ʖ °), creepy face, smiley 😀".encode(),
"""["Financiën", "La France"]""".encode(),
"Qu'est ce que une étoile?".encode(),
"""Financiën""".encode(),
"😀".encode(),
],
)
def test_obviously_utf8_content(payload):
best_guess = from_bytes(payload).best()
assert best_guess is not None, "Dead-simple UTF-8 detection has failed!"
assert (
best_guess.encoding == "utf_8"
), "Dead-simple UTF-8 detection is wrongly detected!"
def test_mb_cutting_chk():
    # This payload should be wrongfully split and the autofix should run automatically
    # on chunk extraction.
payload = (
b"\xbf\xaa\xbb\xe7\xc0\xfb \xbf\xb9\xbc\xf6 "
b" \xbf\xac\xb1\xb8\xc0\xda\xb5\xe9\xc0\xba \xba\xb9\xc0\xbd\xbc\xad\xb3\xaa "
* 128
)
guesses = from_bytes(payload, cp_isolation=["cp949"])
best_guess = guesses.best()
assert len(guesses) == 1, "cp isolation is set and given seq should be clear CP949!"
assert best_guess.encoding == "cp949"
def test_alphabets_property():
best_guess = from_bytes("😀 Hello World! How affairs are going? 😀".encode()).best()
assert "Basic Latin" in best_guess.alphabets
assert "Emoticons range(Emoji)" in best_guess.alphabets
assert best_guess.alphabets.count("Basic Latin") == 1
def test_doc_example_short_cp1251():
best_guess = from_bytes(
"Bсеки човек има право на образование.".encode("cp1251")
).best()
assert best_guess.encoding == "cp1251"
def test_direct_cmp_charset_match():
best_guess = from_bytes("😀 Hello World! How affairs are going? 😀".encode()).best()
assert best_guess == "utf_8"
assert best_guess == "utf-8"
assert best_guess != 8
assert best_guess != None
python-charset-normalizer-3.4.2/tests/test_cli.py 0000664 0000000 0000000 00000006611 15005045421 0022172 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import unittest
from os import pardir, path, remove
from os.path import exists
from unittest.mock import patch
from charset_normalizer.cli import cli_detect, query_yes_no
DIR_PATH = path.join(path.dirname(path.realpath(__file__)), pardir)
class TestCommandLineInterface(unittest.TestCase):
@patch("builtins.input", lambda *args: "y")
def test_simple_yes_input(self):
self.assertTrue(query_yes_no("Are u willing to chill a little bit ?"))
@patch("builtins.input", lambda *args: "N")
def test_simple_no_input(self):
self.assertFalse(query_yes_no("Are u willing to chill a little bit ?"))
def test_single_file(self):
self.assertEqual(0, cli_detect([DIR_PATH + "/data/sample-arabic-1.txt"]))
def test_version_output_success(self):
with self.assertRaises(SystemExit):
cli_detect(["--version"])
def test_single_file_normalize(self):
self.assertEqual(
0, cli_detect([DIR_PATH + "/data/sample-arabic-1.txt", "--normalize"])
)
self.assertTrue(exists(DIR_PATH + "/data/sample-arabic-1.cp1256.txt"))
try:
remove(DIR_PATH + "/data/sample-arabic-1.cp1256.txt")
except:
pass
def test_single_verbose_file(self):
self.assertEqual(
0, cli_detect([DIR_PATH + "/data/sample-arabic-1.txt", "--verbose"])
)
def test_multiple_file(self):
self.assertEqual(
0,
cli_detect(
[
DIR_PATH + "/data/sample-arabic-1.txt",
DIR_PATH + "/data/sample-french.txt",
DIR_PATH + "/data/sample-chinese.txt",
]
),
)
def test_with_alternative(self):
self.assertEqual(
0,
cli_detect(
[
"-a",
DIR_PATH + "/data/sample-arabic-1.txt",
DIR_PATH + "/data/sample-french.txt",
DIR_PATH + "/data/sample-chinese.txt",
]
),
)
def test_with_minimal_output(self):
self.assertEqual(
0,
cli_detect(
[
"-m",
DIR_PATH + "/data/sample-arabic-1.txt",
DIR_PATH + "/data/sample-french.txt",
DIR_PATH + "/data/sample-chinese.txt",
]
),
)
def test_with_minimal_and_alt(self):
self.assertEqual(
0,
cli_detect(
[
"-m",
"-a",
DIR_PATH + "/data/sample-arabic-1.txt",
DIR_PATH + "/data/sample-french.txt",
DIR_PATH + "/data/sample-chinese.txt",
]
),
)
def test_non_existent_file(self):
with self.assertRaises(SystemExit) as cm:
cli_detect([DIR_PATH + "/data/not_found_data.txt"])
self.assertEqual(cm.exception.code, 2)
def test_replace_without_normalize(self):
self.assertEqual(
cli_detect([DIR_PATH + "/data/sample-arabic-1.txt", "--replace"]), 1
)
def test_force_replace_without_replace(self):
self.assertEqual(
cli_detect([DIR_PATH + "/data/sample-arabic-1.txt", "--force"]), 1
)
if __name__ == "__main__":
unittest.main()
python-charset-normalizer-3.4.2/tests/test_coherence_detection.py 0000664 0000000 0000000 00000005265 15005045421 0025420 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import pytest
from charset_normalizer.cd import (
encoding_languages,
filter_alt_coherence_matches,
get_target_features,
is_multi_byte_encoding,
mb_encoding_languages,
)
@pytest.mark.parametrize(
"iana_encoding, expected_languages",
[
("cp864", ["Arabic", "Farsi"]),
("cp862", ["Hebrew"]),
("cp737", ["Greek"]),
("cp424", ["Hebrew"]),
("cp273", ["Latin Based"]),
("johab", ["Korean"]),
("shift_jis", ["Japanese"]),
("mac_greek", ["Greek"]),
("iso2022_jp", ["Japanese"]),
],
)
def test_infer_language_from_cp(iana_encoding, expected_languages):
languages = (
mb_encoding_languages(iana_encoding)
if is_multi_byte_encoding(iana_encoding)
else encoding_languages(iana_encoding)
)
for expected_language in expected_languages:
assert (
expected_language in languages
), "Wrongly detected language for given code page"
@pytest.mark.parametrize(
"language, expected_have_accents, expected_pure_latin",
[
("English", False, True),
("French", True, True),
("Hebrew", False, False),
("Arabic", False, False),
("Vietnamese", True, True),
("Turkish", True, True),
],
)
def test_target_features(language, expected_have_accents, expected_pure_latin):
target_have_accents, target_pure_latin = get_target_features(language)
assert target_have_accents is expected_have_accents
assert target_pure_latin is expected_pure_latin
@pytest.mark.parametrize(
"matches, expected_return",
[
(
[
(
"English",
0.88,
),
("English—", 0.99),
],
[("English", 0.99)],
),
(
[
(
"English",
0.88,
),
("English—", 0.99),
("English——", 0.999),
],
[("English", 0.999)],
),
(
[
(
"English",
0.88,
),
("English—", 0.77),
],
[("English", 0.88)],
),
(
[
(
"English",
0.88,
),
("Italian", 0.77),
],
[("English", 0.88), ("Italian", 0.77)],
),
],
)
def test_filter_alt_coherence_matches(matches, expected_return):
results = filter_alt_coherence_matches(matches)
assert results == expected_return
python-charset-normalizer-3.4.2/tests/test_detect_legacy.py 0000664 0000000 0000000 00000002773 15005045421 0024224 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import unittest
from charset_normalizer.legacy import detect
class TestDetectLegacy(unittest.TestCase):
def test_detect_dict_keys(self):
r = detect(("\uFEFF" + "我没有埋怨,磋砣的只是一些时间。").encode("gb18030"))
with self.subTest("encoding key present"):
self.assertIn("encoding", r.keys())
with self.subTest("language key present"):
self.assertIn("language", r.keys())
with self.subTest("confidence key present"):
self.assertIn("confidence", r.keys())
def test_detect_dict_value_type(self):
r = detect("我没有埋怨,磋砣的只是一些时间。".encode())
with self.subTest("encoding instance of str"):
self.assertIsInstance(r["encoding"], str)
with self.subTest("language instance of str"):
self.assertIsInstance(r["language"], str)
with self.subTest("confidence instance of float"):
self.assertIsInstance(r["confidence"], float)
def test_detect_dict_value(self):
r = detect("我没有埋怨,磋砣的只是一些时间。".encode("utf_32"))
with self.subTest("encoding is equal to utf_32"):
self.assertEqual(r["encoding"], "UTF-32")
def test_utf8_sig_not_striped(self):
r = detect("Hello World".encode("utf-8-sig"))
with self.subTest("Verify that UTF-8-SIG is returned when using legacy detect"):
self.assertEqual(r["encoding"], "UTF-8-SIG")
python-charset-normalizer-3.4.2/tests/test_edge_case.py 0000664 0000000 0000000 00000003154 15005045421 0023321 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import platform
import pytest
from charset_normalizer import from_bytes
@pytest.mark.xfail(
platform.python_version_tuple()[0] == "3"
and platform.python_version_tuple()[1] == "7",
reason="Unicode database is too old for this case (Python 3.7)",
)
def test_unicode_edge_case():
payload = b"\xef\xbb\xbf\xf0\x9f\xa9\xb3"
best_guess = from_bytes(payload).best()
assert (
best_guess is not None
), "Payload should have given something, detection failure"
assert best_guess.encoding == "utf_8", "UTF-8 payload wrongly detected"
def test_issue_gh520():
    """Verify that a minority of rare characters does not cause Basic Latin characters to be stripped!"""
payload = b"/includes/webform.compon\xd2\xaants.inc/"
best_guess = from_bytes(payload).best()
assert (
best_guess is not None
), "Payload should have given something, detection failure"
assert "Basic Latin" in best_guess.alphabets
def test_issue_gh509():
    """Two common ASCII punctuation characters should render as-is."""
payload = b");"
best_guess = from_bytes(payload).best()
assert (
best_guess is not None
), "Payload should have given something, detection failure"
assert "ascii" == best_guess.encoding
def test_issue_gh498():
"""This case was mistaken for utf-16-le, this should never happen again."""
payload = b"\x84\xae\xaa\xe3\xac\xa5\xad\xe2 Microsoft Word.docx"
best_guess = from_bytes(payload).best()
assert (
best_guess is not None
), "Payload should have given something, detection failure"
assert "Cyrillic" in best_guess.alphabets
python-charset-normalizer-3.4.2/tests/test_full_detection.py 0000664 0000000 0000000 00000003432 15005045421 0024421 0 ustar 00root root 0000000 0000000 from __future__ import annotations
from os import pardir, path
import pytest
from charset_normalizer.api import from_path
DIR_PATH = path.join(path.dirname(path.realpath(__file__)), pardir)
@pytest.mark.parametrize(
"input_data_file, expected_charset, expected_language",
[
("sample-arabic-1.txt", "cp1256", "Arabic"),
("sample-french-1.txt", "cp1252", "French"),
("sample-arabic.txt", "utf_8", "Arabic"),
("sample-russian-3.txt", "utf_8", "Russian"),
("sample-french.txt", "utf_8", "French"),
("sample-chinese.txt", "big5", "Chinese"),
("sample-greek.txt", "cp1253", "Greek"),
("sample-greek-2.txt", "cp1253", "Greek"),
("sample-hebrew-2.txt", "cp1255", "Hebrew"),
("sample-hebrew-3.txt", "cp1255", "Hebrew"),
("sample-bulgarian.txt", "utf_8", "Bulgarian"),
("sample-english.bom.txt", "utf_8", "English"),
("sample-spanish.txt", "utf_8", "Spanish"),
("sample-korean.txt", "cp949", "Korean"),
("sample-turkish.txt", "cp1254", "Turkish"),
("sample-russian-2.txt", "utf_8", "Russian"),
("sample-russian.txt", "mac_cyrillic", "Russian"),
("sample-polish.txt", "utf_8", "Polish"),
],
)
def test_elementary_detection(
input_data_file: str,
expected_charset: str,
expected_language: str,
):
best_guess = from_path(DIR_PATH + f"/data/{input_data_file}").best()
assert (
best_guess is not None
), f"Elementary detection has failed upon '{input_data_file}'"
assert (
best_guess.encoding == expected_charset
), f"Elementary charset detection has failed upon '{input_data_file}'"
assert (
best_guess.language == expected_language
), f"Elementary language detection has failed upon '{input_data_file}'"
python-charset-normalizer-3.4.2/tests/test_isbinary.py 0000664 0000000 0000000 00000001340 15005045421 0023235 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import typing
from base64 import b64decode
from io import BytesIO
from os import pardir, path
import pytest
from charset_normalizer import is_binary
DIR_PATH = path.join(path.dirname(path.realpath(__file__)), pardir)
@pytest.mark.parametrize(
"raw, expected",
[
(b"\x00\x5f\x2f\xff" * 50, True),
(b64decode("R0lGODlhAQABAAAAACw="), True),
(BytesIO(b64decode("R0lGODlhAQABAAAAACw=")), True),
("sample-polish.txt", False),
("sample-arabic.txt", False),
],
)
def test_isbinary(raw: bytes | typing.BinaryIO | str, expected: bool) -> None:
if isinstance(raw, str):
raw = DIR_PATH + f"/data/{raw}"
assert is_binary(raw) is expected
python-charset-normalizer-3.4.2/tests/test_large_payload.py 0000664 0000000 0000000 00000003637 15005045421 0024233 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import pytest
from charset_normalizer import from_bytes
from charset_normalizer.constant import TOO_BIG_SEQUENCE
def test_large_payload_u8_sig_basic_entry():
payload = ("0" * TOO_BIG_SEQUENCE).encode("utf_8_sig")
best_guess = from_bytes(payload).best()
assert best_guess is not None, "Large U8 payload case detection completely failed"
assert (
best_guess.encoding == "utf_8"
), "Large U8 payload case detection wrongly detected!"
assert best_guess.bom is True, "SIG/BOM property should be True"
assert len(best_guess.raw) == len(
payload
), "Large payload should remain untouched when accessed through .raw"
assert (
best_guess._string is not None
), "str should be decoded before direct access (sig available)"
def test_large_payload_ascii_basic_entry():
payload = ("0" * TOO_BIG_SEQUENCE).encode("utf_8")
best_guess = from_bytes(payload).best()
assert (
best_guess is not None
), "Large ASCII payload case detection completely failed"
assert (
best_guess.encoding == "ascii"
), "Large ASCII payload case detection wrongly detected!"
assert best_guess.bom is False, "SIG/BOM property should be False"
assert len(best_guess.raw) == len(
payload
), "Large payload should remain untouched when accessed through .raw"
assert best_guess._string is None, "str should not be decoded until direct access"
def test_misleading_large_sequence():
content = (
("hello simple ascii " * TOO_BIG_SEQUENCE) + ("我没有埋怨,磋砣的只是一些时间。 磋砣的只是一些时间。")
).encode("utf_8")
guesses = from_bytes(content)
assert len(guesses) > 0
match = guesses.best()
assert match is not None
assert match._string is not None, "str should be cached as only match"
assert match.encoding == "utf_8"
assert str(match) is not None
python-charset-normalizer-3.4.2/tests/test_logging.py 0000664 0000000 0000000 00000004244 15005045421 0023051 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import logging
import pytest
from charset_normalizer.api import explain_handler, from_bytes
from charset_normalizer.constant import TRACE
from charset_normalizer.utils import set_logging_handler
class TestLogBehaviorClass:
def setup_method(self):
self.logger = logging.getLogger("charset_normalizer")
self.logger.handlers.clear()
self.logger.addHandler(logging.NullHandler())
self.logger.level = logging.WARNING
def test_explain_true_behavior(self, caplog):
test_sequence = b"This is a test sequence of bytes that should be sufficient"
from_bytes(test_sequence, steps=1, chunk_size=50, explain=True)
assert explain_handler not in self.logger.handlers
for record in caplog.records:
assert record.levelname in ["Level 5", "DEBUG"]
def test_explain_false_handler_set_behavior(self, caplog):
test_sequence = b"This is a test sequence of bytes that should be sufficient"
set_logging_handler(level=TRACE, format_string="%(message)s")
from_bytes(test_sequence, steps=1, chunk_size=50, explain=False)
assert any(
isinstance(hdl, logging.StreamHandler) for hdl in self.logger.handlers
)
for record in caplog.records:
assert record.levelname in ["Level 5", "DEBUG"]
assert "Encoding detection: ascii is most likely the one." in caplog.text
def test_set_stream_handler(self, caplog):
set_logging_handler("charset_normalizer", level=logging.DEBUG)
self.logger.debug("log content should log with default format")
for record in caplog.records:
assert record.levelname in ["Level 5", "DEBUG"]
assert "log content should log with default format" in caplog.text
def test_set_stream_handler_format(self, caplog):
set_logging_handler("charset_normalizer", format_string="%(message)s")
self.logger.info("log content should only be this message")
assert caplog.record_tuples == [
(
"charset_normalizer",
logging.INFO,
"log content should only be this message",
)
]
python-charset-normalizer-3.4.2/tests/test_mess_detection.py 0000664 0000000 0000000 00000004544 15005045421 0024433 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import pytest
from charset_normalizer.md import mess_ratio
@pytest.mark.parametrize(
"content, min_expected_ratio, max_expected_ratio",
[
(
"典肇乎庚辰年十二月廿一,及己丑年二月十九,收各方語言二百五十,合逾七百萬目;二十大卷佔八成,單英文卷亦過二百萬。悉文乃天下有志共筆而成;有意助之,幾網路、隨纂作,大典茁焉。",
0.0,
0.0,
),
("العقلية , التنويم المغناطيسي و / أو الاقتراح", 0.0, 0.0),
("RadoZ تـــعــــديــل الـــتــــوقــيــــت مـــن قــبــل", 0.0, 0.0),
("Cehennemin Sava■þ²s²'da kim?", 0.1, 0.5),
("´Á¥½³ø§i -- ±i®Ìºû, ³¯·Ø©v", 0.5, 1.0),
(
"ïstanbul, T■rkiye'nin en kalabal»k, iktisadi ve k■lt■rel aÓ»dan en —nemli",
0.1,
0.5,
),
(
"Parce que Óa, c'est la vÕritable histoire de la rencontre avec votre Tante Robin.",
0.01,
0.5,
),
(
"""ØĢØŠØģاØĶŲ ŲŲ ØĢŲ Ø§ŲŲØ§Øģ ŲŲŲ
Ų
ا ØģŲŲبШ쨧ØĶŲŲŲØ ØŊØđŲØ§ ŲØģŲ
Øđ ØđŲ (ŲØąŲØŊŲ) ŲØ§ŲØŪا؊Ų
""",
0.8,
3.0,
),
("""ÇáÚŞáíÉ , ÇáÊäæíã ÇáãÛäÇØíÓí æ / Ãæ ÇáÇŞÊÑÇÍ""", 0.8, 2.5),
(
"""hishamkoc@yahoo.com ุชุฑุฌู
ููุฉ ููุดูููุงู
ุงููููููููุงูRadoZ ุชูููุนููููุฏูููู ุงููููุชูููููููููููููุช ู
ูููู ูููุจููู""",
0.5,
2.0,
),
],
)
def test_mess_detection(content, min_expected_ratio, max_expected_ratio):
calculated_mess_ratio = mess_ratio(content, maximum_threshold=1.0)
assert (
min_expected_ratio <= calculated_mess_ratio <= max_expected_ratio
), "The mess detection ratio calculated for given content is not well adjusted!"
python-charset-normalizer-3.4.2/tests/test_preemptive_detection.py 0000664 0000000 0000000 00000005737 15005045421 0025651 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import pytest
from charset_normalizer import CharsetMatch
from charset_normalizer.utils import any_specified_encoding
@pytest.mark.parametrize(
"payload, expected_encoding",
[
(b'', "euc_jp"),
(b'', "utf_8"),
(b'', None),
(b"# coding: utf-8", "utf_8"),
(b'', "utf_8"),
(b'', "ascii"),
(b'', "johab"),
(b'', "cp037"),
(b"", "cp1252"),
(b'', "cp1256"),
],
)
def test_detect_most_common_body_encoding(payload, expected_encoding):
specified_encoding = any_specified_encoding(payload)
assert (
specified_encoding == expected_encoding
), "Unable to determine properly encoding from given body"
@pytest.mark.parametrize(
"payload, expected_outcome",
[
(
b'',
b'',
),
(
b'',
b'',
),
(
b'',
b'',
),
(b"# coding: utf-8", b"# coding: utf-8"),
(
b'',
b'',
),
(
b'',
b'',
),
(
b'',
b'',
),
(
b"",
b"",
),
(
b'',
b'',
),
],
)
def test_preemptive_mark_replacement(payload, expected_outcome):
"""
    When generating (Unicode-converted) bytes, we want to change any potential declarative charset
    to utf-8. This tests that.
"""
specified_encoding = any_specified_encoding(payload)
detected_encoding = (
specified_encoding if specified_encoding is not None else "utf-8"
)
m = CharsetMatch(
payload,
detected_encoding,
0.0,
False,
[],
preemptive_declaration=specified_encoding,
)
transformed_output = m.output()
assert transformed_output == expected_outcome
python-charset-normalizer-3.4.2/tests/test_utils.py 0000664 0000000 0000000 00000002447 15005045421 0022566 0 ustar 00root root 0000000 0000000 from __future__ import annotations
import logging
import pytest
from charset_normalizer.utils import cp_similarity, is_accentuated, set_logging_handler
@pytest.mark.parametrize(
"character, expected_is_accentuated",
[
("é", True),
("è", True),
("à", True),
("À", True),
("Ù", True),
("ç", True),
("a", False),
("€", False),
("&", False),
("Ö", True),
("ü", True),
("ê", True),
("Ñ", True),
("Ý", True),
("Ω", False),
("ø", False),
("Ё", False),
],
)
def test_is_accentuated(character, expected_is_accentuated):
assert (
is_accentuated(character) is expected_is_accentuated
), "is_accentuated behavior incomplete"
@pytest.mark.parametrize(
"cp_name_a, cp_name_b, expected_is_similar",
[
("cp1026", "cp1140", True),
("cp1140", "cp1026", True),
("latin_1", "cp1252", True),
("latin_1", "iso8859_4", True),
("latin_1", "cp1251", False),
("cp1251", "mac_turkish", False),
],
)
def test_cp_similarity(cp_name_a, cp_name_b, expected_is_similar):
is_similar = cp_similarity(cp_name_a, cp_name_b) >= 0.8
assert is_similar is expected_is_similar, "cp_similarity is broken"