normality-2.5.0/.bumpversion.cfg
[bumpversion]
current_version = 2.5.0
tag_name = {new_version}
commit = True
tag = True

[bumpversion:file:normality/__init__.py]
search = __version__ = "{current_version}"
replace = __version__ = "{new_version}"

normality-2.5.0/.github/dependabot.yml
version: 2
updates:
  - package-ecosystem: pip
    directory: "/"
    schedule:
      interval: daily
      time: "04:00"
    open-pull-requests-limit: 100

normality-2.5.0/.github/workflows/build.yml
name: build

on: [push]

jobs:
  python:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v1
      - name: Show ref
        run: |
          echo "$GITHUB_REF"
      - name: Set up Python
        uses: actions/setup-python@v1
        with:
          python-version: '3.x'
      - name: Install dependencies
        env:
          DEBIAN_FRONTEND: noninteractive
        run: |
          sudo apt-get update -y -qq
          sudo apt-get install -y -qq libicu-dev
          pip install --upgrade pip setuptools setuptools_scm wheel build twine pyicu
          pip install -e ".[dev]"
      - name: Validate mypy typing
        run: |
          make typecheck
      - name: Run unit tests
        run: |
          make test
      - name: Build a distribution
        run: |
          python -m build -nwsx
      - name: Publish a Python distribution to PyPI
        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
        uses: pypa/gh-action-pypi-publish@master
        with:
          user: __token__
          password: ${{ secrets.pypi_password }}

normality-2.5.0/.gitignore
*.egg-info
*.pyc
dist/*
build/*
.vscode/*
.pytest_cache
.mypy_cache/*

normality-2.5.0/LICENSE
Copyright (c) 2013-2022, Friedrich Lindenberg, Gregor Aisch

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
normality-2.5.0/MANIFEST.in
include LICENSE
include README.md
include normality/py.typed

normality-2.5.0/Makefile
all: clean test

install:
	pip install -q '.[dev]'

check: test typecheck

test:
	pytest

typecheck:
	mypy --strict normality

clean:
	rm -rf dist build .eggs .mypy_cache .pytest_cache
	find . -name '*.egg-info' -exec rm -fr {} +
	find . -name '*.egg' -exec rm -f {} +
	find . -name '*.pyc' -exec rm -f {} +
	find . -name '*.pyo' -exec rm -f {} +

normality-2.5.0/README.md
# normality text cleanup

[![build](https://github.com/pudo/normality/actions/workflows/build.yml/badge.svg)](https://github.com/pudo/normality/actions/workflows/build.yml)

Normality is a Python micro-package that contains a small set of text
normalization functions for easier re-use. These functions accept a snippet
of unicode or utf-8 encoded text and remove various classes of characters,
such as diacritics, punctuation etc. This is useful as a preparation to
further text analysis.

**WARNING**: This library works much better when used in combination with
``pyicu``, a Python binding for the International Components for Unicode C
library. ICU provides much better text transliteration than the default
``text-unidecode``.

## Example

```python
# coding: utf-8
from normality import normalize, slugify, collapse_spaces

text = normalize('Nie wieder "Grüne Süppchen" kochen!')
assert text == 'nie wieder grune suppchen kochen'

slug = slugify('My first blog post!')
assert slug == 'my-first-blog-post'

text = 'this \n\n\r\nhas\tlots of \nodd spacing.'
assert collapse_spaces(text) == 'this has lots of odd spacing.'
```

## License

``normality`` is open source, licensed under a standard MIT license
(included in this repository as ``LICENSE``).
""" from typing import Any, Optional from normality.cleaning import collapse_spaces, category_replace from normality.constants import UNICODE_CATEGORIES, WS from normality.transliteration import latinize_text, ascii_text from normality.encoding import guess_encoding, guess_file_encoding from normality.encoding import predict_encoding, predict_file_encoding from normality.encoding import DEFAULT_ENCODING from normality.stringify import stringify from normality.paths import safe_filename from normality.slugify import slugify from normality.util import Categories, Encoding __version__ = "2.5.0" __all__ = [ "collapse_spaces", "category_replace", "safe_filename", "normalize", "stringify", "slugify", "guess_encoding", "guess_file_encoding", "predict_encoding", "predict_file_encoding", "latinize_text", "ascii_text", "WS", "UNICODE_CATEGORIES", ] def normalize( value: Any, lowercase: bool = True, collapse: bool = True, latinize: bool = False, ascii: bool = False, encoding_default: Encoding = DEFAULT_ENCODING, encoding: Optional[str] = None, replace_categories: Categories = UNICODE_CATEGORIES, ) -> Optional[str]: """The main normalization function for text. This will take a string and apply a set of transformations to it so that it can be processed more easily afterwards. Arguments: * ``lowercase``: not very mysterious. * ``collapse``: replace multiple whitespace-like characters with a single whitespace. This is especially useful with category replacement which can lead to a lot of whitespace. * ``decompose``: apply a unicode normalization (NFKD) to separate simple characters and their diacritics. * ``replace_categories``: This will perform a replacement of whole classes of unicode characters (e.g. symbols, marks, numbers) with a given character. It is used to replace any non-text elements of the input string. """ text = stringify(value, encoding_default=encoding_default, encoding=encoding) if text is None: return None if lowercase: # Yeah I made a Python package for this. text = text.lower() if ascii: # A stricter form of transliteration that leaves only ASCII # characters. text = ascii_text(text) elif latinize: # Perform unicode-based transliteration, e.g. of cyricllic # or CJK scripts into latin. text = latinize_text(text) # Perform unicode category-based character replacement. This is # used to filter out whole classes of characters, such as symbols, # punctuation, or whitespace-like characters. if replace_categories is not None: text = category_replace(text, replace_categories) if collapse: # Remove consecutive whitespace. text = collapse_spaces(text) return text normality-2.5.0/normality/cleaning.py000066400000000000000000000054061451033575700177230ustar00rootroot00000000000000import re import unicodedata from typing import Any, Optional from normality.constants import UNICODE_CATEGORIES, CONTROL_CODES, WS from normality.util import Categories, is_text COLLAPSE_RE = re.compile(r"\s+", re.U) BOM_RE = re.compile("^\ufeff", re.U) UNSAFE_RE = re.compile(r"^\ufeff|[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f\x80-\x9f]|\u2028") QUOTES_RE = re.compile(r'^["\'](.*)["\']$') def decompose_nfkd(text: Any) -> Optional[str]: """Perform unicode compatibility decomposition. This will replace some non-standard value representations in unicode and normalise them, while also separating characters and their diacritics into two separate codepoints. 
""" if not is_text(text): return None return unicodedata.normalize("NFKD", text) def compose_nfc(text: Any) -> Optional[str]: """Perform unicode composition.""" if not is_text(text): return None return unicodedata.normalize("NFC", text) def compose_nfkc(text: Any) -> Optional[str]: """Perform unicode composition.""" if not is_text(text): return None return unicodedata.normalize("NFKC", text) def strip_quotes(text: Any) -> Optional[str]: """Remove double or single quotes surrounding a string.""" if not is_text(text): return None return QUOTES_RE.sub("\\1", text) def category_replace( text: Any, replacements: Categories = UNICODE_CATEGORIES ) -> Optional[str]: """Remove characters from a string based on unicode classes. This is a method for removing non-text characters (such as punctuation, whitespace, marks and diacritics) from a piece of text by class, rather than specifying them individually. """ text = decompose_nfkd(text) if not is_text(text): return None characters = [] for character in text: cat = unicodedata.category(character) replacement = replacements.get(cat, character) if replacement is not None: characters.append(replacement) return "".join(characters) def remove_control_chars(text: Any) -> Optional[str]: """Remove just the control codes from a piece of text.""" return category_replace(text, replacements=CONTROL_CODES) def remove_unsafe_chars(text: Any) -> Optional[str]: """Remove unsafe unicode characters from a piece of text.""" if not is_text(text): return None return UNSAFE_RE.sub("", text) def remove_byte_order_mark(text: Any) -> Optional[str]: """Remove a BOM from the beginning of the text.""" if not is_text(text): return None return BOM_RE.sub("", text) def collapse_spaces(text: Any) -> Optional[str]: """Remove newlines, tabs and multiple spaces with single spaces.""" if not is_text(text): return None return COLLAPSE_RE.sub(WS, text).strip(WS) normality-2.5.0/normality/constants.py000066400000000000000000000032531451033575700201550ustar00rootroot00000000000000from normality.util import Categories # https://en.wikipedia.org/wiki/Cyrillic_script_in_Unicode # Cyrillic: U+0400–U+04FF, 256 characters # Cyrillic Supplement: U+0500–U+052F, 48 characters # Cyrillic Extended-A: U+2DE0–U+2DFF, 32 characters # Cyrillic Extended-B: U+A640–U+A69F, 96 characters # Cyrillic Extended-C: U+1C80–U+1C8F, 9 characters # Phonetic Extensions: U+1D2B, U+1D78, 2 Cyrillic characters # Combining Half Marks: U+FE2E–U+FE2F, 2 Cyrillic characters WS: str = " " # Unicode character classes, see: # http://www.fileformat.info/info/unicode/category/index.htm # https://en.wikipedia.org/wiki/Unicode_character_property # http://www.unicode.org/charts/beta/script/ UNICODE_CATEGORIES: Categories = { "Cc": WS, "Cf": None, "Cs": None, "Co": None, "Cn": None, "Lm": None, "Mn": None, "Mc": WS, "Me": None, "No": None, "Zs": WS, "Zl": WS, "Zp": WS, "Pc": WS, # TODO: figure out if this wants to be None "Pd": WS, "Ps": WS, "Pe": WS, "Pi": WS, "Pf": WS, "Po": WS, "Sm": WS, "Sc": None, "Sk": None, "So": WS, "Zs": WS, "Zl": WS, "Zp": WS, } SLUG_CATEGORIES: Categories = { "Cc": None, "Cf": None, "Cs": None, "Co": None, "Cn": None, # "Lm": None, # "Mn": None, "Mc": WS, "Me": None, "No": None, "Zs": WS, "Zl": WS, "Zp": WS, "Pc": WS, "Pd": WS, "Ps": WS, "Pe": WS, "Pi": WS, "Pf": WS, "Po": WS, "Sm": WS, "Sc": None, "Sk": None, "So": WS, "Zs": WS, "Zl": WS, "Zp": WS, } CONTROL_CODES: Categories = {"Cc": WS, "Cf": WS, "Cs": WS, "Co": WS, "Cn": WS, "Zl": WS} 
normality-2.5.0/normality/encoding.py000066400000000000000000000072551451033575700177350ustar00rootroot00000000000000import codecs import chardet import warnings from charset_normalizer import from_bytes, CharsetMatches from typing import Any, BinaryIO, TYPE_CHECKING from normality.util import Encoding if TYPE_CHECKING: from charset_normalizer import CharsetMatches DEFAULT_ENCODING = "utf-8" def normalize_encoding(encoding: str, default: Encoding = DEFAULT_ENCODING) -> str: """Normalize the encoding name, replace ASCII w/ UTF-8.""" warnings.warn( "normalize_encoding is now deprecated. Use tidy_encoding instead", DeprecationWarning, ) return tidy_encoding(encoding, default) def tidy_encoding(encoding: str, default: Encoding = DEFAULT_ENCODING) -> str: """Normalize the encoding name, replace ASCII w/ UTF-8.""" if encoding is None: return default encoding = encoding.strip() if encoding.lower() in ["", "ascii"]: return default try: codec = codecs.lookup(encoding) return codec.name except LookupError: return default def normalize_result( result: Any, default: Encoding, threshold: float = 0.2 ) -> Encoding: """Interpret a chardet result.""" warnings.warn( "normalize_result is now deprecated. Use tidy_result instead", DeprecationWarning, ) if result is None: return default confidence: float = result.get("confidence") if confidence is None: return default if float(confidence) < threshold: return default encoding: Encoding = result.get("encoding") if encoding is None: return default return normalize_encoding(encoding, default=default) def tidy_result(result: CharsetMatches, default: Encoding) -> Encoding: """Interpret a chardet result.""" res = result.best() if res is None: return default encoding: Encoding = res.encoding if encoding is None: return default return tidy_encoding(encoding, default=default) def guess_encoding(text: bytes, default: Encoding = DEFAULT_ENCODING) -> Encoding: """Guess string encoding. Given a piece of text, apply character encoding detection to guess the appropriate encoding of the text. """ warnings.warn( "guess_encoding is now deprecated. Use predict_encoding instead", DeprecationWarning, ) return predict_encoding(text, default=default) def predict_encoding(text: bytes, default: Encoding = DEFAULT_ENCODING) -> Encoding: """Guess string encoding. Given a piece of text, apply character encoding detection to guess the appropriate encoding of the text. """ result = from_bytes(text, explain=False) return tidy_result(result, default=default) def guess_file_encoding(fh: BinaryIO, default: Encoding = DEFAULT_ENCODING) -> Encoding: """Guess encoding from a file handle.""" warnings.warn( "guess_encoding is now deprecated. 
Use predict_encoding instead", DeprecationWarning, ) start = fh.tell() detector = chardet.UniversalDetector() while True: data = fh.read(1024 * 10) if not data: detector.close() break detector.feed(data) if detector.done: break fh.seek(start) return normalize_result(detector.result, default=default) def predict_file_encoding( fh: BinaryIO, default: Encoding = DEFAULT_ENCODING ) -> Encoding: """Guess encoding from a file handle.""" start = fh.tell() result: CharsetMatches = CharsetMatches() while True: data = fh.read(1024 * 10) if not data: break result = from_bytes(data, explain=False) if result: break fh.seek(start) return tidy_result(result, default=default) normality-2.5.0/normality/paths.py000066400000000000000000000031021451033575700172510ustar00rootroot00000000000000import os from typing import Optional from banal import decode_path from normality.stringify import stringify from normality.cleaning import collapse_spaces, category_replace from normality.constants import UNICODE_CATEGORIES, WS from normality.transliteration import ascii_text MAX_LENGTH = 254 def _safe_name(file_name: Optional[str], sep: str) -> Optional[str]: """Convert the file name to ASCII and normalize the string.""" file_name = stringify(file_name) if file_name is None: return None file_name = ascii_text(file_name) file_name = category_replace(file_name, UNICODE_CATEGORIES) file_name = collapse_spaces(file_name) if file_name is None or not len(file_name): return None return file_name.replace(WS, sep) def safe_filename( file_name: Optional[str], sep: str = "_", default: Optional[str] = None, extension: Optional[str] = None, ) -> Optional[str]: """Create a secure filename for plain file system storage.""" if file_name is None: return decode_path(default) file_name = decode_path(file_name) if file_name is None: return None file_name = os.path.basename(file_name) file_name, _extension = os.path.splitext(file_name) file_name = _safe_name(file_name, sep=sep) if file_name is None: return decode_path(default) file_name = file_name[:MAX_LENGTH] extension = _safe_name(extension or _extension, sep=sep) if extension is not None: file_name = ".".join((file_name, extension)) file_name = file_name[:MAX_LENGTH] return file_name normality-2.5.0/normality/py.typed000066400000000000000000000000001451033575700172510ustar00rootroot00000000000000normality-2.5.0/normality/scripts.py000066400000000000000000000511141451033575700176270ustar00rootroot00000000000000from typing import Tuple from functools import lru_cache ALPHABET = 1 LATIN = 2 CYRILLIC = 3 GREEK = 4 ARABIC = 5 CJK = 6 HANGUL = 7 ABJAD = 99 ABUGIDA = 100 SYLLABARY = 101 HISTORIC = 99999 FUNKY = 100000 UNKNOWN = 0 # Source: https://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt UNICODE_BLOCKS: Tuple[Tuple[int, int, str, Tuple[int, ...]]] = ( # type: ignore ( 0x0000, 0x007F, "Basic Latin", ( ALPHABET, LATIN, ), ), ( 0x0080, 0x00FF, "Latin-1 Supplement", ( ALPHABET, LATIN, ), ), ( 0x0100, 0x017F, "Latin Extended-A", ( ALPHABET, LATIN, ), ), ( 0x0180, 0x024F, "Latin Extended-B", ( ALPHABET, LATIN, ), ), (0x0250, 0x02AF, "IPA Extensions", ()), (0x02B0, 0x02FF, "Spacing Modifier Letters", ()), (0x0300, 0x036F, "Combining Diacritical Marks", ()), ( 0x0370, 0x03FF, "Greek and Coptic", ( ALPHABET, GREEK, ), ), ( 0x0400, 0x04FF, "Cyrillic", ( ALPHABET, CYRILLIC, ), ), ( 0x0500, 0x052F, "Cyrillic Supplement", ( ALPHABET, CYRILLIC, ), ), (0x0530, 0x058F, "Armenian", (ALPHABET,)), (0x0590, 0x05FF, "Hebrew", (ABJAD,)), ( 0x0600, 0x06FF, "Arabic", ( ARABIC, ABJAD, ), ), (0x0700, 0x074F, 
"Syriac", (ABJAD,)), ( 0x0750, 0x077F, "Arabic Supplement", ( ARABIC, ABJAD, ), ), (0x0780, 0x07BF, "Thaana", (ABUGIDA,)), (0x07C0, 0x07FF, "NKo", (FUNKY,)), (0x0800, 0x083F, "Samaritan", (ABJAD,)), ( 0x0840, 0x085F, "Mandaic", ( ALPHABET, HISTORIC, ), ), (0x0860, 0x086F, "Syriac Supplement", (ABJAD,)), ( 0x0870, 0x089F, "Arabic Extended-B", ( ARABIC, ABJAD, ), ), ( 0x08A0, 0x08FF, "Arabic Extended-A", ( ARABIC, ABJAD, ), ), (0x0900, 0x097F, "Devanagari", (ABUGIDA,)), (0x0980, 0x09FF, "Bengali", (ABUGIDA,)), (0x0A00, 0x0A7F, "Gurmukhi", (ABUGIDA,)), (0x0A80, 0x0AFF, "Gujarati", (ABUGIDA,)), ( 0x0B00, 0x0B7F, "Oriya", ( ABUGIDA, HISTORIC, ), ), (0x0B80, 0x0BFF, "Tamil", (ABUGIDA,)), (0x0C00, 0x0C7F, "Telugu", (ABUGIDA,)), (0x0C80, 0x0CFF, "Kannada", (ABUGIDA,)), (0x0D00, 0x0D7F, "Malayalam", (ABUGIDA,)), (0x0D80, 0x0DFF, "Sinhala", (ABUGIDA,)), (0x0E00, 0x0E7F, "Thai", (ABUGIDA,)), (0x0E80, 0x0EFF, "Lao", (ABUGIDA,)), (0x0F00, 0x0FFF, "Tibetan", (ABUGIDA,)), (0x1000, 0x109F, "Myanmar", (ABUGIDA,)), (0x10A0, 0x10FF, "Georgian", (ALPHABET,)), ( 0x1100, 0x11FF, "Hangul Jamo", ( ALPHABET, HANGUL, ), ), (0x1200, 0x137F, "Ethiopic", (ABUGIDA,)), (0x1380, 0x139F, "Ethiopic Supplement", (ABUGIDA,)), (0x13A0, 0x13FF, "Cherokee", (SYLLABARY,)), (0x1400, 0x167F, "Unified Canadian Aboriginal Syllabics", (SYLLABARY,)), ( 0x1680, 0x169F, "Ogham", ( ALPHABET, HISTORIC, ), ), ( 0x16A0, 0x16FF, "Runic", ( ALPHABET, HISTORIC, ), ), (0x1700, 0x171F, "Tagalog", (ABUGIDA,)), (0x1720, 0x173F, "Hanunoo", (ABUGIDA,)), (0x1740, 0x175F, "Buhid", (ABUGIDA,)), (0x1760, 0x177F, "Tagbanwa", (ABUGIDA,)), (0x1780, 0x17FF, "Khmer", (ABUGIDA,)), ( 0x1800, 0x18AF, "Mongolian", ( ALPHABET, FUNKY, ), ), (0x18B0, 0x18FF, "Unified Canadian Aboriginal Syllabics Extended", (SYLLABARY,)), (0x1900, 0x194F, "Limbu", (ABUGIDA,)), (0x1950, 0x197F, "Tai Le", (ABUGIDA,)), (0x1980, 0x19DF, "New Tai Lue", (ABUGIDA,)), (0x19E0, 0x19FF, "Khmer Symbols", (ABUGIDA,)), (0x1A00, 0x1A1F, "Buginese", (ABUGIDA,)), (0x1A20, 0x1AAF, "Tai Tham", (ABUGIDA,)), (0x1AB0, 0x1AFF, "Combining Diacritical Marks Extended", ()), (0x1B00, 0x1B7F, "Balinese", (ABUGIDA,)), (0x1B80, 0x1BBF, "Sundanese", (ABUGIDA,)), (0x1BC0, 0x1BFF, "Batak", (ABUGIDA,)), (0x1C00, 0x1C4F, "Lepcha", (ABUGIDA,)), ( 0x1C50, 0x1C7F, "Ol Chiki", ( ALPHABET, FUNKY, ), ), ( 0x1C80, 0x1C8F, "Cyrillic Extended-C", ( ALPHABET, CYRILLIC, ), ), (0x1C90, 0x1CBF, "Georgian Extended", (ALPHABET,)), (0x1CC0, 0x1CCF, "Sundanese Supplement", (ABUGIDA,)), (0x1CD0, 0x1CFF, "Vedic Extensions", ()), (0x1D00, 0x1D7F, "Phonetic Extensions", ()), (0x1D80, 0x1DBF, "Phonetic Extensions Supplement", ()), (0x1DC0, 0x1DFF, "Combining Diacritical Marks Supplement", ()), ( 0x1E00, 0x1EFF, "Latin Extended Additional", ( ALPHABET, LATIN, ), ), ( 0x1F00, 0x1FFF, "Greek Extended", ( ALPHABET, GREEK, ), ), (0x2000, 0x206F, "General Punctuation", ()), (0x2070, 0x209F, "Superscripts and Subscripts", ()), (0x20A0, 0x20CF, "Currency Symbols", ()), (0x20D0, 0x20FF, "Combining Diacritical Marks for Symbols", ()), (0x2100, 0x214F, "Letterlike Symbols", ()), (0x2150, 0x218F, "Number Forms", ()), (0x2190, 0x21FF, "Arrows", ()), (0x2200, 0x22FF, "Mathematical Operators", ()), (0x2300, 0x23FF, "Miscellaneous Technical", ()), (0x2400, 0x243F, "Control Pictures", ()), (0x2440, 0x245F, "Optical Character Recognition", ()), (0x2460, 0x24FF, "Enclosed Alphanumerics", ()), (0x2500, 0x257F, "Box Drawing", ()), (0x2580, 0x259F, "Block Elements", ()), (0x25A0, 0x25FF, "Geometric Shapes", ()), (0x2600, 0x26FF, "Miscellaneous Symbols", ()), 
(0x2700, 0x27BF, "Dingbats", ()), (0x27C0, 0x27EF, "Miscellaneous Mathematical Symbols-A", ()), (0x27F0, 0x27FF, "Supplemental Arrows-A", ()), (0x2800, 0x28FF, "Braille Patterns", ()), (0x2900, 0x297F, "Supplemental Arrows-B", ()), (0x2980, 0x29FF, "Miscellaneous Mathematical Symbols-B", ()), (0x2A00, 0x2AFF, "Supplemental Mathematical Operators", ()), (0x2B00, 0x2BFF, "Miscellaneous Symbols and Arrows", ()), ( 0x2C00, 0x2C5F, "Glagolitic", ( ALPHABET, HISTORIC, ), ), ( 0x2C60, 0x2C7F, "Latin Extended-C", ( ALPHABET, LATIN, ), ), ( 0x2C80, 0x2CFF, "Coptic", ( ALPHABET, HISTORIC, ), ), (0x2D00, 0x2D2F, "Georgian Supplement", (ALPHABET,)), (0x2D30, 0x2D7F, "Tifinagh", (ABJAD,)), (0x2D80, 0x2DDF, "Ethiopic Extended", (ABUGIDA,)), ( 0x2DE0, 0x2DFF, "Cyrillic Extended-A", ( ALPHABET, CYRILLIC, ), ), (0x2E00, 0x2E7F, "Supplemental Punctuation", ()), (0x2E80, 0x2EFF, "CJK Radicals Supplement", (CJK,)), (0x2F00, 0x2FDF, "Kangxi Radicals", (CJK,)), (0x2FF0, 0x2FFF, "Ideographic Description Characters", ()), (0x3000, 0x303F, "CJK Symbols and Punctuation", (CJK,)), (0x3040, 0x309F, "Hiragana", (CJK,)), (0x30A0, 0x30FF, "Katakana", (CJK,)), (0x3100, 0x312F, "Bopomofo", (CJK,)), ( 0x3130, 0x318F, "Hangul Compatibility Jamo", (HANGUL, CJK), ), (0x3190, 0x319F, "Kanbun", (CJK,)), (0x31A0, 0x31BF, "Bopomofo Extended", (CJK,)), (0x31C0, 0x31EF, "CJK Strokes", (CJK,)), (0x31F0, 0x31FF, "Katakana Phonetic Extensions", (CJK,)), (0x3200, 0x32FF, "Enclosed CJK Letters and Months", (CJK,)), (0x3300, 0x33FF, "CJK Compatibility", (CJK,)), (0x3400, 0x4DBF, "CJK Unified Ideographs Extension A", (CJK,)), (0x4DC0, 0x4DFF, "Yijing Hexagram Symbols", (CJK,)), (0x4E00, 0x9FFF, "CJK Unified Ideographs", (CJK,)), (0xA000, 0xA48F, "Yi Syllables", (SYLLABARY,)), (0xA490, 0xA4CF, "Yi Radicals", ()), (0xA4D0, 0xA4FF, "Lisu", (ABUGIDA,)), (0xA500, 0xA63F, "Vai", (SYLLABARY,)), ( 0xA640, 0xA69F, "Cyrillic Extended-B", ( ALPHABET, CYRILLIC, ), ), (0xA6A0, 0xA6FF, "Bamum", (SYLLABARY,)), (0xA700, 0xA71F, "Modifier Tone Letters", ()), ( 0xA720, 0xA7FF, "Latin Extended-D", ( ALPHABET, LATIN, ), ), (0xA800, 0xA82F, "Syloti Nagri", ()), (0xA830, 0xA83F, "Common Indic Number Forms", ()), (0xA840, 0xA87F, "Phags-pa", ()), (0xA880, 0xA8DF, "Saurashtra", ()), (0xA8E0, 0xA8FF, "Devanagari Extended", (ABUGIDA,)), (0xA900, 0xA92F, "Kayah Li", ()), (0xA930, 0xA95F, "Rejang", ()), (0xA960, 0xA97F, "Hangul Jamo Extended-A", (HANGUL,)), (0xA980, 0xA9DF, "Javanese", (ABUGIDA,)), (0xA9E0, 0xA9FF, "Myanmar Extended-B", (ABUGIDA,)), (0xAA00, 0xAA5F, "Cham", (ABUGIDA,)), (0xAA60, 0xAA7F, "Myanmar Extended-A", (ABUGIDA,)), (0xAA80, 0xAADF, "Tai Viet", ()), (0xAAE0, 0xAAFF, "Meetei Mayek Extensions", ()), (0xAB00, 0xAB2F, "Ethiopic Extended-A", (ABUGIDA,)), ( 0xAB30, 0xAB6F, "Latin Extended-E", ( ALPHABET, LATIN, ), ), (0xAB70, 0xABBF, "Cherokee Supplement", ()), (0xABC0, 0xABFF, "Meetei Mayek", ()), ( 0xAC00, 0xD7AF, "Hangul Syllables", (HANGUL, SYLLABARY), ), (0xD7B0, 0xD7FF, "Hangul Jamo Extended-B", (HANGUL,)), (0xD800, 0xDB7F, "High Surrogates", ()), (0xDB80, 0xDBFF, "High Private Use Surrogates", ()), (0xDC00, 0xDFFF, "Low Surrogates", ()), (0xE000, 0xF8FF, "Private Use Area", ()), (0xF900, 0xFAFF, "CJK Compatibility Ideographs", (CJK,)), (0xFB00, 0xFB4F, "Alphabetic Presentation Forms", ()), ( 0xFB50, 0xFDFF, "Arabic Presentation Forms-A", ( ARABIC, ABJAD, ), ), (0xFE00, 0xFE0F, "Variation Selectors", ()), (0xFE10, 0xFE1F, "Vertical Forms", ()), (0xFE20, 0xFE2F, "Combining Half Marks", (CJK,)), (0xFE30, 0xFE4F, "CJK Compatibility Forms", 
(CJK,)), (0xFE50, 0xFE6F, "Small Form Variants", ()), ( 0xFE70, 0xFEFF, "Arabic Presentation Forms-B", ( ARABIC, ABJAD, ), ), (0xFF00, 0xFFEF, "Halfwidth and Fullwidth Forms", (CJK,)), (0xFFF0, 0xFFFF, "Specials", ()), (0x10000, 0x1007F, "Linear B Syllabary", (SYLLABARY,)), (0x10080, 0x100FF, "Linear B Ideograms", ()), (0x10100, 0x1013F, "Aegean Numbers", ()), ( 0x10140, 0x1018F, "Ancient Greek Numbers", ( ALPHABET, GREEK, ), ), (0x10190, 0x101CF, "Ancient Symbols", ()), (0x101D0, 0x101FF, "Phaistos Disc", ()), (0x10280, 0x1029F, "Lycian", ()), (0x102A0, 0x102DF, "Carian", ()), (0x102E0, 0x102FF, "Coptic Epact Numbers", ()), (0x10300, 0x1032F, "Old Italic", ()), (0x10330, 0x1034F, "Gothic", ()), (0x10350, 0x1037F, "Old Permic", ()), (0x10380, 0x1039F, "Ugaritic", (ABJAD,)), (0x103A0, 0x103DF, "Old Persian", (ABJAD,)), (0x10400, 0x1044F, "Deseret", ()), (0x10450, 0x1047F, "Shavian", ()), (0x10480, 0x104AF, "Osmanya", ()), (0x104B0, 0x104FF, "Osage", ()), (0x10500, 0x1052F, "Elbasan", ()), (0x10530, 0x1056F, "Caucasian Albanian", ()), (0x10570, 0x105BF, "Vithkuqi", ()), (0x10600, 0x1077F, "Linear A", ()), ( 0x10780, 0x107BF, "Latin Extended-F", ( ALPHABET, LATIN, ), ), (0x10800, 0x1083F, "Cypriot Syllabary", (SYLLABARY,)), (0x10840, 0x1085F, "Imperial Aramaic", (ABJAD,)), (0x10860, 0x1087F, "Palmyrene", ()), (0x10880, 0x108AF, "Nabataean", (ABJAD,)), (0x108E0, 0x108FF, "Hatran", ()), (0x10900, 0x1091F, "Phoenician", ()), (0x10920, 0x1093F, "Lydian", ()), (0x10980, 0x1099F, "Meroitic Hieroglyphs", ()), (0x109A0, 0x109FF, "Meroitic Cursive", ()), (0x10A00, 0x10A5F, "Kharoshthi", ()), (0x10A60, 0x10A7F, "Old South Arabian", (ABJAD,)), (0x10A80, 0x10A9F, "Old North Arabian", (ABJAD,)), (0x10AC0, 0x10AFF, "Manichaean", ()), (0x10B00, 0x10B3F, "Avestan", ()), (0x10B40, 0x10B5F, "Inscriptional Parthian", ()), (0x10B60, 0x10B7F, "Inscriptional Pahlavi", ()), (0x10B80, 0x10BAF, "Psalter Pahlavi", (ABJAD,)), (0x10C00, 0x10C4F, "Old Turkic", ()), (0x10C80, 0x10CFF, "Old Hungarian", ()), (0x10D00, 0x10D3F, "Hanifi Rohingya", ()), (0x10E60, 0x10E7F, "Rumi Numeral Symbols", ()), ( 0x10E80, 0x10EBF, "Yezidi", ( ALPHABET, FUNKY, ), ), ( 0x10EC0, 0x10EFF, "Arabic Extended-C", ( ARABIC, ABJAD, ), ), (0x10F00, 0x10F2F, "Old Sogdian", (ABJAD,)), (0x10F30, 0x10F6F, "Sogdian", (ABJAD,)), (0x10F70, 0x10FAF, "Old Uyghur", ()), (0x10FB0, 0x10FDF, "Chorasmian", ()), (0x10FE0, 0x10FFF, "Elymaic", ()), (0x11000, 0x1107F, "Brahmi", (ABUGIDA,)), (0x11080, 0x110CF, "Kaithi", ()), (0x110D0, 0x110FF, "Sora Sompeng", ()), (0x11100, 0x1114F, "Chakma", ()), (0x11150, 0x1117F, "Mahajani", ()), (0x11180, 0x111DF, "Sharada", ()), (0x111E0, 0x111FF, "Sinhala Archaic Numbers", ()), (0x11200, 0x1124F, "Khojki", ()), (0x11280, 0x112AF, "Multani", ()), (0x112B0, 0x112FF, "Khudawadi", ()), (0x11300, 0x1137F, "Grantha", ()), (0x11400, 0x1147F, "Newa", ()), (0x11480, 0x114DF, "Tirhuta", ()), (0x11580, 0x115FF, "Siddham", ()), (0x11600, 0x1165F, "Modi", ()), (0x11660, 0x1167F, "Mongolian Supplement", ()), (0x11680, 0x116CF, "Takri", ()), (0x11700, 0x1174F, "Ahom", ()), (0x11800, 0x1184F, "Dogra", ()), (0x118A0, 0x118FF, "Warang Citi", ()), (0x11900, 0x1195F, "Dives Akuru", ()), (0x119A0, 0x119FF, "Nandinagari", ()), (0x11A00, 0x11A4F, "Zanabazar Square", ()), (0x11A50, 0x11AAF, "Soyombo", ()), ( 0x11AB0, 0x11ABF, "Unified Canadian Aboriginal Syllabics Extended-A", (SYLLABARY,), ), (0x11AC0, 0x11AFF, "Pau Cin Hau", ()), (0x11B00, 0x11B5F, "Devanagari Extended-A", ()), (0x11C00, 0x11C6F, "Bhaiksuki", ()), (0x11C70, 0x11CBF, "Marchen", ()), 
(0x11D00, 0x11D5F, "Masaram Gondi", ()), (0x11D60, 0x11DAF, "Gunjala Gondi", ()), (0x11EE0, 0x11EFF, "Makasar", ()), (0x11F00, 0x11F5F, "Kawi", ()), (0x11FB0, 0x11FBF, "Lisu Supplement", ()), (0x11FC0, 0x11FFF, "Tamil Supplement", ()), (0x12000, 0x123FF, "Cuneiform", ()), (0x12400, 0x1247F, "Cuneiform Numbers and Punctuation", ()), (0x12480, 0x1254F, "Early Dynastic Cuneiform", ()), (0x12F90, 0x12FFF, "Cypro-Minoan", ()), (0x13000, 0x1342F, "Egyptian Hieroglyphs", ()), (0x13430, 0x1345F, "Egyptian Hieroglyph Format Controls", ()), (0x14400, 0x1467F, "Anatolian Hieroglyphs", ()), (0x16800, 0x16A3F, "Bamum Supplement", ()), (0x16A40, 0x16A6F, "Mro", ()), (0x16A70, 0x16ACF, "Tangsa", ()), (0x16AD0, 0x16AFF, "Bassa Vah", ()), (0x16B00, 0x16B8F, "Pahawh Hmong", ()), (0x16E40, 0x16E9F, "Medefaidrin", ()), (0x16F00, 0x16F9F, "Miao", ()), (0x16FE0, 0x16FFF, "Ideographic Symbols and Punctuation", ()), (0x17000, 0x187FF, "Tangut", ()), (0x18800, 0x18AFF, "Tangut Components", ()), (0x18B00, 0x18CFF, "Khitan Small Script", ()), (0x18D00, 0x18D7F, "Tangut Supplement", ()), (0x1AFF0, 0x1AFFF, "Kana Extended-B", ()), (0x1B000, 0x1B0FF, "Kana Supplement", ()), (0x1B100, 0x1B12F, "Kana Extended-A", ()), (0x1B130, 0x1B16F, "Small Kana Extension", ()), (0x1B170, 0x1B2FF, "Nushu", ()), (0x1BC00, 0x1BC9F, "Duployan", ()), (0x1BCA0, 0x1BCAF, "Shorthand Format Controls", ()), (0x1CF00, 0x1CFCF, "Znamenny Musical Notation", ()), (0x1D000, 0x1D0FF, "Byzantine Musical Symbols", ()), (0x1D100, 0x1D1FF, "Musical Symbols", ()), (0x1D200, 0x1D24F, "Ancient Greek Musical Notation", ()), (0x1D2C0, 0x1D2DF, "Kaktovik Numerals", ()), (0x1D2E0, 0x1D2FF, "Mayan Numerals", ()), (0x1D300, 0x1D35F, "Tai Xuan Jing Symbols", ()), (0x1D360, 0x1D37F, "Counting Rod Numerals", ()), (0x1D400, 0x1D7FF, "Mathematical Alphanumeric Symbols", ()), (0x1D800, 0x1DAAF, "Sutton SignWriting", ()), ( 0x1DF00, 0x1DFFF, "Latin Extended-G", ( ALPHABET, LATIN, ), ), (0x1E000, 0x1E02F, "Glagolitic Supplement", ()), ( 0x1E030, 0x1E08F, "Cyrillic Extended-D", ( ALPHABET, CYRILLIC, ), ), (0x1E100, 0x1E14F, "Nyiakeng Puachue Hmong", ()), (0x1E290, 0x1E2BF, "Toto", ()), (0x1E2C0, 0x1E2FF, "Wancho", ()), (0x1E4D0, 0x1E4FF, "Nag Mundari", ()), (0x1E7E0, 0x1E7FF, "Ethiopic Extended-B", (ABUGIDA,)), (0x1E800, 0x1E8DF, "Mende Kikakui", ()), (0x1E900, 0x1E95F, "Adlam", ()), (0x1EC70, 0x1ECBF, "Indic Siyaq Numbers", ()), (0x1ED00, 0x1ED4F, "Ottoman Siyaq Numbers", ()), ( 0x1EE00, 0x1EEFF, "Arabic Mathematical Alphabetic Symbols", ( ARABIC, ABJAD, ), ), (0x1F000, 0x1F02F, "Mahjong Tiles", ()), (0x1F030, 0x1F09F, "Domino Tiles", ()), (0x1F0A0, 0x1F0FF, "Playing Cards", ()), (0x1F100, 0x1F1FF, "Enclosed Alphanumeric Supplement", ()), (0x1F200, 0x1F2FF, "Enclosed Ideographic Supplement", ()), (0x1F300, 0x1F5FF, "Miscellaneous Symbols and Pictographs", ()), (0x1F600, 0x1F64F, "Emoticons", ()), (0x1F650, 0x1F67F, "Ornamental Dingbats", ()), (0x1F680, 0x1F6FF, "Transport and Map Symbols", ()), (0x1F700, 0x1F77F, "Alchemical Symbols", ()), (0x1F780, 0x1F7FF, "Geometric Shapes Extended", ()), (0x1F800, 0x1F8FF, "Supplemental Arrows-C", ()), (0x1F900, 0x1F9FF, "Supplemental Symbols and Pictographs", ()), (0x1FA00, 0x1FA6F, "Chess Symbols", ()), (0x1FA70, 0x1FAFF, "Symbols and Pictographs Extended-A", ()), (0x1FB00, 0x1FBFF, "Symbols for Legacy Computing", ()), (0x20000, 0x2A6DF, "CJK Unified Ideographs Extension B", (CJK,)), (0x2A700, 0x2B73F, "CJK Unified Ideographs Extension C", (CJK,)), (0x2B740, 0x2B81F, "CJK Unified Ideographs Extension D", (CJK,)), (0x2B820, 
0x2CEAF, "CJK Unified Ideographs Extension E", (CJK,)), (0x2CEB0, 0x2EBEF, "CJK Unified Ideographs Extension F", (CJK,)), (0x2EBF0, 0x2EE5F, "CJK Unified Ideographs Extension I", (CJK,)), (0x2F800, 0x2FA1F, "CJK Compatibility Ideographs Supplement", (CJK,)), (0x30000, 0x3134F, "CJK Unified Ideographs Extension G", (CJK,)), (0x31350, 0x323AF, "CJK Unified Ideographs Extension H", (CJK,)), (0xE0000, 0xE007F, "Tags", ()), (0xE0100, 0xE01EF, "Variation Selectors Supplement", ()), (0xF0000, 0xFFFFF, "Supplementary Private Use Area-A", ()), (0x100000, 0x10FFFF, "Supplementary Private Use Area-B", ()), ) BLOCK_TAGS = [(s, e, t) for s, e, _, t in UNICODE_BLOCKS if len(t)] @lru_cache(maxsize=5000) def char_tags(char: str) -> Tuple[int, ...]: """Get the tags applicable to a particular character.""" codepoint = ord(char) for start, end, tags in BLOCK_TAGS: if start <= codepoint <= end: return tags return () def is_modern_alphabet(word: str) -> bool: """Check if a word is written in a modern alphabet. The term alphabet is used in a narrow sense here: it includes only alphabets that have vowels and are safely transliterated to latin. Basically: Cyrillic, Greek, Armenian, and Latin.""" for char in word: tags = char_tags(char) if not len(tags): continue if ALPHABET not in tags: return False if HISTORIC in tags or FUNKY in tags: return False return True def is_latin(word: str) -> bool: """Check if a word is written in the latin alphabet.""" for char in word: tags = char_tags(char) if not len(tags): continue if LATIN not in tags: return False if HISTORIC in tags or FUNKY in tags: return False return True normality-2.5.0/normality/slugify.py000066400000000000000000000020521451033575700176170ustar00rootroot00000000000000import string from typing import Any, Optional from normality.cleaning import collapse_spaces, category_replace from normality.constants import SLUG_CATEGORIES, WS from normality.transliteration import latinize_text from normality.stringify import stringify VALID_CHARS = string.ascii_lowercase + string.digits + WS def slugify(value: Any, sep: str = "-") -> Optional[str]: """A simple slug generator. Slugs are pure ASCII lowercase strings that can be used in URLs an other places where a name has to be machine-safe.""" text = stringify(value) if text is None: return None text = text.replace(sep, WS) # run this first because it'll give better results on special # characters. text = category_replace(text, SLUG_CATEGORIES) text = latinize_text(text, ascii=True) if text is None: return None text = text.lower() text = "".join([c for c in text if c in VALID_CHARS]) text = collapse_spaces(text) if text is None or len(text) == 0: return None return text.replace(WS, sep) normality-2.5.0/normality/stringify.py000066400000000000000000000026661451033575700201660ustar00rootroot00000000000000from datetime import datetime, date from decimal import Decimal from typing import Any, Optional from normality.cleaning import remove_unsafe_chars from normality.encoding import predict_encoding from normality.encoding import DEFAULT_ENCODING def _clean_empty(value: str) -> Optional[str]: # XXX: is this really a good idea? value = value.strip() if not len(value): return None return value def stringify( value: Any, encoding_default: str = DEFAULT_ENCODING, encoding: Optional[str] = None ) -> Optional[str]: """Brute-force convert a given object to a string. This will attempt an increasingly mean set of conversions to make a given object into a unicode string. 
It is guaranteed to either return unicode or None, if all conversions failed (or the value is indeed empty). """ if value is None: return None if isinstance(value, str): return _clean_empty(value) if isinstance(value, (date, datetime)): return value.isoformat() elif isinstance(value, (float, Decimal)): return Decimal(value).to_eng_string() elif isinstance(value, bytes): if encoding is None: encoding = predict_encoding(value, default=encoding_default) value = value.decode(encoding, "replace") value = remove_unsafe_chars(value) if value is None: return None return _clean_empty(value) return _clean_empty(str(value)) normality-2.5.0/normality/transliteration.py000066400000000000000000000053661451033575700213720ustar00rootroot00000000000000""" Transliterate the given text to the latin script. This attempts to convert a given text to latin script using the closest match of characters vis a vis the original script. Transliteration requires an extensive unicode mapping. Since all Python implementations are either GPL-licensed (and thus more restrictive than this library) or come with a massive C code dependency, this module requires neither but will use a package if it is installed. """ import warnings from typing import cast, Optional, Callable from functools import lru_cache from normality.cleaning import compose_nfkc from normality.util import is_text # Transform to latin, separate accents, decompose, remove # symbols, compose, push to ASCII ASCII_SCRIPT = "Any-Latin; NFKD; [:Nonspacing Mark:] Remove; Accents-Any; [:Symbol:] Remove; [:Nonspacing Mark:] Remove; Latin-ASCII" # noqa # nb. 2021-11-05 Accents-Any is now followed with another nonspacing mark remover. # This script is becoming a bit silly, there has to be a nicer way to do this? class ICUWarning(UnicodeWarning): pass @lru_cache(maxsize=2**16) def latinize_text(text: Optional[str], ascii: bool = False) -> Optional[str]: """Transliterate the given text to the latin script. This attempts to convert a given text to latin script using the closest match of characters vis a vis the original script. """ if text is None or not is_text(text) or not len(text): return text if ascii: if not hasattr(latinize_text, "_ascii"): latinize_text._ascii = make_trans(ASCII_SCRIPT) # type: ignore return latinize_text._ascii(text) # type: ignore if not hasattr(latinize_text, "_tr"): latinize_text._tr = make_trans("Any-Latin") # type: ignore return latinize_text._tr(text) # type: ignore def ascii_text(text: Optional[str]) -> Optional[str]: """Transliterate the given text and make sure it ends up as ASCII.""" text = latinize_text(text, ascii=True) if text is None or not is_text(text): return None return text.encode("ascii", "replace").decode("ascii") def make_trans(script: str) -> Callable[[str], Optional[str]]: try: from icu import Transliterator # type: ignore inst = Transliterator.createInstance(script) return cast(Callable[[str], str], inst.transliterate) except ImportError: from text_unidecode import unidecode # type: ignore warnings.warn( "Install 'pyicu' for better text transliteration.", ICUWarning, stacklevel=4 ) # noqa def transliterate(text: str) -> Optional[str]: clean = compose_nfkc(text) if clean is None: return None return cast(Optional[str], unidecode(clean)) return transliterate normality-2.5.0/normality/util.py000066400000000000000000000003431451033575700171130ustar00rootroot00000000000000# Given the whole thing is a utility package, this is really meta. 
from typing import Any, Dict, Optional Categories = Dict[str, Optional[str]] Encoding = str def is_text(data: Any) -> bool: return isinstance(data, str) normality-2.5.0/pyproject.toml000066400000000000000000000024461451033575700164700ustar00rootroot00000000000000[build-system] requires = ["setuptools>=61.2.0", "wheel", "setuptools_scm[toml]>=3.4.3"] build-backend = "setuptools.build_meta" [project] name = "normality" authors = [{name = "Friedrich Lindenberg", email = "friedrich@pudo.org"}] license = {text = "MIT"} description = "Micro-library to normalize text strings" readme = "README.md" keywords = ["text", "unicode", "normalization", "slugs"] classifiers = [ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", ] urls = {Homepage = "http://github.com/pudo/normality"} dependencies = [ "banal >= 1.0.1", "text-unidecode", "chardet", "charset-normalizer >= 2.0.0", ] dynamic = ["version"] [project.optional-dependencies] icu = ["pyicu >= 1.9.3"] dev = [ "pyicu >= 1.9.3", "mypy", "pytest", "types-chardet", ] [tool.setuptools] zip-safe = false include-package-data = true [tool.setuptools.packages.find] exclude = ["ez_setup", "examples", "tests"] namespaces = false [tool.setuptools.package-data] banal = ["py.typed"] [tool.setuptools_scm] [tool.distutils.bdist_wheel] universal = 1 normality-2.5.0/setup.cfg000066400000000000000000000000001451033575700153550ustar00rootroot00000000000000normality-2.5.0/tests/000077500000000000000000000000001451033575700147105ustar00rootroot00000000000000normality-2.5.0/tests/__init__.py000066400000000000000000000000001451033575700170070ustar00rootroot00000000000000normality-2.5.0/tests/fixtures/000077500000000000000000000000001451033575700165615ustar00rootroot00000000000000normality-2.5.0/tests/fixtures/utf-16.txt000066400000000000000000000000701451033575700203410ustar00rootroot00000000000000>@>H5=:> 5B@> ;5:AV9>28Gnormality-2.5.0/tests/test_normality.py000066400000000000000000000072111451033575700203400ustar00rootroot00000000000000import unittest from datetime import datetime from normality import normalize, latinize_text, ascii_text from normality import ( stringify, slugify, guess_encoding, guess_file_encoding, predict_file_encoding, predict_encoding, ) class NormalityTest(unittest.TestCase): def test_empty(self): self.assertEqual(None, slugify(None)) self.assertEqual(None, ascii_text(None)) self.assertEqual(None, latinize_text(None)) self.assertEqual(None, normalize(None)) self.assertEqual(None, normalize("")) self.assertEqual(None, normalize(" ")) def test_petro(self): text = u"Порошенко Петро Олексійович" self.assertEqual("porosenko-petro-oleksijovic", slugify(text)) self.assertEqual("Porosenko Petro Oleksijovic", ascii_text(text)) self.assertEqual(u"Porošenko Petro Oleksíjovič", latinize_text(text)) self.assertEqual(u"порошенко петро олексіиович", normalize(text)) def test_ahmad(self): text = u"əhməd" self.assertEqual("ahmad", ascii_text(text)) def test_azeri(self): text = u"FUAD ALIYEV ƏHMƏD OĞLU" self.assertEqual("FUAD ALIYEV AHMAD OGLU", ascii_text(text)) def test_slugify(self): text = u"BABY! 
camel-is good" self.assertEqual("baby-camel-is-good", slugify(text, sep="-")) self.assertEqual("tests", slugify("testʼs", sep="-")) self.assertEqual("test-s", slugify("test_s", sep="-")) self.assertEqual(None, slugify("-", sep="-")) self.assertEqual(None, slugify("", sep="-")) self.assertEqual(None, slugify("- -", sep="-")) self.assertEqual(None, slugify(None, sep="-")) def test_georgian(self): text = u"ავლაბრის ფონდი" self.assertEqual("avlabris pondi", ascii_text(text)) def test_german(self): text = u"Häschen Spaß" self.assertEqual("Haschen Spass", ascii_text(text)) self.assertEqual("haschen-spass", slugify(text, sep="-")) def test_stringify(self): self.assertEqual(".", stringify(" . ")) self.assertEqual("5", stringify(5)) self.assertEqual("0.5", stringify(0.5)) def test_stringify_datetime(self): dt = datetime.utcnow() text = stringify(dt) self.assertTrue(text.startswith("%s-" % dt.year), text) def test_guess_encoding(self): text = u"Порошенко Петро Олексійович" encoded = text.encode("iso-8859-5") out = guess_encoding(encoded) self.assertEqual("iso8859-5", out) def test_predict_encoding(self): text = u"Порошенко Петро Олексійович" encoded = text.encode("iso-8859-5") out = predict_encoding(encoded) self.assertEqual("iso8859-5", out) def test_guess_file_encoding(self): with open("tests/fixtures/utf-16.txt", "rb") as fh: out = guess_file_encoding(fh) self.assertEqual("utf-16", out) def test_predict_file_encoding(self): with open("tests/fixtures/utf-16.txt", "rb") as fh: out = predict_file_encoding(fh) self.assertEqual("utf-16", out) def test_petro_iso_encoded(self): text = u"Порошенко Петро Олексійович" encoded = text.encode("iso8859-5") out = stringify(encoded) self.assertEqual(text, out) def test_petro_utf16_encoded(self): text = u"Порошенко Петро Олексійович" encoded = text.encode("utf-16") out = stringify(encoded) self.assertEqual(text, out) normality-2.5.0/tests/test_paths.py000066400000000000000000000022501451033575700174370ustar00rootroot00000000000000import unittest from normality.paths import MAX_LENGTH, safe_filename class PathsTest(unittest.TestCase): def test_safe_filename(self): self.assertEqual(None, safe_filename(None)) self.assertEqual("test.txt", safe_filename("test.txt")) self.assertEqual("test.txt", safe_filename("test .txt")) self.assertEqual("test_bla.txt", safe_filename("test bla.txt")) self.assertEqual("test_bla.txt", safe_filename("test_bla.txt")) self.assertEqual("test_bla.txt", safe_filename("test.bla.txt")) self.assertEqual("test.txt", safe_filename("test", extension="txt")) def test_long_filename(self): long_name = ["long name"] * 100 long_name = "-".join(long_name) shortened = safe_filename(long_name) assert len(shortened) <= MAX_LENGTH, shortened shortened = safe_filename(long_name, extension="html") assert len(shortened) <= MAX_LENGTH, shortened shortened = safe_filename("bla", extension=long_name) assert len(shortened) <= MAX_LENGTH, shortened shortened = safe_filename(long_name, extension=long_name) assert len(shortened) <= MAX_LENGTH, shortened normality-2.5.0/tests/test_scripts.py000066400000000000000000000012371451033575700200130ustar00rootroot00000000000000from normality.scripts import ALPHABET, CYRILLIC, CJK from normality.scripts import char_tags, is_modern_alphabet def test_char_tags(): assert ALPHABET in char_tags("a") assert CYRILLIC not in char_tags("a") assert CYRILLIC in char_tags("д") assert CJK in char_tags("近") assert ALPHABET not in char_tags("近") def test_is_modern_alphabet(): assert not is_modern_alphabet(" 习近平") assert 
is_modern_alphabet("Xí Jìnpíng") assert is_modern_alphabet("Ротенберг Аркадий") assert is_modern_alphabet(".,[]{}()!@#$%^&*()_+)«»‘“") assert not is_modern_alphabet("တပ်မတော်(ကြည်")