pax_global_header00006660000000000000000000000064141627474670014534gustar00rootroot0000000000000052 comment=f7e1db2ee1638eb2cff38f4d6658f5c352b71293 mutf8-1.0.6/000077500000000000000000000000001416274746700126035ustar00rootroot00000000000000mutf8-1.0.6/.bumpversion.cfg000066400000000000000000000001341416274746700157110ustar00rootroot00000000000000[bumpversion] current_version = 1.0.6 commit = True tag = True [bumpversion:file:setup.py] mutf8-1.0.6/.clang-format000066400000000000000000000010021416274746700151470ustar00rootroot00000000000000# Does a reasonable job of following PEP7 linting conventions. # # Yoinked from: # https://gist.github.com/pganssle/0e3a5f828b4d07d79447f6ced8e7e4db BasedOnStyle: Google AlwaysBreakAfterReturnType: All AllowShortIfStatementsOnASingleLine: false AlignAfterOpenBracket: Align BreakBeforeBraces: Stroustrup ColumnLimit: 79 DerivePointerAlignment: false IndentWidth: 4 Language: Cpp PointerAlignment: Right ReflowComments: true SpaceBeforeParens: ControlStatements SpacesInParentheses: false TabWidth: 4 UseTab: Never mutf8-1.0.6/.github/000077500000000000000000000000001416274746700141435ustar00rootroot00000000000000mutf8-1.0.6/.github/FUNDING.yml000066400000000000000000000010761416274746700157640ustar00rootroot00000000000000# These are supported funding model platforms github: TkTech patreon: # Replace with a single Patreon username open_collective: # Replace with a single Open Collective username ko_fi: # Replace with a single Ko-fi username tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry liberapay: # issuehunt: # Replace with a single IssueHunt username otechie: # Replace with a single Otechie username custom: # Replace with a single custom sponsorship URL mutf8-1.0.6/.github/workflows/000077500000000000000000000000001416274746700162005ustar00rootroot00000000000000mutf8-1.0.6/.github/workflows/release.yml000066400000000000000000000045721416274746700203530ustar00rootroot00000000000000on: push: # Sequence of patterns matched against refs/tags tags: - 'v*' # Push events to matching v*, i.e. v1.0, v20.15.10 name: Release jobs: build: name: Creating Windows and OSX wheels runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: os: [macos-latest, windows-latest] python-version: [3.6, 3.7, 3.8, 3.9] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install python dependencies run: | python -m pip install --upgrade pip python -m pip install --upgrade setuptools pip install wheel twine pip install -e '.' - name: Publishing env: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | python setup.py bdist_wheel twine upload dist/* build_manylinux2010: name: Creating manylinux releases runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - name: Build (manylinux2014) uses: RalfG/python-wheels-manylinux-build@v0.3-manylinux2014_x86_64 with: python-versions: 'cp36-cp36m cp37-cp37m cp38-cp38 cp39-cp39' - name: Set up Python uses: actions/setup-python@v1 with: python-version: 3.8 - name: Install python dependencies run: | python -m pip install --upgrade pip python -m pip install --upgrade setuptools pip install wheel twine - name: Publishing env: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | twine upload dist/*-manylinux*.whl sdist: name: Creating source release runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - name: Set up Python uses: actions/setup-python@v1 with: python-version: 3.8 - name: Install python dependencies run: | python -m pip install --upgrade pip python -m pip install --upgrade setuptools pip install wheel twine - name: Publishing env: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | python setup.py sdist twine upload dist/* mutf8-1.0.6/.github/workflows/test.yml000066400000000000000000000015721416274746700177070ustar00rootroot00000000000000name: Tests on: [push] jobs: build: name: Running tests. runs-on: ubuntu-latest strategy: fail-fast: false matrix: python-version: [3.6, 3.7, 3.8, 3.9] steps: # Python needs to be setup before checkout to prevent files from being # left in the source tree. See setup-python/issues/106. - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - uses: actions/checkout@v2 - name: Installing platform dependencies if: matrix.os == 'ubuntu-latest' run: > sudo apt-get install -y build-essential clang-6.0 - name: Installing python dependencies run: | python -m pip install --upgrade pip pip install -e '.[test]' - name: Running tests run: | pytest mutf8-1.0.6/.gitignore000066400000000000000000000001051416274746700145670ustar00rootroot00000000000000*.pyc *.pyd venv .eggs cython_debug/* build/ _build/ *.egg-info *.so mutf8-1.0.6/LICENCE000066400000000000000000000021241416274746700135670ustar00rootroot00000000000000Copyright (c) 2012-2015 Tyler Kennedy . All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. mutf8-1.0.6/README.md000066400000000000000000000052701416274746700140660ustar00rootroot00000000000000![Tests](https://github.com/TkTech/mutf8/workflows/Tests/badge.svg?branch=master) # mutf-8 This package contains simple pure-python as well as C encoders and decoders for the MUTF-8 character encoding. In most cases, you can also parse the even-rarer CESU-8. These days, you'll most likely encounter MUTF-8 when working on files or protocols related to the JVM. Strings in a Java `.class` file are encoded using MUTF-8, strings passed by the JNI, as well as strings exported by the object serializer. This library was extracted from [Lawu][], a Python library for working with JVM class files. ## 🎉 Installation Install the package from PyPi: ``` pip install mutf8 ``` Binary wheels are available for the following: | | py3.6 | py3.7 | py3.8 | py3.9 | | ---------------- | ----- | ----- | ----- | ----- | | OS X (x86_64) | y | y | y | y | | Windows (x86_64) | y | y | y | y | | Linux (x86_64) | y | y | y | y | If binary wheels are not available, it will attempt to build the C extension from source with any C99 compiler. If it could not build, it will fall back to a pure-python version. ## Usage Encoding and decoding is simple: ```python from mutf8 import encode_modified_utf8, decode_modified_utf8 unicode = decode_modified_utf8(byte_like_object) bytes = encode_modified_utf8(unicode) ``` This module *does not* register itself globally as a codec, since importing should be side-effect-free. ## 📈 Benchmarks The C extension is significantly faster - often 20x to 40x faster. ### MUTF-8 Decoding | Name | Min (μs) | Max (μs) | StdDev | Ops | |------------------------------|------------|------------|----------|---------------| | cmutf8-decode_modified_utf8 | 0.00009 | 0.00080 | 0.00000 | 9957678.56358 | | pymutf8-decode_modified_utf8 | 0.00190 | 0.06040 | 0.00000 | 450455.96019 | ### MUTF-8 Encoding | Name | Min (μs) | Max (μs) | StdDev | Ops | |------------------------------|------------|------------|----------|----------------| | cmutf8-encode_modified_utf8 | 0.00008 | 0.00151 | 0.00000 | 11897361.05101 | | pymutf8-encode_modified_utf8 | 0.00180 | 0.16650 | 0.00000 | 474390.98091 | ## C Extension The C extension is optional. If a binary package is not available, or a C compiler is not present, the pure-python version will be used instead. If you want to ensure you're using the C version, import it directly: ```python from mutf8.cmutf8 import decode_modified_utf8 decode_modified_utf(b'\xED\xA1\x80\xED\xB0\x80') ``` [Lawu]: https://github.com/tktech/lawu mutf8-1.0.6/mutf8/000077500000000000000000000000001416274746700136465ustar00rootroot00000000000000mutf8-1.0.6/mutf8/__init__.py000066400000000000000000000011421416274746700157550ustar00rootroot00000000000000""" Utility methods for handling oddities in character encoding encountered when parsing and writing JVM ClassFiles or object serialization archives. MUTF-8 is the same as CESU-8, but with different encoding for 0x00 bytes. .. note:: http://bugs.python.org/issue2857 was an attempt in 2008 to get support for MUTF-8/CESU-8 into the python core. """ try: from mutf8.cmutf8 import decode_modified_utf8, encode_modified_utf8 except ImportError: from mutf8.mutf8 import decode_modified_utf8, encode_modified_utf8 # Shut up linters. ALL_IMPORTS = [decode_modified_utf8, encode_modified_utf8] mutf8-1.0.6/mutf8/cmutf8.c000066400000000000000000000200661416274746700152240ustar00rootroot00000000000000#define PY_SSIZE_T_CLEAN #include #include PyDoc_STRVAR(decode_doc, "Decodes a bytestring containing MUTF-8 as defined in section\n" "4.4.7 of the JVM specification.\n\n" ":param s: A byte/buffer-like to be converted.\n" ":returns: A unicode representation of the original string."); static PyObject * decode_modified_utf8(PyObject *self, PyObject *args) { #define return_err(_msg) \ do { \ PyObject *exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, \ "sy#nns", "mutf-8", view.buf, \ view.len, ix, ix + 1, _msg); \ if (exc != NULL) { \ PyCodec_StrictErrors(exc); \ Py_DECREF(exc); \ } \ PyMem_Free(cp_out); \ PyBuffer_Release(&view); \ return NULL; \ } while (0) Py_buffer view; if (!PyArg_ParseTuple(args, "y*", &view)) { return NULL; } // MUTF-8 input. uint8_t *buf = (uint8_t *)view.buf; // Array of temporary UCS-4 codepoints. // There's no point using PyUnicode_new and _WriteChar, because // it requires us to have iterated the string to get the maximum unicode // codepoint and count anyways. Py_UCS4 *cp_out = PyMem_Calloc(view.len, sizeof(Py_UCS4)); if (!cp_out) { return PyErr_NoMemory(); } // # of codepoints we found & current index into cp_out. Py_ssize_t cp_count = 0; for (Py_ssize_t ix = 0; ix < view.len; ix++) { Py_UCS4 x = buf[ix]; if (x == 0) { return_err("Embedded NULL byte in input."); } else if (x < 0x80) { // ASCII/one-byte codepoint. x &= 0x7F; } else if ((x & 0xE0) == 0xC0) { // Two-byte codepoint. if (ix + 1 >= view.len) { return_err( "2-byte codepoint started, but input too short" " to finish."); } x = ((x & 0x1F) << 0x06 | (buf[ix + 1] & 0x3F)); ix++; } else if ((x & 0xF0) == 0xE0) { // Three-byte codepoint. if (ix + 2 >= view.len) { return_err( "3-byte or 6-byte codepoint started, but input too short" " to finish."); } uint8_t b2 = buf[ix + 1]; uint8_t b3 = buf[ix + 2]; if (x == 0xED && (b2 & 0xF0) == 0xA0) { if (ix + 5 >= view.len) { return_err( "6-byte codepoint started, but input too short" " to finish."); } // Possible six-byte codepoint. uint8_t b4 = buf[ix + 3]; uint8_t b5 = buf[ix + 4]; uint8_t b6 = buf[ix + 5]; if (b4 == 0xED && (b5 & 0xF0) == 0xB0) { // Definite six-byte codepoint. x = ( 0x10000 | (b2 & 0x0F) << 0x10 | (b3 & 0x3F) << 0x0A | (b5 & 0x0F) << 0x06 | (b6 & 0x3F) ); ix += 5; cp_out[cp_count++] = x; continue; } } x = ( (x & 0x0F) << 0x0C | (b2 & 0x3F) << 0x06 | (b3 & 0x3F) ); ix += 2; } cp_out[cp_count++] = x; } PyObject *out = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, cp_out, cp_count); PyMem_Free(cp_out); PyBuffer_Release(&view); return out; #undef return_err } inline Py_ssize_t _encoded_size(void *data, Py_ssize_t length, int kind) { Py_ssize_t byte_count = 0; for (Py_ssize_t i = 0; i < length; i++) { Py_UCS4 cp = PyUnicode_READ(kind, data, i); if (cp == 0x00) { // NULLs will get encoded as C0 80. byte_count += 2; } else if (cp <= 0x7F) { byte_count++; } else if (cp <= 0x7FF) { byte_count += 2; } else if (cp <= 0xFFFF) { byte_count += 3; } else { byte_count += 6; } } return byte_count; } PyDoc_STRVAR(encoded_size_doc, "Returns the number of bytes required to store the given\n" "unicode string when encoded as MUTF-8.\n\n" ":param u: Unicode string to be converted.\n" ":returns: The number of bytes required."); static PyObject * encoded_size(PyObject *self, PyObject *args) { PyObject *src = NULL; if (!PyArg_ParseTuple(args, "U", &src)) { return NULL; } return PyLong_FromSsize_t( _encoded_size( PyUnicode_DATA(src), PyUnicode_GET_LENGTH(src), PyUnicode_KIND(src) ) ); } PyDoc_STRVAR(encode_doc, "Encodes a unicode string as MUTF-8 as defined in section\n" "4.4.7 of the JVM specification.\n\n" ":param u: Unicode string to be converted.\n" ":returns: The encoded string as a `bytes` object."); static PyObject * encode_modified_utf8(PyObject *self, PyObject *args) { PyObject *src = NULL; if (!PyArg_ParseTuple(args, "U", &src)) { return NULL; } void *data = PyUnicode_DATA(src); Py_ssize_t length = PyUnicode_GET_LENGTH(src); int kind = PyUnicode_KIND(src); char *byte_out = PyMem_Calloc(_encoded_size(data, length, kind), 1); if (!byte_out) { return PyErr_NoMemory(); } Py_ssize_t byte_count = 0; for (Py_ssize_t i = 0; i < length; i++) { Py_UCS4 cp = PyUnicode_READ(kind, data, i); if (cp == 0x00) { // NULL byte encoding shortcircuit. byte_out[byte_count++] = 0xC0; byte_out[byte_count++] = 0x80; } else if (cp <= 0x7F) { // ASCII byte_out[byte_count++] = cp; } else if (cp <= 0x7FF) { // Two-byte codepoint. byte_out[byte_count++] = (0xC0 | (0x1F & (cp >> 0x06))); byte_out[byte_count++] = (0x80 | (0x3F & cp)); } else if (cp <= 0xFFFF) { // Three-byte codepoint byte_out[byte_count++] = (0xE0 | (0x0F & (cp >> 0x0C))); byte_out[byte_count++] = (0x80 | (0x3F & (cp >> 0x06))); byte_out[byte_count++] = (0x80 | (0x3F & cp)); } else { // "Two-times-three" byte codepoint. byte_out[byte_count++] = 0xED; byte_out[byte_count++] = 0xA0 | ((cp >> 0x10) & 0x0F); byte_out[byte_count++] = 0x80 | ((cp >> 0x0A) & 0x3F); byte_out[byte_count++] = 0xED; byte_out[byte_count++] = 0xB0 | ((cp >> 0x06) & 0x0F); byte_out[byte_count++] = 0x80 | (cp & 0x3F); } } PyObject *out = PyBytes_FromStringAndSize(byte_out, byte_count); PyMem_Free(byte_out); return out; } static PyMethodDef module_methods[] = { {"decode_modified_utf8", decode_modified_utf8, METH_VARARGS, decode_doc}, {"encode_modified_utf8", encode_modified_utf8, METH_VARARGS, encode_doc}, {"encoded_size", encoded_size, METH_VARARGS, encoded_size_doc}, {NULL, NULL, 0, NULL}}; static struct PyModuleDef cmutf8_module = { PyModuleDef_HEAD_INIT, "mutf8.cmutf8", PyDoc_STR("Encoders and decoders for the MUTF-8 encoding."), -1, module_methods, }; PyMODINIT_FUNC PyInit_cmutf8(void) { PyObject *m; m = PyModule_Create(&cmutf8_module); if (m == NULL) return NULL; return m; } mutf8-1.0.6/mutf8/mutf8.py000066400000000000000000000104621416274746700152660ustar00rootroot00000000000000def decode_modified_utf8(s: bytes) -> str: """ Decodes a bytestring containing modified UTF-8 as defined in section 4.4.7 of the JVM specification. :param s: bytestring to be converted. :returns: A unicode representation of the original string. """ s_out = [] s_len = len(s) s_ix = 0 while s_ix < s_len: b1 = s[s_ix] s_ix += 1 if b1 == 0: raise UnicodeDecodeError( 'mutf-8', s, s_ix - 1, s_ix, 'Embedded NULL byte in input.' ) if b1 < 0x80: # ASCII/one-byte codepoint. s_out.append(chr(b1)) elif (b1 & 0xE0) == 0xC0: # Two-byte codepoint. if s_ix >= s_len: raise UnicodeDecodeError( 'mutf-8', s, s_ix - 1, s_ix, '2-byte codepoint started, but input too short to' ' finish.' ) s_out.append( chr( (b1 & 0x1F) << 0x06 | (s[s_ix] & 0x3F) ) ) s_ix += 1 elif (b1 & 0xF0) == 0xE0: # Three-byte codepoint. if s_ix + 1 >= s_len: raise UnicodeDecodeError( 'mutf-8', s, s_ix - 1, s_ix, '3-byte or 6-byte codepoint started, but input too' ' short to finish.' ) b2 = s[s_ix] b3 = s[s_ix + 1] if b1 == 0xED and (b2 & 0xF0) == 0xA0: # Possible six-byte codepoint. if s_ix + 4 >= s_len: raise UnicodeDecodeError( 'mutf-8', s, s_ix - 1, s_ix, '3-byte or 6-byte codepoint started, but input too' ' short to finish.' ) b4 = s[s_ix + 2] b5 = s[s_ix + 3] b6 = s[s_ix + 4] if b4 == 0xED and (b5 & 0xF0) == 0xB0: # Definite six-byte codepoint. s_out.append( chr( 0x10000 | (b2 & 0x0F) << 0x10 | (b3 & 0x3F) << 0x0A | (b5 & 0x0F) << 0x06 | (b6 & 0x3F) ) ) s_ix += 5 continue s_out.append( chr( (b1 & 0x0F) << 0x0C | (b2 & 0x3F) << 0x06 | (b3 & 0x3F) ) ) s_ix += 2 else: raise RuntimeError return u''.join(s_out) def encode_modified_utf8(u: str) -> bytes: """ Encodes a unicode string as modified UTF-8 as defined in section 4.4.7 of the JVM specification. :param u: unicode string to be converted. :returns: A decoded bytearray. """ final_string = bytearray() for c in (ord(char) for char in u): if c == 0x00: # NULL byte encoding shortcircuit. final_string.extend([0xC0, 0x80]) elif c <= 0x7F: # ASCII final_string.append(c) elif c <= 0x7FF: # Two-byte codepoint. final_string.extend([ (0xC0 | (0x1F & (c >> 0x06))), (0x80 | (0x3F & c)) ]) elif c <= 0xFFFF: # Three-byte codepoint. final_string.extend([ (0xE0 | (0x0F & (c >> 0x0C))), (0x80 | (0x3F & (c >> 0x06))), (0x80 | (0x3F & c)) ]) else: # Six-byte codepoint. final_string.extend([ 0xED, 0xA0 | ((c >> 0x10) & 0x0F), 0x80 | ((c >> 0x0A) & 0x3f), 0xED, 0xb0 | ((c >> 0x06) & 0x0f), 0x80 | (c & 0x3f) ]) return bytes(final_string) mutf8-1.0.6/setup.py000066400000000000000000000021051416274746700143130ustar00rootroot00000000000000import os import os.path from setuptools import setup, find_packages, Extension root = os.path.abspath(os.path.dirname(__file__)) with open(os.path.join(root, 'README.md'), 'rb') as readme: long_description = readme.read().decode('utf-8') setup( name='mutf8', packages=find_packages(), version='1.0.6', description='Fast MUTF-8 encoder & decoder', long_description=long_description, long_description_content_type='text/markdown', author='Tyler Kennedy', author_email='tk@tkte.ch', url='http://github.com/TkTech/mutf8', keywords=['mutf-8', 'cesu-8', 'jvm'], classifiers=[ 'Programming Language :: Python :: 3', 'License :: OSI Approved :: MIT License', 'Operating System :: OS Independent', 'Intended Audience :: Developers', ], extras_require={ 'test': [ 'pytest', 'pytest-benchmark' ] }, ext_modules=[ Extension( 'mutf8.cmutf8', ['mutf8/cmutf8.c'], language='c', optional=True ) ] ) mutf8-1.0.6/tests/000077500000000000000000000000001416274746700137455ustar00rootroot00000000000000mutf8-1.0.6/tests/conftest.py000066400000000000000000000016331416274746700161470ustar00rootroot00000000000000import pytest import mutf8.mutf8 as pymutf8 import mutf8.cmutf8 as cmutf8 def pytest_addoption(parser): parser.addoption( '--runslow', action='store_true', default=False, help='run slow tests' ) def pytest_configure(config): config.addinivalue_line('markers', 'slow: mark test as slow to run') def pytest_collection_modifyitems(config, items): if config.getoption('--runslow'): # --runslow given in cli: do not skip slow tests return skip_slow = pytest.mark.skip(reason='need --runslow option to run') for item in items: if 'slow' in item.keywords: item.add_marker(skip_slow) @pytest.fixture(params=[pymutf8, cmutf8]) def module(request): yield request.param @pytest.fixture() def decoder(module): yield module.decode_modified_utf8 @pytest.fixture() def encoder(module): yield module.encode_modified_utf8 mutf8-1.0.6/tests/test_benchmark.py000066400000000000000000000015341416274746700173130ustar00rootroot00000000000000import pytest import mutf8.mutf8 as pymutf8 import mutf8.cmutf8 as cmutf8 @pytest.mark.parametrize('group,decoder', ( ('pymutf8', pymutf8.decode_modified_utf8), ('cmutf8', cmutf8.decode_modified_utf8) )) @pytest.mark.slow def test_decode(group, decoder, benchmark): """Compare the performance of the python and C decoders.""" benchmark.group = 'MUTF-8 Decoding' benchmark.extra_info['group'] = group benchmark(decoder, b'\xED\xA0\xBD\xED\xB8\x88') @pytest.mark.parametrize('group,encoder', ( ('pymutf8', pymutf8.encode_modified_utf8), ('cmutf8', cmutf8.encode_modified_utf8) )) @pytest.mark.slow def test_encode(group, encoder, benchmark): """Compare the performance of the python and C encoders.""" benchmark.group = 'MUTF-8 Encoding' benchmark.extra_info['group'] = group benchmark(encoder, '\U0001F608') mutf8-1.0.6/tests/test_bugs.py000066400000000000000000000022321416274746700163150ustar00rootroot00000000000000def test_issue_1(encoder, decoder): """ Ensure we do not regress on https://github.com/TkTech/mutf8/issues/1. Two issues found here: 1. Python-based decoder could hit an infinite loop, since it didn't *always* increment s_ix on an iteration. 2. C & Python decoders would incorrectly decode the `sample` below, because the logic for surrogate pair decoding made the incorrect assumption that we could short-circuit when b1 == 0xED. """ # b'[\xea\xb0\x80 \xeb\x82\x98 \xeb\x8b\xa4 \xeb\x9d\xbc \xeb\xa7\x88 # \xeb\xb0\x94 \xec\x82\xac \xec\x95\x84\xec\x9e\x90 \xec\xb0\xa8 # \xec\xb9\xb4 \xed\x83\x80 \xed\x8c\x8c \xed\x95\x98]' sample = ( u'[\uAC00 \uB098 \uB2E4 \uB77C \uB9C8 \uBC14 \uC0AC \uC544' u'\uC790 \uCC28 \uCE74 \uD0C0 \uD30C \uD558]' ) encoded = encoder(sample) decoded = decoder(encoded) assert sample == decoded def test_issue_3(encoder, decoder): """ Underallocation due to an incorrect assumption on the maximum expansion of an encoded string. """ str = '黑人抬棺組裝包' assert decoder(encoder(str)) == str mutf8-1.0.6/tests/test_modified_utf8.py000066400000000000000000000137521416274746700201140ustar00rootroot00000000000000import pytest def _pairs(*args): for a, b in args: yield chr(a), bytearray(b) def test_decode_bad_mutf8(decoder): """Ensure we do the right thing when we encounter invalid MUTF-8.""" # There should never be a null byte in a MUTF-8 string. It's the # entire point of using MUTF-8. with pytest.raises(UnicodeDecodeError) as excinfo: decoder(b'\x00') assert excinfo.value.encoding == 'mutf-8' assert 'Embedded NULL' in excinfo.value.reason # Start of a two-byte codepoint without the sibling. with pytest.raises(UnicodeDecodeError) as excinfo: decoder(b'\xC2') assert excinfo.value.encoding == 'mutf-8' assert '2-byte' in excinfo.value.reason # Start of a six-byte codepoint without the sibling. with pytest.raises(UnicodeDecodeError) as excinfo: decoder(b'\xED') assert excinfo.value.encoding == 'mutf-8' assert '6-byte' in excinfo.value.reason # Start of a three-byte codepoint without the sibling. with pytest.raises(UnicodeDecodeError) as excinfo: decoder(b'\xE2') assert excinfo.value.encoding == 'mutf-8' assert '3-byte' in excinfo.value.reason def test_two_byte(module): """ Test two-byte encoding and decoding. Test data taken from py2jdbc. """ pairs = _pairs( (0x0080, (0xc2, 0x80)), (0x0081, (0xc2, 0x81)), (0x0082, (0xc2, 0x82)), (0x0084, (0xc2, 0x84)), (0x0088, (0xc2, 0x88)), (0x0090, (0xc2, 0x90)), (0x00a0, (0xc2, 0xa0)), (0x00c0, (0xc3, 0x80)), (0x0180, (0xc6, 0x80)), (0x0280, (0xca, 0x80)), (0x0480, (0xd2, 0x80)), (0x0481, (0xd2, 0x81)), (0x0483, (0xd2, 0x83)), (0x0487, (0xd2, 0x87)), (0x048f, (0xd2, 0x8f)), (0x049f, (0xd2, 0x9f)), (0x04af, (0xd2, 0xaf)), (0x04bf, (0xd2, 0xbf)), (0x04ff, (0xd3, 0xbf)), (0x05ff, (0xd7, 0xbf)), (0x05ff, (0xd7, 0xbf)), (0x07ff, (0xdf, 0xbf)) ) for decoded, original in pairs: assert module.decode_modified_utf8(original) == decoded assert module.encode_modified_utf8(decoded) == original def test_three_byte(module): """ Test three-byte encoding and decoding. Test data taken from py2jdbc. """ pairs = _pairs( (0x0800, (0xe0, 0xa0, 0x80)), (0x0801, (0xe0, 0xa0, 0x81)), (0x0802, (0xe0, 0xa0, 0x82)), (0x0804, (0xe0, 0xa0, 0x84)), (0x0808, (0xe0, 0xa0, 0x88)), (0x0810, (0xe0, 0xa0, 0x90)), (0x0820, (0xe0, 0xa0, 0xa0)), (0x0840, (0xe0, 0xa1, 0x80)), (0x0880, (0xe0, 0xa2, 0x80)), (0x0900, (0xe0, 0xa4, 0x80)), (0x0a00, (0xe0, 0xa8, 0x80)), (0x0c00, (0xe0, 0xb0, 0x80)), (0x1800, (0xe1, 0xa0, 0x80)), (0x2800, (0xe2, 0xa0, 0x80)), (0x4800, (0xe4, 0xa0, 0x80)), (0x8800, (0xe8, 0xa0, 0x80)), (0x8801, (0xe8, 0xa0, 0x81)), (0x8803, (0xe8, 0xa0, 0x83)), (0x8807, (0xe8, 0xa0, 0x87)), (0x880f, (0xe8, 0xa0, 0x8f)), (0x881f, (0xe8, 0xa0, 0x9f)), (0x883f, (0xe8, 0xa0, 0xbf)), (0x887f, (0xe8, 0xa1, 0xbf)), (0x88ff, (0xe8, 0xa3, 0xbf)), (0x89ff, (0xe8, 0xa7, 0xbf)), (0x8bff, (0xe8, 0xaf, 0xbf)), (0x8fff, (0xe8, 0xbf, 0xbf)), (0x9fff, (0xe9, 0xbf, 0xbf)), (0xbfff, (0xeb, 0xbf, 0xbf)), (0xffff, (0xef, 0xbf, 0xbf)) ) for decoded, original in pairs: assert module.decode_modified_utf8(original) == decoded assert module.encode_modified_utf8(decoded) == original def test_six_byte(module): """ Test six-byte encoding and decoding. Test data taken from py2jdbc. """ pairs = _pairs( (0x10000, (0xed, 0xa1, 0x80, 0xed, 0xb0, 0x80)), (0x10001, (0xed, 0xa1, 0x80, 0xed, 0xb0, 0x81)), (0x10002, (0xed, 0xa1, 0x80, 0xed, 0xb0, 0x82)), (0x10004, (0xed, 0xa1, 0x80, 0xed, 0xb0, 0x84)), (0x10008, (0xed, 0xa1, 0x80, 0xed, 0xb0, 0x88)), (0x10010, (0xed, 0xa1, 0x80, 0xed, 0xb0, 0x90)), (0x10020, (0xed, 0xa1, 0x80, 0xed, 0xb0, 0xa0)), (0x10040, (0xed, 0xa1, 0x80, 0xed, 0xb1, 0x80)), (0x10080, (0xed, 0xa1, 0x80, 0xed, 0xb2, 0x80)), (0x10100, (0xed, 0xa1, 0x80, 0xed, 0xb4, 0x80)), (0x10200, (0xed, 0xa1, 0x80, 0xed, 0xb8, 0x80)), (0x10400, (0xed, 0xa1, 0x81, 0xed, 0xb0, 0x80)), (0x10800, (0xed, 0xa1, 0x82, 0xed, 0xb0, 0x80)), (0x11000, (0xed, 0xa1, 0x84, 0xed, 0xb0, 0x80)), (0x12000, (0xed, 0xa1, 0x88, 0xed, 0xb0, 0x80)), (0x14000, (0xed, 0xa1, 0x90, 0xed, 0xb0, 0x80)), (0x18000, (0xed, 0xa1, 0xa0, 0xed, 0xb0, 0x80)), (0x30000, (0xed, 0xa3, 0x80, 0xed, 0xb0, 0x80)), (0x50000, (0xed, 0xa5, 0x80, 0xed, 0xb0, 0x80)), (0x90000, (0xed, 0xa9, 0x80, 0xed, 0xb0, 0x80)), (0x10003, (0xed, 0xa1, 0x80, 0xed, 0xb0, 0x83)), (0x10007, (0xed, 0xa1, 0x80, 0xed, 0xb0, 0x87)), (0x1000f, (0xed, 0xa1, 0x80, 0xed, 0xb0, 0x8f)), (0x1001f, (0xed, 0xa1, 0x80, 0xed, 0xb0, 0x9f)), (0x1003f, (0xed, 0xa1, 0x80, 0xed, 0xb0, 0xbf)), (0x1007f, (0xed, 0xa1, 0x80, 0xed, 0xb1, 0xbf)), (0x100ff, (0xed, 0xa1, 0x80, 0xed, 0xb3, 0xbf)), (0x101ff, (0xed, 0xa1, 0x80, 0xed, 0xb7, 0xbf)), (0x103ff, (0xed, 0xa1, 0x80, 0xed, 0xbf, 0xbf)), (0x107ff, (0xed, 0xa1, 0x81, 0xed, 0xbf, 0xbf)), (0x10fff, (0xed, 0xa1, 0x83, 0xed, 0xbf, 0xbf)), (0x11fff, (0xed, 0xa1, 0x87, 0xed, 0xbf, 0xbf)), (0x13fff, (0xed, 0xa1, 0x8f, 0xed, 0xbf, 0xbf)), (0x17fff, (0xed, 0xa1, 0x9f, 0xed, 0xbf, 0xbf)), (0x1ffff, (0xed, 0xa1, 0xbf, 0xed, 0xbf, 0xbf)), (0x3ffff, (0xed, 0xa3, 0xbf, 0xed, 0xbf, 0xbf)), (0x7ffff, (0xed, 0xa7, 0xbf, 0xed, 0xbf, 0xbf)), (0xfffff, (0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf)) ) for decoded, original in pairs: assert module.decode_modified_utf8(original) == decoded assert module.encode_modified_utf8(decoded) == original