tokenize-rt-6.0.0/.github/workflows/main.yml
name: main

on:
  push:
    branches: [main, test-me-*]
    tags: '*'
  pull_request:

jobs:
  main-windows:
    uses: asottile/workflows/.github/workflows/tox.yml@v1.6.0
    with:
      env: '["py38"]'
      os: windows-latest
  main-linux:
    uses: asottile/workflows/.github/workflows/tox.yml@v1.6.0
    with:
      env: '["py38", "py39", "py310", "py311", "py312"]'
      os: ubuntu-latest

tokenize-rt-6.0.0/.gitignore
*.egg-info
*.pyc
/.coverage
/.tox

tokenize-rt-6.0.0/.pre-commit-config.yaml
repos:
-   repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.6.0
    hooks:
    -   id: trailing-whitespace
    -   id: end-of-file-fixer
    -   id: check-yaml
    -   id: debug-statements
    -   id: double-quote-string-fixer
    -   id: name-tests-test
    -   id: requirements-txt-fixer
-   repo: https://github.com/asottile/setup-cfg-fmt
    rev: v2.5.0
    hooks:
    -   id: setup-cfg-fmt
-   repo: https://github.com/asottile/reorder-python-imports
    rev: v3.13.0
    hooks:
    -   id: reorder-python-imports
        args: [--py38-plus, --add-import, 'from __future__ import annotations']
-   repo: https://github.com/asottile/add-trailing-comma
    rev: v3.1.0
    hooks:
    -   id: add-trailing-comma
-   repo: https://github.com/asottile/pyupgrade
    rev: v3.17.0
    hooks:
    -   id: pyupgrade
        args: [--py38-plus]
-   repo: https://github.com/hhatto/autopep8
    rev: v2.3.1
    hooks:
    -   id: autopep8
-   repo: https://github.com/PyCQA/flake8
    rev: 7.1.0
    hooks:
    -   id: flake8
-   repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.11.0
    hooks:
    -   id: mypy

tokenize-rt-6.0.0/LICENSE
Copyright (c) 2017 Anthony Sottile

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
tokenize-rt-6.0.0/README.md
[![build status](https://github.com/asottile/tokenize-rt/actions/workflows/main.yml/badge.svg)](https://github.com/asottile/tokenize-rt/actions/workflows/main.yml)
[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/asottile/tokenize-rt/main.svg)](https://results.pre-commit.ci/latest/github/asottile/tokenize-rt/main)

tokenize-rt
===========

The stdlib `tokenize` module does not properly roundtrip.  This wrapper
around the stdlib provides two additional tokens `ESCAPED_NL` and
`UNIMPORTANT_WS`, and a `Token` data type.  Use `src_to_tokens` and
`tokens_to_src` to roundtrip.

This library is useful if you're writing a refactoring tool based on python
tokenization.

## Installation

```bash
pip install tokenize-rt
```

## Usage

### datastructures

#### `tokenize_rt.Offset(line=None, utf8_byte_offset=None)`

A token offset, useful as a key when cross-referencing the `ast` and the tokenized source.

#### `tokenize_rt.Token(name, src, line=None, utf8_byte_offset=None)`

Construct a token

- `name`: one of the token names listed in `token.tok_name` or `ESCAPED_NL` or `UNIMPORTANT_WS`
- `src`: token's source as text
- `line`: the line number that this token appears on.
- `utf8_byte_offset`: the utf8 byte offset at which this token appears in the line.

#### `tokenize_rt.Token.offset`

Retrieves an `Offset` for this token.

### converting to and from `Token` representations

#### `tokenize_rt.src_to_tokens(text: str) -> List[Token]`

#### `tokenize_rt.tokens_to_src(Iterable[Token]) -> str`

### additional tokens added by `tokenize-rt`

#### `tokenize_rt.ESCAPED_NL`

#### `tokenize_rt.UNIMPORTANT_WS`

### helpers

#### `tokenize_rt.NON_CODING_TOKENS`

A `frozenset` containing tokens which may appear between others while not affecting control flow or code:

- `COMMENT`
- `ESCAPED_NL`
- `NL`
- `UNIMPORTANT_WS`

#### `tokenize_rt.parse_string_literal(text: str) -> Tuple[str, str]`

parse a string literal into its prefix and string content

```pycon
>>> parse_string_literal('f"foo"')
('f', '"foo"')
```

#### `tokenize_rt.reversed_enumerate(Sequence[Token]) -> Iterator[Tuple[int, Token]]`

yields `(index, token)` pairs.  Useful for rewriting source.

#### `tokenize_rt.rfind_string_parts(Sequence[Token], i) -> Tuple[int, ...]`

find the indices of the string parts of a (joined) string literal

- `i` should start at the end of the string literal
- returns `()` (an empty tuple) for things which are not string literals

```pycon
>>> tokens = src_to_tokens('"foo" "bar".capitalize()')
>>> rfind_string_parts(tokens, 2)
(0, 2)
>>> tokens = src_to_tokens('("foo" "bar").capitalize()')
>>> rfind_string_parts(tokens, 4)
(1, 3)
```

## Differences from `tokenize`

- `tokenize-rt` adds `ESCAPED_NL` for a backslash-escaped newline "token"
- `tokenize-rt` adds `UNIMPORTANT_WS` for whitespace (discarded in `tokenize`)
- `tokenize-rt` normalizes string prefixes, even if they are not parsed -- for instance, this means you'll see `Token('STRING', "f'foo'", ...)` even in python 2.
- `tokenize-rt` normalizes python 2 long literals (`4l` / `4L`) and octal literals (`0755`) in python 3 (for easier rewriting of python 2 code while running python 3).
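## Worked example

The helpers above compose into a small rewriting pass: tokenize, edit the token list in place, then serialize it back.  The sketch below uses a hypothetical `rename_constant` helper (not part of the library) to show the pattern; it iterates with `reversed_enumerate`, the usual convention so that inserting or deleting tokens never shifts indices that have not been visited yet (here the edit is purely in place, so it is just convention):

```python
from __future__ import annotations

from tokenize_rt import reversed_enumerate
from tokenize_rt import src_to_tokens
from tokenize_rt import tokens_to_src


def rename_constant(src: str, old: str, new: str) -> str:
    """Rename every NAME token equal to `old`, preserving all formatting."""
    tokens = src_to_tokens(src)
    for i, token in reversed_enumerate(tokens):
        if token.matches(name='NAME', src=old):
            # Token is a NamedTuple, so _replace builds an edited copy
            tokens[i] = token._replace(src=new)
    return tokens_to_src(tokens)


print(rename_constant('x = 5  # set x\n', 'x', 'y'), end='')
# y = 5  # set x
```

Because whitespace and comments are real tokens here, every part of the file you do not touch comes back unchanged from the roundtrip.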
## Sample usage

- https://github.com/asottile/add-trailing-comma
- https://github.com/asottile/future-annotations
- https://github.com/asottile/future-fstrings
- https://github.com/asottile/pyupgrade
- https://github.com/asottile/yesqa

tokenize-rt-6.0.0/requirements-dev.txt
covdefaults>=2.1
coverage
pytest

tokenize-rt-6.0.0/setup.cfg
[metadata]
name = tokenize_rt
version = 6.0.0
description = A wrapper around the stdlib `tokenize` which roundtrips.
long_description = file: README.md
long_description_content_type = text/markdown
url = https://github.com/asottile/tokenize-rt
author = Anthony Sottile
author_email = asottile@umich.edu
license = MIT
license_files = LICENSE
classifiers =
    License :: OSI Approved :: MIT License
    Programming Language :: Python :: 3
    Programming Language :: Python :: 3 :: Only
    Programming Language :: Python :: Implementation :: CPython
    Programming Language :: Python :: Implementation :: PyPy

[options]
py_modules = tokenize_rt
python_requires = >=3.8

[options.entry_points]
console_scripts =
    tokenize-rt = tokenize_rt:main

[bdist_wheel]
universal = True

[coverage:run]
plugins = covdefaults

[mypy]
check_untyped_defs = true
disallow_any_generics = true
disallow_incomplete_defs = true
disallow_untyped_defs = true
warn_redundant_casts = true
warn_unused_ignores = true

[mypy-testing.*]
disallow_untyped_defs = false

[mypy-tests.*]
disallow_untyped_defs = false

tokenize-rt-6.0.0/setup.py
from __future__ import annotations

from setuptools import setup

setup()

tokenize-rt-6.0.0/testing/resources/backslash_continuation.py
from __future__ import annotations
x = \
    5
# Also with multiple lines of backslashing
x = \
    \
    \
    \
    5

tokenize-rt-6.0.0/testing/resources/empty.py

tokenize-rt-6.0.0/testing/resources/unicode_snowman.py
from __future__ import annotations
x = '☃'

tokenize-rt-6.0.0/tests/__init__.py

tokenize-rt-6.0.0/tests/tokenize_rt_test.py
from __future__ import annotations

import re
import sys

import pytest

from tokenize_rt import _re_partition
from tokenize_rt import ESCAPED_NL
from tokenize_rt import main
from tokenize_rt import Offset
from tokenize_rt import parse_string_literal
from tokenize_rt import reversed_enumerate
from tokenize_rt import rfind_string_parts
from tokenize_rt import src_to_tokens
from tokenize_rt import Token
from tokenize_rt import tokens_to_src
from tokenize_rt import UNIMPORTANT_WS


def test_re_partition_no_match():
    ret = _re_partition(re.compile('z'), 'abc')
    assert ret == ('abc', '', '')


def test_re_partition_match():
    ret = _re_partition(re.compile('b'), 'abc')
    assert ret == ('a', 'b', 'c')


def test_offset_default_values():
    assert Offset() == Offset(line=None, utf8_byte_offset=None)


def test_token_offset():
    token = Token('NAME', 'x', line=1, utf8_byte_offset=2)
    assert token.offset == Offset(line=1, utf8_byte_offset=2)


def test_token_matches():
    token = Token('NAME', 'x', line=1, utf8_byte_offset=2)
    assert token.matches(name='NAME', src='x')
    assert not token.matches(name='OP', src='x')
    assert not token.matches(name='NAME', src='y')
    assert not token.matches(name='OP', src=':')


def test_src_to_tokens_simple():
    src = 'x = 5\n'
    ret = src_to_tokens(src)
    assert ret == [
        Token('NAME', 'x', line=1, utf8_byte_offset=0),
        Token(UNIMPORTANT_WS, ' ', line=1, utf8_byte_offset=1),
        Token('OP', '=', line=1, utf8_byte_offset=2),
        Token(UNIMPORTANT_WS, ' ', line=1, utf8_byte_offset=3),
        Token('NUMBER', '5', line=1, utf8_byte_offset=4),
        Token('NEWLINE', '\n', line=1, utf8_byte_offset=5),
        Token('ENDMARKER', '', line=2, utf8_byte_offset=0),
    ]


def test_src_to_tokens_escaped_nl():
    src = (
        'x = \\\n'
        '    5\n'
    )
    ret = src_to_tokens(src)
    assert ret == [
        Token('NAME', 'x', line=1, utf8_byte_offset=0),
        Token(UNIMPORTANT_WS, ' ', line=1, utf8_byte_offset=1),
        Token('OP', '=', line=1, utf8_byte_offset=2),
        Token(UNIMPORTANT_WS, ' ', line=1, utf8_byte_offset=3),
        Token(ESCAPED_NL, '\\\n', line=1, utf8_byte_offset=4),
        Token(UNIMPORTANT_WS, '    ', line=2, utf8_byte_offset=0),
        Token('NUMBER', '5', line=2, utf8_byte_offset=4),
        Token('NEWLINE', '\n', line=2, utf8_byte_offset=5),
        Token('ENDMARKER', '', line=3, utf8_byte_offset=0),
    ]


def test_src_to_tokens_escaped_nl_no_left_ws():
    src = (
        'x =\\\n'
        '    5\n'
    )
    ret = src_to_tokens(src)
    assert ret == [
        Token('NAME', 'x', line=1, utf8_byte_offset=0),
        Token(UNIMPORTANT_WS, ' ', line=1, utf8_byte_offset=1),
        Token('OP', '=', line=1, utf8_byte_offset=2),
        Token(ESCAPED_NL, '\\\n', line=1, utf8_byte_offset=3),
        Token(UNIMPORTANT_WS, '    ', line=2, utf8_byte_offset=0),
        Token('NUMBER', '5', line=2, utf8_byte_offset=4),
        Token('NEWLINE', '\n', line=2, utf8_byte_offset=5),
        Token('ENDMARKER', '', line=3, utf8_byte_offset=0),
    ]


def test_src_to_tokens_escaped_nl_windows():
    src = (
        'x = \\\r\n'
        '    5\r\n'
    )
    ret = src_to_tokens(src)
    assert ret == [
        Token('NAME', 'x', line=1, utf8_byte_offset=0),
        Token(UNIMPORTANT_WS, ' ', line=1, utf8_byte_offset=1),
        Token('OP', '=', line=1, utf8_byte_offset=2),
        Token(UNIMPORTANT_WS, ' ', line=1, utf8_byte_offset=3),
        Token(ESCAPED_NL, '\\\r\n', line=1, utf8_byte_offset=4),
        Token(UNIMPORTANT_WS, '    ', line=2, utf8_byte_offset=0),
        Token('NUMBER', '5', line=2, utf8_byte_offset=4),
        Token('NEWLINE', '\r\n', line=2, utf8_byte_offset=5),
        Token('ENDMARKER', '', line=3, utf8_byte_offset=0),
    ]


def test_src_to_tokens_implicit_continue():
    src = (
        'x = (\n'
        '    1,\n'
        '    2,\n'
        ')\n'
    )
    ret = src_to_tokens(src)
    assert ret == [
        Token(name='NAME', src='x', line=1, utf8_byte_offset=0),
        Token(name='UNIMPORTANT_WS', src=' ', line=1, utf8_byte_offset=1),
        Token(name='OP', src='=', line=1, utf8_byte_offset=2),
        Token(name='UNIMPORTANT_WS', src=' ', line=1, utf8_byte_offset=3),
        Token(name='OP', src='(', line=1, utf8_byte_offset=4),
        Token(name='NL', src='\n', line=1, utf8_byte_offset=5),
        Token(name='UNIMPORTANT_WS', src='    ', line=2, utf8_byte_offset=0),
        Token(name='NUMBER', src='1', line=2, utf8_byte_offset=4),
        Token(name='OP', src=',', line=2, utf8_byte_offset=5),
        Token(name='NL', src='\n', line=2, utf8_byte_offset=6),
        Token(name='UNIMPORTANT_WS', src='    ', line=3, utf8_byte_offset=0),
        Token(name='NUMBER', src='2', line=3, utf8_byte_offset=4),
        Token(name='OP', src=',', line=3, utf8_byte_offset=5),
        Token(name='NL', src='\n', line=3, utf8_byte_offset=6),
        Token(name='OP', src=')', line=4, utf8_byte_offset=0),
        Token(name='NEWLINE', src='\n', line=4, utf8_byte_offset=1),
        Token(name='ENDMARKER', src='', line=5, utf8_byte_offset=0),
    ]


def test_src_to_tokens_no_eol_eof():
    ret = src_to_tokens('1')
    assert ret == [
        Token('NUMBER', '1', line=1, utf8_byte_offset=0),
        Token('NEWLINE', '', line=1, utf8_byte_offset=1),
        Token('ENDMARKER', '', line=2, utf8_byte_offset=0),
    ]


def test_src_to_tokens_multiline_string():
    src = (
        'x = """\n'
        '    y\n'
        '""".format(1)\n'
    )
    ret = src_to_tokens(src)
    assert ret == [
        Token(name='NAME', src='x', line=1, utf8_byte_offset=0),
        Token(name='UNIMPORTANT_WS', src=' ', line=1, utf8_byte_offset=1),
        Token(name='OP', src='=', line=1, utf8_byte_offset=2),
        Token(name='UNIMPORTANT_WS', src=' ', line=1, utf8_byte_offset=3),
        Token(name='STRING', src='"""\n    y\n"""', line=1, utf8_byte_offset=4),
        Token(name='OP', src='.', line=3, utf8_byte_offset=3),
        Token(name='NAME', src='format', line=3, utf8_byte_offset=4),
        Token(name='OP', src='(', line=3, utf8_byte_offset=10),
        Token(name='NUMBER', src='1', line=3, utf8_byte_offset=11),
        Token(name='OP', src=')', line=3, utf8_byte_offset=12),
        Token(name='NEWLINE', src='\n', line=3, utf8_byte_offset=13),
        Token(name='ENDMARKER', src='', line=4, utf8_byte_offset=0),
    ]


def test_src_to_tokens_fstring_with_escapes():
    src = 'f" a {{ {b} }} c"'
    ret = src_to_tokens(src)
    if sys.version_info >= (3, 12):  # pragma: >=3.12 cover
        assert ret == [
            Token(name='FSTRING_START', src='f"', line=1, utf8_byte_offset=0),
            Token(name='FSTRING_MIDDLE', src=' a {{', line=1, utf8_byte_offset=2),  # noqa: E501
            Token(name='FSTRING_MIDDLE', src=' ', line=1, utf8_byte_offset=7),
            Token(name='OP', src='{', line=1, utf8_byte_offset=8),
            Token(name='NAME', src='b', line=1, utf8_byte_offset=9),
            Token(name='OP', src='}', line=1, utf8_byte_offset=10),
            Token(name='FSTRING_MIDDLE', src=' }}', line=1, utf8_byte_offset=11),  # noqa: E501
            Token(name='FSTRING_MIDDLE', src=' c', line=1, utf8_byte_offset=14),  # noqa: E501
            Token(name='FSTRING_END', src='"', line=1, utf8_byte_offset=16),
            Token(name='NEWLINE', src='', line=1, utf8_byte_offset=17),
            Token(name='ENDMARKER', src='', line=2, utf8_byte_offset=0),
        ]
    else:  # pragma: <3.12 cover
        assert ret == [
            Token(name='STRING', src='f" a {{ {b} }} c"', line=1, utf8_byte_offset=0),  # noqa: E501
            Token(name='NEWLINE', src='', line=1, utf8_byte_offset=17),
            Token(name='ENDMARKER', src='', line=2, utf8_byte_offset=0),
        ]


@pytest.mark.parametrize(
    'filename',
    (
        'testing/resources/empty.py',
        'testing/resources/unicode_snowman.py',
        'testing/resources/backslash_continuation.py',
    ),
)
def test_roundtrip_tokenize(filename):
    with open(filename) as f:
        contents = f.read()
    ret = tokens_to_src(src_to_tokens(contents))
    assert ret == contents


def test_reversed_enumerate():
    tokens = src_to_tokens('x = 5\n')
    ret = reversed_enumerate(tokens)
    assert next(ret) == (6, Token('ENDMARKER', '', line=2, utf8_byte_offset=0))

    rest = list(ret)
    assert rest == [
        (5, Token(name='NEWLINE', src='\n', line=1, utf8_byte_offset=5)),
        (4, Token('NUMBER', '5', line=1, utf8_byte_offset=4)),
        (3, Token(UNIMPORTANT_WS, ' ', line=1, utf8_byte_offset=3)),
        (2, Token('OP', '=', line=1, utf8_byte_offset=2)),
        (1, Token(UNIMPORTANT_WS, ' ', line=1, utf8_byte_offset=1)),
        (0, Token('NAME', 'x', line=1, utf8_byte_offset=0)),
    ]


@pytest.mark.parametrize(
    ('s', 'expected'),
    (
        ('""', ('', '""')),
        ('u"foo"', ('u', '"foo"')),
        ('F"hi"', ('F', '"hi"')),
        ('r"""x"""', ('r', '"""x"""')),
    ),
)
def test_parse_string_literal(s, expected):
    assert parse_string_literal(s) == expected


@pytest.mark.parametrize('src', ('""', "b''", "r'''.'''"))
def test_rfind_string_parts_only_literal(src):
    tokens = src_to_tokens(src)
    assert rfind_string_parts(tokens, 0) == (0,)


def test_rfind_string_parts_py312_plus():
    # in 3.12 this was changed to have its own tokenization (not as a string)
    tokens = src_to_tokens("f''")
    if sys.version_info >= (3, 12):  # pragma: >=3.12 cover
        assert rfind_string_parts(tokens, 0) == ()
    else:  # pragma: <3.12 cover
        assert rfind_string_parts(tokens, 0) == (0,)


@pytest.mark.parametrize(
    ('src', 'n', 'expected'),
    (
        ('"foo" "bar"', 2, (0, 2)),
        ('"""foo""" "bar"', 2, (0, 2)),
        (
            '(\n'
            '    "foo"\n'
            '    "bar"\n'
            ')',
            8,
            (3, 6),
        ),
        (
            'print(\n'
            '    "foo"\n'
            '    "bar"\n'
            ')',
            7,
            (4, 7),
        ),
    ),
)
def test_rfind_string_parts_multiple_tokens(src, n, expected):
    tokens = src_to_tokens(src)
    assert rfind_string_parts(tokens, n) == expected


def test_rfind_string_parts_not_a_string():
    tokens = src_to_tokens('print')
    assert rfind_string_parts(tokens, 0) == ()


@pytest.mark.parametrize(
    ('src', 'n'),
    (
        #           v
        ('x(1, "foo")', 6),
        #         v
        ('x ("foo")', 4),
        #           v
        ('x[0]("foo")', 6),
        #           v
        ('x(0)("foo")', 6),
    ),
)
def test_rfind_string_parts_end_of_call_looks_like_string(src, n):
    tokens = src_to_tokens(src)
    assert rfind_string_parts(tokens, n) == ()


@pytest.mark.parametrize(
    ('src', 'n', 'expected_i'),
    (
        #       v
        ('("foo")', 2, 1),
        #           v
        ('((("foo")))', 6, 3),
        #           v
        ('a + ("foo")', 6, 5),
        #            v
        ('a or ("foo")', 6, 5),
    ),
)
def test_rfind_string_parts_parenthesized(src, n, expected_i):
    tokens = src_to_tokens(src)
    assert rfind_string_parts(tokens, n) == (expected_i,)


def test_main(capsys, tmp_path):
    f = tmp_path.joinpath('simple.py')
    f.write_text('x = 5\n')
    main((str(f),))
    out, _ = capsys.readouterr()
    assert out == (
        "1:0 NAME 'x'\n"
        "1:1 UNIMPORTANT_WS ' '\n"
        "1:2 OP '='\n"
        "1:3 UNIMPORTANT_WS ' '\n"
        "1:4 NUMBER '5'\n"
        "1:5 NEWLINE '\\n'\n"
        "2:0 ENDMARKER ''\n"
    )

tokenize-rt-6.0.0/tokenize_rt.py
from __future__ import annotations

import argparse
import io
import keyword
import re
import sys
import tokenize
from typing import Generator
from typing import Iterable
from typing import NamedTuple
from typing import Pattern
from typing import Sequence

# this is a performance hack.  see https://bugs.python.org/issue43014
if (  # pragma: no branch
        sys.version_info < (3, 10) and
        callable(getattr(tokenize, '_compile', None))
):  # pragma: <3.10 cover
    from functools import lru_cache
    tokenize._compile = lru_cache(tokenize._compile)

ESCAPED_NL = 'ESCAPED_NL'
UNIMPORTANT_WS = 'UNIMPORTANT_WS'
NON_CODING_TOKENS = frozenset(('COMMENT', ESCAPED_NL, 'NL', UNIMPORTANT_WS))


class Offset(NamedTuple):
    line: int | None = None
    utf8_byte_offset: int | None = None


class Token(NamedTuple):
    name: str
    src: str
    line: int | None = None
    utf8_byte_offset: int | None = None

    @property
    def offset(self) -> Offset:
        return Offset(self.line, self.utf8_byte_offset)

    def matches(self, *, name: str, src: str) -> bool:
        return self.name == name and self.src == src


_string_re = re.compile('^([^\'"]*)(.*)$', re.DOTALL)
_escaped_nl_re = re.compile(r'\\(\n|\r\n|\r)')


def _re_partition(regex: Pattern[str], s: str) -> tuple[str, str, str]:
    match = regex.search(s)
    if match:
        return s[:match.start()], s[slice(*match.span())], s[match.end():]
    else:
        return (s, '', '')


def src_to_tokens(src: str) -> list[Token]:
    tokenize_target = io.StringIO(src)
    lines = ('',) + tuple(tokenize_target)

    tokenize_target.seek(0)

    tokens = []
    last_line = 1
    last_col = 0
    end_offset = 0

    gen = tokenize.generate_tokens(tokenize_target.readline)
    for tok_type, tok_text, (sline, scol), (eline, ecol), line in gen:
        if sline > last_line:
            newtok = lines[last_line][last_col:]
            for lineno in range(last_line + 1, sline):
                newtok += lines[lineno]
            if scol > 0:
                newtok += lines[sline][:scol]

            # a multiline unimportant whitespace may contain escaped newlines
            while _escaped_nl_re.search(newtok):
                ws, nl, newtok = _re_partition(_escaped_nl_re, newtok)
                if ws:
                    tokens.append(
                        Token(UNIMPORTANT_WS, ws, last_line, end_offset),
                    )
                    end_offset += len(ws.encode())
                tokens.append(Token(ESCAPED_NL, nl, last_line, end_offset))
                end_offset = 0
                last_line += 1
            if newtok:
                tokens.append(Token(UNIMPORTANT_WS, newtok, sline, 0))
                end_offset = len(newtok.encode())
            else:
                end_offset = 0
        elif scol > last_col:
            newtok = line[last_col:scol]
            tokens.append(Token(UNIMPORTANT_WS, newtok, sline, end_offset))
            end_offset += len(newtok.encode())

        tok_name = tokenize.tok_name[tok_type]

        if tok_name == 'FSTRING_MIDDLE':  # pragma: >=3.12 cover
            ecol += tok_text.count('{') + tok_text.count('}')
            tok_text = tok_text.replace('{', '{{').replace('}', '}}')

        tokens.append(Token(tok_name, tok_text, sline, end_offset))
        last_line, last_col = eline, ecol
        if sline != eline:
            end_offset = len(lines[last_line][:last_col].encode())
        else:
            end_offset += len(tok_text.encode())

    return tokens


def tokens_to_src(tokens: Iterable[Token]) -> str:
    return ''.join(tok.src for tok in tokens)


def reversed_enumerate(
        tokens: Sequence[Token],
) -> Generator[tuple[int, Token]]:
    for i in reversed(range(len(tokens))):
        yield i, tokens[i]


def parse_string_literal(src: str) -> tuple[str, str]:
    """parse a string literal's source into (prefix, string)"""
    match = _string_re.match(src)
    assert match is not None
    return match.group(1), match.group(2)


def rfind_string_parts(tokens: Sequence[Token], i: int) -> tuple[int, ...]:
    """find the indices of the string parts of a (joined) string literal

    - `i` should start at the end of the string literal
    - returns `()` (an empty tuple) for things which are not string literals
    """
    ret = []
    depth = 0
    for i in range(i, -1, -1):
        token = tokens[i]
        if token.name == 'STRING':
            ret.append(i)
        elif token.name in NON_CODING_TOKENS:
            pass
        elif token.src == ')':
            depth += 1
        elif depth and token.src == '(':
            depth -= 1
            # if we closed the paren(s) make sure it was a parenthesized string
            # and not actually a call
            if depth == 0:
                for j in range(i - 1, -1, -1):
                    tok = tokens[j]
                    if tok.name in NON_CODING_TOKENS:
                        pass
                    # this was actually a call and not a parenthesized string
                    elif (
                            tok.src in {']', ')'} or (
                                tok.name == 'NAME' and
                                tok.src not in keyword.kwlist
                            )
                    ):
                        return ()
                    else:
                        break
                break
        elif depth:  # it looked like a string but wasn't
            return ()
        else:
            break
    return tuple(reversed(ret))


def main(argv: Sequence[str] | None = None) -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument('filename')
    args = parser.parse_args(argv)
    with open(args.filename) as f:
        tokens = src_to_tokens(f.read())

    for token in tokens:
        line, col = str(token.line), str(token.utf8_byte_offset)
        print(f'{line}:{col} {token.name} {token.src!r}')
    return 0


if __name__ == '__main__':
    raise SystemExit(main())

tokenize-rt-6.0.0/tox.ini
[tox]
envlist = py,pre-commit

[testenv]
deps = -rrequirements-dev.txt
commands =
    coverage erase
    coverage run -m pytest {posargs:tests}
    coverage report

[testenv:pre-commit]
skip_install = true
deps = pre-commit
commands = pre-commit run --all-files --show-diff-on-failure

[pep8]
ignore = E265,E501,W504
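As a closing illustration (a standalone sketch, not a file in the repository), the snippet below mirrors `test_src_to_tokens_escaped_nl` from the test suite: it shows that the backslash continuation and the indentation survive as explicit `ESCAPED_NL` / `UNIMPORTANT_WS` tokens, which is exactly what makes the round-trip lossless.

```python
from __future__ import annotations

from tokenize_rt import src_to_tokens
from tokenize_rt import tokens_to_src

SRC = (
    'x = \\\n'
    '    5\n'
)

tokens = src_to_tokens(SRC)

# the escaped newline and the leading indentation are real tokens here,
# unlike in the stdlib `tokenize`
assert any(token.name == 'ESCAPED_NL' for token in tokens)
assert any(token.name == 'UNIMPORTANT_WS' for token in tokens)

# ...which is what makes the round-trip exact
assert tokens_to_src(tokens) == SRC

for token in tokens:
    print(f'{token.line}:{token.utf8_byte_offset} {token.name} {token.src!r}')
```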