latexcodec-1.0.5/0000755005105600024240000000000013120706257013603 5ustar dma0mtdma00000000000000latexcodec-1.0.5/CHANGELOG.rst0000644005105600024240000000411313120705203015611 0ustar dma0mtdma000000000000001.0.5 (16 June 2017) -------------------- * More maths symbols (naturals, reals, ...). * Fix lower case z with accents (reported by AndrewSwann, see issue #51). 1.0.4 (21 September 2016) ------------------------- * Fix encoding and decoding of percent sign (reported by jgosmann, see issue #48). 1.0.3 (26 March 2016) --------------------- * New ``'keep'`` error for the ulatex encoder to keep unicode characters that cannot be translated (contributed by xuhdev, see pull requestion #45). 1.0.2 (1 March 2016) -------------------- * New ``ulatex`` codec which works as a text transform on unicode strings. * Fix spacing when translating math (see issue #29, reported by beltiste). * Performance improvements in latex to unicode translation. * Support old-style math mode (see pull request #40, contributed by xuhdev). * Treat tab character as a space character (see discussion in issue #40, raised by xuhdev). 1.0.1 (24 September 2014) ------------------------- * br"\\par" is now decoded using two newlines (see issue #26, reported by Jorrit Wronski). * Fix encoding and decoding of the ogonek (see issue #24, reported by beltiste). 1.0.0 (5 August 2014) --------------------- * Add Python 3.4 support. * Fix "DZ" decoding (see issue #21, reported and fixed by Philipp Spitzer). 0.3.2 (17 April 2014) --------------------- * Fix underscore "\\_" encoding (see issue #17, reported and fixed by Michael Radziej). 0.3.1 (5 February 2014) ----------------------- * Drop Python 3.2 support. * Drop 2to3 and instead use six to support both Python 2 and 3 from a single code base. * Fix control space "\\ " decoding. * Fix LaTeX encoding of number sign "#" and other special ascii characters (see issues #11 and #13, reported by beltiste). 0.3.0 (19 August 2013) ---------------------- * Copied lexer and codec from sphinxcontrib-bibtex. * Initial usage and API documentation. * Some small bugs fixed. 0.2 (28 September 2012) ----------------------- * Adding additional codec with brackets around special characters. 0.1 (26 May 2012) ----------------- * Initial release. latexcodec-1.0.5/README.rst0000644005105600024240000000117012770454474015304 0ustar dma0mtdma00000000000000latexcodec ========== |travis| |coveralls| A lexer and codec to work with LaTeX code in Python. * Download: http://pypi.python.org/pypi/latexcodec/#downloads * Documentation: http://latexcodec.readthedocs.org/ * Development: http://github.com/mcmtroffaes/latexcodec/ .. |travis| image:: https://travis-ci.org/mcmtroffaes/latexcodec.png?branch=develop :target: https://travis-ci.org/mcmtroffaes/latexcodec :alt: travis-ci .. |coveralls| image:: https://coveralls.io/repos/mcmtroffaes/latexcodec/badge.png?branch=develop :target: https://coveralls.io/r/mcmtroffaes/latexcodec?branch=develop :alt: coveralls.io latexcodec-1.0.5/requirements.txt0000644005105600024240000000001212274466514017070 0ustar dma0mtdma00000000000000six>=1.4.1latexcodec-1.0.5/latexcodec/0000755005105600024240000000000013120706257015716 5ustar dma0mtdma00000000000000latexcodec-1.0.5/latexcodec/__init__.py0000644005105600024240000000006412274464320020030 0ustar dma0mtdma00000000000000import latexcodec.codec latexcodec.codec.register() latexcodec-1.0.5/latexcodec/codec.py0000644005105600024240000012116313117447277017363 0ustar dma0mtdma00000000000000# -*- coding: utf-8 -*- """ LaTeX Codec ~~~~~~~~~~~ The :mod:`latexcodec.codec` module contains all classes and functions for LaTeX code translation. For practical use, you should only ever need to import the :mod:`latexcodec` module, which will automatically register the codec so it can be used by :meth:`str.encode`, :meth:`str.decode`, and any of the functions defined in the :mod:`codecs` module such as :func:`codecs.open` and so on. The other functions and classes are exposed in case someone would want to extend them. .. autofunction:: register .. autofunction:: find_latex .. autoclass:: LatexIncrementalEncoder :show-inheritance: :members: .. autoclass:: LatexIncrementalDecoder :show-inheritance: :members: .. autoclass:: LatexCodec :show-inheritance: :members: .. autoclass:: LatexUnicodeTable :members: """ # Copyright (c) 2003, 2008 David Eppstein # Copyright (c) 2011-2014 Matthias C. M. Troffaes # # Permission is hereby granted, free of charge, to any person # obtaining a copy of this software and associated documentation # files (the "Software"), to deal in the Software without # restriction, including without limitation the rights to use, # copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the # Software is furnished to do so, subject to the following # conditions: # # The above copyright notice and this permission notice shall be # included in all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR # OTHER DEALINGS IN THE SOFTWARE. from __future__ import print_function import codecs from six import string_types from six.moves import range from latexcodec import lexer def register(): """Register the :func:`find_latex` codec search function. .. seealso:: :func:`codecs.register` """ codecs.register(find_latex) # returns the codec search function # this is used if latex_codec.py were to be placed in stdlib def getregentry(): """Encodings module API.""" return find_latex('latex') class LatexUnicodeTable: """Tabulates a translation between LaTeX and unicode.""" def __init__(self, lexer): self.lexer = lexer self.unicode_map = {} self.max_length = 0 self.latex_map = {} self.register_all() def register_all(self): """Register all symbols and their LaTeX equivalents (called by constructor). """ # TODO complete this list # register special symbols self.register(u'\n\n', b' \\par', encode=False) self.register(u'\n\n', b'\\par', encode=False) self.register(u' ', b'\\ ', encode=False) self.register(u'%', b'\\%') self.register(u'\N{EN DASH}', b'--') self.register(u'\N{EN DASH}', b'\\textendash') self.register(u'\N{EM DASH}', b'---') self.register(u'\N{EM DASH}', b'\\textemdash') self.register(u'\N{LEFT SINGLE QUOTATION MARK}', b'`', decode=False) self.register(u'\N{RIGHT SINGLE QUOTATION MARK}', b"'", decode=False) self.register(u'\N{LEFT DOUBLE QUOTATION MARK}', b'``') self.register(u'\N{RIGHT DOUBLE QUOTATION MARK}', b"''") self.register(u'\N{DOUBLE LOW-9 QUOTATION MARK}', b'\\glqq') self.register(u'\N{DAGGER}', b'\\dag') self.register(u'\N{DOUBLE DAGGER}', b'\\ddag') self.register(u'\\', b'\\textbackslash', encode=False) self.register(u'\\', b'\\backslash', mode='math', encode=False) self.register(u'\N{TILDE OPERATOR}', b'\\sim', mode='math') self.register(u'\N{MODIFIER LETTER LOW TILDE}', b'\\texttildelow', package='textcomp') self.register(u'\N{SMALL TILDE}', b'\\~{}') self.register(u'~', b'\\textasciitilde') self.register(u'\N{BULLET}', b'\\bullet', mode='math') self.register(u'\N{BULLET}', b'\\textbullet', package='textcomp') self.register(u'\N{NUMBER SIGN}', b'\\#') self.register(u'\N{LOW LINE}', b'\\_') self.register(u'\N{AMPERSAND}', b'\\&') self.register(u'\N{NO-BREAK SPACE}', b'~') self.register(u'\N{INVERTED EXCLAMATION MARK}', b'!`') self.register(u'\N{CENT SIGN}', b'\\not{c}') self.register(u'\N{POUND SIGN}', b'\\pounds') self.register(u'\N{POUND SIGN}', b'\\textsterling', package='textcomp') self.register(u'\N{SECTION SIGN}', b'\\S') self.register(u'\N{DIAERESIS}', b'\\"{}') self.register(u'\N{NOT SIGN}', b'\\neg') self.register(u'\N{SOFT HYPHEN}', b'\\-') self.register(u'\N{MACRON}', b'\\={}') self.register(u'\N{DEGREE SIGN}', b'^\\circ', mode='math') self.register(u'\N{DEGREE SIGN}', b'\\textdegree', package='textcomp') self.register(u'\N{PLUS-MINUS SIGN}', b'\\pm', mode='math') self.register(u'\N{PLUS-MINUS SIGN}', b'\\textpm', package='textcomp') self.register(u'\N{SUPERSCRIPT TWO}', b'^2', mode='math') self.register( u'\N{SUPERSCRIPT TWO}', b'\\texttwosuperior', package='textcomp') self.register(u'\N{SUPERSCRIPT THREE}', b'^3', mode='math') self.register( u'\N{SUPERSCRIPT THREE}', b'\\textthreesuperior', package='textcomp') self.register(u'\N{ACUTE ACCENT}', b"\\'{}") self.register(u'\N{MICRO SIGN}', b'\\mu', mode='math') self.register(u'\N{MICRO SIGN}', b'\\micro', package='gensymb') self.register(u'\N{PILCROW SIGN}', b'\\P') self.register(u'\N{MIDDLE DOT}', b'\\cdot', mode='math') self.register( u'\N{MIDDLE DOT}', b'\\textperiodcentered', package='textcomp') self.register(u'\N{CEDILLA}', b'\\c{}') self.register(u'\N{SUPERSCRIPT ONE}', b'^1', mode='math') self.register( u'\N{SUPERSCRIPT ONE}', b'\\textonesuperior', package='textcomp') self.register(u'\N{INVERTED QUESTION MARK}', b'?`') self.register(u'\N{LATIN CAPITAL LETTER A WITH GRAVE}', b'\\`A') self.register(u'\N{LATIN CAPITAL LETTER A WITH CIRCUMFLEX}', b'\\^A') self.register(u'\N{LATIN CAPITAL LETTER A WITH TILDE}', b'\\~A') self.register(u'\N{LATIN CAPITAL LETTER A WITH DIAERESIS}', b'\\"A') self.register(u'\N{LATIN CAPITAL LETTER A WITH RING ABOVE}', b'\\AA') self.register(u'\N{LATIN CAPITAL LETTER AE}', b'\\AE') self.register(u'\N{LATIN CAPITAL LETTER C WITH CEDILLA}', b'\\c C') self.register(u'\N{LATIN CAPITAL LETTER E WITH GRAVE}', b'\\`E') self.register(u'\N{LATIN CAPITAL LETTER E WITH ACUTE}', b"\\'E") self.register(u'\N{LATIN CAPITAL LETTER E WITH CIRCUMFLEX}', b'\\^E') self.register(u'\N{LATIN CAPITAL LETTER E WITH DIAERESIS}', b'\\"E') self.register(u'\N{LATIN CAPITAL LETTER I WITH GRAVE}', b'\\`I') self.register(u'\N{LATIN CAPITAL LETTER I WITH CIRCUMFLEX}', b'\\^I') self.register(u'\N{LATIN CAPITAL LETTER I WITH DIAERESIS}', b'\\"I') self.register(u'\N{LATIN CAPITAL LETTER N WITH TILDE}', b'\\~N') self.register(u'\N{LATIN CAPITAL LETTER O WITH GRAVE}', b'\\`O') self.register(u'\N{LATIN CAPITAL LETTER O WITH ACUTE}', b"\\'O") self.register(u'\N{LATIN CAPITAL LETTER O WITH CIRCUMFLEX}', b'\\^O') self.register(u'\N{LATIN CAPITAL LETTER O WITH TILDE}', b'\\~O') self.register(u'\N{LATIN CAPITAL LETTER O WITH DIAERESIS}', b'\\"O') self.register(u'\N{MULTIPLICATION SIGN}', b'\\times', mode='math') self.register(u'\N{LATIN CAPITAL LETTER O WITH STROKE}', b'\\O') self.register(u'\N{LATIN CAPITAL LETTER U WITH GRAVE}', b'\\`U') self.register(u'\N{LATIN CAPITAL LETTER U WITH ACUTE}', b"\\'U") self.register(u'\N{LATIN CAPITAL LETTER U WITH CIRCUMFLEX}', b'\\^U') self.register(u'\N{LATIN CAPITAL LETTER U WITH DIAERESIS}', b'\\"U') self.register(u'\N{LATIN CAPITAL LETTER Y WITH ACUTE}', b"\\'Y") self.register(u'\N{LATIN SMALL LETTER SHARP S}', b'\\ss') self.register(u'\N{LATIN SMALL LETTER A WITH GRAVE}', b'\\`a') self.register(u'\N{LATIN SMALL LETTER A WITH ACUTE}', b"\\'a") self.register(u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}', b'\\^a') self.register(u'\N{LATIN SMALL LETTER A WITH TILDE}', b'\\~a') self.register(u'\N{LATIN SMALL LETTER A WITH DIAERESIS}', b'\\"a') self.register(u'\N{LATIN SMALL LETTER A WITH RING ABOVE}', b'\\aa') self.register(u'\N{LATIN SMALL LETTER AE}', b'\\ae') self.register(u'\N{LATIN SMALL LETTER C WITH CEDILLA}', b'\\c c') self.register(u'\N{LATIN SMALL LETTER E WITH GRAVE}', b'\\`e') self.register(u'\N{LATIN SMALL LETTER E WITH ACUTE}', b"\\'e") self.register(u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}', b'\\^e') self.register(u'\N{LATIN SMALL LETTER E WITH DIAERESIS}', b'\\"e') self.register(u'\N{LATIN SMALL LETTER I WITH GRAVE}', b'\\`\\i') self.register(u'\N{LATIN SMALL LETTER I WITH GRAVE}', b'\\`i') self.register(u'\N{LATIN SMALL LETTER I WITH ACUTE}', b"\\'\\i") self.register(u'\N{LATIN SMALL LETTER I WITH ACUTE}', b"\\'i") self.register(u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}', b'\\^\\i') self.register(u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}', b'\\^i') self.register(u'\N{LATIN SMALL LETTER I WITH DIAERESIS}', b'\\"\\i') self.register(u'\N{LATIN SMALL LETTER I WITH DIAERESIS}', b'\\"i') self.register(u'\N{LATIN SMALL LETTER N WITH TILDE}', b'\\~n') self.register(u'\N{LATIN SMALL LETTER O WITH GRAVE}', b'\\`o') self.register(u'\N{LATIN SMALL LETTER O WITH ACUTE}', b"\\'o") self.register(u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}', b'\\^o') self.register(u'\N{LATIN SMALL LETTER O WITH TILDE}', b'\\~o') self.register(u'\N{LATIN SMALL LETTER O WITH DIAERESIS}', b'\\"o') self.register(u'\N{DIVISION SIGN}', b'\\div', mode='math') self.register(u'\N{LATIN SMALL LETTER O WITH STROKE}', b'\\o') self.register(u'\N{LATIN SMALL LETTER U WITH GRAVE}', b'\\`u') self.register(u'\N{LATIN SMALL LETTER U WITH ACUTE}', b"\\'u") self.register(u'\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}', b'\\^u') self.register(u'\N{LATIN SMALL LETTER U WITH DIAERESIS}', b'\\"u') self.register(u'\N{LATIN SMALL LETTER Y WITH ACUTE}', b"\\'y") self.register(u'\N{LATIN SMALL LETTER Y WITH DIAERESIS}', b'\\"y') self.register(u'\N{LATIN CAPITAL LETTER A WITH MACRON}', b'\\=A') self.register(u'\N{LATIN SMALL LETTER A WITH MACRON}', b'\\=a') self.register(u'\N{LATIN CAPITAL LETTER A WITH BREVE}', b'\\u A') self.register(u'\N{LATIN SMALL LETTER A WITH BREVE}', b'\\u a') self.register(u'\N{LATIN CAPITAL LETTER A WITH OGONEK}', b'\\k A') self.register(u'\N{LATIN SMALL LETTER A WITH OGONEK}', b'\\k a') self.register(u'\N{LATIN CAPITAL LETTER C WITH ACUTE}', b"\\'C") self.register(u'\N{LATIN SMALL LETTER C WITH ACUTE}', b"\\'c") self.register(u'\N{LATIN CAPITAL LETTER C WITH CIRCUMFLEX}', b'\\^C') self.register(u'\N{LATIN SMALL LETTER C WITH CIRCUMFLEX}', b'\\^c') self.register(u'\N{LATIN CAPITAL LETTER C WITH DOT ABOVE}', b'\\.C') self.register(u'\N{LATIN SMALL LETTER C WITH DOT ABOVE}', b'\\.c') self.register(u'\N{LATIN CAPITAL LETTER C WITH CARON}', b'\\v C') self.register(u'\N{LATIN SMALL LETTER C WITH CARON}', b'\\v c') self.register(u'\N{LATIN CAPITAL LETTER D WITH CARON}', b'\\v D') self.register(u'\N{LATIN SMALL LETTER D WITH CARON}', b'\\v d') self.register(u'\N{LATIN CAPITAL LETTER E WITH MACRON}', b'\\=E') self.register(u'\N{LATIN SMALL LETTER E WITH MACRON}', b'\\=e') self.register(u'\N{LATIN CAPITAL LETTER E WITH BREVE}', b'\\u E') self.register(u'\N{LATIN SMALL LETTER E WITH BREVE}', b'\\u e') self.register(u'\N{LATIN CAPITAL LETTER E WITH DOT ABOVE}', b'\\.E') self.register(u'\N{LATIN SMALL LETTER E WITH DOT ABOVE}', b'\\.e') self.register(u'\N{LATIN CAPITAL LETTER E WITH OGONEK}', b'\\k E') self.register(u'\N{LATIN SMALL LETTER E WITH OGONEK}', b'\\k e') self.register(u'\N{LATIN CAPITAL LETTER E WITH CARON}', b'\\v E') self.register(u'\N{LATIN SMALL LETTER E WITH CARON}', b'\\v e') self.register(u'\N{LATIN CAPITAL LETTER G WITH CIRCUMFLEX}', b'\\^G') self.register(u'\N{LATIN SMALL LETTER G WITH CIRCUMFLEX}', b'\\^g') self.register(u'\N{LATIN CAPITAL LETTER G WITH BREVE}', b'\\u G') self.register(u'\N{LATIN SMALL LETTER G WITH BREVE}', b'\\u g') self.register(u'\N{LATIN CAPITAL LETTER G WITH DOT ABOVE}', b'\\.G') self.register(u'\N{LATIN SMALL LETTER G WITH DOT ABOVE}', b'\\.g') self.register(u'\N{LATIN CAPITAL LETTER G WITH CEDILLA}', b'\\c G') self.register(u'\N{LATIN SMALL LETTER G WITH CEDILLA}', b'\\c g') self.register(u'\N{LATIN CAPITAL LETTER H WITH CIRCUMFLEX}', b'\\^H') self.register(u'\N{LATIN SMALL LETTER H WITH CIRCUMFLEX}', b'\\^h') self.register(u'\N{LATIN CAPITAL LETTER I WITH TILDE}', b'\\~I') self.register(u'\N{LATIN SMALL LETTER I WITH TILDE}', b'\\~\\i') self.register(u'\N{LATIN SMALL LETTER I WITH TILDE}', b'\\~i') self.register(u'\N{LATIN CAPITAL LETTER I WITH MACRON}', b'\\=I') self.register(u'\N{LATIN SMALL LETTER I WITH MACRON}', b'\\=\\i') self.register(u'\N{LATIN SMALL LETTER I WITH MACRON}', b'\\=i') self.register(u'\N{LATIN CAPITAL LETTER I WITH BREVE}', b'\\u I') self.register(u'\N{LATIN SMALL LETTER I WITH BREVE}', b'\\u\\i') self.register(u'\N{LATIN SMALL LETTER I WITH BREVE}', b'\\u i') self.register(u'\N{LATIN CAPITAL LETTER I WITH OGONEK}', b'\\k I') self.register(u'\N{LATIN SMALL LETTER I WITH OGONEK}', b'\\k i') self.register(u'\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}', b'\\.I') self.register(u'\N{LATIN SMALL LETTER DOTLESS I}', b'\\i') self.register(u'\N{LATIN CAPITAL LIGATURE IJ}', b'IJ', decode=False) self.register(u'\N{LATIN SMALL LIGATURE IJ}', b'ij', decode=False) self.register(u'\N{LATIN CAPITAL LETTER J WITH CIRCUMFLEX}', b'\\^J') self.register(u'\N{LATIN SMALL LETTER J WITH CIRCUMFLEX}', b'\\^\\j') self.register(u'\N{LATIN SMALL LETTER J WITH CIRCUMFLEX}', b'\\^j') self.register(u'\N{LATIN CAPITAL LETTER K WITH CEDILLA}', b'\\c K') self.register(u'\N{LATIN SMALL LETTER K WITH CEDILLA}', b'\\c k') self.register(u'\N{LATIN CAPITAL LETTER L WITH ACUTE}', b"\\'L") self.register(u'\N{LATIN SMALL LETTER L WITH ACUTE}', b"\\'l") self.register(u'\N{LATIN CAPITAL LETTER L WITH CEDILLA}', b'\\c L') self.register(u'\N{LATIN SMALL LETTER L WITH CEDILLA}', b'\\c l') self.register(u'\N{LATIN CAPITAL LETTER L WITH CARON}', b'\\v L') self.register(u'\N{LATIN SMALL LETTER L WITH CARON}', b'\\v l') self.register(u'\N{LATIN CAPITAL LETTER L WITH STROKE}', b'\\L') self.register(u'\N{LATIN SMALL LETTER L WITH STROKE}', b'\\l') self.register(u'\N{LATIN CAPITAL LETTER N WITH ACUTE}', b"\\'N") self.register(u'\N{LATIN SMALL LETTER N WITH ACUTE}', b"\\'n") self.register(u'\N{LATIN CAPITAL LETTER N WITH CEDILLA}', b'\\c N') self.register(u'\N{LATIN SMALL LETTER N WITH CEDILLA}', b'\\c n') self.register(u'\N{LATIN CAPITAL LETTER N WITH CARON}', b'\\v N') self.register(u'\N{LATIN SMALL LETTER N WITH CARON}', b'\\v n') self.register(u'\N{LATIN CAPITAL LETTER O WITH MACRON}', b'\\=O') self.register(u'\N{LATIN SMALL LETTER O WITH MACRON}', b'\\=o') self.register(u'\N{LATIN CAPITAL LETTER O WITH BREVE}', b'\\u O') self.register(u'\N{LATIN SMALL LETTER O WITH BREVE}', b'\\u o') self.register( u'\N{LATIN CAPITAL LETTER O WITH DOUBLE ACUTE}', b'\\H O') self.register(u'\N{LATIN SMALL LETTER O WITH DOUBLE ACUTE}', b'\\H o') self.register(u'\N{LATIN CAPITAL LIGATURE OE}', b'\\OE') self.register(u'\N{LATIN SMALL LIGATURE OE}', b'\\oe') self.register(u'\N{LATIN CAPITAL LETTER R WITH ACUTE}', b"\\'R") self.register(u'\N{LATIN SMALL LETTER R WITH ACUTE}', b"\\'r") self.register(u'\N{LATIN CAPITAL LETTER R WITH CEDILLA}', b'\\c R') self.register(u'\N{LATIN SMALL LETTER R WITH CEDILLA}', b'\\c r') self.register(u'\N{LATIN CAPITAL LETTER R WITH CARON}', b'\\v R') self.register(u'\N{LATIN SMALL LETTER R WITH CARON}', b'\\v r') self.register(u'\N{LATIN CAPITAL LETTER S WITH ACUTE}', b"\\'S") self.register(u'\N{LATIN SMALL LETTER S WITH ACUTE}', b"\\'s") self.register(u'\N{LATIN CAPITAL LETTER S WITH CIRCUMFLEX}', b'\\^S') self.register(u'\N{LATIN SMALL LETTER S WITH CIRCUMFLEX}', b'\\^s') self.register(u'\N{LATIN CAPITAL LETTER S WITH CEDILLA}', b'\\c S') self.register(u'\N{LATIN SMALL LETTER S WITH CEDILLA}', b'\\c s') self.register(u'\N{LATIN CAPITAL LETTER S WITH CARON}', b'\\v S') self.register(u'\N{LATIN SMALL LETTER S WITH CARON}', b'\\v s') self.register(u'\N{LATIN CAPITAL LETTER T WITH CEDILLA}', b'\\c T') self.register(u'\N{LATIN SMALL LETTER T WITH CEDILLA}', b'\\c t') self.register(u'\N{LATIN CAPITAL LETTER T WITH CARON}', b'\\v T') self.register(u'\N{LATIN SMALL LETTER T WITH CARON}', b'\\v t') self.register(u'\N{LATIN CAPITAL LETTER U WITH TILDE}', b'\\~U') self.register(u'\N{LATIN SMALL LETTER U WITH TILDE}', b'\\~u') self.register(u'\N{LATIN CAPITAL LETTER U WITH MACRON}', b'\\=U') self.register(u'\N{LATIN SMALL LETTER U WITH MACRON}', b'\\=u') self.register(u'\N{LATIN CAPITAL LETTER U WITH BREVE}', b'\\u U') self.register(u'\N{LATIN SMALL LETTER U WITH BREVE}', b'\\u u') self.register(u'\N{LATIN CAPITAL LETTER U WITH RING ABOVE}', b'\\r U') self.register(u'\N{LATIN SMALL LETTER U WITH RING ABOVE}', b'\\r u') self.register( u'\N{LATIN CAPITAL LETTER U WITH DOUBLE ACUTE}', b'\\H U') self.register(u'\N{LATIN SMALL LETTER U WITH DOUBLE ACUTE}', b'\\H u') self.register(u'\N{LATIN CAPITAL LETTER U WITH OGONEK}', b'\\k U') self.register(u'\N{LATIN SMALL LETTER U WITH OGONEK}', b'\\k u') self.register(u'\N{LATIN CAPITAL LETTER W WITH CIRCUMFLEX}', b'\\^W') self.register(u'\N{LATIN SMALL LETTER W WITH CIRCUMFLEX}', b'\\^w') self.register(u'\N{LATIN CAPITAL LETTER Y WITH CIRCUMFLEX}', b'\\^Y') self.register(u'\N{LATIN SMALL LETTER Y WITH CIRCUMFLEX}', b'\\^y') self.register(u'\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}', b'\\"Y') self.register(u'\N{LATIN CAPITAL LETTER Z WITH ACUTE}', b"\\'Z") self.register(u'\N{LATIN SMALL LETTER Z WITH ACUTE}', b"\\'z") self.register(u'\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}', b'\\.Z') self.register(u'\N{LATIN SMALL LETTER Z WITH DOT ABOVE}', b'\\.z') self.register(u'\N{LATIN CAPITAL LETTER Z WITH CARON}', b'\\v Z') self.register(u'\N{LATIN SMALL LETTER Z WITH CARON}', b'\\v z') self.register(u'\N{LATIN CAPITAL LETTER DZ WITH CARON}', b'D\\v Z') self.register( u'\N{LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON}', b'D\\v z') self.register(u'\N{LATIN SMALL LETTER DZ WITH CARON}', b'd\\v z') self.register(u'\N{LATIN CAPITAL LETTER LJ}', b'LJ', decode=False) self.register( u'\N{LATIN CAPITAL LETTER L WITH SMALL LETTER J}', b'Lj', decode=False) self.register(u'\N{LATIN SMALL LETTER LJ}', b'lj', decode=False) self.register(u'\N{LATIN CAPITAL LETTER NJ}', b'NJ', decode=False) self.register( u'\N{LATIN CAPITAL LETTER N WITH SMALL LETTER J}', b'Nj', decode=False) self.register(u'\N{LATIN SMALL LETTER NJ}', b'nj', decode=False) self.register(u'\N{LATIN CAPITAL LETTER A WITH CARON}', b'\\v A') self.register(u'\N{LATIN SMALL LETTER A WITH CARON}', b'\\v a') self.register(u'\N{LATIN CAPITAL LETTER I WITH CARON}', b'\\v I') self.register(u'\N{LATIN SMALL LETTER I WITH CARON}', b'\\v\\i') self.register(u'\N{LATIN CAPITAL LETTER O WITH CARON}', b'\\v O') self.register(u'\N{LATIN SMALL LETTER O WITH CARON}', b'\\v o') self.register(u'\N{LATIN CAPITAL LETTER U WITH CARON}', b'\\v U') self.register(u'\N{LATIN SMALL LETTER U WITH CARON}', b'\\v u') self.register(u'\N{LATIN CAPITAL LETTER G WITH CARON}', b'\\v G') self.register(u'\N{LATIN SMALL LETTER G WITH CARON}', b'\\v g') self.register(u'\N{LATIN CAPITAL LETTER K WITH CARON}', b'\\v K') self.register(u'\N{LATIN SMALL LETTER K WITH CARON}', b'\\v k') self.register(u'\N{LATIN CAPITAL LETTER O WITH OGONEK}', b'\\k O') self.register(u'\N{LATIN SMALL LETTER O WITH OGONEK}', b'\\k o') self.register(u'\N{LATIN SMALL LETTER J WITH CARON}', b'\\v\\j') self.register(u'\N{LATIN CAPITAL LETTER DZ}', b'DZ', decode=False) self.register( u'\N{LATIN CAPITAL LETTER D WITH SMALL LETTER Z}', b'Dz', decode=False) self.register(u'\N{LATIN SMALL LETTER DZ}', b'dz', decode=False) self.register(u'\N{LATIN CAPITAL LETTER G WITH ACUTE}', b"\\'G") self.register(u'\N{LATIN SMALL LETTER G WITH ACUTE}', b"\\'g") self.register(u'\N{LATIN CAPITAL LETTER AE WITH ACUTE}', b"\\'\\AE") self.register(u'\N{LATIN SMALL LETTER AE WITH ACUTE}', b"\\'\\ae") self.register( u'\N{LATIN CAPITAL LETTER O WITH STROKE AND ACUTE}', b"\\'\\O") self.register( u'\N{LATIN SMALL LETTER O WITH STROKE AND ACUTE}', b"\\'\\o") self.register(u'\N{PARTIAL DIFFERENTIAL}', b'\\partial', mode='math') self.register(u'\N{N-ARY PRODUCT}', b'\\prod', mode='math') self.register(u'\N{N-ARY SUMMATION}', b'\\sum', mode='math') self.register(u'\N{SQUARE ROOT}', b'\\surd', mode='math') self.register(u'\N{INFINITY}', b'\\infty', mode='math') self.register(u'\N{INTEGRAL}', b'\\int', mode='math') self.register(u'\N{INTERSECTION}', b'\\cap', mode='math') self.register(u'\N{UNION}', b'\\cup', mode='math') self.register(u'\N{RIGHTWARDS ARROW}', b'\\rightarrow', mode='math') self.register( u'\N{RIGHTWARDS DOUBLE ARROW}', b'\\Rightarrow', mode='math') self.register(u'\N{LEFTWARDS ARROW}', b'\\leftarrow', mode='math') self.register( u'\N{LEFTWARDS DOUBLE ARROW}', b'\\Leftarrow', mode='math') self.register(u'\N{LOGICAL OR}', b'\\vee', mode='math') self.register(u'\N{LOGICAL AND}', b'\\wedge', mode='math') self.register(u'\N{ALMOST EQUAL TO}', b'\\approx', mode='math') self.register(u'\N{NOT EQUAL TO}', b'\\neq', mode='math') self.register(u'\N{LESS-THAN OR EQUAL TO}', b'\\leq', mode='math') self.register(u'\N{GREATER-THAN OR EQUAL TO}', b'\\geq', mode='math') self.register(u'\N{MODIFIER LETTER CIRCUMFLEX ACCENT}', b'\\^{}') self.register(u'\N{CARON}', b'\\v{}') self.register(u'\N{BREVE}', b'\\u{}') self.register(u'\N{DOT ABOVE}', b'\\.{}') self.register(u'\N{RING ABOVE}', b'\\r{}') self.register(u'\N{OGONEK}', b'\\k{}') self.register(u'\N{DOUBLE ACUTE ACCENT}', b'\\H{}') self.register(u'\N{LATIN SMALL LIGATURE FI}', b'fi', decode=False) self.register(u'\N{LATIN SMALL LIGATURE FL}', b'fl', decode=False) self.register(u'\N{LATIN SMALL LIGATURE FF}', b'ff', decode=False) self.register(u'\N{GREEK SMALL LETTER ALPHA}', b'\\alpha', mode='math') self.register(u'\N{GREEK SMALL LETTER BETA}', b'\\beta', mode='math') self.register(u'\N{GREEK SMALL LETTER GAMMA}', b'\\gamma', mode='math') self.register(u'\N{GREEK SMALL LETTER DELTA}', b'\\delta', mode='math') self.register( u'\N{GREEK SMALL LETTER EPSILON}', b'\\epsilon', mode='math') self.register(u'\N{GREEK SMALL LETTER ZETA}', b'\\zeta', mode='math') self.register(u'\N{GREEK SMALL LETTER ETA}', b'\\eta', mode='math') self.register(u'\N{GREEK SMALL LETTER THETA}', b'\\theta', mode='math') self.register(u'\N{GREEK SMALL LETTER IOTA}', b'\\iota', mode='math') self.register(u'\N{GREEK SMALL LETTER KAPPA}', b'\\kappa', mode='math') self.register( u'\N{GREEK SMALL LETTER LAMDA}', b'\\lambda', mode='math') # LAMDA not LAMBDA self.register(u'\N{GREEK SMALL LETTER MU}', b'\\mu', mode='math') self.register(u'\N{GREEK SMALL LETTER NU}', b'\\nu', mode='math') self.register(u'\N{GREEK SMALL LETTER XI}', b'\\xi', mode='math') self.register( u'\N{GREEK SMALL LETTER OMICRON}', b'\\omicron', mode='math') self.register(u'\N{GREEK SMALL LETTER PI}', b'\\pi', mode='math') self.register(u'\N{GREEK SMALL LETTER RHO}', b'\\rho', mode='math') self.register(u'\N{GREEK SMALL LETTER SIGMA}', b'\\sigma', mode='math') self.register(u'\N{GREEK SMALL LETTER TAU}', b'\\tau', mode='math') self.register( u'\N{GREEK SMALL LETTER UPSILON}', b'\\upsilon', mode='math') self.register(u'\N{GREEK SMALL LETTER PHI}', b'\\phi', mode='math') self.register(u'\N{GREEK SMALL LETTER CHI}', b'\\chi', mode='math') self.register(u'\N{GREEK SMALL LETTER PSI}', b'\\psi', mode='math') self.register(u'\N{GREEK SMALL LETTER OMEGA}', b'\\omega', mode='math') self.register( u'\N{GREEK CAPITAL LETTER ALPHA}', b'\\Alpha', mode='math') self.register(u'\N{GREEK CAPITAL LETTER BETA}', b'\\Beta', mode='math') self.register( u'\N{GREEK CAPITAL LETTER GAMMA}', b'\\Gamma', mode='math') self.register( u'\N{GREEK CAPITAL LETTER DELTA}', b'\\Delta', mode='math') self.register( u'\N{GREEK CAPITAL LETTER EPSILON}', b'\\Epsilon', mode='math') self.register(u'\N{GREEK CAPITAL LETTER ZETA}', b'\\Zeta', mode='math') self.register(u'\N{GREEK CAPITAL LETTER ETA}', b'\\Eta', mode='math') self.register( u'\N{GREEK CAPITAL LETTER THETA}', b'\\Theta', mode='math') self.register(u'\N{GREEK CAPITAL LETTER IOTA}', b'\\Iota', mode='math') self.register( u'\N{GREEK CAPITAL LETTER KAPPA}', b'\\Kappa', mode='math') self.register( u'\N{GREEK CAPITAL LETTER LAMDA}', b'\\Lambda', mode='math') # LAMDA not LAMBDA self.register(u'\N{GREEK CAPITAL LETTER MU}', b'\\Mu', mode='math') self.register(u'\N{GREEK CAPITAL LETTER NU}', b'\\Nu', mode='math') self.register(u'\N{GREEK CAPITAL LETTER XI}', b'\\Xi', mode='math') self.register( u'\N{GREEK CAPITAL LETTER OMICRON}', b'\\Omicron', mode='math') self.register(u'\N{GREEK CAPITAL LETTER PI}', b'\\Pi', mode='math') self.register(u'\N{GREEK CAPITAL LETTER RHO}', b'\\Rho', mode='math') self.register( u'\N{GREEK CAPITAL LETTER SIGMA}', b'\\Sigma', mode='math') self.register(u'\N{GREEK CAPITAL LETTER TAU}', b'\\Tau', mode='math') self.register( u'\N{GREEK CAPITAL LETTER UPSILON}', b'\\Upsilon', mode='math') self.register(u'\N{GREEK CAPITAL LETTER PHI}', b'\\Phi', mode='math') self.register(u'\N{GREEK CAPITAL LETTER CHI}', b'\\Chi', mode='math') self.register(u'\N{GREEK CAPITAL LETTER PSI}', b'\\Psi', mode='math') self.register( u'\N{GREEK CAPITAL LETTER OMEGA}', b'\\Omega', mode='math') self.register(u'\N{COPYRIGHT SIGN}', b'\\copyright') self.register(u'\N{COPYRIGHT SIGN}', b'\\textcopyright') self.register(u'\N{LATIN CAPITAL LETTER A WITH ACUTE}', b"\\'A") self.register(u'\N{LATIN CAPITAL LETTER I WITH ACUTE}', b"\\'I") self.register(u'\N{HORIZONTAL ELLIPSIS}', b'\\ldots') self.register(u'\N{TRADE MARK SIGN}', b'^{TM}', mode='math') self.register( u'\N{TRADE MARK SIGN}', b'\\texttrademark', package='textcomp') # \=O and \=o will be translated into Ō and ō before we can # match the full latex string... so decoding disabled for now self.register(u'Ǭ', br'\textogonekcentered{\=O}', decode=False) self.register(u'ǭ', br'\textogonekcentered{\=o}', decode=False) self.register(u'ℕ', br'\mathbb{N}', mode='math') self.register(u'ℕ', br'\mathbb N', mode='math', decode=False) self.register(u'ℤ', br'\mathbb{Z}', mode='math') self.register(u'ℤ', br'\mathbb Z', mode='math', decode=False) self.register(u'ℚ', br'\mathbb{Q}', mode='math') self.register(u'ℚ', br'\mathbb Q', mode='math', decode=False) self.register(u'ℝ', br'\mathbb{R}', mode='math') self.register(u'ℝ', br'\mathbb R', mode='math', decode=False) self.register(u'ℂ', br'\mathbb{C}', mode='math') self.register(u'ℂ', br'\mathbb C', mode='math', decode=False) def register(self, unicode_text, latex_text, mode='text', package=None, decode=True, encode=True): """Register a correspondence between *unicode_text* and *latex_text*. :param str unicode_text: A unicode character. :param bytes latex_text: Its corresponding LaTeX translation. :param str mode: LaTeX mode in which the translation applies (``'text'`` or ``'math'``). :param str package: LaTeX package requirements (currently ignored). :param bool decode: Whether this translation applies to decoding (default: ``True``). :param bool encode: Whether this translation applies to encoding (default: ``True``). """ if mode == 'math': # also register text version self.register(unicode_text, b'$' + latex_text + b'$', mode='text', package=package, decode=decode, encode=encode) self.register(unicode_text, br'\(' + latex_text + br'\)', mode='text', package=package, decode=decode, encode=encode) # XXX for the time being, we do not perform in-math substitutions return if not self.lexer.binary_mode: latex_text = latex_text.decode("ascii") if package is not None: # TODO implement packages pass # tokenize, and register unicode translation self.lexer.reset() self.lexer.state = 'M' tokens = tuple(self.lexer.get_tokens(latex_text, final=True)) if decode: if tokens not in self.unicode_map: self.max_length = max(self.max_length, len(tokens)) self.unicode_map[tokens] = unicode_text # also register token variant with brackets, if appropriate # for instance, "\'{e}" for "\'e", "\c{c}" for "\c c", etc. # note: we do not remove brackets (they sometimes matter, # e.g. bibtex uses them to prevent lower case transformation) if (len(tokens) == 2 and tokens[0].name.startswith('control') and tokens[1].name == 'chars'): alt_tokens = (tokens[0], self.lexer.curlylefttoken, tokens[1], self.lexer.curlyrighttoken) if alt_tokens not in self.unicode_map: self.max_length = max(self.max_length, len(alt_tokens)) self.unicode_map[alt_tokens] = u"{" + unicode_text + u"}" if encode and unicode_text not in self.latex_map: assert len(unicode_text) == 1 self.latex_map[unicode_text] = (latex_text, tokens) _LATEX_UNICODE_TABLE = LatexUnicodeTable(lexer.LatexIncrementalDecoder()) _ULATEX_UNICODE_TABLE = LatexUnicodeTable( lexer.UnicodeLatexIncrementalDecoder()) # incremental encoder does not need a buffer # but decoder does class LatexIncrementalEncoder(lexer.LatexIncrementalEncoder): """Translating incremental encoder for latex. Maintains a state to determine whether control spaces etc. need to be inserted. """ table = _LATEX_UNICODE_TABLE """Translation table.""" def __init__(self, errors='strict'): super(LatexIncrementalEncoder, self).__init__(errors=errors) self.reset() def reset(self): super(LatexIncrementalEncoder, self).reset() self.state = 'M' def get_space_bytes(self, bytes_): """Inserts space bytes in space eating mode.""" if self.state == 'S': # in space eating mode # control space needed? if bytes_.startswith(self.spacechar): # replace by control space return self.controlspacechar, bytes_[1:] else: # insert space (it is eaten, but needed for separation) return self.spacechar, bytes_ else: return self.emptychar, bytes_ def _get_latex_bytes_tokens_from_char(self, c): # if ascii, try latex equivalents # (this covers \, #, &, and other special LaTeX characters) if ord(c) < 128: try: return self.table.latex_map[c] except KeyError: pass # next, try input encoding try: bytes_ = c.encode(self.inputenc, 'strict') except UnicodeEncodeError: pass else: if self.binary_mode: return bytes_, (lexer.Token(name='chars', text=bytes_),) else: return c, (lexer.Token(name='chars', text=c),) # next, try latex equivalents of common unicode characters try: return self.table.latex_map[c] except KeyError: # translation failed if self.errors == 'strict': raise UnicodeEncodeError( "latex", # codec c, # problematic input 0, 1, # location of problematic character "don't know how to translate {0} into latex" .format(repr(c))) elif self.errors == 'ignore': return self.emptychar, (self.emptytoken,) elif self.errors == 'replace': # use the \\char command # this assumes # \usepackage[T1]{fontenc} # \usepackage[utf8]{inputenc} if self.binary_mode: bytes_ = b'{\\char' + str(ord(c)).encode("ascii") + b'}' else: bytes_ = u'{\\char' + str(ord(c)) + u'}' return bytes_, (lexer.Token(name='chars', text=bytes_),) elif self.errors == 'keep' and not self.binary_mode: return c, (lexer.Token(name='chars', text=c),) else: raise ValueError( "latex codec does not support {0} errors" .format(self.errors)) def get_latex_bytes(self, unicode_, final=False): if not isinstance(unicode_, string_types): raise TypeError( "expected unicode for encode input, but got {0} instead" .format(unicode_.__class__.__name__)) # convert character by character for pos, c in enumerate(unicode_): bytes_, tokens = self._get_latex_bytes_tokens_from_char(c) space, bytes_ = self.get_space_bytes(bytes_) # update state if tokens[-1].name == 'control_word': # we're eating spaces self.state = 'S' else: self.state = 'M' if space: yield space yield bytes_ class LatexIncrementalDecoder(lexer.LatexIncrementalDecoder): """Translating incremental decoder for LaTeX.""" table = _LATEX_UNICODE_TABLE """Translation table.""" def __init__(self, errors='strict'): lexer.LatexIncrementalDecoder.__init__(self, errors=errors) def reset(self): lexer.LatexIncrementalDecoder.reset(self) self.token_buffer = [] # python codecs API does not support multibuffer incremental decoders def getstate(self): raise NotImplementedError def setstate(self, state): raise NotImplementedError def get_unicode_tokens(self, bytes_, final=False): for token in self.get_tokens(bytes_, final=final): # at this point, token_buffer does not match anything self.token_buffer.append(token) # new token appended at the end, see if we have a match now # note: match is only possible at the *end* of the buffer # because all other positions have already been checked in # earlier iterations for i in range(len(self.token_buffer), 0, -1): last_tokens = tuple(self.token_buffer[-i:]) # last i tokens try: unicode_text = self.table.unicode_map[last_tokens] except KeyError: # no match: continue continue else: # match!! flush buffer, and translate last bit # exclude last i tokens for token in self.token_buffer[:-i]: yield self.decode_token(token) yield unicode_text self.token_buffer = [] break # flush tokens that can no longer match while len(self.token_buffer) >= self.table.max_length: yield self.decode_token(self.token_buffer.pop(0)) # also flush the buffer at the end if final: for token in self.token_buffer: yield self.decode_token(token) self.token_buffer = [] class LatexCodec(codecs.Codec): IncrementalEncoder = None IncrementalDecoder = None def encode(self, unicode_, errors='strict'): """Convert unicode string to LaTeX bytes.""" encoder = self.IncrementalEncoder(errors=errors) return ( encoder.encode(unicode_, final=True), len(unicode_), ) def decode(self, bytes_, errors='strict'): """Convert LaTeX bytes to unicode string.""" decoder = self.IncrementalDecoder(errors=errors) return ( decoder.decode(bytes_, final=True), len(bytes_), ) class UnicodeLatexIncrementalDecoder(LatexIncrementalDecoder): table = _ULATEX_UNICODE_TABLE binary_mode = False class UnicodeLatexIncrementalEncoder(LatexIncrementalEncoder): table = _ULATEX_UNICODE_TABLE binary_mode = False def find_latex(encoding): """Return a :class:`codecs.CodecInfo` instance for the requested LaTeX *encoding*, which must be equal to ``latex``, or to ``latex+`` where ```` describes another encoding. """ encoding, _, inputenc_ = encoding.partition(u"+") if not inputenc_: inputenc_ = "ascii" if encoding == "latex": IncEnc = LatexIncrementalEncoder DecEnc = LatexIncrementalDecoder elif encoding == "ulatex": IncEnc = UnicodeLatexIncrementalEncoder DecEnc = UnicodeLatexIncrementalDecoder else: return None class IncrementalEncoder_(IncEnc): inputenc = inputenc_ class IncrementalDecoder_(DecEnc): inputenc = inputenc_ class Codec(LatexCodec): IncrementalEncoder = IncrementalEncoder_ IncrementalDecoder = IncrementalDecoder_ class StreamWriter(Codec, codecs.StreamWriter): pass class StreamReader(Codec, codecs.StreamReader): pass return codecs.CodecInfo( encode=Codec().encode, decode=Codec().decode, incrementalencoder=Codec.IncrementalEncoder, incrementaldecoder=Codec.IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, ) latexcodec-1.0.5/latexcodec/lexer.py0000644005105600024240000004270412770454474017431 0ustar dma0mtdma00000000000000# -*- coding: utf-8 -*- """ LaTeX Lexer ~~~~~~~~~~~ This module contains all classes for lexing LaTeX code, as well as general purpose base classes for incremental LaTeX decoders and encoders, which could be useful in case you are writing your own custom LaTeX codec. .. autoclass:: Token(name, text) .. autoclass:: LatexLexer :show-inheritance: :members: .. autoclass:: LatexIncrementalLexer :show-inheritance: :members: .. autoclass:: LatexIncrementalDecoder :show-inheritance: :members: .. autoclass:: LatexIncrementalEncoder :show-inheritance: :members: """ # Copyright (c) 2003, 2008 David Eppstein # Copyright (c) 2011-2014 Matthias C. M. Troffaes # # Permission is hereby granted, free of charge, to any person # obtaining a copy of this software and associated documentation # files (the "Software"), to deal in the Software without # restriction, including without limitation the rights to use, # copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the # Software is furnished to do so, subject to the following # conditions: # # The above copyright notice and this permission notice shall be # included in all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR # OTHER DEALINGS IN THE SOFTWARE. import codecs import collections import re from six import add_metaclass, binary_type, string_types import unicodedata Token = collections.namedtuple("Token", "name text") # implementation note: we derive from IncrementalDecoder because this # class serves excellently as a base class for incremental decoders, # but of course we don't decode yet until later class MetaLatexCoder(type): def __init__(cls, name, bases, dct): super(MetaLatexCoder, cls).__init__(name, bases, dct) cls.emptytoken = Token(u"unknown", cls._fixit(b"")) cls.partoken = Token("control_word", cls._fixit(b"\\par")) cls.spacetoken = Token("space", cls._fixit(b" ")) cls.replacetoken = Token( "chars", b"?" if cls.binary_mode else u"\ufffd") cls.curlylefttoken = Token("chars", cls._fixit(b"{")) cls.curlyrighttoken = Token("chars", cls._fixit(b"}")) cls.emptychar = cls._fixit(b"") cls.spacechar = cls._fixit(b" ") cls.controlspacechar = cls._fixit(b"\\ ") def _fixit(cls, bytes_): return bytes_ if cls.binary_mode else bytes_.decode("ascii") class MetaRegexpLexer(MetaLatexCoder): """Metaclass for :class:`RegexpLexer`. Compiles tokens into a regular expression. """ def __init__(cls, name, bases, dct): super(MetaRegexpLexer, cls).__init__(name, bases, dct) regexp_string = cls._fixit(b"|".join( b"(?P<" + name.encode("ascii") + b">" + regexp + b")" for name, regexp in cls.tokens)) cls.regexp = re.compile(regexp_string, re.DOTALL) @add_metaclass(MetaRegexpLexer) class RegexpLexer(codecs.IncrementalDecoder): """Abstract base class for regexp based lexers.""" tokens = () """Tuple containing all token regular expressions.""" binary_mode = True """Whether this lexer processes binary data (bytes) or text data (unicode). """ def __init__(self, errors='strict'): """Initialize the codec.""" self.errors = errors self.reset() def reset(self): """Reset state.""" # buffer for storing last (possibly incomplete) token self.raw_buffer = self.emptytoken def getstate(self): """Get state.""" return (self.raw_buffer.text, 0) def setstate(self, state): """Set state. The *state* must correspond to the return value of a previous :meth:`getstate` call. """ self.raw_buffer = Token('unknown', state[0]) def get_raw_tokens(self, bytes_, final=False): """Yield tokens without any further processing. Tokens are one of: - ``\\``: a control word (i.e. a command) - ``\\``: a control symbol (i.e. \\^ etc.) - ``#``: a parameter - a series of byte characters """ if self.raw_buffer.text: bytes_ = self.raw_buffer.text + bytes_ self.raw_buffer = self.emptytoken for match in self.regexp.finditer(bytes_): # yield the buffer token if self.raw_buffer.text: yield self.raw_buffer # fill buffer with next token self.raw_buffer = Token(match.lastgroup, match.group(0)) if final: for token in self.flush_raw_tokens(): yield token def flush_raw_tokens(self): """Flush the raw token buffer.""" if self.raw_buffer.text: yield self.raw_buffer self.raw_buffer = self.emptytoken class LatexLexer(RegexpLexer): """A very simple lexer for tex/latex bytes.""" # implementation note: every token **must** be decodable by inputenc tokens = ( # comment: for ease, and for speed, we handle it as a token (u'comment', br'(?= 0 # first token includes at least self.raw_buffer if token.name == 'newline': if self.state == 'N': # if state was 'N', generate new paragraph yield self.partoken elif self.state == 'S': # switch to 'N' state, do not generate a space self.state = 'N' elif self.state == 'M': # switch to 'N' state, generate a space self.state = 'N' yield self.spacetoken else: raise AssertionError( "unknown tex state {0!r}".format(self.state)) elif token.name == 'space': if self.state == 'N': # remain in 'N' state, no space token generated pass elif self.state == 'S': # remain in 'S' state, no space token generated pass elif self.state == 'M': # in M mode, generate the space, # but switch to space skip mode self.state = 'S' yield token else: raise AssertionError( "unknown state {0!r}".format(self.state)) elif token.name == 'mathshift': self.inline_math = not self.inline_math self.state = 'M' yield token elif token.name == 'parameter': self.state = 'M' yield token elif token.name == 'control_word': # go to space skip mode self.state = 'S' yield token elif token.name == 'control_symbol': # go to space skip mode self.state = 'S' yield token elif token.name == 'control_symbol_x': # don't skip following space, so go to M mode self.state = 'M' yield token elif token.name == 'comment': # no token is generated # note: comment does not include the newline self.state = 'S' elif token.name == 'chars': self.state = 'M' yield token elif token.name == 'unknown': if self.errors == 'strict': # hack around a bug in Python: UnicodeDecodeError # expects binary input if not self.binary_mode: bytes_ = bytes_.encode("utf8") # current position within bytes_ # this is the position right after the unknown token raise UnicodeDecodeError( "latex", # codec bytes_, # problematic input pos - len(token.text), # start of problematic token pos, # end of it "unknown token {0!r}".format(token.text)) elif self.errors == 'ignore': # do nothing pass elif self.errors == 'replace': yield self.replacetoken else: raise NotImplementedError( "error mode {0!r} not supported".format(self.errors)) else: raise AssertionError( "unknown token name {0!r}".format(token.name)) class LatexIncrementalDecoder(LatexIncrementalLexer): """Simple incremental decoder. Transforms lexed LaTeX tokens into unicode. To customize decoding, subclass and override :meth:`get_unicode_tokens`. """ inputenc = "ascii" """Input encoding. **Must** extend ascii.""" def decode_token(self, token): """Returns the decoded token text in :attr:`inputenc` encoding. .. note:: Control words get an extra space added at the back to make sure separation from the next token, so that decoded token sequences can be :meth:`str.join`\ ed together. For example, the tokens ``b'\\hello'`` and ``b'world'`` will correctly result in ``u'\\hello world'`` (remember that LaTeX eats space following control words). If no space were added, this would wrongfully result in ``u'\\helloworld'``. """ # in python 3, the token text can be a memoryview # which do not have a decode method; must cast to bytes explicitly if self.binary_mode: text = binary_type(token.text).decode(self.inputenc) else: text = token.text return text if token.name != 'control_word' else text + u' ' def get_unicode_tokens(self, bytes_, final=False): """Decode every token in :attr:`inputenc` encoding. Override to process the tokens in some other way (for example, for token translation). """ for token in self.get_tokens(bytes_, final=final): yield self.decode_token(token) def decode(self, bytes_, final=False): """Decode LaTeX *bytes_* into a unicode string. This implementation calls :meth:`get_unicode_tokens` and joins the resulting unicode strings together. """ try: return u''.join(self.get_unicode_tokens(bytes_, final=final)) except UnicodeDecodeError as e: # API requires that the encode method raises a ValueError # in this case raise ValueError(e) @add_metaclass(MetaLatexCoder) class LatexIncrementalEncoder(codecs.IncrementalEncoder): """Simple incremental encoder for LaTeX. Transforms unicode into :class:`bytes`. To customize decoding, subclass and override :meth:`get_latex_bytes`. """ inputenc = "ascii" """Input encoding. **Must** extend ascii.""" binary_mode = True """Whether this encoder processes binary data (bytes) or text data (unicode). """ def __init__(self, errors='strict'): """Initialize the codec.""" self.errors = errors self.reset() def reset(self): """Reset state.""" # buffer for storing last (possibly incomplete) token self.buffer = u"" def getstate(self): """Get state.""" return self.buffer def setstate(self, state): """Set state. The *state* must correspond to the return value of a previous :meth:`getstate` call. """ self.buffer = state def get_unicode_tokens(self, unicode_, final=False): """Split unicode into tokens so that every token starts with a non-combining character. """ if not isinstance(unicode_, string_types): raise TypeError( "expected unicode for encode input, but got {0} instead" .format(unicode_.__class__.__name__)) for c in unicode_: if not unicodedata.combining(c): for token in self.flush_unicode_tokens(): yield token self.buffer += c if final: for token in self.flush_unicode_tokens(): yield token def flush_unicode_tokens(self): """Flush the buffer.""" if self.buffer: yield self.buffer self.buffer = u"" def get_latex_bytes(self, unicode_, final=False): """Encode every character in :attr:`inputenc` encoding. Override to process the unicode in some other way (for example, for character translation). """ if self.binary_mode: for token in self.get_unicode_tokens(unicode_, final=final): yield token.encode(self.inputenc, self.errors) else: for token in self.get_unicode_tokens(unicode_, final=final): yield token def encode(self, unicode_, final=False): """Encode the *unicode_* string into LaTeX :class:`bytes`. This implementation calls :meth:`get_latex_bytes` and joins the resulting :class:`bytes` together. """ try: return self.emptychar.join( self.get_latex_bytes(unicode_, final=final)) except UnicodeEncodeError as e: # API requires that the encode method raises a ValueError # in this case raise ValueError(e) class UnicodeLatexLexer(LatexLexer): binary_mode = False class UnicodeLatexIncrementalDecoder(LatexIncrementalDecoder): binary_mode = False class UnicodeLatexIncrementalEncoder(LatexIncrementalEncoder): binary_mode = False latexcodec-1.0.5/setup.cfg0000644005105600024240000000025613120706257015427 0ustar dma0mtdma00000000000000[nosetests] with-coverage = 1 cover-package = latexcodec cover-branches = 1 cover-html = 1 [wheel] universal = 1 [egg_info] tag_build = tag_date = 0 tag_svn_revision = 0 latexcodec-1.0.5/MANIFEST.in0000644005105600024240000000044312274466514015352 0ustar dma0mtdma00000000000000include VERSION include README.rst include INSTALL.rst include CHANGELOG.rst include LICENSE.rst include AUTHORS.rst include requirements.txt include tox.ini recursive-include doc * recursive-include test * global-exclude *.pyc global-exclude .gitignore prune doc/_build exclude .travis.yml latexcodec-1.0.5/setup.py0000644005105600024240000000304613120705220015305 0ustar dma0mtdma00000000000000# -*- coding: utf-8 -*- import io from setuptools import setup, find_packages def readfile(filename): with io.open(filename, encoding="utf-8") as stream: return stream.read().split("\n") readme = readfile("README.rst")[5:] # skip title and badges requires = readfile("requirements.txt") version = readfile("VERSION")[0].strip() setup( name='latexcodec', version=version, url='https://github.com/mcmtroffaes/latexcodec', download_url='http://pypi.python.org/pypi/latexcodec', license='MIT', author='Matthias C. M. Troffaes', author_email='matthias.troffaes@gmail.com', description=readme[0], long_description="\n".join(readme[2:]), zip_safe=True, classifiers=[ 'Development Status :: 5 - Production/Stable', 'Environment :: Console', 'Intended Audience :: Developers', 'License :: OSI Approved :: MIT License', 'Operating System :: OS Independent', 'Programming Language :: Python', 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Topic :: Text Processing :: Markup :: LaTeX', 'Topic :: Text Processing :: Filters', ], platforms='any', packages=find_packages(), install_requires=requires, ) latexcodec-1.0.5/latexcodec.egg-info/0000755005105600024240000000000013120706257017410 5ustar dma0mtdma00000000000000latexcodec-1.0.5/latexcodec.egg-info/dependency_links.txt0000644005105600024240000000000113120706256023455 0ustar dma0mtdma00000000000000 latexcodec-1.0.5/latexcodec.egg-info/requires.txt0000644005105600024240000000001313120706256022001 0ustar dma0mtdma00000000000000six>=1.4.1 latexcodec-1.0.5/latexcodec.egg-info/top_level.txt0000644005105600024240000000001313120706256022133 0ustar dma0mtdma00000000000000latexcodec latexcodec-1.0.5/latexcodec.egg-info/SOURCES.txt0000644005105600024240000000115713120706257021300 0ustar dma0mtdma00000000000000AUTHORS.rst CHANGELOG.rst INSTALL.rst LICENSE.rst MANIFEST.in README.rst VERSION requirements.txt setup.cfg setup.py doc/Makefile doc/api.rst doc/authors.rst doc/changes.rst doc/conf.py doc/index.rst doc/license.rst doc/make.bat doc/quickstart.rst doc/api/codec.rst doc/api/lexer.rst latexcodec/__init__.py latexcodec/codec.py latexcodec/lexer.py latexcodec.egg-info/PKG-INFO latexcodec.egg-info/SOURCES.txt latexcodec.egg-info/dependency_links.txt latexcodec.egg-info/requires.txt latexcodec.egg-info/top_level.txt latexcodec.egg-info/zip-safe test/test_install_example.py test/test_latex_codec.py test/test_latex_lexer.pylatexcodec-1.0.5/latexcodec.egg-info/PKG-INFO0000644005105600024240000000333613120706256020511 0ustar dma0mtdma00000000000000Metadata-Version: 1.1 Name: latexcodec Version: 1.0.5 Summary: A lexer and codec to work with LaTeX code in Python. Home-page: https://github.com/mcmtroffaes/latexcodec Author: Matthias C. M. Troffaes Author-email: matthias.troffaes@gmail.com License: MIT Download-URL: http://pypi.python.org/pypi/latexcodec Description: * Download: http://pypi.python.org/pypi/latexcodec/#downloads * Documentation: http://latexcodec.readthedocs.org/ * Development: http://github.com/mcmtroffaes/latexcodec/ .. |travis| image:: https://travis-ci.org/mcmtroffaes/latexcodec.png?branch=develop :target: https://travis-ci.org/mcmtroffaes/latexcodec :alt: travis-ci .. |coveralls| image:: https://coveralls.io/repos/mcmtroffaes/latexcodec/badge.png?branch=develop :target: https://coveralls.io/r/mcmtroffaes/latexcodec?branch=develop :alt: coveralls.io Platform: any Classifier: Development Status :: 5 - Production/Stable Classifier: Environment :: Console Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: MIT License Classifier: Operating System :: OS Independent Classifier: Programming Language :: Python Classifier: Programming Language :: Python :: 2 Classifier: Programming Language :: Python :: 2.6 Classifier: Programming Language :: Python :: 2.7 Classifier: Programming Language :: Python :: 3 Classifier: Programming Language :: Python :: 3.3 Classifier: Programming Language :: Python :: 3.4 Classifier: Programming Language :: Python :: 3.5 Classifier: Programming Language :: Python :: 3.6 Classifier: Topic :: Text Processing :: Markup :: LaTeX Classifier: Topic :: Text Processing :: Filters latexcodec-1.0.5/latexcodec.egg-info/zip-safe0000644005105600024240000000000113120706256021037 0ustar dma0mtdma00000000000000 latexcodec-1.0.5/PKG-INFO0000644005105600024240000000333613120706257014705 0ustar dma0mtdma00000000000000Metadata-Version: 1.1 Name: latexcodec Version: 1.0.5 Summary: A lexer and codec to work with LaTeX code in Python. Home-page: https://github.com/mcmtroffaes/latexcodec Author: Matthias C. M. Troffaes Author-email: matthias.troffaes@gmail.com License: MIT Download-URL: http://pypi.python.org/pypi/latexcodec Description: * Download: http://pypi.python.org/pypi/latexcodec/#downloads * Documentation: http://latexcodec.readthedocs.org/ * Development: http://github.com/mcmtroffaes/latexcodec/ .. |travis| image:: https://travis-ci.org/mcmtroffaes/latexcodec.png?branch=develop :target: https://travis-ci.org/mcmtroffaes/latexcodec :alt: travis-ci .. |coveralls| image:: https://coveralls.io/repos/mcmtroffaes/latexcodec/badge.png?branch=develop :target: https://coveralls.io/r/mcmtroffaes/latexcodec?branch=develop :alt: coveralls.io Platform: any Classifier: Development Status :: 5 - Production/Stable Classifier: Environment :: Console Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: MIT License Classifier: Operating System :: OS Independent Classifier: Programming Language :: Python Classifier: Programming Language :: Python :: 2 Classifier: Programming Language :: Python :: 2.6 Classifier: Programming Language :: Python :: 2.7 Classifier: Programming Language :: Python :: 3 Classifier: Programming Language :: Python :: 3.3 Classifier: Programming Language :: Python :: 3.4 Classifier: Programming Language :: Python :: 3.5 Classifier: Programming Language :: Python :: 3.6 Classifier: Topic :: Text Processing :: Markup :: LaTeX Classifier: Topic :: Text Processing :: Filters latexcodec-1.0.5/LICENSE.rst0000644005105600024240000000217112274466514015430 0ustar dma0mtdma00000000000000| latexcodec is a lexer and codec to work with LaTeX code in Python | Copyright (c) 2011-2014 by Matthias C. M. Troffaes Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. latexcodec-1.0.5/AUTHORS.rst0000644005105600024240000000076712370126012015463 0ustar dma0mtdma00000000000000Main authors: * David Eppstein - wrote the original LaTeX codec as a recipe on ActiveState http://code.activestate.com/recipes/252124-latex-codec/ * Peter Tröger - wrote the original latexcodec package, which contained a simple but very effective LaTeX encoder * Matthias Troffaes (matthias.troffaes@gmail.com) - wrote the lexer - integrated codec with the lexer for a simpler and more robust design - various bugfixes Contributors: * Michael Radziej * Philipp Spitzer latexcodec-1.0.5/VERSION0000644005105600024240000000000613120704612014640 0ustar dma0mtdma000000000000001.0.5 latexcodec-1.0.5/INSTALL.rst0000644005105600024240000000622613117445671015457 0ustar dma0mtdma00000000000000Install the module with ``pip install latexcodec``, or from source using ``python setup.py install``. Minimal Example --------------- Simply import the :mod:`latexcodec` module to enable ``"latex"`` to be used as an encoding: .. code-block:: python import latexcodec text_latex = b"\\'el\\`eve" assert text_latex.decode("latex") == u"élève" text_unicode = u"ångström" assert text_unicode.encode("latex") == b'\\aa ngstr\\"om' There are also a ``ulatex`` encoding for text transforms. The simplest way to use this codec goes through the codecs module (as for all text transform codecs on Python): .. code-block:: python import codecs import latexcodec text_latex = u"\\'el\\`eve" assert codecs.decode(text_latex, "ulatex") == u"élève" text_unicode = u"ångström" assert codecs.encode(text_unicode, "ulatex") == u'\\aa ngstr\\"om' By default, the LaTeX input is assumed to be ascii, as per standard LaTeX. However, you can also specify an extra codec as ``latex+`` or ``ulatex+``, where ```` describes another encoding. In this case characters will be translated to and from that encoding whenever possible. The following code snippet demonstrates this behaviour: .. code-block:: python import latexcodec text_latex = b"\xfe" assert text_latex.decode("latex+latin1") == u"þ" assert text_latex.decode("latex+latin2") == u"ţ" text_unicode = u"ţ" assert text_unicode.encode("latex+latin1") == b'\\c t' # ţ is not latin1 assert text_unicode.encode("latex+latin2") == b'\xfe' # but it is latin2 When encoding using the ``ulatex`` codec, you have the option to pass through characters that cannot be encoded in the desired encoding, by using the ``'keep'`` error. This can be a useful fallback option if you want to encode as much as possible, whilst still retaining as much as possible of the original code when encoding fails. If instead you want to translate to LaTeX but keep as much of the unicode as possible, use the ``ulatex+utf8`` codec, which should never fail. .. code-block:: python import codecs import latexcodec text_unicode = u'⌨' # \u2328 = keyboard symbol, currently not translated try: # raises a value error as \u2328 cannot be encoded into latex codecs.encode(text_unicode, "ulatex+ascii") except ValueError: pass assert codecs.encode(text_unicode, "ulatex+ascii", "keep") == u'⌨' assert codecs.encode(text_unicode, "ulatex+utf8") == u'⌨' Limitations ----------- * Not all unicode characters are registered. If you find any missing, please report them on the tracker: https://github.com/mcmtroffaes/latexcodec/issues * Unicode combining characters are currently not handled. * By design, the codec never removes curly brackets. This is because it is very hard to guess whether brackets are part of a command or not (this would require a full latex parser). Moreover, bibtex uses curly brackets as a guard against case conversion, in which case automatic removal of curly brackets may not be desired at all, even if they are not part of a command. Also see: http://stackoverflow.com/a/19754245/2863746 latexcodec-1.0.5/doc/0000755005105600024240000000000013120706257014350 5ustar dma0mtdma00000000000000latexcodec-1.0.5/doc/make.bat0000644005105600024240000001176012177740034015764 0ustar dma0mtdma00000000000000@ECHO OFF REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set BUILDDIR=_build set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . set I18NSPHINXOPTS=%SPHINXOPTS% . if NOT "%PAPER%" == "" ( set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% ) if "%1" == "" goto help if "%1" == "help" ( :help echo.Please use `make ^` where ^ is one of echo. html to make standalone HTML files echo. dirhtml to make HTML files named index.html in directories echo. singlehtml to make a single large HTML file echo. pickle to make pickle files echo. json to make JSON files echo. htmlhelp to make HTML files and a HTML help project echo. qthelp to make HTML files and a qthelp project echo. devhelp to make HTML files and a Devhelp project echo. epub to make an epub echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter echo. text to make text files echo. man to make manual pages echo. texinfo to make Texinfo files echo. gettext to make PO message catalogs echo. changes to make an overview over all changed/added/deprecated items echo. linkcheck to check all external links for integrity echo. doctest to run all doctests embedded in the documentation if enabled goto end ) if "%1" == "clean" ( for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i del /q /s %BUILDDIR%\* goto end ) if "%1" == "html" ( %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/html. goto end ) if "%1" == "dirhtml" ( %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. goto end ) if "%1" == "singlehtml" ( %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. goto end ) if "%1" == "pickle" ( %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can process the pickle files. goto end ) if "%1" == "json" ( %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can process the JSON files. goto end ) if "%1" == "htmlhelp" ( %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run HTML Help Workshop with the ^ .hhp project file in %BUILDDIR%/htmlhelp. goto end ) if "%1" == "qthelp" ( %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run "qcollectiongenerator" with the ^ .qhcp project file in %BUILDDIR%/qthelp, like this: echo.^> qcollectiongenerator %BUILDDIR%\qthelp\latexcodec.qhcp echo.To view the help file: echo.^> assistant -collectionFile %BUILDDIR%\qthelp\latexcodec.ghc goto end ) if "%1" == "devhelp" ( %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp if errorlevel 1 exit /b 1 echo. echo.Build finished. goto end ) if "%1" == "epub" ( %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub if errorlevel 1 exit /b 1 echo. echo.Build finished. The epub file is in %BUILDDIR%/epub. goto end ) if "%1" == "latex" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex if errorlevel 1 exit /b 1 echo. echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. goto end ) if "%1" == "text" ( %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text if errorlevel 1 exit /b 1 echo. echo.Build finished. The text files are in %BUILDDIR%/text. goto end ) if "%1" == "man" ( %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man if errorlevel 1 exit /b 1 echo. echo.Build finished. The manual pages are in %BUILDDIR%/man. goto end ) if "%1" == "texinfo" ( %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo if errorlevel 1 exit /b 1 echo. echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. goto end ) if "%1" == "gettext" ( %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale if errorlevel 1 exit /b 1 echo. echo.Build finished. The message catalogs are in %BUILDDIR%/locale. goto end ) if "%1" == "changes" ( %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes if errorlevel 1 exit /b 1 echo. echo.The overview file is in %BUILDDIR%/changes. goto end ) if "%1" == "linkcheck" ( %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck if errorlevel 1 exit /b 1 echo. echo.Link check complete; look for any errors in the above output ^ or in %BUILDDIR%/linkcheck/output.txt. goto end ) if "%1" == "doctest" ( %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest if errorlevel 1 exit /b 1 echo. echo.Testing of doctests in the sources finished, look at the ^ results in %BUILDDIR%/doctest/output.txt. goto end ) :end latexcodec-1.0.5/doc/Makefile0000644005105600024240000001271412204362324016010 0ustar dma0mtdma00000000000000# Makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build PAPER = BUILDDIR = _build # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . # the i18n builder cannot share the environment and doctrees with the others I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " dirhtml to make HTML files named index.html in directories" @echo " singlehtml to make a single large HTML file" @echo " pickle to make pickle files" @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" @echo " devhelp to make HTML files and a Devhelp project" @echo " epub to make an epub" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " latexpdf to make LaTeX files and run them through pdflatex" @echo " text to make text files" @echo " man to make manual pages" @echo " texinfo to make Texinfo files" @echo " info to make Texinfo files and run them through makeinfo" @echo " gettext to make PO message catalogs" @echo " changes to make an overview of all changed/added/deprecated items" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" clean: -rm -rf $(BUILDDIR)/* html: $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." dirhtml: $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." singlehtml: $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml @echo @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." pickle: $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @echo "Build finished; now you can process the pickle files." json: $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json @echo @echo "Build finished; now you can process the JSON files." htmlhelp: $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp @echo @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in $(BUILDDIR)/htmlhelp." qthelp: $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp @echo @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/latexcodec.qhcp" @echo "To view the help file:" @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/latexcodec.qhc" devhelp: $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp @echo @echo "Build finished." @echo "To view the help file:" @echo "# mkdir -p $$HOME/.local/share/devhelp/latexcodec" @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/latexcodec" @echo "# devhelp" epub: $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub @echo @echo "Build finished. The epub file is in $(BUILDDIR)/epub." latex: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." @echo "Run \`make' in that directory to run these through (pdf)latex" \ "(use \`make latexpdf' here to do that automatically)." latexpdf: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through pdflatex..." $(MAKE) -C $(BUILDDIR)/latex all-pdf @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." text: $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text @echo @echo "Build finished. The text files are in $(BUILDDIR)/text." man: $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man @echo @echo "Build finished. The manual pages are in $(BUILDDIR)/man." texinfo: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." @echo "Run \`make' in that directory to run these through makeinfo" \ "(use \`make info' here to do that automatically)." info: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo "Running Texinfo files through makeinfo..." make -C $(BUILDDIR)/texinfo info @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." gettext: $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale @echo @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." changes: $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." linkcheck: $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." doctest: $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." latexcodec-1.0.5/doc/api.rst0000644005105600024240000000010412274464314015652 0ustar dma0mtdma00000000000000API ~~~ .. toctree:: :maxdepth: 2 api/codec api/lexer latexcodec-1.0.5/doc/index.rst0000644005105600024240000000047212274464314016220 0ustar dma0mtdma00000000000000Welcome to latexcodec's documentation! ====================================== :Release: |release| :Date: |today| Contents -------- .. toctree:: :maxdepth: 2 quickstart api changes authors license Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` latexcodec-1.0.5/doc/license.rst0000644005105600024240000000044112274464314016527 0ustar dma0mtdma00000000000000License ======= .. include:: ../LICENSE.rst .. rubric:: Remark Versions 0.1 and 0.2 of the latexcodec package were written by Peter Tröger, and were released under the Academic Free License 3.0. The current version of the latexcodec package shares no code with those earlier versions. latexcodec-1.0.5/doc/conf.py0000644005105600024240000000227213117445560015655 0ustar dma0mtdma00000000000000# -*- coding: utf-8 -*- # # latexcodec documentation build configuration file, created by # sphinx-quickstart on Wed Aug 3 15:45:22 2011. extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.imgmath', 'sphinx.ext.viewcode'] source_suffix = '.rst' master_doc = 'index' project = u'latexcodec' copyright = u'2011-2014, Matthias C. M. Troffaes' with open("../VERSION", "rb") as version_file: release = version_file.read().strip() version = '.'.join(release.split('.')[:2]) exclude_patterns = ['_build'] pygments_style = 'sphinx' html_theme = 'default' htmlhelp_basename = 'latexcodecdoc' latex_documents = [ ('index', 'latexcodec.tex', u'latexcodec Documentation', u'Matthias C. M. Troffaes', 'manual'), ] man_pages = [ ('index', 'latexcodec', u'latexcodec Documentation', [u'Matthias C. M. Troffaes'], 1) ] texinfo_documents = [ ('index', 'latexcodec', u'latexcodec Documentation', u'Matthias C. M. Troffaes', 'latexcodec', 'One line description of project.', 'Miscellaneous'), ] intersphinx_mapping = { 'python': ('http://docs.python.org/', None), } latexcodec-1.0.5/doc/changes.rst0000644005105600024240000000007612274464314016521 0ustar dma0mtdma00000000000000:tocdepth: 1 Changes ======= .. include:: ../CHANGELOG.rst latexcodec-1.0.5/doc/api/0000755005105600024240000000000013120706257015121 5ustar dma0mtdma00000000000000latexcodec-1.0.5/doc/api/lexer.rst0000644005105600024240000000004112274464314016771 0ustar dma0mtdma00000000000000.. automodule:: latexcodec.lexer latexcodec-1.0.5/doc/api/codec.rst0000644005105600024240000000004112274464314016727 0ustar dma0mtdma00000000000000.. automodule:: latexcodec.codec latexcodec-1.0.5/doc/quickstart.rst0000644005105600024240000000023112274466514017300 0ustar dma0mtdma00000000000000Getting Started =============== Overview -------- .. include:: ../README.rst :start-line: 5 Installation ------------ .. include:: ../INSTALL.rst latexcodec-1.0.5/doc/authors.rst0000644005105600024240000000005612274464314016574 0ustar dma0mtdma00000000000000Authors ======= .. include:: ../AUTHORS.rst latexcodec-1.0.5/test/0000755005105600024240000000000013120706257014562 5ustar dma0mtdma00000000000000latexcodec-1.0.5/test/test_install_example.py0000644005105600024240000000255012770454474021371 0ustar dma0mtdma00000000000000# -*- coding: utf-8 -*- def test_install_example_1(): import latexcodec # noqa text_latex = b"\\'el\\`eve" assert text_latex.decode("latex") == u"élève" text_unicode = u"ångström" assert text_unicode.encode("latex") == b'\\aa ngstr\\"om' def test_install_example_2(): import codecs import latexcodec # noqa text_latex = u"\\'el\\`eve" assert codecs.decode(text_latex, "ulatex") == u"élève" text_unicode = u"ångström" assert codecs.encode(text_unicode, "ulatex") == u'\\aa ngstr\\"om' def test_install_example_3(): import latexcodec # noqa text_latex = b"\xfe" assert text_latex.decode("latex+latin1") == u"þ" assert text_latex.decode("latex+latin2") == u"ţ" text_unicode = u"ţ" assert text_unicode.encode("latex+latin1") == b'\\c t' # ţ is not latin1 assert text_unicode.encode("latex+latin2") == b'\xfe' # but it is latin2 def test_install_example_4(): import codecs import latexcodec # noqa text_unicode = u'⌨' # \u2328 = keyboard symbol, currently not translated try: # raises a value error as \u2328 cannot be encoded into latex codecs.encode(text_unicode, "ulatex+ascii") except ValueError: pass assert codecs.encode(text_unicode, "ulatex+ascii", "keep") == u'⌨' assert codecs.encode(text_unicode, "ulatex+utf8") == u'⌨' latexcodec-1.0.5/test/test_latex_lexer.py0000644005105600024240000003330612770454474020527 0ustar dma0mtdma00000000000000# -*- coding: utf-8 -*- """Tests for the tex lexer.""" import nose.tools from unittest import TestCase from latexcodec.lexer import ( LatexLexer, UnicodeLatexLexer, LatexIncrementalLexer, LatexIncrementalDecoder, UnicodeLatexIncrementalDecoder, LatexIncrementalEncoder, UnicodeLatexIncrementalEncoder, Token) class MockLexer(LatexLexer): tokens = ( (u'chars', br'mock'), (u'unknown', br'.'), ) class MockIncrementalDecoder(LatexIncrementalDecoder): tokens = ( (u'chars', br'mock'), (u'unknown', br'.'), ) def test_token_create_with_args(): t = Token('hello', b'world') nose.tools.assert_equal(t.name, 'hello') nose.tools.assert_equal(t.text, b'world') @nose.tools.raises(AttributeError) def test_token_assign_name(): t = Token('hello', b'world') t.name = 'test' @nose.tools.raises(AttributeError) def test_token_assign_text(): t = Token('hello', b'world') t.text = 'test' @nose.tools.raises(AttributeError) def test_token_assign_other(): t = Token('hello', b'world') t.blabla = 'test' class BaseLatexLexerTest(TestCase): errors = 'strict' Lexer = None def setUp(self): self.lexer = self.Lexer(errors=self.errors) def lex_it(self, latex_code, latex_tokens, final=False): if not self.lexer.binary_mode: latex_code = latex_code.decode("ascii") latex_tokens = [token.decode("ascii") for token in latex_tokens] tokens = self.lexer.get_raw_tokens(latex_code, final=final) self.assertEqual( list(token.text for token in tokens), latex_tokens) def tearDown(self): del self.lexer class LatexLexerTest(BaseLatexLexerTest): Lexer = LatexLexer def test_null(self): self.lex_it(b'', [], final=True) def test_hello(self): self.lex_it( b'hello! [#1] This \\is\\ \\^ a \ntest.\n' b' \nHey.\n\n\# x \#x', br'h|e|l|l|o|!| | |[|#1|]| |T|h|i|s| |\is|\ | | |\^| |a| ' b'|\n|t|e|s|t|.|\n| | | | |\n|H|e|y|.|\n|\n' br'|\#| |x| |\#|x'.split(b'|'), final=True ) def test_comment(self): self.lex_it( b'test% some comment\ntest', b't|e|s|t|% some comment|\n|t|e|s|t'.split(b'|'), final=True ) def test_comment_newline(self): self.lex_it( b'test% some comment\n\ntest', b't|e|s|t|% some comment|\n|\n|t|e|s|t'.split(b'|'), final=True ) def test_control(self): self.lex_it( b'\\hello\\world', b'\\hello|\\world'.split(b'|'), final=True ) def test_control_whitespace(self): self.lex_it( b'\\hello \\world ', b'\\hello| | | |\\world| | | '.split(b'|'), final=True ) def test_controlx(self): self.lex_it( b'\\#\\&', b'\\#|\\&'.split(b'|'), final=True ) def test_controlx_whitespace(self): self.lex_it( b'\\# \\& ', b'\\#| | | | |\\&| | | '.split(b'|'), final=True ) def test_buffer(self): self.lex_it( b'hi\\t', b'h|i'.split(b'|'), ) self.lex_it( b'here', [b'\\there'], final=True, ) def test_state(self): self.lex_it( b'hi\\t', b'h|i'.split(b'|'), ) state = self.lexer.getstate() self.lexer.reset() self.lex_it( b'here', b'h|e|r|e'.split(b'|'), final=True, ) self.lexer.setstate(state) self.lex_it( b'here', [b'\\there'], final=True, ) @nose.tools.raises(NotImplementedError) def test_decode(self): self.lexer.decode(b'') def test_final_backslash(self): self.lex_it( b'notsogood\\', b'n|o|t|s|o|g|o|o|d|\\'.split(b'|'), final=True ) def test_final_comment(self): self.lex_it( b'hello%', b'h|e|l|l|o|%'.split(b'|'), final=True ) def test_hash(self): self.lex_it(b'#', [b'#'], final=True) def test_tab(self): self.lex_it(b'\c\tc', b'\c|\t|c'.split(b'|'), final=True) def test_percent(self): self.lex_it(b'This is a \\% test.', b'T|h|i|s| |i|s| |a| |\\%| |t|e|s|t|.'.split(b'|'), final=True) self.lex_it(b'\\% %test', b'\\%| |%test'.split(b'|'), final=True) self.lex_it(b'\\% %test\nhi', b'\\%| |%test|\n|h|i'.split(b'|'), final=True) class UnicodeLatexLexerTest(LatexLexerTest): Lexer = UnicodeLatexLexer class BaseLatexIncrementalDecoderTest(TestCase): """Tex lexer fixture.""" errors = 'strict' IncrementalDecoder = None def setUp(self): self.lexer = self.IncrementalDecoder(self.errors) def fix(self, s): return s if self.lexer.binary_mode else s.decode("ascii") def lex_it(self, latex_code, latex_tokens, final=False): latex_code = self.fix(latex_code) latex_tokens = [self.fix(token) for token in latex_tokens] tokens = self.lexer.get_tokens(latex_code, final=final) self.assertEqual( list(token.text for token in tokens), latex_tokens) def tearDown(self): del self.lexer class LatexIncrementalDecoderTest(BaseLatexIncrementalDecoderTest): IncrementalDecoder = LatexIncrementalDecoder def test_null(self): self.lex_it(b'', [], final=True) def test_hello(self): self.lex_it( b'hello! [#1] This \\is\\ \\^ a \ntest.\n' b' \nHey.\n\n\# x \#x', br'h|e|l|l|o|!| |[|#1|]| |T|h|i|s| |\is|\ |\^|a| ' br'|t|e|s|t|.| |\par|H|e|y|.| ' br'|\par|\#| |x| |\#|x'.split(b'|'), final=True ) def test_comment(self): self.lex_it( b'test% some comment\ntest', b't|e|s|t|t|e|s|t'.split(b'|'), final=True ) def test_comment_newline(self): self.lex_it( b'test% some comment\n\ntest', b't|e|s|t|\\par|t|e|s|t'.split(b'|'), final=True ) def test_control(self): self.lex_it( b'\\hello\\world', b'\\hello|\\world'.split(b'|'), final=True ) def test_control_whitespace(self): self.lex_it( b'\\hello \\world ', b'\\hello|\\world'.split(b'|'), final=True ) def test_controlx(self): self.lex_it( b'\\#\\&', b'\\#|\\&'.split(b'|'), final=True ) def test_controlx_whitespace(self): self.lex_it( b'\\# \\& ', b'\\#| |\\&| '.split(b'|'), final=True ) def test_buffer(self): self.lex_it( b'hi\\t', b'h|i'.split(b'|'), ) self.lex_it( b'here', [b'\\there'], final=True, ) def test_buffer_decode(self): self.assertEqual( self.lexer.decode(self.fix(b'hello! [#1] This \\i')), u'hello! [#1] This ', ) self.assertEqual( self.lexer.decode(self.fix(b's\\ \\^ a \ntest.\n')), u'\\is \\ \\^a test.', ) self.assertEqual( self.lexer.decode(self.fix(b' \nHey.\n\n\# x \#x'), final=True), u' \\par Hey. \\par \\# x \\#x', ) def test_state_middle(self): self.lex_it( b'hi\\t', b'h|i'.split(b'|'), ) state = self.lexer.getstate() self.assertEqual(self.lexer.state, 'M') self.assertEqual(self.lexer.raw_buffer.name, 'control_word') self.assertEqual(self.lexer.raw_buffer.text, self.fix(b'\\t')) self.lexer.reset() self.assertEqual(self.lexer.state, 'N') self.assertEqual(self.lexer.raw_buffer.name, 'unknown') self.assertEqual(self.lexer.raw_buffer.text, self.fix(b'')) self.lex_it( b'here', b'h|e|r|e'.split(b'|'), final=True, ) self.lexer.setstate(state) self.assertEqual(self.lexer.state, 'M') self.assertEqual(self.lexer.raw_buffer.name, 'control_word') self.assertEqual(self.lexer.raw_buffer.text, self.fix(b'\\t')) self.lex_it( b'here', [b'\\there'], final=True, ) def test_state_inline_math(self): self.lex_it( b'hi$t', b'h|i|$'.split(b'|'), ) assert self.lexer.inline_math self.lex_it( b'here$', b't|h|e|r|e|$'.split(b'|'), final=True, ) assert not self.lexer.inline_math # counterintuitive? @nose.tools.raises(UnicodeDecodeError) def test_final_backslash(self): self.lex_it( b'notsogood\\', [b'notsogood'], final=True ) def test_final_comment(self): self.lex_it( b'hello%', b'h|e|l|l|o'.split(b'|'), final=True ) def test_hash(self): self.lex_it(b'#', [b'#'], final=True) def test_tab(self): self.lex_it(b'\c\tc', b'\c|c'.split(b'|'), final=True) class UnicodeLatexIncrementalDecoderTest(LatexIncrementalDecoderTest): IncrementalDecoder = UnicodeLatexIncrementalDecoder class LatexIncrementalDecoderReplaceTest(BaseLatexIncrementalDecoderTest): errors = 'replace' IncrementalDecoder = MockIncrementalDecoder def test_errors_replace(self): self.lex_it( b'helmocklo', b'?|?|?|mock|?|?'.split(b'|'), final=True ) class LatexIncrementalDecoderIgnoreTest(BaseLatexIncrementalDecoderTest): errors = 'ignore' IncrementalDecoder = MockIncrementalDecoder def test_errors_ignore(self): self.lex_it( b'helmocklo', b'mock'.split(b'|'), final=True ) class LatexIncrementalDecoderInvalidErrorTest(BaseLatexIncrementalDecoderTest): errors = '**baderror**' IncrementalDecoder = MockIncrementalDecoder @nose.tools.raises(NotImplementedError) def test_errors_invalid(self): self.lex_it( b'helmocklo', b'?|?|?|mock|?|?'.split(b'|'), final=True ) def invalid_token_test(): lexer = LatexIncrementalDecoder() # piggyback an implementation which results in invalid tokens lexer.get_raw_tokens = lambda bytes_, final: [Token('**invalid**', bytes_)] nose.tools.assert_raises(AssertionError, lambda: lexer.decode(b'hello')) def invalid_state_test_1(): lexer = LatexIncrementalDecoder() # piggyback invalid state lexer.state = '**invalid**' nose.tools.assert_raises(AssertionError, lambda: lexer.decode(b'\n\n\n')) def invalid_state_test_2(): lexer = LatexIncrementalDecoder() # piggyback invalid state lexer.state = '**invalid**' nose.tools.assert_raises(AssertionError, lambda: lexer.decode(b' ')) class LatexIncrementalLexerTest(TestCase): errors = 'strict' def setUp(self): self.lexer = LatexIncrementalLexer(errors=self.errors) def lex_it(self, latex_code, latex_tokens, final=False): tokens = self.lexer.get_tokens(latex_code, final=final) self.assertEqual( list(token.text for token in tokens), latex_tokens) def tearDown(self): del self.lexer def test_newline(self): self.lex_it( b"hello\nworld", b"h|e|l|l|o| |w|o|r|l|d".split(b'|'), final=True) def test_par(self): self.lex_it( b"hello\n\nworld", b"h|e|l|l|o| |\\par|w|o|r|l|d".split(b'|'), final=True) class LatexIncrementalEncoderTest(TestCase): """Encoder test fixture.""" errors = 'strict' IncrementalEncoder = LatexIncrementalEncoder def setUp(self): self.encoder = self.IncrementalEncoder(self.errors) def encode(self, latex_code, latex_bytes, final=False): result = self.encoder.encode(latex_code, final=final) self.assertEqual(result, latex_bytes) def tearDown(self): del self.encoder @nose.tools.raises(TypeError) def test_invalid_type(self): self.encoder.encode(object(), final=True) @nose.tools.raises(ValueError) def test_invalid_code(self): # default encoding is ascii, \u00ff is not ascii translatable self.encoder.encode(u"\u00ff", final=True) def test_hello(self): self.encode( u'hello', b'hello' if self.encoder.binary_mode else u'hello', final=True) def test_unicode_tokens(self): self.assertEqual( list(self.encoder.get_unicode_tokens( u"ĄąĄ̊ą̊ĘęĮįǪǫǬǭŲųY̨y̨", final=True)), u"Ą|ą|Ą̊|ą̊|Ę|ę|Į|į|Ǫ|ǫ|Ǭ|ǭ|Ų|ų|Y̨|y̨".split(u"|")) def test_state(self): self.assertEqual( list(self.encoder.get_unicode_tokens( u"Ą", final=False)), []) state = self.encoder.getstate() self.encoder.reset() self.assertEqual( list(self.encoder.get_unicode_tokens( u"ABC", final=True)), [u"A", u"B", u"C"]) self.encoder.setstate(state) self.assertEqual( list(self.encoder.get_unicode_tokens( u"̊", final=True)), [u"Ą̊"]) class UnicodeLatexIncrementalEncoderTest(LatexIncrementalEncoderTest): IncrementalEncoder = UnicodeLatexIncrementalEncoder def test_invalid_code(self): pass latexcodec-1.0.5/test/test_latex_codec.py0000644005105600024240000003523312770454474020466 0ustar dma0mtdma00000000000000# -*- coding: utf-8 -*- """Tests for the latex codec.""" from __future__ import print_function import codecs import nose.tools from six import text_type, binary_type, BytesIO, PY2 from unittest import TestCase import latexcodec def test_getregentry(): assert latexcodec.codec.getregentry() is not None def test_find_latex(): assert latexcodec.codec.find_latex('hello') is None def test_latex_incremental_decoder_getstate(): encoder = codecs.getincrementaldecoder('latex')() nose.tools.assert_raises(NotImplementedError, lambda: encoder.getstate()) def test_latex_incremental_decoder_setstate(): encoder = codecs.getincrementaldecoder('latex')() state = (u'', 0) nose.tools.assert_raises( NotImplementedError, lambda: encoder.setstate(state)) def split_input(input_): """Helper function for testing the incremental encoder and decoder.""" if not isinstance(input_, (text_type, binary_type)): raise TypeError("expected unicode or bytes input") if input_: for i in range(len(input_)): if i + 1 < len(input_): yield input_[i:i + 1], False else: yield input_[i:i + 1], True else: yield input_, True class TestDecoder(TestCase): """Stateless decoder tests.""" maxDiff = None def decode(self, text_utf8, text_latex, inputenc=None): """Main test function.""" encoding = 'latex+' + inputenc if inputenc else 'latex' decoded, n = codecs.getdecoder(encoding)(text_latex) self.assertEqual((decoded, n), (text_utf8, len(text_latex))) @nose.tools.raises(TypeError) def test_invalid_type(self): codecs.getdecoder("latex")(object()) @nose.tools.raises(ValueError) def test_invalid_code(self): # b'\xe9' is invalid utf-8 code self.decode(u'', b'\xe9 ', 'utf-8') def test_null(self): self.decode(u'', b'') def test_maelstrom(self): self.decode(u"mælström", br'm\ae lstr\"om') def test_maelstrom_latin1(self): self.decode(u"mælström", b'm\\ae lstr\xf6m', 'latin1') def test_laren(self): self.decode( u"© låren av björn", br'\copyright\ l\aa ren av bj\"orn') def test_laren_brackets(self): self.decode( u"© l{å}ren av bj{ö}rn", br'\copyright\ l{\aa}ren av bj{\"o}rn') def test_laren_latin1(self): self.decode( u"© låren av björn", b'\\copyright\\ l\xe5ren av bj\xf6rn', 'latin1') def test_droitcivil(self): self.decode( u"Même s'il a fait l'objet d'adaptations suite à l'évolution, " u"la transformation sociale, économique et politique du pays, " u"le code civil fran{ç}ais est aujourd'hui encore le texte " u"fondateur " u"du droit civil français mais aussi du droit civil belge " u"ainsi que " u"de plusieurs autres droits civils.", b"M\\^eme s'il a fait l'objet d'adaptations suite " b"\\`a l'\\'evolution, \nla transformation sociale, " b"\\'economique et politique du pays, \nle code civil " b"fran\\c{c}ais est aujourd'hui encore le texte fondateur \n" b"du droit civil fran\\c cais mais aussi du droit civil " b"belge ainsi que \nde plusieurs autres droits civils.", ) def test_oeuf(self): self.decode( u"D'un point de vue diététique, l'œuf apaise la faim.", br"D'un point de vue di\'et\'etique, l'\oe uf apaise la faim.", ) def test_oeuf_latin1(self): self.decode( u"D'un point de vue diététique, l'œuf apaise la faim.", b"D'un point de vue di\xe9t\xe9tique, l'\\oe uf apaise la faim.", 'latin1' ) def test_alpha(self): self.decode(u"α", b"$\\alpha$") def test_maelstrom_multibyte_encoding(self): self.decode(u"\\c öké", b'\\c \xc3\xb6k\xc3\xa9', 'utf8') def test_serafin(self): self.decode(u"Seraf{\xed}n", b"Seraf{\\'i}n") def test_astrom(self): self.decode(u"{\xc5}str{\xf6}m", b'{\\AA}str{\\"o}m') def test_space_1(self): self.decode(u"ææ", br'\ae \ae') def test_space_2(self): self.decode(u"æ æ", br'\ae\ \ae') def test_number_sign_1(self): self.decode(u"# hello", br'\#\ hello') def test_number_sign_2(self): # LaTeX does not absorb the space following '\#': # check decoding is correct self.decode(u"# hello", br'\# hello') def test_number_sign_3(self): # a single '#' is not valid LaTeX: # for the moment we ignore this error and return # unchanged self.decode(u"# hello", br'# hello') def test_underscore(self): self.decode(u"_", br'\_') def test_dz(self): self.decode(u"DZ", br'DZ') def test_newline(self): self.decode(u"hello world", b"hello\nworld") def test_par1(self): self.decode(u"hello\n\nworld", b"hello\n\nworld") def test_par2(self): self.decode(u"hello\n\nworld", b"hello\\par world") def test_par3(self): self.decode(u"hello\n\nworld", b"hello \\par world") def test_ogonek1(self): self.decode(u"ĄąĘęĮįǪǫŲų", br'\k A\k a\k E\k e\k I\k i\k O\k o\k U\k u') def test_ogonek2(self): # note: should decode into u"Ǭǭ" but can't support this yet... self.decode(u"\\textogonekcentered {Ō}\\textogonekcentered {ō}", br'\textogonekcentered{\=O}\textogonekcentered{\=o}') def test_math_spacing_dollar(self): self.decode(u'This is a ψ test.', br'This is a $\psi$ test.') def test_math_spacing_brace(self): self.decode(u'This is a ψ test.', br'This is a \(\psi\) test.') def test_double_math(self): # currently no attempt to translate maths inside $$ self.decode(u'This is a $$\psi $$ test.', br'This is a $$\psi$$ test.') def test_tilde(self): self.decode(u'This is a ˜, ˷, ∼ and ~test.', (br'This is a \~{}, \texttildelow, ' br'$\sim$ and \textasciitilde test.')) def test_backslash(self): self.decode(u'This is a \\ \\test.', br'This is a $\backslash$ \textbackslash test.') def test_percent(self): self.decode(u'This is a % test.', br'This is a \% test.') class TestStreamDecoder(TestDecoder): """Stream decoder tests.""" def decode(self, text_utf8, text_latex, inputenc=None): encoding = 'latex+' + inputenc if inputenc else 'latex' stream = BytesIO(text_latex) reader = codecs.getreader(encoding)(stream) self.assertEqual(text_utf8, reader.read()) # in this test, BytesIO(object()) is eventually called # this is valid on Python 2, so we skip this test there def test_invalid_type(self): if PY2: raise nose.plugins.skip.SkipTest else: TestDecoder.test_invalid_type(self) class TestIncrementalDecoder(TestDecoder): """Incremental decoder tests.""" def decode(self, text_utf8, text_latex, inputenc=None): encoding = 'latex+' + inputenc if inputenc else 'latex' decoder = codecs.getincrementaldecoder(encoding)() decoded_parts = ( decoder.decode(text_latex_part, final) for text_latex_part, final in split_input(text_latex)) self.assertEqual(text_utf8, u''.join(decoded_parts)) class TestEncoder(TestCase): """Stateless encoder tests.""" def encode(self, text_utf8, text_latex, inputenc=None, errors='strict'): """Main test function.""" encoding = 'latex+' + inputenc if inputenc else 'latex' encoded, n = codecs.getencoder(encoding)(text_utf8, errors=errors) self.assertEqual((encoded, n), (text_latex, len(text_utf8))) @nose.tools.raises(TypeError) def test_invalid_type(self): codecs.getencoder("latex")(object()) # note concerning test_invalid_code_* methods: # u'\u2328' (0x2328 = 9000) is unicode for keyboard symbol # we currently provide no translation for this into LaTeX code @nose.tools.raises(ValueError) def test_invalid_code_strict(self): self.encode(u'\u2328', b'', 'ascii', 'strict') def test_invalid_code_ignore(self): self.encode(u'\u2328', b'', 'ascii', 'ignore') def test_invalid_code_replace(self): self.encode(u'\u2328', b'{\\char9000}', 'ascii', 'replace') @nose.tools.raises(ValueError) def test_invalid_code_baderror(self): self.encode(u'\u2328', b'', 'ascii', '**baderror**') def test_null(self): self.encode(u'', b'') def test_maelstrom(self): self.encode(u"mælström", br'm\ae lstr\"om') def test_maelstrom_latin1(self): self.encode(u"mælström", b'm\xe6lstr\xf6m', 'latin1') def test_laren(self): self.encode( u"© låren av björn", br'\copyright\ l\aa ren av bj\"orn') def test_laren_latin1(self): self.encode( u"© låren av björn", b'\xa9 l\xe5ren av bj\xf6rn', 'latin1') def test_droitcivil(self): self.encode( u"Même s'il a fait l'objet d'adaptations suite à l'évolution, \n" u"la transformation sociale, économique et politique du pays, \n" u"le code civil fran{ç}ais est aujourd'hui encore le texte " u"fondateur \n" u"du droit civil français mais aussi du droit civil belge " u"ainsi que \n" u"de plusieurs autres droits civils.", b"M\\^eme s'il a fait l'objet d'adaptations suite " b"\\`a l'\\'evolution, \nla transformation sociale, " b"\\'economique et politique du pays, \nle code civil " b"fran{\\c c}ais est aujourd'hui encore le texte fondateur \n" b"du droit civil fran\\c cais mais aussi du droit civil " b"belge ainsi que \nde plusieurs autres droits civils.", ) def test_oeuf(self): self.encode( u"D'un point de vue diététique, l'œuf apaise la faim.", br"D'un point de vue di\'et\'etique, l'\oe uf apaise la faim.", ) def test_oeuf_latin1(self): self.encode( u"D'un point de vue diététique, l'œuf apaise la faim.", b"D'un point de vue di\xe9t\xe9tique, l'\\oe uf apaise la faim.", 'latin1' ) def test_alpha(self): self.encode(u"α", b"$\\alpha$") def test_serafin(self): self.encode(u"Seraf{\xed}n", b"Seraf{\\'\\i }n") def test_space_1(self): self.encode(u"ææ", br'\ae \ae') def test_space_2(self): self.encode(u"æ æ", br'\ae\ \ae') def test_number_sign(self): # note: no need for control space after \# self.encode(u"# hello", br'\# hello') def test_underscore(self): self.encode(u"_", br'\_') def test_dz1(self): self.encode(u"DZ", br'DZ') def test_dz2(self): self.encode(u"DZ", br'DZ') def test_newline(self): self.encode(u"hello\nworld", b"hello\nworld") def test_par1(self): self.encode(u"hello\n\nworld", b"hello\n\nworld") def test_par2(self): self.encode(u"hello\\par world", b"hello\\par world") def test_ogonek1(self): self.encode(u"ĄąĘęĮįǪǫŲų", br'\k A\k a\k E\k e\k I\k i\k O\k o\k U\k u') def test_ogonek2(self): self.encode(u"Ǭǭ", br'\textogonekcentered{\=O}\textogonekcentered{\=o}') def test_math_spacing(self): self.encode(u'This is a ψ test.', br'This is a $\psi$ test.') def test_double_math(self): # currently no attempt to translate maths inside $$ self.encode(u'This is a $$\psi$$ test.', br'This is a $$\psi$$ test.') def test_tilde(self): self.encode(u'This is a ˜, ˷, ∼ and ~test.', (br'This is a \~{}, \texttildelow , ' br'$\sim$ and \textasciitilde test.')) def test_percent(self): self.encode(u'This is a % test.', br'This is a \% test.') class TestStreamEncoder(TestEncoder): """Stream encoder tests.""" def encode(self, text_utf8, text_latex, inputenc=None, errors='strict'): encoding = 'latex+' + inputenc if inputenc else 'latex' stream = BytesIO() writer = codecs.getwriter(encoding)(stream, errors=errors) writer.write(text_utf8) self.assertEqual(text_latex, stream.getvalue()) class TestIncrementalEncoder(TestEncoder): """Incremental encoder tests.""" def encode(self, text_utf8, text_latex, inputenc=None, errors='strict'): encoding = 'latex+' + inputenc if inputenc else 'latex' encoder = codecs.getincrementalencoder(encoding)(errors=errors) encoded_parts = ( encoder.encode(text_utf8_part, final) for text_utf8_part, final in split_input(text_utf8)) self.assertEqual(text_latex, b''.join(encoded_parts)) class TestUnicodeDecoder(TestDecoder): def decode(self, text_utf8, text_latex, inputenc=None): """Main test function.""" text_latex = text_latex.decode(inputenc if inputenc else "ascii") decoded, n = codecs.getdecoder('ulatex')(text_latex) self.assertEqual((decoded, n), (text_utf8, len(text_latex))) class TestUnicodeEncoder(TestEncoder): def encode(self, text_utf8, text_latex, inputenc=None, errors='strict'): """Main test function.""" encoding = 'ulatex+' + inputenc if inputenc else 'ulatex' text_latex = text_latex.decode(inputenc if inputenc else 'ascii') encoded, n = codecs.getencoder(encoding)(text_utf8, errors=errors) self.assertEqual((encoded, n), (text_latex, len(text_utf8))) def uencode(self, text_utf8, text_ulatex, inputenc=None, errors='strict'): """Main test function.""" encoding = 'ulatex+' + inputenc if inputenc else 'ulatex' encoded, n = codecs.getencoder(encoding)(text_utf8, errors=errors) self.assertEqual((encoded, n), (text_ulatex, len(text_utf8))) def test_ulatex_ascii(self): self.uencode(u'# ψ', u'\# $\psi$', 'ascii') def test_ulatex_utf8(self): self.uencode(u'# ψ', u'\# ψ', 'utf8') # the following tests rely on the fact that \u2328 is not in our # translation table @nose.tools.raises(ValueError) def test_ulatex_ascii_invalid(self): self.uencode(u'# \u2328', u'', 'ascii') def test_ulatex_utf8_invalid(self): self.uencode(u'# ψ \u2328', u'\# ψ \u2328', 'utf8') def test_invalid_code_keep(self): self.uencode(u'# ψ \u2328', u'\# $\psi$ \u2328', 'ascii', 'keep')