pyth-0.5.6/0000755000175000017500000000000011436501227012667 5ustar brendonhbrendonhpyth-0.5.6/PKG-INFO0000644000175000017500000000517411436501227013773 0ustar brendonhbrendonhMetadata-Version: 1.0 Name: pyth Version: 0.5.6 Summary: Python text markup and conversion Home-page: http://wiki.github.com/brendonh/pyth Author: Brendon Hogger Author-email: brendonh@taizilla.com License: UNKNOWN Description: ======================================== pyth - Python text markup and conversion ======================================== Pyth is intended to make it easy to convert marked-up text between different common formats. *Marked-up text* means text which has: * Paragraphs * Headings * Bold, italic, and underlined text * Hyperlinks * Bullet lists * Simple tables * Very little else Formats I initially want to support are: * xhtml * rtf * pdf (output) These three formats cover web, Word / OpenOffice, and print. Design principles ================= * Ignore unsupported information in input formats (e.g. page layout) * Ignore font issues -- output in a single font. * Ignore specific text sizes -- support relative sizes (bigger, littler) only. Output in a single base size. * Have no dependencies unless they are written in Python, and work * Make it easy to add support for new formats, by using an architecture based on *plugins* and *adapters*. Examples ======== See http://github.com/brendonh/pyth/tree/master/examples/ Unit tests ========== The sources contains some unit tests (written using python unittest module) in the 'tests' directory. To run the tests we can either run them individually as python script, either use `python nose`_. If using nose then we just need to go into the tests directory and invoke nosetest from there (make sure that pyth module is in PYTHONPATH). .. 
_python nose: http://code.google.com/p/python-nose/ Platform: UNKNOWN Classifier: Development Status :: 5 - Production/Stable Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: MIT License Classifier: Programming Language :: Python :: 2.5 Classifier: Programming Language :: Python :: 2.6 Classifier: Topic :: Office/Business Classifier: Topic :: Software Development :: Libraries Classifier: Topic :: Text Editors :: Word Processors Classifier: Topic :: Text Processing Classifier: Topic :: Text Processing :: Markup Classifier: Topic :: Text Processing :: Filters pyth-0.5.6/examples/0000755000175000017500000000000011436501227014505 5ustar brendonhbrendonhpyth-0.5.6/examples/writing/0000755000175000017500000000000011436501227016170 5ustar brendonhbrendonhpyth-0.5.6/examples/writing/rst.py0000644000175000017500000000024411240464667017363 0ustar brendonhbrendonhfrom pyth.plugins.rst.writer import RSTWriter import pythonDoc if __name__ == "__main__": doc = pythonDoc.buildDoc() print RSTWriter.write(doc).getvalue() pyth-0.5.6/examples/writing/rtf15.py0000644000175000000000000000021011240225655016644 0ustar brendonhrootfrom pyth.plugins.rtf15.writer import Rtf15Writer import pythonDoc doc = pythonDoc.buildDoc() print Rtf15Writer.write(doc).getvalue() pyth-0.5.6/examples/writing/xhtml.py0000644000175000000000000000077711302452216017053 0ustar brendonhrootfrom pyth.plugins.xhtml.writer import XHTMLWriter import pythonDoc docTemplate = ''' Pyth document %s ''' if __name__ == "__main__": doc = pythonDoc.buildDoc() print docTemplate % XHTMLWriter.write(doc, pretty=True).getvalue() pyth-0.5.6/examples/writing/pdf.py0000644000175000017500000000165511347236427017332 0ustar brendonhbrendonh# -*- coding: utf-8 -*- from pyth.plugins.rtf15.reader import Rtf15Reader from pyth.plugins.pdf.writer import PDFWriter doc = Rtf15Reader.read(open("../reading/sample.rtf")) fontfile = "/usr/share/fonts/truetype/msttcorefonts/Times_New_Roman%s.ttf" from 
reportlab.lib.styles import getSampleStyleSheet from reportlab.pdfbase.pdfmetrics import registerFont, registerFontFamily from reportlab.pdfbase.ttfonts import TTFont registerFont(TTFont('TimesTTF', fontfile % "")) registerFont(TTFont('TimesTTF_B', fontfile % "_Bold")) registerFont(TTFont('TimesTTF_I', fontfile % "_Italic")) registerFont(TTFont('TimesTTF_BI', fontfile % "_Bold_Italic")) registerFontFamily("TimesTTF", normal="TimesTTF", bold="TimesTTF_B", italic="TimesTTF_I", boldItalic="TimesTTF_BI") stylesheet = getSampleStyleSheet() paragraphStyle = stylesheet['Normal'] paragraphStyle.fontName = "TimesTTF" PDFWriter.write(doc, open("output.pdf", "wb"), paragraphStyle) pyth-0.5.6/examples/writing/plaintext.py0000644000175000000000000000022411240225655017720 0ustar brendonhrootfrom pyth.plugins.plaintext.writer import PlaintextWriter import pythonDoc doc = pythonDoc.buildDoc() print PlaintextWriter.write(doc).getvalue() pyth-0.5.6/examples/writing/pythonDoc.py0000644000175000000000000000153311347221026017657 0ustar brendonhroot# -*- coding: utf-8 -*- from pyth.plugins.python.reader import * def buildDoc(): return PythonReader.read(( P [ T(ITALIC, BOLD) [ u"Hello World" ], u", hee hee hee! ", T(url=u'http://www.google.com') [ u"This seems to work" ] ], L [ [unicode(word) for word in ("One", "Two", "Three", "Four")] ], L [ u"Introduction", LE [ u"First sentence in the\nsub-section", u"Also some other stuff", L [ u"Alpha", L [ u"Beta\nWhomble", LE [ u"Beta", u"Whoop\nWhoa" ], u"Beta", ], u"Gamma", u"Gamma", ], u"Final sentence in the sub-section", ], T(BOLD) [ u"Conclusion" ], ], u"That's all, folks! 再見!" 
)) pyth-0.5.6/examples/writing/latex.py0000644000175000017500000000025211240464667017667 0ustar brendonhbrendonhfrom pyth.plugins.latex.writer import LatexWriter import pythonDoc if __name__ == "__main__": doc = pythonDoc.buildDoc() print LatexWriter.write(doc).getvalue() pyth-0.5.6/examples/reading/0000755000175000017500000000000011436501227016116 5ustar brendonhbrendonhpyth-0.5.6/examples/reading/rtf15.py0000644000175000000000000000044411436477123016611 0ustar brendonhrootfrom pyth.plugins.rtf15.reader import Rtf15Reader from pyth.plugins.xhtml.writer import XHTMLWriter import sys if len(sys.argv) > 1: filename = sys.argv[1] else: filename = "sample.rtf" doc = Rtf15Reader.read(open(filename, "rb")) print XHTMLWriter.write(doc, pretty=True).read() pyth-0.5.6/examples/reading/xhtml.py0000644000175000017500000000246411302452414017625 0ustar brendonhbrendonh# -*- coding: utf-8 -*- from pyth.plugins.xhtml.reader import XHTMLReader from pyth.plugins.xhtml.writer import XHTMLWriter import xhtml from cStringIO import StringIO # A simple xhtml document with limited features. content = StringIO(r"""

Simple document

this document has no hypertext links yet.

bold text. italic text.

bold text from css style this is bold and italic

this is bold too

unicode characters : 你好

bold too

example super example sub

a list

a link single space here.
a br tag

""") css = """ .important {font-weight: bold} p.bold {font-weight: bold} .other {font-weight: normal; color: blue} """ if __name__ == '__main__': # Parse the document and then reconstruct it using the xhtml # writer. doc = XHTMLReader.read(content, css) print XHTMLWriter.write(doc).getvalue() pyth-0.5.6/tests/0000755000175000017500000000000011436501227014031 5ustar brendonhbrendonhpyth-0.5.6/tests/test_writelatex.py0000644000175000017500000000302211253467130017630 0ustar brendonhbrendonh""" unit tests of the latex writer """ import unittest import subprocess import tempfile import os import sys import BeautifulSoup from pyth.plugins.latex.writer import LatexWriter from pyth.plugins.python.reader import * class TestWriteLatex(unittest.TestCase): def test_basic(self): """ Try to create an empty latex document """ doc = PythonReader.read([]) latex = LatexWriter.write(doc).getvalue() def test_paragraph(self): """ Try a single paragraph document """ doc = PythonReader.read(P[u"the text"]) latex = LatexWriter.write(doc).getvalue() assert "the text" in latex def test_bold(self): doc = PythonReader.read([P[T(BOLD)[u"bold text"]]]) latex = LatexWriter.write(doc).getvalue() assert r"\textbf{bold text}" in latex, latex def test_italic(self): doc = PythonReader.read([P[T(ITALIC)[u"italic text"]]]) latex = LatexWriter.write(doc).getvalue() assert r"\emph{italic text}" in latex, latex def test_metadata(self): """ assert that the document metadata are added into the latex file """ doc = PythonReader.read([]) doc["author"] = "The Author" doc["subject"] = "The Subject" doc["title"] = "The Title" latex = LatexWriter.write(doc).getvalue() assert "pdfauthor={The Author}" in latex, latex assert "pdfsubject={The Subject}" in latex, latex assert "pdftitle={The Title}" in latex, latex pyth-0.5.6/tests/test_writepdf.py0000644000175000017500000000550611253467130017275 0ustar brendonhbrendonh""" unit tests of the pdf writer """ import unittest import subprocess import tempfile import os 
import sys import BeautifulSoup from pyth.plugins.pdf.writer import PDFWriter from pyth.plugins.python.reader import * class TestWritePDF(unittest.TestCase): def pdf_to_html(self, pdf): """ We are using pdftohtml to convert the pdf document to an html document. Since it is difficult to check a pdf document, this allow us to first convert it into html, and then perform the checks on this html document. """ # pdftohtml needs its input from a file so we first save the # pdf into a temporary file. _, filename = tempfile.mkstemp(suffix='.pdf') file = open(filename, "wb") try: file.write(pdf) file.close() command = ["pdftohtml", "-stdout", filename] try: proc = subprocess.Popen(command, stdout=subprocess.PIPE) except OSError: print "Make sure that pdftohtml is installed" raise ret = proc.communicate()[0] return ret finally: # Make sure to remove the tmp file file.close() os.remove(filename) def test_basic(self): """ Try to create an empty pdf document """ doc = PythonReader.read([]) pdf = PDFWriter.write(doc).getvalue() html = self.pdf_to_html(pdf) def test_paragraph(self): """ Try a simple document with one paragraph """ doc = PythonReader.read(P[u"the text"]) pdf = PDFWriter.write(doc).getvalue() html = self.pdf_to_html(pdf) assert "the text" in html def test_bold(self): doc = PythonReader.read([P[T(BOLD)[u"bold text"]]]) pdf = PDFWriter.write(doc).getvalue() html = self.pdf_to_html(pdf) soup = BeautifulSoup.BeautifulSoup(html) node = soup.find("b") assert node assert node.string == "bold text" def test_italic(self): doc = PythonReader.read([P[T(ITALIC)[u"italic text"]]]) pdf = PDFWriter.write(doc).getvalue() html = self.pdf_to_html(pdf) soup = BeautifulSoup.BeautifulSoup(html) node = soup.find("i") assert node assert node.string == "italic text" def test_latex(self): doc = PythonReader.read(P[u"the-text"]) pdf = PDFWriter.write(doc, method='latex').getvalue() html = self.pdf_to_html(pdf) assert "the-text" in html, html def test_rst(self): doc = 
PythonReader.read(P[u"the-text"]) pdf = PDFWriter.write(doc, method='rst').getvalue() print pdf html = self.pdf_to_html(pdf) assert "the-text" in html, html if __name__ == '__main__': unittest.main() pyth-0.5.6/tests/test_readxhtml.py0000644000175000017500000000463311253467130017441 0ustar brendonhbrendonh """ Unit tests of the xhtml reader. """ import unittest import pyth.document from pyth.plugins.xhtml.reader import XHTMLReader class TestReadXHTML(unittest.TestCase): def test_basic(self): """ Try to read an empty xhtml document """ xhtml = "
" doc = XHTMLReader.read(xhtml) self.assert_(isinstance(doc, pyth.document.Document)) self.assert_(not doc.content) def test_paragraphs(self): """ Try to read a simple xhtml document containing tree paragraphs """ xhtml = "

p0

p1

p2

" doc = XHTMLReader.read(xhtml) self.assert_(len(doc.content) == 3) for i, p in enumerate(doc.content): self.assert_(isinstance(p, pyth.document.Paragraph)) self.assert_(len(p.content) == 1) self.assert_(isinstance(p.content[0], pyth.document.Text)) text = p.content[0] self.assert_(len(text.content) == 1) self.assert_(text.content[0] == 'p%d' % i) def test_bold(self): """ Try to read a paragraph containing bold text """ xhtml = "

bold

" doc = XHTMLReader.read(xhtml) text = doc.content[0].content[0] assert text['bold'] def test_italic(self): """ Try to read a paragraph containing italic text """ xhtml = "

italic

" doc = XHTMLReader.read(xhtml) text = doc.content[0].content[0] assert text['italic'] def test_sub(self): """ Try to read a paragraph containing subscript """ xhtml = "

sub

" doc = XHTMLReader.read(xhtml) text = doc.content[0].content[0] assert text['sub'] def test_sup(self): """ Try to read a paragraph containing supscript """ xhtml = "

super

" doc = XHTMLReader.read(xhtml) text = doc.content[0].content[0] assert text['super'] def test_url(self): """ Try to read a paragraph containing an url """ xhtml = '

link

' doc = XHTMLReader.read(xhtml) text = doc.content[0].content[0] assert text['url'] == "http://google.com" if __name__ == '__main__': unittest.main() pyth-0.5.6/setup.cfg0000644000175000017500000000007311436501227014510 0ustar brendonhbrendonh[egg_info] tag_build = tag_date = 0 tag_svn_revision = 0 pyth-0.5.6/pyth/0000755000175000017500000000000011436501227013653 5ustar brendonhbrendonhpyth-0.5.6/pyth/format.py0000644000175000000000000000165411240225655014673 0ustar brendonhroot""" Stuff for format implementations to subclass / use. """ class PythReader(object): """ Base class for all Pyth readers. Readers must implement these methods. """ @classmethod def read(self, source): """ source: An object to read the document from. Usually (but not necessarily) a file object. Returns: A pyth.document.Document object. """ pass class PythWriter(object): """ Base class for all Pyth writers. Writers must implement these methods. """ @classmethod def write(self, document, target=None): """ document: An instance of pyth.document.Document target: An object to write the document to. Usually (but not necessarily) a file object. 
If target is None, return something sensible (like a StringIO object) Returns: The target object """ pass pyth-0.5.6/pyth/document.py0000644000175000017500000000614611302217257016051 0ustar brendonhbrendonh""" Abstract document representation """ class _PythBase(object): def __init__(self, properties={}, content=[]): self.properties = {} self.content = [] for (k,v) in properties.iteritems(): self[k] = v for item in content: self.append(item) def __setitem__(self, key, value): if key not in self.validProperties: raise ValueError("Invalid %s property: %s" % (self.__class__.__name__, repr(key))) self.properties[key] = value def __getitem__(self, key): if key not in self.validProperties: raise ValueError("Invalid %s property: %s" % (self.__class__.__name__, repr(key))) return self.properties.get(key) def append(self, item): """ Try to add an item to this element. If the item is of the wrong type, and if this element has a sub-type, then try to create such a sub-type and insert the item into that, instead. This happens recursively, so (in python-markup): L [ u'Foo' ] actually creates: L [ LE [ P [ T [ u'Foo' ] ] ] ] If that doesn't work, raise a TypeError. """ okay = True if not isinstance(item, self.contentType): if hasattr(self.contentType, 'contentType'): try: item = self.contentType(content=[item]) except TypeError: okay = False else: okay = False if not okay: raise TypeError("Wrong content type for %s: %s (%s)" % ( self.__class__.__name__, repr(type(item)), repr(item))) self.content.append(item) class Text(_PythBase): """ Text runs are strings of text with markup properties, like 'bold' or 'italic' (or 'hyperlink to ...'). They are rendered inline (not as blocks). They do not inherit their properties from anything. 
""" validProperties = ('bold', 'italic', 'underline', 'url', 'sub', 'super') contentType = unicode def __repr__(self): return "Text('%s' %s)" % ("".join("[%s]" % r.encode("utf-8") for r in self.content), self.properties) class Paragraph(_PythBase): """ Paragraphs contain zero or more text runs. They cannot contain other paragraphs (but see List). They have no text markup properties, but may have rendering properties (e.g. margins) """ validProperties = () contentType = Text class ListEntry(_PythBase): """ A list of paragraphs representing one item in a list """ validProperties = () contentType = Paragraph class List(Paragraph): """ A list of paragraphs which will be rendered as a bullet list. A List is a Paragraph, so Lists can be nested. """ validProperties = () contentType = ListEntry class Document(_PythBase): """ Top-level item. One document is exactly one file. Documents consist of a list of paragraphs. """ validProperties = ('title', 'subject', 'author') contentType = Paragraph pyth-0.5.6/pyth/errors.py0000644000175000017500000000005311323331657015542 0ustar brendonhbrendonhclass WrongFileType(ValueError): pass pyth-0.5.6/pyth/encodings/0000755000175000017500000000000011436501227015624 5ustar brendonhbrendonhpyth-0.5.6/pyth/encodings/symbol.py0000644000175000017500000001014011436476373017514 0ustar brendonhbrendonh""" Maps Symbol typeface to Unicode, extracted from http://en.wikipedia.org/wiki/Symbol_(typeface) """ import codecs decodeTable = { 33: 33, 34: 8704, 35: 35, 36: 8707, 37: 37, 38: 38, 39: 8717, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 8773, 65: 913, 66: 914, 67: 935, 68: 916, 69: 917, 70: 934, 71: 915, 72: 919, 73: 921, 74: 977, 75: 922, 76: 923, 77: 924, 78: 925, 79: 927, 80: 928, 81: 920, 82: 929, 83: 931, 84: 932, 85: 933, 86: 962, 87: 937, 88: 926, 89: 936, 90: 918, 91: 91, 92: 8756, 93: 93, 
94: 8869, 95: 95, 96: 63717, 97: 945, 98: 946, 99: 967, 100: 948, 101: 949, 102: 966, 103: 947, 104: 951, 105: 953, 106: 981, 107: 954, 108: 955, 109: 956, 110: 957, 111: 959, 112: 960, 113: 952, 114: 961, 115: 963, 116: 964, 117: 965, 118: 982, 119: 969, 120: 958, 121: 968, 122: 950, 123: 123, 124: 124, 125: 125, 126: 126, 160: 8364, 161: 978, 162: 697, 163: 8804, 164: 8260, 165: 8734, 166: 402, 167: 9827, 168: 9830, 169: 9829, 170: 9824, 171: 8596, 172: 8592, 173: 8593, 174: 8594, 175: 8595, 176: 176, 177: 177, 178: 698, 179: 8805, 180: 215, 181: 8733, 182: 8706, 183: 8226, 184: 247, 185: 8800, 186: 8801, 187: 8776, 188: 8230, 189: 9168, 190: 9135, 191: 8629, 192: 8501, 193: 8465, 194: 8476, 195: 8472, 196: 8855, 197: 8853, 198: 8709, 199: 8745, 200: 8746, 201: 8835, 202: 8839, 203: 8836, 204: 8834, 205: 8838, 206: 8712, 207: 8713, 208: 8736, 209: 8711, 210: 174, 211: 169, 212: 8482, 213: 8719, 214: 8730, 215: 8901, 216: 172, 217: 8743, 218: 8744, 219: 8660, 220: 8656, 221: 8657, 222: 8658, 223: 8659, 224: 9674, 225: 12296, 226: 174, 227: 169, 228: 8482, 229: 8721, 230: 9115, 231: 9116, 232: 9117, 233: 9121, 234: 9122, 235: 9123, 236: 9127, 237: 9128, 238: 9129, 239: 9130, 241: 12297, 242: 8747, 243: 8992, 244: 9134, 245: 8993, 246: 9118, 247: 9119, 248: 9120, 249: 9124, 250: 9125, 251: 9126, 252: 9131, 253: 9132, 254: 9133} encodeTable = dict((v, k) for (k, v) in decodeTable.iteritems()) ERROR_STRING = "Ordinal not in range (255)" def symbol_decode(input, errors='strict'): chars = [] for (i, c) in enumerate(input): try: chars.append(decodeTable[ord(c)]) except KeyError: if errors == 'replace': chars.append(ord(u'?')) else: raise UnicodeDecodeError("symbol", input, i, i+1, ERROR_STRING) return (u"".join(map(unichr, chars)), len(input)) def symbol_encode(input, errors='strict'): chars = [] for (i, c) in enumerate(input): try: chars.append(encodeTable[ord(c)]) except KeyError: if errors == 'replace': chars.append(ord('?')) else: raise UnicodeEncodeError("symbol", 
input, i, i+1, ERROR_STRING) return ("".join(map(chr, chars)), len(input)) ### Codec APIs class Codec(codecs.Codec): def encode(self, input,errors='strict'): return symbol_encode(input, errors) def decode(self, input,errors='strict'): return symbol_decode(input, errors) class IncrementalEncoder(codecs.IncrementalEncoder): def encode(self, input, final=False): try: return symbol_encode(input)[0] except UnicodeEncodeError: raise ValueError(ERROR_STRING) class IncrementalDecoder(codecs.IncrementalDecoder): def decode(self, input, final=False): try: return symbol_decode(input)[0] except UnicodeDecodeError: raise ValueError(ERROR_STRING) class StreamWriter(Codec,codecs.StreamWriter): pass class StreamReader(Codec,codecs.StreamReader): pass ### encodings module API info = codecs.CodecInfo( name='symbol', encode=symbol_encode, decode=symbol_decode, incrementalencoder=IncrementalEncoder, incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, ) def search(name): # What the hell is this actually supposed to do? 
return info codecs.register(search) pyth-0.5.6/pyth/encodings/__init__.py0000644000175000017500000000003111436473612017735 0ustar brendonhbrendonh""" Custom encodings """ pyth-0.5.6/pyth/__init__.py0000644000175000017500000000222011436500772015764 0ustar brendonhbrendonh""" Pyth -- Python text markup and conversion """ import os.path __version__ = '0.5.6' writerMap = { '.rtf': 'pyth.plugins.rtf15.writer.Rtf15Writer', '.html': 'pyth.plugins.xhtml.writer.XHTMLWriter', '.xhtml': 'pyth.plugins.xhtml.writer.XHTMLWriter', '.txt': 'pyth.plugins.plaintext.writer.PlaintextWriter', '.pdf': 'pyth.plugins.pdf.writer.PDFWriter', } mimeMap = { '.rtf': 'application/rtf', '.html': 'text/html', '.xhtml': 'application/xhtml+xml', '.txt': 'text/plain', } def write(doc, filename): ext = os.path.splitext(filename)[1] writer = namedObject(writerMap[ext]) buff = writer.write(doc) buff.seek(0) return (buff, mimeMap[ext]) # Stolen from twisted.python.reflect def namedModule(name): """Return a module given its name.""" topLevel = __import__(name) packages = name.split(".")[1:] m = topLevel for p in packages: m = getattr(m, p) return m def namedObject(name): """Get a fully named module-global object. 
""" classSplit = name.split('.') module = namedModule('.'.join(classSplit[:-1])) return getattr(module, classSplit[-1]) pyth-0.5.6/pyth/plugins/0000755000175000017500000000000011436501227015334 5ustar brendonhbrendonhpyth-0.5.6/pyth/plugins/pdf/0000755000175000017500000000000011436501227016105 5ustar brendonhbrendonhpyth-0.5.6/pyth/plugins/pdf/writer.py0000644000175000017500000000650711347241214020001 0ustar brendonhbrendonh""" Render documents as Reportlab PDF stories """ from cStringIO import StringIO import cgi # For escape() from pyth import document from pyth.format import PythWriter from reportlab.platypus import SimpleDocTemplate, Paragraph from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle from reportlab.lib.pagesizes import A4 from reportlab.lib.units import inch _tagNames = {'bold': 'b', 'italic': 'i', 'underline': 'u', 'sub': 'sub', 'super': 'super'} LIST_INDENT = 0.3 * inch BULLET_INDENT = 0.2 * inch DEFAULT_PARA_SPACE = 0.2 * inch BULLET_TEXT = "\xe2\x80\xa2" class PDFWriter(PythWriter): @classmethod def write(klass, document, target=None, paragraphStyle=None): writer = PDFWriter(document, paragraphStyle) story = writer.go() if target is None: target = StringIO() doc = SimpleDocTemplate(target) doc.build(story) return target def __init__(self, doc, paragraphStyle=None): self.document = doc if paragraphStyle is None: stylesheet = getSampleStyleSheet() paragraphStyle = stylesheet['Normal'] self.paragraphStyle = paragraphStyle self.paragraphStyle.spaceAfter = 0.2 * inch self.paragraphDispatch = { document.List: self._list, document.Paragraph: self._paragraph} def go(self): self.paragraphs = [] for para in self.document.content: self._dispatch(para) return self.paragraphs def _dispatch(self, para, level=0, **kw): handler = self.paragraphDispatch[type(para)] return handler(para, level=level, **kw) def _paragraph(self, paragraph, level=0, bulletText=None): text = u"".join(self._text(t) for t in paragraph.content) 
self.paragraphs.append(Paragraph(text, self.paragraphStyle, bulletText=bulletText)) def _text(self, text): content = cgi.escape(u"".join(text.content)) tags = [] for prop, value in text.properties.items(): if prop == "url": tags.append((u'' % value, u"")) if prop in _tagNames: tag = _tagNames[prop] tags.append((u"<%s>" % tag, u"" % tag)) open_tags = u"".join(tag[0] for tag in tags) close_tags = u"".join(tag[1] for tag in reversed(tags)) return u"%s%s%s" % (open_tags, content, close_tags) def _list(self, plist, level=0, bulletText=None): for entry in plist.content: self._list_entry(entry, level=level+1) def _list_entry(self, entry, level): first = True prevStyle = self.paragraphStyle self.paragraphStyle = ParagraphStyle("ListStyle", self.paragraphStyle) for para in entry.content: if first: bullet = BULLET_TEXT self.paragraphStyle.leftIndent = LIST_INDENT * level self.paragraphStyle.bulletIndent = (LIST_INDENT * level - 1) + BULLET_INDENT else: bullet = None self.paragraphStyle.leftIndent = LIST_INDENT * (level + 1) self._dispatch(para, level=level, bulletText=bullet) first = False self.paragraphStyle = prevStyle pyth-0.5.6/pyth/plugins/pdf/__init__.py0000644000175000017500000000001411240464667020222 0ustar brendonhbrendonh""" PDF """ pyth-0.5.6/pyth/plugins/latex/0000755000175000017500000000000011436501227016451 5ustar brendonhbrendonhpyth-0.5.6/pyth/plugins/latex/writer.py0000644000175000017500000000470611253467130020347 0ustar brendonhbrendonh""" Render documents as latex. For the moment we generate the latex document from the reStructuredText writer output. """ from cStringIO import StringIO import docutils.core from pyth import document from pyth.format import PythWriter from pyth.plugins.rst.writer import RSTWriter class LatexWriter(PythWriter): @classmethod def write(klass, document, target=None, stylesheet=""): """ convert a pyth document to a latex document we can specify a stylesheet as a latex document fragment that will be inserted after the headers. 
This way we can override the default style. """ writer = LatexWriter(document, target, stylesheet) return writer.go() def __init__(self, doc, target=None, stylesheet=""): """Create a writer that produce a latex document we can specify a stylesheet as a latex document fragment that will be inserted after the headers. This way we can override the default style. """ self.document = doc self.stylesheet = stylesheet self.target = target if target is not None else StringIO() @property def full_stylesheet(self): """ Return the style sheet that will ultimately be inserted into the latex document. This is the user given style sheet plus some additional parts to add the meta data. """ latex_fragment = r""" \usepackage[colorlinks=true,linkcolor=blue,urlcolor=blue]{hyperref} \hypersetup{ pdftitle={%s}, pdfauthor={%s}, pdfsubject={%s} } """ % (self.document.properties.get("title"), self.document.properties.get("author"), self.document.properties.get("subject")) return latex_fragment + self.stylesheet def go(self): rst = RSTWriter.write(self.document).getvalue() settings = dict(input_encoding="UTF-8", output_encoding="UTF-8", stylesheet="stylesheet.tex") latex = docutils.core.publish_string(rst, writer_name="latex", settings_overrides=settings) # We don't want to keep an \input command in the latex file latex = latex.replace(r"\input{stylesheet.tex}", self.full_stylesheet) self.target.write(latex) return self.target pyth-0.5.6/pyth/plugins/latex/__init__.py0000644000175000017500000000001611240464667020570 0ustar brendonhbrendonh""" Latex """ pyth-0.5.6/pyth/plugins/plaintext/0000755000175000017500000000000011436501227017344 5ustar brendonhbrendonhpyth-0.5.6/pyth/plugins/plaintext/writer.py0000644000175000017500000000360711314467005021240 0ustar brendonhbrendonh""" Render documents as plaintext. 
""" from pyth import document from pyth.format import PythWriter from cStringIO import StringIO class PlaintextWriter(PythWriter): @classmethod def write(klass, document, target=None, newline="\n"): if target is None: target = StringIO() writer = PlaintextWriter(document, target, newline) return writer.go() def __init__(self, doc, target, newline): self.document = doc self.target = target self.newline = newline self.indent = -1 self.paragraphDispatch = { document.List: self.list, document.Paragraph: self.paragraph } def go(self): for (i, paragraph) in enumerate(self.document.content): handler = self.paragraphDispatch[paragraph.__class__] handler(paragraph) self.target.write("\n") # Heh heh, remove final paragraph spacing self.target.seek(-2, 1) self.target.truncate() self.target.seek(0) return self.target def paragraph(self, paragraph, prefix=""): content = [] for text in paragraph.content: content.append(u"".join(text.content)) content = u"".join(content).encode("utf-8") for line in content.split("\n"): self.target.write(" " * self.indent) self.target.write(prefix) self.target.write(line) self.target.write("\n") if prefix: prefix = " " def list(self, list, prefix=None): self.indent += 1 for (i, entry) in enumerate(list.content): for (j, paragraph) in enumerate(entry.content): prefix = "* " if j == 0 else " " handler = self.paragraphDispatch[paragraph.__class__] handler(paragraph, prefix) self.indent -= 1 pyth-0.5.6/pyth/plugins/plaintext/__init__.py0000644000175000000000000000002311240225655020620 0ustar brendonhroot""" Plaintext """ pyth-0.5.6/pyth/plugins/rst/0000755000175000017500000000000011436501227016144 5ustar brendonhbrendonhpyth-0.5.6/pyth/plugins/rst/writer.py0000644000175000017500000000554711240464667020056 0ustar brendonhbrendonh""" Render documents as reStructuredText. 
""" from pyth import document from pyth.format import PythWriter from cStringIO import StringIO class RSTWriter(PythWriter): @classmethod def write(klass, document, target=None): if target is None: target = StringIO() writer = RSTWriter(document, target) return writer.go() def __init__(self, doc, target): self.document = doc self.target = target self.indent = -1 self.paragraphDispatch = {document.List: self.list, document.Paragraph: self.paragraph} def go(self): for (i, paragraph) in enumerate(self.document.content): handler = self.paragraphDispatch[paragraph.__class__] handler(paragraph) self.target.write("\n") # Heh heh, remove final paragraph spacing self.target.seek(-2, 1) self.target.truncate() return self.target def text(self, text): """ process a pyth text and return the formatted string """ ret = u"".join(text.content) if 'url' in text.properties: return u"`%s`_" % ret if 'bold' in text.properties: return u"**%s**" % ret if 'italic' in text.properties: return u"*%s*" % ret if 'sub' in text.properties: return ur"\ :sub:`%s`\ " % ret if 'super' in text.properties: return ur"\ :sup:`%s`\ " % ret return ret def paragraph(self, paragraph, prefix=""): """ process a pyth paragraph into the target """ content = [] for text in paragraph.content: content.append(self.text(text)) content = u"".join(content).encode("utf-8") for line in content.split("\n"): self.target.write(" " * self.indent) self.target.write(prefix) self.target.write(line) self.target.write("\n") if prefix: prefix = " " # handle the links if any('url' in text.properties for text in paragraph.content): self.target.write("\n") for text in paragraph.content: if 'url' in text.properties: string = u"".join(text.content) url = text.properties['url'] self.target.write(".. 
_%s: %s\n" % (string, url)) def list(self, list, prefix=None): """ Process a pyth list into the target """ self.indent += 1 for (i, entry) in enumerate(list.content): for (j, paragraph) in enumerate(entry.content): prefix = "- " if j == 0 else " " handler = self.paragraphDispatch[paragraph.__class__] handler(paragraph, prefix) self.target.write("\n") self.indent -= 1 pyth-0.5.6/pyth/plugins/rst/__init__.py0000644000175000017500000000001411240464667020261 0ustar brendonhbrendonh""" RST """ pyth-0.5.6/pyth/plugins/rtf15/0000755000175000017500000000000011436501227016275 5ustar brendonhbrendonhpyth-0.5.6/pyth/plugins/rtf15/writer.py0000644000175000000000000001614511240471770017343 0ustar brendonhroot""" Render documents as RTF 1.5 http://www.biblioscape.com/rtf15_spec.htm """ from pyth import document from pyth.format import PythWriter from cStringIO import StringIO # XXX Todo -- Make these parameters PARAGRAPH_SPACING = 150 LIST_ITEM_SPACING = 50 _styleFlags = { 'bold': r'\b', 'italic': r'\i', 'underline': r'\ul', } class Rtf15Writer(PythWriter): # Calibri is the default font in Office2007. # So we'll use that for swiss, and let it fall back # to Arial everywhere else. 
fonts = { 'swiss': 'Calibri', 'roman': 'Times New Roman', } @classmethod def write(klass, document, target=None, fontFamily='roman'): if target is None: target = StringIO() writer = Rtf15Writer(document, target, fontFamily) return writer.go() def __init__(self, doc, target, family): self.document = doc self.target = target if family not in self.fonts: raise ValueError("Family %s not found (Try %s)" % ( family, " or ".join("'%s'" % fam for fam in self.fonts))) self.fontFamily = family self._paragraphDispatch = { document.List: self._list, document.Paragraph: self._paragraph } def go(self): self.listLevel = -1 self.addSpacing = None self.target.write('{') self._writeHeader() self._writeDocument() self.target.write('}') return self.target # ----------------------------------------------- # Header section def _writeHeader(self): # Do this first to get the default font number fontTable = self._getFontTable() self.target.write(r'\rtf1\ansi\deff%s' % self.fontNumber); # Not strictly necessary self.target.write('\n') for part in (fontTable, self._getColorTable(), self._getStyleSheet(), self._getListTable(), self._getListOverrides(), self._getRevTable()): if part: self.target.write(part) self.target.write('\n') def _getFontTable(self): output = [r'{\fonttbl'] for i, (fontFamily, fontName) in enumerate(self.fonts.iteritems()): output.append(r'{\f%d\f%s %s;}' % (i, fontFamily, fontName)) if fontFamily == self.fontFamily: self.fontNumber = i # We need Symbol for list bullets output.append(r'{\f%d\fnil\fprq0\fcharset128 Symbol;}' % (i+1)) self.symbolFontNumber = i+1 output.append('}') return "".join(output) def _getColorTable(self): # We only need black, and blue (for hyperlinks) return (r'{\colortbl;' r'\red0\green0\blue0;' r'\red0\green0\blue255;}') def _getStyleSheet(self): # OpenOffice won't render bullets unless there's a stylesheet entry # even if it doesn't do anything. 
return r'''{\stylesheet{\s1 List Paragraph;}}''' def _getListTable(self): # levelnfc23 means bullets (rather than numbering) # leveljc0 means left justified # levelfollow0 means a tab after the bullet output = [r'{\*\listtable{\list\listid1\listtemplateid1'] for i in range(9): output.append(( r'{\listlevel\levelstartat1\levelnfc23\leveljc0\levelfollow0' r'{\leveltext \'01\u61623 ?;}' # The bullet character r'\fi-180\f%d' # Indent the bullet left, and use the symbol font '}') % self.symbolFontNumber) output.append('}}') return "".join(output) def _getListOverrides(self): # I have no idea what the point is of this, # but we need it. return r'{\listoverridetable{\listoverride\listid1\listoverridecount0\ls0}}' def _getRevTable(self): # Hell no I don't think so pass # ----------------------------------------------- # Document section def _writeDocument(self): for part in (self._getInfo(), self._getDocFormat(), self._getSecFormat()): if part: self.target.write(part) self.target.write('\n') for paragraph in self.document.content: handler = self._paragraphDispatch[paragraph.__class__] handler(paragraph) def _getInfo(self): pass def _getDocFormat(self): pass def _getSecFormat(self): pass # ----------------------------------------------- # Content def _paragraph(self, paragraph, spacing=PARAGRAPH_SPACING): if self.addSpacing is not None: self.target.write(r'\sb%d' % self.addSpacing) self.addSpacing = None # Space after the paragraph, # expressed in units of god-knows-what self.target.write(r'\sa%d{' % spacing) for text in paragraph.content: self._text(text) self.target.write(r'}\par\pard' '\n') def _list(self, lst, spacing=PARAGRAPH_SPACING): self.listLevel += 1 for entry in lst.content: for paragraph in entry.content: # It doesn't seem like RTF supports multiple paragraphs # in the same list item, so just let them be an item each. 
self.target.write(r'\ilvl%d\ls0\li%d\s1' % ( self.listLevel, 720*(self.listLevel+1))) handler = self._paragraphDispatch[paragraph.__class__] handler(paragraph, spacing=LIST_ITEM_SPACING) self.listLevel -= 1 # When going back from a list to regular paragraphs, # add some extra spacing to balance the list out. if self.listLevel == -1: self.addSpacing = 150 def _text(self, text): if 'url' in text.properties: self.target.write( r'{\field{\*\fldinst HYPERLINK %s}{\fldrslt \*\cf2\ul ' % text.properties['url']) props = [] if 'super' in text.properties: self.target.write('{\up9 ') elif 'sub' in text.properties: self.target.write('{\dn9 ') for prop in text.properties: if prop in _styleFlags: props.append(_styleFlags[prop]) if props: self.target.write("".join(props) + " ") for run in text.content: for unichar in run: if unichar == '\n': self.target.write(r'\line ') continue point = ord(unichar) if point < 128: self.target.write(str(unichar)) else: self.target.write(r'\u%d?' % point) if props: self.target.write("".join("%s0" % p for p in props) + " ") if 'super' in text.properties or 'sub' in text.properties: self.target.write("}") if 'url' in text.properties: self.target.write('}}') pyth-0.5.6/pyth/plugins/rtf15/__init__.py0000644000175000000000000000017611240225655017562 0ustar brendonhroot""" RTF 1.5 This was the version used in Word97, and the first to support Unicode, making it the most compatible choice. """ pyth-0.5.6/pyth/plugins/rtf15/reader.py0000644000175000017500000004225711436477746020145 0ustar brendonhbrendonh""" Read documents from RTF 1.5 http://www.biblioscape.com/rtf15_spec.htm This module is potentially compatible with RTF versions up to 1.9.1, but may not ignore all necessary control groups. 
""" import string, re, itertools, struct from pyth import document from pyth.format import PythReader from pyth.encodings import symbol _CONTROLCHARS = set(string.ascii_letters + string.digits + "-*") _DIGITS = set(string.digits) _CODEPAGES = { 0: "cp1252", # ANSI 1: "cp1252", # Default (this is wrong, but there is no right) 2: "symbol", # Symbol 77: "mac-roman", # Mac Roman # Does Python have built-in support for these? What is it? # 78: "10001", # Mac Shift Jis # 79: "10003", # Mac Hangul # 80: "10008", # Mac GB2312 # 81: "10002", # Mac Big5 # 83: "10005", # Mac Hebrew 84: "mac-arabic", # Mac Arabic 85: "mac-greek", # Mac Greek 86: "mac-turkish", # Mac Turkish # 87: "10021", # Mac Thai # 88: "10029", # Mac East Europe # 89: "10007", # Mac Russian 128: "cp932", # Shift JIS 129: "cp949", # Hangul 130: "cp1361", # Johab 134: "cp936", # GB2312 136: "cp950", # Big5 161: "cp1253", # Greek 162: "cp1254", # Turkish 163: "cp1258", # Vietnamese 177: "cp1255", # Hebrew 178: "cp1256", # Arabic 186: "cp1257", # Baltic 204: "cp1251", # Russian 222: "cp874", # Thai 238: "cp1250", # Eastern European 254: "cp437", # PC 437 255: "cp850", # OEM } # All the ones named by number in my 2.6 encodings dir _CODEPAGES_BY_NUMBER = dict( (x, "cp%s" % x) for x in (37, 1006, 1026, 1140, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 424, 437, 500, 737, 775, 850, 852, 855, 856, 857, 860, 861, 862, 863, 864, 865, 866, 869, 874, 875, 932, 949, 950)) # Miscellaneous, incomplete _CODEPAGES_BY_NUMBER.update({ 10000: "mac-roman", 10007: "mac-greek", }) class BackslashEscape(Exception): pass class Rtf15Reader(PythReader): @classmethod def read(self, source, errors='strict'): """ source: A list of P objects. 
""" reader = Rtf15Reader(source, errors) return reader.go() def __init__(self, source, errors='strict'): self.source = source self.errors = errors self.document = document.Document def go(self): self.source.seek(0) if self.source.read(5) != r"{\rtf": from pyth.errors import WrongFileType raise WrongFileType("Doesn't look like an RTF file") self.source.seek(0) self.charsetTable = None self.charset = 'cp1252' self.group = Group(self) self.stack = [self.group] self.parse() return self.build() def parse(self): while True: next = self.source.read(1) if not next: break if next in '\r\n': continue if next == '{': subGroup = Group(self, self.group, self.charsetTable) self.stack.append(subGroup) subGroup.skip = self.group.skip self.group = subGroup elif next == '}': subGroup = self.stack.pop() self.group = self.stack[-1] subGroup.finalize() if subGroup.specialMeaning == 'FONT_TABLE': self.charsetTable = subGroup.charsetTable self.group.content.append(subGroup) elif self.group.skip: # Avoid crashing on stuff we can't handle # inside groups we don't care about anyway continue elif next == '\\': control, digits = self.getControl() self.group.handle(control, digits) else: self.group.char(next) def getControl(self): chars = [] digits = [] current = chars first = True while True: next = self.source.read(1) if not next: break if first and next in '\\{}': chars.extend("control_symbol") digits.append(next) break if first and next in '\r\n': # Special-cased in RTF, equivalent to a \par chars.extend("par") break first = False if next == "'": # ANSI escape, takes two hex digits chars.extend("ansi_escape") digits.extend(self.source.read(2)) break if next == ' ': # Don't rewind, the space is just a delimiter break if next not in _CONTROLCHARS: # Rewind, it's a meaningful character self.source.seek(-1, 1) break if next in _DIGITS: current = digits current.append(next) return "".join(chars), "".join(digits) def build(self): doc = document.Document() ctx = DocBuilder(doc) for bit in 
self.group.flatten(): typeName = type(bit).__name__ getattr(ctx, "handle_%s" % typeName)(bit) ctx.flushParagraph() return doc class DocBuilder(object): def __init__(self, doc): self.run = [] self.propStack = [{}] self.block = None self.listLevel = None self.listStack = [doc] def flushRun(self): if self.block is None: self.block = document.Paragraph() self.block.content.append( document.Text(self.propStack[-1].copy(), [u"".join(self.run)])) self.run[:] = [] def cleanParagraph(self): """ Compress text runs, remove whitespace at start and end, skip empty blocks, etc """ runs = self.block.content if not runs: self.block = None return joinedRuns = [] hasContent = False for run in runs: if run.content[0]: hasContent = True else: continue # For whitespace-only groups, remove any property stuff, # to avoid extra markup in output if not run.content[0].strip(): run.properties = {} # Join runs only if their properties match if joinedRuns and (run.properties == joinedRuns[-1].properties): joinedRuns[-1].content[0] += run.content[0] else: joinedRuns.append(run) if hasContent: # Strip beginning of paragraph joinedRuns[0].content[0] = joinedRuns[0].content[0].lstrip() # And then strip the end joinedRuns[-1].content[0] = joinedRuns[-1].content[0].rstrip() self.block.content = joinedRuns else: self.block = None def flushParagraph(self): self.flushRun() if self.block.content: self.cleanParagraph() if self.block is not None: self.listStack[-1].append(self.block) def handle_unicode(self, bit): self.run.append(bit) def handle_Push(self, _): self.propStack.append(self.propStack[-1].copy()) def handle_Pop(self, _): self.flushRun() self.propStack.pop() def handle_Para(self, para): self.flushParagraph() prevListLevel = self.listLevel self.listLevel = para.listLevel if self.listLevel > prevListLevel: l = document.List() self.listStack.append(l) elif self.listLevel < prevListLevel: l = self.listStack.pop() self.listStack[-1].append(l) self.block = None def handle_Reset(self, _): 
self.flushRun() self.propStack[-1].clear() def handle_ReadableMarker(self, marker): self.flushRun() if marker.val: # RTF needs underline markers for hyperlinks, # but nothing else does. If we're in a hyperlink, # ignore underlines. if 'url' in self.propStack[-1] and marker.name == 'underline': return self.propStack[-1][marker.name] = marker.val else: if marker.name in self.propStack[-1]: del self.propStack[-1][marker.name] class Group(object): def __init__(self, reader, parent=None, charsetTable=None): self.reader = reader self.parent = parent if parent: self.props = parent.props.copy() self.charset = self.parent.charset else: self.props = {} self.charset = self.reader.charset self.specialMeaning = None self.skip = False self.url = None self.currentParaTag = None self.destination = False self.charsetTable = charsetTable self.content = [] def handle(self, control, digits): if control == '*': self.destination = True return handler = getattr(self, 'handle_%s' % control, None) if handler is None: return if digits: handler(digits) else: handler() def char(self, char): self.content.append(char.decode(self.charset, self.reader.errors)) def _finalize(self): if self.destination: self.skip = True if self.specialMeaning is not None: self.skip = True if self.skip: return stuff = [] i = 0 while i < len(self.content): thing = self.content[i] if isinstance(thing, Skip): i += thing.count else: stuff.append(thing) i += 1 self.content = stuff # This is only the default, # and is overridden by some controls finalize = _finalize def flatten(self): if self.skip: return [] stuff = [Push] for thing in self.content: if isinstance(thing, Group): stuff.extend(thing.flatten()) else: stuff.append(thing) stuff.append(Pop) return stuff # Header stuff def handle_ansi(self): self.charset = self.reader.charset = 'cp1252' def handle_mac(self): self.charset = self.reader.charset = 'mac-roman' def handle_pc(self): self.charset = self.reader.charset = 'cp437' def handle_pca(self): self.charset = 
self.reader.charset = 'cp850' def handle_ansicpg(self, codepage): codepage = int(codepage) if codepage in _CODEPAGES_BY_NUMBER: self.charset = self.reader.charset = _CODEPAGES_BY_NUMBER[codepage] else: raise ValueError("Unknown codepage %s" % codepage) def handle_fonttbl(self): self.specialMeaning = 'FONT_TABLE' self.charsetTable = {} def _setFontCharset(self, charset=None): if charset is None: charset = self.reader.charset # XXX Todo: Figure out a more graceful way to handle the fact that # RTF font declarations can be in their own groups or not if self.parent.charsetTable is not None: self.parent.charsetTable[self.fontNum] = charset else: self.charsetTable[self.fontNum] = charset def handle_f(self, fontNum): if 'FONT_TABLE' in (self.parent.specialMeaning, self.specialMeaning): self.fontNum = int(fontNum) self._setFontCharset() elif self.charsetTable is not None: self.charset = self.charsetTable[int(fontNum)] def handle_fcharset(self, charsetNum): if 'FONT_TABLE' in (self.parent.specialMeaning, self.specialMeaning): # Theoretically, \fN should always be before \fcharsetN # I don't really expect that will always be true, but let's crash # if it's not, and see if it happens in the real world. charset = _CODEPAGES.get(int(charsetNum)) if charset is None: raise ValueError("Unsupported charset %s" % charsetNum) self._setFontCharset(charset) def handle_ansi_escape(self, code): code = int(code, 16) if isinstance(self.charset, dict): uni_code = self.charset.get(code) if uni_code is None: char = u'?' else: char = unichr(uni_code) else: char = chr(code).decode(self.charset, self.reader.errors) self.content.append(char) def handle_control_symbol(self, symbol): # Ignore ~, -, and _, since they are optional crap. if symbol in '\\{}': self.content.append(unicode(symbol)) def handle_u(self, codepoint): codepoint = int(codepoint) try: char = unichr(codepoint) except ValueError: if self.reader.errors == 'replace': char = '?' 
else: raise self.content.append(char) self.content.append(Skip(self.props.get('unicode_skip', 1))) def handle_par(self): p = Para() self.content.append(p) self.currentParaTag = p def handle_pard(self): self.content.append(Reset) def handle_plain(self): self.content.append(Reset) def handle_line(self): self.content.append(u"\n") def handle_b(self, onOff=None): val = onOff in (None, "", "1") self.content.append(ReadableMarker("bold", val)) def handle_i(self, onOff=None): val = onOff in (None, "", "1") self.content.append(ReadableMarker("italic", val)) def handle_ul(self, onOff=None): val = onOff in (None, "", "1") self.content.append(ReadableMarker("underline", val)) def handle_ilvl(self, level): if self.currentParaTag is not None: self.currentParaTag.listLevel = level else: # Well, now we're in trouble. But I'm pretty sure this # isn't supposed to happen anyway. pass def handle_up(self, amount): self.content.append(ReadableMarker("super", True)) def handle_super(self): self.content.append(ReadableMarker("super", True)) def handle_dn(self, amount): self.content.append(ReadableMarker("sub", True)) def handle_sub(self): self.content.append(ReadableMarker("sub", True)) def handle_emdash(self): self.content.append(u'\u2014') def handle_endash(self): self.content.append(u'\u2013') def handle_lquote(self): self.content.append(u'\u2018') def handle_rquote(self): self.content.append(u'\u2019') def handle_ldblquote(self): self.content.append(u'\u201C') def handle_rdblquote(self): self.content.append(u'\u201D') def handle_field(self): def finalize(): if len(self.content) != 2: return u"" destination, content = self.content # The destination isn't allowed to contain any controls, # so this should be safe. 
# Except when it isn't, like this: # {\field{\*\fldinst {\rtlch\fcs1 \af0 \ltrch\fcs0 \insrsid15420660 PAGE \\* MERGEFORMAT }} try: destination = u"".join(destination.content) except: return u"" match = re.match(ur'HYPERLINK "(.*)"', destination) if match: content.skip = False self.content = [ReadableMarker("url", match.group(1)), content] else: return u"" self.finalize = finalize def __repr__(self): return "G(%s)" % repr(self.content) def ignore(self, _=None): self.skip = True # Header handle_filetbl = ignore handle_colortbl = ignore handle_stylesheet = ignore handle_listtable = ignore handle_listoverridetable = ignore handle_revtbl = ignore handle_mmath = ignore handle_header = ignore handle_footer = ignore handle_headerl = ignore handle_headerr = ignore handle_headerf = ignore handle_footerl = ignore handle_footerr = ignore handle_footerf = ignore # Document handle_info = ignore handle_docfmt = ignore handle_pgdsctbl = ignore handle_listtext = ignore # Revision hacks handle_revauthdel = ignore class Skip(object): def __init__(self, count): self.count = count class ReadableMarker(object): def __init__(self, name=None, val=None): if name is not None: self.name = name self.val = val def __repr__(self): if self.val is None: return "!%s!" % self.name else: return "!%s::%s!" % (self.name, self.val) class Para(ReadableMarker): listLevel = None def __init__(self): ReadableMarker.__init__(self, "Para") def __repr__(self): return "!Para:%s!" % self.listLevel class Reset(ReadableMarker): name = "Reset" class Push(ReadableMarker): name = "Push" class Pop(ReadableMarker): name = "Pop" # Yes, yes, I know, I'll clean it up later. 
Reset = Reset() Push = Push() Pop = Pop() pyth-0.5.6/pyth/plugins/python/0000755000175000017500000000000011436501227016655 5ustar brendonhbrendonhpyth-0.5.6/pyth/plugins/python/__init__.py0000644000175000000000000000003511240225655020134 0ustar brendonhroot""" Python object input """ pyth-0.5.6/pyth/plugins/python/reader.py0000644000175000017500000000504011253467130020471 0ustar brendonhbrendonh""" Write Pyth documents straight in Python, a la Nevow's Stan. """ from pyth.format import PythReader from pyth.document import * def _convert(content): if isinstance(content, _PythonBase): return content.toPyth() return content class PythonReader(PythReader): @classmethod def read(self, source): """ source: A list of P objects. """ return Document(content=[_convert(c) for c in source]) class _Shortcut(object): def __init__(self, key): self.key = key def asDict(self): return dict(((self.key, True),)) BOLD = _Shortcut("bold") ITALIC = _Shortcut("italic") UNDERLINE = _Shortcut("underline") SUPER = _Shortcut("super") SUB = _Shortcut("sub") def _MetaPythonBase(): """ Return a metaclass which implements __getitem__, allowing e.g. P[...] instead of P()[...] 
""" class MagicGetItem(type): def __new__(mcs, name, bases, dict): klass = type.__new__(mcs, name, bases, dict) mcs.__getitem__ = lambda _, k: klass()[k] return klass return MagicGetItem class _PythonBase(object): """ Base class for Python markup objects, providing stan-ish interface """ def __init__(self, *shortcuts, **properties): self.properties = properties.copy() for shortcut in shortcuts: self.properties.update(shortcut.asDict()) self.content = [] def toPyth(self): return self.pythType(self.properties, [_convert(c) for c in self.content]) def __getitem__(self, item): if isinstance(item, (tuple, list)): for i in item: self [i] elif isinstance(item, int): return self.content[item] else: self.content.append(item) return self def __str__(self): return "%s(%s) [ %s ]" % ( self.__class__.__name__, ", ".join("%s=%s" % (k, repr(v)) for (k,v) in self.properties.iteritems()), ", ".join(repr(x) for x in self.content)) class P(_PythonBase): __metaclass__ = _MetaPythonBase() pythType = Paragraph class LE(_PythonBase): __metaclass__ = _MetaPythonBase() pythType = ListEntry class L(_PythonBase): __metaclass__ = _MetaPythonBase() pythType = List class T(_PythonBase): __metaclass__ = _MetaPythonBase() __repr__ = _PythonBase.__str__ pythType = Text def toPyth(self): return Text(self.properties, self.content) pyth-0.5.6/pyth/plugins/__init__.py0000644000175000000000000000004011240225655016607 0ustar brendonhroot""" Document format plugins """ pyth-0.5.6/pyth/plugins/xhtml/0000755000175000017500000000000011436501227016470 5ustar brendonhbrendonhpyth-0.5.6/pyth/plugins/xhtml/writer.py0000644000175000000000000001063211253464507017535 0ustar brendonhroot""" Render documents as XHTML fragments """ from pyth import document from pyth.format import PythWriter from cStringIO import StringIO _tagNames = { 'bold': 'strong', 'italic': 'em', 'underline': 'u', # ? 
} class XHTMLWriter(PythWriter): @classmethod def write(klass, document, target=None, cssClasses=True, pretty=False): if target is None: target = StringIO() writer = XHTMLWriter(document, target, cssClasses, pretty) final = writer.go() final.seek(0) # Doesn't work all that well -- appends an tag, # and puts line breaks in unusual places for HTML. #if pretty: # content = final.read() # final.seek(0) # from xml.dom.ext import PrettyPrint # from xml.dom.ext.reader.Sax import FromXml # PrettyPrint(FromXml(content), final) # final.seek(0) return final def __init__(self, doc, target, cssClasses=True, pretty=False): self.document = doc self.target = target self.cssClasses = cssClasses self.pretty = pretty self.paragraphDispatch = { document.List: self._list, document.Paragraph: self._paragraph } def go(self): self.listLevel = -1 tag = Tag("div") for element in self.document.content: handler = self.paragraphDispatch[element.__class__] tag.content.extend(handler(element)) tag.render(self.target) return self.target def _paragraph(self, paragraph): p = Tag("p") for text in paragraph.content: p.content.append(self._text(text)) if self.pretty: return [_prettyBreak, p, _prettyBreak] else: return [p] def _list(self, lst): self.listLevel += 1 ul = Tag("ul") if self.cssClasses: ul.attrs['class'] = 'pyth_list_%s' % self.listLevel for entry in lst.content: li = Tag("li") for element in entry.content: handler = self.paragraphDispatch[element.__class__] li.content.extend(handler(element)) ul.content.append(li) self.listLevel -= 1 return [ul] def _text(self, text): if 'url' in text.properties: tag = Tag("a") tag.attrs['href'] = text.properties['url'] else: tag = Tag(None) current = tag for prop in ('bold', 'italic', 'underline'): if prop in text.properties: newTag = Tag(_tagNames[prop]) current.content.append(newTag) current = newTag for prop in ('sub', 'super'): if prop in text.properties: if current.tag is None: newTag = Tag("span") current.content.append(newTag) current = newTag 
current.attrs['style'] = "vertical-align: %s; font-size: smaller" % prop current.content.append(u"".join(text.content)) return tag _prettyBreak = object() class Tag(object): def __init__(self, tag, attrs=None, content=None): self.tag = tag self.attrs = attrs or {} self.content = content or [] def render(self, target): if self.tag is not None: attrString = self.attrString() if attrString: attrString = " " + attrString target.write('<%s%s>' % (self.tag, attrString)) for c in self.content: if isinstance(c, Tag): c.render(target) elif c is _prettyBreak: target.write('\n') else: target.write(quoteText(c).encode("utf-8").replace('\n', '
')) if self.tag is not None: target.write('' % self.tag) def attrString(self): return " ".join( '%s="%s"' % (k, quoteAttr(v)) for (k, v) in self.attrs.iteritems()) def __repr__(self): return "T(%s)[%s]" % (self.tag, repr(self.content)) def quoteText(text): return text.replace( u"&", u"&").replace( u"<", u"<").replace( u">", u">") def quoteAttr(text): return quoteText(text).replace( u'"', u""").replace( u"'", u"'") pyth-0.5.6/pyth/plugins/xhtml/css.py0000644000175000017500000000776411242763551017655 0ustar brendonhbrendonh """ Parse a css document into a python class that can be used to apply the style to a BeautifulSoup document. """ import re class Selector(object): """ Represent a css selector. The __call__ method takes a BeautifulSoup node as argument end return True if the selector applies to the node. """ def __init__(self, tag=None, klass=None): self.tag = tag self.klass = klass def check_tag(self, node): return not self.tag or node.findParent(self.tag) def check_class(self, node): return not self.klass or node.findParent(attrs={'class': self.klass}) def __call__(self, node): return self.check_tag(node) and self.check_class(node) def __repr__(self): tag = self.tag if self.tag else "" klass = ".%s" % self.klass if self.klass else "" return "%s%s" % (tag, klass) class Rule(object): """ Represents a css rule. A rule consists of a selector and a dictionary of properties. 
""" def __init__(self, selector, properties=None): self.selector = selector self.properties = properties or {} def __repr__(self): return "%s %s" % (self.selector, self.properties) class CSS(object): """ Represents a css document """ # The regular expressions used to parse the css document # match a rule e.g: '.imp {font-weight: bold; color: blue}' ruleset_re = re.compile(r'\s*(.+?)\s+\{(.*?)\}') # match a property declaration, e.g: 'font-weight = bold' declaration_re = re.compile(r'\s*(.+?):\s*(.+?)\s*?(?:;|$)') # match a selector selector_re = re.compile(r'(.*?)(?:\.(.*))?$') def __init__(self, source=None): self.rules = [] if source: self.parse_css(source) def __repr__(self): return repr(self.rules) def parse_css(self, css): """ Parse a css style sheet into the CSS object. For the moment this will only work for very simple css documents. It works by using regular expression matching css syntax. This is not bullet proof. """ rulesets = self.ruleset_re.findall(css) for (selector, declarations) in rulesets: rule = Rule(self.parse_selector(selector)) rule.properties = self.parse_declarations(declarations) self.rules.append(rule) def parse_declarations(self, declarations): """ parse a css declaration list """ declarations = self.declaration_re.findall(declarations) return dict(declarations) def parse_selector(self, selector): """ parse a css selector """ tag, klass = self.selector_re.match(selector).groups() return Selector(tag, klass) def get_properties(self, node): """ return a dict of all the properties of a given BeautifulSoup node found by applying the css style. 
""" ret = {} # Try all the rules one by one for rule in self.rules: if rule.selector(node): ret.update(rule.properties) # Also search for direct 'style' arguments in the html doc for style_node in node.findParents(attrs={'style': True}): style = style_node.get('style') properties = self.parse_declarations(style) ret.update(properties) return ret def is_bold(self, node): """ convenience method equivalent to self.get_properties(node).get('font-weight', None) == 'bold' """ properties = self.get_properties(node) return properties.get('font-weight') == 'bold' def is_italic(self, node): properties = self.get_properties(node) return properties.get('font-style') == 'italic' def is_sub(self, node): properties = self.get_properties(node) return properties.get('vertical-align') == 'sub' def is_super(self, node): properties = self.get_properties(node) return properties.get('vertical-align') == 'super' pyth-0.5.6/pyth/plugins/xhtml/__init__.py0000644000175000000000000000001611240225655017746 0ustar brendonhroot""" XHTML """ pyth-0.5.6/pyth/plugins/xhtml/reader.py0000644000175000017500000001175711243533263020320 0ustar brendonhbrendonh""" Read documents from xhtml """ import BeautifulSoup from pyth import document from pyth.format import PythReader from pyth.plugins.xhtml.css import CSS class XHTMLReader(PythReader): @classmethod def read(self, source, css_source=None, encoding="utf-8"): reader = XHTMLReader(source, css_source, encoding) return reader.go() def __init__(self, source, css_source=None, encoding="utf-8"): self.source = source self.css_source = css_source self.encoding = encoding def go(self): soup = BeautifulSoup.BeautifulSoup(self.source, convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES, fromEncoding=self.encoding, smartQuotesTo=None) # Make sure the document content doesn't use multi-lines soup = self.format(soup) doc = document.Document() if self.css_source: self.css = CSS(self.css_source) else: self.css = CSS() # empty css self.process_into(soup, doc) 
return doc def format(self, soup): """format a BeautifulSoup document This will transform the block elements content from multi-lines text into single line. This allow us to avoid having to deal with further text rendering once this step has been done. """ # Remove all the newline characters before a closing tag. for node in soup.findAll(text=True): if node.rstrip(" ").endswith("\n"): node.replaceWith(node.rstrip(" ").rstrip("\n")) # Join the block elements lines into a single long line for tag in ['p', 'li']: for node in soup.findAll(tag): text = unicode(node) lines = [x.strip() for x in text.splitlines()] text = ' '.join(lines) node.replaceWith(text) soup = BeautifulSoup.BeautifulSoup(unicode(soup)) # replace all
tag by newline character for node in soup.findAll('br'): node.replaceWith("\n") soup = BeautifulSoup.BeautifulSoup(unicode(soup)) return soup def is_bold(self, node): """ Return true if the BeautifulSoup node needs to be rendered as bold. """ return (node.findParent(['b', 'strong']) is not None or self.css.is_bold(node)) def is_italic(self, node): """ Return true if the BeautifulSoup node needs to be rendered as italic. """ return (node.findParent(['em', 'i']) is not None or self.css.is_italic(node)) def is_sub(self, node): """ Return true if the BeautifulSoup node needs to be rendered as sub. """ return (node.findParent(['sub']) is not None or self.css.is_sub(node)) def is_super(self, node): """ Return true if the BeautifulSoup node needs to be rendered as super. """ return (node.findParent(['sup']) is not None or self.css.is_super(node)) def url(self, node): """ return the url of a BeautifulSoup node or None if there is no url. """ a_node = node.findParent('a') if not a_node: return None return a_node.get('href') def process_text(self, node): """ Return a pyth Text object from a BeautifulSoup node or None if the text is empty. """ text = node.string.strip() if not text: return # Set all the properties properties=dict() if self.is_bold(node): properties['bold'] = True if self.is_italic(node): properties['italic'] = True if self.url(node): properties['url'] = self.url(node) if self.is_sub(node): properties['sub'] = True if self.is_super(node): properties['super'] = True content=[node.string] return document.Text(properties, content) def process_into(self, node, obj): """ Process a BeautifulSoup node and fill its elements into a pyth base object. 
""" if isinstance(node, BeautifulSoup.NavigableString): text = self.process_text(node) if text: obj.append(text) return if node.name == 'p': # add a new paragraph into the pyth object new_obj = document.Paragraph() obj.append(new_obj) obj = new_obj elif node.name == 'ul': # add a new list new_obj = document.List() obj.append(new_obj) obj = new_obj elif node.name == 'li': # add a new list entry new_obj = document.ListEntry() obj.append(new_obj) obj = new_obj for child in node: self.process_into(child, obj) pyth-0.5.6/setup.py0000755000175000017500000000163311436501004014400 0ustar brendonhbrendonhfrom setuptools import setup, find_packages setup(name="pyth", version="0.5.6", packages = find_packages(), zip_safe = False, description="Python text markup and conversion", author="Brendon Hogger", author_email="brendonh@taizilla.com", url="http://wiki.github.com/brendonh/pyth", long_description=open('README').read(), classifiers = [ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 2.5", "Programming Language :: Python :: 2.6", "Topic :: Office/Business", "Topic :: Software Development :: Libraries", "Topic :: Text Editors :: Word Processors", "Topic :: Text Processing", "Topic :: Text Processing :: Markup", "Topic :: Text Processing :: Filters", ], ) pyth-0.5.6/README0000644000175000017500000000266611302461231013551 0ustar brendonhbrendonh======================================== pyth - Python text markup and conversion ======================================== Pyth is intended to make it easy to convert marked-up text between different common formats. *Marked-up text* means text which has: * Paragraphs * Headings * Bold, italic, and underlined text * Hyperlinks * Bullet lists * Simple tables * Very little else Formats I initially want to support are: * xhtml * rtf * pdf (output) These three formats cover web, Word / OpenOffice, and print. 
Design principles ================= * Ignore unsupported information in input formats (e.g. page layout) * Ignore font issues -- output in a single font. * Ignore specific text sizes -- support relative sizes (bigger, littler) only. Output in a single base size. * Have no dependencies unless they are written in Python, and work * Make it easy to add support for new formats, by using an architecture based on *plugins* and *adapters*. Examples ======== See http://github.com/brendonh/pyth/tree/master/examples/ Unit tests ========== The sources contain some unit tests (written using the Python unittest module) in the 'tests' directory. To run the tests we can either run them individually as Python scripts or use `python nose`_. If using nose then we just need to go into the tests directory and invoke nosetests from there (make sure that the pyth module is in PYTHONPATH). .. _python nose: http://code.google.com/p/python-nose/ pyth-0.5.6/LICENSE0000644000175000017500000000206411305007275013675 0ustar brendonhbrendonhCopyright (c) 2009 Brendon Hogger Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. pyth-0.5.6/MANIFEST.in0000644000175000017500000000011411305007414014413 0ustar brendonhbrendonhrecursive-include examples *.py recursive-include tests *.py include LICENSEpyth-0.5.6/pyth.egg-info/0000755000175000017500000000000011436501227015345 5ustar brendonhbrendonhpyth-0.5.6/pyth.egg-info/dependency_links.txt0000644000175000017500000000000111436501227021413 0ustar brendonhbrendonh pyth-0.5.6/pyth.egg-info/not-zip-safe0000644000175000017500000000000111436501136017572 0ustar brendonhbrendonh pyth-0.5.6/pyth.egg-info/PKG-INFO0000644000175000017500000000517411436501227016451 0ustar brendonhbrendonhMetadata-Version: 1.0 Name: pyth Version: 0.5.6 Summary: Python text markup and conversion Home-page: http://wiki.github.com/brendonh/pyth Author: Brendon Hogger Author-email: brendonh@taizilla.com License: UNKNOWN Description: ======================================== pyth - Python text markup and conversion ======================================== Pyth is intended to make it easy to convert marked-up text between different common formats. *Marked-up text* means text which has: * Paragraphs * Headings * Bold, italic, and underlined text * Hyperlinks * Bullet lists * Simple tables * Very little else Formats I initially want to support are: * xhtml * rtf * pdf (output) These three formats cover web, Word / OpenOffice, and print. Design principles ================= * Ignore unsupported information in input formats (e.g. page layout) * Ignore font issues -- output in a single font. * Ignore specific text sizes -- support relative sizes (bigger, littler) only. Output in a single base size. 
* Have no dependencies unless they are written in Python, and work * Make it easy to add support for new formats, by using an architecture based on *plugins* and *adapters*. Examples ======== See http://github.com/brendonh/pyth/tree/master/examples/ Unit tests ========== The sources contains some unit tests (written using python unittest module) in the 'tests' directory. To run the tests we can either run them individually as python script, either use `python nose`_. If using nose then we just need to go into the tests directory and invoke nosetest from there (make sure that pyth module is in PYTHONPATH). .. _python nose: http://code.google.com/p/python-nose/ Platform: UNKNOWN Classifier: Development Status :: 5 - Production/Stable Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: MIT License Classifier: Programming Language :: Python :: 2.5 Classifier: Programming Language :: Python :: 2.6 Classifier: Topic :: Office/Business Classifier: Topic :: Software Development :: Libraries Classifier: Topic :: Text Editors :: Word Processors Classifier: Topic :: Text Processing Classifier: Topic :: Text Processing :: Markup Classifier: Topic :: Text Processing :: Filters pyth-0.5.6/pyth.egg-info/SOURCES.txt0000644000175000017500000000215311436501227017232 0ustar brendonhbrendonhLICENSE MANIFEST.in README setup.py examples/reading/rtf15.py examples/reading/xhtml.py examples/writing/latex.py examples/writing/pdf.py examples/writing/plaintext.py examples/writing/pythonDoc.py examples/writing/rst.py examples/writing/rtf15.py examples/writing/xhtml.py pyth/__init__.py pyth/document.py pyth/errors.py pyth/format.py pyth.egg-info/PKG-INFO pyth.egg-info/SOURCES.txt pyth.egg-info/dependency_links.txt pyth.egg-info/not-zip-safe pyth.egg-info/top_level.txt pyth/encodings/__init__.py pyth/encodings/symbol.py pyth/plugins/__init__.py pyth/plugins/latex/__init__.py pyth/plugins/latex/writer.py pyth/plugins/pdf/__init__.py pyth/plugins/pdf/writer.py 
pyth/plugins/plaintext/__init__.py pyth/plugins/plaintext/writer.py pyth/plugins/python/__init__.py pyth/plugins/python/reader.py pyth/plugins/rst/__init__.py pyth/plugins/rst/writer.py pyth/plugins/rtf15/__init__.py pyth/plugins/rtf15/reader.py pyth/plugins/rtf15/writer.py pyth/plugins/xhtml/__init__.py pyth/plugins/xhtml/css.py pyth/plugins/xhtml/reader.py pyth/plugins/xhtml/writer.py tests/test_readxhtml.py tests/test_writelatex.py tests/test_writepdf.pypyth-0.5.6/pyth.egg-info/top_level.txt0000644000175000017500000000000511436501227020072 0ustar brendonhbrendonhpyth