pytidylib-0.2.1/ 0000755 0001750 0001750 00000000000 11301050430 012514 5 ustar janos janos pytidylib-0.2.1/LICENSE 0000644 0001750 0001750 00000002033 11163707615 013543 0 ustar janos janos Copyright 2009 Jason Stitt Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
pytidylib-0.2.1/._LICENSE 0000644 0001750 0001750 00000000272 11163707615 013763 0 ustar janos janos Mac OS X 2 ˆ º TxMt ATTR \DÓ º ˜ " ˜ " com.macromates.caret { column = 0; line = 19; } pytidylib-0.2.1/tests/ 0000755 0001750 0001750 00000000000 11301050430 013656 5 ustar janos janos pytidylib-0.2.1/tests/._threadsafety.py 0000644 0001750 0001750 00000000272 11166672363 017161 0 ustar janos janos Mac OS X 2 ˆ º TxMt ATTR ]‡ º ˜ " ˜ " com.macromates.caret { column = 0; line = 20; } pytidylib-0.2.1/tests/SinkMemTest.py 0000644 0001750 0001750 00000003204 11166672360 016457 0 ustar janos janos # Copyright 2009 Jason Stitt # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. import unittest from tidylib import tidy_document, tidy_fragment, sink class TestSinkMemory(unittest.TestCase): """ Make sure error sinks are cleared properly """ def test_tidy_document(self): h = "
hello" for i in xrange(100): doc, err = tidy_document(h) self.assertEqual(sink.sinks, {}) def test_tidy_fragment(self): h = "
hello" for i in xrange(100): doc, err = tidy_fragment(h) self.assertEqual(sink.sinks, {}) if __name__ == '__main__': unittest.main() pytidylib-0.2.1/tests/threadsafety.py 0000644 0001750 0001750 00000004140 11166672363 016742 0 ustar janos janos # Copyright 2009 Jason Stitt # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. import threading from Queue import Queue from tidylib import tidy_document error_queue = Queue() DOC = '''
hello" expected = '''
hello
''' doc, err = tidy_fragment(h) self.assertEqual(doc, expected) def test_frag_with_incomplete_img_tag(self): h = "hello" expected = DOC % '''
hello
''' doc, err = tidy_document(h) self.assertEqual(doc, expected) def test_doc_with_incomplete_img_tag(self): h = "fõo ''',
options={'numeric-entities':1})
print document
print errors
pytidylib-0.2.1/tidylib/ 0000755 0001750 0001750 00000000000 11301050430 014154 5 ustar janos janos pytidylib-0.2.1/tidylib/._sink.py 0000644 0001750 0001750 00000000273 11274656346 015744 0 ustar janos janos Mac OS X 2 ‰ » TxMt ATTR \.l » ˜ # ˜ # com.macromates.caret {
column = 26;
line = 13;
} pytidylib-0.2.1/tidylib/sink.py 0000644 0001750 0001750 00000007023 11274656346 015527 0 ustar janos janos # Copyright 2009 Jason Stitt
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
import ctypes
import sys
import threading
import platform
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
__all__ = ['Sink', 'create_sink', 'destroy_sink']
#----------------------------------------------------------------------------#
# Globals
# Registry of live sinks, keyed by integer sink ID. create_sink() adds an
# entry and destroy_sink() removes it again.
sinks = {} # of int: Sink
# Despite the name, this holds the ID that the *next* create_sink() call will
# hand out; create_sink() advances it (modulo sys.maxint) on every call.
last_sink_id = 0
# Held by create_sink() while it reads and advances last_sink_id.
sink_id_lock = threading.Lock()
#----------------------------------------------------------------------------#
# ctypes type definitions
# Tidy uses the stdcall calling convention on Windows, so callback prototypes
# must come from WINFUNCTYPE there; every other platform uses CFUNCTYPE.
# WINFUNCTYPE is only looked up when the platform check succeeds.
functype = (ctypes.WINFUNCTYPE if platform.system() == "Windows"
            else ctypes.CFUNCTYPE)

# Prototype of the sink's per-byte callback: no return value, and two
# arguments -- a C int followed by a single C char.
PutByteType = functype(None, ctypes.c_int, ctypes.c_char)
class TidyOutputSink(ctypes.Structure):
    """ctypes counterpart of the ``_TidyOutputSink`` structure in tidy.h.

    Two members: ``sinkData``, an untyped pointer, and ``putByte``, the
    callback declared by ``PutByteType``.
    """

    _fields_ = [("sinkData", ctypes.c_void_p), ("putByte", PutByteType)]
#----------------------------------------------------------------------------#
# Python interface
class Sink(object):
    """Buffer that collects the error output Tidy writes through a callback.

    Each instance owns a ``TidyOutputSink`` structure (``struct``) whose
    ``putByte`` callback appends every byte it receives to ``data``. The
    instance can be passed directly as a ctypes argument because
    ``_as_parameter_`` is a by-reference handle to that structure.
    ``str(sink)`` returns everything collected so far.
    """

    def __init__(self, sink_id):
        """Create an empty buffer and wire up its callback structure.

        sink_id -- integer identifying this sink; stored on the instance and
                   also exposed to C through ``struct.sinkData``.
        """
        self.sink_id = sink_id
        self.data = StringIO()

        # Look the bound method up once so the per-byte callback does not
        # repeat two attribute accesses on every call.
        append = self.data.write

        def put_byte(sink_id, byte):
            # The first argument is ignored on purpose: every Sink has its
            # own callback, so no lookup by ID is needed.
            append(byte)

        sink_struct = TidyOutputSink()
        # Windows fix: sinkData is set to a void pointer at a C int that
        # holds the ID.
        id_pointer = ctypes.pointer(ctypes.c_int(sink_id))
        sink_struct.sinkData = ctypes.cast(id_pointer, ctypes.c_void_p)
        sink_struct.putByte = PutByteType(put_byte)

        self.struct = sink_struct
        self._as_parameter_ = ctypes.byref(sink_struct)

    def __str__(self):
        """Return the text accumulated in the buffer."""
        return self.data.getvalue()
def create_sink():
    """Allocate a fresh sink ID under the lock and return a registered Sink.

    The ID counter wraps around at ``sys.maxint``. The new Sink is stored in
    the module-level ``sinks`` map under its ID before being returned.
    """
    global last_sink_id

    sink_id_lock.acquire()
    try:
        # Take the current counter value and advance it in one step. If more
        # than maxint sinks were ever open at once the IDs would collide.
        allocated_id, last_sink_id = (
            last_sink_id, (last_sink_id + 1) % sys.maxint)
    finally:
        sink_id_lock.release()

    new_sink = Sink(allocated_id)
    sinks[allocated_id] = new_sink
    return new_sink
def destroy_sink(sink):
    """Drop the module-level reference to ``sink`` so it can be freed.

    Removes the entry stored under ``sink.sink_id`` from the global ``sinks``
    map. Raises KeyError if no entry exists for that ID.
    """
    # pop() without a default raises KeyError for a missing key, just as a
    # ``del`` on the entry would.
    sinks.pop(sink.sink_id)
#----------------------------------------------------------------------------#
pytidylib-0.2.1/tidylib/.___init__.py 0000644 0001750 0001750 00000000273 11300315340 016507 0 ustar janos janos Mac OS X 2 ‰ » TxMt ATTR \.i » ˜ # ˜ # com.macromates.caret {
column = 42;
line = 32;
} pytidylib-0.2.1/tidylib/__init__.py 0000644 0001750 0001750 00000016425 11300315340 016300 0 ustar janos janos # Copyright 2009 Jason Stitt
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
import ctypes
import threading
import re
import platform
from sink import create_sink, destroy_sink
__all__ = ['tidy_document', 'tidy_fragment', 'release_tidy_doc']
#----------------------------------------------------------------------------#
# Constants
# Candidate file names for the HTML Tidy shared library.
# NOTE(review): the code that consumes this list is not visible here --
# presumably the names are tried in order until one loads; confirm before
# reordering.
LIB_NAMES = ['libtidy', 'libtidy.so', 'libtidy-0.99.so.0', 'cygtidy-0-99-0',
'tidylib', 'libtidy.dylib', 'tidy']
# NOTE(review): looks like the negated errno value ENOMEM (12), i.e. an
# out-of-memory status -- confirm against where it is compared.
ENOMEM = -12
RE_BODY = re.compile(r"
fõo fõo fõo ''',
options={'numeric-entities':1})
print document
print errors
Docs
====
Documentation is shipped with the source distribution and is available at
the `PyTidyLib`_ web page.
.. _`HTML Tidy`: http://tidy.sourceforge.net/
.. _`PyTidyLib`: http://countergram.com/open-source/pytidylib/
Platform: UNKNOWN
Classifier: Development Status :: 4 - Beta
Classifier: Environment :: Other Environment
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python
Classifier: Natural Language :: English
Classifier: Topic :: Utilities
Classifier: Topic :: Text Processing :: Markup :: HTML
Classifier: Topic :: Text Processing :: Markup :: XML
pytidylib-0.2.1/docs/ 0000755 0001750 0001750 00000000000 11556361745 013476 5 ustar janos janos pytidylib-0.2.1/docs/rst/ 0000755 0001750 0001750 00000000000 11301050430 014254 5 ustar janos janos pytidylib-0.2.1/docs/rst/index.rst 0000644 0001750 0001750 00000014222 11274665033 016142 0 ustar janos janos PyTidyLib: A Python Interface to HTML Tidy
------------------------------------------
`PyTidyLib`_ is a Python package that wraps the `HTML Tidy`_ library. This allows you, from Python code, to "fix" invalid (X)HTML markup. Some of the library's many capabilities include:
* Clean up unclosed tags and unescaped characters such as ampersands
* Output HTML 4 or XHTML, strict or transitional, and add missing doctypes
* Convert named entities to numeric entities, which can then be used in XML documents without an HTML doctype.
* Clean up HTML from programs such as Word (to an extent)
* Indent the output, including proper (i.e. no) indenting for ``pre`` elements, which some (X)HTML indenting code overlooks.
PyTidyLib is intended as a replacement for uTidyLib, which fills a similar purpose. The author previously used uTidyLib but found several areas for improvement, including OS X support, 64-bit platform support, unicode support, fixing a memory leak, and better speed.
Naming conventions
==================
`HTML Tidy`_ is a longstanding open-source library written in C that implements the actual functionality of cleaning up (X)HTML markup. It provides a shared library (``so``, ``dll``, or ``dylib``) that can variously be called ``tidy``, ``libtidy``, or ``tidylib``, as well as a command-line executable named ``tidy``. For clarity, this document will consistently refer to it by the project name, HTML Tidy.
`PyTidyLib`_ is the name of the Python package discussed here. As this is the package name, ``easy_install pytidylib`` or ``pip install pytidylib`` is correct (they are case-insensitive). The *module* name is ``tidylib``, so ``import tidylib`` is correct in Python code. This document will consistently use the package name, PyTidyLib, outside of code examples.
Installing HTML Tidy
====================
You must have both `HTML Tidy`_ and `PyTidyLib`_ installed in order to use the functionality described here. There is no affiliation between the two projects. The following briefly outlines what you must do to install HTML Tidy. See the `HTML Tidy`_ web site for more information.
**Linux/BSD or similar:** First, try to use your distribution's package management system (``apt-get``, ``yum``, etc.) to install HTML Tidy. It might go under the name ``libtidy``, ``tidylib``, ``tidy``, or something similar. Otherwise see *Building from Source*, below.
**OS X:** You may already have HTML Tidy installed. In the Terminal, run ``locate libtidy`` and see if you get any results, which should end in ``dylib``. Otherwise see *Building from Source*, below.
**Windows:** (Use PyTidyLib version 0.2 or later!) Prebuilt HTML Tidy DLLs are available from at least two locations. The `int64.org Tidy Binaries`_ page provides binaries that were built in 2005, for both 32-bit and 64-bit Windows, against a patched version of the source. The `HTML Tidy`_ web site links to a DLL built in 2006, for 32-bit Windows only, using the vanilla source (scroll near the bottom to "Other Builds" -- use the one that reads "exe/lib/dll", *not* the "exe"-only version.)
Once you have a DLL (which may be named ``tidy.dll``, ``libtidy.dll``, or ``tidylib.dll``), you must place it in a directory on your system path. If you are running Python from the command-line, placing the DLL in the present working directory will work, but this is unreliable otherwise (e.g. for server software).
See the articles `How to set the path in Windows 2000/Windows XP
''',
options={'numeric-entities':1})
print document
print errors
Configuration options
=====================
The Python interface allows you to pass options directly to HTML Tidy. For a complete list of options, see the `HTML Tidy Configuration Options Quick Reference`_ or, from the command line, run ``tidy -help-config``.
.. _`HTML Tidy Configuration Options Quick Reference`: http://tidy.sourceforge.net/docs/quickref.html
This module sets certain default options, as follows::
BASE_OPTIONS = {
"output-xhtml": 1, # XHTML instead of HTML4
"indent": 1, # Pretty; not too much of a performance hit
"tidy-mark": 0, # No tidy meta tag in output
"wrap": 0, # No wrapping
"alt-text": "", # Help ensure validation
"doctype": 'strict', # Little sense in transitional for tool-generated markup...
"force-output": 1, # May not get what you expect but you will get something
}
If you do not like these options to be set for you, do the following after importing ``tidylib``::
tidylib.BASE_OPTIONS = {}
Function reference
==================
.. autofunction:: tidylib.tidy_document
.. autofunction:: tidylib.tidy_fragment
.. autofunction:: tidylib.release_tidy_doc
.. _`HTML Tidy`: http://tidy.sourceforge.net/
.. _`PyTidyLib`: http://countergram.com/open-source/pytidylib/
.. _`int64.org Tidy Binaries`: http://int64.org/projects/tidy-binaries
.. _`setuptools`: http://pypi.python.org/pypi/setuptools
.. _`pip`: http://pypi.python.org/pypi/pip
pytidylib-0.2.1/docs/rst/conf.py 0000644 0001750 0001750 00000000630 11274665256 015605 0 ustar janos janos # Configuration file for Sphinx documentation tool
# autodoc is needed because index.rst uses ``autofunction`` directives.
extensions = ['sphinx.ext.autodoc']

master_doc = "index"
project = "pytidylib"
copyright = "2009 Jason Stitt"

# Keep these in step with VERSION in setup.py. ``version`` is the short X.Y
# form and ``release`` is the full version string.
version = "0.2"
release = "0.2.1"

language = "en"
html_title = "pytidylib module"

latex_use_modindex = False
# One LaTeX document: (start document, output file name, title, author,
# document class, toctree_only).
latex_documents = [
    (
        master_doc,
        'pytidylib.tex',
        'PyTidyLib documentation',
        'Jason Stitt',
        'howto',
        False,
    )
]
pytidylib-0.2.1/docs/rst/._index.rst 0000644 0001750 0001750 00000000272 11274665033 016357 0 ustar janos janos Mac OS X 2 ˆ º TxMt ATTR \PS º ˜ " ˜ " com.macromates.caret {
column = 0;
line = 95;
} pytidylib-0.2.1/docs/rst/._conf.py 0000644 0001750 0001750 00000000273 11274665256 016025 0 ustar janos janos Mac OS X 2 ‰ » TxMt ATTR \P= » ˜ # ˜ # com.macromates.caret {
column = 28;
line = 19;
} pytidylib-0.2.1/._README 0000644 0001750 0001750 00000000272 11274706651 013640 0 ustar janos janos Mac OS X 2 ˆ º TxMt ATTR \DÒ º ˜ " ˜ " com.macromates.caret {
column = 21;
line = 3;
} pytidylib-0.2.1/setup.py 0000644 0001750 0001750 00000006340 11301050372 014236 0 ustar janos janos # Copyright 2009 Jason Stitt
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
from distutils.core import setup
# reStructuredText passed to setup() as ``long_description``. The opening
# paragraph is a release note; the rest introduces the package.
longdesc = """\
0.2.0: Works on Windows! See documentation for available DLL download
locations. Documentation rewritten and expanded.
`PyTidyLib`_ is a Python package that wraps the `HTML Tidy`_ library. This
allows you, from Python code, to "fix" invalid (X)HTML markup. Some of the
library's many capabilities include:
* Clean up unclosed tags and unescaped characters such as ampersands
* Output HTML 4 or XHTML, strict or transitional, and add missing doctypes
* Convert named entities to numeric entities, which can then be used in XML
documents without an HTML doctype.
* Clean up HTML from programs such as Word (to an extent)
* Indent the output, including proper (i.e. no) indenting for ``pre`` elements,
which some (X)HTML indenting code overlooks.
Small example of use
====================
The following code cleans up an invalid HTML document and sets an option::
from tidylib import tidy_document
document, errors = tidy_document('''
''',
options={'numeric-entities':1})
print document
print errors
Docs
====
Documentation is shipped with the source distribution and is available at
the `PyTidyLib`_ web page.
.. _`HTML Tidy`: http://tidy.sourceforge.net/
.. _`PyTidyLib`: http://countergram.com/open-source/pytidylib/
"""
# Release number; also interpolated into the download URL below.
VERSION = "0.2.1"

setup(
    # Identity
    name="pytidylib",
    version=VERSION,
    packages=['tidylib'],

    # Authorship
    author="Jason Stitt",
    author_email="js@jasonstitt.com",

    # Descriptions shown on the package index
    description="Python wrapper for HTML Tidy (tidylib)",
    long_description=longdesc,

    # Locations
    url="http://countergram.com/open-source/pytidylib/",
    download_url=(
        "http://cloud.github.com/downloads/countergram/pytidylib/pytidylib-%s.tar.gz"
        % VERSION
    ),

    # Trove classifiers
    classifiers=[
        'Development Status :: 4 - Beta',
        'Environment :: Other Environment',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python',
        'Natural Language :: English',
        'Topic :: Utilities',
        'Topic :: Text Processing :: Markup :: HTML',
        'Topic :: Text Processing :: Markup :: XML',
    ],
)
pytidylib-0.2.1/MANIFEST.in 0000644 0001750 0001750 00000000405 11164003527 014265 0 ustar janos janos include README
include LICENSE
include MANIFEST.in
include tidylib/*.py
include tests/*.py
include *.py
include docs/pytidylib.pdf
include docs/html/*.html
include docs/html/*.js
include docs/html/_static/*.*
include docs/html/_sources/*.*
include docs/rst/*.*
pytidylib-0.2.1/._setup.py 0000644 0001750 0001750 00000000273 11301050372 014452 0 ustar janos janos Mac OS X 2 ‰ » TxMt ATTR \DÍ » ˜ # ˜ # com.macromates.caret {
column = 16;
line = 59;
} pytidylib-0.2.1/._MANIFEST.in 0000644 0001750 0001750 00000000272 11164003527 014504 0 ustar janos janos Mac OS X 2 ˆ º TxMt ATTR \Dâ º ˜ " ˜ " com.macromates.caret {
column = 0;
line = 12;
}