# nodebox-web-1.9.4.6/BeautifulSoup.py

"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/

Beautiful Soup parses a (possibly invalid) XML or HTML document into a
tree representation. It provides methods and Pythonic idioms that make
it easy to navigate, search, and modify the tree.

A well-formed XML/HTML document yields a well-formed data
structure. An ill-formed XML/HTML document yields a correspondingly
ill-formed data structure. If your document is only locally
well-formed, you can use this library to find and process the
well-formed part of it.

Beautiful Soup works with Python 2.2 and up. It has no external
dependencies, but you'll have more success at converting data to UTF-8
if you also install these three packages:

* chardet, for auto-detecting character encodings
  http://chardet.feedparser.org/
* cjkcodecs and iconv_codec, which add more encodings to the ones
  supported by stock Python.
  http://cjkpython.i18n.org/

Beautiful Soup defines classes for two main parsing strategies:

* BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
  language that kind of looks like XML.

* BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
  or invalid. This class has web browser-like heuristics for
  obtaining a sensible parse tree in the face of common HTML errors.

Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
the encoding of an HTML or XML document, and converting it to
Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed
Parser.

For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/documentation.html

Here, have some legalese:

Copyright (c) 2004-2008, Leonard Richardson

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

* Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above
  copyright notice, this list of conditions and the following
  disclaimer in the documentation and/or other materials provided
  with the distribution.

* Neither the name of the the Beautiful Soup Consortium and All
  Night Kosher Bakery nor the names of its contributors may be
  used to endorse or promote products derived from this software
  without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. """ from __future__ import generators __author__ = "Leonard Richardson (leonardr@segfault.org)" __version__ = "3.0.7a" __copyright__ = "Copyright (c) 2004-2008 Leonard Richardson" __license__ = "New-style BSD" from sgmllib import SGMLParser, SGMLParseError import codecs import markupbase import types import re import sgmllib try: from htmlentitydefs import name2codepoint except ImportError: name2codepoint = {} try: set except NameError: from sets import Set as set #These hacks make Beautiful Soup able to parse XML with namespaces sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match DEFAULT_OUTPUT_ENCODING = "utf-8" # First, the classes that represent markup elements. class PageElement: """Contains the navigational information for some part of the page (either a tag or a piece of text)""" def setup(self, parent=None, previous=None): """Sets up the initial relations between this element and other elements.""" self.parent = parent self.previous = previous self.next = None self.previousSibling = None self.nextSibling = None if self.parent and self.parent.contents: self.previousSibling = self.parent.contents[-1] self.previousSibling.nextSibling = self def replaceWith(self, replaceWith): oldParent = self.parent myIndex = self.parent.contents.index(self) if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent: # We're replacing this element with one of its siblings. index = self.parent.contents.index(replaceWith) if index and index < myIndex: # Furthermore, it comes before this element. That # means that when we extract it, the index of this # element will change. myIndex = myIndex - 1 self.extract() oldParent.insert(myIndex, replaceWith) def extract(self): """Destructively rips this element out of the tree.""" if self.parent: try: self.parent.contents.remove(self) except ValueError: pass #Find the two elements that would be next to each other if #this element (and any children) hadn't been parsed. Connect #the two. lastChild = self._lastRecursiveChild() nextElement = lastChild.next if self.previous: self.previous.next = nextElement if nextElement: nextElement.previous = self.previous self.previous = None lastChild.next = None self.parent = None if self.previousSibling: self.previousSibling.nextSibling = self.nextSibling if self.nextSibling: self.nextSibling.previousSibling = self.previousSibling self.previousSibling = self.nextSibling = None return self def _lastRecursiveChild(self): "Finds the last element beneath this object to be parsed." 
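        # A quick usage sketch of the tree-surgery methods defined above
        # (extract, replaceWith); the markup and names here are made-up
        # examples, not part of this module:
        #
        #   from BeautifulSoup import BeautifulSoup
        #   soup = BeautifulSoup("<body><p>One</p><p>Two</p></body>")
        #   soup.body.contents[1].extract()   # rips <p>Two</p> out of the tree
        #   print soup.body                   # <body><p>One</p></body>
        #   soup.body.p.replaceWith("Hello")  # plain strings become NavigableStrings
        #   print soup.body                   # <body>Hello</body>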
lastChild = self while hasattr(lastChild, 'contents') and lastChild.contents: lastChild = lastChild.contents[-1] return lastChild def insert(self, position, newChild): if (isinstance(newChild, basestring) or isinstance(newChild, unicode)) \ and not isinstance(newChild, NavigableString): newChild = NavigableString(newChild) position = min(position, len(self.contents)) if hasattr(newChild, 'parent') and newChild.parent != None: # We're 'inserting' an element that's already one # of this object's children. if newChild.parent == self: index = self.find(newChild) if index and index < position: # Furthermore we're moving it further down the # list of this object's children. That means that # when we extract this element, our target index # will jump down one. position = position - 1 newChild.extract() newChild.parent = self previousChild = None if position == 0: newChild.previousSibling = None newChild.previous = self else: previousChild = self.contents[position-1] newChild.previousSibling = previousChild newChild.previousSibling.nextSibling = newChild newChild.previous = previousChild._lastRecursiveChild() if newChild.previous: newChild.previous.next = newChild newChildsLastElement = newChild._lastRecursiveChild() if position >= len(self.contents): newChild.nextSibling = None parent = self parentsNextSibling = None while not parentsNextSibling: parentsNextSibling = parent.nextSibling parent = parent.parent if not parent: # This is the last element in the document. break if parentsNextSibling: newChildsLastElement.next = parentsNextSibling else: newChildsLastElement.next = None else: nextChild = self.contents[position] newChild.nextSibling = nextChild if newChild.nextSibling: newChild.nextSibling.previousSibling = newChild newChildsLastElement.next = nextChild if newChildsLastElement.next: newChildsLastElement.next.previous = newChildsLastElement self.contents.insert(position, newChild) def append(self, tag): """Appends the given tag to the contents of this tag.""" self.insert(len(self.contents), tag) def findNext(self, name=None, attrs={}, text=None, **kwargs): """Returns the first item that matches the given criteria and appears after this Tag in the document.""" return self._findOne(self.findAllNext, name, attrs, text, **kwargs) def findAllNext(self, name=None, attrs={}, text=None, limit=None, **kwargs): """Returns all items that match the given criteria and appear after this Tag in the document.""" return self._findAll(name, attrs, text, limit, self.nextGenerator, **kwargs) def findNextSibling(self, name=None, attrs={}, text=None, **kwargs): """Returns the closest sibling to this Tag that matches the given criteria and appears after this Tag in the document.""" return self._findOne(self.findNextSiblings, name, attrs, text, **kwargs) def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs): """Returns the siblings of this Tag that match the given criteria and appear after this Tag in the document.""" return self._findAll(name, attrs, text, limit, self.nextSiblingGenerator, **kwargs) fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x def findPrevious(self, name=None, attrs={}, text=None, **kwargs): """Returns the first item that matches the given criteria and appears before this Tag in the document.""" return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, **kwargs): """Returns all items that match the given criteria and appear before this Tag in the document.""" return 
self._findAll(name, attrs, text, limit, self.previousGenerator, **kwargs) fetchPrevious = findAllPrevious # Compatibility with pre-3.x def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): """Returns the closest sibling to this Tag that matches the given criteria and appears before this Tag in the document.""" return self._findOne(self.findPreviousSiblings, name, attrs, text, **kwargs) def findPreviousSiblings(self, name=None, attrs={}, text=None, limit=None, **kwargs): """Returns the siblings of this Tag that match the given criteria and appear before this Tag in the document.""" return self._findAll(name, attrs, text, limit, self.previousSiblingGenerator, **kwargs) fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x def findParent(self, name=None, attrs={}, **kwargs): """Returns the closest parent of this Tag that matches the given criteria.""" # NOTE: We can't use _findOne because findParents takes a different # set of arguments. r = None l = self.findParents(name, attrs, 1) if l: r = l[0] return r def findParents(self, name=None, attrs={}, limit=None, **kwargs): """Returns the parents of this Tag that match the given criteria.""" return self._findAll(name, attrs, None, limit, self.parentGenerator, **kwargs) fetchParents = findParents # Compatibility with pre-3.x #These methods do the real heavy lifting. def _findOne(self, method, name, attrs, text, **kwargs): r = None l = method(name, attrs, text, 1, **kwargs) if l: r = l[0] return r def _findAll(self, name, attrs, text, limit, generator, **kwargs): "Iterates over a generator looking for things that match." if isinstance(name, SoupStrainer): strainer = name else: # Build a SoupStrainer strainer = SoupStrainer(name, attrs, text, **kwargs) results = ResultSet(strainer) g = generator() while True: try: i = g.next() except StopIteration: break if i: found = strainer.search(i) if found: results.append(found) if limit and len(results) >= limit: break return results #These Generators can be used to navigate starting from both #NavigableStrings and Tags. def nextGenerator(self): i = self while i: i = i.next yield i def nextSiblingGenerator(self): i = self while i: i = i.nextSibling yield i def previousGenerator(self): i = self while i: i = i.previous yield i def previousSiblingGenerator(self): i = self while i: i = i.previousSibling yield i def parentGenerator(self): i = self while i: i = i.parent yield i # Utility methods def substituteEncoding(self, str, encoding=None): encoding = encoding or "utf-8" return str.replace("%SOUP-ENCODING%", encoding) def toEncoding(self, s, encoding=None): """Encodes an object to a string in some encoding, or to Unicode. .""" if isinstance(s, unicode): if encoding: s = s.encode(encoding) elif isinstance(s, str): if encoding: s = s.encode(encoding) else: s = unicode(s) else: if encoding: s = self.toEncoding(str(s), encoding) else: s = unicode(s) return s class NavigableString(unicode, PageElement): def __new__(cls, value): """Create a new NavigableString. When unpickling a NavigableString, this method is called with the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be passed in to the superclass's __new__ or the superclass won't know how to handle non-ASCII characters. """ if isinstance(value, unicode): return unicode.__new__(cls, value) return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) def __getnewargs__(self): return (NavigableString.__str__(self),) def __getattr__(self, attr): """text.string gives you text. 
        This is for backwards compatibility for Navigable*String, but for
        CData* it lets you get the string without the CData wrapper."""
        if attr == 'string':
            return self
        else:
            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)

    def __unicode__(self):
        return str(self).decode(DEFAULT_OUTPUT_ENCODING)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        if encoding:
            return self.encode(encoding)
        else:
            return self

class CData(NavigableString):

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)

class ProcessingInstruction(NavigableString):

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        output = self
        if "%SOUP-ENCODING%" in output:
            output = self.substituteEncoding(output, encoding)
        return "<?%s?>" % self.toEncoding(output, encoding)

class Comment(NavigableString):

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        return "<!--%s-->" % NavigableString.__str__(self, encoding)

class Declaration(NavigableString):

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        return "<!%s>" % NavigableString.__str__(self, encoding)

class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents."""

    def _invert(h):
        "Cheap function to invert a hash."
        i = {}
        for k,v in h.items():
            i[v] = k
        return i

    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
                                      "quot" : '"',
                                      "amp" : "&",
                                      "lt" : "<",
                                      "gt" : ">" }

    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)

    def _convertEntities(self, match):
        """Used in a call to re.sub to replace HTML, XML, and numeric
        entities with the appropriate Unicode characters. If HTML
        entities are being converted, any unrecognized entities are
        escaped."""
        x = match.group(1)
        if self.convertHTMLEntities and x in name2codepoint:
            return unichr(name2codepoint[x])
        elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
            if self.convertXMLEntities:
                return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
            else:
                return u'&%s;' % x
        elif len(x) > 0 and x[0] == '#':
            # Handle numeric entities
            if len(x) > 1 and x[1] == 'x':
                return unichr(int(x[2:], 16))
            else:
                return unichr(int(x[1:]))
        elif self.escapeUnrecognizedEntities:
            return u'&amp;%s;' % x
        else:
            return u'&%s;' % x

    def __init__(self, parser, name, attrs=None, parent=None, previous=None):
        "Basic constructor."
        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.name = name
        if attrs == None:
            attrs = []
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False
        self.convertHTMLEntities = parser.convertHTMLEntities
        self.convertXMLEntities = parser.convertXMLEntities
        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

        # Convert any HTML, XML, or numeric entities in the attribute values.
        convert = lambda(k, val): (k,
                                   re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
                                          self._convertEntities, val))
        self.attrs = map(convert, self.attrs)

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def has_key(self, key):
        return self._getAttrMap().has_key(key)

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
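        # A minimal sketch of the attribute access defined above (get,
        # has_key, tag[key]); the markup and values here are made-up
        # examples:
        #
        #   from BeautifulSoup import BeautifulSoup
        #   soup = BeautifulSoup('<a href="http://example.com" rel="nofollow">hi</a>')
        #   link = soup.a                    # find() shorthand via __getattr__
        #   print link["href"]               # "http://example.com"; KeyError if absent
        #   print link.get("title", "n/a")   # get() falls back to a default instead
        #   link["rel"] = "external"         # __setitem__ updates self.attrs as well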
return iter(self.contents) def __len__(self): "The length of a tag is the length of its list of contents." return len(self.contents) def __contains__(self, x): return x in self.contents def __nonzero__(self): "A tag is non-None even if it has no contents." return True def __setitem__(self, key, value): """Setting tag[key] sets the value of the 'key' attribute for the tag.""" self._getAttrMap() self.attrMap[key] = value found = False for i in range(0, len(self.attrs)): if self.attrs[i][0] == key: self.attrs[i] = (key, value) found = True if not found: self.attrs.append((key, value)) self._getAttrMap()[key] = value def __delitem__(self, key): "Deleting tag[key] deletes all 'key' attributes for the tag." for item in self.attrs: if item[0] == key: self.attrs.remove(item) #We don't break because bad HTML can define the same #attribute multiple times. self._getAttrMap() if self.attrMap.has_key(key): del self.attrMap[key] def __call__(self, *args, **kwargs): """Calling a tag like a function is the same as calling its findAll() method. Eg. tag('a') returns a list of all the A tags found within this tag.""" return apply(self.findAll, args, kwargs) def __getattr__(self, tag): #print "Getattr %s.%s" % (self.__class__, tag) if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: return self.find(tag[:-3]) elif tag.find('__') != 0: return self.find(tag) raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag) def __eq__(self, other): """Returns true iff this tag has the same name, the same attributes, and the same contents (recursively) as the given tag. NOTE: right now this will return false if two tags have the same attributes in a different order. Should this be fixed?""" if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): return False for i in range(0, len(self.contents)): if self.contents[i] != other.contents[i]: return False return True def __ne__(self, other): """Returns true iff this tag is not identical to the other tag, as defined in __eq__.""" return not self == other def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): """Renders this tag as a string.""" return self.__str__(encoding) def __unicode__(self): return self.__str__(None) BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" + ")") def _sub_entity(self, x): """Used with a regular expression to substitute the appropriate XML entity for an XML special character.""" return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING, prettyPrint=False, indentLevel=0): """Returns a string or Unicode representation of this tag and its contents. To get Unicode, pass None for encoding. NOTE: since Python's HTML parser consumes whitespace, this method is not certain to reproduce the whitespace present in the original string.""" encodedName = self.toEncoding(self.name, encoding) attrs = [] if self.attrs: for key, val in self.attrs: fmt = '%s="%s"' if isString(val): if self.containsSubstitutions and '%SOUP-ENCODING%' in val: val = self.substituteEncoding(val, encoding) # The attribute value either: # # * Contains no embedded double quotes or single quotes. # No problem: we enclose it in double quotes. # * Contains embedded single quotes. No problem: # double quotes work here too. # * Contains embedded double quotes. No problem: # we enclose it in single quotes. # * Embeds both single _and_ double quotes. 
This # can't happen naturally, but it can happen if # you modify an attribute value after parsing # the document. Now we have a bit of a # problem. We solve it by enclosing the # attribute in single quotes, and escaping any # embedded single quotes to XML entities. if '"' in val: fmt = "%s='%s'" if "'" in val: # TODO: replace with apos when # appropriate. val = val.replace("'", "&squot;") # Now we're okay w/r/t quotes. But the attribute # value might also contain angle brackets, or # ampersands that aren't part of entities. We need # to escape those to XML entities too. val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) attrs.append(fmt % (self.toEncoding(key, encoding), self.toEncoding(val, encoding))) close = '' closeTag = '' if self.isSelfClosing: close = ' /' else: closeTag = '' % encodedName indentTag, indentContents = 0, 0 if prettyPrint: indentTag = indentLevel space = (' ' * (indentTag-1)) indentContents = indentTag + 1 contents = self.renderContents(encoding, prettyPrint, indentContents) if self.hidden: s = contents else: s = [] attributeString = '' if attrs: attributeString = ' ' + ' '.join(attrs) if prettyPrint: s.append(space) s.append('<%s%s%s>' % (encodedName, attributeString, close)) if prettyPrint: s.append("\n") s.append(contents) if prettyPrint and contents and contents[-1] != "\n": s.append("\n") if prettyPrint and closeTag: s.append(space) s.append(closeTag) if prettyPrint and closeTag and self.nextSibling: s.append("\n") s = ''.join(s) return s def decompose(self): """Recursively destroys the contents of this tree.""" contents = [i for i in self.contents] for i in contents: if isinstance(i, Tag): i.decompose() else: i.extract() self.extract() def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): return self.__str__(encoding, True) def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, prettyPrint=False, indentLevel=0): """Renders the contents of this tag as a string in the given encoding. If encoding is None, returns a Unicode string..""" s=[] for c in self: text = None if isinstance(c, NavigableString): text = c.__str__(encoding) elif isinstance(c, Tag): s.append(c.__str__(encoding, prettyPrint, indentLevel)) if text and prettyPrint: text = text.strip() if text: if prettyPrint: s.append(" " * (indentLevel-1)) s.append(text) if prettyPrint: s.append("\n") return ''.join(s) #Soup methods def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs): """Return only the first child of this Tag matching the given criteria.""" r = None l = self.findAll(name, attrs, recursive, text, 1, **kwargs) if l: r = l[0] return r findChild = find def findAll(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs): """Extracts a list of Tag objects that match the given criteria. You can specify the name of the Tag and any attributes you want the Tag to have. The value of a key-value pair in the 'attrs' map can be a string, a list of strings, a regular expression object, or a callable that takes a string and returns whether or not the string matches for some custom definition of 'matches'. 
The same is true of the tag name.""" generator = self.recursiveChildGenerator if not recursive: generator = self.childGenerator return self._findAll(name, attrs, text, limit, generator, **kwargs) findChildren = findAll # Pre-3.x compatibility methods first = find fetch = findAll def fetchText(self, text=None, recursive=True, limit=None): return self.findAll(text=text, recursive=recursive, limit=limit) def firstText(self, text=None, recursive=True): return self.find(text=text, recursive=recursive) #Private methods def _getAttrMap(self): """Initializes a map representation of this tag's attributes, if not already initialized.""" if not getattr(self, 'attrMap'): self.attrMap = {} for (key, value) in self.attrs: self.attrMap[key] = value return self.attrMap #Generator methods def childGenerator(self): for i in range(0, len(self.contents)): yield self.contents[i] raise StopIteration def recursiveChildGenerator(self): stack = [(self, 0)] while stack: tag, start = stack.pop() if isinstance(tag, Tag): for i in range(start, len(tag.contents)): a = tag.contents[i] yield a if isinstance(a, Tag) and tag.contents: if i < len(tag.contents) - 1: stack.append((tag, i+1)) stack.append((a, 0)) break raise StopIteration # Next, a couple classes to represent queries and their results. class SoupStrainer: """Encapsulates a number of ways of matching a markup element (tag or text).""" def __init__(self, name=None, attrs={}, text=None, **kwargs): self.name = name if isString(attrs): kwargs['class'] = attrs attrs = None if kwargs: if attrs: attrs = attrs.copy() attrs.update(kwargs) else: attrs = kwargs self.attrs = attrs self.text = text def __str__(self): if self.text: return self.text else: return "%s|%s" % (self.name, self.attrs) def searchTag(self, markupName=None, markupAttrs={}): found = None markup = None if isinstance(markupName, Tag): markup = markupName markupAttrs = markup callFunctionWithTagData = callable(self.name) \ and not isinstance(markupName, Tag) if (not self.name) \ or callFunctionWithTagData \ or (markup and self._matches(markup, self.name)) \ or (not markup and self._matches(markupName, self.name)): if callFunctionWithTagData: match = self.name(markupName, markupAttrs) else: match = True markupAttrMap = None for attr, matchAgainst in self.attrs.items(): if not markupAttrMap: if hasattr(markupAttrs, 'get'): markupAttrMap = markupAttrs else: markupAttrMap = {} for k,v in markupAttrs: markupAttrMap[k] = v attrValue = markupAttrMap.get(attr) if not self._matches(attrValue, matchAgainst): match = False break if match: if markup: found = markup else: found = markupName return found def search(self, markup): #print 'looking for %s in %s' % (self, markup) found = None # If given a list of items, scan it for a text element that # matches. if isList(markup) and not isinstance(markup, Tag): for element in markup: if isinstance(element, NavigableString) \ and self.search(element): found = element break # If it's a Tag, make sure its name or attributes match. # Don't bother with Tags if we're searching for text. elif isinstance(markup, Tag): if not self.text: found = self.searchTag(markup) # If it's text, make sure the text matches. 
elif isinstance(markup, NavigableString) or \ isString(markup): if self._matches(markup, self.text): found = markup else: raise Exception, "I don't know how to match against a %s" \ % markup.__class__ return found def _matches(self, markup, matchAgainst): #print "Matching %s against %s" % (markup, matchAgainst) result = False if matchAgainst == True and type(matchAgainst) == types.BooleanType: result = markup != None elif callable(matchAgainst): result = matchAgainst(markup) else: #Custom match methods take the tag as an argument, but all #other ways of matching match the tag name as a string. if isinstance(markup, Tag): markup = markup.name if markup and not isString(markup): markup = unicode(markup) #Now we know that chunk is either a string, or None. if hasattr(matchAgainst, 'match'): # It's a regexp object. result = markup and matchAgainst.search(markup) elif isList(matchAgainst): result = markup in matchAgainst elif hasattr(matchAgainst, 'items'): result = markup.has_key(matchAgainst) elif matchAgainst and isString(markup): if isinstance(markup, unicode): matchAgainst = unicode(matchAgainst) else: matchAgainst = str(matchAgainst) if not result: result = matchAgainst == markup return result class ResultSet(list): """A ResultSet is just a list that keeps track of the SoupStrainer that created it.""" def __init__(self, source): list.__init__([]) self.source = source # Now, some helper functions. def isList(l): """Convenience method that works with all 2.x versions of Python to determine whether or not something is listlike.""" return hasattr(l, '__iter__') \ or (type(l) in (types.ListType, types.TupleType)) def isString(s): """Convenience method that works with all 2.x versions of Python to determine whether or not something is stringlike.""" try: return isinstance(s, unicode) or isinstance(s, basestring) except NameError: return isinstance(s, str) def buildTagMap(default, *args): """Turns a list of maps, lists, or scalars into a single map. Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and NESTING_RESET_TAGS maps out of lists and partial maps.""" built = {} for portion in args: if hasattr(portion, 'items'): #It's a map. Merge it. for k,v in portion.items(): built[k] = v elif isList(portion): #It's a list. Map each item to the default. for k in portion: built[k] = default else: #It's a scalar. Map it to the default. built[portion] = default return built # Now, the parser classes. class BeautifulStoneSoup(Tag, SGMLParser): """This class contains the basic parser and search code. It defines a parser that knows nothing about tag behavior except for the following: You can't close a tag without closing all the tags it encloses. That is, "" actually means "". [Another possible explanation is "", but since this class defines no SELF_CLOSING_TAGS, it will never use that explanation.] This class is useful for parsing XML or made-up markup languages, or when BeautifulSoup makes an assumption counter to what you were expecting.""" SELF_CLOSING_TAGS = {} NESTABLE_TAGS = {} RESET_NESTING_TAGS = {} QUOTE_TAGS = {} PRESERVE_WHITESPACE_TAGS = [] MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'), lambda x: x.group(1) + ' />'), (re.compile(']*)>'), lambda x: '') ] ROOT_TAG_NAME = u'[document]' HTML_ENTITIES = "html" XML_ENTITIES = "xml" XHTML_ENTITIES = "xhtml" # TODO: This only exists for backwards-compatibility ALL_ENTITIES = XHTML_ENTITIES # Used when determining whether a text node is all whitespace and # can be replaced with a single space. 
A text node that contains # fancy Unicode spaces (usually non-breaking) should be left # alone. STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, } def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None, markupMassage=True, smartQuotesTo=XML_ENTITIES, convertEntities=None, selfClosingTags=None, isHTML=False): """The Soup object is initialized as the 'root tag', and the provided markup (which can be a string or a file-like object) is fed into the underlying parser. sgmllib will process most bad HTML, and the BeautifulSoup class has some tricks for dealing with some HTML that kills sgmllib, but Beautiful Soup can nonetheless choke or lose data if your data uses self-closing tags or declarations incorrectly. By default, Beautiful Soup uses regexes to sanitize input, avoiding the vast majority of these problems. If the problems don't apply to you, pass in False for markupMassage, and you'll get better performance. The default parser massage techniques fix the two most common instances of invalid HTML that choke sgmllib:
(No space between name of closing tag and tag close) (Extraneous whitespace in declaration) You can pass in a custom list of (RE object, replace method) tuples to get Beautiful Soup to scrub your input the way you want.""" self.parseOnlyThese = parseOnlyThese self.fromEncoding = fromEncoding self.smartQuotesTo = smartQuotesTo self.convertEntities = convertEntities # Set the rules for how we'll deal with the entities we # encounter if self.convertEntities: # It doesn't make sense to convert encoded characters to # entities even while you're converting entities to Unicode. # Just convert it all to Unicode. self.smartQuotesTo = None if convertEntities == self.HTML_ENTITIES: self.convertXMLEntities = False self.convertHTMLEntities = True self.escapeUnrecognizedEntities = True elif convertEntities == self.XHTML_ENTITIES: self.convertXMLEntities = True self.convertHTMLEntities = True self.escapeUnrecognizedEntities = False elif convertEntities == self.XML_ENTITIES: self.convertXMLEntities = True self.convertHTMLEntities = False self.escapeUnrecognizedEntities = False else: self.convertXMLEntities = False self.convertHTMLEntities = False self.escapeUnrecognizedEntities = False self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags) SGMLParser.__init__(self) if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() self.markup = markup self.markupMassage = markupMassage try: self._feed(isHTML=isHTML) except StopParsing: pass self.markup = None # The markup can now be GCed def convert_charref(self, name): """This method fixes a bug in Python's SGMLParser.""" try: n = int(name) except ValueError: return if not 0 <= n <= 127 : # ASCII ends at 127, not 255 return return self.convert_codepoint(n) def _feed(self, inDocumentEncoding=None, isHTML=False): # Convert the document to Unicode. markup = self.markup if isinstance(markup, unicode): if not hasattr(self, 'originalEncoding'): self.originalEncoding = None else: dammit = UnicodeDammit\ (markup, [self.fromEncoding, inDocumentEncoding], smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) markup = dammit.unicode self.originalEncoding = dammit.originalEncoding self.declaredHTMLEncoding = dammit.declaredHTMLEncoding if markup: if self.markupMassage: if not isList(self.markupMassage): self.markupMassage = self.MARKUP_MASSAGE for fix, m in self.markupMassage: markup = fix.sub(m, markup) # TODO: We get rid of markupMassage so that the # soup object can be deepcopied later on. Some # Python installations can't copy regexes. If anyone # was relying on the existence of markupMassage, this # might cause problems. del(self.markupMassage) self.reset() SGMLParser.feed(self, markup) # Close out any unfinished strings and close all the open tags. 
self.endData() while self.currentTag.name != self.ROOT_TAG_NAME: self.popTag() def __getattr__(self, methodName): """This method routes method call requests to either the SGMLParser superclass or the Tag superclass, depending on the method name.""" #print "__getattr__ called on %s.%s" % (self.__class__, methodName) if methodName.find('start_') == 0 or methodName.find('end_') == 0 \ or methodName.find('do_') == 0: return SGMLParser.__getattr__(self, methodName) elif methodName.find('__') != 0: return Tag.__getattr__(self, methodName) else: raise AttributeError def isSelfClosingTag(self, name): """Returns true iff the given string is the name of a self-closing tag according to this parser.""" return self.SELF_CLOSING_TAGS.has_key(name) \ or self.instanceSelfClosingTags.has_key(name) def reset(self): Tag.__init__(self, self, self.ROOT_TAG_NAME) self.hidden = 1 SGMLParser.reset(self) self.currentData = [] self.currentTag = None self.tagStack = [] self.quoteStack = [] self.pushTag(self) def popTag(self): tag = self.tagStack.pop() # Tags with just one string-owning child get the child as a # 'string' property, so that soup.tag.string is shorthand for # soup.tag.contents[0] if len(self.currentTag.contents) == 1 and \ isinstance(self.currentTag.contents[0], NavigableString): self.currentTag.string = self.currentTag.contents[0] #print "Pop", tag.name if self.tagStack: self.currentTag = self.tagStack[-1] return self.currentTag def pushTag(self, tag): #print "Push", tag.name if self.currentTag: self.currentTag.contents.append(tag) self.tagStack.append(tag) self.currentTag = self.tagStack[-1] def endData(self, containerClass=NavigableString): if self.currentData: currentData = u''.join(self.currentData) if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and not set([tag.name for tag in self.tagStack]).intersection( self.PRESERVE_WHITESPACE_TAGS)): if '\n' in currentData: currentData = '\n' else: currentData = ' ' self.currentData = [] if self.parseOnlyThese and len(self.tagStack) <= 1 and \ (not self.parseOnlyThese.text or \ not self.parseOnlyThese.search(currentData)): return o = containerClass(currentData) o.setup(self.currentTag, self.previous) if self.previous: self.previous.next = o self.previous = o self.currentTag.contents.append(o) def _popToTag(self, name, inclusivePop=True): """Pops the tag stack up to and including the most recent instance of the given tag. If inclusivePop is false, pops the tag stack up to but *not* including the most recent instqance of the given tag.""" #print "Popping to %s" % name if name == self.ROOT_TAG_NAME: return numPops = 0 mostRecentTag = None for i in range(len(self.tagStack)-1, 0, -1): if name == self.tagStack[i].name: numPops = len(self.tagStack)-i break if not inclusivePop: numPops = numPops - 1 for i in range(0, numPops): mostRecentTag = self.popTag() return mostRecentTag def _smartPop(self, name): """We need to pop up to the previous tag of this type, unless one of this tag's nesting reset triggers comes between this tag and the previous tag of this type, OR unless this tag is a generic nesting trigger and another generic nesting trigger comes between this tag and the previous tag of this type. Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td>
    ** should pop to 'tr', not the first 'td' """ nestingResetTriggers = self.NESTABLE_TAGS.get(name) isNestable = nestingResetTriggers != None isResetNesting = self.RESET_NESTING_TAGS.has_key(name) popTo = None inclusive = True for i in range(len(self.tagStack)-1, 0, -1): p = self.tagStack[i] if (not p or p.name == name) and not isNestable: #Non-nestable tags get popped to the top or to their #last occurance. popTo = name break if (nestingResetTriggers != None and p.name in nestingResetTriggers) \ or (nestingResetTriggers == None and isResetNesting and self.RESET_NESTING_TAGS.has_key(p.name)): #If we encounter one of the nesting reset triggers #peculiar to this tag, or we encounter another tag #that causes nesting to reset, pop up to but not #including that tag. popTo = p.name inclusive = False break p = p.parent if popTo: self._popToTag(popTo, inclusive) def unknown_starttag(self, name, attrs, selfClosing=0): #print "Start tag %s: %s" % (name, attrs) if self.quoteStack: #This is not a real tag. #print "<%s> is not real!" % name attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs)) self.handle_data('<%s%s>' % (name, attrs)) return self.endData() if not self.isSelfClosingTag(name) and not selfClosing: self._smartPop(name) if self.parseOnlyThese and len(self.tagStack) <= 1 \ and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)): return tag = Tag(self, name, attrs, self.currentTag, self.previous) if self.previous: self.previous.next = tag self.previous = tag self.pushTag(tag) if selfClosing or self.isSelfClosingTag(name): self.popTag() if name in self.QUOTE_TAGS: #print "Beginning quote (%s)" % name self.quoteStack.append(name) self.literal = 1 return tag def unknown_endtag(self, name): #print "End tag %s" % name if self.quoteStack and self.quoteStack[-1] != name: #This is not a real end tag. #print " is not real!" % name self.handle_data('' % name) return self.endData() self._popToTag(name) if self.quoteStack and self.quoteStack[-1] == name: self.quoteStack.pop() self.literal = (len(self.quoteStack) > 0) def handle_data(self, data): self.currentData.append(data) def _toStringSubclass(self, text, subclass): """Adds a certain piece of text to the tree as a NavigableString subclass.""" self.endData() self.handle_data(text) self.endData(subclass) def handle_pi(self, text): """Handle a processing instruction as a ProcessingInstruction object, possibly one with a %SOUP-ENCODING% slot into which an encoding will be plugged later.""" if text[:3] == "xml": text = u"xml version='1.0' encoding='%SOUP-ENCODING%'" self._toStringSubclass(text, ProcessingInstruction) def handle_comment(self, text): "Handle comments as Comment objects." self._toStringSubclass(text, Comment) def handle_charref(self, ref): "Handle character references as data." if self.convertEntities: data = unichr(int(ref)) else: data = '&#%s;' % ref self.handle_data(data) def handle_entityref(self, ref): """Handle entity references as data, possibly converting known HTML and/or XML entity references to the corresponding Unicode characters.""" data = None if self.convertHTMLEntities: try: data = unichr(name2codepoint[ref]) except KeyError: pass if not data and self.convertXMLEntities: data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref) if not data and self.convertHTMLEntities and \ not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref): # TODO: We've got a problem here. We're told this is # an entity reference, but it's not an XML entity # reference or an HTML entity reference. 
Nonetheless, # the logical thing to do is to pass it through as an # unrecognized entity reference. # # Except: when the input is "&carol;" this function # will be called with input "carol". When the input is # "AT&T", this function will be called with input # "T". We have no way of knowing whether a semicolon # was present originally, so we don't know whether # this is an unknown entity or just a misplaced # ampersand. # # The more common case is a misplaced ampersand, so I # escape the ampersand and omit the trailing semicolon. data = "&%s" % ref if not data: # This case is different from the one above, because we # haven't already gone through a supposedly comprehensive # mapping of entities to Unicode characters. We might not # have gone through any mapping at all. So the chances are # very high that this is a real entity, and not a # misplaced ampersand. data = "&%s;" % ref self.handle_data(data) def handle_decl(self, data): "Handle DOCTYPEs and the like as Declaration objects." self._toStringSubclass(data, Declaration) def parse_declaration(self, i): """Treat a bogus SGML declaration as raw data. Treat a CDATA declaration as a CData object.""" j = None if self.rawdata[i:i+9] == '', i) if k == -1: k = len(self.rawdata) data = self.rawdata[i+9:k] j = k+3 self._toStringSubclass(data, CData) else: try: j = SGMLParser.parse_declaration(self, i) except SGMLParseError: toHandle = self.rawdata[i:] self.handle_data(toHandle) j = i + len(toHandle) return j class BeautifulSoup(BeautifulStoneSoup): """This parser knows the following facts about HTML: * Some tags have no closing tag and should be interpreted as being closed as soon as they are encountered. * The text inside some tags (ie. 'script') may contain tags which are not really part of the document and which should be parsed as text, not tags. If you want to parse the text as tags, you can always fetch it and parse it explicitly. * Tag nesting rules: Most tags can't be nested at all. For instance, the occurance of a
      <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurance
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
       but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>
    Blah Differing assumptions about tag nesting rules are a major source of problems with the BeautifulSoup class. If BeautifulSoup is not treating as nestable a tag your page author treats as nestable, try ICantBelieveItsBeautifulSoup, MinimalSoup, or BeautifulStoneSoup before writing your own subclass.""" def __init__(self, *args, **kwargs): if not kwargs.has_key('smartQuotesTo'): kwargs['smartQuotesTo'] = self.HTML_ENTITIES kwargs['isHTML'] = True BeautifulStoneSoup.__init__(self, *args, **kwargs) SELF_CLOSING_TAGS = buildTagMap(None, ['br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base']) PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) QUOTE_TAGS = {'script' : None, 'textarea' : None} #According to the HTML standard, each of these inline tags can #contain another tag of the same type. Furthermore, it's common #to actually use these tags this way. NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', 'center'] #According to the HTML standard, these block tags can contain #another tag of the same type. Furthermore, it's common #to actually use these tags this way. NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del'] #Lists can contain other lists, but there are restrictions. NESTABLE_LIST_TAGS = { 'ol' : [], 'ul' : [], 'li' : ['ul', 'ol'], 'dl' : [], 'dd' : ['dl'], 'dt' : ['dl'] } #Tables can contain other tables, but there are restrictions. NESTABLE_TABLE_TAGS = {'table' : [], 'tr' : ['table', 'tbody', 'tfoot', 'thead'], 'td' : ['tr'], 'th' : ['tr'], 'thead' : ['table'], 'tbody' : ['table'], 'tfoot' : ['table'], } NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre'] #If one of these tags is encountered, all tags up to the next tag of #this type are popped. RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript', NON_NESTABLE_BLOCK_TAGS, NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS, NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) # Used to detect the charset in a META tag; see start_meta CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) def start_meta(self, attrs): """Beautiful Soup can detect a charset included in a META tag, try to convert the document to that charset, and re-parse the document from the beginning.""" httpEquiv = None contentType = None contentTypeIndex = None tagNeedsEncodingSubstitution = False for i in range(0, len(attrs)): key, value = attrs[i] key = key.lower() if key == 'http-equiv': httpEquiv = value elif key == 'content': contentType = value contentTypeIndex = i if httpEquiv and contentType: # It's an interesting meta tag. match = self.CHARSET_RE.search(contentType) if match: if (self.declaredHTMLEncoding is not None or self.originalEncoding == self.fromEncoding): # An HTML encoding was sniffed while converting # the document to Unicode, or an HTML encoding was # sniffed during a previous pass through the # document, or an encoding was specified # explicitly and it worked. Rewrite the meta tag. def rewrite(match): return match.group(1) + "%SOUP-ENCODING%" newAttr = self.CHARSET_RE.sub(rewrite, contentType) attrs[contentTypeIndex] = (attrs[contentTypeIndex][0], newAttr) tagNeedsEncodingSubstitution = True else: # This is our first pass through the document. # Go through it again with the encoding information. 
newCharset = match.group(3) if newCharset and newCharset != self.originalEncoding: self.declaredHTMLEncoding = newCharset self._feed(self.declaredHTMLEncoding) raise StopParsing pass tag = self.unknown_starttag("meta", attrs) if tag and tagNeedsEncodingSubstitution: tag.containsSubstitutions = True class StopParsing(Exception): pass class ICantBelieveItsBeautifulSoup(BeautifulSoup): """The BeautifulSoup class is oriented towards skipping over common HTML errors like unclosed tags. However, sometimes it makes errors of its own. For instance, consider this fragment: FooBar This is perfectly valid (if bizarre) HTML. However, the BeautifulSoup class will implicitly close the first b tag when it encounters the second 'b'. It will think the author wrote "FooBar", and didn't close the first 'b' tag, because there's no real-world reason to bold something that's already bold. When it encounters '' it will close two more 'b' tags, for a grand total of three tags closed instead of two. This can throw off the rest of your document structure. The same is true of a number of other tags, listed below. It's much more common for someone to forget to close a 'b' tag than to actually use nested 'b' tags, and the BeautifulSoup class handles the common case. This class handles the not-co-common case: where you can't believe someone wrote what they did, but it's valid HTML and BeautifulSoup screwed up by assuming it wouldn't be.""" I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \ ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', 'big'] I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript'] NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS, I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS, I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS) class MinimalSoup(BeautifulSoup): """The MinimalSoup class is for parsing HTML that contains pathologically bad markup. It makes no assumptions about tag nesting, but it does know which tags are self-closing, that ", html) def strip_inline_css(html): return strip_between("", html) def strip_comments(html): return strip_between("", html) def strip_forms(html): return strip_between("", html) #### COLLAPSE WHITESPACE ############################################################################# def collapse_spaces(str): # If there are 10 consecutive spaces, 9 of them are removed. # Tabs not at the beginning of a line are truncated as well, e.g "this is untidy". #str = re.sub(r"[[^$\t]\t]+", " ", str) str = re.sub(r"[ ]+", " ", str).strip(" ") return str def collapse_linebreaks(str, max=2): # Allow only a maximum of max linebreaks to build up, # stripping additional whitespace lines from the output. lines = str.split("\n") str = "" i = 0 for l in lines: if l.strip() == "": i += 1 else: i = 0 if i < max: str += l.strip(" ") str += "\n" return str.strip() def collapse_tabs(str, indent=False): # Converts tabs to spaces, optionally leaving the left indentation unmodified. # collapse_spaces() should be called after this. 
    if not indent:
        return str.replace("\t", " ")
    else:
        p = re.compile(r"^(\t+)", re.MULTILINE)
        delimiter = "$$$_INDENTATION"
        str = re.sub(p, "\\1"+delimiter, str)
        lines = str.split("\n")
        str = ""
        for l in lines:
            i = l.find(delimiter)
            #if i >= 0:
            l = l[:i] + l[i:].replace("\t", " ")
            str += l + "\n"
        str = str.replace(delimiter, "")
        return str

def plain(html):
    try: html = str(html)
    except:
        pass
    if html == "None": html = ""
    html = strip_javascript(html)
    html = strip_inline_css(html)
    html = strip_comments(html)
    html = strip_forms(html)
    html = strip_tags(html, columns="")
    html = replace_entities(html)
    html = collapse_tabs(html)
    html = collapse_spaces(html)
    html = collapse_linebreaks(html)
    return html

#from urllib import urlopen
#html = urlopen("http://nodebox.net").read()
#print html
#print "##############################################"
#print plain(html)

# nodebox-web-1.9.4.6/json.py

import simplejson

def read(str):
    return simplejson.loads(str)

# nodebox-web-1.9.4.6/kuler.py

from url import URLAccumulator
from urllib import quote
from cache import Cache
from xml.dom import minidom
import colorsys

def clear_cache():
    Cache("kuler").clear()

### COLOR MODELS #####################################################################################

def cmyk_to_rgb(c, m, y, k):
    r = 1.0 - (c+k)
    g = 1.0 - (m+k)
    b = 1.0 - (y+k)
    return r, g, b

def hex_to_rgb(hex):
    hex = hex.lstrip("#")
    if len(hex) < 6:
        hex += hex[-1] * (6-len(hex))
    r, g, b = hex[0:2], hex[2:4], hex[4:]
    r, g, b = [int(n, 16) for n in (r, g, b)]
    return (r/255.0, g/255.0, b/255.0)

def lab_to_rgb(l, a, b):
    """ Converts CIE Lab to RGB components.

    First we have to convert to XYZ color space.
    Conversion involves using a white point,
    in this case D65 which represents daylight illumination.

    Algorithms adopted from:
    http://www.easyrgb.com/math.php
    """
    y = (l+16) / 116.0
    x = a/500.0 + y
    z = y - b/200.0
    v = [x,y,z]
    for i in range(3):
        if pow(v[i],3) > 0.008856:
            v[i] = pow(v[i],3)
        else:
            v[i] = (v[i]-16/116.0) / 7.787
    # Observer = 2, Illuminant = D65
    x = v[0] * 95.047/100
    y = v[1] * 100.0/100
    z = v[2] * 108.883/100
    r = x * 3.2406 + y *-1.5372 + z *-0.4986
    g = x *-0.9689 + y * 1.8758 + z * 0.0415
    b = x * 0.0557 + y *-0.2040 + z * 1.0570
    v = [r,g,b]
    for i in range(3):
        if v[i] > 0.0031308:
            v[i] = 1.055 * pow(v[i], 1/2.4) - 0.055
        else:
            v[i] = 12.92 * v[i]
    #r, g, b = v[0]*255, v[1]*255, v[2]*255
    r, g, b = v[0], v[1], v[2]
    return r, g, b

### KULER THEME ######################################################################################

class KulerTheme(list):

    def __init__(self):
        self.id = 0
        self.author = u""
        self.label = u""
        self.tags = []

    def _darkest(self):
        """ Returns the darkest swatch.

        Knowing the contrast between a light and a dark swatch
        can help us decide how to display readable typography.
        """
        rgb, n = (1.0, 1.0, 1.0), 3.0
        for r,g,b in self:
            if r+g+b < n:
                rgb, n = (r,g,b), r+g+b
        return rgb

    darkest = property(_darkest)

    def _lightest(self):
        """ Returns the lightest swatch.
""" rgb, n = (0.0, 0.0, 0.0), 0.0 for r,g,b in self: if r+g+b > n: rgb, n = (r,g,b), r+g+b return rgb lightest = property(_lightest) def draw(self, x, y, w=40, h=40): try: from web import _ctx except: pass from nodebox.graphics import RGB for r,g,b in self: _ctx.colormode(RGB) _ctx.fill(r,g,b) _ctx.rect(x, y, w, h) x += w ### KULER ############################################################################################ class Kuler(list, URLAccumulator): def __init__(self, q, page=0, wait=10, asynchronous=False, cached=True): """ Parses color themes from Adobe Kuler. Valid queries are "popular", "rating", a theme id as an integer, or a search string. """ if cached: cache = "kuler" else: cache = None # Requests for search, popular, rating and id have different url. url = "http://kuler.adobe.com/kuler/services/" self.id_string = url + "theme/get.cfm?themeId=" if isinstance(q, int): url = self.id_string + str(q) elif q in ["popular", "rating"]: url += "theme/getList.cfm?listType="+q url += "&startIndex="+str(page*30)+"&itemsPerPage=30" else: url += "search/get.cfm?searchQuery="+quote(q) url += "&startIndex="+str(page*30)+"&itemsPerPage=30" # Refresh cached results every day # for highest rating or popular requests. if q in ["popular", "rating"]: if cached and Cache(cache).age(url) > 0: Cache(cache).remove(url) URLAccumulator.__init__(self, url, wait, asynchronous, cache, type=".xml", throttle=3) def load(self, data): if data == "": return if data.find("0") > 0: return dom = minidom.parseString(data) for theme in dom.getElementsByTagName("theme"): try: self.append(self.parse_theme(theme)) except: pass def parse_tag(self, xml, tag): return xml.getElementsByTagName(tag)[0].childNodes[0].nodeValue def parse_theme(self, xml): """ Parses a theme from XML returned by Kuler. Gets the theme's id, label and swatches. All of the swatches are converted to RGB. If we have a full description for a theme id in cache, parse that to get tags associated with the theme. """ kt = KulerTheme() kt.author = xml.getElementsByTagName("author")[0] kt.author = kt.author.childNodes[1].childNodes[0].nodeValue kt.id = int(self.parse_tag(xml, "id")) kt.label = self.parse_tag(xml, "label") mode = self.parse_tag(xml, "mode") for swatch in xml.getElementsByTagName("swatch"): c1 = float(self.parse_tag(swatch, "c1")) c2 = float(self.parse_tag(swatch, "c2")) c3 = float(self.parse_tag(swatch, "c3")) c4 = float(self.parse_tag(swatch, "c4")) if mode == "rgb": kt.append((c1,c2,c3)) if mode == "cmyk": kt.append(cmyk_to_rgb(c1,c2,c3,c4)) if mode == "hsv": kt.append(colorsys.hsv_to_rgb(c1,c2,c3)) if mode == "hex": kt.append(hex_to_rgb(c1)) if mode == "lab": kt.append(lab_to_rgb(c1,c2,c3)) # If we have the full theme in cache, # parse tags from it. 
        if self._cache.exists(self.id_string + str(kt.id)):
            xml = self._cache.read(self.id_string + str(kt.id))
            xml = minidom.parseString(xml)
            for tags in xml.getElementsByTagName("tag"):
                tags = self.parse_tag(tags, "label")
                tags = tags.split(" ")
                kt.tags.extend(tags)

        return kt

######################################################################################################

def search_by_popularity(page=0, wait=10, asynchronous=False, cached=True):
    return Kuler("popular", page, wait, asynchronous, cached)

def search_by_rating(page=0, wait=10, asynchronous=False, cached=True):
    return Kuler("rating", page, wait, asynchronous, cached)

def search(q, page=0, wait=10, asynchronous=False, cached=True):
    return Kuler(str(q), page, wait, asynchronous, cached)

def search_by_id(id, page=0, wait=10, asynchronous=False, cached=True):
    return Kuler(int(id), page, wait, asynchronous, cached)

######################################################################################################

def preview(theme):
    try: from web import _ctx
    except:
        pass

    # Use the darkest swatch as background.
    r,g,b = theme.darkest
    c = _ctx.color(r, g, b)
    c.brightness *= 0.5
    c.brightness = max(0.1, c.brightness)
    c.brightness = 0.15
    _ctx.background(c)
    #_ctx.background(0.1)

    from random import random, choice
    for i in range(100):
        r,g,b = choice(theme)
        _ctx.fill(r,g,b)
        r,g,b = choice(theme)
        _ctx.stroke(r,g,b)
        _ctx.strokewidth(random()*30)
        r = random()*100
        _ctx.oval(random()*400, random()*400, r, r)

    # Draw swatches.
    _ctx.nostroke()
    theme.draw(20, 480)

    # Theme info colored in the lightest swatch.
    r,g,b = theme.lightest
    _ctx.fontsize(18)
    _ctx.fill(r,g,b)
    _ctx.text(theme.label + u" | " + str(theme.id), 20, 540)
    _ctx.fontsize(_ctx.fontsize()/2)
    _ctx.text(", ".join(theme.tags), 20, 555, width=400)

#size(500, 650)
#themes = search("rating")
##for theme in themes[:10]: Kuler(theme.id)
#theme = themes[0]
#preview(theme)

# nodebox-web-1.9.4.6/mathtex.py

### MATHTEX ##########################################################################################
# Author: Tom De Smedt, Cedric Foellmi.
# Copyright (c) 2008 by Tom De Smedt, Cedric Foellmi.
# See LICENSE.txt for details.

from url import URLAccumulator
from urllib import quote
from cache import Cache

def clear_cache():
    Cache("mathtex").clear()

class mathTeX(URLAccumulator):
    """ The mathTeX server returns a GIF or PNG-image for a LaTeX math expression.
    http://www.forkosh.com/mathtex.html
    """

    def __init__(self, eq, type="png", dpi=120, color="", wait=10, asynchronous=False):
        eq = "\\"+type+" "+eq
        eq = "\dpi{"+str(dpi)+"} " + eq
        if color:
            eq = "\usepackage{color} \color{"+color+"} " + eq
        print eq
        url = "http://www.forkosh.dreamhost.com/mathtex.cgi?"+quote(eq)
        URLAccumulator.__init__(self, url, wait, asynchronous, "mathtex", type="."+type, throttle=1)

    def load(self, data):
        # Provide the path to the image stored in cache.
        self.image = self._cache.hash(self.url)

def gif(eq, dpi=120, color=""):
    return mathTeX(eq, type="gif", dpi=dpi, color=color).image

def png(eq, dpi=120, color=""):
    return mathTeX(eq, type="png", dpi=dpi, color=color).image

def png300(eq, color=""):
    return png(eq, 300, color)

#eq = "E = hf = \frac{hc}{\lambda} \,\! "
" #image(gif(eq), 10, 10)nodebox-web-1.9.4.6/mimetex.py000066400000000000000000000020401135274433600161300ustar00rootroot00000000000000### MIMETEX ########################################################################################## # Code for connecting to mimeTeX server to convert LaTeX math to images. # Author: Tom De Smedt. # Copyright (c) 2007 by Tom De Smedt. # See LICENSE.txt for details. from url import URLAccumulator from urllib import quote from cache import Cache def clear_cache(): Cache("mimetex").clear() class mimeTeX(URLAccumulator): """ The mimeTeX server returns a GIF-image for a LaTeX math expression. http://www.forkosh.com/mimetex.html """ def __init__(self, eq, wait=10, asynchronous=False): url = "http://www.forkosh.dreamhost.com/mimetex.cgi?"+quote(eq) URLAccumulator.__init__(self, url, wait, asynchronous, "mimetex", type=".gif", throttle=1) def load(self, data): # Provide the path to the GIF stored in cache. self.image = self._cache.hash(self.url) def gif(eq): return mimeTeX(eq).image #eq = "E = hf = \frac{hc}{\lambda} \,\! " #image(gif(eq), 10, 10)nodebox-web-1.9.4.6/morguefile.py000066400000000000000000000124371135274433600166310ustar00rootroot00000000000000### MORGUEFILE ####################################################################################### # Code for downloading images from MorgueFile. # Author: Tom De Smedt, Stuart Axon. # Copyright (c) 2007 by Tom De Smedt. # See LICENSE.txt for details. import os from urllib import quote_plus from xml.dom.minidom import parseString from url import URLAccumulator from cache import Cache def clear_cache(): Cache("morguefile").clear() ### MORGUEFILE IMAGE ################################################################################# SIZE_THUMBNAIL = "thumbnail" SIZE_LARGE = "medium" def disambiguate_size(size): if size == True : return SIZE_THUMBNAIL if size == False : return SIZE_LARGE if size.lower() in ("thumbnail", "thumb", "t", "th", "small", "s"): return SIZE_THUMBNAIL if size.lower() in ("medium", "m", "large", "l"): return SIZE_LARGE return size class MorgueFileImage(URLAccumulator): def __init__(self): self.id = 0 self.author = "" self.name = "" self.url = "" self.date = "" self.hi_res = None self.width = None self.height = None # For backwards compatibility (don't exist anymore now). self.category = "" self.views = 0 self.downloads = 0 self.path = "" def __str__(self): return self.name.encode("utf-8") def download(self, size=SIZE_LARGE, thumbnail=False, wait=60, asynchronous=False): """ Downloads this image to cache. Calling the download() method instantiates an asynchronous URLAccumulator. Once it is done downloading, this image will have its path property set to an image file in the cache. 
""" if thumbnail == True: size = SIZE_THUMBNAIL # backwards compatibility self._size = disambiguate_size(size) if self._size == SIZE_THUMBNAIL: url = self.url.replace("/preview/", "/med/") else: url = self.url cache = "morguefile" extension = os.path.splitext(url)[1] URLAccumulator.__init__(self, url, wait, asynchronous, cache, extension, 2) if not asynchronous: return self.path def load(self, data): if self._size == SIZE_THUMBNAIL: url = self.url.replace("/preview/", "/med/") else: url = self.url self.path = self._cache.hash(url) ### MORGUEFILE ####################################################################################### class MorgueFile(list): def __init__(self, xml): self._parse(xml) def _parse_data(self, e, tag): return e.getElementsByTagName(tag)[0].childNodes[0].data def _parse_attribute(self, e, tag, attr): return e.getElementsByTagName(tag)[0].attributes[attr].value def _parse(self, xml): if xml == "": return xml = xml.replace("& ", "& ") xml = xml.decode("utf-8", "ignore") dom = parseString(xml) for e in dom.getElementsByTagName("item"): img = MorgueFileImage() img.id = self._parse_data(e, "media:guid") img.author = self._parse_data(e, "media:credit") img.name = self._parse_data(e, "media:title") img.date = self._parse_data(e, "pubDate") img.url = self._parse_attribute(e, "media:thumbnail", "url").replace("/med/", "/preview/") img.hi_res = self._parse_attribute(e, "media:content", "url") img.width = float(self._parse_attribute(e, "media:content", "width")) img.height = float(self._parse_attribute(e, "media:content", "height")) # The width of /preview/ image is always 620, # calculacte the height according to this ratio: img.width, img.height = 620.0, img.height / img.width * 620.0 self.append(img) ### MORGUEFILE SEARCH ################################################################################ class MorgueFileSearch(MorgueFile, URLAccumulator): def __init__(self, q, author=False, max=100, wait=10, asynchronous=False, cached=True): if cached: cache = "morguefile" else: cache = None arg = "qury" if author == True: arg = "author" url = "http://morguefile.com/archive/xml/" url += "?" + arg + "=" + quote_plus(q) + "&lmt=" + str(max) URLAccumulator.__init__(self, url, wait, asynchronous, cache, ".xml", 1) def load(self, data): MorgueFile.__init__(self, data) ###################################################################################################### def search(q, max=100, wait=10, asynchronous=False, cached=True): return MorgueFileSearch(q, False, max, wait, asynchronous, cached) def search_by_author(q, max=100, wait=10, asynchronous=False, cached=True): return MorgueFileSearch(q, True, max, wait, asynchronous, cached) #images = search("apple") #images.sort() #for img in images: # print img.name #img = images[0] #img.download() #image(img.path, 0, 0)nodebox-web-1.9.4.6/newsfeed.py000066400000000000000000000104371135274433600162710ustar00rootroot00000000000000### NEWSFEED ######################################################################################### # Code for parsing newsfeeds. # It wraps the Universal Feedparser by Mark Pilgrim. # Author: Tom De Smedt. # Copyright (c) 2007 by Tom De Smedt. # See LICENSE.txt for details. 
import os

from feedparser import feedparser
from url import URLAccumulator
from html import strip_tags
from cache import Cache

def clear_cache():
    Cache("newsfeed").clear()

### FAVORITE NEWSFEED ################################################################################

favorites = {}
try:
    path = os.path.join(os.path.dirname(__file__), "newsfeed.txt")
    for f in open(path).readlines():
        f = f.split(",")
        favorites[f[0].strip()] = f[1].strip()
except:
    pass

def favorite_url(name):
    if favorites.has_key(name):
        return favorites[name]
    for key in favorites:
        if key.lower().find(name.lower()) >= 0:
            return favorites[key]
    return None

favorite = favorite_url

### NEWSFEED #########################################################################################

class Newsfeed:

    """ Wrapper for the feedparser.FeedParserDict class.
    Ensures that Newsfeed.items redirects to Newsfeed.entries,
    and returns an empty string (by default) instead of raising an error
    when a key could not be found - this way we don't have to check
    if a key exists before fetching its value.
    """

    def __init__(self, feed, none=u""):
        self._feed = feed
        self._none = none

    def __call__(self, *args):
        raise TypeError, "Newsfeed object not callable"

    def __repr__(self):
        return strip_tags(self._feed.__repr__())

    def __unicode__(self):
        return strip_tags(self._feed)

    def __str__(self):
        try: s = self._feed.encode("utf-8")
        except: s = self._feed.__str__()
        return strip_tags(s)

    def __getitem__(self, item):
        try: return Newsfeed(self._feed.__getitem__(item))
        except: return Newsfeed(self._none)

    def has_key(self, key):
        return self._feed.has_key(key)

    def __iter__(self):
        return self._feed.__iter__()

    def __getattr__(self, a):
        if a == "items": a = "entries"
        try:
            a = self._feed.__getattr__(a)
            if isinstance(a, list):
                a = [Newsfeed(x, self._none) for x in a]
            return Newsfeed(a)
        except:
            return Newsfeed(self._none)

### NEWSFEED DOWNLOAD ################################################################################

class NewsfeedDownload(Newsfeed, URLAccumulator):

    """ Asynchronous cached Newsfeed.
    """

    def __init__(self, url, wait=10, asynchronous=False, cached=True, none=""):
        self._feed = None
        self._none = none
        if cached:
            cache = "newsfeed"
        else:
            cache = None
        # Refresh cached news results every day.
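        # (Cache.age() is assumed to return the age in days,
        # so anything fetched before today is purged and downloaded again.)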
        if cached and Cache(cache).age(url) > 0:
            Cache(cache).remove(url)
        URLAccumulator.__init__(self, url, wait, asynchronous, cache, ".xml")

    def load(self, data):
        parsed = feedparser.parse(data)
        Newsfeed.__init__(self, parsed, self._none)

def parse(url, wait=10, asynchronous=False, cached=True, none=""):
    nf = NewsfeedDownload(url, wait, asynchronous, cached, none)
    try:
        # Some shortcuts:
        nf.title       = nf.channel.title
        nf.description = nf.channel.description
        nf.link        = nf.channel.link
        nf.date        = nf.channel.date
    except:
        pass
    return nf

"""
url = favorite_url("white house")
newsfeed = parse(url)

print "Channel:", newsfeed.channel.title
print "Channel description:", newsfeed.channel.description
print "Channel link:", newsfeed.channel.link
print "Channel date:", newsfeed.channel.date
print "Encoding:", newsfeed.encoding

for item in newsfeed.items:
    print "Title:", item.title
    print "Link:", item.link
    print "Description:", item.description
    print "Date:", item.date
    print ">>>", item.date_parsed
    print "Author:", item.author
    print ">>", item.author_detail.name
    print ">>", item.author_detail.email
"""
nodebox-web-1.9.4.6/newsfeed.txt000066400000000000000000000034411135274433600164550ustar00rootroot00000000000000
White House, http://www.whitehouse.gov/rss/news.xml
Second Life, http://blog.secondlife.com/feed/
delicious, http://del.icio.us/rss/
Reuters, http://feeds.reuters.com/reuters/topNews/
Slashdot, http://rss.slashdot.org/Slashdot/slashdot
Reuters Oddly Enough, http://feeds.reuters.com/reuters/oddlyEnoughNews/
New York Times, http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml
MSNBC, http://rss.msnbc.msn.com/id/3032091/device/rss/rss.xml
MIT, http://web.mit.edu/newsoffice/mitnews-rss.xml
Cosmopolitan, http://www.ivillage.com/rss/0,,187712131206,00.xml
Nature, http://www.nature.com/nature/current_issue/rss/index.html
UC Berkeley, http://www.berkeley.edu/news/rss/UCBNewsCenter.rss
Wired, http://feeds.wired.com/wired/topheadlines
freshmeat, http://rss.freshmeat.net/freshmeat/feeds/fm-releases-global
Harvard University, http://www.hno.harvard.edu/rss/home.xml
Herald Tribune, http://www.iht.com/rss/frontpage.xml
Al Jazeera, http://csociety.ecn.purdue.edu/~jacoby/XML/aljazeera.rss
Urban Dictionary, http://feeds.urbandictionary.com/UrbanWordOfTheDay
BBC, http://newsrss.bbc.co.uk/rss/newsonline_world_edition/front_page/rss.xml
MTV, http://www.mtv.com/rss/news/latestcached.jhtml
CNN, http://rss.cnn.com/rss/edition.rss
Processing Blogs, http://www.processingblogs.org/feed/atom/
Yahoo!, http://rss.news.yahoo.com/rss/topstories
Wikinews, http://feeds.feedburner.com/WikinewsLatestNews
Google, http://news.google.com/news?ned=us&topic=h&output=rss
Apple, http://images.apple.com/main/rss/hotnews/hotnews.rss
Blogger, http://feeds.feedburner.com/BloggerBuzz
Science Magazine, http://www.sciencemag.org/rss/podcast.xml
NASA, http://www.nasa.gov/rss/breaking_news.rss
The Washington Post, http://www.washingtonpost.com/wp-dyn/rss/linkset/2005/03/24/LI2005032400102.xml
TIME, http://feeds.feedburner.com/time/topstories
nodebox-web-1.9.4.6/page.py000066400000000000000000000112541135274433600154030ustar00rootroot00000000000000
### PAGE #############################################################################################
# Code for querying the HTML DOM.
# It wraps BeautifulSoup by Leonard Richardson.
# Author: Tom De Smedt.
# Copyright (c) 2007 by Tom De Smedt.
# See LICENSE.txt for details.
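
# In short: parse(url) returns a Page, a BeautifulSoup tree with caching,
# that can be queried with find() / find_all() or tag properties (e.g. page.body.p).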
from BeautifulSoup import BeautifulSoup, Tag
from url import URLAccumulator, URLParser
from html import replace_entities, plain
from cache import Cache

def clear_cache():
    Cache("html").clear()

### PAGE ERRORS ######################################################################################

class PageUnicodeError(Exception):
    def __str__(self): return str(self.__class__)

class PageParseError(Exception):
    def __str__(self): return str(self.__class__)

### PAGE #############################################################################################

Tag.find_all = Tag.findAll

class Page(BeautifulSoup, URLAccumulator):

    """ DOM tree of an HTML page.

    Page is essentially an asynchronous download of a BeautifulSoup page.
    It has the following properties and methods:

    title - the page title
    description - the meta description
    keywords - the meta keywords
    links() - by default, returns external links
    find(tag, attribute=value) - find the first tag with given attributes
    find_all(tag, attribute=value) - find all tags with given attributes

    find() and find_all() return objects that have find() and find_all() too.
    They're essentially lists of Tag objects.
    Alternatively, get tags directly as properties, e.g.
    page.body.p - returns a list of all p Tag objects
    (each has find() and find_all() )

    To get attributes from a Tag: p["id"]
    """

    def __init__(self, url, wait=10, asynchronous=False, cached=True):
        if cached:
            cache = "html"
        else:
            cache = None
        URLAccumulator.__init__(self, url, wait, asynchronous, cache)

    def load(self, data):
        data = replace_entities(data)
        try:
            BeautifulSoup.__init__(self, data)
        except UnicodeEncodeError:
            self.error = PageUnicodeError()
            BeautifulSoup.__init__(self, "")
        except:
            self.error = PageParseError()
            BeautifulSoup.__init__(self, "")

    def _title(self):
        """ Returns the page title.
        """
        return self.find("title").string

    title = property(_title)

    def _description(self):
        """ Returns the meta description in the page.
        """
        meta = self.find("meta", {"name":"description"})
        if isinstance(meta, dict) and \
           meta.has_key("content"):
            return meta["content"]
        else:
            return u""

    description = property(_description)

    def _keywords(self):
        """ Returns the meta keywords in the page.
        """
        meta = self.find("meta", {"name":"keywords"})
        if isinstance(meta, dict) and \
           meta.has_key("content"):
            keywords = [k.strip() for k in meta["content"].split(",")]
        else:
            keywords = []
        return keywords

    keywords = property(_keywords)

    def links(self, external=True):
        """ Retrieves links in the page.
        Returns a list of URL's.
        By default, only external URL's are returned.
        External URL's start with http:// and point to a domain
        other than the one the page is on.
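
        A sketch:
            page = parse("http://nodebox.net")
            print page.links()               # external links only
            print page.links(external=False) # all links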
""" domain = URLParser(self.url).domain links = [] for a in self("a"): for attribute, value in a.attrs: if attribute == "href": if not external \ or (value.startswith("http://") and value.find("http://"+domain) < 0): links.append(value) return links def find_class(self, classname, tag=""): return self( tag, {"class": classname} ) def parse(url, wait=10, asynchronous=False, cached=True): return Page(url, wait, asynchronous, cached) """ import url url = url.create("http://nodebox.net/code/index.php/Share") url.query["p"] = 2 print url page = parse(url) print page.title print page.title.string print page.description() print page.keywords() print page.find(id="content")["id"] # find() returns a list of Tags and has a find() method for p in page.body.find("div", id="content").find_all("p"): print ">>>", plain(p) print page.links() print page.find_all("h2") print page.contents[0].name # .div returns a list of Tags print page.body.div(id="content")[0].p """nodebox-web-1.9.4.6/simplejson/000077500000000000000000000000001135274433600162755ustar00rootroot00000000000000nodebox-web-1.9.4.6/simplejson/__init__.py000066400000000000000000000337331135274433600204170ustar00rootroot00000000000000r""" A simple, fast, extensible JSON encoder and decoder JSON (JavaScript Object Notation) is a subset of JavaScript syntax (ECMA-262 3rd edition) used as a lightweight data interchange format. simplejson exposes an API familiar to uses of the standard library marshal and pickle modules. Encoding basic Python object hierarchies:: >>> import simplejson >>> simplejson.dumps(['foo', {'bar': ('baz', None, 1.0, 2)}]) '["foo", {"bar": ["baz", null, 1.0, 2]}]' >>> print simplejson.dumps("\"foo\bar") "\"foo\bar" >>> print simplejson.dumps(u'\u1234') "\u1234" >>> print simplejson.dumps('\\') "\\" >>> print simplejson.dumps({"c": 0, "b": 0, "a": 0}, sort_keys=True) {"a": 0, "b": 0, "c": 0} >>> from StringIO import StringIO >>> io = StringIO() >>> simplejson.dump(['streaming API'], io) >>> io.getvalue() '["streaming API"]' Compact encoding:: >>> import simplejson >>> simplejson.dumps([1,2,3,{'4': 5, '6': 7}], separators=(',',':')) '[1,2,3,{"4":5,"6":7}]' Pretty printing:: >>> import simplejson >>> print simplejson.dumps({'4': 5, '6': 7}, sort_keys=True, indent=4) { "4": 5, "6": 7 } Decoding JSON:: >>> import simplejson >>> simplejson.loads('["foo", {"bar":["baz", null, 1.0, 2]}]') [u'foo', {u'bar': [u'baz', None, 1.0, 2]}] >>> simplejson.loads('"\\"foo\\bar"') u'"foo\x08ar' >>> from StringIO import StringIO >>> io = StringIO('["streaming API"]') >>> simplejson.load(io) [u'streaming API'] Specializing JSON object decoding:: >>> import simplejson >>> def as_complex(dct): ... if '__complex__' in dct: ... return complex(dct['real'], dct['imag']) ... return dct ... >>> simplejson.loads('{"__complex__": true, "real": 1, "imag": 2}', ... object_hook=as_complex) (1+2j) >>> import decimal >>> simplejson.loads('1.1', parse_float=decimal.Decimal) Decimal("1.1") Extending JSONEncoder:: >>> import simplejson >>> class ComplexEncoder(simplejson.JSONEncoder): ... def default(self, obj): ... if isinstance(obj, complex): ... return [obj.real, obj.imag] ... return simplejson.JSONEncoder.default(self, obj) ... 
>>> dumps(2 + 1j, cls=ComplexEncoder) '[2.0, 1.0]' >>> ComplexEncoder().encode(2 + 1j) '[2.0, 1.0]' >>> list(ComplexEncoder().iterencode(2 + 1j)) ['[', '2.0', ', ', '1.0', ']'] Using simplejson from the shell to validate and pretty-print:: $ echo '{"json":"obj"}' | python -msimplejson { "json": "obj" } $ echo '{ 1.2:3.4}' | python -msimplejson Expecting property name: line 1 column 2 (char 2) Note that the JSON produced by this module's default settings is a subset of YAML, so it may be used as a serializer for that as well. """ __version__ = '1.9.1' __all__ = [ 'dump', 'dumps', 'load', 'loads', 'JSONDecoder', 'JSONEncoder', ] if __name__ == '__main__': from simplejson.decoder import JSONDecoder from simplejson.encoder import JSONEncoder else: from decoder import JSONDecoder from encoder import JSONEncoder _default_encoder = JSONEncoder( skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, indent=None, separators=None, encoding='utf-8', default=None, ) def dump(obj, fp, skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, cls=None, indent=None, separators=None, encoding='utf-8', default=None, **kw): """ Serialize ``obj`` as a JSON formatted stream to ``fp`` (a ``.write()``-supporting file-like object). If ``skipkeys`` is ``True`` then ``dict`` keys that are not basic types (``str``, ``unicode``, ``int``, ``long``, ``float``, ``bool``, ``None``) will be skipped instead of raising a ``TypeError``. If ``ensure_ascii`` is ``False``, then the some chunks written to ``fp`` may be ``unicode`` instances, subject to normal Python ``str`` to ``unicode`` coercion rules. Unless ``fp.write()`` explicitly understands ``unicode`` (as in ``codecs.getwriter()``) this is likely to cause an error. If ``check_circular`` is ``False``, then the circular reference check for container types will be skipped and a circular reference will result in an ``OverflowError`` (or worse). If ``allow_nan`` is ``False``, then it will be a ``ValueError`` to serialize out of range ``float`` values (``nan``, ``inf``, ``-inf``) in strict compliance of the JSON specification, instead of using the JavaScript equivalents (``NaN``, ``Infinity``, ``-Infinity``). If ``indent`` is a non-negative integer, then JSON array elements and object members will be pretty-printed with that indent level. An indent level of 0 will only insert newlines. ``None`` is the most compact representation. If ``separators`` is an ``(item_separator, dict_separator)`` tuple then it will be used instead of the default ``(', ', ': ')`` separators. ``(',', ':')`` is the most compact JSON representation. ``encoding`` is the character encoding for str instances, default is UTF-8. ``default(obj)`` is a function that should return a serializable version of obj or raise TypeError. The default simply raises TypeError. To use a custom ``JSONEncoder`` subclass (e.g. one that overrides the ``.default()`` method to serialize additional types), specify it with the ``cls`` kwarg. 
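    For example (mirroring the streaming example in the module docstring)::

        >>> from StringIO import StringIO
        >>> io = StringIO()
        >>> dump(['streaming API'], io)
        >>> io.getvalue()
        '["streaming API"]'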
""" # cached encoder if (skipkeys is False and ensure_ascii is True and check_circular is True and allow_nan is True and cls is None and indent is None and separators is None and encoding == 'utf-8' and default is None and not kw): iterable = _default_encoder.iterencode(obj) else: if cls is None: cls = JSONEncoder iterable = cls(skipkeys=skipkeys, ensure_ascii=ensure_ascii, check_circular=check_circular, allow_nan=allow_nan, indent=indent, separators=separators, encoding=encoding, default=default, **kw).iterencode(obj) # could accelerate with writelines in some versions of Python, at # a debuggability cost for chunk in iterable: fp.write(chunk) def dumps(obj, skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, cls=None, indent=None, separators=None, encoding='utf-8', default=None, **kw): """ Serialize ``obj`` to a JSON formatted ``str``. If ``skipkeys`` is ``True`` then ``dict`` keys that are not basic types (``str``, ``unicode``, ``int``, ``long``, ``float``, ``bool``, ``None``) will be skipped instead of raising a ``TypeError``. If ``ensure_ascii`` is ``False``, then the return value will be a ``unicode`` instance subject to normal Python ``str`` to ``unicode`` coercion rules instead of being escaped to an ASCII ``str``. If ``check_circular`` is ``False``, then the circular reference check for container types will be skipped and a circular reference will result in an ``OverflowError`` (or worse). If ``allow_nan`` is ``False``, then it will be a ``ValueError`` to serialize out of range ``float`` values (``nan``, ``inf``, ``-inf``) in strict compliance of the JSON specification, instead of using the JavaScript equivalents (``NaN``, ``Infinity``, ``-Infinity``). If ``indent`` is a non-negative integer, then JSON array elements and object members will be pretty-printed with that indent level. An indent level of 0 will only insert newlines. ``None`` is the most compact representation. If ``separators`` is an ``(item_separator, dict_separator)`` tuple then it will be used instead of the default ``(', ', ': ')`` separators. ``(',', ':')`` is the most compact JSON representation. ``encoding`` is the character encoding for str instances, default is UTF-8. ``default(obj)`` is a function that should return a serializable version of obj or raise TypeError. The default simply raises TypeError. To use a custom ``JSONEncoder`` subclass (e.g. one that overrides the ``.default()`` method to serialize additional types), specify it with the ``cls`` kwarg. """ # cached encoder if (skipkeys is False and ensure_ascii is True and check_circular is True and allow_nan is True and cls is None and indent is None and separators is None and encoding == 'utf-8' and default is None and not kw): return _default_encoder.encode(obj) if cls is None: cls = JSONEncoder return cls( skipkeys=skipkeys, ensure_ascii=ensure_ascii, check_circular=check_circular, allow_nan=allow_nan, indent=indent, separators=separators, encoding=encoding, default=default, **kw).encode(obj) _default_decoder = JSONDecoder(encoding=None, object_hook=None) def load(fp, encoding=None, cls=None, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, **kw): """ Deserialize ``fp`` (a ``.read()``-supporting file-like object containing a JSON document) to a Python object. If the contents of ``fp`` is encoded with an ASCII based encoding other than utf-8 (e.g. latin-1), then an appropriate ``encoding`` name must be specified. 
Encodings that are not ASCII based (such as UCS-2) are not allowed, and should be wrapped with ``codecs.getreader(fp)(encoding)``, or simply decoded to a ``unicode`` object and passed to ``loads()`` ``object_hook`` is an optional function that will be called with the result of any object literal decode (a ``dict``). The return value of ``object_hook`` will be used instead of the ``dict``. This feature can be used to implement custom decoders (e.g. JSON-RPC class hinting). To use a custom ``JSONDecoder`` subclass, specify it with the ``cls`` kwarg. """ return loads(fp.read(), encoding=encoding, cls=cls, object_hook=object_hook, parse_float=parse_float, parse_int=parse_int, parse_constant=parse_constant, **kw) def loads(s, encoding=None, cls=None, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, **kw): """ Deserialize ``s`` (a ``str`` or ``unicode`` instance containing a JSON document) to a Python object. If ``s`` is a ``str`` instance and is encoded with an ASCII based encoding other than utf-8 (e.g. latin-1) then an appropriate ``encoding`` name must be specified. Encodings that are not ASCII based (such as UCS-2) are not allowed and should be decoded to ``unicode`` first. ``object_hook`` is an optional function that will be called with the result of any object literal decode (a ``dict``). The return value of ``object_hook`` will be used instead of the ``dict``. This feature can be used to implement custom decoders (e.g. JSON-RPC class hinting). ``parse_float``, if specified, will be called with the string of every JSON float to be decoded. By default this is equivalent to float(num_str). This can be used to use another datatype or parser for JSON floats (e.g. decimal.Decimal). ``parse_int``, if specified, will be called with the string of every JSON int to be decoded. By default this is equivalent to int(num_str). This can be used to use another datatype or parser for JSON integers (e.g. float). ``parse_constant``, if specified, will be called with one of the following strings: -Infinity, Infinity, NaN, null, true, false. This can be used to raise an exception if invalid JSON numbers are encountered. To use a custom ``JSONDecoder`` subclass, specify it with the ``cls`` kwarg. """ if (cls is None and encoding is None and object_hook is None and parse_int is None and parse_float is None and parse_constant is None and not kw): return _default_decoder.decode(s) if cls is None: cls = JSONDecoder if object_hook is not None: kw['object_hook'] = object_hook if parse_float is not None: kw['parse_float'] = parse_float if parse_int is not None: kw['parse_int'] = parse_int if parse_constant is not None: kw['parse_constant'] = parse_constant return cls(encoding=encoding, **kw).decode(s) # # Compatibility cruft from other libraries # def decode(s): """ demjson, python-cjson API compatibility hook. Use loads(s) instead. """ import warnings warnings.warn("simplejson.loads(s) should be used instead of decode(s)", DeprecationWarning) return loads(s) def encode(obj): """ demjson, python-cjson compatibility hook. Use dumps(s) instead. """ import warnings warnings.warn("simplejson.dumps(s) should be used instead of encode(s)", DeprecationWarning) return dumps(obj) def read(s): """ jsonlib, JsonUtils, python-json, json-py API compatibility hook. Use loads(s) instead. """ import warnings warnings.warn("simplejson.loads(s) should be used instead of read(s)", DeprecationWarning) return loads(s) def write(obj): """ jsonlib, JsonUtils, python-json, json-py API compatibility hook. 
Use dumps(s) instead. """ import warnings warnings.warn("simplejson.dumps(s) should be used instead of write(s)", DeprecationWarning) return dumps(obj) # # Pretty printer: # curl http://mochikit.com/examples/ajax_tables/domains.json | python -msimplejson # def main(): import sys if len(sys.argv) == 1: infile = sys.stdin outfile = sys.stdout elif len(sys.argv) == 2: infile = open(sys.argv[1], 'rb') outfile = sys.stdout elif len(sys.argv) == 3: infile = open(sys.argv[1], 'rb') outfile = open(sys.argv[2], 'wb') else: raise SystemExit("%s [infile [outfile]]" % (sys.argv[0],)) try: obj = load(infile) except ValueError, e: raise SystemExit(e) dump(obj, outfile, sort_keys=True, indent=4) outfile.write('\n') if __name__ == '__main__': main() nodebox-web-1.9.4.6/simplejson/_speedups.c000066400000000000000000000502371135274433600204370ustar00rootroot00000000000000#include "Python.h" #if PY_VERSION_HEX < 0x02050000 && !defined(PY_SSIZE_T_MIN) typedef int Py_ssize_t; #define PY_SSIZE_T_MAX INT_MAX #define PY_SSIZE_T_MIN INT_MIN #endif #ifdef __GNUC__ #define UNUSED __attribute__((__unused__)) #else #define UNUSED #endif #define DEFAULT_ENCODING "utf-8" static Py_ssize_t ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars); static PyObject * ascii_escape_unicode(PyObject *pystr); static PyObject * ascii_escape_str(PyObject *pystr); static PyObject * py_encode_basestring_ascii(PyObject* self UNUSED, PyObject *pystr); void init_speedups(void); #define S_CHAR(c) (c >= ' ' && c <= '~' && c != '\\' && c != '"') #define MIN_EXPANSION 6 #ifdef Py_UNICODE_WIDE #define MAX_EXPANSION (2 * MIN_EXPANSION) #else #define MAX_EXPANSION MIN_EXPANSION #endif static Py_ssize_t ascii_escape_char(Py_UNICODE c, char *output, Py_ssize_t chars) { Py_UNICODE x; output[chars++] = '\\'; switch (c) { case '\\': output[chars++] = (char)c; break; case '"': output[chars++] = (char)c; break; case '\b': output[chars++] = 'b'; break; case '\f': output[chars++] = 'f'; break; case '\n': output[chars++] = 'n'; break; case '\r': output[chars++] = 'r'; break; case '\t': output[chars++] = 't'; break; default: #ifdef Py_UNICODE_WIDE if (c >= 0x10000) { /* UTF-16 surrogate pair */ Py_UNICODE v = c - 0x10000; c = 0xd800 | ((v >> 10) & 0x3ff); output[chars++] = 'u'; x = (c & 0xf000) >> 12; output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); x = (c & 0x0f00) >> 8; output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); x = (c & 0x00f0) >> 4; output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); x = (c & 0x000f); output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); c = 0xdc00 | (v & 0x3ff); output[chars++] = '\\'; } #endif output[chars++] = 'u'; x = (c & 0xf000) >> 12; output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); x = (c & 0x0f00) >> 8; output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); x = (c & 0x00f0) >> 4; output[chars++] = (x < 10) ? '0' + x : 'a' + (x - 10); x = (c & 0x000f); output[chars++] = (x < 10) ? 
'0' + x : 'a' + (x - 10); } return chars; } static PyObject * ascii_escape_unicode(PyObject *pystr) { Py_ssize_t i; Py_ssize_t input_chars; Py_ssize_t output_size; Py_ssize_t chars; PyObject *rval; char *output; Py_UNICODE *input_unicode; input_chars = PyUnicode_GET_SIZE(pystr); input_unicode = PyUnicode_AS_UNICODE(pystr); /* One char input can be up to 6 chars output, estimate 4 of these */ output_size = 2 + (MIN_EXPANSION * 4) + input_chars; rval = PyString_FromStringAndSize(NULL, output_size); if (rval == NULL) { return NULL; } output = PyString_AS_STRING(rval); chars = 0; output[chars++] = '"'; for (i = 0; i < input_chars; i++) { Py_UNICODE c = input_unicode[i]; if (S_CHAR(c)) { output[chars++] = (char)c; } else { chars = ascii_escape_char(c, output, chars); } if (output_size - chars < (1 + MAX_EXPANSION)) { /* There's more than four, so let's resize by a lot */ output_size *= 2; /* This is an upper bound */ if (output_size > 2 + (input_chars * MAX_EXPANSION)) { output_size = 2 + (input_chars * MAX_EXPANSION); } if (_PyString_Resize(&rval, output_size) == -1) { return NULL; } output = PyString_AS_STRING(rval); } } output[chars++] = '"'; if (_PyString_Resize(&rval, chars) == -1) { return NULL; } return rval; } static PyObject * ascii_escape_str(PyObject *pystr) { Py_ssize_t i; Py_ssize_t input_chars; Py_ssize_t output_size; Py_ssize_t chars; PyObject *rval; char *output; char *input_str; input_chars = PyString_GET_SIZE(pystr); input_str = PyString_AS_STRING(pystr); /* One char input can be up to 6 chars output, estimate 4 of these */ output_size = 2 + (MIN_EXPANSION * 4) + input_chars; rval = PyString_FromStringAndSize(NULL, output_size); if (rval == NULL) { return NULL; } output = PyString_AS_STRING(rval); chars = 0; output[chars++] = '"'; for (i = 0; i < input_chars; i++) { Py_UNICODE c = (Py_UNICODE)input_str[i]; if (S_CHAR(c)) { output[chars++] = (char)c; } else if (c > 0x7F) { /* We hit a non-ASCII character, bail to unicode mode */ PyObject *uni; Py_DECREF(rval); uni = PyUnicode_DecodeUTF8(input_str, input_chars, "strict"); if (uni == NULL) { return NULL; } rval = ascii_escape_unicode(uni); Py_DECREF(uni); return rval; } else { chars = ascii_escape_char(c, output, chars); } /* An ASCII char can't possibly expand to a surrogate! 
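       (every byte here is < 0x80, so an escape is at most the six-char \uXXXX
       form, which is why the resize check below only needs MIN_EXPANSION headroom)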
*/ if (output_size - chars < (1 + MIN_EXPANSION)) { /* There's more than four, so let's resize by a lot */ output_size *= 2; if (output_size > 2 + (input_chars * MIN_EXPANSION)) { output_size = 2 + (input_chars * MIN_EXPANSION); } if (_PyString_Resize(&rval, output_size) == -1) { return NULL; } output = PyString_AS_STRING(rval); } } output[chars++] = '"'; if (_PyString_Resize(&rval, chars) == -1) { return NULL; } return rval; } void raise_errmsg(char *msg, PyObject *s, Py_ssize_t end) { static PyObject *errmsg_fn = NULL; PyObject *pymsg; if (errmsg_fn == NULL) { PyObject *decoder = PyImport_ImportModule("simplejson.decoder"); if (decoder == NULL) return; errmsg_fn = PyObject_GetAttrString(decoder, "errmsg"); if (errmsg_fn == NULL) return; Py_XDECREF(decoder); } #if PY_VERSION_HEX < 0x02050000 pymsg = PyObject_CallFunction(errmsg_fn, "(zOi)", msg, s, end); #else pymsg = PyObject_CallFunction(errmsg_fn, "(zOn)", msg, s, end); #endif PyErr_SetObject(PyExc_ValueError, pymsg); Py_XDECREF(pymsg); /* def linecol(doc, pos): lineno = doc.count('\n', 0, pos) + 1 if lineno == 1: colno = pos else: colno = pos - doc.rindex('\n', 0, pos) return lineno, colno def errmsg(msg, doc, pos, end=None): lineno, colno = linecol(doc, pos) if end is None: return '%s: line %d column %d (char %d)' % (msg, lineno, colno, pos) endlineno, endcolno = linecol(doc, end) return '%s: line %d column %d - line %d column %d (char %d - %d)' % ( msg, lineno, colno, endlineno, endcolno, pos, end) */ } static PyObject * join_list_unicode(PyObject *lst) { static PyObject *ustr = NULL; static PyObject *joinstr = NULL; if (ustr == NULL) { Py_UNICODE c = 0; ustr = PyUnicode_FromUnicode(&c, 0); } if (joinstr == NULL) { joinstr = PyString_FromString("join"); } if (joinstr == NULL || ustr == NULL) { return NULL; } return PyObject_CallMethodObjArgs(ustr, joinstr, lst, NULL); } static PyObject * scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict) { PyObject *rval; Py_ssize_t len = PyString_GET_SIZE(pystr); Py_ssize_t begin = end - 1; Py_ssize_t next = begin; char *buf = PyString_AS_STRING(pystr); PyObject *chunks = PyList_New(0); if (chunks == NULL) { goto bail; } while (1) { /* Find the end of the string or the next escape */ Py_UNICODE c = 0; PyObject *chunk = NULL; for (next = end; next < len; next++) { c = buf[next]; if (c == '"' || c == '\\') { break; } else if (strict && c <= 0x1f) { raise_errmsg("Invalid control character at", pystr, begin); goto bail; } } if (!(c == '"' || c == '\\')) { raise_errmsg("Unterminated string starting at", pystr, begin); goto bail; } /* Pick up this chunk if it's not zero length */ if (next != end) { PyObject *strchunk = PyBuffer_FromMemory(&buf[end], next - end); if (strchunk == NULL) { goto bail; } chunk = PyUnicode_FromEncodedObject(strchunk, encoding, NULL); Py_XDECREF(strchunk); if (chunk == NULL) { goto bail; } if (PyList_Append(chunks, chunk)) { goto bail; } Py_DECREF(chunk); } next++; if (c == '"') { end = next; break; } if (next == len) { raise_errmsg("Unterminated string starting at", pystr, begin); goto bail; } c = buf[next]; if (c != 'u') { /* Non-unicode backslash escapes */ end = next + 1; switch (c) { case '"': break; case '\\': break; case '/': break; case 'b': c = '\b'; break; case 'f': c = '\f'; break; case 'n': c = '\n'; break; case 'r': c = '\r'; break; case 't': c = '\t'; break; default: c = 0; } if (c == 0) { raise_errmsg("Invalid \\escape", pystr, end - 2); goto bail; } } else { c = 0; next++; end = next + 4; if (end >= len) { raise_errmsg("Invalid \\uXXXX 
escape", pystr, next - 1); goto bail; } /* Decode 4 hex digits */ for (; next < end; next++) { Py_ssize_t shl = (end - next - 1) << 2; Py_UNICODE digit = buf[next]; switch (digit) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': c |= (digit - '0') << shl; break; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': c |= (digit - 'a' + 10) << shl; break; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': c |= (digit - 'A' + 10) << shl; break; default: raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5); goto bail; } } #ifdef Py_UNICODE_WIDE /* Surrogate pair */ if (c >= 0xd800 && c <= 0xdbff) { Py_UNICODE c2 = 0; if (end + 6 >= len) { raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr, end - 5); } if (buf[next++] != '\\' || buf[next++] != 'u') { raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr, end - 5); } end += 6; /* Decode 4 hex digits */ for (; next < end; next++) { Py_ssize_t shl = (end - next - 1) << 2; Py_UNICODE digit = buf[next]; switch (digit) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': c2 |= (digit - '0') << shl; break; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': c2 |= (digit - 'a' + 10) << shl; break; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': c2 |= (digit - 'A' + 10) << shl; break; default: raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5); goto bail; } } c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00)); } #endif } chunk = PyUnicode_FromUnicode(&c, 1); if (chunk == NULL) { goto bail; } if (PyList_Append(chunks, chunk)) { goto bail; } Py_DECREF(chunk); } rval = join_list_unicode(chunks); if (rval == NULL) { goto bail; } Py_DECREF(chunks); chunks = NULL; #if PY_VERSION_HEX < 0x02050000 return Py_BuildValue("(Ni)", rval, end); #else return Py_BuildValue("(Nn)", rval, end); #endif bail: Py_XDECREF(chunks); return NULL; } static PyObject * scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict) { PyObject *rval; Py_ssize_t len = PyUnicode_GET_SIZE(pystr); Py_ssize_t begin = end - 1; Py_ssize_t next = begin; const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr); PyObject *chunks = PyList_New(0); if (chunks == NULL) { goto bail; } while (1) { /* Find the end of the string or the next escape */ Py_UNICODE c = 0; PyObject *chunk = NULL; for (next = end; next < len; next++) { c = buf[next]; if (c == '"' || c == '\\') { break; } else if (strict && c <= 0x1f) { raise_errmsg("Invalid control character at", pystr, begin); goto bail; } } if (!(c == '"' || c == '\\')) { raise_errmsg("Unterminated string starting at", pystr, begin); goto bail; } /* Pick up this chunk if it's not zero length */ if (next != end) { chunk = PyUnicode_FromUnicode(&buf[end], next - end); if (chunk == NULL) { goto bail; } if (PyList_Append(chunks, chunk)) { goto bail; } Py_DECREF(chunk); } next++; if (c == '"') { end = next; break; } if (next == len) { raise_errmsg("Unterminated string starting at", pystr, begin); goto bail; } c = buf[next]; if (c != 'u') { /* Non-unicode backslash escapes */ end = next + 1; switch (c) { case '"': break; case '\\': break; case '/': break; case 'b': c = '\b'; break; case 'f': c = '\f'; break; case 'n': c = '\n'; break; case 'r': c = '\r'; break; case 't': c = '\t'; break; default: c = 0; } if (c == 0) { raise_errmsg("Invalid \\escape", pystr, end - 2); goto bail; } } else { c = 0; next++; end = next + 4; if (end >= len) { raise_errmsg("Invalid \\uXXXX escape", pystr, next - 1); goto bail; } 
/* Decode 4 hex digits */ for (; next < end; next++) { Py_ssize_t shl = (end - next - 1) << 2; Py_UNICODE digit = buf[next]; switch (digit) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': c |= (digit - '0') << shl; break; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': c |= (digit - 'a' + 10) << shl; break; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': c |= (digit - 'A' + 10) << shl; break; default: raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5); goto bail; } } #ifdef Py_UNICODE_WIDE /* Surrogate pair */ if (c >= 0xd800 && c <= 0xdbff) { Py_UNICODE c2 = 0; if (end + 6 >= len) { raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr, end - 5); } if (buf[next++] != '\\' || buf[next++] != 'u') { raise_errmsg("Invalid \\uXXXX\\uXXXX surrogate pair", pystr, end - 5); } end += 6; /* Decode 4 hex digits */ for (; next < end; next++) { Py_ssize_t shl = (end - next - 1) << 2; Py_UNICODE digit = buf[next]; switch (digit) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': c2 |= (digit - '0') << shl; break; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': c2 |= (digit - 'a' + 10) << shl; break; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': c2 |= (digit - 'A' + 10) << shl; break; default: raise_errmsg("Invalid \\uXXXX escape", pystr, end - 5); goto bail; } } c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00)); } #endif } chunk = PyUnicode_FromUnicode(&c, 1); if (chunk == NULL) { goto bail; } if (PyList_Append(chunks, chunk)) { goto bail; } Py_DECREF(chunk); } rval = join_list_unicode(chunks); if (rval == NULL) { goto bail; } Py_DECREF(chunks); chunks = NULL; #if PY_VERSION_HEX < 0x02050000 return Py_BuildValue("(Ni)", rval, end); #else return Py_BuildValue("(Nn)", rval, end); #endif bail: Py_XDECREF(chunks); return NULL; } PyDoc_STRVAR(pydoc_scanstring, "scanstring(basestring, end, encoding) -> (str, end)\n" "\n" "..." ); static PyObject * py_scanstring(PyObject* self UNUSED, PyObject *args) { PyObject *pystr; Py_ssize_t end; char *encoding = NULL; int strict = 0; #if PY_VERSION_HEX < 0x02050000 if (!PyArg_ParseTuple(args, "Oi|zi:scanstring", &pystr, &end, &encoding, &strict)) { #else if (!PyArg_ParseTuple(args, "On|zi:scanstring", &pystr, &end, &encoding, &strict)) { #endif return NULL; } if (encoding == NULL) { encoding = DEFAULT_ENCODING; } if (PyString_Check(pystr)) { return scanstring_str(pystr, end, encoding, strict); } else if (PyUnicode_Check(pystr)) { return scanstring_unicode(pystr, end, strict); } PyErr_SetString(PyExc_TypeError, "first argument must be a string"); return NULL; } PyDoc_STRVAR(pydoc_encode_basestring_ascii, "encode_basestring_ascii(basestring) -> str\n" "\n" "..." 
); static PyObject * py_encode_basestring_ascii(PyObject* self UNUSED, PyObject *pystr) { /* METH_O */ if (PyString_Check(pystr)) { return ascii_escape_str(pystr); } else if (PyUnicode_Check(pystr)) { return ascii_escape_unicode(pystr); } PyErr_SetString(PyExc_TypeError, "first argument must be a string"); return NULL; } static PyMethodDef speedups_methods[] = { {"encode_basestring_ascii", (PyCFunction)py_encode_basestring_ascii, METH_O, pydoc_encode_basestring_ascii}, {"scanstring", (PyCFunction)py_scanstring, METH_VARARGS, pydoc_scanstring}, {NULL, NULL, 0, NULL} }; void init_speedups(void) { PyObject *m; m = Py_InitModule4("_speedups", speedups_methods, NULL, NULL, PYTHON_API_VERSION); } nodebox-web-1.9.4.6/simplejson/decoder.py000066400000000000000000000257031135274433600202630ustar00rootroot00000000000000""" Implementation of JSONDecoder """ import re import sys from scanner import Scanner, pattern try: from _speedups import scanstring as c_scanstring except ImportError: pass FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL def _floatconstants(): import struct import sys _BYTES = '7FF80000000000007FF0000000000000'.decode('hex') if sys.byteorder != 'big': _BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1] nan, inf = struct.unpack('dd', _BYTES) return nan, inf, -inf NaN, PosInf, NegInf = _floatconstants() def linecol(doc, pos): lineno = doc.count('\n', 0, pos) + 1 if lineno == 1: colno = pos else: colno = pos - doc.rindex('\n', 0, pos) return lineno, colno def errmsg(msg, doc, pos, end=None): lineno, colno = linecol(doc, pos) if end is None: return '%s: line %d column %d (char %d)' % (msg, lineno, colno, pos) endlineno, endcolno = linecol(doc, end) return '%s: line %d column %d - line %d column %d (char %d - %d)' % ( msg, lineno, colno, endlineno, endcolno, pos, end) _CONSTANTS = { '-Infinity': NegInf, 'Infinity': PosInf, 'NaN': NaN, 'true': True, 'false': False, 'null': None, } def JSONConstant(match, context, c=_CONSTANTS): s = match.group(0) fn = getattr(context, 'parse_constant', None) if fn is None: rval = c[s] else: rval = fn(s) return rval, None pattern('(-?Infinity|NaN|true|false|null)')(JSONConstant) def JSONNumber(match, context): match = JSONNumber.regex.match(match.string, *match.span()) integer, frac, exp = match.groups() if frac or exp: fn = getattr(context, 'parse_float', None) or float res = fn(integer + (frac or '') + (exp or '')) else: fn = getattr(context, 'parse_int', None) or int res = fn(integer) return res, None pattern(r'(-?(?:0|[1-9]\d*))(\.\d+)?([eE][-+]?\d+)?')(JSONNumber) STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS) BACKSLASH = { '"': u'"', '\\': u'\\', '/': u'/', 'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t', } DEFAULT_ENCODING = "utf-8" def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match): if encoding is None: encoding = DEFAULT_ENCODING chunks = [] _append = chunks.append begin = end - 1 while 1: chunk = _m(s, end) if chunk is None: raise ValueError( errmsg("Unterminated string starting at", s, begin)) end = chunk.end() content, terminator = chunk.groups() if content: if not isinstance(content, unicode): content = unicode(content, encoding) _append(content) if terminator == '"': break elif terminator != '\\': if strict: raise ValueError(errmsg("Invalid control character %r at", s, end)) else: _append(terminator) continue try: esc = s[end] except IndexError: raise ValueError( errmsg("Unterminated string starting at", s, begin)) if esc != 'u': try: m = _b[esc] except KeyError: raise ValueError( 
errmsg("Invalid \\escape: %r" % (esc,), s, end)) end += 1 else: esc = s[end + 1:end + 5] next_end = end + 5 msg = "Invalid \\uXXXX escape" try: if len(esc) != 4: raise ValueError uni = int(esc, 16) if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535: msg = "Invalid \\uXXXX\\uXXXX surrogate pair" if not s[end + 5:end + 7] == '\\u': raise ValueError esc2 = s[end + 7:end + 11] if len(esc2) != 4: raise ValueError uni2 = int(esc2, 16) uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00)) next_end += 6 m = unichr(uni) except ValueError: raise ValueError(errmsg(msg, s, end)) end = next_end _append(m) return u''.join(chunks), end # Use speedup try: scanstring = c_scanstring except NameError: scanstring = py_scanstring def JSONString(match, context): encoding = getattr(context, 'encoding', None) strict = getattr(context, 'strict', True) return scanstring(match.string, match.end(), encoding, strict) pattern(r'"')(JSONString) WHITESPACE = re.compile(r'\s*', FLAGS) def JSONObject(match, context, _w=WHITESPACE.match): pairs = {} s = match.string end = _w(s, match.end()).end() nextchar = s[end:end + 1] # Trivial empty object if nextchar == '}': return pairs, end + 1 if nextchar != '"': raise ValueError(errmsg("Expecting property name", s, end)) end += 1 encoding = getattr(context, 'encoding', None) strict = getattr(context, 'strict', True) iterscan = JSONScanner.iterscan while True: key, end = scanstring(s, end, encoding, strict) end = _w(s, end).end() if s[end:end + 1] != ':': raise ValueError(errmsg("Expecting : delimiter", s, end)) end = _w(s, end + 1).end() try: value, end = iterscan(s, idx=end, context=context).next() except StopIteration: raise ValueError(errmsg("Expecting object", s, end)) pairs[key] = value end = _w(s, end).end() nextchar = s[end:end + 1] end += 1 if nextchar == '}': break if nextchar != ',': raise ValueError(errmsg("Expecting , delimiter", s, end - 1)) end = _w(s, end).end() nextchar = s[end:end + 1] end += 1 if nextchar != '"': raise ValueError(errmsg("Expecting property name", s, end - 1)) object_hook = getattr(context, 'object_hook', None) if object_hook is not None: pairs = object_hook(pairs) return pairs, end pattern(r'{')(JSONObject) def JSONArray(match, context, _w=WHITESPACE.match): values = [] s = match.string end = _w(s, match.end()).end() # Look-ahead for trivial empty array nextchar = s[end:end + 1] if nextchar == ']': return values, end + 1 iterscan = JSONScanner.iterscan while True: try: value, end = iterscan(s, idx=end, context=context).next() except StopIteration: raise ValueError(errmsg("Expecting object", s, end)) values.append(value) end = _w(s, end).end() nextchar = s[end:end + 1] end += 1 if nextchar == ']': break if nextchar != ',': raise ValueError(errmsg("Expecting , delimiter", s, end)) end = _w(s, end).end() return values, end pattern(r'\[')(JSONArray) ANYTHING = [ JSONObject, JSONArray, JSONString, JSONConstant, JSONNumber, ] JSONScanner = Scanner(ANYTHING) class JSONDecoder(object): """ Simple JSON decoder Performs the following translations in decoding by default: +---------------+-------------------+ | JSON | Python | +===============+===================+ | object | dict | +---------------+-------------------+ | array | list | +---------------+-------------------+ | string | unicode | +---------------+-------------------+ | number (int) | int, long | +---------------+-------------------+ | number (real) | float | +---------------+-------------------+ | true | True | +---------------+-------------------+ | false | False | 
+---------------+-------------------+ | null | None | +---------------+-------------------+ It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as their corresponding ``float`` values, which is outside the JSON spec. """ _scanner = Scanner(ANYTHING) __all__ = ['__init__', 'decode', 'raw_decode'] def __init__(self, encoding=None, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, strict=True): """ ``encoding`` determines the encoding used to interpret any ``str`` objects decoded by this instance (utf-8 by default). It has no effect when decoding ``unicode`` objects. Note that currently only encodings that are a superset of ASCII work, strings of other encodings should be passed in as ``unicode``. ``object_hook``, if specified, will be called with the result of every JSON object decoded and its return value will be used in place of the given ``dict``. This can be used to provide custom deserializations (e.g. to support JSON-RPC class hinting). ``parse_float``, if specified, will be called with the string of every JSON float to be decoded. By default this is equivalent to float(num_str). This can be used to use another datatype or parser for JSON floats (e.g. decimal.Decimal). ``parse_int``, if specified, will be called with the string of every JSON int to be decoded. By default this is equivalent to int(num_str). This can be used to use another datatype or parser for JSON integers (e.g. float). ``parse_constant``, if specified, will be called with one of the following strings: -Infinity, Infinity, NaN, null, true, false. This can be used to raise an exception if invalid JSON numbers are encountered. """ self.encoding = encoding self.object_hook = object_hook self.parse_float = parse_float self.parse_int = parse_int self.parse_constant = parse_constant self.strict = strict def decode(self, s, _w=WHITESPACE.match): """ Return the Python representation of ``s`` (a ``str`` or ``unicode`` instance containing a JSON document) """ obj, end = self.raw_decode(s, idx=_w(s, 0).end()) end = _w(s, end).end() if end != len(s): raise ValueError(errmsg("Extra data", s, end, len(s))) return obj def raw_decode(self, s, **kw): """ Decode a JSON document from ``s`` (a ``str`` or ``unicode`` beginning with a JSON document) and return a 2-tuple of the Python representation and the index in ``s`` where the document ended. This can be used to decode a JSON document from a string that may have extraneous data at the end. """ kw.setdefault('context', self) try: obj, end = self._scanner.iterscan(s, **kw).next() except StopIteration: raise ValueError("No JSON object could be decoded") return obj, end __all__ = ['JSONDecoder'] nodebox-web-1.9.4.6/simplejson/encoder.py000066400000000000000000000322641135274433600202750ustar00rootroot00000000000000""" Implementation of JSONEncoder """ import re try: from simplejson._speedups import encode_basestring_ascii as c_encode_basestring_ascii except ImportError: pass ESCAPE = re.compile(r'[\x00-\x1f\\"\b\f\n\r\t]') ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])') HAS_UTF8 = re.compile(r'[\x80-\xff]') ESCAPE_DCT = { '\\': '\\\\', '"': '\\"', '\b': '\\b', '\f': '\\f', '\n': '\\n', '\r': '\\r', '\t': '\\t', } for i in range(0x20): ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i,)) # Assume this produces an infinity on all machines (probably not guaranteed) INFINITY = float('1e66666') FLOAT_REPR = repr def floatstr(o, allow_nan=True): # Check for specials. 
Note that this type of test is processor- and/or # platform-specific, so do tests which don't depend on the internals. if o != o: text = 'NaN' elif o == INFINITY: text = 'Infinity' elif o == -INFINITY: text = '-Infinity' else: return FLOAT_REPR(o) if not allow_nan: raise ValueError("Out of range float values are not JSON compliant: %r" % (o,)) return text def encode_basestring(s): """ Return a JSON representation of a Python string """ def replace(match): return ESCAPE_DCT[match.group(0)] return '"' + ESCAPE.sub(replace, s) + '"' def py_encode_basestring_ascii(s): if isinstance(s, str) and HAS_UTF8.search(s) is not None: s = s.decode('utf-8') def replace(match): s = match.group(0) try: return ESCAPE_DCT[s] except KeyError: n = ord(s) if n < 0x10000: return '\\u%04x' % (n,) else: # surrogate pair n -= 0x10000 s1 = 0xd800 | ((n >> 10) & 0x3ff) s2 = 0xdc00 | (n & 0x3ff) return '\\u%04x\\u%04x' % (s1, s2) return '"' + str(ESCAPE_ASCII.sub(replace, s)) + '"' try: encode_basestring_ascii = c_encode_basestring_ascii except NameError: encode_basestring_ascii = py_encode_basestring_ascii class JSONEncoder(object): """ Extensible JSON encoder for Python data structures. Supports the following objects and types by default: +-------------------+---------------+ | Python | JSON | +===================+===============+ | dict | object | +-------------------+---------------+ | list, tuple | array | +-------------------+---------------+ | str, unicode | string | +-------------------+---------------+ | int, long, float | number | +-------------------+---------------+ | True | true | +-------------------+---------------+ | False | false | +-------------------+---------------+ | None | null | +-------------------+---------------+ To extend this to recognize other objects, subclass and implement a ``.default()`` method with another method that returns a serializable object for ``o`` if possible, otherwise it should call the superclass implementation (to raise ``TypeError``). """ __all__ = ['__init__', 'default', 'encode', 'iterencode'] item_separator = ', ' key_separator = ': ' def __init__(self, skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, sort_keys=False, indent=None, separators=None, encoding='utf-8', default=None): """ Constructor for JSONEncoder, with sensible defaults. If skipkeys is False, then it is a TypeError to attempt encoding of keys that are not str, int, long, float or None. If skipkeys is True, such items are simply skipped. If ensure_ascii is True, the output is guaranteed to be str objects with all incoming unicode characters escaped. If ensure_ascii is false, the output will be unicode object. If check_circular is True, then lists, dicts, and custom encoded objects will be checked for circular references during encoding to prevent an infinite recursion (which would cause an OverflowError). Otherwise, no such check takes place. If allow_nan is True, then NaN, Infinity, and -Infinity will be encoded as such. This behavior is not JSON specification compliant, but is consistent with most JavaScript based encoders and decoders. Otherwise, it will be a ValueError to encode such floats. If sort_keys is True, then the output of dictionaries will be sorted by key; this is useful for regression tests to ensure that JSON serializations can be compared on a day-to-day basis. If indent is a non-negative integer, then JSON array elements and object members will be pretty-printed with that indent level. An indent level of 0 will only insert newlines. 
None is the most compact representation. If specified, separators should be a (item_separator, key_separator) tuple. The default is (', ', ': '). To get the most compact JSON representation you should specify (',', ':') to eliminate whitespace. If specified, default is a function that gets called for objects that can't otherwise be serialized. It should return a JSON encodable version of the object or raise a ``TypeError``. If encoding is not None, then all input strings will be transformed into unicode using that encoding prior to JSON-encoding. The default is UTF-8. """ self.skipkeys = skipkeys self.ensure_ascii = ensure_ascii self.check_circular = check_circular self.allow_nan = allow_nan self.sort_keys = sort_keys self.indent = indent self.current_indent_level = 0 if separators is not None: self.item_separator, self.key_separator = separators if default is not None: self.default = default self.encoding = encoding def _newline_indent(self): return '\n' + (' ' * (self.indent * self.current_indent_level)) def _iterencode_list(self, lst, markers=None): if not lst: yield '[]' return if markers is not None: markerid = id(lst) if markerid in markers: raise ValueError("Circular reference detected") markers[markerid] = lst yield '[' if self.indent is not None: self.current_indent_level += 1 newline_indent = self._newline_indent() separator = self.item_separator + newline_indent yield newline_indent else: newline_indent = None separator = self.item_separator first = True for value in lst: if first: first = False else: yield separator for chunk in self._iterencode(value, markers): yield chunk if newline_indent is not None: self.current_indent_level -= 1 yield self._newline_indent() yield ']' if markers is not None: del markers[markerid] def _iterencode_dict(self, dct, markers=None): if not dct: yield '{}' return if markers is not None: markerid = id(dct) if markerid in markers: raise ValueError("Circular reference detected") markers[markerid] = dct yield '{' key_separator = self.key_separator if self.indent is not None: self.current_indent_level += 1 newline_indent = self._newline_indent() item_separator = self.item_separator + newline_indent yield newline_indent else: newline_indent = None item_separator = self.item_separator first = True if self.ensure_ascii: encoder = encode_basestring_ascii else: encoder = encode_basestring allow_nan = self.allow_nan if self.sort_keys: keys = dct.keys() keys.sort() items = [(k, dct[k]) for k in keys] else: items = dct.iteritems() _encoding = self.encoding _do_decode = (_encoding is not None and not (_encoding == 'utf-8')) for key, value in items: if isinstance(key, str): if _do_decode: key = key.decode(_encoding) elif isinstance(key, basestring): pass # JavaScript is weakly typed for these, so it makes sense to # also allow them. Many encoders seem to do something like this. 
            # The singleton checks must precede the (int, long) check:
            # isinstance(True, int) is true in Python, so boolean keys would
            # otherwise be marshaled as "True"/"False".
            elif key is True:
                key = 'true'
            elif key is False:
                key = 'false'
            elif key is None:
                key = 'null'
            elif isinstance(key, float):
                key = floatstr(key, allow_nan)
            elif isinstance(key, (int, long)):
                key = str(key)
            elif self.skipkeys:
                continue
            else:
                raise TypeError("key %r is not a string" % (key,))
            if first:
                first = False
            else:
                yield item_separator
            yield encoder(key)
            yield key_separator
            for chunk in self._iterencode(value, markers):
                yield chunk
        if newline_indent is not None:
            self.current_indent_level -= 1
            yield self._newline_indent()
        yield '}'
        if markers is not None:
            del markers[markerid]

    def _iterencode(self, o, markers=None):
        if isinstance(o, basestring):
            if self.ensure_ascii:
                encoder = encode_basestring_ascii
            else:
                encoder = encode_basestring
            _encoding = self.encoding
            if (_encoding is not None and isinstance(o, str)
                    and not (_encoding == 'utf-8')):
                o = o.decode(_encoding)
            yield encoder(o)
        elif o is None:
            yield 'null'
        elif o is True:
            yield 'true'
        elif o is False:
            yield 'false'
        elif isinstance(o, (int, long)):
            yield str(o)
        elif isinstance(o, float):
            yield floatstr(o, self.allow_nan)
        elif isinstance(o, (list, tuple)):
            for chunk in self._iterencode_list(o, markers):
                yield chunk
        elif isinstance(o, dict):
            for chunk in self._iterencode_dict(o, markers):
                yield chunk
        else:
            if markers is not None:
                markerid = id(o)
                if markerid in markers:
                    raise ValueError("Circular reference detected")
                markers[markerid] = o
            for chunk in self._iterencode_default(o, markers):
                yield chunk
            if markers is not None:
                del markers[markerid]

    def _iterencode_default(self, o, markers=None):
        newobj = self.default(o)
        return self._iterencode(newobj, markers)

    def default(self, o):
        """
        Implement this method in a subclass such that it returns
        a serializable object for ``o``, or calls the base implementation
        (to raise a ``TypeError``).

        For example, to support arbitrary iterators, you could
        implement default like this::

            def default(self, o):
                try:
                    iterable = iter(o)
                except TypeError:
                    pass
                else:
                    return list(iterable)
                return JSONEncoder.default(self, o)
        """
        raise TypeError("%r is not JSON serializable" % (o,))

    def encode(self, o):
        """
        Return a JSON string representation of a Python data structure.

        >>> JSONEncoder().encode({"foo": ["bar", "baz"]})
        '{"foo": ["bar", "baz"]}'
        """
        # This is for extremely simple cases and benchmarks.
        if isinstance(o, basestring):
            if isinstance(o, str):
                _encoding = self.encoding
                if (_encoding is not None
                        and not (_encoding == 'utf-8')):
                    o = o.decode(_encoding)
            if self.ensure_ascii:
                return encode_basestring_ascii(o)
            else:
                return encode_basestring(o)
        # This doesn't pass the iterator directly to ''.join() because the
        # exceptions aren't as detailed.  The list call should be roughly
        # equivalent to the PySequence_Fast that ''.join() would do.
        chunks = list(self.iterencode(o))
        return ''.join(chunks)

    def iterencode(self, o):
        """
        Encode the given object and yield each string
        representation as available.

        For example::

            for chunk in JSONEncoder().iterencode(bigobject):
                mysocket.write(chunk)
        """
        if self.check_circular:
            markers = {}
        else:
            markers = None
        return self._iterencode(o, markers)
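# Illustrative sketch (an addition, not part of the upstream module): the
# intended extension point is subclassing JSONEncoder and overriding
# ``default`` so unsupported values are mapped to serializable stand-ins.
class _ExampleSetEncoder(JSONEncoder):
    """Hypothetical encoder that marshals sets as JSON arrays."""
    def default(self, o):
        if isinstance(o, (set, frozenset)):  # built-in set needs Python 2.4+
            return list(o)
        # Everything else falls through to the base class, which raises
        # TypeError for unsupported types.
        return JSONEncoder.default(self, o)

# Usage: _ExampleSetEncoder().encode({'ids': set([1])}) == '{"ids": [1]}'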
__all__ = ['JSONEncoder']

nodebox-web-1.9.4.6/simplejson/scanner.py

"""
Iterator based sre token scanner
"""
import re
from re import VERBOSE, MULTILINE, DOTALL
import sre_parse
import sre_compile
import sre_constants
from sre_constants import BRANCH, SUBPATTERN

__all__ = ['Scanner', 'pattern']

FLAGS = (VERBOSE | MULTILINE | DOTALL)

class Scanner(object):
    def __init__(self, lexicon, flags=FLAGS):
        self.actions = [None]
        # Combine phrases into a compound pattern
        s = sre_parse.Pattern()
        s.flags = flags
        p = []
        for idx, token in enumerate(lexicon):
            phrase = token.pattern
            try:
                subpattern = sre_parse.SubPattern(s,
                    [(SUBPATTERN, (idx + 1, sre_parse.parse(phrase, flags)))])
            except sre_constants.error:
                raise
            p.append(subpattern)
            self.actions.append(token)

        s.groups = len(p) + 1  # NOTE(guido): Added to make SRE validation work
        p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
        self.scanner = sre_compile.compile(p)

    def iterscan(self, string, idx=0, context=None):
        """
        Yield match, end_idx for each match
        """
        match = self.scanner.scanner(string, idx).match
        actions = self.actions
        lastend = idx
        end = len(string)
        while True:
            m = match()
            if m is None:
                break
            matchbegin, matchend = m.span()
            if lastend == matchend:
                break
            action = actions[m.lastindex]
            if action is not None:
                rval, next_pos = action(m, context)
                if next_pos is not None and next_pos != matchend:
                    # "fast forward" the scanner
                    matchend = next_pos
                    match = self.scanner.scanner(string, matchend).match
                yield rval, matchend
            lastend = matchend

def pattern(pattern, flags=FLAGS):
    def decorator(fn):
        fn.pattern = pattern
        fn.regex = re.compile(pattern, flags)
        return fn
    return decorator

nodebox-web-1.9.4.6/simplejson/tests/

nodebox-web-1.9.4.6/simplejson/tests/__init__.py

import unittest
import doctest

def additional_tests():
    import simplejson
    import simplejson.encoder
    import simplejson.decoder
    suite = unittest.TestSuite()
    for mod in (simplejson, simplejson.encoder, simplejson.decoder):
        suite.addTest(doctest.DocTestSuite(mod))
    return suite

def main():
    suite = additional_tests()
    runner = unittest.TextTestRunner()
    runner.run(suite)

if __name__ == '__main__':
    import os
    import sys
    sys.path.insert(0, os.path.dirname(os.path.dirname(
        os.path.dirname(os.path.abspath(__file__)))))
    main()

nodebox-web-1.9.4.6/simplejson/tests/test_decode.py

import decimal
from unittest import TestCase

import simplejson as S

class TestDecode(TestCase):
    def test_decimal(self):
        rval = S.loads('1.1', parse_float=decimal.Decimal)
        self.assert_(isinstance(rval, decimal.Decimal))
        self.assertEquals(rval, decimal.Decimal('1.1'))

    def test_float(self):
        rval = S.loads('1', parse_int=float)
        self.assert_(isinstance(rval, float))
        self.assertEquals(rval, 1.0)

nodebox-web-1.9.4.6/simplejson/tests/test_default.py

from unittest import TestCase

import simplejson as S

class TestDefault(TestCase):
    def test_default(self):
        self.assertEquals(
            S.dumps(type, default=repr),
            S.dumps(repr(type)))
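# Illustration (an added note, not an upstream test): ``default`` receives
# each object the encoder cannot serialize and must return a serializable
# stand-in, so a date-to-string fallback would look like:
#
#     import datetime
#     S.dumps(datetime.date(2001, 1, 2), default=str)  # -> '"2001-01-02"'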
nodebox-web-1.9.4.6/simplejson/tests/test_dump.py000066400000000000000000000004661135274433600220230ustar00rootroot00000000000000from unittest import TestCase from cStringIO import StringIO import simplejson as S class TestDump(TestCase): def test_dump(self): sio = StringIO() S.dump({}, sio) self.assertEquals(sio.getvalue(), '{}') def test_dumps(self): self.assertEquals(S.dumps({}), '{}') nodebox-web-1.9.4.6/simplejson/tests/test_encode_basestring_ascii.py000066400000000000000000000034101135274433600256740ustar00rootroot00000000000000from unittest import TestCase import simplejson.encoder CASES = [ (u'/\\"\ucafe\ubabe\uab98\ufcde\ubcda\uef4a\x08\x0c\n\r\t`1~!@#$%^&*()_+-=[]{}|;:\',./<>?', '"/\\\\\\"\\ucafe\\ubabe\\uab98\\ufcde\\ubcda\\uef4a\\b\\f\\n\\r\\t`1~!@#$%^&*()_+-=[]{}|;:\',./<>?"'), (u'\u0123\u4567\u89ab\ucdef\uabcd\uef4a', '"\\u0123\\u4567\\u89ab\\ucdef\\uabcd\\uef4a"'), (u'controls', '"controls"'), (u'\x08\x0c\n\r\t', '"\\b\\f\\n\\r\\t"'), (u'{"object with 1 member":["array with 1 element"]}', '"{\\"object with 1 member\\":[\\"array with 1 element\\"]}"'), (u' s p a c e d ', '" s p a c e d "'), (u'\U0001d120', '"\\ud834\\udd20"'), (u'\u03b1\u03a9', '"\\u03b1\\u03a9"'), ('\xce\xb1\xce\xa9', '"\\u03b1\\u03a9"'), (u'\u03b1\u03a9', '"\\u03b1\\u03a9"'), ('\xce\xb1\xce\xa9', '"\\u03b1\\u03a9"'), (u'\u03b1\u03a9', '"\\u03b1\\u03a9"'), (u'\u03b1\u03a9', '"\\u03b1\\u03a9"'), (u"`1~!@#$%^&*()_+-={':[,]}|;.?", '"`1~!@#$%^&*()_+-={\':[,]}|;.?"'), (u'\x08\x0c\n\r\t', '"\\b\\f\\n\\r\\t"'), (u'\u0123\u4567\u89ab\ucdef\uabcd\uef4a', '"\\u0123\\u4567\\u89ab\\ucdef\\uabcd\\uef4a"'), ] class TestEncodeBaseStringAscii(TestCase): def test_py_encode_basestring_ascii(self): self._test_encode_basestring_ascii(simplejson.encoder.py_encode_basestring_ascii) def test_c_encode_basestring_ascii(self): self._test_encode_basestring_ascii(simplejson.encoder.c_encode_basestring_ascii) def _test_encode_basestring_ascii(self, encode_basestring_ascii): fname = encode_basestring_ascii.__name__ for input_string, expect in CASES: result = encode_basestring_ascii(input_string) self.assertEquals(result, expect, '%r != %r for %s(%r)' % (result, expect, fname, input_string)) nodebox-web-1.9.4.6/simplejson/tests/test_fail.py000066400000000000000000000055111135274433600217650ustar00rootroot00000000000000from unittest import TestCase import simplejson as S # Fri Dec 30 18:57:26 2005 JSONDOCS = [ # http://json.org/JSON_checker/test/fail1.json '"A JSON payload should be an object or array, not a string."', # http://json.org/JSON_checker/test/fail2.json '["Unclosed array"', # http://json.org/JSON_checker/test/fail3.json '{unquoted_key: "keys must be quoted}', # http://json.org/JSON_checker/test/fail4.json '["extra comma",]', # http://json.org/JSON_checker/test/fail5.json '["double extra comma",,]', # http://json.org/JSON_checker/test/fail6.json '[ , "<-- missing value"]', # http://json.org/JSON_checker/test/fail7.json '["Comma after the close"],', # http://json.org/JSON_checker/test/fail8.json '["Extra close"]]', # http://json.org/JSON_checker/test/fail9.json '{"Extra comma": true,}', # http://json.org/JSON_checker/test/fail10.json '{"Extra value after close": true} "misplaced quoted value"', # http://json.org/JSON_checker/test/fail11.json '{"Illegal expression": 1 + 2}', # http://json.org/JSON_checker/test/fail12.json '{"Illegal invocation": alert()}', # http://json.org/JSON_checker/test/fail13.json '{"Numbers cannot have leading zeroes": 013}', # http://json.org/JSON_checker/test/fail14.json '{"Numbers cannot be hex": 0x14}', 
# http://json.org/JSON_checker/test/fail15.json '["Illegal backslash escape: \\x15"]', # http://json.org/JSON_checker/test/fail16.json '["Illegal backslash escape: \\\'"]', # http://json.org/JSON_checker/test/fail17.json '["Illegal backslash escape: \\017"]', # http://json.org/JSON_checker/test/fail18.json '[[[[[[[[[[[[[[[[[[[["Too deep"]]]]]]]]]]]]]]]]]]]]', # http://json.org/JSON_checker/test/fail19.json '{"Missing colon" null}', # http://json.org/JSON_checker/test/fail20.json '{"Double colon":: null}', # http://json.org/JSON_checker/test/fail21.json '{"Comma instead of colon", null}', # http://json.org/JSON_checker/test/fail22.json '["Colon instead of comma": false]', # http://json.org/JSON_checker/test/fail23.json '["Bad value", truth]', # http://json.org/JSON_checker/test/fail24.json "['single quote']", # http://code.google.com/p/simplejson/issues/detail?id=3 u'["A\u001FZ control characters in string"]', ] SKIPS = { 1: "why not have a string payload?", 18: "spec doesn't specify any nesting limitations", } class TestFail(TestCase): def test_failures(self): for idx, doc in enumerate(JSONDOCS): idx = idx + 1 if idx in SKIPS: S.loads(doc) continue try: S.loads(doc) except ValueError: pass else: self.fail("Expected failure for fail%d.json: %r" % (idx, doc)) nodebox-web-1.9.4.6/simplejson/tests/test_float.py000066400000000000000000000004001135274433600221470ustar00rootroot00000000000000import math from unittest import TestCase import simplejson as S class TestFloat(TestCase): def test_floats(self): for num in [1617161771.7650001, math.pi, math.pi**100, math.pi**-100]: self.assertEquals(float(S.dumps(num)), num) nodebox-web-1.9.4.6/simplejson/tests/test_indent.py000066400000000000000000000016111135274433600223300ustar00rootroot00000000000000from unittest import TestCase import simplejson as S import textwrap class TestIndent(TestCase): def test_indent(self): h = [['blorpie'], ['whoops'], [], 'd-shtaeou', 'd-nthiouh', 'i-vhbjkhnth', {'nifty': 87}, {'field': 'yes', 'morefield': False} ] expect = textwrap.dedent("""\ [ [ "blorpie" ], [ "whoops" ], [], "d-shtaeou", "d-nthiouh", "i-vhbjkhnth", { "nifty": 87 }, { "field": "yes", "morefield": false } ]""") d1 = S.dumps(h) d2 = S.dumps(h, indent=2, sort_keys=True, separators=(',', ': ')) h1 = S.loads(d1) h2 = S.loads(d2) self.assertEquals(h1, h) self.assertEquals(h2, h) self.assertEquals(d2, expect) nodebox-web-1.9.4.6/simplejson/tests/test_pass1.py000066400000000000000000000035401135274433600221010ustar00rootroot00000000000000from unittest import TestCase import simplejson as S # from http://json.org/JSON_checker/test/pass1.json JSON = r''' [ "JSON Test Pattern pass1", {"object with 1 member":["array with 1 element"]}, {}, [], -42, true, false, null, { "integer": 1234567890, "real": -9876.543210, "e": 0.123456789e-12, "E": 1.234567890E+34, "": 23456789012E666, "zero": 0, "one": 1, "space": " ", "quote": "\"", "backslash": "\\", "controls": "\b\f\n\r\t", "slash": "/ & \/", "alpha": "abcdefghijklmnopqrstuvwyz", "ALPHA": "ABCDEFGHIJKLMNOPQRSTUVWYZ", "digit": "0123456789", "special": "`1~!@#$%^&*()_+-={':[,]}|;.?", "hex": "\u0123\u4567\u89AB\uCDEF\uabcd\uef4A", "true": true, "false": false, "null": null, "array":[ ], "object":{ }, "address": "50 St. 
James Street", "url": "http://www.JSON.org/", "comment": "// /* */": " ", " s p a c e d " :[1,2 , 3 , 4 , 5 , 6 ,7 ], "compact": [1,2,3,4,5,6,7], "jsontext": "{\"object with 1 member\":[\"array with 1 element\"]}", "quotes": "" \u0022 %22 0x22 034 "", "\/\\\"\uCAFE\uBABE\uAB98\uFCDE\ubcda\uef4A\b\f\n\r\t`1~!@#$%^&*()_+-=[]{}|;:',./<>?" : "A key can be any string" }, 0.5 ,98.6 , 99.44 , 1066 ,"rosebud"] ''' class TestPass1(TestCase): def test_parse(self): # test in/out equivalence and parsing res = S.loads(JSON) out = S.dumps(res) self.assertEquals(res, S.loads(out)) try: S.dumps(res, allow_nan=False) except ValueError: pass else: self.fail("23456789012E666 should be out of range") nodebox-web-1.9.4.6/simplejson/tests/test_pass2.py000066400000000000000000000005671135274433600221100ustar00rootroot00000000000000from unittest import TestCase import simplejson as S # from http://json.org/JSON_checker/test/pass2.json JSON = r''' [[[[[[[[[[[[[[[[[[["Not too deep"]]]]]]]]]]]]]]]]]]] ''' class TestPass2(TestCase): def test_parse(self): # test in/out equivalence and parsing res = S.loads(JSON) out = S.dumps(res) self.assertEquals(res, S.loads(out)) nodebox-web-1.9.4.6/simplejson/tests/test_pass3.py000066400000000000000000000007271135274433600221070ustar00rootroot00000000000000from unittest import TestCase import simplejson as S # from http://json.org/JSON_checker/test/pass3.json JSON = r''' { "JSON Test Pattern pass3": { "The outermost value": "must be an object or array.", "In this test": "It is an object." } } ''' class TestPass3(TestCase): def test_parse(self): # test in/out equivalence and parsing res = S.loads(JSON) out = S.dumps(res) self.assertEquals(res, S.loads(out)) nodebox-web-1.9.4.6/simplejson/tests/test_recursion.py000066400000000000000000000031661135274433600230670ustar00rootroot00000000000000from unittest import TestCase import simplejson as S class JSONTestObject: pass class RecursiveJSONEncoder(S.JSONEncoder): recurse = False def default(self, o): if o is JSONTestObject: if self.recurse: return [JSONTestObject] else: return 'JSONTestObject' return S.JSONEncoder.default(o) class TestRecursion(TestCase): def test_listrecursion(self): x = [] x.append(x) try: S.dumps(x) except ValueError: pass else: self.fail("didn't raise ValueError on list recursion") x = [] y = [x] x.append(y) try: S.dumps(x) except ValueError: pass else: self.fail("didn't raise ValueError on alternating list recursion") y = [] x = [y, y] # ensure that the marker is cleared S.dumps(x) def test_dictrecursion(self): x = {} x["test"] = x try: S.dumps(x) except ValueError: pass else: self.fail("didn't raise ValueError on dict recursion") x = {} y = {"a": x, "b": x} # ensure that the marker is cleared S.dumps(x) def test_defaultrecursion(self): enc = RecursiveJSONEncoder() self.assertEquals(enc.encode(JSONTestObject), '"JSONTestObject"') enc.recurse = True try: enc.encode(JSONTestObject) except ValueError: pass else: self.fail("didn't raise ValueError on default recursion") nodebox-web-1.9.4.6/simplejson/tests/test_scanstring.py000066400000000000000000000070451135274433600232310ustar00rootroot00000000000000import sys import decimal from unittest import TestCase import simplejson.decoder class TestScanString(TestCase): def test_py_scanstring(self): self._test_scanstring(simplejson.decoder.py_scanstring) def test_c_scanstring(self): self._test_scanstring(simplejson.decoder.c_scanstring) def _test_scanstring(self, scanstring): self.assertEquals( scanstring('"z\\ud834\\udd20x"', 1, None, True), (u'z\U0001d120x', 16)) if 
sys.maxunicode == 65535: self.assertEquals( scanstring(u'"z\U0001d120x"', 1, None, True), (u'z\U0001d120x', 6)) else: self.assertEquals( scanstring(u'"z\U0001d120x"', 1, None, True), (u'z\U0001d120x', 5)) self.assertEquals( scanstring('"\\u007b"', 1, None, True), (u'{', 8)) self.assertEquals( scanstring('"A JSON payload should be an object or array, not a string."', 1, None, True), (u'A JSON payload should be an object or array, not a string.', 60)) self.assertEquals( scanstring('["Unclosed array"', 2, None, True), (u'Unclosed array', 17)) self.assertEquals( scanstring('["extra comma",]', 2, None, True), (u'extra comma', 14)) self.assertEquals( scanstring('["double extra comma",,]', 2, None, True), (u'double extra comma', 21)) self.assertEquals( scanstring('["Comma after the close"],', 2, None, True), (u'Comma after the close', 24)) self.assertEquals( scanstring('["Extra close"]]', 2, None, True), (u'Extra close', 14)) self.assertEquals( scanstring('{"Extra comma": true,}', 2, None, True), (u'Extra comma', 14)) self.assertEquals( scanstring('{"Extra value after close": true} "misplaced quoted value"', 2, None, True), (u'Extra value after close', 26)) self.assertEquals( scanstring('{"Illegal expression": 1 + 2}', 2, None, True), (u'Illegal expression', 21)) self.assertEquals( scanstring('{"Illegal invocation": alert()}', 2, None, True), (u'Illegal invocation', 21)) self.assertEquals( scanstring('{"Numbers cannot have leading zeroes": 013}', 2, None, True), (u'Numbers cannot have leading zeroes', 37)) self.assertEquals( scanstring('{"Numbers cannot be hex": 0x14}', 2, None, True), (u'Numbers cannot be hex', 24)) self.assertEquals( scanstring('[[[[[[[[[[[[[[[[[[[["Too deep"]]]]]]]]]]]]]]]]]]]]', 21, None, True), (u'Too deep', 30)) self.assertEquals( scanstring('{"Missing colon" null}', 2, None, True), (u'Missing colon', 16)) self.assertEquals( scanstring('{"Double colon":: null}', 2, None, True), (u'Double colon', 15)) self.assertEquals( scanstring('{"Comma instead of colon", null}', 2, None, True), (u'Comma instead of colon', 25)) self.assertEquals( scanstring('["Colon instead of comma": false]', 2, None, True), (u'Colon instead of comma', 25)) self.assertEquals( scanstring('["Bad value", truth]', 2, None, True), (u'Bad value', 12)) nodebox-web-1.9.4.6/simplejson/tests/test_separators.py000066400000000000000000000016371135274433600232420ustar00rootroot00000000000000import textwrap from unittest import TestCase import simplejson as S class TestSeparators(TestCase): def test_separators(self): h = [['blorpie'], ['whoops'], [], 'd-shtaeou', 'd-nthiouh', 'i-vhbjkhnth', {'nifty': 87}, {'field': 'yes', 'morefield': False} ] expect = textwrap.dedent("""\ [ [ "blorpie" ] , [ "whoops" ] , [] , "d-shtaeou" , "d-nthiouh" , "i-vhbjkhnth" , { "nifty" : 87 } , { "field" : "yes" , "morefield" : false } ]""") d1 = S.dumps(h) d2 = S.dumps(h, indent=2, sort_keys=True, separators=(' ,', ' : ')) h1 = S.loads(d1) h2 = S.loads(d2) self.assertEquals(h1, h) self.assertEquals(h2, h) self.assertEquals(d2, expect) nodebox-web-1.9.4.6/simplejson/tests/test_unicode.py000066400000000000000000000036141135274433600225020ustar00rootroot00000000000000from unittest import TestCase import simplejson as S class TestUnicode(TestCase): def test_encoding1(self): encoder = S.JSONEncoder(encoding='utf-8') u = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}' s = u.encode('utf-8') ju = encoder.encode(u) js = encoder.encode(s) self.assertEquals(ju, js) def test_encoding2(self): u = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK 
CAPITAL LETTER OMEGA}' s = u.encode('utf-8') ju = S.dumps(u, encoding='utf-8') js = S.dumps(s, encoding='utf-8') self.assertEquals(ju, js) def test_encoding3(self): u = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}' j = S.dumps(u) self.assertEquals(j, '"\\u03b1\\u03a9"') def test_encoding4(self): u = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}' j = S.dumps([u]) self.assertEquals(j, '["\\u03b1\\u03a9"]') def test_encoding5(self): u = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}' j = S.dumps(u, ensure_ascii=False) self.assertEquals(j, u'"%s"' % (u,)) def test_encoding6(self): u = u'\N{GREEK SMALL LETTER ALPHA}\N{GREEK CAPITAL LETTER OMEGA}' j = S.dumps([u], ensure_ascii=False) self.assertEquals(j, u'["%s"]' % (u,)) def test_big_unicode_encode(self): u = u'\U0001d120' self.assertEquals(S.dumps(u), '"\\ud834\\udd20"') self.assertEquals(S.dumps(u, ensure_ascii=False), u'"\U0001d120"') def test_big_unicode_decode(self): u = u'z\U0001d120x' self.assertEquals(S.loads('"' + u + '"'), u) self.assertEquals(S.loads('"z\\ud834\\udd20x"'), u) def test_unicode_decode(self): for i in range(0, 0xd7ff): u = unichr(i) json = '"\\u%04x"' % (i,) self.assertEquals(S.loads(json), u) nodebox-web-1.9.4.6/soap.py000077500000000000000000003674701135274433600154520ustar00rootroot00000000000000#!/usr/bin/python ################################################################################ # # SOAP.py 0.9.7 - Cayce Ullman (cayce@actzero.com) # Brian Matthews (blm@actzero.com) # # INCLUDED: # - General SOAP Parser based on sax.xml (requires Python 2.0) # - General SOAP Builder # - SOAP Proxy for RPC client code # - SOAP Server framework for RPC server code # # FEATURES: # - Handles all of the types in the BDG # - Handles faults # - Allows namespace specification # - Allows SOAPAction specification # - Homogeneous typed arrays # - Supports multiple schemas # - Header support (mustUnderstand and actor) # - XML attribute support # - Multi-referencing support (Parser/Builder) # - Understands SOAP-ENC:root attribute # - Good interop, passes all client tests for Frontier, SOAP::LITE, SOAPRMI # - Encodings # - SSL clients (with OpenSSL configured in to Python) # - SSL servers (with OpenSSL configured in to Python and M2Crypto installed) # # TODO: # - Timeout on method calls - MCU # - Arrays (sparse, multidimensional and partial) - BLM # - Clean up data types - BLM # - Type coercion system (Builder) - MCU # - Early WSDL Support - MCU # - Attachments - BLM # - setup.py - MCU # - mod_python example - MCU # - medusa example - MCU # - Documentation - JAG # - Look at performance # ################################################################################ # # Copyright (c) 2001, Cayce Ullman. # Copyright (c) 2001, Brian Matthews. # # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # Redistributions of source code must retain the above copyright notice, this # list of conditions and the following disclaimer. # # Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # Neither the name of actzero, inc. nor the names of its contributors may # be used to endorse or promote products derived from this software without # specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ################################################################################ # # Additional changes: # 0.9.7.3 - 4/18/2002 - Mark Pilgrim (f8dy@diveintomark.org) # added dump_dict as alias for dump_dictionary for Python 2.2 compatibility # 0.9.7.2 - 4/12/2002 - Mark Pilgrim (f8dy@diveintomark.org) # fixed logic to unmarshal the value of "null" attributes ("true" or "1" # means true, others false) # 0.9.7.1 - 4/11/2002 - Mark Pilgrim (f8dy@diveintomark.org) # added "dump_str" as alias for "dump_string" for Python 2.2 compatibility # Between 2.1 and 2.2, type("").__name__ changed from "string" to "str" ################################################################################ import xml.sax import UserList import base64 import cgi import urllib import exceptions import copy import re import socket import string import sys import time import SocketServer from types import * try: from M2Crypto import SSL except: pass ident = '$Id: SOAP.py,v 1.1.1.1 2004/01/16 16:15:18 bluecoat93 Exp $' __version__ = "0.9.7.3" # Platform hackery # Check float support try: float("NaN") float("INF") float("-INF") good_float = 1 except: good_float = 0 ################################################################################ # Exceptions ################################################################################ class Error(exceptions.Exception): def __init__(self, msg): self.msg = msg def __str__(self): return "" % self.msg __repr__ = __str__ class RecursionError(Error): pass class UnknownTypeError(Error): pass class HTTPError(Error): # indicates an HTTP protocol error def __init__(self, code, msg): self.code = code self.msg = msg def __str__(self): return "" % (self.code, self.msg) __repr__ = __str__ ############################################################################## # Namespace Class ################################################################################ def invertDict(dict): d = {} for k, v in dict.items(): d[v] = k return d class NS: XML = "http://www.w3.org/XML/1998/namespace" ENV = "http://schemas.xmlsoap.org/soap/envelope/" ENC = "http://schemas.xmlsoap.org/soap/encoding/" XSD = "http://www.w3.org/1999/XMLSchema" XSD2 = "http://www.w3.org/2000/10/XMLSchema" XSD3 = "http://www.w3.org/2001/XMLSchema" XSD_L = [XSD, XSD2, XSD3] EXSD_L= [ENC, XSD, XSD2, XSD3] XSI = "http://www.w3.org/1999/XMLSchema-instance" XSI2 = "http://www.w3.org/2000/10/XMLSchema-instance" XSI3 = "http://www.w3.org/2001/XMLSchema-instance" XSI_L = [XSI, XSI2, XSI3] URN = "http://soapinterop.org/xsd" # For generated messages XML_T = "xml" ENV_T = "SOAP-ENV" ENC_T = "SOAP-ENC" XSD_T = "xsd" XSD2_T= "xsd2" XSD3_T= "xsd3" XSI_T = "xsi" XSI2_T= "xsi2" XSI3_T= "xsi3" URN_T = "urn" NSMAP = {ENV_T: ENV, ENC_T: ENC, XSD_T: XSD, XSD2_T: XSD2, XSD3_T: 
XSD3, XSI_T: XSI, XSI2_T: XSI2, XSI3_T: XSI3, URN_T: URN} NSMAP_R = invertDict(NSMAP) STMAP = {'1999': (XSD_T, XSI_T), '2000': (XSD2_T, XSI2_T), '2001': (XSD3_T, XSI3_T)} STMAP_R = invertDict(STMAP) def __init__(self): raise Error, "Don't instantiate this" ################################################################################ # Configuration class ################################################################################ class SOAPConfig: __readonly = ('SSLserver', 'SSLclient') def __init__(self, config = None, **kw): d = self.__dict__ if config: if not isinstance(config, SOAPConfig): raise AttributeError, \ "initializer must be SOAPConfig instance" s = config.__dict__ for k, v in s.items(): if k[0] != '_': d[k] = v else: # Setting debug also sets returnFaultInfo, dumpFaultInfo, # dumpHeadersIn, dumpHeadersOut, dumpSOAPIn, and dumpSOAPOut self.debug = 0 # Setting namespaceStyle sets typesNamespace, typesNamespaceURI, # schemaNamespace, and schemaNamespaceURI self.namespaceStyle = '1999' self.strictNamespaces = 0 self.typed = 1 self.buildWithNamespacePrefix = 1 self.returnAllAttrs = 0 try: SSL; d['SSLserver'] = 1 except: d['SSLserver'] = 0 try: socket.ssl; d['SSLclient'] = 1 except: d['SSLclient'] = 0 for k, v in kw.items(): if k[0] != '_': setattr(self, k, v) def __setattr__(self, name, value): if name in self.__readonly: raise AttributeError, "readonly configuration setting" d = self.__dict__ if name in ('typesNamespace', 'typesNamespaceURI', 'schemaNamespace', 'schemaNamespaceURI'): if name[-3:] == 'URI': base, uri = name[:-3], 1 else: base, uri = name, 0 if type(value) == StringType: if NS.NSMAP.has_key(value): n = (value, NS.NSMAP[value]) elif NS.NSMAP_R.has_key(value): n = (NS.NSMAP_R[value], value) else: raise AttributeError, "unknown namespace" elif type(value) in (ListType, TupleType): if uri: n = (value[1], value[0]) else: n = (value[0], value[1]) else: raise AttributeError, "unknown namespace type" d[base], d[base + 'URI'] = n try: d['namespaceStyle'] = \ NS.STMAP_R[(d['typesNamespace'], d['schemaNamespace'])] except: d['namespaceStyle'] = '' elif name == 'namespaceStyle': value = str(value) if not NS.STMAP.has_key(value): raise AttributeError, "unknown namespace style" d[name] = value n = d['typesNamespace'] = NS.STMAP[value][0] d['typesNamespaceURI'] = NS.NSMAP[n] n = d['schemaNamespace'] = NS.STMAP[value][1] d['schemaNamespaceURI'] = NS.NSMAP[n] elif name == 'debug': d[name] = \ d['returnFaultInfo'] = \ d['dumpFaultInfo'] = \ d['dumpHeadersIn'] = \ d['dumpHeadersOut'] = \ d['dumpSOAPIn'] = \ d['dumpSOAPOut'] = value else: d[name] = value Config = SOAPConfig() ################################################################################ # Types and Wrappers ################################################################################ class anyType: _validURIs = (NS.XSD, NS.XSD2, NS.XSD3, NS.ENC) def __init__(self, data = None, name = None, typed = 1, attrs = None): if self.__class__ == anyType: raise Error, "anyType can't be instantiated directly" if type(name) in (ListType, TupleType): self._ns, self._name = name else: self._ns, self._name = self._validURIs[0], name self._typed = typed self._attrs = {} self._cache = None self._type = self._typeName() self._data = self._checkValueSpace(data) if attrs != None: self._setAttrs(attrs) def __str__(self): if self._name: return "<%s %s at %d>" % (self.__class__, self._name, id(self)) return "<%s at %d>" % (self.__class__, id(self)) __repr__ = __str__ def _checkValueSpace(self, data): return data def 
_marshalData(self): return str(self._data) def _marshalAttrs(self, ns_map, builder): a = '' for attr, value in self._attrs.items(): ns, n = builder.genns(ns_map, attr[0]) a += n + ' %s%s="%s"' % \ (ns, attr[1], cgi.escape(str(value), 1)) return a def _fixAttr(self, attr): if type(attr) in (StringType, UnicodeType): attr = (None, attr) elif type(attr) == ListType: attr = tuple(attr) elif type(attr) != TupleType: raise AttributeError, "invalid attribute type" if len(attr) != 2: raise AttributeError, "invalid attribute length" if type(attr[0]) not in (NoneType, StringType, UnicodeType): raise AttributeError, "invalid attribute namespace URI type" return attr def _getAttr(self, attr): attr = self._fixAttr(attr) try: return self._attrs[attr] except: return None def _setAttr(self, attr, value): attr = self._fixAttr(attr) self._attrs[attr] = str(value) def _setAttrs(self, attrs): if type(attrs) in (ListType, TupleType): for i in range(0, len(attrs), 2): self._setAttr(attrs[i], attrs[i + 1]) return if type(attrs) == DictType: d = attrs elif isinstance(attrs, anyType): d = attrs._attrs else: raise AttributeError, "invalid attribute type" for attr, value in d.items(): self._setAttr(attr, value) def _setMustUnderstand(self, val): self._setAttr((NS.ENV, "mustUnderstand"), val) def _getMustUnderstand(self): return self._getAttr((NS.ENV, "mustUnderstand")) def _setActor(self, val): self._setAttr((NS.ENV, "actor"), val) def _getActor(self): return self._getAttr((NS.ENV, "actor")) def _typeName(self): return self.__class__.__name__[:-4] def _validNamespaceURI(self, URI, strict): if not self._typed: return None if URI in self._validURIs: return URI if not strict: return self._ns raise AttributeError, \ "not a valid namespace for type %s" % self._type class voidType(anyType): pass class stringType(anyType): def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type if type(data) not in (StringType, UnicodeType): raise AttributeError, "invalid %s type" % self._type return data class untypedType(stringType): def __init__(self, data = None, name = None, attrs = None): stringType.__init__(self, data, name, 0, attrs) class IDType(stringType): pass class NCNameType(stringType): pass class NameType(stringType): pass class ENTITYType(stringType): pass class IDREFType(stringType): pass class languageType(stringType): pass class NMTOKENType(stringType): pass class QNameType(stringType): pass class tokenType(anyType): _validURIs = (NS.XSD2, NS.XSD3) __invalidre = '[\n\t]|^ | $| ' def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type if type(data) not in (StringType, UnicodeType): raise AttributeError, "invalid %s type" % self._type if type(self.__invalidre) == StringType: self.__invalidre = re.compile(self.__invalidre) if self.__invalidre.search(data): raise ValueError, "invalid %s value" % self._type return data class normalizedStringType(anyType): _validURIs = (NS.XSD3,) __invalidre = '[\n\r\t]' def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type if type(data) not in (StringType, UnicodeType): raise AttributeError, "invalid %s type" % self._type if type(self.__invalidre) == StringType: self.__invalidre = re.compile(self.__invalidre) if self.__invalidre.search(data): raise ValueError, "invalid %s value" % self._type return data class CDATAType(normalizedStringType): _validURIs = (NS.XSD2,) class booleanType(anyType): def __int__(self): return 
self._data __nonzero__ = __int__ def _marshalData(self): return ['false', 'true'][self._data] def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type if data in (0, '0', 'false', ''): return 0 if data in (1, '1', 'true'): return 1 raise ValueError, "invalid %s value" % self._type class decimalType(anyType): def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type if type(data) not in (IntType, LongType, FloatType): raise Error, "invalid %s value" % self._type return data class floatType(anyType): def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type if type(data) not in (IntType, LongType, FloatType) or \ data < -3.4028234663852886E+38 or \ data > 3.4028234663852886E+38: raise ValueError, "invalid %s value" % self._type return data def _marshalData(self): return "%.18g" % self._data # More precision class doubleType(anyType): def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type if type(data) not in (IntType, LongType, FloatType) or \ data < -1.7976931348623158E+308 or \ data > 1.7976931348623157E+308: raise ValueError, "invalid %s value" % self._type return data def _marshalData(self): return "%.18g" % self._data # More precision class durationType(anyType): _validURIs = (NS.XSD3,) def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type try: # A tuple or a scalar is OK, but make them into a list if type(data) == TupleType: data = list(data) elif type(data) != ListType: data = [data] if len(data) > 6: raise Exception, "too many values" # Now check the types of all the components, and find # the first nonzero element along the way. f = -1 for i in range(len(data)): if data[i] == None: data[i] = 0 continue if type(data[i]) not in \ (IntType, LongType, FloatType): raise Exception, "element %d a bad type" % i if data[i] and f == -1: f = i # If they're all 0, just use zero seconds. if f == -1: self._cache = 'PT0S' return (0,) * 6 # Make sure only the last nonzero element has a decimal fraction # and only the first element is negative. d = -1 for i in range(f, len(data)): if data[i]: if d != -1: raise Exception, \ "all except the last nonzero element must be " \ "integers" if data[i] < 0 and i > f: raise Exception, \ "only the first nonzero element can be negative" elif data[i] != long(data[i]): d = i # Pad the list on the left if necessary. if len(data) < 6: n = 6 - len(data) f += n d += n data = [0] * n + data # Save index of the first nonzero element and the decimal # element for _marshalData. 
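            # For instance, durationType((0, 0, 3, 12, 0, 0)) records the
            # first nonzero index f=2 (days) and no fractional element, so
            # _marshalData below renders the value as 'P3DT12H'.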
self.__firstnonzero = f self.__decimal = d except Exception, e: raise ValueError, "invalid %s value - %s" % (self._type, e) return tuple(data) def _marshalData(self): if self._cache == None: d = self._data t = 0 if d[self.__firstnonzero] < 0: s = '-P' else: s = 'P' t = 0 for i in range(self.__firstnonzero, len(d)): if d[i]: if i > 2 and not t: s += 'T' t = 1 if self.__decimal == i: s += "%g" % abs(d[i]) else: s += "%d" % long(abs(d[i])) s += ['Y', 'M', 'D', 'H', 'M', 'S'][i] self._cache = s return self._cache class timeDurationType(durationType): _validURIs = (NS.XSD, NS.XSD2, NS.ENC) class dateTimeType(anyType): _validURIs = (NS.XSD3,) def _checkValueSpace(self, data): try: if data == None: data = time.time() if (type(data) in (IntType, LongType)): data = list(time.gmtime(data)[:6]) elif (type(data) == FloatType): f = data - int(data) data = list(time.gmtime(int(data))[:6]) data[5] += f elif type(data) in (ListType, TupleType): if len(data) < 6: raise Exception, "not enough values" if len(data) > 9: raise Exception, "too many values" data = list(data[:6]) cleanDate(data) else: raise Exception, "invalid type" except Exception, e: raise ValueError, "invalid %s value - %s" % (self._type, e) return tuple(data) def _marshalData(self): if self._cache == None: d = self._data s = "%04d-%02d-%02dT%02d:%02d:%02d" % ((abs(d[0]),) + d[1:]) if d[0] < 0: s = '-' + s f = d[5] - int(d[5]) if f != 0: s += ("%g" % f)[1:] s += 'Z' self._cache = s return self._cache class recurringInstantType(anyType): _validURIs = (NS.XSD,) def _checkValueSpace(self, data): try: if data == None: data = list(time.gmtime(time.time())[:6]) if (type(data) in (IntType, LongType)): data = list(time.gmtime(data)[:6]) elif (type(data) == FloatType): f = data - int(data) data = list(time.gmtime(int(data))[:6]) data[5] += f elif type(data) in (ListType, TupleType): if len(data) < 1: raise Exception, "not enough values" if len(data) > 9: raise Exception, "too many values" data = list(data[:6]) if len(data) < 6: data += [0] * (6 - len(data)) f = len(data) for i in range(f): if data[i] == None: if f < i: raise Exception, \ "only leftmost elements can be none" else: f = i break cleanDate(data, f) else: raise Exception, "invalid type" except Exception, e: raise ValueError, "invalid %s value - %s" % (self._type, e) return tuple(data) def _marshalData(self): if self._cache == None: d = self._data e = list(d) neg = '' if e[0] < 0: neg = '-' e[0] = abs(e[0]) if not e[0]: e[0] = '--' elif e[0] < 100: e[0] = '-' + "%02d" % e[0] else: e[0] = "%04d" % e[0] for i in range(1, len(e)): if e[i] == None or (i < 3 and e[i] == 0): e[i] = '-' else: if e[i] < 0: neg = '-' e[i] = abs(e[i]) e[i] = "%02d" % e[i] if d[5]: f = abs(d[5] - int(d[5])) if f: e[5] += ("%g" % f)[1:] s = "%s%s-%s-%sT%s:%s:%sZ" % ((neg,) + tuple(e)) self._cache = s return self._cache class timeInstantType(dateTimeType): _validURIs = (NS.XSD, NS.XSD2, NS.ENC) class timePeriodType(dateTimeType): _validURIs = (NS.XSD2, NS.ENC) class timeType(anyType): def _checkValueSpace(self, data): try: if data == None: data = time.gmtime(time.time())[3:6] elif (type(data) == FloatType): f = data - int(data) data = list(time.gmtime(int(data))[3:6]) data[2] += f elif type(data) in (IntType, LongType): data = time.gmtime(data)[3:6] elif type(data) in (ListType, TupleType): if len(data) == 9: data = data[3:6] elif len(data) > 3: raise Exception, "too many values" data = [None, None, None] + list(data) if len(data) < 6: data += [0] * (6 - len(data)) cleanDate(data, 3) data = data[3:] else: raise 
Exception, "invalid type" except Exception, e: raise ValueError, "invalid %s value - %s" % (self._type, e) return tuple(data) def _marshalData(self): if self._cache == None: d = self._data s = '' s = time.strftime("%H:%M:%S", (0, 0, 0) + d + (0, 0, -1)) f = d[2] - int(d[2]) if f != 0: s += ("%g" % f)[1:] s += 'Z' self._cache = s return self._cache class dateType(anyType): def _checkValueSpace(self, data): try: if data == None: data = time.gmtime(time.time())[0:3] elif type(data) in (IntType, LongType, FloatType): data = time.gmtime(data)[0:3] elif type(data) in (ListType, TupleType): if len(data) == 9: data = data[0:3] elif len(data) > 3: raise Exception, "too many values" data = list(data) if len(data) < 3: data += [1, 1, 1][len(data):] data += [0, 0, 0] cleanDate(data) data = data[:3] else: raise Exception, "invalid type" except Exception, e: raise ValueError, "invalid %s value - %s" % (self._type, e) return tuple(data) def _marshalData(self): if self._cache == None: d = self._data s = "%04d-%02d-%02dZ" % ((abs(d[0]),) + d[1:]) if d[0] < 0: s = '-' + s self._cache = s return self._cache class gYearMonthType(anyType): _validURIs = (NS.XSD3,) def _checkValueSpace(self, data): try: if data == None: data = time.gmtime(time.time())[0:2] elif type(data) in (IntType, LongType, FloatType): data = time.gmtime(data)[0:2] elif type(data) in (ListType, TupleType): if len(data) == 9: data = data[0:2] elif len(data) > 2: raise Exception, "too many values" data = list(data) if len(data) < 2: data += [1, 1][len(data):] data += [1, 0, 0, 0] cleanDate(data) data = data[:2] else: raise Exception, "invalid type" except Exception, e: raise ValueError, "invalid %s value - %s" % (self._type, e) return tuple(data) def _marshalData(self): if self._cache == None: d = self._data s = "%04d-%02dZ" % ((abs(d[0]),) + d[1:]) if d[0] < 0: s = '-' + s self._cache = s return self._cache class gYearType(anyType): _validURIs = (NS.XSD3,) def _checkValueSpace(self, data): try: if data == None: data = time.gmtime(time.time())[0:1] elif type(data) in (IntType, LongType, FloatType): data = [data] if type(data) in (ListType, TupleType): if len(data) == 9: data = data[0:1] elif len(data) < 1: raise Exception, "too few values" elif len(data) > 1: raise Exception, "too many values" if type(data[0]) == FloatType: try: s = int(data[0]) except: s = long(data[0]) if s != data[0]: raise Exception, "not integral" data = [s] elif type(data[0]) not in (IntType, LongType): raise Exception, "bad type" else: raise Exception, "invalid type" except Exception, e: raise ValueError, "invalid %s value - %s" % (self._type, e) return data[0] def _marshalData(self): if self._cache == None: d = self._data s = "%04dZ" % abs(d) if d < 0: s = '-' + s self._cache = s return self._cache class centuryType(anyType): _validURIs = (NS.XSD2, NS.ENC) def _checkValueSpace(self, data): try: if data == None: data = time.gmtime(time.time())[0:1] / 100 elif type(data) in (IntType, LongType, FloatType): data = [data] if type(data) in (ListType, TupleType): if len(data) == 9: data = data[0:1] / 100 elif len(data) < 1: raise Exception, "too few values" elif len(data) > 1: raise Exception, "too many values" if type(data[0]) == FloatType: try: s = int(data[0]) except: s = long(data[0]) if s != data[0]: raise Exception, "not integral" data = [s] elif type(data[0]) not in (IntType, LongType): raise Exception, "bad type" else: raise Exception, "invalid type" except Exception, e: raise ValueError, "invalid %s value - %s" % (self._type, e) return data[0] def _marshalData(self): 
if self._cache == None: d = self._data s = "%02dZ" % abs(d) if d < 0: s = '-' + s self._cache = s return self._cache class yearType(gYearType): _validURIs = (NS.XSD2, NS.ENC) class gMonthDayType(anyType): _validURIs = (NS.XSD3,) def _checkValueSpace(self, data): try: if data == None: data = time.gmtime(time.time())[1:3] elif type(data) in (IntType, LongType, FloatType): data = time.gmtime(data)[1:3] elif type(data) in (ListType, TupleType): if len(data) == 9: data = data[0:2] elif len(data) > 2: raise Exception, "too many values" data = list(data) if len(data) < 2: data += [1, 1][len(data):] data = [0] + data + [0, 0, 0] cleanDate(data, 1) data = data[1:3] else: raise Exception, "invalid type" except Exception, e: raise ValueError, "invalid %s value - %s" % (self._type, e) return tuple(data) def _marshalData(self): if self._cache == None: self._cache = "--%02d-%02dZ" % self._data return self._cache class recurringDateType(gMonthDayType): _validURIs = (NS.XSD2, NS.ENC) class gMonthType(anyType): _validURIs = (NS.XSD3,) def _checkValueSpace(self, data): try: if data == None: data = time.gmtime(time.time())[1:2] elif type(data) in (IntType, LongType, FloatType): data = [data] if type(data) in (ListType, TupleType): if len(data) == 9: data = data[1:2] elif len(data) < 1: raise Exception, "too few values" elif len(data) > 1: raise Exception, "too many values" if type(data[0]) == FloatType: try: s = int(data[0]) except: s = long(data[0]) if s != data[0]: raise Exception, "not integral" data = [s] elif type(data[0]) not in (IntType, LongType): raise Exception, "bad type" if data[0] < 1 or data[0] > 12: raise Exception, "bad value" else: raise Exception, "invalid type" except Exception, e: raise ValueError, "invalid %s value - %s" % (self._type, e) return data[0] def _marshalData(self): if self._cache == None: self._cache = "--%02d--Z" % self._data return self._cache class monthType(gMonthType): _validURIs = (NS.XSD2, NS.ENC) class gDayType(anyType): _validURIs = (NS.XSD3,) def _checkValueSpace(self, data): try: if data == None: data = time.gmtime(time.time())[2:3] elif type(data) in (IntType, LongType, FloatType): data = [data] if type(data) in (ListType, TupleType): if len(data) == 9: data = data[2:3] elif len(data) < 1: raise Exception, "too few values" elif len(data) > 1: raise Exception, "too many values" if type(data[0]) == FloatType: try: s = int(data[0]) except: s = long(data[0]) if s != data[0]: raise Exception, "not integral" data = [s] elif type(data[0]) not in (IntType, LongType): raise Exception, "bad type" if data[0] < 1 or data[0] > 31: raise Exception, "bad value" else: raise Exception, "invalid type" except Exception, e: raise ValueError, "invalid %s value - %s" % (self._type, e) return data[0] def _marshalData(self): if self._cache == None: self._cache = "---%02dZ" % self._data return self._cache class recurringDayType(gDayType): _validURIs = (NS.XSD2, NS.ENC) class hexBinaryType(anyType): _validURIs = (NS.XSD3,) def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type if type(data) not in (StringType, UnicodeType): raise AttributeError, "invalid %s type" % self._type return data def _marshalData(self): if self._cache == None: self._cache = encodeHexString(self._data) return self._cache class base64BinaryType(anyType): _validURIs = (NS.XSD3,) def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type if type(data) not in (StringType, UnicodeType): raise AttributeError, 
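        # For instance, gDayType(5) marshals as the recurring-day literal
        # '---05Z' via the format below.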
"invalid %s type" % self._type return data def _marshalData(self): if self._cache == None: self._cache = base64.encodestring(self._data) return self._cache class base64Type(base64BinaryType): _validURIs = (NS.ENC,) class binaryType(anyType): _validURIs = (NS.XSD, NS.ENC) def __init__(self, data, name = None, typed = 1, encoding = 'base64', attrs = None): anyType.__init__(self, data, name, typed, attrs) self._setAttr('encoding', encoding) def _marshalData(self): if self._cache == None: if self._getAttr((None, 'encoding')) == 'base64': self._cache = base64.encodestring(self._data) else: self._cache = encodeHexString(self._data) return self._cache def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type if type(data) not in (StringType, UnicodeType): raise AttributeError, "invalid %s type" % self._type return data def _setAttr(self, attr, value): attr = self._fixAttr(attr) if attr[1] == 'encoding': if attr[0] != None or value not in ('base64', 'hex'): raise AttributeError, "invalid encoding" self._cache = None anyType._setAttr(self, attr, value) class anyURIType(anyType): _validURIs = (NS.XSD3,) def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type if type(data) not in (StringType, UnicodeType): raise AttributeError, "invalid %s type" % self._type return data def _marshalData(self): if self._cache == None: self._cache = urllib.quote(self._data) return self._cache class uriType(anyURIType): _validURIs = (NS.XSD,) class uriReferenceType(anyURIType): _validURIs = (NS.XSD2,) class NOTATIONType(anyType): def __init__(self, data, name = None, typed = 1, attrs = None): if self.__class__ == NOTATIONType: raise Error, "a NOTATION can't be instantiated directly" anyType.__init__(self, data, name, typed, attrs) class ENTITIESType(anyType): def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type if type(data) in (StringType, UnicodeType): return (data,) if type(data) not in (ListType, TupleType) or \ filter (lambda x: type(x) not in (StringType, UnicodeType), data): raise AttributeError, "invalid %s type" % self._type return data def _marshalData(self): return ' '.join(self._data) class IDREFSType(ENTITIESType): pass class NMTOKENSType(ENTITIESType): pass class integerType(anyType): def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type if type(data) not in (IntType, LongType): raise ValueError, "invalid %s value" % self._type return data class nonPositiveIntegerType(anyType): _validURIs = (NS.XSD2, NS.XSD3, NS.ENC) def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type if type(data) not in (IntType, LongType) or data > 0: raise ValueError, "invalid %s value" % self._type return data class non_Positive_IntegerType(nonPositiveIntegerType): _validURIs = (NS.XSD,) def _typeName(self): return 'non-positive-integer' class negativeIntegerType(anyType): _validURIs = (NS.XSD2, NS.XSD3, NS.ENC) def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type if type(data) not in (IntType, LongType) or data >= 0: raise ValueError, "invalid %s value" % self._type return data class negative_IntegerType(negativeIntegerType): _validURIs = (NS.XSD,) def _typeName(self): return 'negative-integer' class longType(anyType): _validURIs = (NS.XSD2, NS.XSD3, NS.ENC) def _checkValueSpace(self, 
data): if data == None: raise ValueError, "must supply initial %s value" % self._type if type(data) not in (IntType, LongType) or \ data < -9223372036854775808L or \ data > 9223372036854775807L: raise ValueError, "invalid %s value" % self._type return data class intType(anyType): _validURIs = (NS.XSD2, NS.XSD3, NS.ENC) def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type if type(data) not in (IntType, LongType) or \ data < -2147483648L or \ data > 2147483647: raise ValueError, "invalid %s value" % self._type return data class shortType(anyType): _validURIs = (NS.XSD2, NS.XSD3, NS.ENC) def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type if type(data) not in (IntType, LongType) or \ data < -32768 or \ data > 32767: raise ValueError, "invalid %s value" % self._type return data class byteType(anyType): _validURIs = (NS.XSD2, NS.XSD3, NS.ENC) def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type if type(data) not in (IntType, LongType) or \ data < -128 or \ data > 127: raise ValueError, "invalid %s value" % self._type return data class nonNegativeIntegerType(anyType): _validURIs = (NS.XSD2, NS.XSD3, NS.ENC) def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type if type(data) not in (IntType, LongType) or data < 0: raise ValueError, "invalid %s value" % self._type return data class non_Negative_IntegerType(nonNegativeIntegerType): _validURIs = (NS.XSD,) def _typeName(self): return 'non-negative-integer' class unsignedLongType(anyType): _validURIs = (NS.XSD2, NS.XSD3, NS.ENC) def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type if type(data) not in (IntType, LongType) or \ data < 0 or \ data > 18446744073709551615L: raise ValueError, "invalid %s value" % self._type return data class unsignedIntType(anyType): _validURIs = (NS.XSD2, NS.XSD3, NS.ENC) def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type if type(data) not in (IntType, LongType) or \ data < 0 or \ data > 4294967295L: raise ValueError, "invalid %s value" % self._type return data class unsignedShortType(anyType): _validURIs = (NS.XSD2, NS.XSD3, NS.ENC) def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type if type(data) not in (IntType, LongType) or \ data < 0 or \ data > 65535: raise ValueError, "invalid %s value" % self._type return data class unsignedByteType(anyType): _validURIs = (NS.XSD2, NS.XSD3, NS.ENC) def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type if type(data) not in (IntType, LongType) or \ data < 0 or \ data > 255: raise ValueError, "invalid %s value" % self._type return data class positiveIntegerType(anyType): _validURIs = (NS.XSD2, NS.XSD3, NS.ENC) def _checkValueSpace(self, data): if data == None: raise ValueError, "must supply initial %s value" % self._type if type(data) not in (IntType, LongType) or data <= 0: raise ValueError, "invalid %s value" % self._type return data class positive_IntegerType(positiveIntegerType): _validURIs = (NS.XSD,) def _typeName(self): return 'positive-integer' # Now compound types class compoundType(anyType): def __init__(self, data = None, name = None, typed = 1, attrs = None): if self.__class__ == compoundType: raise 
Error, "a compound can't be instantiated directly" anyType.__init__(self, data, name, typed, attrs) self._aslist = [] self._asdict = {} self._keyord = [] if type(data) == DictType: self.__dict__.update(data) def __getitem__(self, item): if type(item) == IntType: return self._aslist[item] return getattr(self, item) def __len__(self): return len(self._aslist) def __nonzero__(self): return 1 def _keys(self): return filter(lambda x: x[0] != '_', self.__dict__.keys()) def _addItem(self, name, value, attrs = None): d = self._asdict if d.has_key(name): if type(d[name]) != ListType: d[name] = [d[name]] d[name].append(value) else: d[name] = value self._keyord.append(name) self._aslist.append(value) self.__dict__[name] = d[name] def _placeItem(self, name, value, pos, subpos = 0, attrs = None): d = self._asdict if subpos == 0 and type(d[name]) != ListType: d[name] = value else: d[name][subpos] = value self._keyord[pos] = name self._aslist[pos] = value self.__dict__[name] = d[name] def _getItemAsList(self, name, default = []): try: d = self.__dict__[name] except: return default if type(d) == ListType: return d return [d] class structType(compoundType): pass class headerType(structType): _validURIs = (NS.ENV,) def __init__(self, data = None, typed = 1, attrs = None): structType.__init__(self, data, "Header", typed, attrs) class bodyType(structType): _validURIs = (NS.ENV,) def __init__(self, data = None, typed = 1, attrs = None): structType.__init__(self, data, "Body", typed, attrs) class arrayType(UserList.UserList, compoundType): def __init__(self, data = None, name = None, attrs = None, offset = 0, rank = None, asize = 0, elemsname = None): if data: if type(data) not in (ListType, TupleType): raise Error, "Data must be a sequence" UserList.UserList.__init__(self, data) compoundType.__init__(self, data, name, 0, attrs) self._elemsname = elemsname or "item" if data == None: self._rank = rank # According to 5.4.2.2 in the SOAP spec, each element in a # sparse array must have a position. _posstate keeps track of # whether we've seen a position or not. It's possible values # are: # -1 No elements have been added, so the state is indeterminate # 0 An element without a position has been added, so no # elements can have positions # 1 An element with a position has been added, so all elements # must have positions self._posstate = -1 self._full = 0 if asize in ('', None): asize = '0' self._dims = map (lambda x: int(x), str(asize).split(',')) self._dims.reverse() # It's easier to work with this way self._poss = [0] * len(self._dims) # This will end up # reversed too for i in range(len(self._dims)): if self._dims[i] < 0 or \ self._dims[i] == 0 and len(self._dims) > 1: raise TypeError, "invalid Array dimensions" if offset > 0: self._poss[i] = offset % self._dims[i] offset = int(offset / self._dims[i]) # Don't break out of the loop if offset is 0 so we test all the # dimensions for > 0. 
if offset: raise AttributeError, "invalid Array offset" a = [None] * self._dims[0] for i in range(1, len(self._dims)): b = [] for j in range(self._dims[i]): b.append(copy.deepcopy(a)) a = b self.data = a def _addItem(self, name, value, attrs): if self._full: raise ValueError, "Array is full" pos = attrs.get((NS.ENC, 'position')) if pos != None: if self._posstate == 0: raise AttributeError, \ "all elements in a sparse Array must have a " \ "position attribute" self._posstate = 1 try: if pos[0] == '[' and pos[-1] == ']': pos = map (lambda x: int(x), pos[1:-1].split(',')) pos.reverse() if len(pos) == 1: pos = pos[0] curpos = [0] * len(self._dims) for i in range(len(self._dims)): curpos[i] = pos % self._dims[i] pos = int(pos / self._dims[i]) if pos == 0: break if pos: raise Exception elif len(pos) != len(self._dims): raise Exception else: for i in range(len(self._dims)): if pos[i] >= self._dims[i]: raise Exception curpos = pos else: raise Exception except: raise AttributeError, \ "invalid Array element position %s" % str(pos) else: if self._posstate == 1: raise AttributeError, \ "only elements in a sparse Array may have a " \ "position attribute" self._posstate = 0 curpos = self._poss a = self.data for i in range(len(self._dims) - 1, 0, -1): a = a[curpos[i]] if curpos[0] >= len(a): a += [None] * (len(a) - curpos[0] + 1) a[curpos[0]] = value if pos == None: self._poss[0] += 1 for i in range(len(self._dims) - 1): if self._poss[i] < self._dims[i]: break self._poss[i] = 0 self._poss[i + 1] += 1 if self._dims[-1] and self._poss[-1] >= self._dims[-1]: self._full = 1 def _placeItem(self, name, value, pos, subpos, attrs = None): curpos = [0] * len(self._dims) for i in range(len(self._dims)): if self._dims[i] == 0: curpos[0] = pos break curpos[i] = pos % self._dims[i] pos = int(pos / self._dims[i]) if pos == 0: break if self._dims[i] != 0 and pos: raise Error, "array index out of range" a = self.data for i in range(len(self._dims) - 1, 0, -1): a = a[curpos[i]] if curpos[0] >= len(a): a += [None] * (len(a) - curpos[0] + 1) a[curpos[0]] = value class typedArrayType(arrayType): def __init__(self, data = None, name = None, typed = None, attrs = None, offset = 0, rank = None, asize = 0, elemsname = None): arrayType.__init__(self, data, name, attrs, offset, rank, asize, elemsname) self._type = typed class faultType(structType, Error): def __init__(self, faultcode = "", faultstring = "", detail = None): self.faultcode = faultcode self.faultstring = faultstring if detail != None: self.detail = detail structType.__init__(self, None, 0) def _setDetail(self, detail = None): if detail != None: self.detail = detail else: try: del self.detail except AttributeError: pass def __repr__(self): return "" % (self.faultcode, self.faultstring) __str__ = __repr__ ################################################################################ class RefHolder: def __init__(self, name, frame): self.name = name self.parent = frame self.pos = len(frame) self.subpos = frame.namecounts.get(name, 0) def __repr__(self): return "<%s %s at %d>" % (self.__class__, self.name, id(self)) ################################################################################ # Utility infielders ################################################################################ def collapseWhiteSpace(s): return re.sub('\s+', ' ', s).strip() def decodeHexString(data): conv = {'0': 0x0, '1': 0x1, '2': 0x2, '3': 0x3, '4': 0x4, '5': 0x5, '6': 0x6, '7': 0x7, '8': 0x8, '9': 0x9, 'a': 0xa, 'b': 0xb, 'c': 0xc, 'd': 0xd, 'e': 0xe, 'f': 0xf, 'A': 0xa, 'B': 
0xb, 'C': 0xc, 'D': 0xd, 'E': 0xe, 'F': 0xf,} ws = string.whitespace bin = '' i = 0 while i < len(data): if data[i] not in ws: break i += 1 low = 0 while i < len(data): c = data[i] if c in string.whitespace: break try: c = conv[c] except KeyError: raise ValueError, \ "invalid hex string character `%s'" % c if low: bin += chr(high * 16 + c) low = 0 else: high = c low = 1 i += 1 if low: raise ValueError, "invalid hex string length" while i < len(data): if data[i] not in string.whitespace: raise ValueError, \ "invalid hex string character `%s'" % c i += 1 return bin def encodeHexString(data): h = '' for i in data: h += "%02X" % ord(i) return h def leapMonth(year, month): return month == 2 and \ year % 4 == 0 and \ (year % 100 != 0 or year % 400 == 0) def cleanDate(d, first = 0): ranges = (None, (1, 12), (1, 31), (0, 23), (0, 59), (0, 61)) months = (0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31) names = ('year', 'month', 'day', 'hours', 'minutes', 'seconds') if len(d) != 6: raise ValueError, "date must have 6 elements" for i in range(first, 6): s = d[i] if type(s) == FloatType: if i < 5: try: s = int(s) except OverflowError: if i > 0: raise s = long(s) if s != d[i]: raise ValueError, "%s must be integral" % names[i] d[i] = s elif type(s) == LongType: try: s = int(s) except: pass elif type(s) != IntType: raise TypeError, "%s isn't a valid type" % names[i] if i == first and s < 0: continue if ranges[i] != None and \ (s < ranges[i][0] or ranges[i][1] < s): raise ValueError, "%s out of range" % names[i] if first < 6 and d[5] >= 61: raise ValueError, "seconds out of range" if first < 2: leap = first < 1 and leapMonth(d[0], d[1]) if d[2] > months[d[1]] + leap: raise ValueError, "day out of range" class UnderflowError(exceptions.ArithmeticError): pass def debugHeader(title): s = '*** ' + title + ' ' print s + ('*' * (72 - len(s))) def debugFooter(title): print '*' * 72 sys.stdout.flush() ################################################################################ # SOAP Parser ################################################################################ class SOAPParser(xml.sax.handler.ContentHandler): class Frame: def __init__(self, name, kind = None, attrs = {}, rules = {}): self.name = name self.kind = kind self.attrs = attrs self.rules = rules self.contents = [] self.names = [] self.namecounts = {} self.subattrs = [] def append(self, name, data, attrs): self.names.append(name) self.contents.append(data) self.subattrs.append(attrs) if self.namecounts.has_key(name): self.namecounts[name] += 1 else: self.namecounts[name] = 1 def _placeItem(self, name, value, pos, subpos = 0, attrs = None): self.contents[pos] = value if attrs: self.attrs.update(attrs) def __len__(self): return len(self.contents) def __repr__(self): return "<%s %s at %d>" % (self.__class__, self.name, id(self)) def __init__(self, rules = None): xml.sax.handler.ContentHandler.__init__(self) self.body = None self.header = None self.attrs = {} self._data = None self._next = "E" # Keeping state for message validity self._stack = [self.Frame('SOAP')] # Make two dictionaries to store the prefix <-> URI mappings, and # initialize them with the default self._prem = {NS.XML_T: NS.XML} self._prem_r = {NS.XML: NS.XML_T} self._ids = {} self._refs = {} self._rules = rules def startElementNS(self, name, qname, attrs): # Workaround two sax bugs if name[0] == None and name[1][0] == ' ': name = (None, name[1][1:]) else: name = tuple(name) # First some checking of the layout of the message if self._next == "E": if name[1] != 'Envelope': 
raise Error, "expected `SOAP-ENV:Envelope', got `%s:%s'" % \ (self._prem_r[name[0]], name[1]) if name[0] != NS.ENV: raise faultType, ("%s:VersionMismatch" % NS.ENV_T, "Don't understand version `%s' Envelope" % name[0]) else: self._next = "HorB" elif self._next == "HorB": if name[0] == NS.ENV and name[1] in ("Header", "Body"): self._next = None else: raise Error, \ "expected `SOAP-ENV:Header' or `SOAP-ENV:Body', " \ "got `%s'" % self._prem_r[name[0]] + ':' + name[1] elif self._next == "B": if name == (NS.ENV, "Body"): self._next = None else: raise Error, "expected `SOAP-ENV:Body', got `%s'" % \ self._prem_r[name[0]] + ':' + name[1] elif self._next == "": raise Error, "expected nothing, got `%s'" % \ self._prem_r[name[0]] + ':' + name[1] if len(self._stack) == 2: rules = self._rules else: try: rules = self._stack[-1].rules[name[1]] except: rules = None if type(rules) not in (NoneType, DictType): kind = rules else: kind = attrs.get((NS.ENC, 'arrayType')) if kind != None: del attrs._attrs[(NS.ENC, 'arrayType')] i = kind.find(':') if i >= 0: kind = (self._prem[kind[:i]], kind[i + 1:]) else: kind = None self.pushFrame(self.Frame(name[1], kind, attrs._attrs, rules)) self._data = '' # Start accumulating def pushFrame(self, frame): self._stack.append(frame) def popFrame(self): return self._stack.pop() def endElementNS(self, name, qname): # Workaround two sax bugs if name[0] == None and name[1][0] == ' ': ns, name = None, name[1][1:] else: ns, name = tuple(name) if self._next == "E": raise Error, "didn't get SOAP-ENV:Envelope" if self._next in ("HorB", "B"): raise Error, "didn't get SOAP-ENV:Body" cur = self.popFrame() attrs = cur.attrs idval = None if attrs.has_key((None, 'id')): idval = attrs[(None, 'id')] if self._ids.has_key(idval): raise Error, "duplicate id `%s'" % idval del attrs[(None, 'id')] root = 1 if len(self._stack) == 3: if attrs.has_key((NS.ENC, 'root')): root = int(attrs[(NS.ENC, 'root')]) # Do some preliminary checks. First, if root="0" is present, # the element must have an id. Next, if root="n" is present, # n something other than 0 or 1, raise an exception. if root == 0: if idval == None: raise Error, "non-root element must have an id" elif root != 1: raise Error, "SOAP-ENC:root must be `0' or `1'" del attrs[(NS.ENC, 'root')] while 1: href = attrs.get((None, 'href')) if href: if href[0] != '#': raise Error, "only do local hrefs right now" if self._data != None and self._data.strip() != '': raise Error, "hrefs can't have data" href = href[1:] if self._ids.has_key(href): data = self._ids[href] else: data = RefHolder(name, self._stack[-1]) if self._refs.has_key(href): self._refs[href].append(data) else: self._refs[href] = [data] del attrs[(None, 'href')] break kind = None if attrs: for i in NS.XSI_L: if attrs.has_key((i, 'type')): kind = attrs[(i, 'type')] del attrs[(i, 'type')] if kind != None: i = kind.find(':') if i >= 0: kind = (self._prem[kind[:i]], kind[i + 1:]) else: # XXX What to do here? 
(None, kind) is just going to fail in convertType kind = (None, kind) null = 0 if attrs: for i in (NS.XSI, NS.XSI2): if attrs.has_key((i, 'null')): null = attrs[(i, 'null')] del attrs[(i, 'null')] if attrs.has_key((NS.XSI3, 'nil')): null = attrs[(NS.XSI3, 'nil')] del attrs[(NS.XSI3, 'nil')] #MAP 4/12/2002 - must also support "true" #null = int(null) null = (str(null).lower() in ['true', '1']) if null: if len(cur) or \ (self._data != None and self._data.strip() != ''): raise Error, "nils can't have data" data = None break if len(self._stack) == 2: if (ns, name) == (NS.ENV, "Header"): self.header = data = headerType(attrs = attrs) self._next = "B" break elif (ns, name) == (NS.ENV, "Body"): self.body = data = bodyType(attrs = attrs) self._next = "" break elif len(self._stack) == 3 and self._next == None: if (ns, name) == (NS.ENV, "Fault"): data = faultType() self._next = "" break if cur.rules != None: rule = cur.rules if type(rule) in (StringType, UnicodeType): # XXX Need a namespace here rule = (None, rule) elif type(rule) == ListType: rule = tuple(rule) # XXX What if rule != kind? if callable(rule): data = rule(self._data) elif type(rule) == DictType: data = structType(name = (ns, name), attrs = attrs) else: data = self.convertType(self._data, rule, attrs) break if (kind == None and cur.kind != None) or \ (kind == (NS.ENC, 'Array')): kind = cur.kind if kind == None: kind = 'ur-type[%d]' % len(cur) else: kind = kind[1] if len(cur.namecounts) == 1: elemsname = cur.names[0] else: elemsname = None data = self.startArray((ns, name), kind, attrs, elemsname) break if len(self._stack) == 3 and kind == None and \ len(cur) == 0 and \ (self._data == None or self._data.strip() == ''): data = structType(name = (ns, name), attrs = attrs) break if len(cur) == 0 and ns != NS.URN: # Nothing's been added to the current frame so it must be a # simple type. if kind == None: # If the current item's container is an array, it will # have a kind. If so, get the bit before the first [, # which is the type of the array, therefore the type of # the current item. kind = self._stack[-1].kind if kind != None: i = kind[1].find('[') if i >= 0: kind = (kind[0], kind[1][:i]) elif ns != None: kind = (ns, name) if kind != None: try: data = self.convertType(self._data, kind, attrs) except UnknownTypeError: data = None else: data = None if data == None: data = self._data or '' if len(attrs) == 0: try: data = str(data) except: pass break data = structType(name = (ns, name), attrs = attrs) break if isinstance(data, compoundType): for i in range(len(cur)): v = cur.contents[i] data._addItem(cur.names[i], v, cur.subattrs[i]) if isinstance(v, RefHolder): v.parent = data if root: self._stack[-1].append(name, data, attrs) if idval != None: self._ids[idval] = data if self._refs.has_key(idval): for i in self._refs[idval]: i.parent._placeItem(i.name, data, i.pos, i.subpos, attrs) del self._refs[idval] self.attrs[id(data)] = attrs if isinstance(data, anyType): data._setAttrs(attrs) self._data = None # Stop accumulating def endDocument(self): if len(self._refs) == 1: raise Error, \ "unresolved reference " + self._refs.keys()[0] elif len(self._refs) > 1: raise Error, \ "unresolved references " + ', '.join(self._refs.keys()) def startPrefixMapping(self, prefix, uri): self._prem[prefix] = uri self._prem_r[uri] = prefix def endPrefixMapping(self, prefix): try: del self._prem_r[self._prem[prefix]] del self._prem[prefix] except: pass def characters(self, c): if self._data != None: self._data += c arrayre = '^(?:(?P[^:]*):)?' 
\ '(?P[^[]+)' \ '(?:\[(?P,*)\])?' \ '(?:\[(?P\d+(?:,\d+)*)?\])$' def startArray(self, name, kind, attrs, elemsname): if type(self.arrayre) == StringType: self.arrayre = re.compile (self.arrayre) offset = attrs.get((NS.ENC, "offset")) if offset != None: del attrs[(NS.ENC, "offset")] try: if offset[0] == '[' and offset[-1] == ']': offset = int(offset[1:-1]) if offset < 0: raise Exception else: raise Exception except: raise AttributeError, "invalid Array offset" else: offset = 0 try: m = self.arrayre.search(kind) if m == None: raise Exception t = m.group('type') if t == 'ur-type': return arrayType(None, name, attrs, offset, m.group('rank'), m.group('asize'), elemsname) elif m.group('ns') != None: return typedArrayType(None, name, (self._prem[m.group('ns')], t), attrs, offset, m.group('rank'), m.group('asize'), elemsname) else: return typedArrayType(None, name, (None, t), attrs, offset, m.group('rank'), m.group('asize'), elemsname) except: raise AttributeError, "invalid Array type `%s'" % kind # Conversion class DATETIMECONSTS: SIGNre = '(?P-?)' CENTURYre = '(?P\d{2,})' YEARre = '(?P\d{2})' MONTHre = '(?P\d{2})' DAYre = '(?P\d{2})' HOURre = '(?P\d{2})' MINUTEre = '(?P\d{2})' SECONDre = '(?P\d{2}(?:\.\d*)?)' TIMEZONEre = '(?PZ)|(?P[-+])(?P\d{2}):' \ '(?P\d{2})' BOSre = '^\s*' EOSre = '\s*$' __allres = {'sign': SIGNre, 'century': CENTURYre, 'year': YEARre, 'month': MONTHre, 'day': DAYre, 'hour': HOURre, 'minute': MINUTEre, 'second': SECONDre, 'timezone': TIMEZONEre, 'b': BOSre, 'e': EOSre} dateTime = '%(b)s%(sign)s%(century)s%(year)s-%(month)s-%(day)sT' \ '%(hour)s:%(minute)s:%(second)s(%(timezone)s)?%(e)s' % __allres timeInstant = dateTime timePeriod = dateTime time = '%(b)s%(hour)s:%(minute)s:%(second)s(%(timezone)s)?%(e)s' % \ __allres date = '%(b)s%(sign)s%(century)s%(year)s-%(month)s-%(day)s' \ '(%(timezone)s)?%(e)s' % __allres century = '%(b)s%(sign)s%(century)s(%(timezone)s)?%(e)s' % __allres gYearMonth = '%(b)s%(sign)s%(century)s%(year)s-%(month)s' \ '(%(timezone)s)?%(e)s' % __allres gYear = '%(b)s%(sign)s%(century)s%(year)s(%(timezone)s)?%(e)s' % \ __allres year = gYear gMonthDay = '%(b)s--%(month)s-%(day)s(%(timezone)s)?%(e)s' % __allres recurringDate = gMonthDay gDay = '%(b)s---%(day)s(%(timezone)s)?%(e)s' % __allres recurringDay = gDay gMonth = '%(b)s--%(month)s--(%(timezone)s)?%(e)s' % __allres month = gMonth recurringInstant = '%(b)s%(sign)s(%(century)s|-)(%(year)s|-)-' \ '(%(month)s|-)-(%(day)s|-)T' \ '(%(hour)s|-):(%(minute)s|-):(%(second)s|-)' \ '(%(timezone)s)?%(e)s' % __allres duration = '%(b)s%(sign)sP' \ '((?P\d+)Y)?' \ '((?P\d+)M)?' \ '((?P\d+)D)?' \ '((?PT)' \ '((?P\d+)H)?' \ '((?P\d+)M)?' 
\ '((?P\d*(?:\.\d*)?)S)?)?%(e)s' % \ __allres timeDuration = duration # The extra 31 on the front is: # - so the tuple is 1-based # - so months[month-1] is December's days if month is 1 months = (31, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31) def convertDateTime(self, value, kind): def getZoneOffset(d): zoffs = 0 try: if d['zulu'] == None: zoffs = 60 * int(d['tzhour']) + int(d['tzminute']) if d['tzsign'] != '-': zoffs = -zoffs except TypeError: pass return zoffs def applyZoneOffset(months, zoffs, date, minfield, posday = 1): if zoffs == 0 and (minfield > 4 or 0 <= date[5] < 60): return date if minfield > 5: date[5] = 0 if minfield > 4: date[4] = 0 if date[5] < 0: date[4] += int(date[5]) / 60 date[5] %= 60 date[4] += zoffs if minfield > 3 or 0 <= date[4] < 60: return date date[3] += date[4] / 60 date[4] %= 60 if minfield > 2 or 0 <= date[3] < 24: return date date[2] += date[3] / 24 date[3] %= 24 if minfield > 1: if posday and date[2] <= 0: date[2] += 31 # zoffs is at most 99:59, so the # day will never be less than -3 return date while 1: # The date[1] == 3 (instead of == 2) is because we're # going back a month, so we need to know if the previous # month is February, so we test if this month is March. leap = minfield == 0 and date[1] == 3 and \ date[0] % 4 == 0 and \ (date[0] % 100 != 0 or date[0] % 400 == 0) if 0 < date[2] <= months[date[1]] + leap: break date[2] += months[date[1] - 1] + leap date[1] -= 1 if date[1] > 0: break date[1] = 12 if minfield > 0: break date[0] -= 1 return date try: exp = getattr(self.DATETIMECONSTS, kind) except AttributeError: return None if type(exp) == StringType: exp = re.compile(exp) setattr (self.DATETIMECONSTS, kind, exp) m = exp.search(value) try: if m == None: raise Exception d = m.groupdict() f = ('century', 'year', 'month', 'day', 'hour', 'minute', 'second') fn = len(f) # Index of first non-None value r = [] if kind in ('duration', 'timeDuration'): if d['sep'] != None and d['hour'] == None and \ d['minute'] == None and d['second'] == None: raise Exception f = f[1:] for i in range(len(f)): s = d[f[i]] if s != None: if f[i] == 'second': s = float(s) else: try: s = int(s) except ValueError: s = long(s) if i < fn: fn = i r.append(s) if fn > len(r): # Any non-Nones? raise Exception if d['sign'] == '-': r[fn] = -r[fn] return tuple(r) if kind == 'recurringInstant': for i in range(len(f)): s = d[f[i]] if s == None or s == '-': if i > fn: raise Exception s = None else: if i < fn: fn = i if f[i] == 'second': s = float(s) else: try: s = int(s) except ValueError: s = long(s) r.append(s) s = r.pop(0) if fn == 0: r[0] += s * 100 else: fn -= 1 if fn < len(r) and d['sign'] == '-': r[fn] = -r[fn] cleanDate(r, fn) return tuple(applyZoneOffset(self.DATETIMECONSTS.months, getZoneOffset(d), r, fn, 0)) r = [0, 0, 1, 1, 0, 0, 0] for i in range(len(f)): field = f[i] s = d.get(field) if s != None: if field == 'second': s = float(s) else: try: s = int(s) except ValueError: s = long(s) if i < fn: fn = i r[i] = s if fn > len(r): # Any non-Nones? 
raise Exception s = r.pop(0) if fn == 0: r[0] += s * 100 else: fn -= 1 if d.get('sign') == '-': r[fn] = -r[fn] cleanDate(r, fn) zoffs = getZoneOffset(d) if zoffs: r = applyZoneOffset(self.DATETIMECONSTS.months, zoffs, r, fn) if kind == 'century': return r[0] / 100 s = [] for i in range(1, len(f)): if d.has_key(f[i]): s.append(r[i - 1]) if len(s) == 1: return s[0] return tuple(s) except Exception, e: raise Error, "invalid %s value `%s' - %s" % (kind, value, e) intlimits = \ { 'nonPositiveInteger': (0, None, 0), 'non-positive-integer': (0, None, 0), 'negativeInteger': (0, None, -1), 'negative-integer': (0, None, -1), 'long': (1, -9223372036854775808L, 9223372036854775807L), 'int': (0, -2147483648L, 2147483647), 'short': (0, -32768, 32767), 'byte': (0, -128, 127), 'nonNegativeInteger': (0, 0, None), 'non-negative-integer': (0, 0, None), 'positiveInteger': (0, 1, None), 'positive-integer': (0, 1, None), 'unsignedLong': (1, 0, 18446744073709551615L), 'unsignedInt': (0, 0, 4294967295L), 'unsignedShort': (0, 0, 65535), 'unsignedByte': (0, 0, 255), } floatlimits = \ { 'float': (7.0064923216240861E-46, -3.4028234663852886E+38, 3.4028234663852886E+38), 'double': (2.4703282292062327E-324, -1.7976931348623158E+308, 1.7976931348623157E+308), } zerofloatre = '[1-9]' def convertType(self, d, t, attrs): dnn = d or '' if t[0] in NS.EXSD_L: if t[1] == "integer": try: d = int(d) if len(attrs): d = long(d) except: d = long(d) return d if self.intlimits.has_key (t[1]): l = self.intlimits[t[1]] try: d = int(d) except: d = long(d) if l[1] != None and d < l[1]: raise UnderflowError, "%s too small" % d if l[2] != None and d > l[2]: raise OverflowError, "%s too large" % d if l[0] or len(attrs): return long(d) return d if t[1] == "string": if len(attrs): return unicode(dnn) try: return str(dnn) except: return dnn if t[1] == "boolean": d = d.strip().lower() if d in ('0', 'false'): return 0 if d in ('1', 'true'): return 1 raise AttributeError, "invalid boolean value" if self.floatlimits.has_key (t[1]): l = self.floatlimits[t[1]] s = d.strip().lower() try: d = float(s) except: # Some platforms don't implement the float stuff. This # is close, but NaN won't be > "INF" as required by the # standard. 
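            # (A hedged note: 1e300**2 below is a trick to manufacture an
            # IEEE infinity on interpreters without float("inf"): squaring
            # 1e300 overflows the double range to +inf, and -1e300**2
            # likewise gives -inf.)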
if s in ("nan", "inf"): return 1e300**2 if s == "-inf": return -1e300**2 raise if str (d) == 'nan': if s != 'nan': raise ValueError, "invalid %s" % t[1] elif str (d) == '-inf': if s != '-inf': raise UnderflowError, "%s too small" % t[1] elif str (d) == 'inf': if s != 'inf': raise OverflowError, "%s too large" % t[1] elif d < 0: if d < l[1]: raise UnderflowError, "%s too small" % t[1] elif d > 0: if d < l[0] or d > l[2]: raise OverflowError, "%s too large" % t[1] elif d == 0: if type(self.zerofloatre) == StringType: self.zerofloatre = re.compile(self.zerofloatre) if self.zerofloatre.search(s): raise UnderflowError, "invalid %s" % t[1] return d if t[1] in ("dateTime", "date", "timeInstant", "time"): return self.convertDateTime(d, t[1]) if t[1] == "decimal": return float(d) if t[1] in ("language", "QName", "NOTATION", "NMTOKEN", "Name", "NCName", "ID", "IDREF", "ENTITY"): return collapseWhiteSpace(d) if t[1] in ("IDREFS", "ENTITIES", "NMTOKENS"): d = collapseWhiteSpace(d) return d.split() if t[0] in NS.XSD_L: if t[1] in ("base64", "base64Binary"): return base64.decodestring(d) if t[1] == "hexBinary": return decodeHexString(d) if t[1] == "anyURI": return urllib.unquote(collapseWhiteSpace(d)) if t[1] in ("normalizedString", "token"): return collapseWhiteSpace(d) if t[0] == NS.ENC: if t[1] == "base64": return base64.decodestring(d) if t[0] == NS.XSD: if t[1] == "binary": try: e = attrs[(None, 'encoding')] if e == 'hex': return decodeHexString(d) elif e == 'base64': return base64.decodestring(d) except: pass raise Error, "unknown or missing binary encoding" if t[1] == "uri": return urllib.unquote(collapseWhiteSpace(d)) if t[1] == "recurringInstant": return self.convertDateTime(d, t[1]) if t[0] in (NS.XSD2, NS.ENC): if t[1] == "uriReference": return urllib.unquote(collapseWhiteSpace(d)) if t[1] == "timePeriod": return self.convertDateTime(d, t[1]) if t[1] in ("century", "year"): return self.convertDateTime(d, t[1]) if t[0] in (NS.XSD, NS.XSD2, NS.ENC): if t[1] == "timeDuration": return self.convertDateTime(d, t[1]) if t[0] == NS.XSD3: if t[1] == "anyURI": return urllib.unquote(collapseWhiteSpace(d)) if t[1] in ("gYearMonth", "gMonthDay"): return self.convertDateTime(d, t[1]) if t[1] == "gYear": return self.convertDateTime(d, t[1]) if t[1] == "gMonth": return self.convertDateTime(d, t[1]) if t[1] == "gDay": return self.convertDateTime(d, t[1]) if t[1] == "duration": return self.convertDateTime(d, t[1]) if t[0] in (NS.XSD2, NS.XSD3): if t[1] == "token": return collapseWhiteSpace(d) if t[1] == "recurringDate": return self.convertDateTime(d, t[1]) if t[1] == "month": return self.convertDateTime(d, t[1]) if t[1] == "recurringDay": return self.convertDateTime(d, t[1]) if t[0] == NS.XSD2: if t[1] == "CDATA": return collapseWhiteSpace(d) raise UnknownTypeError, "unknown type `%s'" % (t[0] + ':' + t[1]) ################################################################################ # call to SOAPParser that keeps all of the info ################################################################################ def _parseSOAP(xml_str, rules = None): try: from cStringIO import StringIO except ImportError: from StringIO import StringIO parser = xml.sax.make_parser() t = SOAPParser(rules = rules) parser.setContentHandler(t) e = xml.sax.handler.ErrorHandler() parser.setErrorHandler(e) inpsrc = xml.sax.xmlreader.InputSource() inpsrc.setByteStream(StringIO(xml_str)) # turn on namespace mangeling parser.setFeature(xml.sax.handler.feature_namespaces,1) parser.parse(inpsrc) return t 
################################################################################ # SOAPParser's more public interface ################################################################################ def parseSOAP(xml_str, attrs = 0): t = _parseSOAP(xml_str) if attrs: return t.body, t.attrs return t.body def parseSOAPRPC(xml_str, header = 0, body = 0, attrs = 0, rules = None): t = _parseSOAP(xml_str, rules = rules) p = t.body._aslist[0] # Empty string, for RPC this translates into a void if type(p) in (type(''), type(u'')) and p in ('', u''): name = "Response" for k in t.body.__dict__.keys(): if k[0] != "_": name = k p = structType(name) if header or body or attrs: ret = (p,) if header : ret += (t.header,) if body: ret += (t.body,) if attrs: ret += (t.attrs,) return ret else: return p ################################################################################ # SOAP Builder ################################################################################ class SOAPBuilder: _xml_top = '\n' _xml_enc_top = '\n' _env_top = '%(ENV_T)s:Envelope %(ENV_T)s:encodingStyle="%(ENC)s"' % \ NS.__dict__ _env_bot = '\n' % NS.__dict__ # Namespaces potentially defined in the Envelope tag. _env_ns = {NS.ENC: NS.ENC_T, NS.ENV: NS.ENV_T, NS.XSD: NS.XSD_T, NS.XSD2: NS.XSD2_T, NS.XSD3: NS.XSD3_T, NS.XSI: NS.XSI_T, NS.XSI2: NS.XSI2_T, NS.XSI3: NS.XSI3_T} def __init__(self, args = (), kw = {}, method = None, namespace = None, header = None, methodattrs = None, envelope = 1, encoding = 'UTF-8', use_refs = 0, config = Config): # Test the encoding, raising an exception if it's not known if encoding != None: ''.encode(encoding) self.args = args self.kw = kw self.envelope = envelope self.encoding = encoding self.method = method self.namespace = namespace self.header = header self.methodattrs= methodattrs self.use_refs = use_refs self.config = config self.out = '' self.tcounter = 0 self.ncounter = 1 self.icounter = 1 self.envns = {} self.ids = {} self.depth = 0 self.multirefs = [] self.multis = 0 self.body = not isinstance(args, bodyType) def build(self): ns_map = {} # Cache whether typing is on or not typed = self.config.typed if self.header: # Create a header. self.dump(self.header, "Header", typed = typed) self.header = None # Wipe it out so no one is using it. if self.body: # Call genns to record that we've used SOAP-ENV. 
self.depth += 1 body_ns = self.genns(ns_map, NS.ENV)[0] self.out += "<%sBody>\n" % body_ns if self.method: self.depth += 1 a = '' if self.methodattrs: for (k, v) in self.methodattrs.items(): a += ' %s="%s"' % (k, v) if self.namespace: # Use the namespace info handed to us methodns, n = self.genns(ns_map, self.namespace) else: methodns, n = '', '' self.out += '<%s%s%s%s%s>\n' % \ (methodns, self.method, n, a, self.genroot(ns_map)) try: if type(self.args) != TupleType: args = (self.args,) else: args = self.args for i in args: self.dump(i, typed = typed, ns_map = ns_map) for (k, v) in self.kw.items(): self.dump(v, k, typed = typed, ns_map = ns_map) except RecursionError: if self.use_refs == 0: # restart b = SOAPBuilder(args = self.args, kw = self.kw, method = self.method, namespace = self.namespace, header = self.header, methodattrs = self.methodattrs, envelope = self.envelope, encoding = self.encoding, use_refs = 1, config = self.config) return b.build() raise if self.method: self.out += "\n" % (methodns, self.method) self.depth -= 1 if self.body: # dump may add to self.multirefs, but the for loop will keep # going until it has used all of self.multirefs, even those # entries added while in the loop. self.multis = 1 for obj, tag in self.multirefs: self.dump(obj, tag, typed = typed, ns_map = ns_map) self.out += "\n" % body_ns self.depth -= 1 if self.envelope: e = map (lambda ns: 'xmlns:%s="%s"' % (ns[1], ns[0]), self.envns.items()) self.out = '<' + self._env_top + ' '.join([''] + e) + '>\n' + \ self.out + \ self._env_bot if self.encoding != None: self.out = self._xml_enc_top % self.encoding + self.out return self.out.encode(self.encoding) return self._xml_top + self.out def gentag(self): self.tcounter += 1 return "v%d" % self.tcounter def genns(self, ns_map, nsURI): if nsURI == None: return ('', '') if type(nsURI) == TupleType: # already a tuple if len(nsURI) == 2: ns, nsURI = nsURI else: ns, nsURI = None, nsURI[0] else: ns = None if ns_map.has_key(nsURI): return (ns_map[nsURI] + ':', '') if self._env_ns.has_key(nsURI): ns = self.envns[nsURI] = ns_map[nsURI] = self._env_ns[nsURI] return (ns + ':', '') if not ns: ns = "ns%d" % self.ncounter self.ncounter += 1 ns_map[nsURI] = ns if self.config.buildWithNamespacePrefix: return (ns + ':', ' xmlns:%s="%s"' % (ns, nsURI)) else: return ('', ' xmlns="%s"' % (nsURI)) def genroot(self, ns_map): if self.depth != 2: return '' ns, n = self.genns(ns_map, NS.ENC) return ' %sroot="%d"%s' % (ns, not self.multis, n) # checkref checks an element to see if it needs to be encoded as a # multi-reference element or not. If it returns None, the element has # been handled and the caller can continue with subsequent elements. # If it returns a string, the string should be included in the opening # tag of the marshaled element. 
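    # For illustration (hedged): with use_refs=1, a value encountered a
    # second time is not re-serialized; its first occurrence carries
    # id="iN" and later occurrences are written as <tag href="#iN"/>,
    # per SOAP 1.1 multi-reference encoding.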
def checkref(self, obj, tag, ns_map): if self.depth < 2: return '' if not self.ids.has_key(id(obj)): n = self.ids[id(obj)] = self.icounter self.icounter = n + 1 if self.use_refs == 0: return '' if self.depth == 2: return ' id="i%d"' % n self.multirefs.append((obj, tag)) else: if self.use_refs == 0: raise RecursionError, "Cannot serialize recursive object" n = self.ids[id(obj)] if self.multis and self.depth == 2: return ' id="i%d"' % n self.out += '<%s href="#i%d"%s/>\n' % (tag, n, self.genroot(ns_map)) return None # dumpers def dump(self, obj, tag = None, typed = 1, ns_map = {}): ns_map = ns_map.copy() self.depth += 1 if type(tag) not in (NoneType, StringType, UnicodeType): raise KeyError, "tag must be a string or None" try: meth = getattr(self, "dump_" + type(obj).__name__) meth(obj, tag, typed, ns_map) except AttributeError: if type(obj) == LongType: obj_type = "integer" else: obj_type = type(obj).__name__ self.out += self.dumper(None, obj_type, obj, tag, typed, ns_map, self.genroot(ns_map)) self.depth -= 1 # generic dumper def dumper(self, nsURI, obj_type, obj, tag, typed = 1, ns_map = {}, rootattr = '', id = '', xml = '<%(tag)s%(type)s%(id)s%(attrs)s%(root)s>%(data)s\n'): if nsURI == None: nsURI = self.config.typesNamespaceURI tag = tag or self.gentag() a = n = t = '' if typed and obj_type: ns, n = self.genns(ns_map, nsURI) ins = self.genns(ns_map, self.config.schemaNamespaceURI)[0] t = ' %stype="%s%s"%s' % (ins, ns, obj_type, n) try: a = obj._marshalAttrs(ns_map, self) except: pass try: data = obj._marshalData() except: data = obj return xml % {"tag": tag, "type": t, "data": data, "root": rootattr, "id": id, "attrs": a} def dump_float(self, obj, tag, typed = 1, ns_map = {}): # Terrible windows hack if not good_float: if obj == float(1e300**2): obj = "INF" elif obj == float(-1e300**2): obj = "-INF" obj = str(obj) if obj in ('inf', '-inf'): obj = str(obj).upper() elif obj == 'nan': obj = 'NaN' self.out += self.dumper(None, "float", obj, tag, typed, ns_map, self.genroot(ns_map)) def dump_string(self, obj, tag, typed = 0, ns_map = {}): tag = tag or self.gentag() id = self.checkref(obj, tag, ns_map) if id == None: return try: data = obj._marshalData() except: data = obj self.out += self.dumper(None, "string", cgi.escape(data), tag, typed, ns_map, self.genroot(ns_map), id) dump_unicode = dump_string dump_str = dump_string # 4/12/2002 - MAP - for Python 2.2 def dump_None(self, obj, tag, typed = 0, ns_map = {}): tag = tag or self.gentag() ns = self.genns(ns_map, self.config.schemaNamespaceURI)[0] self.out += '<%s %snull="1"%s/>\n' % (tag, ns, self.genroot(ns_map)) def dump_list(self, obj, tag, typed = 1, ns_map = {}): if type(obj) == InstanceType: data = obj.data else: data = obj tag = tag or self.gentag() id = self.checkref(obj, tag, ns_map) if id == None: return try: sample = data[0] empty = 0 except: sample = structType() empty = 1 # First scan list to see if all are the same type same_type = 1 if not empty: for i in data[1:]: if type(sample) != type(i) or \ (type(sample) == InstanceType and \ sample.__class__ != i.__class__): same_type = 0 break ndecl = '' if same_type: if (isinstance(sample, structType)) or \ type(sample) == DictType: # force to urn struct try: tns = obj._ns or NS.URN except: tns = NS.URN ns, ndecl = self.genns(ns_map, tns) try: typename = last._typename except: typename = "SOAPStruct" t = ns + typename elif isinstance(sample, anyType): ns = sample._validNamespaceURI(self.config.typesNamespaceURI, self.config.strictNamespaces) if ns: ns, ndecl = self.genns(ns_map, ns) t 
= ns + sample._type else: t = 'ur-type' else: t = self.genns(ns_map, self.config.typesNamespaceURI)[0] + \ type(sample).__name__ else: t = self.genns(ns_map, self.config.typesNamespaceURI)[0] + \ "ur-type" try: a = obj._marshalAttrs(ns_map, self) except: a = '' ens, edecl = self.genns(ns_map, NS.ENC) ins, idecl = self.genns(ns_map, self.config.schemaNamespaceURI) self.out += \ '<%s %sarrayType="%s[%d]" %stype="%sArray"%s%s%s%s%s%s>\n' %\ (tag, ens, t, len(data), ins, ens, ndecl, edecl, idecl, self.genroot(ns_map), id, a) typed = not same_type try: elemsname = obj._elemsname except: elemsname = "item" for i in data: self.dump(i, elemsname, typed, ns_map) self.out += '\n' % tag dump_tuple = dump_list def dump_dictionary(self, obj, tag, typed = 1, ns_map = {}): tag = tag or self.gentag() id = self.checkref(obj, tag, ns_map) if id == None: return try: a = obj._marshalAttrs(ns_map, self) except: a = '' self.out += '<%s%s%s%s>\n' % \ (tag, id, a, self.genroot(ns_map)) for (k, v) in obj.items(): if k[0] != "_": self.dump(v, k, 1, ns_map) self.out += '\n' % tag dump_dict = dump_dictionary # 4/18/2002 - MAP - for Python 2.2 def dump_instance(self, obj, tag, typed = 1, ns_map = {}): if not tag: # If it has a name use it. if isinstance(obj, anyType) and obj._name: tag = obj._name else: tag = self.gentag() if isinstance(obj, arrayType): # Array self.dump_list(obj, tag, typed, ns_map) return if isinstance(obj, faultType): # Fault cns, cdecl = self.genns(ns_map, NS.ENC) vns, vdecl = self.genns(ns_map, NS.ENV) self.out += '''<%sFault %sroot="1"%s%s> %s %s ''' % (vns, cns, vdecl, cdecl, obj.faultcode, obj.faultstring) if hasattr(obj, "detail"): self.dump(obj.detail, "detail", typed, ns_map) self.out += "\n" % vns return r = self.genroot(ns_map) try: a = obj._marshalAttrs(ns_map, self) except: a = '' if isinstance(obj, voidType): # void self.out += "<%s%s%s>\n" % (tag, a, r, tag) return id = self.checkref(obj, tag, ns_map) if id == None: return if isinstance(obj, structType): # Check for namespace ndecl = '' ns = obj._validNamespaceURI(self.config.typesNamespaceURI, self.config.strictNamespaces) if ns: ns, ndecl = self.genns(ns_map, ns) tag = ns + tag self.out += "<%s%s%s%s%s>\n" % (tag, ndecl, id, a, r) # If we have order use it. order = 1 for i in obj._keys(): if i not in obj._keyord: order = 0 break if order: for i in range(len(obj._keyord)): self.dump(obj._aslist[i], obj._keyord[i], 1, ns_map) else: # don't have pristine order information, just build it. 
for (k, v) in obj.__dict__.items(): if k[0] != "_": self.dump(v, k, 1, ns_map) if isinstance(obj, bodyType): self.multis = 1 for v, k in self.multirefs: self.dump(v, k, typed = typed, ns_map = ns_map) self.out += '\n' % tag elif isinstance(obj, anyType): t = '' if typed: ns = obj._validNamespaceURI(self.config.typesNamespaceURI, self.config.strictNamespaces) if ns: ons, ondecl = self.genns(ns_map, ns) ins, indecl = self.genns(ns_map, self.config.schemaNamespaceURI) t = ' %stype="%s%s"%s%s' % \ (ins, ons, obj._type, ondecl, indecl) self.out += '<%s%s%s%s%s>%s\n' % \ (tag, t, id, a, r, obj._marshalData(), tag) else: # Some Class self.out += '<%s%s%s>\n' % (tag, id, r) for (k, v) in obj.__dict__.items(): if k[0] != "_": self.dump(v, k, 1, ns_map) self.out += '\n' % tag ################################################################################ # SOAPBuilder's more public interface ################################################################################ def buildSOAP(args=(), kw={}, method=None, namespace=None, header=None, methodattrs=None,envelope=1,encoding='UTF-8',config=Config): t = SOAPBuilder(args=args,kw=kw, method=method, namespace=namespace, header=header, methodattrs=methodattrs,envelope=envelope, encoding=encoding, config=config) return t.build() ################################################################################ # RPC ################################################################################ def SOAPUserAgent(): return "SOAP.py " + __version__ + " (actzero.com)" ################################################################################ # Client ################################################################################ class SOAPAddress: def __init__(self, url, config = Config): proto, uri = urllib.splittype(url) # apply some defaults if uri[0:2] != '//': if proto != None: uri = proto + ':' + uri uri = '//' + uri proto = 'http' host, path = urllib.splithost(uri) try: int(host) host = 'localhost:' + host except: pass if not path: path = '/' if proto not in ('http', 'https'): raise IOError, "unsupported SOAP protocol" if proto == 'https' and not config.SSLclient: raise AttributeError, \ "SSL client not supported by this Python installation" self.proto = proto self.host = host self.path = path def __str__(self): return "%(proto)s://%(host)s%(path)s" % self.__dict__ __repr__ = __str__ class HTTPTransport: # Need a Timeout someday? 
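    # A hedged usage sketch ("http://example.com/soap" and the action are
    # illustrative):
    #   HTTPTransport().call("http://example.com/soap", xml_str, "urn:Demo")
    # POSTs xml_str with the appropriate SOAPAction header and returns the
    # response payload as a string.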
def call(self, addr, data, soapaction = '', encoding = None, http_proxy = None, config = Config): import httplib if not isinstance(addr, SOAPAddress): addr = SOAPAddress(addr, config) # Build a request if http_proxy: real_addr = http_proxy real_path = addr.proto + "://" + addr.host + addr.path else: real_addr = addr.host real_path = addr.path if addr.proto == 'https': r = httplib.HTTPS(real_addr) else: r = httplib.HTTP(real_addr) r.putrequest("POST", real_path) r.putheader("Host", addr.host) r.putheader("User-agent", SOAPUserAgent()) t = 'text/xml'; if encoding != None: t += '; charset="%s"' % encoding r.putheader("Content-type", t) r.putheader("Content-length", str(len(data))) r.putheader("SOAPAction", '"%s"' % soapaction) if config.dumpHeadersOut: s = 'Outgoing HTTP headers' debugHeader(s) print "POST %s %s" % (real_path, r._http_vsn_str) print "Host:", addr.host print "User-agent: SOAP.py " + __version__ + " (actzero.com)" print "Content-type:", t print "Content-length:", len(data) print 'SOAPAction: "%s"' % soapaction debugFooter(s) r.endheaders() if config.dumpSOAPOut: s = 'Outgoing SOAP' debugHeader(s) print data, if data[-1] != '\n': print debugFooter(s) # send the payload r.send(data) # read response line code, msg, headers = r.getreply() if config.dumpHeadersIn: s = 'Incoming HTTP headers' debugHeader(s) if headers.headers: print "HTTP/1.? %d %s" % (code, msg) print "\n".join(map (lambda x: x.strip(), headers.headers)) else: print "HTTP/0.9 %d %s" % (code, msg) debugFooter(s) if config.dumpSOAPIn: data = r.getfile().read() s = 'Incoming SOAP' debugHeader(s) print data, if data[-1] != '\n': print debugFooter(s) if code not in (200, 500): raise HTTPError(code, msg) if not config.dumpSOAPIn: data = r.getfile().read() # return response payload return data ################################################################################ # SOAP Proxy ################################################################################ class SOAPProxy: def __init__(self, proxy, namespace = None, soapaction = '', header = None, methodattrs = None, transport = HTTPTransport, encoding = 'UTF-8', throw_faults = 1, unwrap_results = 1, http_proxy=None, config = Config): # Test the encoding, raising an exception if it's not known if encoding != None: ''.encode(encoding) self.proxy = SOAPAddress(proxy, config) self.namespace = namespace self.soapaction = soapaction self.header = header self.methodattrs = methodattrs self.transport = transport() self.encoding = encoding self.throw_faults = throw_faults self.unwrap_results = unwrap_results self.http_proxy = http_proxy self.config = config def __call(self, name, args, kw, ns = None, sa = None, hd = None, ma = None): ns = ns or self.namespace ma = ma or self.methodattrs if sa: # Get soapaction if type(sa) == TupleType: sa = sa[0] else: sa = self.soapaction if hd: # Get header if type(hd) == TupleType: hd = hd[0] else: hd = self.header hd = hd or self.header if ma: # Get methodattrs if type(ma) == TupleType: ma = ma[0] else: ma = self.methodattrs ma = ma or self.methodattrs m = buildSOAP(args = args, kw = kw, method = name, namespace = ns, header = hd, methodattrs = ma, encoding = self.encoding, config = self.config) #print m r = self.transport.call(self.proxy, m, sa, encoding = self.encoding, http_proxy = self.http_proxy, config = self.config) #print r p, attrs = parseSOAPRPC(r, attrs = 1) try: throw_struct = self.throw_faults and \ isinstance (p, faultType) except: throw_struct = 0 if throw_struct: raise p # Bubble a regular result up, if there is only 
element in the # struct, assume that is the result and return it. # Otherwise it will return the struct with all the elements # as attributes. if self.unwrap_results: try: count = 0 for i in p.__dict__.keys(): if i[0] != "_": # don't move the private stuff count += 1 t = getattr(p, i) if count == 1: p = t # Only one piece of data, bubble it up except: pass if self.config.returnAllAttrs: return p, attrs return p def _callWithBody(self, body): return self.__call(None, body, {}) def __getattr__(self, name): # hook to catch method calls return self.__Method(self.__call, name, config = self.config) # To handle attribute wierdness class __Method: # Some magic to bind a SOAP method to an RPC server. # Supports "nested" methods (e.g. examples.getStateName) -- concept # borrowed from xmlrpc/soaplib -- www.pythonware.com # Altered (improved?) to let you inline namespaces on a per call # basis ala SOAP::LITE -- www.soaplite.com def __init__(self, call, name, ns = None, sa = None, hd = None, ma = None, config = Config): self.__call = call self.__name = name self.__ns = ns self.__sa = sa self.__hd = hd self.__ma = ma self.__config = config if self.__name[0] == "_": if self.__name in ["__repr__","__str__"]: self.__call__ = self.__repr__ else: self.__call__ = self.__f_call else: self.__call__ = self.__r_call def __getattr__(self, name): if self.__name[0] == "_": # Don't nest method if it is a directive return self.__class__(self.__call, name, self.__ns, self.__sa, self.__hd, self.__ma) return self.__class__(self.__call, "%s.%s" % (self.__name, name), self.__ns, self.__sa, self.__hd, self.__ma) def __f_call(self, *args, **kw): if self.__name == "_ns": self.__ns = args elif self.__name == "_sa": self.__sa = args elif self.__name == "_hd": self.__hd = args elif self.__name == "_ma": self.__ma = args return self def __r_call(self, *args, **kw): return self.__call(self.__name, args, kw, self.__ns, self.__sa, self.__hd, self.__ma) def __repr__(self): return "<%s at %d>" % (self.__class__, id(self)) ################################################################################ # Server ################################################################################ # Method Signature class for adding extra info to registered funcs, right now # used just to indicate it should be called with keywords, instead of ordered # params. class MethodSig: def __init__(self, func, keywords=0, context=0): self.func = func self.keywords = keywords self.context = context self.__name__ = func.__name__ def __call__(self, *args, **kw): return apply(self.func,args,kw) class SOAPContext: def __init__(self, header, body, attrs, xmldata, connection, httpheaders, soapaction): self.header = header self.body = body self.attrs = attrs self.xmldata = xmldata self.connection = connection self.httpheaders= httpheaders self.soapaction = soapaction # A class to describe how header messages are handled class HeaderHandler: # Initially fail out if there are any problems. def __init__(self, header, attrs): for i in header.__dict__.keys(): if i[0] == "_": continue d = getattr(header, i) try: fault = int(attrs[id(d)][(NS.ENV, 'mustUnderstand')]) except: fault = 0 if fault: raise faultType, ("%s:MustUnderstand" % NS.ENV_T, "Don't understand `%s' header element but " "mustUnderstand attribute is set." 
% i) ################################################################################ # SOAP Server ################################################################################ class SOAPServer(SocketServer.TCPServer): import BaseHTTPServer class SOAPRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler): def version_string(self): return '' + \ 'SOAP.py ' + __version__ + ' (Python ' + \ sys.version.split()[0] + ')' def date_time_string(self): self.__last_date_time_string = \ SOAPServer.BaseHTTPServer.BaseHTTPRequestHandler.\ date_time_string(self) return self.__last_date_time_string def do_POST(self): try: if self.server.config.dumpHeadersIn: s = 'Incoming HTTP headers' debugHeader(s) print self.raw_requestline.strip() print "\n".join(map (lambda x: x.strip(), self.headers.headers)) debugFooter(s) data = self.rfile.read(int(self.headers["content-length"])) if self.server.config.dumpSOAPIn: s = 'Incoming SOAP' debugHeader(s) print data, if data[-1] != '\n': print debugFooter(s) (r, header, body, attrs) = \ parseSOAPRPC(data, header = 1, body = 1, attrs = 1) method = r._name args = r._aslist kw = r._asdict ns = r._ns resp = "" # For fault messages if ns: nsmethod = "%s:%s" % (ns, method) else: nsmethod = method try: # First look for registered functions if self.server.funcmap.has_key(ns) and \ self.server.funcmap[ns].has_key(method): f = self.server.funcmap[ns][method] else: # Now look at registered objects # Check for nested attributes. This works even if # there are none, because the split will return # [method] f = self.server.objmap[ns] l = method.split(".") for i in l: f = getattr(f, i) except: resp = buildSOAP(faultType("%s:Client" % NS.ENV_T, "No method %s found" % nsmethod, "%s %s" % tuple(sys.exc_info()[0:2])), encoding = self.server.encoding, config = self.server.config) status = 500 else: try: if header: x = HeaderHandler(header, attrs) # If it's wrapped, some special action may be needed if isinstance(f, MethodSig): c = None if f.context: # Build context object c = SOAPContext(header, body, attrs, data, self.connection, self.headers, self.headers["soapaction"]) if f.keywords: # This is lame, but have to de-unicode # keywords strkw = {} for (k, v) in kw.items(): strkw[str(k)] = v if c: strkw["_SOAPContext"] = c fr = apply(f, (), strkw) elif c: fr = apply(f, args, {'_SOAPContext':c}) else: fr = apply(f, args, {}) else: fr = apply(f, args, {}) if type(fr) == type(self) and \ isinstance(fr, voidType): resp = buildSOAP(kw = {'%sResponse' % method: fr}, encoding = self.server.encoding, config = self.server.config) else: resp = buildSOAP(kw = {'%sResponse' % method: {'Result': fr}}, encoding = self.server.encoding, config = self.server.config) except Exception, e: import traceback info = sys.exc_info() if self.server.config.dumpFaultInfo: s = 'Method %s exception' % nsmethod debugHeader(s) traceback.print_exception(info[0], info[1], info[2]) debugFooter(s) if isinstance(e, faultType): f = e else: f = faultType("%s:Server" % NS.ENV_T, "Method %s failed." 
% nsmethod) if self.server.config.returnFaultInfo: f._setDetail("".join(traceback.format_exception( info[0], info[1], info[2]))) elif not hasattr(f, 'detail'): f._setDetail("%s %s" % (info[0], info[1])) resp = buildSOAP(f, encoding = self.server.encoding, config = self.server.config) status = 500 else: status = 200 except faultType, e: import traceback info = sys.exc_info() if self.server.config.dumpFaultInfo: s = 'Received fault exception' debugHeader(s) traceback.print_exception(info[0], info[1], info[2]) debugFooter(s) if self.server.config.returnFaultInfo: e._setDetail("".join(traceback.format_exception( info[0], info[1], info[2]))) elif not hasattr(e, 'detail'): e._setDetail("%s %s" % (info[0], info[1])) resp = buildSOAP(e, encoding = self.server.encoding, config = self.server.config) status = 500 except: # internal error, report as HTTP server error if self.server.config.dumpFaultInfo: import traceback s = 'Internal exception' debugHeader(s) traceback.print_exc () debugFooter(s) self.send_response(500) self.end_headers() if self.server.config.dumpHeadersOut and \ self.request_version != 'HTTP/0.9': s = 'Outgoing HTTP headers' debugHeader(s) if self.responses.has_key(status): s = ' ' + self.responses[status][0] else: s = '' print "%s %d%s" % (self.protocol_version, 500, s) print "Server:", self.version_string() print "Date:", self.__last_date_time_string debugFooter(s) else: # got a valid SOAP response self.send_response(status) t = 'text/xml'; if self.server.encoding != None: t += '; charset="%s"' % self.server.encoding self.send_header("Content-type", t) self.send_header("Content-length", str(len(resp))) self.end_headers() if self.server.config.dumpHeadersOut and \ self.request_version != 'HTTP/0.9': s = 'Outgoing HTTP headers' debugHeader(s) if self.responses.has_key(status): s = ' ' + self.responses[status][0] else: s = '' print "%s %d%s" % (self.protocol_version, status, s) print "Server:", self.version_string() print "Date:", self.__last_date_time_string print "Content-type:", t print "Content-length:", len(resp) debugFooter(s) if self.server.config.dumpSOAPOut: s = 'Outgoing SOAP' debugHeader(s) print resp, if resp[-1] != '\n': print debugFooter(s) self.wfile.write(resp) self.wfile.flush() # We should be able to shut down both a regular and an SSL # connection, but under Python 2.1, calling shutdown on an # SSL connections drops the output, so this work-around. # This should be investigated more someday. 
if self.server.config.SSLserver and \ isinstance(self.connection, SSL.Connection): self.connection.set_shutdown(SSL.SSL_SENT_SHUTDOWN | SSL.SSL_RECEIVED_SHUTDOWN) else: self.connection.shutdown(1) def log_message(self, format, *args): if self.server.log: SOAPServer.BaseHTTPServer.BaseHTTPRequestHandler.\ log_message (self, format, *args) def __init__(self, addr = ('localhost', 8000), RequestHandler = SOAPRequestHandler, log = 1, encoding = 'UTF-8', config = Config, namespace = None, ssl_context = None): # Test the encoding, raising an exception if it's not known if encoding != None: ''.encode(encoding) if ssl_context != None and not config.SSLserver: raise AttributeError, \ "SSL server not supported by this Python installation" self.namespace = namespace self.objmap = {} self.funcmap = {} self.ssl_context = ssl_context self.encoding = encoding self.config = config self.log = log self.allow_reuse_address= 1 SocketServer.TCPServer.__init__(self, addr, RequestHandler) def get_request(self): sock, addr = SocketServer.TCPServer.get_request(self) if self.ssl_context: sock = SSL.Connection(self.ssl_context, sock) sock._setup_ssl(addr) if sock.accept_ssl() != 1: raise socket.error, "Couldn't accept SSL connection" return sock, addr def registerObject(self, object, namespace = ''): if namespace == '': namespace = self.namespace self.objmap[namespace] = object def registerFunction(self, function, namespace = '', funcName = None): if not funcName : funcName = function.__name__ if namespace == '': namespace = self.namespace if self.funcmap.has_key(namespace): self.funcmap[namespace][funcName] = function else: self.funcmap[namespace] = {funcName : function} def registerKWObject(self, object, namespace = ''): if namespace == '': namespace = self.namespace for i in dir(object.__class__): if i[0] != "_" and callable(getattr(object, i)): self.registerKWFunction(getattr(object,i), namespace) # convenience - wraps your func for you. def registerKWFunction(self, function, namespace = '', funcName = None): self.registerFunction(MethodSig(function,keywords=1), namespace, funcName) nodebox-web-1.9.4.6/urbandictionary.py000066400000000000000000000050321135274433600176610ustar00rootroot00000000000000import url import soap import re from cache import Cache import pickle def clear_cache(): Cache("urbandictionary").clear() class UrbanDictionaryDefinition: def __init__(self, word, url, description, example, author): self.word = word self.url = url self.description = description self.example = example self.author = author self._parse() def _parse(self): """ Strips links from the definition and gathers them in a links property. """ p1 = "\[.*?\](.*?)\[\/.*?\]" p2 = "\[(.*?)\]" self.links = [] for p in (p1,p2): for link in re.findall(p, self.description): self.links.append(link) self.description = re.sub(p, "\\1", self.description) self.description = self.description.strip() def __str__(self): return self.description class UrbanDictionaryError(Exception): pass class UrbanDictionary(list): def __init__(self, q, cached=True): url = "http://api.urbandictionary.com/soap?wsdl" key = "91cf66fb7f14bbf7fb59c7cf5e22155f" # Live connect for uncached queries # or queries we do not have in cache. 
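        # A hedged usage sketch (assumes the SOAP service and the key above
        # still respond; if they don't, UrbanDictionaryError is raised):
        #   for d in search("hello"): print d.word, "-", d.description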
        cache = Cache("urbandictionary", ".pickle")
        if not cached or not cache.exists(q):
            server = soap.SOAPProxy(url)
            try:
                definitions = server.lookup(key, q)
            except Exception, soap.faultType:
                raise UrbanDictionaryError, "the API is no longer supported"
            data = []
            for item in definitions:
                ubd = UrbanDictionaryDefinition(
                    item.word, item.url, item.definition, item.example, item.author
                )
                self.append(ubd)
                data.append( [item.word, item.url, item.definition, item.example, item.author] )
            # Cache a pickled version of the response.
            if cached:
                data = pickle.dumps(data)
                cache.write(q, data)
        # For cached queries,
        # unpack the pickled version in the cache.
        else:
            definitions = cache.read(q)
            definitions = pickle.loads(definitions)
            for item in definitions:
                ubd = UrbanDictionaryDefinition(
                    item[0], item[1], item[2], item[3], item[4]
                )
                self.append(ubd)

def search(q, cached=True):
    return UrbanDictionary(q, cached)

nodebox-web-1.9.4.6/url.py000066400000000000000000000357641135274433600153050ustar00rootroot00000000000000

### URL ##############################################################################################

# Code for identifying, parsing and retrieving URLs.
# The asynchronous URLAccumulator is subclassed in all other services.

# Author: Tom De Smedt.
# Copyright (c) 2007 by Tom De Smedt.
# See LICENSE.txt for details.

import os
import socket, urllib, urllib2, urlparse
import thread, time
from warnings import warn

from cache import Cache

### SETTINGS #########################################################################################

USER_AGENT = "NodeBox/1.9.4 +http://nodebox.net"
REFERER = "http://nodebox.net"

### URLERROR #########################################################################################

class URLError(Exception):
    # A fault in the URL, like a missing t in htp://
    def __str__(self):
        return str(self.__class__)

class URLTimeout(URLError):
    # URL took too long to load.
    def __str__(self):
        return str(self.__class__)

class HTTPError(URLError):
    # Error on server.
    def __str__(self):
        return str(self.__class__)

class HTTP401Authentication(HTTPError):
    # URL requires a login and password.
    def __str__(self):
        return str(self.__class__)

class HTTP403Forbidden(HTTPError):
    # No access to this URL (user-agent?)
    def __str__(self):
        return str(self.__class__)

class HTTP404NotFound(HTTPError):
    # URL doesn't exist on the internet.
    def __str__(self):
        return str(self.__class__)

### URLPARSER ########################################################################################

class URLParser:

    def __init__(self, url="", method="get"):
        """ Splits a URL string into different parts.
        The parts are:
        protocol, domain, login, username, password, port, path, page, query, anchor.
        The method defaults to get when the url has a query part.
        Setting it to post will submit the query by POST when opening the url.
        """

        # If the url is a URLParser, copy POST parameters correctly.
        is_post_urlparser = False
        if isinstance(url, URLParser) and url.method == "post":
            is_post_urlparser = True
            url.method = "get"

        # If the url is a URLParser, use its string representation.
        # See that the original object's method is correctly reset.
urlstr = str(url) if is_post_urlparser: url.method = "post" url = urlstr # Consider the following url: # http://user:pass@example.com:992/animal/bird?species=seagull#wings # protocol: http # domain: example.com url = urlparse.urlsplit(url) self.protocol = url[0] self.domain = url[1] # username: user # password: pass self.username = "" self.password = "" if self.domain.find("@") >= 0: login = self.domain.split("@")[0] if login.find(":") >= 0: self.username = login.split(":")[0] self.password = login.split(":")[1] self.domain = self.domain.split("@")[1] # port: 992 self.port = "" if self.domain.find(":") >= 0: p = self.domain.split(":") if p[1].isdigit(): self.port = p[1] self.domain = p[0] # path: /animal/ # page: bird self.path = url[2] self.page = "" if not self.path.endswith("/"): if self.path.find("/") >= 0: self.page = self.path.split("/")[-1] self.path = self.path[:-len(self.page)] else: self.page = self.path self.path = "" self.filename = self.page # query: {"species": "seagull"} self.query = {} self.method = method if url[3] != "": self.method = "get" if is_post_urlparser: self.method = "post" for param in url[3].split("&"): key, value = "", "" if param.find("=") >= 0: try: (key, value) = param.split("=") except: key = param else: key = param if key != "": self.query[key] = value # anchor: wings self.anchor = url[4] def __str__(self): """ Reforms a url string from the different parts. """ url = "" if self.protocol != "" : url += self.protocol + "://" if self.username != "" : url += self.username + ":" + self.password + "@" if self.domain != "" : url += self.domain if self.port != "" : url += ":" + self.port if self.path != "" : url += self.path if self.page != "" : url += self.page if self.method == "get" and \ len(self.query) > 0 : url += "?" + urllib.urlencode(self.query) if self.anchor != "" : url += "#" + self.anchor return url def _address(self): return str(self) address = property(_address) def parse(url): return URLParser(url) def create(url="", method="get"): return URLParser(url, method) #url = parse("http://user:pass@example.com:992/animal/bird?species=seagull#wings") #print url.domain #print url.path #print url.page #print url ### URL OPENER ####################################################################################### PROXY = None def set_proxy(host, type="https"): global PROXY if host != None: PROXY = (host, type) else: PROXY = None def open(url, wait=10): """ Returns a connection to a url which you can read(). When the wait amount is exceeded, raises a URLTimeout. When an error occurs, raises a URLError. 404 errors specifically return a HTTP404NotFound. """ # If the url is a URLParser, get any POST parameters. post = None if isinstance(url, URLParser) and url.method == "post": post = urllib.urlencode(url.query) # If the url is a URLParser (or a YahooResult or something), # use its string representation. url = str(url) # Use urllib instead of urllib2 for local files. 
if os.path.exists(url): return urllib.urlopen(url) else: socket.setdefaulttimeout(wait) try: #connection = urllib2.urlopen(url, post) request = urllib2.Request(url, post, {"User-Agent": USER_AGENT, "Referer": REFERER}) if PROXY: p = urllib2.ProxyHandler({PROXY[1]: PROXY[0]}) o = urllib2.build_opener(p, urllib2.HTTPHandler) urllib2.install_opener(o) connection = urllib2.urlopen(request) except urllib2.HTTPError, e: if e.code == 401: raise HTTP401Authentication if e.code == 403: raise HTTP403Forbidden if e.code == 404: raise HTTP404NotFound raise HTTPError except urllib2.URLError, e: if e.reason[0] == 36: raise URLTimeout raise URLError return connection #print open("http://nodebox.net") #print open("http:/nodebox.net") #print open("http://ndoebox.net") #print open("http://nodebox.net/doink") #print open("url.py").info() #print open("boink.py").info() #print open("file://url.py").info() ### URL VALIDATION ################################################################################### def is_url(url, wait=10): """ Returns False when no connection can be opened to the url. """ try: connection = open(url, wait) except: return False return True def not_found(url, wait=10): """ Returns True when the url generates a "404 Not Found" error. """ try: connection = open(url, wait) except HTTP404NotFound: return True except: return False return False #url = "http://ndoebox.net" #print is_url(url) #print not_found("http://nodebox.net/nonexistent.html") #print not_found("http://nodebox.net/") ### URL MIMETYPE ################################################################################### def is_type(url, types=[], wait=10): """ Determine the MIME-type of the document behind the url. MIME is more reliable than simply checking the document extension. Returns True when the MIME-type starts with anything in the list of types. """ # Types can also be a single string for convenience. if isinstance(types, str): types = [types] try: connection = open(url, wait) except: return False type = connection.info()["Content-Type"] for t in types: if type.startswith(t): return True return False def is_webpage(url, wait=10): return is_type(url, "text/html", wait) is_page = is_webpage def is_stylesheet(url, wait=10): return is_type(url, "text/css", wait) def is_plaintext(url, wait=10): return is_type(url, "text/plain", wait) def is_pdf(url, wait=10): return is_type(url, "application/pdf", wait) def is_newsfeed(url, wait=10): return is_type(url, ["application/rss+xml", "application/atom+xml"], wait) def is_image(url, wait=10): return is_type(url, ["image/gif", "image/jpeg", "image/x-png"], wait) def is_audio(url, wait=10): return is_type(url, ["audio/mpeg", "audio/x-aiff", "audio/x-wav"], wait) def is_video(url, wait=10): return is_type(url, ["video/mpeg", "video/quicktime"], wait) def is_archive(url, wait=10): return is_type(url, ["application/x-stuffit", "application/x-tar", "application/zip"], wait) #print is_webpage("http://nodebox.net") #print is_archive("http://nodebox.net/code/data/media/coreimage.zip") ### URLACCUMULATOR ################################################################################### urlaccumulator_throttle = {} class URLAccumulator: def __init__(self, url, wait=60, asynchronous=False, cache=None, type=".html", throttle=0): """ Creates a threaded connection to a url and reads data. URLAccumulator can run asynchronously which is useful for animations. The done property is set to True when downloading is complete. The error attribute contains a URLError exception when no data is found. 
URLAccumulator data can be cached. Downloads that resulted in an error will write an empty file to the cache, the data property will be an empty string but no error is logged when the data is read from the cache in later calls. URLAccumulator can be throttled. This ensures only a certain amount of requests to a domain will happen in a given period of time. URLAccumulator data is loaded. It has a load() method that is called once when done. """ self.url = url self.data = None self.redirect = None self.error = None if cache != None: self.cached = True self._cache = Cache(cache, type) else: self.cached = False self._cache = None self._domain = URLParser(self.url).domain self._throttle = throttle global urlaccumulator_throttle if not self._domain in urlaccumulator_throttle: urlaccumulator_throttle[self._domain] = time.time() - self._throttle self._start = time.time() self._wait = wait self._busy = True self._loaded = False # Synchronous downloads wait until completed, # otherwise check the done property. thread.start_new_thread(self._retrieve, (self.url,)) if not asynchronous: while not self._done(): time.sleep(0.1) def _queued(self): # Throttles live requests: # waits until the current time is greater than # the time of the last request plus the throttle amount. global urlaccumulator_throttle if self.cached and self._cache.exists(str(self.url)): return False elif time.time() < urlaccumulator_throttle[self._domain] + self._throttle: return True else: urlaccumulator_throttle[self._domain] = time.time() return False def _retrieve(self, url): # When the url data is stored in cache, load that. # Otherwise, retrieve it from the web. if self.cached and self._cache.exists(str(url)): self.data = self._cache.read(str(url)) else: try: connection = open(url) self.data = connection.read() self.redirect = connection.geturl() if self.redirect == str(url): self.redirect = None except Exception, e: self.data = u"" self.error = e self._busy = False def _done(self): # Will continue downloading asynchronously. # 1) When the time limit is exceeded, logs a Timeout error. # 2) Once uncached data is ready, stores it in cache. # 3) Loads the data. # 4) Issues a warning when an error occured. if (not self._busy or time.time() >= self._start + self._wait) \ and not self._queued(): # 1 if self.data == None and \ self.error == None: self.data = u"" self.error = URLTimeout() self.load(self.data) self._busy = False if self.cached and not self._cache.exists(str(self.url)) \ and self.data != None and self.data != "": # 2 self._cache.write(str(self.url), str(self.data)) if not self._loaded and self.error == None: # 3 self.load(self.data) self._loaded = True if self.error != None: # 4 warn(str(self.error)+" in "+str(self.__class__)+" for "+str(self.url), Warning) return True else: return False done = property(_done) def load(self, data): """ Override this method in subclasses to process downloaded data. 
""" pass def retrieve(url, wait=60, asynchronous=False, cache=None, type=".html"): ua = URLAccumulator(url, wait, asynchronous, cache, type) return ua #r = retrieve("http://nodebox.net") #print r.data #print r.redirect #url = create("http://api.search.yahoo.com/ContentAnalysisService/V1/termExtraction", method="post") #url.query["appid"] = "YahooDemo" #url.query["context"] = "Italian sculptors and painters of the renaissance favored the Virgin Mary for inspiration" #url.query["query"] = "madonna" #r = retrieve(url) #print r.data #r = retrieve("http://nodebox.net", asynchronous=True) #while not r.done: # print "wait...", # time.sleep(0.1) #print r.redirect # XXX - should or should we not do quote_plus() somewhere in here?nodebox-web-1.9.4.6/wikipedia.py000066400000000000000000001603121135274433600164350ustar00rootroot00000000000000# coding: utf-8 ### WIKIPEDIA ######################################################################################## # Code for querying Wikipedia and parsing articles. # The parser is as-is, it handles a lot but not everything. # Author: Tom De Smedt. # Copyright (c) 2007 by Tom De Smedt. # See LICENSE.txt for details. import re from xml.dom import minidom from urllib import quote from url import URLAccumulator from html import replace_entities, strip_tags from cache import Cache import mimetex def clear_cache(): Cache("wikipedia").clear() ### WIKIPEDIA PAGE MISSING ERROR ##################################################################### class WikipediaPageMissing(Exception): def __str__(self): return str(self.__class__) ### WIKIPEDIA LANGUAGES ############################################################################## languages = { "aa" : u"Afar", "ab" : u"Abkhaz", "af" : u"Afrikaans", "ak" : u"Akan", "als" : u"Alemannic", "am" : u"Amharic", "an" : u"Aragonese", "ang" : u"Old English", "ar" : u"Arabic", "arc" : u"Aramaic", "as" : u"Assamese", "ast" : u"Asturian", "av" : u"Avar", "ay" : u"Aymara", "az" : u"Azerbaijani", "ba" : u"Bashkir", "bar" : u"Bavarian", "bat-smg" : u"Samogitian", "be" : u"Belarusian", "bg" : u"Bulgarian", "bh" : u"Bihara", "bi" : u"Bislama", "bm" : u"Bambara", "bn" : u"Bengali", "bo" : u"Tibetan", "bpy" : u"Bishnupriya Manipuri", "br" : u"Breton", "bs" : u"Bosnian", "bug" : u"Buginese", "bxr" : u"Buryat (Russia)", "ca" : u"Catalan", "cbk-zam" : u"Zamboanga Chavacano", "cdo" : u"Min Dong", "ce" : u"Chechen", "ceb" : u"Cebuano", "ch" : u"Chamorro", "cho" : u"Choctaw", "chr" : u"Cherokee", "chy" : u"Cheyenne", "co" : u"Corsican", "cr" : u"Cree", "crh" : u"Crimean Tatar", "crh-latn" : u"Crimean Tatar (Latin)", "crh-cyrl" : u"Crimean Tatar (Cyrillic)", "cs" : u"Czech", "csb" : u"Cassubian", "cu" : u"Old Church Slavonic (ancient language)", "cv" : u"Chuvash", "cy" : u"Welsh", "da" : u"Danish", "de" : u"German", "diq" : u"Zazaki", "dk" : u"Danish", "dv" : u"Dhivehi", "dz" : u"Bhutani", "ee" : u"Ewe", "el" : u"Greek", "eml" : u"Emilian-Romagnol/Sammarinese", "en" : u"English", "eo" : u"Esperanto", "es" : u"Spanish", "et" : u"Estonian", "eu" : u"Basque", "fa" : u"Persian", "ff" : u"Fulah", "fi" : u"Finnish", "fiu-vro" : u"Voro", "fj" : u"Fijian", "fo" : u"Faroese", "fr" : u"French", "frp" : u"Franco-Provencal/Arpitan", "fur" : u"Friulian", "fy" : u"Frisian", "ga" : u"Irish", "gd" : u"Scots Gaelic", "gl" : u"Gallegan", "glk" : u"Gilaki", "gn" : u"Guarani", "got" : u"Gothic", "gsw" : u"Alemannic", "gu" : u"Gujarati", "gv" : u"Manx", "ha" : u"Hausa", "haw" : u"Hawaiian", "he" : u"Hebrew", "hi" : u"Hindi", "hil" : u"Hiligaynon", "ho" : u"Hiri 
Motu", "hr" : u"Croatian", "hsb" : u"Upper Sorbian", "ht" : u"Haitian", "hu" : u"Hungarian", "hy" : u"Armenian", "hz" : u"Herero", "ia" : u"Interlingua (IALA)", "id" : u"Indonesian", "ie" : u"Interlingue (Occidental)", "ig" : u"Igbo", "ii" : u"Sichuan Yi", "ik" : u"Inupiak", "ilo" : u"Ilokano", "io" : u"Ido", "is" : u"Icelandic", "it" : u"Italian", "iu" : u"Inuktitut", "ja" : u"Japanese", "jbo" : u"Lojban", "jv" : u"Javanese", "ka" : u"Georgian", "kaa" : u"Karakalpak", "kab" : u"Kabyle", "kg" : u"KiKongo", "ki" : u"Kikuyu", "kj" : u"Kuanyama", "kk" : u"Kazakh", "kk-cn" : u"Kazakh Arabic", "kk-kz" : u"Kazakh Cyrillic", "kk-tr" : u"Kazakh Latin", "kl" : u"Greenlandic", "km" : u"Cambodian", "kn" : u"Kannada", "ko" : u"Korean", "kr" : u"Kanuri", "ks" : u"Kashmiri", "ksh" : u"Ripuarian", "ku" : u"Kurdish", "kv" : u"Komi", "kw" : u"Cornish", "ky" : u"Kirghiz", "la" : u"Latin", "lad" : u"Ladino", "lbe" : u"Lak", "lb" : u"Luxemburguish", "lg" : u"Ganda", "li" : u"Limburgian", "lij" : u"Ligurian", "lld" : u"Ladin", "lmo" : u"Lombard", "ln" : u"Lingala", "lo" : u"Laotian", "lt" : u"Lithuanian", "lv" : u"Latvian", "lzz" : u"Laz", "map-bms" : u"Banyumasan", "mg" : u"Malagasy", "mh" : u"Marshallese", "mi" : u"Maori", "minnan" : u"Min-nan", "mk" : u"Macedonian", "ml" : u"Malayalam", "mn" : u"Mongoloian", "mo" : u"Moldovan", "mr" : u"Marathi", "ms" : u"Malay", "mt" : u"Maltese", "mus" : u"Creek", "my" : u"Burmese", "mzn" : u"Mazandarin", "na" : u"Nauruan", "nah" : u"Nahuatl", "nan" : u"Min-nan", "nap" : u"Neapolitan", "nb" : u"Norwegian (Bokmal)", "nds" : u"Low German", "nds-nl" : u"Dutch Low Saxon", "ne" : u"Nepali", "new" : u"Newar/Nepal Bhasa", "ng" : u"Ndonga", "nl" : u"Dutch", "nn" : u"Norwegian (Nynorsk)", "no" : u"Norwegian", "non" : u"Old Norse", "nov" : u"Novial", "nrm" : u"Norman", "nv" : u"Navajo", "ny" : u"Chichewa", "oc" : u"Occitan", "om" : u"Oromo", "or" : u"Oriya", "os" : u"Ossetic", "pa" : u"Punjabi", "pag" : u"Pangasinan", "pam" : u"Pampanga", "pap" : u"Papiamentu", "pdc" : u"Pennsylvania German", "pih" : u"Norfuk/Pitcairn/Norfolk", "pi" : u"Pali", "pl" : u"Polish", "pms" : u"Piedmontese", "ps" : u"Pashto", "pt" : u"Portuguese", "pt-br" : u"Brazilian Portuguese", "qu" : u"Quechua", "rm" : u"Raeto-Romance", "rmy" : u"Vlax Romany", "rn" : u"Kirundi", "ro" : u"Romanian", "roa-rup" : u"Aromanian", "roa-tara" : u"Tarantino", "ru" : u"Russian", "ru-sib" : u"Siberian/North Russian", "rw" : u"Kinyarwanda", "sa" : u"Sanskrit", "sc" : u"Sardinian", "scn" : u"Sicilian", "sco" : u"Scots", "sd" : u"Sindhi", "se" : u"Northern Sami", "sg" : u"Sango", "sh" : u"Serbocroatian", "si" : u"Sinhalese", "simple" : u"Simple English", "sk" : u"Slovak", "sl" : u"Slovenian", "sm" : u"Samoan", "sn" : u"Shona", "so" : u"Somali", "sq" : u"Albanian", "sr" : u"Serbian", "sr-ec" : u"Serbian cyrillic ekavian", "sr-jc" : u"Serbian cyrillic iyekvian", "sr-el" : u"Serbian latin ekavian", "sr-jl" : u"Serbian latin iyekavian", "ss" : u"Swati", "st" : u"Southern Sotho", "su" : u"Sundanese", "sv" : u"Swedish", "sw" : u"Swahili", "ta" : u"Tamil", "te" : u"Telugu", "tet" : u"Tetun", "tg" : u"Tajik", "th" : u"Thai", "ti" : u"Tigrinya", "tk" : u"Turkmen", "tl" : u"Tagalog (Filipino)", "tlh" : u"Klingon", "tn" : u"Setswana", "to" : u"Tonga (Tonga Islands)", "tokipona" : u"Toki Pona", "tp" : u"Toki Pona", "tpi" : u"Tok Pisin", "tr" : u"Turkish", "ts" : u"Tsonga", "tt" : u"Tatar", "tum" : u"Tumbuka", "tw" : u"Twi", "ty" : u"Tahitian", "tyv" : u"Tyvan", "udm" : u"Udmurt", "ug" : u"Uyghur", "uk" : u"Ukrainian", "ur" : u"Urdu", "uz" : 
u"Uzbek", "ve" : u"Venda", "vec" : u"Venetian", "vi" : u"Vietnamese", "vls" : u"West Flemish", "vo" : u"Volapuk", "wa" : u"Walloon", "war" : u"Waray-Waray", "wo" : u"Wolof", "wuu" : u"Wu", "xal" : u"Kalmyk", "xh" : u"Xhosan", "yi" : u"Yiddish", "yo" : u"Yoruba", "za" : u"Zhuang", "zea" : u"Zealandic", "zh" : u"Chinese", # correct? "zh-cfr" : u"Min-nan", "zh-classical" : u"Classical Chinese/Literary Chinese", "zh-cn" : u"Simplified", "zh-hk" : u"Traditional (Hong Kong)", "zh-min-nan" : u"Min-nan", "zh-sg" : u"Simplified (Singapore)", "zh-tw" : u"Traditional", "zh-yue" : u"Cantonese", "zu" : u"Zulu", } ### WIKIPEDIALINK #################################################################################### # Currently not in use. class WikipediaLink: def __init__(self, page, anchor=u"", display=u""): self.page = page self.anchor = anchor self.display = display def __str__(self): return self.page.encode("utf-8") def __unicode__(self): return self.page ### WIKIPEDIAPARAGRAPH ############################################################################### class WikipediaParagraph(list): def __init__(self, title=u"", main=[], related=[], tables=[]): self.title = title self.main = main self.related = related self.tables = [] self.depth = 0 self.parent = None self.children = [] def __str__(self): s = "\n\n".join(self) return s.encode("utf-8") def __unicode__(self): s = "\n\n".join(self) return s ### WIKIPEDIAIMAGE ################################################################################### class WikipediaImage: def __init__(self, path, description=u"", links=[], properties=[]): self.path = path self.description = description self.links = links self.properties = properties def __str__(self): return self.path.encode("utf-8") def __unicode__(self): return self.path ### WIKIPEDIAREFERENCES ############################################################################## class WikipediaReference: def __init__(self, title=u"", url=u""): self.title = title self.url = url self.author = u"" self.first = u"" self.last = u"" self.journal = u"" self.publisher = u"" self.date = u"" self.year = u"" self.id = u"" self.note = u"" def __str__(self): s = "" for key in ["note", "author", "title", "journal", "publisher", "date", "id", "url"]: value = getattr(self, key) if value != "": s += value.rstrip(".,") + ", " s = s.strip(", \n") return s.encode("utf-8") def __unicode__(self): return str(self).decode("utf-8") ### WIKIPEDIATABLE ################################################################################### class WikipediaTable(list): def __init__(self, title=u"", properties=u"", paragraph=None): self.title = u"" self.properties = properties self.paragraph = None class WikipediaTableRow(list): def __init__(self, heading=False, properties=u""): self.properties = properties class WikipediaTableCell(unicode): def __init__(self, data): unicode.__init__(self, data) self.properties = u"" ### WIKIPEDIAPAGE #################################################################################### class WikipediaPage: def __init__(self, title, markup, light=False, full_strip=True): """ Wikipedia page parser. The expected markup is the stuff in Wikipedia's edit textarea. With light=True, it will onlt parse links to other articles (which is faster). With full_strip=False, it will preserve some HTML markup (links, bold, italic). 
""" self.title = title self.markup = markup self.full_strip = full_strip self.disambiguation = [] self.categories = [] self.links = [] self.paragraphs = [] self.images = [] self.tables = [] self.references = [] self.translations = {} self.important = [] # Main regular expressions used in the parser. self.re = { "disambiguation" : r"\{\{dablink\|(.*)\}\}", "category" : r"\[\[[:]{0,1}Category:(.*?)\]\]", "link" : r"\[\[([^\:]*?)\]\]", "image" : re.compile(r"\[\[Image:[^\[]*\|.*\]\]", re.I), "gallery" : re.compile("(.*?)", re.DOTALL), "table" : re.compile(r"\{\|.*?\|\}", re.DOTALL), "html-table" : re.compile(r".*?
    ", re.DOTALL), "reference" : re.compile(r".*?", re.DOTALL), "citation" : re.compile(r"\{\{cite.*?\}\}", re.DOTALL), "url" : r"\[(http\://.*?)\]", "preformatted" : re.compile(r".*?", re.DOTALL), "translation" : r"\[\[([^\].]*?):(.*?)\]\]", "bold" : r"\'\'\'(.*?)\'\'\'", "comment" : re.compile(r"", re.DOTALL), } # In the process of stripping references and citations from the markup, # they are temporarily marked by this pattern. # Don't use any regex characters in it. self.ref = "--REF--" self.parse(light) def __unicode__(self): str = u"" for paragraph in self.paragraphs: str += paragraph.title+"\n\n" for textblock in paragraph: str += unicode(textblock)+"\n\n" return str def __str__(self): s = "" for p in self.paragraphs: s += (p.title.encode("utf-8") + "\n\n").lstrip("\n") s += (str(p) + "\n\n").lstrip("\n") return s def __unicode__(self): return str(self).decode("utf-8") def parse(self, light=False): """ Parses data from Wikipedia page markup. The markup comes from Wikipedia's edit page. We parse it here into objects containing plain text. The light version parses only links to other articles, it's faster than a full parse. """ markup = self.markup self.disambiguation = self.parse_disambiguation(markup) self.categories = self.parse_categories(markup) self.links = self.parse_links(markup) if not light: # Conversion of HTML markup to Wikipedia markup. markup = self.convert_pre(markup) markup = self.convert_li(markup) markup = self.convert_table(markup) markup = replace_entities(markup) # Harvest references from the markup # and replace them by footnotes. markup = markup.replace("{{Cite", "{{cite") markup = re.sub("\{\{ {1,2}cite", "{{cite", markup) self.references, markup = self.parse_references(markup) # Make sure there are no legend linebreaks in image links. # Then harvest images and strip them from the markup. markup = re.sub("\n+(\{\{legend)", "\\1", markup) self.images, markup = self.parse_images(markup) self.images.extend(self.parse_gallery_images(markup)) self.paragraphs = self.parse_paragraphs(markup) self.tables = self.parse_tables(markup) self.translations = self.parse_translations(markup) self.important = self.parse_important(markup) def plain(self, markup): """ Strips Wikipedia markup from given text. This creates a "plain" version of the markup, stripping images and references and the like. Does some commonsense maintenance as well, like collapsing multiple spaces. If you specified full_strip=False for WikipediaPage instance, some markup is preserved as HTML (links, bold, italic). """ # Strip bold and italic. if self.full_strip: markup = markup.replace("'''", "") markup = markup.replace("''", "") else: markup = re.sub("'''([^']*?)'''", "\\1", markup) markup = re.sub("''([^']*?)''", "\\1", markup) # Strip image gallery sections. markup = re.sub(self.re["gallery"], "", markup) # Strip tables. markup = re.sub(self.re["table"], "", markup) markup = markup.replace("||", "") markup = markup.replace("|}", "") # Strip links, keeping the display alias. # We'll strip the ending ]] later. if self.full_strip: markup = re.sub(r"\[\[[^\]]*?\|", "", markup) else: markup = re.sub(r"\[\[([^]|]*|)\]\]", '\\1', markup) markup = re.sub(r"\[\[([^]|]*|)\|([^]]*)\]\]", '\\2', markup) # Strip translations, users, etc. 
markup = re.sub(self.re["translation"], "", markup) # This math TeX is not supported: markup = markup.replace("\displaytyle", "") markup = markup.replace("\textstyle", "") markup = markup.replace("\scriptstyle", "") markup = markup.replace("\scriptscriptstyle", "") # Before stripping [ and ] brackets, # make sure they are retained inside equations. markup = re.sub("(.*?)\[(.*?)", "\\1MATH___OPEN\\2", markup) markup = re.sub("(.*?)\](.*?)", "\\1MATH___CLOSE\\2", markup) markup = markup.replace("[", "") markup = markup.replace("]", "") markup = markup.replace("MATH___OPEN", "[") markup = markup.replace("MATH___CLOSE", "]") # a) Strip references. # b) Strip tags. # c) Strip tags. # d) Replace --REF--(12) by [12]. # e) Remove space between [12] and trailing punctuation ., # f) Remove HTML comment # g) Keep the Latin Extended-B template: {{latinx| }} # h) Strip Middle-Earth references. # i) Keep quotes: {{quote| }} # j) Remove templates markup = re.sub(self.re["reference"], "", markup) # a markup = re.sub("", "", markup) # b markup = re.sub("", "", markup) # c markup = re.sub(self.ref+"\(([0-9]*?)\)", "[\\1] ", markup) # d markup = re.sub("\] ([,.\"\?\)])", "]\\1", markup) # e markup = re.sub(self.re["comment"], "", markup) # f markup = re.sub("\{\{latinx\|(.*?)\}\}", "\\1", markup) # g markup = re.sub("\{\{ME-ref.*?\}\}", "", markup) # h markup = re.sub("\{\{quote\|(.*?)\}\}", "\"\\1\"", markup) # i markup = re.sub(re.compile("\{\{.*?\}\}", re.DOTALL), "", markup) # j markup = markup.replace("}}", "") # Collapse multiple spaces between words, # unless they appear in preformatted text. markup = re.sub("", " ", markup) markup = markup.split("\n") for i in range(len(markup)): if not markup[i].startswith(" "): markup[i] = re.sub(r"[ ]+", " ", markup[i]) markup = "\n".join(markup) markup = markup.replace(" .", ".") # Strip all HTML except tags. if self.full_strip: markup = strip_tags(markup, exclude=["math"], linebreaks=True) markup = markup.strip() return markup def convert_pre(self, markup): """ Substitutes

            <pre> to Wikipedia markup by adding a space at the start of a line.
            """
            
            for m in re.findall(self.re["preformatted"], markup):
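                # Indent every line of this preformatted block with a space,
                # then strip its enclosing <pre> tags.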
                markup = markup.replace(m, m.replace("\n", "\n "))
                markup = re.sub("<pre.*?>\n{0,}", "", markup)
                markup = re.sub("\W{0,}</pre>", "", markup)
            return markup
    
        def convert_li(self, markup):
            
            """ Substitutes <li> content to Wikipedia markup.
            """
            
            for li in re.findall("<li.*?>", markup):
                markup = re.sub(li, "\n* ", markup)
                markup = markup.replace("</li>", "")
            return markup
    
        def convert_table(self, markup):
            
            """ Substitutes <table> content to Wikipedia markup.
            """
            
            for table in re.findall(self.re["html-table"], markup):
                wiki = table
                wiki = re.sub(r"<table(.*?)>", "{|\\1", wiki)
                wiki = re.sub(r"<tr(.*?)>", "|-\\1", wiki)
                wiki = re.sub(r"<td(.*?)>", "|\\1|", wiki)
                wiki = wiki.replace("</td>", "\n")
                wiki = wiki.replace("</tr>", "\n")
                wiki = wiki.replace("</table>
    ", "\n|}") markup = markup.replace(table, wiki) return markup def parse_links(self, markup): """ Returns a list of internal Wikipedia links in the markup. # A Wikipedia link looks like: # [[List of operating systems#Embedded | List of embedded operating systems]] # It does not contain a colon, this indicates images, users, languages, etc. The return value is a list containing the first part of the link, without the anchor. """ links = [] m = re.findall(self.re["link"], markup) for link in m: # We don't like [[{{{1|Universe (disambiguation)}}}]] if link.find("{") >= 0: link = re.sub("\{{1,3}[0-9]{0,2}\|", "", link) link = link.replace("{", "") link = link.replace("}", "") link = link.split("|") link[0] = link[0].split("#") page = link[0][0].strip() #anchor = u"" #display = u"" #if len(link[0]) > 1: # anchor = link[0][1].strip() #if len(link) > 1: # display = link[1].strip() if not page in links: links.append(page) #links[page] = WikipediaLink(page, anchor, display) links.sort() return links def parse_images(self, markup, treshold=6): """ Returns a list of images found in the markup. An image has a pathname, a description in plain text and a list of properties Wikipedia uses to size and place images. # A Wikipedia image looks like: # [[Image:Columbia Supercomputer - NASA Advanced Supercomputing Facility.jpg|right|thumb| # The [[NASA]] [[Columbia (supercomputer)|Columbia Supercomputer]].]] # Parts are separated by "|". # The first part is the image file, the last part can be a description. # In between are display properties, like "right" or "thumb". """ images = [] m = re.findall(self.re["image"], markup) for p in m: p = self.parse_balanced_image(p) img = p.split("|") path = img[0].replace("[[Image:", "").strip() description = u"" links = {} properties = [] if len(img) > 1: img = "|".join(img[1:]) links = self.parse_links(img) properties = self.plain(img).split("|") description = u"" # Best guess: an image description is normally # longer than six characters, properties like # "thumb" and "right" are less than six characters. if len(properties[-1]) > treshold: description = properties[-1] properties = properties[:-1] img = WikipediaImage(path, description, links, properties) images.append(img) markup = markup.replace(p, "") return images, markup.strip() def parse_balanced_image(self, markup): """ Corrects Wikipedia image markup. Images have a description inside their link markup that can contain link markup itself, make sure the outer "[" and "]" brackets delimiting the image are balanced correctly (e.g. no [[ ]] ]]). Called from parse_images(). """ opened = 0 closed = 0 for i in range(len(markup)): if markup[i] == "[": opened += 1 if markup[i] == "]": closed += 1 if opened == closed: return markup[:i+1] return markup def parse_gallery_images(self, markup): """ Parses images from the section. Images inside tags do not have outer "[[" brackets. Add these and then parse again. """ gallery = re.search(self.re["gallery"], markup) if gallery: gallery = gallery.group(1) gallery = gallery.replace("Image:", "[[Image:") gallery = gallery.replace("\n", "]]\n") images, markup = self.parse_images(gallery) return images return [] def parse_paragraph(self, markup): """ Creates a list from lines of text in a paragraph. Each line of text is a new item in the list, except lists and preformatted chunks (
            <li> and <pre>),
            these are kept together as a single chunk.
            
            Lists are formatted using parse_paragraph_list().
            
            Empty lines are stripped from the output.
            Indentation (i.e. lines starting with ":") is ignored.
            
            Called from parse_paragraphs() method.
            
            """
            
            s = self.plain(markup)
            # Add an extra linebreak between the last list item
            # and the normal line that follows it, so they don't stick together, e.g.
            # **[[Alin Magic]], magic used in the videogame ''[[Rise of Nations: Rise of Legends]]''
            # In '''popular culture''':
            # * [[Magic (film)|''Magic'' (film)]], a 1978 film starring Anthony Hopkins and Ann-Margret
            s = re.sub(re.compile("\n([*#;].*?)\n([^*#?])", re.DOTALL), "\n\\1\n\n\\2", s)
            # This keeps list items together
            # even when separated by blank lines.
            s = re.sub("\n{2,3}([*#;])", "\n\\1", s)
            chunks = []
            ch = ""
            i = 1
            for chunk in s.split("\n"):
                if chunk.startswith(":"):
                    chunk = chunk.lstrip(":")
                if len(chunk.strip()) > 1:
                    # Leave out taxoboxes and infoboxes.
                    if not chunk.startswith("|"):
                        ch += chunk + "\n"
                if ch.strip() != "":
                    if not re.search("^[ *#;]", chunk):
                        ch = self.parse_paragraph_list(ch)
                        chunks.append(ch.rstrip())
                        ch = ""
    
            if ch.strip() != "":
                ch = self.parse_paragraph_list(ch)
                chunks.append(ch.strip())
                
            return chunks        
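        # A rough sketch of the chunking this produces (hypothetical input;
        # "page" is assumed to be an already constructed WikipediaPage):
        #
        #   page.parse_paragraph("Intro line.\n* one\n* two")
        #   # -> ["Intro line.", "* one\n* two"]
        #
        # The plain line becomes its own chunk, while the two list items
        # stay together as a single formatted chunk.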
        
        def parse_paragraph_list(self, markup, indent="\t"):
            
            """ Formats bullets and numbering of Wikipedia lists.
            
            List items are marked by "*", "#" or ";" at the start of a line.
            We treat ";" the same as "*",
            and replace "#" with real numbering (e.g. "2.").
            Sublists (e.g. *** and ###) get indented by tabs.
            
            Called from parse_paragraphs() method.
            
            """
    
            def lastleft(ch, str):
                n = 0
                while n < len(str) and str[n] == ch: n += 1
                return n        
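            # e.g. lastleft("#", "##item") -> 2, the length of the leading run;
            # tally[j] below holds the next number to assign at depth j.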
    
            tally = [1 for i in range(10)]
            chunks = markup.split("\n")
            for i in range(len(chunks)):
                if chunks[i].startswith("#"):
                    j = min(lastleft("#", chunks[i]), len(tally)-1)
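                    # Replace the run of "#" marks with "n." numbering at depth j.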
                    chunks[i] = indent*(j-1) + str(tally[j])+". " + chunks[i][j:]
                    chunks[i] = chunks[i].replace(".  ", ". ")
                    tally[j] += 1
                    # Reset the numbering of sublists.
                    for k in range(j+1, len(tally)): 
                        tally[k] = 1
                if chunks[i].startswith(";"):
                    chunks[i] = "*" + chunks[i][1:]
                if chunks[i].startswith("*"):
                    j = lastleft("*", chunks[i])  
                    chunks[i] = indent*(j-1) + "* " + chunks[i][j:]
                    chunks[i] = chunks[i].replace("*  ", "* ")
            
            return "\n".join(chunks)
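        # A small worked example (hypothetical markup, default indent="\t"):
        #
        #   page.parse_paragraph_list("# first\n# second\n## sub\n* item")
        #   # 1. first
        #   # 2. second
        #   # \t1. sub
        #   # * item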
        
        def parse_paragraph_heading_depth(self, markup):
            
            """ Returns the depth of a heading.
            
            The depth determines parent and child relations,
            which headings (and hence which paragraphs) are a child to a heading higher up.
            Returns 0 for <h1> =, 1 for <h2>
    ==, etc. Called from parse_paragraphs() method. """ return markup.count("=")/2 - 1 def connect_paragraph(self, paragraph, paragraphs): """ Create parent/child links to other paragraphs. The paragraphs parameters is a list of all the paragraphs parsed up till now. The parent is the previous paragraph whose depth is less. The parent's children include this paragraph. Called from parse_paragraphs() method. """ if paragraph.depth > 0: n = range(len(paragraphs)) n.reverse() for i in n: if paragraphs[i].depth == paragraph.depth-1: paragraph.parent = paragraphs[i] paragraphs[i].children.append(paragraph) break return paragraph def parse_paragraph_references(self, markup): """ Updates references with content from specific paragraphs. The "references", "notes", "external links" paragraphs are double-checked for references. Not all items in the list might have been referenced inside the article, or the item might contain more info than we initially parsed from it. Called from parse_paragraphs() method. """ for chunk in markup.split("\n"): # We already parsed this, it contains the self.ref mark. # See if we can strip more notes from it. m = re.search(self.ref+"\(([0-9]*?)\)", chunk) if m: chunk = chunk.strip("* ") chunk = chunk.replace(m.group(0), "") chunk = self.plain(chunk) i = int(m.group(1)) if chunk != "": self.references[i-1].note = chunk # If it's not a citation we don't have this reference yet. elif chunk.strip().startswith("*") \ and chunk.find("{{cite") < 0: chunk = chunk.strip("* ") chunk = self.plain(chunk) if chunk != "": r = WikipediaReference() r.note = chunk self.references.append(r) def parse_paragraphs(self, markup): """ Returns a list of paragraphs in the markup. A paragraph has a title and multiple lines of plain text. A paragraph might have parent and child paragraphs, denoting subtitles or bigger chapters. A paragraph might have links to additional articles. Formats numbered lists by replacing # by 1. Formats bulleted sublists like ** or *** with indentation. """ # Paragraphs to exclude. refs = ["references", "notes", "notes and references", "external links", "further reading"] exclude = ["see also", "media", "gallery", "related topics", "lists", "gallery", "images"] exclude.extend(refs) paragraphs = [] paragraph = WikipediaParagraph(self.title) paragraph_data = "" for chunk in markup.split("\n"): # Strip each line of whitespace, # unless it's a preformatted line (starts with a space). if not chunk.startswith(" "): chunk = chunk.strip() # A title wrapped in "=", "==", "==="... # denotes a new paragraphs section. if chunk.startswith("="): if paragraph.title.lower() in refs \ or (paragraph.parent and paragraph.parent.title.lower() in refs): self.parse_paragraph_references(paragraph_data) paragraph.extend(self.parse_paragraph(paragraph_data)) paragraphs.append(paragraph) # Initialise a new paragraph. # Create parent/child links to other paragraphs. title = chunk.strip().strip("=") title = self.plain(title) paragraph = WikipediaParagraph(title) paragraph.depth = self.parse_paragraph_heading_depth(chunk) if paragraph.title.lower() not in exclude: paragraph = self.connect_paragraph(paragraph, paragraphs) paragraph_data = "" # Underneath a title might be links to in-depth articles, # e.g. Main articles: Computer program and Computer programming # which in wiki markup would be {{main|Computer program|Computer programming}} # The second line corrects" {{Main|Credit (finance)}} or {{Main|Usury}}". 
elif re.search(re.compile("^{{main", re.I), chunk): paragraph.main = [link.strip("} ") for link in chunk.split("|")[1:]] paragraph.main = [re.sub(re.compile("}}.*?{{main", re.I), "", link) for link in paragraph.main] # At the bottom might be links to related articles, # e.g. See also: Abundance of the chemical elements # which in wiki markup would be {{see also|Abundance of the chemical elements}} elif re.search(re.compile("^{{see {0,1}also", re.I), chunk): paragraph.related = [link.strip("} ") for link in chunk.split("|")[1:]] # Accumulate the data in this paragraph, # we'll process it once a new paragraph starts. else: paragraph_data += chunk +"\n" # Append the last paragraph. if paragraph.title.lower() in refs \ or (paragraph.parent and paragraph.parent.title.lower() in refs): self.parse_paragraph_references(paragraph_data) paragraph.extend(self.parse_paragraph(paragraph_data)) paragraphs.append(paragraph) # The "See also" paragraph is an enumeration of links # which we already parsed so don't show them. # We also did references, and other paragraphs are not that relevant. paragraphs_exclude = [] for paragraph in paragraphs: if paragraph.title.lower() not in exclude \ and not (paragraph.parent and paragraph.parent.title.lower() in exclude): paragraphs_exclude.append(paragraph) if len(paragraphs_exclude) == 1 and \ len(paragraphs_exclude[0]) == 0: return [] return paragraphs_exclude def parse_table_row(self, markup, row): """ Parses a row of cells in a Wikipedia table. Cells in the row are separated by "||". A "!" indicates a row of heading columns. Each cell can contain properties before a "|", # e.g. align="right" | Cell 2 (right aligned). """ if row == None: row = WikipediaTableRow() markup = markup.replace("!!", "||") for cell in markup.lstrip("|!").split("||"): # The "|" after the properties can't be part of a link. i = cell.find("|") j = cell.find("[[") if i>0 and (j<0 or i 0: tables.append(table) return tables def parse_references(self, markup): """ Returns a list of references found in the markup. References appear inline as footnotes, http:// external links, or {{cite}} citations. We replace it with (1)-style footnotes. Additional references data is gathered in parse_paragraph_references() when we parse paragraphs. References can also appear in image descriptions, tables and taxoboxes, so they might not always pop up in a paragraph. The plain() method finally replaces (1) by [1]. """ references = [] # A Wikipedia reference note looks like: # In 1946, [[ENIAC]] consumed an estimated 174 kW. # By comparison, a typical personal computer may use around 400 W; # over four hundred times less. {{Ref harvard|kempf1961|Kempf 1961|a}} m = re.findall(self.re["reference"], markup) for reference in m: reference = re.sub(" {0,1}cite", "{{cite", reference) if not reference.strip().startswith("[http://") and \ not re.search("\{\{cite", reference): r = WikipediaReference() r.note = self.plain(re.sub("", "", reference)) if r.note != "": references.append(r) p = " "+self.ref+"("+str(len(references))+")" markup = markup.replace(reference, p, 1) else: # References containing a citation or url # are better handled by the next patterns. pass # A Wikipedia citation looks like: # {{cite journal # | last = Einstein # | first = Albert # | authorlink = Albert Einstein # | title = Sidelights on Relativity (Geometry and Experience) # | publisher = P. 
Dutton., Co # | date = 1923}} m = re.findall(self.re["citation"], markup) for citation in m: c = citation.replace("\n", "") r = WikipediaReference() for key in r.__dict__.keys(): value = re.search("\| {0,1}"+key+"(.*?)[\|}]", c) if value: value = value.group(1) value = value.replace("link", "") value = value.strip().strip(" =[]") value = self.plain(value) setattr(r, key, value) if r.first != "" and r.last != "": r.author = r.first + " " + r.last references.append(r) p = " "+self.ref+"("+str(len(references))+")" markup = markup.replace(citation, p, 1) # A Wikipedia embedded url looks like: # [http://www.pbs.org/wnet/hawking/html/home.html ''Stephen Hawking's Universe''] m = re.findall(self.re["url"], markup) for url in m: r = WikipediaReference() i = url.find(" ") if i > 0: r.url = url[:i].strip() r.note = self.plain(url[i:]) else: r.url = url.strip() references.append(r) p = r.note+" "+self.ref+"("+str(len(references))+")" markup = markup.replace("["+url+"]", p, 1) # Since we parsed all citations first and then all notes and urls, # the ordering will not be correct in the markup, # e.g. (1) (11) (12) (2) (3). sorted = [] m = re.findall(self.ref+"\(([0-9]*)\)", markup) for i in m: sorted.append(references[int(i)-1]) markup = markup.replace( self.ref+"("+i+")", self.ref+"**("+str(len(sorted))+")" ) markup = markup.replace(self.ref+"**", self.ref) for r in references: if r not in sorted: sorted.append(r) references = sorted return references, markup.strip() def parse_categories(self, markup): """ Returns a list of categories the page belongs to. # A Wikipedia category link looks like: # [[Category:Computing]] # This indicates the page is included in the given category. # If "Category" is preceded by ":" this indicates a link to a category. """ categories = [] m = re.findall(self.re["category"], markup) for category in m: category = category.split("|") page = category[0].strip() display = u"" if len(category) > 1: display = category[1].strip() #if not categories.has_key(page): # categories[page] = WikipediaLink(page, u"", display) if not page in categories: categories.append(page) return categories def parse_translations(self, markup): """ Returns a dictionary of translations for the page title. A Wikipedia language link looks like: [[af:Rekenaar]]. The parser will also fetch links like "user:" and "media:" but these are stripped against the dictionary of Wikipedia languages. You can get a translated page by searching Wikipedia with the appropriate language code and supplying the translated title as query. """ global languages translations = {} m = re.findall(self.re["translation"], markup) for language, translation in m: if language in languages: translations[language] = translation return translations def parse_disambiguation(self, markup): """ Gets the Wikipedia disambiguation page for this article. A Wikipedia disambiguation link refers to other pages with the same title but of smaller significance, e.g. {{dablink|For the IEEE magazine see [[Computer (magazine)]].}} """ m = re.search(self.re["disambiguation"], markup) if m: return self.parse_links(m.group(1)) else: return [] def parse_important(self, markup): """ Returns a list of words that appear in bold in the article. Things like table titles are not added to the list, these are probably bold because it makes the layout nice, not necessarily because they are important. 
""" important = [] table_titles = [table.title for table in self.tables] m = re.findall(self.re["bold"], markup) for bold in m: bold = self.plain(bold) if not bold in table_titles: important.append(bold.lower()) return important ### DRAWING UTILITIES ################################################################################ def is_preformatted(str): """ Determines if an item in a paragraph is preformatted. If all of the lines in the markup start with a " " this indicates preformatted text. Preformatted is usually used for programming code. """ for chunk in str.split("\n"): if not chunk.startswith(" "): return False return True def is_list(str): """ Determines if an item in a paragraph is a list. If all of the lines in the markup start with a "*" or "1." this indicates a list as parsed by parse_paragraphs(). It can be drawn with draw_list(). """ for chunk in str.split("\n"): chunk = chunk.replace("\t", "") if not chunk.lstrip().startswith("*") \ and not re.search(r"^([0-9]{1,3}\. )", chunk.lstrip()): return False return True def is_math(str): """ Determines if an item in a paragraph is a LaTeX math equation. Math equations are wrapped in tags. They can be drawn as an image using draw_math(). """ str = str.strip() if str.startswith("") and str.endswith(""): return True else: return False def draw_math(str, x, y, alpha=1.0): """ Uses mimetex to generate a GIF-image from the LaTeX equation. """ try: from web import _ctx except: pass str = re.sub("", "", str.strip()) img = mimetex.gif(str) w, h = _ctx.imagesize(img) _ctx.image(img, x, y, alpha=alpha) return w, h def textwidth(str): """textwidth() reports incorrectly when lineheight() is smaller than 1.0 """ try: from web import _ctx except: pass l = _ctx.lineheight() _ctx.lineheight(1) w = _ctx.textwidth(str) _ctx.lineheight(l) return w def draw_list(markup, x, y, w, padding=5, callback=None): """ Draws list markup with indentation in NodeBox. Draw list markup at x, y coordinates using indented bullets or numbers. The callback is a command that takes a str and an int. """ try: from web import _ctx except: pass i = 1 for chunk in markup.split("\n"): if callback != None: callback(chunk, i) m = re.search("^([0-9]{1,3}\. )", chunk.lstrip()) if m: indent = re.search("[0-9]", chunk).start()*padding*2 bullet = m.group(1) dx = textwidth("000.") chunk = chunk.lstrip(m.group(1)+"\t") if chunk.lstrip().startswith("*"): indent = chunk.find("*")*padding*2 bullet = u"•" dx = textwidth("*") chunk = chunk.lstrip("* \t") _ctx.text(bullet, x+indent, y) dx += padding + indent _ctx.text(chunk, x+dx, y, width=w-dx) y += _ctx.textheight(chunk, width=w-dx) y += _ctx.textheight(" ") * 0.25 i += 1 def draw_table(table, x, y, w, padding=5): """ This is a very poor algorithm to draw Wikipedia tables in NodeBox. """ try: from web import _ctx except: pass f = _ctx.fill() _ctx.stroke(f) h = _ctx.textheight(" ") + padding*2 row_y = y if table.title != "": _ctx.fill(f) _ctx.rect(x, row_y, w, h) _ctx.fill(1) _ctx.text(table.title, x+padding, row_y+_ctx.fontsize()+ padding) row_y += h # A table of flags marking how long a cell # from a previous row is still spanning in a column. rowspans = [1 for i in range(10)] previous_cell_w = 0 for row in table: cell_x = x # The width of a cell is the total table width # evenly divided by the number of cells. # Previous rows' cells still spanning will push cells # to the right and decrease their width. 
cell_w = 1.0 * w cell_w -= previous_cell_w * len([n for n in rowspans if n > 1]) cell_w /= len(row) # The height of each cell is the highest cell in the row. # The height depends on the amount of text in the cell. cell_h = 0 for cell in row: this_h = _ctx.textheight(cell, width=cell_w-padding*2) + padding*2 cell_h = max(cell_h, this_h) # Traverse each cell in this row. i = 0 for cell in row: # If a previous row's cell is still spanning, # push this cell to the right. if rowspans[i] > 1: rowspans[i] -= 1 cell_x += previous_cell_w i += 1 # Get the rowspan attribute for this cell. m = re.search("rowspan=\"(.*?)\"", cell.properties) if m: rowspan = int(m.group(1)) rowspans[i] = rowspan else: rowspan = 1 # Padded cell text. # Horizontal line above each cell. # Vertical line before each cell. _ctx.fill(f) _ctx.text(cell, cell_x+padding, row_y+_ctx.fontsize()+padding, cell_w-padding*2) _ctx.line(cell_x, row_y, cell_x+cell_w, row_y) if cell_x > x: _ctx.nofill() _ctx.line(cell_x, row_y, cell_x, row_y+cell_h) cell_x += cell_w i += 1 # Move to next row. row_y += cell_h previous_cell_w = cell_w # Table's bounding rectangle. _ctx.nofill() _ctx.rect(x, y, w, row_y-y) ### WIKIPEDIASEARCH ################################################################################## class WikipediaSearch(WikipediaPage, URLAccumulator): def _api_request(self, q, language="en"): url = "http://"+language+".wikipedia.org/w/api.php" url += "?action=query&redirects&format=xml&prop=revisions&rvprop=content&titles=" url += quote(q) return url def __init__(self, q, language="en", light=False, wait=10, asynchronous=False, cached=True, case_sensitive=False, full_strip=True): """ A download manager for Wikipedia pages. WikipediaSearch is a combination of URLAccumulator that handles asynchronous and cached web downloads and WikipediaPage that parses XML retrieved from the Wikipedia API. Retrieves the latest revision. Redirects are handled by the Wikipedia server. """ self._light = light self._full_strip = full_strip if cached: cache = "wikipedia" else: cache = None if not case_sensitive: q = str(q.lower()) q = q.replace(" ", "_") url = self._api_request(q, language) URLAccumulator.__init__(self, url, wait, asynchronous, cache, type=".xml", throttle=2) def load(self, data): dom = minidom.parseString(self.data) page = dom.getElementsByTagName("page")[0] title = page.getAttribute("title") try: rev = dom.getElementsByTagName("rev")[0] data = rev.childNodes[0].nodeValue.strip() except: if not self.error: self.error = WikipediaPageMissing() data = "" WikipediaPage.__init__(self, title, data, light=self._light, full_strip=self._full_strip) def search(q, language="en", light=False, wait=10, asynchronous=False, cached=True, case_sensitive=False, full_strip=True): return WikipediaSearch(q, language, light, wait, asynchronous, cached, case_sensitive, full_strip) ###################################################################################################### # Some interesting things... # Redirects are now handled by the Wikipedia server but for some reason I'm keeping this code around. # The superscript could be used to format references and footnotes. 
def is_redirect(page): m = re.search(r"#REDIRECT \[\[.*?\]\]", page) if m and len(m.group(0)) == len(page): return True else: return False def redirect(page): m = re.search(r"#REDIRECT \[\[(.*?)\]\]", page) if m: return m.group(1) else: return None def superscript(number): digits = [ u"\u2070", u"\u2071", u"\u2072", u"\u2073", u"\u2074", u"\u2075", u"\u2076", u"\u2077", u"\u2078", u"\u2079", ] s = u"" for digit in str(number): s += digits[int(digit)] return s ###################################################################################################### nodebox-web-1.9.4.6/yahoo.py000066400000000000000000000252611135274433600156110ustar00rootroot00000000000000### YAHOO ############################################################################################ # Code for querying Yahoo! for search terms, images, news and spelling. # Also contains the searchenginesort algorithm originally used in Prism for NodeBox. # Authors: Frederik De Bleser, Tom De Smedt. # Copyright (c) 2007 by Tom De Smedt. # See LICENSE.txt for details. import urllib import xml.dom.minidom from url import URLAccumulator, HTTP403Forbidden from html import replace_entities from cache import Cache def clear_cache(): Cache("yahoo").clear() ### YAHOO SETTINGS ################################################################################### YAHOO_ID = "Bsx0rSzV34HQ9sXprWCaAWCHCINnLFtRF_4wahO1tiVEPpFSltMdqkM1z6Xubg" ### YAHOO SERVICES ################################################################################### YAHOO_SEARCH = "search" YAHOO_IMAGES = "images" YAHOO_NEWS = "news" YAHOO_SPELLING = "spelling" ### YAHOOERROR ####################################################################################### class YahooError(Exception): def __str__(self): return str(self.__class__) class YahooLimitError(YahooError): # Daily limit was exceeded. def __str__(self): return str(self.__class__) ### YAHOO LICENSE #################################################################################### def license_key(id=None): global YAHOO_ID if id != None: YAHOO_ID = id return YAHOO_ID ### YAHOO UNICODE #################################################################################### def format_data(s): """ Yahoo library returns Unicode strings. """ return s.encode("utf-8") ### YAHOORESULT ###################################################################################### class YahooResult: """ Creates an item in a YahooSearch list object. """ def __init__(self): self.title = None self.url = None self.description = None self.type = None self.date = None self.width = None # images self.height = None # images self.source = None # news self.language = None # news def __repr__(self): s = format_data(self.url) return s ### YAHOORESULTS ##################################################################################### class YahooResults(list): """ Creates a list of results from a Yahoo query. The total number of available results is stored in the results property. Each item in the list is a YahooResult object. 
""" def __init__(self, q, data, service=YAHOO_SEARCH): self.query = q self.total = 0 if data == "": return dom = xml.dom.minidom.parseString(data) doc = dom.childNodes[0] self.total = int(doc.attributes["totalResultsAvailable"].value) for r in doc.getElementsByTagName('Result'): item = YahooResult() item.title = self._parse(r, 'Title') item.url = self._parse(r, 'Url') item.description = self._parse(r, 'Summary') if service == YAHOO_SEARCH: item.type = self._parse(r, 'MimeType') item.date = self._parse(r, 'ModificationDate') if service == YAHOO_IMAGES: item.type = self._parse(r, 'FileFormat') item.width = int(self._parse(r, 'Width')) item.height = int(self._parse(r, 'Height')) if service == YAHOO_NEWS: item.date = self._parse(r, 'ModificationDate') item.source = self._parse(r, 'NewsSourceUrl') item.language = self._parse(r, 'Language') self.append(item) def _parse(self, e, tag): """ Parses the text data from an XML element defined by tag. """ tags = e.getElementsByTagName(tag) children = tags[0].childNodes if len(children) != 1: return None assert children[0].nodeType == xml.dom.minidom.Element.TEXT_NODE s = children[0].nodeValue s = format_data(s) s = replace_entities(s) return s def __cmp__(self, other): """ Compares with another YahooSearch based on the number of results. """ if self.total > other.total: return 1 elif self.total < other.total: return -1 else: return 0 #### YAHOOSEARCH ##################################################################################### class YahooSearch(YahooResults, URLAccumulator): def __init__(self, q, start=1, count=10, service=YAHOO_SEARCH, context=None, wait=10, asynchronous=False, cached=True): """ Searches Yahoo for the given query. By default, return cached results whenever possible. Otherwise, go online and update the local cache. The number of results is limited to count and starts at the given index. The returned results depend on the service used: web pages, images, news, spelling suggestion or contextual links. """ self.query = q self.service = service if cached: cache = "yahoo" else: cache = None url = "http://search.yahooapis.com/" if service == YAHOO_SEARCH and context == None : url += "WebSearchService/V1/webSearch?" if service == YAHOO_SEARCH and context != None : url += "WebSearchService/V1/contextSearch?" if service == YAHOO_IMAGES : url += "ImageSearchService/V1/imageSearch?" if service == YAHOO_NEWS : url += "NewsSearchService/V1/newsSearch?" if service == YAHOO_SPELLING : url += "WebSearchService/V1/spellingSuggestion?" arg = urllib.urlencode((("appid", YAHOO_ID), ("query", q), ("start", start), ("results", count), ("context", unicode(context)))) url += arg URLAccumulator.__init__(self, url, wait, asynchronous, cache, ".xml") def load(self, data): if str(self.error.__class__) == str(HTTP403Forbidden().__class__): self.error = YahooLimitError() YahooResults.__init__(self, self.query, data, self.service) ###################################################################################################### def search(q, start=1, count=10, context=None, wait=10, asynchronous=False, cached=False): """ Returns a Yahoo web query formatted as a YahooSearch list object. """ service = YAHOO_SEARCH return YahooSearch(q, start, count, service, context, wait, asynchronous, cached) def search_images(q, start=1, count=10, wait=10, asynchronous=False, cached=False): """ Returns a Yahoo images query formatted as a YahooSearch list object. 
""" service = YAHOO_IMAGES return YahooSearch(q, start, count, service, None, wait, asynchronous, cached) def search_news(q, start=1, count=10, wait=10, asynchronous=False, cached=False): """ Returns a Yahoo news query formatted as a YahooSearch list object. """ service = YAHOO_NEWS return YahooSearch(q, start, count, service, None, wait, asynchronous, cached) #### YAHOOSPELLING ################################################################################### class YahooSpelling(YahooSearch): def __init__(self, q, wait, asynchronous, cached): service = YAHOO_SPELLING YahooSearch.__init__(self, q, 1, 1, service, None, wait, asynchronous, cached) def load(self, data): dom = xml.dom.minidom.parseString(data) doc = dom.childNodes[0] r = doc.getElementsByTagName('Result') if len(r) > 0: r = r[0].childNodes[0].nodeValue r = format_data(r) else: r = q self.append(r) def suggest_spelling(q, wait=10, asynchronous=False, cached=False): """ Returns list of suggested spelling corrections for the given query. """ return YahooSpelling(q, wait, asynchronous, cached) #### YAHOO SORT ###################################################################################### def sort(words, context="", strict=True, relative=True, service=YAHOO_SEARCH, wait=10, asynchronous=False, cached=False): """Performs a Yahoo sort on the given list. Sorts the items in the list according to the result count Yahoo yields on an item. Setting a context sorts the items according to their relation to this context; for example sorting [red, green, blue] by "love" yields red as the highest results, likely because red is the color commonly associated with love. """ results = [] for word in words: q = word + " " + context q.strip() if strict: q = "\""+q+"\"" r = YahooSearch(q, 1, 1, service, context, wait, asynchronous, cached) results.append(r) results.sort(YahooResults.__cmp__) results.reverse() if relative and len(results) > 0: sum = 0.000000000000000001 for r in results: sum += r.total for r in results: r.total /= float(sum) results = [(r.query, r.total) for r in results] return results ###################################################################################################### #r = search("nodebox", cached=False, start=5, count=5) #print r.total #for item in r: # print item #r = search_images("nodebox", cached=False, start=1) #print r.total #for item in r: # print item, item.width, "x", item.height #r = search_news("apple", cached=False, start=1, asynchronous=True) #import time #while not r.done: # print "waiting..." # time.sleep(0.1) #print r.total #for item in r: # print item, item.source, item.language #print suggest_spelling("amazoon") #results = sort(["green", "blue", "red"], "sky", strict=False, cached=True) #for word, count in results: # print word, count #ctx = ''' #The apple tree was perhaps the earliest tree to be cultivated, #and apples have remained an important food in all cooler climates. #To a greater degree than other tree fruit, except possibly citrus, #apples store for months while still retaining much of their nutritive value. #We are not looking for a company named Apple. #''' #r = search("apple", cached=False, start=1, context=ctx) #print r.total #for item in r: # print item.title