hachoir-regex-1.0.5/0000755000175000017500000000000011330155056013263 5ustar haypohaypohachoir-regex-1.0.5/hachoir_regex/0000755000175000017500000000000011330155056016072 5ustar haypohaypohachoir-regex-1.0.5/hachoir_regex/pattern.py0000644000175000017500000001165211251277274020137 0ustar haypohaypofrom hachoir_regex import RegexEmpty, RegexOr, parse, createString from hachoir_regex.tools import makePrintable class Pattern: """ Abstract class used to define a pattern used in pattern matching """ def __init__(self, user): self.user = user class StringPattern(Pattern): """ Static string pattern """ def __init__(self, text, user=None): Pattern.__init__(self, user) self.text = text def __str__(self): return makePrintable(self.text, 'ASCII', to_unicode=True) def __repr__(self): return "" % self class RegexPattern(Pattern): """ Regular expression pattern """ def __init__(self, regex, user=None): Pattern.__init__(self, user) self.regex = parse(regex) self._compiled_regex = None def __str__(self): return makePrintable(str(self.regex), 'ASCII', to_unicode=True) def __repr__(self): return "" % self def match(self, data): return self.compiled_regex.match(data) def _getCompiledRegex(self): if self._compiled_regex is None: self._compiled_regex = self.regex.compile(python=True) return self._compiled_regex compiled_regex = property(_getCompiledRegex) class PatternMatching: """ Fast pattern matching class: match multiple patterns at the same time. Create your patterns: >>> p=PatternMatching() >>> p.addString("a") >>> p.addString("b") >>> p.addRegex("[cd]e") Search patterns: >>> for item in p.search("a b ce"): ... print item ... 
(0, 1, ) (2, 3, ) (4, 6, ) """ def __init__(self): self.string_patterns = [] self.string_dict = {} self.regex_patterns = [] self._need_commit = True # Following attributes are generated by _commit() method self._regex = None self._compiled_regex = None self._max_length = None def commit(self): """ Generate whole regex merging all (string and regex) patterns """ if not self._need_commit: return self._need_commit = False length = 0 regex = None for item in self.string_patterns: if regex: regex |= createString(item.text) else: regex = createString(item.text) length = max(length, len(item.text)) for item in self.regex_patterns: if regex: regex |= item.regex else: regex = item.regex length = max(length, item.regex.maxLength()) if not regex: regex = RegexEmpty() self._regex = regex self._compiled_regex = regex.compile(python=True) self._max_length = length def addString(self, magic, user=None): item = StringPattern(magic, user) if item.text in self.string_dict: # Skip duplicates return self.string_patterns.append(item) self.string_dict[item.text] = item self._need_commit = True def addRegex(self, regex, user=None): item = RegexPattern(regex, user) if item.regex.maxLength() is None: raise ValueError("Regular expression with no maximum size has forbidden") self.regex_patterns.append(item) self._need_commit = True def getPattern(self, data): """ Get pattern item matching data. Raise KeyError if no pattern does match it. """ # Try in string patterns try: return self.string_dict[data] except KeyError: pass # Try in regex patterns for item in self.regex_patterns: if item.match(data): return item raise KeyError("Unable to get pattern item") def search(self, data): """ Search patterns in data. 
Return a generator of tuples: (start, end, item) """ if not self.max_length: # No pattern: returns nothing return for match in self.compiled_regex.finditer(data): item = self.getPattern(match.group(0)) yield (match.start(0), match.end(0), item) def __str__(self): return makePrintable(str(self.regex), 'ASCII', to_unicode=True) def _getAttribute(self, name): self.commit() return getattr(self, name) def _getRegex(self): return self._getAttribute("_regex") regex = property(_getRegex) def _getCompiledRegex(self): return self._getAttribute("_compiled_regex") compiled_regex = property(_getCompiledRegex) def _getMaxLength(self): return self._getAttribute("_max_length") max_length = property(_getMaxLength) if __name__ == "__main__": import doctest, sys failure, nb_test = doctest.testmod() if failure: sys.exit(1) hachoir-regex-1.0.5/hachoir_regex/tools.py0000644000175000017500000000475511251277274017630 0ustar haypohaypo# -*- coding: UTF-8 -*- import re regex_control_code = re.compile("([\x00-\x1f\x7f])") controlchars = tuple({ # Don't use "\0", because "\0"+"0"+"1" = "\001" = "\1" (1 character) # Same rease to not use octal syntax ("\1") ord("\n"): r"\n", ord("\r"): r"\r", ord("\t"): r"\t", ord("\a"): r"\a", ord("\b"): r"\b", }.get(code, '\\x%02x' % code) for code in xrange(128) ) def makePrintable(data, charset, quote=None, to_unicode=False, smart=True): r""" Prepare a string to make it printable in the specified charset. It escapes control characters. Characters with codes bigger than 127 are escaped if data type is 'str' or if charset is "ASCII". 
Examples with Unicode: >>> aged = unicode("âgé", "UTF-8") >>> repr(aged) # text type is 'unicode' "u'\\xe2g\\xe9'" >>> makePrintable("abc\0", "UTF-8") 'abc\\0' >>> makePrintable(aged, "latin1") '\xe2g\xe9' >>> makePrintable(aged, "latin1", quote='"') '"\xe2g\xe9"' Examples with string encoded in latin1: >>> aged_latin = unicode("âgé", "UTF-8").encode("latin1") >>> repr(aged_latin) # text type is 'str' "'\\xe2g\\xe9'" >>> makePrintable(aged_latin, "latin1") '\\xe2g\\xe9' >>> makePrintable("", "latin1") '' >>> makePrintable("a", "latin1", quote='"') '"a"' >>> makePrintable("", "latin1", quote='"') '(empty)' >>> makePrintable("abc", "latin1", quote="'") "'abc'" Control codes: >>> makePrintable("\0\x03\x0a\x10 \x7f", "latin1") '\\0\\3\\n\\x10 \\x7f' Quote character may also be escaped (only ' and "): >>> print makePrintable("a\"b", "latin-1", quote='"') "a\"b" >>> print makePrintable("a\"b", "latin-1", quote="'") 'a"b' >>> print makePrintable("a'b", "latin-1", quote="'") 'a\'b' """ if data: if not isinstance(data, unicode): data = unicode(data, "ISO-8859-1") charset = "ASCII" data = regex_control_code.sub( lambda regs: controlchars[ord(regs.group(1))], data) if quote: if quote in "\"'": data = data.replace(quote, '\\' + quote) data = ''.join((quote, data, quote)) elif quote: data = "(empty)" data = data.encode(charset, "backslashreplace") if smart: # Replace \x00\x01 by \0\1 data = re.sub(r"\\x0([0-7])(?=[^0-7]|$)", r"\\\1", data) if to_unicode: data = unicode(data, charset) return data hachoir-regex-1.0.5/hachoir_regex/compatibility.py0000644000175000017500000000160611251277274021331 0ustar haypohaypo""" Compatibility functions for old version of Python. any() function ============== any() returns True if at least one items is True, or False otherwise. >>> any([False, True]) True >>> any([True, True]) True >>> any([False, False]) False all() function ============== all() returns True if all items are True, or False otherwise. 
This function is just apply binary and operator (&) on all values. >>> all([True, True]) True >>> all([False, True]) False >>> all([False, False]) False """ import operator # --- any() from Python 2.5 --- try: from __builtin__ import any except ImportError: def any(items): for item in items: if item: return True return False # ---all() from Python 2.5 --- try: from __builtin__ import all except ImportError: def all(items): return reduce(operator.__and__, items) __all__ = ("any", "all") hachoir-regex-1.0.5/hachoir_regex/__init__.py0000644000175000017500000000047311251277274020220 0ustar haypohaypofrom hachoir_regex.regex import (RegexEmpty, RegexString, createString, RegexRangeItem, RegexRangeCharacter, RegexRange, createRange, RegexAnd, RegexOr, RegexRepeat, RegexDot, RegexStart, RegexEnd, RegexWord) from hachoir_regex.parser import parse from hachoir_regex.pattern import PatternMatching hachoir-regex-1.0.5/hachoir_regex/parser.py0000644000175000017500000001376211320224754017752 0ustar haypohaypo""" Parse string to create Regex object. TODO: - Support \: \001, \x00, \0, \ \[, \(, \{, etc. - Support Python extensions: (?:...), (?P...), etc. - Support \<, \>, \s, \S, \w, \W, \Z <=> $, \d, \D, \A <=> ^, \b, \B, [[:space:]], etc. 
""" from hachoir_regex import (RegexString, RegexEmpty, RegexRepeat, RegexDot, RegexWord, RegexStart, RegexEnd, RegexRange, RegexRangeItem, RegexRangeCharacter) import re REGEX_COMMAND_CHARACTERS = '.^$[](){}|+?*\\' def parseRange(text, start): r""" >>> parseRange('[a]b', 1) (, 3) >>> parseRange('[a-z]b', 1) (, 5) >>> parseRange('[^a-z-]b', 1) (, 7) >>> parseRange('[^]-]b', 1) (, 5) >>> parseRange(r'[\]abc]', 1) (, 7) >>> parseRange(r'[a\-x]', 1) (, 6) """ index = start char_range = [] exclude = False if text[index] == '^': exclude = True index += 1 if text[index] == ']': char_range.append(RegexRangeCharacter(']')) index += 1 while index < len(text) and text[index] != ']': if index+1 < len(text) \ and text[index] == '\\': char_range.append(RegexRangeCharacter(text[index+1])) index += 2 elif index+1 < len(text) \ and text[index] == '-' and text[index+1] == ']': break elif index+3 < len(text) \ and text[index+1] == '-' \ and text[index+2] != ']': char_range.append(RegexRangeItem(ord(text[index]), ord(text[index+2]))) index += 3 else: char_range.append(RegexRangeCharacter(text[index])) index += 1 if index < len(text) and text[index] == '-': char_range.append(RegexRangeCharacter('-')) index += 1 if index == len(text) or text[index] != ']': raise SyntaxError('Invalid range: %s' % text[start-1:index]) return RegexRange(char_range, exclude), index+1 def parseOr(text, start): """ >>> parseOr('(a)', 1) (, 3) >>> parseOr('(a|c)', 1) (, 5) >>> parseOr(' (a|[bc]|d)', 2) (, 11) """ index = start # (?:...): Skip Python prefix '?:' if text[index:index+2] == '?:': index += 2 if text[index] == '?': raise NotImplementedError("Doesn't support Python extension (?...)") regex = None while True: new_regex, index = _parse(text, index, "|)") if regex: regex = regex | new_regex else: regex = new_regex if len(text) <= index: raise SyntaxError('Missing closing parenthesis') if text[index] == ')': break index += 1 index += 1 if regex is None: regex = RegexEmpty() return regex, index 
REPEAT_REGEX = re.compile("([0-9]+)(,[0-9]*)?}") def parseRepeat(text, start): """ >>> parseRepeat('a{0,1}b', 2) (0, 1, 6) >>> parseRepeat('a{12}', 2) (12, 12, 5) """ match = REPEAT_REGEX.match(text, start) if not match: raise SyntaxError('Unable to parse repetition '+text[start:]) rmin = int(match.group(1)) if match.group(2): text = match.group(2)[1:] if text: rmax = int(text) else: rmax = None else: rmax = rmin return (rmin, rmax, match.end(0)) CHAR_TO_FUNC = {'[': parseRange, '(': parseOr} CHAR_TO_CLASS = {'.': RegexDot, '^': RegexStart, '$': RegexEnd} CHAR_TO_REPEAT = {'*': (0, None), '?': (0, 1), '+': (1, None)} def _parse(text, start=0, until=None): if len(text) == start: return RegexEmpty(), 0 index = start regex = RegexEmpty() last = None done = False while index < len(text): char = text[index] if until and char in until: done = True break if char in REGEX_COMMAND_CHARACTERS: if char in CHAR_TO_FUNC: new_regex, index = CHAR_TO_FUNC[char] (text, index+1) elif char in CHAR_TO_CLASS: new_regex = CHAR_TO_CLASS[char]() index += 1 elif char == '{': rmin, rmax, index = parseRepeat(text, index+1) new_regex = RegexRepeat(last, rmin, rmax) last = None elif char in CHAR_TO_REPEAT: rmin, rmax = CHAR_TO_REPEAT[char] if last is None: raise SyntaxError('Repetition character (%s) without previous expression' % text[index]) new_regex = RegexRepeat(last, rmin, rmax) last = None index += 1 elif char == "\\": index += 1 if index == len(text): raise SyntaxError("Antislash (\\) without escaped character") char = text[index] if char == 'b': new_regex = RegexWord() else: if not(char in REGEX_COMMAND_CHARACTERS or char in " '"): raise SyntaxError("Operator '\\%s' is not supported" % char) new_regex = RegexString(char) index += 1 else: raise NotImplementedError("Operator '%s' is not supported" % char) if last: regex = regex + last last = new_regex else: subtext = text[index] index += 1 if last: regex = regex + last last = RegexString(subtext) if last: regex = regex + last return 
regex, index def parse(text): r""" >>> parse('') >>> parse('abc') >>> parse("chats?") >>> parse('[bc]d') >>> parse("\\.") """ regex, index = _parse(text) assert index == len(text) return regex if __name__ == "__main__": import doctest doctest.testmod() hachoir-regex-1.0.5/hachoir_regex/version.py0000644000175000017500000000021211330153327020123 0ustar haypohaypo__version__ = "1.0.5" PACKAGE = "hachoir-regex" WEBSITE = "http://bitbucket.org/haypo/hachoir/wiki/hachoir-regex" LICENSE = 'GNU GPL v2' hachoir-regex-1.0.5/hachoir_regex/regex.py0000644000175000017500000006260511320224127017562 0ustar haypohaypo# -*- coding: UTF-8 -*- """ Object to manage regular expressions, try to optimize the result: - '(a|b)' => '[ab]' - '(color red|color blue)' => 'color (red|blue)' - '([ab]|c)' => '[abc]' - 'ab' + 'cd' => 'abcd' (one long string) - [a-z]|[b] => [a-z] - [a-c]|[a-e] => [a-z] - [a-c]|[d] => [a-d] - [a-c]|[d-f] => [a-f] Operation: - str(): convert to string - repr(): debug string - a & b: concatenation, eg. "big " & "car" => "big car" - a + b: alias to a & b - a | b: a or b, eg. "dog" | "cat" => "dog|cat" - minLength(): minimum length of matching pattern, "(cat|horse)".minLength() => 3 - maxLength(): maximum length of matching pattern, "(cat|horse)".maxLength() => 5 Utilities: - createString(): create a regex matching a string - createRange(): create a regex matching character ranges TODO: - Support Unicode regex (avoid mixing str and unicode types) - createString("__tax") | parse("__[12]") => group '__' - Make sure that all RegexXXX() classes are inmutable - Use singleton for dot, start and end See also CPAN Regexp::Assemble (Perl module): http://search.cpan.org/~dland/Regexp-Assemble-0.28/Assemble.pm """ from hachoir_regex.compatibility import all, any from hachoir_regex.tools import makePrintable import re import itertools import operator def matchSingleValue(regex): """ Regex only match one exact string. 
>>> matchSingleValue(RegexEmpty()) True >>> matchSingleValue(createString("abc")) True >>> matchSingleValue(createRange("a", "b")) False >>> matchSingleValue(createRange("a")) True >>> matchSingleValue(RegexAnd((RegexStart(), createString("abc")))) True """ cls = regex.__class__ if cls in (RegexEmpty, RegexString, RegexStart, RegexEnd): return True if cls == RegexAnd: return all( matchSingleValue(item) for item in regex ) if cls == RegexRange: return len(regex.ranges) == 1 and len(regex.ranges[0]) == 1 return False def escapeRegex(text): """ Escape string to use it in a regular expression: prefix special characters « ^.+*?{}[]|()\$ » by an antislash. """ return re.sub(r"([][^.+*?{}|()\\$])", r"\\\1", text) def _join(func, regex_list): if not isinstance(regex_list, (tuple, list)): regex_list = list(regex_list) if len(regex_list) == 0: return RegexEmpty() regex = regex_list[0] for item in regex_list[1:]: regex = func(regex, item) return regex def createString(text): """ >>> createString('') >>> createString('abc') """ if text: return RegexString(text) else: return RegexEmpty() def createRange(*text, **kw): """ Create a regex range using character list. >>> createRange("a", "d", "b") >>> createRange("-", "9", "4", "3", "0") """ ranges = ( RegexRangeCharacter(item) for item in text ) return RegexRange(ranges, kw.get('exclude', False)) class Regex: """ Abstract class defining a regular expression atom """ def minLength(self): """ Maximum length in characters of the regex. Returns None if there is no limit. """ raise NotImplementedError() def maxLength(self): """ Maximum length in characters of the regex. Returns None if there is no limit. 
""" return self.minLength() def __str__(self, **kw): if not hasattr(self, "_str_value"): self._str_value = {} key = kw.get('python', False) if key not in self._str_value: self._str_value[key] = self._str(**kw) return self._str_value[key] def _str(self, **kw): raise NotImplementedError() def __repr__(self, **kw): regex = self.__str__(**kw) regex = makePrintable(regex, 'ASCII', to_unicode=True) return "<%s '%s'>" % ( self.__class__.__name__, regex) def __contains__(self, item): raise NotImplementedError() def match(self, other): """ Guess if self may matchs regex. May returns False even if self does match regex. """ if self == other: return True return self._match(other) def _match(self, other): """ Does regex match other regex? Eg. "." matchs "0" or "[a-z]" but "0" doesn't match ".". This function is used by match() which already check regex identity. """ return False def _and(self, regex): """ Create new optimized version of a+b. Returns None if there is no interesting optimization. """ return None def __and__(self, regex): """ Create new optimized version of a & b. Returns None if there is no interesting optimization. >>> RegexEmpty() & RegexString('a') """ if regex.__class__ == RegexEmpty: return self new_regex = self._and(regex) if new_regex: return new_regex else: return RegexAnd( (self, regex) ) def __add__(self, regex): return self.__and__(regex) def or_(self, other): """ Create new optimized version of a|b. Returns None if there is no interesting optimization. 
""" # (a|a) => a if self == other: return self # a matchs b => a if self._match(other): return self # b matchs a => b if other._match(self): return other # Try to optimize (a|b) if self.__class__ != other.__class__: new_regex = self._or_(other, False) if new_regex: return new_regex # Try to optimize (b|a) new_regex = other._or_(self, True) if new_regex: return new_regex return None else: return self._or_(other, False) def _or_(self, other, reverse): """ Try to create optimized version of self|other if reverse if False, or of other|self if reverse if True. """ return None def __or__(self, other): """ Public method of OR operator: a|b. It call or_() internal method. If or_() returns None: RegexOr object is used (and otherwise, use or_() result). """ # Try to optimize (a|b) new_regex = self.or_(other) if new_regex: return new_regex # Else use (a|b) return RegexOr( (self, other) ) def __eq__(self, regex): if self.__class__ != regex.__class__: return False return self._eq(regex) def _eq(self, other): """ Check if two objects of the same class are equals """ raise NotImplementedError("Class %s has no method _eq()" % self.__class__.__name__) def compile(self, **kw): return re.compile(self.__str__(**kw)) def findPrefix(self, regex): """ Try to create a common prefix between two regex. Eg. "abc" and "abd" => "ab" Return None if no prefix can be found. 
""" return None def __iter__(self): raise NotImplementedError() class RegexEmpty(Regex): def minLength(self): return 0 def _str(self, **kw): return '' def _and(self, other): return other def _eq(self, other): return True class RegexWord(RegexEmpty): def _and(self, other): if other.__class__ == RegexWord: return self return None def _str(self, **kw): return r'\b' class RegexStart(RegexEmpty): def _and(self, other): if other.__class__ == RegexStart: return self return None def _str(self, **kw): return '^' class RegexEnd(RegexStart): def _and(self, other): if other.__class__ == RegexEnd: return self return None def _str(self, **kw): return '$' class RegexDot(Regex): def minLength(self): return 1 def _str(self, **kw): return '.' def _match(self, other): if other.__class__ == RegexRange: return True if other.__class__ == RegexString and len(other.text) == 1: return True return False def _eq(self, other): return True class RegexString(Regex): def __init__(self, text=""): assert isinstance(text, str) self.text = text assert 1 <= len(self.text) def minLength(self): return len(self.text) def _and(self, regex): """ >>> RegexString('a') + RegexString('b') """ if regex.__class__ == RegexString: return RegexString(self.text + regex.text) return None def _str(self, **kw): return escapeRegex(self.text) def findPrefix(self, regex): """ Try to find a common prefix of two string regex, returns: - None if there is no common prefix - (prefix, regexa, regexb) otherwise => prefix + (regexa|regexb) >>> RegexString('color red').findPrefix(RegexString('color blue')) (, , ) """ if regex.__class__ != RegexString: return None texta = self.text textb = regex.text # '(a|b)' => '[ab]' if len(texta) == len(textb) == 1: return (createRange(texta, textb), RegexEmpty(), RegexEmpty()) # '(text abc|text def)' => 'text (abc|def)' common = None for length in xrange(1, min(len(texta),len(textb))+1): if textb.startswith(texta[:length]): common = length else: break if not common: return None return 
(RegexString(texta[:common]), createString(texta[common:]), createString(textb[common:])) def _or_(self, other, reverse): """ Remove duplicate: >>> RegexString("color") | RegexString("color") Group prefix: >>> RegexString("color red") | RegexString("color blue") >>> RegexString("color red") | RegexString("color") """ # Don't know any other optimization for str|other if other.__class__ != RegexString: return None # Find common prefix common = self.findPrefix(other) if common: if not reverse: regex = common[1] | common[2] else: regex = common[2] | common[1] return common[0] + regex return None def _eq(self, other): return self.text == other.text class RegexRangeItem: def __init__(self, cmin, cmax=None): try: self.cmin = cmin if cmax is not None: self.cmax = cmax else: self.cmax = cmin except TypeError: raise TypeError("RegexRangeItem: two characters expected (%s, %s) found" % (type(cmin), type(cmax))) if self.cmax < self.cmin: raise TypeError("RegexRangeItem: minimum (%u) is bigger than maximum (%u)" % (self.cmin, self.cmax)) def __len__(self): return (self.cmax - self.cmin + 1) def __contains__(self, value): assert issubclass(value.__class__, RegexRangeItem) return (self.cmin <= value.cmin) and (value.cmax <= self.cmax) def __str__(self, **kw): cmin = chr(self.cmin) if self.cmin != self.cmax: cmax = chr(self.cmax) if (self.cmin+1) == self.cmax: return "%s%s" % (cmin, cmax) else: return "%s-%s" % (cmin, cmax) else: return cmin def __repr__(self): return "" % (self.cmin, self.cmax) class RegexRangeCharacter(RegexRangeItem): def __init__(self, char): RegexRangeItem.__init__(self, ord(char), ord(char)) class RegexRange(Regex): def __init__(self, ranges, exclude=False, optimize=True): if optimize: self.ranges = [] for item in ranges: RegexRange.rangeAdd(self.ranges, item) self.ranges.sort(key=lambda item: item.cmin) else: self.ranges = tuple(ranges) self.exclude = exclude @staticmethod def rangeAdd(ranges, itemb): """ Add a value in a RegexRangeItem() list: remove 
duplicates and merge ranges when it's possible. """ new = None for index, itema in enumerate(ranges): if itema in itemb: # [b] + [a-c] => [a-c] new = itemb break elif itemb in itema: # [a-c] + [b] => [a-c] return elif (itemb.cmax+1) == itema.cmin: # [d-f] + [a-c] => [a-f] new = RegexRangeItem(itemb.cmin, itema.cmax) break elif (itema.cmax+1) == itemb.cmin: # [a-c] + [d-f] => [a-f] new = RegexRangeItem(itema.cmin, itemb.cmax) break if new: del ranges[index] RegexRange.rangeAdd(ranges, new) return else: ranges.append(itemb) def minLength(self): return 1 def _match(self, other): """ >>> createRange("a") | createRange("b") >>> createRange("a", "b", exclude=True) | createRange("a", "c", exclude=True) """ if not self.exclude and other.__class__ == RegexString and len(other.text) == 1: branges = (RegexRangeCharacter(other.text),) elif other.__class__ == RegexRange and self.exclude == other.exclude: branges = other.ranges else: return None for itemb in branges: if not any( itemb in itema for itema in self.ranges ): return False return True def _or_(self, other, reverse): """ >>> createRange("a") | createRange("b") >>> createRange("a", "b", exclude=True) | createRange("a", "c", exclude=True) """ if not self.exclude and other.__class__ == RegexString and len(other.text) == 1: branges = (RegexRangeCharacter(other.text),) elif other.__class__ == RegexRange and self.exclude == other.exclude: branges = other.ranges else: return None ranges = list(self.ranges) for itemb in branges: RegexRange.rangeAdd(ranges, itemb) return RegexRange(ranges, self.exclude, optimize=False) def _str(self, **kw): content = [str(item) for item in self.ranges] if "-" in content: content.remove("-") suffix = "-" else: suffix = "" if "]" in content: content.remove("]") prefix = "]" else: prefix = "" text = prefix + (''.join(content)) + suffix if self.exclude: return "[^%s]" % text else: return "[%s]" % text def _eq(self, other): if self.exclude != other.exclude: return False return self.ranges == 
other.ranges class RegexAnd(Regex): def __init__(self, items): self.content = list(items) assert 2 <= len(self.content) def _minmaxLength(self, lengths): total = 0 for length in lengths: if length is None: return None total += length return total def minLength(self): """ >>> regex=((RegexString('a') | RegexString('bcd')) + RegexString('z')) >>> regex.minLength() 2 """ return self._minmaxLength( regex.minLength() for regex in self.content ) def maxLength(self): """ >>> regex=RegexOr((RegexString('a'), RegexString('bcd'))) >>> RegexAnd((regex, RegexString('z'))).maxLength() 4 """ return self._minmaxLength( regex.maxLength() for regex in self.content ) def _or_(self, other, reverse): if other.__class__ == RegexString: contentb = [other] elif other.__class__ == RegexAnd: contentb = other.content else: return None contenta = self.content if reverse: contenta, contentb = contentb, contenta # Find common prefix # eg. (ab|ac) => a(b|c) and (abc|abd) => ab(c|d) index = 0 last_index = min(len(contenta), len(contentb)) while index < last_index and contenta[index] == contentb[index]: index += 1 if index: regex = RegexAnd.join(contenta[index:]) | RegexAnd.join(contentb[index:]) return RegexAnd.join(contenta[:index]) + regex # Find common prefix: (abc|aef) => a(bc|ef) common = contenta[0].findPrefix(contentb[0]) if common: regexa = common[1] & RegexAnd.join( contenta[1:] ) regexb = common[2] & RegexAnd.join( contentb[1:] ) regex = (regexa | regexb) if matchSingleValue(common[0]) or matchSingleValue(regex): return common[0] + regex return None def _and(self, regex): """ >>> RegexDot() + RegexDot() >>> RegexDot() + RegexString('a') + RegexString('b') """ if regex.__class__ == RegexAnd: total = self for item in regex.content: total = total + item return total new_item = self.content[-1]._and(regex) if new_item: self.content[-1] = new_item return self return RegexAnd( self.content + [regex] ) def _str(self, **kw): return ''.join( item.__str__(**kw) for item in self.content ) 
@classmethod def join(cls, regex): """ >>> RegexAnd.join( (RegexString('Big '), RegexString('fish')) ) """ return _join(operator.__and__, regex) def __iter__(self): return iter(self.content) def _eq(self, other): if len(self.content) != len(other.content): return False return all( item[0] == item[1] for item in itertools.izip(self.content, other.content) ) class RegexOr(Regex): def __init__(self, items, optimize=True): if optimize: self.content = [] for item in items: if item in self: continue self.content.append(item) else: self.content = tuple(items) assert 2 <= len(self.content) def __contains__(self, regex): for item in self.content: if item == regex: return True return False def _or_(self, other, reverse): """ >>> (RegexString("abc") | RegexString("123")) | (RegexString("plop") | RegexString("456")) >>> RegexString("mouse") | createRange('a') | RegexString("2006") | createRange('z') """ if other.__class__ == RegexOr: total = self for item in other.content: total = total | item return total for index, item in enumerate(self.content): new_item = item.or_(other) if new_item: content = list(self.content) content = content[:index] + [new_item] + content[index+1:] return RegexOr(content, optimize=False) if not reverse: content = list(self.content) + [other] else: content = [other] + list(self.content) return RegexOr(content, optimize=False) def _str(self, **kw): content = '|'.join( item.__str__(**kw) for item in self.content ) if kw.get('python', False): return "(?:%s)" % content else: return "(%s)" % content def _minmaxLength(self, lengths, func): value = None for length in lengths: if length is None: return None if value is None: value = length else: value = func(value, length) return value def minLength(self): lengths = ( regex.minLength() for regex in self.content ) return self._minmaxLength(lengths, min) def maxLength(self): lengths = ( regex.maxLength() for regex in self.content ) return self._minmaxLength(lengths, max) @classmethod def join(cls, regex): """ 
>>> RegexOr.join( (RegexString('a'), RegexString('b'), RegexString('c')) ) """ return _join(operator.__or__, regex) def __iter__(self): return iter(self.content) def _eq(self, other): if len(self.content) != len(other.content): return False return all( item[0] == item[1] for item in itertools.izip(self.content, other.content) ) def optimizeRepeatOr(rmin, rmax, regex): # Fix rmin/rmax for item in regex: cls = item.__class__ if cls == RegexEmpty: # (a|b|){x,y} => (a|b){0,y} rmin = 0 elif cls == RegexRepeat: # (a{0,n}|b){x,y} => (a{1,n}|b){0,y} if item.min == 0 and rmin == 1: rmin = 0 # Create new (optimized) RegexOr expression content = [] for item in regex: cls = item.__class__ if cls == RegexEmpty: # (a|){x,y} => a{0,y} continue if cls == RegexRepeat: if item.min == 0: if rmin in (0, 1): if rmax == item.max == None: # (a*|b){x,} => (a|b){x,} item = item.regex else: # (a{0,p}|b){x,} => (a{1,p}|b){x,} item = RegexRepeat(item.regex, 1, item.max, optimize=False) elif item.min == 1: if rmax == item.max == None: # (a+|b){x,} => (a|b){x,} item = item.regex else: if rmax == item.max == None: # (a{n,}|b){x,} => (a{n}|b){x,} item = RegexRepeat(item.regex, item.min, item.min, optimize=False) content.append(item) regex = RegexOr.join(content) return (rmin, rmax, regex) class RegexRepeat(Regex): """ >>> a=createString('a') >>> RegexRepeat(a, 0, None) >>> RegexRepeat(a, 1, None) >>> RegexRepeat(a, 0, 1) >>> RegexRepeat(a, 0, 1) >>> RegexRepeat(a, 1, 3) """ def __init__(self, regex, rmin, rmax, optimize=True): # Optimisations if optimize: cls = regex.__class__ if cls == RegexRepeat: # (a{n,p}){x,y) => a{n*x,p*y} if not (rmin == 0 and rmax == 1): rmin *= regex.min if regex.max and rmax: rmax *= regex.max else: rmax = None regex = regex.regex elif cls == RegexOr: rmin, rmax, regex = optimizeRepeatOr(rmin, rmax, regex) # Store attributes self.regex = regex self.min = rmin self.max = rmax # Post-conditions assert 0 <= rmin if self.max is not None: if self.max < self.min: raise 
ValueError("RegexRepeat: minimum (%s) is bigger than maximum (%s)!" % (self.min, self.max)) if (self.max == 0) \ or (self.min == self.max == 1): raise ValueError("RegexRepeat: invalid values (min=%s, max=%s)!" % (self.min, self.max)) def minLength(self): """ >>> r=RegexRepeat(createString("abc") | createString("01"), 1, 3) >>> r.minLength(), r.maxLength() (2, 9) >>> r=RegexRepeat(createString("abc") | createString("01"), 4, None) >>> r.minLength(), r.maxLength() (8, None) """ if self.min is not None: return self.regex.minLength() * self.min else: return None def maxLength(self): if self.max is not None: return self.regex.maxLength() * self.max else: return None def _str(self, **kw): text = str(self.regex) if self.regex.__class__ == RegexAnd \ or (self.regex.__class__ == RegexString and 1 < len(self.regex.text)): text = "(%s)" % text if self.min == 0 and self.max == 1: return "%s?" % text if self.min == self.max: return "%s{%u}" % (text, self.min) if self.max is None: if self.min == 0: return "%s*" % text elif self.min == 1: return "%s+" % text else: return "%s{%u,}" % (text, self.min) return "%s{%u,%u}" % (text, self.min, self.max) def _eq(self, other): if self.min != other.min: return False if self.max != other.max: return False return (self.regex == other.regex) if __name__ == "__main__": import doctest doctest.testmod() hachoir-regex-1.0.5/regex.rst0000644000175000017500000000720211251277274015141 0ustar haypohaypoCreate regular expressions ========================== There is two ways to create regular expressions: use string or directly use the API. Atom classes: * RegexEmpty: empty regex (match nothing) * RegexStart, RegexEnd, RegexDot: symbols ^, $ and . * RegexString * RegexRange: character range like [a-z] or [^0-9] * RegexAnd * RegexOr * RegexRepeat All classes are based on Regex class. 
Create regex with string ------------------------ >>> from hachoir_regex import parse >>> parse('') >>> parse('abc') >>> parse('[bc]d') >>> parse('a(b|[cd]|(e|f))g') >>> parse('([a-z]|[b-])') >>> parse('^^..$$') >>> parse('chats?') >>> parse(' +abc') Create regex with the API ------------------------- >>> from hachoir_regex import createString, createRange >>> createString('') >>> createString('abc') >>> createRange('a', 'b', 'c') >>> createRange('a', 'b', 'c', exclude=True) Manipulate regular expressions ============================== Convert to string: >>> from hachoir_regex import createRange, createString >>> str(createString('abc')) 'abc' >>> repr(createString('abc')) "" Operators "and" and "or": >>> createString("bike") & createString("motor") >>> createString("bike") | createString("motor") You can also use the "+" operator; it is just an alias for a & b: >>> createString("big ") + createString("bike") Compute the minimum/maximum length of the matched pattern: >>> r=parse('(cat|horse)') >>> r.minLength(), r.maxLength() (3, 5) Optimizations ============= The library includes many optimizations to keep expressions small and fast. Group prefix: >>> createString("blue") | createString("brown") >>> createString("moto") | parse("mot.") >>> parse("(ma|mb|mc)") >>> parse("(maa|mbb|mcc)") Merge ranges: >>> from hachoir_regex import createRange >>> regex = createString("1") | createString("3"); regex >>> regex = regex | createRange("2"); regex >>> regex = regex | createString("0"); regex >>> regex = regex | createRange("5", "6"); regex >>> regex = regex | createRange("4"); regex PatternMatching class ===================== Use PatternMatching if you would like to find many strings or regexes in a string. Use addString() and addRegex() to add your patterns. >>> from hachoir_regex import PatternMatching >>> p = PatternMatching() >>> p.addString("a") >>> p.addString("b") >>> p.addRegex("[cd]") And then use search() to find all patterns: >>> for start, end, item in p.search("a b c d"): ... 
print "%s..%s: %s" % (start, end, item) ... 0..1: a 2..3: b 4..5: [cd] 6..7: [cd] Item is a Pattern object, not the matched string. To be exact, it's a StringPattern for a string and a RegexPattern for a regex. You can associate a "user" value with each Pattern object. >>> p2 = PatternMatching() >>> p2.addString("un", 1) >>> p2.addString("deux", 2) >>> p2.addRegex("(trois|three)", 3) >>> for start, end, item in p2.search("un deux trois"): ... print "%r at %s: user=%r" % (item, start, item.user) ... at 0: user=1 at 3: user=2 at 8: user=3 You can associate any Python object with an item, not only an integer! hachoir-regex-1.0.5/MANIFEST.in0000644000175000017500000000016511330154036015020 0ustar haypohaypoinclude AUTHORS include COPYING include INSTALL include MANIFEST.in include README include *.rst include test_doc.py hachoir-regex-1.0.5/regression.rst0000644000175000017500000000371011251277274016207 0ustar haypohaypoRegex regression (repeat) ========================= Factorisation of (a{n,p}){x,y}: ------------------------------- >>> from hachoir_regex import parse >>> parse("(a{2,3}){4,5}") >>> parse("(a{2,}){3,4}") >>> parse("(a{2,3})+") >>> parse("(a*){2,3}") >>> parse("(a+){2,3}") Factorisation of (a|b)*: ------------------------ >>> parse("(a*|b)*") >>> parse("(a+|b)*") >>> parse("(a{2,}|b)*") Factorisation of (a|b)+: ------------------------ >>> parse("(a*|b)+") >>> parse("(a+|b|)+") >>> parse("(a+|b)+") >>> parse("(a{5,}|b)+") Factorisation of (a|b){x,}: --------------------------- >>> parse("(a+|b){3,}") >>> parse("(a{2,}|b){3,}") Factorisation of (a|b){x,y}: ---------------------------- >>> parse("(a*|b|){4,5}") >>> parse("(a+|b|){4,5}") >>> parse("(a*|b){4,5}") Do not optimize: ---------------- >>> parse('(a*|b){3,}') >>> parse("(a{2,3}|b){3,}") >>> parse("(a{2,3}|b)*") >>> parse("(a{2,3}|b)+") >>> parse("(a+|b){4,5}") >>> parse("(a{2,}|b){4,5}") >>> parse("(a{2,3}|b){4,5}") Regex regression (b) ==================== >>> from hachoir_regex import parse >>> 
parse("(M(SCF|Thd)|B(MP4|Zh))") >>> parse("(FWS1|CWS1|FWS2|CWS2)") >>> parse("(abcdeZ|abZ)") >>> parse("(00t003|10t003|00[12]0[1-9].abc\0|1CD001)") hachoir-regex-1.0.5/hachoir_regex.egg-info/0000755000175000017500000000000011330155056017564 5ustar haypohaypohachoir-regex-1.0.5/hachoir_regex.egg-info/SOURCES.txt0000644000175000017500000000067511330155055021457 0ustar haypohaypoAUTHORS COPYING INSTALL MANIFEST.in README regex.rst regression.rst setup.py test_doc.py hachoir_regex/__init__.py hachoir_regex/compatibility.py hachoir_regex/parser.py hachoir_regex/pattern.py hachoir_regex/regex.py hachoir_regex/tools.py hachoir_regex/version.py hachoir_regex.egg-info/PKG-INFO hachoir_regex.egg-info/SOURCES.txt hachoir_regex.egg-info/dependency_links.txt hachoir_regex.egg-info/top_level.txt hachoir_regex.egg-info/zip-safehachoir-regex-1.0.5/hachoir_regex.egg-info/zip-safe0000644000175000017500000000000111330155055021213 0ustar haypohaypo hachoir-regex-1.0.5/hachoir_regex.egg-info/PKG-INFO0000644000175000017500000001134111330155055020660 0ustar haypohaypoMetadata-Version: 1.0 Name: hachoir-regex Version: 1.0.5 Summary: Manipulation of regular expressions (regex) Home-page: http://bitbucket.org/haypo/hachoir/wiki/hachoir-regex Author: Victor Stinner Author-email: UNKNOWN License: GNU GPL v2 Download-URL: http://bitbucket.org/haypo/hachoir/wiki/hachoir-regex Description: Hachoir regex ============= hachoir-regex is a Python library for regular expression (regex or regexp) manupulation. You can use a|b (or) and a+b (and) operators. Expressions are optimized during the construction: merge ranges, simplify repetitions, etc. It also contains a class for pattern matching allowing to search multiple strings and regex at the same time. Website: http://bitbucket.org/haypo/hachoir/wiki/hachoir-regex Changelog ========= Version 1.0.5 (2010-01-28) * Create a MANIFEST.in to include extra files: regex.rst, test_doc.py, etc. 
* Create an INSTALL file Version 1.0.4 (2010-01-13) * Support \b (match a word) * Fix parser: support backslash in a range, eg. parse(r"[a\]x]") Version 1.0.3 (2008-04-01) * Raise SyntaxError on unsupported escape character * Two dot atoms are always equals Version 1.0.2 (2007-07-12) * Refix PatternMatching without any pattern Version 1.0.1 (2007-06-28) * Fix PatternMatching without any pattern Version 1.0 (2007-06-28) * First public version Regex examples ============== Regex are optimized during their creation: >>> from hachoir_regex import parse, createRange, createString >>> createString("bike") + createString("motor") >>> parse('(foo|fooo|foot|football)') Create character range: >>> regex = createString("1") | createString("3") >>> regex >>> regex |= createRange("2", "4") >>> regex As you can see, you can use classic "a|b" (or) and "a+b" (and) Python operators. Example of regular expressions using repetition: >>> parse("(a{2,}){3,4}") >>> parse("(a*|b)*") >>> parse("(a*|b|){4,5}") Compute minimum/maximum matched pattern: >>> r=parse('(cat|horse)') >>> r.minLength(), r.maxLength() (3, 5) >>> r=parse('(a{2,}|b+)') >>> r.minLength(), r.maxLength() (1, None) Pattern maching =============== Use PatternMaching if you would like to find many strings or regex in a string. Use addString() and addRegex() to add your patterns. >>> from hachoir_regex import PatternMatching >>> p = PatternMatching() >>> p.addString("a") >>> p.addString("b") >>> p.addRegex("[cd]") And then use search() to find all patterns: >>> for start, end, item in p.search("a b c d"): ... print "%s..%s: %s" % (start, end, item) ... 0..1: a 2..3: b 4..5: [cd] 6..7: [cd] You can also attach an objet to a pattern with 'user' (user data) argument: >>> p = PatternMatching() >>> p.addString("un", 1) >>> p.addString("deux", 2) >>> for start, end, item in p.search("un deux"): ... print "%r at %s: user=%r" % (item, start, item.user) ... 
at 0: user=1 at 3: user=2 Installation ============ With distutils: sudo ./setup.py install Or using setuptools: sudo ./setup.py --setuptools install Platform: UNKNOWN Classifier: Development Status :: 5 - Production/Stable Classifier: Intended Audience :: Developers Classifier: Intended Audience :: Education Classifier: License :: OSI Approved :: GNU General Public License (GPL) Classifier: Natural Language :: English Classifier: Operating System :: OS Independent Classifier: Programming Language :: Python Classifier: Topic :: Scientific/Engineering :: Information Analysis Classifier: Topic :: Software Development :: Interpreters Classifier: Topic :: Software Development :: Libraries :: Python Modules Classifier: Topic :: Text Processing Classifier: Topic :: Utilities hachoir-regex-1.0.5/hachoir_regex.egg-info/dependency_links.txt0000644000175000017500000000000111330155055023631 0ustar haypohaypo hachoir-regex-1.0.5/hachoir_regex.egg-info/top_level.txt0000644000175000017500000000001611330155055022312 0ustar haypohaypohachoir_regex hachoir-regex-1.0.5/test_doc.py0000755000175000017500000000211111251277274015450 0ustar haypohaypo#!/usr/bin/python import doctest from sys import exit def testDoc(filename, name=None): print "--- %s: Run tests" % filename failure, nb_test = doctest.testfile( filename, optionflags=doctest.ELLIPSIS, name=name) if failure: exit(1) print "--- %s: End of tests" % filename def importModule(name): mod = __import__(name) components = name.split('.') for comp in components[1:]: mod = getattr(mod, comp) return mod def testModule(name): print "--- Test module %s" % name module = importModule(name) failure, nb_test = doctest.testmod(module) if failure: exit(1) print "--- End of test" def main(): # Test documentation in doc/*.rst files testDoc('README') testDoc('regex.rst') testDoc('regression.rst') # Test documentation of some functions/classes testModule("hachoir_regex.compatibility") testModule("hachoir_regex.tools") 
testModule("hachoir_regex.parser") testModule("hachoir_regex.regex") testModule("hachoir_regex.pattern") if __name__ == "__main__": main() hachoir-regex-1.0.5/setup.py0000755000175000017500000000472611330154635015013 0ustar haypohaypo#!/usr/bin/env python # Todo list to prepare a release: # - edit hachoir_regex/version.py: VERSION = "XXX" # - run: ./test_doc.py # - edit README: set release date # - run: hg commit # - run: hg tag hachoir-regex-XXX # - run: hg push # - run: python2.5 ./setup.py --setuptools register sdist bdist_egg upload # - run: python2.4 ./setup.py --setuptools bdist_egg upload # - run: python2.6 ./setup.py --setuptools bdist_egg upload # - check http://pypi.python.org/pypi/hachoir-regex # - update the website # * http://bitbucket.org/haypo/hachoir/wiki/Install/source # * http://bitbucket.org/haypo/hachoir/wiki/Home # - set version to N+1 in hachoir_regex/version.py # - edit README: add a new "hachoir-metadata N+1" subsection in the ChangeLog # with the text "XXX" # Constants AUTHORS = 'Victor Stinner' DESCRIPTION = "Manipulation of regular expressions (regex)" CLASSIFIERS = [ 'Development Status :: 5 - Production/Stable', 'Intended Audience :: Developers', 'Intended Audience :: Education', 'License :: OSI Approved :: GNU General Public License (GPL)', 'Natural Language :: English', 'Operating System :: OS Independent', 'Programming Language :: Python', 'Topic :: Scientific/Engineering :: Information Analysis', 'Topic :: Software Development :: Interpreters', 'Topic :: Software Development :: Libraries :: Python Modules', 'Topic :: Text Processing', 'Topic :: Utilities', ] PACKAGES = {"hachoir_regex": "hachoir_regex"} from imp import load_source from os import path import sys def main(): if "--setuptools" in sys.argv: sys.argv.remove("--setuptools") from setuptools import setup use_setuptools = True else: from distutils.core import setup use_setuptools = False hachoir_regex = load_source("version", path.join("hachoir_regex", "version.py")) 
install_options = { "name": hachoir_regex.PACKAGE, "version": hachoir_regex.__version__, "url": hachoir_regex.WEBSITE, "download_url": hachoir_regex.WEBSITE, "license": hachoir_regex.LICENSE, "author": AUTHORS, "description": DESCRIPTION, "classifiers": CLASSIFIERS, "packages": PACKAGES.keys(), "package_dir": PACKAGES, "long_description": open('README').read(), } if use_setuptools: install_options["zip_safe"] = True # Call main() setup function setup(**install_options) if __name__ == "__main__": main() hachoir-regex-1.0.5/INSTALL0000644000175000017500000000011511330154004014301 0ustar haypohaypoInstall ======= Run the follow command as root: :: ./setup.py install hachoir-regex-1.0.5/setup.cfg0000644000175000017500000000007311330155056015104 0ustar haypohaypo[egg_info] tag_build = tag_date = 0 tag_svn_revision = 0 hachoir-regex-1.0.5/README0000644000175000017500000000574511330154733014157 0ustar haypohaypoHachoir regex ============= hachoir-regex is a Python library for regular expression (regex or regexp) manupulation. You can use a|b (or) and a+b (and) operators. Expressions are optimized during the construction: merge ranges, simplify repetitions, etc. It also contains a class for pattern matching allowing to search multiple strings and regex at the same time. Website: http://bitbucket.org/haypo/hachoir/wiki/hachoir-regex Changelog ========= Version 1.0.5 (2010-01-28) * Create a MANIFEST.in to include extra files: regex.rst, test_doc.py, etc. * Create an INSTALL file Version 1.0.4 (2010-01-13) * Support \b (match a word) * Fix parser: support backslash in a range, eg. 
parse(r"[a\]x]") Version 1.0.3 (2008-04-01) * Raise SyntaxError on unsupported escape character * Two dot atoms are always equal Version 1.0.2 (2007-07-12) * Refix PatternMatching without any pattern Version 1.0.1 (2007-06-28) * Fix PatternMatching without any pattern Version 1.0 (2007-06-28) * First public version Regex examples ============== Regexes are optimized during their creation: >>> from hachoir_regex import parse, createRange, createString >>> createString("bike") + createString("motor") >>> parse('(foo|fooo|foot|football)') Create a character range: >>> regex = createString("1") | createString("3") >>> regex >>> regex |= createRange("2", "4") >>> regex As you can see, you can use the classic Python operators "a|b" (or) and "a+b" (and). Examples of regular expressions using repetition: >>> parse("(a{2,}){3,4}") >>> parse("(a*|b)*") >>> parse("(a*|b|){4,5}") Compute the minimum/maximum length of the matched pattern: >>> r=parse('(cat|horse)') >>> r.minLength(), r.maxLength() (3, 5) >>> r=parse('(a{2,}|b+)') >>> r.minLength(), r.maxLength() (1, None) Pattern matching ================ Use PatternMatching if you would like to find many strings or regexes in a string. Use addString() and addRegex() to add your patterns. >>> from hachoir_regex import PatternMatching >>> p = PatternMatching() >>> p.addString("a") >>> p.addString("b") >>> p.addRegex("[cd]") And then use search() to find all patterns: >>> for start, end, item in p.search("a b c d"): ... print "%s..%s: %s" % (start, end, item) ... 0..1: a 2..3: b 4..5: [cd] 6..7: [cd] You can also attach an object to a pattern with the 'user' (user data) argument: >>> p = PatternMatching() >>> p.addString("un", 1) >>> p.addString("deux", 2) >>> for start, end, item in p.search("un deux"): ... print "%r at %s: user=%r" % (item, start, item.user) ... 
at 0: user=1 at 3: user=2 Installation ============ With distutils: sudo ./setup.py install Or using setuptools: sudo ./setup.py --setuptools install hachoir-regex-1.0.5/AUTHORS0000644000175000017500000000010411251277274014337 0ustar haypohaypoVictor Stinner aka haypo hachoir-regex-1.0.5/COPYING0000644000175000017500000004327711251277274014344 0ustar haypohaypo GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc. 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Library General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. 
For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". 
Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. 
c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. 
You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. 
If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. 
If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. 
The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA Also add information on how to contact you by electronic and paper mail. 
If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. , 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Library General Public License instead of this License. hachoir-regex-1.0.5/PKG-INFO0000644000175000017500000001134111330155056014360 0ustar haypohaypoMetadata-Version: 1.0 Name: hachoir-regex Version: 1.0.5 Summary: Manipulation of regular expressions (regex) Home-page: http://bitbucket.org/haypo/hachoir/wiki/hachoir-regex Author: Victor Stinner Author-email: UNKNOWN License: GNU GPL v2 Download-URL: http://bitbucket.org/haypo/hachoir/wiki/hachoir-regex Description: Hachoir regex ============= hachoir-regex is a Python library for regular expression (regex or regexp) manipulation. You can use a|b (or) and a+b (and) operators.
Expressions are optimized during the construction: merge ranges, simplify repetitions, etc. It also contains a class for pattern matching allowing to search multiple strings and regex at the same time. Website: http://bitbucket.org/haypo/hachoir/wiki/hachoir-regex Changelog ========= Version 1.0.5 (2010-01-28) * Create a MANIFEST.in to include extra files: regex.rst, test_doc.py, etc. * Create an INSTALL file Version 1.0.4 (2010-01-13) * Support \b (match a word) * Fix parser: support backslash in a range, eg. parse(r"[a\]x]") Version 1.0.3 (2008-04-01) * Raise SyntaxError on unsupported escape character * Two dot atoms are always equals Version 1.0.2 (2007-07-12) * Refix PatternMatching without any pattern Version 1.0.1 (2007-06-28) * Fix PatternMatching without any pattern Version 1.0 (2007-06-28) * First public version Regex examples ============== Regex are optimized during their creation: >>> from hachoir_regex import parse, createRange, createString >>> createString("bike") + createString("motor") >>> parse('(foo|fooo|foot|football)') Create character range: >>> regex = createString("1") | createString("3") >>> regex >>> regex |= createRange("2", "4") >>> regex As you can see, you can use classic "a|b" (or) and "a+b" (and) Python operators. Example of regular expressions using repetition: >>> parse("(a{2,}){3,4}") >>> parse("(a*|b)*") >>> parse("(a*|b|){4,5}") Compute minimum/maximum matched pattern: >>> r=parse('(cat|horse)') >>> r.minLength(), r.maxLength() (3, 5) >>> r=parse('(a{2,}|b+)') >>> r.minLength(), r.maxLength() (1, None) Pattern matching ================ Use PatternMatching if you would like to find many strings or regex in a string. Use addString() and addRegex() to add your patterns. >>> from hachoir_regex import PatternMatching >>> p = PatternMatching() >>> p.addString("a") >>> p.addString("b") >>> p.addRegex("[cd]") And then use search() to find all patterns: >>> for start, end, item in p.search("a b c d"): ...
print "%s..%s: %s" % (start, end, item) ... 0..1: a 2..3: b 4..5: [cd] 6..7: [cd] You can also attach an object to a pattern with 'user' (user data) argument: >>> p = PatternMatching() >>> p.addString("un", 1) >>> p.addString("deux", 2) >>> for start, end, item in p.search("un deux"): ... print "%r at %s: user=%r" % (item, start, item.user) ... at 0: user=1 at 3: user=2 Installation ============ With distutils: sudo ./setup.py install Or using setuptools: sudo ./setup.py --setuptools install Platform: UNKNOWN Classifier: Development Status :: 5 - Production/Stable Classifier: Intended Audience :: Developers Classifier: Intended Audience :: Education Classifier: License :: OSI Approved :: GNU General Public License (GPL) Classifier: Natural Language :: English Classifier: Operating System :: OS Independent Classifier: Programming Language :: Python Classifier: Topic :: Scientific/Engineering :: Information Analysis Classifier: Topic :: Software Development :: Interpreters Classifier: Topic :: Software Development :: Libraries :: Python Modules Classifier: Topic :: Text Processing Classifier: Topic :: Utilities