SimpleParse-2.2.0/0000755000175000017500000000000012620710576015341 5ustar mcfletchmcfletch00000000000000SimpleParse-2.2.0/SimpleParse.egg-info/0000755000175000017500000000000012620710576021257 5ustar mcfletchmcfletch00000000000000SimpleParse-2.2.0/SimpleParse.egg-info/PKG-INFO0000644000175000017500000000165512620710576022363 0ustar mcfletchmcfletch00000000000000Metadata-Version: 1.1 Name: SimpleParse Version: 2.2.0 Summary: A Parser Generator for Python (w/mxTextTools derivative) Home-page: http://simpleparse.sourceforge.net/ Author: Mike C. Fletcher Author-email: mcfletch@users.sourceforge.net License: UNKNOWN Description: A Parser Generator for Python (w/mxTextTools derivative) Provides a moderately fast parser generator for use with Python, includes a forked version of the mxTextTools text-processing library modified to eliminate recursive operation and fix a number of undesirable behaviours. Converts EBNF grammars directly to single-pass parsers for many largely deterministic grammars. 
Keywords: parse,parser,parsing,text,ebnf,grammar,generator Platform: Any Classifier: Programming Language :: Python Classifier: Topic :: Software Development :: Libraries :: Python Modules Classifier: Intended Audience :: Developers SimpleParse-2.2.0/SimpleParse.egg-info/SOURCES.txt0000644000175000017500000000635412620710576023153 0ustar mcfletchmcfletch00000000000000MANIFEST.in license.txt setup.py tox.ini /home/mcfletch/OpenGL-dev/simpleparse/simpleparse/stt/TextTools/mxTextTools/mxTextTools.c /home/mcfletch/OpenGL-dev/simpleparse/simpleparse/stt/TextTools/mxTextTools/mxbmse.c /home/mcfletch/OpenGL-dev/simpleparse/simpleparse/stt/TextTools/mxTextTools/mxte.c SimpleParse.egg-info/PKG-INFO SimpleParse.egg-info/SOURCES.txt SimpleParse.egg-info/dependency_links.txt SimpleParse.egg-info/top_level.txt doc/common_problems.html doc/index.html doc/mxLicense.html doc/processing_result_trees.html doc/scanning_with_simpleparse.html doc/simpleparse_grammars.html doc/sitestyle.css simpleparse/__init__.py simpleparse/baseparser.py simpleparse/dispatchprocessor.py simpleparse/error.py simpleparse/generator.py simpleparse/objectgenerator.py simpleparse/parser.py simpleparse/printers.py simpleparse/processor.py simpleparse/simpleparsegrammar.py simpleparse/common/__init__.py simpleparse/common/calendar_names.py simpleparse/common/chartypes.py simpleparse/common/comments.py simpleparse/common/iso_date.py simpleparse/common/iso_date_loose.py simpleparse/common/numbers.py simpleparse/common/phonetics.py simpleparse/common/strings.py simpleparse/common/timezone_names.py simpleparse/stt/COPYRIGHT simpleparse/stt/LICENSE simpleparse/stt/__init__.py simpleparse/stt/mxLicense.html simpleparse/stt/Doc/eGenix-mx-Extensions.html simpleparse/stt/Doc/mxLicense.html simpleparse/stt/Doc/mxTextTools.html simpleparse/stt/TextTools/COPYRIGHT simpleparse/stt/TextTools/LICENSE simpleparse/stt/TextTools/Makefile.pkg simpleparse/stt/TextTools/README simpleparse/stt/TextTools/TextTools.py 
simpleparse/stt/TextTools/__init__.py simpleparse/stt/TextTools/Constants/Sets.py simpleparse/stt/TextTools/Constants/TagTables.py simpleparse/stt/TextTools/Constants/__init__.py simpleparse/stt/TextTools/mxTextTools/Makefile.pre.in simpleparse/stt/TextTools/mxTextTools/__init__.py simpleparse/stt/TextTools/mxTextTools/highcommands.h simpleparse/stt/TextTools/mxTextTools/lowlevelcommands.h simpleparse/stt/TextTools/mxTextTools/mx.h simpleparse/stt/TextTools/mxTextTools/mxTextTools.c simpleparse/stt/TextTools/mxTextTools/mxTextTools.c.~1~ simpleparse/stt/TextTools/mxTextTools/mxTextTools.def simpleparse/stt/TextTools/mxTextTools/mxTextTools.h simpleparse/stt/TextTools/mxTextTools/mxbmse.c simpleparse/stt/TextTools/mxTextTools/mxbmse.h simpleparse/stt/TextTools/mxTextTools/mxh.h simpleparse/stt/TextTools/mxTextTools/mxpyapi.h simpleparse/stt/TextTools/mxTextTools/mxstdlib.h simpleparse/stt/TextTools/mxTextTools/mxte.c simpleparse/stt/TextTools/mxTextTools/mxte_impl.h simpleparse/stt/TextTools/mxTextTools/recursecommands.h simpleparse/stt/TextTools/mxTextTools/speccommands.h simpleparse/xmlparser/__init__.py simpleparse/xmlparser/xml_parser.py tests/__init__.py tests/genericvalues.py tests/mx_flag.py tests/mx_high.py tests/mx_low.py tests/mx_recursive.py tests/mx_special.py tests/test_backup_on_subtable_failure.py tests/test_common_chartypes.py tests/test_common_comments.py tests/test_common_iso_date.py tests/test_common_numbers.py tests/test_common_strings.py tests/test_deep_nesting.py tests/test_erroronfail.py tests/test_grammarparser.py tests/test_objectgenerator.py tests/test_optimisation.py tests/test_printers.py tests/test_simpleparsegrammar.py tests/test_xml.pySimpleParse-2.2.0/SimpleParse.egg-info/dependency_links.txt0000644000175000017500000000000112620710576025325 0ustar mcfletchmcfletch00000000000000 SimpleParse-2.2.0/SimpleParse.egg-info/top_level.txt0000644000175000017500000000001412620710576024004 0ustar mcfletchmcfletch00000000000000simpleparse 
SimpleParse-2.2.0/doc/0000755000175000017500000000000012620710576016106 5ustar mcfletchmcfletch00000000000000SimpleParse-2.2.0/doc/common_problems.html0000644000175000017500000001410012037615407022162 0ustar mcfletchmcfletch00000000000000 Common Problems with SimpleParse 2.0

Common Problems

Describes common errors, anti-patterns and known bugs with the SimpleParse 2.0 engine.

Repetition-as-Recursion

Is extremely inefficient, it generates 4 new Python objects and a number of new object pointers for every match (figure > 100 bytes for each match), on top of the engine overhead in tracking the recursion, so if you have a 1-million character match that's “matching” for every character, you'll have hundreds of megabytes of memory used.

In addition, if you are not using the non-recursive rewrite of mx.TextTools, you can actually blow up the C stack with the recursive calls to tag().  Symptoms of this are a memory access error when attempting to parse.

a := 'b', a? # bad!
a := 'b'+ # good!

Null-match Children of Repeating Groups

At present, there's no way for the engine to know whether a child has been satisfied (matched) because they are optional (or all of their children are optional), or because they actually matched. The problem with the obvious solution of just checking whether we've moved forward in the text is that many classes of match object may match depending on external (non-text-based) conditions, so if we do the check, all of those mechanisms suddenly fail. For now, make sure:

You can recognize this situation by the process going into an endless loop with little or no memory being consumed.  To fix this one, I'd likely need to add another return value type to the mxTextTools engine.

No Backtracking

The TextTools engine does not support backtracking as seen in RE engines and many parsers, so productions like this can never match:

a := (b/c)*, c

Because the 'c' productions will all have been consumed by the FirstOfGroup, so the last 'c' can never match. This is a fundamental limit of the current back-end, so unless a new back-end is created, the problem will not go away. You will need to design your grammars accordingly.

First-Of, not Longest-Of (Meaning of / )

The production c := (a/b) produces a FirstOfGroup, that is, a group which matches the first child to match. Many parsers and regex engines use an algorithm that matches all children and chooses the longest successful match. It would be possible to define a new TextTools tagging command to support the longest-of semantics for Table/SubTable matches, but I haven't felt the need to do so. If such a command is created, it will likely be spelled '|' rather than '/' in the SimpleParse grammar.

Grouping Rules

Although not particularly likely, users of SimpleParse 1.0 may have relied on the (extremely non-intuitive) grouping mechanism for element tokens in their grammars.  With that mechanism, the group:

a,b,c/d,e

was interpreted as:

a,b,(c/(d,e))

The new rule is simply that alternation binds closer than sequences, so the same grammar becomes:

a,b,(c/d),e

which, though no more (or less) intuitive than:

(a,b,c)/(d,e) ### it doesn't work this way!!!

is certainly better than the original mechanism.

mxTextTools Versions

You will, if possible, want to use the non-recursive rewrite of the 2.1.0 mxTextTools engine (2.1.0nr).  At the time of writing, the mainline 2.1.0b3 has some errors (which I'm told are fixed for 2.1.0final), while the non-recursive rewrite passes all tests.  The bugs in the (recursive) engine(s) that are known (and not likely to be fixed in the case of 2.1.0 final) are:

Up to index...

A SourceForge Logo
Open Source project


SimpleParse-2.2.0/doc/mxLicense.html0000644000175000017500000006224612037615407020734 0ustar mcfletchmcfletch00000000000000 mx Extension Series - License Information

mx Extension Series - License Information


Public License : Commercial License : Home Version 1.0.0

Introduction

eGenix.com Public License

eGenix.com Commercial License


© 2000, Copyright by eGenix.com Software GmbH, Langenfeld, Germany; All Rights Reserved. mailto: info@egenix.com
SimpleParse-2.2.0/doc/processing_result_trees.html0000644000175000017500000003171612037615407023757 0ustar mcfletchmcfletch00000000000000 Processing Result Trees

Processing Result Trees

SimpleParse parsers generate tree structures describing the structure of your parsed content. This document briefly describes the structures, a simple mechanism for processing the structures, and ways to alter the structures as they are generated to accomplish specific goals.

Prerequisites:

Standard Result Trees

SimpleParse uses the same result format as is used for the underlying mx.TextTools engine. The engine returns a three-item tuple from the parsing of the top-level (root) production like so:

success, resultTrees, nextCharacter = myParser.parse( someText, processor=None)

Success is a Boolean value indicating whether the production (by default the root production) matched (was satisfied) at all. If success is true, nextCharacter is an integer value indicating the next character to be parsed in the text (i.e. someText[ startCharacter:nextCharacter ] was parsed).

[New in 2.0.0b2] Note: If success is false, then nextCharacter is set to the (very ill-defined) "error position", which is the position reached by the last TextTools command in the top-level production before the entire table failed. This is a lower-level value than is usefully predictable within SimpleParse (for instance, negative results which cause a failure will actually report the position after the positive version of the element token succeeds).  You might, I suppose, use it as a hint to your users of where the error occurred, but using error-on-fail SyntaxErrors is by far the preferred method.  Basically, if success is false, consider nextCharacter to contain garbage data.

When the processor argument to parse is false (or a non-callable object), the system does not attempt to use the default processing mechanism, and returns the result trees directly. The standard format for result-tree nodes is as follows:

(production_name, start, stop, children_trees)

Where start and stop represent indexes in the source text such that sourcetext [ start: stop] is the text which matched this production. The list of children is a list of the result-trees for the child productions within the production, or None (Note: that last is important, you can't automatically do a "for" over the children_trees).

Expanded productions, as well as unreported productions (and the children of unreported productions), will not appear in the result trees, neither will the root production. See Understanding SimpleParse Grammars for details. However, LookAhead productions where the non-lookahead value would normally return results, will return their results in the position where the LookAhead is included in the grammar.

If the processor argument to parse is true and callable, the processor object will be called with (success, resultTrees, nextCharacter) on completion of parsing.  The processor can then take whatever processing steps desired, the return value from calling the processor with the results is returned directly to the caller of parse.

DispatchProcessor

SimpleParse 2.0 provides a simple mechanism for processing result trees, a recursive series of calls to attributes of a “Processor” object with functions to automate the call-by-name dispatching.  This processor implementation is available for examination in the simpleparse.dispatchprocessor module.  The main functions are:

def dispatch( source, tag, buffer ):
"""Dispatch on source for tag with buffer

Find the attribute or key "tag-object" (tag[0]) of source,
then call it with (tag, buffer)
"""
def dispatchList( source, taglist, buffer ):
"""Dispatch on source for each tag in taglist with buffer"""

def multiMap( taglist, source=None, buffer=None ):
"""Convert a taglist to a mapping from tag-object:[list-of-tags]

For instance, if you have items of 3 different types, in any order,
you can retrieve them all sorted by type with multimap( childlist)
then access them by tagobject key.

If source and buffer are specified, call dispatch on all items.
"""

def singleMap( taglist, source=None, buffer=None ):
"""Convert a taglist to a mapping from tag-object:tag,
overwriting early with late tags. If source and buffer
are specified, call dispatch on all items."""

def getString( (tag, left, right, sublist), buffer):
"""Return the string value of the tag passed"""

def lines( start=None, end=None, buffer=None ):
"""Return number of lines in buffer[start:end]"""

Alongside these functions is a class DispatchProcessor, which provides a __call__ implementation to trigger dispatching for both "called as root processor" and "called to process an individual result element" cases.

You define a DispatchProcessor sub-class with methods named for each production that will be processed by the processor, with signatures of:

from simpleparse.dispatchprocessor import *
class MyProcessorClass( DispatchProcessor ):
def production_name( self, (tag,start,stop,subtags), buffer ):
"""Process the given production and it's children"""

Within those production-handling methods, you can call the dispatch functions to process the sub-tags of the current production (keep in mind that the sub-tags "list" may be a None object).  You can see examples of this processing methodology in simpleparse.simpleparsegrammar, simpleparse.common.iso_date and simpleparse.common.strings (among others).

For real-world Parsers, where you normally use the same processing class for all runs of the parser, you can define a default Processor class like so:

class MyParser( Parser ):
def buildProcessor( self ):
return MyProcessorClass()

so that if no processor is explicitly specified in the parse call, your "MyProcessorClass" instance will be used for processing the results.

Non-standard Result Trees (AppendMatch, AppendToTagobj, AppendTagobj, CallTag)

SimpleParse 2.0 introduced features which expose certain of the mx.TextTool library's features for producing non-standard result trees. Although not generally recommended for use in “normal” parsers, these features are useful for certain types of text processing, and their exposure was requested. Each flag has a different effect on the result tree, the particular effects are discussed below.

The exposure is through the Processor (or more precisely, a super-class of Processor called “MethodSource”) object. To specify the use of one of the flags, you set an attribute in your MethodSource object (your Processor object) with the name _m_productionname (for the “method” to use, which is either an actual callable object for use with CallTag, or one of the other mx.TextTools flag constants above). In the case of AppendTagobj , you will likely want to specify a particular tagobj object to be appended, you do that by setting an attribute named _o_productionname in your MethodSource. For AppendToTagobj, you must specify an _o_productionname object with an “append” method.

Note: you can use MethodSource as your direct ancestor if you want to define a non-standard result tree, but don't want to do any processing of the results (this is the reason for having separate classes).  MethodSource does not define a __call__ method.

CallTag

_m_productionname = callableObject(
taglist,
text,
left,
right,
subtags
)

The given object/method is called on a successful match with the values shown. The text argument is the entire text buffer being parsed, the rest of the values are what you're accustomed to seeing in result tuples.

Notes:

AppendToTagobj

_m_productionname = AppendToTagobj
_o_productionname = objectWithAppendMethod

On a successful match, the system will call _o_productionname.append((None,l,r,subtags)) method. For some processing tasks, it's conceivable you might want to use this method to pull out all instances of a production from a larger (already-written) grammar where going through the whole results tree to find the deeply nested productions is considered too involved.

Notes:

AppendMatch

_m_productionname = AppendMatch

On a successful match, the system will append the matched text to the result tree, rather than a tuple of results. In situations where you just want to extract the text, this can be useful. The downside is that your results tree has a non-standard format that you need to explicitly watch out for while processing the results.

AppendTagobj

_m_productionname = AppendTagobj
_o_productionname = any object
# object is optional, if omitted, the production name string is used

On a successful match, the system will append the tagobject to the result tree, rather than a tuple of results. In situations where you just want notification that the production has matched (and it doesn't matter what it matched), this can be useful. The downside, again, is that your results tree has a non-standard format that you need to explicitly watch out for while processing the results.

Up to index...

A SourceForge Logo
Open Source project





SimpleParse-2.2.0/doc/scanning_with_simpleparse.html0000644000175000017500000002137112037615407024236 0ustar mcfletchmcfletch00000000000000 Text Scanning with SimpleParse 2.0

Text Scanning with SimpleParse 2.0

SimpleParse 2.0 provides a parser generator which converts an EBNF grammar into a run-time parser for use in scanning/marking up texts. This document describes the process of developing and using an EBNF grammar to perform the text-scanning process.

Prerequisites:

Creation of a Simple Grammar

The primary function of SimpleParse is to convert an EBNF grammar into an in-memory object which can do the work of scanning (and potentially processing) data which conforms to that grammar. Therefore, to use the system effectively, we need to be able to create grammars.

For our first experiment, we'll define a simple grammar for use in parsing an INI-file-like format. Users of SimpleParse 1.0 will recognise the format from the original documentation. This version uses somewhat more features (and is shorter as a result) than was easily accomplished with SimpleParse 1.0.

Here's the grammar definition:

____ simpleexample2_1.py ____

from simpleparse.common import numbers, strings, comments

declaration = r'''# note use of raw string when embedding in python code...
file := [ \t\n]*, section+
section := '[',identifier!,']'!, ts,'\n', body
body := statement*
statement := (ts,semicolon_comment)/equality/nullline
nullline := ts,'\n'
equality := ts, identifier,ts,'=',ts,identified,ts,'\n'
identifier := [a-zA-Z], [a-zA-Z0-9_]*
identified := string/number/identifier
ts := [ \t]*
'''

The first line incorporates a new feature of SimpleParse 2.0, namely the ability to automatically include (and build your own, incidentally) libraries of commonly used productions (rules/patterns/grammars). By importing these three modules, I've made the productions “string”, “number” and “semicolon_comment” (among others) available to all the Parser instances I create for the rest of this session.

New Feature Note: The identifier! and ']'! element tokens in the "section" production tell the parser generator to report a ParserSyntaxError if we attempt to parse these element tokens and fail.  We could also have spelled this particular segment of the grammar:

section        :=  '[',!,identifier,']', ts,'\n', body

which spelling is often easier to use in complex grammars.

If you are not familiar with EBNF grammars, or would like a reference to the various features of the SimpleParse grammar, please see: SimpleParse Grammars . We will assume that you understand the grammars being presented.

Checking a Grammar

SimpleParse does not have a separate compilation step, but it's useful as you're writing your grammar to set up tests both for whether the grammar itself is syntactically correct, and for whether the productions match the values you expect them to (and don't match those you don't want them to).

To check that a grammar is syntactically correct, the easiest approach is to attempt to create a Parser with the grammar. The Parser will complain if your grammar is syntactically incorrect, generating a ValueError which reports the last line of the declaration which parsed correctly, and the remainder of the declaration.

from simpleparse.parser import Parser
parser = Parser( declaration)

If, for example, you had left out a comma in the “section” production between the literal ']' and ts, you would get an error like so:

S:\sp\simpleparse\examples>bad_declaration.py
Traceback (most recent call last):
File "S:\sp\simpleparse\examples\bad_declaration.py", line 21, in ?
parser = Parser( declaration, "file" ) # will raise ValueError
File "S:\sp\simpleparse\parser.py", line 34, in __init__
definitionSources = definitionSources,
File "S:\sp\simpleparse\simpleparsegrammar.py", line 380, in __init__
raise ValueError(
ValueError: Unable to complete parsing of the EBNF, stopped at line 3 (134 chars
of 467)
Unparsed:
ts,'\n', body
body := statement*
statement := (ts,semicolon_comment)/equality/nulll...

You can see this for yourself by running examples/bad_declaration.py .

If your grammar is correct, Parser( declaration) will simply create the underlying generator objects which can produce a parser for your grammar. If you want to check that particular production has all of it's required sub-productions, you can call myparser.buildTagger( productionname ), but I normally leave that test to be caught during the “production checking” phase below.

Checking a Production

Now that we have our Parser object, and know that the grammar is syntactically correct, we can test that our productions match/don't match the values we expect. Depending on your particular philosophy, this may be done using the unittest module, or merely as informal tests during development.

In our grammar above, let's try checking that the equality production really does match some values we expect it to match:

testEquality = [
"s=3\n",
"s = 3\n",
''' s="three\\nthere"\n''',
''' s=three\n''',
]

production = "equality"

for testData in testEquality:
success, children, nextcharacter = parser.parse( testData, production=production)
assert success and nextcharacter==len(testData), """Wasn't able to parse %s as a %s (%s chars parsed of %s), returned value was %s"""%( repr(testData), production, nextcharacter, len(testData), (success, children, nextcharacter))

You should be prepared to have those tests fail a few times. It's easy to miss the effect of a particular feature of your grammar (such as the inclusion of “newline” in the equality production above). It took 3 tries before I got the tests above properly defined. Setting up your tests within an automated framework such as unittest is probably a good idea. It's also a good idea to set up tests that check that that values which shouldn't match don't.

Note: You may receive an error message from the parser.parse( ) call saying that a particular production name isn't defined within the grammar. You'll need to figure out why that name isn't there (did you include the common module you were planning to use, or did you mis-type a name somewhere?) and correct the problem before the tests will run. This error serves as a check that the production has all required sub-productions (as noted in the previous section).

Scanning Text with the Grammar

You saw the basic approach to parsing in the section on testing above, but there are a few differences when you're creating a “real world” parser. The first is that you will likely want to define a default root production for the parser. In the examples above, the “root” was specified explicitly during the call to parse to allow us to test any of the productions in the grammar. In normal use, you don't want users of your parser to need to know what production is used for parsing a buffer, so you provide a default in the Parser's initialiser:

parser = Parser( declaration, "file" )
parser.parse( testData)

Note: the root is treated differently than all other productions, as it doesn't return a result-tuple in the results tree, but instead governs the overall operation of the parser, determining whether it “succeeds” or “fails” as a whole. The children of the root production produce the top-level results of the parsing pass.

You can see the result tree returned from the parse method by running examples/simpleexample2_3.py . You can read about how to process the results tree in “Processing Result Trees”.

Up to index...

A SourceForge Logo
Open Source project

SimpleParse-2.2.0/doc/index.html0000644000175000017500000004240612620706017020104 0ustar mcfletchmcfletch00000000000000 SimpleParse 3.0

SimpleParse A Parser Generator for mxTextTools v3.0.0

SimpleParse is a BSD-licensed Python package providing a simple and fast parser generator using a modified version of the mxTextTools text-tagging engine. SimpleParse allows you to generate parsers directly from your EBNF grammar.

Unlike most parser generators, SimpleParse generates single-pass parsers (there is no distinct tokenization stage), an approach taken from the predecessor project (mcf.pars) which attempted to create "autonomously parsing regex objects". The resulting parsers are not as generalized as those created by, for instance, the Earley algorithm, but they do tend to be useful for the parsing of computer file formats and the like (as distinct from natural language and similar "hard" parsing problems).

As of version 2.1.0 the SimpleParse project includes a patched copy of the mxTextTools tagging library with the non-recursive rewrite of the core parsing loop.  This means that you will need to build the extension module to use SimpleParse, but the effect is to provide a uniform parsing platform where all of the features of a given SimpleParse version are always available.

For those interested in working on the project, I'm actively interested in welcoming and supporting both new developers and new users. Feel free to contact me.

Documentation

Acquisition and Installation

You will need a copy of Python 2.7, 3.3 or above. If you are compiling the package you'll also need a C compiler compatible with your Python.

To install the base SimpleParse engine:

$ pip install SimpleParse

Features/Changelog

New in 3.0.0:

New in 2.1.1:

New in 2.1.1a2:

New in 2.1.1a1:

New in 2.1.0a1:

New in 2.0.1:

diff -w -r1.4 error.py
32c32
<             return '%s: %s'%( self.__class__.__name__, self.messageFormat(message) )
---
>             return '%s: %s'%( self.__class__.__name__, self.messageFormat(self.message) )

New in 2.0:

General

"Class" of Parsers Generated

Our (current) parsers are top-down, in that they work from the top of the parsing graph (the root production). They are not, however, tokenising parsers, so there is no appropriate LL(x) designation as far as I can see, and there is an arbitrary lookahead mechanism that could theoretically parse the entire rest of the file just to see if a particular character matches).  I would hazard a guess that they are theoretically closest to a deterministic recursive-descent parser.

There are no backtracking facilities, so any ambiguity is handled by choosing the first successful match of a grammar (not the longest, as in most top-down parsers, mostly because without tokenisation, it would be expensive to do checks for each possible match's length).  As a result of this, the parsers are entirely deterministic.

The time/memory characteristics are such that, in general, the time to parse an input text varies with the amount of text to parse. There are two major factors, the time to do the actual parsing (which, for simple deterministic grammars should be close to linear with the length of the text, though a pathological grammar might have radically different operating characteristics) and the time to build the results tree (which depends on the memory architecture of the machine, the currently free memory, and the phase of the moon).  As a rule, SimpleParse parsers will be faster (for suitably limited grammars) than anything you can code directly in Python.  They will not generally outperform grammar-specific parsers written in C.

Missing Features

Possible Future Directions

mxTextTools Rewrite Enhancements

Alternate C Back-end?

mxBase/mxTextTools Installation

NOTE: This section only applies to SimpleParse versions before 2.1.0, SimpleParse 2.1.0 and above include a patched version of mxTextTools already!

You will want an mxBase 2.1.0 distribution to run SimpleParse, preferably with the non-recursive rewrite. If you want to use the non-recursive implementation, you will need to get the source archive for mxTextTools.  It is possible to use mxBase 2.0.3 with SimpleParse, but not to use it for building the non-recursive TextTools engine (2.0.3 also lacks a lot of features and bug-fixes found in the 2.1.0 versions).

Note: without the non-recursive rewrite of 2.1.0 (i.e. with the recursive version), the test suite will not pass all tests.  I'm not sure why they fail with the recursive version, but it does argue for using the non-recursive rewrite.

To build the non-recursive TextTools engine, you'll need to get the source distribution for the non-recursive implementation from the SimpleParse file repository.  Note, there are incompatibilities in the mxBase 2.1 versions that make it necessary to use the versions specified below to build the non-recursive versions.

This archive is intended to be expanded over the mxBase source archive from the top-level directory, replacing one file and adding four others.

cd egenix-mx-base-2.1.0
gunzip non-recursive-1.0.0b1.tar.gz
tar -xvf non-recursive-1.0.0b1.tar

(Or use WinZip on Windows). When you have completed that, run:

setup.py build --force install

in the top directory of the eGenix-mx-base source tree.

Copyright, License & Disclaimer

The 2.1.0 and greater releases include the eGenix mxTextTools extension:

Licensed under the eGenix.com Public License see the mxLicense.html file for details on licensing terms for the original library, the eGenix extensions are:

    Copyright (c) 1997-2000, Marc-Andre Lemburg
    Copyright (c) 2000-2001, eGenix.com Software GmbH

Extensions to the eGenix extensions (most significantly the rewrite of the core loop) are copyright Mike Fletcher and released under the SimpleParse License below:

    Copyright © 2003-2006, Mike Fletcher

SimpleParse License:

Copyright © 1998-2006, Copyright by Mike C. Fletcher; All Rights Reserved.
mailto: mcfletch@users.sourceforge.net

Permission to use, copy, modify, and distribute this software and its documentation for any purpose and without fee or royalty is hereby granted, provided that the above copyright notice appear in all copies and that both the copyright notice and this permission notice appear in supporting documentation or portions thereof, including modifications, that you make.

THE AUTHOR MIKE C. FLETCHER DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE!

A SourceForge Logo
Open Source project

SimpleParse-2.2.0/doc/simpleparse_grammars.html0000644000175000017500000005112312620706017023206 0ustar mcfletchmcfletch00000000000000 SimpleParse Grammars

SimpleParse Grammars

SimpleParse uses a particular EBNF grammar which reflects the current set of features in the system. Though the system is modular enough that you could replace that grammar, most users will simply want to use the provided grammar. This document provides a quick reference for the various features of the grammar with examples of use and descriptions of their effects.

Prerequisites:

An Example

Here is an example of a basic SimpleParse grammar:

declaration = r'''# note use of raw string when embedding in python code...
file := [ \t\n]*, section+
section := '[',identifier!,']'!, ts,'\n', body
body := statement*
statement := (ts,semicolon_comment)/equality/nullline
nullline := ts,'\n'
comment := -'\n'*
equality := ts, identifier,ts,'=',ts,identified,ts,'\n'
identifier := [a-zA-Z], [a-zA-Z0-9_]*
identified := string/number/identifier
ts := [ \t]*
'''

You can see that the format allows for comments in Python style, and fairly free-form treatment of whitespace around the various items (i.e. “s:=x” and “s := x” are equivalent). The grammar is actually written such that you can break productions (rules) across multiple lines if that will make your grammar more readable.  The grammar also allows both ':=' and '::=' for the "defined as" symbol.

Element Tokens

Element tokens are the basic operational unit of the grammar. The concrete implementation of the various tokens is the module simpleparse.objectgenerator, their syntax is defined in the module simpleparse.simpleparsegrammar. You can read a formal definition of the grammar used to define them at the end of this document.

Element Token

Examples

Effect

Character Range

[ \t\n]
[a-zA-Z]
[a-zA-Z0-9_]

Matches any 1 character in the given range

String Literal

“[“
'['
'\t'
'\xa0'

Match the sequence of characters as given, allowing for special, octal and hexadecimal escape characters

Case-insensitive String Literal
(new in 2.0.1a2)

c"this"
c'this'
c' this\t\n'
c'\xa0'

Match the sequence of characters without regard to the case of the target text, allowing for special, octal and hexadecimal escape characters. 

Note: Case-insensitive literals are far slower than regular literals!

Name Reference

statement
semicolon_comment
ts

Match the production whose name is specified. With 2.0, those productions may have been included from a library module or created by you and passed to the Parser object's initialiser.

Sequential Groups

(a,b,c,d)

Match a sequence of element token children. Sequential groups have a lower precedence than FirstOf groups (below), so the group (a,b/c,d) is equivalent to (a,(b/c),d).

FirstOf Groups

(a/b/c/d)

Match the first child which matches.

Note that this is very different from systems which parse all children and choose the longest/most successful child-match.

Sequential groups have a lower precedence than FirstOf groups, so the group (a,b/c,d) is equivalent to (a,(b/c),d).

Error On Fail (Cut)

!
! "Error Message"

(a,!,b,c,d,e,f)

Used as a "token", the ErrorOnFail modifier (also called "cut" after Prolog's cut directive), declares that all subsequent items in the enclosing sequential group should be marked ErrorOnFail, as if the given ErrorOnFail modifier were applied to each one individually.

Note: can only be used as a member of a Sequential group, cannot be a member of a FirstOf group.

See the section Modifiers/Operators below for more details of the semantics surrounding this token.

Character Classes, Strings and Escape Characters

Both character classes and strings in simpleparse may use octal escaping (of 1 to 3 octal digits), hexadecimal escaping (2 digits), or Unicode escaping (4 or 8 digits) or standard Python character escapes (\a\b\f\n\r\t\v)

Strings may be either single or double quoted (but not triple quoted).

To include a "]" character in a character class, make it the first character of the class. Similarly, a literal "-" character must be either the first (after the optional "]" character) or the last character. The grammar definition for a character class is as follows:

'[', CHARBRACE?,CHARDASH?, (CHARRANGE/CHARNOBRACE)*, CHARDASH?, ']'

It is a common error to have declared something like [+-*] as a character range (every character including and between + and *) intending to specify [-+*] or [+*-] (three distinct characters). Symptoms include not matching '-' or matching characters that were not expected.

Modifiers/Operators

Each element token can have a prefix and/or a postfix modifier applied to it to alter how the engine treats a match of the “base” element token.

Modifier

Example

Meaning

-

-"this"
-those*
-[A-Z]+
-(them,their)
-(them/their)

Match a single character at the current position if the entire base element token doesn't match. If repeating, match any number of characters until the base element token matches.

(postfix)?

"this"?
those?
[A-Z]?
(them,their)?
(them/their)?

Match the base element token, or if the base element token cannot match, match nothing.

? (prefix)

?"this"
?-those
?[A-Z]
?-(them,their)

Match the base element token, then return to the previous position (this is called "LookAhead" in the mx.TextTools documentation). The - modifier is applied "after" the lookahead, so that a lookahead on a negative match equates to "is not followed by", while lookahead on positive matches equates to "is followed by".

*

"this"*
those*
[A-Z]*
(them,their)*
(them/their)*

Match the base element token from 0 to an infinite number of times.

+

"this"+
those+
[A-Z]+
(them,their)+
(them/their)+

Match the base element token from 1 to an infinite number of times.


!

"this"!
"this"+!


"those" ! "Expected 'those' at position %(position)s"

Consider a failure to match a SyntaxError (stop parsing, and raise an exception). If the optional string-literal is included, it specifies the message (template) to be used for the SyntaxError. You can use %(varname)s formats to have the following variables substituted:

  • position -- the character position at which parsing failed
  • line -- the (approximate) line on which parsing failed
  • lineChar -- the position on the line where parsing failed
  • expected -- the grammar's definition of the failed item
  • production -- the top-level production which included the failing item
  • text -- the text from the failure position to 50 characters beyond the failure position.

Note: the error_on_failure flag is ignored for optional items (since they can never fail), and only raises an error if a repeating non-optional production fails completely.


Using the ErrorOnFail operator can be somewhat tricky.  It is often easier to use the "stand-alone" element-token version of cut.  Here's an example of use:

top := a/b/bp
a := 'a', !, ws, '=', ws, something
b := 'b', ws, '=', !, ws, something
bp := 'b', ws, '+=', !, ws, something

The production top can match an 'a =', a 'b =', or a 'b +=', but if it encounters an 'a' without an '=' following, it will raise a syntax error.  For the two "b" productions, we don't want to raise a Syntax error if the 'b' is not followed by the '=' or '+=' because the grammar might match the other production, so we only cut off back-tracking after the operator is found.  Consider this alternative:

top := a/b/bp
a := 'a'!, ws, '=', ws, something # BAD DON'T DO THIS!
b := 'b', ws, '=', !, ws, something
bp := 'b', ws, '+=', !, ws, something

This grammar does something very different (and somewhat useless).  When the "top" production goes to match, it tries to match the "a" production, which tries to match the 'a' literal.  If the literal isn't there, for instance, for the text 'b =', then the 'a' literal will raise a SyntaxError.  The result is that the "b" and "bp" productions can never match with this grammar.

Declarations (Productions) and Result Trees

Each simpleparsegrammar is a series of declarations which define a production (rule) and bind it to a name which can be referenced by any production in the declaration set. Defining a production generally causes a result tuple to be created in the results tree (see below for what else can happen).

Default Result Tree Generation

Here are some examples showing sample productions and the result trees they would generate.

s := "this" ('s', start, stop, [] ) # no children
s := them, those? ('s', start, stop, [ ("them", start, stop, [...]), ("those", start, stop, [...]) ] )
('s', start, stop, [ ("them", start, stop, [...]) ) # optional value not there
s := them*, those ('s', start, stop, [ ("them", start, stop, [...]), ("them", start, stop, [...]), ("those", start, stop, [...]) ] )
('s', start, stop, [ ("those", start, stop, [...]) ) # optional repeating value not present

As a general rule, when a production matches, a match tuple is added to the result tree. The first item of this tuple is normally the name of the production (as a string), the second is the starting position of the match, the third is the stopping position of the match, and the fourth is a list of any child-production's result trees.

Expanded and Unreported Productions

Using these features allows you to trim unwanted entries out of your results tree (which is good for efficiency, as the system doesn't need to store the result-trees). Using expanded productions can allow you to reduce the complexity of your grammars by factoring out common patterns and allowing them to be included in multiple productions without generating extraneous result-tuples in the results tree. Both of these methods still produce standard results trees so no special work is required to process the results tree. (There are methods described in Processing Results Trees which can generate non-standard result trees for special purposes).

Report Type

Examples

Return Value

Normal

a := (b,c)
('a', l, r, [
("b", l, r, [...]),
("c", l, r, [...])
] )

Unreported

a := (b,c)
# b is “unreported”
<b> := “b”
('a', l, r, [
# no b, no children of b
("c", l, r, [...])
] )

Expanded

a := (b,c)
# b is “expanded”
>b< := (d,e)
('a', l, r, [
# children of b are
# returned as if they
# were direct children of a
("d", l, r, [...]),
("e", l, r, [...]),
("c", l, r, [...])
] )

Pre-built and Library Element Tokens

There are situations where the base parsing library simply isn't capable of accomplishing a particular matching task, or where it would be much simpler to define a method for matching a particular class of value than to define it with an EBNF grammar. In other instances, a particularly common pattern, such as floating point numbers or strings with standard (Python) escapes, is wanted, and has been provided in a parsing library.

SimpleParse allows you to pass a set of “pre-built” element tokens to the Parser during initialization. These pre-built parsers can either be instances of simpleparse.objectgenerator.ElementToken, or raw mx.TextTools tag-tables. To use them, pass the Parser's initializer a list of two-tuples of (name, parserObject):

parser = Parser( declaration, "v", prebuilts = [
("word", parserObject1 ),
("white", parserObject2 ),
]
)

You can see a working example (which uses Python's re module to create a prebuilt parser) in examples/prebuilt_call.py .

SimpleParse 2.0 has introduced the ability to create libraries of common parsers for inclusion in other parsers. At present, the common package includes numbers, basic strings, the ISO date format, some character types, and some comment types. New contributions to the library are welcome.

In general, importing a particular module from the common package makes the production names from the module available in any subsequent grammar defined. Refer to the documentation for a particular module to see what production names are exported.

from simpleparse.common import strings, comments, numbers

Many of the standard common parser modules also include “Interpreter” objects which can be used to process the results tree generated by the mini-grammar into a Python-friendly form. See the documentation for the individual modules.

class MyParser( Parser ):
string = strings.StringInterpreter()

Formal SimpleParse 2.0 EBNF Grammar

This is the formal definition of the SimpleParse 2.0 grammar. Although the grammar is functional (should parse any proper grammar), the grammar used during parser generation is a manually generated version found in the simpleparse.simpleparsegrammar module.

declaration = r"""declarationset      :=  declaration+
declaration := ts, (unreportedname/expandedname/name) ,ts,':',':'?,'=',seq_group

element_token := lookahead_indicator?, ts, negpos_indicator?,ts, (literal/range/group/name),ts, occurence_indicator?, ts, error_on_fail?

negpos_indicator := [-+]
lookahead_indicator := "?"
occurence_indicator := [+*?]
error_on_fail := "!", (ts,literal)?

>group< := '(',seq_group, ')'
seq_group := ts,(error_on_fail/fo_group/element_token),
(ts, seq_indicator, ts,
(error_on_fail/fo_group/element_token)
)*, ts

fo_group := element_token, (ts, fo_indicator, ts, element_token)+


# following two are likely something peoples might want to
# replace in many instances...
<fo_indicator> := "/"
<seq_indicator> := ','

unreportedname := '<', name, '>'
expandedname := '>', name, '<'
name := [a-zA-Z_],[a-zA-Z0-9_]*
<ts> := ( [ \011-\015]+ / comment )*
comment := '#',-'\n'*,'\n'
literal := literalDecorator?,("'",(CHARNOSNGLQUOTE/ESCAPEDCHAR)*,"'") / ('"',(CHARNODBLQUOTE/ESCAPEDCHAR)*,'"')
literalDecorator := [c]



range := '[',CHARBRACE?,CHARDASH?, (CHARRANGE/CHARNOBRACE)*, CHARDASH?,']'
CHARBRACE := ']'
CHARDASH := '-'
CHARRANGE := CHARNOBRACE, '-', CHARNOBRACE
CHARNOBRACE := ESCAPEDCHAR/CHAR
CHAR := -[]]
ESCAPEDCHAR := '\\',( SPECIALESCAPEDCHAR / ('x',HEXESCAPEDCHAR) / ("u",UNICODEESCAPEDCHAR_16) /("U",UNICODEESCAPEDCHAR_32)/OCTALESCAPEDCHAR )
SPECIALESCAPEDCHAR := [\\abfnrtv"']
OCTALESCAPEDCHAR := [0-7],[0-7]?,[0-7]?
HEXESCAPEDCHAR := [0-9a-fA-F],[0-9a-fA-F]
CHARNODBLQUOTE := -[\\"]+
CHARNOSNGLQUOTE := -[\\']+
UNICODEESCAPEDCHAR_16 := [0-9a-fA-F],[0-9a-fA-F],[0-9a-fA-F],[0-9a-fA-F]
UNICODEESCAPEDCHAR_32 := [0-9a-fA-F],[0-9a-fA-F],[0-9a-fA-F],[0-9a-fA-F],[0-9a-fA-F],[0-9a-fA-F],[0-9a-fA-F],[0-9a-fA-F]
"""
Up to index...

A SourceForge Logo
Open Source project

SimpleParse-2.2.0/doc/sitestyle.css0000644000175000017500000000102212037615407020637 0ustar mcfletchmcfletch00000000000000pre { color: #000080; margin-left: 90; } p { margin-left: 30; } ul { margin-left: 30; } blockquote { margin-left: 90; } table { margin-left: 30; } th { background-color: #F5F5F5; } td { vertical-align: top; } h1 { background-color: #F5F5F5; border-top-style: solid; border-top-width: 1 } h2 { background-color: #F5F5F5; border-top-style: solid; border-top-width: 1 } h3 { background-color: #F5F5F5; border-top-style: solid; border-top-width: 1 } body { background-color: #FFFFFF; color: #000000 } SimpleParse-2.2.0/PKG-INFO0000644000175000017500000000165512620710576016445 0ustar mcfletchmcfletch00000000000000Metadata-Version: 1.1 Name: SimpleParse Version: 2.2.0 Summary: A Parser Generator for Python (w/mxTextTools derivative) Home-page: http://simpleparse.sourceforge.net/ Author: Mike C. Fletcher Author-email: mcfletch@users.sourceforge.net License: UNKNOWN Description: A Parser Generator for Python (w/mxTextTools derivative) Provides a moderately fast parser generator for use with Python, includes a forked version of the mxTextTools text-processing library modified to eliminate recursive operation and fix a number of undesirable behaviours. Converts EBNF grammars directly to single-pass parsers for many largely deterministic grammars. 
Keywords: parse,parser,parsing,text,ebnf,grammar,generator Platform: Any Classifier: Programming Language :: Python Classifier: Topic :: Software Development :: Libraries :: Python Modules Classifier: Intended Audience :: Developers SimpleParse-2.2.0/license.txt0000644000175000017500000000252512620706017017523 0ustar mcfletchmcfletch00000000000000Includes the eGenix mxTextTools extensions, which are licensed under the eGenix.com Public License see the stt/LICENSE file for details on licensing terms, the eGenix extensions are: Copyright (c) 1997-2000, Marc-Andre Lemburg; mailto:mal@lemburg.com Copyright (c) 2000-2001, eGenix.com Software GmbH; mailto:info@egenix.com Copyright, License & Disclaimer for SimpleParse: © 1998-2015, Copyright by Contributors; All Rights Reserved. Permission to use, copy, modify, and distribute this software and its documentation for any purpose and without fee or royalty is hereby granted, provided that the above copyright notice appear in all copies and that both the copyright notice and this permission notice appear in supporting documentation or portions thereof, including modifications, that you make. THE CONTRIBUTORS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE! Contributors: Mike C. Fletcher Anthony Tuininga SimpleParse-2.2.0/MANIFEST.in0000644000175000017500000000107612620707600017075 0ustar mcfletchmcfletch00000000000000include MANIFEST.in include license.txt include tox.ini include setup.py recursive-include . 
*.py recursive-include simpleparse/stt * recursive-include doc *.html recursive-include doc *.css recursive-include tests *.py prune examples/html.py prune examples/py* prune examples/rtf* global-exclude *CVS* global-exclude *Cvs* global-exclude *CVS* global-exclude *cvs* global-exclude *.scc global-exclude *.pyc global-exclude *.pyo global-exclude *.gz global-exclude *.zip global-exclude *.bat global-exclude *.exe global-exclude *.sxw global-exclude *.so global-exclude *.pyd SimpleParse-2.2.0/setup.py0000644000175000017500000000705312620706017017053 0ustar mcfletchmcfletch00000000000000#!/usr/bin/env python """Installs SimpleParse using distutils Run: python setup.py install to install the packages from the source archive. """ try: from setuptools import setup, Extension except ImportError as err: from distutils.core import setup, Extension import os, sys def findVersion( ): a = {} exec( open( os.path.join( 'simpleparse', '__init__.py') ).read(), a, a ) return a['__version__'] def isPackage( filename ): """Is the given filename a Python package""" return ( os.path.isdir(filename) and os.path.isfile( os.path.join(filename,'__init__.py')) ) def packagesFor( filename, basePackage="" ): """Find all packages in filename""" set = {} for item in os.listdir(filename): dir = os.path.join(filename, item) if item.lower() != 'cvs' and isPackage( dir ): if basePackage: moduleName = basePackage+'.'+item else: moduleName = item set[ moduleName] = dir set.update( packagesFor( dir, moduleName)) return set packages = packagesFor( "simpleparse", 'simpleparse' ) packages.update( {'simpleparse':'simpleparse'} ) options = { 'sdist': { 'force_manifest':1,'formats':['gztar','zip'] }, } if sys.platform == 'win32': options.setdefault( 'build_ext',{} )['define'] = 'BAD_STATIC_FORWARD' def abs_rel( path ): return os.path.normpath( os.path.abspath(path)) if __name__ == "__main__": from sys import hexversion if hexversion >= 0x2030000: # work around distutils complaints under Python 2.2.x 
extraArguments = { 'classifiers': [ """Programming Language :: Python""", """Topic :: Software Development :: Libraries :: Python Modules""", """Intended Audience :: Developers""", ], 'keywords': 'parse,parser,parsing,text,ebnf,grammar,generator', 'long_description' : """A Parser Generator for Python (w/mxTextTools derivative) Provides a moderately fast parser generator for use with Python, includes a forked version of the mxTextTools text-processing library modified to eliminate recursive operation and fix a number of undesirable behaviours. Converts EBNF grammars directly to single-pass parsers for many largely deterministic grammars.""", 'platforms': ['Any'], } else: extraArguments = { } setup ( name = "SimpleParse", version = findVersion(), description = "A Parser Generator for Python (w/mxTextTools derivative)", author = "Mike C. Fletcher", author_email = "mcfletch@users.sourceforge.net", url = "http://simpleparse.sourceforge.net/", package_dir = packages, options = options, packages = list(packages.keys()), ext_modules=[ Extension( "simpleparse.stt.TextTools.mxTextTools.mxTextTools", [ abs_rel(f) for f in [ 'simpleparse/stt/TextTools/mxTextTools/mxTextTools.c', 'simpleparse/stt/TextTools/mxTextTools/mxte.c', 'simpleparse/stt/TextTools/mxTextTools/mxbmse.c', ] ], include_dirs=[ abs_rel('simpleparse/stt/TextTools/mxTextTools'), ], define_macros=[ ('MX_BUILDING_MXTEXTTOOLS',1), ('PY_SSIZE_T_CLEAN',1),], ), ], **extraArguments ) SimpleParse-2.2.0/tox.ini0000644000175000017500000000017012620707415016650 0ustar mcfletchmcfletch00000000000000[tox] # py35 is broken on Ubuntu at the moment envlist=py27,py34,py26 [testenv] deps=nose commands=nosetests -w tests SimpleParse-2.2.0/tests/0000755000175000017500000000000012620710576016503 5ustar mcfletchmcfletch00000000000000SimpleParse-2.2.0/tests/mx_low.py0000644000175000017500000001765012620710227020364 0ustar mcfletchmcfletch00000000000000"""Low-level matching tests for mx.TextTools""" import unittest from 
import unittest
from simpleparse.stt.TextTools import *
from simpleparse.stt import TextTools

# mx.TextTools version triple, e.g. ('2','1','0'); gates version-specific tests
mxVersion = tuple(TextTools.__version__.split('.')[:3])

from .genericvalues import AnyInt


class MXLowTests(unittest.TestCase):
    """Exercise the low-level mxTextTools tag() commands one at a time"""

    def doBasicTest(self, table, testvalue, expected, startPosition=0 ):
        # Run the tag table over testvalue and compare the whole
        # (success, taglist, position) result tuple with expected.
        result = tag( testvalue, table , startPosition)
        assert result == expected, '''\n\texpected:%s\n\tgot:%s\n'''%( expected, result )

    def testAllIn1( self ):
        """Test simple AllIn command"""
        self.doBasicTest(
            ( ( "ab", AllIn, "ab", 0 ), ),
            "abbaab",
            ( 1,[("ab",0,6,None)],6),
        )

    def testAllIn2( self ):
        """Test simple AllIn command ignore fail"""
        self.doBasicTest(
            ( ( "ab", AllIn, "ab", 1,1 ), ),
            "c",
            ( 1,[],0),
        )

    def testAllIn3( self ):
        """Test simple AllIn command w 2 items"""
        self.doBasicTest(
            (
                ( "ab", AllIn, "ab", 1,1 ),
                ( "c", AllIn, "cde", 0 ),
            ),
            "abbaabccdd",
            ( 1,[ ("ab",0,6,None), ("c",6,10,None), ],10),
        )

    def testAllIn4( self ):
        """Test simple AllIn command fail on second

        This should truncate the results list back to [], as well
        as returning 0 as length.  This is broken under
        mx.TextTools 2.1.0b1!
        """
        self.doBasicTest(
            (
                ( "ab", AllIn, "ab", 1,1 ),
                ( "c", AllIn, "cde", 0 ),
            ),
            "abbaab",
            ( 0,[ ],AnyInt),
        )

    def testAllIn5( self ):
        """Test simple AllIn command with None tagobj"""
        self.doBasicTest(
            ( ( None, AllIn, "ab", 0 ), ),
            "abbaab",
            ( 1,[],6),
        )

    def testAllNotIn1( self ):
        """Test simple AllNotIn command"""
        self.doBasicTest(
            ( ( "ab", AllNotIn, "ab", 0 ), ),
            "ccddee",
            ( 1,[("ab",0,6,None)],6),
        )

    def testAllNotIn2( self ):
        """Test simple AllNotIn command ignore fail"""
        self.doBasicTest(
            ( ( "ab", AllNotIn, "ab", 1,1 ), ),
            "a",
            ( 1,[],0),
        )

    def testAllNotIn3( self ):
        """Test simple AllNotIn command w 2 items"""
        self.doBasicTest(
            (
                ( "ab", AllNotIn, "ab", 1,1 ),
                ( "c", AllNotIn, "cde", 0 ),
            ),
            "ccddabbaab",
            ( 1,[ ("ab",0,4,None), ("c",4,10,None), ],10),
        )

    def testIs1( self ):
        """Test simple Is command"""
        self.doBasicTest(
            ( ( "ab", Is, "a", 0 ), ),
            "abbaab",
            ( 1,[("ab",0,1,None)],1),
        )

    def testIs2( self ):
        """Test simple Is command ignore fail"""
        self.doBasicTest(
            ( ( "ab", Is, "a", 1,1), ),
            "c",
            ( 1,[],0),
        )

    def testIsIn1( self ):
        """Test simple IsIn command"""
        self.doBasicTest(
            ( ( "ab", IsIn, "ab", 0 ), ),
            "abbaab",
            ( 1,[("ab",0,1,None)],1),
        )

    def testIsIn2( self ):
        """Test simple IsIn command ignore fail"""
        self.doBasicTest(
            ( ( "ab", IsIn, "ab", 1,1), ),
            "c",
            ( 1,[],0),
        )

    def testIsNotIn1( self ):
        """Test simple IsNotIn command"""
        self.doBasicTest(
            ( ( "ab", IsNotIn, "ab", 0 ), ),
            "ccddee",
            ( 1,[("ab",0,1,None)],1),
        )

    def testIsNotIn2( self ):
        """Test simple IsNotIn command ignore fail"""
        self.doBasicTest(
            ( ( "ab", IsNotIn, "ab", 1,1), ),
            "abb",
            ( 1,[],0),
        )

    def testWord1( self ):
        """Test simple Word command"""
        self.doBasicTest(
            ( ( "ab", Word, "ab", 0 ), ),
            "ab",
            ( 1,[("ab",0,2,None)],2),
        )

    def testWord2( self ):
        """Test simple Word command ignore fail"""
        self.doBasicTest(
            ( ( "ab", Word, "ab", 1,1), ),
            "cd",
            ( 1,[],0),
        )

    def testWordStart1( self ):
        """Test simple WordStart command"""
        self.doBasicTest(
            ( ( "ab", WordStart, "ab", 0 ), ),
            "ddeeffab",
            ( 1,[("ab",0,6,None)],6),
        )

    def testWordStart2( self ):
        """Test simple WordStart command ignore fail"""
        self.doBasicTest(
            ( ( "ab", WordStart, "ab", 1,1), ),
            "cdffgg",
            ( 1,[],0),
        )

    def testWordEnd1( self ):
        """Test simple WordEnd command"""
        self.doBasicTest(
            ( ( "ab", WordEnd, "ab", 0 ), ),
            "ddeeffab",
            ( 1,[("ab",0,8,None)],8),
        )

    def testWordEnd2( self ):
        """Test simple WordEnd command ignore fail"""
        self.doBasicTest(
            ( ( "ab", WordEnd, "ab", 1,1), ),
            "cdffgg",
            ( 1,[],0),
        )

    def testAllInSet1( self ):
        """Test simple AllInSet command"""
        self.doBasicTest(
            ( ( b"ab", AllInSet, set(b"ab"), 0 ), ),
            b"abbaab",
            ( 1,[(b"ab",0,6,None)],6),
        )

    def testAllInSet2( self ):
        """Test simple AllInSet command ignore fail"""
        self.doBasicTest(
            ( ( b"ab", AllInSet, set(b"ab"), 1,1 ), ),
            b"c",
            ( 1,[],0),
        )

    def testIsInSet1( self ):
        """Test simple IsInSet command"""
        self.doBasicTest(
            ( ( b"ab", IsInSet, set(b"ab"), 0 ), ),
            b"abbaab",
            ( 1,[(b"ab",0,1,None)],1),
        )

    def testIsInSet2( self ):
        """Test simple IsInSet command ignore fail"""
        self.doBasicTest(
            ( ( b"ab", IsInSet, set(b"ab"), 1,1), ),
            b"c",
            ( 1,[],0),
        )

    # CharSet objects only exist from mx.TextTools 2.1 on
    if mxVersion >= ('2','1'):
        def testIsInCharSet1( self ):
            """Test simple IsInCharSet command"""
            self.doBasicTest(
                ( ( b"ab", IsInCharSet, CharSet(b"ab"), 0 ), ),
                b"abbaab",
                ( 1,[(b"ab",0,1,None)],1),
            )

        def testIsInCharSet2( self ):
            """Test simple IsInCharSet command ignore fail"""
            self.doBasicTest(
                ( ( "ab", IsInCharSet, CharSet("ab"), 1,1), ),
                "c",
                ( 1,[],0),
            )

        def testAllInCharSet1( self ):
            """Test simple AllInSet command w/ CharSet object"""
            self.doBasicTest(
                ( ( "ab", AllInCharSet, CharSet("ab"), 0 ), ),
                "abbaab",
                ( 1,[("ab",0,6,None)],6),
            )

        def testAllInCharSet2( self ):
            """Test simple AllInSet command ignore fail"""
            self.doBasicTest(
                ( ( "ab", AllInCharSet, CharSet("ab"), 1,1), ),
                "ccd",
                ( 1,[],0),
            )


def getSuite():
    return unittest.makeSuite(MXLowTests,'test')

if __name__ == "__main__":
    unittest.main(defaultTest="getSuite")
SimpleParse-2.2.0/tests/genericvalues.py0000644000175000017500000000132612620706017021706 0ustar mcfletchmcfletch00000000000000"""Values to match result-trees even when implementations change These values match the "logical" values of the result-trees as they apply to SimpleParse's usage, rather than the particular concrete results returned by the engine. So, for instance, you can say "returns no children" (NullResults) for result-tuples or "whatever failure position" for failure return values. """ class _NullResults(object): def __eq__(self, other): return other == [] or other == None def __repr__( self ): return "" NullResult = _NullResults() class _AnyInt: def __eq__(self, other): return type(other) == type(1) def __repr__( self ): return "" AnyInt = _AnyInt() SimpleParse-2.2.0/tests/mx_special.py0000644000175000017500000001440712620710227021200 0ustar mcfletchmcfletch00000000000000"""Low-level matching tests for mx.TextTools""" import unittest from simpleparse.stt.TextTools import * from simpleparse.stt import TextTools mxVersion = tuple(TextTools.__version__.split('.')[:3]) from .genericvalues import AnyInt class MXSpecialTests(unittest.TestCase): def doBasicTest(self, table, testvalue, expected, startPosition=0 ): result = tag( testvalue, table , startPosition) assert result == expected, '''\n\texpected:%s\n\tgot:%s\n'''%( expected, result ) def testFail1( self ): """Test Fail command""" self.doBasicTest( ( ( "ab", Fail, None, 0 ), ), "abbaab", ( 0,[ ],AnyInt), ) def testFail2( self ): """Test Fail command with ignore fail (Jump)""" self.doBasicTest( ( ( "ab", Fail, None, 1), ), "abbaab", ( 1,[ ],0), ) def testSkip1( self ): """Test Skip command""" self.doBasicTest( ( ( "ab", Skip, 1, 0 ), ), "abbaab", ( 1,[ ("ab",0,1,None), ],1), ) def testSkip2( self ): """Test Skip command with negative to before buffer Note: I don't like this, but it's what we should expect from the system, so blah. 
Would be better IMO to have success (within the buffer) and failure (outside the buffer) but then we need a way to spell (jump, even outside buffer) Should have a test for what to do when we have AppendMatch flag in this case... """ self.assertRaises( TypeError, self.doBasicTest, ( ( "ab", Skip, -1, 0 ), ), "abbaab", ( 1,[ ("ab",0,-1,None), ],-1), ) def testMove1( self ): """Test Move command XXX Should have tests for after buffer moves """ self.doBasicTest( ( ( "ab", Move, 4, 0 ), ), "abbaab", ( 1,[ ("ab",0,4,None), ],4), ) def testMove2( self ): """Test Move command with negative to middle of buffer XXX should have tests for before buffer Note: this command is non-intuitive for Python users, the negative slicing is 1 beyond what it would be for Python (i.e. -1 in Python is 1 before the end, whereas in this command it is the end) """ self.doBasicTest( ( ( "ab", Move, -4, 0 ), ), "abbaab", ( 1,[ ("ab",0,3,None), ],3), ) def testMove3( self ): """Test Move command """ self.doBasicTest( ( ( "ab", Move, 7, 0 ), ), "abbaab", ( 1,[ ("ab",0,7,None), ],7), ) def testMove4( self ): """Test Move to EOF """ self.doBasicTest( ( ( "ab", Move, ToEOF, 0), ), "abbaab", ( 1,[ ("ab",0,6,None), ],6), ) def testEOF1( self ): """Test EOF command Although it's not documented, the original code returned the EOF position as the left and right coords for the match, so we mimic that behaviour now. 
""" self.doBasicTest( ( ( "ab", Move, 7, 1 ), ( "c", EOF, Here, 0 ), ), "abbaab", ( 1,[ ("ab",0,7,None), ("c",6,6,None), ],6), ) ## def testEOF2( self ): ## """Test EOF command when before buffer (can't test this any more, because of new sanity check raising error before we get to check)""" ## self.doBasicTest( ## ( ## ( "ab", Move, -10, 1 ), ## ( "c", EOF, Here, 0 ), ## ), ## "abbaab", ## ( 0,[ ## ],0), ## ) def testEOF3( self ): """Test EOF command when in middle of buffer""" self.doBasicTest( ( ( "ab", Move, 3, 1 ), ( "c", EOF, Here, 0 ), ), "abbaab", ( 0,[ ],AnyInt), ) def testJumpBeforeTable( self ): """Test Jump to before table (explicit fail) Note: this reports the position attained by the matching child (2) as the "error position", not the position before that child (0). """ self.doBasicTest( ( ("ab",Word,"ab",1,-3), ), "abbaab", ( 0,[ ],AnyInt), ) ### tests for ObjectGenerator-idioms def testNegativeOptString1( self ): """Negative, optional string value with positive match (should return 0 as length of match)""" self.doBasicTest( ( (None, WordEnd, 'test', 2, 1), (None, Skip, -4, 2, 2), (None, Skip, 1) ), "test", (1,[ ],0), ) def testBMSMove( self ): """Negative, optional string value""" self.doBasicTest( ( (None, sWordStart, BMS( "cd" ),1,2), (None, Move, ToEOF ) ), "a", (1,[ ],1), ) if mxVersion >= ('2','1'): def testJumpTargetNamed( self ): """Test JumpTarget command with tagobj specified""" self.doBasicTest( ( ( b"ab", JumpTarget, b"SomeString" ), ), b"abbaab", ( 1,[ (b"ab",0,0,None), ],0), ) def testJumpTarget( self ): """Test JumpTarget command in normal usage""" self.doBasicTest( ( b"this", ), b"abbaab", ( 1,[ ],0), ) def getSuite(): return unittest.makeSuite(MXSpecialTests,'test') if __name__ == "__main__": unittest.main(defaultTest="getSuite") SimpleParse-2.2.0/tests/test_common_comments.py0000644000175000017500000000443012620706017023305 0ustar mcfletchmcfletch00000000000000"""Test the various common library comment productions""" import unittest 
from simpleparse.parser import Parser from simpleparse.common import comments from simpleparse import dispatchprocessor parseTests = [ # each production should match the whole of all of the first, # and not match any of the second... ("c_comment", [ """/* this */""", """/* this \n\n*/""", ],[ """// this""", """# this""", """# this\n""", """# this\r\n""", ]), ("c_nest_comment", [ """/* this */""", """/* this \n\n*/""", """/* /* this */ */""", """/* /* this \n*/ */""", ],[ """// this""", """# this""", """; this""", ]), ("hash_comment", [ """# this""", """# this\n""", """# this\r\n""", ],[ """// this""", """/* this */""", """/* /* this */ */""", ]), ("semicolon_comment", [ """; this""", """; this\n""", """; this\r\n""", ],[ """# this""", """// this""", """/* this */""", """/* /* this */ */""", ]), ("slashslash_comment", [ """// this""", """// this\n""", """// this\r\n""", ],[ """# this""", """/ this""", """/* this */""", """/* /* this */ */""", ]), ] class CommonTests(unittest.TestCase): def testBasic( self ): for production, yestable, notable in parseTests: p = Parser( "x := %s"%production, 'x') for data in yestable: success, results, next = p.parse( data) assert success and (next == len(data)), """Did not parse comment %s as a %s result=%s"""%( repr(data), production, (success, results, next)) assert results, """Didn't get any results for comment %s as a %s result=%s"""%( repr(data), production, (success, results, next)) for data in notable: success, results, next = p.parse( data) assert not success, """Parsed %s of %s as a %s result=%s"""%( next, repr(data), production, results ) def getSuite(): return unittest.makeSuite(CommonTests, 'test') if __name__ == "__main__": unittest.main(defaultTest="getSuite") SimpleParse-2.2.0/tests/test_xml.py0000644000175000017500000001755712620706017020726 0ustar mcfletchmcfletch00000000000000from simpleparse.xmlparser import xml_parser from simpleparse.parser import Parser import unittest try: unicode except NameError: unicode = 
str p = Parser( xml_parser.declaration ) class XMLProductionTests(unittest.TestCase): """Tests that XML grammar productions match appropriate values""" ### ProductionTests will be added here by loop below... class ProductionTest: def __init__( self, production, should, shouldnot ): self.production = production self.should = should self.shouldnot = shouldnot def __call__( self ): """Perform the test""" for item in self.should: if isinstance(item,unicode): item = item.encode('utf-8') success, children, next = p.parse( item, self.production ) assert success, """Didn't parse %s as a %s, should have"""%( repr(item), self.production) assert next == len(item), """Didn't parse whole of %s as a %s, parsed %s of %s characters, results were:\n%s\nRest was:\n%s"""%( repr(item), self.production, next, len(item), children, item[next:]) for item in shouldnot: if isinstance(item,unicode): item = item.encode('utf-8') success, children, next = p.parse( item, self.production ) assert not success, """Parsed %s chars of %s as a %s, shouldn't have, result was:\n%s"""%( next, repr(item), self.production, children) def getSuite(): return unittest.makeSuite(XMLProductionTests, 'test') testData = { "CharData":( [# should match """Type """, ], [# should not match ], ), "Attribute":( [# should match """s=&this;""", '''s="&this;"''', """&this;""", ], [# should not match # unfinished elements ], ), "element":( [# should match """""", """""", """""", """""", """""", ], [# should not match # unfinished elements """""", """""", """""", # end with no start... """""", # malformed end tags """""", """""", ], ), "content":( [# should match """Type less-than (<) to save options. This document was prepared on &docdate; and is classified &security-level;.""", """""", """""", """""", """""", """&this;""", """""", ], [# should not match # unfinished elements """""", """""", """""", # end with no start... 
"""""", # malformed end tags """""", """""", ], ), "AttValue":( [# should match '''"&this;"''', ], [# should not match ], ), "Name": ( [# should match "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-:._", "_a", ":a", ":a", ], [# should not match "-a", "0", "0.0", ".this", ], ), "Comment": ( [# should match "", "", "", "", "", "", ], [# should not match "", "", "", ], ), "prolog": ( [ # should match """ """, """ ]>""", """""", """ ]>""", """ %ISOLat2; ]>""", ], [ # should not match ], ), "ExternalID": ( [# should match '''SYSTEM "hello.dtd"''', ], [# should not match ], ), "elementdecl": ( [# should match '''''', """""", """""", """""", """""", """""", """""", """""", ], [# should not match """""", ], ), "elementdecl_pe": ( [# should match """ %name.para; %content.para;""", ], [# should not match ], ), "contentspec": ( [# should match '''EMPTY''', '''ANY''', '''%content.para;''', ], [# should not match ], ), "AttlistDecl": ( [# should match '''''', """""", """""", ], [# should not match ], ), "AttDef": ( [# should match ''' id ID #REQUIRED''', """ name CDATA #IMPLIED""", ''' type (bullets|ordered|glossary) "ordered"''', ''' method CDATA #FIXED "POST"''', ], [# should not match ], ), "EntityDecl": ( [ """""", """""", """""", """""", ], [# should not match ], ), "EntityDef":( [ '''PUBLIC "-//Textuality//TEXT Standard open-hatch boilerplate//EN" "http://www.textuality.com/boilerplate/OpenHatch.xml"''', ], [# should not match ], ), "PubidLiteral":( [ '''"-//Textuality//TEXT Standard open-hatch boilerplate//EN"''', ], [# should not match ], ), } for production, (should,shouldnot) in list(testData.items()): setattr( XMLProductionTests, 'test'+production, ProductionTest(production, should, shouldnot)) if __name__ == "__main__": unittest.main(defaultTest="getSuite") SimpleParse-2.2.0/tests/test_common_numbers.py0000644000175000017500000001253712620706017023142 0ustar mcfletchmcfletch00000000000000import unittest from simpleparse.parser import Parser from 
simpleparse.common import numbers from simpleparse import dispatchprocessor _data = [ ( "int_unsigned", numbers.IntInterpreter, [ # should match, value, length that should match, expected result ("0 ", 1, 0), ("1 ", 1, 1), ("23 ",2, 23), ("0x ", 1,0), ("0. ", 1,0), ], [ # should not match... ".0", "a", ], ), ( "int", numbers.IntInterpreter, [ # should match, value, length that should match, expected result ("0 ", 1, 0), ("1 ", 1, 1), ("23 ",2, 23), ("0x ", 1,0), ("0. ", 1,0), ("+0 ", 2, 0), ("+1 ", 2, 1), ("+23 ",3, 23), ("+0x ", 2,0), ("+0. ", 2,0), ("-0 ", 2, 0), ("-1 ", 2, -1), ("-23 ",3, -23), ("-0x ", 2,0), ("-0. ", 2,0), ], [ # should not match... ".0", "a", "+.0", "+a", "-.0", "-a", ], ), ( "hex", numbers.HexInterpreter, [ # should match, value, length that should match, expected result ("0x0 ", 3, 0), ("0x1 ", 3, 1), ("0x23 ",4, 35), ("0x0x ", 3,0), ("0x0. ", 3,0), ("+0x0 ", 4, 0), ("+0x1 ", 4, 1), ("+0x23 ",5, 35), ("+0x0x ", 4,0), ("+0x0. ", 4,0), ("-0x0 ", 4, 0), ("-0x1 ", 4, -1), ("-0x23 ",5, -35), ("-0x0x ", 4,0), ("-0x0. ", 4,0), ("0xa ", 3, 10), ("0xaaaaaaaaaaaaaaaaa ", 19, 196765270119568550570), ("0xA ", 3, 10), ("0xAAAAAAAAAAAAAAAAA ", 19, 196765270119568550570), ], [ # should not match... ".0", "a", "+.0", "+a", "-.0", "-a", "0x ", "0xg", "0x", ], ), ( "binary_number", numbers.BinaryInterpreter, [ # should match, value, length that should match, expected result ("0b0 ", 2, 0), ("1b0 ", 2, 1), ("10b0 ", 3, 2), ("10000000000b0 ", 12, 1024), ("0B0 ", 2, 0), ("1B0 ", 2, 1), ("10B0 ", 3, 2), ("10000000000B0 ", 12, 1024), ], [ # should not match... ".0", "a", "+.0", "+a", "-.0", "-a", "0x ", "0xg", "0x", ], ), ( "float", numbers.FloatInterpreter, [ # should match, value, length that should match, expected result ("0. ", 2, 0), ("1. ", 2, 1), ("23. 
",3, 23), (".0 ", 2, 0), (".1 ", 2, .1), (".23 ",3, .23), ("0.0x ", 3,0), ("1.1x ", 3,1.1), ("2000000.22222222x ", 16, 2000000.22222222), ("1.1e20 ", 6, 1.1e20), ("1.1e-20 ",7, 1.1e-20), ("-1.1e20 ", 7, -1.1e20), ], [ # should not match... "0x.0", "23", "-23", "-43*2a", "+23", "-a", ], ), ( "float_floatexp", numbers.FloatFloatExpInterpreter, [ # should match, value, length that should match, expected result ("0. ", 2, 0), ("1. ", 2, 1), ("23. ",3, 23), (".0 ", 2, 0), (".1 ", 2, .1), (".23 ",3, .23), ("0.0x ", 3,0), ("1.1x ", 3,1.1), ("2000000.22222222x ", 16, 2000000.22222222), ("1.1e20 ", 6, 1.1* (1e20)), ("1.1e-20 ",7, 1.1* (1e-20)), ("-1.1e20 ", 7, -1.1* (1e20)), ("1.1e20.34 ", 9, 1.1* (10 ** 20.34)), ("1.1e-.34 ", 8, 1.1*( 10 ** -.34)), ], [ # should not match... "0x.0", "23", "-23", "-43*2a", "+23", "-a", ], ), ] class CommonTests(unittest.TestCase): def testBasic( self ): for production, processor, yestable, notable in _data: p = Parser( "x := %s"%production, 'x') proc = dispatchprocessor.DispatchProcessor() setattr(proc, production, processor()) for data, length, value in yestable: success, results, next = p.parse( data, processor = proc) assert next == length, """Did not parse string %s of %s as a %s result=%s"""%( repr(data[:length]), repr(data), production, (success, results, next)) assert results[0] == value, """Didn't get expected value from processing value %s, expected %s, got %s"""%( data[:length], value, results[0]) for data in notable: success, results, next = p.parse( data) assert not success, """Parsed %s of %s as a %s result=%s"""%( repr(data[:length]), repr(data), production, (success, results, next)) def getSuite(): return unittest.makeSuite(CommonTests, 'test') if __name__ == "__main__": unittest.main(defaultTest="getSuite") SimpleParse-2.2.0/tests/test_deep_nesting.py0000644000175000017500000000206712620710227022556 0ustar mcfletchmcfletch00000000000000from __future__ import print_function from simpleparse.simpleparsegrammar import Parser 
from simpleparse.stt.TextTools import TextTools from .genericvalues import NullResult declaration = r'''testparser := as? as := a,as? a := 'a' ''' testdata = 'aaaa' expectedResult = (1, [ ('as', 0, 4, [ ('a', 0, 1, NullResult), ('as', 1, 4, [ ('a', 1, 2, NullResult), ('as', 2, 4, [ ('a', 2, 3, NullResult), ('as', 3, 4, [ ('a', 3, 4, NullResult) ]) ]) ]) ]) ], 4) parser = Parser( declaration ).generator.buildParser( 'testparser' ) print("About to attempt the deep-nesting test") print("If python goes into an infinite loop, then the test failed ;) ") print() result = TextTools.tag( testdata, parser ) if result != expectedResult: print('test-deep-nesting failed') print('\texpected', expectedResult) print('\tgot', result) else: print("test-deep-nesting succeeded!\nYou're probably using the non-recursive mx.TextTools rewrite") SimpleParse-2.2.0/tests/mx_high.py0000644000175000017500000001363612620706017020504 0ustar mcfletchmcfletch00000000000000"""Low-level matching tests for mx.TextTools""" import unittest from simpleparse.stt.TextTools import * from simpleparse.stt import TextTools mxVersion = tuple(TextTools.__version__.split('.')[:3]) class MXHighTests(unittest.TestCase): def doBasicTest(self, table, testvalue, expected, startPosition=0 ): result = tag( testvalue, table , startPosition) assert result == expected, '''\n\texpected:%s\n\tgot:%s\n'''%( expected, result ) ### XXX Need to figure out what the heck loop is for and how to test it def testCall( self ): """Test call-to-match Call command""" def function( text, start, end ): return end self.doBasicTest( ( ( "ab", Call, function, 0 ), ), "cdffgg", ( 1,[ ("ab",0,6,None), ],6), ) def testCall2( self ): """Test call-to-match Call command with object instance""" class X: def __call__( self, text, start, end ): return end self.doBasicTest( ( ( "ab", Call, X(), 0 ), ), "cdffgg", ( 1,[ ("ab",0,6,None), ],6), ) def testCallArg( self ): """Test call-to-match CallArg command""" def function( text, start, end, *arguments 
): assert arguments == (1,2,3), """Passed arguments were not what we passed in""" return end self.doBasicTest( ( ( "ab", CallArg, (function,1,2,3), 0 ), ), "cdffgg", ( 1,[ ("ab",0,6,None), ],6), ) if mxVersion >= ('2','1'): def testsWordStart1( self ): """Test simple sWordStart command""" for algo in [BOYERMOORE, TRIVIAL]: self.doBasicTest( ( ( b"ab", sWordStart, TextSearch(b"ab", algorithm=algo), 0 ), ), b"ddeeffab", ( 1,[(b"ab",0,6,None)],6), ) def testsWordStart2( self ): """Test simple sWordStart command ignore fail""" for algo in [BOYERMOORE, TRIVIAL]: self.doBasicTest( ( ( b"ab", sWordStart, TextSearch(b"ab", algorithm=algo), 1,1), ), b"cdffgg", ( 1,[],0), ) def testsWordEnd1( self ): """Test simple sWordEnd command""" for algo in [BOYERMOORE, TRIVIAL]: self.doBasicTest( ( ( b"ab", sWordEnd, TextSearch(b"ab", algorithm=algo), 0 ), ), b"ddeeffab", ( 1,[(b"ab",0,8,None)],8), ) def testsWordEnd2( self ): """Test simple sWordEnd command ignore fail""" for algo in [BOYERMOORE, TRIVIAL]: self.doBasicTest( ( ( b"ab", sWordEnd, TextSearch(b"ab", algorithm=algo), 1,1), ), b"cdffgg", ( 1,[],0), ) def testsFindWord1( self ): """Test simple sWordFind command""" for algo in [BOYERMOORE, TRIVIAL]: self.doBasicTest( ( ( b"ab", sFindWord, TextSearch(b"ab", algorithm=algo), 0 ), ), b"ddeeffab", ( 1,[(b"ab",6,8,None)],8), ) def testsFindWord2( self ): """Test simple sFindWord command ignore fail""" for algo in [BOYERMOORE, TRIVIAL]: self.doBasicTest( ( ( b"ab", sFindWord, TextSearch(b"ab", algorithm=algo), 1,1), ), b"cdffgg", ( 1,[],0), ) else: def testsWordStart1( self ): """Test simple sWordStart command""" self.doBasicTest( ( ( b"ab", sWordStart, BMS("ab"), 0 ), ), b"ddeeffab", ( 1,[(b"ab",0,6,None)],6), ) def testsWordStart2( self ): """Test simple sWordStart command ignore fail""" self.doBasicTest( ( ( b"ab", sWordStart, BMS("ab"), 1,1), ), b"cdffgg", ( 1,[],0), ) def testsWordEnd1( self ): """Test simple sWordEnd command""" self.doBasicTest( ( ( b"ab", sWordEnd, 
BMS(b"ab"), 0 ), ), b"ddeeffab", ( 1,[(b"ab",0,8,None)],8), ) def testsWordEnd2( self ): """Test simple sWordEnd command ignore fail""" self.doBasicTest( ( ( b"ab", sWordEnd, BMS(b"ab"), 1,1), ), b"cdffgg", ( 1,[],0), ) def testsFindWord1( self ): """Test simple sWordFind command""" self.doBasicTest( ( ( "ab", sFindWord, BMS("ab"), 0 ), ), "ddeeffab", ( 1,[("ab",6,8,None)],8), ) def testsFindWord2( self ): """Test simple sFindWord command ignore fail""" self.doBasicTest( ( ( "ab", sFindWord, BMS("ab"), 1,1), ), "cdffgg", ( 1,[],0), ) def getSuite(): return unittest.makeSuite(MXHighTests,'test') if __name__ == "__main__": unittest.main(defaultTest="getSuite") SimpleParse-2.2.0/tests/test_common_strings.py0000644000175000017500000000446712620706017023163 0ustar mcfletchmcfletch00000000000000import unittest from simpleparse.parser import Parser from simpleparse.common import strings from simpleparse import dispatchprocessor parseTests = [ # each production should match the whole of all of the first, # and not match any of the second... 
("string_triple_single", [ """'''this and that'''""", """'''this \\''' '''""", """''''''""", """''''\\''''""", ],[]), ("string_triple_double", [ '''"""this and that"""''', '''"""this \\""" """''', '''""""""''', '''""""\\""""''', ],[]), ("string_double_quote", [ '"\\p"', '"\\""', ],[]), ("string",[ "'this'", '"that"', r'"\b\f\n\r"', r'"\x32\xff\xcf"', r'"\032\033\055\077"', r'"\t\v\\\a\b\f\n\r"', r'"\t"', r'"\v"', r'"\""', ], []), ] class CommonTests(unittest.TestCase): def testBasic( self ): proc = dispatchprocessor.DispatchProcessor() setattr(proc, "string", strings.StringInterpreter()) for production, yestable, notable in parseTests: p = Parser( "x := %s"%production, 'x') for data in yestable: if production == 'string': success, results, next = p.parse( data, processor=proc) else: success, results, next = p.parse( data) assert success and (next == len(data)), """Did not parse string %s as a %s result=%s"""%( repr(data), production, (success, results, next)) assert results, """Didn't get any results for string %s as a %s result=%s"""%( repr(data), production, (success, results, next)) if production == 'string': expected = eval( data, {},{}) assert results[0] == expected, """Got different interpreted value for data %s, we got %s, expected %s"""%( repr(data), repr(results[0]), repr(expected)) for data in notable: success, results, next = p.parse( data) assert not success, """Parsed %s of %s as a %s result=%s"""%( repr(data), production, (success, results, next)) def getSuite(): return unittest.makeSuite(CommonTests, 'test') if __name__ == "__main__": unittest.main(defaultTest="getSuite") SimpleParse-2.2.0/tests/test_simpleparsegrammar.py0000644000175000017500000003470212620710227024006 0ustar mcfletchmcfletch00000000000000import unittest, sys from simpleparse.parser import Parser from simpleparse.stt.TextTools import TextTools from .genericvalues import NullResult, AnyInt from . import test_grammarparser from . 
import test_erroronfail class ParserGenerationTests(unittest.TestCase): def doBasicTest(self, definition, parserName, testValue, expected, ): result = Parser( definition).parse( testValue, parserName ) assert result == expected, '''\nexpected:%s\n got:%s\n'''%( expected, result ) def testGenNegRange1( self ): self.doBasicTest( '''s := - something * := [ab]''', 's', 'mmmab', (1,[],3) ) def testGenNegRange2( self ): self.doBasicTest( '''s := - something := [ab]''', 's', 'mmmab', (1,[],1) ) def testGenNegLit1( self ): self.doBasicTest( '''s := - something * := "a"''', 's', 'mmmab', (1,[],3) ) def testGenPosReptOpt1( self ): self.doBasicTest( '''s := something * something := "a" ''', 's', 'aammmab', (1,[("something",0,1,NullResult),("something",1,2,NullResult)],2) ) def testGenPosReptOpt2( self ): self.doBasicTest( '''s := something * something := "a" ''', 's', 'mmmab', (1,[],0) ) def testGenPosRept1( self ): self.doBasicTest( '''s := something + something := "a" ''', 's', 'mmmab', (0,[],AnyInt) ) def testLookaheadPositive( self ): self.doBasicTest( '''s := ?"b" ''', 's', 'bbbba', (1,[ ],0) ) def testLookaheadNeg( self ): self.doBasicTest( '''s := ?-"b" ''', 's', 'bbbba', (0,[ ],AnyInt) ) def testLookaheadNeg2( self ): self.doBasicTest( '''s := ?-"b"? 
''', 's', 'bbbba', (1,[ ],0) ) def testLookaheadNeg3( self ): self.doBasicTest( '''s := "b", ?-"a" ''', 's', 'bbbba', (1,[ ],1) ) def testLookaheadNeg4( self ): self.doBasicTest( '''s := "b", ?-"a", "ba" ''', 's', 'bba', (1,[ ],3) ) def testLookaheadNeg5( self ): self.doBasicTest( '''s := ?-t, "ba" t := "bad" ''', 's', 'bac', (1,[ ],2) ) def testLookaheadNeg6( self ): self.doBasicTest( '''s := ?-t, "ba" t := "bad" ''', 's', 'bad', (0,[ ],AnyInt) ) def testLookahead2( self ): """Test lookahead on literals (more complex)""" self.doBasicTest( '''s := something+, "ba" something := "b",?-"a" ''', 's', 'bbbba', (1,[ ("something",0,1,NullResult), ("something",1,2,NullResult), ("something",2,3,NullResult), ],5) ) def testLookahead3( self ): """Test lookahead on reported positive productions""" self.doBasicTest( '''s := ?trailer trailer := "bad" ''', 's', 'badba', (1,[ ("trailer",0,3,NullResult), ],0) ) def testLookahead4( self ): self.doBasicTest( '''s := ?-trailer? trailer := "bad" ''', 's', 'badba', (1,[ ],0) ) def testLookahead5( self ): self.doBasicTest( '''s := ?-trailer, 'ba' trailer := "bad" ''', 's', 'babba', (1,[ ],2) ) def testLookahead6( self ): self.doBasicTest( '''s := ?-trailer, 'ba' trailer := "bad" ''', 's', 'badba', (0,[ ],AnyInt) ) def testGenPos1( self ): self.doBasicTest( '''s := something something := "a" ''', 's', 'mmmab', (0,[],AnyInt) ) def testGenPos2( self ): self.doBasicTest( '''s := something something := "a" ''', 's', 'ammmab', (1,[('something',0,1,NullResult),],1) ) def testOptionalGroupHitEOF( self ): """Test optional group hitting an EOF during success run""" self.doBasicTest( '''s := something* something := ("a"/"b") ''', 's', 'aa', (1,[ ('something',0,1,NullResult), ('something',1,2,NullResult), ],2) ) def testMultiLineDef( self ): """Test multi-line definitions""" self.doBasicTest( '''s := something* something := ( "a"/ "b" ) ''', 's', 'aa', (1,[ ('something',0,1,NullResult), ('something',1,2,NullResult), ],2) ) ## def 
testRepeatOptionalFail( self ): ## """Explicit test of the optional-repeating-child of repeating object ## """ ## self.doBasicTest( ## r''' ## controlword := '\\',('*','\\')?,[-a-zA-Z0-9]+ ## contents := -[\012}\\]* ## file := (controlword/contents)+ ## ''', ## "file", ## "\\*\\test sdf ff f f sdfff\\", ## (1, [ ## ("controlword", 0,7,[]), ## ("contents",7,24), ## ],24), ## ) def testGenCILiteral1( self ): self.doBasicTest( '''s := c"this"''', 's', 'this', (1,[],4) ) def testGenCILiteral2( self ): self.doBasicTest( '''s := c"this"''', 's', 'This', (1,[],4) ) def testGenCILiteral3( self ): self.doBasicTest( '''s := c"this"''', 's', 'THIS', (1,[],4) ) def testGenCILiteral4( self ): self.doBasicTest( '''s := -c"this"''', 's', ' THIS', (1,[],1) ) def testGenCILiteral5( self ): self.doBasicTest( '''s := -c"this"''', 's', ' thi', (1,[],1) ) def testGenCILiteral6( self ): self.doBasicTest( '''s := -c"this"*''', 's', ' thi', (1,[],4) ) def testGenUnicodeRange( self ): self.doBasicTest( '''s := [\u0600-\u06ff]+''', 's', u'\u0600\u06ff', (1,[],2) ) if sys.version_info[0] < 3: def testGenUnicodeRangeBroken( self ): self.assertRaises( ValueError, self.doBasicTest, '''s := [a-\u06ff]+''', 's', u'\u0600\u06ff', (1,[],2) ) class NameTests(unittest.TestCase): def doBasicTest(self, definition, parserName, testValue, expected, ): result = Parser( definition).parse( testValue, production=parserName ) assert result == expected, '''\nexpected:%s\n got:%s\n'''%( expected, result ) def test_p( self ): self.doBasicTest( '''s := something something := "a" ''', 's', 'ammmab', (1,[('something',0,1,NullResult),],1) ) def test_po( self ): self.doBasicTest( '''s := something? 
something := "a" ''', 's', 'ammmab', (1,[('something',0,1,NullResult),],1) ) def test_por( self ): self.doBasicTest( '''s := something* something := "a" ''', 's', 'ammmab', (1,[('something',0,1,NullResult),],1) ) def test_pr( self ): self.doBasicTest( '''s := something+ something := "a" ''', 's', 'ammmab', (1,[('something',0,1,NullResult),],1) ) def test_n( self ): self.doBasicTest( '''s := - something := [ab]''', 's', 'mmmab', (1,[],1) ) def test_no( self ): self.doBasicTest( '''s := - something? := [ab]''', 's', 'mmmab', (1,[],1) ) def test_nor( self ): self.doBasicTest( '''s := - something* := [ab]''', 's', 'mmmab', (1,[],3) ) def test_nr( self ): self.doBasicTest( '''s := - something+ := [ab]''', 's', 'mmmab', (1,[],3) ) def test_n_f( self ): self.doBasicTest( '''s := - something := [ab]''', 's', 'ammmab', (0,[],AnyInt) ) def test_no_f( self ): self.doBasicTest( '''s := - something? := [ab]''', 's', 'ammmab', (1,[],0) ) def test_nor_f( self ): self.doBasicTest( '''s := - something* := [ab]''', 's', 'ammmab', (1,[],0) ) def test_nr_f( self ): self.doBasicTest( '''s := - something + := [ab]''', 's', 'ammmab', (0,[],AnyInt) ) ## def test_por_big( self ): ## """This test creates 1,000,000 result tuples (very inefficiently, I might add)... ## on my machine that takes a long time, so I do not bother with the test ## (note that with a recursive mx.TextTools, this should actually blow up ## long before you get into memory problems :) ). ## """ ## self.doBasicTest( ## '''s := something* ## something := "a" ''', ## 's', ## 'a'*1000000, ## (1,[ ## ],1000000) ## ) def test_expanded_name( self ): """Non-reporting (expanded) name test Tests new feature, a name whose children are reported, but which is not itself reported, basically this lets you create anonymous groups which can be referenced from other productions. 
""" self.doBasicTest( '''s := something + >something< := r r := [ab] v := [c] ''', 's', 'abammmab', (1,[ ('r',0,1, NullResult), ('r',1,2, NullResult), ('r',2,3, NullResult), ],3) ) def test_expanded_SingleNameChild( self ): """Expanded group with single child which is a Name itself This originally failed when the Name object's report value was changed to 0 (redundant information for the "expanded" code), resulting in the child production not getting reported. """ self.doBasicTest( '''s := something + something := r r := [ab]''', 'something', 'abammmab', (1,[ ('r',0,1, NullResult), ],1) ) class BasicMethodSource: def __init__( self ): self.results = [] def _m_a( self, taglist,text,l,r,subtags ): self.results.append( ('a',text[l:r])) def _m_b( self, taglist, text, l,r,subtags): self.results.append( ('b',l,r) ) _m_c = TextTools.AppendMatch _m_d = TextTools.AppendTagobj _o_d = "hello world" class AppendToTagobjMethodSource: def __init__( self ): self._o_d = [] _m_d = TextTools.AppendToTagobj class CallTests(unittest.TestCase): """Tests semantics of calling objects from a method source during parsing""" def parse( self, definition, parserName, testValue, source): result = Parser( definition, ).parse(testValue, production=parserName, processor = source) return result def test_basic_call( self ): """Test basic ability to call a method instead of regular functioning""" source = BasicMethodSource() self.parse( """ x := (a/b)* a := "a" b := "b" """, 'x', 'abba', source) assert source.results == [ ('a','a'),('b',1,2),('b',2,3),('a','a'),], """Method source methods were not called, or called improperly:\n%s"""%(source.results,) def test_AppendMatch( self ): """Test ability to append the text-string match to the results list""" source = BasicMethodSource() result = self.parse( """ x := c* c := 'c' """, 'x', 'ccc', source) assert result == (1,[ 'c','c','c', ],3), """Result was %s"""%( result, ) def test_AppendTagObj( self ): """Test appending the tagobject to the results list""" 
source = BasicMethodSource() result = self.parse( """ x := d* d := 'd' """, 'x', 'ddd', source) assert result == (1,[ "hello world","hello world","hello world", ],3) def test_AppendToTagObj( self ): """Test basic ability to call a method instead of regular functioning""" source = AppendToTagobjMethodSource() self.parse( """ x := d* d := 'd' """, 'x', 'ddd', source) assert source._o_d == [ (None,0,1,NullResult),(None,1,2,NullResult),(None,2,3,NullResult)], """Method source methods were not called, or called improperly:\n%s"""%(source._o_d,) def getSuite(): return unittest.TestSuite(( test_grammarparser.getSuite(), test_erroronfail.getSuite(), unittest.makeSuite(ParserGenerationTests, 'test'), unittest.makeSuite(NameTests, 'test'), unittest.makeSuite(CallTests, 'test'), )) if __name__ == "__main__": unittest.main(defaultTest="getSuite") SimpleParse-2.2.0/tests/test_common_chartypes.py0000644000175000017500000001026112620706017023461 0ustar mcfletchmcfletch00000000000000import unittest, string from simpleparse.parser import Parser from simpleparse.common import chartypes, timezone_names assert chartypes from simpleparse import dispatchprocessor try: fulltrans = string.maketrans(b"",b"") translate = string.translate except AttributeError: fulltrans = bytes.maketrans(b"",b"") translate = bytes.translate class CommonTests(unittest.TestCase): def doBasicTest(self, definition, parserName, testValue, expected, ): result = Parser( definition).parse( testValue, parserName ) assert result == expected, '''\nexpected:%s\n got:%s\n'''%( expected, result ) def _testSet( self, set, singleName, multiName ): """Test multi-line definitions""" decl = """single := %s multiple := %s"""%( singleName, multiName ) p = Parser(decl) notset = translate( fulltrans, fulltrans, set ) for char in set: if isinstance(char,int): char = chr(char) success, children, next = p.parse( char, singleName) assert success and (next == 1), """Parser for %s couldn't parse %s"""%( singleName, char ) for char in 
notset: if isinstance(char,int): char = chr(char) success, children, next = p.parse( char, singleName) assert (not success) and (next == 0), """Parser for %s parsed %s"""%( singleName, char ) success, children, next = p.parse( char, multiName) assert (not success) and (next == 0), """Parser for %s parsed %s"""%( multiName, char ) success, children, next = p.parse( set, multiName) assert success and (next == len(set)), """Parser for %s couldn't parse full set of chars, failed at %s"""%( multiName, set[next:] ) def testBasic( self ): for set, single, multiple in ( ("digits", "digit", "digits"), ("ascii_uppercase", "uppercasechar", "uppercase"), ("ascii_lowercase", "lowercasechar", "lowercase"), ("ascii_letters", "letter", "letters"), ("whitespace", "whitespacechar", "whitespace"), ("octdigits", "octdigit", "octdigits"), ("hexdigits", "hexdigit", "hexdigits"), ("printable", "printablechar", "printable"), ("punctuation", "punctuationchar", "punctuation"), ("ascii_lowercase", "ascii_lowercasechar", "ascii_lowercase"), ("ascii_uppercase", "ascii_uppercasechar", "ascii_uppercase"), ): try: set = getattr( string, set) self._testSet( set.encode('ascii'), single, multiple, ) except AttributeError: raise except TypeError as err: err.args += (set,single,multiple) raise def testEOF( self ): p = Parser( """this := 'a',EOF""", 'this') success, children, next = p.parse( 'a' ) assert success, """EOF didn't match at end of string""" def testEOFFail( self ): p = Parser( """this := 'a',EOF""", 'this') success, children, next = p.parse( 'a ' ) assert not success, """EOF matched before end of string""" def testTZ( self ): names = list(timezone_names.timezone_mapping.keys()) names.sort() # tests that the items don't match shorter versions... 
decl = Parser("""this := (timezone_name, ' '?)+""", 'this') proc = dispatchprocessor.DispatchProcessor() proc.timezone_name = timezone_names.TimeZoneNameInterpreter() text = ' '.join(names) success, result, next = decl.parse( text, processor = proc ) assert success, """Unable to complete parsing the timezone names, stopped parsing at char %s %s"""%(next, text[next:]) assert result == list(map( timezone_names.timezone_mapping.get, names)), """Got different results for interpretation than expected (expected first, recieved second)\n%s\n%s"""%(list(map( timezone_names.timezone_mapping.get, names)), result) def getSuite(): return unittest.makeSuite(CommonTests, 'test') if __name__ == "__main__": unittest.main(defaultTest="getSuite") SimpleParse-2.2.0/tests/test_grammarparser.py0000644000175000017500000006423112620710227022756 0ustar mcfletchmcfletch00000000000000"""Tests that simpleparsegrammar does parse SimpleParse grammars """ import unittest,pprint from simpleparse.simpleparsegrammar import SPGenerator, declaration from simpleparse.parser import Parser from simpleparse.stt.TextTools import TextTools from .genericvalues import NullResult, AnyInt from simpleparse.stt.TextTools import print_tagtable print_tagtable( SPGenerator.buildParser( 'range' ) ) class SimpleParseGrammarTests(unittest.TestCase): """Test parsing of the the simpleparse grammar elements""" def doBasicTest(self, parserName, testValue, expected, ): parser = SPGenerator.buildParser( parserName ) result = TextTools.tag( testValue, parser ) assert result == expected, '''\nexpected:%s\n got:%s\n'''%( pprint.pformat(expected), pprint.pformat(result)) def testChar1( self ): self.doBasicTest( "CHARNODBLQUOTE", 'test\\""', (1, [], 4), ) def testChar2( self ): self.doBasicTest( "ESCAPEDCHAR", '\\n"', (1, [('SPECIALESCAPEDCHAR', 1, 2, NullResult)], 2), ) def testChar3( self ): self.doBasicTest( "ESCAPEDCHAR", '\\007"', (1, [('OCTALESCAPEDCHAR', 1, 4, NullResult)], 4), ) def testChar4( self ): testValue = '\\""' 
self.doBasicTest( "CHARNODBLQUOTE", testValue, (0, [], AnyInt), ) def testChar5( self ): self.doBasicTest( "CHARNODBLQUOTE", 'ehllo\\""', (1, [], 5), ) def testChar6( self ): self.doBasicTest( "CHARNODBLQUOTE", '007', (1, [], 3), ) def testChar7( self ): self.doBasicTest( "ESCAPEDCHAR", '\\"', (1, [('SPECIALESCAPEDCHAR', 1, 2, NullResult)], 2), ) def testChar8( self ): self.doBasicTest( "ESCAPEDCHAR", '\\"', (1, [('SPECIALESCAPEDCHAR', 1, 2, NullResult)], 2), ) def testChar9( self ): self.doBasicTest( "ESCAPEDCHAR", '\\x10', (1, [('HEXESCAPEDCHAR', 2, 4, NullResult)], 4), ) def testChar85( self ): self.doBasicTest( "HEXESCAPEDCHAR", '10', (1, [], 2), ) def testCharNoBrace1( self ): self.doBasicTest( "CHARNOBRACE", 'a-z', (1, [('CHAR', 0, 1, NullResult)], 1), ) def testCharRange1( self ): self.doBasicTest( "CHARRANGE", 'a-z', (1, [('CHARNOBRACE', 0, 1, [('CHAR', 0, 1, NullResult)]),('CHARNOBRACE', 2, 3, [('CHAR', 2, 3, NullResult)])], 3), ) def testRange1( self ): self.doBasicTest( "range", '[a-zA-Z]', (1, [ ('CHARRANGE',1,4,[ ('CHARNOBRACE', 1, 2, [('CHAR', 1, 2, NullResult)]), ('CHARNOBRACE', 3, 4, [('CHAR', 3, 4, NullResult)]), ]), ('CHARRANGE',4,7,[ ('CHARNOBRACE', 4, 5, [('CHAR', 4, 5, NullResult)]), ('CHARNOBRACE', 6, 7, [('CHAR', 6, 7, NullResult)]), ]), ], 8) ) def testRange2( self ): self.doBasicTest( "range", '[-a-zA-Z]', (1, [ ('CHARDASH', 1, 2, NullResult), ('CHARRANGE',2,5,[ ('CHARNOBRACE', 2, 3, [('CHAR', 2, 3, NullResult)]), ('CHARNOBRACE', 4, 5, [('CHAR', 4, 5, NullResult)]), ]), ('CHARRANGE',5,8,[ ('CHARNOBRACE', 5, 6, [('CHAR', 5, 6, NullResult)]), ('CHARNOBRACE', 7, 8, [('CHAR', 7, 8, NullResult)]), ]), ], 9), ) def testRange3( self ): self.doBasicTest( "range", '[]a-zA-Z]', (1, [ ('CHARBRACE', 1, 2, NullResult), ('CHARRANGE',2,5,[ ('CHARNOBRACE', 2, 3, [('CHAR', 2, 3, NullResult)]), ('CHARNOBRACE', 4, 5, [('CHAR', 4, 5, NullResult)]), ]), ('CHARRANGE',5,8,[ ('CHARNOBRACE', 5, 6, [('CHAR', 5, 6, NullResult)]), ('CHARNOBRACE', 7, 8, [('CHAR', 7, 8, 
NullResult)]), ]), ], 9), ) def testRange4( self ): """Test optional repeating children running into eof Original SimpleParse had a major failure here, system hung trying to parse the [] string. Basically, there was no check for EOF during a repeating-item parse (save for literals and character sets), so you wound up with infinite loops. """ self.doBasicTest( "range", '[]', (0, [], AnyInt), ) def testRange5( self ): """Test optional repeating children with no termination Original SimpleParse had a major failure here, system hung trying to parse the [] string. Basically, there was no check for EOF during a repeating-item parse (save for literals and character sets), so you wound up with infinite loops. """ self.doBasicTest( "range", '[] ', (0, [], AnyInt), ) def testLiteral1( self ): self.doBasicTest( "literal", '"test"', (1, [('CHARNODBLQUOTE', 1, 5, NullResult)], 6), ) def testLiteral2( self ): self.doBasicTest( "literal", '"test\\""', (1, [ ('CHARNODBLQUOTE', 1, 5, NullResult), ('ESCAPEDCHAR', 5, 7, [ ('SPECIALESCAPEDCHAR', 6, 7, NullResult) ]) ], 8) ) def testLiteral3( self ): self.doBasicTest( "literal", '""', (1, [], 2), ) def testLiteral4( self ): self.doBasicTest( "literal", '"\'"', (1, [('CHARNODBLQUOTE', 1, 2, NullResult),], 3), ) def testLiteral5( self ): self.doBasicTest( "literal", '"\\"test"', (1, [ ('ESCAPEDCHAR', 1, 3, [ ('SPECIALESCAPEDCHAR', 2, 3, NullResult) ]), ('CHARNODBLQUOTE', 3, 7, NullResult) ], 8) ) def testLiteral6( self ): self.doBasicTest( "literal", '"test\\023""', (1, [ ('CHARNODBLQUOTE', 1, 5, NullResult), ('ESCAPEDCHAR', 5, 9, [ ('OCTALESCAPEDCHAR', 6, 9, NullResult) ]) ], 10) ) def testLiteralDecorator( self ): self.doBasicTest( "literalDecorator", 'c', (1, [], 1), ) def testLiteralDecorator2( self ): self.doBasicTest( "literal", 'c"this"', (1, [('literalDecorator',0,1,NullResult),('CHARNODBLQUOTE',2,6,NullResult)], 7), ) def testLiteralDecorator3( self ): """Decorator must be right next to literal, no whitespace""" 
self.doBasicTest( "literal", 'c "this"', (0, [], AnyInt), ) def testWhitespace1( self ): self.doBasicTest( "ts", ' \t', (1, [], 3) ) def testWhitespace2( self ): self.doBasicTest( "ts", ' \t\n', (1, [], 4) ) def testWhitespace3( self ): self.doBasicTest( "ts", ' \t#testing\r\n', (1, [('comment', 3, 13, NullResult)], 13) ) def testWhitespace4( self ): self.doBasicTest( "ts", 'nospace', (1, [], 0) ) def testWhitespace5( self ): """Bug in 2.0.0 where Null comments such as: "#\n" didn't parse. """ self.doBasicTest( "ts", ' #\n ', (1, [('comment',1,3,NullResult)], 4) ) def testName1( self ): self.doBasicTest( "name", 'abcdefg', (1, [], 7) ) def testName2( self ): self.doBasicTest( "name", '2abcdefg', (0, [], AnyInt) ) def testName3( self ): self.doBasicTest( "name", '_abcdefg_-', (1, [], 9) ) def testUnreportedName1( self ): self.doBasicTest( "unreportedname", '', (1, [('name',1,8,NullResult)], 9) ) def testUnreportedName2( self ): self.doBasicTest( "unreportedname", '<>', (0, [], AnyInt) ) def testExpandedName1( self ): self.doBasicTest( "expandedname", '>abcdefg<', (1, [('name',1,8,NullResult)], 9) ) def testExpandedName2( self ): self.doBasicTest( "expandedname", '><', (0, [], AnyInt) ) def testComment1( self ): self.doBasicTest( "comment", '>', (0, [], AnyInt) ) def testComment2( self ): self.doBasicTest( "comment", '#testing\n', (1, [], 9) ) def testOccurenceIndicator1( self ): self.doBasicTest( "occurence_indicator", '*', (1, [], 1) ) def testOccurenceIndicator2( self ): self.doBasicTest( "occurence_indicator", '+', (1, [], 1) ) def testOccurenceIndicator3( self ): self.doBasicTest( "occurence_indicator", '?', (1, [], 1) ) def testOccurenceIndicator4( self ): self.doBasicTest( "occurence_indicator", 'hello', (0, [], AnyInt) ) def testOccurenceIndicator5( self ): self.doBasicTest( "occurence_indicator", '', (0, [], AnyInt) ) def testLookAheadIndicator1( self ): self.doBasicTest( "lookahead_indicator", '?', (1, [], 1) ) def testLookAheadIndicator2( self ): 
self.doBasicTest( "lookahead_indicator", '', (0, [], AnyInt) ) def testNegposIndicator1( self ): self.doBasicTest( "negpos_indicator", '-', (1, [], 1) ) def testNegposIndicator2( self ): self.doBasicTest( "negpos_indicator", '+', (1, [], 1) ) def testNegposIndicator3( self ): self.doBasicTest( "negpos_indicator", ')', (0, [], AnyInt) ) def testErrorOnFailFlag1( self ): self.doBasicTest( "error_on_fail", '!', (1, [], 1) ) def testFOGroup1( self ): self.doBasicTest( "fo_group", 'a/b', (1, [ ('element_token', 0,1,[ ("name",0,1,NullResult), ]), ('element_token', 2,3,[ ("name",2,3,NullResult), ]), ], 3) ) def testSEQToken1( self ): self.doBasicTest( "seq_group", 'a,b', (1, [ ('element_token', 0,1,[ ("name",0,1,NullResult), ]), ('element_token', 2,3,[ ("name",2,3,NullResult), ]), ], 3) ) def testSEQGroup1( self ): self.doBasicTest( "seq_group", 'a,#c\012b', (1, [ ('element_token', 0,1,[ ("name",0,1,NullResult), ]), ('element_token', 5,6,[ ("name",5,6,NullResult), ]), ], 6) ) def testSeqGroup2( self ): self.doBasicTest( "seq_group", 'ts, (unreportedname/expandedname/name)', (1, [ ('element_token', 0,2,[ ("name",0,2,NullResult), ]), ('element_token', 4,38,[ ('seq_group',5,37,[ ('fo_group',5,37,[ ('element_token', 5,19,[ ("name",5,19,NullResult), ]), ('element_token', 20,32,[ ("name",20,32,NullResult), ]), ('element_token', 33,37,[ ("name",33,37,NullResult), ]), ]), ]), ]), ], 38) ) def testSeqGroup3( self ): self.doBasicTest( "seq_group", '(a/b/c)', (1, [ ('element_token',0,7,[ ('seq_group',1,6,[ ('fo_group',1,6,[ ('element_token', 1,2,[ ("name",1,2,NullResult), ]), ('element_token', 3,4,[ ("name",3,4,NullResult), ]), ('element_token', 5,6,[ ("name",5,6,NullResult), ]), ]), ]), ]), ], 7) ) def testGroup1( self ): self.doBasicTest( "group", '()', (0, [], AnyInt) ) def testGroup2( self ): self.doBasicTest( "group", '(hello)', (1, [ ('seq_group',1,6,[ ('element_token', 1,6,[ ("name",1,6,NullResult), ]), ]), ], 7) ) def testGroup3( self ): '''Test group with sequential added 
group Note that this test also serves to test the function of non-reporting names''' self.doBasicTest( "group", '(hello, there)', (1, [ ('seq_group', 1,13,[ ('element_token', 1,6,[ ("name",1,6,NullResult), ]), ('element_token', 8,13,[ ("name",8,13,NullResult), ]), ]), ], 14) ) def testGroup4( self ): '''Test group with sequential added group Note that this test also serves to test the function of non-reporting names''' self.doBasicTest( "group", '(hello/there)', (1, [ ('seq_group',1,12,[ ('fo_group',1,12,[ ('element_token', 1,6,[ ("name",1,6,NullResult), ]), ('element_token', 7,12,[ ("name",7,12,NullResult), ]), ]), ]), ], 13) ) def testGroup5( self ): '''Test group with sequential added group Note that this test also serves to test the function of non-reporting names''' self.doBasicTest( "group", '([the]/"and")', (1, [ ('seq_group',1,12,[ ('fo_group',1,12,[ ('element_token', 1,6,[ ("range",1,6,[ ('CHARNOBRACE', 2,3,[ # this should really be a collapsed level ('CHAR', 2,3,NullResult), ]), ('CHARNOBRACE', 3,4,[ # this should really be a collapsed level ('CHAR', 3,4,NullResult), ]), ('CHARNOBRACE', 4,5,[ # this should really be a collapsed level ('CHAR', 4,5,NullResult), ]), ]), ]), ('element_token', 7,12,[ ("literal",7,12,[ ('CHARNODBLQUOTE', 8,11,NullResult), ]), ]), ]), ]), ], 13) ) def testGroup6( self ): '''Test group with multiple / 'd values''' self.doBasicTest( "group", '(hello/there/a)', (1, [ ('seq_group',1,14,[ ('fo_group',1,14,[ ('element_token', 1,6,[ ("name",1,6,NullResult), ]), ('element_token', 7,12,[ ("name",7,12,NullResult), ]), ('element_token', 13,14,[ ("name",13,14,NullResult), ]), ]), ]), ], 15) ) def testElementToken1( self ): self.doBasicTest( "element_token", 'hello', (1, [ ("name",0,5,NullResult), ], 5) ) def testElementToken2( self ): self.doBasicTest( "element_token", '-hello', (1, [ ("negpos_indicator",0,1,NullResult), ("name",1,6,NullResult), ], 6) ) def testElementToken3( self ): self.doBasicTest( "element_token", '-hello?', (1, [ 
("negpos_indicator",0,1,NullResult), ("name",1,6,NullResult), ("occurence_indicator",6,7,NullResult), ], 7) ) def testElementToken4( self ): self.doBasicTest( "element_token", '- hello ?', (1, [ ("negpos_indicator",0,1,NullResult), ("name",2,7,NullResult), ("occurence_indicator",8,9,NullResult), ], 9) ) def testElementToken5( self ): self.doBasicTest( "element_token", '+ hello ?', (1, [ ("negpos_indicator",0,1,NullResult), ("name",2,7,NullResult), ("occurence_indicator",8,9,NullResult), ], 9) ) def testElementToken6( self ): """Lookahead indicator with positive""" self.doBasicTest( "element_token", '? + hello ?', (1, [ ("lookahead_indicator",0,1,NullResult), ("negpos_indicator",2,3,NullResult), ("name",4,9,NullResult), ("occurence_indicator",10,11,NullResult), ], 11) ) def testElementToken7( self ): """Lookahead indicator with negative""" self.doBasicTest( "element_token", '? - hello ?', (1, [ ("lookahead_indicator",0,1,NullResult), ("negpos_indicator",2,3,NullResult), ("name",4,9,NullResult), ("occurence_indicator",10,11,NullResult), ], 11) ) def testElementToken8( self ): """Lookahead indicator with no neg or pos""" self.doBasicTest( "element_token", '?hello?', (1, [ ("lookahead_indicator",0,1,NullResult), ("name",1,6,NullResult), ("occurence_indicator",6,7,NullResult), ], 7) ) def testElementToken9( self ): """Error on fail indicator""" self.doBasicTest( "element_token", 'hello+!', (1, [ ("name",0,5,NullResult), ("occurence_indicator",5,6,NullResult), ("error_on_fail",6,7,NullResult), ], 7) ) def testElementToken10( self ): """Error on fail indicator with message""" self.doBasicTest( "element_token", 'hello+! 
"Unable to complete parse, yikes!"', (1, [ ("name",0,5,NullResult), ("occurence_indicator",5,6,NullResult), ("error_on_fail",6,42,[ ("literal",8,42,[ ("CHARNODBLQUOTE",9,41,NullResult), ]), ]), ], 42) ) def testCutToken2( self ): self.doBasicTest( "element_token", '(!,a)', (1, [ ('seq_group', 1,4, [ ("error_on_fail",1,2,NullResult), ('element_token',3,4,[ ("name",3,4,NullResult), ]), ]), ], 5) ) def testCutToken3( self ): self.doBasicTest( "element_token", '(a,!"this")', (1, [ ('seq_group', 1,10, [ ('element_token',1,2,[ ("name",1,2,NullResult), ]), ("error_on_fail",3,10,[ ("literal",4,10,[ ("CHARNODBLQUOTE",5,9,NullResult), ]), ]), ]), ], 11) ) def testCutToken4( self ): self.doBasicTest( "element_token", '(a,!"this",b)', (1, [ ('seq_group', 1,12, [ ('element_token',1,2,[ ("name",1,2,NullResult), ]), ("error_on_fail",3,10,[ ("literal",4,10,[ ("CHARNODBLQUOTE",5,9,NullResult), ]), ]), ('element_token',11,12,[ ("name",11,12,NullResult), ]), ]), ], 13) ) def testDeclaration( self ): self.doBasicTest( "declaration", 'a := "a"', (1, [ ("name",0,1,NullResult), ('seq_group',4,8,[ ('element_token', 5,8,[ ("literal",5,8,[ ('CHARNODBLQUOTE', 6,7,NullResult), ]), ]), ]), ], 8) ) def testDeclaration2( self ): self.doBasicTest( "declaration", 'a := b', (1, [ ("name",0,1,NullResult), ('seq_group',4,6,[ ('element_token', 5,6,[ ("name",5,6,NullResult), ]) ]), ], 6) ) def testDeclaration3( self ): self.doBasicTest( "declaration", 'a := ', (0,[],AnyInt) ) def testDeclaration4( self ): self.doBasicTest( "declaration", ' := b', (1, [ ("unreportedname",0,3,[ ("name",1,2,NullResult), ]), ('seq_group',6,8,[ ('element_token', 7,8,[ ("name",7,8,NullResult), ]), ]) ], 8) ) def testDeclaration5( self ): self.doBasicTest( "declaration", '>a< := b', (1, [ ("expandedname",0,3,[ ("name",1,2,NullResult), ]), ('seq_group',6,8,[ ('element_token', 7,8,[ ("name",7,8,NullResult), ]) ]), ], 8) ) def testDeclarationSet1( self ): self.doBasicTest( "declarationset", 'a := b #hello\012b:="c"', (1, [ 
('declaration', 0,15,[ ("name",0,1,NullResult), ('seq_group',4,15,[ ('element_token', 5,15,[ ("name",5,6,NullResult), ]) ]) ]), ('declaration', 15,21,[ ("name",15,16,NullResult), ('seq_group',18,21,[ ('element_token', 18,21,[ ("literal",18,21,[ ('CHARNODBLQUOTE', 19,20,NullResult), ]), ]), ]), ]), ], 21) ) def testDeclarationSet2( self ): '''Just tries to parse and sees that everything was parsed, doesn't predict the result''' parser = SPGenerator.buildParser( "declarationset" ) result = TextTools.tag( declaration, parser ) assert result[-1] == len(declaration), '''Didn't complete parse of the simpleparse declaration, only got %s chars, should have %s'''%(result[-1], len(declaration)) recursiveParser = Parser(declaration) class SimpleParseRecursiveTests(SimpleParseGrammarTests): """Test parsing of grammar elements with generated version of simpleparse grammar""" def doBasicTest(self, parserName, testValue, expected, ): result = recursiveParser.parse( testValue, production=parserName ) assert result == expected, '''\nexpected:%s\n got:%s\n'''%( expected, result ) def getSuite(): return unittest.TestSuite(( unittest.makeSuite(SimpleParseGrammarTests,'test'), unittest.makeSuite(SimpleParseRecursiveTests,'test'), )) if __name__ == "__main__": unittest.main(defaultTest="getSuite") SimpleParse-2.2.0/tests/test_objectgenerator.py0000644000175000017500000002742112620710227023270 0ustar mcfletchmcfletch00000000000000import unittest from simpleparse.objectgenerator import * from .genericvalues import AnyInt try: _unichr = unichr except NameError: _unichr = chr class ElementTokenTests(unittest.TestCase): def doBasicTest(self, instance, testvalue, expected, startPosition=0 ): table = tuple(instance.toParser()) result = tag( testvalue, table , startPosition) assert result == expected, '''\n\texpected:%s\n\tgot:%s\n'''%( expected, result ) def testString1( self ): self.doBasicTest( Literal( value = 'test' ), 'test', (1, [],4), ) def testString2( self ): self.doBasicTest( 
Literal( value = 'test', optional =1 ), 'test', (1, [],4), ) def testString3( self ): self.doBasicTest( Literal( value = 'test', optional =1, negative=1 ), 'test', (1, [],0), ) def testString4( self ): self.doBasicTest( Literal( value = 'test', negative=1 ), 'test', (0, [],AnyInt), ) def testString5( self ): self.doBasicTest( Literal( value = 'test', repeating=1), 'testtest', (1, [],8), ) def testString6( self ): self.doBasicTest( Literal( value = 'test', repeating=1, optional = 1), 'testtest', (1, [],8), ) def testString7( self ): self.doBasicTest( Literal( value = 'test', repeating=1, optional = 1, negative = 1), 'testtest', (1, [],0), ) def testString8( self ): """Test repeating negative string""" self.doBasicTest( Literal( value = 'test', repeating=1, negative = 1), 'testtest', (0, [],AnyInt), ) def testString9( self ): self.doBasicTest( Literal( value = '\\',), '\\', (1, [],1), ) def testRange1( self ): self.doBasicTest( Range( value = 'abc'), 'aabbcc', (1, [],1), ) def testRange2( self ): self.doBasicTest( Range( value = 'abc', optional=1), 'aabbcc', (1, [],1), ) def testRange3( self ): self.doBasicTest( Range( value = 'abc', optional=1, repeating=1), 'aabbcc', (1, [],6), ) def testRange4( self ): self.doBasicTest( Range( value = 'abc', optional=1, repeating=1, negative=1), 'aabbcc', (1, [],0), ) def testRange5( self ): self.doBasicTest( Range( value = 'abc', optional=1, negative=1), 'aabbcc', (1, [],0), ) def testRange6( self ): self.doBasicTest( Range( value = 'abc', negative=1), 'aabbcc', (0, [],AnyInt), ) def testRange7( self ): self.doBasicTest( Range( value = 'abc', negative=1, repeating=1), 'aabbcc', (0, [],AnyInt), ) def testRange8( self ): self.doBasicTest( Range( value = 'abc', negative=1, repeating=1), 'defc', (1, [],3), ) def testRange9( self ): self.doBasicTest( Range( value = 'abc', negative=1), 'defc', (1, [],1), ) def testUnicodeRange10( self ): urange = Range( value = u''.join([_unichr(x) for x in range( 0x600, 0x6FF+1 )]), repeating=True ) 
self.doBasicTest( urange, u'\u0600\u06FF', (1,[],2), ) def testSequential1( self ): self.doBasicTest( SequentialGroup( children = [ Range( value = 'abc',), Literal( value = 'test', ), ], negative=0, ), 'atest', (1, [],5), ) def testSequential2( self ): self.doBasicTest( SequentialGroup( children = [ Range( value = 'abc',), Literal( value = 'test', ), ], negative=1, ), 'atest', (0, [],AnyInt), ) def testSequential3( self ): self.doBasicTest( SequentialGroup( children = [ Range( value = 'abc',), Literal( value = 'test', ), ], negative=1, optional=1, ), 'atest', (1, [],0), ) def testSequential4( self ): self.doBasicTest( SequentialGroup( children = [ Range( value = 'abc',), Literal( value = 'test', ), ], negative=1, optional=1, repeating=1, ), 'sdatest', (1, [],2), ) def testSequential5( self ): self.doBasicTest( SequentialGroup( children = [ Range( value = 'abc',), Literal( value = 'test', ), ], optional=1, repeating=1, ), 'atestbtestctest', (1, [],15), ) def testSequential6( self ): self.doBasicTest( SequentialGroup( children = [ Range( value = 'abc',), Literal( value = 'test', ), ], optional=1, ), 'atestbtestctest', (1, [],5), ) def testSequential7( self ): self.doBasicTest( SequentialGroup( children = [ Range( value = 'abc',), Literal( value = 'test', ), ], optional=1, ), 'satestbtestctest', (1, [],0), ) def testFirstOf1( self ): self.doBasicTest( FirstOfGroup( children = [ Range( value = 'abc',), Literal( value = 'test', ), ], negative=0, ), 'atest', (1, [],1), ) def testFirstOf2( self ): self.doBasicTest( FirstOfGroup( children = [ Range( value = 'abc',), Literal( value = 'test', ), ], negative=0, ), 'testa', (1, [],4), ) def testFirstOf3( self ): self.doBasicTest( FirstOfGroup( children = [ Range( value = 'abc',), Literal( value = 'test', ), ], negative=1, ), 'testa', (0, [],AnyInt), ) def testFirstOf4( self ): self.doBasicTest( FirstOfGroup( children = [ Range( value = 'abc',), Literal( value = 'test', ), ], negative=1, optional=1, ), 'testa', (1, [],0), ) def 
testFirstOf5( self ): self.doBasicTest( FirstOfGroup( children = [ Range( value = 'abc',), Literal( value = 'test', ), ], repeating=1, ), 'testabtest', (1, [],10), ) def testFirstOf6( self ): self.doBasicTest( FirstOfGroup( children = [ Range( value = 'abc',), Literal( value = 'test', ), ], repeating=1, negative = 1, ), 'hellotheretestabtest', (1, [],10), ) def testCIString1( self ): self.doBasicTest( CILiteral( value = 'test'), 'test', (1, [],4), ) def testCIString2( self ): self.doBasicTest( CILiteral( value = 'test'), 'Test', (1, [],4), ) def testCIString3( self ): self.doBasicTest( CILiteral( value = 'test'), 'TEST', (1, [],4), ) def testCIString4( self ): self.doBasicTest( CILiteral( value = 'test'), 'tes', (0, [],AnyInt), ) def testCIString5( self ): self.doBasicTest( CILiteral( value = 'test', optional=1), 'tes', (1, [], 0), ) ### Simpleparse 2.0.0b4 introduced an explicit check that ## rejects FOGroups with optional children to prevent ## infinite recursions ## def testFirstOf7( self ): ## '''([abc]?/"test"?)* ## ## Demonstrates a recently fixed error, namely a fix to the repeating ## code which explicitly checks for EOF condition during repeating ## loops. Result is that this condition should be handled correctly. ## ## Old Note: ## This test exposes a problem with both the original generator ## and the sub-class here. FOGroups with optional children are ## in danger of never returning as the children always "succeed" ## even if they consume nothing. 
## Failure in this case is likely to be an endless loop, so we ## can expect that if this is broken there will be heck to pay ;) ## ''' ## generator = FirstOfGroup( ## children = [ ## Range( value = 'abc', optional=1), ## Literal( value = 'test', optional=1), ## ], ## repeating=1, optional=1, ## ) ## self.doBasicTest( ## generator, ## 'testabtest', ## (1, [],10), ## ) ## generator = FirstOfGroup( ## children = [ ## Range( value = 'abc', optional=1), ## Literal( value = 'test', optional=1), ## SequentialGroup( ## children = [ ## Literal( value = 'm', optional=1), ## Literal( value = 'n', optional=1), ## ], ## ), ## ], ## repeating=1, optional=1, ## ) ## self.doBasicTest( ## generator, ## 'testmnabtest', ## (1, [],12), ## ) def testNegative1( self ): self.doBasicTest( Literal( value = 's', negative=1), 's\\', (0, [],AnyInt), ) def testNegative2( self ): self.doBasicTest( Literal( value = 's', negative=1), 'asa\\', (1, [],1), ) def testNegative3( self ): self.doBasicTest( Literal( value = 's', negative=1, repeating=1), 'aasa\\', (1, [],2), ) def testNegative4( self ): self.doBasicTest( Literal( value = 's', negative=1, repeating=1, optional=1), 'a', (1, [],1), ) def testNegative4a( self ): self.doBasicTest( Literal( value = 's', negative=1, repeating=1, optional=1), 'as', (1, [],1), ) def testNegative4b( self ): self.doBasicTest( Literal( value = 's', negative=1, repeating=1, optional=1), 'sas', (1, [],0), ) def testNegative5( self ): self.doBasicTest( Range( value = 'sat', negative=1), 'aasat\\', (0, [],AnyInt), ) def testNegative6( self ): self.doBasicTest( Range( value = 'sat', negative=1, repeating=1), 'aasat\\', (0, [],AnyInt), ) def testNegative7( self ): self.doBasicTest( Range( value = 'sat', negative=1, repeating=1, optional=1), 'aasat\\', (1, [],0), ) def getSuite(): return unittest.makeSuite(ElementTokenTests,'test') if __name__ == "__main__": unittest.main(defaultTest="getSuite") 
SimpleParse-2.2.0/tests/test_common_iso_date.py0000644000175000017500000001367612620706017023263 0ustar mcfletchmcfletch00000000000000import unittest, string, logging from simpleparse.parser import Parser from simpleparse.common import iso_date, iso_date_loose log = logging.getLogger(__name__) try: from mx import DateTime except ImportError: log.warn("No mx.DateTime module available") else: import time try: fulltrans = string.maketrans(b"",b"") except AttributeError: fulltrans = bytes.maketrans(b"",b"") tzOffset = DateTime.DateTimeDelta( 0,0,0, time.timezone ) class CommonTests(unittest.TestCase): def testISODateLoose( self ): """Test the parsing of ISO date and time formats""" values = [ ("2002-02-03", DateTime.DateTime( 2002, 2,3)), ("2002-02",DateTime.DateTime( 2002, 2)), ("2002",DateTime.DateTime( 2002)), ("2002-02-03 04:15", DateTime.DateTime( 2002, 2,3, 4,15)), ("2002-02-03 04:15:16", DateTime.DateTime( 2002, 2,3, 4,15, 16)), ("2002-02-03 04:15:16 +00:00", DateTime.DateTime( 2002, 2,3, 4,15, 16)-tzOffset), ("2002-02-03 4:5", DateTime.DateTime( 2002, 2,3, 4,5)), ("2002-02-03 4:5:16", DateTime.DateTime( 2002, 2,3, 4,5, 16)), ("2002-02-03 4:5:16 +00:00", DateTime.DateTime( 2002, 2,3, 4, 5,16)-tzOffset), ] p = Parser ("d:= ISO_date_time_loose", "d") proc = iso_date_loose.MxInterpreter() for to_parse, date in values: success, children, next = p.parse( to_parse, processor = proc) assert success, """Unable to parse any of the string %s with the ISO date-time parser"""% (to_parse) assert next == len(to_parse),"""Did not finish parsing string %s with the ISO date-time parser, remainder was %s, found was %s"""%( to_parse, to_parse [next:],children) assert children [0] == date,"""Returned different date for string %s than expected, got %s, expected %s"""% (to_parse,children [0], date) def testISODate( self ): """Test the parsing of ISO date and time formats""" values = [ ("2002-02-03", DateTime.DateTime( 2002, 2,3)), ("2002-02",DateTime.DateTime( 2002, 2)), 
("2002",DateTime.DateTime( 2002)), ("2002-02-03T04:15", DateTime.DateTime( 2002, 2,3, 4,15)), ("2002-02-03T04:15:16", DateTime.DateTime( 2002, 2,3, 4,15, 16)), ("2002-02-03T04:15:16+00:00", DateTime.DateTime( 2002, 2,3, 4,15, 16)-tzOffset), ] p = Parser ("d:= ISO_date_time", "d") proc = iso_date.MxInterpreter() for to_parse, date in values: success, children, next = p.parse( to_parse, processor=proc) assert success, """Unable to parse any of the string %s with the ISO date-time parser"""% (to_parse) assert next == len(to_parse),"""Did not finish parsing string %s with the ISO date-time parser, remainder was %s, found was %s"""%( to_parse, to_parse [next:],children) assert children [0] == date,"""Returned different date for string %s than expected, got %s, expected %s"""% (to_parse,children [0], date) def testProductionsStrict( self ): for to_parse, production in [ ("2002", "year"), ("02", "month"), ("02", "day"), ("24:00:00", "ISO_time"), ("02", "ISO_time"), (":", "time_separator"), ("02:02", "ISO_time"), ("02:02:02", "ISO_time"), ("2002-02-30", "ISO_date"), ("2002-02-30", "ISO_date_time"), ("02", "hour"), ("02", "minute"), ("02", "second"), ("20", "second"), ("+0500", "offset"), ("+00:00", "offset"), ("-", "offset_sign"), ("-00:00", "offset"), ("-04:00", "offset"), ("-0500", "offset"), ("02:13", "ISO_time"), ("02:13:16", "ISO_time"), ("2002-02-01T02:13-0500", "ISO_date_time"), ]: success, children, next = iso_date._p.parse( to_parse,production) assert next == len(to_parse), "couldn't parse %s as a %s"%( to_parse, production) def testProductions2( self ): for to_parse, production in [ ("2002", "year"), ("02", "month"), ("02", "day"), ("24:00:00", "ISO_time_loose"), ("02", "ISO_time_loose"), (":", "time_separator"), ("02:02", "ISO_time_loose"), ("02:02:02", "ISO_time_loose"), ("2002-02-30", "ISO_date_loose"), ("2002-02-30", "ISO_date_time_loose"), ("2002-2-1", "ISO_date_time_loose"), ("02", "hour"), ("02", "minute"), ("2", "second"), ("02", "second"), ("20", 
"second"), ("20.", "second"), ("20.3", "second"), ("+0500", "offset"), ("+00:00", "offset"), ("-", "offset_sign"), ("-00:00", "offset"), ("-04:00", "offset"), ("-0500", "offset"), ("02:13", "ISO_time_loose"), ("02:13:16", "ISO_time_loose"), ("2002-2-1 2:13", "ISO_date_time_loose"), ("2002-2-1 2:13 -0500", "ISO_date_time_loose"), ("2002-2-1 2:13 -05:30", "ISO_date_time_loose"), ("2002-2-1 2:13 +05:30", "ISO_date_time_loose"), ("2002-2-1 2:13 +00:00", "ISO_date_time_loose"), ]: success, children, next = iso_date_loose._p.parse( to_parse,production ) assert next == len(to_parse), "couldn't parse %s as a %s"%( to_parse, production) SimpleParse-2.2.0/tests/mx_flag.py0000644000175000017500000000744512620706017020477 0ustar mcfletchmcfletch00000000000000import unittest, pprint from simpleparse.stt.TextTools import * from simpleparse.stt import TextTools mxVersion = tuple(TextTools.__version__.split('.')[:3]) class MXFlagTests(unittest.TestCase): """Test Flags for returning/calling different functions on success""" def doBasicTest(self, table, testvalue, expected, startPosition=0 ): result = tag( testvalue, table , startPosition) assert result == expected, '''\n\texpected:%s\n\tgot:%s\n'''%( expected, result ) ### Return-type handling tests... 
def testCallTag1( self ): """Test CallTag""" def function (parentList, text, l,r,children): parentList.append( (text[l:r], children) ) self.doBasicTest( ( ( function, AllIn + CallTag, "ab", 0 ), ), "abbaabccd", ( 1,[ ("abbaab",None), ],6), ) def testCallTag2( self ): """Test CallTag with a class instance""" class A: def __call__(self, parentList, text, l,r,children): parentList.append( (text[l:r], children) ) self.doBasicTest( ( ( A(), AllIn + CallTag, "ab", 0 ), ), "abbaabccd", ( 1,[ ("abbaab",None), ],6), ) def testAppendMatch1( self ): """Test AppendMatch""" def function (parentList, text, l,r,children): parentList.append( (text[l:r], children) ) self.doBasicTest( ( ( function, AllIn + AppendMatch, "ab", 0 ), ), "abbaabccd", ( 1,[ "abbaab", ],6), ) def testAppendToTagobj1( self ): """Test AppendToTagobj""" class X: successful = "" def append(self, value): self.successful = value tag = X() self.doBasicTest( ( ( tag, AllIn + AppendToTagobj, "ab", 0 ), ), "abbaabccd", ( 1,[ ],6), ) assert tag.successful == (None,0,6,None), "TagObject's append was called with %s"%(repr(tag.successful),) def testAppendToTagobj2( self ): """Test AppendToTagobj with a simple list""" tag = [] self.doBasicTest( ( ( tag, AllIn + AppendToTagobj, "ab", 0 ), ), "abbaabccd", ( 1,[ ],6), ) assert tag[0] == (None,0,6,None), "TagObject's append was called with %s"%(repr(tag.successful),) def testAppendTagobj1( self ): """Test AppendTagobj""" self.doBasicTest( ( ( "Hi there world!", AllIn + AppendTagobj, "ab", 0 ), ), "abbaabccd", ( 1,[ "Hi there world!", ],6), ) if mxVersion >= ('2','1'): def testLookAhead1( self ): """Test LookAhead""" self.doBasicTest( ( ( "whatever", AllIn + LookAhead, "ab", 0 ), ), "abbaabccd", ( 1,[ ("whatever",0,6,None), ],0), ) def testLookAhead2( self ): """Test LookAhead""" self.doBasicTest( ( ( "whatever", AllIn + LookAhead, "ab", 0 ), ( "whatever2", AllIn, "ab", 0 ), ), "abbaabccd", ( 1,[ ("whatever",0,6,None), ("whatever2",0,6,None), ],6), ) def getSuite(): return 
unittest.makeSuite(MXFlagTests,'test') if __name__ == "__main__": unittest.main(defaultTest="getSuite") SimpleParse-2.2.0/tests/mx_recursive.py0000644000175000017500000001102112620706017021556 0ustar mcfletchmcfletch00000000000000"""Low-level matching tests for mx.TextTools""" import unittest, pprint from simpleparse.stt.TextTools import * ab = ( ( "ab", Word, "ab", 0 ), ) cdef = ( ( "cd", Word, "cd", 0 ), ( "ef", Word, "ef", 1,1 ), ) tableList = [ ab, cdef ] class MXRecursiveTests(unittest.TestCase): def doBasicTest(self, table, testvalue, expected, startPosition=0 ): result = tag( testvalue, table , startPosition) assert result == expected, '''\n\texpected:%s\n\tgot:%s\n'''%( expected, result ) def testAB( self ): """Test AB testing command""" self.doBasicTest( ab, "abcdef", ( 1,[ ("ab",0,2,None), ],2), ) def testCDEF( self ): """Test CDEF testing command""" self.doBasicTest( cdef, "cdef", ( 1,[ ("cd",0,2,None), ("ef",2,4,None), ],4), ) def testABCDEF( self ): """Test abcdef all together""" self.doBasicTest( ab+cdef, "abcdef", ( 1,[ ("ab",0,2,None), ("cd",2,4,None), ("ef",4,6,None), ],6), ) def testTable1( self ): """Test Table command""" self.doBasicTest( ( ("first", Table, ab), ("second", Table, cdef), ), "abcdef", ( 1,[ ("first",0,2,[ ("ab",0,2,None), ]), ("second",2,6,[ ("cd",2,4,None), ("ef",4,6,None), ]), ],6), ) def testTableInList1( self ): """Test TableInList command""" self.doBasicTest( ( ("first", TableInList, (tableList,0)), ("second", TableInList,(tableList,1)), ), "abcdef", ( 1,[ ("first",0,2,[ ("ab",0,2,None), ]), ("second",2,6,[ ("cd",2,4,None), ("ef",4,6,None), ]), ],6), ) def testSubTable1( self ): """Test SubTable command""" self.doBasicTest( ( ("first", SubTable, ab), ("second", SubTable, cdef), ), "abcdef", ( 1,[ ("ab",0,2,None), ("first", 0,2, None), ("cd",2,4,None), ("ef",4,6,None), ("second", 2,6, None), ],6), ) def testSubTable2( self ): """Test SubTable command with no reporting of st groups""" self.doBasicTest( ( (None, SubTable, ab), 
(None, SubTable, cdef), ), "abcdef", ( 1,[ ("ab",0,2,None), ("cd",2,4,None), ("ef",4,6,None), ],6), ) def testSubTableInList1( self ): """Test SubTableInList command""" self.doBasicTest( ( ("first", SubTableInList, (tableList,0)), ("second", SubTableInList, (tableList,1)), ), "abcdef", ( 1,[ ("ab",0,2,None), ("first", 0,2, None), ("cd",2,4,None), ("ef",4,6,None), ("second", 2,6, None), ],6), ) def testSubTableNotReturnRecursive( self ): """Test that SubTable calls don't return a recursive structure""" result = tag( "abcdef", ( ("first", SubTableInList, (tableList,0)), ("second", SubTableInList, (tableList,1)), ), 0) assert result [1] is not result[1][1][3], """Subtable results list was the same list as the list enclosing it, looped data structure created""" def testSubTableInList2( self ): """Test SubTable command with no reporting of st groups""" self.doBasicTest( ( (None, SubTableInList, (tableList,0)), (None, SubTableInList, (tableList,1)), ), "abcdef", ( 1,[ ("ab",0,2,None), ("cd",2,4,None), ("ef",4,6,None), ],6), ) def getSuite(): return unittest.makeSuite(MXRecursiveTests,'test') if __name__ == "__main__": unittest.main(defaultTest="getSuite")SimpleParse-2.2.0/tests/test_optimisation.py0000644000175000017500000001136512620706017022634 0ustar mcfletchmcfletch00000000000000from __future__ import print_function import unittest, pprint, traceback from simpleparse.parser import Parser from simpleparse import printers def rcmp( table1, table2 ): """Silly utility function to get around text search object lack of __cmp__""" if len(table1) != len(table2): return 0 else: for x,y in zip(table1, table2): if not _rcmp( x,y): return 0 return 1 def _rcmp( item1, item2 ): if len(item1) != len(item2): return 0 if item1[1] in (204,): if cmp(item1[:2], item2[:2]) != 0: return 0 try: if not rcmp( item1[2][0][item1[2][1]], item2[2][0][item2[2][1]]): return 0 except TypeError: print(item1) print(item2) elif item1[1] == 207: if item2[:2] != item2[:2]: return 0 if not rcmp( 
item1[2], item2[2]): return 0 else: for a,b in zip(item1, item2): if hasattr(a,'match') and hasattr(b,'match'): if not (a.match == b.match and a.translate == b.translate): return 0 elif a != b: return 0 return 1 class OptimisationTests(unittest.TestCase): def testTermCompression( self ): """Test that unreported productions are compressed Term compression is basically an inlining of terminal expressions into the calling table. At the moment the terminal expressions are all duplicated, which may balloon the size of the grammar, not sure if this will be an actual problem. As written, this optimization should provide a significant speed up, but there may the even more of a speed up if we allow for sharing the terminal tuples as well. This: a:=b := -c* c:='this' Should eventually compress to this: a := -'this'* """ failures = [] for first, second in [ ("""a:=b := -c* c:='this'""", """a := -'this'*"""), ("""a:=b >b<:= c c:= 'this'""", """a := c c:= 'this'"""), ("""a:=b >b<:= c := 'this'""", """a := 'this'"""), ("""a:=b >b<:= c+ := 'this'""", """a := 'this'+"""), # The following will never work, so eventually may raise # an error or at least give a warning! 
("""a:=b,c >b<:= c+ := 'this'""", """a := 'this'+,'this'"""), ("""a:=b/c >b<:= c+ := 'this'""", """a := 'this'+/'this'"""), # This is requiring group-compression, which isn't yet written ("""a:=-b/c >b<:= c+ := 'this'""", """a := -'this'+/'this'"""), ("""a := (table1 / table2 / any_line)* := ANY*, EOL := -EOL := '\n' table1 := 'a' table2 := 'b' """, """a := (table1 / table2 / (-'\n'*, '\n'))* table1 := 'a' table2 := 'b' """), ("""a:= b,c := -c* := '\n'""", """a := -'\n'*,'\n'"""), ]: pFirst = Parser( first, "a") pSecond = Parser( second, "a") tFirst = pFirst.buildTagger() tSecond = pSecond.buildTagger() if not rcmp( tFirst , tSecond): tFirstRepr = pprint.pformat(tFirst) tSecondRepr = pprint.pformat(tSecond) failures.append( """%(first)r did not produce the same parser as %(second)r\n\t%(tFirstRepr)s\n\t%(tSecondRepr)s"""%locals()) if failures: raise ValueError( "\n".join(failures)) def testTermSharing( self ): """Test that shared terminal productions are using the same parser""" first =""" a := b,b >b<:= d d:= 'this'""" pFirst = Parser( first, "a") tFirst = pFirst.buildTagger() b,c = tFirst assert b is c, """Not sharing the same tuple for b and c instances""" def testNoReportPassDown( self ): """Test that a non-reporting production does not produce reporting sub-productions""" first =""" a := b := d,e d:= e e:= 'this'""" second =""" a := 'this' """ assert Parser( first, 'a').parse( 'thisthis' ) == (1,[ ],8) def testNameCollapseForSimple( self ): """Test that a name reference, given a single-item reporting avoids extra table""" first =""" a := b,b b:= 'this'""" # The result should be... 
expected = ( ('b',21,'this'),('b',21,'this')) table = Parser( first, 'a').buildTagger( ) assert table == expected, "%s != %s"%( pprint.pformat( table), pprint.pformat(expected), ) def getSuite(): return unittest.makeSuite(OptimisationTests,'test') if __name__ == "__main__": unittest.main(defaultTest="getSuite") SimpleParse-2.2.0/tests/test_erroronfail.py0000644000175000017500000000646612620706017022445 0ustar mcfletchmcfletch00000000000000import unittest, pprint from simpleparse.parser import Parser from simpleparse.error import ParserSyntaxError class ErrorOnFailTests( unittest.TestCase ): """Tests of the error-on failure mechanisms""" def shouldRaise(self, definition, parserName, testValue, ): self.assertRaises( ParserSyntaxError, Parser( definition).parse, testValue, parserName ) def shouldNotRaise(self, definition, parserName, testValue, ): success,result, next = Parser( definition).parse( testValue, parserName ) assert success, """Didn't parse %s (error on fail test for definition %s)"""%( repr(testValue), repr(definition)) def testErrorOnFail1( self ): self.shouldRaise( '''s := -trailer! trailer := "bad" ''', 's', 'badba', ) def testErrorOnFail2( self ): self.shouldRaise( '''s := -"bad"! ''', 's', 'badba', ) def testErrorOnFail3( self ): self.shouldRaise( '''s := -(a,b)! a := "a" b := "b" ''', 's', 'abdba', ) def testErrorOnFail4( self ): self.shouldRaise( '''s := -[ab]! ''', 's', 'abdba', ) def testErrorOnFail5( self ): self.shouldRaise( '''s := !,'a','b' ''', 's', 'badba', ) def testErrorOnFail6( self ): self.shouldNotRaise( '''s := 'a',!,'b' ''', 's', 'abdba', ) def testErrorOnFail7( self ): self.shouldNotRaise( '''s := 'a',!,'b'? ''', 's', 'acbdba', ) def testErrorOnFail8( self ): self.shouldRaise( '''s := 'a',!,'b' ''', 's', 'acbdba', ) def testErrorOnFail9( self ): self.shouldRaise( '''s := !,'a','b' ''', 's', 'bcbdba', ) def testErrorOnFail10( self ): """Test for use of setting message in definition""" self.shouldRaise( '''s := 'a',! 
"Blargh!",'b' ''', 's', 'acbdba', ) def testErrorOnFail11( self ): """Test proper setting of err message text from !"message" syntax""" try: Parser( '''s := 'a',! "Blargh!",'b' ''', 's' ).parse( 'acbdba', ) except ParserSyntaxError as err: assert err.args[0] == "Blargh!", """Error message was %r, should have been "Blargh!"."""%(err.args[0],) def testErrorOnFail12( self ): """Test proper setting of err message text from !"message" syntax""" try: Parser( '''s := 'a',! "Blargh!",'b' ''', 's' ).parse( 'acbdba', ) except ParserSyntaxError as err: description = str( err ) assert description == 'ParserSyntaxError: Blargh!', """Didn't get expected error description, got: %s"""%( str(err), ) def getSuite(): return unittest.makeSuite(ErrorOnFailTests,'test') if __name__ == "__main__": unittest.main(defaultTest="getSuite")SimpleParse-2.2.0/tests/test_printers.py0000644000175000017500000000373312620710227021761 0ustar mcfletchmcfletch00000000000000"""Test the print-to-python-file module This just uses the simpleparsegrammar declaration, which is parsed, then linearised, then loaded as a Python module. """ import os, unittest, shutil from . 
try:
    reload  # Python 2 builtin
except NameError:
    from importlib import reload  # Python 3

# legacy default path; kept for backward compatibility, but each test now
# records its own generated-module path on the instance (see setUp/tearDown)
testModuleFile = 'test_printers_garbage.py'
HERE = os.path.dirname(__file__)
TEST_DIR = os.path.join( HERE, 'tempmodules' )

def setUp(self):
    # NOTE(review): module-level fixture taking ``self`` -- looks intended as a
    # suite-level setup hook; signature left unchanged, confirm against runner
    if os.path.exists(TEST_DIR):
        shutil.rmtree( TEST_DIR )
    os.makedirs(TEST_DIR)
    open( os.path.join(TEST_DIR,'__init__.py'), 'w' ).close()

def tearDown(self):
    shutil.rmtree( TEST_DIR )

class PrintersTests(test_grammarparser.SimpleParseGrammarTests):
    """Re-run the grammar-parser tests against a printed-and-reimported parser.

    setUp linearises the simpleparsegrammar declaration to a generated Python
    module, imports it, and builds a recursive parser from it.
    """
    def setUp( self ):
        from simpleparse import simpleparsegrammar, parser, printers, baseparser
        name = self.id().split('.')[-1]
        filename = name + '.py'
        # store the generated file on the instance so tearDown removes the
        # file we actually created (previously tearDown removed the stale
        # module-level ``testModuleFile`` constant instead)
        self.testModuleFile = os.path.join(TEST_DIR,filename)
        p = parser.Parser( simpleparsegrammar.declaration, 'declarationset')
        with open(self.testModuleFile,'w') as fh:
            fh.write(printers.asGenerator( p._generator ))
        mod_name = '%s.tempmodules.%s'%(__name__.rsplit('.',1)[0],name,)
        test_printers_garbage = __import__( mod_name,{},{},mod_name.split('.') )
        reload( test_printers_garbage )
        class RParser( test_printers_garbage.Parser, baseparser.BaseParser ):
            pass
        self.recursiveParser = RParser()
    def tearDown( self ):
        try:
            os.remove( self.testModuleFile )
        except (OSError,IOError):
            pass
    def doBasicTest(self, parserName, testValue, expected ):
        """Parse testValue with the regenerated parser and compare to expected."""
        result = self.recursiveParser.parse( testValue, production=parserName )
        assert result == expected, '''\nexpected:%s\n     got:%s\n'''%( expected, result )

def getSuite():
    return unittest.makeSuite(PrintersTests,'test')

if __name__ == "__main__":
    unittest.main(defaultTest="getSuite")
"""SimpleParse-2.2.0/tests/test_backup_on_subtable_failure.py0000644000175000017500000000107412620706017025442 0ustar mcfletchmcfletch00000000000000from __future__ import print_function declaration = r'''testparser := (a,b)* a := 'a' b := 'b' ''' testdata = 'aba' expectedResult = (1, [('a',0,1,[]), ('b',1,2,[])], 2 ) from simpleparse.simpleparsegrammar import Parser from simpleparse.stt.TextTools import TextTools import pprint parser = Parser( declaration ).generator.buildParser('testparser' ) result = TextTools.tag( testdata, parser ) if result != expectedResult: print('backup-on-subtable-test failed') print('\texpected', pprint.pprint( expectedResult )) print('\tgot', pprint.pprint( result )) SimpleParse-2.2.0/simpleparse/0000755000175000017500000000000012620710576017665 5ustar mcfletchmcfletch00000000000000SimpleParse-2.2.0/simpleparse/common/0000755000175000017500000000000012620710576021155 5ustar mcfletchmcfletch00000000000000SimpleParse-2.2.0/simpleparse/common/calendar_names.py0000644000175000017500000000556312620706017024467 0ustar mcfletchmcfletch00000000000000"""Locale-specific calendar names (day-of-week and month-of-year) These values are those returned by the calendar module. Available productions: locale_day_names locale_day_names_uc locale_day_names_lc Names for the days of the week locale_day_abbrs locale_day_abbrs_uc locale_day_abbrs_lc Short-forms (3 characters normally) for the days of the week. locale_month_names locale_month_names_uc locale_month_names_lc Names for the months of the year locale_month_abbrs locale_month_abbrs_uc locale_month_abbrs_lc Short-forms (3 characters normally) for the months of the year Interpreters: MonthNameInterpreter DayNameInterpreter Both offer the ability to set an index other than the default (of 1) for the first item in the list. 
""" import calendar from simpleparse import objectgenerator, common c = {} da = calendar.day_abbr[:] dn = calendar.day_name[:] ma = calendar.month_abbr[:] mn = calendar.month_name[:] def _build( name, set ): # make sure longest equal-prefix items are first set = set[:] set.sort() set.reverse() l,u,r = [],[],[] for item in set: l.append( objectgenerator.Literal( value = item.lower() )) u.append( objectgenerator.Literal( value = item.upper() )) r.append( objectgenerator.Literal( value = item )) c[ name + '_lc' ] = objectgenerator.FirstOfGroup( children = l ) c[ name + '_uc' ] = objectgenerator.FirstOfGroup( children = u ) c[ name ] = objectgenerator.FirstOfGroup( children = r ) _build( 'locale_day_names', dn ) _build( 'locale_day_abbrs', da ) _build( 'locale_month_names', mn ) _build( 'locale_month_abbrs', ma ) da = [s.lower() for s in da] dn = [s.lower() for s in dn] ma = [s.lower() for s in ma] mn = [s.lower() for s in mn] common.share( c ) class NameInterpreter: offset = 1 def __init__( self, offset = 1 ): self.offset = offset def __call__( self, info, buffer ): (tag, left, right, children) = info value = buffer[left:right].lower() for table in self.tables: try: return table.index( value )+ self.offset except ValueError: pass raise ValueError( """Unrecognised (but parsed) %s name %s at character %s"""%( self.nameType, value, left)) class MonthNameInterpreter( NameInterpreter): """Interpret a month-of-year name as an integer index Pass an "offset" value to __init__ to use an offset other than 1 (Monday = 1), normally 0 (Monday = 0) """ nameType = "Month" tables = (mn,ma) class DayNameInterpreter( NameInterpreter ): """Interpret a day-of-week name as an integer index Pass an "offset" value to __init__ to use an offset other than 1 (January = 1), normally 0 (January = 0) """ nameType = "Day" tables = (dn,da) SimpleParse-2.2.0/simpleparse/common/chartypes.py0000644000175000017500000000512212620706017023524 0ustar mcfletchmcfletch00000000000000"""Common locale-specific 
character types Following productions are all based on string module, with the default locale specified. The first production is a single character of the class and the second a repeating character version: digit, digits uppercasechar, uppercase lowercasechar, lowercase letter, letters whitespacechar, whitespace punctuationchar, punctuation octdigit, octdigits hexdigit, hexdigits printablechar, printable For Python versions with the constants in the string module: ascii_letter, ascii_letters ascii_lowercasechar, ascii_lowercase ascii_uppercasechar, ascii_uppercase Following are locale-specific values, both are single-character values: locale_decimal_point -- locale-specific decimal seperator locale_thousands_seperator -- locale-specific "thousands" seperator Others: EOF -- Matches iff parsing has reached the end of the buffer There are no interpreters provided (the types are considered too common to provide meaningful interpreters). """ from simpleparse import objectgenerator, common import string, locale locale.setlocale(locale.LC_ALL, "" ) c = {} # string-module items... 
# Register one single-character and one repeating production for each
# character class exposed by the ``string`` module.  Entries are
# (string-module attribute, single-char production, repeating production).
_CLASS_TABLE = [
    ("digits",          "digit",               "digits"),
    ("ascii_uppercase", "uppercasechar",       "uppercase"),
    ("ascii_lowercase", "lowercasechar",       "lowercase"),
    ("ascii_letters",   "letter",              "letters"),
    ("ascii_letters",   "ascii_letter",        "ascii_letters"),  # alias
    ("ascii_lowercase", "ascii_lowercasechar", "ascii_lowercase"),
    ("ascii_uppercase", "ascii_uppercasechar", "ascii_uppercase"),
    ("whitespace",      "whitespacechar",      "whitespace"),
    ("punctuation",     "punctuationchar",     "punctuation"),
    ("octdigits",       "octdigit",            "octdigits"),
    ("hexdigits",       "hexdigit",            "hexdigits"),
    ("printable",       "printablechar",       "printable"),
]
for _source, _single, _repeat in _CLASS_TABLE:
    try:
        _value = getattr( string, _source )
    except AttributeError:
        # constant not available in this Python build; skip both productions
        continue
    c[ _single ] = objectgenerator.Range( value = _value )
    c[ _repeat ] = objectgenerator.Range( value = _value, repeating =1 )

# locale-module items: single-character separators from the active locale
_lc = locale.localeconv()
c[ "locale_decimal_point" ] = objectgenerator.Literal( value = _lc["decimal_point"] )
c[ "locale_thousands_seperator" ] = objectgenerator.Literal( value = _lc["thousands_sep"] )
del _lc

# EOF matches if and only if parsing has reached the end of the buffer
from simpleparse.stt import TextTools
c[ "EOF" ] = objectgenerator.Prebuilt( value = (
    (None, TextTools.EOF, TextTools.Here),
) )
common.share( c )
try:
    from mx import DateTime
    haveMX = 1
except ImportError:
    haveMX = 0
from simpleparse.parser import Parser
from simpleparse import common, objectgenerator
from simpleparse.common import chartypes, numbers
from simpleparse.dispatchprocessor import *

c = {}

# NOTE(review): the unreported-production names below (<date_separator>,
# <time_separator>) were stripped by markup-mangling; reconstructed from the
# references in ISO_date / ISO_time -- confirm against upstream source
declaration ="""
year      := digit,digit,digit,digit
month     := digit,digit
day       := digit,digit
hour      := digit,digit
minute    := digit,digit
second    := digit,digit
offset_sign := [-+]
offset    := offset_sign, hour, time_separator?, minute
<date_separator> := '-'
<time_separator> := ':'
ISO_date  := year, (date_separator, month, (date_separator, day)?)?
ISO_time  := hour, (time_separator, minute, (time_separator, second)?)?
ISO_date_time := ISO_date, ([T], ISO_time)?, offset?
"""
_p = Parser( declaration )
for name in ["ISO_time","ISO_date", "ISO_date_time"]:
    c[ name ] = objectgenerator.LibraryElement(
        generator = _p._generator,
        production = name,
    )
common.share( c )

if haveMX:
    class MxInterpreter(DispatchProcessor):
        """Interpret a parsed ISO_date_time in GMT/UTC time or localtime"""
        def __init__(
            self,
            inputLocal = 1,
            returnLocal = 1,
        ):
            self.inputLocal = inputLocal
            self.returnLocal = returnLocal
        dateName = 'ISO_date'
        timeName = 'ISO_time'
        def ISO_date_time( self, info, buffer):
            """Interpret the ISO date + time format"""
            (tag, left, right, sublist) = info
            set = singleMap( sublist, self, buffer )
            base, time, offset = (
                set.get(self.dateName),
                set.get(self.timeName) or DateTime.RelativeDateTime(hour=0,minute=0,second=0),
                set.get( "offset" ),
            )
            base = base + time
            # (removed a redundant second ``offset = set.get("offset")``)
            if offset is not None:
                # an explicit timezone was entered, convert to gmt and return as appropriate...
                gmt = base - offset
                if self.returnLocal:
                    return gmt.localtime()
                else:
                    return gmt
            # was in the default input locale (either gmt or local)
            if self.inputLocal and self.returnLocal:
                return base
            elif not self.inputLocal and not self.returnLocal:
                return base
            elif self.inputLocal and not self.returnLocal:
                # return gmt from local...
                return base.gmtime()
            else:
                return base.localtime()
        def ISO_date( self, info, buffer):
            """Interpret the ISO date format"""
            (tag, left, right, sublist) = info
            set = {}
            for item in sublist:
                set[ item[0] ] = dispatch( self, item, buffer)
            # bugfix: bare ``now()`` was unbound (only ``DateTime`` is imported)
            return DateTime.DateTime(
                set.get("year") or DateTime.now().year,
                set.get("month") or 1,
                set.get("day") or 1,
            )
        def ISO_time( self, info, buffer):
            """Interpret the ISO time format"""
            (tag, left, right, sublist) = info
            set = {}
            for item in sublist:
                set[ item[0] ] = dispatch( self, item, buffer)
            return DateTime.RelativeDateTime(
                hour = set.get("hour") or 0,
                minute = set.get("minute") or 0,
                second = set.get("second") or 0,
            )
        integer = numbers.IntInterpreter()
        # all simple numeric productions dispatch to the integer interpreter
        second = offset_minute = offset_hour = year = month = day = hour =minute =integer
        def offset( self, info, buffer):
            """Calculate the time zone offset as a date-time delta"""
            (tag, left, right, sublist) = info
            set = singleMap( sublist, self, buffer )
            direction = set.get('offset_sign',1)
            hour = set.get( "hour", 0)
            minute = set.get( "minute", 0)
            delta = DateTime.DateTimeDelta( 0, hour*direction, minute*direction)
            return delta
        def offset_sign( self , info, buffer):
            """Interpret the offset sign as a multiplier"""
            (tag, left, right, sublist) = info
            v = buffer [left: right]
            if v in ' +':
                return 1
            else:
                return -1
from simpleparse.parser import Parser
from simpleparse import common, objectgenerator
from simpleparse.common import chartypes

c = {}

# NOTE(review): the unreported-production names below (<EOL>, <comment_start>,
# <comment_stop>) were stripped by markup-mangling; reconstructed from the
# references inside the grammars -- confirm against upstream source
eolcomments = r"""
### comment formats where the comment goes
### from a marker to the end of the line
comment             := -'\012'*
<EOL>               := ('\r'?,'\n')/EOF

>hash_comment<       := '#', comment, EOL
>semicolon_comment<  := ';', comment, EOL
>slashslash_comment< := '//', comment, EOL
"""
_p = Parser( eolcomments )
for name in ["hash_comment", "semicolon_comment", "slashslash_comment"]:
    c[ name ] = objectgenerator.LibraryElement(
        generator = _p._generator,
        production = name,
    )

ccomments = r"""
### comments in format /* comment */ with no recursion allowed
comment := -"*/"*
>slashbang_comment< := '/*', comment, '*/'
"""
_p = Parser( ccomments )
# both names resolve to the same non-nesting production
for name in ["c_comment","slashbang_comment"]:
    c[ name ] = objectgenerator.LibraryElement(
        generator = _p._generator,
        production = "slashbang_comment",
    )

nccomments = r"""
### nestable C comments of form /* comment /* innercomment */ back to previous */
<comment_start> := '/*'
<comment_stop>  := '*/'
comment := (-(comment_stop/comment_start)+/slashbang_nest_comment)*
>slashbang_nest_comment< := comment_start, comment, comment_stop
"""
_p = Parser( nccomments )
# both names resolve to the same nesting production
for name in ["c_nest_comment","slashbang_nest_comment"]:
    c[ name ] = objectgenerator.LibraryElement(
        generator = _p._generator,
        production = "slashbang_nest_comment",
    )

common.share(c)
8-bit python strings support for octal and hexidecimal character escapes string_single_quote string_double_quote string_triple_single string_triple_double Individual string types with the above features string Any of the above string types, in a simple FirstOf group with the triple-quoted types first, then the single quoted i.e. generated with this grammar: string_triple_double/string_triple_single/string_double_quote/string_single_quote Interpreters: StringInterpreter Interprets any/all of the above as a normal (non-Raw) Python regular (non-unicode) string. Hopefully the action is identical to doing eval( matchedString, {},{}), without the negative security implications of that approach. Note that you need to make the interpreter available under each name you use directly in your grammar, so if you use string_single_quote and string_double_quote directly, then you need to add: string_single_quote = myStringInterpreterInstance string_double_quote = myStringInterpreterInstance to your processor class. """ from simpleparse.parser import Parser from simpleparse import common, objectgenerator from simpleparse.common import chartypes assert chartypes from simpleparse.dispatchprocessor import * c = {} stringDeclaration = r""" # note that non-delimiter can never be hit by non-triple strings str := delimiter, (char_no_quote/escaped_char/backslash_char/nondelimiter)*,delimiter escaped_char := '\\',( string_special_escapes / ('x',hex_escaped_char) / octal_escaped_char ) octal_escaped_char := octdigit, octdigit?, octdigit? hex_escaped_char := hexdigit,hexdigit backslash_char := "\\" # i.e. 
class StringInterpreter(DispatchProcessor):
    """Convert a parsed (quoted, possibly escaped) string to its Python value.

    Strips the surrounding quotes and resolves backslash escapes so the
    result is the plain Python string the literal denotes.  Install an
    instance under each production name you use directly, e.g.::

        class MyProcessor( DispatchProcessor ):
            string = StringInterpreter()
            string_single_quote = string
    """
    # escape letter -> resulting character ('\n' maps to '' for
    # backslash-newline line continuations)
    specialescapedmap = {
        'a':'\a',
        'b':'\b',
        'f':'\f',
        'n':'\n',
        'r':'\r',
        't':'\t',
        'v':'\v',
        '\\':'\\',
        '\n':'',
        '"':'"',
        "'":"'",
    }

    def string( self, info, buffer):
        """Dispatch whichever concrete string production matched."""
        first_child = info[3][0]
        return dispatch( self, first_child, buffer )

    def string_single_quote( self, info, buffer):
        """Concatenate the interpreted pieces of the string body."""
        pieces = dispatchList(self, info[3], buffer)
        return "".join(pieces)
    string_double_quote = string_single_quote
    string_triple_single = string_single_quote
    string_triple_double = string_single_quote

    def char_no_quote( self, info, buffer):
        """Literal run of unescaped characters: return the raw text."""
        _, start, stop, _ = info
        return buffer[start:stop]
    nondelimiter = char_no_quote

    def escaped_char( self, info, buffer):
        """An escape sequence: interpret its (single) child."""
        children = info[3]
        return "".join(dispatchList(self, children, buffer))

    def octal_escaped_char(self, info, buffer):
        """\\ooo escape: character with the given octal code."""
        _, start, stop, _ = info
        return chr(int( buffer[start:stop], 8 ))

    def hex_escaped_char( self, info, buffer):
        """\\xhh escape: character with the given hexadecimal code."""
        _, start, stop, _ = info
        return chr(int( buffer[start:stop], 16 ))

    def backslash_char( self, info, buffer):
        """A lone backslash before a non-special character."""
        return "\\"

    def string_special_escapes( self, info, buffer):
        """Maps "special" escapes to the corresponding characters"""
        _, start, stop, _ = info
        return self.specialescapedmap[ buffer[start:stop] ]
from simpleparse import objectgenerator, common

c = {}

# note that Juliette comes before Juliet, because
# otherwise Juliette could never match in an FOGroup!
# (removed an accidental duplicate "Echo" entry which produced a redundant
# Literal in the FirstOfGroup and duplicate entries for consumers iterating
# this list)
_letters = """Alpha Bravo Charlie Delta Echo Foxtrot Golf Gulf Hotel India
Juliette Juliet Kilo Lima Mike November Oscar Papa Quebec Romeo Sierra Tango
Uniform Victor Whiskey Xray X-ray Yankee Zulu""".split()

set1,set2 = [], []
for item in _letters:
    set1.append( objectgenerator.Literal( value=item) )
    set2.append( objectgenerator.Literal( value=item.lower()) )

# Title-caps spellings (Zulu, Xray, ...)
military_alphabet_char = objectgenerator.FirstOfGroup(
    children = set1
)
# lowercase spellings (zulu, xray, ...)
military_alphabet_char_lower = objectgenerator.FirstOfGroup(
    children = set2
)
del set1, set2

c[ "military_alphabet_char" ] = military_alphabet_char
c[ "military_alphabet_char_lower" ] = military_alphabet_char_lower

common.share( c )
try:
    from mx import DateTime
    haveMX = 1
except ImportError:
    haveMX = 0
from simpleparse.parser import Parser
from simpleparse import common, objectgenerator
from simpleparse.common import chartypes, numbers
from simpleparse.dispatchprocessor import *

c = {}

# NOTE(review): the unreported-production names below (<date_separator>,
# <time_separator>) were stripped by markup-mangling; reconstructed from the
# references in the loose date/time productions -- confirm against upstream
declaration = """
<date_separator> := [-]
<time_separator> := ':'
offset_sign := [-+]
year      := int
month     := int
day       := int
hour      := int
minute    := int
second    := float/int
ISO_date_loose := year, (date_separator, month, (date_separator, day)?)?
ISO_time_loose := hour, (time_separator, minute, (time_separator, second)?)?
offset         := offset_sign, offset_hour, time_separator?, offset_minute?
offset_hour    := digit, digit
offset_minute  := digit, digit
ISO_date_time_loose := ISO_date_loose, ([T ], ISO_time_loose)?, [ ]?, offset?
"""
_p = Parser( declaration )
for name in ["ISO_time_loose","ISO_date_time_loose", "ISO_date_loose"]:
    c[ name ] = objectgenerator.LibraryElement(
        generator = _p._generator,
        production = name,
    )
common.share( c )

if haveMX:
    class MxInterpreter(DispatchProcessor):
        """Interpret a parsed ISO_date_time_loose in GMT/UTC time or localtime"""
        # class attributes deliberately shadow the builtins ``int``/``float``:
        # dispatch is by production name, and the productions are named int/float
        int = numbers.IntInterpreter()
        offset_minute = offset_hour = year = month = day = hour = minute = int
        float = numbers.FloatInterpreter()
        second = float
        def __init__(
            self,
            inputLocal = 1,
            returnLocal = 1,
        ):
            self.inputLocal = inputLocal
            self.returnLocal = returnLocal
        dateName = 'ISO_date_loose'
        timeName = 'ISO_time_loose'
        def ISO_date_time_loose( self, info, buffer):
            """Interpret the loose ISO date + time format"""
            (tag, left, right, sublist) = info
            set = singleMap( sublist, self, buffer )
            base, time, offset = (
                set.get(self.dateName),
                set.get(self.timeName) or DateTime.RelativeDateTime(hour=0,minute=0,second=0),
                set.get( "offset" ),
            )
            base = base + time
            # (removed a redundant second ``offset = set.get("offset")``)
            if offset is not None:
                # an explicit timezone was entered, convert to gmt and return as appropriate...
                gmt = base - offset
                if self.returnLocal:
                    return gmt.localtime()
                else:
                    return gmt
            # was in the default input locale (either gmt or local)
            if self.inputLocal and self.returnLocal:
                return base
            elif not self.inputLocal and not self.returnLocal:
                return base
            elif self.inputLocal and not self.returnLocal:
                # return gmt from local...
                return base.gmtime()
            else:
                return base.localtime()
        def ISO_date_loose( self, info, buffer):
            """Interpret the loose ISO date format"""
            (tag, left, right, sublist) = info
            set = singleMap( sublist, self, buffer )
            # bugfix: bare ``now()`` was unbound (only ``DateTime`` is imported)
            return DateTime.DateTime(
                set.get("year") or DateTime.now().year,
                set.get("month") or 1,
                set.get("day") or 1,
            )
        def ISO_time_loose( self, info, buffer):
            """Interpret the loose ISO time format"""
            (tag, left, right, sublist) = info
            set = singleMap( sublist, self, buffer )
            return DateTime.RelativeDateTime(
                hour = set.get("hour") or 0,
                minute = set.get("minute") or 0,
                second = set.get("second") or 0,
            )
        def offset( self, info, buffer):
            """Calculate the time zone offset as a date-time delta"""
            (tag, left, right, sublist) = info
            set = singleMap( sublist, self, buffer )
            direction = set.get('offset_sign',1)
            hour = set.get( "offset_hour", 0)
            minute = set.get( "offset_minute", 0)
            delta = DateTime.DateTimeDelta( 0, hour*direction, minute*direction)
            return delta
        def offset_sign( self , info, buffer):
            """Interpret the offset sign as a multiplier"""
            (tag, left, right, sublist) = info
            v = buffer [left: right]
            if v in ' +':
                return 1
            else:
                return -1
from simpleparse.parser import Parser
from simpleparse import common, objectgenerator
from simpleparse.common import chartypes
from simpleparse.dispatchprocessor import *

c = {}

# NOTE(review): the unreported-production names <l_digits>/<l_hexdigits> were
# stripped by markup-mangling; reconstructed from the int/int_unsigned
# references -- confirm against upstream source
declaration = r"""
# sample for parsing integer and float numbers
# including hexidecimal numbers in 0xFFF format
sign            := [-+]+
<l_digits>      := digits
<l_hexdigits>   := hexdigits

decimal_fraction := '.',int_unsigned?

# float which is explicitly a float, cannot be an integer
# because it includes a decimal point
explicit_base   := sign?, ((int_unsigned, decimal_fraction) / decimal_fraction / (int_unsigned,'.'))
exponent        := int
exponent_loose  := explicit_base/int

float           := explicit_base, ([eE],exponent)?
float_floatexp  := explicit_base, ([eE],exponent_loose)?

hex             := sign?, '0', [xX], hexdigits
int_unsigned    := l_digits
int             := sign?, l_digits
binary_digits   := [01]+
binary_number   := sign?, binary_digits,('b'/'B')

imaginary_number := (float/int), [jJ]

##number        := binary_number/hex/float/int
number          := hex/float/int
number_full     := binary_number/imaginary_number/hex/float/int
"""
_p = Parser( declaration )
for name in ["int","hex", "int_unsigned", "number", "float",
             "binary_number", "float_floatexp", "imaginary_number",
             "number_full"]:
    c[ name ] = objectgenerator.LibraryElement(
        generator = _p._generator,
        production = name,
    )
# (removed a broken ``if __name__ == "__main__": test()`` guard: no ``test``
# function was ever defined, so running the module as a script raised
# NameError)
common.share( c )

def _toInt( s, base ):
    """Convert string s to an int in the given base."""
    return int( s, base)

def _toLong( s, base ):
    """Backwards-compatible alias of _toInt (Python 3 ints are unbounded)."""
    return int( s, base)

class IntInterpreter(DispatchProcessor):
    """Interpret an integer (or unsigned integer) string as an integer"""
    def __call__( self, info, buffer):
        (tag, left, right, children) = info
        return _toInt( buffer[left:right], 10)

class HexInterpreter(DispatchProcessor):
    """Interpret a hexidecimal integer string as an integer value"""
    def __call__( self, info, buffer):
        (tag, left, right, children) = info
        return _toInt( buffer[left:right], 16)

class FloatFloatExpInterpreter(DispatchProcessor):
    """Interpret a float string as a float value

    Note: we're allowing float exponentiation, which gives you
    a nice way to write 2e.5
    """
    def __call__( self, info, buffer):
        (tag, left, right, children) = info
        tag, l, r, _ = children[0]
        base = float( buffer[l:r] )
        if len(children) > 1:
            # figure out the exponent...
            exp = children[1]
            exp = float( buffer[ exp[1]:exp[2]] )
            base = base * (10** exp)
        return base

class FloatInterpreter(DispatchProcessor):
    """Interpret a standard float value as a float"""
    def __call__( self, info, buffer):
        (tag, left, right, children) = info
        return float( buffer[left:right])

class BinaryInterpreter(DispatchProcessor):
    """Interpret a signed bit-field (e.g. -1001b) as an integer

    (the pre-Python-2.1 fallback implementation was removed; the rest of
    this package already requires far newer Python)
    """
    def __call__( self, info, buffer):
        (tag, left, right, children) = info
        # right-1 drops the trailing 'b'/'B' marker; int() handles the sign
        return _toInt( buffer[left:right-1], 2)

class ImaginaryInterpreter( DispatchProcessor ):
    """Interpret an imaginary literal ((float/int),[jJ]) as a complex value"""
    map = {
        "float":FloatInterpreter(),
        "int":IntInterpreter()
    }
    def __call__( self, info, buffer):
        (tag, left, right, children) = info
        base = children[0]
        # bugfix: was ``self.mapSet[...]`` -- the attribute is named ``map``,
        # so every call raised AttributeError
        base = self.map[base[0]](base, buffer)
        return base * 1j
You have three major classes of names, civilian (EST, PST, GMT, UTC), military single-character (A,B,C,D,E...) and military phonetic spelling (Alpha, Bravo... Zulu). The military variants are combined into a single production, however. civilian_timezone_name -- the "familiar" timezones, most real-world data entry will want to use this as their "timezone" definition I'm guessing. military_timezone_name -- military timezones in the two formats outlined above. timezone_name -- combination of the two above into a single production. Interpreter: TimeZoneNameInterpreter -- see below for details, by default takes the timezone name and converts to a second offset in West-negative format. Note: this is the _opposite_ of the time module, but is the more commonly used format AFAIK. Null matches will return a default TimeZone as specified. """ from simpleparse import objectgenerator, common from simpleparse.common import phonetics import time c = {} timezone_data = [] civilian_data = [ # Basically this defines our recognised input locales, # it is by no means exhaustive, but it gives fairly # good coverage with minimal overlap ('NZDT',46800), ('IDLE',43200), ('NZST',43200), ('NZT',43200), ('AESST',39600), ('ACSST',37800), ('CADT',37800), ('SADT',37800), ('AEST',36000), ('EAST',36000), ('GST',36000), ('LIGT',36000), ('ACST',34200), ('CAST',34200), ('SAT',34200), ('AWSST',32400), ('JST',32400), ('KST',32400), ('WDT',32400), ('MT',30600), ('AWST',28800), ('CCT',28800), ('WADT',28800), ('WST',28800), ('JT',27000), ('WAST',25200), ('IT',12600), ('BT',10800), ('EETDST',10800), ('MSK', 10800), ('CETDST',7200), ('EET',7200), ('FWT',7200), ('IST',7200), ('MEST',7200), ('METDST',7200), ('SST',7200), ('BST',3600), ('CET',3600), ('DNT',3600), ('DST',3600), ('FST',3600), ('MET',3600), ('MEWT',3600), ('MEZ',3600), ('NOR',3600), ('SET',3600), ('SWT',3600), ('WETDST',3600), ('GMT',0), ('UTC', 0), ('WET',0), ('WAT',-3600), ('NDT',-5400), ('AT', -7200), ('ADT',-10800), ('NFT',-9000), 
('NST',-9000), ('AST',-14400), ('EDT',-14400), ('ZP4',-14400), ('CDT',-18000), ('EST',-18000), ('ZP5',-18000), ('CST',-21600), ('MDT',-21600), ('ZP6',-21600), ('MST',-25200), ('PDT',-25200), ('PST',-28800), ('YDT',-28800), ('HDT',-32400), ('YST',-32400), ('AKST',-32400), ('AHST',-36000), ('HST',-36000), ('CAT',-36000), ('NT',-39600), ('IDLW',-43200), ] timezone_data = timezone_data + civilian_data ### add military timezones ##A-I then K-Z are used... ## z = 0 ## a - i, k-m -> + values up to 12 ## n-y - values up to -12 ## what a totally messed up system! ## I've checked with a number of sites, they all seem to think ## it works this way... darned if I can figure out why they don't ## make N -12, o -11 etceteras so that z would come in order and you'd ## have a simple progression around the globe... sigh. zulu_data = [ ('A', 3600), ('B', 7200), ('C', 10800), ('D', 14400), ('E', 18000), ('F', 21600), ('G', 25200), ('H', 28800), ('I', 32400), ('K', 36000), ('L', 39600), ('M', 43200), ('N', -3600), ('O', -7200), ('P', -10800), ('Q', -14400), ('R', -18000), ('S', -21600), ('T', -25200), ('U', -28800), ('V', -32400), ('W', -36000), ('X', -39600), ('Y', -43200), ('Z', 0), ] # now add these, plus the expanded versions to the dict above... # note that we only allow capitalised versions of the military # zones! tztemp = [] for key, value in zulu_data: for item in phonetics._letters: if item[0] == key: tztemp.append( (item, value) ) # order is important here, want longer first zulu_data = tztemp + zulu_data del tztemp # and call that done for now, folks... timezone_data = timezone_data + zulu_data # the rules are really big, but oh well... def _build( data ): """Build the name:time map and match rule for each dataset""" data = data[:] data.sort() # get shortest and least values first forcefully... # then reverse that, to get longest first... 
import time
# The time module counts seconds *west* of UTC as positive; this module
# uses the opposite (negative-West) convention, hence the final negation.
LOCAL_ZONE = -(time.altzone if time.daylight else time.timezone)

class TimeZoneNameInterpreter:
    """Interpret a timezone specified as a military or civilian timezone name

    The return value is the zone's offset from UTC in seconds
    (negative-West convention), divided by the "seconds" divisor
    supplied at construction.  A null match returns the configured
    default zone instead.
    """
    def __init__( self, defaultZone=LOCAL_ZONE, seconds=1.0):
        """
        defaultZone -- offset in seconds to be returned if there is
            no value specified (null-match)
        seconds -- divisor applied to the value before returning;
            use 3600.0 for hours, 60.0 for minutes, 86400.0 for days
        """
        self.defaultZone = defaultZone
        self.seconds = seconds
    def __call__( self, info, buffer ):
        """Translate the matched name through timezone_mapping"""
        (tag, left, right, children) = info
        matched = buffer[ left: right ]
        if not matched:
            # null match: fall back to the configured default zone
            return self.defaultZone/self.seconds
        try:
            return timezone_mapping[ matched ]/self.seconds
        except KeyError:
            raise ValueError( "Unrecognised (but parsed!) TimeZone Name %s found at character position %s"%(matched, left))
""" def __init__( self, declaration, root='root', prebuilts=(), definitionSources=common.SOURCES, ): """Initialise the parser, creating the tagging table for it declaration -- simpleparse ebnf declaration of the language being parsed root -- root production used for parsing if none explicitly specified prebuilts -- sequence of (name,value) tuples with prebuilt tables, values can be either objectgenerator EventToken sub-classes or TextTools tables definitionSources -- dictionaries of common constructs for use in building your grammar """ self._rootProduction = root self._declaration = declaration self._generator = simpleparsegrammar.Parser( declaration, prebuilts, definitionSources = definitionSources, ).generator def buildTagger( self, production=None, processor=None): """Get a particular parsing table for a particular production""" if production is None: production = self._rootProduction if processor is None: processor = self.buildProcessor() return self._generator.buildParser( production, methodSource=processor, ) SimpleParse-2.2.0/simpleparse/dispatchprocessor.py0000644000175000017500000000735012620706017023776 0ustar mcfletchmcfletch00000000000000"""Dispatch-processor API This is a post-processing processor API based on dispatching each element of a result tree in a top-down recursive call structure. It is the API used by the SimpleParseGrammar Parser, and likely will be the default processor for SimpleParse. """ from simpleparse.processor import Processor class DispatchProcessor(Processor): """Dispatch results-tree in a top-down recursive pattern with attribute lookup to determine production -> method correspondence. To use the class, subclass it, then define methods for processing each production. The methods should take this form: def production_name( self, (tag, left, right, children), buffer): pass Where children may be either a list, or None, and buffer is the entire buffer being parsed. 
""" def __call__( self, value, buffer ): """Process the results of the parsing run over buffer Value can either be: (success, tags, next) for a top-level production, or (tag, left, right, children) for a non-top production. """ if len( value ) == 3: # is a top-level production success, tags, next = value if success: result = dispatchList( self, tags, buffer ) return success, result, next else: return success, tags, next else: # is a 4-item result tuple/tree return dispatch( self, value, buffer ) def dispatch( source, tag, buffer ): """Dispatch on source for tag with buffer Find the attribute or key tag[0] of source, then call it with (tag, buffer) """ try: function = getattr (source, tag[0]) except AttributeError: try: function = source[tag[0]] except: raise AttributeError( '''No processing function for tag "%s" in object %s! Check the parser definition!'''%(tag[0], repr(source))) return function( tag, buffer ) def dispatchList( source, taglist, buffer ): """Dispatch on source for each tag in taglist with buffer""" if taglist: return list(map( dispatch, [source]*len(taglist), taglist, [buffer]*len(taglist))) else: return [] def multiMap( taglist, source=None, buffer=None ): """Convert a taglist to a mapping from tag-object:[list-of-tags] For instance, if you have items of 3 different types, in any order, you can retrieve them all sorted by type with multimap( childlist) then access them by tagobject key. 
""" set = {} if not taglist: return set for tag in taglist: key = tag[0] if source and buffer: tag = dispatch( source, tag, buffer ) set.setdefault(key,[]).append( tag ) return set def singleMap( taglist, source=None, buffer=None ): """Convert a taglist to a mapping from tag-object:tag, overwritting early with late tags""" set = {} if not taglist: return set for tag in taglist: key = tag[0] if source and buffer: tag = dispatch( source, tag, buffer ) set[key] = tag return set def getString(info, buffer): """Return the string value of the tag passed""" (tag, left, right, sublist) = info return buffer[ left:right ] try: from simpleparse.stt.TextTools import countlines except ImportError: def lines( start=None, end=None, buffer=None ): """Return line number in file at character index (string.count version)""" return buffer.count('\n', start or 0, end or len(buffer)) else: def lines( start=None, end=None, buffer=None ): """Return line number in file at character index (mx.TextTools version)""" return countlines (buffer[start or 0:end or len(buffer)]) SimpleParse-2.2.0/simpleparse/baseparser.py0000644000175000017500000000534212620706017022365 0ustar mcfletchmcfletch00000000000000"""Base class for real-world parsers (such as parser.Parser)""" from simpleparse.stt.TextTools.TextTools import * from simpleparse.generator import Generator class BaseParser: """Class on which real-world parsers build Normally you use a sub-class of this class, such as simpleparser.parser.Parser """ _rootProduction = "" # primary API... def parse( self, data, production=None, processor=None, start=0, stop=None): """Parse data with production "production" of this parser data -- data to be parsed, a Python string, for now production -- optional string specifying a non-default production to use for parsing data processor -- optional pointer to a Processor or MethodSource object for use in determining reporting format and/or post-processing the results of the parsing pass. 
Can be None if neither is desired (default) start -- starting index for the parsing, default 0 stop -- stoping index for the parsing, default len(data) """ self.resetBeforeParse() if processor is None: processor = self.buildProcessor() if stop is None: stop = len(data) value = tag( data, self.buildTagger( production, processor), start, stop ) if processor and callable(processor): return processor( value, data ) else: return value # abstract methods def buildProcessor( self ): """Build default processor object for this parser class The default implementation returns None. The processor can either implement the "method source" API (just provides information about Callouts and the like), or the processor API and the method-source API. The processor API merely requires that the object be callable, and have the signature: object( (success, children, nextPosition), buffer) (Note: your object can treat the first item as a single tuple if it likes). See: simpleparse.processor module for details. """ return None def buildTagger( self, name, processor ): """Build the tag-table for the parser This method must be implemented by your base class and _not_ call the implementation here. """ raise NotImplementedError( """Parser sub-class %s hasn't implemented a buildTagger method"""%(self.__class__.__name__)) def resetBeforeParse( self ): """Called just before the parser's parse method starts working, Allows you to set up special-purpose structures, such as stacks or local storage values. There is no base implementation. The base implementation does nothing. 
"""SimpleParse-2.2.0/simpleparse/printers.py0000644000175000017500000000442612620706017022106 0ustar mcfletchmcfletch00000000000000"""Utility to print Python code for a given generator object's element tokens""" class _GeneratorFormatter: """Singleton Class to give a generator's element tokens as a source string Call this as: printers.asGenerator( generator ) to get a Python source string that tries to recreate the generator as a set of objectgenerator element token objects (as seen in simpleparsegrammar). """ HEAD = """from simpleparse import generator from simpleparse.objectgenerator import * GENERATOR = generator.Generator () class Parser: '''Mix-in class for simpleparse.parser.Parser which uses this GENERATOR to build tagging tables. You'll likely want to override __init__ to avoid building a new parser from a grammar (or subclass BaseParser instead of Parser) ''' def buildTagger( self, name=None, processor = None ): '''Build the tag-table for parsing the EBNF for this parser''' return GENERATOR.buildParser( name, processor ) """ ITEM = """GENERATOR.addDefinition( %(name)s, %(element)s, ) """ def __call__( self, generator ): temp = [self.HEAD] for name,element in zip(generator.getNames(), generator.getRootObjects()): name = repr(name) element = self.reprObject(element,1) temp.append( self.ITEM%locals()) return "".join(temp) def reprObject( self, obj, depth=0, indent=' ' ): """Return a recognisable version of an objectgenerator element token""" argTemplate = (indent*(depth+1))+"%s = %s," temp = ["""%s("""%(obj.__class__.__name__)] for key,value in list(obj.__dict__.items()): if key == 'children': childTemplate = (indent*(depth+2)) + '%s,' childTemp = ["["] for child in value: childTemp.append(childTemplate%self.reprObject(child,depth+2)) childTemp.append( (indent*(depth+1))+']' ) temp.append( argTemplate% (key, '\n'.join(childTemp)) ) else: temp.append( argTemplate%( key, repr(value))) temp.append( (indent*depth)+')') return '\n'.join(temp) asGenerator = 
_GeneratorFormatter() asObject = asGenerator.reprObject SimpleParse-2.2.0/simpleparse/simpleparsegrammar.py0000644000175000017500000005566012620706017024141 0ustar mcfletchmcfletch00000000000000'''Default SimpleParse EBNF grammar as a generator with productions This module defines the original SimpleParse grammar. It uses the generator objects directly as this is the first grammar being written. ''' from simpleparse.objectgenerator import * from simpleparse import generator, baseparser from simpleparse.dispatchprocessor import * try: _unichr = unichr _unicode = unicode except NameError: _unichr = chr _unicode = str # note that whitespace is slightly different # due to a bug with NULL-matching repeating groups # we make all the ts references ts? whitespace = Name (value = "ts", report = 0) element_token = Name( value = "element_token" ) literal = Name ( value = "literal") group = Name ( value = "group") characterrange = Name ( value = "range") name = Name ( value = "name") SPGenerator = generator.Generator () SPGenerator.addDefinition( "declarationset", Name (value = "declaration", repeating = 1), ) SPGenerator.addDefinition ( "declaration", SequentialGroup ( children = [ whitespace, FirstOfGroup ( children = [ Name (value = "unreportedname", ), Name (value = "expandedname", ), Name (value = "name", ), ], ), whitespace, Literal (value = ":"), Literal (value = ":", optional=1), Literal (value = "=",), Name( value = "seq_group"), ], ) ) SPGenerator.addDefinition ( "group", SequentialGroup ( children = [ Literal (value ="("), Name( value= "seq_group"), Literal (value =")"), ], expanded = 1, ) ) _seq_children = FirstOfGroup( children = [ Name(value="error_on_fail"), Name(value="fo_group"), Name(value="element_token"), ], ) SPGenerator.addDefinition ( "seq_group", SequentialGroup ( children = [ whitespace, _seq_children, SequentialGroup( children = [ whitespace, Name( value="seq_indicator"), whitespace, _seq_children, ], repeating = 1, optional = 1, ), whitespace, ], ), 
) SPGenerator.addDefinition ( "fo_group", SequentialGroup ( children = [ element_token, SequentialGroup( children = [ whitespace, Name( value="fo_indicator"), whitespace, element_token, ], repeating = 1, ), ], ) ) SPGenerator.addDefinition ( "seq_indicator", Literal(value = ",", report=0 ), ) SPGenerator.addDefinition ( "fo_indicator", Literal(value = "/", report=0 ), ) SPGenerator.addDefinition ( "element_token", SequentialGroup ( children = [ Name (value = "lookahead_indicator", optional = 1), whitespace, Name (value = "negpos_indicator", optional = 1), whitespace, FirstOfGroup ( children = [ literal, characterrange, group, name, ] ), whitespace, Name (value = "occurence_indicator", optional = 1), whitespace, Name (value = "error_on_fail", optional = 1), ] ) ) SPGenerator.addDefinition ( "negpos_indicator", Range (value = "+-" ) ) SPGenerator.addDefinition ( "lookahead_indicator", Literal(value = "?" ), ) SPGenerator.addDefinition ( "occurence_indicator", Range (value = "+*?" ), ) SPGenerator.addDefinition ( "error_on_fail", SequentialGroup ( children = [ Literal (value ="!"), SequentialGroup ( children = [ whitespace, Name( value="literal"), ], optional = 1, ), ], ), ) SPGenerator.addDefinition ( "unreportedname", SequentialGroup ( children = [ Literal (value ="<"), whitespace, name, whitespace, Literal (value =">"), ] ) ) SPGenerator.addDefinition ( "expandedname", SequentialGroup ( children = [ Literal (value =">"), whitespace, name, whitespace, Literal (value ="<"), ] ) ) SPGenerator.addDefinition ( "name", SequentialGroup ( children = [ Range(value ='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'), Range(value ='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789', optional= 1, repeating= 1), ] ) ) SPGenerator.addDefinition ( "ts", # ( [ \011-\015]+ / ('#',-'\n'+,'\n')+ )* FirstOfGroup ( children = [ Range(value =' \011\012\013\014\015', repeating=1), Name( value = "comment" ), ], repeating = 1, optional=1, ) ) SPGenerator.addDefinition ( 
"comment", # ( [ \011-\015]+ / ('#',-'\n'+,'\n')+ )* SequentialGroup ( children = [ Literal ( value ="#"), Literal (value ="\n", negative = 1, repeating = 1, optional=1), Literal (value = "\n",), ], ), ) SPGenerator.addDefinition ( "literalDecorator", # literalDecorator := [c] Range( value = 'c' ) ) SPGenerator.addDefinition ( "literal", # ("'",(CHARNOSNGLQUOTE/ESCAPEDCHAR)*,"'") / ('"',(CHARNODBLQUOTE/ESCAPEDCHAR)*,'"') SequentialGroup( children = [ Name( value = 'literalDecorator', optional=1 ), FirstOfGroup ( children = [ SequentialGroup ( children = [ Literal (value ="'"), FirstOfGroup ( children = [ Name (value = "CHARNOSNGLQUOTE"), Name (value = "ESCAPEDCHAR"), ], optional = 1, repeating = 1, ), Literal (value ="'"), ], ), SequentialGroup ( children = [ Literal (value ='"'), FirstOfGroup ( children = [ Name (value = "CHARNODBLQUOTE"), Name (value = "ESCAPEDCHAR"), ], optional = 1, repeating = 1, ), Literal (value ='"'), ], ) ], ), ], ) ) SPGenerator.addDefinition ( "range", # '[',CHARBRACE?,CHARDASH?, (CHARRANGE/CHARNOBRACE)*, CHARDASH?,']' SequentialGroup ( children =[ Literal (value ="["), Name (value ="CHARBRACE",optional = 1), Name (value ="CHARDASH",optional = 1), FirstOfGroup( children = [ Name (value ="CHARRANGE"), Name (value ="CHARNOBRACE"), ], optional = 1, repeating = 1, ), Name (value ="CHARDASH",optional = 1), Literal (value ="]"), ], ) ) SPGenerator.addDefinition ( "CHARBRACE", Literal (value = "]"), ) SPGenerator.addDefinition ( "CHARDASH", Literal (value = "-"), ) SPGenerator.addDefinition ( "CHARRANGE", # CHARRANGE := CHARNOBRACE, '-', CHARNOBRACE SequentialGroup ( children =[ Name (value ="CHARNOBRACE"), Literal (value ="-"), Name (value ="CHARNOBRACE"), ], ), ) SPGenerator.addDefinition ( "CHARNOBRACE", # CHARRANGE := CHARNOBRACE, '-', CHARNOBRACE FirstOfGroup( children =[ Name (value ="ESCAPEDCHAR"), Name (value ="CHAR"), ], ), ) SPGenerator.addDefinition ( "CHAR", Literal ( value ="]", negative = 1, ), ) SPGenerator.addDefinition ( 
"ESCAPEDCHAR", # '\\',( SPECIALESCAPEDCHAR / ('x',HEXESCAPEDCHAR) / UNICODEESCAPEDCHAR_16 / OCTALESCAPEDCHAR / ) SequentialGroup ( children =[ Literal (value ="\\"), FirstOfGroup( children = [ Name (value ="SPECIALESCAPEDCHAR"), SequentialGroup( children = [ Range( value = 'xX' ), Name( value="HEXESCAPEDCHAR"), ] ), Name (value ="OCTALESCAPEDCHAR"), SequentialGroup( children = [ Range( value='uU'), Name( value='UNICODEESCAPEDCHAR' ), ], ), ], ), ], ) ) SPGenerator.addDefinition ( "SPECIALESCAPEDCHAR", Range(value ='\\abfnrtv"\''), ) SPGenerator.addDefinition ( "OCTALESCAPEDCHAR", # [0-7],[0-7]?,[0-7]? SequentialGroup ( children =[ Range (value ="01234567"), Range (value ="01234567", optional = 1), Range (value ="01234567", optional = 1), ], ) ) SPGenerator.addDefinition ( "HEXESCAPEDCHAR", # [0-9a-fA-F],[0-9a-fA-F] SequentialGroup ( children =[ Range (value ="0123456789abcdefABCDEF"), Range (value ="0123456789abcdefABCDEF"), ], ) ) SPGenerator.addDefinition( "UNICODEESCAPEDCHAR", SequentialGroup( children=[ Range (value ="0123456789abcdefABCDEF"), Range (value ="0123456789abcdefABCDEF"), Range (value ="0123456789abcdefABCDEF"), Range (value ="0123456789abcdefABCDEF"), SequentialGroup( children = [ Range (value ="0123456789abcdefABCDEF"), Range (value ="0123456789abcdefABCDEF"), Range (value ="0123456789abcdefABCDEF"), Range (value ="0123456789abcdefABCDEF"), ], optional = True, ) ] ) ) SPGenerator.addDefinition ( "CHARNODBLQUOTE", Range(value ='\\"', negative = 1, repeating = 1), ) SPGenerator.addDefinition ( "CHARNOSNGLQUOTE", Range(value ="\\'", negative = 1, repeating = 1), ) declaration = r"""declarationset := declaration+ declaration := ts, (unreportedname/expandedname/name) ,ts,':',':'?,'=',seq_group element_token := lookahead_indicator?, ts, negpos_indicator?,ts, (literal/range/group/name),ts, occurence_indicator?, ts, error_on_fail? negpos_indicator := [-+] lookahead_indicator := "?" occurence_indicator := [+*?] error_on_fail := "!", (ts,literal)? 
>group< := '(',seq_group, ')' seq_group := ts,(error_on_fail/fo_group/element_token), (ts, seq_indicator, ts, (error_on_fail/fo_group/element_token) )*, ts fo_group := element_token, (ts, fo_indicator, ts, element_token)+ # following two are likely something peoples might want to # replace in many instances... := "/" := ',' unreportedname := '<', name, '>' expandedname := '>', name, '<' name := [a-zA-Z_],[a-zA-Z0-9_]* := ( [ \011-\015]+ / comment )* comment := '#',-'\n'*,'\n' literal := literalDecorator?,("'",(CHARNOSNGLQUOTE/ESCAPEDCHAR)*,"'") / ('"',(CHARNODBLQUOTE/ESCAPEDCHAR)*,'"') literalDecorator := [c] range := '[',CHARBRACE?,CHARDASH?, (CHARRANGE/CHARNOBRACE)*, CHARDASH?,']' CHARBRACE := ']' CHARDASH := '-' CHARRANGE := CHARNOBRACE, '-', CHARNOBRACE CHARNOBRACE := ESCAPEDCHAR/CHAR CHAR := -[]] ESCAPEDCHAR := '\\',( SPECIALESCAPEDCHAR / ('x',HEXESCAPEDCHAR) / ([uU],UNICODEESCAPEDCHAR) / OCTALESCAPEDCHAR ) SPECIALESCAPEDCHAR := [\\abfnrtv"'] OCTALESCAPEDCHAR := [0-7],[0-7]?,[0-7]? HEXESCAPEDCHAR := [0-9a-fA-F],[0-9a-fA-F] CHARNODBLQUOTE := -[\\"]+ CHARNOSNGLQUOTE := -[\\']+ UNICODEESCAPEDCHAR := [0-9a-fA-F],[0-9a-fA-F],[0-9a-fA-F],[0-9a-fA-F],([0-9a-fA-F],[0-9a-fA-F],[0-9a-fA-F],[0-9a-fA-F])? """ ### Now the interpreter objects... class Parser(baseparser.BaseParser): """Parser which generates new parsers from EBNF grammars This parser class allows you to pass in an EBNF grammar as the initialisation parameter. The EBNF is processed, and a SimpleParse generator object is created as self.generator. Unlike most Parsers, this object is intended to be re-created for each bit of data it parses (i.e. each EBNF), so it warps the standard API a lot. 
""" _rootProduction = 'declarationset' def __init__( self, ebnf, prebuilts=(), methodSource=None, definitionSources=() ): """Create a new generator based on the EBNF in simpleparse format""" processor = SPGrammarProcessor( prebuilts, definitionSources ) success, tags, next = self.parse( ebnf, self._rootProduction, processor=processor ) if next != len(ebnf): lineNumber = lines(0, next, ebnf) raise ValueError( """Unable to complete parsing of the EBNF, stopped at line %s (%s chars of %s) Unparsed:\n%s..."""%(lineNumber, next, len(ebnf), ebnf[next:next+100]) ) self.generator = processor.generator def buildTagger( self, name=None, processor = None ): """Build the tag-table for parsing the EBNF for this parser""" return SPGenerator.buildParser( name, processor ) class SPGrammarProcessor( DispatchProcessor ): """Processing object for post-processing an EBNF into a new generator""" ### top level def __init__( self, prebuilts=(), definitionSources=() ): """Create a new generator based on the EBNF in simpleparse format""" self.generator = generator.Generator() for (name, table) in prebuilts: if isinstance( table, ElementToken): self.generator.addDefinition( name, table) else: self.generator.addDefinition( name, Prebuilt(value=table)) for source in definitionSources: self.generator.addDefinitionSource( source ) def declaration( self, info, buffer): '''Base declaration from the grammar, a "production" or "rule"''' (tag, left, right, sublist) = info name = sublist[0] expanded = 0 if name[0] == "unreportedname": name = name[3][0] # note that the info is stored in the wrong place :( report = 0 elif name[0] == 'expandedname': report = 1 expanded = 1 name = name[3][0] else: report = 1 name = getString( name, buffer ) self.currentProduction = name content = dispatch( self, sublist[1], buffer ) content.report = report content.expanded = expanded self.generator.addDefinition( name, content, ) del self.currentProduction ### element configuration def element_token( self, info, buffer): 
    def seq_group( self, info, buffer):
        """Process a sequential-group into a SequentialGroup element token

        info -- (tag, left, right, sublist) result tuple for the group
        buffer -- the full grammar text (used for error reporting)

        ErrorOnFail children are not real element tokens: each one is
        attached (as a fresh copy) to every *following* sibling rather
        than appearing in the generated group itself.
        """
        (tag, left, right, sublist) = info
        children = dispatchList( self, sublist, buffer )
        errorOnFail = None
        result = []
        for (item,tup) in zip(children,sublist):
            if isinstance( item, ErrorOnFail ):
                # remember the marker; it governs the siblings that
                # come after it, not the group as a whole
                errorOnFail = item
            else:
                if errorOnFail:
                    # per-sibling copy so each gets its own "expected" text
                    item.errorOnFail = errorOnFail.copy()
                    self._config_error_on_fail( item.errorOnFail, tup, buffer )
                result.append( item )
        if len(result) == 1:
            # single-item sequential group (very common)
            return result[0]
        elif not result:
            raise ValueError( """SequentialGroup on line %s doesn't have an element-token child! grammar was %s"""%( lines(0,left, buffer), buffer[left:left+25]))
        base = SequentialGroup( children = result, )
        return base
return children[0] base = FirstOfGroup( children = children ) return base def literal( self, info, buffer): '''Turn a literal result into a literal generator''' (tag, left, right, sublist) = info if sublist and sublist[0][0] == 'literalDecorator': # right now only have the one decorator... sublist = sublist[1:] classObject = CILiteral else: classObject = Literal elements = dispatchList( self, sublist, buffer) ### Should check for CILiteral with non-CI string or single-character value! return classObject( value = "".join(elements) ) def range( self, info, buffer): ## if hasattr( Range, 'requiresExpandedSet') and Range.requiresExpandedSet: (tag, left, right, sublist) = info return Range( value = ''.join(dispatchList( self, sublist, buffer)), ) ## else: ## # need to build up a new-syntax version of the range... ## # escape ^ to \^ ## # escape \ to \\ ## # escape - to \- ## # make sure range-sets are in proper order... ## raise NotImplementedError( """Haven't got the new CharSet version implemented yet""") def name( self, tup, buffer): return Name( value = getString(tup, buffer), ) ### simple translators occurenceIndicatorMap = { '*': (1,1), '+': (0,1), '?': (1,0), } def occurence_indicator( self, tup, buffer): '''Return optional, repeating as a tuple of true/false values''' value = getString(tup, buffer) return self.occurenceIndicatorMap[value] def lookahead_indicator( self, tup, buffer ): """If present, the lookahead indictor just says "yes", so just return 1""" return 1 def error_on_fail( self, info, buffer ): """If present, we are going to make the current object an errorOnFail type, If there's a string literal child, then we use it to create the "message" attribute of the errorOnFail object. 
""" (tag,left,right,children) = info err = ErrorOnFail() if children: (tag,left,right,children) = children[0] message = "".join(dispatchList( self, children, buffer)) err.message = message return err def _config_error_on_fail( self, errorOnFail, tup, buffer ): """Configure an error-on-fail instance for a given child tuple""" # what we expected to find... errorOnFail.expected = buffer[tup[1]:tup[2]] if hasattr( self, "currentProduction"): errorOnFail.production = self.currentProduction negposIndicatorMap = { '+': 0, '-': 1, } def negpos_indicator( self, tup, buffer ): '''return whether indicates negative''' value = getString(tup, buffer) return self.negposIndicatorMap[value] def CHARNODBLQUOTE( self, tup, buffer): return getString(tup, buffer) CHAR = CHARNOSNGLQUOTE = CHARNODBLQUOTE def ESCAPEDCHAR( self, info, buffer): (tag, left, right, sublist) = info return "".join(dispatchList( self, sublist, buffer)) specialescapedmap = { 'a':'\a', 'b':'\b', 'f':'\f', 'n':'\n', 'r':'\r', 't':'\t', 'v':'\v', '\\':'\\', '"':'"', "'":"'", } def SPECIALESCAPEDCHAR( self, tup, buffer): return self.specialescapedmap[ getString(tup, buffer)] def OCTALESCAPEDCHAR(self, tup, buffer): return chr(int( getString(tup, buffer), 8 )) def HEXESCAPEDCHAR( self, tup , buffer): return chr(int( getString(tup, buffer), 16 )) def CHARNOBRACE( self, info, buffer): (tag, left, right, sublist) = info return "".join(dispatchList( self, sublist, buffer)) def CHARRANGE( self, info, buffer): '''Create a string from first to second item''' (tag, left, right, sublist) = info first,second = dispatchList( self, sublist, buffer) if second < first: second, first = first, second if isinstance( first, _unicode ) or isinstance( second, _unicode ): _chr = _unichr if not (isinstance( second, _unicode ) and isinstance( first, _unicode )): raise ValueError( 'Range %s uses one unicode and one string escape, cannot mix'%(buffer[left:right]) ) else: _chr = chr first, second = list(map( ord, (first,second) )) return 
u''.join([_chr(u) for u in range(first,second+1)]) def CHARDASH( self, tup , buffer): return '-' def CHARBRACE( self, tup , buffer): return ']' def UNICODEESCAPEDCHAR( self, info, buffer): """Decode a unicode-escaped hex character into a character value""" (tag, left, right, sublist) = info char = _unichr(int( buffer[left:right], 16 )) return char SimpleParse-2.2.0/simpleparse/generator.py0000644000175000017500000001517612620706017022232 0ustar mcfletchmcfletch00000000000000"""Abstract representation of an in-memory grammar that generates parsers""" from simpleparse.stt.TextTools import TextTools import traceback class Generator: '''Abstract representation of an in-memory grammar that generates parsers The generator class manages a collection of ElementToken objects. These element token objects allow the generator to be separated from the particular parser associated with any particular EBNF grammar. In fact, it is possible to create entire grammars using only the generator objects as a python API. 
''' def __init__( self ): """Initialise the Generator""" self.names = [] self.rootObjects = [] self.methodSource = None self.definitionSources = [] def getNameIndex( self, name ): '''Return the index into the main list for the given name''' try: return self.names.index( name ) except ValueError: for source in self.definitionSources: if name in source: return self.addDefinition( name, source[name]) ## import pdb ## pdb.set_trace() raise NameError( '''The name %s is not defined within this generator'''%(repr(name)), self ) def getRootObjects( self, ): '''Return the list of root generator objects''' return self.rootObjects def getNames( self, ): '''Return the list of root generator objects''' return self.names def getRootObject( self, name ): """Get a particular root object by name""" return self.getRootObjects()[ self.getNameIndex(name)] def addDefinition( self, name, rootElement ): '''Add a new definition (object) to the generator''' try: self.names.index( name ) raise NameError( '''Attempt to redefine an existing name %s'''%(name), self ) except ValueError: self.names.append( name ) self.rootObjects.append( rootElement ) return self.getNameIndex( name ) def buildParser( self, name, methodSource=None ): '''Build the given parser definition, returning a TextTools parsing tuple''' self.parserList = [] self.terminalParserCache = {} self.methodSource = methodSource i = 0 while i < len(self.rootObjects): # XXX Note: rootObjects will grow in certain cases where # a grammar is loading secondary grammars into itself rootObject = self.rootObjects[i] try: if len(self.parserList) <= i or self.parserList[i] is None: parser = tuple(rootObject.toParser( self )) self.setTerminalParser( i, parser ) except NameError as err: currentRuleName = self.names[i] err.args = err.args + ('current declaration is %s'%(currentRuleName), ) raise i = i + 1 assert None not in self.parserList, str( self.parserList) return self.parserList [self.getNameIndex (name)] def setTerminalParser( self, index, 
parser ): """Explicitly set the parser value for given name""" while index >= len(self.parserList): self.parserList.append(None) self.parserList[index] = parser def getTerminalParser( self, index ): """Try to retrieve a parser from the parser-list""" try: return self.parserList[ index ] except IndexError: return None def cacheCustomTerminalParser( self, index, flags, parser ): """Optimization to reuse customized terminal parsers""" self.terminalParserCache[ (index,flags) ] = parser def getCustomTerminalParser( self, index, flags ): """Retrieved a cached customized terminal parser or None""" return self.terminalParserCache.get( (index, flags)) def getParserList (self): return self.parserList def getObjectForName( self, name): """Determine whether our methodSource has a parsing method for the given name returns ( flags or 0 , tagobject) """ testName = "_m_"+name if hasattr( self.methodSource, testName): method = getattr( self.methodSource, testName ) if callable(method): return TextTools.CallTag, method elif method == TextTools.AppendMatch: return method, name elif method in (TextTools.AppendToTagobj, TextTools.AppendTagobj): object = self.getTagObjectForName( name ) if method == TextTools.AppendToTagobj: if not ( hasattr( object, 'append') and callable(object.append)): raise ValueError( """Method source %s declares production %s to use AppendToTagobj method, but doesn't given an object with an append method in _o_%s (gave %s)"""%(repr(self.methodSource), name,name, repr(object))) return method, object else: raise ValueError( """Unrecognised command value %s (not callable, not one of the Append* constants) found in methodSource %s, name=%s"""%( repr(method),repr(methodSource),name)) return 0, name def getTagObjectForName( self, name ): """Get any explicitly defined tag object for the given name""" testName = "_o_"+name if hasattr( self.methodSource, testName): object = getattr( self.methodSource, testName ) return object return name def addDefinitionSource( self, 
item ): """Add a source for definitions when the current grammar doesn't supply a particular rule (effectively common/shared items for the grammar).""" self.definitionSources.append( item ) ### Compatability API ## This API exists to allow much of the code written with SimpleParse 1.0 ## to work with SimpleParse 2.0 class GeneratorAPI1: """Stand-in class supporting operation of SimpleParse 1.0 applications There was really only the one method of interest, parserbyname, everything else was internal (and is now part of simpleparsegrammar.py). """ def __init__( self, production, prebuilt=() ): from simpleparse.parser import Parser self.parser = Parser( production, prebuilts=prebuilt ) def parserbyname( self, name ): """Retrieve a tag-table by production name""" return self.parser.buildTagger( name ) def buildParser( declaration, prebuiltnodes=() ): """API 1.0 primary entry point, returns a GeneratorAPI1 instance That object will respond to the parserbyname API expected by SimpleParse 1.0 applications. """ return GeneratorAPI1( declaration, prebuiltnodes ) SimpleParse-2.2.0/simpleparse/processor.py0000644000175000017500000000413612620706017022255 0ustar mcfletchmcfletch00000000000000"""Definitions of the MethodSource and Processor APIs""" class MethodSource(object): """Base class for MethodSource objects (including Processors and Parsers) Most applications will use either Processor or Parser objects, rather than directly using a MethodSource object. The MethodSource is basically just a generic object whose attributes are accessed during generation and/or post-processing of parse results. The following are the special attribute forms for use in _m_productionname -- alters the method used in the TextTools engine for storing results. If this is a callable object, then call the object with: object( taglist,text,l,r,subtags ) If it is TextTools.AppendToTagobj, then append the result tuple to the associated object (_o_productionname). 
This requires that _o_productionname have an "append" method, obviously. If it is the constant TextTools.AppendMatch, then append the string value which matched the production. If it is TextTools.AppendTagobj, then append the associated tagobject itself to the results tree. _o_productionname -- with AppendToTagobj, AppendTagobj and cases where there is no _m_productionname defined, this allows you to provide an explicit tagobject for reporting in the results tree/getting called with results. """ class Processor(MethodSource): """Provides definition of a generic processing API Basically, a Processor has a method __call__ which takes two arguments, a value (which is either a 3-tuple or a 4-tuple depending on whether a top-level production is being processed), and a pointer to the buffer being parsed. """ def __call__( self, value, buffer ): """Process the results of a parsing run over buffer""" return value def __repr__( self ): """Return a representation of the class""" return "<%s object @ %s>"%( self.__class__.__name__, id(self)) SimpleParse-2.2.0/simpleparse/xmlparser/0000755000175000017500000000000012620710576021702 5ustar mcfletchmcfletch00000000000000SimpleParse-2.2.0/simpleparse/xmlparser/xml_parser.py0000644000175000017500000001733012620706017024427 0ustar mcfletchmcfletch00000000000000"""XML Parser based (loosely) on the XML Spec's EBNF This is a hand-coded parser based on the W3C's XML specification, there was a lot of busy-work rewriting to make the syntax agree, but also a number of signficant structural changes required by the limitations of the SimpleParse engine, and the completely procedural definition of References in the XML spec (the References don't occur in most places they can occur, and they are seen as altering the buffer directly as soon as they are encountered, this isn't something that fits readily into the mx.TextTools engine. 
http://www.w3.org/TR/REC-xml#sec-references Major Deviations from Spec: No support for the unicode-style character classes No support for UTF-16 (or Unicode at all, for that matter) No support for References that alter the production being parsed, so you can't have a Reference to an item "and" or similar non-structure- respecting References. References have particular locations they can occur, and they are just ignored elsewhere No support for parsing the contents of References within the primary parsing pass No support for excluded start/end tags Comments allowed in both tags and declarations (but not inside content-specifiers). Allows end tags of the form """ declaration = """ # Simple (changable) literals # These should be chosen based on the encoding # of the file, which is actually embedded in the # file :( := [\x20\x09\x0D\x0A]+ := [a-zA-Z] := letter/[_:] := letter/digit/[-._:] # don't change for XML, but would change for SGML or HTML := '=' := '&' := '%' := ';' := ' := '?>' := '<' := '>' := ' := '>' := '/>' # an XML-comment, note that this follows # SGML semantics, so that you can embed comment_sets # in the middle of the various declarations... >Comment< := "" >comment_set< := '--', xml_comment,'--' xml_comment := -'--'* # whitespace in tag (including possible comment) >TS< := (Comment/S)+ # general structures AttValue := ('"', (Reference/ -[&"] )*, '"') / ( "'", (Reference / -[&'])*, "'") # Names Name := namestart, namechar* Names := Name, (S,Name)* Nmtoken := namechar+ Nmtokens := Nmtoken, (S,Nmtoken)* # processing instructions PI := PIO, PITarget, S?, PIContent, PIC PIContent := -PIC* PITarget := ?-( [Xx],[Mm],[Ll]), Name ## references # character reference CharRef := REFO,'#',('x',hex)/(int),REFC # entity reference EntityRef := REFO, Name, REFC # parsed entity ref PEReference := PREFO, Name, REFC Reference := EntityRef / CharRef Misc := Comment/S ### PROLOG definitions... prolog := XMLDecl?, Misc*, (doctypedecl, Misc*)? 
XMLDecl := '' VersionInfo := TS?, 'version', TS?, Eq, TS?, (('"',VersionNum,'"')/("'",VersionNum,"'")) VersionNum := [a-zA-Z0-9_.:-]+ ### Document-type declarations (DTDs) doctypedecl := '' DeclSep := PEReference / S markupdecl := elementdecl / AttlistDecl / EntityDecl / NotationDecl / PI / Comment EncodingDecl := TS, 'encoding', Eq, (('"', EncName, '"') / ("'", EncName, "'") ) EncName := [A-Za-z],[A-Za-z0-9._-]* SDDecl := TS, 'standalone', Eq, (("'", ('yes' / 'no'), "'") / ('"', ('yes' / 'no'), '"')) ExternalID := ('SYSTEM', TS?, SystemLiteral) / ('PUBLIC', TS?, PubidLiteral, TS?, SystemLiteral ) / PEReference NDataDecl := (TS, 'NDATA', TS, Name)/ (TS,PEReference,TS,(Name/ PEReference)?) SystemLiteral := ('"', -["]*, '"') / ("'", -[']*, "'") / PEReference PubidLiteral := ('"', [\x20\x0D\x0Aa-zA-Z0-9'()+,./:=?;!*#@$_%-]*, '"') / ("'", [\x20\x0D\x0Aa-zA-Z0-9()+,./:=?;!*#@$_%-]*, "'") / PEReference PublicID := ('PUBLIC', TS, PubidLiteral) / PEReference ### Element-type declarations # hack to try and get PEReference parsing for the "normal case" # where the PEReference doesn't change the production level, which # seems to be suggested by the spec... elementdecl := '' >elementdecl_pe< := (TS, PEReference, TS?, contentspec?) contentspec := 'EMPTY' / 'ANY' / Mixed / children Mixed := ('(', S?, '#PCDATA', (S?, '|', S?, (Name/PEReference))*, S?, ')*' ) /('(', S?, '#PCDATA', S?, ')') repetition_specifier := ('?' / '*' / '+')? children := (choice / seq/ PEReference), repetition_specifier cp := (choice / seq / Name/ PEReference ), repetition_specifier choice := '(', S?, cp, ( S?, '|', S?, cp )+, S?, ')' seq := '(', S?, cp, ( S?, ',', S?, cp )*, S?, ')' ### Attribute list declarations... 
AttlistDecl := '' AttDef := TS, ((Name, TS, AttType, TS, DefaultDecl)/(PEReference, TS?, AttType?, TS?, DefaultDecl?)) AttType := StringType / TokenizedType / EnumeratedType/ PEReference StringType := 'CDATA' TokenizedType := 'ID' / 'IDREF' / 'IDREFS' / 'ENTITY' / 'ENTITIES' / 'NMTOKEN' / 'NMTOKENS' EnumeratedType := NotationType / Enumeration NotationType := 'NOTATION', TS, ('(', NameOrList, ')')/PEReference Enumeration := '(', (NmTokenOrList/PEReference), ')' >NameOrList< := S?, (Name/PEReference), (S?, '|', S?, (Name/PEReference))*, S? >NmTokenOrList< := S?, (Nmtoken/PEReference), (S?, '|', S?, (Nmtoken/PEReference))*, S? DefaultDecl := '#REQUIRED' / '#IMPLIED' / ((('#FIXED', TS)/PEReference)?, (AttValue/PEReference)) / PEReference ### Entity declarations EntityDecl := GEDecl / PEDecl GEDecl := '' PEDecl := '' EntityDef := EntityValue / (ExternalID, NDataDecl?) / PEReference PEDef := EntityValue / ExternalID / PEReference EntityValue := ('"', (PEReference / Reference / -[%&"])*, '"') / ("'", (PEReference / Reference / -[%&'])*, "'") NotationDecl := '' ### elements (nodes/tags/you-know :) ) # limitations in the SimpleParse engine mean that this # particular structure will be basically useless... element := EmptyElemTag / (STag, content, ETag) EmptyElemTag := STagO, Name, (TS, Attribute)*, TS?, EmptyElemTagC STag := STagO, Name, (TS, Attribute)*, TS?, STagC ETag := ETagO, Name?, TS?, ETagC content := (element / Reference / CDSect / PI / Comment / CharData)* Attribute := (Name, Eq, (AttValue/Reference))/(Reference,(Eq,(AttValue/Reference))?) 
# general content of an element CharData := ( -[<&]+ / -(STag / EmptyElemTag / ETag / Reference / CDSect / PI / Comment) )+ # special non-parsed character data sections CDSect := CDStart, CData, CDEnd := ' := ']]>' document := prolog, element, Misc* """ from simpleparse.common import numbers, strings, chartypes SimpleParse-2.2.0/simpleparse/xmlparser/__init__.py0000644000175000017500000000050412620706017024005 0ustar mcfletchmcfletch00000000000000"""XML Parsing package At the moment it's really limited, but it does the basics, and the rest is mostly just a matter of fiddling about with Unicode and CharacterType support. There is only very minimal support for Reference types, basically we note that a Reference exists, but don't do any further processing of it. """SimpleParse-2.2.0/simpleparse/stt/0000755000175000017500000000000012620710576020477 5ustar mcfletchmcfletch00000000000000SimpleParse-2.2.0/simpleparse/stt/Doc/0000755000175000017500000000000012620710576021204 5ustar mcfletchmcfletch00000000000000SimpleParse-2.2.0/simpleparse/stt/Doc/mxLicense.html0000644000175000017500000007531712037615407024035 0ustar mcfletchmcfletch00000000000000 mx Extension Series - License Information

mx Extension Series - License Information


Public License : Commercial License : Home Version 1.1.0

Introduction

eGenix.com Public License

    The eGenix.com Public License is similar to the Python 2.0 license and considered an Open Source license (in the sense defined by the Open Source Initiative (OSI)) by eGenix.com.

    The license should also be compatible to the GNU Public License in case that matters. The only part which is known to have caused some problems with Richard Stallman in the past is the choice of law clause.

    ________________________________________________________________________
    
    EGENIX.COM PUBLIC LICENSE AGREEMENT                        VERSION 1.1.0
    ________________________________________________________________________
    
    1.  Introduction
    
        This "License Agreement" is between eGenix.com Software, Skills
        and Services GmbH ("eGenix.com"), having an office at
        Pastor-Loeh-Str. 48, D-40764 Langenfeld, Germany, and the
        Individual or Organization ("Licensee") accessing and otherwise
        using this software in source or binary form and its associated
        documentation ("the Software").
    
    2.  License 
    
        Subject to the terms and conditions of this eGenix.com Public
        License Agreement, eGenix.com hereby grants Licensee a
        non-exclusive, royalty-free, world-wide license to reproduce,
        analyze, test, perform and/or display publicly, prepare derivative
        works, distribute, and otherwise use the Software alone or in any
        derivative version, provided, however, that the eGenix.com Public
        License Agreement is retained in the Software, or in any
        derivative version of the Software prepared by Licensee.
    
    3.  NO WARRANTY
    
        eGenix.com is making the Software available to Licensee on an "AS
        IS" basis.  SUBJECT TO ANY STATUTORY WARRANTIES WHICH CAN NOT BE
        EXCLUDED, EGENIX.COM MAKES NO REPRESENTATIONS OR WARRANTIES,
        EXPRESS OR IMPLIED.  BY WAY OF EXAMPLE, BUT NOT LIMITATION,
        EGENIX.COM MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY
        OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT
        THE USE OF THE SOFTWARE WILL NOT INFRINGE ANY THIRD PARTY RIGHTS.
    
    4.  LIMITATION OF LIABILITY
    
        EGENIX.COM SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF
        THE SOFTWARE FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES
        OR LOSS (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF
        BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
        INFORMATION, OR OTHER PECUNIARY LOSS) AS A RESULT OF USING,
        MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY DERIVATIVE THEREOF,
        EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
    
        SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OR LIMITATION OF
        INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO THE ABOVE EXCLUSION OR
        LIMITATION MAY NOT APPLY TO LICENSEE.
    
    5.  Termination
    
        This License Agreement will automatically terminate upon a
        material breach of its terms and conditions.
    
    6.  Third Party Rights 
    
        Any software or documentation in source or binary form provided
        along with the Software that is associated with a separate license
        agreement is licensed to Licensee under the terms of that license
        agreement. This License Agreement does not apply to those portions
        of the Software. Copies of the third party licenses are included
        in the Software Distribution.
    
    7.  General
    
        Nothing in this License Agreement affects any statutory rights of
        consumers that cannot be waived or limited by contract.
    
        Nothing in this License Agreement shall be deemed to create any
        relationship of agency, partnership, or joint venture between
        eGenix.com and Licensee.
    
        If any provision of this License Agreement shall be unlawful,
        void, or for any reason unenforceable, such provision shall be
        modified to the extent necessary to render it enforceable without
        losing its intent, or, if no such modification is possible, be
        severed from this License Agreement and shall not affect the
        validity and enforceability of the remaining provisions of this
        License Agreement.
    
        This License Agreement shall be governed by and interpreted in all
        respects by the law of Germany, excluding conflict of law
        provisions. It shall not be governed by the United Nations
        Convention on Contracts for International Sale of Goods.
    
        This License Agreement does not grant permission to use eGenix.com
        trademarks or trade names in a trademark sense to endorse or
        promote products or services of Licensee, or any third party.
    
        The controlling language of this License Agreement is English. If
        Licensee has received a translation into another language, it has
        been provided for Licensee's convenience only.
    
    8.  Agreement
    
        By downloading, copying, installing or otherwise using the
        Software, Licensee agrees to be bound by the terms and conditions
        of this License Agreement.
    
    
        For question regarding this License Agreement, please write to:
    
    	      eGenix.com Software, Skills and Services GmbH
    	      Pastor-Loeh-Str. 48
    	      D-40764 Langenfeld
    	      Germany
    		    


eGenix.com Commercial License

    The eGenix.com Commercial License covers commercial eGenix.com software, notably the mxODBC package. Only private and non-commercial use is free of charge.

    Usage of the software in commercial settings such as for implementing in-house applications in or for companies, governments, for-profit organizations, etc. requires a signed "Proof of Authorization" which can be bought from eGenix.com in order to authorize this use.

    eGenix.com Commercial Licensing Models

    We currently offer four models to choose from:

    1. CPU License: per-installation licenses (both for commercial and non-commercial use)

    2. Developer CPU License: per-developer-seat licenses which allow redistribution

    3. Reseller Agreement: agreement which allows reselling the software to third parties for standalone use

    4. Product Reseller Agreement: agreement which allows reselling the software to third parties for use in a specific product only

    The first two options are covered by the eGenix.com Commercial License through the "Proof of Authorization" forms we provide below. The two reseller options have to be negotiated between the reseller and eGenix.com. Please contact sales@eGenix.com if you are interested in becoming an eGenix.com software reseller.

    ________________________________________________________________________
    
    EGENIX.COM COMMERCIAL LICENSE AGREEMENT                    VERSION 1.1.0
    ________________________________________________________________________
    
    1.  Introduction
    
        This "License Agreement" is between eGenix.com Software, Skills
        and Services GmbH ("eGenix.com"), having an office at
        Pastor-Loeh-Str. 48, D-40764 Langenfeld, Germany, and the
        Individual or Organization ("Licensee") accessing and otherwise
        using this software in source or binary form and its associated
        documentation ("the Software").
    
    2.  Terms and Definitions
    
        The "Software" covered under this License Agreement includes
        without limitation, all object code, source code, help files,
        publications, documentation and other programs, products or tools
        that are included in the official "Software Distribution"
        available from eGenix.com.
    
        The "Proof of Authorization" for the Software is a written and
        signed notice from eGenix.com providing evidence of the extent of
        authorizations the Licensee has acquired to use the Software and
        of Licensee's eligibility for future upgrade program prices (if
        announced) and potential special or promotional opportunities. As
        such, the Proof of Authorization becomes part of this License
        Agreement.
    
        Installation of the Software ("Installation") refers to the
        process of unpacking or copying the files included in the Software
        Distribution to an Installation Target.
    
        "Installation Target" refers to the target of an installation
        operation.  Targets are defined as follows:
    
    	1) "CPU" refers to a central processing unit which is able to
    	store and/or execute the Software (a server, personal
        	computer, or other computer-like device) using at most two (2)
        	processors,
    
    	2) "Site" refers to at most one hundred fifty (150) CPUs
        	installed at a single site of a company,
    
    	3) "Corporate" refers to at most one thousand (1000) CPUs
    	installed at an unlimited number of sites of the company,
    
    	4) "Developer CPU" refers to a single CPU used by at most one (1)
        	developer.
    
        When installing the Software on a server CPU for use by other CPUs
        in a network, Licensee must obtain a License for the server CPU
        and for all client CPUs attached to the network which will make
        use of the Software by copying the Software in binary or source
        form from the server into their CPU memory. If a CPU makes use of
        more than two (2) processors, Licensee must obtain additional CPU
        licenses to cover the total number of installed
        processors. Likewise, if a Developer CPU is used by more than one
        developer, Licensee must obtain additional Developer CPU licenses
        to cover the total number of developers using the CPU.
    
        "Commercial Environment" refers to any application environment
        which is aimed at directly or indirectly generating profit. This
        includes, without limitation, for-profit organizations,
        governments, private educational institutions, work as independent
        contractor, consultant and other profit generating relationships
        with organizations or individuals.
    
        "Non-Commercial Environments" are all those application
        environments which do not directly or indirectly generate profit.
        Public educational institutions and officially acknowledged
        non-profit organizations are regarded as being a Non-Commercial
        Environments in the aforementioned sense.
    
    3.  License Grant
    
        Subject to the terms and conditions of this License Agreement,
        eGenix.com hereby grants Licensee a non-exclusive, world-wide
        license to
    
    	1) use the Software to the extent of authorizations Licensee has
    	acquired and
    
    	2) distribute, make and install copies to support the level of use
    	authorized, providing Licensee reproduces this License Agreement
    	and any other legends of ownership on each copy, or partial copy,
    	of the Software.
    
        If Licensee acquires this Software as a program upgrade,
        Licensee's authorization to use the Software from which Licensee
        upgraded is terminated.
    
        Licensee will ensure that anyone who uses the Software does so
        only in compliance with the terms of this License Agreement.
    
        Licensee may not 
    
    	1) use, copy, install, compile, modify, or distribute the
        	Software except as provided in this License Agreement;
    
    	2) reverse assemble, reverse engineer, reverse compile, or
    	otherwise translate the Software except as specifically
        	permitted by law without the possibility of contractual
        	waiver; or
    
    	3) rent, sublicense or lease the Software.
    
    4.  Authorizations
    
        The extent of authorization depends on the ownership of a Proof of
        Authorization for the Software.
    
        Usage of the Software for any other purpose not explicitly covered
        by this License Agreement or granted by the Proof of Authorization
        is not permitted and requires the written prior permission from
        eGenix.com.
    
    5.  Modifications
    
        Software modifications may only be distributed in form of patches
        to the original files contained in the Software Distribution.
    
        The patches must be accompanied by a legend of origin and
        ownership and a visible message stating that the patches are not
        original Software delivered by eGenix.com, nor that eGenix.com can
        be held liable for possible damages related directly or indirectly
        to the patches if they are applied to the Software.
    
    6.  Experimental Code or Features
    
        The Software may include components containing experimental code
        or features which may be modified substantially before becoming
        generally available.
    
        These experimental components or features may not be at the level
        of performance or compatibility of generally available eGenix.com
        products. eGenix.com does not guarantee that any of the
        experimental components or features contained in the eGenix.com
        will ever be made generally available.
    
    7.  Expiration and License Control Devices
    
        Components of the Software may contain disabling or license
        control devices that will prevent them from being used after the
        expiration of a period of time or on Installation Targets for
        which no license was obtained.
    
        Licensee will not tamper with these disabling devices or the
        components. Licensee will take precautions to avoid any loss of
        data that might result when the components can no longer be used.
    
    8.  NO WARRANTY
    
        eGenix.com is making the Software available to Licensee on an "AS
        IS" basis. SUBJECT TO ANY STATUTORY WARRANTIES WHICH CAN NOT BE
        EXCLUDED, EGENIX.COM MAKES NO REPRESENTATIONS OR WARRANTIES,
        EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION,
        EGENIX.COM MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY
        OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT
        THE USE OF THE SOFTWARE WILL NOT INFRINGE ANY THIRD PARTY RIGHTS.
    
    9.  LIMITATION OF LIABILITY
    
        TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT
        SHALL EGENIX.COM BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE
        SOFTWARE FOR (I) ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES
        OR LOSS (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF
        BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
        INFORMATION, OR OTHER PECUNIARY LOSS) AS A RESULT OF USING,
        MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY DERIVATIVE THEREOF,
        EVEN IF ADVISED OF THE POSSIBILITY THEREOF; OR (II) ANY AMOUNTS IN
        EXCESS OF THE AGGREGATE AMOUNTS PAID TO EGENIX.COM UNDER THIS
        LICENSE AGREEMENT DURING THE TWELVE (12) MONTH PERIOD PRECEEDING
        THE DATE THE CAUSE OF ACTION AROSE.
    
        SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OR LIMITATION OF
        INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO THE ABOVE EXCLUSION OR
        LIMITATION MAY NOT APPLY TO LICENSEE.
    
    10. Termination
    
        This License Agreement will automatically terminate upon a
        material breach of its terms and conditions if not cured within
        thirty (30) days of written notice by eGenix.com. Upon
        termination, Licensee shall discontinue use and remove all
        installed copies of the Software.
    
    11. Indemnification 
    
        Licensee hereby agrees to indemnify eGenix.com against and hold
        harmless eGenix.com from any claims, lawsuits or other losses that
        arise out of Licensee's breach of any provision of this License
        Agreement.
    
    12. Third Party Rights 
    
        Any software or documentation in source or binary form provided
        along with the Software that is associated with a separate license
        agreement is licensed to Licensee under the terms of that license
        agreement. This License Agreement does not apply to those portions
        of the Software. Copies of the third party licenses are included
        in the Software Distribution.
    
    13. High Risk Activities 
    
        The Software is not fault-tolerant and is not designed,
        manufactured or intended for use or resale as on-line control
        equipment in hazardous environments requiring fail-safe
        performance, such as in the operation of nuclear facilities,
        aircraft navigation or communication systems, air traffic control,
        direct life support machines, or weapons systems, in which the
        failure of the Software, or any software, tool, process, or
        service that was developed using the Software, could lead directly
        to death, personal injury, or severe physical or environmental
        damage ("High Risk Activities").
    
        Accordingly, eGenix.com specifically disclaims any express or
        implied warranty of fitness for High Risk Activities.
    
        Licensee agree that eGenix.com will not be liable for any claims
        or damages arising from the use of the Software, or any software,
        tool, process, or service that was developed using the Software,
        in such applications.
    
    14. General
    
        Nothing in this License Agreement affects any statutory rights of
        consumers that cannot be waived or limited by contract.
    
        Nothing in this License Agreement shall be deemed to create any
        relationship of agency, partnership, or joint venture between
        eGenix.com and Licensee.
    
        If any provision of this License Agreement shall be unlawful,
        void, or for any reason unenforceable, such provision shall be
        modified to the extent necessary to render it enforceable without
        losing its intent, or, if no such modification is possible, be
        severed from this License Agreement and shall not affect the
        validity and enforceability of the remaining provisions of this
        License Agreement.
    
        This License Agreement shall be governed by and interpreted in all
        respects by the law of Germany, excluding conflict of law
        provisions. It shall not be governed by the United Nations
        Convention on Contracts for International Sale of Goods.
    
        This License Agreement does not grant permission to use eGenix.com
        trademarks or trade names in a trademark sense to endorse or
        promote products or services of Licensee, or any third party.
    
        The controlling language of this License Agreement is English. If
        Licensee has received a translation into another language, it has
        been provided for Licensee's convenience only.
    
    15. Agreement
    
        By downloading, copying, installing or otherwise using the
        Software, Licensee agrees to be bound by the terms and conditions
        of this License Agreement.
    
    
        For questions regarding this License Agreement, please write to:
    
    	      eGenix.com Software, Skills and Services GmbH
    	      Pastor-Loeh-Str. 48
    	      D-40764 Langenfeld
    	      Germany
    		    

    If you have questions, please send e-mail to licenses@egenix.com or use the above postal address.

    eGenix.com Proof of Authorization (1 CPU License)

    This is the "Proof of Authorization" we send out for a "1 CPU License" subject to the above license. It permits you to install and use the Software on one machine having at most 2 processors.

    ________________________________________________________________________
    
    EGENIX.COM PROOF OF AUTHORIZATION:                         1 CPU License
    ________________________________________________________________________
    
    1.  License Grant
    
        eGenix.com Software, Skills and Services GmbH ("eGenix.com"),
        having an office at Pastor-Loeh-Str. 48, D-40764 Langenfeld,
        Germany, hereby grants the Individual or Organization ("Licensee")
    
           Licensee:  xxxxxx
    
        a non-exclusive, world-wide license to use the software listed
        below in source or binary form and its associated documentation
        ("the Software") under the terms and conditions of this License
        Agreement and to the extent authorized by this Proof of
        Authorization.
    
    2.  Covered Software
    
           Software Name:              mxODBC Python ODBC Interface
           Software Version:           2.1
    				   (including all patch level releases)
           Software Distribution:      As officially made available by 
    				   eGenix.com on http://www.egenix.com/
           Operating System:           any compatible operating system
    
    3.  Authorizations
    
        eGenix.com hereby authorizes Licensee to copy, install, compile,
        modify and use the Software on the following Installation Targets
        under the terms of this License Agreement.
    
           Installation Targets:       one (1) CPU
    
        Use of the Software for any other purpose or redistribution IS NOT
        PERMITTED BY THIS PROOF OF AUTHORIZATION.
    
    4.  Proof
    
        This Proof of Authorization was issued by
    
    	Marc-Andre Lemburg, CEO eGenix.com
    	Langenfeld, xxxx-xx-xx
    
    	Proof of Authorization Key:
    	xxxx-xxxx-xxxx-xxxx-xxxx-xxxx
    		    

    When you buy CPU licenses, you will receive a digitally signed "Proof of Authorization" by e-mail.

    The PGP key used to sign these proofs is named "eGenix.com Licenses <licenses@egenix.com>" and can be fetched from any PGP key server, e.g. OpenPGP Public Key Server. The PGP key ID is 8C25C2A2; its fingerprint is "2E1B D691 A231 E09B CEF5 C9D5 C792 13DD 8C25 C2A2". To check the digital signature, use one of the available PGP or GPG programs available on the Internet.

    eGenix.com Proof of Authorization (1 Developer CPU License)

    This is the "Proof of Authorization" we send out for a "1 Developer CPU License" subject to the above license. It allows you to redistribute the Software developed on the developer machine under certain conditions and is targeted at product developers wanting to use the Software in their products.

    Please contact sales@eGenix.com if you have questions about the redistribution conditions or other requirements.

    ________________________________________________________________________
    
    EGENIX.COM PROOF OF AUTHORIZATION:               1 Developer CPU License
    ________________________________________________________________________
    
    1.  License Grant
    
        eGenix.com Software, Skills and Services GmbH ("eGenix.com"),
        having an office at Pastor-Loeh-Str. 48, D-40764 Langenfeld,
        Germany, hereby grants the Individual or Organization ("Licensee")
    
           Licensee:  xxxxxx
    
        a non-exclusive, world-wide license to use the software listed
        below in source or binary form and its associated documentation
        ("the Software") under the terms and conditions of this License
        Agreement and to the extent authorized by this Proof of
        Authorization.
    
    2.  Covered Software
    
           Software Name:              mxODBC Python ODBC Interface
           Software Version:           2.1
    				   (including all patch level releases)
           Software Distribution:      As officially made available by 
    				   eGenix.com on http://www.egenix.com/
           Operating System:           any compatible operating system
    
    3.  Authorizations
    
    3.1. Application Development
    
        eGenix.com hereby authorizes Licensee to copy, install, compile,
        modify and use the Software on the following Developer
        Installation Targets for the purpose of developing products using
        the Software as integral part.
    
           Developer Installation Targets: one (1) CPU
    
    3.2. Redistribution
    
        eGenix.com hereby authorizes Licensee to redistribute the Software
        bundled with a product developed by Licensee on the Developer
        Installation Targets ("the Product") subject to the terms and
        conditions of this License Agreement for installation and use in
        combination with the Product on the following Redistribution
        Installation Targets, provided that:
    
            1) Licensee shall not and shall not permit or assist any third
            party to sell or distribute the Software as a separate
            product;
    
            2) Licensee shall not and shall not permit any third party to
    
               (i) market, sell or distribute the Software to any end user
               except subject to the eGenix Commercial License Agreement,
    
               (ii) rent, sell, lease or otherwise transfer the Software
               or any part thereof or use it for the benefit of any third
               party,
    
               (iii) use the Software outside the Product or for any other
               purpose not expressly licensed hereunder;
    
            3) the Product does not provide functions or capabilities
            similar to those of the Software itself, i.e. the Product does
            not introduce commercial competition for the Software as sold
            by eGenix.com.
    
           Redistribution Installation Targets: any number of CPUs capable of
                                                running the Product and the
                                                Software
    
    4.  Proof
    
        This Proof of Authorization was issued by
    
    	Marc-Andre Lemburg, CEO eGenix.com
    	Langenfeld, xxxx-xx-xx
    
    	Proof of Authorization Key:
    	xxxx-xxxx-xxxx-xxxx-xxxx-xxxx
    		    

    When you buy Developer CPU licenses, you will receive a digitally signed "Proof of Authorization" by e-mail.

    The PGP key used to sign these proofs is named "eGenix.com Licenses <licenses@egenix.com>" and can be fetched from any PGP key server, e.g. OpenPGP Public Key Server. The PGP key ID is 8C25C2A2; its fingerprint is "2E1B D691 A231 E09B CEF5 C9D5 C792 13DD 8C25 C2A2". To check the digital signature, use one of the available PGP or GPG programs available on the Internet.

    If you have questions, please send e-mail to licenses@egenix.com or use the above postal address.

    eGenix.com Proof of Authorization (Non-Commercial-Use 1 CPU License)

    This is the "Proof of Authorization" we send out for a "Non-Commercial-Use 1 CPU License" subject to the above license agreement. It permits you to install and use the Software on one machine having at most 2 processors in a Non-Commercial Environment as defined in the license agreement.

    Please contact licenses@eGenix.com if you have questions about the term "Non-Commercial Environment" and whether this license covers your needs or not.

    ________________________________________________________________________
    
    EGENIX.COM PROOF OF AUTHORIZATION:      Non-Commercial-Use 1 CPU License
    ________________________________________________________________________
    
    1.  License Grant
    
        eGenix.com Software, Skills and Services GmbH ("eGenix.com"),
        having an office at Pastor-Loeh-Str. 48, D-40764 Langenfeld,
        Germany, hereby grants the Individual or Organization ("Licensee")
    
           Licensee:  xxxxxx
    
        a non-exclusive, world-wide license to use the software listed
        below in source or binary form and its associated documentation
        ("the Software") under the terms and conditions of this License
        Agreement and to the extent authorized by this Proof of
        Authorization.
    
    2.  Covered Software
    
           Software Name:              mxODBC Python ODBC Interface
           Software Version:           2.1
    				   (including all patch level releases)
           Software Distribution:      As officially made available by 
    				   eGenix.com on http://www.egenix.com/
           Operating System:           any compatible operating system
    
    3.  Authorizations
    
        eGenix.com hereby authorizes Licensee to copy, install, compile,
        modify and use the Software on the following Installation Targets
        under the terms of this License Agreement IN NON-COMMERCIAL
        ENVIRONMENTS ONLY.
    
           Installation Targets:       one (1) CPU
    
        Use of the Software in a Commercial Environment or for any other
        purpose or redistribution IS NOT PERMITTED BY THIS PROOF OF
        AUTHORIZATION.
    
    4.  Proof
    
        This Proof of Authorization was issued by
    
    	Marc-Andre Lemburg, CEO eGenix.com
    	Langenfeld, xxxx-xx-xx
    
    	Proof of Authorization Key:
    	xxxx-xxxx-xxxx-xxxx-xxxx-xxxx
    		    

    When you request Non-Commercial-Use CPU licenses, you will receive a digitally signed "Proof of Authorization" by e-mail.

    The PGP key used to sign these proofs is named "eGenix.com Licenses <licenses@egenix.com>" and can be fetched from any PGP key server, e.g. OpenPGP Public Key Server. The PGP key ID is 8C25C2A2; its fingerprint is "2E1B D691 A231 E09B CEF5 C9D5 C792 13DD 8C25 C2A2". To check the digital signature, use one of the available PGP or GPG programs available on the Internet.


© 2000-2003, Copyright by eGenix.com Software GmbH, Langenfeld, Germany; All Rights Reserved. mailto: info@egenix.com
SimpleParse-2.2.0/simpleparse/stt/Doc/eGenix-mx-Extensions.html0000644000175000017500000015606212037615407026101 0ustar mcfletchmcfletch00000000000000 eGenix.com mx Extensions for Python

eGenix.com mx Extensions for Python


BASE package
    ( mxDateTime : mxTextTools : mxStack : mxTools : mxProxy : mxURL : mxUID : History : Download )
COMMERCIAL package
    ( mxODBC : History : Buy Licenses : Special Offer : Download )
EXPERIMENTAL package
    ( mxNumber : mxTidy : History : Download )
Commercial Support : Home
   

Introduction

    The eGenix.com mx Extensions for Python are a collection of professional quality Python software tools which enhance Python's usability in many important areas such as ODBC database connectivity, fast text processing, date/time processing and web site programming.

    The tools have a proven record of being portable across many Unix and Windows platforms, e.g. you can write applications which use an ODBC database on Windows which then run on Unix platforms without change due to the consistent platforms independent interfaces.

    All of the available packages have shown their stability and usefulness in many mission critical applications and various commercial settings all around the world.

    The two most well-known packages from the mx Extension Series are mxDateTime and mxODBC providing date/time services and professional ODBC database connectivity on practically all supported Python platforms. These two packages enable database software which is portable not only across platforms, but also across database backends.

Overview


Packages

    The following subpackages are included in the eGenix.com mx Extension series, each providing fast and efficient implementations for various application domains. All subpackages live in the mx top-level Python package to avoid naming collisions with other Python software.

    eGenix.com mx BASE Package:

      mxDateTime - Generic Date/Time Datatypes
      mxTextTools - Fast Text Processing Tools
      mxStack - Fast and Memory-Efficient Stack Datatype
      mxTools - Collection of Additional Builtins
      mxProxy - Generic Object Proxy & Weak Reference Datatype
      mxBeeBase - On-disk B+Tree Database Construction Kit
      mxURL - Efficient Storage and Management of URL/URI Information
      mxUID - Create and Manage Unique IDs

      >>> Download

    eGenix.com mx COMMERCIAL Package:

      mxODBC - Python DB-API compatible ODBC 2.0 - 3.5 database interface;
      supports Python 1.5.2 and Unicode for Python 2.0 and later

      >>> Download and Buy Licenses

    eGenix.com mx EXPERIMENTAL Package:

      mxNumber - Interface to GNU MP's High Precision Numerics
      mxTidy - Interface to a library version of HTML Tidy

      >>> Download



eGenix.com mx Extensions - BASE Package

  Version 2.0.3  


eGenix.com mx Extensions - COMMERCIAL Package

  Version 2.0.4  


eGenix.com mx Extensions - EXPERIMENTAL Package

  Version 0.7.0  


Commercial Support


© 1997-2000, Copyright by Marc-André Lemburg; All Rights Reserved. mailto: mal@lemburg.com
© 2000-2001, Copyright by eGenix.com Software GmbH, Langenfeld; All Rights Reserved. mailto: info@egenix.com
Trademarks: "mx Extensions" is a trademark of Marc-Andre Lemburg and eGenix.com GmbH.
SimpleParse-2.2.0/simpleparse/stt/Doc/mxTextTools.html0000644000175000017500000022276012037615407024414 0ustar mcfletchmcfletch00000000000000 TextTools - Fast Text Manipulation Tools for Python

mxTextTools - Fast Text Manipulation Tools for Python


Engine : TextSearch Objects : CharSet Objects : Functions : Constants : Examples : Structure : Support : Download : Copyright & License : History : Home Version 2.1.0

Introduction

    mxTextTools is a collection of high-speed string manipulation routines and new Python objects for dealing with common text processing tasks.

    One of the major features of this package is the integrated Tagging Engine which allows accessing the speed of compiled C programs while maintaining the portability of Python. The Tagging Engine uses byte code "programs" written in form of Python tuples. These programs are then translated into an internal binary form which gets processed by a very fast virtual machine designed specifically for scanning text data.

    As a result, the Tagging Engine allows parsing text at higher speeds than e.g. regular expression packages while still maintaining the flexibility of programming the parser in Python. Callbacks and user-defined matching functions extends this approach far beyond what you could do with other common text processing methods.

    Two other major features are the search and character set objects provided by the package. Both are implemented in C to give you maximum performance on all supported platforms.

    A note about the word 'tagging': This originated from what is done in HTML to mark some text with a certain extra information. The Tagging Engine extends this notion to assigning Python objects to text substrings. Every substring marked in this way carries a 'tag' (the object) which can be used to do all kinds of useful things.

    If you are looking for more tutorial style documentation of mxTextTools, there's a new book by David Mertz about Text Processing with Python which covers mxTextTools and other text oriented tools at great length.

Tagging Engine

TextSearch Object

    The TextSearch object is immutable and usable for one search string per object only. However, once created, the TextSearch objects can be applied to as many text strings as you like -- much like compiled regular expressions. Matching is done exact (doing translations on-the-fly if supported by the search algorithm).

    Furthermore, the TextSearch objects can be pickled and implement the copy protocol as defined by the copy module. Comparisons and hashing are not implemented (the objects are stored by id in dictionaries).

    Depending on the search algorithm, TextSearch objects can search in 8-bit strings and/or Unicode. Searching in memory buffers is currently not supported. Accordingly, the search string itself may also be an 8-bit string or Unicode.

    TextSearch Object Constructors

      In older versions of mxTextTools there were two separate constructors for search objects: BMS() for Boyer-Moore and FS() for the (unpublished) FastSearch algorithm. With 2.1.0 the interface was changed to merge these two constructors into one having the algorithm type as parameter.

      Note: The FastSearch algorithm is *not* included in the public release of mxTextTools.

      TextSearch(match,translate=None,algorithm=default_algorithm)
      Create a TextSearch substring search object for the string match implementing the algorithm specified in the constructor.

      algorithm defines the algorithm to use. Possible values are:

      BOYERMOORE
      Enhanced Boyer-Moore-Horspool style algorithm for searching in 8-bit text. Unicode is not supported. On-the-fly translation is supported.

      FASTSEARCH
      Enhanced Boyer-Moore style algorithm for searching in 8-bit text. This algorithm provides better performance for match patterns having repeating sequences, like e.g. DNA strings. Unicode is not supported. On-the-fly translation is supported.

      Not included in the public release of mxTextTools.

      TRIVIAL
      Trivial right-to-left search algorithm. This algorithm can be used to search in 8-bit text and Unicode. On-the-fly translation is not supported.

      algorithm defaults to BOYERMOORE (or FASTSEARCH if available) for 8-bit match strings and TRIVIAL for Unicode match strings.

      translate is an optional translate-string like the one used in the module 're', i.e. a 256 character string mapping the ordinals of the base character set to new characters. It is supported by the BOYERMOORE and the FASTSEARCH algorithm only.

      This function supports keyword arguments.

      BMS(match[,translate])
      DEPRECATED: Use TextSearch(match, translate, BOYERMOORE) instead.

      FS(match[,translate])
      DEPRECATED: Use TextSearch(match, translate, FASTSEARCH) instead.

    TextSearch Object Instance Variables

      To provide some help for reflection and pickling the TextSearch object gives (read-only) access to these attributes.

      match
      The string that the search object will look for in the search text.

      translate
      The translate string used by the object or None (if no translate string was passed to the constructor).

      algorithm
      The algorithm used by the TextSearch object. For possible values, see the TextSearch() constructor documentation.

    TextSearch Object Instance Methods

      The TextSearch object has the following methods:

      search(text,[start=0,stop=len(text)])
      Search for the substring match in text, looking only at the slice [start:stop] and return the slice (l,r) where the substring was found, or (start,start) if it was not found.

      find(text,[start=0,stop=len(text)])
      Search for the substring match in text, looking only at the slice [start:stop] and return the index where the substring was found, or -1 if it was not found. This interface is compatible with string.find.

      findall(text,start=0,stop=len(text))
      Same as search(), but return a list of all non-overlapping slices (l,r) where the match string can be found in text.

      Note that translating the text before doing the search often results in a better performance. Use string.translate() to do that efficiently.

CharSet Object

    The CharSet object is an immutable object which can be used for character set based string operations like text matching, searching, splitting etc.

    CharSet objects can be pickled and implement the copy protocol as defined by the copy module as well as the 'in'-protocol, so that c in charset works as expected. Comparisons and hashing are not implemented (the objects are stored by id in dictionaries).

    The objects support both 8-bit strings and UCS-2 Unicode in both the character set definition and the various methods. Mixing of the supported types is also allowed. Memory buffers are currently not supported.

    CharSet Object Constructor

      CharSet(definition)
      Create a CharSet object for the given character set definition.

      definition may be an 8-bit string or Unicode.

      The constructor supports the re-module syntax for defining character sets: "a-e" maps to "abcde" (the backslash can be used to escape the special meaning of "-", e.g. r"a\-e" maps to "a-e") and "^a-e" maps to the set containing all but the characters "abcde".

      Note that the special meaning of "^" only applies if it appears as first character in a CharSet definition. If you want to create a CharSet with the single character "^", then you'll have to use the escaped form: r"\^". The non-escape form "^" would result in a CharSet matching all characters.

      To add the backslash character to a CharSet you have to escape with itself: r"\\".

      Watch out for the Python quoting semantics in these explanations: the small r in front of some of these strings makes the raw Python literal strings which means that no interpretation of backslashes is applied: r"\\" == "\\\\" and r"a\-e" == "a\\-e".

    CharSet Object Instance Variables

      To provide some help for reflection and pickling the CharSet object gives (read-only) access to these attributes.

      definition
      The definition string which was passed to the constructor.

    CharSet Object Instance Methods

      The CharSet object has these methods:

      contains(char)
      Return 1 if char is included in the character set, 0 otherwise.

      search(text[, direction=1, start=0, stop=len(text)])
      Search text[start:stop] for the first character included in the character set. Returns None if no such character is found or the index position of the found character.

      direction defines the search direction: a positive value searches forward starting from text[start], while a negative value searches backwards from text[stop-1].

      match(text[, direction=1, start=0, stop=len(text)])
      Look for the longest match of characters in text[start:stop] which appear in the character set. Returns the length of this match as integer.

      direction defines the match direction: a positive value searches forward starting from text[start] giving a prefix match, while a negative value searches backwards from text[stop-1] giving a suffix match.

      split(text, [,start=0, stop=len(text)])
      Split text[start:stop] into a list of substrings using the character set definition, omitting the splitting parts and empty substrings.

      splitx(text, [,start=0, stop=len(text)])
      Split text[start:stop] into a list of substrings using the character set definition, such that every second entry consists only of characters in the set.

      strip(text[, where=0, start=0, stop=len(text)])
      Strip all characters in text[start:stop] appearing in the character set.

      where indicates where to strip (<0: left; =0: left and right; >0: right).

Functions

Constants

    The package exports these constants. They are defined in Constants/Sets.

    Note that Unicode defines many more characters in the following categories. The character sets defined here are restricted to ASCII (and parts of Latin-1) only.

      a2z
      'abcdefghijklmnopqrstuvwxyz'

      A2Z
      'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

      a2z
      'abcdefghijklmnopqrstuvwxyz'

      umlaute
      'äöüß'

      Umlaute
      'ÄÖÜ'

      alpha
      A2Z + a2z

      a2z
      'abcdefghijklmnopqrstuvwxyz'

      german_alpha
      A2Z + a2z + umlaute + Umlaute

      number
      '0123456789'

      alphanumeric
      alpha + number

      white
      ' \t\v'

      newline
      '\n\r'

      formfeed
      '\f'

      whitespace
      white + newline + formfeed

      any
      All characters from \000-\377

      *_charset
      All of the above as CharSet() objects.

      *_set
      All of the above as set() compatible character sets.

      tagtable_cache
      This the cache dictionary which is used by the TagTable() compiler to store compiled Tag Table definitions. It has a hard limit of 100 entries, but can also be managed by user routines to lower this limit.

      BOYERMOORE, FASTSEARCH, TRIVIAL
      TextSearch() algorithm values.

Examples of Use

    The Examples/ subdirectory of the package contains a few examples of how tables can be written and used. Here is a non-trivial example for parsing HTML (well, most of it):

    
        from simpleparse.stt.TextTools import *
    
        error = '***syntax error'			# error tag obj
    
        tagname_set = set(alpha+'-'+number)
        tagattrname_set = set(alpha+'-'+number)
        tagvalue_set = set('"\'> ',0)
        white_set = set(' \r\n\t')
    
        tagattr = (
    	   # name
    	   ('name',AllInSet,tagattrname_set),
    	   # with value ?
    	   (None,Is,'=',MatchOk),
    	   # skip junk
    	   (None,AllInSet,white_set,+1),
    	   # unquoted value
    	   ('value',AllInSet,tagvalue_set,+1,MatchOk),
    	   # double quoted value
    	   (None,Is,'"',+5),
    	     ('value',AllNotIn,'"',+1,+2),
    	     ('value',Skip,0),
    	     (None,Is,'"'),
    	     (None,Jump,To,MatchOk),
    	   # single quoted value
    	   (None,Is,'\''),
    	     ('value',AllNotIn,'\'',+1,+2),
    	     ('value',Skip,0),
    	     (None,Is,'\'')
    	   )
    
        valuetable = (
    	# ignore whitespace + '='
    	(None,AllInSet,set(' \r\n\t='),+1),
    	# unquoted value
    	('value',AllInSet,tagvalue_set,+1,MatchOk),
    	# double quoted value
    	(None,Is,'"',+5),
    	 ('value',AllNotIn,'"',+1,+2),
    	 ('value',Skip,0),
    	 (None,Is,'"'),
    	 (None,Jump,To,MatchOk),
    	# single quoted value
    	(None,Is,'\''),
    	 ('value',AllNotIn,'\'',+1,+2),
    	 ('value',Skip,0),
    	 (None,Is,'\'')
    	)
    
        allattrs = (# look for attributes
    	       (None,AllInSet,white_set,+4),
    	        (None,Is,'>',+1,MatchOk),
    	        ('tagattr',Table,tagattr),
    	        (None,Jump,To,-3),
    	       (None,Is,'>',+1,MatchOk),
    	       # handle incorrect attributes
    	       (error,AllNotIn,'> \r\n\t'),
    	       (None,Jump,To,-6)
    	       )
    
        htmltag = ((None,Is,'<'),
    	       # is this a closing tag ?
    	       ('closetag',Is,'/',+1),
    	       # a coment ?
    	       ('comment',Is,'!',+8),
    		(None,Word,'--',+4),
    		('text',sWordStart,BMS('-->'),+1),
    		(None,Skip,3),
    		(None,Jump,To,MatchOk),
    		# a SGML-Tag ?
    		('other',AllNotIn,'>',+1),
    		(None,Is,'>'),
    		    (None,Jump,To,MatchOk),
    		   # XMP-Tag ?
    		   ('tagname',Word,'XMP',+5),
    		    (None,Is,'>'),
    		    ('text',WordStart,'</XMP>'),
    		    (None,Skip,len('</XMP>')),
    		    (None,Jump,To,MatchOk),
    		   # get the tag name
    		   ('tagname',AllInSet,tagname_set),
    		   # look for attributes
    		   (None,AllInSet,white_set,+4),
    		    (None,Is,'>',+1,MatchOk),
    		    ('tagattr',Table,tagattr),
    		    (None,Jump,To,-3),
    		   (None,Is,'>',+1,MatchOk),
    		   # handle incorrect attributes
    		   (error,AllNotIn,'> \n\r\t'),
    		   (None,Jump,To,-6)
    		  )
    
        htmltable = (# HTML-Tag
    		 ('htmltag',Table,htmltag,+1,+4),
    		 # not HTML, but still using this syntax: error or inside XMP-tag !
    		 (error,Is,'<',+3),
    		  (error,AllNotIn,'>',+1),
    		  (error,Is,'>'),
    		 # normal text
    		 ('text',AllNotIn,'<',+1),
    		 # end of file
    		 ('eof',EOF,Here,-5),
    		)
          
    	

    I hope this doesn't scare you away :-) ... it's fast as hell.

Package Structure

    [TextTools]
           [Constants]
                  Sets.py
                  TagTables.py
           Doc/
           [Examples]
                  HTML.py
                  Loop.py
                  Python.py
                  RTF.py
                  RegExp.py
                  Tim.py
                  Words.py
                  altRTF.py
                  pytag.py
           [mxTextTools]
                  test.py
           TextTools.py
        

    Entries enclosed in brackets are packages (i.e. they are directories that include a __init__.py file). Ones with slashes are just ordinary subdirectories that are not accessible via import.

    The package TextTools imports everything needed from the other components. It is sometimes also handy to do a from simpleparse.stt.TextTools.Constants.TagTables import *.

    Examples/ contains a few demos of what the Tag Tables can do.

Optional Add-Ons for mxTextTools

    Mike C. Fletcher is working on a Tag Table generator called SimpleParse. It works as a parser generating front end to the Tagging Engine and converts an EBNF style grammar into a Tag Table directly usable with the tag() function.

    Tony J. Ibbs has started to work on a meta-language for mxTextTools. It aims at simplifying the task of writing Tag Table tuples using a Python style syntax. It also gets rid of the annoying jump offset calculations.

    Andrew Dalke has started work on a parser generator called Martel built upon mxTextTools which takes a regular expression grammar for a format and turns the resultant parsed tree into a set of callback events emulating the XML/SAX API. The results look very promising!

Support

Copyright & License

History & Future


© 1997-2000, Copyright by Marc-André Lemburg; All Rights Reserved. mailto: mal@lemburg.com

© 2000-2001, Copyright by eGenix.com Software GmbH; All Rights Reserved. mailto: info@egenix.com

SimpleParse-2.2.0/simpleparse/stt/mxLicense.html0000644000175000017500000006224612037615407023325 0ustar mcfletchmcfletch00000000000000 mx Extension Series - License Information

mx Extension Series - License Information


Public License : Commercial License : Home Version 1.0.0

Introduction

    The mx Extensions Series packages are brought to you by the eGenix.com Software, Skills and Services GmbH, Langenfeld, Germany. We are licensing our products under the following two different licenses:

    The Public License is very similar to the Python 2.0 license and covers the open source software made available by eGenix.com which is free of charge even for commercial use.

    The Commercial License is intended for covering commercial eGenix.com software, notably the mxODBC package. Only private and non-commercial use is free of charge.

    If you have questions regarding these licenses, please contact Licenses@eGenix.com. If you would like to bundle the software with your commercial product, please write to Sales@eGenix.com for more information about the redistribution conditions and terms.

eGenix.com Public License

    The eGenix.com Public License is similar to the Python 2.0 license and is considered an Open Source license (in the sense defined by the Open Source Initiative (OSI)) by eGenix.com.

    The license should also be compatible with the GNU Public License in case that matters. The only part which is known to have caused some problems with Richard Stallman in the past is the choice of law clause.

    EGENIX.COM PUBLIC LICENSE AGREEMENT VERSION 1.0.0

    1. Introduction

    This "License Agreement" is between eGenix.com Software, Skills and Services GmbH ("eGenix.com"), having an office at Pastor-Loeh-Str. 48, D-40764 Langenfeld, Germany, and the Individual or Organization ("Licensee") accessing and otherwise using this software in source or binary form and its associated documentation ("the Software").

    2. License

    Subject to the terms and conditions of this eGenix.com Public License Agreement, eGenix.com hereby grants Licensee a non-exclusive, royalty-free, world-wide license to reproduce, analyze, test, perform and/or display publicly, prepare derivative works, distribute, and otherwise use the Software alone or in any derivative version, provided, however, that the eGenix.com Public License Agreement is retained in the Software, or in any derivative version of the Software prepared by Licensee.

    3. NO WARRANTY

    eGenix.com is making the Software available to Licensee on an "AS IS" basis. SUBJECT TO ANY STATUTORY WARRANTIES WHICH CAN NOT BE EXCLUDED, EGENIX.COM MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, EGENIX.COM MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE WILL NOT INFRINGE ANY THIRD PARTY RIGHTS.

    4. LIMITATION OF LIABILITY

    EGENIX.COM SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE SOFTWARE FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY LOSS) AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.

    SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OR LIMITATION OF INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO THE ABOVE EXCLUSION OR LIMITATION MAY NOT APPLY TO LICENSEE.

    5. Termination

    This License Agreement will automatically terminate upon a material breach of its terms and conditions.

    6. General

    Nothing in this License Agreement affects any statutory rights of consumers that cannot be waived or limited by contract.

    Nothing in this License Agreement shall be deemed to create any relationship of agency, partnership, or joint venture between eGenix.com and Licensee.

    If any provision of this License Agreement shall be unlawful, void, or for any reason unenforceable, such provision shall be modified to the extent necessary to render it enforceable without losing its intent, or, if no such modification is possible, be severed from this License Agreement and shall not affect the validity and enforceability of the remaining provisions of this License Agreement.

    This License Agreement shall be governed by and interpreted in all respects by the law of Germany, excluding conflict of law provisions. It shall not be governed by the United Nations Convention on Contracts for International Sale of Goods.

    This License Agreement does not grant permission to use eGenix.com trademarks or trade names in a trademark sense to endorse or promote products or services of Licensee, or any third party.

    The controlling language of this License Agreement is English. If Licensee has received a translation into another language, it has been provided for Licensee's convenience only.

    14. Agreement

    By downloading, copying, installing or otherwise using the Software, Licensee agrees to be bound by the terms and conditions of this License Agreement.

eGenix.com Commercial License

    The eGenix.com Commercial License covers commercial eGenix.com software, notably the mxODBC package. Only private and non-commercial use is free of charge. Usage of the software in commercial settings such as for implementing in-house applications in/for companies or consulting work where the software is used as a tool requires a "Proof of Authorization" which can be bought from eGenix.com.

    EGENIX.COM COMMERCIAL LICENSE AGREEMENT VERSION 1.0.0

    1. Introduction

    This "License Agreement" is between eGenix.com Software, Skills and Services GmbH ("eGenix.com"), having an office at Pastor-Loeh-Str. 48, D-40764 Langenfeld, Germany, and the Individual or Organization ("Licensee") accessing and otherwise using this software in source or binary form and its associated documentation ("the Software").

    2. Terms and Definitions

    The "Software" covered under this License Agreement includes without limitation, all object code, source code, help files, publications, documentation and other programs, products or tools that are included in the official "Software Distribution" available from eGenix.com.

    The "Proof of Authorization" for the Software is a written and signed notice from eGenix.com providing evidence of the extent of authorizations the Licensee has acquired to use the Software and of Licensee's eligibility for future upgrade program prices (if announced) and potential special or promotional opportunities. As such, the Proof of Authorization becomes part of this License Agreement.

    Installation of the Software ("Installation") refers to the process of unpacking or copying the files included in the Software Distribution to an Installation Target.

    "Installation Target" refers to the target of an installation operation. Targets are defined as follows:

    1) "CPU" refers to a central processing unit which is able to store and/or execute the Software (a server, personal computer, or other computer-like device) using at most two (2) processors,

    2) "Site" refers to at most one hundred fifty (150) CPUs installed at a single site of a company,

    3) "Corporate" refers to at most one thousand (1000) CPUs installed at an unlimited number of sites of the company,

    4) "Developer CPU" refers to a single CPU used by at most one (1) developer.

    When installing the Software on a server CPU for use by other CPUs in a network, Licensee must obtain a License for the server CPU and for all client CPUs attached to the network which will make use of the Software by copying the Software in binary or source form from the server into their CPU memory. If a CPU makes use of more than two (2) processors, Licensee must obtain additional CPU licenses to cover the total number of installed processors. Likewise, if a Developer CPU is used by more than one developer, Licensee must obtain additional Developer CPU licenses to cover the total number of developers using the CPU.

    "Commercial Environment" refers to any application environment which is aimed at producing profit. This includes, without limitation, for-profit organizations, work as independent contractor, consultant and other profit generating relationships with organizations or individuals.

    "Non-Commercial Environments" are all those application environments which do not directly or indirectly generate profit. Educational and other officially acknowledged non-profit organizations are regarded as being a Non-Commercial Environment in the above sense.

    3. License Grant

    Subject to the terms and conditions of this License Agreement, eGenix.com hereby grants Licensee a non-exclusive, world-wide license to

    1) use the Software to the extent of authorizations Licensee has acquired and

    2) distribute, make and install copies to support the level of use authorized, providing Licensee reproduces this License Agreement and any other legends of ownership on each copy, or partial copy, of the Software.

    If Licensee acquires this Software as a program upgrade, Licensee's authorization to use the Software from which Licensee upgraded is terminated.

    Licensee will ensure that anyone who uses the Software does so only in compliance with the terms of this License Agreement.

    Licensee may not

    1) use, copy, install, compile, modify, or distribute the Software except as provided in this License Agreement;

    2) reverse assemble, reverse engineer, reverse compile, or otherwise translate the Software except as specifically permitted by law without the possibility of contractual waiver; or

    3) rent, sublicense or lease the Software.

    4. Authorizations

    The extent of authorization depends on the ownership of a Proof of Authorization for the Software.

    Usage of the Software for any other purpose not explicitly covered by this License Agreement or granted by the Proof of Authorization is not permitted and requires the written prior permission from eGenix.com.

    4.1. Non-Commercial Environments

    This section applies to all uses of the Software without a Proof of Authorization for the Software in a Non-Commercial Environment.

    Licensee may copy, install, compile, modify and use the Software under the terms of this License Agreement FOR NON-COMMERCIAL PURPOSES ONLY.

    Use of the Software in a Commercial Environment or for any other purpose, such as redistribution, IS NOT PERMITTED BY THIS LICENSE and requires a Proof of Authorization from eGenix.com.

    4.2. Evaluation Period for Commercial Environments

    This section applies to all uses of the Software without a Proof of Authorization for the Software in a Commercial Environment.

    Licensee may copy, install, compile, modify and use the Software under the terms of this License Agreement FOR EVALUATION AND TESTING PURPOSES and DURING A LIMITED EVALUATION PERIOD OF AT MOST THIRTY (30) DAYS AFTER INITIAL INSTALLATION ONLY.

    For use of the Software after the evaluation period or for any other purpose, such as redistribution, Licensee must obtain a Proof of Authorization from eGenix.com.

    If Licensee decides not to obtain a Proof of Authorization after the evaluation period, Licensee agrees to cease using and to remove all installed copies of the Software.

    4.3. Usage under Proof of Authorization

    This section applies to all uses of the Software provided that Licensee owns a Proof of Authorization for the Software.

    Licensee may copy, install, compile, modify, use and distribute the Software to the extent of authorization acquired by the Proof of Authorization and under the terms and conditions of this License Agreement.

    5. Transfer of Rights and Obligations

    Licensee may transfer all license rights and obligations under a Proof of Authorization for the Software to another party by transferring the Proof of Authorization and a copy of this License Agreement and all documentation.

    The transfer of Licensee's license rights and obligations terminates Licensee's authorization to use the Software under the Proof of Authorization.

    6. Modifications

    Software modifications may only be distributed in form of patches to the original files contained in the Software Distribution.

    The patches must be accompanied by a legend of origin and ownership and a visible message stating that the patches are not original Software delivered by eGenix.com, nor that eGenix.com can be held liable for possible damages related directly or indirectly to the patches if they are applied to the Software.

    7. Experimental Code or Features

    The Software may include components containing experimental code or features which may be modified substantially before becoming generally available.

    These experimental components or features may not be at the level of performance or compatibility of generally available eGenix.com products. eGenix.com does not guarantee that any of the experimental components or features contained in the eGenix.com Software will ever be made generally available.

    8. Expiration and License Control Devices

    Components of the Software may contain disabling or license control devices that will prevent them from being used after the expiration of a period of time or on Installation Targets for which no license was obtained.

    Licensee will not tamper with these disabling devices or the components. Licensee will take precautions to avoid any loss of data that might result when the components can no longer be used.

    9. NO WARRANTY

    eGenix.com is making the Software available to Licensee on an "AS IS" basis. SUBJECT TO ANY STATUTORY WARRANTIES WHICH CAN NOT BE EXCLUDED, EGENIX.COM MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, EGENIX.COM MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE WILL NOT INFRINGE ANY THIRD PARTY RIGHTS.

    10. LIMITATION OF LIABILITY

    TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL EGENIX.COM BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE SOFTWARE FOR (I) ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY LOSS) AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF; OR (II) ANY AMOUNTS IN EXCESS OF THE AGGREGATE AMOUNTS PAID TO EGENIX.COM UNDER THIS LICENSE AGREEMENT DURING THE TWELVE (12) MONTH PERIOD PRECEDING THE DATE THE CAUSE OF ACTION AROSE.

    SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OR LIMITATION OF INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO THE ABOVE EXCLUSION OR LIMITATION MAY NOT APPLY TO LICENSEE.

    11. Termination

    This License Agreement will automatically terminate upon a material breach of its terms and conditions if not cured within thirty (30) days of written notice by eGenix.com. Upon termination, Licensee shall discontinue use and remove all installed copies of the Software.

    12. Indemnification

    Licensee hereby agrees to indemnify eGenix.com against and hold harmless eGenix.com from any claims, lawsuits or other losses that arise out of Licensee's breach of any provision of this License Agreement.

    13. Third Party Rights

    Any software or documentation in source or binary form provided along with the Software that is associated with a separate license agreement is licensed to Licensee under the terms of that license agreement. This License Agreement does not apply to those portions of the Software. Copies of the third party licenses are included in the Software Distribution.

    14. High Risk Activities

    The Software is not fault-tolerant and is not designed, manufactured or intended for use or resale as on-line control equipment in hazardous environments requiring fail-safe performance, such as in the operation of nuclear facilities, aircraft navigation or communication systems, air traffic control, direct life support machines, or weapons systems, in which the failure of the Software, or any software, tool, process, or service that was developed using the Software, could lead directly to death, personal injury, or severe physical or environmental damage ("High Risk Activities").

    Accordingly, eGenix.com specifically disclaims any express or implied warranty of fitness for High Risk Activities.

    Licensee agree that eGenix.com will not be liable for any claims or damages arising from the use of the Software, or any software, tool, process, or service that was developed using the Software, in such applications.

    15. General

    Nothing in this License Agreement affects any statutory rights of consumers that cannot be waived or limited by contract.

    Nothing in this License Agreement shall be deemed to create any relationship of agency, partnership, or joint venture between eGenix.com and Licensee.

    If any provision of this License Agreement shall be unlawful, void, or for any reason unenforceable, such provision shall be modified to the extent necessary to render it enforceable without losing its intent, or, if no such modification is possible, be severed from this License Agreement and shall not affect the validity and enforceability of the remaining provisions of this License Agreement.

    This License Agreement shall be governed by and interpreted in all respects by the law of Germany, excluding conflict of law provisions. It shall not be governed by the United Nations Convention on Contracts for International Sale of Goods.

    This License Agreement does not grant permission to use eGenix.com trademarks or trade names in a trademark sense to endorse or promote products or services of Licensee, or any third party.

    The controlling language of this License Agreement is English. If Licensee has received a translation into another language, it has been provided for Licensee's convenience only.

    16. Agreement

    By downloading, copying, installing or otherwise using the Software, Licensee agrees to be bound by the terms and conditions of this License Agreement.

    For question regarding this license agreement, please write to:

    	  eGenix.com Software, Skills and Services GmbH
    	  Pastor-Loeh-Str. 48
    	  D-40764 Langenfeld
    	  Germany
    

    The following two sections give examples of the "Proof of Authorization" for a commercial use license of a product under this license.

    When you buy such a license, you will receive a signed "Proof of Authorization" by postal mail within a week or two. We will also send you the Proof of Authorization Key by e-mail to acknowledge acceptance of the payment.

    EGENIX.COM PROOF OF AUTHORIZATION (Example: CPU License)

    1. License Grant

    eGenix.com Software, Skills and Services GmbH ("eGenix.com"), having an office at Pastor-Loeh-Str. 48, D-40764 Langenfeld, Germany, hereby grants the Individual or Organization ("Licensee") a non-exclusive, world-wide license to use the software listed below in source or binary form and its associated documentation ("the Software") under the terms and conditions of the eGenix.com Commercial License Agreement Version 1.0.0 and to the extent authorized by this Proof of Authorization.

    2. Covered Software

       Software Name:		   mxODBC Python ODBC Interface
       Software Version:		   Version 2.0.0
       Software Distribution:	   mxODBC-2.0.0.zip
       Software Distribution MD5 Hash: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
       Operating System:		   any compatible operating system
    

    3. Authorizations

    eGenix.com hereby authorizes Licensee to copy, install, compile, modify and use the Software on the following Installation Targets.

       Installation Targets:	   one (1) CPU
    
    Redistribution of the Software is not allowed under this Proof of Authorization.

    4. Proof

    This Proof of Authorization was issued by

    	      __________________________________
    
    
    	      Langenfeld, ______________________
    
                  Proof of Authorization Key:
                  xxxx-xxxx-xxxx-xxxx-xxxx-xxxx
    

    The next section gives an example of a "Developer CPU License" which allows you to redistribute software built around the Software or integrating it. Please contact sales@eGenix.com for questions about the redistribution conditions.

    EGENIX.COM PROOF OF AUTHORIZATION (Example: Developer License)

    1. License Grant

    eGenix.com Software, Skills and Services GmbH ("eGenix.com"), having an office at Pastor-Loeh-Str. 48, D-40764 Langenfeld, Germany, hereby grants the Individual or Organization ("Licensee") a non-exclusive, world-wide license to use and distribute the software listed below in source or binary form and its associated documentation ("the Software") under the terms and conditions of the eGenix.com Commercial License Agreement Version 1.0.0 and to the extent authorized by this Proof of Authorization.

    2. Covered Software

       Software Name:		   mxODBC Python ODBC Interface
       Software Version:		   Version 2.0.0
       Software Distribution:	   mxODBC-2.0.0.zip
       Software Distribution MD5 Hash: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
       Operating System:		   any compatible operating system
    

    3. Authorizations

    3.1. Application Development

    eGenix.com hereby authorizes Licensee to copy, install, compile, modify and use the Software on the following Developer Installation Targets for the purpose of developing products using the Software as integral part.

       Developer Installation Targets: one (1) CPU
    

    3.2. Redistribution

    eGenix.com hereby authorizes Licensee to redistribute the Software bundled with a product developed by Licensee on the Developer Installation Targets ("the Product") subject to the terms and conditions of the eGenix.com Commercial License Agreement for installation and use in combination with the Product on the following Redistribution Installation Targets, provided that:

    1) Licensee shall not and shall not permit or assist any third party to sell or distribute the Software as a separate product;

    2) Licensee shall not and shall not permit any third party to

    (i) market, sell or distribute the Software to any end user except subject to the eGenix Commercial License Agreement,

    (ii) rent, sell, lease or otherwise transfer the Software or any part thereof or use it for the benefit of any third party,

    (iii) use the Software outside the Product or for any other purpose not expressly licensed hereunder;

    3) the Product does not provide functions or capabilities similar to those of the Software itself, i.e. the Product does not introduce commercial competition for the Software as sold by eGenix.com.

       Redistribution Installation Targets:	any number of CPUs capable of
    					running the Product and the
    					Software
    

    4. Proof

    This Proof of Authorization was issued by

    	      __________________________________
    
    
    	      Langenfeld, ______________________
    
                  Proof of Authorization Key:
                  xxxx-xxxx-xxxx-xxxx-xxxx-xxxx
    


© 2000, Copyright by eGenix.com Software GmbH, Langenfeld, Germany; All Rights Reserved. mailto: info@egenix.com
SimpleParse-2.2.0/simpleparse/stt/COPYRIGHT0000644000175000017500000000124612037615407021774 0ustar mcfletchmcfletch00000000000000 --------------------------------------------------------------------- COPYRIGHT NOTICE --------------------------------------------------------------------- Copyright (c) 1997-2000, Marc-Andre Lemburg; mailto:mal@lemburg.com Copyright (c) 2000-2001, eGenix.com Software GmbH; mailto:info@egenix.com Copyright (c) 2003-2006, Mike Fletcher; mailto:mcfletch@vrplumber.com All Rights Reserved. This software is protected by copyright and other intellectual property laws and by international treaties. SimpleParse-2.2.0/simpleparse/stt/TextTools/0000755000175000017500000000000012620710576022444 5ustar mcfletchmcfletch00000000000000SimpleParse-2.2.0/simpleparse/stt/TextTools/Makefile.pkg0000644000175000017500000000154312037615407024666 0ustar mcfletchmcfletch00000000000000all: compile install # Compile target. This should setup the package and compile it # into a state ready for installation. compile: cd mxTextTools; \ if ! test -f Makefile; then $(MAKE) -f Makefile.pre.in boot; fi; \ $(MAKE) # Install target. This should do the install step. If the package # needs no further installation step (i.e. the extension is not # needed by other extension modules), then you can leave this target # blank. install: cd mxTextTools; \ $(MAKE) install # Test target. Should do some testing of the extension. Writing # something like 'ok' or 'works' and the end of a successful run. test: cd mxTextTools; \ python test.py # Documentation target. Use this to have the documentation for # an extension generated at the user. doc: # ??? # Cleanup before distributing the package # distclean: cd mxTextTools; \ $(MAKE) distclean SimpleParse-2.2.0/simpleparse/stt/TextTools/README0000644000175000017500000000021612037615407023322 0ustar mcfletchmcfletch00000000000000Please see the documentation in the Doc/ subdirectory for further information on installation and usage. 
Marc-Andre Lemburg, mal@lemburg.com SimpleParse-2.2.0/simpleparse/stt/TextTools/Constants/0000755000175000017500000000000012620710576024420 5ustar mcfletchmcfletch00000000000000SimpleParse-2.2.0/simpleparse/stt/TextTools/Constants/TagTables.py0000644000175000017500000000166612620706017026644 0ustar mcfletchmcfletch00000000000000""" Constants for writing tag tables These are defined in mxte.h and imported here via the C extension. See the documentation for details about the various constants. Copyright (c) 2000, Marc-Andre Lemburg; mailto:mal@lemburg.com Copyright (c) 2000-2003, eGenix.com Software GmbH; mailto:info@egenix.com See the documentation for further information on copyrights, or contact the author. All Rights Reserved. """ ### Module init. def _module_init(): from simpleparse.stt.TextTools.mxTextTools import mxTextTools global id2cmd id2cmd = {} # Fetch symbols from the C extension and add them to this module ns = globals() for name, value in list(vars(mxTextTools).items()): if name[:7] == '_const_': cmd = name[7:] ns[cmd] = value if value == 0: id2cmd[0] = 'Fail/Jump' else: id2cmd[value] = cmd _module_init()SimpleParse-2.2.0/simpleparse/stt/TextTools/Constants/Sets.py0000644000175000017500000000723612620706017025713 0ustar mcfletchmcfletch00000000000000# -*- coding: latin-1 -*- """ Constants for sets (of characters) Copyright (c) 2000, Marc-Andre Lemburg; mailto:mal@lemburg.com Copyright (c) 2000-2003, eGenix.com Software GmbH; mailto:info@egenix.com See the documentation for further information on copyrights, or contact the author. All Rights Reserved. 
""" import string from simpleparse.stt.TextTools.mxTextTools import CharSet # Simple character strings a2z = 'abcdefghijklmnopqrstuvwxyz' A2Z = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' umlaute = 'äöüß' Umlaute = 'ÄÖÜ' alpha = A2Z + a2z german_alpha = A2Z + a2z + umlaute + Umlaute number = '0123456789' alphanumeric = alpha + number white = ' \t\v' newline = '\r\n' formfeed = '\f' whitespace = white + newline + formfeed any = '\000\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037 !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\177\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377' # Precompiled as sets, e.g. 
a2z_set = set(a2z) a2z_set = '\000\000\000\000\000\000\000\000\000\000\000\000\376\377\377\007\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000' A2Z_set = '\000\000\000\000\000\000\000\000\376\377\377\007\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000' alpha_set = '\000\000\000\000\000\000\000\000\376\377\377\007\376\377\377\007\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000' german_alpha_set = '\000\000\000\000\000\000\000\000\376\377\377\007\376\377\377\007\000\000\000\000\000\000\000\000\020\000@\220\020\000@\020' number_set = '\000\000\000\000\000\000\377\003\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000' alphanumeric_set = '\000\000\000\000\000\000\377\003\376\377\377\007\376\377\377\007\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000' white_set = '\000\002\000\000\001\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000' newline_set = '\000$\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000' whitespace_set = '\000&\000\000\001\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000' nonwhitespace_set = '\377\301\377\377\376\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377' any_set = '\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377' # Compiled as CharSet instances a2z_charset = CharSet('a-z') A2Z_charset = CharSet('A-Z') umlaute_charset = CharSet('äöüß') Umlaute_charset = CharSet('ÄÖÜ') alpha_charset = CharSet(A2Z + a2z) german_alpha_charset = CharSet(A2Z + a2z + umlaute + Umlaute) number_charset = CharSet('0-9') alphanumeric_charset = CharSet(alpha + number) white_charset = CharSet(' \t\v') newline_charset = 
CharSet('\r\n') formfeed_charset = CharSet('\f') whitespace_charset = CharSet(white + newline + formfeed) nonwhitespace_charset = CharSet('^' + white + newline + formfeed) any_charset = CharSet('\000-\377') # Clean up del CharSet, stringSimpleParse-2.2.0/simpleparse/stt/TextTools/Constants/__init__.py0000644000175000017500000000000012037615407026516 0ustar mcfletchmcfletch00000000000000SimpleParse-2.2.0/simpleparse/stt/TextTools/COPYRIGHT0000644000175000017500000000206712037615407023743 0ustar mcfletchmcfletch00000000000000 --------------------------------------------------------------------- COPYRIGHT NOTICE --------------------------------------------------------------------- Copyright (c) 1997-2000, Marc-Andre Lemburg; mailto:mal@lemburg.com Copyright (c) 2000-2002, eGenix.com Software GmbH; mailto:info@egenix.com Copyright (c) 2003-2006, Mike Fletcher; mailto:mcfletch@vrplumber.com All Rights Reserved. This software is protected by copyright and other intellectual property laws and by international treaties. It may only be used under the conditions and terms of the eGenix.com Public License Agreement. You should have received a copy of the eGenix.com Public License Agreement with this software (usually in the file LICENSE located in the package's or software's main directory). Please write to licenses@egenix.com to obtain a copy in case you should not have received a copy. 
SimpleParse-2.2.0/simpleparse/stt/TextTools/mxTextTools/0000755000175000017500000000000012620710576024756 5ustar mcfletchmcfletch00000000000000SimpleParse-2.2.0/simpleparse/stt/TextTools/mxTextTools/recursecommands.h0000644000175000017500000000624112620706017030317 0ustar mcfletchmcfletch00000000000000/* recursive tag-table commands */ case MATCH_TABLE: case MATCH_SUBTABLE: case MATCH_TABLEINLIST: case MATCH_SUBTABLEINLIST: { PyObject * newTable = NULL; if (childReturnCode == NULL_CODE ) { /* haven't yet parsed the sub-table match */ switch (command) { /* determine the table to which we will transfer control */ case MATCH_TABLE: case MATCH_SUBTABLE: { /* switch to either current tag table or a compiled sub-table */ if (PyInt_Check(match) && PyInt_AS_LONG(match) == MATCH_THISTABLE) { newTable = (PyObject *)table; } else { newTable = match; } /* XXX Fix to auto-compile that match argument Should also test that it _is_ a compiled TagTable, rather than that it _isn't_ a tuple? */ if (!mxTagTable_Check(newTable)) { childReturnCode = ERROR_CODE; errorType = PyExc_TypeError; errorMessage = PyString_FromFormat( "Match argument must be compiled TagTable: was a %.50s", Py_TYPE(newTable)->tp_name ); } else { /* we decref in POP */ Py_INCREF(newTable); } break; } case MATCH_TABLEINLIST: case MATCH_SUBTABLEINLIST: { /* switch to explicitly specified table in a list (compiling if necessary) */ newTable = PyList_GetItem( PyTuple_GET_ITEM(match, 0), PyInt_AS_LONG( PyTuple_GET_ITEM(match, 1) ) ); if (newTable == NULL) { childReturnCode = ERROR_CODE; errorType = PyExc_TypeError; errorMessage = PyString_FromFormat( "Tag table entry %d: Could not find target table in list of tables", (unsigned int)index ); } else { if (mxTagTable_Check(newTable)) { /* This is decref'd in POP */ Py_INCREF(newTable); } else { /* These tables are considered to be cacheable. */ newTable = mxTagTable_New(newTable, table->tabletype, 1); /* why didn't we increment the refcount here? 
does New give us a new ref? */ if (newTable == NULL) { childReturnCode = ERROR_CODE; errorType = PyExc_TypeError; errorMessage = PyString_FromFormat( "Tag table entry %d: Could not compile target table", (unsigned int)index ); } } } break; } } if (childReturnCode == NULL_CODE) { /* we found a valid newTable */ PyObject *subtags = NULL; if (taglist != Py_None && command != MATCH_SUBTABLE && command != MATCH_SUBTABLEINLIST) { /* Create a new list for use as subtaglist Will be decref'd by the child-finished clause if necessary */ subtags = PyList_New(0); if (subtags == NULL) { childReturnCode = ERROR_CODE; errorType = PyExc_MemoryError; } } else { /* Use taglist as subtaglist We don't incref it as we check explicitly for whether it's the same when we go to decref (during childReturnCode handling) */ subtags = taglist; } /* match other table */ PUSH_STACK( newTable, subtags ); RESET_TABLE_VARIABLES } } break; } SimpleParse-2.2.0/simpleparse/stt/TextTools/mxTextTools/mxTextTools.c0000644000175000017500000037166112620706017027445 0ustar mcfletchmcfletch00000000000000/* mxTextTools -- Fast text manipulation routines Copyright (c) 2000, Marc-Andre Lemburg; mailto:mal@lemburg.com Copyright (c) 2000-2002, eGenix.com Software GmbH; mailto:info@egenix.com */ /* We want all our symbols to be exported */ #ifndef MX_BUILDING_MXTEXTTOOLS #define MX_BUILDING_MXTEXTTOOLS #endif /* Logging file used by debugging facility */ #ifndef MAL_DEBUG_OUTPUTFILE # define MAL_DEBUG_OUTPUTFILE "mxTextTools.log" #endif #include "mx.h" #include "mxTextTools.h" #include "structmember.h" #include #define VERSION "2.1.0" /* Initial list size used by e.g. setsplit(), setsplitx(),... */ #define INITIAL_LIST_SIZE 64 /* Maximum TagTable cache size. If this limit is reached, the cache is cleared to make room for new compile TagTables. 
*/ #define MAX_TAGTABLES_CACHE_SIZE 100 /* Define this to enable the copy-protocol (__copy__, __deepcopy__) */ #define COPY_PROTOCOL /* Convenience macro for reducing clutter */ #define ADD_INT_CONSTANT(name, value) \ if (PyModule_AddIntConstant(module, name, value) < 0) \ return NULL; /* --- module doc-string -------------------------------------------------- */ PyDoc_STRVAR(Module_docstring, MXTEXTTOOLS_MODULE" -- Tools for fast text processing. Version "VERSION"\n\n" "Copyright (c) 1997-2000, Marc-Andre Lemburg; mailto:mal@lemburg.com\n" "Copyright (c) 2000-2002, eGenix.com Software GmbH; mailto:info@egenix.com\n\n" "Copyright (c) 2003-2006, Mike Fletcher; mailto:mcfletch@vrplumber.com\n\n" " All Rights Reserved\n\n" "See the documentation for further information on copyrights,\n" "or contact the author.") ; /* --- internal macros ---------------------------------------------------- */ /* --- module globals ----------------------------------------------------- */ /* Translation strings for the 8-bit versions of lower() and upper() */ static PyObject *mx_ToUpper; static PyObject *mx_ToLower; static PyObject *mxTextTools_Error; /* mxTextTools specific error */ static PyObject *mxTextTools_TagTables; /* TagTable cache dictionary */ /* Flag telling us whether the module was initialized or not. 
*/ static int mxTextTools_Initialized = 0; /* --- forward declarations ----------------------------------------------- */ /* --- module helper ------------------------------------------------------ */ static PyObject *mxTextTools_ToUpper(void) { char tr[256]; Py_ssize_t i; for (i = 0; i < 256; i++) tr[i] = toupper((char)i); return PyString_FromStringAndSize(tr,sizeof(tr)); } static PyObject *mxTextTools_ToLower(void) { char tr[256]; Py_ssize_t i; for (i = 0; i < 256; i++) tr[i] = tolower((char)i); return PyString_FromStringAndSize(tr,sizeof(tr)); } /* Create an exception object, insert it into the module dictionary under the given name and return the object pointer; this is NULL in case an error occurred. base can be given to indicate the base object to be used by the exception object. It should be NULL otherwise */ /* --- module interface --------------------------------------------------- */ /* --- Text Search Object ----------------------------------------------*/ /* allocation */ static PyObject *mxTextSearch_New(PyObject *match, PyObject *translate, int algorithm) { mxTextSearchObject *so; so = PyObject_NEW(mxTextSearchObject, &mxTextSearch_Type); if (so == NULL) return NULL; so->data = NULL; so->translate = NULL; so->match = NULL; Py_INCREF(match); so->match = match; if (translate == Py_None) translate = NULL; else if (translate) { Py_Assert(PyString_Check(translate), PyExc_TypeError, "translate table must be a string"); Py_Assert(PyString_GET_SIZE(translate) == 256, PyExc_TypeError, "translate string must have exactly 256 chars"); Py_INCREF(translate); } so->translate = translate; /* Init algorithm */ so->algorithm = algorithm; switch (algorithm) { case MXTEXTSEARCH_BOYERMOORE: Py_Assert(PyString_Check(match), PyExc_TypeError, "match must be a string for Boyer-Moore"); so->data = bm_init(PyString_AS_STRING(match), PyString_GET_SIZE(match)); Py_Assert(so->data != NULL, PyExc_TypeError, "error initializing the search object"); break; case MXTEXTSEARCH_TRIVIAL: 
Py_Assert(PyString_Check(match) || PyUnicode_Check(match), PyExc_TypeError, "match must be a string or unicode"); Py_Assert(so->translate == NULL, PyExc_TypeError, "trivial search algorithm does not support translate"); break; default: Py_Error(PyExc_ValueError, "unknown or unsupported algorithm"); } return (PyObject *)so; onError: Py_DECREF(so); return NULL; } Py_C_Function_WithKeywords( mxTextSearch_TextSearch, "TextSearch(match[,translate=None,algorithm=default_algorithm])\n\n" "Create a substring search object for the string match;\n" "translate is an optional translate-string like the one used\n" "in the module re." ) { PyObject *match = 0; PyObject *translate = 0; int algorithm = -424242; Py_KeywordsGet3Args("O|Oi:TextSearch",match,translate,algorithm); if (algorithm == -424242) { if (PyUnicode_Check(match)) algorithm = MXTEXTSEARCH_TRIVIAL; else algorithm = MXTEXTSEARCH_BOYERMOORE; } return mxTextSearch_New(match, translate, algorithm); onError: return NULL; } static void mxTextSearch_Free(mxTextSearchObject *so) { if (so->data) { switch (so->algorithm) { case MXTEXTSEARCH_BOYERMOORE: bm_free(so->data); break; case MXTEXTSEARCH_TRIVIAL: break; } } Py_XDECREF(so->match); Py_XDECREF(so->translate); PyObject_Del(so); } /* C APIs */ #define so ((mxTextSearchObject *)self) /* Get the match length from an TextSearch object or -1 in case of an error. 
*/ Py_ssize_t mxTextSearch_MatchLength(PyObject *self) { Py_Assert(mxTextSearch_Check(self), PyExc_TypeError, "expected a TextSearch object"); switch (so->algorithm) { case MXTEXTSEARCH_BOYERMOORE: return BM_MATCH_LEN(so->data); break; case MXTEXTSEARCH_TRIVIAL: if (PyString_Check(so->match)) return PyString_GET_SIZE(so->match); #ifdef HAVE_UNICODE else if (PyUnicode_Check(so->match)) return PyUnicode_GET_SIZE(so->match); #endif break; } Py_Error(mxTextTools_Error, "internal error"); onError: return -1; } static Py_ssize_t trivial_search(const char *text, Py_ssize_t start, Py_ssize_t stop, const char *match, Py_ssize_t match_len) { Py_ssize_t ml1 = match_len - 1; register const char *tx = &text[start]; register Py_ssize_t x = start; if (ml1 < 0) return start; /* Brute-force method; from right to left */ for (;;) { register Py_ssize_t j = ml1; register const char *mj = &match[j]; if (x + j >= stop) /* reached eof: no match */ return start; /* scan from right to left */ for (tx += j; j >= 0 && *tx == *mj; tx--, mj--, j--) ; if (j < 0) { /* found */ x += ml1 + 1; return x; } /* not found: rewind and advance one char */ tx -= j - 1; x++; } return start; } #ifdef HAVE_UNICODE static Py_ssize_t trivial_unicode_search(const Py_UNICODE *text, Py_ssize_t start, Py_ssize_t stop, const Py_UNICODE *match, Py_ssize_t match_len) { Py_ssize_t ml1 = match_len - 1; register const Py_UNICODE *tx = &text[start]; register Py_ssize_t x = start; if (ml1 < 0) return start; /* Brute-force method; from right to left */ for (;;) { register Py_ssize_t j = ml1; register const Py_UNICODE *mj = &match[j]; if (x + j >= stop) /* reached eof: no match */ return start; /* scan from right to left */ for (tx += j; j >= 0 && *tx == *mj; tx--, mj--, j--) ; if (j < 0) { /* found */ x += ml1 + 1; return x; } /* not found: rewind and advance one char */ tx -= j - 1; x++; } return start; } #endif /* Search for the match in text[start:stop]. 
Returns 1 in case a match was found and sets sliceleft, sliceright to the matching slice. Returns 0 in case no match was found and -1 in case of an error. */ Py_ssize_t mxTextSearch_SearchBuffer(PyObject *self, char *text, Py_ssize_t start, Py_ssize_t stop, Py_ssize_t *sliceleft, Py_ssize_t *sliceright) { Py_ssize_t nextpos; Py_ssize_t match_len; Py_Assert(mxTextSearch_Check(self), PyExc_TypeError, "expected a TextSearch object"); switch (so->algorithm) { case MXTEXTSEARCH_BOYERMOORE: if (so->translate) { /* search with translate table */ nextpos = bm_tr_search((mxbmse_data *)so->data, text, start, stop, PyString_AS_STRING(so->translate)); } else { /* exact search */ nextpos = bm_search((mxbmse_data *)so->data, text, start, stop); } match_len = BM_MATCH_LEN(so->data); break; case MXTEXTSEARCH_TRIVIAL: { const char *match; if (PyString_Check(so->match)) { match = PyString_AS_STRING(so->match); match_len = PyString_GET_SIZE(so->match); } else if (PyObject_AsCharBuffer(so->match, &match, &match_len)) goto onError; nextpos = trivial_search(text, start, stop, match, match_len); } break; default: Py_Error(mxTextTools_Error, "unknown algorithm type in mxTextSearch_SearchBuffer"); } /* Found ? 
*/ if (nextpos != start) { if (sliceleft) *sliceleft = nextpos - match_len; if (sliceright) *sliceright = nextpos; return 1; } /* Not found */ return 0; onError: return -1; } #ifdef HAVE_UNICODE Py_ssize_t mxTextSearch_SearchUnicode(PyObject *self, Py_UNICODE *text, Py_ssize_t start, Py_ssize_t stop, Py_ssize_t *sliceleft, Py_ssize_t *sliceright) { Py_ssize_t nextpos; Py_ssize_t match_len; Py_Assert(mxTextSearch_Check(self), PyExc_TypeError, "expected a TextSearch object"); switch (so->algorithm) { case MXTEXTSEARCH_BOYERMOORE: Py_Error(PyExc_TypeError, "Boyer-Moore search algorithm does not support Unicode"); break; case MXTEXTSEARCH_TRIVIAL: { PyObject *u; Py_UNICODE *match; if (PyUnicode_Check(so->match)) { u = NULL; match = PyUnicode_AS_UNICODE(so->match); match_len = PyUnicode_GET_SIZE(so->match); } else { u = PyUnicode_FromEncodedObject(so->match, NULL, NULL); if (u == NULL) goto onError; match = PyUnicode_AS_UNICODE(u); match_len = PyUnicode_GET_SIZE(u); } nextpos = trivial_unicode_search(text, start, stop, match, match_len); Py_XDECREF(u); } break; default: Py_Error(mxTextTools_Error, "unknown algorithm type in mxTextSearch_SearchUnicode"); } /* Found ? 
*/ if (nextpos != start) { if (sliceleft) *sliceleft = nextpos - match_len; if (sliceright) *sliceright = nextpos; return 1; } /* Not found */ return 0; onError: return -1; } #endif /* methods */ Py_C_Function( mxTextSearch_search, "TextSearch.search(text,start=0,stop=len(text))\n\n" "Search for the substring in text, looking only at the\n" "slice [start:stop] and return the slice (l,r)\n" "where the substring was found, (start,start) otherwise.") { PyObject *text; Py_ssize_t start = 0; Py_ssize_t stop = INT_MAX; Py_ssize_t sliceleft, sliceright; int rc; Py_Get3Args("O|ii:TextSearch.search", text,start,stop); if (PyString_Check(text)) { Py_CheckStringSlice(text, start, stop); rc = mxTextSearch_SearchBuffer(self, PyString_AS_STRING(text), start, stop, &sliceleft, &sliceright); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { Py_CheckUnicodeSlice(text, start, stop); rc = mxTextSearch_SearchUnicode(self, PyUnicode_AS_UNICODE(text), start, stop, &sliceleft, &sliceright); } #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); if (rc < 0) goto onError; if (rc == 0) { sliceleft = start; sliceright = start; } /* Return the slice */ Py_Return2("ii", sliceleft, sliceright); onError: return NULL; } Py_C_Function( mxTextSearch_find, "TextSearch.find(text,start=0,stop=len(text))\n\n" "Search for the substring in text, looking only at the\n" "slice [start:stop] and return the index\n" "where the substring was found, -1 otherwise.") { PyObject *text; Py_ssize_t start = 0; Py_ssize_t stop = INT_MAX; Py_ssize_t sliceleft, sliceright; int rc; Py_Get3Args("O|ii:TextSearch.find", text,start,stop); if (PyString_Check(text)) { Py_CheckStringSlice(text, start, stop); rc = mxTextSearch_SearchBuffer(self, PyString_AS_STRING(text), start, stop, &sliceleft, &sliceright); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { Py_CheckUnicodeSlice(text, start, stop); rc = mxTextSearch_SearchUnicode(self, PyUnicode_AS_UNICODE(text), start, stop, &sliceleft, &sliceright); 
} #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); if (rc < 0) goto onError; if (rc == 0) sliceleft = -1; return PyInt_FromLong(sliceleft); onError: return NULL; } Py_C_Function( mxTextSearch_findall, "TextSearch.findall(text,start=0,stop=len(text))\n\n" "Search for the substring in text, looking only at the\n" "slice [start:stop] and return a list of all\n" "non overlapping slices (l,r) in text where the match\n" "string can be found.") { PyObject *text; PyObject *list = 0; Py_ssize_t start = 0; Py_ssize_t stop = INT_MAX; Py_ssize_t stop_index; Py_ssize_t match_len; Py_ssize_t listsize = INITIAL_LIST_SIZE; Py_ssize_t listitem = 0; Py_Get3Args("O|ii:TextSearch.findall", text,start,stop); if (PyString_Check(text)) { Py_CheckStringSlice(text, start, stop); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { Py_CheckUnicodeSlice(text, start, stop); } #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); list = PyList_New(listsize); if (!list) goto onError; match_len = mxTextSearch_MatchLength(self); if (match_len < 0) goto onError; stop_index = stop - match_len; while (start <= stop_index) { register PyObject *t,*v; int rc; Py_ssize_t sliceleft, sliceright; /* exact search */ if (PyString_Check(text)) rc = mxTextSearch_SearchBuffer(self, PyString_AS_STRING(text), start, stop, &sliceleft, &sliceright); #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) rc = mxTextSearch_SearchUnicode(self, PyUnicode_AS_UNICODE(text), start, stop, &sliceleft, &sliceright); #endif else break; if (rc < 0) goto onError; if (rc == 0) break; /* Build slice and append to list */ t = PyTuple_New(2); if (!t) goto onError; v = PyInt_FromLong(sliceleft); if (!v) goto onError; PyTuple_SET_ITEM(t,0,v); v = PyInt_FromLong(sliceright); if (!v) goto onError; PyTuple_SET_ITEM(t,1,v); if (listitem < listsize) PyList_SET_ITEM(list, listitem, t); else { PyList_Append(list, t); Py_DECREF(t); } listitem++; start = sliceright; } /* Resize list if necessary */ if 
(listitem < listsize) PyList_SetSlice(list, listitem, listsize, (PyObject*)NULL); return list; onError: Py_XDECREF(list); return NULL; } #ifdef COPY_PROTOCOL Py_C_Function( mxTextSearch_copy, "copy([memo])\n\n" "Return a new reference for the instance. This function\n" "is used for the copy-protocol. Real copying doesn't take\n" "place, since the instances are immutable.") { PyObject *memo; Py_GetArg("|O",memo); Py_INCREF(so); return (PyObject *)so; onError: return NULL; } #endif #undef so /* --- slots --- */ static PyObject *mxTextSearch_Repr(mxTextSearchObject *self) { char *algoname; PyObject *v; char t[500], *reprstr; v = PyObject_Repr(self->match); if (v == NULL) return NULL; reprstr = PyString_AsString(v); if (reprstr == NULL) return NULL; switch (self->algorithm) { case MXTEXTSEARCH_BOYERMOORE: algoname = "Boyer-Moore"; break; case MXTEXTSEARCH_TRIVIAL: algoname = "Trivial"; break; default: algoname = ""; } sprintf(t, "<%.50s TextSearch object for %.400s at 0x%lx>", algoname, reprstr, (long)self); Py_DECREF(v); return PyString_FromString(t); } /* Python Method Table */ static PyMethodDef mxTextSearch_Methods[] = { Py_MethodListEntry("search",mxTextSearch_search), Py_MethodListEntry("find",mxTextSearch_find), Py_MethodListEntry("findall",mxTextSearch_findall), #ifdef COPY_PROTOCOL Py_MethodListEntry("__deepcopy__",mxTextSearch_copy), Py_MethodListEntry("__copy__",mxTextSearch_copy), #endif {NULL,NULL} /* end of list */ }; static PyMemberDef mxTextSearch_members[] = { {"match",T_OBJECT_EX,offsetof(mxTextSearchObject,match),READONLY,"Text that this search matches"}, {"translate",T_OBJECT,offsetof(mxTextSearchObject,translate),READONLY,"Translated search term"}, {"algorithm",T_INT,offsetof(mxTextSearchObject,algorithm),READONLY,"Algorithm in use by the text search"}, {NULL} }; /* Python Type Table */ PyTypeObject mxTextSearch_Type = { PyVarObject_HEAD_INIT(NULL, 0) /* init at startup ! 
*/ "TextSearch", /*tp_name*/ sizeof(mxTextSearchObject), /*tp_basicsize*/ 0, /*tp_itemsize*/ /* methods */ (destructor)mxTextSearch_Free, /*tp_dealloc*/ (printfunc)0, /*tp_print*/ (getattrfunc)0, /*tp_getattr*/ (setattrfunc)0, /*tp_setattr*/ 0, /*tp_compare*/ (reprfunc)mxTextSearch_Repr, /*tp_repr*/ 0, /*tp_as_number*/ 0, /*tp_as_number*/ 0, /*tp_as_mapping*/ (hashfunc)0, /*tp_hash*/ (ternaryfunc)0, /*tp_call*/ (reprfunc)0, /*tp_str*/ (getattrofunc)0, /*tp_getattro*/ (setattrofunc)0, /*tp_setattro*/ 0, /*tp_asbuffer*/ Py_TPFLAGS_DEFAULT, /*tp_flags*/ "mxTextTools text-search object", /*tp_doc*/ 0, /*tp_traverse*/ 0, /*tp_clear*/ 0, /*tp_richcompare*/ 0, /*tp_weaklistoffset*/ 0, /*tp_iter*/ 0, /*tp_iternext*/ mxTextSearch_Methods, /*tp_methods*/ mxTextSearch_members, /*tp_members*/ }; /* --- Character Set Object --------------------------------------------*/ /* internal */ /* 8-bit character sets are implemented using a simple 32-byte long bitmap with one bit per character. Addressing is done as follows: def char_is_set(ordinal): return bitmap[ordinal >> 3] & (1 << (ordinal & 7)) */ #define STRING_CHARSET_SIZE 256 #define STRING_CHARSET_BITMAP_SIZE (STRING_CHARSET_SIZE / 8) typedef struct { unsigned char bitmap[STRING_CHARSET_BITMAP_SIZE]; /* character bitmap */ } string_charset; static int init_string_charset(mxCharSetObject *cs, PyObject *definition) { register Py_ssize_t i, j; char *def = PyString_AS_STRING(definition); const Py_ssize_t len = PyString_GET_SIZE(definition); string_charset *lookup = 0; register unsigned char *bitmap; int logic = 1; /* Handle logic change (first char is '^' for negative matching) */ if (len > 0 && def[0] == '^') { logic = 0; i = 1; } else i = 0; /* Build 32-byte lookup bitmap (one bit per character) */ lookup = (string_charset *)PyMem_Malloc(sizeof(string_charset)); if (lookup == NULL) { PyErr_NoMemory(); goto onError; } memset(lookup, 0, sizeof(string_charset)); cs->mode = MXCHARSET_8BITMODE; cs->lookup = (void *)lookup; bitmap = 
lookup->bitmap; for (; i < len; i++) { /* Handle escapes: "b\-d", "\\" */ if (def[i] == '\\') { if (i < len - 1 && def[i+1] == '\\') { j = (unsigned char)'\\'; bitmap[j >> 3] |= 1 << (j & 7); i++; } continue; } /* Handle ranges: "b-d", "\\-z", "\--z" */ if (i < len - 2 && def[i+1] == '-') { unsigned char range_left = def[i]; unsigned char range_right = def[i+2]; for (j = range_left; j <= range_right; j++) bitmap[j >> 3] |= 1 << (j & 7); i++; continue; } /* Normal processing */ j = (unsigned char)def[i]; bitmap[j >> 3] |= 1 << (j & 7); } /* Invert bitmap if negative matching is requested */ if (!logic) { DPRINTF("init_string_charset: inverting bitmap\n"); for (i = 0; i < STRING_CHARSET_BITMAP_SIZE; i++) bitmap[i] ^= 0xFF; } return 0; onError: if (lookup) PyMem_Free((void *)lookup); cs->lookup = 0; return -1; } #ifdef HAVE_UNICODE /* Unicode character sets are implemented using two step indexing which is a good compromise between lookup speed and memory usage. Lookup is done using a variable length array of 32-byte bitmap blocks. There can be 256 such blocks. Identical blocks are collapsed into a single copy. Addressing is done as follows: def char_is_set(ordinal): index = bitmapindex[ordinal >> 8] bitmap = bitmaps[index] return bitmap[(ordinal >> 3) & 31] & (1 << (ordinal & 7)) The technique used here is very similar to what is done in Python's SRE (see the BIGCHARSET patch by Martin von Loewis). Compression should be reasonably good since character sets in practice usually only contains a few single characters or longer ranges of Unicode characters. 
*/ #define UNICODE_CHARSET_SIZE 65536 #define UNICODE_CHARSET_BITMAP_SIZE 32 #define UNICODE_CHARSET_BITMAPS (UNICODE_CHARSET_SIZE / (UNICODE_CHARSET_BITMAP_SIZE * 8)) #define UNICODE_CHARSET_BIGMAP_SIZE (UNICODE_CHARSET_SIZE / 8) typedef struct { unsigned char bitmapindex[UNICODE_CHARSET_BITMAPS]; /* Index to char bitmaps */ unsigned char bitmaps[UNICODE_CHARSET_BITMAPS][UNICODE_CHARSET_BITMAP_SIZE]; /* Variable length bitmap array */ } unicode_charset; static int init_unicode_charset(mxCharSetObject *cs, PyObject *definition) { register Py_ssize_t i, j; Py_UNICODE *def = PyUnicode_AS_UNICODE(definition); const Py_ssize_t len = PyUnicode_GET_SIZE(definition); unicode_charset *lookup = 0; unsigned char bigmap[UNICODE_CHARSET_BIGMAP_SIZE]; Py_ssize_t blocks; int logic = 1; /* Handle logic change (first char is '^' for negative matching) */ if (len > 0 && def[0] == '^') { logic = 0; i = 1; } else i = 0; /* Build bigmap */ memset(bigmap, 0, sizeof(bigmap)); for (; i < len; i++) { /* Handle escapes: "b\-d", "\\" */ if (def[i] == '\\') { if (i < len - 1 && def[i+1] == '\\') { j = (int)'\\'; bigmap[j >> 3] |= 1 << (j & 7); i++; } continue; } /* Handle ranges: "b-d", "\\-z", "\--z" */ if (i < len - 2 && def[i+1] == '-') { Py_UNICODE range_left = def[i]; Py_UNICODE range_right = def[i+2]; if (range_right >= UNICODE_CHARSET_SIZE) { Py_Error(PyExc_ValueError, "unicode ordinal out of supported range"); } for (j = range_left; j <= range_right; j++) bigmap[j >> 3] |= 1 << (j & 7); i++; continue; } /* Normal processing */ j = def[i]; if (j >= UNICODE_CHARSET_SIZE) { Py_Error(PyExc_ValueError, "unicode ordinal out of supported range"); } bigmap[j >> 3] |= 1 << (j & 7); } /* Build lookup table XXX Could add dynamic resizing here... probably not worth it though, since sizeof(unicode_charset) isn't all that large. 
*/ lookup = (unicode_charset *)PyMem_Malloc(sizeof(unicode_charset)); if (lookup == NULL) { PyErr_NoMemory(); goto onError; } blocks = 0; for (i = UNICODE_CHARSET_BITMAPS - 1; i >= 0; i--) { unsigned char *block = &bigmap[i << 5]; for (j = blocks - 1; j >= 0; j--) if (memcmp(lookup->bitmaps[j], block, UNICODE_CHARSET_BITMAP_SIZE) == 0) break; if (j < 0) { j = blocks; DPRINTF("init_unicode_charset: Creating new block %i for %i\n", j, i); memcpy(lookup->bitmaps[j], block, UNICODE_CHARSET_BITMAP_SIZE); blocks++; } else DPRINTF("init_unicode_charset: Reusing block %i for %i\n", j, i); lookup->bitmapindex[i] = j; } DPRINTF("init_unicode_charset: Map size: %i block(s) = %i bytes\n", blocks, UNICODE_CHARSET_BITMAPS + blocks * UNICODE_CHARSET_BITMAP_SIZE); lookup = (unicode_charset *)PyMem_Realloc(lookup, UNICODE_CHARSET_BITMAPS + blocks * UNICODE_CHARSET_BITMAP_SIZE); if (lookup == NULL) { PyErr_NoMemory(); goto onError; } /* Invert bitmaps if negative matching is requested */ if (!logic) { register unsigned char *bitmap = &lookup->bitmaps[0][0]; DPRINTF("init_unicode_charset: inverting bitmaps\n"); for (i = 0; i < blocks * UNICODE_CHARSET_BITMAP_SIZE; i++) bitmap[i] ^= 0xFF; } cs->mode = MXCHARSET_UCS2MODE; cs->lookup = (void *)lookup; return 0; onError: if (lookup) PyMem_Free((void *)lookup); cs->lookup = 0; return -1; } #endif /* allocation */ static PyObject *mxCharSet_New(PyObject *definition) { mxCharSetObject *cs; cs = PyObject_NEW(mxCharSetObject, &mxCharSet_Type); if (cs == NULL) return NULL; Py_INCREF(definition); cs->definition = definition; cs->lookup = NULL; cs->mode = -1; if (PyString_Check(definition)) { if (init_string_charset(cs, definition)) goto onError; } #ifdef HAVE_UNICODE else if (PyUnicode_Check(definition)) { if (init_unicode_charset(cs, definition)) goto onError; } #endif else Py_Error(PyExc_TypeError, "character set definition must be string or unicode"); return (PyObject *)cs; onError: Py_DECREF(cs); return NULL; } Py_C_Function( 
mxCharSet_CharSet, "CharSet(definition)\n\n" "Create a character set matching object from the string" ) { PyObject *definition; Py_GetArg("O:CharSet", definition); return mxCharSet_New(definition); onError: return NULL; } static void mxCharSet_Free(mxCharSetObject *cs) { Py_XDECREF(cs->definition); if (cs->lookup) PyMem_Free(cs->lookup); PyObject_Del(cs); } /* C APIs */ #define cs ((mxCharSetObject *)self) int mxCharSet_ContainsChar(PyObject *self, register unsigned char ch) { if (!mxCharSet_Check(self)) { PyErr_BadInternalCall(); goto onError; } if (cs->mode == MXCHARSET_8BITMODE) { unsigned char *bitmap = ((string_charset *)cs->lookup)->bitmap; return ((bitmap[ch >> 3] & (1 << (ch & 7))) != 0); } #ifdef HAVE_UNICODE else if (cs->mode == MXCHARSET_UCS2MODE) { unicode_charset *lookup = (unicode_charset *)cs->lookup; unsigned char *bitmap = lookup->bitmaps[lookup->bitmapindex[0]]; return ((bitmap[ch >> 3] & (1 << (ch & 7))) != 0); } #endif else { Py_Error(mxTextTools_Error, "unsupported character set mode"); } onError: return -1; } #ifdef HAVE_UNICODE int mxCharSet_ContainsUnicodeChar(PyObject *self, register Py_UNICODE ch) { if (!mxCharSet_Check(self)) { PyErr_BadInternalCall(); goto onError; } if (cs->mode == MXCHARSET_8BITMODE) { unsigned char *bitmap = ((string_charset *)cs->lookup)->bitmap; if (ch >= 256) return 0; return ((bitmap[ch >> 3] & (1 << (ch & 7))) != 0); } else if (cs->mode == MXCHARSET_UCS2MODE) { unicode_charset *lookup = (unicode_charset *)cs->lookup; unsigned char *bitmap = lookup->bitmaps[lookup->bitmapindex[ch >> 8]]; return ((bitmap[(ch >> 3) & 31] & (1 << (ch & 7))) != 0); } else { Py_Error(mxTextTools_Error, "unsupported character set mode"); } onError: return -1; } #endif static int mxCharSet_Contains(PyObject *self, PyObject *other) { if (PyString_Check(other)) { Py_Assert(PyString_GET_SIZE(other) == 1, PyExc_TypeError, "expected a single character"); return mxCharSet_ContainsChar(self, PyString_AS_STRING(other)[0]); } #ifdef HAVE_UNICODE 
else if (PyUnicode_Check(other)) { Py_Assert(PyUnicode_GET_SIZE(other) == 1, PyExc_TypeError, "expected a single unicode character"); return mxCharSet_ContainsUnicodeChar(self, PyUnicode_AS_UNICODE(other)[0]); } #endif else Py_Error(PyExc_TypeError, "expected string or unicode character"); onError: return -1; } /* In mode 1, find the position of the first character in text belonging to set. This may also be stop or start-1 in case no such character is found during the search (depending on the direction). In mode 0, find the first character not in set. This may also be stop or start-1 in case no such character is found during the search (depending on the direction). The search is done in the slice start:stop. -2 is returned in case of an error. */ static int mxCharSet_FindChar(PyObject *self, unsigned char *text, Py_ssize_t start, Py_ssize_t stop, const int mode, const int direction) { register Py_ssize_t i; register unsigned int c; register unsigned int block; unsigned char *bitmap; if (!mxCharSet_Check(self)) { PyErr_BadInternalCall(); goto onError; } if (cs->mode == MXCHARSET_8BITMODE) bitmap = ((string_charset *)cs->lookup)->bitmap; #ifdef HAVE_UNICODE else if (cs->mode == MXCHARSET_UCS2MODE) { unicode_charset *lookup = (unicode_charset *)cs->lookup; bitmap = lookup->bitmaps[lookup->bitmapindex[0]]; } #endif else { Py_Error(mxTextTools_Error, "unsupported character set mode"); } if (direction > 0) { if (mode) /* Find first char in set */ for (i = start; i < stop; i++) { c = text[i]; block = bitmap[c >> 3]; if (block && ((block & (1 << (c & 7))) != 0)) break; } else /* Find first char not in set */ for (i = start; i < stop; i++) { c = text[i]; block = bitmap[c >> 3]; if (!block || ((block & (1 << (c & 7))) == 0)) break; } } else { if (mode) /* Find first char in set, searching from the end */ for (i = stop - 1; i >= start; i--) { c = text[i]; block = bitmap[c >> 3]; if (block && ((block & (1 << (c & 7))) != 0)) break; } else /* Find first char not in set, 
searching from the end */ for (i = stop - 1; i >= start; i--) { c = text[i]; block = bitmap[c >> 3]; if (!block || ((block & (1 << (c & 7))) == 0)) break; } } return i; onError: return -2; } #ifdef HAVE_UNICODE static int mxCharSet_FindUnicodeChar(PyObject *self, Py_UNICODE *text, Py_ssize_t start, Py_ssize_t stop, const int mode, const int direction) { register int i; register unsigned int c; register unsigned int block; unsigned char *bitmap; if (!mxCharSet_Check(self)) { PyErr_BadInternalCall(); goto onError; } if (cs->mode == MXCHARSET_8BITMODE) { bitmap = ((string_charset *)cs->lookup)->bitmap; if (direction > 0) { if (mode) /* Find first char in set */ for (i = start; i < stop; i++) { c = text[i]; if (c > 256) continue; block = bitmap[c >> 3]; if (block && ((block & (1 << (c & 7))) != 0)) break; } else /* Find first char not in set */ for (i = start; i < stop; i++) { c = text[i]; if (c > 256) break; block = bitmap[c >> 3]; if (!block || ((block & (1 << (c & 7))) == 0)) break; } } else { if (mode) /* Find first char in set, searching from the end */ for (i = stop - 1; i >= start; i--) { c = text[i]; if (c > 256) continue; block = bitmap[c >> 3]; if (block && ((block & (1 << (c & 7))) != 0)) break; } else /* Find first char not in set, searching from the end */ for (i = stop - 1; i >= start; i--) { c = text[i]; if (c > 256) break; block = bitmap[c >> 3]; if (!block || ((block & (1 << (c & 7))) == 0)) break; } } return i; } #ifdef HAVE_UNICODE else if (cs->mode == MXCHARSET_UCS2MODE) { unicode_charset *lookup = (unicode_charset *)cs->lookup; if (direction > 0) { if (mode) /* Find first char in set */ for (i = start; i < stop; i++) { c = text[i]; bitmap = lookup->bitmaps[lookup->bitmapindex[c >> 8]]; block = bitmap[(c >> 3) & 31]; if (block && ((block & (1 << (c & 7))) != 0)) break; } else /* Find first char not in set */ for (i = start; i < stop; i++) { c = text[i]; bitmap = lookup->bitmaps[lookup->bitmapindex[c >> 8]]; block = bitmap[(c >> 3) & 31]; if (!block 
|| ((block & (1 << (c & 7))) == 0)) break; } } else { if (mode) /* Find first char in set, searching from the end */ for (i = stop - 1; i >= start; i--) { c = text[i]; bitmap = lookup->bitmaps[lookup->bitmapindex[c >> 8]]; block = bitmap[(c >> 3) & 31]; if (block && ((block & (1 << (c & 7))) != 0)) break; } else /* Find first char not in set, searching from the end */ for (i = stop - 1; i >= start; i--) { c = text[i]; bitmap = lookup->bitmaps[lookup->bitmapindex[c >> 8]]; block = bitmap[(c >> 3) & 31]; if (!block || ((block & (1 << (c & 7))) == 0)) break; } } return i; } #endif else { Py_Error(mxTextTools_Error, "unsupported character set mode"); } onError: return -2; } #endif /* Return the position of the first character in text[start:stop] occurring in set or -1 in case no such character exists. */ static int mxCharSet_Search(PyObject *self, PyObject *text, Py_ssize_t start, Py_ssize_t stop, int direction) { Py_ssize_t position; if (PyString_Check(text)) { Py_CheckStringSlice(text, start, stop); position = mxCharSet_FindChar(self, (unsigned char *)PyString_AS_STRING(text), start, stop, 1, direction); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { Py_CheckUnicodeSlice(text, start, stop); position = mxCharSet_FindUnicodeChar(self, PyUnicode_AS_UNICODE(text), start, stop, 1, direction); } #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); if ((direction > 0 && position >= stop) || (direction <= 0 && position < start)) position = -1; return position; onError: return -2; } /* Return the longest match of characters from set in text[start:stop]. If direction is positive, the search is done from the left (longest prefix), otherwise it is started from the right (longest suffix). -1 is returned in case of an error. 
*/ Py_ssize_t mxCharSet_Match(PyObject *self, PyObject *text, Py_ssize_t start, Py_ssize_t stop, int direction) { Py_ssize_t position; if (PyString_Check(text)) { Py_CheckStringSlice(text, start, stop); position = mxCharSet_FindChar(self, (unsigned char *)PyString_AS_STRING(text), start, stop, 0, direction); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { Py_CheckUnicodeSlice(text, start, stop); position = mxCharSet_FindUnicodeChar(self, PyUnicode_AS_UNICODE(text), start, stop, 0, direction); } #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); if (position < -1) goto onError; if (direction > 0) return position - start; else return stop-1 - position; onError: return -1; } /* Stips off characters appearing in the character set from text[start:stop] and returns the result as Python string object. where indicates the mode: where < 0: strip left only where = 0: strip left and right where > 0: strip right only */ static PyObject *mxCharSet_Strip(PyObject *self, PyObject *text, Py_ssize_t start, Py_ssize_t stop, Py_ssize_t where) { Py_ssize_t left,right; if (!mxCharSet_Check(self)) { PyErr_BadInternalCall(); goto onError; } if (PyString_Check(text)) { Py_CheckStringSlice(text, start, stop); /* Strip left */ if (where <= 0) { left = mxCharSet_FindChar(self, (unsigned char *)PyString_AS_STRING(text), start, stop, 0, 1); if (left < 0) goto onError; } else left = start; /* Strip right */ if (where >= 0) { right = mxCharSet_FindChar(self, (unsigned char *)PyString_AS_STRING(text), left, stop, 0, -1) + 1; if (right < 0) goto onError; } else right = stop; return PyString_FromStringAndSize(PyString_AS_STRING(text) + left, max(right - left, 0)); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { Py_CheckUnicodeSlice(text, start, stop); /* Strip left */ if (where <= 0) { left = mxCharSet_FindUnicodeChar(self, PyUnicode_AS_UNICODE(text), start, stop, 0, 1); if (left < 0) goto onError; } else left = start; /* Strip right */ if (where >= 0) { right = 
mxCharSet_FindUnicodeChar(self, PyUnicode_AS_UNICODE(text), start, stop, 0, -1) + 1; if (right < 0) goto onError; } else right = stop; return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(text) + left, max(right - left, 0)); } #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); onError: return NULL; } static PyObject *mxCharSet_Split(PyObject *self, PyObject *text, Py_ssize_t start, Py_ssize_t text_len, int include_splits) { PyObject *list = NULL; PyObject *s; register Py_ssize_t x; Py_ssize_t listitem = 0; Py_ssize_t listsize = INITIAL_LIST_SIZE; if (!mxCharSet_Check(self)) { PyErr_BadInternalCall(); goto onError; } list = PyList_New(listsize); if (!list) goto onError; if (PyString_Check(text)) { unsigned char *tx = (unsigned char *)PyString_AS_STRING(text); Py_CheckStringSlice(text, start, text_len); x = start; while (x < text_len) { Py_ssize_t z; /* Skip all text in set (include_splits == 0), not in set (include_splits == 1) */ z = x; x = mxCharSet_FindChar(self, tx, x, text_len, include_splits, 1); /* Append the slice to list */ if (include_splits) { s = PyString_FromStringAndSize((char *)&tx[z], x - z); if (!s) goto onError; if (listitem < listsize) PyList_SET_ITEM(list,listitem,s); else { PyList_Append(list,s); Py_DECREF(s); } listitem++; if (x >= text_len) break; } /* Skip all text in set (include_splits == 1), not in set (include_splits == 0) */ z = x; x = mxCharSet_FindChar(self, tx, x, text_len, !include_splits, 1); /* Append the slice to list if it is not empty */ if (x > z) { s = PyString_FromStringAndSize((char *)&tx[z], x - z); if (!s) goto onError; if (listitem < listsize) PyList_SET_ITEM(list,listitem,s); else { PyList_Append(list,s); Py_DECREF(s); } listitem++; } } } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { Py_UNICODE *tx = PyUnicode_AS_UNICODE(text); Py_CheckUnicodeSlice(text, start, text_len); x = start; while (x < text_len) { Py_ssize_t z; /* Skip all text in set (include_splits == 0), not in set (include_splits == 1) 
*/ z = x; x = mxCharSet_FindUnicodeChar(self, tx, x, text_len, include_splits, 1); /* Append the slice to list */ if (include_splits) { s = PyUnicode_FromUnicode(&tx[z], x - z); if (!s) goto onError; if (listitem < listsize) PyList_SET_ITEM(list,listitem,s); else { PyList_Append(list,s); Py_DECREF(s); } listitem++; if (x >= text_len) break; } /* Skip all text in set (include_splits == 1), not in set (include_splits == 0) */ z = x; x = mxCharSet_FindUnicodeChar(self, tx, x, text_len, !include_splits, 1); /* Append the slice to list if it is not empty */ if (x > z) { s = PyUnicode_FromUnicode(&tx[z], x - z); if (!s) goto onError; if (listitem < listsize) PyList_SET_ITEM(list,listitem,s); else { PyList_Append(list,s); Py_DECREF(s); } listitem++; } } } #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); /* Resize list if necessary */ if (listitem < listsize) PyList_SetSlice(list, listitem, listsize, (PyObject*)NULL); return list; onError: Py_XDECREF(list); return NULL; } /* methods */ Py_C_Function( mxCharSet_contains, ".contains(char)\n\n" ) { PyObject *chr; int rc; Py_GetArg("O:CharSet.contains", chr); rc = mxCharSet_Contains(self, chr); if (rc < 0) goto onError; return PyInt_FromLong(rc); onError: return NULL; } Py_C_Function( mxCharSet_search, ".search(text[, direction=1, start=0, stop=len(text)])\n\n" ) { PyObject *text; int direction = 1; Py_ssize_t start = 0, stop = INT_MAX; int rc; Py_Get4Args("O|iii:CharSet.search", text, direction, start, stop); rc = mxCharSet_Search(self, text, start, stop, direction); if (rc == -1) Py_ReturnNone(); if (rc < -1) goto onError; return PyInt_FromLong(rc); onError: return NULL; } Py_C_Function( mxCharSet_match, ".match(text[, direction=1, start=0, stop=len(text)])\n\n" ) { PyObject *text; int direction = 1; Py_ssize_t start = 0, stop = INT_MAX; int rc; Py_Get4Args("O|iii:CharSet.match", text, direction, start, stop); rc = mxCharSet_Match(self, text, start, stop, direction); if (rc < 0) goto onError; return 
PyInt_FromLong(rc); onError: return NULL; } Py_C_Function( mxCharSet_split, ".split(text[, start=0, stop=len(text)])\n\n" ) { PyObject *text; Py_ssize_t start = 0, stop = INT_MAX; Py_Get3Args("O|ii:CharSet.split", text, start, stop); return mxCharSet_Split(self, text, start, stop, 0); onError: return NULL; } Py_C_Function( mxCharSet_splitx, ".splitx(text[, start=0, stop=len(text)])\n\n" ) { PyObject *text; Py_ssize_t start = 0, stop = INT_MAX; Py_Get3Args("O|ii:CharSet.splitx", text, start, stop); return mxCharSet_Split(self, text, start, stop, 1); onError: return NULL; } Py_C_Function( mxCharSet_strip, ".strip(text[, where=0, start=0, stop=len(text)])\n\n" ) { PyObject *text; Py_ssize_t where = 0; Py_ssize_t start = 0, stop = INT_MAX; Py_Get4Args("O|iii:CharSet.strip", text, where, start, stop); return mxCharSet_Strip(self, text, start, stop, where); onError: return NULL; } #ifdef COPY_PROTOCOL Py_C_Function( mxCharSet_copy, "copy([memo])\n\n" "Return a new reference for the instance. This function\n" "is used for the copy-protocol. 
Real copying doesn't take\n" "place, since the instances are immutable.") { PyObject *memo; Py_GetArg("|O",memo); Py_INCREF(cs); return (PyObject *)cs; onError: return NULL; } #endif #undef cs /* --- slots --- */ static PyObject *mxCharSet_Repr(mxCharSetObject *self) { PyObject *v; char t[500], *reprstr; v = PyObject_Repr(self->definition); if (v == NULL) return NULL; reprstr = PyString_AsString(v); if (reprstr == NULL) return NULL; sprintf(t, "", reprstr, (long)self); Py_DECREF(v); return PyString_FromString(t); } /* Python Type Tables */ static PySequenceMethods mxCharSet_TypeAsSequence = { (lenfunc)0, /*sq_length*/ (binaryfunc)0, /*sq_concat*/ (ssizeargfunc)0, /*sq_repeat*/ (ssizeargfunc)0, /*sq_item*/ (ssizessizeargfunc)0, /*sq_slice*/ (ssizeobjargproc)0, /*sq_ass_item*/ (ssizessizeobjargproc)0, /*sq_ass_slice*/ (objobjproc)mxCharSet_Contains, /*sq_contains*/ }; static PyMemberDef mxCharSet_Members[] = { {"definition",T_OBJECT_EX,offsetof(mxCharSetObject,definition),READONLY,"Definition"}, {NULL} }; static PyMethodDef mxCharSet_Methods[] = { Py_MethodListEntry("contains",mxCharSet_contains), Py_MethodListEntry("search",mxCharSet_search), Py_MethodListEntry("match",mxCharSet_match), Py_MethodListEntry("strip",mxCharSet_strip), Py_MethodListEntry("split",mxCharSet_split), Py_MethodListEntry("splitx",mxCharSet_splitx), #ifdef COPY_PROTOCOL Py_MethodListEntry("__deepcopy__",mxCharSet_copy), Py_MethodListEntry("__copy__",mxCharSet_copy), #endif {NULL,NULL} /* end of list */ }; PyTypeObject mxCharSet_Type = { PyVarObject_HEAD_INIT(NULL, 0) /* init at startup ! 
*/ "Character Set", /* tp_name */ sizeof(mxCharSetObject), /* tp_basicsize */ 0, /* tp_itemsize */ /* methods */ (destructor)mxCharSet_Free, /* tp_dealloc */ (printfunc)0, /* tp_print */ (getattrfunc)0, /* tp_getattr */ (setattrfunc)0, /* tp_setattr */ 0, /* tp_compare */ (reprfunc)mxCharSet_Repr, /* tp_repr */ 0, /* tp_as_number */ &mxCharSet_TypeAsSequence, /* tp_as_sequence */ 0, /* tp_as_mapping */ (hashfunc)0, /* tp_hash */ (ternaryfunc)0, /* tp_call */ (reprfunc)0, /* tp_str */ (getattrofunc)0, /* tp_getattro */ (setattrofunc)0, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT, /* tp_flags */ (char*) 0, /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iternext */ mxCharSet_Methods, /* tp_methods */ mxCharSet_Members, /* tp_members */ }; /* --- Tag Table Object ------------------------------------------------*/ PyObject *mxTagTable_New(PyObject *definition, int tabletype, int cacheable); /* internal APIs */ static PyObject *tc_get_item(register PyObject *obj, register Py_ssize_t i) { if (PyTuple_Check(obj)) { if (i > PyTuple_GET_SIZE(obj)) return NULL; return PyTuple_GET_ITEM(obj, i); } else if (PyList_Check(obj)) { if (i > PyList_GET_SIZE(obj)) return NULL; return PyList_GET_ITEM(obj, i); } else return NULL; } static Py_ssize_t tc_length(register PyObject *obj) { if (obj == NULL) return -1; else if (PyTuple_Check(obj)) return PyTuple_GET_SIZE(obj); else if (PyList_Check(obj)) return PyList_GET_SIZE(obj); else return -1; } /* Add a jump target to the jump dictionary */ static Py_ssize_t tc_add_jumptarget(PyObject *jumpdict, PyObject *targetname, Py_ssize_t index) { PyObject *v; v = PyDict_GetItem(jumpdict, targetname); if (v != NULL) Py_ErrorWithArg(PyExc_TypeError, "tag table entry %d: " "jump target already defined", (unsigned int) index); v = PyInt_FromLong(index); if (v == NULL) goto onError; if (PyDict_SetItem(jumpdict, targetname, v)) goto onError; Py_DECREF(v); 
return 0; onError: return -1; } /* Convert a string command argument to either an 8-bit string or Unicode depending on the tabletype. */ static PyObject *tc_convert_string_arg(PyObject *arg, Py_ssize_t tableposition, int tabletype) { /* Convert to strings */ if (tabletype == MXTAGTABLE_STRINGTYPE) { if (PyString_Check(arg)) return arg; #ifdef HAVE_UNICODE else if (PyUnicode_Check(arg)) { Py_DECREF(arg); arg = PyUnicode_AsEncodedString(arg, NULL, NULL); if (arg == NULL) Py_ErrorWithArg(PyExc_TypeError, "tag table entry %d: " "conversion from Unicode to " "string failed", (unsigned int)tableposition); } #endif else Py_ErrorWithArg(PyExc_TypeError, "tag table entry %d: " "command argument must be a " "string or unicode", (unsigned int)tableposition); } #ifdef HAVE_UNICODE /* Convert to Unicode */ else if (tabletype == MXTAGTABLE_UNICODETYPE) { if (PyUnicode_Check(arg)) return arg; else if (PyString_Check(arg)) { Py_DECREF(arg); arg = PyUnicode_Decode(PyString_AS_STRING(arg), PyString_GET_SIZE(arg), NULL, NULL); if (arg == NULL) Py_ErrorWithArg(PyExc_TypeError, "tag table entry %d: " "conversion from string to " "Unicode failed", (unsigned int)tableposition); } else Py_ErrorWithArg(PyExc_TypeError, "tag table entry %d: " "command argument must be a " "string or unicode", (unsigned int)tableposition); } #endif else Py_Error(mxTextTools_Error, "unsupported table type"); return arg; onError: return NULL; } /* Cleanup any references in the tag table. 
*/ static int tc_cleanup(mxTagTableObject *tagtable) { Py_ssize_t i; for (i = 0; i < tagtable->numentries; i++) { mxTagTableEntry *tagtableentry = &tagtable->entry[i]; Py_XDECREF(tagtableentry->tagobj); tagtableentry->tagobj = NULL; Py_XDECREF(tagtableentry->args); tagtableentry->args = NULL; } return 0; } /* Initialize the tag table (this is the actual Tag Table compiler) */ static int init_tag_table(mxTagTableObject *tagtable, PyObject *table, Py_ssize_t size, int tabletype, int cacheable) { Py_ssize_t i; PyObject *entry; Py_ssize_t entry_len; PyObject *tagobj, *command, *args = 0, *je, *jne; PyObject *jumpdict, *v; int secondpass, own_args = 0; jumpdict = PyDict_New(); if (jumpdict == NULL) return -1; /* Reset to all fields to 0 */ memset(&tagtable->entry[0], 0, size * sizeof(mxTagTableEntry)); /* First pass */ secondpass = 0; tagtable->numentries = size; for (i = 0; i < size; i++) { mxTagTableEntry *tagtableentry = &tagtable->entry[i]; /* Get table entry i and parse it */ entry = tc_get_item(table, i); if (entry == NULL) { Py_ErrorWithArg(PyExc_TypeError, "tag table entry %d: " "not found or not a supported entry type", (unsigned int)i); } /* Special handling for jump marks (args is set to the jump mark string, jump target index is the next table entry) */ if (PyString_Check(entry)) { if (tc_add_jumptarget(jumpdict, entry, i + 1)) goto onError; tagtableentry->tagobj = NULL; tagtableentry->cmd = MATCH_JUMPTARGET; tagtableentry->flags = 0; Py_INCREF(entry); tagtableentry->args = entry; tagtableentry->jne = 0; tagtableentry->je = 1; continue; } /* Get entry length */ entry_len = tc_length(entry); if (entry_len < 3) { Py_ErrorWithArg(PyExc_TypeError, "tag table entry %d: " "expected an entry of the form " "(tagobj,command,arg[,jne[,je]])", (unsigned int)i); } /* Decode entry parts: (tagobj, command, args[, jne[, je]]) */ tagobj = tc_get_item(entry, 0); command = tc_get_item(entry, 1); args = tc_get_item(entry, 2); if (entry_len >= 4) jne = tc_get_item(entry, 3); 
else jne = NULL; if (entry_len >= 5) je = tc_get_item(entry, 4); else je = NULL; if (tagobj == NULL || command == NULL || args == NULL || (entry_len >= 4 && jne == NULL) || (entry_len >= 5 && je == NULL)) { Py_ErrorWithArg(PyExc_TypeError, "tag table entry %d: " "expected an entry of the form " "(tagobj,command,arg[,jne[,je]])",(unsigned int) i); } /* Store tagobj, None gets converted to NULL */ if (tagobj != Py_None) Py_INCREF(tagobj); else tagobj = NULL; tagtableentry->tagobj = tagobj; /* Decode command and flags */ Py_AssertWithArg(PyInt_Check(command), PyExc_TypeError, "tag table entry %d: " "command must be an integer",(unsigned int)i); tagtableentry->cmd = PyInt_AS_LONG(command) & 0xFF; tagtableentry->flags = PyInt_AS_LONG(command) - tagtableentry->cmd; /* Check command arguments */ Py_INCREF(args); own_args = 1; switch (tagtableentry->cmd) { case MATCH_JUMP: /* == MATCH_FAIL */ case MATCH_EOF: case MATCH_LOOP: /* args is ignored */ break; case MATCH_SKIP: case MATCH_MOVE: case MATCH_LOOPCONTROL: Py_AssertWithArg(PyInt_Check(args), PyExc_TypeError, "tag table entry %d: " "Skip|Move|LoopControl command argument " "must be an integer", (unsigned int)i); break; case MATCH_JUMPTARGET: Py_AssertWithArg(PyString_Check(args), PyExc_TypeError, "tag table entry %d: " "JumpMark command argument must be a string",(unsigned int)i); if (tc_add_jumptarget(jumpdict, args, i + 1)) goto onError; break; case MATCH_ALLIN: case MATCH_ALLNOTIN: case MATCH_IS: case MATCH_ISIN: case MATCH_ISNOTIN: case MATCH_WORD: case MATCH_WORDSTART: case MATCH_WORDEND: args = tc_convert_string_arg(args, i, tabletype); if (args == NULL) goto onError; break; case MATCH_ALLINSET: case MATCH_ISINSET: Py_AssertWithArg(PyString_Check(args) && PyString_GET_SIZE(args) == 32, PyExc_TypeError, "tag table entry %d: " "AllInSet|IsInSet command argument must " "be a set() string",(unsigned int)i); break; case MATCH_ALLINCHARSET: case MATCH_ISINCHARSET: Py_AssertWithArg(mxCharSet_Check(args), PyExc_TypeError, 
"tag table entry %d: " "AllInCharSet|IsInCharSet command argument must " "be a CharSet instance",(unsigned int)i); break; case MATCH_SWORDSTART: /* == MATCH_NOWORD */ case MATCH_SWORDEND: case MATCH_SFINDWORD: Py_AssertWithArg(mxTextSearch_Check(args), PyExc_TypeError, "tag table entry %d: " "sWordStart|sWordEnd|sFindWord command " "argument must be a TextSearch search " "object",(unsigned int)i); break; case MATCH_TABLE: case MATCH_SUBTABLE: Py_AssertWithArg(mxTagTable_Check(args) || PyTuple_Check(args) || PyList_Check(args) || (PyInt_Check(args) && PyInt_AS_LONG(args) == MATCH_THISTABLE), PyExc_TypeError, "tag table entry %d: " "Table|SubTable command argument " "must be a tag table tuple/object or " "ThisTable", (unsigned int)i); /* XXX We shouldn't recursively compile tag table tuples here because this will slow down the compile process too much and it's not clear whether this particular table will ever be used during tagging. */ if (!mxTagTable_Check(args) && !PyInt_Check(args)) { Py_DECREF(args); args = mxTagTable_New(args, tabletype, cacheable); if (args == NULL) goto onError; } break; case MATCH_TABLEINLIST: case MATCH_SUBTABLEINLIST: Py_AssertWithArg(PyTuple_Check(args) && PyTuple_GET_SIZE(args) == 2 && PyList_Check(PyTuple_GET_ITEM(args, 0)) && PyInt_Check(PyTuple_GET_ITEM(args, 1)), PyExc_TypeError, "tag table entry %d: " "TableInList|SubTableInList command argument " "must be a 2-tuple (list, integer)", (unsigned int)i); break; case MATCH_CALL: Py_AssertWithArg(PyCallable_Check(args), PyExc_TypeError, "tag table entry %d: " "Call command argument " "must be a callable object", (unsigned int)i); break; case MATCH_CALLARG: Py_AssertWithArg(PyTuple_Check(args) && PyTuple_GET_SIZE(args) > 0 && PyCallable_Check(PyTuple_GET_ITEM(args, 0)), PyExc_TypeError, "tag table entry %d: " "CallArg command argument " "must be a tuple (fct,[arg0,arg1,...])", (unsigned int)i); break; default: Py_ErrorWith2Args(PyExc_TypeError, "tag table entry %d: " "unknown command 
integer: %i", (unsigned int)i, tagtableentry->cmd); } /* Store command args */ tagtableentry->args = args; own_args = 0; /* Decode jump offsets */ if (jne) { if (PyInt_Check(jne)) tagtableentry->jne = PyInt_AS_LONG(jne); else if (PyString_Check(jne)) { /* Mark for back-patching */ tagtableentry->jne = -424242; secondpass = 1; } else Py_ErrorWithArg(PyExc_TypeError, "tag table entry %d: " "jne must be an integer or string", (unsigned int)i); } else tagtableentry->jne = 0; if (je) { if (PyInt_Check(je)) tagtableentry->je = PyInt_AS_LONG(je); else if (PyString_Check(je)) { /* Mark for back-patching */ tagtableentry->je = -424242; secondpass = 1; } else Py_ErrorWithArg(PyExc_TypeError, "tag table entry %d: " "je must be an integer or string", (unsigned int)i); } else tagtableentry->je = 1; } /* Second pass (needed to patch string jump targets) */ if (secondpass) for (i = 0; i < size; i++) { mxTagTableEntry *tagtableentry = &tagtable->entry[i]; if (tagtableentry->je != -424242 && tagtableentry->jne != -424242) continue; /* Entry (most probably) needs back-patching */ entry = tc_get_item(table, i); if (entry == NULL) { Py_ErrorWithArg(PyExc_TypeError, "tag table entry %d: " "unexpected error (not found)", (unsigned int)i); } /* Get entry length */ entry_len = tc_length(entry); if (entry_len < 0) { Py_ErrorWithArg(PyExc_TypeError, "tag table entry %d: " "unexpected error (no length)", (unsigned int)i); } /* Decode jump offsets */ if (entry_len >= 4) jne = tc_get_item(entry, 3); else jne = NULL; if (entry_len >= 5) je = tc_get_item(entry, 4); else je = NULL; /* Patch jump offsets */ if (jne && PyString_Check(jne)) { v = PyDict_GetItem(jumpdict, jne); if (v == NULL || !PyInt_Check(v)) Py_ErrorWith2Args(PyExc_TypeError, "tag table entry %d: " "jne jump target '%s' not found", (unsigned int)i, PyString_AS_STRING(jne)); tagtableentry->jne = PyInt_AS_LONG(v) - i; } if (je && PyString_Check(je)) { v = PyDict_GetItem(jumpdict, je); if (v == NULL || !PyInt_Check(v)) 
Py_ErrorWith2Args(PyExc_TypeError, "tag table entry %d: " "je jump target '%s' not found", (unsigned int)i, PyString_AS_STRING(je)); tagtableentry->je = PyInt_AS_LONG(v) - i; } } Py_DECREF(jumpdict); return 0; onError: if (own_args) { Py_DECREF(args); } return -1; } /* Check the cache for an already compiled TagTable for this definition. Return NULL in case of an error, Py_None without INCREF in case no such table was found or the TagTable object. */ static PyObject *consult_tagtable_cache(PyObject *definition, int tabletype, int cacheable) { PyObject *v, *key, *tt; if (!PyTuple_Check(definition) || !cacheable) return Py_None; key = PyTuple_New(2); if (key == NULL) goto onError; v = PyInt_FromLong((long) definition); if (v == NULL) goto onError; PyTuple_SET_ITEM(key, 0, v); v = PyInt_FromLong(tabletype); if (v == NULL) goto onError; PyTuple_SET_ITEM(key, 1, v); tt = PyDict_GetItem(mxTextTools_TagTables, key); Py_DECREF(key); if (tt != NULL) { Py_INCREF(tt); return tt; } return Py_None; onError: return NULL; } /* Adds the compiled tagtable to the cache. Returns -1 in case of an error, 0 on success. 
*/

static int add_to_tagtable_cache(PyObject *definition,
                                 int tabletype,
                                 int cacheable,
                                 PyObject *tagtable)
{
    PyObject *v, *key = NULL;
    int rc;

    /* Only id()-keyed tuple definitions are cached */
    if (!PyTuple_Check(definition) || !cacheable)
        return 0;

    /* Cache key is (id(definition), tabletype) */
    key = PyTuple_New(2);
    if (key == NULL)
        goto onError;
    v = PyInt_FromLong((long) definition);
    if (v == NULL)
        goto onError;
    PyTuple_SET_ITEM(key, 0, v);
    v = PyInt_FromLong(tabletype);
    if (v == NULL)
        goto onError;
    PyTuple_SET_ITEM(key, 1, v);

    /* Hard-limit the cache size */
    if (PyDict_Size(mxTextTools_TagTables) >= MAX_TAGTABLES_CACHE_SIZE)
        PyDict_Clear(mxTextTools_TagTables);

    rc = PyDict_SetItem(mxTextTools_TagTables, key, tagtable);
    Py_DECREF(key);
    key = NULL;                 /* so onError doesn't double-release it */
    if (rc)
        goto onError;
    return 0;

 onError:
    /* FIX: key used to leak when creating either key member failed. */
    Py_XDECREF(key);
    return -1;
}

/* allocation */

PyObject *mxTagTable_New(PyObject *definition,
                         int tabletype,
                         int cacheable)
{
    mxTagTableObject *tagtable = 0;
    PyObject *v;
    Py_ssize_t size;

    /* First, consult the TagTable cache */
    v = consult_tagtable_cache(definition, tabletype, cacheable);
    if (v == NULL)
        goto onError;
    else if (v != Py_None)
        return v;

    size = tc_length(definition);
    if (size < 0)
        Py_Error(PyExc_TypeError,
                 "tag table definition must be a tuple or a list");

    tagtable = PyObject_NEW_VAR(mxTagTableObject, &mxTagTable_Type, size);
    if (tagtable == NULL)
        goto onError;
    if (cacheable) {
        Py_INCREF(definition);
        tagtable->definition = definition;
    }
    else
        tagtable->definition = NULL;
    tagtable->tabletype = tabletype;

    /* Compile table ...
*/ if (init_tag_table(tagtable, definition, size, tabletype, cacheable)) goto onError; /* Cache the compiled table if it is cacheable and derived from a tuple */ if (add_to_tagtable_cache(definition, tabletype, cacheable, (PyObject *)tagtable)) goto onError; return (PyObject *)tagtable; onError: Py_XDECREF(tagtable); return NULL; } Py_C_Function( mxTagTable_TagTable, "TagTable(definition[,cachable=1])\n\n" ) { PyObject *definition; int cacheable = 1; Py_Get2Args("O|i:TagTable", definition, cacheable); return mxTagTable_New(definition, 0, cacheable); onError: return NULL; } #ifdef HAVE_UNICODE Py_C_Function( mxTagTable_UnicodeTagTable, "TagTable(definition[,cachable=1])\n\n" ) { PyObject *definition; int cacheable = 1; Py_Get2Args("O|i:UnicodeTagTable", definition, cacheable); return mxTagTable_New(definition, 1, cacheable); onError: return NULL; } #endif static void mxTagTable_Free(mxTagTableObject *tagtable) { tc_cleanup(tagtable); Py_XDECREF(tagtable->definition); PyObject_Del(tagtable); } /* C APIs */ #define tagtable ((mxTagTableObject *)self) static PyObject *mxTagTable_CompiledDefinition(PyObject *self) { PyObject *tuple = 0, *v, *w; Py_ssize_t i; Py_ssize_t size; if (!mxTagTable_Check(self)) { PyErr_BadInternalCall(); goto onError; } size = tagtable->numentries; tuple = PyTuple_New(size); if (tuple == NULL) goto onError; for (i = 0; i < size; i++) { mxTagTableEntry *tagtableentry = &tagtable->entry[i]; /* Build tuple (tagobj, command, args, jne, je) */ v = PyTuple_New(5); if (v == NULL) goto onError; w = tagtableentry->tagobj; if (w == NULL) w = Py_None; Py_INCREF(w); PyTuple_SET_ITEM(v, 0, w); PyTuple_SET_ITEM(v, 1, PyInt_FromLong(tagtableentry->cmd | tagtableentry->flags)); w = tagtableentry->args; if (w == NULL) w = Py_None; Py_INCREF(w); PyTuple_SET_ITEM(v, 2, w); PyTuple_SET_ITEM(v, 3, PyInt_FromLong(tagtableentry->jne)); PyTuple_SET_ITEM(v, 4, PyInt_FromLong(tagtableentry->je)); if (PyErr_Occurred()) { Py_DECREF(v); goto onError; } 
PyTuple_SET_ITEM(tuple, i, v); } return tuple; onError: Py_XDECREF(tuple); return NULL; } /* methods */ Py_C_Function( mxTagTable_compiled, ".compiled()\n\n" ) { Py_NoArgsCheck(); return mxTagTable_CompiledDefinition(self); onError: return NULL; } #ifdef COPY_PROTOCOL Py_C_Function( mxTagTable_copy, "copy([memo])\n\n" "Return a new reference for the instance. This function\n" "is used for the copy-protocol. Real copying doesn't take\n" "place, since the instances are immutable.") { PyObject *memo; Py_GetArg("|O",memo); Py_INCREF(tagtable); return (PyObject *)tagtable; onError: return NULL; } #endif #undef tagtable /* --- slots --- */ static PyObject *mxTagTable_Repr(mxTagTableObject *self) { char t[100]; if (self->tabletype == MXTAGTABLE_STRINGTYPE) sprintf(t,"", (long)self); else if (self->tabletype == MXTAGTABLE_UNICODETYPE) sprintf(t,"", (long)self); else sprintf(t,"", (long)self); return PyString_FromString(t); } static PyMethodDef mxTagTable_Methods[] = { Py_MethodListEntryNoArgs("compiled",mxTagTable_compiled), #ifdef COPY_PROTOCOL Py_MethodListEntry("__deepcopy__",mxTagTable_copy), Py_MethodListEntry("__copy__",mxTagTable_copy), #endif {NULL,NULL} /* end of list */ }; static PyMemberDef mxTagTable_Members[] = { {"definition",T_OBJECT_EX,offsetof(mxTagTableObject,definition),READONLY,"Definition"}, {NULL} }; /* Python Type Tables */ PyTypeObject mxTagTable_Type = { PyVarObject_HEAD_INIT(NULL, 0) /* init at startup ! 
*/ "Tag Table", /* tp_name */ sizeof(mxTagTableObject), /* tp_basicsize */ sizeof(mxTagTableEntry), /* tp_itemsize */ /* methods */ (destructor)mxTagTable_Free, /* tp_dealloc */ (printfunc)0, /* tp_print */ (getattrfunc)0, /* tp_getattr */ (setattrfunc)0, /* tp_setattr */ 0, /* tp_compare */ (reprfunc)mxTagTable_Repr, /* tp_repr */ 0, /* tp_as_number */ 0, /* tp_as_sequence */ 0, /* tp_as_mapping */ (hashfunc)0, /* tp_hash */ (ternaryfunc)0, /* tp_call */ (reprfunc)0, /* tp_str */ (getattrofunc)0, /* tp_getattro */ (setattrofunc)0, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT, /* tp_flags */ (char*) 0, /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iternext */ mxTagTable_Methods, /* tp_methods */ mxTagTable_Members, /* tp_members */ }; /* --- Internal functions ----------------------------------------------*/ #ifdef HAVE_UNICODE /* Same as mxTextTools_Join() for Unicode objects. */ static PyObject *mxTextTools_UnicodeJoin(PyObject *seq, Py_ssize_t start, Py_ssize_t stop, PyObject *separator) { PyObject *newstring = 0, *tempstr = 0; Py_ssize_t newstring_len,current_len = 0; Py_UNICODE *p; Py_ssize_t i; Py_UNICODE *sep; Py_ssize_t sep_len; if (separator) { separator = PyUnicode_FromObject(separator); if (separator == NULL) goto onError; sep = PyUnicode_AS_UNICODE(separator); sep_len = PyUnicode_GET_SIZE(separator); } else { sep = NULL; sep_len = 0; } /* Create an empty new string */ newstring_len = (10 + sep_len) * (stop - start); newstring = PyUnicode_FromUnicode(NULL, newstring_len); if (newstring == NULL) goto onError; p = PyUnicode_AS_UNICODE(newstring); /* Join with separator */ for (i = start; i < stop; i++) { register PyObject *o; Py_UNICODE *st; Py_ssize_t len_st; o = PySequence_GetItem(seq, i); if PyTuple_Check(o) { /* Tuple entry: (string,l,r,[...]) */ register Py_ssize_t l,r; /* parse tuple */ Py_Assert((PyTuple_GET_SIZE(o) >= 3) && 
PyInt_Check(PyTuple_GET_ITEM(o,1)) && PyInt_Check(PyTuple_GET_ITEM(o,2)), PyExc_TypeError, "tuples must be of the format (string,l,r[,...])"); tempstr = PyUnicode_FromObject(PyTuple_GET_ITEM(o,0)); if (tempstr == NULL) goto onError; st = PyUnicode_AS_UNICODE(tempstr); len_st = PyUnicode_GET_SIZE(tempstr); l = PyInt_AS_LONG(PyTuple_GET_ITEM(o,1)); r = PyInt_AS_LONG(PyTuple_GET_ITEM(o,2)); /* compute slice */ if (r > len_st) r = len_st; else if (r < 0) { r += len_st + 1; if (r < 0) r = 0; } if (l > len_st) l = len_st; else if (l < 0) { l += len_st + 1; if (l < 0) l = 0; } /* empty ? */ if (l > r) continue; len_st = r - l; if (len_st == 0) continue; /* get pointer right */ st += l; } else { /* Must be a string entry: take the whole string */ tempstr = PyUnicode_FromObject(o); if (tempstr == NULL) goto onError; st = PyUnicode_AS_UNICODE(tempstr); len_st = PyUnicode_GET_SIZE(tempstr); } Py_DECREF(o); /* Resize the new string if needed */ while (current_len + len_st + sep_len >= newstring_len) { newstring_len += newstring_len >> 1; if (PyUnicode_Resize(&newstring, newstring_len)) goto onError; p = PyUnicode_AS_UNICODE(newstring) + current_len; } /* Insert separator */ if (i > 0 && sep_len > 0) { Py_UNICODE_COPY(p, sep, sep_len); p += sep_len; current_len += sep_len; } /* Copy snippet into new string */ Py_UNICODE_COPY(p, st, len_st); p += len_st; current_len += len_st; Py_DECREF(tempstr); tempstr = NULL; } /* Resize new string to the actual length */ if (PyUnicode_Resize(&newstring, current_len)) goto onError; Py_XDECREF(separator); return newstring; onError: Py_XDECREF(newstring); Py_XDECREF(separator); Py_XDECREF(tempstr); return NULL; } #endif /* Enhanced string join: also excepts tuple (text, left, right,...) entries which then cause text[left:right] to be used as string snippet. separator may be NULL; in that case, "" is used as separator. 
*/ static PyObject *mxTextTools_Join(PyObject *seq, Py_ssize_t start, Py_ssize_t stop, PyObject *separator) { PyObject *newstring = 0; Py_ssize_t newstring_len, current_len = 0; char *p; Py_ssize_t i; char *sep; Py_ssize_t sep_len; if (separator) { #ifdef HAVE_UNICODE if (PyUnicode_Check(separator)) return mxTextTools_UnicodeJoin(seq, start, stop, separator); #endif Py_Assert(PyString_Check(separator), PyExc_TypeError, "separator must be a string"); sep = PyString_AS_STRING(separator); sep_len = PyString_GET_SIZE(separator); } else { sep = NULL; sep_len = 0; } /* Create an empty new string */ newstring_len = (10 + sep_len) * (stop - start); newstring = PyString_FromStringAndSize((char*)NULL, newstring_len); if (newstring == NULL) goto onError; p = PyString_AS_STRING(newstring); /* Join with separator */ for (i = start; i < stop; i++) { register PyObject *o; char *st; Py_ssize_t len_st; o = PySequence_GetItem(seq, i); if PyTuple_Check(o) { /* Tuple entry: (string,l,r,[...]) */ register Py_ssize_t l,r; /* parse tuple */ Py_Assert((PyTuple_GET_SIZE(o) >= 3) && PyInt_Check(PyTuple_GET_ITEM(o,1)) && PyInt_Check(PyTuple_GET_ITEM(o,2)), PyExc_TypeError, "tuples must be of the format (string,int,int[,...])"); #ifdef HAVE_UNICODE if (PyUnicode_Check(PyTuple_GET_ITEM(o,0))) { /* Redirect to Unicode implementation; all previous work is lost. */ Py_DECREF(o); Py_DECREF(newstring); return mxTextTools_UnicodeJoin(seq, start, stop, separator); } #endif Py_Assert(PyString_Check(PyTuple_GET_ITEM(o,0)), PyExc_TypeError, "tuples must be of the format (string,int,int[,...])"); st = PyString_AS_STRING(PyTuple_GET_ITEM(o,0)); len_st = PyString_GET_SIZE(PyTuple_GET_ITEM(o,0)); l = PyInt_AS_LONG(PyTuple_GET_ITEM(o,1)); r = PyInt_AS_LONG(PyTuple_GET_ITEM(o,2)); /* compute slice */ if (r > len_st) r = len_st; else if (r < 0) { r += len_st + 1; if (r < 0) r = 0; } if (l > len_st) l = len_st; else if (l < 0) { l += len_st + 1; if (l < 0) l = 0; } /* empty ? 
*/ if (l > r) continue; len_st = r - l; if (len_st == 0) continue; /* get pointer right */ st += l; } else if (PyString_Check(o)) { /* String entry: take the whole string */ st = PyString_AS_STRING(o); len_st = PyString_GET_SIZE(o); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(o)) { /* Redirect to Unicode implementation; all previous work is lost. */ Py_DECREF(o); Py_DECREF(newstring); return mxTextTools_UnicodeJoin(seq, start, stop, separator); } #endif else { Py_DECREF(o); Py_Error(PyExc_TypeError, "list must contain tuples or strings as entries"); } Py_DECREF(o); /* Resize the new string if needed */ while (current_len + len_st + sep_len >= newstring_len) { newstring_len += newstring_len >> 1; if (_PyString_Resize(&newstring, newstring_len)) goto onError; p = PyString_AS_STRING(newstring) + current_len; } /* Insert separator */ if (i > 0 && sep_len > 0) { memcpy(p, sep, sep_len); p += sep_len; current_len += sep_len; } /* Copy snippet into new string */ memcpy(p,st,len_st); p += len_st; current_len += len_st; } /* Resize new string to the actual length */ if (_PyString_Resize(&newstring, current_len)) goto onError; return newstring; onError: Py_XDECREF(newstring); return NULL; } static PyObject *mxTextTools_HexStringFromString(char *str, Py_ssize_t len) { PyObject *w = 0; Py_ssize_t i; char *hex; static const char hexdigits[] = "0123456789abcdef"; /* Convert to HEX */ w = PyString_FromStringAndSize(NULL,2*len); if (!w) goto onError; hex = PyString_AS_STRING(w); for (i = 0; i < len; i ++) { unsigned char c = (unsigned char)*str; *hex++ = hexdigits[c >> 4]; *hex++ = hexdigits[c & 0x0F]; str++; } return w; onError: Py_XDECREF(w); return NULL; } static PyObject *mxTextTools_StringFromHexString(char *hex, Py_ssize_t len) { PyObject *w = 0; Py_ssize_t i; char *str; static const char hexdigits[] = "0123456789abcdef"; /* Convert to string */ Py_Assert(len % 2 == 0, PyExc_TypeError, "need 2-digit hex string argument"); len >>= 1; w = 
PyString_FromStringAndSize(NULL,len);
    if (!w)
        goto onError;
    str = PyString_AS_STRING(w);

    /* Decode two hex digits per output byte; tolower() makes the
       scan case-insensitive. */
    for (i = 0; i < len; i++,str++) {
        register char c;
        register Py_ssize_t j;

        /* High nibble */
        c = tolower(*hex++);
        for (j = 0; j < (Py_ssize_t)sizeof(hexdigits); j++)
            if (c == hexdigits[j]) {
                *str = j << 4;
                break;
            }
        if (j == sizeof(hexdigits)) {
            DPRINTF("Failed: '%c' (%u) at %i\n",c,(unsigned int)c,i);
            Py_Error(PyExc_ValueError,
                     "argument contains non-hex characters");
        }

        /* Low nibble */
        c = tolower(*hex++);
        for (j = 0; j < (Py_ssize_t)sizeof(hexdigits); j++)
            if (c == hexdigits[j]) {
                *str += j;
                break;
            }
        if (j == sizeof(hexdigits)) {
            DPRINTF("Failed2: '%c' (%u) at %i\n",c,(unsigned int)c,i);
            Py_Error(PyExc_ValueError,
                     "argument contains non-hex characters");
        }
    }
    return w;

 onError:
    Py_XDECREF(w);
    return NULL;
}

/* Return 1 if text[left:right] only contains code points < 128,
   0 otherwise; -1 after setting an exception for non-string input. */
static int mxTextTools_IsASCII(PyObject *text,
                               Py_ssize_t left, Py_ssize_t right)
{
    if (PyString_Check(text)) {
        Py_ssize_t len;
        register Py_ssize_t i;
        register unsigned char *str = (unsigned char *)PyString_AS_STRING(text);

        len = PyString_GET_SIZE(text);
        Py_CheckSequenceSlice(len, left, right);
        for (i = left; i < right; i++)
            if (str[i] >= 128)
                return 0;
        return 1;
    }
#ifdef HAVE_UNICODE
    else if (PyUnicode_Check(text)) {
        Py_ssize_t len;
        register Py_ssize_t i;
        register Py_UNICODE *str = PyUnicode_AS_UNICODE(text);

        len = PyUnicode_GET_SIZE(text);
        Py_CheckSequenceSlice(len, left, right);
        for (i = left; i < right; i++)
            if (str[i] >= 128)
                return 0;
        return 1;
    }
#endif
    else
        Py_Error(PyExc_TypeError,
                 "need string object");

 onError:
    return -1;
}

/* Takes a list of tuples (replacement,l,r,...) and produces a
   taglist suitable for mxTextTools_Join() which creates a copy of
   text where every slice [l:r] is replaced by the given replacement.
*/
static PyObject *mxTextTools_Joinlist(PyObject *text,
				      PyObject *list,
				      Py_ssize_t pos,
				      Py_ssize_t text_len)
{
    PyObject *joinlist = 0;
    Py_ssize_t list_len;
    Py_ssize_t i;
    Py_ssize_t listitem = 0;
    Py_ssize_t listsize = INITIAL_LIST_SIZE;

    /* Validate the slice [pos:text_len] against the text object */
    if (PyString_Check(text)) {
	Py_CheckStringSlice(text, pos, text_len);
    }
#ifdef HAVE_UNICODE
    else if (PyUnicode_Check(text)) {
	Py_CheckUnicodeSlice(text, pos, text_len);
    }
#endif
    else
	Py_Error(PyExc_TypeError,
		 "expected string or unicode");

    Py_Assert(PyList_Check(list),
	      PyExc_TypeError,
	      "expected a list of tuples as second argument");
    list_len = PyList_GET_SIZE(list);

    joinlist = PyList_New(listsize);
    if (joinlist == NULL)
	goto onError;

    for (i = 0; i < list_len; i++) {
	register PyObject *t;
	register Py_ssize_t left, right;

	t = PyList_GET_ITEM(list, i);
	Py_Assert(PyTuple_Check(t) &&
		  (PyTuple_GET_SIZE(t) >= 3) &&
		  (PyString_Check(PyTuple_GET_ITEM(t,0)) ||
		   PyUnicode_Check(PyTuple_GET_ITEM(t,0))) &&
		  PyInt_Check(PyTuple_GET_ITEM(t,1)) &&
		  PyInt_Check(PyTuple_GET_ITEM(t,2)),
		  PyExc_TypeError,
		  "tuples must be of the form (string,int,int,...)");
	left = PyInt_AS_LONG(PyTuple_GET_ITEM(t,1));
	right = PyInt_AS_LONG(PyTuple_GET_ITEM(t,2));
	/* slices must be non-overlapping and given in ascending order */
	Py_Assert(left >= pos,
		  PyExc_ValueError,
		  "list is not sorted ascending");

	if (left > pos) {
	    /* joinlist.append((text,pos,left)) -- keep the unchanged
	       text between the previous slice and this one */
	    register PyObject *v;
	    register PyObject *w;

	    v = PyTuple_New(3);
	    if (v == NULL)
		goto onError;
	    Py_INCREF(text);
	    PyTuple_SET_ITEM(v,0,text);
	    w = PyInt_FromLong(pos);
	    if (w == NULL)
		goto onError;
	    PyTuple_SET_ITEM(v,1,w);
	    w = PyTuple_GET_ITEM(t,1);
	    Py_INCREF(w);
	    PyTuple_SET_ITEM(v,2,w);
	    if (listitem < listsize)
		PyList_SET_ITEM(joinlist,listitem,v);
	    else {
		PyList_Append(joinlist,v);
		Py_DECREF(v);
	    }
	    listitem++;
	}

	/* joinlist.append(string) -- the replacement text itself */
	if (listitem < listsize) {
	    register PyObject *v = PyTuple_GET_ITEM(t,0);
	    Py_INCREF(v);
	    PyList_SET_ITEM(joinlist,listitem,v);
	}
	else
	    PyList_Append(joinlist,PyTuple_GET_ITEM(t,0));
	listitem++;
	pos = right;
    }

    if (pos < text_len) {
	/*
	   joinlist.append((text,pos,text_len)) -- keep the tail of the
	   text after the last replaced slice */
	register PyObject *v;
	register PyObject *w;

	v = PyTuple_New(3);
	if (v == NULL)
	    goto onError;
	Py_INCREF(text);
	PyTuple_SET_ITEM(v,0,text);
	w = PyInt_FromLong(pos);
	if (w == NULL)
	    goto onError;
	PyTuple_SET_ITEM(v,1,w);
	w = PyInt_FromLong(text_len);
	if (w == NULL)
	    goto onError;
	PyTuple_SET_ITEM(v,2,w);
	if (listitem < listsize)
	    PyList_SET_ITEM(joinlist,listitem,v);
	else {
	    PyList_Append(joinlist,v);
	    Py_DECREF(v);
	}
	listitem++;
    }

    /* Resize list if necessary */
    if (listitem < listsize)
	PyList_SetSlice(joinlist,listitem,listsize,(PyObject*)NULL);

    return joinlist;

 onError:
    Py_XDECREF(joinlist);
    return NULL;
}

#ifdef HAVE_UNICODE
/* Split text[start:text_len] at the single-character separator and
   return the substrings (separator not included) as a new list. */
static PyObject *mxTextTools_UnicodeCharSplit(PyObject *text,
					      PyObject *separator,
					      Py_ssize_t start,
					      Py_ssize_t text_len)
{
    PyObject *list = NULL;
    register Py_ssize_t x;
    Py_ssize_t listitem = 0;
    Py_ssize_t listsize = INITIAL_LIST_SIZE;
    Py_UNICODE *tx;
    Py_UNICODE sep;

    /* Coerce both arguments to unicode; owned references from here on */
    text = PyUnicode_FromObject(text);
    if (text == NULL) {
	separator = NULL;
	goto onError;
    }
    separator = PyUnicode_FromObject(separator);
    if (separator == NULL)
	goto onError;

    Py_CheckUnicodeSlice(text, start, text_len);
    Py_Assert(PyUnicode_GET_SIZE(separator) == 1,
	      PyExc_TypeError,
	      "separator must be a single character");

    tx = PyUnicode_AS_UNICODE(text);
    sep = *PyUnicode_AS_UNICODE(separator);

    list = PyList_New(listsize);
    if (!list)
	goto onError;

    x = start;
    while (1) {
	PyObject *s;
	register Py_ssize_t z;

	/* Skip to next separator */
	z = x;
	for (;x < text_len; x++)
	    if (tx[x] == sep)
		break;

	/* Append the slice to list */
	s = PyUnicode_FromUnicode(&tx[z], x - z);
	if (!s)
	    goto onError;
	if (listitem < listsize)
	    PyList_SET_ITEM(list,listitem,s);
	else {
	    PyList_Append(list,s);
	    Py_DECREF(s);
	}
	listitem++;

	if (x == text_len)
	    break;

	/* Skip separator */
	x++;
    }

    /* Resize list if necessary */
    if (listitem < listsize)
	PyList_SetSlice(list,listitem,listsize,(PyObject*)NULL);

    Py_DECREF(text);
    Py_DECREF(separator);
    return list;

 onError:
    Py_XDECREF(list);
    Py_XDECREF(text);
    Py_XDECREF(separator);
    return NULL;
}
#endif

/* Split text[start:text_len] at the single-character separator and
   return the substrings (separator not included) as a new list.
   Dispatches to the Unicode implementation when either argument is
   a unicode object. */
static PyObject *mxTextTools_CharSplit(PyObject *text,
				       PyObject *separator,
				       Py_ssize_t start,
				       Py_ssize_t text_len)
{
    PyObject *list = 0;
    register Py_ssize_t x;
    Py_ssize_t listitem = 0;
    Py_ssize_t listsize = INITIAL_LIST_SIZE;
    char *tx;
    char sep;

#ifdef HAVE_UNICODE
    if (PyUnicode_Check(text) || PyUnicode_Check(separator))
	return mxTextTools_UnicodeCharSplit(text, separator,
					    start, text_len);
#endif

    if (PyString_Check(text) && PyString_Check(separator)) {
	Py_CheckStringSlice(text, start, text_len);
    }
    else
	Py_Error(PyExc_TypeError,
		 "text and separator must be strings or unicode");
    Py_Assert(PyString_GET_SIZE(separator) == 1,
	      PyExc_TypeError,
	      "separator must be a single character");

    tx = PyString_AS_STRING(text);
    sep = *PyString_AS_STRING(separator);

    list = PyList_New(listsize);
    if (!list)
	goto onError;

    x = start;
    while (1) {
	PyObject *s;
	register Py_ssize_t z;

	/* Skip to next separator */
	z = x;
	for (;x < text_len; x++)
	    if (tx[x] == sep)
		break;

	/* Append the slice to list */
	s = PyString_FromStringAndSize(&tx[z], x - z);
	if (!s)
	    goto onError;
	if (listitem < listsize)
	    PyList_SET_ITEM(list,listitem,s);
	else {
	    PyList_Append(list,s);
	    Py_DECREF(s);
	}
	listitem++;

	if (x == text_len)
	    break;

	/* Skip separator */
	x++;
    }

    /* Resize list if necessary */
    if (listitem < listsize)
	PyList_SetSlice(list,listitem,listsize,(PyObject*)NULL);

    return list;

 onError:
    Py_XDECREF(list);
    return NULL;
}

#ifdef HAVE_UNICODE
/* Split text[start:text_len] into a 2-tuple (head, tail) at the nth
   occurrence of the single-character separator; nth > 0 counts from
   the left, nth < 0 from the right.  The separator itself is dropped. */
static PyObject *mxTextTools_UnicodeSplitAt(PyObject *text,
					    PyObject *separator,
					    Py_ssize_t nth,
					    Py_ssize_t start,
					    Py_ssize_t text_len)
{
    PyObject *tuple = 0;
    register Py_ssize_t x;
    PyObject *s;
    Py_UNICODE *tx;
    Py_UNICODE sep;

    /* Coerce both arguments to unicode; owned references from here on */
    text = PyUnicode_FromObject(text);
    if (text == NULL) {
	separator = NULL;
	goto onError;
    }
    separator = PyUnicode_FromObject(separator);
    if (separator == NULL)
	goto onError;

    Py_CheckUnicodeSlice(text, start, text_len);
    Py_Assert(PyUnicode_GET_SIZE(separator) == 1,
	      PyExc_TypeError,
	      "separator must be a single character");

    tx = PyUnicode_AS_UNICODE(text);
    sep = *PyUnicode_AS_UNICODE(separator);

    tuple = PyTuple_New(2);
    if (!tuple)
	goto onError;

    if (nth > 0) {
	/* Skip to nth separator from the left */
	x = start;
	while (1) {
	    for (; x < text_len; x++)
		if (tx[x] == sep)
		    break;
	    if (--nth == 0 || x == text_len)
		break;
	    x++;
	}
    }
    else if (nth < 0) {
	/* Skip to nth separator from the right */
	x = text_len - 1;
	while (1) {
	    for (; x >= start; x--)
		if (tx[x] == sep)
		    break;
	    if (++nth == 0 || x < start)
		break;
	    x--;
	}
    }
    else
	Py_Error(PyExc_ValueError,
		 "nth must be non-zero");

    /* Add to tuple */
    if (x < start)
	s = PyUnicode_FromUnicode((Py_UNICODE *)"", 0);
    else
	s = PyUnicode_FromUnicode(&tx[start], x - start);
    if (!s)
	goto onError;
    PyTuple_SET_ITEM(tuple,0,s);
    /* Skip separator */
    x++;
    if (x >= text_len)
	s = PyUnicode_FromUnicode((Py_UNICODE *)"", 0);
    else
	s = PyUnicode_FromUnicode(&tx[x], text_len - x);
    if (!s)
	goto onError;
    PyTuple_SET_ITEM(tuple,1,s);

    Py_DECREF(text);
    Py_DECREF(separator);
    return tuple;

 onError:
    Py_XDECREF(tuple);
    Py_XDECREF(text);
    Py_XDECREF(separator);
    return NULL;
}
#endif

/* Split text[start:text_len] into a 2-tuple (head, tail) at the nth
   occurrence of the single-character separator; nth > 0 counts from
   the left, nth < 0 from the right.  Dispatches to the Unicode
   implementation when either argument is a unicode object. */
static PyObject *mxTextTools_SplitAt(PyObject *text,
				     PyObject *separator,
				     Py_ssize_t nth,
				     Py_ssize_t start,
				     Py_ssize_t text_len)
{
    PyObject *tuple = 0;
    register Py_ssize_t x;
    PyObject *s;
    char *tx;
    char sep;

#ifdef HAVE_UNICODE
    if (PyUnicode_Check(text) || PyUnicode_Check(separator))
	return mxTextTools_UnicodeSplitAt(text, separator,
					  nth, start, text_len);
#endif

    if (PyString_Check(text) && PyString_Check(separator)) {
	Py_CheckStringSlice(text, start, text_len);
    }
    else
	Py_Error(PyExc_TypeError,
		 "text and separator must be strings or unicode");
    Py_Assert(PyString_GET_SIZE(separator) == 1,
	      PyExc_TypeError,
	      "separator must be a single character");

    tx = PyString_AS_STRING(text);
    sep = *PyString_AS_STRING(separator);

    tuple = PyTuple_New(2);
    if (!tuple)
	goto onError;

    if (nth > 0) {
	/* Skip to nth separator from the left */
	x = start;
	while (1) {
	    for (; x < text_len; x++)
		if (tx[x] == sep)
		    break;
	    if (--nth == 0 || x == text_len)
		break;
	    x++;
	}
    }
    else if (nth < 0) {
	/* Skip to nth separator from the right */
	x = text_len - 1;
	while (1) {
	    for (; x >= start; x--)
		if (tx[x] == sep)
		    break;
	    if (++nth == 0 || x < start)
		break;
	    x--;
	}
    }
    else
	Py_Error(PyExc_ValueError,
		 "nth must be non-zero");

    /* Add to tuple */
    if (x < start)
	s = PyString_FromStringAndSize("",0);
    else
	s = PyString_FromStringAndSize(&tx[start], x - start);
    if (!s)
	goto onError;
    PyTuple_SET_ITEM(tuple,0,s);
    /* Skip separator */
    x++;
    if (x >= text_len)
	s = PyString_FromStringAndSize("",0);
    else
	s = PyString_FromStringAndSize(&tx[x], text_len - x);
    if (!s)
	goto onError;
    PyTuple_SET_ITEM(tuple,1,s);

    return tuple;

 onError:
    Py_XDECREF(tuple);
    return NULL;
}

#ifdef HAVE_UNICODE
/* Check whether text[start:text_len] ends with one of the strings in
   the tuple suffixes; return the first matching suffix (new reference)
   or None. */
static PyObject *mxTextTools_UnicodeSuffix(PyObject *text,
					   PyObject *suffixes,
					   Py_ssize_t start,
					   Py_ssize_t text_len,
					   PyObject *translate)
{
    Py_ssize_t i;
    Py_UNICODE *tx;

    /* Coerce text to unicode; owned reference from here on */
    text = PyUnicode_FromObject(text);
    if (text == NULL)
	goto onError;
    if (PyUnicode_Check(text)) {
	Py_CheckUnicodeSlice(text, start, text_len);
    }
    else
	Py_Error(PyExc_TypeError,
		 "expected unicode");
    Py_Assert(PyTuple_Check(suffixes),
	      PyExc_TypeError,
	      "suffixes needs to be a tuple of unicode strings");

    /* XXX Add support for translate...
    */
    Py_Assert(translate == NULL,
	      PyExc_TypeError,
	      "translate is not supported for Unicode suffix()es");

    tx = PyUnicode_AS_UNICODE(text);
    for (i = 0; i < PyTuple_GET_SIZE(suffixes); i++) {
	PyObject *suffix = PyTuple_GET_ITEM(suffixes,i);
	Py_ssize_t start_cmp;

	/* Coerce the entry to unicode; owned reference */
	suffix = PyUnicode_FromObject(suffix);
	if (suffix == NULL)
	    goto onError;

	/* Compare the tail slice text[start_cmp:text_len] byte-wise */
	start_cmp = text_len - PyUnicode_GET_SIZE(suffix);
	if (start_cmp >= start &&
	    PyUnicode_AS_UNICODE(suffix)[0] == tx[start_cmp] &&
	    memcmp(PyUnicode_AS_UNICODE(suffix),
		   &tx[start_cmp],
		   PyUnicode_GET_DATA_SIZE(suffix)) == 0) {
	    /* Matched: hand back our owned reference to suffix */
	    Py_DECREF(text);
	    return suffix;
	}

	Py_DECREF(suffix);
    }

    Py_DECREF(text);
    Py_ReturnNone();

 onError:
    Py_XDECREF(text);
    return NULL;
}
#endif

/* Check whether text[start:text_len] ends with one of the strings in
   the tuple suffixes; return the first matching suffix (new reference)
   or None.  translate, if given, must be a 256-char string used to
   translate the text before comparing. */
static PyObject *mxTextTools_Suffix(PyObject *text,
				    PyObject *suffixes,
				    Py_ssize_t start,
				    Py_ssize_t text_len,
				    PyObject *translate)
{
    Py_ssize_t i;
    char *tx;

#ifdef HAVE_UNICODE
    if (PyUnicode_Check(text))
	return mxTextTools_UnicodeSuffix(text, suffixes,
					 start, text_len,
					 translate);
#endif

    if (PyString_Check(text)) {
	Py_CheckStringSlice(text, start, text_len);
    }
    else
	Py_Error(PyExc_TypeError,
		 "expected string or unicode");
    Py_Assert(PyTuple_Check(suffixes),
	      PyExc_TypeError,
	      "suffixes needs to be a tuple of strings");
    tx = PyString_AS_STRING(text);

    if (translate) {
	char *tr;

	Py_Assert(PyString_Check(translate) &&
		  PyString_GET_SIZE(translate) == 256,
		  PyExc_TypeError,
		  "translate must be a string having 256 characters");
	tr = PyString_AS_STRING(translate);

	for (i = 0; i < PyTuple_GET_SIZE(suffixes); i++) {
	    PyObject *suffix = PyTuple_GET_ITEM(suffixes, i);
	    Py_ssize_t start_cmp;
	    register char *s;
	    register char *t;
	    register Py_ssize_t j;

	    Py_AssertWithArg(PyString_Check(suffix),
			     PyExc_TypeError,
			     "tuple entry %d is not a string",(unsigned int)i);
	    start_cmp = text_len - PyString_GET_SIZE(suffix);
	    if (start_cmp < start)
		continue;

	    /* Do the compare using a translate table */
	    s = PyString_AS_STRING(suffix);
	    t = tx + start_cmp;
	    for (j = start_cmp; j < text_len; j++, s++, t++)
		if (*s != tr[(unsigned
			      char)*t])
		    break;
	    if (j == text_len) {
		/* Matched over the whole tail: borrowed ref -> own it */
		Py_INCREF(suffix);
		return suffix;
	    }
	}
    }
    else
	for (i = 0; i < PyTuple_GET_SIZE(suffixes); i++) {
	    PyObject *suffix = PyTuple_GET_ITEM(suffixes,i);
	    Py_ssize_t start_cmp;

	    Py_AssertWithArg(PyString_Check(suffix),
			     PyExc_TypeError,
			     "tuple entry %d is not a string",(unsigned int)i);
	    start_cmp = text_len - PyString_GET_SIZE(suffix);
	    if (start_cmp < start)
		continue;

	    /* Compare without translate table */
	    if (PyString_AS_STRING(suffix)[0] == tx[start_cmp] &&
		strncmp(PyString_AS_STRING(suffix),
			&tx[start_cmp],
			PyString_GET_SIZE(suffix)) == 0) {
		Py_INCREF(suffix);
		return suffix;
	    }
	}

    Py_ReturnNone();

 onError:
    return NULL;
}

#ifdef HAVE_UNICODE
/* Check whether text[start:text_len] begins with one of the strings in
   the tuple prefixes; return the first matching prefix (new reference)
   or None. */
static PyObject *mxTextTools_UnicodePrefix(PyObject *text,
					   PyObject *prefixes,
					   Py_ssize_t start,
					   Py_ssize_t text_len,
					   PyObject *translate)
{
    Py_ssize_t i;
    Py_UNICODE *tx;

    /* Coerce text to unicode; owned reference from here on */
    text = PyUnicode_FromObject(text);
    if (text == NULL)
	goto onError;
    if (PyUnicode_Check(text)) {
	Py_CheckUnicodeSlice(text, start, text_len);
    }
    else
	Py_Error(PyExc_TypeError,
		 "expected unicode");
    Py_Assert(PyTuple_Check(prefixes),
	      PyExc_TypeError,
	      "prefixes needs to be a tuple of unicode strings");

    /* XXX Add support for translate...
*/ Py_Assert(translate == NULL, PyExc_TypeError, "translate is not supported for Unicode prefix()es"); tx = PyUnicode_AS_UNICODE(text); for (i = 0; i < PyTuple_GET_SIZE(prefixes); i++) { PyObject *prefix = PyTuple_GET_ITEM(prefixes,i); prefix = PyUnicode_FromObject(prefix); if (prefix == NULL) goto onError; /* Compare without translate table */ if (start + PyString_GET_SIZE(prefix) <= text_len && PyUnicode_AS_UNICODE(prefix)[0] == tx[start] && memcmp(PyUnicode_AS_UNICODE(prefix), &tx[start], PyUnicode_GET_DATA_SIZE(prefix)) == 0) { Py_INCREF(prefix); return prefix; } Py_DECREF(prefix); } Py_DECREF(text); Py_ReturnNone(); onError: Py_XDECREF(text); return NULL; } #endif static PyObject *mxTextTools_Prefix(PyObject *text, PyObject *prefixes, Py_ssize_t start, Py_ssize_t text_len, PyObject *translate) { Py_ssize_t i; char *tx; #ifdef HAVE_UNICODE if (PyUnicode_Check(text)) return mxTextTools_UnicodePrefix(text, prefixes, start, text_len, translate); #endif if (PyString_Check(text)) { Py_CheckStringSlice(text, start, text_len); } else Py_Error(PyExc_TypeError, "expected string or unicode"); Py_Assert(PyTuple_Check(prefixes), PyExc_TypeError, "prefixes needs to be a tuple of strings"); tx = PyString_AS_STRING(text); if (translate) { char *tr; Py_Assert(PyString_Check(translate) && PyString_GET_SIZE(translate) == 256, PyExc_TypeError, "translate must be a string having 256 characters"); tr = PyString_AS_STRING(translate); for (i = 0; i < PyTuple_GET_SIZE(prefixes); i++) { PyObject *prefix = PyTuple_GET_ITEM(prefixes,i); Py_ssize_t cmp_len; register char *s; register char *t; register Py_ssize_t j; Py_AssertWithArg(PyString_Check(prefix), PyExc_TypeError, "tuple entry %d is not a string",(unsigned int)i); cmp_len = PyString_GET_SIZE(prefix); if (start + cmp_len > text_len) continue; /* Do the compare using a translate table */ s = PyString_AS_STRING(prefix); t = tx + start; for (j = 0; j < cmp_len; j++, s++, t++) if (*s != tr[(unsigned char)*t]) break; if (j == cmp_len) { 
		/* Matched: borrowed ref -> own it */
		Py_INCREF(prefix);
		return prefix;
	    }
	}
    }
    else
	for (i = 0; i < PyTuple_GET_SIZE(prefixes); i++) {
	    PyObject *prefix = PyTuple_GET_ITEM(prefixes,i);

	    Py_AssertWithArg(PyString_Check(prefix),
			     PyExc_TypeError,
			     "tuple entry %d is not a string",(unsigned int)i);
	    if (start + PyString_GET_SIZE(prefix) > text_len)
		continue;

	    /* Compare without translate table */
	    if (PyString_AS_STRING(prefix)[0] == tx[start] &&
		strncmp(PyString_AS_STRING(prefix),
			&tx[start],
			PyString_GET_SIZE(prefix)) == 0) {
		Py_INCREF(prefix);
		return prefix;
	    }
	}

    Py_ReturnNone();

 onError:
    return NULL;
}

/* Strips off characters appearing in the character set from
   text[start:stop] and returns the result as Python string object.

   where indicates the mode:
   where < 0: strip left only
   where = 0: strip left and right
   where > 0: strip right only

*/
static PyObject *mxTextTools_SetStrip(char *tx,
				      Py_ssize_t tx_len,
				      char *setstr,
				      Py_ssize_t setstr_len,
				      Py_ssize_t start,
				      Py_ssize_t stop,
				      Py_ssize_t where)
{
    Py_ssize_t left, right;

    /* setstr is a 256-bit (32-byte) character bitmap as built by set() */
    Py_Assert(setstr_len == 32,
	      PyExc_TypeError,
	      "separator needs to be a set as obtained from set()");
    Py_CheckBufferSlice(tx_len, start, stop);

    /* Strip left */
    if (where <= 0) {
	register Py_ssize_t x;
	for (x = start; x < stop; x++)
	    if (!Py_CharInSet(tx[x], setstr))
		break;
	left = x;
    }
    else
	left = start;

    /* Strip right */
    if (where >= 0) {
	register Py_ssize_t x;
	for (x = stop - 1; x >= start; x--)
	    if (!Py_CharInSet(tx[x], setstr))
		break;
	right = x + 1;
    }
    else
	right = stop;

    return PyString_FromStringAndSize(tx + left, max(right - left, 0));

 onError:
    return NULL;
}

/* Split tx[start:text_len] at characters from the given set bitmap,
   dropping the separators and empty slices. */
static PyObject *mxTextTools_SetSplit(char *tx,
				      Py_ssize_t tx_len,
				      char *setstr,
				      Py_ssize_t setstr_len,
				      Py_ssize_t start,
				      Py_ssize_t text_len)
{
    PyObject *list = NULL;
    register Py_ssize_t x;
    Py_ssize_t listitem = 0;
    Py_ssize_t listsize = INITIAL_LIST_SIZE;

    Py_Assert(setstr_len == 32,
	      PyExc_TypeError,
	      "separator needs to be a set as obtained from set()");
    Py_CheckBufferSlice(tx_len,start,text_len);

    list = PyList_New(listsize);
    if (!list)
	goto onError;

    x = start;
    while (x < text_len) {
	Py_ssize_t z;

	/* Skip all text in set (bitmap test: bit (c & 7) of byte c>>3) */
	for (;x < text_len; x++) {
	    register Py_ssize_t c = (unsigned char)tx[x];
	    register Py_ssize_t block = (unsigned char)setstr[c >> 3];
	    if (!block || ((block & (1 << (c & 7))) == 0))
		break;
	}

	/* Skip all text not in set */
	z = x;
	for (;x < text_len; x++) {
	    register Py_ssize_t c = (unsigned char)tx[x];
	    register Py_ssize_t block = (unsigned char)setstr[c >> 3];
	    if (block && ((block & (1 << (c & 7))) != 0))
		break;
	}

	/* Append the slice to list if it is not empty */
	if (x > z) {
	    PyObject *s;
	    s = PyString_FromStringAndSize((char *)&tx[z], x - z);
	    if (!s)
		goto onError;
	    if (listitem < listsize)
		PyList_SET_ITEM(list,listitem,s);
	    else {
		PyList_Append(list,s);
		Py_DECREF(s);
	    }
	    listitem++;
	}
    }

    /* Resize list if necessary */
    if (listitem < listsize)
	PyList_SetSlice(list,listitem,listsize,(PyObject*)NULL);

    return list;

 onError:
    Py_XDECREF(list);
    return NULL;
}

/* Like SetSplit, but keeps the separator runs: the resulting list
   alternates text-not-in-set and text-in-set slices. */
static PyObject *mxTextTools_SetSplitX(char *tx,
				       Py_ssize_t tx_len,
				       char *setstr,
				       Py_ssize_t setstr_len,
				       Py_ssize_t start,
				       Py_ssize_t text_len)
{
    PyObject *list = NULL;
    register Py_ssize_t x;
    Py_ssize_t listitem = 0;
    Py_ssize_t listsize = INITIAL_LIST_SIZE;

    Py_Assert(setstr_len == 32,
	      PyExc_TypeError,
	      "separator needs to be a set as obtained from set()");
    Py_CheckBufferSlice(tx_len,start,text_len);

    list = PyList_New(listsize);
    if (!list)
	goto onError;

    x = start;
    while (x < text_len) {
	PyObject *s;
	register Py_ssize_t z;

	/* Skip all text not in set */
	z = x;
	for (;x < text_len; x++) {
	    register unsigned int c = (unsigned char)tx[x];
	    register unsigned int block = (unsigned char)setstr[c >> 3];
	    if (block && ((block & (1 << (c & 7))) != 0))
		break;
	}

	/* Append the slice to list */
	s = PyString_FromStringAndSize((char *)&tx[z], x - z);
	if (!s)
	    goto onError;
	if (listitem < listsize)
	    PyList_SET_ITEM(list,listitem,s);
	else {
	    PyList_Append(list,s);
	    Py_DECREF(s);
	}
	listitem++;

	if (x >= text_len)
	    break;

	/* Skip all text in set */
	z = x;
	for (;x <
	     text_len; x++) {
	    register unsigned int c = (unsigned char)tx[x];
	    register unsigned int block = (unsigned char)setstr[c >> 3];
	    if (!block || ((block & (1 << (c & 7))) == 0))
		break;
	}

	/* Append the slice to list if it is not empty */
	s = PyString_FromStringAndSize((char *)&tx[z], x - z);
	if (!s)
	    goto onError;
	if (listitem < listsize)
	    PyList_SET_ITEM(list,listitem,s);
	else {
	    PyList_Append(list,s);
	    Py_DECREF(s);
	}
	listitem++;
    }

    /* Resize list if necessary */
    if (listitem < listsize)
	PyList_SetSlice(list,listitem,listsize,(PyObject*)NULL);

    return list;

 onError:
    Py_XDECREF(list);
    return NULL;
}

/* Return an upper-cased copy of the string text, using the module's
   mx_ToUpper translation table. */
static PyObject *mxTextTools_Upper(PyObject *text)
{
    PyObject *ntext;
    register unsigned char *s;
    register unsigned char *orig;
    register Py_ssize_t i;
    unsigned char *tr;
    Py_ssize_t len;

    Py_Assert(PyString_Check(text),
	      PyExc_TypeError,
	      "expected a Python string");

    len = PyString_GET_SIZE(text);
    ntext = PyString_FromStringAndSize(NULL,len);
    if (!ntext)
	goto onError;

    /* Translate */
    tr = (unsigned char *)PyString_AS_STRING(mx_ToUpper);
    orig = (unsigned char *)PyString_AS_STRING(text);
    s = (unsigned char *)PyString_AS_STRING(ntext);
    for (i = 0; i < len; i++, s++, orig++)
	*s = tr[*orig];

    return ntext;

 onError:
    return NULL;
}

#ifdef HAVE_UNICODE
/* Return an upper-cased copy of the unicode object text, using
   Py_UNICODE_TOUPPER per character. */
static PyObject *mxTextTools_UnicodeUpper(PyObject *text)
{
    PyObject *ntext;
    register Py_UNICODE *s;
    register Py_UNICODE *orig;
    register Py_ssize_t i;
    Py_ssize_t len;

    /* Coerce text to unicode; owned reference from here on */
    text = PyUnicode_FromObject(text);
    if (text == NULL)
	goto onError;

    len = PyUnicode_GET_SIZE(text);
    ntext = PyUnicode_FromUnicode(NULL, len);
    if (!ntext)
	goto onError;

    /* Translate */
    orig = (Py_UNICODE *)PyUnicode_AS_UNICODE(text);
    s = (Py_UNICODE *)PyUnicode_AS_UNICODE(ntext);
    for (i = 0; i < len; i++, s++, orig++)
	*s = Py_UNICODE_TOUPPER(*orig);

    Py_DECREF(text);
    return ntext;

 onError:
    Py_XDECREF(text);
    return NULL;
}
#endif

/* Return a lower-cased copy of the string text, using the module's
   mx_ToLower translation table. */
static PyObject *mxTextTools_Lower(PyObject *text)
{
    PyObject *ntext;
    register unsigned char *s;
    register unsigned char *orig;
    register Py_ssize_t i;
    unsigned
char *tr; Py_ssize_t len; Py_Assert(PyString_Check(text), PyExc_TypeError, "expected a Python string"); len = PyString_GET_SIZE(text); ntext = PyString_FromStringAndSize(NULL,len); if (!ntext) goto onError; /* Translate */ tr = (unsigned char *)PyString_AS_STRING(mx_ToLower); orig = (unsigned char *)PyString_AS_STRING(text); s = (unsigned char *)PyString_AS_STRING(ntext); for (i = 0; i < len; i++, s++, orig++) *s = tr[*orig]; return ntext; onError: return NULL; } #ifdef HAVE_UNICODE static PyObject *mxTextTools_UnicodeLower(PyObject *text) { PyObject *ntext; register Py_UNICODE *s; register Py_UNICODE *orig; register Py_ssize_t i; Py_ssize_t len; text = PyUnicode_FromObject(text); if (text == NULL) goto onError; len = PyUnicode_GET_SIZE(text); ntext = PyUnicode_FromUnicode(NULL, len); if (!ntext) goto onError; /* Translate */ orig = (Py_UNICODE *)PyUnicode_AS_UNICODE(text); s = (Py_UNICODE *)PyUnicode_AS_UNICODE(ntext); for (i = 0; i < len; i++, s++, orig++) *s = Py_UNICODE_TOLOWER(*orig); Py_DECREF(text); return ntext; onError: Py_XDECREF(text); return NULL; } #endif /* --- Module functions ------------------------------------------------*/ /* Interface to the tagging engine in mxte.c */ Py_C_Function_WithKeywords( mxTextTools_tag, "tag(text,tagtable,sliceleft=0,sliceright=len(text),taglist=[],context=None) \n""" "Produce a tag list for a string, given a tag-table\n" "- returns a tuple (success, taglist, nextindex)\n" "- if taglist == None, then no taglist is created" ) { PyObject *text; PyObject *tagtable; Py_ssize_t sliceright = INT_MAX; Py_ssize_t sliceleft = 0; PyObject *taglist = 0; Py_ssize_t taglist_len; PyObject *context = 0; Py_ssize_t next, result; PyObject *res; Py_KeywordsGet6Args("OO|iiOO:tag", text,tagtable,sliceleft,sliceright,taglist,context); if (taglist == NULL) { /* not given, so use default: an empty list */ taglist = PyList_New(0); if (taglist == NULL) goto onError; taglist_len = 0; } else { Py_INCREF(taglist); Py_Assert(PyList_Check(taglist) 
|| taglist == Py_None, PyExc_TypeError, "taglist must be a list or None"); if (taglist != Py_None) { taglist_len = PyList_Size(taglist); if (taglist_len < 0) goto onError; } else taglist_len = 0; } Py_Assert(mxTagTable_Check(tagtable) || PyTuple_Check(tagtable) || PyList_Check(tagtable), PyExc_TypeError, "tagtable must be a TagTable instance, list or tuple"); /* Prepare the argument for the Tagging Engine and let it process the request */ if (PyString_Check(text)) { Py_CheckStringSlice(text, sliceleft, sliceright); if (!mxTagTable_Check(tagtable)) { tagtable = mxTagTable_New(tagtable, MXTAGTABLE_STRINGTYPE, 1); if (tagtable == NULL) goto onError; } else if (mxTagTable_Type(tagtable) != MXTAGTABLE_STRINGTYPE) { Py_Error(PyExc_TypeError, "TagTable instance is not intended for parsing strings"); } else Py_INCREF(tagtable); /* Call the Tagging Engine */ result = mxTextTools_TaggingEngine(text, sliceleft, sliceright, (mxTagTableObject *)tagtable, taglist, context, &next); Py_DECREF(tagtable); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { Py_CheckUnicodeSlice(text, sliceleft, sliceright); if (!mxTagTable_Check(tagtable)) { tagtable = mxTagTable_New(tagtable, 1, 1); if (tagtable == NULL) goto onError; } else if (mxTagTable_Type(tagtable) != MXTAGTABLE_UNICODETYPE) { Py_Error(PyExc_TypeError, "TagTable instance is not intended for parsing Unicode"); } else Py_INCREF(tagtable); /* Call the Tagging Engine */ result = mxTextTools_UnicodeTaggingEngine(text, sliceleft, sliceright, (mxTagTableObject *)tagtable, taglist, context, &next); Py_DECREF(tagtable); } #endif else Py_Error(PyExc_TypeError, "text must be a string or unicode"); /* Check for exceptions during matching */ if (result == 0) goto onError; /* Undo changes to taglist in case of a match failure (result == 1) */ if (result == 1 && taglist != Py_None) { DPRINTF(" undoing changes: del taglist[%i:%i]\n", taglist_len, PyList_Size(taglist)); if (PyList_SetSlice(taglist, taglist_len, PyList_Size(taglist), NULL)) 
goto onError; } /* Convert result to the documented external values: 0 - no match, 1 - match. */ result--; /* Build result tuple */ res = PyTuple_New(3); if (!res) goto onError; PyTuple_SET_ITEM(res,0,PyInt_FromLong(result)); PyTuple_SET_ITEM(res,1,taglist); PyTuple_SET_ITEM(res,2,PyInt_FromLong(next)); return res; onError: if (!PyErr_Occurred()) Py_Error(PyExc_SystemError, "NULL result without error in builtin tag()"); Py_XDECREF(taglist); return NULL; } /* An extended version of string.join() for taglists: */ Py_C_Function( mxTextTools_join, "join(joinlist,sep='',start=0,stop=len(joinlist))\n\n" "Copy snippets from different strings together producing a\n" "new string\n" "The first argument must be a list of tuples or strings;\n" "tuples must be of the form (string,l,r[,...]) and turn out\n" "as string[l:r]\n" "NOTE: the syntax used for negative slices is different\n" "than the Python standard: -1 corresponds to the first\n" "character *after* the string, e.g. ('Example',0,-1) gives\n" "'Example' and not 'Exampl', like in Python\n" "sep is an optional separator string, start and stop\n" "define the slice of joinlist that is taken into accont." 
)
{
    PyObject *joinlist = NULL;
    Py_ssize_t joinlist_len;
    PyObject *separator = NULL;
    Py_ssize_t start = 0, stop = INT_MAX;

    Py_Get4Args("O|Oii:join",
                joinlist, separator, start, stop);

    Py_Assert(PySequence_Check(joinlist),
              PyExc_TypeError,
              "first argument needs to be a sequence");

    joinlist_len = PySequence_Length(joinlist);
    Py_Assert(joinlist_len >= 0,
              PyExc_TypeError,
              "first argument needs to have a __len__ method");

    Py_CheckSequenceSlice(joinlist_len, start, stop);

    /* Short-cut */
    if ((stop - start) <= 0)
        return PyString_FromString("");

    return mxTextTools_Join(joinlist, start, stop, separator);

 onError:
    return NULL;
}

/* Special compare function for taglist-tuples, comparing
   the text-slices given:
    - slices starting at a smaller index come first
    - for slices starting at the same index, the longer one wins
*/

Py_C_Function( mxTextTools_cmp,
               "cmp(a,b)\n\n"
               "Compare two valid taglist tuples w/r to their slice\n"
               "position; this is useful for sorting joinlists.")
{
    PyObject *v, *w;
    int cmp;

    Py_Get2Args("OO:cmp", v, w);

    Py_Assert(PyTuple_Check(v) && PyTuple_Check(w) &&
              PyTuple_GET_SIZE(v) >= 3 && PyTuple_GET_SIZE(w) >= 3,
              PyExc_TypeError,
              "invalid taglist-tuple");

    /* Fixed: the original ran these two comparisons inside a for-loop
       whose body never used the loop index, executing the identical
       comparisons twice; the redundant pass is removed.  Also fixed:
       PyObject_RichCompareBool() errors (-1) are now propagated
       instead of being returned as a bogus comparison result while an
       exception was pending. */

    /* Left slice index (tuple item 1): smaller comes first */
    cmp = PyObject_RichCompareBool(PyTuple_GET_ITEM(v, 1),
                                   PyTuple_GET_ITEM(w, 1), Py_LT);
    if (cmp < 0)
        goto onError;
    if (cmp)
        return PyInt_FromLong(cmp);

    /* Right slice index (tuple item 2): for equal starts the longer
       slice wins */
    cmp = PyObject_RichCompareBool(PyTuple_GET_ITEM(v, 2),
                                   PyTuple_GET_ITEM(w, 2), Py_GT);
    if (cmp < 0)
        goto onError;
    if (cmp)
        return PyInt_FromLong(cmp);

    return PyInt_FromLong(0);

 onError:
    return NULL;
}

Py_C_Function( mxTextTools_joinlist,
               "joinlist(text,list,start=0,stop=len(text))\n\n"
               "Takes a list of tuples (replacement,l,r,...) and produces\n"
               "a taglist suitable for join() which creates a copy\n"
               "of text where every slice [l:r] is replaced by the\n"
               "given replacement\n"
               "- the list must be sorted using cmp() as compare function\n"
               "- it may not contain overlapping slices\n"
               "- the slices may not contain negative indices\n"
               "- if the taglist cannot contain overlapping slices, you can\n"
               " give this function the taglist produced by tag() directly\n"
               " (sorting is not needed, as the list will already be sorted)\n"
               "- start and stop set the slice to work in, i.e. text[start:stop]"
               )
{
    PyObject *list;
    PyObject *text;
    Py_ssize_t text_len = INT_MAX;
    Py_ssize_t pos = 0;

    Py_Get4Args("OO|ii:joinlist",
                text, list, pos, text_len);

    return mxTextTools_Joinlist(text, list, pos, text_len);

 onError:
    return NULL;
}

Py_C_Function( mxTextTools_charsplit,
               "charsplit(text,char,start=0,stop=len(text))\n\n"
               "Split text[start:stop] into substrings at char and\n"
               "return the result as list of strings."
               )
{
    PyObject *text, *separator;
    Py_ssize_t text_len = INT_MAX;
    Py_ssize_t start = 0;

    Py_Get4Args("OO|ii:charsplit",
                text, separator, start, text_len);

    return mxTextTools_CharSplit(text, separator,
                                 start, text_len);

 onError:
    return NULL;
}

Py_C_Function( mxTextTools_splitat,
               "splitat(text,char,nth=1,start=0,stop=len(text))\n\n"
               "Split text[start:stop] into two substrings at the nth\n"
               "occurance of char and return the result as 2-tuple. If the\n"
               "character is not found, the second string is empty. nth may\n"
               "be negative: the search is then done from the right and the\n"
               "first string is empty in case the character is not found."
)
{
    PyObject *text, *separator;
    Py_ssize_t text_len = INT_MAX;
    Py_ssize_t start = 0;
    Py_ssize_t nth = 1;

    Py_Get5Args("OO|iii:splitat",
                text, separator, nth, start, text_len);

    return mxTextTools_SplitAt(text, separator,
                               nth, start, text_len);

 onError:
    return NULL;
}

Py_C_Function( mxTextTools_suffix,
               "suffix(text,suffixes,start=0,stop=len(text)[,translate])\n\n"
               "Looks at text[start:stop] and returns the first matching\n"
               "suffix out of the tuple of strings given in suffixes.\n"
               "If no suffix is found to be matching, None is returned.\n"
               "The optional 256 char translate string is used to translate\n"
               "the text prior to comparing it with the given suffixes."
               )
{
    PyObject *text, *suffixes, *translate = NULL;
    Py_ssize_t text_len = INT_MAX;
    Py_ssize_t start = 0;

    Py_Get5Args("OO|iiO:suffix",
                text, suffixes, start, text_len, translate);

    return mxTextTools_Suffix(text, suffixes,
                              start, text_len,
                              translate);

 onError:
    return NULL;
}

Py_C_Function( mxTextTools_prefix,
               "prefix(text,prefixes,start=0,stop=len(text)[,translate])\n\n"
               "Looks at text[start:stop] and returns the first matching\n"
               "prefix out of the tuple of strings given in prefixes.\n"
               "If no prefix is found to be matching, None is returned.\n"
               "The optional 256 char translate string is used to translate\n"
               "the text prior to comparing it with the given prefixes."
)
{
    PyObject *text, *prefixes, *translate = NULL;
    Py_ssize_t text_len = INT_MAX;
    Py_ssize_t start = 0;

    Py_Get5Args("OO|iiO:prefix",
                text, prefixes, start, text_len, translate);

    return mxTextTools_Prefix(text, prefixes,
                              start, text_len,
                              translate);

 onError:
    return NULL;
}

Py_C_Function( mxTextTools_set,
               "set(string,logic=1)\n\n"
               "Returns a character set for string: a bit encoded version\n"
               "of the characters occurring in string.\n"
               "- logic can be set to 0 if all characters *not* in string\n"
               " should go into the set")
{
    PyObject *sto;
    char *s, *st;
    Py_ssize_t len_s;
    int logic = 1;
    Py_ssize_t i;

    Py_Get3Args("s#|i:set",
                s, len_s, logic);

    /* A character set is a 32-byte (256-bit) string: bit n set means
       chr(n) is a member. */
    sto = PyString_FromStringAndSize(NULL, 32);
    if (sto == NULL)
        goto onError;
    st = PyString_AS_STRING(sto);

    if (logic) {
        /* Start empty and switch on the bit of every character seen */
        memset(st, 0x00, 32);
        for (i = 0; i < len_s; i++) {
            int j = (unsigned char)s[i];
            st[j >> 3] |= 1 << (j & 7);
        }
    }
    else {
        /* Inverted set: start full and switch off every character seen */
        memset(st, 0xFF, 32);
        for (i = 0; i < len_s; i++) {
            int j = (unsigned char)s[i];
            st[j >> 3] &= ~(1 << (j & 7));
        }
    }

    return sto;

 onError:
    return NULL;
}

Py_C_Function( mxTextTools_setfind,
               "setfind(text,set,start=0,stop=len(text))\n\n"
               "Find the first occurence of any character from set in\n"
               "text[start:stop]\n set must be a string obtained with set()\n"
               "DEPRECATED: use CharSet().search() instead."
)
{
    PyObject *text;
    PyObject *set;
    Py_ssize_t text_len = INT_MAX;
    Py_ssize_t start = 0;
    register Py_ssize_t x;
    register char *tx;
    register unsigned char *setstr;

    Py_Get4Args("OO|ii:setfind",
                text, set, start, text_len);

    Py_Assert(PyString_Check(text),
              PyExc_TypeError,
              "first argument needs to be a string");
    Py_Assert(PyString_Check(set) && PyString_GET_SIZE(set) == 32,
              PyExc_TypeError,
              "second argument needs to be a set");
    Py_CheckStringSlice(text, start, text_len);

    /* Linear scan for the first character contained in the set */
    x = start;
    tx = PyString_AS_STRING(text) + x;
    setstr = (unsigned char *)PyString_AS_STRING(set);

    for (; x < text_len; tx++, x++)
        if (Py_CharInSet(*tx, setstr))
            break;

    if (x == text_len)
        /* Not found */
        return PyInt_FromLong(-1L);
    else
        return PyInt_FromLong(x);

 onError:
    return NULL;
}

Py_C_Function( mxTextTools_setstrip,
               "setstrip(text,set,start=0,stop=len(text),mode=0)\n\n"
               "Strip all characters in text[start:stop] appearing in set.\n"
               "mode indicates where to strip (<0: left; =0: left and right;\n"
               ">0: right). set must be a string obtained with set()\n"
               "DEPRECATED: use CharSet().strip() instead."
               )
{
    char *tx;
    Py_ssize_t tx_len;
    char *setstr;
    Py_ssize_t setstr_len;
    Py_ssize_t start = 0;
    Py_ssize_t stop = INT_MAX;
    int mode = 0;

    /* Fixed: the format string previously misspelled the function
       name as "setstip", which showed up in argument-parsing error
       messages. */
    Py_Get7Args("s#s#|iii:setstrip",
                tx, tx_len, setstr, setstr_len, start, stop, mode);

    return mxTextTools_SetStrip(tx, tx_len,
                                setstr, setstr_len,
                                start, stop, mode);

 onError:
    return NULL;
}

Py_C_Function( mxTextTools_setsplit,
               "setsplit(text,set,start=0,stop=len(text))\n\n"
               "Split text[start:stop] into substrings using set,\n"
               "omitting the splitting parts and empty substrings.\n"
               "set must be a string obtained from set()\n"
               "DEPRECATED: use CharSet().split() instead."
) { char *tx; Py_ssize_t tx_len; char *setstr; Py_ssize_t setstr_len; Py_ssize_t start = 0; Py_ssize_t stop = INT_MAX; Py_Get6Args("s#s#|ii:setsplit", tx,tx_len,setstr,setstr_len,start,stop); return mxTextTools_SetSplit(tx, tx_len, setstr, setstr_len, start, stop); onError: return NULL; } Py_C_Function( mxTextTools_setsplitx, "setsplitx(text,set,start=0,stop=len(text))\n\n" "Split text[start:stop] into substrings using set, so\n" "that every second entry consists only of characters in set.\n" "set must be a string obtained with set()\n" "DEPRECATED: use CharSet().splitx() instead." ) { Py_ssize_t text_len = INT_MAX; Py_ssize_t start = 0; char *tx; Py_ssize_t tx_len; char *setstr; Py_ssize_t setstr_len; Py_Get6Args("s#s#|ii:setsplitx", tx,tx_len,setstr,setstr_len,start,text_len); return mxTextTools_SetSplitX(tx, tx_len, setstr, setstr_len, start, text_len); onError: return NULL; } Py_C_Function( mxTextTools_upper, "upper(text)\n\n" "Return text converted to upper case.") { PyObject *text; Py_GetArgObject(text); if (PyString_Check(text)) return mxTextTools_Upper(text); #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) return mxTextTools_UnicodeUpper(text); #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); onError: return NULL; } Py_C_Function( mxTextTools_lower, "lower(text)\n\n" "Return text converted to lower case.") { PyObject *text; Py_GetArgObject(text); if (PyString_Check(text)) return mxTextTools_Lower(text); #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) return mxTextTools_UnicodeLower(text); #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); onError: return NULL; } Py_C_Function( mxTextTools_str2hex, "str2hex(text)\n\n" "Return text converted to a string consisting of two byte\n" "HEX values.") { char *str; Py_ssize_t len; Py_Get2Args("s#",str,len); return mxTextTools_HexStringFromString(str,len); onError: return NULL; } Py_C_Function( mxTextTools_hex2str, "hex2str(text)\n\n" "Return text interpreted as two byte 
HEX values converted\n" "to a string.") { char *str; Py_ssize_t len; Py_Get2Args("s#",str,len); return mxTextTools_StringFromHexString(str,len); onError: return NULL; } Py_C_Function( mxTextTools_isascii, "isascii(text,start=0,stop=len(text))\n\n" "Return 1/0 depending on whether text only contains ASCII\n" "characters." ) { PyObject *text; Py_ssize_t start=0, stop = INT_MAX; int rc; Py_GetArgObject(text); rc = mxTextTools_IsASCII(text, start, stop); if (rc < 0) goto onError; return PyInt_FromLong(rc); onError: return NULL; } /* --- module init --------------------------------------------------------- */ /* Python Method Table */ static PyMethodDef Module_methods[] = { Py_MethodWithKeywordsListEntry("tag",mxTextTools_tag), Py_MethodListEntry("join",mxTextTools_join), Py_MethodListEntry("cmp",mxTextTools_cmp), Py_MethodListEntry("joinlist",mxTextTools_joinlist), Py_MethodListEntry("set",mxTextTools_set), Py_MethodListEntry("setfind",mxTextTools_setfind), Py_MethodListEntry("setsplit",mxTextTools_setsplit), Py_MethodListEntry("setsplitx",mxTextTools_setsplitx), Py_MethodListEntry("setstrip",mxTextTools_setstrip), Py_MethodWithKeywordsListEntry("TextSearch",mxTextSearch_TextSearch), Py_MethodListEntry("CharSet",mxCharSet_CharSet), Py_MethodListEntry("TagTable",mxTagTable_TagTable), #ifdef HAVE_UNICODE Py_MethodListEntry("UnicodeTagTable",mxTagTable_UnicodeTagTable), #endif Py_MethodListEntrySingleArg("upper",mxTextTools_upper), Py_MethodListEntrySingleArg("lower",mxTextTools_lower), Py_MethodListEntry("charsplit",mxTextTools_charsplit), Py_MethodListEntry("splitat",mxTextTools_splitat), Py_MethodListEntry("suffix",mxTextTools_suffix), Py_MethodListEntry("prefix",mxTextTools_prefix), Py_MethodListEntry("hex2str",mxTextTools_hex2str), Py_MethodListEntry("str2hex",mxTextTools_str2hex), Py_MethodListEntrySingleArg("isascii",mxTextTools_isascii), {NULL,NULL} /* end of list */ }; /* Cleanup function */ static void mxTextToolsModule_Cleanup(void) { mxTextTools_TagTables = 
NULL; /* Reset mxTextTools_Initialized flag */ mxTextTools_Initialized = 0; } #if PY_MAJOR_VERSION >= 3 static struct PyModuleDef mxTextTools_ModuleDef = { PyModuleDef_HEAD_INIT, MXTEXTTOOLS_MODULE, Module_docstring, -1, Module_methods }; #endif static PyObject* mxTextToolsModule_Initialize(void) { PyObject *module; if (mxTextTools_Initialized) { PyErr_SetString(PyExc_SystemError, "can't initialize "MXTEXTTOOLS_MODULE" more than once"); return NULL; } /* Init type objects */ if (PyType_Ready(&mxTextSearch_Type) < 0) return NULL; if (PyType_Ready(&mxCharSet_Type) < 0) return NULL; if (PyType_Ready(&mxTagTable_Type) < 0) return NULL; /* create module */ #if PY_MAJOR_VERSION >= 3 module = PyModule_Create(&mxTextTools_ModuleDef); #else module = Py_InitModule4(MXTEXTTOOLS_MODULE, /* Module name */ Module_methods, /* Method list */ Module_docstring, /* Module doc-string */ (PyObject *)NULL, /* always pass this as *self */ PYTHON_API_VERSION); /* API Version */ #endif if (!module) return NULL; /* Init TagTable cache */ mxTextTools_TagTables = PyDict_New(); if (!mxTextTools_TagTables) return NULL; /* Register cleanup function */ if (Py_AtExit(mxTextToolsModule_Cleanup) < 0) return NULL; /* Add some symbolic constants to the module */ if (PyModule_AddStringConstant(module, "__version__", VERSION) < 0) return NULL; mx_ToUpper = mxTextTools_ToUpper(); if (!mx_ToUpper) return NULL; if (PyModule_AddObject(module, "to_upper", mx_ToUpper) < 0) return NULL; mx_ToLower = mxTextTools_ToLower(); if (!mx_ToLower) return NULL; if (PyModule_AddObject(module, "to_lower", mx_ToLower) < 0) return NULL; /* Let the tag table cache live in the module dictionary; we just keep a weak reference in mxTextTools_TagTables around. 
*/ if (PyModule_AddObject(module, "tagtable_cache", mxTextTools_TagTables) < 0) return NULL; Py_DECREF(mxTextTools_TagTables); ADD_INT_CONSTANT("BOYERMOORE", MXTEXTSEARCH_BOYERMOORE); ADD_INT_CONSTANT("FASTSEARCH", MXTEXTSEARCH_FASTSEARCH); ADD_INT_CONSTANT("TRIVIAL", MXTEXTSEARCH_TRIVIAL); /* Init exceptions */ mxTextTools_Error = PyErr_NewException("mxTextTools.Error", PyExc_Exception, NULL); if (!mxTextTools_Error) return NULL; if (PyModule_AddObject(module, "Error", mxTextTools_Error) < 0) return NULL; /* Type objects */ Py_INCREF(&mxTextSearch_Type); if (PyModule_AddObject(module, "TextSearchType", (PyObject*) &mxTextSearch_Type) < 0) return NULL; Py_INCREF(&mxCharSet_Type); if (PyModule_AddObject(module, "CharSetType", (PyObject*) &mxCharSet_Type) < 0) return NULL; Py_INCREF(&mxTagTable_Type); if (PyModule_AddObject(module, "TagTableType", (PyObject*) &mxTagTable_Type) < 0) return NULL; /* Tag Table command symbols (these will be exposed via simpleparse.stt.TextTools.Constants.TagTables) */ ADD_INT_CONSTANT("_const_AllIn", MATCH_ALLIN); ADD_INT_CONSTANT("_const_AllNotIn", MATCH_ALLNOTIN); ADD_INT_CONSTANT("_const_Is", MATCH_IS); ADD_INT_CONSTANT("_const_IsIn", MATCH_ISIN); ADD_INT_CONSTANT("_const_IsNot", MATCH_ISNOTIN); ADD_INT_CONSTANT("_const_IsNotIn", MATCH_ISNOTIN); ADD_INT_CONSTANT("_const_Word", MATCH_WORD); ADD_INT_CONSTANT("_const_WordStart", MATCH_WORDSTART); ADD_INT_CONSTANT("_const_WordEnd", MATCH_WORDEND); ADD_INT_CONSTANT("_const_AllInSet", MATCH_ALLINSET); ADD_INT_CONSTANT("_const_IsInSet", MATCH_ISINSET); ADD_INT_CONSTANT("_const_AllInCharSet", MATCH_ALLINCHARSET); ADD_INT_CONSTANT("_const_IsInCharSet", MATCH_ISINCHARSET); ADD_INT_CONSTANT("_const_Fail", MATCH_FAIL); ADD_INT_CONSTANT("_const_Jump", MATCH_JUMP); ADD_INT_CONSTANT("_const_EOF", MATCH_EOF); ADD_INT_CONSTANT("_const_Skip", MATCH_SKIP); ADD_INT_CONSTANT("_const_Move", MATCH_MOVE); ADD_INT_CONSTANT("_const_JumpTarget", MATCH_JUMPTARGET); ADD_INT_CONSTANT("_const_sWordStart", 
MATCH_SWORDSTART); ADD_INT_CONSTANT("_const_sWordEnd", MATCH_SWORDEND); ADD_INT_CONSTANT("_const_sFindWord", MATCH_SFINDWORD); ADD_INT_CONSTANT("_const_NoWord", MATCH_NOWORD); ADD_INT_CONSTANT("_const_Call", MATCH_CALL); ADD_INT_CONSTANT("_const_CallArg", MATCH_CALLARG); ADD_INT_CONSTANT("_const_Table", MATCH_TABLE); ADD_INT_CONSTANT("_const_SubTable", MATCH_SUBTABLE); ADD_INT_CONSTANT("_const_TableInList", MATCH_TABLEINLIST); ADD_INT_CONSTANT("_const_SubTableInList", MATCH_SUBTABLEINLIST); ADD_INT_CONSTANT("_const_Loop", MATCH_LOOP); ADD_INT_CONSTANT("_const_LoopControl", MATCH_LOOPCONTROL); /* Tag Table command flags */ ADD_INT_CONSTANT("_const_CallTag", MATCH_CALLTAG); ADD_INT_CONSTANT("_const_AppendToTagobj", MATCH_APPENDTAG); ADD_INT_CONSTANT("_const_AppendTagobj", MATCH_APPENDTAGOBJ); ADD_INT_CONSTANT("_const_AppendMatch", MATCH_APPENDMATCH); ADD_INT_CONSTANT("_const_LookAhead", MATCH_LOOKAHEAD); /* Tag Table argument integers */ ADD_INT_CONSTANT("_const_To", MATCH_JUMP_TO); ADD_INT_CONSTANT("_const_MatchOk", MATCH_JUMP_MATCHOK); ADD_INT_CONSTANT("_const_MatchFail", MATCH_JUMP_MATCHFAIL); ADD_INT_CONSTANT("_const_ToEOF", MATCH_MOVE_EOF); ADD_INT_CONSTANT("_const_ToBOF", MATCH_MOVE_BOF); ADD_INT_CONSTANT("_const_Here", MATCH_FAIL_HERE); ADD_INT_CONSTANT("_const_ThisTable", MATCH_THISTABLE); ADD_INT_CONSTANT("_const_Break", MATCH_LOOPCONTROL_BREAK); ADD_INT_CONSTANT("_const_Reset", MATCH_LOOPCONTROL_RESET); DPRINTF("sizeof(string_charset)=%i bytes\n", sizeof(string_charset)); #ifdef HAVE_UNICODE DPRINTF("sizeof(unicode_charset)=%i bytes\n", sizeof(unicode_charset)); #endif /* We are now initialized */ mxTextTools_Initialized = 1; return module; } #if PY_MAJOR_VERSION >= 3 PyMODINIT_FUNC PyInit_mxTextTools(void) { return mxTextToolsModule_Initialize(); } #else MX_EXPORT(void) initmxTextTools(void) { mxTextToolsModule_Initialize(); } #endif SimpleParse-2.2.0/simpleparse/stt/TextTools/mxTextTools/highcommands.h0000644000175000017500000001415412620706017027570 
0ustar mcfletchmcfletch00000000000000/* non-recursive high-level commands The contract here is: The commands may alter any of the tag-specific variables errors may be indicated if encountered in childReturnCode and the error* variables */ case MATCH_SWORDSTART: case MATCH_SWORDEND: case MATCH_SFINDWORD: /* these items basically follow the low-level contract, with the only exception being that MATCH_SFINDWORD will change childStart */ { Py_ssize_t wordstart, wordend; int returnCode; DPRINTF("\nsWordStart/End/sFindWord :\n" " in string = '%.40s'\n",text+childPosition); childStart = childPosition; returnCode = TE_SEARCHAPI( match, text, childStart, sliceright, &wordstart, &wordend ); if (returnCode < 0) { childReturnCode = ERROR_CODE; errorType = PyExc_SystemError; errorMessage = PyString_FromFormat( "Search-object search returned value < 0 (%i): probable bug in text processing engine", returnCode ); } else if (returnCode == 0) { /* not matched */ DPRINTF(" (no success)\n"); childReturnCode = FAILURE_CODE; } else { /* matched, adjust childPosition according to the word start/end/find requirements */ if (command == MATCH_SWORDSTART) { childPosition = wordstart; } else { childPosition = wordend; } if (command == MATCH_SFINDWORD) { /* XXX logic problem with lookahead should it reset to real childStart or the fake one created here? */ childStart = wordstart; } DPRINTF(" [%i:%i] (matched and remembered this slice)\n", childStart,childPosition); } break; } case MATCH_LOOP: /* No clue what this is supposed to do, real surprising if it works... 
*/ DPRINTF("\nLoop: pre loop counter = %i\n",loopcount); if (loopcount > 0) { /* we are inside a loop */ loopcount--; } else if (loopcount < 0) { /* starting a new loop */ if (PyInt_Check(match)) { loopcount = PyInt_AS_LONG(match); loopstart = childPosition; } else { childReturnCode = ERROR_CODE; errorType = PyExc_TypeError; errorMessage = PyString_FromFormat( "Tag Table entry %d: expected an integer (command=Loop) got a %.50s", (unsigned int)index, Py_TYPE(match)->tp_name ); } } if (childReturnCode == NULL_CODE ) { if (loopcount == 0) { /* finished loop */ loopcount = -1; } if (loopstart == childPosition) { /* not matched */ childReturnCode = FAILURE_CODE; } else { childReturnCode = SUCCESS_CODE; /* on success, add match from start of the whole loop to end of current iteration? Would be really good if I had a clue what this is supposed to do :) . */ childStart = loopstart; } DPRINTF("\nloop: post loop counter = %i\n",loopcount); } break; case MATCH_LOOPCONTROL: DPRINTF("\nLoopControl: loop counter = %i, " "setting it to = %li\n", loopcount,PyInt_AS_LONG(match)); loopcount = PyInt_AS_LONG(match); break; case MATCH_CALL: case MATCH_CALLARG: /* call and callarg actually follow the low-level contract */ { PyObject *fct = NULL; int argc = -1; if (!PyTuple_Check(match)) { argc = 0; fct = match; } else { argc = PyTuple_GET_SIZE(match) - 1; if (argc < 0) { /* how is this even possible? 
*/ childReturnCode = ERROR_CODE; errorType = PyExc_TypeError; errorMessage = PyString_FromFormat( "Tag Table entry %d: " "expected a tuple (fct,arg0,arg1,...)" "(command=CallArg)", (unsigned int)index ); } else { fct = PyTuple_GET_ITEM(match,0); } } if (childReturnCode == NULL_CODE && PyCallable_Check(fct)) { PyObject *args; register PyObject *w; register Py_ssize_t argIndex; DPRINTF("\nCall[Arg] :\n"); childStart = childPosition; /* Build args = (textobj,childStart,sliceright[,arg0,arg1,...]) */ args = PyTuple_New(3 + argc); if (!args) { childReturnCode = ERROR_CODE; errorType = PyExc_SystemError; errorMessage = PyString_FromFormat( "Unable to create argument tuple for CallArgs command at index %d", (unsigned int)index ); } else { Py_INCREF(textobj); PyTuple_SET_ITEM(args,0,textobj); w = PyInt_FromLong(childStart); if (!w){ childReturnCode = ERROR_CODE; errorType = PyExc_SystemError; errorMessage = PyString_FromFormat( "Unable to convert an integer %d to a Python Integer", (unsigned int)childStart ); } else { PyTuple_SET_ITEM(args,1,w); w = PyInt_FromLong(sliceright); if (!w) { childReturnCode = ERROR_CODE; errorType = PyExc_SystemError; errorMessage = PyString_FromFormat( "Unable to convert an integer %d to a Python Integer", (unsigned int)sliceright ); } else { PyTuple_SET_ITEM(args,2,w); for (argIndex = 0; argIndex < argc; argIndex++) { w = PyTuple_GET_ITEM(match,argIndex + 1); Py_INCREF(w); PyTuple_SET_ITEM(args,3 + argIndex,w); } /* now actually call the object */ w = PyEval_CallObject(fct,args); Py_DECREF(args); if (w == NULL) { childReturnCode = ERROR_CODE; /* child's error should be allowed to propagate */ } else if (!PyInt_Check(w)) { childReturnCode = ERROR_CODE; errorType = PyExc_TypeError; errorMessage = PyString_FromFormat( "Tag Table entry %d: matching function has to return an integer, returned a %.50s", (unsigned int)index, Py_TYPE(w)->tp_name ); } else { childPosition = PyInt_AS_LONG(w); Py_DECREF(w); if (childStart == childPosition) { /* not 
matched */ DPRINTF(" (no success)\n"); childReturnCode = FAILURE_CODE; } } } } } } else { childReturnCode = ERROR_CODE; errorType = PyExc_TypeError; errorMessage = PyString_FromFormat( "Tag Table entry %d: " "expected a callable object, got a %.50s" "(command=Call[Arg])", (unsigned int)index, Py_TYPE(fct)->tp_name ); } break; } SimpleParse-2.2.0/simpleparse/stt/TextTools/mxTextTools/mxbmse.h0000644000175000017500000000303012037615407026415 0ustar mcfletchmcfletch00000000000000#ifndef MXBMSE_H #define MXBMSE_H /* mxbmse -- Fast Boyer Moore Search Algorithm (Version 0.8) The implementation is reentrant and thread safe. While the general idea behind the Boyer Moore algorithm are in the public domain, this implementation falls under the following copyright: Copyright (c) 1997-2000, Marc-Andre Lemburg; mailto:mal@lemburg.com Copyright (c) 2000-2002, eGenix.com Software GmbH; mailto:info@egenix.com All Rights Reserved See the documentation for copying information or contact the author (mal@lemburg.com). 
*/ #ifdef __cplusplus extern "C" { #endif /* --- Fast Boyer-Moore Implementation (8-bit) ---------------------------- */ /* sanity check switches */ /*#define SAFER 1*/ /* SHIFT must have enough bits to store len(match) - using 'char' here makes the routines run 15% slower than with 'int', on the other hand, 'int' is at least 4 times larger than 'char' */ #ifndef BM_SHIFT_TYPE # define BM_SHIFT_TYPE int #endif typedef struct { char *match; int match_len; char *eom; char *pt; BM_SHIFT_TYPE shift[256]; /* char-based shift table */ } mxbmse_data; extern mxbmse_data *bm_init(char *match, int match_len); extern void bm_free(mxbmse_data *c); extern int bm_search(mxbmse_data *c, char *text, int start, int stop); extern int bm_tr_search(mxbmse_data *c, char *text, int start, int stop, char *tr); #define BM_MATCH_LEN(bm) ((mxbmse_data *)bm)->match_len /* EOF */ #ifdef __cplusplus } #endif #endif SimpleParse-2.2.0/simpleparse/stt/TextTools/mxTextTools/speccommands.h0000644000175000017500000000625012037615407027605 0ustar mcfletchmcfletch00000000000000/* "Special" commands code fragment The contract here is: The commands may alter any of the tag-specific variables errors may be indicated if encountered in childReturnCode and the error* variables setting childReturnCode equal to FAILURE_CODE declares that the read head has not moved childReturnCode must be set (or default "you have to move forward to match" semantics are used) */ /* doesn't there need to be a check for integer arguments that the value is an integer? Or does the compiler do that now */ case MATCH_FAIL: /* == MATCH_JUMP */ /* dumb question, what is MATCH_JUMP supposed to do? */ childReturnCode = FAILURE_CODE; break; case MATCH_SKIP: /* Argh, what to do when moves past buffer? Where do we check that this is still in-bounds? documented as always succeeding, but results in result-tuples with negative or out-of-range values in current code. 
Can't do: if (childPosition < sliceleft) { childPosition = 0; } else if (childPosition > sliceright) { childPosition = sliceright; } because we might have another move, or an EOF or whatever coming up. Marc-André want's these conditions: childPosition < 0 { # (not sliceleft!) raise TypeError: Tag Table entry %(index): moved/skipped beyond start of text } and no check for > right or beyond end of buffer... */ DPRINTF("\nSkip %li characters\n" " in string = '%.40s'\n", PyInt_AS_LONG(match),text+childPosition); childPosition += PyInt_AS_LONG(match); childReturnCode = SUCCESS_CODE; break; case MATCH_MOVE: /* same potential out-of-bounds issue as with skip */ childPosition = PyInt_AS_LONG(match); if (childPosition < 0) { /* Relative to end of the slice */ childPosition += sliceright + 1; } else { /* Relative to beginning of the slice */ childPosition += sliceleft; } DPRINTF("\nMove to position %i \n" " string = '%.40s'\n", childPosition,text+childPosition); childReturnCode = SUCCESS_CODE; break; case MATCH_EOF: DPRINTF("\nEOF at position %i ? \n" " string = '%.40s'\n", childPosition,text+childPosition); if (sliceright > childPosition) { /* not matched */ childReturnCode = FAILURE_CODE; } else { /* I don't see why this would necessarily be the end of the parsing run, after all you might want to match EOF, then back up X characters? The documentation doesn't mention anything about such a restriction. Approach here seems to match documentation functionality but still suffers the out-of-range problems seen in move and skip commands as well. */ childReturnCode = SUCCESS_CODE; childPosition = sliceright; childStart = sliceright; } break; case MATCH_JUMPTARGET: /* note: currently this can report a value, though I don't think that was intended originally. 
I see it as useful because it lets you enter a flag in the results table just by specifying a non-None tagobj */ /* null operation */ DPRINTF("\nJumpTarget '%.40s' (skipped)\n", PyString_AsString(match)); childReturnCode = SUCCESS_CODE; break; SimpleParse-2.2.0/simpleparse/stt/TextTools/mxTextTools/mx.h0000644000175000017500000004704712620706017025562 0ustar mcfletchmcfletch00000000000000#ifndef MX_H #define MX_H /* mx -- Marc's eXtension modules for Python: basic macros This file is only meant to be included by the extension modules. DO NOT include it in the extension module's header file, since it will definitely cause troubles then. To enable debugging ceratin things, define one of these before including this file: MAL_REF_DEBUG -- debug reference counts (Py_MY_xxx) [this file] MAL_DEBUG -- enable debug output (DPRINTF) [mxstdlib.h] MAL_MEM_DEBUG -- enable malloc output (new,cnew,free,...) [mxstdlib.h] Copyright (c) 2000, Marc-Andre Lemburg; mailto:mal@lemburg.com Copyright (c) 2000-2002, eGenix.com Software GmbH; mailto:info@egenix.com See the documentation for further copyright information or contact the author. */ /* --- Platform or compiler specific tweaks ------------------------------- */ /* Add some platform specific symbols to enable work-arounds for the static forward declaration of type definitions; note that the GNU C compiler does not have this problem. Many thanks to all who have contributed to this list. */ #if (!defined(__GNUC__)) # if (defined(NeXT) || defined(sgi) || defined(_AIX) || (defined(__osf__) && defined(__DECC)) || defined(TrueCompaq64) || defined(__VMS)) # define BAD_STATIC_FORWARD # endif #endif /* Some more tweaks for various platforms. */ /* VMS needs this define. Thanks to Jean-François PIÉRONNE */ #if defined(__VMS) # define __SC__ #endif /* xlC on AIX doesn't like the Python work-around for static forwards in ANSI mode (default), so we switch on extended mode. 
Thanks to Albert Chin-A-Young */ #if defined(__xlC__) # pragma langlvl extended #endif /* --- Standard header files ---------------------------------------------- */ /* Include the generic mx header file */ #include "mxh.h" /* Include nearly all Python symbols & definitions */ #include "Python.h" /* Include other standard stuff */ #include "mxstdlib.h" /* Include Python backward compatibility stuff */ #include "mxpyapi.h" /* --- Compiler support --------------------------------------------------- */ /* Support for compilers which don't like static forward declarations of arrays; Python 2.3 removed the support for BAD_STATIC_FORWARD which is why we now use our own little helpers here. */ #undef staticforward #undef statichere #ifdef BAD_STATIC_FORWARD # define staticforward extern # define statichere static #else # define staticforward static # define statichere static #endif /* --- Declare macros ----------------------------------------------------- */ #define Py_NONE (Py_INCREF(Py_None),Py_None) #ifdef MAL_REF_DEBUG # define printref(x) printf("* refcount for "#x" = %i\n",(long) x->ob_refcnt); #else # define printref(x) #endif /* --- Error handling ----------------------------------------------------- */ #define Py_Do(x) {if (!(x)) goto onError;} #define Py_ReturnOnError(errortype,errorstr) {PyErr_SetString(errortype,errorstr);return NULL;} #define Py_Assert(x,errortype,errorstr) {if (!(x)) {PyErr_SetString(errortype,errorstr);goto onError;}} #define Py_AssertWithArg(x,errortype,errorstr,a1) {if (!(x)) {PyErr_Format(errortype,errorstr,a1);goto onError;}} #define Py_AssertWith2Args(x,errortype,errorstr,a1,a2) {if (!(x)) {PyErr_Format(errortype,errorstr,a1,a2);goto onError;}} #define Py_AssertWith3Args(x,errortype,errorstr,a1,a2,a3) {if (!(x)) {PyErr_Format(errortype,errorstr,a1,a2,a3);goto onError;}} #define Py_Error(errortype,errorstr) {PyErr_SetString(errortype,errorstr);goto onError;} #define Py_ErrorWithArg(errortype,errorstr,a1) 
{PyErr_Format(errortype,errorstr,a1);goto onError;} #define Py_ErrorWith2Args(errortype,errorstr,a1,a2) {PyErr_Format(errortype,errorstr,a1,a2);goto onError;} #define Py_ErrorWith3Args(errortype,errorstr,a1,a2,a3) {PyErr_Format(errortype,errorstr,a1,a2,a3);goto onError;} /* --- Reference counting ------------------------------------------------- */ #ifdef MAL_REF_DEBUG static void mx_Py_INCREF(PyObject *v, char *name, char *filename, int lineno) { if (!Py_DebugFlag) { Py_XINCREF(v); return; } if (!v) mxDebugPrintf("[%s:%5i] Py_XINCREF( %-8s == NULL );\n", filename,lineno,name); else { Py_INCREF(v);; mxDebugPrintf("[%s:%5i] Py_XINCREF( %-8s at 0x%x [%s]); " "new refcount = %i\n", filename,lineno,name,(int)v,Py_TYPE(v)->tp_name, v->ob_refcnt); } } static void mx_Py_DECREF(PyObject *v, char *name, char *filename, int lineno) { if (!Py_DebugFlag) { Py_XDECREF(v); return; } if (!v) mxDebugPrintf("[%s:%5i] Py_XDECREF( %-8s == NULL );\n", filename,lineno,name); else { int refcnt = v->ob_refcnt; Py_DECREF(v); if (refcnt <= 1) mxDebugPrintf("[%s:%5i] Py_XDECREF( %-8s at 0x%x [%s]); " "object deleted\n", filename,lineno,name,(int)v,Py_TYPE(v)->tp_name); else mxDebugPrintf("[%s:%5i] Py_XDECREF( %-8s at 0x%x [%s]); " "new refcount = %i\n", filename,lineno,name,(int)v,Py_TYPE(v)->tp_name, v->ob_refcnt); } } static void mx_Py_PRINT_REFCOUNT(PyObject *v, char *name, char *filename, int lineno) { if (!v) mxDebugPrintf("[%s:%5i] Py_PRINT_REFCOUNT( %-8s == NULL );\n", filename,lineno,name); else { mxDebugPrintf("[%s:%5i] Py_PRINT_REFCOUNT( %-8s at 0x%x [%s]) = %i;\n", filename,lineno,name,(int)v,Py_TYPE(v)->tp_name, v->ob_refcnt); } } # undef Py_INCREF # define Py_INCREF(x) mx_Py_INCREF((PyObject *)x,#x,__FILE__,__LINE__) # undef Py_DECREF # define Py_DECREF(x) mx_Py_DECREF((PyObject *)x,#x,__FILE__,__LINE__) # undef Py_XINCREF # define Py_XINCREF(x) mx_Py_INCREF((PyObject *)x,#x,__FILE__,__LINE__) # undef Py_XDECREF # define Py_XDECREF(x) mx_Py_DECREF((PyObject 
*)x,#x,__FILE__,__LINE__) # define Py_DELETE(x) {if (x->ob_refcnt > 1) mxDebugPrintf("[%s:%5i] Py_DELETE( "#x" ) WARNING: Refcount = %i > 1\n",__FILE__,__LINE__,(int)x->ob_refcnt);Py_DECREF(x);} # define Py_PRINT_REFCOUNT(x) mx_Py_PRINT_REFCOUNT((PyObject *)x,#x,__FILE__,__LINE__) #else # define Py_DELETE(x) Py_DECREF(x) # define Py_PRINT_REFCOUNT(x) #endif #define Py_DEC_REF(x) {Py_XDECREF(x); x=0;} /* doing this once too often doesn't hurt */ /* Unreference a Python object. This is only used in Python debug builds and needed to keep track of all allocated references. Use in object constructors or free list implementations. */ #ifndef _Py_DEC_REFTOTAL # ifdef Py_REF_DEBUG # define _Py_DEC_REFTOTAL _Py_RefTotal-- # else # define _Py_DEC_REFTOTAL # endif #endif #define mxPy_UNREF(x) _Py_DEC_REFTOTAL /* --- Argument passing and checking -------------------------------------- */ /* No arguments expected; also use Py_MethodListEntryNoArgs() for this kind of fct; this check is no longer needed in Python 2.3 and later */ #if PY_VERSION_HEX >= 0x02030000 # define Py_NoArgsCheck() {if (0) goto onError;} #else # define Py_NoArgsCheck() {if (!PyArg_NoArgs(args)) goto onError;} #endif /* For functions with old style args (Py_MethodListEntrySingleArg) */ #define Py_GetArgObject(a) {a = args; if (!a) {PyErr_SetString(PyExc_TypeError,"function/method requires an argument"); goto onError;}} #define Py_GetSingleArg(format,a1) {if (!PyArg_Parse(args,format,&a1)) goto onError;} /* For functions with new style args: */ #define Py_GetArg(format,a1) {if (!PyArg_ParseTuple(args,format,&a1)) goto onError;} #define Py_Get2Args(format,a1,a2) {if (!PyArg_ParseTuple(args,format,&a1,&a2)) goto onError;} #define Py_Get3Args(format,a1,a2,a3) {if (!PyArg_ParseTuple(args,format,&a1,&a2,&a3)) goto onError;} #define Py_Get4Args(format,a1,a2,a3,a4) {if (!PyArg_ParseTuple(args,format,&a1,&a2,&a3,&a4)) goto onError;} #define Py_Get5Args(format,a1,a2,a3,a4,a5) {if 
(!PyArg_ParseTuple(args,format,&a1,&a2,&a3,&a4,&a5)) goto onError;} #define Py_Get6Args(format,a1,a2,a3,a4,a5,a6) {if (!PyArg_ParseTuple(args,format,&a1,&a2,&a3,&a4,&a5,&a6)) goto onError;} #define Py_Get7Args(format,a1,a2,a3,a4,a5,a6,a7) {if (!PyArg_ParseTuple(args,format,&a1,&a2,&a3,&a4,&a5,&a6,&a7)) goto onError;} #define Py_Get8Args(format,a1,a2,a3,a4,a5,a6,a7,a8) {if (!PyArg_ParseTuple(args,format,&a1,&a2,&a3,&a4,&a5,&a6,&a7,&a8)) goto onError;} /* For functions with keywords -- the first macro parameter must be the keywords array given as e.g. static char *keywords[] = {"first","second","third", 0}; with an entry for every argument (in the correct order). The functions must be included in the method list using Py_MethodWithKeywordsListEntry() and be declared as Py_C_Function_WithKeywords(). */ #define Py_KeywordGetArg(keywords,format,a1) {if (!PyArg_ParseTupleAndKeywords(args,kws,format,keywords,&a1)) goto onError;} #define Py_KeywordGet2Args(keywords,format,a1,a2) {if (!PyArg_ParseTupleAndKeywords(args,kws,format,keywords,&a1,&a2)) goto onError;} #define Py_KeywordGet3Args(keywords,format,a1,a2,a3) {if (!PyArg_ParseTupleAndKeywords(args,kws,format,keywords,&a1,&a2,&a3)) goto onError;} #define Py_KeywordGet4Args(keywords,format,a1,a2,a3,a4) {if (!PyArg_ParseTupleAndKeywords(args,kws,format,keywords,&a1,&a2,&a3,&a4)) goto onError;} #define Py_KeywordGet5Args(keywords,format,a1,a2,a3,a4,a5) {if (!PyArg_ParseTupleAndKeywords(args,kws,format,keywords,&a1,&a2,&a3,&a4,&a5)) goto onError;} #define Py_KeywordGet6Args(keywords,format,a1,a2,a3,a4,a5,a6) {if (!PyArg_ParseTupleAndKeywords(args,kws,format,keywords,&a1,&a2,&a3,&a4,&a5,&a6)) goto onError;} #define Py_KeywordGet7Args(keywords,format,a1,a2,a3,a4,a5,a6,a7) {if (!PyArg_ParseTupleAndKeywords(args,kws,format,keywords,&a1,&a2,&a3,&a4,&a5,&a6,&a7)) goto onError;} #define Py_KeywordGet8Args(keywords,format,a1,a2,a3,a4,a5,a6,a7,a8) {if 
(!PyArg_ParseTupleAndKeywords(args,kws,format,keywords,&a1,&a2,&a3,&a4,&a5,&a6,&a7,&a8)) goto onError;} /* New style macros fof functions supporting keywords -- the C variable names are used as template for the keyword list, i.e. they must match the Python keyword parameter names. Note that format strings with special parameters (e.g. "#s") are not allowed since they would cause the keyword list to be out of sync. The functions must be included in the method list using Py_MethodWithKeywordsListEntry() and be declared as Py_C_Function_WithKeywords(). Example: Py_C_Function_WithKeywords( myfunction, "myfunction(filename,dupkeys=0,filemode=0,sectorsize=512)\n\n" "Returns a myobject" ) { char *filename; int sectorsize = 512; int dupkeys = 0; int filemode = 0; Py_KeywordsGet4Args("s|iii", filename,dupkeys,filemode,sectorsize); return (PyObject *)myobject_New(filename, filemode, sectorsize, dupkeys); onError: return NULL; } */ #define Py_KeywordsGetArg(format,a1) {static char *kwslist[] = {#a1,NULL}; if (!PyArg_ParseTupleAndKeywords(args,kws,format,kwslist,&a1)) goto onError;} #define Py_KeywordsGet2Args(format,a1,a2) {static char *kwslist[] = {#a1,#a2,NULL}; if (!PyArg_ParseTupleAndKeywords(args,kws,format,kwslist,&a1,&a2)) goto onError;} #define Py_KeywordsGet3Args(format,a1,a2,a3) {static char *kwslist[] = {#a1,#a2,#a3,NULL}; if (!PyArg_ParseTupleAndKeywords(args,kws,format,kwslist,&a1,&a2,&a3)) goto onError;} #define Py_KeywordsGet4Args(format,a1,a2,a3,a4) {static char *kwslist[] = {#a1,#a2,#a3,#a4,NULL}; if (!PyArg_ParseTupleAndKeywords(args,kws,format,kwslist,&a1,&a2,&a3,&a4)) goto onError;} #define Py_KeywordsGet5Args(format,a1,a2,a3,a4,a5) {static char *kwslist[] = {#a1,#a2,#a3,#a4,#a5,NULL}; if (!PyArg_ParseTupleAndKeywords(args,kws,format,kwslist,&a1,&a2,&a3,&a4,&a5)) goto onError;} #define Py_KeywordsGet6Args(format,a1,a2,a3,a4,a5,a6) {static char *kwslist[] = {#a1,#a2,#a3,#a4,#a5,#a6,NULL}; if 
(!PyArg_ParseTupleAndKeywords(args,kws,format,kwslist,&a1,&a2,&a3,&a4,&a5,&a6)) goto onError;} #define Py_KeywordsGet7Args(format,a1,a2,a3,a4,a5,a6,a7) {static char *kwslist[] = {#a1,#a2,#a3,#a4,#a5,#a6,#a7,NULL}; if (!PyArg_ParseTupleAndKeywords(args,kws,format,kwslist,&a1,&a2,&a3,&a4,&a5,&a6,&a7)) goto onError;} #define Py_KeywordsGet8Args(format,a1,a2,a3,a4,a5,a6,a7,a8) {static char *kwslist[] = {#a1,#a2,#a3,#a4,#a5,#a6,#a7,#a8,NULL}; if (!PyArg_ParseTupleAndKeywords(args,kws,format,kwslist,&a1,&a2,&a3,&a4,&a5,&a6,&a7,&a8)) goto onError;} /* --- Returning values to Python ----------------------------------------- */ /* XXX Don't always work: every time you have an 'O' in the BuildValue format string, you need to DECREF the variable *after* the tuple has been built !!! */ #define Py_ReturnNone() {Py_INCREF(Py_None);return Py_None;} #define Py_ReturnTrue() {Py_INCREF(Py_True);return Py_True;} #define Py_ReturnFalse() {Py_INCREF(Py_False);return Py_False;} #define Py_ReturnArg(format,a1) return Py_BuildValue(format,a1); #define Py_Return Py_ReturnArg #define Py_Return2Args(format,a1,a2) return Py_BuildValue(format,a1,a2); #define Py_Return2 Py_Return2Args #define Py_Return3Args(format,a1,a2,a3) return Py_BuildValue(format,a1,a2,a3); #define Py_Return3 Py_Return3Args #define Py_Return4Args(format,a1,a2,a3) return Py_BuildValue(format,a1,a2,a3,a4); #define Py_Return5Args(format,a1,a2,a3) return Py_BuildValue(format,a1,a2,a3,a4,a5); #define Py_Return6Args(format,a1,a2,a3) return Py_BuildValue(format,a1,a2,a3,a4,a5,a6); #define Py_Return7Args(format,a1,a2,a3) return Py_BuildValue(format,a1,a2,a3,a4,a5,a6,a7); /* Build values */ #define Py_BuildNone() Py_NONE #define Py_Build(format,x) Py_BuildValue(format,x) #define Py_Build2(format,x,y) Py_BuildValue(format,x,y) #define Py_Build3(format,x,y,z) Py_BuildValue(format,x,y,z) /* --- Declaring Python builtin functions/methods ------------------------- */ /* Declare C function/method fct, having docstring docstr; may use 
vargargs */ #define Py_C_Function(fct,docstr) \ static char fct##_docstring[] = docstr;\ static PyObject *fct(PyObject *self, PyObject *args) /* Declare C function/method fct, having keywords keywordsarray and a docstring docstr; may use vargargs & keywords */ #define Py_C_Function_WithKeywords(fct,docstr) \ static char fct##_docstring[] = docstr;\ static PyObject *fct(PyObject *self, PyObject *args, PyObject *kws) /* These declare: self -- instance pointer for methods, NULL for functions args -- argument tuple kws -- keywords dict (if applicable) plus as statics: _docstring -- the docstring as given _keywords -- the keyword array as given Note: use the Py_GetArg macros for functions without keywords, and Py_KeywordGetArg macros for functions with keywords */ /* --- Method list entries for builtin functions/methods ------------------ */ /* Add a C function/method cname to the module dict as pyname; no doc-string */ #define Py_MethodListEntryAny(pyname,cname) {pyname,(PyCFunction)cname,METH_VARARGS} /* Add a C function/method cname to the module dict as pyname; the function can use varargs */ #define Py_MethodListEntry(pyname,cname) {pyname,(PyCFunction)cname,METH_VARARGS,cname##_docstring} /* Add a C function/method cname to the module dict as pyname; the function takes no args; in Python 2.3 a new flag was added for these which implements the no args check in the interpreter itself. */ #ifdef METH_NOARGS # define Py_MethodListEntryNoArgs(pyname,cname) {pyname,(PyCFunction)cname,METH_NOARGS,cname##_docstring} #else # define Py_MethodListEntryNoArgs(pyname,cname) {pyname,(PyCFunction)cname,0,cname##_docstring} #endif /* Add a C function/method cname to the module dict as pyname; the function takes one argument: the object is passed in directly (without wrapping it into a tuple first), i.e. don't use the Py_GetArg-macros or PyArg_ParseTuple(). 
*/ #define Py_MethodListEntrySingleArg(pyname,cname) {pyname,(PyCFunction)cname,0,cname##_docstring} /* Add a C function/method that uses keywords to the module dict */ #define Py_MethodWithKeywordsListEntry(pyname,cname) {pyname,(PyCFunction)cname,METH_VARARGS | METH_KEYWORDS,cname##_docstring} /* --- Sequence slicing --------------------------------------------------- */ /* Check a given slice and apply the usual rules for negative indices */ #define Py_CheckSequenceSlice(len,start,stop) { \ if (stop > len) \ stop = len; \ else { \ if (stop < 0) \ stop += len; \ if (stop < 0) \ stop = 0; \ } \ if (start < 0) { \ start += len; \ if (start < 0) \ start = 0; \ } \ if (stop < start) \ start = stop; \ } /* --- Text macros -------------------------------------------------------- */ /* Check a given text slice and apply the usual rules for negative indices */ #define Py_CheckBufferSlice(textlen,start,stop) \ Py_CheckSequenceSlice(textlen,start,stop) /* Dito for string objects */ #define Py_CheckStringSlice(textobj,start,stop) \ Py_CheckSequenceSlice(PyString_GET_SIZE(textobj),start,stop) /* For b/w compatibility */ #define Py_CheckSlice(textobj,start,stop) \ Py_CheckStringSlice(textobj,start,stop) /* Dito for Unicode objects */ #ifdef PyUnicode_GET_SIZE # define Py_CheckUnicodeSlice(unicode,start,stop) \ Py_CheckSequenceSlice(PyUnicode_GET_SIZE(unicode),start,stop) #endif /* This assumes that fixed is a constant char array; the strcmp function is only called in case the attribute name length exceeds 10 characters and the first 10 characters match; optimizing compilers should eliminate any unused parts of this comparison automatically. Note: The latest egcs compiler warns about the subscripts being out of range for shorter fixed strings; since no code is generated for those comparisons, these warning can safely be ignored. Still, they are annoying. See the Py_StringsCompareEqual() macro below for a way to work around this. 
*/ #define Py_StringsCompareEqualEx(var,fixed,fixedsize) \ (var[0] == fixed[0] && \ (fixed[0] == 0 || \ (fixedsize >= 1 && (var[1] == fixed[1] && \ (fixed[1] == 0 || \ (fixedsize >= 2 && (var[2] == fixed[2] && \ (fixed[2] == 0 || \ (fixedsize >= 3 && (var[3] == fixed[3] && \ (fixed[3] == 0 || \ (fixedsize >= 4 && (var[4] == fixed[4] && \ (fixed[4] == 0 || \ (fixedsize >= 5 && (var[5] == fixed[5] && \ (fixed[5] == 0 || \ (fixedsize >= 6 && (var[6] == fixed[6] && \ (fixed[6] == 0 || \ (fixedsize >= 7 && (var[7] == fixed[7] && \ (fixed[7] == 0 || \ (fixedsize >= 8 && (var[8] == fixed[8] && \ (fixed[8] == 0 || \ (fixedsize >= 9 && (var[9] == fixed[9] && \ (fixed[9] == 0 || \ (fixedsize >= 10 && \ strcmp(&var[10],&fixed[10]) == 0 \ )))))))))))))))))))))))))))))) /* This assumes that fixed is a constant char array. The appended string snippet is to shut up the warnings produced by newer egcs/gcc compilers about offsets being outside bounds. Note that some compilers do the inlining by themselves or don't like the above trick (OpenVMS is one such platform). For these we simply use the standard way. */ #ifndef __VMS # define Py_StringsCompareEqual(var,fixed) \ Py_StringsCompareEqualEx(var,fixed"\0\0\0\0\0\0\0\0\0\0",sizeof(fixed)) #else # define Py_StringsCompareEqual(var,fixed) (strcmp(var, fixed) == 0) #endif /* Fast character set member check; set must be a "static unsigned *char set" array of exactly 32 bytes length generated with TextTools.set() */ #define Py_CharInSet(chr,set) \ (((unsigned char)(set)[(unsigned char)(chr) >> 3] & \ (1 << ((unsigned char)(chr) & 7))) != 0) /* --- SWIG addons -------------------------------------------------------- */ /* Throw this error after having set the correct Python exception using e.g. 
PyErr_SetString(); */ #define mxSWIGError "mxSWIGError" /* EOF */ #endif SimpleParse-2.2.0/simpleparse/stt/TextTools/mxTextTools/mxpyapi.h0000644000175000017500000000236012620706017026612 0ustar mcfletchmcfletch00000000000000#ifndef MXPYAPI_H #define MXPYAPI_H /* mxpyapi.h This header file includes some new APIs that are not available in older API versions, yet are used by the mx-Extensions. Copyright (c) 2000, Marc-Andre Lemburg; mailto:mal@lemburg.com Copyright (c) 2000-2003, eGenix.com Software GmbH; mailto:info@egenix.com */ #if defined(PyUnicode_Check) && !defined(HAVE_UNICODE) # define HAVE_UNICODE #endif #if defined(HAVE_UNICODE) && !defined(Py_USING_UNICODE) # undef HAVE_UNICODE #endif #ifndef HAVE_UNICODE # undef PyUnicode_Check # define PyUnicode_Check(obj) 0 #endif #if PY_MAJOR_VERSION >= 3 #define PyInt_FromLong PyLong_FromLong #define PyInt_Check PyLong_Check #define PyInt_AS_LONG PyLong_AS_LONG #define PyString_FromStringAndSize PyBytes_FromStringAndSize #define PyString_AsString PyBytes_AsString #define PyString_FromString PyBytes_FromString #define PyString_Check PyBytes_Check #define PyString_FromFormat PyBytes_FromFormat #define PyString_GET_SIZE PyBytes_GET_SIZE #define PyString_AS_STRING PyBytes_AS_STRING #define _PyString_Resize _PyBytes_Resize #endif /* EOF */ #endif SimpleParse-2.2.0/simpleparse/stt/TextTools/mxTextTools/mxTextTools.def0000644000175000017500000000003112037615407027741 0ustar mcfletchmcfletch00000000000000EXPORTS initmxTextTools SimpleParse-2.2.0/simpleparse/stt/TextTools/mxTextTools/mxte.c0000644000175000017500000000376512037615407026111 0ustar mcfletchmcfletch00000000000000/* mxte -- A table driven tagging engine for Python (Version 0.9) Copyright (c) 2000, Marc-Andre Lemburg; mailto:mal@lemburg.com Copyright (c) 2000-2002, eGenix.com Software GmbH; mailto:info@egenix.com */ /* Debugging switches */ /*#define MAL_DEBUG*/ /*#define MAL_REF_DEBUG*/ /* Logging file used by debugging facility */ #ifndef MAL_DEBUG_OUTPUTFILE # 
define MAL_DEBUG_OUTPUTFILE "mxTagEngine.log" #endif #include "mx.h" #include "mxstdlib.h" #include "mxTextTools.h" /* --- Tagging Engine --- 8-bit String version ---------------------------- */ #undef TE_STRING_CHECK #define TE_STRING_CHECK(obj) PyString_Check(obj) #undef TE_STRING_AS_STRING #define TE_STRING_AS_STRING(obj) PyString_AS_STRING(obj) #undef TE_STRING_GET_SIZE #define TE_STRING_GET_SIZE(obj) PyString_GET_SIZE(obj) #undef TE_STRING_FROM_STRING #define TE_STRING_FROM_STRING(str, size) PyString_FromStringAndSize(str, size) #undef TE_CHAR #define TE_CHAR char #undef TE_HANDLE_MATCH #define TE_HANDLE_MATCH string_handle_match #undef TE_ENGINE_API #define TE_ENGINE_API mxTextTools_TaggingEngine #undef TE_TABLETYPE #define TE_TABLETYPE MXTAGTABLE_STRINGTYPE #undef TE_SEARCHAPI #define TE_SEARCHAPI mxTextSearch_SearchBuffer #include "mxte_impl.h" /* --- Tagging Engine --- Unicode version --------------------------------- */ #ifdef HAVE_UNICODE #undef TE_STRING_CHECK #define TE_STRING_CHECK(obj) PyUnicode_Check(obj) #undef TE_STRING_AS_STRING #define TE_STRING_AS_STRING(obj) PyUnicode_AS_UNICODE(obj) #undef TE_STRING_GET_SIZE #define TE_STRING_GET_SIZE(obj) PyUnicode_GET_SIZE(obj) #undef TE_STRING_FROM_STRING #define TE_STRING_FROM_STRING(str, size) PyUnicode_FromUnicode(str, size) #undef TE_CHAR #define TE_CHAR Py_UNICODE #undef TE_HANDLE_MATCH #define TE_HANDLE_MATCH unicode_handle_match #undef TE_ENGINE_API #define TE_ENGINE_API mxTextTools_UnicodeTaggingEngine #undef TE_TABLETYPE #define TE_TABLETYPE MXTAGTABLE_UNICODETYPE #undef TE_SEARCHAPI #define TE_SEARCHAPI mxTextSearch_SearchUnicode #include "mxte_impl.h" #endif SimpleParse-2.2.0/simpleparse/stt/TextTools/mxTextTools/Makefile.pre.in0000644000175000017500000002374012037615407027615 0ustar mcfletchmcfletch00000000000000# Universal Unix Makefile for Python extensions # ============================================= # Short Instructions # ------------------ # 1. Build and install Python (1.5 or newer). 
# 2. "make -f Makefile.pre.in boot" # 3. "make" # You should now have a shared library. # Long Instructions # ----------------- # Build *and install* the basic Python 1.5 distribution. See the # Python README for instructions. (This version of Makefile.pre.in # only withs with Python 1.5, alpha 3 or newer.) # Create a file Setup.in for your extension. This file follows the # format of the Modules/Setup.dist file; see the instructions there. # For a simple module called "spam" on file "spammodule.c", it can # contain a single line: # spam spammodule.c # You can build as many modules as you want in the same directory -- # just have a separate line for each of them in the Setup.in file. # If you want to build your extension as a shared library, insert a # line containing just the string # *shared* # at the top of your Setup.in file. # Note that the build process copies Setup.in to Setup, and then works # with Setup. It doesn't overwrite Setup when Setup.in is changed, so # while you're in the process of debugging your Setup.in file, you may # want to edit Setup instead, and copy it back to Setup.in later. # (All this is done so you can distribute your extension easily and # someone else can select the modules they actually want to build by # commenting out lines in the Setup file, without editing the # original. Editing Setup is also used to specify nonstandard # locations for include or library files.) # Copy this file (Misc/Makefile.pre.in) to the directory containing # your extension. # Run "make -f Makefile.pre.in boot". This creates Makefile # (producing Makefile.pre and sedscript as intermediate files) and # config.c, incorporating the values for sys.prefix, sys.exec_prefix # and sys.version from the installed Python binary. For this to work, # the python binary must be on your path. 
If this fails, try # make -f Makefile.pre.in Makefile VERSION=1.5 installdir= # where is the prefix used to install Python for installdir # (and possibly similar for exec_installdir=). # Note: "make boot" implies "make clobber" -- it assumes that when you # bootstrap you may have changed platforms so it removes all previous # output files. # If you are building your extension as a shared library (your # Setup.in file starts with *shared*), run "make" or "make sharedmods" # to build the shared library files. If you are building a statically # linked Python binary (the only solution of your platform doesn't # support shared libraries, and sometimes handy if you want to # distribute or install the resulting Python binary), run "make # python". # Note: Each time you edit Makefile.pre.in or Setup, you must run # "make Makefile" before running "make". # Hint: if you want to use VPATH, you can start in an empty # subdirectory and say (e.g.): # make -f ../Makefile.pre.in boot srcdir=.. VPATH=.. # === Bootstrap variables (edited through "make boot") === # The prefix used by "make inclinstall libainstall" of core python installdir= /usr/local # The exec_prefix used by the same exec_installdir=$(installdir) # Source directory and VPATH in case you want to use VPATH. # (You will have to edit these two lines yourself -- there is no # automatic support as the Makefile is not generated by # config.status.) srcdir= . VPATH= . # === Variables that you may want to customize (rarely) === # (Static) build target TARGET= python # Installed python binary (used only by boot target) PYTHON= python # Add more -I and -D options here CFLAGS= $(OPT) -I$(INCLUDEPY) -I$(EXECINCLUDEPY) $(DEFS) # These two variables can be set in Setup to merge extensions. # See example[23]. 
BASELIB= BASESETUP= # === Variables set by makesetup === MODOBJS= _MODOBJS_ MODLIBS= _MODLIBS_ # === Definitions added by makesetup === # === Variables from configure (through sedscript) === VERSION= @VERSION@ CC= @CC@ LINKCC= @LINKCC@ SGI_ABI= @SGI_ABI@ OPT= @OPT@ LDFLAGS= @LDFLAGS@ LDLAST= @LDLAST@ DEFS= @DEFS@ LIBS= @LIBS@ LIBM= @LIBM@ LIBC= @LIBC@ RANLIB= @RANLIB@ MACHDEP= @MACHDEP@ SO= @SO@ LDSHARED= @LDSHARED@ CCSHARED= @CCSHARED@ LINKFORSHARED= @LINKFORSHARED@ #@SET_CCC@ # Install prefix for architecture-independent files prefix= /usr/local # Install prefix for architecture-dependent files exec_prefix= $(prefix) # Uncomment the following two lines for AIX #LINKCC= $(LIBPL)/makexp_aix $(LIBPL)/python.exp "" $(LIBRARY); $(PURIFY) $(CC) #LDSHARED= $(LIBPL)/ld_so_aix $(CC) -bI:$(LIBPL)/python.exp # === Fixed definitions === # Shell used by make (some versions default to the login shell, which is bad) SHELL= /bin/sh # Expanded directories BINDIR= $(exec_installdir)/bin LIBDIR= $(exec_prefix)/lib MANDIR= $(installdir)/man INCLUDEDIR= $(installdir)/include SCRIPTDIR= $(prefix)/lib # Detailed destination directories BINLIBDEST= $(LIBDIR)/python$(VERSION) LIBDEST= $(SCRIPTDIR)/python$(VERSION) INCLUDEPY= $(INCLUDEDIR)/python$(VERSION) EXECINCLUDEPY= $(exec_installdir)/include/python$(VERSION) LIBP= $(exec_installdir)/lib/python$(VERSION) DESTSHARED= $(BINLIBDEST)/site-packages LIBPL= $(LIBP)/config PYTHONLIBS= $(LIBPL)/libpython$(VERSION).a MAKESETUP= $(LIBPL)/makesetup MAKEFILE= $(LIBPL)/Makefile CONFIGC= $(LIBPL)/config.c CONFIGCIN= $(LIBPL)/config.c.in SETUP= $(LIBPL)/Setup.local $(LIBPL)/Setup SYSLIBS= $(LIBM) $(LIBC) ADDOBJS= $(LIBPL)/python.o config.o # Portable install script (configure doesn't always guess right) INSTALL= $(LIBPL)/install-sh -c # Shared libraries must be installed with executable mode on some systems; # rather than figuring out exactly which, we always give them executable mode. # Also, making them read-only seems to be a good idea... 
INSTALL_SHARED= ${INSTALL} -m 555 # === Fixed rules === # Default target. This builds shared libraries only default: sharedmods # Build everything all: static sharedmods # Build shared libraries from our extension modules sharedmods: $(SHAREDMODS) # Build a static Python binary containing our extension modules static: $(TARGET) $(TARGET): $(ADDOBJS) lib.a $(PYTHONLIBS) Makefile $(BASELIB) $(LINKCC) $(LDFLAGS) $(LINKFORSHARED) \ $(ADDOBJS) lib.a $(PYTHONLIBS) \ $(LINKPATH) $(BASELIB) $(MODLIBS) $(LIBS) $(SYSLIBS) \ -o $(TARGET) $(LDLAST) install: sharedmods if test ! -d $(DESTSHARED) ; then \ mkdir $(DESTSHARED) ; else true ; fi -for i in X $(SHAREDMODS); do \ if test $$i != X; \ then $(INSTALL_SHARED) $$i $(DESTSHARED)/$$i; \ fi; \ done # Build the library containing our extension modules lib.a: $(MODOBJS) -rm -f lib.a ar cr lib.a $(MODOBJS) -$(RANLIB) lib.a # This runs makesetup *twice* to use the BASESETUP definition from Setup config.c Makefile: Makefile.pre Setup $(BASESETUP) $(MAKESETUP) $(MAKESETUP) \ -m Makefile.pre -c $(CONFIGCIN) Setup -n $(BASESETUP) $(SETUP) $(MAKE) -f Makefile do-it-again # Internal target to run makesetup for the second time do-it-again: $(MAKESETUP) \ -m Makefile.pre -c $(CONFIGCIN) Setup -n $(BASESETUP) $(SETUP) # Make config.o from the config.c created by makesetup config.o: config.c $(CC) $(CFLAGS) -c config.c # Setup is copied from Setup.in *only* if it doesn't yet exist Setup: cp $(srcdir)/Setup.in Setup # Make the intermediate Makefile.pre from Makefile.pre.in Makefile.pre: Makefile.pre.in sedscript sed -f sedscript $(srcdir)/Makefile.pre.in >Makefile.pre # Shortcuts to make the sed arguments on one line P=prefix E=exec_prefix H=Generated automatically from Makefile.pre.in by sedscript. 
L=LINKFORSHARED # Make the sed script used to create Makefile.pre from Makefile.pre.in sedscript: $(MAKEFILE) sed -n \ -e '1s/.*/1i\\/p' \ -e '2s%.*%# $H%p' \ -e '/^VERSION=/s/^VERSION=[ ]*\(.*\)/s%@VERSION[@]%\1%/p' \ -e '/^CC=/s/^CC=[ ]*\(.*\)/s%@CC[@]%\1%/p' \ -e '/^CCC=/s/^CCC=[ ]*\(.*\)/s%#@SET_CCC[@]%CCC=\1%/p' \ -e '/^LINKCC=/s/^LINKCC=[ ]*\(.*\)/s%@LINKCC[@]%\1%/p' \ -e '/^OPT=/s/^OPT=[ ]*\(.*\)/s%@OPT[@]%\1%/p' \ -e '/^LDFLAGS=/s/^LDFLAGS=[ ]*\(.*\)/s%@LDFLAGS[@]%\1%/p' \ -e '/^LDLAST=/s/^LDLAST=[ ]*\(.*\)/s%@LDLAST[@]%\1%/p' \ -e '/^DEFS=/s/^DEFS=[ ]*\(.*\)/s%@DEFS[@]%\1%/p' \ -e '/^LIBS=/s/^LIBS=[ ]*\(.*\)/s%@LIBS[@]%\1%/p' \ -e '/^LIBM=/s/^LIBM=[ ]*\(.*\)/s%@LIBM[@]%\1%/p' \ -e '/^LIBC=/s/^LIBC=[ ]*\(.*\)/s%@LIBC[@]%\1%/p' \ -e '/^RANLIB=/s/^RANLIB=[ ]*\(.*\)/s%@RANLIB[@]%\1%/p' \ -e '/^MACHDEP=/s/^MACHDEP=[ ]*\(.*\)/s%@MACHDEP[@]%\1%/p' \ -e '/^SO=/s/^SO=[ ]*\(.*\)/s%@SO[@]%\1%/p' \ -e '/^LDSHARED=/s/^LDSHARED=[ ]*\(.*\)/s%@LDSHARED[@]%\1%/p' \ -e '/^CCSHARED=/s/^CCSHARED=[ ]*\(.*\)/s%@CCSHARED[@]%\1%/p' \ -e '/^SGI_ABI=/s/^SGI_ABI=[ ]*\(.*\)/s%@SGI_ABI[@]%\1%/p' \ -e '/^$L=/s/^$L=[ ]*\(.*\)/s%@$L[@]%\1%/p' \ -e '/^$P=/s/^$P=\(.*\)/s%^$P=.*%$P=\1%/p' \ -e '/^$E=/s/^$E=\(.*\)/s%^$E=.*%$E=\1%/p' \ $(MAKEFILE) >sedscript echo "/^#@SET_CCC@/d" >>sedscript echo "/^installdir=/s%=.*%= $(installdir)%" >>sedscript echo "/^exec_installdir=/s%=.*%=$(exec_installdir)%" >>sedscript echo "/^srcdir=/s%=.*%= $(srcdir)%" >>sedscript echo "/^VPATH=/s%=.*%= $(VPATH)%" >>sedscript echo "/^LINKPATH=/s%=.*%= $(LINKPATH)%" >>sedscript echo "/^BASELIB=/s%=.*%= $(BASELIB)%" >>sedscript echo "/^BASESETUP=/s%=.*%= $(BASESETUP)%" >>sedscript # Bootstrap target boot: clobber VERSION=`$(PYTHON) -c "import sys; print sys.version[:3]"`; \ installdir=`$(PYTHON) -c "import sys; print sys.prefix"`; \ exec_installdir=`$(PYTHON) -c "import sys; print sys.exec_prefix"`; \ $(MAKE) -f $(srcdir)/Makefile.pre.in VPATH=$(VPATH) srcdir=$(srcdir) \ VERSION=$$VERSION \ installdir=$$installdir \ 
exec_installdir=$$exec_installdir \ Makefile # Handy target to remove intermediate files and backups clean: -rm -f *.o *~ # Handy target to remove everything that is easily regenerated clobber: clean -rm -f *.a tags TAGS config.c Makefile.pre $(TARGET) sedscript -rm -f *.so *.sl so_locations # Handy target to remove everything you don't want to distribute distclean: clobber -rm -f Makefile Setup SimpleParse-2.2.0/simpleparse/stt/TextTools/mxTextTools/mxTextTools.h0000644000175000017500000001610512620706017027437 0ustar mcfletchmcfletch00000000000000#ifndef MXTEXTTOOLS_H #define MXTEXTTOOLS_H /* mxTextTools -- Fast text manipulation routines Copyright (c) 2000, Marc-Andre Lemburg; mailto:mal@lemburg.com Copyright (c) 2000-2002, eGenix.com Software GmbH; mailto:info@egenix.com */ /* The extension's name; must be the same as the init function's suffix */ #define MXTEXTTOOLS_MODULE "mxTextTools" #include "mxbmse.h" #ifdef MXFASTSEARCH # include "private/mxfse.h" #endif /* Include generic mx extension header file */ #include "mxh.h" #ifdef MX_BUILDING_MXTEXTTOOLS # define MXTEXTTOOLS_EXTERNALIZE MX_EXPORT #else # define MXTEXTTOOLS_EXTERNALIZE MX_IMPORT #endif #ifdef __cplusplus extern "C" { #endif /* --- Text Search Object ---------------------------------------*/ /* Algorithm values */ #define MXTEXTSEARCH_BOYERMOORE 0 #define MXTEXTSEARCH_FASTSEARCH 1 #define MXTEXTSEARCH_TRIVIAL 2 typedef struct { PyObject_HEAD PyObject *match; /* Match string object */ PyObject *translate; /* Translate string object or NULL */ int algorithm; /* Algorithm to be used */ void *data; /* Internal data used by the algorithm or NULL */ } mxTextSearchObject; MXTEXTTOOLS_EXTERNALIZE(PyTypeObject) mxTextSearch_Type; #define mxTextSearch_Check(v) \ (Py_TYPE((v)) == &mxTextSearch_Type) /* Exporting these APIs for mxTextTools internal use only ! 
*/ extern Py_ssize_t mxTextSearch_MatchLength(PyObject *self); extern Py_ssize_t mxTextSearch_SearchBuffer(PyObject *self, char *text, Py_ssize_t start, Py_ssize_t stop, Py_ssize_t *sliceleft, Py_ssize_t *sliceright); #ifdef HAVE_UNICODE extern Py_ssize_t mxTextSearch_SearchUnicode(PyObject *self, Py_UNICODE *text, Py_ssize_t start, Py_ssize_t stop, Py_ssize_t *sliceleft, Py_ssize_t *sliceright); #endif /* --- Character Set Object -------------------------------------*/ /* Mode values */ #define MXCHARSET_8BITMODE 0 #define MXCHARSET_UCS2MODE 1 #define MXCHARSET_UCS4MODE 2 typedef struct { PyObject_HEAD PyObject *definition; /* Character set definition */ int mode; /* Operation mode: 0 - 8-bit character lookup 1 - UCS-2 Unicode lookup 2 - UCS-4 Unicode lookup */ void *lookup; /* Lookup table */ } mxCharSetObject; MXTEXTTOOLS_EXTERNALIZE(PyTypeObject) mxCharSet_Type; #define mxCharSet_Check(v) \ (Py_TYPE((v)) == &mxCharSet_Type) /* Exporting these APIs for mxTextTools internal use only ! */ extern int mxCharSet_ContainsChar(PyObject *self, register unsigned char ch); #ifdef HAVE_UNICODE extern int mxCharSet_ContainsUnicodeChar(PyObject *self, register Py_UNICODE ch); #endif extern Py_ssize_t mxCharSet_Match(PyObject *self, PyObject *text, Py_ssize_t start, Py_ssize_t stop, int direction); /* --- Tag Table Object -----------------------------------------*/ typedef struct { PyObject *tagobj; /* Tag object to assign, call, append, etc. 
or NULL */ int cmd; /* Command integer */ int flags; /* Command flags */ PyObject *args; /* Command arguments */ int jne; /* Non-match jump offset */ int je; /* Match jump offset */ } mxTagTableEntry; #define MXTAGTABLE_STRINGTYPE 0 #define MXTAGTABLE_UNICODETYPE 1 typedef struct { PyObject_VAR_HEAD PyObject *definition; /* Reference to the original table definition or NULL; needed for caching */ int tabletype; /* Type of compiled table: 0 - 8-bit string args 1 - Unicode args */ int numentries; /* number of allocated entries */ mxTagTableEntry entry[1]; /* Variable length array of mxTagTableEntry fields */ } mxTagTableObject; MXTEXTTOOLS_EXTERNALIZE(PyTypeObject) mxTagTable_Type; #define mxTagTable_Check(v) \ (Py_TYPE((v)) == &mxTagTable_Type) #define mxTagTable_Type(v) \ (((mxTagTableObject *)(v))->tabletype) #define mxTagTable_Definition(v) \ (((mxTagTableObject *)(v))->definition) /* Exporting these APIs for mxTextTools internal use only ! */ extern PyObject *mxTagTable_New(PyObject *definition, int tabletype, int cacheable); /* --- Tagging Engine -------------------------------------------*/ /* Exporting these APIs for mxTextTools internal use only ! */ /* mxTextTools_TaggingEngine(): a table driven parser engine - return codes: rc = 2: match ok; rc = 1: match failed; rc = 0: error - doesn't check type of passed arguments ! - doesn't increment reference counts of passed objects ! 
*/ extern int mxTextTools_TaggingEngine(PyObject *textobj, Py_ssize_t text_start, Py_ssize_t text_stop, mxTagTableObject *table, PyObject *taglist, PyObject *context, Py_ssize_t *next); extern int mxTextTools_UnicodeTaggingEngine(PyObject *textobj, Py_ssize_t text_start, Py_ssize_t text_stop, mxTagTableObject *table, PyObject *taglist, PyObject *context, Py_ssize_t *next); /* Command integers for cmd; see Constants/TagTable.py for details */ /* Low-level string matching, using the same simple logic: - match has to be a string - they only modify x (the current position in text) */ #define MATCH_ALLIN 11 #define MATCH_ALLNOTIN 12 #define MATCH_IS 13 #define MATCH_ISIN 14 #define MATCH_ISNOTIN 15 #define MATCH_WORD 21 #define MATCH_WORDSTART 22 #define MATCH_WORDEND 23 #define MATCH_ALLINSET 31 #define MATCH_ISINSET 32 #define MATCH_ALLINCHARSET 41 #define MATCH_ISINCHARSET 42 #define MATCH_MAX_LOWLEVEL 99 /* Jumps and other low-level special commands */ #define MATCH_FAIL 100 #define MATCH_JUMP MATCH_FAIL #define MATCH_EOF 101 #define MATCH_SKIP 102 #define MATCH_MOVE 103 #define MATCH_JUMPTARGET 104 #define MATCH_MAX_SPECIALS 199 /* Higher-level string matching */ #define MATCH_SWORDSTART 211 #define MATCH_SWORDEND 212 #define MATCH_SFINDWORD 213 #define MATCH_NOWORD MATCH_SWORDSTART /* Higher-level special commands */ #define MATCH_CALL 201 #define MATCH_CALLARG 202 #define MATCH_TABLE 203 #define MATCH_SUBTABLE 207 #define MATCH_TABLEINLIST 204 #define MATCH_SUBTABLEINLIST 208 #define MATCH_LOOP 205 #define MATCH_LOOPCONTROL 206 /* Special argument integers */ #define MATCH_JUMP_TO 0 #define MATCH_JUMP_MATCHOK 1000000 #define MATCH_JUMP_MATCHFAIL -1000000 #define MATCH_MOVE_EOF -1 #define MATCH_MOVE_BOF 0 #define MATCH_FAIL_HERE 1 #define MATCH_THISTABLE 999 #define MATCH_LOOPCONTROL_BREAK 0 #define MATCH_LOOPCONTROL_RESET -1 /* Flags set in cmd (>=256) */ #define MATCH_CALLTAG (1 << 8) #define MATCH_APPENDTAG (1 << 9) #define MATCH_APPENDTAGOBJ (1 << 10) #define 
MATCH_APPENDMATCH (1 << 11) #define MATCH_LOOKAHEAD (1 << 12) /* EOF */ #ifdef __cplusplus } #endif #endif SimpleParse-2.2.0/simpleparse/stt/TextTools/mxTextTools/mxh.h0000644000175000017500000000320212037615407025717 0ustar mcfletchmcfletch00000000000000#ifndef MXH_H #define MXH_H /* mxh.h -- Generic header file for all mx Extenstions This file should be included by every mx Extension header file and the C file. Copyright (c) 2000, Marc-Andre Lemburg; mailto:mal@lemburg.com Copyright (c) 2000-2002, eGenix.com Software GmbH; mailto:info@egenix.com See the documentation for further copyright information or contact the author. */ /* Macros to control export and import of DLL symbols. We use our own definitions since Python's don't allow specifying both imported and exported symbols at the same time; these defines haven't been thoroughly tested yet, patches are most welcome :-) */ /* Macro to "mark" a symbol for DLL export */ #if (defined(_MSC_VER) && _MSC_VER > 850 \ || defined(__MINGW32__) || defined(__CYGWIN) || defined(__BEOS__)) # ifdef __cplusplus # define MX_EXPORT(type) extern "C" type __declspec(dllexport) # else # define MX_EXPORT(type) extern type __declspec(dllexport) # endif #elif defined(__WATCOMC__) # define MX_EXPORT(type) extern type __export #elif defined(__IBMC__) # define MX_EXPORT(type) extern type _Export #else # define MX_EXPORT(type) extern type #endif /* Macro to "mark" a symbol for DLL import */ #if defined(__BORLANDC__) # define MX_IMPORT(type) extern type __import #elif (defined(_MSC_VER) && _MSC_VER > 850 \ || defined(__MINGW32__) || defined(__CYGWIN) || defined(__BEOS__)) # ifdef __cplusplus # define MX_IMPORT(type) extern "C" type __declspec(dllimport) # else # define MX_IMPORT(type) extern type __declspec(dllimport) # endif #else # define MX_IMPORT(type) extern type #endif /* EOF */ #endif SimpleParse-2.2.0/simpleparse/stt/TextTools/mxTextTools/mxbmse.c0000644000175000017500000001105712037615407026420 0ustar 
mcfletchmcfletch00000000000000/* mxbmse -- Fast Boyer Moore Search Algorithm (Version 0.9) The implementation is reentrant and thread safe. While the general ideas behind the Boyer Moore algorithm are in the public domain, this implementation falls under the following copyright: Copyright (c) 1997-2000, Marc-Andre Lemburg; mailto:mal@lemburg.com Copyright (c) 2000-2002, eGenix.com Software GmbH; mailto:info@egenix.com All Rights Reserved See the documentation for copying information or contact the author (mal@lemburg.com). */ /* to turn on the debugging printfs (DPRINTF):*/ /* #define MAL_DEBUG */ /* Logging file used by debugging facility */ #ifndef MAL_DEBUG_OUTPUTFILE # define MAL_DEBUG_OUTPUTFILE "mxTextSearch.log" #endif #ifdef MAL_DEBUG_WITH_PYTHON # include "mx.h" #endif #include "mxstdlib.h" #include "mxbmse.h" /* --- Fast Boyer-Moore Implementation (8-bit) ---------------------------- */ mxbmse_data *bm_init(char *match, int match_len) { mxbmse_data *c; int i; BM_SHIFT_TYPE *shift; char *m; c = newstruct(mxbmse_data); c->match = match; c->match_len = match_len; c->eom = match + match_len - 1; /* Length 1 matching does not use a shift table */ if (match_len == 1) return c; /* Init shift table */ for ( shift = c->shift, i = 256; i > 0; i--, shift++ ) *shift = (BM_SHIFT_TYPE) match_len; DPRINTF("shift table for match='%s'\n",match); for ( shift = c->shift, m = match, i = match_len - 1; i >= 0; i--, m++ ) { shift[ (unsigned char) *m ] = (BM_SHIFT_TYPE) i; DPRINTF(" char = '%c' shift = %i\n", *m, i); } return c; } void bm_free(mxbmse_data *c) { if (c) free(c); } int bm_search(mxbmse_data *c, char *text, int start, int text_len) { register char *pt; register char *eot = text + text_len; /* Error check */ if (c == NULL) return -1; /* Init text pointer */ pt = text + start + c->match_len - 1; DPRINTF("Init : %2i %20.20s \t text: %2i %20.20s\n", c->match_len,c->match,start,text+start); if (c->match_len > 1) for (;;) { register char *pm; pm = c->eom; for (;pt < eot 
&& *pt != *pm; pt += c->shift[(unsigned char) *pt]); if (pt >= eot) break; /* First char matches.. what about the others ? */ { register int im = c->match_len; do { DPRINTF("=match: %2i '%20.20s' \t text: '%20.20s'\n", im,pm,pt); if (--im == 0) /* Match */ return pt - text + c->match_len; pt--; pm--; } while (*pt == *pm); /* Mismatch after match: use shift-table */ { register int a,b; a = c->shift[(unsigned char) *pt]; b = c->match_len - im + 1; DPRINTF("!match: %2i '%20.20s' \t text: '%20.20s' " "(sh=%i)\n", im,pm,pt,max(a,b)); pt += (a > b) ? a : b; } } } /* Special case: matching string has length 1 */ else { register char m = *c->eom; for (;pt < eot; pt++) if (*pt == m) /* Match */ return pt - text + 1; } return start; /* no match */ } /* bm search using the translate table -- 45% slower */ int bm_tr_search(mxbmse_data *c, char *text, int start, int text_len, char *tr) { register char *pt; register char *eot = text + text_len; /* Error check */ if (c == NULL) return -1; /* Init text pointer */ pt = text + start + c->match_len - 1; DPRINTF("Init : %2i '%20.20s' \t text: %2i '%20.20s'\n", c->match_len,c->match,start,text+start); if (c->match_len > 1) for (;;) { register char *pm; pm = c->eom; for (;pt < eot && tr[(unsigned char) *pt] != *pm; pt += c->shift[(unsigned char) tr[(unsigned char) *pt]]); if (pt >= eot) break; /* First char matches.. what about the others ? 
*/ { register int im = c->match_len; do { DPRINTF("=match: %2i '%20.20s' \t text: '%20.20s'\n", im,pm,pt); if (--im == 0) /* Match */ return pt - text + c->match_len; pt--; pm--; } while (tr[(unsigned char) *pt] == *pm); /* Mismatch after match: use shift-table */ { register int a,b; a = c->shift[(unsigned char) tr[(unsigned char) *pt]]; b = c->match_len - im + 1; DPRINTF("!match: %2i '%20.20s' \t text: '%20.20s' " "(sh=%i)\n", im,pm,pt,max(a,b)); pt += (a > b)?a:b; } } } /* Special case: matching string has length 1 */ else { register char m = *c->eom; for (;pt < eot; pt++) if (*pt == m) /* Match */ return pt - text + 1; } return start; /* no match */ } SimpleParse-2.2.0/simpleparse/stt/TextTools/mxTextTools/mxTextTools.c.~1~0000644000175000017500000037603012554177362030206 0ustar mcfletchmcfletch00000000000000/* mxTextTools -- Fast text manipulation routines Copyright (c) 2000, Marc-Andre Lemburg; mailto:mal@lemburg.com Copyright (c) 2000-2002, eGenix.com Software GmbH; mailto:info@egenix.com */ /* We want all our symbols to be exported */ #define MX_BUILDING_MXTEXTTOOLS /* Logging file used by debugging facility */ #ifndef MAL_DEBUG_OUTPUTFILE # define MAL_DEBUG_OUTPUTFILE "mxTextTools.log" #endif #include "mx.h" #include "mxTextTools.h" #include #define VERSION "2.1.0" /* Initial list size used by e.g. setsplit(), setsplitx(),... */ #define INITIAL_LIST_SIZE 64 /* Maximum TagTable cache size. If this limit is reached, the cache is cleared to make room for new compile TagTables. */ #define MAX_TAGTABLES_CACHE_SIZE 100 /* Define this to enable the copy-protocol (__copy__, __deepcopy__) */ #define COPY_PROTOCOL /* --- module doc-string -------------------------------------------------- */ static char *Module_docstring = MXTEXTTOOLS_MODULE" -- Tools for fast text processing. 
Version "VERSION"\n\n" "Copyright (c) 1997-2000, Marc-Andre Lemburg; mailto:mal@lemburg.com\n" "Copyright (c) 2000-2002, eGenix.com Software GmbH; mailto:info@egenix.com\n\n" "Copyright (c) 2003-2006, Mike Fletcher; mailto:mcfletch@vrplumber.com\n\n" " All Rights Reserved\n\n" "See the documentation for further information on copyrights,\n" "or contact the author." ; /* --- internal macros ---------------------------------------------------- */ /* --- module globals ----------------------------------------------------- */ /* Translation strings for the 8-bit versions of lower() and upper() */ static PyObject *mx_ToUpper; static PyObject *mx_ToLower; static PyObject *mxTextTools_Error; /* mxTextTools specific error */ static PyObject *mxTextTools_TagTables; /* TagTable cache dictionary */ /* Flag telling us whether the module was initialized or not. */ static int mxTextTools_Initialized = 0; /* --- forward declarations ----------------------------------------------- */ /* --- module helper ------------------------------------------------------ */ static PyObject *mxTextTools_ToUpper(void) { char tr[256]; Py_ssize_t i; for (i = 0; i < 256; i++) tr[i] = toupper((char)i); return PyString_FromStringAndSize(tr,sizeof(tr)); } static PyObject *mxTextTools_ToLower(void) { char tr[256]; Py_ssize_t i; for (i = 0; i < 256; i++) tr[i] = tolower((char)i); return PyString_FromStringAndSize(tr,sizeof(tr)); } /* Create an exception object, insert it into the module dictionary under the given name and return the object pointer; this is NULL in case an error occurred. base can be given to indicate the base object to be used by the exception object. 
It should be NULL otherwise */ static PyObject *insexc(PyObject *moddict, char *name, PyObject *base) { PyObject *v; char fullname[256]; char *modname; char *dot; v = PyDict_GetItemString(moddict, "__name__"); if (v == NULL) modname = NULL; else modname = PyString_AsString(v); if (modname == NULL) { PyErr_Clear(); modname = MXTEXTTOOLS_MODULE; } /* The symbols from this extension are imported into simpleparse.stt.TextTools. We trim the name to not confuse the user with an overly long package path. */ strcpy(fullname, modname); dot = strchr(fullname, '.'); if (dot) dot = strchr(dot+1, '.'); if (dot) strcpy(dot+1, name); else sprintf(fullname, "%s.%s", modname, name); v = PyErr_NewException(fullname, base, NULL); if (v == NULL) return NULL; if (PyDict_SetItemString(moddict,name,v)) return NULL; return v; } /* Helper for adding integer constants to a dictionary. Check for errors with PyErr_Occurred() */ static void insint(PyObject *dict, char *name, int value) { PyObject *v = PyInt_FromLong((long)value); PyDict_SetItemString(dict, name, v); Py_XDECREF(v); } /* --- module interface --------------------------------------------------- */ /* --- Text Search Object ----------------------------------------------*/ staticforward PyMethodDef mxTextSearch_Methods[]; /* allocation */ static PyObject *mxTextSearch_New(PyObject *match, PyObject *translate, int algorithm) { mxTextSearchObject *so; so = PyObject_NEW(mxTextSearchObject, &mxTextSearch_Type); if (so == NULL) return NULL; so->data = NULL; so->translate = NULL; so->match = NULL; Py_INCREF(match); so->match = match; if (translate == Py_None) translate = NULL; else if (translate) { Py_Assert(PyString_Check(translate), PyExc_TypeError, "translate table must be a string"); Py_Assert(PyString_GET_SIZE(translate) == 256, PyExc_TypeError, "translate string must have exactly 256 chars"); Py_INCREF(translate); } so->translate = translate; /* Init algorithm */ so->algorithm = algorithm; switch (algorithm) { case 
MXTEXTSEARCH_BOYERMOORE: Py_Assert(PyString_Check(match), PyExc_TypeError, "match must be a string for Boyer-Moore"); so->data = bm_init(PyString_AS_STRING(match), PyString_GET_SIZE(match)); Py_Assert(so->data != NULL, PyExc_TypeError, "error initializing the search object"); break; #ifdef MXFASTSEARCH case MXTEXTSEARCH_FASTSEARCH: Py_Assert(PyString_Check(match), PyExc_TypeError, "match must be a string for FastSearch"); so->data = fs_init(PyString_AS_STRING(match), PyString_GET_SIZE(match)); Py_Assert(so->data != NULL, PyExc_TypeError, "error initializing the search object"); break; #endif case MXTEXTSEARCH_TRIVIAL: Py_Assert(PyString_Check(match) || PyUnicode_Check(match), PyExc_TypeError, "match must be a string or unicode"); Py_Assert(so->translate == NULL, PyExc_TypeError, "trivial search algorithm does not support translate"); break; default: Py_Error(PyExc_ValueError, "unknown or unsupported algorithm"); } return (PyObject *)so; onError: Py_DECREF(so); return NULL; } Py_C_Function_WithKeywords( mxTextSearch_TextSearch, "TextSearch(match[,translate=None,algorithm=default_algorithm])\n\n" "Create a substring search object for the string match;\n" "translate is an optional translate-string like the one used\n" "in the module re." 
) { PyObject *match = 0; PyObject *translate = 0; int algorithm = -424242; Py_KeywordsGet3Args("O|Oi:TextSearch",match,translate,algorithm); if (algorithm == -424242) { if (PyUnicode_Check(match)) algorithm = MXTEXTSEARCH_TRIVIAL; else #ifdef MXFASTSEARCH algorithm = MXTEXTSEARCH_BOYERMOORE; #else algorithm = MXTEXTSEARCH_BOYERMOORE; #endif } return mxTextSearch_New(match, translate, algorithm); onError: return NULL; } static void mxTextSearch_Free(mxTextSearchObject *so) { if (so->data) { switch (so->algorithm) { case MXTEXTSEARCH_BOYERMOORE: bm_free(so->data); break; #ifdef MXFASTSEARCH case MXTEXTSEARCH_FASTSEARCH: fs_free(so->data); break; #endif case MXTEXTSEARCH_TRIVIAL: break; } } Py_XDECREF(so->match); Py_XDECREF(so->translate); PyObject_Del(so); } /* C APIs */ #define so ((mxTextSearchObject *)self) /* Get the match length from an TextSearch object or -1 in case of an error. */ Py_ssize_t mxTextSearch_MatchLength(PyObject *self) { Py_Assert(mxTextSearch_Check(self), PyExc_TypeError, "expected a TextSearch object"); switch (so->algorithm) { case MXTEXTSEARCH_BOYERMOORE: return BM_MATCH_LEN(so->data); break; #ifdef MXFASTSEARCH case MXTEXTSEARCH_FASTSEARCH: return FS_MATCH_LEN(so->data); break; #endif case MXTEXTSEARCH_TRIVIAL: if (PyString_Check(so->match)) return PyString_GET_SIZE(so->match); #ifdef HAVE_UNICODE else if (PyUnicode_Check(so->match)) return PyUnicode_GET_SIZE(so->match); #endif break; } Py_Error(mxTextTools_Error, "internal error"); onError: return -1; } static Py_ssize_t trivial_search(const char *text, Py_ssize_t start, Py_ssize_t stop, const char *match, Py_ssize_t match_len) { Py_ssize_t ml1 = match_len - 1; register const char *tx = &text[start]; register Py_ssize_t x = start; if (ml1 < 0) return start; /* Brute-force method; from right to left */ for (;;) { register Py_ssize_t j = ml1; register const char *mj = &match[j]; if (x + j >= stop) /* reached eof: no match */ return start; /* scan from right to left */ for (tx += j; j >= 0 && 
*tx == *mj; tx--, mj--, j--) ; if (j < 0) { /* found */ x += ml1 + 1; return x; } /* not found: rewind and advance one char */ tx -= j - 1; x++; } return start; } #ifdef HAVE_UNICODE static Py_ssize_t trivial_unicode_search(const Py_UNICODE *text, Py_ssize_t start, Py_ssize_t stop, const Py_UNICODE *match, Py_ssize_t match_len) { Py_ssize_t ml1 = match_len - 1; register const Py_UNICODE *tx = &text[start]; register Py_ssize_t x = start; if (ml1 < 0) return start; /* Brute-force method; from right to left */ for (;;) { register Py_ssize_t j = ml1; register const Py_UNICODE *mj = &match[j]; if (x + j >= stop) /* reached eof: no match */ return start; /* scan from right to left */ for (tx += j; j >= 0 && *tx == *mj; tx--, mj--, j--) ; if (j < 0) { /* found */ x += ml1 + 1; return x; } /* not found: rewind and advance one char */ tx -= j - 1; x++; } return start; } #endif /* Search for the match in text[start:stop]. Returns 1 in case a match was found and sets sliceleft, sliceright to the matching slice. Returns 0 in case no match was found and -1 in case of an error. 
*/ Py_ssize_t mxTextSearch_SearchBuffer(PyObject *self, char *text, Py_ssize_t start, Py_ssize_t stop, Py_ssize_t *sliceleft, Py_ssize_t *sliceright) { Py_ssize_t nextpos; Py_ssize_t match_len; Py_Assert(mxTextSearch_Check(self), PyExc_TypeError, "expected a TextSearch object"); switch (so->algorithm) { case MXTEXTSEARCH_BOYERMOORE: if (so->translate) { /* search with translate table */ nextpos = bm_tr_search((mxbmse_data *)so->data, text, start, stop, PyString_AS_STRING(so->translate)); } else { /* exact search */ nextpos = bm_search((mxbmse_data *)so->data, text, start, stop); } match_len = BM_MATCH_LEN(so->data); break; #ifdef MXFASTSEARCH case MXTEXTSEARCH_FASTSEARCH: if (so->translate) { /* search with translate table */ nextpos = fs_tr_search((mxfse_data *)so->data, text, start, stop, PyString_AS_STRING(so->translate)); } else { /* exact search */ nextpos = fs_search((mxfse_data *)so->data, text, start, stop); } match_len = FS_MATCH_LEN(so->data); break; #endif case MXTEXTSEARCH_TRIVIAL: { const char *match; if (PyString_Check(so->match)) { match = PyString_AS_STRING(so->match); match_len = PyString_GET_SIZE(so->match); } else if (PyObject_AsCharBuffer(so->match, &match, &match_len)) goto onError; nextpos = trivial_search(text, start, stop, match, match_len); } break; default: Py_Error(mxTextTools_Error, "unknown algorithm type in mxTextSearch_SearchBuffer"); } /* Found ? 
*/ if (nextpos != start) { if (sliceleft) *sliceleft = nextpos - match_len; if (sliceright) *sliceright = nextpos; return 1; } /* Not found */ return 0; onError: return -1; } #ifdef HAVE_UNICODE Py_ssize_t mxTextSearch_SearchUnicode(PyObject *self, Py_UNICODE *text, Py_ssize_t start, Py_ssize_t stop, Py_ssize_t *sliceleft, Py_ssize_t *sliceright) { Py_ssize_t nextpos; Py_ssize_t match_len; Py_Assert(mxTextSearch_Check(self), PyExc_TypeError, "expected a TextSearch object"); switch (so->algorithm) { case MXTEXTSEARCH_BOYERMOORE: Py_Error(PyExc_TypeError, "Boyer-Moore search algorithm does not support Unicode"); break; #ifdef MXFASTSEARCH case MXTEXTSEARCH_FASTSEARCH: Py_Error(PyExc_TypeError, "FastSearch search algorithm does not support Unicode"); #endif case MXTEXTSEARCH_TRIVIAL: { PyObject *u; Py_UNICODE *match; if (PyUnicode_Check(so->match)) { u = NULL; match = PyUnicode_AS_UNICODE(so->match); match_len = PyUnicode_GET_SIZE(so->match); } else { u = PyUnicode_FromEncodedObject(so->match, NULL, NULL); if (u == NULL) goto onError; match = PyUnicode_AS_UNICODE(u); match_len = PyUnicode_GET_SIZE(u); } nextpos = trivial_unicode_search(text, start, stop, match, match_len); Py_XDECREF(u); } break; default: Py_Error(mxTextTools_Error, "unknown algorithm type in mxTextSearch_SearchUnicode"); } /* Found ? 
*/ if (nextpos != start) { if (sliceleft) *sliceleft = nextpos - match_len; if (sliceright) *sliceright = nextpos; return 1; } /* Not found */ return 0; onError: return -1; } #endif /* methods */ Py_C_Function( mxTextSearch_search, "TextSearch.search(text,start=0,stop=len(text))\n\n" "Search for the substring in text, looking only at the\n" "slice [start:stop] and return the slice (l,r)\n" "where the substring was found, (start,start) otherwise.") { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); PyObject *text; Py_ssize_t start = 0; Py_ssize_t stop = INT_MAX; Py_ssize_t sliceleft, sliceright; int rc; Py_Get3Args("O|ii:TextSearch.search", text,start,stop); if (PyString_Check(text)) { Py_CheckStringSlice(text, start, stop); rc = mxTextSearch_SearchBuffer(self, PyString_AS_STRING(text), start, stop, &sliceleft, &sliceright); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { Py_CheckUnicodeSlice(text, start, stop); rc = mxTextSearch_SearchUnicode(self, PyUnicode_AS_UNICODE(text), start, stop, &sliceleft, &sliceright); } #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); if (rc < 0) goto onError; if (rc == 0) { sliceleft = start; sliceright = start; } /* Return the slice */ Py_Return2("ii", sliceleft, sliceright); onError: return NULL; } Py_C_Function( mxTextSearch_find, "TextSearch.find(text,start=0,stop=len(text))\n\n" "Search for the substring in text, looking only at the\n" "slice [start:stop] and return the index\n" "where the substring was found, -1 otherwise.") { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); PyObject *text; Py_ssize_t start = 0; Py_ssize_t stop = INT_MAX; Py_ssize_t sliceleft, sliceright; int rc; Py_Get3Args("O|ii:TextSearch.find", text,start,stop); if (PyString_Check(text)) { Py_CheckStringSlice(text, start, stop); rc = mxTextSearch_SearchBuffer(self, PyString_AS_STRING(text), start, stop, &sliceleft, &sliceright); } #ifdef HAVE_UNICODE else if 
(PyUnicode_Check(text)) { Py_CheckUnicodeSlice(text, start, stop); rc = mxTextSearch_SearchUnicode(self, PyUnicode_AS_UNICODE(text), start, stop, &sliceleft, &sliceright); } #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); if (rc < 0) goto onError; if (rc == 0) sliceleft = -1; return PyInt_FromLong(sliceleft); onError: return NULL; } Py_C_Function( mxTextSearch_findall, "TextSearch.findall(text,start=0,stop=len(text))\n\n" "Search for the substring in text, looking only at the\n" "slice [start:stop] and return a list of all\n" "non overlapping slices (l,r) in text where the match\n" "string can be found.") { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); PyObject *text; PyObject *list = 0; Py_ssize_t start = 0; Py_ssize_t stop = INT_MAX; Py_ssize_t stop_index; Py_ssize_t match_len; Py_ssize_t listsize = INITIAL_LIST_SIZE; Py_ssize_t listitem = 0; Py_Get3Args("O|ii:TextSearch.findall", text,start,stop); if (PyString_Check(text)) { Py_CheckStringSlice(text, start, stop); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { Py_CheckUnicodeSlice(text, start, stop); } #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); list = PyList_New(listsize); if (!list) goto onError; match_len = mxTextSearch_MatchLength(self); if (match_len < 0) goto onError; stop_index = stop - match_len; while (start <= stop_index) { register PyObject *t,*v; int rc; Py_ssize_t sliceleft, sliceright; /* exact search */ if (PyString_Check(text)) rc = mxTextSearch_SearchBuffer(self, PyString_AS_STRING(text), start, stop, &sliceleft, &sliceright); #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) rc = mxTextSearch_SearchUnicode(self, PyUnicode_AS_UNICODE(text), start, stop, &sliceleft, &sliceright); #endif else break; if (rc < 0) goto onError; if (rc == 0) break; /* Build slice and append to list */ t = PyTuple_New(2); if (!t) goto onError; v = PyInt_FromLong(sliceleft); if (!v) goto onError; PyTuple_SET_ITEM(t,0,v); v = 
PyInt_FromLong(sliceright); if (!v) goto onError; PyTuple_SET_ITEM(t,1,v); if (listitem < listsize) PyList_SET_ITEM(list, listitem, t); else { PyList_Append(list, t); Py_DECREF(t); } listitem++; start = sliceright; } /* Resize list if necessary */ if (listitem < listsize) PyList_SetSlice(list, listitem, listsize, (PyObject*)NULL); return list; onError: Py_XDECREF(list); return NULL; } #ifdef COPY_PROTOCOL Py_C_Function( mxTextSearch_copy, "copy([memo])\n\n" "Return a new reference for the instance. This function\n" "is used for the copy-protocol. Real copying doesn't take\n" "place, since the instances are immutable.") { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); PyObject *memo; Py_GetArg("|O",memo); Py_INCREF(so); return (PyObject *)so; onError: return NULL; } #endif #undef so /* --- slots --- */ static PyObject *mxTextSearch_Repr(mxTextSearchObject *self) { char *algoname; PyObject *v; char t[500], *reprstr; v = PyObject_Repr(self->match); if (v == NULL) return NULL; reprstr = PyString_AsString(v); if (reprstr == NULL) return NULL; switch (self->algorithm) { case MXTEXTSEARCH_BOYERMOORE: algoname = "Boyer-Moore"; break; #ifdef MXFASTSEARCH case MXTEXTSEARCH_FASTSEARCH: algoname = "FastSearch"; break; #endif case MXTEXTSEARCH_TRIVIAL: algoname = "Trivial"; break; default: algoname = ""; } sprintf(t, "<%.50s TextSearch object for %.400s at 0x%lx>", algoname, reprstr, (long)self); Py_DECREF(v); return PyString_FromString(t); } static PyObject *mxTextSearch_GetAttr(mxTextSearchObject *self, char *name) { PyObject *v; if (Py_WantAttr(name,"match")) { v = self->match; Py_INCREF(v); return v; } else if (Py_WantAttr(name,"translate")) { v = self->translate; if (v == NULL) v = Py_None; Py_INCREF(v); return v; } else if (Py_WantAttr(name,"algorithm")) return PyInt_FromLong(self->algorithm); else if (Py_WantAttr(name,"__members__")) return Py_BuildValue("[sss]", "match", "translate", "algorithm"); return 
Py_FindMethod(mxTextSearch_Methods, (PyObject *)self, (char *)name); } /* Python Type Table */ PyTypeObject mxTextSearch_Type = { PyObject_HEAD_INIT(0) /* init at startup ! */ 0, /*ob_size*/ "TextSearch", /*tp_name*/ sizeof(mxTextSearchObject), /*tp_basicsize*/ 0, /*tp_itemsize*/ /* methods */ (destructor)mxTextSearch_Free, /*tp_dealloc*/ (printfunc)0, /*tp_print*/ (getattrfunc)mxTextSearch_GetAttr, /*tp_getattr*/ (setattrfunc)0, /*tp_setattr*/ (cmpfunc)0, /*tp_compare*/ (reprfunc)mxTextSearch_Repr, /*tp_repr*/ 0, /*tp_as_number*/ 0, /*tp_as_number*/ 0, /*tp_as_mapping*/ (hashfunc)0, /*tp_hash*/ (ternaryfunc)0, /*tp_call*/ (reprfunc)0, /*tp_str*/ (getattrofunc)0, /*tp_getattro*/ (setattrofunc)0, /*tp_setattro*/ }; /* Python Method Table */ statichere PyMethodDef mxTextSearch_Methods[] = { Py_MethodListEntry("search",mxTextSearch_search), Py_MethodListEntry("find",mxTextSearch_find), Py_MethodListEntry("findall",mxTextSearch_findall), #ifdef COPY_PROTOCOL Py_MethodListEntry("__deepcopy__",mxTextSearch_copy), Py_MethodListEntry("__copy__",mxTextSearch_copy), #endif {NULL,NULL} /* end of list */ }; /* --- Character Set Object --------------------------------------------*/ staticforward PyMethodDef mxCharSet_Methods[]; /* internal */ /* 8-bit character sets are implemented using a simple 32-byte long bitmap with one bit per character. 
Addressing is done as follows: def char_is_set(ordinal): return bitmap[ordinal >> 3] & (1 << (ordinal & 7)) */ #define STRING_CHARSET_SIZE 256 #define STRING_CHARSET_BITMAP_SIZE (STRING_CHARSET_SIZE / 8) typedef struct { unsigned char bitmap[STRING_CHARSET_BITMAP_SIZE]; /* character bitmap */ } string_charset; static int init_string_charset(mxCharSetObject *cs, PyObject *definition) { register Py_ssize_t i, j; char *def = PyString_AS_STRING(definition); const Py_ssize_t len = PyString_GET_SIZE(definition); string_charset *lookup = 0; register unsigned char *bitmap; int logic = 1; /* Handle logic change (first char is '^' for negative matching) */ if (len > 0 && def[0] == '^') { logic = 0; i = 1; } else i = 0; /* Build 32-byte lookup bitmap (one bit per character) */ lookup = (string_charset *)PyMem_Malloc(sizeof(string_charset)); if (lookup == NULL) { PyErr_NoMemory(); goto onError; } memset(lookup, 0, sizeof(string_charset)); cs->mode = MXCHARSET_8BITMODE; cs->lookup = (void *)lookup; bitmap = lookup->bitmap; for (; i < len; i++) { /* Handle escapes: "b\-d", "\\" */ if (def[i] == '\\') { if (i < len - 1 && def[i+1] == '\\') { j = (unsigned char)'\\'; bitmap[j >> 3] |= 1 << (j & 7); i++; } continue; } /* Handle ranges: "b-d", "\\-z", "\--z" */ if (i < len - 2 && def[i+1] == '-') { unsigned char range_left = def[i]; unsigned char range_right = def[i+2]; for (j = range_left; j <= range_right; j++) bitmap[j >> 3] |= 1 << (j & 7); i++; continue; } /* Normal processing */ j = (unsigned char)def[i]; bitmap[j >> 3] |= 1 << (j & 7); } /* Invert bitmap if negative matching is requested */ if (!logic) { DPRINTF("init_string_charset: inverting bitmap\n"); for (i = 0; i < STRING_CHARSET_BITMAP_SIZE; i++) bitmap[i] ^= 0xFF; } return 0; onError: if (lookup) PyMem_Free((void *)lookup); cs->lookup = 0; return -1; } #ifdef HAVE_UNICODE /* Unicode character sets are implemented using two step indexing which is a good compromise between lookup speed and memory usage. 
Lookup is done using a variable length array of 32-byte bitmap blocks. There can be 256 such blocks. Identical blocks are collapsed into a single copy. Addressing is done as follows: def char_is_set(ordinal): index = bitmapindex[ordinal >> 8] bitmap = bitmaps[index] return bitmap[(ordinal >> 3) & 31] & (1 << (ordinal & 7)) The technique used here is very similar to what is done in Python's SRE (see the BIGCHARSET patch by Martin von Loewis). Compression should be reasonably good since character sets in practice usually only contains a few single characters or longer ranges of Unicode characters. */ #define UNICODE_CHARSET_SIZE 65536 #define UNICODE_CHARSET_BITMAP_SIZE 32 #define UNICODE_CHARSET_BITMAPS (UNICODE_CHARSET_SIZE / (UNICODE_CHARSET_BITMAP_SIZE * 8)) #define UNICODE_CHARSET_BIGMAP_SIZE (UNICODE_CHARSET_SIZE / 8) typedef struct { unsigned char bitmapindex[UNICODE_CHARSET_BITMAPS]; /* Index to char bitmaps */ unsigned char bitmaps[UNICODE_CHARSET_BITMAPS][UNICODE_CHARSET_BITMAP_SIZE]; /* Variable length bitmap array */ } unicode_charset; static int init_unicode_charset(mxCharSetObject *cs, PyObject *definition) { register Py_ssize_t i, j; Py_UNICODE *def = PyUnicode_AS_UNICODE(definition); const Py_ssize_t len = PyUnicode_GET_SIZE(definition); unicode_charset *lookup = 0; unsigned char bigmap[UNICODE_CHARSET_BIGMAP_SIZE]; Py_ssize_t blocks; int logic = 1; /* Handle logic change (first char is '^' for negative matching) */ if (len > 0 && def[0] == '^') { logic = 0; i = 1; } else i = 0; /* Build bigmap */ memset(bigmap, 0, sizeof(bigmap)); for (; i < len; i++) { /* Handle escapes: "b\-d", "\\" */ if (def[i] == '\\') { if (i < len - 1 && def[i+1] == '\\') { j = (int)'\\'; bigmap[j >> 3] |= 1 << (j & 7); i++; } continue; } /* Handle ranges: "b-d", "\\-z", "\--z" */ if (i < len - 2 && def[i+1] == '-') { Py_UNICODE range_left = def[i]; Py_UNICODE range_right = def[i+2]; if (range_right >= UNICODE_CHARSET_SIZE) { Py_Error(PyExc_ValueError, "unicode ordinal out of 
supported range"); } for (j = range_left; j <= range_right; j++) bigmap[j >> 3] |= 1 << (j & 7); i++; continue; } /* Normal processing */ j = def[i]; if (j >= UNICODE_CHARSET_SIZE) { Py_Error(PyExc_ValueError, "unicode ordinal out of supported range"); } bigmap[j >> 3] |= 1 << (j & 7); } /* Build lookup table XXX Could add dynamic resizing here... probably not worth it though, since sizeof(unicode_charset) isn't all that large. */ lookup = (unicode_charset *)PyMem_Malloc(sizeof(unicode_charset)); if (lookup == NULL) { PyErr_NoMemory(); goto onError; } blocks = 0; for (i = UNICODE_CHARSET_BITMAPS - 1; i >= 0; i--) { unsigned char *block = &bigmap[i << 5]; for (j = blocks - 1; j >= 0; j--) if (memcmp(lookup->bitmaps[j], block, UNICODE_CHARSET_BITMAP_SIZE) == 0) break; if (j < 0) { j = blocks; DPRINTF("init_unicode_charset: Creating new block %i for %i\n", j, i); memcpy(lookup->bitmaps[j], block, UNICODE_CHARSET_BITMAP_SIZE); blocks++; } else DPRINTF("init_unicode_charset: Reusing block %i for %i\n", j, i); lookup->bitmapindex[i] = j; } DPRINTF("init_unicode_charset: Map size: %i block(s) = %i bytes\n", blocks, UNICODE_CHARSET_BITMAPS + blocks * UNICODE_CHARSET_BITMAP_SIZE); lookup = (unicode_charset *)PyMem_Realloc(lookup, UNICODE_CHARSET_BITMAPS + blocks * UNICODE_CHARSET_BITMAP_SIZE); if (lookup == NULL) { PyErr_NoMemory(); goto onError; } /* Invert bitmaps if negative matching is requested */ if (!logic) { register unsigned char *bitmap = &lookup->bitmaps[0][0]; DPRINTF("init_unicode_charset: inverting bitmaps\n"); for (i = 0; i < blocks * UNICODE_CHARSET_BITMAP_SIZE; i++) bitmap[i] ^= 0xFF; } cs->mode = MXCHARSET_UCS2MODE; cs->lookup = (void *)lookup; return 0; onError: if (lookup) PyMem_Free((void *)lookup); cs->lookup = 0; return -1; } #endif /* allocation */ static PyObject *mxCharSet_New(PyObject *definition) { mxCharSetObject *cs; cs = PyObject_NEW(mxCharSetObject, &mxCharSet_Type); if (cs == NULL) return NULL; Py_INCREF(definition); cs->definition = 
definition;
    cs->lookup = NULL;
    cs->mode = -1;

    /* Dispatch on the definition's type to build the lookup table. */
    if (PyString_Check(definition)) {
	if (init_string_charset(cs, definition))
	    goto onError;
    }
#ifdef HAVE_UNICODE
    else if (PyUnicode_Check(definition)) {
	if (init_unicode_charset(cs, definition))
	    goto onError;
    }
#endif
    else
	Py_Error(PyExc_TypeError,
		 "character set definition must be string or unicode");

    return (PyObject *)cs;

 onError:
    Py_DECREF(cs);
    return NULL;
}

/* Python-level constructor: CharSet(definition). */

Py_C_Function( mxCharSet_CharSet,
	       "CharSet(definition)\n\n"
	       "Create a character set matching object from the string"
	       )
{
    Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?");
    PyObject *definition;

    Py_GetArg("O:CharSet", definition);
    return mxCharSet_New(definition);

 onError:
    return NULL;
}

/* tp_dealloc: release the definition reference and the lookup table. */

static
void mxCharSet_Free(mxCharSetObject *cs)
{
    Py_XDECREF(cs->definition);
    if (cs->lookup)
	PyMem_Free(cs->lookup);
    PyObject_Del(cs);
}

/* C APIs */

#define cs ((mxCharSetObject *)self)

/* Return 1 if the 8-bit char ch is in the set, 0 if not, -1 on error.
   In UCS2 mode only bitmap block 0 is consulted, which covers
   ordinals 0..255 — exactly the range an unsigned char can hold. */

int mxCharSet_ContainsChar(PyObject *self,
			   register unsigned char ch)
{
    if (!mxCharSet_Check(self)) {
	PyErr_BadInternalCall();
	goto onError;
    }

    if (cs->mode == MXCHARSET_8BITMODE) {
	unsigned char *bitmap = ((string_charset *)cs->lookup)->bitmap;
	return ((bitmap[ch >> 3] & (1 << (ch & 7))) != 0);
    }
#ifdef HAVE_UNICODE
    else if (cs->mode == MXCHARSET_UCS2MODE) {
	unicode_charset *lookup = (unicode_charset *)cs->lookup;
	unsigned char *bitmap = lookup->bitmaps[lookup->bitmapindex[0]];
	return ((bitmap[ch >> 3] & (1 << (ch & 7))) != 0);
    }
#endif
    else {
	Py_Error(mxTextTools_Error,
		 "unsupported character set mode");
    }

 onError:
    return -1;
}

#ifdef HAVE_UNICODE

/* Return 1 if the Py_UNICODE char ch is in the set, 0 if not, -1 on
   error. In 8-bit mode, ordinals >= 256 are never members. */

int mxCharSet_ContainsUnicodeChar(PyObject *self,
				  register Py_UNICODE ch)
{
    if (!mxCharSet_Check(self)) {
	PyErr_BadInternalCall();
	goto onError;
    }

    if (cs->mode == MXCHARSET_8BITMODE) {
	unsigned char *bitmap = ((string_charset *)cs->lookup)->bitmap;
	if (ch >= 256)
	    return 0;
	return ((bitmap[ch >> 3] & (1 << (ch & 7))) != 0);
    }
    else if (cs->mode == MXCHARSET_UCS2MODE) {
	unicode_charset *lookup = (unicode_charset *)cs->lookup;
unsigned char *bitmap = lookup->bitmaps[lookup->bitmapindex[ch >> 8]]; return ((bitmap[(ch >> 3) & 31] & (1 << (ch & 7))) != 0); } else { Py_Error(mxTextTools_Error, "unsupported character set mode"); } onError: return -1; } #endif static int mxCharSet_Contains(PyObject *self, PyObject *other) { if (PyString_Check(other)) { Py_Assert(PyString_GET_SIZE(other) == 1, PyExc_TypeError, "expected a single character"); return mxCharSet_ContainsChar(self, PyString_AS_STRING(other)[0]); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(other)) { Py_Assert(PyUnicode_GET_SIZE(other) == 1, PyExc_TypeError, "expected a single unicode character"); return mxCharSet_ContainsUnicodeChar(self, PyUnicode_AS_UNICODE(other)[0]); } #endif else Py_Error(PyExc_TypeError, "expected string or unicode character"); onError: return -1; } /* In mode 1, find the position of the first character in text belonging to set. This may also be stop or start-1 in case no such character is found during the search (depending on the direction). In mode 0, find the first character not in set. This may also be stop or start-1 in case no such character is found during the search (depending on the direction). The search is done in the slice start:stop. -2 is returned in case of an error. 
*/ static int mxCharSet_FindChar(PyObject *self, unsigned char *text, Py_ssize_t start, Py_ssize_t stop, const int mode, const int direction) { register Py_ssize_t i; register unsigned int c; register unsigned int block; unsigned char *bitmap; if (!mxCharSet_Check(self)) { PyErr_BadInternalCall(); goto onError; } if (cs->mode == MXCHARSET_8BITMODE) bitmap = ((string_charset *)cs->lookup)->bitmap; #ifdef HAVE_UNICODE else if (cs->mode == MXCHARSET_UCS2MODE) { unicode_charset *lookup = (unicode_charset *)cs->lookup; bitmap = lookup->bitmaps[lookup->bitmapindex[0]]; } #endif else { Py_Error(mxTextTools_Error, "unsupported character set mode"); } if (direction > 0) { if (mode) /* Find first char in set */ for (i = start; i < stop; i++) { c = text[i]; block = bitmap[c >> 3]; if (block && ((block & (1 << (c & 7))) != 0)) break; } else /* Find first char not in set */ for (i = start; i < stop; i++) { c = text[i]; block = bitmap[c >> 3]; if (!block || ((block & (1 << (c & 7))) == 0)) break; } } else { if (mode) /* Find first char in set, searching from the end */ for (i = stop - 1; i >= start; i--) { c = text[i]; block = bitmap[c >> 3]; if (block && ((block & (1 << (c & 7))) != 0)) break; } else /* Find first char not in set, searching from the end */ for (i = stop - 1; i >= start; i--) { c = text[i]; block = bitmap[c >> 3]; if (!block || ((block & (1 << (c & 7))) == 0)) break; } } return i; onError: return -2; } #ifdef HAVE_UNICODE static int mxCharSet_FindUnicodeChar(PyObject *self, Py_UNICODE *text, Py_ssize_t start, Py_ssize_t stop, const int mode, const int direction) { register int i; register unsigned int c; register unsigned int block; unsigned char *bitmap; if (!mxCharSet_Check(self)) { PyErr_BadInternalCall(); goto onError; } if (cs->mode == MXCHARSET_8BITMODE) { bitmap = ((string_charset *)cs->lookup)->bitmap; if (direction > 0) { if (mode) /* Find first char in set */ for (i = start; i < stop; i++) { c = text[i]; if (c > 256) continue; block = bitmap[c >> 3]; 
if (block && ((block & (1 << (c & 7))) != 0)) break; } else /* Find first char not in set */ for (i = start; i < stop; i++) { c = text[i]; if (c > 256) break; block = bitmap[c >> 3]; if (!block || ((block & (1 << (c & 7))) == 0)) break; } } else { if (mode) /* Find first char in set, searching from the end */ for (i = stop - 1; i >= start; i--) { c = text[i]; if (c > 256) continue; block = bitmap[c >> 3]; if (block && ((block & (1 << (c & 7))) != 0)) break; } else /* Find first char not in set, searching from the end */ for (i = stop - 1; i >= start; i--) { c = text[i]; if (c > 256) break; block = bitmap[c >> 3]; if (!block || ((block & (1 << (c & 7))) == 0)) break; } } return i; } #ifdef HAVE_UNICODE else if (cs->mode == MXCHARSET_UCS2MODE) { unicode_charset *lookup = (unicode_charset *)cs->lookup; if (direction > 0) { if (mode) /* Find first char in set */ for (i = start; i < stop; i++) { c = text[i]; bitmap = lookup->bitmaps[lookup->bitmapindex[c >> 8]]; block = bitmap[(c >> 3) & 31]; if (block && ((block & (1 << (c & 7))) != 0)) break; } else /* Find first char not in set */ for (i = start; i < stop; i++) { c = text[i]; bitmap = lookup->bitmaps[lookup->bitmapindex[c >> 8]]; block = bitmap[(c >> 3) & 31]; if (!block || ((block & (1 << (c & 7))) == 0)) break; } } else { if (mode) /* Find first char in set, searching from the end */ for (i = stop - 1; i >= start; i--) { c = text[i]; bitmap = lookup->bitmaps[lookup->bitmapindex[c >> 8]]; block = bitmap[(c >> 3) & 31]; if (block && ((block & (1 << (c & 7))) != 0)) break; } else /* Find first char not in set, searching from the end */ for (i = stop - 1; i >= start; i--) { c = text[i]; bitmap = lookup->bitmaps[lookup->bitmapindex[c >> 8]]; block = bitmap[(c >> 3) & 31]; if (!block || ((block & (1 << (c & 7))) == 0)) break; } } return i; } #endif else { Py_Error(mxTextTools_Error, "unsupported character set mode"); } onError: return -2; } #endif /* Return the position of the first character in text[start:stop] 
occurring in set or -1 in case no such character exists.

*/

/* Dispatch on the text type (string vs. unicode) and delegate to the
   matching Find helper in mode 1 ("first char in set").  Returns the
   position, -1 when no character from the set occurs in the slice, or
   -2 with an exception set on error. */
static
int mxCharSet_Search(PyObject *self,
                     PyObject *text,
                     Py_ssize_t start,
                     Py_ssize_t stop,
                     int direction)
{
    Py_ssize_t position;

    if (PyString_Check(text)) {
        Py_CheckStringSlice(text, start, stop);
        position = mxCharSet_FindChar(self,
                                      (unsigned char *)PyString_AS_STRING(text),
                                      start,
                                      stop,
                                      1,
                                      direction);
    }
#ifdef HAVE_UNICODE
    else if (PyUnicode_Check(text)) {
        Py_CheckUnicodeSlice(text, start, stop);
        position = mxCharSet_FindUnicodeChar(self,
                                             PyUnicode_AS_UNICODE(text),
                                             start,
                                             stop,
                                             1,
                                             direction);
    }
#endif
    else
        Py_Error(PyExc_TypeError,
                 "expected string or unicode");

    /* The Find helpers report "not found" as stop (forward) or
       start-1 (backward); normalize both to -1. */
    if ((direction > 0 && position >= stop) ||
        (direction <= 0 && position < start))
        position = -1;
    return position;

 onError:
    return -2;
}

/* Return the longest match of characters from set in
   text[start:stop].

   If direction is positive, the search is done from the left (longest
   prefix), otherwise it is started from the right (longest suffix).

   -1 is returned in case of an error.

*/

Py_ssize_t mxCharSet_Match(PyObject *self,
                           PyObject *text,
                           Py_ssize_t start,
                           Py_ssize_t stop,
                           int direction)
{
    Py_ssize_t position;

    /* Mode 0 finds the first character NOT in the set; the distance
       from the slice edge to that position is the match length. */
    if (PyString_Check(text)) {
        Py_CheckStringSlice(text, start, stop);
        position = mxCharSet_FindChar(self,
                                      (unsigned char *)PyString_AS_STRING(text),
                                      start,
                                      stop,
                                      0,
                                      direction);
    }
#ifdef HAVE_UNICODE
    else if (PyUnicode_Check(text)) {
        Py_CheckUnicodeSlice(text, start, stop);
        position = mxCharSet_FindUnicodeChar(self,
                                             PyUnicode_AS_UNICODE(text),
                                             start,
                                             stop,
                                             0,
                                             direction);
    }
#endif
    else
        Py_Error(PyExc_TypeError,
                 "expected string or unicode");

    if (position < -1)
        goto onError;

    if (direction > 0)
        return position - start;
    else
        return stop-1 - position;

 onError:
    return -1;
}

/* Strips off characters appearing in the character set from
   text[start:stop] and returns the result as Python string object.
where indicates the mode: where < 0: strip left only where = 0: strip left and right where > 0: strip right only */ static PyObject *mxCharSet_Strip(PyObject *self, PyObject *text, Py_ssize_t start, Py_ssize_t stop, Py_ssize_t where) { Py_ssize_t left,right; if (!mxCharSet_Check(self)) { PyErr_BadInternalCall(); goto onError; } if (PyString_Check(text)) { Py_CheckStringSlice(text, start, stop); /* Strip left */ if (where <= 0) { left = mxCharSet_FindChar(self, (unsigned char *)PyString_AS_STRING(text), start, stop, 0, 1); if (left < 0) goto onError; } else left = start; /* Strip right */ if (where >= 0) { right = mxCharSet_FindChar(self, (unsigned char *)PyString_AS_STRING(text), left, stop, 0, -1) + 1; if (right < 0) goto onError; } else right = stop; return PyString_FromStringAndSize(PyString_AS_STRING(text) + left, max(right - left, 0)); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { Py_CheckUnicodeSlice(text, start, stop); /* Strip left */ if (where <= 0) { left = mxCharSet_FindUnicodeChar(self, PyUnicode_AS_UNICODE(text), start, stop, 0, 1); if (left < 0) goto onError; } else left = start; /* Strip right */ if (where >= 0) { right = mxCharSet_FindUnicodeChar(self, PyUnicode_AS_UNICODE(text), start, stop, 0, -1) + 1; if (right < 0) goto onError; } else right = stop; return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(text) + left, max(right - left, 0)); } #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); onError: return NULL; } static PyObject *mxCharSet_Split(PyObject *self, PyObject *text, Py_ssize_t start, Py_ssize_t text_len, int include_splits) { PyObject *list = NULL; PyObject *s; register Py_ssize_t x; Py_ssize_t listitem = 0; Py_ssize_t listsize = INITIAL_LIST_SIZE; if (!mxCharSet_Check(self)) { PyErr_BadInternalCall(); goto onError; } list = PyList_New(listsize); if (!list) goto onError; if (PyString_Check(text)) { unsigned char *tx = (unsigned char *)PyString_AS_STRING(text); Py_CheckStringSlice(text, start, text_len); x = 
start; while (x < text_len) { Py_ssize_t z; /* Skip all text in set (include_splits == 0), not in set (include_splits == 1) */ z = x; x = mxCharSet_FindChar(self, tx, x, text_len, include_splits, 1); /* Append the slice to list */ if (include_splits) { s = PyString_FromStringAndSize((char *)&tx[z], x - z); if (!s) goto onError; if (listitem < listsize) PyList_SET_ITEM(list,listitem,s); else { PyList_Append(list,s); Py_DECREF(s); } listitem++; if (x >= text_len) break; } /* Skip all text in set (include_splits == 1), not in set (include_splits == 0) */ z = x; x = mxCharSet_FindChar(self, tx, x, text_len, !include_splits, 1); /* Append the slice to list if it is not empty */ if (x > z) { s = PyString_FromStringAndSize((char *)&tx[z], x - z); if (!s) goto onError; if (listitem < listsize) PyList_SET_ITEM(list,listitem,s); else { PyList_Append(list,s); Py_DECREF(s); } listitem++; } } } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { Py_UNICODE *tx = PyUnicode_AS_UNICODE(text); Py_CheckUnicodeSlice(text, start, text_len); x = start; while (x < text_len) { Py_ssize_t z; /* Skip all text in set (include_splits == 0), not in set (include_splits == 1) */ z = x; x = mxCharSet_FindUnicodeChar(self, tx, x, text_len, include_splits, 1); /* Append the slice to list */ if (include_splits) { s = PyUnicode_FromUnicode(&tx[z], x - z); if (!s) goto onError; if (listitem < listsize) PyList_SET_ITEM(list,listitem,s); else { PyList_Append(list,s); Py_DECREF(s); } listitem++; if (x >= text_len) break; } /* Skip all text in set (include_splits == 1), not in set (include_splits == 0) */ z = x; x = mxCharSet_FindUnicodeChar(self, tx, x, text_len, !include_splits, 1); /* Append the slice to list if it is not empty */ if (x > z) { s = PyUnicode_FromUnicode(&tx[z], x - z); if (!s) goto onError; if (listitem < listsize) PyList_SET_ITEM(list,listitem,s); else { PyList_Append(list,s); Py_DECREF(s); } listitem++; } } } #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); /* 
Resize list if necessary */ if (listitem < listsize) PyList_SetSlice(list, listitem, listsize, (PyObject*)NULL); return list; onError: Py_XDECREF(list); return NULL; } /* methods */ Py_C_Function( mxCharSet_contains, ".contains(char)\n\n" ) { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); PyObject *chr; int rc; Py_GetArg("O:CharSet.contains", chr); rc = mxCharSet_Contains(self, chr); if (rc < 0) goto onError; return PyInt_FromLong(rc); onError: return NULL; } Py_C_Function( mxCharSet_search, ".search(text[, direction=1, start=0, stop=len(text)])\n\n" ) { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); PyObject *text; int direction = 1; Py_ssize_t start = 0, stop = INT_MAX; int rc; Py_Get4Args("O|iii:CharSet.search", text, direction, start, stop); rc = mxCharSet_Search(self, text, start, stop, direction); if (rc == -1) Py_ReturnNone(); if (rc < -1) goto onError; return PyInt_FromLong(rc); onError: return NULL; } Py_C_Function( mxCharSet_match, ".match(text[, direction=1, start=0, stop=len(text)])\n\n" ) { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); PyObject *text; int direction = 1; Py_ssize_t start = 0, stop = INT_MAX; int rc; Py_Get4Args("O|iii:CharSet.match", text, direction, start, stop); rc = mxCharSet_Match(self, text, start, stop, direction); if (rc < 0) goto onError; return PyInt_FromLong(rc); onError: return NULL; } Py_C_Function( mxCharSet_split, ".split(text[, start=0, stop=len(text)])\n\n" ) { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); PyObject *text; Py_ssize_t start = 0, stop = INT_MAX; Py_Get3Args("O|ii:CharSet.split", text, start, stop); return mxCharSet_Split(self, text, start, stop, 0); onError: return NULL; } Py_C_Function( mxCharSet_splitx, ".splitx(text[, start=0, stop=len(text)])\n\n" ) { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); PyObject *text; Py_ssize_t start = 0, stop 
= INT_MAX; Py_Get3Args("O|ii:CharSet.splitx", text, start, stop); return mxCharSet_Split(self, text, start, stop, 1); onError: return NULL; } Py_C_Function( mxCharSet_strip, ".strip(text[, where=0, start=0, stop=len(text)])\n\n" ) { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); PyObject *text; Py_ssize_t where = 0; Py_ssize_t start = 0, stop = INT_MAX; Py_Get4Args("O|iii:CharSet.strip", text, where, start, stop); return mxCharSet_Strip(self, text, start, stop, where); onError: return NULL; } #ifdef COPY_PROTOCOL Py_C_Function( mxCharSet_copy, "copy([memo])\n\n" "Return a new reference for the instance. This function\n" "is used for the copy-protocol. Real copying doesn't take\n" "place, since the instances are immutable.") { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); PyObject *memo; Py_GetArg("|O",memo); Py_INCREF(cs); return (PyObject *)cs; onError: return NULL; } #endif #undef cs /* --- slots --- */ static PyObject *mxCharSet_Repr(mxCharSetObject *self) { PyObject *v; char t[500], *reprstr; v = PyObject_Repr(self->definition); if (v == NULL) return NULL; reprstr = PyString_AsString(v); if (reprstr == NULL) return NULL; sprintf(t, "", reprstr, (long)self); Py_DECREF(v); return PyString_FromString(t); } static PyObject *mxCharSet_GetAttr(mxCharSetObject *self, char *name) { PyObject *v; if (Py_WantAttr(name,"definition")) { v = self->definition; Py_INCREF(v); return v; } else if (Py_WantAttr(name,"__members__")) return Py_BuildValue("[s]", "definition"); return Py_FindMethod(mxCharSet_Methods, (PyObject *)self, (char *)name); } /* Python Type Tables */ static PySequenceMethods mxCharSet_TypeAsSequence = { (lenfunc)0, /*sq_length*/ (binaryfunc)0, /*sq_concat*/ (ssizeargfunc)0, /*sq_repeat*/ (ssizeargfunc)0, /*sq_item*/ (ssizessizeargfunc)0, /*sq_slice*/ (ssizeobjargproc)0, /*sq_ass_item*/ (ssizessizeobjargproc)0, /*sq_ass_slice*/ #if PY_VERSION_HEX >= 0x02000000 (objobjproc)mxCharSet_Contains, 
/*sq_contains*/ #endif }; PyTypeObject mxCharSet_Type = { PyObject_HEAD_INIT(0) /* init at startup ! */ 0, /* ob_size */ "Character Set", /* tp_name */ sizeof(mxCharSetObject), /* tp_basicsize */ 0, /* tp_itemsize */ /* methods */ (destructor)mxCharSet_Free, /* tp_dealloc */ (printfunc)0, /* tp_print */ (getattrfunc)mxCharSet_GetAttr, /* tp_getattr */ (setattrfunc)0, /* tp_setattr */ (cmpfunc)0, /* tp_compare */ (reprfunc)mxCharSet_Repr, /* tp_repr */ 0, /* tp_as_number */ &mxCharSet_TypeAsSequence, /* tp_as_sequence */ 0, /* tp_as_mapping */ (hashfunc)0, /* tp_hash */ (ternaryfunc)0, /* tp_call */ (reprfunc)0, /* tp_str */ (getattrofunc)0, /* tp_getattro */ (setattrofunc)0, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT, /* tp_flags */ (char*) 0, /* tp_doc */ }; /* Python Method Table */ statichere PyMethodDef mxCharSet_Methods[] = { Py_MethodListEntry("contains",mxCharSet_contains), Py_MethodListEntry("search",mxCharSet_search), Py_MethodListEntry("match",mxCharSet_match), Py_MethodListEntry("strip",mxCharSet_strip), Py_MethodListEntry("split",mxCharSet_split), Py_MethodListEntry("splitx",mxCharSet_splitx), #ifdef COPY_PROTOCOL Py_MethodListEntry("__deepcopy__",mxCharSet_copy), Py_MethodListEntry("__copy__",mxCharSet_copy), #endif {NULL,NULL} /* end of list */ }; /* --- Tag Table Object ------------------------------------------------*/ staticforward PyMethodDef mxTagTable_Methods[]; PyObject *mxTagTable_New(PyObject *definition, int tabletype, int cacheable); /* internal APIs */ static PyObject *tc_get_item(register PyObject *obj, register Py_ssize_t i) { if (PyTuple_Check(obj)) { if (i > PyTuple_GET_SIZE(obj)) return NULL; return PyTuple_GET_ITEM(obj, i); } else if (PyList_Check(obj)) { if (i > PyList_GET_SIZE(obj)) return NULL; return PyList_GET_ITEM(obj, i); } else return NULL; } static Py_ssize_t tc_length(register PyObject *obj) { if (obj == NULL) return -1; else if (PyTuple_Check(obj)) return PyTuple_GET_SIZE(obj); else if (PyList_Check(obj)) 
return PyList_GET_SIZE(obj); else return -1; } /* Add a jump target to the jump dictionary */ static Py_ssize_t tc_add_jumptarget(PyObject *jumpdict, PyObject *targetname, Py_ssize_t index) { PyObject *v; v = PyDict_GetItem(jumpdict, targetname); if (v != NULL) Py_ErrorWithArg(PyExc_TypeError, "tag table entry %d: " "jump target already defined", (unsigned int) index); v = PyInt_FromLong(index); if (v == NULL) goto onError; if (PyDict_SetItem(jumpdict, targetname, v)) goto onError; Py_DECREF(v); return 0; onError: return -1; } /* Convert a string command argument to either an 8-bit string or Unicode depending on the tabletype. */ static PyObject *tc_convert_string_arg(PyObject *arg, Py_ssize_t tableposition, int tabletype) { /* Convert to strings */ if (tabletype == MXTAGTABLE_STRINGTYPE) { if (PyString_Check(arg)) return arg; #ifdef HAVE_UNICODE else if (PyUnicode_Check(arg)) { Py_DECREF(arg); arg = PyUnicode_AsEncodedString(arg, NULL, NULL); if (arg == NULL) Py_ErrorWithArg(PyExc_TypeError, "tag table entry %d: " "conversion from Unicode to " "string failed", (unsigned int)tableposition); } #endif else Py_ErrorWithArg(PyExc_TypeError, "tag table entry %d: " "command argument must be a " "string or unicode", (unsigned int)tableposition); } #ifdef HAVE_UNICODE /* Convert to Unicode */ else if (tabletype == MXTAGTABLE_UNICODETYPE) { if (PyUnicode_Check(arg)) return arg; else if (PyString_Check(arg)) { Py_DECREF(arg); arg = PyUnicode_Decode(PyString_AS_STRING(arg), PyString_GET_SIZE(arg), NULL, NULL); if (arg == NULL) Py_ErrorWithArg(PyExc_TypeError, "tag table entry %d: " "conversion from string to " "Unicode failed", (unsigned int)tableposition); } else Py_ErrorWithArg(PyExc_TypeError, "tag table entry %d: " "command argument must be a " "string or unicode", (unsigned int)tableposition); } #endif else Py_Error(mxTextTools_Error, "unsupported table type"); return arg; onError: return NULL; } /* Cleanup any references in the tag table. 
*/ static int tc_cleanup(mxTagTableObject *tagtable) { Py_ssize_t i; for (i = 0; i < tagtable->ob_size; i++) { mxTagTableEntry *tagtableentry = &tagtable->entry[i]; Py_XDECREF(tagtableentry->tagobj); tagtableentry->tagobj = NULL; Py_XDECREF(tagtableentry->args); tagtableentry->args = NULL; } return 0; } /* Initialize the tag table (this is the actual Tag Table compiler) */ static int init_tag_table(mxTagTableObject *tagtable, PyObject *table, Py_ssize_t size, int tabletype, int cacheable) { Py_ssize_t i; PyObject *entry; Py_ssize_t entry_len; PyObject *tagobj, *command, *args = 0, *je, *jne; PyObject *jumpdict, *v; int secondpass, own_args = 0; jumpdict = PyDict_New(); if (jumpdict == NULL) return -1; /* Reset to all fields to 0 */ memset(&tagtable->entry[0], 0, size * sizeof(mxTagTableEntry)); /* First pass */ secondpass = 0; for (i = 0; i < size; i++) { mxTagTableEntry *tagtableentry = &tagtable->entry[i]; /* Get table entry i and parse it */ entry = tc_get_item(table, i); if (entry == NULL) { Py_ErrorWithArg(PyExc_TypeError, "tag table entry %d: " "not found or not a supported entry type", (unsigned int)i); } /* Special handling for jump marks (args is set to the jump mark string, jump target index is the next table entry) */ if (PyString_Check(entry)) { if (tc_add_jumptarget(jumpdict, entry, i + 1)) goto onError; tagtableentry->tagobj = NULL; tagtableentry->cmd = MATCH_JUMPTARGET; tagtableentry->flags = 0; Py_INCREF(entry); tagtableentry->args = entry; tagtableentry->jne = 0; tagtableentry->je = 1; continue; } /* Get entry length */ entry_len = tc_length(entry); if (entry_len < 3) { Py_ErrorWithArg(PyExc_TypeError, "tag table entry %d: " "expected an entry of the form " "(tagobj,command,arg[,jne[,je]])", (unsigned int)i); } /* Decode entry parts: (tagobj, command, args[, jne[, je]]) */ tagobj = tc_get_item(entry, 0); command = tc_get_item(entry, 1); args = tc_get_item(entry, 2); if (entry_len >= 4) jne = tc_get_item(entry, 3); else jne = NULL; if (entry_len >= 
5) je = tc_get_item(entry, 4); else je = NULL; if (tagobj == NULL || command == NULL || args == NULL || (entry_len >= 4 && jne == NULL) || (entry_len >= 5 && je == NULL)) { Py_ErrorWithArg(PyExc_TypeError, "tag table entry %d: " "expected an entry of the form " "(tagobj,command,arg[,jne[,je]])",(unsigned int) i); } /* Store tagobj, None gets converted to NULL */ if (tagobj != Py_None) Py_INCREF(tagobj); else tagobj = NULL; tagtableentry->tagobj = tagobj; /* Decode command and flags */ Py_AssertWithArg(PyInt_Check(command), PyExc_TypeError, "tag table entry %d: " "command must be an integer",(unsigned int)i); tagtableentry->cmd = PyInt_AS_LONG(command) & 0xFF; tagtableentry->flags = PyInt_AS_LONG(command) - tagtableentry->cmd; /* Check command arguments */ Py_INCREF(args); own_args = 1; switch (tagtableentry->cmd) { case MATCH_JUMP: /* == MATCH_FAIL */ case MATCH_EOF: case MATCH_LOOP: /* args is ignored */ break; case MATCH_SKIP: case MATCH_MOVE: case MATCH_LOOPCONTROL: Py_AssertWithArg(PyInt_Check(args), PyExc_TypeError, "tag table entry %d: " "Skip|Move|LoopControl command argument " "must be an integer", (unsigned int)i); break; case MATCH_JUMPTARGET: Py_AssertWithArg(PyString_Check(args), PyExc_TypeError, "tag table entry %d: " "JumpMark command argument must be a string",(unsigned int)i); if (tc_add_jumptarget(jumpdict, args, i + 1)) goto onError; break; case MATCH_ALLIN: case MATCH_ALLNOTIN: case MATCH_IS: case MATCH_ISIN: case MATCH_ISNOTIN: case MATCH_WORD: case MATCH_WORDSTART: case MATCH_WORDEND: args = tc_convert_string_arg(args, i, tabletype); if (args == NULL) goto onError; break; case MATCH_ALLINSET: case MATCH_ISINSET: Py_AssertWithArg(PyString_Check(args) && PyString_GET_SIZE(args) == 32, PyExc_TypeError, "tag table entry %d: " "AllInSet|IsInSet command argument must " "be a set() string",(unsigned int)i); break; case MATCH_ALLINCHARSET: case MATCH_ISINCHARSET: Py_AssertWithArg(mxCharSet_Check(args), PyExc_TypeError, "tag table entry %d: " 
"AllInCharSet|IsInCharSet command argument must " "be a CharSet instance",(unsigned int)i); break; case MATCH_SWORDSTART: /* == MATCH_NOWORD */ case MATCH_SWORDEND: case MATCH_SFINDWORD: Py_AssertWithArg(mxTextSearch_Check(args), PyExc_TypeError, "tag table entry %d: " "sWordStart|sWordEnd|sFindWord command " "argument must be a TextSearch search " "object",(unsigned int)i); break; case MATCH_TABLE: case MATCH_SUBTABLE: Py_AssertWithArg(mxTagTable_Check(args) || PyTuple_Check(args) || PyList_Check(args) || (PyInt_Check(args) && PyInt_AS_LONG(args) == MATCH_THISTABLE), PyExc_TypeError, "tag table entry %d: " "Table|SubTable command argument " "must be a tag table tuple/object or " "ThisTable", (unsigned int)i); /* XXX We shouldn't recursively compile tag table tuples here because this will slow down the compile process too much and it's not clear whether this particular table will ever be used during tagging. */ if (!mxTagTable_Check(args) && !PyInt_Check(args)) { Py_DECREF(args); args = mxTagTable_New(args, tabletype, cacheable); if (args == NULL) goto onError; } break; case MATCH_TABLEINLIST: case MATCH_SUBTABLEINLIST: Py_AssertWithArg(PyTuple_Check(args) && PyTuple_GET_SIZE(args) == 2 && PyList_Check(PyTuple_GET_ITEM(args, 0)) && PyInt_Check(PyTuple_GET_ITEM(args, 1)), PyExc_TypeError, "tag table entry %d: " "TableInList|SubTableInList command argument " "must be a 2-tuple (list, integer)", (unsigned int)i); break; case MATCH_CALL: Py_AssertWithArg(PyCallable_Check(args), PyExc_TypeError, "tag table entry %d: " "Call command argument " "must be a callable object", (unsigned int)i); break; case MATCH_CALLARG: Py_AssertWithArg(PyTuple_Check(args) && PyTuple_GET_SIZE(args) > 0 && PyCallable_Check(PyTuple_GET_ITEM(args, 0)), PyExc_TypeError, "tag table entry %d: " "CallArg command argument " "must be a tuple (fct,[arg0,arg1,...])", (unsigned int)i); break; default: Py_ErrorWith2Args(PyExc_TypeError, "tag table entry %d: " "unknown command integer: %i", (unsigned 
int)i, tagtableentry->cmd); } /* Store command args */ tagtableentry->args = args; own_args = 0; /* Decode jump offsets */ if (jne) { if (PyInt_Check(jne)) tagtableentry->jne = PyInt_AS_LONG(jne); else if (PyString_Check(jne)) { /* Mark for back-patching */ tagtableentry->jne = -424242; secondpass = 1; } else Py_ErrorWithArg(PyExc_TypeError, "tag table entry %d: " "jne must be an integer or string", (unsigned int)i); } else tagtableentry->jne = 0; if (je) { if (PyInt_Check(je)) tagtableentry->je = PyInt_AS_LONG(je); else if (PyString_Check(je)) { /* Mark for back-patching */ tagtableentry->je = -424242; secondpass = 1; } else Py_ErrorWithArg(PyExc_TypeError, "tag table entry %d: " "je must be an integer or string", (unsigned int)i); } else tagtableentry->je = 1; } /* Second pass (needed to patch string jump targets) */ if (secondpass) for (i = 0; i < size; i++) { mxTagTableEntry *tagtableentry = &tagtable->entry[i]; if (tagtableentry->je != -424242 && tagtableentry->jne != -424242) continue; /* Entry (most probably) needs back-patching */ entry = tc_get_item(table, i); if (entry == NULL) { Py_ErrorWithArg(PyExc_TypeError, "tag table entry %d: " "unexpected error (not found)", (unsigned int)i); } /* Get entry length */ entry_len = tc_length(entry); if (entry_len < 0) { Py_ErrorWithArg(PyExc_TypeError, "tag table entry %d: " "unexpected error (no length)", (unsigned int)i); } /* Decode jump offsets */ if (entry_len >= 4) jne = tc_get_item(entry, 3); else jne = NULL; if (entry_len >= 5) je = tc_get_item(entry, 4); else je = NULL; /* Patch jump offsets */ if (jne && PyString_Check(jne)) { v = PyDict_GetItem(jumpdict, jne); if (v == NULL || !PyInt_Check(v)) Py_ErrorWith2Args(PyExc_TypeError, "tag table entry %d: " "jne jump target '%s' not found", (unsigned int)i, PyString_AS_STRING(jne)); tagtableentry->jne = PyInt_AS_LONG(v) - i; } if (je && PyString_Check(je)) { v = PyDict_GetItem(jumpdict, je); if (v == NULL || !PyInt_Check(v)) Py_ErrorWith2Args(PyExc_TypeError, 
"tag table entry %d: " "je jump target '%s' not found", (unsigned int)i, PyString_AS_STRING(je)); tagtableentry->je = PyInt_AS_LONG(v) - i; } } Py_DECREF(jumpdict); return 0; onError: if (own_args) { Py_DECREF(args); } return -1; } /* Check the cache for an already compiled TagTable for this definition. Return NULL in case of an error, Py_None without INCREF in case no such table was found or the TagTable object. */ static PyObject *consult_tagtable_cache(PyObject *definition, int tabletype, int cacheable) { PyObject *v, *key, *tt; if (!PyTuple_Check(definition) || !cacheable) return Py_None; key = PyTuple_New(2); if (key == NULL) goto onError; v = PyInt_FromLong((long) definition); if (v == NULL) goto onError; PyTuple_SET_ITEM(key, 0, v); v = PyInt_FromLong(tabletype); if (v == NULL) goto onError; PyTuple_SET_ITEM(key, 1, v); tt = PyDict_GetItem(mxTextTools_TagTables, key); Py_DECREF(key); if (tt != NULL) { Py_INCREF(tt); return tt; } return Py_None; onError: return NULL; } /* Adds the compiled tagtable to the cache. Returns -1 in case of an error, 0 on success. 
*/ static int add_to_tagtable_cache(PyObject *definition, int tabletype, int cacheable, PyObject *tagtable) { PyObject *v, *key; int rc; if (!PyTuple_Check(definition) || !cacheable) return 0; key = PyTuple_New(2); if (key == NULL) goto onError; v = PyInt_FromLong((long) definition); if (v == NULL) goto onError; PyTuple_SET_ITEM(key, 0, v); v = PyInt_FromLong(tabletype); if (v == NULL) goto onError; PyTuple_SET_ITEM(key, 1, v); /* Hard-limit the cache size */ if (PyDict_Size(mxTextTools_TagTables) >= MAX_TAGTABLES_CACHE_SIZE) PyDict_Clear(mxTextTools_TagTables); rc = PyDict_SetItem(mxTextTools_TagTables, key, tagtable); Py_DECREF(key); if (rc) goto onError; return 0; onError: return -1; } /* allocation */ PyObject *mxTagTable_New(PyObject *definition, int tabletype, int cacheable) { mxTagTableObject *tagtable = 0; PyObject *v; Py_ssize_t size; /* First, consult the TagTable cache */ v = consult_tagtable_cache(definition, tabletype, cacheable); if (v == NULL) goto onError; else if (v != Py_None) return v; size = tc_length(definition); if (size < 0) Py_Error(PyExc_TypeError, "tag table definition must be a tuple or a list"); tagtable = PyObject_NEW_VAR(mxTagTableObject, &mxTagTable_Type, size); if (tagtable == NULL) goto onError; if (cacheable) { Py_INCREF(definition); tagtable->definition = definition; } else tagtable->definition = NULL; tagtable->tabletype = tabletype; /* Compile table ... 
*/ if (init_tag_table(tagtable, definition, size, tabletype, cacheable)) goto onError; /* Cache the compiled table if it is cacheable and derived from a tuple */ if (add_to_tagtable_cache(definition, tabletype, cacheable, (PyObject *)tagtable)) goto onError; return (PyObject *)tagtable; onError: Py_XDECREF(tagtable); return NULL; } Py_C_Function( mxTagTable_TagTable, "TagTable(definition[,cachable=1])\n\n" ) { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); PyObject *definition; int cacheable = 1; Py_Get2Args("O|i:TagTable", definition, cacheable); return mxTagTable_New(definition, 0, cacheable); onError: return NULL; } #ifdef HAVE_UNICODE Py_C_Function( mxTagTable_UnicodeTagTable, "TagTable(definition[,cachable=1])\n\n" ) { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); PyObject *definition; int cacheable = 1; Py_Get2Args("O|i:UnicodeTagTable", definition, cacheable); return mxTagTable_New(definition, 1, cacheable); onError: return NULL; } #endif static void mxTagTable_Free(mxTagTableObject *tagtable) { tc_cleanup(tagtable); Py_XDECREF(tagtable->definition); PyObject_Del(tagtable); } /* C APIs */ #define tagtable ((mxTagTableObject *)self) static PyObject *mxTagTable_CompiledDefinition(PyObject *self) { PyObject *tuple = 0, *v, *w; Py_ssize_t i; Py_ssize_t size; if (!mxTagTable_Check(self)) { PyErr_BadInternalCall(); goto onError; } size = tagtable->ob_size; tuple = PyTuple_New(size); if (tuple == NULL) goto onError; for (i = 0; i < size; i++) { mxTagTableEntry *tagtableentry = &tagtable->entry[i]; /* Build tuple (tagobj, command, args, jne, je) */ v = PyTuple_New(5); if (v == NULL) goto onError; w = tagtableentry->tagobj; if (w == NULL) w = Py_None; Py_INCREF(w); PyTuple_SET_ITEM(v, 0, w); PyTuple_SET_ITEM(v, 1, PyInt_FromLong(tagtableentry->cmd | tagtableentry->flags)); w = tagtableentry->args; if (w == NULL) w = Py_None; Py_INCREF(w); PyTuple_SET_ITEM(v, 2, w); PyTuple_SET_ITEM(v, 3, 
PyInt_FromLong(tagtableentry->jne)); PyTuple_SET_ITEM(v, 4, PyInt_FromLong(tagtableentry->je)); if (PyErr_Occurred()) { Py_DECREF(v); goto onError; } PyTuple_SET_ITEM(tuple, i, v); } return tuple; onError: Py_XDECREF(tuple); return NULL; } /* methods */ Py_C_Function( mxTagTable_compiled, ".compiled()\n\n" ) { Py_NoArgsCheck(); return mxTagTable_CompiledDefinition(self); onError: return NULL; } #ifdef COPY_PROTOCOL Py_C_Function( mxTagTable_copy, "copy([memo])\n\n" "Return a new reference for the instance. This function\n" "is used for the copy-protocol. Real copying doesn't take\n" "place, since the instances are immutable.") { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); PyObject *memo; Py_GetArg("|O",memo); Py_INCREF(tagtable); return (PyObject *)tagtable; onError: return NULL; } #endif #undef tagtable /* --- slots --- */ static PyObject *mxTagTable_Repr(mxTagTableObject *self) { char t[100]; if (self->tabletype == MXTAGTABLE_STRINGTYPE) sprintf(t,"", (long)self); else if (self->tabletype == MXTAGTABLE_UNICODETYPE) sprintf(t,"", (long)self); else sprintf(t,"", (long)self); return PyString_FromString(t); } static PyObject *mxTagTable_GetAttr(mxTagTableObject *self, char *name) { PyObject *v; if (Py_WantAttr(name,"definition")) { v = self->definition; if (v == NULL) v = Py_None; Py_INCREF(v); return v; } else if (Py_WantAttr(name,"__members__")) return Py_BuildValue("[s]", "definition"); return Py_FindMethod(mxTagTable_Methods, (PyObject *)self, (char *)name); } /* Python Type Tables */ PyTypeObject mxTagTable_Type = { PyObject_HEAD_INIT(0) /* init at startup ! 
*/ 0, /* ob_size */ "Tag Table", /* tp_name */ sizeof(mxTagTableObject), /* tp_basicsize */ sizeof(mxTagTableEntry), /* tp_itemsize */ /* methods */ (destructor)mxTagTable_Free, /* tp_dealloc */ (printfunc)0, /* tp_print */ (getattrfunc)mxTagTable_GetAttr, /* tp_getattr */ (setattrfunc)0, /* tp_setattr */ (cmpfunc)0, /* tp_compare */ (reprfunc)mxTagTable_Repr, /* tp_repr */ 0, /* tp_as_number */ 0, /* tp_as_sequence */ 0, /* tp_as_mapping */ (hashfunc)0, /* tp_hash */ (ternaryfunc)0, /* tp_call */ (reprfunc)0, /* tp_str */ (getattrofunc)0, /* tp_getattro */ (setattrofunc)0, /* tp_setattro */ 0, /* tp_as_buffer */ Py_TPFLAGS_DEFAULT, /* tp_flags */ (char*) 0, /* tp_doc */ }; /* Python Method Table */ statichere PyMethodDef mxTagTable_Methods[] = { Py_MethodListEntryNoArgs("compiled",mxTagTable_compiled), #ifdef COPY_PROTOCOL Py_MethodListEntry("__deepcopy__",mxTagTable_copy), Py_MethodListEntry("__copy__",mxTagTable_copy), #endif {NULL,NULL} /* end of list */ }; /* --- Internal functions ----------------------------------------------*/ #ifdef HAVE_UNICODE /* Same as mxTextTools_Join() for Unicode objects. 
*/ static PyObject *mxTextTools_UnicodeJoin(PyObject *seq, Py_ssize_t start, Py_ssize_t stop, PyObject *separator) { PyObject *newstring = 0, *tempstr = 0; Py_ssize_t newstring_len,current_len = 0; Py_UNICODE *p; Py_ssize_t i; Py_UNICODE *sep; Py_ssize_t sep_len; if (separator) { separator = PyUnicode_FromObject(separator); if (separator == NULL) goto onError; sep = PyUnicode_AS_UNICODE(separator); sep_len = PyUnicode_GET_SIZE(separator); } else { sep = NULL; sep_len = 0; } /* Create an empty new string */ newstring_len = (10 + sep_len) * (stop - start); newstring = PyUnicode_FromUnicode(NULL, newstring_len); if (newstring == NULL) goto onError; p = PyUnicode_AS_UNICODE(newstring); /* Join with separator */ for (i = start; i < stop; i++) { register PyObject *o; Py_UNICODE *st; Py_ssize_t len_st; o = PySequence_GetItem(seq, i); if PyTuple_Check(o) { /* Tuple entry: (string,l,r,[...]) */ register Py_ssize_t l,r; /* parse tuple */ Py_Assert((PyTuple_GET_SIZE(o) >= 3) && PyInt_Check(PyTuple_GET_ITEM(o,1)) && PyInt_Check(PyTuple_GET_ITEM(o,2)), PyExc_TypeError, "tuples must be of the format (string,l,r[,...])"); tempstr = PyUnicode_FromObject(PyTuple_GET_ITEM(o,0)); if (tempstr == NULL) goto onError; st = PyUnicode_AS_UNICODE(tempstr); len_st = PyUnicode_GET_SIZE(tempstr); l = PyInt_AS_LONG(PyTuple_GET_ITEM(o,1)); r = PyInt_AS_LONG(PyTuple_GET_ITEM(o,2)); /* compute slice */ if (r > len_st) r = len_st; else if (r < 0) { r += len_st + 1; if (r < 0) r = 0; } if (l > len_st) l = len_st; else if (l < 0) { l += len_st + 1; if (l < 0) l = 0; } /* empty ? 
*/ if (l > r) continue; len_st = r - l; if (len_st == 0) continue; /* get pointer right */ st += l; } else { /* Must be a string entry: take the whole string */ tempstr = PyUnicode_FromObject(o); if (tempstr == NULL) goto onError; st = PyUnicode_AS_UNICODE(tempstr); len_st = PyUnicode_GET_SIZE(tempstr); } Py_DECREF(o); /* Resize the new string if needed */ while (current_len + len_st + sep_len >= newstring_len) { newstring_len += newstring_len >> 1; if (PyUnicode_Resize(&newstring, newstring_len)) goto onError; p = PyUnicode_AS_UNICODE(newstring) + current_len; } /* Insert separator */ if (i > 0 && sep_len > 0) { Py_UNICODE_COPY(p, sep, sep_len); p += sep_len; current_len += sep_len; } /* Copy snippet into new string */ Py_UNICODE_COPY(p, st, len_st); p += len_st; current_len += len_st; Py_DECREF(tempstr); tempstr = NULL; } /* Resize new string to the actual length */ if (PyUnicode_Resize(&newstring, current_len)) goto onError; Py_XDECREF(separator); return newstring; onError: Py_XDECREF(newstring); Py_XDECREF(separator); Py_XDECREF(tempstr); return NULL; } #endif /* Enhanced string join: also excepts tuple (text, left, right,...) entries which then cause text[left:right] to be used as string snippet. separator may be NULL; in that case, "" is used as separator. 
*/ static PyObject *mxTextTools_Join(PyObject *seq, Py_ssize_t start, Py_ssize_t stop, PyObject *separator) { PyObject *newstring = 0; Py_ssize_t newstring_len, current_len = 0; char *p; Py_ssize_t i; char *sep; Py_ssize_t sep_len; if (separator) { #ifdef HAVE_UNICODE if (PyUnicode_Check(separator)) return mxTextTools_UnicodeJoin(seq, start, stop, separator); #endif Py_Assert(PyString_Check(separator), PyExc_TypeError, "separator must be a string"); sep = PyString_AS_STRING(separator); sep_len = PyString_GET_SIZE(separator); } else { sep = NULL; sep_len = 0; } /* Create an empty new string */ newstring_len = (10 + sep_len) * (stop - start); newstring = PyString_FromStringAndSize((char*)NULL, newstring_len); if (newstring == NULL) goto onError; p = PyString_AS_STRING(newstring); /* Join with separator */ for (i = start; i < stop; i++) { register PyObject *o; char *st; Py_ssize_t len_st; o = PySequence_GetItem(seq, i); if PyTuple_Check(o) { /* Tuple entry: (string,l,r,[...]) */ register Py_ssize_t l,r; /* parse tuple */ Py_Assert((PyTuple_GET_SIZE(o) >= 3) && PyInt_Check(PyTuple_GET_ITEM(o,1)) && PyInt_Check(PyTuple_GET_ITEM(o,2)), PyExc_TypeError, "tuples must be of the format (string,int,int[,...])"); #ifdef HAVE_UNICODE if (PyUnicode_Check(PyTuple_GET_ITEM(o,0))) { /* Redirect to Unicode implementation; all previous work is lost. */ Py_DECREF(o); Py_DECREF(newstring); return mxTextTools_UnicodeJoin(seq, start, stop, separator); } #endif Py_Assert(PyString_Check(PyTuple_GET_ITEM(o,0)), PyExc_TypeError, "tuples must be of the format (string,int,int[,...])"); st = PyString_AS_STRING(PyTuple_GET_ITEM(o,0)); len_st = PyString_GET_SIZE(PyTuple_GET_ITEM(o,0)); l = PyInt_AS_LONG(PyTuple_GET_ITEM(o,1)); r = PyInt_AS_LONG(PyTuple_GET_ITEM(o,2)); /* compute slice */ if (r > len_st) r = len_st; else if (r < 0) { r += len_st + 1; if (r < 0) r = 0; } if (l > len_st) l = len_st; else if (l < 0) { l += len_st + 1; if (l < 0) l = 0; } /* empty ? 
*/ if (l > r) continue; len_st = r - l; if (len_st == 0) continue; /* get pointer right */ st += l; } else if (PyString_Check(o)) { /* String entry: take the whole string */ st = PyString_AS_STRING(o); len_st = PyString_GET_SIZE(o); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(o)) { /* Redirect to Unicode implementation; all previous work is lost. */ Py_DECREF(o); Py_DECREF(newstring); return mxTextTools_UnicodeJoin(seq, start, stop, separator); } #endif else { Py_DECREF(o); Py_Error(PyExc_TypeError, "list must contain tuples or strings as entries"); } Py_DECREF(o); /* Resize the new string if needed */ while (current_len + len_st + sep_len >= newstring_len) { newstring_len += newstring_len >> 1; if (_PyString_Resize(&newstring, newstring_len)) goto onError; p = PyString_AS_STRING(newstring) + current_len; } /* Insert separator */ if (i > 0 && sep_len > 0) { memcpy(p, sep, sep_len); p += sep_len; current_len += sep_len; } /* Copy snippet into new string */ memcpy(p,st,len_st); p += len_st; current_len += len_st; } /* Resize new string to the actual length */ if (_PyString_Resize(&newstring, current_len)) goto onError; return newstring; onError: Py_XDECREF(newstring); return NULL; } static PyObject *mxTextTools_HexStringFromString(char *str, Py_ssize_t len) { PyObject *w = 0; Py_ssize_t i; char *hex; static const char hexdigits[] = "0123456789abcdef"; /* Convert to HEX */ w = PyString_FromStringAndSize(NULL,2*len); if (!w) goto onError; hex = PyString_AS_STRING(w); for (i = 0; i < len; i ++) { unsigned char c = (unsigned char)*str; *hex++ = hexdigits[c >> 4]; *hex++ = hexdigits[c & 0x0F]; str++; } return w; onError: Py_XDECREF(w); return NULL; } static PyObject *mxTextTools_StringFromHexString(char *hex, Py_ssize_t len) { PyObject *w = 0; Py_ssize_t i; char *str; static const char hexdigits[] = "0123456789abcdef"; /* Convert to string */ Py_Assert(len % 2 == 0, PyExc_TypeError, "need 2-digit hex string argument"); len >>= 1; w = 
PyString_FromStringAndSize(NULL,len); if (!w) goto onError; str = PyString_AS_STRING(w); for (i = 0; i < len; i++,str++) { register char c; register Py_ssize_t j; c = tolower(*hex++); for (j = 0; j < (Py_ssize_t)sizeof(hexdigits); j++) if (c == hexdigits[j]) { *str = j << 4; break; } if (j == sizeof(hexdigits)) { DPRINTF("Failed: '%c' (%u) at %i\n",c,(unsigned int)c,i); Py_Error(PyExc_ValueError, "argument contains non-hex characters"); } c = tolower(*hex++); for (j = 0; j < (Py_ssize_t)sizeof(hexdigits); j++) if (c == hexdigits[j]) { *str += j; break; } if (j == sizeof(hexdigits)) { DPRINTF("Failed2: '%c' (%u) at %i\n",c,(unsigned int)c,i); Py_Error(PyExc_ValueError, "argument contains non-hex characters"); } } return w; onError: Py_XDECREF(w); return NULL; } static int mxTextTools_IsASCII(PyObject *text, Py_ssize_t left, Py_ssize_t right) { if (PyString_Check(text)) { Py_ssize_t len; register Py_ssize_t i; register unsigned char *str = (unsigned char *)PyString_AS_STRING(text); len = PyString_GET_SIZE(text); Py_CheckSequenceSlice(len, left, right); for (i = left; i < right; i++) if (str[i] >= 128) return 0; return 1; } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { Py_ssize_t len; register Py_ssize_t i; register Py_UNICODE *str = PyUnicode_AS_UNICODE(text); len = PyUnicode_GET_SIZE(text); Py_CheckSequenceSlice(len, left, right); for (i = left; i < right; i++) if (str[i] >= 128) return 0; return 1; } #endif else Py_Error(PyExc_TypeError, "need string object"); onError: return -1; } /* Takes a list of tuples (replacement,l,r,...) and produces a taglist suitable for mxTextTools_Join() which creates a copy of text where every slice [l:r] is replaced by the given replacement. 
*/ static PyObject *mxTextTools_Joinlist(PyObject *text, PyObject *list, Py_ssize_t pos, Py_ssize_t text_len) { PyObject *joinlist = 0; Py_ssize_t list_len; Py_ssize_t i; Py_ssize_t listitem = 0; Py_ssize_t listsize = INITIAL_LIST_SIZE; if (PyString_Check(text)) { Py_CheckStringSlice(text, pos, text_len); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { Py_CheckUnicodeSlice(text, pos, text_len); } #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); Py_Assert(PyList_Check(list), PyExc_TypeError, "expected a list of tuples as second argument"); list_len = PyList_GET_SIZE(list); joinlist = PyList_New(listsize); if (joinlist == NULL) goto onError; for (i = 0; i < list_len; i++) { register PyObject *t; register Py_ssize_t left, right; t = PyList_GET_ITEM(list, i); Py_Assert(PyTuple_Check(t) && (PyTuple_GET_SIZE(t) >= 3) && (PyString_Check(PyTuple_GET_ITEM(t,0)) || PyUnicode_Check(PyTuple_GET_ITEM(t,0))) && PyInt_Check(PyTuple_GET_ITEM(t,1)) && PyInt_Check(PyTuple_GET_ITEM(t,2)), PyExc_TypeError, "tuples must be of the form (string,int,int,...)"); left = PyInt_AS_LONG(PyTuple_GET_ITEM(t,1)); right = PyInt_AS_LONG(PyTuple_GET_ITEM(t,2)); Py_Assert(left >= pos, PyExc_ValueError, "list is not sorted ascending"); if (left > pos) { /* joinlist.append((text,pos,left)) */ register PyObject *v; register PyObject *w; v = PyTuple_New(3); if (v == NULL) goto onError; Py_INCREF(text); PyTuple_SET_ITEM(v,0,text); w = PyInt_FromLong(pos); if (w == NULL) goto onError; PyTuple_SET_ITEM(v,1,w); w = PyTuple_GET_ITEM(t,1); Py_INCREF(w); PyTuple_SET_ITEM(v,2,w); if (listitem < listsize) PyList_SET_ITEM(joinlist,listitem,v); else { PyList_Append(joinlist,v); Py_DECREF(v); } listitem++; } /* joinlist.append(string) */ if (listitem < listsize) { register PyObject *v = PyTuple_GET_ITEM(t,0); Py_INCREF(v); PyList_SET_ITEM(joinlist,listitem,v); } else PyList_Append(joinlist,PyTuple_GET_ITEM(t,0)); listitem++; pos = right; } if (pos < text_len) { /* 
joinlist.append((text,pos,text_len)) */ register PyObject *v; register PyObject *w; v = PyTuple_New(3); if (v == NULL) goto onError; Py_INCREF(text); PyTuple_SET_ITEM(v,0,text); w = PyInt_FromLong(pos); if (w == NULL) goto onError; PyTuple_SET_ITEM(v,1,w); w = PyInt_FromLong(text_len); if (w == NULL) goto onError; PyTuple_SET_ITEM(v,2,w); if (listitem < listsize) PyList_SET_ITEM(joinlist,listitem,v); else { PyList_Append(joinlist,v); Py_DECREF(v); } listitem++; } /* Resize list if necessary */ if (listitem < listsize) PyList_SetSlice(joinlist,listitem,listsize,(PyObject*)NULL); return joinlist; onError: Py_XDECREF(joinlist); return NULL; } #ifdef HAVE_UNICODE static PyObject *mxTextTools_UnicodeCharSplit(PyObject *text, PyObject *separator, Py_ssize_t start, Py_ssize_t text_len) { PyObject *list = NULL; register Py_ssize_t x; Py_ssize_t listitem = 0; Py_ssize_t listsize = INITIAL_LIST_SIZE; Py_UNICODE *tx; Py_UNICODE sep; text = PyUnicode_FromObject(text); if (text == NULL) { separator = NULL; goto onError; } separator = PyUnicode_FromObject(separator); if (separator == NULL) goto onError; Py_CheckUnicodeSlice(text, start, text_len); Py_Assert(PyUnicode_GET_SIZE(separator) == 1, PyExc_TypeError, "separator must be a single character"); tx = PyUnicode_AS_UNICODE(text); sep = *PyUnicode_AS_UNICODE(separator); list = PyList_New(listsize); if (!list) goto onError; x = start; while (1) { PyObject *s; register Py_ssize_t z; /* Skip to next separator */ z = x; for (;x < text_len; x++) if (tx[x] == sep) break; /* Append the slice to list */ s = PyUnicode_FromUnicode(&tx[z], x - z); if (!s) goto onError; if (listitem < listsize) PyList_SET_ITEM(list,listitem,s); else { PyList_Append(list,s); Py_DECREF(s); } listitem++; if (x == text_len) break; /* Skip separator */ x++; } /* Resize list if necessary */ if (listitem < listsize) PyList_SetSlice(list,listitem,listsize,(PyObject*)NULL); Py_DECREF(text); Py_DECREF(separator); return list; onError: Py_XDECREF(list); 
    Py_XDECREF(text);
    Py_XDECREF(separator);
    return NULL;
}
#endif

/* Split text[start:text_len] at every occurrence of the single
   character *separator*, returning a list of the string parts.
   Redirects to the Unicode variant when either argument is Unicode. */
static
PyObject *mxTextTools_CharSplit(PyObject *text,
                                PyObject *separator,
                                Py_ssize_t start,
                                Py_ssize_t text_len)
{
    PyObject *list = 0;
    register Py_ssize_t x;
    Py_ssize_t listitem = 0;
    Py_ssize_t listsize = INITIAL_LIST_SIZE;
    char *tx;
    char sep;

#ifdef HAVE_UNICODE
    if (PyUnicode_Check(text) || PyUnicode_Check(separator))
        return mxTextTools_UnicodeCharSplit(text, separator,
                                            start, text_len);
#endif

    if (PyString_Check(text) && PyString_Check(separator)) {
        Py_CheckStringSlice(text, start, text_len);
    }
    else
        Py_Error(PyExc_TypeError,
                 "text and separator must be strings or unicode");

    Py_Assert(PyString_GET_SIZE(separator) == 1,
              PyExc_TypeError,
              "separator must be a single character");

    tx = PyString_AS_STRING(text);
    sep = *PyString_AS_STRING(separator);

    list = PyList_New(listsize);
    if (!list)
        goto onError;

    x = start;
    while (1) {
        PyObject *s;
        register Py_ssize_t z;

        /* Skip to next separator */
        z = x;
        for (;x < text_len; x++)
            if (tx[x] == sep)
                break;

        /* Append the slice to list */
        s = PyString_FromStringAndSize(&tx[z], x - z);
        if (!s)
            goto onError;
        if (listitem < listsize)
            PyList_SET_ITEM(list,listitem,s);
        else {
            PyList_Append(list,s);
            Py_DECREF(s);
        }
        listitem++;

        if (x == text_len)
            break;

        /* Skip separator */
        x++;
    }

    /* Resize list if necessary */
    if (listitem < listsize)
        PyList_SetSlice(list,listitem,listsize,(PyObject*)NULL);

    return list;

 onError:
    Py_XDECREF(list);
    return NULL;
}

#ifdef HAVE_UNICODE
/* Split text[start:text_len] once at the nth occurrence of the single
   character *separator* (counting from the left for nth > 0, from the
   right for nth < 0) and return the (before, after) tuple of Unicode
   strings; the separator itself is dropped. */
static
PyObject *mxTextTools_UnicodeSplitAt(PyObject *text,
                                     PyObject *separator,
                                     Py_ssize_t nth,
                                     Py_ssize_t start,
                                     Py_ssize_t text_len)
{
    PyObject *tuple = 0;
    register Py_ssize_t x;
    PyObject *s;
    Py_UNICODE *tx;
    Py_UNICODE sep;

    text = PyUnicode_FromObject(text);
    if (text == NULL) {
        /* make sure the error exit does not release an unowned ref */
        separator = NULL;
        goto onError;
    }
    separator = PyUnicode_FromObject(separator);
    if (separator == NULL)
        goto onError;

    Py_CheckUnicodeSlice(text, start, text_len);

    Py_Assert(PyUnicode_GET_SIZE(separator) == 1,
              PyExc_TypeError,
              "separator must be a single character");

    tx = PyUnicode_AS_UNICODE(text);
    sep = *PyUnicode_AS_UNICODE(separator);

    tuple = PyTuple_New(2);
    if (!tuple)
        goto onError;

    if (nth > 0) {
        /* Skip to nth separator from the left */
        x = start;
        while (1) {
            for (; x < text_len; x++)
                if (tx[x] == sep)
                    break;
            if (--nth == 0 || x == text_len)
                break;
            x++;
        }
    }
    else if (nth < 0) {
        /* Skip to nth separator from the right */
        x = text_len - 1;
        while (1) {
            for (; x >= start; x--)
                if (tx[x] == sep)
                    break;
            if (++nth == 0 || x < start)
                break;
            x--;
        }
    }
    else
        Py_Error(PyExc_ValueError,
                 "nth must be non-zero");

    /* Add to tuple */
    if (x < start)
        s = PyUnicode_FromUnicode((Py_UNICODE *)"", 0);
    else
        s = PyUnicode_FromUnicode(&tx[start], x - start);
    if (!s)
        goto onError;
    PyTuple_SET_ITEM(tuple,0,s);

    /* Skip separator */
    x++;

    if (x >= text_len)
        s = PyUnicode_FromUnicode((Py_UNICODE *)"", 0);
    else
        s = PyUnicode_FromUnicode(&tx[x], text_len - x);
    if (!s)
        goto onError;
    PyTuple_SET_ITEM(tuple,1,s);

    Py_DECREF(text);
    Py_DECREF(separator);
    return tuple;

 onError:
    Py_XDECREF(tuple);
    Py_XDECREF(text);
    Py_XDECREF(separator);
    return NULL;
}
#endif

/* 8-bit string version of the split-at-nth-separator helper above.
   Redirects to the Unicode variant when either argument is Unicode. */
static
PyObject *mxTextTools_SplitAt(PyObject *text,
                              PyObject *separator,
                              Py_ssize_t nth,
                              Py_ssize_t start,
                              Py_ssize_t text_len)
{
    PyObject *tuple = 0;
    register Py_ssize_t x;
    PyObject *s;
    char *tx;
    char sep;

#ifdef HAVE_UNICODE
    if (PyUnicode_Check(text) || PyUnicode_Check(separator))
        return mxTextTools_UnicodeSplitAt(text, separator,
                                          nth, start, text_len);
#endif

    if (PyString_Check(text) && PyString_Check(separator)) {
        Py_CheckStringSlice(text, start, text_len);
    }
    else
        Py_Error(PyExc_TypeError,
                 "text and separator must be strings or unicode");

    Py_Assert(PyString_GET_SIZE(separator) == 1,
              PyExc_TypeError,
              "separator must be a single character");

    tx = PyString_AS_STRING(text);
    sep = *PyString_AS_STRING(separator);

    tuple = PyTuple_New(2);
    if (!tuple)
        goto onError;

    if (nth > 0) {
        /* Skip to nth separator from the left */
        x = start;
        while (1) {
            for (; x < text_len; x++)
                if (tx[x] == sep)
                    break;
            if (--nth == 0 || x == text_len)
                break;
            x++;
        }
    }
    else if (nth < 0) {
        /* Skip to nth separator from the right */
        x = text_len - 1;
        while (1) {
            for (; x >= start; x--)
                if (tx[x] == sep)
                    break;
            if (++nth == 0 || x < start)
                break;
            x--;
        }
    }
    else
        Py_Error(PyExc_ValueError,
                 "nth must be non-zero");

    /* Add to tuple */
    if (x < start)
        s = PyString_FromStringAndSize("",0);
    else
        s = PyString_FromStringAndSize(&tx[start], x - start);
    if (!s)
        goto onError;
    PyTuple_SET_ITEM(tuple,0,s);

    /* Skip separator */
    x++;

    if (x >= text_len)
        s = PyString_FromStringAndSize("",0);
    else
        s = PyString_FromStringAndSize(&tx[x], text_len - x);
    if (!s)
        goto onError;
    PyTuple_SET_ITEM(tuple,1,s);

    return tuple;

 onError:
    Py_XDECREF(tuple);
    return NULL;
}

#ifdef HAVE_UNICODE
/* Return the first entry of the *suffixes* tuple that text[start:text_len]
   ends with, or None if no entry matches (Unicode version). */
static
PyObject *mxTextTools_UnicodeSuffix(PyObject *text,
                                    PyObject *suffixes,
                                    Py_ssize_t start,
                                    Py_ssize_t text_len,
                                    PyObject *translate)
{
    Py_ssize_t i;
    Py_UNICODE *tx;

    text = PyUnicode_FromObject(text);
    if (text == NULL)
        goto onError;
    if (PyUnicode_Check(text)) {
        Py_CheckUnicodeSlice(text, start, text_len);
    }
    else
        Py_Error(PyExc_TypeError,
                 "expected unicode");
    Py_Assert(PyTuple_Check(suffixes),
              PyExc_TypeError,
              "suffixes needs to be a tuple of unicode strings");

    /* XXX Add support for translate...
     */
    Py_Assert(translate == NULL,
              PyExc_TypeError,
              "translate is not supported for Unicode suffix()es");

    tx = PyUnicode_AS_UNICODE(text);
    for (i = 0; i < PyTuple_GET_SIZE(suffixes); i++) {
        PyObject *suffix = PyTuple_GET_ITEM(suffixes,i);
        Py_ssize_t start_cmp;

        /* coerce to Unicode; 'suffix' is an owned reference from here */
        suffix = PyUnicode_FromObject(suffix);
        if (suffix == NULL)
            goto onError;

        /* cheap first-character check before the full memcmp */
        start_cmp = text_len - PyUnicode_GET_SIZE(suffix);
        if (start_cmp >= start &&
            PyUnicode_AS_UNICODE(suffix)[0] == tx[start_cmp] &&
            memcmp(PyUnicode_AS_UNICODE(suffix),
                   &tx[start_cmp],
                   PyUnicode_GET_DATA_SIZE(suffix)) == 0) {
            Py_DECREF(text);
            /* the owned 'suffix' reference is handed to the caller */
            return suffix;
        }

        Py_DECREF(suffix);
    }
    Py_DECREF(text);

    Py_ReturnNone();

 onError:
    Py_XDECREF(text);
    return NULL;
}
#endif

/* Return the first entry of the *suffixes* tuple that text[start:text_len]
   ends with, or None if no entry matches (8-bit string version).
   *translate*, if given, must be a 256-char table through which the
   text (not the suffixes) is mapped before comparing.  Redirects to
   the Unicode variant for Unicode text. */
static
PyObject *mxTextTools_Suffix(PyObject *text,
                             PyObject *suffixes,
                             Py_ssize_t start,
                             Py_ssize_t text_len,
                             PyObject *translate)
{
    Py_ssize_t i;
    char *tx;

#ifdef HAVE_UNICODE
    if (PyUnicode_Check(text))
        return mxTextTools_UnicodeSuffix(text, suffixes,
                                         start, text_len,
                                         translate);
#endif

    if (PyString_Check(text)) {
        Py_CheckStringSlice(text, start, text_len);
    }
    else
        Py_Error(PyExc_TypeError,
                 "expected string or unicode");
    Py_Assert(PyTuple_Check(suffixes),
              PyExc_TypeError,
              "suffixes needs to be a tuple of strings");

    tx = PyString_AS_STRING(text);

    if (translate) {
        char *tr;

        Py_Assert(PyString_Check(translate) &&
                  PyString_GET_SIZE(translate) == 256,
                  PyExc_TypeError,
                  "translate must be a string having 256 characters");
        tr = PyString_AS_STRING(translate);

        for (i = 0; i < PyTuple_GET_SIZE(suffixes); i++) {
            PyObject *suffix = PyTuple_GET_ITEM(suffixes, i);
            Py_ssize_t start_cmp;
            register char *s;
            register char *t;
            register Py_ssize_t j;

            Py_AssertWithArg(PyString_Check(suffix),
                             PyExc_TypeError,
                             "tuple entry %d is not a string",(unsigned int)i);
            start_cmp = text_len - PyString_GET_SIZE(suffix);
            if (start_cmp < start)
                continue;

            /* Do the compare using a translate table */
            s = PyString_AS_STRING(suffix);
            t = tx + start_cmp;
            for (j = start_cmp; j < text_len; j++, s++, t++)
                if (*s != tr[(unsigned char)*t])
                    break;
            if (j == text_len) {
                /* borrowed tuple item -- INCREF before handing out */
                Py_INCREF(suffix);
                return suffix;
            }
        }
    }
    else
        for (i = 0; i < PyTuple_GET_SIZE(suffixes); i++) {
            PyObject *suffix = PyTuple_GET_ITEM(suffixes,i);
            Py_ssize_t start_cmp;

            Py_AssertWithArg(PyString_Check(suffix),
                             PyExc_TypeError,
                             "tuple entry %d is not a string",(unsigned int)i);
            start_cmp = text_len - PyString_GET_SIZE(suffix);
            if (start_cmp < start)
                continue;

            /* Compare without translate table; cheap first-character
               check before the strncmp */
            if (PyString_AS_STRING(suffix)[0] == tx[start_cmp] &&
                strncmp(PyString_AS_STRING(suffix),
                        &tx[start_cmp],
                        PyString_GET_SIZE(suffix)) == 0) {
                Py_INCREF(suffix);
                return suffix;
            }
        }

    Py_ReturnNone();

 onError:
    return NULL;
}

#ifdef HAVE_UNICODE
/* Return the first entry of the *prefixes* tuple that text[start:text_len]
   starts with, or None if no entry matches (Unicode version). */
static
PyObject *mxTextTools_UnicodePrefix(PyObject *text,
                                    PyObject *prefixes,
                                    Py_ssize_t start,
                                    Py_ssize_t text_len,
                                    PyObject *translate)
{
    Py_ssize_t i;
    Py_UNICODE *tx;

    text = PyUnicode_FromObject(text);
    if (text == NULL)
        goto onError;
    if (PyUnicode_Check(text)) {
        Py_CheckUnicodeSlice(text, start, text_len);
    }
    else
        Py_Error(PyExc_TypeError,
                 "expected unicode");
    Py_Assert(PyTuple_Check(prefixes),
              PyExc_TypeError,
              "prefixes needs to be a tuple of unicode strings");

    /* XXX Add support for translate...
*/ Py_Assert(translate == NULL, PyExc_TypeError, "translate is not supported for Unicode prefix()es"); tx = PyUnicode_AS_UNICODE(text); for (i = 0; i < PyTuple_GET_SIZE(prefixes); i++) { PyObject *prefix = PyTuple_GET_ITEM(prefixes,i); prefix = PyUnicode_FromObject(prefix); if (prefix == NULL) goto onError; /* Compare without translate table */ if (start + PyString_GET_SIZE(prefix) <= text_len && PyUnicode_AS_UNICODE(prefix)[0] == tx[start] && memcmp(PyUnicode_AS_UNICODE(prefix), &tx[start], PyUnicode_GET_DATA_SIZE(prefix)) == 0) { Py_INCREF(prefix); return prefix; } Py_DECREF(prefix); } Py_DECREF(text); Py_ReturnNone(); onError: Py_XDECREF(text); return NULL; } #endif static PyObject *mxTextTools_Prefix(PyObject *text, PyObject *prefixes, Py_ssize_t start, Py_ssize_t text_len, PyObject *translate) { Py_ssize_t i; char *tx; #ifdef HAVE_UNICODE if (PyUnicode_Check(text)) return mxTextTools_UnicodePrefix(text, prefixes, start, text_len, translate); #endif if (PyString_Check(text)) { Py_CheckStringSlice(text, start, text_len); } else Py_Error(PyExc_TypeError, "expected string or unicode"); Py_Assert(PyTuple_Check(prefixes), PyExc_TypeError, "prefixes needs to be a tuple of strings"); tx = PyString_AS_STRING(text); if (translate) { char *tr; Py_Assert(PyString_Check(translate) && PyString_GET_SIZE(translate) == 256, PyExc_TypeError, "translate must be a string having 256 characters"); tr = PyString_AS_STRING(translate); for (i = 0; i < PyTuple_GET_SIZE(prefixes); i++) { PyObject *prefix = PyTuple_GET_ITEM(prefixes,i); Py_ssize_t cmp_len; register char *s; register char *t; register Py_ssize_t j; Py_AssertWithArg(PyString_Check(prefix), PyExc_TypeError, "tuple entry %d is not a string",(unsigned int)i); cmp_len = PyString_GET_SIZE(prefix); if (start + cmp_len > text_len) continue; /* Do the compare using a translate table */ s = PyString_AS_STRING(prefix); t = tx + start; for (j = 0; j < cmp_len; j++, s++, t++) if (*s != tr[(unsigned char)*t]) break; if (j == cmp_len) { 
Py_INCREF(prefix); return prefix; } } } else for (i = 0; i < PyTuple_GET_SIZE(prefixes); i++) { PyObject *prefix = PyTuple_GET_ITEM(prefixes,i); Py_AssertWithArg(PyString_Check(prefix), PyExc_TypeError, "tuple entry %d is not a string",(unsigned int)i); if (start + PyString_GET_SIZE(prefix) > text_len) continue; /* Compare without translate table */ if (PyString_AS_STRING(prefix)[0] == tx[start] && strncmp(PyString_AS_STRING(prefix), &tx[start], PyString_GET_SIZE(prefix)) == 0) { Py_INCREF(prefix); return prefix; } } Py_ReturnNone(); onError: return NULL; } /* Stips off characters appearing in the character set from text[start:stop] and returns the result as Python string object. where indicates the mode: where < 0: strip left only where = 0: strip left and right where > 0: strip right only */ static PyObject *mxTextTools_SetStrip(char *tx, Py_ssize_t tx_len, char *setstr, Py_ssize_t setstr_len, Py_ssize_t start, Py_ssize_t stop, Py_ssize_t where) { Py_ssize_t left, right; Py_Assert(setstr_len == 32, PyExc_TypeError, "separator needs to be a set as obtained from set()"); Py_CheckBufferSlice(tx_len, start, stop); /* Strip left */ if (where <= 0) { register Py_ssize_t x; for (x = start; x < stop; x++) if (!Py_CharInSet(tx[x], setstr)) break; left = x; } else left = start; /* Strip right */ if (where >= 0) { register Py_ssize_t x; for (x = stop - 1; x >= start; x--) if (!Py_CharInSet(tx[x], setstr)) break; right = x + 1; } else right = stop; return PyString_FromStringAndSize(tx + left, max(right - left, 0)); onError: return NULL; } static PyObject *mxTextTools_SetSplit(char *tx, Py_ssize_t tx_len, char *setstr, Py_ssize_t setstr_len, Py_ssize_t start, Py_ssize_t text_len) { PyObject *list = NULL; register Py_ssize_t x; Py_ssize_t listitem = 0; Py_ssize_t listsize = INITIAL_LIST_SIZE; Py_Assert(setstr_len == 32, PyExc_TypeError, "separator needs to be a set as obtained from set()"); Py_CheckBufferSlice(tx_len,start,text_len); list = PyList_New(listsize); if (!list) 
        goto onError;

    x = start;
    while (x < text_len) {
        Py_ssize_t z;

        /* Skip all text in set (test the bit for tx[x] in the
           32-byte/256-bit set table) */
        for (;x < text_len; x++) {
            register Py_ssize_t c = (unsigned char)tx[x];
            register Py_ssize_t block = (unsigned char)setstr[c >> 3];
            if (!block || ((block & (1 << (c & 7))) == 0))
                break;
        }

        /* Skip all text not in set */
        z = x;
        for (;x < text_len; x++) {
            register Py_ssize_t c = (unsigned char)tx[x];
            register Py_ssize_t block = (unsigned char)setstr[c >> 3];
            if (block && ((block & (1 << (c & 7))) != 0))
                break;
        }

        /* Append the slice to list if it is not empty */
        if (x > z) {
            PyObject *s;
            s = PyString_FromStringAndSize((char *)&tx[z], x - z);
            if (!s)
                goto onError;
            if (listitem < listsize)
                PyList_SET_ITEM(list,listitem,s);
            else {
                PyList_Append(list,s);
                Py_DECREF(s);
            }
            listitem++;
        }
    }

    /* Resize list if necessary */
    if (listitem < listsize)
        PyList_SetSlice(list,listitem,listsize,(PyObject*)NULL);

    return list;

 onError:
    Py_XDECREF(list);
    return NULL;
}

/* Like mxTextTools_SetSplit(), but also returns the separator runs,
   so the parts alternate text / separator-text; empty text parts are
   kept (hence the "X" -- eXtended -- variant). */
static
PyObject *mxTextTools_SetSplitX(char *tx,
                                Py_ssize_t tx_len,
                                char *setstr,
                                Py_ssize_t setstr_len,
                                Py_ssize_t start,
                                Py_ssize_t text_len)
{
    PyObject *list = NULL;
    register Py_ssize_t x;
    Py_ssize_t listitem = 0;
    Py_ssize_t listsize = INITIAL_LIST_SIZE;

    Py_Assert(setstr_len == 32,
              PyExc_TypeError,
              "separator needs to be a set as obtained from set()");
    Py_CheckBufferSlice(tx_len,start,text_len);

    list = PyList_New(listsize);
    if (!list)
        goto onError;

    x = start;
    while (x < text_len) {
        PyObject *s;
        register Py_ssize_t z;

        /* Skip all text not in set */
        z = x;
        for (;x < text_len; x++) {
            register unsigned int c = (unsigned char)tx[x];
            register unsigned int block = (unsigned char)setstr[c >> 3];
            if (block && ((block & (1 << (c & 7))) != 0))
                break;
        }

        /* Append the slice to list */
        s = PyString_FromStringAndSize((char *)&tx[z], x - z);
        if (!s)
            goto onError;
        if (listitem < listsize)
            PyList_SET_ITEM(list,listitem,s);
        else {
            PyList_Append(list,s);
            Py_DECREF(s);
        }
        listitem++;

        if (x >= text_len)
            break;

        /* Skip all text in set */
        z = x;
        for (;x < text_len; x++) {
            register unsigned int c = (unsigned char)tx[x];
            register unsigned int block = (unsigned char)setstr[c >> 3];
            if (!block || ((block & (1 << (c & 7))) == 0))
                break;
        }

        /* Append the slice to list if it is not empty */
        s = PyString_FromStringAndSize((char *)&tx[z], x - z);
        if (!s)
            goto onError;
        if (listitem < listsize)
            PyList_SET_ITEM(list,listitem,s);
        else {
            PyList_Append(list,s);
            Py_DECREF(s);
        }
        listitem++;
    }

    /* Resize list if necessary */
    if (listitem < listsize)
        PyList_SetSlice(list,listitem,listsize,(PyObject*)NULL);

    return list;

 onError:
    Py_XDECREF(list);
    return NULL;
}

/* Return an upper-cased copy of the 8-bit string *text*, mapped
   through the module's mx_ToUpper translation table. */
static
PyObject *mxTextTools_Upper(PyObject *text)
{
    PyObject *ntext;
    register unsigned char *s;
    register unsigned char *orig;
    register Py_ssize_t i;
    unsigned char *tr;
    Py_ssize_t len;

    Py_Assert(PyString_Check(text),
              PyExc_TypeError,
              "expected a Python string");

    len = PyString_GET_SIZE(text);
    ntext = PyString_FromStringAndSize(NULL,len);
    if (!ntext)
        goto onError;

    /* Translate */
    tr = (unsigned char *)PyString_AS_STRING(mx_ToUpper);
    orig = (unsigned char *)PyString_AS_STRING(text);
    s = (unsigned char *)PyString_AS_STRING(ntext);
    for (i = 0; i < len; i++, s++, orig++)
        *s = tr[*orig];

    return ntext;

 onError:
    return NULL;
}

#ifdef HAVE_UNICODE
/* Return an upper-cased copy of *text* using the per-character
   Py_UNICODE_TOUPPER() mapping (Unicode version). */
static
PyObject *mxTextTools_UnicodeUpper(PyObject *text)
{
    PyObject *ntext;
    register Py_UNICODE *s;
    register Py_UNICODE *orig;
    register Py_ssize_t i;
    Py_ssize_t len;

    text = PyUnicode_FromObject(text);
    if (text == NULL)
        goto onError;

    len = PyUnicode_GET_SIZE(text);
    ntext = PyUnicode_FromUnicode(NULL, len);
    if (!ntext)
        goto onError;

    /* Translate */
    orig = (Py_UNICODE *)PyUnicode_AS_UNICODE(text);
    s = (Py_UNICODE *)PyUnicode_AS_UNICODE(ntext);
    for (i = 0; i < len; i++, s++, orig++)
        *s = Py_UNICODE_TOUPPER(*orig);

    Py_DECREF(text);
    return ntext;

 onError:
    Py_XDECREF(text);
    return NULL;
}
#endif

/* Return a lower-cased copy of the 8-bit string *text*, mapped
   through the module's mx_ToLower translation table. */
static
PyObject *mxTextTools_Lower(PyObject *text)
{
    PyObject *ntext;
    register unsigned char *s;
    register unsigned char *orig;
    register Py_ssize_t i;
    unsigned char *tr;
    Py_ssize_t len;

    Py_Assert(PyString_Check(text),
              PyExc_TypeError,
              "expected a Python string");

    len = PyString_GET_SIZE(text);
    ntext = PyString_FromStringAndSize(NULL,len);
    if (!ntext)
        goto onError;

    /* Translate */
    tr = (unsigned char *)PyString_AS_STRING(mx_ToLower);
    orig = (unsigned char *)PyString_AS_STRING(text);
    s = (unsigned char *)PyString_AS_STRING(ntext);
    for (i = 0; i < len; i++, s++, orig++)
        *s = tr[*orig];

    return ntext;

 onError:
    return NULL;
}

#ifdef HAVE_UNICODE
/* Return a lower-cased copy of *text* using the per-character
   Py_UNICODE_TOLOWER() mapping (Unicode version). */
static
PyObject *mxTextTools_UnicodeLower(PyObject *text)
{
    PyObject *ntext;
    register Py_UNICODE *s;
    register Py_UNICODE *orig;
    register Py_ssize_t i;
    Py_ssize_t len;

    text = PyUnicode_FromObject(text);
    if (text == NULL)
        goto onError;

    len = PyUnicode_GET_SIZE(text);
    ntext = PyUnicode_FromUnicode(NULL, len);
    if (!ntext)
        goto onError;

    /* Translate */
    orig = (Py_UNICODE *)PyUnicode_AS_UNICODE(text);
    s = (Py_UNICODE *)PyUnicode_AS_UNICODE(ntext);
    for (i = 0; i < len; i++, s++, orig++)
        *s = Py_UNICODE_TOLOWER(*orig);

    Py_DECREF(text);
    return ntext;

 onError:
    Py_XDECREF(text);
    return NULL;
}
#endif

/* --- Module functions ------------------------------------------------*/

/* Interface to the tagging engine in mxte.c */

Py_C_Function_WithKeywords(
               mxTextTools_tag,
               "tag(text,tagtable,sliceleft=0,sliceright=len(text),taglist=[],context=None) \n"""
               "Produce a tag list for a string, given a tag-table\n"
               "- returns a tuple (success, taglist, nextindex)\n"
               "- if taglist == None, then no taglist is created"
               )
{
    PyObject *text;
    PyObject *tagtable;
    Py_ssize_t sliceright = INT_MAX;
    Py_ssize_t sliceleft = 0;
    PyObject *taglist = 0;
    Py_ssize_t taglist_len;
    PyObject *context = 0;
    Py_ssize_t next, result;
    PyObject *res;

    Py_KeywordsGet6Args("OO|iiOO:tag",
                        text,tagtable,sliceleft,sliceright,taglist,context);

    if (taglist == NULL) {
        /* not given, so use default: an empty list */
        taglist = PyList_New(0);
        if (taglist == NULL)
            goto onError;
        taglist_len = 0;
    }
    else {
        Py_INCREF(taglist);
        Py_Assert(PyList_Check(taglist)
|| taglist == Py_None, PyExc_TypeError, "taglist must be a list or None"); if (taglist != Py_None) { taglist_len = PyList_Size(taglist); if (taglist_len < 0) goto onError; } else taglist_len = 0; } Py_Assert(mxTagTable_Check(tagtable) || PyTuple_Check(tagtable) || PyList_Check(tagtable), PyExc_TypeError, "tagtable must be a TagTable instance, list or tuple"); /* Prepare the argument for the Tagging Engine and let it process the request */ if (PyString_Check(text)) { Py_CheckStringSlice(text, sliceleft, sliceright); if (!mxTagTable_Check(tagtable)) { tagtable = mxTagTable_New(tagtable, MXTAGTABLE_STRINGTYPE, 1); if (tagtable == NULL) goto onError; } else if (mxTagTable_Type(tagtable) != MXTAGTABLE_STRINGTYPE) { Py_Error(PyExc_TypeError, "TagTable instance is not intended for parsing strings"); } else Py_INCREF(tagtable); /* Call the Tagging Engine */ result = mxTextTools_TaggingEngine(text, sliceleft, sliceright, (mxTagTableObject *)tagtable, taglist, context, &next); Py_DECREF(tagtable); } #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) { Py_CheckUnicodeSlice(text, sliceleft, sliceright); if (!mxTagTable_Check(tagtable)) { tagtable = mxTagTable_New(tagtable, 1, 1); if (tagtable == NULL) goto onError; } else if (mxTagTable_Type(tagtable) != MXTAGTABLE_UNICODETYPE) { Py_Error(PyExc_TypeError, "TagTable instance is not intended for parsing Unicode"); } else Py_INCREF(tagtable); /* Call the Tagging Engine */ result = mxTextTools_UnicodeTaggingEngine(text, sliceleft, sliceright, (mxTagTableObject *)tagtable, taglist, context, &next); Py_DECREF(tagtable); } #endif else Py_Error(PyExc_TypeError, "text must be a string or unicode"); /* Check for exceptions during matching */ if (result == 0) goto onError; /* Undo changes to taglist in case of a match failure (result == 1) */ if (result == 1 && taglist != Py_None) { DPRINTF(" undoing changes: del taglist[%i:%i]\n", taglist_len, PyList_Size(taglist)); if (PyList_SetSlice(taglist, taglist_len, PyList_Size(taglist), NULL)) 
goto onError; } /* Convert result to the documented external values: 0 - no match, 1 - match. */ result--; /* Build result tuple */ res = PyTuple_New(3); if (!res) goto onError; PyTuple_SET_ITEM(res,0,PyInt_FromLong(result)); PyTuple_SET_ITEM(res,1,taglist); PyTuple_SET_ITEM(res,2,PyInt_FromLong(next)); return res; onError: if (!PyErr_Occurred()) Py_Error(PyExc_SystemError, "NULL result without error in builtin tag()"); Py_XDECREF(taglist); return NULL; } /* An extended version of string.join() for taglists: */ Py_C_Function( mxTextTools_join, "join(joinlist,sep='',start=0,stop=len(joinlist))\n\n" "Copy snippets from different strings together producing a\n" "new string\n" "The first argument must be a list of tuples or strings;\n" "tuples must be of the form (string,l,r[,...]) and turn out\n" "as string[l:r]\n" "NOTE: the syntax used for negative slices is different\n" "than the Python standard: -1 corresponds to the first\n" "character *after* the string, e.g. ('Example',0,-1) gives\n" "'Example' and not 'Exampl', like in Python\n" "sep is an optional separator string, start and stop\n" "define the slice of joinlist that is taken into accont." 
) { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); PyObject *joinlist = NULL; Py_ssize_t joinlist_len; PyObject *separator = NULL; Py_ssize_t start=0, stop=INT_MAX; Py_Get4Args("O|Oii:join", joinlist,separator,start,stop); Py_Assert(PySequence_Check(joinlist), PyExc_TypeError, "first argument needs to be a sequence"); joinlist_len = PySequence_Length(joinlist); Py_Assert(joinlist_len >= 0, PyExc_TypeError, "first argument needs to have a __len__ method"); Py_CheckSequenceSlice(joinlist_len, start, stop); /* Short-cut */ if ((stop - start) <= 0) { return PyString_FromString(""); } return mxTextTools_Join(joinlist, start, stop, separator); onError: return NULL; } /* Special compare function for taglist-tuples, comparing the text-slices given: - slices starting at a smaller index come first - for slices starting at the same index, the longer one wins */ Py_C_Function( mxTextTools_cmp, "cmp(a,b)\n\n" "Compare two valid taglist tuples w/r to their slice\n" "position; this is useful for sorting joinlists.") { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); PyObject *v,*w; int cmp; Py_Get2Args("OO:cmp",v,w); Py_Assert(PyTuple_Check(v) && PyTuple_Check(w) && PyTuple_GET_SIZE(v) >= 3 && PyTuple_GET_SIZE(w) >= 3, PyExc_TypeError, "invalid taglist-tuple"); cmp = PyObject_Compare(PyTuple_GET_ITEM(v,1),PyTuple_GET_ITEM(w,1)); if (cmp != 0) return PyInt_FromLong(cmp); cmp = - PyObject_Compare(PyTuple_GET_ITEM(v,2),PyTuple_GET_ITEM(w,2)); return PyInt_FromLong(cmp); onError: return NULL; } Py_C_Function( mxTextTools_joinlist, "joinlist(text,list,start=0,stop=len(text))\n\n" "Takes a list of tuples (replacement,l,r,...) 
and produces\n" "a taglist suitable for join() which creates a copy\n" "of text where every slice [l:r] is replaced by the\n" "given replacement\n" "- the list must be sorted using cmp() as compare function\n" "- it may not contain overlapping slices\n" "- the slices may not contain negative indices\n" "- if the taglist cannot contain overlapping slices, you can\n" " give this function the taglist produced by tag() directly\n" " (sorting is not needed, as the list will already be sorted)\n" "- start and stop set the slice to work in, i.e. text[start:stop]" ) { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); PyObject *list; PyObject *text; Py_ssize_t text_len = INT_MAX; Py_ssize_t pos = 0; Py_Get4Args("OO|ii:joinlist",text,list,pos,text_len); return mxTextTools_Joinlist(text, list, pos, text_len); onError: return NULL; } Py_C_Function( mxTextTools_charsplit, "charsplit(text,char,start=0,stop=len(text))\n\n" "Split text[start:stop] into substrings at char and\n" "return the result as list of strings." ) { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); PyObject *text, *separator; Py_ssize_t text_len = INT_MAX; Py_ssize_t start = 0; Py_Get4Args("OO|ii:charsplit", text,separator,start,text_len); return mxTextTools_CharSplit(text, separator, start, text_len); onError: return NULL; } Py_C_Function( mxTextTools_splitat, "splitat(text,char,nth=1,start=0,stop=len(text))\n\n" "Split text[start:stop] into two substrings at the nth\n" "occurance of char and return the result as 2-tuple. If the\n" "character is not found, the second string is empty. nth may\n" "be negative: the search is then done from the right and the\n" "first string is empty in case the character is not found." 
) {
    Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?");
    PyObject *text, *separator;
    Py_ssize_t text_len = INT_MAX;
    Py_ssize_t start = 0;
    Py_ssize_t nth = 1;

    /* splitat(text, char, nth=1, start=0, stop=len(text)):
       delegate all work to the C helper; it handles negative nth
       (search from the right) as documented above. */
    Py_Get5Args("OO|iii:splitat",
		text,separator,nth,start,text_len);

    return mxTextTools_SplitAt(text, separator, nth, start, text_len);

 onError:
    return NULL;
}

Py_C_Function( mxTextTools_suffix,
	       "suffix(text,suffixes,start=0,stop=len(text)[,translate])\n\n"
	       "Looks at text[start:stop] and returns the first matching\n"
	       "suffix out of the tuple of strings given in suffixes.\n"
	       "If no suffix is found to be matching, None is returned.\n"
	       "The optional 256 char translate string is used to translate\n"
	       "the text prior to comparing it with the given suffixes."
	       )
{
    Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?");
    PyObject *text, *suffixes, *translate = NULL;
    Py_ssize_t text_len = INT_MAX;
    Py_ssize_t start = 0;

    Py_Get5Args("OO|iiO:suffix",
		text,suffixes,start,text_len,translate);

    return mxTextTools_Suffix(text, suffixes,
			      start, text_len,
			      translate);

 onError:
    return NULL;
}

Py_C_Function( mxTextTools_prefix,
	       "prefix(text,prefixes,start=0,stop=len(text)[,translate])\n\n"
	       "Looks at text[start:stop] and returns the first matching\n"
	       "prefix out of the tuple of strings given in prefixes.\n"
	       "If no prefix is found to be matching, None is returned.\n"
	       "The optional 256 char translate string is used to translate\n"
	       /* Fixed copy-paste from suffix(): this docstring previously
		  said "given suffixes" although prefix() compares against
		  the prefixes argument. */
	       "the text prior to comparing it with the given prefixes."
) { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); PyObject *text, *prefixes, *translate = NULL; Py_ssize_t text_len = INT_MAX; Py_ssize_t start = 0; Py_Get5Args("OO|iiO:prefix", text,prefixes,start,text_len,translate); return mxTextTools_Prefix(text, prefixes, start, text_len, translate); onError: return NULL; } Py_C_Function( mxTextTools_set, "set(string,logic=1)\n\n" "Returns a character set for string: a bit encoded version\n" "of the characters occurring in string.\n" "- logic can be set to 0 if all characters *not* in string\n" " should go into the set") { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); PyObject *sto; char *s,*st; Py_ssize_t len_s; int logic = 1; Py_ssize_t i; if (!PyArg_ParseTuple(args,"s#|i:set",&s,&len_s,&logic)) { goto onError; } sto = PyString_FromStringAndSize(NULL,32); if (sto == NULL) goto onError; st = PyString_AS_STRING(sto); if (logic) { memset(st,0x00,32); for (i = 0; i < len_s; i++,s++) { int j = (unsigned char)*s; st[j >> 3] |= 1 << (j & 7); } } else { memset(st,0xFF,32); for (i = 0; i < len_s; i++,s++) { int j = (unsigned char)*s; st[j >> 3] &= ~(1 << (j & 7)); } } return sto; onError: return NULL; } Py_C_Function( mxTextTools_setfind, "setfind(text,set,start=0,stop=len(text))\n\n" "Find the first occurence of any character from set in\n" "text[start:stop]\n set must be a string obtained with set()\n" "DEPRECATED: use CharSet().search() instead." 
) {
    Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?");
    PyObject *text;
    PyObject *set;
    Py_ssize_t text_len = INT_MAX;
    Py_ssize_t start = 0;
    register Py_ssize_t x;
    register char *tx;
    register unsigned char *setstr;

    Py_Get4Args("OO|ii:setfind",text,set,start,text_len);

    Py_Assert(PyString_Check(text),
	      PyExc_TypeError,
	      "first argument needs to be a string");
    /* A set() result is always a 256-bit (32-byte) bit vector. */
    Py_Assert(PyString_Check(set) && PyString_GET_SIZE(set) == 32,
	      PyExc_TypeError,
	      "second argument needs to be a set");
    Py_CheckStringSlice(text,start,text_len);

    /* Linear scan for the first character contained in the bit set. */
    x = start;
    tx = PyString_AS_STRING(text) + x;
    setstr = (unsigned char *)PyString_AS_STRING(set);

    for (;x < text_len; tx++, x++)
	if (Py_CharInSet(*tx,setstr))
	    break;

    if (x == text_len)
	/* Not found */
	return PyInt_FromLong(-1L);
    else
	return PyInt_FromLong(x);

 onError:
    return NULL;
}

Py_C_Function( mxTextTools_setstrip,
	       "setstrip(text,set,start=0,stop=len(text),mode=0)\n\n"
	       "Strip all characters in text[start:stop] appearing in set.\n"
	       "mode indicates where to strip (<0: left; =0: left and right;\n"
	       ">0: right). set must be a string obtained with set()\n"
	       "DEPRECATED: use CharSet().strip() instead."
	       )
{
    Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?");
    char *tx;
    Py_ssize_t tx_len;
    char *setstr;
    Py_ssize_t setstr_len;
    Py_ssize_t start = 0;
    Py_ssize_t stop = INT_MAX;
    int mode = 0;

    /* Fixed typo in the format string: was ":setstip", which made
       argument-parsing errors report the wrong function name. */
    Py_Get7Args("s#s#|iii:setstrip",
		tx,tx_len,setstr,setstr_len,start,stop,mode);

    return mxTextTools_SetStrip(tx, tx_len,
				setstr, setstr_len,
				start, stop,
				mode);

 onError:
    return NULL;
}

Py_C_Function( mxTextTools_setsplit,
	       "setsplit(text,set,start=0,stop=len(text))\n\n"
	       "Split text[start:stop] into substrings using set,\n"
	       "omitting the splitting parts and empty substrings.\n"
	       "set must be a string obtained from set()\n"
	       "DEPRECATED: use CharSet().split() instead."
) { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); char *tx; Py_ssize_t tx_len; char *setstr; Py_ssize_t setstr_len; Py_ssize_t start = 0; Py_ssize_t stop = INT_MAX; Py_Get6Args("s#s#|ii:setsplit", tx,tx_len,setstr,setstr_len,start,stop); return mxTextTools_SetSplit(tx, tx_len, setstr, setstr_len, start, stop); onError: return NULL; } Py_C_Function( mxTextTools_setsplitx, "setsplitx(text,set,start=0,stop=len(text))\n\n" "Split text[start:stop] into substrings using set, so\n" "that every second entry consists only of characters in set.\n" "set must be a string obtained with set()\n" "DEPRECATED: use CharSet().splitx() instead." ) { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); Py_ssize_t text_len = INT_MAX; Py_ssize_t start = 0; char *tx; Py_ssize_t tx_len; char *setstr; Py_ssize_t setstr_len; Py_Get6Args("s#s#|ii:setsplitx", tx,tx_len,setstr,setstr_len,start,text_len); return mxTextTools_SetSplitX(tx, tx_len, setstr, setstr_len, start, text_len); onError: return NULL; } Py_C_Function( mxTextTools_upper, "upper(text)\n\n" "Return text converted to upper case.") { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); PyObject *text; Py_GetArgObject(text); if (PyString_Check(text)) return mxTextTools_Upper(text); #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) return mxTextTools_UnicodeUpper(text); #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); onError: return NULL; } Py_C_Function( mxTextTools_lower, "lower(text)\n\n" "Return text converted to lower case.") { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); PyObject *text; Py_GetArgObject(text); if (PyString_Check(text)) return mxTextTools_Lower(text); #ifdef HAVE_UNICODE else if (PyUnicode_Check(text)) return mxTextTools_UnicodeLower(text); #endif else Py_Error(PyExc_TypeError, "expected string or unicode"); onError: return NULL; } Py_C_Function( mxTextTools_str2hex, 
"str2hex(text)\n\n" "Return text converted to a string consisting of two byte\n" "HEX values.") { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); char *str; Py_ssize_t len; Py_Get2Args("s#",str,len); return mxTextTools_HexStringFromString(str,len); onError: return NULL; } Py_C_Function( mxTextTools_hex2str, "hex2str(text)\n\n" "Return text interpreted as two byte HEX values converted\n" "to a string.") { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); char *str; Py_ssize_t len; Py_Get2Args("s#",str,len); return mxTextTools_StringFromHexString(str,len); onError: return NULL; } Py_C_Function( mxTextTools_isascii, "isascii(text,start=0,stop=len(text))\n\n" "Return 1/0 depending on whether text only contains ASCII\n" "characters." ) { Py_Assert(PySequence_Check(args),PyExc_TypeError,"Arguments are not a tuple?"); PyObject *text; Py_ssize_t start=0, stop = INT_MAX; int rc; Py_GetArgObject(text); rc = mxTextTools_IsASCII(text, start, stop); if (rc < 0) goto onError; return PyInt_FromLong(rc); onError: return NULL; } /* --- module init --------------------------------------------------------- */ /* Python Method Table */ static PyMethodDef Module_methods[] = { Py_MethodWithKeywordsListEntry("tag",mxTextTools_tag), Py_MethodListEntry("join",mxTextTools_join), Py_MethodListEntry("cmp",mxTextTools_cmp), Py_MethodListEntry("joinlist",mxTextTools_joinlist), Py_MethodListEntry("set",mxTextTools_set), Py_MethodListEntry("setfind",mxTextTools_setfind), Py_MethodListEntry("setsplit",mxTextTools_setsplit), Py_MethodListEntry("setsplitx",mxTextTools_setsplitx), Py_MethodListEntry("setstrip",mxTextTools_setstrip), Py_MethodWithKeywordsListEntry("TextSearch",mxTextSearch_TextSearch), Py_MethodListEntry("CharSet",mxCharSet_CharSet), Py_MethodListEntry("TagTable",mxTagTable_TagTable), #ifdef HAVE_UNICODE Py_MethodListEntry("UnicodeTagTable",mxTagTable_UnicodeTagTable), #endif 
Py_MethodListEntrySingleArg("upper",mxTextTools_upper), Py_MethodListEntrySingleArg("lower",mxTextTools_lower), Py_MethodListEntry("charsplit",mxTextTools_charsplit), Py_MethodListEntry("splitat",mxTextTools_splitat), Py_MethodListEntry("suffix",mxTextTools_suffix), Py_MethodListEntry("prefix",mxTextTools_prefix), Py_MethodListEntry("hex2str",mxTextTools_hex2str), Py_MethodListEntry("str2hex",mxTextTools_str2hex), Py_MethodListEntrySingleArg("isascii",mxTextTools_isascii), {NULL,NULL} /* end of list */ }; /* Cleanup function */ static void mxTextToolsModule_Cleanup(void) { mxTextTools_TagTables = NULL; /* Reset mxTextTools_Initialized flag */ mxTextTools_Initialized = 0; } MX_EXPORT(void) initmxTextTools(void) { PyObject *module, *moddict; if (mxTextTools_Initialized) Py_Error(PyExc_SystemError, "can't initialize "MXTEXTTOOLS_MODULE" more than once"); /* Init type objects */ PyType_Init(mxTextSearch_Type); #ifdef MXFASTSEARCH PyType_Init(mxFS_Type); #endif PyType_Init(mxCharSet_Type); PyType_Init(mxTagTable_Type); /* create module */ module = Py_InitModule4(MXTEXTTOOLS_MODULE, /* Module name */ Module_methods, /* Method list */ Module_docstring, /* Module doc-string */ (PyObject *)NULL, /* always pass this as *self */ PYTHON_API_VERSION); /* API Version */ if (!module) goto onError; /* Init TagTable cache */ if ((mxTextTools_TagTables = PyDict_New()) == NULL) goto onError; /* Register cleanup function */ if (Py_AtExit(mxTextToolsModule_Cleanup)) /* XXX what to do if we can't register that function ??? */; /* Add some symbolic constants to the module */ moddict = PyModule_GetDict(module); PyDict_SetItemString(moddict, "__version__", PyString_FromString(VERSION)); mx_ToUpper = mxTextTools_ToUpper(); PyDict_SetItemString(moddict, "to_upper", mx_ToUpper); mx_ToLower = mxTextTools_ToLower(); PyDict_SetItemString(moddict, "to_lower", mx_ToLower); /* Let the tag table cache live in the module dictionary; we just keep a weak reference in mxTextTools_TagTables around. 
*/ PyDict_SetItemString(moddict, "tagtable_cache", mxTextTools_TagTables); Py_DECREF(mxTextTools_TagTables); insint(moddict, "BOYERMOORE", MXTEXTSEARCH_BOYERMOORE); insint(moddict, "FASTSEARCH", MXTEXTSEARCH_FASTSEARCH); insint(moddict, "TRIVIAL", MXTEXTSEARCH_TRIVIAL); /* Init exceptions */ if ((mxTextTools_Error = insexc(moddict, "Error", PyExc_StandardError)) == NULL) goto onError; /* Type objects */ Py_INCREF(&mxTextSearch_Type); PyDict_SetItemString(moddict, "TextSearchType", (PyObject *)&mxTextSearch_Type); Py_INCREF(&mxCharSet_Type); PyDict_SetItemString(moddict, "CharSetType", (PyObject *)&mxCharSet_Type); Py_INCREF(&mxTagTable_Type); PyDict_SetItemString(moddict, "TagTableType", (PyObject *)&mxTagTable_Type); /* Tag Table command symbols (these will be exposed via simpleparse.stt.TextTools.Constants.TagTables) */ insint(moddict, "_const_AllIn", MATCH_ALLIN); insint(moddict, "_const_AllNotIn", MATCH_ALLNOTIN); insint(moddict, "_const_Is", MATCH_IS); insint(moddict, "_const_IsIn", MATCH_ISIN); insint(moddict, "_const_IsNot", MATCH_ISNOTIN); insint(moddict, "_const_IsNotIn", MATCH_ISNOTIN); insint(moddict, "_const_Word", MATCH_WORD); insint(moddict, "_const_WordStart", MATCH_WORDSTART); insint(moddict, "_const_WordEnd", MATCH_WORDEND); insint(moddict, "_const_AllInSet", MATCH_ALLINSET); insint(moddict, "_const_IsInSet", MATCH_ISINSET); insint(moddict, "_const_AllInCharSet", MATCH_ALLINCHARSET); insint(moddict, "_const_IsInCharSet", MATCH_ISINCHARSET); insint(moddict, "_const_Fail", MATCH_FAIL); insint(moddict, "_const_Jump", MATCH_JUMP); insint(moddict, "_const_EOF", MATCH_EOF); insint(moddict, "_const_Skip", MATCH_SKIP); insint(moddict, "_const_Move", MATCH_MOVE); insint(moddict, "_const_JumpTarget", MATCH_JUMPTARGET); insint(moddict, "_const_sWordStart", MATCH_SWORDSTART); insint(moddict, "_const_sWordEnd", MATCH_SWORDEND); insint(moddict, "_const_sFindWord", MATCH_SFINDWORD); insint(moddict, "_const_NoWord", MATCH_NOWORD); insint(moddict, "_const_Call", 
MATCH_CALL); insint(moddict, "_const_CallArg", MATCH_CALLARG); insint(moddict, "_const_Table", MATCH_TABLE); insint(moddict, "_const_SubTable", MATCH_SUBTABLE); insint(moddict, "_const_TableInList", MATCH_TABLEINLIST); insint(moddict, "_const_SubTableInList", MATCH_SUBTABLEINLIST); insint(moddict, "_const_Loop", MATCH_LOOP); insint(moddict, "_const_LoopControl", MATCH_LOOPCONTROL); /* Tag Table command flags */ insint(moddict, "_const_CallTag", MATCH_CALLTAG); insint(moddict, "_const_AppendToTagobj", MATCH_APPENDTAG); insint(moddict, "_const_AppendTagobj", MATCH_APPENDTAGOBJ); insint(moddict, "_const_AppendMatch", MATCH_APPENDMATCH); insint(moddict, "_const_LookAhead", MATCH_LOOKAHEAD); /* Tag Table argument integers */ insint(moddict, "_const_To", MATCH_JUMP_TO); insint(moddict, "_const_MatchOk", MATCH_JUMP_MATCHOK); insint(moddict, "_const_MatchFail", MATCH_JUMP_MATCHFAIL); insint(moddict, "_const_ToEOF", MATCH_MOVE_EOF); insint(moddict, "_const_ToBOF", MATCH_MOVE_BOF); insint(moddict, "_const_Here", MATCH_FAIL_HERE); insint(moddict, "_const_ThisTable", MATCH_THISTABLE); insint(moddict, "_const_Break", MATCH_LOOPCONTROL_BREAK); insint(moddict, "_const_Reset", MATCH_LOOPCONTROL_RESET); DPRINTF("sizeof(string_charset)=%i bytes\n", sizeof(string_charset)); #ifdef HAVE_UNICODE DPRINTF("sizeof(unicode_charset)=%i bytes\n", sizeof(unicode_charset)); #endif /* We are now initialized */ mxTextTools_Initialized = 1; onError: /* Check for errors and report them */ if (PyErr_Occurred()) Py_ReportModuleInitError(MXTEXTTOOLS_MODULE); return; } SimpleParse-2.2.0/simpleparse/stt/TextTools/mxTextTools/lowlevelcommands.h0000644000175000017500000001633612620706017030506 0ustar mcfletchmcfletch00000000000000/* Low-level matching commands code fragment The contract here is: all commands move forward through the buffer failure to move forward indicates failure of the tag moving forward indicates success of the tag errors may be indicated if encountered in childReturnCode and the 
error* variables only childPosition should be updated otherwise */ TE_CHAR *m = TE_STRING_AS_STRING(match); if (m == NULL) { childReturnCode = ERROR_CODE; errorType = PyExc_TypeError; errorMessage = PyString_FromFormat( "Low-level command (%i) argument in entry %d couldn't be converted to a string object, is a %.50s", command, (unsigned int)index, Py_TYPE(textobj)->tp_name ); } else { switch (command) { case MATCH_ALLIN: { register Py_ssize_t ml = TE_STRING_GET_SIZE(match); register TE_CHAR *tx = &text[childPosition]; DPRINTF("\nAllIn :\n" " looking for = '%.40s'\n" " in string = '%.40s'\n",m,tx); if (ml > 1) { for (; childPosition < sliceright; tx++, childPosition++) { register Py_ssize_t j; register TE_CHAR *mj = m; register TE_CHAR ctx = *tx; for (j=0; j < ml && ctx != *mj; mj++, j++) ; if (j == ml) break; } } else if (ml == 1) { /* one char only: use faster variant: */ for (; childPosition < sliceright && *tx == *m; tx++, childPosition++) ; } break; } case MATCH_ALLNOTIN: { register Py_ssize_t ml = TE_STRING_GET_SIZE(match); register TE_CHAR *tx = &text[childPosition]; DPRINTF("\nAllNotIn :\n" " looking for = '%.40s'\n" " not in string = '%.40s'\n",m,tx); if (ml != 1) { for (; childPosition < sliceright; tx++, childPosition++) { register Py_ssize_t j; register TE_CHAR *mj = m; register TE_CHAR ctx = *tx; for (j=0; j < ml && ctx != *mj; mj++, j++) ; if (j != ml) break; } } else { /* one char only: use faster variant: */ for (; childPosition < sliceright && *tx != *m; tx++, childPosition++) ; } break; } case MATCH_IS: { DPRINTF("\nIs :\n" " looking for = '%.40s'\n" " in string = '%.40s'\n",m,text+childPosition); if (childPosition < sliceright && *(&text[childPosition]) == *m) { childPosition++; } break; } case MATCH_ISIN: { register Py_ssize_t ml = TE_STRING_GET_SIZE(match); register TE_CHAR ctx = text[childPosition]; DPRINTF("\nIsIn :\n" " looking for = '%.40s'\n" " in string = '%.40s'\n",m,text+childPosition); if (ml > 0 && childPosition < sliceright) { 
register Py_ssize_t j; register TE_CHAR *mj = m; for (j=0; j < ml && ctx != *mj; mj++, j++) ; if (j != ml) childPosition++; } break; } case MATCH_ISNOTIN: { register Py_ssize_t ml = TE_STRING_GET_SIZE(match); register TE_CHAR ctx = text[childPosition]; DPRINTF("\nIsNotIn :\n" " looking for = '%.40s'\n" " not in string = '%.40s'\n",m,text+childPosition); if (ml > 0 && childPosition < sliceright) { register Py_ssize_t j; register TE_CHAR *mj = m; for (j=0; j < ml && ctx != *mj; mj++, j++) ; if (j == ml) childPosition++; } else childPosition++; break; } case MATCH_WORD: { Py_ssize_t ml1 = TE_STRING_GET_SIZE(match) - 1; register TE_CHAR *tx = &text[childPosition + ml1]; register Py_ssize_t j = ml1; register TE_CHAR *mj = &m[j]; DPRINTF("\nWord :\n" " looking for = '%.40s'\n" " in string = '%.40s'\n",m,&text[childPosition]); if (childPosition+ml1 >= sliceright) break; /* compare from right to left */ for (; j >= 0 && *tx == *mj; tx--, mj--, j--) ; if (j >= 0) /* not matched */ childPosition = startPosition; /* reset */ else childPosition += ml1 + 1; break; } case MATCH_WORDSTART: case MATCH_WORDEND: { Py_ssize_t ml1 = TE_STRING_GET_SIZE(match) - 1; if (ml1 >= 0) { register TE_CHAR *tx = &text[childPosition]; DPRINTF("\nWordStart/End :\n" " looking for = '%.40s'\n" " in string = '%.40s'\n",m,tx); /* Brute-force method; from right to left */ for (;;) { register Py_ssize_t j = ml1; register TE_CHAR *mj = &m[j]; if (childPosition+j >= sliceright) { /* reached eof: no match, rewind */ childPosition = startPosition; break; } /* scan from right to left */ for (tx += j; j >= 0 && *tx == *mj; tx--, mj--, j--) ; /* DPRINTF("match text[%i+%i]: %c == %c\n", childPosition,j,*tx,*mj); */ if (j < 0) { /* found */ if (command == MATCH_WORDEND) childPosition += ml1 + 1; break; } /* not found: rewind and advance one char */ tx -= j - 1; childPosition++; } } break; } #if (TE_TABLETYPE == MXTAGTABLE_STRINGTYPE) /* Note: These two only work for 8-bit set strings. 
*/ case MATCH_ALLINSET: { register TE_CHAR *tx = &text[childPosition]; unsigned char *m = (unsigned char *)PyString_AS_STRING(match); DPRINTF("\nAllInSet :\n" " looking for = set at 0x%lx\n" " in string = '%.40s'\n",(long)match,tx); for (; childPosition < sliceright && (m[((unsigned char)*tx) >> 3] & (1 << (*tx & 7))) > 0; tx++, childPosition++) ; break; } case MATCH_ISINSET: { register TE_CHAR *tx = &text[childPosition]; unsigned char *m = (unsigned char *)PyString_AS_STRING(match); DPRINTF("\nIsInSet :\n" " looking for = set at 0x%lx\n" " in string = '%.40s'\n",(long)match,tx); if (childPosition < sliceright && (m[((unsigned char)*tx) >> 3] & (1 << (*tx & 7))) > 0) childPosition++; break; } #endif case MATCH_ALLINCHARSET: { Py_ssize_t matching; DPRINTF("\nAllInCharSet :\n" " looking for = CharSet at 0x%lx\n" " in string = '%.40s'\n", (long)match, &text[childPosition]); matching = mxCharSet_Match(match, textobj, childPosition, sliceright, 1); if (matching < 0) { childReturnCode = ERROR_CODE; errorType = PyExc_SystemError; errorMessage = PyString_FromFormat( "Character set match returned value < 0 (%d): probable bug in text processing engine", (unsigned int)matching ); } else { childPosition += matching; } break; } case MATCH_ISINCHARSET: { int test; DPRINTF("\nIsInCharSet :\n" " looking for = CharSet at 0x%lx\n" " in string = '%.40s'\n", (long)match, &text[childPosition]); #if (TE_TABLETYPE == MXTAGTABLE_STRINGTYPE) test = mxCharSet_ContainsChar(match, text[childPosition]); #else test = mxCharSet_ContainsUnicodeChar(match, text[childPosition]); #endif if (test < 0) { childReturnCode = ERROR_CODE; errorType = PyExc_SystemError; errorMessage = PyString_FromFormat( "Character set match returned value < 0 (%i): probable bug in text processing engine", test ); } else if (test) { childPosition++; } break; } default: { childReturnCode = ERROR_CODE; errorType = PyExc_ValueError; errorMessage = PyString_FromFormat( "Unrecognised Low-Level command code %i, maximum low-level 
code is %i", command, MATCH_MAX_LOWLEVEL ); } /* end of the switch, this child is finished */ } } /* end of the wrapping if-check */ /* simple determination for these commands (hence calling them low-level) */ if (childReturnCode == NULL_CODE) { if (childPosition > childStart) { childReturnCode = SUCCESS_CODE; } else { childReturnCode = FAILURE_CODE; } } SimpleParse-2.2.0/simpleparse/stt/TextTools/mxTextTools/mxte_impl.h0000644000175000017500000006226012620706017027126 0ustar mcfletchmcfletch00000000000000/* mxte_impl -- A table driven Tagging Engine for Python (Version 0.9) This is the Tagging Engine implementation. It can be compiled for 8-bit strings and Unicode by setting the TE_* defines appropriately. Copyright (c) 2000, Marc-Andre Lemburg; mailto:mal@lemburg.com Copyright (c) 2000-2002, eGenix.com Software GmbH; mailto:info@egenix.com Copyright (c) 2003-2006, Mike Fletcher; mailto:mcfletch@vrplumber.com */ #ifndef TE_STRING_CHECK # define TE_STRING_CHECK(obj) PyString_Check(obj) #endif #ifndef TE_STRING_AS_STRING # define TE_STRING_AS_STRING(obj) PyString_AS_STRING(obj) #endif #ifndef TE_STRING_GET_SIZE # define TE_STRING_GET_SIZE(obj) PyString_GET_SIZE(obj) #endif #ifndef TE_STRING_FROM_STRING # define TE_STRING_FROM_STRING(str, size) PyString_FromStringAndSize(str, size) #endif #ifndef TE_CHAR # define TE_CHAR char #endif #ifndef TE_HANDLE_MATCH # define TE_HANDLE_MATCH string_match_append #endif #ifndef TE_ENGINE_API # define TE_ENGINE_API mxTextTools_TaggingEngine #endif /* --- Tagging Engine ----------------------------------------------------- */ /* Non-recursive restructuring by Mike Fletcher to support SimpleParse This restructuring eliminates the use of the C call stack for processing sub-table and table directives, allowing these to be used for repetition calls if desired. 
while 1: while (index_in_table() and returnCode == NULL_CODE): decode the current table[index] if the current tag is new (not already processed): reset tag variables switch( tag command ): do what tag wants to do() set tag-related variables set childReturnCode (tag variable) if table: push_frame_stack() set childReturnCode == PENDING switch(childReturnCode): # figure out what to do with child's results # possibly set table-wide returnValue childSuccess append values update table-wide values set new index childFailure rewind position set new index childError signal error for whole table childPending ignore/continue processing without updating list values reset childReturnCode #done table, figure out what to do now... if no explicit return value: figure out implicit if failure: truncate result list to previous length reset position if error: report error as exception exit else: if frame_stack(): pop_frame_stack() else: return result */ /* call-stack structures used in non-recursive implementation */ #ifndef TEXTTOOLS_CALL_STACK_STRUCTURES # define TEXTTOOLS_CALL_STACK_STRUCTURES /* codes for returnCode and childReturnCode variables */ #define EOF_CODE 3 #define SUCCESS_CODE 2 #define FAILURE_CODE 1 #define ERROR_CODE 0 #define NULL_CODE -1 #define PENDING_CODE -2 typedef struct stack_entry { /* represents data stored for a particular stack recursion We want to make this as small as possible, so anything that is duplicate information (such as unpacked values of the tag or table) is ignored. Eventually this may support another field "available branches" recording backtracking points for the engine. 
*/ void * parent; /* pointer to a parent table or NULL */ Py_ssize_t position; /* where the engine is currently parsing for the parent table*/ Py_ssize_t startPosition; /* position where we started parsing for the parent table */ mxTagTableObject * table; /* the parent table */ Py_ssize_t index; /* index of the child tag in the parent table */ Py_ssize_t childStart; /* text start position for the child table */ PyObject * results; /* the result-target of the parent */ Py_ssize_t resultsLength; /* the length of the results list before the sub-table is called */ } recursive_stack_entry; /* Macro to reset table-specific variables XXX Not sure if loop vars are table or tag specific */ #define RESET_TABLE_VARIABLES {\ index=0;\ table_len = table->numentries;\ returnCode = NULL_CODE;\ loopcount = -1;\ loopstart = startPosition;\ taglist_len = PyList_Size( taglist );\ } /* Macro to reset tag-specific variables */ #define RESET_TAG_VARIABLES {\ childStart = position;\ childPosition = position;\ childReturnCode = NULL_CODE;\ childResults = NULL;\ } /* Macro to decode a tag-entry into local variables */ #define DECODE_TAG {\ mxTagTableEntry *entry;\ entry = &table->entry[index];\ command = entry->cmd;\ flags = entry->flags;\ match = entry->args;\ failureJump = entry->jne;\ successJump = entry->je;\ tagobj = entry->tagobj;\ if (tagobj == NULL) { tagobj = Py_None;}\ } /* macro to push relevant local variables onto the stack and setup for child table newTable becomes table, newResults becomes taglist This is currently only called in the Table/SubTable family of commands, could be inlined there, but I find it cleaner to read here. 
*/ #define PUSH_STACK( newTable, newResults ) {\ stackTemp = (recursive_stack_entry *) PyMem_Malloc( sizeof( recursive_stack_entry ));\ stackTemp->parent = stackParent;\ stackTemp->position = position;\ stackTemp->startPosition = startPosition;\ stackTemp->table = table;\ stackTemp->index = index;\ stackTemp->childStart = childStart;\ stackTemp->resultsLength = taglist_len;\ stackTemp->results = taglist;\ \ stackParent = stackTemp;\ childReturnCode = PENDING_CODE;\ \ startPosition = position;\ table = (mxTagTableObject *) newTable;\ taglist = newResults;\ } #define POP_STACK {\ if (stackParent) {\ childStart = stackParent->childStart;\ childPosition = position;\ position = stackParent->position;\ \ startPosition = stackParent->startPosition;\ \ childResults = taglist;\ taglist_len = stackParent->resultsLength;\ taglist = stackParent->results;\ if (table != stackParent->table ) { Py_DECREF( table ); }\ table = stackParent->table;\ table_len = table->numentries;\ index = stackParent->index;\ \ stackTemp = stackParent->parent;\ PyMem_Free( stackParent );\ stackParent = stackTemp;\ stackTemp = NULL;\ \ childReturnCode = returnCode;\ returnCode = NULL_CODE;\ }\ } #endif /* mxTextTools_TaggingEngine(): a table driven parser engine - return codes: returnCode = 2: match ok; returnCode = 1: match failed; returnCode = 0: error - doesn't check type of passed arguments ! - doesn't increment reference counts of passed objects ! 
*/ int TE_ENGINE_API( PyObject *textobj, Py_ssize_t sliceleft, Py_ssize_t sliceright, mxTagTableObject *table, PyObject *taglist, PyObject *context, Py_ssize_t *next ) { TE_CHAR *text = NULL; /* Pointer to the text object's data */ /* local variables pushed into stack on recurse */ /* whole-table variables */ Py_ssize_t position = sliceleft; /* current (head) position in text for whole table */ Py_ssize_t startPosition = sliceleft; /* start position for current tag */ Py_ssize_t table_len = table->numentries; /* table length */ short returnCode = NULL_CODE; /* return code: -1 not set, 0 error, 1 not ok, 2 ok */ Py_ssize_t index=0; /* index of current table entry */ Py_ssize_t taglist_len = PyList_Size( taglist ); /* variables tracking status of the current tag */ register short childReturnCode = NULL_CODE; /* the current child's return code value */ Py_ssize_t childStart = startPosition; register Py_ssize_t childPosition = startPosition; PyObject *childResults = NULL; /* store's the current child's results (for table children) */ int flags=0; /* flags set in command */ int command=0; /* command */ int failureJump=0; /* rel. jump distance on 'not matched', what should the default be? */ int successJump=1; /* dito on 'matched', what should the default be? */ PyObject *match=NULL; /* matching parameter */ int loopcount = -1; /* loop counter */ Py_ssize_t loopstart = startPosition; /* loop start position */ PyObject *tagobj = NULL; /* parentTable is our nearest parent, i.e. the next item to pop off the processing stack. We copied our local variables to it before starting a child table, and will copy back from it when we finish the child table. 
It's normally NULL */ recursive_stack_entry * stackParent = NULL; recursive_stack_entry * stackTemp = NULL; /* just temporary storage for parent pointers */ /* Error-management variables */ PyObject * errorType = NULL; PyObject * errorMessage = NULL; /* Initialise the buffer Here is where we will add memory-mapped file support I think... expand the TE_STRING macros to check for mmap file objects (only for str-type) and to access their values appropriately f = open('c:\\temp\\test.mem', 'r') buffer = mmap.mmap( f.fileno(), 0, access = mmap.ACCESS_READ ) */ if (!TE_STRING_CHECK(textobj)) { returnCode = ERROR_CODE; errorType = PyExc_TypeError; errorMessage = PyString_FromFormat( "Expected a string or unicode object to parse: found %.50s", Py_TYPE(textobj)->tp_name ); } else { text = TE_STRING_AS_STRING(textobj); if (text == NULL) { returnCode = ERROR_CODE; } } while (1) { /* this loop processes a whole table */ while ( (index < table_len) & (returnCode == NULL_CODE) & (index >= 0) ) { DPRINTF( "index %i\n", index ); DECODE_TAG if (childReturnCode == NULL_CODE ) { /* if we are not continuing processing of the child from a previous iteration we need to unpack the child into local variables */ RESET_TAG_VARIABLES childStart = position; childPosition = position; } if (command < MATCH_MAX_LOWLEVEL) { #include "lowlevelcommands.h" } else { switch (command) { /* Jumps & special commands */ #include "speccommands.h" /* non-table-recursion high-level stuff */ #include "highcommands.h" /* the recursive table commands */ #include "recursecommands.h" default: { childReturnCode = ERROR_CODE; errorType = PyExc_ValueError; errorMessage = PyString_FromFormat( "Unrecognised command code %i", command ); } } } /* we're done a single tag, process partial results for the current child This is a major re-structuring point. Previously all of this was scattered around (and duplicated among) the various command and command-group clauses. 
There also used to be a function call to handle the append/call functions. That's now handled inline */ /* sanity check wanted by Marc-André for skip-before-buffer */ if (childPosition < 0) { childReturnCode = ERROR_CODE; errorType = PyExc_TypeError; errorMessage = PyString_FromFormat( "tagobj (type %.50s) table entry %d moved/skipped beyond start of text (to position %d)", Py_TYPE(tagobj)->tp_name, (unsigned int)index, (unsigned int)childPosition ); } DPRINTF( "switch on return code %i\n", childReturnCode ); switch(childReturnCode) { case NULL_CODE: case SUCCESS_CODE: /* childReturnCode wasn't set or we positively matched positions are always: childStart, childPosition sub-results are: childResults unless childResults is taglist in which case we use Py_None for the tag's children unless childResults is NULL in which case we create an empty list object we call: tagobj == Py_None : do nothing... [ result tuple needed ] CallTag: entry->tagobj( resultTuple ) AppendToTagobj: entry->tagobj.append( resultTuple ) General Case: taglist.append( resultTuple ) AppendMatch: taglist.append( text[childStart:childPosition] ) AppendTagobj: taglist.append( entry->tagobj ) if LookAhead is specified: childPosition is set to childStart before continuing finally we set position = childPosition */ { PyObject * objectToCall = NULL; PyObject * objectCallResult = NULL; int releaseCallObject = 0; int releaseChildResults = 0; int releaseParameter = 1; PyObject * parameter = NULL; DPRINTF( "finishing success-code or null \n" ); if (tagobj == Py_None ) { /* XXX note: this short-circuits around "AppendTagobj" flagged items which specified tagobj == None... don't know if that's wanted or not. Similarly doesn't report AppendMatch's. Not sure what's appropriate there either. 
*/ DPRINTF( "tagobj was none\n" ); DPRINTF( "Matched %i:%i but result not saved", childStart, childPosition ); } else { /* get the callable object */ /* normally it's taglist.append, do the exceptions first */ DPRINTF( "tagobj non-None, finding callable\n" ); if (flags & MATCH_CALLTAG) { /* want the tag itself */ objectToCall = tagobj; } else if (flags & MATCH_APPENDTAG) { /* AppendToTagobj -> want the tag's append method */ DPRINTF( "append to tag obj\n" ); objectToCall = PyObject_GetAttrString( tagobj, "append" ); DPRINTF( "got object\n"); if (objectToCall == NULL) { DPRINTF( "got invalid object\n"); returnCode = ERROR_CODE; errorType = PyExc_AttributeError; errorMessage = PyString_FromFormat( "tagobj (type %.50s) for table entry %d (flags include AppendTag) doesn't have an append method", Py_TYPE(tagobj)->tp_name, (unsigned int)index ); } else { DPRINTF( "got valid object\n"); releaseCallObject = 1; } } else { DPRINTF( "appending to tag-list\n" ); /* append of the taglist, which we know exists, because it's a list We optimise this to use the raw List API */ objectToCall = NULL; /*PyObject_GetAttrString( taglist, "append" );*/ } if (returnCode == NULL_CODE && objectToCall && PyCallable_Check(objectToCall)==0) { /* object to call isn't callable */ DPRINTF( "object not callable\n" ); returnCode = ERROR_CODE; errorType = PyExc_TypeError; errorMessage = PyString_FromFormat( "The object to call type(%.50s) for table entry %d isn't callable", Py_TYPE(objectToCall)->tp_name, (unsigned int)index ); } if (returnCode == NULL_CODE) { /* get the parameter with which to call */ /* normally it's a result tuple, do exceptions first */ DPRINTF( "getting parameter\n" ); if (flags & MATCH_APPENDMATCH) { /* XXX need to do bounds-checking here so that: childStart >= sliceleft childPosition >= sliceleft childPosition <= sliceright */ /* MATCH_APPENDMATCH cannot occur with any other flag (makes no sense) so objectToCall _must_ be the taglist, and we just want to append the string, not 
a tuple wrapping the string. That is, everywhere else we use tuples, here we don't */ parameter = TE_STRING_FROM_STRING( TE_STRING_AS_STRING(textobj) + childStart, childPosition - childStart ); if (parameter == NULL) { /* error occured getting parameter, report the exception */ returnCode = ERROR_CODE; } } else if ( flags & MATCH_APPENDTAGOBJ) { /* append the tagobj itself to the results list */ if (tagobj == NULL) { parameter = Py_None; } else { parameter = tagobj; } releaseParameter = 0; } else { /* need to know what the child-list is to build resultsTuple if childResults is non-null and not taglist use it if childResults == taglist, use Py_None otherwise use Py_None ( originally we created a new empty list object, that was wrong :) ). */ if (childResults == taglist) { childResults = Py_None ; } else if (childResults != NULL) { /* exists already, with a reference from PUSH's creation */ releaseChildResults = 1; } else { /* turns out mxTextTools declares the return value to be None or [], using None is far more efficient, so I've made the code use it here */ childResults = Py_None; releaseChildResults = 0; /* we aren't increfing it locally */ } if (childResults == NULL || tagobj == NULL) { returnCode = ERROR_CODE; } else { if (flags & MATCH_CALLTAG) { parameter = Py_BuildValue( "OOiiO", taglist, textobj, childStart, childPosition, childResults ); } else if (flags & MATCH_APPENDTAG) { /* AppendToTagobj -> want to call append with a 4-tuple of values, so parameter needs to be ((x,y,z,w),) */ /* XXX can't get the darn thing to accept "((OiiO))" :( */ parameter = Py_BuildValue( "((OiiO))", Py_None, childStart, childPosition, childResults ); } else { /* either we are calling a method that requires the 4 args, or we're appending the 4-tuple to a list */ parameter = Py_BuildValue( "OiiO", tagobj, childStart, childPosition, childResults ); } if (parameter == NULL) { returnCode = ERROR_CODE; } } } DPRINTF( "done getting parameter\n" ); if (parameter == NULL && returnCode 
== ERROR_CODE && errorType == NULL) { errorType = PyExc_SystemError; /* following may fail, as we may have run out of memory */ errorMessage = PyString_FromFormat( "Unable to build return-value tuple" ); } /* now have both object and parameter and object is callable */ if (returnCode == NULL_CODE) { /* no errors yet */ DPRINTF( "doing call\n" ); if (objectToCall) { DPRINTF( " object call\n" ); /* explicit object to call */ Py_INCREF( objectToCall ); Py_INCREF( parameter ); DPRINTF( " lock released\n" ); objectCallResult = PyEval_CallObject( objectToCall, parameter ); DPRINTF( " call finished\n" ); Py_DECREF( objectToCall ); Py_DECREF( parameter ); DPRINTF( " lock acquired\n" ); if (objectCallResult == NULL) { DPRINTF( " null result\n" ); returnCode = ERROR_CODE; /* exception is already there, should alter error-handler to check for it */ } else { DPRINTF( " non-null result, decrefing\n" ); Py_DECREF( objectCallResult ); DPRINTF( " decrefd\n" ); } objectCallResult = NULL; } else { /* list steals reference */ DPRINTF( " list append\n" ); if (PyList_Append( taglist, parameter ) == -1) { returnCode = ERROR_CODE; /* list didn't steal ref yet */ errorType = PyExc_SystemError; /* following is likely to fail, as we've likely run out of memory */ errorMessage = PyString_FromFormat( "Unable to append result tuple to result list!" 
); } } } } DPRINTF( "checking whether to release object\n" ); if (releaseCallObject) { Py_DECREF( objectToCall ); } objectToCall = NULL; releaseCallObject = 0; if (releaseChildResults) { Py_DECREF( childResults ); } childResults = NULL; releaseChildResults = 0; if (releaseParameter && parameter ) { Py_DECREF( parameter ); } parameter = NULL; releaseParameter = 1; } /* ends the else clause for reporting a result */ /* reset for lookahead */ if (flags & MATCH_LOOKAHEAD) { position = childStart; } else { position = childPosition; } index += successJump; DPRINTF( "finished success-handler code\n" ); break; } case FAILURE_CODE: /* failed, if failure jump is default, should set table returnCode */ if (childResults) { if (childResults != taglist) { /* different list, decref it since we won't be using it any more */ Py_DECREF( childResults ); } childResults = NULL; } /* XXX possible (eventual) logic error here? fail with jump of 0 might work in certain cases where the "parsing" is actually occuring outside of the current buffer (i.e. a side-effect-based parsing node that fails X times before finally succeeding). Don't see anything in current commands that can cause a problem but we may need to make this an explicitly watched idea, rather than a consequence of the child failing with a 0 failureJump value. */ position = childStart; if (failureJump == 0) { returnCode = 1; } else { index += failureJump; } break; case PENDING_CODE: /* the child tag hasn't begun parsing, this was a recursive-tag-start loop pass. PENDING_CODE is set by the stack push operation */ break; case ERROR_CODE: { /* explicit error encountered while processing this child Handle this as gracefully as possible, potentially triggering huge sets of operations, but therefore needing to be very careful about system-level errors (such as memory errors). 1) Signal whole table as err-d 2) Record any extra values for the error message? 
*/ returnCode = ERROR_CODE; break; } default: { /* what error should be raised when an un-recognised return code is generated? */ returnCode = ERROR_CODE; errorType = PyExc_SystemError; errorMessage = PyString_FromFormat( "An unknown child return code %i was generated by tag-table item %d", childReturnCode, (unsigned int)index ); } } childReturnCode = NULL_CODE; /* single entry processing loop complete */ } /* we're done the table, figure out what to do. */ if (returnCode == NULL_CODE) { /* no explicit return code was set, but done table: index went beyond table_len (>=table_len) -> success index moved before table start (<= 0) -> failure */ if (index >= table_len) { /* success */ returnCode = SUCCESS_CODE; } else if (position >= sliceright) { /* EOF while parsing, special type of failure Eventually allow for returning the whole parse-stack for restarting the parser from a particular point. */ /*returnCode = EOF_CODE;*/ returnCode = FAILURE_CODE; } else if (index < 0) { /* explicit jump before table */ returnCode = FAILURE_CODE; } else { returnCode = FAILURE_CODE; } } if (returnCode == FAILURE_CODE) { /* truncate result list */ if (PyList_SetSlice( taglist, taglist_len, PyList_Size(taglist), NULL) ) { returnCode = ERROR_CODE; errorMessage = PyString_FromFormat( "Unable to truncate list object (likely tagging engine error) type(%.50s)", Py_TYPE(taglist)->tp_name ); } /* reset position */ position = startPosition; } if (returnCode == ERROR_CODE) { /* DO_FANCY_ERROR_REPORTING( ); This is where we will do the user-triggered error reporting (as well as reporting low-level errors such as memory/type/value). We have 3 values possibly available: errorType -> PyObject * to current error class (or NULL) if it is a MemoryError: Jettison some ballast then attempt to return a short message. Need to create this ballast somewhere for that to work. 
if is any other error class: create the error object and raise it decorate it with details: current table (need to incref to keep alive) current index current position childStart childPosition if it is simpleparse.stt.TextTools.ParsingError: (triggered by the user in their grammar) create a list of non-None parent tagobjs (a stack report) and add it to the object 3) Build an actual error object if possible? 4) Report the parent hierarchy of the failure point 5) */ char * msg = NULL; if (errorMessage && errorType) { /* we only report our own error if we've got all the information for it XXX Need to check that we don't have cases that are just setting type */ msg = PyString_AsString( errorMessage); PyErr_SetString( errorType, msg ); Py_DECREF( errorMessage ); } /* need to free the whole stack at once */ while (stackParent != NULL) { /* this is inefficient, should do it all-in-one-go without copying values back save for startPosition and returnCode in the last item*/ POP_STACK /* need to clean up all INCREF'd objects as we go... */ if (childResults != taglist) { /* different list, decref it since we won't be using it any more */ Py_DECREF( childResults ); } childResults = NULL; } *next = startPosition; return 0; } else { if (stackParent != NULL) { /* pop stack also sets the childReturnCode for us... */ POP_STACK } else { /* this was the root table, return the final results */ if (returnCode == FAILURE_CODE) { /* there is a clause in the docs for tag that says this will return the "error position" for the table. 
That requires reporting childPosition for the the last-matched position */ *next = childPosition; } else { *next = position; } return returnCode; } } } /* end of infinite loop */ } SimpleParse-2.2.0/simpleparse/stt/TextTools/mxTextTools/mxstdlib.h0000644000175000017500000001342012037615407026754 0ustar mcfletchmcfletch00000000000000#ifndef MXSTDLIB_H #define MXSTDLIB_H /* Standard stuff I use often -- not Python specific Copyright (c) 2000, Marc-Andre Lemburg; mailto:mal@lemburg.com Copyright (c) 2000-2002, eGenix.com Software GmbH; mailto:info@egenix.com See the documentation for further copyright information or contact the author. */ #include #include #include #include #include #include #ifdef HAVE_LIMITS_H #include #else #ifndef INT_MAX # define INT_MAX 2147483647 #endif #ifndef LONG_MAX # define LONG_MAX INT_MAX #endif #endif /* --- My own macros for memory allocation... --------------------------- */ #ifdef MAL_MEM_DEBUG # define newstruct(x) \ (mxDebugPrintf("* malloc for struct "#x" (%s:%i)\n",__FILE__,__LINE__),\ (x *)malloc(sizeof(x))) # define cnewstruct(x) \ (mxDebugPrintf("* calloc for struct "#x" (%s:%i)\n",c,__FILE__,__LINE__),\ (x *)calloc(sizeof(x),1)) # define new(x,c) \ (mxDebugPrintf("* malloc for "#c"=%i '"#x"'s (%s:%i)\n",c,__FILE__,__LINE__),\ (x *)malloc(sizeof(x)*(c))) # define cnew(x,c) \ (mxDebugPrintf("* calloc for "#c"=%i '"#x"'s (%s:%i)\n",c,__FILE__,__LINE__),\ (x *)calloc((c),sizeof(x))) # define resize(var,x,c) \ (mxDebugPrintf("* realloc array "#var" ("#x") at %X to size "#c"=%i (%s:%i)\n",var,c,__FILE__,__LINE__),\ (x *)realloc((void*)(var),sizeof(x)*(c))) # define varresize(var,x,bytes) \ (mxDebugPrintf("* realloc var "#var" ("#x") at %X to %i bytes (%s:%i)\n",var,bytes,__FILE__,__LINE__),\ (x *)realloc((void*)(var),(bytes))) # define free(x) \ (mxDebugPrintf("* freeing "#x" at %X (%s:%i)\n",x,__FILE__,__LINE__),\ free((void*)(x))) #else # define newstruct(x) ((x *)malloc(sizeof(x))) # define cnewstruct(x) ((x 
*)calloc(sizeof(x),1)) # define new(x,c) ((x *)malloc(sizeof(x)*(c))) # define cnew(x,c) ((x *)calloc((c),sizeof(x))) # define resize(var,x,c) ((x *)realloc((void*)(var),sizeof(x)*(c))) # define varresize(var,x,bytes) ((x *)realloc((void*)(var),(bytes))) # define free(x) free((void*)(x)) #endif /* --- Debugging output ------------------------------------------------- */ /* Use the flag MAL_DEBUG to enable debug processing. The flag MAL_DEBUG_WITH_PYTHON can be used to indicate that the object file will be linked with Python, so we can use Python APIs for the debug processing here. */ #ifdef MAL_DEBUG_WITH_PYTHON # ifndef PYTHON_API_VERSION # error "mx.h must be included when compiling with MAL_DEBUG_WITH_PYTHON" # endif # ifndef MAL_DEBUG # define MAL_DEBUG # endif #else # if defined(PYTHON_API_VERSION) && defined(MAL_DEBUG) # define MAL_DEBUG_WITH_PYTHON # endif #endif /* Indicator for the availability of these interfaces: */ #define HAVE_MAL_DEBUG /* Name of the environment variable defining the log file name to be used: */ #ifndef MAL_DEBUG_OUTPUTFILE_ENV_VARIABLE # define MAL_DEBUG_OUTPUTFILE_ENV_VARIABLE "mxLogFile" #endif /* File name to be used for debug logging (each object file using this facility may set its own logging file) if no environment variable is set: */ #ifndef MAL_DEBUG_OUTPUTFILE # define MAL_DEBUG_OUTPUTFILE "mx.log" #endif /* Name of the environment variable defining the log file prefix to be used (e.g. to direct all log files into a separate directory): */ #ifndef MAL_DEBUG_OUTPUTFILEPREFIX_ENV_VARIABLE # define MAL_DEBUG_OUTPUTFILEPREFIX_ENV_VARIABLE "mxLogFileDir" #endif /* File name prefix to be used for log files, if no environment variable is set: */ #ifndef MAL_DEBUG_OUTPUTFILEPREFIX # define MAL_DEBUG_OUTPUTFILEPREFIX "" #endif /* Log id to be used */ #ifndef MAL_DEBUG_LOGID # define MAL_DEBUG_LOGID "New Log Session" #endif /* Debug printf() API Output is written to a log file or stream. 
If the output file is not yet open, the function will try to open the file as defined by the environment or the program defines. The file remains open until the program terminates. Subsequent changes to the environment are not taken into account. The output file is deduced in the following way: 1. get the filename from the environment, revert to the predefined value 2. get the filename prefix from the environment, revert to the predefined value 3. if filename is one of "stderr" or "stdout" use the native streams for output; otherwise try to open fileprefix + filename reverting to stderr in case this fails. */ static int mxDebugPrintf(const char *format, ...) { return 1; } #ifdef MAL_DEBUG # ifdef MAL_DEBUG_WITH_PYTHON /* Use the Python debug flag to enable debugging output (python -d) */ # define DPRINTF if (Py_DebugFlag) mxDebugPrintf # define IF_DEBUGGING if (Py_DebugFlag) # define DEBUGGING (Py_DebugFlag > 0) # else /* Always output debugging information */ # define DPRINTF mxDebugPrintf # define IF_DEBUGGING # define DEBUGGING (1) # endif #else # ifndef _MSC_VER /* This assumes that you are using an optimizing compiler which eliminates the resulting debug code. */ # define DPRINTF if (0) mxDebugPrintf # define IF_DEBUGGING if (0) # define DEBUGGING (0) # else /* MSVC doesn't do a good job here, so we use a different approach. */ # define DPRINTF 0 && mxDebugPrintf # define IF_DEBUGGING if (0) # define DEBUGGING (0) # endif #endif /* --- Misc ------------------------------------------------------------- */ /* The usual bunch... */ #ifndef max # define max(a,b) ((a>b)?(a):(b)) #endif #ifndef min # define min(a,b) ((a3: jne = e[3] if len(e)>4: je = e[4] flags,cmd = divmod(c,256) c = id2cmd[cmd] if type(m) == TupleType and c in ('Table','SubTable'): m = '' elif m == None: m = 'Here/To' else: m = repr(m) if len(m) > 17: m = m[:17]+'...' 
return '%-15.15s : %-30s : jne=%+i : je=%+i' % \ (repr(t),'%-.15s : %s'%(c,m),jne,je) def format_table(table,i=-1): """ Returns a pp-formatted version of the tag table as string """ l = [] for j in range(len(table)): if i == j: l.append('--> '+format_entry(table,j)) else: l.append(' '+format_entry(table,j)) return '\n'.join(l)+'\n' def print_tagtable(table): """ Print the tag table """ print(format_table(table)) def print_tags(text,tags,indent=0): """ Print the taglist tags for text using the given indent level """ for tag,l,r,subtags in tags: tagname = repr(tag) if len(tagname) > 20: tagname = tagname[:20] + '...' target = repr(text[l:r]) if len(target) > 60: target = target[:60] + '...' if subtags == None: print(' '+indent*' |',tagname,': ',target,(l,r)) else: print(' '+indent*' |',tagname,': ',target,(l,r)) print_tags(text,subtags,indent+1) def print_joinlist(joins,indent=0, StringType=str): """ Print the joinlist joins using the given indent level """ for j in joins: if type(j) == StringType: text = repr(j) if len(text) > 40: text = text[:40] + '...' print(' '+indent*' |',text,' (len = %i)' % len(j)) else: text = j[0] l,r = j[1:3] text = repr(text[l:r]) if len(text) > 40: text = text[:40] + '...' print(' '+indent*' |',text,' (len = %i)' % (r-l),(l,r)) def normlist(jlist, StringType=str): """ Return a normalized joinlist. All tuples in the joinlist are turned into real strings. The resulting list is a equivalent copy of the joinlist only consisting of strings. 
""" l = [''] * len(jlist) for i in range(len(jlist)): entry = jlist[i] if type(entry) == StringType: l[i] = entry else: l[i] = entry[0][entry[1]:entry[2]] return l # # aid for matching from a list of words # def _lookup_dict(l,index=0): d = {} for w in l: c = w[index] if c in d: d[c].append(w) else: d[c] = [w] return d def word_in_list(l): """ Creates a lookup table that matches the words in l """ t = [] d = _lookup_dict(l) keys = list(d.keys()) if len(keys) < 18: # somewhat arbitrary bound # fast hint for small sets t.append((None,IsIn,''.join(list(d.keys())))) t.append((None,Skip,-1)) # test groups for c, group in list(d.items()): t.append(None) # hint will be filled in later i = len(t)-1 for w in group: t.append((None,Word,w[1:],+1,MatchOk)) t.append((None,Fail,Here)) # add hint t[i] = (None,Is,c,len(t)-i) t.append((None,Fail,Here)) return tuple(t) # # Extra stuff useful in combination with the C functions # def replace(text,what,with_what,start=0,stop=None, SearchObject=TextSearch,join=join,joinlist=joinlist,tag=tag, string_replace=str.replace,type=type, StringType=str): """A fast replacement for string.replace. what can be given as string or search object. This function is a good example for the AppendTagobj-flag usage (the taglist can be used directly as joinlist). 
""" if type(what) is not TextSearchType: so = SearchObject(what) else: so = what what = so.match if stop is None: if start == 0 and len(what) < 2: return string_replace(text,what,with_what) stop = len(text) t = ((text,sWordStart,so,+2), # Found something, replace and continue searching (with_what,Skip+AppendTagobj,len(what),-1,-1), # Rest of text (text,Move,ToEOF) ) found,taglist,last = tag(text,t,start,stop) if not found: return text return join(taglist) # Alternative (usually slower) versions using different techniques: def _replace2(text,what,with_what,start=0,stop=None, join=join,joinlist=joinlist,tag=tag, TextSearchType=TextSearchType,TextSearch=TextSearch): """Analogon to string.replace; returns a string with_what all occurences of what in text[start:stop] replaced by with_what. This version uses a one entry tag-table and a Boyer-Moore-Search-object. what can be a string or a TextSearch search object. It's faster than string.replace in those cases, where the what-string gets long and/or many replacements are found; faster meaning from a few percent up to many times as fast start and stop define the slice of text to work in. stop defaults to len(text). 
""" if stop is None: stop = len(text) if type(what) is not TextSearchType: what=TextSearch(what) t = ((with_what,sFindWord,what,+1,+0),) found,taglist,last = tag(text,t,start,stop) if not found: return text return join(joinlist(text,taglist)) def _replace3(text,what,with_what, join=str.join,TextSearch=TextSearch, TextSearchType=TextSearchType): if type(what) is not TextSearchType: what=TextSearch(what) slices = what.findall(text) if not slices: return text l = [] x = 0 for left,right in slices: l.append(text[x:left] + with_what) x = right l.append(text[x:]) return join(l,'') def _replace4(text,what,with_what, join=join,joinlist=joinlist,tag=tag,TextSearch=TextSearch, TextSearchType=TextSearchType): if type(what) is not TextSearchType: what=TextSearch(what) slices = what.findall(text) if not slices: return text repl = [None]*len(slices) for i in range(len(slices)): repl[i] = (with_what,)+slices[i] return join(joinlist(text,repl)) def multireplace(text,replacements,start=0,stop=None, join=join,joinlist=joinlist): """ Apply multiple replacement to a text at once. replacements must be list of tuples (replacement, left, right). It is used to replace the slice text[left:right] with_what the string replacement. Note that the replacements do not affect one another. Indices always refer to the original text string. Replacements must not overlap. Otherwise a ValueError is raised. """ if stop is not None: return join(joinlist(text,replacements,start,stop)) else: return join(joinlist(text,replacements,start)) def find(text,what,start=0,stop=None, SearchObject=TextSearch): """ A faster replacement for string.find(). Uses a search object for the task. Returns the position of the first occurance of what in text[start:stop]. stop defaults to len(text). Returns -1 in case no occurance was found. 
""" if stop is not None: return SearchObject(what).find(text,start,stop) else: return SearchObject(what).find(text,start) def findall(text,what,start=0,stop=None, SearchObject=TextSearch): """ Find all occurances of what in text. Uses a search object for the task. Returns a list of slice tuples (l,r) marking the all occurances in text[start:stop]. stop defaults to len(text). Returns an empty list in case no occurance was found. """ if stop is not None: return SearchObject(what).findall(text,start,stop) else: return SearchObject(what).findall(text,start) def split(text,sep,start=0,stop=None,translate=None, SearchObject=TextSearch): """ A faster replacement for string.split(). Uses a search object for the task. Returns the result of cutting the text[start:stop] string into snippets at every sep occurance in form of a list of substrings. translate is passed to the search object as translation string if given. XXX Undocumented. """ if translate: so = SearchObject(sep,translate) else: so = SearchObject(sep) if stop: cuts = so.findall(text,start,stop) else: cuts = so.findall(text,start) l = 0 list = [] append = list.append for left,right in cuts: append(text[l:left]) l = right append(text[l:]) return list # helper for tagdict def _tagdict(text,dict,prefix,taglist): for o,l,r,s in taglist: pfx = prefix + str(o) dict[pfx] = text[l:r] if s: _tagdict(text,dict,pfx+'.',s) def tagdict(text,*args): """ Tag a text just like the function tag() and then convert its output into a dictionary where the tagobjects reference their respective strings This function emulates the interface of tag(). In contrast to tag() this funtion *does* make copies of the found stings, though. Returns a tuple (rc,tagdict,next) with_what the same meaning of rc and next as tag(); tagdict is the new dictionary or None in case rc is 0. 
""" rc,taglist,next = tag(*(text,)+args) if not rc: return (rc,None,next) d = {} tagdict = _tagdict for o,l,r,s in taglist: pfx = str(o) d[pfx] = text[l:r] if s: tagdict(text,d,pfx+'.',s) return (rc,d,next) def invset(chars): """ Return a set with_what all characters *except* the ones in chars. """ return set(chars,0) def is_whitespace(text,start=0,stop=None, charset=nonwhitespace_charset): """ Return 1 iff text[start:stop] only contains whitespace characters (as defined in Constants/Sets.py), 0 otherwise. """ if stop is None: stop = len(text) return (charset.search(text, 1, start, stop) is None) def collapse(text,separator=' ', join=join,charset=CharSet(newline+whitespace)): """ Eliminates newline characters and compresses whitespace characters into one space. The result is a one line text string. Tim Peters will like this function called with_what '-' separator ;-) """ return join(charset.split(text), separator) _linesplit_table = ( (None,Is,'\r',+1), (None,Is,'\n',+1), ('line',AllInCharSet+AppendMatch,CharSet('^\r\n'),+1,-2), (None,EOF,Here,+1,MatchOk), ('empty line',Skip+AppendMatch,0,0,-4), ) def splitlines(text, tag=tag,linesplit_table=_linesplit_table): """ Split text into a list of single lines. The following combinations are considered to be line-ends: '\r', '\r\n', '\n'; they may be used in any combination. The line-end indicators are removed from the strings prior to adding them to the list. This function allows dealing with text files from Macs, PCs and Unix origins in a portable way. """ return tag(text, linesplit_table)[1] # Alias for backward compatibility linesplit = splitlines _linecount_table = ( (None,Is,'\r',+1), (None,Is,'\n',+1), ('line',AllInCharSet,CharSet('^\r\n'),+1,-2), (None,EOF,Here,+1,MatchOk), ('empty line',Skip,0,0,-4), ) def countlines(text, linecount_table=_linecount_table): """ Returns the number of lines in text. Line ends are treated just like for splitlines() in a portable way. 
""" return len(tag(text, linecount_table)[1]) _wordsplit_table = ( (None,AllInCharSet,whitespace_charset,+1), ('word',AllInCharSet+AppendMatch,nonwhitespace_charset,+1,-1), (None,EOF,Here,+1,MatchOk), ) def splitwords(text, charset=whitespace_charset): """ Split text into a list of single words. Words are separated by whitespace. The whitespace is stripped before adding the words to the list. """ return charset.split(text) # # Testing and benchmarking # # Taken from my hack.py module: import time class _timer: """ timer class with a quite obvious interface - .start() starts a fairly accurate CPU-time timer plus an absolute timer - .stop() stops the timer and returns a tuple: the CPU-time in seconds and the absolute time elapsed since .start() was called """ utime = 0 atime = 0 def start(self, clock=time.clock,time=time.time): self.atime = time() self.utime = clock() def stop(self, clock=time.clock,time=time.time): self.utime = clock() - self.utime self.atime = time() - self.atime return self.utime,self.atime def usertime(self, clock=time.clock,time=time.time): self.utime = clock() - self.utime self.atime = time() - self.atime return self.utime def abstime(self, clock=time.clock,time=time.time): self.utime = clock() - self.utime self.atime = time() - self.atime return self.utime def __str__(self): return '%0.2fu %0.2fa sec.' 
% (self.utime,self.atime) def _bench(file='mxTextTools/mxTextTools.c'): def mismatch(orig,new): print() for i in range(len(orig)): if orig[i] != new[i]: break else: print('Length mismatch: orig=%i new=%i' % (len(orig),len(new))) if len(orig) > len(new): print('Missing chars:'+repr(orig[len(new):])) else: print('Excess chars:'+repr(new[len(orig):])) print() return print('Mismatch at offset %i:' % i) print((orig[i-100:i] + '<- %s != %s ->' % (repr(orig[i]),repr(new[i])) + orig[i+1:i+100])) print() text = open(file).read() import string t = _timer() print('Working on a %i byte string' % len(text)) if 0: print() print('Replacing strings') print('-'*72) print() for what,with_what in (('m','M'),('mx','MX'),('mxText','MXTEXT'), ('hmm','HMM'),('hmmm','HMM'),('hmhmm','HMM')): print('Replace "%s" with "%s"' % (what,with_what)) t.start() for i in range(100): rtext = text.replace(what,with_what) print('with string.replace:',t.stop(),'sec.') t.start() for i in range(100): ttext = replace(text,what,with_what) print('with tag.replace:',t.stop(),'sec.') if ttext != rtext: print('results are NOT ok !') print('-'*72) mismatch(rtext,ttext) t.start() for i in range(100): ttext = _replace2(text,what,with_what) print('with tag._replace2:',t.stop(),'sec.') if ttext != rtext: print('results are NOT ok !') print('-'*72) print(rtext) t.start() for i in range(100): ttext = _replace3(text,what,with_what) print('with tag._replace3:',t.stop(),'sec.') if ttext != rtext: print('results are NOT ok !') print('-'*72) print(rtext) t.start() for i in range(100): ttext = _replace4(text,what,with_what) print('with tag._replace4:',t.stop(),'sec.') if ttext != rtext: print('results are NOT ok !') print('-'*72) print(rtext) print() if 0: print() print('String lower/upper') print('-'*72) print() op = str.lower t.start() for i in range(1000): op(text) t.stop() print(' string.lower:',t) op = str.upper t.start() for i in range(1000): op(text) t.stop() print(' string.upper:',t) op = upper t.start() for i in 
range(1000): op(text) t.stop() print(' TextTools.upper:',t) op = lower t.start() for i in range(1000): op(text) t.stop() print(' TextTools.lower:',t) print('Testing...', end=' ') ltext = text.lower() assert ltext == lower(text) utext = text.upper() assert utext == upper(text) print('ok.') if 0: print() print('Joining lists') print('-'*72) print() l = setsplit(text,whitespace_set) op = str.join t.start() for i in range(1000): op(l) t.stop() print(' string.join:',t) op = join t.start() for i in range(1000): op(l) t.stop() print(' TextTools.join:',t) op = str.join t.start() for i in range(1000): op(l,' ') t.stop() print(' string.join with separator:',t) op = join t.start() for i in range(1000): op(l,' ') t.stop() print(' TextTools.join with separator:',t) if 0: print() print('Creating join lists') print('-'*72) print() repl = [] for i in range(0,len(text),10): repl.append((str(i),i,i+1)) op = joinlist t.start() for i in range(1000): op(text,repl) t.stop() print(' TextTools.joinlist:',t) if 0: print() print('Splitting text') print('-'*72) print() op = str.split t.start() for i in range(100): op(text) t.stop() print(' string.split whitespace:',t,'(',len(op(text)),'snippets )') op = setsplit ws = whitespace_set t.start() for i in range(100): op(text,ws) t.stop() print(' TextTools.setsplit whitespace:',t,'(',len(op(text,ws)),'snippets )') assert text.split() == setsplit(text,ws) op = str.split sep = 'a' t.start() for i in range(100): op(text,sep) t.stop() print(' string.split at "a":',t,'(',len(op(text,sep)),'snippets )') op = split sep = 'a' t.start() for i in range(100): op(text,sep) t.stop() print(' TextTools.split at "a":',t,'(',len(op(text,sep)),'snippets )') op = charsplit sep = 'a' t.start() for i in range(100): op(text,sep) t.stop() print(' TextTools.charsplit at "a":',t,'(',len(op(text,sep)),'snippets )') op = setsplit sep = set('a') t.start() for i in range(100): op(text,sep) t.stop() print(' TextTools.setsplit at "a":',t,'(',len(op(text,sep)),'snippets )') # 
Note: string.split and setsplit don't work identically ! op = str.split sep = 'int' t.start() for i in range(100): op(text,sep) t.stop() print(' string.split at "int":',t,'(',len(op(text,sep)),'snippets )') op = split sep = 'int' t.start() for i in range(100): op(text,sep) t.stop() print(' TextTools.split at "int":',t,'(',len(op(text,sep)),'snippets )') op = setsplit sep = set('int') t.start() for i in range(100): op(text,sep) t.stop() print(' TextTools.setsplit at "i", "n", "t":',t,'(',len(op(text,sep)),'snippets )') op = str.split sep = 'register' t.start() for i in range(100): op(text,sep) t.stop() print(' string.split at "register":',t,'(',len(op(text,sep)),'snippets )') op = split sep = 'register' t.start() for i in range(100): op(text,sep) t.stop() print(' TextTools.split at "register":',t,'(',len(op(text,sep)),'snippets )') if __name__=='__main__': _bench() SimpleParse-2.2.0/simpleparse/stt/TextTools/__init__.py0000644000175000017500000000355612620706017024561 0ustar mcfletchmcfletch00000000000000""" mxTextTools - A tools package for fast text processing. Copyright (c) 2000, Marc-Andre Lemburg; mailto:mal@lemburg.com Copyright (c) 2000-2003, eGenix.com Software GmbH; mailto:info@egenix.com Copyright (c) 2003-2006, Mike Fletcher; mailto:mcfletch@vrplumber.com See the documentation for further information on copyrights, or contact the author. All Rights Reserved. 
""" from .TextTools import * from .TextTools import __version__ try: import copyreg except ImportError: import copy_reg as copyreg ### Make the types pickleable: # Shortcuts for pickle (reduces the pickle's length) def _CS(definition): return CharSet(definition) def _TT(definition): return TagTable(definition) def _TS(match,translate,algorithm): return TextSearch(match,translate,algorithm) # Needed for backward compatibility: def _BMS(match,translate): return BMS(match,translate) def _FS(match,translate): return FS(match,translate) # Module init class modinit: ### Register the types def pickle_CharSet(cs): return _CS,(cs.definition,) def pickle_TagTable(tt): return _TT,(tt.compiled(),) def pickle_TextSearch(ts): return _TS,(ts.match, ts.translate, ts.algorithm) copyreg.pickle(CharSetType, pickle_CharSet, _CS) copyreg.pickle(TagTableType, pickle_TagTable, _TT) copyreg.pickle(TextSearchType, pickle_TextSearch, _TS) if 0: def pickle_BMS(so): return _BMS,(so.match,so.translate) def pickle_FS(so): return _FS,(so.match,so.translate) copyreg.pickle(BMSType, pickle_BMS, _BMS) copyreg.pickle(FSType, pickle_FS, _FS) del modinit SimpleParse-2.2.0/simpleparse/stt/LICENSE0000644000175000017500000000114512037615407021504 0ustar mcfletchmcfletch00000000000000============================================================================= eGenix.com mx Extension Series for Python ----------------------------------------------------------------------------- Please see the LICENSE file in the package subdirectories for information on copyright and authorized use of the packages. Some packages are licensed as Open Source software, others are free for educational use only. All packages come with full source code. 
----------------------------------------------------------------------------- Marc-Andre Lemburg, mal@egenix.com CEO eGenix.com GmbH Langenfeld Germany SimpleParse-2.2.0/simpleparse/stt/__init__.py0000644000175000017500000000171412620706017022606 0ustar mcfletchmcfletch00000000000000"""SimpleParse' built-in version of the mxTextTools text-processing engine Copyright (c) 1998-2000, Marc-Andre Lemburg; mailto:mal@lemburg.com Copyright (c) 2000-2003, eGenix.com Software GmbH; mailto:info@egenix.com Copyright (c) 2003-2006, Mike Fletcher; mailto:mcfletch@vrplumber.com See the documentation for further information on copyrights, or contact the author. All Rights Reserved. IMPORTANT: The subpackages included in the mx Extension series may have different license terms imposed on them. Be sure to read the documentation of each subpackage *before* using them. """ __copyright__ = """\ Copyright (c) 1998-2000, Marc-Andre Lemburg; mailto:mal@lemburg.com Copyright (c) 2000-2003, eGenix.com Software GmbH; mailto:info@egenix.com Copyright (c) 2003-2006, Mike Fletcher; mailto:mcfletch@vrplumber.com See the documentation for further information on copyrights, or contact the author. All Rights Reserved. """SimpleParse-2.2.0/simpleparse/objectgenerator.py0000644000175000017500000007602412620706017023420 0ustar mcfletchmcfletch00000000000000"""Object-oriented tag-table generator objects The objectgenerator module is the core of the SimpleParse system, the various element token classes defined here implement transitions from EBNF-style abstractions into the low-level (assembly-like) instructions to the TextTools engine. Each class within the module is a sub-class of ElementToken, which provides a number of common facilities, the most obvious of which is the permute method, which takes care of the negative, optional, and repeating flags for the normal case (with character ranges and literals being non-normal). 
""" from __future__ import print_function from simpleparse.stt.TextTools.TextTools import * ### Direct use of BMS is deprecated now... try: TextSearch except NameError: TextSearch = BMS from simpleparse.error import ParserSyntaxError import copy class ElementToken: """Abstract base class for all ElementTokens Common Attributes: negative -- the element token should match a character if the "base" definition would not match at the current position optional -- the element token will match even if the base definition would not match at the current position repeating -- if the element is successfully matched, attempt to match it again. lookahead -- if true, the scanning position of the engine will be reset after the element matches errorOnFail -- if true, the engine will call the object stored in errorOnFail as a text- matching object iff the element token fails to match. This is used to signal SyntaxErrors. Attributes only used for top-level Productions: report -- if true, the production's results will be added to the result tree expanded -- if true, the production's children's results will be added to the result tree but the production's own result will be ignored """ negative = 0 optional = 0 repeating = 0 report = 1 # note that optional and errorOnFail are mutually exclusive errorOnFail = None # any item may be marked as expanded, # which says that it's a top-level declaration # and that links to it should automatically expand # as if the name wasn't present... expanded = 0 lookahead = 0 def __init__( self, **namedarguments ): """Initialize the object with named attributes This method simply takes the named attributes and updates the object's dictionary with them """ self.__dict__.update( namedarguments ) def toParser( self, generator, noReport=0 ): """Abstract interface for implementing the conversion to a text-tools table generator -- an instance of generator.Generator which provides various facilities for discovering other productions. 
noReport -- if true, we're being called recursively for a terminal grammar fragment where one of our parents has explicitly suppressed all reporting. This method is called by the generator or by another element-token's toParser method. """ raise NotImplementedError( '''Element token generator abstract function called''' ) def permute( self, basetable ): '''Given a positive, required, non-repeating table, convert to appropriately configured table This method applies generic logic for applying the operational flags to a basic recipe for an element. It is normally called from the elements-token's own toParser method. ''' flags = 0 if self.lookahead: flags = flags + LookAhead assert len(basetable) == 3, '''Attempt to permute a base table that already has fail flag set, can only permute unadorned tables''' if self.negative: # negative "matches" if it fails # we add in the flags while we're at it... basetable = (None, SubTable+flags, ( basetable + (1,2), (None, EOF, Here,2,1), # if we hit eof, this didn't match, otherwise, we matched (None, Fail, Here),# either hit eof or matched the client (None,Skip,1), )) elif flags: # unpack, add the flags, and repack tag, command, arg = basetable basetable = ( tag, command+flags, arg) if self.repeating: ### There are a number of problems with repetition that we'd like to solve ### via recursive table calls, but those are very expensive in the current ### implementation, so we need to use something a little more hacky... if self.optional: return [ ## this would be the "simplistic" implementation... 
## basetable + (1,0) ## it doesn't work because of cases ## where all-optional children "succeed" without consuming ## when within a repeating parent ## the EOF test isn't enough to fix the problem, ## as it's only checking a common case, not the underlying failure basetable +(2,1), # fail, done, succeed, check for eof and if not, try matching again # if we hit eof, no chance of further matches, # consider ourselves done (None, EOF, Here,-1,1), ] elif self.errorOnFail: return [ basetable+(1,2), (None, Call, self.errorOnFail), # as for optional... basetable +(2,1), (None, EOF, Here,-1,1), ] else: return [ basetable, # as for optional... basetable +(2,1), (None, EOF, Here,-1,1), ] else: # single if self.optional: return [ basetable +(1,1) ] elif self.errorOnFail: return [ basetable+(1,2), (None, Call, self.errorOnFail), ] else: # not optional return [ basetable ] def __repr__( self): """Return a readily recognisable version of ourself""" from simpleparse import printers return printers.asObject( self ) def terminal (self, generator): """Determine if this element is terminal for the generator""" return 0 class Literal( ElementToken ): """Literal string value to be matched Literals are one of the most common elements within any grammar. 
The implementation tries to use the most efficient mechanism available for matching/searching for a literal value, so the Literal class does not use the permute method, instead defining explicit parsing methodologies for each flag and value combination Literals in the SimpleParse EBNF grammar are defined like so: "test", "test"?, "test"*, "test"+ -"test", -"test"?, -"test"*, -"test"+ Attributes: value -- a string storing the literal's value Notes: Currently we don't support Unicode literals See also: CILiteral -- case-insensitive Literal values """ value = "" def toParser( self, generator=None, noReport=0 ): """Create the parser for the element token""" flags = 0 if self.lookahead: flags = flags + LookAhead base = self.baseToParser( generator ) if flags or self.errorOnFail: if self.errorOnFail: return [(None, SubTable+flags, tuple(base),1,2),(None, Call, self.errorOnFail)] else: return [(None, SubTable+flags, tuple(base))] else: return base def baseToParser( self, generator=None ): """Parser generation without considering flag settings""" svalue = self.value if self.negative: if self.repeating: # a repeating negative value, a "search" in effect if self.optional: # if fails, then go to end of file return [ (None, sWordStart, TextSearch( svalue ),1,2), (None, Move, ToEOF ) ] else: # must first check to make sure the current position is not the word, then the same return [ (None, Word, svalue, 2,1), (None, Fail, Here), (None, sWordStart, TextSearch( svalue ),1,2), (None, Move, ToEOF ) ] #return [ (None, Word, svalue, 2,1),(None, Fail, Here),(None, WordStart, svalue,1,2), (None, Move, ToEOF ) ] else: # a single-character test saying "not a this" if self.optional: # test for a success, move back if success, move one forward if failure if len(svalue) > 1: return [ (None, Word, svalue, 2,1), (None, Skip, -len(svalue), 2,2), # backup if this was the word to start of word, succeed (None, Skip, 1 ) ] # else just move one character and succeed else: # Uses Is test instead of 
Word test, should be faster I'd imagine return [ (None, Is, svalue, 2,1), (None, Skip, -1, 2,2), # backtrack (None, Skip, 1 ) ] # else just move one character and succeed else: # must find at least one character not part of the word, so if len(svalue) > 1: return [ (None, Word, svalue, 2,1), (None, Fail, Here), (None, Skip, 1 ) ] # else just move one character and succeed else: #must fail if it finds or move one forward return [ (None, Is, svalue, 2,1), (None, Fail, Here), (None, Skip, 1 ) ] # else just move one character and succeed else: # positive if self.repeating: if self.optional: if len(svalue) > 1: return [ (None, Word, svalue, 1,0) ] else: return [ (None, Is, svalue, 1,0) ] else: # not optional if len(svalue) > 1: return [ (None, Word, svalue),(None, Word, svalue,1,0) ] else: return [ (None, Is, svalue),(None, Is, svalue,1,0) ] else: # not repeating if self.optional: if len(svalue) > 1: return [ (None, Word, svalue, 1,1) ] else: return [ (None, Is, svalue, 1,1) ] else: # not optional if len(svalue) > 1: return [ (None, Word, svalue) ] else: return [ (None, Word, svalue) ] def terminal (self, generator): """Determine if this element is terminal for the generator""" return 1 class _Range( ElementToken ): """Range of character values where any one of the characters may match The Range token allows you to define a set of characters (using a mini-grammar) of which any one may match. By using the repetition flags, it is possible to easily create such common structures as "names" and "numbers". For example: name := [a-zA-Z]+ number := [0-9.eE]+ (Note: those are not beautifully defined examples :) ). The mini-grammar for the simpleparsegrammar is defined as follows: '[',CHARBRACE?,CHARDASH?, (CHARRANGE/CHARNOBRACE)*, CHARDASH?,']' that is, if a literal ']' character is wanted, you must define the character as the first item in the range. 
A literal '-' character must appear as the first character after any literal ']' character (or the beginning of the range) or as the last character in the range. Note: The expansion from the mini-grammar occurs before the Range token is created (the simpleparse grammar does the expansion), so the value attribute of the token is actually the expanded string of characters. """ value = "" requiresExpandedSet = 1 def toParser( self, generator=None, noReport=0 ): """Create the parser for the element token""" flags = 0 if self.lookahead: flags = flags + LookAhead base = self.baseToParser( generator ) if flags or self.errorOnFail: if self.errorOnFail: return [(None, SubTable+flags, tuple(base),1,2),(None, Call, self.errorOnFail)] else: return [(None, SubTable+flags, tuple(base))] else: return base # this should be a faster and more generic character set # approach, but there's a bug with mxTextTools b3 which makes # it non-functional, so for now I'm using the old version. # Eventually this should also support the Unicode character sets ##try: ## CharSet ## class Range( _Range ): ## """Range type using the CharSet feature of mx.TextTools 2.1.0 ## ## The CharSet type allows for both Unicode and 256-char strings, ## so we can use it as our 2.1.0 primary parsing mechanism. ## It also allows for simpler definitions (doesn't require that ## we pre-exand the character set). That's going to require support ## in the SimpleParse grammar, of course. 
## """ ## requiresExpandedSet = 0 ## def baseToParser( self, generator=None ): ## """Parser generation without considering flag settings""" ## svalue = self.value ## print 'generating range for ', repr(svalue) ## if not svalue: ## raise ValueError( '''Range defined with no member values, would cause infinite loop %s'''%(self)) ## if self.negative: ## svalue = '^' + svalue ## print ' generated', repr(svalue) ## svalue = CharSet(svalue) ## if self.repeating: ## if self.optional: ## return [ (None, AllInCharSet, svalue, 1 ) ] ## else: # not optional ## #return [ (None, AllInSet, svalue ) ] ## return [ (None, AllInCharSet, svalue ) ] ## else: # not repeating ## if self.optional: ## #return [ (None, IsInSet, svalue, 1 ) ] ## return [ (None, IsInCharSet, svalue, 1 ) ] ## else: # not optional ## #return [ (None, IsInSet, svalue ) ] ## return [ (None, IsInCharSet, svalue ) ] ##except NameError: class Range( _Range ): """Range type which doesn't use the CharSet features in mx.TextTools This is likely to be much slower than the CharSet version (below), and is unable to handle unicode character sets. However, it will work with TextTools 2.0.3, which may be needed in some cases. 
""" def baseToParser( self, generator=None ): """Parser generation without considering flag settings""" svalue = self.value if not svalue: raise ValueError( '''Range defined with no member values, would cause infinite loop %s'''%(self)) if self.negative: if self.repeating: if self.optional: #return [ (None, AllInSet, svalue, 1 ) ] return [ (None, AllNotIn, svalue, 1 ) ] else: # not optional #return [ (None, AllInSet, svalue ) ] return [ (None, AllNotIn, svalue ) ] else: # not repeating if self.optional: #return [ (None, IsInSet, svalue, 1 ) ] return [ (None, IsNotIn, svalue, 1 ) ] else: # not optional #return [ (None, IsInSet, svalue ) ] return [ (None, IsNotIn, svalue ) ] else: if self.repeating: if self.optional: #return [ (None, AllInSet, svalue, 1 ) ] return [ (None, AllIn, svalue, 1 ) ] else: # not optional #return [ (None, AllInSet, svalue ) ] return [ (None, AllIn, svalue ) ] else: # not repeating if self.optional: #return [ (None, IsInSet, svalue, 1 ) ] return [ (None, IsIn, svalue, 1 ) ] else: # not optional #return [ (None, IsInSet, svalue ) ] return [ (None, IsIn, svalue ) ] def terminal (self, generator): """Determine if this element is terminal for the generator""" return 1 class Group( ElementToken ): """Abstract base class for all group element tokens The primary feature of a group is that it has a set of element tokens stored in the attribute "children". """ children = () terminalValue = None def terminal (self, generator): """Determine if this element is terminal for the generator""" if self.terminalValue in (0,1): return self.terminalValue self.terminalValue = 0 for item in self.children: if not item.terminal( generator): return self.terminalValue self.terminalValue = 1 return self.terminalValue class SequentialGroup( Group ): """A sequence of element tokens which must match in a particular order A sequential group must match each child in turn and all children must be satisfied to consider the group matched. 
Within the simpleparsegrammar, the sequential group is defined like so: ("a", b, c, "d") i.e. a series of comma-separated element token definitions. """ def toParser( self, generator=None, noReport=0 ): elset = [] for child in self.children: elset.extend( child.toParser( generator, noReport ) ) basic = self.permute( (None, SubTable, tuple( elset)) ) if len(basic) == 1: first = basic[0] if len(first) == 3 and first[0] is None and first[1] == SubTable: return tuple(first[2]) return basic class CILiteral( SequentialGroup ): """Case-insensitive Literal values The CILiteral is a sequence of literal and character-range values, where each element is positive and required. Literal values are composed of those characters which are not upper-case/lower-case pairs, while the ranges are all two-character ranges with the upper and lower forms. CILiterals in the SimpleParse EBNF grammar are defined like so: c"test", c"test"?, c"test"*, c"test"+ -c"test", -c"test"?, -c"test"*, -c"test"+ Attributes: value -- a string storing the literal's value Notes: Currently we don't support Unicode literals A CILiteral will be *much* slower than a regular literal or character range """ value = "" def toParser( self, generator=None, noReport=0 ): elset = self.ciParse( self.value ) if len(elset) == 1: # XXX should be compressing these out during optimisation... # pointless declaration of case-insensitivity, # or a single-character value pass basic = self.permute( (None, SubTable, tuple( elset)) ) if len(basic) == 1: first = basic[0] if len(first) == 3 and first[0] is None and first[1] == SubTable: return tuple(first[2]) return basic def ciParse( self, value ): """Break value into set of case-dependent groups...""" def equalPrefix( a,b ): for x in range(len(a)-1): if a[x] != b[x]: return x result = [] a,b = value.upper(), value.lower() while a and b: # is there an equal literal run at the start? 
class FirstOfGroup( Group ):
    """Set of tokens that matches (and stops searching) with the first successful child

    A FirstOf group attempts to match each child in turn,
    declaring success with the first successful child,
    or failure if none of the children match.

    Within the simpleparsegrammar, the FirstOf group
    is defined like so:
        ("a" / b / c / "d")
    i.e. a series of slash-separated element token definitions.
    """
    def toParser( self, generator=None, noReport=0 ):
        # Phase 1: build one single-command table entry per alternative.
        elset = []
        # should catch condition where a child is optional
        # and we are repeating (which causes a crash during
        # parsing), but doing so is rather complex and
        # requires analysis of the whole grammar.
        for el in self.children:
            assert not el.optional, """Optional child of a FirstOf group created, this would cause an infinite recursion in the engine, child was %s"""%el
            dataset = el.toParser( generator, noReport )
            if len( dataset) == 1:# and len(dataset[0]) == 3:
                # single-command child: we can alter the jump states with impunity
                elset.append( dataset[0] )
            else:
                # multi-command child: wrap in an extra SubTable so each
                # alternative remains exactly one entry -- inefficient, but it
                # keeps the jump-offset calculation below trivial
                elset.append( (None, SubTable, tuple( dataset )) )
        # Phase 2: wire the jumps.  For every entry except the last, append
        # (on-failure, on-success) offsets: failure falls through to the next
        # alternative (+1), success jumps past the end of the group.
        procset = []
        for i in range( len( elset) -1): # note that we have to treat last el specially
            procset.append( elset[i] + (1,len(elset)-i) ) # if success, jump past end
        # last alternative keeps default jumps: its failure fails the group
        procset.append( elset[-1] ) # will cause a failure if last element doesn't match
        procset = tuple(procset)
        basetable = (None, SubTable, procset )
        # permute applies the group-level negative/optional/repeating flags
        return self.permute( basetable )
basetable = (None, SubTable, basetable) return self.permute( basetable ) except: print(basetable) raise class Name( ElementToken ): """Reference to another rule in the grammar The Name element token allows you to reference another production within the grammar. There are three major sub-categories of reference depending on both the Name element token and the referenced table's values. if the Name token's report attribute is false, or the target table's report attribute is false, or the Name token negative attribute is true, the Name reference will report nothing in the result tree if the target's expand attribute is true, however, the Name reference will report the children of the target production without reporting the target production's results (SubTable match) finally: if the target is not expanded and the Name token should report something, the generator object is asked to supply the tag object and flags for processing the results of the target. See the generator.MethodSource documentation for details. Notes: expanded and un-reported productions won't get any methodsource methods called when they are finished, that's just how I decided to do it, not sure if there's some case where you'd want it. As a result, it's possible to have a method getting called for one instance (where a name ref is reporting) and not for another (where the name ref isn't reporting). """ value = "" # following two flags are new ideas in the rewrite... report = 1 def toParser( self, generator, noReport=0 ): """Create the table for parsing a name-reference Note that currently most of the "compression" optimisations occur here. 
""" sindex = generator.getNameIndex( self.value ) command = TableInList target = generator.getRootObjects()[sindex] reportSelf = ( (not noReport) and # parent hasn't suppressed reporting self.report and # we are not suppressing ourselves target.report and # target doesn't suppress reporting (not self.negative) and # we aren't a negation, which doesn't report anything by itself (not target.expanded) # we don't report the expanded production ) reportChildren = ( (not noReport) and # parent hasn't suppressed reporting self.report and # we are not suppressing ourselves target.report and # target doesn't suppress reporting (not self.negative) # we aren't a negation, which doesn't report anything by itself ) if reportSelf: svalue = self.value else: svalue = None flags = 0 if target.expanded: # the target is the root of an expandedname declaration # so we need to do special processing to make sure that # it gets properly reported... command = SubTableInList tagobject = None # check for indirected reference to another name... elif not reportSelf: tagobject = svalue else: flags, tagobject = generator.getObjectForName( svalue ) if flags: command = command | flags if tagobject is None and not flags: if self.terminal(generator): if extractFlags(self,reportChildren) != extractFlags(target): composite = compositeFlags(self,target, reportChildren) partial = generator.getCustomTerminalParser( sindex,composite) if partial is not None: return partial partial = tuple(copyToNewFlags(target, composite).toParser( generator, not reportChildren )) generator.cacheCustomTerminalParser( sindex,composite, partial) return partial else: partial = generator.getTerminalParser( sindex ) if partial is not None: return partial partial = tuple(target.toParser( generator, not reportChildren )) generator.setTerminalParser( sindex, partial) return partial # base, required, positive table... 
if ( self.terminal( generator ) and (not flags) and isinstance(target, (SequentialGroup,Literal,Name,Range)) ): partial = generator.getTerminalParser( sindex ) if partial is None: partial = tuple(target.toParser( generator, #not reportChildren )) generator.setTerminalParser( sindex, partial) if len(partial) == 1 and len(partial[0]) == 3 and ( partial[0][0] is None or tagobject is None ): # there is a single child # it doesn't report anything, or we don't partial = (partial[0][0] or tagobject,)+ partial[0][1:] else: partial = (tagobject, Table, tuple(partial)) return self.permute( partial ) basetable = ( tagobject, command, ( generator.getParserList (), sindex, ) ) return self.permute( basetable ) terminalValue = None def terminal (self, generator): """Determine if this element is terminal for the generator""" if self.terminalValue in (0,1): return self.terminalValue self.terminalValue = 0 target = generator.getRootObject( self.value ) if target.terminal( generator): self.terminalValue = 1 return self.terminalValue def extractFlags( item, report=1 ): """Extract the flags from an item as a tuple""" return ( item.negative, item.optional, item.repeating, item.errorOnFail, item.lookahead, item.report and report, ) def compositeFlags( first, second, report=1 ): """Composite flags from two items into overall flag-set""" result = [] for a,b in zip(extractFlags(first, report), extractFlags(second, report)): result.append( a or b ) return tuple(result) def copyToNewFlags( target, flags ): """Copy target using combined flags""" new = copy.copy( target ) for name,value in zip( ("negative","optional","repeating","errorOnFail","lookahead",'report'), flags, ): setattr(new, name,value) return new SimpleParse-2.2.0/simpleparse/error.py0000644000175000017500000000450412620706017021366 0ustar mcfletchmcfletch00000000000000"""Definition of the ParserSyntaxError raised on parse failure""" from simpleparse.stt.TextTools.TextTools import countlines class ParserSyntaxError( SyntaxError ): 
"""Sub-class of SyntaxError for use by SimpleParse parsers Every instance will have the following attributes: buffer -- pointer to the source buffer position -- integer position in buffer where error occured or -1 production -- the production which failed expected -- string (currently taken from grammar) describing what production/element token failed to match the following will be calculated in order to display human-friendly error messages: line -- ~ text line-number or -1 lineChar -- ~ character on line where parsing failed or -1 """ buffer = "" position = -1 line = -1 production = "" expected = "" error_message = None DEFAULTTEMPLATE = """Failed parsing production "%(production)s" @pos %(position)s (~line %(line)s:%(lineChar)s).\nExpected syntax: %(expected)s\nGot text: %(text)s""" def __str__( self ): """Create a string representation of the error""" if self.error_message: return '%s: %s'%( self.__class__.__name__, self.messageFormat(self.error_message) ) else: return '%s: %s'%( self.__class__.__name__, self.messageFormat() ) def messageFormat( self, template=None): """Create a default message for this syntax error""" if template is None: template = self.DEFAULTTEMPLATE line, lineChar = self.getLineCoordinate() variables = { "production": self.production, "position": self.position, "line": line, "lineChar": lineChar, "expected": self.expected or "UNKNOWN", "text": repr(self.buffer[ self.position:self.position+50 ]), } return template % variables def getLineCoordinate( self ): """Get (line number, line character) for the error""" lineChar = self.buffer.rfind('\n', 0, self.position) if lineChar == -1: # was no \n before the current position lineChar = self.position line = 1 else: line = countlines( self.buffer[:lineChar] ) lineChar = self.position-lineChar return line, lineChar SimpleParse-2.2.0/simpleparse/__init__.py0000644000175000017500000000032212620706017021766 0ustar mcfletchmcfletch00000000000000'''Simple parsing using mxTextTools See the /doc 
subdirectory for introductory and general documentation. See license.txt for licensing information. (This is a BSD-licensed package). ''' __version__="2.2.0" SimpleParse-2.2.0/setup.cfg0000644000175000017500000000007312620710576017162 0ustar mcfletchmcfletch00000000000000[egg_info] tag_build = tag_date = 0 tag_svn_revision = 0