java.util.regex
regexes.
*
* @see java.util.regex.Pattern
* @see XML Schema Part 2
*/
// Table of character classes with one slot per character of RegexData.categories.
// NOTE(review): nothing visible in this part of the file fills the slots; presumably they are
// populated elsewhere (possibly lazily) - confirm before relying on non-null entries.
public static final CharClass[] categoryCharClasses = new CharClass[RegexData.categories.length()];
// Table of character classes with one slot per TWO characters of RegexData.subCategories
// (presumably because each sub-category code is two letters long - confirm against RegexData).
public static final CharClass[] subCategoryCharClasses = new CharClass[RegexData.subCategories.length() / 2];
/**
* CharClass for each block name in specialBlockNames.
* The table holds nine plain code point ranges, one union of three ranges, and three
* empty classes. The block names given in the trailing comments are the Unicode block
* names for those code point ranges; that they line up one-to-one with the order of
* specialBlockNames is assumed from the sentence above - confirm against that array.
*/
public static final CharClass[] specialBlockCharClasses = {
new CharRange(0x10300, 0x1032F), // Old Italic
new CharRange(0x10330, 0x1034F), // Gothic
new CharRange(0x10400, 0x1044F), // Deseret
new CharRange(0x1D000, 0x1D0FF), // Byzantine Musical Symbols
new CharRange(0x1D100, 0x1D1FF), // Musical Symbols
new CharRange(0x1D400, 0x1D7FF), // Mathematical Alphanumeric Symbols
new CharRange(0x20000, 0x2A6D6), // CJK Unified Ideographs Extension B
new CharRange(0x2F800, 0x2FA1F), // CJK Compatibility Ideographs Supplement
new CharRange(0xE0000, 0xE007F), // Tags
// Private use: the BMP private use area plus the two supplementary private use planes
new Union(new CharClass[]{
new CharRange(0xE000, 0xF8FF),
new CharRange(0xF0000, 0xFFFFD),
new CharRange(0x100000, 0x10FFFD)
}),
// Three blocks that match no character at all (presumably the surrogate blocks - confirm)
Empty.getInstance(),
Empty.getInstance(),
Empty.getInstance()
};
// Character class emitted for "." in XML Schema mode: every character except \n and \r.
private static final CharClass DOT_SCHEMA =
new Complement(new Union(new CharClass[]{new SingleChar('\n'), new SingleChar('\r')}));
// Unicode category Nd and its complement (presumably backing the \d and \D escapes -
// the code that consumes these constants is not visible here).
private static final CharClass ESC_d = new Property("Nd");
private static final CharClass ESC_D = new Complement(ESC_d);
// Union of the classes computed for the categories P, Z and C; ESC_w below is its complement.
private static final CharClass ESC_W = new Union(new CharClass[]{computeCategoryCharClass('P'),
computeCategoryCharClass('Z'),
computeCategoryCharClass('C')});
//was: new Property("P"), new Property("Z"), new Property("C") }
private static final CharClass ESC_w = new Complement(ESC_W);
// Space, line feed, carriage return and tab; ESC_S is the complement of that set.
private static final CharClass ESC_s = new Union(new CharClass[]{
new SingleChar(' '),
new SingleChar('\n'),
new SingleChar('\r'),
new SingleChar('\t')
});
private static final CharClass ESC_S = new Complement(ESC_s);
// Earlier definition of ESC_i built from the RegexData tables, kept for reference:
// private static final CharClass ESC_i = makeCharClass(RegexData.NMSTRT_CATEGORIES,
// RegexData.NMSTRT_INCLUDES,
// RegexData.NMSTRT_EXCLUDE_RANGES);
// Name-start (ESC_i_*) and name (ESC_c_*) character classes built from the XMLCharacterData
// masks, each with its complement (ESC_I_*, ESC_C_*). The _10 / _11 suffixes follow the mask
// names and presumably select the XML 1.0 and XML 1.1 rules respectively.
private static final CharClass ESC_i_10 = makeNameCharClass(XMLCharacterData.NAME_START_10_MASK);
private static final CharClass ESC_i_11 = makeNameCharClass(XMLCharacterData.NAME_START_11_MASK);
private static final CharClass ESC_I_10 = new Complement(ESC_i_10);
private static final CharClass ESC_I_11 = new Complement(ESC_i_11);
private static final CharClass ESC_c_10 = makeNameCharClass(XMLCharacterData.NAME_10_MASK);
private static final CharClass ESC_c_11 = makeNameCharClass(XMLCharacterData.NAME_11_MASK);
private static final CharClass ESC_C_10 = new Complement(ESC_c_10);
private static final CharClass ESC_C_11 = new Complement(ESC_c_11);
// Earlier definitions of ESC_I, ESC_c and ESC_C built from the RegexData tables, kept for reference:
// private static final CharClass ESC_I = new Complement(ESC_i);
// private static final CharClass ESC_c = makeCharClass(RegexData.NMCHAR_CATEGORIES,
// RegexData.NMCHAR_INCLUDES,
// RegexData.NMCHAR_EXCLUDE_RANGES);
//
// private static final CharClass ESC_C = new Complement(ESC_c);
/**
* Private constructor with no work of its own. In the visible part of this file the only
* instantiation is inside the static translate() method, which creates a fresh translator
* for every call and fills in its fields directly.
*/
private JDK15RegexTranslator() {
}
/**
 * Translates a regular expression in the syntax of XML Schema Part 2 into a regular
 * expression in the syntax of {@link java.util.regex.Pattern}.
 *
 * <p>The translation assumes that the string to be matched against the regex uses surrogate
 * pairs correctly. If the string comes from XML content, a conforming XML parser will
 * automatically check this; if the string comes from elsewhere, it may be necessary to
 * check surrogate usage before matching.</p>
 *
 * <p>A new translator instance is created for every call, so no state is carried over
 * from one translation to the next.</p>
 *
 * @param regExp a regular expression in the syntax of XML Schema Part 2
 * @param xmlVersion set to {@link net.sf.saxon.Configuration#XML10} for XML 1.0
 * or {@link net.sf.saxon.Configuration#XML11} for XML 1.1
 * @param xpath true if the XPath 2.0 F+O extensions to the schema regex syntax are permitted
 * @param ignoreWhitespace true if whitespace is to be ignored ('x' flag)
 * @param caseBlind true if case is to be ignored ('i' flag)
 * @return a JDK 1.5 regular expression
 * @throws RegexSyntaxException if {@code regExp} is not a regular expression in the
 * syntax of XML Schema Part 2, or XPath 2.0, as appropriate
 * @see java.util.regex.Pattern
 */
public static String translate(CharSequence regExp,
        int xmlVersion, boolean xpath, boolean ignoreWhitespace, boolean caseBlind)
        throws RegexSyntaxException {
    final JDK15RegexTranslator translator = new JDK15RegexTranslator();

    // Hand over the input text and its length.
    translator.regExp = regExp;
    translator.length = regExp.length();

    // Option flags; these are independent of one another.
    translator.caseBlind = caseBlind;
    translator.ignoreWhitespace = ignoreWhitespace;
    translator.isXPath = xpath;
    translator.xmlVersion = xmlVersion;

    // advance() is invoked once before the top-level translation starts
    // (presumably to load the first input character into the scanner).
    translator.advance();
    translator.translateTop();

    return translator.result.toString();
}
/**
 * Common base of all character-class nodes produced by the translator. Each node can
 * write itself, or its complement, to a buffer holding the translated regex.
 */
static abstract class CharClass {

    protected CharClass() {
    }

    /**
     * Get the single character represented by this class, if there is one.
     * @return -1 in this base implementation; subclasses standing for exactly one
     * character override this to return that character
     */
    int getSingleChar() {
        return -1;
    }

    /**
     * Write the regex text for this character class.
     * @param buf the buffer to append to
     */
    abstract void output(FastStringBuffer buf);

    /**
     * Write the regex text for the complement of this character class.
     * @param buf the buffer to append to
     */
    abstract void outputComplement(FastStringBuffer buf);
}
/**
 * A character class whose content can be written inside a bracket expression.
 * Subclasses supply the bracket content; this class adds the surrounding
 * {@code [...]} or {@code [^...]}.
 */
static abstract class SimpleCharClass extends CharClass {

    SimpleCharClass() {
    }

    /**
     * Write the content of this class as it appears between square brackets.
     * @param buf the buffer to append to
     */
    abstract void inClassOutput(FastStringBuffer buf);

    /** Writes {@code [content]}. */
    void output(FastStringBuffer buf) {
        writeBracketed(buf, "[");
    }

    /** Writes {@code [^content]}. */
    void outputComplement(FastStringBuffer buf) {
        writeBracketed(buf, "[^");
    }

    /** Writes the given opener, the in-class content, and the closing bracket. */
    private void writeBracketed(FastStringBuffer buf, String opener) {
        buf.append(opener);
        inClassOutput(buf);
        buf.append(']');
    }
}
/**
 * A character class consisting of exactly one character. Outside a bracket expression
 * the character is written without surrounding brackets.
 */
static class SingleChar extends SimpleCharClass {

    private final int value;

    SingleChar(int c) {
        this.value = c;
    }

    /** @return the character this class stands for */
    int getSingleChar() {
        return value;
    }

    /** Writes the (escaped) character on its own, with no brackets. */
    void output(FastStringBuffer buf) {
        inClassOutput(buf);
    }

    /**
     * Writes the character, escaped where necessary: Java metacharacters get a
     * backslash prefix, and CR, LF, tab and space are written as {@code \r}, {@code \n},
     * {@code \t} and {@code \x20}. Anything else is appended as a (possibly wide) character.
     */
    void inClassOutput(FastStringBuffer buf) {
        // The metacharacter test comes first, exactly as before.
        if (isJavaMetaChar(value)) {
            buf.append('\\');
            buf.append((char) value);
            return;
        }
        if (value == '\r') {
            buf.append("\\r");
        } else if (value == '\n') {
            buf.append("\\n");
        } else if (value == '\t') {
            buf.append("\\t");
        } else if (value == ' ') {
            buf.append("\\x20");
        } else {
            buf.appendWideChar(value);
        }
    }
}
/**
 * The character class that matches no character. It is a singleton, obtained through
 * {@link #getInstance()}.
 *
 * <p>The class is written as {@code \x00}, which this translator uses as its
 * "no character matches" token, and its complement as {@code [^\x00]}.</p>
 */
static class Empty extends SimpleCharClass {

    private static final Empty instance = new Empty();

    private Empty() {
    }

    /** @return the single shared instance */
    static Empty getInstance() {
        return instance;
    }

    void output(FastStringBuffer buf) {
        buf.append("\\x00"); // no character matches
    }

    void outputComplement(FastStringBuffer buf) {
        buf.append("[^\\x00]"); // every character matches
    }

    /**
     * Writes the in-bracket form of the empty class.
     *
     * <p>This used to throw a RuntimeException. Union.outputComplement() calls
     * inClassOutput() on every SimpleCharClass member without checking for Empty, so a
     * complemented union with an Empty member (Empty instances appear in
     * specialBlockCharClasses) crashed the translation. The same {@code \x00} token that
     * {@link #output} writes is now used here, so the in-bracket form agrees with
     * the stand-alone form.</p>
     */
    void inClassOutput(FastStringBuffer buf) {
        buf.append("\\x00"); // same "matches nothing" token as output()
    }
}
/**
 * A character class covering a range of characters, written as {@code lower-upper}
 * inside a bracket expression.
 */
static class CharRange extends SimpleCharClass {

    private final int lower;
    private final int upper;

    CharRange(int lower, int upper) {
        this.lower = lower;
        this.upper = upper;
    }

    /** Writes the two end points separated by a hyphen, escaping each if required. */
    void inClassOutput(FastStringBuffer buf) {
        writeEndPoint(buf, lower);
        buf.append('-');
        writeEndPoint(buf, upper);
    }

    /** Appends one end point, preceded by a backslash when it is a Java metacharacter. */
    private static void writeEndPoint(FastStringBuffer buf, int ch) {
        if (isJavaMetaChar(ch)) {
            buf.append('\\');
        }
        buf.appendWideChar(ch);
    }
}
/**
 * A character class identified by a property name, written as {@code \p{name}}.
 * Its complement is written directly as {@code \P{name}} with no enclosing brackets.
 */
static class Property extends SimpleCharClass {

    private final String name;

    Property(String name) {
        this.name = name;
    }

    /** Writes {@code \p{name}}. */
    void inClassOutput(FastStringBuffer buf) {
        writeEscape(buf, 'p');
    }

    /** Writes {@code \P{name}}. */
    void outputComplement(FastStringBuffer buf) {
        writeEscape(buf, 'P');
    }

    /** Writes a backslash, the given selector letter, and the braced property name. */
    private void writeEscape(FastStringBuffer buf, char selector) {
        buf.append('\\');
        buf.append(selector);
        buf.append('{');
        buf.append(name);
        buf.append('}');
    }
}
/**
 * The difference of two character classes: everything in the first class that is not
 * in the second.
 */
static class Subtraction extends CharClass {

    private final CharClass base;
    private final CharClass removed;

    /**
     * @param cc1 the class to subtract from
     * @param cc2 the class whose members are removed
     */
    Subtraction(CharClass cc1, CharClass cc2) {
        this.base = cc1;
        this.removed = cc2;
    }

    /** Writes {@code [cc1&&not-cc2]}: the first class intersected with the complement of the second. */
    void output(FastStringBuffer buf) {
        buf.append('[');
        base.output(buf);
        buf.append("&&");
        removed.outputComplement(buf);
        buf.append(']');
    }

    /** Writes {@code [not-cc1 cc2]}: the complement of the first class followed by the second, in one bracket. */
    void outputComplement(FastStringBuffer buf) {
        buf.append('[');
        base.outputComplement(buf);
        removed.output(buf);
        buf.append(']');
    }
}
/**
 * The union of a list of character classes.
 */
static class Union extends CharClass {

    private final List members;

    /**
     * @param v the member classes, copied into a new list
     */
    Union(CharClass[] v) {
        this(toList(v));
    }

    /**
     * @param members the member classes; the list is stored as supplied, not copied
     */
    Union(List members) {
        this.members = members;
    }

    /** Copies the array into a freshly allocated list. */
    private static List toList(CharClass[] v) {
        List copy = new ArrayList(5);
        for (int k = 0; k < v.length; k++) {
            copy.add(v[k]);
        }
        return copy;
    }

    /** Writes the output of every member, one after another, inside one pair of brackets. */
    void output(FastStringBuffer buf) {
        buf.append('[');
        final int count = members.size();
        for (int k = 0; k < count; k++) {
            ((CharClass) members.get(k)).output(buf);
        }
        buf.append(']');
    }

    /**
     * Writes the complement of the union in two passes. The simple members are written
     * first, together inside one negated bracket ({@code [^...}); each remaining member
     * then contributes its own complement, joined with {@code &&}. If there are no
     * members at all, a class covering U+0001 up to UTF16.NONBMP_MAX is written.
     */
    void outputComplement(FastStringBuffer buf) {
        final int count = members.size();
        boolean nothingWritten = true;

        // Pass 1: simple members share a single negated bracket.
        for (int k = 0; k < count; k++) {
            CharClass member = (CharClass) members.get(k);
            if (!(member instanceof SimpleCharClass)) {
                continue;
            }
            if (nothingWritten) {
                buf.append("[^");
                nothingWritten = false;
            }
            ((SimpleCharClass) member).inClassOutput(buf);
        }

        // Pass 2: every other member is intersected in via its own complement.
        for (int k = 0; k < count; k++) {
            CharClass member = (CharClass) members.get(k);
            if (member instanceof SimpleCharClass) {
                continue;
            }
            if (nothingWritten) {
                buf.append('[');
                nothingWritten = false;
            } else {
                buf.append("&&");
            }
            member.outputComplement(buf);
        }

        if (nothingWritten) {
            // empty union, so the complement is everything
            buf.append("[\u0001-");
            buf.appendWideChar(UTF16.NONBMP_MAX);
        }
        buf.append(']');
    }
}
/**
 * A back-reference to a captured group. The same text is written for the class and
 * for its complement.
 */
static class BackReference extends CharClass {

    private final int groupNumber;

    /**
     * @param i the number of the group referred to, or -1 to produce an empty group
     */
    BackReference(int i) {
        this.groupNumber = i;
    }

    void output(FastStringBuffer buf) {
        inClassOutput(buf);
    }

    void outputComplement(FastStringBuffer buf) {
        inClassOutput(buf);
    }

    /**
     * Writes {@code (?:\n)} for group number n, or {@code (?:)} when the number is -1.
     */
    void inClassOutput(FastStringBuffer buf) {
        if (groupNumber == -1) {
            // matches a zero-length string, while allowing a quantifier
            buf.append("(?:)");
            return;
        }
        // The non-capturing group terminates the back-reference with a syntactic separator.
        buf.append("(?:\\");
        buf.append(Integer.toString(groupNumber));
        buf.append(')');
    }
}
/**
 * The complement of another character class. It simply swaps the two output
 * operations of the wrapped class.
 */
static class Complement extends CharClass {

    private final CharClass inner;

    /**
     * @param cc the class to be complemented
     */
    Complement(CharClass cc) {
        this.inner = cc;
    }

    /** Writes the complement of the wrapped class. */
    void output(FastStringBuffer buf) {
        inner.outputComplement(buf);
    }

    /** Writes the wrapped class itself, since complementing twice cancels out. */
    void outputComplement(FastStringBuffer buf) {
        inner.output(buf);
    }
}
protected boolean translateAtom() throws RegexSyntaxException {
switch (curChar) {
case RegexData.EOS:
if (!eos)
break;
// else fall through
case '?':
case '*':
case '+':
case ')':
case '{':
case '}':
case '|':
case ']':
return false;
case '(':
copyCurChar();
int thisCapture = ++currentCapture;
translateRegExp();
expect(')');
captures.add(thisCapture);
copyCurChar();
return true;
case '\\':
advance();
parseEsc().output(result);
return true;
case '[':
inCharClassExpr = true;
advance();
parseCharClassExpr().output(result);
return true;
case '.':
if (isXPath) {
// under XPath, "." has the same meaning as in JDK 1.5
break;
} else {
// under XMLSchema, "." means anything except \n or \r, which is different from the XPath/JDK rule
DOT_SCHEMA.output(result);
advance();
return true;
}
case '$':
case '^':
if (isXPath) {
copyCurChar();
return true;
}
result.append('\\');
break;
default:
if (caseBlind) {
int thisChar = absorbSurrogatePair();
int[] variants = CaseVariants.getCaseVariants(thisChar);
if (variants.length > 0) {
CharClass[] chars = new CharClass[variants.length+1];
chars[0] = new SingleChar(thisChar);
for (int i=0; inew URL(baseURL, relativeURL)
.
*
* Spaces in either URI are converted to %20
* *If no base URI is available, and the relative URI is not an absolute URI, then the current * directory is used as a base URI.
* * @param relativeURI the relative URI. Null is permitted provided that the base URI is an absolute URI * @param base the base URI * @return the absolutized URI * @throws java.net.URISyntaxException if either of the strings is not a valid URI or * if the resolution fails */ public URI makeAbsolute(String relativeURI, String base) throws URISyntaxException { URI absoluteURI; // System.err.println("makeAbsolute " + relativeURI + " against base " + base); if (relativeURI == null) { absoluteURI = new URI(ResolveURI.escapeSpaces(base)); if (!absoluteURI.isAbsolute()) { throw new URISyntaxException(base, "Relative URI not supplied, so base URI must be absolute"); } else { return absoluteURI; } } relativeURI = ResolveURI.escapeSpaces(relativeURI); base = ResolveURI.escapeSpaces(base); try { if (base==null || base.length() == 0) { absoluteURI = new URI(relativeURI); if (!absoluteURI.isAbsolute()) { String expandedBase = ResolveURI.tryToExpand(base); if (!expandedBase.equals(base)) { // prevent infinite recursion return makeAbsolute(relativeURI, expandedBase); } } } else if (base != null && (base.startsWith("jar:") || base.startsWith("file:////"))) { // jar: URIs can't be resolved by the java.net.URI class, because they don't actually // conform with the RFC standards for hierarchic URI schemes (quite apart from not being // a registered URI scheme). But they seem to be widely used. // URIs starting file://// are accepted by the java.net.URI class, they are used to // represent Windows UNC filenames. However, the java.net.URI algorithm for resolving // a relative URI against such a base URI fails to produce a usable UNC filename (it's not // clear whether Java is implementing RFC 3986 correctly here, it depends on interpretation). // So we use the java.net.URL algorithm for this case too, because it works. 
try { URL baseURL = new URL(base); URL absoluteURL = new URL(baseURL, relativeURI); absoluteURI = new URI(absoluteURL.toString()); // TODO: JDK1.5: use absoluteURL.toURI() } catch (MalformedURLException err) { throw new URISyntaxException(base + " " + relativeURI, err.getMessage()); } } else { URI baseURI; try { baseURI = new URI(base); } catch (URISyntaxException e) { throw new URISyntaxException(base, "Invalid base URI: " + e.getMessage()); } try { new URI(relativeURI); // for validation only } catch (URISyntaxException e) { throw new URISyntaxException(base, "Invalid relative URI: " + e.getMessage()); } absoluteURI = (relativeURI.length()==0 ? baseURI : baseURI.resolve(relativeURI) ); } } catch (IllegalArgumentException err0) { // can be thrown by resolve() when given a bad URI throw new URISyntaxException(relativeURI, "Cannot resolve URI against base " + Err.wrap(base)); } return absoluteURI; } /** * Get the platform version */ public String getPlatformVersion() { return "Java version " + System.getProperty("java.version"); } /** * Get a suffix letter to add to the Saxon version number to identify the platform */ public String getPlatformSuffix() { return "J"; } /** * Convert a StreamSource to either a SAXSource or a PullSource, depending on the native * parser of the selected platform * * @param pipe the pipeline configuration * @param input the supplied StreamSource * @param validation indicates whether schema validation is required * @param dtdValidation indicates whether DTD validation is required * @param stripspace indicates whether whitespace text nodes should be stripped * @return the PullSource or SAXSource, initialized with a suitable parser, or the original * input Source, if now special handling is required or possible. This implementation * always returns the original input unchanged. 
*/ public Source getParserSource(PipelineConfiguration pipe, StreamSource input, int validation, boolean dtdValidation, int stripspace) { return input; } /** * Create a compiled regular expression * @param regex the source text of the regular expression, in XML Schema or XPath syntax * @param xmlVersion set to integer 10 for XML 1.0, 11 for XML 1.1 * @param syntax requests XPath or XML Schema regex syntax *@param flags the flags argument as supplied to functions such as fn:matches(), in string form @throws XPathException if the syntax of the regular expression or flags is incorrect @return the compiled regular expression */ public RegularExpression compileRegularExpression( CharSequence regex, int xmlVersion, int syntax, CharSequence flags) throws XPathException { int flagBits = JRegularExpression.setFlags(flags); return new JRegularExpression(regex, xmlVersion, syntax, flagBits); } /** * Obtain a collation with a given set of properties. The set of properties is extensible * and variable across platforms. Common properties with example values include lang=ed-GB, * strength=primary, case-order=upper-first, ignore-modifiers=yes, alphanumeric=yes. * Properties that are not supported are generally ignored; however some errors, such as * failing to load a requested class, are fatal. * @param config the configuration object * @param props the desired properties of the collation * @param uri the collation URI * @return a collation with these properties * @throws XPathException if a fatal error occurs */ public StringCollator makeCollation(Configuration config, Properties props, String uri) throws XPathException { return JavaCollationFactory.makeCollation(config, uri, props); } /** * Given a collation, determine whether it is capable of returning collation keys. * The essential property of collation keys * is that if two values are equal under the collation, then the collation keys are * equal under the equals() method. 
* * @param collation the collation, provided as a Comparator * @return true if this collation can supply collation keys */ public boolean canReturnCollationKeys(StringCollator collation) { return (collation instanceof CodepointCollator) || ((collation instanceof NamedCollation) && (((NamedCollation)collation).getCollation() instanceof Collator)); } /** * Given a collation, get a collation key. The essential property of collation keys * is that if two values are equal under the collation, then the collation keys are * compare correctly under the equals() method. * * @throws ClassCastException if the collation is not one that is capable of supplying * collation keys (this should have been checked in advance) */ public Object getCollationKey(NamedCollation namedCollation, String value) { return ((Collator)namedCollation.getCollation()).getCollationKey(value); } /** * Make the default extension function factory appropriate to the platform */ public void makeExtensionLibrary(Configuration config) { config.setExtensionBinder("java", new JavaExtensionLibrary(config)); } /** * Add the platform-specific function libraries to a function library list. 
The libraries * that are added are those registered with the Configuration using * {@link Configuration#setExtensionBinder(String, net.sf.saxon.functions.FunctionLibrary)} * @param list the function library list that is to be extended * @param config the Configuration * @param hostLanguage the host language, for example Configuration.XQUERY */ public void addFunctionLibraries(FunctionLibraryList list, Configuration config, int hostLanguage) { FunctionLibrary extensionBinder = config.getExtensionBinder("java"); if (extensionBinder instanceof JavaExtensionLibrary) { ((JavaExtensionLibrary)extensionBinder).setStrictJavaUriFormat( hostLanguage != Configuration.XSLT ); } list.addFunctionLibrary(extensionBinder); } /** * Register a namespace-to-Java-class mapping declared using saxon:script in an XSLT stylesheet * @param library the library to contain the function, which must be a JavaExtensionLibrary * @param uri the namespace of the function name * @param theClass the Java class that implements this namespace */ public void declareJavaClass(FunctionLibrary library, String uri, Class theClass) { if (library instanceof JavaExtensionLibrary) { ((JavaExtensionLibrary)library).declareJavaClass(uri, theClass); } else { throw new IllegalStateException("saxon:script cannot be used with a custom extension library factory"); } } public SchemaType getExternalObjectType(Configuration config, String uri, String localName) { throw new UnsupportedOperationException("getExternalObjectType for Java"); } } // // The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License"); // you may not use this file except in compliance with the License. You may obtain a copy of the // License at http://www.mozilla.org/MPL/ // // Software distributed under the License is distributed on an "AS IS" basis, // WITHOUT WARRANTY OF ANY KIND, either express or implied. // See the License for the specific language governing rights and limitations under the License. 
// // The Original Code is: all this file. // // The Initial Developer of the Original Code is Michael H. Kay. // // Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved. // // Contributor(s): none. // saxonb-9.1.0.8/bj/net/sf/saxon/java/JRegexIterator.java 0000644 0001750 0001750 00000016334 11033112257 022070 0 ustar eugene eugene package net.sf.saxon.java; import net.sf.saxon.om.ArrayIterator; import net.sf.saxon.om.Item; import net.sf.saxon.om.SequenceIterator; import net.sf.saxon.om.EmptyIterator; import net.sf.saxon.value.StringValue; import net.sf.saxon.regex.RegexIterator; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Class JRegexIterator - provides an iterator over matched and unmatched substrings. * This implementation of RegexIterator uses the JDK regular expression engine. */ public class JRegexIterator implements RegexIterator { private String theString; // the input string being matched private Pattern pattern; // the regex against which the string is matched private Matcher matcher; // the Matcher object that does the matching, and holds the state private String current; // the string most recently returned by the iterator private String next; // if the last string was a matching string, null; otherwise the next substring // matched by the regex private int position = 0; // the value of XPath position() private int prevEnd = 0; // the position in the input string of the end of the last match or non-match /** * Construct a RegexIterator. Note that the underlying matcher.find() method is called once * to obtain each matching substring. But the iterator also returns non-matching substrings * if these appear between the matching substrings. 
* @param string the string to be analysed * @param pattern the regular expression */ public JRegexIterator (String string, Pattern pattern) { theString = string; this.pattern = pattern; matcher = pattern.matcher(string); next = null; } /** * Get the next item in the sequence * @return the next item in the sequence */ public Item next() { if (next == null && prevEnd >= 0) { // we've returned a match (or we're at the start), so find the next match if (matcher.find()) { int start = matcher.start(); int end = matcher.end(); if (prevEnd == start) { // there's no intervening non-matching string to return next = null; current = theString.substring(start, end); prevEnd = end; } else { // return the non-matching substring first current = theString.substring(prevEnd, start); next = theString.substring(start, end); } } else { // there are no more regex matches, we must return the final non-matching text if any if (prevEnd < theString.length()) { current = theString.substring(prevEnd); next = null; } else { // this really is the end... 
current = null; position = -1; prevEnd = -1; return null; } prevEnd = -1; } } else { // we've returned a non-match, so now return the match that follows it, if there is one if (prevEnd >= 0) { current = next; next = null; prevEnd = matcher.end(); } else { current = null; position = -1; return null; } } position++; return StringValue.makeStringValue(current); } /** * Get the current item in the sequence * @return the item most recently returned by next() */ public Item current() { return StringValue.makeStringValue(current); } /** * Get the position of the current item in the sequence * @return the position of the item most recently returned by next(), starting at 1 */ public int position() { return position; } public void close() { } /** * Get another iterator over the same items * @return a new iterator, positioned before the first item */ public SequenceIterator getAnother() { return new JRegexIterator(theString, pattern); } /** * Get properties of this iterator, as a bit-significant integer. * * @return the properties of this iterator. This will be some combination of * properties such as {@link #GROUNDED}, {@link #LAST_POSITION_FINDER}, * and {@link #LOOKAHEAD}. It is always * acceptable to return the value zero, indicating that there are no known special properties. * It is acceptable for the properties of the iterator to change depending on its state. 
*/ public int getProperties() { return 0; } /** * Determine whether the current item is a matching item or a non-matching item * @return true if the current item (the one most recently returned by next()) is * an item that matches the regular expression, or false if it is an item that * does not match */ public boolean isMatching() { return next == null && prevEnd >= 0; } /** * Get a substring that matches a parenthesised group within the regular expression * @param number the number of the group to be obtained * @return the substring of the current item that matches the n'th parenthesized group * within the regular expression */ public String getRegexGroup(int number) { if (!isMatching()) return null; if (number > matcher.groupCount() || number < 0) return ""; String s = matcher.group(number); if (s==null) return ""; return s; } /** * Get a sequence containing all the regex groups (except group 0, because we want to use indexing from 1). * This is used by the saxon:analyze-string() higher-order extension function. */ public SequenceIterator getRegexGroupIterator() { int c = matcher.groupCount(); if (c == 0) { return EmptyIterator.getInstance(); } else { StringValue[] groups = new StringValue[c]; for (int i=1; i<=groups.length; i++) { groups[i-1] = StringValue.makeStringValue(matcher.group(i)); } return new ArrayIterator(groups); } } } // // The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License"); // you may not use this file except in compliance with the License. You may obtain a copy of the // License at http://www.mozilla.org/MPL/ // // Software distributed under the License is distributed on an "AS IS" basis, // WITHOUT WARRANTY OF ANY KIND, either express or implied. // See the License for the specific language governing rights and limitations under the License. // // The Original Code is: all this file. // // The Initial Developer of the Original Code is Michael H. 
Kay // // Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved. // // Contributor(s): // saxonb-9.1.0.8/bj/net/sf/saxon/java/JDK14RegexTranslator.java 0000644 0001750 0001750 00000104556 11033112257 023020 0 ustar eugene eugene package net.sf.saxon.java; import net.sf.saxon.om.FastStringBuffer; import net.sf.saxon.value.Whitespace; import net.sf.saxon.value.StringValue; import net.sf.saxon.regex.SurrogateRegexTranslator; import net.sf.saxon.regex.RegexData; import net.sf.saxon.regex.RegexSyntaxException; import net.sf.saxon.charcode.UTF16; import net.sf.saxon.charcode.XMLCharacterData; import net.sf.saxon.sort.IntRangeSet; import net.sf.saxon.Configuration; import java.util.ArrayList; import java.util.Iterator; import java.util.List; /** * This class translates XML Schema regex syntax into JDK 1.4 regex syntax. * Author: James Clark, Thai Open Source Software Center Ltd. See statement at end of file. * Modified by Michael Kay (a) to integrate the code into Saxon, and (b) to support XPath additions * to the XML Schema regex syntax. * * This version of the regular expression translator treats each half of a surrogate pair as a separate * character, translating anything in an XPath regex that can match a non-BMP character into a Java * regex that matches the two halves of a surrogate pair independently. This approach doesn't work * under JDK 1.5, whose regex engine treats a surrogate pair as a single character. * */ public class JDK14RegexTranslator extends SurrogateRegexTranslator { /** * Translates XML Schema regexes intojava.util.regex
regexes.
*
* @see java.util.regex.Pattern
* @see XML Schema Part 2
*/
/**
* CharClass for each block name in specialBlockNames.
* The table holds nine plain code point ranges, one union of three ranges, and three
* empty classes. The block names given in the trailing comments are the Unicode block
* names for those code point ranges; that they line up one-to-one with the order of
* specialBlockNames is assumed from the sentence above - confirm against that array.
*/
private static final CharClass[] specialBlockCharClasses = {
new CharRange(0x10300, 0x1032F), // Old Italic
new CharRange(0x10330, 0x1034F), // Gothic
new CharRange(0x10400, 0x1044F), // Deseret
new CharRange(0x1D000, 0x1D0FF), // Byzantine Musical Symbols
new CharRange(0x1D100, 0x1D1FF), // Musical Symbols
new CharRange(0x1D400, 0x1D7FF), // Mathematical Alphanumeric Symbols
new CharRange(0x20000, 0x2A6D6), // CJK Unified Ideographs Extension B
new CharRange(0x2F800, 0x2FA1F), // CJK Compatibility Ideographs Supplement
new CharRange(0xE0000, 0xE007F), // Tags
// Private use: the BMP private use area plus the two supplementary private use planes
new Union(new CharClass[]{
new CharRange(0xE000, 0xF8FF),
new CharRange(0xF0000, 0xFFFFD),
new CharRange(0x100000, 0x10FFFD)
}),
// Three blocks that match no character at all (presumably the surrogate blocks - confirm)
Empty.getInstance(),
Empty.getInstance(),
Empty.getInstance()
};
// Character class built for "." in XML Schema mode: every character except \n and \r.
private static final CharClass DOT_SCHEMA =
new Complement(new Union(new CharClass[]{new SingleChar('\n'), new SingleChar('\r')}));
// Character class built for "." in XPath mode; its behaviour is defined by the Dot class,
// which is not visible in this part of the file.
private static final CharClass DOT_XPATH =
new Dot();
// Unicode category Nd and its complement (presumably backing the \d and \D escapes -
// the code that consumes these constants is not visible here).
private static final CharClass ESC_d = new Property("Nd");
private static final CharClass ESC_D = new Complement(ESC_d);
// Union of the classes computed for the categories P, Z and C; ESC_w below is its complement.
private static final CharClass ESC_W = new Union(new CharClass[]{computeCategoryCharClass('P'),
computeCategoryCharClass('Z'),
computeCategoryCharClass('C')});
//was: new Property("P"), new Property("Z"), new Property("C") }
private static final CharClass ESC_w = new Complement(ESC_W);
// Space, line feed, carriage return and tab; ESC_S is the complement of that set.
private static final CharClass ESC_s = new Union(new CharClass[]{
new SingleChar(' '),
new SingleChar('\n'),
new SingleChar('\r'),
new SingleChar('\t')
});
private static final CharClass ESC_S = new Complement(ESC_s);
// Earlier definitions built from the RegexData tables, kept for reference:
// private static final CharClass ESC_i = makeCharClass(RegexData.NMSTRT_CATEGORIES,
// RegexData.NMSTRT_INCLUDES,
// RegexData.NMSTRT_EXCLUDE_RANGES);
//
// private static final CharClass ESC_I = new Complement(ESC_i);
//
// private static final CharClass ESC_c = makeCharClass(RegexData.NMCHAR_CATEGORIES,
// RegexData.NMCHAR_INCLUDES,
// RegexData.NMCHAR_EXCLUDE_RANGES);
//
// private static final CharClass ESC_C = new Complement(ESC_c);
// Name-start (ESC_i_*) and name (ESC_c_*) character classes built from the XMLCharacterData
// masks, each with its complement (ESC_I_*, ESC_C_*). The _10 / _11 suffixes follow the mask
// names and presumably select the XML 1.0 and XML 1.1 rules respectively.
private static final CharClass ESC_i_10 = makeNameCharClass(XMLCharacterData.NAME_START_10_MASK);
private static final CharClass ESC_i_11 = makeNameCharClass(XMLCharacterData.NAME_START_11_MASK);
private static final CharClass ESC_I_10 = new Complement(ESC_i_10);
private static final CharClass ESC_I_11 = new Complement(ESC_i_11);
private static final CharClass ESC_c_10 = makeNameCharClass(XMLCharacterData.NAME_10_MASK);
private static final CharClass ESC_c_11 = makeNameCharClass(XMLCharacterData.NAME_11_MASK);
private static final CharClass ESC_C_10 = new Complement(ESC_c_10);
private static final CharClass ESC_C_11 = new Complement(ESC_c_11);
/**
* Create a regex translator for JDK 1.4. The constructor does no work of its own;
* options are supplied afterwards through setIgnoreWhitespace() and the arguments
* of translate().
*/
public JDK14RegexTranslator() {
}
/**
 * Set the option controlling whether whitespace is ignored.
 * The value is only recorded here; it is consulted elsewhere during translation.
 * @param ignore true if whitespace should be ignored
 */
public void setIgnoreWhitespace(boolean ignore) {
    this.ignoreWhitespace = ignore;
}
/**
 * Translates a regular expression in the syntax of XML Schema Part 2 into a regular
 * expression in the syntax of {@link java.util.regex.Pattern}.
 *
 * <p>The translation assumes that the string to be matched against the regex uses surrogate
 * pairs correctly. If the string comes from XML content, a conforming XML parser will
 * automatically check this; if the string comes from elsewhere, it may be necessary to
 * check surrogate usage before matching.</p>
 *
 * <p>NOTE(review): this method stores its arguments in fields of the translator and does
 * not visibly reset the result buffer, so reusing one instance for several translations
 * may not be safe - confirm against the superclass before doing so.</p>
 *
 * @param regExp a regular expression in the syntax of XML Schema Part 2
 * @param xmlVersion integer constant indicating XML 1.0 or XML 1.1
 * @param xpath true if the XPath 2.0 F+O extensions to the schema regex syntax are permitted
 * @return a regular expression in the syntax of java.util.regex.Pattern
 * @throws net.sf.saxon.regex.RegexSyntaxException if {@code regExp} is not a regular
 * expression in the syntax of XML Schema Part 2, or XPath 2.0, as appropriate
 * @see java.util.regex.Pattern
 */
public String translate(CharSequence regExp, int xmlVersion, boolean xpath) throws RegexSyntaxException {
    // Record the options, then the input text and its length.
    this.isXPath = xpath;
    this.xmlVersion = xmlVersion;
    this.regExp = regExp;
    this.length = regExp.length();

    // advance() is invoked once before the top-level translation starts
    // (presumably to load the first input character into the scanner).
    advance();
    translateTop();

    return result.toString();
}
/**
 * The difference of two character classes: everything in the first class that is not
 * in the second. BMP content is written as a bracket expression using {@code &&};
 * non-BMP content is computed explicitly as a list of ranges.
 */
static class Subtraction extends CharClass {

    private final CharClass cc1;
    private final CharClass cc2;

    /**
     * @param cc1 the class to subtract from
     * @param cc2 the class whose members are removed
     */
    Subtraction(CharClass cc1, CharClass cc2) {
        // min corresponds to intersection
        // complement corresponds to negation
        super(Math.min(cc1.getContainsBmp(), -cc2.getContainsBmp()),
                Math.min(cc1.getContainsNonBmp(), -cc2.getContainsNonBmp()));
        this.cc1 = cc1;
        this.cc2 = cc2;
    }

    /**
     * Writes {@code [cc1&&not-cc2]} for the BMP part. (A lookahead-based form,
     * {@code (?=cc1)(?:not-cc2)}, was tried previously and is not used.)
     */
    public void outputBmp(FastStringBuffer buf) {
        buf.append('[');
        cc1.outputBmp(buf);
        buf.append("&&");
        cc2.outputComplementBmp(buf);
        buf.append(']');
    }

    /**
     * Writes {@code [not-cc1 cc2]} for the BMP part of the complement: the complement
     * of the first class followed by the second class, inside one bracket.
     */
    public void outputComplementBmp(FastStringBuffer buf) {
        buf.append('[');
        cc1.outputComplementBmp(buf);
        cc2.outputBmp(buf);
        buf.append(']');
    }

    /**
     * Adds to {@code ranges} the non-BMP ranges of the first class with the non-BMP
     * ranges of the second class cut out. Both range lists are sorted and then walked
     * in step, so each list is traversed once.
     * @param ranges the list to which the surviving Range objects are appended
     */
    public void addNonBmpRanges(List ranges) {
        List keep = new ArrayList(5);
        cc1.addNonBmpRanges(keep);
        List cut = new ArrayList(5);
        cc2.addNonBmpRanges(cut);
        sortRangeList(keep);
        sortRangeList(cut);

        Iterator cutIter = cut.iterator();
        Range hole = nextOrNull(cutIter);

        for (Iterator keepIter = keep.iterator(); keepIter.hasNext();) {
            Range span = (Range) keepIter.next();

            // Skip the holes that end before this span begins.
            while (hole != null && hole.getMax() < span.getMin()) {
                hole = nextOrNull(cutIter);
            }

            // From here on, if hole != null then hole.max >= span.min.
            int start = span.getMin();
            while (hole != null && hole.getMin() <= span.getMax()) {
                if (start < hole.getMin()) {
                    // the stretch before the hole survives
                    ranges.add(new Range(start, hole.getMin() - 1));
                }
                start = hole.getMax() + 1;
                if (start > span.getMax()) {
                    // The hole reaches the end of this span. Keep it as the current
                    // hole, because it may also overlap the next span.
                    break;
                }
                hole = nextOrNull(cutIter);
            }

            // Whatever lies after the last hole survives.
            if (start <= span.getMax()) {
                ranges.add(new Range(start, span.getMax()));
            }
        }
    }

    /** Returns the next Range from the iterator, or null when it is exhausted. */
    private static Range nextOrNull(Iterator iter) {
        return iter.hasNext() ? (Range) iter.next() : null;
    }
}
/**
 * Character class that matches a character if any one of its member
 * classes matches it.
 */
static class Union extends CharClass {
    /** The member classes (CharClass instances) being united. */
    private final List members;

    Union(CharClass[] v) {
        this(toList(v));
    }

    Union(List members) {
        super(computeContainsBmp(members), computeContainsNonBmp(members));
        this.members = members;
    }

    /** Copies the array of member classes into a new list. */
    private static List toList(CharClass[] v) {
        return new ArrayList(java.util.Arrays.asList(v));
    }

    /**
     * Writes the BMP part of the union as one bracket expression holding
     * every member that has some BMP content. Simple members are written
     * in their in-class form; other members are written in full.
     */
    public void outputBmp(FastStringBuffer buf) {
        buf.append('[');
        for (Iterator it = members.iterator(); it.hasNext();) {
            CharClass member = (CharClass) it.next();
            if (member.getContainsBmp() == NONE) {
                continue;
            }
            if (member instanceof SimpleCharClass) {
                ((SimpleCharClass) member).inClassOutputBmp(buf);
            } else {
                member.outputBmp(buf);
            }
        }
        buf.append(']');
    }

    /**
     * Writes the complement of the BMP part of the union. The simple
     * members are gathered into one negated bracket expression; the
     * complement of each remaining member is then joined on with "&amp;&amp;".
     */
    public void outputComplementBmp(FastStringBuffer buf) {
        boolean opened = false;

        // First pass: simple members go together inside "[^ ...".
        for (Iterator it = members.iterator(); it.hasNext();) {
            CharClass member = (CharClass) it.next();
            if (member.getContainsBmp() == NONE || !(member instanceof SimpleCharClass)) {
                continue;
            }
            if (!opened) {
                buf.append("[^");
                opened = true;
            }
            ((SimpleCharClass) member).inClassOutputBmp(buf);
        }

        // Second pass: intersect with the complement of every non-simple member.
        for (Iterator it = members.iterator(); it.hasNext();) {
            CharClass member = (CharClass) it.next();
            if (member.getContainsBmp() == NONE || member instanceof SimpleCharClass) {
                continue;
            }
            if (opened) {
                buf.append("&&");
            } else {
                buf.append('[');
                opened = true;
            }
            // No member can be ALL here: that would make the union ALL, which
            // violates the precondition for outputComplementBmp.
            member.outputComplementBmp(buf);
        }

        if (opened) {
            buf.append(']');
        } else {
            // Every member is NONE, so the union is NONE and its complement is everything.
            buf.append("[\u0000-\uFFFF]");
        }
    }

    /** Lets each member add its own non-BMP ranges to the supplied list. */
    public void addNonBmpRanges(List ranges) {
        for (Iterator it = members.iterator(); it.hasNext();) {
            ((CharClass) it.next()).addNonBmpRanges(ranges);
        }
    }

    /** Returns the largest BMP containment indicator among the members, or NONE if there are none. */
    private static int computeContainsBmp(List members) {
        int widest = NONE;
        for (Iterator it = members.iterator(); it.hasNext();) {
            int contains = ((CharClass) it.next()).getContainsBmp();
            if (contains > widest) {
                widest = contains;
            }
        }
        return widest;
    }

    /** Returns the largest non-BMP containment indicator among the members, or NONE if there are none. */
    private static int computeContainsNonBmp(List members) {
        int widest = NONE;
        for (Iterator it = members.iterator(); it.hasNext();) {
            int contains = ((CharClass) it.next()).getContainsNonBmp();
            if (contains > widest) {
                widest = contains;
            }
        }
        return widest;
    }
}
/**
 * Translates one atom starting at the current character and appends its
 * translation to the result buffer.
 *
 * @return false, with nothing consumed, if the current character cannot
 *         start an atom (a quantifier character, a closing bracket, '|',
 *         or the real end of the input); true once an atom has been translated
 * @throws RegexSyntaxException if the atom is not well formed
 */
protected boolean translateAtom() throws RegexSyntaxException {
    switch (curChar) {
        case RegexData.EOS:
            if (eos) {
                return false;
            }
            // Not the real end of input: treat it as an ordinary character below.
            break;

        case '?':
        case '*':
        case '+':
        case ')':
        case '{':
        case '}':
        case '|':
        case ']':
            return false;

        case '(': {
            copyCurChar();
            // Take the group number before translating the content, so that
            // nested groups receive higher numbers.
            final int captureNumber = ++currentCapture;
            translateRegExp();
            expect(')');
            // The group is registered only once its closing parenthesis has been seen.
            captures.add(captureNumber);
            copyCurChar();
            return true;
        }

        case '\\':
            advance();
            parseEsc().output(result);
            return true;

        case '[':
            inCharClassExpr = true;
            advance();
            parseCharClassExpr().output(result);
            return true;

        case '.':
            // The meaning of "." depends on which dialect is being translated.
            if (isXPath) {
                DOT_XPATH.output(result);
            } else {
                DOT_SCHEMA.output(result);
            }
            advance();
            return true;

        case '$':
        case '^':
            // In XPath mode these are copied through unchanged; otherwise they
            // are preceded by a backslash in the output.
            if (!isXPath) {
                result.append('\\');
            }
            break;

        default:
            break;
    }
    // Any other character is copied to the output as it stands.
    copyCurChar();
    return true;
}
private static CharClass makeNameCharClass(byte mask) {
List ranges = new ArrayList();
// Add colon to the set of characters matched
ranges.add(new SingleChar(':'));
// Plus all the characters from the NCName tables
IntRangeSet members = XMLCharacterData.getCategory(mask);
int used = members.getNumberOfRanges();
int[] startPoints = members.getStartPoints();
int[] endPoints = members.getEndPoints();
for (int i=0; i