segment-1.3.5~svn57+dfsg/0000755000175000017500000000000011300444112013630 5ustar railrailsegment-1.3.5~svn57+dfsg/src/0000755000175000017500000000000011300444112014417 5ustar railrailsegment-1.3.5~svn57+dfsg/src/net/0000755000175000017500000000000011300444112015205 5ustar railrailsegment-1.3.5~svn57+dfsg/src/net/sourceforge/0000755000175000017500000000000011300444112017530 5ustar railrailsegment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/0000755000175000017500000000000011300444112021172 5ustar railrailsegment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/TextIterator.java0000644000175000017500000000075611223073726024520 0ustar railrailpackage net.sourceforge.segment; import java.util.Iterator; /** * Text iterator interface. * * @author loomchild */ public interface TextIterator extends Iterator { /** * @return next segment in text, or null if end of text has been * reached. */ public String next(); /** * @return true if there are more segments */ public boolean hasNext(); /** * Unsupported operation. 
* * @throws UnsupportedOperationException */ public void remove(); } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/ui/0000755000175000017500000000000011300444112021607 5ustar railrailsegment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/ui/console/0000755000175000017500000000000011300444112023251 5ustar railrailsegment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/ui/console/Segment.java0000644000175000017500000004356011236362500025536 0ustar railrailpackage net.sourceforge.segment.ui.console; import static net.sourceforge.segment.util.Util.getFileInputStream; import static net.sourceforge.segment.util.Util.getFileOutputStream; import static net.sourceforge.segment.util.Util.getReader; import static net.sourceforge.segment.util.Util.getResourceStream; import static net.sourceforge.segment.util.Util.getWriter; import static net.sourceforge.segment.util.Util.readAll; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.Reader; import java.io.StringReader; import java.io.Writer; import java.util.HashMap; import java.util.Map; import java.util.Random; import net.sourceforge.segment.SegmentTestSuite; import net.sourceforge.segment.TextIterator; import net.sourceforge.segment.Version; import net.sourceforge.segment.srx.LanguageRule; import net.sourceforge.segment.srx.Rule; import net.sourceforge.segment.srx.SrxDocument; import net.sourceforge.segment.srx.SrxParser; import net.sourceforge.segment.srx.SrxTextIterator; import net.sourceforge.segment.srx.SrxTransformer; import net.sourceforge.segment.srx.io.Srx1Transformer; import net.sourceforge.segment.srx.io.SrxAnyParser; import net.sourceforge.segment.srx.io.SrxAnyTransformer; import net.sourceforge.segment.srx.legacy.AccurateSrxTextIterator; import net.sourceforge.segment.srx.legacy.FastTextIterator; import net.sourceforge.segment.util.NullWriter; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import 
org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.cli.PosixParser; import org.junit.internal.runners.TextListener; import org.junit.runner.JUnitCore; /** * Text user interface to splitter. * * @author loomchild */ public class Segment { private enum Algorithm { accurate, fast, ultimate; } public static final String DEFAULT_SRX = "net/sourceforge/segment/res/xml/default.srx"; public static final String EOLN = System.getProperty("line.separator"); public static final String DEFAULT_BEGIN_SEGMENT = ""; public static final String DEFAULT_END_SEGMENT = EOLN; /* These constants apply to text / SRX generation */ public static final int WORD_LENGTH = 2; public static final int SENTENCE_LENGTH = 5; private Random random; private String text; public static void main(String[] args) { try { Segment main = new Segment(); main.run(args); } catch (Exception e) { e.printStackTrace(System.err); } } public Segment() { this.random = new Random(); } private void run(String[] args) throws Exception { Options options = createOptions(); HelpFormatter helpFormatter = new HelpFormatter(); CommandLineParser parser = new PosixParser(); CommandLine commandLine = null; try { commandLine = parser.parse(options, args); if (commandLine.hasOption('h')) { printHelp(options, helpFormatter); } else if (commandLine.hasOption('z')) { test(); } else if (commandLine.hasOption('t')) { transform(commandLine); } else { segment(commandLine); } } catch (ParseException e) { printUsage(helpFormatter); } catch (IllegalArgumentException e) { System.out.println(e.getMessage()); } } private Options createOptions() { Options options = new Options(); options.addOption("s", "srx", true, "SRX file."); options.addOption("l", "language", true, "Language code."); options.addOption("m", "map", true, "Map rule name in SRX 1.0."); options.addOption("b", "begin", true, "Output segment prefix."); options.addOption("e", "end", 
true, "Output segment suffix."); options.addOption("a", "algorithm", true, "Algorithm. Can be accurate, fast or ultimate (default)."); options.addOption("i", "input", true, "Use given input file instead of standard input."); options.addOption("o", "output", true, "Use given output file instead of standard output."); options.addOption("t", "transform", false, "Convert old SRX to current version."); options.addOption("p", "profile", false, "Print profile information."); options.addOption("r", "preload", false, "Preload document into memory before segmentation."); options.addOption("2", "twice", false, "Repeat the whole process twice."); options.addOption("z", "test", false, "Test the application by running a test suite."); options.addOption(null, "lookbehind", true, "Maximum length of a regular expression construct that occurs in lookbehind. Default: " + SrxTextIterator.DEFAULT_MAX_LOOKBEHIND_CONSTRUCT_LENGTH + "."); options.addOption(null, "buffer-length", true, "Length of a buffer when reading text as a stream. Default: " + SrxTextIterator.DEFAULT_BUFFER_LENGTH + "."); options.addOption(null, "margin", true, "If rule is matched but its position is in the margin (position > bufferLength - margin) then the matching is ignored. Default " + SrxTextIterator.DEFAULT_MARGIN + "."); options.addOption(null, "generate-text", true, "Generate random input with given length in KB."); options.addOption(null, "generate-srx", true, "Generate random segmentation rules with given rule count and rule length separated by a comma."); options.addOption("h", "help", false, "Print this help."); return options; } private void printUsage(HelpFormatter helpFormatter) { System.out.println("Unknown command. 
Use segment -h for help."); } private void printHelp(Options options, HelpFormatter helpFormatter) { String signature = "Segment"; if (Version.getInstance().getVersion() != null) { signature += " " + Version.getInstance().getVersion(); } if (Version.getInstance().getDate() != null) { signature += ", " + Version.getInstance().getDate(); } signature += "."; System.out.println(signature); helpFormatter.printHelp("segment", options); } private void test() { JUnitCore core = new JUnitCore(); core.addListener(new TextListener()); core.run(SegmentTestSuite.class); } private void segment(CommandLine commandLine) throws IOException { Reader reader = null; Writer writer = null; try { boolean profile = commandLine.hasOption('p'); boolean twice = commandLine.hasOption('2'); boolean preload = commandLine.hasOption('r'); if (twice && !profile) { throw new RuntimeException("Can only repeat segmentation twice in profile mode."); } reader = createTextReader(commandLine, profile, twice, preload); writer = createTextWriter(commandLine); if (preload) { preloadText(reader, profile); } SrxDocument document = createSrxDocument(commandLine, profile); createAndSegment(commandLine, document, reader, writer, profile); if (twice) { reader = createTextReader(commandLine, profile, twice, preload); createAndSegment(commandLine, document, reader, writer, profile); } } finally { if (reader != null) { reader.close(); } if (writer != null) { writer.close(); } } } private Reader createTextReader(CommandLine commandLine, boolean profile, boolean twice, boolean preload) throws IOException { Reader reader; if (commandLine.hasOption("generate-text")) { reader = createRandomTextReader(commandLine.getOptionValue("generate-text"), profile); } else if (commandLine.hasOption('i')) { reader = createFileReader(commandLine.getOptionValue('i')); } else { if (twice && !preload) { throw new RuntimeException("Cannot read standard input twice. 
" + "Preload text (-r), provide an input file (-i) or generate input text (-x)."); } reader = createStandardInputReader(); } return reader; } private Reader createStandardInputReader() { Reader reader = getReader(System.in); return reader; } private Reader createFileReader(String fileName) throws IOException { InputStream inputStream = getFileInputStream(fileName); Reader reader = getReader(inputStream); return reader; } private Reader createRandomTextReader(String generateTextOption, boolean profile) { if (text == null) { long start = System.currentTimeMillis(); if (profile) { System.out.print("Generating text... "); } this.text = generateText(generateTextOption); if (profile) { System.out.println(System.currentTimeMillis() - start + " ms."); } } Reader reader = new StringReader(text); return reader; } private String generateText(String generateTextOption) { int textLength = Integer.parseInt(generateTextOption); if (textLength < 1) { throw new RuntimeException("Text too short: " + textLength + "K."); } int wordCount = textLength * 1024 / (WORD_LENGTH + 1); StringBuilder stringBuilder = new StringBuilder(); for (int i = 0; i < wordCount; ++i) { stringBuilder.append(' '); String word = generateWord(WORD_LENGTH); stringBuilder.append(word); if ((i % SENTENCE_LENGTH) == 0) { stringBuilder.append('.'); } } return stringBuilder.toString(); } private String generateWord(int length) { StringBuilder word = new StringBuilder(); for (int i = 0; i < length; ++i) { char character = generateCharacter(); word.append(character); } return word.toString(); } private char generateCharacter() { int character = random.nextInt('Z' - 'A' + 1) + 'A'; return (char)character; } private Writer createTextWriter(CommandLine commandLine) { Writer writer; if (commandLine.hasOption('p')) { writer = new NullWriter(); } else if (commandLine.hasOption('o')) { writer = createFileWriter(commandLine.getOptionValue('o')); } else { writer = createStandardOutputWriter(); } return writer; } private Writer 
createStandardOutputWriter() { Writer writer = getWriter(System.out); return writer; } private Writer createFileWriter(String fileName) { OutputStream outputStream = getFileOutputStream(fileName); Writer writer = getWriter(outputStream); return writer; } private SrxDocument createSrxDocument(CommandLine commandLine, boolean profile) throws IOException { SrxDocument document; long start = System.currentTimeMillis(); if (commandLine.hasOption("generate-srx")) { if (profile) { System.out.print("Generating rules... "); } String generateSrxOption = commandLine.getOptionValue("generate-srx"); document = generateSrxDocument(generateSrxOption); } else { String fileName = commandLine.getOptionValue('s'); String mapRule = commandLine.getOptionValue('m'); if (profile) { System.out.print("Reading rules... "); } document = readSrxDocument(fileName, mapRule); } if (profile) { System.out.println(System.currentTimeMillis() - start + " ms."); } return document; } private SrxDocument readSrxDocument(String fileName, String mapRule) throws IOException { Reader srxReader; if (fileName != null) { srxReader = getReader(getFileInputStream(fileName)); } else { srxReader = getReader(getResourceStream(DEFAULT_SRX)); } Map parameterMap = new HashMap(); if (mapRule != null) { parameterMap.put(Srx1Transformer.MAP_RULE_NAME, mapRule); } // If there are transformation parameters then separate transformation // is needed. 
if (parameterMap.size() > 0) { SrxTransformer transformer = new SrxAnyTransformer(); srxReader = transformer.transform(srxReader, parameterMap); } SrxParser srxParser = new SrxAnyParser(); SrxDocument document = srxParser.parse(srxReader); srxReader.close(); return document; } private SrxDocument generateSrxDocument(String generateSrxOption) { String[] parts = generateSrxOption.split(","); if (parts.length != 2) { throw new RuntimeException("Cannot parse rule count and length."); } int ruleCount = Integer.parseInt(parts[0]); if (ruleCount < 0) { throw new RuntimeException("Rule count must be positive: " + ruleCount + "."); } int ruleLength = Integer.parseInt(parts[1]); if (ruleLength < 1) { throw new RuntimeException("Rule length must be greater or equal to one: " + ruleCount + "."); } SrxDocument srxDocument = new SrxDocument(); LanguageRule languageRule = generateLanguageRule(ruleCount, ruleLength); srxDocument.addLanguageMap(".*", languageRule); return srxDocument; } private LanguageRule generateLanguageRule(int ruleCount, int ruleLenght) { LanguageRule languageRule = new LanguageRule(""); // Add rules for (int i = 0; i < ruleCount; ++i) { Rule rule = generateRule(ruleLenght); languageRule.addRule(rule); } // Add end of sentence rule languageRule.addRule(new Rule(true, "\\.", " ")); return languageRule; } private Rule generateRule(int length) { StringBuilder regex = new StringBuilder(); regex.append('('); for (int i = 0; i < length; ++i) { String word = generateWord(WORD_LENGTH); regex.append(word); if (i != length - 1) { regex.append('|'); } } regex.append(')'); Rule rule = new Rule(false, regex + "\\.", " "); return rule; } private void createAndSegment(CommandLine commandLine, SrxDocument document, Reader reader, Writer writer, boolean profile) throws IOException { if (profile) { System.out.println("Segmenting... 
"); } long start = System.currentTimeMillis(); TextIterator textIterator = createTextIterator(commandLine, document, reader, profile); performSegment(commandLine, textIterator, writer, profile); if (profile) { System.out.println(System.currentTimeMillis() - start + " ms."); } } private TextIterator createTextIterator(CommandLine commandLine, SrxDocument document, Reader reader, boolean profile) { TextIterator textIterator; String languageCode = commandLine.getOptionValue('l'); if (languageCode == null) { languageCode = ""; } String algorithmString = commandLine.getOptionValue('a'); Algorithm algorithm = Algorithm.ultimate; if (algorithmString != null) { algorithm = Algorithm.valueOf(algorithmString); } Map parameterMap = new HashMap(); if (commandLine.hasOption("lookbehind")) { if (algorithm != Algorithm.ultimate && algorithm != Algorithm.fast) { throw new IllegalArgumentException("--lookbehind parameter can be only used with ultimate or fast algorithm."); } parameterMap.put( SrxTextIterator.MAX_LOOKBEHIND_CONSTRUCT_LENGTH_PARAMETER, Integer.parseInt(commandLine.getOptionValue("lookbehind"))); } if (commandLine.hasOption("buffer-length")) { if (commandLine.hasOption('r')) { throw new IllegalArgumentException("--buffer-length can be only used when reading text from a stream (--preload option not allowed)."); } parameterMap.put( SrxTextIterator.BUFFER_LENGTH_PARAMETER, Integer.parseInt(commandLine.getOptionValue("buffer-length"))); } if (commandLine.hasOption("margin")) { if (algorithm != Algorithm.ultimate) { throw new IllegalArgumentException("--margin parameter can be only used with ultimate algorithm."); } parameterMap.put( SrxTextIterator.MARGIN_PARAMETER, Integer.parseInt(commandLine.getOptionValue("margin"))); } if (profile) { System.out.print(" Creating text iterator... 
"); } long start = System.currentTimeMillis(); if (algorithm == Algorithm.accurate) { if (text != null) { textIterator = new AccurateSrxTextIterator(document, languageCode, text); } else { throw new IllegalArgumentException("For accurate algorithm preload option (-r) is mandatory."); } } else if (algorithm == Algorithm.ultimate) { if (text != null) { textIterator = new SrxTextIterator(document, languageCode, text, parameterMap); } else { textIterator = new SrxTextIterator(document, languageCode, reader, parameterMap); } } else if (algorithm == Algorithm.fast) { if (text != null) { textIterator = new FastTextIterator(document, languageCode, text, parameterMap); } else { textIterator = new FastTextIterator(document, languageCode, reader, parameterMap); } } else { throw new IllegalArgumentException("Unknown algorithm: " + algorithm + "."); } if (profile) { System.out.println(System.currentTimeMillis() - start + " ms."); } return textIterator; } private void performSegment(CommandLine commandLine, TextIterator textIterator, Writer writer, boolean profile) throws IOException { String beginSegment = commandLine.getOptionValue('b'); if (beginSegment == null) { beginSegment = DEFAULT_BEGIN_SEGMENT; } String endSegment = commandLine.getOptionValue('e'); if (endSegment == null) { endSegment = DEFAULT_END_SEGMENT; } if (profile) { System.out.print(" Performing segmentation... "); } long start = System.currentTimeMillis(); while (textIterator.hasNext()) { String segment = textIterator.next(); writer.write(beginSegment); writer.write(segment); writer.write(endSegment); } if (profile) { System.out.println(System.currentTimeMillis() - start + " ms."); } } private String preloadText(Reader reader, boolean profile) { if (text == null) { if (profile) { System.out.print("Preloading text... 
"); } long start = System.currentTimeMillis(); text = readAll(reader); if (profile) { System.out.println(System.currentTimeMillis() - start + " ms."); } } return text; } private void transform(CommandLine commandLine) throws IOException { Reader reader; if (commandLine.hasOption('i')) { reader = createFileReader(commandLine.getOptionValue('i')); } else { reader = createStandardInputReader(); } Writer writer; if (commandLine.hasOption('o')) { writer = createFileWriter(commandLine.getOptionValue('o')); } else { writer = createStandardOutputWriter(); } String mapRule = commandLine.getOptionValue("m"); try { SrxTransformer transformer = new SrxAnyTransformer(); Map parameterMap = new HashMap(); if (mapRule != null) { parameterMap.put(Srx1Transformer.MAP_RULE_NAME, mapRule); } transformer.transform(reader, writer, parameterMap); } finally { reader.close(); writer.close(); } } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/AbstractTextIterator.java0000644000175000017500000000057411171331077026200 0ustar railrailpackage net.sourceforge.segment; /** * Represents abstract text iterator. Responsible for implementing remove * operation. 
* * @author loomchild * */ public abstract class AbstractTextIterator implements TextIterator { /** * {@inheritDoc} */ public void remove() { throw new UnsupportedOperationException( "Remove is not supported by TextIterator."); } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/SegmentTestSuite.java0000644000175000017500000000305311223073726025327 0ustar railrailpackage net.sourceforge.segment; import net.sourceforge.segment.srx.LanguageMapTest; import net.sourceforge.segment.srx.RuleMatcherTest; import net.sourceforge.segment.srx.SrxDocumentTest; import net.sourceforge.segment.srx.SrxTextIteratorReaderTest; import net.sourceforge.segment.srx.SrxTextIteratorStringTest; import net.sourceforge.segment.srx.TextManagerTest; import net.sourceforge.segment.srx.io.SrxParsersTest; import net.sourceforge.segment.srx.io.SrxTransformersTest; import net.sourceforge.segment.srx.io.SrxVersionTest; import net.sourceforge.segment.srx.legacy.AccurateSrxTextIteratorStringTest; import net.sourceforge.segment.srx.legacy.BufferTest; import net.sourceforge.segment.srx.legacy.FastTextIteratorReaderTest; import net.sourceforge.segment.srx.legacy.FastTextIteratorStringTest; import net.sourceforge.segment.srx.legacy.ReaderCharSequenceTest; import net.sourceforge.segment.srx.legacy.ReaderMatcherTest; import net.sourceforge.segment.util.UtilTest; import org.junit.runner.RunWith; import org.junit.runners.Suite; @RunWith(Suite.class) @Suite.SuiteClasses({ LanguageMapTest.class, SrxDocumentTest.class, TextManagerTest.class, SrxVersionTest.class, SrxParsersTest.class, SrxTransformersTest.class, UtilTest.class, SrxTextIteratorStringTest.class, SrxTextIteratorReaderTest.class, RuleMatcherTest.class, BufferTest.class, ReaderCharSequenceTest.class, ReaderMatcherTest.class, AccurateSrxTextIteratorStringTest.class, FastTextIteratorStringTest.class, FastTextIteratorReaderTest.class }) public class SegmentTestSuite { } 
segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/util/0000755000175000017500000000000011300444112022147 5ustar railrailsegment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/util/ResourceNotFoundException.java0000644000175000017500000000067211223073726030157 0ustar railrailpackage net.sourceforge.segment.util; /** * Exception that indicates that resource has not been found by Classloader. * @author loomchild */ public class ResourceNotFoundException extends RuntimeException { private static final long serialVersionUID = 318909218824445026L; public ResourceNotFoundException(String name) { super(name); } public ResourceNotFoundException(String name, Throwable cause) { super(name, cause); } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/util/TransformationErrorListener.java0000644000175000017500000000153211223073726030556 0ustar railrailpackage net.sourceforge.segment.util; import javax.xml.transform.ErrorListener; import javax.xml.transform.TransformerException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; /** * Handles XML transformation errors by writing them to logr. 
* @author loomchild */ public class TransformationErrorListener implements ErrorListener { private static final Log log = LogFactory .getLog(TransformationErrorListener.class); public void warning(TransformerException exception) { log.info("Transformation warning: " + exception.getMessage()); } public void error(TransformerException exception) throws TransformerException { log.warn("Transformation error: " + exception.getMessage()); } public void fatalError(TransformerException exception) throws TransformerException { throw exception; } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/util/IgnoreDTDEntityResolver.java0000644000175000017500000000121611223073726027525 0ustar railrailpackage net.sourceforge.segment.util; import java.io.ByteArrayInputStream; import java.io.IOException; import org.xml.sax.EntityResolver; import org.xml.sax.InputSource; import org.xml.sax.SAXException; /** * This class is used to ignore XML DTD entities. * Without it XML parsing will fail if DTD could not be found in specified location. 
* @author loomchild */ public class IgnoreDTDEntityResolver implements EntityResolver { public IgnoreDTDEntityResolver() { } public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException { return new InputSource(new ByteArrayInputStream(new byte[0])); } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/util/Util.java0000644000175000017500000004075711255767162023772 0ustar railrailpackage net.sourceforge.segment.util; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.Reader; import java.io.StringWriter; import java.io.UnsupportedEncodingException; import java.io.Writer; import java.net.URL; import java.util.Collections; import java.util.Map; import java.util.jar.Manifest; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.XMLConstants; import javax.xml.bind.JAXBContext; import javax.xml.bind.JAXBException; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import javax.xml.transform.Result; import javax.xml.transform.Source; import javax.xml.transform.Templates; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.sax.SAXSource; import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamSource; import javax.xml.validation.Schema; import javax.xml.validation.SchemaFactory; import net.sourceforge.segment.srx.SrxDocument; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; public class Util { public static final int READ_BUFFER_SIZE = 1024; public static 
final String MANIFEST_PATH = "/META-INF/MANIFEST.MF"; private static final Pattern STAR_PATTERN = Pattern .compile("(?<=(? klass) { URL classUrl = klass.getResource(klass.getSimpleName() + ".class"); if (classUrl == null) { throw new IllegalArgumentException("Class not found: " + klass.getName() + "."); } String classPath = classUrl.toString(); int jarIndex = classPath.indexOf('!'); if (jarIndex != -1) { String manifestPath = classPath.substring(0, jarIndex + 1) + MANIFEST_PATH; try { URL manifestUrl = new URL(manifestPath); InputStream manifestStream = manifestUrl.openStream(); Manifest manifest = new Manifest(manifestStream); return manifest; } catch (IOException e) { throw new ResourceNotFoundException( "IO Error retrieving manifest.", e); } } else { throw new ResourceNotFoundException( "Class is not in a JAR archive " + klass.getName() + "."); } } /** * Returns XMLReader validating against given XML schema. * The reader ignores DTD defined in XML file. * @param schema * @return XMLReader * @throws XMLException when SAX error occurs */ public static XMLReader getXmlReader(Schema schema) { try { SAXParserFactory parserFactory = SAXParserFactory.newInstance(); parserFactory.setValidating(false); parserFactory.setNamespaceAware(true); if (schema != null) { parserFactory.setSchema(schema); } SAXParser saxParser = parserFactory.newSAXParser(); XMLReader xmlReader = saxParser.getXMLReader(); xmlReader.setEntityResolver(new IgnoreDTDEntityResolver()); return xmlReader; } catch (ParserConfigurationException e) { throw new XmlException("SAX Parser configuration error.", e); } catch (SAXException e) { throw new XmlException("Error creating XMLReader.", e); } } /** * @see Util#getXmlReader(Schema) * @return XMLReader without XML schema associated with it * @throws XMLException when SAX error occurs */ public static XMLReader getXmlReader() { return getXmlReader(null); } /** * Reads a XML schema from given reader. 
* @param reader * @return XML Schema * @throws XMLException when XML schema parsing error occurs */ public static Schema getSchema(Reader reader) { return getSchema(new Reader[] { reader }); } /** * Reads a XML schema from given readers. Schema files can depend on * one another. * @param readerArray readers containing XML schemas * @return XML Schema object * @throws XMLException when XML schema parsing error occurs */ public static Schema getSchema(Reader[] readerArray) { try { Source[] sourceArray = new Source[readerArray.length]; for (int i = 0; i < readerArray.length; ++i) { Reader reader = readerArray[i]; Source source = new StreamSource(reader); sourceArray[i] = source; } SchemaFactory schemaFactory = SchemaFactory .newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI); Schema schema = schemaFactory.newSchema(sourceArray); return schema; } catch (SAXException e) { throw new XmlException("Error creating XML Schema.", e); } } /** * @param reader * @param schema XML schema * @return XML source from given reader and with given schema */ public static Source getSource(Reader reader, Schema schema) { Source source = new SAXSource(getXmlReader(schema), new InputSource(reader)); return source; } /** * @param context context package name * @return JAXB context * @throws XMLException if JAXB error occurs */ public static JAXBContext getContext(String context) { try { JAXBContext jaxbContext = JAXBContext.newInstance(context); return jaxbContext; } catch (JAXBException e) { throw new XmlException("Error creating JAXB context", e); } } /** * @param context context package name * @param classLoader class loader used to load classes from the context * @return JAXB context; loads the classes using given classloader * @throws XMLException if JAXB error occurs */ public static JAXBContext getContext(String context, ClassLoader classLoader) { try { JAXBContext jaxbContext = JAXBContext.newInstance(context, classLoader); return jaxbContext; } catch (JAXBException e) { throw new 
XmlException("Error creating JAXB context", e); } } /** * @param classesToBeBound * @return JAXBContext according to classes to bind * Dependency classes are also loaded automatically. * @throws XMLException if JAXB error occurs */ public static JAXBContext getContext(Class... classesToBeBound) { try { JAXBContext jaxbContext = JAXBContext.newInstance(classesToBeBound); return jaxbContext; } catch (JAXBException e) { throw new XmlException("Error creating JAXB context", e); } } /** * Returns XML transform templates from given reader containing XSLT * stylesheet. * @param reader * @return templates; they can be reused many times to perform the transformation * @throws XMLException if XML parsing error occurs */ public static Templates getTemplates(Reader reader) { try { TransformerFactory factory = TransformerFactory.newInstance(); Source source = new StreamSource(reader); Templates templates; templates = factory.newTemplates(source); return templates; } catch (TransformerConfigurationException e) { throw new XmlException("Error creating XSLT templates.", e); } } /** * Performs XML schema validation and XSLT transformation. 
* @param templates XSLT stylesheet * @param schema XML schema to validate against * @param reader reader with input document * @param writer writer which will be used to write output * @param parameterMap transformation parameters * @throws XMLException if transformation error occurs */ public static void transform(Templates templates, Schema schema, Reader reader, Writer writer, Map parameterMap) { try { Source source = getSource(reader, schema); Result result = new StreamResult(writer); Transformer transformer = templates.newTransformer(); transformer.setErrorListener(new TransformationErrorListener()); for (Map.Entry entry : parameterMap.entrySet()) { transformer.setParameter(entry.getKey(), entry.getValue()); } transformer.transform(source, result); } catch (TransformerConfigurationException e) { throw new XmlException("Error creating XSLT transformer.", e); } catch (TransformerException e) { throw new XmlException("XSLT transformer error.", e); } } /** * Performs XML schema validation and XSLT transformation. * @param templates XSLT stylesheet * @param schema XML schema to validate against * @param reader reader with input document * @param writer writer which will be used to write output * @throws XMLException if transformation error occurs */ public static void transform(Templates templates, Schema schema, Reader reader, Writer writer) { Map parameterMap = Collections.emptyMap(); transform(templates, schema, reader, writer, parameterMap); } /** * Performs XSLT transformation. * @param templates XSLT stylesheet * @param reader reader with input document * @param writer writer which will be used to write output * @param parameterMap transformation parameters * @throws XMLException if transformation error occurs */ public static void transform(Templates templates, Reader reader, Writer writer, Map parameterMap) { transform(templates, null, reader, writer, parameterMap); } /** * Performs XSLT transformation. 
* @param templates XSLT stylesheet * @param reader reader with input document * @param writer writer which will be used to write output * @throws XMLException if transformation error occurs */ public static void transform(Templates templates, Reader reader, Writer writer) { Map parameterMap = Collections.emptyMap(); transform(templates, reader, writer, parameterMap); } /** * Replaces block quotes in regular expressions with normal quotes. For * example "\Qabc\E" will be replace with "\a\b\c". * * @param pattern * @return pattern with replaced block quotes */ public static String removeBlockQuotes(String pattern) { StringBuilder patternBuilder = new StringBuilder(); boolean quote = false; char previousChar = 0; for (int i = 0; i < pattern.length(); ++i) { char currentChar = pattern.charAt(i); if (quote) { if (previousChar == '\\' && currentChar == 'E') { quote = false; // Need to remove "\\" at the end as it has been added // in previous iteration. patternBuilder.delete(patternBuilder.length() - 2, patternBuilder.length()); } else { patternBuilder.append('\\'); patternBuilder.append(currentChar); } } else { if (previousChar == '\\' && currentChar == 'Q') { quote = true; // Need to remove "\" at the end as it has been added // in previous iteration. patternBuilder.deleteCharAt(patternBuilder.length() - 1); } else { patternBuilder.append(currentChar); } } previousChar = currentChar; } return patternBuilder.toString(); } /** * Changes unlimited length pattern to limited length pattern. It is done by * replacing constructs with "*" and "+" symbols with their finite * counterparts - "{0,n}" and {1,n}. * As a side effect block quotes are replaced with normal quotes * by using {@link #removeBlockQuotes(String)}. 
* * @param pattern pattern to be finitized * @param infinity "n" number * @return limited length pattern */ public static String finitize(String pattern, int infinity) { String finitePattern = removeBlockQuotes(pattern); Matcher starMatcher = STAR_PATTERN.matcher(finitePattern); finitePattern = starMatcher.replaceAll("{0," + infinity + "}"); Matcher plusMatcher = PLUS_PATTERN.matcher(finitePattern); finitePattern = plusMatcher.replaceAll("{1," + infinity + "}"); Matcher rangeMatcher = RANGE_PATTERN.matcher(finitePattern); finitePattern = rangeMatcher.replaceAll("{$1," + infinity + "}"); return finitePattern; } public static Pattern compile(SrxDocument document, String regex) { Pattern pattern = document.getCache().get(regex, Pattern.class); if (pattern == null) { pattern = Pattern.compile(regex); document.getCache().put(regex, pattern); } return pattern; } /** * Replaces capturing groups with non-capturing groups in the given regular * expression. As a side effect block quotes are replaced with normal quotes * by using {@link #removeBlockQuotes(String)}. * * @param pattern * @return modified pattern */ public static String removeCapturingGroups(String pattern) { String newPattern = removeBlockQuotes(pattern); Matcher capturingGroupMatcher = CAPTURING_GROUP_PATTERN .matcher(newPattern); newPattern = capturingGroupMatcher.replaceAll("(?:"); return newPattern; } /** * Returns value if it is not null or default value if it is null. * Automatically cast value to the same type as default value. * @param value object * @param defaultValue default value. 
* @return object value or default value if object value is null * @throws ClassCastException when value cannot be cast to default value type */ @SuppressWarnings("unchecked") public static T getParameter(Object value, T defaultValue) { T result; if (value != null) { result = (T)value; } else { result = defaultValue; } return result; } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/util/XmlException.java0000644000175000017500000000066411223073726025454 0ustar railrailpackage net.sourceforge.segment.util; /** * Runtime version of XML exception. * @author loomchild */ public class XmlException extends RuntimeException { private static final long serialVersionUID = -143693366659133245L; public XmlException(String message) { super(message); } public XmlException(String message, Throwable cause) { super(message, cause); } public XmlException(Throwable cause) { super(cause); } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/util/NullWriter.java0000644000175000017500000000070511221113076025127 0ustar railrailpackage net.sourceforge.segment.util; import java.io.IOException; import java.io.Writer; /** * Writer that does not write anywhere. Idea is similar to /dev/null. * @author loomchild */ public class NullWriter extends Writer { public void close() throws IOException { // Do nothing. } public void flush() throws IOException { // Do nothing. } public void write(char[] cbuf, int off, int len) throws IOException { // Do nothing. 
} } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/util/Bind.java0000644000175000017500000000551411223073726023710 0ustar railrailpackage net.sourceforge.segment.util; import static net.sourceforge.segment.util.Util.getFileInputStream; import static net.sourceforge.segment.util.Util.getFileOutputStream; import static net.sourceforge.segment.util.Util.getReader; import static net.sourceforge.segment.util.Util.getWriter; import static net.sourceforge.segment.util.Util.getXmlReader; import java.io.IOException; import java.io.Reader; import java.io.Writer; import javax.xml.bind.JAXBContext; import javax.xml.bind.JAXBException; import javax.xml.bind.Marshaller; import javax.xml.bind.Unmarshaller; import javax.xml.transform.Source; import javax.xml.transform.sax.SAXSource; import javax.xml.validation.Schema; import org.xml.sax.InputSource; /** * Helper class for JAXB binding. * Responsible for marshalling and unmarshalling using given schema and context. * @author loomchild */ public class Bind { private Marshaller marshaller; private Unmarshaller unmarshaller; /** * Creates Bind. * @param context JAXB context * @param schema XML schema */ public Bind(JAXBContext context, Schema schema) { try { unmarshaller = context.createUnmarshaller(); unmarshaller.setEventHandler(new LoggingValidationEventHandler()); unmarshaller.setSchema(schema); marshaller = context.createMarshaller(); marshaller.setSchema(schema); marshaller.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, true); } catch (JAXBException e) { throw new XmlException("JAXB error", e); } } /** * Writes given object to given writer validating it. * @param writer * @param object */ public void marshal(Writer writer, Object object) { try { marshaller.marshal(object, writer); } catch (JAXBException e) { throw new XmlException("JAXB marshalling error", e); } } /** * Writes given object to a file with given name validating it. 
* @param fileName * @param object */ public void marshal(String fileName, Object object) { try { Writer writer = getWriter(getFileOutputStream(fileName)); marshal(writer, object); writer.close(); } catch (IOException e) { throw new IORuntimeException(e); } } /** * Retrieves object from given reader validation the input. * @param reader * @return object */ public Object unmarshal(Reader reader) { try { Source source = new SAXSource(getXmlReader(), new InputSource( reader)); return unmarshaller.unmarshal(source); } catch (JAXBException e) { throw new XmlException("JAXB unmarshalling error", e); } } /** * Retrieves object from a file with given name validating the input. * @param fileName * @return object */ public Object unmarshal(String fileName) { try { Reader reader = getReader(getFileInputStream(fileName)); Object object = unmarshal(reader); reader.close(); return object; } catch (IOException e) { throw new IORuntimeException(e); } } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/util/LoggingValidationEventHandler.java0000644000175000017500000000156511223073726030737 0ustar railrailpackage net.sourceforge.segment.util; import javax.xml.bind.ValidationEvent; import javax.xml.bind.ValidationEventHandler; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; /** * Handles XML validation errors by writing them to log. 
* @author loomchild */ public class LoggingValidationEventHandler implements ValidationEventHandler { private static final Log log = LogFactory .getLog(LoggingValidationEventHandler.class); public boolean handleEvent(ValidationEvent event) { if ((event.getSeverity() == ValidationEvent.FATAL_ERROR) || (event.getSeverity() == ValidationEvent.ERROR)) { return false; } else if (event.getSeverity() == ValidationEvent.WARNING) { log.debug("Validation warning: " + event.getMessage() + "."); return true; } else { log.warn("Unknown validation event type."); return false; } } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/util/IORuntimeException.java0000644000175000017500000000075111223073726026564 0ustar railrailpackage net.sourceforge.segment.util; import java.io.IOException; /** * Represents runtime version of {@link IOException}. * Used to avoid declaring thrown exceptions. * @author loomchild */ public class IORuntimeException extends RuntimeException { private static final long serialVersionUID = -6587044052300876023L; public IORuntimeException(IOException exception) { super(exception); } public void rethrow() throws IOException { throw (IOException) getCause(); } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/util/UtilTest.java0000644000175000017500000000243111221113076024573 0ustar railrailpackage net.sourceforge.segment.util; import junit.framework.TestCase; public class UtilTest extends TestCase { public static final String QUOTED_PATTERN = "\\Q\\a\\Qaa\\\\Ebb\\Q\\E\\Qcc\\Edd"; public static final String EXPECTED_UNQUOTED_PATTERN = "\\\\\\a\\\\\\Q\\a\\a\\\\bb\\c\\cdd"; public void testRemoveBlockQuotes() { String unqotedPattern = Util.removeBlockQuotes(QUOTED_PATTERN); assertEquals(EXPECTED_UNQUOTED_PATTERN, unqotedPattern); } public static final String INFINITE_PATTERN = "a*b\\*\\\\+c+d\\+\\\\\\\\*e++f{1,4}+g{3,}+h{1}+\\Qa+\\E"; public static final String EXPECTED_FINITE_PATTERN = 
"a{0,100}b\\*\\\\{1,100}c{1,100}d\\+\\\\\\\\{0,100}e{1,100}+f{1,4}+g{3,100}+h{1}+\\a\\+"; public void testFinitize() { String finitePattern = Util.finitize(INFINITE_PATTERN, 100); assertEquals(EXPECTED_FINITE_PATTERN, finitePattern); } public static final String CAPTURING_GROUPS_PATTERN = "(aa)\\(bb\\\\(cc(dd))ee(?:ff)\\Q()\\E"; public static final String EXPECTED_NONCAPTURING_GROUPS_PATTERN = "(?:aa)\\(bb\\\\(?:cc(?:dd))ee(?:ff)\\(\\)"; public void testRemoveCapturingGroups() { String noncapturingGroupsPattern = Util.removeCapturingGroups(CAPTURING_GROUPS_PATTERN); assertEquals(EXPECTED_NONCAPTURING_GROUPS_PATTERN, noncapturingGroupsPattern); } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/Version.java0000644000175000017500000000242011223073726023475 0ustar railrailpackage net.sourceforge.segment; import static net.sourceforge.segment.util.Util.getJarManifest; import java.util.jar.Manifest; import net.sourceforge.segment.util.ResourceNotFoundException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; /** * Retrieves segment version. Singleton. 
* @author loomchild */ public class Version { private static final Log log = LogFactory.getLog(Version.class); public static final String VERSION_ATTRIBUTE = "Implementation-Version"; public static final String DATE_ATTRIBUTE = "Build-Date"; private static Version instance = new Version(); private String version; private String date; public static Version getInstance() { return instance; } private Version() { try { Manifest manifest = getJarManifest(Version.class); version = manifest.getMainAttributes().getValue(VERSION_ATTRIBUTE); date = manifest.getMainAttributes().getValue(DATE_ATTRIBUTE); } catch (ResourceNotFoundException e) { // Ignore, attributes stay null log.debug("Version number cannot be retrieved."); } } /** * @return segment version string */ public String getVersion() { return version; } /** * @return segment build date */ public String getDate() { return date; } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/0000755000175000017500000000000011300444112022006 5ustar railrailsegment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/RuleManager.java0000644000175000017500000000757711236362500025103 0ustar railrailpackage net.sourceforge.segment.srx; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Pattern; import net.sourceforge.segment.util.Util; /** * Represents segmentation rules manager. * Responsible for constructing and storing break and exception rules. * * @author loomchild */ public class RuleManager { private SrxDocument document; private int maxLookbehindConstructLength; private List breakRuleList; private Map exceptionPatternMap; /** * Constructor. Responsible for retrieving rules from SRX document for * given language code, constructing patterns and storing them in * quick accessible format. * Adds break rules to {@link #breakRuleList} and constructs * corresponding exception patterns in {@link #exceptionPatternMap}. 
* Uses document cache to store rules and patterns. * @param document SRX document * @param languageRuleList list of language rules * @param maxLookbehindConstructLength Maximum length of regular expression in lookbehind (see {@link Util#finitize(String, int)}). */ public RuleManager(SrxDocument document, List languageRuleList, int maxLookbehindConstructLength) { this.document = document; this.maxLookbehindConstructLength = maxLookbehindConstructLength; this.breakRuleList = new ArrayList(); this.exceptionPatternMap = new HashMap(); StringBuilder exceptionPatternBuilder = new StringBuilder(); for (LanguageRule languageRule : languageRuleList) { for (Rule rule : languageRule.getRuleList()) { if (rule.isBreak()) { breakRuleList.add(rule); Pattern exceptionPattern; if (exceptionPatternBuilder.length() > 0) { String exceptionPatternString = exceptionPatternBuilder.toString(); exceptionPattern = Util.compile(document, exceptionPatternString); } else { exceptionPattern = null; } exceptionPatternMap.put(rule, exceptionPattern); } else { if (exceptionPatternBuilder.length() > 0) { exceptionPatternBuilder.append('|'); } String patternString = createExceptionPatternString(rule); exceptionPatternBuilder.append(patternString); } } } } /** * @return break rule list */ public List getBreakRuleList() { return breakRuleList; } /** * @param breakRule * @return exception pattern corresponding to give break rule */ public Pattern getExceptionPattern(Rule breakRule) { return exceptionPatternMap.get(breakRule); } /** * Creates exception pattern string that can be matched in the place * where break rule was matched. Both parts of the rule * (beforePattern and afterPattern) are incorporated * into one pattern. * beforePattern is used in lookbehind, therefore it needs to be * modified so it matches finite string (contains no *, + or {n,}). 
* @param rule exception rule * @return string containing exception pattern */ private String createExceptionPatternString(Rule rule) { String patternString = document.getCache().get(rule, String.class); if (patternString == null) { StringBuilder patternBuilder = new StringBuilder(); // As Java does not allow infinite length patterns // in lookbehind, before pattern need to be shortened. String beforePattern = Util.finitize(rule.getBeforePattern(), maxLookbehindConstructLength); String afterPattern = rule.getAfterPattern(); patternBuilder.append("(?:"); if (beforePattern.length() > 0) { patternBuilder.append("(?<=" + beforePattern + ")"); } if (afterPattern.length() > 0) { patternBuilder.append("(?=" + afterPattern + ")"); } patternBuilder.append(")"); patternString = patternBuilder.toString(); document.getCache().put(rule, patternString); } return patternString; } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/TextManagerTest.java0000644000175000017500000000340711236362500025744 0ustar railrailpackage net.sourceforge.segment.srx; import static org.junit.Assert.assertEquals; import java.io.StringReader; import org.junit.Test; public class TextManagerTest { @Test public void testCharSequence() { TextManager manager = new TextManager("text"); assertEquals("text", manager.getText().toString()); assertEquals(4, manager.getBufferLength()); assertEquals(false, manager.hasMoreText()); } @Test public void testEmptyString() { TextManager manager = new TextManager(""); assertEquals("", manager.getText().toString()); assertEquals(0, manager.getBufferLength()); assertEquals(false, manager.hasMoreText()); } @Test(expected=IllegalStateException.class) public void testCannotReadCharSequence() { TextManager manager = new TextManager("text"); manager.readText(1); } @Test public void testReader() { StringReader reader = new StringReader("text"); TextManager manager = new TextManager(reader, 2); assertEquals(2, manager.getBufferLength()); assertEquals("te", 
manager.getText().toString()); assertEquals(true, manager.hasMoreText()); manager.readText(1); assertEquals("ex", manager.getText().toString()); assertEquals(true, manager.hasMoreText()); manager.readText(1); assertEquals("xt", manager.getText().toString()); assertEquals(false, manager.hasMoreText()); } @Test public void testEmptyReader() { StringReader reader = new StringReader(""); TextManager manager = new TextManager(reader, 2); assertEquals("", manager.getText().toString()); assertEquals(2, manager.getBufferLength()); assertEquals(false, manager.hasMoreText()); } @Test(expected=IllegalArgumentException.class) public void testBufferZeroLength() { StringReader reader = new StringReader(""); new TextManager(reader, 0); } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/SrxTextIteratorStringTest.java0000644000175000017500000000344311247267663030067 0ustar railrailpackage net.sourceforge.segment.srx; import static org.junit.Assert.assertEquals; import java.util.HashMap; import java.util.List; import java.util.Map; import net.sourceforge.segment.TextIterator; import org.junit.Test; public class SrxTextIteratorStringTest extends AbstractSrxTextIteratorTest { protected TextIterator getTextIterator(String text, SrxDocument document, String languageCode) { return new SrxTextIterator(document, languageCode, text); } public static final String[] MAX_LOOKBEHIND_CONSTRUCT_LENGTH_RESULT = new String[] {"XAAA.", "XBB.XC"}; public static final SrxDocument MAX_LOOKBEHIND_CONSTRUCT_LENGTH_DOCUMENT = createMaxLookbehindConstructLengthDocument(); public static SrxDocument createMaxLookbehindConstructLengthDocument() { LanguageRule languageRule = new LanguageRule(""); languageRule.addRule(new Rule(false, "XA+\\.", "")); languageRule.addRule(new Rule(false, "XB+\\.", "")); languageRule.addRule(new Rule(true, "\\.", "")); SrxDocument document = new SrxDocument(); document.addLanguageMap(".*", languageRule); return document; } /** * Test if setting * {@link 
SrxTextIterator#MAX_LOOKBEHIND_CONSTRUCT_LENGTH_PARAMETER} works. */ @Test public void testMaxLookbehindConstructLength() { Map parameterMap = new HashMap(); parameterMap.put(SrxTextIterator.MAX_LOOKBEHIND_CONSTRUCT_LENGTH_PARAMETER, 2); String text = merge(MAX_LOOKBEHIND_CONSTRUCT_LENGTH_RESULT); TextIterator textIterator = new SrxTextIterator(MAX_LOOKBEHIND_CONSTRUCT_LENGTH_DOCUMENT, "", text, parameterMap); List segmentList = segment(textIterator); String[] segmentArray = segmentList.toArray(new String[segmentList.size()]); assertEquals(MAX_LOOKBEHIND_CONSTRUCT_LENGTH_RESULT, segmentArray); } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/SrxDocumentTest.java0000644000175000017500000000211011171331077025771 0ustar railrailpackage net.sourceforge.segment.srx; import java.util.List; import junit.framework.TestCase; public class SrxDocumentTest extends TestCase { public void testDocument() { SrxDocument document = new SrxDocument(); LanguageRule languageRule1 = new LanguageRule("1"); LanguageRule languageRule2 = new LanguageRule("2"); LanguageRule languageRule3 = new LanguageRule("3"); document.addLanguageMap("aaa", languageRule1); document.addLanguageMap("ab", languageRule2); document.addLanguageMap("a+", languageRule3); document.setCascade(true); List languageRuleList = document .getLanguageRuleList("aaa"); assertEquals(2, languageRuleList.size()); assertEquals(languageRule1, languageRuleList.get(0)); assertEquals(languageRule3, languageRuleList.get(1)); languageRuleList = document.getLanguageRuleList("xxx"); assertEquals(0, languageRuleList.size()); document.setCascade(false); languageRuleList = document.getLanguageRuleList("aaa"); assertEquals(1, languageRuleList.size()); assertEquals(languageRule1, languageRuleList.get(0)); } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/SrxDocument.java0000644000175000017500000000430711223073726025145 0ustar railrailpackage net.sourceforge.segment.srx; import java.util.ArrayList; import 
java.util.List; /** * Represents SRX document. Responsible for storing and searching matching * language rules for given language code. * * @author loomchild */ public class SrxDocument { /** * Default cascade value. */ public static final boolean DEFAULT_CASCADE = true; private boolean cascade; private List languageMapList; private SrxDocumentCache cache; /** * Creates empty document. * * @param cascade true if document is cascading */ public SrxDocument(boolean cascade) { this.cascade = cascade; this.languageMapList = new ArrayList(); this.cache = new SrxDocumentCache(); } /** * Creates empty document with default cascade. See {@link #DEFAULT_CASCADE}. */ public SrxDocument() { this(DEFAULT_CASCADE); } /** * Sets if document is cascading or not. * * @param cascade true f document is cascading */ public void setCascade(boolean cascade) { this.cascade = cascade; } /** * @return true if document is cascading */ public boolean getCascade() { return cascade; } /** * Add language map to this document. * * @param pattern language code pattern * @param languageRule */ public void addLanguageMap(String pattern, LanguageRule languageRule) { LanguageMap languageMap = new LanguageMap(pattern, languageRule); languageMapList.add(languageMap); } /** * If cascade is true then returns all language rules matching given * language code. If cascade is false returns first language rule matching * given language code. If no matching language rules are found returns * empty list. 
* * @param languageCode language code, for example en_US * @return matching language rules */ public List getLanguageRuleList(String languageCode) { List matchingLanguageRuleList = new ArrayList(); for (LanguageMap languageMap : languageMapList) { if (languageMap.matches(languageCode)) { matchingLanguageRuleList.add(languageMap.getLanguageRule()); if (!cascade) { break; } } } return matchingLanguageRuleList; } public SrxDocumentCache getCache() { return cache; } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/legacy/0000755000175000017500000000000011300444112023252 5ustar railrailsegment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/legacy/MergedPattern.java0000644000175000017500000001504511236362500026673 0ustar railrailpackage net.sourceforge.segment.srx.legacy; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.regex.Pattern; import net.sourceforge.segment.srx.LanguageRule; import net.sourceforge.segment.srx.Rule; import net.sourceforge.segment.util.Util; /** * Represents merged splitting pattern. * Responsible for merging breaking rules into one large pattern and * creating non breaking rules pattern. * @author loomchild */ public class MergedPattern { private int maxLookbehindConstructLength; private Pattern breakingPattern; private List nonBreakingPatternList; private List breakingRuleIndexList; public MergedPattern(List languageRuleList, int maxLookbehindConstructLength) { this.maxLookbehindConstructLength = maxLookbehindConstructLength; StringBuilder breakingPatternBuilder = new StringBuilder(); this.nonBreakingPatternList = new ArrayList(); // This list contains indexes of last breaking rules that occur before // given non breaking pattern on the list. // It has the same size as nonBreakingPatternList. // It is needed to recognize which non breaking rules to use for // given braking rule. this.breakingRuleIndexList = new ArrayList(); // Number or breaking rules already added to breaking pattern. 
int breakingRuleIndex = 0; List ruleList = extractRules(languageRuleList); List> ruleGroupList = groupRules(ruleList); for (List ruleGroup : ruleGroupList) { if (ruleGroup.get(0).isBreak()) { if (breakingPatternBuilder.length() > 0) { breakingPatternBuilder.append('|'); } // All breaking rules need to be merged because segmentation // need to be done in one pass when text is read from Reader. String breakingGroupPattern = createBreakingPattern(ruleGroup); breakingPatternBuilder.append(breakingGroupPattern); // Increase current braking rule index. breakingRuleIndex += ruleGroup.size(); } else { // Add non breaking pattern Pattern nonBreakingGroupPattern = Pattern.compile(createNonBreakingPattern(ruleGroup)); nonBreakingPatternList.add(nonBreakingGroupPattern); // Add the index of last breaking rule before given // non breaking pattern. breakingRuleIndexList.add(breakingRuleIndex); } } if (breakingPatternBuilder.length() > 0) { this.breakingPattern = Pattern.compile(breakingPatternBuilder .toString()); } else { // null means that that pattern will not match anything // (as empty pattern matches everything). this.breakingPattern = null; } } public Pattern getBreakingPattern() { return breakingPattern; } /** * Returns all applicable non breaking rules when breaking rule with a * given number was matched (non breaking rules that occur before * given breaking rule in SRX file). 
* @param breakingRuleIndex * @return Active non breaking patterns for a given breaking rule */ public List getNonBreakingPatternList(int breakingRuleIndex) { List result = new ArrayList(); Iterator patternIterator = nonBreakingPatternList.iterator(); for (int currentBreakingRuleIndex : breakingRuleIndexList) { if (currentBreakingRuleIndex >= breakingRuleIndex) { break; } result.add(patternIterator.next()); } return result; } /** * @param languageRuleList * @return merged list of rules form given language rules */ private List extractRules(List languageRuleList) { List ruleList = new ArrayList(); for (LanguageRule languageRule : languageRuleList) { ruleList.addAll(languageRule.getRuleList()); } return ruleList; } /** * Divides rules to groups where all rules in the same group are * either breaking or non breaking. Does not change rule order. * * @param ruleList * @return list of grouped rules */ private List> groupRules(List ruleList) { List> ruleGroupList = new ArrayList>(); List ruleGroup = null; Rule previousRule = null; for (Rule rule : ruleList) { if (previousRule == null || rule.isBreak() != previousRule.isBreak()) { ruleGroup = new ArrayList(); ruleGroupList.add(ruleGroup); } ruleGroup.add(rule); previousRule = rule; } return ruleGroupList; } /** * Merges all breaking rules on list into one pattern. * * @param ruleList * @return breaking pattern */ private String createBreakingPattern(List ruleList) { StringBuilder patternBuilder = new StringBuilder(); for (Rule rule : ruleList) { if (patternBuilder.length() > 0) { patternBuilder.append('|'); } // Capturing groups need to be removed from patterns as // they will interfere with capturing group order // which is used to recognize which breaking rule has been // applied and decide which non-breaking rules to use. 
String beforePattern = Util.removeCapturingGroups(rule.getBeforePattern()); String afterPattern = Util.removeCapturingGroups(rule.getAfterPattern()); // Whore pattern would be in lookahead because alternative // behaves differently in lookahead - first matching not first // in order is returned first. For example: // Input: "aaa" // Pattern "aaa|aa" matches "aaa", but pattern "aa|aaa" matches "aa". // Pattern "(?=aaa|aa)" always matches "aa". patternBuilder.append("(?="); patternBuilder.append(beforePattern); // This will be after break point. patternBuilder.append("()"); patternBuilder.append(afterPattern); patternBuilder.append(")"); } return patternBuilder.toString(); } /** * Creates non breaking pattern by merging given rules. * * @param ruleList * @return Non breaking pattern */ private String createNonBreakingPattern(List ruleList) { StringBuilder patternBuilder = new StringBuilder(); for (Rule rule : ruleList) { if (patternBuilder.length() > 0) { patternBuilder.append('|'); } // As Java does not allow infinite length patterns // in lookbehind, before pattern need to be shortened. 
String beforePattern = Util.finitize(rule.getBeforePattern(), maxLookbehindConstructLength); String afterPattern = rule.getAfterPattern(); patternBuilder.append("(?:"); if (beforePattern.length() > 0) { patternBuilder.append("(?<=" + beforePattern + ")"); } if (afterPattern.length() > 0) { patternBuilder.append("(?=" + afterPattern + ")"); } patternBuilder.append(")"); } return patternBuilder.toString(); } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/legacy/BufferTest.java0000644000175000017500000000323411221113076026174 0ustar railrailpackage net.sourceforge.segment.srx.legacy; import static org.junit.Assert.assertEquals; import org.junit.Test; public class BufferTest { @Test public void testWorking() { Buffer charQueue = new Buffer(3); assertEquals(3, charQueue.getCapacity()); assertEquals(0, charQueue.length()); charQueue.forceEnqueue('a'); charQueue.enqueue('b'); assertEquals(2, charQueue.length()); assertEquals('a', charQueue.charAt(0)); assertEquals('b', charQueue.charAt(1)); charQueue.enqueue('c'); charQueue.dequeue(); assertEquals('b', charQueue.charAt(0)); charQueue.dequeue(); assertEquals(1, charQueue.length()); charQueue.enqueue('d'); charQueue.enqueue('e'); assertEquals(3, charQueue.length()); assertEquals('c', charQueue.charAt(0)); assertEquals('d', charQueue.charAt(1)); assertEquals('e', charQueue.charAt(2)); charQueue.dequeue(); charQueue.forceEnqueue('f'); charQueue.forceEnqueue('g'); assertEquals(3, charQueue.length()); assertEquals('e', charQueue.charAt(0)); assertEquals('f', charQueue.charAt(1)); assertEquals('g', charQueue.charAt(2)); } @Test(expected = IllegalStateException.class) public void testOverflow() { Buffer charQueue = new Buffer(3); charQueue.enqueue('a'); charQueue.enqueue('b'); charQueue.enqueue('c'); charQueue.enqueue('d'); } @Test(expected = IllegalStateException.class) public void testUnderflow() { Buffer charQueue = new Buffer(3); charQueue.enqueue('a'); charQueue.dequeue(); charQueue.dequeue(); } @Test(expected = 
IndexOutOfBoundsException.class) public void testIndexOutOfBounds() { Buffer charQueue = new Buffer(3); charQueue.enqueue('a'); charQueue.charAt(2); } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/legacy/ReaderMatcherTest.java0000644000175000017500000000422511221113076027472 0ustar railrailpackage net.sourceforge.segment.srx.legacy; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import java.io.StringReader; import java.util.regex.Pattern; import org.junit.Test; public class ReaderMatcherTest { private static final String REGEX_DATA = "abcde - fghijklmno pqrstuvw - x -"; private static final Pattern REGEX = Pattern.compile(" - "); @Test public void testSimple() { StringReader reader = new StringReader(REGEX_DATA); CharSequence text = new ReaderCharSequence(reader); ReaderMatcher matcher = new ReaderMatcher(REGEX, text); boolean found; found = matcher.find(); assertTrue(found); assertEquals(5, matcher.start()); found = matcher.find(); assertTrue(found); assertEquals(27, matcher.start()); found = matcher.find(); assertFalse(found); assertTrue(matcher.hitEnd()); } private static final Pattern REGEX_WHOLE = Pattern.compile(".*"); @Test public void testWhole() { StringReader reader = new StringReader(REGEX_DATA); CharSequence text = new ReaderCharSequence(reader); ReaderMatcher matcher = new ReaderMatcher(REGEX_WHOLE, text); boolean found = matcher.find(); assertTrue(found); assertEquals(REGEX_DATA, matcher.group()); } private static final Pattern REGEX_ALTERNATIVE = Pattern.compile("b|bc"); @Test public void testAlternative() { StringReader reader = new StringReader(REGEX_DATA); CharSequence text = new ReaderCharSequence(reader); ReaderMatcher matcher = new ReaderMatcher(REGEX_ALTERNATIVE, text); boolean found = matcher.find(); assertTrue(found); assertEquals(1, matcher.start()); assertEquals(2, matcher.end()); found = matcher.find(); assertFalse(found); } private static 
final Pattern REGEX_LOOKING_AT = Pattern.compile("bcd"); @Test public void testLookingAt() { StringReader reader = new StringReader(REGEX_DATA); CharSequence text = new ReaderCharSequence(reader); ReaderMatcher matcher = new ReaderMatcher(REGEX_LOOKING_AT, text); matcher.region(1, text.length()); assertTrue(matcher.lookingAt()); matcher.region(2, text.length()); assertFalse(matcher.lookingAt()); } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/legacy/Buffer.java0000644000175000017500000000457111223073726025352 0ustar railrailpackage net.sourceforge.segment.srx.legacy; /** * Represents character buffer with fixed capacity. * Implements {@link CharSequence} methods and in addition queue methods. * @author loomchild */ public class Buffer implements CharSequence { private char[] buffer; private int head; private int size; private Buffer(char[] buffer, int head, int size) { this.buffer = buffer; this.head = head; this.size = size; } public Buffer(int capacity) { this(new char[capacity], 0, 0); } public int getCapacity() { return buffer.length; } public void enqueue(char character) { if (isFull()) { throw new IllegalStateException( "Not enough capacity to enqueue element"); } else { buffer[(head + length()) % getCapacity()] = character; ++size; } } public void dequeue() { if (isEmpty()) { throw new IllegalStateException("No element to dequeue"); } else { head = (head + 1) % getCapacity(); --size; } } /** * This is the same as: * if (buffer.length() * == buffer.getCapacity()) buffer.dequeue(); * buffer.enqueue(character); * * @param character */ public void forceEnqueue(char character) { buffer[(head + length()) % getCapacity()] = character; if (isFull()) { head = (head + 1) % getCapacity(); } else { ++size; } } public int length() { return size; } public char charAt(int index) { if (index < 0 || index >= length()) { throw new IndexOutOfBoundsException("Buffer index " + index + " not in <0, " + length() + ")."); } else { char character = buffer[(head + index) 
% getCapacity()]; return character; } } public CharSequence subSequence(int start, int end) { if (start < 0 || start > end || end > length()) { throw new IndexOutOfBoundsException("Buffer subsequence " + "<" + start + ", " + end + ") not in " + "<0, " + length() + ")."); } else { int subHead = (head + start) % getCapacity(); int subSize = end - start; return new Buffer(buffer, subHead, subSize); } } public String toString() { StringBuilder stringBuilder = new StringBuilder(size); int position = head; for (int i = 0; i < size; ++i) { stringBuilder.append(buffer[position]); position = (position + 1) % getCapacity(); } return stringBuilder.toString(); } private boolean isEmpty() { return length() == 0; } private boolean isFull() { return length() == getCapacity(); } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/legacy/ReaderMatcher.java0000644000175000017500000001176611223073726026653 0ustar railrailpackage net.sourceforge.segment.srx.legacy; import java.util.regex.MatchResult; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Represents {@link MatchResult} that uses {@link ReaderCharSequence} as * a text - it takes care of exceptions that are thrown by it. 
/**
 * Represents {@link MatchResult} that wraps a {@link Matcher} working on a
 * text whose reported length may decrease while it is being read (such as
 * {@link ReaderCharSequence}) - it takes care of exceptions that are thrown
 * by it.
 * <p>
 * Matching operations catch {@link IndexOutOfBoundsException}; when the text
 * length has decreased since the last check, the matcher is reset, its
 * region is limited to the new length and the operation is repeated.
 *
 * @author loomchild
 */
public class ReaderMatcher implements MatchResult {

	private final Matcher matcher;

	/** Text currently used by the matcher. */
	private CharSequence text;

	/** Length of the text seen at the last length check. */
	private int oldLength;

	/**
	 * @param pattern pattern to match
	 * @param text text to match against
	 */
	public ReaderMatcher(Pattern pattern, CharSequence text) {
		this.text = text;
		this.oldLength = text.length();
		this.matcher = pattern.matcher(text);
	}

	public ReaderMatcher appendReplacement(StringBuffer sb, String replacement) {
		matcher.appendReplacement(sb, replacement);
		return this;
	}

	public StringBuffer appendTail(StringBuffer sb) {
		return matcher.appendTail(sb);
	}

	public int end() {
		return matcher.end();
	}

	public int end(int group) {
		return matcher.end(group);
	}

	/**
	 * Finds next match. If the text turned out to be shorter than assumed,
	 * the search is repeated from the end of the previous match (or region
	 * start, whichever is greater) up to the new text length.
	 *
	 * @return true if match was found
	 */
	public boolean find() {
		boolean result = false;
		// Remember where the previous match ended before the matcher state
		// is possibly lost by an exception and reset.
		int end = getEnd();
		try {
			result = matcher.find();
		} catch (IndexOutOfBoundsException ignored) {
			// Expected when the text ends earlier than its reported length;
			// handled below by checking whether the length has changed.
		}
		if (lengthChanged()) {
			limitToCurrentLength(Math.max(end, matcher.regionStart()));
			result = matcher.find();
		}
		return result;
	}

	/**
	 * Resets the matcher and finds next match starting at given index.
	 *
	 * @param start index to start searching at
	 * @return true if match was found
	 */
	public boolean find(int start) {
		boolean result = false;
		try {
			result = matcher.find(start);
		} catch (IndexOutOfBoundsException ignored) {
			// Expected when the text ends earlier than its reported length.
		}
		if (lengthChanged()) {
			limitToCurrentLength(matcher.regionStart());
			result = matcher.find(start);
		}
		return result;
	}

	public String group() {
		return matcher.group();
	}

	public String group(int group) {
		return matcher.group(group);
	}

	public int groupCount() {
		return matcher.groupCount();
	}

	public boolean hasAnchoringBounds() {
		return matcher.hasAnchoringBounds();
	}

	public boolean hasTransparentBounds() {
		return matcher.hasTransparentBounds();
	}

	public boolean hitEnd() {
		return matcher.hitEnd();
	}

	/**
	 * @return true if a prefix of the region matches the pattern
	 */
	public boolean lookingAt() {
		boolean result = false;
		try {
			result = matcher.lookingAt();
		} catch (IndexOutOfBoundsException ignored) {
			// Expected when the text ends earlier than its reported length.
		}
		if (lengthChanged()) {
			limitToCurrentLength(matcher.regionStart());
			result = matcher.lookingAt();
		}
		return result;
	}

	/**
	 * @return true if the whole region matches the pattern
	 */
	public boolean matches() {
		boolean result = false;
		try {
			result = matcher.matches();
		} catch (IndexOutOfBoundsException ignored) {
			// Expected when the text ends earlier than its reported length.
		}
		if (lengthChanged()) {
			limitToCurrentLength(matcher.regionStart());
			result = matcher.matches();
		}
		return result;
	}

	public Pattern pattern() {
		return matcher.pattern();
	}

	public ReaderMatcher region(int start, int end) {
		matcher.region(start, end);
		return this;
	}

	public int regionEnd() {
		return matcher.regionEnd();
	}

	public int regionStart() {
		return matcher.regionStart();
	}

	public String replaceAll(String replacement) {
		String result = null;
		try {
			result = matcher.replaceAll(replacement);
		} catch (IndexOutOfBoundsException ignored) {
			// Expected when the text ends earlier than its reported length.
		}
		if (lengthChanged()) {
			// No need to set region because replaceAll resets matcher first.
			matcher.reset(text);
			result = matcher.replaceAll(replacement);
		}
		return result;
	}

	public String replaceFirst(String replacement) {
		String result = null;
		try {
			result = matcher.replaceFirst(replacement);
		} catch (IndexOutOfBoundsException ignored) {
			// Expected when the text ends earlier than its reported length.
		}
		if (lengthChanged()) {
			// No need to set region because replaceFirst resets matcher first.
			matcher.reset(text);
			result = matcher.replaceFirst(replacement);
		}
		return result;
	}

	public boolean requireEnd() {
		return matcher.requireEnd();
	}

	public ReaderMatcher reset() {
		matcher.reset();
		return this;
	}

	/**
	 * Resets this matcher with a new input text.
	 *
	 * @param input new text to match against
	 * @return this matcher
	 */
	public ReaderMatcher reset(CharSequence input) {
		matcher.reset(input);
		// Track the new input as well; otherwise length checks and retries
		// would keep using the previous text.
		this.text = input;
		this.oldLength = input.length();
		return this;
	}

	public int start() {
		return matcher.start();
	}

	public int start(int group) {
		return matcher.start(group);
	}

	public MatchResult toMatchResult() {
		return matcher.toMatchResult();
	}

	@Override
	public String toString() {
		return "ReaderMatcher: " + matcher.toString();
	}

	public ReaderMatcher useAnchoringBounds(boolean b) {
		matcher.useAnchoringBounds(b);
		return this;
	}

	public ReaderMatcher usePattern(Pattern newPattern) {
		matcher.usePattern(newPattern);
		return this;
	}

	public ReaderMatcher useTransparentBounds(boolean b) {
		matcher.useTransparentBounds(b);
		return this;
	}

	/**
	 * Resets the matcher on the current text and sets its region to start
	 * at given index and end at the smaller of current text length and
	 * previous region end.
	 *
	 * @param regionStart new region start
	 */
	private void limitToCurrentLength(int regionStart) {
		// Region end must be read before reset, which discards the region.
		int regionEnd = Math.min(text.length(), matcher.regionEnd());
		matcher.reset(text);
		matcher.region(regionStart, regionEnd);
	}

	/**
	 * @return end of the previous match, or 0 if there is no previous match
	 */
	private int getEnd() {
		try {
			return matcher.end();
		} catch (IllegalStateException e) {
			return 0;
		}
	}

	/**
	 * Checks whether the text became shorter since the last check and
	 * remembers the new length if so.
	 *
	 * @return true if text length has decreased
	 */
	private boolean lengthChanged() {
		if (text.length() < oldLength) {
			oldLength = text.length();
			return true;
		} else {
			return false;
		}
	}

}
* * @author loomchild */ public class AccurateSrxTextIterator extends AbstractTextIterator { private List languageRuleList; private CharSequence text; private String segment; private List ruleMatcherList; private int startPosition, endPosition; /** * Creates text iterator that obtains language rules form given document * using given language code. To retrieve language rules calls * {@link SrxDocument#getLanguageRuleList(String)}. * * @param document document containing language rules * @param languageCode language code to select the rules * @param text */ public AccurateSrxTextIterator(SrxDocument document, String languageCode, CharSequence text) { this.languageRuleList = document.getLanguageRuleList(languageCode); this.text = text; this.segment = null; this.startPosition = 0; this.endPosition = 0; this.ruleMatcherList = new LinkedList(); for (LanguageRule languageRule : languageRuleList) { for (Rule rule : languageRule.getRuleList()) { RuleMatcher matcher = new RuleMatcher(document, rule, text); ruleMatcherList.add(matcher); } } } /** * Wyszukuje następne dopasowanie. * @return Zwraca następny segment albo null jeśli nie istnieje * @throws IOSRuntimeException Zgłaszany gdy nastąpi błąd przy odczycie strumienia */ public String next() { if (hasNext()) { // Initialize matchers before first search. 
if (segment == null) { initMatchers(); } boolean found = false; while ((ruleMatcherList.size() > 0) && !found) { RuleMatcher minMatcher = getMinMatcher(); endPosition = minMatcher.getBreakPosition(); if (minMatcher.getRule().isBreak() && endPosition > startPosition) { found = true; cutMatchers(); } moveMatchers(); } if (!found) { endPosition = text.length(); } segment = text.subSequence(startPosition, endPosition).toString(); startPosition = endPosition; return segment; } else { return null; } } /** * @return Zwraca true gdy są dostępne kolejne segmenty */ public boolean hasNext() { return (startPosition < text.length()); } private void initMatchers() { for (Iterator i = ruleMatcherList.iterator(); i.hasNext();) { RuleMatcher matcher = i.next(); matcher.find(); if (matcher.hitEnd()) { i.remove(); } } } /** * Przesuwa iteratory na kolejną pozycje jeśli to konieczne. */ private void moveMatchers() { for (Iterator i = ruleMatcherList.iterator(); i.hasNext();) { RuleMatcher matcher = i.next(); while (matcher.getBreakPosition() <= endPosition) { matcher.find(); if (matcher.hitEnd()) { i.remove(); break; } } } } /** * Move matchers that start before previous segment end. 
*/ private void cutMatchers() { for (Iterator i = ruleMatcherList.iterator(); i.hasNext();) { RuleMatcher matcher = i.next(); if (matcher.getStartPosition() < endPosition) { matcher.find(endPosition); if (matcher.hitEnd()) { i.remove(); } } } } /** * @return Zwraca iterator pierwszego trafionego dopasowania */ private RuleMatcher getMinMatcher() { int minPosition = Integer.MAX_VALUE; RuleMatcher minMatcher = null; for (RuleMatcher matcher : ruleMatcherList) { if (matcher.getBreakPosition() < minPosition) { minPosition = matcher.getBreakPosition(); minMatcher = matcher; } } return minMatcher; } } ././@LongLink0000000000000000000000000000014700000000000011567 Lustar rootrootsegment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/legacy/AccurateSrxTextIteratorStringTest.javasegment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/legacy/AccurateSrxTextIteratorStringTest.ja0000644000175000017500000000072011223073726032434 0ustar railrailpackage net.sourceforge.segment.srx.legacy; import net.sourceforge.segment.TextIterator; import net.sourceforge.segment.srx.AbstractSrxTextIteratorTest; import net.sourceforge.segment.srx.SrxDocument; public class AccurateSrxTextIteratorStringTest extends AbstractSrxTextIteratorTest { protected TextIterator getTextIterator(String text, SrxDocument document, String languageCode) { return new AccurateSrxTextIterator(document, languageCode, text); } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/legacy/FastTextIteratorStringTest.java0000644000175000017500000000106311223475357031443 0ustar railrailpackage net.sourceforge.segment.srx.legacy; import net.sourceforge.segment.TextIterator; import net.sourceforge.segment.srx.AbstractSrxTextIteratorTest; import net.sourceforge.segment.srx.SrxDocument; import org.junit.Ignore; import org.junit.Test; public class FastTextIteratorStringTest extends AbstractSrxTextIteratorTest { protected TextIterator getTextIterator(String text, SrxDocument document, String languageCode) { return new 
FastTextIterator(document, languageCode, text); } @Ignore @Test public void testOverlappingBreakRules() { } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/legacy/FastTextIteratorReaderTest.java0000644000175000017500000000143211223475357031377 0ustar railrailpackage net.sourceforge.segment.srx.legacy; import java.io.StringReader; import net.sourceforge.segment.TextIterator; import net.sourceforge.segment.srx.AbstractSrxTextIteratorTest; import net.sourceforge.segment.srx.SrxDocument; import org.junit.Ignore; import org.junit.Test; public class FastTextIteratorReaderTest extends AbstractSrxTextIteratorTest { private static final int BUFFER_SIZE = 60; protected TextIterator getTextIterator(String text, SrxDocument document, String languageCode) { StringReader reader = new StringReader(text); CharSequence charSequence = new ReaderCharSequence(reader, Integer.MAX_VALUE, BUFFER_SIZE); return new FastTextIterator(document, languageCode, charSequence); } @Ignore @Test public void testOverlappingBreakRules() { } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/legacy/FastTextIterator.java0000644000175000017500000001521411236362500027404 0ustar railrailpackage net.sourceforge.segment.srx.legacy; import static net.sourceforge.segment.util.Util.getParameter; import java.io.Reader; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Pattern; import net.sourceforge.segment.AbstractTextIterator; import net.sourceforge.segment.srx.LanguageRule; import net.sourceforge.segment.srx.SrxDocument; import net.sourceforge.segment.srx.SrxTextIterator; /** * Represents fast text iterator that splits text according to SRX rules. 
* * @author loomchild */ public class FastTextIterator extends AbstractTextIterator { private CharSequence text; private String segment; private MergedPattern mergedPattern; private ReaderMatcher breakingMatcher; private int startPosition, endPosition; /** * Creates text iterator that obtains language rules form given document * using given language code. To retrieve language rules calls * {@link SrxDocument#getLanguageRuleList(String)}. * Supported parameters: * {@link SrxTextIterator#MAX_LOOKBEHIND_CONSTRUCT_LENGTH_PARAMETER}. * * @param document * document containing language rules * @param languageCode * language code to select the rule * @param text * @param parameterMap * additional segmentation parameters */ public FastTextIterator(SrxDocument document, String languageCode, CharSequence text, Map parameterMap) { this.text = text; this.segment = null; this.startPosition = 0; this.endPosition = 0; int maxLookbehindConstructLength = getParameter(parameterMap .get(SrxTextIterator.MAX_LOOKBEHIND_CONSTRUCT_LENGTH_PARAMETER), SrxTextIterator.DEFAULT_MAX_LOOKBEHIND_CONSTRUCT_LENGTH); List languageRuleList = document .getLanguageRuleList(languageCode); Object[] key = new Object[] { languageRuleList, maxLookbehindConstructLength }; this.mergedPattern = document.getCache().get(key, MergedPattern.class); if (mergedPattern == null) { mergedPattern = new MergedPattern(languageRuleList, maxLookbehindConstructLength); document.getCache().put(key, mergedPattern); } if (mergedPattern.getBreakingPattern() != null) { this.breakingMatcher = new ReaderMatcher(mergedPattern .getBreakingPattern(), text); } } /** * Creates text iterator with no additional parameters. 
* * @see #FastTextIterator(SrxDocument, String, CharSequence, Map) * @param document * document containing language rules * @param languageCode * language code to select the rule * @param text */ public FastTextIterator(SrxDocument document, String languageCode, CharSequence text) { this(document, languageCode, text, new HashMap()); } /** * Creates streaming text iterator that obtains language rules form given * document using given language code. To retrieve language rules calls * {@link SrxDocument#getLanguageRuleList(String)}. To handle streams uses * ReaderCharSequence, so not all possible regular expressions are accepted. * See {@link ReaderCharSequence} for details. * Supported parameters: * {@link SrxTextIterator#BUFFER_LENGTH_PARAMETER}, * {@link SrxTextIterator#MAX_LOOKBEHIND_CONSTRUCT_LENGTH_PARAMETER}. * * @param document * document containing language rules * @param languageCode * language code to select the rules * @param reader * reader from which text will be read * @param parameterMap * additional segmentation parameters */ public FastTextIterator(SrxDocument document, String languageCode, Reader reader, Map parameterMap) { this(document, languageCode, new ReaderCharSequence(reader, Integer.MAX_VALUE, getParameter(parameterMap .get(SrxTextIterator.BUFFER_LENGTH_PARAMETER), SrxTextIterator.DEFAULT_BUFFER_LENGTH)), parameterMap); } /** * Creates streaming text iterator with no additional parameters. 
* * @see #FastTextIterator(SrxDocument, String, Reader, Map) * @param document * document containing language rules * @param languageCode * language code to select the rules * @param reader * reader from which text will be read */ public FastTextIterator(SrxDocument document, String languageCode, Reader reader) { this(document, languageCode, reader, new HashMap()); } /** * {@inheritDoc} */ public String next() { if (hasNext()) { boolean found = false; if (breakingMatcher != null) { while (!found && breakingMatcher.find()) { // Find which breaking rule was matched in the matcher. // It must have matched some rule so check for // breakingMatcher.groupCount() is not necessary. int breakingRuleIndex = 1; while (breakingMatcher.group(breakingRuleIndex) == null) { ++breakingRuleIndex; } // Breaking position is at the end of the group. endPosition = breakingMatcher.end(breakingRuleIndex); // When there's more than one breaking rule at the given // place only the first is matched, the rest is skipped. // So if position is not increasing the new rules are // applied in the same place as previously matched rule. if (endPosition > startPosition) { found = true; // Get non breaking patterns that are applicable // to breaking rule just matched. List activeNonBreakingPatternList = mergedPattern .getNonBreakingPatternList(breakingRuleIndex); for (Pattern nonBreakingPattern : activeNonBreakingPatternList) { // Null non breaking pattern does not match anything if (nonBreakingPattern != null) { ReaderMatcher nonBreakingMatcher = new ReaderMatcher( nonBreakingPattern, text); nonBreakingMatcher.useTransparentBounds(true); // When using transparent bound the upper bound // is not important? // Needed because text.length() is unknown. nonBreakingMatcher.region(endPosition, endPosition); found = !nonBreakingMatcher.lookingAt(); } // Break when non-breaking rule matches if (!found) { break; } } } } // Breaking matcher cannot match text behind segment start in // the future. 
if (found && endPosition < text.length()) { breakingMatcher.region(endPosition, text.length()); } } if (!found) { endPosition = text.length(); } segment = text.subSequence(startPosition, endPosition).toString(); startPosition = endPosition; return segment; } else { return null; } } /** * {@inheritDoc} */ public boolean hasNext() { return (startPosition < text.length()); } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/legacy/ReaderCharSequence.java0000644000175000017500000001062311236362500027620 0ustar railrailpackage net.sourceforge.segment.srx.legacy; import java.io.IOException; import java.io.Reader; import net.sourceforge.segment.util.IORuntimeException; /** * Adapter of reader class to CharSequence interface. Due to behavior * differences CharSequence is not implemented perfectly. * * @author loomchild */ public class ReaderCharSequence implements CharSequence { public static final int DEFAULT_BUFFER_LENGTH = 64 * 1024; public static final int DEFAULT_LOOKAHEAD = 1; private Reader reader; private int lookahead; private Buffer buffer; private int position; private int length; /** * Create. * * @param reader reader from which char sequence will be read * @param length length of the input. 
When it cannot be determined it can be set to infinity * @param bufferLength size of the character buffer */ public ReaderCharSequence(Reader reader, int length, int bufferLength, int lookahead) { this.reader = reader; this.lookahead = lookahead; this.buffer = new Buffer(bufferLength); this.position = 0; this.length = length; fillBuffer(-1); } public ReaderCharSequence(Reader reader, int length, int bufferLength) { this(reader, length, bufferLength, DEFAULT_LOOKAHEAD); } public ReaderCharSequence(Reader reader, int length) { this(reader, length, DEFAULT_BUFFER_LENGTH); } public ReaderCharSequence(Reader reader) { this(reader, Integer.MAX_VALUE); } public int length() { return length; } public char charAt(int index) { if (index < 0 || index >= length) { throw new IndexOutOfBoundsException("Index " + index + " not in <0, " + length + ")"); } else if (index < getMinIndex()) { throw new IllegalStateException("Character lost, buffer too small."); } else { fillBuffer(index); if (index >= length) { throw new IndexOutOfBoundsException("End of stream."); } int relativeIndex = getRelativeIndex(index); char character = buffer.charAt(relativeIndex); return character; } } /** * The length of returned subsequence can be smaller than (end - start) when * the end of stream is reached. */ public CharSequence subSequence(int start, int end) { if (start < 0 || end < 0 || end > length || start > end) { throw new IndexOutOfBoundsException("Subsequence " + "<" + start + ", " + end + ") not in " + "<0, " + length + ")."); } else { fillBuffer(end - 1); if (end > length) { end = length; } if (start > end) { throw new IndexOutOfBoundsException("Subsequence " + "<" + start + ", " + end + ") not in " + "<0, " + length + ")."); } if (end - start > buffer.length() || start < getMinIndex()) { throw new IllegalStateException("Cannot retrieve subsequence " + "<" + start + ", " + end + "). 
" + "Characters lost, buffer too small."); } int relativeStart = getRelativeIndex(start); int relativeEnd = getRelativeIndex(end); CharSequence subSequence = buffer.subSequence(relativeStart, relativeEnd); return subSequence; } } /** *

	 * <p>
	 * Calculate buffer relative index from sequence index.
	 * </p>
	 * <pre>
	 * The equation is this:
	 * relative index = index - (position - buffer length)
	 * Example:
	 * position = 10
	 * buffer length = 5
	 * index = 7
	 *                    
	 *                    |   |
	 * position - buffer  |   |  index
	 *            length  V   V
	 * 
	 * stream  |-|-|-|-|-|-|-|-|-|-|
	 *          0 1 2 3 4 5 6 7 8 9 10  <--- position
	 * 
	 *                        |
	 *                        |  relative index
	 *                        V
	 * 
	 * buffer            |-|-|-|-|-|
	 *                    0 1 2 3 4 
	 * 
	 * relative index = 2
	 * </pre>
* * @param index sequence index * @return buffer relative index */ private int getRelativeIndex(int index) { return index - (position - buffer.length()); } private int getMinIndex() { return position - buffer.length(); } private void fillBuffer(int index) { // Index can be MAX_INT so all arithmetic operations should be // on the left side of equation to avoid integer overflow. while (index >= position - lookahead && position < length) { readCharacter(); } } private void readCharacter() { int readResult; try { readResult = reader.read(); } catch (IOException e) { throw new IORuntimeException(e); } if (readResult == -1) { length = position; } else { char character = (char)readResult; buffer.forceEnqueue(character); ++position; } } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/legacy/ReaderCharSequenceTest.java0000644000175000017500000000604411221113076030456 0ustar railrailpackage net.sourceforge.segment.srx.legacy; import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; import java.io.StringReader; import org.junit.Before; import org.junit.Test; public class ReaderCharSequenceTest { private static final String DATA = "abcde"; private ReaderCharSequence sequence; @Before public void setUp() { StringReader reader = new StringReader(DATA); this.sequence = new ReaderCharSequence(reader, DATA.length(), 3); } @Test public void testWorking() { assertEquals(5, sequence.length()); assertEquals('a', sequence.charAt(0)); assertEquals('b', sequence.charAt(1)); assertEquals('c', sequence.charAt(2)); assertEquals('d', sequence.charAt(3)); assertEquals("cde", sequence.subSequence(2, 5).toString()); assertEquals('e', sequence.charAt(4)); assertEquals("de", sequence.subSequence(3, 5).toString()); assertEquals('d', sequence.charAt(3)); } public void testZeroLengthSubsequence() { assertEquals(0, sequence.subSequence(1, 1).length()); } @Test(expected = IllegalStateException.class) public void testSubsequenceLongerThanBuffer() { 
assertEquals("abcde", sequence.subSequence(0, 5).toString()); } @Test(expected = IllegalStateException.class) public void testReadBack() { sequence.charAt(4); sequence.charAt(0); } @Test(expected = IllegalStateException.class) public void testWindowTooNarrow() { sequence.subSequence(0, 5); } @Test(expected = IndexOutOfBoundsException.class) public void testIndexPastEnd() { sequence.charAt(5); } @Test(expected = IndexOutOfBoundsException.class) public void testIndexNegative() { sequence.charAt(-1); } @Test(expected = IndexOutOfBoundsException.class) public void testEndBeforeStart() { sequence.subSequence(2, 1); } @Test(expected = IndexOutOfBoundsException.class) public void testStartNegative() { sequence.subSequence(-1, 1); } @Test(expected = IndexOutOfBoundsException.class) public void testEndPastEnd() { sequence.subSequence(1, 6); } @Test public void testInfiniteStream() { StringReader reader = new StringReader(DATA); ReaderCharSequence infiniteSequence = new ReaderCharSequence(reader, Integer.MAX_VALUE, 3, 2); assertEquals('c', infiniteSequence.charAt(2)); assertEquals('e', infiniteSequence.charAt(4)); assertEquals(5, infiniteSequence.length()); try { infiniteSequence.charAt(5); fail(); } catch (IndexOutOfBoundsException e) { } } @Test public void testAllSubsequence() { StringReader reader = new StringReader(DATA); ReaderCharSequence infiniteSequence = new ReaderCharSequence(reader, Integer.MAX_VALUE, 5, 2); CharSequence subsequence = infiniteSequence.subSequence(0, infiniteSequence.length()); assertEquals(DATA, subsequence.toString()); } @Test public void testIterate() { StringReader reader = new StringReader(DATA); ReaderCharSequence infiniteSequence = new ReaderCharSequence(reader, Integer.MAX_VALUE, 3); for (int i = 0; i < infiniteSequence.length(); ++i) { assertEquals(DATA.charAt(i), infiniteSequence.charAt(i)); } } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/RuleMatcherTest.java0000644000175000017500000000230211222077614025735 0ustar 
railrailpackage net.sourceforge.segment.srx; import junit.framework.TestCase; public class RuleMatcherTest extends TestCase { public void testFind() { SrxDocument document = new SrxDocument(); Rule rule = new Rule(true, "ab+", "ca+"); String text = "abaabbcabcabcaa"; RuleMatcher matcher = new RuleMatcher(document, rule, text); assertFalse(matcher.hitEnd()); assertTrue(matcher.find()); assertFalse(matcher.hitEnd()); assertEquals(3, matcher.getStartPosition()); assertEquals(6, matcher.getBreakPosition()); assertEquals(8, matcher.getEndPosition()); assertTrue(matcher.find()); assertFalse(matcher.hitEnd()); assertEquals(7, matcher.getStartPosition()); assertEquals(9, matcher.getBreakPosition()); assertEquals(11, matcher.getEndPosition()); assertTrue(matcher.find()); assertFalse(matcher.hitEnd()); assertEquals(10, matcher.getStartPosition()); assertEquals(12, matcher.getBreakPosition()); assertEquals(15, matcher.getEndPosition()); assertFalse(matcher.find()); assertTrue(matcher.hitEnd()); assertTrue(matcher.find(6)); assertEquals(7, matcher.getStartPosition()); assertEquals(9, matcher.getBreakPosition()); assertEquals(11, matcher.getEndPosition()); } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/SrxTransformer.java0000644000175000017500000000214511223073726025667 0ustar railrailpackage net.sourceforge.segment.srx; import java.io.Reader; import java.io.Writer; import java.util.Map; import net.sourceforge.segment.srx.io.SrxVersion; /** * Represents SRX document transformer between old versions and newest supported * version. Responsible for transforming using XSLT. * * @author loomchild * @see SrxVersion */ public interface SrxTransformer { /** * Transform given SRX document to newest supported version and write it to * given writer. 
/**
 * Represents SRX document cache.
 * Responsible for managing cached data. It can store more than one object
 * under one key as long as value class is different.
 *
 * Values are filed under their exact runtime class, so a value has to be
 * retrieved with the same class object it was stored under; looking it up
 * by a superclass or an interface returns null.
 *
 * @author loomchild
 */
public class SrxDocumentCache {

	/** Value class -> (key -> value). */
	private Map<Class<?>, Map<Object, Object>> map;

	/**
	 * Creates empty cache.
	 */
	public SrxDocumentCache() {
		this.map = new HashMap<Class<?>, Map<Object, Object>>();
	}

	/**
	 * Retrieves object from cache.
	 *
	 * @param <T> value object type
	 * @param key key the value was stored under
	 * @param valueClass class of value object
	 * @return value object, or null if nothing is cached for this key and
	 *         value class
	 */
	public <T> T get(Object key, Class<T> valueClass) {
		Map<Object, Object> klassMap = map.get(valueClass);
		if (klassMap == null) {
			return null;
		}
		// Entries of this inner map were stored under value.getClass(), so
		// the checked cast cannot fail and no unchecked warning is needed.
		return valueClass.cast(klassMap.get(key));
	}

	/**
	 * Puts an object in cache, replacing a previous value stored under the
	 * same key and the same value class.
	 *
	 * @param <T> value object type
	 * @param key key to store the value under
	 * @param value value object; must not be null because its class selects
	 *        the inner map
	 * @throws NullPointerException if value is null
	 */
	public <T> void put(Object key, T value) {
		Class<?> valueClass = value.getClass();
		Map<Object, Object> klassMap = map.get(valueClass);
		if (klassMap == null) {
			klassMap = new HashMap<Object, Object>();
			map.put(valueClass, klassMap);
		}
		klassMap.put(key, value);
	}

}
* @param rule */ public void addRule(Rule rule) { ruleList.add(rule); } /** * @return language rule name */ public String getName() { return name; } public int hashCode() { return name.hashCode(); } public boolean equals(Object object) { if (this == object) return true; if (object == null) return false; if (getClass() != object.getClass()) return false; LanguageRule other = (LanguageRule)object; if (name == null) { if (other.name != null) return false; } else if (!name.equals(other.name)) return false; return true; } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/LanguageMap.java0000644000175000017500000000155211223073726025052 0ustar railrailpackage net.sourceforge.segment.srx; import java.util.regex.Pattern; /** * Represents mapping between language code pattern and language rule. * * @author loomchild */ public class LanguageMap { private Pattern languagePattern; private LanguageRule languageRule; /** * Creates mapping. * * @param pattern language code pattern * @param languageRule language rule */ public LanguageMap(String pattern, LanguageRule languageRule) { this.languagePattern = Pattern.compile(pattern); this.languageRule = languageRule; } /** * @param languageCode language code * @return true if given language code matches language pattern */ public boolean matches(String languageCode) { return languagePattern.matcher(languageCode).matches(); } /** * @return language rule */ public LanguageRule getLanguageRule() { return languageRule; } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/Rule.java0000644000175000017500000000203411223475357023602 0ustar railrailpackage net.sourceforge.segment.srx; /** * Represents break or exception rule. Contains after break and before * break patterns, * * @author loomchild */ public class Rule { private boolean breaking; private String beforePattern; private String afterPattern; /** * Creates rule. 
* * @param breaking type of rule; true - break rule, false - exception rule * @param beforePattern pattern matching text before break * @param afterPattern pattern matching text after break */ public Rule(boolean breaking, String beforePattern, String afterPattern) { this.breaking = breaking; this.beforePattern = beforePattern; this.afterPattern = afterPattern; } /** * @return type of rule; true - break rule, false - exception rule */ public boolean isBreak() { return breaking; } /** * @return pattern matching text before break */ public String getBeforePattern() { return beforePattern; } /** * @return pattern matching text after break */ public String getAfterPattern() { return afterPattern; } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/RuleMatcher.java0000644000175000017500000000421211223073726025100 0ustar railrailpackage net.sourceforge.segment.srx; import java.util.regex.Matcher; import java.util.regex.Pattern; import net.sourceforge.segment.util.Util; /** * Represents matcher finding subsequent occurrences of one rule. * * @author loomchild */ public class RuleMatcher { @SuppressWarnings("unused") private SrxDocument document; private Rule rule; private CharSequence text; private Matcher beforeMatcher; private Matcher afterMatcher; boolean found; /** * Creates matcher. * @param rule rule which will be searched in the text * @param text */ public RuleMatcher(SrxDocument document, Rule rule, CharSequence text) { this.document = document; this.rule = rule; this.text = text; Pattern beforePattern = Util.compile(document, rule.getBeforePattern()); Pattern afterPattern = Util.compile(document, rule.getAfterPattern()); this.beforeMatcher = beforePattern.matcher(text); this.afterMatcher = afterPattern.matcher(text); this.found = true; } /** * Finds next rule match after previously found. 
* @return true if rule has been matched */ public boolean find() { found = false; while ((!found) && beforeMatcher.find()) { afterMatcher.region(beforeMatcher.end(), text.length()); found = afterMatcher.lookingAt(); } return found; } /** * Finds next rule match after given start position. * @param start start position * @return true if rule has been matched */ public boolean find(int start) { beforeMatcher.region(start, text.length()); return find(); } /** * @return true if end of text has been reached while searching */ public boolean hitEnd() { return !found; } /** * @return position in text where the last matching starts */ public int getStartPosition() { return beforeMatcher.start(); } /** * @return position in text where text should be splitted according to last matching */ public int getBreakPosition() { return afterMatcher.start(); } /** * @return position in text where the last matching ends */ public int getEndPosition() { return afterMatcher.end(); } /** * @return matcher rule */ public Rule getRule() { return rule; } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/SrxTextIteratorReaderTest.java0000644000175000017500000000137311236362500030003 0ustar railrailpackage net.sourceforge.segment.srx; import java.io.StringReader; import java.util.HashMap; import java.util.Map; import net.sourceforge.segment.TextIterator; public class SrxTextIteratorReaderTest extends AbstractSrxTextIteratorTest { private static final int BUFFER_SIZE = 60; private static final int MARGIN = 10; protected TextIterator getTextIterator(String text, SrxDocument document, String languageCode) { StringReader reader = new StringReader(text); Map parameterMap = new HashMap(); parameterMap.put(SrxTextIterator.BUFFER_LENGTH_PARAMETER, BUFFER_SIZE); parameterMap.put(SrxTextIterator.MARGIN_PARAMETER, MARGIN); return new SrxTextIterator(document, languageCode, reader, parameterMap); } } 
segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/TextManager.java0000644000175000017500000001014611236362500025102 0ustar railrailpackage net.sourceforge.segment.srx; import java.io.IOException; import java.io.Reader; import net.sourceforge.segment.util.IORuntimeException; /** * Represents text manager. * Responsible for managing current text, reading more text from the reader * and checking if there is more text left. * @author loomchild */ public class TextManager { private CharSequence text; int nextCharacter; private Reader reader; private int bufferLength; /** * Creates text manager containing given text. Reading more text is not * possible when using this constructor. * @param text */ public TextManager(CharSequence text) { this.text = text; this.nextCharacter = -1; this.reader = null; this.bufferLength = text.length(); } /** * Creates text manager reading text from given reader. Only specified * amount of memory for buffer will be used. Managed text will never * be longer than given buffer size. * Text is not actually read until required (lazy initialization). * @param reader * @param bufferLength read buffer size */ public TextManager(Reader reader, int bufferLength) { if (bufferLength <= 0) { throw new IllegalArgumentException("Buffer size: " + bufferLength + " must be positive."); } this.text = null; this.reader = reader; this.bufferLength = bufferLength; } public int getBufferLength() { return bufferLength; } /** * @return current text */ public CharSequence getText() { initText(); return text; } /** * @return true if more text can be read */ public boolean hasMoreText() { initText(); return nextCharacter != -1; } /** * Deletes given amount of characters from current character buffer and * tries to read up to given amount of new characters and stores them in * current character buffer. 
* @param amount amount of characters to read * @throws IllegalArgumentException if {@link #hasMoreText()} returns false or amount is greater than buffer size */ public void readText(int amount) { initText(); if (amount <= 0) { throw new IllegalArgumentException("Amount must be positive."); } if (amount > bufferLength) { throw new IllegalArgumentException("Amount to read is larger than buffer size."); } if (!hasMoreText()) { throw new IllegalStateException("No more text to read."); } StringBuilder builder = new StringBuilder(); // Text length is equal to buffer size so it is safe. builder.append(text.subSequence(amount, text.length())); // Next character cannot be null here, so it is safe. builder.append((char)nextCharacter); builder.append(read(amount)); text = builder.toString(); } /** * Reads initial text from reader if it has not been initialized yet. */ private void initText() { if (text == null) { text = read(bufferLength + 1); } } /** * Reads the given amount of characters and returns them as a string. * Updates {@link #nextCharacter} by reading one additional character. * @param amount amount to be read * @return read characters as a string */ private String read(int amount) { char[] charBuffer = new char[amount]; int count = read(reader, charBuffer); String result; if (count == amount) { result = new String(charBuffer, 0, count - 1); nextCharacter = charBuffer[count - 1]; } else if (count > 0 && count < amount) { result = new String(charBuffer, 0, count); nextCharacter = -1; } else { result = ""; nextCharacter = -1; } return result; } /** * Reads specified amount of characters. It is needed because when * reading from console {@link Reader#read(char[])} it returns * after first end of line (probably it checks if characters are available). 
* @param reader input * @param buffer buffer where read characters will be stored * @return number of read characters */ private int read(Reader reader, char[] buffer) { try { int start = 0; int count; while (((count = reader.read(buffer, start, buffer.length - start)) != -1) && start < buffer.length) { start += count; } return start; } catch (IOException e) { throw new IORuntimeException(e); } } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/SrxTextIterator.java0000644000175000017500000002666311236362500026031 0ustar railrailpackage net.sourceforge.segment.srx; import static net.sourceforge.segment.util.Util.getParameter; import java.io.Reader; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import net.sourceforge.segment.AbstractTextIterator; import net.sourceforge.segment.util.IORuntimeException; /** * Represents text iterator splitting text according to rules in SRX file. * * The algorithm idea is as follows: * 1. Rule matcher list is created based on SRX file and language. Each rule * matcher is responsible for matching before break and after break regular * expressions of one break rule. * 2. Each rule matcher is matched to the text. If the rule was not found the * rule matcher is removed from the list. * 3. First rule matcher in terms of its break position in text is selected. * 4. List of exception rules corresponding to break rule is retrieved. * 5. If none of exception rules is matching in break position then * the text is marked as split and new segment is created. In addition * all rule matchers are moved so they start after the end of new segment * (which is the same as break position of the matched rule). * 6. All the rules that have break position behind last matched rule * break position are moved until they pass it. * 7. If segment was not found the whole process is repeated. 
* * In streaming version of this algorithm character buffer is searched. * When the end of it is reached or break position is in the margin * (break position > buffer size - margin) and there is more text, * the buffer is moved in the text until it starts after last found segment. * If this happens rule matchers are reinitialized and the text is searched again. * Streaming version has a limitation that read buffer must be at least as long * as any segment in the text. * * As this algorithm uses lookbehind extensively but Java does not permit * infinite regular expressions in lookbehind, so some patterns are finitized. * For example a* pattern will be changed to something like a{0,100}. * * @author loomchild */ public class SrxTextIterator extends AbstractTextIterator { /** * Margin size. Used in streaming splitter. * If rule is matched but its position is in the margin * (position > bufferLength - margin) then the matching is ignored, * and more text is read and rule is matched again. */ public static final String MARGIN_PARAMETER = "margin"; /** * Reader buffer size. Segments cannot be longer than this value. */ public static final String BUFFER_LENGTH_PARAMETER = "bufferLength"; /** * Maximum length of a regular expression construct that occurs in lookbehind. */ public static final String MAX_LOOKBEHIND_CONSTRUCT_LENGTH_PARAMETER = "maxLookbehindConstructLength"; /** * Default margin size. */ public static final int DEFAULT_MARGIN = 128; /** * Default size of read buffer when using streaming version of this class. * Any segment cannot be longer than buffer size. */ public static final int DEFAULT_BUFFER_LENGTH = 64 * 1024; /** * Default max lookbehind construct length parameter. 
*/ public static final int DEFAULT_MAX_LOOKBEHIND_CONSTRUCT_LENGTH = 100; private SrxDocument document; private String segment; private int start, end; private TextManager textManager; private RuleManager ruleManager; private List ruleMatcherList; private int margin; /** * Creates text iterator that obtains language rules form given document * using given language code. This constructor version is not streaming * because it receives whole text as a string. * Supported parameters: {@link #MAX_LOOKBEHIND_CONSTRUCT_LENGTH_PARAMETER}. * * @param document SRX document * @param languageCode text language code of text used to retrieve the rules * @param text * @param parameterMap additional segmentation parameters */ public SrxTextIterator(SrxDocument document, String languageCode, CharSequence text, Map parameterMap) { parameterMap.put(MARGIN_PARAMETER, 0); init(document, languageCode, new TextManager(text), parameterMap); } /** * Creates text iterator with no additional parameters. * @see #SrxTextIterator(SrxDocument, String, CharSequence, Map) * @param document SRX document * @param languageCode text language code of text used to retrieve the rules * @param text */ public SrxTextIterator(SrxDocument document, String languageCode, CharSequence text) { this(document, languageCode, text, new HashMap()); } /** * Creates text iterator that obtains language rules from given document * using given language code. This is streaming constructor - it reads * text from reader using buffer with given size and margin. Single * segment cannot be longer than buffer size. * If rule is matched but its position is in the margin * (position > bufferLength - margin) then the matching is ignored, * and more text is read and rule is matched again. * This is needed because incomplete rule can be located at the end of the * buffer and never matched. * Supported parameters: {@link #BUFFER_LENGTH_PARAMETER}, * {@link #MARGIN_PARAMETER}, * {@link #MAX_LOOKBEHIND_CONSTRUCT_LENGTH_PARAMETER}. 
* * @param document SRX document * @param languageCode text language code of text used to retrieve the rules * @param reader reader from which read the text * @param parameterMap additional segmentation parameters */ public SrxTextIterator(SrxDocument document, String languageCode, Reader reader, Map parameterMap) { int bufferLength = getParameter(parameterMap.get(BUFFER_LENGTH_PARAMETER), DEFAULT_BUFFER_LENGTH); init(document, languageCode, new TextManager(reader, bufferLength), parameterMap); } /** * Creates streaming text iterator with no additional parameters. * @see SrxTextIterator#SrxTextIterator(SrxDocument, String, Reader, Map) * @param document SRX document * @param languageCode text language code of text used to retrieve the rules * @param reader reader from which read the text */ public SrxTextIterator(SrxDocument document, String languageCode, Reader reader) { this(document, languageCode, reader, new HashMap()); } /** * Finds the next segment in the text and returns it. * * @return next segment or null if it doesn't exist * @throws IllegalStateException if buffer is too small to hold the segment * @throws IORuntimeException if IO error occurs when reading the text */ public String next() { if (hasNext()) { // Initialize matchers before first search. 
if (segment == null) { initMatchers(); } boolean found = false; while (!found) { RuleMatcher minMatcher = getMinMatcher(); if (minMatcher == null && !textManager.hasMoreText()) { found = true; end = textManager.getText().length(); } else { if (textManager.hasMoreText() && (minMatcher == null || minMatcher.getBreakPosition() > textManager.getBufferLength() - margin)) { if (start == 0) { throw new IllegalStateException("Buffer too short"); } textManager.readText(start); start = 0; initMatchers(); minMatcher = getMinMatcher(); } end = minMatcher.getBreakPosition(); if (end > start) { found = isException(minMatcher); if (found) { cutMatchers(); } } } moveMatchers(); } segment = textManager.getText().subSequence(start, end).toString(); start = end; return segment; } else { return null; } } /** * @return true if there are more segments */ public boolean hasNext() { return (textManager.hasMoreText() || start < textManager.getText().length()); } /** * Initializes splitter. * * @param document SRX document * @param languageCode text language code * @param textManager text manager containing the text * @param parameterMap additional segmentation parameters */ private void init(SrxDocument document, String languageCode, TextManager textManager, Map parameterMap) { int margin = getParameter(parameterMap.get(MARGIN_PARAMETER), DEFAULT_MARGIN); int maxLookbehindConstructLength = getParameter(parameterMap.get( MAX_LOOKBEHIND_CONSTRUCT_LENGTH_PARAMETER), DEFAULT_MAX_LOOKBEHIND_CONSTRUCT_LENGTH); if (textManager.getBufferLength() > 0 && textManager.getBufferLength() <= margin) { throw new IllegalArgumentException("Margin: " + margin + " must be smaller than buffer itself: " + textManager.getBufferLength() + "."); } this.document = document; this.segment = null; this.start = 0; this.end = 0; this.textManager = textManager; this.margin = margin; List languageRuleList = document.getLanguageRuleList(languageCode); Object[] key = new Object[]{languageRuleList, 
maxLookbehindConstructLength}; this.ruleManager = document.getCache().get(key, RuleManager.class); if (ruleManager == null) { this.ruleManager = new RuleManager(document, languageRuleList, maxLookbehindConstructLength); document.getCache().put(key, ruleManager); } } /** * Initializes matcher list according to rules from ruleManager and * text from textManager. */ private void initMatchers() { this.ruleMatcherList = new LinkedList(); for (Rule rule : ruleManager.getBreakRuleList()) { RuleMatcher matcher = new RuleMatcher(document, rule, textManager.getText()); matcher.find(); if (!matcher.hitEnd()) { ruleMatcherList.add(matcher); } } } /** * Moves all matchers to the next position if their break position * is smaller than last segment end position. */ private void moveMatchers() { for (Iterator i = ruleMatcherList.iterator(); i.hasNext();) { RuleMatcher matcher = i.next(); while (matcher.getBreakPosition() <= end) { matcher.find(); if (matcher.hitEnd()) { i.remove(); break; } } } } /** * Move matchers that start before previous segment end. */ private void cutMatchers() { for (Iterator i = ruleMatcherList.iterator(); i.hasNext();) { RuleMatcher matcher = i.next(); if (matcher.getStartPosition() < end) { matcher.find(end); if (matcher.hitEnd()) { i.remove(); } } } } /** * @return first matcher in the text or null if there are no matchers */ private RuleMatcher getMinMatcher() { int minPosition = Integer.MAX_VALUE; RuleMatcher minMatcher = null; for (RuleMatcher matcher : ruleMatcherList) { if (matcher.getBreakPosition() < minPosition) { minPosition = matcher.getBreakPosition(); minMatcher = matcher; } } return minMatcher; } /** * Returns true if there are no exception rules preventing given * rule matcher from breaking the text. 
* @param ruleMatcher rule matcher * @return true if rule matcher breaks the text */ private boolean isException(RuleMatcher ruleMatcher) { Pattern pattern = ruleManager.getExceptionPattern(ruleMatcher.getRule()); if (pattern != null) { Matcher matcher = pattern.matcher(textManager.getText()); matcher.useTransparentBounds(true); matcher.region(ruleMatcher.getBreakPosition(), textManager.getText().length()); return !matcher.lookingAt(); } else { return true; } } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/0000755000175000017500000000000011300444112022415 5ustar railrailsegment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/Srx2Parser.java0000644000175000017500000000703111223475357025317 0ustar railrailpackage net.sourceforge.segment.srx.io; import static net.sourceforge.segment.util.Util.getContext; import static net.sourceforge.segment.util.Util.getReader; import static net.sourceforge.segment.util.Util.getResourceStream; import static net.sourceforge.segment.util.Util.getSchema; import java.io.Reader; import java.util.HashMap; import java.util.Map; import net.sourceforge.segment.srx.LanguageRule; import net.sourceforge.segment.srx.Rule; import net.sourceforge.segment.srx.SrxDocument; import net.sourceforge.segment.srx.SrxParser; import net.sourceforge.segment.srx.io.bind.Body; import net.sourceforge.segment.srx.io.bind.Languagemap; import net.sourceforge.segment.srx.io.bind.Languagerule; import net.sourceforge.segment.srx.io.bind.Srx; import net.sourceforge.segment.util.Bind; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; /** * Represents SRX 2.0 document parser. Responsible for creating and initializing * Document according to given SRX. 
* * @author loomchild */ public class Srx2Parser implements SrxParser { private static final Log log = LogFactory.getLog(Srx2Parser.class); private static final String CONTEXT = "net.sourceforge.segment.srx.io.bind"; private static final String SCHEMA = "net/sourceforge/segment/res/xml/srx20.xsd"; private static Bind bind = createBind(); private static Bind createBind() { // Macintosh Java 1.5 work-around borrowed from okapi library // When you use -XstartOnFirstThread as a java -Xarg on Leopard, // your ContextClassloader gets set to null. // On other Macs setting this value breaks everything. if (Thread.currentThread().getContextClassLoader() == null) { Thread.currentThread().setContextClassLoader(Srx2Parser.class.getClassLoader()); } // Must pass the ClassLoader directly due to Java 1.5 bugs when using // custom ClassLoader. Bind bind = new Bind( getContext(CONTEXT, Srx2Parser.class.getClassLoader()), getSchema(getReader(getResourceStream(SCHEMA)))); return bind; } /** * Parses SRX document from reader. 
* * @param reader * @return initialized document */ public SrxDocument parse(Reader reader) { Srx srx = (Srx) bind.unmarshal(reader); SrxDocument document = new SrxDocument(); document.setCascade("yes".equals(srx.getHeader().getCascade())); Body body = srx.getBody(); Map languageRuleMap = new HashMap(); for (Languagerule lr : body.getLanguagerules().getLanguagerule()) { LanguageRule languageRule = new LanguageRule(lr .getLanguagerulename()); for (net.sourceforge.segment.srx.io.bind.Rule r : lr.getRule()) { boolean breakRule = !"no".equals(r.getBreak()); String before; if (r.getBeforebreak() != null) { before = r.getBeforebreak().getContent(); } else { before = ""; } String after; if (r.getAfterbreak() != null) { after = r.getAfterbreak().getContent(); } else { after = ""; } Rule rule = new Rule(breakRule, before, after); languageRule.addRule(rule); } languageRuleMap.put(languageRule.getName(), languageRule); } for (Languagemap lm : body.getMaprules().getLanguagemap()) { LanguageRule languageRule = languageRuleMap.get(lm .getLanguagerulename()); if (languageRule == null) { log.warn("Language map \"" + lm.getLanguagepattern() + "\": language rule \"" + lm.getLanguagerulename() + "\" not found."); } else { document.addLanguageMap(lm.getLanguagepattern(), languageRule); } } return document; } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/SrxParsersTest.java0000644000175000017500000000702711221113076026246 0ustar railrailpackage net.sourceforge.segment.srx.io; import static net.sourceforge.segment.util.Util.getReader; import static net.sourceforge.segment.util.Util.getResourceStream; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import java.io.Reader; import java.util.List; import net.sourceforge.segment.srx.LanguageRule; import net.sourceforge.segment.srx.Rule; import net.sourceforge.segment.srx.SrxDocument; import net.sourceforge.segment.srx.SrxParser; import 
net.sourceforge.segment.util.XmlException; import org.junit.Test; public class SrxParsersTest { public static final String SRX_1_DOCUMENT_NAME = "net/sourceforge/segment/res/test/example1.srx"; public static final String SRX_2_DOCUMENT_NAME = "net/sourceforge/segment/res/test/example.srx"; public static final String TICKET_1_DOCUMENT_NAME = "net/sourceforge/segment/res/test/ticket1.srx"; public static final String INVALID_DOCUMENT_NAME = "net/sourceforge/segment/res/test/invalid.srx"; @Test public void testSrx1Parse() { testSrx1Parse(new Srx1Parser()); } @Test public void testSrx2Parse() { testSrx2Parse(new Srx2Parser()); } @Test public void testAnyParse() { testSrx1Parse(new SrxAnyParser()); testSrx2Parse(new SrxAnyParser()); } @Test(expected = XmlException.class) public void testSrx2ParseInvalid() { Reader reader = getReader(getResourceStream(INVALID_DOCUMENT_NAME)); SrxParser parser = new Srx2Parser(); parser.parse(reader); } @Test public void testSrx2ParseTicket1() { Reader reader = getReader(getResourceStream(TICKET_1_DOCUMENT_NAME)); SrxParser parser = new Srx2Parser(); SrxDocument document = parser.parse(reader); assertTrue(document.getCascade()); List languageRuleList = document .getLanguageRuleList("en"); LanguageRule languageRule = languageRuleList.get(0); assertEquals("Default", languageRule.getName()); List ruleList = languageRule.getRuleList(); assertEquals(1, ruleList.size()); Rule rule = ruleList.get(0); assertEquals("[\\.!?…]['»\"”\\)\\]\\}]?\\u0002?\\s", rule.getBeforePattern()); assertEquals("", rule.getAfterPattern()); } public void testSrx1Parse(SrxParser parser) { Reader reader = getReader(getResourceStream(SRX_1_DOCUMENT_NAME)); SrxDocument document = parser.parse(reader); assertFalse(document.getCascade()); List languageRuleList = document .getLanguageRuleList("en"); assertEquals(1, languageRuleList.size()); LanguageRule languageRule = languageRuleList.get(0); assertEquals("Default", languageRule.getName()); List ruleList = 
languageRule.getRuleList(); assertEquals(5, ruleList.size()); Rule rule = ruleList.get(1); assertEquals("[Ee][Tt][Cc]\\.", rule.getBeforePattern()); assertEquals("\\s[a-z]", rule.getAfterPattern()); } public void testSrx2Parse(SrxParser parser) { Reader reader = getReader(getResourceStream(SRX_2_DOCUMENT_NAME)); SrxDocument document = parser.parse(reader); assertTrue(document.getCascade()); List languageRuleList = document .getLanguageRuleList("fr_FR"); assertEquals(2, languageRuleList.size()); LanguageRule languageRule = languageRuleList.get(0); assertEquals("French", languageRule.getName()); List ruleList = languageRule.getRuleList(); assertEquals(4, ruleList.size()); Rule rule = ruleList.get(1); assertEquals("\\s[Mm]lles\\.", rule.getBeforePattern()); assertEquals("\\s", rule.getAfterPattern()); } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/SrxVersion.java0000644000175000017500000000426411223073726025425 0ustar railrailpackage net.sourceforge.segment.srx.io; import java.io.BufferedReader; import java.io.IOException; import java.io.Reader; import java.util.regex.Matcher; import java.util.regex.Pattern; import net.sourceforge.segment.util.IORuntimeException; /** * Represents SRX version. * Responsible for retrieving SRX version from a reader without modifying it. * @author loomchild */ public enum SrxVersion { VERSION_1_0("1.0"), VERSION_2_0("2.0"); private final static int HEADER_BUFFER_LENGHT = 1024; private final static Pattern VERSION_PATTERN = Pattern .compile("]+version=\"([^\"]+)\""); private String versionString; private SrxVersion(String versionString) { this.versionString = versionString; } public String toString() { return versionString; } public static SrxVersion parse(String versionString) { for (SrxVersion version : SrxVersion.values()) { if (version.versionString.equals(versionString)) { return version; } } throw new IllegalArgumentException("Unrecognized version: " + versionString + "."); } /** * Returns SRX document version. 
Works simply by looking in document header * of length {@link #HEADER_BUFFER_LENGHT} * and trying to match version regular expression, so it is imperfect. * It must be possible to mark the given reader ({@link Reader#mark(int)}). * * @param reader buffered reader containing SRX document with unknown version * @return version string * @throws IORuntimeException if IO error occurs * @throws IllegalArgumentException if reader does not support marking */ public static SrxVersion parse(BufferedReader reader) { try { if (!reader.markSupported()) { throw new IllegalArgumentException("Mark not supported for reader."); } reader.mark(HEADER_BUFFER_LENGHT); char[] headerBuffer = new char[HEADER_BUFFER_LENGHT]; int count = reader.read(headerBuffer); String header = new String(headerBuffer, 0, count); reader.reset(); Matcher matcher = VERSION_PATTERN.matcher(header); String versionString = null; if (matcher.find()) { versionString = matcher.group(1); } return parse(versionString); } catch (IOException e) { throw new IORuntimeException(e); } } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/Srx1Parser.java0000644000175000017500000000201611223073726025306 0ustar railrailpackage net.sourceforge.segment.srx.io; import java.io.Reader; import java.util.Collections; import java.util.Map; import net.sourceforge.segment.srx.SrxDocument; import net.sourceforge.segment.srx.SrxParser; import net.sourceforge.segment.srx.SrxTransformer; /** * Represents SRX 1.0 parser. Transforms document to SRX 2.0 using * {@link Srx1Transformer} and then parses it using {@link Srx2Parser}. * * @author loomchild */ public class Srx1Parser implements SrxParser { /** * Transforms document to SRX 2.0 using {@link Srx1Transformer} and default * transformation parameters and parses it using {@link Srx2Parser}. 
*
     * @param reader reader from which to read the document
     * @return initialized SRX document
     */
    public SrxDocument parse(Reader reader) {
        // Default transformation: no parameters.
        Map noParameters = Collections.emptyMap();
        SrxTransformer toSrx2 = new Srx1Transformer();
        Reader srx2Reader = toSrx2.transform(reader, noParameters);
        return new Srx2Parser().parse(srx2Reader);
    }

}
segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/SrxVersionTest.java0000644000175000017500000000306211221113076026247 0ustar railrailpackage net.sourceforge.segment.srx.io; import static net.sourceforge.segment.util.Util.getReader; import static net.sourceforge.segment.util.Util.getResourceStream; import java.io.BufferedReader; import junit.framework.TestCase;

/**
 * Tests version detection on SRX and non-SRX example documents.
 */
public class SrxVersionTest extends TestCase {

    public static final String SRX_1_DOCUMENT_NAME = "net/sourceforge/segment/res/test/example1.srx";

    public static final String SRX_2_DOCUMENT_NAME = "net/sourceforge/segment/res/test/example2.srx";

    public static final String NO_SRX_DOCUMENT_NAME = "net/sourceforge/segment/res/test/some.xml";

    public static final String SRX_NOVERSION_DOCUMENT_NAME = "net/sourceforge/segment/res/test/invalid.srx";

    /**
     * Both example documents must yield their version; a non-SRX document
     * and an SRX document without a version must be rejected.
     */
    public void testGetSrxVersion() {
        assertEquals(SrxVersion.VERSION_1_0,
                SrxVersion.parse(open(SRX_1_DOCUMENT_NAME)));
        assertEquals(SrxVersion.VERSION_2_0,
                SrxVersion.parse(open(SRX_2_DOCUMENT_NAME)));

        assertRejected(NO_SRX_DOCUMENT_NAME,
                "Recognized version of non SRX document.");
        assertRejected(SRX_NOVERSION_DOCUMENT_NAME,
                "Recognized version of SRX document without version.");
    }

    /** Opens the named classpath resource as a buffered reader. */
    private static BufferedReader open(String resourceName) {
        return new BufferedReader(getReader(getResourceStream(resourceName)));
    }

    /** Fails with the given message unless parsing throws IllegalArgumentException. */
    private static void assertRejected(String resourceName, String failureMessage) {
        try {
            SrxVersion.parse(open(resourceName));
            fail(failureMessage);
        } catch (IllegalArgumentException expected) {
            // OK
        }
    }

}
segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/bind/0000755000175000017500000000000011300444112023331 5ustar railrailsegment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/bind/Body.java0000644000175000017500000000467711171331077025122 0ustar railrail// // This file was generated by the JavaTM Architecture for XML Binding(JAXB) Reference Implementation, vJAXB 2.1.3 in JDK 1.6 // See http://java.sun.com/xml/jaxb // Any modifications to this file will be lost upon recompilation of the source schema. // Generated on: 2009.04.15 at 10:20:40 AM BST // package net.sourceforge.segment.srx.io.bind; import javax.xml.bind.annotation.XmlAccessType; import javax.xml.bind.annotation.XmlAccessorType; import javax.xml.bind.annotation.XmlElement; import javax.xml.bind.annotation.XmlRootElement; import javax.xml.bind.annotation.XmlType; /** *

Java class for anonymous complex type. * *

The following schema fragment specifies the expected content contained within this class. * *

 * <complexType>
 *   <complexContent>
 *     <restriction base="{http://www.w3.org/2001/XMLSchema}anyType">
 *       <sequence>
 *         <element ref="{http://www.lisa.org/srx20}languagerules"/>
 *         <element ref="{http://www.lisa.org/srx20}maprules"/>
 *       </sequence>
 *     </restriction>
 *   </complexContent>
 * </complexType>
 * 
* * */
@XmlAccessorType(XmlAccessType.FIELD)
@XmlType(name = "", propOrder = {
    "languagerules",
    "maprules"
})
@XmlRootElement(name = "body")
public class Body {

    // Required languagerules child; the SRX parser walks it to build the
    // named language rules.
    @XmlElement(required = true)
    protected Languagerules languagerules;
    // Required maprules child; the SRX parser walks it to attach language
    // patterns to those rules.
    @XmlElement(required = true)
    protected Maprules maprules;

    /**
     * Gets the value of the languagerules property.
     * 
     * @return
     *     possible object is
     *     {@link Languagerules }
     *     
     */
    public Languagerules getLanguagerules() {
        return languagerules;
    }

    /**
     * Sets the value of the languagerules property.
     * 
     * @param value
     *     allowed object is
     *     {@link Languagerules }
     *     
     */
    public void setLanguagerules(Languagerules value) {
        this.languagerules = value;
    }

    /**
     * Gets the value of the maprules property.
     * 
     * @return
     *     possible object is
     *     {@link Maprules }
     *     
     */
    public Maprules getMaprules() {
        return maprules;
    }

    /**
     * Sets the value of the maprules property.
     * 
     * @param value
     *     allowed object is
     *     {@link Maprules }
     *     
     */
    public void setMaprules(Maprules value) {
        this.maprules = value;
    }

}
segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/bind/Header.java0000644000175000017500000001156411171542024025402 0ustar railrail// // This file was generated by the JavaTM Architecture for XML Binding(JAXB) Reference Implementation, vJAXB 2.1.3 in JDK 1.6 // See http://java.sun.com/xml/jaxb // Any modifications to this file will be lost upon recompilation of the source schema. // Generated on: 2009.04.15 at 10:20:40 AM BST // package net.sourceforge.segment.srx.io.bind; import java.util.ArrayList; import java.util.List; import javax.xml.bind.annotation.XmlAccessType; import javax.xml.bind.annotation.XmlAccessorType; import javax.xml.bind.annotation.XmlAnyElement; import javax.xml.bind.annotation.XmlAttribute; import javax.xml.bind.annotation.XmlRootElement; import javax.xml.bind.annotation.XmlType; import org.w3c.dom.Element; /** *

Java class for anonymous complex type. * *

The following schema fragment specifies the expected content contained within this class. * *

 * <complexType>
 *   <complexContent>
 *     <restriction base="{http://www.w3.org/2001/XMLSchema}anyType">
 *       <sequence>
 *         <element ref="{http://www.lisa.org/srx20}formathandle" maxOccurs="3" minOccurs="0"/>
 *         <any/>
 *       </sequence>
 *       <attribute name="segmentsubflows" use="required">
 *         <simpleType>
 *           <restriction base="{http://www.w3.org/2001/XMLSchema}string">
 *             <enumeration value="yes"/>
 *             <enumeration value="no"/>
 *           </restriction>
 *         </simpleType>
 *       </attribute>
 *       <attribute name="cascade" use="required">
 *         <simpleType>
 *           <restriction base="{http://www.w3.org/2001/XMLSchema}string">
 *             <enumeration value="yes"/>
 *             <enumeration value="no"/>
 *           </restriction>
 *         </simpleType>
 *       </attribute>
 *     </restriction>
 *   </complexContent>
 * </complexType>
 * 
* * */
@XmlAccessorType(XmlAccessType.FIELD)
@XmlType(name = "", propOrder = {
    "formathandle",
    "any"
})
@XmlRootElement(name = "header")
public class Header {

    // Element types restored on both lists; the raw declarations had lost
    // their type arguments.
    protected List<Formathandle> formathandle;
    @XmlAnyElement(lax = true)
    protected List<Object> any;
    // Required attribute; the schema allows "yes" or "no".
    @XmlAttribute(required = true)
    protected String segmentsubflows;
    // Required attribute; the schema allows "yes" or "no".
    @XmlAttribute(required = true)
    protected String cascade;

    /**
     * Gets the value of the formathandle property.
     * 
     * <p>
     * This accessor method returns a reference to the live list,
     * not a snapshot. Therefore any modification you make to the
     * returned list will be present inside the JAXB object.
     * This is why there is not a {@code set} method for the formathandle property.
     * 
     * <p>
     * For example, to add a new item, do as follows:
     * <pre>
     *    getFormathandle().add(newItem);
     * </pre>
     * 
     * 
     * <p>
     * Objects of the following type(s) are allowed in the list
     * {@link Formathandle }
     * 
     * @return live, lazily created list; never null
     */
    public List<Formathandle> getFormathandle() {
        if (formathandle == null) {
            formathandle = new ArrayList<Formathandle>();
        }
        return this.formathandle;
    }

    /**
     * Gets the value of the any property.
     * 
     * <p>
     * This accessor method returns a reference to the live list,
     * not a snapshot. Therefore any modification you make to the
     * returned list will be present inside the JAXB object.
     * This is why there is not a {@code set} method for the any property.
     * 
     * <p>
     * For example, to add a new item, do as follows:
     * <pre>
     *    getAny().add(newItem);
     * </pre>
     * 
     * 
     * <p>
     * Objects of the following type(s) are allowed in the list
     * {@link Element }
     * {@link Object }
     * 
     * @return live, lazily created list; never null
     */
    public List<Object> getAny() {
        if (any == null) {
            any = new ArrayList<Object>();
        }
        return this.any;
    }

    /**
     * Gets the value of the segmentsubflows property.
     * 
     * @return
     *     possible object is
     *     {@link String }
     *     
     */
    public String getSegmentsubflows() {
        return segmentsubflows;
    }

    /**
     * Sets the value of the segmentsubflows property.
     * 
     * @param value
     *     allowed object is
     *     {@link String }
     *     
     */
    public void setSegmentsubflows(String value) {
        this.segmentsubflows = value;
    }

    /**
     * Gets the value of the cascade property.
     * 
     * @return
     *     possible object is
     *     {@link String }
     *     
     */
    public String getCascade() {
        return cascade;
    }

    /**
     * Sets the value of the cascade property.
     * 
     * @param value
     *     allowed object is
     *     {@link String }
     *     
     */
    public void setCascade(String value) {
        this.cascade = value;
    }

}
segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/bind/Beforebreak.java0000644000175000017500000000320111171331077026412 0ustar railrail// // This file was generated by the JavaTM Architecture for XML Binding(JAXB) Reference Implementation, vJAXB 2.1.3 in JDK 1.6 // See http://java.sun.com/xml/jaxb // Any modifications to this file will be lost upon recompilation of the source schema. // Generated on: 2009.04.15 at 10:20:40 AM BST // package net.sourceforge.segment.srx.io.bind; import javax.xml.bind.annotation.XmlAccessType; import javax.xml.bind.annotation.XmlAccessorType; import javax.xml.bind.annotation.XmlRootElement; import javax.xml.bind.annotation.XmlType; import javax.xml.bind.annotation.XmlValue; /** *

Java class for anonymous complex type. * *

The following schema fragment specifies the expected content contained within this class. * *

 * <complexType>
 *   <complexContent>
 *     <restriction base="{http://www.w3.org/2001/XMLSchema}anyType">
 *     </restriction>
 *   </complexContent>
 * </complexType>
 * 
* * */
@XmlAccessorType(XmlAccessType.FIELD)
@XmlType(name = "", propOrder = {
    "content"
})
@XmlRootElement(name = "beforebreak")
public class Beforebreak {

    // Text content of the element; the SRX parser passes it to the rule
    // as the pattern that must match before the break position.
    @XmlValue
    protected String content;

    /**
     * Gets the value of the content property.
     * 
     * @return
     *     possible object is
     *     {@link String }
     *     
     */
    public String getContent() {
        return content;
    }

    /**
     * Sets the value of the content property.
     * 
     * @param value
     *     allowed object is
     *     {@link String }
     *     
     */
    public void setContent(String value) {
        this.content = value;
    }

}
segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/bind/Languagerules.java0000644000175000017500000000436011171542024027004 0ustar railrail// // This file was generated by the JavaTM Architecture for XML Binding(JAXB) Reference Implementation, vJAXB 2.1.3 in JDK 1.6 // See http://java.sun.com/xml/jaxb // Any modifications to this file will be lost upon recompilation of the source schema. // Generated on: 2009.04.15 at 10:20:40 AM BST // package net.sourceforge.segment.srx.io.bind; import java.util.ArrayList; import java.util.List; import javax.xml.bind.annotation.XmlAccessType; import javax.xml.bind.annotation.XmlAccessorType; import javax.xml.bind.annotation.XmlElement; import javax.xml.bind.annotation.XmlRootElement; import javax.xml.bind.annotation.XmlType; /** *

Java class for anonymous complex type. * *

The following schema fragment specifies the expected content contained within this class. * *

 * <complexType>
 *   <complexContent>
 *     <restriction base="{http://www.w3.org/2001/XMLSchema}anyType">
 *       <sequence>
 *         <element ref="{http://www.lisa.org/srx20}languagerule" maxOccurs="unbounded"/>
 *       </sequence>
 *     </restriction>
 *   </complexContent>
 * </complexType>
 * 
* * */
@XmlAccessorType(XmlAccessType.FIELD)
@XmlType(name = "", propOrder = {
    "languagerule"
})
@XmlRootElement(name = "languagerules")
public class Languagerules {

    // Element type restored: the SRX parser iterates this list with a
    // Languagerule loop variable, which a raw List does not allow.
    @XmlElement(required = true)
    protected List<Languagerule> languagerule;

    /**
     * Gets the value of the languagerule property.
     * 
     * <p>
     * This accessor method returns a reference to the live list,
     * not a snapshot. Therefore any modification you make to the
     * returned list will be present inside the JAXB object.
     * This is why there is not a {@code set} method for the languagerule property.
     * 
     * <p>
     * For example, to add a new item, do as follows:
     * <pre>
     *    getLanguagerule().add(newItem);
     * </pre>
     * 
     * 
     * <p>
     * Objects of the following type(s) are allowed in the list
     * {@link Languagerule }
     * 
     * @return live, lazily created list; never null
     */
    public List<Languagerule> getLanguagerule() {
        if (languagerule == null) {
            languagerule = new ArrayList<Languagerule>();
        }
        return this.languagerule;
    }

}
segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/bind/Afterbreak.java0000644000175000017500000000317711171331077026265 0ustar railrail// // This file was generated by the JavaTM Architecture for XML Binding(JAXB) Reference Implementation, vJAXB 2.1.3 in JDK 1.6 // See http://java.sun.com/xml/jaxb // Any modifications to this file will be lost upon recompilation of the source schema. // Generated on: 2009.04.15 at 10:20:40 AM BST // package net.sourceforge.segment.srx.io.bind; import javax.xml.bind.annotation.XmlAccessType; import javax.xml.bind.annotation.XmlAccessorType; import javax.xml.bind.annotation.XmlRootElement; import javax.xml.bind.annotation.XmlType; import javax.xml.bind.annotation.XmlValue; /** *

Java class for anonymous complex type. * *

The following schema fragment specifies the expected content contained within this class. * *

 * <complexType>
 *   <complexContent>
 *     <restriction base="{http://www.w3.org/2001/XMLSchema}anyType">
 *     </restriction>
 *   </complexContent>
 * </complexType>
 * 
* * */
@XmlAccessorType(XmlAccessType.FIELD)
@XmlType(name = "", propOrder = {
    "content"
})
@XmlRootElement(name = "afterbreak")
public class Afterbreak {

    // Text content of the element; the SRX parser passes it to the rule
    // as the pattern that must match after the break position.
    @XmlValue
    protected String content;

    /**
     * Gets the value of the content property.
     * 
     * @return
     *     possible object is
     *     {@link String }
     *     
     */
    public String getContent() {
        return content;
    }

    /**
     * Sets the value of the content property.
     * 
     * @param value
     *     allowed object is
     *     {@link String }
     *     
     */
    public void setContent(String value) {
        this.content = value;
    }

}
segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/bind/Srx.java0000644000175000017500000000617511171331077024774 0ustar railrail// // This file was generated by the JavaTM Architecture for XML Binding(JAXB) Reference Implementation, vJAXB 2.1.3 in JDK 1.6 // See http://java.sun.com/xml/jaxb // Any modifications to this file will be lost upon recompilation of the source schema. // Generated on: 2009.04.15 at 10:20:40 AM BST // package net.sourceforge.segment.srx.io.bind; import javax.xml.bind.annotation.XmlAccessType; import javax.xml.bind.annotation.XmlAccessorType; import javax.xml.bind.annotation.XmlAttribute; import javax.xml.bind.annotation.XmlElement; import javax.xml.bind.annotation.XmlRootElement; import javax.xml.bind.annotation.XmlType; /** *

Java class for anonymous complex type. * *

The following schema fragment specifies the expected content contained within this class. * *

 * <complexType>
 *   <complexContent>
 *     <restriction base="{http://www.w3.org/2001/XMLSchema}anyType">
 *       <sequence>
 *         <element ref="{http://www.lisa.org/srx20}header"/>
 *         <element ref="{http://www.lisa.org/srx20}body"/>
 *       </sequence>
 *       <attribute name="version" use="required">
 *         <simpleType>
 *           <restriction base="{http://www.w3.org/2001/XMLSchema}string">
 *             <enumeration value="2.0"/>
 *           </restriction>
 *         </simpleType>
 *       </attribute>
 *     </restriction>
 *   </complexContent>
 * </complexType>
 * 
* * */
@XmlAccessorType(XmlAccessType.FIELD)
@XmlType(name = "", propOrder = {
    "header",
    "body"
})
@XmlRootElement(name = "srx")
public class Srx {

    // Required header element; the SRX parser reads its cascade attribute.
    @XmlElement(required = true)
    protected Header header;
    // Required body element holding the language rules and the map rules.
    @XmlElement(required = true)
    protected Body body;
    // Required version attribute; the schema fragment in the class comment
    // enumerates only "2.0".
    @XmlAttribute(required = true)
    protected String version;

    /**
     * Gets the value of the header property.
     * 
     * @return
     *     possible object is
     *     {@link Header }
     *     
     */
    public Header getHeader() {
        return header;
    }

    /**
     * Sets the value of the header property.
     * 
     * @param value
     *     allowed object is
     *     {@link Header }
     *     
     */
    public void setHeader(Header value) {
        this.header = value;
    }

    /**
     * Gets the value of the body property.
     * 
     * @return
     *     possible object is
     *     {@link Body }
     *     
     */
    public Body getBody() {
        return body;
    }

    /**
     * Sets the value of the body property.
     * 
     * @param value
     *     allowed object is
     *     {@link Body }
     *     
     */
    public void setBody(Body value) {
        this.body = value;
    }

    /**
     * Gets the value of the version property.
     * 
     * @return
     *     possible object is
     *     {@link String }
     *     
     */
    public String getVersion() {
        return version;
    }

    /**
     * Sets the value of the version property.
     * 
     * @param value
     *     allowed object is
     *     {@link String }
     *     
     */
    public void setVersion(String value) {
        this.version = value;
    }

}
segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/bind/package-info.java0000644000175000017500000000103511171331077026532 0ustar railrail// // This file was generated by the JavaTM Architecture for XML Binding(JAXB) Reference Implementation, vJAXB 2.1.3 in JDK 1.6 // See http://java.sun.com/xml/jaxb // Any modifications to this file will be lost upon recompilation of the source schema.
// Generated on: 2009.04.15 at 10:20:40 AM BST // @javax.xml.bind.annotation.XmlSchema(namespace = "http://www.lisa.org/srx20", elementFormDefault = javax.xml.bind.annotation.XmlNsForm.QUALIFIED) package net.sourceforge.segment.srx.io.bind; segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/bind/Languagemap.java0000644000175000017500000000473611171331077026442 0ustar railrail// // This file was generated by the JavaTM Architecture for XML Binding(JAXB) Reference Implementation, vJAXB 2.1.3 in JDK 1.6 // See http://java.sun.com/xml/jaxb // Any modifications to this file will be lost upon recompilation of the source schema. // Generated on: 2009.04.15 at 10:20:40 AM BST // package net.sourceforge.segment.srx.io.bind; import javax.xml.bind.annotation.XmlAccessType; import javax.xml.bind.annotation.XmlAccessorType; import javax.xml.bind.annotation.XmlAttribute; import javax.xml.bind.annotation.XmlRootElement; import javax.xml.bind.annotation.XmlType; /** *

Java class for anonymous complex type. * *

The following schema fragment specifies the expected content contained within this class. * *

 * <complexType>
 *   <complexContent>
 *     <restriction base="{http://www.w3.org/2001/XMLSchema}anyType">
 *       <attribute name="languagerulename" use="required" type="{http://www.w3.org/2001/XMLSchema}string" />
 *       <attribute name="languagepattern" use="required" type="{http://www.w3.org/2001/XMLSchema}string" />
 *     </restriction>
 *   </complexContent>
 * </complexType>
 * 
* * */
@XmlAccessorType(XmlAccessType.FIELD)
@XmlType(name = "")
@XmlRootElement(name = "languagemap")
public class Languagemap {

    // Required attribute naming the language rule this mapping refers to;
    // the SRX parser looks it up among the language rules it has parsed.
    @XmlAttribute(required = true)
    protected String languagerulename;
    // Required attribute holding the language pattern; the SRX parser hands
    // it to the document together with the resolved language rule.
    @XmlAttribute(required = true)
    protected String languagepattern;

    /**
     * Gets the value of the languagerulename property.
     * 
     * @return
     *     possible object is
     *     {@link String }
     *     
     */
    public String getLanguagerulename() {
        return languagerulename;
    }

    /**
     * Sets the value of the languagerulename property.
     * 
     * @param value
     *     allowed object is
     *     {@link String }
     *     
     */
    public void setLanguagerulename(String value) {
        this.languagerulename = value;
    }

    /**
     * Gets the value of the languagepattern property.
     * 
     * @return
     *     possible object is
     *     {@link String }
     *     
     */
    public String getLanguagepattern() {
        return languagepattern;
    }

    /**
     * Sets the value of the languagepattern property.
     * 
     * @param value
     *     allowed object is
     *     {@link String }
     *     
     */
    public void setLanguagepattern(String value) {
        this.languagepattern = value;
    }

}
segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/bind/ObjectFactory.java0000644000175000017500000000556311171331077026756 0ustar railrail// // This file was generated by the JavaTM Architecture for XML Binding(JAXB) Reference Implementation, vJAXB 2.1.3 in JDK 1.6 // See http://java.sun.com/xml/jaxb // Any modifications to this file will be lost upon recompilation of the source schema. // Generated on: 2009.04.15 at 10:20:40 AM BST // package net.sourceforge.segment.srx.io.bind; import javax.xml.bind.annotation.XmlRegistry; /** * This object contains factory methods for each * Java content interface and Java element interface * generated in the net.sourceforge.segment.srx.io.bind package. *

<p>An ObjectFactory allows you to programmatically
 * construct new instances of the Java representation
 * for XML content. The Java representation of XML
 * content can consist of schema derived interfaces
 * and classes representing the binding of schema
 * type definitions, element declarations and model
 * groups. Factory methods for each of these are
 * provided in this class.
 * 
 */
@XmlRegistry
public class ObjectFactory {

    /**
     * Create a new ObjectFactory that can be used to create new instances of schema derived classes for package: net.sourceforge.segment.srx.io.bind
     * 
     */
    public ObjectFactory() {
    }

    /**
     * Create an instance of {@link Header }
     * 
     */
    public Header createHeader() {
        return new Header();
    }

    /**
     * Create an instance of {@link Beforebreak }
     * 
     */
    public Beforebreak createBeforebreak() {
        return new Beforebreak();
    }

    /**
     * Create an instance of {@link Maprules }
     * 
     */
    public Maprules createMaprules() {
        return new Maprules();
    }

    /**
     * Create an instance of {@link Body }
     * 
     */
    public Body createBody() {
        return new Body();
    }

    /**
     * Create an instance of {@link Rule }
     * 
     */
    public Rule createRule() {
        return new Rule();
    }

    /**
     * Create an instance of {@link Languagerules }
     * 
     */
    public Languagerules createLanguagerules() {
        return new Languagerules();
    }

    /**
     * Create an instance of {@link Afterbreak }
     * 
     */
    public Afterbreak createAfterbreak() {
        return new Afterbreak();
    }

    /**
     * Create an instance of {@link Languagemap }
     * 
     */
    public Languagemap createLanguagemap() {
        return new Languagemap();
    }

    /**
     * Create an instance of {@link Formathandle }
     * 
     */
    public Formathandle createFormathandle() {
        return new Formathandle();
    }

    /**
     * Create an instance of {@link Srx }
     * 
     */
    public Srx createSrx() {
        return new Srx();
    }

    /**
     * Create an instance of {@link Languagerule }
     * 
     */
    public Languagerule createLanguagerule() {
        return new Languagerule();
    }

}
segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/bind/Formathandle.java0000644000175000017500000000547511171331077026626 0ustar railrail//
// This file was generated by the JavaTM Architecture for XML Binding(JAXB) Reference Implementation, vJAXB 2.1.3 in JDK 1.6 // See http://java.sun.com/xml/jaxb // Any modifications to this file will be lost upon recompilation of the source schema. // Generated on: 2009.04.15 at 10:20:40 AM BST // package net.sourceforge.segment.srx.io.bind; import javax.xml.bind.annotation.XmlAccessType; import javax.xml.bind.annotation.XmlAccessorType; import javax.xml.bind.annotation.XmlAttribute; import javax.xml.bind.annotation.XmlRootElement; import javax.xml.bind.annotation.XmlType; /** *

Java class for anonymous complex type. * *

The following schema fragment specifies the expected content contained within this class. * *

 * <complexType>
 *   <complexContent>
 *     <restriction base="{http://www.w3.org/2001/XMLSchema}anyType">
 *       <attribute name="include" use="required">
 *         <simpleType>
 *           <restriction base="{http://www.w3.org/2001/XMLSchema}string">
 *             <enumeration value="yes"/>
 *             <enumeration value="no"/>
 *           </restriction>
 *         </simpleType>
 *       </attribute>
 *       <attribute name="type" use="required">
 *         <simpleType>
 *           <restriction base="{http://www.w3.org/2001/XMLSchema}string">
 *             <enumeration value="start"/>
 *             <enumeration value="end"/>
 *             <enumeration value="isolated"/>
 *           </restriction>
 *         </simpleType>
 *       </attribute>
 *     </restriction>
 *   </complexContent>
 * </complexType>
 * 
* * */
@XmlAccessorType(XmlAccessType.FIELD)
@XmlType(name = "")
@XmlRootElement(name = "formathandle")
public class Formathandle {

    // Required attribute; the schema fragment in the class comment
    // enumerates "yes" and "no".
    @XmlAttribute(required = true)
    protected String include;
    // Required attribute; the schema fragment in the class comment
    // enumerates "start", "end" and "isolated".
    @XmlAttribute(required = true)
    protected String type;

    /**
     * Gets the value of the include property.
     * 
     * @return
     *     possible object is
     *     {@link String }
     *     
     */
    public String getInclude() {
        return include;
    }

    /**
     * Sets the value of the include property.
     * 
     * @param value
     *     allowed object is
     *     {@link String }
     *     
     */
    public void setInclude(String value) {
        this.include = value;
    }

    /**
     * Gets the value of the type property.
     * 
     * @return
     *     possible object is
     *     {@link String }
     *     
     */
    public String getType() {
        return type;
    }

    /**
     * Sets the value of the type property.
     * 
     * @param value
     *     allowed object is
     *     {@link String }
     *     
     */
    public void setType(String value) {
        this.type = value;
    }

}
segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/bind/Maprules.java0000644000175000017500000000433011171542024025773 0ustar railrail// // This file was generated by the JavaTM Architecture for XML Binding(JAXB) Reference Implementation, vJAXB 2.1.3 in JDK 1.6 // See http://java.sun.com/xml/jaxb // Any modifications to this file will be lost upon recompilation of the source schema. // Generated on: 2009.04.15 at 10:20:40 AM BST // package net.sourceforge.segment.srx.io.bind; import java.util.ArrayList; import java.util.List; import javax.xml.bind.annotation.XmlAccessType; import javax.xml.bind.annotation.XmlAccessorType; import javax.xml.bind.annotation.XmlElement; import javax.xml.bind.annotation.XmlRootElement; import javax.xml.bind.annotation.XmlType; /** *

Java class for anonymous complex type. * *

The following schema fragment specifies the expected content contained within this class. * *

 * <complexType>
 *   <complexContent>
 *     <restriction base="{http://www.w3.org/2001/XMLSchema}anyType">
 *       <sequence>
 *         <element ref="{http://www.lisa.org/srx20}languagemap" maxOccurs="unbounded"/>
 *       </sequence>
 *     </restriction>
 *   </complexContent>
 * </complexType>
 * 
* * */
@XmlAccessorType(XmlAccessType.FIELD)
@XmlType(name = "", propOrder = {
    "languagemap"
})
@XmlRootElement(name = "maprules")
public class Maprules {

    // Element type restored: the SRX parser iterates this list with a
    // Languagemap loop variable, which a raw List does not allow.
    @XmlElement(required = true)
    protected List<Languagemap> languagemap;

    /**
     * Gets the value of the languagemap property.
     * 
     * <p>
     * This accessor method returns a reference to the live list,
     * not a snapshot. Therefore any modification you make to the
     * returned list will be present inside the JAXB object.
     * This is why there is not a {@code set} method for the languagemap property.
     * 
     * <p>
     * For example, to add a new item, do as follows:
     * <pre>
     *    getLanguagemap().add(newItem);
     * </pre>
     * 
     * 
     * <p>
     * Objects of the following type(s) are allowed in the list
     * {@link Languagemap }
     * 
     * @return live, lazily created list; never null
     */
    public List<Languagemap> getLanguagemap() {
        if (languagemap == null) {
            languagemap = new ArrayList<Languagemap>();
        }
        return this.languagemap;
    }

}
segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/bind/Rule.java0000644000175000017500000000632511171331077025124 0ustar railrail// // This file was generated by the JavaTM Architecture for XML Binding(JAXB) Reference Implementation, vJAXB 2.1.3 in JDK 1.6 // See http://java.sun.com/xml/jaxb // Any modifications to this file will be lost upon recompilation of the source schema. // Generated on: 2009.04.15 at 10:20:40 AM BST // package net.sourceforge.segment.srx.io.bind; import javax.xml.bind.annotation.XmlAccessType; import javax.xml.bind.annotation.XmlAccessorType; import javax.xml.bind.annotation.XmlAttribute; import javax.xml.bind.annotation.XmlRootElement; import javax.xml.bind.annotation.XmlType; /** *

Java class for anonymous complex type. * *

The following schema fragment specifies the expected content contained within this class. * *

 * <complexType>
 *   <complexContent>
 *     <restriction base="{http://www.w3.org/2001/XMLSchema}anyType">
 *       <sequence>
 *         <element ref="{http://www.lisa.org/srx20}beforebreak" minOccurs="0"/>
 *         <element ref="{http://www.lisa.org/srx20}afterbreak" minOccurs="0"/>
 *       </sequence>
 *       <attribute name="break">
 *         <simpleType>
 *           <restriction base="{http://www.w3.org/2001/XMLSchema}string">
 *             <enumeration value="yes"/>
 *             <enumeration value="no"/>
 *           </restriction>
 *         </simpleType>
 *       </attribute>
 *     </restriction>
 *   </complexContent>
 * </complexType>
 * 
* * */
@XmlAccessorType(XmlAccessType.FIELD)
@XmlType(name = "", propOrder = {
    "beforebreak",
    "afterbreak"
})
@XmlRootElement(name = "rule")
public class Rule {

    // Optional child (minOccurs="0" in the schema fragment); the SRX parser
    // substitutes an empty pattern when it is null.
    protected Beforebreak beforebreak;
    // Optional child (minOccurs="0" in the schema fragment); the SRX parser
    // substitutes an empty pattern when it is null.
    protected Afterbreak afterbreak;
    // Optional "break" attribute; the field is prefixed because break is a
    // Java keyword. The SRX parser treats every value except "no" as a break.
    @XmlAttribute(name = "break")
    protected String _break;

    /**
     * Gets the value of the beforebreak property.
     * 
     * @return
     *     possible object is
     *     {@link Beforebreak }
     *     
     */
    public Beforebreak getBeforebreak() {
        return beforebreak;
    }

    /**
     * Sets the value of the beforebreak property.
     * 
     * @param value
     *     allowed object is
     *     {@link Beforebreak }
     *     
     */
    public void setBeforebreak(Beforebreak value) {
        this.beforebreak = value;
    }

    /**
     * Gets the value of the afterbreak property.
     * 
     * @return
     *     possible object is
     *     {@link Afterbreak }
     *     
     */
    public Afterbreak getAfterbreak() {
        return afterbreak;
    }

    /**
     * Sets the value of the afterbreak property.
     * 
     * @param value
     *     allowed object is
     *     {@link Afterbreak }
     *     
     */
    public void setAfterbreak(Afterbreak value) {
        this.afterbreak = value;
    }

    /**
     * Gets the value of the break property.
     * 
     * @return
     *     possible object is
     *     {@link String }
     *     
     */
    public String getBreak() {
        return _break;
    }

    /**
     * Sets the value of the break property.
     * 
     * @param value
     *     allowed object is
     *     {@link String }
     *     
     */
    public void setBreak(String value) {
        this._break = value;
    }

}
segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/bind/Languagerule.java0000644000175000017500000000555011171542024026623 0ustar railrail// // This file was generated by the JavaTM Architecture for XML Binding(JAXB) Reference Implementation, vJAXB 2.1.3 in JDK 1.6 // See http://java.sun.com/xml/jaxb // Any modifications to this file will be lost upon recompilation of the source schema.
// Generated on: 2009.04.15 at 10:20:40 AM BST // package net.sourceforge.segment.srx.io.bind; import java.util.ArrayList; import java.util.List; import javax.xml.bind.annotation.XmlAccessType; import javax.xml.bind.annotation.XmlAccessorType; import javax.xml.bind.annotation.XmlAttribute; import javax.xml.bind.annotation.XmlElement; import javax.xml.bind.annotation.XmlRootElement; import javax.xml.bind.annotation.XmlType; /** *

Java class for anonymous complex type. * *

The following schema fragment specifies the expected content contained within this class. * *

 * <complexType>
 *   <complexContent>
 *     <restriction base="{http://www.w3.org/2001/XMLSchema}anyType">
 *       <sequence>
 *         <element ref="{http://www.lisa.org/srx20}rule" maxOccurs="unbounded"/>
 *       </sequence>
 *       <attribute name="languagerulename" use="required" type="{http://www.w3.org/2001/XMLSchema}string" />
 *     </restriction>
 *   </complexContent>
 * </complexType>
 * 
* * */
@XmlAccessorType(XmlAccessType.FIELD)
@XmlType(name = "", propOrder = {
    "rule"
})
@XmlRootElement(name = "languagerule")
public class Languagerule {

    /**
     * Rules of this language rule. Left {@code null} until {@link #getRule()}
     * creates the list on first access. The element type is spelled out so
     * that the binding and callers both see {@link Rule} items rather than a
     * raw list.
     */
    @XmlElement(required = true)
    protected List<Rule> rule;

    /** Name of this language rule; bound as a required XML attribute. */
    @XmlAttribute(required = true)
    protected String languagerulename;

    /**
     * Gets the value of the rule property.
     *
     * <p>
     * This accessor method returns a reference to the live list,
     * not a snapshot. Therefore any modification you make to the
     * returned list will be present inside the JAXB object.
     * This is why there is not a <CODE>set</CODE> method for the rule property.
     *
     * <p>
     * For example, to add a new item, do as follows:
     * <pre>
     *    getRule().add(newItem);
     * </pre>
     *
     * <p>
     * Objects of the following type(s) are allowed in the list
     * {@link Rule }
     *
     * @return the live list of rules; never {@code null}, because an empty
     *         list is created on first access
     */
    public List<Rule> getRule() {
        if (rule == null) {
            // Lazily create the backing list so callers can always add to it.
            rule = new ArrayList<Rule>();
        }
        return this.rule;
    }

    /**
     * Gets the value of the languagerulename property.
     *
     * @return
     *     possible object is
     *     {@link String }; {@code null} if the name was never set
     */
    public String getLanguagerulename() {
        return languagerulename;
    }

    /**
     * Sets the value of the languagerulename property. The value is stored
     * as given; this setter performs no validation.
     *
     * @param value
     *     allowed object is
     *     {@link String }
     */
    public void setLanguagerulename(String value) {
        this.languagerulename = value;
    }

}
segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/SrxAnyTransformer.java0000644000175000017500000000431311223073726026745 0ustar railrailpackage net.sourceforge.segment.srx.io; import java.io.BufferedReader; import java.io.Reader; import java.io.Writer; import java.util.Map; import net.sourceforge.segment.srx.SrxTransformer; import net.sourceforge.segment.util.XmlException; /** * Represents any version intelligent SRX document transformer to newest * supported version. * * @author loomchild * @see SrxVersion */ public class SrxAnyTransformer implements SrxTransformer { /** * Transform given SRX document to newest supported version and write it to * given writer. Recognizes version by using * {@link SrxVersion#parse(BufferedReader)}, which does not always work * perfectly. * * @param reader reader containing SRX document * @param writer writer to write transformed SRX document * @param parameterMap map containing transformation parameters */ public void transform(Reader reader, Writer writer, Map parameterMap) { BufferedReader bufferedReader = new BufferedReader(reader); SrxTransformer transformer = getTransformer(bufferedReader); transformer.transform(bufferedReader, writer, parameterMap); } /** * Transform given SRX document and return Reader containing newest * supported version. Recognizes version by using * {@link SrxVersion#parse(BufferedReader)}, which does not always work * perfectly.
* * @param reader reader containing SRX document * @param parameterMap map containing transformation parameters * @return reader containing SRX document in newest supported version */ public Reader transform(Reader reader, Map parameterMap) { BufferedReader bufferedReader = new BufferedReader(reader); SrxTransformer transformer = getTransformer(bufferedReader); return transformer.transform(bufferedReader, parameterMap); } private SrxTransformer getTransformer(BufferedReader reader) { SrxTransformer transformer; SrxVersion version = SrxVersion.parse(reader); if (version == SrxVersion.VERSION_1_0) { transformer = new Srx1Transformer(); } else if (version == SrxVersion.VERSION_2_0) { transformer = new Srx2Transformer(); } else { throw new XmlException("Unsupported SRX version: \"" + version + "\"."); } return transformer; } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/SrxTransformersTest.java0000644000175000017500000000621711221113076027314 0ustar railrailpackage net.sourceforge.segment.srx.io; import static net.sourceforge.segment.util.Util.getReader; import static net.sourceforge.segment.util.Util.getResourceStream; import static net.sourceforge.segment.util.Util.getTemplates; import static net.sourceforge.segment.util.Util.transform; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.io.StringWriter; import java.io.Writer; import java.util.HashMap; import java.util.Map; import javax.xml.transform.Templates; import junit.framework.TestCase; import net.sourceforge.segment.srx.SrxTransformer; import net.sourceforge.segment.util.IORuntimeException; public class SrxTransformersTest extends TestCase { public static final String SRX_1_DOCUMENT_NAME = "net/sourceforge/segment/res/test/example1.srx"; public static final String SRX_2_DOCUMENT_NAME = "net/sourceforge/segment/res/test/example2.srx"; public static final String STYLESHEET = "net/sourceforge/segment/res/xml/strip-space.xsl"; private static final Templates 
templates = getTemplates(getReader(getResourceStream(STYLESHEET))); public void testSrx1Transformer() { SrxTransformer transformer = new Srx1Transformer(); Map parameterMap = new HashMap(); testTransformer(SRX_2_DOCUMENT_NAME, SRX_1_DOCUMENT_NAME, transformer, parameterMap); parameterMap.put(Srx1Transformer.MAP_RULE_NAME, (Object) "Default"); testTransformer(SRX_2_DOCUMENT_NAME, SRX_1_DOCUMENT_NAME, transformer, parameterMap); } public void testSrx2Transformer() { SrxTransformer transformer = new Srx2Transformer(); Map parameterMap = new HashMap(); testTransformer(SRX_2_DOCUMENT_NAME, SRX_2_DOCUMENT_NAME, transformer, parameterMap); } private void testTransformer(String expectedDocumentName, String sourceDocumentName, SrxTransformer transformer, Map parameterMap) { Reader reader = getReader(getResourceStream(expectedDocumentName)); String expectedDocument = removeWhitespaces(reader); reader = getReader(getResourceStream(sourceDocumentName)); reader = transformer.transform(reader, parameterMap); String actualDocument = removeWhitespaces(reader); assertEquals(expectedDocument, actualDocument); reader = getReader(getResourceStream(sourceDocumentName)); Writer writer = new StringWriter(); transformer.transform(reader, writer, parameterMap); reader = new StringReader(writer.toString()); actualDocument = removeWhitespaces(reader); assertEquals(expectedDocument, actualDocument); } private String removeWhitespaces(Reader reader) { StringWriter writer = new StringWriter(); transform(templates, reader, writer); // Java 1.5 requires this because transformation does not work properly. 
StringReader stringReader = new StringReader(writer.toString()); StringBuilder builder = new StringBuilder(); try { int i; while ((i = stringReader.read()) != -1) { char c = (char)i; if ((c != ' ' && c != '\t' && c != '\r' && c != '\n' && c != '\f')) { builder.append((char)c); } } } catch (IOException e) { throw new IORuntimeException(e); } return builder.toString(); } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/Srx2Transformer.java0000644000175000017500000000216711223073726026364 0ustar railrailpackage net.sourceforge.segment.srx.io; import static net.sourceforge.segment.util.Util.copyAll; import java.io.Reader; import java.io.Writer; import java.util.Map; import net.sourceforge.segment.srx.SrxTransformer; /** * Represents SRX document transformer between SRX 2.0 and newest supported * version. As newest supported version is 2.0 so does no transformation. * * @author loomchild */ public class Srx2Transformer implements SrxTransformer { /** * Copies SRX document from reader to writer without transformation. * * @param reader reader containing SRX document * @param writer writer to write SRX document * @param parameterMap map containing transformation parameters, ignored */ public void transform(Reader reader, Writer writer, Map parameterMap) { copyAll(reader, writer); } /** * Returns given reader without modification. 
* * @param reader reader containing SRX document * @param parameterMap map containing transformation parameters, ignored * @return reader */ public Reader transform(Reader reader, Map parameterMap) { return reader; } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/SrxAnyParser.java0000644000175000017500000000212611223073726025677 0ustar railrailpackage net.sourceforge.segment.srx.io; import java.io.BufferedReader; import java.io.Reader; import net.sourceforge.segment.srx.SrxDocument; import net.sourceforge.segment.srx.SrxParser; import net.sourceforge.segment.util.XmlException; /** * Represents any version intelligent SRX document parser. Responsible for * creating appropriate SRX parser to given SRX document version. * * @author loomchild */ public class SrxAnyParser implements SrxParser { /** * Parses SRX document from reader. Selects appropriate SRX parser for * document version. * * @param reader * @return Return initialized document */ public SrxDocument parse(Reader reader) { SrxParser parser; BufferedReader bufferedReader = new BufferedReader(reader); SrxVersion version = SrxVersion.parse(bufferedReader); if (version == SrxVersion.VERSION_1_0) { parser = new Srx1Parser(); } else if (version == SrxVersion.VERSION_2_0) { parser = new Srx2Parser(); } else { throw new XmlException("Unsupported SRX version: \"" + version + "\"."); } return parser.parse(bufferedReader); } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/io/Srx1Transformer.java0000644000175000017500000000655711223073726026372 0ustar railrailpackage net.sourceforge.segment.srx.io; import static net.sourceforge.segment.util.Util.getFileInputStream; import static net.sourceforge.segment.util.Util.getFileOutputStream; import static net.sourceforge.segment.util.Util.getReader; import static net.sourceforge.segment.util.Util.getResourceStream; import static net.sourceforge.segment.util.Util.getSchema; import static net.sourceforge.segment.util.Util.getTemplates; import static 
net.sourceforge.segment.util.Util.getWriter; import java.io.File; import java.io.IOException; import java.io.Reader; import java.io.Writer; import java.util.Map; import javax.xml.transform.Templates; import javax.xml.validation.Schema; import net.sourceforge.segment.srx.SrxTransformer; import net.sourceforge.segment.util.IORuntimeException; import net.sourceforge.segment.util.Util; /** * Represents SRX document transformer between SRX 1.0 and newest supported * version. Responsible for validating input as SRX 1.0 and doing the * transformation using XSLT stylesheet. * * @author loomchild */ public class Srx1Transformer implements SrxTransformer { /** * Transformation parameter. Used to select map rule in SRX 1.0 document. */ public static final String MAP_RULE_NAME = "maprulename"; private static final String STYLESHEET = "net/sourceforge/segment/res/xml/srx10.xsl"; private static final String SCHEMA = "net/sourceforge/segment/res/xml/srx10.xsd"; private static Templates templates = getTemplates(getReader(getResourceStream(STYLESHEET)));; private static Schema schema = getSchema(getReader(getResourceStream(SCHEMA))); /** * Transform given SRX 1.0 document to newest supported version and write it * to given writer. Because in current SRX version only one map rule is * allowed it must be selected from SRX 1.0 document. If parameter map * contains parameter {@link #MAP_RULE_NAME} then only map rule with name * given by this parameter value is preserved. Otherwise first map rule from * source document is preserved. If source document does not contain * appropriate map rule to select, resulting document will not contain * language maps and will be unusable. 
* * @param reader reader containing SRX 1.0 document * @param writer writer to write transformed SRX document * @param parameterMap map containing transformation parameters */ public void transform(Reader reader, Writer writer, Map parameterMap) { Util.transform(templates, schema, reader, writer, parameterMap); } /** * Transforms given SRX 1.0 document and returns Reader containing SRX * document in newest supported version. Creates temporary file and uses * {@link #transform(Reader, Writer, Map)}. * @see #transform(Reader, Writer, Map) * * @param reader reader containing SRX 1.0 document * @param parameterMap map containing transformation parameters. * @return reader containing SRX document in newest supported version */ public Reader transform(Reader reader, Map parameterMap) { try { File file = File.createTempFile("srx2", ".srx"); file.deleteOnExit(); Writer writer = getWriter(getFileOutputStream(file .getAbsolutePath())); transform(reader, writer, parameterMap); writer.close(); Reader resultReader = getReader(getFileInputStream(file .getAbsolutePath())); return resultReader; } catch (IOException e) { throw new IORuntimeException(e); } } } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/SrxParser.java0000644000175000017500000000055711223073726024626 0ustar railrailpackage net.sourceforge.segment.srx; import java.io.Reader; /** * Represents SRX parser that can parse SRX document from reader. * * @author loomchild */ public interface SrxParser { /** * Parses SRX document. 
* * @param reader reader from which read the document * @return initialized SRX document */ public SrxDocument parse(Reader reader); } segment-1.3.5~svn57+dfsg/src/net/sourceforge/segment/srx/AbstractSrxTextIteratorTest.java0000644000175000017500000004254311247267663030370 0ustar railrailpackage net.sourceforge.segment.srx; import static org.junit.Assert.assertEquals; import java.io.IOException; import java.util.ArrayList; import java.util.List; import net.sourceforge.segment.TextIterator; import org.junit.Test; /** * Segmentation text used in all text iterator tests. * Inheriting tests must implement * {@link #getTextIterator(String, SrxDocument, String)} abstract method. * @author loomchild */ public abstract class AbstractSrxTextIteratorTest { public static final String[] SIMPLE_RESULT = new String[] { "Ala ma kota.", " Prof. Kot nie wie kim jest.", " Ech.", "\nA inny prof. to już w ogole.", " Uch" }; public static final String SIMPLE_LANGUAGE = "pl"; public static final SrxDocument SIMPLE_DOCUMENT = createSimpleDocument(); public static SrxDocument createSimpleDocument() { LanguageRule languageRulePL = new LanguageRule("Polish"); languageRulePL.addRule(new Rule(false, "[Pp]rof\\.", "\\s")); LanguageRule languageRuleEN = new LanguageRule("English"); languageRuleEN.addRule(new Rule(false, "Mr\\.", "\\s")); LanguageRule languageRuleDEF = new LanguageRule("Default"); languageRuleDEF.addRule(new Rule(true, "\\.", "\\s")); languageRuleDEF.addRule(new Rule(true, "", "\\n")); SrxDocument document = new SrxDocument(); document.addLanguageMap("pl.*", languageRulePL); document.addLanguageMap("en.*", languageRuleEN); document.addLanguageMap(".*", languageRuleDEF); return document; } /** * Test some simple splitting with multiple rules. 
*/ @Test public void testSimpleSplit() { performTest(SIMPLE_RESULT, SIMPLE_DOCUMENT, SIMPLE_LANGUAGE); } public static final String[] EMPTY_RESULT = new String[] { }; public static final SrxDocument EMPTY_DOCUMENT = createEmptyDocument(); public static SrxDocument createEmptyDocument() { LanguageRule languageRule = new LanguageRule(""); languageRule.addRule(new Rule(true, ".", " ")); SrxDocument document = new SrxDocument(); document.addLanguageMap(".*", languageRule); return document; } /** * Tests if when the text is empty no segments are returned. */ @Test public void testEmptyText() { performTest(EMPTY_RESULT, EMPTY_DOCUMENT); } public static final String[] ALTERNATIVE_RULE_RESULT = new String[] { "W 59 n. e. Julek nie zrobił nic ciekawego.", " Drugie dn. to: Ja też nie" }; public static final SrxDocument ALTERNATIVE_RULE_DOCUMENT = createAlternativeRuleDocument(); public static SrxDocument createAlternativeRuleDocument() { LanguageRule languageRule = new LanguageRule("Deafult"); languageRule.addRule(new Rule(false, "(n\\.)|(e\\.)|(dn\\.)", " ")); languageRule.addRule(new Rule(true, "\\.", " ")); SrxDocument document = new SrxDocument(); document.addLanguageMap(".*", languageRule); return document; } /** * Tests if splitter can deal with alternatives in patterns. */ @Test public void testAlternativeRuleSplit() { performTest(ALTERNATIVE_RULE_RESULT, ALTERNATIVE_RULE_DOCUMENT); } public static final String[] OVERLAPPING_RULES_RESULT = new String[] { "W 59 n.e. 
Julek nie zrobił nic ciekawego.", " Ja też nie" }; public static final SrxDocument OVERLAPPING_RULES_DOCUMENT = createOverlappingRulesDocument(); public static SrxDocument createOverlappingRulesDocument() { LanguageRule languageRule = new LanguageRule(""); languageRule.addRule(new Rule(false, "n\\.", "")); languageRule.addRule(new Rule(false, "n\\.e\\.", "")); languageRule.addRule(new Rule(true, "\\.", "")); SrxDocument document = new SrxDocument(); document.addLanguageMap(".*", languageRule); return document; } /** * Checks if splitter works with rules with common prefix. */ @Test public void testOverlappingRulesSplit() { performTest(OVERLAPPING_RULES_RESULT, OVERLAPPING_RULES_DOCUMENT); } public static final String[] INTERLACED_RULES_RESULT = new String[] { "a? b? a. b.", " c.", " d." }; public static final SrxDocument INTERLACED_RULES_DOCUMENT = createInterlacedRulesDocument(); public static SrxDocument createInterlacedRulesDocument() { LanguageRule languageRule = new LanguageRule(""); languageRule.addRule(new Rule(false, "a[\\.\\?]", " ")); languageRule.addRule(new Rule(true, "\\.", " ")); languageRule.addRule(new Rule(false, "(b[\\.\\?])", " ")); languageRule.addRule(new Rule(true, "\\?", " ")); languageRule.addRule(new Rule(false, "c[\\.\\?]", " ")); SrxDocument document = new SrxDocument(); document.addLanguageMap(".*", languageRule); return document; } /** * Tests splitting when break and exception rules are interlaced. 
*/ @Test public void testInterlacedRulesSplit() throws IOException { performTest(INTERLACED_RULES_RESULT, INTERLACED_RULES_DOCUMENT); } public static final String[] NO_BREAK_RULES_RESULT = new String[] { "abcab" }; public static final SrxDocument NO_BREAK_RULES_DOCUMENT = createNoBreakRulesDocument(); public static SrxDocument createNoBreakRulesDocument() { LanguageRule languageRule = new LanguageRule(""); languageRule.addRule(new Rule(false, "a", " ")); SrxDocument document = new SrxDocument(); document.addLanguageMap(".*", languageRule); return document; } /** * Tests if when there is no break rules text will not be splitted. */ @Test public void testNoBreakRules() { performTest(NO_BREAK_RULES_RESULT, NO_BREAK_RULES_DOCUMENT); } public static final String[] INFINITE_NEGATIVE_RULE_RESULT = new String[] { "Abc 99. Def.", " Xyz." }; public static final SrxDocument INFINITE_NEGATIVE_RULE_DOCUMENT = createInfiniteNegativeRuleDocument(); public static SrxDocument createInfiniteNegativeRuleDocument() { LanguageRule languageRule = new LanguageRule(""); languageRule.addRule(new Rule(false, "([0-9]+\\.|[0-9]{1,}\\.|[0-9][0-9]*\\.)", "\\s")); languageRule.addRule(new Rule(true, "\\.", "\\s")); SrxDocument document = new SrxDocument(); document.addLanguageMap(".*", languageRule); return document; } /** * Tests if splitter can work with infinite length exception rule. */ @Test public void testInfiniteNegativeRule() { performTest(INFINITE_NEGATIVE_RULE_RESULT, INFINITE_NEGATIVE_RULE_DOCUMENT); } public static final String[] ONLY_BREAK_RULES_RESULT = new String[] { "Abc 99.", " Def.", " Xyz." 
}; public static final SrxDocument ONLY_BREAK_RULES_DOCUMENT = createOnlyBreakRulesDocument(); public static SrxDocument createOnlyBreakRulesDocument() { LanguageRule languageRule = new LanguageRule(""); languageRule.addRule(new Rule(true, "\\.", "\\s")); languageRule.addRule(new Rule(true, "", "\\n")); SrxDocument document = new SrxDocument(); document.addLanguageMap(".*", languageRule); return document; } /** * Tests if splitter can when there are only break rules. */ @Test public void testOnlyBreakRules() { performTest(ONLY_BREAK_RULES_RESULT, ONLY_BREAK_RULES_DOCUMENT); } public static final String[] BREAK_AT_THE_END_RESULT = new String[] { "a." }; public static final SrxDocument BREAK_AT_THE_END_DOCUMENT = createBreakAtTheEndDocument(); public static SrxDocument createBreakAtTheEndDocument() { LanguageRule languageRule = new LanguageRule(""); languageRule.addRule(new Rule(true, "\\.", "")); SrxDocument document = new SrxDocument(); document.addLanguageMap(".*", languageRule); return document; } /** * Tests if when break is at the end of text no blank segment is returned. */ @Test public void testBreakAtTheEndOfText() { performTest(BREAK_AT_THE_END_RESULT, BREAK_AT_THE_END_DOCUMENT); } public static final String[] EMPTY_EXCEPTION_RULE_RESULT = new String[] { "a. b. c" }; public static final SrxDocument EMPTY_EXCEPTION_RULE_DOCUMENT = createEmptyExceptionRuleDocument(); public static SrxDocument createEmptyExceptionRuleDocument() { LanguageRule languageRule = new LanguageRule(""); languageRule.addRule(new Rule(false, "", "")); languageRule.addRule(new Rule(true, "\\.", " ")); SrxDocument document = new SrxDocument(); document.addLanguageMap(".*", languageRule); return document; } /** * Tests if when there is empty (matching all) exception rule text * will not be split. 
*/ @Test public void testEmptyExceptionRule() { performTest(EMPTY_EXCEPTION_RULE_RESULT, EMPTY_EXCEPTION_RULE_DOCUMENT); } public static final String[] EMPTY_BREAK_RULE_RESULT = new String[] { "a", " ", "b", "c" }; public static final SrxDocument EMPTY_BREAK_RULE_DOCUMENT = createEmptyBreakRuleDocument(); public static SrxDocument createEmptyBreakRuleDocument() { LanguageRule languageRule = new LanguageRule(""); languageRule.addRule(new Rule(true, "", "")); SrxDocument document = new SrxDocument(); document.addLanguageMap(".*", languageRule); return document; } /** * Tests if when there is empty (matching all) break rule text will * be split after every character. */ @Test public void testEmptyBreakRule() { performTest(EMPTY_BREAK_RULE_RESULT, EMPTY_BREAK_RULE_DOCUMENT); } public static final String[] WORD_BOUNDARY_RESULT = new String[] { "Don't split strings like U.S.A. please.", }; public static final String WORD_BOUNDARY_LANGUAGE = "en"; public static final SrxDocument WORD_BOUNDARY_DOCUMENT = createWordBoundaryDocument(); public static SrxDocument createWordBoundaryDocument() { LanguageRule languageRule = new LanguageRule(""); languageRule.addRule(new Rule(false, "\\b\\p{L}\\.", "")); languageRule.addRule(new Rule(true, "\\.", "")); SrxDocument document = new SrxDocument(); document.addLanguageMap(".*", languageRule); return document; } /** * Test if matchers match \b word boundary pattern correctly. 
*/ @Test public void testWordBoundary() { performTest(WORD_BOUNDARY_RESULT, WORD_BOUNDARY_DOCUMENT, WORD_BOUNDARY_LANGUAGE); } public static final String[] EXCEPTION_RULE_LONGER_THAN_BREAK_RULE_RESULT = new String[] { "Ala ma kota.", " " }; public static final SrxDocument EXCEPTION_RULE_LONGER_THAN_BREAK_RULE_DOCUMENT = createExceptionRuleLongerThanBreakRuleDocument(); public static SrxDocument createExceptionRuleLongerThanBreakRuleDocument() { LanguageRule languageRule = new LanguageRule(""); languageRule.addRule(new Rule(false, "\\.", "\\sa")); languageRule.addRule(new Rule(true, "\\.", "\\s")); SrxDocument document = new SrxDocument(); document.addLanguageMap(".*", languageRule); return document; } /** * Test when exception rule is longer than break rule everything * is OK (problems with lookingAt throwing EndOfStreamException). */ @Test public void testExceptionRuleLongerThanBreakRule() { performTest(EXCEPTION_RULE_LONGER_THAN_BREAK_RULE_RESULT, EXCEPTION_RULE_LONGER_THAN_BREAK_RULE_DOCUMENT); } public static final String[] MATCHING_END_RESULT = new String[] { "A.", "." }; public static final SrxDocument MATCHING_END_DOCUMENT = createMatchingEndDocument(); public static SrxDocument createMatchingEndDocument() { LanguageRule languageRule = new LanguageRule(""); languageRule.addRule(new Rule(true, "\\.\\.\\.", "")); languageRule.addRule(new Rule(true, "\\.", "")); SrxDocument document = new SrxDocument(); document.addLanguageMap(".*", languageRule); return document; } /** * Test if unfinished rule matching the end containing other rule will not * supress it. 
*/ @Test public void testMatchingEnd() { performTest(MATCHING_END_RESULT, MATCHING_END_DOCUMENT); } public static final String[] MATCHING_ALL_RESULT = new String[] { "A", " B.", " C", " " }; public static final SrxDocument MATCHING_ALL_DOCUMENT = createMatchingAllDocument(); public static SrxDocument createMatchingAllDocument() { LanguageRule languageRule = new LanguageRule(""); languageRule.addRule(new Rule(true, "[^\\s]*", "\\s")); languageRule.addRule(new Rule(true, "\\.", "\\s")); SrxDocument document = new SrxDocument(); document.addLanguageMap(".*", languageRule); return document; } /** * Test if rules matching whole document will not break other rules. */ @Test public void testMatchingAll() { performTest(MATCHING_ALL_RESULT, MATCHING_ALL_DOCUMENT); } public static final String[] OVERLAPPING_BREAK_RULES_RESULT = new String[] { "A..", ".B" }; public static final SrxDocument OVERLAPPING_BREAK_RULES_DOCUMENT = createOverlappingBreakRulesDocument(); public static SrxDocument createOverlappingBreakRulesDocument() { LanguageRule languageRule = new LanguageRule(""); languageRule.addRule(new Rule(true, "\\.\\.\\.", "")); languageRule.addRule(new Rule(true, "\\.\\.", "")); SrxDocument document = new SrxDocument(); document.addLanguageMap(".*", languageRule); return document; } /** * Test if overlapping break rules do not interfere with each other. 
*/ @Test public void testOverlappingBreakRules() { performTest(OVERLAPPING_BREAK_RULES_RESULT, OVERLAPPING_BREAK_RULES_DOCUMENT); } public static final String[] MIXED_BREAK_RULES_RESULT = new String[] { "xabc", "d" }; public static final SrxDocument MIXED_BREAK_RULES_DOCUMENT = createMixedBreakRulesDocument(); public static SrxDocument createMixedBreakRulesDocument() { LanguageRule languageRule = new LanguageRule(""); languageRule.addRule(new Rule(false, "b", "c")); languageRule.addRule(new Rule(true, "b", "")); languageRule.addRule(new Rule(true, "abc", "")); SrxDocument document = new SrxDocument(); document.addLanguageMap(".*", languageRule); return document; } @Test public void testMixedBreakRules() { performTest(MIXED_BREAK_RULES_RESULT, MIXED_BREAK_RULES_DOCUMENT); } public static final String[] TEXT_LONGER_THAN_BUFFER_RESULT = createTextLongerThanBufferResult(); private static String[] createTextLongerThanBufferResult() { int length = SrxTextIterator.DEFAULT_BUFFER_LENGTH / 10 + 20; String[] result = new String[length]; for (int i = 0; i < length; ++i) { result[i] = "AAAAAAAAA."; } return result; } public static final SrxDocument TEXT_LONGER_THAN_BUFFER_DOCUMENT = createTextLongerThanBufferDocument(); private static SrxDocument createTextLongerThanBufferDocument() { LanguageRule languageRule = new LanguageRule(""); languageRule.addRule(new Rule(false, "Mr\\.", "")); languageRule.addRule(new Rule(true, "\\.", "")); SrxDocument document = new SrxDocument(); document.addLanguageMap(".*", languageRule); return document; } public void testTextLongerThanBufferRules() { performTest(TEXT_LONGER_THAN_BUFFER_RESULT, TEXT_LONGER_THAN_BUFFER_DOCUMENT); } public static final String[] TICKET_1_RESULT = new String[] { "This is a sentence. 
" }; public static final SrxDocument TICKET_1_DOCUMENT = createTicket1Document(); public static SrxDocument createTicket1Document() { LanguageRule languageRule = new LanguageRule(""); languageRule.addRule(new Rule(false, "[A-Z]\\.\\s", "")); languageRule.addRule(new Rule(true, "\\.\\s", "")); SrxDocument document = new SrxDocument(); document.addLanguageMap(".*", languageRule); return document; } /** * Tests situation from Ticket 1 reported by Marcin Miłkowski. */ @Test public void testTicket1Rule() { performTest(TICKET_1_RESULT, TICKET_1_DOCUMENT); } public static final String[] SPECIFICATION_EXAMPLE_RESULT = new String[] { "The U.K. Prime Minister, Mr. Blair, was seen out today." }; public static final SrxDocument SPECIFICATION_EXAMPLE_DOCUMENT = createSpecificationExampleDocument(); public static SrxDocument createSpecificationExampleDocument() { LanguageRule languageRule = new LanguageRule(""); languageRule.addRule(new Rule(false, "\\sU\\.K\\.", "\\s")); languageRule.addRule(new Rule(false, "Mr\\.", "\\s")); languageRule.addRule(new Rule(true, "[\\.\\?!]+", "\\s")); SrxDocument document = new SrxDocument(); document.addLanguageMap(".*", languageRule); return document; } /** * Tests situation described in SRX specification as an example. * The text is slightly shorter to decrease buffer size for tests. */ @Test public void testSpecificationExample() { performTest(SPECIFICATION_EXAMPLE_RESULT, SPECIFICATION_EXAMPLE_DOCUMENT); } /** * Create text iterator. This method needs to be implemented by inheriting. 
* @param text text to segment * @param document SRX document * @param languageCode language code of text * @return newly created text iterator */ protected abstract TextIterator getTextIterator(String text, SrxDocument document, String languageCode); private void performTest(String[] expectedResult, SrxDocument document) { performTest(expectedResult, document, ""); } private void performTest(String[] expectedResult, SrxDocument document, String languageCode) { String text = merge(expectedResult); TextIterator textIterator; List segmentList; textIterator = getTextIterator(text, document, languageCode); segmentList = segment(textIterator); String[] segmentArray = segmentList.toArray(new String[segmentList.size()]); assertEquals(expectedResult, segmentArray); } protected List segment(TextIterator textIterator) { List segmentList = new ArrayList(); while (textIterator.hasNext()) { segmentList.add(textIterator.next()); } return segmentList; } protected String merge(String[] stringArray) { StringBuilder builder = new StringBuilder(); for (String string : stringArray) { builder.append(string); } return builder.toString(); } } segment-1.3.5~svn57+dfsg/trash/0000755000175000017500000000000011300444104014752 5ustar railrailsegment-1.3.5~svn57+dfsg/trash/MergedPattern.java0000644000175000017500000001343411205015431020364 0ustar railrailpackage net.sourceforge.segment.srx; import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; /** * Represents merged splitting pattern. * Responsible for merging breaking rules into one large pattern and * creating non breaking rules pattern. 
* @author loomchild */ public class MergedPattern { private Pattern breakingPattern; private List nonBreakingPatternList; public MergedPattern(List languageRuleList) { this.nonBreakingPatternList = new ArrayList(); StringBuilder breakingPatternBuilder = new StringBuilder(); List ruleList = extractRules(languageRuleList); List> ruleGroupList = groupRules(ruleList); if (ruleGroupList.size() > 0) { for (List ruleGroup : ruleGroupList) { if (breakingPatternBuilder.length() > 0) { breakingPatternBuilder.append('|'); } // All breaking rules need to be merged because segmentation // need to be done in one pass when text is read from Reader. // Breaking rule need to be inside capturing group so // it is possible to recognize which breaking rule has been // applied during the splitting and know which non-breaking // rules to use. // Breaking rule cannot contain capturing groups. // Capturing groups are replaced with non-capturing groups // inside create breaking pattern function. String breakingGroupPattern = createBreakingPattern(ruleGroup); breakingPatternBuilder.append("(" + breakingGroupPattern + ")"); // If first rule in the group is breaking then there are // no non-breaking rules in the group. In this case null // is appended to non breaking pattern list, because // null does not match anything. if (!ruleGroup.get(0).isBreaking()) { Pattern nonBreakingGroupPattern = Pattern.compile(createNonBreakingPattern(ruleGroup)); nonBreakingPatternList.add(nonBreakingGroupPattern); } else { nonBreakingPatternList.add(null); } } this.breakingPattern = Pattern.compile(breakingPatternBuilder .toString()); } else { // null means that that pattern will not match anything // (as empty pattern matches everything). this.breakingPattern = null; } } public Pattern getBreakingPattern() { return breakingPattern; } public List getNonBreakingPatternList() { return nonBreakingPatternList; } /** * @param languageRuleList * Language rule list. 
* @return Returns merged list of rules form given language rules. */ private List extractRules(List languageRuleList) { List ruleList = new ArrayList(); for (LanguageRule languageRule : languageRuleList) { ruleList.addAll(languageRule.getRuleList()); } return ruleList; } /** * Divides rules to groups where breaking and non-breaking rules cannot be * interlaced. * * @param ruleList * @return */ private List> groupRules(List ruleList) { List> ruleGroupList = new ArrayList>(); List ruleGroup = new ArrayList(); boolean previousBreaking = false; for (Rule rule : ruleList) { if (rule.isBreaking() && !previousBreaking) { ruleGroupList.add(ruleGroup); } else if (!rule.isBreaking() && previousBreaking) { ruleGroup = new ArrayList(); } ruleGroup.add(rule); previousBreaking = rule.isBreaking(); } return ruleGroupList; } /** * Merges all breaking rules on list into one pattern. * * @param ruleList * List of rules * @return Returns pattern. */ private String createBreakingPattern(List ruleList) { StringBuilder patternBuilder = new StringBuilder(); for (Rule rule : ruleList) { if (rule.isBreaking()) { if (patternBuilder.length() > 0) { patternBuilder.append('|'); } // Capturing groups need to be removed from patterns as // they will interfere with capturing group order // which is used to recognize which breaking rule has been // applied and decide which non-breaking rules to use. // In addition as Java does not allow infinite length patterns // in lookbehind, before pattern need to be shortened. String beforePattern = Util.finitize( Util.removeCapturingGroups(rule.getBeforePattern())); String afterPattern = Util.removeCapturingGroups(rule .getAfterPattern()); // Need to use lookbehind also in breaking rule because // this way they become zero-length matches and I want // to match shorter rules first, independent of occurrence order // as in normal alternative. // Example: // Input: "aaa". // Pattern "aa|a" matches "aa" first. // Pattern "(?<=aa)|(?<=a)" matches "a" first. 
if (beforePattern.length() > 0) { patternBuilder.append("(?<=" + beforePattern + ")"); } if (afterPattern.length() > 0) { patternBuilder.append("(?=" + afterPattern + ")"); } } } return patternBuilder.toString(); } /** * Creates non breaking pattern by merging given rules. * * @param ruleList * Rule list. * @return Non breaking pattern. */ private String createNonBreakingPattern(List ruleList) { StringBuilder patternBuilder = new StringBuilder(); for (Rule rule : ruleList) { if (!rule.isBreaking()) { if (patternBuilder.length() > 0) { patternBuilder.append('|'); } // As Java does not allow infinite length patterns // in lookbehind, before pattern need to be shortened. String beforePattern = Util.finitize(rule.getBeforePattern()); String afterPattern = rule.getAfterPattern(); if (beforePattern.length() > 0) { patternBuilder.append("(?<=" + beforePattern + ")"); } if (afterPattern.length() > 0) { patternBuilder.append("(?=" + afterPattern + ")"); } } } return patternBuilder.toString(); } } segment-1.3.5~svn57+dfsg/trash/Srx2ParserStax.java0000644000175000017500000001515711204275136020513 0ustar railrailpackage net.sourceforge.segment.srx.io; import static net.rootnode.loomchild.util.io.Util.getReader; import static net.rootnode.loomchild.util.io.Util.getResourceStream; import static net.rootnode.loomchild.util.xml.Util.getContext; import static net.rootnode.loomchild.util.xml.Util.getSchema; import static net.rootnode.loomchild.util.xml.Util.getSource; import static net.rootnode.loomchild.util.xml.Util.validate; import java.io.Reader; import java.util.HashMap; import java.util.Map; import javax.xml.namespace.QName; import javax.xml.stream.XMLEventReader; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.XMLStreamException; import javax.xml.stream.events.Attribute; import javax.xml.stream.events.Characters; import javax.xml.stream.events.EndElement; import javax.xml.stream.events.StartElement; import 
javax.xml.stream.events.XMLEvent; import javax.xml.transform.Source; import javax.xml.transform.stream.StreamSource; import javax.xml.validation.Schema; import net.rootnode.loomchild.util.exceptions.XmlException; import net.rootnode.loomchild.util.xml.Bind; import net.sourceforge.segment.srx.LanguageRule; import net.sourceforge.segment.srx.Rule; import net.sourceforge.segment.srx.SrxDocument; import net.sourceforge.segment.srx.SrxParser; import net.sourceforge.segment.srx.io.bind.Beforebreak; import net.sourceforge.segment.srx.io.bind.Body; import net.sourceforge.segment.srx.io.bind.Languagemap; import net.sourceforge.segment.srx.io.bind.Languagerule; import net.sourceforge.segment.srx.io.bind.Srx; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import com.sun.xml.txw2.output.StaxSerializer;

/**
 * SRX 2.0 document parser built on the StAX event API. Validates the input
 * against the bundled SRX 2.0 schema and then walks the XML events to create
 * and initialize a {@link SrxDocument}.
 *
 * @author loomchild
 */
public class Srx2ParserStax implements SrxParser {

    private static final Log log = LogFactory.getLog(Srx2ParserStax.class);

    private static final String SCHEMA = "net/sourceforge/segment/res/xml/srx20.xsd";

    private static final String NAMESPACE = "http://www.lisa.org/srx20";

    private static final QName HEADER_ELEMENT = new QName(NAMESPACE, "header");

    private static final QName LANGUAGERULE_ELEMENT = new QName(NAMESPACE, "languagerule");

    private static final QName RULE_ELEMENT = new QName(NAMESPACE, "rule");

    private static final QName BEFOREBREAK_ELEMENT = new QName(NAMESPACE, "beforebreak");

    private static final QName AFTERBREAK_ELEMENT = new QName(NAMESPACE, "afterbreak");

    private static final QName LANGUAGEMAP_ELEMENT = new QName(NAMESPACE, "languagemap");

    private static final QName CASCADE_ATTRIBUTE = new QName("cascade");

    private static final QName LANGUAGERULENAME_ATTRIBUTE = new QName("languagerulename");

    // SRX 2.0 names this attribute "break" (<rule break="no">), not "breaking".
    private static final QName BREAKING_ATTRIBUTE = new QName("break");

    private static final QName LANGUAGEPATTERN_ATTRIBUTE = new QName("languagepattern");

    /**
     * Parses SRX document from reader.
     *
     * @param reader
     *            Reader.
     * @return Returns initialized document.
     * @throws XmlException
     *             when the underlying StAX reader reports an error
     */
    public SrxDocument parse(Reader reader) {
        XMLEventReader eventReader = null;
        try {
            Schema schema = getSchema(getReader(getResourceStream(SCHEMA)));
            Source source = getSource(reader);
            // NOTE(review): the same Source is handed to validate() and then
            // to the event reader; if validation consumes the underlying
            // reader the event loop below will see no content - confirm.
            validate(schema, source);

            XMLInputFactory factory = XMLInputFactory.newInstance();
            eventReader = factory.createXMLEventReader(source);

            SrxDocument document = new SrxDocument();
            // Language rules by name, so that <languagemap> elements can
            // refer to rules declared earlier in the document.
            Map<String, LanguageRule> languageRuleMap =
                    new HashMap<String, LanguageRule>();

            // State of the element currently being read.
            LanguageRule languageRule = null;
            boolean breaking = false;
            String before = null;
            String after = null;
            boolean inBefore = false;
            boolean inAfter = false;

            while (eventReader.hasNext()) {
                XMLEvent event = eventReader.nextEvent();

                if (event.isStartElement()) {
                    StartElement startElement = event.asStartElement();
                    QName elementName = startElement.getName();

                    if (HEADER_ELEMENT.equals(elementName)) {
                        // Cascade is on only for an explicit cascade="yes".
                        Attribute attribute =
                                startElement.getAttributeByName(CASCADE_ATTRIBUTE);
                        boolean cascade = attribute != null
                                && "yes".equals(attribute.getValue());
                        document.setCascade(cascade);
                    } else if (LANGUAGERULE_ELEMENT.equals(elementName)) {
                        Attribute attribute =
                                startElement.getAttributeByName(LANGUAGERULENAME_ATTRIBUTE);
                        languageRule = new LanguageRule(attribute.getValue());
                    } else if (RULE_ELEMENT.equals(elementName)) {
                        // A rule is a break rule unless it says break="no".
                        Attribute attribute =
                                startElement.getAttributeByName(BREAKING_ATTRIBUTE);
                        breaking = attribute == null
                                || !"no".equals(attribute.getValue());
                    } else if (BEFOREBREAK_ELEMENT.equals(elementName)) {
                        inBefore = true;
                        before = "";
                    } else if (AFTERBREAK_ELEMENT.equals(elementName)) {
                        inAfter = true;
                        after = "";
                    } else if (LANGUAGEMAP_ELEMENT.equals(elementName)) {
                        Attribute patternAttribute =
                                startElement.getAttributeByName(LANGUAGEPATTERN_ATTRIBUTE);
                        Attribute nameAttribute =
                                startElement.getAttributeByName(LANGUAGERULENAME_ATTRIBUTE);
                        String pattern = patternAttribute.getValue();
                        String name = nameAttribute.getValue();
                        LanguageRule mappedLanguageRule = languageRuleMap.get(name);
                        if (mappedLanguageRule == null) {
                            log.warn("Language map \"" + pattern
                                    + "\": language rule \"" + name
                                    + "\" not found.");
                        } else {
                            document.addLanguageMap(pattern, mappedLanguageRule);
                        }
                    }
                }

                if (event.isEndElement()) {
                    EndElement endElement = event.asEndElement();
                    QName elementName = endElement.getName();

                    if (LANGUAGERULE_ELEMENT.equals(elementName)) {
                        languageRuleMap.put(languageRule.getName(), languageRule);
                        languageRule = null;
                    } else if (RULE_ELEMENT.equals(elementName)) {
                        languageRule.addRule(new Rule(breaking, before, after));
                        // Reset per-rule state; an absent <beforebreak> or
                        // <afterbreak> is passed to the next Rule as null.
                        breaking = false;
                        before = null;
                        after = null;
                    } else if (BEFOREBREAK_ELEMENT.equals(elementName)) {
                        inBefore = false;
                    } else if (AFTERBREAK_ELEMENT.equals(elementName)) {
                        inAfter = false;
                    }
                }

                if (event.isCharacters()) {
                    // Element text may arrive in several chunks; append each.
                    String data = event.asCharacters().getData();
                    if (inBefore) {
                        before = before + data;
                    }
                    if (inAfter) {
                        after = after + data;
                    }
                }
            }

            return document;
        } catch (XMLStreamException e) {
            throw new XmlException("Error parsing document", e);
        } finally {
            closeQuietly(eventReader);
        }
    }

    /**
     * Closes the event reader if one was created; a failure to close is only
     * logged so that it cannot mask the outcome of parsing.
     */
    private static void closeQuietly(XMLEventReader eventReader) {
        if (eventReader == null) {
            return;
        }
        try {
            eventReader.close();
        } catch (XMLStreamException e) {
            log.debug("Error closing XML event reader", e);
        }
    }

}
 segment-1.3.5~svn57+dfsg/build.properties0000644000175000017500000000057411223475357017076 0ustar railrailproject.name=segment project.version=1.3 project.suffix= project.fullversion=${project.version}.${build.number}${project.suffix} project.fullname=${project.name}-${project.fullversion} project.authors=Jarek Lipski generate.schema=res/net/sourceforge/segment/res/xml/srx20.xsd generate.package=net.sourceforge.segment.srx.io.bind generate.dir=net/sourceforge/segment/srx/io/bind segment-1.3.5~svn57+dfsg/res/0000755000175000017500000000000011300444112014421 5ustar railrailsegment-1.3.5~svn57+dfsg/res/net/0000755000175000017500000000000011300444112015209 5ustar railrailsegment-1.3.5~svn57+dfsg/res/net/sourceforge/0000755000175000017500000000000011300444112017532 5ustar 
railrailsegment-1.3.5~svn57+dfsg/res/net/sourceforge/segment/0000755000175000017500000000000011300444112021174 5ustar railrailsegment-1.3.5~svn57+dfsg/res/net/sourceforge/segment/res/0000755000175000017500000000000011300444112021765 5ustar railrailsegment-1.3.5~svn57+dfsg/res/net/sourceforge/segment/res/xml/0000755000175000017500000000000011300444112022565 5ustar railrailsegment-1.3.5~svn57+dfsg/res/net/sourceforge/segment/res/xml/srx10.xsd0000644000175000017500000001007611255767162024314 0ustar railrail segment-1.3.5~svn57+dfsg/res/net/sourceforge/segment/res/xml/strip-space.xsl0000644000175000017500000000045611171331077025567 0ustar railrail segment-1.3.5~svn57+dfsg/res/net/sourceforge/segment/res/xml/srx20.xsd0000644000175000017500000003235411255767162024320 0ustar railrail Contains the regular expression to match before the segment break Contains the regular expression to match after the segment break SRX body Determines which side of the segment break that formatting information goes A value of "no" indicates that the format code does not belong to the segment being created. A value of "yes" indicates that the format code belongs to the segment being created. The type of format for which behaviour is being defined. Can be "start", "end" or "isolated". 
SRX header Determines whether text subflows should be segmented Determines whether a matching <languagemap> element should terminate the search Maps one or more languages to a set of rules The name of the language rule to use when the languagepattern regular expression is satisfied The regular expression pattern match for the language code A set of rules for a logical set of languages The name of the language rule Contains all the logical sets of rules A set of language maps A break/no break rule Determines whether this is a segment break or an exception rule OSCAR Segmentation Rules eXchange The version of SRX segment-1.3.5~svn57+dfsg/res/net/sourceforge/segment/res/xml/srx10.xsl0000644000175000017500000000771611171331077024320 0ustar railrail Map rule "" not found. segment-1.3.5~svn57+dfsg/res/net/sourceforge/segment/res/xml/default.srx0000644000175000017500000000152411171331077024764 0ustar railrail

[\.\?!]+ \s+\p{Lu} \n segment-1.3.5~svn57+dfsg/res/net/sourceforge/segment/res/test/0000755000175000017500000000000011300444112022744 5ustar railrailsegment-1.3.5~svn57+dfsg/res/net/sourceforge/segment/res/test/some.xml0000644000175000017500000000015611171331077024446 0ustar railrail Phillip K. Dick Ubik SF segment-1.3.5~svn57+dfsg/res/net/sourceforge/segment/res/test/ticket1.srx0000644000175000017500000000136311201015676025062 0ustar railrail
[\.!?…]['»"”\)\]\}]?\u0002?\s
segment-1.3.5~svn57+dfsg/res/net/sourceforge/segment/res/test/example1.srx0000644000175000017500000000347411171331077025241 0ustar railrail
^\s*[0-9]+\. \s [Ee][Tt][Cc]\. \s[a-z] \sMr\. \s [\.\?!]+ \s \n ^\s*[0-9]+\. \s [Ee][Tt][Cc]\. [\.\?!]+ \s [\xff61\x3002\xff0e\xff1f\xff01]+ \n
segment-1.3.5~svn57+dfsg/res/net/sourceforge/segment/res/test/example.srx0000644000175000017500000000701211171331077025150 0ustar railrail
^\s*[0-9]+\. \s \n [\.\?!]+ \s \s[Ee][Tt][Cc]\. \s[a-z] \sMr\. \s \sU\.K\. \s \s[Mm]lle\. \s \s[Mm]lles\. \s \s[Mm]me\. \s \s[Mm]mes\. \s [\xff61\x3002\xff0e\xff1f\xff01]+
segment-1.3.5~svn57+dfsg/res/net/sourceforge/segment/res/test/example2.srx0000644000175000017500000000332011171331077025230 0ustar railrail
^\s*[0-9]+\. \s [Ee][Tt][Cc]\. \s[a-z] \sMr\. \s [\.\?!]+ \s \n ^\s*[0-9]+\. \s [Ee][Tt][Cc]\. [\.\?!]+ \s [\xff61\x3002\xff0e\xff1f\xff01]+ \n
segment-1.3.5~svn57+dfsg/res/net/sourceforge/segment/res/test/invalid.srx0000644000175000017500000000031011171331077025135 0ustar railrail segment-1.3.5~svn57+dfsg/build.xml0000644000175000017500000000715611236362500015472 0ustar railrail This program divides a text into segments. segment-1.3.5~svn57+dfsg/bin/0000755000175000017500000000000011300444112014400 5ustar railrailsegment-1.3.5~svn57+dfsg/bin/segment0000755000175000017500000000032311207313634016000 0ustar railrail#!/bin/bash SCRIPT_DIR=`dirname $0` PROJECT_HOME=$SCRIPT_DIR/.. JARS=$PROJECT_HOME/lib/* CLASS=net.sourceforge.segment.ui.console.Segment exec java -cp "$CLASSPATH:$PROJECT_HOME/build/classes:$JARS" $CLASS $* segment-1.3.5~svn57+dfsg/bin/segment.bat0000644000175000017500000000035211207313634016544 0ustar railrail@echo off set SCRIPT_DIR=%~d0%~p0 set PROJECT_HOME=%SCRIPT_DIR%.. set JARS=%PROJECT_HOME%\lib\* set CLASS=net.sourceforge.segment.ui.console.Segment java -cp "%CLASSPATH%";"%PROJECT_HOME%\build\classes";"%JARS%" %CLASS% %* segment-1.3.5~svn57+dfsg/example/0000755000175000017500000000000011300444112015263 5ustar railrailsegment-1.3.5~svn57+dfsg/example/simple-1.0.srx0000644000175000017500000000324711207314041017617 0ustar railrail
\bp\. \s [\.\?!]+ \s+\p{Lu} \n \bMr\. \s [\.\?!]+ \s+\p{Lu} \n [\.\?!]+ \s+\p{Lu} \n
segment-1.3.5~svn57+dfsg/example/simple.srx0000644000175000017500000000226411236362500017326 0ustar railrail
\bp\. \s \bMr\. \s [\.\?!]+ \s+\p{Lu} \n segment-1.3.5~svn57+dfsg/build.number0000644000175000017500000000012111236362500016143 0ustar railrail#Build Number for ANT. Do not edit! #Wed Aug 05 19:59:17 GMT 2009 build.number=5 segment-1.3.5~svn57+dfsg/doc/0000755000175000017500000000000011300444104014376 5ustar railrailsegment-1.3.5~svn57+dfsg/doc/history.html0000644000175000017500000000623511236362500017002 0ustar railrailHistory

History

  • version 0.0 2006-02

    • Project inception.
  • version 0.9 2006-03-28

    • Basic functionality.
    • Simple stream splitter and non-stream SRX splitter.
    • Text interface.
    • Performance tool.
  • version 0.99 2008-06-17

    • Completely rewritten SRX split algorithm - streaming and much faster.
  • version 1.0 2008-08-17

    • Renamed project from Splitter to SRX Splitter to reflect the changes and emphasize SRX importance.
    • Changed Splitter interface to TextIterator which implements Iterator.
    • Removed isReady method from Splitter interface to simplify code and remove unresolved bugs. Threads are better solution to non-blocking streams.
    • Removed simple splitter - it can be easily replaced by SRX splitter with basic rules.
    • Added support for SRX 2.0 in addition to SRX 1.0 along with transformer tool and XSLT stylesheet.
    • Updated documentation and translated it to English.
  • version 1.1 2009-04-15

    • Renamed project from SRX Splitter to Segment.
    • Changed project package from split to net.sourceforge.segment.
    • Added pattern caching.
    • Fixed many bugs thanks to Marcin Miłkowski.
    • Added buildnumber to the version.
  • version 1.2 2009-05-28

    • Changed minimum required Java version from 1.6 to 1.5 to make it work on Macs.
    • Fixed many bugs thanks to Marcin Miłkowski - exception sometimes when text contains space at the end, rule skipping in legacy algorithm, initialization error on Macs and so on.
    • Fixed a bug with break rule applying order - now the rule that will break first is applied independent of order.
    • Changed console interface - now there is just one command named 'segment' to perform all the tasks.
    • Added buildnumber to the version.
    • Added debug information to the sources.
    • Updated the documentation and shortly described the algorithms.
    • Integrated loomchild-util library into this project, so it is no longer its dependency.
  • version 1.3 2009-07-03

    • Created brand new text iterator. It applies break rules in correct order (previous version was incorrect according to specification, algorithm pseudocode). It combines the ideas from previous algorithms so it is accurate and fast (even faster than old one).
    • Fully integrated loomchild-util library code to segment package structure.
    • Added preload, algorithm and output options to text interface.
    • Renamed splitters to Fast, Accurate and Ultimate.
    • Updated javadocs and documentation, described algorithms.
    • Allowed map of additional parameters for text iterators. Added new options to text interface (lookbehind, buffer-length and margin).
segment-1.3.5~svn57+dfsg/doc/readme.html0000644000175000017500000002433711255767162016557 0ustar railrailReadme

Segment

Version @project.fullversion@, Date @build.date@


Table of Contents


Introduction

Segment program is used to split text into segments, for example sentences. Splitting rules are read from SRX file, which is standard format for this task (see Resources).

Requirements

To run the project Java Runtime Environment (JRE) 1.5 is required. To build the project from source Java Software Development Kit (JDK) 1.5 and Ant tool are required. Program should run on any operating system supported by Java. The helper startup scripts were written for Unix and Windows.

Running

To run the program bin/segment script is used. For example on Linux, from main project directory, execute:
bin/segment
On windows, from main directory, it looks like this:
bin\segment
If the script does not work on your operating system, the program can be run directly using Java; look inside the bin/segment script for clues on how to do it.

Source text is read from standard input and resulting segments are written on standard output, one per line. Without parameters text is split using simple, built-in rules. To get help on command line parameters run:
bin/segment -h
The most popular command line is probably:
bin/segment -s rules.srx -l language -i in.txt -o out.txt
Here rules.srx is a file containing splitting rules, language is the language code of the input file, in.txt is an input file and out.txt is an output file. To control the output format, use the -b and -e parameters, which define the strings that will be written before and after each segment (this replaces the standard end of line character).

Performance

To evaluate performance bin/segment -p option can be used. It can measure segmentation time on any data and it is possible to generate data. To generate random text --generate-text option should be used with text length in kilobytes as a parameter. To generate random SRX --generate-srx option should be used with rule count and rule length separated by a comma as a parameter. To repeat segmentation process -2 option should be used. Other option which controls how the text is handled is -r which instructs the application to preload the whole text into memory before segmentation (some algorithms require it). Size of read buffer and therefore memory usage can be controlled by setting --buffer-length option. As a result of performance analysis segmentation time is displayed. Common usage example:
bin/segment -p -2 --generate-text 100 --generate-srx 10,10

Transformation

To automatically convert rule file between old SRX version and current SRX version there is a transformation tool, invoked by bin/segment -t command. By default it reads SRX from standard input and writes transformed SRX to standard output. Usage example:
bin/segment -t -i old.srx -o new.srx
The tool accepts some command line parameters, use bin/segment -h for details. Underneath it uses XSLT stylesheet which can be found in resources directory and used separately with any XSLT processor.

Testing

The program has integrated unit tests. To run them execute:
bin/segment --test.

Data formats

Input

Plain text, UTF-8 encoded.

Output

Plain text, UTF-8 encoded. Some operating system consoles, Windows command prompt for example, have different encoding and special characters will not be displayed correctly. Output files can be opened in text editors because most of them handle UTF-8 encoded files correctly. Each segment is prefixed with string set with -b option (empty by default), and suffixed with string set with -e option (new line character by default).

SRX file

Valid SRX document as defined in SRX specification (see Resources). Both version 1.0 and 2.0 are supported, although version 2.0 is preferred. Currently input is treated as plain text, formatting is not handled specially (contrary to specification). Example SRX files can be found in example/ directory.

Document contains header and body.

Header is currently mostly ignored, only "cascade" attribute is read. It determines if only the first matching language rule is applied (cascade="no"), or all language rules that match language code are applied in the same order as they occur in SRX file (cascade="yes").

Body contains language rules and map rules. Language rules contain break (break="yes") and exception (break="no") rules. Each of those rules can consist of two regular expression elements, <beforebreak> and <afterbreak>, which must match before and after break character respectively, for the rule to be applied. Map rules specify which language rules will be used to segment the text, according to the text language.

Algorithm

The algorithm idea is as follows:

  1. Rule matcher list is created based on SRX file and language. Each rule matcher is responsible for matching before break and after break regular expressions of one break rule.
  2. Each rule matcher is matched to the text. If the rule was not found the rule matcher is removed from the list.
  3. First rule matcher in terms of its break position in text is selected.
  4. List of exception rules corresponding to break rule is retrieved.
  5. If none of exception rules is matching in break position then the text is marked as split and new segment is created. In addition all rule matchers are moved so they start after the end of new segment (which is the same as break position of the matched rule).
  6. All the rules that have break position behind last matched rule break position are moved until they pass it.
  7. If segment was not found the whole process is repeated.

In streaming version of this algorithm character buffer is searched. When the end of it is reached or break position is in the margin (break position > buffer size - margin) and there is more text, the buffer is moved in the text until it starts after last found segment. If this happens rule matchers are reinitialized and the text is searched again. Streaming version has a limitation that read buffer must be at least as long as any segment in the text.

This algorithm uses lookbehind extensively, but Java does not permit unbounded regular expressions in lookbehind, so some patterns must be made finite. For example a* pattern will be changed to something like a{0,100}.

Legacy algorithms

Accurate algorithm

This is the first algorithm that was implemented to perform the segmentation task. It is stable but does not work on text streams, and in a real-world scenario with few break rules and many exception rules it is several times slower than the other algorithms.

At the beginning the rule matcher list is created based on SRX file and language. Each rule matcher is responsible for matching before break and after break regular expressions of one rule (break or exception). Then each rule matcher is matched to the text. If the rule was not found the rule matcher is removed from the list. Next first matching rule (in terms of break point position) is selected. If it is break rule text is split. At the end all the rules that are behind last matched rule are matched until they pass it. The whole process is repeated until the matching rule was found or there are no more rules on the list.

Fast algorithm

This algorithm creates a single large regular expression incorporating all break rules. Then this regular expression is matched to the text. Every time matching is found, all exception rules corresponding to this break rule are checked in this place. If no exception rules match, the text is split.

To create the streaming version of the algorithm, the ReaderCharacterSequence class was implemented. It implements the character sequence interface but reads the text from a stream into an internal buffer. It does not work perfectly - the buffer has a limited size, so for example not all subsequences can be read from it.

This algorithm uses lookbehind extensively, but Java does not permit unbounded regular expressions in lookbehind, so some patterns are made finite. For example a* pattern will be changed to something like a{0,100}.

Resources


This project was written for Poleng company, but now is distributed as Free / Open Source Software. Results were used to write my Master's Thesis. Happy using:)

   -- Jarek Lipski

segment-1.3.5~svn57+dfsg/doc/copying.html0000644000175000017500000000240311207604340016741 0ustar railrailCopying

MIT License

© 2009 Jarek Lipski

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

segment-1.3.5~svn57+dfsg/doc/todo.txt0000644000175000017500000000115511255767162016133 0ustar railrailAdd Jimmy's email to authors Write automatic test with real data. It can measure performance and accuracy in real-life uses. Problem: Need to obtain SRX and text. Maybe expand TextIterator interface by adding next(Writer) method. It can save memory. Implementation can read next fragments in loop instead of doing this just once. WON'T DO ---------------------------------------------------------------------- Maybe add initialize method to TextIterator interface. But then object is not fully initialized after construction. Probably then also would need two methods: initialize() and initialize(parameterMap). segment-1.3.5~svn57+dfsg/doc/authors.html0000644000175000017500000000056511255767162017004 0ustar railrailAuthors

Authors

  • Jarek Lipski (loomchild@rootnode.net) - creation of the project, design and programming
  • Jimmy O'Regan - translation of readme file to English