pax_global_header 0000666 0000000 0000000 00000000064 12344607157 0014523 g ustar 00root root 0000000 0000000 52 comment=bff30be074deb040676b801872d6edc33b9ef9cd
morfologik-stemming-1.9.0+dfsg/ 0000775 0000000 0000000 00000000000 12344607157 0016442 5 ustar 00root root 0000000 0000000 morfologik-stemming-1.9.0+dfsg/.gitignore 0000664 0000000 0000000 00000002076 12344607157 0020437 0 ustar 00root root 0000000 0000000 *.versionsBackup
tmp/
dist/
target/
*.patch
.eclipse/
.project
.classpath
.settings
*.name
*.iml
.idea/compiler.xml
.idea/encodings.xml
.idea/copyright/profiles_settings.xml
.idea/libraries/Maven__com_carrotsearch_hppc_0_5_3.xml
.idea/libraries/Maven__com_carrotsearch_junit_benchmarks_0_7_2.xml
.idea/libraries/proguard.xml
.idea/misc.xml
.idea/modules.xml
.idea/qaplug_profiles.xml
.idea/scopes/scope_settings.xml
.idea/vcs.xml
.idea/workspace.xml
.idea/libraries/Maven__com_carrotsearch_randomizedtesting_randomizedtesting_runner_2_0_13.xml
.idea/libraries/Maven__com_google_guava_guava_15_0.xml
.idea/libraries/Maven__commons_cli_commons_cli_1_2.xml
.idea/libraries/Maven__commons_lang_commons_lang_2_6.xml
.idea/libraries/Maven__junit_junit_4_11.xml
.idea/libraries/Maven__org_easytesting_fest_assert_core_2_0M10.xml
.idea/libraries/Maven__org_easytesting_fest_util_1_2_5.xml
.idea/libraries/Maven__org_hamcrest_hamcrest_core_1_3.xml
.idea/uiDesigner.xml
.idea/inspectionProfiles/Project_Default.xml
.idea/inspectionProfiles/profiles_settings.xml
morfologik-stemming-1.9.0+dfsg/CHANGES 0000664 0000000 0000000 00000035750 12344607157 0017447 0 ustar 00root root 0000000 0000000
Morfologik-stemming Change Log
For an up-to-date CHANGES file see
https://github.com/morfologik/morfologik-stemming/blob/master/CHANGES
======================= morfologik-stemming 1.9.0 =======================
Changes in backwards compatibility policy
New Features
* Added capability to normalize input and output strings for dictionaries.
This is useful for dictionaries that do not support ligatures, for example.
To specify input conversion, use the property 'fsa.dict.input-conversion'
in the .info file. The output conversion (for example, to use ligatures)
is specified by 'fsa.dict.output-conversion'. Note that lengthy
conversion tables may negatively affect performance.
Bug Fixes
Optimizations
* The suggestion search for the speller is now performed directly by traversing
the dictionary automaton, which makes it much more time-efficient (thanks
to Jaume Ortolà).
* Suggestions are generated faster by avoiding unnecessary case conversions.
======================= morfologik-stemming 1.8.3 =======================
Bug Fixes
* Fixed a bug for spelling dictionaries in non-UTF encodings with
separators: strings with non-encodable characters might have been
accepted as spelled correctly even if they were missing in the
dictionary.
======================= morfologik-stemming 1.8.2 =======================
New Features
* Added the option of using frequencies of words for sorting spelling
replacements. It can be used in both spelling and tagging dictionaries.
'fsa.dict.frequency-included=true' must be added to the .info file.
For building the dictionary, add at the end of each entry a separator and
a character between A and Z (A: less frequently used words;
Z: more frequently used words). (Jaume Ortolà)
======================= morfologik-stemming 1.8.1 =======================
Changes in backwards compatibility policy
* MorphEncodingTool will *fail* if it detects data/lines that contain the
separator annotation byte. This is because such lines get encoded into
something that the decoder cannot process. You can use \u0000 as the
annotation byte to avoid clashes with any existing data.
======================= morfologik-stemming 1.8.0 =======================
Changes in backwards compatibility policy
* Command-line option changes to MorphEncodingTool - it now accepts an explicit
name of the sequence encoder, not infix/suffix/prefix booleans.
* Updating dependencies to their newest versions.
New Features
* Dictionary .info files can specify the sequence decoder explicitly:
suffix, prefix, infix, none are supported. For backwards compatibility,
fsa.dict.uses-prefixes, fsa.dict.uses-infixes and fsa.dict.uses-suffixes
are still supported, but will be removed in the next major version.
* Command-line option changes to MorphEncodingTool - it now accepts an explicit
name of the sequence encoder, not infix/suffix/prefix booleans.
* Rewritten implementation of tab-separated data files (tab2morph tool).
The output should yield smaller files, especially for prefix encoding
and infix encoding. This does *not* necessarily mean smaller automata
but we're working on getting these as well.
Example output before and after refactoring:
Prefix coder:
postmodernizm|modernizm|xyz => [before] postmodernizm+ANmodernizm+xyz
=> [after ] postmodernizm+EA+xyz
Infix coder:
laquelle|lequel|D f s => [before] laquelle+AAHequel+D f s
=> [after ] laquelle+AGAquel+D f s
* Changed the default format of the Polish dictionary from infix
encoded to prefix encoded (smaller output size).
Optimizations
* A number of internal implementation cleanups and refactorings.
======================= morfologik-stemming 1.7.2 =======================
* A quick fix for incorrect decoding of certain suffixes (long suffixes).
* Increased max. recursion level in Speller to 6 from 4. (Jaume Ortolà)
======================= morfologik-stemming 1.7.1 =======================
* Fixed a couple of bugs in morfologik-speller (Jaume Ortolà).
======================= morfologik-stemming 1.7.0 =======================
* Changed DictionaryMetadata API (access methods for encoder/decoder).
* Initial version of morfologik-speller component.
* Minor changes to the FSADumpTool: the header block is always UTF-8
encoded, the default platform encoding does not matter. This is done to
always support certain attributes that may be unicode (and would be
incorrectly dumped otherwise).
* Metadata *.info files can now be encoded in UTF-8 to support text
attributes that otherwise would require text2ascii conversion.
======================= morfologik-stemming 1.6.0 =======================
* Update morfologik-polish data to Morfologik 2.0 PoliMorf (08.03.2013).
Deprecated DICTIONARY constants (unified dictionary only).
* Important! The format of encoding tags has changed and is now
multiple-tags-per-lemma. The value returned from WordData#getTag
may be a number of tags concatenated with a "+" character. Previously
the same lemma/stem would be returned multiple times, each time with
a different tag.
* Moving code from SourceForge to github.
======================= morfologik-stemming 1.5.5 =======================
* Made hppc an optional component of morfologik-fsa. It is required
for constructing FSA automata only and causes problems with javac.
http://stackoverflow.com/questions/3800462/can-i-prevent-javac-accessing-the-class-path-from-the-manifests-of-our-third-par
======================= morfologik-stemming 1.5.4 =======================
* Replaced byte-based speller with CharBasedSpeller.
* Warn about UTF-8 files with BOM.
* Fixed a typo in package name (speller).
======================= morfologik-stemming 1.5.3 =======================
* Initial release of spelling correction submodule.
* Updated morfologik-polish data to morfologik 1.9 [12.06.2012]
* Updated morfologik-polish licensing info to BSD (yay).
======================= morfologik-stemming 1.5.2 =======================
* An alternative Polish dictionary added (BSD licensed): SGJP (Morfeusz).
PolishStemmer can now take an enum switching between the dictionary to
be used or combine both.
* Project split into modules. A single jar version (no external
dependencies) added by transforming via proguard.
* Enabled use of escaped special characters in the tab2morph tool.
* Added guards against the input term having separator character
somewhere (this will now return an empty list of matches). Added
getSeparatorChar to DictionaryLookup so that one can check for this
condition manually, if needed.
======================= morfologik-stemming 1.5.1 =======================
* Build system switch to Maven (tested with Maven2).
======================= morfologik-stemming 1.5.0 =======================
* Major size saving improvements in CFSA2. Built in Polish dictionary
size decreased from 2,811,345 to 1,806,661 (CFSA2 format).
* FSABuilder returns a ready-to-be-used FSA (ConstantArcSizeFSA).
Construction overhead for this automaton is a round zero (it is
immediately serialized in-memory).
* Polish dictionary updated to Morfologik 1.7. [19.11.2010]
* Added an option to serialize automaton to CFSA2 or FSA5 directly from
fsa_build.
* CFSA is now deprecated for serialization (the code still reads CFSA
automata, but will not be able to serialize them). Use CFSA2.
* Added immediate state interning. Speedup in automaton construction by
about 30%, memory use decreased significantly (did not perform exact
measurements, but incremental construction from presorted data should
consume way less memory).
* Added an option to build FSA from already sorted data (--sorted).
Avoids in-memory sorting. Pipe the input through shell sort if
building FSA from large data.
* Changed the default ordering from Java signed-byte to C-like unsigned
byte value. This lets one use GNU sort to sort the input using
'export LC_ALL=C; sort input'.
* Added traversal routines to calculate perfect hashing based on
FSA with NUMBERS.
* Changed the order of serialized arcs in the binary serializer for FSA5
to lexicographic (consistent with the input). Depth-first traversal
recreates the input, in other words.
* Removed character-based automata.
* Incompatible API changes to FSA builders (moved to morfologik.fsa).
* Incompatible API changes to FSATraversalHelper. Cleaned up match
types, added unit tests.
* An external dependency HPPC (high performance primitive collections)
is now required
======================= morfologik-stemming 1.4.1 =======================
* Upgrade of the built-in Morfologik dictionary for Polish (in CFSA
format).
* Added options to define custom FILLER and ANNOT_SEPARATOR bytes in the
fsa_build tool.
* Corrected an inconsistency with the C fsa package -- FILLER and
ANNOT_SEPARATOR characters are now identical with the C version.
* Cleanups to the tools' launcher -- will complain about missing JARs,
if any.
======================= morfologik-stemming 1.4.0 =======================
* Added FSA5 construction in Java (on byte sequences). Added preliminary
support for character sequences. Added a command line tool for FSA5
construction from unsorted data (sorting is done in-memory).
* Added a tool to encode tab-delimited dictionaries to the format
accepted by fsa_build and FSA5 construction tool.
* Added a new version of Morfologik dictionary for Polish (in CFSA format).
======================= morfologik-stemming 1.3.0 =======================
* Added runtime checking for tools availability so that unavailable tools
don't show up in the list.
* Recompressed the built-in Polish dictionary to CFSA.
* Cleaned up FSA/Dictionary separation. FSAs don't store encoding any more
(because it does not make sense for them to do so). The FSA is a purely
abstract class pushing functionality to sub-classes. Input stream
reading cleaned up.
* Added initial code for CFSA (compressed FSA). Reduces automata size
about 10%.
* Changes in the public API. Implementation classes renamed (FSAVer5Impl
into FSA5). Major tweaks and tunes to the API.
* Added support for version 5 automata built with NUMBERS flag (an extra
field stored for each node).
======================= morfologik-stemming 1.2.2 =======================
* License switch to plain BSD (removed the patent clause which did not
make much sense anyway).
* The build ZIP now includes licenses for individual JARs (prevents
confusion).
======================= morfologik-stemming 1.2.1 =======================
* Fixed tool launching routines.
======================= morfologik-stemming 1.2.0 =======================
* Package hierarchy reorganized.
* Removed stempel (heuristic stemmer for Polish).
* Code updated to Java 1.5.
* The API has changed in many places (enums instead of constants,
generics, iterables, removed explicit Arc and Node classes and replaced
by int pointers).
* FSA traversal in version 1.2 is implemented on top of primitive data
structures (int pointers) to keep memory usage minimal. The speed
boost gained from this is enormous and justifies less readable code. We
strongly advise to use the provided iterators and helper functions
for matching state sequences in the FSA.
* Tools updated. Dumping existing FSAs is much, much faster now.
======================= morfologik-stemming 1.1.4 =======================
* Fixed a bug that caused UTF-8 dictionaries to be garbled. Now it
should be relatively safe to use UTF-8 dictionaries (note: separators
cannot be multibyte UTF-8 characters, yet this is probably a very
rare case).
======================= morfologik-stemming 1.1.3 =======================
* Fixed a bug causing NPE when the library is called with null context
class loader (happens when JVM is invoked from an JNI-attached
thread). Thanks to Patrick Luby for report and detailed analysis.
* Updated the built-in dictionary to the newest version available.
======================= morfologik-stemming 1.1.2 =======================
* Fixed a bug causing JAR file locking (by implementing a workaround).
* Fixed the build script (manifest file was broken).
======================= morfologik-stemming 1.1.1 =======================
* Distribution script fixes. The final JAR does not contain test classes
and resources. Size trimmed almost twice compared to release 1.1.
* Updated the dump tool to accept dictionary metadata files.
======================= morfologik-stemming 1.1 =========================
* Introduced auxiliary "meta" information files about compressed
dictionaries. Such information includes the delimiter symbol, encoding
and infix/prefix/postfix decoding info.
* The API has changed (repackaging). Some deprecated methods have been
removed. This is a major redesign/upgrade; you will have to adjust
your source code.
* Cleaned up APIs and interfaces.
* Added infrastructure for command-line tool launching.
* Cleaned up tests.
* Changed project name to morfologik-stemmers and ownership to
(c) Morfologik.
======================= morfologik-stemming 1.0.7 =======================
* Removed one bug in fsa 'compression' decoding.
======================= morfologik-stemming 1.0.6 =======================
* Customized version of stempel replaced with a standard distribution.
* Removed deprecated methods and classes.
* Added infix and prefix encoding support for fsa dictionaries.
======================= morfologik-stemming 1.0.5 =======================
* Added filler and separator char dumps to FSADump.
* A major bug in automaton traversal corrected. Upgrade when possible.
* Certain API changes were introduced; older methods are now deprecated
and will be removed in the future.
======================= morfologik-stemming 1.0.4 =======================
* Licenses for full and no-dict versions.
======================= morfologik-stemming 1.0.3 =======================
* Project code moved to SourceForge (subproject of Morfologik).
LICENSE CHANGED FROM PUBLIC DOMAIN TO BSD (doesn't change much, but
clarifies legal issues).
======================= morfologik-stemming 1.0.2 =======================
* Added a Lametyzator constructor which allows custom dictionary stream,
field delimiters and encoding. Added an option for building stand-alone
JAR that does not include the default polish dictionary.
======================= morfologik-stemming 1.0.1 =======================
* Code cleanups. Added a method that returns the third automaton's column
(form).
======================= morfologik-stemming 1.0 =========================
* Initial release
morfologik-stemming-1.9.0+dfsg/CONTRIBUTOR 0000664 0000000 0000000 00000003324 12344607157 0020201 0 ustar 00root root 0000000 0000000
Compiling
=========
You will need maven. Then, in the top folder of the checkout:
mvn clean install
or
mvn clean install -Pquick # no tests
Eclipse
=======
We highly recommend using m2eclipse plugin and importing all projects directly
into Eclipse as Maven projects ("maven nature").
Compiling dictionaries
======================
A pipeline for compiling plain text dictionary data into automata:
1) Prepare tab-delimited input file with the following columns:
inflected-form base-form annotation. An example:
niebabińscy niebabiński adj:pl:nom.voc:m1.p1:pos
niebabińska niebabiński adj:sg:nom.voc:f:pos
niebabiński niebabiński adj:sg:acc:m3:pos
2) The above tab-delimited input can be preprocessed
to conflate shared affixes (helps in subsequent FSA compression):
java -jar morfologik-tools-*-standalone.jar tab2morph --coder INFIX --input ~/tmp/input.txt > intermediate.txt
3) Compile FSA from the intermediate format:
java -jar morfologik-tools-*-standalone.jar fsa_build --input intermediate.txt --progress > output.fsa
4) You should add output.info file specifying character encoding and additional
licensing information. See examples (Polish dictionaries).
More info:
http://languagetool.wikidot.com/developing-a-tagger-dictionary
Sonatype/ release push
======================
# snapshot deploy, create single-JAR version, javadocs, etc.
mvn clean deploy -Prelease
# ZIP with full release artifacts
mvn clean deploy -Prelease,distribution
# ZIP with full release artifacts for sourceforge.net
mvn clean install -Prelease,distribution
# For final releases, GPG sign.
mvn clean deploy -Prelease,distribution,sign
morfologik-stemming-1.9.0+dfsg/README 0000775 0000000 0000000 00000002661 12344607157 0017332 0 ustar 00root root 0000000 0000000
MORFOLOGIK
==========
FSA (automata), stemming, dictionaries and tools. Tools quickstart:
java -jar lib/morfologik-tools-${version}-standalone.jar
MODULES
=======
This project provides:
morfologik-fsa:
- Creation of byte-based, efficient finite state automata in Java, including
custom, efficient data storage formats.
- Compatibility with FSA5, binary format of finite state automata produced by
Jan Daciuk's "fsa" package.
morfologik-stemming:
- FSA-based stemming interfaces and dictionary metadata.
morfologik-polish:
- Precompiled dictionary of inflected forms, stems and tags for the Polish
language built on top of a large dictionary.
morfologik-tools:
- Command line tools to preprocess, build and dump FSA automata and dictionaries.
- There are a few command-line tools you may find useful. Type:
java -jar lib/morfologik-tools-${version}.jar
for an up-to-date list of all tools.
morfologik-speller:
- Simplistic automaton-based spelling correction (suggester).
AUTHORS
=======
Marcin Miłkowski (http://marcinmilkowski.pl) [linguistic data lead, code]
Dawid Weiss (http://www.dawidweiss.com) [fsa lead, code]
CONTRIBUTORS
============
Grzegorz Słowikowski [initial maven configs]
QUESTIONS, COMMENTS
===================
Web site: http://www.morfologik.blogspot.com
Mailing list: morfologik-devel@lists.sourceforge.net
morfologik-stemming-1.9.0+dfsg/TODO 0000664 0000000 0000000 00000000000 12344607157 0017120 0 ustar 00root root 0000000 0000000 morfologik-stemming-1.9.0+dfsg/licenses/ 0000775 0000000 0000000 00000000000 12344607157 0020247 5 ustar 00root root 0000000 0000000 morfologik-stemming-1.9.0+dfsg/licenses/commons-cli.LICENSE 0000664 0000000 0000000 00000026136 12344607157 0023503 0 ustar 00root root 0000000 0000000 Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
morfologik-stemming-1.9.0+dfsg/licenses/commons-lang.LICENSE 0000664 0000000 0000000 00000026136 12344607157 0023655 0 ustar 00root root 0000000 0000000 Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
morfologik-stemming-1.9.0+dfsg/licenses/hppc.LICENSE 0000664 0000000 0000000 00000026136 12344607157 0022215 0 ustar 00root root 0000000 0000000
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
morfologik-stemming-1.9.0+dfsg/licenses/morfologik-polish.LICENSE 0000664 0000000 0000000 00000002524 12344607157 0024722 0 ustar 00root root 0000000 0000000 BSD-licensed dictionary of Polish (Morfologik)
Copyright (c) 2013, Marcin Miłkowski
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
morfologik-stemming-1.9.0+dfsg/morfologik-distribution/ 0000775 0000000 0000000 00000000000 12344607157 0023327 5 ustar 00root root 0000000 0000000 morfologik-stemming-1.9.0+dfsg/morfologik-distribution/pom.xml 0000664 0000000 0000000 00000006322 12344607157 0024647 0 ustar 00root root 0000000 0000000
Note: Serialize to {@link CFSA2} for new code.
* *The encoding of automaton body is as follows.
* ** ---- FSA header (standard) * Byte Description * +-+-+-+-+-+-+-+-+\ * 0 | | | | | | | | | +------ '\' * +-+-+-+-+-+-+-+-+/ * +-+-+-+-+-+-+-+-+\ * 1 | | | | | | | | | +------ 'f' * +-+-+-+-+-+-+-+-+/ * +-+-+-+-+-+-+-+-+\ * 2 | | | | | | | | | +------ 's' * +-+-+-+-+-+-+-+-+/ * +-+-+-+-+-+-+-+-+\ * 3 | | | | | | | | | +------ 'a' * +-+-+-+-+-+-+-+-+/ * +-+-+-+-+-+-+-+-+\ * 4 | | | | | | | | | +------ version (fixed 0xc5) * +-+-+-+-+-+-+-+-+/ * +-+-+-+-+-+-+-+-+\ * 5 | | | | | | | | | +------ filler character * +-+-+-+-+-+-+-+-+/ * +-+-+-+-+-+-+-+-+\ * 6 | | | | | | | | | +------ annot character * +-+-+-+-+-+-+-+-+/ * +-+-+-+-+-+-+-+-+\ * 7 |C|C|C|C|G|G|G|G| +------ C - node data size (ctl), G - address size (gotoLength) * +-+-+-+-+-+-+-+-+/ * +-+-+-+-+-+-+-+-+\ * 8-32 | | | | | | | | | +------ labels mapped for type (1) of arc encoding. * : : : : : : : : : | * +-+-+-+-+-+-+-+-+/ * * ---- Start of a node; only if automaton was compiled with NUMBERS option. * * Byte * +-+-+-+-+-+-+-+-+\ * 0 | | | | | | | | | \ LSB * +-+-+-+-+-+-+-+-+ + * 1 | | | | | | | | | | number of strings recognized * +-+-+-+-+-+-+-+-+ +----- by the automaton starting * : : : : : : : : : | from this node. * +-+-+-+-+-+-+-+-+ + * ctl-1 | | | | | | | | | / MSB * +-+-+-+-+-+-+-+-+/ * * ---- A vector of node's arcs. Conditional format, depending on flags. * * 1) NEXT bit set, mapped arc label. * * +--------------- arc's label mapped in M bits if M's field value > 0 * | +------------- node pointed to is next * | | +----------- the last arc of the node * _______| | | +--------- the arc is final * / | | | | * +-+-+-+-+-+-+-+-+\ * 0 |M|M|M|M|M|1|L|F| +------ flags + (M) index of the mapped label. * +-+-+-+-+-+-+-+-+/ * * 2) NEXT bit set, label separate. * * +--------------- arc's label stored separately (M's field is zero). 
* | +------------- node pointed to is next * | | +----------- the last arc of the node * | | | +--------- the arc is final * | | | | * +-+-+-+-+-+-+-+-+\ * 0 |0|0|0|0|0|1|L|F| +------ flags * +-+-+-+-+-+-+-+-+/ * +-+-+-+-+-+-+-+-+\ * 1 | | | | | | | | | +------ label * +-+-+-+-+-+-+-+-+/ * * 3) NEXT bit not set. Full arc. * * +------------- node pointed to is next * | +----------- the last arc of the node * | | +--------- the arc is final * | | | * +-+-+-+-+-+-+-+-+\ * 0 |A|A|A|A|A|0|L|F| +------ flags + (A) address field, lower bits * +-+-+-+-+-+-+-+-+/ * +-+-+-+-+-+-+-+-+\ * 1 | | | | | | | | | +------ label * +-+-+-+-+-+-+-+-+/ * : : : : : : : : : * +-+-+-+-+-+-+-+-+\ * gtl-1 |A|A|A|A|A|A|A|A| +------ address, continuation (MSB) * +-+-+-+-+-+-+-+-+/ **/ public final class CFSA extends FSA { /** * Automaton header version value. */ public static final byte VERSION = (byte) 0xC5; /** * Bitmask indicating that an arc corresponds to the last character of a * sequence available when building the automaton. */ public static final int BIT_FINAL_ARC = 1 << 0; /** * Bitmask indicating that an arc is the last one of the node's list and the * following one belongs to another node. */ public static final int BIT_LAST_ARC = 1 << 1; /** * Bitmask indicating that the target node of this arc follows it in the * compressed automaton structure (no goto field). */ public static final int BIT_TARGET_NEXT = 1 << 2; /** * An array of bytes with the internal representation of the automaton. * Please see the documentation of this class for more information on how * this structure is organized. */ public byte[] arcs; /** * The length of the node header structure (if the automaton was compiled with *
NUMBERS
option). Otherwise zero.
*/
public final int nodeDataLength;
/**
* Flags for this automaton version.
*/
private final Set0
if
* the start node is also an end node.
*/
@Override
public int getRootNode() {
    // The serialized automaton begins with a dummy node marking the
    // terminating state; hop over it to reach the epsilon node.
    final int afterEpsilon = skipArc(getFirstArc(0));
    // The epsilon node carries exactly one arc; its target is the root.
    return getDestinationNodeOffset(getFirstArc(afterEpsilon));
}
/**
 * Returns the offset of the first arc of {@code node}: arcs begin right
 * after the optional per-node data header of {@code nodeDataLength} bytes.
 */
@Override
public final int getFirstArc(int node) {
    return node + nodeDataLength;
}
/**
 * Returns the offset of the arc following {@code arc} within the same node,
 * or {@code 0} if {@code arc} is the node's last arc.
 */
@Override
public final int getNextArc(int arc) {
    return isArcLast(arc) ? 0 : skipArc(arc);
}
/**
 * Linearly scans the arcs of {@code node} looking for one labeled
 * {@code label}. Returns that arc's offset, or {@code 0} if absent.
 */
@Override
public int getArc(int node, byte label) {
    int arc = getFirstArc(node);
    while (arc != 0) {
        if (label == getArcLabel(arc)) {
            return arc;
        }
        arc = getNextArc(arc);
    }
    // No arc with the requested label.
    return 0;
}
/**
 * Returns the offset of the node this arc points to.
 *
 * @throws RuntimeException if {@code arc} is a terminal arc (points nowhere).
 */
@Override
public int getEndNode(int arc) {
    final int target = getDestinationNodeOffset(arc);
    if (target == 0) {
        throw new RuntimeException("This is a terminal arc [" + arc + "]");
    }
    return target;
}
/**
 * Returns the label of {@code arc}. For arcs with the NEXT bit set the
 * label may be compressed into bits 3..7 of the flags byte (an index into
 * the label mapping table); otherwise it occupies the following byte.
 */
@Override
public byte getArcLabel(int arc) {
    final boolean compressed = isNextSet(arc) && isLabelCompressed(arc);
    return compressed
        ? this.labelMapping[(arcs[arc] >>> 3) & 0x1f]
        : arcs[arc + 1];
}
/**
 * {@inheritDoc}
 *
 * <p>Decodes the right-language count stored in the node's data header
 * ({@code nodeDataLength} bytes preceding the arcs). Only meaningful when
 * the automaton was compiled with {@link FSAFlags#NUMBERS}.
 */
@Override
public int getRightLanguageCount(int node) {
assert getFlags().contains(FSAFlags.NUMBERS): "This FSA was not compiled with NUMBERS.";
return FSA5.decodeFromBytes(arcs, node, nodeDataLength);
}
/**
 * Returns {@code true} if {@code arc} corresponds to the last character of
 * an input sequence (an accepting transition).
 */
@Override
public boolean isArcFinal(int arc) {
    return (arcs[arc] & BIT_FINAL_ARC) == BIT_FINAL_ARC;
}
/**
 * Returns {@code true} if {@code arc} points at no node at all (its
 * destination offset is zero).
 */
@Override
public boolean isArcTerminal(int arc) {
    return getDestinationNodeOffset(arc) == 0;
}
/**
 * Returns {@code true} if this arc is the last one of its node's arc list,
 * i.e. has the {@code LAST} bit set. (The previous comment said "NEXT bit",
 * which contradicted the code below — the check is on {@link #BIT_LAST_ARC}.)
 *
 * @see #BIT_LAST_ARC
 */
public boolean isArcLast(int arc) {
return (arcs[arc] & BIT_LAST_ARC) != 0;
}
/**
 * Returns {@code true} if this arc's target node immediately follows it in
 * the serialized automaton (the {@code NEXT} bit is set, so no goto field
 * is stored).
 *
 * @see #BIT_TARGET_NEXT
 */
public boolean isNextSet(int arc) {
return (arcs[arc] & BIT_TARGET_NEXT) != 0;
}
/**
 * Returns {@code true} if the label is compressed inside the flags byte:
 * bits 3..7 hold a non-zero index into the label mapping table.
 * Only applicable to arcs with the {@code NEXT} bit set.
 */
public boolean isLabelCompressed(int arc) {
assert isNextSet(arc) : "Only applicable to arcs with NEXT bit.";
// (-1 << 3) masks everything above the three flag bits; sign extension of
// the byte only adds high bits when bit 7 is already set, so the result is
// non-zero exactly when the 5-bit label index is non-zero.
return (arcs[arc] & (-1 << 3)) != 0;
}
/**
* {@inheritDoc}
*
* For this automaton version, an additional {@link FSAFlags#NUMBERS} flag * may be set to indicate the automaton contains extra fields for each node.
*/ public SetThe encoding of automaton body is as follows.
* ** ---- CFSA header * Byte Description * +-+-+-+-+-+-+-+-+\ * 0 | | | | | | | | | +------ '\' * +-+-+-+-+-+-+-+-+/ * +-+-+-+-+-+-+-+-+\ * 1 | | | | | | | | | +------ 'f' * +-+-+-+-+-+-+-+-+/ * +-+-+-+-+-+-+-+-+\ * 2 | | | | | | | | | +------ 's' * +-+-+-+-+-+-+-+-+/ * +-+-+-+-+-+-+-+-+\ * 3 | | | | | | | | | +------ 'a' * +-+-+-+-+-+-+-+-+/ * +-+-+-+-+-+-+-+-+\ * 4 | | | | | | | | | +------ version (fixed 0xc6) * +-+-+-+-+-+-+-+-+/ * +-+-+-+-+-+-+-+-+\ * 5 | | | | | | | | | +----\ * +-+-+-+-+-+-+-+-+/ \ flags [MSB first] * +-+-+-+-+-+-+-+-+\ / * 6 | | | | | | | | | +----/ * +-+-+-+-+-+-+-+-+/ * +-+-+-+-+-+-+-+-+\ * 7 | | | | | | | | | +------ label lookup table size * +-+-+-+-+-+-+-+-+/ * +-+-+-+-+-+-+-+-+\ * 8-32 | | | | | | | | | +------ label value lookup table * : : : : : : : : : | * +-+-+-+-+-+-+-+-+/ * * ---- Start of a node; only if automaton was compiled with NUMBERS option. * * Byte * +-+-+-+-+-+-+-+-+\ * 0 | | | | | | | | | \ * +-+-+-+-+-+-+-+-+ + * 1 | | | | | | | | | | number of strings recognized * +-+-+-+-+-+-+-+-+ +----- by the automaton starting * : : : : : : : : : | from this node. v-coding * +-+-+-+-+-+-+-+-+ + * | | | | | | | | | / * +-+-+-+-+-+-+-+-+/ * * ---- A vector of this node's arcs. An arc's layout depends on the combination of flags. * * 1) NEXT bit set, mapped arc label. * * +----------------------- node pointed to is next * | +--------------------- the last arc of the node * | | +------------------- this arc leads to a final state (acceptor) * | | | _______+--------- arc's label; indexed if M > 0, otherwise explicit label follows * | | | / | | | | * +-+-+-+-+-+-+-+-+\ * 0 |N|L|F|M|M|M|M|M| +------ flags + (M) index of the mapped label. 
* +-+-+-+-+-+-+-+-+/ * +-+-+-+-+-+-+-+-+\ * 1 | | | | | | | | | +------ optional label if M == 0 * +-+-+-+-+-+-+-+-+/ * : : : : : : : : : * +-+-+-+-+-+-+-+-+\ * |A|A|A|A|A|A|A|A| +------ v-coded goto address * +-+-+-+-+-+-+-+-+/ **/ public final class CFSA2 extends FSA { /** * Automaton header version value. */ public static final byte VERSION = (byte) 0xc6; /** * The target node of this arc follows the last arc of the current state * (no goto field). */ public static final int BIT_TARGET_NEXT = 1 << 7; /** * The arc is the last one from the current node's arcs list. */ public static final int BIT_LAST_ARC = 1 << 6; /** * The arc corresponds to the last character of a sequence * available when building the automaton (acceptor transition). */ public static final int BIT_FINAL_ARC = 1 << 5; /** * The count of bits assigned to storing an indexed label. */ static final int LABEL_INDEX_BITS = 5; /** * Masks only the M bits of a flag byte. */ static final int LABEL_INDEX_MASK = (1 << LABEL_INDEX_BITS) - 1; /** * Maximum size of the labels index. */ static final int LABEL_INDEX_SIZE = (1 << LABEL_INDEX_BITS) - 1; /** * An array of bytes with the internal representation of the automaton. * Please see the documentation of this class for more information on how * this structure is organized. */ public byte[] arcs; /** * Flags for this automaton version. */ private final EnumSet
true
states are prepended with numbers.
*/
private final boolean hasNumbers;
/**
* Epsilon node's offset.
*/
private final int epsilon = 0;
/**
 * Reads an automaton from a byte stream in {@code CFSA2} format: magic
 * bytes, version byte, 16-bit flags, label mapping table, then the raw
 * arcs data until end of stream.
 *
 * @param in the stream to read the automaton from; read to exhaustion.
 * @throws IOException if the magic bytes, version or flags are invalid,
 *         or on any underlying read failure.
 */
public CFSA2(InputStream in) throws IOException {
// Read the header first.
if (FSAHeader.FSA_MAGIC != FileUtils.readInt(in))
throw new IOException("Invalid file header magic bytes.");
// Ensure we have the correct version.
final int version = FileUtils.readByte(in);
if (version != VERSION) {
throw new IOException("This class can only read FSA version: " + VERSION);
}
// Read flags.
short flagBits = FileUtils.readShort(in);
flags = EnumSet.noneOf(FSAFlags.class);
for (FSAFlags f : FSAFlags.values()) {
if (FSAFlags.isSet(flagBits, f))
flags.add(f);
}
// Any bit not accounted for by a known FSAFlags constant is an error.
if (flagBits != FSAFlags.asShort(flags))
throw new IOException("Unrecognized flags remained: 0x" + Integer.toHexString(flagBits));
this.hasNumbers = flags.contains(FSAFlags.NUMBERS);
/*
 * Read mapping dictionary.
 */
// & 0xff: the table size byte is unsigned (0..255).
int labelMappingSize = FileUtils.readByte(in) & 0xff;
labelMapping = new byte[labelMappingSize];
readFully(in, labelMapping);
/*
 * Read arcs' data.
 */
arcs = readFully(in);
}
/**
 * Returns the root node's offset: follow the single arc leaving the dummy
 * epsilon node (which marks the terminating state).
 */
@Override
public int getRootNode() {
    final int epsilonArc = getFirstArc(epsilon);
    return getDestinationNodeOffset(epsilonArc);
}
/**
 * Returns the offset of the first arc of {@code node}. When the automaton
 * was compiled with NUMBERS, each node is prefixed with a v-coded count
 * that must be skipped first.
 */
@Override
public final int getFirstArc(int node) {
    return hasNumbers ? skipVInt(node) : node;
}
/**
 * Returns the offset of the arc following {@code arc} within the same node,
 * or {@code 0} if {@code arc} is the node's last arc.
 */
@Override
public final int getNextArc(int arc) {
    return isArcLast(arc) ? 0 : skipArc(arc);
}
/**
 * Linearly scans the arcs of {@code node} looking for one labeled
 * {@code label}. Returns that arc's offset, or {@code 0} if absent.
 */
@Override
public int getArc(int node, byte label) {
    int arc = getFirstArc(node);
    while (arc != 0) {
        if (label == getArcLabel(arc)) {
            return arc;
        }
        arc = getNextArc(arc);
    }
    // No arc with the requested label.
    return 0;
}
/**
 * Returns the offset of the node this arc points to. Must not be called on
 * a terminal arc (assertion-checked only).
 */
@Override
public int getEndNode(int arc) {
    final int target = getDestinationNodeOffset(arc);
    assert target != 0 : "Can't follow a terminal arc: " + arc;
    assert target < arcs.length : "Node out of bounds.";
    return target;
}
/**
 * Returns the label of {@code arc}. A non-zero M field in the flags byte is
 * an index into the label mapping table; otherwise the label is stored
 * explicitly in the byte after the flags.
 */
@Override
public byte getArcLabel(int arc) {
    final int mapped = arcs[arc] & LABEL_INDEX_MASK;
    return mapped != 0 ? this.labelMapping[mapped] : arcs[arc + 1];
}
/**
 * {@inheritDoc}
 *
 * <p>Decodes the v-coded right-language count stored at the start of the
 * node. Only meaningful when the automaton was compiled with
 * {@link FSAFlags#NUMBERS}.
 */
@Override
public int getRightLanguageCount(int node) {
assert getFlags().contains(FSAFlags.NUMBERS): "This FSA was not compiled with NUMBERS.";
return readVInt(arcs, node);
}
/**
 * Returns {@code true} if {@code arc} corresponds to the last character of
 * an input sequence (an accepting transition).
 */
@Override
public boolean isArcFinal(int arc) {
    return (arcs[arc] & BIT_FINAL_ARC) == BIT_FINAL_ARC;
}
/**
 * {@inheritDoc}
 */
@Override
public boolean isArcTerminal(int arc) {
    // A destination offset of zero denotes the (virtual) terminal state.
    return (0 == getDestinationNodeOffset(arc));
}
/**
 * Returns <code>true</code> if this arc has the <code>LAST</code> bit set,
 * i.e. it is the last arc of its node.
 *
 * @see #BIT_LAST_ARC
 */
public boolean isArcLast(int arc) {
    return (arcs[arc] & BIT_LAST_ARC) != 0;
}
/**
 * Returns <code>true</code> if this arc's target node directly follows the
 * arc in the serialized structure (no explicit goto field is stored).
 *
 * @see #BIT_TARGET_NEXT
 */
public boolean isNextSet(int arc) {
    return (arcs[arc] & BIT_TARGET_NEXT) != 0;
}
/**
* {@inheritDoc}
*/
public Set* It is possible to serialize the automaton with numbers required for perfect * hashing. See {@link #withNumbers()} method. *
* * @see CFSA2 * @see FSA#read(java.io.InputStream) */ public final class CFSA2Serializer implements FSASerializer { /** * Supported flags. */ private final static EnumSettrue
if we should serialize with numbers.
*
* @see #withNumbers()
*/
private boolean withNumbers;
/**
* A hash map of [state, offset] pairs.
*/
private IntIntOpenHashMap offsets = new IntIntOpenHashMap();
/**
* A hash map of [state, right-language-count] pairs.
*/
private IntIntOpenHashMap numbers = new IntIntOpenHashMap();
/**
* Scratch array for serializing vints.
*/
private final byte [] scratch = new byte [5];
/**
* The most frequent labels for integrating with the flags field.
*/
private byte [] labelsIndex;
/**
* Inverted index of labels to be integrated with flags field. A label
* at index i has the index or zero (no integration).
*/
private int [] labelsInvIndex;
/**
* Logger for progress.
*/
private IMessageLogger logger = new NullMessageLogger();
/**
 * Serialize the automaton with the number of right-language sequences in
 * each node. This is required to implement perfect hashing. The numbering
 * also preserves the order of input sequences.
 *
 * @return Returns the same object for easier call chaining.
 */
public CFSA2Serializer withNumbers() {
    // Flag is consulted by serialize() and emitNodeData().
    withNumbers = true;
    return this;
}
/**
 * Serializes any {@link FSA} to {@link CFSA2} stream.
 *
 * <p>Restores the generic bound <code>&lt;T extends OutputStream&gt;</code>
 * on the type parameter, which was missing from the declaration.</p>
 *
 * @param fsa the automaton to serialize.
 * @param os  the target stream.
 * @return Returns <code>os</code> for chaining.
 * @throws IOException on any write failure.
 * @see #withNumbers
 */
@Override
public <T extends OutputStream> T serialize(final FSA fsa, T os) throws IOException {
    /*
     * Calculate the most frequent labels and build indexed labels dictionary.
     */
    computeLabelsIndex(fsa);

    /*
     * Calculate the number of right-language sequences per state, if
     * serializing with numbers.
     */
    if (withNumbers) {
        this.numbers = FSAUtils.rightLanguageForAllStates(fsa);
    }

    /*
     * Linearize all the states, optimizing their layout.
     */
    IntArrayList linearized = linearize(fsa);

    /*
     * Emit the header: magic, version, flags.
     */
    FileUtils.writeInt(os, FSAHeader.FSA_MAGIC);
    os.write(CFSA2.VERSION);

    EnumSet<FSAFlags> fsaFlags = EnumSet.of(FLEXIBLE, STOPBIT, NEXTBIT);
    if (withNumbers) {
        fsaFlags.add(NUMBERS);
    }
    FileUtils.writeShort(os, FSAFlags.asShort(fsaFlags));

    /*
     * Emit labels index (most frequent labels, integrated with flag bytes).
     */
    os.write(labelsIndex.length);
    os.write(labelsIndex);

    /*
     * Emit the automaton. A final pass must not change any offsets.
     */
    int size = emitNodes(fsa, os, linearized);
    assert size == 0 : "Size changed in the final pass?";

    return os;
}
/**
 * Compute the set of labels to be integrated with the flags field
 * (restores raw {@code Comparator}/{@code TreeSet} to their generic forms).
 */
private void computeLabelsIndex(final FSA fsa) {
    // Count occurrences of each of the 256 possible label values.
    final int[] countByValue = new int[256];
    fsa.visitAllStates(new StateVisitor() {
        public boolean accept(int state) {
            for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc))
                countByValue[fsa.getArcLabel(arc) & 0xff]++;
            return true;
        }
    });

    // Order by descending frequency of counts and increasing label value.
    Comparator<IntIntHolder> comparator = new Comparator<IntIntHolder>() {
        public int compare(IntIntHolder o1, IntIntHolder o2) {
            int countDiff = o2.b - o1.b;
            if (countDiff == 0) {
                countDiff = o1.a - o2.a;
            }
            return countDiff;
        }
    };

    TreeSet<IntIntHolder> labelAndCount = new TreeSet<IntIntHolder>(comparator);
    for (int label = 0; label < countByValue.length; label++) {
        if (countByValue[label] > 0) {
            labelAndCount.add(new IntIntHolder(label, countByValue[label]));
        }
    }

    this.logger.startPart("Label distribution");
    for (IntIntHolder c : labelAndCount) {
        this.logger.log("0x" + Integer.toHexString(c.a), c.b);
    }
    this.logger.endPart();

    // Index 0 is reserved to mean "label stored explicitly"; fill the
    // remaining slots with the most frequent labels.
    labelsIndex = new byte[1 + Math.min(labelAndCount.size(), CFSA2.LABEL_INDEX_SIZE)];
    labelsInvIndex = new int[256];
    for (int i = labelsIndex.length - 1; i > 0 && !labelAndCount.isEmpty(); i--) {
        IntIntHolder p = labelAndCount.first();
        labelAndCount.remove(p);
        labelsInvIndex[p.a] = i;
        labelsIndex[i] = (byte) p.a;
    }
}
/**
 * Return the set of {@link FSAFlags} this serializer supports
 * (raw {@code Set} restored to its generic form).
 */
@Override
public Set<FSAFlags> getFlags() {
    return flags;
}
/**
 * Linearization of states: orders all states for serialization, probing a
 * few layouts to minimize the total size of v-coded goto fields.
 * (Raw {@code ArrayDeque} restored to {@code ArrayDeque<Integer>}.)
 *
 * @return Ordered list of state identifiers.
 */
private IntArrayList linearize(final FSA fsa) throws IOException {
    /*
     * Compute the states with most inlinks. These should be placed as close to the
     * start of the automaton, as possible so that v-coded addresses are tiny.
     */
    final IntIntOpenHashMap inlinkCount = computeInlinkCount(fsa);

    /*
     * An array of ordered states for serialization.
     */
    final IntArrayList linearized = new IntArrayList(0,
        new BoundedProportionalArraySizingStrategy(1000, 10000, 1.5f));

    /*
     * Determine which states should be linearized first (at fixed positions) so as to
     * minimize the place occupied by goto fields.
     */
    int maxStates = Integer.MAX_VALUE;
    int minInlinkCount = 2;
    ArrayDeque<Integer> statesQueue = computeFirstStates(inlinkCount, maxStates, minInlinkCount);
    IntArrayList states = new IntArrayList();
    while (!statesQueue.isEmpty())
        states.add(statesQueue.pop());

    /*
     * Compute initial addresses, without node rearrangements.
     */
    int serializedSize = linearizeAndCalculateOffsets(fsa, new IntArrayList(), linearized, offsets);

    /*
     * Probe for better node arrangements by selecting between [lower, upper]
     * nodes from the potential candidate nodes list.
     */
    IntArrayList sublist = new IntArrayList();
    sublist.buffer = states.buffer;
    sublist.elementsCount = states.elementsCount;

    /*
     * Probe the initial region a little bit, looking for optimal cut. It can't be binary search
     * because the result isn't monotonic.
     */
    logger.startPart("Compacting");
    logger.log("Initial output size", serializedSize);
    int cutAt = 0;
    for (int cut = Math.min(25, states.size()); cut <= Math.min(150, states.size()); cut += 25) {
        sublist.elementsCount = cut;
        int newSize = linearizeAndCalculateOffsets(fsa, sublist, linearized, offsets);
        logger.log("Moved " + sublist.size() + " states, output size", newSize);
        if (newSize >= serializedSize) {
            break;
        }
        cutAt = cut;
    }

    /*
     * Cut at the calculated point and repeat linearization.
     */
    sublist.elementsCount = cutAt;
    int size = linearizeAndCalculateOffsets(fsa, sublist, linearized, offsets);

    logger.log("Will move " + sublist.size() + " states, final size", size);
    logger.endPart();

    return linearized;
}
/**
 * Linearize all states, putting <code>states</code> in front of the automaton
 * and calculating stable state offsets.
 *
 * @return The serialized size once the offsets converge, or the last
 *         non-converged size (see the fix-point loop at the bottom).
 */
private int linearizeAndCalculateOffsets(FSA fsa, IntArrayList states,
    IntArrayList linearized, IntIntOpenHashMap offsets) throws IOException
{
    final BitSet visited = new BitSet();
    final IntStack nodes = new IntStack();
    linearized.clear();

    /*
     * Linearize states with most inlinks first.
     */
    for (int i = 0; i < states.size(); i++) {
        linearizeState(fsa, nodes, linearized, visited, states.get(i));
    }

    /*
     * Linearize the remaining states by chaining them one after another, in depth-order.
     */
    nodes.push(fsa.getRootNode());
    while (!nodes.isEmpty()) {
        final int node = nodes.pop();
        if (visited.get(node))
            continue;
        linearizeState(fsa, nodes, linearized, visited, node);
    }

    /*
     * Calculate new state offsets. This is iterative. We start with
     * maximum potential offsets and recalculate until converged.
     */
    int MAX_OFFSET = Integer.MAX_VALUE;
    for (IntCursor c : linearized) {
        offsets.put(c.value, MAX_OFFSET);
    }

    // emitNodes(fsa, null, ...) is a dry run: it only recomputes offsets and
    // returns 0 once no offset changed (fix point reached).
    int i, j = 0;
    while ((i = emitNodes(fsa, null, linearized)) > 0) {
        j = i;
    }
    return j;
}
/**
 * Add a single state to the linearized list and push its unvisited
 * successors for later processing.
 */
private void linearizeState(final FSA fsa,
    IntStack nodes,
    IntArrayList linearized,
    BitSet visited, int node)
{
    linearized.add(node);
    visited.set(node);
    // Push all non-terminal, not-yet-visited targets of this node's arcs.
    for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
        if (!fsa.isArcTerminal(arc)) {
            final int target = fsa.getEndNode(arc);
            if (!visited.get(target))
                nodes.push(target);
        }
    }
}
/**
 * Compute the set of states that should be linearized first to minimize
 * other states' goto field length. Returns states ordered by decreasing
 * inlink count.
 *
 * <p>Fixes: raw {@code Comparator}/{@code PriorityQueue}/{@code ArrayDeque}
 * restored to generic forms; the local probe holder renamed (it shadowed
 * the {@code byte[] scratch} field of this class).</p>
 */
private ArrayDeque<Integer> computeFirstStates(IntIntOpenHashMap inlinkCount,
    int maxStates,
    int minInlinkCount)
{
    // Order (count, state) pairs by count, then by state id; the queue head
    // is therefore the smallest qualifying entry.
    Comparator<IntIntHolder> comparator = new Comparator<IntIntHolder>() {
        public int compare(IntIntHolder o1, IntIntHolder o2) {
            int v = o1.a - o2.a;
            return v == 0 ? (o1.b - o2.b) : v;
        }
    };

    PriorityQueue<IntIntHolder> stateInlink = new PriorityQueue<IntIntHolder>(1, comparator);
    // Reusable probe; a new holder is only allocated for entries actually kept.
    IntIntHolder probe = new IntIntHolder();
    for (IntIntCursor c : inlinkCount) {
        if (c.value > minInlinkCount) {
            probe.a = c.value;
            probe.b = c.key;
            if (stateInlink.size() < maxStates || comparator.compare(probe, stateInlink.peek()) > 0) {
                stateInlink.add(new IntIntHolder(c.value, c.key));
                // Keep the queue bounded: evict the smallest entry.
                if (stateInlink.size() > maxStates) stateInlink.remove();
            }
        }
    }

    // Drain smallest-first, prepending, so the result is largest-first.
    ArrayDeque<Integer> states = new ArrayDeque<Integer>();
    while (!stateInlink.isEmpty()) {
        IntIntHolder i = stateInlink.remove();
        states.addFirst(i.b);
    }
    return states;
}
/**
 * Compute the in-link count for each state reachable from the root,
 * using an iterative depth-first traversal.
 */
private IntIntOpenHashMap computeInlinkCount(final FSA fsa) {
    IntIntOpenHashMap inlinkCount = new IntIntOpenHashMap();
    BitSet visited = new BitSet();
    IntStack nodes = new IntStack();
    nodes.push(fsa.getRootNode());

    while (!nodes.isEmpty()) {
        final int node = nodes.pop();
        if (visited.get(node))
            continue;

        visited.set(node);

        for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
            if (!fsa.isArcTerminal(arc)) {
                final int target = fsa.getEndNode(arc);
                // Count the inlink even for already-visited targets; only
                // traversal is skipped for those.
                inlinkCount.putOrAdd(target, 1, 1);
                if (!visited.get(target))
                    nodes.push(target);
            }
        }
    }

    return inlinkCount;
}
/**
 * Emit all nodes (or, when <code>os == null</code>, perform a dry run that
 * only updates state offsets).
 *
 * @return When writing, returns 0 (offsets are asserted stable). In a dry
 *         run, returns the total size if any offset changed, or 0 once the
 *         offsets have converged.
 */
private int emitNodes(FSA fsa, OutputStream os, IntArrayList linearized) throws IOException {
    int offset = 0;

    // Add epsilon state: a single arc labeled '^' leading to the root (or a
    // terminal arc if the automaton is empty).
    offset += emitNodeData(os, 0);
    if (fsa.getRootNode() != 0)
        offset += emitArc(os, BIT_LAST_ARC, (byte) '^', offsets.get(fsa.getRootNode()));
    else
        offset += emitArc(os, BIT_LAST_ARC, (byte) '^', 0);

    boolean offsetsChanged = false;
    final int max = linearized.size();
    for (IntCursor c : linearized) {
        final int state = c.value;
        final int nextState = c.index + 1 < max ? linearized.get(c.index + 1) : NO_STATE;

        if (os == null) {
            // Dry run: record the freshly computed offset, noting any change.
            offsetsChanged |= (offsets.get(state) != offset);
            offsets.put(state, offset);
        } else {
            // Real write: offsets must have converged by now.
            assert offsets.get(state) == offset : state + " " + offsets.get(state) + " " + offset;
        }

        offset += emitNodeData(os, withNumbers ? numbers.get(state) : 0);
        offset += emitNodeArcs(fsa, os, state, nextState);
    }

    return offsetsChanged ? offset : 0;
}
/**
 * Emit all arcs of a single node.
 *
 * @param nextState The state serialized immediately after this one; its
 *        arcs can use the TARGET_NEXT flag and omit the goto field.
 * @return Number of bytes emitted.
 */
private int emitNodeArcs(FSA fsa, OutputStream os,
    final int state, final int nextState) throws IOException {
    int offset = 0;
    for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) {
        int targetOffset;
        final int target;

        if (fsa.isArcTerminal(arc)) {
            // Terminal arcs encode a zero target offset.
            target = 0;
            targetOffset = 0;
        } else {
            target = fsa.getEndNode(arc);
            targetOffset = offsets.get(target);
        }

        int flags = 0;
        if (fsa.isArcFinal(arc)) {
            flags |= BIT_FINAL_ARC;
        }

        if (fsa.getNextArc(arc) == 0) {
            flags |= BIT_LAST_ARC;
        }

        // If the target is the very next serialized state, drop the goto
        // field entirely and set TARGET_NEXT instead.
        if (targetOffset != 0 && target == nextState) {
            flags |= BIT_TARGET_NEXT;
            targetOffset = 0;
        }

        offset += emitArc(os, flags, fsa.getArcLabel(arc), targetOffset);
    }

    return offset;
}
/**
 * Emit a single arc: flags byte (possibly with an embedded label index),
 * optional explicit label byte, and optional v-coded target offset.
 * When <code>os == null</code> nothing is written; only the arc's size in
 * bytes is computed and returned.
 */
private int emitArc(OutputStream os, int flags, byte label, int targetOffset)
    throws IOException
{
    int length = 0;

    int labelIndex = labelsInvIndex[label & 0xff];
    if (labelIndex > 0) {
        // Frequent label: fold its index into the flags byte.
        if (os != null) os.write(flags | labelIndex);
        length++;
    } else {
        // Infrequent label: flags byte followed by an explicit label byte.
        if (os != null) {
            os.write(flags);
            os.write(label);
        }
        length += 2;
    }

    // The goto field is omitted when the target directly follows this arc.
    if ((flags & BIT_TARGET_NEXT) == 0) {
        int len = CFSA2.writeVInt(scratch, 0, targetOffset);
        if (os != null) {
            os.write(scratch, 0, len);
        }
        length += len;
    }

    return length;
}
/**
 * Emit the per-node data (v-coded right-language count), but only when
 * serializing with numbers. Returns the number of bytes the node data
 * occupies (zero without numbers); with <code>os == null</code> the size
 * is computed without writing.
 */
private int emitNodeData(OutputStream os, int number) throws IOException {
    if (!withNumbers) {
        return 0;
    }
    final int size = CFSA2.writeVInt(scratch, 0, number);
    if (os != null) {
        os.write(scratch, 0, size);
    }
    return size;
}
/** CFSA2 stores no filler byte in its header, so this option is rejected. */
@Override
public CFSA2Serializer withFiller(byte filler) {
    throw new UnsupportedOperationException("CFSA2 does not support filler. Use .info file.");
}
/** CFSA2 stores no annotation separator in its header, so this option is rejected. */
@Override
public CFSA2Serializer withAnnotationSeparator(byte annotationSeparator) {
    throw new UnsupportedOperationException("CFSA2 does not support separator. Use .info file.");
}
/** Install a progress logger used during label analysis and compaction. */
@Override
public CFSA2Serializer withLogger(IMessageLogger logger) {
    this.logger = logger;
    return this;
}
}
morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/main/java/morfologik/fsa/ConstantArcSizeFSA.java 0000664 0000000 0000000 00000006717 12344607157 0033220 0 ustar 00root root 0000000 0000000 package morfologik.fsa;
import java.util.Collections;
import java.util.Set;
/**
* An FSA with constant-size arc representation produced directly
* by {@link FSABuilder}.
*
* @see FSABuilder
*/
public final class ConstantArcSizeFSA extends FSA {
    /** Size of the target address field (constant for the builder). */
    public final static int TARGET_ADDRESS_SIZE = 4;

    /** Size of the flags field (constant for the builder). */
    public final static int FLAGS_SIZE = 1;

    /** Size of the label field (constant for the builder). */
    public final static int LABEL_SIZE = 1;

    /**
     * Size of a single arc structure.
     */
    public final static int ARC_SIZE = FLAGS_SIZE + LABEL_SIZE + TARGET_ADDRESS_SIZE;

    /** Offset of the flags field inside an arc. */
    public final static int FLAGS_OFFSET = 0;

    /** Offset of the label field inside an arc. */
    public final static int LABEL_OFFSET = FLAGS_SIZE;

    /** Offset of the address field inside an arc. */
    public final static int ADDRESS_OFFSET = LABEL_OFFSET + LABEL_SIZE;

    /** A dummy address of the terminal state. */
    final static int TERMINAL_STATE = 0;

    /**
     * An arc flag indicating the target node of an arc corresponds to a final
     * state.
     */
    public final static int BIT_ARC_FINAL = 1 << 1;

    /** An arc flag indicating the arc is last within its state. */
    public final static int BIT_ARC_LAST = 1 << 0;

    /**
     * An epsilon state. The first and only arc of this state points either
     * to the root or to the terminal state, indicating an empty automaton.
     */
    private final int epsilon;

    /**
     * FSA data, serialized as a byte array.
     */
    private final byte[] data;

    /**
     * @param data FSA data. There must be no trailing bytes after the last state.
     */
    ConstantArcSizeFSA(byte[] data, int epsilon) {
        assert epsilon == 0 : "Epsilon is not zero?";

        this.epsilon = epsilon;
        this.data = data;
    }

    @Override
    public int getRootNode() {
        return getEndNode(getFirstArc(epsilon));
    }

    @Override
    public int getFirstArc(int node) {
        // Arcs are laid out starting at the node's own offset.
        return node;
    }

    @Override
    public int getArc(int node, byte label) {
        // Linear scan through the node's fixed-size arcs.
        for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) {
            if (getArcLabel(arc) == label)
                return arc;
        }
        return 0;
    }

    @Override
    public int getNextArc(int arc) {
        if (isArcLast(arc))
            return 0;
        return arc + ARC_SIZE;
    }

    @Override
    public byte getArcLabel(int arc) {
        return data[arc + LABEL_OFFSET];
    }

    /**
     * Reads the target state address of an arc (big-endian, 4 bytes).
     */
    private int getArcTarget(int arc) {
        arc += ADDRESS_OFFSET;
        return (data[arc]) << 24 |
               (data[arc + 1] & 0xff) << 16 |
               (data[arc + 2] & 0xff) << 8 |
               (data[arc + 3] & 0xff);
    }

    @Override
    public boolean isArcFinal(int arc) {
        return (data[arc + FLAGS_OFFSET] & BIT_ARC_FINAL) != 0;
    }

    @Override
    public boolean isArcTerminal(int arc) {
        return getArcTarget(arc) == 0;
    }

    private boolean isArcLast(int arc) {
        return (data[arc + FLAGS_OFFSET] & BIT_ARC_LAST) != 0;
    }

    @Override
    public int getEndNode(int arc) {
        return getArcTarget(arc);
    }

    @Override
    public Set<FSAFlags> getFlags() {
        // This intermediate representation advertises no optional features.
        return Collections.<FSAFlags> emptySet();
    }
}
morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/main/java/morfologik/fsa/FSA.java 0000664 0000000 0000000 00000023275 12344607157 0030223 0 ustar 00root root 0000000 0000000 package morfologik.fsa;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.util.BitSet;
import java.util.Collections;
import java.util.Iterator;
import java.util.Set;
/**
 * This is a top abstract class for handling finite state automata. These
 * automata are arc-based, a design described in Jan Daciuk's <i>Incremental
 * Construction of Finite-State Automata and Transducers, and Their Use in the
 * Natural Language Processing</i> (PhD thesis, Technical University of Gdansk).
 *
 * <p>
 * Concrete subclasses (implementations) provide varying tradeoffs and features:
 * traversal speed vs. memory size, for example.
 * </p>
 *
 * <p>Fix: generic type parameters stripped from the declarations
 * ({@code Iterable<ByteBuffer>}, {@code Set<FSAFlags>}, the bounded type
 * variables on visitors and {@code read}) have been restored.</p>
 *
 * @see FSABuilder
 */
public abstract class FSA implements Iterable<ByteBuffer> {
    /**
     * @return Returns the identifier of the root node of this automaton.
     *         Returns 0 if the start node is also the end node (the automaton
     *         is empty).
     */
    public abstract int getRootNode();

    /**
     * @return Returns the identifier of the first arc leaving <code>node</code>
     *         or 0 if the node has no outgoing arcs.
     */
    public abstract int getFirstArc(int node);

    /**
     * @return Returns the identifier of the next arc after <code>arc</code> and
     *         leaving <code>node</code>. Zero is returned if no more arcs are
     *         available for the node.
     */
    public abstract int getNextArc(int arc);

    /**
     * @return Returns the identifier of an arc leaving <code>node</code> and
     *         labeled with <code>label</code>. An identifier equal to 0 means
     *         the node has no outgoing arc labeled <code>label</code>.
     */
    public abstract int getArc(int node, byte label);

    /**
     * Return the label associated with a given <code>arc</code>.
     */
    public abstract byte getArcLabel(int arc);

    /**
     * Returns <code>true</code> if the destination node at the end of this
     * <code>arc</code> corresponds to an input sequence created when building
     * this automaton.
     */
    public abstract boolean isArcFinal(int arc);

    /**
     * Returns <code>true</code> if this <code>arc</code> does not have a
     * terminating node ({@link #getEndNode(int)} will throw an
     * exception). Implies {@link #isArcFinal(int)}.
     */
    public abstract boolean isArcTerminal(int arc);

    /**
     * Return the end node pointed to by a given <code>arc</code>. Terminal arcs
     * (those that point to a terminal state) have no end node representation
     * and throw a runtime exception.
     */
    public abstract int getEndNode(int arc);

    /**
     * Returns a set of flags for this FSA instance.
     */
    public abstract Set<FSAFlags> getFlags();

    /**
     * Calculates the number of arcs of a given node. Unless really required,
     * use the following idiom for looping through all arcs:
     * <pre>
     * for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
     * }
     * </pre>
     */
    public int getArcCount(int node) {
        int count = 0;
        for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) {
            count++;
        }
        return count;
    }

    /**
     * @return Returns the number of sequences reachable from the given state if
     *         the automaton was compiled with {@link FSAFlags#NUMBERS}. The size of
     *         the right language of the state, in other words.
     *
     * @throws UnsupportedOperationException If the automaton was not compiled with
     *         {@link FSAFlags#NUMBERS}. The value can then be computed by manual count
     *         of {@link #getSequences(int)}.
     */
    public int getRightLanguageCount(int node) {
        throw new UnsupportedOperationException("Automaton not compiled with " + FSAFlags.NUMBERS);
    }

    /**
     * Returns an iterator over all binary sequences starting at the given FSA
     * state (node) and ending in final nodes. This corresponds to a set of
     * suffixes of a given prefix from all sequences stored in the automaton.
     *
     * <p>
     * The returned iterator is a {@link ByteBuffer} whose contents changes on
     * each call to {@link Iterator#next()}. To keep the contents between calls
     * to {@link Iterator#next()}, one must copy the buffer to some other
     * location.
     * </p>
     *
     * <p>
     * <b>Important.</b> It is guaranteed that the returned byte buffer is
     * backed by a byte array and that the content of the byte buffer starts at
     * the array's index 0.
     * </p>
     *
     * @see Iterable
     */
    public Iterable<ByteBuffer> getSequences(final int node) {
        if (node == 0) {
            return Collections.<ByteBuffer> emptyList();
        }

        return new Iterable<ByteBuffer>() {
            public Iterator<ByteBuffer> iterator() {
                return new FSAFinalStatesIterator(FSA.this, node);
            }
        };
    }

    /**
     * An alias of calling {@link #iterator} directly ({@link FSA} is also
     * {@link Iterable}).
     */
    public final Iterable<ByteBuffer> getSequences() {
        return getSequences(getRootNode());
    }

    /**
     * Returns an iterator over all binary sequences starting from the initial
     * FSA state (node) and ending in final nodes. The returned iterator is a
     * {@link ByteBuffer} whose contents changes on each call to
     * {@link Iterator#next()}. To keep the contents between calls to
     * {@link Iterator#next()}, one must copy the buffer to some other location.
     *
     * <p>
     * <b>Important.</b> It is guaranteed that the returned byte buffer is
     * backed by a byte array and that the content of the byte buffer starts at
     * the array's index 0.
     * </p>
     *
     * @see Iterable
     */
    public final Iterator<ByteBuffer> iterator() {
        return getSequences().iterator();
    }

    /**
     * Visit all states. The order of visiting is undefined. This method may be faster
     * than traversing the automaton in post or preorder since it can scan states
     * linearly. Returning false from {@link StateVisitor#accept(int)}
     * immediately terminates the traversal.
     */
    public <T extends StateVisitor> T visitAllStates(T v) {
        return visitInPostOrder(v);
    }

    /**
     * Same as {@link #visitInPostOrder(StateVisitor, int)},
     * starting from root automaton node.
     */
    public <T extends StateVisitor> T visitInPostOrder(T v) {
        return visitInPostOrder(v, getRootNode());
    }

    /**
     * Visits all states reachable from <code>node</code> in postorder.
     * Returning false from {@link StateVisitor#accept(int)}
     * immediately terminates the traversal.
     */
    public <T extends StateVisitor> T visitInPostOrder(T v, int node) {
        visitInPostOrder(v, node, new BitSet());
        return v;
    }

    /** Private recursion. */
    private boolean visitInPostOrder(StateVisitor v, int node, BitSet visited) {
        if (visited.get(node))
            return true;
        visited.set(node);

        for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) {
            if (!isArcTerminal(arc)) {
                if (!visitInPostOrder(v, getEndNode(arc), visited))
                    return false;
            }
        }

        return v.accept(node);
    }

    /**
     * Same as {@link #visitInPreOrder(StateVisitor, int)}, starting from root automaton node.
     */
    public <T extends StateVisitor> T visitInPreOrder(T v) {
        return visitInPreOrder(v, getRootNode());
    }

    /**
     * Visits all states in preorder. Returning false from {@link StateVisitor#accept(int)}
     * skips traversal of all sub-states of a given state.
     */
    public <T extends StateVisitor> T visitInPreOrder(T v, int node) {
        visitInPreOrder(v, node, new BitSet());
        return v;
    }

    /** Private recursion. */
    private void visitInPreOrder(StateVisitor v, int node, BitSet visited) {
        if (visited.get(node))
            return;
        visited.set(node);

        if (v.accept(node)) {
            for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) {
                if (!isArcTerminal(arc)) {
                    visitInPreOrder(v, getEndNode(arc), visited);
                }
            }
        }
    }

    /**
     * A factory for reading automata in any of the supported versions. If
     * possible, explicit constructors should be used.
     *
     * @see FSA5#FSA5(InputStream)
     */
    @SuppressWarnings("unchecked")
    public static <T extends FSA> T read(InputStream in) throws IOException {
        if (!in.markSupported()) {
            in = new BufferedInputStream(in, Math.max(FSAHeader.MAX_HEADER_LENGTH + 1, 1024));
        }

        in.mark(FSAHeader.MAX_HEADER_LENGTH);
        FSAHeader header = FSAHeader.read(in);
        in.reset();

        if (header.version == FSA5.VERSION)
            return (T) new FSA5(in);

        if (header.version == CFSA.VERSION)
            return (T) new CFSA(in);

        if (header.version == CFSA2.VERSION)
            return (T) new CFSA2(in);

        throw new IOException("Unsupported automaton version: "
            + header.version);
    }

    /**
     * Reads an automaton from a file, closing the stream afterwards.
     */
    public static FSA read(File fsa) throws IOException {
        InputStream is = new BufferedInputStream(new FileInputStream(fsa));
        try {
            return read(is);
        } finally {
            is.close();
        }
    }
}
morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/main/java/morfologik/fsa/FSA5.java 0000664 0000000 0000000 00000021145 12344607157 0030302 0 ustar 00root root 0000000 0000000 package morfologik.fsa;
import static morfologik.fsa.FSAFlags.FLEXIBLE;
import static morfologik.fsa.FSAFlags.NEXTBIT;
import static morfologik.fsa.FSAFlags.NUMBERS;
import static morfologik.fsa.FSAFlags.STOPBIT;
import static morfologik.util.FileUtils.readFully;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.EnumSet;
import java.util.Set;
/**
* FSA binary format implementation for version 5.
*
*
* Version 5 indicates the dictionary was built with these flags:
* {@link FSAFlags#FLEXIBLE}, {@link FSAFlags#STOPBIT} and
* {@link FSAFlags#NEXTBIT}. The internal representation of the FSA must
* therefore follow this description (please note this format describes only a
* single transition (arc), not the entire dictionary file).
*
*
* ---- this node header present only if automaton was compiled with NUMBERS option.
* Byte
* +-+-+-+-+-+-+-+-+\
* 0 | | | | | | | | | \ LSB
* +-+-+-+-+-+-+-+-+ +
* 1 | | | | | | | | | | number of strings recognized
* +-+-+-+-+-+-+-+-+ +----- by the automaton starting
* : : : : : : : : : | from this node.
* +-+-+-+-+-+-+-+-+ +
* ctl-1 | | | | | | | | | / MSB
* +-+-+-+-+-+-+-+-+/
*
* ---- remaining part of the node
*
* Byte
* +-+-+-+-+-+-+-+-+\
* 0 | | | | | | | | | +------ label
* +-+-+-+-+-+-+-+-+/
*
* +------------- node pointed to is next
* | +----------- the last arc of the node
* | | +--------- the arc is final
* | | |
* +-----------+
* | | | | |
* ___+___ | | | |
* / \ | | | |
* MSB LSB |
* 7 6 5 4 3 2 1 0 |
* +-+-+-+-+-+-+-+-+ |
* 1 | | | | | | | | | \ \
* +-+-+-+-+-+-+-+-+ \ \ LSB
* +-+-+-+-+-+-+-+-+ +
* 2 | | | | | | | | | |
* +-+-+-+-+-+-+-+-+ |
* 3 | | | | | | | | | +----- target node address (in bytes)
* +-+-+-+-+-+-+-+-+ | (not present except for the byte
* : : : : : : : : : | with flags if the node pointed to
* +-+-+-+-+-+-+-+-+ + is next)
* gtl | | | | | | | | | / MSB
* +-+-+-+-+-+-+-+-+ /
* gtl+1 (gtl = gotoLength)
*
*/
public final class FSA5 extends FSA {
/**
* Default filler byte.
*/
public final static byte DEFAULT_FILLER = '_';
/**
* Default annotation byte.
*/
public final static byte DEFAULT_ANNOTATION = '+';
/**
* Automaton version as in the file header.
*/
public static final byte VERSION = 5;
/**
* Bit indicating that an arc corresponds to the last character of a
* sequence available when building the automaton.
*/
public static final int BIT_FINAL_ARC = 1 << 0;
/**
* Bit indicating that an arc is the last one of the node's list and the
* following one belongs to another node.
*/
public static final int BIT_LAST_ARC = 1 << 1;
/**
* Bit indicating that the target node of this arc follows it in the
* compressed automaton structure (no goto field).
*/
public static final int BIT_TARGET_NEXT = 1 << 2;
/**
* An offset in the arc structure, where the address and flags field begins.
* In version 5 of FSA automata, this value is constant (1, skip label).
*/
public final static int ADDRESS_OFFSET = 1;
/**
* An array of bytes with the internal representation of the automaton.
* Please see the documentation of this class for more information on how
* this structure is organized.
*/
public final byte[] arcs;
/**
* The length of the node header structure (if the automaton was compiled with
* NUMBERS
option). Otherwise zero.
*/
public final int nodeDataLength;
/**
* Flags for this automaton version.
*/
private final Set flags;
/**
* Number of bytes each address takes in full, expanded form (goto length).
*/
public final int gtl;
/** Filler character. */
public final byte filler;
/** Annotation character. */
public final byte annotation;
/**
 * Read and wrap a binary automaton in FSA version 5.
 */
public FSA5(InputStream fsaStream) throws IOException {
    // Read the header first.
    final FSAHeader header = FSAHeader.read(fsaStream);

    // Ensure we have version 5.
    if (header.version != VERSION) {
        throw new IOException("This class can read FSA version 5 only: " + header.version);
    }

    /*
     * Determine if the automaton was compiled with NUMBERS. If so, modify
     * ctl and goto fields accordingly.
     */
    flags = EnumSet.of(FLEXIBLE, STOPBIT, NEXTBIT);
    if ((header.gtl & 0xf0) != 0) {
        flags.add(NUMBERS);
    }

    // High nibble of gtl: per-node data length (NUMBERS); low nibble: goto length.
    this.nodeDataLength = (header.gtl >>> 4) & 0x0f;
    this.gtl = header.gtl & 0x0f;

    this.filler = header.filler;
    this.annotation = header.annotation;

    arcs = readFully(fsaStream);
}
/**
 * Returns the start node of this automaton.
 */
@Override
public int getRootNode() {
    // Skip dummy node marking terminating state.
    final int epsilonNode = skipArc(getFirstArc(0));

    // And follow the epsilon node's first (and only) arc.
    return getDestinationNodeOffset(getFirstArc(epsilonNode));
}
/**
 * {@inheritDoc}
 */
@Override
public final int getFirstArc(int node) {
    // Arcs start right past the (possibly zero-length) per-node NUMBERS header.
    return node + nodeDataLength;
}
/**
 * {@inheritDoc}
 */
@Override
public final int getNextArc(int arc) {
    // No successor after the node's last arc; otherwise advance past this arc.
    return isArcLast(arc) ? 0 : skipArc(arc);
}
/**
 * {@inheritDoc}
 */
@Override
public int getArc(int node, byte label) {
    // Linear scan over the node's arc list.
    for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) {
        if (getArcLabel(arc) == label)
            return arc;
    }

    // An arc labeled with "label" not found.
    return 0;
}
/**
 * {@inheritDoc}
 */
@Override
public int getEndNode(int arc) {
    // Terminal arcs decode to offset 0 and must not be followed.
    final int nodeOffset = getDestinationNodeOffset(arc);
    assert nodeOffset != 0 : "No target node for terminal arcs.";
    return nodeOffset;
}
/**
 * {@inheritDoc}
 */
@Override
public byte getArcLabel(int arc) {
    // In FSA5 the label is the first byte of the arc structure.
    return arcs[arc];
}
/**
 * {@inheritDoc}
 */
@Override
public boolean isArcFinal(int arc) {
    // Flags live in the byte at ADDRESS_OFFSET (after the label).
    return (arcs[arc + ADDRESS_OFFSET] & BIT_FINAL_ARC) != 0;
}
/**
 * {@inheritDoc}
 */
@Override
public boolean isArcTerminal(int arc) {
    // A destination offset of zero denotes the (virtual) terminal state.
    return (0 == getDestinationNodeOffset(arc));
}
/**
 * Returns the number encoded at the given node. The number equals the count
 * of the set of suffixes reachable from <code>node</code> (called its right
 * language).
 */
@Override
public int getRightLanguageCount(int node) {
    // Only valid when compiled with NUMBERS; the count is stored in
    // nodeDataLength bytes preceding the node's arcs.
    assert getFlags().contains(FSAFlags.NUMBERS) : "This FSA was not compiled with NUMBERS.";
    return decodeFromBytes(arcs, node, nodeDataLength);
}
/**
 * {@inheritDoc}
 *
 * <p>For this automaton version, an additional {@link FSAFlags#NUMBERS} flag
 * may be set to indicate the automaton contains extra fields for each node.</p>
 */
@Override
public Set<FSAFlags> getFlags() {
    // Expose the internal flag set read-only (raw Set restored to generic form).
    return Collections.unmodifiableSet(flags);
}
/**
 * Returns <code>true</code> if this arc has the <code>LAST</code> bit set,
 * i.e. it is the last arc of its node.
 *
 * @see #BIT_LAST_ARC
 */
public boolean isArcLast(int arc) {
    return (arcs[arc + ADDRESS_OFFSET] & BIT_LAST_ARC) != 0;
}
/**
 * Returns <code>true</code> if this arc's target node directly follows it
 * in the serialized structure (no explicit goto field).
 *
 * @see #BIT_TARGET_NEXT
 */
public boolean isNextSet(int arc) {
    return (arcs[arc + ADDRESS_OFFSET] & BIT_TARGET_NEXT) != 0;
}
/**
 * Returns an n-byte integer encoded in byte-packed representation
 * (the byte at the lowest offset ends up in the least significant bits).
 */
static final int decodeFromBytes(
    final byte[] arcs, final int start, final int n)
{
    int r = 0;
    // Fold in the most significant byte first so lower offsets land in
    // lower-order bit positions of the result.
    for (int i = n - 1; i >= 0; --i) {
        r = (r << 8) | (arcs[start + i] & 0xff);
    }
    return r;
}
/**
 * Returns the address of the node pointed to by this arc.
 */
final int getDestinationNodeOffset(int arc) {
    if (isNextSet(arc)) {
        /* The destination node follows this arc in the array. */
        return skipArc(arc);
    } else {
        /*
         * The destination node address has to be extracted from the arc's
         * goto field. The low 3 bits of that field hold the arc flags, so
         * shift them out to obtain the byte offset.
         */
        return decodeFromBytes(arcs, arc + ADDRESS_OFFSET, gtl) >>> 3;
    }
}
/**
* Read the arc's layout and skip as many bytes, as needed.
*/
private int skipArc(int offset) {
return offset + (isNextSet(offset)
? 1 + 1 /* label + flags */
: 1 + gtl /* label + flags/address */);
}
} morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/main/java/morfologik/fsa/FSA5Serializer.java 0000664 0000000 0000000 00000023022 12344607157 0032330 0 ustar 00root root 0000000 0000000 package morfologik.fsa;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.util.*;
import com.carrotsearch.hppc.*;
import com.carrotsearch.hppc.BitSet;
import static morfologik.fsa.FSAFlags.*;
/**
 * Serializes in-memory {@link FSA} graphs to a binary format compatible with
 * Jan Daciuk's <code>fsa</code> package's <code>FSA5</code> format.
 *
 * <p>
 * It is possible to serialize the automaton with numbers required for perfect
 * hashing. See {@link #withNumbers()} method.
 * </p>
 *
 * @see FSA5
 * @see FSA#read(java.io.InputStream)
 */
public final class FSA5Serializer implements FSASerializer {
    /**
     * Maximum number of bytes for a serialized arc.
     */
    private final static int MAX_ARC_SIZE = 1 + 5;

    /**
     * Maximum number of bytes for per-node data.
     */
    private final static int MAX_NODE_DATA_SIZE = 16;

    /**
     * Number of bytes for the arc's flags header (arc representation without
     * the goto address).
     */
    private final static int SIZEOF_FLAGS = 1;

    /**
     * Supported flags.
     */
    private final static EnumSet<FSAFlags> flags = EnumSet.of(NUMBERS, SEPARATORS, FLEXIBLE, STOPBIT, NEXTBIT);

    /**
     * @see FSA5#filler
     */
    public byte fillerByte = FSA5.DEFAULT_FILLER;

    /**
     * @see FSA5#annotation
     */
    public byte annotationByte = FSA5.DEFAULT_ANNOTATION;

    /**
     * <code>true</code> if we should serialize with numbers.
     *
     * @see #withNumbers()
     */
    private boolean withNumbers;

    /**
     * A hash map of [state, offset] pairs.
     */
    private IntIntOpenHashMap offsets = new IntIntOpenHashMap();

    /**
     * A hash map of [state, right-language-count] pairs.
     */
    private IntIntOpenHashMap numbers = new IntIntOpenHashMap();

    /**
     * Serialize the automaton with the number of right-language sequences in
     * each node. This is required to implement perfect hashing. The numbering
     * also preserves the order of input sequences.
     *
     * @return Returns the same object for easier call chaining.
     */
    public FSA5Serializer withNumbers() {
        withNumbers = true;
        return this;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public FSA5Serializer withFiller(byte filler) {
        this.fillerByte = filler;
        return this;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public FSA5Serializer withAnnotationSeparator(byte annotationSeparator) {
        this.annotationByte = annotationSeparator;
        return this;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public FSASerializer withLogger(IMessageLogger logger) {
        // This serializer emits no messages; the logger is intentionally ignored.
        return this;
    }

    /**
     * Serialize the automaton to an output stream in <code>FSA5</code> format.
     *
     * @param fsa The automaton to serialize.
     * @param os The output stream to write to.
     * @return Returns <code>os</code> for chaining.
     * @see #withNumbers
     */
    @Override
    public <T extends OutputStream> T serialize(final FSA fsa, T os)
            throws IOException {
        // Prepare space for arc offsets and linearize all the states.
        int[] linearized = linearize(fsa);

        /*
         * Calculate the number of bytes required for the node data, if
         * serializing with numbers. The root carries the largest count.
         */
        int nodeDataLength = 0;
        if (withNumbers) {
            this.numbers = FSAUtils.rightLanguageForAllStates(fsa);
            int maxNumber = numbers.get(fsa.getRootNode());
            while (maxNumber > 0) {
                nodeDataLength++;
                maxNumber >>>= 8;
            }
        }

        // Calculate minimal goto length: grow gtl until two consecutive passes
        // agree (offsets computed with gtl fit in gtl bytes).
        int gtl = 1;
        while (true) {
            // First pass: calculate offsets of states.
            if (!emitArcs(fsa, null, linearized, gtl, nodeDataLength)) {
                gtl++;
                continue;
            }

            // Second pass: check if goto overflows anywhere.
            if (emitArcs(fsa, null, linearized, gtl, nodeDataLength))
                break;
            gtl++;
        }

        /*
         * Emit the header: magic, version, filler, annotation and a packed
         * byte with the node-data length and goto length.
         */
        os.write(new byte[] { '\\', 'f', 's', 'a' });
        os.write(FSA5.VERSION);
        os.write(fillerByte);
        os.write(annotationByte);
        os.write((nodeDataLength << 4) | gtl);

        /*
         * Emit the automaton.
         */
        boolean gtlUnchanged = emitArcs(fsa, os, linearized, gtl, nodeDataLength);
        assert gtlUnchanged : "gtl changed in the final pass.";

        return os;
    }

    /**
     * Return supported flags.
     */
    @Override
    public Set<FSAFlags> getFlags() {
        return flags;
    }

    /**
     * Linearization of states: an iterative DFS collecting every reachable
     * node exactly once, in emission order.
     */
    private int[] linearize(final FSA fsa) {
        int[] linearized = new int[0];
        int last = 0;

        BitSet visited = new BitSet();
        IntStack nodes = new IntStack();
        nodes.push(fsa.getRootNode());

        while (!nodes.isEmpty()) {
            final int node = nodes.pop();
            if (visited.get(node))
                continue;

            if (last >= linearized.length) {
                linearized = Arrays.copyOf(linearized, linearized.length + 100000);
            }

            visited.set(node);
            linearized[last++] = node;

            for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
                if (!fsa.isArcTerminal(arc)) {
                    int target = fsa.getEndNode(arc);
                    if (!visited.get(target))
                        nodes.push(target);
                }
            }
        }

        return Arrays.copyOf(linearized, last);
    }

    /**
     * Update arc offsets assuming the given goto length. When <code>os</code>
     * is null only the offsets are computed; otherwise the arcs are written.
     *
     * @return Returns <code>false</code> when <code>gtl</code> is too small to
     *         encode some goto offset; the caller should retry with a larger one.
     */
    private boolean emitArcs(FSA fsa, OutputStream os, int[] linearized,
            int gtl, int nodeDataLength) throws IOException {
        final ByteBuffer bb = ByteBuffer.allocate(Math.max(MAX_NODE_DATA_SIZE,
            MAX_ARC_SIZE));

        int offset = 0;

        // Add dummy terminal state.
        offset += emitNodeData(bb, os, nodeDataLength, 0);
        offset += emitArc(bb, os, gtl, 0, (byte) 0, 0);

        // Add epsilon state.
        offset += emitNodeData(bb, os, nodeDataLength, 0);
        if (fsa.getRootNode() != 0)
            offset += emitArc(bb, os, gtl, FSA5.BIT_LAST_ARC | FSA5.BIT_TARGET_NEXT, (byte) '^', 0);
        else
            offset += emitArc(bb, os, gtl, FSA5.BIT_LAST_ARC, (byte) '^', 0);

        int maxStates = linearized.length;
        for (int j = 0; j < maxStates; j++) {
            final int s = linearized[j];

            if (os == null) {
                offsets.put(s, offset);
            } else {
                assert offsets.get(s) == offset : s + " " + offsets.get(s) + " " + offset;
            }

            offset += emitNodeData(bb, os, nodeDataLength, withNumbers ? numbers.get(s) : 0);

            for (int arc = fsa.getFirstArc(s); arc != 0; arc = fsa.getNextArc(arc)) {
                int targetOffset;
                final int target;
                if (fsa.isArcTerminal(arc)) {
                    targetOffset = 0;
                    target = 0;
                } else {
                    target = fsa.getEndNode(arc);
                    targetOffset = offsets.get(target);
                }

                int flags = 0;
                if (fsa.isArcFinal(arc)) {
                    flags |= FSA5.BIT_FINAL_ARC;
                }

                if (fsa.getNextArc(arc) == 0) {
                    flags |= FSA5.BIT_LAST_ARC;

                    // If the target is the next linearized state, use the NEXT
                    // bit and drop the explicit goto address.
                    if (j + 1 < maxStates && target == linearized[j + 1]
                            && targetOffset != 0) {
                        flags |= FSA5.BIT_TARGET_NEXT;
                        targetOffset = 0;
                    }
                }

                int bytes = emitArc(bb, os, gtl, flags, fsa.getArcLabel(arc), targetOffset);
                if (bytes < 0)
                    // gtl too small. interrupt eagerly.
                    return false;

                offset += bytes;
            }
        }

        return true;
    }

    /**
     * Emit a single arc: a label byte followed by the flags/goto field.
     *
     * @return The number of bytes written, or -1 if <code>gtl</code> is too
     *         small to hold the target offset.
     */
    private int emitArc(ByteBuffer bb, OutputStream os, int gtl, int flags, byte label, int targetOffset)
            throws IOException
    {
        // A NEXT-bit arc stores only the flags byte; otherwise gtl bytes.
        int arcBytes = (flags & FSA5.BIT_TARGET_NEXT) != 0 ? SIZEOF_FLAGS : gtl;

        // The lowest 3 bits of the goto field carry the arc flags; the target
        // address is packed above them.
        flags |= (targetOffset << 3);
        bb.put(label);
        for (int b = 0; b < arcBytes; b++) {
            bb.put((byte) flags);
            flags >>>= 8;
        }

        if (flags != 0) {
            // gtl too small. interrupt eagerly.
            return -1;
        }

        bb.flip();
        int bytes = bb.remaining();
        if (os != null) {
            os.write(bb.array(), bb.position(), bb.remaining());
        }
        bb.clear();

        return bytes;
    }

    /**
     * Emit the per-node data (right-language count), least significant byte
     * first, when node data is in use.
     *
     * @return The number of bytes the node data occupies (possibly zero).
     */
    private int emitNodeData(ByteBuffer bb, OutputStream os,
            int nodeDataLength, int number) throws IOException {
        if (nodeDataLength > 0 && os != null) {
            for (int i = 0; i < nodeDataLength; i++) {
                bb.put((byte) number);
                number >>>= 8;
            }

            bb.flip();
            os.write(bb.array(), bb.position(), bb.remaining());
            bb.clear();
        }

        return nodeDataLength;
    }
}
morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/main/java/morfologik/fsa/FSABuilder.java 0000664 0000000 0000000 00000036001 12344607157 0031521 0 ustar 00root root 0000000 0000000 package morfologik.fsa;
import java.util.*;
import morfologik.util.Arrays;
import static morfologik.fsa.ConstantArcSizeFSA.*;
/**
 * Fast, memory-conservative finite state automaton builder, returning a
 * byte-serialized {@link ConstantArcSizeFSA} (a tradeoff between construction
 * speed and memory consumption).
 */
public final class FSABuilder {
    /**
     * Debug and information constants.
     *
     * @see FSABuilder#getInfo()
     */
    public enum InfoEntry {
        SERIALIZATION_BUFFER_SIZE("Serialization buffer size"),
        SERIALIZATION_BUFFER_REALLOCATIONS("Serialization buffer reallocs"),
        CONSTANT_ARC_AUTOMATON_SIZE("Constant arc FSA size"),
        MAX_ACTIVE_PATH_LENGTH("Max active path"),
        STATE_REGISTRY_TABLE_SLOTS("Registry hash slots"),
        STATE_REGISTRY_SIZE("Registry hash entries"),
        ESTIMATED_MEMORY_CONSUMPTION_MB("Estimated mem consumption (MB)");

        private final String stringified;

        InfoEntry(String stringified) {
            this.stringified = stringified;
        }

        @Override
        public String toString() {
            return stringified;
        }
    }

    /** A megabyte. */
    private final static int MB = 1024 * 1024;

    /**
     * Internal serialized FSA buffer expand ratio.
     */
    private final static int BUFFER_GROWTH_SIZE = 5 * MB;

    /**
     * Maximum number of labels from a single state.
     */
    private final static int MAX_LABELS = 256;

    /**
     * Comparator comparing full byte arrays consistently with
     * {@link #compare(byte[], int, int, byte[], int, int)}.
     */
    public static final Comparator<byte[]> LEXICAL_ORDERING = new Comparator<byte[]>() {
        public int compare(byte[] o1, byte[] o2) {
            return FSABuilder.compare(o1, 0, o1.length, o2, 0, o2.length);
        }
    };

    /**
     * Internal serialized FSA buffer expand ratio.
     */
    private final int bufferGrowthSize;

    /**
     * Holds serialized and mutable states. Each state is a sequential list of
     * arcs, the last arc is marked with {@link #BIT_ARC_LAST}.
     */
    private byte[] serialized = new byte[0];

    /**
     * Number of bytes already taken in {@link #serialized}. Start from 1 to
     * keep 0 a sentinel value (for the hash set and final state).
     */
    private int size;

    /**
     * States on the "active path" (still mutable). Values are addresses of each
     * state's first arc.
     */
    private int[] activePath = new int[0];

    /**
     * Current length of the active path.
     */
    private int activePathLen;

    /**
     * The next offset at which an arc will be added to the given state on
     * {@link #activePath}.
     */
    private int[] nextArcOffset = new int[0];

    /**
     * Root state. If negative, the automaton has been built already and cannot be extended.
     */
    private int root;

    /**
     * An epsilon state. The first and only arc of this state points either
     * to the root or to the terminal state, indicating an empty automaton.
     */
    private int epsilon;

    /**
     * Hash set of state addresses in {@link #serialized}, hashed by
     * {@link #hash(int, int)}. Zero reserved for an unoccupied slot.
     */
    private int[] hashSet = new int[2];

    /**
     * Number of entries currently stored in {@link #hashSet}.
     */
    private int hashSize = 0;

    /**
     * Previous sequence added to the automaton in {@link #add(byte[], int, int)}.
     * Used in assertions only.
     */
    private byte[] previous;

    /**
     * Information about the automaton and its compilation.
     */
    private TreeMap<InfoEntry, Object> info;

    /**
     * {@link #previous} sequence's length, used in assertions only.
     */
    private int previousLength;

    /** Number of serialization buffer reallocations. */
    private int serializationBufferReallocations;

    /** Creates a builder with the default internal buffer growth size. */
    public FSABuilder() {
        this(BUFFER_GROWTH_SIZE);
    }

    /**
     * Creates a builder with a custom buffer growth size (clamped to hold at
     * least one full state of {@link #MAX_LABELS} arcs).
     */
    public FSABuilder(int bufferGrowthSize) {
        this.bufferGrowthSize = Math.max(bufferGrowthSize, ARC_SIZE * MAX_LABELS);

        // Allocate epsilon state.
        epsilon = allocateState(1);
        serialized[epsilon + FLAGS_OFFSET] |= BIT_ARC_LAST;

        // Allocate root, with an initial empty set of output arcs.
        expandActivePath(1);
        root = activePath[0];
    }

    /**
     * Add a single sequence of bytes to the FSA. The input must be lexicographically greater
     * than any previously added sequence.
     *
     * @param sequence The array holding the sequence to add.
     * @param start Starting offset in <code>sequence</code>.
     * @param len Number of bytes to take from <code>sequence</code>.
     */
    public void add(byte[] sequence, int start, int len) {
        assert serialized != null : "Automaton already built.";
        assert previous == null || len == 0 || compare(previous, 0, previousLength, sequence, start, len) <= 0 :
            "Input must be sorted: "
                + Arrays.toString(previous, 0, previousLength) + " >= "
                + Arrays.toString(sequence, start, len);
        assert setPrevious(sequence, start, len);

        // Determine common prefix length.
        final int commonPrefix = commonPrefix(sequence, start, len);

        // Make room for extra states on active path, if needed.
        expandActivePath(len);

        // Freeze all the states after the common prefix.
        for (int i = activePathLen - 1; i > commonPrefix; i--) {
            final int frozenState = freezeState(i);
            setArcTarget(nextArcOffset[i - 1] - ARC_SIZE, frozenState);
            nextArcOffset[i] = activePath[i];
        }

        // Create arcs to new suffix states.
        for (int i = commonPrefix + 1, j = start + commonPrefix; i <= len; i++) {
            final int p = nextArcOffset[i - 1];

            serialized[p + FLAGS_OFFSET] = (byte) (i == len ? BIT_ARC_FINAL : 0);
            serialized[p + LABEL_OFFSET] = sequence[j++];
            setArcTarget(p, i == len ? TERMINAL_STATE : activePath[i]);

            nextArcOffset[i - 1] = p + ARC_SIZE;
        }

        // Save last sequence's length so that we don't need to calculate it again.
        this.activePathLen = len;
    }

    /**
     * Complete the automaton. No more sequences can be added afterwards.
     *
     * @return The finished, immutable automaton.
     */
    public FSA complete() {
        // Adding an empty sequence freezes the remaining active path.
        add(new byte[0], 0, 0);

        if (nextArcOffset[0] - activePath[0] == 0) {
            // An empty FSA.
            setArcTarget(epsilon, TERMINAL_STATE);
        } else {
            // An automaton with at least a single arc from root.
            root = freezeState(0);
            setArcTarget(epsilon, root);
        }

        info = new TreeMap<InfoEntry, Object>();
        info.put(InfoEntry.SERIALIZATION_BUFFER_SIZE, serialized.length);
        info.put(InfoEntry.SERIALIZATION_BUFFER_REALLOCATIONS, serializationBufferReallocations);
        info.put(InfoEntry.CONSTANT_ARC_AUTOMATON_SIZE, size);
        info.put(InfoEntry.MAX_ACTIVE_PATH_LENGTH, activePath.length);
        info.put(InfoEntry.STATE_REGISTRY_TABLE_SLOTS, hashSet.length);
        info.put(InfoEntry.STATE_REGISTRY_SIZE, hashSize);
        info.put(InfoEntry.ESTIMATED_MEMORY_CONSUMPTION_MB,
            (this.serialized.length + this.hashSet.length * 4) / (double) MB);

        final FSA fsa = new ConstantArcSizeFSA(java.util.Arrays.copyOf(this.serialized, this.size), epsilon);
        this.serialized = null;
        this.hashSet = null;
        return fsa;
    }

    /**
     * Build a minimal, deterministic automaton from a sorted list of byte sequences.
     */
    public static FSA build(byte[][] input) {
        final FSABuilder builder = new FSABuilder();

        for (byte[] chs : input)
            builder.add(chs, 0, chs.length);

        return builder.complete();
    }

    /**
     * Build a minimal, deterministic automaton from an iterable list of byte sequences.
     */
    public static FSA build(Iterable<byte[]> input) {
        final FSABuilder builder = new FSABuilder();

        for (byte[] chs : input)
            builder.add(chs, 0, chs.length);

        return builder.complete();
    }

    /**
     * Return various statistics concerning the FSA and its compilation.
     */
    public Map<InfoEntry, Object> getInfo() {
        return info;
    }

    /** Is this arc the state's last? */
    private boolean isArcLast(int arc) {
        return (serialized[arc + FLAGS_OFFSET] & BIT_ARC_LAST) != 0;
    }

    /** Is this arc final? */
    private boolean isArcFinal(int arc) {
        return (serialized[arc + FLAGS_OFFSET] & BIT_ARC_FINAL) != 0;
    }

    /** Get the arc's label. */
    private byte getArcLabel(int arc) {
        return serialized[arc + LABEL_OFFSET];
    }

    /**
     * Fills the target state address of an arc (big-endian, fixed width).
     */
    private void setArcTarget(int arc, int state) {
        arc += ADDRESS_OFFSET + TARGET_ADDRESS_SIZE;
        for (int i = 0; i < TARGET_ADDRESS_SIZE; i++) {
            serialized[--arc] = (byte) state;
            state >>>= 8;
        }
    }

    /**
     * Returns the target address of an arc.
     */
    private int getArcTarget(int arc) {
        arc += ADDRESS_OFFSET;
        return (serialized[arc]) << 24 |
               (serialized[arc + 1] & 0xff) << 16 |
               (serialized[arc + 2] & 0xff) << 8 |
               (serialized[arc + 3] & 0xff);
    }

    /**
     * @return The number of common prefix characters with the previous
     *         sequence.
     */
    private int commonPrefix(byte[] sequence, int start, int len) {
        // Empty root state case.
        final int max = Math.min(len, activePathLen);
        int i;
        for (i = 0; i < max; i++) {
            final int lastArc = nextArcOffset[i] - ARC_SIZE;
            if (sequence[start++] != getArcLabel(lastArc)) {
                break;
            }
        }

        return i;
    }

    /**
     * Freeze a state: try to find an equivalent state in the interned states
     * dictionary first; if found, return it, otherwise serialize the mutable
     * state at <code>activePathIndex</code> and return it.
     */
    private int freezeState(final int activePathIndex) {
        final int start = activePath[activePathIndex];
        final int end = nextArcOffset[activePathIndex];
        final int len = end - start;

        // Set the last arc flag on the current active path's state.
        serialized[end - ARC_SIZE + FLAGS_OFFSET] |= BIT_ARC_LAST;

        // Try to locate a state with an identical content in the hash set.
        final int bucketMask = (hashSet.length - 1);
        int slot = hash(start, len) & bucketMask;
        for (int i = 0;;) {
            int state = hashSet[slot];
            if (state == 0) {
                state = hashSet[slot] = serialize(activePathIndex);
                if (++hashSize > hashSet.length / 2)
                    expandAndRehash();
                return state;
            } else if (equivalent(state, start, len)) {
                return state;
            }

            // Open addressing with quadratic-style probing.
            slot = (slot + (++i)) & bucketMask;
        }
    }

    /**
     * Reallocate and rehash the hash set.
     */
    private void expandAndRehash() {
        final int[] newHashSet = new int[hashSet.length * 2];
        final int bucketMask = (newHashSet.length - 1);

        for (int j = 0; j < hashSet.length; j++) {
            final int state = hashSet[j];
            if (state > 0) {
                int slot = hash(state, stateLength(state)) & bucketMask;
                for (int i = 0; newHashSet[slot] > 0;) {
                    slot = (slot + (++i)) & bucketMask;
                }
                newHashSet[slot] = state;
            }
        }
        this.hashSet = newHashSet;
    }

    /**
     * The total length of the serialized state data (all arcs).
     */
    private int stateLength(int state) {
        int arc = state;
        while (!isArcLast(arc)) {
            arc += ARC_SIZE;
        }
        return arc - state + ARC_SIZE;
    }

    /** Return <code>true</code> if two regions in {@link #serialized} are identical. */
    private boolean equivalent(int start1, int start2, int len) {
        if (start1 + len > size || start2 + len > size)
            return false;

        while (len-- > 0)
            if (serialized[start1++] != serialized[start2++])
                return false;

        return true;
    }

    /**
     * Serialize a given state on the active path (append a copy of its arcs
     * to the frozen region of {@link #serialized}).
     */
    private int serialize(final int activePathIndex) {
        expandBuffers();

        final int newState = size;
        final int start = activePath[activePathIndex];
        final int len = nextArcOffset[activePathIndex] - start;
        System.arraycopy(serialized, start, serialized, newState, len);

        size += len;
        return newState;
    }

    /**
     * Hash code of a fragment of the {@link #serialized} array.
     */
    private int hash(int start, int byteCount) {
        assert byteCount % ARC_SIZE == 0 : "Not an arc multiply?";

        int h = 0;
        for (int arcs = byteCount / ARC_SIZE; --arcs >= 0; start += ARC_SIZE) {
            h = 17 * h + getArcLabel(start);
            h = 17 * h + getArcTarget(start);
            if (isArcFinal(start)) h += 17;
        }

        return h;
    }

    /**
     * Append new mutable states to the active path so it can hold
     * <code>size</code> entries.
     */
    private void expandActivePath(int size) {
        if (activePath.length < size) {
            final int p = activePath.length;
            activePath = java.util.Arrays.copyOf(activePath, size);
            nextArcOffset = java.util.Arrays.copyOf(nextArcOffset, size);

            for (int i = p; i < size; i++) {
                nextArcOffset[i] = activePath[i] =
                    allocateState(/* assume max labels count */ MAX_LABELS);
            }
        }
    }

    /**
     * Expand internal buffers for the next state.
     */
    private void expandBuffers() {
        if (this.serialized.length < size + ARC_SIZE * MAX_LABELS) {
            serialized = java.util.Arrays.copyOf(serialized, serialized.length + bufferGrowthSize);
            serializationBufferReallocations++;
        }
    }

    /**
     * Allocate space for a state with the given number of outgoing labels.
     *
     * @return state offset
     */
    private int allocateState(int labels) {
        expandBuffers();
        final int state = size;
        size += labels * ARC_SIZE;
        return state;
    }

    /**
     * Copy <code>sequence</code> into an internal buffer (used in assertions only).
     */
    private boolean setPrevious(byte[] sequence, int start, int length) {
        if (previous == null || previous.length < length) {
            previous = new byte[length];
        }

        System.arraycopy(sequence, start, previous, 0, length);
        previousLength = length;

        return true;
    }

    /**
     * Lexicographic order of input sequences. By default, consistent with the "C" sort
     * (absolute value of bytes, 0-255).
     */
    public static int compare(byte[] s1, int start1, int lens1,
                              byte[] s2, int start2, int lens2) {
        final int max = Math.min(lens1, lens2);

        for (int i = 0; i < max; i++) {
            final byte c1 = s1[start1++];
            final byte c2 = s2[start2++];
            if (c1 != c2)
                return (c1 & 0xff) - (c2 & 0xff);
        }

        return lens1 - lens2;
    }
}
FSAFinalStatesIterator.java 0000664 0000000 0000000 00000010673 12344607157 0034012 0 ustar 00root root 0000000 0000000 morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/main/java/morfologik/fsa package morfologik.fsa;
import java.nio.ByteBuffer;
import java.util.*;
/**
 * An iterator that traverses the right language of a given node (all sequences
 * reachable from a given node).
 */
public final class FSAFinalStatesIterator implements Iterator<ByteBuffer> {
    /**
     * Default expected depth of the recursion stack (estimated longest sequence
     * in the automaton). Buffers expand by the same value if exceeded.
     */
    private final static int EXPECTED_MAX_STATES = 15;

    /** The FSA to which this iterator belongs. */
    private final FSA fsa;

    /** An internal cache for the next element in the FSA. */
    private ByteBuffer nextElement;

    /**
     * A buffer for the current sequence of bytes from the current node to the
     * root.
     */
    private byte[] buffer = new byte[EXPECTED_MAX_STATES];

    /** Reusable byte buffer wrapper around {@link #buffer}. */
    private ByteBuffer bufferWrapper = ByteBuffer.wrap(buffer);

    /** An arc stack for DFS when processing the automaton. */
    private int[] arcs = new int[EXPECTED_MAX_STATES];

    /** Current processing depth in {@link #arcs}. */
    private int position;

    /**
     * Create an instance of the iterator for a given node.
     *
     * @param fsa The automaton to iterate over.
     * @param node The starting node (iteration covers its right language).
     */
    public FSAFinalStatesIterator(FSA fsa, int node) {
        this.fsa = fsa;

        if (fsa.getFirstArc(node) != 0) {
            restartFrom(node);
        }
    }

    /**
     * Restart walking from <code>node</code>. Allows iterator reuse.
     */
    public void restartFrom(int node) {
        position = 0;
        bufferWrapper.clear();
        nextElement = null;

        pushNode(node);
    }

    /** Returns <code>true</code> if there are still elements in this iterator. */
    @Override
    public boolean hasNext() {
        if (nextElement == null) {
            nextElement = advance();
        }

        return nextElement != null;
    }

    /**
     * @return Returns a {@link ByteBuffer} with the sequence corresponding to
     *         the next final state in the automaton. The buffer is reused
     *         between calls.
     */
    @Override
    public ByteBuffer next() {
        if (nextElement != null) {
            final ByteBuffer cache = nextElement;
            nextElement = null;
            return cache;
        } else {
            final ByteBuffer cache = advance();
            if (cache == null) {
                throw new NoSuchElementException();
            }
            return cache;
        }
    }

    /**
     * Advances to the next available final state (depth-first, keeping an
     * explicit stack of arcs in {@link #arcs}).
     */
    private final ByteBuffer advance() {
        if (position == 0) {
            return null;
        }

        while (position > 0) {
            final int lastIndex = position - 1;
            final int arc = arcs[lastIndex];

            if (arc == 0) {
                // Remove the current node from the queue.
                position--;
                continue;
            }

            // Go to the next arc, but leave it on the stack
            // so that we keep the recursion depth level accurate.
            arcs[lastIndex] = fsa.getNextArc(arc);

            // Expand buffer if needed.
            final int bufferLength = this.buffer.length;
            if (lastIndex >= bufferLength) {
                this.buffer = Arrays.copyOf(buffer, bufferLength + EXPECTED_MAX_STATES);
                this.bufferWrapper = ByteBuffer.wrap(buffer);
            }
            buffer[lastIndex] = fsa.getArcLabel(arc);

            if (!fsa.isArcTerminal(arc)) {
                // Recursively descend into the arc's node.
                pushNode(fsa.getEndNode(arc));
            }

            if (fsa.isArcFinal(arc)) {
                bufferWrapper.clear();
                bufferWrapper.limit(lastIndex + 1);
                return bufferWrapper;
            }
        }

        return null;
    }

    /**
     * Not implemented in this iterator.
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("Read-only iterator.");
    }

    /**
     * Descends to a given node, adds its arcs to the stack to be traversed.
     */
    private void pushNode(int node) {
        // Expand buffers if needed.
        if (position == arcs.length) {
            arcs = Arrays.copyOf(arcs, arcs.length + EXPECTED_MAX_STATES);
        }

        arcs[position++] = fsa.getFirstArc(node);
    }
} morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/main/java/morfologik/fsa/FSAFlags.java 0000664 0000000 0000000 00000002441 12344607157 0031170 0 ustar 00root root 0000000 0000000 package morfologik.fsa;
import java.util.Set;
/**
 * FSA automaton flags. Where applicable, flags follow Daciuk's
 * <code>fsa</code> package.
 */
public enum FSAFlags {
    /** Daciuk: flexible FSA encoding. */
    FLEXIBLE(1 << 0),

    /** Daciuk: stop bit in use. */
    STOPBIT(1 << 1),

    /** Daciuk: next bit in use. */
    NEXTBIT(1 << 2),

    /** Daciuk: tails compression. */
    TAILS(1 << 3),

    /*
     * These flags are outside of byte range (never occur in Daciuk's FSA).
     */

    /**
     * The FSA contains right-language count numbers on states.
     *
     * @see FSA#getRightLanguageCount(int)
     */
    NUMBERS(1 << 8),

    /**
     * The FSA supports legacy built-in separator and filler characters (Daciuk's FSA package
     * compatibility).
     */
    SEPARATORS(1 << 9);

    /**
     * Bit mask for the corresponding flag.
     */
    public final int bits;

    /** Creates a flag with the given bit mask. */
    private FSAFlags(int bits) {
        this.bits = bits;
    }

    /**
     * Returns <code>true</code> if the corresponding flag is set in the bit set.
     */
    public static boolean isSet(int flags, FSAFlags flag) {
        return (flags & flag.bits) != 0;
    }

    /**
     * Returns the set of flags encoded in a single short.
     */
    public static short asShort(Set<FSAFlags> flags) {
        short value = 0;
        for (FSAFlags f : flags)
            value |= f.bits;
        return value;
    }
} morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/main/java/morfologik/fsa/FSAHeader.java 0000664 0000000 0000000 00000002353 12344607157 0031326 0 ustar 00root root 0000000 0000000 package morfologik.fsa;
import java.io.IOException;
import java.io.InputStream;
import morfologik.util.FileUtils;
import static morfologik.util.FileUtils.*;
/**
 * Standard FSA file header, as described in <code>fsa</code> package
 * documentation.
 */
final class FSAHeader {
    /**
     * FSA magic (4 bytes).
     */
    public final static int FSA_MAGIC = ('\\' << 24) | ('f' << 16) | ('s' << 8) | ('a');

    /**
     * Maximum length of the header block.
     */
    public static final int MAX_HEADER_LENGTH = 4 + 8;

    /** FSA version number. */
    public byte version;

    /** Filler character. */
    public byte filler;

    /** Annotation character. */
    public byte annotation;

    /** Goto field (may be a compound, depending on the automaton version). */
    public byte gtl;

    /**
     * Read FSA header from a stream, consuming its bytes.
     *
     * @throws IOException If the stream ends prematurely or if it contains invalid data.
     */
    public static FSAHeader read(InputStream in) throws IOException {
        // Verify the 4-byte magic before reading anything else.
        final int magic = FileUtils.readInt(in);
        if (magic != FSA_MAGIC) {
            throw new IOException("Invalid file header magic bytes.");
        }

        final FSAHeader header = new FSAHeader();
        header.version = readByte(in);
        header.filler = readByte(in);
        header.annotation = readByte(in);
        header.gtl = readByte(in);
        return header;
    }
}
morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/main/java/morfologik/fsa/FSAInfo.java 0000664 0000000 0000000 00000006243 12344607157 0031033 0 ustar 00root root 0000000 0000000 package morfologik.fsa;
import java.util.BitSet;
import com.carrotsearch.hppc.IntIntOpenHashMap;
/**
 * Compute additional information about an FSA: number of arcs, nodes, etc.
 */
public final class FSAInfo {
/**
 * Computes the exact number of states and nodes by recursively traversing
 * the FSA.
 */
private static class NodeVisitor {
// Arcs and nodes already seen (arcs may be shared between nodes after merging).
final BitSet visitedArcs = new BitSet();
final BitSet visitedNodes = new BitSet();
// Counters accumulated during traversal.
int nodes;
int arcs;
int totalArcs;
private final FSA fsa;
NodeVisitor(FSA fsa) {
this.fsa = fsa;
}
// Recursive DFS: counts each node once; 'arcs' counts distinct arcs,
// 'totalArcs' counts every visit (shared arcs counted repeatedly).
public void visitNode(final int node) {
if (visitedNodes.get(node)) {
return;
}
visitedNodes.set(node);
nodes++;
for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa
.getNextArc(arc)) {
if (!visitedArcs.get(arc)) {
arcs++;
}
totalArcs++;
visitedArcs.set(arc);
if (!fsa.isArcTerminal(arc)) {
visitNode(fsa.getEndNode(arc));
}
}
}
}
/**
 * Computes the exact number of final states.
 */
private static class FinalStateVisitor {
// Memoized per-node counts of final arcs reachable from each node.
final IntIntOpenHashMap visitedNodes = new IntIntOpenHashMap();
private final FSA fsa;
FinalStateVisitor(FSA fsa) {
this.fsa = fsa;
}
// Recursive DFS with memoization: returns the number of final arcs
// reachable from this node (each final arc = one stored sequence).
public int visitNode(int node) {
if (visitedNodes.containsKey(node))
// HPPC idiom: lget() returns the value located by the preceding
// containsKey() call without a second lookup.
return visitedNodes.lget();
int fromHere = 0;
for (int arc = fsa.getFirstArc(node);
arc != 0; arc = fsa.getNextArc(arc))
{
if (fsa.isArcFinal(arc))
fromHere++;
if (!fsa.isArcTerminal(arc)) {
fromHere += visitNode(fsa.getEndNode(arc));
}
}
visitedNodes.put(node, fromHere);
return fromHere;
}
}
/**
 * Number of nodes in the automaton.
 */
public final int nodeCount;
/**
 * Number of arcs in the automaton, excluding an arcs from the zero node
 * (initial) and an arc from the start node to the root node.
 */
public final int arcsCount;
/**
 * Total number of arcs, counting arcs that physically overlap due to
 * merging.
 */
public final int arcsCountTotal;
/**
 * Number of final states (number of input sequences stored in the automaton).
 */
public final int finalStatesCount;
/**
 * Arcs size (in serialized form).
 */
public final int size;
/**
 * Computes all statistics for the given automaton by traversing it.
 */
public FSAInfo(FSA fsa) {
final NodeVisitor w = new NodeVisitor(fsa);
int root = fsa.getRootNode();
if (root > 0) {
w.visitNode(root);
}
// NOTE(review): the "1 +" offsets apparently account for an implicit
// initial node/arc not reached by the traversal -- confirm against the
// FSA binary format documentation.
this.nodeCount = 1 + w.nodes;
this.arcsCount = 1 + w.arcs;
this.arcsCountTotal = 1 + w.totalArcs;
final FinalStateVisitor fsv = new FinalStateVisitor(fsa);
this.finalStatesCount = fsv.visitNode(fsa.getRootNode());
// Serialized size is only directly available for FSA5 automata.
if (fsa instanceof FSA5) {
this.size = ((FSA5) fsa).arcs.length;
} else {
this.size = 0;
}
}
/**
 * Creates an info holder from precomputed values (serialized size unknown).
 */
public FSAInfo(int nodeCount, int arcsCount, int arcsCountTotal, int finalStatesCount) {
this.nodeCount = nodeCount;
this.arcsCount = arcsCount;
this.arcsCountTotal = arcsCountTotal;
this.finalStatesCount = finalStatesCount;
this.size = 0;
}
/**
 * Returns a human-readable summary of the computed statistics.
 */
@Override
public String toString() {
return "Nodes: " + nodeCount
+ ", arcs visited: " + arcsCount
+ ", arcs total: " + arcsCountTotal
+ ", final states: " + finalStatesCount
+ ", size: " + size;
}
}
morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/main/java/morfologik/fsa/FSASerializer.java 0000664 0000000 0000000 00000002361 12344607157 0032246 0 ustar 00root root 0000000 0000000 package morfologik.fsa;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Set;
/**
 * All FSA serializers to binary formats will implement this interface.
 */
public interface FSASerializer {
    /**
     * Serialize a finite state automaton to an output stream.
     *
     * @param fsa The automaton to serialize.
     * @param os The stream to write to.
     * @return Returns <code>os</code> for call chaining.
     */
    public <T extends OutputStream> T serialize(FSA fsa, T os) throws IOException;

    /**
     * Returns the set of flags supported by the serializer (and the output automaton).
     */
    public Set<FSAFlags> getFlags();

    /**
     * Log extra messages during construction.
     */
    public FSASerializer withLogger(IMessageLogger logger);

    /**
     * Supports built-in filler separator. Only if {@link #getFlags()} returns
     * {@link FSAFlags#SEPARATORS}.
     */
    public FSASerializer withFiller(byte filler);

    /**
     * Supports built-in annotation separator. Only if {@link #getFlags()} returns
     * {@link FSAFlags#SEPARATORS}.
     */
    public FSASerializer withAnnotationSeparator(byte annotationSeparator);

    /**
     * Supports built-in right language count on nodes, speeding up perfect hash counts.
     * Only if {@link #getFlags()} returns {@link FSAFlags#NUMBERS}.
     */
    public FSASerializer withNumbers();
}
morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/main/java/morfologik/fsa/FSATraversal.java 0000664 0000000 0000000 00000011721 12344607157 0032100 0 ustar 00root root 0000000 0000000 package morfologik.fsa;
import static morfologik.fsa.MatchResult.*;
/**
* This class implements some common matching and scanning operations on a
* generic FSA.
*/
public final class FSATraversal {
/**
* Target automaton.
*/
private final FSA fsa;
/**
* Traversals of the given FSA.
*/
public FSATraversal(FSA fsa) {
this.fsa = fsa;
}
/**
* Calculate perfect hash for a given input sequence of bytes. The perfect hash requires
* that {@link FSA} is built with {@link FSAFlags#NUMBERS} and corresponds to the sequential
* order of input sequences used at automaton construction time.
*
* @param start Start index in the sequence array.
* @param length Length of the byte sequence, must be at least 1.
*
* @return Returns a unique integer assigned to the input sequence in the automaton (reflecting
* the number of that sequence in the input used to build the automaton). Returns a negative
* integer if the input sequence was not part of the input from which the automaton was created.
* The type of mismatch is a constant defined in {@link MatchResult}.
*/
public int perfectHash(byte[] sequence, int start, int length, int node) {
assert fsa.getFlags().contains(FSAFlags.NUMBERS) : "FSA not built with NUMBERS option.";
assert length > 0 : "Must be a non-empty sequence.";
int hash = 0;
final int end = start + length - 1;
int seqIndex = start;
byte label = sequence[seqIndex];
// Seek through the current node's labels, looking for 'label', update hash.
for (int arc = fsa.getFirstArc(node); arc != 0;) {
if (fsa.getArcLabel(arc) == label) {
if (fsa.isArcFinal(arc)) {
if (seqIndex == end)
return hash;
hash++;
}
if (fsa.isArcTerminal(arc)) {
/* The automaton contains a prefix of the input sequence. */
return AUTOMATON_HAS_PREFIX;
}
// The sequence is a prefix of one of the sequences stored in the automaton.
if (seqIndex == end) {
return SEQUENCE_IS_A_PREFIX;
}
// Make a transition along the arc, go the target node's first arc.
arc = fsa.getFirstArc(fsa.getEndNode(arc));
label = sequence[++seqIndex];
continue;
} else {
if (fsa.isArcFinal(arc))
hash++;
if (!fsa.isArcTerminal(arc))
hash += fsa.getRightLanguageCount(fsa.getEndNode(arc));
}
arc = fsa.getNextArc(arc);
}
// Labels of this node ended without a match on the sequence.
// Perfect hash does not exist.
return NO_MATCH;
}
/**
* @see #perfectHash(byte[], int, int, int)
*/
public int perfectHash(byte[] sequence) {
return perfectHash(sequence, 0, sequence.length, fsa.getRootNode());
}
/**
 * Same as {@link #match(byte[], int, int, int)}, but allows passing
 * a reusable {@link MatchResult} object so that no intermediate garbage is
 * produced.
 *
 * @return The same object as <code>result</code>, but with reset internal
 *         type and other fields.
 */
public MatchResult match(MatchResult result,
                         byte[] sequence, int start, int length, int node)
{
    if (node == 0) {
        // Zero is the "no node" sentinel; nothing can be matched from it.
        result.reset(NO_MATCH, start, node);
        return result;
    }

    final FSA fsa = this.fsa;
    final int end = start + length;
    for (int i = start; i < end; i++) {
        final int arc = fsa.getArc(node, sequence[i]);
        if (arc != 0) {
            if (fsa.isArcFinal(arc) && i + 1 == end) {
                /* The automaton has an exact match of the input sequence. */
                result.reset(EXACT_MATCH, i, node);
                return result;
            }

            if (fsa.isArcTerminal(arc)) {
                /* The automaton contains a prefix of the input sequence. */
                result.reset(AUTOMATON_HAS_PREFIX, i + 1, 0);
                return result;
            }

            // Make a transition along the arc.
            node = fsa.getEndNode(arc);
        } else {
            // No outgoing arc for this label: mismatch at position i.
            result.reset(NO_MATCH, i, node);
            return result;
        }
    }

    /* The sequence is a prefix of at least one sequence in the automaton. */
    result.reset(SEQUENCE_IS_A_PREFIX, 0, node);
    return result;
}
/**
 * Finds a matching path in the dictionary for a given sequence of labels
 * from <code>sequence</code> and starting at node <code>node</code>.
 *
 * @param sequence
 *            An array of labels to follow in the FSA.
 * @param start
 *            Starting index in <code>sequence</code>.
 * @param length
 *            How many symbols to consider from <code>sequence</code>?
 * @param node
 *            Start node identifier in the FSA.
 *
 * @see #match(byte [], int)
 */
public MatchResult match(byte[] sequence, int start, int length, int node) {
    // Allocates a fresh result holder; prefer the overload taking a
    // MatchResult when matching in tight loops.
    return match(new MatchResult(), sequence, start, length, node);
}
/**
 * Matches the whole of <code>sequence</code>, starting from the given node.
 *
 * @see #match(byte[], int, int, int)
 */
public MatchResult match(byte[] sequence, int node) {
    final int offset = 0;
    return match(sequence, offset, sequence.length, node);
}
/**
 * Matches the whole of <code>sequence</code> against the automaton,
 * starting from its root node.
 *
 * @see #match(byte[], int, int, int)
 */
public MatchResult match(byte[] sequence) {
    final int root = fsa.getRootNode();
    return match(sequence, root);
}
} morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/main/java/morfologik/fsa/FSAUtils.java 0000664 0000000 0000000 00000013647 12344607157 0031246 0 ustar 00root root 0000000 0000000 package morfologik.fsa;
import java.io.IOException;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.TreeMap;
import com.carrotsearch.hppc.IntIntOpenHashMap;
/**
 * Other FSA-related utilities not directly associated with the class hierarchy.
 */
public final class FSAUtils {
    /** A simple mutable holder of two primitive ints. */
    public final static class IntIntHolder {
        public int a;
        public int b;

        public IntIntHolder(int a, int b) {
            this.a = a;
            this.b = b;
        }

        public IntIntHolder() {
        }
    }

    /**
     * Returns the right-language reachable from a given FSA node, formatted
     * as an input for the graphviz package (expressed in the <code>dot</code>
     * language).
     */
    public static String toDot(FSA fsa, int node) {
        try {
            StringWriter w = new StringWriter();
            toDot(w, fsa, node);
            return w.toString();
        } catch (IOException e) {
            // StringWriter does not throw IOException; rethrow unchecked.
            throw new RuntimeException(e);
        }
    }

    /**
     * Saves the right-language reachable from a given FSA node, formatted
     * as an input for the graphviz package (expressed in the <code>dot</code>
     * language), to the given writer.
     */
    public static void toDot(Writer w, FSA fsa, int node) throws IOException {
        w.write("digraph Automaton {\n");
        w.write(" rankdir = LR;\n");

        final BitSet visited = new BitSet();

        w.write(" stop [shape=doublecircle,label=\"\"];\n");
        w.write(" initial [shape=plaintext,label=\"\"];\n");
        w.write(" initial -> " + node + "\n\n");

        visitNode(w, 0, fsa, node, visited);
        w.write("}\n");
    }

    /**
     * Emits a single node and its outgoing arcs, then recurses into
     * unvisited target nodes.
     */
    private static void visitNode(Writer w, int d, FSA fsa, int s, BitSet visited) throws IOException {
        visited.set(s);
        w.write(" "); w.write(Integer.toString(s));

        if (fsa.getFlags().contains(FSAFlags.NUMBERS)) {
            // With NUMBERS, label each node with its right-language cardinality.
            int nodeNumber = fsa.getRightLanguageCount(s);
            w.write(" [shape=circle,label=\"" + nodeNumber + "\"];\n");
        } else {
            w.write(" [shape=circle,label=\"\"];\n");
        }

        for (int arc = fsa.getFirstArc(s); arc != 0; arc = fsa.getNextArc(arc)) {
            w.write(" ");
            w.write(Integer.toString(s));
            w.write(" -> ");
            if (fsa.isArcTerminal(arc)) {
                w.write("stop");
            } else {
                w.write(Integer.toString(fsa.getEndNode(arc)));
            }

            final byte label = fsa.getArcLabel(arc);
            w.write(" [label=\"");
            if (Character.isLetterOrDigit(label))
                w.write((char) label);
            else {
                // Non-printable labels are shown as hex values.
                w.write("0x");
                w.write(Integer.toHexString(label & 0xFF));
            }
            w.write("\"");
            if (fsa.isArcFinal(arc)) w.write(" arrowhead=\"tee\"");
            if (fsa instanceof FSA5) {
                if (((FSA5) fsa).isNextSet(arc)) {
                    w.write(" color=\"blue\"");
                }
            }

            w.write("]\n");
        }

        for (int arc = fsa.getFirstArc(s); arc != 0; arc = fsa.getNextArc(arc)) {
            if (!fsa.isArcTerminal(arc)) {
                int endNode = fsa.getEndNode(arc);
                if (!visited.get(endNode)) {
                    visitNode(w, d + 1, fsa, endNode, visited);
                }
            }
        }
    }

    /**
     * All byte sequences generated as the right language of <code>state</code>.
     */
    public static ArrayList<byte[]> rightLanguage(FSA fsa, int state) {
        final ArrayList<byte[]> rl = new ArrayList<byte[]>();
        final byte [] buffer = new byte [0];

        descend(fsa, state, buffer, 0, rl);

        return rl;
    }

    /**
     * Recursive descend and collection of the right language.
     * Returns the (possibly re-allocated) scratch buffer.
     */
    private static byte [] descend(FSA fsa, int state, byte [] b, int position, ArrayList<byte[]> rl) {
        if (b.length <= position) {
            // Grow the scratch buffer one byte at a time as the path deepens.
            b = Arrays.copyOf(b, position + 1);
        }

        for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) {
            b[position] = fsa.getArcLabel(arc);

            if (fsa.isArcFinal(arc)) {
                rl.add(Arrays.copyOf(b, position + 1));
            }

            if (!fsa.isArcTerminal(arc))
                b = descend(fsa, fsa.getEndNode(arc), b, position + 1, rl);
        }

        return b;
    }

    /**
     * Calculate fan-out ratio.
     * @return A map from the number of outgoing arcs to the count of states
     *         with that many arcs (zero-count entries trimmed at both ends).
     */
    public static TreeMap<Integer, Integer> calculateFanOuts(final FSA fsa, int root) {
        final int [] result = new int [256];
        fsa.visitInPreOrder(new StateVisitor() {
            public boolean accept(int state) {
                int count = 0;
                for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc))
                    count++;
                result[count]++;
                return true;
            }
        });

        TreeMap<Integer, Integer> output = new TreeMap<Integer, Integer>();

        int low = 1; // Omit #0, there is always a single node like that (dummy).
        while (low < result.length && result[low] == 0) low++;

        int high = result.length - 1;
        while (high >= 0 && result[high] == 0) high--;

        for (int i = low; i <= high; i++) {
            output.put(i, result[i]);
        }

        return output;
    }

    /**
     * Calculate the size of right language for each state in an FSA.
     */
    public static IntIntOpenHashMap rightLanguageForAllStates(final FSA fsa) {
        final IntIntOpenHashMap numbers = new IntIntOpenHashMap();

        // Post-order guarantees all successors are counted before a state.
        fsa.visitInPostOrder(new StateVisitor() {
            public boolean accept(int state) {
                int thisNodeNumber = 0;
                for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) {
                    thisNodeNumber +=
                        (fsa.isArcFinal(arc) ? 1 : 0) +
                        (fsa.isArcTerminal(arc) ? 0 : numbers.get(fsa.getEndNode(arc)));
                }
                numbers.put(state, thisNodeNumber);

                return true;
            }
        });

        return numbers;
    }
}
morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/main/java/morfologik/fsa/IMessageLogger.java 0000664 0000000 0000000 00000000621 12344607157 0032435 0 ustar 00root root 0000000 0000000 package morfologik.fsa;
public interface IMessageLogger {
    /**
     * Log progress to the console.
     */
    public void log(String msg);

    /**
     * Log message header and save current time.
     */
    public void startPart(String header);

    /**
     * Marks the end of the part opened by the matching
     * {@link #startPart(String)} call.
     */
    public void endPart();

    /**
     * Log a two-part message.
     */
    public void log(String header, Object v);
} morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/main/java/morfologik/fsa/MatchResult.java 0000664 0000000 0000000 00000003543 12344607157 0032041 0 ustar 00root root 0000000 0000000 package morfologik.fsa;
/**
 * A matching result returned from {@link FSATraversal}.
 *
 * @see FSATraversal
 */
public final class MatchResult {
    /**
     * The automaton has exactly one match for the input sequence.
     */
    public static final int EXACT_MATCH = 0;

    /**
     * The automaton has no match for the input sequence.
     */
    public static final int NO_MATCH = -1;

    /**
     * The automaton contains a prefix of the input sequence. That is:
     * one of the input sequences used to build the automaton is a
     * prefix of the input sequence that is shorter than the sequence.
     *
     * {@link MatchResult#index} will contain an index of the
     * first character of the input sequence not present in the
     * dictionary.
     */
    // NOTE(review): value -2 is skipped between NO_MATCH and this constant;
    // presumably a historical constant was removed — confirm against upstream.
    public static final int AUTOMATON_HAS_PREFIX = -3;

    /**
     * The sequence is a prefix of at least one sequence in the automaton.
     * {@link MatchResult#node} returns the node from which all sequences
     * with the given prefix start in the automaton.
     */
    public static final int SEQUENCE_IS_A_PREFIX = -4;

    /**
     * One of the match kind constants defined in this class.
     *
     * @see #NO_MATCH
     * @see #EXACT_MATCH
     * @see #AUTOMATON_HAS_PREFIX
     * @see #SEQUENCE_IS_A_PREFIX
     */
    public int kind;

    /**
     * Input sequence's index, interpretation depends on {@link #kind}.
     */
    public int index;

    /**
     * Automaton node, interpretation depends on the {@link #kind}.
     */
    public int node;

    /*
     * Package-private constructor: instances are created and reused by
     * FSATraversal.
     */
    MatchResult(int kind, int index, int node) {
        reset(kind, index, node);
    }

    /*
     * Package-private convenience constructor with zeroed index and node.
     */
    MatchResult(int kind) {
        reset(kind, 0, 0);
    }

    /*
     * Public constructor creating a reusable NO_MATCH result.
     */
    public MatchResult() {
        reset(NO_MATCH, 0, 0);
    }

    /*
     * Re-initialize all fields in place (avoids allocation in tight loops).
     */
    final void reset(int kind, int index, int node) {
        this.kind = kind;
        this.index = index;
        this.node = node;
    }
}
morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/main/java/morfologik/fsa/NullMessageLogger.java 0000664 0000000 0000000 00000000535 12344607157 0033163 0 ustar 00root root 0000000 0000000 package morfologik.fsa;
/*
 * Do-nothing logger: a null-object implementation of IMessageLogger used
 * when no logging output is desired.
 */
final class NullMessageLogger implements IMessageLogger {
    @Override
    public void log(String msg) {
    }

    @Override
    public void startPart(String header) {
    }

    @Override
    public void endPart() {
    }

    @Override
    public void log(String header, Object v) {
    }
}
morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/main/java/morfologik/fsa/StateVisitor.java 0000664 0000000 0000000 00000000323 12344607157 0032237 0 ustar 00root root 0000000 0000000 package morfologik.fsa;
/**
 * State visitor.
 *
 * @see FSA#visitInPostOrder(StateVisitor)
 * @see FSA#visitInPreOrder(StateVisitor)
 */
public interface StateVisitor {
    /**
     * @param state Identifier of the visited state.
     * @return Presumably <code>false</code> aborts the traversal —
     *         confirm against {@link FSA#visitInPreOrder(StateVisitor)}.
     */
    public boolean accept(int state);
} morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/main/java/morfologik/util/ 0000775 0000000 0000000 00000000000 12344607157 0027142 5 ustar 00root root 0000000 0000000 morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/main/java/morfologik/util/Arrays.java 0000664 0000000 0000000 00000003170 12344607157 0031247 0 ustar 00root root 0000000 0000000 package morfologik.util;
/**
 * Compatibility layer for JVM 1.5.
 */
public final class Arrays {
    private Arrays() {
        // No instances.
    }

    /**
     * Compare two lists of objects for reference-equality.
     *
     * @param a1 First array.
     * @param a1s Starting offset in <code>a1</code>.
     * @param a2 Second array.
     * @param a2s Starting offset in <code>a2</code>.
     * @param length Number of elements to compare.
     */
    public static boolean referenceEquals(Object[] a1, int a1s, Object[] a2, int a2s, int length) {
        for (int i = 0; i < length; i++)
            if (a1[a1s++] != a2[a2s++])
                return false;

        return true;
    }

    /**
     * Compare two array ranges for value equality.
     */
    public static boolean equals(byte[] a1, int a1s, byte [] a2, int a2s, int length) {
        for (int i = 0; i < length; i++)
            if (a1[a1s++] != a2[a2s++])
                return false;

        return true;
    }

    /**
     * Compare two array ranges for value equality.
     */
    public static boolean equals(boolean[] a1, int a1s, boolean[] a2, int a2s, int length) {
        for (int i = 0; i < length; i++)
            if (a1[a1s++] != a2[a2s++])
                return false;

        return true;
    }

    /**
     * Compare two array ranges for value equality.
     */
    public static boolean equals(int[] a1, int a1s, int[] a2, int a2s, int length) {
        for (int i = 0; i < length; i++)
            if (a1[a1s++] != a2[a2s++])
                return false;

        return true;
    }

    /**
     * Render a range of a byte array in the same textual format as
     * {@link java.util.Arrays#toString(byte[])}.
     *
     * @param bytes Source array.
     * @param start Starting offset of the range.
     * @param length Number of bytes in the range.
     */
    public static String toString(byte [] bytes, int start, int length)
    {
        // Copy the range whenever it is not exactly the whole array.
        // (The previous check compared only lengths and ignored 'start',
        // which mishandled a non-zero offset with length == bytes.length.)
        if (start != 0 || bytes.length != length)
        {
            final byte [] sub = new byte [length];
            System.arraycopy(bytes, start, sub, 0, length);
            bytes = sub;
        }
        return java.util.Arrays.toString(bytes);
    }
}
morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/main/java/morfologik/util/BufferUtils.java 0000664 0000000 0000000 00000002661 12344607157 0032244 0 ustar 00root root 0000000 0000000 package morfologik.util;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
/**
 * Utility functions for buffers.
 */
public final class BufferUtils {
    /**
     * No instances.
     */
    private BufferUtils() {
        // Static utility methods only.
    }

    /**
     * Ensure the byte buffer's capacity. If a new buffer is allocated, its
     * content is empty (the old buffer's contents is not copied).
     *
     * @param buffer
     *            The buffer to check or <code>null</code> if a new buffer
     *            should be allocated.
     */
    public static ByteBuffer ensureCapacity(ByteBuffer buffer, int capacity) {
        final boolean reusable = buffer != null && buffer.capacity() >= capacity;
        return reusable ? buffer : ByteBuffer.allocate(capacity);
    }

    /**
     * Ensure the char buffer's capacity. If a new buffer is allocated, its
     * content is empty (the old buffer's contents is not copied).
     *
     * @param buffer
     *            The buffer to check or <code>null</code> if a new buffer
     *            should be allocated.
     */
    public static CharBuffer ensureCapacity(CharBuffer buffer, int capacity) {
        final boolean reusable = buffer != null && buffer.capacity() >= capacity;
        return reusable ? buffer : CharBuffer.allocate(capacity);
    }

    /**
     * Convert a byte buffer to a string in platform default encoding.
     */
    public static String toString(ByteBuffer sequence) {
        final byte[] backing = sequence.array();
        return new String(backing, sequence.position(), sequence.remaining());
    }
} morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/main/java/morfologik/util/FileUtils.java 0000664 0000000 0000000 00000006127 12344607157 0031713 0 ustar 00root root 0000000 0000000 package morfologik.util;
import java.io.*;
/**
 * Utility functions.
 */
public final class FileUtils {
    /**
     * No instances.
     */
    private FileUtils() {
        // empty
    }

    /**
     * Checks if the given file exists, throwing {@link IOException} if it
     * does not (or is not a readable file / a directory, when required).
     */
    public static void assertExists(File fsaFile, boolean requireFile,
            boolean requireDirectory) throws IOException {
        if (!fsaFile.exists()) {
            throw new IOException("File does not exist: "
                + fsaFile.getAbsolutePath());
        }

        if (requireFile) {
            if (!fsaFile.isFile() || !fsaFile.canRead()) {
                throw new IOException("File cannot be read: "
                    + fsaFile.getAbsolutePath());
            }
        }

        if (requireDirectory) {
            if (!fsaFile.isDirectory()) {
                throw new IOException("Not a directory: "
                    + fsaFile.getAbsolutePath());
            }
        }
    }

    /**
     * Close any non-null closeables, ignoring any {@link IOException}
     * thrown in the process (best-effort cleanup).
     */
    public static void close(Closeable... closeables) {
        for (Closeable c : closeables) {
            if (c != null) {
                try {
                    c.close();
                } catch (IOException e) {
                    // too bad
                }
            }
        }
    }

    /**
     * Reads all bytes from an input stream (until EOF).
     */
    public static byte[] readFully(InputStream stream) throws IOException {
        final ByteArrayOutputStream baos = new ByteArrayOutputStream(1024 * 16);
        final byte[] buffer = new byte[1024 * 8];
        int bytesCount;
        while ((bytesCount = stream.read(buffer)) > 0) {
            baos.write(buffer, 0, bytesCount);
        }
        return baos.toByteArray();
    }

    /**
     * Read enough bytes to fill <code>array</code>. If EOF is reached
     * before the array is full, an {@link EOFException} is thrown.
     */
    // NOTE(review): if read() ever returns 0 before the array is full, the
    // loop exits silently without filling the array — confirm callers only
    // pass blocking streams.
    public static void readFully(InputStream in, byte[] array)
        throws IOException {
        int offset = 0;
        int cnt;
        while ((cnt = in.read(array, offset, array.length - offset)) > 0) {
            offset += cnt;
            if (offset == array.length)
                break;
        }
        if (cnt < 0)
            throw new EOFException();
    }

    /**
     * Read exactly 4 bytes from the input stream (big-endian).
     */
    public static int readInt(InputStream in) throws IOException {
        int v = 0;
        for (int i = 0; i < 4; i++) {
            v = (v << 8) | (readByte(in) & 0xff);
        }
        return v;
    }

    /**
     * Write exactly 4 bytes to the output stream (big-endian).
     */
    public static void writeInt(OutputStream os, int v) throws IOException {
        os.write( v >>> 24);
        os.write((v >>> 16) & 0xff);
        os.write((v >>>  8) & 0xff);
        os.write( v         & 0xff);
    }

    /**
     * Read exactly 2 bytes from the input stream (big-endian).
     */
    public static short readShort(InputStream in) throws IOException {
        return (short) (readByte(in) << 8 |
                        readByte(in) & 0xff);
    }

    /**
     * Read exactly one byte from the input stream.
     *
     * @throws EOFException if EOF is reached.
     */
    public static byte readByte(InputStream in) throws IOException {
        int b = in.read();
        if (b == -1)
            throw new EOFException();
        return (byte) b;
    }

    /**
     * Write exactly 2 bytes to the output stream (big-endian).
     */
    public static void writeShort(OutputStream os, short v) throws IOException {
        os.write((v >>> 8) & 0xff);
        os.write( v        & 0xff);
    }
} morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/main/java/morfologik/util/ResourceUtils.java 0000664 0000000 0000000 00000003032 12344607157 0032613 0 ustar 00root root 0000000 0000000 package morfologik.util;
import java.io.*;
import java.net.*;
/**
 * Resource management utilities.
 */
public final class ResourceUtils {
    /**
     * No instances.
     */
    private ResourceUtils() {
        // Static utility methods only.
    }

    /**
     * Returns an input stream to the resource.
     *
     * @param resource
     *            The path leading to the resource. Can be an URL, a path
     *            leading to a class resource or a {@link File}.
     *
     * @return InputStream instance.
     * @throws IOException
     *             If the resource could not be found or opened.
     */
    public static InputStream openInputStream(String resource)
        throws IOException {
        // 1) Try the resource as a URL first.
        try {
            return new URL(resource).openStream();
        } catch (MalformedURLException e) {
            // Not a URL; fall through to class loader paths.
        }

        // 2) Try the class loaders, in order of preference.
        final InputStream fromLoaders = streamFromClassLoaders(resource);
        if (fromLoaders != null) {
            return fromLoaders;
        }

        // 3) Try a plain file system path.
        final File f = new File(resource);
        if (f.exists() && f.isFile() && f.canRead()) {
            return new FileInputStream(f);
        }

        throw new IOException("Could not locate resource: " + resource);
    }

    /**
     * Looks the resource up via the context class loader, this class's
     * loader, and the system loader; returns null if none can find it.
     */
    private static InputStream streamFromClassLoaders(String resource) {
        final ClassLoader contextLoader = Thread.currentThread().getContextClassLoader();
        InputStream is;
        if (contextLoader != null) {
            is = contextLoader.getResourceAsStream(resource);
            if (is != null) {
                return is;
            }
        }
        is = ResourceUtils.class.getResourceAsStream(resource);
        if (is != null) {
            return is;
        }
        return ClassLoader.getSystemResourceAsStream(resource);
    }
}
morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/test/ 0000775 0000000 0000000 00000000000 12344607157 0023127 5 ustar 00root root 0000000 0000000 morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/test/java/ 0000775 0000000 0000000 00000000000 12344607157 0024050 5 ustar 00root root 0000000 0000000 morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/test/java/morfologik/ 0000775 0000000 0000000 00000000000 12344607157 0026220 5 ustar 00root root 0000000 0000000 morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/test/java/morfologik/fsa/ 0000775 0000000 0000000 00000000000 12344607157 0026771 5 ustar 00root root 0000000 0000000 morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/test/java/morfologik/fsa/CFSA2SerializerTest.java 0000664 0000000 0000000 00000001213 12344607157 0033321 0 ustar 00root root 0000000 0000000 package morfologik.fsa;
import static org.junit.Assert.*;
import org.junit.Test;
/**
 * Serializer tests specialized for the CFSA2 automaton format.
 */
public class CFSA2SerializerTest extends SerializerTestBase {
    /** Supplies the CFSA2 serializer to the shared test base. */
    protected CFSA2Serializer createSerializer() {
        return new CFSA2Serializer();
    }

    @Test
    public void testVIntCoding() {
        // Round-trip representative values (including boundary values)
        // through the variable-length integer coder.
        byte [] scratch = new byte [5];

        int [] values = {0, 1, 128, 256, 0x1000, Integer.MAX_VALUE };
        for (int v : values) {
            int len = CFSA2.writeVInt(scratch, 0, v);
            assertEquals(v, CFSA2.readVInt(scratch, 0));
            assertEquals(len, CFSA2.vIntLength(v));
        }
    }
}
morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/test/java/morfologik/fsa/FSA5SerializerTest.java 0000664 0000000 0000000 00000000310 12344607157 0033216 0 ustar 00root root 0000000 0000000 package morfologik.fsa;
/**
 * Serializer tests specialized for the FSA5 automaton format.
 */
public class FSA5SerializerTest extends SerializerTestBase {
    /** Supplies the FSA5 serializer to the shared test base. */
    protected FSA5Serializer createSerializer() {
        return new FSA5Serializer();
    }
}
morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/test/java/morfologik/fsa/FSA5Test.java 0000664 0000000 0000000 00000006253 12344607157 0031200 0 ustar 00root root 0000000 0000000 package morfologik.fsa;
import static morfologik.fsa.FSAFlags.NEXTBIT;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.junit.Test;
/**
 * Additional tests for {@link FSA5}.
 */
public final class FSA5Test {
    // Sorted contents expected in the abc.fsa / abc-numbers.fsa resources.
    public ArrayList<String> expected = new ArrayList<String>(Arrays.asList(
        "a", "aba", "ac", "b", "ba", "c"));

    @Test
    public void testVersion5() throws IOException {
        final FSA fsa = FSA.read(this.getClass().getResourceAsStream("abc.fsa"));
        assertFalse(fsa.getFlags().contains(FSAFlags.NUMBERS));
        verifyContent(expected, fsa);
    }

    @Test
    public void testVersion5WithNumbers() throws IOException {
        final FSA fsa = FSA.read(this.getClass().getResourceAsStream("abc-numbers.fsa"));
        verifyContent(expected, fsa);
        assertTrue(fsa.getFlags().contains(FSAFlags.NUMBERS));
    }

    @Test
    public void testArcsAndNodes() throws IOException {
        // Both variants encode the same language; the NUMBERS option must not
        // change the number of nodes and arcs.
        final FSA fsa1 = FSA.read(this.getClass().getResourceAsStream(
            "abc.fsa"));
        final FSA fsa2 = FSA.read(this.getClass().getResourceAsStream(
            "abc-numbers.fsa"));

        FSAInfo info1 = new FSAInfo(fsa1);
        FSAInfo info2 = new FSAInfo(fsa2);

        assertEquals(info1.arcsCount, info2.arcsCount);
        assertEquals(info1.nodeCount, info2.nodeCount);

        assertEquals(4, info2.nodeCount);
        assertEquals(7, info2.arcsCount);
    }

    @Test
    public void testNumbers() throws IOException {
        final FSA5 fsa = FSA.read(this.getClass().getResourceAsStream("abc-numbers.fsa"));

        assertTrue(fsa.getFlags().contains(NEXTBIT));

        // Get all numbers for nodes.
        byte[] buffer = new byte[128];
        final ArrayList<String> result = new ArrayList<String>();
        walkNode(buffer, 0, fsa, fsa.getRootNode(), 0, result);

        Collections.sort(result);
        assertEquals(Arrays
            .asList("0 c", "1 b", "2 ba", "3 a", "4 ac", "5 aba"), result);
    }

    /**
     * Walks the automaton collecting each sequence prefixed with its
     * perfect-hash ordinal.
     */
    public static void walkNode(byte[] buffer, int depth, FSA fsa, int node,
        int cnt, List<String> result) throws IOException {
        for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
            buffer[depth] = fsa.getArcLabel(arc);

            if (fsa.isArcFinal(arc) || fsa.isArcTerminal(arc)) {
                result.add(cnt + " " + new String(buffer, 0, depth + 1, "UTF-8"));
            }

            if (fsa.isArcFinal(arc))
                cnt++;

            if (!fsa.isArcTerminal(arc)) {
                walkNode(buffer, depth + 1, fsa, fsa.getEndNode(arc), cnt, result);
                cnt += fsa.getRightLanguageCount(fsa.getEndNode(arc));
            }
        }
    }

    /**
     * Verifies the automaton's sequences exactly match the expected list.
     */
    private static void verifyContent(List<String> expected, FSA fsa) throws IOException {
        final ArrayList<String> actual = new ArrayList<String>();

        int count = 0;
        for (ByteBuffer bb : fsa.getSequences()) {
            assertEquals(0, bb.arrayOffset());
            assertEquals(0, bb.position());
            actual.add(new String(bb.array(), 0, bb.remaining(), "UTF-8"));
            count++;
        }
        assertEquals(expected.size(), count);

        Collections.sort(actual);
        assertEquals(expected, actual);
    }
}
morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/test/java/morfologik/fsa/FSABuilderTest.java 0000664 0000000 0000000 00000005201 12344607157 0032412 0 ustar 00root root 0000000 0000000 package morfologik.fsa;
import static morfologik.fsa.FSATestUtils.*;
import static org.junit.Assert.assertEquals;
import java.io.IOException;
import java.util.Arrays;
import morfologik.fsa.FSA;
import morfologik.fsa.FSABuilder;
import morfologik.util.MinMax;
import org.junit.BeforeClass;
import org.junit.Test;
public class FSABuilderTest {
    // Random inputs shared by the bulk tests (prepared once per class).
    private static byte[][] input;
    private static byte[][] input2;

    @BeforeClass
    public static void prepareByteInput() {
        input = generateRandom(25000, new MinMax(1, 20), new MinMax(0, 255));
        input2 = generateRandom(40, new MinMax(1, 20), new MinMax(0, 3));
    }

    /**
     * Building from an empty input must yield an automaton with an empty language.
     */
    @Test
    public void testEmptyInput() {
        byte[][] input = {};
        checkCorrect(input, FSABuilder.build(input));
    }

    /**
     * Regression test for a hash-resize bug in the builder.
     */
    @Test
    public void testHashResizeBug() throws Exception {
        byte[][] input = {
            {0, 1 },
            {0, 2 },
            {1, 1 },
            {2, 1 },
        };

        FSA fsa = FSABuilder.build(input);
        checkCorrect(input, FSABuilder.build(input));
        checkMinimal(fsa);
    }

    /**
     * A small sanity-check input.
     */
    @Test
    public void testSmallInput() throws Exception {
        byte[][] input = {
            "abc".getBytes("UTF-8"),
            "bbc".getBytes("UTF-8"),
            "d".getBytes("UTF-8"),
        };
        checkCorrect(input, FSABuilder.build(input));
    }

    /**
     * Verify absolute byte-value ordering in the comparators and serialized automaton.
     */
    @Test
    public void testLexicographicOrder() throws IOException {
        byte[][] input = {
            {0},
            {1},
            {(byte) 0xff},
        };
        Arrays.sort(input, FSABuilder.LEXICAL_ORDERING);

        // Check if lexical ordering is consistent with absolute byte value.
        assertEquals(0, input[0][0]);
        assertEquals(1, input[1][0]);
        assertEquals((byte) 0xff, input[2][0]);

        final FSA fsa;
        checkCorrect(input, fsa = FSABuilder.build(input));

        // Arcs of the root must come out in the same (unsigned) byte order.
        int arc = fsa.getFirstArc(fsa.getRootNode());
        assertEquals(0, fsa.getArcLabel(arc));
        arc = fsa.getNextArc(arc);
        assertEquals(1, fsa.getArcLabel(arc));
        arc = fsa.getNextArc(arc);
        assertEquals((byte) 0xff, fsa.getArcLabel(arc));
    }

    /**
     * Bulk correctness and minimality over a large alphabet.
     */
    @Test
    public void testRandom25000_largerAlphabet() {
        FSA fsa = FSABuilder.build(input);
        checkCorrect(input, fsa);
        checkMinimal(fsa);
    }

    /**
     * Bulk correctness and minimality over a small alphabet (more sharing).
     */
    @Test
    public void testRandom25000_smallAlphabet() throws IOException {
        FSA fsa = FSABuilder.build(input2);
        checkCorrect(input2, fsa);
        checkMinimal(fsa);
    }
}
morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/test/java/morfologik/fsa/FSATestUtils.java 0000664 0000000 0000000 00000014312 12344607157 0032127 0 ustar 00root root 0000000 0000000 package morfologik.fsa;
import java.nio.ByteBuffer;
import java.util.*;
import morfologik.util.BufferUtils;
import morfologik.util.MinMax;
import org.junit.Assert;
/**
 * Shared helper assertions for FSA tests.
 */
public class FSATestUtils {
    /**
     * Generate a sorted list of random sequences.
     */
    public static byte[][] generateRandom(int count, MinMax length,
        MinMax alphabet) {
        final byte[][] input = new byte[count][];
        final Random rnd = new Random(0x11223344);
        for (int i = 0; i < count; i++) {
            input[i] = randomByteSequence(rnd, length, alphabet);
        }
        Arrays.sort(input, FSABuilder.LEXICAL_ORDERING);
        return input;
    }

    /**
     * Generate a random string.
     */
    private static byte[] randomByteSequence(Random rnd, MinMax length,
        MinMax alphabet) {
        byte[] bytes = new byte[length.min + rnd.nextInt(length.range())];
        for (int i = 0; i < bytes.length; i++) {
            bytes[i] = (byte) (alphabet.min + rnd.nextInt(alphabet.range()));
        }
        return bytes;
    }

    /**
     * Check if the DFSA is correct with respect to the given input.
     */
    public static void checkCorrect(byte[][] input, FSA fsa) {
        // (1) All input sequences are in the right language.
        HashSet<ByteBuffer> rl = new HashSet<ByteBuffer>();
        for (ByteBuffer bb : fsa) {
            rl.add(ByteBuffer.wrap(Arrays.copyOf(bb.array(), bb.remaining())));
        }

        HashSet<ByteBuffer> uniqueInput = new HashSet<ByteBuffer>();
        for (byte[] sequence : input) {
            uniqueInput.add(ByteBuffer.wrap(sequence));
        }

        for (ByteBuffer sequence : uniqueInput) {
            Assert.assertTrue("Not present in the right language: "
                + BufferUtils.toString(sequence), rl.remove(sequence));
        }

        // (2) No other sequence _other_ than the input is in the right language.
        Assert.assertEquals(0, rl.size());
    }

    /**
     * Check if the DFSA reachable from a given state is minimal. This means no
     * two states have the same right language.
     */
    public static void checkMinimal(final FSA fsa) {
        // Serialized right language -> state that produced it.
        final HashMap<String, Integer> stateLanguages = new HashMap<String, Integer>();

        fsa.visitInPostOrder(new StateVisitor() {
            private StringBuilder b = new StringBuilder();

            public boolean accept(int state) {
                List<byte[]> rightLanguage = allSequences(fsa, state);
                Collections.sort(rightLanguage, FSABuilder.LEXICAL_ORDERING);

                b.setLength(0);
                for (byte[] seq : rightLanguage) {
                    b.append(Arrays.toString(seq));
                    b.append(',');
                }

                String full = b.toString();
                Assert.assertFalse("State exists: " + state + " "
                    + full + " " + stateLanguages.get(full), stateLanguages.containsKey(full));
                stateLanguages.put(full, state);

                return true;
            }
        });
    }

    /**
     * Collects every sequence reachable from the given state.
     */
    static List<byte[]> allSequences(FSA fsa, int state) {
        ArrayList<byte[]> seq = new ArrayList<byte[]>();
        for (ByteBuffer bb : fsa.getSequences(state)) {
            seq.add(Arrays.copyOf(bb.array(), bb.remaining()));
        }
        return seq;
    }

    /**
     * Check if two FSAs are identical.
     */
    public static void checkIdentical(FSA fsa1, FSA fsa2) {
        ArrayDeque<String> fromRoot = new ArrayDeque<String>();
        checkIdentical(fromRoot,
            fsa1, fsa1.getRootNode(), new BitSet(),
            fsa2, fsa2.getRootNode(), new BitSet());
    }

    /**
     * Recursive node-by-node comparison; fromRoot tracks the label path for
     * diagnostic messages.
     */
    static void checkIdentical(ArrayDeque<String> fromRoot,
        FSA fsa1, int node1, BitSet visited1,
        FSA fsa2, int node2, BitSet visited2) {
        int arc1 = fsa1.getFirstArc(node1);
        int arc2 = fsa2.getFirstArc(node2);

        if (visited1.get(node1) != visited2.get(node2)) {
            throw new RuntimeException("Two nodes should either be visited or not visited: "
                + Arrays.toString(fromRoot.toArray()) + " "
                + " node1: " + node1 + " "
                + " node2: " + node2);
        }
        visited1.set(node1);
        visited2.set(node2);

        // Collect and compare the label sets of both nodes.
        TreeSet<Character> labels1 = new TreeSet<Character>();
        TreeSet<Character> labels2 = new TreeSet<Character>();
        while (true) {
            labels1.add((char) fsa1.getArcLabel(arc1));
            labels2.add((char) fsa2.getArcLabel(arc2));

            arc1 = fsa1.getNextArc(arc1);
            arc2 = fsa2.getNextArc(arc2);

            if (arc1 == 0 || arc2 == 0) {
                if (arc1 != arc2) {
                    throw new RuntimeException("Different number of labels at path: "
                        + Arrays.toString(fromRoot.toArray()));
                }
                break;
            }
        }

        if (!labels1.equals(labels2)) {
            throw new RuntimeException("Different sets of labels at path: "
                + Arrays.toString(fromRoot.toArray()) + ":\n"
                + labels1 + "\n" + labels2);
        }

        // recurse.
        for (char chr : labels1) {
            byte label = (byte) chr;
            fromRoot.push(Character.isLetterOrDigit(chr) ? Character.toString(chr) : Integer.toString(chr));

            arc1 = fsa1.getArc(node1, label);
            arc2 = fsa2.getArc(node2, label);

            if (fsa1.isArcFinal(arc1) != fsa2.isArcFinal(arc2)) {
                throw new RuntimeException("Different final flag on arcs at: "
                    + Arrays.toString(fromRoot.toArray()) + ", label: " + label);
            }

            if (fsa1.isArcTerminal(arc1) != fsa2.isArcTerminal(arc2)) {
                throw new RuntimeException("Different terminal flag on arcs at: "
                    + Arrays.toString(fromRoot.toArray()) + ", label: " + label);
            }

            if (!fsa1.isArcTerminal(arc1)) {
                checkIdentical(fromRoot,
                    fsa1, fsa1.getEndNode(arc1), visited1,
                    fsa2, fsa2.getEndNode(arc2), visited2);
            }

            fromRoot.pop();
        }
    }
}
morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/test/java/morfologik/fsa/FSATraversalTest.java 0000664 0000000 0000000 00000010013 12344607157 0032764 0 ustar 00root root 0000000 0000000 package morfologik.fsa;
import static org.junit.Assert.*;
import static morfologik.fsa.MatchResult.*;
import java.io.*;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.HashSet;
import org.junit.Before;
import org.junit.Test;
/**
* Tests {@link FSATraversal}.
*/
public final class FSATraversalTest {
private FSA fsa;
/**
*
*/
@Before
public void setUp() throws Exception {
fsa = FSA.read(this.getClass().getResourceAsStream("en_tst.dict"));
}
/**
*
*/
@Test
public void testTraversalWithIterable() {
int count = 0;
for (ByteBuffer bb : fsa.getSequences()) {
assertEquals(0, bb.arrayOffset());
assertEquals(0, bb.position());
count++;
}
assertEquals(346773, count);
}
/**
*
*/
@Test
public void testPerfectHash() throws IOException {
byte[][] input = new byte[][] {
{ 'a' },
{ 'a', 'b', 'a' },
{ 'a', 'c' },
{ 'b' },
{ 'b', 'a' },
{ 'c' },
};
Arrays.sort(input, FSABuilder.LEXICAL_ORDERING);
FSA s = FSABuilder.build(input);
final byte[] fsaData =
new FSA5Serializer()
.withNumbers()
.serialize(s, new ByteArrayOutputStream())
.toByteArray();
final FSA5 fsa = (FSA5) FSA.read(new ByteArrayInputStream(fsaData));
final FSATraversal traversal = new FSATraversal(fsa);
int i = 0;
for (byte [] seq : input)
{
assertEquals(new String(seq), i++, traversal.perfectHash(seq));
}
// Check if the total number of sequences is encoded at the root node.
assertEquals(6, fsa.getRightLanguageCount(fsa.getRootNode()));
// Check sub/super sequence scenarios.
assertEquals(AUTOMATON_HAS_PREFIX, traversal.perfectHash("abax".getBytes("UTF-8")));
assertEquals(SEQUENCE_IS_A_PREFIX, traversal.perfectHash("ab".getBytes("UTF-8")));
assertEquals(NO_MATCH, traversal.perfectHash("d".getBytes("UTF-8")));
assertEquals(NO_MATCH, traversal.perfectHash(new byte [] {0}));
assertTrue(AUTOMATON_HAS_PREFIX < 0);
assertTrue(SEQUENCE_IS_A_PREFIX < 0);
assertTrue(NO_MATCH < 0);
}
/**
*
*/
@Test
public void testRecursiveTraversal() {
final int[] counter = new int[] { 0 };
class Recursion {
public void dumpNode(final int node) {
int arc = fsa.getFirstArc(node);
do {
if (fsa.isArcFinal(arc)) {
counter[0]++;
}
if (!fsa.isArcTerminal(arc)) {
dumpNode(fsa.getEndNode(arc));
}
arc = fsa.getNextArc(arc);
} while (arc != 0);
}
}
new Recursion().dumpNode(fsa.getRootNode());
assertEquals(346773, counter[0]);
}
/**
 * Test {@link FSATraversal} and matching results: exact matches, inputs
 * that are prefixes of the automaton, inputs the automaton has a prefix
 * of, and outright mismatches.
 */
@Test
public void testMatch() throws IOException {
    final FSA5 fsa = FSA.read(this.getClass().getResourceAsStream("abc.fsa"));
    final FSATraversal traversalHelper = new FSATraversal(fsa);

    // Mismatch after one byte: index points at the offending position and
    // node is the state reached so far.
    MatchResult m = traversalHelper.match("ax".getBytes());
    assertEquals(NO_MATCH, m.kind);
    assertEquals(1, m.index);
    assertEquals(new HashSet<String>(Arrays.asList("ba", "c")),
        suffixes(fsa, m.node));

    // The whole input sequence is in the automaton.
    assertEquals(EXACT_MATCH,
        traversalHelper.match("aba".getBytes()).kind);

    // The automaton contains a proper prefix of the input.
    m = traversalHelper.match("abalonger".getBytes());
    assertEquals(AUTOMATON_HAS_PREFIX, m.kind);
    assertEquals("longer", "abalonger".substring(m.index));

    // The input is a proper prefix of sequences in the automaton.
    m = traversalHelper.match("ab".getBytes());
    assertEquals(SEQUENCE_IS_A_PREFIX, m.kind);
    assertEquals(new HashSet<String>(Arrays.asList("a")),
        suffixes(fsa, m.node));
}
/**
 * Return all sequences reachable from a given node, decoded as UTF-8
 * strings.
 *
 * @param fsa  the automaton to traverse.
 * @param node the node whose right language should be collected.
 * @return the set of suffix strings reachable from {@code node}.
 */
private HashSet<String> suffixes(FSA fsa, int node) {
    HashSet<String> result = new HashSet<String>();
    for (ByteBuffer bb : fsa.getSequences(node))
    {
        try {
            result.add(new String(bb.array(), bb.position(), bb.remaining(), "UTF-8"));
        } catch (UnsupportedEncodingException e) {
            // UTF-8 is mandated by the Java platform, so this cannot happen.
            throw new RuntimeException(e);
        }
    }
    return result;
}
}
morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/test/java/morfologik/fsa/SerializerTestBase.java 0000664 0000000 0000000 00000015741 12344607157 0033410 0 ustar 00root root 0000000 0000000 package morfologik.fsa;
import static morfologik.fsa.FSAFlags.NUMBERS;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.*;
import java.nio.ByteBuffer;
import java.util.*;
import morfologik.util.BufferUtils;
import org.junit.*;
/**
 * Base class for serializer tests: subclasses provide the serializer under
 * test via {@link #createSerializer()}, and every test here verifies that
 * building an automaton, serializing it and reading it back preserves the
 * right language exactly.
 */
public abstract class SerializerTestBase {
    /** Round-trip of a single one-byte sequence. */
    @Test
    public void testA() throws IOException {
        byte[][] input = new byte[][] {
            { 'a' },
        };
        Arrays.sort(input, FSABuilder.LEXICAL_ORDERING);
        FSA s = FSABuilder.build(input);
        checkSerialization(input, s);
    }

    /** Input where suffix arcs can be shared between different states. */
    @Test
    public void testArcsSharing() throws IOException {
        byte[][] input = new byte[][] {
            { 'a', 'c', 'f' },
            { 'a', 'd', 'g' },
            { 'a', 'e', 'h' },
            { 'b', 'd', 'g' },
            { 'b', 'e', 'h' },
        };
        Arrays.sort(input, FSABuilder.LEXICAL_ORDERING);
        FSA s = FSABuilder.build(input);
        checkSerialization(input, s);
    }

    /** Simple round-trip on a small set of mixed-length sequences. */
    @Test
    public void testFSA5SerializerSimple() throws IOException {
        byte[][] input = new byte[][] {
            { 'a' },
            { 'a', 'b', 'a' },
            { 'a', 'c' },
            { 'b' },
            { 'b', 'a' },
            { 'c' },
        };
        Arrays.sort(input, FSABuilder.LEXICAL_ORDERING);
        FSA s = FSABuilder.build(input);
        checkSerialization(input, s);
    }

    /** An automaton that is not minimal must still round-trip correctly. */
    @Test
    public void testNotMinimal() throws IOException {
        byte[][] input = new byte[][] {
            { 'a', 'b', 'a' },
            { 'b' },
            { 'b', 'a' }
        };
        Arrays.sort(input, FSABuilder.LEXICAL_ORDERING);
        FSA s = FSABuilder.build(input);
        checkSerialization(input, s);
    }

    /** Regression input for a historical FSA5 serializer bug. */
    @Test
    public void testFSA5Bug0() throws IOException {
        checkCorrect(new String[] {
            "3-D+A+JJ",
            "3-D+A+NN",
            "4-F+A+NN",
            "z+A+NN", });
    }

    /** Regression input for a historical FSA5 serializer bug. */
    @Test
    public void testFSA5Bug1() throws IOException {
        checkCorrect(new String[] { "+NP", "n+N", "n+NP", });
    }

    /**
     * Builds an automaton from the given strings (encoded as ISO8859-1)
     * and verifies the serialization round-trip.
     */
    private void checkCorrect(String[] strings) throws IOException {
        byte[][] input = new byte[strings.length][];
        for (int i = 0; i < strings.length; i++) {
            input[i] = strings[i].getBytes("ISO8859-1");
        }
        Arrays.sort(input, FSABuilder.LEXICAL_ORDERING);
        FSA s = FSABuilder.build(input);
        checkSerialization(input, s);
    }

    /** An empty input set must serialize and deserialize cleanly. */
    @Test
    public void testEmptyInput() throws IOException {
        byte[][] input = new byte[][] {};
        FSA s = FSABuilder.build(input);
        checkSerialization(input, s);
    }

    /** Round-trip of the bundled abc.fsa automaton. */
    @Test
    public void test_abc() throws IOException {
        testBuiltIn(FSA.read(FSA5Test.class.getResourceAsStream("abc.fsa")));
    }

    /** Round-trip of the bundled minimal.fsa automaton. */
    @Test
    public void test_minimal() throws IOException {
        testBuiltIn(FSA.read(FSA5Test.class.getResourceAsStream("minimal.fsa")));
    }

    /** Round-trip of the bundled minimal2.fsa automaton. */
    @Test
    public void test_minimal2() throws IOException {
        testBuiltIn(FSA.read(FSA5Test.class.getResourceAsStream("minimal2.fsa")));
    }

    /** Round-trip of the large bundled English test dictionary. */
    @Test
    public void test_en_tst() throws IOException {
        testBuiltIn(FSA.read(FSA5Test.class.getResourceAsStream("en_tst.dict")));
    }

    /**
     * Extracts all sequences from an existing automaton, rebuilds a DFSA
     * from them with {@link FSABuilder}, and verifies both the rebuilt
     * automaton and its serialization round-trip.
     */
    private void testBuiltIn(FSA fsa) throws IOException {
        final ArrayList<byte[]> sequences = new ArrayList<byte[]>();
        for (ByteBuffer bb : fsa) {
            // NOTE(review): copies from the start of the backing array;
            // assumes the iterator yields buffers positioned at offset
            // zero — TODO confirm against the FSA iterator contract.
            sequences.add(Arrays.copyOf(bb.array(), bb.remaining()));
        }
        Collections.sort(sequences, FSABuilder.LEXICAL_ORDERING);

        final byte[][] in = sequences.toArray(new byte[sequences.size()][]);
        FSA root = FSABuilder.build(in);

        // Check if the DFSA is correct first.
        FSATestUtils.checkCorrect(in, root);

        // Check serialization.
        checkSerialization(in, root);
    }

    /**
     * Round-trips the automaton through the serializer under test, both
     * with default options and (if supported) with node numbers enabled.
     */
    private void checkSerialization(byte[][] input, FSA root)
        throws IOException {
        checkSerialization0(createSerializer(), input, root);
        if (createSerializer().getFlags().contains(FSAFlags.NUMBERS)) {
            checkSerialization0(createSerializer().withNumbers(), input, root);
        }
    }

    /** Serializes, deserializes and verifies the right language. */
    private void checkSerialization0(FSASerializer serializer,
        final byte[][] in, FSA root) throws IOException {
        final byte[] fsaData = serializer.serialize(root,
            new ByteArrayOutputStream()).toByteArray();

        FSA fsa = FSA.read(new ByteArrayInputStream(fsaData));
        checkCorrect(in, fsa);
    }

    /**
     * Check if the FSA is correct with respect to the given input: every
     * input sequence must be in the right language, and nothing else.
     */
    protected void checkCorrect(byte[][] input, FSA fsa) {
        // (1) All input sequences are in the right language.
        HashSet<ByteBuffer> rl = new HashSet<ByteBuffer>();
        for (ByteBuffer bb : fsa) {
            byte[] array = bb.array();
            int length = bb.remaining();
            rl.add(ByteBuffer.wrap(Arrays.copyOf(array, length)));
        }

        HashSet<ByteBuffer> uniqueInput = new HashSet<ByteBuffer>();
        for (byte[] sequence : input) {
            uniqueInput.add(ByteBuffer.wrap(sequence));
        }

        for (ByteBuffer sequence : uniqueInput) {
            Assert.assertTrue("Not present in the right language: "
                + BufferUtils.toString(sequence), rl.remove(sequence));
        }

        // (2) No other sequence _other_ than the input is in the right
        // language.
        Assert.assertEquals(0, rl.size());
    }

    /**
     * Serializers supporting {@link FSAFlags#NUMBERS} must write node
     * numbers that enumerate the right language in lexicographic order.
     */
    @Test
    public void testAutomatonWithNodeNumbers() throws IOException {
        // Skip serializers that do not support the NUMBERS flag.
        Assume.assumeTrue(createSerializer().getFlags().contains(FSAFlags.NUMBERS));

        byte[][] input = new byte[][] {
            { 'a' },
            { 'a', 'b', 'a' },
            { 'a', 'c' },
            { 'b' },
            { 'b', 'a' },
            { 'c' }, };
        Arrays.sort(input, FSABuilder.LEXICAL_ORDERING);
        FSA s = FSABuilder.build(input);

        final byte[] fsaData =
            createSerializer()
                .withNumbers()
                .serialize(s, new ByteArrayOutputStream()).toByteArray();

        FSA fsa = FSA.read(new ByteArrayInputStream(fsaData));

        // Ensure we have the NUMBERS flag set.
        assertTrue(fsa.getFlags().contains(NUMBERS));

        // Get all numbers from nodes.
        byte[] buffer = new byte[128];
        final ArrayList<String> result = new ArrayList<String>();
        FSA5Test.walkNode(buffer, 0, fsa, fsa.getRootNode(), 0, result);

        Collections.sort(result);
        assertEquals(
            Arrays.asList("0 a", "1 aba", "2 ac", "3 b", "4 ba", "5 c"),
            result);
    }

    /**
     * @return the serializer under test; every call must return a fresh,
     * independently configurable instance.
     */
    protected abstract FSASerializer createSerializer();
}
morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/test/java/morfologik/util/ 0000775 0000000 0000000 00000000000 12344607157 0027175 5 ustar 00root root 0000000 0000000 morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/test/java/morfologik/util/MinMax.java 0000664 0000000 0000000 00000000546 12344607157 0031236 0 ustar 00root root 0000000 0000000 package morfologik.util;
/**
 * Minimum/maximum and range.
 *
 * Immutable: the two arguments are normalized on construction so that
 * {@link #min} never exceeds {@link #max}, regardless of argument order.
 */
public final class MinMax
{
    public final int min;
    public final int max;

    public MinMax(int min, int max)
    {
        // Normalize explicitly: store the smaller value as min, the
        // larger as max, whichever order the caller passed them in.
        if (min <= max) {
            this.min = min;
            this.max = max;
        } else {
            this.min = max;
            this.max = min;
        }
    }

    /** @return The difference between {@link #max} and {@link #min}. */
    public int range()
    {
        return max - min;
    }
} morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/test/resources/ 0000775 0000000 0000000 00000000000 12344607157 0025141 5 ustar 00root root 0000000 0000000 morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/test/resources/morfologik/ 0000775 0000000 0000000 00000000000 12344607157 0027311 5 ustar 00root root 0000000 0000000 morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/test/resources/morfologik/fsa/ 0000775 0000000 0000000 00000000000 12344607157 0030062 5 ustar 00root root 0000000 0000000 morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/test/resources/morfologik/fsa/abc-numbers.fsa 0000664 0000000 0000000 00000000035 12344607157 0032751 0 ustar 00root root 0000000 0000000 \fsa_+ ^cbacba morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/test/resources/morfologik/fsa/abc.fsa 0000664 0000000 0000000 00000000030 12344607157 0031273 0 ustar 00root root 0000000 0000000 \fsa_+ ^cbqacba morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/test/resources/morfologik/fsa/abc.in 0000664 0000000 0000000 00000000020 12344607157 0031127 0 ustar 00root root 0000000 0000000 a
aba
ac
b
ba
c
morfologik-stemming-1.9.0+dfsg/morfologik-fsa/src/test/resources/morfologik/fsa/en_tst.dict 0000664 0000000 0000000 00004053126 12344607157 0032236 0 ustar 00root root 0000000 0000000 \fsa_+ ^3P4 zPBp =Cl;DQE0:F0GDH/ I@9J4K3L