pax_global_header00006660000000000000000000000064141426306120014511gustar00rootroot0000000000000052 comment=48a67a2831005f49c48ec29a5837640e23e54e6b snowball-2.2.0/000077500000000000000000000000001414263061200133335ustar00rootroot00000000000000snowball-2.2.0/.gitignore000066400000000000000000000006721414263061200153300ustar00rootroot00000000000000*.o /ada/bin/ /ada/obj/ /algorithms.mk /libstemmer/libstemmer.c /libstemmer/libstemmer_utf8.c /libstemmer/mkinc.mak /libstemmer/mkinc_utf8.mak /libstemmer/modules.h /libstemmer/modules_utf8.h /libstemmer.a /snowball /src_c /stemtest /stemwords /dist /java/org/tartarus/snowball/ext/ /js_out /python_check /python_out *.generated.cs /rust/Cargo.lock /rust/src/snowball/algorithms/*.rs /rust/target/ /go/algorithms/ /go/stemwords/algorithms.go snowball-2.2.0/.travis.yml000066400000000000000000000074271414263061200154560ustar00rootroot00000000000000language: minimal arch: arm64 dist: xenial env: global: MAKE=make matrix: include: - name: "C distribution build" language: c dist: focal compiler: gcc env: CFLAGS_DIST_BUILD='-O2 -Wall -W -std=c90 -Wdeclaration-after-statement -Werror' - name: "C distribution build (clang)" language: c dist: focal compiler: clang env: CFLAGS_DIST_BUILD='-O2 -Wall -W -std=c90 -Wdeclaration-after-statement -Werror' - language: c compiler: gcc env: c_tests=y CFLAGS='-O2 -Wall -W -std=c99 -Werror' - language: c compiler: clang env: c_tests=y CFLAGS='-O2 -Wall -W -std=c99 -Werror' - language: java env: JAVA=java JAVAC=javac - language: go go: "1.8" env: GO=go - language: go dist: bionic go: "1.17" env: GO=go - language: node_js node_js: "node" env: NODE=node - language: rust rust: - stable - beta dist: bionic env: RUST=rust - language: csharp arch: amd64 # csharp doesn't seem to work on arm64 env: MCS=mcs dist: bionic - name: Pascal env: FPC=fpc dist: bionic addons: apt: packages: - fpc # The pure Python versions run slowly so we need to thin the testdata # for languages such as Arabic where there's a lot, 
or else the build # hits the travis time limit. With pypy, it's enough faster that we # can run the full tests. - language: python python: "3.9" env: PYTHON=python THIN_FACTOR=10 - language: python python: "3.7" env: PYTHON=python THIN_FACTOR=10 - language: python python: "3.6" env: PYTHON=python THIN_FACTOR=10 - language: python python: "pypy3.7-7.3.5" env: PYTHON=python dist: bionic - name: "Ada" env: gprbuild=gprbuild dist: bionic addons: apt: packages: - gnat - gprbuild - os: windows language: c env: c_tests=y MAKE=mingw32-make - os: windows language: go env: GO=go MAKE=mingw32-make before_install: # Try to check out a branch of the same name from the snowball-data repo # sibling of this snowball repo, so that PRs requiring changes to both can be # CI tested easily. # # If that fails, just use the standard snowball-data repo's default branch. - GH_BRANCH=${TRAVIS_PULL_REQUEST_BRANCH:-$TRAVIS_BRANCH} - GH_REPO_SLUG=${TRAVIS_PULL_REQUEST_SLUG:-$TRAVIS_REPO_SLUG} - GH_REPO_URL=https://github.com/${GH_REPO_SLUG%%/*}/snowball-data.git - echo "Trying branch $GH_BRANCH from $GH_REPO_URL" - git clone --depth=1 -b "$GH_BRANCH" "$GH_REPO_URL" || git clone --depth=1 https://github.com/snowballstem/snowball-data.git script: # Ensure CC is set for building the compiler in non-C builds. 
- test -n "$CC" || export CC=gcc - $MAKE CC="$CC" - test -z "$CFLAGS_DIST_BUILD" || { pip install setuptools && $MAKE dist && mkdir tmp && cd tmp && tar xf ../dist/libstemmer_c-*.tar.gz && cd libstemmer_c-* && $MAKE CFLAGS="$CFLAGS_DIST_BUILD" ; } - test -z "$c_tests" || $MAKE check CC="$CC" STEMMING_DATA=snowball-data - test -z "$PYTHON" || $MAKE check_python python="$PYTHON" STEMMING_DATA=snowball-data - test -z "$JAVA" -o -z "$JAVAC" || $MAKE check_java STEMMING_DATA=snowball-data - test -z "$MCS" || $MAKE check_csharp MCS="$MCS" STEMMING_DATA=snowball-data - test -z "$NODE" || $MAKE check_js STEMMING_DATA=snowball-data - test -z "$RUST" || $MAKE check_rust STEMMING_DATA=snowball-data - test -z "$RUST" || $MAKE check_rust STEMMING_DATA=snowball-data - test -z "$GO" || $MAKE check_go STEMMING_DATA=snowball-data - test -z "$FPC" || $MAKE check_pascal STEMMING_DATA=snowball-data - test -z "$gprbuild" || $MAKE check_ada STEMMING_DATA=snowball-data snowball-2.2.0/AUTHORS000066400000000000000000000013171414263061200144050ustar00rootroot00000000000000Authors ======= Martin Porter ------------- - Designed the snowball language. - Implemented the snowball to C compiler. - Implemented the stemming algorithms in C. - Wrote the documentation. Richard Boulton --------------- - Implemented Java backend of the snowball compiler. - Developed build system. - Assisted with website maintenance. Assistance from --------------- Olivier Bornet - fixes to java packaging and build system. Andreas Jung - useful bug reports on the libstemmer library. Olly Betts - several patches, bug reports, and performance improvements. Sebastiano Vigna and Oerd Cukalla - patches for the Java stemming algorithms. Ralf Junker - fix a potential memory leak in sb_stemmer_new(). snowball-2.2.0/CONTRIBUTING.rst000066400000000000000000000224271414263061200160030ustar00rootroot00000000000000Adding a new stemming algorithm =============================== This needs PRs against three repositories. 
Name the branch the same for at least `snowball` and `snowball-data`, push to the latter repo first, and the CI should use your new vocabulary list when running the testsuite. Some points to note about algorithm implementations: * Avoid literal non-ASCII characters in snowball string literals - they will work OK for languages that use UTF-8, but not wide-character Unicode or other encodings. Instead use ``stringdef`` like the existing stemmers do, and please use the newer `U+` notation rather than the older ``hex`` or ``decimal`` as this allows us to support different encodings without having to modify the source files - for example:: stringdef o" {U+00F6} define foo 'o{o"}' not:: stringdef o" hex F6 define foo 'o{o"}' and definitely not:: define foo 'oö' It's helpful to consistently use the same ``stringdef`` codes across the different stemmers - the website has `guidance on what to use `_ and a `list of stringdef lines for common characters to cut and paste from `_. snowball repo ------------- Add `.sbl` source to algorithms subdirectory. Add entry to `libstemmer/modules.txt`, maintaining the current sorted order by the first column. The columns are: * Algorithm name (needs to match the `.sbl` source without extension) * Encodings to support. Wide-character Unicode is always supported and doesn't need to be listed here. You should always include `UTF_8`, and also `ISO_8859_1` if the stemmer only uses characters from that and the language can be usefully written using it. We currently also have support for `ISO_8859_2` and `KOI8_R`, but other single-byte character sets can be supported quite easily if they are useful. * Names and ISO-639 codes for the language. Wikipedia has a handy list of `all the ISO-639 codes `_ - find the row for your new language and include the codes from the "639-1", "639-2/T" and (if different) "639-2/B" columns. For example, for the `Afar` language you'd put `afar,aa,aar` here. 
snowball-data repo ------------------ Add subdirectory named after new stemmer containing: * voc.txt - word list * output.txt - stemmed equivalents * COPYING - licensing details (word lists need to be under an OSI-approved licence) If you don't have access to a suitably licensed word list of a suitable size, you may be able to use the `wikipedia-most-common-words` script to generate one by extracting the most frequent words from a Wikipedia dump in the language the stemmer is for. You need to specify the Unicode "script" (that's "script" in the sense of alphabet) to use - you can find the appropriate one by looking in the Unicode `Scripts.txt`:: https://www.unicode.org/Public/13.0.0/ucd/Scripts.txt The script name is the second column, between `;` and `#`. The first entries are all "Common" which isn't what you want - scroll down to get to the entries that are useful here. You also need to specify the minimum frequency to select. Picking this value will probably need some experimentation as the appropriate threshold depends on how much data there is in the wikipedia dump for a particular language, as well as the size of the vocabulary for the language, and how inflected the language is. Try counting the number of unique words extracted (`wc -l voc.txt` on Unix) and also looking through the list - some proper nouns, words from other languages, typos, etc are OK (since the stemmer will encounter all these in practice too), but at some point "more" stops being "better". snowball-website repo --------------------- Create subdirectory of `algorithms/` named after the language. Create `stemmer.tt` which describes the stemming algorithm. This is a "template toolkit" template which is essentially a mix of HTML and some macros for adding the navigation, sample vocabulary, etc. See the existing `stemmer.tt` files for other algorithms for inspiration. 
If it is based on an academic paper, cite the paper and describe any difference between your implementation and that described in the paper (for example, sometimes papers have ambiguities that need resolving to re-implement the algorithm described). If you have a stopword list, add that as `stop.txt` in your new subdirectory. The `generate` script checks if such a file exists and if it does a link to it is automatically added. Link to your new `stemmer.tt` from `algorithms/index.tt`. Add a news entry to `index.tt`. Add the new stemmer to the online demo. Assuming you have checkouts of the `snowball`, `snowball-data` and `snowball-website` repos in sibling directories: * run `make check_js` in the `snowball` repo * run `./update-js` * add the new stemmer to git with: `git add js/*-stemmer.js` * if the new language is written right-to-left (RTL) then add it to the check in `demo.tt` (search for `rtl` to find the place to change.) * `git commit`. Adding a new programming language generator =========================================== This is a short guide to adding support for generating code for another programming language. Is a new generator the right solution? -------------------------------------- Adding a new code generator is probably not your only option if you want to use Snowball from another language - most languages have support for writing bindings to a C library, so this is probably another option. Generating code can have advantages. For example, it can be simpler to deploy without C bindings which need to be built for a specific platform. However, it's likely to be significantly more work to implement a new generator than to write bindings to the generated C code, especially as the libstemmer C API is a very small and simple one. 
Generated code can also be slower - currently the Snowball compiler often generates code that assumes an optimising compiler will clean up redundant constructs, which is not a problem for C, and probably not for most compiled languages, but for a language like Python C bindings are much faster than the generated Python code (using pypy helps a lot, but is still slower). See doc/libstemmer_python_README for some timings. That said, the unoptimised generated code has improved over time, and is likely to improve further in the future. Key problems to solve --------------------- * A key problem to solve is how to map the required flow of control in response to Snowball signals. In the generated C code this is mostly done using `goto`. If your language doesn't provide an equivalent to `goto` then you'll need an alternative solution. In Java and JavaScript we use labelled `break` from blocks and loops instead. If your language has an equivalent to this feature, that will probably work. For Python, we currently generate a `try:` ... `raise lab123` ... `except lab123: pass` construct. This works, but doesn't seem ideal. If one of the mechanisms above sounds suitable then take a look at the generator for the respective generated output and generator code. If not, come and talk to us on the snowball-discuss mailing list. * Snowball's division is specified as integer division with semantics matching C - i.e. the result should be truncated (rounded towards zero). Some languages lack a built-in integer division operation, or have one which instead implements rounding towards negative infinity. Existing backends with special handling here which may be useful to look at include JavaScript, Pascal and Python. 
Don't hardcode algorithm names ------------------------------ We want to avoid hard-coded lists of algorithms in the language-specific code that have to be manually updated each time a new algorithm is added, because that adds some extra tedious work for adding a new algorithm, and mechanical updates done by hand tend to miss places that need updating, or code gets copied and pasted from an existing case but not fully updated. All the existing language backends generate any such code at build time, and adding a new algorithm just requires updating `libstemmer/modules.txt`. You can probably copy the approach used for Pascal (script `pascal/generate.pl` works from template `stemwords-template.dpr` which has marked blocks of code that get expanded for each stemming algorithm with a placeholder replaced by the algorithm name). For an alternative approach, see Rust where this is done by `rust/build.rs`. Mechanics of adding a new generator ----------------------------------- Copy an existing `compiler/generator_*.c` for your new language and modify away (`generator.c` has the generator for C, but also some common functions so if you start from this one you'll need to remove those common functions). Please resist reformatting existing C code - there's currently a lot of code repeated in each generator which ought to be pulled out as common code, and if you reformat that just makes that job harder. Add your new source to `COMPILER_SOURCES` in `GNUmakefile`. Add prototypes for the new functions to `compiler/header.h`. Add support to `compiler/driver.c`. Add targets to `GNUmakefile` to run tests for the new language. Hook up automated testing via CI in `.travis.yml`. Add to the list of languages in `README.rst`. 
snowball-2.2.0/COPYING000066400000000000000000000031671414263061200143750ustar00rootroot00000000000000Copyright (c) 2001, Dr Martin Porter Copyright (c) 2004,2005, Richard Boulton Copyright (c) 2013, Yoshiki Shibukawa Copyright (c) 2006,2007,2009,2010,2011,2014-2019, Olly Betts All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the Snowball project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. snowball-2.2.0/GNUmakefile000066400000000000000000000662161414263061200154200ustar00rootroot00000000000000# -*- makefile -*- # After changing this, run `make update_version` to update various sources # which hard-code it. 
SNOWBALL_VERSION = 2.2.0 ifeq ($(OS),Windows_NT) EXEEXT = .exe endif c_src_dir = src_c JAVAC ?= javac JAVA ?= java java_src_main_dir = java/org/tartarus/snowball java_src_dir = $(java_src_main_dir)/ext MONO ?= mono MCS ?= mcs csharp_src_main_dir = csharp/Snowball csharp_src_dir = $(csharp_src_main_dir)/Algorithms csharp_sample_dir = csharp/Stemwords FPC ?= fpc pascal_src_dir = pascal python ?= python3 python_output_dir = python_out python_runtime_dir = snowballstemmer python_sample_dir = sample js_output_dir = js_out js_runtime_dir = javascript js_sample_dir = sample NODE ?= nodejs cargo ?= cargo cargoflags ?= --release rust_src_main_dir = rust/src rust_src_dir = $(rust_src_main_dir)/snowball/algorithms go ?= go goflags ?= stemwords/algorithms.go stemwords/main.go gofmt ?= gofmt go_src_main_dir = go go_src_dir = $(go_src_main_dir)/algorithms gprbuild ?= gprbuild ada_src_main_dir = ada ada_src_dir = $(ada_src_main_dir)/algorithms DIFF = diff ifeq ($(OS),Windows_NT) DIFF = diff --strip-trailing-cr endif ICONV = iconv #ICONV = python ./iconv.py tarball_ext = .tar.gz # algorithms.mk is generated from libstemmer/modules.txt and defines: # * libstemmer_algorithms # * ISO_8859_1_algorithms # * ISO_8859_2_algorithms # * KOI8_R_algorithms include algorithms.mk other_algorithms = german2 kraaij_pohlmann lovins all_algorithms = $(libstemmer_algorithms) $(other_algorithms) COMPILER_SOURCES = compiler/space.c \ compiler/tokeniser.c \ compiler/analyser.c \ compiler/generator.c \ compiler/driver.c \ compiler/generator_csharp.c \ compiler/generator_java.c \ compiler/generator_js.c \ compiler/generator_pascal.c \ compiler/generator_python.c \ compiler/generator_rust.c \ compiler/generator_go.c \ compiler/generator_ada.c COMPILER_HEADERS = compiler/header.h \ compiler/syswords.h \ compiler/syswords2.h RUNTIME_SOURCES = runtime/api.c \ runtime/utilities.c RUNTIME_HEADERS = runtime/api.h \ runtime/header.h JAVARUNTIME_SOURCES = java/org/tartarus/snowball/Among.java \ 
java/org/tartarus/snowball/SnowballProgram.java \ java/org/tartarus/snowball/SnowballStemmer.java \ java/org/tartarus/snowball/TestApp.java CSHARP_RUNTIME_SOURCES = csharp/Snowball/Among.cs \ csharp/Snowball/Stemmer.cs \ csharp/Snowball/AssemblyInfo.cs CSHARP_STEMWORDS_SOURCES = csharp/Stemwords/Program.cs JS_RUNTIME_SOURCES = javascript/base-stemmer.js JS_SAMPLE_SOURCES = javascript/stemwords.js PASCAL_RUNTIME_SOURCES = pascal/SnowballProgram.pas PASCAL_STEMWORDS_SOURCES = pascal/stemwords.dpr PYTHON_RUNTIME_SOURCES = python/snowballstemmer/basestemmer.py \ python/snowballstemmer/among.py PYTHON_SAMPLE_SOURCES = python/testapp.py \ python/stemwords.py PYTHON_PACKAGE_FILES = python/MANIFEST.in \ python/setup.py \ python/setup.cfg LIBSTEMMER_SOURCES = libstemmer/libstemmer.c LIBSTEMMER_UTF8_SOURCES = libstemmer/libstemmer_utf8.c LIBSTEMMER_HEADERS = include/libstemmer.h libstemmer/modules.h libstemmer/modules_utf8.h LIBSTEMMER_EXTRA = libstemmer/modules.txt libstemmer/libstemmer_c.in STEMWORDS_SOURCES = examples/stemwords.c STEMTEST_SOURCES = tests/stemtest.c PYTHON_STEMWORDS_SOURCE = python/stemwords.py COMMON_FILES = COPYING \ NEWS ALL_ALGORITHM_FILES = $(all_algorithms:%=algorithms/%.sbl) C_LIB_SOURCES = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c) \ $(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.c) \ $(ISO_8859_1_algorithms:%=$(c_src_dir)/stem_ISO_8859_1_%.c) \ $(ISO_8859_2_algorithms:%=$(c_src_dir)/stem_ISO_8859_2_%.c) C_LIB_HEADERS = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h) \ $(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.h) \ $(ISO_8859_1_algorithms:%=$(c_src_dir)/stem_ISO_8859_1_%.h) \ $(ISO_8859_2_algorithms:%=$(c_src_dir)/stem_ISO_8859_2_%.h) C_OTHER_SOURCES = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c) C_OTHER_HEADERS = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h) JAVA_SOURCES = $(libstemmer_algorithms:%=$(java_src_dir)/%Stemmer.java) CSHARP_SOURCES = $(libstemmer_algorithms:%=$(csharp_src_dir)/%Stemmer.generated.cs) 
PASCAL_SOURCES = $(ISO_8859_1_algorithms:%=$(pascal_src_dir)/%Stemmer.pas) PYTHON_SOURCES = $(libstemmer_algorithms:%=$(python_output_dir)/%_stemmer.py) \ $(python_output_dir)/__init__.py JS_SOURCES = $(libstemmer_algorithms:%=$(js_output_dir)/%-stemmer.js) RUST_SOURCES = $(libstemmer_algorithms:%=$(rust_src_dir)/%_stemmer.rs) GO_SOURCES = $(libstemmer_algorithms:%=$(go_src_dir)/%_stemmer.go) \ $(go_src_main_dir)/stemwords/algorithms.go ADA_SOURCES = $(libstemmer_algorithms:%=$(ada_src_dir)/stemmer-%.ads) \ $(libstemmer_algorithms:%=$(ada_src_dir)/stemmer-%.adb) \ $(ada_src_dir)/stemmer-factory.ads $(ada_src_dir)/stemmer-factory.adb COMPILER_OBJECTS=$(COMPILER_SOURCES:.c=.o) RUNTIME_OBJECTS=$(RUNTIME_SOURCES:.c=.o) LIBSTEMMER_OBJECTS=$(LIBSTEMMER_SOURCES:.c=.o) LIBSTEMMER_UTF8_OBJECTS=$(LIBSTEMMER_UTF8_SOURCES:.c=.o) STEMWORDS_OBJECTS=$(STEMWORDS_SOURCES:.c=.o) STEMTEST_OBJECTS=$(STEMTEST_SOURCES:.c=.o) C_LIB_OBJECTS = $(C_LIB_SOURCES:.c=.o) C_OTHER_OBJECTS = $(C_OTHER_SOURCES:.c=.o) JAVA_CLASSES = $(JAVA_SOURCES:.java=.class) JAVA_RUNTIME_CLASSES=$(JAVARUNTIME_SOURCES:.java=.class) CFLAGS=-O2 -W -Wall -Wmissing-prototypes -Wmissing-declarations CPPFLAGS= INCLUDES=-Iinclude all: snowball$(EXEEXT) libstemmer.a stemwords$(EXEEXT) $(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS) algorithms.mk: libstemmer/mkalgorithms.pl libstemmer/modules.txt libstemmer/mkalgorithms.pl algorithms.mk libstemmer/modules.txt clean: rm -f $(COMPILER_OBJECTS) $(RUNTIME_OBJECTS) \ $(LIBSTEMMER_OBJECTS) $(LIBSTEMMER_UTF8_OBJECTS) $(STEMWORDS_OBJECTS) snowball$(EXEEXT) \ libstemmer.a stemwords$(EXEEXT) \ libstemmer/modules.h \ libstemmer/modules_utf8.h \ $(C_LIB_SOURCES) $(C_LIB_HEADERS) $(C_LIB_OBJECTS) \ $(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS) \ $(JAVA_SOURCES) $(JAVA_CLASSES) $(JAVA_RUNTIME_CLASSES) \ $(CSHARP_SOURCES) \ $(PASCAL_SOURCES) pascal/stemwords.dpr pascal/stemwords pascal/*.o pascal/*.ppu \ $(PYTHON_SOURCES) \ $(JS_SOURCES) \ $(RUST_SOURCES) \ 
$(ADA_SOURCES) \ libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak \ libstemmer/libstemmer.c libstemmer/libstemmer_utf8.c \ algorithms.mk rm -rf dist -rmdir $(c_src_dir) -rmdir $(python_output_dir) -rmdir $(js_output_dir) snowball$(EXEEXT): $(COMPILER_OBJECTS) $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ $(COMPILER_OBJECTS): $(COMPILER_HEADERS) libstemmer/libstemmer.c: libstemmer/libstemmer_c.in sed 's/@MODULES_H@/modules.h/' $^ >$@ libstemmer/libstemmer_utf8.c: libstemmer/libstemmer_c.in sed 's/@MODULES_H@/modules_utf8.h/' $^ >$@ libstemmer/modules.h libstemmer/mkinc.mak: libstemmer/mkmodules.pl libstemmer/modules.txt libstemmer/mkmodules.pl $@ $(c_src_dir) libstemmer/modules.txt libstemmer/mkinc.mak libstemmer/modules_utf8.h libstemmer/mkinc_utf8.mak: libstemmer/mkmodules.pl libstemmer/modules.txt libstemmer/mkmodules.pl $@ $(c_src_dir) libstemmer/modules.txt libstemmer/mkinc_utf8.mak utf8 libstemmer/libstemmer.o: libstemmer/modules.h $(C_LIB_HEADERS) libstemmer.a: libstemmer/libstemmer.o $(RUNTIME_OBJECTS) $(C_LIB_OBJECTS) $(AR) -cru $@ $^ examples/%.o: examples/%.c $(CC) $(CFLAGS) $(INCLUDES) $(CPPFLAGS) -c -o $@ $< stemwords$(EXEEXT): $(STEMWORDS_OBJECTS) libstemmer.a $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ tests/%.o: tests/%.c $(CC) $(CFLAGS) $(INCLUDES) $(CPPFLAGS) -c -o $@ $< stemtest$(EXEEXT): $(STEMTEST_OBJECTS) libstemmer.a $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ csharp_stemwords$(EXEEXT): $(CSHARP_STEMWORDS_SOURCES) $(CSHARP_RUNTIME_SOURCES) $(CSHARP_SOURCES) $(MCS) -unsafe -target:exe -out:$@ $(CSHARP_STEMWORDS_SOURCES) $(CSHARP_RUNTIME_SOURCES) $(CSHARP_SOURCES) pascal/stemwords.dpr: pascal/stemwords-template.dpr libstemmer/modules.txt pascal/generate.pl $(ISO_8859_1_algorithms) < pascal/stemwords-template.dpr > $@ pascal/stemwords: $(PASCAL_STEMWORDS_SOURCES) $(PASCAL_RUNTIME_SOURCES) $(PASCAL_SOURCES) $(FPC) -o$@ -Mdelphi $(PASCAL_STEMWORDS_SOURCES) $(c_src_dir)/stem_UTF_8_%.c $(c_src_dir)/stem_UTF_8_%.h: algorithms/%.sbl snowball$(EXEEXT) @mkdir -p $(c_src_dir) 
@l=`echo "$<" | sed 's!\(.*\)\.sbl$$!\1!;s!^.*/!!'`; \ o="$(c_src_dir)/stem_UTF_8_$${l}"; \ echo "./snowball $< -o $${o} -eprefix $${l}_UTF_8_ -r ../runtime -u"; \ ./snowball $< -o $${o} -eprefix $${l}_UTF_8_ -r ../runtime -u $(c_src_dir)/stem_KOI8_R_%.c $(c_src_dir)/stem_KOI8_R_%.h: algorithms/%.sbl snowball$(EXEEXT) @mkdir -p $(c_src_dir) @l=`echo "$<" | sed 's!\(.*\)\.sbl$$!\1!;s!^.*/!!'`; \ o="$(c_src_dir)/stem_KOI8_R_$${l}"; \ echo "./snowball charsets/KOI8-R.sbl $< -o $${o} -eprefix $${l}_KOI8_R_ -r ../runtime"; \ ./snowball charsets/KOI8-R.sbl $< -o $${o} -eprefix $${l}_KOI8_R_ -r ../runtime $(c_src_dir)/stem_ISO_8859_1_%.c $(c_src_dir)/stem_ISO_8859_1_%.h: algorithms/%.sbl snowball$(EXEEXT) @mkdir -p $(c_src_dir) @l=`echo "$<" | sed 's!\(.*\)\.sbl$$!\1!;s!^.*/!!'`; \ o="$(c_src_dir)/stem_ISO_8859_1_$${l}"; \ echo "./snowball $< -o $${o} -eprefix $${l}_ISO_8859_1_ -r ../runtime"; \ ./snowball $< -o $${o} -eprefix $${l}_ISO_8859_1_ -r ../runtime $(c_src_dir)/stem_ISO_8859_2_%.c $(c_src_dir)/stem_ISO_8859_2_%.h: algorithms/%.sbl snowball$(EXEEXT) @mkdir -p $(c_src_dir) @l=`echo "$<" | sed 's!\(.*\)\.sbl$$!\1!;s!^.*/!!'`; \ o="$(c_src_dir)/stem_ISO_8859_2_$${l}"; \ echo "./snowball charsets/ISO-8859-2.sbl $< -o $${o} -eprefix $${l}_ISO_8859_2_ -r ../runtime"; \ ./snowball charsets/ISO-8859-2.sbl $< -o $${o} -eprefix $${l}_ISO_8859_2_ -r ../runtime $(c_src_dir)/stem_%.o: $(c_src_dir)/stem_%.c $(c_src_dir)/stem_%.h $(CC) $(CFLAGS) $(INCLUDES) $(CPPFLAGS) -c -o $@ $< $(java_src_dir)/%Stemmer.java: algorithms/%.sbl snowball$(EXEEXT) @mkdir -p $(java_src_dir) @l=`echo "$<" | sed 's!\(.*\)\.sbl$$!\1!;s!^.*/!!'`; \ o="$(java_src_dir)/$${l}Stemmer"; \ echo "./snowball $< -j -o $${o} -p org.tartarus.snowball.SnowballStemmer"; \ ./snowball $< -j -o $${o} -p org.tartarus.snowball.SnowballStemmer $(csharp_src_dir)/%Stemmer.generated.cs: algorithms/%.sbl snowball$(EXEEXT) @mkdir -p $(csharp_src_dir) @l=`echo "$<" | sed 's!\(.*\)\.sbl$$!\1!;s!^.*/!!'`; \ t=`echo "$${l}" | 
sed 's/.*/\L&/; s/[a-z]*/\u&/g'`; \ o="$(csharp_src_dir)/$${l}Stemmer.generated"; \ echo "./snowball $< -cs -o $${o}"; \ ./snowball $< -cs -o $${o} $(pascal_src_dir)/%Stemmer.pas: algorithms/%.sbl snowball$(EXEEXT) @mkdir -p $(pascal_src_dir) @l=`echo "$<" | sed 's!\(.*\)\.sbl$$!\1!;s!^.*/!!'`; \ t=`echo "$${l}" | sed 's/.*/\L&/; s/[a-z]*/\u&/g'`; \ o="$(pascal_src_dir)/$${l}Stemmer"; \ echo "./snowball $< -pascal -o $${o}"; \ ./snowball $< -pascal -o $${o} $(python_output_dir)/%_stemmer.py: algorithms/%.sbl snowball$(EXEEXT) @mkdir -p $(python_output_dir) @l=`echo "$<" | sed 's!\(.*\)\.sbl$$!\1!;s!^.*/!!'`; \ o="$(python_output_dir)/$${l}_stemmer"; \ echo "./snowball $< -py -o $${o}"; \ ./snowball $< -py -o $${o} $(python_output_dir)/__init__.py: libstemmer/modules.txt @mkdir -p $(python_output_dir) $(python) python/create_init.py $(python_output_dir) $(rust_src_dir)/%_stemmer.rs: algorithms/%.sbl snowball$(EXEEXT) @mkdir -p $(rust_src_dir) @l=`echo "$<" | sed 's!\(.*\)\.sbl$$!\1!;s!^.*/!!'`; \ o="$(rust_src_dir)/$${l}_stemmer"; \ echo "./snowball $< -rust -o $${o}"; \ ./snowball $< -rust -o $${o} $(go_src_main_dir)/stemwords/algorithms.go: go/stemwords/generate.go libstemmer/modules.txt @echo "Generating algorithms.go" @cd go/stemwords && go generate $(go_src_dir)/%_stemmer.go: algorithms/%.sbl snowball$(EXEEXT) @l=`echo "$<" | sed 's!\(.*\)\.sbl$$!\1!;s!^.*/!!'`; \ o="$(go_src_dir)/$${l}/$${l}_stemmer"; \ mkdir -p $(go_src_dir)/$${l} @l=`echo "$<" | sed 's!\(.*\)\.sbl$$!\1!;s!^.*/!!'`; \ o="$(go_src_dir)/$${l}/$${l}_stemmer"; \ echo "./snowball $< -go -o $${o} -gop $${l}"; \ ./snowball $< -go -o $${o} -gop $${l} @l=`echo "$<" | sed 's!\(.*\)\.sbl$$!\1!;s!^.*/!!'`; \ o="$(go_src_dir)/$${l}/$${l}_stemmer"; \ echo "$(gofmt) -s -w $(go_src_dir)/$${l}/$${l}_stemmer.go"; \ $(gofmt) -s -w $(go_src_dir)/$${l}/$${l}_stemmer.go $(js_output_dir)/%-stemmer.js: algorithms/%.sbl snowball$(EXEEXT) @mkdir -p $(js_output_dir) @l=`echo "$<" | sed 's!\(.*\)\.sbl$$!\1!;s!^.*/!!'`; 
\ o="$(js_output_dir)/$${l}-stemmer"; \ echo "./snowball $< -js -o $${o}"; \ ./snowball $< -js -o $${o} $(ada_src_dir)/stemmer-%.ads: algorithms/%.sbl snowball @mkdir -p $(ada_src_dir) @l=`echo "$<" | sed 's!\(.*\)\.sbl$$!\1!;s!^.*/!!'`; \ o="$(ada_src_dir)/stemmer-$${l}"; \ echo "./snowball $< -ada -o $${o}"; \ ./snowball $< -ada -P $${l} -o $${o} # Make a full source distribution dist: dist_snowball dist_libstemmer_c dist_libstemmer_csharp dist_libstemmer_java dist_libstemmer_js dist_libstemmer_python # Make a distribution of all the sources involved in snowball dist_snowball: $(COMPILER_SOURCES) $(COMPILER_HEADERS) \ $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \ $(LIBSTEMMER_SOURCES) \ $(LIBSTEMMER_UTF8_SOURCES) \ $(LIBSTEMMER_HEADERS) \ $(LIBSTEMMER_EXTRA) \ $(ALL_ALGORITHM_FILES) $(STEMWORDS_SOURCES) $(STEMTEST_SOURCES) \ $(COMMON_FILES) \ GNUmakefile README.rst doc/TODO libstemmer/mkmodules.pl destname=snowball-$(SNOWBALL_VERSION); \ dest=dist/$${destname}; \ rm -rf $${dest} && \ rm -f $${dest}$(tarball_ext) && \ for file in $^; do \ dir=`dirname $$file` && \ mkdir -p $${dest}/$${dir} && \ cp -a $${file} $${dest}/$${dir} || exit 1 ; \ done && \ (cd dist && tar zcf $${destname}$(tarball_ext) $${destname}) && \ rm -rf $${dest} # Make a distribution of all the sources required to compile the C library. 
# Builds dist/libstemmer_c-$(SNOWBALL_VERSION)$(tarball_ext): stages the
# generated C sources, runtime, libstemmer wrapper and headers in a scratch
# directory, writes a MANIFEST and a minimal standalone Makefile into it,
# tars the directory up and removes the staging copy.
#
# The README is installed as plain `README`, so that is the name recorded in
# MANIFEST (the Java rule below does the same).
#
# The echo'd recipe lines for the generated Makefile must begin with a
# literal tab, otherwise make rejects the generated file.
dist_libstemmer_c: \
            $(RUNTIME_SOURCES) \
            $(RUNTIME_HEADERS) \
            $(LIBSTEMMER_SOURCES) \
            $(LIBSTEMMER_UTF8_SOURCES) \
            $(LIBSTEMMER_HEADERS) \
            $(LIBSTEMMER_EXTRA) \
            $(C_LIB_SOURCES) \
            $(C_LIB_HEADERS) \
            libstemmer/mkinc.mak \
            libstemmer/mkinc_utf8.mak
	destname=libstemmer_c-$(SNOWBALL_VERSION); \
	dest=dist/$${destname}; \
	rm -rf $${dest} && \
	rm -f $${dest}$(tarball_ext) && \
	mkdir -p $${dest} && \
	cp -a doc/libstemmer_c_README $${dest}/README && \
	mkdir -p $${dest}/examples && \
	cp -a examples/stemwords.c $${dest}/examples && \
	mkdir -p $${dest}/$(c_src_dir) && \
	cp -a $(C_LIB_SOURCES) $(C_LIB_HEADERS) $${dest}/$(c_src_dir) && \
	mkdir -p $${dest}/runtime && \
	cp -a $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) $${dest}/runtime && \
	mkdir -p $${dest}/libstemmer && \
	cp -a $(LIBSTEMMER_SOURCES) $(LIBSTEMMER_UTF8_SOURCES) $(LIBSTEMMER_HEADERS) $(LIBSTEMMER_EXTRA) $${dest}/libstemmer && \
	mkdir -p $${dest}/include && \
	mv $${dest}/libstemmer/libstemmer.h $${dest}/include && \
	(cd $${dest} && \
	 echo "README" >> MANIFEST && \
	 ls $(c_src_dir)/*.c $(c_src_dir)/*.h >> MANIFEST && \
	 ls runtime/*.c runtime/*.h >> MANIFEST && \
	 ls libstemmer/*.c libstemmer/*.h >> MANIFEST && \
	 ls include/*.h >> MANIFEST) && \
	cp -a libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak $${dest}/ && \
	cp -a $(COMMON_FILES) $${dest} && \
	echo 'include mkinc.mak' >> $${dest}/Makefile && \
	echo 'ifeq ($$(OS),Windows_NT)' >> $${dest}/Makefile && \
	echo 'EXEEXT=.exe' >> $${dest}/Makefile && \
	echo 'endif' >> $${dest}/Makefile && \
	echo 'CFLAGS=-O2' >> $${dest}/Makefile && \
	echo 'CPPFLAGS=-Iinclude' >> $${dest}/Makefile && \
	echo 'all: libstemmer.a stemwords$$(EXEEXT)' >> $${dest}/Makefile && \
	echo 'libstemmer.a: $$(snowball_sources:.c=.o)' >> $${dest}/Makefile && \
	echo '	$$(AR) -cru $$@ $$^' >> $${dest}/Makefile && \
	echo 'stemwords$$(EXEEXT): examples/stemwords.o libstemmer.a' >> $${dest}/Makefile && \
	echo '	$$(CC) $$(CFLAGS) -o $$@ $$^' >> $${dest}/Makefile && \
	echo 'clean:' >> $${dest}/Makefile && \
	echo '	rm -f stemwords$$(EXEEXT) libstemmer.a *.o $(c_src_dir)/*.o examples/*.o runtime/*.o libstemmer/*.o' >> $${dest}/Makefile && \
	(cd dist && tar zcf $${destname}$(tarball_ext) $${destname}) && \
	rm -rf $${dest}

# Make a distribution of all the sources required to compile the Java library.
dist_libstemmer_java: $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
            $(LIBSTEMMER_EXTRA) \
            $(JAVA_SOURCES)
	destname=libstemmer_java-$(SNOWBALL_VERSION); \
	dest=dist/$${destname}; \
	rm -rf $${dest} && \
	rm -f $${dest}$(tarball_ext) && \
	mkdir -p $${dest} && \
	cp -a doc/libstemmer_java_README $${dest}/README && \
	mkdir -p $${dest}/$(java_src_dir) && \
	cp -a $(JAVA_SOURCES) $${dest}/$(java_src_dir) && \
	mkdir -p $${dest}/$(java_src_main_dir) && \
	cp -a $(JAVARUNTIME_SOURCES) $${dest}/$(java_src_main_dir) && \
	cp -a $(COMMON_FILES) $${dest} && \
	(cd $${dest} && \
	 echo "README" >> MANIFEST && \
	 ls $(java_src_dir)/*.java >> MANIFEST && \
	 ls $(java_src_main_dir)/*.java >> MANIFEST) && \
	(cd dist && tar zcf $${destname}$(tarball_ext) $${destname}) && \
	rm -rf $${dest}

# Make a distribution of all the sources required to compile the C# library.
# The staging directory dist/$(destname) is rebuilt from scratch, filled with
# the C# sources, runtime and sample program plus $(COMMON_FILES), packed into
# dist/$(destname)$(tarball_ext), and then removed again.
dist_libstemmer_csharp: $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
            $(LIBSTEMMER_EXTRA) \
            $(CSHARP_SOURCES)
	destname=libstemmer_csharp-$(SNOWBALL_VERSION); \
	dest=dist/$${destname}; \
	rm -rf $${dest} && \
	rm -f $${dest}$(tarball_ext) && \
	mkdir -p $${dest} && \
	cp -a doc/libstemmer_csharp_README $${dest}/README && \
	mkdir -p $${dest}/$(csharp_src_dir) && \
	cp -a $(CSHARP_SOURCES) $${dest}/$(csharp_src_dir) && \
	mkdir -p $${dest}/$(csharp_src_main_dir) && \
	cp -a $(CSHARP_RUNTIME_SOURCES) $${dest}/$(csharp_src_main_dir) && \
	mkdir -p $${dest}/$(csharp_sample_dir) && \
	cp -a $(CSHARP_STEMWORDS_SOURCES) $${dest}/$(csharp_sample_dir) && \
	cp -a $(COMMON_FILES) $${dest} && \
	(cd dist && tar zcf $${destname}$(tarball_ext) $${destname}) && \
	rm -rf $${dest}

# Build the Python packages.  Unlike the other dist targets no tarball is
# made with tar here: setup.py is run inside the staging directory to build an
# sdist and a wheel, and the resulting dist/*.tar.gz and dist/*.whl files are
# copied up one level before the staging directory is removed.
dist_libstemmer_python: $(PYTHON_SOURCES)
	destname=snowballstemmer-$(SNOWBALL_VERSION); \
	dest=dist/$${destname}; \
	rm -rf $${dest} && \
	rm -f $${dest}$(tarball_ext) && \
	mkdir -p $${dest} && \
	mkdir -p $${dest}/src/$(python_runtime_dir) && \
	mkdir -p $${dest}/src/$(python_sample_dir) && \
	cp libstemmer/modules.txt $${dest} && \
	cp doc/libstemmer_python_README $${dest}/README.rst && \
	cp -a $(PYTHON_SOURCES) $${dest}/src/$(python_runtime_dir) && \
	cp -a $(PYTHON_SAMPLE_SOURCES) $${dest}/src/$(python_sample_dir) && \
	cp -a $(PYTHON_RUNTIME_SOURCES) $${dest}/src/$(python_runtime_dir) && \
	cp -a $(COMMON_FILES) $(PYTHON_PACKAGE_FILES) $${dest} && \
	(cd $${dest} && $(python) setup.py sdist bdist_wheel && cp dist/*.tar.gz dist/*.whl ..) && \
	rm -rf $${dest}

# Make a tarball of the JavaScript stemmers, runtime and sample program.
# MANIFEST is written in one go (">", not ">>") from a single ls listing.
dist_libstemmer_js: $(JS_SOURCES)
	destname=jsstemmer-$(SNOWBALL_VERSION); \
	dest=dist/$${destname}; \
	rm -rf $${dest} && \
	rm -f $${dest}$(tarball_ext) && \
	mkdir -p $${dest} && \
	mkdir -p $${dest}/$(js_runtime_dir) && \
	mkdir -p $${dest}/$(js_sample_dir) && \
	cp -a doc/libstemmer_js_README $${dest}/README.rst && \
	cp -a $(COMMON_FILES) $${dest} && \
	cp -a $(JS_RUNTIME_SOURCES) $${dest}/$(js_runtime_dir) && \
	cp -a $(JS_SAMPLE_SOURCES) $${dest}/$(js_sample_dir) && \
	cp -a $(JS_SOURCES) $${dest}/$(js_runtime_dir) && \
	(cd $${dest} && \
	 ls README.rst $(COMMON_FILES) $(js_runtime_dir)/*.js $(js_sample_dir)/*.js > MANIFEST) && \
	(cd dist && tar zcf $${destname}$(tarball_ext) $${destname}) && \
	rm -rf $${dest}

# Top-level test entry point: the stemtest program plus one group of checks
# per character encoding.
check: check_stemtest check_utf8 check_iso_8859_1 check_iso_8859_2 check_koi8r

check_stemtest: stemtest$(EXEEXT)
	./stemtest

# Each encoding group expands to one pattern target per algorithm in the
# corresponding list, e.g. check_utf8_<algorithm>.
check_utf8: $(libstemmer_algorithms:%=check_utf8_%)

check_iso_8859_1: $(ISO_8859_1_algorithms:%=check_iso_8859_1_%)

check_iso_8859_2: $(ISO_8859_2_algorithms:%=check_iso_8859_2_%)

check_koi8r: $(KOI8_R_algorithms:%=check_koi8r_%)

# Where the data files are located - assumes their repo is checked out as
# a sibling to this one.
STEMMING_DATA ?= ../snowball-data STEMMING_DATA_ABS := $(abspath $(STEMMING_DATA)) check_utf8_%: $(STEMMING_DATA)/% stemwords$(EXEEXT) @echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with UTF-8" @if test -f '$ tmp.txt @$(ICONV) -f UTF-8 -t ISO-8859-1 '$ tmp.in; \ $(cargo) run $(cargoflags) -- -l `echo $<|sed 's!.*/!!'` -i tmp.in -o $(PWD)/tmp.txt; \ rm tmp.in; \ else \ $(cargo) run $(cargoflags) -- -l `echo $<|sed 's!.*/!!'` -i $ tmp.in; \ $(go) run $(goflags) -l `echo $<|sed 's!.*/!!'` -i tmp.in -o $(PWD)/tmp.txt; \ rm tmp.in; \ else \ $(go) run $(goflags) -l `echo $<|sed 's!.*/!!'` -i $ tmp.in; \ $(NODE) javascript/stemwords.js -l `echo $<|sed 's!.*/!!'` -i tmp.in -o tmp.txt; \ rm tmp.in; \ else \ $(NODE) javascript/stemwords.js -l `echo $<|sed 's!.*/!!'` -i $ tmp.in; \ $(python) stemwords.py -c utf8 -l `echo $<|sed 's!.*/!!'` -i tmp.in -o $(PWD)/tmp.txt; \ rm tmp.in; \ else \ $(python) stemwords.py -c utf8 -l `echo $<|sed 's!.*/!!'` -i $ tmp.in; \ ./bin/stemwords `echo $<|sed 's!.*/!!'` tmp.in $(PWD)/tmp.txt; \ rm tmp.in; \ else \ ./bin/stemwords `echo $<|sed 's!.*/!!'` $ 3)` can now be used when previously a temporary variable was required: `$tmp = len $tmp > 3` Code generation improvements ---------------------------- * General: + Avoid unnecessarily saving and restoring of the cursor for more commands - `atlimit`, `do`, `set` and `unset` all leave the cursor alone or always restore its value, and for C `booltest` (which other languages already handled). + Special case handling for `setlimit tomark AE`. All uses of setlimit in the current stemmers we ship follow this pattern, and by special-casing we can avoid having to save and restore the cursor (#74). + Merge duplicate actions in the same `among`. This reduces the size of the switch/if-chain in the generated code which dispatch the among for many of the stemmers. + Generate simpler code for `among`. 
We always check for a zero return value when we call the among, so there's no point also checking for that in the switch/if-chain. We can also avoid the switch/if-chain entirely when there's only one possible outcome (besides the zero return). + Optimise code generated for `do `. This speeds up "make check_python" by about 2%, and should speed up other interpreted languages too (#110). + Generate more and better comments referencing snowball source. + Add homepage URL and compiler version as comments in generated files. * C/C++: + Fix `size` and `sizeof` to not report one too high (reported by Assem Chelli in #32). + If signal `f` from a function call would lead to return from the current function then handle this and bailing out on an error together with a simple `if (ret <= 0) return ret;` + Inline testing for single character literals. + Avoid generating `|| 0` in a corner case - this can result in a compiler warning when building the generated code. + Implement `insert_v()` in terms of `insert_s()`. + Add conditional `extern "C"` so `runtime/api.h` can be included from C++ code. Closes #90, reported by vvarma. * Java: + Fix functions in `among` to work in Java. We seem to need to make the methods called from among `public` instead of `private`, and to call them on `this` instead of the `methodObject` (which is cleaner anyway). No revision in version control seems to generate working code for this case, but Richard says it definitely used to work - possibly older JVMs failed to correctly enforce the access controls when methods were invoked by reflection. + Code after handling `f` by returning from the current function is unreachable too. + Previously we incorrectly decided that code after an `or` was unreachable in certain cases. None of the current stemmers in the distribution triggered this, but Martin Porter's snowball version of the Schinke Latin stemmer does. Fixes #58, reported by Alexander Myltsev. 
+ The reachability logic was failing to consider reachability from the final command in an `or`. Fixes #82, reported by David Corbett. + Fix `maxint` and `minint`. Patch from David Corbett in #31. + Fix `$` on strings. The previous generated code was just wrong. This doesn't affect any of the included algorithms, but for example breaks Martin Porter's snowball implementation of Schinke's Latin Stemmer. Issue noted by Jakob Demler while working on the Rust backend in #51, and reported in the Schinke's Latin Stemmer by Alexander Myltsev in #58. + Make SnowballProgram objects serializable. Patch from Oleg Smirnov in #43. + Eliminate range-check implementation for groupings. This was removed from the C generator 10 years earlier, isn't used for any of the existing algorithms, and it doesn't seem likely it would be - the grouping would have to consist entirely of a contiguous block of Unicode code-points. + Simplify code generated for `repeat` and `atleast`. + Eliminate unused return values and variables from runtime functions. + Only import the `among` and `SnowballProgram` classes if they're actually used. + Only generate `copy_from()` method if it's used. + Merge runtime functions `eq_s` and `eq_v` functions. + Java arrays know their own length so stop storing it separately. + Escape char 127 (DEL) in generated Java code. It's unlikely that this character would actually be used in a real stemmer, so this was more of a theoretical bug. + Drop unused import of InvocationTargetException from SnowballStemmer. Reported by GerritDeMeulder in #72. + Fix lint check issues in generated Java code. The stemmer classes are only referenced in the example app via reflection, so add @SuppressWarnings("unused") for them. The stemmer classes override equals() and hashCode() methods from the standard java Object class, so mark these with @Override. Both suggested by GerritDeMeulder in #72. + Declare Java variables at point of use in generated code. 
Putting all declarations at the top of the function was adding unnecessary complexity to the Java generator code for no benefit. + Improve formatting of generated code. New stemming algorithms ----------------------- * Add Tamil stemmer from Damodharan Rajalingam (#2, #3). * Add Arabic stemmer from Assem Chelli (#32, #50). * Add Irish stemmer from Jim O'Regan (#48). * Add Nepali stemmer from Arthur Zakirov (#70). * Add Indonesian stemmer from Olly Betts (#71). * Add Hindi stemmer from Olly Betts (#73). Thanks to David Corbett for review. * Add Lithuanian stemmer from Dainius Jocas (#22, #76). * Add Greek stemmer from Oleg Smirnov (#44). * Add Catalan and Basque stemmers from Israel Olalla (#104). Behavioural changes to existing algorithms ------------------------------------------ * Portuguese: + Replace incorrect Spanish suffixes by Portuguese suffixes (#1). * French: + The MSDOS CP850 version of the French algorithm was missing changes present in the ISO8859-1 and Unicode versions. There's now a single version of each algorithm which was based on the Unicode version. + Recognize French suffixes even when they begin with diaereses. Patch from David Corbett in #78. * Russian: + We now normalise 'ё' to 'е' before stemming. The documentation has long said "we assume ['ё'] is mapped into ['е']" but it's more convenient for the stemmer to actually perform this normalisation. This change has no effect if the caller is already normalising as we recommend. It's a change in behaviour if they aren't, but 'ё' occurs rarely (there are currently no instances in our test vocabulary) and this improves behaviour when it does occur. Patch from Eugene Mirotin (#65, #68). * Finnish: + Adjust the Finnish algorithm not to mangle numbers. This change also means it tends to leave foreign words alone. Fixes #66. * Danish: + Adjust Danish algorithm not to mangle alphanumeric codes. In particular alphanumeric codes ending in a double digit (e.g. 
0x0e00, hal9000, space1999) are no longer mangled. See #81. Optimisations to existing algorithms ------------------------------------ * Turkish: + Simplify uses of `test` in stemmer code. + Check for 'ad' or 'soyad' more efficiently, and without needing the strlen variable. This speeds up "make check_utf8_turkish" by 11% on x86 Linux. * Kraaij-Pohlmann: + Eliminate variable x `$p1 <= cursor` is simpler and a little more efficient than `setmark x $x >= p1`. Code clarity improvements to existing algorithms ------------------------------------------------ * Turkish: + Use , for cedilla to match the conventions used in other stemmers. * Kraaij-Pohlmann: + Avoid cryptic `[among ( (])` ... `)` construct - instead use the same `[substring] among (` ... `)` construct we do in other stemmers. Compiler -------- * Support conventional --help and --version options. * Warn if -r or -ep used with backend other than C/C++. * Warn if encoding command line options are specified when generating code in a language with a fixed encoding. * The default classname is now set based on the output filename, so `-n` is now often no longer needed. Fixes #64. * Avoid potential one byte buffer over-read when parsing snowball code. * Avoid comparing with uninitialised array element during compilation. * Improve `-syntax` output for `setlimit L for C`. * Optimise away double negation so generators don't have to worry about generating `--` (decrement operator in many languages). Fixes #52, reported by David Corbett. * Improved compiler error and warning messages: - We now report FILE:LINE: before each diagnostic message. - Improve warnings for unused declarations/definitions. - Warn for variables which are used, but either never initialised or never read. - Flag non-ASCII literal strings. This is an error for wide Unicode, but only a warning for single-byte and UTF-8 which work so long as the source encoding matches the encoding used in the generated stemmer code. 
- Improve error recovery after an undeclared `define`. We now sniff the token after the identifier and if it is `as` we parse as a routine, otherwise we parse as a grouping. Previously we always just assumed it was a routine, which gave a confusing second error if it was a grouping. - Improve error recovery after an unexpected token in `among`. Previously we acted as if the unexpected token closed the `among` (this probably wasn't intended but just a missing `break;` in a switch statement). Now we issue an error and try the next token. * Report error instead of silently truncating character values (e.g. `hex 123` previously silently became byte 0x23 which is `#` rather than a g-with-cedilla). * Enlarge the initial input buffer size to 8192 bytes and double each time we hit the end. Snowball programs are typically a few KB in size (with the current largest we ship being the Greek stemmer at 27KB) so the previous approach of starting with a 10 byte input buffer and increasing its size by 50% plus 40 bytes each time it filled was inefficient, needing up to 15 reallocations to load greek.sbl. * Identify variables only used by one `routine`/`external`. This information isn't yet used, but such variables which are also always written to before being read can be emitted as local variables in most target languages. * We now allow multiple source files on command line, and allow them to be after (or even interspersed) with options to better match modern Unix conventions. Support for multiple source files allows specifying a single byte character set mapping via a source file of `stringdef`. * Avoid infinite recursion in compiler when optimising a recursive snowball function. Recursive functions aren't typical in snowball programs, but the compiler shouldn't crash for any input, especially not a valid one. We now simply limit on how deep the compiler will recurse and make the pessimistic assumption in the unlikely event we hit this limit. 
Build system ------------ * `make clean` in C libstemmer_c distribution now removes `examples/*.o`. (#59) * Fix all the places which previously had to have a list of stemmers to work dynamically or be generated, so now only modules.txt needs updating to add a new stemmer. * Add check_java make target which runs tests for java. * Support gzipped test data (the uncompressed arabic test data is too big for github). * GNUmakefile: Drop useless `-eprefix` and `-r` options from snowball invocations for Java - these are only meaningful when generating C code. * Pass CFLAGS when linking which matches convention (e.g. automake does it) and facilitates use of tools such as ASan. Fixes #84, reported by Thomas Pointhuber. * Add CI builds with -std=c90 to check compiler and generated code are C90 (#54) libstemmer ---------- * Split out CPPFLAGS from CFLAGS and use CFLAGS when linking stemwords. * Add -O2 to CFLAGS. * Make generated tables of encodings and modules const. * Fix clang static analyzer memory leak warning (in practice this code path can never actually be taken). Patch from Patrick O. Perry (#56) Documentation ------------- * Added copyright and licensing details (#10). * Document that libstemmer supports ISO_8859_2 encoding. Currently hungarian and romanian are available in ISO_8859_2. * Remove documentation falsely claiming that libstemmer supports CP850 encoding. * CONTRIBUTING.rst: Add guidance for contributing new stemming algorithms and new language backends. * Overhaul libstemmer_python_README. Most notably, replace the benchmark data which was very out of date. snowball-2.2.0/README.rst000066400000000000000000000037111414263061200150240ustar00rootroot00000000000000Snowball is a small string processing language for creating stemming algorithms for use in Information Retrieval, plus a collection of stemming algorithms implemented using it. Snowball was originally designed and built by Martin Porter. 
Martin retired from development in 2014 and Snowball is now maintained as a community project. Martin originally chose the name Snowball as a tribute to SNOBOL, the excellent string handling language from the 1960s. It now also serves as a metaphor for how the project grows by gathering contributions over time. The Snowball compiler translates a Snowball program into source code in another language - currently Ada, ISO C, C#, Go, Java, Javascript, Object Pascal, Python and Rust are supported. This repository contains the source code for the snowball compiler and the stemming algorithms. The snowball compiler is written in ISO C - you'll need a C compiler which supports C99 to build it (but the C code it generates should work with any ISO C compiler). See https://snowballstem.org/ for more information about Snowball. What is Stemming? ================= Stemming maps different forms of the same word to a common "stem" - for example, the English stemmer maps *connection*, *connections*, *connective*, *connected*, and *connecting* to *connect*. So a search for *connected* would also find documents which only have the other forms. This stem form is often a word itself, but this is not always the case as this is not a requirement for text search systems, which are the intended field of use. We also aim to conflate words with the same meaning, rather than all words with a common linguistic root (so *awe* and *awful* don't have the same stem), and over-stemming is more problematic than under-stemming so we tend not to stem in cases that are hard to resolve. If you want to always reduce words to a root form and/or get a root form which is itself a word then Snowball's stemming algorithms likely aren't the right answer. 
snowball-2.2.0/ada/000077500000000000000000000000001414263061200140605ustar00rootroot00000000000000snowball-2.2.0/ada/README.md000066400000000000000000000041711414263061200153420ustar00rootroot00000000000000# Ada Target for Snowball The Ada Snowball generator generates an Ada child package for each Snowball algorithm. The parent package is named `Stemmer` and it provides various operations used by the generated code. The `Stemmer` package contains the Ada Snowball runtime available either in `ada/src` directory or from https://github.com/stcarrez/ada-stemmer. The generated child package declares the `Context_Type` tagged type and the `Stem` procedure: ```Ada package Stemmer. is type Context_Type is new Stemmer.Context_Type with private; procedure Stem (Z : in out Context_Type; Result : out Boolean); private type Context_Type is new Stemmer.Context_Type with record ... end record; end Stemmer.; ``` It is possible to use directly the generated operation or use it through the `Stemmer.Factory` package. ## Usage To generate Ada source for a Snowball algorithm: ``` $ snowball path/to/algorithm.sbl -ada -P -o src/stemmer- ``` ### Ada specific options `-P ` the child package name used in the generated Ada file (defaults to `snowball`). It must be a valid Ada identifier. ## Code Organization `compiler/generator_ada.c` has the Ada code generation logic `ada/src` contains the default Ada Snowball runtime support which is also available at https://github.com/stcarrez/ada-stemmer `ada/algorithms` location where the makefile generated code will end up ## Using the Generated Stemmers To use the generated stemmer, import the Ada generated package, declare an instance of the generated `Context_Type` and call the `Stem_Word` procedure. ``` with Stemmer.English; Ctx : Stemmer.English.Context_Type; Result : Boolean; Ctx.Stem_Word ("zealously", Result); if Result then Ada.Text_IO.Put_Line (Ctx.Get_Result); end if; ``` You can use the context as many times as you want. 
## Testing To run the tests, you will need an Ada compiler such as GNAT as well as the `gprbuild` build tool. Only the existing Snowball algorithms have been used for testing. This does not exercise all features of the language. Run: ``` $ make check_ada ``` snowball-2.2.0/ada/generate.gpr000066400000000000000000000007441414263061200163710ustar00rootroot00000000000000with "stemmer_config"; project Generate is Mains := ("generate.adb"); for Main use Mains; for Source_Dirs use ("generate"); for Object_Dir use "./" & Stemmer_Config'Object_Dir & "/obj"; for Exec_Dir use "./" & Stemmer_Config'Exec_Dir & "/bin"; package Binder renames Stemmer_Config.Binder; package Builder renames Stemmer_Config.Builder; package Compiler renames Stemmer_Config.Compiler; package Linker renames Stemmer_Config.Linker; end Generate; snowball-2.2.0/ada/generate/000077500000000000000000000000001414263061200156525ustar00rootroot00000000000000snowball-2.2.0/ada/generate/generate.adb000066400000000000000000000055051414263061200201210ustar00rootroot00000000000000with Ada.Characters.Handling; with Ada.Text_IO; with Ada.Command_Line; with Ada.Containers.Indefinite_Vectors; procedure Generate is use Ada.Characters.Handling; use Ada.Text_IO; package String_Vectors is new Ada.Containers.Indefinite_Vectors (Element_Type => String, Index_Type => Positive); Languages : String_Vectors.Vector; function Capitalize (S : in String) return String is (To_Upper (S (S'First)) & S (S'First + 1 .. 
S'Last));

   --  Write "stemmer-factory.ads": the Stemmer.Factory specification with an
   --  enumeration holding one L_<LANGUAGE> literal per entry of Languages and
   --  the declaration of the Stem function.
   procedure Write_Spec is
      File : File_Type;
      --  Number of literals written so far; compared with Languages.Length
      --  to decide whether a separating comma is still needed.
      I    : Natural := 0;
   begin
      Create (File, Out_File, "stemmer-factory.ads");
      Put_Line (File, "package Stemmer.Factory with SPARK_Mode is");
      New_Line (File);
      Put (File, " type Language_Type is (");
      for Lang of Languages loop
         Put (File, "L_" & To_Upper (Lang));
         I := I + 1;
         --  No comma after the last literal.
         if I < Natural (Languages.Length) then
            Put_Line (File, ",");
            Put (File, " ");
         end if;
      end loop;
      Put_Line (File, ");");
      New_Line (File);
      Put_Line (File, " function Stem (Language : in Language_Type;");
      Put_Line (File, " Word : in String) return String;");
      New_Line (File);
      Put_Line (File, "end Stemmer.Factory;");
      Close (File);
   end Write_Spec;

   --  Write "stemmer-factory.adb": one with clause per language, then a Stem
   --  function whose case statement has one alternative per language.  Each
   --  alternative declares that language's Context_Type, calls Stem_Word on
   --  it and returns Get_Result.
   procedure Write_Body is
      File : File_Type;
   begin
      Create (File, Out_File, "stemmer-factory.adb");
      for Lang of Languages loop
         Put_Line (File, "with Stemmer." & Capitalize (Lang) & ";");
      end loop;
      Put_Line (File, "package body Stemmer.Factory with SPARK_Mode is");
      New_Line (File);
      Put_Line (File, " function Stem (Language : in Language_Type;");
      Put_Line (File, " Word : in String) return String is");
      Put_Line (File, " Result : Boolean := False;");
      Put_Line (File, " begin");
      Put_Line (File, " case Language is");
      for Lang of Languages loop
         Put_Line (File, " when L_" & To_Upper (Lang) & " =>");
         Put_Line (File, " declare");
         Put_Line (File, " C : Stemmer." & Capitalize (Lang) & ".Context_Type;");
         Put_Line (File, " begin");
         Put_Line (File, " C.Stem_Word (Word, Result);");
         Put_Line (File, " return Get_Result (C);");
         Put_Line (File, " end;");
         New_Line (File);
      end loop;
      Put_Line (File, " end case;");
      Put_Line (File, " end Stem;");
      New_Line (File);
      Put_Line (File, "end Stemmer.Factory;");
      Close (File);
   end Write_Body;

   Count : constant Natural := Ada.Command_Line.Argument_Count;
begin
   --  Every command-line argument is taken as a language name and stored
   --  lower-cased.
   for I in 1 ..
Count loop Languages.Append (To_Lower (Ada.Command_Line.Argument (I))); end loop; Write_Spec; Write_Body; end Generate; snowball-2.2.0/ada/src/000077500000000000000000000000001414263061200146475ustar00rootroot00000000000000snowball-2.2.0/ada/src/stemmer.adb000066400000000000000000000505571414263061200170070ustar00rootroot00000000000000----------------------------------------------------------------------- -- stemmer -- Multi-language stemmer with Snowball generator -- Written by Stephane Carrez (Stephane.Carrez@gmail.com) -- All rights reserved. -- -- Redistribution and use in source and binary forms, with or without -- modification, are permitted provided that the following conditions -- are met: -- -- 1. Redistributions of source code must retain the above copyright notice, -- this list of conditions and the following disclaimer. -- 2. Redistributions in binary form must reproduce the above copyright notice, -- this list of conditions and the following disclaimer in the documentation -- and/or other materials provided with the distribution. -- 3. Neither the name of the Snowball project nor the names of its contributors -- may be used to endorse or promote products derived from this software -- without specific prior written permission. -- -- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -- DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ----------------------------------------------------------------------- with Interfaces; package body Stemmer with SPARK_Mode is subtype Byte is Interfaces.Unsigned_8; use type Interfaces.Unsigned_8; procedure Stem_Word (Context : in out Context_Type'Class; Word : in String; Result : out Boolean) is begin Context.P (1 .. Word'Length) := Word; Context.C := 0; Context.L := Word'Length; Context.Lb := 0; Stemmer.Stem (Context, Result); end Stem_Word; function Get_Result (Context : in Context_Type'Class) return String is begin return Context.P (1 .. Context.L); end Get_Result; function Eq_S (Context : in Context_Type'Class; S : in String) return Char_Index is begin if Context.L - Context.C < S'Length then return 0; end if; if Context.P (Context.C + 1 .. Context.C + S'Length) /= S then return 0; end if; return S'Length; end Eq_S; function Eq_S_Backward (Context : in Context_Type'Class; S : in String) return Char_Index is begin if Context.C - Context.Lb < S'Length then return 0; end if; if Context.P (Context.C + 1 - S'Length .. 
Context.C) /= S then
         return 0;
      end if;
      return S'Length;
   end Eq_S_Backward;

   --  Number of Character positions between the backward limit Lb and the
   --  end L of the buffer (L - Lb).
   function Length (Context : in Context_Type'Class) return Natural is
   begin
      return Context.L - Context.Lb;
   end Length;

   --  Number of UTF-8 encoded characters in P (1 .. L): every byte outside
   --  the continuation range 16#80# .. 16#BF# is counted as the start of a
   --  character.
   function Length_Utf8 (Context : in Context_Type'Class) return Natural is
      Count : Natural := 0;
      Pos   : Positive := 1;
      Val   : Byte;
   begin
      while Pos <= Context.L loop
         Val := Character'Pos (Context.P (Pos));
         Pos := Pos + 1;
         if Val >= 16#C0# or Val < 16#80# then
            Count := Count + 1;
         end if;
      end loop;
      return Count;
   end Length_Utf8;

   --  Bitmask test on the byte stored at P (Pos + 1).  Returns True when the
   --  top three bits of the byte differ from Shift, or when the bit of Mask
   --  selected by the low five bits of the byte is clear.
   --  NOTE(review): presumably True means "no among entry can start with
   --  this byte" - confirm against the generated callers.
   function Check_Among (Context : in Context_Type'Class;
                         Pos     : in Char_Index;
                         Shift   : in Natural;
                         Mask    : in Mask_Type) return Boolean is
      use Interfaces;
      Val : constant Byte := Character'Pos (Context.P (Pos + 1));
   begin
      if Natural (Shift_Right (Val, 5)) /= Shift then
         return True;
      end if;
      return (Shift_Right (Unsigned_64 (Mask), Natural (Val and 16#1f#)) and 1) = 0;
   end Check_Among;

   --  Look up the text that follows the cursor in the Amongs table.  The key
   --  of entry K is Pattern (Amongs (K).First .. Amongs (K).Last).  A binary
   --  search narrows the candidates, then the Substring_I chain is followed
   --  until an entry is accepted.  On success the cursor is moved past the
   --  matched key and Result receives the entry's Result; otherwise Result
   --  is 0.
   procedure Find_Among (Context : in out Context_Type'Class;
                         Amongs  : in Among_Array_Type;
                         Pattern : in String;
                         Execute : access procedure
                           (Ctx       : in out Context_Type'Class;
                            Operation : in Operation_Index;
                            Status    : out Boolean);
                         Result  : out Integer) is
      I                   : Natural := Amongs'First;
      J                   : Natural := Amongs'Last + 1;
      --  Number of leading key characters already known to match at the
      --  lower (I) and upper (J) ends of the search interval.
      Common_I            : Natural := 0;
      Common_J            : Natural := 0;
      First_Key_Inspected : Boolean := False;
      C                   : constant Natural := Context.C;
      L                   : constant Integer := Context.L;
   begin
      loop
         declare
            K      : constant Natural := I + (J - I) / 2;
            W      : constant Among_Type := Amongs (K);
            --  Comparison can start after the prefix both bounds agree on.
            Common : Natural := (if Common_I < Common_J then Common_I else Common_J);
            Diff   : Integer := 0;
         begin
            for I2 in W.First + Common ..
W.Last loop
               --  Input exhausted before the key: the input sorts before
               --  the key.
               if C + Common = L then
                  Diff := -1;
                  exit;
               end if;
               Diff := Character'Pos (Context.P (C + Common + 1))
                 - Character'Pos (Pattern (I2));
               exit when Diff /= 0;
               Common := Common + 1;
            end loop;
            --  Input < key: K becomes the new upper bound, otherwise the new
            --  lower bound; remember how many characters matched there.
            if Diff < 0 then
               J := K;
               Common_J := Common;
            else
               I := K;
               Common_I := Common;
            end if;
         end;
         if J - I <= 1 then
            exit when I > 0 or J = I or First_Key_Inspected;
            First_Key_Inspected := True;
         end if;
      end loop;

      --  Starting from entry I, follow the Substring_I links.  An entry is
      --  accepted when its whole key was matched (Common_I >= Len) and it
      --  either has no operation or its operation reports Status = True.
      loop
         declare
            W      : constant Among_Type := Amongs (I);
            Len    : constant Natural := W.Last - W.First + 1;
            Status : Boolean;
         begin
            if Common_I >= Len then
               Context.C := C + Len;
               if W.Operation = 0 then
                  Result := W.Result;
                  return;
               end if;
               Execute (Context, W.Operation, Status);
               --  Execute receives Context as "in out", so the cursor is
               --  set again after the call.
               Context.C := C + Len;
               if Status then
                  Result := W.Result;
                  return;
               end if;
            end if;
            exit when W.Substring_I < 0;
            I := W.Substring_I;
         end;
      end loop;
      Result := 0;
   end Find_Among;

   --  Backward counterpart of Find_Among: keys are compared from their last
   --  character towards their first against the text that precedes the
   --  cursor, never going below the backward limit Lb.  On success the
   --  cursor is moved back over the matched key.
   procedure Find_Among_Backward (Context : in out Context_Type'Class;
                                  Amongs  : in Among_Array_Type;
                                  Pattern : in String;
                                  Execute : access procedure
                                    (Ctx       : in out Context_Type'Class;
                                     Operation : in Operation_Index;
                                     Status    : out Boolean);
                                  Result  : out Integer) is
      I                   : Natural := Amongs'First;
      J                   : Natural := Amongs'Last + 1;
      Common_I            : Natural := 0;
      Common_J            : Natural := 0;
      First_Key_Inspected : Boolean := False;
      C                   : constant Integer := Context.C;
      Lb                  : constant Integer := Context.Lb;
   begin
      loop
         declare
            K      : constant Natural := I + (J - I) / 2;
            W      : constant Among_Type := Amongs (K);
            Common : Natural := (if Common_I < Common_J then Common_I else Common_J);
            Diff   : Integer := 0;
         begin
            for I2 in reverse W.First ..
W.Last - Common loop
               --  Reached the backward limit before the key was exhausted:
               --  the input sorts before the key.
               if C - Common = Lb then
                  Diff := -1;
                  exit;
               end if;
               Diff := Character'Pos (Context.P (C - Common))
                 - Character'Pos (Pattern (I2));
               exit when Diff /= 0;
               Common := Common + 1;
            end loop;
            if Diff < 0 then
               J := K;
               Common_J := Common;
            else
               I := K;
               Common_I := Common;
            end if;
         end;
         if J - I <= 1 then
            exit when I > 0 or J = I or First_Key_Inspected;
            First_Key_Inspected := True;
         end if;
      end loop;

      --  Follow the Substring_I links from entry I; accept the first entry
      --  whose whole key matched and whose operation, if any, succeeds.
      loop
         declare
            W      : constant Among_Type := Amongs (I);
            Len    : constant Natural := W.Last - W.First + 1;
            Status : Boolean;
         begin
            if Common_I >= Len then
               Context.C := C - Len;
               if W.Operation = 0 then
                  Result := W.Result;
                  return;
               end if;
               Execute (Context, W.Operation, Status);
               --  Execute receives Context as "in out", so the cursor is
               --  set again after the call.
               Context.C := C - Len;
               if Status then
                  Result := W.Result;
                  return;
               end if;
            end if;
            exit when W.Substring_I < 0;
            I := W.Substring_I;
         end;
      end loop;
      Result := 0;
   end Find_Among_Backward;

   --  Position reached after stepping forward over one UTF-8 character from
   --  the cursor, or -1 when the cursor is already at the limit L.  Context
   --  is an "in" parameter: the cursor itself is not changed.
   function Skip_Utf8 (Context : in Context_Type'Class) return Result_Index is
      Pos : Char_Index := Context.C;
      Val : Byte;
   begin
      if Pos >= Context.L then
         return -1;
      end if;
      Pos := Pos + 1;
      Val := Character'Pos (Context.P (Pos));
      --  After a lead byte, also step over the continuation bytes
      --  (16#80# .. 16#BF#) that follow it.
      if Val >= 16#C0# then
         while Pos < Context.L loop
            Val := Character'Pos (Context.P (Pos + 1));
            exit when Val >= 16#C0# or Val < 16#80#;
            Pos := Pos + 1;
         end loop;
      end if;
      return Pos;
   end Skip_Utf8;

   --  Same as above for N characters.  Returns -1 when N is negative or
   --  when the limit L is reached before N characters were skipped.
   function Skip_Utf8 (Context : in Context_Type'Class;
                       N       : in Integer) return Result_Index is
      Pos : Char_Index := Context.C;
      Val : Byte;
   begin
      if N < 0 then
         return -1;
      end if;
      for I in 1 ..
N loop
         if Pos >= Context.L then
            return -1;
         end if;
         Pos := Pos + 1;
         Val := Character'Pos (Context.P (Pos));
         if Val >= 16#C0# then
            while Pos < Context.L loop
               Val := Character'Pos (Context.P (Pos + 1));
               exit when Val >= 16#C0# or Val < 16#80#;
               Pos := Pos + 1;
            end loop;
         end if;
      end loop;
      return Pos;
   end Skip_Utf8;

   --  Position reached after stepping backward over one UTF-8 character
   --  from the cursor, or -1 when the cursor is already at the backward
   --  limit Lb.  The cursor itself is not changed.
   function Skip_Utf8_Backward (Context : in Context_Type'Class) return Result_Index is
      Pos : Char_Index := Context.C;
      Val : Byte;
   begin
      if Pos <= Context.Lb then
         return -1;
      end if;
      Val := Character'Pos (Context.P (Pos));
      Pos := Pos - 1;
      --  Part of a multi-byte sequence: keep stepping back until the byte
      --  just after Pos is a lead byte (>= 16#C0#) or Lb is reached.
      if Val >= 16#80# then
         while Pos > Context.Lb loop
            Val := Character'Pos (Context.P (Pos + 1));
            exit when Val >= 16#C0#;
            Pos := Pos - 1;
         end loop;
      end if;
      return Pos;
   end Skip_Utf8_Backward;

   --  Same as above for N characters.  Returns -1 when N is negative or
   --  when Lb is reached before N characters were skipped.
   function Skip_Utf8_Backward (Context : in Context_Type'Class;
                                N       : in Integer) return Result_Index is
      Pos : Char_Index := Context.C;
      Val : Byte;
   begin
      if N < 0 then
         return -1;
      end if;
      for I in 1 .. N loop
         if Pos <= Context.Lb then
            return -1;
         end if;
         Val := Character'Pos (Context.P (Pos));
         Pos := Pos - 1;
         if Val >= 16#80# then
            while Pos > Context.Lb loop
               Val := Character'Pos (Context.P (Pos + 1));
               exit when Val >= 16#C0#;
               Pos := Pos - 1;
            end loop;
         end if;
      end loop;
      return Pos;
   end Skip_Utf8_Backward;

   --  Left shift on Utf8_Type, done through Interfaces.Unsigned_32.
   function Shift_Left (Value : in Utf8_Type;
                        Shift : in Natural) return Utf8_Type
     is (Utf8_Type (Interfaces.Shift_Left (Interfaces.Unsigned_32 (Value), Shift)));

   --  Decode the UTF-8 character that starts just after the cursor.  Value
   --  receives the code point and Count the number of bytes it occupies;
   --  Count is 0 (and Value 0) when the cursor is at the limit L.  A
   --  sequence cut short by L is decoded from the bytes that are available.
   --  The cursor is not moved.
   procedure Get_Utf8 (Context : in Context_Type'Class;
                       Value   : out Utf8_Type;
                       Count   : out Natural) is
      B0, B1, B2, B3 : Byte;
   begin
      if Context.C >= Context.L then
         Value := 0;
         Count := 0;
         return;
      end if;
      B0 := Character'Pos (Context.P (Context.C + 1));
      if B0 < 16#C0# or Context.C + 1 >= Context.L then
         Value := Utf8_Type (B0);
         Count := 1;
         return;
      end if;
      B1 := Character'Pos (Context.P (Context.C + 2)) and 16#3F#;
      if B0 < 16#E0# or Context.C + 2 >= Context.L then
         Value := Shift_Left (Utf8_Type (B0 and 16#1F#), 6) or Utf8_Type (B1);
         Count := 2;
         return;
      end if;
      B2 := Character'Pos (Context.P
(Context.C + 3)) and 16#3F#; if B0 < 16#F0# or Context.C + 3 >= Context.L then Value := Shift_Left (Utf8_Type (B0 and 16#0F#), 12) or Shift_Left (Utf8_Type (B1), 6) or Utf8_Type (B2); Count := 3; return; end if; B3 := Character'Pos (Context.P (Context.C + 4)) and 16#3F#; Value := Shift_Left (Utf8_Type (B0 and 16#07#), 18) or Shift_Left (Utf8_Type (B1), 12) or Shift_Left (Utf8_Type (B2), 6) or Utf8_Type (B3); Count := 4; end Get_Utf8; procedure Get_Utf8_Backward (Context : in Context_Type'Class; Value : out Utf8_Type; Count : out Natural) is B0, B1, B2, B3 : Byte; begin if Context.C <= Context.Lb then Value := 0; Count := 0; return; end if; B3 := Character'Pos (Context.P (Context.C)); if B3 < 16#80# or Context.C - 1 <= Context.Lb then Value := Utf8_Type (B3); Count := 1; return; end if; B2 := Character'Pos (Context.P (Context.C - 1)); if B2 >= 16#C0# or Context.C - 2 <= Context.Lb then B3 := B3 and 16#3F#; Value := Shift_Left (Utf8_Type (B2 and 16#1F#), 6) or Utf8_Type (B3); Count := 2; return; end if; B1 := Character'Pos (Context.P (Context.C - 2)); if B1 >= 16#E0# or Context.C - 3 <= Context.Lb then B3 := B3 and 16#3F#; B2 := B2 and 16#3F#; Value := Shift_Left (Utf8_Type (B1 and 16#0F#), 12) or Shift_Left (Utf8_Type (B2), 6) or Utf8_Type (B3); Count := 3; return; end if; B0 := Character'Pos (Context.P (Context.C - 3)); B1 := B1 and 16#1F#; B2 := B2 and 16#3F#; B3 := B3 and 16#3F#; Value := Shift_Left (Utf8_Type (B0 and 16#07#), 18) or Shift_Left (Utf8_Type (B1), 12) or Shift_Left (Utf8_Type (B2), 6) or Utf8_Type (B3); Count := 4; end Get_Utf8_Backward; procedure Out_Grouping (Context : in out Context_Type'Class; S : in Grouping_Array; Min : in Utf8_Type; Max : in Utf8_Type; Repeat : in Boolean; Result : out Result_Index) is Ch : Utf8_Type; Count : Natural; begin if Context.C >= Context.L then Result := -1; return; end if; loop Get_Utf8 (Context, Ch, Count); if Count = 0 then Result := -1; return; end if; if Ch <= Max and Ch >= Min then Ch := Ch - Min; if S (Ch) 
then Result := Count; return; end if; end if; Context.C := Context.C + Count; exit when not Repeat; end loop; Result := 0; end Out_Grouping; procedure Out_Grouping_Backward (Context : in out Context_Type'Class; S : in Grouping_Array; Min : in Utf8_Type; Max : in Utf8_Type; Repeat : in Boolean; Result : out Result_Index) is Ch : Utf8_Type; Count : Natural; begin if Context.C = 0 then Result := -1; return; end if; loop Get_Utf8_Backward (Context, Ch, Count); if Count = 0 then Result := -1; return; end if; if Ch <= Max and Ch >= Min then Ch := Ch - Min; if S (Ch) then Result := Count; return; end if; end if; Context.C := Context.C - Count; exit when not Repeat; end loop; Result := 0; end Out_Grouping_Backward; procedure In_Grouping (Context : in out Context_Type'Class; S : in Grouping_Array; Min : in Utf8_Type; Max : in Utf8_Type; Repeat : in Boolean; Result : out Result_Index) is Ch : Utf8_Type; Count : Natural; begin if Context.C >= Context.L then Result := -1; return; end if; loop Get_Utf8 (Context, Ch, Count); if Count = 0 then Result := -1; return; end if; if Ch > Max or Ch < Min then Result := Count; return; end if; Ch := Ch - Min; if not S (Ch) then Result := Count; return; end if; Context.C := Context.C + Count; exit when not Repeat; end loop; Result := 0; end In_Grouping; procedure In_Grouping_Backward (Context : in out Context_Type'Class; S : in Grouping_Array; Min : in Utf8_Type; Max : in Utf8_Type; Repeat : in Boolean; Result : out Result_Index) is Ch : Utf8_Type; Count : Natural; begin if Context.C = 0 then Result := -1; return; end if; loop Get_Utf8_Backward (Context, Ch, Count); if Count = 0 then Result := -1; return; end if; if Ch > Max or Ch < Min then Result := Count; return; end if; Ch := Ch - Min; if not S (Ch) then Result := Count; return; end if; Context.C := Context.C - Count; exit when not Repeat; end loop; Result := 0; end In_Grouping_Backward; procedure Replace (Context : in out Context_Type'Class; C_Bra : in Char_Index; C_Ket : in 
Char_Index; S : in String; Adjustment : out Integer) is begin Adjustment := S'Length - (C_Ket - C_Bra); if Adjustment > 0 then Context.P (C_Bra + S'Length + 1 .. Context.Lb + Adjustment + 1) := Context.P (C_Ket + 1 .. Context.Lb + 1); end if; if S'Length > 0 then Context.P (C_Bra + 1 .. C_Bra + S'Length) := S; end if; if Adjustment < 0 then Context.P (C_Bra + S'Length + 1 .. Context.L + Adjustment + 1) := Context.P (C_Ket + 1 .. Context.L + 1); end if; Context.L := Context.L + Adjustment; if Context.C >= C_Ket then Context.C := Context.C + Adjustment; elsif Context.C > C_Bra then Context.C := C_Bra; end if; end Replace; procedure Slice_Del (Context : in out Context_Type'Class) is Result : Integer; begin Replace (Context, Context.Bra, Context.Ket, "", Result); end Slice_Del; procedure Slice_From (Context : in out Context_Type'Class; Text : in String) is Result : Integer; begin Replace (Context, Context.Bra, Context.Ket, Text, Result); end Slice_From; function Slice_To (Context : in Context_Type'Class) return String is begin return Context.P (Context.Bra + 1 .. Context.Ket); end Slice_To; procedure Insert (Context : in out Context_Type'Class; C_Bra : in Char_Index; C_Ket : in Char_Index; S : in String) is Result : Integer; begin Replace (Context, C_Bra, C_Ket, S, Result); if C_Bra <= Context.Bra then Context.Bra := Context.Bra + Result; end if; if C_Bra <= Context.Ket then Context.Ket := Context.Ket + Result; end if; end Insert; end Stemmer; snowball-2.2.0/ada/src/stemmer.ads000066400000000000000000000231041414263061200170140ustar00rootroot00000000000000----------------------------------------------------------------------- -- stemmer -- Multi-language stemmer with Snowball generator -- Written by Stephane Carrez (Stephane.Carrez@gmail.com) -- All rights reserved. -- -- Redistribution and use in source and binary forms, with or without -- modification, are permitted provided that the following conditions -- are met: -- -- 1. 
Redistributions of source code must retain the above copyright notice, -- this list of conditions and the following disclaimer. -- 2. Redistributions in binary form must reproduce the above copyright notice, -- this list of conditions and the following disclaimer in the documentation -- and/or other materials provided with the distribution. -- 3. Neither the name of the Snowball project nor the names of its contributors -- may be used to endorse or promote products derived from this software -- without specific prior written permission. -- -- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ----------------------------------------------------------------------- package Stemmer with SPARK_Mode is pragma Preelaborate; WORD_MAX_LENGTH : constant := 1024; type Context_Type is abstract tagged private; -- Apply the stemming algorithm on the word initialized in the context. procedure Stem (Context : in out Context_Type; Result : out Boolean) is abstract; -- Stem the word and return True if it was reduced. procedure Stem_Word (Context : in out Context_Type'Class; Word : in String; Result : out Boolean) with Global => null, Pre => Word'Length < WORD_MAX_LENGTH; -- Get the stem or the input word unmodified. 
function Get_Result (Context : in Context_Type'Class) return String with Global => null, Post => Get_Result'Result'Length < WORD_MAX_LENGTH; private type Mask_Type is mod 2**32; -- A 32-bit character value that was read from UTF-8 sequence. -- A modular value is used because shift and logical arithmetic is necessary. type Utf8_Type is mod 2**32; -- Index of the Grouping_Array. The index comes from the 32-bit character value -- minus a starting offset. We don't expect large tables and we check against -- a maximum value. subtype Grouping_Index is Utf8_Type range 0 .. 16384; type Grouping_Array is array (Grouping_Index range <>) of Boolean with Pack; subtype Among_Index is Natural range 0 .. 65535; subtype Among_Start_Index is Among_Index range 1 .. Among_Index'Last; subtype Operation_Index is Natural range 0 .. 65535; subtype Result_Index is Integer range -1 .. WORD_MAX_LENGTH - 1; subtype Char_Index is Result_Index range 0 .. Result_Index'Last; type Among_Type is record First : Among_Start_Index; Last : Among_Index; Substring_I : Integer; Result : Integer; Operation : Operation_Index; end record; type Among_Array_Type is array (Natural range <>) of Among_Type; function Eq_S (Context : in Context_Type'Class; S : in String) return Char_Index with Global => null, Pre => S'Length > 0, Post => Eq_S'Result = 0 or Eq_S'Result = S'Length; function Eq_S_Backward (Context : in Context_Type'Class; S : in String) return Char_Index with Global => null, Pre => S'Length > 0, Post => Eq_S_Backward'Result = 0 or Eq_S_Backward'Result = S'Length; procedure Find_Among (Context : in out Context_Type'Class; Amongs : in Among_Array_Type; Pattern : in String; Execute : access procedure (Ctx : in out Context_Type'Class; Operation : in Operation_Index; Status : out Boolean); Result : out Integer) with Global => null, Pre => Pattern'Length > 0 and Amongs'Length > 0; procedure Find_Among_Backward (Context : in out Context_Type'Class; Amongs : in Among_Array_Type; Pattern : in String; Execute 
: access procedure (Ctx : in out Context_Type'Class; Operation : in Operation_Index; Status : out Boolean); Result : out Integer) with Global => null, Pre => Pattern'Length > 0 and Amongs'Length > 0; function Skip_Utf8 (Context : in Context_Type'Class) return Result_Index with Global => null; function Skip_Utf8 (Context : in Context_Type'Class; N : in Integer) return Result_Index with Global => null; function Skip_Utf8_Backward (Context : in Context_Type'Class) return Result_Index with Global => null; function Skip_Utf8_Backward (Context : in Context_Type'Class; N : in Integer) return Result_Index with Global => null; procedure Get_Utf8 (Context : in Context_Type'Class; Value : out Utf8_Type; Count : out Natural); procedure Get_Utf8_Backward (Context : in Context_Type'Class; Value : out Utf8_Type; Count : out Natural); function Length (Context : in Context_Type'Class) return Natural; function Length_Utf8 (Context : in Context_Type'Class) return Natural; function Check_Among (Context : in Context_Type'Class; Pos : in Char_Index; Shift : in Natural; Mask : in Mask_Type) return Boolean; procedure Out_Grouping (Context : in out Context_Type'Class; S : in Grouping_Array; Min : in Utf8_Type; Max : in Utf8_Type; Repeat : in Boolean; Result : out Result_Index); procedure Out_Grouping_Backward (Context : in out Context_Type'Class; S : in Grouping_Array; Min : in Utf8_Type; Max : in Utf8_Type; Repeat : in Boolean; Result : out Result_Index); procedure In_Grouping (Context : in out Context_Type'Class; S : in Grouping_Array; Min : in Utf8_Type; Max : in Utf8_Type; Repeat : in Boolean; Result : out Result_Index); procedure In_Grouping_Backward (Context : in out Context_Type'Class; S : in Grouping_Array; Min : in Utf8_Type; Max : in Utf8_Type; Repeat : in Boolean; Result : out Result_Index); procedure Replace (Context : in out Context_Type'Class; C_Bra : in Char_Index; C_Ket : in Char_Index; S : in String; Adjustment : out Integer) with Global => null, Pre => C_Bra >= Context.Lb 
and C_Ket >= C_Bra and C_Ket <= Context.L; procedure Slice_Del (Context : in out Context_Type'Class) with Global => null, Pre => Context.Bra >= Context.Lb and Context.Ket >= Context.Bra and Context.Ket <= Context.L; procedure Slice_From (Context : in out Context_Type'Class; Text : in String) with Global => null, Pre => Context.Bra >= Context.Lb and Context.Ket >= Context.Bra and Context.Ket <= Context.L and Context.L - Context.Lb + Text'Length + Context.Ket - Context.Bra < Context.P'Length; function Slice_To (Context : in Context_Type'Class) return String; procedure Insert (Context : in out Context_Type'Class; C_Bra : in Char_Index; C_Ket : in Char_Index; S : in String) with Global => null, Pre => C_Bra >= Context.Lb and C_Ket >= C_Bra and C_Ket <= Context.L; -- The context indexes follow the C paradigm: they start at 0 for the first character. -- This is necessary because several algorithms rely on this when they compare the -- cursor position ('C') or setup some markers from the cursor. type Context_Type is abstract tagged record C : Char_Index := 0; L : Char_Index := 0; Lb : Char_Index := 0; Bra : Char_Index := 0; Ket : Char_Index := 0; P : String (1 .. 
WORD_MAX_LENGTH); end record; end Stemmer; snowball-2.2.0/ada/src/stemwords.adb000066400000000000000000000045651414263061200173600ustar00rootroot00000000000000with Ada.Text_IO; with Ada.Command_Line; with Stemmer.Factory; procedure Stemwords is use Stemmer.Factory; function Get_Language (Name : in String) return Language_Type; function Is_Space (C : in Character) return Boolean; function Is_Space (C : in Character) return Boolean is begin return C = ' ' or C = ASCII.HT; end Is_Space; function Get_Language (Name : in String) return Language_Type is begin return Language_Type'Value ("L_" & Name); exception when Constraint_Error => Ada.Text_IO.Put_Line ("Unsupported language: " & Name); return L_ENGLISH; end Get_Language; Count : constant Natural := Ada.Command_Line.Argument_Count; begin if Count /= 3 then Ada.Text_IO.Put_Line ("Usage: stemwords "); return; end if; declare Lang : constant Language_Type := Get_Language (Ada.Command_Line.Argument (1)); Input : constant String := Ada.Command_Line.Argument (2); Output : constant String := Ada.Command_Line.Argument (3); Src_File : Ada.Text_IO.File_Type; Dst_File : Ada.Text_IO.File_Type; begin Ada.Text_IO.Open (Src_File, Ada.Text_IO.In_File, Input); Ada.Text_IO.Create (Dst_File, Ada.Text_IO.Out_File, Output); while not Ada.Text_IO.End_Of_File (Src_File) loop declare Line : constant String := Ada.Text_IO.Get_Line (Src_File); Pos : Positive := Line'First; Last_Pos : Positive; Start_Pos : Positive; begin while Pos <= Line'Last loop Last_Pos := Pos; while Pos <= Line'Last and then Is_Space (Line (Pos)) loop Pos := Pos + 1; end loop; if Last_Pos < Pos then Ada.Text_IO.Put (Dst_File, Line (Last_Pos .. Pos - 1)); end if; exit when Pos > Line'Last; Start_Pos := Pos; while Pos <= Line'Last and then not Is_Space (Line (Pos)) loop Pos := Pos + 1; end loop; Ada.Text_IO.Put (Dst_File, Stemmer.Factory.Stem (Lang, Line (Start_Pos .. 
Pos - 1))); end loop; Ada.Text_IO.New_Line (Dst_File); end; end loop; Ada.Text_IO.Close (Src_File); Ada.Text_IO.Close (Dst_File); end; end Stemwords; snowball-2.2.0/ada/stemmer_config.gpr000066400000000000000000000044261414263061200176010ustar00rootroot00000000000000abstract project Stemmer_Config is for Source_Dirs use (); type Yes_No is ("yes", "no"); type Library_Type_Type is ("relocatable", "static", "static-pic"); type Build_Type is ("distrib", "debug", "optimize", "profile", "coverage"); Mode : Build_Type := external ("BUILD", "distrib"); Processors := External ("PROCESSORS", "1"); package Builder is case Mode is when "debug" => for Default_Switches ("Ada") use ("-g", "-j" & Processors); when others => for Default_Switches ("Ada") use ("-g", "-O3", "-j" & Processors); end case; end Builder; package compiler is warnings := ("-gnatwua"); defaults := ("-gnat2012"); case Mode is when "distrib" => for Default_Switches ("Ada") use defaults & ("-gnatafno", "-gnatVa", "-gnatwa"); when "debug" => for Default_Switches ("Ada") use defaults & warnings & ("-gnata", "-gnatVaMI", "-gnaty3abcefhiklmnprstxM99"); when "coverage" => for Default_Switches ("Ada") use defaults & warnings & ("-gnata", "-gnatVaMI", "-gnaty3abcefhiklmnprstxM99", "-fprofile-arcs", "-ftest-coverage"); when "optimize" => for Default_Switches ("Ada") use defaults & warnings & ("-gnatn", "-gnatp", "-fdata-sections", "-ffunction-sections"); when "profile" => for Default_Switches ("Ada") use defaults & warnings & ("-pg"); end case; end compiler; package binder is case Mode is when "debug" => for Default_Switches ("Ada") use ("-E"); when others => for Default_Switches ("Ada") use ("-E"); end case; end binder; package linker is case Mode is when "profile" => for Default_Switches ("Ada") use ("-pg"); when "distrib" => for Default_Switches ("Ada") use ("-s"); when "optimize" => for Default_Switches ("Ada") use ("-Wl,--gc-sections"); when "coverage" => for Default_Switches ("ada") use ("-fprofile-arcs"); when 
others => null; end case; end linker; package Ide is for VCS_Kind use "git"; end Ide; end Stemmer_Config; snowball-2.2.0/ada/stemwords.gpr000066400000000000000000000007601414263061200166240ustar00rootroot00000000000000with "stemmer_config"; project Stemwords is Mains := ("stemwords.adb"); for Main use Mains; for Source_Dirs use ("src", "algorithms"); for Object_Dir use "./" & Stemmer_Config'Object_Dir & "/obj"; for Exec_Dir use "./" & Stemmer_Config'Exec_Dir & "/bin"; package Binder renames Stemmer_Config.Binder; package Builder renames Stemmer_Config.Builder; package Compiler renames Stemmer_Config.Compiler; package Linker renames Stemmer_Config.Linker; end Stemwords; snowball-2.2.0/algorithms/000077500000000000000000000000001414263061200155045ustar00rootroot00000000000000snowball-2.2.0/algorithms/arabic.sbl000066400000000000000000000422551414263061200174370ustar00rootroot00000000000000/* * Authors: * - Assem Chelli, < assem [dot] ch [at] gmail > * - Abdelkrim Aries * */ stringescapes { } /* the Arabic letters in Unicode */ // Hamza stringdef o '{U+0621}' // Hamza stringdef ao '{U+0623}' // Hamza above Alef stringdef ao_ '{U+0625}' // Hamza below Alef stringdef a~ '{U+0622}' // Alef madda stringdef wo '{U+0624}' // Hamza above waw stringdef yo '{U+0626}' // Hamza above yeh // Letters stringdef a '{U+0627}' // Alef stringdef a_ '{U+0649}' // Alef Maksura stringdef b '{U+0628}' // Beh stringdef t_ '{U+0629}' // Teh_Marbuta stringdef t '{U+062A}' // Teh stringdef th '{U+062B}' // Theh stringdef j '{U+062C}' // Jeem stringdef h '{U+062D}' // Hah stringdef x '{U+062E}' // Khah stringdef d '{U+062F}' // Dal stringdef dz '{U+0630}' // Thal stringdef r '{U+0631}' // Reh stringdef z '{U+0632}' // Zain stringdef s '{U+0633}' // Seen stringdef sh '{U+0634}' // Sheen stringdef c '{U+0635}' // Sad stringdef dh '{U+0636}' // Dad stringdef tt '{U+0637}' // Tah stringdef zh '{U+0638}' // Zah stringdef i '{U+0639}' // Ain stringdef gh '{U+063A}' // Ghain stringdef f '{U+0641}' // 
Feh stringdef q '{U+0642}' // Qaf stringdef k '{U+0643}' // Kaf stringdef l '{U+0644}' // Lam stringdef m '{U+0645}' // Meem stringdef n '{U+0646}' // Noon stringdef e '{U+0647}' // Heh stringdef w '{U+0648}' // Waw stringdef y '{U+064A}' // Yeh // Diacritics stringdef aan '{U+064B}' // FatHatan stringdef uun '{U+064C}' // Dammatan stringdef iin '{U+064D}' // Kasratan stringdef aa '{U+064E}' // FatHa stringdef uu '{U+064F}' // Damma stringdef ii '{U+0650}' // Kasra stringdef oo '{U+0652}' // Sukun stringdef ~ '{U+0651}' // Shadda // Hindu–Arabic numerals stringdef 0 '{U+0660}' stringdef 1 '{U+0661}' stringdef 2 '{U+0662}' stringdef 3 '{U+0663}' stringdef 4 '{U+0664}' stringdef 5 '{U+0665}' stringdef 6 '{U+0666}' stringdef 7 '{U+0667}' stringdef 8 '{U+0668}' stringdef 9 '{U+0669}' // Kasheeda stringdef _ '{U+0640}' // Kasheeda, Tatweel // Shaped forms stringdef o1 '{U+FE80}' // HAMZA stringdef ao1 '{U+FE83}' // ALEF_HAMZA_ABOVE stringdef ao2 '{U+FE84}' // ALEF_HAMZA_ABOVE stringdef ao_1 '{U+FE87}' // ALEF_HAMZA_BELOW stringdef ao_2 '{U+FE88}' // ALEF_HAMZA_BELOW stringdef yo1 '{U+FE8B}' // YEH_HAMZA stringdef yo2 '{U+FE8C}' // YEH_HAMZA stringdef yo3 '{U+FE89}' // YEH_HAMZA stringdef yo4 '{U+FE8A}' // YEH_HAMZA stringdef a~1 '{U+FE81}' // ALEF_MADDA stringdef a~2 '{U+FE82}' // ALEF_MADDA stringdef wo1 '{U+FE85}' // WAW_HAMZA stringdef wo2 '{U+FE86}' // WAW_HAMZA stringdef a1 '{U+FE8D}' // ALEF stringdef a2 '{U+FE8E}' // ALEF stringdef b1 '{U+FE8F}' // BEH stringdef b2 '{U+FE90}' // BEH stringdef b3 '{U+FE91}' // BEH stringdef b4 '{U+FE92}' // BEH stringdef t_1 '{U+FE93}' // TEH_MARBUTA stringdef t_2 '{U+FE94}' // TEH_MARBUTA stringdef t1 '{U+FE97}' // TEH stringdef t2 '{U+FE98}' // TEH stringdef t3 '{U+FE95}' // TEH stringdef t4 '{U+FE96}' // TEH stringdef th1 '{U+FE9B}' // THEH stringdef th2 '{U+FE9C}' // THEH stringdef th3 '{U+FE9A}' // THEH stringdef th4 '{U+FE99}' // THEH stringdef j1 '{U+FE9F}' // JEEM stringdef j2 '{U+FEA0}' // JEEM stringdef j3 '{U+FE9D}' // 
JEEM stringdef j4 '{U+FE9E}' // JEEM stringdef h1 '{U+FEA3}' // HAH stringdef h2 '{U+FEA4}' // HAH stringdef h3 '{U+FEA1}' // HAH stringdef h4 '{U+FEA2}' // HAH stringdef x1 '{U+FEA7}' // KHAH stringdef x2 '{U+FEA8}' // KHAH stringdef x3 '{U+FEA5}' // KHAH stringdef x4 '{U+FEA6}' // KHAH stringdef d1 '{U+FEA9}' // DAL stringdef d2 '{U+FEAA}' // DAL stringdef dz1 '{U+FEAB}' // THAL stringdef dz2 '{U+FEAC}' // THAL stringdef r1 '{U+FEAD}' // REH stringdef r2 '{U+FEAE}' // REH stringdef z1 '{U+FEAF}' // ZAIN stringdef z2 '{U+FEB0}' // ZAIN stringdef s1 '{U+FEB3}' // SEEN stringdef s2 '{U+FEB4}' // SEEN stringdef s3 '{U+FEB1}' // SEEN stringdef s4 '{U+FEB2}' // SEEN stringdef sh1 '{U+FEB7}' // SHEEN stringdef sh2 '{U+FEB8}' // SHEEN stringdef sh3 '{U+FEB5}' // SHEEN stringdef sh4 '{U+FEB6}' // SHEEN stringdef c1 '{U+FEBB}' // SAD stringdef c2 '{U+FEBC}' // SAD stringdef c3 '{U+FEB9}' // SAD stringdef c4 '{U+FEBA}' // SAD stringdef dh1 '{U+FEBF}' // DAD stringdef dh2 '{U+FEC0}' // DAD stringdef dh3 '{U+FEBD}' // DAD stringdef dh4 '{U+FEBE}' // DAD stringdef tt1 '{U+FEC3}' // TAH stringdef tt2 '{U+FEC4}' // TAH stringdef tt3 '{U+FEC1}' // TAH stringdef tt4 '{U+FEC2}' // TAH stringdef zh1 '{U+FEC7}' // ZAH stringdef zh2 '{U+FEC8}' // ZAH stringdef zh3 '{U+FEC5}' // ZAH stringdef zh4 '{U+FEC6}' // ZAH stringdef i1 '{U+FECB}' // AIN stringdef i2 '{U+FECC}' // AIN stringdef i3 '{U+FEC9}' // AIN stringdef i4 '{U+FECA}' // AIN stringdef gh1 '{U+FECF}' // GHAIN stringdef gh2 '{U+FED0}' // GHAIN stringdef gh3 '{U+FECD}' // GHAIN stringdef gh4 '{U+FECE}' // GHAIN stringdef f1 '{U+FED3}' // FEH stringdef f2 '{U+FED4}' // FEH stringdef f3 '{U+FED1}' // FEH stringdef f4 '{U+FED2}' // FEH stringdef q1 '{U+FED7}' // QAF stringdef q2 '{U+FED8}' // QAF stringdef q3 '{U+FED5}' // QAF stringdef q4 '{U+FED6}' // QAF stringdef k1 '{U+FEDB}' // KAF stringdef k2 '{U+FEDC}' // KAF stringdef k3 '{U+FED9}' // KAF stringdef k4 '{U+FEDA}' // KAF stringdef l1 '{U+FEDF}' // LAM stringdef l2 
'{U+FEE0}' // LAM stringdef l3 '{U+FEDD}' // LAM stringdef l4 '{U+FEDE}' // LAM stringdef m1 '{U+FEE3}' // MEEM stringdef m2 '{U+FEE4}' // MEEM stringdef m3 '{U+FEE1}' // MEEM stringdef m4 '{U+FEE2}' // MEEM stringdef n1 '{U+FEE7}' // NOON stringdef n2 '{U+FEE8}' // NOON stringdef n3 '{U+FEE5}' // NOON stringdef n4 '{U+FEE6}' // NOON stringdef e1 '{U+FEEB}' // HEH stringdef e2 '{U+FEEC}' // HEH stringdef e3 '{U+FEE9}' // HEH stringdef e4 '{U+FEEA}' // HEH stringdef w1 '{U+FEED}' // WAW stringdef w2 '{U+FEEE}' // WAW stringdef a_1 '{U+FEEF}' // ALEF_MAKSURA stringdef a_2 '{U+FEF0}' // ALEF_MAKSURA stringdef y1 '{U+FEF3}' // YEH stringdef y2 '{U+FEF4}' // YEH stringdef y3 '{U+FEF1}' // YEH stringdef y4 '{U+FEF2}' // YEH // Ligatures Lam-Alef stringdef la '{U+FEFB}' // LAM_ALEF stringdef la2 '{U+FEFC}' // LAM_ALEF stringdef lao '{U+FEF7}' // LAM_ALEF_HAMZA_ABOVE stringdef lao2 '{U+FEF8}' // LAM_ALEF_HAMZA_ABOVE stringdef lao_ '{U+FEF9}' // LAM_ALEF_HAMZA_BELOW stringdef lao_2 '{U+FEFA}' // LAM_ALEF_HAMZA_BELOW stringdef la~ '{U+FEF5}' // LAM_ALEF_MADDA_ABOVE stringdef la~2 '{U+FEF6}' // LAM_ALEF_MADDA_ABOVE booleans ( is_noun is_verb is_defined ) routines ( Prefix_Step1 Prefix_Step2 Prefix_Step3a_Noun Prefix_Step3b_Noun Prefix_Step3_Verb Prefix_Step4_Verb Suffix_All_alef_maqsura Suffix_Noun_Step1a Suffix_Noun_Step1b Suffix_Noun_Step2a Suffix_Noun_Step2b Suffix_Noun_Step2c1 Suffix_Noun_Step2c2 Suffix_Noun_Step3 Suffix_Verb_Step1 Suffix_Verb_Step2a Suffix_Verb_Step2b Suffix_Verb_Step2c Normalize_post Normalize_pre Checks1 ) externals ( stem ) groupings ( ) // Normalizations define Normalize_pre as ( do repeat ( ( [substring] among ( '{aan}' '{uun}' '{iin}' '{aa}' '{uu}' '{ii}' '{oo}' '{~}'( delete ) // strip vocalization '{_}' ( delete ) // strip kasheeda // Hindu–Arabic numerals '{0}' ( <- '0') '{1}' ( <- '1') '{2}' ( <- '2') '{3}' ( <- '3') '{4}' ( <- '4') '{5}' ( <- '5') '{6}' ( <- '6') '{7}' ( <- '7') '{8}' ( <- '8') '{9}' ( <- '9') // Shaped forms '{o1}' ( <- '{o}' 
) // HAMZA '{ao1}' '{ao2}' ( <- '{ao}' ) // ALEF_HAMZA_ABOVE '{ao_1}' '{ao_2}' ( <- '{ao_}' ) // ALEF_HAMZA_BELOW '{yo1}' '{yo2}' '{yo3}' '{yo4}' ( <- '{yo}' ) // YEH_HAMZA '{a~1}' '{a~2}'( <- '{a~}' ) // ALEF_MADDA '{wo1}' '{wo2}'( <- '{wo}' ) // WAW_HAMZA '{a1}' '{a2}' ( <- '{a}' ) // ALEF '{b1}' '{b2}' '{b3}' '{b4}' ( <- '{b}' ) // BEH '{t_1}' '{t_2}' ( <- '{t_}' ) // TEH_MARBUTA '{t1}' '{t2}' '{t3}' '{t4}' ( <- '{t}' ) // TEH '{th1}' '{th2}' '{th3}' '{th4}' ( <- '{th}' ) // THEH '{j1}' '{j2}' '{j3}' '{j4}'( <- '{j}' ) // JEEM '{h1}' '{h2}' '{h3}' '{h4}' ( <- '{h}' ) // HAH '{x1}' '{x2}' '{x3}' '{x4}'( <- '{x}' ) // KHAH '{d1}' '{d2}' ( <- '{d}' ) // DAL '{dz1}''{dz2}' ( <- '{dz}' ) // THAL '{r1}' '{r2}'( <- '{r}' ) // REH '{z1}' '{z2}' ( <- '{z}' ) // ZAIN '{s1}' '{s2}' '{s3}' '{s4}'( <- '{s}' ) // SEEN '{sh1}' '{sh2}' '{sh3}' '{sh4}' ( <- '{sh}' ) // SHEEN '{c1}' '{c2}' '{c3}' '{c4}'( <- '{c}' ) // SAD '{dh1}' '{dh2}' '{dh3}' '{dh4}'( <- '{dh}' ) // DAD '{tt1}' '{tt2}' '{tt3}' '{tt4}' ( <- '{tt}' ) // TAH '{zh1}' '{zh2}' '{zh3}' '{zh4}'( <- '{zh}' ) // ZAH '{i1}' '{i2}' '{i3}' '{i4}'( <- '{i}' ) // AIN '{gh1}' '{gh2}' '{gh3}' '{gh4}'( <- '{gh}' ) // GHAIN '{f1}' '{f2}' '{f3}' '{f4}' ( <- '{f}' ) // FEH '{q1}' '{q2}' '{q3}' '{q4}' ( <- '{q}' ) // QAF '{k1}' '{k2}' '{k3}' '{k4}'( <- '{k}' ) // KAF '{l1}' '{l2}' '{l3}' '{l4}'( <- '{l}' ) // LAM '{m1}' '{m2}' '{m3}' '{m4}' ( <- '{m}' ) // MEEM '{n1}' '{n2}' '{n3}' '{n4}'( <- '{n}' ) // NOON '{e1}' '{e2}' '{e3}' '{e4}' ( <- '{e}' ) // HEH '{w1}' '{w2}' ( <- '{w}' ) // WAW '{a_1}' '{a_2}' ( <- '{a_}' ) // ALEF_MAKSURA '{y1}' '{y2}' '{y3}' '{y4}' ( <- '{y}' ) // YEH // Ligatures Lam-Alef '{la}' '{la2}' (<- '{l}{a}') '{lao}' '{lao2}' (<- '{l}{ao}') '{lao_}' '{lao_2}' (<- '{l}{ao_}') '{la~}' '{la~2}' (<- '{l}{a~}') ) ) or next ) ) define Normalize_post as ( do ( // normalize last hamza backwards ( [substring] among ( '{ao}''{ao_}' '{a~}' ( <- '{o}') '{wo}' ( <- '{o}') '{yo}' ( <- '{o}') ) ) ) do repeat ( ( // normalize 
other hamza's [substring] among ( '{ao}''{ao_}' '{a~}' ( <- '{a}') '{wo}' ( <- '{w}') '{yo}' ( <- '{y}') ) ) or next ) ) // Checks define Checks1 as ( [substring] among ( '{b}{a}{l}' '{k}{a}{l}' ($(len > 4) set is_noun unset is_verb set is_defined) '{l}{l}' '{a}{l}' ($(len > 3) set is_noun unset is_verb set is_defined) ) ) //prefixes define Prefix_Step1 as ( [substring] among ( '{ao}{ao}' ($(len > 3) <- '{ao}' ) '{ao}{a~}' ($(len > 3) <- '{a~}' ) '{ao}{wo}' ($(len > 3) <- '{ao}' ) '{ao}{a}' ($(len > 3) <- '{a}' ) '{ao}{ao_}' ($(len > 3) <- '{ao_}' ) // '{ao}' ($(len > 3) delete) //rare case ) ) define Prefix_Step2 as ( [substring] among ( '{f}' '{w}' ($(len > 3) not '{a}' delete) ) ) define Prefix_Step3a_Noun as ( // it is noun and defined [substring] among ( '{b}{a}{l}' '{k}{a}{l}' ($(len > 5) delete) '{l}{l}' '{a}{l}' ($(len > 4) delete) ) ) define Prefix_Step3b_Noun as ( // probably noun and defined [substring] among ( '{b}{a}' ( ) // exception - not a valid verb prefix so can just succeed here '{b}' ($(len > 3) delete) // '{k}' '{l}' ($(len > 3) delete) // BUG: cause confusion '{b}{b}' ($(len > 3) <- '{b}' ) '{k}{k}' ($(len > 3) <- '{k}' ) ) ) define Prefix_Step3_Verb as ( [substring] among ( //'{s}' ($(len > 4) delete)// BUG: cause confusion '{s}{y}' ($(len > 4) <- '{y}' ) '{s}{t}' ($(len > 4) <- '{t}') '{s}{n}' ($(len > 4) <- '{n}') '{s}{ao}' ($(len > 4) <- '{ao}') ) ) define Prefix_Step4_Verb as ( [substring] among ( '{y}{s}{t}' '{n}{s}{t}' '{t}{s}{t}' ($(len > 4) set is_verb unset is_noun <- '{a}{s}{t}' ) ) ) // suffixes backwardmode ( define Suffix_Noun_Step1a as ( [substring] among ( '{y}' '{k}' '{e}' ($(len >= 4) delete) '{n}{a}' '{k}{m}' '{e}{a}' '{e}{n}' '{e}{m}' ($(len >= 5) delete) '{k}{m}{a}' '{e}{m}{a}' ($(len >= 6) delete) ) ) define Suffix_Noun_Step1b as ( [substring] among ( '{n}' ($(len > 5) delete) ) ) define Suffix_Noun_Step2a as ( [substring] among ( '{a}' '{y}' '{w}' ($(len > 4) delete) ) ) define Suffix_Noun_Step2b as ( [substring] among ( 
'{a}{t}' ($(len >= 5) delete) ) ) define Suffix_Noun_Step2c1 as ( [substring] among ( '{t}' ($(len >= 4) delete) ) ) define Suffix_Noun_Step2c2 as ( // feminine t_ [substring] among ( '{t_}' ($(len >= 4) delete) ) ) define Suffix_Noun_Step3 as ( // ya' nisbiya [substring] among ( '{y}' ($(len >= 3) delete) ) ) define Suffix_Verb_Step1 as ( [substring] among ( '{e}' '{k}' ($(len >= 4) delete) '{n}{y}' '{n}{a}' '{e}{a}' '{e}{m}' '{e}{n}' '{k}{m}' '{k}{n}' ($(len >= 5) delete) '{e}{m}{a}' '{k}{m}{a}' '{k}{m}{w}'($(len >= 6) delete) ) ) define Suffix_Verb_Step2a as ( [substring] among ( '{t}' ($(len >= 4) delete) '{a}' '{n}' '{y}' ($(len >= 4) delete) '{n}{a}' '{t}{a}' '{t}{n}' ($(len >= 5) delete)// past '{a}{n}' '{w}{n}' '{y}{n}' ($(len > 5) delete) // present '{t}{m}{a}' ($(len >= 6) delete) ) ) define Suffix_Verb_Step2b as ( [substring] among ( '{w}{a}' '{t}{m}' ($(len >= 5) delete) ) ) define Suffix_Verb_Step2c as ( [substring] among ( '{w}' ($(len >= 4) delete) '{t}{m}{w}' ($(len >= 6) delete) ) ) define Suffix_All_alef_maqsura as ( [substring] among ( '{a_}' ( <- '{y}' ) // spell error // '{a_}' ( delete ) // if noun > 3 // '{a_}' ( <- '{a}') // if verb ) ) ) define stem as ( // set initial values set is_noun set is_verb unset is_defined // guess type and properties do Checks1 // normalization pre-stemming do Normalize_pre backwards ( do ( //Suffixes for verbs ( is_verb ( ( (atleast 1 Suffix_Verb_Step1) ( Suffix_Verb_Step2a or Suffix_Verb_Step2c or next) ) or Suffix_Verb_Step2b or Suffix_Verb_Step2a ) ) //Suffixes for nouns or ( is_noun ( try ( Suffix_Noun_Step2c2 or (not is_defined Suffix_Noun_Step1a ( Suffix_Noun_Step2a or Suffix_Noun_Step2b or Suffix_Noun_Step2c1 or next)) or (Suffix_Noun_Step1b ( Suffix_Noun_Step2a or Suffix_Noun_Step2b or Suffix_Noun_Step2c1)) or (not is_defined Suffix_Noun_Step2a) or (Suffix_Noun_Step2b) ) Suffix_Noun_Step3 ) ) // Suffixes for alef maqsura or Suffix_All_alef_maqsura ) ) //Prefixes do ( try Prefix_Step1 try Prefix_Step2 ( 
Prefix_Step3a_Noun or (is_noun Prefix_Step3b_Noun) or (is_verb try Prefix_Step3_Verb Prefix_Step4_Verb) ) ) // normalization post-stemming do Normalize_post ) snowball-2.2.0/algorithms/armenian.sbl000066400000000000000000000162551414263061200200110ustar00rootroot00000000000000stringescapes {} stringdef a '{U+0561}' // 531 stringdef b '{U+0562}' // 532 stringdef g '{U+0563}' // 533 stringdef d '{U+0564}' // 534 stringdef ye '{U+0565}' // 535 stringdef z '{U+0566}' // 536 stringdef e '{U+0567}' // 537 stringdef y '{U+0568}' // 538 stringdef dt '{U+0569}' // 539 stringdef zh '{U+056A}' // 53A stringdef i '{U+056B}' // 53B stringdef l '{U+056C}' // 53C stringdef kh '{U+056D}' // 53D stringdef ts '{U+056E}' // 53E stringdef k '{U+056F}' // 53F stringdef h '{U+0570}' // 540 stringdef dz '{U+0571}' // 541 stringdef gh '{U+0572}' // 542 stringdef djch '{U+0573}' // 543 stringdef m '{U+0574}' // 544 stringdef j '{U+0575}' // 545 stringdef n '{U+0576}' // 546 stringdef sh '{U+0577}' // 547 stringdef vo '{U+0578}' // 548 stringdef ch '{U+0579}' // 549 stringdef p '{U+057A}' // 54A stringdef dj '{U+057B}' // 54B stringdef r '{U+057C}' // 54C stringdef s '{U+057D}' // 54D stringdef v '{U+057E}' // 54E stringdef t '{U+057F}' // 54F stringdef r' '{U+0580}' // 550 stringdef c '{U+0581}' // 551 stringdef u '{U+0582}' // 552 //vjun stringdef bp '{U+0583}' // 553 stringdef q '{U+0584}' // 554 stringdef ev '{U+0587}' stringdef o '{U+0585}' // 555 stringdef f '{U+0586}' // 556 routines ( mark_regions R2 adjective verb noun ending ) externals ( stem ) integers ( pV p2 ) groupings ( v ) define v '{a}{e}{i}{o}{u}{ye}{vo}{y}' define mark_regions as ( $pV = limit $p2 = limit do ( gopast v setmark pV gopast non-v gopast v gopast non-v setmark p2 ) ) backwardmode ( define R2 as $p2 <= cursor define adjective as ( [substring] among ( '{b}{a}{r'}' '{p}{ye}{s}' '{vo}{r'}{e}{n}' '{vo}{v}{i}{n}' '{a}{k}{i}' '{l}{a}{j}{n}' '{r'}{vo}{r'}{d}' '{ye}{r'}{vo}{r'}{d}' '{a}{k}{a}{n}' '{a}{l}{i}' 
'{k}{vo}{t}' '{ye}{k}{ye}{n}' '{vo}{r'}{a}{k}' '{ye}{gh}' '{v}{vo}{u}{n}' '{ye}{r'}{ye}{n}' '{a}{r'}{a}{n}' '{ye}{n}' '{a}{v}{ye}{t}' '{g}{i}{n}' '{i}{v}' '{a}{t}' '{i}{n}' (delete) ) ) define verb as ( [substring] among ( '{vo}{u}{m}' '{v}{vo}{u}{m}' '{a}{l}{vo}{u}' '{ye}{l}{vo}{u}' '{v}{ye}{l}' '{a}{n}{a}{l}' '{ye}{l}{vo}{u}{c}' '{a}{l}{vo}{u}{c}' '{y}{a}{l}' '{y}{ye}{l}' '{a}{l}{vo}{v}' '{ye}{l}{vo}{v}' '{a}{l}{i}{s}' '{ye}{l}{i}{s}' '{ye}{n}{a}{l}' '{a}{c}{n}{a}{l}' '{ye}{c}{n}{ye}{l}' '{c}{n}{ye}{l}' '{n}{ye}{l}' '{a}{t}{ye}{l}' '{vo}{t}{ye}{l}' '{k}{vo}{t}{ye}{l}' '{t}{ye}{l}' '{v}{a}{ts}' '{ye}{c}{v}{ye}{l}' '{a}{c}{v}{ye}{l}' '{ye}{c}{i}{r'}' '{a}{c}{i}{r'}' '{ye}{c}{i}{n}{q}' '{a}{c}{i}{n}{q}' '{v}{ye}{c}{i}{r'}' '{v}{ye}{c}{i}{n}{q}' '{v}{ye}{c}{i}{q}' '{v}{ye}{c}{i}{n}' '{a}{c}{r'}{i}{r'}' '{a}{c}{r'}{ye}{c}' '{a}{c}{r'}{i}{n}{q}' '{a}{c}{r'}{i}{q}' '{a}{c}{r'}{i}{n}' '{ye}{c}{i}{q}' '{a}{c}{i}{q}' '{ye}{c}{i}{n}' '{a}{c}{i}{n}' '{a}{c}{a}{r'}' '{a}{c}{a}{v}' '{a}{c}{a}{n}{q}' '{a}{c}{a}{q}' '{a}{c}{a}{n}' '{v}{ye}{c}{i}' '{a}{c}{r'}{i}' '{ye}{c}{a}{r'}' '{ye}{c}{a}{v}' '{c}{a}{n}{q}' '{c}{a}{q}' '{c}{a}{n}' '{a}{c}{a}' '{a}{c}{i}' '{ye}{c}{a}' '{ch}{ye}{l}' '{ye}{c}{i}' '{a}{r'}' '{a}{v}' '{a}{n}{q}' '{a}{q}' '{a}{n}' '{a}{l}' '{ye}{l}' '{ye}{c}' '{a}{c}' '{v}{ye}' '{a}' (delete) ) ) define noun as ( [substring] among ( '{a}{ts}{vo}' '{a}{n}{a}{k}' '{a}{n}{o}{c}' '{a}{r'}{a}{n}' '{a}{r'}{q}' '{p}{a}{n}' '{s}{t}{a}{n}' '{ye}{gh}{e}{n}' '{ye}{n}{q}' '{i}{k}' '{i}{ch}' '{i}{q}' '{m}{vo}{u}{n}{q}' '{j}{a}{k}' '{j}{vo}{u}{n}' '{vo}{n}{q}' '{vo}{r'}{d}' '{vo}{c}' '{ch}{ye}{q}' '{v}{a}{ts}{q}' '{v}{vo}{r'}' '{a}{v}{vo}{r'}' '{vo}{u}{dt}{j}{vo}{u}{n}' '{vo}{u}{k}' '{vo}{u}{h}{i}' '{vo}{u}{j}{dt}' '{vo}{u}{j}{q}' '{vo}{u}{s}{t}' '{vo}{u}{s}' '{c}{i}' '{a}{l}{i}{q}' '{a}{n}{i}{q}' '{i}{l}' '{i}{ch}{q}' '{vo}{u}{n}{q}' '{g}{a}{r'}' '{vo}{u}' '{a}{k}' '{a}{n}' '{q}' (delete) ) ) define ending as ( [substring] R2 among ( '{n}{ye}{r'}{y}' '{n}{ye}{r'}{n}' 
'{n}{ye}{r'}{i}' '{n}{ye}{r'}{d}' '{ye}{r'}{i}{c}' '{n}{ye}{r'}{i}{c}' '{ye}{r'}{i}' '{ye}{r'}{d}' '{ye}{r'}{n}' '{ye}{r'}{y}' '{n}{ye}{r'}{i}{n}' '{vo}{u}{dt}{j}{a}{n}{n}' '{vo}{u}{dt}{j}{a}{n}{y}' '{vo}{u}{dt}{j}{a}{n}{s}' '{vo}{u}{dt}{j}{a}{n}{d}' '{vo}{u}{dt}{j}{a}{n}' '{ye}{r'}{i}{n}' '{i}{n}' '{s}{a}' '{vo}{dj}' '{i}{c}' '{ye}{r'}{vo}{v}' '{n}{ye}{r'}{vo}{v}' '{ye}{r'}{vo}{u}{m}' '{n}{ye}{r'}{vo}{u}{m}' '{vo}{u}{n}' '{vo}{u}{d}' '{v}{a}{n}{s}' '{v}{a}{n}{y}' '{v}{a}{n}{d}' '{a}{n}{y}' '{a}{n}{d}' '{v}{a}{n}' '{vo}{dj}{y}' '{vo}{dj}{s}' '{vo}{dj}{d}' '{vo}{c}' '{vo}{u}{c}' '{vo}{dj}{i}{c}' '{c}{i}{c}' '{v}{i}{c}' '{v}{i}' '{v}{vo}{v}' '{vo}{v}' '{a}{n}{vo}{v}' '{a}{n}{vo}{u}{m}' '{v}{a}{n}{i}{c}' '{a}{m}{b}' '{a}{n}' '{n}{ye}{r'}' '{ye}{r'}' '{v}{a}' '{y}' '{n}' '{d}' '{c}' '{i}' (delete) ) ) ) define stem as ( do mark_regions backwards setlimit tomark pV for ( do ending do verb do adjective do noun ) ) snowball-2.2.0/algorithms/basque.sbl000066400000000000000000000120571414263061200174730ustar00rootroot00000000000000routines ( aditzak izenak adjetiboak mark_regions RV R2 R1 ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v ) stringescapes {} /* special characters */ stringdef n~ '{U+00F1}' define v 'aeiou' define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v (non-v gopast v) or (v gopast non-v) ) or ( non-v (non-v gopast v) or (v next) ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) backwardmode ( define RV as $pV <= cursor define R2 as $p2 <= cursor define R1 as $p1 <= cursor define aditzak as ( [substring] among( 'le' 'la' 'tzaile' 'aldatu' 'atu' 'tzailea' 'taile' 'tailea' 'pera' 'gale' 'galea' 'gura' 'kura' 'kor' 'korra' 'or' 'orra' 'tun' 'tuna' 'gaitz' 'gaitza' 'kaitz' 'kaitza' 'ezin' 'ezina' 'tezin' 'tezina' 'errez' 'erreza' 'karri' 'karria' 'tzaga' 'tzaka' 'tzake' 'tzeke' 'ez' 'eza' 'tzez' 'keta' 'eta' 'etan' 'pen' 'pena' 'tze' 'atze' 'kuntza' 'kunde' 'kundea' 'kune' 'kunea' 
'kuna' 'kera' 'era' 'kizun' 'kizuna' 'dura' 'tura' 'men' 'mena' 'go' 'ago' 'tio' 'taldi' 'taldia' 'aldi' 'aldia' 'gune' 'gunea' 'bide' 'bidea' 'pide' 'pidea' 'gai' 'gaia' 'ki' 'kin' 'rekin' 'kina' 'kari' 'karia' 'ari' 'tari' 'etari' 'gailu' 'gailua' 'kide' 'kidea' 'ide' 'idea' 'du' 'ka' 'kan' 'an' 'ean' 'tu' 'lari' 'tatu' 'rean' 'tarazi' 'arazi' 'tzat' 'bera' 'dako' ( RV delete ) 'garri' 'garria' 'tza' (R2 delete) 'atseden' (<- 'atseden') 'arabera' (<- 'arabera') 'baditu' (<- 'baditu') ) ) define izenak as ( [substring] among( 'ari' 'aria' 'bizia' 'kari' 'karia' 'lari' 'laria' 'tari' 'taria' 'zain' 'zaina' 'tzain' 'tzaina' 'zale' 'zalea' 'tzale' 'tzalea' 'aizun' 'orde' 'ordea' 'burua' 'ohi' 'ohia' 'kintza' 'gintzo' 'gintzu' 'tzu' 'tzua' 'tzo' 'tzoa' 'kuntza' 'talde' 'taldea' 'eria' 'keria' 'teria' 'di' 'za' 'ada' 'tara' 'etara' 'tra' 'ta' 'tegi' 'tegia' 'keta' 'z' 'zko' 'zkoa' 'ti' 'tia' 'tsu' 'tsua' 'zu' 'zua' 'bera' 'pera' 'zto' 'ztoa' 'asi' 'asia' 'gile' 'gilea' 'estu' 'estua' 'larri' 'larria' 'nahi' 'nahia' 'koi' 'koia' 'oi' 'oia' 'goi' 'min' 'mina' 'dun' 'duna' 'duru' 'durua' 'duri' 'duria' 'os' 'osa' 'oso' 'osoa' 'ar' 'ara' 'tar' 'dar' 'dara' 'tiar' 'tiara' 'liar' 'liara' 'gabe' 'gabea' 'kabe' 'kabea' 'ga' 'ge' 'kada' 'tasun' 'tasuna' 'asun' 'asuna' 'go' 'mendu' 'mendua' 'mentu' 'mentua' 'mendi' 'mendia' 'zio' 'zioa' 'zino' 'zinoa' 'zione' 'zionea' 'ezia' 'degi' 'degia' 'egi' 'egia' 'toki' 'tokia' 'leku' 'lekua' 'gintza' 'alde' 'aldea' 'kalde' 'kaldea' 'gune' 'gunea' 'une' 'unea' 'una' 'pe' 'pea' 'gibel' 'gibela' 'ondo' 'ondoa' 'arte' 'artea' 'aurre' 'aurrea' 'etxe' 'etxea' 'ola' 'ontzi' 'ontzia' 'gela' 'denda' 'taldi' 'taldia' 'aldi' 'aldia' 'te' 'tea' 'zaro' 'zaroa' 'taro' 'taroa' 'oro' 'oroa' 'aro' 'aroa' 'ero' 'eroa' 'eroz' 'eroza' 'ka' 'kan' 'kana' 'tako' 'etako' 'takoa' 'kote' 'kotea' 'tzar' 'tzarra' 'handi' 'handia' 'kondo' 'kondoa' 'skila' 'no' 'noa' '{n~}o' '{n~}oa' 'ska' 'xka' 'zka' 'tila' 'to' 'toa' 'tto' 'ttoa' 'txo' 'txoa' 'txu' 'txua' 'anda' 
'anga' 'urren' 'urrena' 'gai' 'gaia' 'gei' 'geia' 'eme' 'emea' 'kume' 'kumea' 'sa' 'ko' 'eko' 'koa' 'ena' 'enea' 'ne' 'nea' 'kor' 'korra' 'ez' 'eza' 'eta' 'etan' 'ki' 'kia' 'kin' 'kina' 'tu' 'tua' 'du' 'dua' 'ek' 'tarik' 'tariko' 'tan' 'ordu' 'ordua' 'oste' 'ostea' 'tzara' 'ra' 'antza' 'behar' 'ro' 'giro' 'ak' 'zp' 'ket' 'kail' 'kaila' 'ail' 'kirri' 'kirria' 'ngo' 'ngoa' '{n~}i' 'sko' 'sta' 'koitz' 'koitza' 'na' 'garren' 'garrena' 'kera' 'gerren' 'gerrena' 'garna' 'kide' 'tz' 'tuko' ( RV delete ) 'ora' 'garri' 'garria' 'or' 'buru' 'ren' 'tza' ( R2 delete ) 'joka' (<- 'jok') 'tzen' 'ten' 'en' 'tatu' (R1 delete) 'trako' (<- 'tra') 'minutuko' (<- 'minutu') 'zehar' (<- 'zehar') 'geldi' (<- 'geldi') 'igaro' (<- 'igaro') 'aurka' (<- 'aurka') ) ) define adjetiboak as ( [substring] among( 'era' 'ero' 'go' 'tate' 'tade' 'date' 'dade' 'keria' 'ki' 'to' 'ro' 'la' 'gi' 'larik' 'lanik' 'ik' 'ztik' 'rik' ( RV delete ) 'zlea' (<- 'z') ) ) ) define stem as ( do mark_regions backwards ( repeat aditzak repeat izenak do adjetiboak ) ) /* Note 1: additions of 21 Jul 2010 */ snowball-2.2.0/algorithms/catalan.sbl000066400000000000000000000172101414263061200176120ustar00rootroot00000000000000routines ( cleaning mark_regions R1 R2 attached_pronoun standard_suffix verb_suffix residual_suffix ) externals ( stem ) integers ( p1 p2 ) groupings ( v ) stringescapes {} /* special characters */ stringdef a' '{U+00E1}' // a-acute stringdef a` '{U+00E0}' // a-grave stringdef c, '{U+00E7}' // c-cedilla stringdef e' '{U+00E9}' // e-acute stringdef e` '{U+00E8}' // e-grave stringdef i' '{U+00ED}' // i-acute stringdef i` '{U+00EC}' // i-grave stringdef i" '{U+00EF}' // i-diaeresis stringdef o' '{U+00F3}' // o-acute stringdef o` '{U+00F2}' // o-grave stringdef u' '{U+00FA}' // u-acute stringdef u" '{U+00FC}' // u-diaeresis stringdef . 
'{U+00B7}' // - per l aggeminades define v 'aeiou{a'}{a`}{e'}{e`}{i'}{i"}{o'}{o`}{u'}{u"}' define mark_regions as ( $p1 = limit $p2 = limit // defaults do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define cleaning as repeat ( [substring] among( '{a'}' (<- 'a') '{a`}' (<- 'a') '{e'}' (<- 'e') '{e`}' (<- 'e') '{i'}' (<- 'i') '{i`}' (<- 'i') '{o'}' (<- 'o') '{o`}' (<- 'o') '{u'}' (<- 'u') '{u"}' (<- 'u') '{i"}' (<- 'i') '{.}' (<- '.') '' (next) ) ) backwardmode ( define R1 as $p1 <= cursor define R2 as $p2 <= cursor define attached_pronoun as ( [substring] among ( '{'}s' '{'}hi' '{'}ho' '{'}l' '{'}ls' '-ls' '-la' '-les' '-li' 'vos' 'se' 'nos' '-nos' '-us' 'us' '{'}n' '{'}ns' '-n' '-ns' '{'}m' '-me' '-m' '-te' '{'}t' 'li' 'lo' 'los' 'me' 'sela' 'selo' 'selas' 'selos' 'le' 'la' 'las' 'les' 'ens' 'ho' 'hi' (R1 delete) ) ) define standard_suffix as ( [substring] among( 'ar' 'atge' 'formes' 'icte' 'ictes' 'ell' 'ells' 'ella' '{e'}s' '{e`}s' 'esc' 'essa' 'et' 'ets' 'eta' 'eres' 'eries' 'ers' 'ina' 'ines' 'able' 'ls' 'i{o'}' 'itat' 'itats' 'itzar' 'iva' 'ives' 'ivisme' 'ius' 'fer' 'ment' 'amen' 'ament' 'aments' 'ments' 'ot' 'sfera' 'al' 'als' 'era' 'ana' 'iste' 'aire' 'eria' 'esa' 'eses' 'esos' 'or' '{i'}cia' '{i'}cies' 'icis' 'ici' '{i'}ci' '{i'}cis' '{a`}ria' '{a`}ries' 'alla' 'ci{o'}' 'cions' 'n{c,}a' 'nces' '{o'}' 'dor' 'all' 'il' '{i'}stic' 'enc' 'enca' '{i'}s' 'issa' 'issos' '{i'}ssem' '{i'}ssiu' 'issem' 'isseu' '{i'}sseu' '{o'}s' 'osa' 'dora' 'dores' 'dors' 'adura' 'ble' 'bles' '{i'}vol' '{i'}vola' 'd{i'}s' 'egar' 'ejar' 'ificar' 'itar' 'ables' 'adors' 'idores' 'idors' 'adora' 'aci{o'}' 'doras' 'dur' 'dures' 'alleng{u"}es' 'ant' 'ants' 'ancia' 'ancies' 'at{o`}ria' 'at{o`}ries' 'tori' 'toris' 'ats' 'ions' 'ota' 'isam' 'ors' 'ora' 'ores' 'isament' 'bilitat' 'bilitats' 'ivitat' 'ivitats' 'ari' 'aris' 'ionisme' 'ionista' 'ionistes' 'ialista' 'ialistes' 'ialisme' 'ialismes' 'ud' 'uts' 'uds' 'encia' 'encies' '{e`}ncia' '{e`}ncies' '{i"}tat' 
'{i"}tats' 'atiu' 'atius' 'atives' 'ativa' 'ativitat' 'ativitats' 'ible' 'ibles' 'assa' 'asses' 'assos' 'ent' 'ents' '{i'}ssim' '{i'}ssima' '{i'}ssims' '{i'}ssimes' '{i`}ssem' '{i`}sseu' '{i`}ssin' 'ims' 'ima' 'imes' 'isme' 'ista' 'ismes' 'istes' 'inia' 'inies' '{i'}inia' '{i'}nies' 'ita' 'ites' 'triu' 'trius' 'oses' 'osos' 'ient' 'otes' 'ots' (R1 delete) 'acions' 'ada' 'ades' (R2 delete) 'log{i'}a' 'log{i'}es''logia' 'logies' 'logi' 'logis' 'l{o'}gica' 'l{o'}gics' 'l{o'}giques' (R2 <- 'log') 'ic' 'ica' 'ics' 'iques' (R2 <- 'ic') 'qu{i'}ssim' 'qu{i'}ssims' 'qu{i'}ssimes' 'qu{i'}ssima' (R1 <- 'c') ) ) define verb_suffix as ( [substring] among( 'ador' 'adora' 'adors' 'adores' 're' 'ie' 'ent' 'ents' 'udes' 'ar{a`}' 'eren' 'ar{a'}' 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais' 'aria' 'arian' 'arien' 'aries' 'ar{a`}s' 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ara' 'ar{e'}' 'ar{e'}s' 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais' 'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}' 'er{e'}' 'er' 'erau' 'erass' 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais' 'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}' 'ir{e'}' '{i'}rem' '{i'}reu' '{i'}eu' 'ia' 'ies' '{i'}em' '{i`}eu' 'ien' 'at' 'ut' 'uda' 'ava' 'aves' 'avem' '{a'}vem' '{a`}vem' '{a`}veu' '{a'}veu' 'aven' 'au' 'ats' 'asseu' 'esseu' 'eresseu' '{a`}sseu' '{a`}ssem' '{a`}ssim' '{a`}ssiu' 'essen' 'esses' 'assen' 'asses' 'assim' 'assiu' '{e'}ssen' '{e'}sseu' '{e'}ssim' '{e'}ssiu' '{e'}ssem' '{i'}' 'ares' '{a`}rem' '{a`}reu' '{a`}ren' 'ar{i'}em' 'ar{i'}eu' 'areu' 'aren' 'ant' '{i"}m' '{i"}u' '{e'}s' '{i"}en' 'en' 'es' 'em' 'am' 'ams' '{i"}a' '{i"}es' 'dre' 'eix' 'eixer' 'tzar' 'eixes' 'ides' '{i"}des' 'it' '{i"}t' '{i"}da' 'aba' 'ada' 'ades' 'ida' '{i'}a' 'iera' 'ad' 'ed' 'its' 'id' 'ids' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an' 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado' 'ido' 'iendo' 'i{o'}' 'ar' 'ir' 'as' 'ieu' 'ii' 'io' 'i{a`}' 'ess' 'essin' 'essis' 'ass' 'assin' 'assis' 'essim' 
'{e`}ssim' '{e`}ssiu' 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases' 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais' 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados' 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos' 'ques' '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos' 'ira' 'iran' 'irem' 'iren' 'ires' 'ireu' 'iria' 'irien' 'iries' 'ir{a`}' 'ir{a`}s' 'ir{e`}' 'ir{i`}em' 'ir{i`}eu' 'isquen' 'iguem' 'igueu' 'esqui' 'esquin' 'esquis' 'eixi' 'eixin' 'eixis' 'eixen' 'eixo' 'isin' 'isis' 'esques' 'sis' 'sin' 'int' 'ir{i'}em' 'ir{i'}eu' 'isc' 'atges' 'esca' 'esquen' 'issen' 'isses' 'issin' 'issis' 'isca' 'issiu' 'issim' '{i"}sc' '{i"}sca' '{i"}ssin' '{i'}ssiu' '{i'}ssim' '{i"}ssis' '{i"}guem' '{i"}gueu' '{i"}ra' '{i"}ren' '{i"}res' '{i"}squen' '{i"}sques' '{i"}ssen' '{i"}sses' '{i"}xo' '{i"}xen' '{i"}xes' '{i"}x' 'ixo' 'ixen' 'ixes' 'ix' 'ixa' 'inin' 'inis' 'ini' 'ineu' 'itza' 'itzi' 'itzeu' 'itzis' 'itzo' 'itz' 'itz{a`}' 'arem' 'in' '{a`}s' 'i{i"}' 'i{i"}n' 'i{i"}s' (R1 delete) 'ando' (R2 delete) ) ) define residual_suffix as ( [substring] among( 'os' 'a' 'o' '{a'}' '{a`}' '{i'}' '{o'}' 'e' '{e'}' 'eu' 'iu' 'is' 'i' 'ir' 's' '{i`}' 'itz' '{i"}' '{i"}n' '{i"}s' 'it' (R1 delete) 'iqu' (R1 <- 'ic') ) ) ) define stem as ( do mark_regions backwards ( do attached_pronoun do ( standard_suffix or verb_suffix ) do residual_suffix ) do cleaning ) /* First works 2010/07/19 First Grammatical Reviews: https://ca.wikipedia.org/wiki/Gram%C3%A0tica_del_catal%C3%A0 Suffix list: https://ca.wikipedia.org/wiki/Llista_de_sufixos Irregular Verbs: https://ca.wikipedia.org/wiki/Flexi%C3%B3_verbal_del_catal%C3%A0 */ snowball-2.2.0/algorithms/danish.sbl000066400000000000000000000034521414263061200174600ustar00rootroot00000000000000routines ( mark_regions main_suffix consonant_pair other_suffix undouble ) externals ( stem ) strings ( ch ) integers ( p1 x ) groupings ( c v s_ending ) stringescapes {} /* special characters */ stringdef ae '{U+00E6}' stringdef ao '{U+00E5}' stringdef o/ 
'{U+00F8}' define c 'bcdfghjklmnpqrstvwxz' define v 'aeiouy{ae}{ao}{o/}' define s_ending 'abcdfghjklmnoprtvyz{ao}' define mark_regions as ( $p1 = limit test ( hop 3 setmark x ) goto v gopast non-v setmark p1 try ( $p1 < x $p1 = x ) ) backwardmode ( define main_suffix as ( setlimit tomark p1 for ([substring]) among( 'hed' 'ethed' 'ered' 'e' 'erede' 'ende' 'erende' 'ene' 'erne' 'ere' 'en' 'heden' 'eren' 'er' 'heder' 'erer' 'heds' 'es' 'endes' 'erendes' 'enes' 'ernes' 'eres' 'ens' 'hedens' 'erens' 'ers' 'ets' 'erets' 'et' 'eret' (delete) 's' (s_ending delete) ) ) define consonant_pair as ( test ( setlimit tomark p1 for ([substring]) among( 'gd' // significant in the call from other_suffix 'dt' 'gt' 'kt' ) ) next] delete ) define other_suffix as ( do ( ['st'] 'ig' delete ) setlimit tomark p1 for ([substring]) among( 'ig' 'lig' 'elig' 'els' (delete do consonant_pair) 'l{o/}st' (<-'l{o/}s') ) ) define undouble as ( setlimit tomark p1 for ([c] ->ch) ch delete ) ) define stem as ( do mark_regions backwards ( do main_suffix do consonant_pair do other_suffix do undouble ) ) snowball-2.2.0/algorithms/dutch.sbl000066400000000000000000000060171414263061200173210ustar00rootroot00000000000000routines ( prelude postlude e_ending en_ending mark_regions R1 R2 undouble standard_suffix ) externals ( stem ) booleans ( e_found ) integers ( p1 p2 ) groupings ( v v_I v_j ) stringescapes {} /* special characters */ stringdef a" '{U+00E4}' stringdef e" '{U+00EB}' stringdef i" '{U+00EF}' stringdef o" '{U+00F6}' stringdef u" '{U+00FC}' stringdef a' '{U+00E1}' stringdef e' '{U+00E9}' stringdef i' '{U+00ED}' stringdef o' '{U+00F3}' stringdef u' '{U+00FA}' stringdef e` '{U+00E8}' define v 'aeiouy{e`}' define v_I v + 'I' define v_j v + 'j' define prelude as ( test repeat ( [substring] among( '{a"}' '{a'}' (<- 'a') '{e"}' '{e'}' (<- 'e') '{i"}' '{i'}' (<- 'i') '{o"}' '{o'}' (<- 'o') '{u"}' '{u'}' (<- 'u') '' (next) ) //or next ) try(['y'] <- 'Y') repeat goto ( v [('i'] v <- 'I') or ('y'] <- 'Y') ) 
) define mark_regions as ( $p1 = limit $p2 = limit gopast v gopast non-v setmark p1 try($p1 < 3 $p1 = 3) // at least 3 gopast v gopast non-v setmark p2 ) define postlude as repeat ( [substring] among( 'Y' (<- 'y') 'I' (<- 'i') '' (next) ) //or next ) backwardmode ( define R1 as $p1 <= cursor define R2 as $p2 <= cursor define undouble as ( test among('kk' 'dd' 'tt') [next] delete ) define e_ending as ( unset e_found ['e'] R1 test non-v delete set e_found undouble ) define en_ending as ( R1 non-v and not 'gem' delete undouble ) define standard_suffix as ( do ( [substring] among( 'heden' ( R1 <- 'heid' ) 'en' 'ene' ( en_ending ) 's' 'se' ( R1 non-v_j delete ) ) ) do e_ending do ( ['heid'] R2 not 'c' delete ['en'] en_ending ) do ( [substring] among( 'end' 'ing' ( R2 delete (['ig'] R2 not 'e' delete) or undouble ) 'ig' ( R2 not 'e' delete ) 'lijk' ( R2 delete e_ending ) 'baar' ( R2 delete ) 'bar' ( R2 e_found delete ) ) ) do ( non-v_I test ( among ('aa' 'ee' 'oo' 'uu') non-v ) [next] delete ) ) ) define stem as ( do prelude do mark_regions backwards do standard_suffix do postlude ) snowball-2.2.0/algorithms/english.sbl000066400000000000000000000116521414263061200176440ustar00rootroot00000000000000integers ( p1 p2 ) booleans ( Y_found ) routines ( prelude postlude mark_regions shortv R1 R2 Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5 exception1 exception2 ) externals ( stem ) groupings ( v v_WXY valid_LI ) stringescapes {} define v 'aeiouy' define v_WXY v + 'wxY' define valid_LI 'cdeghkmnrt' define prelude as ( unset Y_found do ( ['{'}'] delete) do ( ['y'] <-'Y' set Y_found) do repeat(goto (v ['y']) <-'Y' set Y_found) ) define mark_regions as ( $p1 = limit $p2 = limit do( among ( 'gener' 'commun' // added May 2005 'arsen' // added Nov 2006 (arsenic/arsenal) // ... extensions possible here ... 
) or (gopast v gopast non-v) setmark p1 gopast v gopast non-v setmark p2 ) ) backwardmode ( define shortv as ( ( non-v_WXY v non-v ) or ( non-v v atlimit ) ) define R1 as $p1 <= cursor define R2 as $p2 <= cursor define Step_1a as ( try ( [substring] among ( '{'}' '{'}s' '{'}s{'}' (delete) ) ) [substring] among ( 'sses' (<-'ss') 'ied' 'ies' ((hop 2 <-'i') or <-'ie') 's' (next gopast v delete) 'us' 'ss' ) ) define Step_1b as ( [substring] among ( 'eed' 'eedly' (R1 <-'ee') 'ed' 'edly' 'ing' 'ingly' ( test gopast v delete test substring among( 'at' 'bl' 'iz' (<+ 'e') 'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt' // ignoring double c, h, j, k, q, v, w, and x ([next] delete) '' (atmark p1 test shortv <+ 'e') ) ) ) ) define Step_1c as ( ['y' or 'Y'] non-v not atlimit <-'i' ) define Step_2 as ( [substring] R1 among ( 'tional' (<-'tion') 'enci' (<-'ence') 'anci' (<-'ance') 'abli' (<-'able') 'entli' (<-'ent') 'izer' 'ization' (<-'ize') 'ational' 'ation' 'ator' (<-'ate') 'alism' 'aliti' 'alli' (<-'al') 'fulness' (<-'ful') 'ousli' 'ousness' (<-'ous') 'iveness' 'iviti' (<-'ive') 'biliti' 'bli' (<-'ble') 'ogi' ('l' <-'og') 'fulli' (<-'ful') 'lessli' (<-'less') 'li' (valid_LI delete) ) ) define Step_3 as ( [substring] R1 among ( 'tional' (<- 'tion') 'ational' (<- 'ate') 'alize' (<-'al') 'icate' 'iciti' 'ical' (<-'ic') 'ful' 'ness' (delete) 'ative' (R2 delete) // 'R2' added Dec 2001 ) ) define Step_4 as ( [substring] R2 among ( 'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement' 'ment' 'ent' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize' (delete) 'ion' ('s' or 't' delete) ) ) define Step_5 as ( [substring] among ( 'e' (R2 or (R1 not shortv) delete) 'l' (R2 'l' delete) ) ) define exception2 as ( [substring] atlimit among( 'inning' 'outing' 'canning' 'herring' 'earring' 'proceed' 'exceed' 'succeed' // ... extensions possible here ... 
) ) ) define exception1 as ( [substring] atlimit among( /* special changes: */ 'skis' (<-'ski') 'skies' (<-'sky') 'dying' (<-'die') 'lying' (<-'lie') 'tying' (<-'tie') /* special -LY cases */ 'idly' (<-'idl') 'gently' (<-'gentl') 'ugly' (<-'ugli') 'early' (<-'earli') 'only' (<-'onli') 'singly' (<-'singl') // ... extensions possible here ... /* invariant forms: */ 'sky' 'news' 'howe' 'atlas' 'cosmos' 'bias' 'andes' // not plural forms // ... extensions possible here ... ) ) define postlude as (Y_found repeat(goto (['Y']) <-'y')) define stem as ( exception1 or not hop 3 or ( do prelude do mark_regions backwards ( do Step_1a exception2 or ( do Step_1b do Step_1c do Step_2 do Step_3 do Step_4 do Step_5 ) ) do postlude ) ) snowball-2.2.0/algorithms/finnish.sbl000066400000000000000000000121441414263061200176460ustar00rootroot00000000000000 /* Finnish stemmer. Numbers in square brackets refer to the sections in Fred Karlsson, Finnish: An Essential Grammar. Routledge, 1999 ISBN 0-415-20705-3 */ routines ( mark_regions R2 particle_etc possessive LONG VI case_ending i_plural t_plural other_endings tidy ) externals ( stem ) integers ( p1 p2 ) strings ( x ) booleans ( ending_removed ) groupings ( AEI C V1 V2 particle_end ) stringescapes {} /* special characters */ stringdef a" '{U+00E4}' stringdef o" '{U+00F6}' define AEI 'a{a"}ei' define C 'bcdfghjklmnpqrstvwxz' define V1 'aeiouy{a"}{o"}' define V2 'aeiou{a"}{o"}' define particle_end V1 + 'nt' define mark_regions as ( $p1 = limit $p2 = limit goto V1 gopast non-V1 setmark p1 goto V1 gopast non-V1 setmark p2 ) backwardmode ( define R2 as $p2 <= cursor define particle_etc as ( setlimit tomark p1 for ([substring]) among( 'kin' 'kaan' 'k{a"}{a"}n' 'ko' 'k{o"}' 'han' 'h{a"}n' 'pa' 'p{a"}' // Particles [91] (particle_end) 'sti' // Adverb [87] (R2) ) delete ) define possessive as ( // [36] setlimit tomark p1 for ([substring]) among( 'si' (not 'k' delete) // take 'ksi' as the Comitative case 'ni' (delete ['kse'] <- 'ksi') // kseni = 
ksi + ni 'nsa' 'ns{a"}' 'mme' 'nne' (delete) /* Now for Vn possessives after case endings: [36] */ 'an' (among('ta' 'ssa' 'sta' 'lla' 'lta' 'na') delete) '{a"}n' (among('t{a"}' 'ss{a"}' 'st{a"}' 'll{a"}' 'lt{a"}' 'n{a"}') delete) 'en' (among('lle' 'ine') delete) ) ) define LONG as among('aa' 'ee' 'ii' 'oo' 'uu' '{a"}{a"}' '{o"}{o"}') define VI as ('i' V2) define case_ending as ( setlimit tomark p1 for ([substring]) among( 'han' ('a') //-. 'hen' ('e') // | 'hin' ('i') // | 'hon' ('o') // | 'h{a"}n' ('{a"}') // Illative [43] 'h{o"}n' ('{o"}') // | 'siin' VI // | 'seen' LONG //-' 'den' VI 'tten' VI // Genitive plurals [34] () 'n' // Genitive or Illative ( try ( LONG // Illative or 'ie' // Genitive and next ] ) /* otherwise Genitive */ ) 'a' '{a"}' //-. (V1 C) // | 'tta' 'tt{a"}' // Partitive [32] ('e') // | 'ta' 't{a"}' //-' 'ssa' 'ss{a"}' // Inessive [41] 'sta' 'st{a"}' // Elative [42] 'lla' 'll{a"}' // Adessive [44] 'lta' 'lt{a"}' // Ablative [51] 'lle' // Allative [46] 'na' 'n{a"}' // Essive [49] 'ksi' // Translative[50] 'ine' // Comitative [51] /* Abessive and Instructive are too rare for inclusion [51] */ ) delete set ending_removed ) define other_endings as ( setlimit tomark p2 for ([substring]) among( 'mpi' 'mpa' 'mp{a"}' 'mmi' 'mma' 'mm{a"}' // Comparative forms [85] (not 'po') //-improves things 'impi' 'impa' 'imp{a"}' 'immi' 'imma' 'imm{a"}' // Superlative forms [86] 'eja' 'ej{a"}' // indicates agent [93.1B] ) delete ) define i_plural as ( // [26] setlimit tomark p1 for ([substring]) among( 'i' 'j' ) delete ) define t_plural as ( // [26] setlimit tomark p1 for ( ['t'] test V1 delete ) setlimit tomark p2 for ([substring]) among( 'mma' (not 'po') //-mmat endings 'imma' //-immat endings ) delete ) define tidy as ( setlimit tomark p1 for ( do ( LONG and ([next] delete ) ) // undouble vowel do ( [AEI] C delete ) // remove trailing a, a", e, i do ( ['j'] 'o' or 'u' delete ) do ( ['o'] 'j' delete ) ) goto non-V1 [C] -> x x delete // undouble consonant ) ) define 
stem as ( do mark_regions unset ending_removed backwards ( do particle_etc do possessive do case_ending do other_endings (ending_removed do i_plural) or do t_plural do tidy ) ) snowball-2.2.0/algorithms/french.sbl000066400000000000000000000143671414263061200174660ustar00rootroot00000000000000routines ( prelude postlude mark_regions RV R1 R2 standard_suffix i_verb_suffix verb_suffix residual_suffix un_double un_accent ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v keep_with_s ) stringescapes {} /* special characters */ stringdef a^ '{U+00E2}' // a-circumflex stringdef a` '{U+00E0}' // a-grave stringdef c, '{U+00E7}' // c-cedilla stringdef e" '{U+00EB}' // e-diaeresis (rare) stringdef e' '{U+00E9}' // e-acute stringdef e^ '{U+00EA}' // e-circumflex stringdef e` '{U+00E8}' // e-grave stringdef i" '{U+00EF}' // i-diaeresis stringdef i^ '{U+00EE}' // i-circumflex stringdef o^ '{U+00F4}' // o-circumflex stringdef u^ '{U+00FB}' // u-circumflex stringdef u` '{U+00F9}' // u-grave define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}' define prelude as repeat goto ( ( v [ ('u' ] v <- 'U') or ('i' ] v <- 'I') or ('y' ] <- 'Y') ) or ( [ '{e"}' ] <- 'He' ) or ( [ '{i"}' ] <- 'Hi' ) or ( ['y'] v <- 'Y' ) or ( 'q' ['u'] <- 'U' ) ) define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v v next ) or among ( // this exception list begun Nov 2006 'par' // paris, parie, pari 'col' // colis 'tap' // tapis // extensions possible here ) or ( next gopast v ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define postlude as repeat ( [substring] among( 'I' (<- 'i') 'U' (<- 'u') 'Y' (<- 'y') 'He' (<- '{e"}') 'Hi' (<- '{i"}') 'H' (delete) '' (next) ) ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define standard_suffix as ( [substring] among( 'ance' 'iqUe' 'isme' 'able' 'iste' 'eux' 'ances' 'iqUes' 'ismes' 'ables' 'istes' ( R2 delete ) 'atrice' 'ateur' 'ation' 
'atrices' 'ateurs' 'ations' ( R2 delete try ( ['ic'] (R2 delete) or <-'iqU' ) ) 'logie' 'logies' ( R2 <- 'log' ) 'usion' 'ution' 'usions' 'utions' ( R2 <- 'u' ) 'ence' 'ences' ( R2 <- 'ent' ) 'ement' 'ements' ( RV delete try ( [substring] among( 'iv' (R2 delete ['at'] R2 delete) 'eus' ((R2 delete) or (R1<-'eux')) 'abl' 'iqU' (R2 delete) 'i{e`}r' 'I{e`}r' //) (RV <-'i') //)--new 2 Sept 02 ) ) ) 'it{e'}' 'it{e'}s' ( R2 delete try ( [substring] among( 'abil' ((R2 delete) or <-'abl') 'ic' ((R2 delete) or <-'iqU') 'iv' (R2 delete) ) ) ) 'if' 'ive' 'ifs' 'ives' ( R2 delete try ( ['at'] R2 delete ['ic'] (R2 delete) or <-'iqU' ) ) 'eaux' (<- 'eau') 'aux' (R1 <- 'al') 'euse' 'euses'((R2 delete) or (R1<-'eux')) 'issement' 'issements'(R1 non-v delete) // verbal // fail(...) below forces entry to verb_suffix. -ment typically // follows the p.p., e.g 'confus{e'}ment'. 'amment' (RV fail(<- 'ant')) 'emment' (RV fail(<- 'ent')) 'ment' 'ments' (test(v RV) fail(delete)) // v is e,i,u,{e'},I or U ) ) define i_verb_suffix as setlimit tomark pV for ( [substring] among ( '{i^}mes' '{i^}t' '{i^}tes' 'i' 'ie' 'ies' 'ir' 'ira' 'irai' 'iraIent' 'irais' 'irait' 'iras' 'irent' 'irez' 'iriez' 'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait' 'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses' 'issez' 'issiez' 'issions' 'issons' 'it' (not 'H' non-v delete) ) ) define verb_suffix as setlimit tomark pV for ( [substring] among ( 'ions' (R2 delete) '{e'}' '{e'}e' '{e'}es' '{e'}s' '{e`}rent' 'er' 'era' 'erai' 'eraIent' 'erais' 'erait' 'eras' 'erez' 'eriez' 'erions' 'erons' 'eront' 'ez' 'iez' // 'ons' //-best omitted (delete) '{a^}mes' '{a^}t' '{a^}tes' 'a' 'ai' 'aIent' 'ais' 'ait' 'ant' 'ante' 'antes' 'ants' 'as' 'asse' 'assent' 'asses' 'assiez' 'assions' (delete try(['e'] delete) ) ) ) define keep_with_s 'aiou{e`}s' define residual_suffix as ( try(['s'] test ('Hi' or non-keep_with_s) delete) setlimit tomark pV for ( [substring] among( 'ion' (R2 's' or 't' delete) 'ier' 
'i{e`}re' 'Ier' 'I{e`}re' (<-'i') 'e' (delete) ) ) ) define un_double as ( test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete ) define un_accent as ( atleast 1 non-v [ '{e'}' or '{e`}' ] <-'e' ) ) define stem as ( do prelude do mark_regions backwards ( do ( ( ( standard_suffix or i_verb_suffix or verb_suffix ) and try( [ ('Y' ] <- 'i' ) or ('{c,}'] <- 'c' ) ) ) or residual_suffix ) // try(['ent'] RV delete) // is best omitted do un_double do un_accent ) do postlude ) snowball-2.2.0/algorithms/german.sbl000066400000000000000000000051241414263061200174610ustar00rootroot00000000000000 /* Extra rule for -nisse ending added 11 Dec 2009 */ routines ( prelude postlude mark_regions R1 R2 standard_suffix ) externals ( stem ) integers ( p1 p2 x ) groupings ( v s_ending st_ending ) stringescapes {} /* special characters */ stringdef a" '{U+00E4}' stringdef o" '{U+00F6}' stringdef u" '{U+00FC}' stringdef ss '{U+00DF}' define v 'aeiouy{a"}{o"}{u"}' define s_ending 'bdfghklmnrt' define st_ending s_ending - 'r' define prelude as ( test repeat ( ( ['{ss}'] <- 'ss' ) or next ) repeat goto ( v [('u'] v <- 'U') or ('y'] v <- 'Y') ) ) define mark_regions as ( $p1 = limit $p2 = limit test(hop 3 setmark x) gopast v gopast non-v setmark p1 try($p1 < x $p1 = x) // at least 3 gopast v gopast non-v setmark p2 ) define postlude as repeat ( [substring] among( 'Y' (<- 'y') 'U' (<- 'u') '{a"}' (<- 'a') '{o"}' (<- 'o') '{u"}' (<- 'u') '' (next) ) ) backwardmode ( define R1 as $p1 <= cursor define R2 as $p2 <= cursor define standard_suffix as ( do ( [substring] R1 among( 'em' 'ern' 'er' ( delete ) 'e' 'en' 'es' ( delete try (['s'] 'nis' delete) ) 's' ( s_ending delete ) ) ) do ( [substring] R1 among( 'en' 'er' 'est' ( delete ) 'st' ( st_ending hop 3 delete ) ) ) do ( [substring] R2 among( 'end' 'ung' ( delete try (['ig'] not 'e' R2 delete) ) 'ig' 'ik' 'isch' ( not 'e' delete ) 'lich' 'heit' ( delete try ( ['er' or 'en'] R1 delete ) ) 'keit' ( delete try ( [substring] R2 among( 'lich' 'ig' ( 
delete ) ) ) ) ) ) ) ) define stem as ( do prelude do mark_regions backwards do standard_suffix do postlude ) snowball-2.2.0/algorithms/german2.sbl000066400000000000000000000053531414263061200175470ustar00rootroot00000000000000 /* Extra rule for -nisse ending added 11 Dec 2009 */ routines ( prelude postlude mark_regions R1 R2 standard_suffix ) externals ( stem ) integers ( p1 p2 x ) groupings ( v s_ending st_ending ) stringescapes {} /* special characters */ stringdef a" '{U+00E4}' stringdef o" '{U+00F6}' stringdef u" '{U+00FC}' stringdef ss '{U+00DF}' define v 'aeiouy{a"}{o"}{u"}' define s_ending 'bdfghklmnrt' define st_ending s_ending - 'r' define prelude as ( test repeat goto ( v [('u'] v <- 'U') or ('y'] v <- 'Y') ) repeat ( [substring] among( '{ss}' (<- 'ss') 'ae' (<- '{a"}') 'oe' (<- '{o"}') 'ue' (<- '{u"}') 'qu' () '' (next) ) ) ) define mark_regions as ( $p1 = limit $p2 = limit test(hop 3 setmark x) gopast v gopast non-v setmark p1 try($p1 < x $p1 = x) // at least 3 gopast v gopast non-v setmark p2 ) define postlude as repeat ( [substring] among( 'Y' (<- 'y') 'U' (<- 'u') '{a"}' (<- 'a') '{o"}' (<- 'o') '{u"}' (<- 'u') '' (next) ) ) backwardmode ( define R1 as $p1 <= cursor define R2 as $p2 <= cursor define standard_suffix as ( do ( [substring] R1 among( 'em' 'ern' 'er' ( delete ) 'e' 'en' 'es' ( delete try (['s'] 'nis' delete) ) 's' ( s_ending delete ) ) ) do ( [substring] R1 among( 'en' 'er' 'est' ( delete ) 'st' ( st_ending hop 3 delete ) ) ) do ( [substring] R2 among( 'end' 'ung' ( delete try (['ig'] not 'e' R2 delete) ) 'ig' 'ik' 'isch' ( not 'e' delete ) 'lich' 'heit' ( delete try ( ['er' or 'en'] R1 delete ) ) 'keit' ( delete try ( [substring] R2 among( 'lich' 'ig' ( delete ) ) ) ) ) ) ) ) define stem as ( do prelude do mark_regions backwards do standard_suffix do postlude ) snowball-2.2.0/algorithms/greek.sbl000066400000000000000000000644611414263061200173160ustar00rootroot00000000000000// A stemmer for Modern Greek language, based on: // // Ntais, 
Georgios. Development of a Stemmer for the Greek // Language. Diss. Royal Institute of Technology, 2006. // https://sais.se/mthprize/2007/ntais2007.pdf // // Saroukos, Spyridon. Enhancing a Greek language stemmer. // University of Tampere, 2008. // https://tampub.uta.fi/bitstream/handle/10024/80480/gradu03463.pdf stringescapes {} stringdef a '{U+03B1}' // alpha stringdef v '{U+03B2}' // beta stringdef g '{U+03B3}' // gamma stringdef d '{U+03B4}' // delta stringdef e '{U+03B5}' // epsilon stringdef z '{U+03B6}' // zeta stringdef i '{U+03B7}' // eta stringdef th '{U+03B8}' // theta stringdef y '{U+03B9}' // iota stringdef k '{U+03BA}' // kappa stringdef l '{U+03BB}' // lamda stringdef m '{U+03BC}' // mu stringdef n '{U+03BD}' // nu stringdef x '{U+03BE}' // xi stringdef o '{U+03BF}' // omicron stringdef p '{U+03C0}' // pi stringdef r '{U+03C1}' // rho stringdef ss '{U+03C2}' // sigma final stringdef s '{U+03C3}' // sigma stringdef t '{U+03C4}' // tau stringdef u '{U+03C5}' // upsilon stringdef f '{U+03C6}' // phi stringdef ch '{U+03C7}' // chi stringdef ps '{U+03C8}' // psi stringdef oo '{U+03C9}' // omega stringdef A '{U+0391}' // Alpha stringdef V '{U+0392}' // Beta stringdef G '{U+0393}' // Gamma stringdef D '{U+0394}' // Delta stringdef E '{U+0395}' // Epsilon stringdef Z '{U+0396}' // Zeta stringdef I '{U+0397}' // Eta stringdef Th '{U+0398}' // Theta stringdef Y '{U+0399}' // Iota stringdef K '{U+039A}' // Kappa stringdef L '{U+039B}' // Lamda stringdef M '{U+039C}' // Mu stringdef N '{U+039D}' // Nu stringdef X '{U+039E}' // Xi stringdef O '{U+039F}' // Omicron stringdef P '{U+03A0}' // Pi stringdef R '{U+03A1}' // Rho stringdef S '{U+03A3}' // Sigma stringdef T '{U+03A4}' // Tau stringdef U '{U+03A5}' // Upsilon stringdef F '{U+03A6}' // Phi stringdef Ch '{U+03A7}' // Chi stringdef Ps '{U+03A8}' // Psi stringdef Oo '{U+03A9}' // Omega stringdef Y: '{U+03AA}' // Iota with dialytika stringdef U: '{U+03AB}' // Upsilon with dialytika stringdef a' '{U+03AC}' // 
alpha with tonos stringdef e' '{U+03AD}' // epsilon with tonos stringdef i' '{U+03AE}' // eta with tonos stringdef y' '{U+03AF}' // iota with tonos stringdef o' '{U+03CC}' // omicron with tonos stringdef u' '{U+03CD}' // upsilon with tonos stringdef oo' '{U+03CE}' // omega with tonos stringdef i:' '{U+0390}' // iota with dialytika and tonos stringdef u:' '{U+03B0}' // upsilon with dialytika and tonos stringdef i: '{U+03CA}' // iota with dialytika stringdef u: '{U+03CB}' // upsilon with dialytika stringdef A' '{U+0386}' // Alpha with tonos stringdef E' '{U+0388}' // Epsilon with tonos stringdef I' '{U+0389}' // Eta with tonos stringdef Y' '{U+038A}' // Iota with tonos stringdef O' '{U+038C}' // Omicron with tonos stringdef U' '{U+038E}' // Upsilon with tonos stringdef OO' '{U+038F}' // Omega with tonos externals ( stem ) booleans ( test1 ) groupings ( v v2 ) routines ( tolower has_min_length steps1 steps2 steps3 steps4 steps5 steps6 steps7 steps8 steps9 steps10 step1 step2a step2b step2c step2d step3 step4 step5a step5b step5c step5d step5e step5f step5g step5h step5i step5j step5k step5l step5m step6 step7 ) define v '{a}{e}{i}{y}{o}{u}{oo}' define v2 '{a}{e}{i}{y}{o}{oo}' backwardmode ( define has_min_length as ( $(len >= 3) ) define tolower as ( repeat ( [substring] among ( '{A}' (<- '{a}') '{V}' (<- '{v}') '{G}' (<- '{g}') '{D}' (<- '{d}') '{E}' (<- '{e}') '{Z}' (<- '{z}') '{I}' (<- '{i}') '{Th}' (<- '{th}') '{Y}' (<- '{y}') '{K}' (<- '{k}') '{L}' (<- '{l}') '{M}' (<- '{m}') '{N}' (<- '{n}') '{X}' (<- '{x}') '{O}' (<- '{o}') '{P}' (<- '{p}') '{R}' (<- '{r}') '{S}' (<- '{s}') '{T}' (<- '{t}') '{U}' (<- '{u}') '{F}' (<- '{f}') '{Ch}' (<- '{ch}') '{Ps}' (<- '{ps}') '{Oo}' (<- '{oo}') '{Y:}' (<- '{y}') '{U:}' (<- '{u}') '{a'}' (<- '{a}') '{e'}' (<- '{e}') '{i'}' (<- '{i}') '{y'}' (<- '{y}') '{o'}' (<- '{o}') '{u'}' (<- '{u}') '{oo'}' (<- '{oo}') '{i:'}' (<- '{i}') '{u:'}' (<- '{u}') '{i:}' (<- '{i}') '{u:}' (<- '{u}') '{A'}' (<- '{a}') '{E'}' (<- '{e}') '{I'}' (<- 
'{i}') '{Y'}' (<- '{y}') '{O'}' (<- '{o}') '{U'}' (<- '{u}') '{OO'}' (<- '{oo}') '{ss}' (<- '{s}') '' (next) ) ) ) define step1 as ( [substring] among ( '{f}{a}{g}{y}{a}' '{f}{a}{g}{y}{o}{u}' '{f}{a}{g}{y}{oo}{n}' (<- '{f}{a}') '{s}{k}{a}{g}{y}{a}' '{s}{k}{a}{g}{y}{o}{u}' '{s}{k}{a}{g}{y}{oo}{n}' (<- '{s}{k}{a}') '{o}{l}{o}{g}{y}{o}{u}' '{o}{l}{o}{g}{y}{a}' '{o}{l}{o}{g}{y}{oo}{n}' (<- '{o}{l}{o}') '{s}{o}{g}{y}{o}{u}' '{s}{o}{g}{y}{a}' '{s}{o}{g}{y}{oo}{n}' (<- '{s}{o}') '{t}{a}{t}{o}{g}{y}{a}' '{t}{a}{t}{o}{g}{y}{o}{u}' '{t}{a}{t}{o}{g}{y}{oo}{n}' (<- '{t}{a}{t}{o}') '{k}{r}{e}{a}{s}' '{k}{r}{e}{a}{t}{o}{s}' '{k}{r}{e}{a}{t}{a}' '{k}{r}{e}{a}{t}{oo}{n}' (<- '{k}{r}{e}') '{p}{e}{r}{a}{s}' '{p}{e}{r}{a}{t}{o}{s}' '{p}{e}{r}{a}{t}{i}' '{p}{e}{r}{a}{t}{a}' '{p}{e}{r}{a}{t}{oo}{n}' (<- '{p}{e}{r}') '{t}{e}{r}{a}{s}' '{t}{e}{r}{a}{t}{o}{s}' '{t}{e}{r}{a}{t}{a}' '{t}{e}{r}{a}{t}{oo}{n}' (<- '{t}{e}{r}') '{f}{oo}{s}' '{f}{oo}{t}{o}{s}' '{f}{oo}{t}{a}' '{f}{oo}{t}{oo}{n}' (<- '{f}{oo}') '{k}{a}{th}{e}{s}{t}{oo}{s}' '{k}{a}{th}{e}{s}{t}{oo}{t}{o}{s}' '{k}{a}{th}{e}{s}{t}{oo}{t}{a}' '{k}{a}{th}{e}{s}{t}{oo}{t}{oo}{n}' (<- '{k}{a}{th}{e}{s}{t}') '{g}{e}{g}{o}{n}{o}{s}' '{g}{e}{g}{o}{n}{o}{t}{o}{s}' '{g}{e}{g}{o}{n}{o}{t}{a}' '{g}{e}{g}{o}{n}{o}{t}{oo}{n}' (<- '{g}{e}{g}{o}{n}') ) unset test1 ) define steps1 as ( [substring] among ( '{y}{z}{a}' '{y}{z}{e}{s}' '{y}{z}{e}' '{y}{z}{a}{m}{e}' '{y}{z}{a}{t}{e}' '{y}{z}{a}{n}' '{y}{z}{a}{n}{e}' '{y}{z}{oo}' '{y}{z}{e}{y}{s}' '{y}{z}{e}{y}' '{y}{z}{o}{u}{m}{e}' '{y}{z}{e}{t}{e}' '{y}{z}{o}{u}{n}' '{y}{z}{o}{u}{n}{e}' ( delete unset test1 ([] substring atlimit among ( '{a}{n}{a}{m}{p}{a}' '{e}{m}{p}{a}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{p}{a}' '{p}{e}{r}{y}{p}{a}' '{a}{th}{r}{o}' '{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}' (<- '{y}') '{m}{a}{r}{k}' '{k}{o}{r}{n}' '{a}{m}{p}{a}{r}' '{a}{r}{r}' '{v}{a}{th}{u}{r}{y}' '{v}{a}{r}{k}' '{v}' '{v}{o}{l}{v}{o}{r}' '{g}{k}{r}' '{g}{l}{u}{k}{o}{r}' '{g}{l}{u}{k}{u}{r}' '{y}{m}{p}' '{l}' 
'{l}{o}{u}' '{m}{a}{r}' '{m}' '{p}{r}' '{m}{p}{r}' '{p}{o}{l}{u}{r}' '{p}' '{r}' '{p}{y}{p}{e}{r}{o}{r}' (<- '{y}{z}') )) ) ) ) define steps2 as ( [substring] among ( '{oo}{th}{i}{k}{a}' '{oo}{th}{i}{k}{e}{s}' '{oo}{th}{i}{k}{e}' '{oo}{th}{i}{k}{a}{m}{e}' '{oo}{th}{i}{k}{a}{t}{e}' '{oo}{th}{i}{k}{a}{n}' '{oo}{th}{i}{k}{a}{n}{e}' ( delete unset test1 [] substring atlimit among ( '{a}{l}' '{v}{y}' '{e}{n}' '{u}{ps}' '{l}{y}' '{z}{oo}' '{s}' '{ch}' (<- '{oo}{n}') ) ) ) ) define steps3 as ( [substring] among ( '{y}{s}{a}' '{y}{s}{e}{s}' '{y}{s}{e}' '{y}{s}{a}{m}{e}' '{y}{s}{a}{t}{e}' '{y}{s}{a}{n}' '{y}{s}{a}{n}{e}' ( delete unset test1 ('{y}{s}{a}' atlimit <- '{y}{s}') or ([] substring atlimit among ( '{a}{n}{a}{m}{p}{a}' '{a}{th}{r}{o}' '{e}{m}{p}{a}' '{e}{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{e}{p}{e}' '{p}{e}{r}{y}{p}{a}' '{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}' '{k}{l}{e}' '{ch}{a}{r}{t}{o}{p}{a}' '{e}{x}{a}{r}{ch}{a}' '{m}{e}{t}{e}{p}{e}' '{a}{p}{o}{k}{l}{e}' '{a}{p}{e}{k}{l}{e}' '{e}{k}{l}{e}' '{p}{e}' (<- '{y}') '{a}{n}' '{a}{f}' '{g}{e}' '{g}{y}{g}{a}{n}{t}{o}{a}{f}' '{g}{k}{e}' '{d}{i}{m}{o}{k}{r}{a}{t}' '{k}{o}{m}' '{g}{k}' '{m}' '{p}' '{p}{o}{u}{k}{a}{m}' '{o}{l}{o}' '{l}{a}{r}' (<- '{y}{s}') )) ) ) ) define steps4 as ( [substring] among ( '{y}{s}{oo}' '{y}{s}{e}{y}{s}' '{y}{s}{e}{y}' '{y}{s}{o}{u}{m}{e}' '{y}{s}{e}{t}{e}' '{y}{s}{o}{u}{n}' '{y}{s}{o}{u}{n}{e}' ( delete unset test1 [] substring atlimit among ( '{a}{n}{a}{m}{p}{a}' '{e}{m}{p}{a}' '{e}{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{e}{p}{e}' '{p}{e}{r}{y}{p}{a}' '{a}{th}{r}{o}' '{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}' '{k}{l}{e}' '{ch}{a}{r}{t}{o}{p}{a}' '{e}{x}{a}{r}{ch}{a}' '{m}{e}{t}{e}{p}{e}' '{a}{p}{o}{k}{l}{e}' '{a}{p}{e}{k}{l}{e}' '{e}{k}{l}{e}' '{p}{e}' (<- '{y}') ) ) ) ) define steps5 as ( [substring] among ( '{y}{s}{t}{o}{s}' '{y}{s}{t}{o}{u}' '{y}{s}{t}{o}' '{y}{s}{t}{e}' '{y}{s}{t}{o}{y}' '{y}{s}{t}{oo}{n}' '{y}{s}{t}{o}{u}{s}' 
'{y}{s}{t}{i}' '{y}{s}{t}{i}{s}' '{y}{s}{t}{a}' '{y}{s}{t}{e}{s}' ( delete unset test1 ([] substring atlimit among ( '{d}{a}{n}{e}' '{s}{u}{n}{a}{th}{r}{o}' '{k}{l}{e}' '{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{a}{s}{e}' '{p}{l}{e}' (<- '{y}') '{m}' '{p}' '{a}{p}' '{a}{r}' '{i}{d}' '{k}{t}' '{s}{k}' '{s}{ch}' '{u}{ps}' '{f}{a}' '{ch}{r}' '{ch}{t}' '{a}{k}{t}' '{a}{o}{r}' '{a}{s}{ch}' '{a}{t}{a}' '{a}{ch}{n}' '{a}{ch}{t}' '{g}{e}{m}' '{g}{u}{r}' '{e}{m}{p}' '{e}{u}{p}' '{e}{ch}{th}' '{i}{f}{a}' '{k}{a}{th}' '{k}{a}{k}' '{k}{u}{l}' '{l}{u}{g}' '{m}{a}{k}' '{m}{e}{g}' '{t}{a}{ch}' '{f}{y}{l}' '{ch}{oo}{r}' (<- '{y}{s}{t}') )) ) ) ) define steps6 as ( [substring] among ( '{y}{s}{m}{o}' '{y}{s}{m}{o}{y}' '{y}{s}{m}{o}{s}' '{y}{s}{m}{o}{u}' '{y}{s}{m}{o}{u}{s}' '{y}{s}{m}{oo}{n}' ( delete unset test1 ([] substring atlimit among ( '{s}{e}' '{m}{e}{t}{a}{s}{e}' '{m}{y}{k}{r}{o}{s}{e}' '{e}{g}{k}{l}{e}' '{a}{p}{o}{k}{l}{e}' (<- '{y}{s}{m}') '{d}{a}{n}{e}' '{a}{n}{t}{y}{d}{a}{n}{e}' (<- '{y}') )) or ([substring] among ( '{a}{g}{n}{oo}{s}{t}{y}{k}' (<- '{a}{g}{n}{oo}{s}{t}') '{a}{t}{o}{m}{y}{k}' (<- '{a}{t}{o}{m}') '{g}{n}{oo}{s}{t}{y}{k}' (<- '{g}{n}{oo}{s}{t}') '{e}{th}{n}{y}{k}' (<- '{e}{th}{n}') '{e}{k}{l}{e}{k}{t}{y}{k}' (<- '{e}{k}{l}{e}{k}{t}') '{s}{k}{e}{p}{t}{y}{k}' (<- '{s}{k}{e}{p}{t}') '{t}{o}{p}{y}{k}' (<- '{t}{o}{p}') '{a}{l}{e}{x}{a}{n}{d}{r}{y}{n}' (<- '{a}{l}{e}{x}{a}{n}{d}{r}') '{v}{u}{z}{a}{n}{t}{y}{n}' (<- '{v}{u}{z}{a}{n}{t}') '{th}{e}{a}{t}{r}{y}{n}' (<- '{th}{e}{a}{t}{r}') )) ) ) ) define steps7 as ( [substring] among ( '{a}{r}{a}{k}{y}' '{a}{r}{a}{k}{y}{a}' '{o}{u}{d}{a}{k}{y}' '{o}{u}{d}{a}{k}{y}{a}' ( delete unset test1 [] substring atlimit among ( '{s}' '{ch}' (<- '{a}{r}{a}{k}') ) ) ) ) define steps8 as ( [substring] among ( '{a}{k}{y}' '{a}{k}{y}{a}' '{y}{t}{s}{a}' '{y}{t}{s}{a}{s}' '{y}{t}{s}{e}{s}' '{y}{t}{s}{oo}{n}' '{a}{r}{a}{k}{y}' '{a}{r}{a}{k}{y}{a}' ( delete unset test1 ([] substring atlimit among ( '{v}{a}{m}{v}' '{v}{r}' '{k}{a}{y}{m}' 
'{k}{o}{n}' '{k}{o}{r}' '{l}{a}{v}{r}' '{l}{o}{u}{l}' '{m}{e}{r}' '{m}{o}{u}{s}{t}' '{n}{a}{g}{k}{a}{s}' '{p}{l}' '{r}' '{r}{u}' '{s}' '{s}{k}' '{s}{o}{k}' '{s}{p}{a}{n}' '{t}{z}' '{f}{a}{r}{m}' '{ch}' '{k}{a}{p}{a}{k}' '{a}{l}{y}{s}{f}' '{a}{m}{v}{r}' '{a}{n}{th}{r}' '{k}' '{f}{u}{l}' '{k}{a}{t}{r}{a}{p}' '{k}{l}{y}{m}' '{m}{a}{l}' '{s}{l}{o}{v}' '{f}' '{s}{f}' '{t}{s}{e}{ch}{o}{s}{l}{o}{v}' (<- '{a}{k}') '{v}' '{v}{a}{l}' '{g}{y}{a}{n}' '{g}{l}' '{z}' '{i}{g}{o}{u}{m}{e}{n}' '{k}{a}{r}{d}' '{m}{a}{k}{r}{u}{n}' '{n}{u}{f}' '{p}{a}{t}{e}{r}' '{p}' '{t}{o}{s}' '{t}{r}{y}{p}{o}{l}' // We're implementing the revised algorithm from the Saroukos paper // which also lists '{k}{o}{n}' and '{s}{k}' here, but these are // also listed just above in the `Add {a}{k} in the end` exception. // It seems they're redundant here, so we omit them (otherwise the // Snowball compiler would report an error). (<- '{y}{t}{s}') )) or ([] '{k}{o}{r}' <- '{y}{t}{s}') ) ) ) define steps9 as ( [substring] among ( '{y}{d}{y}{o}' '{y}{d}{y}{a}' '{y}{d}{y}{oo}{n}' ( delete unset test1 ([] substring atlimit among ( '{a}{y}{f}{n}' '{y}{r}' '{o}{l}{o}' '{ps}{a}{l}' (<- '{y}{d}') )) or ([] substring among ( '{e}' '{p}{a}{y}{ch}{n}' (<- '{y}{d}') )) ) ) ) define steps10 as ( [substring] among ( '{y}{s}{k}{o}{s}' '{y}{s}{k}{o}{u}' '{y}{s}{k}{o}' '{y}{s}{k}{e}' ( delete unset test1 [] substring atlimit among ( '{d}' '{y}{v}' '{m}{i}{n}' '{r}' '{f}{r}{a}{g}{k}' '{l}{u}{k}' '{o}{v}{e}{l}' (<- '{y}{s}{k}') ) ) ) ) define step2a as ( [substring] among ( '{a}{d}{e}{s}' '{a}{d}{oo}{n}' (delete) ) not (substring among ( '{o}{k}' '{m}{a}{m}' '{m}{a}{n}' '{m}{p}{a}{m}{p}' '{p}{a}{t}{e}{r}' '{g}{y}{a}{g}{y}' '{n}{t}{a}{n}{t}' '{k}{u}{r}' '{th}{e}{y}' '{p}{e}{th}{e}{r}' )) insert '{a}{d}' ) define step2b as ( [substring] among ( '{e}{d}{e}{s}' '{e}{d}{oo}{n}' (delete) ) [] substring among ( '{o}{p}' '{y}{p}' '{e}{m}{p}' '{u}{p}' '{g}{i}{p}' '{d}{a}{p}' '{k}{r}{a}{s}{p}' '{m}{y}{l}' (<- '{e}{d}') ) ) define step2c 
as ( [substring] among ( '{o}{u}{d}{e}{s}' '{o}{u}{d}{oo}{n}' (delete) ) [] substring among ( '{a}{r}{k}' '{k}{a}{l}{y}{a}{k}' '{p}{e}{t}{a}{l}' '{l}{y}{ch}' '{p}{l}{e}{x}' '{s}{k}' '{s}' '{f}{l}' '{f}{r}' '{v}{e}{l}' '{l}{o}{u}{l}' '{ch}{n}' '{s}{p}' '{t}{r}{a}{g}' '{f}{e}' (<- '{o}{u}{d}') ) ) define step2d as ( [substring] among ( '{e}{oo}{s}' '{e}{oo}{n}' (delete unset test1) ) [] substring atlimit among ( '{th}' '{d}' '{e}{l}' '{g}{a}{l}' '{n}' '{p}' '{y}{d}' '{p}{a}{r}' (<- '{e}') ) ) define step3 as ( [substring] among ( '{y}{a}' '{y}{o}{u}' '{y}{oo}{n}' (delete unset test1) ) ([] v <- '{y}') ) define step4 as ( [substring] among ( '{y}{k}{a}' '{y}{k}{o}' '{y}{k}{o}{u}' '{y}{k}{oo}{n}' (delete unset test1) ) ([] v <- '{y}{k}') or [] substring atlimit among ( '{a}{l}' '{a}{d}' '{e}{n}{d}' '{a}{m}{a}{n}' '{a}{m}{m}{o}{ch}{a}{l}' '{i}{th}' '{a}{n}{i}{th}' '{a}{n}{t}{y}{d}' '{f}{u}{s}' '{v}{r}{oo}{m}' '{g}{e}{r}' '{e}{x}{oo}{d}' '{k}{a}{l}{p}' '{k}{a}{l}{l}{y}{n}' '{k}{a}{t}{a}{d}' '{m}{o}{u}{l}' '{m}{p}{a}{n}' '{m}{p}{a}{g}{y}{a}{t}' '{m}{p}{o}{l}' '{m}{p}{o}{s}' '{n}{y}{t}' '{x}{y}{k}' '{s}{u}{n}{o}{m}{i}{l}' '{p}{e}{t}{s}' '{p}{y}{t}{s}' '{p}{y}{k}{a}{n}{t}' '{p}{l}{y}{a}{t}{s}' '{p}{o}{s}{t}{e}{l}{n}' '{p}{r}{oo}{t}{o}{d}' '{s}{e}{r}{t}' '{s}{u}{n}{a}{d}' '{t}{s}{a}{m}' '{u}{p}{o}{d}' '{f}{y}{l}{o}{n}' '{f}{u}{l}{o}{d}' '{ch}{a}{s}' (<- '{y}{k}') ) ) define step5a as ( do ('{a}{g}{a}{m}{e}' atlimit <- '{a}{g}{a}{m}') do ( [substring] among ( '{a}{g}{a}{m}{e}' '{i}{s}{a}{m}{e}' '{o}{u}{s}{a}{m}{e}' '{i}{k}{a}{m}{e}' '{i}{th}{i}{k}{a}{m}{e}' (delete unset test1) ) ) ['{a}{m}{e}'] delete unset test1 [] substring atlimit among ( '{a}{n}{a}{p}' '{a}{p}{o}{th}' '{a}{p}{o}{k}' '{a}{p}{o}{s}{t}' '{v}{o}{u}{v}' '{x}{e}{th}' '{o}{u}{l}' '{p}{e}{th}' '{p}{y}{k}{r}' '{p}{o}{t}' '{s}{y}{ch}' '{ch}' (<- '{a}{m}') ) ) define step5b as ( do ( [substring] among ( '{a}{g}{a}{n}{e}' '{i}{s}{a}{n}{e}' '{o}{u}{s}{a}{n}{e}' '{y}{o}{n}{t}{a}{n}{e}' '{y}{o}{t}{a}{n}{e}' 
'{y}{o}{u}{n}{t}{a}{n}{e}' '{o}{n}{t}{a}{n}{e}' '{o}{t}{a}{n}{e}' '{o}{u}{n}{t}{a}{n}{e}' '{i}{k}{a}{n}{e}' '{i}{th}{i}{k}{a}{n}{e}' ( delete unset test1 [] substring atlimit among ( '{t}{r}' '{t}{s}' (<- '{a}{g}{a}{n}') ) ) ) ) ['{a}{n}{e}'] delete unset test1 ([] v2 <- '{a}{n}') or [] substring atlimit among ( '{v}{e}{t}{e}{r}' '{v}{o}{u}{l}{k}' '{v}{r}{a}{ch}{m}' '{g}' '{d}{r}{a}{d}{o}{u}{m}' '{th}' '{k}{a}{l}{p}{o}{u}{z}' '{k}{a}{s}{t}{e}{l}' '{k}{o}{r}{m}{o}{r}' '{l}{a}{o}{p}{l}' '{m}{oo}{a}{m}{e}{th}' '{m}' '{m}{o}{u}{s}{o}{u}{l}{m}' '{n}' '{o}{u}{l}' '{p}' '{p}{e}{l}{e}{k}' '{p}{l}' '{p}{o}{l}{y}{s}' '{p}{o}{r}{t}{o}{l}' '{s}{a}{r}{a}{k}{a}{t}{s}' '{s}{o}{u}{l}{t}' '{t}{s}{a}{r}{l}{a}{t}' '{o}{r}{f}' '{t}{s}{y}{g}{g}' '{t}{s}{o}{p}' '{f}{oo}{t}{o}{s}{t}{e}{f}' '{ch}' '{ps}{u}{ch}{o}{p}{l}' '{a}{g}' '{g}{a}{l}' '{g}{e}{r}' '{d}{e}{k}' '{d}{y}{p}{l}' '{a}{m}{e}{r}{y}{k}{a}{n}' '{o}{u}{r}' '{p}{y}{th}' '{p}{o}{u}{r}{y}{t}' '{s}' '{z}{oo}{n}{t}' '{y}{k}' '{k}{a}{s}{t}' '{k}{o}{p}' '{l}{y}{ch}' '{l}{o}{u}{th}{i}{r}' '{m}{a}{y}{n}{t}' '{m}{e}{l}' '{s}{y}{g}' '{s}{p}' '{s}{t}{e}{g}' '{t}{r}{a}{g}' '{t}{s}{a}{g}' '{f}' '{e}{r}' '{a}{d}{a}{p}' '{a}{th}{y}{g}{g}' '{a}{m}{i}{ch}' '{a}{n}{y}{k}' '{a}{n}{o}{r}{g}' '{a}{p}{i}{g}' '{a}{p}{y}{th}' '{a}{t}{s}{y}{g}{g}' '{v}{a}{s}' '{v}{a}{s}{k}' '{v}{a}{th}{u}{g}{a}{l}' '{v}{y}{o}{m}{i}{ch}' '{v}{r}{a}{ch}{u}{k}' '{d}{y}{a}{t}' '{d}{y}{a}{f}' '{e}{n}{o}{r}{g}' '{th}{u}{s}' '{k}{a}{p}{n}{o}{v}{y}{o}{m}{i}{ch}' '{k}{a}{t}{a}{g}{a}{l}' '{k}{l}{y}{v}' '{k}{o}{y}{l}{a}{r}{f}' '{l}{y}{v}' '{m}{e}{g}{l}{o}{v}{y}{o}{m}{i}{ch}' '{m}{y}{k}{r}{o}{v}{y}{o}{m}{i}{ch}' '{n}{t}{a}{v}' '{x}{i}{r}{o}{k}{l}{y}{v}' '{o}{l}{y}{g}{o}{d}{a}{m}' '{o}{l}{o}{g}{a}{l}' '{p}{e}{n}{t}{a}{r}{f}' '{p}{e}{r}{i}{f}' '{p}{e}{r}{y}{t}{r}' '{p}{l}{a}{t}' '{p}{o}{l}{u}{d}{a}{p}' '{p}{o}{l}{u}{m}{i}{ch}' '{s}{t}{e}{f}' '{t}{a}{v}' '{t}{e}{t}' '{u}{p}{e}{r}{i}{f}' '{u}{p}{o}{k}{o}{p}' '{ch}{a}{m}{i}{l}{o}{d}{a}{p}' '{ps}{i}{l}{o}{t}{a}{v}' (<- '{a}{n}') ) ) 
define step5c as ( do ( [substring] among ( '{i}{s}{e}{t}{e}' (delete unset test1) ) ) ['{e}{t}{e}'] delete unset test1 ([] v2 <- '{e}{t}') or ([] substring among ( '{o}{d}' '{a}{y}{r}' '{f}{o}{r}' '{t}{a}{th}' '{d}{y}{a}{th}' '{s}{ch}' '{e}{n}{d}' '{e}{u}{r}' '{t}{y}{th}' '{u}{p}{e}{r}{th}' '{r}{a}{th}' '{e}{n}{th}' '{r}{o}{th}' '{s}{th}' '{p}{u}{r}' '{a}{y}{n}' '{s}{u}{n}{d}' '{s}{u}{n}' '{s}{u}{n}{th}' '{ch}{oo}{r}' '{p}{o}{n}' '{v}{r}' '{k}{a}{th}' '{e}{u}{th}' '{e}{k}{th}' '{n}{e}{t}' '{r}{o}{n}' '{a}{r}{k}' '{v}{a}{r}' '{v}{o}{l}' '{oo}{f}{e}{l}' (<- '{e}{t}') )) or [] substring atlimit among ( '{a}{v}{a}{r}' '{v}{e}{n}' '{e}{n}{a}{r}' '{a}{v}{r}' '{a}{d}' '{a}{th}' '{a}{n}' '{a}{p}{l}' '{v}{a}{r}{o}{n}' '{n}{t}{r}' '{s}{k}' '{k}{o}{p}' '{m}{p}{o}{r}' '{n}{y}{f}' '{p}{a}{g}' '{p}{a}{r}{a}{k}{a}{l}' '{s}{e}{r}{p}' '{s}{k}{e}{l}' '{s}{u}{r}{f}' '{t}{o}{k}' '{u}' '{d}' '{e}{m}' '{th}{a}{r}{r}' '{th}' (<- '{e}{t}') ) ) define step5d as ( [substring] among ( '{o}{n}{t}{a}{s}' '{oo}{n}{t}{a}{s}' ( delete unset test1 ([] '{a}{r}{ch}' atlimit <- '{o}{n}{t}') or ([] '{k}{r}{e}' <- '{oo}{n}{t}') ) ) ) define step5e as ( [substring] among ( '{o}{m}{a}{s}{t}{e}' '{y}{o}{m}{a}{s}{t}{e}' ( delete unset test1 ([] '{o}{n}' atlimit <- '{o}{m}{a}{s}{t}') ) ) ) define step5f as ( do ( ['{y}{e}{s}{t}{e}'] delete unset test1 [] substring atlimit among ( '{p}' '{a}{p}' '{s}{u}{m}{p}' '{a}{s}{u}{m}{p}' '{a}{k}{a}{t}{a}{p}' '{a}{m}{e}{t}{a}{m}{f}' (<- '{y}{e}{s}{t}') ) ) ['{e}{s}{t}{e}'] delete unset test1 [] substring atlimit among ( '{a}{l}' '{a}{r}' '{e}{k}{t}{e}{l}' '{z}' '{m}' '{x}' '{p}{a}{r}{a}{k}{a}{l}' '{p}{r}{o}' '{n}{y}{s}' (<- '{y}{e}{s}{t}') ) ) define step5g as ( do ( [substring] among ( '{i}{th}{i}{k}{a}' '{i}{th}{i}{k}{e}{s}' '{i}{th}{i}{k}{e}' (delete unset test1) ) ) [substring] among ( '{i}{k}{a}' '{i}{k}{e}{s}' '{i}{k}{e}' ( delete unset test1 ([] substring among ( '{s}{k}{oo}{l}' '{s}{k}{o}{u}{l}' '{n}{a}{r}{th}' '{s}{f}' '{o}{th}' '{p}{y}{th}' (<- '{i}{k}') )) 
or ([] substring atlimit among ( '{d}{y}{a}{th}' '{th}' '{p}{a}{r}{a}{k}{a}{t}{a}{th}' '{p}{r}{o}{s}{th}' '{s}{u}{n}{th}' (<- '{i}{k}') )) ) ) ) define step5h as ( [substring] among ( '{o}{u}{s}{a}' '{o}{u}{s}{e}{s}' '{o}{u}{s}{e}' ( delete unset test1 ([] substring among ( '{p}{o}{d}{a}{r}' '{v}{l}{e}{p}' '{p}{a}{n}{t}{a}{ch}' '{f}{r}{u}{d}' '{m}{a}{n}{t}{y}{l}' '{m}{a}{l}{l}' '{k}{u}{m}{a}{t}' '{l}{a}{ch}' '{l}{i}{g}' '{f}{a}{g}' '{o}{m}' '{p}{r}{oo}{t}' (<- '{o}{u}{s}') )) or ([] substring atlimit among ( '{f}{a}{r}{m}{a}{k}' '{ch}{a}{d}' '{a}{g}{k}' '{a}{n}{a}{r}{r}' '{v}{r}{o}{m}' '{e}{k}{l}{y}{p}' '{l}{a}{m}{p}{y}{d}' '{l}{e}{ch}' '{m}' '{p}{a}{t}' '{r}' '{l}' '{m}{e}{d}' '{m}{e}{s}{a}{z}' '{u}{p}{o}{t}{e}{y}{n}' '{a}{m}' '{a}{y}{th}' '{a}{n}{i}{k}' '{d}{e}{s}{p}{o}{z}' '{e}{n}{d}{y}{a}{f}{e}{r}' '{d}{e}' '{d}{e}{u}{t}{e}{r}{e}{u}' '{k}{a}{th}{a}{r}{e}{u}' '{p}{l}{e}' '{t}{s}{a}' (<- '{o}{u}{s}') )) ) ) ) define step5i as ( [substring] among ( '{a}{g}{a}' '{a}{g}{e}{s}' '{a}{g}{e}' ( delete unset test1 ([] '{k}{o}{l}{l}' <- '{a}{g}') or ( ([] substring among ( '{ps}{o}{f}' '{n}{a}{u}{l}{o}{ch}' () '{o}{f}' '{p}{e}{l}' '{ch}{o}{r}{t}' '{l}{l}' '{s}{f}' '{r}{p}' '{f}{r}' '{p}{r}' '{l}{o}{ch}' '{s}{m}{i}{n}' (<- '{a}{g}') )) or ([] substring atlimit among ( '{a}{v}{a}{s}{t}' '{p}{o}{l}{u}{f}' '{a}{d}{i}{f}' '{p}{a}{m}{f}' '{r}' '{a}{s}{p}' '{a}{f}' '{a}{m}{a}{l}' '{a}{m}{a}{l}{l}{y}' '{a}{n}{u}{s}{t}' '{a}{p}{e}{r}' '{a}{s}{p}{a}{r}' '{a}{ch}{a}{r}' '{d}{e}{r}{v}{e}{n}' '{d}{r}{o}{s}{o}{p}' '{x}{e}{f}' '{n}{e}{o}{p}' '{n}{o}{m}{o}{t}' '{o}{l}{o}{p}' '{o}{m}{o}{t}' '{p}{r}{o}{s}{t}' '{p}{r}{o}{s}{oo}{p}{o}{p}' '{s}{u}{m}{p}' '{s}{u}{n}{t}' '{t}' '{u}{p}{o}{t}' '{ch}{a}{r}' '{a}{e}{y}{p}' '{a}{y}{m}{o}{s}{t}' '{a}{n}{u}{p}' '{a}{p}{o}{t}' '{a}{r}{t}{y}{p}' '{d}{y}{a}{t}' '{e}{n}' '{e}{p}{y}{t}' '{k}{r}{o}{k}{a}{l}{o}{p}' '{s}{y}{d}{i}{r}{o}{p}' '{l}' '{n}{a}{u}' '{o}{u}{l}{a}{m}' '{o}{u}{r}' '{p}' '{t}{r}' '{m}' (<- '{a}{g}') )) ) ) ) ) define step5j as ( 
[substring] among ( '{i}{s}{e}' '{i}{s}{o}{u}' '{i}{s}{a}' (delete unset test1) ) [] substring atlimit among ( '{n}' '{ch}{e}{r}{s}{o}{n}' '{d}{oo}{d}{e}{k}{a}{n}' '{e}{r}{i}{m}{o}{n}' '{m}{e}{g}{a}{l}{o}{n}' '{e}{p}{t}{a}{n}' (<- '{i}{s}') ) ) define step5k as ( [substring] among ( '{i}{s}{t}{e}' (delete unset test1) ) [] substring atlimit among ( '{a}{s}{v}' '{s}{v}' '{a}{ch}{r}' '{ch}{r}' '{a}{p}{l}' '{a}{e}{y}{m}{n}' '{d}{u}{s}{ch}{r}' '{e}{u}{ch}{r}' '{k}{o}{y}{n}{o}{ch}{r}' '{p}{a}{l}{y}{m}{ps}' (<- '{i}{s}{t}') ) ) define step5l as ( [substring] among ( '{o}{u}{n}{e}' '{i}{s}{o}{u}{n}{e}' '{i}{th}{o}{u}{n}{e}' (delete unset test1) ) [] substring atlimit among ( '{n}' '{r}' '{s}{p}{y}' '{s}{t}{r}{a}{v}{o}{m}{o}{u}{t}{s}' '{k}{a}{k}{o}{m}{o}{u}{t}{s}' '{e}{x}{oo}{n}' (<- '{o}{u}{n}') ) ) define step5m as ( [substring] among ( '{o}{u}{m}{e}' '{i}{s}{o}{u}{m}{e}' '{i}{th}{o}{u}{m}{e}' (delete unset test1) ) [] substring atlimit among ( '{p}{a}{r}{a}{s}{o}{u}{s}' '{f}' '{ch}' '{oo}{r}{y}{o}{p}{l}' '{a}{z}' '{a}{l}{l}{o}{s}{o}{u}{s}' '{a}{s}{o}{u}{s}' (<- '{o}{u}{m}') ) ) define step6 as ( do ( [substring] among ( '{m}{a}{t}{a}' '{m}{a}{t}{oo}{n}' '{m}{a}{t}{o}{s}' (<- '{m}{a}') ) ) test1 [substring] among ( '{a}' '{a}{g}{a}{t}{e}' '{a}{g}{a}{n}' '{a}{e}{y}' '{a}{m}{a}{y}' '{a}{n}' '{a}{s}' '{a}{s}{a}{y}' '{a}{t}{a}{y}' '{a}{oo}' '{e}' '{e}{y}' '{e}{y}{s}' '{e}{y}{t}{e}' '{e}{s}{a}{y}' '{e}{s}' '{e}{t}{a}{y}' '{y}' '{y}{e}{m}{a}{y}' '{y}{e}{m}{a}{s}{t}{e}' '{y}{e}{t}{a}{y}' '{y}{e}{s}{a}{y}' '{y}{e}{s}{a}{s}{t}{e}' '{y}{o}{m}{a}{s}{t}{a}{n}' '{y}{o}{m}{o}{u}{n}' '{y}{o}{m}{o}{u}{n}{a}' '{y}{o}{n}{t}{a}{n}' '{y}{o}{n}{t}{o}{u}{s}{a}{n}' '{y}{o}{s}{a}{s}{t}{a}{n}' '{y}{o}{s}{a}{s}{t}{e}' '{y}{o}{s}{o}{u}{n}' '{y}{o}{s}{o}{u}{n}{a}' '{y}{o}{t}{a}{n}' '{y}{o}{u}{m}{a}' '{y}{o}{u}{m}{a}{s}{t}{e}' '{y}{o}{u}{n}{t}{a}{y}' '{y}{o}{u}{n}{t}{a}{n}' '{i}' '{i}{d}{e}{s}' '{i}{d}{oo}{n}' '{i}{th}{e}{y}' '{i}{th}{e}{y}{s}' '{i}{th}{e}{y}{t}{e}' '{i}{th}{i}{k}{a}{t}{e}' 
'{i}{th}{i}{k}{a}{n}' '{i}{th}{o}{u}{n}' '{i}{th}{oo}' '{i}{k}{a}{t}{e}' '{i}{k}{a}{n}' '{i}{s}' '{i}{s}{a}{n}' '{i}{s}{a}{t}{e}' '{i}{s}{e}{y}' '{i}{s}{e}{s}' '{i}{s}{o}{u}{n}' '{i}{s}{oo}' '{o}' '{o}{y}' '{o}{m}{a}{y}' '{o}{m}{a}{s}{t}{a}{n}' '{o}{m}{o}{u}{n}' '{o}{m}{o}{u}{n}{a}' '{o}{n}{t}{a}{y}' '{o}{n}{t}{a}{n}' '{o}{n}{t}{o}{u}{s}{a}{n}' '{o}{s}' '{o}{s}{a}{s}{t}{a}{n}' '{o}{s}{a}{s}{t}{e}' '{o}{s}{o}{u}{n}' '{o}{s}{o}{u}{n}{a}' '{o}{t}{a}{n}' '{o}{u}' '{o}{u}{m}{a}{y}' '{o}{u}{m}{a}{s}{t}{e}' '{o}{u}{n}' '{o}{u}{n}{t}{a}{y}' '{o}{u}{n}{t}{a}{n}' '{o}{u}{s}' '{o}{u}{s}{a}{n}' '{o}{u}{s}{a}{t}{e}' '{u}' '{u}{s}' '{oo}' '{oo}{n}' (delete) ) ) define step7 as ( [substring] among ( '{e}{s}{t}{e}{r}' '{e}{s}{t}{a}{t}' '{o}{t}{e}{r}' '{o}{t}{a}{t}' '{u}{t}{e}{r}' '{u}{t}{a}{t}' '{oo}{t}{e}{r}' '{oo}{t}{a}{t}' (delete) ) ) ) define stem as ( backwards ( do tolower has_min_length set test1 do step1 do steps1 do steps2 do steps3 do steps4 do steps5 do steps6 do steps7 do steps8 do steps9 do steps10 do step2a do step2b do step2c do step2d do step3 do step4 do step5a do step5b do step5c do step5d do step5e do step5f do step5g do step5h do step5j do step5i do step5k do step5l do step5m do step6 do step7 ) ) snowball-2.2.0/algorithms/hindi.sbl000066400000000000000000000226621414263061200173110ustar00rootroot00000000000000// An implementation of "A Lightweight Stemmer for Hindi": // http://www.kbcs.in/downloads/papers/StmmerHindi.pdf externals ( stem ) stringescapes {} // The transliteration scheme used for our stringdefs matches that used in the // paper, as documented in the appendix. It appears to match the WX notation // (https://en.wikipedia.org/wiki/WX_notation) except that WX apparently // uses 'z' for Anunasika whereas the paper uses Mh. // // We discriminate dependent vowels by adding a leading "_" to their stringdef // names (mnemonic: the _ signifies removing the implicit a from the preceding // character). 
// Vowels and sonorants: stringdef a '{U+0905}' stringdef A '{U+0906}' stringdef i '{U+0907}' stringdef I '{U+0908}' stringdef u '{U+0909}' stringdef U '{U+090A}' stringdef q '{U+090B}' stringdef e '{U+090F}' stringdef E '{U+0910}' stringdef o '{U+0913}' stringdef O '{U+0914}' // Vowel signs: stringdef _A '{U+093E}' stringdef _i '{U+093F}' stringdef _I '{U+0940}' stringdef _u '{U+0941}' stringdef _U '{U+0942}' stringdef _q '{U+0943}' stringdef _e '{U+0947}' stringdef _E '{U+0948}' stringdef _o '{U+094B}' stringdef _O '{U+094C}' // Diacritics: stringdef M '{U+0902}' stringdef H '{U+0903}' stringdef Mh '{U+0901}' stringdef Z '{U+093C}' // Nukta stringdef virama '{U+094D}' // Velar consonants: stringdef k '{U+0915}' stringdef K '{U+0916}' stringdef g '{U+0917}' stringdef G '{U+0918}' stringdef f '{U+0919}' // Palatal consonants: stringdef c '{U+091A}' stringdef C '{U+091B}' stringdef j '{U+091C}' stringdef J '{U+091D}' stringdef F '{U+091E}' // Retroflex consonants: stringdef t '{U+091F}' stringdef T '{U+0920}' stringdef d '{U+0921}' stringdef D '{U+0922}' stringdef N '{U+0923}' // Dental consonants: stringdef w '{U+0924}' stringdef W '{U+0925}' stringdef x '{U+0926}' stringdef X '{U+0927}' stringdef n '{U+0928}' // Labial consonants: stringdef p '{U+092A}' stringdef P '{U+092B}' stringdef b '{U+092C}' stringdef B '{U+092D}' stringdef m '{U+092E}' // Semi-vowels: stringdef y '{U+092F}' stringdef r '{U+0930}' stringdef l '{U+0932}' stringdef v '{U+0935}' // Fricatives: stringdef S '{U+0936}' stringdef R '{U+0937}' stringdef s '{U+0938}' stringdef h '{U+0939}' stringdef lY '{U+0933}' // Precomposed characters - letters + nukta: stringdef nZ '{U+0929}' // ≡ {n}{Z} stringdef rZ '{U+0931}' // ≡ {r}{Z} stringdef lYZ '{U+0934}' // ≡ {lY}{Z} stringdef kZ '{U+0958}' // ≡ {k}{Z} stringdef KZ '{U+0959}' // ≡ {K}{Z} stringdef gZ '{U+095A}' // ≡ {g}{Z} stringdef jZ '{U+095B}' // ≡ {j}{Z} stringdef dZ '{U+095C}' // ≡ {d}{Z} stringdef DZ '{U+095D}' // ≡ {D}{Z} stringdef PZ 
'{U+095E}' // ≡ {P}{Z} stringdef yZ '{U+095F}' // ≡ {y}{Z} groupings ( consonant ) routines ( CONSONANT ) define consonant '{k}{K}{g}{G}{f}' + '{c}{C}{j}{J}{F}' + '{t}{T}{d}{D}{N}' + '{w}{W}{x}{X}{n}' + '{p}{P}{b}{B}{m}' + '{y}{r}{l}{v}' + '{S}{R}{s}{h}' + '{lY}' + '{Z}' + // Nukta // Precomposed characters - letter and nukta: '{nZ}{rZ}{lYZ}{kZ}{KZ}{gZ}{jZ}{dZ}{DZ}{PZ}{yZ}' backwardmode ( define CONSONANT as ( consonant ) ) define stem as ( // We assume in this implementation that the whole word doesn't count // as a valid suffix to remove, so we remove the longest suffix from // the list which leaves at least one character. This change affects // 47 words out of the 65,140 in the sample vocabulary from Hindi // wikipedia. // // The trick here is we use `next` in forward mode to advance the cursor // to the second character, then `backwards` swaps the cursor and limit. next backwards ( [substring] among ( // The list below is derived from figure 3 in the paper. // // We perform the stemming on the Devanagari characters rather than // transliterating to Latin, so we have adapted the list below to // reflect this by converting suffixes back to Devanagari as // follows: // // * within the suffixes, "a" after a consonant is dropped since // consonants have an implicit "a". // // * within the suffixes, a vowel other than "a" after a consonant // is a dependent vowel (vowel sign); a vowel (including "a") // after a non-consonant is an independent vowel. // // * to allow the vowel at the start of each suffix being dependent // or independent, we include each suffix twice. For the // dependent version, a leading "a" is dropped and we check that // the suffix is preceded by a consonant (which will have an // implicit "a"). 
// // * we add '{a}', which is needed for the example given right at // the end of section 5 to work (conflating BarawIya and // BarawIyawA), and which 3.1 a.v strongly suggests should be in // the list: // // Thus, the following suffix deletions (longest possible // match) are required to reduce inflected forms of masculine // nouns to a common stem: // a A i [...] // // Adding '{a}' only affect 2 words out of the 65,140 in the // sample vocabulary. // // * The transliterations of our stems would end with "a" when our // stems end in a consonant, so we also include {virama} in the // list of suffixes to remove (this affects 222 words from the // sample vocabulary). // // We've also assumed that Mh in the suffix list always means {Mh} // and never {M}{h}{virama}. Only one of the 65,140 words in the // sample vocabulary stems differently due to this (and that word // seems to be a typo). '{virama}' '{a}' '{A}' '{i}' '{I}' '{u}' '{U}' '{e}' '{o}' '{e}{M}' '{o}{M}' '{A}{M}' '{u}{A}{M}' '{u}{e}{M}' '{u}{o}{M}' '{A}{e}{M}' '{A}{o}{M}' '{i}{y}{_A}{M}' '{i}{y}{_o}{M}' '{A}{i}{y}{_A}{M}' '{A}{i}{y}{_o}{M}' '{A}{Mh}' '{i}{y}{_A}{Mh}' '{A}{i}{y}{_A}{Mh}' '{a}{w}{_A}{e}{M}' '{a}{w}{_A}{o}{M}' '{a}{n}{_A}{e}{M}' '{a}{n}{_A}{o}{M}' '{a}{w}{_A}' '{a}{w}{_I}' '{I}{M}' '{a}{w}{_I}{M}' '{a}{w}{_e}' '{A}{w}{_A}' '{A}{w}{_I}' '{A}{w}{_I}{M}' '{A}{w}{_e}' '{a}{n}{_A}' '{a}{n}{_I}' '{a}{n}{_e}' '{A}{n}{_A}' '{A}{n}{_e}' '{U}{M}{g}{_A}' '{U}{M}{g}{_I}' '{A}{U}{M}{g}{_A}' '{A}{U}{M}{g}{_I}' '{e}{M}{g}{_e}' '{e}{M}{g}{_I}' '{A}{e}{M}{g}{_e}' '{A}{e}{M}{g}{_I}' '{o}{g}{_e}' '{o}{g}{_I}' '{A}{o}{g}{_e}' '{A}{o}{g}{_I}' '{e}{g}{_A}' '{e}{g}{_I}' '{A}{e}{g}{_A}' '{A}{e}{g}{_I}' '{A}{y}{_A}' '{A}{e}' '{A}{I}' '{A}{I}{M}' '{i}{e}' '{A}{o}' '{A}{i}{e}' '{a}{k}{r}' '{A}{k}{r}' '{_A}' '{_i}' '{_I}' '{_u}' '{_U}' '{_e}' '{_o}' '{_e}{M}' '{_o}{M}' '{_A}{M}' '{_u}{A}{M}' '{_u}{e}{M}' '{_u}{o}{M}' '{_A}{e}{M}' '{_A}{o}{M}' '{_i}{y}{_A}{M}' '{_i}{y}{_o}{M}' '{_A}{i}{y}{_A}{M}' '{_A}{i}{y}{_o}{M}' 
'{_A}{Mh}' '{_i}{y}{_A}{Mh}' '{_A}{i}{y}{_A}{Mh}' '{_I}{M}' '{_A}{w}{_A}' '{_A}{w}{_I}' '{_A}{w}{_I}{M}' '{_A}{w}{_e}' '{_A}{n}{_A}' '{_A}{n}{_e}' '{_U}{M}{g}{_A}' '{_U}{M}{g}{_I}' '{_A}{U}{M}{g}{_A}' '{_A}{U}{M}{g}{_I}' '{_e}{M}{g}{_e}' '{_e}{M}{g}{_I}' '{_A}{e}{M}{g}{_e}' '{_A}{e}{M}{g}{_I}' '{_o}{g}{_e}' '{_o}{g}{_I}' '{_A}{o}{g}{_e}' '{_A}{o}{g}{_I}' '{_e}{g}{_A}' '{_e}{g}{_I}' '{_A}{e}{g}{_A}' '{_A}{e}{g}{_I}' '{_A}{y}{_A}' '{_A}{e}' '{_A}{I}' '{_A}{I}{M}' '{_i}{e}' '{_A}{o}' '{_A}{i}{e}' '{_A}{k}{r}' /* Suffixes with a leading implicit a: */ '{w}{_A}{e}{M}' CONSONANT '{w}{_A}{o}{M}' CONSONANT '{n}{_A}{e}{M}' CONSONANT '{n}{_A}{o}{M}' CONSONANT '{w}{_A}' CONSONANT '{w}{_I}' CONSONANT '{w}{_I}{M}' CONSONANT '{w}{_e}' CONSONANT '{n}{_A}' CONSONANT '{n}{_I}' CONSONANT '{n}{_e}' CONSONANT '{k}{r}' CONSONANT ) delete ) ) snowball-2.2.0/algorithms/hungarian.sbl000066400000000000000000000122561414263061200201700ustar00rootroot00000000000000/* Hungarian Stemmer Removes noun inflections */ routines ( mark_regions R1 v_ending case case_special case_other plural owned sing_owner plur_owner instrum factive undouble double ) externals ( stem ) integers ( p1 ) groupings ( v ) stringescapes {} /* special characters */ stringdef a' '{U+00E1}' //a-acute stringdef e' '{U+00E9}' //e-acute stringdef i' '{U+00ED}' //i-acute stringdef o' '{U+00F3}' //o-acute stringdef o" '{U+00F6}' //o-umlaut stringdef oq '{U+0151}' //o-double acute stringdef u' '{U+00FA}' //u-acute stringdef u" '{U+00FC}' //u-umlaut stringdef uq '{U+0171}' //u-double acute define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}' define mark_regions as ( $p1 = limit (v goto non-v among('cs' 'gy' 'ly' 'ny' 'sz' 'ty' 'zs' 'dzs') or next setmark p1) or (non-v gopast v setmark p1) ) backwardmode ( define R1 as $p1 <= cursor define v_ending as ( [substring] R1 among( '{a'}' (<- 'a') '{e'}' (<- 'e') ) ) define double as ( test among('bb' 'cc' 'ccs' 'dd' 'ff' 'gg' 'ggy' 'jj' 'kk' 'll' 'lly' 'mm' 'nn' 'nny' 'pp' 'rr' 'ss' 
'ssz' 'tt' 'tty' 'vv' 'zz' 'zzs') ) define undouble as ( next [hop 1] delete ) define instrum as( [substring] R1 among( 'al' (double) 'el' (double) ) delete undouble ) define case as ( [substring] R1 among( 'ban' 'ben' 'ba' 'be' 'ra' 're' 'nak' 'nek' 'val' 'vel' 't{o'}l' 't{oq}l' 'r{o'}l' 'r{oq}l' 'b{o'}l' 'b{oq}l' 'hoz' 'hez' 'h{o"}z' 'n{a'}l' 'n{e'}l' 'ig' 'at' 'et' 'ot' '{o"}t' '{e'}rt' 'k{e'}pp' 'k{e'}ppen' 'kor' 'ul' '{u"}l' 'v{a'}' 'v{e'}' 'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt' 'k{e'}nt' 'en' 'on' 'an' '{o"}n' 'n' 't' ) delete v_ending ) define case_special as( [substring] R1 among( '{e'}n' (<- 'e') '{a'}n' (<- 'a') '{a'}nk{e'}nt' (<- 'a') ) ) define case_other as( [substring] R1 among( 'astul' 'est{u"}l' (delete) 'stul' 'st{u"}l' (delete) '{a'}stul' (<- 'a') '{e'}st{u"}l' (<- 'e') ) ) define factive as( [substring] R1 among( '{a'}' (double) '{e'}' (double) ) delete undouble ) define plural as ( [substring] R1 among( '{a'}k' (<- 'a') '{e'}k' (<- 'e') '{o"}k' (delete) 'ak' (delete) 'ok' (delete) 'ek' (delete) 'k' (delete) ) ) define owned as ( [substring] R1 among ( 'ok{e'}' '{o"}k{e'}' 'ak{e'}' 'ek{e'}' (delete) '{e'}k{e'}' (<- 'e') '{a'}k{e'}' (<- 'a') 'k{e'}' (delete) '{e'}{e'}i' (<- 'e') '{a'}{e'}i' (<- 'a') '{e'}i' (delete) '{e'}{e'}' (<- 'e') '{e'}' (delete) ) ) define sing_owner as ( [substring] R1 among( '{u"}nk' 'unk' (delete) '{a'}nk' (<- 'a') '{e'}nk' (<- 'e') 'nk' (delete) '{a'}juk' (<- 'a') '{e'}j{u"}k' (<- 'e') 'juk' 'j{u"}k' (delete) 'uk' '{u"}k' (delete) 'em' 'om' 'am' (delete) '{a'}m' (<- 'a') '{e'}m' (<- 'e') 'm' (delete) 'od' 'ed' 'ad' '{o"}d' (delete) '{a'}d' (<- 'a') '{e'}d' (<- 'e') 'd' (delete) 'ja' 'je' (delete) 'a' 'e' 'o' (delete) '{a'}' (<- 'a') '{e'}' (<- 'e') ) ) define plur_owner as ( [substring] R1 among( 'jaim' 'jeim' (delete) '{a'}im' (<- 'a') '{e'}im' (<- 'e') 'aim' 'eim' (delete) 'im' (delete) 'jaid' 'jeid' (delete) '{a'}id' (<- 'a') '{e'}id' (<- 'e') 'aid' 'eid' (delete) 'id' (delete) 'jai' 'jei' (delete) '{a'}i' (<- 'a') 
'{e'}i' (<- 'e') 'ai' 'ei' (delete) 'i' (delete) 'jaink' 'jeink' (delete) 'eink' 'aink' (delete) '{a'}ink' (<- 'a') '{e'}ink' (<- 'e') 'ink' 'jaitok' 'jeitek' (delete) 'aitok' 'eitek' (delete) '{a'}itok' (<- 'a') '{e'}itek' (<- 'e') 'itek' (delete) 'jeik' 'jaik' (delete) 'aik' 'eik' (delete) '{a'}ik' (<- 'a') '{e'}ik' (<- 'e') 'ik' (delete) ) ) ) define stem as ( do mark_regions backwards ( do instrum do case do case_special do case_other do factive do owned do sing_owner do plur_owner do plural ) ) snowball-2.2.0/algorithms/indonesian.sbl000066400000000000000000000157261414263061200203500ustar00rootroot00000000000000// An implementation of the "Porter Stemmer for Bahasa Indonesia" from: // http://www.illc.uva.nl/Research/Publications/Reports/MoL-2003-02.text.pdf integers ( // The paper defines measure as the number of vowels in the word. We // count this initially, then adjust the count each time we remove a // prefix or suffix. measure // Numeric code for the type of prefix removed: // // 0 other/none // 1 'di' or 'meng' or 'ter' // 2 'per' // 3 'ke' or 'peng' // 4 'ber' // // Some of these have variant forms, so e.g. "meng" includes "men", "me", // "meny", "mem". // // Note that the value of prefix is only used in remove_suffix (and // routines it calls) so we don't need to worry about // remove_second_order_prefix overwriting a value of prefix set by // remove_first_order_prefix since remove_suffix gets called between // the two. 
prefix )

groupings ( vowel )

routines (
    remove_particle
    remove_possessive_pronoun
    remove_first_order_prefix
    remove_second_order_prefix
    remove_suffix
    KER
    SUFFIX_KAN_OK
    SUFFIX_AN_OK
    SUFFIX_I_OK
    VOWEL
)

externals ( stem )

stringescapes {}

backwardmode (

    // Strip a trailing 'kah', 'lah' or 'pun'.  Each of these contains one
    // vowel, so measure is decremented once.
    define remove_particle as (
        [substring] among (
            'kah' 'lah' 'pun' (delete $measure-=1)
        )
    )

    // Strip a trailing 'ku', 'mu' or 'nya'; again one vowel is removed, so
    // measure is decremented once.
    define remove_possessive_pronoun as (
        [substring] among (
            'ku' 'mu' 'nya' (delete $measure-=1)
        )
    )

    // prefix not in {ke, peng, per}
    define SUFFIX_KAN_OK as (
        // On page 29, the example "kompas Q.31" says "Both Nazief and Porter
        // stemmer converted the word peledakan (blast, explotion) to ledak (to
        // blast, to explode)". However, the algorithm as described doesn't
        // behave in this way - grammatically the prefix pe- occurs as a
        // variation of both the first-order derivational prefix peng- and the
        // second-order derivational prefix per-, but table 2.5 doesn't include
        // "pe", only table 2.6 does, so "peledakan" is handled (incorrectly)
        // as having prefix "per" not "peng", and so we remove derivational
        // suffix "kan" rather than "an" to give stem leda. (Porter-style
        // stemmers remove the longest suffix they can amongst those available,
        // which this paper notes in the last paragraph on page 15).
        //
        // We resolve this by amending the condition on suffix "kan" to
        // "prefix ∉ {ke, peng, per}", which seems to make the stemmer's
        // behaviour match all the examples in the paper except for one:
        // "perbaikan" is shown in table 3.4 as stemming to "bai", but with
        // this change it now stems to "baik". The table notes that "baik" is
        // the actual root so this deviation is an improvement. In a sample
        // vocabulary derived from the most common words in id.wikipedia.org,
        // this change only affects 0.12% of words (76 out of 64,587, including
        // "peledakan" and "perbaikan").
        $prefix != 3 and $prefix != 2
    )

    // prefix not in {di, meng, ter}
    define SUFFIX_AN_OK as ( $prefix != 1 )

    define SUFFIX_I_OK as (
        // prefix not in {ke, peng, ber}
        $prefix <= 2

        // The rest of the condition from the paper is:
        //   V|K...c₁c₁, c₁ ≠ s, c₂ ≠ i
        //
        // The meaning of this is unclear in several ways, and none of the
        // examples given of the stemmer's behaviour in the paper help to
        // resolve these issues.
        //
        // Notice that c₂ isn't actually used - the most obvious explanation
        // seems to be that "c₁c₁" should read "c₁c₂", or maybe "c₂c₁".
        //
        // Elsewhere the paper defines V... as meaning "the stem starts with
        // a vowel" and K... as meaning "the stem starts with a consonant".
        //
        // In other places where it says X|Y... it seems the | binds more
        // tightly, so it's (V|K)...cᵢcⱼ not V|(K...cᵢcⱼ).  That seems a bit
        // odd as the first letter must be either a vowel or a consonant, so
        // that really just means "ends cᵢcⱼ".  However, nowhere in the paper
        // uses or defines a notation such as ...X, which may explain this
        // seemingly redundant way of specifying this.
        //
        // The conditions elsewhere on prefix removal (e.g. V...) are clearly
        // on the stem left after the prefix is removed.  None of the other
        // rules for suffix removal have conditions on the stem, but for
        // consistency with the prefix rules we might expect that the cᵢcⱼ
        // test is on what's left *after* removing the "i" suffix.
        //
        // However, studying Indonesian wordlists and discussion with a native
        // speaker leads us to conclude that the purpose of this check is to
        // protect words of foreign origin (e.g. "televisi", "organisasi",
        // "komunikasi") from stemming, and the common feature of these is
        // that the word ends "-si", so we conclude that the condition here
        // should be read as "word does not end -si", and this is what we
        // have implemented.
not 's' ) define remove_suffix as ( [substring] among ( 'kan' SUFFIX_KAN_OK 'an' SUFFIX_AN_OK 'i' SUFFIX_I_OK (delete $measure-=1) ) ) ) define vowel 'aeiou' define VOWEL as ( vowel ) define KER as ( non-vowel 'er' ) define remove_first_order_prefix as ( [substring] among ( 'di' 'meng' 'men' 'me' 'ter' (delete $prefix=1 $measure-=1) 'ke' 'peng' 'pen' (delete $prefix=3 $measure-=1) 'meny' VOWEL ($prefix=1 <-'s' $measure-=1) 'peny' VOWEL ($prefix=3 <-'s' $measure-=1) 'mem' ($prefix=1 $measure-=1 vowel and <-'p' or delete) 'pem' ($prefix=3 $measure-=1 vowel and <-'p' or delete) ) ) define remove_second_order_prefix as ( // The paper has the condition on removal of prefix "bel" and "pel" as // just "ajar" not "ajar..." but it seems that the latter must be what // is intended so that e.g. "pelajaran" stems to "ajar" not "lajar". // This change only affects a very small number of words (11 out of // 64,587) and only for the better. [substring] among ( 'per' 'pe' (delete $prefix=2 $measure-=1) 'pelajar' (<-'ajar' $measure-=1) 'ber' (delete $prefix=4 $measure-=1) 'belajar' (<-'ajar' $prefix=4 $measure-=1) 'be' KER (delete $prefix=4 $measure-=1) ) ) define stem as ( $measure = 0 do ( repeat ( gopast vowel $measure+=1 ) ) $measure > 2 $prefix = 0 backwards ( do remove_particle $measure > 2 do remove_possessive_pronoun ) $measure > 2 test ( remove_first_order_prefix do ( test ($measure > 2 backwards remove_suffix) $measure > 2 remove_second_order_prefix ) ) or ( do remove_second_order_prefix do ($measure > 2 backwards remove_suffix) ) ) snowball-2.2.0/algorithms/irish.sbl000066400000000000000000000047471414263061200173400ustar00rootroot00000000000000routines ( R1 R2 RV initial_morph mark_regions noun_sfx deriv verb_sfx ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v ) stringescapes {} /* Accented characters */ stringdef a' '{U+00E1}' // a-acute stringdef e' '{U+00E9}' // e-acute stringdef i' '{U+00ED}' // i-acute stringdef o' '{U+00F3}' // o-acute stringdef u' 
'{U+00FA}' // u-acute define v 'aeiou{a'}{e'}{i'}{o'}{u'}' define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( gopast v setmark pV gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define initial_morph as ( [substring] among ( 'h-' 'n-' 't-' //nAthair -> n-athair, but alone are problematic (delete) // verbs 'd{'}' (delete) 'd{'}fh' (<- 'f') // other contractions 'm{'}' 'b{'}' (delete) 'sh' (<- 's') 'mb' (<- 'b') 'gc' (<- 'c') 'nd' (<- 'd') 'bhf' (<- 'f') 'ng' (<- 'g') 'bp' (<- 'p') 'ts' (<- 's') 'dt' (<- 't') // Lenition 'bh' (<- 'b') 'ch' (<- 'c') 'dh' (<- 'd') 'fh' (<- 'f') 'gh' (<- 'g') 'mh' (<- 'm') 'ph' (<- 'p') 'th' (<- 't') ) ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define noun_sfx as ( [substring] among ( 'amh' 'eamh' 'abh' 'eabh' 'aibh' 'ibh' 'aimh' 'imh' 'a{i'}ocht' '{i'}ocht' 'a{i'}ochta' '{i'}ochta' (R1 delete) 'ire' 'ir{i'}' 'aire' 'air{i'}' (R2 delete) ) ) define deriv as ( [substring] among ( 'acht' 'eacht' 'ach' 'each' 'eacht{u'}il' 'eachta' 'acht{u'}il' 'achta' (R2 delete) //siopadóireacht -> siopadóir but not poblacht -> pobl 'arcacht' 'arcachta{i'}' 'arcachta' (<- 'arc') // monarcacht -> monarc 'gineach' 'gineas' 'ginis' (<- 'gin') 'grafa{i'}och' 'grafa{i'}ocht' 'grafa{i'}ochta' 'grafa{i'}ochta{i'}' (<- 'graf') 'paite' 'patach' 'pataigh' 'patacha' (<- 'paite') '{o'}ideach' '{o'}ideacha' '{o'}idigh' (<- '{o'}id') ) ) define verb_sfx as ( [substring] among ( 'imid' 'aimid' '{i'}mid' 'a{i'}mid' 'faidh' 'fidh' (RV delete) 'ain' 'eadh' 'adh' '{a'}il' 'tear' 'tar' (R1 delete) ) ) ) define stem as ( do initial_morph do mark_regions backwards ( do noun_sfx do deriv do verb_sfx ) ) snowball-2.2.0/algorithms/italian.sbl000066400000000000000000000114431414263061200176320ustar00rootroot00000000000000 routines ( prelude postlude mark_regions RV R1 R2 attached_pronoun standard_suffix verb_suffix vowel_suffix ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v 
AEIO CG ) stringescapes {} /* special characters */ stringdef a' '{U+00E1}' stringdef a` '{U+00E0}' stringdef e' '{U+00E9}' stringdef e` '{U+00E8}' stringdef i' '{U+00ED}' stringdef i` '{U+00EC}' stringdef o' '{U+00F3}' stringdef o` '{U+00F2}' stringdef u' '{U+00FA}' stringdef u` '{U+00F9}' define v 'aeiou{a`}{e`}{i`}{o`}{u`}' define prelude as ( test repeat ( [substring] among( '{a'}' (<- '{a`}') '{e'}' (<- '{e`}') '{i'}' (<- '{i`}') '{o'}' (<- '{o`}') '{u'}' (<- '{u`}') 'qu' (<- 'qU') '' (next) ) ) repeat goto ( v [ ('u' ] v <- 'U') or ('i' ] v <- 'I') ) ) define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v (non-v gopast v) or (v gopast non-v) ) or ( non-v (non-v gopast v) or (v next) ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define postlude as repeat ( [substring] among( 'I' (<- 'i') 'U' (<- 'u') '' (next) ) ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define attached_pronoun as ( [substring] among( 'ci' 'gli' 'la' 'le' 'li' 'lo' 'mi' 'ne' 'si' 'ti' 'vi' // the compound forms are: 'sene' 'gliela' 'gliele' 'glieli' 'glielo' 'gliene' 'mela' 'mele' 'meli' 'melo' 'mene' 'tela' 'tele' 'teli' 'telo' 'tene' 'cela' 'cele' 'celi' 'celo' 'cene' 'vela' 'vele' 'veli' 'velo' 'vene' ) among( (RV) 'ando' 'endo' (delete) 'ar' 'er' 'ir' (<- 'e') ) ) define standard_suffix as ( [substring] among( 'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo' 'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti' 'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente' 'atrice' 'atrici' 'ante' 'anti' // Note 1 ( R2 delete ) 'azione' 'azioni' 'atore' 'atori' ( R2 delete try ( ['ic'] R2 delete ) ) 'logia' 'logie' ( R2 <- 'log' ) 'uzione' 'uzioni' 'usione' 'usioni' ( R2 <- 'u' ) 'enza' 'enze' ( R2 <- 'ente' ) 'amento' 'amenti' 'imento' 'imenti' ( RV delete ) 'amente' ( R1 delete try ( [substring] R2 delete among( 'iv' ( ['at'] R2 delete ) 'os' 'ic' 
'abil' ) ) ) 'it{a`}' ( R2 delete try ( [substring] among( 'abil' 'ic' 'iv' (R2 delete) ) ) ) 'ivo' 'ivi' 'iva' 'ive' ( R2 delete try ( ['at'] R2 delete ['ic'] R2 delete ) ) ) ) define verb_suffix as setlimit tomark pV for ( [substring] among( 'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi' 'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate' 'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai' 'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo' 'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete' 'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo' 'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei' 'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono' 'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita' 'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo' 'ono' 'uta' 'ute' 'uti' 'uto' 'ar' 'ir' // but 'er' is problematical (delete) ) ) define AEIO 'aeio{a`}{e`}{i`}{o`}' define CG 'cg' define vowel_suffix as ( try ( [AEIO] RV delete ['i'] RV delete ) try ( ['h'] CG RV delete ) ) ) define stem as ( do prelude do mark_regions backwards ( do attached_pronoun do (standard_suffix or verb_suffix) do vowel_suffix ) do postlude ) /* Note 1: additions of 15 Jun 2005 */ snowball-2.2.0/algorithms/kraaij_pohlmann.sbl000066400000000000000000000130211414263061200213400ustar00rootroot00000000000000strings ( ch ) integers ( p1 p2 ) booleans ( Y_found stemmed GE_removed ) routines ( R1 R2 C V VX lengthen_V Step_1 Step_2 Step_3 Step_4 Step_7 Step_6 Step_1c Lose_prefix Lose_infix measure ) externals ( stem ) groupings ( v v_WX AOU AIOU ) stringescapes {} define v 'aeiouy' define v_WX v + 'wx' define AOU 'aou' define AIOU 'aiou' backwardmode ( define R1 as ($p1 <= cursor) define R2 as ($p2 <= cursor) define V as test (v or 'ij') define VX as test (next v or 'ij') define C as test (not 'ij' non-v) define lengthen_V as do ( non-v_WX [ (AOU] test (non-v or atlimit)) or ('e'] test (non-v or atlimit not AIOU not 
(next AIOU non-v))) ->ch insert ch )

    // Step_1: `[substring] among` picks the longest listed ending at the end
    // of the word and runs the command group attached to it.  R1 requires the
    // cursor to be at or beyond mark p1; C and V are the non-consuming tests
    // for a preceding consonant / vowel-or-'ij' defined earlier in this
    // section.  Inside the `or` chains an alternative that matches extra
    // letters and then executes `]` widens the slice to include them, so the
    // following delete or `<-` acts on ending plus those letters (for
    // instance 'hed' + 'en' is rewritten to 'heid').  Alternatives are tried
    // left to right; the first one that succeeds wins.
    define Step_1 as
    (
        [substring] among (

            '{'}s' (delete)
            's'    (R1 not ('t' R1) C delete)
            'ies'  (R1 <-'ie')
            'es'
                   (('ar' R1 C ] delete lengthen_V) or
                    ('er' R1 C ] delete) or
                    (R1 C <-'e'))

            'aus'  (R1 V <-'au')
            'en'   (('hed' R1 ] <-'heid') or
                    ('nd' delete) or
                    ('d' R1 C ] delete) or
                    ('i' or 'j' V delete) or
                    (R1 C delete lengthen_V))
            'nde'  (<-'nd')
        )
    )

    // Step_2: endings that finish in 'e'.  For 'je' a chain of alternatives
    // inspects the letters in front of it (the `]` again moves the slice
    // boundary to the current cursor); the last alternative is the plain
    // "in R1, after a consonant" deletion.  `delete attach '...'` removes
    // the ending and re-inserts the given letters before lengthen_V runs.
    define Step_2 as
    (
        [substring] among (
            'je'   (('{'}t' ] delete) or
                    ('et'   ] R1 C delete) or
                    ('rnt'  ] <-'rn') or
                    ('t'    ] R1 VX delete) or
                    ('ink'  ] <-'ing') or
                    ('mp'   ] <-'m') or
                    ('{'}'  ] R1 delete) or
                    (] R1 C delete))
            'ge'   (R1 <-'g')
            'lijke'(R1 <-'lijk')
            'ische'(R1 <-'isch')
            'de'   (R1 C delete)
            'te'   (R1 <-'t')
            'se'   (R1 <-'s')
            're'   (R1 <-'r')
            'le'   (R1 delete attach 'l' lengthen_V)
            'ene'  (R1 C delete attach 'en' lengthen_V)
            'ieve' (R1 C <-'ief')
        )
    )

    // Step_3: a further table of endings.  Most entries require R1 ('fie'
    // and 'gie' require R2); 'rder' is rewritten to 'r' unconditionally.
    define Step_3 as
    (
        [substring] among (
            'atie'  (R1 <-'eer')
            'iteit' (R1 delete lengthen_V)
            'heid'
            'sel'
            'ster'  (R1 delete)
            'rder'  (<-'r')
            'ing'
            'isme'
            'erij'  (R1 delete lengthen_V)
            'arij'  (R1 C <-'aar')
            'fie'   (R2 delete attach 'f' lengthen_V)
            'gie'   (R2 delete attach 'g' lengthen_V)
            'tst'   (R1 C <-'t')
            'dst'   (R1 C <-'d')
        )
    )

    // Step_4: two tables joined by `or`.  The second one ('iger', 'igst',
    // 'ig') is consulted only when the first table does not succeed.
    define Step_4 as
    (
        (   [substring] among (
                'ioneel'  (R1 <-'ie')
                'atief'   (R1 <-'eer')
                'baar'    (R1 delete)

                'naar'    (R1 V <-'n')
                'laar'    (R1 V <-'l')
                'raar'    (R1 V <-'r')

                'tant'    (R1 <-'teer')

                'lijker'
                'lijkst'  (R1 <-'lijk')

                'achtig'
                'achtiger'
                'achtigst'(R1 delete)

                'eriger'
                'erigst'
                'erig'
                'end'     (R1 C delete lengthen_V)
            )
        )
        or
        (   [substring] among (
                'iger'
                'igst'
                'ig'      (R1 C delete lengthen_V)
            )
        )
    )

    // Step_7: drop a final 't' after 'k', 'f' or 'p'.  No region test.
    define Step_7 as
    (
        [substring] among (
            'kt'   (<-'k')
            'ft'   (<-'f')
            'pt'   (<-'p')
        )
    )

    // Step_6: collapse a doubled final consonant to a single letter, and
    // rewrite a single final 'v' to 'f' and 'z' to 's'.  No region test.
    define Step_6 as
    (
        [substring] among (
            'bb'   (<-'b')
            'cc'   (<-'c')
            'dd'   (<-'d')
            'ff'   (<-'f')
            'gg'   (<-'g')
            'hh'   (<-'h')
            'jj'   (<-'j')
            'kk'   (<-'k')
            'll'   (<-'l')
            'mm'   (<-'m')
            'nn'   (<-'n')
            'pp'   (<-'p')
            'qq'   (<-'q')
            'rr'   (<-'r')
            'ss'   (<-'s')
            'tt'   (<-'t')
            'vv'   (<-'v')
            'ww'   (<-'w')
            'xx'   (<-'x')
            'zz'   (<-'z')
            'v'    (<-'f')
            'z'    (<-'s')
        )
    )

    define Step_1c as
    (
        [substring] among ( (R1 C)
'd' (not ('n' R1) delete) 't' (not ('h' R1) delete) ) ) ) define Lose_prefix as ( ['ge'] test hop 3 (goto v goto non-v) set GE_removed delete ) define Lose_infix as ( next gopast (['ge']) test hop 3 (goto v goto non-v) set GE_removed delete ) define measure as ( $p1 = limit $p2 = limit do( repeat non-v atleast 1 ('ij' or v) non-v setmark p1 repeat non-v atleast 1 ('ij' or v) non-v setmark p2 ) ) define stem as ( unset Y_found unset stemmed do ( ['y'] <-'Y' set Y_found ) do repeat(goto (v ['y'])<-'Y' set Y_found ) measure backwards ( do (Step_1 set stemmed ) do (Step_2 set stemmed ) do (Step_3 set stemmed ) do (Step_4 set stemmed ) ) unset GE_removed do (Lose_prefix and measure) backwards ( do (GE_removed Step_1c) ) unset GE_removed do (Lose_infix and measure) backwards ( do (GE_removed Step_1c) ) backwards ( do (Step_7 set stemmed ) do (stemmed or GE_removed Step_6) ) do(Y_found repeat(goto (['Y']) <-'y')) ) snowball-2.2.0/algorithms/lithuanian.sbl000066400000000000000000000317231414263061200203500ustar00rootroot00000000000000externals ( stem ) // escape symbols for substituting lithuanian characters stringescapes { } /* Special characters in Unicode Latin Extended-A */ // ' nosine stringdef a' '{U+0105}' // ą a + ogonek stringdef e' '{U+0119}' // ę e + ogonek stringdef i' '{U+012F}' // į i + ogonek stringdef u' '{U+0173}' // ų u + ogonek // . taskas stringdef e. 
'{U+0117}' // ė e + dot // - ilgoji stringdef u- '{U+016B}' // ū u + macron // * varnele stringdef c* '{U+010D}' // č c + caron (haček) stringdef s* '{U+0161}' // š s + caron (haček) stringdef z* '{U+017E}' // ž z + caron (haček) // [C](VC)^m[V|C] // definitions of variables for // p1 - position of m = 0 integers ( p1 ) // groupings // v - lithuanian vowels groupings ( v ) // v - all lithuanian vowels define v 'aeiyou{a'}{e'}{i'}{u'}{e.}{u-}' // all lithuanian stemmer routines: 4 steps routines ( step2 R1 step1 fix_chdz fix_gd fix_conflicts ) backwardmode ( define R1 as $p1 <= cursor define step1 as ( setlimit tomark p1 for ([substring]) R1 among ( // Daiktavardžiai (Nouns) // I linksniuotė (declension I) 'as' 'ias' 'is' 'ys' // vyras, kelias, brolis, gaidys 'o' 'io' // vyro, kelio 'ui' 'iui' // vyrui, keliui '{a'}' 'i{a'}' '{i'}' // vyrą, kelią, brolį 'u' 'iu' // vyru, keliu 'e' 'yje' // vyre, kelyje 'y' 'au' 'i' // kely, brolau, broli, 'an' // nusižengiman 'ai' 'iai' // vyrai, keliai '{u'}' 'i{u'}' // vyrų, kelių 'ams' 'am' // vyrams, vyram 'iams' 'iam' // broliams, broliam 'us' 'ius' // vyrus, brolius 'ais' 'iais' // vyrais, keliais 'uose' 'iuose' 'uos' 'iuos' // vyruose, keliuose, vyruos, keliuos 'uosna' 'iuosna' // vyruosna, keliuosna 'ysna' // žutysna 'asis' 'aisi' // sukimasis, sukimaisi 'osi' '{u'}si' // sukimosi, sukimųsi 'uisi' // sukimuisi '{a'}si' // sukimąsi 'usi' // sukimusi 'esi' // sukimesi 'uo' // mėnuo // II linksniuote (declension II) 'a' 'ia' // galva, vysnios 'os' 'ios' // galvos, vysnios 'oj' 'oje' 'ioje' // galvoje, vysnioje 'osna' 'iosna' // galvosna, vyšniosna 'om' 'oms' 'ioms' // galvoms, vysnioms 'omis' 'iomis' // galvomis, vysniomis 'ose' 'iose' // galvose, vysniose 'on' 'ion' // galvon, vyšnion // III linksniuote (declension III) '{e.}' // gervė '{e.}s' // gervės 'ei' // gervei '{e'}' // gervę '{e.}j' '{e.}je' // gervėj, gervėje '{e.}ms' // gervėms 'es' // gerves '{e.}mis' // gervėmis '{e.}se' // gervėse '{e.}sna' // gervėsna '{e.}n' // 
žydaitėn // IV linksniuote (declension IV) 'aus' 'iaus' // sūnaus, skaičiaus 'umi' 'iumi' // sūnumi, skaičiumi 'uje' 'iuje' // sūnuje, skaičiuje 'iau' // skaičiau '{u-}s' // sūnūs 'ums' // sūnums 'umis' // sūnumis 'un' 'iun' // sūnun, administratoriun // V linksniuote (declension V) 'ies' 'ens' 'enio' 'ers' // avies, vandens, sesers 'eniui' 'eriai' // vandeniui, eriai 'en{i'}' 'er{i'}' // vandenį, seserį 'imi' 'eniu' 'erimi' 'eria' // avimi, vandeniu, seserimi, seseria 'enyje' 'eryje' // vandenyje, seseryje 'ie' 'enie' 'erie' // avie, vandenie, seserie 'enys' 'erys' // vandenys, seserys // 'en{u'}' konfliktas su 'žandenų' 'antenų' 'er{u'}' // seserų 'ims' 'enims' 'erims' // avims, vandemins, seserims 'enis' // vandenis 'imis' // žebenkštimis 'enimis' // vandenimis 'yse' 'enyse' 'eryse' // avyse, vandenyse, seseryse // Būdvardžiai (Adjectives) // (i)a linksniuotė 'iem' 'iems' // geriem, geriems 'ame' 'iame' // naujame, mediniame // Veiksmažodžiai (Verbs) // Tiesioginė nuosaka (indicative mood) // esamasis laikas (present tense) // (i)a asmenuotė (declension (i)a) 'uosi' 'iuosi' // dirbuosi, traukiuosi 'iesi' // dirbiesi 'asi' 'iasi' // dirbasi, traukiasi 'am{e.}s' 'iam{e.}s' // dirbamės, traukiamės 'at' 'ate' 'iat' 'iate' // dirbat, dirbate, ariat, traukiate 'at{e.}s' 'iat{e.}s' // dirbatės, traukiatės // i asmenuotė (declension i) 'isi' // tikisi 'im' // mylim // 'ime' konfliktassu daiktavardžiu vietininku, pvz. 'gėrime' 'im{e.}s' // tikimės 'it' 'ite' // mylit, mylite, tikitės // 'it{e.}s' konfliktas su priesaga ir dgs. vardininko galūne -ait-ės pvz. 
žydaitės // o asmenuotė (declension o) 'ome' // mokome 'ot' 'ote' // mokot, mokote // būtasis laikas // o asmenuotė (declension o) '{e.}jo' '{e.}josi' // tikėjo, tikėjosi 'ot{e.}s' // tikėjotės/bijotės // ė asmenuotė (declension ė) 'eisi' // mokeisi '{e.}si' // mokėsi '{e.}m' '{e.}me' // mokėm, mokėme '{e.}m{e.}s' // mokėmės '{e.}t' '{e.}te' // mokėt, mokėte '{e.}t{e.}s' // mokėtės // būtasis dažninis laikas (frequentative past tense) 'ausi' // mokydavausi 'om{e.}s' // mokydavomės/bijomės // būsimasis laikas (future tense) 'siu' 'siuosi' // dirbsiu, mokysiuosi 'si' 'siesi' // dirbsi, dirbsiesi 's' 'ysis' // dirbs, mokysis 'sim' 'sime' // dirbsim, dirbsime 'sit' 'site' // gersit, gersite // tariamoji nuosaka (subjunctive mood) '{c*}iau' '{c*}iausi' // dirbčiau 'tum' 'tumei' // dirbtum, dirbtumei 'tumeis' 'tumeisi' // mokytumeis, mokytumeisi // 't{u'}' nes blogai batutų -> batų 't{u'}si' // mokytųsi // 'tume' konfliktas su 'šventume' 'tum{e.}m' // dirbtumėm 'tum{e.}me' // dirbtumėme 'tum{e.}m{e.}s' // mokytumėmės 'tute' 'tum{e.}t' // dirbtute, dirbtumėt 'tum{e.}te' // dirbtumėte 'tum{e.}t{e.}s' // mokytumėtės // liepiamoji nuosaka (imperative mood) 'k' 'ki' // dirbk, dirbki, mokykis // 'kis' konfliktas viln-išk-is // 'kime' konfliktas, nes pirkime 'kim{e.}s' // mokykimės // bendratis (infinitive) 'uoti' 'iuoti' // meluoti, dygsniuoti 'auti' 'iauti' // draugauti, girtuokliauti 'oti' 'ioti' // dovanoti, meškerioti '{e.}ti' // auklėti 'yti' // akyti 'inti' // auginti 'in{e.}ti' // blusinėti 'enti' // gyventi 'tel{e.}ti' // bumbtelėti 'ter{e.}ti' // bumbterėti 'ti' // skalbti // 'tis' konfliktas, nes rytme-tis -> rytme // dalyviai (participles) '{a'}s' 'i{a'}s' '{i'}s' // dirbąs, žaidžiąs, gulįs 't{u'}s' // suktųs -> suk 'sim{e.}s' // suksimės 'sit{e.}s' // suksitės 'kite' // supkite ) delete ) define step2 as repeat ( setlimit tomark p1 for ([substring]) among ( // daiktavardziu priesagos (Noun suffixes) // budvardziu priesagos (Adjective suffixes) // 'in' // konfliktas 
su 'augintinis' ir 'akiniais' // lauk-in-is 'ing' // tvark-ing-as 'i{s*}k' // lenk-išk-as '{e.}t' // dem-ėt-as 'ot' // garban-ot-as 'uot' 'iuot' // lang-uot-as, akin-iuot-as // 'tin', nes augintinis // dirb-tin-is // 'ut', nes batutas, degutas etc. // maž-ut-is 'yt' // maž-yt-is 'iuk' // maž-iuk-as 'iul' // maž-ul-is '{e.}l' // maž-ėl-is 'yl' // maž-yl-is 'u{c*}iuk' // maž-učiuk-as 'uliuk' // maž-uliuk-as 'ut{e.}ait' // maž-utėlait-is 'ok' // did-ok-as 'iok' // višč-iok-as 'sv' '{s*}v' 'zgan' // sal-sv-as, pilk-šv-as, bal-zgan-as 'op' 'iop' // dvej-op-as, viener-iop-as 'ain' // apval-ain-as 'yk{s*}t' 'yk{s*}{c*}' // ten-ykšt-is, vakar-ykšč-ias // laisniai 'esn' // did-esn-is 'aus' 'iaus' // nauj-aus-ias, ger-iaus-ias // ivardziuotiniai budvardziai (Pronominal adjectives) // vyriska gimine (Male gender) 'ias' // žaliasis 'oj' 'ioj' // gerojo, žaliojo 'aj' 'iaj' // gerajam, žaliajam '{a'}j' 'i{a'}j' // garąjį, žaliąjį 'uoj' 'iuoj' // geruoju, žaliuoju 'iej' // gerieji '{u'}j' 'i{u'}j' // gerųjų, žaliųjų 'ies' // geriesiems 'uos' 'iuos' // geruosius, žaliuosius 'ais' 'iais' // geraisiais, žaliaisiais // moteriska gimine (Female gender) 'os' 'ios' // gerosios, žaliosios '{a'}s' 'i{a'}s' // gerąsios, žaliąsias // būtasis dažninis laikas (frequentative past tense) 'dav' // ei-dav-o // dalyvių priesagos (particple suffix) 'ant' 'iant' 'int' // tur-int-is '{e.}j' // tur-ėj-o '{e'}' // '{e.}j{e'}' '{e'}s' // dirb-ęs-is 'siant' // dirb-siant // pusdalyviai (participle) 'dam' // bėg-dam-as 'auj' // ūkinink-auj-a 'jam' 'iau' 'am' // baiminim-ams-i ) delete ) define fix_conflicts as ( [substring] among ( // 'lietuvaite' -> 'lietuvaitė', konfliktas su 'myl-ite' 'aite' (<-'ait{e.}') // 'lietuvaitės' -> 'lietuvaitė', konfliktas su 'myl-itės' 'ait{e.}s' (<-'ait{e.}') // ''ūs-uotės' -> 'ūs-uotė', konfliktas 'mokotės' 'uot{e.}s' (<-'uot{e.}') // ''ūs-uote' -> 'ūs-uotė', konfliktas 'mokote' 'uote' (<-'uot{e.}') // 'žerėjime' -> 'žėrėjimas', konfliktas su 'žais-ime' '{e.}jime' 
(<-'{e.}jimas') // 'žvilgesiu' -> 'žvilgesys', konfliktas su 'dirb-siu' 'esiu' (<-'esys') // 'duobkasiu' -> 'duobkasys', konfliktas su 'pakasiu' 'asius' (<-'asys') // 'žioravime' -> 'žioravimas', konfliktas su 'myl-ime' 'avime' (<-'avimas') 'ojime' (<-'ojimas') // 'advokatės' -> 'advokatė', konfliktas su 'dirb-atės' 'okat{e.}s' (<-'okat{e.}') // 'advokate' -> 'advokatė', konfliktas su 'dirb-ate' 'okate' (<-'okat{e.}') ) ) define fix_chdz as ( [substring] among ( '{c*}' (<-'t') 'd{z*}' (<-'d') ) ) define fix_gd as ( [substring] among ( 'gd' (<-'g') // '{e.}k' (<-'{e.}g') ) ) ) define stem as ( $p1 = limit do ( // priešdėlis 'a' ilgeniuose nei 6 raidės žodžiuose, pvz. 'a-liejus'. try (test 'a' $(len > 6) hop 1) gopast v gopast non-v setmark p1 ) backwards ( do fix_conflicts do step1 do fix_chdz do step2 do fix_chdz do fix_gd ) ) snowball-2.2.0/algorithms/lovins.sbl000066400000000000000000000200031414263061200175130ustar00rootroot00000000000000 stringescapes {} routines ( A B C D E F G H I J K L M N O P Q R S T U V W X Y Z AA BB CC endings undouble respell ) externals ( stem ) backwardmode ( /* Lovins' conditions A, B ... CC, as given in her Appendix B, where a test for a two letter prefix ('test hop 2') is implicitly assumed. Note that 'e' next 'u' corresponds to her u*e because Snowball is scanning backwards. 
*/

    /* Each condition is evaluated with the cursor sitting just in front of
       the candidate ending, looking backwards into the stem.  `test hop N`
       checks that the stem has at least N letters without moving the
       cursor. */

    /* A-D: only a minimum stem length of 2, 3, 4 and 5 letters. */
    define A as ( hop 2 )
    define B as ( hop 3 )
    define C as ( hop 4 )
    define D as ( hop 5 )

    /* E, F: stem of at least 2 (E) or 3 (F) letters not ending in 'e'. */
    define E as ( test hop 2  not 'e' )
    define F as ( test hop 3  not 'e' )

    /* G: stem of at least 3 letters ending in 'f'. */
    define G as ( test hop 3  'f' )

    /* H: stem ends in 't' or 'll'. */
    define H as ( test hop 2  among ( 'll' 't' ) )

    /* I: stem does not end in 'o' or 'e'. */
    define I as ( test hop 2  not ('o' or 'e') )

    /* J: stem does not end in 'a' or 'e'. */
    define J as ( test hop 2  not ('a' or 'e') )

    /* K: stem of at least 3 letters ending in 'l', 'i', or 'u' + any
       letter + 'e'. */
    define K as ( test hop 3  ('e' next 'u') or 'l' or 'i' )

    /* L: stem does not end in 'u' or 'x', nor in 's' unless that 's'
       follows an 'o'. */
    define L as ( test hop 2  not ('u' or 'x' or ('s' not 'o')) )

    /* M: stem does not end in 'a', 'c', 'e' or 'm'. */
    define M as ( test hop 2  not ('a' or 'c' or 'e' or 'm') )

    /* N: stem of at least 3 letters, or at least 4 when its
       third-from-last letter is 's'. */
    define N as ( test hop 3  ( hop 2  not 's' or hop 2 ) )

    /* O: stem ends in 'l' or 'i'. */
    define O as ( test hop 2  among ( 'i' 'l' ) )

    /* P: stem does not end in 'c'. */
    define P as ( test hop 2  not 'c' )

    /* Q: stem of at least 3 letters not ending in 'l' or 'n'. */
    define Q as ( test hop 2  test hop 3  not ('l' or 'n') )

    /* R: stem ends in 'n' or 'r'. */
    define R as ( test hop 2  among ( 'n' 'r' ) )

    /* S: stem ends in 'dr', or in 't' not preceded by another 't'. */
    define S as ( test hop 2  ('t' not 't') or 'dr' )

    /* T: stem ends in 's', or in 't' not preceded by 'o'. */
    define T as ( test hop 2  ('t' not 'o') or 's' )

    /* U: stem ends in 'l', 'm', 'n' or 'r'. */
    define U as ( test hop 2  among ( 'l' 'm' 'n' 'r' ) )

    /* V: stem ends in 'c'. */
    define V as ( test hop 2  'c' )

    /* W: stem does not end in 's' or 'u'. */
    define W as ( test hop 2  not ('s' or 'u') )

    /* X: as K, but with the ordinary 2-letter minimum. */
    define X as ( test hop 2  ('e' next 'u') or 'l' or 'i' )

    /* Y: stem ends in 'in'. */
    define Y as ( test hop 2  'in' )

    /* Z: stem does not end in 'f'. */
    define Z as ( test hop 2  not 'f' )

    /* AA: stem ends in one of the listed letter groups. */
    define AA as ( test hop 2  among ( 'd' 'er' 'es' 'f' 'l' 'or' 'ph' 't' 'th' ) )

    /* BB: stem of at least 3 letters not ending in 'met' or 'ryst'. */
    define BB as ( test hop 3  not ('met' or 'ryst') )

    /* CC: stem ends in 'l'. */
    define CC as ( test hop 2  'l' )

    /* The system of endings, as given in Appendix A.
*/ define endings as ( [substring] among( 'alistically' B 'arizability' A 'izationally' B 'antialness' A 'arisations' A 'arizations' A 'entialness' A 'allically' C 'antaneous' A 'antiality' A 'arisation' A 'arization' A 'ationally' B 'ativeness' A 'eableness' E 'entations' A 'entiality' A 'entialize' A 'entiation' A 'ionalness' A 'istically' A 'itousness' A 'izability' A 'izational' A 'ableness' A 'arizable' A 'entation' A 'entially' A 'eousness' A 'ibleness' A 'icalness' A 'ionalism' A 'ionality' A 'ionalize' A 'iousness' A 'izations' A 'lessness' A 'ability' A 'aically' A 'alistic' B 'alities' A 'ariness' E 'aristic' A 'arizing' A 'ateness' A 'atingly' A 'ational' B 'atively' A 'ativism' A 'elihood' E 'encible' A 'entally' A 'entials' A 'entiate' A 'entness' A 'fulness' A 'ibility' A 'icalism' A 'icalist' A 'icality' A 'icalize' A 'ication' G 'icianry' A 'ination' A 'ingness' A 'ionally' A 'isation' A 'ishness' A 'istical' A 'iteness' A 'iveness' A 'ivistic' A 'ivities' A 'ization' F 'izement' A 'oidally' A 'ousness' A 'aceous' A 'acious' B 'action' G 'alness' A 'ancial' A 'ancies' A 'ancing' B 'ariser' A 'arized' A 'arizer' A 'atable' A 'ations' B 'atives' A 'eature' Z 'efully' A 'encies' A 'encing' A 'ential' A 'enting' C 'entist' A 'eously' A 'ialist' A 'iality' A 'ialize' A 'ically' A 'icance' A 'icians' A 'icists' A 'ifully' A 'ionals' A 'ionate' D 'ioning' A 'ionist' A 'iously' A 'istics' A 'izable' E 'lessly' A 'nesses' A 'oidism' A 'acies' A 'acity' A 'aging' B 'aical' A 'alist' A 'alism' B 'ality' A 'alize' A 'allic'BB 'anced' B 'ances' B 'antic' C 'arial' A 'aries' A 'arily' A 'arity' B 'arize' A 'aroid' A 'ately' A 'ating' I 'ation' B 'ative' A 'ators' A 'atory' A 'ature' E 'early' Y 'ehood' A 'eless' A 'elity' A 'ement' A 'enced' A 'ences' A 'eness' E 'ening' E 'ental' A 'ented' C 'ently' A 'fully' A 'ially' A 'icant' A 'ician' A 'icide' A 'icism' A 'icist' A 'icity' A 'idine' I 'iedly' A 'ihood' A 'inate' A 'iness' A 'ingly' B 'inism' J 'inity'CC 
'ional' A 'ioned' A 'ished' A 'istic' A 'ities' A 'itous' A 'ively' A 'ivity' A 'izers' F 'izing' F 'oidal' A 'oides' A 'otide' A 'ously' A 'able' A 'ably' A 'ages' B 'ally' B 'ance' B 'ancy' B 'ants' B 'aric' A 'arly' K 'ated' I 'ates' A 'atic' B 'ator' A 'ealy' Y 'edly' E 'eful' A 'eity' A 'ence' A 'ency' A 'ened' E 'enly' E 'eous' A 'hood' A 'ials' A 'ians' A 'ible' A 'ibly' A 'ical' A 'ides' L 'iers' A 'iful' A 'ines' M 'ings' N 'ions' B 'ious' A 'isms' B 'ists' A 'itic' H 'ized' F 'izer' F 'less' A 'lily' A 'ness' A 'ogen' A 'ward' A 'wise' A 'ying' B 'yish' A 'acy' A 'age' B 'aic' A 'als'BB 'ant' B 'ars' O 'ary' F 'ata' A 'ate' A 'eal' Y 'ear' Y 'ely' E 'ene' E 'ent' C 'ery' E 'ese' A 'ful' A 'ial' A 'ian' A 'ics' A 'ide' L 'ied' A 'ier' A 'ies' P 'ily' A 'ine' M 'ing' N 'ion' Q 'ish' C 'ism' B 'ist' A 'ite'AA 'ity' A 'ium' A 'ive' A 'ize' F 'oid' A 'one' R 'ous' A 'ae' A 'al'BB 'ar' X 'as' B 'ed' E 'en' F 'es' E 'ia' A 'ic' A 'is' A 'ly' B 'on' S 'or' T 'um' U 'us' V 'yl' R '{'}s' A 's{'}' A 'a' A 'e' A 'i' A 'o' A 's' W 'y' B (delete) ) ) /* Undoubling is rule 1 of appendix C. */ define undouble as ( test substring among ('bb' 'dd' 'gg' 'll' 'mm' 'nn' 'pp' 'rr' 'ss' 'tt') [next] delete ) /* The other appendix C rules can be done together. */ define respell as ( [substring] among ( 'iev' (<-'ief') 'uct' (<-'uc') 'umpt' (<-'um') 'rpt' (<-'rb') 'urs' (<-'ur') 'istr' (<-'ister') 'metr' (<-'meter') 'olv' (<-'olut') 'ul' (not 'a' not 'i' not 'o' <-'l') 'bex' (<-'bic') 'dex' (<-'dic') 'pex' (<-'pic') 'tex' (<-'tic') 'ax' (<-'ac') 'ex' (<-'ec') 'ix' (<-'ic') 'lux' (<-'luc') 'uad' (<-'uas') 'vad' (<-'vas') 'cid' (<-'cis') 'lid' (<-'lis') 'erid' (<-'eris') 'pand' (<-'pans') 'end' (not 's' <-'ens') 'ond' (<-'ons') 'lud' (<-'lus') 'rud' (<-'rus') 'her' (not 'p' not 't' <-'hes') 'mit' (<-'mis') 'ent' (not 'm' <-'ens') /* 'ent' was 'end' in the 1968 paper - a typo. 
*/ 'ert' (<-'ers') 'et' (not 'n' <-'es') 'yt' (<-'ys') 'yz' (<-'ys') ) ) ) define stem as ( backwards ( do endings do undouble do respell ) ) snowball-2.2.0/algorithms/nepali.sbl000066400000000000000000000117311414263061200174610ustar00rootroot00000000000000/* * Authors: * - Ingroj Shrestha , Nepali NLP Group * - Oleg Bartunov , Postgres Professional Ltd. * - Shreeya Singh Dhakal, Nepali NLP Group */ routines ( remove_category_1 check_category_2 remove_category_2 remove_category_3 ) stringescapes {} stringdef dsc '{U+0901}' // DEVANAGARI_SIGN_CANDRABINDU stringdef dsa '{U+0902}' // DEVANAGARI_SIGN_ANUSVARA stringdef dli '{U+0907}' // DEVANAGARI_LETTER_I stringdef dlii '{U+0908}' // DEVANAGARI_LETTER_II stringdef dle '{U+090F}' // DEVANAGARI_LETTER_E stringdef dlka '{U+0915}' // DEVANAGARI_LETTER_KA stringdef dlkha '{U+0916}' // DEVANAGARI_LETTER_KHA stringdef dlg '{U+0917}' // DEVANAGARI_LETTER_GA stringdef dlc '{U+091B}' // DEVANAGARI_LETTER_CHA stringdef dlta '{U+0924}' // DEVANAGARI_LETTER_TA stringdef dltha '{U+0925}' // DEVANAGARI_LETTER_THA stringdef dld '{U+0926}' // DEVANAGARI_LETTER_DA stringdef dln '{U+0928}' // DEVANAGARI_LETTER_NA stringdef dlpa '{U+092A}' // DEVANAGARI_LETTER_PA stringdef dlpha '{U+092B}' // DEVANAGARI_LETTER_PHA stringdef dlb '{U+092D}' // DEVANAGARI_LETTER_BHA stringdef dlm '{U+092E}' // DEVANAGARI_LETTER_MA stringdef dly '{U+092F}' // DEVANAGARI_LETTER_YA stringdef dlr '{U+0930}' // DEVANAGARI_LETTER_RA stringdef dll '{U+0932}' // DEVANAGARI_LETTER_LA stringdef dlv '{U+0935}' // DEVANAGARI_LETTER_VA stringdef dls '{U+0938}' // DEVANAGARI_LETTER_SA stringdef dlh '{U+0939}' // DEVANAGARI_LETTER_HA stringdef dvsaa '{U+093E}' // DEVANAGARI_VOWEL_SIGN_AA stringdef dvsi '{U+093F}' // DEVANAGARI_VOWEL_SIGN_I stringdef dvsii '{U+0940}' // DEVANAGARI_VOWEL_SIGN_II stringdef dvsu '{U+0941}' // DEVANAGARI_VOWEL_SIGN_U stringdef dvsuu '{U+0942}' // DEVANAGARI_VOWEL_SIGN_UU stringdef dvse '{U+0947}' // DEVANAGARI_VOWEL_SIGN_E stringdef dvsai 
'{U+0948}' // DEVANAGARI_VOWEL_SIGN_AI stringdef dvso '{U+094B}' // DEVANAGARI_VOWEL_SIGN_O stringdef dvsau '{U+094C}' // DEVANAGARI_VOWEL_SIGN_AU stringdef dsv '{U+094D}' // DEVANAGARI_SIGN_VIRAMA externals ( stem ) backwardmode ( define remove_category_1 as( [substring] among ( '{dlm}{dvsaa}{dlr}{dsv}{dlpha}{dlta}' '{dld}{dsv}{dlv}{dvsaa}{dlr}{dvsaa}' '{dls}{dsc}{dlg}{dvsai}' '{dls}{dsa}{dlg}' '{dls}{dsc}{dlg}' '{dll}{dvsaa}{dli}' '{dll}{dvsaa}{dlii}' '{dlpa}{dlc}{dvsi}' '{dll}{dvse}' '{dlr}{dlta}' '{dlm}{dvsai}' '{dlm}{dvsaa}' (delete) '{dlka}{dvso}' '{dlka}{dvsaa}' '{dlka}{dvsi}' '{dlka}{dvsii}' '{dlka}{dvsai}'(('{dle}' or '{dvse}' ()) or delete) ) ) define check_category_2 as( [substring] among( '{dsc}' '{dsa}' '{dvsai}' ) ) define remove_category_2 as ( [substring] among( '{dsc}' '{dsa}' ('{dly}{dvsau}' or '{dlc}{dvsau}' or '{dln}{dvsau}' or '{dltha}{dvse}' delete) '{dvsai}' ('{dlta}{dsv}{dlr}' delete) ) ) define remove_category_3 as( [substring] among( '{dltha}{dvsi}{dli}{dls}{dsv}' '{dlh}{dvsu}{dln}{dvse}{dlc}' '{dlh}{dvsu}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dls}{dsv}' '{dln}{dvse}{dlc}{dln}{dsv}' '{dli}{dle}{dlka}{dvsii}' '{dli}{dle}{dlka}{dvsaa}' '{dli}{dle}{dlka}{dvso}' '{dvsi}{dle}{dlka}{dvsii}' '{dvsi}{dle}{dlka}{dvsaa}' '{dvsi}{dle}{dlka}{dvso}' '{dli}{dlc}{dln}{dsv}' '{dvsi}{dlc}{dln}{dsv}' '{dli}{dlc}{dls}{dsv}' '{dvsi}{dlc}{dls}{dsv}' '{dle}{dlc}{dln}{dsv}' '{dvse}{dlc}{dln}{dsv}' '{dle}{dlc}{dls}{dsv}' '{dvse}{dlc}{dls}{dsv}' '{dlc}{dvsi}{dln}{dsv}' '{dlc}{dvse}{dls}{dsv}' '{dlc}{dsv}{dly}{dvsau}' '{dltha}{dvsi}{dln}{dsv}' '{dltha}{dvsi}{dly}{dvso}' '{dltha}{dvsi}{dly}{dvsau}' '{dltha}{dvsi}{dls}{dsv}' '{dltha}{dsv}{dly}{dvso}' '{dltha}{dsv}{dly}{dvsau}' '{dld}{dvsi}{dly}{dvso}' '{dld}{dvse}{dlkha}{dvsi}' '{dld}{dvse}{dlkha}{dvsii}' '{dll}{dvsaa}{dln}{dsv}' '{dlm}{dvsaa}{dltha}{dvsi}' '{dln}{dvse}{dlka}{dvsai}' '{dln}{dvse}{dlka}{dvsaa}' '{dln}{dvse}{dlka}{dvso}' '{dln}{dvse}{dlc}{dvsau}' '{dlh}{dvso}{dls}{dsv}' '{dli}{dln}{dsv}{dlc}' 
'{dvsi}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dvsu}' '{dli}{dlc}{dvsau}' '{dvsi}{dlc}{dvsau}' '{dli}{dls}{dsv}' '{dvsi}{dls}{dsv}' '{dvsi}{dly}{dvso}' '{dli}{dly}{dvso}' '{dle}{dlka}{dvsaa}' '{dvse}{dlka}{dvsaa}' '{dle}{dlka}{dvsii}' '{dvse}{dlka}{dvsii}' '{dle}{dlka}{dvsai}' '{dvse}{dlka}{dvsai}' '{dle}{dlka}{dvso}' '{dvse}{dlka}{dvso}' '{dle}{dlc}{dvsu}' '{dvse}{dlc}{dvsu}' '{dle}{dlc}{dvsau}' '{dvse}{dlc}{dvsau}' '{dlc}{dln}{dsv}' '{dlc}{dls}{dsv}' '{dltha}{dvsi}{dle}' '{dlpa}{dlr}{dsv}' '{dlb}{dly}{dvso}' '{dlh}{dlr}{dvsu}' '{dlh}{dlr}{dvsuu}' '{dvsi}{dld}{dvsaa}' '{dli}{dld}{dvsaa}' '{dvsi}{dld}{dvso}' '{dli}{dld}{dvso}' '{dvsi}{dld}{dvsai}' '{dli}{dld}{dvsai}' '{dln}{dvse}{dlc}' '{dli}{dlc}' '{dvsi}{dlc}' '{dle}{dlc}' '{dvse}{dlc}' '{dlc}{dvsu}' '{dlc}{dvse}' '{dlc}{dvsau}' '{dltha}{dvsii}' '{dltha}{dvse}' '{dld}{dvsaa}' '{dld}{dvsii}' '{dld}{dvsai}' '{dld}{dvso}' '{dln}{dvsu}' '{dln}{dvse}' '{dly}{dvso}' '{dly}{dvsau}' '{dlc}' (delete) ) ) ) define stem as ( backwards ( do remove_category_1 do ( repeat (do (check_category_2 and remove_category_2) remove_category_3) ) ) ) snowball-2.2.0/algorithms/norwegian.sbl000066400000000000000000000027721414263061200202070ustar00rootroot00000000000000routines ( mark_regions main_suffix consonant_pair other_suffix ) externals ( stem ) integers ( p1 x ) groupings ( v s_ending ) stringescapes {} /* special characters */ stringdef ae '{U+00E6}' stringdef ao '{U+00E5}' stringdef o/ '{U+00F8}' define v 'aeiouy{ae}{ao}{o/}' define s_ending 'bcdfghjlmnoprtvyz' define mark_regions as ( $p1 = limit test ( hop 3 setmark x ) goto v gopast non-v setmark p1 try ( $p1 < x $p1 = x ) ) backwardmode ( define main_suffix as ( setlimit tomark p1 for ([substring]) among( 'a' 'e' 'ede' 'ande' 'ende' 'ane' 'ene' 'hetene' 'en' 'heten' 'ar' 'er' 'heter' 'as' 'es' 'edes' 'endes' 'enes' 'hetenes' 'ens' 'hetens' 'ers' 'ets' 'et' 'het' 'ast' (delete) 's' (s_ending or ('k' non-v) delete) 'erte' 'ert' (<-'er') ) ) define consonant_pair as ( test ( 
setlimit tomark p1 for ([substring]) among( 'dt' 'vt' ) ) next] delete ) define other_suffix as ( setlimit tomark p1 for ([substring]) among( 'leg' 'eleg' 'ig' 'eig' 'lig' 'elig' 'els' 'lov' 'elov' 'slov' 'hetslov' (delete) ) ) ) define stem as ( do mark_regions backwards ( do main_suffix do consonant_pair do other_suffix ) ) snowball-2.2.0/algorithms/porter.sbl000066400000000000000000000056441414263061200175320ustar00rootroot00000000000000integers ( p1 p2 ) booleans ( Y_found ) routines ( shortv R1 R2 Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5a Step_5b ) externals ( stem ) groupings ( v v_WXY ) define v 'aeiouy' define v_WXY v + 'wxY' backwardmode ( define shortv as ( non-v_WXY v non-v ) define R1 as $p1 <= cursor define R2 as $p2 <= cursor define Step_1a as ( [substring] among ( 'sses' (<-'ss') 'ies' (<-'i') 'ss' () 's' (delete) ) ) define Step_1b as ( [substring] among ( 'eed' (R1 <-'ee') 'ed' 'ing' ( test gopast v delete test substring among( 'at' 'bl' 'iz' (<+ 'e') 'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt' // ignoring double c, h, j, k, q, v, w, and x ([next] delete) '' (atmark p1 test shortv <+ 'e') ) ) ) ) define Step_1c as ( ['y' or 'Y'] gopast v <-'i' ) define Step_2 as ( [substring] R1 among ( 'tional' (<-'tion') 'enci' (<-'ence') 'anci' (<-'ance') 'abli' (<-'able') 'entli' (<-'ent') 'eli' (<-'e') 'izer' 'ization' (<-'ize') 'ational' 'ation' 'ator' (<-'ate') 'alli' (<-'al') 'alism' 'aliti' (<-'al') 'fulness' (<-'ful') 'ousli' 'ousness' (<-'ous') 'iveness' 'iviti' (<-'ive') 'biliti' (<-'ble') ) ) define Step_3 as ( [substring] R1 among ( 'alize' (<-'al') 'icate' 'iciti' 'ical' (<-'ic') 'ative' 'ful' 'ness' (delete) ) ) define Step_4 as ( [substring] R2 among ( 'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement' 'ment' 'ent' 'ou' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize' (delete) 'ion' ('s' or 't' delete) ) ) define Step_5a as ( ['e'] R2 or (R1 not shortv) delete ) define Step_5b as ( ['l'] R2 'l' delete ) ) define stem as ( unset Y_found do ( 
['y'] <-'Y' set Y_found) do repeat(goto (v ['y']) <-'Y' set Y_found) $p1 = limit $p2 = limit do( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) backwards ( do Step_1a do Step_1b do Step_1c do Step_2 do Step_3 do Step_4 do Step_5a do Step_5b ) do(Y_found repeat(goto (['Y']) <-'y')) ) snowball-2.2.0/algorithms/portuguese.sbl000066400000000000000000000127521414263061200204170ustar00rootroot00000000000000routines ( prelude postlude mark_regions RV R1 R2 standard_suffix verb_suffix residual_suffix residual_form ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v ) stringescapes {} /* special characters */ stringdef a' '{U+00E1}' // a-acute stringdef a^ '{U+00E2}' // a-circumflex e.g. 'bota^nico stringdef e' '{U+00E9}' // e-acute stringdef e^ '{U+00EA}' // e-circumflex stringdef i' '{U+00ED}' // i-acute stringdef o^ '{U+00F4}' // o-circumflex stringdef o' '{U+00F3}' // o-acute stringdef u' '{U+00FA}' // u-acute stringdef c, '{U+00E7}' // c-cedilla stringdef a~ '{U+00E3}' // a-tilde stringdef o~ '{U+00F5}' // o-tilde define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}' define prelude as repeat ( [substring] among( '{a~}' (<- 'a~') '{o~}' (<- 'o~') '' (next) ) //or next ) define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v (non-v gopast v) or (v gopast non-v) ) or ( non-v (non-v gopast v) or (v next) ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define postlude as repeat ( [substring] among( 'a~' (<- '{a~}') 'o~' (<- '{o~}') '' (next) ) //or next ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define standard_suffix as ( [substring] among( 'eza' 'ezas' 'ico' 'ica' 'icos' 'icas' 'ismo' 'ismos' '{a'}vel' '{i'}vel' 'ista' 'istas' 'oso' 'osa' 'osos' 'osas' 'amento' 'amentos' 'imento' 'imentos' 'adora' 'ador' 'a{c,}a~o' 'adoras' 'adores' 'a{c,}o~es' // no -ic test 'ante' 'antes' '{a^}ncia' // Note 1 ( R2 delete ) 'logia' 'logias' ( R2 
<- 'log' ) 'u{c,}a~o' 'u{c,}o~es' ( R2 <- 'u' ) '{e^}ncia' '{e^}ncias' ( R2 <- 'ente' ) 'amente' ( R1 delete try ( [substring] R2 delete among( 'iv' (['at'] R2 delete) 'os' 'ic' 'ad' ) ) ) 'mente' ( R2 delete try ( [substring] among( 'ante' // Note 1 'avel' '{i'}vel' (R2 delete) ) ) ) 'idade' 'idades' ( R2 delete try ( [substring] among( 'abil' 'ic' 'iv' (R2 delete) ) ) ) 'iva' 'ivo' 'ivas' 'ivos' ( R2 delete try ( ['at'] R2 delete // but not a further ['ic'] R2 delete ) ) 'ira' 'iras' ( RV 'e' // -eira -eiras usually non-verbal <- 'ir' ) ) ) define verb_suffix as setlimit tomark pV for ( [substring] among( 'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}' 'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 'este' 'iste' 'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam' 'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem' 'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o' 'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias' 'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras' 'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres' 'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is' 'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis' '{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis' '{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos' '{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos' 'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos' 'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos' '{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou' 'ira' 'iras' (delete) ) ) define residual_suffix as ( [substring] among( 'os' 'a' 'i' 'o' '{a'}' '{i'}' '{o'}' ( RV delete ) ) ) define residual_form as ( [substring] among( 'e' '{e'}' '{e^}' ( RV delete [('u'] test 'g') or ('i'] test 'c') RV delete ) '{c,}' (<-'c') ) ) ) define stem as ( do prelude do mark_regions backwards ( do ( ( ( standard_suffix or verb_suffix ) and do ( ['i'] test 'c' RV delete ) ) or residual_suffix ) do 
residual_form ) do postlude ) /* Note 1: additions of 15 Jun 2005 */ snowball-2.2.0/algorithms/romanian.sbl000066400000000000000000000141071414263061200200150ustar00rootroot00000000000000 routines ( prelude postlude mark_regions RV R1 R2 step_0 standard_suffix combo_suffix verb_suffix vowel_suffix ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v ) booleans ( standard_suffix_removed ) stringescapes {} /* special characters */ stringdef a^ '{U+00E2}' // a circumflex stringdef i^ '{U+00EE}' // i circumflex stringdef a+ '{U+0103}' // a breve stringdef s, '{U+015F}' // s cedilla stringdef t, '{U+0163}' // t cedilla define v 'aeiou{a^}{i^}{a+}' define prelude as ( repeat goto ( v [ ('u' ] v <- 'U') or ('i' ] v <- 'I') ) ) define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v (non-v gopast v) or (v gopast non-v) ) or ( non-v (non-v gopast v) or (v next) ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define postlude as repeat ( [substring] among( 'I' (<- 'i') 'U' (<- 'u') '' (next) ) ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define step_0 as ( [substring] R1 among( 'ul' 'ului' ( delete ) 'aua' ( <-'a' ) 'ea' 'ele' 'elor' ( <-'e' ) 'ii' 'iua' 'iei' 'iile' 'iilor' 'ilor' ( <-'i') 'ile' ( not 'ab' <- 'i' ) 'atei' ( <- 'at' ) 'a{t,}ie' 'a{t,}ia' ( <- 'a{t,}i' ) ) ) define combo_suffix as test ( [substring] R1 ( among( /* 'IST'. 
alternative: include the following 'alism' 'alisme' 'alist' 'alista' 'aliste' 'alisti' 'alist{a+}' 'ali{s,}ti' ( <- 'al' ) */ 'abilitate' 'abilitati' 'abilit{a+}i' 'abilit{a+}{t,}i' ( <- 'abil' ) 'ibilitate' ( <- 'ibil' ) 'ivitate' 'ivitati' 'ivit{a+}i' 'ivit{a+}{t,}i' ( <- 'iv' ) 'icitate' 'icitati' 'icit{a+}i' 'icit{a+}{t,}i' 'icator' 'icatori' 'iciv' 'iciva' 'icive' 'icivi' 'iciv{a+}' 'ical' 'icala' 'icale' 'icali' 'ical{a+}' ( <- 'ic' ) 'ativ' 'ativa' 'ative' 'ativi' 'ativ{a+}' 'a{t,}iune' 'atoare' 'ator' 'atori' '{a+}toare' '{a+}tor' '{a+}tori' ( <- 'at' ) 'itiv' 'itiva' 'itive' 'itivi' 'itiv{a+}' 'i{t,}iune' 'itoare' 'itor' 'itori' ( <- 'it' ) ) set standard_suffix_removed ) ) define standard_suffix as ( unset standard_suffix_removed repeat combo_suffix [substring] R2 ( among( // past participle is treated here, rather than // as a verb ending: 'at' 'ata' 'at{a+}' 'ati' 'ate' 'ut' 'uta' 'ut{a+}' 'uti' 'ute' 'it' 'ita' 'it{a+}' 'iti' 'ite' 'ic' 'ica' 'ice' 'ici' 'ic{a+}' 'abil' 'abila' 'abile' 'abili' 'abil{a+}' 'ibil' 'ibila' 'ibile' 'ibili' 'ibil{a+}' 'oasa' 'oas{a+}' 'oase' 'os' 'osi' 'o{s,}i' 'ant' 'anta' 'ante' 'anti' 'ant{a+}' 'ator' 'atori' 'itate' 'itati' 'it{a+}i' 'it{a+}{t,}i' 'iv' 'iva' 'ive' 'ivi' 'iv{a+}' ( delete ) 'iune' 'iuni' ( '{t,}'] <- 't' ) 'ism' 'isme' 'ist' 'ista' 'iste' 'isti' 'ist{a+}' 'i{s,}ti' ( <- 'ist' /* 'IST'. 
alternative: remove with <- '' */ ) ) set standard_suffix_removed ) ) define verb_suffix as setlimit tomark pV for ( [substring] among( // 'long' infinitive: 'are' 'ere' 'ire' '{a^}re' // gerund: 'ind' '{a^}nd' 'indu' '{a^}ndu' 'eze' 'easc{a+}' // present: 'ez' 'ezi' 'eaz{a+}' 'esc' 'e{s,}ti' 'e{s,}te' '{a+}sc' '{a+}{s,}ti' '{a+}{s,}te' // imperfect: 'am' 'ai' 'au' 'eam' 'eai' 'ea' 'ea{t,}i' 'eau' 'iam' 'iai' 'ia' 'ia{t,}i' 'iau' // past: // (not 'ii') 'ui' 'a{s,}i' 'ar{a+}m' 'ar{a+}{t,}i' 'ar{a+}' 'u{s,}i' 'ur{a+}m' 'ur{a+}{t,}i' 'ur{a+}' 'i{s,}i' 'ir{a+}m' 'ir{a+}{t,}i' 'ir{a+}' '{a^}i' '{a^}{s,}i' '{a^}r{a+}m' '{a^}r{a+}{t,}i' '{a^}r{a+}' // pluferfect: 'asem' 'ase{s,}i' 'ase' 'aser{a+}m' 'aser{a+}{t,}i' 'aser{a+}' 'isem' 'ise{s,}i' 'ise' 'iser{a+}m' 'iser{a+}{t,}i' 'iser{a+}' '{a^}sem' '{a^}se{s,}i' '{a^}se' '{a^}ser{a+}m' '{a^}ser{a+}{t,}i' '{a^}ser{a+}' 'usem' 'use{s,}i' 'use' 'user{a+}m' 'user{a+}{t,}i' 'user{a+}' ( non-v or 'u' delete ) // present: '{a+}m' 'a{t,}i' 'em' 'e{t,}i' 'im' 'i{t,}i' '{a^}m' '{a^}{t,}i' // past: 'se{s,}i' 'ser{a+}m' 'ser{a+}{t,}i' 'ser{a+}' 'sei' 'se' // pluperfect: 'sesem' 'sese{s,}i' 'sese' 'seser{a+}m' 'seser{a+}{t,}i' 'seser{a+}' (delete) ) ) define vowel_suffix as ( [substring] RV among ( 'a' 'e' 'i' 'ie' '{a+}' ( delete ) ) ) ) define stem as ( do prelude do mark_regions backwards ( do step_0 do standard_suffix do ( standard_suffix_removed or verb_suffix ) do vowel_suffix ) do postlude ) snowball-2.2.0/algorithms/russian.sbl000066400000000000000000000143451414263061200177010ustar00rootroot00000000000000stringescapes {} /* the 33 Cyrillic letters represented in ASCII characters following the * conventions of the standard Library of Congress transliteration: */ stringdef a '{U+0430}' stringdef b '{U+0431}' stringdef v '{U+0432}' stringdef g '{U+0433}' stringdef d '{U+0434}' stringdef e '{U+0435}' stringdef e" '{U+0451}' stringdef zh '{U+0436}' stringdef z '{U+0437}' stringdef i '{U+0438}' stringdef i` '{U+0439}' stringdef k 
'{U+043A}' stringdef l '{U+043B}' stringdef m '{U+043C}' stringdef n '{U+043D}' stringdef o '{U+043E}' stringdef p '{U+043F}' stringdef r '{U+0440}' stringdef s '{U+0441}' stringdef t '{U+0442}' stringdef u '{U+0443}' stringdef f '{U+0444}' stringdef kh '{U+0445}' stringdef ts '{U+0446}' stringdef ch '{U+0447}' stringdef sh '{U+0448}' stringdef shch '{U+0449}' stringdef " '{U+044A}' stringdef y '{U+044B}' stringdef ' '{U+044C}' stringdef e` '{U+044D}' stringdef iu '{U+044E}' stringdef ia '{U+044F}' routines ( mark_regions R2 perfective_gerund adjective adjectival reflexive verb noun derivational tidy_up ) externals ( stem ) integers ( pV p2 ) groupings ( v ) define v '{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}' define mark_regions as ( $pV = limit $p2 = limit do ( gopast v setmark pV gopast non-v gopast v gopast non-v setmark p2 ) ) backwardmode ( define R2 as $p2 <= cursor define perfective_gerund as ( [substring] among ( '{v}' '{v}{sh}{i}' '{v}{sh}{i}{s}{'}' ('{a}' or '{ia}' delete) '{i}{v}' '{i}{v}{sh}{i}' '{i}{v}{sh}{i}{s}{'}' '{y}{v}' '{y}{v}{sh}{i}' '{y}{v}{sh}{i}{s}{'}' (delete) ) ) define adjective as ( [substring] among ( '{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}' '{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{i`}' '{e}{m}' '{i}{m}' '{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}' '{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}' '{ia}{ia}' // and - '{o}{iu}' // - which is somewhat archaic '{e}{iu}' // - soft form of {o}{iu} (delete) ) ) define adjectival as ( adjective /* of the participle forms, em, vsh, ivsh, yvsh are readily removable. nn, {iu}shch, shch, u{iu}shch can be removed, with a small proportion of errors. Removing im, uem, enn creates too many errors. 
*/ try ( [substring] among ( '{e}{m}' // present passive participle '{n}{n}' // adjective from past passive participle '{v}{sh}' // past active participle '{iu}{shch}' '{shch}' // present active participle ('{a}' or '{ia}' delete) //but not '{i}{m}' '{u}{e}{m}' // present passive participle //or '{e}{n}{n}' // adjective from past passive participle '{i}{v}{sh}' '{y}{v}{sh}'// past active participle '{u}{iu}{shch}' // present active participle (delete) ) ) ) define reflexive as ( [substring] among ( '{s}{ia}' '{s}{'}' (delete) ) ) define verb as ( [substring] among ( '{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}' '{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}' '{n}{y}' '{t}{'}' '{e}{sh}{'}' '{n}{n}{o}' ('{a}' or '{ia}' delete) '{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}' '{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}' '{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}' '{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}' '{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}' '{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}' (delete) /* note the short passive participle tests: '{n}{a}' '{n}' '{n}{o}' '{n}{y}' '{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}' */ ) ) define noun as ( [substring] among ( '{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}' '{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}' '{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}' '{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}' '{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}' '{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}' (delete) /* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}' '{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' '{e}{n}{a}{x}' omitted - they only occur on 12 words. 
*/ ) ) define derivational as ( [substring] R2 among ( '{o}{s}{t}' '{o}{s}{t}{'}' (delete) ) ) define tidy_up as ( [substring] among ( '{e}{i`}{sh}' '{e}{i`}{sh}{e}' // superlative forms (delete ['{n}'] '{n}' delete ) '{n}' ('{n}' delete) // e.g. -nno endings '{'}' (delete) // with some slight false conflations ) ) ) define stem as ( // Normalise {e"} to {e}. The documentation has long suggested the user // should do this before calling the stemmer - we now do it for them. do repeat ( goto (['{e"}']) <- '{e}' ) do mark_regions backwards setlimit tomark pV for ( do ( perfective_gerund or ( try reflexive adjectival or verb or noun ) ) try([ '{i}' ] delete) // because noun ending -i{iu} is being treated as verb ending -{iu} do derivational do tidy_up ) ) snowball-2.2.0/algorithms/serbian.sbl000066400000000000000000001531171414263061200176410ustar00rootroot00000000000000/* Stemmer for Serbian language, based on: * * Ljubesic, Nikola. Pandzic, Ivan. Stemmer for Croatian * http://nlp.ffzg.hr/resources/tools/stemmer-for-croatian/ * * authors: Stefan Petkovic and Dragan Ivanovic * emails: petkovic8 at gmail.com and dragan.ivanovic at uns.ac.rs * version: 1.0 (20.04.2019) */ routines ( cyr_to_lat prelude mark_regions R1 Step_1 Step_2 Step_3 ) externals ( stem ) booleans ( no_diacritics ) integers ( p1 ) groupings ( v ca sa rg ) stringescapes {} /* special characters - Unicode codepoints */ /* serbian cyrillic */ stringdef cyrA '{U+0430}' stringdef cyrB '{U+0431}' stringdef cyrV '{U+0432}' stringdef cyrG '{U+0433}' stringdef cyrD '{U+0434}' stringdef cyrDx '{U+0452}' stringdef cyrE '{U+0435}' stringdef cyrZh '{U+0436}' stringdef cyrZ '{U+0437}' stringdef cyrI '{U+0438}' stringdef cyrJ '{U+0458}' stringdef cyrK '{U+043A}' stringdef cyrL '{U+043B}' stringdef cyrLJ '{U+0459}' stringdef cyrM '{U+043C}' stringdef cyrN '{U+043D}' stringdef cyrNJ '{U+045A}' stringdef cyrO '{U+043E}' stringdef cyrP '{U+043F}' stringdef cyrR '{U+0440}' stringdef cyrS '{U+0441}' stringdef cyrT 
'{U+0442}' stringdef cyrCy '{U+045B}' stringdef cyrU '{U+0443}' stringdef cyrF '{U+0444}' stringdef cyrH '{U+0445}' stringdef cyrC '{U+0446}' stringdef cyrCx '{U+0447}' stringdef cyrDzx '{U+045F}' stringdef cyrSx '{U+0448}' /* serbian latin with diacritics */ stringdef cx '{U+010D}' // small c with caron stringdef cy '{U+0107}' // small c with acute stringdef zx '{U+017E}' // small z with caron stringdef sx '{U+0161}' // small s with caron stringdef dx '{U+0111}' // small d with stroke define v 'aeiou' define sa '{cx}{cy}{zx}{sx}{dx}' define ca 'bvgdzjklmnprstfhc' + sa define rg 'r' define cyr_to_lat as ( do repeat goto ( [substring] among ( '{cyrA}' (<- 'a') '{cyrB}' (<- 'b') '{cyrV}' (<- 'v') '{cyrG}' (<- 'g') '{cyrD}' (<- 'd') '{cyrDx}' (<- '{dx}') '{cyrE}' (<- 'e') '{cyrZh}' (<- '{zx}') '{cyrZ}' (<- 'z') '{cyrI}' (<- 'i') '{cyrJ}' (<- 'j') '{cyrK}' (<- 'k') '{cyrL}' (<- 'l') '{cyrLJ}' (<- 'lj') '{cyrM}' (<- 'm') '{cyrN}' (<- 'n') '{cyrNJ}' (<- 'nj') '{cyrO}' (<- 'o') '{cyrP}' (<- 'p') '{cyrR}' (<- 'r') '{cyrS}' (<- 's') '{cyrT}' (<- 't') '{cyrCy}' (<- '{cy}') '{cyrU}' (<- 'u') '{cyrF}' (<- 'f') '{cyrH}' (<- 'h') '{cyrC}' (<- 'c') '{cyrCx}' (<- '{cx}') '{cyrDzx}' (<- 'd{zx}') '{cyrSx}' (<- '{sx}') ) ) ) define prelude as ( do repeat goto ( ca ['ije'] ca <- 'e' ) do repeat goto ( ca ['je'] ca <- 'e' ) do repeat goto ( ['dj'] <- '{dx}' ) ) define mark_regions as ( set no_diacritics do ( gopast sa unset no_diacritics ) $p1 = limit do ( gopast v setmark p1 ($p1 < 2) ( gopast non-v setmark p1 ) ) do ( gopast 'r' $(cursor >= 2) or (gopast non-rg) $(p1 - cursor > 1) setmark p1 ) ) backwardmode ( define R1 as $p1 <= cursor define Step_1 as ( [substring] among ( 'lozi' 'lozima' (<-'loga') 'pesi' 'pesima' (<-'peh') 'vojci' (<-'vojka') 'bojci' (<-'bojka') 'jaci' 'jacima' (<-'jak') '{cx}ajan' (<-'{cx}ajni') 'cajan' (no_diacritics <-'cajni') 'eran' (<-'erni') 'laran' (<-'larni') 'esan' (<-'esni') 'anjac' (<-'anjca') 'ajac' 'ajaca' (<-'ajca') 'ljaca' 'ljac' (<-'ljca') 'ejac' 
'ejaca' (<-'ejca') 'ojac' 'ojaca' (<-'ojca') 'ajaka' (<-'ajka') 'ojaka' (<-'ojka') '{sx}aca' '{sx}ac' (<-'{sx}ca') 'inzima' 'inzi' (<-'ing') 'tvenici' (<-'tvenik') 'tetici' 'teticima' (<-'tetika') 'nstava' (<-'nstva') 'nicima' (<-'nik') 'ticima' (<-'tik') 'zicima' (<-'zik') 'snici' (<-'snik') 'kuse' (<-'kusi') 'kusan' (<-'kusni') 'kustava' (<-'kustva') 'du{sx}an' (<-'du{sx}ni') 'dusan' (no_diacritics <-'dusni') 'antan' (<-'antni') 'bilan' (<-'bilni') 'tilan' (<-'tilni') 'avilan' (<-'avilni') 'silan' (<-'silni') 'gilan' (<-'gilni') 'rilan' (<-'rilni') 'nilan' (<-'nilni') 'alan' (<-'alni') 'ozan' (<-'ozni') 'rave' (<-'ravi') 'stavan' (<-'stavni') 'pravan' (<-'pravni') 'tivan' (<-'tivni') 'sivan' (<-'sivni') 'atan' (<-'atni') 'enat' (<-'enta') 'tetan' (<-'tetni') 'pletan' (<-'pletni') '{sx}ave' (<-'{sx}avi') 'save' (no_diacritics <-'savi') 'anata' (<-'anta') 'a{cx}ak' 'a{cx}aka' (<-'a{cx}ka') 'acak' 'acaka' (no_diacritics <-'acka') 'u{sx}ak' (<-'u{sx}ka') 'usak' (no_diacritics <-'uska') 'atak' 'ataka' 'atci' 'atcima' (<-'atka') 'etak' 'etaka' (<-'etka') 'itak' 'itaka' 'itci' (<-'itka') 'otak' 'otaka' (<-'otka') 'utak' 'utaka' 'utci' 'utcima' (<-'utka') 'eskan' (<-'eskna') 'ti{cx}an' (<-'ti{cx}ni') 'tican' (no_diacritics <-'ticni') 'ojsci' (<-'ojska') 'esama' (<-'esma') 'metar' 'metara' (<-'metra') 'centar' 'centara' (<-'centra') 'istar' 'istara' (<-'istra') 'o{sx}{cy}u' (<-'osti') 'oscu' (no_diacritics <-'osti') 'daba' (<-'dba') '{cx}cima' '{cx}ci' (<-'{cx}ka') 'mac' 'maca' (<-'mca') 'naca' 'nac' (<-'nca') 'voljan' (<-'voljni') 'anaka' (<-'anki') 'vac' 'vaca' (<-'vca') 'saca' 'sac' (<-'sca') 'raca' 'rac' (<-'rca') 'aoca' 'alaca' 'alac' (<-'alca') 'elaca' 'elac' (<-'elca') 'olaca' 'olac' 'olce' (<-'olca') 'njac' 'njaca' (<-'njca') 'ekata' 'ekat' (<-'ekta') 'izam' 'izama' (<-'izma') 'jebe' (<-'jebi') 'baci' (<-'baci') 'a{sx}an' (<-'a{sx}ni') 'asan' (no_diacritics <-'asni') ) ) define Step_2 as ( [substring] R1 among ( 'skijima' 'skijega' 'skijemu' 'skijem' 'skega' 
'skemu' 'skem' 'skijim' 'skijih' 'skijoj' 'skijeg' 'skiji' 'skije' 'skija' 'skoga' 'skome' 'skomu' 'skima' 'skog' 'skom' 'skim' 'skih' 'skoj' 'ski' 'ske' 'sko' 'ska' 'sku' (<-'sk') '{sx}kijima' '{sx}kijega' '{sx}kijemu' '{sx}kijem' '{sx}kega' '{sx}kemu' '{sx}kem' '{sx}kijim' '{sx}kijih' '{sx}kijoj' '{sx}kijeg' '{sx}kiji' '{sx}kije' '{sx}kija' '{sx}koga' '{sx}kome' '{sx}komu' '{sx}kima' '{sx}kog' '{sx}kom' '{sx}kim' '{sx}kih' '{sx}koj' '{sx}ki' '{sx}ke' '{sx}ko' '{sx}ka' '{sx}ku' (<-'{sx}k') 'stvima' 'stvom' 'stvo' 'stva' 'stvu' (<-'stv') '{sx}tvima' '{sx}tvom' '{sx}tvo' '{sx}tva' '{sx}tvu' (<-'{sx}tv') 'tanijama' 'tanijima' 'tanijom' 'tanija' 'taniju' 'tanije' 'taniji' (<-'tanij') 'manijama' 'manijima' 'manijom' 'manija' 'maniju' 'manije' 'maniji' (<-'manij') 'panijama' 'panijima' 'panijom' 'panija' 'paniju' 'panije' 'paniji' (<-'panij') 'ranijama' 'ranijima' 'ranijom' 'ranija' 'raniju' 'ranije' 'raniji' (<-'ranij') 'ganijama' 'ganijima' 'ganijom' 'ganija' 'ganiju' 'ganije' 'ganiji' (<-'ganij') 'aninom' 'anina' 'aninu' 'anine' 'anima' 'anin' 'anom' 'anu' 'ani' 'ana' 'ane' (<-'an') 'inima' 'inama' 'inom' 'ina' 'ine' 'ini' 'inu' 'ino' (<-'in') 'onovima' 'onova' 'onove' 'onovi' 'onima' 'onom' 'ona' 'one' 'oni' 'onu' (<-'on') 'nijima' 'nijega' 'nijemu' 'nijeg' 'nijem' 'nega' 'nemu' 'neg' 'nem' 'nijim' 'nijih' 'nijoj' 'niji' 'nije' 'nija' 'niju' 'nima' 'nome' 'nomu' 'noga' 'noj' 'nom' 'nih' 'nim' 'nog' 'no' 'ne' 'na' 'nu' 'ni' (<-'n') 'a{cy}oga' 'a{cy}ome' 'a{cy}omu' 'a{cy}ega' 'a{cy}emu' 'a{cy}ima' 'a{cy}oj' 'a{cy}ih' 'a{cy}om' 'a{cy}eg' 'a{cy}em' 'a{cy}og' 'a{cy}uh' 'a{cy}im' 'a{cy}e' 'a{cy}a' (<-'a{cy}') 'e{cy}oga' 'e{cy}ome' 'e{cy}omu' 'e{cy}ega' 'e{cy}emu' 'e{cy}ima' 'e{cy}oj' 'e{cy}ih' 'e{cy}om' 'e{cy}eg' 'e{cy}em' 'e{cy}og' 'e{cy}uh' 'e{cy}im' 'e{cy}e' 'e{cy}a' (<-'e{cy}') 'u{cy}oga' 'u{cy}ome' 'u{cy}omu' 'u{cy}ega' 'u{cy}emu' 'u{cy}ima' 'u{cy}oj' 'u{cy}ih' 'u{cy}om' 'u{cy}eg' 'u{cy}em' 'u{cy}og' 'u{cy}uh' 'u{cy}im' 'u{cy}e' 'u{cy}a' (<-'u{cy}') 'ugovima' 'ugovi' 
'ugove' 'ugova' (<-'ugov') 'ugama' 'ugom' 'uga' 'uge' 'ugi' 'ugu' 'ugo' (<-'ug') 'logama' 'logom' 'loga' 'logu' 'loge' (<-'log') 'govima' 'gama' 'govi' 'gove' 'gova' 'gom' 'ga' 'ge' 'gi' 'gu' 'go' (<-'g') 'rarijem' 'rarija' 'rariju' 'rario' (<-'rari') 'otijem' 'otija' 'otiju' 'otio' (<-'oti') 'sijem' 'sija' 'siju' 'sio' (<-'si') 'lijem' 'lija' 'liju' 'lio' (<-'li') 'uju{cy}i' 'ujemo' 'ujete' 'ujmo' 'ujem' 'uje{sx}' 'uje' 'uju' (<-'uj') 'cajevima' 'cajevi' 'cajeva' 'cajeve' 'cajama' 'cajima' 'cajem' 'caja' 'caje' 'caji' 'caju' (<-'caj') '{cx}ajevima' '{cx}ajevi' '{cx}ajeva' '{cx}ajeve' '{cx}ajama' '{cx}ajima' '{cx}ajem' '{cx}aja' '{cx}aje' '{cx}aji' '{cx}aju' (<-'{cx}aj') '{cy}ajevima' '{cy}ajevi' '{cy}ajeva' '{cy}ajeve' '{cy}ajama' '{cy}ajima' '{cy}ajem' '{cy}aja' '{cy}aje' '{cy}aji' '{cy}aju' (<-'{cy}aj') '{dx}ajevima' '{dx}ajevi' '{dx}ajeva' '{dx}ajeve' '{dx}ajama' '{dx}ajima' '{dx}ajem' '{dx}aja' '{dx}aje' '{dx}aji' '{dx}aju' (<-'{dx}aj') 'lajevima' 'lajevi' 'lajeva' 'lajeve' 'lajama' 'lajima' 'lajem' 'laja' 'laje' 'laji' 'laju' (<-'laj') 'rajevima' 'rajevi' 'rajeva' 'rajeve' 'rajama' 'rajima' 'rajem' 'raja' 'raje' 'raji' 'raju' (<-'raj') 'bijima' 'bijama' 'bijom' 'bija' 'bije' 'biji' 'biju' 'bijo' (<-'bij') 'cijima' 'cijama' 'cijom' 'cija' 'cije' 'ciji' 'ciju' 'cijo' (<-'cij') 'dijima' 'dijama' 'dijom' 'dija' 'dije' 'diji' 'diju' 'dijo' (<-'dij') 'lijima' 'lijama' 'lijom' 'lije' 'liji' 'lijo' (<-'lij') 'nijama' 'nijom' 'nijo' (<-'nij') 'mijima' 'mijama' 'mijom' 'mija' 'mije' 'miji' 'miju' 'mijo' (<-'mij') '{zx}ijima' '{zx}ijama' '{zx}ijom' '{zx}ija' '{zx}ije' '{zx}iji' '{zx}iju' '{zx}ijo' (<-'{zx}ij') 'gijima' 'gijama' 'gijom' 'gija' 'gije' 'giji' 'giju' 'gijo' (<-'gij') 'fijima' 'fijama' 'fijom' 'fija' 'fije' 'fiji' 'fiju' 'fijo' (<-'fij') 'pijima' 'pijama' 'pijom' 'pija' 'pije' 'piji' 'piju' 'pijo' (<-'pij') 'rijima' 'rijama' 'rijom' 'rija' 'rije' 'riji' 'riju' 'rijo' (<-'rij') 'sijima' 'sijama' 'sijom' 'sije' 'siji' 'sijo' (<-'sij') 'tijima' 'tijama' 'tijom' 
'tija' 'tije' 'tiji' 'tiju' 'tijo' (<-'tij') 'zijima' 'zijama' 'zijom' 'zija' 'zije' 'ziji' 'ziju' 'zijo' (<-'zij') 'nalima' 'nalama' 'nalom' 'nala' 'nale' 'nali' 'nalu' 'nalo' (<-'nal') 'ijalima' 'ijalama' 'ijalom' 'ijala' 'ijale' 'ijali' 'ijalu' 'ijalo' (<-'ijal') 'ozilima' 'ozilom' 'ozila' 'ozile' 'ozilu' 'ozili' (<-'ozil') 'olovima' 'olovi' 'olova' 'olove' (<-'olov') 'olima' 'olom' 'ola' 'olu' 'ole' 'oli' (<-'ol') 'lemama' 'lemima' 'lemom' 'lema' 'leme' 'lemi' 'lemu' 'lemo' (<-'lem') 'ramama' 'ramom' 'rama' 'rame' 'rami' 'ramu' 'ramo' (<-'ram') 'arama' 'arima' 'arom' 'aru' 'ara' 'are' 'ari' (<-'ar') 'drama' 'drima' 'drom' 'dru' 'dra' 'dre' 'dri' (<-'dr') 'erama' 'erima' 'erom' 'eru' 'era' 'ere' 'eri' (<-'er') 'orama' 'orima' 'orom' 'oru' 'ora' 'ore' 'ori' (<-'or') 'esima' 'esom' 'ese' 'esa' 'esu' (<-'es') 'isima' 'isom' 'ise' 'isa' 'isu' (<-'is') 'ta{sx}ama' 'ta{sx}ima' 'ta{sx}om' 'ta{sx}em' 'ta{sx}a' 'ta{sx}u' 'ta{sx}i' 'ta{sx}e' (<-'ta{sx}') 'na{sx}ama' 'na{sx}ima' 'na{sx}om' 'na{sx}em' 'na{sx}a' 'na{sx}u' 'na{sx}i' 'na{sx}e' (<-'na{sx}') 'ja{sx}ama' 'ja{sx}ima' 'ja{sx}om' 'ja{sx}em' 'ja{sx}a' 'ja{sx}u' 'ja{sx}i' 'ja{sx}e' (<-'ja{sx}') 'ka{sx}ama' 'ka{sx}ima' 'ka{sx}om' 'ka{sx}em' 'ka{sx}a' 'ka{sx}u' 'ka{sx}i' 'ka{sx}e' (<-'ka{sx}') 'ba{sx}ama' 'ba{sx}ima' 'ba{sx}om' 'ba{sx}em' 'ba{sx}a' 'ba{sx}u' 'ba{sx}i' 'ba{sx}e' (<-'ba{sx}') 'ga{sx}ama' 'ga{sx}ima' 'ga{sx}om' 'ga{sx}em' 'ga{sx}a' 'ga{sx}u' 'ga{sx}i' 'ga{sx}e' (<-'ga{sx}') 'va{sx}ama' 'va{sx}ima' 'va{sx}om' 'va{sx}em' 'va{sx}a' 'va{sx}u' 'va{sx}i' 'va{sx}e' (<-'va{sx}') 'e{sx}ima' 'e{sx}ama' 'e{sx}om' 'e{sx}em' 'e{sx}i' 'e{sx}e' 'e{sx}a' 'e{sx}u' (<-'e{sx}') 'i{sx}ima' 'i{sx}ama' 'i{sx}om' 'i{sx}em' 'i{sx}i' 'i{sx}e' 'i{sx}a' 'i{sx}u' (<-'i{sx}') 'ikatima' 'ikatom' 'ikata' 'ikate' 'ikati' 'ikatu' 'ikato' (<-'ikat') 'latima' 'latom' 'lata' 'late' 'lati' 'latu' 'lato' (<-'lat') 'etama' 'etima' 'etom' 'eta' 'ete' 'eti' 'etu' 'eto' (<-'et') 'estima' 'estama' 'estom' 'esta' 'este' 'esti' 'estu' 'esto' 
(<-'est') 'istima' 'istama' 'istom' 'ista' 'iste' 'isti' 'istu' 'isto' (<-'ist') 'kstima' 'kstama' 'kstom' 'ksta' 'kste' 'ksti' 'kstu' 'ksto' (<-'kst') 'ostima' 'ostama' 'ostom' 'osta' 'oste' 'osti' 'ostu' 'osto' (<-'ost') 'i{sx}tima' 'i{sx}tem' 'i{sx}ta' 'i{sx}te' 'i{sx}tu' (<-'i{sx}t') 'ovasmo' 'ovaste' 'ovahu' 'ovati' 'ova{sx}e' 'ovali' 'ovala' 'ovale' 'ovalo' 'ovat' 'ovah' 'ovao' (<-'ova') 'avijemu' 'avijima' 'avijega' 'avijeg' 'avijem' 'avemu' 'avega' 'aveg' 'avem' 'avijim' 'avijih' 'avijoj' 'avoga' 'avome' 'avomu' 'avima' 'avama' 'aviji' 'avije' 'avija' 'aviju' 'avim' 'avih' 'avoj' 'avom' 'avog' 'avi' 'ava' 'avu' 'ave' 'avo' (<-'av') 'evijemu' 'evijima' 'evijega' 'evijeg' 'evijem' 'evemu' 'evega' 'eveg' 'evem' 'evijim' 'evijih' 'evijoj' 'evoga' 'evome' 'evomu' 'evima' 'evama' 'eviji' 'evije' 'evija' 'eviju' 'evim' 'evih' 'evoj' 'evom' 'evog' 'evi' 'eva' 'evu' 'eve' 'evo' (<-'ev') 'ivijemu' 'ivijima' 'ivijega' 'ivijeg' 'ivijem' 'ivemu' 'ivega' 'iveg' 'ivem' 'ivijim' 'ivijih' 'ivijoj' 'ivoga' 'ivome' 'ivomu' 'ivima' 'ivama' 'iviji' 'ivije' 'ivija' 'iviju' 'ivim' 'ivih' 'ivoj' 'ivom' 'ivog' 'ivi' 'iva' 'ivu' 'ive' 'ivo' (<-'iv') 'ovijemu' 'ovijima' 'ovijega' 'ovijeg' 'ovijem' 'ovemu' 'ovega' 'oveg' 'ovijim' 'ovijih' 'ovijoj' 'ovoga' 'ovome' 'ovomu' 'ovima' 'oviji' 'ovije' 'ovija' 'oviju' 'ovim' 'ovih' 'ovoj' 'ovom' 'ovog' 'ovi' 'ova' 'ovu' 'ove' 'ovo' (<-'ov') 'movima' 'movom' 'mova' 'movu' 'move' 'movi' (<-'mov') 'lovima' 'lovom' 'lova' 'lovu' 'love' 'lovi' (<-'lov') 'elijemu' 'elijima' 'elijega' 'elijeg' 'elijem' 'elemu' 'elega' 'eleg' 'elem' 'elijim' 'elijih' 'elijoj' 'eloga' 'elome' 'elomu' 'elima' 'eliji' 'elije' 'elija' 'eliju' 'elim' 'elih' 'eloj' 'elom' 'elog' 'eli' 'ela' 'elu' 'ele' 'elo' (<-'el') 'anjijemu' 'anjijima' 'anjijega' 'anjijeg' 'anjijem' 'anjemu' 'anjega' 'anjeg' 'anjem' 'anjijim' 'anjijih' 'anjijoj' 'anjoga' 'anjome' 'anjomu' 'anjima' 'anjiji' 'anjije' 'anjija' 'anjiju' 'anjim' 'anjih' 'anjoj' 'anjom' 'anjog' 'anja' 'anje' 'anji' 'anjo' 
'anju' (<-'anj') 'enjijemu' 'enjijima' 'enjijega' 'enjijeg' 'enjijem' 'enjemu' 'enjega' 'enjeg' 'enjem' 'enjijim' 'enjijih' 'enjijoj' 'enjoga' 'enjome' 'enjomu' 'enjima' 'enjiji' 'enjije' 'enjija' 'enjiju' 'enjim' 'enjih' 'enjoj' 'enjom' 'enjog' 'enja' 'enje' 'enji' 'enjo' 'enju' (<-'enj') '{sx}njijemu' '{sx}njijima' '{sx}njijega' '{sx}njijeg' '{sx}njijem' '{sx}njemu' '{sx}njega' '{sx}njeg' '{sx}njem' '{sx}njijim' '{sx}njijih' '{sx}njijoj' '{sx}njoga' '{sx}njome' '{sx}njomu' '{sx}njima' '{sx}njiji' '{sx}njije' '{sx}njija' '{sx}njiju' '{sx}njim' '{sx}njih' '{sx}njoj' '{sx}njom' '{sx}njog' '{sx}nja' '{sx}nje' '{sx}nji' '{sx}njo' '{sx}nju' (<-'{sx}nj') 'anemu' 'anega' 'aneg' 'anem' (<-'an') 'enemu' 'enega' 'eneg' 'enem' (<-'en') '{sx}nemu' '{sx}nega' '{sx}neg' '{sx}nem' (<-'{sx}n') '{cx}inama' '{cx}inome' '{cx}inomu' '{cx}inoga' '{cx}inima' '{cx}inog' '{cx}inom' '{cx}inim' '{cx}inih' '{cx}inoj' '{cx}ina' '{cx}inu' '{cx}ini' '{cx}ino' '{cx}ine' (<-'{cx}in') 'ro{sx}iv{sx}i' 'ro{sx}ismo' 'ro{sx}iste' 'ro{sx}i{sx}e' 'ro{sx}imo' 'ro{sx}ite' 'ro{sx}iti' 'ro{sx}ili' 'ro{sx}ila' 'ro{sx}ilo' 'ro{sx}ile' 'ro{sx}im' 'ro{sx}i{sx}' 'ro{sx}it' 'ro{sx}ih' 'ro{sx}io' (<-'ro{sx}i') 'o{sx}ijemu' 'o{sx}ijima' 'o{sx}ijega' 'o{sx}ijeg' 'o{sx}ijem' 'o{sx}emu' 'o{sx}ega' 'o{sx}eg' 'o{sx}em' 'o{sx}ijim' 'o{sx}ijih' 'o{sx}ijoj' 'o{sx}oga' 'o{sx}ome' 'o{sx}omu' 'o{sx}ima' 'o{sx}iji' 'o{sx}ije' 'o{sx}ija' 'o{sx}iju' 'o{sx}im' 'o{sx}ih' 'o{sx}oj' 'o{sx}om' 'o{sx}og' 'o{sx}i' 'o{sx}a' 'o{sx}u' 'o{sx}e' (<-'o{sx}') 'evitijima' 'evitijega' 'evitijemu' 'evitijem' 'evitega' 'evitemu' 'evitem' 'evitijim' 'evitijih' 'evitijoj' 'evitijeg' 'evitiji' 'evitije' 'evitija' 'evitoga' 'evitome' 'evitomu' 'evitima' 'evitog' 'evitom' 'evitim' 'evitih' 'evitoj' 'eviti' 'evite' 'evito' 'evita' 'evitu' (<-'evit') 'ovitijima' 'ovitijega' 'ovitijemu' 'ovitijem' 'ovitega' 'ovitemu' 'ovitem' 'ovitijim' 'ovitijih' 'ovitijoj' 'ovitijeg' 'ovitiji' 'ovitije' 'ovitija' 'ovitoga' 'ovitome' 'ovitomu' 'ovitima' 'ovitog' 
'ovitom' 'ovitim' 'ovitih' 'ovitoj' 'oviti' 'ovite' 'ovito' 'ovita' 'ovitu' (<-'ovit') 'astijima' 'astijega' 'astijemu' 'astijem' 'astega' 'astemu' 'astem' 'astijim' 'astijih' 'astijoj' 'astijeg' 'astiji' 'astije' 'astija' 'astoga' 'astome' 'astomu' 'astima' 'astog' 'astom' 'astim' 'astih' 'astoj' 'asti' 'aste' 'asto' 'asta' 'astu' (<-'ast') 'kijemu' 'kijima' 'kijega' 'kijeg' 'kijem' 'kemu' 'kega' 'keg' 'kem' 'kijim' 'kijih' 'kijoj' 'koga' 'kome' 'komu' 'kima' 'kiji' 'kije' 'kija' 'kiju' 'kim' 'kih' 'koj' 'kom' 'kog' 'kov' 'ki' 'ka' 'ku' 'ke' 'ko' (<-'k') 'evaju{cy}i' 'evasmo' 'evaste' 'evajmo' 'evajte' 'evaju' 'evala' 'evale' 'evali' 'evalo' 'evamo' 'evana' 'evane' 'evani' 'evano' 'evate' 'evati' 'eva{sx}e' 'evahu' 'evah' 'evaj' 'evam' 'evan' 'evao' 'evat' 'evav' 'eva{sx}' (<-'eva') 'avaju{cy}i' 'avasmo' 'avaste' 'avajmo' 'avajte' 'avaju' 'avala' 'avale' 'avali' 'avalo' 'avamo' 'avana' 'avane' 'avani' 'avano' 'avate' 'avati' 'ava{sx}e' 'avahu' 'avah' 'avaj' 'avam' 'avan' 'avao' 'avat' 'avav' 'ava{sx}' (<-'ava') 'ivaju{cy}i' 'ivasmo' 'ivaste' 'ivajmo' 'ivajte' 'ivaju' 'ivala' 'ivale' 'ivali' 'ivalo' 'ivamo' 'ivana' 'ivane' 'ivani' 'ivano' 'ivate' 'ivati' 'iva{sx}e' 'ivahu' 'ivah' 'ivaj' 'ivam' 'ivan' 'ivao' 'ivat' 'ivav' 'iva{sx}' (<-'iva') 'uvaju{cy}i' 'uvasmo' 'uvaste' 'uvajmo' 'uvajte' 'uvaju' 'uvala' 'uvale' 'uvali' 'uvalo' 'uvamo' 'uvana' 'uvane' 'uvani' 'uvano' 'uvate' 'uvati' 'uva{sx}e' 'uvahu' 'uvah' 'uvaj' 'uvam' 'uvan' 'uvao' 'uvat' 'uvav' 'uva{sx}' (<-'uva') 'irujemo' 'irujete' 'iruju{cy}i' 'iraju{cy}i' 'irivat' 'irujem' 'iruje{sx}' 'irujmo' 'irujte' 'irav{sx}i' 'irasmo' 'iraste' 'irati' 'iramo' 'irate' 'iraju' 'ira{sx}e' 'irahu' 'irala' 'iralo' 'irali' 'irale' 'iruje' 'iruju' 'iruj' 'iral' 'iran' 'iram' 'ira{sx}' 'irat' 'irah' 'irao' (<-'ir') 'a{cx}ismo' 'a{cx}iste' 'a{cx}iti' 'a{cx}imo' 'a{cx}ite' 'a{cx}i{sx}e' 'a{cx}e{cy}i' 'a{cx}ila' 'a{cx}ilo' 'a{cx}ili' 'a{cx}ile' 'a{cx}ena' 'a{cx}eno' 'a{cx}eni' 'a{cx}ene' 'a{cx}io' 'a{cx}im' 'a{cx}i{sx}' 
'a{cx}it' 'a{cx}ih' 'a{cx}en' 'a{cx}i' 'a{cx}e' (<-'a{cx}') 'a{cx}av{sx}i' 'a{cx}asmo' 'a{cx}aste' 'a{cx}ahu' 'a{cx}ati' 'a{cx}amo' 'a{cx}ate' 'a{cx}a{sx}e' 'a{cx}ala' 'a{cx}alo' 'a{cx}ali' 'a{cx}ale' 'a{cx}aju' 'a{cx}ana' 'a{cx}ano' 'a{cx}ani' 'a{cx}ane' 'a{cx}ao' 'a{cx}am' 'a{cx}a{sx}' 'a{cx}at' 'a{cx}ah' 'a{cx}an' (<-'a{cx}a') 'nuv{sx}i' 'nusmo' 'nuste' 'nu{cy}i' 'nimo' 'nite' 'nemo' 'nete' 'nula' 'nulo' 'nule' 'nuli' 'nuto' 'nuti' 'nuta' 'ne{sx}' 'nuo' 'nut' (<-'n') 'niv{sx}i' 'nismo' 'niste' 'niti' 'nila' 'nilo' 'nile' 'nili' 'ni{sx}' 'nio' (<-'ni') 'aju{cy}i' 'av{sx}i' 'asmo' 'ajmo' 'ajte' 'ajem' 'aloj' 'amo' 'ate' 'aje' 'aju' 'ati' 'a{sx}e' 'ahu' 'ala' 'ali' 'ale' 'alo' 'ano' 'at' 'ah' 'ao' 'aj' 'an' 'am' 'a{sx}' (<-'a') 'uraju{cy}i' 'urasmo' 'uraste' 'urajmo' 'urajte' 'uramo' 'urate' 'uraju' 'urati' 'ura{sx}e' 'urahu' 'urala' 'urali' 'urale' 'uralo' 'urana' 'urano' 'urani' 'urane' 'ural' 'urat' 'urah' 'urao' 'uraj' 'uran' 'uram' 'ura{sx}' (<-'ur') 'astajasmo' 'astajaste' 'astajahu' 'astajati' 'astajemo' 'astajete' 'astaja{sx}e' 'astajali' 'astaju{cy}i' 'astajala' 'astajalo' 'astajale' 'astajmo' 'astajao' 'astajem' 'astaje{sx}' 'astajat' 'astajah' 'astajte' 'astaje' 'astaju' (<-'astaj') 'istajasmo' 'istajaste' 'istajahu' 'istajati' 'istajemo' 'istajete' 'istaja{sx}e' 'istajali' 'istaju{cy}i' 'istajala' 'istajalo' 'istajale' 'istajmo' 'istajao' 'istajem' 'istaje{sx}' 'istajat' 'istajah' 'istajte' 'istaje' 'istaju' (<-'istaj') 'ostajasmo' 'ostajaste' 'ostajahu' 'ostajati' 'ostajemo' 'ostajete' 'ostaja{sx}e' 'ostajali' 'ostaju{cy}i' 'ostajala' 'ostajalo' 'ostajale' 'ostajmo' 'ostajao' 'ostajem' 'ostaje{sx}' 'ostajat' 'ostajah' 'ostajte' 'ostaje' 'ostaju' (<-'ostaj') 'alama' 'alima' 'alom' 'alu' 'al' (<-'a') 'ajevima' 'ajevi' 'ajeva' 'ajeve' 'ajama' 'ajima' 'aja' 'aji' (<-'aj') 'astadosmo' 'astadoste' 'astado{sx}e' 'astanemo' 'astademo' 'astanete' 'astadete' 'astanimo' 'astanite' 'astanila' 'astav{sx}i' 'astanem' 'astadem' 'astane{sx}' 'astade{sx}' 'astadoh' 
'astade' 'astati' 'astane' 'astanu' 'astadu' 'astala' 'astali' 'astalo' 'astale' 'astat' 'astao' (<-'asta') 'istadosmo' 'istadoste' 'istado{sx}e' 'istanemo' 'istademo' 'istanete' 'istadete' 'istanimo' 'istanite' 'istanila' 'istav{sx}i' 'istanem' 'istadem' 'istane{sx}' 'istade{sx}' 'istadoh' 'istade' 'istati' 'istane' 'istanu' 'istadu' 'istala' 'istali' 'istalo' 'istale' 'istat' 'istao' (<-'ista') 'ostadosmo' 'ostadoste' 'ostado{sx}e' 'ostanemo' 'ostademo' 'ostanete' 'ostadete' 'ostanimo' 'ostanite' 'ostanila' 'ostav{sx}i' 'ostanem' 'ostadem' 'ostane{sx}' 'ostade{sx}' 'ostadoh' 'ostade' 'ostati' 'ostane' 'ostanu' 'ostadu' 'ostala' 'ostali' 'ostalo' 'ostale' 'ostat' 'ostao' (<-'osta') 'tasmo' 'taste' 'tajmo' 'tajte' 'tav{sx}i' 'tati' 'tamo' 'tate' 'taju' 'tala' 'talo' 'tale' 'tali' 'tana' 'tano' 'tani' 'tane' 'tan' 'taj' 'tao' 'tam' 'ta{sx}' 'tat' 'tah' (<-'ta') 'injasmo' 'injaste' 'injati' 'injemo' 'injete' 'injali' 'injala' 'injalo' 'injale' 'inja{sx}e' 'injahu' 'injem' 'inje{sx}' 'injat' 'injah' 'injao' (<-'inj') 'astemo' 'astete' 'astimo' 'astite' 'astu{cy}i' 'aste{sx}' 'asli' 'asla' 'aslo' 'asle' (<-'as') 'iv{sx}i' 'ie{cy}i' 'ismo' 'imo' 'ite' 'iti' 'ili' 'ila' 'ilo' 'ile' 'im' 'i{sx}' 'it' 'ih' 'io' (<-'i') 'ijemo' 'ijete' 'ijem' 'ije{sx}' 'ijmo' 'ijte' 'iju' 'ije' 'ij' 'ilu' (<-'i') 'lu{cx}ujete' 'lu{cx}uju{cy}i' 'lu{cx}ujemo' 'lu{cx}ujem' 'lu{cx}uje{sx}' 'lu{cx}ismo' 'lu{cx}iste' 'lu{cx}ujmo' 'lu{cx}ujte' 'lu{cx}uje' 'lu{cx}uju' 'lu{cx}i{sx}e' 'lu{cx}iti' 'lu{cx}imo' 'lu{cx}ite' 'lu{cx}ila' 'lu{cx}ilo' 'lu{cx}ili' 'lu{cx}ile' 'lu{cx}ena' 'lu{cx}eno' 'lu{cx}eni' 'lu{cx}ene' 'lu{cx}uj' 'lu{cx}io' 'lu{cx}en' 'lu{cx}im' 'lu{cx}i{sx}' 'lu{cx}it' 'lu{cx}ih' 'lu{cx}e' 'lu{cx}i' (<-'lu{cx}') 'jetismo' 'jetiste' 'jeti{sx}e' 'jetimo' 'jetite' 'jetiti' 'jetili' 'jetila' 'jetilo' 'jetile' 'jetim' 'jeti{sx}' 'jetit' 'jetih' 'jetio' (<-'jeti') 'emo' 'em' 'e{sx}' 'elama' 'el' (<-'e') 'ilama' 'ilima' 'ilom' 'il' (<-'i') 'atijega' 'atijemu' 'atijima' 'atijeg' 'atijem' 'atega' 
'atemu' 'ateg' 'atem' 'atijih' 'atijim' 'atima' 'atoga' 'atome' 'atomu' 'atiji' 'atije' 'atija' 'atiju' 'atoj' 'atog' 'atom' 'atim' 'atih' 'ata' 'atu' 'ato' (<-'at') 'etav{sx}i' 'etu{cy}i' 'etemo' 'etimo' 'etem' 'ete{sx}' (<-'et') 'lucujuci' 'lucujemo' 'lucujete' 'lucujem' 'lucujes' 'lucujmo' 'lucujte' 'lucismo' 'luciste' 'luciti' 'lucite' 'lucise' 'lucuje' 'lucuju' 'lucila' 'lucile' 'lucili' 'lucilo' 'lucena' 'luceni' 'lucene' 'luceno' 'lucimo' 'lucim' 'lucis' 'lucih' 'lucit' 'lucio' 'lucuj' 'lucen' 'luce' 'luci' (no_diacritics <-'luc') 'snjijima' 'snjijemu' 'snjijega' 'snjijim' 'snjijih' 'snjijeg' 'snjijoj' 'snjiji' 'snjija' 'snjije' 'snjiju' 'snjima' 'snjemu' 'snjomu' 'snjome' 'snjega' 'snjoga' 'snjih' 'snjim' 'snjem' 'snjom' 'snjeg' 'snjog' 'snjoj' 'snja' 'snje' 'snji' 'snjo' 'snju' (no_diacritics <-'snj') 'osijima' 'osijemu' 'osijega' 'snjijem' 'osijih' 'osijim' 'osijem' 'osijeg' 'osijoj' 'osima' 'osemu' 'osomu' 'osome' 'osega' 'osoga' 'osija' 'osije' 'osiji' 'osiju' 'osih' 'osim' 'osem' 'osom' 'oseg' 'osog' 'osoj' 'osa' 'ose' 'osi' 'osu' (no_diacritics <-'os') 'acismo' 'aciste' 'acima' 'acimo' 'acome' 'acomu' 'acite' 'aciti' 'acise' 'acila' 'acile' 'acili' 'acilo' 'acega' 'acene' 'aceci' 'aceni' 'acemu' 'acena' 'aceno' 'acoga' 'acoj' 'acih' 'acem' 'acom' 'acen' 'acog' 'acit' 'acio' 'aceg' 'acim' 'acuh' 'acis' 'ace' 'aca' 'aci' (no_diacritics <-'ac') 'ecome' 'ecoga' 'ecemu' 'ecima' 'ecega' 'ecomu' 'ecoj' 'ecuh' 'ecom' 'ecog' 'eceg' 'ecih' 'ecem' 'ecim' 'eca' 'ece' (no_diacritics <-'ec') 'ucomu' 'ucome' 'ucima' 'ucoga' 'ucega' 'ucemu' 'ucih' 'ucog' 'uceg' 'ucom' 'ucem' 'ucim' 'ucuh' 'ucoj' 'uca' 'uce' (no_diacritics <-'uc') 'rosismo' 'rosivsi' 'rosiste' 'rositi' 'rosili' 'rosise' 'rosite' 'rosilo' 'rosimo' 'rosile' 'rosila' 'rosit' 'rosis' 'rosio' 'rosim' 'rosih' (no_diacritics <-'rosi') 'acavsi' 'acaste' 'acasmo' 'acaju' 'acane' 'acate' 'acali' 'acani' 'acati' 'acale' 'acahu' 'acase' 'acano' 'acamo' 'acalo' 'acana' 'acala' 'acam' 'acan' 'acao' 'acas' 'acat' 
'acah' (no_diacritics <-'aca') 'jasima' 'jasama' 'jasem' 'jasom' 'jase' 'jasi' 'jasa' 'jasu' (no_diacritics <-'jas') 'tasima' 'tasama' 'tasem' 'tasom' 'tase' 'tasa' 'tasu' 'tasi' (no_diacritics <-'tas') 'gasima' 'gasama' 'gasem' 'gasom' 'gasi' 'gasu' 'gase' 'gasa' (no_diacritics <-'gas') 'nasama' 'nasima' 'nasem' 'nasom' 'nasu' 'nasi' 'nase' 'nasa' (no_diacritics <-'nas') 'kasama' 'kasima' 'kasom' 'kasem' 'kasi' 'kasu' 'kase' 'kasa' (no_diacritics <-'kas') 'vasama' 'vasima' 'vasom' 'vasem' 'vasi' 'vase' 'vasa' 'vasu' (no_diacritics <-'vas') 'basama' 'basima' 'basom' 'basem' 'basi' 'base' 'basu' 'basa' (no_diacritics <-'bas') 'astuci' 'astes' (no_diacritics <-'as') 'cinima' 'cinome' 'cinama' 'cinomu' 'cinoga' 'cinom' 'cinih' 'cinim' 'cinog' 'cinoj' 'cino' 'cini' 'cinu' 'cine' 'cina' (no_diacritics <-'cin') 'astajase' 'astajuci' 'astajes' (no_diacritics <-'astaj') 'istajase' 'istajuci' 'istajes' (no_diacritics <-'istaj') 'ostajase' 'ostajuci' 'ostajes' (no_diacritics <-'ostaj') 'astadose' 'astades' 'astanes' 'astavsi' (no_diacritics <-'asta') 'istadose' 'istades' 'istanes' 'istavsi' (no_diacritics <-'ista') 'ostadose' 'ostades' 'ostanes' 'ostavsi' (no_diacritics <-'osta') 'avajuci' 'avase' 'avas' (no_diacritics <-'ava') 'evajuci' 'evase' 'evas' (no_diacritics <-'eva') 'ivajuci' 'ivase' 'ivas' (no_diacritics <-'iva') 'uvajuci' 'uvase' 'uvas' (no_diacritics <-'uva') 'ovase' (no_diacritics <-'ova') 'jetise' 'jetis' (no_diacritics <-'jeti') 'injase' 'injes' (no_diacritics <-'inj') 'istem' (no_diacritics <-'ist') 'esama' 'esem' 'esi' (no_diacritics <-'es') 'etavsi' 'etuci' 'etes' (no_diacritics <-'et') 'isama' 'isem' 'isi' (no_diacritics <-'is') 'irajuci' 'irujuci' 'irujes' 'iravsi' 'irase' 'iras' (no_diacritics <-'ir') 'urajuci' 'urase' 'uras' (no_diacritics <-'ur') 'ujuci' 'ujes' (no_diacritics <-'uj') 'nivsi' 'nis' (no_diacritics <-'ni') 'snega' 'snemu' 'snem' 'sneg' (no_diacritics <-'sn') 'tavsi' 'tas' (no_diacritics <-'ta') 'ajuci' 'avsi' 'ase' 'as' (no_diacritics 
<-'a') 'ijes' 'ivsi' 'ieci' 'is' (no_diacritics <-'i') 'es' (no_diacritics <-'e') 'nuvsi' 'nuci' 'nes' (no_diacritics <-'n') ) ) define Step_3 as ( [substring] R1 among ( 'enom' 'enoj' 'enog' 'enim' 'enih' 'anoj' 'anog' 'anim' 'anih' 'ost' 'eno' 'eni' 'oga' 'ima' 'enu' 'ena' 'ama' 'ano' 'ani' 'om' 'og' 'u' 'o' 'i' 'e' 'a' (<-'') ) ) ) define stem as ( do cyr_to_lat do prelude do mark_regions backwards ( do Step_1 do (Step_2 or Step_3) ) ) snowball-2.2.0/algorithms/spanish.sbl000066400000000000000000000133561414263061200176630ustar00rootroot00000000000000routines ( postlude mark_regions RV R1 R2 attached_pronoun standard_suffix y_verb_suffix verb_suffix residual_suffix ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v ) stringescapes {} /* special characters */ stringdef a' '{U+00E1}' // a-acute stringdef e' '{U+00E9}' // e-acute stringdef i' '{U+00ED}' // i-acute stringdef o' '{U+00F3}' // o-acute stringdef u' '{U+00FA}' // u-acute stringdef u" '{U+00FC}' // u-diaeresis stringdef n~ '{U+00F1}' // n-tilde define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}' define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v (non-v gopast v) or (v gopast non-v) ) or ( non-v (non-v gopast v) or (v next) ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define postlude as repeat ( [substring] among( '{a'}' (<- 'a') '{e'}' (<- 'e') '{i'}' (<- 'i') '{o'}' (<- 'o') '{u'}' (<- 'u') // and possibly {u"}->u here, or in prelude '' (next) ) //or next ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define attached_pronoun as ( [substring] among( 'me' 'se' 'sela' 'selo' 'selas' 'selos' 'la' 'le' 'lo' 'las' 'les' 'los' 'nos' ) substring RV among( 'i{e'}ndo' (] <- 'iendo') '{a'}ndo' (] <- 'ando') '{a'}r' (] <- 'ar') '{e'}r' (] <- 'er') '{i'}r' (] <- 'ir') 'ando' 'iendo' 'ar' 'er' 'ir' (delete) 'yendo' ('u' delete) ) ) define standard_suffix as ( [substring] among( 'anza' 'anzas' 
'ico' 'ica' 'icos' 'icas' 'ismo' 'ismos' 'able' 'ables' 'ible' 'ibles' 'ista' 'istas' 'oso' 'osa' 'osos' 'osas' 'amiento' 'amientos' 'imiento' 'imientos' ( R2 delete ) 'adora' 'ador' 'aci{o'}n' 'adoras' 'adores' 'aciones' 'ante' 'antes' 'ancia' 'ancias'// Note 1 ( R2 delete try ( ['ic'] R2 delete ) ) 'log{i'}a' 'log{i'}as' ( R2 <- 'log' ) 'uci{o'}n' 'uciones' ( R2 <- 'u' ) 'encia' 'encias' ( R2 <- 'ente' ) 'amente' ( R1 delete try ( [substring] R2 delete among( 'iv' (['at'] R2 delete) 'os' 'ic' 'ad' ) ) ) 'mente' ( R2 delete try ( [substring] among( 'ante' // Note 1 'able' 'ible' (R2 delete) ) ) ) 'idad' 'idades' ( R2 delete try ( [substring] among( 'abil' 'ic' 'iv' (R2 delete) ) ) ) 'iva' 'ivo' 'ivas' 'ivos' ( R2 delete try ( ['at'] R2 delete // but not a further ['ic'] R2 delete ) ) ) ) define y_verb_suffix as ( setlimit tomark pV for ([substring]) among( 'ya' 'ye' 'yan' 'yen' 'yeron' 'yendo' 'yo' 'y{o'}' 'yas' 'yes' 'yais' 'yamos' ('u' delete) ) ) define verb_suffix as ( setlimit tomark pV for ([substring]) among( 'en' 'es' '{e'}is' 'emos' (try ('u' test 'g') ] delete) 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais' 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}' 'ar{e'}' 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais' 'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}' 'er{e'}' 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais' 'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}' 'ir{e'}' 'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed' 'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an' 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado' 'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as' 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases' 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais' 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados' 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos' '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos' (delete) ) ) define residual_suffix as ( [substring] among( 'os' 'a' 'o' '{a'}' '{i'}' '{o'}' ( RV delete ) 'e' 
'{e'}' ( RV delete try( ['u'] test 'g' RV delete ) ) ) ) ) define stem as ( do mark_regions backwards ( do attached_pronoun do ( standard_suffix or y_verb_suffix or verb_suffix ) do residual_suffix ) do postlude ) /* Note 1: additions of 15 Jun 2005 */ snowball-2.2.0/algorithms/swedish.sbl000066400000000000000000000027121414263061200176560ustar00rootroot00000000000000routines ( mark_regions main_suffix consonant_pair other_suffix ) externals ( stem ) integers ( p1 x ) groupings ( v s_ending ) stringescapes {} /* special characters */ stringdef a" '{U+00E4}' stringdef ao '{U+00E5}' stringdef o" '{U+00F6}' define v 'aeiouy{a"}{ao}{o"}' define s_ending 'bcdfghjklmnoprtvy' define mark_regions as ( $p1 = limit test ( hop 3 setmark x ) goto v gopast non-v setmark p1 try ( $p1 < x $p1 = x ) ) backwardmode ( define main_suffix as ( setlimit tomark p1 for ([substring]) among( 'a' 'arna' 'erna' 'heterna' 'orna' 'ad' 'e' 'ade' 'ande' 'arne' 'are' 'aste' 'en' 'anden' 'aren' 'heten' 'ern' 'ar' 'er' 'heter' 'or' 'as' 'arnas' 'ernas' 'ornas' 'es' 'ades' 'andes' 'ens' 'arens' 'hetens' 'erns' 'at' 'andet' 'het' 'ast' (delete) 's' (s_ending delete) ) ) define consonant_pair as setlimit tomark p1 for ( among('dd' 'gd' 'nn' 'dt' 'gt' 'kt' 'tt') and ([next] delete) ) define other_suffix as setlimit tomark p1 for ( [substring] among( 'lig' 'ig' 'els' (delete) 'l{o"}st' (<-'l{o"}s') 'fullt' (<-'full') ) ) ) define stem as ( do mark_regions backwards ( do main_suffix do consonant_pair do other_suffix ) ) snowball-2.2.0/algorithms/tamil.sbl000066400000000000000000000265311414263061200173230ustar00rootroot00000000000000/* * Affix stripping stemming algorithm for Tamil * By Damodharan Rajalingam */ stringescapes {} /* Aytham */ stringdef aytham '{U+0B83}' /* Uyir - independent vowels */ stringdef a '{U+0B85}' stringdef aa '{U+0B86}' stringdef i '{U+0B87}' stringdef ii '{U+0B88}' stringdef u '{U+0B89}' stringdef uu '{U+0B8A}' stringdef e '{U+0B8E}' stringdef ee '{U+0B8F}' stringdef ai 
'{U+0B90}' stringdef o '{U+0B92}' stringdef oo '{U+0B93}' stringdef au '{U+0B94}' /* Consonants */ stringdef ka '{U+0B95}' stringdef nga '{U+0B99}' stringdef ca '{U+0B9A}' stringdef ja '{U+0B9C}' stringdef nya '{U+0B9E}' stringdef tta '{U+0B9F}' stringdef nna '{U+0BA3}' stringdef ta '{U+0BA4}' stringdef tha '{U+0BA4}' stringdef na '{U+0BA8}' stringdef nnna '{U+0BA9}' stringdef pa '{U+0BAA}' stringdef ma '{U+0BAE}' stringdef ya '{U+0BAF}' stringdef ra '{U+0BB0}' stringdef rra '{U+0BB1}' stringdef la '{U+0BB2}' stringdef lla '{U+0BB3}' stringdef llla '{U+0BB4}' stringdef zha '{U+0BB4}' stringdef va '{U+0BB5}' /* Vatamozi - borrowed */ stringdef sha '{U+0BB6}' stringdef ssa '{U+0BB7}' stringdef sa '{U+0BB8}' stringdef ha '{U+0BB9}' /* Dependent vowel signs (kombu etc.) */ stringdef vs_aa '{U+0BBE}' stringdef vs_i '{U+0BBF}' stringdef vs_ii '{U+0BC0}' stringdef vs_u '{U+0BC1}' stringdef vs_uu '{U+0BC2}' stringdef vs_e '{U+0BC6}' stringdef vs_ee '{U+0BC7}' stringdef vs_ai '{U+0BC8}' stringdef vs_o '{U+0BCA}' stringdef vs_oo '{U+0BCB}' stringdef vs_au '{U+0BCC}' /* Pulli */ stringdef pulli '{U+0BCD}' /* AU length markk */ stringdef au_lmark '{U+0BD7}' routines ( remove_plural_suffix remove_question_suffixes remove_question_prefixes remove_pronoun_prefixes remove_command_suffixes remove_um remove_vetrumai_urupukal fix_va_start fix_ending fix_endings remove_tense_suffix remove_tense_suffixes remove_common_word_endings has_min_length ) externals ( stem ) booleans ( found_a_match found_vetrumai_urupu ) define has_min_length as ( $(len > 4) ) define fix_va_start as ( (try '{va}{vs_oo}' and [ '{va}{vs_oo}' ] <- '{oo}' ) or (try '{va}{vs_o}' and [ '{va}{vs_o}' ] <- '{o}' ) or (try '{va}{vs_u}' and [ '{va}{vs_u}' ] <- '{u}' ) or (try '{va}{vs_uu}' and [ '{va}{vs_uu}' ] <- '{uu}' ) ) define fix_endings as ( do repeat fix_ending ) define remove_question_prefixes as ( [ ('{e}' ) among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete do 
fix_va_start ) // Gives signal t if an ending was fixed, signal f otherwise. define fix_ending as ( $(len > 3) backwards ( ( [among('{na}{pulli}' '{na}{pulli}{ta}' '{na}{pulli}{ta}{pulli}') ] delete ) or ( ['{ya}{pulli}' test among('{vs_ai}' '{vs_i}' '{vs_ii}') ] delete ) or ( [ '{tta}{pulli}{pa}{pulli}' or '{tta}{pulli}{ka}{pulli}' ] <- '{lla}{pulli}' ) or ( [ '{nnna}{pulli}{rra}{pulli}' ] <- '{la}{pulli}' ) or // ( [ '{rra}{pulli}{ka}{pulli}' or '{nnna}{pulli}{nnna}{pulli}' ] <- '{la}{pulli}' ) ( [ '{rra}{pulli}{ka}{pulli}' ] <- '{la}{pulli}' ) or ( [ '{tta}{pulli}{tta}{pulli}' ] <- '{tta}{vs_u}' ) or ( found_vetrumai_urupu [ '{ta}{pulli}{ta}{pulli}' (test not '{vs_ai}') ] <- '{ma}{pulli}' ] ) or ( [ '{vs_u}{ka}{pulli}' or '{vs_u}{ka}{pulli}{ka}{pulli}' ] <- '{pulli}' ) or ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete ) or ( [ '{vs_u}{ka}{pulli}' ] <- '{pulli}' ) or ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete ) or ( [ '{pulli}' (among('{ya}' '{ra}' '{la}' '{va}' '{zha}' '{lla}') or among('{nga}' '{nya}' '{nna}' '{na}' '{ma}' '{nnna}')) '{pulli}' ] <- '{pulli}' ) or ( [ among('{va}' '{ya}' '{va}{pulli}') ] delete ) or ( [ '{nnna}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')) ] delete ) or ( [ '{nga}{pulli}' (test not '{vs_ai}')] <- '{ma}{pulli}' ) or ( [ '{nga}{pulli}' ] delete ) or ( [ '{pulli}' (test (among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') or '{pulli}')) ] delete ) ) ) define remove_pronoun_prefixes as ( unset found_a_match [ among('{a}' '{i}' '{u}') among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete (set found_a_match) do fix_va_start ) define remove_plural_suffix as ( unset found_a_match backwards ( ( [ '{vs_u}{nga}{pulli}{ka}{lla}{pulli}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' 
'{rra}')) ] <- '{pulli}' ) or ( [ '{rra}{pulli}{ka}{lla}{pulli}' ] <- '{la}{pulli}' ) or ( [ '{tta}{pulli}{ka}{lla}{pulli}' ] <- '{lla}{pulli}' ) or ( [ '{ka}{lla}{pulli}' ] delete ) (set found_a_match) ) ) define remove_question_suffixes as ( has_min_length unset found_a_match backwards ( do ( [ among('{vs_oo}' '{vs_ee}' '{vs_aa}') ] <- '{pulli}' (set found_a_match) ) ) do fix_endings ) define remove_command_suffixes as ( has_min_length unset found_a_match backwards ( [ among('{pa}{vs_i}' '{va}{vs_i}') ] delete (set found_a_match) ) ) define remove_um as ( unset found_a_match has_min_length backwards ( [ '{vs_u}{ma}{pulli}' ] <- '{pulli}' (set found_a_match) ) do fix_ending ) define remove_common_word_endings as ( // These are not suffixes actually but are // some words that are attached to other words // but can be removed for stemming unset found_a_match has_min_length backwards ( test ( [ '{vs_u}{tta}{nnna}{pulli}' or '{vs_i}{la}{pulli}{la}{vs_ai}' or '{vs_i}{tta}{ma}{pulli}' or '{vs_i}{nnna}{pulli}{rra}{vs_i}' or '{vs_aa}{ka}{vs_i}' or '{vs_aa}{ka}{vs_i}{ya}' or '{vs_e}{nnna}{pulli}{rra}{vs_u}' or '{vs_u}{lla}{pulli}{lla}' or '{vs_u}{tta}{vs_ai}{ya}' or '{vs_u}{tta}{vs_ai}' or '{vs_e}{nnna}{vs_u}{ma}{pulli}' or ('{la}{pulli}{la}' test (not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or '{vs_e}{nnna}' or '{vs_aa}{ka}{vs_i}' ] <- '{pulli}' (set found_a_match) ) or test ( [ among('{pa}{tta}{vs_u}' '{pa}{tta}{pulli}{tta}' '{pa}{tta}{pulli}{tta}{vs_u}' '{pa}{tta}{pulli}{tta}{ta}{vs_u}' '{pa}{tta}{pulli}{tta}{nna}' '{ka}{vs_u}{ra}{vs_i}{ya}' '{pa}{rra}{pulli}{rra}{vs_i}' '{va}{vs_i}{tta}{vs_u}' '{va}{vs_i}{tta}{pulli}{tta}{vs_u}' '{pa}{tta}{vs_i}{ta}{vs_aa}{nnna}' '{pa}{tta}{vs_i}' '{ta}{vs_aa}{nnna}' '{vs_e}{la}{pulli}{la}{vs_aa}{ma}{pulli}') ] delete (set found_a_match) ) ) do fix_endings ) define remove_vetrumai_urupukal as ( unset found_a_match unset found_vetrumai_urupu has_min_length backwards ( ( test ( 
['{nnna}{vs_ai}'] delete ) or test ([ ( '{vs_i}{nnna}{vs_ai}' or '{vs_ai}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}'))) or ( '{vs_ai}' (test (among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}'))) ] <- '{pulli}' ) or test ( [ '{vs_o}{tta}{vs_u}' or '{vs_oo}{tta}{vs_u}' or '{vs_i}{la}{pulli}' or '{vs_i}{rra}{pulli}' or ('{vs_i}{nnna}{pulli}' (test not '{ma}')) or '{vs_i}{nnna}{pulli}{rra}{vs_u}' or '{vs_i}{ra}{vs_u}{na}{pulli}{ta}{vs_u}' or '{va}{vs_i}{tta}' or ($(len >= 7) '{vs_i}{tta}{ma}{pulli}') or '{vs_aa}{la}{pulli}' or '{vs_u}{tta}{vs_ai}' or '{vs_aa}{ma}{la}{pulli}' or ('{la}{pulli}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or '{vs_u}{lla}{pulli}' ] <- '{pulli}' ) or test ( [ '{ka}{nna}{pulli}' or '{ma}{vs_u}{nnna}{pulli}' or '{ma}{vs_ee}{la}{pulli}' or '{ma}{vs_ee}{rra}{pulli}' or '{ka}{vs_ii}{llla}{pulli}' or '{pa}{vs_i}{nnna}{pulli}' or ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) ] delete ) or test ([ '{vs_ii}' ] <- '{vs_i}') ) (set found_a_match) (set found_vetrumai_urupu) do ( [ '{vs_i}{nnna}{pulli}' ] <- '{pulli}' ) ) do fix_endings ) define remove_tense_suffixes as ( set found_a_match repeat ( found_a_match (do remove_tense_suffix) ) ) define remove_tense_suffix as ( unset found_a_match has_min_length backwards ( do ( test ( [among( '{ka}{vs_o}{nna}{pulli}{tta}{vs_i}{ra}{pulli}' '{pa}{tta}{vs_u}' )] delete (set found_a_match) ) or test ( [ '{ma}{vs_aa}{ra}{pulli}' or '{ma}{vs_i}{nnna}{pulli}' or '{nnna}{nnna}{pulli}' or '{nnna}{vs_aa}{nnna}{pulli}' or '{nnna}{vs_aa}{lla}{pulli}' or '{nnna}{vs_aa}{ra}{pulli}' or ('{va}{nnna}{pulli}' test (not among('{a}' '{aa}' '{i}' '{ii}' '{u}' '{uu}' '{e}' '{ee}' '{ai}' '{o}' '{oo}' '{au}')) ) or '{nnna}{lla}{pulli}' or '{va}{lla}{pulli}' or '{nnna}{ra}{pulli}' or '{va}{ra}{pulli}' or '{nnna}' or '{pa}' or '{ka}' or '{ta}' or '{ya}' or '{pa}{nnna}{pulli}' or 
'{pa}{lla}{pulli}' or '{pa}{ra}{pulli}' or ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or '{vs_i}{rra}{pulli}{rra}{vs_u}' or '{pa}{ma}{pulli}' or '{nnna}{ma}{pulli}' or '{ta}{vs_u}{ma}{pulli}' or '{rra}{vs_u}{ma}{pulli}' or '{ka}{vs_u}{ma}{pulli}' or '{nnna}{vs_e}{nnna}{pulli}' or '{nnna}{vs_ai}' or '{va}{vs_ai}' ] delete (set found_a_match) ) or test ( [ ('{vs_aa}{nnna}{pulli}' test (not '{ca}')) or '{vs_aa}{lla}{pulli}' or '{vs_aa}{ra}{pulli}' or '{vs_ee}{nnna}{pulli}' or '{vs_aa}' or '{vs_aa}{ma}{pulli}' or '{vs_e}{ma}{pulli}' or '{vs_ee}{ma}{pulli}' or '{vs_oo}{ma}{pulli}' or '{ka}{vs_u}{ma}{pulli}' or '{ta}{vs_u}{ma}{pulli}' or '{tta}{vs_u}{ma}{pulli}' or '{rra}{vs_u}{ma}{pulli}' or '{vs_aa}{ya}{pulli}' or '{nnna}{vs_e}{nnna}{pulli}' or '{nnna}{vs_i}{ra}{pulli}' or '{vs_ii}{ra}{pulli}' or '{vs_ii}{ya}{ra}{pulli}' ] <- '{pulli}' (set found_a_match) ) or test ( ([ '{ka}{vs_u}' or '{ta}{vs_u}' ) (test '{pulli}') ] delete (set found_a_match) ) ) do ([among( '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}' '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}{pulli}' '{ka}{vs_i}{nnna}{pulli}{rra}' '{ka}{vs_i}{nnna}{pulli}{rra}{pulli}' '{ka}{vs_i}{rra}' '{ka}{vs_i}{rra}{pulli}' )] delete (set found_a_match) ) ) do fix_endings ) define stem as ( unset found_vetrumai_urupu do fix_ending has_min_length do remove_question_prefixes do remove_pronoun_prefixes do remove_question_suffixes do remove_um do remove_common_word_endings do remove_vetrumai_urupukal do remove_plural_suffix do remove_command_suffixes do remove_tense_suffixes ) snowball-2.2.0/algorithms/turkish.sbl000066400000000000000000000303441414263061200177030ustar00rootroot00000000000000/* Stemmer for Turkish * author: Evren (Kapusuz) Çilden * email: evren.kapusuz at gmail.com * version: 1.0 (15.01.2007) * stems nominal verb suffixes * stems nominal inflections * more than one syllable word check * (y,n,s,U) context check * vowel harmony check * last consonant check and 
conversion (b, c, d, ğ to p, ç, t, k) * The stemming algorithm is based on the paper "An Affix Stripping * Morphological Analyzer for Turkish" by Gülşen Eryiğit and * Eşref Adalı (Proceedings of the IAESTED International Conference * ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18,2004, * Innsbruck, Austria * Turkish is an agglutinative language and has a very rich morphological * structure. In Turkish, you can form many different words from a single stem * by appending a sequence of suffixes. Eg. The word "doktoruymuşsunuz" means * "You had been the doctor of him". The stem of the word is "doktor" and it * takes three different suffixes -sU, -ymUs, and -sUnUz. The rules about * the append order of suffixes can be clearly described as FSMs. * The paper referenced above defines some FSMs for right to left * morphological analysis. I generated a method for constructing snowball * expressions from right to left FSMs for stemming suffixes. */ routines ( append_U_to_stems_ending_with_d_or_g // for preventing some overstemmings check_vowel_harmony // tests vowel harmony for suffixes is_reserved_word // tests whether current string is a reserved word ('ad','soyad') mark_cAsInA // nominal verb suffix mark_DA // noun suffix mark_DAn // noun suffix mark_DUr // nominal verb suffix mark_ki // noun suffix mark_lAr // noun suffix, nominal verb suffix mark_lArI // noun suffix mark_nA // noun suffix mark_ncA // noun suffix mark_ndA // noun suffix mark_ndAn // noun suffix mark_nU // noun suffix mark_nUn // noun suffix mark_nUz // nominal verb suffix mark_sU // noun suffix mark_sUn // nominal verb suffix mark_sUnUz // nominal verb suffix mark_possessives // -(U)m,-(U)n,-(U)mUz,-(U)nUz, mark_yA // noun suffix mark_ylA // noun suffix mark_yU // noun suffix mark_yUm // nominal verb suffix mark_yUz // nominal verb suffix mark_yDU // nominal verb suffix mark_yken // nominal verb suffix mark_ymUs_ // nominal verb suffix mark_ysA // nominal verb suffix 
mark_suffix_with_optional_y_consonant mark_suffix_with_optional_U_vowel mark_suffix_with_optional_n_consonant mark_suffix_with_optional_s_consonant more_than_one_syllable_word post_process_last_consonants postlude stem_nominal_verb_suffixes stem_noun_suffixes stem_suffix_chain_before_ki ) stringescapes { } /* Special characters in Unicode Latin-1 and Latin Extended-A */ stringdef c, '{U+00E7}' // LATIN SMALL LETTER C WITH CEDILLA stringdef g~ '{U+011F}' // LATIN SMALL LETTER G WITH BREVE stringdef i' '{U+0131}' // LATIN SMALL LETTER I WITHOUT DOT stringdef o" '{U+00F6}' // LATIN SMALL LETTER O WITH DIAERESIS stringdef s, '{U+015F}' // LATIN SMALL LETTER S WITH CEDILLA stringdef u" '{U+00FC}' // LATIN SMALL LETTER U WITH DIAERESIS booleans ( continue_stemming_noun_suffixes ) groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6) define vowel 'ae{i'}io{o"}u{u"}' define U '{i'}iu{u"}' // the vowel grouping definitions below are used for checking vowel harmony define vowel1 'a{i'}ou' // vowels that can end with suffixes containing 'a' define vowel2 'ei{o"}{u"}' // vowels that can end with suffixes containing 'e' define vowel3 'a{i'}' // vowels that can end with suffixes containing 'i'' define vowel4 'ei' // vowels that can end with suffixes containing 'i' define vowel5 'ou' // vowels that can end with suffixes containing 'o' or 'u' define vowel6 '{o"}{u"}' // vowels that can end with suffixes containing 'o"' or 'u"' externals ( stem ) backwardmode ( // checks vowel harmony for possible suffixes, // helps to detect whether the candidate for suffix applies to vowel harmony // this rule is added to prevent over stemming define check_vowel_harmony as ( test ( (goto vowel) // if there is a vowel ( ('a' goto vowel1) or ('e' goto vowel2) or ('{i'}' goto vowel3) or ('i' goto vowel4) or ('o' goto vowel5) or ('{o"}' goto vowel6) or ('u' goto vowel5) or ('{u"}' goto vowel6) ) ) ) // if the last consonant before suffix is vowel and n then advance and delete // if the last 
consonant before suffix is non vowel and n do nothing // if the last consonant before suffix is not n then only delete the suffix // assumption: slice beginning is set correctly define mark_suffix_with_optional_n_consonant as ( ('n' (test vowel)) or ((not(test 'n')) test(next vowel)) ) // if the last consonant before suffix is vowel and s then advance and delete // if the last consonant before suffix is non vowel and s do nothing // if the last consonant before suffix is not s then only delete the suffix // assumption: slice beginning is set correctly define mark_suffix_with_optional_s_consonant as ( ('s' (test vowel)) or ((not(test 's')) test(next vowel)) ) // if the last consonant before suffix is vowel and y then advance and delete // if the last consonant before suffix is non vowel and y do nothing // if the last consonant before suffix is not y then only delete the suffix // assumption: slice beginning is set correctly define mark_suffix_with_optional_y_consonant as ( ('y' (test vowel)) or ((not(test 'y')) test(next vowel)) ) define mark_suffix_with_optional_U_vowel as ( (U (test non-vowel)) or ((not(test U)) test(next non-vowel)) ) define mark_possessives as ( among ('m{i'}z' 'miz' 'muz' 'm{u"}z' 'n{i'}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n') (mark_suffix_with_optional_U_vowel) ) define mark_sU as ( check_vowel_harmony U (mark_suffix_with_optional_s_consonant) ) define mark_lArI as ( among ('leri' 'lar{i'}') ) define mark_yU as ( check_vowel_harmony U (mark_suffix_with_optional_y_consonant) ) define mark_nU as ( check_vowel_harmony among ('n{i'}' 'ni' 'nu' 'n{u"}') ) define mark_nUn as ( check_vowel_harmony among ('{i'}n' 'in' 'un' '{u"}n') (mark_suffix_with_optional_n_consonant) ) define mark_yA as ( check_vowel_harmony among('a' 'e') (mark_suffix_with_optional_y_consonant) ) define mark_nA as ( check_vowel_harmony among('na' 'ne') ) define mark_DA as ( check_vowel_harmony among('da' 'de' 'ta' 'te') ) define mark_ndA as ( check_vowel_harmony among('nda' 'nde') ) 
define mark_DAn as ( check_vowel_harmony among('dan' 'den' 'tan' 'ten') ) define mark_ndAn as ( check_vowel_harmony among('ndan' 'nden') ) define mark_ylA as ( check_vowel_harmony among('la' 'le') (mark_suffix_with_optional_y_consonant) ) define mark_ki as ( 'ki' ) define mark_ncA as ( check_vowel_harmony among('ca' 'ce') (mark_suffix_with_optional_n_consonant) ) define mark_yUm as ( check_vowel_harmony among ('{i'}m' 'im' 'um' '{u"}m') (mark_suffix_with_optional_y_consonant) ) define mark_sUn as ( check_vowel_harmony among ('s{i'}n' 'sin' 'sun' 's{u"}n' ) ) define mark_yUz as ( check_vowel_harmony among ('{i'}z' 'iz' 'uz' '{u"}z') (mark_suffix_with_optional_y_consonant) ) define mark_sUnUz as ( among ('s{i'}n{i'}z' 'siniz' 'sunuz' 's{u"}n{u"}z') ) define mark_lAr as ( check_vowel_harmony among ('ler' 'lar') ) define mark_nUz as ( check_vowel_harmony among ('n{i'}z' 'niz' 'nuz' 'n{u"}z') ) define mark_DUr as ( check_vowel_harmony among ('t{i'}r' 'tir' 'tur' 't{u"}r' 'd{i'}r' 'dir' 'dur' 'd{u"}r') ) define mark_cAsInA as ( among ('cas{i'}na' 'cesine') ) define mark_yDU as ( check_vowel_harmony among ('t{i'}m' 'tim' 'tum' 't{u"}m' 'd{i'}m' 'dim' 'dum' 'd{u"}m' 't{i'}n' 'tin' 'tun' 't{u"}n' 'd{i'}n' 'din' 'dun' 'd{u"}n' 't{i'}k' 'tik' 'tuk' 't{u"}k' 'd{i'}k' 'dik' 'duk' 'd{u"}k' 't{i'}' 'ti' 'tu' 't{u"}' 'd{i'}' 'di' 'du' 'd{u"}') (mark_suffix_with_optional_y_consonant) ) // does not fully obey vowel harmony define mark_ysA as ( among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se') (mark_suffix_with_optional_y_consonant) ) define mark_ymUs_ as ( check_vowel_harmony among ('m{i'}{s,}' 'mi{s,}' 'mu{s,}' 'm{u"}{s,}') (mark_suffix_with_optional_y_consonant) ) define mark_yken as ( 'ken' (mark_suffix_with_optional_y_consonant) ) define stem_nominal_verb_suffixes as ( [ set continue_stemming_noun_suffixes (mark_ymUs_ or mark_yDU or mark_ysA or mark_yken) or (mark_cAsInA (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_) or ( mark_lAr ] delete 
try([(mark_DUr or mark_yDU or mark_ysA or mark_ymUs_)) unset continue_stemming_noun_suffixes ) or (mark_nUz (mark_yDU or mark_ysA)) or ((mark_sUnUz or mark_yUz or mark_sUn or mark_yUm) ] delete try([ mark_ymUs_)) or (mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)) ]delete ) // stems noun suffix chains ending with -ki define stem_suffix_chain_before_ki as ( [ mark_ki ( (mark_DA] delete try([ (mark_lAr] delete try(stem_suffix_chain_before_ki)) or (mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) )) or (mark_nUn] delete try([ (mark_lArI] delete) or ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (stem_suffix_chain_before_ki) )) or (mark_ndA ( (mark_lArI] delete) or ((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki))) or (stem_suffix_chain_before_ki) )) ) ) define stem_noun_suffixes as ( ([mark_lAr] delete try(stem_suffix_chain_before_ki)) or ([mark_ncA] delete try( ([mark_lArI] delete) or ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or ([mark_lAr] delete stem_suffix_chain_before_ki) ) ) or ([(mark_ndA or mark_nA) ( (mark_lArI] delete) or (mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (stem_suffix_chain_before_ki) ) ) or ([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI))) or ( [mark_DAn] delete try ([ ( (mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lAr] delete try(stem_suffix_chain_before_ki)) or (stem_suffix_chain_before_ki) )) ) or ([mark_nUn or mark_ylA] delete try( ([mark_lAr] delete stem_suffix_chain_before_ki) or ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or stem_suffix_chain_before_ki ) ) or ([mark_lArI] delete) or (stem_suffix_chain_before_ki) or ([mark_DA or mark_yU or mark_yA] delete 
try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki)) or ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) ) define post_process_last_consonants as ( [substring] among ( 'b' (<- 'p') 'c' (<- '{c,}') 'd' (<- 't') '{g~}' (<- 'k') ) ) // after stemming if the word ends with 'd' or 'g' most probably last U is overstemmed // like in 'kedim' -> 'ked' // Turkish words don't usually end with 'd' or 'g' // some very well known words are ignored (like 'ad' 'soyad' // appends U to stems ending with d or g, decides which vowel to add // based on the last vowel in the stem define append_U_to_stems_ending_with_d_or_g as ( test('d' or 'g') (test((goto vowel) 'a' or '{i'}') <+ '{i'}') or (test((goto vowel) 'e' or 'i') <+ 'i') or (test((goto vowel) 'o' or 'u') <+ 'u') or (test((goto vowel) '{o"}' or '{u"}') <+ '{u"}') ) define is_reserved_word as ( 'ad' try 'soy' atlimit ) ) // Tests if there are more than one syllables // In Turkish each vowel indicates a distinct syllable define more_than_one_syllable_word as ( test (atleast 2 (gopast vowel)) ) define postlude as ( backwards ( not(is_reserved_word) do append_U_to_stems_ending_with_d_or_g do post_process_last_consonants ) ) define stem as ( (more_than_one_syllable_word) ( backwards ( do stem_nominal_verb_suffixes continue_stemming_noun_suffixes do stem_noun_suffixes ) postlude ) ) snowball-2.2.0/algorithms/yiddish.sbl000066400000000000000000000411431414263061200176460ustar00rootroot00000000000000/* ******************************************* * Stemmer for Yiddish language in YIVO script * * Author: Assaf Urieli * Emails: assaf.urieli at gmail.com * Version: 0.1 (15.05.2020) * ********************************************* */ routines ( prelude mark_regions R1 R1plus3 standard_suffix ) externals ( stem ) integers ( p1 x ) groupings ( vowel niked alefBeys consonant ) stringescapes {} // AlefBeys stringdef Alef '{U+05D0}' stringdef Beys '{U+05D1}' 
stringdef Giml '{U+05D2}' stringdef Dalet '{U+05D3}' stringdef Hey '{U+05D4}' stringdef Vov '{U+05D5}' stringdef Zayen '{U+05D6}' stringdef Khes '{U+05D7}' stringdef Tes '{U+05D8}' stringdef Yud '{U+05D9}' stringdef LangerKhof '{U+05DA}' stringdef Khof '{U+05DB}' stringdef Lamed '{U+05DC}' stringdef ShlosMem '{U+05DD}' stringdef Mem '{U+05DE}' stringdef LangerNun '{U+05DF}' stringdef Nun '{U+05E0}' stringdef Samekh '{U+05E1}' stringdef Ayen '{U+05E2}' stringdef LangerFey '{U+05E3}' stringdef Fey '{U+05E4}' stringdef LangerTsadek '{U+05E5}' stringdef Tsadek '{U+05E6}' stringdef Kuf '{U+05E7}' stringdef Reysh '{U+05E8}' stringdef Shin '{U+05E9}' stringdef Sof '{U+05EA}' stringdef TsveyVovn '{U+05F0}' stringdef VovYud '{U+05F1}' stringdef TsveyYudn '{U+05F2}' // Niked stringdef Shvo '{U+05B0}' stringdef Khirik '{U+05B4}' stringdef Tseyre '{U+05B5}' stringdef Segl '{U+05B6}' stringdef ReducedSegl '{U+05B1}' stringdef Pasekh '{U+05B7}' stringdef ReducedPasekh '{U+05B2}' stringdef Komets '{U+05B8}' stringdef ReducedKomets '{U+05B3}' stringdef Rafe '{U+05BF}' stringdef SinDot '{U+05C2}' stringdef ShinDot '{U+05C1}' stringdef Khoylm '{U+05B9}' stringdef Melupm '{U+05BC}' stringdef Kubuts '{U+05BB}' // Groupings define niked '{Shvo}{Khirik}{Tseyre}{Segl}{ReducedSegl}{Pasekh}{ReducedPasekh}{Komets}{ReducedKomets}{SinDot}{ShinDot}{Khoylm}{Melupm}{Kubuts}{Rafe}' define alefBeys '{Alef}{Beys}{Giml}{Dalet}{Hey}{Vov}{Zayen}{Khes}{Tes}{Yud}{LangerKhof}{Khof}{Lamed}{ShlosMem}{Mem}{LangerNun}{Nun}{Samekh}{Ayen}{LangerFey}{Fey}{LangerTsadek}{Tsadek}{Kuf}{Reysh}{Shin}{Sof}{TsveyVovn}{VovYud}{TsveyYudn}' define vowel '{Alef}{Vov}{Yud}{Ayen}{VovYud}{TsveyYudn}' define consonant alefBeys - vowel define prelude as ( do ( repeat goto ( [substring] among ( '{Vov}{Vov}' ( not '{Melupm}' <- '{TsveyVovn}' ) '{Vov}{Yud}' ( not '{Khirik}' <- '{VovYud}' ) '{Yud}{Yud}' ( not '{Khirik}' <- '{TsveyYudn}' ) '{LangerKhof}' ( <- '{Khof}') '{ShlosMem}' ( <- '{Mem}' ) '{LangerNun}' ( <- '{Nun}' ) 
'{LangerFey}' ( <- '{Fey}' ) '{LangerTsadek}' ( <- '{Tsadek}' ) ) ) ) do (repeat goto ( [niked] delete )) ) define mark_regions as ( $p1 = limit ( try ( // Replace past participle ge- at start of word // Unless word starts with gelt- or gebn- ['{Giml}{Ayen}'] not ('{Lamed}{Tes}' or '{Beys}{Nun}') <- 'GE' ) try ( // skip verbal prefix among( // Free stressed: Adurkh-, Durkh-, Ahin-, Aher-, Avek-, Mit-, Antkegn-, Akegn-, Anider-, Arop-, Aroys-, Aroyf-, Arum-, Arayn-, Arunter-, Ariber-, Nokh-, Farbay-, Aheym-, Afir-, Faroys-, Funander-, Tsuzamen-, Tsunoyf-, Tsurik- '{Alef}{Dalet}{Vov}{Reysh}{Khof}' '{Dalet}{Vov}{Reysh}{Khof}' '{Alef}{Hey}{Yud}{Nun}' '{Alef}{Hey}{Ayen}{Reysh}' '{Alef}{TsveyVovn}{Ayen}{Kuf}' '{Mem}{Yud}{Tes}' '{Alef}{Nun}{Tes}{Kuf}{Ayen}{Giml}{Nun}' '{Alef}{Kuf}{Ayen}{Giml}{Nun}' '{Alef}{Nun}{Yud}{Dalet}{Ayen}{Reysh}' '{Alef}{Reysh}{Alef}{Fey}' '{Alef}{Reysh}{VovYud}{Samekh}' '{Alef}{Reysh}{VovYud}{Fey}' '{Alef}{Reysh}{Vov}{Mem}' '{Alef}{Reysh}{TsveyYudn}{Nun}' '{Alef}{Reysh}{Vov}{Nun}{Tes}{Ayen}{Reysh}' '{Alef}{Reysh}{Yud}{Beys}{Ayen}{Reysh}' '{Nun}{Alef}{Khof}' '{Fey}{Alef}{Reysh}{Beys}{TsveyYudn}' '{Alef}{Hey}{TsveyYudn}{Mem}' '{Alef}{Fey}{Yud}{Reysh}' '{Fey}{Alef}{Reysh}{VovYud}{Samekh}' '{Fey}{Vov}{Nun}{Alef}{Nun}{Dalet}{Ayen}{Reysh}' '{Tsadek}{Vov}{Zayen}{Alef}{Mem}{Ayen}{Nun}' '{Tsadek}{Vov}{Nun}{VovYud}{Fey}' '{Tsadek}{Vov}{Reysh}{Yud}{Kuf}' // Stressed: Oys-, Oyf-, Um-, Unter-, Iber-, Ayn-, On-, Op-, Bay-, For-, Tsu-. '{Alef}{VovYud}{Samekh}' '{Alef}{VovYud}{Fey}' '{Alef}{Vov}{Mem}' '{Alef}{Vov}{Nun}{Tes}{Ayen}{Reysh}' '{Alef}{Yud}{Beys}{Ayen}{Reysh}' '{Alef}{TsveyYudn}{Nun}' '{Alef}{Nun}' '{Alef}{Fey}' '{Beys}{TsveyYudn}' '{Fey}{Alef}{Reysh}' '{Tsadek}{Vov}' // Unstressed: Ant-, Ba-, Der-, Tse-. Far- already covered by For-. Ge- comes later. 
'{Alef}{Nun}{Tes}' '{Beys}{Alef}' '{Dalet}{Ayen}{Reysh}' '{Tsadek}{Ayen}' // If verbal prefix followed by Tsu- or Ge-, replace it ( // Don't mark the TSU- prefix inside verbs like "oys-tsugn" test (('{Tsadek}{Vov}{Giml}{Nun}' or '{Tsadek}{Vov}{Kuf}{Tes}' or '{Tsadek}{Vov}{Kuf}{Nun}') atlimit) or // Don't mark the GE- prefix inside verbs like "avek-gebn" test ('{Giml}{Ayen}{Beys}{Nun}') or ( ['{Giml}{Ayen}'] <- 'GE') or (['{Tsadek}{Vov}'] <- 'TSU') ) ) ) test(hop 3 setmark x) // We want to allow three-consonant Hebrew roots. // To this end, we skip three-consonant combinations that exist in non-Hebraic Yiddish. try ( among( '{Shin}{Fey}{Reysh}' '{Shin}{Tes}{Reysh}' '{Shin}{Tes}{Shin}' '{Dalet}{Zayen}{Shin}' ( true ) ) ) // Either 3 consonants or the first non-vowel after a vowel ( not (consonant consonant consonant setmark p1) goto vowel repeat vowel setmark p1 ) try($p1 < x $p1 = x) // at least 3 past the prefix ) ) backwardmode ( define R1 as $p1 <= cursor // Like R1, but also allows the cursor to be outside R1 by the width of Giml Yud Samekh define R1plus3 as $p1 <= cursor + sizeof '{Giml}{Yud}{Samekh}' define standard_suffix as ( do ( [substring] among( // Plural/adjective endings: -er, -ers, -e, -n, -s, -en, -ns, -eners, -ens, -es '{Ayen}{Reysh}{Samekh}' '{Ayen}{Nun}' '{Nun}{Samekh}' '{Ayen}{Nun}{Ayen}{Reysh}{Samekh}' '{Ayen}{Samekh}' '{Ayen}' '{Nun}' '{Samekh}' '{Ayen}{Mem}' '{Ayen}{Reysh}' ( R1 delete ) // Exception: don't delete noun endings -ie, like "agitatsie" '{Yud}{Ayen}' ( true ) // -ies => ie '{Yud}{Ayen}{Samekh}' ( R1 <- '{Yud}{Ayen}' ) // Plural/adjective endings: -enem, -ener, -ene, -ens '{Ayen}{Nun}{Ayen}' '{Ayen}{Nun}{Ayen}{Mem}' '{Ayen}{Nun}{Ayen}{Reysh}' '{Ayen}{Nun}{Samekh}' (R1 delete [substring] among ( // -gegangen => -gey '{Giml}{Alef}{Nun}{Giml}' (<- '{Giml}{TsveyYudn}') // -genumen => -nem '{Nun}{Vov}{Mem}' (<- '{Nun}{Ayen}{Mem}') // -gemiten => -mayd '{Mem}{Yud}{Tes}' (<- '{Mem}{TsveyYudn}{Dalet}') // -gebiten => -bayt 
'{Beys}{Yud}{Tes}' (<- '{Beys}{TsveyYudn}{Tes}') // -gebisen => -bays '{Beys}{Yud}{Samekh}' (<- '{Beys}{TsveyYudn}{Samekh}') // -gevizen => -vayz '{TsveyVovn}{Yud}{Zayen}' (<- '{TsveyVovn}{TsveyYudn}{Zayen}') // -getriben => -trayb '{Tes}{Reysh}{Yud}{Beys}' (<- '{Tes}{Reysh}{TsveyYudn}{Beys}') // -geliten => -layt '{Lamed}{Yud}{Tes}' (<- '{Lamed}{TsveyYudn}{Tes}') // -gekliben => -klayb '{Kuf}{Lamed}{Yud}{Beys}' (<- '{Kuf}{Lamed}{TsveyYudn}{Beys}') // -geriben => -rayb '{Reysh}{Yud}{Beys}' (<- '{Reysh}{TsveyYudn}{Beys}') // -gerisen => -rays '{Reysh}{Yud}{Samekh}' (<- '{Reysh}{TsveyYudn}{Samekh}') // -geshvigen => -shvayg '{Shin}{TsveyVovn}{Yud}{Giml}' (<- '{Shin}{TsveyVovn}{TsveyYudn}{Giml}') // -geshmisen => -shmays '{Shin}{Mem}{Yud}{Samekh}' (<- '{Shin}{Mem}{TsveyYudn}{Samekh}') // -geshniten => -shnayd '{Shin}{Nun}{Yud}{Tes}' (<- '{Shin}{Nun}{TsveyYudn}{Dalet}') // -geshriben => -shrayb '{Shin}{Reysh}{Yud}{Beys}' (<- '{Shin}{Reysh}{TsveyYudn}{Beys}') // -gebunden => -bind '{Beys}{Vov}{Nun}{Dalet}' (<- '{Beys}{Yud}{Nun}{Dalet}') // -gevuntshn => -vintsh '{TsveyVovn}{Vov}{Tes}{Shin}' (<- '{TsveyVovn}{Yud}{Tes}{Shin}') // -gezungen => -zing '{Zayen}{Vov}{Nun}{Giml}' (<- '{Zayen}{Yud}{Nun}{Giml}') // -getrunken => -trink '{Tes}{Reysh}{Vov}{Nun}{Kuf}' (<- '{Tes}{Reysh}{Yud}{Nun}{Kuf}') // -getsvungen => -tsving '{Tsadek}{TsveyVovn}{Vov}{Nun}{Giml}' (<- '{Tsadek}{TsveyVovn}{Yud}{Nun}{Giml}') // -geshlungen => -shling '{Shin}{Lamed}{Vov}{Nun}{Giml}' (<- '{Shin}{Lamed}{Yud}{Nun}{Giml}') // -geboygen => -beyg '{Beys}{VovYud}{Giml}' (<- '{Beys}{TsveyYudn}{Giml}') // -gehoyben => -heyb '{Hey}{VovYud}{Beys}' (<- '{Hey}{TsveyYudn}{Beys}') // -farloyren => -farlir '{Fey}{Alef}{Reysh}{Lamed}{VovYud}{Reysh}' (<- '{Fey}{Alef}{Reysh}{Lamed}{Yud}{Reysh}') // -shtanen => -shtey '{Shin}{Tes}{Alef}{Nun}' (<- '{Shin}{Tes}{TsveyYudn}') // -geshvoyrn => -shver '{Shin}{TsveyVovn}{VovYud}{Reysh}' (<- '{Shin}{TsveyVovn}{Ayen}{Reysh}') ) ) // Verb/past participle ending: -t '{Tes}' ( R1 
delete ) // As well as noun/adjectives ending in -tn, -te, -ter, -ts so that the "-t" doesn't differentiate // Similarly for past participles: -tns, -tene, -tenem, -tener // If the Tes was before R1, we try to perform the same action while leaving the Tes in place '{Tes}{Nun}' '{Tes}{Ayen}' '{Tes}{Ayen}{Reysh}' '{Tes}{Samekh}' '{Tes}{Nun}{Samekh}' '{Tes}{Ayen}{Nun}{Ayen}' '{Tes}{Ayen}{Nun}{Ayen}{Mem}' '{Tes}{Ayen}{Nun}{Ayen}{Reysh}' ( ((R1 delete) or ( <- '{Tes}')) // -(ge)brakht => -breng ['{Beys}{Reysh}{Alef}{Khof}' try '{Giml}{Ayen}'] <- '{Beys}{Reysh}{Ayen}{Nun}{Giml}' ) // Past participles: -et, -etn, -ets, -ete, -eter '{Ayen}{Tes}' '{Ayen}{Tes}{Nun}' '{Ayen}{Tes}{Samekh}' '{Ayen}{Tes}{Ayen}' '{Ayen}{Tes}{Ayen}{Reysh}' ( R1 delete ) // -geyn shorted to -gey '{Giml}{TsveyYudn}{Nun}' ( <- '{Giml}{TsveyYudn}') // ##################### Long list of irregular past participles // -(ge)gangen (shortened to -gangen after prefixes) => -gey '{Giml}{Alef}{Nun}{Giml}{Ayen}{Nun}' ( <- '{Giml}{TsveyYudn}' ) // -(ge)numen (shortened to -numen after prefixes) => -nem '{Nun}{Vov}{Mem}{Ayen}{Nun}' (<- '{Nun}{Ayen}{Mem}' ) // -(ge)shribn (shortened to -shribn after prefixes) => -shrayb '{Shin}{Reysh}{Yud}{Beys}{Nun}' (<- '{Shin}{Reysh}{TsveyYudn}{Beys}' ) // -gemiten => -mayd 'GE{Mem}{Yud}{Tes}{Nun}' (<- '{Mem}{TsveyYudn}{Dalet}') // -gebiten => -bayt 'GE{Beys}{Yud}{Tes}{Nun}' (<- '{Beys}{TsveyYudn}{Tes}') // -gebisen => -bays 'GE{Beys}{Yud}{Samekh}{Nun}' ( <- '{Beys}{TsveyYudn}{Samekh}') // -gevizen => -vayz '{TsveyVovn}{Yud}{Zayen}{Nun}' ( <- '{TsveyVovn}{TsveyYudn}{Zayen}') // -getriben => -trayb '{Tes}{Reysh}{Yud}{Beys}{Nun}' ( <- '{Tes}{Reysh}{TsveyYudn}{Beys}') // -geliten => -layt 'GE{Lamed}{Yud}{Tes}{Nun}' ( <- '{Lamed}{TsveyYudn}{Tes}') // -gekliben => -klayb '{Kuf}{Lamed}{Yud}{Beys}{Nun}' ( <- '{Kuf}{Lamed}{TsveyYudn}{Beys}') // -geriben => -rayb '{Reysh}{Yud}{Beys}{Nun}' ( <- '{Reysh}{TsveyYudn}{Beys}') // -gerisen => -rays 'GE{Reysh}{Yud}{Samekh}{Nun}' ( <- 
'{Reysh}{TsveyYudn}{Samekh}') // -geshvigen => -shvayg '{Shin}{TsveyVovn}{Yud}{Giml}{Nun}' ( <- '{Shin}{TsveyVovn}{TsveyYudn}{Giml}') // -geshmisen => -shmays '{Shin}{Mem}{Yud}{Samekh}{Nun}' ( <- '{Shin}{Mem}{TsveyYudn}{Samekh}') // -geshniten => -shnayd '{Shin}{Nun}{Yud}{Tes}{Nun}' ( <- '{Shin}{Nun}{TsveyYudn}{Dalet}') // -gebunden => -bind '{Beys}{Vov}{Nun}{Dalet}{Nun}' ( <- '{Beys}{Yud}{Nun}{Dalet}') // -gevuntshn => -vintsh '{TsveyVovn}{Vov}{Tes}{Shin}{Nun}' ( <- '{TsveyVovn}{Yud}{Tes}{Shin}') // -gezungen => -zing '{Zayen}{Vov}{Nun}{Giml}{Nun}' ( <- '{Zayen}{Yud}{Nun}{Giml}') // -getrunken => -trink '{Tes}{Reysh}{Vov}{Nun}{Kuf}{Nun}' ( <- '{Tes}{Reysh}{Yud}{Nun}{Kuf}') // -getsvungen => -tsving '{Tsadek}{TsveyVovn}{Vov}{Nun}{Giml}{Nun}' ( <- '{Tsadek}{TsveyVovn}{Yud}{Nun}{Giml}') // -geshlungen => -shling '{Shin}{Lamed}{Vov}{Nun}{Giml}{Nun}' ( <- '{Shin}{Lamed}{Yud}{Nun}{Giml}') // -geboygen => -beyg '{Beys}{VovYud}{Giml}{Nun}' ( <- '{Beys}{TsveyYudn}{Giml}') // -gehoyben => -heyb '{Hey}{VovYud}{Beys}{Nun}' ( <- '{Hey}{TsveyYudn}{Beys}') // -farloyren => -farlir '{Fey}{Alef}{Reysh}{Lamed}{VovYud}{Reysh}{Nun}' ( <- '{Fey}{Alef}{Reysh}{Lamed}{Yud}{Reysh}') // -shtanen => -shtey '{Shin}{Tes}{Alef}{Nun}{Ayen}{Nun}' ( <- '{Shin}{Tes}{TsveyYudn}') // -geshvoyrn => -shver '{Shin}{TsveyVovn}{VovYud}{Reysh}{Nun}' ( <- '{Shin}{TsveyVovn}{Ayen}{Reysh}') // -(ge)brakht (shortened to -brakht after prefixes) => -breng '{Beys}{Reysh}{Alef}{Khof}{Tes}' (<- '{Beys}{Reysh}{Ayen}{Nun}{Giml}' ) // ###### End of irregular past participles // Noun endings: -ung, -hayt, -kayt, -ikayt, -shaft '{Vov}{Nun}{Giml}' '{Hey}{TsveyYudn}{Tes}' '{Kuf}{TsveyYudn}{Tes}' '{Yud}{Kuf}{TsveyYudn}{Tes}' '{Shin}{Alef}{Fey}{Tes}' ( R1 delete ) // Noun endings: -izm, izmen '{Yud}{Zayen}{Mem}' '{Yud}{Zayen}{Mem}{Ayen}{Nun}' ( R1 delete ) // Plural ending: -im '{Yud}{Mem}' ( R1 delete ) // Plural ending: -os (Hebraic), replace with -h '{Vov}{Sof}' ( R1 <- '{Hey}' ) // Diminutive endings: -elekh, -ele, 
-lekh, -eles, -elen '{Ayen}{Lamed}{Ayen}{Khof}' '{Ayen}{Lamed}{Ayen}' '{Lamed}{Ayen}{Khof}' '{Ayen}{Lamed}{Ayen}{Samekh}' '{Ayen}{Lamed}{Ayen}{Nun}' ( R1 delete ) // Noun ending: -ist '{Yud}{Samekh}{Tes}' ( // Exceptions: -gist, -shist ( ('{Giml}' or '{Shin}') try (R1plus3 <- '{Yud}{Samekh}') ) or ( R1 delete ) ) // Noun ending: -istn '{Yud}{Samekh}{Tes}{Nun}' ( R1 delete ) // Verb ending: -stu '{Samekh}{Tes}{Vov}' ( R1 delete ) // Superlative ending: -ster, -ste, -stn '{Samekh}{Tes}{Ayen}{Reysh}' '{Samekh}{Tes}{Ayen}' '{Samekh}{Tes}{Nun}' ( R1 delete ) // Ambiguous verb ending: -st '{Samekh}{Tes}' ( R1 delete ) ) ) do ( [substring] among( // Noun endings: -ung, -hayt, -kayt, -ikayt, -shaft '{Vov}{Nun}{Giml}' '{Hey}{TsveyYudn}{Tes}' '{Kuf}{TsveyYudn}{Tes}' '{Yud}{Kuf}{TsveyYudn}{Tes}' '{Shin}{Alef}{Fey}{Tes}' ( R1 delete ) // Diminutive endings: -l '{Lamed}' ( R1 consonant delete ) ) ) do ( [substring] among( // Adjective endings: -ig, -ik, -ish, -nik, -dik '{Yud}{Giml}' '{Yud}{Kuf}' '{Yud}{Shin}' '{Nun}{Yud}{Kuf}' '{Dalet}{Yud}{Kuf}' ( R1 delete ) // Exceptions to above: -blik, -glik '{Beys}{Lamed}{Yud}{Kuf}' '{Giml}{Lamed}{Yud}{Kuf}' ( true ) // Present participle endings: -ndik '{Nun}{Dalet}{Yud}{Kuf}' ( R1 delete ) // Present participle ending -endik: delete if after a -ng, -nk, -n, -m, consonant+l, or vowel. // Otherwise, delete just the -ndik part. '{Ayen}{Nun}{Dalet}{Yud}{Kuf}' ( R1 delete ) ) ) do (repeat goto ( ['GE' or 'TSU'] delete )) ) ) define stem as ( do prelude do mark_regions backwards do standard_suffix ) snowball-2.2.0/charsets/000077500000000000000000000000001414263061200151475ustar00rootroot00000000000000snowball-2.2.0/charsets/ISO-8859-2.sbl000066400000000000000000000051031414263061200170540ustar00rootroot00000000000000// ISO-8859-2 character mappings. 
stringdef U+00A0 hex 'A0' stringdef U+0104 hex 'A1' stringdef U+02D8 hex 'A2' stringdef U+0141 hex 'A3' stringdef U+00A4 hex 'A4' stringdef U+013D hex 'A5' stringdef U+015A hex 'A6' stringdef U+00A7 hex 'A7' stringdef U+00A8 hex 'A8' stringdef U+0160 hex 'A9' stringdef U+015E hex 'AA' stringdef U+0164 hex 'AB' stringdef U+0179 hex 'AC' stringdef U+00AD hex 'AD' stringdef U+017D hex 'AE' stringdef U+017B hex 'AF' stringdef U+00B0 hex 'B0' stringdef U+0105 hex 'B1' stringdef U+02DB hex 'B2' stringdef U+0142 hex 'B3' stringdef U+00B4 hex 'B4' stringdef U+013E hex 'B5' stringdef U+015B hex 'B6' stringdef U+02C7 hex 'B7' stringdef U+00B8 hex 'B8' stringdef U+0161 hex 'B9' stringdef U+015F hex 'BA' stringdef U+0165 hex 'BB' stringdef U+017A hex 'BC' stringdef U+02DD hex 'BD' stringdef U+017E hex 'BE' stringdef U+017C hex 'BF' stringdef U+0154 hex 'C0' stringdef U+00C1 hex 'C1' stringdef U+00C2 hex 'C2' stringdef U+0102 hex 'C3' stringdef U+00C4 hex 'C4' stringdef U+0139 hex 'C5' stringdef U+0106 hex 'C6' stringdef U+00C7 hex 'C7' stringdef U+010C hex 'C8' stringdef U+00C9 hex 'C9' stringdef U+0118 hex 'CA' stringdef U+00CB hex 'CB' stringdef U+011A hex 'CC' stringdef U+00CD hex 'CD' stringdef U+00CE hex 'CE' stringdef U+010E hex 'CF' stringdef U+0110 hex 'D0' stringdef U+0143 hex 'D1' stringdef U+0147 hex 'D2' stringdef U+00D3 hex 'D3' stringdef U+00D4 hex 'D4' stringdef U+0150 hex 'D5' stringdef U+00D6 hex 'D6' stringdef U+00D7 hex 'D7' stringdef U+0158 hex 'D8' stringdef U+016E hex 'D9' stringdef U+00DA hex 'DA' stringdef U+0170 hex 'DB' stringdef U+00DC hex 'DC' stringdef U+00DD hex 'DD' stringdef U+0162 hex 'DE' stringdef U+00DF hex 'DF' stringdef U+0155 hex 'E0' stringdef U+00E1 hex 'E1' stringdef U+00E2 hex 'E2' stringdef U+0103 hex 'E3' stringdef U+00E4 hex 'E4' stringdef U+013A hex 'E5' stringdef U+0107 hex 'E6' stringdef U+00E7 hex 'E7' stringdef U+010D hex 'E8' stringdef U+00E9 hex 'E9' stringdef U+0119 hex 'EA' stringdef U+00EB hex 'EB' stringdef U+011B hex 
'EC' stringdef U+00ED hex 'ED' stringdef U+00EE hex 'EE' stringdef U+010F hex 'EF' stringdef U+0111 hex 'F0' stringdef U+0144 hex 'F1' stringdef U+0148 hex 'F2' stringdef U+00F3 hex 'F3' stringdef U+00F4 hex 'F4' stringdef U+0151 hex 'F5' stringdef U+00F6 hex 'F6' stringdef U+00F7 hex 'F7' stringdef U+0159 hex 'F8' stringdef U+016F hex 'F9' stringdef U+00FA hex 'FA' stringdef U+0171 hex 'FB' stringdef U+00FC hex 'FC' stringdef U+00FD hex 'FD' stringdef U+0163 hex 'FE' stringdef U+02D9 hex 'FF' snowball-2.2.0/charsets/KOI8-R.sbl000066400000000000000000000036671414263061200165760ustar00rootroot00000000000000// KOI8-R character mappings. stringdef U+00A0 hex '9A' stringdef U+00A9 hex 'BF' stringdef U+00B0 hex '9C' stringdef U+00B2 hex '9D' stringdef U+00B7 hex '9E' stringdef U+00F7 hex '9F' stringdef U+0401 hex 'B3' stringdef U+0410 hex 'E1' stringdef U+0411 hex 'E2' stringdef U+0412 hex 'F7' stringdef U+0413 hex 'E7' stringdef U+0414 hex 'E4' stringdef U+0415 hex 'E5' stringdef U+0416 hex 'F6' stringdef U+0417 hex 'FA' stringdef U+0418 hex 'E9' stringdef U+0419 hex 'EA' stringdef U+041A hex 'EB' stringdef U+041B hex 'EC' stringdef U+041C hex 'ED' stringdef U+041D hex 'EE' stringdef U+041E hex 'EF' stringdef U+041F hex 'F0' stringdef U+0420 hex 'F2' stringdef U+0421 hex 'F3' stringdef U+0422 hex 'F4' stringdef U+0423 hex 'F5' stringdef U+0424 hex 'E6' stringdef U+0425 hex 'E8' stringdef U+0426 hex 'E3' stringdef U+0427 hex 'FE' stringdef U+0428 hex 'FB' stringdef U+0429 hex 'FD' stringdef U+042A hex 'FF' stringdef U+042B hex 'F9' stringdef U+042C hex 'F8' stringdef U+042D hex 'FC' stringdef U+042E hex 'E0' stringdef U+042F hex 'F1' stringdef U+0430 hex 'C1' stringdef U+0431 hex 'C2' stringdef U+0432 hex 'D7' stringdef U+0433 hex 'C7' stringdef U+0434 hex 'C4' stringdef U+0435 hex 'C5' stringdef U+0436 hex 'D6' stringdef U+0437 hex 'DA' stringdef U+0438 hex 'C9' stringdef U+0439 hex 'CA' stringdef U+043A hex 'CB' stringdef U+043B hex 'CC' stringdef U+043C hex 'CD' 
stringdef U+043D hex 'CE' stringdef U+043E hex 'CF' stringdef U+043F hex 'D0' stringdef U+0440 hex 'D2' stringdef U+0441 hex 'D3' stringdef U+0442 hex 'D4' stringdef U+0443 hex 'D5' stringdef U+0444 hex 'C6' stringdef U+0445 hex 'C8' stringdef U+0446 hex 'C3' stringdef U+0447 hex 'DE' stringdef U+0448 hex 'DB' stringdef U+0449 hex 'DD' stringdef U+044A hex 'DF' stringdef U+044B hex 'D9' stringdef U+044C hex 'D8' stringdef U+044D hex 'DC' stringdef U+044E hex 'C0' stringdef U+044F hex 'D1' stringdef U+0451 hex 'A3' snowball-2.2.0/charsets/cp850.sbl000066400000000000000000000066661414263061200165260ustar00rootroot00000000000000// Code page 850 (MSDOS Latin 1) character mappings. stringdef U+00A0 hex 'FF' stringdef U+00A1 hex 'AD' stringdef U+00A2 hex 'BD' stringdef U+00A3 hex '9C' stringdef U+00A4 hex 'CF' stringdef U+00A5 hex 'BE' stringdef U+00A6 hex 'DD' stringdef U+00A7 hex 'F5' stringdef U+00A8 hex 'F9' stringdef U+00A9 hex 'B8' stringdef U+00AA hex 'A6' stringdef U+00AB hex 'AE' stringdef U+00AC hex 'AA' stringdef U+00AD hex 'F0' stringdef U+00AE hex 'A9' stringdef U+00AF hex 'EE' stringdef U+00B0 hex 'F8' stringdef U+00B1 hex 'F1' stringdef U+00B2 hex 'FD' stringdef U+00B3 hex 'FC' stringdef U+00B4 hex 'EF' stringdef U+00B5 hex 'E6' stringdef U+00B6 hex 'F4' stringdef U+00B7 hex 'FA' stringdef U+00B8 hex 'F7' stringdef U+00B9 hex 'FB' stringdef U+00BA hex 'A7' stringdef U+00BB hex 'AF' stringdef U+00BC hex 'AC' stringdef U+00BD hex 'AB' stringdef U+00BE hex 'F3' stringdef U+00BF hex 'A8' stringdef U+00C0 hex 'B7' stringdef U+00C1 hex 'B5' stringdef U+00C2 hex 'B6' stringdef U+00C3 hex 'C7' stringdef U+00C4 hex '8E' stringdef U+00C5 hex '8F' stringdef U+00C6 hex '92' stringdef U+00C7 hex '80' stringdef U+00C8 hex 'D4' stringdef U+00C9 hex '90' stringdef U+00CA hex 'D2' stringdef U+00CB hex 'D3' stringdef U+00CC hex 'DE' stringdef U+00CD hex 'D6' stringdef U+00CE hex 'D7' stringdef U+00CF hex 'D8' stringdef U+00D0 hex 'D1' stringdef U+00D1 hex 'A5' stringdef 
U+00D2 hex 'E3' stringdef U+00D3 hex 'E0' stringdef U+00D4 hex 'E2' stringdef U+00D5 hex 'E5' stringdef U+00D6 hex '99' stringdef U+00D7 hex '9E' stringdef U+00D8 hex '9D' stringdef U+00D9 hex 'EB' stringdef U+00DA hex 'E9' stringdef U+00DB hex 'EA' stringdef U+00DC hex '9A' stringdef U+00DD hex 'ED' stringdef U+00DE hex 'E8' stringdef U+00DF hex 'E1' stringdef U+00E0 hex '85' stringdef U+00E1 hex 'A0' stringdef U+00E2 hex '83' stringdef U+00E3 hex 'C6' stringdef U+00E4 hex '84' stringdef U+00E5 hex '86' stringdef U+00E6 hex '91' stringdef U+00E7 hex '87' stringdef U+00E8 hex '8A' stringdef U+00E9 hex '82' stringdef U+00EA hex '88' stringdef U+00EB hex '89' stringdef U+00EC hex '8D' stringdef U+00ED hex 'A1' stringdef U+00EE hex '8C' stringdef U+00EF hex '8B' stringdef U+00F0 hex 'D0' stringdef U+00F1 hex 'A4' stringdef U+00F2 hex '95' stringdef U+00F3 hex 'A2' stringdef U+00F4 hex '93' stringdef U+00F5 hex 'E4' stringdef U+00F6 hex '94' stringdef U+00F7 hex 'F6' stringdef U+00F8 hex '9B' stringdef U+00F9 hex '97' stringdef U+00FA hex 'A3' stringdef U+00FB hex '96' stringdef U+00FC hex '81' stringdef U+00FD hex 'EC' stringdef U+00FE hex 'E7' stringdef U+00FF hex '98' stringdef U+0131 hex 'D5' stringdef U+0192 hex '9F' stringdef U+2017 hex 'F2' stringdef U+2500 hex 'C4' stringdef U+2502 hex 'B3' stringdef U+250C hex 'DA' stringdef U+2510 hex 'BF' stringdef U+2514 hex 'C0' stringdef U+2518 hex 'D9' stringdef U+251C hex 'C3' stringdef U+2524 hex 'B4' stringdef U+252C hex 'C2' stringdef U+2534 hex 'C1' stringdef U+253C hex 'C5' stringdef U+2550 hex 'CD' stringdef U+2551 hex 'BA' stringdef U+2554 hex 'C9' stringdef U+2557 hex 'BB' stringdef U+255A hex 'C8' stringdef U+255D hex 'BC' stringdef U+2560 hex 'CC' stringdef U+2563 hex 'B9' stringdef U+2566 hex 'CB' stringdef U+2569 hex 'CA' stringdef U+256C hex 'CE' stringdef U+2580 hex 'DF' stringdef U+2584 hex 'DC' stringdef U+2588 hex 'DB' stringdef U+2591 hex 'B0' stringdef U+2592 hex 'B1' stringdef U+2593 hex 'B2' 
stringdef U+25A0 hex 'FE' snowball-2.2.0/compiler/000077500000000000000000000000001414263061200151455ustar00rootroot00000000000000snowball-2.2.0/compiler/analyser.c000066400000000000000000001452261414263061200171410ustar00rootroot00000000000000 #include /* printf etc */ #include /* exit */ #include /* memmove */ #include "header.h" typedef enum { e_token_omitted = 0, e_unexpected_token = 1, e_string_omitted = 2, e_unexpected_token_in_among = 3, /* For codes above here, report "after " t->previous_token after the error. */ e_unresolved_substring = 14, e_not_allowed_inside_reverse = 15, e_empty_grouping = 16, e_already_backwards = 17, e_empty_among = 18, e_adjacent_bracketed_in_among = 19, e_substring_preceded_by_substring = 20, /* For codes below here, tokeniser->b is printed before the error. */ e_redeclared = 30, e_undeclared = 31, e_declared_as_different_mode = 32, e_not_of_type_x = 33, e_not_of_type_string_or_integer = 34, e_misplaced = 35, e_redefined = 36, e_misused = 37 } error_code; /* recursive usage: */ static void read_program_(struct analyser * a, int terminator); static struct node * read_C(struct analyser * a); static struct node * C_style(struct analyser * a, const char * s, int token); static void print_node_(struct node * p, int n, const char * s) { int i; for (i = 0; i < n; i++) fputs(i == n - 1 ? 
s : " ", stdout); printf("%s ", name_of_token(p->type)); if (p->name) report_b(stdout, p->name->b); if (p->literalstring) { printf("'"); report_b(stdout, p->literalstring); printf("'"); } else if (p->type == c_number) { printf("%d", p->number); } printf("\n"); if (p->AE) print_node_(p->AE, n+1, "# "); if (p->left) print_node_(p->left, n+1, " "); if (p->aux) print_node_(p->aux, n+1, "@ "); if (p->right) print_node_(p->right, n, " "); } extern void print_program(struct analyser * a) { print_node_(a->program, 0, " "); } static struct node * new_node(struct analyser * a, int type) { NEW(node, p); p->next = a->nodes; a->nodes = p; p->left = 0; p->right = 0; p->aux = 0; p->AE = 0; p->name = 0; p->literalstring = 0; p->mode = a->mode; p->line_number = a->tokeniser->line_number; p->type = type; return p; } static const char * name_of_mode(int n) { switch (n) { case m_backward: return "string backward"; case m_forward: return "string forward"; /* case m_integer: return "integer"; */ } fprintf(stderr, "Invalid mode %d in name_of_mode()\n", n); exit(1); } static const char * name_of_type(int n) { switch (n) { case 's': return "string"; case 'i': return "integer"; case 'r': return "routine"; case 'R': return "routine or grouping"; case 'g': return "grouping"; } fprintf(stderr, "Invalid type %d in name_of_type()\n", n); exit(1); } static const char * name_of_name_type(int code) { switch (code) { case t_string: return "string"; case t_boolean: return "boolean"; case t_integer: return "integer"; case t_routine: return "routine"; case t_external: return "external"; case t_grouping: return "grouping"; } fprintf(stderr, "Invalid type code %d in name_of_name_type()\n", code); exit(1); } static void count_error(struct analyser * a) { struct tokeniser * t = a->tokeniser; if (t->error_count >= 20) { fprintf(stderr, "... 
etc\n"); exit(1); } t->error_count++; } static void error2(struct analyser * a, error_code n, int x) { struct tokeniser * t = a->tokeniser; count_error(a); fprintf(stderr, "%s:%d: ", t->file, t->line_number); if ((int)n >= (int)e_redeclared) report_b(stderr, t->b); switch (n) { case e_token_omitted: fprintf(stderr, "%s omitted", name_of_token(t->omission)); break; case e_unexpected_token_in_among: fprintf(stderr, "in among(...), "); /* fall through */ case e_unexpected_token: fprintf(stderr, "unexpected %s", name_of_token(t->token)); if (t->token == c_number) fprintf(stderr, " %d", t->number); if (t->token == c_name) { fprintf(stderr, " "); report_b(stderr, t->b); } break; case e_string_omitted: fprintf(stderr, "string omitted"); break; case e_unresolved_substring: fprintf(stderr, "unresolved substring on line %d", x); break; case e_not_allowed_inside_reverse: fprintf(stderr, "%s not allowed inside reverse(...)", name_of_token(t->token)); break; case e_empty_grouping: fprintf(stderr, "empty grouping"); break; case e_already_backwards: fprintf(stderr, "backwards used when already in this mode"); break; case e_empty_among: fprintf(stderr, "empty among(...)"); break; case e_adjacent_bracketed_in_among: fprintf(stderr, "two adjacent bracketed expressions in among(...)"); break; case e_substring_preceded_by_substring: fprintf(stderr, "substring preceded by another substring on line %d", x); break; case e_redeclared: fprintf(stderr, " re-declared"); break; case e_undeclared: fprintf(stderr, " undeclared"); break; case e_declared_as_different_mode: fprintf(stderr, " declared as %s mode; used as %s mode", name_of_mode(a->mode), name_of_mode(x)); break; case e_not_of_type_x: fprintf(stderr, " not of type %s", name_of_type(x)); break; case e_not_of_type_string_or_integer: fprintf(stderr, " not of type string or integer"); break; case e_misplaced: fprintf(stderr, " misplaced"); break; case e_redefined: fprintf(stderr, " redefined"); break; case e_misused: fprintf(stderr, " 
mis-used as %s mode", name_of_mode(x)); break; } if ((int)n < (int)e_unresolved_substring && t->previous_token > 0) fprintf(stderr, " after %s", name_of_token(t->previous_token)); fprintf(stderr, "\n"); } static void error(struct analyser * a, error_code n) { error2(a, n, 0); } static void error4(struct analyser * a, struct name * q) { count_error(a); fprintf(stderr, "%s:%d: ", a->tokeniser->file, q->used->line_number); report_b(stderr, q->b); fprintf(stderr, " undefined\n"); } static void omission_error(struct analyser * a, int n) { a->tokeniser->omission = n; error(a, e_token_omitted); } static int check_token(struct analyser * a, int code) { struct tokeniser * t = a->tokeniser; if (t->token != code) { omission_error(a, code); return false; } return true; } static int get_token(struct analyser * a, int code) { struct tokeniser * t = a->tokeniser; read_token(t); { int x = check_token(a, code); if (!x) t->token_held = true; return x; } } static struct name * look_for_name(struct analyser * a) { symbol * q = a->tokeniser->b; struct name * p; for (p = a->names; p; p = p->next) { symbol * b = p->b; int n = SIZE(b); if (n == SIZE(q) && memcmp(q, b, n * sizeof(symbol)) == 0) { p->referenced = true; return p; } } return 0; } static struct name * find_name(struct analyser * a) { struct name * p = look_for_name(a); if (p == 0) error(a, e_undeclared); return p; } static void check_routine_mode(struct analyser * a, struct name * p, int mode) { if (p->mode < 0) p->mode = mode; else if (p->mode != mode) error2(a, e_misused, mode); } static void check_name_type(struct analyser * a, struct name * p, int type) { switch (type) { case 's': if (p->type == t_string) return; break; case 'i': if (p->type == t_integer) return; break; case 'b': if (p->type == t_boolean) return; break; case 'R': if (p->type == t_grouping) return; /* FALLTHRU */ case 'r': if (p->type == t_routine || p->type == t_external) return; break; case 'g': if (p->type == t_grouping) return; break; } error2(a, 
e_not_of_type_x, type); } static void read_names(struct analyser * a, int type) { struct tokeniser * t = a->tokeniser; if (!get_token(a, c_bra)) return; while (true) { int token = read_token(t); switch (token) { case c_len: { /* Context-sensitive token - once declared as a name, it loses * its special meaning, for compatibility with older versions * of snowball. */ static const symbol c_len_lit[] = { 'l', 'e', 'n' }; t->b = MOVE_TO_B(t->b, c_len_lit); goto handle_as_name; } case c_lenof: { /* Context-sensitive token - once declared as a name, it loses * its special meaning, for compatibility with older versions * of snowball. */ static const symbol c_lenof_lit[] = { 'l', 'e', 'n', 'o', 'f' }; t->b = MOVE_TO_B(t->b, c_lenof_lit); goto handle_as_name; } case c_name: handle_as_name: if (look_for_name(a) != 0) error(a, e_redeclared); else { NEW(name, p); p->b = copy_b(t->b); p->type = type; p->mode = -1; /* routines, externals */ /* We defer assigning counts until after we've eliminated * variables whose values are never used. 
*/ p->count = -1; p->referenced = false; p->used_in_among = false; p->used = 0; p->value_used = false; p->initialised = false; p->used_in_definition = false; p->local_to = 0; p->grouping = 0; p->definition = 0; p->declaration_line_number = t->line_number; p->next = a->names; a->names = p; if (token != c_name) { disable_token(t, token); } } break; default: if (!check_token(a, c_ket)) t->token_held = true; return; } } } static symbol * new_literalstring(struct analyser * a) { NEW(literalstring, p); p->b = copy_b(a->tokeniser->b); p->next = a->literalstrings; a->literalstrings = p; return p->b; } static int read_AE_test(struct analyser * a) { struct tokeniser * t = a->tokeniser; switch (read_token(t)) { case c_assign: return c_mathassign; case c_plusassign: case c_minusassign: case c_multiplyassign: case c_divideassign: case c_eq: case c_ne: case c_gr: case c_ge: case c_ls: case c_le: return t->token; default: error(a, e_unexpected_token); t->token_held = true; return c_eq; } } static int binding(int t) { switch (t) { case c_plus: case c_minus: return 1; case c_multiply: case c_divide: return 2; default: return -2; } } static void mark_used_in(struct analyser * a, struct name * q, struct node * p) { if (!q->used) { q->used = p; q->local_to = a->program_end->name; } else if (q->local_to) { if (q->local_to != a->program_end->name) { /* Used in more than one routine/external. 
*/ q->local_to = NULL; } } } static void name_to_node(struct analyser * a, struct node * p, int type) { struct name * q = find_name(a); if (q) { check_name_type(a, q, type); mark_used_in(a, q, p); } p->name = q; } static struct node * read_AE(struct analyser * a, struct name * assigned_to, int B) { struct tokeniser * t = a->tokeniser; struct node * p; struct node * q; switch (read_token(t)) { case c_minus: /* monadic */ q = read_AE(a, assigned_to, 100); if (q->type == c_neg) { /* Optimise away double negation, which avoids generators * having to worry about generating "--" (decrement operator * in many languages). */ p = q->right; /* Don't free q, it's in the linked list a->nodes. */ break; } if (q->type == c_number) { /* Negated constant. */ q->number = -q->number; p = q; break; } p = new_node(a, c_neg); p->right = q; break; case c_bra: p = read_AE(a, assigned_to, 0); get_token(a, c_ket); break; case c_name: p = new_node(a, c_name); name_to_node(a, p, 'i'); if (p->name) { // $x = x + 1 shouldn't count as a use of x. p->name->value_used = (p->name != assigned_to); } break; case c_maxint: case c_minint: a->int_limits_used = true; /* fall through */ case c_cursor: case c_limit: case c_len: case c_size: p = new_node(a, t->token); break; case c_number: p = new_node(a, c_number); p->number = t->number; break; case c_lenof: case c_sizeof: { int token = t->token; p = C_style(a, "S", token); if (!p->literalstring) break; /* Replace lenof or sizeof on a literal string with a numeric * constant. */ int result; if (token == c_lenof && t->encoding == ENC_UTF8) { // UTF-8. 
int i = 0; symbol * b = p->literalstring; result = 0; while (i < SIZE(b)) { int dummy; i += get_utf8(b + i, &dummy); ++result; } } else { result = SIZE(p->literalstring); } p->type = c_number; p->literalstring = NULL; p->number = result; break; } default: error(a, e_unexpected_token); t->token_held = true; return 0; } while (true) { int token = read_token(t); int b = binding(token); if (binding(token) <= B) { t->token_held = true; return p; } struct node * r = read_AE(a, assigned_to, b); if (p->type == c_number && r->type == c_number) { // Evaluate constant sub-expression. q = new_node(a, c_number); switch (token) { case c_plus: q->number = p->number + r->number; break; case c_minus: q->number = p->number - r->number; break; case c_multiply: q->number = p->number * r->number; break; case c_divide: q->number = p->number / r->number; break; default: fprintf(stderr, "Unexpected AE operator %s\n", name_of_token(token)); exit(1); } } else { q = new_node(a, token); q->left = p; q->right = r; } p = q; } } static struct node * read_C_connection(struct analyser * a, struct node * q, int op) { struct tokeniser * t = a->tokeniser; struct node * p = new_node(a, op); struct node * p_end = q; p->left = q; do { q = read_C(a); p_end->right = q; p_end = q; } while (read_token(t) == op); t->token_held = true; return p; } static struct node * read_C_list(struct analyser * a) { struct tokeniser * t = a->tokeniser; struct node * p = new_node(a, c_bra); struct node * p_end = 0; while (true) { int token = read_token(t); if (token == c_ket) return p; if (token < 0) { omission_error(a, c_ket); return p; } t->token_held = true; { struct node * q = read_C(a); while (true) { token = read_token(t); if (token != c_and && token != c_or) { t->token_held = true; break; } q = read_C_connection(a, q, token); } if (p_end == 0) p->left = q; else p_end->right = q; p_end = q; } } } static struct node * C_style(struct analyser * a, const char * s, int token) { int i; struct node * p = new_node(a, token); 
for (i = 0; s[i] != 0; i++) switch (s[i]) { case 'C': p->left = read_C(a); continue; case 'D': p->aux = read_C(a); continue; case 'A': p->AE = read_AE(a, 0, 0); continue; case 'f': get_token(a, c_for); continue; case 'S': { int str_token = read_token(a->tokeniser); if (str_token == c_name) name_to_node(a, p, 's'); else if (str_token == c_literalstring) p->literalstring = new_literalstring(a); else error(a, e_string_omitted); } continue; case 'b': case 's': case 'i': if (get_token(a, c_name)) name_to_node(a, p, s[i]); continue; } return p; } static struct node * read_literalstring(struct analyser * a) { struct node * p = new_node(a, c_literalstring); p->literalstring = new_literalstring(a); return p; } static void reverse_b(symbol * b) { int i = 0; int j = SIZE(b) - 1; while (i < j) { int ch1 = b[i]; int ch2 = b[j]; b[i++] = ch2; b[j--] = ch1; } } static int compare_amongvec(const void *pv, const void *qv) { const struct amongvec * p = (const struct amongvec*)pv; const struct amongvec * q = (const struct amongvec*)qv; symbol * b_p = p->b; int p_size = p->size; symbol * b_q = q->b; int q_size = q->size; int smaller_size = p_size < q_size ? p_size : q_size; int i; for (i = 0; i < smaller_size; i++) if (b_p[i] != b_q[i]) return b_p[i] - b_q[i]; if (p_size - q_size) return p_size - q_size; return p->line_number - q->line_number; } #define PTR_NULL_CHECK(P, Q) do {\ if ((Q) == NULL) {\ if ((P) != NULL) return 1;\ } else {\ if ((P) == NULL) return -1;\ }\ } while (0) static int compare_node(const struct node *p, const struct node *q) { PTR_NULL_CHECK(p, q); if (q == NULL) { /* p must be NULL too. */ return 0; } if (p->type != q->type) return p->type > q->type ? 1 : -1; if (p->mode != q->mode) return p->mode > q->mode ? 1 : -1; if (p->type == c_number) { if (p->number != q->number) return p->number > q->number ? 
1 : -1; } PTR_NULL_CHECK(p->left, q->left); if (p->left) { int r = compare_node(p->left, q->left); if (r != 0) return r; } PTR_NULL_CHECK(p->AE, q->AE); if (p->AE) { int r = compare_node(p->AE, q->AE); if (r != 0) return r; } PTR_NULL_CHECK(p->aux, q->aux); if (p->aux) { int r = compare_node(p->aux, q->aux); if (r != 0) return r; } PTR_NULL_CHECK(p->name, q->name); if (p->name) { int r; if (SIZE(p->name->b) != SIZE(q->name->b)) { return SIZE(p->name->b) - SIZE(q->name->b); } r = memcmp(p->name->b, q->name->b, SIZE(p->name->b) * sizeof(symbol)); if (r != 0) return r; } PTR_NULL_CHECK(p->literalstring, q->literalstring); if (p->literalstring) { int r; if (SIZE(p->literalstring) != SIZE(q->literalstring)) { return SIZE(p->literalstring) - SIZE(q->literalstring); } r = memcmp(p->literalstring, q->literalstring, SIZE(p->literalstring) * sizeof(symbol)); if (r != 0) return r; } return compare_node(p->right, q->right); } static void make_among(struct analyser * a, struct node * p, struct node * substring) { NEW(among, x); NEWVEC(amongvec, v, p->number); struct node * q = p->left; struct amongvec * w0 = v; struct amongvec * w1 = v; int result = 1; int direction = substring != 0 ? 
substring->mode : p->mode; int backward = direction == m_backward; if (a->amongs == 0) a->amongs = x; else a->amongs_end->next = x; a->amongs_end = x; x->next = 0; x->b = v; x->number = a->among_count++; x->function_count = 0; x->starter = 0; x->nocommand_count = 0; x->amongvar_needed = false; if (q->type == c_bra) { x->starter = q; q = q->right; } while (q) { if (q->type == c_literalstring) { symbol * b = q->literalstring; w1->b = b; /* pointer to case string */ w1->action = NULL; /* action gets filled in below */ w1->line_number = q->line_number; w1->size = SIZE(b); /* number of characters in string */ w1->i = -1; /* index of longest substring */ w1->result = -1; /* number of corresponding case expression */ if (q->left) { struct name * function = q->left->name; w1->function = function; function->used_in_among = true; check_routine_mode(a, function, direction); x->function_count++; } else { w1->function = 0; } w1++; } else if (q->left == 0) { /* empty command: () */ w0 = w1; } else { /* Check for previous action which is the same as this one and use * the same action code if we find one. */ int among_result = -1; struct amongvec * w; for (w = v; w < w0; ++w) { if (w->action && compare_node(w->action->left, q->left) == 0) { if (w->result <= 0) { printf("Among code %d isn't positive\n", w->result); exit(1); } among_result = w->result; break; } } if (among_result < 0) { among_result = result++; } while (w0 != w1) { w0->action = q; w0->result = among_result; w0++; } } q = q->right; } if (w1-v != p->number) { fprintf(stderr, "oh! %d %d\n", (int)(w1-v), p->number); exit(1); } x->command_count = result - 1; { NEWVEC(node*, commands, x->command_count); memset(commands, 0, x->command_count * sizeof(struct node*)); for (w0 = v; w0 < w1; w0++) { if (w0->result > 0) { /* result == -1 when there's no command. 
*/ if (w0->result > x->command_count) { fprintf(stderr, "More among codes than expected\n"); exit(1); } if (!commands[w0->result - 1]) commands[w0->result - 1] = w0->action; } else { ++x->nocommand_count; } if (backward) reverse_b(w0->b); } x->commands = commands; } qsort(v, w1 - v, sizeof(struct amongvec), compare_amongvec); /* the following loop is O(n squared) */ for (w0 = w1 - 1; w0 >= v; w0--) { symbol * b = w0->b; int size = w0->size; struct amongvec * w; for (w = w0 - 1; w >= v; w--) { if (w->size < size && memcmp(w->b, b, w->size * sizeof(symbol)) == 0) { w0->i = w - v; /* fill in index of longest substring */ break; } } } if (backward) for (w0 = v; w0 < w1; w0++) reverse_b(w0->b); for (w0 = v; w0 < w1 - 1; w0++) if (w0->size == (w0 + 1)->size && memcmp(w0->b, (w0 + 1)->b, w0->size * sizeof(symbol)) == 0) { count_error(a); fprintf(stderr, "%s:%d: among(...) has repeated string '", a->tokeniser->file, (w0 + 1)->line_number); report_b(stderr, (w0 + 1)->b); fprintf(stderr, "'\n"); count_error(a); fprintf(stderr, "%s:%d: previously seen here\n", a->tokeniser->file, w0->line_number); } x->literalstring_count = p->number; p->among = x; x->substring = substring; if (substring != 0) substring->among = x; if (x->command_count > 1 || (x->command_count == 1 && x->nocommand_count > 0) || x->starter != 0) { /* We need to set among_var rather than just checking if find_among*() * returns zero or not. 
*/ x->amongvar_needed = a->amongvar_needed = true; } } static int is_just_true(struct node * q) { if (!q) return 1; if (q->type != c_bra && q->type != c_true) return 0; return is_just_true(q->left) && is_just_true(q->right); } static struct node * read_among(struct analyser * a) { struct tokeniser * t = a->tokeniser; struct node * p = new_node(a, c_among); struct node * p_end = 0; int previous_token = -1; struct node * substring = a->substring; a->substring = 0; p->number = 0; /* counts the number of literals */ if (!get_token(a, c_bra)) return p; while (true) { struct node * q; int token = read_token(t); switch (token) { case c_literalstring: q = read_literalstring(a); if (read_token(t) == c_name) { struct node * r = new_node(a, c_name); name_to_node(a, r, 'r'); q->left = r; } else t->token_held = true; p->number++; break; case c_bra: if (previous_token == c_bra) error(a, e_adjacent_bracketed_in_among); q = read_C_list(a); if (is_just_true(q->left)) { /* Convert anything equivalent to () to () so we handle it * the same way. 
*/ q->left = 0; } break; default: error(a, e_unexpected_token_in_among); previous_token = token; continue; case c_ket: if (p->number == 0) error(a, e_empty_among); if (t->error_count == 0) make_among(a, p, substring); return p; } previous_token = token; if (p_end == 0) p->left = q; else p_end->right = q; p_end = q; } } static struct node * read_substring(struct analyser * a) { struct node * p = new_node(a, c_substring); if (a->substring != 0) error2(a, e_substring_preceded_by_substring, a->substring->line_number); a->substring = p; return p; } static void check_modifyable(struct analyser * a) { if (!a->modifyable) error(a, e_not_allowed_inside_reverse); } static int ae_uses_name(struct node * p, struct name * q) { switch (p->type) { case c_name: case c_lenof: case c_sizeof: if (p->name == q) return 1; break; case c_neg: return ae_uses_name(p->right, q); case c_multiply: case c_plus: case c_minus: case c_divide: return ae_uses_name(p->left, q) || ae_uses_name(p->right, q); } return 0; } static struct node * read_C(struct analyser * a) { struct tokeniser * t = a->tokeniser; int token = read_token(t); switch (token) { case c_bra: { struct node * p = read_C_list(a); if (p->type != c_bra) { fprintf(stderr, "read_C_list returned unexpected type %s\n", name_of_token(p->type)); exit(1); } if (p->left && !p->left->right) { // Replace a single entry command list with the command it // contains in order to make subsequent optimisations easier. p = p->left; } return p; } case c_backwards: { int mode = a->mode; if (a->mode == m_backward) error(a, e_already_backwards); else a->mode = m_backward; { struct node * p = C_style(a, "C", token); a->mode = mode; return p; } } case c_reverse: { int mode = a->mode; int modifyable = a->modifyable; a->modifyable = false; a->mode = mode == m_forward ? 
m_backward : m_forward; { struct node * p = C_style(a, "C", token); a->mode = mode; a->modifyable = modifyable; return p; } } case c_not: case c_try: case c_fail: case c_test: case c_do: case c_goto: case c_gopast: case c_repeat: return C_style(a, "C", token); case c_loop: case c_atleast: return C_style(a, "AC", token); case c_setmark: { struct node * n = C_style(a, "i", token); if (n->name) n->name->initialised = true; return n; } case c_tomark: case c_atmark: return C_style(a, "A", token); case c_hop: { struct node * n = C_style(a, "A", token); if (n->AE->type == c_number) { if (n->AE->number < 0) { fprintf(stderr, "%s:%d: warning: hop %d now signals f (as was " "always documented) rather than moving the cursor " "in the opposite direction\n", a->tokeniser->file, n->AE->line_number, n->AE->number); n->AE = NULL; n->type = c_false; } else if (n->AE->number == 0) { fprintf(stderr, "%s:%d: warning: hop 0 is a no-op\n", a->tokeniser->file, n->AE->line_number); n->AE = NULL; n->type = c_true; } } return n; } case c_delete: check_modifyable(a); /* fall through */ case c_next: case c_tolimit: case c_atlimit: case c_leftslice: case c_rightslice: case c_true: case c_false: case c_debug: return new_node(a, token); case c_assignto: case c_sliceto: { struct node *n; check_modifyable(a); n = C_style(a, "s", token); if (n->name) n->name->initialised = true; return n; } case c_assign: case c_insert: case c_attach: case c_slicefrom: { struct node *n; check_modifyable(a); n = C_style(a, "S", token); if (n->name) n->name->value_used = true; return n; } case c_setlimit: return C_style(a, "CfD", token); case c_set: case c_unset: { struct node * n = C_style(a, "b", token); if (n->name) n->name->initialised = true; return n; } case c_dollar: { struct tokeniser * t = a->tokeniser; read_token(t); if (t->token == c_bra) { /* Handle newer $(AE REL_OP AE) syntax. 
*/ struct node * n = read_AE(a, 0, 0); read_token(t); int token = t->token; switch (token) { case c_assign: count_error(a); fprintf(stderr, "%s:%d: Expected relational operator (did you mean '=='?)\n", t->file, t->line_number); /* Assume it was == to try to avoid an error avalanche. */ token = c_eq; /* FALLTHRU */ case c_eq: case c_ne: case c_gr: case c_ge: case c_ls: case c_le: { struct node * lhs = n; struct node * rhs = read_AE(a, 0, 0); if (lhs->type == c_number && rhs->type == c_number) { // Evaluate constant numeric test expression. int result; switch (token) { case c_eq: result = (lhs->number == rhs->number); break; case c_ne: result = (lhs->number != rhs->number); break; case c_gr: result = (lhs->number > rhs->number); break; case c_ge: result = (lhs->number >= rhs->number); break; case c_ls: result = (lhs->number < rhs->number); break; case c_le: result = (lhs->number <= rhs->number); break; default: fprintf(stderr, "Unexpected numeric test operator %s\n", name_of_token(t->token)); exit(1); } n = new_node(a, result ? c_true : c_false); } else { n = new_node(a, token); n->left = lhs; n->AE = rhs; } get_token(a, c_ket); break; } default: error(a, e_unexpected_token); t->token_held = true; break; } return n; } if (t->token == c_name) { struct node * p; struct name * q = find_name(a); int mode = a->mode; int modifyable = a->modifyable; if (q && q->type == t_string) { /* Assume for now that $ on string both initialises and * uses the string variable. FIXME: Can we do better? */ q->initialised = true; q->value_used = true; a->mode = m_forward; a->modifyable = true; p = new_node(a, c_dollar); p->left = read_C(a); p->name = q; } else { if (q && q->type != t_integer) { /* If $ is used on an unknown name or a name which * isn't a string or an integer then we assume the * unknown name is an integer as $ is used more often * on integers than strings, so hopefully this it less * likely to cause an error avalanche. 
* * For an unknown name, we'll already have reported an * error. */ error(a, e_not_of_type_string_or_integer); q = NULL; } p = new_node(a, read_AE_test(a)); switch (p->type) { case c_eq: case c_ne: case c_gr: case c_ge: case c_ls: case c_le: p->left = new_node(a, c_name); p->left->name = q; if (q) { q->value_used = true; } p->AE = read_AE(a, NULL, 0); break; default: /* +=, etc don't "initialise" as they only * amend an existing value. Similarly, they * don't count as using the value. */ p->name = q; p->AE = read_AE(a, q, 0); if (p->type == c_mathassign && q) { /* $x = x + 1 doesn't initialise x. */ q->initialised = !ae_uses_name(p->AE, q); } break; } } if (q) mark_used_in(a, q, p); a->mode = mode; a->modifyable = modifyable; return p; } error(a, e_unexpected_token); t->token_held = true; return new_node(a, c_dollar); } case c_name: { struct name * q = find_name(a); struct node * p = new_node(a, c_name); if (q) { mark_used_in(a, q, p); switch (q->type) { case t_boolean: p->type = c_booltest; q->value_used = true; break; case t_integer: error(a, e_misplaced); /* integer name misplaced */ break; case t_string: q->value_used = true; break; case t_routine: case t_external: p->type = c_call; check_routine_mode(a, q, a->mode); break; case t_grouping: p->type = c_grouping; break; } } p->name = q; return p; } case c_non: { struct node * p = new_node(a, token); read_token(t); if (t->token == c_minus) read_token(t); if (!check_token(a, c_name)) { omission_error(a, c_name); return p; } name_to_node(a, p, 'g'); return p; } case c_literalstring: return read_literalstring(a); case c_among: return read_among(a); case c_substring: return read_substring(a); default: error(a, e_unexpected_token); return 0; } } static int next_symbol(symbol * p, symbol * W, int utf8) { if (utf8) { int ch; int j = get_utf8(p, & ch); W[0] = ch; return j; } else { W[0] = p[0]; return 1; } } static symbol * alter_grouping(symbol * p, symbol * q, int style, int utf8) { int j = 0; symbol W[1]; int width; 
if (style == c_plus) { while (j < SIZE(q)) { width = next_symbol(q + j, W, utf8); p = add_to_b(p, 1, W); j += width; } } else { while (j < SIZE(q)) { int i; width = next_symbol(q + j, W, utf8); for (i = 0; i < SIZE(p); i++) { if (p[i] == W[0]) { memmove(p + i, p + i + 1, (SIZE(p) - i - 1) * sizeof(symbol)); SIZE(p)--; } } j += width; } } return p; } static void read_define_grouping(struct analyser * a, struct name * q) { struct tokeniser * t = a->tokeniser; int style = c_plus; { NEW(grouping, p); if (a->groupings == 0) a->groupings = p; else a->groupings_end->next = p; a->groupings_end = p; if (q) q->grouping = p; p->next = 0; p->name = q; p->line_number = a->tokeniser->line_number; p->b = create_b(0); while (true) { switch (read_token(t)) { case c_name: { struct name * r = find_name(a); if (r) { check_name_type(a, r, 'g'); p->b = alter_grouping(p->b, r->grouping->b, style, false); r->used_in_definition = true; } } break; case c_literalstring: p->b = alter_grouping(p->b, t->b, style, (a->encoding == ENC_UTF8)); break; default: error(a, e_unexpected_token); return; } switch (read_token(t)) { case c_plus: case c_minus: style = t->token; break; default: goto label0; } } label0: { int i; int max = 0; int min = 1<<16; for (i = 0; i < SIZE(p->b); i++) { if (p->b[i] > max) max = p->b[i]; if (p->b[i] < min) min = p->b[i]; } p->largest_ch = max; p->smallest_ch = min; if (min == 1<<16) error(a, e_empty_grouping); } t->token_held = true; return; } } static void read_define_routine(struct analyser * a, struct name * q) { struct node * p = new_node(a, c_define); a->amongvar_needed = false; if (q) { check_name_type(a, q, 'R'); if (q->definition != 0) error(a, e_redefined); if (q->mode < 0) q->mode = a->mode; else if (q->mode != a->mode) error2(a, e_declared_as_different_mode, q->mode); } p->name = q; if (a->program == 0) a->program = p; else a->program_end->right = p; a->program_end = p; get_token(a, c_as); p->left = read_C(a); if (q) q->definition = p->left; if (a->substring != 
0) { error2(a, e_unresolved_substring, a->substring->line_number); a->substring = 0; } p->amongvar_needed = a->amongvar_needed; } static void read_define(struct analyser * a) { if (get_token(a, c_name)) { struct name * q = find_name(a); int type; if (q) { type = q->type; } else { /* No declaration, so sniff next token - if it is 'as' then parse * as a routine, otherwise as a grouping. */ if (read_token(a->tokeniser) == c_as) { type = t_routine; } else { type = t_grouping; } a->tokeniser->token_held = true; } if (type == t_grouping) { read_define_grouping(a, q); } else { read_define_routine(a, q); } } } static void read_backwardmode(struct analyser * a) { int mode = a->mode; a->mode = m_backward; if (get_token(a, c_bra)) { read_program_(a, c_ket); check_token(a, c_ket); } a->mode = mode; } static void read_program_(struct analyser * a, int terminator) { struct tokeniser * t = a->tokeniser; while (true) { switch (read_token(t)) { case c_strings: read_names(a, t_string); break; case c_booleans: read_names(a, t_boolean); break; case c_integers: read_names(a, t_integer); break; case c_routines: read_names(a, t_routine); break; case c_externals: read_names(a, t_external); break; case c_groupings: read_names(a, t_grouping); break; case c_define: read_define(a); break; case c_backwardmode:read_backwardmode(a); break; case c_ket: if (terminator == c_ket) return; /* fall through */ default: error(a, e_unexpected_token); break; case -1: if (terminator >= 0) omission_error(a, c_ket); return; } } } static void remove_dead_assignments(struct node * p, struct name * q) { if (p->name == q) { switch (p->type) { case c_assignto: case c_sliceto: case c_mathassign: case c_plusassign: case c_minusassign: case c_multiplyassign: case c_divideassign: case c_setmark: case c_set: case c_unset: case c_dollar: /* c_true is a no-op. */ p->type = c_true; p->AE = NULL; break; default: /* There are no read accesses to this variable, so any * references must be assignments. 
*/ fprintf(stderr, "Unhandled type of dead assignment via %s\n", name_of_token(p->type)); exit(1); } } if (p->AE) remove_dead_assignments(p->AE, q); if (p->left) remove_dead_assignments(p->left, q); if (p->aux) remove_dead_assignments(p->aux, q); if (p->right) remove_dead_assignments(p->right, q); } extern void read_program(struct analyser * a) { read_program_(a, -1); { struct name * q = a->names; while (q) { switch (q->type) { case t_external: case t_routine: if (q->used && q->definition == 0) error4(a, q); break; case t_grouping: if (q->used && q->grouping == 0) error4(a, q); break; } q = q->next; } } if (a->tokeniser->error_count == 0) { struct name * q = a->names; struct name ** ptr = &(a->names); while (q) { if (!q->referenced) { fprintf(stderr, "%s:%d: warning: %s '", a->tokeniser->file, q->declaration_line_number, name_of_name_type(q->type)); report_b(stderr, q->b); if (q->type == t_routine || q->type == t_external || q->type == t_grouping) { fprintf(stderr, "' declared but not defined\n"); } else { fprintf(stderr, "' defined but not used\n"); q = q->next; *ptr = q; continue; } } else if (q->type == t_routine || q->type == t_grouping) { /* It's OK to define a grouping but only use it to define other * groupings. */ if (!q->used && !q->used_in_definition) { int line_num; if (q->type == t_routine) { line_num = q->definition->line_number; } else { line_num = q->grouping->line_number; } fprintf(stderr, "%s:%d: warning: %s '", a->tokeniser->file, line_num, name_of_name_type(q->type)); report_b(stderr, q->b); fprintf(stderr, "' defined but not used\n"); } } else if (q->type == t_external) { /* Unused is OK. 
*/ } else if (!q->initialised) { fprintf(stderr, "%s:%d: warning: %s '", a->tokeniser->file, q->declaration_line_number, name_of_name_type(q->type)); report_b(stderr, q->b); fprintf(stderr, "' is never initialised\n"); } else if (!q->value_used) { fprintf(stderr, "%s:%d: warning: %s '", a->tokeniser->file, q->declaration_line_number, name_of_name_type(q->type)); report_b(stderr, q->b); fprintf(stderr, "' is set but never used\n"); remove_dead_assignments(a->program, q); q = q->next; *ptr = q; continue; } ptr = &(q->next); q = q->next; } { /* Now we've eliminated variables whose values are never used we * can number the variables, which is used by some generators. */ int * name_count = a->name_count; struct name * n; for (n = a->names; n; n = n->next) { n->count = name_count[n->type]++; } } } } extern struct analyser * create_analyser(struct tokeniser * t) { NEW(analyser, a); a->tokeniser = t; a->nodes = 0; a->names = 0; a->literalstrings = 0; a->program = 0; a->amongs = 0; a->among_count = 0; a->groupings = 0; a->mode = m_forward; a->modifyable = true; { int i; for (i = 0; i < t_size; i++) a->name_count[i] = 0; } a->substring = 0; a->int_limits_used = false; return a; } extern void close_analyser(struct analyser * a) { { struct node * q = a->nodes; while (q) { struct node * q_next = q->next; FREE(q); q = q_next; } } { struct name * q = a->names; while (q) { struct name * q_next = q->next; lose_b(q->b); FREE(q); q = q_next; } } { struct literalstring * q = a->literalstrings; while (q) { struct literalstring * q_next = q->next; lose_b(q->b); FREE(q); q = q_next; } } { struct among * q = a->amongs; while (q) { struct among * q_next = q->next; FREE(q->b); FREE(q->commands); FREE(q); q = q_next; } } { struct grouping * q = a->groupings; while (q) { struct grouping * q_next = q->next; lose_b(q->b); FREE(q); q = q_next; } } FREE(a); } snowball-2.2.0/compiler/driver.c000066400000000000000000000460561414263061200166170ustar00rootroot00000000000000#include /* for toupper etc 
/* Return 1 if the two C strings are identical, 0 otherwise. */
static int eq(const char * s1, const char * s2) {
    return !strcmp(s1, s2);
}
[OPTIONS]\n\n" "Supported options:\n" " -o[utput] file\n" " -s[yntax]\n" " -comments\n" #ifndef DISABLE_JAVA " -j[ava]\n" #endif #ifndef DISABLE_CSHARP " -cs[harp]\n" #endif " -c++\n" #ifndef DISABLE_PASCAL " -pascal\n" #endif #ifndef DISABLE_PYTHON " -py[thon]\n" #endif #ifndef DISABLE_JS " -js\n" #endif #ifndef DISABLE_RUST " -rust\n" #endif #ifndef DISABLE_GO " -go\n" #endif #ifndef DISABLE_ADA " -ada\n" #endif " -w[idechars]\n" " -u[tf8]\n" " -n[ame] class name\n" " -ep[refix] string\n" " -vp[refix] string\n" " -i[nclude] directory\n" " -r[untime] path to runtime headers\n" " -p[arentclassname] fully qualified parent class name\n" #if !defined(DISABLE_JAVA) || !defined(DISABLE_CSHARP) " -P[ackage] package name for stemmers\n" " -S[tringclass] StringBuffer-compatible class\n" " -a[mongclass] fully qualified name of the Among class\n" #endif #ifndef DISABLE_GO " -gop[ackage] Go package name for stemmers\n" " -gor[untime] Go snowball runtime package\n" #endif " --help display this help and exit\n" " --version output version information and exit\n" ); exit(exit_code); } static void check_lim(int i, int argc) { if (i >= argc) { fprintf(stderr, "argument list is one short\n"); print_arglist(1); } } static FILE * get_output(symbol * b) { char * s = b_to_s(b); FILE * output = fopen(s, "w"); if (output == 0) { fprintf(stderr, "Can't open output %s\n", s); exit(1); } free(s); return output; } static int read_options(struct options * o, int argc, char * argv[]) { char * s; int i = 1; int new_argc = 1; /* Note down the last option used to specify an explicit encoding so * we can warn we ignored it for languages with a fixed encoding. 
*/ const char * encoding_opt = NULL; /* set defaults: */ o->output_file = 0; o->syntax_tree = false; o->comments = false; o->externals_prefix = NULL; o->variables_prefix = 0; o->runtime_path = 0; o->parent_class_name = NULL; o->string_class = NULL; o->among_class = NULL; o->package = NULL; o->go_snowball_runtime = DEFAULT_GO_SNOWBALL_RUNTIME; o->name = NULL; o->make_lang = LANG_C; o->includes = 0; o->includes_end = 0; o->encoding = ENC_SINGLEBYTE; /* read options: */ while (i < argc) { s = argv[i++]; if (s[0] != '-') { /* Non-option argument - shuffle down. */ argv[new_argc++] = s; continue; } { if (eq(s, "-o") || eq(s, "-output")) { check_lim(i, argc); o->output_file = argv[i++]; continue; } if (eq(s, "-n") || eq(s, "-name")) { check_lim(i, argc); o->name = argv[i++]; continue; } #ifndef DISABLE_JS if (eq(s, "-js")) { o->make_lang = LANG_JAVASCRIPT; continue; } #endif #ifndef DISABLE_RUST if (eq(s, "-rust")) { o->make_lang = LANG_RUST; continue; } #endif #ifndef DISABLE_GO if (eq(s, "-go")) { o->make_lang = LANG_GO; continue; } #endif #ifndef DISABLE_JAVA if (eq(s, "-j") || eq(s, "-java")) { o->make_lang = LANG_JAVA; continue; } #endif #ifndef DISABLE_CSHARP if (eq(s, "-cs") || eq(s, "-csharp")) { o->make_lang = LANG_CSHARP; continue; } #endif if (eq(s, "-c++")) { o->make_lang = LANG_CPLUSPLUS; continue; } #ifndef DISABLE_PASCAL if (eq(s, "-pascal")) { o->make_lang = LANG_PASCAL; continue; } #endif #ifndef DISABLE_PYTHON if (eq(s, "-py") || eq(s, "-python")) { o->make_lang = LANG_PYTHON; continue; } #endif #ifndef DISABLE_ADA if (eq(s, "-ada")) { o->make_lang = LANG_ADA; continue; } #endif if (eq(s, "-w") || eq(s, "-widechars")) { encoding_opt = s; o->encoding = ENC_WIDECHARS; continue; } if (eq(s, "-s") || eq(s, "-syntax")) { o->syntax_tree = true; continue; } if (eq(s, "-comments")) { o->comments = true; continue; } if (eq(s, "-ep") || eq(s, "-eprefix")) { check_lim(i, argc); o->externals_prefix = argv[i++]; continue; } if (eq(s, "-vp") || eq(s, "-vprefix")) { 
check_lim(i, argc); o->variables_prefix = argv[i++]; continue; } if (eq(s, "-i") || eq(s, "-include")) { check_lim(i, argc); { NEW(include, p); symbol * b = add_s_to_b(0, argv[i++]); b = add_s_to_b(b, "/"); p->next = 0; p->b = b; if (o->includes == 0) o->includes = p; else o->includes_end->next = p; o->includes_end = p; } continue; } if (eq(s, "-r") || eq(s, "-runtime")) { check_lim(i, argc); o->runtime_path = argv[i++]; continue; } if (eq(s, "-u") || eq(s, "-utf8")) { encoding_opt = s; o->encoding = ENC_UTF8; continue; } if (eq(s, "-p") || eq(s, "-parentclassname")) { check_lim(i, argc); o->parent_class_name = argv[i++]; continue; } #if !defined(DISABLE_JAVA) || !defined(DISABLE_CSHARP) if (eq(s, "-P") || eq(s, "-Package")) { check_lim(i, argc); o->package = argv[i++]; continue; } if (eq(s, "-S") || eq(s, "-stringclass")) { check_lim(i, argc); o->string_class = argv[i++]; continue; } if (eq(s, "-a") || eq(s, "-amongclass")) { check_lim(i, argc); o->among_class = argv[i++]; continue; } #endif #ifndef DISABLE_GO if (eq(s, "-gop") || eq(s, "-gopackage")) { check_lim(i, argc); o->package = argv[i++]; continue; } if (eq(s, "-gor") || eq(s, "-goruntime")) { check_lim(i, argc); o->go_snowball_runtime = argv[i++]; continue; } #endif if (eq(s, "--help")) { print_arglist(0); } if (eq(s, "--version")) { printf("Snowball compiler version " SNOWBALL_VERSION "\n"); exit(0); } fprintf(stderr, "'%s' misplaced\n", s); print_arglist(1); } } if (new_argc == 1) { fprintf(stderr, "no source files specified\n"); print_arglist(1); } argv[new_argc] = NULL; /* Set language-dependent defaults. 
*/ switch (o->make_lang) { case LANG_C: case LANG_CPLUSPLUS: encoding_opt = NULL; break; case LANG_CSHARP: o->encoding = ENC_WIDECHARS; if (!o->parent_class_name) o->parent_class_name = DEFAULT_CS_BASE_CLASS; if (!o->string_class) o->string_class = DEFAULT_CS_STRING_CLASS; if (!o->among_class) o->among_class = DEFAULT_CS_AMONG_CLASS; if (!o->package) o->package = DEFAULT_CS_NAMESPACE; break; case LANG_GO: o->encoding = ENC_UTF8; if (!o->package) o->package = DEFAULT_GO_PACKAGE; break; case LANG_ADA: o->encoding = ENC_UTF8; if (!o->package) o->package = DEFAULT_ADA_PACKAGE; break; case LANG_JAVA: o->encoding = ENC_WIDECHARS; if (!o->parent_class_name) o->parent_class_name = DEFAULT_JAVA_BASE_CLASS; if (!o->string_class) o->string_class = DEFAULT_JAVA_STRING_CLASS; if (!o->among_class) o->among_class = DEFAULT_JAVA_AMONG_CLASS; if (!o->package) o->package = DEFAULT_JAVA_PACKAGE; break; case LANG_JAVASCRIPT: o->encoding = ENC_WIDECHARS; if (!o->parent_class_name) o->parent_class_name = DEFAULT_JS_BASE_CLASS; break; case LANG_PYTHON: o->encoding = ENC_WIDECHARS; if (!o->parent_class_name) o->parent_class_name = DEFAULT_PYTHON_BASE_CLASS; break; case LANG_RUST: o->encoding = ENC_UTF8; break; default: break; } if (encoding_opt) { fprintf(stderr, "warning: %s only meaningful for C and C++\n", encoding_opt); } if (o->make_lang != LANG_C && o->make_lang != LANG_CPLUSPLUS) { if (o->runtime_path) { fprintf(stderr, "warning: -r/-runtime only meaningful for C and C++\n"); } if (o->externals_prefix) { fprintf(stderr, "warning: -ep/-eprefix only meaningful for C and C++\n"); } } if (!o->externals_prefix) o->externals_prefix = ""; if (!o->name && o->output_file) { /* Default class name to basename of output_file - this is the standard * convention for at least Java and C#. */ const char * slash = strrchr(o->output_file, '/'); size_t len; const char * leaf = (slash == NULL) ? 
o->output_file : slash + 1; slash = strrchr(leaf, '\\'); if (slash != NULL) leaf = slash + 1; { const char * dot = strchr(leaf, '.'); len = (dot == NULL) ? strlen(leaf) : (size_t)(dot - leaf); } { char * new_name = malloc(len + 1); switch (o->make_lang) { case LANG_CSHARP: case LANG_PASCAL: /* Upper case initial letter. */ memcpy(new_name, leaf, len); new_name[0] = toupper(new_name[0]); break; case LANG_JAVASCRIPT: case LANG_PYTHON: { /* Upper case initial letter and change each * underscore+letter or hyphen+letter to an upper case * letter. */ size_t i, j = 0; int uc_next = true; for (i = 0; i != len; ++i) { unsigned char ch = leaf[i]; if (ch == '_' || ch == '-') { uc_next = true; } else { if (uc_next) { new_name[j] = toupper(ch); uc_next = false; } else { new_name[j] = ch; } ++j; } } len = j; break; } default: /* Just copy. */ memcpy(new_name, leaf, len); break; } new_name[len] = '\0'; o->name = new_name; } } return new_argc; } extern int main(int argc, char * argv[]) { int i; NEW(options, o); argc = read_options(o, argc, argv); { char * file = argv[1]; symbol * u = get_input(file); if (u == 0) { fprintf(stderr, "Can't open input %s\n", file); exit(1); } { struct tokeniser * t = create_tokeniser(u, file); struct analyser * a = create_analyser(t); struct input ** next_input_ptr = &(t->next); a->encoding = t->encoding = o->encoding; t->includes = o->includes; /* If multiple source files are specified, set up the others to be * read after the first in order, using the same mechanism as * 'get' uses. 
*/ for (i = 2; i != argc; ++i) { NEW(input, q); file = argv[i]; u = get_input(file); if (u == 0) { fprintf(stderr, "Can't open input %s\n", file); exit(1); } q->p = u; q->c = 0; q->file = file; q->file_needs_freeing = false; q->line_number = 1; *next_input_ptr = q; next_input_ptr = &(q->next); } *next_input_ptr = NULL; read_program(a); if (t->error_count > 0) exit(1); if (o->syntax_tree) print_program(a); close_tokeniser(t); if (!o->syntax_tree) { struct generator * g; const char * s = o->output_file; if (!s) { fprintf(stderr, "Please include the -o option\n"); print_arglist(1); } g = create_generator(a, o); if (o->make_lang == LANG_C || o->make_lang == LANG_CPLUSPLUS) { symbol * b = add_s_to_b(0, s); b = add_s_to_b(b, ".h"); o->output_h = get_output(b); b[SIZE(b) - 1] = 'c'; if (o->make_lang == LANG_CPLUSPLUS) { b = add_s_to_b(b, "c"); } o->output_src = get_output(b); lose_b(b); generate_program_c(g); fclose(o->output_src); fclose(o->output_h); } #ifndef DISABLE_JAVA if (o->make_lang == LANG_JAVA) { symbol * b = add_s_to_b(0, s); b = add_s_to_b(b, ".java"); o->output_src = get_output(b); lose_b(b); generate_program_java(g); fclose(o->output_src); } #endif #ifndef DISABLE_PASCAL if (o->make_lang == LANG_PASCAL) { symbol *b = add_s_to_b(0, s); b = add_s_to_b(b, ".pas"); o->output_src = get_output(b); lose_b(b); generate_program_pascal(g); fclose(o->output_src); } #endif #ifndef DISABLE_PYTHON if (o->make_lang == LANG_PYTHON) { symbol * b = add_s_to_b(0, s); b = add_s_to_b(b, ".py"); o->output_src = get_output(b); lose_b(b); generate_program_python(g); fclose(o->output_src); } #endif #ifndef DISABLE_JS if (o->make_lang == LANG_JAVASCRIPT) { symbol * b = add_s_to_b(0, s); b = add_s_to_b(b, ".js"); o->output_src = get_output(b); lose_b(b); generate_program_js(g); fclose(o->output_src); } #endif #ifndef DISABLE_CSHARP if (o->make_lang == LANG_CSHARP) { symbol * b = add_s_to_b(0, s); b = add_s_to_b(b, ".cs"); o->output_src = get_output(b); lose_b(b); 
generate_program_csharp(g); fclose(o->output_src); } #endif #ifndef DISABLE_RUST if (o->make_lang == LANG_RUST) { symbol * b = add_s_to_b(0, s); b = add_s_to_b(b, ".rs"); o->output_src = get_output(b); lose_b(b); generate_program_rust(g); fclose(o->output_src); } #endif #ifndef DISABLE_GO if (o->make_lang == LANG_GO) { symbol * b = add_s_to_b(0, s); b = add_s_to_b(b, ".go"); o->output_src = get_output(b); lose_b(b); generate_program_go(g); fclose(o->output_src); } #endif #ifndef DISABLE_ADA if (o->make_lang == LANG_ADA) { symbol * b = add_s_to_b(0, s); b = add_s_to_b(b, ".ads"); o->output_h = get_output(b); b[SIZE(b) - 1] = 'b'; o->output_src = get_output(b); lose_b(b); generate_program_ada(g); fclose(o->output_src); fclose(o->output_h); } #endif close_generator(g); } close_analyser(a); } lose_b(u); } { struct include * p = o->includes; while (p) { struct include * q = p->next; lose_b(p->b); FREE(p); p = q; } } FREE(o); if (space_count) fprintf(stderr, "%d blocks unfreed\n", space_count); return 0; } snowball-2.2.0/compiler/generator.c000066400000000000000000001500261414263061200173030ustar00rootroot00000000000000 #include /* for INT_MAX */ #include /* for fprintf etc */ #include /* for free etc */ #include /* for strlen */ #include "header.h" /* Define this to get warning messages when optimisations can't be used. 
*/ /* #define OPTIMISATION_WARNINGS */ /* recursive use: */ static void generate(struct generator * g, struct node * p); static int new_label(struct generator * g) { return g->next_label++; } /* Write routines for simple entities */ /* Write a space if the preceding character was not whitespace */ static void ws_opt_space(struct generator * g, const char * s) { int ch = str_back(g->outbuf); if (ch != ' ' && ch != '\n' && ch != '\t' && ch != -1) write_char(g, ' '); write_string(g, s); } static void wi3(struct generator * g, int i) { if (i < 100) write_char(g, ' '); if (i < 10) write_char(g, ' '); write_int(g, i); /* integer (width 3) */ } /* Write routines for items from the syntax tree */ static void write_varname(struct generator * g, struct name * p) { int ch = "SIIrxg"[p->type]; switch (p->type) { case t_external: write_string(g, g->options->externals_prefix); break; case t_string: case t_boolean: case t_integer: { int count = p->count; if (count < 0) { fprintf(stderr, "Reference to optimised out variable "); report_b(stderr, p->b); fprintf(stderr, " attempted\n"); exit(1); } if (p->type == t_boolean) { /* We use a single array for booleans and integers, with the * integers first. 
*/ count += g->analyser->name_count[t_integer]; } write_char(g, ch); write_char(g, '['); write_int(g, count); write_char(g, ']'); return; } default: write_char(g, ch); write_char(g, '_'); } write_b(g, p->b); } static void write_varref(struct generator * g, struct name * p) { /* reference to variable */ if (p->type < t_routine) write_string(g, "z->"); write_varname(g, p); } static void write_hexdigit(struct generator * g, int i) { str_append_ch(g->outbuf, "0123456789ABCDEF"[i & 0xF]); /* hexchar */ } static void write_hex(struct generator * g, int i) { if (i >> 4) write_hex(g, i >> 4); write_hexdigit(g, i); /* hex integer */ } /* write character literal */ static void wlitch(struct generator * g, int ch) { if (32 <= ch && ch < 127) { write_char(g, '\''); if (ch == '\'' || ch == '\\') { write_char(g, '\\'); } write_char(g, ch); write_char(g, '\''); } else { write_string(g, "0x"); write_hex(g, ch); } } static void wlitarray(struct generator * g, symbol * p) { /* write literal array */ write_string(g, "{ "); { int i; for (i = 0; i < SIZE(p); i++) { wlitch(g, p[i]); if (i < SIZE(p) - 1) write_string(g, ", "); } } write_string(g, " }"); } static void wlitref(struct generator * g, symbol * p) { /* write ref to literal array */ if (SIZE(p) == 0) { write_char(g, '0'); } else { struct str * s = g->outbuf; g->outbuf = g->declarations; write_string(g, "static const symbol s_"); write_int(g, g->literalstring_count); write_string(g, "[] = "); wlitarray(g, p); write_string(g, ";\n"); g->outbuf = s; write_string(g, "s_"); write_int(g, g->literalstring_count); g->literalstring_count++; } } static void write_margin(struct generator * g) { int i; for (i = 0; i < g->margin; i++) write_string(g, " "); } void write_comment_content(struct generator * g, struct node * p) { switch (p->type) { case c_mathassign: case c_plusassign: case c_minusassign: case c_multiplyassign: case c_divideassign: if (p->name) { write_char(g, '$'); write_b(g, p->name->b); write_char(g, ' '); } write_string(g, 
name_of_token(p->type)); write_string(g, " "); break; case c_eq: case c_ne: case c_gr: case c_ge: case c_ls: case c_le: write_string(g, "$( "); write_string(g, name_of_token(p->type)); write_string(g, " )"); break; default: write_string(g, name_of_token(p->type)); if (p->name) { write_char(g, ' '); write_b(g, p->name->b); } } write_string(g, ", line "); write_int(g, p->line_number); } static void write_comment(struct generator * g, struct node * p) { if (g->options->comments) { ws_opt_space(g, "/* "); write_comment_content(g, p); write_string(g, " */"); } write_newline(g); } static void wms(struct generator * g, const char * s) { write_margin(g); write_string(g, s); } /* margin + string */ static void write_block_start(struct generator * g) { /* block start */ wms(g, "{ "); g->margin++; } static void write_block_end(struct generator * g) { /* block end */ if (g->line_labelled == g->line_count) { wms(g, ";"); write_newline(g); } g->margin--; wms(g, "}"); write_newline(g); } static void w(struct generator * g, const char * s); /* keep c */ static void wk(struct generator * g, struct node * p, int keep_limit) { ++g->keep_count; if (p->mode == m_forward) { write_string(g, "int c"); write_int(g, g->keep_count); write_string(g, " = z->c"); if (keep_limit) { write_string(g, ", mlimit"); write_int(g, g->keep_count); } write_char(g, ';'); } else { write_string(g, "int m"); write_int(g, g->keep_count); write_string(g, " = z->l - z->c"); if (keep_limit) { write_string(g, ", mlimit"); write_int(g, g->keep_count); } write_string(g, "; (void)m"); write_int(g, g->keep_count); write_char(g, ';'); } } static void wrestore(struct generator * g, struct node * p, int keep_token) { /* restore c */ if (p->mode == m_forward) { write_string(g, "z->c = c"); } else { write_string(g, "z->c = z->l - m"); } write_int(g, keep_token); write_char(g, ';'); } static void wrestorelimit(struct generator * g, struct node * p, int keep_token) { /* restore limit */ if (p->mode == m_forward) { w(g, "z->l 
+= mlimit"); } else { w(g, "z->lb = mlimit"); } write_int(g, keep_token); write_string(g, ";"); } static void winc(struct generator * g, struct node * p) { /* increment c */ write_string(g, p->mode == m_forward ? "z->c++;" : "z->c--;"); } static void wsetl(struct generator * g, int n) { g->margin--; wms(g, "lab"); write_int(g, n); write_char(g, ':'); write_newline(g); g->line_labelled = g->line_count; g->margin++; } static void wgotol(struct generator * g, int n) { wms(g, "goto lab"); write_int(g, n); write_char(g, ';'); write_newline(g); } static void write_failure(struct generator * g, struct node * p) { /* fail */ if (g->failure_keep_count != 0) { write_string(g, "{ "); if (g->failure_keep_count > 0) { wrestore(g, p, g->failure_keep_count); } else { wrestorelimit(g, p, -g->failure_keep_count); } write_char(g, ' '); } switch (g->failure_label) { case x_return: write_string(g, "return 0;"); break; default: write_string(g, "goto lab"); write_int(g, g->failure_label); write_char(g, ';'); g->label_used = 1; } if (g->failure_keep_count != 0) write_string(g, " }"); } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { write_string(g, p->mode == m_forward ? "if (z->c >= z->l) " : "if (z->c <= z->lb) "); write_failure(g, p); } static void write_data_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != 0) { write_int(g, SIZE(b)); w(g, ", "); wlitref(g, b); } else { write_varref(g, p->name); } } /* Formatted write. 
*/ static void writef(struct generator * g, const char * input, struct node * p) { int i = 0; int l = strlen(input); while (i < l) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } switch (input[i++]) { default: write_char(g, input[i - 1]); continue; case 'C': write_comment(g, p); continue; case 'k': wk(g, p, false); continue; case 'K': wk(g, p, true); continue; case 'i': winc(g, p); continue; case 'l': write_check_limit(g, p); continue; case 'f': write_failure(g, p); continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': write_string(g, g->S[input[i++] - '0']); continue; case 'I': write_int(g, g->I[input[i++] - '0']); continue; case 'J': wi3(g, g->I[input[i++] - '0']); continue; case 'V': write_varref(g, g->V[input[i++] - '0']); continue; case 'W': write_varname(g, g->V[input[i++] - '0']); continue; case 'L': wlitref(g, g->L[input[i++] - '0']); continue; case 'A': wlitarray(g, g->L[input[i++] - '0']); continue; case 'c': wlitch(g, g->I[input[i++] - '0']); continue; case 'a': write_data_address(g, p); continue; case '+': g->margin++; continue; case '-': g->margin--; continue; case '$': /* insert_s, insert_v etc */ write_char(g, p->literalstring == 0 ? 
'v' : 's'); continue; case 'p': write_string(g, g->options->externals_prefix); continue; } } } static void w(struct generator * g, const char * s) { writef(g, s, 0); } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case c_number: write_int(g, p->number); break; case c_maxint: write_string(g, "MAXINT"); break; case c_minint: write_string(g, "MININT"); break; case c_neg: write_char(g, '-'); generate_AE(g, p->right); break; case c_multiply: s = " * "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; goto label0; case c_divide: s = " / "; label0: write_char(g, '('); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "z->c"); break; case c_limit: w(g, p->mode == m_forward ? "z->l" : "z->lb"); break; case c_len: if (g->options->encoding == ENC_UTF8) { w(g, "len_utf8(z->p)"); break; } /* FALLTHRU */ case c_size: w(g, "SIZE(z->p)"); break; case c_lenof: if (g->options->encoding == ENC_UTF8) { g->V[0] = p->name; w(g, "len_utf8(~V0)"); break; } /* FALLTHRU */ case c_sizeof: g->V[0] = p->name; w(g, "SIZE(~V0)"); break; } } /* K_needed() tests to see if we really need to keep c. Not true when the command does not touch the cursor. This and repeat_score() could be elaborated almost indefinitely. 
*/ static int K_needed_(struct generator * g, struct node * p, int call_depth) { while (p) { switch (p->type) { case c_atlimit: case c_do: case c_dollar: case c_leftslice: case c_rightslice: case c_mathassign: case c_plusassign: case c_minusassign: case c_multiplyassign: case c_divideassign: case c_eq: case c_ne: case c_gr: case c_ge: case c_ls: case c_le: case c_sliceto: case c_booltest: case c_set: case c_unset: case c_true: case c_false: case c_debug: break; case c_call: /* Recursive functions aren't typical in snowball programs, so * make the pessimistic assumption that keep is needed if we * hit a generous limit on recursion. It's not likely to make * a difference to any real world program, but means we won't * recurse until we run out of stack for pathological cases. */ if (call_depth >= 100) return true; if (K_needed_(g, p->name->definition, call_depth + 1)) return true; break; case c_bra: if (K_needed_(g, p->left, call_depth)) return true; break; default: return true; } p = p->right; } return false; } extern int K_needed(struct generator * g, struct node * p) { return K_needed_(g, p, 0); } static int repeat_score(struct generator * g, struct node * p, int call_depth) { int score = 0; while (p) { switch (p->type) { case c_dollar: case c_leftslice: case c_rightslice: case c_mathassign: case c_plusassign: case c_minusassign: case c_multiplyassign: case c_divideassign: case c_eq: case c_ne: case c_gr: case c_ge: case c_ls: case c_le: case c_sliceto: /* case c_not: must not be included here! */ case c_debug: break; case c_call: /* Recursive functions aren't typical in snowball programs, so * make the pessimistic assumption that repeat requires cursor * reinstatement if we hit a generous limit on recursion. It's * not likely to make a difference to any real world program, * but means we won't recurse until we run out of stack for * pathological cases. 
*/ if (call_depth >= 100) { return 2; } score += repeat_score(g, p->name->definition, call_depth + 1); if (score >= 2) return score; break; case c_bra: score += repeat_score(g, p->left, call_depth); if (score >= 2) return score; break; case c_name: case c_literalstring: case c_next: case c_grouping: case c_non: case c_hop: if (++score >= 2) return score; break; default: return 2; } p = p->right; } return score; } /* tests if an expression requires cursor reinstatement in a repeat */ extern int repeat_restore(struct generator * g, struct node * p) { return repeat_score(g, p, 0) >= 2; } static void generate_bra(struct generator * g, struct node * p) { p = p->left; while (p) { generate(g, p); p = p->right; } } static void generate_and(struct generator * g, struct node * p) { int keep_c = 0; if (K_needed(g, p->left)) { writef(g, "~{~k~C", p); keep_c = g->keep_count; } else { writef(g, "~M~C", p); } p = p->left; while (p) { generate(g, p); if (keep_c && p->right != 0) { w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); } p = p->right; } if (keep_c) w(g, "~}"); } static void generate_or(struct generator * g, struct node * p) { int keep_c = 0; int used = g->label_used; int a0 = g->failure_label; int a1 = g->failure_keep_count; int out_lab = new_label(g); if (K_needed(g, p->left)) { writef(g, "~{~k~C", p); keep_c = g->keep_count; } else { writef(g, "~M~C", p); } p = p->left; g->failure_keep_count = 0; while (p->right) { g->failure_label = new_label(g); g->label_used = 0; generate(g, p); wgotol(g, out_lab); if (g->label_used) wsetl(g, g->failure_label); if (keep_c) { w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); } p = p->right; } g->label_used = used; g->failure_label = a0; g->failure_keep_count = a1; generate(g, p); if (keep_c) w(g, "~}"); wsetl(g, out_lab); } static void generate_backwards(struct generator * g, struct node * p) { writef(g, "~Mz->lb = z->c; z->c = z->l;~C~N", p); generate(g, p->left); w(g, "~Mz->c = z->lb;~N"); } static void generate_not(struct generator * 
g, struct node * p) { int keep_c = 0; int used = g->label_used; int a0 = g->failure_label; int a1 = g->failure_keep_count; if (K_needed(g, p->left)) { writef(g, "~{~k~C", p); keep_c = g->keep_count; } else { writef(g, "~M~C", p); } g->failure_label = new_label(g); g->label_used = 0; g->failure_keep_count = 0; generate(g, p->left); { int l = g->failure_label; int u = g->label_used; g->label_used = used; g->failure_label = a0; g->failure_keep_count = a1; writef(g, "~M~f~N", p); if (u) wsetl(g, l); } if (keep_c) { w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N~}"); } } static void generate_try(struct generator * g, struct node * p) { int keep_c = 0; if (K_needed(g, p->left)) { writef(g, "~{~k~C", p); keep_c = g->keep_count; } else { writef(g, "~M~C", p); } g->failure_keep_count = keep_c; g->failure_label = new_label(g); g->label_used = 0; generate(g, p->left); if (g->label_used) wsetl(g, g->failure_label); if (keep_c) w(g, "~}"); } static void generate_set(struct generator * g, struct node * p) { g->V[0] = p->name; writef(g, "~M~V0 = 1;~C", p); } static void generate_unset(struct generator * g, struct node * p) { g->V[0] = p->name; writef(g, "~M~V0 = 0;~C", p); } static void generate_fail(struct generator * g, struct node * p) { generate(g, p->left); writef(g, "~M~f~C", p); } /* generate_test() also implements 'reverse' */ static void generate_test(struct generator * g, struct node * p) { int keep_c = 0; if (K_needed(g, p->left)) { keep_c = ++g->keep_count; w(g, p->mode == m_forward ? "~{int c_test" : "~{int m_test"); write_int(g, keep_c); w(g, p->mode == m_forward ? " = z->c;" : " = z->l - z->c;"); writef(g, "~C", p); } else writef(g, "~M~C", p); generate(g, p->left); if (keep_c) { w(g, p->mode == m_forward ? 
"~Mz->c = c_test" : "~Mz->c = z->l - m_test"); write_int(g, keep_c); writef(g, ";~N~}", p); } } static void generate_do(struct generator * g, struct node * p) { int keep_c = 0; if (K_needed(g, p->left)) { writef(g, "~{~k~C", p); keep_c = g->keep_count; } else { writef(g, "~M~C", p); } if (p->left->type == c_call) { /* Optimise do */ g->V[0] = p->left->name; writef(g, "~{int ret = ~V0(z);~C", p->left); w(g, "~Mif (ret < 0) return ret;~N~}"); } else { g->failure_label = new_label(g); g->label_used = 0; g->failure_keep_count = 0; generate(g, p->left); if (g->label_used) wsetl(g, g->failure_label); } if (keep_c) { w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N~}"); } } static void generate_next(struct generator * g, struct node * p) { if (g->options->encoding == ENC_UTF8) { if (p->mode == m_forward) w(g, "~{int ret = skip_utf8(z->p, z->c, z->l, 1"); else w(g, "~{int ret = skip_b_utf8(z->p, z->c, z->lb, 1"); writef(g, ");~N" "~Mif (ret < 0) ~f~N" "~Mz->c = ret;~C" "~}", p); } else writef(g, "~M~l~N" "~M~i~C", p); } static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) { struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "in" : "out"; g->S[2] = g->options->encoding == ENC_UTF8 ? 
"_U" : ""; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; if (is_goto) { writef(g, "~Mif (~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 1) < 0) ~f~C", p); } else { writef(g, "~{~C" "~Mint ret = ~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 1);~N" "~Mif (ret < 0) ~f~N", p); if (p->mode == m_forward) w(g, "~Mz->c += ret;~N"); else w(g, "~Mz->c -= ret;~N"); w(g, "~}"); } } static void generate_GO(struct generator * g, struct node * p, int style) { int keep_c = 0; int used = g->label_used; int a0 = g->failure_label; int a1 = g->failure_keep_count; if (p->left->type == c_grouping || p->left->type == c_non) { /* Special case for "goto" or "gopast" when used on a grouping or an * inverted grouping - the movement of c by the matching action is * exactly what we want! */ #ifdef OPTIMISATION_WARNINGS printf("Optimising %s %s\n", style ? "goto" : "gopast", p->left->type == c_non ? "non" : "grouping"); #endif if (g->options->comments) { writef(g, "~M~C", p); } generate_GO_grouping(g, p->left, style, p->left->type == c_non); return; } w(g, "~Mwhile(1) {"); writef(g, "~C~+", p); if (style == 1 || repeat_restore(g, p->left)) { writef(g, "~M~k~N", p); keep_c = g->keep_count; } g->failure_label = new_label(g); g->label_used = 0; g->failure_keep_count = 0; generate(g, p->left); if (style == 1) { /* include for goto; omit for gopast */ w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); } w(g, "~Mbreak;~N"); if (g->label_used) wsetl(g, g->failure_label); if (keep_c) { w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); } g->label_used = used; g->failure_label = a0; g->failure_keep_count = a1; generate_next(g, p); w(g, "~}"); } static void generate_loop(struct generator * g, struct node * p) { w(g, "~{int i; for (i = "); generate_AE(g, p->AE); writef(g, "; i > 0; i--)~C" "~{", p); generate(g, p->left); w(g, "~}" "~}"); } static void generate_repeat_or_atleast(struct generator * g, struct node * p, int atleast_case) { int keep_c = 0; if (atleast_case) { writef(g, "~Mwhile(1) {~+~N", p); } 
else { writef(g, "~Mwhile(1) {~+~C", p); } if (repeat_restore(g, p->left)) { writef(g, "~M~k~N", p); keep_c = g->keep_count; } g->failure_label = new_label(g); g->label_used = 0; g->failure_keep_count = 0; generate(g, p->left); if (atleast_case) w(g, "~Mi--;~N"); w(g, "~Mcontinue;~N"); if (g->label_used) wsetl(g, g->failure_label); if (keep_c) { w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); } w(g, "~Mbreak;~N" "~}"); } static void generate_repeat(struct generator * g, struct node * p) { generate_repeat_or_atleast(g, p, false); } static void generate_atleast(struct generator * g, struct node * p) { w(g, "~{int i = "); generate_AE(g, p->AE); w(g, ";~C"); { int used = g->label_used; int a0 = g->failure_label; int a1 = g->failure_keep_count; generate_repeat_or_atleast(g, p, true); g->label_used = used; g->failure_label = a0; g->failure_keep_count = a1; } writef(g, "~Mif (i > 0) ~f~N" "~}", p); } static void generate_setmark(struct generator * g, struct node * p) { g->V[0] = p->name; writef(g, "~M~V0 = z->c;~C", p); } static void generate_tomark(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? ">" : "<"; w(g, "~Mif (z->c ~S0 "); generate_AE(g, p->AE); writef(g, ") ~f~N", p); w(g, "~Mz->c = "); generate_AE(g, p->AE); writef(g, ";~C", p); } static void generate_atmark(struct generator * g, struct node * p) { w(g, "~Mif (z->c != "); generate_AE(g, p->AE); writef(g, ") ~f~C", p); } static void generate_hop(struct generator * g, struct node * p) { if (g->options->encoding == ENC_UTF8) { g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = p->mode == m_forward ? "z->l" : "z->lb"; w(g, "~{int ret = skip~S0_utf8(z->p, z->c, ~S1, "); generate_AE(g, p->AE); writef(g, ");~C", p); writef(g, "~Mif (ret < 0) ~f~N", p); writef(g, "~Mz->c = ret;~N" "~}", p); } else { // Fixed-width characters. g->S[0] = p->mode == m_forward ? "+" : "-"; if (p->AE->type == c_number) { // Constant distance hop. 
// // No need to check for negative hop as that's converted to false by // the analyser. // // Note that if we signal f then z->c will be reset when this is // handled - we rely on this here and unconditionally update z->c. w(g, "z->c = z->c ~S0 "); generate_AE(g, p->AE); w(g, ";~C"); if (p->mode == m_forward) { writef(g, "~Mif (z->c > z->l) ~f~N", p); } else { writef(g, "~Mif (z->c < z->lb) ~f~N", p); } } else { w(g, "~{int ret = z->c ~S0 "); generate_AE(g, p->AE); writef(g, ";~C", p); if (p->mode == m_forward) { writef(g, "~Mif (ret > z->l || ret < z->c) ~f~N", p); } else { writef(g, "~Mif (ret < z->lb || ret > z->c) ~f~N", p); } writef(g, "~Mz->c = ret;~N" "~}", p); } } } static void generate_delete(struct generator * g, struct node * p) { writef(g, "~{int ret = slice_del(z);~C", p); writef(g, "~Mif (ret < 0) return ret;~N" "~}", p); } static void generate_tolimit(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? "" : "b"; writef(g, "~Mz->c = z->l~S0;~C", p); } static void generate_atlimit(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? "" : "b"; g->S[1] = p->mode == m_forward ? "<" : ">"; writef(g, "~Mif (z->c ~S1 z->l~S0) ~f~C", p); } static void generate_leftslice(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? "bra" : "ket"; writef(g, "~Mz->~S0 = z->c;~C", p); } static void generate_rightslice(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? 
"ket" : "bra"; writef(g, "~Mz->~S0 = z->c;~C", p); } static void generate_assignto(struct generator * g, struct node * p) { g->V[0] = p->name; writef(g, "~M~V0 = assign_to(z, ~V0);~C" "~Mif (~V0 == 0) return -1;~C", p); } static void generate_sliceto(struct generator * g, struct node * p) { g->V[0] = p->name; writef(g, "~M~V0 = slice_to(z, ~V0);~C" "~Mif (~V0 == 0) return -1;~N", p); } static void generate_insert(struct generator * g, struct node * p, int style) { int keep_c = style == c_attach; if (p->mode == m_backward) keep_c = !keep_c; writef(g, "~{int ret;~N", p); if (keep_c) w(g, "~{int saved_c = z->c;~N"); writef(g, "~Mret = insert_~$(z, z->c, z->c, ~a);~C", p); if (keep_c) w(g, "~Mz->c = saved_c;~N~}"); writef(g, "~Mif (ret < 0) return ret;~N" "~}", p); } static void generate_assignfrom(struct generator * g, struct node * p) { int keep_c = p->mode == m_forward; /* like 'attach' */ writef(g, "~{int ret;~N", p); if (keep_c) writef(g, "~{int saved_c = z->c;~N", p); w(g, "~Mret = "); writef(g, keep_c ? "insert_~$(z, z->c, z->l, ~a);~C" : "insert_~$(z, z->lb, z->c, ~a);~C", p); if (keep_c) w(g, "~Mz->c = saved_c;~N~}"); writef(g, "~Mif (ret < 0) return ret;~N" "~}", p); } static void generate_slicefrom(struct generator * g, struct node * p) { writef(g, "~{int ret = slice_from_~$(z, ~a);~C", p); writef(g, "~Mif (ret < 0) return ret;~N" "~}", p); } static void generate_setlimit(struct generator * g, struct node * p) { int keep_c; if (p->left && p->left->type == c_tomark) { /* Special case for: * * setlimit tomark AE for C * * All uses of setlimit in the current stemmers we ship follow this * pattern, and by special-casing we can avoid having to save and * restore c. */ struct node * q = p->left; ++g->keep_count; writef(g, "~N~{int mlimit", p); write_int(g, g->keep_count); writef(g, ";~C", p); keep_c = g->keep_count; g->S[0] = q->mode == m_forward ? 
">" : "<"; w(g, "~Mif (z->c ~S0 "); generate_AE(g, q->AE); writef(g, ") ~f~N", q); w(g, "~Mmlimit"); write_int(g, keep_c); if (p->mode == m_forward) { w(g, " = z->l - z->c; z->l = "); } else { w(g, " = z->lb; z->lb = "); } generate_AE(g, q->AE); w(g, ";~N"); } else { writef(g, "~{~K~C", p); keep_c = g->keep_count; generate(g, p->left); w(g, "~Mmlimit"); write_int(g, keep_c); if (p->mode == m_forward) w(g, " = z->l - z->c; z->l = z->c;~N"); else w(g, " = z->lb; z->lb = z->c;~N"); w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); } g->failure_keep_count = -keep_c; generate(g, p->aux); w(g, "~M"); wrestorelimit(g, p, -g->failure_keep_count); w(g, "~N" "~}"); } /* dollar sets snowball up to operate on a string variable as if it were the * current string */ static void generate_dollar(struct generator * g, struct node * p) { int used = g->label_used; int a0 = g->failure_label; int a1 = g->failure_keep_count; int keep_token; g->failure_label = new_label(g); g->label_used = 0; g->failure_keep_count = 0; keep_token = ++g->keep_count; g->I[0] = keep_token; writef(g, "~{struct SN_env env~I0 = * z;~C", p); g->V[0] = p->name; /* Assume failure. */ writef(g, "~Mint failure = 1;~N" "~Mz->p = ~V0;~N" "~Mz->lb = z->c = 0;~N" "~Mz->l = SIZE(z->p);~N", p); generate(g, p->left); /* Mark success. 
*/ w(g, "~Mfailure = 0;~N"); if (g->label_used) wsetl(g, g->failure_label); g->V[0] = p->name; /* necessary */ g->label_used = used; g->failure_label = a0; g->failure_keep_count = a1; g->I[0] = keep_token; writef(g, "~M~V0 = z->p;~N" "~M* z = env~I0;~N" "~Mif (failure) ~f~N~}", p); } static void generate_integer_assign(struct generator * g, struct node * p, char * s) { g->V[0] = p->name; g->S[0] = s; w(g, "~M~V0 ~S0 "); generate_AE(g, p->AE); writef(g, ";~C", p); } static void generate_integer_test(struct generator * g, struct node * p, char * s) { w(g, "~Mif (!("); generate_AE(g, p->left); write_char(g, ' '); write_string(g, s); write_char(g, ' '); generate_AE(g, p->AE); writef(g, ")) ~f~C", p); } static void generate_call(struct generator * g, struct node * p) { g->V[0] = p->name; writef(g, "~{int ret = ~V0(z);~C", p); if (g->failure_keep_count == 0 && g->failure_label == x_return) { /* Combine the two tests in this special case for better optimisation * and clearer generated code. */ writef(g, "~Mif (ret <= 0) return ret;~N~}", p); } else { writef(g, "~Mif (ret == 0) ~f~N" "~Mif (ret < 0) return ret;~N~}", p); } } static void generate_grouping(struct generator * g, struct node * p, int complement) { struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "out" : "in"; g->S[2] = g->options->encoding == ENC_UTF8 ? "_U" : ""; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; writef(g, "~Mif (~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 0)) ~f~C", p); } static void generate_namedstring(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? 
"" : "_b"; g->V[0] = p->name; writef(g, "~Mif (!(eq_v~S0(z, ~V0))) ~f~C", p); } static void generate_literalstring(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (SIZE(b) == 1) { /* It's quite common to compare with a single character literal string, * so just inline the simpler code for this case rather than making a * function call. In UTF-8 mode, only do this for the ASCII subset, * since multi-byte characters are more complex to test against. */ if (g->options->encoding == ENC_UTF8 && *b >= 128) { printf("single byte %d\n", *b); exit(1); } g->I[0] = *b; if (p->mode == m_forward) { writef(g, "~Mif (z->c == z->l || z->p[z->c] != ~c0) ~f~C" "~Mz->c++;~N", p); } else { writef(g, "~Mif (z->c <= z->lb || z->p[z->c - 1] != ~c0) ~f~C" "~Mz->c--;~N", p); } } else { g->S[0] = p->mode == m_forward ? "" : "_b"; g->I[0] = SIZE(b); g->L[0] = b; writef(g, "~Mif (!(eq_s~S0(z, ~I0, ~L0))) ~f~C", p); } } static void generate_define(struct generator * g, struct node * p) { struct name * q = p->name; g->next_label = 0; g->S[0] = q->type == t_routine ? "static" : "extern"; g->V[0] = q; w(g, "~N~S0 int ~V0(struct SN_env * z) {"); if (g->options->comments) { write_string(g, p->mode == m_forward ? " /* forwardmode */" : " /* backwardmode */"); } w(g, "~N~+"); if (p->amongvar_needed) w(g, "~Mint among_var;~N"); g->failure_keep_count = 0; g->failure_label = x_return; g->label_used = 0; g->keep_count = 0; generate(g, p->left); w(g, "~Mreturn 1;~N~}"); } static void generate_substring(struct generator * g, struct node * p) { struct among * x = p->among; int block = -1; unsigned int bitmap = 0; struct amongvec * among_cases = x->b; int c; int empty_case = -1; int n_cases = 0; symbol cases[2]; int shortest_size = INT_MAX; int shown_comment = 0; g->S[0] = p->mode == m_forward ? 
"" : "_b"; g->I[0] = x->number; g->I[1] = x->literalstring_count; /* In forward mode with non-ASCII UTF-8 characters, the first byte * of the string will often be the same, so instead look at the last * common byte position. * * In backward mode, we can't match if there are fewer characters before * the current position than the minimum length. */ for (c = 0; c < x->literalstring_count; ++c) { int size = among_cases[c].size; if (size != 0 && size < shortest_size) { shortest_size = size; } } for (c = 0; c < x->literalstring_count; ++c) { symbol ch; if (among_cases[c].size == 0) { empty_case = c; continue; } if (p->mode == m_forward) { ch = among_cases[c].b[shortest_size - 1]; } else { ch = among_cases[c].b[among_cases[c].size - 1]; } if (n_cases == 0) { block = ch >> 5; } else if (ch >> 5 != block) { block = -1; if (n_cases > 2) break; } if (block == -1) { if (n_cases > 0 && ch == cases[0]) continue; if (n_cases < 2) { cases[n_cases++] = ch; } else if (ch != cases[1]) { ++n_cases; break; } } else { if ((bitmap & (1u << (ch & 0x1f))) == 0) { bitmap |= 1u << (ch & 0x1f); if (n_cases < 2) cases[n_cases] = ch; ++n_cases; } } } if (block != -1 || n_cases <= 2) { char buf[64]; g->I[2] = block; g->I[3] = bitmap; g->I[4] = shortest_size - 1; if (p->mode == m_forward) { sprintf(buf, "z->p[z->c + %d]", shortest_size - 1); g->S[1] = buf; if (shortest_size == 1) { writef(g, "~Mif (z->c >= z->l", p); } else { writef(g, "~Mif (z->c + ~I4 >= z->l", p); } } else { g->S[1] = "z->p[z->c - 1]"; if (shortest_size == 1) { writef(g, "~Mif (z->c <= z->lb", p); } else { writef(g, "~Mif (z->c - ~I4 <= z->lb", p); } } if (n_cases == 0) { /* We get this for the degenerate case: among ( '' ) * This doesn't seem to be a useful construct, but it is * syntactically valid. 
*/ } else if (n_cases == 1) { g->I[4] = cases[0]; writef(g, " || ~S1 != ~I4", p); } else if (n_cases == 2) { g->I[4] = cases[0]; g->I[5] = cases[1]; writef(g, " || (~S1 != ~I4 && ~S1 != ~I5)", p); } else { writef(g, " || ~S1 >> 5 != ~I2 || !((~I3 >> (~S1 & 0x1f)) & 1)", p); } write_string(g, ") "); if (empty_case != -1) { /* If the among includes the empty string, it can never fail * so not matching the bitmap means we match the empty string. */ g->I[4] = among_cases[empty_case].result; writef(g, "among_var = ~I4; else~C", p); } else { writef(g, "~f~C", p); } shown_comment = 1; } else { #ifdef OPTIMISATION_WARNINGS printf("Couldn't shortcut among %d\n", x->number); #endif } if (!x->amongvar_needed) { writef(g, "~Mif (!(find_among~S0(z, a_~I0, ~I1))) ~f", p); writef(g, shown_comment ? "~N" : "~C", p); } else { writef(g, "~Mamong_var = find_among~S0(z, a_~I0, ~I1);", p); writef(g, shown_comment ? "~N" : "~C", p); writef(g, "~Mif (!(among_var)) ~f~N", p); } } static void generate_among(struct generator * g, struct node * p) { struct among * x = p->among; if (x->substring == 0) generate_substring(g, p); if (x->starter != 0) generate(g, x->starter); if (x->command_count == 1 && x->nocommand_count == 0) { /* Only one outcome ("no match" already handled). 
*/ generate(g, x->commands[0]); } else if (x->command_count > 0) { int i; writef(g, "~Mswitch (among_var) {~C~+", p); for (i = 1; i <= x->command_count; i++) { g->I[0] = i; w(g, "~Mcase ~I0:~N~+"); generate(g, x->commands[i - 1]); w(g, "~Mbreak;~N~-"); } w(g, "~}"); } } static void generate_booltest(struct generator * g, struct node * p) { g->V[0] = p->name; writef(g, "~Mif (!(~V0)) ~f~C", p); } static void generate_false(struct generator * g, struct node * p) { writef(g, "~M~f~C", p); } static void generate_debug(struct generator * g, struct node * p) { g->I[0] = g->debug_count++; g->I[1] = p->line_number; writef(g, "~Mdebug(z, ~I0, ~I1);~C", p); } static void generate(struct generator * g, struct node * p) { int used = g->label_used; int a0 = g->failure_label; int a1 = g->failure_keep_count; switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; case c_or: generate_or(g, p); break; case c_backwards: generate_backwards(g, p); break; case c_not: generate_not(g, p); break; case c_set: generate_set(g, p); break; case c_unset: generate_unset(g, p); break; case c_try: generate_try(g, p); break; case c_fail: generate_fail(g, p); break; case c_reverse: case c_test: generate_test(g, p); break; case c_do: generate_do(g, p); break; case c_goto: generate_GO(g, p, 1); break; case c_gopast: generate_GO(g, p, 0); break; case c_repeat: generate_repeat(g, p); break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_setmark: generate_setmark(g, p); break; case c_tomark: generate_tomark(g, p); break; case c_atmark: generate_atmark(g, p); break; case c_hop: generate_hop(g, p); break; case c_delete: generate_delete(g, p); break; case c_next: generate_next(g, p); break; case c_tolimit: generate_tolimit(g, p); break; case c_atlimit: generate_atlimit(g, p); break; case c_leftslice: generate_leftslice(g, p); break; case c_rightslice: generate_rightslice(g, 
p); break; case c_assignto: generate_assignto(g, p); break; case c_sliceto: generate_sliceto(g, p); break; case c_assign: generate_assignfrom(g, p); break; case c_insert: case c_attach: generate_insert(g, p, p->type); break; case c_slicefrom: generate_slicefrom(g, p); break; case c_setlimit: generate_setlimit(g, p); break; case c_dollar: generate_dollar(g, p); break; case c_mathassign: generate_integer_assign(g, p, "="); break; case c_plusassign: generate_integer_assign(g, p, "+="); break; case c_minusassign: generate_integer_assign(g, p, "-="); break; case c_multiplyassign:generate_integer_assign(g, p, "*="); break; case c_divideassign: generate_integer_assign(g, p, "/="); break; case c_eq: generate_integer_test(g, p, "=="); break; case c_ne: generate_integer_test(g, p, "!="); break; case c_gr: generate_integer_test(g, p, ">"); break; case c_ge: generate_integer_test(g, p, ">="); break; case c_ls: generate_integer_test(g, p, "<"); break; case c_le: generate_integer_test(g, p, "<="); break; case c_call: generate_call(g, p); break; case c_grouping: generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case c_literalstring: generate_literalstring(g, p); break; case c_among: generate_among(g, p); break; case c_substring: generate_substring(g, p); break; case c_booltest: generate_booltest(g, p); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } if (g->failure_label != a0) g->label_used = used; g->failure_label = a0; g->failure_keep_count = a1; } void write_generated_comment_content(struct generator * g) { w(g, "Generated by Snowball " SNOWBALL_VERSION " - https://snowballstem.org/"); } void write_start_comment(struct generator * g, const char * comment_start, const char * comment_end) { write_margin(g); w(g, comment_start); write_generated_comment_content(g); 
if (comment_end) { w(g, comment_end); } w(g, "~N~N"); } static void generate_head(struct generator * g) { w(g, "#include \""); if (g->options->runtime_path) { write_string(g, g->options->runtime_path); if (g->options->runtime_path[strlen(g->options->runtime_path) - 1] != '/') write_char(g, '/'); } w(g, "header.h\"~N~N"); } static void generate_routine_headers(struct generator * g) { struct name * q; for (q = g->analyser->names; q; q = q->next) { g->V[0] = q; switch (q->type) { case t_routine: w(g, "static int ~W0(struct SN_env * z);~N"); break; case t_external: w(g, "#ifdef __cplusplus~N" "extern \"C\" {~N" "#endif~N" "extern int ~W0(struct SN_env * z);~N" "#ifdef __cplusplus~N" "}~N" "#endif~N" ); break; } } } static void generate_among_table(struct generator * g, struct among * x) { struct amongvec * v = x->b; g->I[0] = x->number; { int i; for (i = 0; i < x->literalstring_count; i++) { g->I[1] = i; g->I[2] = v->size; g->L[0] = v->b; if (v->size) w(g, "static const symbol s_~I0_~I1[~I2] = ~A0;~N"); v++; } } g->I[1] = x->literalstring_count; w(g, "~N~Mstatic const struct among a_~I0[~I1] =~N{~N"); v = x->b; { int i; for (i = 0; i < x->literalstring_count; i++) { g->I[1] = i; g->I[2] = v->size; g->I[3] = v->i; g->I[4] = v->result; g->S[0] = i < x->literalstring_count - 1 ? 
"," : ""; if (g->options->comments) { w(g, "/*~J1 */ "); } w(g, "{ ~I2, "); if (v->size == 0) { w(g, "0,"); } else { w(g, "s_~I0_~I1,"); } w(g, " ~I3, ~I4, "); if (v->function == 0) { write_char(g, '0'); } else { write_varname(g, v->function); } w(g, "}~S0~N"); v++; } } w(g, "};~N~N"); } static void generate_amongs(struct generator * g) { struct among * x; for (x = g->analyser->amongs; x; x = x->next) { generate_among_table(g, x); } } static void set_bit(symbol * b, int i) { b[i/8] |= 1 << i%8; } static void generate_grouping_table(struct generator * g, struct grouping * q) { int range = q->largest_ch - q->smallest_ch + 1; int size = (range + 7)/ 8; /* assume 8 bits per symbol */ symbol * b = q->b; symbol * map = create_b(size); int i; for (i = 0; i < size; i++) map[i] = 0; for (i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); g->V[0] = q->name; w(g, "static const unsigned char ~V0[] = { "); for (i = 0; i < size; i++) { write_int(g, map[i]); if (i < size - 1) w(g, ", "); } w(g, " };~N~N"); lose_b(map); } static void generate_groupings(struct generator * g) { struct grouping * q; for (q = g->analyser->groupings; q; q = q->next) { if (q->name->used) generate_grouping_table(g, q); } } static void generate_create(struct generator * g) { int * p = g->analyser->name_count; g->I[0] = p[t_string]; g->I[1] = p[t_integer] + p[t_boolean]; w(g, "~N" "extern struct SN_env * ~pcreate_env(void) { return SN_create_env(~I0, ~I1); }" "~N"); } static void generate_close(struct generator * g) { int * p = g->analyser->name_count; g->I[0] = p[t_string]; w(g, "~Nextern void ~pclose_env(struct SN_env * z) { SN_close_env(z, ~I0); }~N~N"); } static void generate_create_and_close_templates(struct generator * g) { w(g, "~N" "extern struct SN_env * ~pcreate_env(void);~N" "extern void ~pclose_env(struct SN_env * z);~N" "~N"); } static void generate_header_file(struct generator * g) { struct name * q; const char * vp = g->options->variables_prefix; g->S[0] = vp; w(g, "#ifdef 
__cplusplus~N" "extern \"C\" {~N" "#endif~N"); /* for C++ */ generate_create_and_close_templates(g); for (q = g->analyser->names; q; q = q->next) { g->V[0] = q; switch (q->type) { case t_external: w(g, "extern int ~W0(struct SN_env * z);~N"); break; case t_string: case t_integer: case t_boolean: if (vp) { int count = q->count; if (count < 0) { /* Unused variables should get removed from `names`. */ fprintf(stderr, "Optimised out variable "); report_b(stderr, q->b); fprintf(stderr, " still in names list\n"); exit(1); } if (q->type == t_boolean) { /* We use a single array for booleans and integers, * with the integers first. */ count += g->analyser->name_count[t_integer]; } g->I[0] = count; g->I[1] = "SIIrxg"[q->type]; w(g, "#define ~S0"); write_b(g, q->b); w(g, " (~c1[~I0])~N"); } break; } } w(g, "~N" "#ifdef __cplusplus~N" "}~N" "#endif~N"); /* for C++ */ w(g, "~N"); } extern void generate_program_c(struct generator * g) { g->outbuf = str_new(); write_start_comment(g, "/* ", " */"); generate_head(g); generate_routine_headers(g); w(g, "#ifdef __cplusplus~N" "extern \"C\" {~N" "#endif~N" "~N"); generate_create_and_close_templates(g); w(g, "~N" "#ifdef __cplusplus~N" "}~N" "#endif~N"); generate_amongs(g); generate_groupings(g); g->declarations = g->outbuf; g->outbuf = str_new(); g->literalstring_count = 0; { struct node * p = g->analyser->program; while (p) { generate(g, p); p = p->right; } } generate_create(g); generate_close(g); output_str(g->options->output_src, g->declarations); str_delete(g->declarations); output_str(g->options->output_src, g->outbuf); str_clear(g->outbuf); write_start_comment(g, "/* ", " */"); generate_header_file(g); output_str(g->options->output_h, g->outbuf); str_delete(g->outbuf); } /* Generator functions common to multiple languages. 
*/ extern struct generator * create_generator(struct analyser * a, struct options * o) { NEW(generator, g); g->analyser = a; g->options = o; g->margin = 0; g->debug_count = 0; g->copy_from_count = 0; g->line_count = 0; g->line_labelled = 0; g->failure_label = -1; g->unreachable = false; #ifndef DISABLE_PYTHON g->max_label = 0; #endif return g; } extern void close_generator(struct generator * g) { FREE(g); } /* Write routines for simple entities */ extern void write_char(struct generator * g, int ch) { str_append_ch(g->outbuf, ch); /* character */ } extern void write_newline(struct generator * g) { str_append_ch(g->outbuf, '\n'); /* newline */ g->line_count++; } extern void write_string(struct generator * g, const char * s) { str_append_string(g->outbuf, s); } extern void write_int(struct generator * g, int i) { str_append_int(g->outbuf, i); } extern void write_b(struct generator * g, symbol * b) { str_append_b(g->outbuf, b); } extern void write_str(struct generator * g, struct str * str) { str_append(g->outbuf, str); } snowball-2.2.0/compiler/generator_ada.c000066400000000000000000001424431414263061200201140ustar00rootroot00000000000000#include /* for exit */ #include /* for strlen */ #include /* for fprintf etc */ #include #include #include "header.h" /* prototypes */ static void generate(struct generator * g, struct node * p); static void generate_next(struct generator * g, struct node * p); static void w(struct generator * g, const char * s); static void writef(struct generator * g, const char * s, struct node * p); static int new_label(struct generator * g) { return g->next_label++; } static struct str * vars_newname(struct generator * g) { struct str * output; g->var_number++; output = str_new(); str_append_string(output, "v_"); str_append_int(output, g->var_number); return output; } /* Write routines for items from the syntax tree */ static void write_varname(struct generator * g, struct name * p) { int ch = p->b[0]; if (p->type != t_external) { write_char(g, 
"SBIRXG"[p->type]); write_char(g, '_'); } write_char(g, toupper(ch)); str_append_b_tail(g->outbuf, p->b, 1); ch = p->b[SIZE(p->b) - 1]; if (ch == '_') { write_char(g, 'E'); } } static void write_varref(struct generator * g, struct name * p) { /* reference to variable */ if (p->type < t_routine) write_string(g, "Z."); write_varname(g, p); } static void write_literal_string(struct generator * g, symbol * p) { int i; // Ada supports UTF-8 literal strings, we only need to escape the quote and // special characters. write_char(g, '"'); for (i = 0; i < SIZE(p); i++) { int ch = p[i]; if (ch == '"') { write_string(g, "\"\""); } else if (ch < 32 || ch == 127) { printf("In write_literal_string, can't handle non-graphic character 0x%02x currently\n", (int)p[i]); exit(1); } else if (ch <= 255) { write_char(g, ch); } else { printf("In write_literal_string, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]); exit(1); } } write_char(g, '"'); } static void write_margin(struct generator * g) { int i; for (i = 0; i < g->margin; i++) write_string(g, " "); } /* Write a variable declaration. 
*/ static void write_declare(struct generator * g, char * declaration, struct node * p) { struct str * temp = g->outbuf; g->outbuf = g->declarations; write_string(g, " "); writef(g, declaration, p); write_string(g, ";"); write_newline(g); g->outbuf = temp; } static void write_comment(struct generator * g, struct node * p) { if (g->options->comments) { write_margin(g); write_string(g, "-- "); write_comment_content(g, p); write_newline(g); } } static void write_block_start(struct generator * g) { w(g, "~Mbegin~+~N"); } static void write_block_end(struct generator * g) { /* block end */ w(g, "~-~Mend;~N"); } static void restore_string(struct node * p, struct str * out, struct str * savevar) { str_clear(out); str_append_string(out, "Z.C := "); if (p->mode != m_forward) str_append_string(out, "Z.L - "); str_append(out, savevar); str_append_string(out, ";"); } static void write_savecursor(struct generator * g, struct node * p, struct str * savevar) { g->B[0] = str_data(savevar); g->S[1] = ""; if (p->mode != m_forward) g->S[1] = "Z.L - "; write_declare(g, " ~B0 : Char_Index", p); writef(g, "~M~B0 := ~S1Z.C;~N" , p); } static void write_restorecursor(struct generator * g, struct node * p, struct str * savevar) { write_margin(g); if (p->mode == m_forward) { write_string(g, "Z.C := "); } else { write_string(g, "Z.C := Z.L - "); } write_str(g, savevar); write_string(g, ";"); write_newline(g); } static void wsetl(struct generator * g, int n) { write_newline(g); write_margin(g); write_string(g, "<>"); write_newline(g); g->line_labelled = g->line_count; } static void wgotol(struct generator * g, int n) { write_margin(g); write_string(g, "goto lab"); write_int(g, n); write_string(g, ";"); write_newline(g); } static void write_failure(struct generator * g) { if (str_len(g->failure_str) != 0) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } write_margin(g); switch (g->failure_label) { case x_return: write_string(g, "Result := False;"); write_newline(g); 
write_margin(g); write_string(g, "return;"); break; default: write_string(g, "goto lab"); write_int(g, g->failure_label); write_string(g, ";"); g->label_used = 1; } write_newline(g); g->unreachable = true; } static void write_failure_if(struct generator * g, char * s, struct node * p) { writef(g, "~Mif ", p); writef(g, s, p); writef(g, " then~N~+", p); write_failure(g); writef(g, "~-~Mend if;~N", p); g->unreachable = false; } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { if (p->mode == m_forward) { write_failure_if(g, "Z.C >= Z.L", p); } else { write_failure_if(g, "Z.C <= Z.Lb", p); } } /* Formatted write. */ static void writef(struct generator * g, const char * input, struct node * p) { int i = 0; int l = strlen(input); while (i < l) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } switch (input[i++]) { default: write_char(g, input[i - 1]); continue; case 'C': write_comment(g, p); continue; case 'f': write_failure(g); g->unreachable = false; continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': write_string(g, g->S[input[i++] - '0']); continue; case 'B': write_b(g, g->B[input[i++] - '0']); continue; case 'I': write_int(g, g->I[input[i++] - '0']); continue; case 'V': write_varref(g, g->V[input[i++] - '0']); continue; case 'W': write_varname(g, g->V[input[i++] - '0']); continue; case 'L': write_literal_string(g, g->L[input[i++] - '0']); continue; case '+': g->margin++; continue; case '-': g->margin--; continue; case 'n': write_string(g, g->options->name); continue; } } } static void w(struct generator * g, const char * s) { writef(g, s, 0); } static int need_among_var(struct node *p) { while (p) { if (p->type == c_substring || p->type == c_among) { return 1; } if (p->right && need_among_var(p->right)) { return 1; } p = p->left; } return 0; } static int 
need_among_handler(struct among *a) { int i; struct amongvec * v = a->b; for (i = 0; i < a->literalstring_count; i++, v++) { if (v->function != 0) { return 1; } } return 0; } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case c_number: // Avoid `parentheses required for unary minus` error from gnat. if (p->number < 0) write_char(g, '('); write_int(g, p->number); if (p->number < 0) write_char(g, ')'); break; case c_maxint: write_string(g, "Integer'Last"); break; case c_minint: write_string(g, "Integer'First"); break; case c_neg: write_string(g, "(-"); generate_AE(g, p->right); write_char(g, ')'); break; case c_multiply: s = " * "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; goto label0; case c_divide: s = " / "; label0: write_char(g, '('); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "Z.C"); break; case c_limit: w(g, p->mode == m_forward ? 
"Z.L" : "Z.Lb"); break; case c_len: w(g, "Length_Utf8 (Z)"); break; case c_size: w(g, "Length (Z)"); break; case c_lenof: case c_sizeof: g->V[0] = p->name; w(g, "Length_Utf8 (~V0)"); break; default: break; } } static void generate_bra(struct generator * g, struct node * p) { write_comment(g, p); p = p->left; while (p) { generate(g, p); p = p->right; } } static void generate_and(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) write_savecursor(g, p, savevar); p = p->left; while (p) { generate(g, p); if (g->unreachable) break; if (keep_c && p->right != 0) write_restorecursor(g, p, savevar); p = p->right; } str_delete(savevar); } static void generate_or(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int used = g->label_used; int keep_c = K_needed(g, p->left); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int out_lab = new_label(g); int end_unreachable = true; write_comment(g, p); if (keep_c) write_savecursor(g, p, savevar); p = p->left; str_clear(g->failure_str); if (p == 0) { /* p should never be 0 after an or: there should be at least two * sub nodes. 
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right) { g->failure_label = new_label(g); g->label_used = 0; generate(g, p); if (!g->unreachable) { wgotol(g, out_lab); end_unreachable = false; } if (g->label_used) wsetl(g, g->failure_label); g->unreachable = false; if (keep_c) { write_restorecursor(g, p, savevar); } p = p->right; } g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); wsetl(g, out_lab); if (!end_unreachable) { g->unreachable = false; } str_delete(savevar); } static void generate_backwards(struct generator * g, struct node * p) { writef(g, "~MZ.Lb := Z.C; Z.C := Z.L;~C~N", p); generate(g, p->left); w(g, "~MZ.C := Z.Lb;~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); int a0 = g->failure_label, l; struct str * a1 = str_copy(g->failure_str); write_comment(g, p); if (keep_c) { write_savecursor(g, p, savevar); } g->failure_label = new_label(g); str_clear(g->failure_str); l = g->failure_label; generate(g, p->left); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; if (!g->unreachable) write_failure(g); if (g->label_used) wsetl(g, l); g->unreachable = false; if (keep_c) write_restorecursor(g, p, savevar); str_delete(savevar); } static void generate_try(struct generator * g, struct node * p) { struct str * savevar; int keep_c = K_needed(g, p->left); g->failure_label = new_label(g); g->label_used = 0; str_clear(g->failure_str); write_comment(g, p); if (keep_c) { savevar = vars_newname(g); write_savecursor(g, p, savevar); restore_string(p, g->failure_str, savevar); } generate(g, p->left); if (g->label_used) wsetl(g, g->failure_label); g->unreachable = false; if (keep_c) { str_delete(savevar); } } static void generate_set(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 := True;~N", p); } static 
void generate_unset(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 := False;~N", p); } static void generate_fail(struct generator * g, struct node * p) { write_comment(g, p); generate(g, p->left); if (!g->unreachable) write_failure(g); } /* generate_test() also implements 'reverse' */ static void generate_test(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) { write_savecursor(g, p, savevar); } generate(g, p->left); if (!g->unreachable) { if (keep_c) { write_restorecursor(g, p, savevar); } } str_delete(savevar); } static void generate_do(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) write_savecursor(g, p, savevar); if (p->left->type == c_call) { /* Optimise do */ write_comment(g, p->left); g->V[0] = p->left->name; w(g, "~M~V0 (Z, Result);~N"); } else { g->failure_label = new_label(g); str_clear(g->failure_str); generate(g, p->left); if (g->label_used) wsetl(g, g->failure_label); g->unreachable = false; } if (keep_c) write_restorecursor(g, p, savevar); str_delete(savevar); } static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) { struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_Backward"; g->S[1] = complement ? "In" : "Out"; g->S[2] = g->options->encoding == ENC_UTF8 ? 
"" : ""; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; if (is_goto) { writef(g, "~M~S1_Grouping~S0~S2 (Z, ~V0, ~I0, ~I1, True, C);", p); write_failure_if(g, "C < 0", p); } else { writef(g, "~C" "~M~S1_Grouping~S0~S2 (Z, ~V0, ~I0, ~I1, True, C);~N", p); write_failure_if(g, "C < 0", p); if (p->mode == m_forward) w(g, "~MZ.C := Z.C + C;~N"); else w(g, "~MZ.C := Z.C - C;~N"); } } static void generate_GO(struct generator * g, struct node * p, int style) { int end_unreachable = false; int used = g->label_used; /* Initialise to NULL to suppress bogus "may be used uninitialised" warning. */ struct str * savevar = NULL; int keep_c = style == 1 || repeat_restore(g, p->left); int a0 = g->failure_label; int golab = new_label(g); if (p->left->type == c_grouping || p->left->type == c_non) { /* Special case for "goto" or "gopast" when used on a grouping or an * inverted grouping - the movement of c by the matching action is * exactly what we want! */ #ifdef OPTIMISATION_WARNINGS printf("Optimising %s %s\n", style ? "goto" : "gopast", p->left->type == c_non ? 
"non" : "grouping"); #endif if (g->options->comments) { writef(g, "~M~C", p); } generate_GO_grouping(g, p->left, style, p->left->type == c_non); return; } write_comment(g, p); w(g, "~Mloop~N~+"); if (keep_c) { savevar = vars_newname(g); write_savecursor(g, p, savevar); } g->failure_label = new_label(g); g->label_used = 0; str_clear(g->failure_str); generate(g, p->left); if (g->unreachable) { /* Cannot break out of this loop: therefore the code after the * end of the loop is unreachable.*/ end_unreachable = true; } else { /* include for goto; omit for gopast */ if (style == 1) write_restorecursor(g, p, savevar); g->I[0] = golab; w(g, "~Mexit;~N"); } g->unreachable = false; if (g->label_used) wsetl(g, g->failure_label); if (keep_c) { write_restorecursor(g, p, savevar); str_delete(savevar); } g->label_used = used; g->failure_label = a0; write_check_limit(g, p); generate_next(g, p); g->I[0] = golab; w(g, "~-~Mend loop;~N"); g->unreachable = end_unreachable; } static void generate_loop(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); g->B[0] = str_data(loopvar); write_declare(g, " ~B0 : Integer", p); w(g, "~MFor ~B0 := "); generate_AE(g, p->AE); writef(g, " DownTo 1 Do~N", p); writef(g, "~{", p); generate(g, p->left); w(g, "~}"); str_delete(loopvar); g->unreachable = false; } static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) { struct str * savevar = vars_newname(g); int keep_c = repeat_restore(g, p->left); int replab = new_label(g); g->I[0] = replab; wsetl(g, replab); writef(g, "~N~Mloop~N~+", p); if (keep_c) write_savecursor(g, p, savevar); g->failure_label = new_label(g); g->label_used = 0; generate(g, p->left); if (!g->unreachable) { if (loopvar != 0) { g->B[0] = str_data(loopvar); w(g, "~M~B0 := ~B0 - 1;~N"); } g->I[0] = replab; w(g, "~Mgoto lab~I0;~N"); } if (g->label_used) wsetl(g, g->failure_label); g->unreachable = false; if (keep_c) write_restorecursor(g, p, 
savevar); w(g, "~N~Mexit;~N~-~Mend loop;~N"); str_delete(savevar); } static void generate_repeat(struct generator * g, struct node * p) { write_comment(g, p); generate_repeat_or_atleast(g, p, NULL); } static void generate_atleast(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); w(g, "~{"); g->B[0] = str_data(loopvar); write_declare(g, " ~B0 : Integer", p); w(g, "~M~B0 := "); generate_AE(g, p->AE); w(g, ";~N"); { int a0 = g->failure_label; generate_repeat_or_atleast(g, p, loopvar); g->failure_label = a0; } g->B[0] = str_data(loopvar); write_failure_if(g, "~B0 > 0", p); w(g, "~}"); str_delete(loopvar); } static void generate_setmark(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 := Z.C;~N", p); } static void generate_tomark(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? ">" : "<"; w(g, "~Mif Z.C ~S0 "); generate_AE(g, p->AE); w(g, " then~N"); write_failure(g); w(g, "~Mend if;~N"); g->unreachable = false; w(g, "~MZ.C := "); generate_AE(g, p->AE); writef(g, ";~N", p); } static void generate_atmark(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mif Z.C /= "); generate_AE(g, p->AE); writef(g, " then~N~+", p); write_failure(g); w(g, "~-~Mend if;~N"); g->unreachable = false; } static void generate_hop(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? 
"" : "_Backward"; if (g->options->encoding == ENC_UTF8) { w(g, "~MC := Skip_Utf8~S0 (Z, "); generate_AE(g, p->AE); writef(g, ");~C~N", p); write_failure_if(g, "C < 0", p); } else { w(g, "~MC := Z.C ~S0 "); generate_AE(g, p->AE); writef(g, ";~C~N", p); if (p->mode == m_forward) { write_failure_if(g, "C > Z.L or C < Z.C", p); } else { write_failure_if(g, "C < Z.Lb or C > Z.C", p); } } writef(g, "~MZ.C := C;~N", p); } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~MSlice_Del (Z);~N", p); } static void generate_next(struct generator * g, struct node * p) { write_comment(g, p); if (p->mode == m_forward) w(g, "~MC := Skip_Utf8 (Z);~N"); else w(g, "~MC := Skip_Utf8_Backward (Z);~N"); write_failure_if(g, "C < 0", p); w(g, "~MZ.C := C;~N"); } static void generate_tolimit(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? "" : "b"; writef(g, "~MZ.C := Z.L~S0;~C~N", p); } static void generate_atlimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "b"; g->S[1] = p->mode == m_forward ? "<" : ">"; write_failure_if(g, "Z.C ~S1 Z.L~S0", p); } static void generate_leftslice(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? "Bra" : "Ket"; writef(g, "~MZ.~S0 := Z.C;~C~N", p); } static void generate_rightslice(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? 
"Ket" : "Bra"; writef(g, "~MZ.~S0 := Z.C;~C~N", p); } static void generate_assignto(struct generator * g, struct node * p) { g->V[0] = p->name; writef(g, "~M~V0 := Assign_To (Z, ~V0);~C~N", p); write_failure_if(g, "~V0 == 0", p); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 := Ada.Strings.Unbounded.To_Unbounded_String (Slice_To (Z));~N", p); } static void generate_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != 0) { write_literal_string(g, b); } else { write_varname(g, p->name); } } static void generate_insert(struct generator * g, struct node * p, int style) { int keep_c = style == c_attach; write_comment(g, p); if (p->mode == m_backward) keep_c = !keep_c; if (keep_c) w(g, "~MC := Z.C;~N"); writef(g, "~MInsert (Z, Z.C, Z.C, ", p); generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~MZ.C := C;~N"); } static void generate_assignfrom(struct generator * g, struct node * p) { int keep_c = p->mode == m_forward; /* like 'attach' */ write_comment(g, p); if (keep_c) writef(g, "~MC := Z.C;~N", p); if (p->mode == m_forward) { writef(g, "~MInsert (Z, Z.C, Z.L, ", p); } else { writef(g, "~MInsert (Z, Z.Lb, Z.C, ", p); } generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~MZ.C := C;~N"); } static void generate_slicefrom(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~MSlice_From (Z, "); generate_address(g, p); writef(g, ");~N", p); } static void generate_setlimit(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); struct str * varname = vars_newname(g); g->B[0] = str_data(varname); write_declare(g, " ~B0 : Integer", p); if (p->left && p->left->type == c_tomark) { /* Special case for: * * setlimit tomark AE for C * * All uses of setlimit in the current stemmers we ship follow this * pattern, and by special-casing we can avoid having to save and * restore c. 
*/ struct node * q = p->left; ++g->keep_count; g->S[0] = q->mode == m_forward ? ">" : "<"; w(g, "~Mif Z.C ~S0 "); generate_AE(g, q->AE); writef(g, " then~N~+", q); w(g, "~MResult := False;~N"); w(g, "~Mreturn;~-~N"); w(g, "~Mend if;~N"); w(g, "~M~B0"); g->unreachable = false; if (p->mode == m_forward) { w(g, " := Z.L - Z.C; Z.L := "); } else { w(g, " := Z.Lb; Z.Lb := "); } generate_AE(g, q->AE); w(g, ";~N"); if (p->mode == m_forward) { str_assign(g->failure_str, "Z.L := Z.L + "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "Z.Lb := "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } else { write_savecursor(g, p, savevar); generate(g, p->left); if (!g->unreachable) { g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~M~B0 := Z.L - Z.C;~N"); w(g, "~MZ.L := Z.C;~N"); } else { w(g, "~M~B0 := Z.Lb;~N"); w(g, "~MZ.Lb := Z.C;~N"); } write_restorecursor(g, p, savevar); if (p->mode == m_forward) { str_assign(g->failure_str, "Z.L := Z.L + "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "Z.Lb := "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } } if (!g->unreachable) { generate(g, p->aux); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } } str_delete(varname); str_delete(savevar); } /* dollar sets snowball up to operate on a string variable as if it were the * current string */ static void generate_dollar(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); g->B[0] = str_data(savevar); write_comment(g, p); g->V[0] = p->name; { struct str * saved_output = g->outbuf; str_clear(g->failure_str); g->outbuf = g->failure_str; writef(g, "~V0 := FCurrent; " "FCurrent := ~B0_Current; " "FCursor := ~B0_Cursor; " "FLimit := ~B0_Limit; " "FBkLimit := ~B0_BkLimit; " "FBra := ~B0_Bra; " "FKet := ~B0_Ket;", p); 
g->failure_str = g->outbuf; g->outbuf = saved_output; } write_declare(g, "~B0_Current : AnsiString", p); write_declare(g, "~B0_Cursor : Integer", p); write_declare(g, "~B0_Limit : Integer", p); write_declare(g, "~B0_BkLimit : Integer", p); write_declare(g, "~B0_Bra : Integer", p); write_declare(g, "~B0_Ket : Integer", p); writef(g, "~{" "~M~B0_Current := FCurrent;~N" "{ ~M~B0_Current := Copy(FCurrent, 1, FLimit); }~N" "~M~B0_Cursor := FCursor;~N" "~M~B0_Limit := FLimit;~N" "~M~B0_BkLimit := FBkLimit;~N" "~M~B0_Bra := FBra;~N" "~M~B0_Ket := FKet;~N" "~MFCurrent := ~V0;~N" "~MFCursor := 0;~N" "~MFLimit := Length(current);~N", p); generate(g, p->left); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } w(g, "~}"); str_delete(savevar); } static void generate_integer_assign(struct generator * g, struct node * p, char * s) { g->V[0] = p->name; w(g, "~M~V0 := "); if (s != 0) { g->S[0] = s; w(g, "~V0 ~S0 "); } generate_AE(g, p->AE); w(g, ";~N"); } static void generate_integer_test(struct generator * g, struct node * p, char * s) { w(g, "~Mif not ("); generate_AE(g, p->left); write_char(g, ' '); write_string(g, s); write_char(g, ' '); generate_AE(g, p->AE); w(g, ") then~+~N"); write_failure(g); w(g, "~-~Mend if;~N"); g->unreachable = false; } static void generate_integer_function(struct generator * g, struct node * p, char * s) { w(g, "~MResult := ("); generate_AE(g, p->left); write_char(g, ' '); write_string(g, s); write_char(g, ' '); generate_AE(g, p->AE); w(g, ");~N"); g->unreachable = false; } static void generate_call(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 (Z, Result);~N", p); write_failure_if(g, "not Result", p); } static void generate_grouping(struct generator * g, struct node * p, int complement) { struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_Backward"; g->S[1] = complement ? 
"Out_" : "In_"; g->S[2] = g->options->encoding == ENC_UTF8 ? "" : ""; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; writef(g, "~M~S1Grouping~S0~S2 (Z, ~V0, ~I0, ~I1, False, C);~N", p); write_failure_if(g, "C /= 0", p); } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_Backward"; g->V[0] = p->name; writef(g, "~MC := Eq_S~S0 (Z, Ada.Strings.Unbounded.To_String (~V0));", p); write_failure_if(g, "C = 0", p); } static void generate_literalstring(struct generator * g, struct node * p) { symbol * b = p->literalstring; write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_Backward"; g->L[0] = b; writef(g, "~MC := Eq_S~S0 (Z, ~L0);~N", p); write_failure_if(g, "C = 0", p); if (p->mode == m_forward) { writef(g, "~MZ.C := Z.C + C;~N", p); } else { writef(g, "~MZ.C := Z.C - C;~N", p); } } static void generate_define(struct generator * g, struct node * p) { struct str *saved_output; struct str *saved_declarations; /* Generate function header. */ g->V[0] = p->name; w(g, "~N~Mprocedure ~W0 (Z : in out Context_Type; Result : out Boolean) is~N"); /* Save output*/ saved_output = g->outbuf; saved_declarations = g->declarations; g->outbuf = str_new(); g->declarations = str_new(); g->next_label = 0; g->var_number = 0; g->failure_label = x_return; g->unreachable = false; /* Generate function body. 
*/ w(g, "~{"); switch (p->left->type) { case c_eq: generate_integer_function(g, p->left, "="); break; case c_ne: generate_integer_function(g, p->left, "/="); break; case c_gr: generate_integer_function(g, p->left, ">"); break; case c_ge: generate_integer_function(g, p->left, ">="); break; case c_ls: generate_integer_function(g, p->left, "<"); break; case c_le: generate_integer_function(g, p->left, "<="); break; default: generate(g, p->left); if (!g->unreachable) w(g, "~N~MResult := True;~N"); str_append_string(saved_output, " C : Result_Index;\n"); if (need_among_var(p->left) || 1) { str_append_string(saved_output, " A : Integer;\n"); } break; } g->V[0] = p->name; w(g, "~-~Mend ~W0;~N"); if (g->var_number) { str_append(saved_output, g->declarations); } str_append(saved_output, g->outbuf); str_delete(g->declarations); str_delete(g->outbuf); g->declarations = saved_declarations; g->outbuf = saved_output; } static void generate_substring(struct generator * g, struct node * p) { struct among * x = p->among; int block = -1; unsigned int bitmap = 0; struct amongvec * among_cases = x->b; int c; int empty_case = -1; int n_cases = 0; symbol cases[2]; int shortest_size = INT_MAX; int call_done = 0; int need_handler = need_among_handler(x); write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_Backward"; g->I[0] = x->number; /* In forward mode with non-ASCII UTF-8 characters, the first character * of the string will often be the same, so instead look at the last * common character position. * * In backward mode, we can't match if there are fewer characters before * the current position than the minimum length. 
*/ for (c = 0; c < x->literalstring_count; ++c) { int size = among_cases[c].size; if (size != 0 && size < shortest_size) { shortest_size = size; } } for (c = 0; c < x->literalstring_count; ++c) { symbol ch; if (among_cases[c].size == 0) { empty_case = c; continue; } if (p->mode == m_forward) { ch = among_cases[c].b[shortest_size - 1]; } else { ch = among_cases[c].b[among_cases[c].size - 1]; } if (n_cases == 0) { block = ch >> 5; } else if (ch >> 5 != block) { block = -1; if (n_cases > 2) break; } if (block == -1) { if (n_cases > 0 && ch == cases[0]) continue; if (n_cases < 2) { cases[n_cases++] = ch; } else if (ch != cases[1]) { ++n_cases; break; } } else { if ((bitmap & (1u << (ch & 0x1f))) == 0) { bitmap |= 1u << (ch & 0x1f); if (n_cases < 2) cases[n_cases] = ch; ++n_cases; } } } if (block != -1 || n_cases <= 2) { char buf[64]; char buf2[128]; char buf3[64]; g->I[2] = block; g->I[3] = bitmap; g->I[4] = shortest_size - 1; g->S[3] = buf3; snprintf(buf3, sizeof(buf3), "16#%x#", bitmap); if (p->mode == m_forward) { if (shortest_size == 1) sprintf(buf, "Z.C"); else sprintf(buf, "Z.C + %d", shortest_size - 1); snprintf(buf2, sizeof(buf2), "Character'Pos (Z.P (%s + 1))", buf); g->S[1] = buf; g->S[2] = buf2; if (shortest_size == 1) { writef(g, "~Mif Z.C >= Z.L", p); } else { writef(g, "~Mif Z.C + ~I4 >= Z.L", p); } } else { g->S[1] = "Z.C - 1"; g->S[2] = "Character'Pos (Z.P (Z.C))"; if (shortest_size == 1) { writef(g, "~Mif Z.C <= Z.Lb", p); } else { writef(g, "~Mif Z.C - ~I4 <= Z.Lb", p); } } if (n_cases == 0) { /* We get this for the degenerate case: among ( '' ) * This doesn't seem to be a useful construct, but it is * syntactically valid. 
*/ } else if (n_cases == 1) { g->I[4] = cases[0]; writef(g, " or else ~S2 /= ~I4", p); } else if (n_cases == 2) { g->I[4] = cases[0]; g->I[5] = cases[1]; writef(g, " or else (~S2 /= ~I4 and then ~S2 /= ~I5)", p); } else { writef(g, " or else Check_Among (Z, ~S1, ~I2, ~S3)", p); } writef(g, " then~+~N", p); if (empty_case != -1) { /* If the among includes the empty string, it can never fail * so not matching the bitmap means we match the empty string. */ g->I[4] = among_cases[empty_case].result; writef(g, "~MA := ~I4;~-~N~Melse~+~C", p); if (need_handler) { writef(g, "~MFind_Among~S0 (Z, A_~I0, Among_String, Among_Handler'Access, A);~N", p); } else { writef(g, "~MFind_Among~S0 (Z, A_~I0, Among_String, null, A);~N", p); } write_failure_if(g, "A = 0", p); call_done = 1; } else { writef(g, "~f~C", p); } writef(g, "~-~Mend if;~N", p); } else { #ifdef OPTIMISATION_WARNINGS printf("Couldn't shortcut among %d\n", x->number); #endif } if (!call_done) { if (need_handler) { writef(g, "~MFind_Among~S0 (Z, A_~I0, Among_String, Among_Handler'Access, A);~N", p); } else { writef(g, "~MFind_Among~S0 (Z, A_~I0, Among_String, null, A);~N", p); } write_failure_if(g, "A = 0", p); } } static void generate_among(struct generator * g, struct node * p) { struct among * x = p->among; if (x->substring == 0) generate_substring(g, p); if (x->starter != 0) generate(g, x->starter); if (x->command_count == 1 && x->nocommand_count == 0) { /* Only one outcome ("no match" already handled). 
*/ generate(g, x->commands[0]); } else if (x->command_count > 0) { int i; write_comment(g, p); w(g, "~Mcase A is~N~+"); for (i = 1; i <= x->command_count; i++) { g->I[0] = i; w(g, "~Mwhen ~I0 =>~N"); g->margin++; generate(g, x->commands[i - 1]); g->margin--; g->unreachable = false; } w(g, "~Mwhen others =>~N"); w(g, "~M null;~N"); w(g, "~-~Mend case;~N"); } } static void generate_booltest(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; write_failure_if(g, "not ~V0", p); } static void generate_false(struct generator * g, struct node * p) { write_comment(g, p); write_failure(g); } static void generate_debug(struct generator * g, struct node * p) { write_comment(g, p); g->I[0] = g->debug_count++; g->I[1] = p->line_number; writef(g, "~Mdebug(~I0, ~I1);~N", p); } static void generate(struct generator * g, struct node * p) { int a0; struct str * a1; if (g->unreachable) return; a0 = g->failure_label; a1 = str_copy(g->failure_str); switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; case c_or: generate_or(g, p); break; case c_backwards: generate_backwards(g, p); break; case c_not: generate_not(g, p); break; case c_set: generate_set(g, p); break; case c_unset: generate_unset(g, p); break; case c_try: generate_try(g, p); break; case c_fail: generate_fail(g, p); break; case c_reverse: case c_test: generate_test(g, p); break; case c_do: generate_do(g, p); break; case c_goto: generate_GO(g, p, 1); break; case c_gopast: generate_GO(g, p, 0); break; case c_repeat: generate_repeat(g, p); break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_setmark: generate_setmark(g, p); break; case c_tomark: generate_tomark(g, p); break; case c_atmark: generate_atmark(g, p); break; case c_hop: generate_hop(g, p); break; case c_delete: generate_delete(g, p); break; case c_next: generate_next(g, p); break; case c_tolimit: 
generate_tolimit(g, p); break; case c_atlimit: generate_atlimit(g, p); break; case c_leftslice: generate_leftslice(g, p); break; case c_rightslice: generate_rightslice(g, p); break; case c_assignto: generate_assignto(g, p); break; case c_sliceto: generate_sliceto(g, p); break; case c_assign: generate_assignfrom(g, p); break; case c_insert: case c_attach: generate_insert(g, p, p->type); break; case c_slicefrom: generate_slicefrom(g, p); break; case c_setlimit: generate_setlimit(g, p); break; case c_dollar: generate_dollar(g, p); break; case c_mathassign: generate_integer_assign(g, p, NULL); break; case c_plusassign: generate_integer_assign(g, p, "+"); break; case c_minusassign: generate_integer_assign(g, p, "-"); break; case c_multiplyassign:generate_integer_assign(g, p, "*"); break; case c_divideassign: generate_integer_assign(g, p, "/"); break; case c_eq: generate_integer_test(g, p, "="); break; case c_ne: generate_integer_test(g, p, "/="); break; case c_gr: generate_integer_test(g, p, ">"); break; case c_ge: generate_integer_test(g, p, ">="); break; case c_ls: generate_integer_test(g, p, "<"); break; case c_le: generate_integer_test(g, p, "<="); break; case c_call: generate_call(g, p); break; case c_grouping: generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case c_literalstring: generate_literalstring(g, p); break; case c_among: generate_among(g, p); break; case c_substring: generate_substring(g, p); break; case c_booltest: generate_booltest(g, p); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } /* Class declaration generation. 
*/ static void generate_unit_start(struct generator * g) { g->margin = 0; write_start_comment(g, "-- ", NULL); } static void generate_method_decl(struct generator * g, struct name * q) { g->V[0] = q; w(g, "~Mprocedure ~W0 (Z : in out Context_Type; Result : out Boolean);~N"); } static void generate_method_decls(struct generator * g, enum name_types type) { struct name * q; struct among * a = g->analyser->amongs; int need_handler = 0; for (q = g->analyser->names; q; q = q->next) { if ((enum name_types)q->type == type) { generate_method_decl(g, q); } } while (a != 0 && need_handler == 0) { need_handler = need_among_handler(a); a = a->next; } if (need_handler) { w(g, "~N~Mprocedure Among_Handler (Context : in out Stemmer.Context_Type'Class; Operation : in Operation_Index; Result : out Boolean);~N"); } } static int has_string_variable(struct generator * g) { struct name * q; for (q = g->analyser->names; q; q = q->next) { g->V[0] = q; if (q->type == t_string) { return 1; } } return 0; } static void generate_member_decls(struct generator * g) { struct name * q; int count = 0; for (q = g->analyser->names; q; q = q->next) { if (q->type == t_string || q->type == t_integer || q->type == t_boolean) count++; } w(g, " type Context_Type is new Stemmer.Context_Type with"); if (count > 0) { w(g, " record~N~+"); for (q = g->analyser->names; q; q = q->next) { g->V[0] = q; switch (q->type) { case t_string: w(g, "~M~W0 : Ada.Strings.Unbounded.Unbounded_String;~N"); break; case t_integer: w(g, "~M~W0 : Integer;~N"); break; case t_boolean: w(g, "~M~W0 : Boolean;~N"); break; } } w(g, "~-"); w(g, "~- end record;~N"); } else { w(g, " null record;~N"); } } static int generate_among_string(struct generator * g, struct among * x, int count) { int i; struct amongvec * v = x->b; int limit = count == 0 ? 38 : 80; g->I[0] = x->number; for (i = 0; i < x->literalstring_count; i++, v++) { /* Write among's string. 
*/ g->L[0] = v->b; g->I[1] = i; if (count + SIZE(v->b) > limit) { w(g, "~N~M& "); count = 3; limit = 80; } else if (count > 0) { w(g, " & "); } w(g, "~L0"); count += SIZE(v->b) + 5; } return count; } static int generate_among_table(struct generator * g, struct among * x, int start_pos, int *operation) { int i; struct amongvec * v = x->b; g->I[0] = x->number; g->I[1] = x->literalstring_count - 1; w(g, "~MA_~I0 : constant Among_Array_Type (0 .. ~I1) := ~+(~N"); v = x->b; for (i = 0; i < x->literalstring_count; i++, v++) { g->I[1] = start_pos; /* Write among's string position. */ if (x->literalstring_count == 1) { w(g, "~Mothers => (~I1, "); } else { w(g, "~M(~I1, "); } start_pos = start_pos + SIZE(v->b); g->I[1] = start_pos - 1; w(g, "~I1, "); /* Write among's index & result. */ g->I[2] = v->i; w(g, "~I2, "); g->I[2] = v->result; w(g, "~I2, "); /* Write among's handler. */ if (v->function == 0) { w(g, "0)"); } else { *operation = *operation + 1; g->I[1] = *operation; w(g, "~I1)"); } if (i + 1 < x->literalstring_count) { w(g, ",~N"); } } w(g, ");~-~N~N"); return start_pos; } static int generate_amongs(struct generator * g) { struct among * a = g->analyser->amongs; int count; int start_pos; w(g, "~MAmong_String : constant String := ~+"); count = 0; while (a != 0) { count = generate_among_string(g, a, count); a = a->next; } w(g, ";~N~-~N"); int operation = 0; start_pos = 1; a = g->analyser->amongs; while (a != 0) { start_pos = generate_among_table(g, a, start_pos, &operation); a = a->next; } return operation; } static int generate_constructor(struct generator * g) { return generate_amongs(g); } static void generate_methods(struct generator * g) { struct node * p = g->analyser->program; while (p != 0) { generate(g, p); p = p->right; } } static int generate_operations_dispatcher(struct generator * g) { struct among * a = g->analyser->amongs; int i; int operation = 0; w(g, "~N~Mprocedure Among_Handler (Context : in out Stemmer.Context_Type'Class; Operation : in 
Operation_Index; Result : out Boolean) is~N"); w(g, "~Mbegin~+~N~M"); w(g, "case Operation is~+~N~M"); a = g->analyser->amongs; while (a != 0) { struct amongvec * v = a->b; for (i = 0; i < a->literalstring_count; i++, v++) { if (v->function != 0) { operation++; g->I[2] = operation; w(g, "when ~I2 =>~N~M"); g->V[0] = v->function; w(g, " ~W0 (Context_Type (Context), Result);~N~M"); } } a = a->next; } w(g, "when others =>~N~M"); w(g, " Result := False;~-~N~Mend case;~-~N~M"); w(g, "end Among_Handler;~N~-"); return operation; } static void set_bit(symbol * b, int i) { b[i/8] |= 1 << i%8; } static void generate_grouping_table(struct generator * g, struct grouping * q) { int range = q->largest_ch - q->smallest_ch + 1; int size = (range + 7)/ 8; /* assume 8 bits per symbol */ symbol * b = q->b; symbol * map = create_b(size); int i; int count = 0; int need_comma = 0; for (i = 0; i < size; i++) map[i] = 0; /* Using unicode would require revision here */ for (i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); g->V[0] = q->name; g->I[0] = 8 * size - 1; w(g, "~N~M~W0 : constant Grouping_Array (0 .. ~I0) := (~N~+~M"); for (i = 0; i < size; i++) { unsigned char m = map[i]; int j; count++; if (i != 0) { w(g, ",~N~M"); need_comma = 0; } for (j = 0; j < 8; j++) { if (need_comma) w(g, ", "); if (m & (1 << j)) { w(g, "True"); } else { w(g, "False"); } need_comma = 1; } } w(g, "~N~-~M);~N"); lose_b(map); } static void generate_groupings(struct generator * g) { struct grouping * q; for (q = g->analyser->groupings; q; q = q->next) { if (q->name->used) generate_grouping_table(g, q); } } extern void generate_program_ada(struct generator * g) { g->outbuf = str_new(); g->failure_str = str_new(); generate_unit_start(g); /* generate implementation. 
*/ w(g, "package body Stemmer."); w(g, g->options->package); w(g, " is~N~+~N"); w(g, "~Mpragma Style_Checks (\"-mr\");~N"); w(g, "~Mpragma Warnings (Off, \"*variable*is never read and never assigned*\");~N"); w(g, "~Mpragma Warnings (Off, \"*mode could be*instead of*\");~N"); w(g, "~Mpragma Warnings (Off, \"*formal parameter.*is not modified*\");~N"); w(g, "~Mpragma Warnings (Off, \"*this line is too long*\");~N"); w(g, "~Mpragma Warnings (Off, \"*is not referenced*\");~N"); w(g, "~N"); generate_method_decls(g, t_routine); generate_groupings(g); int operations = generate_constructor(g); generate_methods(g); if (operations > 0) { generate_operations_dispatcher(g); } w(g, "end Stemmer."); w(g, g->options->package); w(g, ";~N"); output_str(g->options->output_src, g->outbuf); str_clear(g->outbuf); g->margin = 0; write_start_comment(g, "-- ", NULL); if (has_string_variable(g)) { w(g, "private with Ada.Strings.Unbounded;~N"); } w(g, "package Stemmer."); w(g, g->options->package); w(g, " with SPARK_Mode is~N~+"); w(g, " type Context_Type is new Stemmer.Context_Type with private;~N"); w(g, " procedure Stem (Z : in out Context_Type; Result : out Boolean);~N"); w(g, "private~N"); generate_member_decls(g); w(g, "end Stemmer."); w(g, g->options->package); w(g, ";~N"); output_str(g->options->output_h, g->outbuf); str_delete(g->failure_str); str_delete(g->outbuf); } snowball-2.2.0/compiler/generator_csharp.c000066400000000000000000001071701414263061200206450ustar00rootroot00000000000000 #include /* for exit */ #include /* for strlen */ #include /* for fprintf etc */ #include "header.h" /* prototypes */ static void generate(struct generator * g, struct node * p); static void w(struct generator * g, const char * s); static void writef(struct generator * g, const char * s, struct node * p); static int new_label(struct generator * g) { return g->next_label++; } static struct str * vars_newname(struct generator * g) { struct str * output; g->var_number++; output = str_new(); 
str_append_string(output, "c"); str_append_int(output, g->var_number); return output; } /* Write routines for items from the syntax tree */ static void write_varname(struct generator * g, struct name * p) { int ch = "SBIrxg"[p->type]; if (p->type != t_external) { write_char(g, ch); write_char(g, '_'); } write_b(g, p->b); } static void write_varref(struct generator * g, struct name * p) { /* In c#, references look just the same */ write_varname(g, p); } static void write_hexdigit(struct generator * g, int n) { write_char(g, n < 10 ? n + '0' : n - 10 + 'A'); } static void write_hex(struct generator * g, int ch) { write_string(g, "\\u"); { int i; for (i = 12; i >= 0; i -= 4) write_hexdigit(g, ch >> i & 0xf); } } static void write_literal_string(struct generator * g, symbol * p) { int i; write_string(g, "\""); for (i = 0; i < SIZE(p); i++) { int ch = p[i]; if (32 <= ch && ch < 127) { if (ch == '\"' || ch == '\\') write_string(g, "\\"); write_char(g, ch); } else { write_hex(g, ch); } } write_string(g, "\""); } static void write_margin(struct generator * g) { int i; for (i = 0; i < g->margin; i++) write_string(g, " "); } static void write_comment(struct generator * g, struct node * p) { if (!g->options->comments) return; write_margin(g); write_string(g, "// "); write_comment_content(g, p); write_newline(g); } static void write_block_start(struct generator * g) { w(g, "~M{~+~N"); } static void write_block_end(struct generator * g) /* block end */ { w(g, "~-~M}~N"); } static void write_savecursor(struct generator * g, struct node * p, struct str * savevar) { g->B[0] = str_data(savevar); g->S[1] = ""; if (p->mode != m_forward) g->S[1] = "limit - "; writef(g, "~Mint ~B0 = ~S1cursor;~N", p); } static void restore_string(struct node * p, struct str * out, struct str * savevar) { str_clear(out); str_append_string(out, "cursor = "); if (p->mode != m_forward) str_append_string(out, "limit - "); str_append(out, savevar); str_append_string(out, ";"); } static void 
write_restorecursor(struct generator * g, struct node * p, struct str * savevar) { struct str * temp = str_new(); write_margin(g); restore_string(p, temp, savevar); write_str(g, temp); write_newline(g); str_delete(temp); } static void write_inc_cursor(struct generator * g, struct node * p) { write_margin(g); write_string(g, p->mode == m_forward ? "cursor++;" : "cursor--;"); write_newline(g); } static void wsetl(struct generator * g, int n) { w(g, "~-~Mlab~+"); write_int(g, n); w(g, ": ; ~N"); } static void wgotol(struct generator * g, int n) { write_margin(g); write_string(g, "goto lab"); write_int(g, n); write_string(g, ";"); write_newline(g); } static void write_failure(struct generator * g) { if (str_len(g->failure_str) != 0) { write_block_start(g); write_margin(g); write_str(g, g->failure_str); write_newline(g); } write_margin(g); switch (g->failure_label) { case x_return: write_string(g, "return false;"); break; default: write_string(g, "goto lab"); write_int(g, g->failure_label); write_string(g, ";"); g->label_used = 1; } write_newline(g); if (str_len(g->failure_str) != 0) write_block_end(g); } static void write_failure_if(struct generator * g, char * s, struct node * p) { writef(g, "~Mif (", p); writef(g, s, p); writef(g, ")~N", p); write_block_start(g); write_failure(g); write_block_end(g); } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { if (p->mode == m_forward) { write_failure_if(g, "cursor >= limit", p); } else { write_failure_if(g, "cursor <= limit_backward", p); } } /* Formatted write. 
*/ static void writef(struct generator * g, const char * input, struct node * p) { int i = 0; int l = strlen(input); while (i < l) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } switch (input[i++]) { default: write_char(g, input[i - 1]); continue; case 'C': write_comment(g, p); continue; case 'f': write_block_start(g); write_failure(g); write_block_end(g); continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': write_string(g, g->S[input[i++] - '0']); continue; case 'B': write_b(g, g->B[input[i++] - '0']); continue; case 'I': write_int(g, g->I[input[i++] - '0']); continue; case 'V': write_varref(g, g->V[input[i++] - '0']); continue; case 'W': write_varname(g, g->V[input[i++] - '0']); continue; case 'L': write_literal_string(g, g->L[input[i++] - '0']); continue; case '+': g->margin++; continue; case '-': g->margin--; continue; case 'n': write_string(g, g->options->name); continue; } } } static void w(struct generator * g, const char * s) { writef(g, s, 0); } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case c_number: write_int(g, p->number); break; case c_maxint: write_string(g, "int.MaxValue"); break; case c_minint: write_string(g, "int.MinValue"); break; case c_neg: write_char(g, '-'); generate_AE(g, p->right); break; case c_multiply: s = " * "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; goto label0; case c_divide: s = " / "; label0: write_char(g, '('); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "cursor"); break; case c_limit: w(g, p->mode == m_forward ? "limit" : "limit_backward"); break; case c_lenof: /* Same as sizeof() for C#. 
*/ case c_sizeof: g->V[0] = p->name; w(g, "~V0.Length"); break; case c_len: /* Same as size() for C#. */ case c_size: w(g, "current.Length"); break; } } static void generate_bra(struct generator * g, struct node * p) { write_comment(g, p); p = p->left; while (p) { generate(g, p); p = p->right; } } static void generate_and(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) write_savecursor(g, p, savevar); p = p->left; while (p) { generate(g, p); if (keep_c && p->right != 0) write_restorecursor(g, p, savevar); p = p->right; } str_delete(savevar); } static void generate_or(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int out_lab = new_label(g); write_comment(g, p); if (keep_c) { write_block_start(g); write_savecursor(g, p, savevar); } p = p->left; str_clear(g->failure_str); if (p == 0) { /* p should never be 0 after an or: there should be at least two * sub nodes. 
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right != 0) { g->failure_label = new_label(g); g->label_used = 0; generate(g, p); wgotol(g, out_lab); if (g->label_used) wsetl(g, g->failure_label); if (keep_c) write_restorecursor(g, p, savevar); p = p->right; } g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); if (keep_c) write_block_end(g); wsetl(g, out_lab); str_delete(savevar); } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mlimit_backward = cursor;~N" "~Mcursor = limit;~N", p); generate(g, p->left); w(g, "~Mcursor = limit_backward;~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); write_comment(g, p); if (keep_c) { write_block_start(g); write_savecursor(g, p, savevar); } g->failure_label = new_label(g); g->label_used = 0; str_clear(g->failure_str); generate(g, p->left); { int l = g->failure_label; int u = g->label_used; g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; write_failure(g); if (u) wsetl(g, l); } if (keep_c) { write_restorecursor(g, p, savevar); write_block_end(g); } str_delete(savevar); } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) { write_block_start(g); write_savecursor(g, p, savevar); } g->failure_label = new_label(g); g->label_used = 0; str_clear(g->failure_str); if (keep_c) restore_string(p, g->failure_str, savevar); generate(g, p->left); if (g->label_used) wsetl(g, g->failure_label); if (keep_c) write_block_end(g); str_delete(savevar); } static void generate_set(struct generator * g, struct node * p) { write_comment(g, p); 
g->V[0] = p->name; writef(g, "~M~V0 = true;~N", p); } static void generate_unset(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = false;~N", p); } static void generate_fail(struct generator * g, struct node * p) { write_comment(g, p); generate(g, p->left); write_failure(g); } /* generate_test() also implements 'reverse' */ static void generate_test(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) { write_block_start(g); write_savecursor(g, p, savevar); } generate(g, p->left); if (keep_c) { write_restorecursor(g, p, savevar); write_block_end(g); } str_delete(savevar); } static void generate_do(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) { write_block_start(g); write_savecursor(g, p, savevar); } if (p->left->type == c_call) { /* Optimise do */ write_comment(g, p->left); g->V[0] = p->left->name; w(g, "~M~V0();~N"); } else { g->failure_label = new_label(g); g->label_used = 0; str_clear(g->failure_str); generate(g, p->left); if (g->label_used) wsetl(g, g->failure_label); } if (keep_c) { write_restorecursor(g, p, savevar); write_block_end(g); } str_delete(savevar); } static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) { struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? 
"in" : "out"; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; if (is_goto) { w(g, "~Mif (~S1_grouping~S0(~V0, ~I0, ~I1, true) < 0)~N~f~N"); } else { w(g, "~{~N" "~Mint ret = ~S1_grouping~S0(~V0, ~I0, ~I1, true);~N" "~Mif (ret < 0)~N~f~N"); if (p->mode == m_forward) w(g, "~Mcursor += ret;~N"); else w(g, "~Mcursor -= ret;~N"); w(g, "~}"); } } static void generate_GO(struct generator * g, struct node * p, int style) { struct str * savevar = vars_newname(g); int keep_c = style == 1 || repeat_restore(g, p->left); int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); if (p->left->type == c_grouping || p->left->type == c_non) { /* Special case for "goto" or "gopast" when used on a grouping or an * inverted grouping - the movement of c by the matching action is * exactly what we want! */ #ifdef OPTIMISATION_WARNINGS printf("Optimising %s %s\n", style ? "goto" : "gopast", p->left->type == c_non ? "non" : "grouping"); #endif write_comment(g, p); generate_GO_grouping(g, p->left, style, p->left->type == c_non); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; str_delete(savevar); return; } w(g, "~Mwhile (true)~N~{"); write_comment(g, p); if (keep_c) write_savecursor(g, p, savevar); g->failure_label = new_label(g); g->label_used = 0; str_clear(g->failure_str); generate(g, p->left); if (style == 1) { /* include for goto; omit for gopast */ write_restorecursor(g, p, savevar); } w(g, "~Mbreak;~N"); if (g->label_used) wsetl(g, g->failure_label); if (keep_c) write_restorecursor(g, p, savevar); g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; write_check_limit(g, p); write_inc_cursor(g, p); str_delete(savevar); w(g, "~}"); } static void generate_loop(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); g->B[0] = str_data(loopvar); w(g, "~Mfor (int ~B0 = "); generate_AE(g, p->AE); g->B[0] = 
str_data(loopvar); writef(g, "; ~B0 > 0; ~B0--)~N", p); write_block_start(g); generate(g, p->left); write_block_end(g); str_delete(loopvar); } static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * atleast_case) { struct str * savevar = vars_newname(g); int keep_c = repeat_restore(g, p->left); writef(g, "~Mwhile (true)~N~{", p); if (keep_c) write_savecursor(g, p, savevar); g->failure_label = new_label(g); g->label_used = 0; str_clear(g->failure_str); generate(g, p->left); if (atleast_case != 0) { g->B[0] = str_data(atleast_case); w(g, "~M~B0--;~N"); } w(g, "~Mcontinue;~N"); if (g->label_used) wsetl(g, g->failure_label); if (keep_c) write_restorecursor(g, p, savevar); w(g, "~Mbreak;~N~}"); str_delete(savevar); } static void generate_repeat(struct generator * g, struct node * p) { write_comment(g, p); generate_repeat_or_atleast(g, p, NULL); } static void generate_atleast(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); w(g, "~{"); g->B[0] = str_data(loopvar); w(g, "~Mint ~B0 = "); generate_AE(g, p->AE); w(g, ";~N"); { int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); generate_repeat_or_atleast(g, p, loopvar); g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } g->B[0] = str_data(loopvar); write_failure_if(g, "~B0 > 0", p); w(g, "~}"); str_delete(loopvar); } static void generate_setmark(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = cursor;~N", p); } static void generate_tomark(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
">" : "<"; w(g, "~Mif (cursor ~S0 "); generate_AE(g, p->AE); w(g, ")~N"); write_block_start(g); write_failure(g); write_block_end(g); w(g, "~Mcursor = "); generate_AE(g, p->AE); writef(g, ";~N", p); } static void generate_atmark(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mif (cursor != "); generate_AE(g, p->AE); writef(g, ")~N", p); write_block_start(g); write_failure(g); write_block_end(g); } static void generate_hop(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "+" : "-"; w(g, "~{~Mint c = cursor ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); g->S[1] = p->mode == m_forward ? "> limit" : "< limit_backward"; g->S[2] = p->mode == m_forward ? "<" : ">"; if (p->AE->type == c_number) { // Constant distance hop. // // No need to check for negative hop as that's converted to false by // the analyser. write_failure_if(g, "c ~S1", p); } else { write_failure_if(g, "c ~S1 || c ~S2 cursor", p); } writef(g, "~Mcursor = c;~N", p); writef(g, "~}", p); } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mslice_del();~N", p); } static void generate_next(struct generator * g, struct node * p) { write_comment(g, p); write_check_limit(g, p); write_inc_cursor(g, p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "limit" : "limit_backward"; writef(g, "~Mcursor = ~S0;~N", p); } static void generate_atlimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "limit" : "limit_backward"; g->S[1] = p->mode == m_forward ? "<" : ">"; write_failure_if(g, "cursor ~S1 ~S0", p); } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"bra" : "ket"; writef(g, "~M~S0 = cursor;~N", p); } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "ket" : "bra"; writef(g, "~M~S0 = cursor;~N", p); } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~Massign_to(~V0);~N", p); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~Mslice_to(~V0);~N", p); } static void generate_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != 0) { write_literal_string(g, b); } else { write_varref(g, p->name); } } static void generate_insert(struct generator * g, struct node * p, int style) { int keep_c = style == c_attach; write_comment(g, p); if (p->mode == m_backward) keep_c = !keep_c; if (keep_c) w(g, "~{~Mint c = cursor;~N"); writef(g, "~Minsert(cursor, cursor, ", p); generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~Mcursor = c;~N~}"); } static void generate_assignfrom(struct generator * g, struct node * p) { int keep_c = p->mode == m_forward; /* like 'attach' */ write_comment(g, p); if (keep_c) writef(g, "~{~Mint c = cursor;~N", p); if (p->mode == m_forward) { writef(g, "~Minsert(cursor, limit, ", p); } else { writef(g, "~Minsert(limit_backward, cursor, ", p); } generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~Mcursor = c;~N~}"); } static void generate_slicefrom(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mslice_from("); generate_address(g, p); writef(g, ");~N", p); } static void generate_setlimit(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); struct str * varname = vars_newname(g); write_comment(g, p); if (p->left && p->left->type == c_tomark) { /* Special case for: * * setlimit tomark AE for C * * All uses of setlimit in the current stemmers we ship follow this * pattern, and by 
special-casing we can avoid having to save and * restore c. */ struct node * q = p->left; g->S[0] = q->mode == m_forward ? ">" : "<"; w(g, "~Mif (cursor ~S0 "); generate_AE(g, q->AE); w(g, ")~N"); write_block_start(g); write_failure(g); write_block_end(g); g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~Mint ~B0 = limit - cursor;~N"); w(g, "~Mlimit = "); } else { w(g, "~Mint ~B0 = limit_backward;~N"); w(g, "~Mlimit_backward = "); } generate_AE(g, q->AE); writef(g, ";~N", q); if (p->mode == m_forward) { str_assign(g->failure_str, "limit += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "limit_backward = "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } else { write_savecursor(g, p, savevar); generate(g, p->left); g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~Mint ~B0 = limit - cursor;~N"); w(g, "~Mlimit = cursor;~N"); } else { w(g, "~Mint ~B0 = limit_backward;~N"); w(g, "~Mlimit_backward = cursor;~N"); } write_restorecursor(g, p, savevar); if (p->mode == m_forward) { str_assign(g->failure_str, "limit += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "limit_backward = "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } generate(g, p->aux); write_margin(g); write_str(g, g->failure_str); write_newline(g); str_delete(varname); str_delete(savevar); } /* dollar sets snowball up to operate on a string variable as if it were the * current string */ static void generate_dollar(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); write_comment(g, p); g->V[0] = p->name; str_assign(g->failure_str, "copy_from("); str_append(g->failure_str, savevar); str_append_string(g->failure_str, ");"); g->B[0] = str_data(savevar); writef(g, "~{~MEnv ~B0 = new Env(this);~N" "~Mcurrent = ~V0;~N" "~Mcursor = 0;~N" "~Mlimit = current.Length;~N", 
p); generate(g, p->left); write_margin(g); write_str(g, g->failure_str); write_newline(g); w(g, "~}"); str_delete(savevar); } static void generate_integer_assign(struct generator * g, struct node * p, char * s) { g->V[0] = p->name; g->S[0] = s; w(g, "~M~V0 ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); } static void generate_integer_test(struct generator * g, struct node * p, char * s) { w(g, "~Mif (!("); generate_AE(g, p->left); write_char(g, ' '); write_string(g, s); write_char(g, ' '); generate_AE(g, p->AE); w(g, "))~N"); write_block_start(g); write_failure(g); write_block_end(g); } static void generate_call(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; w(g, "~Mif (!~V0())~N~+"); write_failure(g); w(g, "~-"); } static void generate_grouping(struct generator * g, struct node * p, int complement) { struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "out" : "in"; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; w(g, "~Mif (~S1_grouping~S0(~V0, ~I0, ~I1, false) != 0)~N~f"); } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; g->V[0] = p->name; write_failure_if(g, "!(eq_s~S0(~V0))", p); } static void generate_literalstring(struct generator * g, struct node * p) { symbol * b = p->literalstring; write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"" : "_b"; g->L[0] = b; write_failure_if(g, "!(eq_s~S0(~L0))", p); } static void generate_define(struct generator * g, struct node * p) { struct name * q = p->name; struct str * saved_output = g->outbuf; if (q->type == t_routine) { g->S[0] = "private"; } else { g->S[0] = "protected override"; } g->V[0] = q; w(g, "~N~M~S0 bool ~V0()~N~M{~+~N"); g->outbuf = str_new(); g->next_label = 0; g->var_number = 0; if (p->amongvar_needed) w(g, "~Mint among_var;~N"); str_clear(g->failure_str); g->failure_label = x_return; g->label_used = 0; g->keep_count = 0; generate(g, p->left); w(g, "~Mreturn true;~N"); w(g, "~}"); str_append(saved_output, g->outbuf); str_delete(g->outbuf); g->outbuf = saved_output; } static void generate_substring(struct generator * g, struct node * p) { struct among * x = p->among; write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; g->I[0] = x->number; if (!x->amongvar_needed) { write_failure_if(g, "find_among~S0(a_~I0) == 0", p); } else { writef(g, "~Mamong_var = find_among~S0(a_~I0);~N", p); write_failure_if(g, "among_var == 0", p); } } static void generate_among(struct generator * g, struct node * p) { struct among * x = p->among; if (x->substring == 0) generate_substring(g, p); if (x->starter != 0) generate(g, x->starter); if (x->command_count == 1 && x->nocommand_count == 0) { /* Only one outcome ("no match" already handled). 
*/ generate(g, x->commands[0]); } else if (x->command_count > 0) { int i; w(g, "~Mswitch (among_var) {~N~+"); for (i = 1; i <= x->command_count; i++) { g->I[0] = i; w(g, "~Mcase ~I0:~N~+"); generate(g, x->commands[i - 1]); w(g, "~Mbreak;~N~-"); } write_block_end(g); } } static void generate_booltest(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; write_failure_if(g, "!(~V0)", p); } static void generate_false(struct generator * g, struct node * p) { write_comment(g, p); write_failure(g); } static void generate_debug(struct generator * g, struct node * p) { write_comment(g, p); g->I[0] = g->debug_count++; g->I[1] = p->line_number; writef(g, "~Mdebug(~I0, ~I1);~N", p); } static void generate(struct generator * g, struct node * p) { int used = g->label_used; int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; case c_or: generate_or(g, p); break; case c_backwards: generate_backwards(g, p); break; case c_not: generate_not(g, p); break; case c_set: generate_set(g, p); break; case c_unset: generate_unset(g, p); break; case c_try: generate_try(g, p); break; case c_fail: generate_fail(g, p); break; case c_reverse: case c_test: generate_test(g, p); break; case c_do: generate_do(g, p); break; case c_goto: generate_GO(g, p, 1); break; case c_gopast: generate_GO(g, p, 0); break; case c_repeat: generate_repeat(g, p); break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_setmark: generate_setmark(g, p); break; case c_tomark: generate_tomark(g, p); break; case c_atmark: generate_atmark(g, p); break; case c_hop: generate_hop(g, p); break; case c_delete: generate_delete(g, p); break; case c_next: generate_next(g, p); break; case c_tolimit: generate_tolimit(g, p); break; case c_atlimit: generate_atlimit(g, p); break; case c_leftslice: 
generate_leftslice(g, p); break; case c_rightslice: generate_rightslice(g, p); break; case c_assignto: generate_assignto(g, p); break; case c_sliceto: generate_sliceto(g, p); break; case c_assign: generate_assignfrom(g, p); break; case c_insert: case c_attach: generate_insert(g, p, p->type); break; case c_slicefrom: generate_slicefrom(g, p); break; case c_setlimit: generate_setlimit(g, p); break; case c_dollar: generate_dollar(g, p); break; case c_mathassign: generate_integer_assign(g, p, "="); break; case c_plusassign: generate_integer_assign(g, p, "+="); break; case c_minusassign: generate_integer_assign(g, p, "-="); break; case c_multiplyassign:generate_integer_assign(g, p, "*="); break; case c_divideassign: generate_integer_assign(g, p, "/="); break; case c_eq: generate_integer_test(g, p, "=="); break; case c_ne: generate_integer_test(g, p, "!="); break; case c_gr: generate_integer_test(g, p, ">"); break; case c_ge: generate_integer_test(g, p, ">="); break; case c_ls: generate_integer_test(g, p, "<"); break; case c_le: generate_integer_test(g, p, "<="); break; case c_call: generate_call(g, p); break; case c_grouping: generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case c_literalstring: generate_literalstring(g, p); break; case c_among: generate_among(g, p); break; case c_substring: generate_substring(g, p); break; case c_booltest: generate_booltest(g, p); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } if (g->failure_label != a0) g->label_used = used; g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } static void generate_class_begin(struct generator * g) { w(g, "#pragma warning disable 0164~N"); w(g, "#pragma warning disable 0162~N~N"); w(g, "~Mnamespace "); w(g, g->options->package); w(g, "~N~{"); w(g, "~Musing 
System;~N"); w(g, "~Musing System.Text;~N"); w(g, "~M~N"); w(g, "~M///~N"); w(g, "~M/// This class implements the stemming algorithm defined by a snowball script.~N"); w(g, "~M/// "); write_generated_comment_content(g); w(g, "~N" "~M///~N"); w(g, "~M/// ~N"); w(g, "~M[System.CodeDom.Compiler.GeneratedCode(\"Snowball\", \"" SNOWBALL_VERSION "\")]~N"); w(g, "~Mpublic partial class ~n : "); w(g, g->options->parent_class_name); w(g, "~N~{"); } static void generate_class_end(struct generator * g) { w(g, "~N"); w(g, "~}"); w(g, "~}"); w(g, "~N"); } static void generate_among_table(struct generator * g, struct among * x, const char * type) { struct amongvec * v = x->b; g->I[0] = x->number; g->S[0] = type; w(g, "~M~S0a_~I0 = new[] ~N~M{~N~+"); { int i; for (i = 0; i < x->literalstring_count; i++) { g->I[0] = v->i; g->I[1] = v->result; g->L[0] = v->b; g->S[0] = i < x->literalstring_count - 1 ? "," : ""; w(g, "~Mnew Among(~L0, ~I0, ~I1"); if (v->function != 0) { w(g, ", "); write_varname(g, v->function); } w(g, ")~S0~N"); v++; } } w(g, "~-~M};~N~N"); } static void generate_amongs(struct generator * g) { int amongs_with_functions = 0; struct among * x = g->analyser->amongs; while (x != 0) { if (x->function_count) { g->I[0] = x->number; g->I[1] = x->literalstring_count; w(g, "~Mprivate readonly Among[] a_~I0;~N"); ++amongs_with_functions; } else { generate_among_table(g, x, "private static readonly Among[] "); } x = x->next; } w(g, "~N"); if (!amongs_with_functions) return; w(g, "~M/// ~N"); w(g, "~M/// Initializes a new instance of the class.~N"); w(g, "~M/// ~N"); w(g, "~M/// ~N"); w(g, "~Mpublic ~n()~N~{"); x = g->analyser->amongs; while (x != 0) { if (x->function_count) { generate_among_table(g, x, ""); } x = x->next; } w(g, "~}~N~N"); } static void generate_grouping_table(struct generator * g, struct grouping * q) { symbol * b = q->b; g->V[0] = q->name; w(g, "~Mprivate const string ~V0 = "); write_literal_string(g, b); w(g, ";~N"); } static void generate_groupings(struct 
generator * g) { struct grouping * q; for (q = g->analyser->groupings; q; q = q->next) { if (q->name->used) generate_grouping_table(g, q); } } static void generate_members(struct generator * g) { struct name * q; for (q = g->analyser->names; q; q = q->next) { g->V[0] = q; switch (q->type) { case t_string: w(g, "~Mprivate "); w(g, g->options->string_class); w(g, " ~W0 = new "); w(g, g->options->string_class); w(g, "();~N"); break; case t_integer: w(g, "~Mprivate int ~W0;~N"); break; case t_boolean: w(g, "~Mprivate bool ~W0;~N"); break; } } w(g, "~N"); } static void generate_methods(struct generator * g) { struct node * p; for (p = g->analyser->program; p; p = p->right) { generate(g, p); } } extern void generate_program_csharp(struct generator * g) { g->outbuf = str_new(); g->failure_str = str_new(); write_start_comment(g, "// ", NULL); generate_class_begin(g); generate_members(g); generate_groupings(g); generate_amongs(g); generate_methods(g); generate_class_end(g); output_str(g->options->output_src, g->outbuf); str_delete(g->failure_str); str_delete(g->outbuf); } snowball-2.2.0/compiler/generator_go.c000066400000000000000000001070261414263061200177720ustar00rootroot00000000000000#include /* for exit */ #include /* for strlen */ #include /* for fprintf etc */ #include /* for toupper */ #include "header.h" /* prototypes */ static void generate(struct generator * g, struct node * p); static void w(struct generator * g, const char * s); static void writef(struct generator * g, const char * s, struct node * p); static int new_label(struct generator * g) { return g->next_label++; } static struct str * vars_newname(struct generator * g) { struct str * output; g->var_number++; output = str_new(); str_append_string(output, "v_"); str_append_int(output, g->var_number); return output; } /* Write routines for items from the syntax tree */ static void write_varname(struct generator * g, struct name * p) { switch (p->type) { case t_external: write_char(g, toupper(p->b[0])); 
str_append_b_tail(g->outbuf, p->b, 1); return; default: { int ch = "SbirxG"[p->type]; write_char(g, ch); write_char(g, '_'); break; } } write_b(g, p->b); } static void write_varref(struct generator * g, struct name * p) { write_string(g, "context."); write_varname(g, p); } static void write_hexdigit(struct generator * g, int n) { write_char(g, n < 10 ? n + '0' : n - 10 + 'A'); } static void write_hex(struct generator * g, int ch) { write_string(g, "\\u"); { int i; for (i = 12; i >= 0; i -= 4) write_hexdigit(g, ch >> i & 0xf); } } static void write_literal_string(struct generator * g, symbol * p) { int i = 0; write_string(g, "\""); while (i < SIZE(p)) { int ch; i += get_utf8(p + i, &ch); if (32 <= ch && ch < 127) { if (ch == '\"' || ch == '\\') write_string(g, "\\"); write_char(g, ch); } else { write_hex(g, ch); } } write_string(g, "\""); } static void write_margin(struct generator * g) { int i; for (i = 0; i < g->margin; i++) write_string(g, " "); } static void write_comment(struct generator * g, struct node * p) { if (!g->options->comments) return; /* FIXME could use Go //line syntax if we had original filename */ write_margin(g); write_string(g, "// "); write_comment_content(g, p); write_newline(g); } static void write_block_start(struct generator * g) { w(g, "~+{~N"); } static void write_block_end(struct generator * g) /* block end */ { w(g, "~-~M}~N"); } static void write_savecursor(struct generator * g, struct node * p, struct str * savevar) { g->B[0] = str_data(savevar); g->S[1] = ""; if (p->mode != m_forward) g->S[1] = "env.Limit - "; writef(g, "~Mvar ~B0 = ~S1env.Cursor~N", p); } static void restore_string(struct node * p, struct str * out, struct str * savevar) { str_clear(out); str_append_string(out, "env.Cursor = "); if (p->mode != m_forward) str_append_string(out, "env.Limit - "); str_append(out, savevar); } static void write_restorecursor(struct generator * g, struct node * p, struct str * savevar) { struct str * temp = str_new(); write_margin(g); 
restore_string(p, temp, savevar); write_str(g, temp); write_newline(g); str_delete(temp); } static void write_inc_cursor(struct generator * g, struct node * p) { write_margin(g); write_string(g, p->mode == m_forward ? "env.NextChar();" : "env.PrevChar();"); write_newline(g); } static void wsetlab_begin(struct generator * g, int n) { g->I[0] = n; w(g, "~Mlab~I0: for {~N~+"); } static void wsetlab_end(struct generator * g, int n) { g->I[0] = n; w(g, "~Mbreak lab~I0~N"); w(g, "~-~M}~N"); } static void wgotol(struct generator * g, int n) { g->I[0] = n; w(g, "~Mbreak lab~I0~N"); } static void write_failure(struct generator * g) { if (str_len(g->failure_str) != 0) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } switch (g->failure_label) { case x_return: w(g, "~Mreturn false~N"); g->unreachable = true; break; default: g->I[0] = g->failure_label; w(g, "~Mbreak lab~I0~N"); g->unreachable = true; } } static void write_failure_if(struct generator * g, char * s, struct node * p) { writef(g, "~Mif ", p); writef(g, s, p); writef(g, " ", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { if (p->mode == m_forward) { write_failure_if(g, "env.Cursor >= env.Limit", p); } else { write_failure_if(g, "env.Cursor <= env.LimitBackward", p); } } /* Formatted write. 
*/ static void writef(struct generator * g, const char * input, struct node * p) { int i = 0; int l = strlen(input); while (i < l) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } switch (input[i++]) { default: write_char(g, input[i - 1]); continue; case 'C': write_comment(g, p); continue; case 'f': write_block_start(g); write_failure(g); g->unreachable = false; write_block_end(g); continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': write_string(g, g->S[input[i++] - '0']); continue; case 'B': write_b(g, g->B[input[i++] - '0']); continue; case 'I': write_int(g, g->I[input[i++] - '0']); continue; case 'V': write_varref(g, g->V[input[i++] - '0']); continue; case 'W': write_varname(g, g->V[input[i++] - '0']); continue; case 'L': write_literal_string(g, g->L[input[i++] - '0']); continue; case '+': g->margin++; continue; case '-': g->margin--; continue; case 'n': write_string(g, g->options->name); continue; } } } static void w(struct generator * g, const char * s) { writef(g, s, 0); } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case c_number: write_int(g, p->number); break; case c_maxint: write_string(g, "snowballRuntime.MaxInt"); break; case c_minint: write_string(g, "snowballRuntime.MinInt"); break; case c_neg: write_char(g, '-'); generate_AE(g, p->right); break; case c_multiply: s = " * "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; goto label0; case c_divide: s = " / "; label0: write_char(g, '('); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "env.Cursor"); break; case c_limit: w(g, p->mode == m_forward ? 
"env.Limit" : "env.LimitBackward"); break; case c_lenof: g->V[0] = p->name; w(g, "snowballRuntime.RuneCountInString(~V0)"); break; case c_sizeof: g->V[0] = p->name; w(g, "len(~V0)"); break; case c_len: w(g, "snowballRuntime.RuneCountInString(env.Current())"); break; case c_size: w(g, "len(env.Current())"); break; } } static void generate_bra(struct generator * g, struct node * p) { write_comment(g, p); p = p->left; while (p) { generate(g, p); p = p->right; } } static void generate_and(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) write_savecursor(g, p, savevar); p = p->left; while (p) { generate(g, p); if (g->unreachable) break; if (keep_c && p->right != 0) write_restorecursor(g, p, savevar); p = p->right; } str_delete(savevar); } static void generate_or(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int out_lab = new_label(g); int end_unreachable = true; write_comment(g, p); wsetlab_begin(g, out_lab); if (keep_c) write_savecursor(g, p, savevar); p = p->left; str_clear(g->failure_str); if (p == 0) { /* p should never be 0 after an or: there should be at least two * sub nodes. 
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right != 0) { int label = new_label(g); g->failure_label = label; wsetlab_begin(g, label); generate(g, p); if (!g->unreachable) { wgotol(g, out_lab); end_unreachable = false; } w(g, "~-~M}~N"); g->unreachable = false; if (keep_c) write_restorecursor(g, p, savevar); p = p->right; } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); wsetlab_end(g, out_lab); if (!end_unreachable) { g->unreachable = false; } str_delete(savevar); } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g,"~Menv.LimitBackward = env.Cursor~N" "~Menv.Cursor = env.Limit~N", p); generate(g, p->left); w(g, "~Menv.Cursor = env.LimitBackward~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int label = new_label(g); g->failure_label = label; write_comment(g, p); if (keep_c) { write_savecursor(g, p, savevar); } str_clear(g->failure_str); wsetlab_begin(g, label); generate(g, p->left); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; if (!g->unreachable) write_failure(g); w(g, "~-~M}~N"); g->unreachable = false; if (keep_c) write_restorecursor(g, p, savevar); str_delete(savevar); } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); write_comment(g, p); if (keep_c) { write_savecursor(g, p, savevar); restore_string(p, g->failure_str, savevar); } wsetlab_begin(g, label); generate(g, p->left); wsetlab_end(g, label); g->unreachable = false; str_delete(savevar); } static void generate_set(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, 
"~M~V0 = true~N", p); } static void generate_unset(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = false~N", p); } static void generate_fail(struct generator * g, struct node * p) { write_comment(g, p); generate(g, p->left); if (!g->unreachable) write_failure(g); } /* generate_test() also implements 'reverse' */ static void generate_test(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) { write_savecursor(g, p, savevar); } generate(g, p->left); if (!g->unreachable) { if (keep_c) { write_restorecursor(g, p, savevar); } } str_delete(savevar); } static void generate_do(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) write_savecursor(g, p, savevar); if (p->left->type == c_call) { /* Optimise do */ write_comment(g, p->left); g->V[0] = p->left->name; w(g, "~M~W0(env, context)~N"); } else { int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); wsetlab_begin(g, label); generate(g, p->left); wsetlab_end(g, label); g->unreachable = false; } if (keep_c) write_restorecursor(g, p, savevar); str_delete(savevar); } static void generate_GO(struct generator * g, struct node * p, int style) { int end_unreachable = false; struct str * savevar = vars_newname(g); int keep_c = style == 1 || repeat_restore(g, p->left); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int golab = new_label(g); g->I[0] = golab; write_comment(g, p); w(g, "~Mgolab~I0: for {~N~+"); if (keep_c) write_savecursor(g, p, savevar); g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); if (g->unreachable) { /* Cannot break out of this loop: therefore the code after the * end of the loop is unreachable.*/ end_unreachable = true; } else { /* include for goto; 
omit for gopast */ if (style == 1) write_restorecursor(g, p, savevar); g->I[0] = golab; w(g, "~Mbreak golab~I0~N"); } g->unreachable = false; w(g, "~-~M}~N"); if (keep_c) write_restorecursor(g, p, savevar); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; write_check_limit(g, p); write_inc_cursor(g, p); write_block_end(g); str_delete(savevar); g->unreachable = end_unreachable; } static void generate_loop(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); w(g, "~Mfor _ = range make([]struct{},"); generate_AE(g, p->AE); writef(g, ") {~+~N", p); generate(g, p->left); w(g, "~-~M}~N"); str_delete(loopvar); g->unreachable = false; } static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) { struct str * savevar = vars_newname(g); int keep_c = repeat_restore(g, p->left); int replab = new_label(g); g->I[0] = replab; writef(g, "~Mreplab~I0: for{~N~+", p); if (keep_c) write_savecursor(g, p, savevar); g->failure_label = new_label(g); str_clear(g->failure_str); g->I[0] = g->failure_label; w(g, "~Mlab~I0: for _ = range [2]struct{}{} {~N~+"); generate(g, p->left); if (!g->unreachable) { if (loopvar != 0) { g->B[0] = str_data(loopvar); w(g, "~M~B0--~N"); } g->I[0] = replab; w(g, "~Mcontinue replab~I0~N"); } w(g, "~-~M}~N"); g->unreachable = false; if (keep_c) write_restorecursor(g, p, savevar); g->I[0] = replab; w(g, "~Mbreak replab~I0~N~-~M}~N"); str_delete(savevar); } static void generate_repeat(struct generator * g, struct node * p) { write_comment(g, p); generate_repeat_or_atleast(g, p, NULL); } static void generate_atleast(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); g->B[0] = str_data(loopvar); w(g, "~Mvar ~B0 = "); generate_AE(g, p->AE); w(g, "~N"); { int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); generate_repeat_or_atleast(g, p, loopvar); g->failure_label = a0; 
str_delete(g->failure_str); g->failure_str = a1; } g->B[0] = str_data(loopvar); write_failure_if(g, "~B0 > 0", p); str_delete(loopvar); } static void generate_setmark(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = env.Cursor~N", p); } static void generate_tomark(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? ">" : "<"; w(g, "~Mif env.Cursor ~S0 "); generate_AE(g, p->AE); writef(g, " ", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; w(g, "~Menv.Cursor = "); generate_AE(g, p->AE); writef(g, "~N", p); } static void generate_atmark(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mif env.Cursor != "); generate_AE(g, p->AE); writef(g, " ", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } static void generate_hop(struct generator * g, struct node * p) { write_comment(g, p); // Generate the AE to a temporary block so we can substitute it in // write_failure_if(). struct str * ae = str_new(); struct str * s = g->outbuf; g->outbuf = ae; generate_AE(g, p->AE); g->outbuf = s; g->B[0] = str_data(ae); g->S[0] = p->mode == m_forward ? "" : "Back"; g->S[1] = p->AE->type == c_number ? "" : "Checked"; write_failure_if(g, "!env.Hop~S0~S1(~B0)", p); str_delete(ae); } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mif !env.SliceDel() {~N" "~+~Mreturn false~N~-" "~M}~N", p); } static void generate_next(struct generator * g, struct node * p) { write_comment(g, p); write_check_limit(g, p); write_inc_cursor(g, p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "env.Limit" : "env.LimitBackward"; writef(g, "~Menv.Cursor = ~S0~N", p); } static void generate_atlimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"env.Limit" : "env.LimitBackward"; g->S[1] = p->mode == m_forward ? "<" : ">"; write_failure_if(g, "env.Cursor ~S1 ~S0", p); } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "env.Bra" : "env.Ket"; writef(g, "~M~S0 = env.Cursor~N", p); } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "env.Ket" : "env.Bra"; writef(g, "~M~S0 = env.Cursor~N", p); } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = env.AssignTo()~N", p); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = env.SliceTo()~N" "~Mif ~V0 == \"\" {~N" "~+~Mreturn false~N~-~M}~N", p); } static void generate_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != 0) { write_literal_string(g, b); } else { write_varref(g, p->name); } } static void generate_insert(struct generator * g, struct node * p, int style) { int keep_c = style == c_attach; write_block_start(g); write_comment(g, p); if (p->mode == m_backward) keep_c = !keep_c; if (keep_c) w(g, "~Mvar c = env.Cursor~N"); w(g, "~Mbra, ket := env.Cursor, env.Cursor~N"); writef(g, "~Menv.Insert(bra, ket, ", p); generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~Menv.Cursor = c~N"); write_block_end(g); } static void generate_assignfrom(struct generator * g, struct node * p) { int keep_c = p->mode == m_forward; /* like 'attach' */ write_block_start(g); write_comment(g, p); if (keep_c) writef(g, "~Mvar c = env.Cursor~N", p); if (p->mode == m_forward) { writef(g, "~Menv.Insert(env.Cursor, env.Limit, ", p); } else { writef(g, "~Menv.Insert(env.LimitBackward, env.Cursor, ", p); } generate_address(g, p); writef(g, ")~N", p); if (keep_c) w(g, "~Menv.Cursor = c~N"); write_block_end(g); } static void 
generate_slicefrom(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mif !env.SliceFrom("); generate_address(g, p); writef(g, ") {~N" "~+~Mreturn false~N~-~M}~N", p); } static void generate_setlimit(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); struct str * varname = vars_newname(g); write_comment(g, p); if (p->left && p->left->type == c_tomark) { /* Special case for: * * setlimit tomark AE for C * * All uses of setlimit in the current stemmers we ship follow this * pattern, and by special-casing we can avoid having to save and * restore c. */ struct node * q = p->left; g->S[0] = q->mode == m_forward ? ">" : "<"; w(g, "~Mif env.Cursor ~S0 "); generate_AE(g, q->AE); w(g, " "); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~Mvar ~B0 = env.Limit - env.Cursor~N"); w(g, "~Menv.Limit = "); } else { w(g, "~Mvar ~B0 = env.LimitBackward~N"); w(g, "~Menv.LimitBackward = "); } generate_AE(g, q->AE); writef(g, ";~N", q); if (p->mode == m_forward) { str_assign(g->failure_str, "env.Limit += "); str_append(g->failure_str, varname); str_append_string(g->failure_str, ";"); } else { str_assign(g->failure_str, "env.LimitBackward = "); str_append(g->failure_str, varname); str_append_string(g->failure_str, ";"); } } else { write_savecursor(g, p, savevar); generate(g, p->left); if (!g->unreachable) { g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~Mvar ~B0 = env.Limit - env.Cursor~N"); w(g, "~Menv.Limit = env.Cursor~N"); } else { w(g, "~Mvar ~B0 = env.LimitBackward~N"); w(g, "~Menv.LimitBackward = env.Cursor~N"); } write_restorecursor(g, p, savevar); if (p->mode == m_forward) { str_assign(g->failure_str, "env.Limit += "); str_append(g->failure_str, varname); str_append_string(g->failure_str, ";"); } else { str_assign(g->failure_str, "env.LimitBackward = "); str_append(g->failure_str, varname); 
str_append_string(g->failure_str, ";"); } } } if (!g->unreachable) { generate(g, p->aux); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } } str_delete(varname); str_delete(savevar); } /* dollar sets snowball up to operate on a string variable as if it were the * current string */ static void generate_dollar(struct generator * g, struct node * p) { struct str * savevar_env = vars_newname(g); write_comment(g, p); g->V[0] = p->name; g->B[0] = str_data(savevar_env); writef(g, "~Mvar ~B0 = env.Clone()~N" "~Menv.SetCurrent(~V0)~N" "~Menv.Cursor = 0~N" "~Menv.Limit = len(env.Current())~N", p); generate(g, p->left); if (!g->unreachable) { g->V[0] = p->name; g->B[0] = str_data(savevar_env); /* Update string variable. */ w(g, "~M~V0 = env.Current()~N"); /* Reset env */ w(g, "~M*env = *~B0~N"); } str_delete(savevar_env); } static void generate_integer_assign(struct generator * g, struct node * p, char * s) { g->V[0] = p->name; g->S[0] = s; w(g, "~M~V0 ~S0 "); generate_AE(g, p->AE); w(g, "~N"); } static void generate_integer_test(struct generator * g, struct node * p, char * s) { w(g, "~Mif !("); generate_AE(g, p->left); write_char(g, ' '); write_string(g, s); write_char(g, ' '); generate_AE(g, p->AE); w(g, ")"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } static void generate_call(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; write_failure_if(g, "!~W0(env, context)", p); } static void generate_grouping(struct generator * g, struct node * p, int complement) { struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "B"; g->S[1] = complement ? "Out" : "In"; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; write_failure_if(g, "!env.~S1Grouping~S0(~W0, ~I0, ~I1)", p); } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"" : "B"; g->V[0] = p->name; write_failure_if(g, "!env.EqS~S0(~V0)", p); } static void generate_literalstring(struct generator * g, struct node * p) { symbol * b = p->literalstring; write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "B"; g->L[0] = b; write_failure_if(g, "!env.EqS~S0(~L0)", p); } static void generate_setup_context(struct generator * g) { struct name * q; w(g, "~Mvar context = &Context {~+~N"); for (q = g->analyser->names; q; q = q->next) { g->V[0] = q; switch (q->type) { case t_string: w(g, "~M~W0: \"\",~N"); break; case t_integer: w(g, "~M~W0: 0,~N"); break; case t_boolean: w(g, "~M~W0: false,~N"); break; } } w(g, "~-~M}~N"); w(g, "~M_ = context~N"); } static void generate_define(struct generator * g, struct node * p) { struct name * q = p->name; struct str * saved_output = g->outbuf; g->V[0] = q; if (q->type == t_routine) { w(g, "~N~Mfunc ~W0(env *snowballRuntime.Env, ctx interface{}) bool {~+~N"); w(g, "~Mcontext := ctx.(*Context)~N"); w(g, "~M_ = context~N"); } else { w(g, "~N~Mfunc ~W0(env *snowballRuntime.Env) bool {~+~N"); generate_setup_context(g); } if (p->amongvar_needed) w(g, "~Mvar among_var int32~N"); g->outbuf = str_new(); g->next_label = 0; g->var_number = 0; str_clear(g->failure_str); g->failure_label = x_return; g->unreachable = false; generate(g, p->left); if (!g->unreachable) w(g, "~Mreturn true~N"); w(g, "~-~M}~N"); str_append(saved_output, g->outbuf); str_delete(g->outbuf); g->outbuf = saved_output; } static void generate_substring(struct generator * g, struct node * p) { struct among * x = p->among; write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"" : "B"; g->I[0] = x->number; if (!x->amongvar_needed) { write_failure_if(g, "env.FindAmong~S0(~A_~I0, context) == 0", p); } else { writef(g, "~Mamong_var = env.FindAmong~S0(~A_~I0, context)~N", p); write_failure_if(g, "among_var == 0", p); } } static void generate_among(struct generator * g, struct node * p) { struct among * x = p->among; if (x->substring == 0) generate_substring(g, p); if (x->starter != 0) generate(g, x->starter); if (x->command_count == 1 && x->nocommand_count == 0) { /* Only one outcome ("no match" already handled). */ generate(g, x->commands[0]); } else if (x->command_count > 0) { int i; w(g, "~M"); for (i = 1; i <= x->command_count; i++) { g->I[0] = i; if (i > 1) w(g, " else "); w(g, "if among_var == ~I0 {~N~+"); generate(g, x->commands[i - 1]); w(g, "~-~M}"); g->unreachable = false; } w(g, "~N"); } } static void generate_booltest(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; write_failure_if(g, "!~V0", p); } static void generate_false(struct generator * g, struct node * p) { write_comment(g, p); write_failure(g); } static void generate_debug(struct generator * g, struct node * p) { write_comment(g, p); g->I[0] = g->debug_count++; g->I[1] = p->line_number; writef(g, "~Menv.Debug(~I0, ~I1)~N", p); } static void generate(struct generator * g, struct node * p) { int a0; struct str * a1; if (g->unreachable) return; a0 = g->failure_label; a1 = str_copy(g->failure_str); switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; case c_or: generate_or(g, p); break; case c_backwards: generate_backwards(g, p); break; case c_not: generate_not(g, p); break; case c_set: generate_set(g, p); break; case c_unset: generate_unset(g, p); break; case c_try: generate_try(g, p); break; case c_fail: generate_fail(g, p); break; case c_reverse: case c_test: generate_test(g, p); break; case c_do: generate_do(g, p); break; case c_goto: generate_GO(g, 
p, 1); break; case c_gopast: generate_GO(g, p, 0); break; case c_repeat: generate_repeat(g, p); break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_setmark: generate_setmark(g, p); break; case c_tomark: generate_tomark(g, p); break; case c_atmark: generate_atmark(g, p); break; case c_hop: generate_hop(g, p); break; case c_delete: generate_delete(g, p); break; case c_next: generate_next(g, p); break; case c_tolimit: generate_tolimit(g, p); break; case c_atlimit: generate_atlimit(g, p); break; case c_leftslice: generate_leftslice(g, p); break; case c_rightslice: generate_rightslice(g, p); break; case c_assignto: generate_assignto(g, p); break; case c_sliceto: generate_sliceto(g, p); break; case c_assign: generate_assignfrom(g, p); break; case c_insert: case c_attach: generate_insert(g, p, p->type); break; case c_slicefrom: generate_slicefrom(g, p); break; case c_setlimit: generate_setlimit(g, p); break; case c_dollar: generate_dollar(g, p); break; case c_mathassign: generate_integer_assign(g, p, "="); break; case c_plusassign: generate_integer_assign(g, p, "+="); break; case c_minusassign: generate_integer_assign(g, p, "-="); break; case c_multiplyassign:generate_integer_assign(g, p, "*="); break; case c_divideassign: generate_integer_assign(g, p, "/="); break; case c_eq: generate_integer_test(g, p, "=="); break; case c_ne: generate_integer_test(g, p, "!="); break; case c_gr: generate_integer_test(g, p, ">"); break; case c_ge: generate_integer_test(g, p, ">="); break; case c_ls: generate_integer_test(g, p, "<"); break; case c_le: generate_integer_test(g, p, "<="); break; case c_call: generate_call(g, p); break; case c_grouping: generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case c_literalstring: generate_literalstring(g, p); break; case c_among: generate_among(g, p); break; case c_substring: generate_substring(g, p); break; case 
c_booltest: generate_booltest(g, p); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } static void generate_class_begin(struct generator * g) { w(g, "package "); w(g, g->options->package); w(g, "~N~N"); w(g, "import(~N"); w(g, " snowballRuntime \""); w(g, g->options->go_snowball_runtime); w(g, "\"~N)~N~N"); } static void generate_among_table(struct generator * g, struct among * x) { struct amongvec * v = x->b; g->I[0] = x->number; g->I[1] = x->literalstring_count; w(g, "~Mvar A_~I0 = []*snowballRuntime.Among{~N~+"); { int i; for (i = 0; i < x->literalstring_count; i++) { g->I[0] = v->i; g->I[1] = v->result; g->L[0] = v->b; g->S[0] = ","; w(g, "~M&snowballRuntime.Among{Str:~L0, A:~I0, B:~I1, "); if (v->function != 0) { w(g, "F:"); write_varname(g, v->function); } else { w(g, "F:nil"); } w(g, "}~S0~N"); v++; } } w(g, "~-~M}~N~N"); } static void generate_amongs(struct generator * g) { struct among * x; for (x = g->analyser->amongs; x; x = x->next) { generate_among_table(g, x); } } static void set_bit(symbol * b, int i) { b[i/8] |= 1 << i%8; } static void generate_grouping_table(struct generator * g, struct grouping * q) { int range = q->largest_ch - q->smallest_ch + 1; int size = (range + 7)/ 8; /* assume 8 bits per symbol */ symbol * b = q->b; symbol * map = create_b(size); int i; for (i = 0; i < size; i++) map[i] = 0; for (i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); g->V[0] = q->name; g->I[0] = size; w(g, "~Mvar ~W0 = []byte{"); for (i = 0; i < size; i++) { write_int(g, map[i]); if (i < size - 1) w(g, ", "); } w(g, "}~N~N"); lose_b(map); } static void generate_groupings(struct generator * g) { struct grouping * q; for (q = g->analyser->groupings; q; q = q->next) { if (q->name->used) generate_grouping_table(g, q); } } static void 
generate_members(struct generator * g) { struct name * q; w(g, "type Context struct {~+~N"); for (q = g->analyser->names; q; q = q->next) { g->V[0] = q; switch (q->type) { case t_string: w(g, "~M~W0 string~N"); break; case t_integer: w(g, "~M~W0 int~N"); break; case t_boolean: w(g, "~M~W0 bool~N"); break; } } w(g, "~-}~N"); } static void generate_methods(struct generator * g) { struct node * p = g->analyser->program; while (p != 0) { generate(g, p); g->unreachable = false; p = p->right; } } extern void generate_program_go(struct generator * g) { g->outbuf = str_new(); g->failure_str = str_new(); write_start_comment(g, "//! ", NULL); if (g->analyser->int_limits_used) { /* std::usize is used in the code generated for usize::MAX and usize::MIN */ w(g, "use std::usize;~N~N"); } generate_class_begin(g); generate_amongs(g); generate_groupings(g); generate_members(g); generate_methods(g); output_str(g->options->output_src, g->outbuf); str_delete(g->failure_str); str_delete(g->outbuf); } snowball-2.2.0/compiler/generator_java.c000066400000000000000000001073751414263061200203150ustar00rootroot00000000000000 #include /* for exit */ #include /* for strlen */ #include /* for fprintf etc */ #include "header.h" /* prototypes */ static void generate(struct generator * g, struct node * p); static void w(struct generator * g, const char * s); static void writef(struct generator * g, const char * s, struct node * p); static int new_label(struct generator * g) { return g->next_label++; } static struct str * vars_newname(struct generator * g) { struct str * output; g->var_number++; output = str_new(); str_append_string(output, "v_"); str_append_int(output, g->var_number); return output; } /* Write routines for items from the syntax tree */ static void write_varname(struct generator * g, struct name * p) { int ch = "SBIrxg"[p->type]; if (p->type != t_external) { write_char(g, ch); write_char(g, '_'); } write_b(g, p->b); } static void write_varref(struct generator * g, struct name * p) 
{ /* In java, references look just the same */ write_varname(g, p); } static void write_hexdigit(struct generator * g, int n) { write_char(g, n < 10 ? n + '0' : n - 10 + 'A'); } static void write_hex(struct generator * g, int ch) { write_string(g, "\\u"); { int i; for (i = 12; i >= 0; i -= 4) write_hexdigit(g, ch >> i & 0xf); } } static void write_literal_string(struct generator * g, symbol * p) { int i; write_string(g, "\""); for (i = 0; i < SIZE(p); i++) { int ch = p[i]; if (32 <= ch && ch < 127) { if (ch == '\"' || ch == '\\') write_string(g, "\\"); write_char(g, ch); } else { write_hex(g, ch); } } write_string(g, "\""); } static void write_margin(struct generator * g) { int i; for (i = 0; i < g->margin; i++) write_string(g, " "); } static void write_comment(struct generator * g, struct node * p) { if (!g->options->comments) return; write_margin(g); write_string(g, "// "); write_comment_content(g, p); write_newline(g); } static void write_block_start(struct generator * g) { w(g, "~M{~+~N"); } static void write_block_end(struct generator * g) /* block end */ { w(g, "~-~M}~N"); } static void write_savecursor(struct generator * g, struct node * p, struct str * savevar) { g->B[0] = str_data(savevar); g->S[1] = ""; if (p->mode != m_forward) g->S[1] = "limit - "; writef(g, "~Mint ~B0 = ~S1cursor;~N", p); } static void restore_string(struct node * p, struct str * out, struct str * savevar) { str_clear(out); str_append_string(out, "cursor = "); if (p->mode != m_forward) str_append_string(out, "limit - "); str_append(out, savevar); str_append_string(out, ";"); } static void write_restorecursor(struct generator * g, struct node * p, struct str * savevar) { struct str * temp = str_new(); write_margin(g); restore_string(p, temp, savevar); write_str(g, temp); write_newline(g); str_delete(temp); } static void write_inc_cursor(struct generator * g, struct node * p) { write_margin(g); write_string(g, p->mode == m_forward ? 
"cursor++;" : "cursor--;"); write_newline(g); } static void wsetlab_begin(struct generator * g, int n) { w(g, "~Mlab"); write_int(g, n); w(g, ": {~+~N"); } static void wsetlab_end(struct generator * g) { w(g, "~-~M}~N"); } static void wgotol(struct generator * g, int n) { write_margin(g); write_string(g, "break lab"); write_int(g, n); write_string(g, ";"); write_newline(g); } static void write_failure(struct generator * g) { if (str_len(g->failure_str) != 0) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } write_margin(g); switch (g->failure_label) { case x_return: write_string(g, "return false;"); g->unreachable = true; break; default: write_string(g, "break lab"); write_int(g, g->failure_label); write_string(g, ";"); g->unreachable = true; } write_newline(g); } static void write_failure_if(struct generator * g, char * s, struct node * p) { writef(g, "~Mif (", p); writef(g, s, p); writef(g, ")~N", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { if (p->mode == m_forward) { write_failure_if(g, "cursor >= limit", p); } else { write_failure_if(g, "cursor <= limit_backward", p); } } /* Formatted write. 
*/ static void writef(struct generator * g, const char * input, struct node * p) { int i = 0; int l = strlen(input); while (i < l) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } switch (input[i++]) { default: write_char(g, input[i - 1]); continue; case 'C': write_comment(g, p); continue; case 'f': write_block_start(g); write_failure(g); g->unreachable = false; write_block_end(g); continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': write_string(g, g->S[input[i++] - '0']); continue; case 'B': write_b(g, g->B[input[i++] - '0']); continue; case 'I': write_int(g, g->I[input[i++] - '0']); continue; case 'V': write_varref(g, g->V[input[i++] - '0']); continue; case 'W': write_varname(g, g->V[input[i++] - '0']); continue; case 'L': write_literal_string(g, g->L[input[i++] - '0']); continue; case '+': g->margin++; continue; case '-': g->margin--; continue; case 'n': write_string(g, g->options->name); continue; } } } static void w(struct generator * g, const char * s) { writef(g, s, 0); } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case c_number: write_int(g, p->number); break; case c_maxint: write_string(g, "Integer.MAX_VALUE"); break; case c_minint: write_string(g, "Integer.MIN_VALUE"); break; case c_neg: write_char(g, '-'); generate_AE(g, p->right); break; case c_multiply: s = " * "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; goto label0; case c_divide: s = " / "; label0: write_char(g, '('); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "cursor"); break; case c_limit: w(g, p->mode == m_forward ? "limit" : "limit_backward"); break; case c_lenof: /* Same as sizeof() for Java. 
*/ case c_sizeof: g->V[0] = p->name; w(g, "~V0.length()"); break; case c_len: /* Same as size() for Java. */ case c_size: w(g, "current.length()"); break; } } static void generate_bra(struct generator * g, struct node * p) { write_comment(g, p); p = p->left; while (p) { generate(g, p); p = p->right; } } static void generate_and(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) write_savecursor(g, p, savevar); p = p->left; while (p) { generate(g, p); if (g->unreachable) break; if (keep_c && p->right != 0) write_restorecursor(g, p, savevar); p = p->right; } str_delete(savevar); } static void generate_or(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int out_lab = new_label(g); int end_unreachable = true; write_comment(g, p); wsetlab_begin(g, out_lab); if (keep_c) write_savecursor(g, p, savevar); p = p->left; str_clear(g->failure_str); if (p == 0) { /* p should never be 0 after an or: there should be at least two * sub nodes. 
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right != 0) { g->failure_label = new_label(g); wsetlab_begin(g, g->failure_label); generate(g, p); if (!g->unreachable) { wgotol(g, out_lab); end_unreachable = false; } wsetlab_end(g); g->unreachable = false; if (keep_c) write_restorecursor(g, p, savevar); p = p->right; } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); wsetlab_end(g); if (!end_unreachable) { g->unreachable = false; } str_delete(savevar); } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mlimit_backward = cursor;~N" "~Mcursor = limit;~N", p); generate(g, p->left); w(g, "~Mcursor = limit_backward;~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); write_comment(g, p); if (keep_c) { write_block_start(g); write_savecursor(g, p, savevar); } g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; if (!g->unreachable) write_failure(g); wsetlab_end(g); g->unreachable = false; if (keep_c) write_restorecursor(g, p, savevar); if (keep_c) write_block_end(g); str_delete(savevar); } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) write_savecursor(g, p, savevar); g->failure_label = new_label(g); str_clear(g->failure_str); if (keep_c) restore_string(p, g->failure_str, savevar); wsetlab_begin(g, g->failure_label); generate(g, p->left); wsetlab_end(g); g->unreachable = false; str_delete(savevar); } static void generate_set(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, 
"~M~V0 = true;~N", p); } static void generate_unset(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = false;~N", p); } static void generate_fail(struct generator * g, struct node * p) { write_comment(g, p); generate(g, p->left); if (!g->unreachable) write_failure(g); } /* generate_test() also implements 'reverse' */ static void generate_test(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) { write_savecursor(g, p, savevar); } generate(g, p->left); if (!g->unreachable) { if (keep_c) { write_restorecursor(g, p, savevar); } } str_delete(savevar); } static void generate_do(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) write_savecursor(g, p, savevar); if (p->left->type == c_call) { /* Optimise do */ write_comment(g, p->left); g->V[0] = p->left->name; w(g, "~M~V0();~N"); } else { g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); wsetlab_end(g); g->unreachable = false; } if (keep_c) write_restorecursor(g, p, savevar); str_delete(savevar); } static void generate_GO(struct generator * g, struct node * p, int style) { int end_unreachable = false; struct str * savevar = vars_newname(g); int keep_c = style == 1 || repeat_restore(g, p->left); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int golab = new_label(g); g->I[0] = golab; write_comment(g, p); w(g, "~Mgolab~I0: while(true)~N"); w(g, "~{"); if (keep_c) write_savecursor(g, p, savevar); g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); if (g->unreachable) { /* Cannot break out of this loop: therefore the code after the * end of the loop is unreachable.*/ end_unreachable = true; } else { /* include for goto; omit for 
gopast */ if (style == 1) write_restorecursor(g, p, savevar); g->I[0] = golab; w(g, "~Mbreak golab~I0;~N"); } g->unreachable = false; wsetlab_end(g); if (keep_c) write_restorecursor(g, p, savevar); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; write_check_limit(g, p); write_inc_cursor(g, p); write_block_end(g); str_delete(savevar); g->unreachable = end_unreachable; } static void generate_loop(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); g->B[0] = str_data(loopvar); w(g, "~Mfor (int ~B0 = "); generate_AE(g, p->AE); g->B[0] = str_data(loopvar); writef(g, "; ~B0 > 0; ~B0--)~N", p); writef(g, "~{", p); generate(g, p->left); w(g, "~}"); str_delete(loopvar); g->unreachable = false; } static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) { struct str * savevar = vars_newname(g); int keep_c = repeat_restore(g, p->left); writef(g, "~Mwhile(true)~N~{", p); if (keep_c) write_savecursor(g, p, savevar); g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); if (!g->unreachable) { if (loopvar != 0) { g->B[0] = str_data(loopvar); w(g, "~M~B0--;~N"); } w(g, "~Mcontinue;~N"); } wsetlab_end(g); g->unreachable = false; if (keep_c) write_restorecursor(g, p, savevar); w(g, "~Mbreak;~N~}"); str_delete(savevar); } static void generate_repeat(struct generator * g, struct node * p) { write_comment(g, p); generate_repeat_or_atleast(g, p, NULL); } static void generate_atleast(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); w(g, "~{"); g->B[0] = str_data(loopvar); w(g, "~Mint ~B0 = "); generate_AE(g, p->AE); w(g, ";~N"); { int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); generate_repeat_or_atleast(g, p, loopvar); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } g->B[0] = str_data(loopvar); 
write_failure_if(g, "~B0 > 0", p); w(g, "~}"); str_delete(loopvar); } static void generate_setmark(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = cursor;~N", p); } static void generate_tomark(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? ">" : "<"; w(g, "~Mif (cursor ~S0 "); generate_AE(g, p->AE); w(g, ")~N"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; w(g, "~Mcursor = "); generate_AE(g, p->AE); writef(g, ";~N", p); } static void generate_atmark(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mif (cursor != "); generate_AE(g, p->AE); writef(g, ")~N", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } static void generate_hop(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "+" : "-"; w(g, "~{~Mint c = cursor ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); g->S[1] = p->mode == m_forward ? "> limit" : "< limit_backward"; g->S[2] = p->mode == m_forward ? "<" : ">"; if (p->AE->type == c_number) { // Constant distance hop. // // No need to check for negative hop as that's converted to false by // the analyser. write_failure_if(g, "c ~S1", p); } else { write_failure_if(g, "c ~S1 || c ~S2 cursor", p); } writef(g, "~Mcursor = c;~N", p); writef(g, "~}", p); } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mslice_del();~N", p); } static void generate_next(struct generator * g, struct node * p) { write_comment(g, p); write_check_limit(g, p); write_inc_cursor(g, p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "limit" : "limit_backward"; writef(g, "~Mcursor = ~S0;~N", p); } static void generate_atlimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"limit" : "limit_backward"; g->S[1] = p->mode == m_forward ? "<" : ">"; write_failure_if(g, "cursor ~S1 ~S0", p); } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "bra" : "ket"; writef(g, "~M~S0 = cursor;~N", p); } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "ket" : "bra"; writef(g, "~M~S0 = cursor;~N", p); } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~Massign_to(~V0);~N", p); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~Mslice_to(~V0);~N", p); } static void generate_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != 0) { write_literal_string(g, b); } else { write_varref(g, p->name); } } static void generate_insert(struct generator * g, struct node * p, int style) { int keep_c = style == c_attach; write_comment(g, p); if (p->mode == m_backward) keep_c = !keep_c; if (keep_c) w(g, "~{~Mint c = cursor;~N"); writef(g, "~Minsert(cursor, cursor, ", p); generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~Mcursor = c;~N~}"); } static void generate_assignfrom(struct generator * g, struct node * p) { int keep_c = p->mode == m_forward; /* like 'attach' */ write_comment(g, p); if (keep_c) writef(g, "~{~Mint c = cursor;~N", p); if (p->mode == m_forward) { writef(g, "~Minsert(cursor, limit, ", p); } else { writef(g, "~Minsert(limit_backward, cursor, ", p); } generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~Mcursor = c;~N~}"); } static void generate_slicefrom(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mslice_from("); generate_address(g, p); writef(g, ");~N", p); } static void generate_setlimit(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); 
struct str * varname = vars_newname(g); write_comment(g, p); if (p->left && p->left->type == c_tomark) { /* Special case for: * * setlimit tomark AE for C * * All uses of setlimit in the current stemmers we ship follow this * pattern, and by special-casing we can avoid having to save and * restore c. */ struct node * q = p->left; g->S[0] = q->mode == m_forward ? ">" : "<"; w(g, "~Mif (cursor ~S0 "); generate_AE(g, q->AE); w(g, ")~N"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~Mint ~B0 = limit - cursor;~N"); w(g, "~Mlimit = "); } else { w(g, "~Mint ~B0 = limit_backward;~N"); w(g, "~Mlimit_backward = "); } generate_AE(g, q->AE); writef(g, ";~N", q); if (p->mode == m_forward) { str_assign(g->failure_str, "limit += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "limit_backward = "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } else { write_savecursor(g, p, savevar); generate(g, p->left); if (!g->unreachable) { g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~Mint ~B0 = limit - cursor;~N"); w(g, "~Mlimit = cursor;~N"); } else { w(g, "~Mint ~B0 = limit_backward;~N"); w(g, "~Mlimit_backward = cursor;~N"); } write_restorecursor(g, p, savevar); if (p->mode == m_forward) { str_assign(g->failure_str, "limit += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "limit_backward = "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } } if (!g->unreachable) { generate(g, p->aux); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } } str_delete(varname); str_delete(savevar); } /* dollar sets snowball up to operate on a string variable as if it were the * current string */ static void generate_dollar(struct generator * g, struct node * p) { 
struct str * savevar = vars_newname(g); g->B[0] = str_data(savevar); writef(g, "~{~C~N" "~MSnowballProgram ~B0 = new SnowballProgram(this);~N", p); ++g->copy_from_count; str_assign(g->failure_str, "copy_from("); str_append(g->failure_str, savevar); str_append_string(g->failure_str, ");"); g->V[0] = p->name; writef(g, "~Mcurrent = ~V0;~N" "~Mcursor = 0;~N" "~Mlimit = current.length();~N", p); generate(g, p->left); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } w(g, "~}"); str_delete(savevar); } static void generate_integer_assign(struct generator * g, struct node * p, char * s) { g->V[0] = p->name; g->S[0] = s; w(g, "~M~V0 ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); } static void generate_integer_test(struct generator * g, struct node * p, char * s) { w(g, "~Mif (!("); generate_AE(g, p->left); write_char(g, ' '); write_string(g, s); write_char(g, ' '); generate_AE(g, p->AE); w(g, "))~N"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } static void generate_call(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; write_failure_if(g, "!~V0()", p); } static void generate_grouping(struct generator * g, struct node * p, int complement) { struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "out" : "in"; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; write_failure_if(g, "!(~S1_grouping~S0(~V0, ~I0, ~I1))", p); } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; g->V[0] = p->name; write_failure_if(g, "!(eq_s~S0(~V0))", p); } static void generate_literalstring(struct generator * g, struct node * p) { symbol * b = p->literalstring; write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"" : "_b"; g->L[0] = b; write_failure_if(g, "!(eq_s~S0(~L0))", p); } static void generate_define(struct generator * g, struct node * p) { struct name * q = p->name; struct str * saved_output = g->outbuf; /* We currently make functions used in among public as this seems to * be required to allow the SnowballProgram base class to invoke them. * FIXME: Is this avoidable? */ if (q->type == t_routine && !q->used_in_among) { g->S[0] = "private"; } else { g->S[0] = "public"; } g->V[0] = q; w(g, "~N~M~S0 boolean ~V0() {~+~N"); g->outbuf = str_new(); g->next_label = 0; g->var_number = 0; if (p->amongvar_needed) w(g, "~Mint among_var;~N"); str_clear(g->failure_str); g->failure_label = x_return; g->unreachable = false; generate(g, p->left); if (!g->unreachable) w(g, "~Mreturn true;~N"); w(g, "~}"); str_append(saved_output, g->outbuf); str_delete(g->outbuf); g->outbuf = saved_output; } static void generate_substring(struct generator * g, struct node * p) { struct among * x = p->among; write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; g->I[0] = x->number; if (!x->amongvar_needed) { write_failure_if(g, "find_among~S0(a_~I0) == 0", p); } else { writef(g, "~Mamong_var = find_among~S0(a_~I0);~N", p); write_failure_if(g, "among_var == 0", p); } } static void generate_among(struct generator * g, struct node * p) { struct among * x = p->among; if (x->substring == 0) generate_substring(g, p); if (x->starter != 0) generate(g, x->starter); if (x->command_count == 1 && x->nocommand_count == 0) { /* Only one outcome ("no match" already handled). 
*/ generate(g, x->commands[0]); } else if (x->command_count > 0) { int i; w(g, "~Mswitch (among_var) {~N~+"); for (i = 1; i <= x->command_count; i++) { g->I[0] = i; w(g, "~Mcase ~I0:~N~+"); generate(g, x->commands[i - 1]); if (!g->unreachable) w(g, "~Mbreak;~N"); w(g, "~-"); g->unreachable = false; } write_block_end(g); } } static void generate_booltest(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; write_failure_if(g, "!(~V0)", p); } static void generate_false(struct generator * g, struct node * p) { write_comment(g, p); write_failure(g); } static void generate_debug(struct generator * g, struct node * p) { write_comment(g, p); g->I[0] = g->debug_count++; g->I[1] = p->line_number; writef(g, "~Mdebug(~I0, ~I1);~N", p); } static void generate(struct generator * g, struct node * p) { int a0; struct str * a1; if (g->unreachable) return; a0 = g->failure_label; a1 = str_copy(g->failure_str); switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; case c_or: generate_or(g, p); break; case c_backwards: generate_backwards(g, p); break; case c_not: generate_not(g, p); break; case c_set: generate_set(g, p); break; case c_unset: generate_unset(g, p); break; case c_try: generate_try(g, p); break; case c_fail: generate_fail(g, p); break; case c_reverse: case c_test: generate_test(g, p); break; case c_do: generate_do(g, p); break; case c_goto: generate_GO(g, p, 1); break; case c_gopast: generate_GO(g, p, 0); break; case c_repeat: generate_repeat(g, p); break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_setmark: generate_setmark(g, p); break; case c_tomark: generate_tomark(g, p); break; case c_atmark: generate_atmark(g, p); break; case c_hop: generate_hop(g, p); break; case c_delete: generate_delete(g, p); break; case c_next: generate_next(g, p); break; case c_tolimit: generate_tolimit(g, p); break; case 
c_atlimit: generate_atlimit(g, p); break; case c_leftslice: generate_leftslice(g, p); break; case c_rightslice: generate_rightslice(g, p); break; case c_assignto: generate_assignto(g, p); break; case c_sliceto: generate_sliceto(g, p); break; case c_assign: generate_assignfrom(g, p); break; case c_insert: case c_attach: generate_insert(g, p, p->type); break; case c_slicefrom: generate_slicefrom(g, p); break; case c_setlimit: generate_setlimit(g, p); break; case c_dollar: generate_dollar(g, p); break; case c_mathassign: generate_integer_assign(g, p, "="); break; case c_plusassign: generate_integer_assign(g, p, "+="); break; case c_minusassign: generate_integer_assign(g, p, "-="); break; case c_multiplyassign:generate_integer_assign(g, p, "*="); break; case c_divideassign: generate_integer_assign(g, p, "/="); break; case c_eq: generate_integer_test(g, p, "=="); break; case c_ne: generate_integer_test(g, p, "!="); break; case c_gr: generate_integer_test(g, p, ">"); break; case c_ge: generate_integer_test(g, p, ">="); break; case c_ls: generate_integer_test(g, p, "<"); break; case c_le: generate_integer_test(g, p, "<="); break; case c_call: generate_call(g, p); break; case c_grouping: generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case c_literalstring: generate_literalstring(g, p); break; case c_among: generate_among(g, p); break; case c_substring: generate_substring(g, p); break; case c_booltest: generate_booltest(g, p); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } static void generate_class_begin(struct generator * g) { w(g, "package "); w(g, g->options->package); w(g, ";~N~N"); if (g->analyser->among_count > 0) { w(g, "import "); w(g, g->options->among_class); w(g, ";~N~N"); 
} if (g->copy_from_count > 0) { w(g, "import org.tartarus.snowball.SnowballProgram;~N~N"); } w(g, "/**~N" " * This class implements the stemming algorithm defined by a snowball script.~N" " *

~N" " * "); write_generated_comment_content(g); w(g, "~N" " *

~N" " */~N" "@SuppressWarnings(\"unused\")~N" "public class ~n extends "); w(g, g->options->parent_class_name); w(g, " {~+~N" "~N" "~Mprivate static final long serialVersionUID = 1L;~N" "~N"); } static void generate_class_end(struct generator * g) { w(g, "~N}"); w(g, "~N~N"); } static void generate_equals(struct generator * g) { w(g, "~N" "@Override~N" "~Mpublic boolean equals( Object o ) {~N" "~+~Mreturn o instanceof "); w(g, g->options->name); w(g, ";~N~-~M}~N" "~N" "@Override~N" "~Mpublic int hashCode() {~N" "~+~Mreturn "); w(g, g->options->name); w(g, ".class.getName().hashCode();~N" "~-~M}~N"); w(g, "~N~N"); } static void generate_among_table(struct generator * g, struct among * x) { struct amongvec * v = x->b; g->I[0] = x->number; w(g, "~Mprivate final static Among a_~I0[] = {~N~+"); { int i; for (i = 0; i < x->literalstring_count; i++) { g->I[0] = v->i; g->I[1] = v->result; g->L[0] = v->b; g->S[0] = i < x->literalstring_count - 1 ? "," : ""; w(g, "~Mnew Among(~L0, ~I0, ~I1"); if (v->function != 0) { w(g, ", \""); write_varname(g, v->function); w(g, "\", ~n.class"); } w(g, ")~S0~N"); v++; } } w(g, "~-~M};~N~N"); } static void generate_amongs(struct generator * g) { struct among * x; for (x = g->analyser->amongs; x; x = x->next) { generate_among_table(g, x); } } static void set_bit(symbol * b, int i) { b[i/8] |= 1 << i%8; } static void generate_grouping_table(struct generator * g, struct grouping * q) { int range = q->largest_ch - q->smallest_ch + 1; int size = (range + 7)/ 8; /* assume 8 bits per symbol */ symbol * b = q->b; symbol * map = create_b(size); int i; for (i = 0; i < size; i++) map[i] = 0; /* Using unicode would require revision here */ for (i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); g->V[0] = q->name; w(g, "~Mprivate static final char ~V0[] = {"); for (i = 0; i < size; i++) { write_int(g, map[i]); if (i < size - 1) w(g, ", "); } w(g, " };~N~N"); lose_b(map); } static void generate_groupings(struct generator * g) { struct 
grouping * q; for (q = g->analyser->groupings; q; q = q->next) { if (q->name->used) generate_grouping_table(g, q); } } static void generate_members(struct generator * g) { struct name * q; for (q = g->analyser->names; q; q = q->next) { g->V[0] = q; switch (q->type) { case t_string: w(g, "~Mprivate "); w(g, g->options->string_class); w(g, " ~W0 = new "); w(g, g->options->string_class); w(g, "();~N"); break; case t_integer: w(g, "~Mprivate int ~W0;~N"); break; case t_boolean: w(g, "~Mprivate boolean ~W0;~N"); break; } } w(g, "~N"); } static void generate_methods(struct generator * g) { struct node * p; for (p = g->analyser->program; p; p = p->right) { generate(g, p); g->unreachable = false; } } extern void generate_program_java(struct generator * g) { g->outbuf = str_new(); g->failure_str = str_new(); generate_amongs(g); generate_groupings(g); generate_members(g); generate_methods(g); generate_equals(g); generate_class_end(g); { /* We need to call generate_class_begin() after we've generated the * methods so we know if copy_from_count > 0. 
*/ struct str * body = g->outbuf; g->outbuf = str_new(); write_start_comment(g, "// ", NULL); generate_class_begin(g); str_append(g->outbuf, body); str_delete(body); } output_str(g->options->output_src, g->outbuf); str_delete(g->failure_str); str_delete(g->outbuf); } snowball-2.2.0/compiler/generator_js.c000066400000000000000000001111701414263061200177740ustar00rootroot00000000000000 #include /* for exit */ #include /* for strlen */ #include /* for fprintf etc */ #include "header.h" /* prototypes */ static void generate(struct generator * g, struct node * p); static void w(struct generator * g, const char * s); static void writef(struct generator * g, const char * s, struct node * p); static int new_label(struct generator * g) { return g->next_label++; } static struct str * vars_newname(struct generator * g) { struct str * output; g->var_number++; output = str_new(); str_append_string(output, "v_"); str_append_int(output, g->var_number); return output; } /* Write routines for items from the syntax tree */ static void write_varname(struct generator * g, struct name * p) { int ch = "SBIrxg"[p->type]; if (p->type != t_external) { write_char(g, ch); write_char(g, '_'); } write_b(g, p->b); } static void write_varref(struct generator * g, struct name * p) { write_varname(g, p); } static void write_hexdigit(struct generator * g, int n) { write_char(g, n < 10 ? 
n + '0' : n - 10 + 'A'); } static void write_hex(struct generator * g, int ch) { write_string(g, "\\u"); { int i; for (i = 12; i >= 0; i -= 4) write_hexdigit(g, ch >> i & 0xf); } } static void write_literal_string(struct generator * g, symbol * p) { int i; write_string(g, "\""); for (i = 0; i < SIZE(p); i++) { int ch = p[i]; if (32 <= ch && ch < 127) { if (ch == '\"' || ch == '\\') write_string(g, "\\"); write_char(g, ch); } else { write_hex(g, ch); } } write_string(g, "\""); } static void write_margin(struct generator * g) { int i; for (i = 0; i < g->margin; i++) write_string(g, " "); } static void write_comment(struct generator * g, struct node * p) { if (!g->options->comments) return; write_margin(g); write_string(g, "// "); write_comment_content(g, p); write_newline(g); } static void write_block_start(struct generator * g) { w(g, "~M{~+~N"); } static void write_block_end(struct generator * g) /* block end */ { w(g, "~-~M}~N"); } static void write_savecursor(struct generator * g, struct node * p, struct str * savevar) { g->B[0] = str_data(savevar); g->S[1] = ""; if (p->mode != m_forward) g->S[1] = "base.limit - "; writef(g, "~Mvar /** number */ ~B0 = ~S1base.cursor;~N", p); } static void restore_string(struct node * p, struct str * out, struct str * savevar) { str_clear(out); str_append_string(out, "base.cursor = "); if (p->mode != m_forward) str_append_string(out, "base.limit - "); str_append(out, savevar); str_append_string(out, ";"); } static void write_restorecursor(struct generator * g, struct node * p, struct str * savevar) { struct str * temp = str_new(); write_margin(g); restore_string(p, temp, savevar); write_str(g, temp); write_newline(g); str_delete(temp); } static void write_inc_cursor(struct generator * g, struct node * p) { write_margin(g); write_string(g, p->mode == m_forward ? 
"base.cursor++;" : "base.cursor--;"); write_newline(g); } static void wsetlab_begin(struct generator * g, int n) { g->I[0] = n; w(g, "~Mlab~I0: {~N~+"); } static void wsetlab_end(struct generator * g) { w(g, "~-~M}~N"); } static void wgotol(struct generator * g, int n) { write_margin(g); write_string(g, "break lab"); write_int(g, n); write_string(g, ";"); write_newline(g); } static void write_failure(struct generator * g) { if (str_len(g->failure_str) != 0) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } write_margin(g); switch (g->failure_label) { case x_return: write_string(g, "return false;"); g->unreachable = true; break; default: write_string(g, "break lab"); write_int(g, g->failure_label); write_string(g, ";"); g->unreachable = true; } write_newline(g); } static void write_failure_if(struct generator * g, char * s, struct node * p) { writef(g, "~Mif (", p); writef(g, s, p); writef(g, ")~N", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { if (p->mode == m_forward) { write_failure_if(g, "base.cursor >= base.limit", p); } else { write_failure_if(g, "base.cursor <= base.limit_backward", p); } } /* Formatted write. 
*/ static void writef(struct generator * g, const char * input, struct node * p) { int i = 0; int l = strlen(input); while (i < l) { int ch = input[i++]; if (ch != '~') { write_char(g, ch); continue; } switch (input[i++]) { default: write_char(g, input[i - 1]); continue; case 'C': write_comment(g, p); continue; case 'f': write_block_start(g); write_failure(g); g->unreachable = false; write_block_end(g); continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': write_string(g, g->S[input[i++] - '0']); continue; case 'B': write_b(g, g->B[input[i++] - '0']); continue; case 'I': write_int(g, g->I[input[i++] - '0']); continue; case 'V': write_varref(g, g->V[input[i++] - '0']); continue; case 'W': write_varname(g, g->V[input[i++] - '0']); continue; case 'L': write_literal_string(g, g->L[input[i++] - '0']); continue; case '+': g->margin++; continue; case '-': g->margin--; continue; case 'n': write_string(g, g->options->name); continue; case 'P': write_string(g, g->options->parent_class_name); continue; } } } static void w(struct generator * g, const char * s) { writef(g, s, 0); } static void generate_AE(struct generator * g, struct node * p) { const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case c_number: write_int(g, p->number); break; case c_maxint: write_string(g, "(-1>>>1)"); break; case c_minint: write_string(g, "(~(-1>>>1))"); break; case c_neg: write_char(g, '-'); generate_AE(g, p->right); break; case c_multiply: s = " * "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; label0: write_char(g, '('); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_divide: /* Snowball specifies integer division with semantics matching C, * so we need to use `Math.trunc(x/y)` here. 
*/ write_string(g, "Math.trunc("); generate_AE(g, p->left); write_string(g, " / "); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "base.cursor"); break; case c_limit: w(g, p->mode == m_forward ? "base.limit" : "base.limit_backward"); break; case c_lenof: /* Same as sizeof() for Javascript. */ case c_sizeof: g->V[0] = p->name; w(g, "~V0.length"); break; case c_len: /* Same as size() for Javascript. */ case c_size: w(g, "base.current.length"); break; } } static void generate_bra(struct generator * g, struct node * p) { write_comment(g, p); p = p->left; while (p) { generate(g, p); p = p->right; } } static void generate_and(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) write_savecursor(g, p, savevar); p = p->left; while (p) { generate(g, p); if (g->unreachable) break; if (keep_c && p->right != 0) write_restorecursor(g, p, savevar); p = p->right; } str_delete(savevar); } static void generate_or(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int out_lab = new_label(g); int end_unreachable = true; write_comment(g, p); wsetlab_begin(g, out_lab); if (keep_c) write_savecursor(g, p, savevar); p = p->left; str_clear(g->failure_str); if (p == 0) { /* p should never be 0 after an or: there should be at least two * sub nodes. 
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right != 0) { g->failure_label = new_label(g); wsetlab_begin(g, g->failure_label); generate(g, p); if (!g->unreachable) { wgotol(g, out_lab); end_unreachable = false; } wsetlab_end(g); g->unreachable = false; if (keep_c) write_restorecursor(g, p, savevar); p = p->right; } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); wsetlab_end(g); if (!end_unreachable) { g->unreachable = false; } str_delete(savevar); } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mbase.limit_backward = base.cursor; base.cursor = base.limit;~N", p); generate(g, p->left); w(g, "~Mbase.cursor = base.limit_backward;~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); write_comment(g, p); if (keep_c) { write_block_start(g); write_savecursor(g, p, savevar); } g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; if (!g->unreachable) write_failure(g); wsetlab_end(g); g->unreachable = false; if (keep_c) write_restorecursor(g, p, savevar); if (keep_c) write_block_end(g); str_delete(savevar); } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) write_savecursor(g, p, savevar); g->failure_label = new_label(g); str_clear(g->failure_str); if (keep_c) restore_string(p, g->failure_str, savevar); wsetlab_begin(g, g->failure_label); generate(g, p->left); wsetlab_end(g); g->unreachable = false; str_delete(savevar); } static void generate_set(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] 
= p->name; writef(g, "~M~V0 = true;~N", p); } static void generate_unset(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = false;~N", p); } static void generate_fail(struct generator * g, struct node * p) { write_comment(g, p); generate(g, p->left); if (!g->unreachable) write_failure(g); } /* generate_test() also implements 'reverse' */ static void generate_test(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) { write_savecursor(g, p, savevar); } generate(g, p->left); if (!g->unreachable) { if (keep_c) { write_restorecursor(g, p, savevar); } } str_delete(savevar); } static void generate_do(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) write_savecursor(g, p, savevar); if (p->left->type == c_call) { /* Optimise do */ write_comment(g, p->left); g->V[0] = p->left->name; w(g, "~M~V0();~N"); } else { g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); wsetlab_end(g); g->unreachable = false; } if (keep_c) write_restorecursor(g, p, savevar); str_delete(savevar); } static void generate_GO(struct generator * g, struct node * p, int style) { int end_unreachable = false; struct str * savevar = vars_newname(g); int keep_c = style == 1 || repeat_restore(g, p->left); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int golab = new_label(g); g->I[0] = golab; write_comment(g, p); w(g, "~Mgolab~I0: while(true)~N"); w(g, "~{"); if (keep_c) write_savecursor(g, p, savevar); g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); if (g->unreachable) { /* Cannot break out of this loop: therefore the code after the * end of the loop is unreachable.*/ end_unreachable = true; } else { /* include 
/* Shared JavaScript code generation for `repeat C` and `atleast N C`.
 *
 * Emits a `while(true)` loop whose body runs C.  If C can complete, the
 * generated code executes `continue` to go round again; if C fails, control
 * arrives after the labelled region, the cursor is restored (when
 * repeat_restore() reports that is needed) and the loop is left with
 * `break`.
 *
 * loopvar is NULL for `repeat`.  For `atleast` it names the counter
 * variable, which the generated code decrements after each successful
 * iteration.
 *
 * Note: g->failure_label and g->failure_str are overwritten here and not
 * restored; callers in this file either save them first (generate_atleast)
 * or are reached via generate(), which saves and restores them itself.
 */
static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) {
    struct str * savevar = vars_newname(g);
    int keep_c = repeat_restore(g, p->left);
    writef(g, "~Mwhile(true)~N~{", p);

    if (keep_c) write_savecursor(g, p, savevar);

    /* Failures inside the body jump to a new label with no extra cleanup. */
    g->failure_label = new_label(g);
    str_clear(g->failure_str);
    wsetlab_begin(g, g->failure_label);
    generate(g, p->left);

    if (!g->unreachable) {
        /* Body can finish normally: count the iteration (atleast only) and
         * loop again. */
        if (loopvar != 0) {
            g->B[0] = str_data(loopvar);
            w(g, "~M~B0--;~N");
        }

        w(g, "~Mcontinue;~N");
    }

    wsetlab_end(g);
    /* The failure path lands here, so the code that follows is reachable. */
    g->unreachable = false;

    if (keep_c) write_restorecursor(g, p, savevar);

    w(g, "~Mbreak;~N~}");
    str_delete(savevar);
}
str_data(loopvar); write_failure_if(g, "~B0 > 0", p); w(g, "~}"); str_delete(loopvar); } static void generate_setmark(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = base.cursor;~N", p); } static void generate_tomark(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? ">" : "<"; w(g, "~Mif (base.cursor ~S0 "); generate_AE(g, p->AE); w(g, ")~N"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; w(g, "~Mbase.cursor = "); generate_AE(g, p->AE); writef(g, ";~N", p); } static void generate_atmark(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mif (base.cursor != "); generate_AE(g, p->AE); writef(g, ")~N", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } static void generate_hop(struct generator * g, struct node * p) { int c_count = ++g->keep_count; write_comment(g, p); g->S[0] = p->mode == m_forward ? "+" : "-"; g->I[0] = c_count; w(g, "~{~Mvar /** number */ c~I0 = base.cursor ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); g->I[0] = c_count; g->S[1] = p->mode == m_forward ? "> base.limit" : "< base.limit_backward"; g->S[2] = p->mode == m_forward ? "<" : ">"; if (p->AE->type == c_number) { // Constant distance hop. // // No need to check for negative hop as that's converted to false by // the analyser. 
write_failure_if(g, "c~I0 ~S1", p); } else { write_failure_if(g, "c~I0 ~S1 || c~I0 ~S2 base.cursor", p); } writef(g, "~Mbase.cursor = c~I0;~N", p); writef(g, "~}", p); } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mif (!base.slice_del())~N" "~M{~N" "~+~Mreturn false;~N~-" "~M}~N", p); } static void generate_next(struct generator * g, struct node * p) { write_comment(g, p); write_check_limit(g, p); write_inc_cursor(g, p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); if (p->mode == m_forward) { writef(g, "~Mbase.cursor = base.limit;~N", p); } else { writef(g, "~Mbase.cursor = base.limit_backward;~N", p); } } static void generate_atlimit(struct generator * g, struct node * p) { write_comment(g, p); if (p->mode == m_forward) { write_failure_if(g, "base.cursor < base.limit", p); } else { write_failure_if(g, "base.cursor > base.limit_backward", p); } } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); if (p->mode == m_forward) { writef(g, "~Mbase.bra = base.cursor;~N", p); } else { writef(g, "~Mbase.ket = base.cursor;~N", p); } } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); if (p->mode == m_forward) { writef(g, "~Mbase.ket = base.cursor;~N", p); } else { writef(g, "~Mbase.bra = base.cursor;~N", p); } } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = base.assign_to();~N", p); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = base.slice_to();~N" "~Mif (~V0 == '')~N" "~M{~N" "~+~Mreturn false;~N~-" "~M}~N", p); } static void generate_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != 0) { write_literal_string(g, b); } else { write_varref(g, p->name); } } static void 
generate_insert(struct generator * g, struct node * p, int style) { int c_count; int keep_c = style == c_attach; write_comment(g, p); if (p->mode == m_backward) keep_c = !keep_c; if (keep_c) { c_count = ++g->keep_count; g->I[0] = c_count; w(g, "~{~Mvar /** number */ c~I0 = base.cursor;~N"); } writef(g, "~Mbase.insert(base.cursor, base.cursor, ", p); generate_address(g, p); writef(g, ");~N", p); if (keep_c) { g->I[0] = c_count; w(g, "~Mbase.cursor = c~I0;~N~}"); } } static void generate_assignfrom(struct generator * g, struct node * p) { int c_count; int keep_c = p->mode == m_forward; /* like 'attach' */ write_comment(g, p); if (keep_c) { c_count = ++g->keep_count; g->I[0] = c_count; w(g, "~{~Mvar /** number */ c~I0 = base.cursor;~N"); } if (p->mode == m_forward) { writef(g, "~Mbase.insert(base.cursor, base.limit, ", p); } else { writef(g, "~Mbase.insert(base.limit_backward, base.cursor, ", p); } generate_address(g, p); writef(g, ");~N", p); if (keep_c) { g->I[0] = c_count; w(g, "~Mbase.cursor = c~I0;~N~}"); } } static void generate_slicefrom(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mif (!base.slice_from("); generate_address(g, p); writef(g, "))~N" "~M{~N" "~+~Mreturn false;~N~-" "~M}~N", p); } static void generate_setlimit(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); struct str * varname = vars_newname(g); write_comment(g, p); if (p->left && p->left->type == c_tomark) { /* Special case for: * * setlimit tomark AE for C * * All uses of setlimit in the current stemmers we ship follow this * pattern, and by special-casing we can avoid having to save and * restore c. */ struct node * q = p->left; g->S[0] = q->mode == m_forward ? 
">" : "<"; w(g, "~Mif (base.cursor ~S0 "); generate_AE(g, q->AE); w(g, ")~N"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; g->B[0] = str_data(varname); w(g, "~Mvar /** number */ ~B0 = "); if (p->mode == m_forward) { w(g, "base.limit - base.cursor;~N"); w(g, "~Mbase.limit = "); } else { w(g, "base.limit_backward;~N"); w(g, "~Mbase.limit_backward = "); } generate_AE(g, q->AE); writef(g, ";~N", q); if (p->mode == m_forward) { str_assign(g->failure_str, "base.limit += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "base.limit_backward = "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } else { write_savecursor(g, p, savevar); generate(g, p->left); if (!g->unreachable) { g->B[0] = str_data(varname); w(g, "~Mvar /** number */ ~B0 = "); if (p->mode == m_forward) { w(g, "base.limit - base.cursor;~N"); w(g, "~Mbase.limit = base.cursor;~N"); } else { w(g, "base.limit_backward;~N"); w(g, "~Mbase.limit_backward = base.cursor;~N"); } write_restorecursor(g, p, savevar); if (p->mode == m_forward) { str_assign(g->failure_str, "base.limit += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "base.limit_backward = "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } } } if (!g->unreachable) { generate(g, p->aux); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } } str_delete(varname); str_delete(savevar); } /* dollar sets snowball up to operate on a string variable as if it were the * current string */ static void generate_dollar(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); g->B[0] = str_data(savevar); writef(g, "~{~C~N" "~Mvar /** !Object */ ~B0 = new BaseStemmer();~N", p); writef(g, "~M~B0.copy_from(base);~N", p); ++g->copy_from_count; str_assign(g->failure_str, "base.copy_from("); 
str_append(g->failure_str, savevar); str_append_string(g->failure_str, ");"); g->V[0] = p->name; writef(g, "~Mbase.current = ~V0;~N" "~Mbase.cursor = 0;~N" "~Mbase.limit_backward = 0;~N" "~Mbase.limit = base.current.length;~N", p); generate(g, p->left); if (!g->unreachable) { g->V[0] = p->name; writef(g, "~M~V0 = base.current;~N", p); write_margin(g); write_str(g, g->failure_str); write_newline(g); } w(g, "~}"); str_delete(savevar); } static void generate_integer_assign(struct generator * g, struct node * p, char * s) { g->V[0] = p->name; g->S[0] = s; w(g, "~M~V0 ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); } static void generate_integer_test(struct generator * g, struct node * p, char * s) { w(g, "~Mif (!("); generate_AE(g, p->left); write_char(g, ' '); write_string(g, s); write_char(g, ' '); generate_AE(g, p->AE); w(g, "))~N"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } static void generate_call(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; write_failure_if(g, "!~V0()", p); } static void generate_grouping(struct generator * g, struct node * p, int complement) { struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "out" : "in"; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; write_failure_if(g, "!(base.~S1_grouping~S0(~V0, ~I0, ~I1))", p); } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; g->V[0] = p->name; write_failure_if(g, "!(base.eq_s~S0(~V0))", p); } static void generate_literalstring(struct generator * g, struct node * p) { symbol * b = p->literalstring; write_comment(g, p); g->S[0] = p->mode == m_forward ? 
/* Emit the JavaScript function for one Snowball routine or external
 * definition.
 *
 * The function header is written to the current output buffer.  The body is
 * then generated into fresh g->outbuf / g->declarations buffers, which are
 * appended (declarations first) to the original output once the body is
 * complete, after which the original buffers are reinstated.
 *
 * Per-definition state is reset before generating the body: label and
 * variable counters, failure string, failure label (x_return), reachability
 * and keep_count.
 */
static void generate_define(struct generator * g, struct node * p) {
    struct name * q = p->name;

    struct str * saved_output = g->outbuf;
    struct str * saved_declarations = g->declarations;

    g->V[0] = q;
    /* t_routine becomes a local function; anything else is attached to
     * `this`. */
    if (q->type == t_routine) {
        w(g, "~N~M/** @return {boolean} */~N"
             "~Mfunction ~W0() {~+~N");
    } else {
        w(g, "~N~Mthis.~W0 = /** @return {boolean} */ function() {~+~N");
    }

    /* From here on, output goes to temporary buffers. */
    g->outbuf = str_new();
    g->declarations = str_new();

    g->next_label = 0;
    g->var_number = 0;

    if (p->amongvar_needed) {
        w(g, "~Mvar /** number */ among_var;~N");
    }
    str_clear(g->failure_str);
    g->failure_label = x_return;
    g->unreachable = false;
    g->keep_count = 0;
    generate(g, p->left);
    /* Falling off the end of the body means the routine succeeded. */
    if (!g->unreachable) w(g, "~Mreturn true;~N");
    w(g, "~-~M};~N");

    /* Splice declarations, then the body, after the header and restore the
     * original buffers. */
    str_append(saved_output, g->declarations);
    str_append(saved_output, g->outbuf);
    str_delete(g->declarations);
    str_delete(g->outbuf);
    g->declarations = saved_declarations;
    g->outbuf = saved_output;
}
/* Generate JavaScript for syntax tree node p by dispatching on p->type.
 *
 * Nothing is generated if the current position in the output is already
 * known to be unreachable.
 *
 * g->failure_label and g->failure_str are saved before dispatching and
 * restored afterwards, so the per-command generators may overwrite them
 * freely.  An unrecognised node type is reported on stderr and terminates
 * the program with exit status 1.
 */
static void generate(struct generator * g, struct node * p) {
    int a0;
    struct str * a1;

    if (g->unreachable) return;

    /* Snapshot the failure context (a1 is a copy owned by this call). */
    a0 = g->failure_label;
    a1 = str_copy(g->failure_str);

    switch (p->type) {
        case c_define:        generate_define(g, p); break;
        case c_bra:           generate_bra(g, p); break;
        case c_and:           generate_and(g, p); break;
        case c_or:            generate_or(g, p); break;
        case c_backwards:     generate_backwards(g, p); break;
        case c_not:           generate_not(g, p); break;
        case c_set:           generate_set(g, p); break;
        case c_unset:         generate_unset(g, p); break;
        case c_try:           generate_try(g, p); break;
        case c_fail:          generate_fail(g, p); break;
        case c_reverse:
        case c_test:          generate_test(g, p); break;
        case c_do:            generate_do(g, p); break;
        /* Third argument is the style: 1 for goto, 0 for gopast. */
        case c_goto:          generate_GO(g, p, 1); break;
        case c_gopast:        generate_GO(g, p, 0); break;
        case c_repeat:        generate_repeat(g, p); break;
        case c_loop:          generate_loop(g, p); break;
        case c_atleast:       generate_atleast(g, p); break;
        case c_setmark:       generate_setmark(g, p); break;
        case c_tomark:        generate_tomark(g, p); break;
        case c_atmark:        generate_atmark(g, p); break;
        case c_hop:           generate_hop(g, p); break;
        case c_delete:        generate_delete(g, p); break;
        case c_next:          generate_next(g, p); break;
        case c_tolimit:       generate_tolimit(g, p); break;
        case c_atlimit:       generate_atlimit(g, p); break;
        case c_leftslice:     generate_leftslice(g, p); break;
        case c_rightslice:    generate_rightslice(g, p); break;
        case c_assignto:      generate_assignto(g, p); break;
        case c_sliceto:       generate_sliceto(g, p); break;
        case c_assign:        generate_assignfrom(g, p); break;
        /* insert and attach share a generator; the node type selects the
         * variant. */
        case c_insert:
        case c_attach:        generate_insert(g, p, p->type); break;
        case c_slicefrom:     generate_slicefrom(g, p); break;
        case c_setlimit:      generate_setlimit(g, p); break;
        case c_dollar:        generate_dollar(g, p); break;
        case c_mathassign:    generate_integer_assign(g, p, "="); break;
        case c_plusassign:    generate_integer_assign(g, p, "+="); break;
        case c_minusassign:   generate_integer_assign(g, p, "-="); break;
        case c_multiplyassign:generate_integer_assign(g, p, "*="); break;
        case c_divideassign:
            /* Snowball specifies integer division with semantics matching C,
             * so we need to use `Math.trunc(x/y)` here.
             */
            g->V[0] = p->name;
            w(g, "~M~V0 = Math.trunc(~V0 / ");
            generate_AE(g, p->AE);
            w(g, ");~N");
            break;
        case c_eq:            generate_integer_test(g, p, "=="); break;
        case c_ne:            generate_integer_test(g, p, "!="); break;
        case c_gr:            generate_integer_test(g, p, ">"); break;
        case c_ge:            generate_integer_test(g, p, ">="); break;
        case c_ls:            generate_integer_test(g, p, "<"); break;
        case c_le:            generate_integer_test(g, p, "<="); break;
        case c_call:          generate_call(g, p); break;
        /* Third argument is the complement flag (true for `non`). */
        case c_grouping:      generate_grouping(g, p, false); break;
        case c_non:           generate_grouping(g, p, true); break;
        case c_name:          generate_namedstring(g, p); break;
        case c_literalstring: generate_literalstring(g, p); break;
        case c_among:         generate_among(g, p); break;
        case c_substring:     generate_substring(g, p); break;
        case c_booltest:      generate_booltest(g, p); break;
        case c_false:         generate_false(g, p); break;
        /* `true` needs no code at all. */
        case c_true:          break;
        case c_debug:         generate_debug(g, p); break;
        default: fprintf(stderr, "%d encountered\n",
                         p->type);
                 exit(1);
    }

    /* Reinstate the caller's failure context; the string the command left
     * behind is released. */
    g->failure_label = a0;
    str_delete(g->failure_str);
    g->failure_str = a1;
}
generator * g) { w(g, "/**@constructor*/~N"); w(g, "~n = function() {~+~N" "~Mvar base = new ~P();~N"); } static void generate_class_end(struct generator * g) { w(g, "~N"); w(g, "~M/**@return{string}*/~N"); w(g, "~Mthis['stemWord'] = function(/**string*/word) {~+~N"); w(g, "~Mbase.setCurrent(word);~N"); w(g, "~Mthis.stem();~N"); w(g, "~Mreturn base.getCurrent();~N"); w(g, "~-~M};~N"); w(g, "~-};~N"); /* w(g, "window['~n'] = ~n;~N"); */ } static void generate_among_table(struct generator * g, struct among * x) { struct amongvec * v = x->b; g->I[0] = x->number; w(g, "~M/** @const */ var a_~I0 = [~N~+"); { int i; for (i = 0; i < x->literalstring_count; i++) { g->I[0] = v->i; g->I[1] = v->result; g->L[0] = v->b; g->S[0] = i < x->literalstring_count - 1 ? "," : ""; w(g, "~M[~L0, ~I0, ~I1"); if (v->function != 0) { w(g, ", "); write_varname(g, v->function); } w(g, "]~S0~N"); v++; } } w(g, "~-~M];~N~N"); } static void generate_amongs(struct generator * g) { struct among * x; for (x = g->analyser->amongs; x; x = x->next) { generate_among_table(g, x); } } static void set_bit(symbol * b, int i) { b[i/8] |= 1 << i%8; } static void generate_grouping_table(struct generator * g, struct grouping * q) { int range = q->largest_ch - q->smallest_ch + 1; int size = (range + 7)/ 8; /* assume 8 bits per symbol */ symbol * b = q->b; symbol * map = create_b(size); int i; for (i = 0; i < size; i++) map[i] = 0; /* Using unicode would require revision here */ for (i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); g->V[0] = q->name; w(g, "~M/** @const */ var /** Array */ ~W0 = ["); for (i = 0; i < size; i++) { write_int(g, map[i]); if (i < size - 1) w(g, ", "); } w(g, "];~N~N"); lose_b(map); } static void generate_groupings(struct generator * g) { struct grouping * q; for (q = g->analyser->groupings; q; q = q->next) { if (q->name->used) generate_grouping_table(g, q); } } static void generate_members(struct generator * g) { struct name * q; for (q = g->analyser->names; q; q = 
q->next) { g->V[0] = q; switch (q->type) { case t_string: w(g, "~Mvar /** string */ ~W0 = '';~N"); break; case t_integer: w(g, "~Mvar /** number */ ~W0 = 0;~N"); break; case t_boolean: w(g, "~Mvar /** boolean */ ~W0 = false;~N"); break; } } w(g, "~N"); } static void generate_methods(struct generator * g) { struct node * p = g->analyser->program; while (p != 0) { generate(g, p); g->unreachable = false; p = p->right; } } extern void generate_program_js(struct generator * g) { g->outbuf = str_new(); g->failure_str = str_new(); write_start_comment(g, "// ", NULL); generate_class_begin(g); generate_amongs(g); generate_groupings(g); generate_members(g); generate_methods(g); generate_class_end(g); output_str(g->options->output_src, g->outbuf); str_delete(g->failure_str); str_delete(g->outbuf); } snowball-2.2.0/compiler/generator_pascal.c000066400000000000000000001170061414263061200206270ustar00rootroot00000000000000#include /* for exit */ #include /* for strlen */ #include /* for fprintf etc */ #include "header.h" #define BASE_UNIT "SnowballProgram" #define BASE_CLASS "T" BASE_UNIT /* prototypes */ static void generate(struct generator * g, struct node * p); static void w(struct generator * g, const char * s); static void writef(struct generator * g, const char * s, struct node * p); static int new_label(struct generator * g) { return g->next_label++; } static struct str * vars_newname(struct generator * g) { struct str * output; g->var_number++; output = str_new(); str_append_string(output, "v_"); str_append_int(output, g->var_number); return output; } /* Write routines for items from the syntax tree */ static void write_varname(struct generator * g, struct name * p) { if (p->type != t_external) { /* Pascal identifiers are case-insensitive but Snowball identifiers * should be case-sensitive. To address this, we encode the case of * the identifier. For readability of the generated code, the * encoding tries to be minimally intrusive for common cases. 
* * After the letter which indicates the type and before the "_" we * encode the case pattern in the Snowball identifier using "U" for * an upper-case letter, "l" for a lower-case letter and nothing for * other characters. Any trailing string of "l" is omitted (since * it's redundant and decreases readability). * * Identifiers without any upper-case encode most simply, e.g. I_foo2 * * A capitalised identifier is also concise, e.g. IU_Foo2 * * All-caps gives a string of Us, e.g. IUUUUUUUU_SHOUTING * * But any example can be handled, e.g. IUllU_Foo79_Bar * * We don't try to solve this problem for external identifiers - it * seems more helpful to leave those alone and encourage snowball * program authors to avoid naming externals which only differ by * case. */ int i, len = SIZE(p->b); int lower_pending = 0; write_char(g, "SBIrxg"[p->type]); for (i = 0; i != len; ++i) { int ch = p->b[i]; if (ch >= 'a' && ch <= 'z') { ++lower_pending; } else if (ch >= 'A' && ch <= 'Z') { while (lower_pending) { write_char(g, 'l'); --lower_pending; } write_char(g, 'U'); } } write_char(g, '_'); } write_b(g, p->b); } static void write_literal_string(struct generator * g, symbol * p) { int i; write_char(g, '\''); for (i = 0; i < SIZE(p); i++) { int ch = p[i]; if (ch == '\'') { write_string(g, "''"); } else if (32 <= ch && ch < 127) { write_char(g, ch); } else { write_char(g, '\''); write_char(g, '#'); write_int (g, ch); write_char(g, '\''); } } write_char(g, '\''); } static void write_margin(struct generator * g) { int i; for (i = 0; i < g->margin; i++) write_string(g, " "); } /* Write a variable declaration. 
*/ static void write_declare(struct generator * g, char * declaration, struct node * p) { struct str * temp = g->outbuf; g->outbuf = g->declarations; write_string(g, " "); writef(g, declaration, p); write_string(g, ";"); write_newline(g); g->outbuf = temp; } static void write_comment(struct generator * g, struct node * p) { if (!g->options->comments) return; write_margin(g); write_string(g, "{ "); write_comment_content(g, p); write_string(g, " }"); write_newline(g); } static void write_block_start(struct generator * g) { w(g, "~MBegin~+~N"); } static void write_block_end(struct generator * g) { /* block end */ w(g, "~-~MEnd;~N"); } static void write_savecursor(struct generator * g, struct node * p, struct str * savevar) { g->B[0] = str_data(savevar); g->S[1] = ""; if (p->mode != m_forward) g->S[1] = "FLimit - "; write_declare(g, "~B0 : Integer", p); writef(g, "~M~B0 := ~S1FCursor;~N" , p); } static void restore_string(struct node * p, struct str * out, struct str * savevar) { str_clear(out); str_append_string(out, "FCursor := "); if (p->mode != m_forward) str_append_string(out, "FLimit - "); str_append(out, savevar); str_append_string(out, ";"); } static void write_restorecursor(struct generator * g, struct node * p, struct str * savevar) { struct str * temp = str_new(); write_margin(g); restore_string(p, temp, savevar); write_str(g, temp); write_newline(g); str_delete(temp); } static void write_inc_cursor(struct generator * g, struct node * p) { write_margin(g); write_string(g, p->mode == m_forward ? 
"Inc(FCursor);" : "Dec(FCursor);"); write_newline(g); } static void wsetlab_begin(struct generator * g) { w(g, "~MRepeat~N~+"); } static void wsetlab_end(struct generator * g, int n) { w(g, "~-~MUntil True;~N"); w(g, "lab"); write_int(g, n); w(g, ":~N"); } static void wgotol(struct generator * g, int n) { write_margin(g); write_string(g, "goto lab"); write_int(g, n); write_string(g, ";"); write_newline(g); } static void write_failure(struct generator * g) { if (str_len(g->failure_str) != 0) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } write_margin(g); switch (g->failure_label) { case x_return: write_string(g, "Begin Result := False; Exit; End;"); break; default: write_string(g, "goto lab"); write_int(g, g->failure_label); write_string(g, ";"); } write_newline(g); g->unreachable = true; } static void write_failure_if(struct generator * g, char * s, struct node * p) { writef(g, "~MIf (", p); writef(g, s, p); writef(g, ") Then~N", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { if (p->mode == m_forward) { write_failure_if(g, "FCursor >= FLimit", p); } else { write_failure_if(g, "FCursor <= FBkLimit", p); } } /* Formatted write. 
/* Write the template string input to the output, expanding `~` escapes.
 *
 * Characters other than `~` are copied through unchanged.  The character
 * following a `~` selects an action:
 *
 *   ~C   comment for node p (via write_comment)
 *   ~f   a block containing the failure code
 *   ~M   the current margin
 *   ~N   a newline
 *   ~{   block start        ~}   block end
 *   ~+   increase margin    ~-   decrease margin
 *   ~n   g->options->name
 *   ~Sd  string g->S[d]     ~Bd  g->B[d] (via write_b)
 *   ~Id  integer g->I[d]    ~Ld  literal string g->L[d]
 *   ~Vd, ~Wd  variable name g->V[d] (both are handled identically here)
 *
 * where d is a single digit.  Any other character after `~` is written
 * literally.
 *
 * p is only used by ~C; w() passes 0 for it.
 *
 * NOTE(review): the digit after S/B/I/V/W/L is not validated and the end of
 * the string is not checked after a `~`, so templates are assumed to be
 * well-formed.
 */
static void writef(struct generator * g, const char * input, struct node * p) {
    int i = 0;
    int l = strlen(input);

    while (i < l) {
        int ch = input[i++];
        if (ch != '~') {
            write_char(g, ch);
            continue;
        }
        switch (input[i++]) {
            default: write_char(g, input[i - 1]); continue;
            case 'C': write_comment(g, p); continue;
            /* write_failure() marks the output unreachable; that is undone
             * because only the emitted block itself fails. */
            case 'f': write_block_start(g);
                      write_failure(g);
                      g->unreachable = false;
                      write_block_end(g);
                      continue;
            case 'M': write_margin(g); continue;
            case 'N': write_newline(g); continue;
            case '{': write_block_start(g); continue;
            case '}': write_block_end(g); continue;
            /* The next character is consumed as an index digit. */
            case 'S': write_string(g, g->S[input[i++] - '0']); continue;
            case 'B': write_b(g, g->B[input[i++] - '0']); continue;
            case 'I': write_int(g, g->I[input[i++] - '0']); continue;
            case 'V':
            case 'W': write_varname(g, g->V[input[i++] - '0']); continue;
            case 'L': write_literal_string(g, g->L[input[i++] - '0']); continue;
            case '+': g->margin++; continue;
            case '-': g->margin--; continue;
            case 'n': write_string(g, g->options->name); continue;
        }
    }
}
"FLimit" : "FBkLimit"); break; case c_len: case c_size: w(g, "Length(current)"); break; case c_lenof: case c_sizeof: g->V[0] = p->name; w(g, "Length(~V0)"); break; } } static void generate_bra(struct generator * g, struct node * p) { write_comment(g, p); p = p->left; while (p) { generate(g, p); p = p->right; } } static void generate_and(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) write_savecursor(g, p, savevar); p = p->left; while (p) { generate(g, p); if (g->unreachable) break; if (keep_c && p->right != 0) write_restorecursor(g, p, savevar); p = p->right; } str_delete(savevar); } static void generate_or(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int out_lab = new_label(g); int end_unreachable = true; write_comment(g, p); wsetlab_begin(g); if (keep_c) write_savecursor(g, p, savevar); p = p->left; str_clear(g->failure_str); if (p == 0) { /* p should never be 0 after an or: there should be at least two * sub nodes. 
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right) { g->failure_label = new_label(g); wsetlab_begin(g); generate(g, p); if (!g->unreachable) { wgotol(g, out_lab); end_unreachable = false; } wsetlab_end(g, g->failure_label); g->unreachable = false; if (keep_c) write_restorecursor(g, p, savevar); p = p->right; } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); wsetlab_end(g, out_lab); if (!end_unreachable) { g->unreachable = false; } str_delete(savevar); } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g,"~MFBkLimit := FCursor; FCursor := FLimit;~N", p); generate(g, p->left); w(g, "~MFCursor := FBkLimit;~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); int a0 = g->failure_label, l; struct str * a1 = str_copy(g->failure_str); write_comment(g, p); if (keep_c) { write_block_start(g); write_savecursor(g, p, savevar); } g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g); l = g->failure_label; generate(g, p->left); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; if (!g->unreachable) write_failure(g); wsetlab_end(g, l); g->unreachable = false; if (keep_c) write_restorecursor(g, p, savevar); if (keep_c) write_block_end(g); str_delete(savevar); } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) write_savecursor(g, p, savevar); g->failure_label = new_label(g); str_clear(g->failure_str); if (keep_c) restore_string(p, g->failure_str, savevar); wsetlab_begin(g); generate(g, p->left); wsetlab_end(g, g->failure_label); g->unreachable = false; str_delete(savevar); } static void generate_set(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, 
"~M~V0 := True;~N", p); } static void generate_unset(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 := False;~N", p); } static void generate_fail(struct generator * g, struct node * p) { write_comment(g, p); generate(g, p->left); if (!g->unreachable) write_failure(g); } /* generate_test() also implements 'reverse' */ static void generate_test(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) { write_savecursor(g, p, savevar); } generate(g, p->left); if (!g->unreachable) { if (keep_c) { write_restorecursor(g, p, savevar); } } str_delete(savevar); } static void generate_do(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) write_savecursor(g, p, savevar); if (p->left->type == c_call) { /* Optimise do */ write_comment(g, p->left); g->V[0] = p->left->name; w(g, "~M~V0();~N"); } else { g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g); generate(g, p->left); wsetlab_end(g, g->failure_label); g->unreachable = false; } if (keep_c) write_restorecursor(g, p, savevar); str_delete(savevar); } static void generate_GO(struct generator * g, struct node * p, int style) { int end_unreachable = false; struct str * savevar = vars_newname(g); int keep_c = style == 1 || repeat_restore(g, p->left); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int golab = new_label(g); write_comment(g, p); w(g, "~MWhile True Do~N"); w(g, "~{"); if (keep_c) write_savecursor(g, p, savevar); g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g); generate(g, p->left); if (g->unreachable) { /* Cannot break out of this loop: therefore the code after the * end of the loop is unreachable.*/ end_unreachable = true; } else { /* include for goto; omit for gopast */ if (style == 1) 
/* Shared Pascal code generation for `repeat C` and `atleast N C`.
 *
 * Emits a label followed by a `While True Do` block whose body runs C
 * inside a Repeat ... Until True region.  If C can complete, the generated
 * code jumps back to the label with `goto` to start another iteration.  If
 * C fails, control lands on the failure label written by wsetlab_end(), the
 * cursor is restored (when repeat_restore() reports that is needed) and the
 * loop is left with `Break`.
 *
 * loopvar is NULL for `repeat`.  For `atleast` it names the counter
 * variable, which the generated code decrements after each successful
 * iteration.
 */
static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) {
    struct str * savevar = vars_newname(g);
    int keep_c = repeat_restore(g, p->left);
    /* Label marking the top of the loop; target of the goto below. */
    int replab = new_label(g);
    g->I[0] = replab;
    writef(g, "lab~I0:~N~MWhile True Do~N~{", p);

    if (keep_c) write_savecursor(g, p, savevar);

    /* Failures inside the body jump to a new label with no extra cleanup. */
    g->failure_label = new_label(g);
    str_clear(g->failure_str);
    wsetlab_begin(g);
    generate(g, p->left);

    if (!g->unreachable) {
        /* Body can finish normally: count the iteration (atleast only) and
         * go round again. */
        if (loopvar != 0) {
            g->B[0] = str_data(loopvar);
            w(g, "~MDec(~B0);~N");
        }

        /* g->I[0] is set again in case generating the body changed it. */
        g->I[0] = replab;
        w(g, "~Mgoto lab~I0;~N");
    }

    wsetlab_end(g, g->failure_label);
    /* The failure path lands here, so the code that follows is reachable. */
    g->unreachable = false;

    if (keep_c) write_restorecursor(g, p, savevar);

    w(g, "~MBreak;~N~}");
    str_delete(savevar);
}
loopvar); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } g->B[0] = str_data(loopvar); write_failure_if(g, "~B0 > 0", p); w(g, "~}"); str_delete(loopvar); } static void generate_setmark(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~W0 := FCursor;~N", p); } static void generate_tomark(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? ">" : "<"; w(g, "~MIf (FCursor ~S0 "); generate_AE(g, p->AE); w(g, ") Then~N"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; w(g, "~MFCursor := "); generate_AE(g, p->AE); writef(g, ";~N", p); } static void generate_atmark(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~MIf (FCursor <> "); generate_AE(g, p->AE); writef(g, ") Then~N", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } static void generate_hop(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "+" : "-"; w(g, "~{~MC := FCursor ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); g->S[1] = p->mode == m_forward ? "> FLimit" : "< FBkLimit"; g->S[2] = p->mode == m_forward ? "<" : ">"; if (p->AE->type == c_number) { // Constant distance hop. // // No need to check for negative hop as that's converted to false by // the analyser. write_failure_if(g, "C ~S1", p); } else { write_failure_if(g, "(C ~S1) Or (C ~S2 FCursor)", p); } writef(g, "~MFCursor := C;~N", p); g->temporary_used = true; writef(g, "~}", p); } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~MSliceDel;~N", p); } static void generate_next(struct generator * g, struct node * p) { write_comment(g, p); write_check_limit(g, p); write_inc_cursor(g, p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"FLimit" : "FBkLimit"; writef(g, "~MFCursor := ~S0;~N", p); } static void generate_atlimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "FLimit" : "FBkLimit"; g->S[1] = p->mode == m_forward ? "<" : ">"; write_failure_if(g, "FCursor ~S1 ~S0", p); } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "FBra" : "FKet"; writef(g, "~M~S0 := FCursor;~N", p); } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "FKet" : "FBra"; writef(g, "~M~S0 := FCursor;~N", p); } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 := AssignTo();~N", p); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 := SliceTo();~N", p); } static void generate_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != 0) { write_literal_string(g, b); } else { write_varname(g, p->name); } } static void generate_insert(struct generator * g, struct node * p, int style) { int keep_c = style == c_attach; write_comment(g, p); if (p->mode == m_backward) keep_c = !keep_c; if (keep_c) { w(g, "~{~MC := FCursor;~N"); g->temporary_used = true; } writef(g, "~Minsert(FCursor, FCursor, ", p); generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~MFCursor := C;~N~}"); } static void generate_assignfrom(struct generator * g, struct node * p) { int keep_c = p->mode == m_forward; /* like 'attach' */ write_comment(g, p); if (keep_c) { writef(g, "~{~MC := FCursor;~N", p); g->temporary_used = true; } if (p->mode == m_forward) { writef(g, "~Minsert(FCursor, FLimit, ", p); } else { writef(g, "~Minsert(FBkLimit, FCursor, ", p); } generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~MFCursor := c;~N~}"); } static void 
generate_slicefrom(struct generator * g, struct node * p) {
    /* Pascal '<-' on the slice: emit "SliceFrom(<operand>);". */
    write_comment(g, p);
    w(g, "~MSliceFrom(");
    generate_address(g, p);
    writef(g, ");~N", p);
}

/* Pascal 'setlimit': p->left positions the new limit and p->aux is the
 * command run under it.
 *
 * The previous limit is remembered in a declared Integer variable, and the
 * Pascal statement that undoes the change is stored in g->failure_str so that
 * it is written by any failure inside p->aux as well as after p->aux
 * completes. */
static void generate_setlimit(struct generator * g, struct node * p) {
    struct str * savevar = vars_newname(g);
    struct str * varname = vars_newname(g);
    write_comment(g, p);
    if (p->left && p->left->type == c_tomark) {
        /* Special case for:
         *
         *   setlimit tomark AE for C
         *
         * All uses of setlimit in the current stemmers we ship follow this
         * pattern, and by special-casing we can avoid having to save and
         * restore c.
         */
        struct node * q = p->left;

        /* Same range test as generate_tomark() emits. */
        g->S[0] = q->mode == m_forward ? ">" : "<";
        w(g, "~MIf (FCursor ~S0 ");
        generate_AE(g, q->AE);
        w(g, ") Then~N");
        write_block_start(g);
        write_failure(g);
        write_block_end(g);
        g->unreachable = false;

        g->B[0] = str_data(varname);
        write_declare(g, "~B0 : Integer", p);
        /* NOTE(review): in forward mode the value saved is FLimit - FCursor
         * while FLimit is then set to the AE rather than to FCursor, so the
         * restore "FLimit := FLimit + <saved>" looks as if it only gives back
         * the old limit when the AE equals the cursor - confirm. */
        if (p->mode == m_forward) {
            w(g, "~M~B0 := FLimit - FCursor;~N");
            w(g, "~MFLimit := ");
        } else {
            w(g, "~M~B0 := FBkLimit;~N");
            w(g, "~MFBkLimit := ");
        }
        generate_AE(g, q->AE);
        writef(g, ";~N", q);

        if (p->mode == m_forward) {
            str_assign(g->failure_str, "FLimit := FLimit + ");
            str_append(g->failure_str, varname);
            str_append_ch(g->failure_str, ';');
        } else {
            str_assign(g->failure_str, "FBkLimit := ");
            str_append(g->failure_str, varname);
            str_append_ch(g->failure_str, ';');
        }
    } else {
        /* General case: run p->left to move the cursor, use the resulting
         * cursor as the new limit, then put the cursor back. */
        write_savecursor(g, p, savevar);
        generate(g, p->left);

        if (!g->unreachable) {
            g->B[0] = str_data(varname);
            write_declare(g, "~B0 : Integer", p);
            if (p->mode == m_forward) {
                w(g, "~M~B0 := FLimit - FCursor;~N");
                w(g, "~MFLimit := FCursor;~N");
            } else {
                w(g, "~M~B0 := FBkLimit;~N");
                w(g, "~MFBkLimit := FCursor;~N");
            }
            write_restorecursor(g, p, savevar);

            if (p->mode == m_forward) {
                str_assign(g->failure_str, "FLimit := FLimit + ");
                str_append(g->failure_str, varname);
                str_append_ch(g->failure_str, ';');
            } else {
                str_assign(g->failure_str, "FBkLimit := ");
                str_append(g->failure_str, varname);
                str_append_ch(g->failure_str, ';');
            }
        }
    }

    if
(!g->unreachable) {
        generate(g, p->aux);

        /* On the success path write the limit-restoring statement that was
         * stored in g->failure_str. */
        if (!g->unreachable) {
            write_margin(g);
            write_str(g, g->failure_str);
            write_newline(g);
        }
    }
    str_delete(varname);
    str_delete(savevar);
}

/* dollar sets snowball up to operate on a string variable as if it were the
 * current string */
static void generate_dollar(struct generator * g, struct node * p) {
    struct str * savevar = vars_newname(g);
    g->B[0] = str_data(savevar);
    write_comment(g, p);
    g->V[0] = p->name;

    {
        /* Build the restore statements directly in g->failure_str by
         * pointing g->outbuf at it for the duration of one writef() call. */
        struct str * saved_output = g->outbuf;
        str_clear(g->failure_str);
        g->outbuf = g->failure_str;
        writef(g, "~V0 := FCurrent; "
                  "FCurrent := ~B0_Current; "
                  "FCursor := ~B0_Cursor; "
                  "FLimit := ~B0_Limit; "
                  "FBkLimit := ~B0_BkLimit; "
                  "FBra := ~B0_Bra; "
                  "FKet := ~B0_Ket;", p);
        g->failure_str = g->outbuf;
        g->outbuf = saved_output;
    }

    /* One saved copy of each piece of state that is replaced below. */
    write_declare(g, "~B0_Current : AnsiString", p);
    write_declare(g, "~B0_Cursor : Integer", p);
    write_declare(g, "~B0_Limit : Integer", p);
    write_declare(g, "~B0_BkLimit : Integer", p);
    write_declare(g, "~B0_Bra : Integer", p);
    write_declare(g, "~B0_Ket : Integer", p);
    /* The second template line is emitted between braces, i.e. as a Pascal
     * comment in the generated code. */
    writef(g, "~{"
              "~M~B0_Current := FCurrent;~N"
              "{ ~M~B0_Current := Copy(FCurrent, 1, FLimit); }~N"
              "~M~B0_Cursor := FCursor;~N"
              "~M~B0_Limit := FLimit;~N"
              "~M~B0_BkLimit := FBkLimit;~N"
              "~M~B0_Bra := FBra;~N"
              "~M~B0_Ket := FKet;~N"
              "~MFCurrent := ~V0;~N"
              "~MFCursor := 0;~N"
              "~MFLimit := Length(current);~N", p);
    generate(g, p->left);
    if (!g->unreachable) {
        write_margin(g);
        write_str(g, g->failure_str);
        write_newline(g);
    }
    w(g, "~}");
    str_delete(savevar);
}

/* Emit "<name> := <AE>;" when s is null, or "<name> := <name> <s> <AE>;"
 * when s is an operator string (generate() passes "+", "-", "*" and "div"). */
static void generate_integer_assign(struct generator * g, struct node * p, char * s) {
    g->V[0] = p->name;
    w(g, "~M~W0 := ");

    if (s != 0) {
        g->S[0] = s;
        w(g, "~W0 ~S0 ");
    }

    generate_AE(g, p->AE);
    w(g, ";~N");
}

/* Emit "If Not (<left> <s> <AE>) Then" followed by the failure action, where
 * s is the Pascal comparison operator chosen by generate(). */
static void generate_integer_test(struct generator * g, struct node * p, char * s) {
    w(g, "~MIf Not (");
    generate_AE(g, p->left);
    write_char(g, ' ');
    write_string(g, s);
    write_char(g, ' ');
    generate_AE(g, p->AE);
    w(g, ") Then~N");
    write_block_start(g);
    write_failure(g);
write_block_end(g);
    g->unreachable = false;
}

/* Routine call: emit a failure when the called routine does not return
 * true. */
static void generate_call(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->V[0] = p->name;
    write_failure_if(g, "Not ~V0", p);
}

/* Grouping test: emit a failure unless InGrouping / OutGrouping (the "Bk"
 * variants in backward mode) succeeds for the grouping in p->name.
 * complement selects the Out* form.  The grouping's smallest and largest
 * character codes are passed as arguments. */
static void generate_grouping(struct generator * g, struct node * p, int complement) {
    struct grouping * q = p->name->grouping;
    g->S[0] = p->mode == m_forward ? "" : "Bk";
    g->S[1] = complement ? "Out" : "In";
    g->V[0] = p->name;
    g->I[0] = q->smallest_ch;
    g->I[1] = q->largest_ch;
    write_failure_if(g, "Not (~S1Grouping~S0(~V0, ~I0, ~I1))", p);
}

/* String variable used as a command: emit a failure unless EqV (EqVBk in
 * backward mode) succeeds for it. */
static void generate_namedstring(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->S[0] = p->mode == m_forward ? "" : "Bk";
    g->V[0] = p->name;
    write_failure_if(g, "Not (EqV~S0(~V0))", p);
}

/* Literal string used as a command: emit a failure unless EqS (EqSBk in
 * backward mode) succeeds, passing the literal's length and text. */
static void generate_literalstring(struct generator * g, struct node * p) {
    symbol * b = p->literalstring;
    write_comment(g, p);
    g->S[0] = p->mode == m_forward ? "" : "Bk";
    g->I[0] = SIZE(b);
    g->L[0] = b;
    write_failure_if(g, "Not (EqS~S0(~I0, ~L0))", p);
}

/* Emit the Pascal implementation of one routine.
 *
 * The function header goes straight to the current output.  The body is then
 * generated into a temporary buffer so that the Var and Label sections, which
 * depend on what the body used, can be written in front of it. */
static void generate_define(struct generator * g, struct node * p) {
    struct str *saved_output;
    struct str *saved_declarations;

    /* Generate function header. */
    g->V[0] = p->name;
    w(g, "~N~MFunction T~n.~W0 : Boolean;~N");

    /* Save output*/
    saved_output = g->outbuf;
    saved_declarations = g->declarations;

    g->outbuf = str_new();
    g->declarations = str_new();

    /* Labels and generated variable names are numbered per routine. */
    g->next_label = 0;
    g->var_number = 0;

    /* At routine level a failure means returning from the function. */
    str_clear(g->failure_str);
    g->failure_label = x_return;
    g->unreachable = false;

    /* Generate function body.
*/
    w(g, "~{");
    g->temporary_used = false;
    generate(g, p->left);
    if (!g->unreachable) w(g, "~MResult := True;~N");
    w(g, "~}");

    /* C is the scratch Integer used by the hop/insert/assign generators. */
    if (g->temporary_used) {
        str_append_string(g->declarations, " C : Integer;\n");
    }

    if (p->amongvar_needed) {
        str_append_string(g->declarations, " AmongVar : Integer;\n");
    }

    /* Write the Var section only when something was declared. */
    if (str_len(g->declarations) > 0) {
        str_append_string(saved_output, "Var\n");
        str_append(saved_output, g->declarations);
    }

    /* Declare every label number handed out while generating the body:
     * lab0 .. lab<next_label - 1>, comma separated, ending with ';'. */
    if (g->next_label) {
        int i, num = g->next_label;

        str_append_string(saved_output, "Label\n");

        for (i = 0; i < num; ++i) {
            str_append_string(saved_output, " lab");
            str_append_int(saved_output, i);
            str_append_string(saved_output, i == num - 1 ? ";\n" : ",\n");
        }
    }

    /* Body last, then release the temporaries and restore the real buffers. */
    str_append(saved_output, g->outbuf);
    str_delete(g->declarations);
    str_delete(g->outbuf);
    g->declarations = saved_declarations;
    g->outbuf = saved_output;
}

/* Emit the FindAmong (FindAmongBk in backward mode) call for p->among and a
 * failure when it returns 0.  The result is stored in AmongVar only when the
 * among is flagged as needing it. */
static void generate_substring(struct generator * g, struct node * p) {
    struct among * x = p->among;

    write_comment(g, p);

    g->S[0] = p->mode == m_forward ? "" : "Bk";
    g->I[0] = x->number;
    g->I[1] = x->literalstring_count;

    if (!x->amongvar_needed) {
        write_failure_if(g, "FindAmong~S0(a_~I0, ~I1) = 0", p);
    } else {
        writef(g, "~MAmongVar := FindAmong~S0(a_~I0, ~I1);~N", p);
        write_failure_if(g, "AmongVar = 0", p);
    }
}

/* Pascal 'among': do the lookup here unless x->substring is set, run the
 * optional starter command, then emit the command(s) attached to the
 * matched entries. */
static void generate_among(struct generator * g, struct node * p) {
    struct among * x = p->among;

    if (x->substring == 0) generate_substring(g, p);

    if (x->starter != 0) generate(g, x->starter);

    if (x->command_count == 1 && x->nocommand_count == 0) {
        /* Only one outcome ("no match" already handled).
*/
        generate(g, x->commands[0]);
    } else if (x->command_count > 0) {
        /* Several outcomes: emit a Case on AmongVar with one numbered arm
         * (1..command_count) per command. */
        int i;
        write_comment(g, p);
        w(g, "~MCase AmongVar Of~N~+");
        for (i = 1; i <= x->command_count; i++) {
            g->I[0] = i;
            w(g, "~M~I0:~N~{");
            generate(g, x->commands[i - 1]);
            w(g, "~}");
            /* Each arm starts out reachable whatever the previous arm did. */
            g->unreachable = false;
        }
        write_block_end(g);
    }
}

/* Boolean variable used as a command: emit a failure when it is false. */
static void generate_booltest(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->V[0] = p->name;
    write_failure_if(g, "Not (~V0)", p);
}

/* Pascal 'false': emit an unconditional failure. */
static void generate_false(struct generator * g, struct node * p) {
    write_comment(g, p);
    write_failure(g);
}

/* Emit a debug(<n>, <line>) call, where n is a counter incremented on every
 * use and line is the source line of the node. */
static void generate_debug(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->I[0] = g->debug_count++;
    g->I[1] = p->line_number;
    writef(g, "~Mdebug(~I0, ~I1);~N", p);
}

/* Generate Pascal for one syntax tree node by dispatching on p->type.
 *
 * Nothing is generated once g->unreachable is set.  The failure label and a
 * copy of the failure string are taken before the dispatch and reinstated
 * after it, so a generator may change them without affecting its siblings.
 * An unknown node type is reported on stderr and terminates the program. */
static void generate(struct generator * g, struct node * p) {
    int a0;
    struct str * a1;

    if (g->unreachable) return;

    a0 = g->failure_label;
    a1 = str_copy(g->failure_str);

    switch (p->type) {
        case c_define: generate_define(g, p); break;
        case c_bra: generate_bra(g, p); break;
        case c_and: generate_and(g, p); break;
        case c_or: generate_or(g, p); break;
        case c_backwards: generate_backwards(g, p); break;
        case c_not: generate_not(g, p); break;
        case c_set: generate_set(g, p); break;
        case c_unset: generate_unset(g, p); break;
        case c_try: generate_try(g, p); break;
        case c_fail: generate_fail(g, p); break;
        case c_reverse:
        case c_test: generate_test(g, p); break;
        case c_do: generate_do(g, p); break;
        case c_goto: generate_GO(g, p, 1); break;
        case c_gopast: generate_GO(g, p, 0); break;
        case c_repeat: generate_repeat(g, p); break;
        case c_loop: generate_loop(g, p); break;
        case c_atleast: generate_atleast(g, p); break;
        case c_setmark: generate_setmark(g, p); break;
        case c_tomark: generate_tomark(g, p); break;
        case c_atmark: generate_atmark(g, p); break;
        case c_hop: generate_hop(g, p); break;
        case c_delete: generate_delete(g, p); break;
        case c_next: generate_next(g, p); break;
        case c_tolimit: generate_tolimit(g, p); break;
        case c_atlimit: generate_atlimit(g,
p); break;
        case c_leftslice: generate_leftslice(g, p); break;
        case c_rightslice: generate_rightslice(g, p); break;
        case c_assignto: generate_assignto(g, p); break;
        case c_sliceto: generate_sliceto(g, p); break;
        case c_assign: generate_assignfrom(g, p); break;
        /* insert and attach share one generator, told apart by node type. */
        case c_insert:
        case c_attach: generate_insert(g, p, p->type); break;
        case c_slicefrom: generate_slicefrom(g, p); break;
        case c_setlimit: generate_setlimit(g, p); break;
        case c_dollar: generate_dollar(g, p); break;
        /* Integer assignments: the string is the Pascal operator to apply,
         * NULL meaning plain assignment. */
        case c_mathassign: generate_integer_assign(g, p, NULL); break;
        case c_plusassign: generate_integer_assign(g, p, "+"); break;
        case c_minusassign: generate_integer_assign(g, p, "-"); break;
        case c_multiplyassign:generate_integer_assign(g, p, "*"); break;
        case c_divideassign: generate_integer_assign(g, p, "div"); break;
        /* Integer comparisons, spelled with Pascal operators. */
        case c_eq: generate_integer_test(g, p, "="); break;
        case c_ne: generate_integer_test(g, p, "<>"); break;
        case c_gr: generate_integer_test(g, p, ">"); break;
        case c_ge: generate_integer_test(g, p, ">="); break;
        case c_ls: generate_integer_test(g, p, "<"); break;
        case c_le: generate_integer_test(g, p, "<="); break;
        case c_call: generate_call(g, p); break;
        case c_grouping: generate_grouping(g, p, false); break;
        case c_non: generate_grouping(g, p, true); break;
        case c_name: generate_namedstring(g, p); break;
        case c_literalstring: generate_literalstring(g, p); break;
        case c_among: generate_among(g, p); break;
        case c_substring: generate_substring(g, p); break;
        case c_booltest: generate_booltest(g, p); break;
        case c_false: generate_false(g, p); break;
        /* 'true' generates no code. */
        case c_true: break;
        case c_debug: generate_debug(g, p); break;
        default: fprintf(stderr, "%d encountered\n", p->type);
                 exit(1);
    }

    /* Reinstate the failure context that was current on entry. */
    g->failure_label = a0;
    str_delete(g->failure_str);
    g->failure_str = a1;
}

/* Class declaration generation.
*/

/* Emit the start comment and the unit header: "Unit <name>;", a
 * {$HINTS OFF} directive, and the Interface section's Uses clause naming
 * BASE_UNIT. */
static void generate_unit_start(struct generator * g) {
    write_start_comment(g, "{ ", " }");
    w(g, "Unit ~n;~N~N{$HINTS OFF}~N~NInterface~N~NUses " BASE_UNIT ";~N");
}

/* Emit the closing "End." of the unit. */
static void generate_unit_end(struct generator * g) {
    w(g, "~NEnd.~N");
}

/* Open the Type section with the class T<name>, derived from BASE_CLASS. */
static void generate_class_begin(struct generator * g) {
    w(g, "~NType~N~+~MT~n = Class(" BASE_CLASS ")~N");
}

/* Close the class declaration and start the Implementation section. */
static void generate_class_end(struct generator * g) {
    w(g, "~}~NImplementation~N");
}

/* Declare one routine as a parameterless Boolean function; externals are
 * additionally marked Override. */
static void generate_method_decl(struct generator * g, struct name * q) {
    g->V[0] = q;
    w(g, "~MFunction ~W0 : Boolean;");
    if (q->type == t_external) {
        w(g, " Override;");
    }
    w(g, "~N");
}

/* Emit the method declarations: a public section holding the constructor and
 * every external, then a private section holding every routine. */
static void generate_method_decls(struct generator * g) {
    struct name * q;

    w(g, "~Mpublic~N~+");
    w(g, "~MConstructor Create;~N");
    for (q = g->analyser->names; q; q = q->next) {
        if (q->type == t_external) {
            generate_method_decl(g, q);
        }
    }
    w(g, "~-");

    w(g, "~Mprivate~N~+");
    for (q = g->analyser->names; q; q = q->next) {
        if (q->type == t_routine) {
            generate_method_decl(g, q);
        }
    }
    w(g, "~-");
}

/* Declare a private field for every string, integer and boolean name; names
 * of other types produce nothing here. */
static void generate_member_decls(struct generator * g) {
    struct name * q;
    w(g, "~Mprivate~N~+");
    for (q = g->analyser->names; q; q = q->next) {
        g->V[0] = q;
        switch (q->type) {
            case t_string:
                w(g, "~M~W0 : AnsiString;~N");
                break;
            case t_integer:
                w(g, "~M~W0 : Integer;~N");
                break;
            case t_boolean:
                w(g, "~M~W0 : Boolean;~N");
                break;
        }
    }
    w(g, "~-");
}

/* Declare one private "a_<number> : Array Of TAmong" field per among. */
static void generate_among_decls(struct generator * g) {
    struct among *a = g->analyser->amongs;

    w(g, "~Mprivate~N~+");

    while (a != 0) {
        g->I[0] = a->number;
        w(g, "~Ma_~I0 : Array Of TAmong;~N");
        a = a->next;
    }

    w(g, "~-");
}

/* Emit the statements that size and fill the table a_<number> for among x:
 * a SetLength call, then the Str, Index, Result and Method fields of every
 * entry. */
static void generate_among_table(struct generator * g, struct among * x) {
    int i;
    struct amongvec * v = x->b;

    g->I[0] = x->number;
    g->I[1] = x->literalstring_count;

    w(g, "~MSetLength(a_~I0, ~I1);~N~+");

    for (i = 0; i < x->literalstring_count; i++, v++) {
        /* I[1] held the entry count for SetLength; from here on it is the
         * index of the entry being written. */
        g->I[1] = i;

        /* Write among's string. */
        g->L[0] = v->b;
        w(g, "~Ma_~I0[~I1].Str := ~L0;~N");

        /* Write among's index & result.
*/
        g->I[2] = v->i;
        w(g, "~Ma_~I0[~I1].Index := ~I2;~N");
        g->I[2] = v->result;
        w(g, "~Ma_~I0[~I1].Result := ~I2;~N");

        /* Write among's handler. */
        w(g, "~Ma_~I0[~I1].Method := ");
        if (v->function == 0) {
            w(g, "nil;~N~N");
        } else {
            g->V[0] = v->function;
            w(g, "Self.~W0;~N~N");
        }
    }
    w(g, "~-");
}

/* Emit the table initialisation for every among in the program. */
static void generate_amongs(struct generator * g) {
    struct among * a = g->analyser->amongs;
    while (a != 0) {
        generate_among_table(g, a);
        a = a->next;
    }
}

/* Emit the constructor T<name>.Create, whose body fills the among tables. */
static void generate_constructor(struct generator * g) {
    w(g, "~N~MConstructor T~n.Create;~N~{");
    generate_amongs(g);
    w(g, "~}");
}

/* Generate code for each top-level node of the program, following the
 * 'right' links. */
static void generate_methods(struct generator * g) {
    struct node * p = g->analyser->program;
    while (p != 0) {
        generate(g, p);
        p = p->right;
    }
}

/* Set bit i of the bitmap b, stored eight bits per element. */
static void set_bit(symbol * b, int i) { b[i/8] |= 1 << i%8; }

/* Emit grouping q as a Pascal constant array of Char holding a bitmap: bit n
 * is set when the character with code q->smallest_ch + n belongs to the
 * grouping.  The array is written as a list of Chr(<value>) elements. */
static void generate_grouping_table(struct generator * g, struct grouping * q) {
    int range = q->largest_ch - q->smallest_ch + 1;
    int size = (range + 7)/ 8;  /* assume 8 bits per symbol */
    symbol * b = q->b;
    symbol * map = create_b(size);
    int i;
    for (i = 0; i < size; i++) map[i] = 0;

    /* Using unicode would require revision here */

    for (i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch);

    g->V[0] = q->name;
    g->I[0] = size - 1;
    w(g, "~N~MConst~+~N~M~W0 : Array [0..~I0] Of Char = (~N~+");
    for (i = 0; i < size; i++) {
        if (i != 0) w(g, ",~N");
        g->I[0] = map[i];
        w(g, "~MChr(~I0)");
    }
    w(g, "~N~-~M);~N~-");

    lose_b(map);
}

/* Emit a table for every grouping whose name is marked as used. */
static void generate_groupings(struct generator * g) {
    struct grouping * q;
    for (q = g->analyser->groupings; q; q = q->next) {
        if (q->name->used) generate_grouping_table(g, q);
    }
}

/* Entry point of the Pascal backend: build the whole unit in a fresh output
 * buffer, write it to g->options->output_src and free the buffers. */
extern void generate_program_pascal(struct generator * g) {
    g->outbuf = str_new();
    g->failure_str = str_new();

    generate_unit_start(g);

    /* Generate class declaration. */
    generate_class_begin(g);
    generate_member_decls(g);
    generate_among_decls(g);
    generate_method_decls(g);
    generate_class_end(g);

    /* generate implementation.
*/ generate_groupings(g); generate_constructor(g); generate_methods(g); generate_unit_end(g); output_str(g->options->output_src, g->outbuf); str_delete(g->failure_str); str_delete(g->outbuf); } snowball-2.2.0/compiler/generator_python.c000066400000000000000000001124521414263061200207050ustar00rootroot00000000000000 #include /* for exit */ #include /* for strlen */ #include /* for fprintf etc */ #include "header.h" /* prototypes */ static void generate(struct generator * g, struct node * p); static void w(struct generator * g, const char * s); static void writef(struct generator * g, const char * s, struct node * p); static int new_label(struct generator * g) { int next_label = g->next_label++; g->max_label = (next_label > g->max_label) ? next_label : g->max_label; return next_label; } static struct str * vars_newname(struct generator * g) { struct str * output; g->var_number++; output = str_new(); str_append_string(output, "v_"); str_append_int(output, g->var_number); return output; } /* Write routines for items from the syntax tree */ static void write_varname(struct generator * g, struct name * p) { switch (p->type) { case t_external: write_char(g, '_'); break; case t_routine: write_string(g, "__"); /* FALLTHRU */ default: { int ch = "SBIrxg"[p->type]; write_char(g, ch); write_char(g, '_'); break; } } write_b(g, p->b); } static void write_varref(struct generator * g, struct name * p) { write_string(g, "self."); write_varname(g, p); } static void write_hexdigit(struct generator * g, int n) { write_char(g, n < 10 ? 
n + '0' : n - 10 + 'A'); } static void write_hex(struct generator * g, int ch) { write_string(g, "\\u"); { int i; for (i = 12; i >= 0; i -= 4) write_hexdigit(g, ch >> i & 0xf); } } static void write_literal_string(struct generator * g, symbol * p) { int i; write_string(g, "u\""); for (i = 0; i < SIZE(p); i++) { int ch = p[i]; if (32 <= ch && ch < 127) { if (ch == '\"' || ch == '\\') write_string(g, "\\"); write_char(g, ch); } else { write_hex(g, ch); } } write_string(g, "\""); } static void write_margin(struct generator * g) { int i; for (i = 0; i < g->margin; i++) write_string(g, " "); } static void write_comment(struct generator * g, struct node * p) { if (!g->options->comments) return; write_margin(g); write_string(g, "# "); write_comment_content(g, p); write_newline(g); } static void write_block_start(struct generator * g) { w(g, "~+~N"); } static void write_block_end(struct generator * g) /* block end */ { w(g, "~-"); } static void write_savecursor(struct generator * g, struct node * p, struct str * savevar) { g->B[0] = str_data(savevar); g->S[1] = ""; if (p->mode != m_forward) g->S[1] = "self.limit - "; writef(g, "~M~B0 = ~S1self.cursor~N", p); } static void restore_string(struct node * p, struct str * out, struct str * savevar) { str_clear(out); str_append_string(out, "self.cursor = "); if (p->mode != m_forward) str_append_string(out, "self.limit - "); str_append(out, savevar); } static void write_restorecursor(struct generator * g, struct node * p, struct str * savevar) { struct str * temp = str_new(); write_margin(g); restore_string(p, temp, savevar); write_str(g, temp); write_newline(g); str_delete(temp); } static void write_inc_cursor(struct generator * g, struct node * p) { write_margin(g); write_string(g, p->mode == m_forward ? 
"self.cursor += 1" : "self.cursor -= 1"); write_newline(g); } static void wsetlab_begin(struct generator * g) { w(g, "~Mtry:~N~+"); } static void wsetlab_end(struct generator * g, int n) { g->I[0] = n; w(g, "~-~Mexcept lab~I0: pass~N"); } static void wgotol(struct generator * g, int n) { g->I[0] = n; w(g, "~Mraise lab~I0()~N"); } static void write_failure(struct generator * g) { if (str_len(g->failure_str) != 0) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } switch (g->failure_label) { case x_return: w(g, "~Mreturn False~N"); g->unreachable = true; break; default: g->I[0] = g->failure_label; w(g, "~Mraise lab~I0()~N"); g->unreachable = true; } } static void write_failure_if(struct generator * g, char * s, struct node * p) { writef(g, "~Mif ", p); writef(g, s, p); writef(g, ":", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { if (p->mode == m_forward) { write_failure_if(g, "self.cursor >= self.limit", p); } else { write_failure_if(g, "self.cursor <= self.limit_backward", p); } } /* Formatted write. 
*/
/* Copies input to the output, expanding escapes introduced by '~':
 *
 *   ~C       comment for node p
 *   ~f       an indented block containing the failure action
 *   ~M       margin (indentation)
 *   ~N       newline
 *   ~{ ~}    start / end of an indented block
 *   ~+ ~-    increase / decrease the margin
 *   ~n       g->options->name
 *   ~S<d> ~B<d> ~I<d> ~L<d>   entry <d> of g->S, g->B, g->I, g->L, written
 *            as a C string, a b-string, an integer and a Python string
 *            literal respectively
 *   ~V<d> ~W<d>   g->V[<d>] written as "self.<name>" / as the bare name
 *
 * <d> is a single digit character; it is not range checked.  Any other
 * character after '~' is written as itself. */
static void writef(struct generator * g, const char * input, struct node * p) {
    int i = 0;
    int l = strlen(input);

    while (i < l) {
        int ch = input[i++];
        if (ch != '~') {
            write_char(g, ch);
            continue;
        }
        switch (input[i++]) {
            default: write_char(g, input[i - 1]); continue;
            case 'C': write_comment(g, p); continue;
            case 'f': write_block_start(g);
                      write_failure(g);
                      g->unreachable = false;
                      write_block_end(g);
                      continue;
            case 'M': write_margin(g); continue;
            case 'N': write_newline(g); continue;
            case '{': write_block_start(g); continue;
            case '}': write_block_end(g); continue;
            case 'S': write_string(g, g->S[input[i++] - '0']); continue;
            case 'B': write_b(g, g->B[input[i++] - '0']); continue;
            case 'I': write_int(g, g->I[input[i++] - '0']); continue;
            case 'V': write_varref(g, g->V[input[i++] - '0']); continue;
            case 'W': write_varname(g, g->V[input[i++] - '0']); continue;
            case 'L': write_literal_string(g, g->L[input[i++] - '0']); continue;
            case '+': g->margin++; continue;
            case '-': g->margin--; continue;
            case 'n': write_string(g, g->options->name); continue;
        }
    }
}

/* writef() without a node; ~C must not be used through this wrapper's
 * callers unless write_comment() tolerates a null node - TODO confirm. */
static void w(struct generator * g, const char * s) {
    writef(g, s, 0);
}

/* Write arithmetic expression p as a Python expression.  Binary +, - and *
 * are fully parenthesised; node types not listed in the switch produce no
 * output. */
static void generate_AE(struct generator * g, struct node * p) {
    const char * s;
    switch (p->type) {
        case c_name:
            write_varref(g, p->name); break;
        case c_number:
            write_int(g, p->number); break;
        case c_maxint:
            write_string(g, "sys.maxsize"); break;
        case c_minint:
            write_string(g, "(~sys.maxsize)"); break;
        case c_neg:
            write_char(g, '-'); generate_AE(g, p->right); break;
        /* The three binary operators share the code at label0 and differ
         * only in the operator text held in s. */
        case c_multiply:
            s = " * "; goto label0;
        case c_plus:
            s = " + "; goto label0;
        case c_minus:
            s = " - ";
        label0:
            write_char(g, '('); generate_AE(g, p->left);
            write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break;
        case c_divide:
            /* Snowball specifies integer division with semantics matching C,
             * so Python's `/` or `//` isn't suitable (`//` would be in cases
             * where we knew that the arguments had the same sign).
             *
             * The `float(`...`)` is needed for Python2.
*/
            write_string(g, "int(float(");
            generate_AE(g, p->left);
            write_string(g, ") / ");
            generate_AE(g, p->right);
            write_char(g, ')');
            break;
        case c_cursor:
            w(g, "self.cursor"); break;
        case c_limit:
            w(g, p->mode == m_forward ? "self.limit" : "self.limit_backward"); break;
        case c_lenof: /* Same as sizeof() for Python. */
        case c_sizeof:
            g->V[0] = p->name;
            w(g, "len(~V0)");
            break;
        case c_len: /* Same as size() for Python. */
        case c_size:
            w(g, "len(self.current)");
            break;
    }
}

/* Bracketed command list: generate each child of p in order. */
static void generate_bra(struct generator * g, struct node * p) {
    write_comment(g, p);
    p = p->left;
    while (p) {
        generate(g, p);
        p = p->right;
    }
}

/* Python 'and': generate the children in sequence, restoring the saved cursor
 * between consecutive children (not after the last) when K_needed() asks for
 * it.  Stops as soon as the generated code becomes unreachable. */
static void generate_and(struct generator * g, struct node * p) {
    struct str * savevar = vars_newname(g);
    int keep_c = K_needed(g, p->left);

    write_comment(g, p);

    if (keep_c) write_savecursor(g, p, savevar);

    p = p->left;
    while (p) {
        generate(g, p);
        if (g->unreachable) break;
        if (keep_c && p->right != 0) write_restorecursor(g, p, savevar);
        p = p->right;
    }
    str_delete(savevar);
}

/* Python 'or': try each alternative in turn.
 *
 * The whole construct is a region labelled out_lab.  Every alternative except
 * the last is a nested region with its own failure label: when it can
 * complete, the emitted code jumps to out_lab; when it fails, control lands
 * after its region, the cursor is restored if needed, and the next
 * alternative runs.  The last alternative is generated with the caller's
 * failure label and string, so its failure is the failure of the 'or'.
 *
 * The code after the 'or' is reachable if any earlier alternative could jump
 * to out_lab, or if the last alternative leaves it reachable. */
static void generate_or(struct generator * g, struct node * p) {
    struct str * savevar = vars_newname(g);
    int keep_c = K_needed(g, p->left);

    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);

    int out_lab = new_label(g);
    int end_unreachable = true;

    write_comment(g, p);
    wsetlab_begin(g);

    if (keep_c) write_savecursor(g, p, savevar);

    p = p->left;
    str_clear(g->failure_str);

    if (p == 0) {
        /* p should never be 0 after an or: there should be at least two
         * sub nodes.
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right != 0) { int label = new_label(g); g->failure_label = label; wsetlab_begin(g); generate(g, p); if (!g->unreachable) { wgotol(g, out_lab); end_unreachable = false; } wsetlab_end(g, label); g->unreachable = false; if (keep_c) write_restorecursor(g, p, savevar); p = p->right; } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); wsetlab_end(g, out_lab); if (!end_unreachable) { g->unreachable = false; } str_delete(savevar); } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mself.limit_backward = self.cursor~N" "~Mself.cursor = self.limit~N", p); generate(g, p->left); w(g, "~Mself.cursor = self.limit_backward~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); write_comment(g, p); if (keep_c) { write_savecursor(g, p, savevar); } wsetlab_begin(g); generate(g, p->left); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; if (!g->unreachable) write_failure(g); wsetlab_end(g, label); g->unreachable = false; if (keep_c) write_restorecursor(g, p, savevar); str_delete(savevar); } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); write_comment(g, p); if (keep_c) { write_savecursor(g, p, savevar); restore_string(p, g->failure_str, savevar); } wsetlab_begin(g); generate(g, p->left); wsetlab_end(g, label); g->unreachable = false; str_delete(savevar); } static void generate_set(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, 
"~M~V0 = True~N", p); } static void generate_unset(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = False~N", p); } static void generate_fail(struct generator * g, struct node * p) { write_comment(g, p); generate(g, p->left); if (!g->unreachable) write_failure(g); } /* generate_test() also implements 'reverse' */ static void generate_test(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) { write_savecursor(g, p, savevar); } generate(g, p->left); if (!g->unreachable) { if (keep_c) { write_restorecursor(g, p, savevar); } } str_delete(savevar); } static void generate_do(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) write_savecursor(g, p, savevar); if (p->left->type == c_call) { /* Optimise do */ write_comment(g, p->left); g->V[0] = p->left->name; w(g, "~M~V0()~N"); } else { int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); wsetlab_begin(g); generate(g, p->left); wsetlab_end(g, label); g->unreachable = false; } if (keep_c) write_restorecursor(g, p, savevar); str_delete(savevar); } static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) { struct grouping * q = p->name->grouping; write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "in" : "out"; g->S[2] = g->options->encoding == ENC_UTF8 ? 
"_U" : ""; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; write_failure_if(g, "not self.go_~S1_grouping~S0~S2(~n.~W0, ~I0, ~I1)", p); if (!is_goto) { if (p->mode == m_forward) w(g, "~Mself.cursor += 1~N"); else w(g, "~Mself.cursor -= 1~N"); } } static void generate_GO(struct generator * g, struct node * p, int style) { int end_unreachable; struct str * savevar; int keep_c; int a0; struct str * a1; int golab; int label; if (p->left->type == c_grouping || p->left->type == c_non) { /* Special case for "goto" or "gopast" when used on a grouping or an * inverted grouping - the movement of c by the matching action is * exactly what we want! */ #ifdef OPTIMISATION_WARNINGS printf("Optimising %s %s\n", style ? "goto" : "gopast", p->left->type == c_non ? "non" : "grouping"); #endif write_comment(g, p); generate_GO_grouping(g, p->left, style, p->left->type == c_non); return; } end_unreachable = false; savevar = vars_newname(g); keep_c = style == 1 || repeat_restore(g, p->left); a0 = g->failure_label; a1 = str_copy(g->failure_str); golab = new_label(g); write_comment(g, p); w(g, "~Mtry:~N~+" "~Mwhile True:~N~+"); if (keep_c) write_savecursor(g, p, savevar); label = new_label(g); g->failure_label = label; str_clear(g->failure_str); wsetlab_begin(g); generate(g, p->left); if (g->unreachable) { /* Cannot break out of this loop: therefore the code after the * end of the loop is unreachable.*/ end_unreachable = true; } else { /* include for goto; omit for gopast */ if (style == 1) write_restorecursor(g, p, savevar); g->I[0] = golab; w(g, "~Mraise lab~I0()~N"); } g->unreachable = false; wsetlab_end(g, label); if (keep_c) write_restorecursor(g, p, savevar); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; write_check_limit(g, p); write_inc_cursor(g, p); w(g, "~-~-"); g->I[0] = golab; w(g, "~Mexcept lab~I0: pass~N"); str_delete(savevar); g->unreachable = end_unreachable; } static void generate_loop(struct generator * g, struct node * p) { 
struct str * loopvar = vars_newname(g);
    write_comment(g, p);
    g->B[0] = str_data(loopvar);
    w(g, "~Mfor ~B0 in range (");
    generate_AE(g, p->AE);
    g->B[0] = str_data(loopvar);
    writef(g, ", 0, -1):~N", p);
    writef(g, "~{", p);

    generate(g, p->left);

    w(g, "~}");
    str_delete(loopvar);
    g->unreachable = false;
}

/* Shared emitter for "repeat" and "atleast".
 *
 * Emits a `while True:` loop that attempts p->left.  When the attempt can
 * complete, the generated code decrements loopvar (only if loopvar is
 * non-NULL) and `continue`s; otherwise it restores the cursor (when
 * repeat_restore() asked for it to be saved) and `break`s.
 *
 * g->failure_label is replaced with a fresh label and g->failure_str is
 * cleared here; neither is restored by this function.
 */
static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) {
    struct str * savevar = vars_newname(g);
    int keep_c = repeat_restore(g, p->left);
    int label = new_label(g);
    g->failure_label = label;
    writef(g, "~Mwhile True:~N~+", p);

    if (keep_c) write_savecursor(g, p, savevar);

    str_clear(g->failure_str);
    wsetlab_begin(g);
    generate(g, p->left);

    if (!g->unreachable) {
        if (loopvar != 0) {
            g->B[0] = str_data(loopvar);
            w(g, "~M~B0 -= 1~N");
        }

        w(g, "~Mcontinue~N");
    }

    wsetlab_end(g, label);
    g->unreachable = false;

    if (keep_c) write_restorecursor(g, p, savevar);

    w(g, "~Mbreak~N~}");
    str_delete(savevar);
}

/* Emit Python for "repeat": the shared loop with no iteration counter. */
static void generate_repeat(struct generator * g, struct node * p) {
    write_comment(g, p);
    generate_repeat_or_atleast(g, p, NULL);
}

/* Emit Python for "atleast": initialise a counter variable to the AE, run
 * the shared repeat loop (which decrements it per successful pass), then
 * signal failure if the counter is still above zero.  The failure context is
 * saved around the shared loop because that helper overwrites it. */
static void generate_atleast(struct generator * g, struct node * p) {
    struct str * loopvar = vars_newname(g);

    write_comment(g, p);
    g->B[0] = str_data(loopvar);
    w(g, "~M~B0 = ");
    generate_AE(g, p->AE);
    w(g, "~N");
    {
        int a0 = g->failure_label;
        struct str * a1 = str_copy(g->failure_str);

        generate_repeat_or_atleast(g, p, loopvar);

        g->failure_label = a0;
        str_delete(g->failure_str);
        g->failure_str = a1;
    }
    /* Reload B[0]: the nested generation may have reused the slot. */
    g->B[0] = str_data(loopvar);
    write_failure_if(g, "~B0 > 0", p);
    str_delete(loopvar);
}

/* Emit Python for "setmark": store the current cursor in the named variable. */
static void generate_setmark(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->V[0] = p->name;
    writef(g, "~M~V0 = self.cursor~N", p);
}

/* Emit Python for "tomark": fail if the cursor is already past the AE (">"
 * when forward, "<" when backward), otherwise assign the AE to the cursor. */
static void generate_tomark(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->S[0] = p->mode == m_forward ?
">" : "<"; w(g, "~Mif self.cursor ~S0 "); generate_AE(g, p->AE); w(g, ":"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; w(g, "~Mself.cursor = "); generate_AE(g, p->AE); writef(g, "~N", p); } static void generate_atmark(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mif self.cursor != "); generate_AE(g, p->AE); writef(g, ":", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } static void generate_hop(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "+" : "-"; w(g, "~Mc = self.cursor ~S0 "); generate_AE(g, p->AE); w(g, "~N"); g->S[1] = p->mode == m_forward ? "> self.limit" : "< self.limit_backward"; g->S[2] = p->mode == m_forward ? "<" : ">"; if (p->AE->type == c_number) { // Constant distance hop. // // No need to check for negative hop as that's converted to false by // the analyser. write_failure_if(g, "c ~S1", p); } else { write_failure_if(g, "c ~S1 or c ~S2 self.cursor", p); } writef(g, "~Mself.cursor = c~N", p); } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mif not self.slice_del():~N" "~+~Mreturn False~N~-" "~N", p); } static void generate_next(struct generator * g, struct node * p) { write_comment(g, p); write_check_limit(g, p); write_inc_cursor(g, p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "self.limit" : "self.limit_backward"; writef(g, "~Mself.cursor = ~S0~N", p); } static void generate_atlimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "self.limit" : "self.limit_backward"; g->S[1] = p->mode == m_forward ? "<" : ">"; write_failure_if(g, "self.cursor ~S1 ~S0", p); } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"self.bra" : "self.ket";
    writef(g, "~M~S0 = self.cursor~N", p);
}

/* Emit Python for "]": record the cursor in self.ket (forward) or self.bra
 * (backward). */
static void generate_rightslice(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->S[0] = p->mode == m_forward ? "self.ket" : "self.bra";
    writef(g, "~M~S0 = self.cursor~N", p);
}

/* Emit Python for "=>": assign the result of self.assign_to() to the named
 * variable. */
static void generate_assignto(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->V[0] = p->name;
    writef(g, "~M~V0 = self.assign_to()~N", p);
}

/* Emit Python for "->": assign the result of self.slice_to() to the named
 * variable and return False from the generated routine if it is empty. */
static void generate_sliceto(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->V[0] = p->name;
    writef(g, "~M~V0 = self.slice_to()~N"
              "~Mif ~V0 == '':~N"
              "~+~Mreturn False~N~-", p);
}

/* Write the string operand of p: its literal string when one is attached,
 * otherwise a reference to the named variable. */
static void generate_address(struct generator * g, struct node * p) {
    symbol * b = p->literalstring;
    if (b != 0) {
        write_literal_string(g, b);
    } else {
        write_varref(g, p->name);
    }
}

/* Emit Python for "insert" / "attach" (style is the node type).
 *
 * The generated code calls self.insert() with an empty range at the cursor.
 * The cursor is saved in `c` and restored afterwards for "attach" in forward
 * mode and for "insert" in backward mode (keep_c is flipped when backward).
 */
static void generate_insert(struct generator * g, struct node * p, int style) {
    int keep_c = style == c_attach;
    write_comment(g, p);
    if (p->mode == m_backward) keep_c = !keep_c;
    if (keep_c) w(g, "~Mc = self.cursor~N");
    writef(g, "~Mself.insert(self.cursor, self.cursor, ", p);
    generate_address(g, p);
    writef(g, ")~N", p);
    if (keep_c) w(g, "~Mself.cursor = c~N");
}

/* Emit Python for "<+": replace the text between the cursor and the limit
 * (forward) or between limit_backward and the cursor (backward) with the
 * string operand.  In forward mode the cursor is saved and restored. */
static void generate_assignfrom(struct generator * g, struct node * p) {
    int keep_c = p->mode == m_forward; /* like 'attach' */

    write_comment(g, p);
    if (keep_c) writef(g, "~Mc = self.cursor~N", p);
    if (p->mode == m_forward) {
        writef(g, "~Mself.insert(self.cursor, self.limit, ", p);
    } else {
        writef(g, "~Mself.insert(self.limit_backward, self.cursor, ", p);
    }
    generate_address(g, p);
    writef(g, ")~N", p);
    if (keep_c) w(g, "~Mself.cursor = c~N");
}

/* Emit Python for "<-": call self.slice_from() with the string operand and
 * return False from the generated routine if it reports failure. */
static void generate_slicefrom(struct generator * g, struct node * p) {
    write_comment(g, p);
    w(g, "~Mif not self.slice_from(");
    generate_address(g, p);
    writef(g, "):~N"
              "~+~Mreturn False~N~-", p);
}

/* Emit Python for "setlimit C1 for C2" (p->left is C1, p->aux is C2).
 *
 * The generated code narrows self.limit (forward) or self.limit_backward
 * (backward) to the position C1 reaches, runs C2, and then undoes the
 * narrowing.  The undo statement is also installed as g->failure_str so that
 * it is written on C2's failure paths.  Two fresh variable names are always
 * reserved, even when the special case below needs only one.
 */
static void generate_setlimit(struct generator * g, struct node * p) {
    struct str * savevar = vars_newname(g);
    struct str * varname = vars_newname(g);
    write_comment(g, p);
    if (p->left &&
p->left->type == c_tomark) {
        /* Special case for:
         *
         *   setlimit tomark AE for C
         *
         * All uses of setlimit in the current stemmers we ship follow this
         * pattern, and by special-casing we can avoid having to save and
         * restore c.
         */
        struct node * q = p->left;

        g->S[0] = q->mode == m_forward ? ">" : "<";
        w(g, "~Mif self.cursor ~S0 ");
        generate_AE(g, q->AE);
        w(g, ":");
        write_block_start(g);
        write_failure(g);
        write_block_end(g);
        g->unreachable = false;

        g->B[0] = str_data(varname);
        /* NOTE(review): in forward mode the saved value is
         * `self.limit - self.cursor` taken at the *current* cursor, while the
         * new limit is the AE; the general branch below takes the same
         * difference after the cursor has already been moved.  Confirm that
         * `self.limit += <var>` restores the original limit here. */
        if (p->mode == m_forward) {
            w(g, "~M~B0 = self.limit - self.cursor~N");
            w(g, "~Mself.limit = ");
        } else {
            w(g, "~M~B0 = self.limit_backward~N");
            w(g, "~Mself.limit_backward = ");
        }
        generate_AE(g, q->AE);
        writef(g, "~N", q);

        /* The statement that undoes the narrowing doubles as failure_str. */
        if (p->mode == m_forward) {
            str_assign(g->failure_str, "self.limit += ");
            str_append(g->failure_str, varname);
        } else {
            str_assign(g->failure_str, "self.limit_backward = ");
            str_append(g->failure_str, varname);
        }
    } else {
        /* General case: run C1, take the cursor it reaches as the new limit,
         * then put the cursor back. */
        write_savecursor(g, p, savevar);
        generate(g, p->left);

        if (!g->unreachable) {
            g->B[0] = str_data(varname);
            if (p->mode == m_forward) {
                w(g, "~M~B0 = self.limit - self.cursor~N");
                w(g, "~Mself.limit = self.cursor~N");
            } else {
                w(g, "~M~B0 = self.limit_backward~N");
                w(g, "~Mself.limit_backward = self.cursor~N");
            }
            write_restorecursor(g, p, savevar);

            if (p->mode == m_forward) {
                str_assign(g->failure_str, "self.limit += ");
                str_append(g->failure_str, varname);
            } else {
                str_assign(g->failure_str, "self.limit_backward = ");
                str_append(g->failure_str, varname);
            }
        }
    }

    if (!g->unreachable) {
        generate(g, p->aux);

        if (!g->unreachable) {
            /* Success path: write the same undo statement explicitly. */
            write_margin(g);
            write_str(g, g->failure_str);
            write_newline(g);
        }
    }
    str_delete(varname);
    str_delete(savevar);
}

/* dollar sets snowball up to operate on a string variable as if it were the
 * current string */
/* The generated code snapshots the stemmer state into a fresh BaseStemmer(),
 * makes the named variable the current string with cursor 0 and limit
 * len(self.current), runs p->left, then writes the result back to the
 * variable and restores the snapshot.  That write-back-and-restore text is
 * built into g->failure_str so failure paths emit it as well. */
static void generate_dollar(struct generator * g, struct node * p) {
    struct str * savevar = vars_newname(g);
    g->B[0] = str_data(savevar);
    write_comment(g, p);
    writef(g, "~M~B0 = BaseStemmer()~N"
              "~M~B0.copy_from(self)~N", p);

    {
struct str * saved_output = g->outbuf;
        str_clear(g->failure_str);
        /* Temporarily redirect output so writef() builds failure_str. */
        g->outbuf = g->failure_str;
        g->V[0] = p->name;
        writef(g, "~V0 = self.current; ", p);
        /* For Python 3, this can just be: super().copy_from(~B0) */
        writef(g, "super(~n, self).copy_from(~B0)", p);
        g->failure_str = g->outbuf;
        g->outbuf = saved_output;
    }

    writef(g, "~Mself.current = ~V0~N"
              "~Mself.cursor = 0~N"
              "~Mself.limit = len(self.current)~N", p);
    generate(g, p->left);
    if (!g->unreachable) {
        write_margin(g);
        write_str(g, g->failure_str);
        write_newline(g);
    }
    str_delete(savevar);
}

/* Emit `<variable> <s> <AE>` on one line, where s is the Python assignment
 * operator text supplied by the caller (e.g. "=", "+="). */
static void generate_integer_assign(struct generator * g, struct node * p, char * s) {
    g->V[0] = p->name;
    g->S[0] = s;
    w(g, "~M~V0 ~S0 ");
    generate_AE(g, p->AE);
    w(g, "~N");
}

/* Emit `if not <left AE> <s> <right AE>:` followed by the failure action,
 * where s is the comparison operator text supplied by the caller. */
static void generate_integer_test(struct generator * g, struct node * p, char * s) {
    w(g, "~Mif not ");
    generate_AE(g, p->left);
    write_char(g, ' ');
    write_string(g, s);
    write_char(g, ' ');
    generate_AE(g, p->AE);
    w(g, ":");
    write_block_start(g);
    write_failure(g);
    write_block_end(g);
    g->unreachable = false;
}

/* Emit a call of the named routine; fail if it returns a false value. */
static void generate_call(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->V[0] = p->name;
    write_failure_if(g, "not ~V0()", p);
}

/* Emit a grouping test: in_grouping / out_grouping (chosen by complement),
 * with a "_b" suffix in backward mode, passing the grouping table and its
 * smallest and largest character values.  Fails if the call returns false. */
static void generate_grouping(struct generator * g, struct node * p, int complement) {
    struct grouping * q = p->name->grouping;
    g->S[0] = p->mode == m_forward ? "" : "_b";
    g->S[1] = complement ? "out" : "in";
    g->V[0] = p->name;
    g->I[0] = q->smallest_ch;
    g->I[1] = q->largest_ch;
    write_failure_if(g, "not self.~S1_grouping~S0(~n.~W0, ~I0, ~I1)", p);
}

/* Emit a match of the cursor text against a string variable via eq_s
 * (eq_s_b in backward mode); fail if it does not match. */
static void generate_namedstring(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->S[0] = p->mode == m_forward ? "" : "_b";
    g->V[0] = p->name;
    write_failure_if(g, "not self.eq_s~S0(~V0)", p);
}

/* Emit a match of the cursor text against a literal string via eq_s
 * (eq_s_b in backward mode); fail if it does not match. */
static void generate_literalstring(struct generator * g, struct node * p) {
    symbol * b = p->literalstring;
    write_comment(g, p);
    g->S[0] = p->mode == m_forward ?
"" : "_b";
    g->L[0] = b;
    write_failure_if(g, "not self.eq_s~S0(~L0)", p);
}

/* Emit one Python method for a routine definition.
 *
 * The `def` line goes to the current output buffer; the body is generated
 * into a temporary buffer which is then appended and discarded.  Label and
 * variable numbering restart at zero for each routine, failure means
 * "return" (x_return) with no pending failure string, and `return True` is
 * appended when the end of the body is reachable.
 */
static void generate_define(struct generator * g, struct node * p) {
    struct name * q = p->name;

    struct str * saved_output = g->outbuf;

    g->V[0] = q;
    w(g, "~N~Mdef ~W0(self):~+~N");

    g->outbuf = str_new();

    g->next_label = 0;
    g->var_number = 0;

    str_clear(g->failure_str);
    g->failure_label = x_return;
    g->unreachable = false;
    generate(g, p->left);
    if (!g->unreachable) w(g, "~Mreturn True~N");
    w(g, "~-");

    str_append(saved_output, g->outbuf);
    str_delete(g->outbuf);
    g->outbuf = saved_output;
}

/* Emit the find_among / find_among_b lookup for an among table.  When the
 * among needs its result, the value is stored in `among_var` first; either
 * way a result of 0 is treated as failure. */
static void generate_substring(struct generator * g, struct node * p) {
    struct among * x = p->among;

    write_comment(g, p);

    g->S[0] = p->mode == m_forward ? "" : "_b";
    g->I[0] = x->number;

    if (!x->amongvar_needed) {
        write_failure_if(g, "self.find_among~S0(~n.a_~I0) == 0", p);
    } else {
        writef(g, "~Mamong_var = self.find_among~S0(~n.a_~I0)~N", p);
        write_failure_if(g, "among_var == 0", p);
    }
}

/* Emit Python for "among".
 *
 * The lookup itself is emitted here unless x->substring is set.  After the
 * optional starter command, a single command with no command-less entries is
 * emitted unconditionally; otherwise an if/elif chain on `among_var` is
 * emitted, ending in a plain `else:` when every entry has a command.
 */
static void generate_among(struct generator * g, struct node * p) {
    struct among * x = p->among;

    if (x->substring == 0) generate_substring(g, p);

    if (x->starter != 0) generate(g, x->starter);

    if (x->command_count == 1 && x->nocommand_count == 0) {
        /* Only one outcome ("no match" already handled). */
        generate(g, x->commands[0]);
    } else if (x->command_count > 0) {
        int i;
        for (i = 1; i <= x->command_count; i++) {
            if (i == x->command_count && x->nocommand_count == 0) {
                w(g, "~Melse:~N~+");
            } else {
                g->I[0] = i;
                w(g, (i > 1 ?
"~Melif" : "~Mif"));
                w(g, " among_var == ~I0:~N~+");
            }
            generate(g, x->commands[i - 1]);
            w(g, "~-");
            /* Each branch starts out reachable regardless of the last one. */
            g->unreachable = false;
        }
    }
}

/* Emit a test of a boolean variable; fail when it is false. */
static void generate_booltest(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->V[0] = p->name;
    write_failure_if(g, "not ~V0", p);
}

/* Emit Python for "false": an unconditional failure. */
static void generate_false(struct generator * g, struct node * p) {
    write_comment(g, p);
    write_failure(g);
}

/* Emit a self.debug(<n>, <line>) call, where <n> is a running counter kept
 * in g->debug_count and <line> is the node's source line number. */
static void generate_debug(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->I[0] = g->debug_count++;
    g->I[1] = p->line_number;
    writef(g, "~Mself.debug(~I0, ~I1)~N", p);
}

/* Generate Python for node p by dispatching on p->type.
 *
 * Nothing is generated while g->unreachable is set.  g->failure_label and
 * g->failure_str are snapshotted before the dispatch and reinstated after
 * it, so individual emitters may overwrite them freely.  An unrecognised
 * node type is reported on stderr and terminates the program.
 */
static void generate(struct generator * g, struct node * p) {
    int a0;
    struct str * a1;

    if (g->unreachable) return;

    a0 = g->failure_label;
    a1 = str_copy(g->failure_str);

    switch (p->type) {
        case c_define:        generate_define(g, p); break;
        case c_bra:           generate_bra(g, p); break;
        case c_and:           generate_and(g, p); break;
        case c_or:            generate_or(g, p); break;
        case c_backwards:     generate_backwards(g, p); break;
        case c_not:           generate_not(g, p); break;
        case c_set:           generate_set(g, p); break;
        case c_unset:         generate_unset(g, p); break;
        case c_try:           generate_try(g, p); break;
        case c_fail:          generate_fail(g, p); break;
        case c_reverse:
        case c_test:          generate_test(g, p); break;
        case c_do:            generate_do(g, p); break;
        case c_goto:          generate_GO(g, p, 1); break;
        case c_gopast:        generate_GO(g, p, 0); break;
        case c_repeat:        generate_repeat(g, p); break;
        case c_loop:          generate_loop(g, p); break;
        case c_atleast:       generate_atleast(g, p); break;
        case c_setmark:       generate_setmark(g, p); break;
        case c_tomark:        generate_tomark(g, p); break;
        case c_atmark:        generate_atmark(g, p); break;
        case c_hop:           generate_hop(g, p); break;
        case c_delete:        generate_delete(g, p); break;
        case c_next:          generate_next(g, p); break;
        case c_tolimit:       generate_tolimit(g, p); break;
        case c_atlimit:       generate_atlimit(g, p); break;
        case c_leftslice:     generate_leftslice(g, p); break;
        case c_rightslice:    generate_rightslice(g, p); break;
        case c_assignto:      generate_assignto(g, p); break;
case c_sliceto:       generate_sliceto(g, p); break;
        case c_assign:        generate_assignfrom(g, p); break;
        case c_insert:
        case c_attach:        generate_insert(g, p, p->type); break;
        case c_slicefrom:     generate_slicefrom(g, p); break;
        case c_setlimit:      generate_setlimit(g, p); break;
        case c_dollar:        generate_dollar(g, p); break;
        case c_mathassign:    generate_integer_assign(g, p, "="); break;
        case c_plusassign:    generate_integer_assign(g, p, "+="); break;
        case c_minusassign:   generate_integer_assign(g, p, "-="); break;
        case c_multiplyassign:generate_integer_assign(g, p, "*="); break;
        case c_divideassign:
            /* Snowball specifies integer division with semantics matching C,
             * so Python's `/=` or `//=` isn't suitable (`//=` would be in
             * cases where we knew that the arguments had the same sign).
             *
             * The `float(`...`)` is needed for Python2.
             */
            g->V[0] = p->name;
            w(g, "~M~V0 = int(float(~V0) / ");
            generate_AE(g, p->AE);
            w(g, ")~N");
            break;
        case c_eq:            generate_integer_test(g, p, "=="); break;
        case c_ne:            generate_integer_test(g, p, "!="); break;
        case c_gr:            generate_integer_test(g, p, ">"); break;
        case c_ge:            generate_integer_test(g, p, ">="); break;
        case c_ls:            generate_integer_test(g, p, "<"); break;
        case c_le:            generate_integer_test(g, p, "<="); break;
        case c_call:          generate_call(g, p); break;
        case c_grouping:      generate_grouping(g, p, false); break;
        case c_non:           generate_grouping(g, p, true); break;
        case c_name:          generate_namedstring(g, p); break;
        case c_literalstring: generate_literalstring(g, p); break;
        case c_among:         generate_among(g, p); break;
        case c_substring:     generate_substring(g, p); break;
        case c_booltest:      generate_booltest(g, p); break;
        case c_false:         generate_false(g, p); break;
        case c_true:          break;
        case c_debug:         generate_debug(g, p); break;
        default: fprintf(stderr, "%d encountered\n", p->type);
                 exit(1);
    }

    /* Reinstate the caller's failure context; the copy taken on entry
     * replaces whatever the emitter left behind. */
    g->failure_label = a0;
    str_delete(g->failure_str);
    g->failure_str = a1;
}

/* Emit the top of the generated Python module: the import of the parent
 * class from .basestemmer, the import of Among, and the `class` line with
 * the opening of its docstring. */
static void generate_class_begin(struct generator * g) {
    w(g, "from .basestemmer import ");
    w(g,
g->options->parent_class_name);
    w(g, "~N"
         "from .among import Among~N"
         "~N"
         "~N"
         "class ~n(");
    w(g, g->options->parent_class_name);
    w(g, "):~N"
         "~+~M'''~N"
         "~MThis class implements the stemming algorithm defined by a snowball script.~N"
         "~M");
    write_generated_comment_content(g);
    w(g, "~N"
         "~M'''~N"
         "~N");
}

/* Emit one among table as a Python list `a_<number> = [ Among(...), ... ]`.
 *
 * Each entry carries the literal string, v->i and v->result; when the entry
 * has an associated function its name is appended as a quoted string, with a
 * `_<name-option>` prefix for routines (see the comment in the loop).
 */
static void generate_among_table(struct generator * g, struct among * x) {
    struct amongvec * v = x->b;

    g->I[0] = x->number;

    w(g, "~Ma_~I0 = [~N~+");
    {
        int i;
        for (i = 0; i < x->literalstring_count; i++) {
            g->I[0] = v->i;
            g->I[1] = v->result;
            g->L[0] = v->b;
            /* Comma after every entry except the last. */
            g->S[0] = i < x->literalstring_count - 1 ? "," : "";

            w(g, "~MAmong(~L0, ~I0, ~I1");
            if (v->function != 0) {
                w(g, ", \"");
                if (v->function->type == t_routine) {
                    /* Need to use mangled version of private name here. */
                    w(g, "_~n");
                }
                write_varname(g, v->function);
                w(g, "\"");
            }
            w(g, ")~S0~N");
            v++;
        }
    }
    w(g, "~-~M]~N~N");
}

/* Emit every among table known to the analyser, in list order. */
static void generate_amongs(struct generator * g) {
    struct among * x;
    for (x = g->analyser->amongs; x; x = x->next) {
        generate_among_table(g, x);
    }
}

/* Set bit i of the bitmap b: element i/8, bit i%8 counted from the least
 * significant end. */
static void set_bit(symbol * b, int i) { b[i/8] |= 1 << i%8; }

/* Emit a grouping as a Python list of integers forming a bitmap.  Bit
 * (ch - smallest_ch) is set for each character ch in q->b, packed eight bits
 * per list element. */
static void generate_grouping_table(struct generator * g, struct grouping * q) {
    int range = q->largest_ch - q->smallest_ch + 1;
    int size = (range + 7)/ 8;  /* assume 8 bits per symbol */
    symbol * b = q->b;
    symbol * map = create_b(size);
    int i;
    for (i = 0; i < size; i++) map[i] = 0;

    /* Using unicode would require revision here */

    for (i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch);

    g->V[0] = q->name;

    w(g, "~M~W0 = [");
    for (i = 0; i < size; i++) {
        write_int(g, map[i]);
        if (i < size - 1) w(g, ", ");
    }
    w(g, "]~N~N");
    lose_b(map);
}

/* Emit a table for each grouping whose name is marked as used. */
static void generate_groupings(struct generator * g) {
    struct grouping * q;
    for (q = g->analyser->groupings; q; q = q->next) {
        if (q->name->used)
            generate_grouping_table(g, q);
    }
}

/* Emit a class-level default for every string, integer and boolean name
 * ("", 0 and False respectively); names of other types emit nothing. */
static void generate_members(struct generator * g) {
    struct name * q;
    for (q = g->analyser->names; q; q = q->next) {
        g->V[0] = q;
        switch (q->type) {
            case t_string:
w(g, " ~W0 = \"\"~N"); break; case t_integer: w(g, " ~W0 = 0~N"); break; case t_boolean: w(g, " ~W0 = False~N"); break; } } } static void generate_methods(struct generator * g) { struct node * p = g->analyser->program; while (p != 0) { generate(g, p); g->unreachable = false; p = p->right; } } static void generate_label_classes(struct generator * g) { int i; for (i = 0; i <= g->max_label; i++) { g->I[0] = i; w(g, "~N~Nclass lab~I0(BaseException): pass~N"); } } extern void generate_program_python(struct generator * g) { g->outbuf = str_new(); g->failure_str = str_new(); write_start_comment(g, "# ", NULL); if (g->analyser->int_limits_used) { /* sys.maxsize is used in the code generated for maxint and minint */ w(g, "import sys~N~N"); } generate_class_begin(g); generate_amongs(g); generate_groupings(g); generate_members(g); generate_methods(g); generate_label_classes(g); output_str(g->options->output_src, g->outbuf); str_delete(g->failure_str); str_delete(g->outbuf); } snowball-2.2.0/compiler/generator_rust.c000066400000000000000000001105161414263061200203600ustar00rootroot00000000000000 #include /* for exit */ #include /* for strlen */ #include /* for fprintf etc */ #include "header.h" /* prototypes */ static void generate(struct generator * g, struct node * p); static void w(struct generator * g, const char * s); static void writef(struct generator * g, const char * s, struct node * p); static int new_label(struct generator * g) { return g->next_label++; } static struct str * vars_newname(struct generator * g) { struct str * output; g->var_number++; output = str_new(); str_append_string(output, "v_"); str_append_int(output, g->var_number); return output; } /* Write routines for items from the syntax tree */ static void write_varname(struct generator * g, struct name * p) { switch (p->type) { case t_external: break; default: { int ch = "SbirxG"[p->type]; write_char(g, ch); write_char(g, '_'); break; } } write_b(g, p->b); } static void write_varref(struct generator * 
g, struct name * p) { write_string(g, "context."); write_varname(g, p); } static void write_hexdigit(struct generator * g, int n) { write_char(g, n < 10 ? n + '0' : n - 10 + 'A'); } static void write_hex(struct generator * g, int ch) { write_string(g, "\\u{"); { int i; for (i = 12; i >= 0; i -= 4) write_hexdigit(g, ch >> i & 0xf); } write_string(g, "}"); } static void write_literal_string(struct generator * g, symbol * p) { int i = 0; write_string(g, "\""); while (i < SIZE(p)) { int ch; i += get_utf8(p + i, &ch); if (32 <= ch && ch < 127) { if (ch == '\"' || ch == '\\') write_string(g, "\\"); write_char(g, ch); } else { write_hex(g, ch); } } write_string(g, "\""); } static void write_margin(struct generator * g) { int i; for (i = 0; i < g->margin; i++) write_string(g, " "); } static void write_comment(struct generator * g, struct node * p) { if (!g->options->comments) return; write_margin(g); write_string(g, "// "); write_comment_content(g, p); write_newline(g); } static void write_block_start(struct generator * g) { w(g, "~+{~N"); } static void write_block_end(struct generator * g) /* block end */ { w(g, "~-~M}~N"); } static void write_savecursor(struct generator * g, struct node * p, struct str * savevar) { g->B[0] = str_data(savevar); g->S[1] = ""; if (p->mode != m_forward) g->S[1] = "env.limit - "; writef(g, "~Mlet ~B0 = ~S1env.cursor;~N", p); } static void restore_string(struct node * p, struct str * out, struct str * savevar) { str_clear(out); str_append_string(out, "env.cursor = "); if (p->mode != m_forward) str_append_string(out, "env.limit - "); str_append(out, savevar); str_append_string(out, ";"); } static void write_restorecursor(struct generator * g, struct node * p, struct str * savevar) { struct str * temp = str_new(); write_margin(g); restore_string(p, temp, savevar); write_str(g, temp); write_newline(g); str_delete(temp); } static void write_inc_cursor(struct generator * g, struct node * p) { write_margin(g); write_string(g, p->mode == m_forward ? 
"env.next_char();" : "env.previous_char();");
    write_newline(g);
}

/* Open the labelled block `'lab<n>: loop {` and deepen the margin. */
static void wsetlab_begin(struct generator * g, int n) {
    g->I[0] = n;
    w(g, "~M'lab~I0: loop {~N~+");
}

/* Close the block opened by wsetlab_begin().  A `break 'lab<n>;` is written
 * first when the end of the block is reachable, so that the `loop` runs its
 * body at most once. */
static void wsetlab_end(struct generator * g, int n) {
    if (!g->unreachable) {
        g->I[0] = n;
        w(g, "~Mbreak 'lab~I0;~N");
    }
    w(g, "~-~M}~N");
}

/* Emit `break 'lab<n>;`. */
static void wgotol(struct generator * g, int n) {
    g->I[0] = n;
    w(g, "~Mbreak 'lab~I0;~N");
}

/* Emit the current failure action: the pending failure string (if any) on
 * its own line, then `return false;` when the failure label is x_return or
 * `break 'lab<N>;` otherwise.  Marks the following code unreachable. */
static void write_failure(struct generator * g) {
    if (str_len(g->failure_str) != 0) {
        write_margin(g);
        write_str(g, g->failure_str);
        write_newline(g);
    }
    switch (g->failure_label) {
        case x_return:
            w(g, "~Mreturn false;~N");
            g->unreachable = true;
            break;
        default:
            g->I[0] = g->failure_label;
            w(g, "~Mbreak 'lab~I0;~N");
            g->unreachable = true;
    }
}

/* Emit `if <s> { <failure action> }`.  s is a writef() format string, so it
 * may contain ~ directives.  The code after the `if` stays reachable. */
static void write_failure_if(struct generator * g, char * s, struct node * p) {
    writef(g, "~Mif ", p);
    writef(g, s, p);
    writef(g, " ", p);
    write_block_start(g);
    write_failure(g);
    write_block_end(g);
    g->unreachable = false;
}

/* if at limit fail */
static void write_check_limit(struct generator * g, struct node * p) {
    if (p->mode == m_forward) {
        write_failure_if(g, "env.cursor >= env.limit", p);
    } else {
        write_failure_if(g, "env.cursor <= env.limit_backward", p);
    }
}

/* Formatted write.
*/
/* Copies `input` to the output, expanding `~` directives:
 *
 *   ~C  comment for node p          ~f  `{ <failure action> }` block
 *   ~M  margin (indentation)        ~N  newline
 *   ~{  block start                 ~}  block end
 *   ~+  deepen margin               ~-  reduce margin
 *   ~n  g->options->name
 *   ~Sd string slot g->S[d]         ~Bd symbol buffer slot g->B[d]
 *   ~Id integer slot g->I[d]        ~Ld literal string slot g->L[d]
 *   ~Vd variable reference g->V[d]  ~Wd bare variable name g->V[d]
 *
 * where d is a single decimal digit.  Any other character after `~` is
 * written literally.  The slot index is not range-checked.
 */
static void writef(struct generator * g, const char * input, struct node * p) {
    int i = 0;
    int l = strlen(input);

    while (i < l) {
        int ch = input[i++];
        if (ch != '~') {
            write_char(g, ch);
            continue;
        }
        switch (input[i++]) {
            default: write_char(g, input[i - 1]); continue;
            case 'C': write_comment(g, p); continue;
            case 'f': write_block_start(g);
                      write_failure(g);
                      g->unreachable = false;
                      write_block_end(g);
                      continue;
            case 'M': write_margin(g); continue;
            case 'N': write_newline(g); continue;
            case '{': write_block_start(g); continue;
            case '}': write_block_end(g); continue;
            case 'S': write_string(g, g->S[input[i++] - '0']); continue;
            case 'B': write_b(g, g->B[input[i++] - '0']); continue;
            case 'I': write_int(g, g->I[input[i++] - '0']); continue;
            case 'V': write_varref(g, g->V[input[i++] - '0']); continue;
            case 'W': write_varname(g, g->V[input[i++] - '0']); continue;
            case 'L': write_literal_string(g, g->L[input[i++] - '0']); continue;
            case '+': g->margin++; continue;
            case '-': g->margin--; continue;
            case 'n': write_string(g, g->options->name); continue;
        }
    }
}

/* writef() without a node; directives that need one (~C) get a null p. */
static void w(struct generator * g, const char * s) {
    writef(g, s, 0);
}

/* Write the Rust expression for arithmetic expression p.  Binary operators
 * are always wrapped in parentheses; node types not listed in the switch
 * produce no output. */
static void generate_AE(struct generator * g, struct node * p) {
    const char * s;
    switch (p->type) {
        case c_name:
            write_varref(g, p->name); break;
        case c_number:
            write_int(g, p->number); break;
        case c_maxint:
            write_string(g, "i32::MAX");
            break;
        case c_minint:
            write_string(g, "i32::MIN");
            break;
        case c_neg:
            write_char(g, '-'); generate_AE(g, p->right); break;
        case c_multiply:
            s = " * "; goto label0;
        case c_plus:
            s = " + "; goto label0;
        case c_minus:
            s = " - "; goto label0;
        case c_divide:
            s = " / ";
        label0:
            write_char(g, '('); generate_AE(g, p->left);
            write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break;
        case c_cursor:
            w(g, "env.cursor"); break;
        case c_limit:
            w(g, p->mode == m_forward ?
"env.limit" : "env.limit_backward"); break;
        case c_lenof:
            /* Length in characters of a string variable. */
            g->V[0] = p->name;
            w(g, "(~V0.chars().count() as i32)");
            break;
        case c_sizeof:
            /* Length as reported by .len() of a string variable. */
            g->V[0] = p->name;
            w(g, "(~V0.len() as i32)");
            break;
        case c_len:
            w(g, "(env.current.chars().count() as i32)");
            break;
        case c_size:
            w(g, "(env.current.len() as i32)");
            break;
    }
}

/* Generate each command of a bracketed sequence in order. */
static void generate_bra(struct generator * g, struct node * p) {
    write_comment(g, p);
    p = p->left;
    while (p) {
        generate(g, p);
        p = p->right;
    }
}

/* Emit Rust for "and": generate each operand in turn, restoring the saved
 * cursor between operands (not after the last) when K_needed() says the
 * cursor must be preserved.  Stops early once the code becomes unreachable. */
static void generate_and(struct generator * g, struct node * p) {
    struct str * savevar = vars_newname(g);
    int keep_c = K_needed(g, p->left);

    write_comment(g, p);

    if (keep_c) write_savecursor(g, p, savevar);

    p = p->left;
    while (p) {
        generate(g, p);
        if (g->unreachable) break;
        if (keep_c && p->right != 0) write_restorecursor(g, p, savevar);
        p = p->right;
    }
    str_delete(savevar);
}

/* Emit Rust for "or".
 *
 * Everything sits inside an outer labelled block (out_lab).  Each operand
 * except the last gets its own labelled block: failure inside it breaks to
 * the end of that block so the next operand is tried (after restoring the
 * cursor when needed), while success breaks out of the outer block.  The
 * last operand is generated with the caller's failure context reinstated.
 * The result is unreachable only if no operand can fall through.
 */
static void generate_or(struct generator * g, struct node * p) {
    struct str * savevar = vars_newname(g);
    int keep_c = K_needed(g, p->left);

    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);

    int out_lab = new_label(g);
    int end_unreachable = true;

    write_comment(g, p);
    wsetlab_begin(g, out_lab);

    if (keep_c) write_savecursor(g, p, savevar);

    p = p->left;
    str_clear(g->failure_str);

    if (p == 0) {
        /* p should never be 0 after an or: there should be at least two
         * sub nodes.
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit(1); } while (p->right != 0) { int label = new_label(g); g->failure_label = label; wsetlab_begin(g, label); generate(g, p); if (!g->unreachable) { wgotol(g, out_lab); end_unreachable = false; } w(g, "~-~M}~N"); g->unreachable = false; if (keep_c) write_restorecursor(g, p, savevar); p = p->right; } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); wsetlab_end(g, out_lab); if (!end_unreachable) { g->unreachable = false; } str_delete(savevar); } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g,"~Menv.limit_backward = env.cursor;~N" "~Menv.cursor = env.limit;~N", p); generate(g, p->left); w(g, "~Menv.cursor = env.limit_backward;~N"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int label = new_label(g); g->failure_label = label; write_comment(g, p); if (keep_c) { write_savecursor(g, p, savevar); } str_clear(g->failure_str); wsetlab_begin(g, label); generate(g, p->left); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; if (!g->unreachable) write_failure(g); w(g, "~-~M}~N"); g->unreachable = false; if (keep_c) write_restorecursor(g, p, savevar); str_delete(savevar); } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); int label = new_label(g); g->failure_label = label; str_clear(g->failure_str); write_comment(g, p); if (keep_c) { write_savecursor(g, p, savevar); restore_string(p, g->failure_str, savevar); } wsetlab_begin(g, label); generate(g, p->left); wsetlab_end(g, label); g->unreachable = false; str_delete(savevar); } static void generate_set(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, 
"~M~V0 = true;~N", p);
}

/* Emit Rust for "unset": assign false to the named boolean. */
static void generate_unset(struct generator * g, struct node * p) {
    write_comment(g, p);
    g->V[0] = p->name;
    writef(g, "~M~V0 = false;~N", p);
}

/* Emit Rust for "fail": run the operand, then fail unconditionally if its
 * end is reachable. */
static void generate_fail(struct generator * g, struct node * p) {
    write_comment(g, p);
    generate(g, p->left);
    if (!g->unreachable) write_failure(g);
}

/* generate_test() also implements 'reverse' */
/* The operand is generated in place; when the cursor needs preserving it is
 * saved beforehand and restored on the path where the operand completes. */
static void generate_test(struct generator * g, struct node * p) {
    struct str * savevar = vars_newname(g);
    int keep_c = K_needed(g, p->left);

    write_comment(g, p);

    if (keep_c) {
        write_savecursor(g, p, savevar);
    }

    generate(g, p->left);

    if (!g->unreachable) {
        if (keep_c) {
            write_restorecursor(g, p, savevar);
        }
    }
    str_delete(savevar);
}

/* Emit Rust for "do": run the operand and carry on whether or not it
 * succeeds, restoring the cursor afterwards when it was saved.  A bare
 * routine call is emitted as a plain statement whose result is ignored;
 * anything else runs inside a labelled block that failure breaks out of. */
static void generate_do(struct generator * g, struct node * p) {
    struct str * savevar = vars_newname(g);
    int keep_c = K_needed(g, p->left);
    write_comment(g, p);
    if (keep_c) write_savecursor(g, p, savevar);

    if (p->left->type == c_call) {
        /* Optimise do */
        write_comment(g, p->left);
        g->V[0] = p->left->name;
        w(g, "~M~W0(env, context);~N");
    } else {
        int label = new_label(g);
        g->failure_label = label;
        str_clear(g->failure_str);

        wsetlab_begin(g, label);
        generate(g, p->left);
        wsetlab_end(g, label);
        g->unreachable = false;
    }

    if (keep_c) write_restorecursor(g, p, savevar);
    str_delete(savevar);
}

/* Emit Rust for "goto" (style == 1) or "gopast" (style == 0).
 *
 * An outer `'golab<N>: loop` repeatedly attempts the operand inside an inner
 * labelled block.  When the attempt can complete, the generated code breaks
 * out of the outer loop (restoring the cursor first for "goto").  On failure
 * the cursor is restored when it was saved, the limit is checked using the
 * caller's failure context, and the cursor is stepped before the next pass.
 * The result is unreachable only if the operand can never fall through.
 */
static void generate_GO(struct generator * g, struct node * p, int style) {
    int end_unreachable = false;
    struct str * savevar = vars_newname(g);
    int keep_c = style == 1 || repeat_restore(g, p->left);

    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);

    int golab = new_label(g);
    g->I[0] = golab;
    write_comment(g, p);
    w(g, "~M'golab~I0: loop {~N~+");
    if (keep_c) write_savecursor(g, p, savevar);

    g->failure_label = new_label(g);
    str_clear(g->failure_str);
    wsetlab_begin(g, g->failure_label);
    generate(g, p->left);

    if (g->unreachable) {
        /* Cannot break out of this loop: therefore the code after the
         * end of the loop is unreachable.*/
        end_unreachable = true;
    } else {
        /* include for
goto; omit for gopast */
        if (style == 1) write_restorecursor(g, p, savevar);
        g->I[0] = golab;
        w(g, "~Mbreak 'golab~I0;~N");
    }
    g->unreachable = false;
    /* Close the inner labelled block opened by wsetlab_begin(). */
    w(g, "~-~M}~N");
    if (keep_c) write_restorecursor(g, p, savevar);

    g->failure_label = a0;
    str_delete(g->failure_str);
    g->failure_str = a1;

    write_check_limit(g, p);
    write_inc_cursor(g, p);
    write_block_end(g);
    str_delete(savevar);
    g->unreachable = end_unreachable;
}

/* Emit Rust for "loop": `for _ in 0..<AE> { ... }` around the operand.
 * The name obtained from vars_newname() never appears in the output; the
 * call still advances g->var_number. */
static void generate_loop(struct generator * g, struct node * p) {
    struct str * loopvar = vars_newname(g);
    write_comment(g, p);
    w(g, "~Mfor _ in 0..");
    generate_AE(g, p->AE);
    writef(g, " {~+~N", p);

    generate(g, p->left);

    w(g, "~-~M}~N");
    str_delete(loopvar);
    g->unreachable = false;
}

/* Shared emitter for "repeat" and "atleast".
 *
 * Emits an outer `'replab<N>: loop` containing a one-pass labelled block
 * (`'lab<M>: for _ in 0..1`).  When the operand can complete, the generated
 * code decrements loopvar (only if non-NULL) and continues the outer loop;
 * when it fails, control leaves the inner block, the cursor is restored when
 * it was saved, and the outer loop is left.
 *
 * g->failure_label is replaced and g->failure_str is cleared here; neither
 * is restored by this function.
 */
static void generate_repeat_or_atleast(struct generator * g, struct node * p, struct str * loopvar) {
    struct str * savevar = vars_newname(g);
    int keep_c = repeat_restore(g, p->left);
    int replab = new_label(g);
    g->I[0] = replab;
    writef(g, "~M'replab~I0: loop{~N~+", p);

    if (keep_c) write_savecursor(g, p, savevar);

    g->failure_label = new_label(g);
    str_clear(g->failure_str);
    g->I[0] = g->failure_label;
    w(g, "~M'lab~I0: for _ in 0..1 {~N~+");
    generate(g, p->left);

    if (!g->unreachable) {
        if (loopvar != 0) {
            g->B[0] = str_data(loopvar);
            w(g, "~M~B0 -= 1;~N");
        }

        g->I[0] = replab;
        w(g, "~Mcontinue 'replab~I0;~N");
    }
    w(g, "~-~M}~N");
    g->unreachable = false;

    if (keep_c) write_restorecursor(g, p, savevar);

    g->I[0] = replab;
    w(g, "~Mbreak 'replab~I0;~N~-~M}~N");
    str_delete(savevar);
}

/* Emit Rust for "repeat": the shared loop with no iteration counter. */
static void generate_repeat(struct generator * g, struct node * p) {
    write_comment(g, p);
    generate_repeat_or_atleast(g, p, NULL);
}

/* Emit Rust for "atleast": declare a mutable counter initialised to the AE,
 * run the shared repeat loop (which decrements it per successful pass), then
 * fail if the counter is still above zero.  The failure context is saved
 * around the shared loop because that helper overwrites it. */
static void generate_atleast(struct generator * g, struct node * p) {
    struct str * loopvar = vars_newname(g);

    write_comment(g, p);
    g->B[0] = str_data(loopvar);
    w(g, "~Mlet mut ~B0 = ");
    generate_AE(g, p->AE);
    w(g, ";~N");
    {
        int a0 = g->failure_label;
        struct str * a1 = str_copy(g->failure_str);

        generate_repeat_or_atleast(g, p, loopvar);

        g->failure_label = a0;
str_delete(g->failure_str); g->failure_str = a1; } g->B[0] = str_data(loopvar); write_failure_if(g, "~B0 > 0", p); str_delete(loopvar); } static void generate_setmark(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = env.cursor;~N", p); } static void generate_tomark(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? ">" : "<"; w(g, "~Mif env.cursor ~S0 "); generate_AE(g, p->AE); writef(g, " ", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; w(g, "~Menv.cursor = "); generate_AE(g, p->AE); writef(g, ";~N", p); } static void generate_atmark(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mif env.cursor != "); generate_AE(g, p->AE); writef(g, " ", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } static void generate_hop(struct generator * g, struct node * p) { write_comment(g, p); // Generate the AE to a temporary block so we can substitute it in // write_failure_if(). struct str * ae = str_new(); struct str * s = g->outbuf; g->outbuf = ae; generate_AE(g, p->AE); g->outbuf = s; g->B[0] = str_data(ae); g->S[0] = p->mode == m_forward ? "" : "_back"; g->S[1] = p->AE->type == c_number ? "" : "_checked"; write_failure_if(g, "!env.hop~S0~S1(~B0)", p); str_delete(ae); } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mif !env.slice_del() {~N" "~+~Mreturn false;~N~-" "~M}~N", p); } static void generate_next(struct generator * g, struct node * p) { write_comment(g, p); write_check_limit(g, p); write_inc_cursor(g, p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"env.limit" : "env.limit_backward"; writef(g, "~Menv.cursor = ~S0;~N", p); } static void generate_atlimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "env.limit" : "env.limit_backward"; g->S[1] = p->mode == m_forward ? "<" : ">"; write_failure_if(g, "env.cursor ~S1 ~S0", p); } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "env.bra" : "env.ket"; writef(g, "~M~S0 = env.cursor;~N", p); } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "env.ket" : "env.bra"; writef(g, "~M~S0 = env.cursor;~N", p); } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = env.assign_to();~N", p); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = env.slice_to();~N" "~Mif ~V0.is_empty() {~N" "~+~Mreturn false;~N~-~M}~N", p); } static void generate_address(struct generator * g, struct node * p) { /* If we deal with a string variable which is of type String we need to * pass it by reference not by value. Literalstrings on the other hand are * of type &'static str so we can pass them by value. 
*/ symbol * b = p->literalstring; if (b != 0) { write_literal_string(g, b); } else { write_char(g, '&'); write_varref(g, p->name); } } static void generate_insert(struct generator * g, struct node * p, int style) { int keep_c = style == c_attach; write_comment(g, p); if (p->mode == m_backward) keep_c = !keep_c; if (keep_c) w(g, "~Mlet c = env.cursor;~N"); w(g, "~Mlet (bra, ket) = (env.cursor, env.cursor);~N"); writef(g, "~Menv.insert(bra, ket, ", p); generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~Menv.cursor = c;~N"); } static void generate_assignfrom(struct generator * g, struct node * p) { int keep_c = p->mode == m_forward; /* like 'attach' */ write_comment(g, p); if (keep_c) writef(g, "~Mlet c = env.cursor;~N", p); /* Copying limits and cursors is necessary here because the rust * borrowchecker does not like taking something from someone you are about * to mutate... */ if (p->mode == m_forward) { writef(g, "~Mlet (bra, ket) = (env.cursor, env.limit);~N", p); } else { writef(g, "~Mlet (bra, ket) = (env.limit_backward, env.cursor);~N", p); } writef(g, "~Menv.insert(bra, ket, ", p); generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~Menv.cursor = c;~N"); } static void generate_slicefrom(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mif !env.slice_from("); generate_address(g, p); writef(g, ") {~N" "~+~Mreturn false;~N~-~M}~N", p); } static void generate_setlimit(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); struct str * varname = vars_newname(g); write_comment(g, p); if (p->left && p->left->type == c_tomark) { /* Special case for: * * setlimit tomark AE for C * * All uses of setlimit in the current stemmers we ship follow this * pattern, and by special-casing we can avoid having to save and * restore c. */ struct node * q = p->left; g->S[0] = q->mode == m_forward ? 
">" : "<"; w(g, "~Mif env.cursor ~S0 "); generate_AE(g, q->AE); w(g, " "); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~Mlet ~B0 = env.limit - env.cursor;~N"); w(g, "~Menv.limit = "); } else { w(g, "~Mlet ~B0 = env.limit_backward;~N"); w(g, "~Menv.limit_backward = "); } generate_AE(g, q->AE); writef(g, ";~N", q); if (p->mode == m_forward) { str_assign(g->failure_str, "env.limit += "); str_append(g->failure_str, varname); str_append_string(g->failure_str, ";"); } else { str_assign(g->failure_str, "env.limit_backward = "); str_append(g->failure_str, varname); str_append_string(g->failure_str, ";"); } } else { write_savecursor(g, p, savevar); generate(g, p->left); if (!g->unreachable) { g->B[0] = str_data(varname); if (p->mode == m_forward) { w(g, "~Mlet ~B0 = env.limit - env.cursor;~N"); w(g, "~Menv.limit = env.cursor;~N"); } else { w(g, "~Mlet ~B0 = env.limit_backward;~N"); w(g, "~Menv.limit_backward = env.cursor;~N"); } write_restorecursor(g, p, savevar); if (p->mode == m_forward) { str_assign(g->failure_str, "env.limit += "); str_append(g->failure_str, varname); str_append_string(g->failure_str, ";"); } else { str_assign(g->failure_str, "env.limit_backward = "); str_append(g->failure_str, varname); str_append_string(g->failure_str, ";"); } } } if (!g->unreachable) { generate(g, p->aux); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } } str_delete(varname); str_delete(savevar); } /* dollar sets snowball up to operate on a string variable as if it were the * current string */ static void generate_dollar(struct generator * g, struct node * p) { struct str * savevar_env = vars_newname(g); write_comment(g, p); g->V[0] = p->name; g->B[0] = str_data(savevar_env); writef(g, "~Mlet ~B0 = env.clone();~N" "~Menv.set_current_s(~V0.clone());~N" "~Menv.cursor = 0;~N" "~Menv.limit = env.current.len() as i32;~N", p); generate(g, 
p->left); if (!g->unreachable) { g->V[0] = p->name; g->B[0] = str_data(savevar_env); /* Update string variable. */ w(g, "~M~V0 = env.current.clone().into_owned();~N"); /* Reset env */ w(g, "~M*env = ~B0;~N"); } str_delete(savevar_env); } static void generate_integer_assign(struct generator * g, struct node * p, char * s) { g->V[0] = p->name; g->S[0] = s; w(g, "~M~V0 ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); } static void generate_integer_test(struct generator * g, struct node * p, char * s) { w(g, "~Mif !("); generate_AE(g, p->left); write_char(g, ' '); write_string(g, s); write_char(g, ' '); generate_AE(g, p->AE); w(g, ")"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } static void generate_call(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; write_failure_if(g, "!~W0(env, context)", p); } static void generate_grouping(struct generator * g, struct node * p, int complement) { struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "out" : "in"; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; write_failure_if(g, "!env.~S1_grouping~S0(~W0, ~I0, ~I1)", p); } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; g->V[0] = p->name; write_failure_if(g, "!env.eq_s~S0(&~V0)", p); } static void generate_literalstring(struct generator * g, struct node * p) { symbol * b = p->literalstring; write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"" : "_b"; g->L[0] = b; write_failure_if(g, "!env.eq_s~S0(&~L0)", p); } static void generate_setup_context(struct generator * g) { struct name * q; w(g, "~Mlet mut context = &mut Context {~+~N"); for (q = g->analyser->names; q; q = q->next) { g->V[0] = q; switch (q->type) { case t_string: w(g, "~M~W0: String::new(),~N"); break; case t_integer: w(g, "~M~W0: 0,~N"); break; case t_boolean: w(g, "~M~W0: false,~N"); break; } } w(g, "~-~M};~N"); } static void generate_define(struct generator * g, struct node * p) { struct name * q = p->name; struct str * saved_output = g->outbuf; g->V[0] = q; if (q->type == t_routine) { w(g, "~N~Mfn ~W0(env: &mut SnowballEnv, context: &mut Context) -> bool {~+~N"); } else { w(g, "~N~Mpub fn ~W0(env: &mut SnowballEnv) -> bool {~+~N"); generate_setup_context(g); } if (p->amongvar_needed) w(g, "~Mlet mut among_var;~N"); g->outbuf = str_new(); g->next_label = 0; g->var_number = 0; str_clear(g->failure_str); g->failure_label = x_return; g->unreachable = false; generate(g, p->left); if (!g->unreachable) w(g, "~Mreturn true;~N"); w(g, "~-~M}~N"); str_append(saved_output, g->outbuf); str_delete(g->outbuf); g->outbuf = saved_output; } static void generate_substring(struct generator * g, struct node * p) { struct among * x = p->among; write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; g->I[0] = x->number; if (!x->amongvar_needed) { write_failure_if(g, "env.find_among~S0(~A_~I0, context) == 0", p); } else { writef(g, "~Mamong_var = env.find_among~S0(~A_~I0, context);~N", p); write_failure_if(g, "among_var == 0", p); } } static void generate_among(struct generator * g, struct node * p) { struct among * x = p->among; if (x->substring == 0) generate_substring(g, p); if (x->starter != 0) generate(g, x->starter); if (x->command_count == 1 && x->nocommand_count == 0) { /* Only one outcome ("no match" already handled). 
*/ generate(g, x->commands[0]); } else if (x->command_count > 0) { int i; w(g, "~M"); for (i = 1; i <= x->command_count; i++) { g->I[0] = i; if (i > 1) w(g, " else "); w(g, "if among_var == ~I0 {~N~+"); generate(g, x->commands[i - 1]); w(g, "~-~M}"); g->unreachable = false; } w(g, "~N"); } } static void generate_booltest(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; write_failure_if(g, "!~V0", p); } static void generate_false(struct generator * g, struct node * p) { write_comment(g, p); write_failure(g); } static void generate_debug(struct generator * g, struct node * p) { write_comment(g, p); g->I[0] = g->debug_count++; g->I[1] = p->line_number; writef(g, "~Menv.debug(~I0, ~I1);~N", p); } static void generate(struct generator * g, struct node * p) { int a0; struct str * a1; if (g->unreachable) return; a0 = g->failure_label; a1 = str_copy(g->failure_str); switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; case c_or: generate_or(g, p); break; case c_backwards: generate_backwards(g, p); break; case c_not: generate_not(g, p); break; case c_set: generate_set(g, p); break; case c_unset: generate_unset(g, p); break; case c_try: generate_try(g, p); break; case c_fail: generate_fail(g, p); break; case c_reverse: case c_test: generate_test(g, p); break; case c_do: generate_do(g, p); break; case c_goto: generate_GO(g, p, 1); break; case c_gopast: generate_GO(g, p, 0); break; case c_repeat: generate_repeat(g, p); break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_setmark: generate_setmark(g, p); break; case c_tomark: generate_tomark(g, p); break; case c_atmark: generate_atmark(g, p); break; case c_hop: generate_hop(g, p); break; case c_delete: generate_delete(g, p); break; case c_next: generate_next(g, p); break; case c_tolimit: generate_tolimit(g, p); break; case c_atlimit: generate_atlimit(g, p); 
break; case c_leftslice: generate_leftslice(g, p); break; case c_rightslice: generate_rightslice(g, p); break; case c_assignto: generate_assignto(g, p); break; case c_sliceto: generate_sliceto(g, p); break; case c_assign: generate_assignfrom(g, p); break; case c_insert: case c_attach: generate_insert(g, p, p->type); break; case c_slicefrom: generate_slicefrom(g, p); break; case c_setlimit: generate_setlimit(g, p); break; case c_dollar: generate_dollar(g, p); break; case c_mathassign: generate_integer_assign(g, p, "="); break; case c_plusassign: generate_integer_assign(g, p, "+="); break; case c_minusassign: generate_integer_assign(g, p, "-="); break; case c_multiplyassign:generate_integer_assign(g, p, "*="); break; case c_divideassign: generate_integer_assign(g, p, "/="); break; case c_eq: generate_integer_test(g, p, "=="); break; case c_ne: generate_integer_test(g, p, "!="); break; case c_gr: generate_integer_test(g, p, ">"); break; case c_ge: generate_integer_test(g, p, ">="); break; case c_ls: generate_integer_test(g, p, "<"); break; case c_le: generate_integer_test(g, p, "<="); break; case c_call: generate_call(g, p); break; case c_grouping: generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case c_literalstring: generate_literalstring(g, p); break; case c_among: generate_among(g, p); break; case c_substring: generate_substring(g, p); break; case c_booltest: generate_booltest(g, p); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } /* rustc emits warnings if variables don't match the style guide */ /* (i.e. upper-case for globals, snake case for fields etc.) 
*/ /* To allow warning free compilation of generated code and */ /* consistency with snowball variable namings we allow some kind of warnings here */ static void generate_allow_warnings(struct generator * g) { w(g, "#![allow(non_upper_case_globals)]~N"); w(g, "#![allow(non_snake_case)]~N"); w(g, "#![allow(unused_variables)]~N"); w(g, "#![allow(unused_mut)]~N"); } static void generate_class_begin(struct generator * g) { w(g, "use snowball::SnowballEnv;~N"); if (g->analyser->among_count > 0) { w(g, "use snowball::Among;~N~N"); } } static void generate_among_table(struct generator * g, struct among * x) { struct amongvec * v = x->b; g->I[0] = x->number; g->I[1] = x->literalstring_count; w(g, "~Mstatic A_~I0: &'static [Among; ~I1] = &[~N~+"); { int i; for (i = 0; i < x->literalstring_count; i++) { g->I[0] = v->i; g->I[1] = v->result; g->L[0] = v->b; g->S[0] = ","; w(g, "~MAmong(~L0, ~I0, ~I1, "); if (v->function != 0) { w(g, "Some(&"); write_varname(g, v->function); w(g, ")"); } else { w(g, "None"); } w(g, ")~S0~N"); v++; } } w(g, "~-~M];~N~N"); } static void generate_amongs(struct generator * g) { struct among * x; for (x = g->analyser->amongs; x; x = x->next) { generate_among_table(g, x); } } static void set_bit(symbol * b, int i) { b[i/8] |= 1 << i%8; } static void generate_grouping_table(struct generator * g, struct grouping * q) { int range = q->largest_ch - q->smallest_ch + 1; int size = (range + 7)/ 8; /* assume 8 bits per symbol */ symbol * b = q->b; symbol * map = create_b(size); int i; for (i = 0; i < size; i++) map[i] = 0; for (i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); g->V[0] = q->name; g->I[0] = size; w(g, "~Mstatic ~W0: &'static [u8; ~I0] = &["); for (i = 0; i < size; i++) { write_int(g, map[i]); if (i < size - 1) w(g, ", "); } w(g, "];~N~N"); lose_b(map); } static void generate_groupings(struct generator * g) { struct grouping * q; for (q = g->analyser->groupings; q; q = q->next) { if (q->name->used) generate_grouping_table(g, q); } } 
static void generate_members(struct generator * g) { struct name * q; w(g, "#[derive(Clone)]~N"); w(g, "struct Context {~+~N"); for (q = g->analyser->names; q; q = q->next) { g->V[0] = q; switch (q->type) { case t_string: w(g, "~M~W0: String,~N"); break; case t_integer: w(g, "~M~W0: i32,~N"); break; case t_boolean: w(g, "~M~W0: bool,~N"); break; } } w(g, "~-}~N"); } static void generate_methods(struct generator * g) { struct node * p = g->analyser->program; while (p != 0) { generate(g, p); g->unreachable = false; p = p->right; } } extern void generate_program_rust(struct generator * g) { g->outbuf = str_new(); g->failure_str = str_new(); write_start_comment(g, "//! ", NULL); generate_allow_warnings(g); if (g->analyser->int_limits_used) { /* std::i32 is used in the code generated for i32::MAX and i32::MIN */ w(g, "use std::i32;~N~N"); } generate_class_begin(g); generate_amongs(g); generate_groupings(g); generate_members(g); generate_methods(g); output_str(g->options->output_src, g->outbuf); str_delete(g->failure_str); str_delete(g->outbuf); } snowball-2.2.0/compiler/header.h000066400000000000000000000301621414263061200165500ustar00rootroot00000000000000#include #define SNOWBALL_VERSION "2.2.0" typedef unsigned char byte; typedef unsigned short symbol; #define true 1 #define false 0 #define MALLOC check_malloc #define FREE check_free #define NEW(type, p) struct type * p = (struct type *) MALLOC(sizeof(struct type)) #define NEWVEC(type, p, n) struct type * p = (struct type *) MALLOC(sizeof(struct type) * (n)) #define SIZE(p) ((int *)(p))[-1] #define CAPACITY(p) ((int *)(p))[-2] extern symbol * create_b(int n); extern void report_b(FILE * out, const symbol * p); extern void lose_b(symbol * p); extern symbol * increase_capacity(symbol * p, int n); extern symbol * move_to_b(symbol * p, int n, const symbol * q); extern symbol * add_to_b(symbol * p, int n, const symbol * q); extern symbol * copy_b(const symbol * p); extern char * b_to_s(const symbol * p); extern symbol * 
add_s_to_b(symbol * p, const char * s); #define MOVE_TO_B(B, LIT) \ move_to_b(B, sizeof(LIT) / sizeof(LIT[0]), LIT) struct str; /* defined in space.c */ extern struct str * str_new(void); extern void str_delete(struct str * str); extern void str_append(struct str * str, const struct str * add); extern void str_append_ch(struct str * str, char add); extern void str_append_symbol(struct str * str, symbol add); extern void str_append_b(struct str * str, const symbol * q); extern void str_append_b_tail(struct str * str, const symbol * q, int skip); extern void str_append_string(struct str * str, const char * s); extern void str_append_int(struct str * str, int i); extern void str_clear(struct str * str); extern void str_assign(struct str * str, const char * s); extern struct str * str_copy(const struct str * old); extern symbol * str_data(const struct str * str); extern int str_len(const struct str * str); extern int str_back(const struct str *str); extern int get_utf8(const symbol * p, int * slot); extern int put_utf8(int ch, symbol * p); extern void output_str(FILE * outfile, struct str * str); typedef enum { ENC_SINGLEBYTE, ENC_UTF8, ENC_WIDECHARS } enc; struct m_pair { struct m_pair * next; symbol * name; symbol * value; }; /* struct input must be a prefix of struct tokeniser. */ struct input { struct input * next; symbol * p; int c; char * file; int file_needs_freeing; int line_number; }; struct include { struct include * next; symbol * b; }; enum token_codes { #include "syswords2.h" c_mathassign, c_name, c_number, c_literalstring, c_neg, c_call, c_grouping, c_booltest, NUM_TOKEN_CODES }; enum uplus_modes { UPLUS_NONE, UPLUS_DEFINED, UPLUS_UNICODE }; /* struct input must be a prefix of struct tokeniser. 
*/ struct tokeniser { struct input * next; symbol * p; int c; char * file; int file_needs_freeing; int line_number; symbol * b; symbol * b2; int number; int m_start; int m_end; struct m_pair * m_pairs; int get_depth; int error_count; int token; int previous_token; byte token_held; enc encoding; int omission; struct include * includes; /* Mode in which U+ has been used: * UPLUS_NONE - not used yet * UPLUS_DEFINED - stringdef U+xxxx .... * UPLUS_UNICODE - {U+xxxx} used with implicit meaning */ int uplusmode; char token_disabled[NUM_TOKEN_CODES]; }; extern symbol * get_input(const char * filename); extern struct tokeniser * create_tokeniser(symbol * b, char * file); extern int read_token(struct tokeniser * t); extern const char * name_of_token(int code); extern void disable_token(struct tokeniser * t, int code); extern void close_tokeniser(struct tokeniser * t); extern int space_count; extern void * check_malloc(int n); extern void check_free(void * p); struct node; struct name { struct name * next; symbol * b; int type; /* t_string etc */ int mode; /* )_ for routines, externals */ struct node * definition; /* ) */ int count; /* 0, 1, 2 for each type */ struct grouping * grouping; /* for grouping names */ byte referenced; byte used_in_among; /* Function used in among? */ byte value_used; /* (For variables) is its value ever used? */ byte initialised; /* (For variables) is it ever initialised? */ byte used_in_definition; /* (grouping) used in grouping definition? 
*/ struct node * used; /* First use, or NULL if not used */ struct name * local_to; /* Local to one routine/external */ int declaration_line_number;/* Line number of declaration */ }; struct literalstring { struct literalstring * next; symbol * b; }; struct amongvec { symbol * b; /* the string giving the case */ int size; /* - and its size */ struct node * action; /* the corresponding action */ int i; /* the amongvec index of the longest substring of b */ int result; /* the numeric result for the case */ int line_number; /* for diagnostics and stable sorting */ struct name * function; }; struct among { struct among * next; struct amongvec * b; /* pointer to the amongvec */ int number; /* amongs are numbered 0, 1, 2 ... */ int literalstring_count; /* in this among */ int command_count; /* in this among (includes "no command" entries) */ int nocommand_count; /* number of "no command" entries in this among */ int function_count; /* in this among */ int amongvar_needed; /* do we need to set among_var? */ struct node * starter; /* i.e. among( (starter) 'string' ... ) */ struct node * substring; /* i.e. substring ... among ( ... 
) */ struct node ** commands; /* array with command_count entries */ }; struct grouping { struct grouping * next; symbol * b; /* the characters of this group */ int largest_ch; /* character with max code */ int smallest_ch; /* character with min code */ struct name * name; /* so g->name->grouping == g */ int line_number; }; struct node { struct node * next; struct node * left; struct node * aux; /* used in setlimit */ struct among * among; /* used in among */ struct node * right; int type; int mode; struct node * AE; struct name * name; symbol * literalstring; int number; int line_number; int amongvar_needed; /* used in routine definitions */ }; enum name_types { t_size = 6, t_string = 0, t_boolean = 1, t_integer = 2, t_routine = 3, t_external = 4, t_grouping = 5 /* If this list is extended, adjust wvn in generator.c */ }; /* In name_count[i] below, remember that type is ----+---- 0 | string 1 | boolean 2 | integer 3 | routine 4 | external 5 | grouping */ struct analyser { struct tokeniser * tokeniser; struct node * nodes; struct name * names; struct literalstring * literalstrings; int mode; byte modifyable; /* false inside reverse(...) */ struct node * program; struct node * program_end; int name_count[t_size]; /* name_count[i] counts the number of names of type i */ struct among * amongs; struct among * amongs_end; int among_count; int amongvar_needed; /* used in reading routine definitions */ struct grouping * groupings; struct grouping * groupings_end; struct node * substring; /* pending 'substring' in current routine definition */ enc encoding; byte int_limits_used; /* are maxint or minint used? 
*/ }; enum analyser_modes { m_forward = 0, m_backward /*, m_integer */ }; extern void print_program(struct analyser * a); extern struct analyser * create_analyser(struct tokeniser * t); extern void close_analyser(struct analyser * a); extern void read_program(struct analyser * a); struct generator { struct analyser * analyser; struct options * options; int unreachable; /* 0 if code can be reached, 1 if current code * is unreachable. */ int var_number; /* Number of next variable to use. */ struct str * outbuf; /* temporary str to store output */ struct str * declarations; /* str storing variable declarations */ int next_label; #ifndef DISABLE_PYTHON int max_label; #endif int margin; /* if > 0, keep_count to restore in case of a failure; * if < 0, the negated keep_count for the limit to restore in case of * failure. */ int failure_keep_count; #if !defined(DISABLE_JAVA) && !defined(DISABLE_JS) && !defined(DISABLE_PYTHON) && !defined(DISABLE_CSHARP) struct str * failure_str; /* This is used by some generators instead of failure_keep_count */ #endif int label_used; /* Keep track of whether the failure label is used. */ int failure_label; int debug_count; int copy_from_count; /* count of calls to copy_from() */ const char * S[10]; /* strings */ symbol * B[10]; /* blocks */ int I[10]; /* integers */ struct name * V[5]; /* variables */ symbol * L[5]; /* literals, used in formatted write */ int line_count; /* counts number of lines output */ int line_labelled; /* in ISO C, will need extra ';' if it is a block end */ int literalstring_count; int keep_count; /* used to number keep/restore pairs to avoid compiler warnings about shadowed variables */ int temporary_used; /* track if temporary variable used (for Pascal) */ }; /* Special values for failure_label in struct generator. 
*/ enum special_labels { x_return = -1 }; struct options { /* for the command line: */ const char * output_file; const char * name; FILE * output_src; FILE * output_h; byte syntax_tree; byte comments; enc encoding; enum { LANG_JAVA, LANG_C, LANG_CPLUSPLUS, LANG_CSHARP, LANG_PASCAL, LANG_PYTHON, LANG_JAVASCRIPT, LANG_RUST, LANG_GO, LANG_ADA } make_lang; const char * externals_prefix; const char * variables_prefix; const char * runtime_path; const char * parent_class_name; const char * package; const char * go_snowball_runtime; const char * string_class; const char * among_class; struct include * includes; struct include * includes_end; }; /* Generator functions common to several backends. */ extern struct generator * create_generator(struct analyser * a, struct options * o); extern void close_generator(struct generator * g); extern void write_char(struct generator * g, int ch); extern void write_newline(struct generator * g); extern void write_string(struct generator * g, const char * s); extern void write_int(struct generator * g, int i); extern void write_symbol(struct generator * g, symbol s); extern void write_b(struct generator * g, symbol * b); extern void write_str(struct generator * g, struct str * str); extern void write_comment_content(struct generator * g, struct node * p); extern void write_generated_comment_content(struct generator * g); extern void write_start_comment(struct generator * g, const char * comment_start, const char * comment_end); extern int K_needed(struct generator * g, struct node * p); extern int repeat_restore(struct generator * g, struct node * p); /* Generator for C code. */ extern void generate_program_c(struct generator * g); #ifndef DISABLE_JAVA /* Generator for Java code. */ extern void generate_program_java(struct generator * g); #endif #ifndef DISABLE_CSHARP /* Generator for C# code. 
*/ extern void generate_program_csharp(struct generator * g); #endif #ifndef DISABLE_PASCAL extern void generate_program_pascal(struct generator * g); #endif #ifndef DISABLE_PYTHON /* Generator for Python code. */ extern void generate_program_python(struct generator * g); #endif #ifndef DISABLE_JS extern void generate_program_js(struct generator * g); #endif #ifndef DISABLE_RUST extern void generate_program_rust(struct generator * g); #endif #ifndef DISABLE_GO extern void generate_program_go(struct generator * g); #endif #ifndef DISABLE_ADA extern void generate_program_ada(struct generator * g); #endif snowball-2.2.0/compiler/space.c000066400000000000000000000157701414263061200164160ustar00rootroot00000000000000 #include /* for printf */ #include /* malloc, free */ #include /* memmove */ #include "header.h" #define HEAD 2*sizeof(int) #define EXTENDER 40 /* This modules provides a simple mechanism for arbitrary length writable strings, called 'blocks'. They are 'symbol *' items rather than 'char *' items however. The calls are: symbol * b = create_b(n); - create an empty block b with room for n symbols b = increase_capacity(b, n); - increase the capacity of block b by n symbols (b may change) b2 = copy_b(b) - copy block b into b2 lose_b(b); - lose block b b = move_to_b(b, n, p); - set the data in b to be the n symbols at address p b = add_to_b(b, n, p); - add the n symbols at address p to the end of the data in b SIZE(b) - is the number of symbols in b For example: symbol * b = create_b(0); { int i; char p[10]; for (i = 0; i < 100; i++) { sprintf(p, " %d", i); add_s_to_b(b, p); } } and b contains " 0 1 2 ... 99" spaced out as symbols. */ /* For a block b, SIZE(b) is the number of symbols so far written into it, CAPACITY(b) the total number it can contain, so SIZE(b) <= CAPACITY(b). In fact blocks have 1 extra character over the promised capacity so they can be zero terminated by 'b[SIZE(b)] = 0;' without fear of overwriting. 
*/ extern symbol * create_b(int n) { symbol * p = (symbol *) (HEAD + (char *) MALLOC(HEAD + (n + 1) * sizeof(symbol))); CAPACITY(p) = n; SIZE(p) = 0; return p; } extern void report_b(FILE * out, const symbol * p) { int i; for (i = 0; i < SIZE(p); i++) { if (p[i] > 255) { printf("In report_b, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]); exit(1); } putc(p[i], out); } } extern void output_str(FILE * outfile, struct str * str) { report_b(outfile, str_data(str)); } extern void lose_b(symbol * p) { if (p == 0) return; FREE((char *) p - HEAD); } extern symbol * increase_capacity(symbol * p, int n) { symbol * q = create_b(CAPACITY(p) + n + EXTENDER); memmove(q, p, CAPACITY(p) * sizeof(symbol)); SIZE(q) = SIZE(p); lose_b(p); return q; } extern symbol * move_to_b(symbol * p, int n, const symbol * q) { int x = n - CAPACITY(p); if (x > 0) p = increase_capacity(p, x); memmove(p, q, n * sizeof(symbol)); SIZE(p) = n; return p; } extern symbol * add_to_b(symbol * p, int n, const symbol * q) { int x = SIZE(p) + n - CAPACITY(p); if (x > 0) p = increase_capacity(p, x); memmove(p + SIZE(p), q, n * sizeof(symbol)); SIZE(p) += n; return p; } extern symbol * copy_b(const symbol * p) { int n = SIZE(p); symbol * q = create_b(n); move_to_b(q, n, p); return q; } int space_count = 0; extern void * check_malloc(int n) { space_count++; return malloc(n); } extern void check_free(void * p) { space_count--; free(p); } /* To convert a block to a zero terminated string: */ extern char * b_to_s(const symbol * p) { int n = SIZE(p); char * s = (char *)malloc(n + 1); { int i; for (i = 0; i < n; i++) { if (p[i] > 255) { printf("In b_to_s, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]); exit(1); } s[i] = (char)p[i]; } } s[n] = 0; return s; } /* To add a zero terminated string to a block. If p = 0 the block is created. 
*/ extern symbol * add_s_to_b(symbol * p, const char * s) { int n = strlen(s); int k; if (p == 0) p = create_b(n); k = SIZE(p); { int x = k + n - CAPACITY(p); if (x > 0) p = increase_capacity(p, x); } { int i; for (i = 0; i < n; i++) p[i + k] = s[i]; } SIZE(p) += n; return p; } /* The next section defines string handling capabilities in terms of the lower level block handling capabilities of space.c */ /* -------------------------------------------------------------*/ struct str { symbol * data; }; /* Create a new string. */ extern struct str * str_new(void) { struct str * output = (struct str *) malloc(sizeof(struct str)); output->data = create_b(0); return output; } /* Delete a string. */ extern void str_delete(struct str * str) { lose_b(str->data); free(str); } /* Append a str to this str. */ extern void str_append(struct str * str, const struct str * add) { symbol * q = add->data; str->data = add_to_b(str->data, SIZE(q), q); } /* Append a character to this str. */ extern void str_append_ch(struct str * str, char add) { symbol sym = (unsigned char)add; str->data = add_to_b(str->data, 1, &sym); } /* Append a low level block to a str. */ extern void str_append_b(struct str * str, const symbol * q) { str->data = add_to_b(str->data, SIZE(q), q); } /* Append the tail of a low level block to a str. */ extern void str_append_b_tail(struct str * str, const symbol * q, int skip) { if (skip < 0 || skip >= SIZE(q)) return; str->data = add_to_b(str->data, SIZE(q) - skip, q + skip); } /* Append a (char *, null terminated) string to a str. */ extern void str_append_string(struct str * str, const char * s) { str->data = add_s_to_b(str->data, s); } /* Append an integer to a str. 
*/ extern void str_append_int(struct str * str, int i) { char s[30]; sprintf(s, "%d", i); str_append_string(str, s); } /* Clear a string */ extern void str_clear(struct str * str) { SIZE(str->data) = 0; } /* Set a string */ extern void str_assign(struct str * str, const char * s) { str_clear(str); str_append_string(str, s); } /* Copy a string. */ extern struct str * str_copy(const struct str * old) { struct str * newstr = str_new(); str_append(newstr, old); return newstr; } /* Get the data stored in this str. */ extern symbol * str_data(const struct str * str) { return str->data; } /* Get the length of the str. */ extern int str_len(const struct str * str) { return SIZE(str->data); } /* Get the last character of the str. * * Or -1 if the string is empty. */ extern int str_back(const struct str *str) { return SIZE(str->data) ? str->data[SIZE(str->data) - 1] : -1; } extern int get_utf8(const symbol * p, int * slot) { int b0, b1; b0 = *p++; if (b0 < 0xC0) { /* 1100 0000 */ * slot = b0; return 1; } b1 = *p++; if (b0 < 0xE0) { /* 1110 0000 */ * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2; } * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (*p & 0x3F); return 3; } extern int put_utf8(int ch, symbol * p) { if (ch < 0x80) { p[0] = ch; return 1; } if (ch < 0x800) { p[0] = (ch >> 6) | 0xC0; p[1] = (ch & 0x3F) | 0x80; return 2; } p[0] = (ch >> 12) | 0xE0; p[1] = ((ch >> 6) & 0x3F) | 0x80; p[2] = (ch & 0x3F) | 0x80; return 3; } snowball-2.2.0/compiler/syswords.h000066400000000000000000000101151414263061200172110ustar00rootroot00000000000000static const struct system_word vocab[82+1] = { { 0, (const byte *)"", 82+1}, { 1, (const byte *)"$", c_dollar }, { 1, (const byte *)"(", c_bra }, { 1, (const byte *)")", c_ket }, { 1, (const byte *)"*", c_multiply }, { 1, (const byte *)"+", c_plus }, { 1, (const byte *)"-", c_minus }, { 1, (const byte *)"/", c_divide }, { 1, (const byte *)"<", c_ls }, { 1, (const byte *)"=", c_assign }, { 1, (const byte *)">", c_gr }, { 1, (const byte 
*)"?", c_debug }, { 1, (const byte *)"[", c_leftslice }, { 1, (const byte *)"]", c_rightslice }, { 2, (const byte *)"!=", c_ne }, { 2, (const byte *)"*=", c_multiplyassign }, { 2, (const byte *)"+=", c_plusassign }, { 2, (const byte *)"-=", c_minusassign }, { 2, (const byte *)"->", c_sliceto }, { 2, (const byte *)"/*", c_comment2 }, { 2, (const byte *)"//", c_comment1 }, { 2, (const byte *)"/=", c_divideassign }, { 2, (const byte *)"<+", c_insert }, { 2, (const byte *)"<-", c_slicefrom }, { 2, (const byte *)"<=", c_le }, { 2, (const byte *)"==", c_eq }, { 2, (const byte *)"=>", c_assignto }, { 2, (const byte *)">=", c_ge }, { 2, (const byte *)"as", c_as }, { 2, (const byte *)"do", c_do }, { 2, (const byte *)"or", c_or }, { 3, (const byte *)"and", c_and }, { 3, (const byte *)"for", c_for }, { 3, (const byte *)"get", c_get }, { 3, (const byte *)"hex", c_hex }, { 3, (const byte *)"hop", c_hop }, { 3, (const byte *)"len", c_len }, { 3, (const byte *)"non", c_non }, { 3, (const byte *)"not", c_not }, { 3, (const byte *)"set", c_set }, { 3, (const byte *)"try", c_try }, { 4, (const byte *)"fail", c_fail }, { 4, (const byte *)"goto", c_goto }, { 4, (const byte *)"loop", c_loop }, { 4, (const byte *)"next", c_next }, { 4, (const byte *)"size", c_size }, { 4, (const byte *)"test", c_test }, { 4, (const byte *)"true", c_true }, { 5, (const byte *)"among", c_among }, { 5, (const byte *)"false", c_false }, { 5, (const byte *)"lenof", c_lenof }, { 5, (const byte *)"limit", c_limit }, { 5, (const byte *)"unset", c_unset }, { 6, (const byte *)"atmark", c_atmark }, { 6, (const byte *)"attach", c_attach }, { 6, (const byte *)"cursor", c_cursor }, { 6, (const byte *)"define", c_define }, { 6, (const byte *)"delete", c_delete }, { 6, (const byte *)"gopast", c_gopast }, { 6, (const byte *)"insert", c_insert }, { 6, (const byte *)"maxint", c_maxint }, { 6, (const byte *)"minint", c_minint }, { 6, (const byte *)"repeat", c_repeat }, { 6, (const byte *)"sizeof", c_sizeof }, { 6, (const 
byte *)"tomark", c_tomark }, { 7, (const byte *)"atleast", c_atleast }, { 7, (const byte *)"atlimit", c_atlimit }, { 7, (const byte *)"decimal", c_decimal }, { 7, (const byte *)"reverse", c_reverse }, { 7, (const byte *)"setmark", c_setmark }, { 7, (const byte *)"strings", c_strings }, { 7, (const byte *)"tolimit", c_tolimit }, { 8, (const byte *)"booleans", c_booleans }, { 8, (const byte *)"integers", c_integers }, { 8, (const byte *)"routines", c_routines }, { 8, (const byte *)"setlimit", c_setlimit }, { 9, (const byte *)"backwards", c_backwards }, { 9, (const byte *)"externals", c_externals }, { 9, (const byte *)"groupings", c_groupings }, { 9, (const byte *)"stringdef", c_stringdef }, { 9, (const byte *)"substring", c_substring }, { 12, (const byte *)"backwardmode", c_backwardmode }, { 13, (const byte *)"stringescapes", c_stringescapes } }; snowball-2.2.0/compiler/syswords2.h000066400000000000000000000015301414263061200172740ustar00rootroot00000000000000 c_among = 4, c_and, c_as, c_assign, c_assignto, c_atleast, c_atlimit, c_atmark, c_attach, c_backwardmode, c_backwards, c_booleans, c_bra, c_comment1, c_comment2, c_cursor, c_debug, c_decimal, c_define, c_delete, c_divide, c_divideassign, c_do, c_dollar, c_eq, c_externals, c_fail, c_false, c_for, c_ge, c_get, c_gopast, c_goto, c_gr, c_groupings, c_hex, c_hop, c_insert, c_integers, c_ket, c_le, c_leftslice, c_len, c_lenof, c_limit, c_loop, c_ls, c_maxint, c_minint, c_minus, c_minusassign, c_multiply, c_multiplyassign, c_ne, c_next, c_non, c_not, c_or, c_plus, c_plusassign, c_repeat, c_reverse, c_rightslice, c_routines, c_set, c_setlimit, c_setmark, c_size, c_sizeof, c_slicefrom, c_sliceto, c_stringdef, c_stringescapes, c_strings, c_substring, c_test, c_tolimit, c_tomark, c_true, c_try, c_unset, snowball-2.2.0/compiler/tokeniser.c000066400000000000000000000453371414263061200173300ustar00rootroot00000000000000 #include /* stderr etc */ #include /* malloc free */ #include /* strlen */ #include /* isalpha etc */ 
#include "header.h" struct system_word { int s_size; /* size of system word */ const byte * s; /* pointer to the system word */ int code; /* its internal code */ }; /* ASCII collating assumed in syswords.c */ #include "syswords.h" #define INITIAL_INPUT_BUFFER_SIZE 8192 static int hex_to_num(int ch); static int smaller(int a, int b) { return a < b ? a : b; } extern symbol * get_input(const char * filename) { FILE * input = fopen(filename, "r"); if (input == 0) { return 0; } { symbol * u = create_b(INITIAL_INPUT_BUFFER_SIZE); int size = 0; while (true) { int ch = getc(input); if (ch == EOF) break; if (size >= CAPACITY(u)) u = increase_capacity(u, size); u[size++] = ch; } fclose(input); SIZE(u) = size; return u; } } static void error(struct tokeniser * t, const char * s1, int n, symbol * p, const char * s2) { if (t->error_count == 20) { fprintf(stderr, "... etc\n"); exit(1); } fprintf(stderr, "%s:%d: ", t->file, t->line_number); if (s1) fprintf(stderr, "%s", s1); if (p) { int i; for (i = 0; i < n; i++) fprintf(stderr, "%c", p[i]); } if (s2) fprintf(stderr, "%s", s2); fprintf(stderr, "\n"); t->error_count++; } static void error1(struct tokeniser * t, const char * s) { error(t, s, 0,0, 0); } static void error2(struct tokeniser * t, const char * s) { error(t, "unexpected end of text after ", 0,0, s); } static int compare_words(int m, symbol * p, int n, const byte * q) { if (m != n) return m - n; { int i; for (i = 0; i < n; i++) { int diff = p[i] - q[i]; if (diff) return diff; } } return 0; } static int find_word(int n, symbol * p) { int i = 0; int j = vocab->code; do { int k = i + (j - i)/2; const struct system_word * w = vocab + k; int diff = compare_words(n, p, w->s_size, w->s); if (diff == 0) return w->code; if (diff < 0) j = k; else i = k; } while (j - i != 1); return -1; } static int get_number(int n, symbol * p) { int x = 0; int i; for (i = 0; i < n; i++) x = 10*x + p[i] - '0'; return x; } static int eq_s(struct tokeniser * t, const char * s) { int l = strlen(s); 
if (SIZE(t->p) - t->c < l) return false; { int i; for (i = 0; i < l; i++) if (t->p[t->c + i] != s[i]) return false; } t->c += l; return true; } static int white_space(struct tokeniser * t, int ch) { switch (ch) { case '\n': t->line_number++; /* fall through */ case '\r': case '\t': case ' ': return true; } return false; } static symbol * find_in_m(struct tokeniser * t, int n, symbol * p) { struct m_pair * q; for (q = t->m_pairs; q; q = q->next) { symbol * name = q->name; if (n == SIZE(name) && memcmp(name, p, n * sizeof(symbol)) == 0) return q->value; } return 0; } static int read_literal_string(struct tokeniser * t, int c) { symbol * p = t->p; int ch; SIZE(t->b) = 0; while (true) { if (c >= SIZE(p)) { error2(t, "'"); return c; } ch = p[c]; if (ch == '\n') { error1(t, "string not terminated"); return c; } c++; if (ch == t->m_start) { /* Inside insert characters. */ int c0 = c; int newlines = false; /* no newlines as yet */ int black_found = false; /* no printing chars as yet */ while (true) { if (c >= SIZE(p)) { error2(t, "'"); return c; } ch = p[c]; c++; if (ch == t->m_end) break; if (!white_space(t, ch)) black_found = true; if (ch == '\n') newlines = true; if (newlines && black_found) { error1(t, "string not terminated"); return c; } } if (!newlines) { int n = c - c0 - 1; /* macro size */ int firstch = p[c0]; symbol * q = find_in_m(t, n, p + c0); if (q == 0) { if (n == 1 && (firstch == '\'' || firstch == t->m_start)) t->b = add_to_b(t->b, 1, p + c0); else if (n >= 3 && firstch == 'U' && p[c0 + 1] == '+') { int codepoint = 0; int x; if (t->uplusmode == UPLUS_DEFINED) { /* See if found with xxxx upper-cased. 
*/ symbol * uc = create_b(n); int i; for (i = 0; i != n; ++i) { uc[i] = toupper(p[c0 + i]); } q = find_in_m(t, n, uc); lose_b(uc); if (q != 0) { t->b = add_to_b(t->b, SIZE(q), q); continue; } error1(t, "Some U+xxxx stringdefs seen but not this one"); } else { t->uplusmode = UPLUS_UNICODE; } for (x = c0 + 2; x != c - 1; ++x) { int hex = hex_to_num(p[x]); if (hex < 0) { error1(t, "Bad hex digit following U+"); break; } codepoint = (codepoint << 4) | hex; } if (t->encoding == ENC_UTF8) { if (codepoint < 0 || codepoint > 0x01ffff) { error1(t, "character values exceed 0x01ffff"); } /* Ensure there's enough space for a max length * UTF-8 sequence. */ if (CAPACITY(t->b) < SIZE(t->b) + 3) { t->b = increase_capacity(t->b, 3); } SIZE(t->b) += put_utf8(codepoint, t->b + SIZE(t->b)); } else { symbol sym; if (t->encoding == ENC_SINGLEBYTE) { /* Only ISO-8859-1 is handled this way - for * other single-byte character sets you need * stringdef all the U+xxxx codes you use * like - e.g.: * * stringdef U+0171 hex 'FB' */ if (codepoint < 0 || codepoint > 0xff) { error1(t, "character values exceed 256"); } } else { if (codepoint < 0 || codepoint > 0xffff) { error1(t, "character values exceed 64K"); } } sym = codepoint; t->b = add_to_b(t->b, 1, &sym); } } else error(t, "string macro '", n, p + c0, "' undeclared"); } else t->b = add_to_b(t->b, SIZE(q), q); } } else { if (ch == '\'') return c; if (ch < 0 || ch >= 0x80) { if (t->encoding != ENC_WIDECHARS) { /* We don't really want people using non-ASCII literal * strings, but historically it's worked for single-byte * and UTF-8 if the source encoding matches what the * generated stemmer works in and it seems unfair to just * suddenly make this a hard error.` */ fprintf(stderr, "%s:%d: warning: Non-ASCII literal strings aren't " "portable - use stringdef instead\n", t->file, t->line_number); } else { error1(t, "Non-ASCII literal strings aren't " "portable - use stringdef instead"); } } t->b = add_to_b(t->b, 1, p + c - 1); } } } static int 
next_token(struct tokeniser * t) { symbol * p = t->p; int c = t->c; int ch; int code = -1; while (true) { if (c >= SIZE(p)) { t->c = c; return -1; } ch = p[c]; if (white_space(t, ch)) { c++; continue; } if (isalpha(ch)) { int c0 = c; while (c < SIZE(p) && (isalnum(p[c]) || p[c] == '_')) c++; code = find_word(c - c0, p + c0); if (code < 0 || t->token_disabled[code]) { t->b = move_to_b(t->b, c - c0, p + c0); code = c_name; } } else if (isdigit(ch)) { int c0 = c; while (c < SIZE(p) && isdigit(p[c])) c++; t->number = get_number(c - c0, p + c0); code = c_number; } else if (ch == '\'') { c = read_literal_string(t, c + 1); code = c_literalstring; } else { int lim = smaller(2, SIZE(p) - c); int i; for (i = lim; i > 0; i--) { code = find_word(i, p + c); if (code >= 0) { c += i; break; } } } if (code >= 0) { t->c = c; return code; } error(t, "'", 1, p + c, "' unknown"); c++; continue; } } static int next_char(struct tokeniser * t) { if (t->c >= SIZE(t->p)) return -1; return t->p[t->c++]; } static int next_real_char(struct tokeniser * t) { while (true) { int ch = next_char(t); if (!white_space(t, ch)) return ch; } } static void read_chars(struct tokeniser * t) { int ch = next_real_char(t); if (ch < 0) { error2(t, "stringdef"); return; } { int c0 = t->c-1; while (true) { ch = next_char(t); if (white_space(t, ch) || ch < 0) break; } t->b2 = move_to_b(t->b2, t->c - c0 - 1, t->p + c0); } } static int decimal_to_num(int ch) { if ('0' <= ch && ch <= '9') return ch - '0'; return -1; } static int hex_to_num(int ch) { if ('0' <= ch && ch <= '9') return ch - '0'; if ('a' <= ch && ch <= 'f') return ch - 'a' + 10; if ('A' <= ch && ch <= 'F') return ch - 'A' + 10; return -1; } static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) { int c = 0; int d = 0; while (true) { while (c < SIZE(p) && p[c] == ' ') c++; if (c == SIZE(p)) break; { int number = 0; while (c != SIZE(p)) { int ch = p[c]; if (ch == ' ') break; if (base == 10) { ch = decimal_to_num(ch); if (ch < 0) { 
error1(t, "decimal string contains non-digits"); return; } } else { ch = hex_to_num(ch); if (ch < 0) { error1(t, "hex string contains non-hex characters"); return; } } number = base * number + ch; c++; } if (t->encoding == ENC_SINGLEBYTE) { if (number < 0 || number > 0xff) { error1(t, "character values exceed 256"); return; } } else { if (number < 0 || number > 0xffff) { error1(t, "character values exceed 64K"); return; } } if (t->encoding == ENC_UTF8) d += put_utf8(number, p + d); else p[d++] = number; } } SIZE(p) = d; } extern int read_token(struct tokeniser * t) { symbol * p = t->p; int held = t->token_held; t->token_held = false; if (held) return t->token; while (true) { int code = next_token(t); switch (code) { case c_comment1: /* slash-slash comment */ while (t->c < SIZE(p) && p[t->c] != '\n') t->c++; continue; case c_comment2: /* slash-star comment */ while (true) { if (t->c >= SIZE(p)) { error1(t, "/* comment not terminated"); t->token = -1; return -1; } if (p[t->c] == '\n') t->line_number++; if (eq_s(t, "*/")) break; t->c++; } continue; case c_stringescapes: { int ch1 = next_real_char(t); int ch2 = next_real_char(t); if (ch2 < 0) { error2(t, "stringescapes"); continue; } if (ch1 == '\'') { error1(t, "first stringescape cannot be '"); continue; } t->m_start = ch1; t->m_end = ch2; continue; } case c_stringdef: { int base = 0; read_chars(t); code = read_token(t); if (code == c_hex) { base = 16; code = read_token(t); } else if (code == c_decimal) { base = 10; code = read_token(t); } if (code != c_literalstring) { error1(t, "string omitted after stringdef"); continue; } if (base > 0) convert_numeric_string(t, t->b, base); { NEW(m_pair, q); q->next = t->m_pairs; q->name = copy_b(t->b2); q->value = copy_b(t->b); t->m_pairs = q; if (t->uplusmode != UPLUS_DEFINED && (SIZE(t->b2) >= 3 && t->b2[0] == 'U' && t->b2[1] == '+')) { if (t->uplusmode == UPLUS_UNICODE) { error1(t, "U+xxxx already used with implicit meaning"); } else { t->uplusmode = UPLUS_DEFINED; } } } 
continue; } case c_get: code = read_token(t); if (code != c_literalstring) { error1(t, "string omitted after get"); continue; } t->get_depth++; if (t->get_depth > 10) { error1(t, "get directives go 10 deep. Looping?"); exit(1); } { NEW(input, q); char * file = b_to_s(t->b); symbol * u = get_input(file); if (u == 0) { struct include * r; for (r = t->includes; r; r = r->next) { symbol * b = copy_b(r->b); b = add_to_b(b, SIZE(t->b), t->b); free(file); file = b_to_s(b); u = get_input(file); lose_b(b); if (u != 0) break; } } if (u == 0) { error(t, "Can't get '", SIZE(t->b), t->b, "'"); exit(1); } memmove(q, t, sizeof(struct input)); t->next = q; t->p = u; t->c = 0; t->file = file; t->file_needs_freeing = true; t->line_number = 1; } p = t->p; continue; case -1: if (t->next) { lose_b(p); { struct input * q = t->next; memmove(t, q, sizeof(struct input)); p = t->p; FREE(q); } t->get_depth--; continue; } /* fall through */ default: t->previous_token = t->token; t->token = code; return code; } } } extern const char * name_of_token(int code) { int i; for (i = 1; i < vocab->code; i++) if ((vocab + i)->code == code) return (const char *)(vocab + i)->s; switch (code) { case c_mathassign: return "="; case c_name: return "name"; case c_number: return "number"; case c_literalstring:return "literal"; case c_neg: return "neg"; case c_grouping: return "grouping"; case c_call: return "call"; case c_booltest: return "Boolean test"; case -2: return "start of text"; case -1: return "end of text"; default: return "?"; } } extern void disable_token(struct tokeniser * t, int code) { t->token_disabled[code] = 1; } extern struct tokeniser * create_tokeniser(symbol * p, char * file) { NEW(tokeniser, t); t->next = 0; t->p = p; t->c = 0; t->file = file; t->file_needs_freeing = false; t->line_number = 1; t->b = create_b(0); t->b2 = create_b(0); t->m_start = -1; t->m_pairs = 0; t->get_depth = 0; t->error_count = 0; t->token_held = false; t->token = -2; t->previous_token = -2; t->uplusmode = 
UPLUS_NONE; memset(t->token_disabled, 0, sizeof(t->token_disabled)); return t; } extern void close_tokeniser(struct tokeniser * t) { lose_b(t->b); lose_b(t->b2); { struct m_pair * q = t->m_pairs; while (q) { struct m_pair * q_next = q->next; lose_b(q->name); lose_b(q->value); FREE(q); q = q_next; } } { struct input * q = t->next; while (q) { struct input * q_next = q->next; FREE(q); q = q_next; } } if (t->file_needs_freeing) free(t->file); FREE(t); } snowball-2.2.0/csharp/000077500000000000000000000000001414263061200146135ustar00rootroot00000000000000snowball-2.2.0/csharp/.gitignore000066400000000000000000000001061414263061200166000ustar00rootroot00000000000000*.o *.suo *.user *.GhostDoc.xml bin/ obj/ TestResults/ TestResult.xml snowball-2.2.0/csharp/Snowball/000077500000000000000000000000001414263061200163745ustar00rootroot00000000000000snowball-2.2.0/csharp/Snowball/Algorithms/000077500000000000000000000000001414263061200205055ustar00rootroot00000000000000snowball-2.2.0/csharp/Snowball/Algorithms/.gitignore000066400000000000000000000000171414263061200224730ustar00rootroot00000000000000*.generated.cs snowball-2.2.0/csharp/Snowball/Among.cs000066400000000000000000000076231414263061200177740ustar00rootroot00000000000000// Copyright (c) 2001, Dr Martin Porter // Copyright (c) 2002, Richard Boulton // Copyright (c) 2015, Cesar Souza // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright notice, // * this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // * notice, this list of conditions and the following disclaimer in the // * documentation and/or other materials provided with the distribution. 
// * Neither the name of the copyright holders nor the names of its contributors // * may be used to endorse or promote products derived from this software // * without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace Snowball { using System; using System.Text; /// /// Snowball's among construction. /// /// public sealed class Among { /// /// Search string. /// /// public string SearchString { get; private set; } /// /// Index to longest matching substring. /// /// public int MatchIndex { get; private set; } /// /// Result of the lookup. /// /// public int Result { get; private set; } /// /// Action to be invoked. /// /// public Func Action { get; private set; } /// /// Initializes a new instance of the class. /// /// /// The search string. /// The index to the longest matching substring. /// The result of the lookup. /// public Among(String str, int index, int result) : this(str, index, result, null) { } /// /// Initializes a new instance of the class. /// /// /// The search string. /// The index to the longest matching substring. /// The result of the lookup. /// The action to be performed, if any. 
/// public Among(String str, int index, int result, Func action) { this.SearchString = str; this.MatchIndex = index; this.Result = result; this.Action = action; } /// /// Returns a that represents this instance. /// /// /// /// A that represents this instance. /// /// public override string ToString() { return SearchString; } } } snowball-2.2.0/csharp/Snowball/AssemblyInfo.cs000066400000000000000000000026341414263061200213230ustar00rootroot00000000000000using System.Reflection; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; // General Information about an assembly is controlled through the following // set of attributes. Change these attribute values to modify the information // associated with an assembly. [assembly: AssemblyTitle("Snowball")] [assembly: AssemblyDescription("")] [assembly: AssemblyConfiguration("")] [assembly: AssemblyCompany("")] [assembly: AssemblyProduct("Snowball")] [assembly: AssemblyCopyright("Copyright © 2015-2019")] [assembly: AssemblyTrademark("")] [assembly: AssemblyCulture("")] // Setting ComVisible to false makes the types in this assembly not visible // to COM components. If you need to access a type in this assembly from // COM, set the ComVisible attribute to true on that type. 
[assembly: ComVisible(false)] // The following GUID is for the ID of the typelib if this project is exposed to COM [assembly: Guid("5c54ebc8-a3a3-46f8-b732-60b1440c8b0b")] // Version information for an assembly consists of the following four values: // // Major Version // Minor Version // Build Number // Revision // // You can specify all the values or you can default the Build and Revision Numbers // by using the '*' as shown below: // [assembly: AssemblyVersion("1.0.*")] [assembly: AssemblyVersion(/*SNOWBALL_VERSION*/"2.2.0.0")] [assembly: AssemblyFileVersion(/*SNOWBALL_VERSION*/"2.2.0.0")] snowball-2.2.0/csharp/Snowball/Stemmer.cs000066400000000000000000000431041414263061200203410ustar00rootroot00000000000000// Copyright (c) 2001, Dr Martin Porter // Copyright (c) 2002, Richard Boulton // Copyright (c) 2015, Cesar Souza // Copyright (c) 2018, Olly Betts // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright notice, // * this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // * notice, this list of conditions and the following disclaimer in the // * documentation and/or other materials provided with the distribution. // * Neither the name of the copyright holders nor the names of its contributors // * may be used to endorse or promote products derived from this software // * without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE // DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace Snowball { using System; using System.Linq; using System.Text; /// /// Class holding current state. /// /// public class Env { /// /// Initializes a new instance of the class. /// /// protected Env() { } /// /// Gets the current string. /// /// protected StringBuilder current; /// /// Current cursor position. /// /// protected int cursor; /// /// Forward limit for inspecting the buffer. /// /// protected int limit; /// /// Backward limit for inspecting the buffer. /// /// protected int limit_backward; /// /// Starting bracket position. /// /// protected int bra; /// /// Ending bracket position. /// /// protected int ket; /// /// Copy another Env object. /// /// public Env(Env other) { copy_from(other); } /// /// Copy another Env object. /// /// protected void copy_from(Env other) { current = other.current; cursor = other.cursor; limit = other.limit; limit_backward = other.limit_backward; bra = other.bra; ket = other.ket; } } /// /// Base class for Snowball's stemmer algorithms. /// /// public abstract class Stemmer : Env { /// /// Initializes a new instance of the class. /// /// protected Stemmer() { current = new StringBuilder(); setBufferContents(""); } /// /// Calls the stemmer to process the next word. /// /// protected abstract bool stem(); /// /// Stems the buffer's contents. /// /// public bool Stem() { return this.stem(); } /// /// Stems a given word. /// /// /// The word to be stemmed. /// /// The stemmed word. 
/// public string Stem(string word) { setBufferContents(word); this.stem(); return current.ToString(); } /// /// Gets the current processing buffer. /// /// public StringBuilder Buffer { get { return current; } } /// /// Gets or sets the current word to be stemmed /// or the stemmed word, if the stemmer has been /// processed. /// /// public string Current { get { return current.ToString(); } set { setBufferContents(value); } } private void setBufferContents(string value) { current.Clear(); current.Insert(0, value); cursor = 0; limit = current.Length; limit_backward = 0; bra = cursor; ket = limit; } /// /// Determines whether the current character is /// inside a given group of characters s. /// protected int in_grouping(string s, int min, int max, bool repeat) { do { if (cursor >= limit) return -1; char ch = current[cursor]; if (ch > max || ch < min) return 1; if (!s.Contains(ch)) return 1; cursor++; } while (repeat); return 0; } /// /// Determines whether the current character is /// inside a given group of characters s. /// protected int in_grouping_b(string s, int min, int max, bool repeat) { do { if (cursor <= limit_backward) return -1; char ch = current[cursor - 1]; if (ch > max || ch < min) return 1; if (!s.Contains(ch)) return 1; cursor--; } while (repeat); return 0; } /// /// Determines whether the current character is /// outside a given group of characters s. /// protected int out_grouping(string s, int min, int max, bool repeat) { do { if (cursor >= limit) return -1; char ch = current[cursor]; if (ch > max || ch < min) { cursor++; continue; } if (!s.Contains(ch)) { cursor++; continue; } return 1; } while (repeat); return 0; } /// /// Determines whether the current character is /// outside a given group of characters s. 
/// protected int out_grouping_b(string s, int min, int max, bool repeat) { do { if (cursor <= limit_backward) return -1; char ch = current[cursor - 1]; if (ch > max || ch < min) { cursor--; continue; } if (!s.Contains(ch)) { cursor--; continue; } return 1; } while (repeat); return 0; } /// /// Determines if the current buffer contains the /// string s, starting from the current position and /// going forward. /// protected bool eq_s(String s) { if (limit - cursor < s.Length) return false; for (int i = 0; i != s.Length; i++) { if (current[cursor + i] != s[i]) return false; } cursor += s.Length; return true; } /// /// Determines if the current buffer contains the /// string s, starting from the current position and /// going backwards. /// protected bool eq_s_b(String s) { if (cursor - limit_backward < s.Length) return false; for (int i = 0; i != s.Length; i++) { if (current[cursor - s.Length + i] != s[i]) return false; } cursor -= s.Length; return true; } /// /// Determines if the current buffer contains the /// string s, starting from the current position and /// going backwards. /// protected bool eq_s_b(StringBuilder s) { if (cursor - limit_backward < s.Length) return false; for (int i = 0; i != s.Length; i++) { if (current[cursor - s.Length + i] != s[i]) return false; } cursor -= s.Length; return true; } /// /// Searches if the current buffer matches against one of the /// amongs, starting from the current cursor position and going /// forward. /// /// protected int find_among(Among[] v) { int i = 0; int j = v.Length; int c = cursor; int l = limit; int common_i = 0; int common_j = 0; bool first_key_inspected = false; while (true) { int k = i + ((j - i) >> 1); int diff = 0; int common = common_i < common_j ? 
common_i : common_j; // smaller Among w = v[k]; for (int i2 = common; i2 < w.SearchString.Length; i2++) { if (c + common == l) { diff = -1; break; } diff = current[c + common] - w.SearchString[i2]; if (diff != 0) break; common++; } if (diff < 0) { j = k; common_j = common; } else { i = k; common_i = common; } if (j - i <= 1) { if (i > 0) break; // v->s has been inspected if (j == i) break; // only one item in v // - but now we need to go round once more to get // v->s inspected. This looks messy, but is actually // the optimal approach. if (first_key_inspected) break; first_key_inspected = true; } } while (true) { Among w = v[i]; if (common_i >= w.SearchString.Length) { cursor = c + w.SearchString.Length; if (w.Action == null) return w.Result; bool res = w.Action(); cursor = c + w.SearchString.Length; if (res) return w.Result; } i = w.MatchIndex; if (i < 0) return 0; } } /// /// Searches if the current buffer matches against one of the /// amongs, starting from the current cursor position and going /// backwards. /// /// protected int find_among_b(Among[] v) { int i = 0; int j = v.Length; int c = cursor; int lb = limit_backward; int common_i = 0; int common_j = 0; bool first_key_inspected = false; while (true) { int k = i + ((j - i) >> 1); int diff = 0; int common = common_i < common_j ? 
common_i : common_j; Among w = v[k]; for (int i2 = w.SearchString.Length - 1 - common; i2 >= 0; i2--) { if (c - common == lb) { diff = -1; break; } diff = current[c - 1 - common] - w.SearchString[i2]; if (diff != 0) break; common++; } if (diff < 0) { j = k; common_j = common; } else { i = k; common_i = common; } if (j - i <= 1) { if (i > 0) break; if (j == i) break; if (first_key_inspected) break; first_key_inspected = true; } } while (true) { Among w = v[i]; if (common_i >= w.SearchString.Length) { cursor = c - w.SearchString.Length; if (w.Action == null) return w.Result; bool res = w.Action(); cursor = c - w.SearchString.Length; if (res) return w.Result; } i = w.MatchIndex; if (i < 0) return 0; } } /// /// Replaces the characters between c_bra /// and c_ket by the characters in s. /// /// protected int replace_s(int c_bra, int c_ket, String s) { int adjustment = s.Length - (c_ket - c_bra); Replace(current, c_bra, c_ket, s); limit += adjustment; if (cursor >= c_ket) cursor += adjustment; else if (cursor > c_bra) cursor = c_bra; return adjustment; } /// /// Checks if a slicing can be done. /// protected void slice_check() { if (bra < 0 || bra > ket || ket > limit || limit > current.Length) { System.Diagnostics.Trace.WriteLine("faulty slice operation"); } } /// /// Replaces the contents of the bracket with the string s. /// /// /// The s. protected void slice_from(String s) { slice_check(); replace_s(bra, ket, s); } /// /// Removes the current bracket contents. /// /// protected void slice_del() { slice_from(""); } /// /// Replaces the contents of the bracket with the string s. /// /// protected void insert(int c_bra, int c_ket, String s) { int adjustment = replace_s(c_bra, c_ket, s); if (c_bra <= bra) bra += adjustment; if (c_bra <= ket) ket += adjustment; } /// /// Replaces the contents of the bracket with the string s. 
/// /// protected void insert(int c_bra, int c_ket, StringBuilder s) { int adjustment = replace_s(c_bra, c_ket, s.ToString()); if (c_bra <= bra) bra += adjustment; if (c_bra <= ket) ket += adjustment; } /// /// Replaces the contents of the bracket with the string s. /// /// protected void slice_to(StringBuilder s) { slice_check(); Replace(s, 0, s.Length, current.ToString(bra, ket - bra)); } /// /// Replaces the contents of the bracket with the string s. /// /// protected void assign_to(StringBuilder s) { Replace(s, 0, s.Length, current.ToString(0, limit)); } /// /// Replaces a specific region of the buffer with another text. /// public static StringBuilder Replace(StringBuilder sb, int index, int length, string text) { sb.Remove(index, length - index); sb.Insert(index, text); return sb; } } } snowball-2.2.0/csharp/Stemwords/000077500000000000000000000000001414263061200166025ustar00rootroot00000000000000snowball-2.2.0/csharp/Stemwords/App.config000066400000000000000000000002661414263061200205150ustar00rootroot00000000000000 snowball-2.2.0/csharp/Stemwords/Program.cs000066400000000000000000000076251414263061200205520ustar00rootroot00000000000000// Copyright (c) 2001, Dr Martin Porter // Copyright (c) 2002, Richard Boulton // Copyright (c) 2015, Cesar Souza // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright notice, // * this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // * notice, this list of conditions and the following disclaimer in the // * documentation and/or other materials provided with the distribution. 
// * Neither the name of the copyright holders nor the names of its contributors // * may be used to endorse or promote products derived from this software // * without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace Snowball { using System; using System.IO; using System.Reflection; using System.Linq; using System.Text; /// /// Snowball's Stemmer program. /// /// public static class Program { private static void usage() { Console.WriteLine("Usage: stemwords.exe -l -i [-o ]"); } /// /// Main program entrypoint. 
/// /// public static void Main(String[] args) { string language = null; string inputName = null; string outputName = null; for (int i = 0; i < args.Length; i++) { if (args[i] == "-l") language = args[i + 1]; else if (args[i] == "-i") inputName = args[i + 1]; if (args[i] == "-o") outputName = args[i + 1]; } if (language == null || inputName == null) { usage(); return; } Stemmer stemmer = typeof(Stemmer).Assembly.GetTypes() .Where(t => t.IsSubclassOf(typeof(Stemmer)) && !t.IsAbstract) .Where(t => match(t.Name, language)) .Select(t => (Stemmer)Activator.CreateInstance(t)).FirstOrDefault(); if (stemmer == null) { Console.WriteLine("Language not found."); return; } Console.WriteLine("Using " + stemmer.GetType()); TextWriter output = System.Console.Out; if (outputName != null) output = new StreamWriter(outputName); foreach (var line in File.ReadAllLines(inputName)) { var o = stemmer.Stem(line); output.WriteLine(o); } output.Flush(); } private static bool match(string stemmerName, string language) { string expectedName = language.Replace("_", "") + "Stemmer"; return stemmerName.StartsWith(expectedName, StringComparison.CurrentCultureIgnoreCase); } } } snowball-2.2.0/doc/000077500000000000000000000000001414263061200141005ustar00rootroot00000000000000snowball-2.2.0/doc/TODO000066400000000000000000000010221414263061200145630ustar00rootroot00000000000000Things to do: - Write documentation for how to use libstemmer (as opposed to how stemming algorithms themselves work). Currently, the documentation in the include/libstemmer.h header file is pretty clear and comprehensive, but an overview document wouldn't go amiss. Things that would be nice to include at some point. - Add version numbers to each stemming algorithm, and allow the interface to request a specific version of the stemming algorithms. Default to providing the latest version of the algorithm. 
snowball-2.2.0/doc/libstemmer_c_README000066400000000000000000000120711414263061200175060ustar00rootroot00000000000000libstemmer_c ============ This document pertains to the C version of the libstemmer distribution, available for download from: https://snowballstem.org/download.html Compiling the library ===================== A simple makefile is provided for Unix style systems. On such systems, it should be possible simply to run "make", and the file "libstemmer.o" and the example program "stemwords" will be generated. If this doesn't work on your system, you need to write your own build system (or call the compiler directly). The files to compile are all contained in the "libstemmer", "runtime" and "src_c" directories, and the public header file is contained in the "include" directory. The library comes in two flavours; UTF-8 only, and UTF-8 plus other character sets. To use the utf-8 only flavour, compile "libstemmer_utf8.c" instead of "libstemmer.c". For convenience "mkinc.mak" is a makefile fragment listing the source files and header files used to compile the standard version of the library. "mkinc_utf8.mak" is a comparable makefile fragment listing just the source files for the UTF-8 only version of the library. Using the library ================= The library provides a simple C API. Essentially, a new stemmer can be obtained by using "sb_stemmer_new". "sb_stemmer_stem" is then used to stem a word, "sb_stemmer_length" returns the stemmed length of the last word processed, and "sb_stemmer_delete" is used to delete a stemmer. Creating a stemmer is a relatively expensive operation - the expected usage pattern is that a new stemmer is created when needed, used to stem many words, and deleted after some time. Stemmers are re-entrant, but not threadsafe. In other words, if you wish to access the same stemmer object from multiple threads, you must ensure that all access is protected by a mutex or similar device. 
libstemmer does not currently incorporate any mechanism for caching the results of stemming operations. Such caching can greatly increase the performance of a stemmer under certain situations, so suitable patches will be considered for inclusion. The standard libstemmer sources contain an algorithm for each of the supported languages. The algorithm may be selected using the english name of the language, or using the 2 or 3 letter ISO 639 language codes. In addition, the traditional "Porter" stemming algorithm for english is included for backwards compatibility purposes, but we recommend use of the "English" stemmer in preference for new projects. (Some minor algorithms which are included only as curiosities in the snowball website, such as the Lovins stemmer and the Kraaij Pohlmann stemmer, are not included in the standard libstemmer sources. These are not really supported by the snowball project, but it would be possible to compile a modified libstemmer library containing these if desired.) The stemwords example ===================== The stemwords example program allows you to run any of the stemmers compiled into the libstemmer library on a sample vocabulary. For details on how to use it, run it with the "-h" command line option. Using the library in a larger system ==================================== If you are incorporating the library into the build system of a larger program, I recommend copying the unpacked tarball without modification into a subdirectory of the sources of your program. Future versions of the library are intended to keep the same structure, so this will keep the work required to move to a new version of the library to a minimum. As an additional convenience, the list of source and header files used in the library is detailed in mkinc.mak - a file which is in a suitable format for inclusion by a Makefile. By including this file in your build system, you can link the snowball system into your program with a few extra rules. 
Using the library in a system using GNU autotools ================================================= The libstemmer_c library can be integrated into a larger system which uses the GNU autotool framework (and in particular, automake and autoconf) as follows: 1) Unpack libstemmer_c-*.tar.gz in the top level project directory and rename the resulting directory to remove the version number so that there is a libstemmer_c subdirectory of the top level directory of the project. 2) Add a file "Makefile.am" to the unpacked libstemmer_c folder, containing: noinst_LTLIBRARIES = libstemmer.la include $(srcdir)/mkinc.mak noinst_HEADERS = $(snowball_headers) libstemmer_la_SOURCES = $(snowball_sources) (You may also need to add other lines to this, for example, if you are using compiler options which are not compatible with compiling the libstemmer library.) 3) Add libstemmer_c to the AC_CONFIG_FILES declaration in the project's configure.ac file. 4) Add to the top level makefile the following lines (or modify existing assignments to these variables appropriately): AUTOMAKE_OPTIONS = subdir-objects AM_CPPFLAGS = -I$(top_srcdir)/libstemmer_c/include SUBDIRS=libstemmer_c _LIBADD = libstemmer_c/libstemmer.la (Where is the name of the library or executable which links against libstemmer.) 
snowball-2.2.0/doc/libstemmer_csharp_README000066400000000000000000000012421414263061200205420ustar00rootroot00000000000000libstemmer_csharp ================= This document pertains to the C# version of the libstemmer distribution, available for download from: https://snowballstem.org/download.html Compiling the library ===================== To build a library:: mcs -target:library -out:snowballstemmer.dll csharp/Snowball/*.cs csharp/Snowball/Algorithms/*cs And to build the example program using that library:: mcs -target:exe -out:stemwords.exe -r:snowballstemmer.dll csharp/Stemwords/Program.cs Using the library ================= There is currently no formal documentation on the use of the C# version of the library. Additionally, its interface is not guaranteed to be stable. snowball-2.2.0/doc/libstemmer_java_README000066400000000000000000000021421414263061200202030ustar00rootroot00000000000000libstemmer_java =============== This document pertains to the Java version of the libstemmer distribution, available for download from: https://snowballstem.org/download.html Compiling the library ===================== Simply run the java compiler on all the java source files under the java directory. For example, this can be done under unix by changing directory into the java directory, and running: javac org/tartarus/snowball/*.java org/tartarus/snowball/ext/*.java This will compile the library and also an example program "TestApp" which provides a command line interface to the library. Using the library ================= There is currently no formal documentation on the use of the Java version of the library. Additionally, its interface is not guaranteed to be stable. The best documentation of the library is the source of the TestApp example program. The TestApp example =================== The TestApp example program allows you to run any of the stemmers compiled into the libstemmer library on a sample vocabulary. 
For details on how to use it, run it with no command line parameters. snowball-2.2.0/doc/libstemmer_js_README000066400000000000000000000011571414263061200177030ustar00rootroot00000000000000Snowball stemming library collection for Javascript =================================================== How to use library ------------------ You can use each stemming modules from Javascript code - e.g to use them with node: .. code-block:: javascript const stemmer = require('base-stemmer.js'); const english_stemmer = require('english-stemmer.js'); var stemmer = new EnglishStemmer(); alert(stemmer.stemWord("testing")); You'll need to bundle ``base-stemmer.js`` and whichever languages you want stemmers for (e.g. ``english-stemmer.js`` for English). FIXME: Document how to use in a web browser. snowball-2.2.0/doc/libstemmer_python_README000066400000000000000000000074741414263061200206200ustar00rootroot00000000000000Snowball stemming library collection for Python =============================================== Python 3 (>= 3.3) is supported. We no longer actively support Python 2 as the Python developers stopped supporting it at the start of 2020. Snowball 2.1.0 was the last release to officially support Python 2. What is Stemming? ----------------- Stemming maps different forms of the same word to a common "stem" - for example, the English stemmer maps *connection*, *connections*, *connective*, *connected*, and *connecting* to *connect*. So a searching for *connected* would also find documents which only have the other forms. This stem form is often a word itself, but this is not always the case as this is not a requirement for text search systems, which are the intended field of use. We also aim to conflate words with the same meaning, rather than all words with a common linguistic root (so *awe* and *awful* don't have the same stem), and over-stemming is more problematic than under-stemming so we tend not to stem in cases that are hard to resolve. 
If you want to always reduce words to a root form and/or get a root form which is itself a word then Snowball's stemming algorithms likely aren't the right answer. How to use library ------------------ The ``snowballstemmer`` module has two functions. The ``snowballstemmer.algorithms`` function returns a list of available algorithm names. The ``snowballstemmer.stemmer`` function takes an algorithm name and returns a ``Stemmer`` object. ``Stemmer`` objects have a ``Stemmer.stemWord(word)`` method and a ``Stemmer.stemWords(word[])`` method. .. code-block:: python import snowballstemmer stemmer = snowballstemmer.stemmer('english'); print(stemmer.stemWords("We are the world".split())); Automatic Acceleration ---------------------- `PyStemmer `_ is a wrapper module for Snowball's ``libstemmer_c`` and should provide results 100% compatible to **snowballstemmer**. **PyStemmer** is faster because it wraps generated C versions of the stemmers; **snowballstemmer** uses generate Python code and is slower but offers a pure Python solution. If PyStemmer is installed, ``snowballstemmer.stemmer`` returns a ``PyStemmer`` ``Stemmer`` object which provides the same ``Stemmer.stemWord()`` and ``Stemmer.stemWords()`` methods. Benchmark ~~~~~~~~~ This is a crude benchmark which measures the time for running each stemmer on every word in its sample vocabulary (10,787,583 words over 26 languages). It's not a realistic test of normal use as a real application would do much more than just stemming. It's also skewed towards the stemmers which do more work per word and towards those with larger sample vocabularies. * Python 2.7 + **snowballstemmer** : 13m00s (15.0 * PyStemmer) * Python 3.7 + **snowballstemmer** : 12m19s (14.2 * PyStemmer) * PyPy 7.1.1 (Python 2.7.13) + **snowballstemmer** : 2m14s (2.6 * PyStemmer) * PyPy 7.1.1 (Python 3.6.1) + **snowballstemmer** : 1m46s (2.0 * PyStemmer) * Python 2.7 + **PyStemmer** : 52s For reference the equivalent test for C runs in 9 seconds. 
These results are for Snowball 2.0.0. They're likely to evolve over time as the code Snowball generates for both Python and C continues to improve (for a much older test over a different set of stemmers using Python 2.7, **snowballstemmer** was 30 times slower than **PyStemmer**, or 9 times slower with **PyPy**). The message to take away is that if you're stemming a lot of words you should either install **PyStemmer** (which **snowballstemmer** will then automatically use for you as described above) or use PyPy. The TestApp example ------------------- The ``testapp.py`` example program allows you to run any of the stemmers on a sample vocabulary. Usage:: testapp.py "sentences ... " .. code-block:: bash $ python testapp.py English "sentences... " snowball-2.2.0/examples/000077500000000000000000000000001414263061200151515ustar00rootroot00000000000000snowball-2.2.0/examples/stemwords.c000066400000000000000000000143401414263061200173460ustar00rootroot00000000000000/* This is a simple program which uses libstemmer to provide a command * line interface for stemming using any of the algorithms provided. */ #include #include /* for malloc, free */ #include /* for memmove */ #include /* for isupper, tolower */ #include "libstemmer.h" const char * progname; static int pretty = 1; static void stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out) { #define INC 10 int lim = INC; sb_symbol * b = (sb_symbol *) malloc(lim * sizeof(sb_symbol)); while (1) { int ch = getc(f_in); if (ch == EOF) { free(b); return; } { int i = 0; int inlen = 0; while (ch != '\n' && ch != EOF) { if (i == lim) { sb_symbol * newb; newb = (sb_symbol *) realloc(b, (lim + INC) * sizeof(sb_symbol)); if (newb == 0) goto error; b = newb; lim = lim + INC; } /* Update count of utf-8 characters. 
*/ if (ch < 0x80 || ch > 0xBF) inlen += 1; /* force lower case: */ ch = tolower(ch); b[i] = ch; i++; ch = getc(f_in); } { const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i); if (stemmed == NULL) { fprintf(stderr, "Out of memory"); exit(1); } if (pretty == 1) { fwrite(b, i, 1, f_out); fputs(" -> ", f_out); } else if (pretty == 2) { fwrite(b, i, 1, f_out); if (sb_stemmer_length(stemmer) > 0) { int j; if (inlen < 30) { for (j = 30 - inlen; j > 0; j--) fputs(" ", f_out); } else { fputs("\n", f_out); for (j = 30; j > 0; j--) fputs(" ", f_out); } } } fputs((const char *)stemmed, f_out); putc('\n', f_out); } } } error: if (b != 0) free(b); return; } /** Display the command line syntax, and then exit. * @param n The value to exit with. */ static void usage(int n) { printf("usage: %s [-l ] [-i ] [-o ] [-c ] [-p[2]] [-h]\n" "\n" "The input file consists of a list of words to be stemmed, one per\n" "line. Words should be in lower case, but (for English) A-Z letters\n" "are mapped to their a-z equivalents anyway. If omitted, stdin is\n" "used.\n" "\n" "If -c is given, the argument is the character encoding of the input\n" "and output files. 
If it is omitted, the UTF-8 encoding is used.\n" "\n" "If -p is given the output file consists of each word of the input\n" "file followed by \"->\" followed by its stemmed equivalent.\n" "If -p2 is given the output file is a two column layout containing\n" "the input words in the first column and the stemmed equivalents in\n" "the second column.\n" "Otherwise, the output file consists of the stemmed words, one per\n" "line.\n" "\n" "-h displays this help\n", progname); exit(n); } int main(int argc, char * argv[]) { const char * in = 0; const char * out = 0; FILE * f_in; FILE * f_out; struct sb_stemmer * stemmer; const char * language = "english"; const char * charenc = NULL; int i = 1; pretty = 0; progname = argv[0]; while (i < argc) { const char * s = argv[i++]; if (s[0] == '-') { if (strcmp(s, "-o") == 0) { if (i >= argc) { fprintf(stderr, "%s requires an argument\n", s); exit(1); } out = argv[i++]; } else if (strcmp(s, "-i") == 0) { if (i >= argc) { fprintf(stderr, "%s requires an argument\n", s); exit(1); } in = argv[i++]; } else if (strcmp(s, "-l") == 0) { if (i >= argc) { fprintf(stderr, "%s requires an argument\n", s); exit(1); } language = argv[i++]; } else if (strcmp(s, "-c") == 0) { if (i >= argc) { fprintf(stderr, "%s requires an argument\n", s); exit(1); } charenc = argv[i++]; } else if (strcmp(s, "-p2") == 0) { pretty = 2; } else if (strcmp(s, "-p") == 0) { pretty = 1; } else if (strcmp(s, "-h") == 0) { usage(0); } else { fprintf(stderr, "option %s unknown\n", s); usage(1); } } else { fprintf(stderr, "unexpected parameter %s\n", s); usage(1); } } /* prepare the files */ f_in = (in == 0) ? stdin : fopen(in, "r"); if (f_in == 0) { fprintf(stderr, "file %s not found\n", in); exit(1); } f_out = (out == 0) ? 
stdout : fopen(out, "w"); if (f_out == 0) { fprintf(stderr, "file %s cannot be opened\n", out); exit(1); } /* do the stemming process: */ stemmer = sb_stemmer_new(language, charenc); if (stemmer == 0) { if (charenc == NULL) { fprintf(stderr, "language `%s' not available for stemming\n", language); exit(1); } else { fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc); exit(1); } } stem_file(stemmer, f_in, f_out); sb_stemmer_delete(stemmer); if (in != 0) (void) fclose(f_in); if (out != 0) (void) fclose(f_out); return 0; } snowball-2.2.0/go/000077500000000000000000000000001414263061200137405ustar00rootroot00000000000000snowball-2.2.0/go/README.md000066400000000000000000000034501414263061200152210ustar00rootroot00000000000000# Go Target for Snowball The initial implementation was built as a port of the Rust target. The initial focus has been on getting it to function, and making it work correctly. No attempt has been made to beautify the implementation, generated code, or address performance issues. ## Usage To generate Go source for a Snowball algorithm: ``` $ snowball path/to/algorithm.sbl -go -o algorithm ``` ### Go specific options `-gop[ackage]` the package name used in the generated go file (defaults to `snowball`) `-gor[untime]` the import path used for the Go Snowball runtime (defaults to `github.com/snowballstem/snowball/go`) ## Code Organization `compiler/generator_go.c` has the Go code generation logic `go/` contains the default Go Snowball runtime support `go/stemwords` contains the source for a Go version of the stemwords utility `go/algorithms` location where the makefile generated code will end up ## Using the Generated Stemmers Assuming you generated a stemmer, put that code in a package which is imported by this code as `english`. 
``` env := snowball.NewEnv("beautiful") english.Stem(env) fmt.Printf("stemmed word is: %s", env.Current()) ``` NOTE: you can use the env.SetCurrent("new_word") to reuse the env on subsequent calls to the stemmer. ## Testing Only the existing Snowball algorithms have been used for testing. This does not exercise all features of the language. Run: ``` $ make check_go ``` An initial pass of fuzz-testing has been performed on the generated stemmers for the algorithms in this repo. Each ran for 5 minutes and used an initial corpus seeded with 10k words from the algorithm's snowballstem-data voc.txt file. ## Known Limitations - Code going through generate_dollar production has not been tested - Code going through generate_debug production has not been tested snowball-2.2.0/go/among.go000066400000000000000000000004071414263061200153710ustar00rootroot00000000000000package snowball import "fmt" type AmongF func(env *Env, ctx interface{}) bool type Among struct { Str string A int32 B int32 F AmongF } func (a *Among) String() string { return fmt.Sprintf("str: `%s`, a: %d, b: %d, f: %p", a.Str, a.A, a.B, a.F) } snowball-2.2.0/go/env.go000066400000000000000000000161141414263061200150620ustar00rootroot00000000000000package snowball import ( "log" "strings" "unicode/utf8" ) // Env represents the Snowball execution environment type Env struct { current string Cursor int Limit int LimitBackward int Bra int Ket int } // NewEnv creates a new Snowball execution environment on the provided string func NewEnv(val string) *Env { return &Env{ current: val, Cursor: 0, Limit: len(val), LimitBackward: 0, Bra: 0, Ket: len(val), } } func (env *Env) Current() string { return env.current } func (env *Env) SetCurrent(s string) { env.current = s env.Cursor = 0 env.Limit = len(s) env.LimitBackward = 0 env.Bra = 0 env.Ket = len(s) } func (env *Env) ReplaceS(bra, ket int, s string) int32 { adjustment := int32(len(s)) - (int32(ket) - int32(bra)) result, _ := splitAt(env.current, bra) rsplit := ket if 
ket < bra { rsplit = bra } _, rhs := splitAt(env.current, rsplit) result += s result += rhs newLim := int32(env.Limit) + adjustment env.Limit = int(newLim) if env.Cursor >= ket { newCur := int32(env.Cursor) + adjustment env.Cursor = int(newCur) } else if env.Cursor > bra { env.Cursor = bra } env.current = result return adjustment } func (env *Env) EqS(s string) bool { if env.Cursor >= env.Limit { return false } if strings.HasPrefix(env.current[env.Cursor:], s) { env.Cursor += len(s) for !onCharBoundary(env.current, env.Cursor) { env.Cursor++ } return true } return false } func (env *Env) EqSB(s string) bool { if int32(env.Cursor)-int32(env.LimitBackward) < int32(len(s)) { return false } else if !onCharBoundary(env.current, env.Cursor-len(s)) || !strings.HasPrefix(env.current[env.Cursor-len(s):], s) { return false } else { env.Cursor -= len(s) return true } } func (env *Env) SliceFrom(s string) bool { bra, ket := env.Bra, env.Ket env.ReplaceS(bra, ket, s) return true } func (env *Env) NextChar() { env.Cursor++ for !onCharBoundary(env.current, env.Cursor) { env.Cursor++ } } func (env *Env) PrevChar() { env.Cursor-- for !onCharBoundary(env.current, env.Cursor) { env.Cursor-- } } func (env *Env) Hop(delta int32) bool { res := env.Cursor for delta > 0 { delta-- if res >= env.Limit { return false } res++ for res < env.Limit && !onCharBoundary(env.current, res) { res++ } } env.Cursor = res return true } func (env *Env) HopChecked(delta int32) bool { return delta >= 0 && env.Hop(delta) } func (env *Env) HopBack(delta int32) bool { res := env.Cursor for delta > 0 { delta-- if res <= env.LimitBackward { return false } res-- for res > env.LimitBackward && !onCharBoundary(env.current, res) { res-- } } env.Cursor = res return true } func (env *Env) HopBackChecked(delta int32) bool { return delta >= 0 && env.HopBack(delta) } func (env *Env) InGrouping(chars []byte, min, max int32) bool { if env.Cursor >= env.Limit { return false } r, _ := 
utf8.DecodeRuneInString(env.current[env.Cursor:]) if r != utf8.RuneError { if r > max || r < min { return false } r -= min if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 { return false } env.NextChar() return true } return false } func (env *Env) InGroupingB(chars []byte, min, max int32) bool { if env.Cursor <= env.LimitBackward { return false } env.PrevChar() r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:]) if r != utf8.RuneError { env.NextChar() if r > max || r < min { return false } r -= min if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 { return false } env.PrevChar() return true } return false } func (env *Env) OutGrouping(chars []byte, min, max int32) bool { if env.Cursor >= env.Limit { return false } r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:]) if r != utf8.RuneError { if r > max || r < min { env.NextChar() return true } r -= min if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 { env.NextChar() return true } } return false } func (env *Env) OutGroupingB(chars []byte, min, max int32) bool { if env.Cursor <= env.LimitBackward { return false } env.PrevChar() r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:]) if r != utf8.RuneError { env.NextChar() if r > max || r < min { env.PrevChar() return true } r -= min if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 { env.PrevChar() return true } } return false } func (env *Env) SliceDel() bool { return env.SliceFrom("") } func (env *Env) Insert(bra, ket int, s string) { adjustment := env.ReplaceS(bra, ket, s) if bra <= env.Bra { env.Bra = int(int32(env.Bra) + adjustment) } if bra <= env.Ket { env.Ket = int(int32(env.Ket) + adjustment) } } func (env *Env) SliceTo() string { return env.current[env.Bra:env.Ket] } func (env *Env) FindAmong(amongs []*Among, ctx interface{}) int32 { var i int32 j := int32(len(amongs)) c := env.Cursor l := env.Limit var commonI, commonJ int firstKeyInspected := false for { k := i + ((j - i) >> 1) var diff int32 common := min(commonI, commonJ) w := 
amongs[k] for lvar := common; lvar < len(w.Str); lvar++ { if c+common == l { diff-- break } diff = int32(env.current[c+common]) - int32(w.Str[lvar]) if diff != 0 { break } common++ } if diff < 0 { j = k commonJ = common } else { i = k commonI = common } if j-i <= 1 { if i > 0 { break } if j == i { break } if firstKeyInspected { break } firstKeyInspected = true } } for { w := amongs[i] if commonI >= len(w.Str) { env.Cursor = c + len(w.Str) if w.F != nil { res := w.F(env, ctx) env.Cursor = c + len(w.Str) if res { return w.B } } else { return w.B } } i = w.A if i < 0 { return 0 } } } func (env *Env) FindAmongB(amongs []*Among, ctx interface{}) int32 { var i int32 j := int32(len(amongs)) c := env.Cursor lb := env.LimitBackward var commonI, commonJ int firstKeyInspected := false for { k := i + ((j - i) >> 1) diff := int32(0) common := min(commonI, commonJ) w := amongs[k] for lvar := len(w.Str) - int(common) - 1; lvar >= 0; lvar-- { if c-common == lb { diff-- break } diff = int32(env.current[c-common-1]) - int32(w.Str[lvar]) if diff != 0 { break } // Count up commons. 
But not one character but the byte width of that char common++ } if diff < 0 { j = k commonJ = common } else { i = k commonI = common } if j-i <= 1 { if i > 0 { break } if j == i { break } if firstKeyInspected { break } firstKeyInspected = true } } for { w := amongs[i] if commonI >= len(w.Str) { env.Cursor = c - len(w.Str) if w.F != nil { res := w.F(env, ctx) env.Cursor = c - len(w.Str) if res { return w.B } } else { return w.B } } i = w.A if i < 0 { return 0 } } } func (env *Env) Debug(count, lineNumber int) { log.Printf("snowball debug, count: %d, line: %d", count, lineNumber) } func (env *Env) Clone() *Env { clone := *env return &clone } func (env *Env) AssignTo() string { return env.Current() } snowball-2.2.0/go/stemwords/000077500000000000000000000000001414263061200157675ustar00rootroot00000000000000snowball-2.2.0/go/stemwords/generate.go000066400000000000000000000020421414263061200201060ustar00rootroot00000000000000// +build ignore package main import ( "flag" "fmt" "io" "io/ioutil" "log" "os" ) // tool to register all algorithms built with the stemwords tool func main() { flag.Parse() if flag.NArg() < 1 { log.Fatal("must specify algorithms directory") } var w io.Writer if flag.NArg() > 1 { var err error w, err = os.Create(flag.Arg(1)) if err != nil { log.Fatalf("error creating output file %v", err) } } else { w = os.Stdout } fmt.Fprintf(w, "%s", header) files, err := ioutil.ReadDir(flag.Arg(0)) if err != nil { log.Fatal(err) } for _, file := range files { fmt.Fprintf(w, " %s \"github.com/snowballstem/snowball/go/algorithms/%s\"\n", file.Name(), file.Name()) } fmt.Fprintf(w, closeImportStartInit) for _, file := range files { fmt.Fprintf(w, " languages[\"%s\"] = %s.Stem\n", file.Name(), file.Name()) } fmt.Fprintf(w, "%s", footer) } var header = `// generated list of supported algorithms, DO NOT EDIT package main import ( ` var closeImportStartInit = `) func init() {` var footer = `} ` 
snowball-2.2.0/go/stemwords/main.go000066400000000000000000000023251414263061200172440ustar00rootroot00000000000000//go:generate go run generate.go ../algorithms algorithms.go //go:generate gofmt -s -w algorithms.go package main import ( "bufio" "flag" "fmt" "log" "os" snowballRuntime "github.com/snowballstem/snowball/go" ) var language = flag.String("l", "", "language") var input = flag.String("i", "", "input file") var output = flag.String("o", "", "output file") func main() { flag.Parse() if *language == "" { log.Fatal("must specify language") } stemmer, ok := languages[*language] if !ok { log.Fatalf("no language support for %s", *language) } var reader = os.Stdin if *input != "" { var err error reader, err = os.Open(*input) if err != nil { log.Fatal(err) } defer reader.Close() } var writer = os.Stdout if *output != "" { var err error writer, err = os.Create(*output) if err != nil { log.Fatal(err) } defer writer.Close() } var err error scanner := bufio.NewScanner(reader) for scanner.Scan() { word := scanner.Text() env := snowballRuntime.NewEnv(word) stemmer(env) fmt.Fprintf(writer, "%s\n", env.Current()) } if err = scanner.Err(); err != nil { log.Fatal(err) } } type StemFunc func(env *snowballRuntime.Env) bool var languages = make(map[string]StemFunc) snowball-2.2.0/go/util.go000066400000000000000000000012031414263061200152400ustar00rootroot00000000000000package snowball import ( "math" "unicode/utf8" ) const MaxInt = math.MaxInt32 const MinInt = math.MinInt32 func splitAt(str string, mid int) (string, string) { return str[:mid], str[mid:] } func min(a, b int) int { if a < b { return a } return b } func onCharBoundary(s string, pos int) bool { if pos <= 0 || pos >= len(s) { return true } return utf8.RuneStart(s[pos]) } // RuneCountInString is a wrapper around utf8.RuneCountInString // this allows us to not have to conditionally include // the utf8 package into some stemmers and not others func RuneCountInString(str string) int { return 
utf8.RuneCountInString(str) } snowball-2.2.0/iconv.py000066400000000000000000000024211414263061200150220ustar00rootroot00000000000000#!env python # Simple (but slow) iconv replacement in Python. import sys in_cs = out_cs = in_file = out_file = pending = None for arg in sys.argv[1:]: if pending != None: arg = pending + arg pending = None if arg.startswith('-'): if arg[1] in ('f', 't', 'o'): if len(arg) == 2: pending = arg continue if arg[1] == 'f': in_cs = arg[2:] continue if arg[1] == 't': out_cs = arg[2:] continue if arg[1] == 'o': out_file = open(arg[2:], 'wb') continue print("Unknown option: '%s'" % arg) sys.exit(1) if in_file == None: in_file = open(arg, 'rb') continue print("Too many arguments") sys.exit(1) if in_cs == None: print("Need to specify input cs with -f") sys.exit(1) if out_cs == None: print("Need to specify output cs with -t") sys.exit(1) if in_file == None: if hasattr(sys.stdin, 'buffer'): in_file = sys.stdin.buffer else: in_file = sys.stdin if out_file == None: if hasattr(sys.stdout, 'buffer'): out_file = sys.stdout.buffer else: out_file = sys.stdout out_file.write(in_file.read().decode(in_cs).encode(out_cs)) snowball-2.2.0/include/000077500000000000000000000000001414263061200147565ustar00rootroot00000000000000snowball-2.2.0/include/libstemmer.h000066400000000000000000000055771414263061200173100ustar00rootroot00000000000000 /* Make header file work when included from C++ */ #ifdef __cplusplus extern "C" { #endif struct sb_stemmer; typedef unsigned char sb_symbol; /* FIXME - should be able to get a version number for each stemming * algorithm (which will be incremented each time the output changes). */ /** Returns an array of the names of the available stemming algorithms. * Note that these are the canonical names - aliases (ie, other names for * the same algorithm) will not be included in the list. * The list is terminated with a null pointer. * * The list must not be modified in any way. 
*/ const char ** sb_stemmer_list(void); /** Create a new stemmer object, using the specified algorithm, for the * specified character encoding. * * All algorithms will usually be available in UTF-8, but may also be * available in other character encodings. * * @param algorithm The algorithm name. This is either the english * name of the algorithm, or the 2 or 3 letter ISO 639 codes for the * language. Note that case is significant in this parameter - the * value should be supplied in lower case. * * @param charenc The character encoding. NULL may be passed as * this value, in which case UTF-8 encoding will be assumed. Otherwise, * the argument may be one of "UTF_8", "ISO_8859_1" (i.e. Latin 1), * "ISO_8859_2" (i.e. Latin 2) or "KOI8_R" (Russian). Note that case is * significant in this parameter. * * @return NULL if the specified algorithm is not recognised, or the * algorithm is not available for the requested encoding. Otherwise, * returns a pointer to a newly created stemmer for the requested algorithm. * The returned pointer must be deleted by calling sb_stemmer_delete(). * * @note NULL will also be returned if an out of memory error occurs. */ struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc); /** Delete a stemmer object. * * This frees all resources allocated for the stemmer. After calling * this function, the supplied stemmer may no longer be used in any way. * * It is safe to pass a null pointer to this function - this will have * no effect. */ void sb_stemmer_delete(struct sb_stemmer * stemmer); /** Stem a word. * * The return value is owned by the stemmer - it must not be freed or * modified, and it will become invalid when the stemmer is called again, * or if the stemmer is freed. * * The length of the return value can be obtained using sb_stemmer_length(). * * If an out-of-memory error occurs, this will return NULL. 
*/ const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size); /** Get the length of the result of the last stemmed word. * This should not be called before sb_stemmer_stem() has been called. */ int sb_stemmer_length(struct sb_stemmer * stemmer); #ifdef __cplusplus } #endif snowball-2.2.0/java/000077500000000000000000000000001414263061200142545ustar00rootroot00000000000000snowball-2.2.0/java/org/000077500000000000000000000000001414263061200150435ustar00rootroot00000000000000snowball-2.2.0/java/org/tartarus/000077500000000000000000000000001414263061200167105ustar00rootroot00000000000000snowball-2.2.0/java/org/tartarus/snowball/000077500000000000000000000000001414263061200205315ustar00rootroot00000000000000snowball-2.2.0/java/org/tartarus/snowball/Among.java000066400000000000000000000016111414263061200224340ustar00rootroot00000000000000package org.tartarus.snowball; import java.lang.reflect.Method; public class Among { public Among (String s, int substring_i, int result) { this.s = s.toCharArray(); this.substring_i = substring_i; this.result = result; this.method = null; } public Among (String s, int substring_i, int result, String methodname, Class programclass) { this.s = s.toCharArray(); this.substring_i = substring_i; this.result = result; try { this.method = programclass.getDeclaredMethod(methodname); } catch (NoSuchMethodException e) { throw new RuntimeException(e); } } public final char[] s; /* search string */ public final int substring_i; /* index to longest matching substring */ public final int result; /* result of the lookup */ public final Method method; /* method to use if substring matches */ }; snowball-2.2.0/java/org/tartarus/snowball/SnowballProgram.java000066400000000000000000000212151414263061200245060ustar00rootroot00000000000000 package org.tartarus.snowball; import java.lang.reflect.InvocationTargetException; import java.io.Serializable; public class SnowballProgram implements Serializable { protected 
SnowballProgram() { current = new StringBuilder(); init(); } static final long serialVersionUID = 2016072500L; private void init() { cursor = 0; limit = current.length(); limit_backward = 0; bra = cursor; ket = limit; } /** * Set the current string. */ public void setCurrent(String value) { // Make a new StringBuilder. If we reuse the old one, and a user of // the library keeps a reference to the buffer returned (for example, // by converting it to a String in a way which doesn't force a copy), // the buffer size will not decrease, and we will risk wasting a large // amount of memory. // Thanks to Wolfram Esser for spotting this problem. current = new StringBuilder(value); init(); } /** * Get the current string. */ public String getCurrent() { return current.toString(); } // current string protected StringBuilder current; protected int cursor; protected int limit; protected int limit_backward; protected int bra; protected int ket; public SnowballProgram(SnowballProgram other) { current = other.current; cursor = other.cursor; limit = other.limit; limit_backward = other.limit_backward; bra = other.bra; ket = other.ket; } protected void copy_from(SnowballProgram other) { current = other.current; cursor = other.cursor; limit = other.limit; limit_backward = other.limit_backward; bra = other.bra; ket = other.ket; } protected boolean in_grouping(char [] s, int min, int max) { if (cursor >= limit) return false; char ch = current.charAt(cursor); if (ch > max || ch < min) return false; ch -= min; if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false; cursor++; return true; } protected boolean in_grouping_b(char [] s, int min, int max) { if (cursor <= limit_backward) return false; char ch = current.charAt(cursor - 1); if (ch > max || ch < min) return false; ch -= min; if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false; cursor--; return true; } protected boolean out_grouping(char [] s, int min, int max) { if (cursor >= limit) return false; char ch = 
current.charAt(cursor); if (ch > max || ch < min) { cursor++; return true; } ch -= min; if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) { cursor++; return true; } return false; } protected boolean out_grouping_b(char [] s, int min, int max) { if (cursor <= limit_backward) return false; char ch = current.charAt(cursor - 1); if (ch > max || ch < min) { cursor--; return true; } ch -= min; if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) { cursor--; return true; } return false; } protected boolean eq_s(CharSequence s) { if (limit - cursor < s.length()) return false; int i; for (i = 0; i != s.length(); i++) { if (current.charAt(cursor + i) != s.charAt(i)) return false; } cursor += s.length(); return true; } protected boolean eq_s_b(CharSequence s) { if (cursor - limit_backward < s.length()) return false; int i; for (i = 0; i != s.length(); i++) { if (current.charAt(cursor - s.length() + i) != s.charAt(i)) return false; } cursor -= s.length(); return true; } protected int find_among(Among v[]) { int i = 0; int j = v.length; int c = cursor; int l = limit; int common_i = 0; int common_j = 0; boolean first_key_inspected = false; while (true) { int k = i + ((j - i) >> 1); int diff = 0; int common = common_i < common_j ? common_i : common_j; // smaller Among w = v[k]; int i2; for (i2 = common; i2 < w.s.length; i2++) { if (c + common == l) { diff = -1; break; } diff = current.charAt(c + common) - w.s[i2]; if (diff != 0) break; common++; } if (diff < 0) { j = k; common_j = common; } else { i = k; common_i = common; } if (j - i <= 1) { if (i > 0) break; // v->s has been inspected if (j == i) break; // only one item in v // - but now we need to go round once more to get // v->s inspected. This looks messy, but is actually // the optimal approach. 
if (first_key_inspected) break; first_key_inspected = true; } } while (true) { Among w = v[i]; if (common_i >= w.s.length) { cursor = c + w.s.length; if (w.method == null) return w.result; boolean res; try { Object resobj = w.method.invoke(this); res = resobj.toString().equals("true"); } catch (InvocationTargetException e) { res = false; // FIXME - debug message } catch (IllegalAccessException e) { res = false; // FIXME - debug message } cursor = c + w.s.length; if (res) return w.result; } i = w.substring_i; if (i < 0) return 0; } } // find_among_b is for backwards processing. Same comments apply protected int find_among_b(Among v[]) { int i = 0; int j = v.length; int c = cursor; int lb = limit_backward; int common_i = 0; int common_j = 0; boolean first_key_inspected = false; while (true) { int k = i + ((j - i) >> 1); int diff = 0; int common = common_i < common_j ? common_i : common_j; Among w = v[k]; int i2; for (i2 = w.s.length - 1 - common; i2 >= 0; i2--) { if (c - common == lb) { diff = -1; break; } diff = current.charAt(c - 1 - common) - w.s[i2]; if (diff != 0) break; common++; } if (diff < 0) { j = k; common_j = common; } else { i = k; common_i = common; } if (j - i <= 1) { if (i > 0) break; if (j == i) break; if (first_key_inspected) break; first_key_inspected = true; } } while (true) { Among w = v[i]; if (common_i >= w.s.length) { cursor = c - w.s.length; if (w.method == null) return w.result; boolean res; try { Object resobj = w.method.invoke(this); res = resobj.toString().equals("true"); } catch (InvocationTargetException e) { res = false; // FIXME - debug message } catch (IllegalAccessException e) { res = false; // FIXME - debug message } cursor = c - w.s.length; if (res) return w.result; } i = w.substring_i; if (i < 0) return 0; } } /* to replace chars between c_bra and c_ket in current by the * chars in s. 
*/ protected int replace_s(int c_bra, int c_ket, String s) { int adjustment = s.length() - (c_ket - c_bra); current.replace(c_bra, c_ket, s); limit += adjustment; if (cursor >= c_ket) cursor += adjustment; else if (cursor > c_bra) cursor = c_bra; return adjustment; } protected void slice_check() { if (bra < 0 || bra > ket || ket > limit || limit > current.length()) // this line could be removed { System.err.println("faulty slice operation"); // FIXME: report error somehow. /* fprintf(stderr, "faulty slice operation:\n"); debug(z, -1, 0); exit(1); */ } } protected void slice_from(String s) { slice_check(); replace_s(bra, ket, s); } protected void slice_from(CharSequence s) { slice_from(s.toString()); } protected void slice_del() { slice_from(""); } protected void insert(int c_bra, int c_ket, String s) { int adjustment = replace_s(c_bra, c_ket, s); if (c_bra <= bra) bra += adjustment; if (c_bra <= ket) ket += adjustment; } protected void insert(int c_bra, int c_ket, CharSequence s) { insert(c_bra, c_ket, s.toString()); } /* Copy the slice into the supplied StringBuilder */ protected void slice_to(StringBuilder s) { slice_check(); s.replace(0, s.length(), current.substring(bra, ket)); } protected void assign_to(StringBuilder s) { s.replace(0, s.length(), current.substring(0, limit)); } /* extern void debug(struct SN_env * z, int number, int line_count) { int i; int limit = SIZE(z->p); //if (number >= 0) printf("%3d (line %4d): '", number, line_count); if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit); for (i = 0; i <= limit; i++) { if (z->lb == i) printf("{"); if (z->bra == i) printf("["); if (z->c == i) printf("|"); if (z->ket == i) printf("]"); if (z->l == i) printf("}"); if (i < limit) { int ch = z->p[i]; if (ch == 0) ch = '#'; printf("%c", ch); } } printf("'\n"); } */ }; snowball-2.2.0/java/org/tartarus/snowball/SnowballStemmer.java000066400000000000000000000002771414263061200245200ustar00rootroot00000000000000 package 
org.tartarus.snowball; public abstract class SnowballStemmer extends SnowballProgram { public abstract boolean stem(); static final long serialVersionUID = 2016072500L; }; snowball-2.2.0/java/org/tartarus/snowball/TestApp.java000066400000000000000000000037311414263061200227600ustar00rootroot00000000000000 package org.tartarus.snowball; import java.lang.reflect.Method; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.Reader; import java.io.Writer; import java.nio.charset.StandardCharsets; public class TestApp { private static void usage() { System.err.println("Usage: TestApp [] [-o ]"); } public static void main(String [] args) throws Throwable { if (args.length < 2) { usage(); return; } Class stemClass = Class.forName("org.tartarus.snowball.ext." + args[0] + "Stemmer"); SnowballStemmer stemmer = (SnowballStemmer) stemClass.newInstance(); int arg = 1; InputStream instream; if (args.length > arg && !args[arg].equals("-o")) { instream = new FileInputStream(args[arg++]); } else { instream = System.in; } OutputStream outstream; if (args.length > arg) { if (args.length != arg + 2 || !args[arg].equals("-o")) { usage(); return; } outstream = new FileOutputStream(args[arg + 1]); } else { outstream = System.out; } Reader reader = new InputStreamReader(instream, StandardCharsets.UTF_8); reader = new BufferedReader(reader); Writer output = new OutputStreamWriter(outstream, StandardCharsets.UTF_8); output = new BufferedWriter(output); StringBuffer input = new StringBuffer(); int character; while ((character = reader.read()) != -1) { char ch = (char) character; if (Character.isWhitespace(ch)) { stemmer.setCurrent(input.toString()); stemmer.stem(); output.write(stemmer.getCurrent()); output.write('\n'); input.delete(0, input.length()); } else { input.append(ch < 
127 ? Character.toLowerCase(ch) : ch); } } output.flush(); } } snowball-2.2.0/javascript/000077500000000000000000000000001414263061200155015ustar00rootroot00000000000000snowball-2.2.0/javascript/base-stemmer.js000066400000000000000000000177051414263061200204350ustar00rootroot00000000000000/**@constructor*/ BaseStemmer = function() { this.setCurrent = function(value) { this.current = value; this.cursor = 0; this.limit = this.current.length; this.limit_backward = 0; this.bra = this.cursor; this.ket = this.limit; }; this.getCurrent = function() { return this.current; }; this.copy_from = function(other) { this.current = other.current; this.cursor = other.cursor; this.limit = other.limit; this.limit_backward = other.limit_backward; this.bra = other.bra; this.ket = other.ket; }; this.in_grouping = function(s, min, max) { if (this.cursor >= this.limit) return false; var ch = this.current.charCodeAt(this.cursor); if (ch > max || ch < min) return false; ch -= min; if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) return false; this.cursor++; return true; }; this.in_grouping_b = function(s, min, max) { if (this.cursor <= this.limit_backward) return false; var ch = this.current.charCodeAt(this.cursor - 1); if (ch > max || ch < min) return false; ch -= min; if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) return false; this.cursor--; return true; }; this.out_grouping = function(s, min, max) { if (this.cursor >= this.limit) return false; var ch = this.current.charCodeAt(this.cursor); if (ch > max || ch < min) { this.cursor++; return true; } ch -= min; if ((s[ch >>> 3] & (0X1 << (ch & 0x7))) == 0) { this.cursor++; return true; } return false; }; this.out_grouping_b = function(s, min, max) { if (this.cursor <= this.limit_backward) return false; var ch = this.current.charCodeAt(this.cursor - 1); if (ch > max || ch < min) { this.cursor--; return true; } ch -= min; if ((s[ch >>> 3] & (0x1 << (ch & 0x7))) == 0) { this.cursor--; return true; } return false; }; this.eq_s = function(s) { if 
(this.limit - this.cursor < s.length) return false; if (this.current.slice(this.cursor, this.cursor + s.length) != s) { return false; } this.cursor += s.length; return true; }; this.eq_s_b = function(s) { if (this.cursor - this.limit_backward < s.length) return false; if (this.current.slice(this.cursor - s.length, this.cursor) != s) { return false; } this.cursor -= s.length; return true; }; /** @return {number} */ this.find_among = function(v) { var i = 0; var j = v.length; var c = this.cursor; var l = this.limit; var common_i = 0; var common_j = 0; var first_key_inspected = false; while (true) { var k = i + ((j - i) >>> 1); var diff = 0; var common = common_i < common_j ? common_i : common_j; // smaller // w[0]: string, w[1]: substring_i, w[2]: result, w[3]: function (optional) var w = v[k]; var i2; for (i2 = common; i2 < w[0].length; i2++) { if (c + common == l) { diff = -1; break; } diff = this.current.charCodeAt(c + common) - w[0].charCodeAt(i2); if (diff != 0) break; common++; } if (diff < 0) { j = k; common_j = common; } else { i = k; common_i = common; } if (j - i <= 1) { if (i > 0) break; // v->s has been inspected if (j == i) break; // only one item in v // - but now we need to go round once more to get // v->s inspected. This looks messy, but is actually // the optimal approach. if (first_key_inspected) break; first_key_inspected = true; } } do { var w = v[i]; if (common_i >= w[0].length) { this.cursor = c + w[0].length; if (w.length < 4) return w[2]; var res = w[3](this); this.cursor = c + w[0].length; if (res) return w[2]; } i = w[1]; } while (i >= 0); return 0; }; // find_among_b is for backwards processing. Same comments apply this.find_among_b = function(v) { var i = 0; var j = v.length var c = this.cursor; var lb = this.limit_backward; var common_i = 0; var common_j = 0; var first_key_inspected = false; while (true) { var k = i + ((j - i) >> 1); var diff = 0; var common = common_i < common_j ? 
common_i : common_j; var w = v[k]; var i2; for (i2 = w[0].length - 1 - common; i2 >= 0; i2--) { if (c - common == lb) { diff = -1; break; } diff = this.current.charCodeAt(c - 1 - common) - w[0].charCodeAt(i2); if (diff != 0) break; common++; } if (diff < 0) { j = k; common_j = common; } else { i = k; common_i = common; } if (j - i <= 1) { if (i > 0) break; if (j == i) break; if (first_key_inspected) break; first_key_inspected = true; } } do { var w = v[i]; if (common_i >= w[0].length) { this.cursor = c - w[0].length; if (w.length < 4) return w[2]; var res = w[3](this); this.cursor = c - w[0].length; if (res) return w[2]; } i = w[1]; } while (i >= 0); return 0; }; /* to replace chars between c_bra and c_ket in this.current by the * chars in s. */ this.replace_s = function(c_bra, c_ket, s) { var adjustment = s.length - (c_ket - c_bra); this.current = this.current.slice(0, c_bra) + s + this.current.slice(c_ket); this.limit += adjustment; if (this.cursor >= c_ket) this.cursor += adjustment; else if (this.cursor > c_bra) this.cursor = c_bra; return adjustment; }; this.slice_check = function() { if (this.bra < 0 || this.bra > this.ket || this.ket > this.limit || this.limit > this.current.length) { return false; } return true; }; this.slice_from = function(s) { var result = false; if (this.slice_check()) { this.replace_s(this.bra, this.ket, s); result = true; } return result; }; this.slice_del = function() { return this.slice_from(""); }; this.insert = function(c_bra, c_ket, s) { var adjustment = this.replace_s(c_bra, c_ket, s); if (c_bra <= this.bra) this.bra += adjustment; if (c_bra <= this.ket) this.ket += adjustment; }; this.slice_to = function() { var result = ''; if (this.slice_check()) { result = this.current.slice(this.bra, this.ket); } return result; }; this.assign_to = function() { return this.current.slice(0, this.limit); }; }; snowball-2.2.0/javascript/stemwords.js000066400000000000000000000057271414263061200201010ustar00rootroot00000000000000const stemmer = 
require('base-stemmer.js'); const fs = require('fs'); const readline = require('readline'); function usage() { console.log("usage: stemwords.js [-l ] -i -o [-c ] [-h]\n"); console.log("The input file consists of a list of words to be stemmed, one per"); console.log("line. Words should be in lower case.\n"); console.log("If -c is given, the argument is the character encoding of the input"); console.log("and output files. If it is omitted, the UTF-8 encoding is used.\n"); console.log("The output file consists of the stemmed words, one per line.\n"); console.log("-h displays this help"); } if (process.argv.length < 5) { usage(); } else { var input = ''; var output = ''; var encoding = 'utf8'; var language = 'English'; var show_help = false; while (process.argv.length > 0) { var arg = process.argv.shift(); switch (arg) { case "-h": show_help = true; process.argv.length = 0; break; case "-l": if (process.argv.length == 0) { show_help = true; break; } language = process.argv.shift(); break; case "-i": if (process.argv.length == 0) { show_help = true; break; } input = process.argv.shift(); break; case "-o": if (process.argv.length == 0) { show_help = true; break; } output = process.argv.shift(); break; case "-c": if (process.argv.length == 0) { show_help = true; break; } encoding = process.argv.shift(); break; } } if (show_help || input == '' || output == '') { usage(); } else { stemming(language, input, output, encoding); } } // function stemming (lang : string, input : string, output : string, encoding : string) { function stemming (lang, input, output, encoding) { const lines = readline.createInterface({ input: fs.createReadStream(input, encoding), terminal: false }); var out = fs.createWriteStream(output, encoding); var stemmer = create(lang); lines.on('line', (original) => { out.write(stemmer.stemWord(original) + '\n'); }); } function create (name) { var lc_name = name.toLowerCase(); if (!lc_name.match('\\W') && lc_name != 'base') { var algo = lc_name.substr(0, 
1).toUpperCase() + lc_name.substr(1); try { const stemmer = require(lc_name + '-stemmer.js'); return Function('return new ' + algo + 'Stemmer()')(); } catch (error) { } } console.log('Unknown stemming language: ' + name + '\n'); usage(); process.exit(1); } snowball-2.2.0/libstemmer/000077500000000000000000000000001414263061200154765ustar00rootroot00000000000000snowball-2.2.0/libstemmer/libstemmer_c.in000066400000000000000000000043121414263061200204730ustar00rootroot00000000000000 #include #include #include "../include/libstemmer.h" #include "../runtime/api.h" #include "@MODULES_H@" struct sb_stemmer { struct SN_env * (*create)(void); void (*close)(struct SN_env *); int (*stem)(struct SN_env *); struct SN_env * env; }; extern const char ** sb_stemmer_list(void) { return algorithm_names; } static stemmer_encoding_t sb_getenc(const char * charenc) { const struct stemmer_encoding * encoding; if (charenc == NULL) return ENC_UTF_8; for (encoding = encodings; encoding->name != 0; encoding++) { if (strcmp(encoding->name, charenc) == 0) break; } if (encoding->name == NULL) return ENC_UNKNOWN; return encoding->enc; } extern struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc) { stemmer_encoding_t enc; const struct stemmer_modules * module; struct sb_stemmer * stemmer; enc = sb_getenc(charenc); if (enc == ENC_UNKNOWN) return NULL; for (module = modules; module->name != 0; module++) { if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break; } if (module->name == NULL) return NULL; stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer)); if (stemmer == NULL) return NULL; stemmer->create = module->create; stemmer->close = module->close; stemmer->stem = module->stem; stemmer->env = stemmer->create(); if (stemmer->env == NULL) { sb_stemmer_delete(stemmer); return NULL; } return stemmer; } void sb_stemmer_delete(struct sb_stemmer * stemmer) { if (stemmer == 0) return; if (stemmer->close) { stemmer->close(stemmer->env); 
stemmer->close = 0; } free(stemmer); } const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size) { int ret; if (SN_set_current(stemmer->env, size, (const symbol *)(word))) { stemmer->env->l = 0; return NULL; } ret = stemmer->stem(stemmer->env); if (ret < 0) return NULL; stemmer->env->p[stemmer->env->l] = 0; return (const sb_symbol *)(stemmer->env->p); } int sb_stemmer_length(struct sb_stemmer * stemmer) { return stemmer->env->l; } snowball-2.2.0/libstemmer/mkalgorithms.pl000077500000000000000000000034721414263061200205450ustar00rootroot00000000000000#!/usr/bin/env perl use strict; use 5.006; use warnings; my $progname = $0; if (scalar @ARGV != 2) { print "Usage: $progname \n"; exit 1; } my $outname = shift(@ARGV); my $descfile = shift(@ARGV); my %aliases = (); my %algorithms = (); my %algorithm_encs = (); my %encs = (); sub addalgenc($$) { my $alg = shift(); my $enc = shift(); if (defined $algorithm_encs{$alg}) { my $hashref = $algorithm_encs{$alg}; $$hashref{$enc}=1; } else { my %newhash = ($enc => 1); $algorithm_encs{$alg}=\%newhash; } $encs{$enc} = 1; } sub readinput() { open DESCFILE, $descfile; my $line; while ($line = ) { next if $line =~ m/^\s*#/; next if $line =~ m/^\s*$/; my ($alg,$encstr,$aliases) = split(/\s+/, $line); my $enc; my $alias; $algorithms{$alg} = 1; foreach $alias (split(/,/, $aliases)) { foreach $enc (split(/,/, $encstr)) { $aliases{$alias} = $alg; addalgenc($alg, $enc); } } } } sub printoutput() { open (OUT, ">$outname") or die "Can't open output file `$outname': $!\n"; print OUT <{$enc}; } print OUT "\n"; } } readinput(); printoutput(); snowball-2.2.0/libstemmer/mkmodules.pl000077500000000000000000000137171414263061200200470ustar00rootroot00000000000000#!/usr/bin/env perl use strict; use 5.006; use warnings; my $progname = $0; if (scalar @ARGV < 4 || scalar @ARGV > 5) { print "Usage: $progname []\n"; exit 1; } my $outname = shift(@ARGV); my $c_src_dir = shift(@ARGV); my $descfile = shift(@ARGV); my 
$srclistfile = shift(@ARGV); my $enc_only; my $extn = ''; if (@ARGV) { $enc_only = shift(@ARGV); $extn = '_'.$enc_only; } my %aliases = (); my %algorithms = (); my %algorithm_encs = (); my %encs = (); sub addalgenc($$) { my $alg = shift(); my $enc = shift(); if (defined $enc_only) { my $norm_enc = lc $enc; $norm_enc =~ s/_//g; if ($norm_enc ne $enc_only) { return; } } if (defined $algorithm_encs{$alg}) { my $hashref = $algorithm_encs{$alg}; $$hashref{$enc}=1; } else { my %newhash = ($enc => 1); $algorithm_encs{$alg}=\%newhash; } $encs{$enc} = 1; } sub readinput() { open DESCFILE, $descfile; my $line; while ($line = ) { next if $line =~ m/^\s*#/; next if $line =~ m/^\s*$/; my ($alg,$encstr,$aliases) = split(/\s+/, $line); my $enc; my $alias; $algorithms{$alg} = 1; foreach $alias (split(/,/, $aliases)) { foreach $enc (split(/,/, $encstr)) { # print "$alias, $enc\n"; $aliases{$alias} = $alg; addalgenc($alg, $enc); } } } } sub printoutput() { open (OUT, ">$outname") or die "Can't open output file `$outname': $!\n"; print OUT < 77) { print OUT ",\n * "; $linelen = 3; } else { print OUT ', '; $linelen += 2; } } print OUT $lang; $linelen += length($lang); $need_sep = 1; } print OUT "\n */\n\n"; foreach $lang (@algorithms) { my $hashref = $algorithm_encs{$lang}; foreach $enc (sort keys (%$hashref)) { print OUT "#include \"../$c_src_dir/stem_${enc}_$lang.h\"\n"; } } print OUT <$srclistfile") or die "Can't open output file `$srclistfile': $!\n"; print OUT < 77) { print OUT ",\n# "; $linelen = 3; } else { print OUT ', '; $linelen += 2; } } print OUT $lang; $linelen += length($lang); $need_sep = 1; } print OUT "\n\nsnowball_sources= \\\n"; for $lang (sort keys %aliases) { my $hashref = $algorithm_encs{$lang}; my $enc; foreach $enc (sort keys (%$hashref)) { print OUT " src_c/stem_${enc}_${lang}.c \\\n"; } } $need_sep = 0; for $srcfile ('runtime/api.c', 'runtime/utilities.c', "libstemmer/libstemmer${extn}.c") { print OUT " \\\n" if $need_sep; print OUT " $srcfile"; $need_sep = 
1; } print OUT "\n\nsnowball_headers= \\\n"; for $lang (sort keys %aliases) { my $hashref = $algorithm_encs{$lang}; my $enc; foreach $enc (sort keys (%$hashref)) { my $p = "${lang}_${enc}"; print OUT " src_c/stem_${enc}_${lang}.h \\\n"; } } $need_sep = 0; for $srcfile ('include/libstemmer.h', "libstemmer/modules${extn}.h", 'runtime/api.h', 'runtime/header.h') { print OUT " \\\n" if $need_sep; print OUT " $srcfile"; $need_sep = 1; } print OUT "\n\n"; close OUT or die "Can't close ${srclistfile}: $!\n"; } readinput(); printoutput(); printsrclist(); snowball-2.2.0/libstemmer/modules.txt000066400000000000000000000063171414263061200177160ustar00rootroot00000000000000# This file contains a list of stemmers to include in the distribution. # The format is a set of space separated lines - on each line: # First item is name of stemmer. # Second item is comma separated list of character sets. # Third item is comma separated list of names to refer to the stemmer by. # # Lines starting with a #, or blank lines, are ignored. # List all the main algorithms for each language, in UTF-8, and also with # the most commonly used encoding. 
arabic UTF_8 arabic,ar,ara armenian UTF_8 armenian,hy,hye,arm basque UTF_8,ISO_8859_1 basque,eu,eus,baq catalan UTF_8,ISO_8859_1 catalan,ca,cat danish UTF_8,ISO_8859_1 danish,da,dan dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld english UTF_8,ISO_8859_1 english,en,eng finnish UTF_8,ISO_8859_1 finnish,fi,fin french UTF_8,ISO_8859_1 french,fr,fre,fra german UTF_8,ISO_8859_1 german,de,ger,deu greek UTF_8 greek,el,gre,ell hindi UTF_8 hindi,hi,hin hungarian UTF_8,ISO_8859_2 hungarian,hu,hun indonesian UTF_8,ISO_8859_1 indonesian,id,ind irish UTF_8,ISO_8859_1 irish,ga,gle italian UTF_8,ISO_8859_1 italian,it,ita lithuanian UTF_8 lithuanian,lt,lit nepali UTF_8 nepali,ne,nep norwegian UTF_8,ISO_8859_1 norwegian,no,nor portuguese UTF_8,ISO_8859_1 portuguese,pt,por romanian UTF_8,ISO_8859_2 romanian,ro,rum,ron russian UTF_8,KOI8_R russian,ru,rus serbian UTF_8 serbian,sr,srp spanish UTF_8,ISO_8859_1 spanish,es,esl,spa swedish UTF_8,ISO_8859_1 swedish,sv,swe tamil UTF_8 tamil,ta,tam turkish UTF_8 turkish,tr,tur yiddish UTF_8 yiddish,yi,yid # Also include the traditional porter algorithm for english. # The porter algorithm is included in the libstemmer distribution to assist # with backwards compatibility, but for new systems the english algorithm # should be used in preference. porter UTF_8,ISO_8859_1 porter english # Some other stemmers in the snowball project are not included in the standard # distribution. To compile a libstemmer with them in, add them to this list, # and regenerate the distribution. (You will need a full source checkout for # this.) They are included in the snowball website as curiosities, but are not # intended for general use, and use of them is is not fully supported. These # algorithms are: # # german2 - This is a slight modification of the german stemmer. #german2 UTF_8,ISO_8859_1 german2 german # # kraaij_pohlmann - This is a different dutch stemmer. 
#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann dutch # # lovins - This is an english stemmer, but fairly outdated, and # only really applicable to a restricted type of input text # (keywords in academic publications). #lovins UTF_8,ISO_8859_1 lovins english snowball-2.2.0/libstemmer/test.c000066400000000000000000000020101414263061200166120ustar00rootroot00000000000000 #include "libstemmer.h" /* test code */ void error(const char * err) { printf("%s\n", err); exit(1); } int main () { const char * stemmed; const char * unstemmed; struct sb_stemmer * s; const char ** list = sb_stemmer_list(); if (*list == 0) error("TEST FAIL: empty list of stemmers"); s = sb_stemmer_new("e"); if (s != 0) error("TEST FAIL: non zero return for unrecognised language"); s = sb_stemmer_new("english"); if (s == 0) error("TEST FAIL: zero return for recognised language"); sb_stemmer_delete(s); s = sb_stemmer_new("en"); if (s == 0) error("TEST FAIL: zero return for recognised language"); unstemmed = "recognised"; stemmed = sb_stemmer_stem(s, unstemmed, 10); printf("%s -> %s\n", unstemmed, stemmed); if (sb_stemmer_length(s) != strlen(stemmed)) error("TEST FAIL: length not correct"); unstemmed = "recognized"; printf("%s -> %s\n", unstemmed, stemmed); sb_stemmer_delete(s); printf("Success\n"); return 0; } snowball-2.2.0/pascal/000077500000000000000000000000001414263061200145765ustar00rootroot00000000000000snowball-2.2.0/pascal/.gitignore000066400000000000000000000000571414263061200165700ustar00rootroot00000000000000/*.ppu /*Stemmer.pas /stemwords.dpr /stemwords snowball-2.2.0/pascal/SnowballProgram.pas000066400000000000000000000235201414263061200204160ustar00rootroot00000000000000unit SnowballProgram; interface Type TAmongHandler = Function : Boolean of Object; Type TAmong = record Str : AnsiString; // search string Index : Integer; // index to longest matching substring Result : Integer; // result of the lookup Method : TAmongHandler; // method to use if substring matches End; Type {$M+} 
TSnowballProgram = Class Protected FCurrent : AnsiString; FCursor : Integer; FLimit : Integer; FBkLimit : Integer; FBra : Integer; FKet : Integer; Procedure SetCurrent(Current: AnsiString); Protected Function InGrouping(s : array of char; min, max : Integer) : Boolean; Function InGroupingBk(s : array of char; min, max : Integer) : Boolean; Function OutGrouping(s : array of char; min, max : Integer) : Boolean; Function OutGroupingBk(s : array of char; min, max : Integer) : Boolean; Function EqS(s_size : Integer; s : AnsiString) : Boolean; Function EqSBk(s_size : Integer; s : AnsiString) : Boolean; Function EqV(s : AnsiString) : Boolean; Function EqVBk(s : AnsiString) : Boolean; Function FindAmong(v : array of TAmong; v_size : Integer) : Integer; Function FindAmongBk(v : array of TAmong; v_size : Integer) : Integer; Procedure SliceDel; Procedure SliceCheck; Procedure SliceFrom(s : AnsiString); Function ReplaceS(bra, ket : Integer; s : AnsiString) : Integer; Procedure Insert(bra, ket : Integer; s : AnsiString); Function SliceTo : AnsiString; Function AssignTo : AnsiString; Public { Set & Retrieve current string } Property Current: AnsiString Read FCurrent Write SetCurrent; { Method subclasses need to implement } Function stem : Boolean; Virtual; Abstract; End; Implementation Uses Math; Procedure TSnowballProgram.SetCurrent(Current: AnsiString); Begin FCurrent := Current; FCursor := 0; FLimit := Length(Current); FBkLimit := 0; FBra := FCursor; FKet := FLimit; End; Function TSnowballProgram.InGrouping(s : array of char; min, max : Integer) : Boolean; Var ch : Integer; Begin Result := False; If (FCursor >= FLimit) Then Exit; ch := Ord(FCurrent[FCursor + 1]); If (ch > max) Or (ch < min) Then Exit; ch := ch - min; If (Ord(s[ch Shr 3]) And Ord(1 Shl (ch And $7))) = 0 Then Exit; Inc(FCursor); Result := True; End; Function TSnowballProgram.InGroupingBk(s : array of char; min, max : Integer) : Boolean; Var ch : Integer; Begin Result := False; If (FCursor <= FBkLimit) Then 
Exit; ch := Ord(FCurrent[FCursor]); If (ch > max) Or (ch < min) Then Exit; ch := ch - min; If (Ord(s[ch Shr 3]) And Ord(1 Shl (ch And $7))) = 0 Then Exit; Dec(FCursor); Result := True; End; Function TSnowballProgram.OutGrouping(s : array of char; min, max : Integer) : Boolean; Var ch : Integer; Begin Result := False; If (FCursor >= FLimit) Then Exit; ch := Ord(FCurrent[FCursor + 1]); If (ch > max) Or (ch < min) Then Begin Inc(FCursor); Result := True; Exit; End; ch := ch - min; If (Ord(s[ch Shr 3]) And Ord(1 Shl (ch And $7))) = 0 Then Begin Inc(FCursor); Result := True; End; End; Function TSnowballProgram.OutGroupingBk(s : array of char; min, max : Integer) : Boolean; Var ch : Integer; Begin Result := False; If (FCursor <= FBkLimit) Then Exit; ch := Ord(FCurrent[FCursor]); If (ch > max) Or (ch < min) Then Begin Dec(FCursor); Result := True; Exit; End; ch := ch - min; If (Ord(s[ch Shr 3]) And Ord(1 Shl (ch And $7))) = 0 Then Begin Dec(FCursor); Result := True; End; End; Function TSnowballProgram.EqS(s_size : Integer; s : AnsiString) : Boolean; Var I : Integer; Begin Result := False; If (FLimit - FCursor) < s_size Then Exit; For I := 1 To s_size Do If FCurrent[FCursor + I] <> s[I] Then Exit; FCursor := FCursor + s_size; Result := True; End; Function TSnowballProgram.EqSBk(s_size : Integer; s : AnsiString) : Boolean; Var I : Integer; Begin Result := False; if (FCursor - FBkLimit) < s_size Then Exit; For I := 1 To s_size Do If FCurrent[FCursor - s_size + I] <> s[i] Then Exit; FCursor := FCursor - s_size; Result := True; End; Function TSnowballProgram.EqV(s : AnsiString) : Boolean; Begin Result := EqS(Length(s), s); End; Function TSnowballProgram.EqVBk(s : AnsiString) : Boolean; Begin Result := EqSBk(Length(s), s); End; Function TSnowballProgram.FindAmong(v : array of TAmong; v_size : Integer) : Integer; Var i, i2, j, c, l, common_i, common_j, k, diff, common : Integer; first_key_inspected, res : Boolean; w : TAmong; Begin i := 0; j := v_size; c := FCursor; l := FLimit; 
common_i := 0; common_j := 0; first_key_inspected := false; While True Do Begin k := i + ((j - i) Shr 1); diff := 0; common := Min(common_i, common_j); // smaller w := v[k]; For i2 := common To Length(w.Str) - 1 Do Begin if (c + common) = l Then Begin diff := -1; Break; End; diff := Ord(FCurrent[c + common + 1]) - Ord(w.Str[i2 + 1]); if diff <> 0 Then Break; Inc(common); End; if diff < 0 Then Begin j := k; common_j := common; End Else Begin i := k; common_i := common; End; If (j - i) <= 1 Then Begin If (i > 0) Then Break; // v->s has been inspected if (j = i) Then Break; // only one item in v // - but now we need to go round once more to get // v->s inspected. This looks messy, but is actually // the optimal approach. if (first_key_inspected) Then Break; first_key_inspected := True; End; End; While True Do Begin w := v[i]; If (common_i >= Length(w.Str)) Then Begin FCursor := c + Length(w.Str); If Not Assigned(w.Method) Then Begin Result := w.Result; Exit; End; res := w.Method; FCursor := c + Length(w.Str); if (res) Then Begin Result := w.Result; Exit; End; End; i := w.Index; if i < 0 Then Begin Result := 0; Exit; End; End; End; Function TSnowballProgram.FindAmongBk(v : array of TAmong; v_size : Integer) : Integer; Var i, j, c, lb, common_i, common_j, k, diff, common, i2 : Integer; first_key_inspected, res : Boolean; w : TAmong; Begin i := 0; j := v_size; c := FCursor; lb := FBkLimit; common_i := 0; common_j := 0; first_key_inspected := false; While True Do Begin k := i + ((j - i) Shr 1); diff := 0; common := Min(common_i, common_j); w := v[k]; For i2 := Length(w.Str) - 1 - common DownTo 0 Do Begin If (c - common) = lb Then Begin diff := -1; Break; End; diff := Ord(FCurrent[c - common]) - Ord(w.Str[i2 + 1]); if diff <> 0 Then Break; Inc(common); End; If diff < 0 Then Begin j := k; common_j := common; End Else Begin i := k; common_i := common; End; If (j - i) <= 1 Then Begin if i > 0 Then Break; if j = i Then Break; if first_key_inspected Then Break; 
first_key_inspected := True; End; End; While True Do Begin w := v[i]; if common_i >= Length(w.Str) Then Begin FCursor := c - Length(w.Str); If Not Assigned(w.Method) Then Begin Result := w.Result; Exit; End; res := w.Method; FCursor := c - Length(w.Str); If Res Then Begin Result := w.Result; Exit; End; End; i := w.Index; If i < 0 Then Begin Result := 0; Exit; End; End; End; Procedure TSnowballProgram.SliceCheck; Begin if (FBra < 0) Or (FBra > FKet) Or (FKet > FLimit) Or (FLimit > Length(FCurrent)) Then Begin WriteLn('Faulty slice operation.'); Halt; End; End; Procedure TSnowballProgram.SliceDel; Begin SliceFrom(''); End; Function TSnowballProgram.ReplaceS(bra, ket : Integer; s : AnsiString) : Integer; Var adjustment : Integer; Begin adjustment := Length(s) - (ket - bra); Delete(FCurrent, bra + 1, ket - bra); System.Insert(s, FCurrent, bra + 1); FLimit := FLimit + adjustment; if (FCursor >= ket) Then FCursor := FCursor + adjustment Else If (FCursor > bra) Then FCursor := bra; Result := adjustment; End; Procedure TSnowballProgram.Insert(bra, ket : Integer; s : AnsiString); Var adjustment : Integer; Begin adjustment := ReplaceS(bra, ket, s); If (bra <= FBra) Then FBra := FBra + adjustment; If (bra <= FKet) Then FKet := FKet + adjustment; End; Function TSnowballProgram.SliceTo() : AnsiString; Begin SliceCheck(); Result := Copy(FCurrent, FBra + 1, FKet - FBra); End; Procedure TSnowballProgram.SliceFrom(s : AnsiString); Begin SliceCheck(); ReplaceS(FBra, FKet, s); End; Function TSnowballProgram.AssignTo() : AnsiString; Begin Result := Copy(FCurrent, 1, FLimit); End; End. snowball-2.2.0/pascal/generate.pl000077500000000000000000000006771414263061200167420ustar00rootroot00000000000000#!/usr/bin/env perl use strict; use warnings; # Generate Pascal stemwords source. 
my @sources = @ARGV; while (defined(my $line = )) { if ($line =~ /\{\s*BEGIN TEMPLATE\s*\}/) { my $template = ''; while (defined($line = ) && $line !~ /\{\s*END TEMPLATE\s*\}/) { $template .= $line; } foreach my $source(@sources) { my $out = $template; $out =~ s/%STEMMER%/$source/g; print $out; } next; } print $line; } snowball-2.2.0/pascal/stemwords-template.dpr000066400000000000000000000026751414263061200211570ustar00rootroot00000000000000program stemwords; {$ifdef windows} {$APPTYPE CONSOLE} {$endif} uses SnowballProgram, { BEGIN TEMPLATE } %STEMMER%Stemmer in '%STEMMER%Stemmer.pas', { END TEMPLATE } SysUtils; Var Stemmer : TSnowballProgram; CurWord : AnsiString; i : Integer; language : AnsiString; Const Delimiters : Set Of Char = [#10, #13]; Function NextWord : Boolean; Var C : Char; Begin CurWord := ''; Result := Not Eof; While Not Eof Do Begin Read(C); If IOResult <> 0 Then Break; If C In Delimiters Then Break; CurWord := CurWord + C; End; End; begin language := 'english'; i := 0; while i < ParamCount do begin i := i + 1; if ParamStr(i) = '-l' then begin i := i + 1; language := ParamStr(i); continue; end; WriteLn('option '+ParamStr(i)+' unknown'); Exit; end; if False then { BEGIN TEMPLATE } else if language = '%STEMMER%' then Stemmer := T%STEMMER%Stemmer.Create { END TEMPLATE } else begin WriteLn('Stemming language '+language+' unknown'); Exit; end; Try While Not Eof Do Begin While NextWord Do Begin Stemmer.Current := CurWord; Stemmer.Stem; WriteLn(Stemmer.Current); End; End; Finally Stemmer.Free; End; end. snowball-2.2.0/python/000077500000000000000000000000001414263061200146545ustar00rootroot00000000000000snowball-2.2.0/python/MANIFEST.in000066400000000000000000000001761414263061200164160ustar00rootroot00000000000000include *.rst include modules.txt include setup.* recursive-include src *.py include MANIFEST.in include COPYING include NEWS snowball-2.2.0/python/create_init.py000066400000000000000000000024211414263061200175130ustar00rootroot00000000000000#! 
/bin/sh/env python import sys import re import os python_out_folder = sys.argv[1] filematch = re.compile(r"(\w+)_stemmer\.py$") imports = [] languages = [] for pyscript in os.listdir(python_out_folder): match = filematch.match(pyscript) if (match): langname = match.group(1) titlecase = langname.title() languages.append(" '%(lang)s': %(title)sStemmer," % {'lang': langname, 'title': titlecase}) imports.append('from .%(lang)s_stemmer import %(title)sStemmer' % {'lang': langname, 'title': titlecase}) imports.sort() languages.sort() src = '''__all__ = ('language', 'stemmer') %(imports)s _languages = { %(languages)s } try: import Stemmer cext_available = True except ImportError: cext_available = False def algorithms(): if cext_available: return Stemmer.language() else: return list(_languages.keys()) def stemmer(lang): if cext_available: return Stemmer.Stemmer(lang) if lang.lower() in _languages: return _languages[lang.lower()]() else: raise KeyError("Stemming algorithm '%%s' not found" %% lang) ''' % {'imports': '\n'.join(imports), 'languages': '\n'.join(languages)} with open(os.path.join(python_out_folder, '__init__.py'), 'w') as out: out.write(src) snowball-2.2.0/python/setup.cfg000066400000000000000000000001651414263061200164770ustar00rootroot00000000000000[metadata] long_description = file: README.rst long_description_content_type = text/x-rst [bdist_wheel] universal=1 snowball-2.2.0/python/setup.py000066400000000000000000000047321414263061200163740ustar00rootroot00000000000000#!/usr/bin/env python from setuptools import setup import re SNOWBALL_VERSION = '2.2.0' n_stemmers = 0 langs = [] variants = {} with open('modules.txt') as fp: for line in fp.readlines(): if len(line) <= 1 or line[0] == '#': continue if line[-1:] == '\n': line = line[:-1] tokens = re.split(r'\s+', line) if len(tokens) < 3: print("Bad modules.txt line: " + line) continue (name, encs, codes) = tokens[:3] if len(tokens) > 3: variant_of = tokens[3] if variant_of in variants: 
variants[variant_of].append(name) else: variants[variant_of] = [name] else: langs.append(name) n_stemmers += 1 desc = 'This package provides ' + str(n_stemmers) + ' stemmers for ' + \ str(len(langs)) + ' languages generated from Snowball algorithms.' classifiers = [ 'Development Status :: 5 - Production/Stable', 'Intended Audience :: Developers', 'License :: OSI Approved :: BSD License' ] for lang in langs: lang_titlecase = lang.title() # Only classifiers listed in https://pypi.org/classifiers/ are allowed if lang_titlecase not in ('Armenian', 'Yiddish'): classifiers.append('Natural Language :: ' + lang_titlecase) classifiers.extend([ 'Operating System :: OS Independent', 'Programming Language :: Python', 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: Implementation :: CPython', 'Programming Language :: Python :: Implementation :: PyPy', 'Topic :: Database', 'Topic :: Internet :: WWW/HTTP :: Indexing/Search', 'Topic :: Text Processing :: Indexing', 'Topic :: Text Processing :: Linguistic' ]) setup(name='snowballstemmer', version=SNOWBALL_VERSION, description=desc, author='Snowball Developers', author_email='snowball-discuss@lists.tartarus.org', url='https://github.com/snowballstem/snowball', keywords="stemmer", license="BSD-3-Clause", packages=['snowballstemmer'], package_dir={"snowballstemmer": "src/snowballstemmer"}, classifiers = classifiers ) snowball-2.2.0/python/snowballstemmer/000077500000000000000000000000001414263061200200725ustar00rootroot00000000000000snowball-2.2.0/python/snowballstemmer/among.py000066400000000000000000000006241414263061200215470ustar00rootroot00000000000000 class Among(object): def __init__(self, s, substring_i, result, 
method=None): """ @ivar s search string @ivar substring index to longest matching substring @ivar result of the lookup @ivar method method to use if substring matches """ self.s = s self.substring_i = substring_i self.result = result self.method = method snowball-2.2.0/python/snowballstemmer/basestemmer.py000066400000000000000000000223121414263061200227530ustar00rootroot00000000000000class BaseStemmer(object): def __init__(self): self.set_current("") def set_current(self, value): ''' Set the self.current string. ''' self.current = value self.cursor = 0 self.limit = len(self.current) self.limit_backward = 0 self.bra = self.cursor self.ket = self.limit def get_current(self): ''' Get the self.current string. ''' return self.current def copy_from(self, other): self.current = other.current self.cursor = other.cursor self.limit = other.limit self.limit_backward = other.limit_backward self.bra = other.bra self.ket = other.ket def in_grouping(self, s, min, max): if self.cursor >= self.limit: return False ch = ord(self.current[self.cursor]) if ch > max or ch < min: return False ch -= min if (s[ch >> 3] & (0x1 << (ch & 0x7))) == 0: return False self.cursor += 1 return True def go_in_grouping(self, s, min, max): while self.cursor < self.limit: ch = ord(self.current[self.cursor]) if ch > max or ch < min: return True ch -= min if (s[ch >> 3] & (0x1 << (ch & 0x7))) == 0: return True self.cursor += 1 return False def in_grouping_b(self, s, min, max): if self.cursor <= self.limit_backward: return False ch = ord(self.current[self.cursor - 1]) if ch > max or ch < min: return False ch -= min if (s[ch >> 3] & (0x1 << (ch & 0x7))) == 0: return False self.cursor -= 1 return True def go_in_grouping_b(self, s, min, max): while self.cursor > self.limit_backward: ch = ord(self.current[self.cursor - 1]) if ch > max or ch < min: return True ch -= min if (s[ch >> 3] & (0x1 << (ch & 0x7))) == 0: return True self.cursor -= 1 return False def out_grouping(self, s, min, max): if self.cursor >= 
self.limit: return False ch = ord(self.current[self.cursor]) if ch > max or ch < min: self.cursor += 1 return True ch -= min if (s[ch >> 3] & (0X1 << (ch & 0x7))) == 0: self.cursor += 1 return True return False def go_out_grouping(self, s, min, max): while self.cursor < self.limit: ch = ord(self.current[self.cursor]) if ch <= max and ch >= min: ch -= min if (s[ch >> 3] & (0X1 << (ch & 0x7))): return True self.cursor += 1 return False def out_grouping_b(self, s, min, max): if self.cursor <= self.limit_backward: return False ch = ord(self.current[self.cursor - 1]) if ch > max or ch < min: self.cursor -= 1 return True ch -= min if (s[ch >> 3] & (0X1 << (ch & 0x7))) == 0: self.cursor -= 1 return True return False def go_out_grouping_b(self, s, min, max): while self.cursor > self.limit_backward: ch = ord(self.current[self.cursor - 1]) if ch <= max and ch >= min: ch -= min if (s[ch >> 3] & (0X1 << (ch & 0x7))): return True self.cursor -= 1 return False def eq_s(self, s): if self.limit - self.cursor < len(s): return False if self.current[self.cursor:self.cursor + len(s)] != s: return False self.cursor += len(s) return True def eq_s_b(self, s): if self.cursor - self.limit_backward < len(s): return False if self.current[self.cursor - len(s):self.cursor] != s: return False self.cursor -= len(s) return True def find_among(self, v): i = 0 j = len(v) c = self.cursor l = self.limit common_i = 0 common_j = 0 first_key_inspected = False while True: k = i + ((j - i) >> 1) diff = 0 common = min(common_i, common_j) # smaller w = v[k] for i2 in range(common, len(w.s)): if c + common == l: diff = -1 break diff = ord(self.current[c + common]) - ord(w.s[i2]) if diff != 0: break common += 1 if diff < 0: j = k common_j = common else: i = k common_i = common if j - i <= 1: if i > 0: break # v->s has been inspected if j == i: break # only one item in v # - but now we need to go round once more to get # v->s inspected. This looks messy, but is actually # the optimal approach. 
if first_key_inspected: break first_key_inspected = True while True: w = v[i] if common_i >= len(w.s): self.cursor = c + len(w.s) if w.method is None: return w.result method = getattr(self, w.method) res = method() self.cursor = c + len(w.s) if res: return w.result i = w.substring_i if i < 0: return 0 return -1 # not reachable def find_among_b(self, v): ''' find_among_b is for backwards processing. Same comments apply ''' i = 0 j = len(v) c = self.cursor lb = self.limit_backward common_i = 0 common_j = 0 first_key_inspected = False while True: k = i + ((j - i) >> 1) diff = 0 common = min(common_i, common_j) w = v[k] for i2 in range(len(w.s) - 1 - common, -1, -1): if c - common == lb: diff = -1 break diff = ord(self.current[c - 1 - common]) - ord(w.s[i2]) if diff != 0: break common += 1 if diff < 0: j = k common_j = common else: i = k common_i = common if j - i <= 1: if i > 0: break if j == i: break if first_key_inspected: break first_key_inspected = True while True: w = v[i] if common_i >= len(w.s): self.cursor = c - len(w.s) if w.method is None: return w.result method = getattr(self, w.method) res = method() self.cursor = c - len(w.s) if res: return w.result i = w.substring_i if i < 0: return 0 return -1 # not reachable def replace_s(self, c_bra, c_ket, s): ''' to replace chars between c_bra and c_ket in self.current by the chars in s. 
@type c_bra int @type c_ket int @type s: string ''' adjustment = len(s) - (c_ket - c_bra) self.current = self.current[0:c_bra] + s + self.current[c_ket:] self.limit += adjustment if self.cursor >= c_ket: self.cursor += adjustment elif self.cursor > c_bra: self.cursor = c_bra return adjustment def slice_check(self): if self.bra < 0 or self.bra > self.ket or self.ket > self.limit or self.limit > len(self.current): return False return True def slice_from(self, s): ''' @type s string ''' result = False if self.slice_check(): self.replace_s(self.bra, self.ket, s) result = True return result def slice_del(self): return self.slice_from("") def insert(self, c_bra, c_ket, s): ''' @type c_bra int @type c_ket int @type s: string ''' adjustment = self.replace_s(c_bra, c_ket, s) if c_bra <= self.bra: self.bra += adjustment if c_bra <= self.ket: self.ket += adjustment def slice_to(self): ''' Return the slice as a string. ''' result = '' if self.slice_check(): result = self.current[self.bra:self.ket] return result def assign_to(self): ''' Return the current string up to the limit. ''' return self.current[0:self.limit] def stemWord(self, word): self.set_current(word) self._stem() return self.get_current() def stemWords(self, words): return [self.stemWord(word) for word in words] snowball-2.2.0/python/stemwords.py000066400000000000000000000065551414263061200172700ustar00rootroot00000000000000import sys import codecs import snowballstemmer def usage(): print('''usage: %s [-l ] [-i ] [-o ] [-c ] [-p[2]] [-h] The input file consists of a list of words to be stemmed, one per line. Words should be in lower case, but (for English) A-Z letters are mapped to their a-z equivalents anyway. If omitted, stdin is used. If -c is given, the argument is the character encoding of the input and output files. If it is omitted, the UTF-8 encoding is used. If -p is given the output file consists of each word of the input file followed by \"->\" followed by its stemmed equivalent. 
If -p2 is given the output file is a two column layout containing the input words in the first column and the stemmed eqivalents in the second column. Otherwise, the output file consists of the stemmed words, one per line. -h displays this help''' % sys.argv[0]) def main(): argv = sys.argv[1:] if len(argv) < 5: usage() else: pretty = 0 input = '' output = '' encoding = 'utf_8' language = 'English' show_help = False while len(argv): arg = argv[0] argv = argv[1:] if arg == '-h': show_help = True break elif arg == "-p": pretty = 1 elif arg == "-p2": pretty = 2 elif arg == "-l": if len(argv) == 0: show_help = True break language = argv[0] argv = argv[1:] elif arg == "-i": if len(argv) == 0: show_help = True break input = argv[0] argv = argv[1:] elif arg == "-o": if len(argv) == 0: show_help = True break output = argv[0] argv = argv[1:] elif arg == "-c": if len(argv) == 0: show_help = True break encoding = argv[0] if show_help or input == '' or output == '': usage() else: stemming(language, input, output, encoding, pretty) def stemming(lang, input, output, encoding, pretty): stemmer = snowballstemmer.stemmer(lang) with codecs.open(output, "w", encoding) as outfile: with codecs.open(input, "r", encoding) as infile: for original in infile.readlines(): original = original.strip() # Convert only ASCII-letters to lowercase, to match C behavior original = ''.join((c.lower() if 'A' <= c <= 'Z' else c for c in original)) stemmed = stemmer.stemWord(original) if pretty == 0: if stemmed != "": outfile.write(stemmed) elif pretty == 1: outfile.write(original, " -> ", stemmed) elif pretty == 2: outfile.write(original) if len(original) < 30: outfile.write(" " * (30 - len(original))) else: outfile.write("\n") outfile.write(" " * 30) outfile.write(stemmed) outfile.write('\n') main() snowball-2.2.0/python/testapp.py000066400000000000000000000012011414263061200167000ustar00rootroot00000000000000import sys import re import snowballstemmer def usage(): print("testapp.py \"sentence\"...") 
def main(): argv = sys.argv if len(argv) < 1: usage() return algorithm = 'english' if len(argv) > 2: algorithm = argv[1] argv = argv[2:] else: argv = argv[1:] stemmer = snowballstemmer.stemmer(algorithm) splitter = re.compile(r"[\s\.-]") for arg in argv: for word in splitter.split(arg): if word == '': continue original = word.lower() print(original + " -> " + stemmer.stemWord(original)) main() snowball-2.2.0/runtime/000077500000000000000000000000001414263061200150165ustar00rootroot00000000000000snowball-2.2.0/runtime/api.c000066400000000000000000000023001414263061200157260ustar00rootroot00000000000000 #include /* for calloc, free */ #include "header.h" extern struct SN_env * SN_create_env(int S_size, int I_size) { struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env)); if (z == NULL) return NULL; z->p = create_s(); if (z->p == NULL) goto error; if (S_size) { int i; z->S = (symbol * *) calloc(S_size, sizeof(symbol *)); if (z->S == NULL) goto error; for (i = 0; i < S_size; i++) { z->S[i] = create_s(); if (z->S[i] == NULL) goto error; } } if (I_size) { z->I = (int *) calloc(I_size, sizeof(int)); if (z->I == NULL) goto error; } return z; error: SN_close_env(z, S_size); return NULL; } extern void SN_close_env(struct SN_env * z, int S_size) { if (z == NULL) return; if (S_size) { int i; for (i = 0; i < S_size; i++) { lose_s(z->S[i]); } free(z->S); } free(z->I); if (z->p) lose_s(z->p); free(z); } extern int SN_set_current(struct SN_env * z, int size, const symbol * s) { int err = replace_s(z, 0, z->l, size, s, NULL); z->c = 0; return err; } snowball-2.2.0/runtime/api.h000066400000000000000000000014341414263061200157420ustar00rootroot00000000000000 typedef unsigned char symbol; /* Or replace 'char' above with 'short' for 16 bit characters. More precisely, replace 'char' with whatever type guarantees the character width you need. 
Note however that sizeof(symbol) should divide HEAD, defined in header.h as 2*sizeof(int), without remainder, otherwise there is an alignment problem. In the unlikely event of a problem here, consult Martin Porter. */ struct SN_env { symbol * p; int c; int l; int lb; int bra; int ket; symbol * * S; int * I; }; #ifdef __cplusplus extern "C" { #endif extern struct SN_env * SN_create_env(int S_size, int I_size); extern void SN_close_env(struct SN_env * z, int S_size); extern int SN_set_current(struct SN_env * z, int size, const symbol * s); #ifdef __cplusplus } #endif snowball-2.2.0/runtime/header.h000066400000000000000000000050361414263061200164230ustar00rootroot00000000000000 #include #include "api.h" #define MAXINT INT_MAX #define MININT INT_MIN #define HEAD 2*sizeof(int) #define SIZE(p) ((int *)(p))[-1] #define SET_SIZE(p, n) ((int *)(p))[-1] = n #define CAPACITY(p) ((int *)(p))[-2] struct among { int s_size; /* number of chars in string */ const symbol * s; /* search string */ int substring_i;/* index to longest matching substring */ int result; /* result of the lookup */ int (* function)(struct SN_env *); }; extern symbol * create_s(void); extern void lose_s(symbol * p); extern int skip_utf8(const symbol * p, int c, int limit, int n); extern int skip_b_utf8(const symbol * p, int c, int limit, int n); extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int out_grouping(struct SN_env * z, const unsigned char * s, 
int min, int max, int repeat); extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int eq_s(struct SN_env * z, int s_size, const symbol * s); extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s); extern int eq_v(struct SN_env * z, const symbol * p); extern int eq_v_b(struct SN_env * z, const symbol * p); extern int find_among(struct SN_env * z, const struct among * v, int v_size); extern int find_among_b(struct SN_env * z, const struct among * v, int v_size); extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjustment); extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s); extern int slice_from_v(struct SN_env * z, const symbol * p); extern int slice_del(struct SN_env * z); extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s); extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p); extern symbol * slice_to(struct SN_env * z, symbol * p); extern symbol * assign_to(struct SN_env * z, symbol * p); extern int len_utf8(const symbol * p); extern void debug(struct SN_env * z, int number, int line_count); snowball-2.2.0/runtime/utilities.c000066400000000000000000000330571414263061200172050ustar00rootroot00000000000000 #include #include #include #include "header.h" #define CREATE_SIZE 1 extern symbol * create_s(void) { symbol * p; void * mem = malloc(HEAD + (CREATE_SIZE + 1) * sizeof(symbol)); if (mem == NULL) return NULL; p = (symbol *) (HEAD + (char *) mem); CAPACITY(p) = CREATE_SIZE; SET_SIZE(p, 0); return p; } extern void lose_s(symbol * p) { if (p == NULL) return; free((char *) p - HEAD); } /* new_p = skip_utf8(p, c, l, n); skips n characters forwards from p + c. new_p is the new position, or -1 on failure. -- used to implement hop and next in the utf8 case. 
*/
extern int skip_utf8(const symbol * p, int c, int limit, int n) {
    int b;
    if (n < 0) return -1;  /* a negative count is reported as failure */
    for (; n > 0; n--) {
        if (c >= limit) return -1;  /* fewer than n characters before limit */
        b = p[c++];
        if (b >= 0xC0) {   /* 1100 0000 */
            /* The byte just read opens a multi-byte sequence: also step
             * over the continuation bytes that follow, never passing limit. */
            while (c < limit) {
                b = p[c];
                if (b >= 0xC0 || b < 0x80) break;
                /* break unless b is 10------ */
                c++;
            }
        }
    }
    return c;
}

/*
   new_p = skip_b_utf8(p, c, lb, n); skips n characters backwards from p + c - 1
   new_p is the new position, or -1 on failure.

   -- used to implement hop and next in the utf8 case.
*/

extern int skip_b_utf8(const symbol * p, int c, int limit, int n) {
    int b;
    if (n < 0) return -1;  /* a negative count is reported as failure */
    for (; n > 0; n--) {
        if (c <= limit) return -1;  /* fewer than n characters after limit */
        b = p[--c];
        if (b >= 0x80) {   /* 1000 0000 */
            /* Not a one-byte character: keep moving back until c rests on
             * a byte >= 0xC0 or reaches limit. */
            while (c > limit) {
                b = p[c];
                if (b >= 0xC0) break; /* 1100 0000 */
                c--;
            }
        }
    }
    return c;
}

/* Code for character groupings: utf8 cases */

/* Decode the character that starts at p[c], reading forwards and never at
 * or beyond index l.
 *
 * Returns 0 (leaving *slot untouched) when c >= l.  Otherwise stores the
 * decoded value in *slot and returns the number of bytes it occupies (1-4).
 * The width is chosen from the first byte; if l is reached before that many
 * bytes are available, the bytes seen so far are decoded as a shorter
 * sequence instead.
 */
static int get_utf8(const symbol * p, int c, int l, int * slot) {
    int b0, b1, b2;
    if (c >= l) return 0;
    b0 = p[c++];
    if (b0 < 0xC0 || c == l) {   /* 1100 0000 */
        *slot = b0;
        return 1;
    }
    b1 = p[c++] & 0x3F;
    if (b0 < 0xE0 || c == l) {   /* 1110 0000 */
        *slot = (b0 & 0x1F) << 6 | b1;
        return 2;
    }
    b2 = p[c++] & 0x3F;
    if (b0 < 0xF0 || c == l) {   /* 1111 0000 */
        *slot = (b0 & 0xF) << 12 | b1 << 6 | b2;
        return 3;
    }
    /* c < l is guaranteed here by the test above, so p[c] is in range. */
    *slot = (b0 & 0x7) << 18 | b1 << 12 | b2 << 6 | (p[c] & 0x3F);
    return 4;
}

/* Decode the character that ends at p[c - 1], reading backwards and never
 * below index lb.
 *
 * Returns 0 (leaving *slot untouched) when c <= lb.  Otherwise stores the
 * decoded value in *slot and returns the number of bytes it occupies (1-4).
 * Scanning stops at the first byte that looks like a lead byte, or at lb,
 * whichever comes first.
 */
static int get_b_utf8(const symbol * p, int c, int lb, int * slot) {
    int a, b;
    if (c <= lb) return 0;
    b = p[--c];
    if (b < 0x80 || c == lb) {   /* 1000 0000 */
        *slot = b;
        return 1;
    }
    a = b & 0x3F;  /* low six bits, taken from the last byte */
    b = p[--c];
    if (b >= 0xC0 || c == lb) {   /* 1100 0000 */
        *slot = (b & 0x1F) << 6 | a;
        return 2;
    }
    a |= (b & 0x3F) << 6;
    b = p[--c];
    if (b >= 0xE0 || c == lb) {   /* 1110 0000 */
        *slot = (b & 0xF) << 12 | a;
        return 3;
    }
    *slot = (p[--c] & 0x7) << 18 | (b & 0x3F) << 12 | a;
    return 4;
}

/* Grouping tests, UTF-8 forwards.
 *
 * s is a bit map: a character ch with min <= ch <= max belongs to the
 * grouping when bit (ch - min) is set, i.e. s[(ch - min) >> 3] has bit
 * ((ch - min) & 7) set.  Characters outside min..max never belong.
 *
 * in_grouping_U returns
 *   -1  when there is no character between z->c and z->l;
 *   w>0 when the character at the cursor is NOT in the grouping (w is its
 *       width in bytes; the cursor is left in front of it);
 *   0   after moving the cursor past one character that is in the grouping
 *       (only possible when repeat is zero).
 * With repeat non-zero the cursor keeps advancing over characters in the
 * grouping, so the call ends with -1 or a positive width.
 */
extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
    do {
        int ch;
        int w = get_utf8(z->p, z->c, z->l, & ch);
        if (!w) return -1;
        if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
            return w;
        z->c += w;
    } while (repeat);
    return 0;
}

/* Backward counterpart of in_grouping_U: tests the character that ends at
 * the cursor, is bounded by z->lb, and moves the cursor backwards. */
extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
    do {
        int ch;
        int w = get_b_utf8(z->p, z->c, z->lb, & ch);
        if (!w) return -1;
        if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
            return w;
        z->c -= w;
    } while (repeat);
    return 0;
}

/* Same as in_grouping_U with the membership test inverted: the cursor
 * advances over characters that are NOT in the grouping, and a positive
 * width is returned for the first character that IS in it. */
extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
    do {
        int ch;
        int w = get_utf8(z->p, z->c, z->l, & ch);
        if (!w) return -1;
        if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
            return w;
        z->c += w;
    } while (repeat);
    return 0;
}

/* Backward counterpart of out_grouping_U (bounded by z->lb). */
extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
    do {
        int ch;
        int w = get_b_utf8(z->p, z->c, z->lb, & ch);
        if (!w) return -1;
        if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
            return w;
        z->c -= w;
    } while (repeat);
    return 0;
}

/* Code for character groupings: non-utf8 cases */

/* One-symbol-per-character variants of the four functions above.  The bit
 * map and the -1 / 0 conventions are the same; the positive return value is
 * always 1 because every character is a single symbol. */
extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
    do {
        int ch;
        if (z->c >= z->l) return -1;
        ch = z->p[z->c];
        if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
            return 1;
        z->c++;
    } while (repeat);
    return 0;
}

/* Backward in_grouping: examines z->p[z->c - 1], bounded by z->lb. */
extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
    do {
        int ch;
        if (z->c <= z->lb) return -1;
        ch = z->p[z->c - 1];
        if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
            return 1;
        z->c--;
    } while (repeat);
    return 0;
}

/* Forward out_grouping: advances over symbols that are not in the grouping. */
extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
    do {
        int ch;
        if (z->c >= z->l) return -1;
        ch = z->p[z->c];
        if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
            return 1;
        z->c++;
    } while (repeat);
    return 0;
}

/* Backward out_grouping: examines z->p[z->c - 1], bounded by z->lb. */
extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
    do {
        int ch;
        if (z->c <= z->lb) return -1;
        ch = z->p[z->c - 1];
        if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
            return 1;
        z->c--;
    } while (repeat);
    return 0;
}

/* If the s_size symbols at s occur at the cursor (and fit before z->l),
 * move the cursor past them and return 1; otherwise return 0 and leave the
 * cursor alone. */
extern int eq_s(struct SN_env * z, int s_size, const symbol * s) {
    if (z->l - z->c < s_size || memcmp(z->p + z->c, s, s_size * sizeof(symbol)) != 0) return 0;
    z->c += s_size;
    return 1;
}

/* Backward eq_s: the s_size symbols must end at the cursor and start at or
 * after z->lb; on a match the cursor moves back to their start. */
extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s) {
    if (z->c - z->lb < s_size || memcmp(z->p + z->c - s_size, s, s_size * sizeof(symbol)) != 0) return 0;
    z->c -= s_size;
    return 1;
}

/* eq_s for a string p whose length is obtained with SIZE(p). */
extern int eq_v(struct SN_env * z, const symbol * p) {
    return eq_s(z, SIZE(p), p);
}

/* eq_s_b for a string p whose length is obtained with SIZE(p). */
extern int eq_v_b(struct SN_env * z, const symbol * p) {
    return eq_s_b(z, SIZE(p), p);
}

/* Search the v_size entries of v for a string that matches the text at the
 * cursor (forwards, bounded by z->l).
 *
 * The first loop is a binary search, so it presumably relies on v being
 * ordered by its s strings -- TODO confirm against the code that emits the
 * tables.  The second loop then walks the substring_i links from the entry
 * the search settled on, looking for an entry whose whole string was
 * matched.
 *
 * Returns the result field of the accepted entry with the cursor placed
 * just after the matched string, or 0 when the chain ends (substring_i < 0)
 * without an accepted entry.
 */
extern int find_among(struct SN_env * z, const struct among * v, int v_size) {

    int i = 0;
    int j = v_size;

    int c = z->c; int l = z->l;
    const symbol * q = z->p + c;

    const struct among * w;

    /* Number of leading symbols already known to agree with entry i and
     * with entry j respectively. */
    int common_i = 0;
    int common_j = 0;

    int first_key_inspected = 0;

    while (1) {
        int k = i + ((j - i) >> 1);
        int diff = 0;
        int common = common_i < common_j ? common_i : common_j; /* smaller */
        w = v + k;
        {
            /* Resume the comparison after the symbols already known to
             * match; running into the limit counts as "input is smaller". */
            int i2; for (i2 = common; i2 < w->s_size; i2++) {
                if (c + common == l) { diff = -1; break; }
                diff = q[common] - w->s[i2];
                if (diff != 0) break;
                common++;
            }
        }
        if (diff < 0) {
            j = k;
            common_j = common;
        } else {
            i = k;
            common_i = common;
        }
        if (j - i <= 1) {
            if (i > 0) break; /* v->s has been inspected */
            if (j == i) break; /* only one item in v */

            /* - but now we need to go round once more to get
               v->s inspected. This looks messy, but is actually
               the optimal approach.  */

            if (first_key_inspected) break;
            first_key_inspected = 1;
        }
    }
    while (1) {
        w = v + i;
        if (common_i >= w->s_size) {
            /* Every symbol of this entry matched. */
            z->c = c + w->s_size;
            if (w->function == 0) return w->result;
            {
                int res = w->function(z);
                /* Set the cursor again after the call, whatever the
                 * function did with it. */
                z->c = c + w->s_size;
                if (res) return w->result;
            }
        }
        i = w->substring_i;
        if (i < 0) return 0;
    }
}

/* find_among_b is for backwards processing.
Same comments apply */

/* Backward search: match the text that ends at the cursor (bounded by
 * z->lb) against the v_size entries of v, comparing each entry from its
 * last symbol towards its first.
 *
 * Returns the result field of the accepted entry with the cursor moved back
 * to the start of the matched string, or 0 when the substring_i chain ends
 * (substring_i < 0) without an accepted entry.
 */
extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) {

    int i = 0;
    int j = v_size;

    int c = z->c; int lb = z->lb;
    const symbol * q = z->p + c - 1;  /* last symbol before the cursor */

    const struct among * w;

    /* Number of trailing symbols already known to agree with entry i and
     * with entry j respectively. */
    int common_i = 0;
    int common_j = 0;

    int first_key_inspected = 0;

    while (1) {
        int k = i + ((j - i) >> 1);
        int diff = 0;
        int common = common_i < common_j ? common_i : common_j;
        w = v + k;
        {
            /* Skip the symbols already known to match; reaching lb counts
             * as "input is smaller". */
            int i2; for (i2 = w->s_size - 1 - common; i2 >= 0; i2--) {
                if (c - common == lb) { diff = -1; break; }
                diff = q[- common] - w->s[i2];
                if (diff != 0) break;
                common++;
            }
        }
        if (diff < 0) { j = k; common_j = common; }
                 else { i = k; common_i = common; }
        if (j - i <= 1) {
            if (i > 0) break;
            if (j == i) break;
            /* Go round once more so that entry 0 itself gets compared. */
            if (first_key_inspected) break;
            first_key_inspected = 1;
        }
    }
    while (1) {
        w = v + i;
        if (common_i >= w->s_size) {
            /* Every symbol of this entry matched. */
            z->c = c - w->s_size;
            if (w->function == 0) return w->result;
            {
                int res = w->function(z);
                /* Set the cursor again after the call, whatever the
                 * function did with it. */
                z->c = c - w->s_size;
                if (res) return w->result;
            }
        }
        i = w->substring_i;
        if (i < 0) return 0;
    }
}


/* Increase the size of the buffer pointed to by p to at least n symbols.
 * If insufficient memory, returns NULL and frees the old buffer.
 *
 * The allocation starts HEAD bytes before p, so the pointer handed to
 * realloc() is p - HEAD and the returned symbol pointer is HEAD bytes into
 * the new allocation.  The recorded capacity is n + 20 symbols, and room
 * for one further symbol beyond that is requested.
 */
static symbol * increase_size(symbol * p, int n) {
    symbol * q;
    int new_size = n + 20;
    void * mem = realloc((char *) p - HEAD,
                         HEAD + (new_size + 1) * sizeof(symbol));
    if (mem == NULL) {
        /* realloc() left the old block allocated; release it via lose_s(). */
        lose_s(p);
        return NULL;
    }
    q = (symbol *) (HEAD + (char *)mem);
    CAPACITY(q) = new_size;
    return q;
}

/* to replace symbols between c_bra and c_ket in z->p by the
   s_size symbols at s.
   Returns 0 on success, -1 on error.
   Also, frees z->p (and sets it to NULL) on error.
*/
extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjptr)
{
    int adjustment;
    int len;
    if (z->p == NULL) {
        z->p = create_s();
        if (z->p == NULL) return -1;
    }
    /* Change in length caused by the replacement (may be negative). */
    adjustment = s_size - (c_ket - c_bra);
    len = SIZE(z->p);
    if (adjustment != 0) {
        if (adjustment + len > CAPACITY(z->p)) {
            z->p = increase_size(z->p, adjustment + len);
            if (z->p == NULL) return -1;
        }
        /* Shift the tail that follows c_ket to its new position. */
        memmove(z->p + c_ket + adjustment,
                z->p + c_ket,
                (len - c_ket) * sizeof(symbol));
        SET_SIZE(z->p, adjustment + len);
        z->l += adjustment;
        /* A cursor at or after the replaced region moves with the tail; a
         * cursor inside the region is pulled back to its start. */
        if (z->c >= c_ket)
            z->c += adjustment;
        else if (z->c > c_bra)
            z->c = c_bra;
    }
    if (s_size) memmove(z->p + c_bra, s, s_size * sizeof(symbol));
    /* Report the length change to callers that asked for it. */
    if (adjptr != NULL)
        *adjptr = adjustment;
    return 0;
}

/* Validate the current slice: returns 0 when 0 <= bra <= ket <= l, z->p is
 * not NULL and l does not exceed SIZE(z->p); returns -1 otherwise. */
static int slice_check(struct SN_env * z) {

    if (z->bra < 0 ||
        z->bra > z->ket ||
        z->ket > z->l ||
        z->p == NULL ||
        z->l > SIZE(z->p)) /* this line could be removed */
    {
#if 0
        fprintf(stderr, "faulty slice operation:\n");
        debug(z, -1, 0);
#endif
        return -1;
    }
    return 0;
}

/* Replace the slice z->bra..z->ket by the s_size symbols at s.
 * Returns -1 if the slice is invalid or replace_s() fails, else 0. */
extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s) {
    if (slice_check(z)) return -1;
    return replace_s(z, z->bra, z->ket, s_size, s, NULL);
}

/* slice_from_s for a string p whose length is obtained with SIZE(p). */
extern int slice_from_v(struct SN_env * z, const symbol * p) {
    return slice_from_s(z, SIZE(p), p);
}

/* Delete the slice, i.e. replace it by zero symbols. */
extern int slice_del(struct SN_env * z) {
    return slice_from_s(z, 0, 0);
}

/* Replace bra..ket by the s_size symbols at s, then shift z->bra and z->ket
 * by the resulting length change when they lie at or after bra.
 * Returns -1 if replace_s() fails, else 0. */
extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s) {
    int adjustment;
    if (replace_s(z, bra, ket, s_size, s, &adjustment))
        return -1;
    if (bra <= z->bra) z->bra += adjustment;
    if (bra <= z->ket) z->ket += adjustment;
    return 0;
}

/* insert_s for a string p whose length is obtained with SIZE(p). */
extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p) {
    return insert_s(z, bra, ket, SIZE(p), p);
}

/* Copy the slice z->bra..z->ket into p, growing p if its capacity is too
 * small, and return the (possibly moved) buffer.
 * Returns NULL on failure; p has then been released, either here via
 * lose_s() or inside increase_size(). */
extern symbol * slice_to(struct SN_env * z, symbol * p) {
    if (slice_check(z)) {
        lose_s(p);
        return NULL;
    }
    {
        int len = z->ket - z->bra;
        if (CAPACITY(p) < len) {
            p = increase_size(p, len);
            if (p == NULL)
                return NULL;
        }
        memmove(p, z->p + z->bra, len * sizeof(symbol));
        SET_SIZE(p, len);
    }
    return p;
}

/* Copy the first z->l symbols of z->p into p, growing p if its capacity is
 * too small, and return the (possibly moved) buffer.
 * Returns NULL if growing fails (increase_size() has then freed p). */
extern symbol * assign_to(struct SN_env * z, symbol * p) {
    int len = z->l;
    if (CAPACITY(p) < len) {
        p = increase_size(p, len);
        if (p == NULL)
            return NULL;
    }
    memmove(p, z->p, len * sizeof(symbol));
    SET_SIZE(p, len);
    return p;
}

/* Count the characters in the UTF-8 string p: every symbol that is not a
 * continuation byte (10xxxxxx) counts as the start of one character. */
extern int len_utf8(const symbol * p) {
    int size = SIZE(p);
    int len = 0;
    while (size--) {
        symbol b = *p++;
        if (b >= 0xC0 || b < 0x80) ++len;
    }
    return len;
}

#if 0
/* Disabled debugging aid: prints z->p with markers for lb '{', bra '[',
 * cursor '|', ket ']' and l '}', showing zero symbols as '#'. */
extern void debug(struct SN_env * z, int number, int line_count) {
    int i;
    int limit = SIZE(z->p);
    /*if (number >= 0) printf("%3d (line %4d): '", number, line_count);*/
    if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit);
    for (i = 0; i <= limit; i++) {
        if (z->lb == i) printf("{");
        if (z->bra == i) printf("[");
        if (z->c == i) printf("|");
        if (z->ket == i) printf("]");
        if (z->l == i) printf("}");
        if (i < limit)
        {   int ch = z->p[i];
            if (ch == 0) ch = '#';
            printf("%c", ch);
        }
    }
    printf("'\n");
}
#endif
snowball-2.2.0/rust/000077500000000000000000000000001414263061200143305ustar00rootroot00000000000000snowball-2.2.0/rust/Cargo.toml000066400000000000000000000002101414263061200162510ustar00rootroot00000000000000[package]
name = "testapp"
version = "0.1.0"
authors = ["Jakob Demler "]
build = "build.rs"

[dependencies]
snowball-2.2.0/rust/build.rs000066400000000000000000000037761414263061200160120ustar00rootroot00000000000000use std::env;
use std::fs;
use std::fs::{OpenOptions};
use std::io::Write;
use std::path::Path;

// This build script makes the code independent from the algorithms declared
// in the makefile.
// We check which stemmers were generated and then produce the corresponding // includes for src/algorithms/mod.rs and a closure for src/main.rs to match // strings to stemmers fn main() { let out_dir = env::var("OUT_DIR").unwrap(); let lang_match_path = Path::new(&out_dir).join("lang_matches.rs"); let lang_include_path = Path::new(&out_dir).join("lang_include.rs"); let mut lang_match_file = OpenOptions::new().write(true).create(true).truncate(true).open(&lang_match_path).unwrap(); let mut lang_include_file = OpenOptions::new().write(true).create(true).truncate(true).open(&lang_include_path).unwrap(); let src_dir = Path::new(&env::var("CARGO_MANIFEST_DIR").unwrap()).join("src"); let algo_dir = src_dir.join("snowball/algorithms"); lang_match_file.write_all(b" move |lang:String|{ match lang.as_str() {") .unwrap(); for file in fs::read_dir(&algo_dir).unwrap() { let file = file.unwrap(); let path = file.path(); let filestem = path.file_stem().unwrap().to_str().unwrap(); if path.is_file() && filestem != "mod" { //Also we need to copy all the stemmer files into OUT_DIR... fs::copy(&path, Path::new(&out_dir).join(file.file_name())).unwrap(); let split = filestem.len() - 8; let langname = &filestem[..split]; writeln!(&mut lang_match_file, "\"{}\" => Stemmer {{ stemmer: snowball::algorithms::{}_stemmer::stem}},", langname, langname) .unwrap(); writeln!(&mut lang_include_file, "pub mod {}_stemmer;", langname).unwrap(); } } lang_match_file.write_all(b" x => panic!(\"Unknown algorithm '{}'\", x) } } ") .unwrap(); } snowball-2.2.0/rust/rust-pre-1.27-compat.patch000066400000000000000000000024201414263061200207760ustar00rootroot00000000000000Applying this patch restores compatibility with Rust < 1.27 (but causes newer versions to report "warning: trait objects without an explicit `dyn` are deprecated"). 
diff --git a/rust/src/main.rs b/rust/src/main.rs index 064325a9..bf752795 100644 --- a/rust/src/main.rs +++ b/rust/src/main.rs @@ -56,9 +56,9 @@ fn main() { let mut output = if let Some(output_file) = output_arg { - Box::new(File::create(Path::new(&output_file)).unwrap()) as Box + Box::new(File::create(Path::new(&output_file)).unwrap()) as Box } else { - Box::new(std::io::stdout()) as Box + Box::new(std::io::stdout()) as Box }; if let Some(input_file) = input_arg { diff --git a/rust/src/snowball/among.rs b/rust/src/snowball/among.rs index 57fc8bae..70631933 100644 --- a/rust/src/snowball/among.rs +++ b/rust/src/snowball/among.rs @@ -3,4 +3,4 @@ use snowball::SnowballEnv; pub struct Among(pub &'static str, pub i32, pub i32, - pub Option<&'static (dyn Fn(&mut SnowballEnv, &mut T) -> bool + Sync)>); + pub Option<&'static (Fn(&mut SnowballEnv, &mut T) -> bool + Sync)>); snowball-2.2.0/rust/src/000077500000000000000000000000001414263061200151175ustar00rootroot00000000000000snowball-2.2.0/rust/src/main.rs000066400000000000000000000063041414263061200164140ustar00rootroot00000000000000use std::fs::File; use std::io::{BufRead, BufReader, Write}; use std::path::Path; use std::env; use std::borrow::Cow; pub mod snowball; use snowball::SnowballEnv; fn usage(name: &str) { println!("{} -l [-i ] [-o ] The input file consists of a list of words to be stemmed, one per line. Words should be in lower case, but (for English) A-Z letters are mapped to their a-z equivalents anyway. 
If omitted, stdin is used.", name); } fn main() { let args: Vec = env::args().collect(); if args.len() < 3 { usage(&args[0]); } else { let mut language = None; let mut input_arg = None; let mut output_arg = None; let mut i = 1; while i < args.len() { match args[i].as_str() { "-l" => { language = Some(args[i+1].clone()); i += 2; }, "-i" => { input_arg = Some(args[i+1].clone()); i += 2; }, "-o" => { output_arg = Some(args[i+1].clone()); i += 2; }, x => { println!("Unrecognized option '{}'", x); usage(&args[0]); return } } } if language.is_none() { println!("Please specify a language!"); usage(&args[0]); return; } let stemmer = Stemmer::create(language.unwrap()); let mut output = if let Some(output_file) = output_arg { Box::new(File::create(Path::new(&output_file)).unwrap()) as Box } else { Box::new(std::io::stdout()) as Box }; if let Some(input_file) = input_arg { for line in BufReader::new(File::open(Path::new(&input_file)).unwrap()).lines() { writeln!(&mut output, "{}", stemmer.stem(&line.unwrap())).unwrap(); } } else { let stdin = std::io::stdin(); for line in stdin.lock().lines() { writeln!(&mut output, "{}", stemmer.stem(&line.unwrap())).unwrap(); } } } } /// Wraps a usable interface around the actual stemmer implementation pub struct Stemmer { stemmer: fn(&mut SnowballEnv) -> bool, } impl Stemmer { /// Create a new stemmer from an algorithm pub fn create(lang: String) -> Self { // Have a look at ../build.rs // There we generate a file that is rust code for a closure that returns a stemmer. // We match against all the algorithms in src/snowball/algoritms/ folder. // Alas, this cannot be included as a match statement or function because of Rust's // hygenic macros. let match_language = include!(concat!(env!("OUT_DIR"), "/lang_matches.rs")); match_language(lang) } /// Stem a single word /// Please note, that the input is expected to be all lowercase (if that is applicable). 
pub fn stem<'a>(&self, input: &'a str) -> Cow<'a, str> { let mut env = SnowballEnv::create(input); (self.stemmer)(&mut env); env.get_current() } } snowball-2.2.0/rust/src/snowball/000077500000000000000000000000001414263061200167405ustar00rootroot00000000000000snowball-2.2.0/rust/src/snowball/algorithms/000077500000000000000000000000001414263061200211115ustar00rootroot00000000000000snowball-2.2.0/rust/src/snowball/algorithms/mod.rs000066400000000000000000000001231414263061200222320ustar00rootroot00000000000000// Have a look at build.rs include!(concat!(env!("OUT_DIR"), "/lang_include.rs")); snowball-2.2.0/rust/src/snowball/among.rs000066400000000000000000000003751414263061200204140ustar00rootroot00000000000000use snowball::SnowballEnv; pub struct Among(pub &'static str, pub i32, pub i32, pub Option<&'static (dyn Fn(&mut SnowballEnv, &mut T) -> bool + Sync)>); snowball-2.2.0/rust/src/snowball/mod.rs000066400000000000000000000001771414263061200200720ustar00rootroot00000000000000pub mod algorithms; mod among; mod snowball_env; pub use snowball::among::Among; pub use snowball::snowball_env::SnowballEnv; snowball-2.2.0/rust/src/snowball/snowball_env.rs000066400000000000000000000311321414263061200217770ustar00rootroot00000000000000use std::borrow::Cow; use snowball::Among; #[derive(Debug, Clone)] pub struct SnowballEnv<'a> { pub current: Cow<'a, str>, pub cursor: i32, pub limit: i32, pub limit_backward: i32, pub bra: i32, pub ket: i32, } impl<'a> SnowballEnv<'a> { pub fn create(value: &'a str) -> Self { let len = value.len(); SnowballEnv { current: Cow::from(value), cursor: 0, limit: len as i32, limit_backward: 0, bra: 0, ket: len as i32, } } pub fn get_current(self) -> Cow<'a, str> { self.current } pub fn set_current(&mut self, current: &'a str) { self.current = Cow::from(current); } pub fn set_current_s(&mut self, current: String) { self.current = Cow::from(current); } fn replace_s(&mut self, bra: i32, ket: i32, s: &str) -> i32 { let adjustment = s.len() as i32 - (ket - 
bra); let mut result = String::with_capacity(self.current.len()); { let (lhs, _) = self.current.split_at(bra as usize); let (_, rhs) = self.current.split_at(ket as usize); result.push_str(lhs); result.push_str(s); result.push_str(rhs); } // ... not very nice... let new_lim = self.limit + adjustment; self.limit = new_lim; if self.cursor >= ket { let new_cur = self.cursor + adjustment; self.cursor = new_cur; } else if self.cursor > bra { self.cursor = bra } self.current = Cow::from(result); adjustment } /// Check if s is after cursor. /// If so, move cursor to the end of s pub fn eq_s(&mut self, s: &str) -> bool { if self.cursor >= self.limit { return false; } if self.current[(self.cursor as usize)..].starts_with(s) { self.cursor += s.len() as i32; while !self.current.is_char_boundary(self.cursor as usize) { self.cursor += 1; } true } else { false } } /// Check if 's' is before cursor /// If so, move cursor to the beginning of s pub fn eq_s_b(&mut self, s: &str) -> bool { if (self.cursor - self.limit_backward) < s.len() as i32 { false // Check if cursor -s.len is a char boundary. if not well... 
return false obv } else if !self.current.is_char_boundary(self.cursor as usize - s.len()) || !self.current[self.cursor as usize - s.len()..].starts_with(s) { false } else { self.cursor -= s.len() as i32; true } } /// Replace string between `bra` and `ket` with s pub fn slice_from(&mut self, s: &str) -> bool { let (bra, ket) = (self.bra, self.ket); self.replace_s(bra, ket, s); true } /// Move cursor to next character pub fn next_char(&mut self) { self.cursor += 1; while !self.current.is_char_boundary(self.cursor as usize) { self.cursor += 1; } } /// Move cursor to previous character pub fn previous_char(&mut self) { self.cursor -= 1; while !self.current.is_char_boundary(self.cursor as usize) { self.cursor -= 1; } } pub fn hop(&mut self, mut delta: i32) -> bool { let mut res = self.cursor; while delta > 0 { delta -= 1; if res >= self.limit { return false; } res += 1; while res < self.limit && !self.current.is_char_boundary(res as usize) { res += 1; } } self.cursor = res; return true; } pub fn hop_checked(&mut self, delta: i32) -> bool { return delta >= 0 && self.hop(delta); } pub fn hop_back(&mut self, mut delta: i32) -> bool { let mut res = self.cursor; while delta > 0 { delta -= 1; if res <= self.limit_backward { return false; } res -= 1; while res > self.limit_backward && !self.current.is_char_boundary(res as usize) { res -= 1; } } self.cursor = res; return true; } pub fn hop_back_checked(&mut self, delta: i32) -> bool { return delta >= 0 && self.hop_back(delta); } // A grouping is represented by a minimum code point, a maximum code point, // and a bitfield of which code points in that range are in the grouping. // For example, in english.sbl, valid_LI is 'cdeghkmnrt'. // The minimum and maximum code points are 99 and 116, // so every time one of these grouping functions is called for g_valid_LI, // min must be 99 and max must be 116. 
There are 18 code points within that // range (inclusive) so the grouping is represented with 18 bits, plus 6 bits of padding: // // cdefghij klmnopqr st // 11101100 10110001 01000000 // // The first bit is the least significant. // Those three bytes become &[0b00110111, 0b10001101, 0b00000010], // which is &[55, 141, 2], which is how g_valid_LI is defined in english.rs. /// Check if the char the cursor points to is in the grouping pub fn in_grouping(&mut self, chars: &[u8], min: u32, max: u32) -> bool { if self.cursor >= self.limit { return false; } if let Some(chr) = self.current[self.cursor as usize..].chars().next() { let mut ch = chr as u32; //codepoint as integer if ch > max || ch < min { return false; } ch -= min; if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 { return false; } self.next_char(); return true; } return false; } pub fn in_grouping_b(&mut self, chars: &[u8], min: u32, max: u32) -> bool { if self.cursor <= self.limit_backward { return false; } self.previous_char(); if let Some(chr) = self.current[self.cursor as usize..].chars().next() { let mut ch = chr as u32; //codepoint as integer self.next_char(); if ch > max || ch < min { return false; } ch -= min; if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 { return false; } self.previous_char(); return true; } return false; } pub fn out_grouping(&mut self, chars: &[u8], min: u32, max: u32) -> bool { if self.cursor >= self.limit { return false; } if let Some(chr) = self.current[self.cursor as usize..].chars().next() { let mut ch = chr as u32; //codepoint as integer if ch > max || ch < min { self.next_char(); return true; } ch -= min; if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 { self.next_char(); return true; } } return false; } pub fn out_grouping_b(&mut self, chars: &[u8], min: u32, max: u32) -> bool { if self.cursor <= self.limit_backward { return false; } self.previous_char(); if let Some(chr) = self.current[self.cursor as usize..].chars().next() { let mut ch = chr 
as u32; //codepoint as integer self.next_char(); if ch > max || ch < min { self.previous_char(); return true; } ch -= min; if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 { self.previous_char(); return true; } } return false; } /// Helper function that removes the string slice between `bra` and `ket` pub fn slice_del(&mut self) -> bool { self.slice_from("") } pub fn insert(&mut self, bra: i32, ket: i32, s: &str) { let adjustment = self.replace_s(bra, ket, s); if bra <= self.bra { self.bra = self.bra + adjustment; } if bra <= self.ket { self.ket = self.ket + adjustment; } } pub fn assign_to(&mut self) -> String { self.current[0..self.limit as usize].to_string() } pub fn slice_to(&mut self) -> String { self.current[self.bra as usize..self.ket as usize].to_string() } pub fn find_among(&mut self, amongs: &[Among], context: &mut T) -> i32 { use std::cmp::min; let mut i: i32 = 0; let mut j: i32 = amongs.len() as i32; let c = self.cursor; let l = self.limit; let mut common_i = 0i32; let mut common_j = 0i32; let mut first_key_inspected = false; loop { let k = i + ((j - i) >> 1); let mut diff: i32 = 0; let mut common = min(common_i, common_j); let w = &amongs[k as usize]; for lvar in common..w.0.len() as i32 { if c + common == l { diff = -1; break; } diff = self.current.as_bytes()[(c + common) as usize] as i32 - w.0.as_bytes()[lvar as usize] as i32; if diff != 0 { break; } common += 1; } if diff < 0 { j = k; common_j = common; } else { i = k; common_i = common; } if j - i <= 1 { if i > 0 { break; } if j == i { break; } if first_key_inspected { break; } first_key_inspected = true; } } loop { let w = &amongs[i as usize]; if common_i >= w.0.len() as i32{ self.cursor = c + w.0.len() as i32; if let Some(ref method) = w.3 { let res = method(self, context); self.cursor = c + w.0.len() as i32; if res { return w.2; } } else { return w.2; } } i = w.1; if i < 0 { return 0; } } } pub fn find_among_b(&mut self, amongs: &[Among], context: &mut T) -> i32 { let mut i: i32 = 0; 
let mut j: i32 = amongs.len() as i32; let c = self.cursor; let lb = self.limit_backward; let mut common_i = 0i32; let mut common_j = 0i32; let mut first_key_inspected = false; loop { let k = i + ((j - i) >> 1); let mut diff: i32 = 0; let mut common = if common_i < common_j { common_i } else { common_j }; let w = &amongs[k as usize]; for lvar in (0..w.0.len() - common as usize).rev() { if c - common == lb { diff = -1; break; } diff = self.current.as_bytes()[(c - common - 1) as usize] as i32 - w.0.as_bytes()[lvar] as i32; if diff != 0 { break; } // Count up commons. But not one character but the byte width of that char common += 1; } if diff < 0 { j = k; common_j = common; } else { i = k; common_i = common; } if j - i <= 1 { if i > 0 { break; } if j == i { break; } if first_key_inspected { break; } first_key_inspected = true; } } loop { let w = &amongs[i as usize]; if common_i >= w.0.len() as i32 { self.cursor = c - w.0.len() as i32; if let Some(ref method) = w.3 { let res = method(self, context); self.cursor = c - w.0.len() as i32; if res { return w.2; } } else { return w.2; } } i = w.1; if i < 0 { return 0; } } } } snowball-2.2.0/tests/000077500000000000000000000000001414263061200144755ustar00rootroot00000000000000snowball-2.2.0/tests/stemtest.c000066400000000000000000000056071414263061200165210ustar00rootroot00000000000000/* This is a simple program which uses libstemmer to provide a command * line interface for stemming using any of the algorithms provided. 
*/ #include #include #include /* for strlen, memcmp */ #include "libstemmer.h" #define EMOJI_FACE_THROWING_A_KISS "\xf0\x9f\x98\x98" #define U_40079 "\xf1\x80\x81\xb9" static const struct testcase { /* Stemmer to use, or 0 to test with all stemmers */ const char * language; /* Character encoding (can be 0 for UTF-8) */ const char * charenc; /* Input string (0 marks end of list) */ const char * input; /* Expected output string (0 means same as input) */ const char * expect; } testcases[] = { { "en", 0, "a" EMOJI_FACE_THROWING_A_KISS "ing", "a" EMOJI_FACE_THROWING_A_KISS "e" }, { "en", 0, U_40079 "wing", 0 }, // The Finnish stemmer used to damage numbers ending with two or more of // the same digit: https://github.com/snowballstem/snowball/issues/66 { 0, 0, "2000", 0 }, { 0, 0, "999", 0 }, { 0, 0, "1000000000", 0 }, // The Danish stemmer used to damage a number at the end of a word: // https://github.com/snowballstem/snowball/issues/81 { 0, 0, "space1999", 0 }, { 0, 0, "hal9000", 0 }, { 0, 0, "0x0e00", 0 }, { 0, 0, 0, 0 } }; static void run_testcase(const char * language, const struct testcase *test) { const char * charenc = test->charenc; const char * input = test->input; const char * expect = test->expect; struct sb_stemmer * stemmer = sb_stemmer_new(language, charenc); const sb_symbol * stemmed; int len; if (expect == NULL) expect = input; if (stemmer == 0) { if (charenc == NULL) { fprintf(stderr, "language `%s' not available for stemming\n", language); exit(1); } else { fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc); exit(1); } } stemmed = sb_stemmer_stem(stemmer, (const unsigned char*)input, strlen(input)); if (stemmed == NULL) { fprintf(stderr, "Out of memory"); exit(1); } len = sb_stemmer_length(stemmer); if (len != (int)strlen(expect) || memcmp(stemmed, expect, len) != 0) { fprintf(stderr, "%s stemmer output for %s was %.*s not %s\n", language, input, len, stemmed, expect); exit(1); } 
sb_stemmer_delete(stemmer); } int main(int argc, char * argv[]) { const char ** all_languages = sb_stemmer_list(); const struct testcase * p; (void)argc; (void)argv; for (p = testcases; p->input; ++p) { const char * language = p->language; if (language) { run_testcase(language, p); } else { const char ** l; for (l = all_languages; *l; ++l) { run_testcase(*l, p); } } } return 0; }