kytea_0.4.6+dfsg.orig/0000755000175000017500000000000012205657353014104 5ustar koichikoichikytea_0.4.6+dfsg.orig/configure.ac0000644000175000017500000000306312133226710016361 0ustar koichikoichi# -*- Autoconf -*- # Process this file with autoconf to produce a configure script. AC_PREREQ([2.59]) AC_INIT([kytea], [0.4.6], [kytea@phontron.com]) AC_CONFIG_SRCDIR([src/lib/kytea.cpp]) AC_CONFIG_HEADERS([src/include/kytea/config.h]) AC_CONFIG_FILES([ Makefile kytea.pc src/Makefile src/include/Makefile src/lib/Makefile src/lib/liblinear/Makefile src/lib/liblinear/blas/Makefile src/bin/Makefile src/test/Makefile src/api/Makefile data/Makefile ]) # disable shared libraries AC_PROG_LIBTOOL # Intialize automake AM_INIT_AUTOMAKE([-Wall]) # set CFLAGS and CXXFLAGS for maximal optimization if test -n "${CXXFLAGS}"; then user_set_cxxflags=yes fi AC_PROG_CXX if test X$user_set_cxxflags != Xyes; then CXXFLAGS="-g -Wall -O3" fi if test -n "${CFLAGS}"; then user_set_cflags=yes fi AC_PROG_CC if test X$user_set_cflags != Xyes; then CFLAGS="-g -Wall -O3" fi # Checks for features. AC_ARG_ENABLE(quantize, [ --enable-quantize Quantize the model, resulting in smaller but possibly less accurate models)], [], [enable_quantize=yes]) if test "x$enable_quantize" == xno; then AC_DEFINE([DISABLE_QUANTIZE], [1], [Disable quantizing]) else AC_DEFINE([DISABLE_QUANTIZE], [0], [Enable quantizing]) fi # Checks for typedefs, structures, and compiler characteristics. AC_HEADER_STDBOOL AC_C_INLINE AC_TYPE_SIZE_T # Check to make sure that we have unordered_map AC_LANG([C++]) AC_CHECK_HEADERS([boost/tr1/unordered_map.hpp tr1/unordered_map ext/hash_map], break) AC_OUTPUT kytea_0.4.6+dfsg.orig/AUTHORS0000644000175000017500000000006512122355536015151 0ustar koichikoichiPrinciple Contact: Graham Neubig kytea_0.4.6+dfsg.orig/ChangeLog0000644000175000017500000000033412122355536015652 0ustar koichikoichiVersion 0.0.2 (11/16/2009) * Support for Shift-JIS (in addition to the previous EUC-JP and UTF8). 
* Build system change to Autotools KyTea - Version 0.0.1 (11/05/2009) * Initial release of KyWs and KyPe. kytea_0.4.6+dfsg.orig/NEWS0000644000175000017500000000011412122355536014573 0ustar koichikoichiKyTea - Version 0.0.1 (11/05/2009) * Initial release of KyWs and KyPe. kytea_0.4.6+dfsg.orig/kytea.pc.in0000644000175000017500000000037212122355536016150 0ustar koichikoichiprefix=@prefix@ exec_prefix=@exec_prefix@ bindir=@bindir@ libdir=@libdir@ includedir=@includedir@ Name: KyTea Description: KyTea is a general toolkit developed for analyzing text. Version: @VERSION@ Cflags: -I${includedir} Libs: -L${libdir} -lkytea kytea_0.4.6+dfsg.orig/COPYING0000644000175000017500000000106412122355536015134 0ustar koichikoichiCopyright 2009, KyTea Development Team Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. kytea_0.4.6+dfsg.orig/README0000644000175000017500000000064112122355536014761 0ustar koichikoichiKyTea KyTea is a general text analysis toolkit, with a focus on Japanese and other languages requiring word or morpheme segmentation. 
Detailed usage information can be found at http://www.phontron.com/kytea To build KyTea, run > ./configure > make If there is no configure file in the directory (for example, if you checked the source out from github), you can rebuild the configure file using > autoreconf -i kytea_0.4.6+dfsg.orig/src/0000755000175000017500000000000012151067112014657 5ustar koichikoichikytea_0.4.6+dfsg.orig/src/api/0000755000175000017500000000000012151067113015431 5ustar koichikoichikytea_0.4.6+dfsg.orig/src/api/Makefile.in0000644000175000017500000003761712133240156017514 0ustar koichikoichi# Makefile.in generated by automake 1.11.1 from Makefile.am. # @configure_input@ # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, # 2003, 2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, # Inc. # This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY, to the extent permitted by law; without # even the implied warranty of MERCHANTABILITY or FITNESS FOR A # PARTICULAR PURPOSE. 
@SET_MAKE@ VPATH = @srcdir@ pkgdatadir = $(datadir)/@PACKAGE@ pkgincludedir = $(includedir)/@PACKAGE@ pkglibdir = $(libdir)/@PACKAGE@ pkglibexecdir = $(libexecdir)/@PACKAGE@ am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd install_sh_DATA = $(install_sh) -c -m 644 install_sh_PROGRAM = $(install_sh) -c install_sh_SCRIPT = $(install_sh) -c INSTALL_HEADER = $(INSTALL_DATA) transform = $(program_transform_name) NORMAL_INSTALL = : PRE_INSTALL = : POST_INSTALL = : NORMAL_UNINSTALL = : PRE_UNINSTALL = : POST_UNINSTALL = : build_triplet = @build@ host_triplet = @host@ bin_PROGRAMS = api-example$(EXEEXT) subdir = src/api DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 am__aclocal_m4_deps = $(top_srcdir)/configure.ac am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ $(ACLOCAL_M4) mkinstalldirs = $(install_sh) -d CONFIG_HEADER = $(top_builddir)/src/include/kytea/config.h CONFIG_CLEAN_FILES = CONFIG_CLEAN_VPATH_FILES = am__installdirs = "$(DESTDIR)$(bindir)" PROGRAMS = $(bin_PROGRAMS) am__objects_1 = am_api_example_OBJECTS = api-example.$(OBJEXT) $(am__objects_1) api_example_OBJECTS = $(am_api_example_OBJECTS) api_example_DEPENDENCIES = ../lib/libkytea.la DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)/src/include/kytea depcomp = $(SHELL) $(top_srcdir)/depcomp am__depfiles_maybe = depfiles am__mv = mv -f CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) LTCXXCOMPILE = $(LIBTOOL) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) CXXLD = $(CXX) CXXLINK = $(LIBTOOL) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ --mode=link $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \ $(LDFLAGS) -o $@ SOURCES = $(api_example_SOURCES) DIST_SOURCES = $(api_example_SOURCES) ETAGS = etags CTAGS = ctags DISTFILES = $(DIST_COMMON) 
$(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) ACLOCAL = @ACLOCAL@ AMTAR = @AMTAR@ AR = @AR@ AUTOCONF = @AUTOCONF@ AUTOHEADER = @AUTOHEADER@ AUTOMAKE = @AUTOMAKE@ AWK = @AWK@ CC = @CC@ CCDEPMODE = @CCDEPMODE@ CFLAGS = @CFLAGS@ CPP = @CPP@ CPPFLAGS = @CPPFLAGS@ CXX = @CXX@ CXXCPP = @CXXCPP@ CXXDEPMODE = @CXXDEPMODE@ CXXFLAGS = @CXXFLAGS@ CYGPATH_W = @CYGPATH_W@ DEFS = @DEFS@ DEPDIR = @DEPDIR@ DLLTOOL = @DLLTOOL@ DSYMUTIL = @DSYMUTIL@ DUMPBIN = @DUMPBIN@ ECHO_C = @ECHO_C@ ECHO_N = @ECHO_N@ ECHO_T = @ECHO_T@ EGREP = @EGREP@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ INSTALL = @INSTALL@ INSTALL_DATA = @INSTALL_DATA@ INSTALL_PROGRAM = @INSTALL_PROGRAM@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ LD = @LD@ LDFLAGS = @LDFLAGS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ LIPO = @LIPO@ LN_S = @LN_S@ LTLIBOBJS = @LTLIBOBJS@ MAKEINFO = @MAKEINFO@ MANIFEST_TOOL = @MANIFEST_TOOL@ MKDIR_P = @MKDIR_P@ NM = @NM@ NMEDIT = @NMEDIT@ OBJDUMP = @OBJDUMP@ OBJEXT = @OBJEXT@ OTOOL = @OTOOL@ OTOOL64 = @OTOOL64@ PACKAGE = @PACKAGE@ PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ PACKAGE_NAME = @PACKAGE_NAME@ PACKAGE_STRING = @PACKAGE_STRING@ PACKAGE_TARNAME = @PACKAGE_TARNAME@ PACKAGE_URL = @PACKAGE_URL@ PACKAGE_VERSION = @PACKAGE_VERSION@ PATH_SEPARATOR = @PATH_SEPARATOR@ RANLIB = @RANLIB@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ STRIP = @STRIP@ VERSION = @VERSION@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ abs_top_srcdir = @abs_top_srcdir@ ac_ct_AR = @ac_ct_AR@ ac_ct_CC = @ac_ct_CC@ ac_ct_CXX = @ac_ct_CXX@ ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ am__include = @am__include@ am__leading_dot = @am__leading_dot@ am__quote = @am__quote@ am__tar = @am__tar@ am__untar = @am__untar@ bindir = @bindir@ build = @build@ build_alias = @build_alias@ build_cpu = @build_cpu@ build_os = @build_os@ build_vendor = @build_vendor@ builddir = @builddir@ datadir = @datadir@ datarootdir = @datarootdir@ docdir = @docdir@ 
dvidir = @dvidir@ exec_prefix = @exec_prefix@ host = @host@ host_alias = @host_alias@ host_cpu = @host_cpu@ host_os = @host_os@ host_vendor = @host_vendor@ htmldir = @htmldir@ includedir = @includedir@ infodir = @infodir@ install_sh = @install_sh@ libdir = @libdir@ libexecdir = @libexecdir@ localedir = @localedir@ localstatedir = @localstatedir@ mandir = @mandir@ mkdir_p = @mkdir_p@ oldincludedir = @oldincludedir@ pdfdir = @pdfdir@ prefix = @prefix@ program_transform_name = @program_transform_name@ psdir = @psdir@ sbindir = @sbindir@ sharedstatedir = @sharedstatedir@ srcdir = @srcdir@ sysconfdir = @sysconfdir@ target_alias = @target_alias@ top_build_prefix = @top_build_prefix@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ # KYTH = kytea.h corpus-io.h model-io.h string-util.h \ # kytea-model.h kytea-string.h kytea-struct.h dictionary.h general-io.h \ # kytea-config.h KYTH = AM_CPPFLAGS = -I$(srcdir)/../include -DPKGDATADIR='"$(pkgdatadir)"' api_example_SOURCES = api-example.cpp ${KYTH} api_example_LDADD = ../lib/libkytea.la all: all-am .SUFFIXES: .SUFFIXES: .cpp .lo .o .obj $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) @for dep in $?; do \ case '$(am__configure_deps)' in \ *$$dep*) \ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ && { if test -f $@; then exit 0; else break; fi; }; \ exit 1;; \ esac; \ done; \ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu src/api/Makefile'; \ $(am__cd) $(top_srcdir) && \ $(AUTOMAKE) --gnu src/api/Makefile .PRECIOUS: Makefile Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status @case '$?' 
in \ *config.status*) \ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ *) \ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ esac; $(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh $(top_srcdir)/configure: $(am__configure_deps) cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh $(ACLOCAL_M4): $(am__aclocal_m4_deps) cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh $(am__aclocal_m4_deps): install-binPROGRAMS: $(bin_PROGRAMS) @$(NORMAL_INSTALL) test -z "$(bindir)" || $(MKDIR_P) "$(DESTDIR)$(bindir)" @list='$(bin_PROGRAMS)'; test -n "$(bindir)" || list=; \ for p in $$list; do echo "$$p $$p"; done | \ sed 's/$(EXEEXT)$$//' | \ while read p p1; do if test -f $$p || test -f $$p1; \ then echo "$$p"; echo "$$p"; else :; fi; \ done | \ sed -e 'p;s,.*/,,;n;h' -e 's|.*|.|' \ -e 'p;x;s,.*/,,;s/$(EXEEXT)$$//;$(transform);s/$$/$(EXEEXT)/' | \ sed 'N;N;N;s,\n, ,g' | \ $(AWK) 'BEGIN { files["."] = ""; dirs["."] = 1 } \ { d=$$3; if (dirs[d] != 1) { print "d", d; dirs[d] = 1 } \ if ($$2 == $$4) files[d] = files[d] " " $$1; \ else { print "f", $$3 "/" $$4, $$1; } } \ END { for (d in files) print "f", d, files[d] }' | \ while read type dir files; do \ if test "$$dir" = .; then dir=; else dir=/$$dir; fi; \ test -z "$$files" || { \ echo " $(INSTALL_PROGRAM_ENV) $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL_PROGRAM) $$files '$(DESTDIR)$(bindir)$$dir'"; \ $(INSTALL_PROGRAM_ENV) $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL_PROGRAM) $$files "$(DESTDIR)$(bindir)$$dir" || exit $$?; \ } \ ; done uninstall-binPROGRAMS: @$(NORMAL_UNINSTALL) @list='$(bin_PROGRAMS)'; test -n "$(bindir)" || list=; \ files=`for p in $$list; do echo "$$p"; done | \ sed -e 'h;s,^.*/,,;s/$(EXEEXT)$$//;$(transform)' \ -e 
's/$$/$(EXEEXT)/' `; \ test -n "$$list" || exit 0; \ echo " ( cd '$(DESTDIR)$(bindir)' && rm -f" $$files ")"; \ cd "$(DESTDIR)$(bindir)" && rm -f $$files clean-binPROGRAMS: @list='$(bin_PROGRAMS)'; test -n "$$list" || exit 0; \ echo " rm -f" $$list; \ rm -f $$list || exit $$?; \ test -n "$(EXEEXT)" || exit 0; \ list=`for p in $$list; do echo "$$p"; done | sed 's/$(EXEEXT)$$//'`; \ echo " rm -f" $$list; \ rm -f $$list api-example$(EXEEXT): $(api_example_OBJECTS) $(api_example_DEPENDENCIES) @rm -f api-example$(EXEEXT) $(CXXLINK) $(api_example_OBJECTS) $(api_example_LDADD) $(LIBS) mostlyclean-compile: -rm -f *.$(OBJEXT) distclean-compile: -rm -f *.tab.c @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/api-example.Po@am__quote@ .cpp.o: @am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< @am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po @AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ $< .cpp.obj: @am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` @am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po @AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` .cpp.lo: @am__fastdepCXX_TRUE@ $(LTCXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< @am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo @AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(LTCXXCOMPILE) -c -o $@ $< mostlyclean-libtool: -rm -f *.lo clean-libtool: -rm -rf .libs 
_libs ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ unique=`for i in $$list; do \ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ done | \ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ END { if (nonempty) { for (i in files) print i; }; }'`; \ mkid -fID $$unique tags: TAGS TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ $(TAGS_FILES) $(LISP) set x; \ here=`pwd`; \ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ unique=`for i in $$list; do \ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ done | \ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ END { if (nonempty) { for (i in files) print i; }; }'`; \ shift; \ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ test -n "$$unique" || unique=$$empty_fix; \ if test $$# -gt 0; then \ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ "$$@" $$unique; \ else \ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ $$unique; \ fi; \ fi ctags: CTAGS CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ $(TAGS_FILES) $(LISP) list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ unique=`for i in $$list; do \ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ done | \ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ END { if (nonempty) { for (i in files) print i; }; }'`; \ test -z "$(CTAGS_ARGS)$$unique" \ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ $$unique GTAGS: here=`$(am__cd) $(top_builddir) && pwd` \ && $(am__cd) $(top_srcdir) \ && gtags -i $(GTAGS_ARGS) "$$here" distclean-tags: -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags distdir: $(DISTFILES) @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ list='$(DISTFILES)'; \ dist_files=`for file in $$list; do echo $$file; done | \ sed -e "s|^$$srcdirstrip/||;t" \ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ case $$dist_files in \ */*) $(MKDIR_P) `echo "$$dist_files" | \ sed 
'/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ sort -u` ;; \ esac; \ for file in $$dist_files; do \ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ if test -d $$d/$$file; then \ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ if test -d "$(distdir)/$$file"; then \ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ fi; \ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ fi; \ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ else \ test -f "$(distdir)/$$file" \ || cp -p $$d/$$file "$(distdir)/$$file" \ || exit 1; \ fi; \ done check-am: all-am check: check-am all-am: Makefile $(PROGRAMS) installdirs: for dir in "$(DESTDIR)$(bindir)"; do \ test -z "$$dir" || $(MKDIR_P) "$$dir"; \ done install: install-am install-exec: install-exec-am install-data: install-data-am uninstall: uninstall-am install-am: all-am @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am installcheck: installcheck-am install-strip: $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ `test -z '$(STRIP)' || \ echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install mostlyclean-generic: clean-generic: distclean-generic: -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) maintainer-clean-generic: @echo "This command is intended for maintainers to use" @echo "it deletes files that may require special tools to rebuild." 
clean: clean-am clean-am: clean-binPROGRAMS clean-generic clean-libtool mostlyclean-am distclean: distclean-am -rm -rf ./$(DEPDIR) -rm -f Makefile distclean-am: clean-am distclean-compile distclean-generic \ distclean-tags dvi: dvi-am dvi-am: html: html-am html-am: info: info-am info-am: install-data-am: install-dvi: install-dvi-am install-dvi-am: install-exec-am: install-binPROGRAMS install-html: install-html-am install-html-am: install-info: install-info-am install-info-am: install-man: install-pdf: install-pdf-am install-pdf-am: install-ps: install-ps-am install-ps-am: installcheck-am: maintainer-clean: maintainer-clean-am -rm -rf ./$(DEPDIR) -rm -f Makefile maintainer-clean-am: distclean-am maintainer-clean-generic mostlyclean: mostlyclean-am mostlyclean-am: mostlyclean-compile mostlyclean-generic \ mostlyclean-libtool pdf: pdf-am pdf-am: ps: ps-am ps-am: uninstall-am: uninstall-binPROGRAMS .MAKE: install-am install-strip .PHONY: CTAGS GTAGS all all-am check check-am clean clean-binPROGRAMS \ clean-generic clean-libtool ctags distclean distclean-compile \ distclean-generic distclean-libtool distclean-tags distdir dvi \ dvi-am html html-am info info-am install install-am \ install-binPROGRAMS install-data install-data-am install-dvi \ install-dvi-am install-exec install-exec-am install-html \ install-html-am install-info install-info-am install-man \ install-pdf install-pdf-am install-ps install-ps-am \ install-strip installcheck installcheck-am installdirs \ maintainer-clean maintainer-clean-generic mostlyclean \ mostlyclean-compile mostlyclean-generic mostlyclean-libtool \ pdf pdf-am ps ps-am tags uninstall uninstall-am \ uninstall-binPROGRAMS # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. 
.NOEXPORT: kytea_0.4.6+dfsg.orig/src/api/api-example.cpp0000644000175000017500000000377312122355536020360 0ustar koichikoichi#include // a file including the main program #include // a file including sentence, word, and pronunciation objects #include // a file to include the StringUtil object #include using namespace std; using namespace kytea; int main(int argc, char** argv) { // Create an instance of the Kytea program Kytea kytea; // Load a KyTea model from a model file // this can be a binary or text model in any character encoding, // it will be detected automatically kytea.readModel("../../data/model.bin"); // Get the string utility class. This allows you to convert from // the appropriate string encoding to Kytea's internal format StringUtil* util = kytea.getStringUtil(); // Get the configuration class, this allows you to read or set the // configuration for the analysis KyteaConfig* config = kytea.getConfig(); // Map a plain text string to a KyteaString, and create a sentence object KyteaString surface_string = util->mapString("これはテストです。"); KyteaSentence sentence(surface_string, util->normalize(surface_string)); // Find the word boundaries kytea.calculateWS(sentence); // Find the pronunciations for each tag level for(int i = 0; i < config->getNumTags(); i++) kytea.calculateTags(sentence,i); // For each word in the sentence const KyteaSentence::Words & words = sentence.words; for(int i = 0; i < (int)words.size(); i++) { // Print the word cout << util->showString(words[i].surface); // For each tag level for(int j = 0; j < (int)words[i].tags.size(); j++) { cout << "\t"; // Print each of its tags for(int k = 0; k < (int)words[i].tags[j].size(); k++) { cout << " " << util->showString(words[i].tags[j][k].first) << "/" << words[i].tags[j][k].second; } } cout << endl; } cout << endl; } kytea_0.4.6+dfsg.orig/src/api/Makefile.am0000644000175000017500000000054012122355536017473 0ustar koichikoichi# KYTH = kytea.h corpus-io.h model-io.h string-util.h \ # kytea-model.h 
kytea-string.h kytea-struct.h dictionary.h general-io.h \ # kytea-config.h KYTH = AM_CPPFLAGS = -I$(srcdir)/../include -DPKGDATADIR='"$(pkgdatadir)"' bin_PROGRAMS = api-example api_example_SOURCES = api-example.cpp ${KYTH} api_example_LDADD = ../lib/libkytea.la kytea_0.4.6+dfsg.orig/src/Makefile.in0000644000175000017500000004050412133240156016730 0ustar koichikoichi# Makefile.in generated by automake 1.11.1 from Makefile.am. # @configure_input@ # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, # 2003, 2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, # Inc. # This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY, to the extent permitted by law; without # even the implied warranty of MERCHANTABILITY or FITNESS FOR A # PARTICULAR PURPOSE. 
@SET_MAKE@ VPATH = @srcdir@ pkgdatadir = $(datadir)/@PACKAGE@ pkgincludedir = $(includedir)/@PACKAGE@ pkglibdir = $(libdir)/@PACKAGE@ pkglibexecdir = $(libexecdir)/@PACKAGE@ am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd install_sh_DATA = $(install_sh) -c -m 644 install_sh_PROGRAM = $(install_sh) -c install_sh_SCRIPT = $(install_sh) -c INSTALL_HEADER = $(INSTALL_DATA) transform = $(program_transform_name) NORMAL_INSTALL = : PRE_INSTALL = : POST_INSTALL = : NORMAL_UNINSTALL = : PRE_UNINSTALL = : POST_UNINSTALL = : build_triplet = @build@ host_triplet = @host@ subdir = src DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 am__aclocal_m4_deps = $(top_srcdir)/configure.ac am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ $(ACLOCAL_M4) mkinstalldirs = $(install_sh) -d CONFIG_HEADER = $(top_builddir)/src/include/kytea/config.h CONFIG_CLEAN_FILES = CONFIG_CLEAN_VPATH_FILES = SOURCES = DIST_SOURCES = RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \ html-recursive info-recursive install-data-recursive \ install-dvi-recursive install-exec-recursive \ install-html-recursive install-info-recursive \ install-pdf-recursive install-ps-recursive install-recursive \ installcheck-recursive installdirs-recursive pdf-recursive \ ps-recursive uninstall-recursive RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \ distclean-recursive maintainer-clean-recursive AM_RECURSIVE_TARGETS = $(RECURSIVE_TARGETS:-recursive=) \ $(RECURSIVE_CLEAN_TARGETS:-recursive=) tags TAGS ctags CTAGS \ distdir ETAGS = etags CTAGS = ctags DIST_SUBDIRS = $(SUBDIRS) DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) am__relativize = \ dir0=`pwd`; \ sed_first='s,^\([^/]*\)/.*$$,\1,'; \ sed_rest='s,^[^/]*/*,,'; \ sed_last='s,^.*/\([^/]*\)$$,\1,'; \ sed_butlast='s,/*[^/]*$$,,'; \ while test -n "$$dir1"; do \ first=`echo "$$dir1" | sed -e "$$sed_first"`; \ if test "$$first" != "."; then \ 
if test "$$first" = ".."; then \ dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \ dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \ else \ first2=`echo "$$dir2" | sed -e "$$sed_first"`; \ if test "$$first2" = "$$first"; then \ dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \ else \ dir2="../$$dir2"; \ fi; \ dir0="$$dir0"/"$$first"; \ fi; \ fi; \ dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \ done; \ reldir="$$dir2" ACLOCAL = @ACLOCAL@ AMTAR = @AMTAR@ AR = @AR@ AUTOCONF = @AUTOCONF@ AUTOHEADER = @AUTOHEADER@ AUTOMAKE = @AUTOMAKE@ AWK = @AWK@ CC = @CC@ CCDEPMODE = @CCDEPMODE@ CFLAGS = @CFLAGS@ CPP = @CPP@ CPPFLAGS = @CPPFLAGS@ CXX = @CXX@ CXXCPP = @CXXCPP@ CXXDEPMODE = @CXXDEPMODE@ CXXFLAGS = @CXXFLAGS@ CYGPATH_W = @CYGPATH_W@ DEFS = @DEFS@ DEPDIR = @DEPDIR@ DLLTOOL = @DLLTOOL@ DSYMUTIL = @DSYMUTIL@ DUMPBIN = @DUMPBIN@ ECHO_C = @ECHO_C@ ECHO_N = @ECHO_N@ ECHO_T = @ECHO_T@ EGREP = @EGREP@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ INSTALL = @INSTALL@ INSTALL_DATA = @INSTALL_DATA@ INSTALL_PROGRAM = @INSTALL_PROGRAM@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ LD = @LD@ LDFLAGS = @LDFLAGS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ LIPO = @LIPO@ LN_S = @LN_S@ LTLIBOBJS = @LTLIBOBJS@ MAKEINFO = @MAKEINFO@ MANIFEST_TOOL = @MANIFEST_TOOL@ MKDIR_P = @MKDIR_P@ NM = @NM@ NMEDIT = @NMEDIT@ OBJDUMP = @OBJDUMP@ OBJEXT = @OBJEXT@ OTOOL = @OTOOL@ OTOOL64 = @OTOOL64@ PACKAGE = @PACKAGE@ PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ PACKAGE_NAME = @PACKAGE_NAME@ PACKAGE_STRING = @PACKAGE_STRING@ PACKAGE_TARNAME = @PACKAGE_TARNAME@ PACKAGE_URL = @PACKAGE_URL@ PACKAGE_VERSION = @PACKAGE_VERSION@ PATH_SEPARATOR = @PATH_SEPARATOR@ RANLIB = @RANLIB@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ STRIP = @STRIP@ VERSION = @VERSION@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ abs_top_srcdir = @abs_top_srcdir@ ac_ct_AR = @ac_ct_AR@ ac_ct_CC = @ac_ct_CC@ ac_ct_CXX = @ac_ct_CXX@ ac_ct_DUMPBIN 
= @ac_ct_DUMPBIN@ am__include = @am__include@ am__leading_dot = @am__leading_dot@ am__quote = @am__quote@ am__tar = @am__tar@ am__untar = @am__untar@ bindir = @bindir@ build = @build@ build_alias = @build_alias@ build_cpu = @build_cpu@ build_os = @build_os@ build_vendor = @build_vendor@ builddir = @builddir@ datadir = @datadir@ datarootdir = @datarootdir@ docdir = @docdir@ dvidir = @dvidir@ exec_prefix = @exec_prefix@ host = @host@ host_alias = @host_alias@ host_cpu = @host_cpu@ host_os = @host_os@ host_vendor = @host_vendor@ htmldir = @htmldir@ includedir = @includedir@ infodir = @infodir@ install_sh = @install_sh@ libdir = @libdir@ libexecdir = @libexecdir@ localedir = @localedir@ localstatedir = @localstatedir@ mandir = @mandir@ mkdir_p = @mkdir_p@ oldincludedir = @oldincludedir@ pdfdir = @pdfdir@ prefix = @prefix@ program_transform_name = @program_transform_name@ psdir = @psdir@ sbindir = @sbindir@ sharedstatedir = @sharedstatedir@ srcdir = @srcdir@ sysconfdir = @sysconfdir@ target_alias = @target_alias@ top_build_prefix = @top_build_prefix@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ SUBDIRS = include lib bin api test all: all-recursive .SUFFIXES: $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) @for dep in $?; do \ case '$(am__configure_deps)' in \ *$$dep*) \ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ && { if test -f $@; then exit 0; else break; fi; }; \ exit 1;; \ esac; \ done; \ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu src/Makefile'; \ $(am__cd) $(top_srcdir) && \ $(AUTOMAKE) --gnu src/Makefile .PRECIOUS: Makefile Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status @case '$?' 
in \ *config.status*) \ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ *) \ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ esac; $(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh $(top_srcdir)/configure: $(am__configure_deps) cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh $(ACLOCAL_M4): $(am__aclocal_m4_deps) cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh $(am__aclocal_m4_deps): mostlyclean-libtool: -rm -f *.lo clean-libtool: -rm -rf .libs _libs # This directory's subdirectories are mostly independent; you can cd # into them and run `make' without going through this Makefile. # To change the values of `make' variables: instead of editing Makefiles, # (1) if the variable is set in `config.status', edit `config.status' # (which will cause the Makefiles to be regenerated when you run `make'); # (2) otherwise, pass the desired values on the `make' command line. 
$(RECURSIVE_TARGETS): @fail= failcom='exit 1'; \ for f in x $$MAKEFLAGS; do \ case $$f in \ *=* | --[!k]*);; \ *k*) failcom='fail=yes';; \ esac; \ done; \ dot_seen=no; \ target=`echo $@ | sed s/-recursive//`; \ list='$(SUBDIRS)'; for subdir in $$list; do \ echo "Making $$target in $$subdir"; \ if test "$$subdir" = "."; then \ dot_seen=yes; \ local_target="$$target-am"; \ else \ local_target="$$target"; \ fi; \ ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ || eval $$failcom; \ done; \ if test "$$dot_seen" = "no"; then \ $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \ fi; test -z "$$fail" $(RECURSIVE_CLEAN_TARGETS): @fail= failcom='exit 1'; \ for f in x $$MAKEFLAGS; do \ case $$f in \ *=* | --[!k]*);; \ *k*) failcom='fail=yes';; \ esac; \ done; \ dot_seen=no; \ case "$@" in \ distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \ *) list='$(SUBDIRS)' ;; \ esac; \ rev=''; for subdir in $$list; do \ if test "$$subdir" = "."; then :; else \ rev="$$subdir $$rev"; \ fi; \ done; \ rev="$$rev ."; \ target=`echo $@ | sed s/-recursive//`; \ for subdir in $$rev; do \ echo "Making $$target in $$subdir"; \ if test "$$subdir" = "."; then \ local_target="$$target-am"; \ else \ local_target="$$target"; \ fi; \ ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ || eval $$failcom; \ done && test -z "$$fail" tags-recursive: list='$(SUBDIRS)'; for subdir in $$list; do \ test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \ done ctags-recursive: list='$(SUBDIRS)'; for subdir in $$list; do \ test "$$subdir" = . 
|| ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \ done ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ unique=`for i in $$list; do \ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ done | \ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ END { if (nonempty) { for (i in files) print i; }; }'`; \ mkid -fID $$unique tags: TAGS TAGS: tags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ $(TAGS_FILES) $(LISP) set x; \ here=`pwd`; \ if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \ include_option=--etags-include; \ empty_fix=.; \ else \ include_option=--include; \ empty_fix=; \ fi; \ list='$(SUBDIRS)'; for subdir in $$list; do \ if test "$$subdir" = .; then :; else \ test ! -f $$subdir/TAGS || \ set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \ fi; \ done; \ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ unique=`for i in $$list; do \ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ done | \ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ END { if (nonempty) { for (i in files) print i; }; }'`; \ shift; \ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ test -n "$$unique" || unique=$$empty_fix; \ if test $$# -gt 0; then \ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ "$$@" $$unique; \ else \ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ $$unique; \ fi; \ fi ctags: CTAGS CTAGS: ctags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ $(TAGS_FILES) $(LISP) list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ unique=`for i in $$list; do \ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ done | \ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ END { if (nonempty) { for (i in files) print i; }; }'`; \ test -z "$(CTAGS_ARGS)$$unique" \ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ $$unique GTAGS: here=`$(am__cd) $(top_builddir) && pwd` \ && $(am__cd) $(top_srcdir) \ && gtags -i $(GTAGS_ARGS) "$$here" 
distclean-tags: -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags distdir: $(DISTFILES) @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ list='$(DISTFILES)'; \ dist_files=`for file in $$list; do echo $$file; done | \ sed -e "s|^$$srcdirstrip/||;t" \ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ case $$dist_files in \ */*) $(MKDIR_P) `echo "$$dist_files" | \ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ sort -u` ;; \ esac; \ for file in $$dist_files; do \ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ if test -d $$d/$$file; then \ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ if test -d "$(distdir)/$$file"; then \ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ fi; \ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ fi; \ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ else \ test -f "$(distdir)/$$file" \ || cp -p $$d/$$file "$(distdir)/$$file" \ || exit 1; \ fi; \ done @list='$(DIST_SUBDIRS)'; for subdir in $$list; do \ if test "$$subdir" = .; then :; else \ test -d "$(distdir)/$$subdir" \ || $(MKDIR_P) "$(distdir)/$$subdir" \ || exit 1; \ fi; \ done @list='$(DIST_SUBDIRS)'; for subdir in $$list; do \ if test "$$subdir" = .; then :; else \ dir1=$$subdir; dir2="$(distdir)/$$subdir"; \ $(am__relativize); \ new_distdir=$$reldir; \ dir1=$$subdir; dir2="$(top_distdir)"; \ $(am__relativize); \ new_top_distdir=$$reldir; \ echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \ echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \ ($(am__cd) $$subdir && \ $(MAKE) $(AM_MAKEFLAGS) \ top_distdir="$$new_top_distdir" \ distdir="$$new_distdir" \ am__remove_distdir=: \ am__skip_length_check=: \ am__skip_mode_fix=: \ 
distdir) \ || exit 1; \ fi; \ done check-am: all-am check: check-recursive all-am: Makefile installdirs: installdirs-recursive installdirs-am: install: install-recursive install-exec: install-exec-recursive install-data: install-data-recursive uninstall: uninstall-recursive install-am: all-am @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am installcheck: installcheck-recursive install-strip: $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ `test -z '$(STRIP)' || \ echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install mostlyclean-generic: clean-generic: distclean-generic: -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) maintainer-clean-generic: @echo "This command is intended for maintainers to use" @echo "it deletes files that may require special tools to rebuild." clean: clean-recursive clean-am: clean-generic clean-libtool mostlyclean-am distclean: distclean-recursive -rm -f Makefile distclean-am: clean-am distclean-generic distclean-tags dvi: dvi-recursive dvi-am: html: html-recursive html-am: info: info-recursive info-am: install-data-am: install-dvi: install-dvi-recursive install-dvi-am: install-exec-am: install-html: install-html-recursive install-html-am: install-info: install-info-recursive install-info-am: install-man: install-pdf: install-pdf-recursive install-pdf-am: install-ps: install-ps-recursive install-ps-am: installcheck-am: maintainer-clean: maintainer-clean-recursive -rm -f Makefile maintainer-clean-am: distclean-am maintainer-clean-generic mostlyclean: mostlyclean-recursive mostlyclean-am: mostlyclean-generic mostlyclean-libtool pdf: pdf-recursive pdf-am: ps: ps-recursive ps-am: uninstall-am: .MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) ctags-recursive \ install-am install-strip tags-recursive .PHONY: 
$(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \ all all-am check check-am clean clean-generic clean-libtool \ ctags ctags-recursive distclean distclean-generic \ distclean-libtool distclean-tags distdir dvi dvi-am html \ html-am info info-am install install-am install-data \ install-data-am install-dvi install-dvi-am install-exec \ install-exec-am install-html install-html-am install-info \ install-info-am install-man install-pdf install-pdf-am \ install-ps install-ps-am install-strip installcheck \ installcheck-am installdirs installdirs-am maintainer-clean \ maintainer-clean-generic mostlyclean mostlyclean-generic \ mostlyclean-libtool pdf pdf-am ps ps-am tags tags-recursive \ uninstall uninstall-am # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. .NOEXPORT: kytea_0.4.6+dfsg.orig/src/bin/0000755000175000017500000000000012151067113015430 5ustar koichikoichikytea_0.4.6+dfsg.orig/src/bin/Makefile.in0000644000175000017500000004052712133240156017505 0ustar koichikoichi# Makefile.in generated by automake 1.11.1 from Makefile.am. # @configure_input@ # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, # 2003, 2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, # Inc. # This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY, to the extent permitted by law; without # even the implied warranty of MERCHANTABILITY or FITNESS FOR A # PARTICULAR PURPOSE. 
@SET_MAKE@ VPATH = @srcdir@ pkgdatadir = $(datadir)/@PACKAGE@ pkgincludedir = $(includedir)/@PACKAGE@ pkglibdir = $(libdir)/@PACKAGE@ pkglibexecdir = $(libexecdir)/@PACKAGE@ am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd install_sh_DATA = $(install_sh) -c -m 644 install_sh_PROGRAM = $(install_sh) -c install_sh_SCRIPT = $(install_sh) -c INSTALL_HEADER = $(INSTALL_DATA) transform = $(program_transform_name) NORMAL_INSTALL = : PRE_INSTALL = : POST_INSTALL = : NORMAL_UNINSTALL = : PRE_UNINSTALL = : POST_UNINSTALL = : build_triplet = @build@ host_triplet = @host@ bin_PROGRAMS = kytea$(EXEEXT) train-kytea$(EXEEXT) subdir = src/bin DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 am__aclocal_m4_deps = $(top_srcdir)/configure.ac am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ $(ACLOCAL_M4) mkinstalldirs = $(install_sh) -d CONFIG_HEADER = $(top_builddir)/src/include/kytea/config.h CONFIG_CLEAN_FILES = CONFIG_CLEAN_VPATH_FILES = am__installdirs = "$(DESTDIR)$(bindir)" PROGRAMS = $(bin_PROGRAMS) am__objects_1 = am_kytea_OBJECTS = run-kytea.$(OBJEXT) $(am__objects_1) kytea_OBJECTS = $(am_kytea_OBJECTS) kytea_DEPENDENCIES = ../lib/libkytea.la am_train_kytea_OBJECTS = train-kytea.$(OBJEXT) $(am__objects_1) train_kytea_OBJECTS = $(am_train_kytea_OBJECTS) train_kytea_DEPENDENCIES = ../lib/libkytea.la DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)/src/include/kytea depcomp = $(SHELL) $(top_srcdir)/depcomp am__depfiles_maybe = depfiles am__mv = mv -f CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) LTCXXCOMPILE = $(LIBTOOL) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) CXXLD = $(CXX) CXXLINK = $(LIBTOOL) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ --mode=link $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \ 
$(LDFLAGS) -o $@ SOURCES = $(kytea_SOURCES) $(train_kytea_SOURCES) DIST_SOURCES = $(kytea_SOURCES) $(train_kytea_SOURCES) ETAGS = etags CTAGS = ctags DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) ACLOCAL = @ACLOCAL@ AMTAR = @AMTAR@ AR = @AR@ AUTOCONF = @AUTOCONF@ AUTOHEADER = @AUTOHEADER@ AUTOMAKE = @AUTOMAKE@ AWK = @AWK@ CC = @CC@ CCDEPMODE = @CCDEPMODE@ CFLAGS = @CFLAGS@ CPP = @CPP@ CPPFLAGS = @CPPFLAGS@ CXX = @CXX@ CXXCPP = @CXXCPP@ CXXDEPMODE = @CXXDEPMODE@ CXXFLAGS = @CXXFLAGS@ CYGPATH_W = @CYGPATH_W@ DEFS = @DEFS@ DEPDIR = @DEPDIR@ DLLTOOL = @DLLTOOL@ DSYMUTIL = @DSYMUTIL@ DUMPBIN = @DUMPBIN@ ECHO_C = @ECHO_C@ ECHO_N = @ECHO_N@ ECHO_T = @ECHO_T@ EGREP = @EGREP@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ INSTALL = @INSTALL@ INSTALL_DATA = @INSTALL_DATA@ INSTALL_PROGRAM = @INSTALL_PROGRAM@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ LD = @LD@ LDFLAGS = @LDFLAGS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ LIPO = @LIPO@ LN_S = @LN_S@ LTLIBOBJS = @LTLIBOBJS@ MAKEINFO = @MAKEINFO@ MANIFEST_TOOL = @MANIFEST_TOOL@ MKDIR_P = @MKDIR_P@ NM = @NM@ NMEDIT = @NMEDIT@ OBJDUMP = @OBJDUMP@ OBJEXT = @OBJEXT@ OTOOL = @OTOOL@ OTOOL64 = @OTOOL64@ PACKAGE = @PACKAGE@ PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ PACKAGE_NAME = @PACKAGE_NAME@ PACKAGE_STRING = @PACKAGE_STRING@ PACKAGE_TARNAME = @PACKAGE_TARNAME@ PACKAGE_URL = @PACKAGE_URL@ PACKAGE_VERSION = @PACKAGE_VERSION@ PATH_SEPARATOR = @PATH_SEPARATOR@ RANLIB = @RANLIB@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ STRIP = @STRIP@ VERSION = @VERSION@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ abs_top_srcdir = @abs_top_srcdir@ ac_ct_AR = @ac_ct_AR@ ac_ct_CC = @ac_ct_CC@ ac_ct_CXX = @ac_ct_CXX@ ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ am__include = @am__include@ am__leading_dot = @am__leading_dot@ am__quote = @am__quote@ am__tar = @am__tar@ am__untar = @am__untar@ bindir = @bindir@ build = @build@ build_alias = 
@build_alias@ build_cpu = @build_cpu@ build_os = @build_os@ build_vendor = @build_vendor@ builddir = @builddir@ datadir = @datadir@ datarootdir = @datarootdir@ docdir = @docdir@ dvidir = @dvidir@ exec_prefix = @exec_prefix@ host = @host@ host_alias = @host_alias@ host_cpu = @host_cpu@ host_os = @host_os@ host_vendor = @host_vendor@ htmldir = @htmldir@ includedir = @includedir@ infodir = @infodir@ install_sh = @install_sh@ libdir = @libdir@ libexecdir = @libexecdir@ localedir = @localedir@ localstatedir = @localstatedir@ mandir = @mandir@ mkdir_p = @mkdir_p@ oldincludedir = @oldincludedir@ pdfdir = @pdfdir@ prefix = @prefix@ program_transform_name = @program_transform_name@ psdir = @psdir@ sbindir = @sbindir@ sharedstatedir = @sharedstatedir@ srcdir = @srcdir@ sysconfdir = @sysconfdir@ target_alias = @target_alias@ top_build_prefix = @top_build_prefix@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ # KYTH = kytea.h corpus-io.h model-io.h string-util.h \ # kytea-model.h kytea-string.h kytea-struct.h dictionary.h general-io.h \ # kytea-config.h KYTH = AM_CPPFLAGS = -I$(srcdir)/../include -DPKGDATADIR='"$(pkgdatadir)"' kytea_SOURCES = run-kytea.cpp ${KYTH} kytea_LDADD = ../lib/libkytea.la train_kytea_SOURCES = train-kytea.cpp ${KYTH} train_kytea_LDADD = ../lib/libkytea.la all: all-am .SUFFIXES: .SUFFIXES: .cpp .lo .o .obj $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) @for dep in $?; do \ case '$(am__configure_deps)' in \ *$$dep*) \ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ && { if test -f $@; then exit 0; else break; fi; }; \ exit 1;; \ esac; \ done; \ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu src/bin/Makefile'; \ $(am__cd) $(top_srcdir) && \ $(AUTOMAKE) --gnu src/bin/Makefile .PRECIOUS: Makefile Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status @case '$?' 
in \ *config.status*) \ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ *) \ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ esac; $(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh $(top_srcdir)/configure: $(am__configure_deps) cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh $(ACLOCAL_M4): $(am__aclocal_m4_deps) cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh $(am__aclocal_m4_deps): install-binPROGRAMS: $(bin_PROGRAMS) @$(NORMAL_INSTALL) test -z "$(bindir)" || $(MKDIR_P) "$(DESTDIR)$(bindir)" @list='$(bin_PROGRAMS)'; test -n "$(bindir)" || list=; \ for p in $$list; do echo "$$p $$p"; done | \ sed 's/$(EXEEXT)$$//' | \ while read p p1; do if test -f $$p || test -f $$p1; \ then echo "$$p"; echo "$$p"; else :; fi; \ done | \ sed -e 'p;s,.*/,,;n;h' -e 's|.*|.|' \ -e 'p;x;s,.*/,,;s/$(EXEEXT)$$//;$(transform);s/$$/$(EXEEXT)/' | \ sed 'N;N;N;s,\n, ,g' | \ $(AWK) 'BEGIN { files["."] = ""; dirs["."] = 1 } \ { d=$$3; if (dirs[d] != 1) { print "d", d; dirs[d] = 1 } \ if ($$2 == $$4) files[d] = files[d] " " $$1; \ else { print "f", $$3 "/" $$4, $$1; } } \ END { for (d in files) print "f", d, files[d] }' | \ while read type dir files; do \ if test "$$dir" = .; then dir=; else dir=/$$dir; fi; \ test -z "$$files" || { \ echo " $(INSTALL_PROGRAM_ENV) $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL_PROGRAM) $$files '$(DESTDIR)$(bindir)$$dir'"; \ $(INSTALL_PROGRAM_ENV) $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL_PROGRAM) $$files "$(DESTDIR)$(bindir)$$dir" || exit $$?; \ } \ ; done uninstall-binPROGRAMS: @$(NORMAL_UNINSTALL) @list='$(bin_PROGRAMS)'; test -n "$(bindir)" || list=; \ files=`for p in $$list; do echo "$$p"; done | \ sed -e 'h;s,^.*/,,;s/$(EXEEXT)$$//;$(transform)' \ -e 
's/$$/$(EXEEXT)/' `; \ test -n "$$list" || exit 0; \ echo " ( cd '$(DESTDIR)$(bindir)' && rm -f" $$files ")"; \ cd "$(DESTDIR)$(bindir)" && rm -f $$files clean-binPROGRAMS: @list='$(bin_PROGRAMS)'; test -n "$$list" || exit 0; \ echo " rm -f" $$list; \ rm -f $$list || exit $$?; \ test -n "$(EXEEXT)" || exit 0; \ list=`for p in $$list; do echo "$$p"; done | sed 's/$(EXEEXT)$$//'`; \ echo " rm -f" $$list; \ rm -f $$list kytea$(EXEEXT): $(kytea_OBJECTS) $(kytea_DEPENDENCIES) @rm -f kytea$(EXEEXT) $(CXXLINK) $(kytea_OBJECTS) $(kytea_LDADD) $(LIBS) train-kytea$(EXEEXT): $(train_kytea_OBJECTS) $(train_kytea_DEPENDENCIES) @rm -f train-kytea$(EXEEXT) $(CXXLINK) $(train_kytea_OBJECTS) $(train_kytea_LDADD) $(LIBS) mostlyclean-compile: -rm -f *.$(OBJEXT) distclean-compile: -rm -f *.tab.c @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/run-kytea.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/train-kytea.Po@am__quote@ .cpp.o: @am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< @am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po @AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ $< .cpp.obj: @am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` @am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po @AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` .cpp.lo: @am__fastdepCXX_TRUE@ $(LTCXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< @am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo @AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ 
@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(LTCXXCOMPILE) -c -o $@ $< mostlyclean-libtool: -rm -f *.lo clean-libtool: -rm -rf .libs _libs ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ unique=`for i in $$list; do \ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ done | \ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ END { if (nonempty) { for (i in files) print i; }; }'`; \ mkid -fID $$unique tags: TAGS TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ $(TAGS_FILES) $(LISP) set x; \ here=`pwd`; \ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ unique=`for i in $$list; do \ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ done | \ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ END { if (nonempty) { for (i in files) print i; }; }'`; \ shift; \ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ test -n "$$unique" || unique=$$empty_fix; \ if test $$# -gt 0; then \ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ "$$@" $$unique; \ else \ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ $$unique; \ fi; \ fi ctags: CTAGS CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ $(TAGS_FILES) $(LISP) list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ unique=`for i in $$list; do \ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ done | \ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ END { if (nonempty) { for (i in files) print i; }; }'`; \ test -z "$(CTAGS_ARGS)$$unique" \ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ $$unique GTAGS: here=`$(am__cd) $(top_builddir) && pwd` \ && $(am__cd) $(top_srcdir) \ && gtags -i $(GTAGS_ARGS) "$$here" distclean-tags: -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags distdir: $(DISTFILES) @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ list='$(DISTFILES)'; \ 
dist_files=`for file in $$list; do echo $$file; done | \ sed -e "s|^$$srcdirstrip/||;t" \ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ case $$dist_files in \ */*) $(MKDIR_P) `echo "$$dist_files" | \ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ sort -u` ;; \ esac; \ for file in $$dist_files; do \ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ if test -d $$d/$$file; then \ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ if test -d "$(distdir)/$$file"; then \ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ fi; \ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ fi; \ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ else \ test -f "$(distdir)/$$file" \ || cp -p $$d/$$file "$(distdir)/$$file" \ || exit 1; \ fi; \ done check-am: all-am check: check-am all-am: Makefile $(PROGRAMS) installdirs: for dir in "$(DESTDIR)$(bindir)"; do \ test -z "$$dir" || $(MKDIR_P) "$$dir"; \ done install: install-am install-exec: install-exec-am install-data: install-data-am uninstall: uninstall-am install-am: all-am @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am installcheck: installcheck-am install-strip: $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ `test -z '$(STRIP)' || \ echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install mostlyclean-generic: clean-generic: distclean-generic: -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) maintainer-clean-generic: @echo "This command is intended for maintainers to use" @echo "it deletes files that may require special tools to rebuild." 
clean: clean-am clean-am: clean-binPROGRAMS clean-generic clean-libtool mostlyclean-am distclean: distclean-am -rm -rf ./$(DEPDIR) -rm -f Makefile distclean-am: clean-am distclean-compile distclean-generic \ distclean-tags dvi: dvi-am dvi-am: html: html-am html-am: info: info-am info-am: install-data-am: install-dvi: install-dvi-am install-dvi-am: install-exec-am: install-binPROGRAMS install-html: install-html-am install-html-am: install-info: install-info-am install-info-am: install-man: install-pdf: install-pdf-am install-pdf-am: install-ps: install-ps-am install-ps-am: installcheck-am: maintainer-clean: maintainer-clean-am -rm -rf ./$(DEPDIR) -rm -f Makefile maintainer-clean-am: distclean-am maintainer-clean-generic mostlyclean: mostlyclean-am mostlyclean-am: mostlyclean-compile mostlyclean-generic \ mostlyclean-libtool pdf: pdf-am pdf-am: ps: ps-am ps-am: uninstall-am: uninstall-binPROGRAMS .MAKE: install-am install-strip .PHONY: CTAGS GTAGS all all-am check check-am clean clean-binPROGRAMS \ clean-generic clean-libtool ctags distclean distclean-compile \ distclean-generic distclean-libtool distclean-tags distdir dvi \ dvi-am html html-am info info-am install install-am \ install-binPROGRAMS install-data install-data-am install-dvi \ install-dvi-am install-exec install-exec-am install-html \ install-html-am install-info install-info-am install-man \ install-pdf install-pdf-am install-ps install-ps-am \ install-strip installcheck installcheck-am installdirs \ maintainer-clean maintainer-clean-generic mostlyclean \ mostlyclean-compile mostlyclean-generic mostlyclean-libtool \ pdf pdf-am ps ps-am tags uninstall uninstall-am \ uninstall-binPROGRAMS # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. 
.NOEXPORT: kytea_0.4.6+dfsg.orig/src/bin/run-kytea.cpp0000644000175000017500000000236212122355536020065 0ustar koichikoichi/* * Copyright 2009, KyTea Development Team * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include using namespace std; using namespace kytea; // trains a pronunciation estimation model using a corpus and a dictionary int main(int argv, const char **argc) { #ifndef KYTEA_SAFE try { #endif KyteaConfig * config = new KyteaConfig; config->setDebug(0); config->setOnTraining(false); config->parseRunCommandLine(argv, argc); Kytea kytea(config); kytea.analyze(); return 0; #ifndef KYTEA_SAFE } catch (exception &e) { cerr << endl; cerr << " KyTea Error: " << e.what() << endl; return 1; } #endif } kytea_0.4.6+dfsg.orig/src/bin/Makefile.am0000644000175000017500000000065612122355536017502 0ustar koichikoichi# KYTH = kytea.h corpus-io.h model-io.h string-util.h \ # kytea-model.h kytea-string.h kytea-struct.h dictionary.h general-io.h \ # kytea-config.h KYTH = AM_CPPFLAGS = -I$(srcdir)/../include -DPKGDATADIR='"$(pkgdatadir)"' bin_PROGRAMS = kytea train-kytea kytea_SOURCES = run-kytea.cpp ${KYTH} kytea_LDADD = ../lib/libkytea.la train_kytea_SOURCES = train-kytea.cpp ${KYTH} train_kytea_LDADD = ../lib/libkytea.la kytea_0.4.6+dfsg.orig/src/bin/train-kytea.cpp0000644000175000017500000000234212122355536020374 0ustar koichikoichi/* * Copyright 2009, KyTea Development Team * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use 
this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include using namespace std; using namespace kytea; // trains a KyTea model int main(int argv, const char **argc) { #ifndef KYTEA_SAFE try { #endif KyteaConfig * config = new KyteaConfig; config->setDebug(1); config->setOnTraining(true); config->parseTrainCommandLine(argv, argc); Kytea kytea(config); kytea.trainAll(); return 0; #ifndef KYTEA_SAFE } catch (exception &e) { cerr << endl; cerr << " KyTea Error: " << e.what() << endl; return 1; } #endif } kytea_0.4.6+dfsg.orig/src/include/0000755000175000017500000000000012151067112016302 5ustar koichikoichikytea_0.4.6+dfsg.orig/src/include/Makefile.in0000644000175000017500000003374012133240157020360 0ustar koichikoichi# Makefile.in generated by automake 1.11.1 from Makefile.am. # @configure_input@ # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, # 2003, 2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, # Inc. # This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY, to the extent permitted by law; without # even the implied warranty of MERCHANTABILITY or FITNESS FOR A # PARTICULAR PURPOSE. 
@SET_MAKE@ VPATH = @srcdir@ pkgdatadir = $(datadir)/@PACKAGE@ pkgincludedir = $(includedir)/@PACKAGE@ pkglibdir = $(libdir)/@PACKAGE@ pkglibexecdir = $(libexecdir)/@PACKAGE@ am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd install_sh_DATA = $(install_sh) -c -m 644 install_sh_PROGRAM = $(install_sh) -c install_sh_SCRIPT = $(install_sh) -c INSTALL_HEADER = $(INSTALL_DATA) transform = $(program_transform_name) NORMAL_INSTALL = : PRE_INSTALL = : POST_INSTALL = : NORMAL_UNINSTALL = : PRE_UNINSTALL = : POST_UNINSTALL = : build_triplet = @build@ host_triplet = @host@ subdir = src/include DIST_COMMON = $(nobase_include_HEADERS) $(srcdir)/Makefile.am \ $(srcdir)/Makefile.in ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 am__aclocal_m4_deps = $(top_srcdir)/configure.ac am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ $(ACLOCAL_M4) mkinstalldirs = $(install_sh) -d CONFIG_HEADER = $(top_builddir)/src/include/kytea/config.h CONFIG_CLEAN_FILES = CONFIG_CLEAN_VPATH_FILES = SOURCES = DIST_SOURCES = am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; am__vpath_adj = case $$p in \ $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ *) f=$$p;; \ esac; am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; am__install_max = 40 am__nobase_strip_setup = \ srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` am__nobase_strip = \ for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" am__nobase_list = $(am__nobase_strip_setup); \ for p in $$list; do echo "$$p $$p"; done | \ sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ if (++n[$$2] == $(am__install_max)) \ { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ END { for (dir in files) print dir, files[dir] }' am__base_list = \ sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' am__installdirs = "$(DESTDIR)$(includedir)" HEADERS = 
$(nobase_include_HEADERS) ETAGS = etags CTAGS = ctags DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) ACLOCAL = @ACLOCAL@ AMTAR = @AMTAR@ AR = @AR@ AUTOCONF = @AUTOCONF@ AUTOHEADER = @AUTOHEADER@ AUTOMAKE = @AUTOMAKE@ AWK = @AWK@ CC = @CC@ CCDEPMODE = @CCDEPMODE@ CFLAGS = @CFLAGS@ CPP = @CPP@ CPPFLAGS = @CPPFLAGS@ CXX = @CXX@ CXXCPP = @CXXCPP@ CXXDEPMODE = @CXXDEPMODE@ CXXFLAGS = @CXXFLAGS@ CYGPATH_W = @CYGPATH_W@ DEFS = @DEFS@ DEPDIR = @DEPDIR@ DLLTOOL = @DLLTOOL@ DSYMUTIL = @DSYMUTIL@ DUMPBIN = @DUMPBIN@ ECHO_C = @ECHO_C@ ECHO_N = @ECHO_N@ ECHO_T = @ECHO_T@ EGREP = @EGREP@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ INSTALL = @INSTALL@ INSTALL_DATA = @INSTALL_DATA@ INSTALL_PROGRAM = @INSTALL_PROGRAM@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ LD = @LD@ LDFLAGS = @LDFLAGS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ LIPO = @LIPO@ LN_S = @LN_S@ LTLIBOBJS = @LTLIBOBJS@ MAKEINFO = @MAKEINFO@ MANIFEST_TOOL = @MANIFEST_TOOL@ MKDIR_P = @MKDIR_P@ NM = @NM@ NMEDIT = @NMEDIT@ OBJDUMP = @OBJDUMP@ OBJEXT = @OBJEXT@ OTOOL = @OTOOL@ OTOOL64 = @OTOOL64@ PACKAGE = @PACKAGE@ PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ PACKAGE_NAME = @PACKAGE_NAME@ PACKAGE_STRING = @PACKAGE_STRING@ PACKAGE_TARNAME = @PACKAGE_TARNAME@ PACKAGE_URL = @PACKAGE_URL@ PACKAGE_VERSION = @PACKAGE_VERSION@ PATH_SEPARATOR = @PATH_SEPARATOR@ RANLIB = @RANLIB@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ STRIP = @STRIP@ VERSION = @VERSION@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ abs_top_srcdir = @abs_top_srcdir@ ac_ct_AR = @ac_ct_AR@ ac_ct_CC = @ac_ct_CC@ ac_ct_CXX = @ac_ct_CXX@ ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ am__include = @am__include@ am__leading_dot = @am__leading_dot@ am__quote = @am__quote@ am__tar = @am__tar@ am__untar = @am__untar@ bindir = @bindir@ build = @build@ build_alias = @build_alias@ build_cpu = @build_cpu@ build_os = @build_os@ build_vendor = @build_vendor@ builddir 
= @builddir@ datadir = @datadir@ datarootdir = @datarootdir@ docdir = @docdir@ dvidir = @dvidir@ exec_prefix = @exec_prefix@ host = @host@ host_alias = @host_alias@ host_cpu = @host_cpu@ host_os = @host_os@ host_vendor = @host_vendor@ htmldir = @htmldir@ includedir = @includedir@ infodir = @infodir@ install_sh = @install_sh@ libdir = @libdir@ libexecdir = @libexecdir@ localedir = @localedir@ localstatedir = @localstatedir@ mandir = @mandir@ mkdir_p = @mkdir_p@ oldincludedir = @oldincludedir@ pdfdir = @pdfdir@ prefix = @prefix@ program_transform_name = @program_transform_name@ psdir = @psdir@ sbindir = @sbindir@ sharedstatedir = @sharedstatedir@ srcdir = @srcdir@ sysconfdir = @sysconfdir@ target_alias = @target_alias@ top_build_prefix = @top_build_prefix@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ nobase_include_HEADERS = kytea/config.h \ kytea/corpus-io.h \ kytea/corpus-io-eda.h \ kytea/corpus-io-full.h \ kytea/corpus-io-part.h \ kytea/corpus-io-prob.h \ kytea/corpus-io-raw.h \ kytea/corpus-io-tokenized.h \ kytea/dictionary.h \ kytea/feature-io.h \ kytea/feature-lookup.h \ kytea/feature-vector.h \ kytea/general-io.h \ kytea/kytea-config.h \ kytea/kytea.h \ kytea/kytea-lm.h \ kytea/kytea-model.h \ kytea/kytea-string.h \ kytea/kytea-struct.h \ kytea/kytea-util.h \ kytea/model-io.h \ kytea/model-io-binary.h \ kytea/model-io-text.h \ kytea/string-util.h \ kytea/string-util-map-euc.h \ kytea/string-util-map-sjis.h \ kytea/string-util-map-utf8.h all: all-am .SUFFIXES: $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) @for dep in $?; do \ case '$(am__configure_deps)' in \ *$$dep*) \ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ && { if test -f $@; then exit 0; else break; fi; }; \ exit 1;; \ esac; \ done; \ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu src/include/Makefile'; \ $(am__cd) $(top_srcdir) && \ $(AUTOMAKE) --gnu src/include/Makefile .PRECIOUS: Makefile Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status 
@case '$?' in \ *config.status*) \ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ *) \ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ esac; $(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh $(top_srcdir)/configure: $(am__configure_deps) cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh $(ACLOCAL_M4): $(am__aclocal_m4_deps) cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh $(am__aclocal_m4_deps): mostlyclean-libtool: -rm -f *.lo clean-libtool: -rm -rf .libs _libs install-nobase_includeHEADERS: $(nobase_include_HEADERS) @$(NORMAL_INSTALL) test -z "$(includedir)" || $(MKDIR_P) "$(DESTDIR)$(includedir)" @list='$(nobase_include_HEADERS)'; test -n "$(includedir)" || list=; \ $(am__nobase_list) | while read dir files; do \ xfiles=; for file in $$files; do \ if test -f "$$file"; then xfiles="$$xfiles $$file"; \ else xfiles="$$xfiles $(srcdir)/$$file"; fi; done; \ test -z "$$xfiles" || { \ test "x$$dir" = x. 
|| { \ echo "$(MKDIR_P) '$(DESTDIR)$(includedir)/$$dir'"; \ $(MKDIR_P) "$(DESTDIR)$(includedir)/$$dir"; }; \ echo " $(INSTALL_HEADER) $$xfiles '$(DESTDIR)$(includedir)/$$dir'"; \ $(INSTALL_HEADER) $$xfiles "$(DESTDIR)$(includedir)/$$dir" || exit $$?; }; \ done uninstall-nobase_includeHEADERS: @$(NORMAL_UNINSTALL) @list='$(nobase_include_HEADERS)'; test -n "$(includedir)" || list=; \ $(am__nobase_strip_setup); files=`$(am__nobase_strip)`; \ test -n "$$files" || exit 0; \ echo " ( cd '$(DESTDIR)$(includedir)' && rm -f" $$files ")"; \ cd "$(DESTDIR)$(includedir)" && rm -f $$files ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ unique=`for i in $$list; do \ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ done | \ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ END { if (nonempty) { for (i in files) print i; }; }'`; \ mkid -fID $$unique tags: TAGS TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ $(TAGS_FILES) $(LISP) set x; \ here=`pwd`; \ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ unique=`for i in $$list; do \ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ done | \ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ END { if (nonempty) { for (i in files) print i; }; }'`; \ shift; \ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ test -n "$$unique" || unique=$$empty_fix; \ if test $$# -gt 0; then \ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ "$$@" $$unique; \ else \ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ $$unique; \ fi; \ fi ctags: CTAGS CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ $(TAGS_FILES) $(LISP) list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ unique=`for i in $$list; do \ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ done | \ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ END { if (nonempty) { for (i in files) print i; }; }'`; \ test -z "$(CTAGS_ARGS)$$unique" \ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) 
$(CTAGS_ARGS) \ $$unique GTAGS: here=`$(am__cd) $(top_builddir) && pwd` \ && $(am__cd) $(top_srcdir) \ && gtags -i $(GTAGS_ARGS) "$$here" distclean-tags: -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags distdir: $(DISTFILES) @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ list='$(DISTFILES)'; \ dist_files=`for file in $$list; do echo $$file; done | \ sed -e "s|^$$srcdirstrip/||;t" \ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ case $$dist_files in \ */*) $(MKDIR_P) `echo "$$dist_files" | \ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ sort -u` ;; \ esac; \ for file in $$dist_files; do \ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ if test -d $$d/$$file; then \ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ if test -d "$(distdir)/$$file"; then \ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ fi; \ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ fi; \ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ else \ test -f "$(distdir)/$$file" \ || cp -p $$d/$$file "$(distdir)/$$file" \ || exit 1; \ fi; \ done check-am: all-am check: check-am all-am: Makefile $(HEADERS) installdirs: for dir in "$(DESTDIR)$(includedir)"; do \ test -z "$$dir" || $(MKDIR_P) "$$dir"; \ done install: install-am install-exec: install-exec-am install-data: install-data-am uninstall: uninstall-am install-am: all-am @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am installcheck: installcheck-am install-strip: $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ `test -z '$(STRIP)' || \ echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install mostlyclean-generic: clean-generic: distclean-generic: -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) maintainer-clean-generic: @echo "This command is intended for maintainers to use" @echo "it deletes files that may require special tools to rebuild." 
clean: clean-am clean-am: clean-generic clean-libtool mostlyclean-am distclean: distclean-am -rm -f Makefile distclean-am: clean-am distclean-generic distclean-tags dvi: dvi-am dvi-am: html: html-am html-am: info: info-am info-am: install-data-am: install-nobase_includeHEADERS install-dvi: install-dvi-am install-dvi-am: install-exec-am: install-html: install-html-am install-html-am: install-info: install-info-am install-info-am: install-man: install-pdf: install-pdf-am install-pdf-am: install-ps: install-ps-am install-ps-am: installcheck-am: maintainer-clean: maintainer-clean-am -rm -f Makefile maintainer-clean-am: distclean-am maintainer-clean-generic mostlyclean: mostlyclean-am mostlyclean-am: mostlyclean-generic mostlyclean-libtool pdf: pdf-am pdf-am: ps: ps-am ps-am: uninstall-am: uninstall-nobase_includeHEADERS .MAKE: install-am install-strip .PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \ clean-libtool ctags distclean distclean-generic \ distclean-libtool distclean-tags distdir dvi dvi-am html \ html-am info info-am install install-am install-data \ install-data-am install-dvi install-dvi-am install-exec \ install-exec-am install-html install-html-am install-info \ install-info-am install-man install-nobase_includeHEADERS \ install-pdf install-pdf-am install-ps install-ps-am \ install-strip installcheck installcheck-am installdirs \ maintainer-clean maintainer-clean-generic mostlyclean \ mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ tags uninstall uninstall-am uninstall-nobase_includeHEADERS # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. 
.NOEXPORT: kytea_0.4.6+dfsg.orig/src/include/kytea/0000755000175000017500000000000012151067112017417 5ustar koichikoichikytea_0.4.6+dfsg.orig/src/include/kytea/string-util-map-utf8.h0000644000175000017500000000415712122355536023527 0ustar koichikoichi// #define STRING_UTIL_ORIG_UTF8 "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789(){}<>「」[]-~.-/_,%?、―"'・─+:–!。&*@=" #define STRING_UTIL_ORIG_UTF8 "\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f\x50\x51\x52\x53\x54\x55\x56\x57\x58\x59\x5a\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x28\x29\x7b\x7d\x3c\x3e\xef\xbd\xa2\xef\xbd\xa3\x5b\x5d\x2d\xef\xbd\x9e\x2e\xef\xbc\x8d\x2f\x5f\x2c\x25\x3f\xef\xbd\xa4\xe2\x80\x95\x22\x27\xef\xbd\xa5\xe2\x94\x80\x2b\x3a\xe2\x80\x93\x21\xef\xbd\xa1\x26\x2a\x40\x3d" // #define STRING_UTIL_NORM_UTF8 "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789(){}<>「」[]−〜。ー/_,%?、ー”’・ー+:ー!。&*@=" #define STRING_UTIL_NORM_UTF8 
"\xef\xbd\x81\xef\xbd\x82\xef\xbd\x83\xef\xbd\x84\xef\xbd\x85\xef\xbd\x86\xef\xbd\x87\xef\xbd\x88\xef\xbd\x89\xef\xbd\x8a\xef\xbd\x8b\xef\xbd\x8c\xef\xbd\x8d\xef\xbd\x8e\xef\xbd\x8f\xef\xbd\x90\xef\xbd\x91\xef\xbd\x92\xef\xbd\x93\xef\xbd\x94\xef\xbd\x95\xef\xbd\x96\xef\xbd\x97\xef\xbd\x98\xef\xbd\x99\xef\xbd\x9a\xef\xbc\xa1\xef\xbc\xa2\xef\xbc\xa3\xef\xbc\xa4\xef\xbc\xa5\xef\xbc\xa6\xef\xbc\xa7\xef\xbc\xa8\xef\xbc\xa9\xef\xbc\xaa\xef\xbc\xab\xef\xbc\xac\xef\xbc\xad\xef\xbc\xae\xef\xbc\xaf\xef\xbc\xb0\xef\xbc\xb1\xef\xbc\xb2\xef\xbc\xb3\xef\xbc\xb4\xef\xbc\xb5\xef\xbc\xb6\xef\xbc\xb7\xef\xbc\xb8\xef\xbc\xb9\xef\xbc\xba\xef\xbc\x90\xef\xbc\x91\xef\xbc\x92\xef\xbc\x93\xef\xbc\x94\xef\xbc\x95\xef\xbc\x96\xef\xbc\x97\xef\xbc\x98\xef\xbc\x99\xef\xbc\x88\xef\xbc\x89\xef\xbd\x9b\xef\xbd\x9d\xef\xbc\x9c\xef\xbc\x9e\xe3\x80\x8c\xe3\x80\x8d\xef\xbc\xbb\xef\xbc\xbd\xe2\x88\x92\xe3\x80\x9c\xe3\x80\x82\xe3\x83\xbc\xef\xbc\x8f\xef\xbc\xbf\xef\xbc\x8c\xef\xbc\x85\xef\xbc\x9f\xe3\x80\x81\xe3\x83\xbc\xe2\x80\x9d\xe2\x80\x99\xe3\x83\xbb\xe3\x83\xbc\xef\xbc\x8b\xef\xbc\x9a\xe3\x83\xbc\xef\xbc\x81\xe3\x80\x82\xef\xbc\x86\xef\xbc\x8a\xef\xbc\xa0\xef\xbc\x9d" kytea_0.4.6+dfsg.orig/src/include/kytea/string-util-map-sjis.h0000644000175000017500000000326612122355536023611 0ustar koichikoichi// #define STRING_UTIL_ORIG_SJIS "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789(){}<>「」[]-~.-/_,%?、―"'・─+:–!。&*@=" #define STRING_UTIL_ORIG_SJIS "\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f\x50\x51\x52\x53\x54\x55\x56\x57\x58\x59\x5a\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x28\x29\x7b\x7d\x3c\x3e\x81\x75\x81\x76\x5b\x5d\x2d\x81\x60\x2e\x81\x7c\x2f\x5f\x2c\x25\x3f\x81\x41\x81\x5c\x22\x27\x81\x45\x84\x9f\x2b\x3a\x21\x81\x42\x26\x2a\x40\x3d" // #define STRING_UTIL_NORM_SJIS "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789(){}<>「」[]−〜。ー/_,%?、ー”’・ー+:ー!。&*@=" 
#define STRING_UTIL_NORM_SJIS "\x82\x81\x82\x82\x82\x83\x82\x84\x82\x85\x82\x86\x82\x87\x82\x88\x82\x89\x82\x8a\x82\x8b\x82\x8c\x82\x8d\x82\x8e\x82\x8f\x82\x90\x82\x91\x82\x92\x82\x93\x82\x94\x82\x95\x82\x96\x82\x97\x82\x98\x82\x99\x82\x9a\x82\x60\x82\x61\x82\x62\x82\x63\x82\x64\x82\x65\x82\x66\x82\x67\x82\x68\x82\x69\x82\x6a\x82\x6b\x82\x6c\x82\x6d\x82\x6e\x82\x6f\x82\x70\x82\x71\x82\x72\x82\x73\x82\x74\x82\x75\x82\x76\x82\x77\x82\x78\x82\x79\x82\x4f\x82\x50\x82\x51\x82\x52\x82\x53\x82\x54\x82\x55\x82\x56\x82\x57\x82\x58\x81\x69\x81\x6a\x81\x6f\x81\x70\x81\x83\x81\x84\x81\x75\x81\x76\x81\x6d\x81\x6e\x81\x7c\x81\x60\x81\x42\x81\x5b\x81\x5e\x81\x51\x81\x43\x81\x93\x81\x48\x81\x41\x81\x5b\x81\x68\x81\x66\x81\x45\x81\x5b\x81\x7b\x81\x46\x81\x49\x81\x42\x81\x95\x81\x96\x81\x97\x81\x81" kytea_0.4.6+dfsg.orig/src/include/kytea/model-io.h0000644000175000017500000000551512122355536021313 0ustar koichikoichi/* * Copyright 2009, KyTea Development Team * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef MODEL_IO_H__ #define MODEL_IO_H__ #include #include #include #include #if DISABLE_QUANTIZE # define MODEL_IO_VERSION "0.4.0NQ" #else # define MODEL_IO_VERSION "0.4.0" #endif namespace kytea { class FeatureLookup; class KyteaModel; class KyteaLM; class ModelTagEntry; class ProbTagEntry; class ModelIO : public GeneralIO { public: typedef char Format; const static Format FORMAT_BINARY = 'B'; const static Format FORMAT_TEXT = 'T'; const static Format FORMAT_UNKNOWN = 'U'; int numTags_; public: ModelIO(StringUtil* util) : GeneralIO(util) { } ModelIO(StringUtil* util, const char* file, bool out, bool bin) : GeneralIO(util,file,out,bin) { } ModelIO(StringUtil* util, std::iostream & str, bool out, bool bin) : GeneralIO(util,str,out,bin) { } virtual ~ModelIO() { } static ModelIO* createIO(const char* file, Format form, bool output, KyteaConfig & config); static ModelIO* createIO(std::iostream & str, Format form, bool output, KyteaConfig & config); virtual void writeConfig(const KyteaConfig & conf) = 0; virtual void writeModel(const KyteaModel * mod) = 0; virtual void writeWordList(const std::vector & list) = 0; virtual void writeLM(const KyteaLM * mod) = 0; virtual void writeFeatVec(const FeatVec * vec) = 0; virtual void readConfig(KyteaConfig & conf) = 0; virtual KyteaModel * readModel() = 0; virtual std::vector readWordList() = 0; virtual KyteaLM * readLM() = 0; virtual FeatVec * readFeatVec() = 0; // These must be explicitly expanded because templated virtuals are not allowed virtual void writeModelDictionary(const Dictionary * dict) = 0; virtual void writeProbDictionary(const Dictionary * dict) = 0; virtual void writeVectorDictionary(const Dictionary * dict) = 0; virtual Dictionary * readModelDictionary() = 0; virtual Dictionary * readProbDictionary() = 0; virtual Dictionary * readVectorDictionary() = 0; virtual void writeFeatureLookup(const FeatureLookup * featLookup) = 0; virtual FeatureLookup * readFeatureLookup() = 0; }; } #endif 
kytea_0.4.6+dfsg.orig/src/include/kytea/kytea-model.h0000644000175000017500000001271012122355536022014 0ustar koichikoichi/* * Copyright 2009, KyTea Development Team * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef KYTEA_MODEL_H__ #define KYTEA_MODEL_H__ #include #include #include #include #define SIG_CUTOFF 1E-6 namespace kytea { typedef std::vector FeatNameVec; class FeatureLookup; template class Dictionary; class KyteaModel { public: static inline bool isProbabilistic(int solver) { return solver == 0 || solver == 6 || solver == 7; } static int featuresAdded_; protected: KyteaUnsignedMap ids_; FeatNameVec names_; FeatNameVec oldNames_; std::vector labels_; std::vector weights_; double multiplier_; double bias_; int solver_, numW_; bool addFeat_; FeatureLookup * featLookup_; public: KyteaModel() : multiplier_(1.0f), bias_(1.0f), solver_(1), addFeat_(true), featLookup_(NULL) { KyteaString str; mapFeat(str); } ~KyteaModel(); // Check that two models are equal, and throw an error if they aren't // Mainly used for making sure that model IO is working properly void checkEqual(const KyteaModel & rhs) const; // feature functions inline unsigned mapFeat(const KyteaString & str) { KyteaUnsignedMap::const_iterator it = ids_.find(str); unsigned ret = 0; if(it != ids_.end()) ret = it->second; else if(addFeat_) { ret = names_.size(); ids_[str] = ret; names_.push_back(str); } // std::cerr << "mapFeat:"; for(unsigned i=0;i "<= names_.size()) THROW_ERROR("FATAL: Array index out of 
bounds in showFeat ("<= "< > runClassifier(const std::vector & feat); // std::pair runClassifier(const std::vector & feat); void printClassifier(const std::vector & feat, StringUtil * util, std::ostream & out = std::cerr); void trainModel(const std::vector< std::vector > & xs, std::vector & ys, double bias, int solver, double epsilon, double cost); void trimModel(); inline const KyteaUnsignedMap & getIds() const { return ids_; } inline const unsigned getNumFeatures() const { return names_.size()-1; } inline const double getBias() const { return bias_; } inline const unsigned getNumWeights() const { return numW_; } inline const int getSolver() const { return solver_; } inline const unsigned getNumClasses() const { return labels_.size(); } inline const int getLabel(unsigned idx) const { return labels_[idx]; } inline FeatureLookup * getFeatureLookup() const { return featLookup_; } inline const FeatVal getWeight(unsigned i, unsigned j) const { int id = i*numW_+j; #ifdef KYTEA_SAFE if(id >= (int)weights_.size()) THROW_ERROR("weight out of bounds: size="<= (int)weights_.size()) THROW_ERROR("weight out of bounds: size="< header file. */ #undef HAVE_BOOST_TR1_UNORDERED_MAP_HPP /* Define to 1 if you have the header file. */ #undef HAVE_DLFCN_H /* Define to 1 if you have the header file. */ #undef HAVE_EXT_HASH_MAP /* Define to 1 if you have the header file. */ #undef HAVE_INTTYPES_H /* Define to 1 if you have the header file. */ #undef HAVE_MEMORY_H /* Define to 1 if stdbool.h conforms to C99. */ #undef HAVE_STDBOOL_H /* Define to 1 if you have the header file. */ #undef HAVE_STDINT_H /* Define to 1 if you have the header file. */ #undef HAVE_STDLIB_H /* Define to 1 if you have the header file. */ #undef HAVE_STRINGS_H /* Define to 1 if you have the header file. */ #undef HAVE_STRING_H /* Define to 1 if you have the header file. */ #undef HAVE_SYS_STAT_H /* Define to 1 if you have the header file. */ #undef HAVE_SYS_TYPES_H /* Define to 1 if you have the header file. 
*/ #undef HAVE_TR1_UNORDERED_MAP /* Define to 1 if you have the header file. */ #undef HAVE_UNISTD_H /* Define to 1 if the system has the type `_Bool'. */ #undef HAVE__BOOL /* Define to the sub-directory in which libtool stores uninstalled libraries. */ #undef LT_OBJDIR /* Name of package */ #undef PACKAGE /* Define to the address where bug reports for this package should be sent. */ #undef PACKAGE_BUGREPORT /* Define to the full name of this package. */ #undef PACKAGE_NAME /* Define to the full name and version of this package. */ #undef PACKAGE_STRING /* Define to the one symbol short name of this package. */ #undef PACKAGE_TARNAME /* Define to the home page for this package. */ #undef PACKAGE_URL /* Define to the version of this package. */ #undef PACKAGE_VERSION /* Define to 1 if you have the ANSI C header files. */ #undef STDC_HEADERS /* Version number of package */ #undef VERSION /* Define to `__inline__' or `__inline' if that's what the C compiler calls it, or to nothing if 'inline' is not supported under any name. */ #ifndef __cplusplus #undef inline #endif /* Define to `unsigned int' if does not define. */ #undef size_t kytea_0.4.6+dfsg.orig/src/include/kytea/string-util.h0000644000175000017500000001404212122355536022062 0ustar koichikoichi/* * Copyright 2009, KyTea Development Team * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/
#ifndef STRING_UTIL_H__
#define STRING_UTIL_H__

// NOTE(review): every include target in this span is blank in this copy of
// the file (the header names appear to have been lost), and several
// declarations (GenericMap, std::vector, Dictionary, "template", readBinary)
// carry no template arguments. They are reproduced exactly as found; restore
// them from the upstream sources before building.
#include
// #include
// #include
#include
// #include
// #include

namespace kytea {

// a class for turning std::strings into internal representation
//
// Abstract base: the encoding-specific subclasses below implement the
// pure-virtual mapping, typing and serialization functions.
class StringUtil {

public:

    // types of characters (set in the constructor)
    typedef char CharType;
    const static CharType KANJI = 'K';
    const static CharType KATAKANA = 'T';
    const static CharType HIRAGANA = 'H';
    const static CharType ROMAJI = 'R';
    const static CharType DIGIT = 'D';
    const static CharType OTHER = 'O';

    // types of encodings
    typedef char Encoding;
    const static Encoding ENCODING_UTF8 = 'W';
    const static Encoding ENCODING_EUC = 'E';
    const static Encoding ENCODING_SJIS = 'S';

    // A map that normalizes characters to a single representation
    // (starts out NULL; deleted by the destructor if set)
    GenericMap * normMap_;

public:

    StringUtil() : normMap_(NULL) { }

    // Frees normMap_ when one has been set.
    virtual ~StringUtil() {
        if(normMap_)
            delete normMap_;
    }

    // map a std::string to a character
    virtual KyteaChar mapChar(const std::string & str, bool add = true) = 0;
    virtual std::string showChar(KyteaChar c) = 0;

    // Build a std::string by concatenating showChar() of every character of c.
    std::string showString(const KyteaString & c) {
        std::ostringstream buff;
        for(unsigned i = 0; i < c.length(); i++)
            buff << showChar(c[i]);
        return buff.str();
    }

    // map an unparsed std::string to a KyteaString
    virtual KyteaString mapString(const std::string & str) = 0;

    // get the type of a character
    virtual CharType findType(const std::string & str) = 0;
    virtual CharType findType(KyteaChar c) = 0;

    // return the encoding provided by this util
    virtual Encoding getEncoding() = 0;
    virtual const char* getEncodingString() = 0;

    // transform to or from a character std::string
    virtual void unserialize(const std::string & str) = 0;
    virtual std::string serialize() const = 0;

    // normalization functions
    virtual GenericMap * getNormMap() = 0;
    KyteaString normalize(const KyteaString & str);

    // Check that these are equal by serializing them
    void checkEqual(const StringUtil & rhs) const;

    // parse an integer or float
    int parseInt(const char* str);
    double parseFloat(const char* str);

    // get a std::string of character types
    // (one findType() result is appended per character of str)
    std::string getTypeString(const KyteaString& str) {
        std::ostringstream buff;
        for(unsigned i = 0; i < str.length(); i++)
            buff << findType(str[i]);
        return buff.str();
    }

};

// a class for parsing UTF8
class StringUtilUtf8 : public StringUtil {

private:

    // Bit masks: maskrN has the low N bits set (63, 31, 15, 7);
    // masklN is built from N set bits shifted up to the top of a byte.
    const static char maskr6 = 63, maskr5 = 31, maskr4 = 15, maskr3 = 7,
                      maskl1 = 1 << 7, maskl2 = 3 << 6, maskl3 = 7 << 5,
                      maskl4 = 15 << 4, maskl5 = 31 << 3;

    // variables
    StringCharMap charIds_;
    std::vector charNames_;
    std::vector charTypes_;

public:

    StringUtilUtf8();

    ~StringUtilUtf8() { }

    // map a std::string to a character
    KyteaChar mapChar(const std::string & str, bool add = true);
    std::string showChar(KyteaChar c);

    CharType findType(KyteaChar c);

    GenericMap * getNormMap();

    // True unless the top two bits of val are "10" (the bit pattern of a
    // UTF-8 continuation byte): flipping the top bit and masking with the
    // top two bits yields zero only for that pattern.
    bool badu(char val) { return ((val ^ maskl1) & maskl2); }
    KyteaString mapString(const std::string & str);

    // find the type of a unicode character
    CharType findType(const std::string & str);

    Encoding getEncoding() { return ENCODING_UTF8; }
    const char* getEncodingString() { return "utf8"; }

    const std::vector & getCharNames() { return charNames_; }

    // transform to or from a character std::string
    void unserialize(const std::string & str);
    std::string serialize() const;

};

// StringUtil implementation whose members are all defined out of line;
// presumably handles EUC-encoded text (inferred from the class name).
class StringUtilEuc : public StringUtil {

    const static char maskl1 = 1 << 7;
    const static KyteaChar mask3len = 1 << 14;

public:
    StringUtilEuc() { };
    ~StringUtilEuc() { }

    KyteaChar mapChar(const std::string & str, bool add = true);
    std::string showChar(KyteaChar c);

    GenericMap * getNormMap();

    // map an unparsed std::string to a KyteaString
    KyteaString mapString(const std::string & str);

    // get the type of a character
    CharType findType(const std::string & str);
    CharType findType(KyteaChar c);

    // return the encoding provided by this util
    Encoding getEncoding();
    const char* getEncodingString();

    // transform to or from a character std::string
    void unserialize(const std::string & str);
    std::string serialize() const;

};

// StringUtil implementation whose members are all defined out of line;
// presumably handles Shift-JIS-encoded text (inferred from the class name).
class StringUtilSjis : public StringUtil {

    const static char maskl1 = 1 << 7;
    const static KyteaChar mask3len = 1 << 14;

public:
    StringUtilSjis() { };
    ~StringUtilSjis() { }

    KyteaChar mapChar(const std::string & str, bool add = true);
    GenericMap * getNormMap();

    std::string showChar(KyteaChar c);

    // map an unparsed std::string to a KyteaString
    KyteaString mapString(const std::string & str);

    // get the type of a character
    CharType findType(const std::string & str);
    CharType findType(KyteaChar c);

    // return the encoding provided by this util
    Encoding getEncoding();
    const char* getEncodingString();

    // transform to or from a character std::string
    void unserialize(const std::string & str);
    std::string serialize() const;

};

}

#endif
// NOTE(review): the next line appears to be tar-archive header residue
// (member name, octal fields, "ustar" magic), not C++ source.
kytea_0.4.6+dfsg.orig/src/include/kytea/model-io-binary.h0000644000175000017500000001106612122356565022576 0ustar koichikoichi
#ifndef MODEL_IO_BINARY_H__
#define MODEL_IO_BINARY_H__

#include
#include

namespace kytea {

// ModelIO implementation for the binary format: the stream constructors
// always pass true as ModelIO's final (binary) argument.
class BinaryModelIO : public ModelIO {

public:

    BinaryModelIO(StringUtil* util) : ModelIO(util) { }
    BinaryModelIO(StringUtil* util, const char* file, bool out) : ModelIO(util,file,out,true) { }
    BinaryModelIO(StringUtil* util, std::iostream & str, bool out) : ModelIO(util,str,out,true) { }

    // output functions

    void writeConfig(const KyteaConfig & conf);
    void writeModel(const KyteaModel * mod);
    void writeWordList(const std::vector & list);
    // The three dictionary writers all delegate to writeDictionary()
    void writeModelDictionary(const Dictionary * dict) { writeDictionary(dict); }
    void writeProbDictionary(const Dictionary * dict) { writeDictionary(dict); }
    void writeVectorDictionary(const Dictionary * dict) { writeDictionary(dict); }
    void writeLM(const KyteaLM * mod);
    void writeFeatVec(const FeatVec * vec);
    void writeFeatureLookup(const FeatureLookup * featLookup);

    template
    void writeEntry(const Entry * entry);

    // Serialize a dictionary. Layout, in write order:
    //   number of dictionaries, number of states, then for each state its
    //   failure value, its goto list (count, then character/target pairs),
    //   its output list (count, then values) and its isBranch flag; finally
    //   the number of entries followed by each entry via writeEntry().
    // A null dict is written as an unsigned char 0 followed by a uint32_t 0.
    // Throws (THROW_ERROR) when the dictionary reports more than 8 dicts.
    template
    void writeDictionary(const Dictionary * dict) {
        // write the number of dicts
        if(dict == 0) {
            writeBinary((unsigned char)0);
            writeBinary((uint32_t)0);
            return;
        }
        if(dict->getNumDicts() > 8)
            THROW_ERROR("Only 8 dictionaries may be stored in a binary file.");
        writeBinary(dict->getNumDicts());
        // write the states
        const std::vector & states = dict->getStates();
        writeBinary((uint32_t)states.size());
        for(unsigned i = 0; i < states.size(); i++) {
            const DictionaryState * state = states[i];
            writeBinary((uint32_t)state->failure);
            writeBinary((uint32_t)state->gotos.size());
            for(unsigned j = 0; j < state->gotos.size(); j++) {
                writeBinary((KyteaChar)state->gotos[j].first);
                writeBinary((uint32_t)state->gotos[j].second);
            }
            writeBinary((uint32_t)state->output.size());
            for(unsigned j = 0; j < state->output.size(); j++)
                writeBinary((uint32_t)state->output[j]);
            writeBinary(state->isBranch);
        }
        // write the entries
        const std::vector & entries = dict->getEntries();
        writeBinary((uint32_t)entries.size());
        for(unsigned i = 0; i < entries.size(); i++)
            writeEntry(entries[i]);
    }

    // input functions

    void readConfig(KyteaConfig & conf);
    KyteaModel * readModel();
    std::vector readWordList();
    // The three dictionary readers all delegate to readDictionary()
    Dictionary * readModelDictionary() { return readDictionary(); }
    Dictionary * readProbDictionary() { return readDictionary(); }
    Dictionary * readVectorDictionary() { return readDictionary(); }
    KyteaLM * readLM();
    FeatVec * readFeatVec();
    FeatureLookup * readFeatureLookup();

    template
    Entry * readEntry();

    // Read a dictionary in the order writeDictionary() writes it.
    // Returns a newly allocated Dictionary, or 0 (after deleting the
    // partially built one) when the stored state count is zero.
    // NOTE(review): `line` and `buff` are declared but not used here.
    template
    Dictionary * readDictionary() {
        Dictionary * dict = new Dictionary(util_);
        std::string line, buff;
        // get the number of dictionaries
        unsigned numDicts = readBinary();
        dict->setNumDicts(numDicts);
        // get the states
        std::vector & states = dict->getStates();
        states.resize(readBinary());
        if(states.size() == 0) {
            delete dict;
            return 0;
        }
        for(unsigned i = 0; i < states.size(); i++) {
            DictionaryState * state = new DictionaryState();
            state->failure = readBinary();
            state->gotos.resize(readBinary());
            for(unsigned j = 0; j < state->gotos.size(); j++) {
                state->gotos[j].first = readBinary();
                state->gotos[j].second = readBinary();
            }
            state->output.resize(readBinary());
            for(unsigned j = 0; j < state->output.size(); j++)
                state->output[j] = readBinary();
            state->isBranch = readBinary();
            states[i] = state;
        }
        // get the entries
        std::vector & entries = dict->getEntries();
        entries.resize(readBinary());
        for(unsigned i = 0; i < entries.size(); i++)
            entries[i] = readEntry();
        return dict;
    }

};

}

#endif
// NOTE(review): the next line appears to be tar-archive header residue,
// not C++ source.
kytea_0.4.6+dfsg.orig/src/include/kytea/feature-lookup.h0000644000175000017500000000623612122355536022551 0ustar koichikoichi
#ifndef FEATURE_LOOKUP__
#define FEATURE_LOOKUP__

#include
#include
#include
#include

namespace kytea {

class KyteaString;
class ModelTagEntry;

// Holds pointers to three dictionaries and four feature vectors, all NULL
// until assigned through the setters, plus functions that add scores
// derived from them.
class FeatureLookup {
protected:
    Dictionary *charDict_, *typeDict_, *selfDict_;
    FeatVec *dictVector_, *biases_, *tagDictVector_, *tagUnkVector_;
public:
    FeatureLookup() : charDict_(NULL), typeDict_(NULL), selfDict_(NULL), dictVector_(NULL), biases_(NULL), tagDictVector_(NULL), tagUnkVector_(NULL) { }
    ~FeatureLookup();

    void checkEqual(const FeatureLookup & rhs) const;

    // Getters
    const Dictionary * getCharDict() const {
        return charDict_;
    }
    const Dictionary * getTypeDict() const {
        return typeDict_;
    }
    const Dictionary * getSelfDict() const {
        return selfDict_;
    }
    const FeatVec * getDictVector() const {
        return dictVector_;
    }
    // Dereferences biases_ without a NULL or bounds check
    const FeatVal getBias(int id) const {
        return (*biases_)[id];
    }
    const std::vector * getBiases() const {
        return biases_;
    }
    // Dereferences tagUnkVector_ without a NULL or bounds check
    const FeatVal getTagUnkFeat(int tag) const {
        return (*tagUnkVector_)[tag];
    }
    // const FeatVal getTagDictFeat(int dict, int tag, int target) const {
    //     return (*tagDictVector_)[dict*numTags_*numTags_+tag*numTags_+target];
    // }
    const std::vector * getTagDictVector() const {
        return tagDictVector_;
    }
    const std::vector * getTagUnkVector() const {
        return tagUnkVector_;
    }

    void addNgramScores(const Dictionary * dict,
                        const KyteaString & str,
                        int window,
                        std::vector & score);

    void addDictionaryScores(
        const Dictionary::MatchResult & matches,
        int numDicts, int max,
        std::vector & score);

    void addTagNgrams(const KyteaString & chars,
                      const Dictionary * dict,
                      std::vector & scores,
                      int window, int startChar, int endChar);

    void addSelfWeights(const KyteaString & chars,
                        std::vector & scores,
                        int isType);

    void addTagDictWeights(const std::vector > & exists,
                           std::vector & scores);


    // Setters, these will all take control of the features they are passed
    //  (without making a copy)
    void setCharDict(Dictionary * charDict) {
        charDict_ = charDict;
    }
    void setTypeDict(Dictionary * typeDict) {
        typeDict_ = typeDict;
    }
    void setSelfDict(Dictionary * selfDict) {
        selfDict_ = selfDict;
    }
    void setDictVector(FeatVec * dictVector) {
        dictVector_ = dictVector;
    }
    // Store bias at index id, allocating biases_ or growing it (new slots
    // are 0) when it is missing or too short.
    void setBias(FeatVal bias, int id) {
        if(biases_ == NULL)
            biases_ = new FeatVec(id+1, 0);
        else if((int)biases_->size() <= id)
            biases_->resize(id+1, 0);
        (*biases_)[id] = bias;
    }
    void setBiases(FeatVec * biases) {
        biases_ = biases;
    }
    void setTagDictVector(FeatVec * tagDictVector) {
        tagDictVector_ = tagDictVector;
    }
    void setTagUnkVector(FeatVec * tagUnkVector) {
        tagUnkVector_ = tagUnkVector;
    }


};

}

#endif
kytea_0.4.6+dfsg.orig/src/include/kytea/kytea-lm.h0000644000175000017500000000261312122355536021325 0ustar koichikoichi
/*
 * Copyright 2009-2010, KyTea Development Team
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ #ifndef KYTEA_LM_H__ #define KYTEA_LM_H__ #include // #include namespace kytea { class KyteaLM { public: unsigned n_, vocabSize_; KyteaDoubleMap probs_; KyteaDoubleMap fallbacks_; KyteaLM(unsigned n) : n_(n), vocabSize_(10000) { } ~KyteaLM() { } // train a trigram model using Kneser-Ney smoothing void train(const std::vector & corpus); // score a string with the language model double score(const KyteaString & str) const; // score a single position in the string double scoreSingle(const KyteaString & val, int pos); const KyteaDoubleMap & getProbs() const { return probs_; } const KyteaDoubleMap & getFallbacks() const { return fallbacks_; } void checkEqual(const KyteaLM & rhs) const; }; } #endif kytea_0.4.6+dfsg.orig/src/include/kytea/kytea-util.h0000644000175000017500000000177612122355536021703 0ustar koichikoichi#ifndef KYTEA_UTIL__ #define KYTEA_UTIL__ // #include #include #include #include namespace kytea { #define THROW_ERROR(msg) do { \ std::ostringstream oss; \ oss << msg; \ throw std::runtime_error(oss.str()); } \ while (0); template void checkPointerEqual(const T* lhs, const T* rhs); // Vector equality checking function template void checkValueVecEqual(const std::vector & a, const std::vector & b); // Vector equality checking with null pointers template void checkValueVecEqual(const std::vector * a, const std::vector * b); // Vector equality checking function template void checkPointerVecEqual(const std::vector & a, const std::vector & b); // Vector equality checking with null pointers template void checkPointerVecEqual(const std::vector * a, const std::vector * b); }; #endif // KYTEA_UTIL__ kytea_0.4.6+dfsg.orig/src/include/kytea/feature-vector.h0000644000175000017500000000055612122356565022544 0ustar koichikoichi#ifndef FEATURE_VECTOR_H__ #define FEATURE_VECTOR_H__ #include #include namespace kytea { // Define the size of the feature values and sums #if DISABLE_QUANTIZE typedef double FeatVal; typedef double FeatSum; #else typedef int16_t FeatVal; 
typedef int32_t FeatSum;
#endif

// A vector of feature values
typedef std::vector FeatVec;

}

#endif
kytea_0.4.6+dfsg.orig/src/include/kytea/feature-io.h0000644000175000017500000000227112122355536021642 0ustar koichikoichi
#ifndef FEATURE_IO_H__
#define FEATURE_IO_H__

// NOTE(review): include targets and template argument lists appear to have
// been lost from this copy of the headers; those tokens are left as found.
// #include
// #include
// #include
// #include
#include
#include
#include

namespace kytea {

// Holds a table of features (feats_), a word map (wm_) and an optional output
// file stream, with declarations for loading them and printing them.
class FeatureIO {

protected:

    std::ofstream * out_;   // output stream; null until set (see openOut/closeOut)
    TagHash feats_;         // features, returned by getFeatures()
    typedef std::map WordMap;
    // Dictionary::WordMap wm_;
    WordMap wm_;
    int numTags_, numDicts_;

public:

    // Starts with no output stream and zero tags and dictionaries
    FeatureIO() : out_(0), numTags_(0), numDicts_(0) { }
    ~FeatureIO();

    int getNumTags() { return numTags_; }
    int getNumDicts() { return numDicts_; }
    void setNumTags(int numTags) { numTags_ = numTags; }

    void load(const std::string& fileName,StringUtil* util);

    void openOut(const std::string& fileName);
    void closeOut();

    // Mutable access to the word map and the feature table
    WordMap & getWordMap() { return wm_; }
    TagHash & getFeatures() { return feats_; }

    // Look up the features for `str`
    // NOTE(review): `add` presumably creates the entry when it is missing -- confirm in the implementation
    TagTriplet * getFeatures(const KyteaString & str, bool add);

    void printFeatures(const KyteaString & featId, TagTriplet* trip, StringUtil * util);
    void printFeatures(const KyteaString & featId, StringUtil * util);
    void printWordMap(StringUtil * util);

};

}

#endif
kytea_0.4.6+dfsg.orig/src/include/kytea/corpus-io-raw.h0000644000175000017500000000107512122356565022315 0ustar koichikoichi
#ifndef CORPUS_IO_RAW_H__
#define CORPUS_IO_RAW_H__

#include

namespace kytea {

// A CorpusIO subclass declaring its own readSentence/writeSentence.
// Every constructor simply forwards its arguments to the CorpusIO base.
class RawCorpusIO : public CorpusIO {

public:
    RawCorpusIO(StringUtil * util) : CorpusIO(util) { }
    RawCorpusIO(const CorpusIO & c) : CorpusIO(c) { }
    RawCorpusIO(StringUtil * util, const char* file, bool out) : CorpusIO(util,file,out) { }
    RawCorpusIO(StringUtil * util, std::iostream & str, bool out) : CorpusIO(util,str,out) { }

    KyteaSentence * readSentence();
    // conf defaults to 0.0
    void writeSentence(const KyteaSentence * sent, double conf = 0.0);

};

}

#endif
kytea_0.4.6+dfsg.orig/src/include/kytea/kytea-config.h0000644000175000017500000002370112122355536022163 0ustar koichikoichi
/*
 * Copyright 2009, KyTea Development Team
 *
 * Licensed under the Apache
License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef KYTEA_CONFIG_H__
#define KYTEA_CONFIG_H__

namespace kytea {
class KyteaConfig;
}

// NOTE(review): include targets and template argument lists appear to have
// been lost from this copy of the headers; those tokens are left as found.
#include
#include

namespace kytea {

class StringUtil;

// Holds every setting used for training and running the analyzer: input and
// output files and formats, feature window sizes, unknown-word options,
// liblinear parameters and formatting delimiters. Values are filled in either
// through the setters below or by the command-line parsing functions.
class KyteaConfig {

private:

    // must be the same as CorpusIO::Format, not used directly because of cross-dependencies
    typedef char CorpForm;

    bool onTraining_;

    unsigned debug_; // the debugging level
                     // 0 = silent (default for running)
                     // 1 = simple progress updates (default for training)
                     // 2 = detailed progress updates
                     // 3 = everything

    StringUtil * util_; // a std::string utility to hold the encoding, etc

    std::vector corpora_; // corpora to read for training
    std::vector corpusFormats_; // the annotation of each corpus

    std::vector dicts_; // dictionaries to read

    std::vector subwordDicts_; // subword dictionaries to use for unknown estimation

    std::string model_;  // model file to write/read
    char modelForm_; // model format (ModelIO::Format)

    std::string input_, output_; // the file to input/output
    CorpForm inputForm_, outputForm_;  // the format/file to input/output to (default: stdout, full)

    std::string featIn_, featOut_;
    std::ostream* featStr_;

    bool doWS_, doTags_, doUnk_;
    std::vector doTag_;

    // feature options
    bool addFeat_; // whether or not to add newly found features
    double confidence_; // when using probability, only annotate or use values that
                        // are at least this confident (default: 0=deterministic)
    // charW: the number of characters on either side of the boundary to use (default: 3)
    // charN: the maximum n-gram order of characters to use (default: 3)
    // typeW: the number of character types on either side of the boundary to use (default: 3)
    // typeN: the maximum n-gram order of types to use (default: 3)
    // dictN: all dictionary words over this are treated as equal frequency (default: 4)
    char charW_, charN_, typeW_, typeN_, dictN_;

    // unknown word arguments
    // unkN: the n-gram length of the unknown word spelling model
    // defTag: a default tag to use when no candidates were generated
    // unkTag: a tag to append after every word with no tag in the dictionary
    char unkN_;
    unsigned unkBeam_;
    std::string defTag_;
    std::string unkTag_;

    // liblinear training values
    double bias_; // the bias used for liblinear training
    double eps_; // the termination epsilon
    double cost_; // the cost for the SVM or LR training
    int solverType_; // the type of solver to be used

    // extra arguments, should be input/output for the analyzer
    std::vector args_;

    // set the type of the input corpus
    void setIOFormat(const char* str, CorpForm & cf);

    // formatting tags
    std::string wordBound_, tagBound_, elemBound_, unkBound_, noBound_, hasBound_, skipBound_, escape_;

    // hard constraint on character divisions. can be used for digits, etc.
    std::string wsConstraint_;

    // the number of tag levels
    int numTags_;
    std::vector global_;

    // tagMax: the maximum number of tags to return for a word
    unsigned tagMax_;

    // check argument legality
    void ch(const char * n, const char* v);

public:

    KyteaConfig();
    KyteaConfig(const KyteaConfig & rhs);
    ~KyteaConfig();

    void addCorpus(const std::string & corp, CorpForm format);

    void addDictionary(const std::string & corp);

    void addSubwordDict(const std::string & corp);

    // parse command line arguments
    void parseTrainCommandLine(int argc, const char ** argv);

    void parseRunCommandLine(int argc, const char ** argv);

    void printUsage();

    void printVersion();

    // parse a single argument
    //  the value argument can be null
    //  return 1 if the value was used 0 if not
    unsigned parseTrainArg(const char * n, const char * v);

    unsigned parseRunArg(const char * n, const char * v);

    // getters
    // (each inline getter returns the matching private member; getModelFile,
    //  getEncoding and getEncodingString are defined out of line)
    const std::vector & getCorpusFiles() const { return corpora_; }
    const std::vector & getCorpusFormats() const { return corpusFormats_; }
    const std::vector & getDictionaryFiles() const { return dicts_; }
    const std::vector & getSubwordDictFiles() const { return subwordDicts_; }
    const std::string & getModelFile();
    const char getModelFormat() const { return modelForm_; }
    const unsigned getDebug() const { return debug_; }
    StringUtil * getStringUtil() { return util_; }
    const StringUtil * getStringUtil() const { return util_; }
    const CorpForm getInputFormat() const { return inputForm_; }
    const CorpForm getOutputFormat() const { return outputForm_; }
    const std::string & getFeatureIn() const { return featIn_; }
    const std::string & getFeatureOut() const { return featOut_; }
    // true when a feature output file name has been set (featOut_ is non-empty)
    const bool getWriteFeatures() const { return featOut_.length() > 0; }

    const char getCharN() const { return charN_; }
    const char getCharWindow() const { return charW_; }
    const char getTypeN() const { return typeN_; }
    const char getTypeWindow() const { return typeW_; }
    const char getDictionaryN() const { return dictN_; }
    const char getUnkN() const { return unkN_; }
    const unsigned getTagMax() const { return tagMax_; }
    const unsigned getUnkBeam() const { return unkBeam_; }
    const std::string & getUnkTag() const { return unkTag_; }
    const std::string & getDefaultTag() const { return defTag_; }
    const std::string & getWsConstraint() const { return wsConstraint_; }

    const double getBias() const { return bias_; }
    const double getEpsilon() const { return eps_; }
    const double getCost() const { return cost_; }
    const int getSolverType() const { return solverType_; }
    const bool getDoWS() const { return doWS_; }
    const bool getDoUnk() const { return doUnk_; }
    const bool getDoTags() const { return doTags_; }
    // Tagging of level i is on when tagging is enabled overall and level i has
    // not been switched off; levels beyond the end of doTag_ count as on.
    const bool getDoTag(int i) const { return doTags_ && (i >= (int)doTag_.size() || doTag_[i]); }
    const char* getWordBound() const { return wordBound_.c_str(); }
    const char* getTagBound() const { return tagBound_.c_str(); }
    const char* getElemBound() const { return elemBound_.c_str(); }
    const char* getUnkBound() const { return unkBound_.c_str(); }
    const char* getNoBound() const { return noBound_.c_str(); }
    const char* getHasBound() const { return hasBound_.c_str(); }
    const char* getSkipBound() const { return skipBound_.c_str(); }
    const char* getEscape() const { return escape_.c_str(); }

    const double getConfidence() const { return confidence_; }
    const char getEncoding() const;
    const char* getEncodingString() const;
    int getNumTags() const { return numTags_; }
    // false for any level that was never passed to setGlobal()
    bool getGlobal(int i) const { return i < (int)global_.size() && global_[i]; }

    const std::vector & getArguments() const { return args_; }

    // setters
    void setDebug(unsigned debug) { debug_ = debug; }
    void setModelFile(const char* file) { model_ = file; }
    void setModelFormat(char mf) { modelForm_ = mf; }
    void setEpsilon(double v) { eps_ = v; }
    void setCost(double v) { cost_ = v; }
    // the bias is stored as 1.0 when v is true and -1.0 when it is false
    void setBias(bool v) { bias_ = (v?1.0f:-1.0f); }
    void setSolverType(int v) { solverType_ = v; }
    void setCharWindow(char v) { charW_ = v; }
    void setCharN(char v) { charN_ = v; }
    void setTypeWindow(char v) { typeW_ = v; }
    void setTypeN(char v) { typeN_ = v; }
    void setDictionaryN(char v) { dictN_ = v; }
    void setUnkN(char v) { unkN_ = v; }
    void setTagMax(unsigned v) { tagMax_ = v; }
    void setUnkBeam(unsigned v) { unkBeam_ = v; }
    void setUnkTag(const std::string & v) { unkTag_ = v; }
    void setUnkTag(const char* v) { unkTag_ = v; }
    void setDefaultTag(const std::string & v) { defTag_ = v; }
    void setDefaultTag(const char* v) { defTag_ = v; }
    void setOnTraining(bool v) { onTraining_ = v; }
    void setDoWS(bool v) { doWS_ = v; }
    void setDoUnk(bool v) { doUnk_ = v; }
    void setDoTags(bool v) { doTags_ = v; }
    // grows doTag_ if needed, filling any new slots with true, then stores v at i
    void setDoTag(int i, bool v) { if(i >= (int)doTag_.size()) doTag_.resize(i+1,true); doTag_[i] = v; }
    void setInputFormat(CorpForm v) { inputForm_ = v; }
    void setWordBound(const char* v) { wordBound_ = v; }
    void setTagBound(const char* v) { tagBound_ = v; }
    void setElemBound(const char* v) { elemBound_ = v; }
    void setUnkBound(const char* v) { unkBound_ = v; }
    void setNoBound(const char* v) { noBound_ = v; }
    void setHasBound(const char* v) { hasBound_ = v; }
    void setSkipBound(const char* v) { skipBound_ = v; }
    void setEscape(const char* v) { escape_ = v; }
    void setNumTags(int v) { numTags_ = v; }
    // marks level v as global, growing global_ (filled with false) if needed;
    // there is no way to clear the flag through this interface
    void setGlobal(int v) { if((int)global_.size() <= v) global_.resize(v+1,false); global_[v] = true; }
    void setFeatureIn(const std::string & featIn) { featIn_ = featIn; }
    void setFeatureOut(const std::string & featOut) { featOut_ = featOut; }
    void setWsConstraint(const std::string & wsConstraint) { wsConstraint_ = wsConstraint; }

    std::ostream * getFeatureOutStream();
    void closeFeatureOutStream();

    // set the encoding of the StringUtil class and reset all the IOs
    void setEncoding(const char* str);

};

}

#endif
kytea_0.4.6+dfsg.orig/src/include/kytea/general-io.h0000644000175000017500000000447512122355536021634 0ustar koichikoichi
/*
 * Copyright 2009, KyTea Development Team
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file
except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef GENERAL_IO_H__ #define GENERAL_IO_H__ // #include // #include // #include #include #include // #include // #include // #include #if DISABLE_QUANTIZE # define DECIMAL_PRECISION 8 #else # define DECIMAL_PRECISION 6 #endif namespace kytea { // Forward declarations class StringUtil; class KyteaString; class GeneralIO { protected: StringUtil* util_; std::iostream* str_; bool out_; bool bin_; bool owns_; // write template void writeBinary(T v) { str_->write(reinterpret_cast(&v), sizeof(T)); } void writeString(const char* str, size_t size) { str_->write(str, size+1); } void writeString(const std::string & str) { str_->write(str.c_str(), str.length()+1); } void writeString(const KyteaString & str); // read template T readBinary(); std::string readString(); KyteaString readKyteaString(); public: GeneralIO(StringUtil* util) : util_(util), str_(0), out_(true), bin_(false), owns_(false) { } GeneralIO(StringUtil* util, std::iostream & str, bool out, bool bin) : util_(util), str_(&str), out_(out), bin_(false), owns_(false) { setStream(str, out, bin); } GeneralIO(StringUtil* util, const char* file, bool out, bool bin) : util_(util), str_(0), bin_(false), owns_(true) { openFile(file,out,bin); } ~GeneralIO() { if(str_ && owns_) delete str_; } void openFile(const char* file, bool out, bool bin); void setStream(std::iostream & str, bool out, bool bin); }; } #endif kytea_0.4.6+dfsg.orig/src/include/kytea/config.h0000644000175000017500000000512212133240173021035 0ustar koichikoichi/* src/include/kytea/config.h. 
Generated from config.h.in by configure. */ /* src/include/kytea/config.h.in. Generated from configure.ac by autoheader. */ /* Enable quantizing */ #define DISABLE_QUANTIZE 0 /* Define to 1 if you have the header file. */ #define HAVE_BOOST_TR1_UNORDERED_MAP_HPP 1 /* Define to 1 if you have the header file. */ #define HAVE_DLFCN_H 1 /* Define to 1 if you have the header file. */ /* #undef HAVE_EXT_HASH_MAP */ /* Define to 1 if you have the header file. */ #define HAVE_INTTYPES_H 1 /* Define to 1 if you have the header file. */ #define HAVE_MEMORY_H 1 /* Define to 1 if stdbool.h conforms to C99. */ #define HAVE_STDBOOL_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STDINT_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STDLIB_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STRINGS_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STRING_H 1 /* Define to 1 if you have the header file. */ #define HAVE_SYS_STAT_H 1 /* Define to 1 if you have the header file. */ #define HAVE_SYS_TYPES_H 1 /* Define to 1 if you have the header file. */ /* #undef HAVE_TR1_UNORDERED_MAP */ /* Define to 1 if you have the header file. */ #define HAVE_UNISTD_H 1 /* Define to 1 if the system has the type `_Bool'. */ #define HAVE__BOOL 1 /* Define to the sub-directory in which libtool stores uninstalled libraries. */ #define LT_OBJDIR ".libs/" /* Name of package */ #define PACKAGE "kytea" /* Define to the address where bug reports for this package should be sent. */ #define PACKAGE_BUGREPORT "kytea@phontron.com" /* Define to the full name of this package. */ #define PACKAGE_NAME "kytea" /* Define to the full name and version of this package. */ #define PACKAGE_STRING "kytea 0.4.6" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "kytea" /* Define to the home page for this package. */ #define PACKAGE_URL "" /* Define to the version of this package. 
*/
#define PACKAGE_VERSION "0.4.6"

/* Define to 1 if you have the ANSI C header files. */
#define STDC_HEADERS 1

/* Version number of package */
#define VERSION "0.4.6"

/* Define to `__inline__' or `__inline' if that's what the C compiler
   calls it, or to nothing if 'inline' is not supported under any name.  */
#ifndef __cplusplus
/* #undef inline */
#endif

/* Define to `unsigned int' if  does not define. */
/* #undef size_t */
kytea_0.4.6+dfsg.orig/src/include/kytea/model-io-text.h0000644000175000017500000001215512127276745022304 0ustar koichikoichi
#ifndef MODEL_IO_TEXT_H__
#define MODEL_IO_TEXT_H__

// NOTE(review): include targets and template argument lists appear to have
// been lost from this copy of the headers; those tokens are left as found.
#include
#include

namespace kytea {

class CorpusIO;

// Text-format model reader/writer. The constructors pass `false` as the last
// argument of the ModelIO base (presumably its binary flag -- confirm).
class TextModelIO : public ModelIO {

public:

    TextModelIO(StringUtil* util) : ModelIO(util) { }
    TextModelIO(StringUtil* util, const char* file, bool out) : ModelIO(util,file,out,false) { }
    TextModelIO(StringUtil* util, std::iostream & str, bool out) : ModelIO(util,str,out,false) { }

    // writing functions

    void writeConfig(const KyteaConfig & conf);
    void writeModel(const KyteaModel * mod);
    void writeWordList(const std::vector & list);
    // the three typed dictionary writers all delegate to writeDictionary
    void writeModelDictionary(const Dictionary * dict) { writeDictionary(dict); }
    void writeProbDictionary(const Dictionary * dict) { writeDictionary(dict); }
    void writeVectorDictionary(const Dictionary * dict) { writeDictionary(dict); }
    void writeLM(const KyteaLM * mod);
    void writeFeatVec(const FeatVec * vec);
    void writeFeatureLookup(const FeatureLookup * featLookup);

    template
    void writeEntry(const Entry * entry);

    // Write a dictionary as text. Layout, one item per line:
    //   number of dictionaries
    //   number of states
    //   for each state: "failure [char dest]..." / space-separated outputs /
    //                   'b' (branch) or 'n'
    //   number of entries, then each entry through writeEntry
    // A null dictionary is written as "0" and "0". When there are no states,
    // nothing is written after the state count (no entry count either), which
    // matches readDictionary returning 0 in that case.
    template
    void writeDictionary(const Dictionary * dict) {
        if(dict == 0) {
            *str_ << "0" << std::endl << "0" << std::endl;
            return;
        }
        // write the states
        *str_ << (unsigned)dict->getNumDicts() << std::endl;
        const std::vector & states = dict->getStates();
        *str_ << states.size() << std::endl;
        if(states.size() == 0)
            return;
        for(unsigned i = 0; i < states.size(); i++) {
            // line 1: failure link followed by (character, destination) pairs
            *str_ << states[i]->failure;
            for(unsigned j = 0; j < states[i]->gotos.size(); j++)
                *str_ << " " << util_->showChar(states[i]->gotos[j].first) << " " << states[i]->gotos[j].second;
            *str_ << std::endl;
            // line 2: output ids separated by single spaces (may be empty)
            for(unsigned j = 0; j < states[i]->output.size(); j++) {
                if(j!=0) *str_ << " ";
                *str_ << states[i]->output[j];
            }
            *str_ << std::endl;
            // line 3: branch indicator
            *str_ << (states[i]->isBranch?'b':'n') << std::endl;
        }
        // write the entries
        const std::vector & entries = dict->getEntries();
        *str_ << entries.size() << std::endl;
        for(unsigned i = 0; i < entries.size(); i++)
            writeEntry((Entry*)entries[i]);
    }

    // create an appropriate parser based on the type
    static CorpusIO* createIO(const char* file, Format form, bool output, StringUtil* util);
    static CorpusIO* createIO(std::iostream & str, Format form, bool output, StringUtil* util);

    void readConfig(KyteaConfig & conf);
    KyteaModel * readModel();
    std::vector readWordList();
    // the three typed dictionary readers all delegate to readDictionary
    Dictionary * readModelDictionary() { return readDictionary(); }
    Dictionary * readProbDictionary()  { return readDictionary(); }
    Dictionary * readVectorDictionary()  { return readDictionary(); }
    KyteaLM * readLM();
    FeatVec * readFeatVec();
    FeatureLookup * readFeatureLookup();

    template
    Entry * readEntry();

    // Read a dictionary in the layout produced by writeDictionary.
    // Returns a newly allocated Dictionary, or 0 when the state count is zero.
    // Throws (through THROW_ERROR) when a goto character has no destination or
    // when the branch-indicator line is not exactly one character long.
    // NOTE(review): on those error paths neither `dict` nor the current
    // `state` is deleted before the exception leaves this function, so both
    // appear to leak.
    template
    Dictionary * readDictionary() {
        Dictionary * dict = new Dictionary(util_);
        std::string line, buff;
        // get the number of dictionaries
        std::getline(*str_, line);
        dict->setNumDicts(util_->parseInt(line.c_str()));
        // get the states
        std::vector & states = dict->getStates();
        getline(*str_, line);
        states.resize(util_->parseInt(line.c_str()));
        if(states.size() == 0) {
            delete dict;
            return 0;
        }
        for(unsigned i = 0; i < states.size(); i++) {
            DictionaryState * state = new DictionaryState();
            // line 1: failure link followed by (character, destination) pairs
            getline(*str_, line);
            std::istringstream iss(line);
            iss >> buff;
            state->failure = util_->parseInt(buff.c_str());
            while(iss >> buff) {
                std::pair p;
                p.first = util_->mapChar(buff.c_str());
                if(!(iss >> buff))
                    THROW_ERROR("Badly formed model (goto character without a destination)");
                p.second = util_->parseInt(buff.c_str());
                state->gotos.push_back(p);
            }
            // the gotos are sorted after reading
            sort(state->gotos.begin(), state->gotos.end());
            // line 2: output ids
            getline(*str_, line);
            std::istringstream iss2(line);
            while(iss2 >> buff)
                state->output.push_back(util_->parseInt(buff.c_str()));
            // line 3: branch indicator, 'b' means branch
            getline(*str_, line);
            if(line.length() != 1)
                THROW_ERROR("Badly formed model (branch indicator not found)");
            state->isBranch = (line[0] == 'b');
            states[i] = state;
        }
        // get the entries
        std::vector & entries = dict->getEntries();
        getline(*str_, line);
        entries.resize(util_->parseInt(line.c_str()));
        for(unsigned i = 0; i < entries.size(); i++) {
            entries[i] = readEntry();
        }
        return dict;
    }

};

}

#endif
kytea_0.4.6+dfsg.orig/src/include/kytea/string-util-map-euc.h0000644000175000017500000000326212122355536023411 0ustar koichikoichi
// Two parallel tables written as escaped byte strings; the commented-out line
// above each definition shows the same characters in readable form.
// #define STRING_UTIL_ORIG_EUC "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789(){}<>「」[]-~.-/_,%?、―"'・─+:–!。&*@="
#define STRING_UTIL_ORIG_EUC "\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f\x50\x51\x52\x53\x54\x55\x56\x57\x58\x59\x5a\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x28\x29\x7b\x7d\x3c\x3e\xa1\xd6\xa1\xd7\x5b\x5d\x2d\xa1\xc1\x2e\xa1\xdd\x2f\x5f\x2c\x25\x3f\xa1\xa2\xa1\xbd\x22\x27\xa1\xa6\xa8\xa1\x2b\x3a\x21\xa1\xa3\x26\x2a\x40\x3d"
// #define STRING_UTIL_NORM_EUC "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789(){}<>「」[]−〜。ー/_,%?、ー”’・ー+:ー!。&*@="
#define STRING_UTIL_NORM_EUC "\xa3\xe1\xa3\xe2\xa3\xe3\xa3\xe4\xa3\xe5\xa3\xe6\xa3\xe7\xa3\xe8\xa3\xe9\xa3\xea\xa3\xeb\xa3\xec\xa3\xed\xa3\xee\xa3\xef\xa3\xf0\xa3\xf1\xa3\xf2\xa3\xf3\xa3\xf4\xa3\xf5\xa3\xf6\xa3\xf7\xa3\xf8\xa3\xf9\xa3\xfa\xa3\xc1\xa3\xc2\xa3\xc3\xa3\xc4\xa3\xc5\xa3\xc6\xa3\xc7\xa3\xc8\xa3\xc9\xa3\xca\xa3\xcb\xa3\xcc\xa3\xcd\xa3\xce\xa3\xcf\xa3\xd0\xa3\xd1\xa3\xd2\xa3\xd3\xa3\xd4\xa3\xd5\xa3\xd6\xa3\xd7\xa3\xd8\xa3\xd9\xa3\xda\xa3\xb0\xa3\xb1\xa3\xb2\xa3\xb3\xa3\xb4\xa3\xb5\xa3\xb6\xa3\xb7\xa3\xb8\xa3\xb9\xa1\xca\xa1\xcb\xa1\xd0\xa1\xd1\xa1\xe3\xa1\xe4\xa1\xd6\xa1\xd7\xa1\xce\xa1\xcf\xa1\xdd\xa1\xc1\xa1\xa3\xa1\xbc\xa1\xbf\xa1\xb2\xa1\xa4\xa1\xf3\xa1\xa9\xa1\xa2\xa1\xbc\xa1\xc9\xa1\xc7\xa1\xa6\xa1\xbc\xa1\xdc\xa1\xa7\xa1\xaa\xa1\xa3\xa1\xf5\xa1\xf6\xa1\xf7\xa1\xe1"
kytea_0.4.6+dfsg.orig/src/include/kytea/corpus-io-full.h0000644000175000017500000000214212133227117022452 0ustar koichikoichi
#ifndef CORPUS_IO_FULL_H__
#define CORPUS_IO_FULL_H__

#include
#include

namespace kytea {

// A CorpusIO subclass whose constructors additionally take the delimiter
// strings: word boundary (default " "), tag boundary ("/"), element
// boundary ("&") and escape character ("\\").
class FullCorpusIO : public CorpusIO {

protected:
    bool allTags_;
    KyteaString bounds_;
    bool printWords_;   // set through setPrintWords()

public:
    FullCorpusIO(StringUtil * util, const char* wordBound = " ", const char* tagBound = "/", const char* elemBound = "&", const char* escape = "\\");
    FullCorpusIO(const CorpusIO & c, const char* wordBound = " ", const char* tagBound = "/", const char* elemBound = "&", const char* escape = "\\");
    FullCorpusIO(StringUtil * util, const char* file, bool out, const char* wordBound = " ", const char* tagBound = "/", const char* elemBound = "&", const char* escape = "\\");
    FullCorpusIO(StringUtil * util, std::iostream & str, bool out, const char* wordBound = " ", const char* tagBound = "/", const char* elemBound = "&", const char* escape = "\\");

    KyteaSentence * readSentence();
    // conf defaults to 0.0
    void writeSentence(const KyteaSentence * sent, double conf = 0.0);

    void setPrintWords(bool printWords) { printWords_ = printWords; }

};

}

#endif
kytea_0.4.6+dfsg.orig/src/include/kytea/corpus-io-tokenized.h0000644000175000017500000000143112122356565023514 0ustar koichikoichi
#ifndef CORPUS_IO_TOKENIZED_H__
#define CORPUS_IO_TOKENIZED_H__

// NOTE(review): include targets appear to have been lost from this copy of
// the headers; the directives are left as found.
#include
#include

namespace kytea {

// An IO class for corpora that are tokenized, but with no tags.
// Each constructor takes the word-boundary string, defaulting to a single
// space.
class TokenizedCorpusIO : public CorpusIO {

protected:

    bool allTags_;
    KyteaString bounds_;

public:
    TokenizedCorpusIO(StringUtil * util, const char* wordBound = " ");
    TokenizedCorpusIO(const CorpusIO & c, const char* wordBound = " ");
    TokenizedCorpusIO(StringUtil * util, const char* file, bool out, const char* wordBound = " ");
    TokenizedCorpusIO(StringUtil * util, std::iostream & str, bool out, const char* wordBound = " ");

    KyteaSentence * readSentence();
    // conf defaults to 0.0
    void writeSentence(const KyteaSentence * sent, double conf = 0.0);

};

}

#endif
kytea_0.4.6+dfsg.orig/src/include/kytea/kytea.h0000644000175000017500000001406712122360536020721 0ustar koichikoichi
/*
 * Copyright 2009-2010, KyTea Development Team
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

#ifndef KYTEA_H__
#define KYTEA_H__

// NOTE(review): include targets and template argument lists appear to have
// been lost from this copy of the headers; those tokens are left as found.
#include
#include
#include

namespace kytea {

class KyteaTest;
class StringUtil;
class KyteaConfig;
template  class Dictionary;
class ModelTagEntry;
class ProbTagEntry;
class KyteaModel;
class KyteaLM;
class FeatureIO;

// a class representing the main analyzer
class Kytea {

private:
    friend class KyteaTest;

    typedef unsigned FeatureId;
    typedef std::vector Sentences;
    typedef std::vector< std::vector< FeatureId > > SentenceFeatures;

    StringUtil* util_;
    KyteaConfig* config_;   // set by the constructors
    Dictionary * dict_;
    Sentences sentences_;

    // Values for the word segmentation models
    KyteaModel* wsModel_;

    Dictionary* subwordDict_;
    std::vector subwordModels_;

    std::vector globalMods_;
    std::vector< std::vector > globalTags_;

    std::vector dictFeats_;
    std::vector charPrefixes_, typePrefixes_;

    FeatureIO* fio_;

public:

///////////////////////////////////////////////////////////////////
//                         API functions                         //
///////////////////////////////////////////////////////////////////

    // Read a model from the file fileName. Character encoding,
    //  settings, and other information will be read automatically.
    void readModel(const char* fileName);

    // Writes a model representing the current instance to the
    //  file fileName. The model will be of the type specified
    //  by the parameters in KyteaConfig
    void writeModel(const char* fileName);

    // Calculate the word segmentation for a sentence
    void calculateWS(KyteaSentence & sent);

    // Calculate the tags for a sentence
    void calculateTags(KyteaSentence & sent, int lev);

    // Calculate the unknown pronunciation for a single unknown word
    void calculateUnknownTag(KyteaWord & str, int lev);

    // Get the string utility class that allows you to map to/from
    //  KyTea's internal string representation (using
    //  mapString/showString)
    // (the utility is fetched from the configuration object)
    StringUtil* getStringUtil() { return config_->getStringUtil(); }

    // Get the configuration of this instance of KyTea
    KyteaConfig* getConfig() { return config_; }

    // These are available for convenience, and require you to set
    //  the appropriate settings in KyteaConfig first
    // "trainAll" performs full training of Kytea from start to finish
    void trainAll();

    // "analyze" loads models, and analyzes the full corpus input
    void analyze();

///////////////////////////////////////////////////////////////////
//                     Constructor/Destructor                    //
///////////////////////////////////////////////////////////////////

    void init();

    // The default constructor allocates a fresh KyteaConfig; the second one
    // stores the pointer it is given. Both then call init().
    // NOTE(review): whether ~Kytea() deletes config_ in both cases is not
    // visible here -- confirm before passing in a config you still own.
    Kytea() : config_(new KyteaConfig()) { init(); }
    Kytea(KyteaConfig * config) : config_(config) { init(); }

    ~Kytea();

    KyteaModel* getWSModel() { return wsModel_; }

    // Set the word segmentation model and take control of it
    // (the inline body only stores the pointer; a previously set model is
    //  not released here)
    void setWSModel(KyteaModel* model) { wsModel_ = model; }

    // Set the dictionary and take control of it
    template
    void setDictionary(Dictionary * dict);

///////////////////////////////////////////////////////////////////
// Functions used internally during Kytea training, testing etc. //
///////////////////////////////////////////////////////////////////

public:

    void checkEqual(const Kytea & rhs);

private:

    // functions to create dictionaries
    void buildVocabulary();

    // a function that checks to make sure that configuration is correct before
    // training
    void trainSanityCheck();

    // functions for word segmentation
    void trainWS();
    void preparePrefixes();
    unsigned wsDictionaryFeatures(const KyteaString & sent, SentenceFeatures & feat);
    unsigned wsNgramFeatures(const KyteaString & sent, SentenceFeatures & feat, const std::vector & prefixes, int n);

    // functions for tagging
    void trainLocalTags(int lev);
    void trainGlobalTags(int lev);
    unsigned tagNgramFeatures(const KyteaString & chars, std::vector & feat, const std::vector & prefixes, KyteaModel * model, int n, int sc, int ec);
    unsigned tagSelfFeatures(const KyteaString & self, std::vector & feat, const KyteaString & pref, KyteaModel * model);
    unsigned tagDictFeatures(const KyteaString & surf, int lev, std::vector & myFeats, KyteaModel * model);

    // Get matches of the dictionary for a single word in the form of
    // { ,  }
    // where x is the dictionary and y is the tag that exists in the dictionary
    std::vector > getDictionaryMatches(const KyteaString & str, int lev);

    template
    void addTag(typename Dictionary::WordMap& allWords, const KyteaString & word, int lev, const KyteaString * tag, int dict);
    template
    void addTag(typename Dictionary::WordMap& allWords, const KyteaString & word, const KyteaTag * tag, int dict);
    // saveIds defaults to true
    template
    void scanDictionaries(const std::vector & dict, typename Dictionary::WordMap & wordMap, KyteaConfig * config, StringUtil * util, bool saveIds = true);

    // functions for unknown word PE
    void trainUnk(int lev);
    void buildFeatureLookups();

    void analyzeInput();

    std::vector generateTagCandidates(const KyteaString & str, int lev);

};

}

#endif
kytea_0.4.6+dfsg.orig/src/include/kytea/kytea-struct.h0000644000175000017500000001306112122355536022240 0ustar koichikoichi
/*
 * Copyright 2009, KyTea
Development Team
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef KYTEA_STRUCT_H__
#define KYTEA_STRUCT_H__

// NOTE(review): include targets and template argument lists appear to have
// been lost from this copy of the headers; those tokens are left as found.
// #include
// #include
// #include
// #include
// #include
#include
#include
#include
#include

// maps for use with various classes
// GenericMap, StringMap and KyteaStringMap are thin wrappers whose base class
// is chosen by the configure-time HAVE_* macros, tried in this order:
// std::tr1::unordered_map (Boost header), std::tr1::unordered_map (compiler
// header), __gnu_cxx::hash_map, and finally std::map.
#ifdef HAVE_BOOST_TR1_UNORDERED_MAP_HPP
#   include
    template
    class GenericMap : public std::tr1::unordered_map { };
    template
    class StringMap : public std::tr1::unordered_map { };
    template
    class KyteaStringMap : public std::tr1::unordered_map { };
#elif HAVE_TR1_UNORDERED_MAP
#if _MSC_VER >=1600
#   include
#else
#   include
#endif
    template
    class GenericMap : public std::tr1::unordered_map { };
    template
    class StringMap : public std::tr1::unordered_map { };
    template
    class KyteaStringMap : public std::tr1::unordered_map { };
#elif HAVE_EXT_HASH_MAP
#   include
    namespace __gnu_cxx {
    // hash a std::string by hashing its c_str()
    template <>
    struct hash {
    size_t operator() (const std::string& x) const { return hash()(x.c_str()); }
    };
    }
    template
    class GenericMap : public __gnu_cxx::hash_map { };
    template
    class StringMap : public __gnu_cxx::hash_map { };
    template
    class KyteaStringMap : public __gnu_cxx::hash_map { };
#else
#   include
    template
    class GenericMap : public std::map { };
    template
    class StringMap : public std::map { };
    template
    class KyteaStringMap : public std::map { };
#endif

namespace kytea  {

// Map equality checking function
template
void checkMapEqual(const KyteaStringMap & a, const KyteaStringMap & b);

// KyteaTag
//  a single scored tag candidate
typedef std::pair KyteaTag;
// Ordering for tag candidates: the one with the larger `second` (score) comes
// first; when the scores tie, the one with the smaller `first` comes first.
inline bool operator<(const KyteaTag & a, const KyteaTag & b) {
    if(a.second < b.second) return false;
    if(b.second < a.second) return true;
    return a.first < b.first;
}

// KyteaWord
//  a single word, with multiple lists of candidates for each tag
class KyteaWord {
public:
    // A new word starts with certain boundaries, not unknown, and no tags
    KyteaWord(const KyteaString & s, const KyteaString & n) : surface(s), norm(n), isCertain(true), unknown(false) { }

    // The surface form of the word
    KyteaString surface;
    // The normalized form of the word used for calculating features
    KyteaString norm;
    // Each of its tags
    // (indexed first by tag level, then by candidate)
    std::vector< std::vector< KyteaTag > > tags;
    // Whether the word boundaries are certain
    bool isCertain;
    // Whether this is an unknown word
    bool unknown;

    // keep at most `lim` candidates at level `lev`; does nothing when the
    // level does not exist or already has `lim` or fewer candidates
    void limitTags(unsigned lev, unsigned lim) {
        if(tags.size() > lev && tags[lev].size() > lim)
            tags[lev].resize(lim);
    }
    // number of tag levels currently stored
    const int getNumTags() const { return tags.size(); }
    // first candidate at level `lev`, or 0 when the level is missing or empty
    const KyteaTag * getTag(int lev) const { return (lev<(int)tags.size()&&tags[lev].size()>0) ? &tags[lev][0] : 0; }
    // The next four accessors index `tags` without any bounds check: the
    // level must exist, and for getTagSurf/getTagConf/setTagConf it must also
    // hold at least one candidate. Use hasTag() or getTag() first when unsure.
    const std::vector< KyteaTag > & getTags(int lev) const { return tags[lev]; }
    const KyteaString & getTagSurf(int lev) const { return tags[lev][0].first; }
    double getTagConf(int lev) const { return tags[lev][0].second; }
    // make `tag` the only candidate at level `lev`, creating levels as needed
    void setTag(int lev, const KyteaTag & tag) {
        if(lev >= (int)tags.size()) tags.resize(lev+1);
        tags[lev].resize(1);
        tags[lev][0] = tag;
    }
    void setTagConf(int lev, double conf) { tags[lev][0].second = conf; }
    // remove every candidate at level `lev`, if that level exists
    void clearTags(int lev) { if((int)tags.size() > lev) tags[lev].clear(); }
    // append a candidate at level `lev`, creating levels as needed
    void addTag(int lev, const KyteaTag & tag) {
        if(lev >= (int)tags.size()) tags.resize(lev+1);
        tags[lev].push_back(tag);
    }
    void setUnknown(bool val) { unknown = val; }
    bool getUnknown() const { return unknown; }
    // true when level `lev` exists and holds at least one candidate
    bool hasTag(int lev) const { return (int)tags.size() > lev && tags[lev].size() > 0; }

};

// KyteaSentence
//  contains a single sentence with multiple words
class KyteaSentence {

public:

    typedef std::vector Words;
    typedef std::vector Floats;

    // the original raw string
    KyteaString surface;
    KyteaString norm;
    Floats wsConfs;

    // the string of words
    Words words;

    // constructors
    KyteaSentence() : surface(), wsConfs(0) {
    }
    // wsConfs gets one zero entry per position between adjacent characters,
    // i.e. length-1 entries, and none for an empty string (max(length,1)-1)
    KyteaSentence(const KyteaString & str, const KyteaString & norm_str) : surface(str), norm(norm_str), wsConfs(std::max(str.length(),(unsigned)1)-1,0) {
    }

    void refreshWS(double confidence);

};

}

typedef StringMap StringCharMap;
typedef KyteaStringMap KyteaUnsignedMap;
typedef KyteaStringMap   KyteaDoubleMap;
typedef KyteaStringMap > TwoCountHash;

#endif
kytea_0.4.6+dfsg.orig/src/include/kytea/dictionary.h0000644000175000017500000001170112122355536021745 0ustar koichikoichi
/*
 * Copyright 2009, KyTea Development Team
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ #ifndef KYTEA_DICTIONARY_H_ #define KYTEA_DICTIONARY_H_ #include #include // #include #include #include namespace kytea { class KyteaModel; class KyteaString; class TagEntry { public: TagEntry(const KyteaString & str) : word(str), tags(), inDict(0) { } virtual ~TagEntry() { } KyteaString word; std::vector< std::vector > tags; std::vector< std::vector > tagInDicts; unsigned char inDict; virtual void setNumTags(int i) { tags.resize(i); tagInDicts.resize(i); } // check if this is in the dictionary inline static bool isInDict(unsigned char in, unsigned char test) { return (1 << test) & in; } inline bool isInDict(unsigned char test) const { return isInDict(inDict,test); } // add this to the dictionary inline static void setInDict(unsigned char & in, unsigned char test) { in |= (1 << test); } inline void setInDict(char test) { setInDict(inDict,test); } }; class ModelTagEntry : public TagEntry { public: ModelTagEntry(const KyteaString & str) : TagEntry(str) { } ~ModelTagEntry(); void setNumTags(int i) { TagEntry::setNumTags(i); tagMods.resize(i,0); } std::vector tagMods; }; class ProbTagEntry : public TagEntry { public: ProbTagEntry(const KyteaString & str) : TagEntry(str), probs() { } ~ProbTagEntry() { } double incrementProb(const KyteaString & str, int lev); void setNumTags(int i) { TagEntry::setNumTags(i); probs.resize(i); } std::vector< std::vector< double > > probs; }; class DictionaryState { public: DictionaryState() : failure(0), gotos(), output(), isBranch(false) { } typedef std::vector< std::pair< KyteaChar, unsigned> > Gotos; unsigned failure; Gotos gotos; std::vector< unsigned > output; bool isBranch; inline unsigned step(KyteaChar input) { Gotos::const_iterator l=gotos.begin(), r=gotos.end(), m; KyteaChar check; while(r != l) { m = l+std::distance(l,r)/2; check = m->first; if(inputcheck) l=m+1; else return m->second; } return 0; } }; // a dictionary that uses a FA tree and the Aho-Corasick algorithm for search // Aho-Corasick "Efficient String Matching: An 
Aid to Bibliographic Search" template class Dictionary { public: typedef std::map WordMap; typedef typename WordMap::const_iterator wm_const_iterator; // A result of dictionary matching, containing pairs of the ending point // and the entry typedef std::vector< std::pair > MatchResult; private: StringUtil * util_; std::vector states_; std::vector entries_; unsigned char numDicts_; // std::string space(unsigned lev) { // std::ostringstream oss; // while(lev-- > 0) // oss << " "; // return oss.str(); // } // Build the goto and failures for the Aho-Corasick method void buildGoto(wm_const_iterator start, wm_const_iterator end, unsigned lev, unsigned nid); void buildFailures(); public: Dictionary(StringUtil * util) : util_(util), numDicts_(0) { }; void clearData(); ~Dictionary() { clearData(); }; void buildIndex(const WordMap & input); void print(); const Entry * findEntry(KyteaString str) const; Entry * findEntry(KyteaString str); unsigned getTagID(KyteaString str, KyteaString tag, int lev); MatchResult match( const KyteaString & chars ) const; std::vector & getEntries() { return entries_; } std::vector & getStates() { return states_; } const std::vector & getEntries() const { return entries_; } const std::vector & getStates() const { return states_; } unsigned char getNumDicts() const { return numDicts_; } void setNumDicts(unsigned char numDicts) { numDicts_ = numDicts; } // This is only a light check to make sure the number of states // and entries are identical for now, if necessary expand to check // the values as well void checkEqual(const Dictionary & rhs) const; }; } #endif kytea_0.4.6+dfsg.orig/src/include/kytea/corpus-io-eda.h0000644000175000017500000000104612122356565022253 0ustar koichikoichi#ifndef CORPUS_IO_EDA_H__ #define CORPUS_IO_EDA_H__ #include namespace kytea { class EdaCorpusIO : public CorpusIO { public: EdaCorpusIO(StringUtil * util); EdaCorpusIO(const CorpusIO & c); EdaCorpusIO(StringUtil * util, const char* file, bool out); 
EdaCorpusIO(StringUtil * util, std::iostream & str, bool out); KyteaSentence * readSentence(); void writeSentence(const KyteaSentence * sent, double conf = 0.0); protected: // The ID of the last sentence printed int id_; }; } #endif kytea_0.4.6+dfsg.orig/src/include/kytea/kytea-string.h0000644000175000017500000001156512122355536022231 0ustar koichikoichi/* * Copyright 2009, KyTea Development Team * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ // Turn this on to make KyTea do boundary checking (useful for development) // #define KYTEA_SAFE #ifndef KYTEA_STRING_H__ #define KYTEA_STRING_H__ #include #include // #include // #include // #include // #include namespace kytea { typedef unsigned short KyteaChar; class StringUtil; // an implementation of a string, kept in memory class KyteaStringImpl { public: unsigned length_; unsigned count_; KyteaChar* chars_; KyteaStringImpl() : length_(0), count_(1), chars_(0) { } KyteaStringImpl(unsigned length); KyteaStringImpl(const KyteaStringImpl & impl); ~KyteaStringImpl() { if(chars_) delete [] chars_; } unsigned dec() { return --count_; } unsigned inc() { return ++count_; } }; class KyteaString { public: friend class StringUtil; private: KyteaStringImpl* impl_; public: typedef std::vector Tokens; // ctor KyteaString() : impl_(0) { } KyteaString(const KyteaString & str) { impl_ = str.impl_; if(impl_) impl_->inc(); } KyteaString(unsigned length) { impl_ = new KyteaStringImpl(length); } // dtor ~KyteaString() { if(impl_ && !impl_->dec()) 
delete impl_; } // tokenize the string using the characters in the delimiter string Tokens tokenize(const KyteaString & delim, bool includeDelim = false) const; // splice a string into the appropriate location void splice(const KyteaString& str, unsigned pos); // Get the substring of a KyteaString KyteaString substr(unsigned s) const; KyteaString substr(unsigned s, unsigned l) const; inline KyteaChar & operator[](int i) { #ifdef KYTEA_SAFE if(impl_ == 0 || i < 0 || (unsigned)i >= impl_->length_) throw std::runtime_error("string index out of bounds"); #endif return getImpl()->chars_[i]; } inline const KyteaChar & operator[](int i) const { #ifdef KYTEA_SAFE if(impl_ == 0 || i < 0 || (unsigned)i >= impl_->length_) throw std::runtime_error("string index out of bounds"); #endif return impl_->chars_[i]; } KyteaString & operator= (const KyteaString &str) { if(impl_ && !impl_->dec()) delete impl_; impl_ = str.impl_; if(impl_) impl_->inc(); return *this; } inline unsigned length() const { return (impl_?impl_->length_:0); } // Get the hash for this value size_t getHash() const; // Get the implementation inline const KyteaStringImpl * getImpl() const { return impl_; } KyteaStringImpl * getImpl(); // Find if it begins with a particular string bool beginsWith(const KyteaString & s) const; }; inline KyteaString operator+(const KyteaString& a, const KyteaChar& b) { const KyteaStringImpl * aimp = a.getImpl(); if(aimp == 0) { KyteaString ret(1); ret[0] = b; return ret; } KyteaString ret(aimp->length_+1); ret.splice(a,0); ret[aimp->length_]=b; return ret; } inline KyteaString operator+(const KyteaString& a, const KyteaString& b) { const KyteaStringImpl * aimp = a.getImpl(); if(aimp == 0) return b; const KyteaStringImpl * bimp = b.getImpl(); if(bimp == 0) return a; KyteaString ret(aimp->length_+bimp->length_); ret.splice(a,0); ret.splice(b,aimp->length_); return ret; } inline bool operator<(const KyteaString & a, const KyteaString & b) { unsigned i; const unsigned al = a.length(), bl 
= b.length(), ml=std::min(al,bl); for(i = 0; i < ml; i++) { if(a[i] < b[i]) return true; else if(b[i] < a[i]) return false; } return (bl != i); } inline bool operator==(const KyteaString & a, const KyteaString & b) { unsigned i; const unsigned al = a.length(); if(al!=b.length()) return false; for(i = 0; i < al; i++) if(a[i] != b[i]) return false; return true; } inline bool operator!=(const KyteaString & a, const KyteaString & b) { return !(a==b); } // hashing using the djb2 algorithm // found at // http://www.cse.yorku.ca/~oz/hash.html class KyteaStringHash { public: size_t operator()(const KyteaString & x) const { return x.getHash(); } }; } #endif kytea_0.4.6+dfsg.orig/src/include/kytea/corpus-io.h0000644000175000017500000000476412133240274021525 0ustar koichikoichi/* * Copyright 2009, KyTea Development Team * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef CORPUS_IO_H__ #define CORPUS_IO_H__ namespace kytea { class CorpusIO; const static char CORP_FORMAT_RAW = 0; const static char CORP_FORMAT_FULL = 1; const static char CORP_FORMAT_PART = 2; const static char CORP_FORMAT_PROB = 3; const static char CORP_FORMAT_TOK = 4; const static char CORP_FORMAT_DEFAULT = 5; const static char CORP_FORMAT_EDA = 6; const static char CORP_FORMAT_TAGS = 7; } #include #include // #include // #include namespace kytea { // Forward declarations class KyteaConfig; class StringUtil; class KyteaSentence; class CorpusIO : public GeneralIO { protected: std::string unkTag_; int numTags_; std::vector doTag_; public: typedef char Format; CorpusIO(StringUtil * util) : GeneralIO(util), unkTag_(), numTags_(0), doTag_() { } CorpusIO(StringUtil * util, const char* file, bool out) : GeneralIO(util,file,out,false), numTags_(0), doTag_() { } CorpusIO(StringUtil * util, std::iostream & str, bool out) : GeneralIO(util,str,out,false), numTags_(0), doTag_() { } int getNumTags() { return numTags_; } void setNumTags(int numTags) { numTags_ = numTags; } void setDoTag(int i, bool v) { if(i >= (int)doTag_.size()) doTag_.resize(i+1,true); doTag_[i] = v; } bool getDoTag(int i) { return i >= (int)doTag_.size() || doTag_[i]; } virtual ~CorpusIO() { } // create an appropriate parser based on the type static CorpusIO* createIO(const char* file, Format form, const KyteaConfig & conf, bool output, StringUtil* util); static CorpusIO* createIO(std::iostream & str, Format form, const KyteaConfig & conf, bool output, StringUtil* util); virtual KyteaSentence * readSentence() = 0; virtual void writeSentence(const KyteaSentence * sent, double conf = 0.0) = 0; void setUnkTag(const std::string & tag) { unkTag_ = tag; } }; } #endif kytea_0.4.6+dfsg.orig/src/include/kytea/corpus-io-part.h0000644000175000017500000000344312122356565022473 0ustar koichikoichi#ifndef CORPUS_IO_PART_H__ #define CORPUS_IO_PART_H__ #include #include namespace kytea { class PartCorpusIO : public 
CorpusIO { private: KyteaString bounds_; public: // PartCorpusIO ctr // util: the string utility to use // unkBound: the delimiter for when the bound is unannotated // skipBound: the delimiter for when annotation of a bound has been skipped // noBound: the delimiter for when no bound exists // hasBound: the delimiter for when a boundary exists // tagBound: the delimiter for when a boundary exists // elemBound: the delimiter for when a boundary exists // escape: the escape character PartCorpusIO(StringUtil * util, const char* unkBound = " ", const char* skipBound = "?", const char* noBound = "-", const char* hasBound = "|", const char* tagBound = "/", const char* elemBound = "&", const char* escape = "\\"); PartCorpusIO(const CorpusIO & c, const char* unkBound = " ", const char* skipBound = "?", const char* noBound = "-", const char* hasBound = "|", const char* tagBound = "/", const char* elemBound = "&", const char* escape = "\\"); PartCorpusIO(StringUtil * util, std::iostream & str, bool out, const char* unkBound = " ", const char* skipBound = "?", const char* noBound = "-", const char* hasBound = "|", const char* tagBound = "/", const char* elemBound = "&", const char* escape = "\\"); PartCorpusIO(StringUtil * util, const char* file, bool out, const char* unkBound = " ", const char* skipBound = "?", const char* noBound = "-", const char* hasBound = "|", const char* tagBound = "/", const char* elemBound = "&", const char* escape = "\\"); KyteaSentence * readSentence(); void writeSentence(const KyteaSentence * sent, double conf = 0.0); }; } #endif kytea_0.4.6+dfsg.orig/src/include/Makefile.am0000644000175000017500000000125212122355536020346 0ustar koichikoichinobase_include_HEADERS = kytea/config.h \ kytea/corpus-io.h \ kytea/corpus-io-eda.h \ kytea/corpus-io-full.h \ kytea/corpus-io-part.h \ kytea/corpus-io-prob.h \ kytea/corpus-io-raw.h \ kytea/corpus-io-tokenized.h \ kytea/dictionary.h \ kytea/feature-io.h \ kytea/feature-lookup.h \ kytea/feature-vector.h \ 
kytea/general-io.h \ kytea/kytea-config.h \ kytea/kytea.h \ kytea/kytea-lm.h \ kytea/kytea-model.h \ kytea/kytea-string.h \ kytea/kytea-struct.h \ kytea/kytea-util.h \ kytea/model-io.h \ kytea/model-io-binary.h \ kytea/model-io-text.h \ kytea/string-util.h \ kytea/string-util-map-euc.h \ kytea/string-util-map-sjis.h \ kytea/string-util-map-utf8.h kytea_0.4.6+dfsg.orig/src/Makefile.am0000644000175000017500000000004312122355536016720 0ustar koichikoichiSUBDIRS = include lib bin api test kytea_0.4.6+dfsg.orig/src/lib/0000755000175000017500000000000012205657353015441 5ustar koichikoichikytea_0.4.6+dfsg.orig/src/lib/kytea-struct.cpp0000644000175000017500000000401712122355536020602 0ustar koichikoichi#include #include using namespace kytea; namespace kytea { // Map equality checking function template void checkMapEqual(const KyteaStringMap & a, const KyteaStringMap & b) { if(a.size() != b.size()) THROW_ERROR("checkMapEqual a.size() != b.size() ("<::const_iterator ait = a.begin(); ait != a.end(); ait++) { typename KyteaStringMap::const_iterator bit = b.find(ait->first); if(bit == b.end() || ait->second != bit->second) THROW_ERROR("Values don't match in map"); } } template void checkMapEqual(const KyteaStringMap & a, const KyteaStringMap & b); template void checkMapEqual(const KyteaStringMap & a, const KyteaStringMap & b); } void KyteaSentence::refreshWS(double confidence) { Words newWords; // In order to keep track of new words, use the start and end int nextWord = 0, nextEnd = 0, nextStart = -1; if(surface.length() != 0) { int last = 0, i; for(i = 0; i <= (int)wsConfs.size(); i++) { double myConf = (i == (int)wsConfs.size()) ? 
100.0 : wsConfs[i]; if(myConf > confidence) { // Catch up to the current word while(nextWord < (int)words.size() && nextEnd < i+1) { nextStart = nextEnd; nextEnd += words[nextWord].surface.length(); nextWord++; } // If both the beginning and end match, use the current word if(last == nextStart && i+1 == nextEnd) newWords.push_back(words[nextWord-1]); else { KyteaWord w(surface.substr(last, i-last+1), norm.substr(last, i-last+1)); newWords.push_back(w); } // Update the start of the next word last = i+1; } } } words = newWords; } kytea_0.4.6+dfsg.orig/src/lib/Makefile.in0000644000175000017500000005754412133240157017513 0ustar koichikoichi# Makefile.in generated by automake 1.11.1 from Makefile.am. # @configure_input@ # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, # 2003, 2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, # Inc. # This Makefile.in is free software; the Free Software Foundation # gives unlimited permission to copy and/or distribute it, # with or without modifications, as long as this notice is preserved. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY, to the extent permitted by law; without # even the implied warranty of MERCHANTABILITY or FITNESS FOR A # PARTICULAR PURPOSE. 
@SET_MAKE@ VPATH = @srcdir@ pkgdatadir = $(datadir)/@PACKAGE@ pkgincludedir = $(includedir)/@PACKAGE@ pkglibdir = $(libdir)/@PACKAGE@ pkglibexecdir = $(libexecdir)/@PACKAGE@ am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd install_sh_DATA = $(install_sh) -c -m 644 install_sh_PROGRAM = $(install_sh) -c install_sh_SCRIPT = $(install_sh) -c INSTALL_HEADER = $(INSTALL_DATA) transform = $(program_transform_name) NORMAL_INSTALL = : PRE_INSTALL = : POST_INSTALL = : NORMAL_UNINSTALL = : PRE_UNINSTALL = : POST_UNINSTALL = : build_triplet = @build@ host_triplet = @host@ subdir = src/lib DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 am__aclocal_m4_deps = $(top_srcdir)/configure.ac am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ $(ACLOCAL_M4) mkinstalldirs = $(install_sh) -d CONFIG_HEADER = $(top_builddir)/src/include/kytea/config.h CONFIG_CLEAN_FILES = CONFIG_CLEAN_VPATH_FILES = am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; am__vpath_adj = case $$p in \ $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ *) f=$$p;; \ esac; am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; am__install_max = 40 am__nobase_strip_setup = \ srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` am__nobase_strip = \ for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" am__nobase_list = $(am__nobase_strip_setup); \ for p in $$list; do echo "$$p $$p"; done | \ sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ if (++n[$$2] == $(am__install_max)) \ { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ END { for (dir in files) print dir, files[dir] }' am__base_list = \ sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' am__installdirs = "$(DESTDIR)$(libdir)" LTLIBRARIES = $(lib_LTLIBRARIES) libkytea_la_DEPENDENCIES = $(LLLIBS) 
am__objects_1 = kytea.lo general-io.lo corpus-io-prob.lo \ corpus-io-eda.lo corpus-io-full.lo corpus-io-part.lo \ corpus-io-tokenized.lo corpus-io-raw.lo corpus-io.lo \ model-io.lo string-util.lo kytea-model.lo kytea-config.lo \ kytea-lm.lo feature-io.lo dictionary.lo feature-lookup.lo \ kytea-util.lo kytea-string.lo kytea-struct.lo am_libkytea_la_OBJECTS = $(am__objects_1) libkytea_la_OBJECTS = $(am_libkytea_la_OBJECTS) libkytea_la_LINK = $(LIBTOOL) --tag=CXX $(AM_LIBTOOLFLAGS) \ $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \ $(CXXFLAGS) $(libkytea_la_LDFLAGS) $(LDFLAGS) -o $@ DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)/src/include/kytea depcomp = $(SHELL) $(top_srcdir)/depcomp am__depfiles_maybe = depfiles am__mv = mv -f CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) LTCXXCOMPILE = $(LIBTOOL) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) CXXLD = $(CXX) CXXLINK = $(LIBTOOL) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ --mode=link $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \ $(LDFLAGS) -o $@ SOURCES = $(libkytea_la_SOURCES) DIST_SOURCES = $(libkytea_la_SOURCES) RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \ html-recursive info-recursive install-data-recursive \ install-dvi-recursive install-exec-recursive \ install-html-recursive install-info-recursive \ install-pdf-recursive install-ps-recursive install-recursive \ installcheck-recursive installdirs-recursive pdf-recursive \ ps-recursive uninstall-recursive RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \ distclean-recursive maintainer-clean-recursive AM_RECURSIVE_TARGETS = $(RECURSIVE_TARGETS:-recursive=) \ $(RECURSIVE_CLEAN_TARGETS:-recursive=) tags TAGS ctags CTAGS \ distdir ETAGS = etags CTAGS = ctags DIST_SUBDIRS = $(SUBDIRS) DISTFILES = $(DIST_COMMON) 
$(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) am__relativize = \ dir0=`pwd`; \ sed_first='s,^\([^/]*\)/.*$$,\1,'; \ sed_rest='s,^[^/]*/*,,'; \ sed_last='s,^.*/\([^/]*\)$$,\1,'; \ sed_butlast='s,/*[^/]*$$,,'; \ while test -n "$$dir1"; do \ first=`echo "$$dir1" | sed -e "$$sed_first"`; \ if test "$$first" != "."; then \ if test "$$first" = ".."; then \ dir2=`echo "$$dir0" | sed -e "$$sed_last"`/"$$dir2"; \ dir0=`echo "$$dir0" | sed -e "$$sed_butlast"`; \ else \ first2=`echo "$$dir2" | sed -e "$$sed_first"`; \ if test "$$first2" = "$$first"; then \ dir2=`echo "$$dir2" | sed -e "$$sed_rest"`; \ else \ dir2="../$$dir2"; \ fi; \ dir0="$$dir0"/"$$first"; \ fi; \ fi; \ dir1=`echo "$$dir1" | sed -e "$$sed_rest"`; \ done; \ reldir="$$dir2" ACLOCAL = @ACLOCAL@ AMTAR = @AMTAR@ AR = @AR@ AUTOCONF = @AUTOCONF@ AUTOHEADER = @AUTOHEADER@ AUTOMAKE = @AUTOMAKE@ AWK = @AWK@ CC = @CC@ CCDEPMODE = @CCDEPMODE@ CFLAGS = @CFLAGS@ CPP = @CPP@ CPPFLAGS = @CPPFLAGS@ CXX = @CXX@ CXXCPP = @CXXCPP@ CXXDEPMODE = @CXXDEPMODE@ CXXFLAGS = @CXXFLAGS@ CYGPATH_W = @CYGPATH_W@ DEFS = @DEFS@ DEPDIR = @DEPDIR@ DLLTOOL = @DLLTOOL@ DSYMUTIL = @DSYMUTIL@ DUMPBIN = @DUMPBIN@ ECHO_C = @ECHO_C@ ECHO_N = @ECHO_N@ ECHO_T = @ECHO_T@ EGREP = @EGREP@ EXEEXT = @EXEEXT@ FGREP = @FGREP@ GREP = @GREP@ INSTALL = @INSTALL@ INSTALL_DATA = @INSTALL_DATA@ INSTALL_PROGRAM = @INSTALL_PROGRAM@ INSTALL_SCRIPT = @INSTALL_SCRIPT@ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ LD = @LD@ LDFLAGS = @LDFLAGS@ LIBOBJS = @LIBOBJS@ LIBS = @LIBS@ LIBTOOL = @LIBTOOL@ LIPO = @LIPO@ LN_S = @LN_S@ LTLIBOBJS = @LTLIBOBJS@ MAKEINFO = @MAKEINFO@ MANIFEST_TOOL = @MANIFEST_TOOL@ MKDIR_P = @MKDIR_P@ NM = @NM@ NMEDIT = @NMEDIT@ OBJDUMP = @OBJDUMP@ OBJEXT = @OBJEXT@ OTOOL = @OTOOL@ OTOOL64 = @OTOOL64@ PACKAGE = @PACKAGE@ PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ PACKAGE_NAME = @PACKAGE_NAME@ PACKAGE_STRING = @PACKAGE_STRING@ PACKAGE_TARNAME = @PACKAGE_TARNAME@ PACKAGE_URL = @PACKAGE_URL@ PACKAGE_VERSION = @PACKAGE_VERSION@ PATH_SEPARATOR = 
@PATH_SEPARATOR@ RANLIB = @RANLIB@ SED = @SED@ SET_MAKE = @SET_MAKE@ SHELL = @SHELL@ STRIP = @STRIP@ VERSION = @VERSION@ abs_builddir = @abs_builddir@ abs_srcdir = @abs_srcdir@ abs_top_builddir = @abs_top_builddir@ abs_top_srcdir = @abs_top_srcdir@ ac_ct_AR = @ac_ct_AR@ ac_ct_CC = @ac_ct_CC@ ac_ct_CXX = @ac_ct_CXX@ ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ am__include = @am__include@ am__leading_dot = @am__leading_dot@ am__quote = @am__quote@ am__tar = @am__tar@ am__untar = @am__untar@ bindir = @bindir@ build = @build@ build_alias = @build_alias@ build_cpu = @build_cpu@ build_os = @build_os@ build_vendor = @build_vendor@ builddir = @builddir@ datadir = @datadir@ datarootdir = @datarootdir@ docdir = @docdir@ dvidir = @dvidir@ exec_prefix = @exec_prefix@ host = @host@ host_alias = @host_alias@ host_cpu = @host_cpu@ host_os = @host_os@ host_vendor = @host_vendor@ htmldir = @htmldir@ includedir = @includedir@ infodir = @infodir@ install_sh = @install_sh@ libdir = @libdir@ libexecdir = @libexecdir@ localedir = @localedir@ localstatedir = @localstatedir@ mandir = @mandir@ mkdir_p = @mkdir_p@ oldincludedir = @oldincludedir@ pdfdir = @pdfdir@ prefix = @prefix@ program_transform_name = @program_transform_name@ psdir = @psdir@ sbindir = @sbindir@ sharedstatedir = @sharedstatedir@ srcdir = @srcdir@ sysconfdir = @sysconfdir@ target_alias = @target_alias@ top_build_prefix = @top_build_prefix@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ LLLIBS = liblinear/liblinear.la KYTCPP = kytea.cpp general-io.cpp corpus-io-prob.cpp corpus-io-eda.cpp corpus-io-full.cpp corpus-io-part.cpp corpus-io-tokenized.cpp corpus-io-raw.cpp corpus-io.cpp model-io.cpp string-util.cpp kytea-model.cpp kytea-config.cpp kytea-lm.cpp feature-io.cpp dictionary.cpp feature-lookup.cpp kytea-util.cpp kytea-string.cpp kytea-struct.cpp # KYTH = kytea.h corpus-io.h model-io.h string-util.h \ # kytea-model.h kytea-string.h kytea-struct.h dictionary.h general-io.h \ # kytea-config.h AM_CPPFLAGS = 
-I$(srcdir)/../include -DPKGDATADIR='"$(pkgdatadir)"' SUBDIRS = liblinear lib_LTLIBRARIES = libkytea.la libkytea_la_SOURCES = ${KYTCPP} libkytea_la_LIBADD = ${LLLIBS} libkytea_la_LDFLAGS = -version-info 0:0:0 all: all-recursive .SUFFIXES: .SUFFIXES: .cpp .lo .o .obj $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) @for dep in $?; do \ case '$(am__configure_deps)' in \ *$$dep*) \ ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ && { if test -f $@; then exit 0; else break; fi; }; \ exit 1;; \ esac; \ done; \ echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu src/lib/Makefile'; \ $(am__cd) $(top_srcdir) && \ $(AUTOMAKE) --gnu src/lib/Makefile .PRECIOUS: Makefile Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status @case '$?' in \ *config.status*) \ cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ *) \ echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ esac; $(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh $(top_srcdir)/configure: $(am__configure_deps) cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh $(ACLOCAL_M4): $(am__aclocal_m4_deps) cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh $(am__aclocal_m4_deps): install-libLTLIBRARIES: $(lib_LTLIBRARIES) @$(NORMAL_INSTALL) test -z "$(libdir)" || $(MKDIR_P) "$(DESTDIR)$(libdir)" @list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \ list2=; for p in $$list; do \ if test -f $$p; then \ list2="$$list2 $$p"; \ else :; fi; \ done; \ test -z "$$list2" || { \ echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 '$(DESTDIR)$(libdir)'"; \ $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$list2 "$(DESTDIR)$(libdir)"; \ } 
uninstall-libLTLIBRARIES: @$(NORMAL_UNINSTALL) @list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \ for p in $$list; do \ $(am__strip_dir) \ echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(libdir)/$$f'"; \ $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(libdir)/$$f"; \ done clean-libLTLIBRARIES: -test -z "$(lib_LTLIBRARIES)" || rm -f $(lib_LTLIBRARIES) @list='$(lib_LTLIBRARIES)'; for p in $$list; do \ dir="`echo $$p | sed -e 's|/[^/]*$$||'`"; \ test "$$dir" != "$$p" || dir=.; \ echo "rm -f \"$${dir}/so_locations\""; \ rm -f "$${dir}/so_locations"; \ done libkytea.la: $(libkytea_la_OBJECTS) $(libkytea_la_DEPENDENCIES) $(libkytea_la_LINK) -rpath $(libdir) $(libkytea_la_OBJECTS) $(libkytea_la_LIBADD) $(LIBS) mostlyclean-compile: -rm -f *.$(OBJEXT) distclean-compile: -rm -f *.tab.c @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/corpus-io-eda.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/corpus-io-full.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/corpus-io-part.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/corpus-io-prob.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/corpus-io-raw.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/corpus-io-tokenized.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/corpus-io.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dictionary.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/feature-io.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/feature-lookup.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/general-io.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kytea-config.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kytea-lm.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kytea-model.Plo@am__quote@ @AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/kytea-string.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kytea-struct.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kytea-util.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/kytea.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/model-io.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/string-util.Plo@am__quote@ .cpp.o: @am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< @am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po @AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ $< .cpp.obj: @am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` @am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po @AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` .cpp.lo: @am__fastdepCXX_TRUE@ $(LTCXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< @am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo @AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(LTCXXCOMPILE) -c -o $@ $< mostlyclean-libtool: -rm -f *.lo clean-libtool: -rm -rf .libs _libs # This directory's subdirectories are mostly independent; you can cd # into them and run `make' without going through this Makefile. 
# To change the values of `make' variables: instead of editing Makefiles, # (1) if the variable is set in `config.status', edit `config.status' # (which will cause the Makefiles to be regenerated when you run `make'); # (2) otherwise, pass the desired values on the `make' command line. $(RECURSIVE_TARGETS): @fail= failcom='exit 1'; \ for f in x $$MAKEFLAGS; do \ case $$f in \ *=* | --[!k]*);; \ *k*) failcom='fail=yes';; \ esac; \ done; \ dot_seen=no; \ target=`echo $@ | sed s/-recursive//`; \ list='$(SUBDIRS)'; for subdir in $$list; do \ echo "Making $$target in $$subdir"; \ if test "$$subdir" = "."; then \ dot_seen=yes; \ local_target="$$target-am"; \ else \ local_target="$$target"; \ fi; \ ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ || eval $$failcom; \ done; \ if test "$$dot_seen" = "no"; then \ $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \ fi; test -z "$$fail" $(RECURSIVE_CLEAN_TARGETS): @fail= failcom='exit 1'; \ for f in x $$MAKEFLAGS; do \ case $$f in \ *=* | --[!k]*);; \ *k*) failcom='fail=yes';; \ esac; \ done; \ dot_seen=no; \ case "$@" in \ distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \ *) list='$(SUBDIRS)' ;; \ esac; \ rev=''; for subdir in $$list; do \ if test "$$subdir" = "."; then :; else \ rev="$$subdir $$rev"; \ fi; \ done; \ rev="$$rev ."; \ target=`echo $@ | sed s/-recursive//`; \ for subdir in $$rev; do \ echo "Making $$target in $$subdir"; \ if test "$$subdir" = "."; then \ local_target="$$target-am"; \ else \ local_target="$$target"; \ fi; \ ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ || eval $$failcom; \ done && test -z "$$fail" tags-recursive: list='$(SUBDIRS)'; for subdir in $$list; do \ test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \ done ctags-recursive: list='$(SUBDIRS)'; for subdir in $$list; do \ test "$$subdir" = . 
|| ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \ done ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ unique=`for i in $$list; do \ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ done | \ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ END { if (nonempty) { for (i in files) print i; }; }'`; \ mkid -fID $$unique tags: TAGS TAGS: tags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ $(TAGS_FILES) $(LISP) set x; \ here=`pwd`; \ if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \ include_option=--etags-include; \ empty_fix=.; \ else \ include_option=--include; \ empty_fix=; \ fi; \ list='$(SUBDIRS)'; for subdir in $$list; do \ if test "$$subdir" = .; then :; else \ test ! -f $$subdir/TAGS || \ set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \ fi; \ done; \ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ unique=`for i in $$list; do \ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ done | \ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ END { if (nonempty) { for (i in files) print i; }; }'`; \ shift; \ if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ test -n "$$unique" || unique=$$empty_fix; \ if test $$# -gt 0; then \ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ "$$@" $$unique; \ else \ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ $$unique; \ fi; \ fi ctags: CTAGS CTAGS: ctags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ $(TAGS_FILES) $(LISP) list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ unique=`for i in $$list; do \ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ done | \ $(AWK) '{ files[$$0] = 1; nonempty = 1; } \ END { if (nonempty) { for (i in files) print i; }; }'`; \ test -z "$(CTAGS_ARGS)$$unique" \ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ $$unique GTAGS: here=`$(am__cd) $(top_builddir) && pwd` \ && $(am__cd) $(top_srcdir) \ && gtags -i $(GTAGS_ARGS) "$$here" 
distclean-tags: -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags distdir: $(DISTFILES) @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ list='$(DISTFILES)'; \ dist_files=`for file in $$list; do echo $$file; done | \ sed -e "s|^$$srcdirstrip/||;t" \ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ case $$dist_files in \ */*) $(MKDIR_P) `echo "$$dist_files" | \ sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ sort -u` ;; \ esac; \ for file in $$dist_files; do \ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ if test -d $$d/$$file; then \ dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ if test -d "$(distdir)/$$file"; then \ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ fi; \ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ fi; \ cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ else \ test -f "$(distdir)/$$file" \ || cp -p $$d/$$file "$(distdir)/$$file" \ || exit 1; \ fi; \ done @list='$(DIST_SUBDIRS)'; for subdir in $$list; do \ if test "$$subdir" = .; then :; else \ test -d "$(distdir)/$$subdir" \ || $(MKDIR_P) "$(distdir)/$$subdir" \ || exit 1; \ fi; \ done @list='$(DIST_SUBDIRS)'; for subdir in $$list; do \ if test "$$subdir" = .; then :; else \ dir1=$$subdir; dir2="$(distdir)/$$subdir"; \ $(am__relativize); \ new_distdir=$$reldir; \ dir1=$$subdir; dir2="$(top_distdir)"; \ $(am__relativize); \ new_top_distdir=$$reldir; \ echo " (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir="$$new_top_distdir" distdir="$$new_distdir" \\"; \ echo " am__remove_distdir=: am__skip_length_check=: am__skip_mode_fix=: distdir)"; \ ($(am__cd) $$subdir && \ $(MAKE) $(AM_MAKEFLAGS) \ top_distdir="$$new_top_distdir" \ distdir="$$new_distdir" \ am__remove_distdir=: \ am__skip_length_check=: \ am__skip_mode_fix=: \ 
distdir) \ || exit 1; \ fi; \ done check-am: all-am check: check-recursive all-am: Makefile $(LTLIBRARIES) installdirs: installdirs-recursive installdirs-am: for dir in "$(DESTDIR)$(libdir)"; do \ test -z "$$dir" || $(MKDIR_P) "$$dir"; \ done install: install-recursive install-exec: install-exec-recursive install-data: install-data-recursive uninstall: uninstall-recursive install-am: all-am @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am installcheck: installcheck-recursive install-strip: $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ `test -z '$(STRIP)' || \ echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install mostlyclean-generic: clean-generic: distclean-generic: -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) maintainer-clean-generic: @echo "This command is intended for maintainers to use" @echo "it deletes files that may require special tools to rebuild." 
clean: clean-recursive clean-am: clean-generic clean-libLTLIBRARIES clean-libtool \ mostlyclean-am distclean: distclean-recursive -rm -rf ./$(DEPDIR) -rm -f Makefile distclean-am: clean-am distclean-compile distclean-generic \ distclean-tags dvi: dvi-recursive dvi-am: html: html-recursive html-am: info: info-recursive info-am: install-data-am: install-dvi: install-dvi-recursive install-dvi-am: install-exec-am: install-libLTLIBRARIES install-html: install-html-recursive install-html-am: install-info: install-info-recursive install-info-am: install-man: install-pdf: install-pdf-recursive install-pdf-am: install-ps: install-ps-recursive install-ps-am: installcheck-am: maintainer-clean: maintainer-clean-recursive -rm -rf ./$(DEPDIR) -rm -f Makefile maintainer-clean-am: distclean-am maintainer-clean-generic mostlyclean: mostlyclean-recursive mostlyclean-am: mostlyclean-compile mostlyclean-generic \ mostlyclean-libtool pdf: pdf-recursive pdf-am: ps: ps-recursive ps-am: uninstall-am: uninstall-libLTLIBRARIES .MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) ctags-recursive \ install-am install-strip tags-recursive .PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \ all all-am check check-am clean clean-generic \ clean-libLTLIBRARIES clean-libtool ctags ctags-recursive \ distclean distclean-compile distclean-generic \ distclean-libtool distclean-tags distdir dvi dvi-am html \ html-am info info-am install install-am install-data \ install-data-am install-dvi install-dvi-am install-exec \ install-exec-am install-html install-html-am install-info \ install-info-am install-libLTLIBRARIES install-man install-pdf \ install-pdf-am install-ps install-ps-am install-strip \ installcheck installcheck-am installdirs installdirs-am \ maintainer-clean maintainer-clean-generic mostlyclean \ mostlyclean-compile mostlyclean-generic mostlyclean-libtool \ pdf pdf-am ps ps-am tags tags-recursive uninstall uninstall-am \ uninstall-libLTLIBRARIES # Tell versions 
[3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. .NOEXPORT: kytea_0.4.6+dfsg.orig/src/lib/corpus-io-eda.cpp0000644000175000017500000000266312122355536020617 0ustar koichikoichi#include #include #include #include #include #include "config.h" #define PROB_TRUE 100.0 #define PROB_FALSE -100.0 #define PROB_UNKNOWN 0.0 using namespace kytea; using namespace std; KyteaSentence * EdaCorpusIO::readSentence() { THROW_ERROR("Using EDA format for input is not currently supported"); return NULL; } void EdaCorpusIO::writeSentence(const KyteaSentence * sent, double conf) { *str_ << "ID=" << ++id_ << endl; for(unsigned i = 0; i < sent->words.size(); i++) { const KyteaWord & w = sent->words[i]; // Find the POS tag string tag = "UNK"; if(w.getNumTags() >= 1) { const vector< KyteaTag > & tags = w.getTags(0); if(tags.size() > 0) tag = util_->showString(tags[0].first); } // Print *str_ << i+1 << " " << i+2 << " " << util_->showString(w.surface) << " " << tag << " 0" << endl; } *str_ << endl; } EdaCorpusIO::EdaCorpusIO(StringUtil * util) : CorpusIO(util), id_(0) { } EdaCorpusIO::EdaCorpusIO(const CorpusIO & c) : CorpusIO(c), id_(0) { } EdaCorpusIO::EdaCorpusIO(StringUtil * util, const char* file, bool out) : CorpusIO(util,file,out), id_(0) { } EdaCorpusIO::EdaCorpusIO(StringUtil * util, std::iostream & str, bool out) : CorpusIO(util,str,out), id_(0) { } kytea_0.4.6+dfsg.orig/src/lib/kytea-model.cpp0000644000175000017500000003760412122355536020366 0ustar koichikoichi#include #include #include #include #include #include "liblinear/linear.h" #include #include #include #include using namespace kytea; using namespace std; #define SIG_CUTOFF 1E-6 #define SHORT_MAX 32767 int KyteaModel::featuresAdded_ = 0; template class secondmore { public: bool operator() (const pair & a, const pair & b) { return b.second < a.second; } }; // note: this is not safe, all features must be within the appropriate range vector< pair > 
KyteaModel::runClassifier(const vector & feat) { int i, j, featSize = feat.size(); FeatSum dec; vector< pair > ret(labels_.size()); // for binary predictors if(numW_ == 1) { dec = (bias_>=0?getWeight(getBiasId()-1,0):0); for(i = 0; i < featSize; i++) dec += getWeight(feat[i]-1,0); // linear regression is probabilities double big = abs(dec)*multiplier_, small = 0; if(isProbabilistic(solver_)) { big = 1/(1+exp(-1*big)); small = 1-big; } if(dec > 0) { ret[0] = pair(labels_[0],big); ret[1] = pair(labels_[1],small); } else { ret[0] = pair(labels_[1],big); ret[1] = pair(labels_[0],small); } } // for non-binary predictors else { double sum = 0, max1 = -100000, max2 = -100000, weight; for(j = 0; j < numW_; j++) { dec = (bias_>=0?getWeight(getBiasId()-1,j):0); for(i = 0; i < featSize; i++) { dec += getWeight(feat[i]-1,j); } weight = dec*multiplier_; // get probability for LR if(isProbabilistic(solver_)) { weight = 1/(1+exp(-1*weight)); sum += weight; } // save the top two values for SVM else if(weight > max1) { max2 = max1; max1 = weight; } else if(weight > max2) max2 = weight; ret[j] = pair(labels_[j], weight); } if(isProbabilistic(solver_)) for(j = 0; j < numW_; j++) ret[j].second /= sum; else for(j = 0; j < numW_; j++) ret[j].second -= max2; sort(ret.begin(),ret.end(),secondmore()); } return ret; } // note: this is not safe, all features must be within the appropriate range void KyteaModel::printClassifier(const vector & feat, StringUtil * util, ostream & out) { int i, j, featSize = feat.size(); FeatSum weight, tot; vector< pair > idxs; vector sums(numW_,0); // for binary predictors if(numW_ == 1) { if(bias_>=0) { sums[0] = getWeight(getBiasId()-1,0); ostringstream buff; buff << "BIAS=" << sums[0]; idxs.push_back(pair(buff.str(),abs(sums[0]))); } for(i = 0; i < featSize; i++) { weight = getWeight(feat[i]-1,0); ostringstream buff; buff << util->showString(showFeat(feat[i])) << "=" << weight; idxs.push_back(pair(buff.str(),abs(weight))); sums[0] += weight; } } // for 
non-binary predictors else { if(bias_>=0) { tot = 0; ostringstream buff; buff << "BIAS="; for(j = 0; j < numW_; j++) { weight = getWeight(getBiasId()-1,j); sums[j] += weight; tot += abs(weight); if(j != 0) buff << "/"; buff << weight; } idxs.push_back(pair(buff.str(),tot)); } for(i = 0; i < featSize; i++) { tot = 0; ostringstream buff; buff << util->showString(showFeat(feat[i])) << "="; for(j = 0; j < numW_; j++) { weight = getWeight(feat[i]-1,j); sums[j] += weight; tot += abs(weight); if(j != 0) buff << "/"; buff << weight; } idxs.push_back(pair(buff.str(),tot)); } } sort(idxs.begin(),idxs.end(),secondmore()); for(i = 0; i < (int)idxs.size(); i++) { if(i != 0) out << " "; out << idxs[i].first; } out << " --- TOTAL="; for(i = 0; i < numW_; i++) { if(i != 0) out << "/"; out << sums[i]; } } // allocate a LL feature node feature_node * allocateFeatures(const vector & feats, int biasId, double biasVal) { feature_node * nodes = (feature_node*)malloc((feats.size()+(biasVal>=0?2:1))*sizeof(feature_node)); unsigned i; for(i = 0; i < feats.size(); i++) { nodes[i].index=feats[i]; nodes[i].value = 1; } if(biasVal >= 0) { nodes[i].index = biasId; nodes[i++].value = biasVal; } nodes[i].index = -1; return nodes; } // train the model void KyteaModel::trainModel(const vector< vector > & xs, vector & ys, double bias, int solver, double epsilon, double cost) { if(xs.size() == 0) return; solver_ = solver; if(weights_.size()>0) weights_.clear(); setBias(bias); // build the liblinear model struct problem prob; struct parameter param; prob.l = xs.size(); // for(int i = 0; i < min(5,(int)xs.size()); i++) { // cerr << "ys["<=0?1:0); param.solver_type = solver; param.C = cost; param.eps = epsilon; param.nr_weight = 0; param.weight_label = NULL; param.weight = NULL; if(param.eps == HUGE_VAL) { if(param.solver_type == L2R_LR || param.solver_type == L2R_L2LOSS_SVC) param.eps = 0.01; else if(param.solver_type == L2R_L2LOSS_SVC_DUAL || param.solver_type == L2R_L1LOSS_SVC_DUAL || 
param.solver_type == MCSVM_CS || param.solver_type == L2R_LR_DUAL) param.eps = 0.1; else if(param.solver_type == L1R_L2LOSS_SVC || param.solver_type == L1R_LR) param.eps = 0.01; } model* mod_ = train(&prob, ¶m); // free the problem for(int i = 0; i < prob.l; i++) free(myXs[i]); free(myXs); int i, j; // create the labels labels_.resize(mod_->nr_class); for(int i = 0; i < mod_->nr_class; i++) labels_[i] = mod_->label[i]; numW_ = (labels_.size()==2 && solver_ != MCSVM_CS?1:labels_.size()); // find the multiplier #if DISABLE_QUANTIZE multiplier_ = 1; #else const unsigned wSize = numW_*names_.size(); multiplier_ = 0; double val; for(unsigned i = 0; i < wSize; i++) { val = abs(mod_->w[i]); if(val > multiplier_) multiplier_ = val; } multiplier_ /= SHORT_MAX; #endif // trim values oldNames_ = names_; names_.clear(); ids_.clear(); KyteaString empty; mapFeat(empty); weights_.clear(); for(i=0; i<(int)oldNames_.size()-1; i++) { double myMax = 0.0; for(j=0; jw[i*numW_+j]),myMax); if(myMax>SIG_CUTOFF) { mapFeat(oldNames_[i+1]); // If the number of weights is two, push the difference if(numW_ == 2) { weights_.push_back((FeatVal) ((mod_->w[i*numW_]-mod_->w[i*numW_+1])/multiplier_)); // Otherwise, keep the number of weights as-is, and push all } else { for(j = 0; j < numW_; j++) weights_.push_back((FeatVal)(mod_->w[i*numW_+j]/multiplier_)); } } } if(bias_>=0) { // If the number of weights is two, push the difference if(numW_ == 2) { weights_.push_back((FeatVal) ((mod_->w[i*numW_]-mod_->w[i*numW_+1])/multiplier_)); // Otherwise push all } else { for(j = 0; j < numW_; j++) weights_.push_back((FeatVal)(mod_->w[i*numW_+j]/multiplier_)); } } // If the number of weights was two, we've converted to one if(numW_ == 2) numW_ = 1; free_and_destroy_model(&mod_); // When we're done with training, no more adding features addFeat_ = false; } void KyteaModel::setNumClasses(unsigned v) { if(v == 1) THROW_ERROR("Trying to set the number of classes to 1"); labels_.resize(v); numW_ = (v==2 && solver_ 
!= MCSVM_CS?1:v); } Dictionary > * KyteaModel::makeDictionaryFromPrefixes(const vector & prefs, StringUtil* util, bool adjustPos) { typedef Dictionary >::WordMap WordMap; WordMap wm; int pos; for(int i = 0; i < (int)names_.size(); i++) { const KyteaString & str = names_[i]; for(pos = 0; pos < (int)prefs.size() && !str.beginsWith(prefs[pos]); pos++); if(pos != (int)prefs.size()) { featuresAdded_++; KyteaString name = str.substr(prefs[pos].length()); WordMap::iterator it = wm.find(name); if(it == wm.end()) { pair p = wm.insert(WordMap::value_type(name,new vector(prefs.size()*numW_))); it = p.first; } // If this is an n-gram dictionary, adjust the position according to // n-gram length, otherwise just use the location of th eprefix int id = (adjustPos ? (prefs.size()-pos-name.length())*numW_ : pos*numW_ ); for(int j = 0; j < numW_; j++) { // cerr << "adding for "<showString(str)<<" @ "<showString(name) << " ["<second).size()<<" == "<second)[id+j] = getWeight(i-1,j) * labels_[0]; } } } if(wm.size() > 0) { Dictionary > * ret = new Dictionary >(util); ret->buildIndex(wm); return ret; } return NULL; } void KyteaModel::buildFeatureLookup(StringUtil * util, int charw, int typew, int numDicts, int maxLen) { if(featLookup_) { delete featLookup_; featLookup_ = 0; } // Do not build the feature lookup if there are no features to use if(names_.size() == 0 || getNumClasses() < 2) return; featLookup_ = new FeatureLookup; featuresAdded_ = 0; // Make the character values vector charPref, typePref, selfPref, dictPref; for(int i = 1-charw; i <= charw; i++) { ostringstream oss; oss << "X" << i; charPref.push_back(util->mapString(oss.str())); } featLookup_->setCharDict(makeDictionaryFromPrefixes(charPref, util, true)); // Make the type values for(int i = 1-typew; i <= typew; i++) { ostringstream oss; oss << "T" << i; typePref.push_back(util->mapString(oss.str())); } featLookup_->setTypeDict(makeDictionaryFromPrefixes(typePref, util, true)); // Make the self prefixes 
selfPref.push_back(util->mapString("SX")); selfPref.push_back(util->mapString("ST")); featLookup_->setSelfDict(makeDictionaryFromPrefixes(selfPref, util, false)); // Get the bias feature int bias = getBiasId(); if(bias != -1) { featuresAdded_++; for(int j = 0; j < numW_; j++) featLookup_->setBias(getWeight(bias-1, j) * labels_[0], j); } bool prevAddFeat = addFeat_; addFeat_ = false; // Make the dictionary values if(numDicts*maxLen > 0) { vector * dictFeats = new vector(numDicts*maxLen*3,0); int id = 0; for(int i = 0; i < numDicts; i++) { for(int j = 1; j <= maxLen; j++) { ostringstream oss1; oss1 << "D" << i << "R" << j; unsigned id1 = mapFeat(util->mapString(oss1.str())); if(id1 != 0) { (*dictFeats)[id] = getWeight(id1-1, 0) * labels_[0]; featuresAdded_++; } id++; ostringstream oss2; oss2 << "D" << i << "I" << j; unsigned id2 = mapFeat(util->mapString(oss2.str())); if(id2 != 0) { featuresAdded_++; (*dictFeats)[id] = getWeight(id2-1, 0) * labels_[0]; } id++; ostringstream oss3; oss3 << "D" << i << "L" << j; unsigned id3 = mapFeat(util->mapString(oss3.str())); if(id3 != 0) { featuresAdded_++; (*dictFeats)[id] = getWeight(id3-1, 0) * labels_[0]; } id++; } } featLookup_->setDictVector(dictFeats); } if(numDicts > 0) { // Make the tag dictionary values vector * tagDictFeats = new vector(numDicts*labels_.size()*labels_.size(),0); int id = 0; for(int i = 0; i <= numDicts; i++) { for(int j = 0; j < (int)labels_.size(); j++) { ostringstream oss1; oss1 << "D" << i << "T" << j; unsigned id1 = mapFeat(util->mapString(oss1.str())); if(id1 != 0) { for(int k = 0; k < (int)labels_.size(); k++) (*tagDictFeats)[id+k] = getWeight(id1-1, k) * labels_[0]; featuresAdded_++; } id += labels_.size(); } } featLookup_->setTagDictVector(tagDictFeats); } // Make the unknown vector unsigned id1 = mapFeat(util->mapString("UNK")); if(id1 != 0) { vector * tagUnkFeats = new vector(labels_.size(),0); featuresAdded_++; for(int k = 0; k < (int)labels_.size(); k++) (*tagUnkFeats)[k] = getWeight(id1-1, 
k) * labels_[0]; featLookup_->setTagUnkVector(tagUnkFeats); } addFeat_ = prevAddFeat; if(featuresAdded_ != (int)names_.size()) THROW_ERROR("Did not add all the features to the feature lookup ("< 0.01) THROW_ERROR("multipliers don't match: "< #include using namespace kytea; using namespace std; // Constructors for KyteaStringImpl KyteaStringImpl::KyteaStringImpl(unsigned length) : length_(length), count_(1) { chars_ = new KyteaChar[length]; } KyteaStringImpl::KyteaStringImpl(const KyteaStringImpl & impl) : length_(impl.length_), count_(1) { chars_ = new KyteaChar[length_]; memcpy(chars_, impl.chars_, sizeof(KyteaChar)*length_); } // tokenize the string using the characters in the delimiter string KyteaString::Tokens KyteaString::tokenize(const KyteaString & delim, bool includeDelim) const { unsigned i,j,s=0; const unsigned l=length(),dl=delim.length(); vector ret; for(i = 0; i < l; i++) { for(j = 0; j < dl && delim[j] != impl_->chars_[i]; j++); if(j != dl) { if(s != i) ret.push_back(substr(s,i-s)); if(includeDelim) ret.push_back(substr(i,1)); s = i+1; } } if(s != i) ret.push_back(substr(s,i-s)); return ret; } // splice a string into the appropriate location void KyteaString::splice(const KyteaString& str, unsigned pos) { const unsigned l = str.length(); if(!l) return; #ifdef KYTEA_SAFE if(pos+l > length()) throw runtime_error("KyteaString splice index out of bounds"); #endif memcpy(impl_->chars_+pos, str.getImpl()->chars_, sizeof(KyteaChar)*l); } KyteaString KyteaString::substr(unsigned s) const { const unsigned l = length()-s; #ifdef KYTEA_SAFE if(s+l > length()) throw runtime_error("KyteaString substr index out of bounds"); #endif KyteaString ret(l); memcpy(ret.getImpl()->chars_, impl_->chars_+s, sizeof(KyteaChar)*l); return ret; } KyteaString KyteaString::substr(unsigned s, unsigned l) const { #ifdef KYTEA_SAFE if(s+l > length()) throw runtime_error("substr out of bounds"); #endif KyteaString ret(l); memcpy(ret.getImpl()->chars_, impl_->chars_+s, 
sizeof(KyteaChar)*l); return ret; } size_t KyteaString::getHash() const { size_t hash = 5381; if(impl_==0) return hash; const unsigned l = impl_->length_; const KyteaChar* cs = impl_->chars_; for(unsigned i = 0; i < l; i++) hash = ((hash << 5) + hash) + cs[i]; /* hash * 33 + x[i] */ return hash; } KyteaStringImpl * KyteaString::getImpl() { if(impl_->count_ != 1) { impl_->dec(); impl_ = new KyteaStringImpl(*impl_); } return impl_; } bool KyteaString::beginsWith(const KyteaString & s) const { if(s.length() > this->length()) return 0; for(int i = s.length()-1; i >= 0; i--) { if((*this)[i] != s[i]) return 0; } return 1; } kytea_0.4.6+dfsg.orig/src/lib/kytea-util.cpp0000644000175000017500000000671412122355536020241 0ustar koichikoichi#include #include #include #include #include #include using namespace std; using namespace kytea; namespace kytea { template void checkPointerEqual(const T* lhs, const T* rhs) { if(lhs == NULL) { if(rhs != NULL) THROW_ERROR("lhs == NULL, rhs != NULL"); } else { if(rhs == NULL) THROW_ERROR("lhs != NULL, rhs == NULL"); lhs->checkEqual(*rhs); } } template void checkPointerEqual(const StringUtil* lhs, const StringUtil* rhs); template void checkPointerEqual(const KyteaModel* lhs, const KyteaModel* rhs); template void checkPointerEqual(const FeatureLookup* lhs, const FeatureLookup* rhs); template void checkPointerEqual(const Dictionary* lhs, const Dictionary* rhs); template void checkPointerEqual(const Dictionary* lhs, const Dictionary* rhs); template void checkPointerEqual(const Dictionary* lhs, const Dictionary* rhs); // Vector equality checking function template void checkValueVecEqual(const std::vector & a, const std::vector & b) { if(a.size() != b.size()) THROW_ERROR("Vector sizes don't match: "< void checkValueVecEqual(const std::vector * a, const std::vector * b) { if((a == NULL || a->size() == 0) != (b == NULL || b->size() == 0)) { THROW_ERROR("only one dictVector_ is NULL"); } else if(a != NULL) { checkValueVecEqual(*a, *b); } } template 
void checkValueVecEqual(const std::vector * a, const std::vector * b); template void checkValueVecEqual(const std::vector * a, const std::vector * b); template void checkValueVecEqual(const std::vector > * a, const std::vector > * b); template void checkValueVecEqual(const std::vector * a, const std::vector * b); template void checkValueVecEqual(const std::vector * a, const std::vector * b); // Vector equality checking function template void checkPointerVecEqual(const std::vector & a, const std::vector & b) { if(a.size() > b.size()) { for(int i = b.size(); i < (int)a.size(); i++) if(a[i] != 0) THROW_ERROR("Vector sizes don't match: "< #include #include #include #include #include #include #include #include using namespace kytea; using namespace std; // set the type of the input corpus void KyteaConfig::setIOFormat(const char* str, CorpForm & cf) { if(!strcmp(str, "full")) { cf = CORP_FORMAT_FULL; } else if(!strcmp(str, "tags")) { cf = CORP_FORMAT_TAGS; } else if(!strcmp(str, "tok")) { cf = CORP_FORMAT_TOK; } else if(!strcmp(str, "part")) { cf = CORP_FORMAT_PART; } else if(!strcmp(str, "conf")) { cf = CORP_FORMAT_PROB; } else if(!strcmp(str, "prob")) { cf = CORP_FORMAT_PROB; } else if(!strcmp(str, "eda")) { cf = CORP_FORMAT_EDA; } else if(!strcmp(str, "raw")) { cf = CORP_FORMAT_RAW; } else THROW_ERROR("Unsupported corpus IO format '" << str << "'"); } void KyteaConfig::parseTrainCommandLine(int argc, const char ** argv) { for(int i = 1; i < argc; i++) i += parseTrainArg(argv[i], (i == argc-1?NULL:argv[i+1])); } void KyteaConfig::parseRunCommandLine(int argc, const char ** argv) { for(int i = 1; i < argc; i++) i += parseRunArg(argv[i], (i == argc-1?NULL:argv[i+1])); } const string & KyteaConfig::getModelFile() { // load the model file if it has not been specified at the command line if(model_.length() == 0) { if(getenv("KYTEA_MODEL")) model_ = getenv("KYTEA_MODEL"); else { model_ = PKGDATADIR; model_ += "/model.bin"; } } return model_; } void KyteaConfig::printUsage() 
{ if(onTraining_) { // print the training usage cerr << "train-kytea:" << endl << " A program to train models for KyTea" << endl << "" << endl << "Input/Output Options: " << endl << " -encode The text encoding to be used (utf8/euc/sjis; default: utf8)" << endl << " -full A fully annotated training corpus (multiple possible)" << endl << " -tok A training corpus that is tokenized with no tags (multiple possible)" << endl << " -part A partially annotated training corpus (multiple possible)" << endl << " -conf A confidence annotated training corpus (multiple possible)" << endl << " -feat A file containing features generated by -featout" << endl << " -dict A dictionary file (one 'word/pron' entry per line, multiple possible)" << endl << " -subword A file of subword units. This will enable unknown word PE." << endl << " -model The file to write the trained model to" << endl << " -modtext Print a text model (instead of the default binary)" << endl << " -featout Write the features used in training the model to this file" << endl << "Model Training Options (basic)" << endl << " -nows Don't train a word segmentation model" << endl << " -notags Skip the training of tagging, do only word segmentation" << endl << " -global Train the nth tag with a global model (good for POS, bad for PE)" << endl << " -debug The debugging level during training (0=silent, 1=normal, 2=detailed)" << endl << "Model Training Options (for advanced users): " << endl << " -charw The character window to use for WS (3)" << endl << " -charn The character n-gram length to use for WS for WS (3)" << endl << " -typew The character type window to use for WS (3)" << endl << " -typen The character type n-gram length to use for WS for WS (3)" << endl << " -dictn Dictionary words greater than -dictn will be grouped together (4)" << endl << " -unkn Language model n-gram order for unknown words (3)" << endl << " -eps The epsilon stopping criterion for classifier training" << endl << " -cost The cost hyperparameter 
for classifier training" << endl << " -nobias Don't use a bias value in classifier training" << endl << " -solver The solver (1=SVM, 7=logistic regression, etc.; default 1,"<parseInt(v)); } // input options for training else if(!strcmp(n, "-full")) { ch(n,v); addCorpus(v, CORP_FORMAT_FULL); } else if(!strcmp(n, "-tok")) { ch(n,v); addCorpus(v, CORP_FORMAT_TOK); } else if(!strcmp(n, "-part")) { ch(n,v); addCorpus(v, CORP_FORMAT_PART); } else if(!strcmp(n, "-conf")) { ch(n,v); addCorpus(v, CORP_FORMAT_PROB); } else if(!strcmp(n, "-dict")) { ch(n,v); addDictionary(v); } else if(!strcmp(n, "-subword")) { ch(n,v); addSubwordDict(v); } else if(!strcmp(n, "-global")) { ch(n,v); setGlobal(util_->parseInt(v)-1); } // output option for training else if(!strcmp(n, "-model")) { ch(n,v); setModelFile(v); } else if(!strcmp(n, "-modtext")) { setModelFormat('T'); r=0; } else if(!strcmp(n, "-featout")) { ch(n,v); setFeatureOut(v); } else if(!strcmp(n, "-feat")) { ch(n,v); setFeatureIn(v); } else if(!strcmp(n, "-numtags")) { ch(n,v); setNumTags(util_->parseInt(v)); } // liblinear options else if(!strcmp(n, "-eps")) { ch(n,v); setEpsilon(util_->parseFloat(v)); } else if(!strcmp(n, "-cost")) { ch(n,v); setCost(util_->parseFloat(v)); } else if(!strcmp(n, "-solver")) { ch(n,v); setSolverType(util_->parseInt(v)); } // feature options else if(!strcmp(n, "-charw")) { ch(n,v); setCharWindow(util_->parseInt(v)); } else if(!strcmp(n, "-charn")) { ch(n,v); setCharN(util_->parseInt(v)); } else if(!strcmp(n, "-typew")) { ch(n,v); setTypeWindow(util_->parseInt(v)); } else if(!strcmp(n, "-typen")) { ch(n,v); setTypeN(util_->parseInt(v)); } else if(!strcmp(n, "-dictn")) { ch(n,v); setDictionaryN(util_->parseInt(v)); } else if(!strcmp(n, "-unkn")) { ch(n,v); setUnkN(util_->parseInt(v)); } // formatting options else if(!strcmp(n, "-wordbound")) { ch(n,v); setWordBound(v); } else if(!strcmp(n, "-tagbound")) { ch(n,v); setTagBound(v); } else if(!strcmp(n, "-elembound")) { ch(n,v); setElemBound(v); } 
else if(!strcmp(n, "-unkbound")) { ch(n,v); setUnkBound(v); } else if(!strcmp(n, "-nobound")) { ch(n,v); setNoBound(v); } else if(!strcmp(n, "-hasbound")) { ch(n,v); setHasBound(v); } else if(!strcmp(n, "-skipbound")) { ch(n,v); setSkipBound(v); } // whether or not to perform word segmentation, pronunciation estimation else if(!strcmp(n, "-nows")) { setDoWS(false); r=0; } else if(!strcmp(n, "-notags")) { setDoTags(false); r=0; } else if(!strcmp(n, "-nobias")) { setBias(false); r=0; } // --- DEPRECATED --- // do not use these undocumented options, as they may disappear in the future else if(!strcmp(n, "-prob")) { ch(n,v); addCorpus(v, CORP_FORMAT_PROB); } else if(!strcmp(n, "-dicn")) { ch(n,v); setDictionaryN(util_->parseInt(v)); } else if(n[0] == '-') { cerr << "Invalid argument '" << n << "'" << endl << endl; printUsage(); } else { r=0; args_.push_back(n); } return r; } unsigned KyteaConfig::parseRunArg(const char * n, const char * v) { unsigned r=1; if(!strcmp(n, "--help") || !strcmp(n,"-help")) { printUsage(); } else if(!strcmp(n, "--version") || !strcmp(n,"-version")) { printVersion(); } // general input/output option else if(!strcmp(n, "-in")) { ch(n,v); setIOFormat(v, inputForm_); } else if(!strcmp(n, "-out")) { ch(n,v); setIOFormat(v, outputForm_); } // output option for training else if(!strcmp(n, "-model")) { ch(n,v); setModelFile(v); } // whether or not to perform word segmentation, pronunciation estimation else if(!strcmp(n, "-nows")) { setDoWS(false); r=0; } else if(!strcmp(n, "-wsconst")) { ch(n,v); setWsConstraint(v); } else if(!strcmp(n, "-notags")) { setDoTags(false); r=0; } else if(!strcmp(n, "-notag")) { ch(n,v); if(util_->parseInt(v) < 1) THROW_ERROR("Illegal setting "<parseInt(v)-1,false); } else if(!strcmp(n, "-nounk")) { setDoUnk(false); r=0; } else if(!strcmp(n, "-numtags")) { ch(n,v); setNumTags(util_->parseInt(v)); } else if(!strcmp(n, "-tagmax")) { ch(n,v); setTagMax(util_->parseInt(v)); } // the limit on the number of unknown words to 
output else if(!strcmp(n, "-unktag")) { ch(n,v); setUnkTag(v); } else if(!strcmp(n, "-deftag")) { ch(n,v); setDefaultTag(v); } else if(!strcmp(n, "-unkbeam")) { ch(n,v); setUnkBeam(util_->parseInt(v)); } else if(!strcmp(n, "-debug")) { ch(n,v); setDebug(util_->parseInt(v)); } // formatting options else if(!strcmp(n, "-wordbound")) { ch(n,v); setWordBound(v); } else if(!strcmp(n, "-tagbound")) { ch(n,v); setTagBound(v); } else if(!strcmp(n, "-elembound")) { ch(n,v); setElemBound(v); } else if(!strcmp(n, "-unkbound")) { ch(n,v); setUnkBound(v); } else if(!strcmp(n, "-nobound")) { ch(n,v); setNoBound(v); } else if(!strcmp(n, "-hasbound")) { ch(n,v); setHasBound(v); } else if(!strcmp(n, "-skipbound")) { ch(n,v); setSkipBound(v); } else if(n[0] == '-') { cerr << "Invalid argument '" << n << "'" << endl << endl; printUsage(); } else { r=0; args_.push_back(n); } return r; } // set the encoding of the StringUtil class and reset all the IOs void KyteaConfig::setEncoding(const char* str) { if(util_) delete util_; if(!strcmp(str,"utf8")) util_ = new StringUtilUtf8(); else if(!strcmp(str,"euc")) util_ = new StringUtilEuc(); else if(!strcmp(str,"sjis")) util_ = new StringUtilSjis(); else THROW_ERROR("Unsupported encoding format '" << str << "'"); } KyteaConfig::KyteaConfig() : onTraining_(true), debug_(0), util_(0), dicts_(), modelForm_('B'), inputForm_(CORP_FORMAT_DEFAULT), outputForm_(CORP_FORMAT_FULL), featStr_(0), doWS_(true), doTags_(true), doUnk_(true), addFeat_(false), confidence_(0.0), charW_(3), charN_(3), typeW_(3), typeN_(3), dictN_(4), unkN_(3), unkBeam_(50), defTag_("UNK"), unkTag_(), bias_(1.0f), eps_(HUGE_VAL), cost_(1.0), solverType_(1/*SVM*/), wordBound_(" "), tagBound_("/"), elemBound_("&"), unkBound_(" "), noBound_("-"), hasBound_("|"), skipBound_("?"), escape_("\\"), wsConstraint_(""), numTags_(0), tagMax_(3) { setEncoding("utf8"); } KyteaConfig::KyteaConfig(const KyteaConfig & rhs) : onTraining_(rhs.onTraining_), debug_(rhs.debug_), util_(rhs.util_), 
dicts_(rhs.dicts_), modelForm_(rhs.modelForm_), inputForm_(rhs.inputForm_), outputForm_(rhs.outputForm_), featStr_(rhs.featStr_), doWS_(rhs.doWS_), doTags_(rhs.doTags_), doUnk_(rhs.doUnk_), addFeat_(rhs.addFeat_), confidence_(rhs.confidence_), charW_(rhs.charW_), charN_(rhs.charN_), typeW_(rhs.typeW_), typeN_(rhs.typeN_), dictN_(rhs.dictN_), unkN_(rhs.unkN_), unkBeam_(rhs.unkBeam_), defTag_(rhs.defTag_), unkTag_(rhs.unkTag_), bias_(rhs.bias_), eps_(rhs.eps_), cost_(rhs.cost_), solverType_(rhs.solverType_), wordBound_(rhs.wordBound_), tagBound_(rhs.tagBound_), elemBound_(rhs.elemBound_), unkBound_(rhs.unkBound_), noBound_(rhs.noBound_), hasBound_(rhs.hasBound_), skipBound_(rhs.skipBound_), escape_(rhs.escape_), numTags_(rhs.numTags_), tagMax_(rhs.tagMax_) { } KyteaConfig::~KyteaConfig() { if(util_) delete util_; } void KyteaConfig::addCorpus(const std::string & corp, CorpForm format) { corpora_.push_back(corp); corpusFormats_.push_back(format); } void KyteaConfig::addDictionary(const std::string & corp) { dicts_.push_back(corp); } void KyteaConfig::addSubwordDict(const std::string & corp) { subwordDicts_.push_back(corp); } const char KyteaConfig::getEncoding() const { return util_->getEncoding(); } const char* KyteaConfig::getEncodingString() const { return util_->getEncodingString(); } std::ostream * KyteaConfig::getFeatureOutStream() { if(featOut_.length() && !featStr_) featStr_ = new std::ofstream(featOut_.c_str()); return featStr_; } void KyteaConfig::closeFeatureOutStream() { if(featStr_) { delete featStr_; featStr_ = 0; } } kytea_0.4.6+dfsg.orig/src/lib/corpus-io-prob.cpp0000644000175000017500000000524612122355536021030 0ustar koichikoichi#include #include #include #include #include #include #include #define PROB_TRUE 100.0 #define PROB_FALSE -100.0 #define PROB_UNKNOWN 0.0 using namespace kytea; using namespace std; KyteaSentence * ProbCorpusIO::readSentence() { #ifdef KYTEA_SAFE if(out_ || !str_) THROW_ERROR("Attempted to read a sentence from an closed or 
output object"); #endif KyteaSentence* ret = FullCorpusIO::readSentence(); if(ret == 0) return 0; // get the ws confidences string s; getline(*str_, s); istringstream wsiss(s); KyteaSentence::Floats::iterator wsit = ret->wsConfs.begin(); while((wsiss >> s) && (wsit != ret->wsConfs.end())) { *wsit = util_->parseFloat(s.c_str()); wsit++; } if(wsiss.good() || wsit != ret->wsConfs.end()) { THROW_ERROR("Bad number of WS confidences in a probability file"); } // get the pe confidences for(int i = 0; i < getNumTags(); i++) { getline(*str_, s); istringstream peiss(s); KyteaSentence::Words::iterator peit = ret->words.begin(); while((peiss >> s) && (peit != ret->words.end())) { if(peit->getTag(i)) peit->setTagConf(i,util_->parseFloat(s.c_str())); peit++; } if(peiss.good() || peit != ret->words.end()) { THROW_ERROR("Bad number of PE confidences in a probability file"); } } // get the separator line getline(*str_, s); if(s.length()) THROW_ERROR("Badly formatted probability file (no white-space between sentences)"); return ret; } void ProbCorpusIO::writeSentence(const KyteaSentence * sent, double conf) { FullCorpusIO::writeSentence(sent, conf); const string & space = util_->showChar(bounds_[0]), & = util_->showChar(bounds_[2]); for(unsigned i = 0; i < sent->wsConfs.size(); i++) { if(i != 0) *str_ << space; *str_ << abs(sent->wsConfs[i]); } *str_ << endl; for(int k = 0; k < getNumTags(); k++) { if(getDoTag(k)) { for(unsigned i = 0; i < sent->words.size(); i++) { if(i != 0) *str_ << space; const vector< KyteaTag > & tags = sent->words[i].getTags(k); if(tags.size() > 0) { *str_ << tags[0].second; if(allTags_) for(unsigned j = 1; j < tags.size(); j++) *str_ << amp << tags[j].second; } else *str_ << 0; } *str_ << endl; } } *str_ << endl; } kytea_0.4.6+dfsg.orig/src/lib/feature-lookup.cpp0000644000175000017500000001515712122355536021114 0ustar koichikoichi #include #include #include #include using namespace kytea; using namespace std; FeatureLookup::~FeatureLookup() { if(charDict_) 
delete charDict_; if(typeDict_) delete typeDict_; if(selfDict_) delete selfDict_; if(dictVector_) delete dictVector_; if(biases_) delete biases_; if(tagDictVector_) delete tagDictVector_; if(tagUnkVector_) delete tagUnkVector_; } void FeatureLookup::addNgramScores(const Dictionary * dict, const KyteaString & str, int window, vector & score) { if(!dict) return; Dictionary::MatchResult res = dict->match(str); // For every entry for(int i = 0; i < (int)res.size(); i++) { // Let's say we have a n-gram that matched at position 2 // The first boundary that can be affected is 2-window const int base_pos = res[i].first - window; const int start = max(0, -base_pos); const int end = min(window*2,(int)score.size()-base_pos); const FeatVec & vec = *res[i].second; for(int j = start; j < end; j++) { // cerr << "adding score[" << base_pos+j << "] += vec["< * dict, vector & scores, int window, int startChar, int endChar) { if(!dict) return; // Create a substring that exactly covers the window that we are interested // in of up to -window characters before, and +window characters after int myStart = max(startChar-window,0); int myEnd = min(endChar+window,(int)chars.length()); // cerr << "startChar=="<::MatchResult res = dict->match(str); // Add up the sum of all the features // myStart-startChar is how far to the left of the starting character we are int offset = window-(startChar-myStart); for(int i = 0; i < (int)res.size(); i++) { // The position we are interested in is the matched position plus the // offset int pos = res[i].first + offset; // Reverse this and multiply by the number of candidates pos = (window*2 - pos - 1) * scores.size(); FeatVal* vec = &((*res[i].second)[pos]); // Now add up all the values in the feature vector for(int j = 0; j < (int)scores.size(); j++) { #ifdef KYTEA_SAFE if(j+pos >= (int)res[i].second->size() || j+pos < 0) THROW_ERROR("j+pos "<size() "<size()<<", window="< & scores, int featIdx) { #ifdef KYTEA_SAFE if(selfDict_ == NULL) THROW_ERROR("Trying 
to add self weights when no self is present"); #endif FeatVec * entry = selfDict_->findEntry(word); if(entry) { int base = featIdx * scores.size(); for(int i = 0; i < (int)scores.size(); i++) scores[i] += (*entry)[base+i]; } } void FeatureLookup::addDictionaryScores(const Dictionary::MatchResult & matches, int numDicts, int max, vector & score) { if(dictVector_ == NULL || dictVector_->size() == 0 || matches.size() == 0) return; const int len = score.size(), dictLen = len*3*max; vector on(numDicts*dictLen, 0); int end; ModelTagEntry* myEntry; for(int i = 0; i < (int)matches.size(); i++) { end = matches[i].first; myEntry = matches[i].second; if(myEntry->inDict == 0) continue; const int wlen = myEntry->word.length(); const int lablen = min(wlen,max)-1; for(int di = 0; ((1 << di) & ~1) <= myEntry->inDict; di++) { if(myEntry->isInDict(di)) { const int dictOffset = di*dictLen; // left value (position end-wlen, type if(end >= wlen) on[dictOffset + (end-wlen)*3*max +lablen*3 /*+ 0*/] = 1; // middle values for(int k = end-wlen+1; k < end; k++) on[dictOffset + k*3*max +lablen*3 + 1 ] = 1; // right value if(end != len) on[dictOffset + end*3*max +lablen*3 + 2 ] = 1; } } } for(int i = 0; i < len; i++) { FeatSum & val = score[i]; for(int di = 0; di < numDicts; di++) { char* myOn = &on[di*dictLen + i*3*max]; FeatVal* myScore = &(*dictVector_)[3*max*di]; for(int j = 0; j < 3*max; j++) { // cerr << "i="< #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include using namespace kytea; using namespace std; /////////////////////////////////// // Dictionary building functions // /////////////////////////////////// template void Kytea::addTag(typename Dictionary::WordMap& allWords, const KyteaString & word, int lev, const KyteaString * tag, int dict) { typedef typename Dictionary::WordMap WordMap; typename WordMap::iterator it = allWords.find(word); if(it == allWords.end()) { Entry * ent = new Entry(word); 
ent->setNumTags(lev+1); if(tag) { ent->tags[lev].push_back(*tag); ent->tagInDicts[lev].push_back(0); } if(dict >= 0) { Entry::setInDict(ent->inDict,dict); if(tag) Entry::setInDict(ent->tagInDicts[lev][0],dict); } allWords.insert(typename WordMap::value_type(word,ent)); } else { if(tag) { unsigned i; if((int)it->second->tags.size() <= lev) it->second->setNumTags(lev+1); vector & tags = it->second->tags[lev]; vector & tagInDicts = it->second->tagInDicts[lev]; for(i = 0; i < tags.size() && tags[i] != *tag; i++); if(i == tags.size()) { tags.push_back(*tag); tagInDicts.push_back(0); } if(dict >= 0) Entry::setInDict(tagInDicts[i],dict); } if(dict >= 0) Entry::setInDict(it->second->inDict,dict); } } template void Kytea::addTag(typename Dictionary::WordMap& allWords, const KyteaString & word, const KyteaTag * tag, int dict) { addTag(allWords,word,(tag?&tag->first:0),dict); } template void Kytea::scanDictionaries(const vector & dict, typename Dictionary::WordMap & wordMap, KyteaConfig * config, StringUtil * util, bool saveIds) { // scan the dictionaries KyteaString word; unsigned char numDicts = 0; for(vector::const_iterator it = dict.begin(); it != dict.end(); it++) { if(config_->getDebug()) cerr << "Reading dictionary from " << *it << " "; CorpusIO * io = CorpusIO::createIO(it->c_str(), CORP_FORMAT_FULL, *config, false, util); io->setNumTags(config_->getNumTags()); KyteaSentence* next; int lines = 0; while((next = io->readSentence())) { lines++; if(next->words.size() != 1) { ostringstream buff; buff << "Badly formatted dictionary entry (too many or too few words '"; for(unsigned i = 0; i < next->words.size(); i++) { if(i != 0) buff << " --- "; buff << util->showString(next->words[i].surface); } buff << "')"; THROW_ERROR(buff.str()); } word = next->words[0].norm; for(int i = 0; i < next->words[0].getNumTags(); i++) if(next->words[0].hasTag(i)) addTag(wordMap, word, i, &next->words[0].getTagSurf(i), (saveIds?numDicts:-1)); if(next->words[0].getNumTags() == 0) 
addTag(wordMap, word, 0, 0, (saveIds?numDicts:-1)); delete next; } delete io; numDicts++; if(config_->getDebug() > 0) { if(lines) cerr << " done (" << lines << " entries)" << endl; else cerr << " WARNING - empty training data specified." << endl; } } } void Kytea::buildVocabulary() { Dictionary::WordMap & allWords = fio_->getWordMap(); if(config_->getDebug() > 0) cerr << "Scanning dictionaries and corpora for vocabulary" << endl; // scan the corpora vector corpora = config_->getCorpusFiles(); vector corpForm = config_->getCorpusFormats(); int maxTag = config_->getNumTags(); for(unsigned i = 0; i < corpora.size(); i++) { if(config_->getDebug() > 0) cerr << "Reading corpus from " << corpora[i] << " "; CorpusIO * io = CorpusIO::createIO(corpora[i].c_str(), corpForm[i], *config_, false, util_); io->setNumTags(config_->getNumTags()); KyteaSentence* next; int lines = 0; while((next = io->readSentence())) { lines++; bool toAdd = false; for(unsigned i = 0; i < next->words.size(); i++) { if(next->words[i].isCertain) { maxTag = max(next->words[i].getNumTags(),maxTag); for(int j = 0; j < next->words[i].getNumTags(); j++) if(next->words[i].hasTag(j)) addTag(allWords, next->words[i].norm, j, &next->words[i].getTagSurf(j), -1); if(next->words[i].getNumTags() == 0) addTag(allWords, next->words[i].norm, 0, 0, -1); toAdd = true; } } const unsigned wsSize = next->wsConfs.size(); for(unsigned i = 0; !toAdd && i < wsSize; i++) toAdd = (next->wsConfs[i] != 0); if(toAdd) sentences_.push_back(next); else delete next; } if(config_->getDebug() > 0) { if(lines) cerr << " done (" << lines << " lines)" << endl; else cerr << " WARNING - empty training data specified." << endl; } delete io; } config_->setNumTags(maxTag); // scan the dictionaries scanDictionaries(config_->getDictionaryFiles(), allWords, config_, util_, true); if(sentences_.size() == 0 && fio_->getFeatures().size() == 0) THROW_ERROR("There were no sentences in the training data. 
Check to make sure your training file contains sentences."); if(config_->getDebug() > 0) cerr << "Building dictionary index "; if(allWords.size() == 0) THROW_ERROR("FATAL: There were sentences in the training data, but no words were found!"); if(dict_ != 0) delete dict_; dict_ = new Dictionary(util_); dict_->buildIndex(allWords); dict_->setNumDicts(max((int)config_->getDictionaryFiles().size(),fio_->getNumDicts())); if(config_->getDebug() > 0) cerr << "done!" << endl; } ///////////////////////////////// // Word segmentation functions // ///////////////////////////////// unsigned Kytea::wsDictionaryFeatures(const KyteaString & chars, SentenceFeatures & features) { // vector & entries = dict_->getEntries(); // vector & states = dict_->getStates(); // unsigned currState = 0, nextState; ModelTagEntry* myEntry; const unsigned len = features.size(), max=config_->getDictionaryN(), dictLen = len*3*max; vector on(dict_->getNumDicts()*dictLen, 0); unsigned ret = 0, end; Dictionary::MatchResult matches = dict_->match(chars); for(unsigned i = 0; i < matches.size(); i++) { end = matches[i].first; myEntry = matches[i].second; if(myEntry->inDict == 0) continue; const unsigned wlen = myEntry->word.length(); const unsigned lablen = min(wlen,max)-1; for(unsigned di = 0; ((1 << di) & ~1) <= myEntry->inDict; di++) { if(myEntry->isInDict(di)) { const unsigned dictOffset = di*dictLen; // left value (position end-wlen, type if(end >= wlen) on[dictOffset + (end-wlen)*3*max +/*0*max*/+ lablen] = 1; // right value if(end != len) on[dictOffset + end*3*max + 2*max + lablen] = 1; // middle values for(unsigned k = end-wlen+1; k < end; k++) on[dictOffset + k*3*max + 1*max + lablen] = 1; } } } for(unsigned i = 0; i < len; i++) { for(unsigned di = 0; di < dict_->getNumDicts(); di++) { char* myOn = &on[di*dictLen + i*3*max]; for(unsigned j = 0; j < 3*max; j++) { unsigned featId = 3*max*di+j; if(myOn[j] && dictFeats_[featId]) { features[i].push_back(dictFeats_[featId]); ret++; } } } } return ret; } 
unsigned Kytea::wsNgramFeatures(const KyteaString & chars, SentenceFeatures & features, const vector & prefixes, int n) { const int featSize = (int)features.size(), charLength = (int)chars.length(), w = (int)prefixes.size()/2; // int rightBound, nextRight; unsigned ret = 0, thisFeat; for(int i = 0; i < featSize; i++) { const int rightBound=min(i+w+1,charLength); vector & myFeats = features[i]; for(int j = i-w+1; j < rightBound; j++) { if(j < 0) continue; KyteaString str = prefixes[j-i+w-1]; const int nextRight = min(j+n, rightBound); for(int k = j; kmapFeat(str); if(thisFeat) { myFeats.push_back(thisFeat); ret++; } } } } return ret; } void Kytea::preparePrefixes() { // prepare dictionary prefixes if(config_->getDoWS() && wsModel_) { const char cs[3] = { 'L', 'I', 'R' }; dictFeats_.resize(0); for(unsigned di = 0; di < dict_->getNumDicts(); di++) { for(unsigned i = 0; i < 3; i++) { for(unsigned j = 0; j < (unsigned)config_->getDictionaryN(); j++) { ostringstream buff; buff << "D" << di << cs[i] << (j+1); dictFeats_.push_back(wsModel_->mapFeat(util_->mapString(buff.str()))); } } } } // create n-gram feature prefixes charPrefixes_.resize(0); for(int i = 1; i <= 2*(int)config_->getCharWindow(); i++) { ostringstream buff; buff << "X" << i-(int)config_->getCharWindow(); charPrefixes_.push_back(util_->mapString(buff.str())); } typePrefixes_.resize(0); for(int i = 1; i <= 2*(int)config_->getTypeWindow(); i++) { ostringstream buff; buff << "T" << i-(int)config_->getTypeWindow(); typePrefixes_.push_back(util_->mapString(buff.str())); } } void Kytea::trainWS() { if(wsModel_) delete wsModel_; TagTriplet * trip = fio_->getFeatures(util_->mapString("WS"),true); if(trip->third) wsModel_ = trip->third; else trip->third = wsModel_ = new KyteaModel(); if(config_->getDebug() > 0) cerr << "Creating word segmentation features "; // create word prefixes vector dictFeats; bool hasDictionary = (dict_->getNumDicts() > 0 && dict_->getStates().size() > 0); preparePrefixes(); // make the 
sentence features one by one unsigned scount = 0; vector< vector > & xs = trip->first; vector & ys = trip->second; for(Sentences::const_iterator it = sentences_.begin(); it != sentences_.end(); it++) { if(++scount % 1000 == 0) cerr << "."; KyteaSentence * sent = *it; SentenceFeatures feats(sent->wsConfs.size()); unsigned fts = 0; if(hasDictionary) fts += wsDictionaryFeatures(sent->norm, feats); fts += wsNgramFeatures(sent->norm, feats, charPrefixes_, config_->getCharN()); string str = util_->getTypeString(sent->norm); fts += wsNgramFeatures(util_->mapString(str), feats, typePrefixes_, config_->getTypeN()); for(unsigned i = 0; i < feats.size(); i++) { if(abs(sent->wsConfs[i]) > config_->getConfidence()) { xs.push_back(feats[i]); ys.push_back(sent->wsConfs[i]>1?1:-1); } } } if(config_->getDebug() > 0) cerr << " done!" << endl << "Building classifier "; // train the model wsModel_->trainModel(xs,ys,config_->getBias(),config_->getSolverType(),config_->getEpsilon(),config_->getCost()); if(config_->getDebug() > 0) cerr << " done!" 
<< endl; fio_->printFeatures(util_->mapString("WS"),util_); } ////////////////////////////// // Tag estimation functions // ////////////////////////////// // chars: the string to use to calculate features // feat: the vector of feature indices // prefixes: prefixes to use for features // model: model to use // n: window to use // sc: index of the first character before the word // ec: index of the first character after the word unsigned Kytea::tagNgramFeatures(const KyteaString & chars, vector & feat, const vector & prefixes, KyteaModel * model, int n, int sc, int ec) { int w = (int)prefixes.size()/2; vector wind(prefixes.size()); for(int i = w-1; i >= 0; i--) wind[w-i-1] = (sc-i<0?0:chars[sc-i]); for(int i = 0; i < w; i++) wind[w+i] = (ec+i>=(int)chars.length()?0:chars[ec+i]); unsigned ret = 0, thisFeat = 0; for(unsigned i = 0; i < wind.size(); i++) { if(wind[i] == 0) continue; KyteaString str = prefixes[i]; for(int k = 0; k < n && i+k < wind.size() && wind[i+k] != 0; k++) { str = str+wind[i+k]; thisFeat = model->mapFeat(str); if(thisFeat) { feat.push_back(thisFeat); ret++; } } } return ret; } unsigned Kytea::tagSelfFeatures(const KyteaString & self, vector & feat, const KyteaString & pref, KyteaModel * model) { unsigned thisFeat = model->mapFeat(pref+self), ret = 0; if(thisFeat) { feat.push_back(thisFeat); ret++; } return ret; } void Kytea::trainGlobalTags(int lev) { if(dict_ == 0) return; if(config_->getDebug() > 0) cerr << "Creating tagging features (tag "<getAddFeatures(); wsModel_->setAddFeatures(false); } preparePrefixes(); if(wsModel_) wsModel_->setAddFeatures(wsAdd); ostringstream oss; oss << "T "<mapString(oss.str()); TagTriplet * trip = fio_->getFeatures(featId,true); globalMods_[lev] = (trip->third?trip->third:new KyteaModel()); trip->third = globalMods_[lev]; KyteaString kssx = util_->mapString("SX"), ksst = util_->mapString("ST"); // build features for(Sentences::const_iterator it = sentences_.begin(); it != sentences_.end(); it++) { int startPos = 0, 
finPos=0; KyteaString charStr = (*it)->norm; KyteaString typeStr = util_->mapString(util_->getTypeString(charStr)); for(unsigned j = 0; j < (*it)->words.size(); j++) { startPos = finPos; KyteaWord & word = (*it)->words[j]; finPos = startPos+word.norm.length(); if(!word.getTag(lev) || word.getTagConf(lev) <= config_->getConfidence()) continue; unsigned myTag; KyteaString tagSurf = word.getTagSurf(lev); for(myTag = 0; myTag < trip->fourth.size() && tagSurf != trip->fourth[myTag]; myTag++); if(myTag == trip->fourth.size()) trip->fourth.push_back(tagSurf); myTag++; vector feat; tagNgramFeatures(charStr, feat, charPrefixes_, trip->third, config_->getCharN(), startPos-1, finPos); tagNgramFeatures(typeStr, feat, typePrefixes_, trip->third, config_->getTypeN(), startPos-1, finPos); tagSelfFeatures(word.norm, feat, kssx, trip->third); tagSelfFeatures(util_->mapString(util_->getTypeString(word.norm)), feat, ksst, trip->third); tagDictFeatures(word.norm, lev, feat, trip->third); trip->first.push_back(feat); trip->second.push_back(myTag); } } if(config_->getDebug() > 0) cerr << "done!" << endl << "Training global tag classifiers "; trip->third->trainModel(trip->first,trip->second,config_->getBias(),config_->getSolverType(),config_->getEpsilon(),config_->getCost()); globalTags_[lev] = trip->fourth; if(config_->getDebug() > 0) cerr << "done with " << globalTags_[lev].size() << " labels and " << trip->third->getNumFeatures() << " features!" 
<< endl; fio_->printFeatures(featId,util_); } template T max(const vector & vec) { T myMax = 0; for(unsigned i = 0; i < vec.size(); i++) if(myMax < vec[i]) myMax = vec[i]; return myMax; } void Kytea::trainLocalTags(int lev) { if(config_->getDebug() > 0) cerr << "Creating tagging features (tag "<mapString(oss.str()); bool wsAdd = false; if(wsModel_) { wsAdd = wsModel_->getAddFeatures(); wsModel_->setAddFeatures(false); } preparePrefixes(); if(wsModel_) wsModel_->setAddFeatures(wsAdd); // find words that need to be modeled vector & entries = dict_->getEntries(); ModelTagEntry* myEntry = 0; for(unsigned i = 0; i < entries.size(); i++) { myEntry = entries[i]; if((int)myEntry->tags.size() > lev && (myEntry->tags[lev].size() > 1 || config_->getWriteFeatures())) { TagTriplet * trip = fio_->getFeatures(featId+myEntry->word,true); if((int)myEntry->tagMods.size() <= lev) myEntry->tagMods.resize(lev+1,0); myEntry->tagMods[lev] = (trip->third ? trip->third : new KyteaModel()); trip->third = myEntry->tagMods[lev]; trip->fourth = myEntry->tags[lev]; } } // build features for(Sentences::const_iterator it = sentences_.begin(); it != sentences_.end(); it++) { int startPos = 0, finPos=0; KyteaString charStr = (*it)->norm; KyteaString typeStr = util_->mapString(util_->getTypeString(charStr)); for(unsigned j = 0; j < (*it)->words.size(); j++) { startPos = finPos; KyteaWord & word = (*it)->words[j]; finPos = startPos+word.norm.length(); if(!word.getTag(lev) || word.getTagConf(lev) <= config_->getConfidence()) continue; TagTriplet * trip = fio_->getFeatures(featId+word.norm,false); if(trip) { unsigned myTag = dict_->getTagID(word.norm,word.getTagSurf(lev),lev); if(myTag != 0) { vector feat; tagNgramFeatures(charStr, feat, charPrefixes_, trip->third, config_->getCharN(), startPos-1, finPos); tagNgramFeatures(typeStr, feat, typePrefixes_, trip->third, config_->getTypeN(), startPos-1, finPos); trip->first.push_back(feat); trip->second.push_back(myTag); } } } } if(config_->getDebug() > 0) 
cerr << "done!" << endl << "Training local tag classifiers "; // calculate classifiers for(unsigned i = 0; i < entries.size(); i++) { myEntry = entries[i]; if((int)myEntry->tags.size() > lev && (myEntry->tags[lev].size() > 1 || config_->getWriteFeatures())) { TagTriplet * trip = fio_->getFeatures(featId+myEntry->word,false); if(!trip) THROW_ERROR("FATAL: Unbuilt model in entry table"); vector< vector > & xs = trip->first; vector & ys = trip->second; // train the model trip->third->trainModel(xs,ys,config_->getBias(),config_->getSolverType(),config_->getEpsilon(),config_->getCost()); if(trip->third->getNumClasses() == 1) { int myLab = trip->third->getLabel(0)-1; KyteaString tmpString = myEntry->tags[lev][0]; myEntry->tags[lev][0] = myEntry->tags[lev][myLab]; myEntry->tags[lev][myLab] = tmpString; char tmpDict = myEntry->tagInDicts[lev][0]; myEntry->tagInDicts[lev][0] = myEntry->tagInDicts[lev][myLab]; myEntry->tagInDicts[lev][myLab] = tmpDict; } } } // print the features fio_->printFeatures(featId,util_); if(config_->getDebug() > 0) cerr << "done!" 
<< endl; } vector > Kytea::getDictionaryMatches(const KyteaString & surf, int lev) { vector > ret; if(!dict_) return ret; const ModelTagEntry* ent = dict_->findEntry(surf); if(ent == 0 || ent->inDict == 0 || (int)ent->tagInDicts.size() <= lev) return ret; // For each tag const vector & tid = ent->tagInDicts[lev]; for(int i = 0; i < (int)tid.size(); i++) { // For each dictionary for(int j = 0; j < dict_->getNumDicts(); j++) if(ModelTagEntry::isInDict(tid[i],j)) ret.push_back(pair(j,i)); } return ret; } unsigned Kytea::tagDictFeatures(const KyteaString & surf, int lev, vector & myFeats, KyteaModel * model) { vector > matches = getDictionaryMatches(surf,lev); if(matches.size() == 0) { unsigned thisFeat = model->mapFeat(util_->mapString("UNK")); if(thisFeat) { myFeats.push_back(thisFeat); return 1; } return 0; } int ret = 0; for(int i = 0; i < (int)matches.size(); i++) { ostringstream oss; oss << "D" << matches[i].first << "T" << matches[i].second; unsigned thisFeat = model->mapFeat(util_->mapString(oss.str())); if(thisFeat != 0) { myFeats.push_back(thisFeat); ret++; } } return ret; } void Kytea::trainSanityCheck() { if(config_->getCorpusFiles().size() == 0 && config_->getFeatureIn().length() == 0) { THROW_ERROR("At least one input corpus must be specified (-part/-full/-prob)"); } else if(config_->getDictionaryFiles().size() > 8) { THROW_ERROR("The maximum number of dictionaries that can be specified is 8."); } else if(config_->getModelFile().length() == 0) { THROW_ERROR("An output model file must be specified when training (-model)"); } // check to make sure the model can be output to ModelIO * modout = ModelIO::createIO(config_->getModelFile().c_str(),config_->getModelFormat(), true, *config_); delete modout; } /////////////////////////////// // Unknown word Tag functions // /////////////////////////////// inline void collectCounts(vector & vec, unsigned pos) { for(unsigned i = 0; i < pos; i++) { if(vec.size() <= i) vec.push_back(1); else vec[i]++; } } void 
Kytea::trainUnk(int lev) { // 1. load the subword dictionaries if(!subwordDict_) { Dictionary::WordMap subwordMap; scanDictionaries(config_->getSubwordDictFiles(), subwordMap, config_, util_, false); subwordDict_ = new Dictionary(util_); subwordDict_->buildIndex(subwordMap); } // 2. align the pronunciation strings, count subword/tag pairs, // create dictionary of pronunciations to count, collect counts if(config_->getDebug() > 0) cerr << " Aligning pronunciation strings" << endl; typedef vector< pair > AlignHyp; const vector & dictEntries = dict_->getEntries(); Dictionary::WordMap tagMap; vector tagCorpus; for(unsigned w = 0; w < dictEntries.size(); w++) { const TagEntry* myDictEntry = dictEntries[w]; const KyteaString & word = myDictEntry->word; const unsigned wordLen = word.length(); if((int)myDictEntry->tags.size() <= lev) continue; for(unsigned p = 0; p < myDictEntry->tags[lev].size(); p++) { const KyteaString & tag = myDictEntry->tags[lev][p]; vector< vector< AlignHyp > > stacks(wordLen+1, vector< AlignHyp >()); stacks[0].push_back(AlignHyp(1,pair(0,0))); // find matches in the word Dictionary::MatchResult matches = subwordDict_->match(word); for(unsigned i = 0; i < matches.size(); i++) { const ProbTagEntry* mySubEntry = matches[i].second; const unsigned cend = matches[i].first+1; const unsigned cstart = cend-mySubEntry->word.length(); for(unsigned j = 0; j < stacks[cstart].size(); j++) { const AlignHyp & myHyp = stacks[cstart][j]; const unsigned pstart = myHyp[myHyp.size()-1].second; if((int)mySubEntry->tags.size() <= lev) continue; for(unsigned k = 0; k < mySubEntry->tags[lev].size(); k++) { // if the current hypothesis matches the alignment hypothesis const KyteaString & pstr = mySubEntry->tags[lev][k]; const unsigned pend = pstart+pstr.length(); if(pend <= tag.length() && tag.substr(pstart,pend-pstart) == pstr) { AlignHyp nextHyp = myHyp; nextHyp.push_back( pair(cend,pend) ); stacks[cend].push_back(nextHyp); } } } } // count the number of alignments 
for(unsigned i = 0; i < stacks[wordLen].size(); i++) { const AlignHyp & myHyp = stacks[wordLen][i]; if(myHyp[myHyp.size()-1].second == tag.length()) { tagCorpus.push_back(tag); for(unsigned j = 1; j < myHyp.size(); j++) { KyteaString subChar = word.substr(myHyp[j-1].first,myHyp[j].first-myHyp[j-1].first); KyteaString subTag = tag.substr(myHyp[j-1].second,myHyp[j].second-myHyp[j-1].second); ProbTagEntry* mySubEntry = subwordDict_->findEntry(subChar); mySubEntry->incrementProb(subTag,lev); addTag(tagMap,subTag,lev,&subTag,0); } break; } } } } if(tagMap.size() == 0) { cerr << " No words found! Aborting unknown model for level "<getDebug() > 0) cerr << " Building index" << endl; Dictionary tagDict(util_); tagDict.buildIndex(tagMap); // 3. put a Dirichlet process prior on the translations, and calculate the ideal alpha // count how many unique options there are for each pronunciation // and accumulate the numerator counts if(config_->getDebug() > 0) cerr << " Calculating alpha" << endl; TwoCountHash tagCounts; const vector & subEntries = subwordDict_->getEntries(); vector numerCounts; for(unsigned w = 0; w < subEntries.size(); w++) { const ProbTagEntry* mySubEntry = subEntries[w]; if((int)mySubEntry->tags.size() <= lev) continue; for(unsigned p = 0; p < mySubEntry->tags[lev].size(); p++) { const KyteaString& tag = mySubEntry->tags[lev][p]; TwoCountHash::iterator pcit = tagCounts.find(tag); if(pcit == tagCounts.end()) pcit = tagCounts.insert(TwoCountHash::value_type(tag,TwoCountHash::mapped_type(0,0))).first; unsigned totalTagCounts = (unsigned)(mySubEntry->probs[lev].size()>p?mySubEntry->probs[lev][p]:0); pcit->second.first++; // add the unique count pcit->second.second += totalTagCounts; // add the total count collectCounts(numerCounts,totalTagCounts); } } // accumulate the denominator counts vector< vector > denomCounts; for(TwoCountHash::const_iterator it = tagCounts.begin(); it != tagCounts.end(); it++) { if(denomCounts.size() < it->second.first) 
denomCounts.resize(it->second.first,vector()); collectCounts(denomCounts[it->second.first-1],it->second.second); } // maximize the alpha using Newton's method double alpha = 0.0001, maxAlpha = 100, changeCutoff = 0.0000001, change = 1; while(abs(change) > changeCutoff && alpha < maxAlpha) { double der1 = 0, der2 = 0, lik = 0, den = 0; for(unsigned i = 0; i < numerCounts.size(); i++) { den = alpha+i; der1 += numerCounts[i]/den; der2 -= numerCounts[i]/den/den; lik += numerCounts[i]*log(den); } for(unsigned i = 0; i < denomCounts.size(); i++) { for(unsigned j = 0; j < denomCounts[i].size(); j++) { den = (i+1)*alpha+j; der1 -= denomCounts[i][j]*(i+1)/den; der2 += denomCounts[i][j]*(i+1)*(i+1)/den/den; lik -= denomCounts[i][j]*log(den); } } change = -1*der1/der2; alpha += change; } if(alpha > maxAlpha) { alpha = 1; if(config_->getDebug() > 0) cerr << "WARNING: Alpha maximization exploded, reverting to alpha="<::MatchResult matches = tagDict.match(tagCorpus[p]); for(unsigned m = 0; m < matches.size(); m++) matches[m].second->incrementProb(matches[m].second->word,lev); } // 5. 
calculate the TM probabilities and adjust with the segmentation probabilities for(unsigned w = 0; w < subEntries.size(); w++) { ProbTagEntry* mySubEntry = subEntries[w]; if(mySubEntry->probs[lev].size() != mySubEntry->tags[lev].size()) mySubEntry->probs[lev].resize(mySubEntry->tags[lev].size(),0); for(unsigned p = 0; p < mySubEntry->tags[lev].size(); p++) { const KyteaString & tag = mySubEntry->tags[lev][p]; ProbTagEntry* myTagEntry = tagDict.findEntry(tag); double origCount = mySubEntry->probs[lev][p]; pair myTagCounts = tagCounts[tag]; // get the smoothed TM probability mySubEntry->probs[lev][p] = (mySubEntry->probs[lev][p]+alpha) / (myTagCounts.second+alpha*myTagCounts.first); // adjust it with the segmentation probability (if existing) if(myTagEntry) mySubEntry->probs[lev][p] *= myTagCounts.second/myTagEntry->probs[lev][0]; else if (origCount != 0.0) THROW_ERROR("FATAL: Numerator found but denominator not in TM calculation"); mySubEntry->probs[lev][p] = log(mySubEntry->probs[lev][p]); } } // 6. 
make the language model if(config_->getDebug() > 0) cerr << " Calculating LM" << endl; if((int)subwordModels_.size() <= lev) subwordModels_.resize(lev+1,0); subwordModels_[lev] = new KyteaLM(config_->getUnkN()); subwordModels_[lev]->train(tagCorpus); } ////////////////// // IO functions // ////////////////// void Kytea::buildFeatureLookups() { // Write out the word segmentation features if(wsModel_) { wsModel_->buildFeatureLookup(util_, config_->getCharWindow(), config_->getTypeWindow(), dict_->getNumDicts(), config_->getDictionaryN()); } for(int i = 0; i < (int)globalMods_.size(); i++) if(globalMods_[i]) globalMods_[i]->buildFeatureLookup(util_, config_->getCharWindow(), config_->getTypeWindow(), dict_->getNumDicts(), config_->getDictionaryN()); // Build the entries for the local models vector & localEntries = dict_->getEntries(); for(int i = 0; i < (int)localEntries.size(); i++) { if(localEntries[i]) { for(int j = 0; j < (int)localEntries[i]->tagMods.size(); j++) { if(localEntries[i]->tagMods[j]) { localEntries[i]->tagMods[j]->buildFeatureLookup(util_, config_->getCharWindow(), config_->getTypeWindow(), dict_->getNumDicts(), config_->getDictionaryN()); } } } } } void Kytea::writeModel(const char* fileName) { if(config_->getDebug() > 0) cerr << "Printing model to " << fileName; // Build the feature lookups before printing buildFeatureLookups(); ModelIO * modout = ModelIO::createIO(fileName,config_->getModelFormat(), true, *config_); modout->writeConfig(*config_); modout->writeModel(wsModel_); // write the global models for(int i = 0; i < config_->getNumTags(); i++) { modout->writeWordList(i >= (int)globalTags_.size()?vector():globalTags_[i]); modout->writeModel(i >= (int)globalMods_.size()?0:globalMods_[i]); } modout->writeModelDictionary(dict_); modout->writeProbDictionary(subwordDict_); for(int i = 0; i < config_->getNumTags(); i++) modout->writeLM(i>=(int)subwordModels_.size()?0:subwordModels_[i]); delete modout; if(config_->getDebug() > 0) cerr << " done!" 
<< endl; } void Kytea::readModel(const char* fileName) { if(config_->getDebug() > 0) cerr << "Reading model from " << fileName; ModelIO * modin = ModelIO::createIO(fileName,ModelIO::FORMAT_UNKNOWN, false, *config_); util_ = config_->getStringUtil(); modin->readConfig(*config_); // Write out the word segmentation features wsModel_ = modin->readModel(); // read the global models globalMods_.resize(config_->getNumTags(),0); globalTags_.resize(config_->getNumTags(), vector()); for(int i = 0; i < config_->getNumTags(); i++) { globalTags_[i] = modin->readWordList(); globalMods_[i] = modin->readModel(); } // read the dictionaries dict_ = modin->readModelDictionary(); subwordDict_ = modin->readProbDictionary(); subwordModels_.resize(config_->getNumTags(),0); for(int i = 0; i < config_->getNumTags(); i++) subwordModels_[i] = modin->readLM(); delete modin; // prepare the prefixes in advance for faster analysis preparePrefixes(); if(config_->getDebug() > 0) cerr << " done!" << endl; } //////////////////////// // Analysis functions // //////////////////////// void Kytea::calculateWS(KyteaSentence & sent) { if(!wsModel_) THROW_ERROR("This model cannot be used for word segmentation."); // Skip empty sentences if(sent.norm.length() == 0) return; // get the features for the sentence FeatureLookup * featLookup = wsModel_->getFeatureLookup(); vector scores(sent.norm.length()-1, featLookup->getBias(0)); featLookup->addNgramScores(featLookup->getCharDict(), sent.norm, config_->getCharWindow(), scores); const string & type_str = util_->getTypeString(sent.norm); featLookup->addNgramScores(featLookup->getTypeDict(), util_->mapString(type_str), config_->getTypeWindow(), scores); if(featLookup->getDictVector()) featLookup->addDictionaryScores( dict_->match(sent.norm), dict_->getNumDicts(), config_->getDictionaryN(), scores); // If the characters match the hard constraint, OK const string & wsc = config_->getWsConstraint(); if(wsc.size()) for(unsigned i = 0; i < scores.size(); i++) 
if(type_str[i]==type_str[i+1] && wsc.find(type_str[i]) != std::string::npos) scores[i] = KyteaModel::isProbabilistic(config_->getSolverType())?0:-100; // Update values, but only ones that are not already sure for(unsigned i = 0; i < sent.wsConfs.size(); i++) if(abs(sent.wsConfs[i]) <= config_->getConfidence()) sent.wsConfs[i] = scores[i]*wsModel_->getMultiplier(); sent.refreshWS(config_->getConfidence()); for(int i = 0; i < (int)sent.words.size(); i++) { KyteaWord & word = sent.words[i]; word.setUnknown(dict_->findEntry(word.norm) == 0); } if(KyteaModel::isProbabilistic(config_->getSolverType())) { for(unsigned i = 0; i < sent.wsConfs.size(); i++) sent.wsConfs[i] = 1/(1.0+exp(-abs(sent.wsConfs[i]))); } } // generate candidates with TM scores bool kyteaTagMore(const KyteaTag a, const KyteaTag b) { return a.second > b.second; } # define BEAM_SIZE 50 vector< KyteaTag > Kytea::generateTagCandidates(const KyteaString & str, int lev) { // cerr << "generateTagCandidates("<showString(str)<<")"<::MatchResult matches = subwordDict_->match(str); vector< vector< KyteaTag > > stack(str.length()+1); stack[0].push_back(KyteaTag(KyteaString(),0)); unsigned end, start, lastEnd = 0; for(unsigned i = 0; i < matches.size(); i++) { // cerr << " match "<showString(matches[i].second->word)<<" "<word.length(); // trim to the beam size if(end != lastEnd && config_->getUnkBeam() > 0 && stack[lastEnd].size() > config_->getUnkBeam()) { sort(stack[lastEnd].begin(), stack[lastEnd].end(), kyteaTagMore); stack[lastEnd].resize(config_->getUnkBeam()); } lastEnd = end; // expand the hypotheses for(unsigned j = 0; j < entry->tags[lev].size(); j++) { for(unsigned k = 0; k < stack[start].size(); k++) { KyteaTag nextPair( stack[start][k].first+entry->tags[lev][j], stack[start][k].second+entry->probs[lev][j] ); // cerr << " ("<showString(entry->word)<<", "<showString(nextPair.first)<<"/"<scoreSingle(nextPair.first,pos); // cerr << "-->" << nextPair.second; } // cerr << endl; 
stack[end].push_back(nextPair); } } } vector ret = stack[stack.size()-1]; for(unsigned i = 0; i < ret.size(); i++) ret[i].second += subwordModels_[lev]->scoreSingle(ret[i].first,ret[i].first.length()); return ret; } void Kytea::calculateUnknownTag(KyteaWord & word, int lev) { // cerr << "calculateUnknownTag("<showString(word.surf)<<")"<= (int)subwordModels_.size() || subwordModels_[lev] == 0) return; if(word.norm.length() > 256) { cerr << "WARNING: skipping pronunciation estimation for extremely long unknown word of length " <showString(word.norm.substr(0,20))<<"'"<mapString(""),0)); return; } // generate candidates if((int)word.tags.size() <= lev) word.tags.resize(lev+1); word.tags[lev] = generateTagCandidates(word.norm, lev); vector & tags = word.tags[lev]; // get the max double maxProb = -1e20, totalProb = 0; for(unsigned i = 0; i < tags.size(); i++) maxProb = max(maxProb,tags[i].second); // convert to prob and get the normalizing constant for(unsigned i = 0; i < tags.size(); i++) { tags[i].second = exp(tags[i].second-maxProb); totalProb += tags[i].second; } // normalize the values for(unsigned i = 0; i < tags.size(); i++) tags[i].second /= totalProb; sort(tags.begin(), tags.end()); // trim the number of candidates if(config_->getTagMax() != 0 && config_->getTagMax() < tags.size()) tags.resize(config_->getTagMax()); } void Kytea::calculateTags(KyteaSentence & sent, int lev) { int startPos = 0, finPos=0; KyteaString charStr = sent.norm; KyteaString typeStr = util_->mapString(util_->getTypeString(charStr)); KyteaString kssx = util_->mapString("SX"), ksst = util_->mapString("ST"); const string & defTag = config_->getDefaultTag(); for(unsigned i = 0; i < sent.words.size(); i++) { KyteaWord & word = sent.words[i]; if((int)word.tags.size() > lev && (int)word.tags[lev].size() > 0 && abs(word.tags[lev][0].second) > config_->getConfidence()) continue; startPos = finPos; finPos = startPos+word.norm.length(); // Find the word in the dictionary and set it to unknown if it 
is ModelTagEntry* ent = dict_->findEntry(word.norm); word.setUnknown(ent == 0); // choose whether to do local or global estimation vector * tags = 0; KyteaModel * tagMod = 0; bool useSelf = false; if(lev < (int)globalMods_.size() && globalMods_[lev] != 0) { tagMod = globalMods_[lev]; tags = &globalTags_[lev]; useSelf = true; } else if(ent != 0 && (int)ent->tags.size() > lev) { tagMod = ent->tagMods[lev]; tags = &(ent->tags[lev]); } // calculate unknown tags if(tags == 0 || tags->size() == 0) { if(config_->getDoUnk()) { calculateUnknownTag(word,lev); if(config_->getDebug() >= 2) cerr << "Tag "<showString(sent.words[i].surface)<<"->UNK)"< feat; FeatureLookup * look; if(tagMod == 0 || (look = tagMod->getFeatureLookup()) == NULL) word.setTag(lev, KyteaTag((*tags)[0],(KyteaModel::isProbabilistic(config_->getSolverType())?1:100))); else { #ifdef KYTEA_SAFE if(look == NULL) THROW_ERROR("null lookure lookup during analysis"); #endif vector scores(tagMod->getNumWeights(), 0); look->addTagNgrams(charStr, look->getCharDict(), scores, config_->getCharN(), startPos, finPos); look->addTagNgrams(typeStr, look->getTypeDict(), scores, config_->getTypeN(), startPos, finPos); if(useSelf) { look->addSelfWeights(charStr.substr(startPos,finPos-startPos), scores, 0); look->addSelfWeights(typeStr.substr(startPos,finPos-startPos), scores, 1); look->addTagDictWeights(getDictionaryMatches(charStr.substr(startPos,finPos-startPos), 0), scores); } for(int j = 0; j < (int)scores.size(); j++) scores[j] += look->getBias(j); if(scores.size() == 1) scores.push_back(KyteaModel::isProbabilistic(config_->getSolverType())?-1*scores[0]:0); word.clearTags(lev); for(int i = 0; i < (int)scores.size(); i++) word.addTag(lev, KyteaTag((*tags)[i],scores[i]*tagMod->getMultiplier())); sort(word.tags[lev].begin(), word.tags[lev].end(), kyteaTagMore); // Convert to a proper margin or probability if(KyteaModel::isProbabilistic(config_->getSolverType())) { double sum = 0; for(int i = 0; i < 
(int)word.tags[lev].size(); i++) { word.tags[lev][i].second = exp(word.tags[lev][i].second); sum += word.tags[lev][i].second; } for(int i = 0; i < (int)word.tags[lev].size(); i++) { word.tags[lev][i].second /= sum; } } else { double secondBest = word.tags[lev][1].second; for(int i = 0; i < (int)word.tags[lev].size(); i++) word.tags[lev][i].second -= secondBest; } } } if(!word.hasTag(lev) && defTag.length()) word.addTag(lev,KyteaTag(util_->mapString(defTag),0)); if(config_->getTagMax() > 0) word.limitTags(lev,config_->getTagMax()); } } // train the analyzer void Kytea::trainAll() { // sanity check trainSanityCheck(); // handle the feature files if(config_->getFeatureIn().length()) { if(config_->getDebug() > 0) cerr << "Loading features from "<getFeatureIn() << "..."; fio_->load(config_->getFeatureIn(),util_); if(config_->getDebug() > 0) cerr << " done!" << endl; } config_->setNumTags(max(config_->getNumTags(),fio_->getNumTags())); if(config_->getFeatureOut().length()) fio_->openOut(config_->getFeatureOut()); // load the vocabulary, tags buildVocabulary(); fio_->setNumTags(config_->getNumTags()); fio_->printWordMap(util_); // train the word segmenter if(config_->getDoWS()) trainWS(); // train the taggers if(config_->getDoTags()) { if((int)globalMods_.size() <= config_->getNumTags()) { globalMods_.resize(config_->getNumTags(),0); globalTags_.resize(config_->getNumTags(), vector()); } for(int i = 0; i < config_->getNumTags(); i++) { if(config_->getGlobal(i)) trainGlobalTags(i); else { trainLocalTags(i); if(config_->getSubwordDictFiles().size() > 0) trainUnk(i); } } } // close the feature output fio_->closeOut(); // write the models out to a file writeModel(config_->getModelFile().c_str()); } // load the models and analyze the input void Kytea::analyze() { // on full input, disable word segmentation if(config_->getInputFormat() == CORP_FORMAT_FULL || config_->getInputFormat() == CORP_FORMAT_TOK) config_->setDoWS(false); // sanity check std::ostringstream buff; 
if(config_->getModelFile().length() == 0) throw std::runtime_error("A model file must be specified to run Kytea (-model)"); // read the models in from the model file readModel(config_->getModelFile().c_str()); if(!config_->getDoWS() && !config_->getDoTags()) { buff << "Both word segmentation and tagging are disabled." << std::endl << "At least one must be selected to perform processing." << std::endl; throw std::runtime_error(buff.str()); } // set the input format if(config_->getDoWS()) { if(config_->getInputFormat() == CORP_FORMAT_DEFAULT) config_->setInputFormat(CORP_FORMAT_RAW); } else { if(config_->getInputFormat() == CORP_FORMAT_DEFAULT) config_->setInputFormat(CORP_FORMAT_TOK); else if(config_->getInputFormat() == CORP_FORMAT_RAW) { buff << "In order to handle raw corpus input, word segmentation must be turned on." << std::endl << "Either specify -in {full,part,prob}, stop using -nows, or train a new " << std::endl << "model that has word segmentation included." << std::endl; throw std::runtime_error(buff.str()); } } // sanity checks if(config_->getDoWS() && wsModel_ == NULL) THROW_ERROR("Word segmentation cannot be performed with this model. 
A new model must be retrained without the -nows option."); if(config_->getDebug() > 0) cerr << "Analyzing input "; CorpusIO *in, *out; iostream *inStr = 0, *outStr = 0; const vector & args = config_->getArguments(); if(args.size() > 0) { in = CorpusIO::createIO(args[0].c_str(),config_->getInputFormat(), *config_, false, util_); } else { inStr = new iostream(cin.rdbuf()); in = CorpusIO::createIO(*inStr, config_->getInputFormat(), *config_, false, util_); } if(args.size() > 1) { out = CorpusIO::createIO(args[1].c_str(),config_->getOutputFormat(), *config_, true, util_); } else { outStr = new iostream(cout.rdbuf()); out = CorpusIO::createIO(*outStr, config_->getOutputFormat(), *config_, true, util_); } out->setUnkTag(config_->getUnkTag()); out->setNumTags(config_->getNumTags()); for(int i = 0; i < config_->getNumTags(); i++) out->setDoTag(i,config_->getDoTag(i)); KyteaSentence* next; while((next = in->readSentence()) != 0) { if(config_->getDoWS()) calculateWS(*next); if(config_->getDoTags()) for(int i = 0; i < config_->getNumTags(); i++) if(config_->getDoTag(i)) calculateTags(*next, i); out->writeSentence(next); delete next; } delete in; delete out; if(inStr) delete inStr; if(outStr) delete outStr; if(config_->getDebug() > 0) cerr << "done!" << endl; } void Kytea::checkEqual(const Kytea & rhs) { checkPointerEqual(util_, rhs.util_); // checkPointerEqual(config_, rhs.config_); checkPointerEqual(dict_, rhs.dict_); checkPointerEqual(wsModel_, rhs.wsModel_); checkPointerEqual(subwordDict_, rhs.subwordDict_); checkPointerVecEqual(subwordModels_, rhs.subwordModels_); checkPointerVecEqual(globalMods_, rhs.globalMods_); checkValueVecEqual(globalTags_, rhs.globalTags_); checkValueVecEqual(dictFeats_, rhs.dictFeats_); } // Destructor and other misc. 
small functions Kytea::~Kytea() { if(dict_) delete dict_; if(subwordDict_) delete subwordDict_; if(wsModel_) delete wsModel_; if(config_) delete config_; if(fio_) delete fio_; for(int i = 0; i < (int)subwordModels_.size(); i++) { if(subwordModels_[i] != 0) delete subwordModels_[i]; } for(int i = 0; i < (int)globalMods_.size(); i++) if(globalMods_[i] != 0) delete globalMods_[i]; for(Sentences::iterator it = sentences_.begin(); it != sentences_.end(); it++) delete *it; } void Kytea::init() { util_ = config_->getStringUtil(); // dict_ = new Dictionary(util_); dict_ = NULL; wsModel_ = NULL; subwordDict_ = NULL; fio_ = new FeatureIO; } template void Kytea::setDictionary(Dictionary * dict) { if(dict_ != 0) delete dict_; dict_ = dict; } template void Kytea::setDictionary(Dictionary * dict); kytea_0.4.6+dfsg.orig/src/lib/corpus-io-raw.cpp0000644000175000017500000000161712122355536020655 0ustar koichikoichi#include #include #include #include #include #define PROB_TRUE 100.0 #define PROB_FALSE -100.0 #define PROB_UNKNOWN 0.0 using namespace kytea; using namespace std; KyteaSentence * RawCorpusIO::readSentence() { #ifdef KYTEA_SAFE if(out_ || !str_) THROW_ERROR("Attempted to read a sentence from an closed or output object"); #endif string s; getline(*str_, s); if(str_->eof()) return 0; KyteaSentence * ret = new KyteaSentence(); ret->surface = util_->mapString(s); ret->norm = util_->normalize(ret->surface); if(ret->surface.length() != 0) ret->wsConfs.resize(ret->surface.length()-1,0); return ret; } void RawCorpusIO::writeSentence(const KyteaSentence * sent, double conf) { *str_ << util_->showString(sent->surface) << endl; } kytea_0.4.6+dfsg.orig/src/lib/corpus-io-tokenized.cpp0000644000175000017500000000604712122355536022062 0ustar koichikoichi#include #include #include #include #include #include #define PROB_TRUE 100.0 #define PROB_FALSE -100.0 #define PROB_UNKNOWN 0.0 using namespace kytea; using namespace std; KyteaSentence * TokenizedCorpusIO::readSentence() { #ifdef 
KYTEA_SAFE if(out_ || !str_) THROW_ERROR("Attempted to read a sentence from an closed or output object"); #endif string s; getline(*str_, s); if(str_->eof()) return 0; KyteaChar spaceChar = bounds_[0]; KyteaString ks = util_->mapString(s), buff(ks.length()); int len = ks.length(); KyteaSentence * ret = new KyteaSentence(); int charLen = 0; // go through the whole string int j = 0, bpos; for(j = 0; j < len; j++) { // 1) get the word bpos = 0; for( ; j < len && ks[j] != spaceChar; j++) buff[bpos++] = ks[j]; if(bpos == 0) { if(ks[j] == spaceChar) continue; else THROW_ERROR("Empty word at position "<normalize(word_str)); charLen += bpos; ret->words.push_back(word); } // make the character/ws string ret->surface = KyteaString(charLen); ret->norm = KyteaString(charLen); unsigned pos = 0; for(KyteaSentence::Words::const_iterator tit = ret->words.begin(); tit != ret->words.end(); tit++) { ret->surface.splice(tit->surface, pos); ret->norm.splice(tit->norm, pos); unsigned nextPos = pos + tit->surface.length() - 1; while(pos++ < nextPos) ret->wsConfs.push_back(PROB_FALSE); ret->wsConfs.push_back(PROB_TRUE); } if(ret->wsConfs.size() > 0) ret->wsConfs.pop_back(); return ret; } void TokenizedCorpusIO::writeSentence(const KyteaSentence * sent, double conf) { const string & wb = util_->showChar(bounds_[0]); for(unsigned i = 0; i < sent->words.size(); i++) { if(i != 0) *str_ << wb; const KyteaWord & w = sent->words[i]; *str_ << util_->showString(w.surface); if(w.getUnknown()) *str_ << unkTag_; } *str_ << endl; } TokenizedCorpusIO::TokenizedCorpusIO(StringUtil * util, const char* wordBound) : CorpusIO(util), bounds_(1) { bounds_[0] = util_->mapChar(wordBound); } TokenizedCorpusIO::TokenizedCorpusIO(const CorpusIO & c, const char* wordBound) : CorpusIO(c), allTags_(false), bounds_(1) { bounds_[0] = util_->mapChar(wordBound); } TokenizedCorpusIO::TokenizedCorpusIO(StringUtil * util, const char* file, bool out, const char* wordBound) : CorpusIO(util,file,out), allTags_(false), 
bounds_(1) { bounds_[0] = util_->mapChar(wordBound); } TokenizedCorpusIO::TokenizedCorpusIO(StringUtil * util, std::iostream & str, bool out, const char* wordBound) : CorpusIO(util,str,out), allTags_(false), bounds_(1) { bounds_[0] = util_->mapChar(wordBound); } kytea_0.4.6+dfsg.orig/src/lib/corpus-io-part.cpp0000644000175000017500000001515712122355536021036 0ustar koichikoichi#include #include #include #include #include #include #define PROB_TRUE 100.0 #define PROB_FALSE -100.0 #define PROB_UNKNOWN 0.0 using namespace kytea; using namespace std; KyteaSentence * PartCorpusIO::readSentence() { #ifdef KYTEA_SAFE if(out_ || !str_) THROW_ERROR("Attempted to read a sentence from an closed or output object"); #endif string s; getline(*str_, s); if(str_->eof()) return 0; KyteaString ks = util_->mapString(s), buff(ks.length()); KyteaChar ukBound = bounds_[0], skipBound = bounds_[1], noBound = bounds_[2], hasBound = bounds_[3], slashChar = bounds_[4], elemChar = bounds_[5], escapeChar = bounds_[6]; KyteaSentence * ret = new KyteaSentence(); int len = ks.length(), charLen = 0; for(int j = 0; j < len; j++) { int bpos = 0; bool cert = true; // read in a word for( ; j < len; j++) { if(ks[j] == ukBound || ks[j] == skipBound || ks[j] == noBound || ks[j] == hasBound || ks[j] == slashChar || ks[j] == elemChar) THROW_ERROR("Misplaced character '"<showChar(ks[j])<<"' in "<= len) THROW_ERROR("Misplaced escape at the end of "<= len || ks[j] == slashChar || ks[j] == hasBound) break; else if(ks[j] == ukBound || ks[j] == skipBound) { ret->wsConfs.push_back(PROB_UNKNOWN); cert = false; } else if(ks[j] != noBound) { THROW_ERROR("Misplaced character '"<showChar(ks[j])<<"' in "<wsConfs.push_back(PROB_FALSE); } KyteaString word_str = buff.substr(0,bpos); KyteaWord word(word_str, util_->normalize(word_str)); charLen += bpos; word.isCertain = cert; bpos = 0; // read in the tags int lev = -1; while(j < len && ks[j] != hasBound) { if(ks[j] == slashChar) lev++; bpos = 0; for(++j ; j < len && ks[j] 
!= hasBound && ks[j] != slashChar && ks[j] != elemChar; j++) { if(ks[j] == escapeChar && ++j == len) THROW_ERROR("Illegal trailing escape character at "<wsConfs.push_back(PROB_TRUE); ret->words.push_back(word); } // make the character/ws string ret->surface = KyteaString(charLen); ret->norm = KyteaString(charLen); unsigned pos = 0; for(KyteaSentence::Words::const_iterator tit = ret->words.begin(); tit != ret->words.end(); tit++) { ret->surface.splice(tit->surface, pos); ret->norm.splice(tit->norm, pos); pos += tit->surface.length(); } return ret; } void PartCorpusIO::writeSentence(const KyteaSentence * sent, double conf) { unsigned curr = 0; const string & ukBound = util_->showChar(bounds_[0]), & skipBound = util_->showChar(bounds_[1]), &noBound = util_->showChar(bounds_[2]), &hasBound = util_->showChar(bounds_[3]), &slashChar = util_->showChar(bounds_[4]), &elemChar = util_->showChar(bounds_[5]); for(unsigned i = 0; i < sent->words.size(); i++) { const KyteaWord & w = sent->words[i]; string sepType = ukBound; for(unsigned j = 0; j < w.surface.length(); ) { *str_ << util_->showChar(sent->surface[curr]); if(curr == sent->wsConfs.size()) sepType = skipBound; else if(sent->wsConfs[curr] > conf) sepType = hasBound; else if(sent->wsConfs[curr] < conf*-1) sepType = noBound; else sepType = ukBound; if(++j != w.surface.length()) *str_ << sepType; curr++; } for(int j = 0; j < w.getNumTags(); j++) { const vector & tags = w.getTags(j); for(int k = 0; k < (int)tags.size(); k++) if(tags[k].second > conf) *str_ << (k==0?slashChar:elemChar) << util_->showString(tags[k].first); } if(w.getUnknown()) *str_ << unkTag_; if(sepType != skipBound) *str_ << sepType; } *str_ << endl; } PartCorpusIO::PartCorpusIO(StringUtil * util, const char* unkBound, const char* skipBound, const char* noBound, const char* hasBound, const char* tagBound, const char* elemBound, const char* escape) : CorpusIO(util), bounds_(7) { bounds_[0] = util_->mapChar(unkBound); bounds_[1] = util_->mapChar(skipBound); 
bounds_[2] = util_->mapChar(noBound); bounds_[3] = util_->mapChar(hasBound); bounds_[4] = util_->mapChar(tagBound); bounds_[5] = util_->mapChar(elemBound); bounds_[6] = util_->mapChar(escape); } PartCorpusIO::PartCorpusIO(const CorpusIO & c, const char* unkBound, const char* skipBound, const char* noBound, const char* hasBound, const char* tagBound, const char* elemBound, const char* escape) : CorpusIO(c), bounds_(7) { bounds_[0] = util_->mapChar(unkBound); bounds_[1] = util_->mapChar(skipBound); bounds_[2] = util_->mapChar(noBound); bounds_[3] = util_->mapChar(hasBound); bounds_[4] = util_->mapChar(tagBound); bounds_[5] = util_->mapChar(elemBound); bounds_[6] = util_->mapChar(escape); } PartCorpusIO::PartCorpusIO(StringUtil * util, std::iostream & str, bool out, const char* unkBound, const char* skipBound, const char* noBound, const char* hasBound, const char* tagBound, const char* elemBound, const char* escape) : CorpusIO(util,str,out), bounds_(7) { bounds_[0] = util_->mapChar(unkBound); bounds_[1] = util_->mapChar(skipBound); bounds_[2] = util_->mapChar(noBound); bounds_[3] = util_->mapChar(hasBound); bounds_[4] = util_->mapChar(tagBound); bounds_[5] = util_->mapChar(elemBound); bounds_[6] = util_->mapChar(escape); } PartCorpusIO::PartCorpusIO(StringUtil * util, const char* file, bool out, const char* unkBound, const char* skipBound, const char* noBound, const char* hasBound, const char* tagBound, const char* elemBound, const char* escape) : CorpusIO(util,file,out), bounds_(7) { bounds_[0] = util_->mapChar(unkBound); bounds_[1] = util_->mapChar(skipBound); bounds_[2] = util_->mapChar(noBound); bounds_[3] = util_->mapChar(hasBound); bounds_[4] = util_->mapChar(tagBound); bounds_[5] = util_->mapChar(elemBound); bounds_[6] = util_->mapChar(escape); } kytea_0.4.6+dfsg.orig/src/lib/dictionary.cpp0000644000175000017500000002020612122355536020306 0ustar koichikoichi#include #include #include #include #include #include #include using namespace kytea; using namespace 
std; namespace kytea { ModelTagEntry::~ModelTagEntry() { for(int i = 0; i < (int)tagMods.size(); i++) if(tagMods[i]) delete tagMods[i]; } double ProbTagEntry::incrementProb(const KyteaString & str, int lev) { // std::cerr << "p.size=="< void Dictionary::checkEqual(const Dictionary & rhs) const { if(states_.size() != rhs.states_.size()) THROW_ERROR("states_.size() != rhs.states_.size() ("<= "<first.length() == lev) { node.output.push_back(entries_.size()); node.isBranch = true; entries_.push_back(startCopy->second); startCopy++; } if(startCopy == end) return; // count the number of buckets wm_const_iterator binEnd = startCopy, binStart; unsigned numBins = 0; KyteaChar lastChar = binEnd->first[lev]; do { binEnd++; KyteaChar nextChar = (binEnd == end?0:binEnd->first[lev]); if(nextChar != lastChar) { numBins++; lastChar = nextChar; } } while(binEnd != end); node.gotos.reserve(numBins); // add bucket strings binStart = startCopy, binEnd = startCopy; lastChar = binStart->first[lev]; do { binEnd++; KyteaChar nextChar = (binEnd == end?0:binEnd->first[lev]); if(nextChar != lastChar) { unsigned nextNode = states_.size(); states_.push_back(new DictionaryState()); node.gotos.push_back(std::pair(lastChar,nextNode)); buildGoto(binStart,binEnd,lev+1,nextNode); binStart = binEnd; lastChar = nextChar; } } while(binEnd != end); } template void Dictionary::buildFailures() { if(states_.size() == 0) return; std::deque sq; DictionaryState::Gotos & g0 = states_[0]->gotos; for(unsigned i = 0; i < g0.size(); i++) sq.push_back(g0[i].second); while(sq.size() != 0) { unsigned r = sq.front(); sq.pop_front(); DictionaryState::Gotos & gr = states_[r]->gotos; for(unsigned i = 0; i < gr.size(); i++) { KyteaChar a = gr[i].first; unsigned s = gr[i].second; sq.push_back(s); unsigned state = states_[r]->failure; unsigned trans = 0; while((trans = states_[state]->step(a)) == 0 && (state != 0)) state = states_[state]->failure; states_[s]->failure = trans; for(unsigned j = 0; j < 
states_[trans]->output.size(); j++) states_[s]->output.push_back(states_[trans]->output[j]); } } } template void Dictionary::clearData() { for(unsigned i = 0; i < states_.size(); i++) delete states_[i]; for(unsigned i = 0; i < entries_.size(); i++) delete entries_[i]; entries_.clear(); states_.clear(); } template void Dictionary::buildIndex(const WordMap & input) { if(input.size() == 0) THROW_ERROR("Cannot build dictionary for no input"); clearData(); states_.push_back(new DictionaryState()); buildGoto(input.begin(), input.end(), 0, 0); buildFailures(); } inline string showWord(StringUtil * util, const ModelTagEntry * entry) { return util->showString(entry->word); } inline string showWord(StringUtil * util, const ProbTagEntry * entry) { return util->showString(entry->word); } inline string showWord(StringUtil * util, const FeatVec * entry) { ostringstream oss; for(int i = 0; i < (int)entry->size(); i++) { if(i != 0) oss << ","; oss << (*entry)[i]; } return oss.str(); } template void Dictionary::print() { for(unsigned i = 0; i < states_.size(); i++) { std::cout << "s="< Entry * Dictionary::findEntry(KyteaString str) { if(str.length() == 0) return 0; unsigned state = 0, lev = 0; do { #ifdef KYTEA_SAFE if(state >= states_.size()) THROW_ERROR("Accessing state "<step(str[lev++]); } while (state != 0 && lev < str.length()); if(states_[state]->output.size() == 0) return 0; if(!states_[state]->isBranch) return 0; return entries_[states_[state]->output[0]]; } template const Entry * Dictionary::findEntry(KyteaString str) const { if(str.length() == 0) return 0; unsigned state = 0, lev = 0; do { state = states_[state]->step(str[lev++]); } while (state != 0 && lev < str.length()); if(states_[state]->output.size() == 0) return 0; if(!states_[state]->isBranch) return 0; return entries_[states_[state]->output[0]]; } template <> unsigned Dictionary::getTagID(KyteaString str, KyteaString tag, int lev) { return 0; } template unsigned Dictionary::getTagID(KyteaString str, KyteaString 
tag, int lev) { const Entry * ent = findEntry(str); if(ent == 0) return 0; for(unsigned i = 0; i < ent->tags[lev].size(); i++) if(ent->tags[lev][i] == tag) return i+1; return 0; } template typename Dictionary::MatchResult Dictionary::match( const KyteaString & chars ) const { const unsigned len = chars.length(); unsigned currState = 0, nextState; MatchResult ret; for(unsigned i = 0; i < len; i++) { KyteaChar c = chars[i]; while((nextState = states_[currState]->step(c)) == 0 && currState != 0) currState = states_[currState]->failure; currState = nextState; std::vector & output = states_[currState]->output; for(unsigned j = 0; j < output.size(); j++) ret.push_back( std::pair(i, entries_[output[j]]) ); } return ret; } template class Dictionary; template class Dictionary; template class Dictionary; } kytea_0.4.6+dfsg.orig/src/lib/model-io.cpp0000644000175000017500000006414512122355536017660 0ustar koichikoichi/* * Copyright 2009, KyTea Development Team * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define BUFFER_SIZE 4096 #define NEG_INFINITY -999.0 #define NULL_STRING "" using namespace std; namespace kytea { static const char *solver_type_table[]= { "L2R_LR", "L2R_L2LOSS_SVC_DUAL", "L2R_L2LOSS_SVC","L2R_L1LOSS_SVC_DUAL","MCSVM_CS","L1R_L2LOSS_SVC","L1R_LR","L2R_LR_DUAL", NULL }; ModelIO * ModelIO::createIO(const char* file, Format form, bool output, KyteaConfig & config) { if(output && form == ModelIO::FORMAT_UNKNOWN) { THROW_ERROR("A format must be specified for model output"); } else if(!output) { ifstream ifs(file); if(!ifs.good()) THROW_ERROR("Could not open model file "<> buff1) || !(iss >> buff2) || !(iss >> buff3) || !(iss >> buff4) || buff1 != "KyTea" || buff3.length() != 1) THROW_ERROR("Badly formed model (header incorrect)"); if(buff2 != MODEL_IO_VERSION) THROW_ERROR("Incompatible model version. Expected " << MODEL_IO_VERSION << ", but found " << buff2 << "."); form = buff3[0]; config.setEncoding(buff4.c_str()); ifs.close(); } StringUtil * util = config.getStringUtil(); if(form == ModelIO::FORMAT_TEXT) { return new TextModelIO(util,file,output); } else if(form == ModelIO::FORMAT_BINARY) { return new BinaryModelIO(util,file,output); } else { THROW_ERROR("Illegal model format"); } } ModelIO * ModelIO::createIO(iostream & file, Format form, bool output, KyteaConfig & config) { StringUtil * util = config.getStringUtil(); if(form == ModelIO::FORMAT_TEXT) { return new TextModelIO(util,file,output); } else if(form == ModelIO::FORMAT_BINARY) { return new BinaryModelIO(util,file,output); } else { THROW_ERROR("Illegal model format"); } } void TextModelIO::writeConfig(const KyteaConfig & config) { *str_ << "KyTea " << MODEL_IO_VERSION << " T " << config.getEncodingString() << endl; numTags_ = (int)config.getNumTags(); if(!config.getDoWS()) *str_ << "-nows" << endl; if(!config.getDoTags()) *str_ << "-notags" << endl; *str_ 
<< "-numtags " << numTags_ << endl; if(config.getBias()<0) *str_ << "-nobias" << endl; *str_ << "-charw " << (int)config.getCharWindow() << endl << "-charn " << (int)config.getCharN() << endl << "-typew " << (int)config.getTypeWindow() << endl << "-typen " << (int)config.getTypeN() << endl << "-dicn " << (int)config.getDictionaryN() << endl << "-eps " << config.getEpsilon() << endl << "-solver " << config.getSolverType() << endl << endl; // write the character map *str_ << "characters" << endl << config.getStringUtil()->serialize() << endl; *str_ << endl; } void TextModelIO::readConfig(KyteaConfig & config) { string line,s1,s2; getline(*str_,line); // ignore the header while(getline(*str_, line) && line.length() != 0) { istringstream iss(line); iss >> s1; iss >> s2; config.parseTrainArg(s1.c_str(), (s2.length()==0?0:s2.c_str())); } numTags_ = config.getNumTags(); getline(*str_,line); // check the header if(line != "characters") THROW_ERROR("Badly formatted file, expected 'characters', got '" << line << "'"); getline(*str_, line); // get the serialized string util config.getStringUtil()->unserialize(line); getline(*str_, line); // check the last line } void TextModelIO::writeModel(const KyteaModel * mod) { // print a single endl for empty models if(mod == 0 || mod->getNumClasses() < 2) { *str_ << endl; return; } int i; int nr_feature=mod->getNumFeatures(); int n; if(mod->getBias()>=0) n=nr_feature+1; else n=nr_feature; int w_size = n; int nr_w = mod->getNumWeights(); *str_ << "solver_type " << solver_type_table[mod->getSolver()] << endl; *str_ << "nr_class " << mod->getNumClasses() << endl; *str_ << "label"; for(i=0; i<(int)mod->getNumClasses(); i++) *str_ << " " << mod->getLabel(i); *str_ << endl; *str_ << "nr_feature " << nr_feature << endl; char buffer[50]; // fix this sprintf(buffer, "%.16g", mod->getBias()); *str_ << "bias " << buffer << endl; sprintf(buffer, "%.16g", mod->getMultiplier()); *str_ << "mult " << buffer << endl; *str_ << "w" << endl; // print the 
feature names and values const FeatNameVec & names = mod->getNames(); for(i=0; ishowString(names[i+1]) << endl; for(j=0; jgetWeight(i,j) << " "; *str_ << endl; } *str_ << endl; writeFeatureLookup(mod->getFeatureLookup()); } // write out a language model void TextModelIO::writeLM(const KyteaLM * lm) { // print a single endl for empty models if(lm == 0) { *str_ << endl; return; } *str_ << "lmn " << lm->n_ << endl; *str_ << "lmvocab " << lm->vocabSize_ << endl; KyteaChar spaceChar = util_->mapChar(" "); KyteaString nullString = util_->mapString(NULL_STRING); // sort the set of all keys set keys; for(KyteaDoubleMap::const_iterator it = lm->probs_.begin(); it != lm->probs_.end(); it++) keys.insert(it->first); for(KyteaDoubleMap::const_iterator it = lm->fallbacks_.begin(); it != lm->fallbacks_.end(); it++) keys.insert(it->first); for(set::const_iterator it = keys.begin(); it != keys.end(); it++) { KyteaDoubleMap::const_iterator fit = const_cast(lm)->probs_.find(*it); KyteaString displayString; if(it->length() == 0) displayString = nullString; else { displayString = *it; // remove the null characters for(unsigned i = 0; i < displayString.length(); i++) if(!displayString[i]) displayString[i] = spaceChar; } *str_ << (fit == lm->probs_.end() ? 
NEG_INFINITY : fit->second) << "\t" << util_->showString(displayString); fit = const_cast(lm)->fallbacks_.find(*it); if(fit != lm->fallbacks_.end()) *str_ << "\t" << fit->second; *str_ << endl; } *str_ << endl; } KyteaModel * TextModelIO::readModel() { // the first line either contains the feature count or empty line string line; getline(*str_, line); if(line.length() == 0) return 0; int i; int nr_feature; int n; int nr_class; double bias; double mult; KyteaModel * mod = new KyteaModel(); string str; while(1) { *str_ >> str; if(strcmp(str.c_str(),"solver_type")==0) { *str_ >> str; int i; for(i=0;solver_type_table[i];i++) { if(strcmp(solver_type_table[i],str.c_str())==0) { mod->setSolver(i); break; } } if(solver_type_table[i] == NULL) { delete mod; THROW_ERROR("unknown solver type."); } } else if(strcmp(str.c_str(),"nr_class")==0) { *str_ >> nr_class; mod->setNumClasses(nr_class); } else if(strcmp(str.c_str(),"nr_feature")==0) { *str_ >> nr_feature; } else if(strcmp(str.c_str(),"bias")==0) { *str_ >> bias; mod->setBias(bias); } else if(strcmp(str.c_str(),"mult")==0) { *str_ >> mult; mod->setMultiplier(mult); } else if(strcmp(str.c_str(),"w")==0) { // clear out the rest of the line getline(*str_,str); break; } else if(strcmp(str.c_str(),"label")==0) { int nr_class = mod->getNumClasses(); int label; for(int i=0;i> label; mod->setLabel(i,label); } } else { delete mod; THROW_ERROR("Unknown text in model file '" << str << "'"); } } if(mod->getBias()>=0) n=nr_feature+1; else n=nr_feature; int w_size = n; int nr_w = mod->getNumWeights(); mod->initializeWeights(w_size,nr_w); for(i=0; imapFeat(util_->mapString(line)); } getline(*str_,str); istringstream iss(str); string buff; for(j=0; j> buff; mod->setWeight(i,j,(FeatVal)util_->parseFloat(buff.c_str())); } } mod->setNumFeatures(nr_feature); getline(*str_, str); if(str.length() != 0 && str != " ") THROW_ERROR("Bad line when expecting end of file: '" << str << "'"); // read models shouldn't add any additional features 
mod->setAddFeatures(false); mod->setFeatureLookup(readFeatureLookup()); return mod; } // write out a language model KyteaLM * TextModelIO::readLM() { // the first line either contains the n-gram length, or an empty line string line, str; getline(*str_, line); if(line.length() == 0) return 0; // get and check the first line istringstream linestream1(line); linestream1 >> str; if(str != "lmn") { cerr << str << endl; THROW_ERROR("Badly formatted first line in LM"); } linestream1 >> str; KyteaLM* lm = new KyteaLM(util_->parseInt(str.c_str())); // get and check the second line getline(*str_, line); istringstream linestream2(line); linestream2 >> str; if(str != "lmvocab") THROW_ERROR("Badly formatted second line in LM"); linestream2 >> str; lm->vocabSize_ = util_->parseInt(str.c_str()); KyteaChar spaceChar = util_->mapChar(" "); // get and check double prob, fb; KyteaString kword; while(getline(*str_, line)) { if(line.length() == 0) break; istringstream linestream(line); // prob getline(linestream, str, '\t'); prob = util_->parseFloat(str.c_str()); // word getline(linestream, str, '\t'); if(str == NULL_STRING) str = ""; kword = util_->mapString(str); for(unsigned i = 0; i < kword.length(); i++) if(kword[i] == spaceChar) kword[i] = 0; // fallback if(getline(linestream, str, '\t')) { fb = util_->parseFloat(str.c_str()); if(fb != NEG_INFINITY) lm->fallbacks_.insert(pair(kword,fb)); } if(prob != NEG_INFINITY) lm->probs_.insert(pair(kword,prob)); } return lm; } void TextModelIO::writeFeatVec(const vector * entry) { int mySize = (int)(entry ? 
entry->size() : 0); for(int j = 0; j < mySize; j++) { if(j!=0) *str_ << " "; *str_ << (*entry)[j]; // *str_ << util_->showString((*entry)[j]); } *str_ << endl; } template <> void TextModelIO::writeEntry(const vector * entry) { writeFeatVec(entry); } template <> void TextModelIO::writeEntry(const ModelTagEntry * entry) { *str_ << util_->showString(entry->word) << endl; for(int i = 0; i < numTags_; i++) { int mySize = (int)entry->tags.size() > i ? entry->tags[i].size() : 0; for(int j = 0; j < mySize; j++) { if(j!=0) *str_ << " "; *str_ << util_->showString(entry->tags[i][j]); } *str_ << endl; for(int j = 0; j < mySize; j++) { if(j!=0) *str_ << " "; *str_ << (int)entry->tagInDicts[i][j]; } *str_ << endl; } bool has = false; for(unsigned j = 0; j < 8; j++) { if(entry->isInDict(j)) { if(has) *str_ << " "; *str_ << (unsigned)j; has = true; } } *str_ << endl; for(int i = 0; i < numTags_; i++) { writeModel((int)entry->tagMods.size() > i?entry->tagMods[i]:0); } } template <> void TextModelIO::writeEntry(const ProbTagEntry * entry) { *str_ << util_->showString(entry->word) << endl; for(int i = 0; i < numTags_; i++) { int mySize = (int)entry->tags.size() > i ? entry->tags[i].size() : 0; for(int j = 0; j < mySize; j++) { if(j!=0) *str_ << " "; *str_ << util_->showString(entry->tags[i][j]); } *str_ << endl; } for(int i = 0; i < numTags_; i++) { int mySize = (int)entry->probs.size() > i ? entry->probs[i].size() : 0; for(int j = 0; j < mySize; j++) { if(j!=0) *str_ << " "; *str_ << entry->probs[i][j]; } *str_< * entry) { int mySize = (int)(entry ? entry->size() : 0); writeBinary((uint32_t)mySize); for(int j = 0; j < mySize; j++) writeBinary((FeatVal)(*entry)[j]); } template <> void BinaryModelIO::writeEntry(const vector * entry) { writeFeatVec(entry); } template <> void BinaryModelIO::writeEntry(const ProbTagEntry * entry) { writeString(entry->word); for(int i = 0; i < numTags_; i++) { int mySize = (int)entry->tags.size() > i ? 
entry->tags[i].size() : 0; writeBinary((uint32_t)mySize); for(int j = 0; j < mySize; j++) { writeString(entry->tags[i][j]); writeBinary((double)entry->probs[i][j]); } } } vector* TextModelIO::readFeatVec() { string line, buff; vector * entry = new vector; getline(*str_, line); istringstream iss(line); while(iss >> buff) entry->push_back((FeatVal)util_->parseFloat(buff.c_str())); return entry; } template <> vector* TextModelIO::readEntry >() { return readFeatVec(); } template <> ModelTagEntry* TextModelIO::readEntry() { string line, buff; getline(*str_, line); ModelTagEntry* entry = new ModelTagEntry(util_->mapString(line)); entry->setNumTags(numTags_); for(int i = 0; i < numTags_; i++) { // get the tags getline(*str_, line); istringstream iss(line); while(iss >> buff) entry->tags[i].push_back(util_->mapString(buff.c_str())); // get which dictionaries each tag is in getline(*str_, line); istringstream iss2(line); while(iss2 >> buff) entry->tagInDicts[i].push_back(util_->parseInt(buff.c_str())); } getline(*str_, line); istringstream iss2(line); while(iss2 >> buff) entry->setInDict(util_->parseInt(buff.c_str())); for(int i = 0; i < numTags_; i++) { entry->tagMods[i] = readModel(); if(entry->tagMods[i] && entry->tagMods[i]->getNumClasses() > entry->tags[i].size()) THROW_ERROR("Model classes > tag classes ("<tagMods[i]->getNumClasses()<<", "<tags[i].size()<<") @ "<showString(entry->word)); } return entry; } template <> ProbTagEntry* TextModelIO::readEntry() { string line, buff; getline(*str_, line); ProbTagEntry* entry = new ProbTagEntry(util_->mapString(line)); entry->setNumTags(numTags_); for(int i = 0; i < numTags_; i++) { getline(*str_, line); istringstream iss(line); while(iss >> buff) entry->tags[i].push_back(util_->mapString(buff.c_str())); } for(int i = 0; i < numTags_; i++) { getline(*str_, line); istringstream iss2(line); while(iss2 >> buff) entry->probs[i].push_back(util_->parseFloat(buff.c_str())); if(entry->probs[i].size() != entry->tags[i].size()) 
THROW_ERROR("Non-matching probability and tag values "<probs[i].size() << " != " << entry->tags[i].size()); } return entry; } void BinaryModelIO::writeConfig(const KyteaConfig & config) { *str_ << "KyTea " << MODEL_IO_VERSION << " B " << config.getEncodingString() << endl; writeBinary(config.getDoWS()); writeBinary(config.getDoTags()); numTags_ = config.getNumTags(); writeBinary((uint32_t)numTags_); writeBinary(config.getCharWindow()); writeBinary(config.getCharN()); writeBinary(config.getTypeWindow()); writeBinary(config.getTypeN()); writeBinary(config.getDictionaryN()); writeBinary(config.getBias()<0); writeBinary(config.getEpsilon()); writeBinary((char)config.getSolverType()); // write the character map writeString(config.getStringUtil()->serialize()); } void BinaryModelIO::readConfig(KyteaConfig & config) { string line; getline(*str_,line); // ignore the header config.setDoWS(readBinary() && config.getDoWS()); config.setDoTags(readBinary() && config.getDoTags()); numTags_ = readBinary(); config.setNumTags(numTags_); config.setCharWindow(readBinary()); config.setCharN(readBinary()); config.setTypeWindow(readBinary()); config.setTypeN(readBinary()); config.setDictionaryN(readBinary()); config.setBias(readBinary()?1.0:-1.0); config.setEpsilon(readBinary()); config.setSolverType(readBinary()); config.getStringUtil()->unserialize(readString()); } void BinaryModelIO::writeModel(const KyteaModel * mod) { // Write the number of classes, or 0 if there is no model if(mod == 0 || mod->getNumClasses() < 2) { writeBinary((int32_t)0); return; } writeBinary((int32_t)mod->getNumClasses()); writeBinary((char)mod->getSolver()); for(int i=0; i<(int)mod->getNumClasses(); i++) writeBinary((int32_t)mod->getLabel(i)); writeBinary(mod->getBias()>=0); writeBinary(mod->getMultiplier()); writeFeatureLookup(mod->getFeatureLookup()); } void BinaryModelIO::writeLM(const KyteaLM * lm) { // print the n-gram length, 0 for empty models if(lm == 0) { writeBinary((uint32_t)0); return; } 
writeBinary((uint32_t)lm->n_); writeBinary((uint32_t)lm->vocabSize_); // sort the keys set keys; for(KyteaDoubleMap::const_iterator it = lm->probs_.begin(); it != lm->probs_.end(); it++) keys.insert(it->first); for(KyteaDoubleMap::const_iterator it = lm->fallbacks_.begin(); it != lm->fallbacks_.end(); it++) keys.insert(it->first); // print writeBinary((uint32_t)keys.size()); for(set::const_iterator it = keys.begin(); it != keys.end(); it++) { writeString(*it); KyteaDoubleMap::const_iterator fit = const_cast(lm)->probs_.find(*it); writeBinary(fit == lm->probs_.end() ? NEG_INFINITY : fit->second); fit = const_cast(lm)->fallbacks_.find(*it); if(it->length() != lm->n_) writeBinary(fit == lm->fallbacks_.end() ? NEG_INFINITY : fit->second); } } KyteaModel * BinaryModelIO::readModel() { // Read the number of classes int numC = readBinary(); if(numC == 0) return NULL; KyteaModel * mod = new KyteaModel(); mod->setAddFeatures(false); mod->setNumClasses(numC); // Read the solver mod->setSolver(readBinary()); // Read the class labels for(int i=0;(int)isetLabel(i, readBinary()); // Read the bias and multiplier mod->setBias(readBinary()?1.0:-1.0); mod->setMultiplier(readBinary()); // Read the feature lookup mod->setFeatureLookup(readFeatureLookup()); return mod; } KyteaLM* BinaryModelIO::readLM() { unsigned n = readBinary(); if(!n) return 0; KyteaLM* lm = new KyteaLM(n); lm->vocabSize_ = readBinary(); // sort the keys and print unsigned entrysize = readBinary(); while(entrysize-- != 0) { KyteaString str = readKyteaString(); double prob = readBinary(); if(prob != NEG_INFINITY) lm->probs_.insert(pair(str,prob)); if(str.length() != lm->n_) { double fallback = readBinary(); if(fallback != NEG_INFINITY) lm->fallbacks_.insert(pair(str,fallback)); } } return lm; } template <> void BinaryModelIO::writeEntry(const ModelTagEntry * entry) { writeString(entry->word); for(int i = 0; i < numTags_; i++) { int mySize = (int)entry->tags.size() > i ? 
entry->tags[i].size() : 0; // if there are multiple tags, but only one existed in model training data, swap writeBinary((uint32_t)mySize); for(int j = 0; j < mySize; j++) { writeString(entry->tags[i][j]); writeBinary((unsigned char)entry->tagInDicts[i][j]); } } writeBinary((unsigned char)entry->inDict); for(int i = 0; i < numTags_; i++) writeModel((int)entry->tagMods.size() > i ? entry->tagMods[i] : 0); } vector* BinaryModelIO::readFeatVec() { int mySize = readBinary(); vector * entry = new vector; for(int i = 0; i < mySize; i++) entry->push_back(readBinary()); return entry; } template <> vector* BinaryModelIO::readEntry >() { return readFeatVec(); } template <> ModelTagEntry* BinaryModelIO::readEntry() { ModelTagEntry* entry = new ModelTagEntry(readKyteaString()); entry->setNumTags(numTags_); for(int i = 0; i < numTags_; i++) { int mySize = readBinary(); entry->tags[i].resize(mySize); entry->tagInDicts[i].resize(mySize); for(int j = 0; j < mySize; j++) { entry->tags[i][j] = readKyteaString(); entry->tagInDicts[i][j] = readBinary(); } } entry->inDict = readBinary(); for(int i = 0; i < numTags_; i++) entry->tagMods[i] = readModel(); return entry; } template <> ProbTagEntry* BinaryModelIO::readEntry() { ProbTagEntry* entry = new ProbTagEntry(readKyteaString()); entry->setNumTags(numTags_); for(int i = 0; i < numTags_; i++) { unsigned mySize = readBinary(); entry->tags[i].resize(mySize); entry->probs[i].resize(mySize); for(unsigned j = 0; j < mySize; j++) { entry->tags[i][j] = readKyteaString(); entry->probs[i][j] = readBinary(); } } return entry; } void TextModelIO::writeWordList(const std::vector & list) { for(unsigned i = 0; i < list.size(); i++) { if(i != 0) *str_ << " "; *str_ << util_->showString(list[i]); } *str_ << endl; } void BinaryModelIO::writeWordList(const std::vector & list) { writeBinary((uint32_t)list.size()); for(unsigned i = 0; i < list.size(); i++) writeString(list[i]); } vector TextModelIO::readWordList() { string line,s; getline(*str_,line); // 
ignore the header istringstream iss(line); vector ret; while(iss >> s) ret.push_back(util_->mapString(s)); return ret; } vector BinaryModelIO::readWordList() { vector list(readBinary()); for(unsigned i = 0; i < list.size(); i++) list[i] = readKyteaString(); return list; } void TextModelIO::writeFeatureLookup(const FeatureLookup* featLookup) { if(!featLookup) { *str_ << endl; return; } *str_ << "lookup" << endl; writeVectorDictionary(featLookup->getCharDict()); writeVectorDictionary(featLookup->getTypeDict()); writeVectorDictionary(featLookup->getSelfDict()); writeFeatVec(featLookup->getDictVector()); writeFeatVec(featLookup->getBiases()); writeFeatVec(featLookup->getTagDictVector()); writeFeatVec(featLookup->getTagUnkVector()); } FeatureLookup * TextModelIO::readFeatureLookup() { string line; getline(*str_, line); if(line == "") return 0; else if (line != "lookup") THROW_ERROR("Poorly formatted model: expecting 'lookup' but got "<setCharDict(readVectorDictionary()); look->setTypeDict(readVectorDictionary()); look->setSelfDict(readVectorDictionary()); look->setDictVector(readFeatVec()); look->setBiases(readFeatVec()); look->setTagDictVector(readFeatVec()); look->setTagUnkVector(readFeatVec()); return look; } void BinaryModelIO::writeFeatureLookup(const FeatureLookup * featLookup) { if(featLookup) { writeBinary(1); writeVectorDictionary(featLookup->getCharDict()); writeVectorDictionary(featLookup->getTypeDict()); writeVectorDictionary(featLookup->getSelfDict()); writeFeatVec(featLookup->getDictVector()); writeFeatVec(featLookup->getBiases()); writeFeatVec(featLookup->getTagDictVector()); writeFeatVec(featLookup->getTagUnkVector()); } else { writeBinary(0); } } FeatureLookup * BinaryModelIO::readFeatureLookup() { char active = readBinary(); FeatureLookup * look = 0; if(active) { look = new FeatureLookup; look->setCharDict(readVectorDictionary()); look->setTypeDict(readVectorDictionary()); look->setSelfDict(readVectorDictionary()); look->setDictVector(readFeatVec()); 
look->setBiases(readFeatVec()); look->setTagDictVector(readFeatVec()); look->setTagUnkVector(readFeatVec()); } return look; } } kytea_0.4.6+dfsg.orig/src/lib/corpus-io.cpp0000644000175000017500000000673612133243467020076 0ustar koichikoichi/* * Copyright 2009, KyTea Development Team * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include #include #include #include #include #include #include #include "config.h" #define PROB_TRUE 100.0 #define PROB_FALSE -100.0 #define PROB_UNKNOWN 0.0 using namespace kytea; using namespace std; CorpusIO * CorpusIO::createIO(const char* file, Format form, const KyteaConfig & conf, bool output, StringUtil* util) { if(form == CORP_FORMAT_FULL) { return new FullCorpusIO(util,file,output,conf.getWordBound(),conf.getTagBound(),conf.getElemBound(),conf.getEscape()); } else if(form == CORP_FORMAT_TAGS) { FullCorpusIO * io = new FullCorpusIO(util,file,output,conf.getWordBound(),conf.getTagBound(),conf.getElemBound(),conf.getEscape()); io->setPrintWords(false); return io; } else if(form == CORP_FORMAT_TOK) { return new TokenizedCorpusIO(util,file,output,conf.getWordBound()); } else if(form == CORP_FORMAT_PART) { return new PartCorpusIO(util,file,output,conf.getUnkBound(),conf.getSkipBound(),conf.getNoBound(),conf.getHasBound(),conf.getTagBound(),conf.getElemBound(),conf.getEscape()); } else if(form == CORP_FORMAT_PROB) { return new 
ProbCorpusIO(util,file,output,conf.getWordBound(),conf.getTagBound(),conf.getElemBound(),conf.getEscape()); } else if(form == CORP_FORMAT_RAW) { return new RawCorpusIO(util,file,output); } else if(form == CORP_FORMAT_EDA) { return new EdaCorpusIO(util,file,output); } else THROW_ERROR("Illegal Output Format"); } CorpusIO * CorpusIO::createIO(iostream & file, Format form, const KyteaConfig & conf, bool output, StringUtil* util) { if(form == CORP_FORMAT_FULL) { return new FullCorpusIO(util,file,output,conf.getWordBound(),conf.getTagBound(),conf.getElemBound(),conf.getEscape()); } else if(form == CORP_FORMAT_TAGS) { FullCorpusIO * io = new FullCorpusIO(util,file,output,conf.getWordBound(),conf.getTagBound(),conf.getElemBound(),conf.getEscape()); io->setPrintWords(false); return io; } else if(form == CORP_FORMAT_TOK) { return new TokenizedCorpusIO(util,file,output,conf.getWordBound()); } else if(form == CORP_FORMAT_PART) { return new PartCorpusIO(util,file,output,conf.getUnkBound(),conf.getSkipBound(),conf.getNoBound(),conf.getHasBound(),conf.getTagBound(),conf.getElemBound(),conf.getEscape()); } else if(form == CORP_FORMAT_PROB) { return new ProbCorpusIO(util,file,output,conf.getWordBound(),conf.getTagBound(),conf.getElemBound(),conf.getEscape()); } else if(form == CORP_FORMAT_RAW) { return new RawCorpusIO(util,file,output); } else if(form == CORP_FORMAT_EDA) { return new EdaCorpusIO(util,file,output); } else THROW_ERROR("Illegal Output Format"); } kytea_0.4.6+dfsg.orig/src/lib/feature-io.cpp0000644000175000017500000001415512122355536020207 0ustar koichikoichi #include #include #include #include #include using namespace kytea; using namespace std; FeatureIO::~FeatureIO() { if(out_) delete out_; } void FeatureIO::load(const string& fileName,StringUtil* util) { ifstream in(fileName.c_str()); if(in.fail()) THROW_ERROR("Failed to open feature file "<parseInt(line.c_str()); getline(in,line); int numWords = util->parseInt(line.c_str()); for(int i = 0; i < numWords; i++) { 
getline(in,line); istringstream iss(line); iss >> str >> str2; int inDict = util->parseInt(str2.c_str()); ModelTagEntry* ent = new ModelTagEntry(util->mapString(str)); ent->inDict = (char)inDict; maxDict = max(maxDict,(unsigned char)inDict); ent->setNumTags(numTags_); for(int j = 0; j < numTags_; j++) { getline(in,line); istringstream tiss(line); tiss >> str; int numTypes = util->parseInt(str.c_str()); for(int k = 0; k < numTypes; k++) { tiss >> str >> str2; inDict = util->parseInt(str2.c_str()); ent->tags[j].push_back(util->mapString(str.c_str())); ent->tagInDicts[j].push_back((char)inDict); } } wm_.insert(pair(ent->word,ent)); } getline(in,line); if(line.length()) { THROW_ERROR("Expected empty line in feature file, but instead got '"<> str; if(str == "T") { titless >> str; numTags_ = max(numTags_,util->parseInt(str.c_str())+1); } // make the structure TagTriplet * trip = new TagTriplet(); trip->first = vector< vector >(); trip->second = vector(); trip->third = new KyteaModel(); feats_.insert(pair(util->mapString(line),trip)); // read the tags getline(in,line); istringstream iss(line); while(iss >> str) trip->fourth.push_back(util->mapString(str)); // read the feature names getline(in,line); int numFeats = util->parseInt(line.c_str()); vector featNames(numFeats); for(int i = 0; i < numFeats; i++) { getline(in,line); // cerr << "read feature "<third->mapFeat(util->mapString(line)); } // read the values while(getline(in,line) && line.length() > 0) { istringstream iss(line); iss >> str; trip->second.push_back(util->parseInt(str.c_str())); // cerr << str; vector x; while(iss >> str) { x.push_back(util->parseInt(str.c_str())); // cerr << " " << util->showString(name) << "("<first.push_back(x); } } } void FeatureIO::openOut(const string& fileName) { if(out_) delete out_; out_ = new ofstream(fileName.c_str()); } void FeatureIO::closeOut() { delete out_; out_ = 0; } TagTriplet * FeatureIO::getFeatures(const KyteaString & str, bool add) { TagHash::iterator it = 
feats_.find(str); TagTriplet * ret = 0; if(it != feats_.end()) ret = it->second; else if(add) { ret = new TagTriplet(); feats_.insert(pair(str,ret)); } return ret; } void FeatureIO::printFeatures(const KyteaString & featId, TagTriplet * trip, StringUtil * util) { if(!out_ || trip->first.size() == 0) return; *out_ << util->showString(featId) << endl; for(unsigned i = 0; i < trip->fourth.size(); i++) { if(i != 0) *out_ << " "; *out_ << util->showString(trip->fourth[i]); } // print the feature names const FeatNameVec & names = trip->third->getOldNames(); *out_ << endl << names.size() << endl; for(int i = 0; i < (int)names.size(); i++) { // cerr << "wrote feature "<showString(names[i])<showString(names[i]) << endl; } for(int i = 0; i < (int)trip->first.size(); i++) { // cerr << trip->second[i]; *out_ << trip->second[i]; for(int j = 0; j < (int)trip->first[i].size(); j++) { // cerr << " " << util->showString(names[trip->first[i][j]]) << "("<first[i][j]<<")"; *out_ << " " << trip->first[i][j]; } // cerr << endl; *out_ << endl; } *out_ << endl; } void FeatureIO::printFeatures(const KyteaString & featId, StringUtil * util) { vector gc; for(TagHash::iterator it = feats_.begin(); it != feats_.end(); it++) { if(it->first.beginsWith(featId)) { gc.push_back(it->first); printFeatures(it->first,it->second,util); delete it->second; } } for(unsigned i = 0; i < gc.size(); i++) { feats_.erase(gc[i]); } } void FeatureIO::printWordMap(StringUtil * util) { if(!out_) return; *out_ << numTags_ << endl; *out_ << wm_.size() << endl; for(Dictionary::WordMap::const_iterator it = wm_.begin(); it != wm_.end(); it++) { const TagEntry * te = it->second; *out_ << util->showString(te->word) << " " << (int)te->inDict << endl; for(int i = 0; i < numTags_; i++) { if(i >= (int)te->tags.size()) { *out_ << "0" << endl; } else { *out_ << te->tags[i].size(); for(unsigned j = 0; j < te->tags[i].size(); j++) *out_ << " " << util->showString(te->tags[i][j]) << " " << (int)te->tagInDicts[i][j]; *out_ << endl; 
} } } *out_ << endl; } kytea_0.4.6+dfsg.orig/src/lib/string-util.cpp0000644000175000017500000004035112127276745020436 0ustar koichikoichi/* * Copyright 2009, KyTea Development Team * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include #include #include #include #include using namespace kytea; using namespace std; // Check that these are equal by serializing them void StringUtil::checkEqual(const StringUtil & rhs) const { std::string me = serialize(); std::string you = rhs.serialize(); if(me != you) { THROW_ERROR("String utils don't match" << std::endl << " --- lhs --- " << std::endl << me << std::endl << " --- rhs --- " << std::endl << you); } } // parse an integer or float int StringUtil::parseInt(const char* str) { char* endP; int ret = strtol(str, &endP, 10); if(endP == str) THROW_ERROR("Bad integer value '" << str << "'"); return ret; } double StringUtil::parseFloat(const char* str) { char* endP; double ret = strtod(str, &endP); if(endP == str) THROW_ERROR("Bad floating-point value '" << str << "'"); return ret; } StringUtilUtf8::StringUtilUtf8() { const char * initial[7] = { "", "K", "T", "H", "R", "D", "O" }; for(unsigned i = 0; i < 7; i++) { charIds_.insert(std::pair(initial[i], i)); charTypes_.push_back(i==0?6:4); // first is other, rest romaji charNames_.push_back(initial[i]); } } GenericMap * StringUtilUtf8::getNormMap() { if(normMap_ == NULL) { normMap_ = new GenericMap; KyteaString orig = mapString(STRING_UTIL_ORIG_UTF8); KyteaString 
norm = mapString(STRING_UTIL_NORM_UTF8); if(orig.length() != norm.length()) THROW_ERROR("FATAL ERROR: unmatched strings in string-util.cpp : StringUtilUtf8"); for(int i = 0; i < (int)orig.length(); i++) normMap_->insert(pair(orig[i], norm[i])); } return normMap_; } GenericMap * StringUtilSjis::getNormMap() { if(normMap_ == NULL) { normMap_ = new GenericMap; KyteaString orig = mapString(STRING_UTIL_ORIG_SJIS); KyteaString norm = mapString(STRING_UTIL_NORM_SJIS); if(orig.length() != norm.length()) { for(int i = 0; i < (int)min(orig.length(), norm.length()); i++) cerr << showChar(orig[i]) << " <-> " << showChar(norm[i]) << endl; THROW_ERROR("FATAL ERROR: unmatched strings in string-util.cpp : StringUtilSjis"); } for(int i = 0; i < (int)orig.length(); i++) normMap_->insert(pair(orig[i], norm[i])); } return normMap_; } GenericMap * StringUtilEuc::getNormMap() { if(normMap_ == NULL) { normMap_ = new GenericMap; KyteaString orig = mapString(STRING_UTIL_ORIG_EUC); KyteaString norm = mapString(STRING_UTIL_NORM_EUC); if(orig.length() != norm.length()) { for(int i = 0; i < (int)min(orig.length(), norm.length()); i++) cerr << showChar(orig[i]) << " <-> " << showChar(norm[i]) << endl; THROW_ERROR("FATAL ERROR: unmatched strings in string-util.cpp : StringUtilEuc"); } for(int i = 0; i < (int)orig.length(); i++) normMap_->insert(pair(orig[i], norm[i])); } return normMap_; } // map a string to a character KyteaChar StringUtilUtf8::mapChar(const string & str, bool add) { StringCharMap::iterator it = charIds_.find(str); KyteaChar ret = 0; if(it != charIds_.end()) ret = it->second; else if (add) { if (charTypes_.size() > std::numeric_limits::max()) THROW_ERROR("FATAL ERROR: id exceeds numerical limit in string-util.cpp : StringUtilUtf8"); ret = charTypes_.size(); charIds_.insert(pair(str,ret)); charTypes_.push_back(findType(str)); charNames_.push_back(str); } return ret; } string StringUtilUtf8::showChar(KyteaChar c) { #ifdef KYTEA_SAFE if(c >= charNames_.size()) THROW_ERROR("FATAL: 
Index out of bounds in showChar"); #endif return charNames_[c]; } StringUtil::CharType StringUtilUtf8::findType(KyteaChar c) { return charTypes_[c]; } KyteaString StringUtilUtf8::mapString(const string & str) { unsigned pos = 0, len = str.length(); vector ret; while(pos < len) { // single character unicode values if(!(maskl1 & str[pos])) ret.push_back(mapChar(str.substr(pos++, 1))); else if((maskl5 & str[pos]) == maskl5) { THROW_ERROR("Expected UTF8 file but found non-UTF8 string (specify the proper encoding with -encode utf8/euc/sjis): "<= len || badu(str[pos+1]) || badu(str[pos+2]) || badu(str[pos+3])) THROW_ERROR("Expected UTF8 file but found non-UTF8 string (specify the proper encoding with -encode utf8/euc/sjis): "<= len || badu(str[pos+1]) || badu(str[pos+2])) THROW_ERROR("Expected UTF8 file but found non-UTF8 string (specify the proper encoding with -encode utf8/euc/sjis): "<= len || badu(str[pos+1])) THROW_ERROR("Expected UTF8 file but found non-UTF8 string (specify the proper encoding with -encode utf8/euc/sjis): "<4) THROW_ERROR("Malformed utf8 character in findType"); // parse into unicode integer values unsigned val = 0; if(str.length() == 1) val = str[0]; else if(str.length() == 2) val = ((str[0]&maskr5)<<6) | (maskr6&str[1]); else if(str.length() == 3) val = ((str[0]&maskr4)<<12) | ((maskr6&str[1])<<6) | (maskr6&str[2]); else val = ((str[0]&maskr3)<<18) | ((maskr6&str[1])<<12) | ((maskr6&str[2])<<18) | (maskr6&str[3]); // Basic latin uppercase, basic latin lowercase // Full width uppercase, full width lowercase if((val >= 0x41 && val <= 0x5A) || (val >= 0x61 && val <= 0x7A) || (val >= 0xFF21 && val <= 0xFF3A) || (val >= 0xFF41 && val <= 0xFF5A)) { return ROMAJI; } // hiragana (exclude repetition characters) else if((val >= 0x3040 && val <= 0x3096)) { return HIRAGANA; } // full width (exclude center dot), half width else if((val >= 0x30A0 && val <= 0x30FF && val != 0x30FB) || (val >= 0xFF66 && val <= 0xFF9F)) { return KATAKANA; } // basic latin digits 
else if((val >= 0x30 && val <= 0x39) || (val >= 0xFF10 && val <= 0xFF19)) { return DIGIT; } // CJK Unified Ideographs else if((val >= 0x3400 && val <= 0x4DBF) // CJK Unified Ideographs Extension A || (val >= 0x4E00 && val <= 0x9FFF) // CJK Unified Ideographs || (val >= 0xF900 && val <= 0xFAFF) // CJK Compatibility Ideographs //|| (val >= 0x1F200 && val <= 0x1F2FF) // Enclosed Ideographic Supplement || (val >= 0x20000 && val <= 0x2A6DF) // CJK Unified Ideographs Extension B || (val >= 0x2A700 && val <= 0x2B73F) // CJK Unified Ideographs Extension C || (val >= 0x2B740 && val <= 0x2B81F) // CJK Unified Ideographs Extension D || (val >= 0x2F800 && val <= 0x2FA1F)) { // CJK Compatibility Ideographs Supplement return KANJI; } return OTHER; } void StringUtilUtf8::unserialize(const string & str) { charIds_.clear(); charNames_.clear(); charTypes_.clear(); mapChar(""); KyteaString ret = mapString(str); } string StringUtilUtf8::serialize() const { ostringstream buff; for(unsigned i = 1; i < charNames_.size(); i++) buff << charNames_[i]; return buff.str(); } inline KyteaChar eucm(char a, char b) { KyteaChar ret = a & 0xFF; ret = ret << 8; ret = ret | (b&0xFF); return ret; } inline unsigned char euc1(KyteaChar a) { return (a & 0xFF00) >> 8; } inline unsigned char euc2(KyteaChar a) { return (a & 0xFF); } KyteaChar StringUtilEuc::mapChar(const string & str, bool add) { unsigned len = str.length(); KyteaChar ret; if(len == 1) { #ifdef KYTEA_SAFE if(str[0] & maskl1) THROW_ERROR("Expected EUC file but found non-EUC string (specify the proper encoding with -encode utf8/euc/sjis): "< ret; while(pos < len) { // single character unicode values if(!(maskl1 & str[pos])) ret.push_back(mapChar(str.substr(pos++, 1))); else { ret.push_back(mapChar(str.substr(pos,2))); pos += 2; } } KyteaString retstr(ret.size()); for(unsigned i = 0; i < ret.size(); i++) retstr[i] = ret[i]; return retstr; } // get the type of a character StringUtil::CharType StringUtilEuc::findType(const string & str) { return 
findType(mapChar(str)); } StringUtil::CharType StringUtilEuc::findType(KyteaChar c) { unsigned char c1 = euc1(c), c2 = euc2(c); // digits (hankaku/zenkaku) if((c2 >= 0x30 && c2 <= 0x39) || (c1 == 0xA3 && c2 >= 0xB0 && c2 <= 0xB9)) return DIGIT; // romaji (lower/upper for hankaku/zenkaku) else if((c2 >= 0x41 && c2 <= 0x5A) || (c2 >= 0x61 && c2 <= 0x7A) || (c1 == 0xA3 && ((c2 >= 0xC1 && c2 <= 0xDA) || (c2 >= 0xE1 && c2 <= 0xFA)))) { return ROMAJI; } // hiragana else if(c1 == 0xA4 && c2 >= 0xA1 && c2 <= 0xF3) { return HIRAGANA; } // katakana else if((c1 == 0xA5 && c2 >= 0xA1 && c2 <= 0xF6) || // full-width (c1 == 0xA1 && c2 == 0xBC) || // horizontal bar (c1 == 0x8E) // half-width ) { return KATAKANA; } // kanji else if(c1 >= 0xB0 && c1 <= 0xF4) { return KANJI; } return OTHER; } // return the encoding provided by this util StringUtil::Encoding StringUtilEuc::getEncoding() { return StringUtil::ENCODING_EUC; } const char* StringUtilEuc::getEncodingString() { return "euc"; } // transform to or from a character string void StringUtilEuc::unserialize(const string & str) { } string StringUtilEuc::serialize() const { string ret; return ret; } inline KyteaChar sjism(char a, char b) { KyteaChar ret = a & 0xFF; ret = ret << 8; ret = ret | (b&0xFF); return ret; } inline unsigned char sjis1(KyteaChar a) { return (a & 0xFF00) >> 8; } inline unsigned char sjis2(KyteaChar a) { return (a & 0xFF); } KyteaChar StringUtilSjis::mapChar(const string & str, bool add) { unsigned len = str.length(); KyteaChar ret; if(len == 1) { #ifdef KYTEA_SAFE const unsigned char first = (unsigned char)str[0]; if((first & maskl1) && !(first >= 0xA0 && first <= 0xDF)) THROW_ERROR("Expected SJIS file but found non-SJIS string (specify the proper encoding with -encode utf8/euc/sjis): "<= 0xA0 && first <= 0xDF)) THROW_ERROR("Expected SJIS file but found non-SJIS string (specify the proper encoding with -encode utf8/euc/sjis): "< ret; while(pos < len) { // single character unicode values const unsigned char 
first = (unsigned char)str[pos]; if(!(first & maskl1) || (first >= 0xA0 && first <= 0xDF)) ret.push_back(mapChar(str.substr(pos++, 1))); else { ret.push_back(mapChar(str.substr(pos,2))); pos += 2; } } KyteaString retstr(ret.size()); for(unsigned i = 0; i < ret.size(); i++) retstr[i] = ret[i]; return retstr; } // get the type of a character StringUtil::CharType StringUtilSjis::findType(const string & str) { return findType(mapChar(str)); } StringUtil::CharType StringUtilSjis::findType(KyteaChar c) { unsigned char c1 = sjis1(c), c2 = sjis2(c); // digits (hankaku/zenkaku) if((c1 == 0 && c2 >= 0x30 && c2 <= 0x39) || (c1 == 0x82 && c2 >= 0x4F && c2 <= 0x58)) return DIGIT; // romaji (lower/upper for hankaku/zenkaku) else if((c1 == 0 && ((c2 >= 0x41 && c2 <= 0x5A) || (c2 >= 0x61 && c2 <= 0x7A))) || (c1 == 0x82 && ((c2 >= 0x60 && c2 <= 0x79) || (c2 >= 0x81 && c2 <= 0x9A)))) { return ROMAJI; } // hiragana else if(c1 == 0x82 && c2 >= 0x9F && c2 <= 0xF1) { return HIRAGANA; } // katakana else if((c1 == 0x83 && c2 >= 0x40 && c2 <= 0x96) || // full-width (c1 == 0x81 && c2 == 0x5B) || // horizontal bar (c1 == 0 && c2 >= 0xA6 && c2 <= 0xDF) // half-width ) { return KATAKANA; } // kanji else if( (c1 >= 0x88 && c1 <= 0x9F) || (c1 >= 0xE0 && c1 <= 0xEA) ) { return KANJI; } return OTHER; } KyteaString StringUtil::normalize(const KyteaString & str) { // std::cerr << showString(str) << std::endl; KyteaString ret(str.length()); GenericMap * normMap = getNormMap(); for(int i = 0; i < (int)str.length(); i++) { GenericMap::const_iterator it = normMap->find(str[i]); ret[i] = (it == normMap->end()) ? 
str[i] : it->second; // std::cerr << " " << str[i] << "-->" << ret[i] << std::endl; } return ret; } // return the encoding provided by this util StringUtil::Encoding StringUtilSjis::getEncoding() { return StringUtil::ENCODING_SJIS; } const char* StringUtilSjis::getEncodingString() { return "sjis"; } // transform to or from a character string void StringUtilSjis::unserialize(const string & str) { } string StringUtilSjis::serialize() const { string ret; return ret; } kytea_0.4.6+dfsg.orig/src/lib/corpus-io-full.cpp0000644000175000017500000001237312133244357021027 0ustar koichikoichi#include #include #include #include #include #include "config.h" #define PROB_TRUE 100.0 #define PROB_FALSE -100.0 #define PROB_UNKNOWN 0.0 using namespace kytea; using namespace std; KyteaSentence * FullCorpusIO::readSentence() { #ifdef KYTEA_SAFE if(out_ || !str_) THROW_ERROR("Attempted to read a sentence from an closed or output object"); #endif string s; getline(*str_, s); if(str_->eof()) return 0; KyteaChar spaceChar = bounds_[0], slashChar = bounds_[1], ampChar = bounds_[2], bsChar = bounds_[3]; KyteaString ks = util_->mapString(s), buff(ks.length()); int len = ks.length(); KyteaSentence * ret = new KyteaSentence(); int charLen = 0; // go through the whole string int j = 0, bpos, lev; for(j = 0; j < len; j++) { // 1) get the word bpos = 0; for( ; j < len && ks[j] != spaceChar && ks[j] != slashChar; j++) { if(ks[j] == ampChar) { THROW_ERROR("Illegal tag separator in word position at "<normalize(word_str)); charLen += bpos; // 2) get the tags lev = -1; while(j < len && ks[j] != spaceChar) { if(ks[j] == slashChar) lev++; bpos = 0; for(++j ; j < len && ks[j] != spaceChar && ks[j] != slashChar && ks[j] != ampChar; j++) { if(ks[j] == bsChar && ++j == len) THROW_ERROR("Illegal trailing escape character at "<words.push_back(word); } // make the character/ws string ret->surface = KyteaString(charLen); ret->norm = KyteaString(charLen); unsigned pos = 0; for(KyteaSentence::Words::const_iterator 
tit = ret->words.begin(); tit != ret->words.end(); tit++) { ret->surface.splice(tit->surface, pos); ret->norm.splice(tit->norm, pos); unsigned nextPos = pos + tit->surface.length() - 1; while(pos++ < nextPos) ret->wsConfs.push_back(PROB_FALSE); ret->wsConfs.push_back(PROB_TRUE); } if(ret->wsConfs.size() > 0) ret->wsConfs.pop_back(); return ret; } void FullCorpusIO::writeSentence(const KyteaSentence * sent, double conf) { const string & wb = util_->showChar(bounds_[0]), tb = util_->showChar(bounds_[1]), eb = util_->showChar(bounds_[2]); for(unsigned i = 0; i < sent->words.size(); i++) { if(i != 0) *str_ << wb; const KyteaWord & w = sent->words[i]; if(printWords_) *str_ << util_->showString(w.surface); int printed = 0; for(int j = 0; j < w.getNumTags(); j++) { const vector< KyteaTag > & tags = w.getTags(j); if(tags.size() > 0) { *str_ << ((printWords_ || printed++ > 0) ? tb : "") << util_->showString(tags[0].first); if(allTags_) for(unsigned k = 1; k < tags.size(); k++) *str_ << eb << util_->showString(tags[k].first); } } if(w.getUnknown()) *str_ << unkTag_; } *str_ << endl; } FullCorpusIO::FullCorpusIO(StringUtil * util, const char* wordBound, const char* tagBound, const char* elemBound, const char* escape) : CorpusIO(util), allTags_(false), bounds_(4), printWords_(true) { bounds_[0] = util_->mapChar(wordBound); bounds_[1] = util_->mapChar(tagBound); bounds_[2] = util_->mapChar(elemBound); bounds_[3] = util_->mapChar(escape); } FullCorpusIO::FullCorpusIO(const CorpusIO & c, const char* wordBound, const char* tagBound, const char* elemBound, const char* escape) : CorpusIO(c), allTags_(false), bounds_(4), printWords_(true) { bounds_[0] = util_->mapChar(wordBound); bounds_[1] = util_->mapChar(tagBound); bounds_[2] = util_->mapChar(elemBound); bounds_[3] = util_->mapChar(escape); } FullCorpusIO::FullCorpusIO(StringUtil * util, const char* file, bool out, const char* wordBound, const char* tagBound, const char* elemBound, const char* escape) : CorpusIO(util,file,out), 
allTags_(false), bounds_(4), printWords_(true) { bounds_[0] = util_->mapChar(wordBound); bounds_[1] = util_->mapChar(tagBound); bounds_[2] = util_->mapChar(elemBound); bounds_[3] = util_->mapChar(escape); } FullCorpusIO::FullCorpusIO(StringUtil * util, std::iostream & str, bool out, const char* wordBound, const char* tagBound, const char* elemBound, const char* escape) : CorpusIO(util,str,out), allTags_(false), bounds_(4), printWords_(true) { bounds_[0] = util_->mapChar(wordBound); bounds_[1] = util_->mapChar(tagBound); bounds_[2] = util_->mapChar(elemBound); bounds_[3] = util_->mapChar(escape); } kytea_0.4.6+dfsg.orig/src/lib/general-io.cpp0000644000175000017500000000345012122355536020165 0ustar koichikoichi#include #include #include #include #include using namespace std; using namespace kytea; void GeneralIO::openFile(const char* file, bool out, bool bin) { fstream::openmode mode = (out?fstream::out:fstream::in); if(bin) mode = mode | fstream::binary; fstream * str = new fstream(file, mode); if(str->fail()) THROW_ERROR("Couldn't open file '"<precision(DECIMAL_PRECISION); bin_ = bin; out_ = out; owns_ = false; } void GeneralIO::writeString(const KyteaString & str) { writeBinary((uint32_t)str.length()); for(unsigned i = 0; i < str.length(); i++) writeBinary(str[i]); } // read template T GeneralIO::readBinary() { T v; str_->read(reinterpret_cast(&v),sizeof(T)); return v; } // Template instantiations template bool GeneralIO::readBinary(); template char GeneralIO::readBinary(); template short GeneralIO::readBinary(); template int GeneralIO::readBinary(); template double GeneralIO::readBinary(); template unsigned short GeneralIO::readBinary(); template unsigned int GeneralIO::readBinary(); template unsigned char GeneralIO::readBinary(); std::string GeneralIO::readString() { std::string str; getline(*str_, str, (char)0); return str; } KyteaString GeneralIO::readKyteaString() { KyteaString ret(readBinary()); for(unsigned i = 0; i < ret.length(); i++) ret[i] = 
readBinary(); return ret; } kytea_0.4.6+dfsg.orig/src/lib/Makefile.am0000644000175000017500000000136712122355536017500 0ustar koichikoichiLLLIBS = liblinear/liblinear.la KYTCPP = kytea.cpp general-io.cpp corpus-io-prob.cpp corpus-io-eda.cpp corpus-io-full.cpp corpus-io-part.cpp corpus-io-tokenized.cpp corpus-io-raw.cpp corpus-io.cpp model-io.cpp string-util.cpp kytea-model.cpp kytea-config.cpp kytea-lm.cpp feature-io.cpp dictionary.cpp feature-lookup.cpp kytea-util.cpp kytea-string.cpp kytea-struct.cpp # KYTH = kytea.h corpus-io.h model-io.h string-util.h \ # kytea-model.h kytea-string.h kytea-struct.h dictionary.h general-io.h \ # kytea-config.h AM_CPPFLAGS = -I$(srcdir)/../include -DPKGDATADIR='"$(pkgdatadir)"' SUBDIRS = liblinear lib_LTLIBRARIES = libkytea.la libkytea_la_SOURCES = ${KYTCPP} libkytea_la_LIBADD = ${LLLIBS} libkytea_la_LDFLAGS = -version-info 0:0:0 kytea_0.4.6+dfsg.orig/src/lib/kytea-lm.cpp0000644000175000017500000001146112122355536017667 0ustar koichikoichi#include #include #include #include using namespace kytea; using namespace std; // increment the count of a probability and return whether the // n-gram already exists bool addCount(KyteaDoubleMap & myMap, const KyteaString & str) { KyteaDoubleMap::iterator it = myMap.find(str); if(it == myMap.end()) { myMap.insert(pair(str,1.0)); return false; } it->second++; return true; } // train an n-gram model using Kneser-Ney smoothing void KyteaLM::train(const std::vector & corpus) { // get the marginal counts KyteaDoubleMap denominators; for(unsigned i = 0; i < corpus.size(); i++) { KyteaString trainString(corpus[i].length()+n_); for(unsigned j = 0; j < n_-1; j++) trainString[j] = 0; trainString[trainString.length()-1] = 0; trainString.splice(corpus[i],n_-1); for(unsigned j = n_; j < trainString.length(); j++) { for(unsigned len = n_; len > 0; len--) { KyteaString fbString = trainString.substr(j-len,len-1); addCount(denominators,fbString); // make sure we're getting marginal counts, not actual 
(Kneser-Ney) if(addCount(probs_, trainString.substr(j-len, len))) break; // add a unique count else addCount(fallbacks_,fbString); } } } // calculate the number of counts for absolute smoothing vector oneCounts(n_,0), twoCounts(n_,0), allCounts(n_,0); for(KyteaDoubleMap::const_iterator it = probs_.begin(); it != probs_.end(); it++) { unsigned n = it->first.length()-1; if(it->second == 1.0) oneCounts[n]++; else if(it->second == 2.0) twoCounts[n]++; allCounts[n]++; } // find the discounts according to standard Heuristics (see Chen and Goodman) vector discounts(n_,0); for(unsigned i = 0; i < n_; i++) { if(oneCounts[i] * twoCounts[i] == 0) { cerr << "WARNING: Setting discount["<first.length()-1; it->second = log((it->second-discounts[n])/denominators[it->first.substr(0,n)]); } // calculate the fallbacks for(KyteaDoubleMap::iterator it = fallbacks_.begin(); it != fallbacks_.end(); it++) it->second = log((it->second*discounts[it->first.length()])/denominators[it->first]); } double KyteaLM::scoreSingle(const KyteaString & val, int pos) { KyteaString ngram(n_); for(unsigned i = 0; i < n_; i++) ngram[i] = 0; int npos = n_; if((int)val.length() == pos) { npos--; pos--; } while(--npos >= 0 && pos >= 0) ngram[npos] = val[pos--]; double prob = 0; for(npos = 0; npos < (int)n_; npos++) { KyteaDoubleMap::const_iterator it = probs_.find(ngram.substr(npos)); if(it != probs_.end()) { prob += it->second; return prob; } else { it = fallbacks_.find(ngram.substr(npos, n_-npos-1)); if(it != fallbacks_.end()) prob += it->second; } } return prob + log(1.0/vocabSize_); } // score a string with the language model (log probability) double KyteaLM::score(const KyteaString& val) const { unsigned j, len; double prob = 0; KyteaString testString(val.length()+n_); for(j = 0; j < n_-1; j++) testString[j] = 0; testString[testString.length()-1] = 0; testString.splice(val,n_-1); for(j = n_; j < testString.length(); j++) { for(len = n_; len > 0; len--) { KyteaDoubleMap::const_iterator it = 
const_cast(&probs_)->find(testString.substr(j-len, len)); if(it != probs_.end()) { prob += it->second; break; } else { it = const_cast(&fallbacks_)->find(testString.substr(j-len, len-1)); if(it != fallbacks_.end()) prob += it->second; } } if(n_ == 0) prob += log(1.0/vocabSize_); } return prob; } void KyteaLM::checkEqual(const KyteaLM & rhs) const { if(n_ != rhs.n_) THROW_ERROR("KyteaLM n_ don't match: " << n_ << " != " << rhs.n_); if(vocabSize_ != rhs.vocabSize_) THROW_ERROR("KyteaLM vocabSize_ don't match: " << vocabSize_ << " != " << rhs.vocabSize_); checkMapEqual(probs_, rhs.probs_); checkMapEqual(fallbacks_, rhs.fallbacks_); } kytea_0.4.6+dfsg.orig/src/test/0000755000175000017500000000000012151067113015637 5ustar koichikoichikytea_0.4.6+dfsg.orig/src/test/test-analysis.h0000644000175000017500000004670012127276745020637 0ustar koichikoichi#ifndef TEST_ANALYSIS__ #define TEST_ANALYSIS__ #include #include "test-base.h" namespace kytea { class TestAnalysis : public TestBase { private: Kytea *kytea, *kyteaLogist, *kyteaMCSVM, *kyteaNoWS; StringUtil *util, *utilLogist, *utilMCSVM, *utilNoWS; public: TestAnalysis() { // Print the corpus const char* toy_text = "これ/代名詞/これ は/助詞/は 学習/名詞/がくしゅう データ/名詞/でーた で/助動詞/で す/語尾/す 。/補助記号/。\n" "大変/形状詞/でーた で/助動詞/で す/語尾/す 。/補助記号/。\n" "\n" "どうぞ/副詞/どうぞ モデル/名詞/もでる を/助詞/を KyTea/名詞/きゅーてぃー で/助詞/で 学習/名詞/がくしゅう し/動詞/し て/助詞/て くださ/動詞/くださ い/語尾/い !/補助記号/!\n" "処理/名詞/しょり を/助詞/を 行/動詞/おこな っ/語尾/っ た/助動詞/た ./補助記号/。\n" "京都/名詞/きょうと に/助詞/に 行/動詞/い っ/語尾/っ た/助動詞/た ./補助記号/。\n"; ofstream ofs("/tmp/kytea-toy-corpus.txt"); ofs << toy_text; ofs.close(); // Train the KyTea model with SVMs const char* toyCmd[7] = {"", "-model", "/tmp/kytea-svm-model.bin", "-full", "/tmp/kytea-toy-corpus.txt", "-global", "1"}; KyteaConfig * config = new KyteaConfig; config->setDebug(0); config->setOnTraining(true); config->parseTrainCommandLine(7, toyCmd); kytea = new Kytea(config); kytea->trainAll(); util = kytea->getStringUtil(); config->setOnTraining(false); // Train the KyTea model 
with logistic regression const char* toyCmdLogist[9] = {"", "-model", "/tmp/kytea-logist-model.bin", "-full", "/tmp/kytea-toy-corpus.txt", "-global", "1", "-solver", "0"}; KyteaConfig * configLogist = new KyteaConfig; configLogist->setDebug(0); configLogist->setTagMax(0); configLogist->setOnTraining(true); configLogist->parseTrainCommandLine(9, toyCmdLogist); kyteaLogist = new Kytea(configLogist); kyteaLogist->trainAll(); utilLogist = kyteaLogist->getStringUtil(); configLogist->setOnTraining(false); // Train the KyTea model with the multi-class svm const char* toyCmdMCSVM[9] = {"", "-model", "/tmp/kytea-logist-model.bin", "-full", "/tmp/kytea-toy-corpus.txt", "-global", "1", "-solver", "4"}; KyteaConfig * configMCSVM = new KyteaConfig; configMCSVM->setDebug(0); configMCSVM->setTagMax(0); configMCSVM->setOnTraining(true); configMCSVM->parseTrainCommandLine(9, toyCmdMCSVM); kyteaMCSVM = new Kytea(configMCSVM); kyteaMCSVM->trainAll(); utilMCSVM = kyteaMCSVM->getStringUtil(); configMCSVM->setOnTraining(false); // Train the KyTea model with logistic regression const char* toyCmdNoWS[8] = {"", "-model", "/tmp/kytea-logist-model.bin", "-full", "/tmp/kytea-toy-corpus.txt", "-global", "1", "-nows"}; KyteaConfig * configNoWS = new KyteaConfig; configNoWS->setDebug(0); configNoWS->setTagMax(0); configNoWS->setOnTraining(true); configNoWS->parseTrainCommandLine(8, toyCmdNoWS); kyteaNoWS = new Kytea(configNoWS); kyteaNoWS->trainAll(); utilNoWS = kyteaNoWS->getStringUtil(); configNoWS->setOnTraining(false); } ~TestAnalysis() { delete kytea; delete kyteaLogist; delete kyteaMCSVM; delete kyteaNoWS; } int testWordSegmentationEmpty() { // Do the analysis (This is very close to the training data, so it // should work perfectly) KyteaString str = util->mapString(""); KyteaSentence sentence(str, util->normalize(str)); kytea->calculateWS(sentence); // Make the correct words KyteaString::Tokens toks = util->mapString("").tokenize(util->mapString(" ")); return 
checkWordSeg(sentence,toks,util); } int testWordSegmentationUnk() { // Do the analysis (This is very close to the training data, so it // should work perfectly) KyteaString str = util->mapString("これは学習デエタです。"); KyteaSentence sentence(str, util->normalize(str)); kytea->calculateWS(sentence); // Make the correct words KyteaString::Tokens toks = util->mapString("これ は 学習 デエタ で す 。").tokenize(util->mapString(" ")); if(!checkWordSeg(sentence,toks,util)) { return 0; } vector unk_exp(6, false), unk_act(6); unk_exp[3] = true; for(int i = 0; i < 6; i++) unk_act[i] = sentence.words[i].getUnknown(); return checkVector(unk_exp, unk_act); } int testNormalizationUnk() { // Do the analysis (This is very close to the training data, so it // should work perfectly) KyteaString str = util->mapString("これはKyTeaです."); KyteaSentence sentence(str, util->normalize(str)); kytea->calculateWS(sentence); // Make the correct words KyteaString::Tokens toks = util->mapString("これ は KyTea で す .").tokenize(util->mapString(" ")); if(!checkWordSeg(sentence,toks,util)) { return 0; } vector unk_exp(6, false), unk_act(6); for(int i = 0; i < 6; i++) unk_act[i] = sentence.words[i].getUnknown(); return checkVector(unk_exp, unk_act); } int testWordSegmentationSVM() { // Do the analysis (This is very close to the training data, so it // should work perfectly) KyteaString str = util->mapString("これは学習データです。"); KyteaSentence sentence(str, util->normalize(str)); kytea->calculateWS(sentence); // Make the correct words KyteaString::Tokens toks = util->mapString("これ は 学習 データ で す 。").tokenize(util->mapString(" ")); return checkWordSeg(sentence,toks,util); } int testWordSegmentationMCSVM() { // Do the analysis (This is very close to the training data, so it // should work perfectly) KyteaString str = utilMCSVM->mapString("これは学習データです。"); KyteaSentence sentence(str, utilMCSVM->normalize(str)); kyteaMCSVM->calculateWS(sentence); // Make the correct words KyteaString::Tokens toks = utilMCSVM->mapString("これ は 学習 データ で す 
。").tokenize(utilMCSVM->mapString(" ")); return checkWordSeg(sentence,toks,utilMCSVM); } int testWordSegmentationLogistic() { // Do the analysis (This is very close to the training data, so it // should work perfectly) KyteaString str = utilLogist->mapString("これは学習データです。"); KyteaSentence sentence(str, utilLogist->normalize(str)); kyteaLogist->calculateWS(sentence); // Make the correct words KyteaString::Tokens toks = utilLogist->mapString("これ は 学習 データ で す 。").tokenize(utilLogist->mapString(" ")); int correct = checkWordSeg(sentence,toks,utilLogist); if(correct) { for(int i = 0; i < (int)sentence.wsConfs.size(); i++) { if(sentence.wsConfs[i] < 0.0 || sentence.wsConfs[i] > 1.0) { cerr << "Confidience for logistic WS "<mapString("これは学習データです。"); KyteaSentence sentence(str, util->normalize(str)); kytea->calculateWS(sentence); kytea->calculateTags(sentence,0); // Make the correct tags KyteaString::Tokens toks = util->mapString("代名詞 助詞 名詞 名詞 助動詞 語尾 補助記号").tokenize(util->mapString(" ")); int correct = checkTags(sentence,toks,0,util); if(correct) { // Check the confidences for the SVM, the second candidate should // always be zero for(int i = 0; i < (int)sentence.words.size(); i++) { if(sentence.words[i].tags[0][1].second != 0.0) { cerr << "Margin on word "<mapString("これは学習データです。"); KyteaSentence sentence(str, utilMCSVM->normalize(str)); kyteaMCSVM->calculateWS(sentence); kyteaMCSVM->calculateTags(sentence,0); // Make the correct tags KyteaString::Tokens toks = utilMCSVM->mapString("代名詞 助詞 名詞 名詞 助動詞 語尾 補助記号").tokenize(utilMCSVM->mapString(" ")); int correct = checkTags(sentence,toks,0,utilMCSVM); if(correct) { // Check the confidences for the SVM, the second candidate should // always be zero for(int i = 0; i < (int)sentence.words.size(); i++) { if(sentence.words[i].tags[0][1].second != 0.0) { cerr << "Margin on word "<mapString("これは学習データです。"); KyteaSentence sentence(str, utilNoWS->normalize(str)); sentence.wsConfs[0] = -1; sentence.wsConfs[1] = 1; sentence.wsConfs[2] = 1; 
sentence.wsConfs[3] = -1; sentence.wsConfs[4] = 1; sentence.wsConfs[5] = -1; sentence.wsConfs[6] = -1; sentence.wsConfs[7] = 1; sentence.wsConfs[8] = 1; sentence.wsConfs[9] = 1; sentence.refreshWS(0); kyteaNoWS->calculateTags(sentence,0); // Make the correct tags KyteaString::Tokens toks = utilNoWS->mapString("代名詞 助詞 名詞 名詞 助動詞 語尾 補助記号").tokenize(utilNoWS->mapString(" ")); int correct = checkTags(sentence,toks,0,utilNoWS); if(correct) { // Check the confidences for the SVM, the second candidate should // always be zero for(int i = 0; i < (int)sentence.words.size(); i++) { if(sentence.words[i].tags[0][1].second != 0.0) { cerr << "Margin on word "<mapString("これは学習デエタです。"); KyteaSentence sentence(str, utilNoWS->normalize(str)); sentence.wsConfs[0] = -1; sentence.wsConfs[1] = 1; sentence.wsConfs[2] = 1; sentence.wsConfs[3] = -1; sentence.wsConfs[4] = 1; sentence.wsConfs[5] = -1; sentence.wsConfs[6] = -1; sentence.wsConfs[7] = 1; sentence.wsConfs[8] = 1; sentence.wsConfs[9] = 1; sentence.refreshWS(0); for(int i = 0; i < 7; i++) sentence.words[i].setUnknown(false); kyteaNoWS->calculateTags(sentence,0); // Check to make sure unknown is correct vector unk_exp(7, false), unk_act(7); unk_exp[3] = true; for(int i = 0; i < 7; i++) unk_act[i] = sentence.words[i].getUnknown(); return checkVector(unk_exp, unk_act); } int testGlobalTaggingLogistic() { // Do the analysis (This is very close to the training data, so it // should work perfectly) KyteaString str = utilLogist->mapString("これは学習データです。"); KyteaSentence sentence(str, utilLogist->normalize(str)); kyteaLogist->calculateWS(sentence); kyteaLogist->calculateTags(sentence,0); // Make the correct tags KyteaString::Tokens toks = utilLogist->mapString("代名詞 助詞 名詞 名詞 助動詞 語尾 補助記号").tokenize(util->mapString(" ")); int correct = checkTags(sentence,toks,0,utilLogist); if(correct) { // Check the confidences for the SVM, the second candidate should // always be zero for(int i = 0; i < (int)sentence.words.size(); i++) { double sum = 0.0; 
for(int j = 0; j < (int)sentence.words[i].tags[0].size(); j++) sum += sentence.words[i].tags[0][j].second; if(fabs(1.0-sum) > 0.01) { cerr << "Probability on word "<mapString("これ 京都 学習 データ どうぞ 。").tokenize(util->mapString(" ")); KyteaString::Tokens tags = util->mapString("代名詞 名詞 名詞 名詞 副詞 補助記号").tokenize(util->mapString(" ")); KyteaString::Tokens singleTag(1); if(words.size() != tags.size()) THROW_ERROR("words.size() != tags.size() in testGlobalSelf"); int ok = 1; for(int i = 0; i < (int)words.size(); i++) { KyteaSentence sent(words[i], util->normalize(words[i])); sent.refreshWS(1); if(sent.words.size() != 1) THROW_ERROR("Bad segmentation in testGlobalSelf"); kytea->calculateTags(sent,0); singleTag[0] = tags[i]; ok = (checkTags(sent,singleTag,0,util) ? ok : 0); } return ok; } int testLocalTagging() { // Do the analysis (This is very close to the training data, so it // should work perfectly) KyteaString str = util->mapString("東京に行った。"); KyteaSentence sentence(str, util->normalize(str)); kytea->calculateWS(sentence); kytea->calculateTags(sentence,1); // Make the correct tags KyteaString::Tokens toks = util->mapString("UNK に い っ た 。").tokenize(util->mapString(" ")); return checkTags(sentence,toks,1,util); } int testPartialSegmentation() { // Read in a partially annotated sentence stringstream instr; instr << "こ|れ-は デ ー タ で-す 。" << endl; PartCorpusIO io(util, instr, false); KyteaSentence * sent = io.readSentence(); kytea->calculateWS(*sent); // Make the correct words KyteaString::Tokens toks = util->mapString("こ れは データ です 。").tokenize(util->mapString(" ")); int ok = checkWordSeg(*sent,toks,util); delete sent; return ok; } int testConfidentInput() { string confident_text = "これ/代名詞/これ は/助詞/は 信頼/名詞/しんらい 度/接尾辞/ど の/助詞/の 高/形容詞/たか い/語尾/い 入力/名詞/にゅうりょく で/助動詞/で す/語尾/す 。/補助記号/。\n"; // Read in a partially annotated sentence stringstream instr; instr << confident_text; FullCorpusIO infcio(util, instr, false); KyteaSentence * sent = infcio.readSentence(); // Calculate the WS 
kytea->calculateWS(*sent); // Write out the sentence stringstream outstr1; FullCorpusIO outfcio1(util, outstr1, true); outfcio1.writeSentence(sent); string actual_text = outstr1.str(); if(actual_text != confident_text) { cout << "WS: actual_text != confident_text"<calculateTags(*sent,0); kytea->calculateTags(*sent,1); // Write out the sentence stringstream outstr2; FullCorpusIO outfcio2(util, outstr2, true); outfcio2.writeSentence(sent); actual_text = outstr2.str(); delete sent; if(actual_text != confident_text) { cout << "Tag: actual_text != confident_text"<getConfig()->setModelFormat(ModelIO::FORMAT_TEXT); kytea->writeModel("/tmp/kytea-model.txt"); // Read the model Kytea actKytea; actKytea.readModel("/tmp/kytea-model.txt"); // Check that they are equal kytea->checkEqual(actKytea); return 1; } int testBinaryIO() { // Write the model kytea->getConfig()->setModelFormat(ModelIO::FORMAT_BINARY); kytea->writeModel("/tmp/kytea-model.bin"); // Read the model Kytea actKytea; actKytea.readModel("/tmp/kytea-model.bin"); // Check that they are equal kytea->checkEqual(actKytea); return 1; } bool runTest() { int done = 0, succeeded = 0; done++; cout << "testWordSegmentationSVM()" << endl; if(testWordSegmentationSVM()) succeeded++; else cout << "FAILED!!!" << endl; done++; cout << "testWordSegmentationEmpty()" << endl; if(testWordSegmentationEmpty()) succeeded++; else cout << "FAILED!!!" << endl; done++; cout << "testWordSegmentationUnk()" << endl; if(testWordSegmentationUnk()) succeeded++; else cout << "FAILED!!!" << endl; done++; cout << "testWordSegmentationLogistic()" << endl; if(testWordSegmentationLogistic()) succeeded++; else cout << "FAILED!!!" << endl; done++; cout << "testGlobalTaggingSVM()" << endl; if(testGlobalTaggingSVM()) succeeded++; else cout << "FAILED!!!" << endl; done++; cout << "testGlobalTaggingLogistic()" << endl; if(testGlobalTaggingLogistic()) succeeded++; else cout << "FAILED!!!" 
<< endl; done++; cout << "testGlobalTaggingNoWS()" << endl; if(testGlobalTaggingNoWS()) succeeded++; else cout << "FAILED!!!" << endl; done++; cout << "testNoWSUnk()" << endl; if(testNoWSUnk()) succeeded++; else cout << "FAILED!!!" << endl; done++; cout << "testGlobalSelf()" << endl; if(testGlobalSelf()) succeeded++; else cout << "FAILED!!!" << endl; done++; cout << "testNormalizationUnk()" << endl; if(testNormalizationUnk()) succeeded++; else cout << "FAILED!!!" << endl; done++; cout << "testLocalTagging()" << endl; if(testLocalTagging()) succeeded++; else cout << "FAILED!!!" << endl; done++; cout << "testPartialSegmentation()" << endl; if(testPartialSegmentation()) succeeded++; else cout << "FAILED!!!" << endl; done++; cout << "testTextIO()" << endl; if(testTextIO()) succeeded++; else cout << "FAILED!!!" << endl; done++; cout << "testBinaryIO()" << endl; if(testBinaryIO()) succeeded++; else cout << "FAILED!!!" << endl; done++; cout << "testConfidentInput()" << endl; if(testConfidentInput()) succeeded++; else cout << "FAILED!!!" 
<< endl; cout << "#### TestAnalysis Finished with "< using namespace std; namespace kytea { class KyteaTest { public: int testGetTypeString() { StringUtilUtf8 util; KyteaString str = util.mapString("漢カひ。1A"); string act = util.getTypeString(str); string exp = "KTHODR"; if(act != exp) { cout << "testGetTypeString::Expected "< & exp, vector & act, StringUtilUtf8 & util) { sort(exp.begin(), exp.end()); sort(act.begin(), act.end()); bool bad = false; if(exp.size() != act.size()) { bad = true; cout << "Sizes exp.size()=="< exp, act; exp.push_back(util.mapString("X-2漢")); exp.push_back(util.mapString("X-1カ")); exp.push_back(util.mapString("X0ひ")); exp.push_back(util.mapString("X1。")); exp.push_back(util.mapString("X21")); exp.push_back(util.mapString("X3A")); exp.push_back(util.mapString("X-2漢カ")); exp.push_back(util.mapString("X-1カひ")); exp.push_back(util.mapString("X0ひ。")); exp.push_back(util.mapString("X1。1")); exp.push_back(util.mapString("X21A")); Kytea::SentenceFeatures sentFeats(5); vector charPrefixes_; for(int i = -2; i <= 3; i++) { ostringstream oss; oss << "X" << i; charPrefixes_.push_back(util.mapString(oss.str())); } kytea.wsNgramFeatures(str, sentFeats, charPrefixes_, 2); if((int)sentFeats.size() != 5) THROW_ERROR("sentFeats.size() == " << sentFeats.size()); for(int i = 0; i < (int)sentFeats[2].size(); i++) act.push_back(kytea.getWSModel()->showFeat(sentFeats[2][i])); return compareFeatures(exp, act, util); } int testTagNgramFeatures() { StringUtilUtf8 util; Kytea kytea; kytea.setWSModel(new KyteaModel()); KyteaString str = util.mapString("漢カひ単語。1A"); vector exp, act; exp.push_back(util.mapString("X-2漢")); exp.push_back(util.mapString("X-1カ")); exp.push_back(util.mapString("X0ひ")); exp.push_back(util.mapString("X1。")); exp.push_back(util.mapString("X21")); exp.push_back(util.mapString("X3A")); exp.push_back(util.mapString("X-2漢カ")); exp.push_back(util.mapString("X-1カひ")); exp.push_back(util.mapString("X0ひ。")); exp.push_back(util.mapString("X1。1")); 
exp.push_back(util.mapString("X21A")); vector charPrefixes_; for(int i = -2; i <= 3; i++) { ostringstream oss; oss << "X" << i; charPrefixes_.push_back(util.mapString(oss.str())); } vector act_feats; kytea.tagNgramFeatures(str, act_feats, charPrefixes_, kytea.getWSModel(), 2, 2, 5); for(int i = 0; i < (int)act_feats.size(); i++) act.push_back(kytea.getWSModel()->showFeat(act_feats[i])); return compareFeatures(exp, act, util); } int testTagSelfFeatures() { StringUtilUtf8 util; vector exp, act; Kytea kytea; kytea.setWSModel(new KyteaModel()); exp.push_back(util.mapString("SX単語")); exp.push_back(util.mapString("STKK")); vector feats; kytea.tagSelfFeatures(util.mapString("単語"), feats, util.mapString("SX"), kytea.getWSModel()); kytea.tagSelfFeatures(util.mapString("KK"), feats, util.mapString("ST"), kytea.getWSModel()); for(int i = 0; i < (int)feats.size(); i++) act.push_back(kytea.getWSModel()->showFeat(feats[i])); return compareFeatures(exp, act, util); } void makeDictionary(Kytea & kytea, StringUtil & util) { // Make the dictionary entry Dictionary::WordMap allWords; KyteaString noun = util.mapString("名詞"); KyteaString verb = util.mapString("動詞"); kytea.addTag(allWords, util.mapString("単語"), 0, &noun, 0); kytea.addTag(allWords, util.mapString("単語"), 0, &verb, 1); Dictionary * dict = new Dictionary(&util); dict->setNumDicts(2); dict->buildIndex(allWords); kytea.setDictionary(dict); } int testTagDictFeatures() { StringUtilUtf8 util; vector exp, act; Kytea kytea; kytea.setWSModel(new KyteaModel()); exp.push_back(util.mapString("D0T0")); exp.push_back(util.mapString("D1T1")); makeDictionary(kytea, util); // Test vector feats; kytea.tagDictFeatures(util.mapString("単語"), 0, feats, kytea.getWSModel()); for(int i = 0; i < (int)feats.size(); i++) act.push_back(kytea.getWSModel()->showFeat(feats[i])); return compareFeatures(exp, act, util); } KyteaModel * makeFeatureLookup(StringUtil * util, int classes) { // Make the feature values const int SIZE = 18; const char* 
featStrs[SIZE] = { "X-2漢", "X-1カ", "X0ひ", "X1。", "X21", "X3A", "X-2漢カ", "X-1カひ", "X0ひ。", "X1。1", "X21A", "D0L1", "D0I5", "D1R5", "SX単語", "STKK", "D0T0", "D1T1" }; KyteaModel * mod = new KyteaModel; mod->setNumClasses(classes); mod->setLabel(0, 1); mod->setLabel(1, -1); int lastFeat = -1; for(int i = 0; i < SIZE; i++) lastFeat = mod->mapFeat(util->mapString(featStrs[i])); int actual_classes = (classes == 2 ? 1 : classes); mod->initializeWeights(actual_classes, lastFeat+1); for(int i = 0; i < lastFeat; i++) for(int j = 0; j < actual_classes; j++) mod->setWeight(i, j, (i+1)*(j+1)); mod->buildFeatureLookup(util, 3, 3, 2, 5); return mod; } int testModelToLookup() { // Make the expected dictionary StringUtilUtf8 util; const int SIZE = 11; const char* wordStrs[SIZE] = { "漢", "カ", "ひ", "。", "1", "A", "漢カ", "カひ", "ひ。", "。1", "1A"}; const int wordPoss[SIZE] = { 5, 4, 3, 2, 1, 0, 4, 3, 2, 1, 0 }; typedef Dictionary >::WordMap WordMap; WordMap wm; for(int i = 0; i < SIZE; i++) { pair it = wm.insert(WordMap::value_type(util.mapString(wordStrs[i]),new vector(6,0))); (*it.first->second)[wordPoss[i]] = i+1; // Add one because first feature is NULL } Dictionary > exp(&util); exp.buildIndex(wm); // Convert the model to a feature lookup KyteaModel * mod = makeFeatureLookup(&util, 2); FeatureLookup * look = mod->getFeatureLookup(); // Check the n-gram values const Dictionary > * act = look->getCharDict(); int ret = 1; if((int)act->getEntries().size() != SIZE) { cerr << "act->getEntries().size() == "<getEntries().size()< * actVec = act->findEntry(util.mapString(wordStrs[i])); vector * expVec = exp.findEntry(util.mapString(wordStrs[i])); if(actVec == NULL) { cerr << "actVec["<size(); j++) cerr << (*expVec)[j] << " "; cerr << endl; for(int j = 0; j < (int)actVec->size(); j++) cerr << (*actVec)[j] << " "; cerr << endl; ret = 0; } } } // Check the dictionary match vector dictExp(2*5*3, 0); dictExp[0*15+0*3+2] = SIZE+1; dictExp[0*15+4*3+1] = SIZE+2; dictExp[1*15+4*3+0] = SIZE+3; const 
vector & dictAct = *look->getDictVector(); if(dictExp.size() != dictAct.size()) { cerr << "dictExp.size() == "< act(5,0); feat->addNgramScores(feat->getCharDict(), str, 3, act); vector exp(5,0); exp[2] = 11*(11+1)/2; // All features from 1-11 should fire int ret = 1; for(int i = 0; i < 5; i++) { if(act[i] != exp[i]) { cerr << "act["<mapString(buff.str())); } typePrefixes.resize(0); for(int i = 1; i <= 2*3; i++) { ostringstream buff; buff << "T" << i-3; typePrefixes.push_back(util->mapString(buff.str())); } } int testWSLookupMatchesModel() { // Make the full model StringUtilUtf8 util; Kytea kytea; kytea.setWSModel(new KyteaModel()); KyteaModel & mod = *kytea.getWSModel(); mod.setNumClasses(2); mod.setLabel(0, 1); mod.setLabel(1, -1); mod.setNumWeights(1); int id = 0; KyteaString str = util.mapString("漢カひ。1A"); KyteaString typeStr = util.mapString(util.getTypeString(str)); for(int i = 0; i < (int)str.length(); i++) { for(int j = 1; j <= 3; j++) { if(i+j > (int)str.length()) break; for(int k = -2; k <= 4-j; k++) { ostringstream oss1; oss1 << "X" << k << util.showString(str.substr(i,j)); id = mod.mapFeat(util.mapString(oss1.str())); ostringstream oss2; oss2 << "T" << k << util.showString(typeStr.substr(i,j)); id = mod.mapFeat(util.mapString(oss2.str())); } } } mod.initializeWeights(1, id+1); for(int i = 0; i <= id; i++) { mod.setWeight(i, 0, i); } // Make the feature lookup mod.buildFeatureLookup(&util, 3, 3, 2, 5); FeatureLookup * feat = mod.getFeatureLookup(); // Get the score matrix for lookup vector act(5,0); feat->addNgramScores(feat->getCharDict(), str, 3, act); feat->addNgramScores(feat->getTypeDict(), typeStr, 3, act); for(int i = 0; i < 5; i++) act[i] += feat->getBias(0); // Calculate the n-gram features Kytea::SentenceFeatures sentFeats(5); vector charPrefixes, typePrefixes; makePrefixes(charPrefixes, typePrefixes, &util); kytea.wsNgramFeatures(str, sentFeats, charPrefixes, 3); kytea.wsNgramFeatures(typeStr, sentFeats, typePrefixes, 3); int ret = 1; for(int i 
= 0; i < 5; i++) { pair answer = mod.runClassifier(sentFeats[i])[0]; if(answer.second != act[i]) { cerr << "model["<mapString("ST"); int id = 0; KyteaString str = util->mapString("漢カひ単語。1A"); KyteaString typeStr = util->mapString(util->getTypeString(str)); for(int i = 0; i < (int)str.length(); i++) { for(int j = 1; j <= 3; j++) { if(i+j > (int)str.length()) break; for(int k = -2; k <= 4-j; k++) { ostringstream oss1; oss1 << "X" << k << util->showString(str.substr(i,j)); id = mod.mapFeat(util->mapString(oss1.str())); ostringstream oss2; oss2 << "T" << k << util->showString(typeStr.substr(i,j)); id = mod.mapFeat(util->mapString(oss2.str())); } ostringstream oss3; oss3 << "SX" << util->showString(str.substr(i,j)); id = mod.mapFeat(util->mapString(oss3.str())); ostringstream oss4; oss4 << "ST" << util->showString(typeStr.substr(i,j)); id = mod.mapFeat(util->mapString(oss4.str())); } } id = mod.mapFeat(util->mapString("D0T0")); id = mod.mapFeat(util->mapString("D1T1")); id = mod.mapFeat(util->mapString("UNK")); mod.initializeWeights(3, id+1); for(int i = 0; i <= id; i++) for(int j = 0; j < 3; j++) mod.setWeight(i, j, i*3+j); mod.setAddFeatures(false); // Make the feature lookup mod.buildFeatureLookup(util, 3, 3, 2, 5); FeatureLookup * feat = mod.getFeatureLookup(); // Calculate the n-gram features Kytea::SentenceFeatures sentFeats(5); vector charPrefixes, typePrefixes; makePrefixes(charPrefixes, typePrefixes, util); int ret = 1; for(int i = 0; i < 5; i++) { // Get the score matrix for lookup vector act(3,0); feat->addTagNgrams(str, feat->getCharDict(), act, 3, i, i+2); feat->addTagNgrams(typeStr, feat->getTypeDict(), act, 3, i, i+2); for(int j = 0; j < 3; j++) act[j] += feat->getBias(j); feat->addSelfWeights(str.substr(i,2), act, 0); feat->addSelfWeights(typeStr.substr(i,2), act, 1); feat->addTagDictWeights(kytea.getDictionaryMatches(str.substr(i,2), 0), act); // Make with the model vector feats; kytea.tagNgramFeatures(str, feats, charPrefixes, &mod, 3, i-1, i+2); 
kytea.tagNgramFeatures(typeStr, feats, typePrefixes, &mod, 3, i-1, i+2); kytea.tagSelfFeatures(str.substr(i,2), feats, kssx, &mod); kytea.tagSelfFeatures(typeStr.substr(i,2), feats, ksst, &mod); kytea.tagDictFeatures(str.substr(i,2), 0, feats, &mod); vector< pair > answers = mod.runClassifier(feats); // Convert to margin FeatSum secondBest = act[answers[1].first-1]; for(int j = 0; j < (int)answers.size(); j++) { if(answers[j].first-1 >= (int)act.size()) THROW_ERROR("answers[j].first too big "<::WordMap dictMap; kytea.addTag(dictMap, util.mapString("1"), 0, NULL, 0); kytea.addTag(dictMap, util.mapString("漢カひ。1A"), 0, NULL, 0); kytea.addTag(dictMap, util.mapString("漢カひ。1"), 0, NULL, 0); kytea.addTag(dictMap, util.mapString("カひ。1A"), 0, NULL, 1); Dictionary dict(&util); dict.buildIndex(dictMap); // Check that these are added correctly vector exp(5,13); // All are inside D1I5 (only once) exp[4] += 12; // The last one is to the right of D1L1 exp[0] += 14; // The last one is to the right of D2R5 vector act(5,0); look->addDictionaryScores(dict.match(str), 2, 5, act); // Check that these are equal int ret = 1; for(int i = 0; i < 5; i++) { if(act[i] != exp[i]) { cerr << "act["< & toks, StringUtil * util) { const KyteaSentence::Words & words = sent.words; int ok = 1; for(int i = 0; i < (int)max(words.size(), toks.size()); i++) { if(i >= (int)words.size() || i >= (int)toks.size() || words[i].surface != toks[i]) { ok = 0; cout << "words["<= (int)words.size() ? "NULL" : util->showString(words[i].surface)) <<" != "<< (i >= (int)toks.size() ? "NULL" : util->showString(toks[i])) <<")"< & toks, int pos, StringUtil * util) { const KyteaSentence::Words & words = sent.words; int ok = (words.size() == toks.size() ? 
1 : 0); KyteaString noneString = util->mapString("NONE"); for(int i = 0; i < (int)max(words.size(), toks.size()); i++) { // Find the proper tag KyteaString myTag; if(i >= (int)words.size()) myTag = util->mapString("NULL"); else if(pos >= (int)words[i].tags.size() || 0 == (int)words[i].tags[pos].size()) myTag = util->mapString("NONE"); else myTag = words[i].tags[pos][0].first; // If they don't match return if(i >= (int)toks.size() || myTag != toks[i]) { ok = 0; cout << "words["<showString(myTag) <<" != "<< (i >= (int)toks.size() ? "NULL" : util->showString(toks[i])) <<")"< int checkVector(const vector & exp, const vector & act) { int ok = 1; for(int i = 0; i < (int)max(exp.size(), act.size()); i++) { if(i >= (int)exp.size() || i >= (int)act.size() || exp[i] != act[i]) { ok = 0; cout << "exp["<= (int)exp.size()) cout << "NULL"; else cout << exp[i]; cout <<" != "; if(i >= (int)act.size()) cout << "NULL"; else cout << act[i]; cout << ")" << endl; } } return ok; } }; } #endif kytea_0.4.6+dfsg.orig/src/test/test-sentence.h0000644000175000017500000000266512122355536020611 0ustar koichikoichi#include "test-base.h" #ifndef TEST_SENTENCE_H__ #define TEST_SENTENCE_H__ namespace kytea { class TestSentence : public TestBase { private: StringUtilUtf8 * util; public: TestSentence() { util = new StringUtilUtf8; } ~TestSentence() { delete util; } int testRefreshWS() { // Build the string stringstream instr; instr << "これ は データ/名詞 で/助動詞 す/語尾 。" << endl; FullCorpusIO io(util, instr, false); KyteaSentence * sent = io.readSentence(); // Refresh the word segmentation sent->wsConfs[6] = -100; sent->refreshWS(1); // Check to make sure it matches our expectation KyteaString::Tokens words = util->mapString("これ は データ です 。").tokenize(util->mapString(" ")); KyteaString::Tokens tags = util->mapString("NONE NONE 名詞 NONE NONE").tokenize(util->mapString(" ")); bool wordsOK = checkWordSeg(*sent,words,util); bool tagsOK = checkTags(*sent,tags,0,util); delete sent; return wordsOK && tagsOK; } bool 
runTest() { int done = 0, succeeded = 0; done++; cout << "testRefreshWS()" << endl; if(testRefreshWS()) succeeded++; else cout << "FAILED!!!" << endl; cout << "#### TestSentence Finished with "< #include "test-base.h" namespace kytea { class TestCorpusIOSjis : public TestBase { private: StringUtilSjis * util; public: TestCorpusIOSjis() { util = new StringUtilSjis; } ~TestCorpusIOSjis() { delete util; } int testWordSegConf() { // Build the string stringstream instr; // instr << "|- f [ ^ - B" << endl; instr << "\x82\xb1\x7c\x82\xea\x2d\x82\xcd\x20\x83\x66\x20\x81\x5b\x20\x83\x5e\x20\x82\xc5\x2d\x82\xb7\x20\x81\x42" << endl; PartCorpusIO io(util, instr, false); KyteaSentence * sent = io.readSentence(); // Build the expectations vector exp(8,0.0); exp[0] = 100; exp[1] = -100; exp[6] = -100; bool ret = checkVector(exp, sent->wsConfs); delete sent; return ret; } int testPartEmptyLines() { // Build the string stringstream instr; instr << "" << endl; PartCorpusIO io(util, instr, false); KyteaSentence * sent = io.readSentence(); // Build the expectations vector exp(0,0.0); bool ret = checkVector(exp, sent->wsConfs); delete sent; return ret; } int testPartEmptyTag() { // Build the string stringstream instr; // instr << "-//" << endl; instr << "\x82\xb1\x2d\x82\xea\x2f\x2f\x82\xb1\x82\xea" << endl; PartCorpusIO io(util, instr, false); KyteaSentence * sent = io.readSentence(); int ret = 1; if(sent->words.size() != 1) { cerr << "Sentence size " << sent->words.size() << " != 1" << endl; ret = 0; } delete sent; return ret; } int testFullTagConf() { // Build the string stringstream instr; // instr << "-/ / f[^/ / / B/⏕L" << endl; instr << "\x82\xb1\x2d\x82\xea\x2f\x96\xbc\x8e\x8c\x20\x82\xcd\x2f\x8f\x95\x8e\x8c\x20\x83\x66\x81\x5b\x83\x5e\x2f\x96\xbc\x8e\x8c\x20\x82\xc5\x2f\x8f\x95\x93\xae\x8e\x8c\x20\x82\xb7\x2f\x8c\xea\x94\xf6\x20\x81\x42\x2f\x95\xe2\x8f\x95\x8b\x4c\x8d\x86" << endl; FullCorpusIO io(util, instr, false); KyteaSentence * sent = io.readSentence(); // Build the 
expectations if(sent->words.size() != 6) THROW_ERROR("sent->words size doesn't match 5 " << sent->words.size()); bool ret = true; for(int i = 0; i < 6; i ++) { if(sent->words[i].tags[0][0].second != 100.0) { cerr << "Bad confidence for tag " << i << ": " << sent->words[i].tags[0][0].second << endl; ret = false; } } delete sent; return ret; } int testLastValue() { // string confident_text = "/㖼/ // M//炢 x/ڔ/ // /`e/ // //ɂイ傭 // // B/⏕L/B\n"; string confident_text = "\x82\xb1\x82\xea\x2f\x91\xe3\x96\xbc\x8e\x8c\x2f\x82\xb1\x82\xea\x20\x82\xcd\x2f\x8f\x95\x8e\x8c\x2f\x82\xcd\x20\x90\x4d\x97\x8a\x2f\x96\xbc\x8e\x8c\x2f\x82\xb5\x82\xf1\x82\xe7\x82\xa2\x20\x93\x78\x2f\x90\xda\x94\xf6\x8e\xab\x2f\x82\xc7\x20\x82\xcc\x2f\x8f\x95\x8e\x8c\x2f\x82\xcc\x20\x8d\x82\x2f\x8c\x60\x97\x65\x8e\x8c\x2f\x82\xbd\x82\xa9\x20\x82\xa2\x2f\x8c\xea\x94\xf6\x2f\x82\xa2\x20\x93\xfc\x97\xcd\x2f\x96\xbc\x8e\x8c\x2f\x82\xc9\x82\xe3\x82\xa4\x82\xe8\x82\xe5\x82\xad\x20\x82\xc5\x2f\x8f\x95\x93\xae\x8e\x8c\x2f\x82\xc5\x20\x82\xb7\x2f\x8c\xea\x94\xf6\x2f\x82\xb7\x20\x81\x42\x2f\x95\xe2\x8f\x95\x8b\x4c\x8d\x86\x2f\x81\x42\n"; // Read in a partially annotated sentence stringstream instr; instr << confident_text; FullCorpusIO infcio(util, instr, false); KyteaSentence * sent = infcio.readSentence(); int ret = 1; if(sent->words.size() != 11) { cerr << "Did not get expected sentence size of 11: " << sent->words.size() << endl; ret = 0; } else if(sent->words[10].tags.size() != 2) { cerr << "Did not get two levels of tags for final word: " << sent->words[10].tags.size() << endl; ret = 0; } delete sent; return ret; } int testUnkIO() { // string input = "/㖼/ // m//݂\n"; string input = "\x82\xb1\x82\xea\x2f\x91\xe3\x96\xbc\x8e\x8c\x2f\x82\xb1\x82\xea\x20\x82\xcd\x2f\x8f\x95\x8e\x8c\x2f\x82\xcd\x20\x96\xa2\x92\x6d\x2f\x96\xbc\x8e\x8c\x2f\x82\xdd\x82\xbf\n"; // Read in a partially annotated sentence stringstream instr; instr << input; FullCorpusIO infcio(util, instr, false); KyteaSentence * sent = 
infcio.readSentence(); sent->words[2].setUnknown(true); // string exp = "/㖼/ // m//݂/UNK\n"; string exp = "\x82\xb1\x82\xea\x2f\x91\xe3\x96\xbc\x8e\x8c\x2f\x82\xb1\x82\xea\x20\x82\xcd\x2f\x8f\x95\x8e\x8c\x2f\x82\xcd\x20\x96\xa2\x92\x6d\x2f\x96\xbc\x8e\x8c\x2f\x82\xdd\x82\xbf\x2f\x55\x4e\x4b\n"; stringstream outstr; FullCorpusIO outfcio(util, outstr, true); outfcio.setUnkTag("/UNK"); outfcio.writeSentence(sent); string act = outstr.str(); if(exp != act) { cerr << "exp: "< #include "test-base.h" namespace kytea { class TestCorpusIO : public TestBase { private: StringUtilUtf8 * util; public: TestCorpusIO() { util = new StringUtilUtf8; } ~TestCorpusIO() { delete util; } int testWordSegConf() { // Build the string stringstream instr; instr << "こ|れ-は デ ー タ で-す 。" << endl; PartCorpusIO io(util, instr, false); KyteaSentence * sent = io.readSentence(); // Build the expectations vector exp(8,0.0); exp[0] = 100; exp[1] = -100; exp[6] = -100; bool ret = checkVector(exp, sent->wsConfs); delete sent; return ret; } int testPartEmptyLines() { // Build the string stringstream instr; instr << "" << endl; PartCorpusIO io(util, instr, false); KyteaSentence * sent = io.readSentence(); // Build the expectations vector exp(0,0.0); bool ret = checkVector(exp, sent->wsConfs); delete sent; return ret; } int testTokReadSentence() { stringstream instr; instr << "これ は 学習 データ で す 。" << endl; TokenizedCorpusIO io(util, instr, false); KyteaSentence * sent = io.readSentence(); // Make the correct words KyteaString::Tokens toks = util->mapString("これ は 学習 データ で す 。").tokenize(util->mapString(" ")); return checkWordSeg(*sent,toks,util); } int testRawReadSlash() { stringstream instr; instr << "右/左" << endl; RawCorpusIO io(util, instr, false); KyteaSentence * sent = io.readSentence(); // Make the correct words KyteaString exp = util->mapString("右/左"); return exp == sent->surface; } int testPartEmptyTag() { // Build the string stringstream instr; instr << "リ-リ-カ-ル//りりかる|な-の-は//なのは|を 中 心 に 、" << endl; 
PartCorpusIO io(util, instr, false); KyteaSentence * sent = io.readSentence(); int ret = 1; if(sent->words.size() != 3) { cerr << "Sentence size " << sent->words.size() << " != 3" << endl; ret = 0; } if(sent->words[0].tags.size() != 2) { cerr << "Words[0] tags " << sent->words[0].tags.size() << " != 2" << endl; ret = 0; } if(sent->words[1].tags.size() != 2) { cerr << "Words[1] tags " << sent->words[1].tags.size() << " != 2" << endl; ret = 0; } delete sent; return ret; } int testTagIO() { // Build the string stringstream instr, outstr; instr << "こ-れ/名詞 は/助詞 データ/名詞 で/助動詞 す/語尾 。/補助記号" << endl; FullCorpusIO ioin(util, instr, false); KyteaSentence * sent = ioin.readSentence(); FullCorpusIO ioout(util, outstr, true); ioout.setPrintWords(false); ioout.writeSentence(sent); string exp = "名詞 助詞 名詞 助動詞 語尾 補助記号\n"; string act = outstr.str(); delete sent; if(exp != act) { cerr << exp << endl << act << endl; return 0; } else { return 1; } } int testFullTagConf() { // Build the string stringstream instr; instr << "こ-れ/名詞 は/助詞 データ/名詞 で/助動詞 す/語尾 。/補助記号" << endl; FullCorpusIO io(util, instr, false); KyteaSentence * sent = io.readSentence(); // Build the expectations if(sent->words.size() != 6) THROW_ERROR("sent->words size doesn't match 5 " << sent->words.size()); bool ret = true; for(int i = 0; i < 6; i ++) { if(sent->words[i].tags[0][0].second != 100.0) { cerr << "Bad confidence for tag " << i << ": " << sent->words[i].tags[0][0].second << endl; ret = false; } } delete sent; return ret; } int testLastValue() { string confident_text = "これ/代名詞/これ は/助詞/は 信頼/名詞/しんらい 度/接尾辞/ど の/助詞/の 高/形容詞/たか い/語尾/い 入力/名詞/にゅうりょく で/助動詞/で す/語尾/す 。/補助記号/。\n"; // Read in a partially annotated sentence stringstream instr; instr << confident_text; FullCorpusIO infcio(util, instr, false); KyteaSentence * sent = infcio.readSentence(); int ret = 1; if(sent->words.size() != 11) { cerr << "Did not get expected sentence size of 11: " << sent->words.size() << endl; ret = 0; } else if(sent->words[10].tags.size() != 2) 
{ cerr << "Did not get two levels of tags for final word: " << sent->words[10].tags.size() << endl; ret = 0; } delete sent; return ret; } int testUnkIO() { string input = "これ/代名詞/これ は/助詞/は 未知/名詞/みち\n"; // Read in a partially annotated sentence stringstream instr; instr << input; FullCorpusIO infcio(util, instr, false); KyteaSentence * sent = infcio.readSentence(); sent->words[2].setUnknown(true); string exp = "これ/代名詞/これ は/助詞/は 未知/名詞/みち/UNK\n"; stringstream outstr; FullCorpusIO outfcio(util, outstr, true); outfcio.setUnkTag("/UNK"); outfcio.writeSentence(sent); string act = outstr.str(); if(exp != act) { cerr << "exp: "< #include "test-base.h" namespace kytea { class TestCorpusIOEuc : public TestBase { private: StringUtilEuc * util; public: TestCorpusIOEuc() { util = new StringUtilEuc; } ~TestCorpusIOEuc() { delete util; } int testWordSegConf() { // Build the string stringstream instr; // instr << "|- - " << endl; instr << "\xa4\xb3\x7c\xa4\xec\x2d\xa4\xcf\x20\xa5\xc7\x20\xa1\xbc\x20\xa5\xbf\x20\xa4\xc7\x2d\xa4\xb9\x20\xa1\xa3" << endl; PartCorpusIO io(util, instr, false); KyteaSentence * sent = io.readSentence(); // Build the expectations vector exp(8,0.0); exp[0] = 100; exp[1] = -100; exp[6] = -100; bool ret = checkVector(exp, sent->wsConfs); delete sent; return ret; } int testPartEmptyLines() { // Build the string stringstream instr; instr << "" << endl; PartCorpusIO io(util, instr, false); KyteaSentence * sent = io.readSentence(); // Build the expectations vector exp(0,0.0); bool ret = checkVector(exp, sent->wsConfs); delete sent; return ret; } int testPartEmptyTag() { // Build the string stringstream instr; // instr << "-//" << endl; instr << "\xa4\xb3\x2d\xa4\xec\x2f\x2f\xa4\xb3\xa4\xec" << endl; PartCorpusIO io(util, instr, false); KyteaSentence * sent = io.readSentence(); int ret = 1; if(sent->words.size() != 1) { cerr << "Sentence size " << sent->words.size() << " != 1" << endl; ret = 0; } delete sent; return ret; } int testFullTagConf() { // Build the 
string stringstream instr; // instr << "-/̾ / ǡ/̾ /ư / /" << endl; instr << "\xa4\xb3\x2d\xa4\xec\x2f\xcc\xbe\xbb\xec\x20\xa4\xcf\x2f\xbd\xf5\xbb\xec\x20\xa5\xc7\xa1\xbc\xa5\xbf\x2f\xcc\xbe\xbb\xec\x20\xa4\xc7\x2f\xbd\xf5\xc6\xb0\xbb\xec\x20\xa4\xb9\x2f\xb8\xec\xc8\xf8\x20\xa1\xa3\x2f\xca\xe4\xbd\xf5\xb5\xad\xb9\xe6" << endl; FullCorpusIO io(util, instr, false); KyteaSentence * sent = io.readSentence(); // Build the expectations if(sent->words.size() != 6) THROW_ERROR("sent->words size doesn't match 5 " << sent->words.size()); bool ret = true; for(int i = 0; i < 6; i ++) { if(sent->words[i].tags[0][0].second != 100.0) { cerr << "Bad confidence for tag " << i << ": " << sent->words[i].tags[0][0].second << endl; ret = false; } } delete sent; return ret; } int testLastValue() { // string confident_text = "/̾/ // /̾/餤 // // /ƻ/ // /̾/ˤ夦礯 /ư/ // //\n"; string confident_text = "\xa4\xb3\xa4\xec\x2f\xc2\xe5\xcc\xbe\xbb\xec\x2f\xa4\xb3\xa4\xec\x20\xa4\xcf\x2f\xbd\xf5\xbb\xec\x2f\xa4\xcf\x20\xbf\xae\xcd\xea\x2f\xcc\xbe\xbb\xec\x2f\xa4\xb7\xa4\xf3\xa4\xe9\xa4\xa4\x20\xc5\xd9\x2f\xc0\xdc\xc8\xf8\xbc\xad\x2f\xa4\xc9\x20\xa4\xce\x2f\xbd\xf5\xbb\xec\x2f\xa4\xce\x20\xb9\xe2\x2f\xb7\xc1\xcd\xc6\xbb\xec\x2f\xa4\xbf\xa4\xab\x20\xa4\xa4\x2f\xb8\xec\xc8\xf8\x2f\xa4\xa4\x20\xc6\xfe\xce\xcf\x2f\xcc\xbe\xbb\xec\x2f\xa4\xcb\xa4\xe5\xa4\xa6\xa4\xea\xa4\xe7\xa4\xaf\x20\xa4\xc7\x2f\xbd\xf5\xc6\xb0\xbb\xec\x2f\xa4\xc7\x20\xa4\xb9\x2f\xb8\xec\xc8\xf8\x2f\xa4\xb9\x20\xa1\xa3\x2f\xca\xe4\xbd\xf5\xb5\xad\xb9\xe6\x2f\xa1\xa3\n"; // Read in a partially annotated sentence stringstream instr; instr << confident_text; FullCorpusIO infcio(util, instr, false); KyteaSentence * sent = infcio.readSentence(); int ret = 1; if(sent->words.size() != 11) { cerr << "Did not get expected sentence size of 11: " << sent->words.size() << endl; ret = 0; } else if(sent->words[10].tags.size() != 2) { cerr << "Did not get two levels of tags for final word: " << sent->words[10].tags.size() << endl; ret = 0; } delete 
sent; return ret; } int testUnkIO() { // string input = "/̾/ // ̤/̾/ߤ\n"; string input = "\xa4\xb3\xa4\xec\x2f\xc2\xe5\xcc\xbe\xbb\xec\x2f\xa4\xb3\xa4\xec\x20\xa4\xcf\x2f\xbd\xf5\xbb\xec\x2f\xa4\xcf\x20\xcc\xa4\xc3\xce\x2f\xcc\xbe\xbb\xec\x2f\xa4\xdf\xa4\xc1\n"; // Read in a partially annotated sentence stringstream instr; instr << input; FullCorpusIO infcio(util, instr, false); KyteaSentence * sent = infcio.readSentence(); sent->words[2].setUnknown(true); // string exp = "/̾/ // ̤/̾/ߤ/UNK\n"; string exp = "\xa4\xb3\xa4\xec\x2f\xc2\xe5\xcc\xbe\xbb\xec\x2f\xa4\xb3\xa4\xec\x20\xa4\xcf\x2f\xbd\xf5\xbb\xec\x2f\xa4\xcf\x20\xcc\xa4\xc3\xce\x2f\xcc\xbe\xbb\xec\x2f\xa4\xdf\xa4\xc1\x2f\x55\x4e\x4b\n"; stringstream outstr; FullCorpusIO outfcio(util, outstr, true); outfcio.setUnkTag("/UNK"); outfcio.writeSentence(sent); string act = outstr.str(); if(exp != act) { cerr << "exp: "< #include #include #include #include #include #include #include #include #include #include #include #include #include "test-kytea.h" #include "test-analysis.h" #include "test-corpusio.h" #include "test-corpusio-euc.h" #include "test-corpusio-sjis.h" #include "test-sentence.h" using namespace std; int main(int argv, char **argc) { kytea::KyteaTest test_kytea; kytea::TestAnalysis test_analysis; kytea::TestCorpusIO test_corpusio; kytea::TestCorpusIOEuc test_corpusio_euc; kytea::TestCorpusIOSjis test_corpusio_sjis; kytea::TestSentence test_sentence; if(!( test_kytea.runTest() && test_analysis.runTest() && test_sentence.runTest() && test_corpusio.runTest() && test_corpusio_euc.runTest() && test_corpusio_sjis.runTest() )) { cout << "**** FAILED!!! 
****" << endl; } else { cout << "**** passed ****" << endl; } } kytea_0.4.6+dfsg.orig/src/test/Makefile.am0000644000175000017500000000075312122355536017707 0ustar koichikoichi# KYTH = kytea.h corpus-io.h model-io.h string-util.h \ # kytea-model.h kytea-string.h kytea-struct.h dictionary.h general-io.h \ # kytea-config.h KYTH = test-kytea.h test-analysis.h test-base.h test-corpusio.h \ test-sentence.h \ test-corpusio-euc.h \ test-corpusio-sjis.h AM_CPPFLAGS = -I$(srcdir)/../include -DPKGDATADIR='"$(pkgdatadir)"' noinst_PROGRAMS = test-kytea test_kytea_SOURCES = test-kytea.cpp ${KYTH} test_kytea_LDADD = ../lib/libkytea.la kytea_0.4.6+dfsg.orig/Makefile.am0000644000175000017500000000012112122357425016125 0ustar koichikoichiSUBDIRS = src data pkgconfigdir = $(libdir)/pkgconfig pkgconfig_DATA = kytea.pc