pax_global_header00006660000000000000000000000064126321347030014513gustar00rootroot0000000000000052 comment=f69113abacbb6436458df35b216e0a88c6b1cac2 irstlm-6.00.05/000077500000000000000000000000001263213470300131755ustar00rootroot00000000000000irstlm-6.00.05/CMakeLists.txt000066400000000000000000000034571263213470300157460ustar00rootroot00000000000000
# Top-level CMake build script for the IRSTLM toolkit.
#
# It declares the project and its version variables, the CXX0 and ASSERT
# build options, a default install prefix, Windows-only settings, and then
# descends into the src, scripts and doc subdirectories.

#CMake 2.6+ is recommended to an improved Boost module
CMAKE_MINIMUM_REQUIRED(VERSION 2.6.0 FATAL_ERROR)

PROJECT (irstlm)

# NOTE(review): the autotools build (configure.ac) declares version 5.80.06;
# confirm which version number is the intended one.
SET(IRSTLM_VERSION_MAJOR "5")
SET(IRSTLM_VERSION_MINOR "80")
SET(IRSTLM_VERSION_PATCH "08")

# NOTE(review): the compilers are forced here, after PROJECT() has already
# selected them; confirm this override is still wanted.
SET(CMAKE_C_COMPILER "gcc" )
SET(CMAKE_CXX_COMPILER "g++" )

# User-facing switches; both default to ON.
OPTION(CXX0 "Enable/Disable std=c++0" ON)
OPTION(ASSERT "Enable/Disable MY_ASSERT_FLAG" ON)

# Single integer form of the version: MAJOR*10000 + MINOR*100 + PATCH.
MATH(EXPR IRSTLM_INT_VERSION "(${IRSTLM_VERSION_MAJOR} * 10000) + (${IRSTLM_VERSION_MINOR} * 100) + (${IRSTLM_VERSION_PATCH} * 1)" )
SET(IRSTLM_VERSION "${IRSTLM_VERSION_MAJOR}.${IRSTLM_VERSION_MINOR}.${IRSTLM_VERSION_PATCH}")

#ADD_DEFINITIONS(-DPS_CACHE_ENABLE)

# Only touch CMP0017 when the running CMake actually knows it: testing for the
# cmake_policy command alone is not enough, because setting a policy that the
# running CMake does not know is an error.
IF(POLICY CMP0017)
  cmake_policy(SET CMP0017 NEW)
ENDIF()

# When the user gave no install prefix, install into <source>/inst.
IF(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
  SET(CMAKE_INSTALL_PREFIX
    "${CMAKE_SOURCE_DIR}/inst" CACHE PATH "IRSTLM install prefix" FORCE
  )
  MESSAGE(STATUS "You have not set the install dir, default to ${CMAKE_INSTALL_PREFIX}, if you want to set it, use cmake -DCMAKE_INSTALL_PREFIX to do so")
ENDIF()

#set various platform specific global options
IF(WIN32)
  SET(CMAKE_DEBUG_POSTFIX "d")
  OPTION( USE_64_BIT "Set to on if you want to compile Win64" OFF )
ENDIF()

# include specific modules
SET(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")

ADD_SUBDIRECTORY (src)
ADD_SUBDIRECTORY (scripts)
ADD_SUBDIRECTORY (doc)

# On Windows, report the 64-bit switch and write it back to the cache.
IF (WIN32)
  MESSAGE( STATUS "-------------------------------------------------------------------------------" )
  MESSAGE( STATUS "USE_64_BIT = ${USE_64_BIT}" )
  MESSAGE( STATUS "Change a value with: cmake -D=" )
  MESSAGE( STATUS "-------------------------------------------------------------------------------" )
  SET( USE_64_BIT "${USE_64_BIT}" CACHE BOOL "Set to ON to build Win64" FORCE )
ENDIF()
irstlm-6.00.05/Copyright000066400000000000000000000017721263213470300150770ustar00rootroot00000000000000// $Id: Copyright 3686 2010-10-15 11:55:32Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ irstlm-6.00.05/LICENSE000066400000000000000000000167441263213470300142160ustar00rootroot00000000000000 GNU LESSER GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. This version of the GNU Lesser General Public License incorporates the terms and conditions of version 3 of the GNU General Public License, supplemented by the additional permissions listed below. 0. Additional Definitions.
As used herein, "this License" refers to version 3 of the GNU Lesser General Public License, and the "GNU GPL" refers to version 3 of the GNU General Public License. "The Library" refers to a covered work governed by this License, other than an Application or a Combined Work as defined below. An "Application" is any work that makes use of an interface provided by the Library, but which is not otherwise based on the Library. Defining a subclass of a class defined by the Library is deemed a mode of using an interface provided by the Library. A "Combined Work" is a work produced by combining or linking an Application with the Library. The particular version of the Library with which the Combined Work was made is also called the "Linked Version". The "Minimal Corresponding Source" for a Combined Work means the Corresponding Source for the Combined Work, excluding any source code for portions of the Combined Work that, considered in isolation, are based on the Application, and not on the Linked Version. The "Corresponding Application Code" for a Combined Work means the object code and/or source code for the Application, including any data and utility programs needed for reproducing the Combined Work from the Application, but excluding the System Libraries of the Combined Work. 1. Exception to Section 3 of the GNU GPL. You may convey a covered work under sections 3 and 4 of this License without being bound by section 3 of the GNU GPL. 2. Conveying Modified Versions. 
If you modify a copy of the Library, and, in your modifications, a facility refers to a function or data to be supplied by an Application that uses the facility (other than as an argument passed when the facility is invoked), then you may convey a copy of the modified version: a) under this License, provided that you make a good faith effort to ensure that, in the event an Application does not supply the function or data, the facility still operates, and performs whatever part of its purpose remains meaningful, or b) under the GNU GPL, with none of the additional permissions of this License applicable to that copy. 3. Object Code Incorporating Material from Library Header Files. The object code form of an Application may incorporate material from a header file that is part of the Library. You may convey such object code under terms of your choice, provided that, if the incorporated material is not limited to numerical parameters, data structure layouts and accessors, or small macros, inline functions and templates (ten or fewer lines in length), you do both of the following: a) Give prominent notice with each copy of the object code that the Library is used in it and that the Library and its use are covered by this License. b) Accompany the object code with a copy of the GNU GPL and this license document. 4. Combined Works. You may convey a Combined Work under terms of your choice that, taken together, effectively do not restrict modification of the portions of the Library contained in the Combined Work and reverse engineering for debugging such modifications, if you also do each of the following: a) Give prominent notice with each copy of the Combined Work that the Library is used in it and that the Library and its use are covered by this License. b) Accompany the Combined Work with a copy of the GNU GPL and this license document. 
c) For a Combined Work that displays copyright notices during execution, include the copyright notice for the Library among these notices, as well as a reference directing the user to the copies of the GNU GPL and this license document. d) Do one of the following: 0) Convey the Minimal Corresponding Source under the terms of this License, and the Corresponding Application Code in a form suitable for, and under terms that permit, the user to recombine or relink the Application with a modified version of the Linked Version to produce a modified Combined Work, in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source. 1) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (a) uses at run time a copy of the Library already present on the user's computer system, and (b) will operate properly with a modified version of the Library that is interface-compatible with the Linked Version. e) Provide Installation Information, but only if you would otherwise be required to provide such information under section 6 of the GNU GPL, and only to the extent that such information is necessary to install and execute a modified version of the Combined Work produced by recombining or relinking the Application with a modified version of the Linked Version. (If you use option 4d0, the Installation Information must accompany the Minimal Corresponding Source and Corresponding Application Code. If you use option 4d1, you must provide the Installation Information in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source.) 5. Combined Libraries. 
You may place library facilities that are a work based on the Library side by side in a single library together with other library facilities that are not Applications and are not covered by this License, and convey such a combined library under terms of your choice, if you do both of the following: a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities, conveyed under the terms of this License. b) Give prominent notice with the combined library that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. 6. Revised Versions of the GNU Lesser General Public License. The Free Software Foundation may publish revised and/or new versions of the GNU Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Library as you received it specifies that a certain numbered version of the GNU Lesser General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that published version or of any later version published by the Free Software Foundation. If the Library as you received it does not specify a version number of the GNU Lesser General Public License, you may choose any version of the GNU Lesser General Public License ever published by the Free Software Foundation. If the Library as you received it specifies that a proxy can decide whether future versions of the GNU Lesser General Public License shall apply, that proxy's public statement of acceptance of any version is permanent authorization for you to choose that version for the Library. irstlm-6.00.05/Makefile.am000066400000000000000000000005551263213470300152360ustar00rootroot00000000000000# not a GNU package. 
# You can remove this line, if
# have all needed files, that a GNU package needs
AUTOMAKE_OPTIONS = foreign

# Subdirectories built recursively.
SUBDIRS = src scripts doc

# Extra files shipped in the distribution tarball.
EXTRA_DIST = README RELEASE Copyright

ACLOCAL_AMFLAGS = -I m4

LN_S=@LN_S@

# After the executables are installed, make "lib64" a link to "lib" under the
# exec prefix. $(DESTDIR) is prepended so that staged installs
# (make install DESTDIR=/some/stage) create the link inside the staging tree
# rather than in the live prefix.
install-exec-hook:
	cd $(DESTDIR)${exec_prefix}/ && \
	${LN_S} -n -f lib lib64

# Strip Subversion metadata directories from the doc tree of the tarball.
dist-hook:
	rm -rf `find $(distdir)/doc -type d -name .svn`
irstlm-6.00.05/NOTE000066400000000000000000000007001263213470300136620ustar00rootroot00000000000000This repo has been created starting from revision 891 of the SourceForge repo of IRSTLM on 20/07/2015 GitHub repository: commit 55cf030f4b0de41049de8c996f3168080bf1eaa8 Author: Marcello Federico Date: Mon Jul 20 09:18:13 2015 +0200 SourceForge orinigal repo: URL: https://svn.code.sf.net/p/irstlm/code Repository Root: https://svn.code.sf.net/p/irstlm/code Repository UUID: ee1e936f-7723-0410-939f-cd31102c90e6 Revision: 891 irstlm-6.00.05/README.md000066400000000000000000000046201263213470300144560ustar00rootroot00000000000000# irstlm IRSTLM Toolkit CONTENT: - src: source code - scripts: supporting scripts - doc: documentation (in Latex) and in pdf (to be generated) - bin: binaries (to be generated) and scripts - lib: libraries (to be generated) - readme: this file DOCUMENTATION A User Manual is available under https://sourceforge.net/projects/irstlm The data for the examples described in the User Manual are available under http://sourceforge.net/projects/irstlm/files/irstlm/sampledata/ HOW TO INSTALL WITH AUTOMAKE Step 0: $> sh regenerate-makefiles.sh [--force] Set parameter force to the value "--force" if you want to recreate all links to the autotools Step 1: $> ./configure [--prefix=/path/where/to/install] ... Run "configure --help" to get more details on the compilation options If your g++ compiler does not support '-std=c++0x', please add parameter '--disable-cxx0'. To check whether g++ complier does support '-std=c++0x', please run the following command: $> echo | g++ -E -x c++ -std=c++0x -dM - >& /dev/null ; echo $?
If it returns 0, g++ compiler does support '-std=c++0x'; otherwise, it does not, and hence please use '--disable-cxx0' To enable/disable assert for debugging purpose, please add parameter '--enable-assert' (default) or '--disable-assert' To modify debugging level, please add parameter '--with-tracelevel=' (default is 0) Step 2: $> make Step 3: $> make install These steps will generate the irstlm library and commands, respectively, under the specified path where to install. HOW TO INSTALL WITH CMAKE Step 0: $> cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX="/path/where/to/install" Note: If your g++ compiler does not support '-std=c++0x', please add parameter '-DCXX0:BOOL=OFF'. To check whether g++ compiler does support '-std=c++0x', please run the following command: $> echo | g++ -E -x c++ -std=c++0x -dM - >& /dev/null ; echo $? If it returns 0, g++ compiler does support '-std=c++0x'; otherwise, it does not, and hence please use '-DCXX0:BOOL=OFF' To enable/disable assert for debugging purpose, please add parameter '-DASSERT:BOOL=ON' (default) or '-DASSERT:BOOL=OFF' To modify debugging level, please add parameter '-DTRACE_LEVEL=' (default is 0) Step 2: $> make Step 3: $> make install HOW TO CONTRIBUTE If you wish to contribute to the Open Source IRSTLM toolkit just tell us!
Marcello Federico FBK, Trento, ITALY email: federico AT fbk DOT eu irstlm-6.00.05/configure.ac000066400000000000000000000134411263213470300154660ustar00rootroot00000000000000
# Autoconf script for the IRSTLM toolkit.
#
# It declares the --enable-*/--with-* switches, translates them into
# CPPFLAGS/LDFLAGS/LIBS additions, decides whether the documentation can be
# built, and generates the Makefiles of the top, src, scripts and doc
# directories.
AC_INIT([irstlm], [5.80.06])

AM_INIT_AUTOMAKE([-Wall -Werror foreign])
AC_PROG_CC
AC_PROG_CXX
AC_PROG_LIBTOOL
AC_CONFIG_HEADERS([config.h])
AC_CONFIG_MACRO_DIR([m4])

m4_pattern_allow([AM_PROG_AR],[AM_PROG_AR])
AM_PROG_AR

AC_ARG_ENABLE([doc],
    [AC_HELP_STRING([--enable-doc|--disable-doc], [Enable or Disable (default) creation of documentation])])

AC_ARG_ENABLE([trace],
    [AC_HELP_STRING([--enable-trace|--disable-trace], [Enable (default) or Disable trace info at run-time])])

AC_ARG_ENABLE([assert],
    [AC_HELP_STRING([--enable-assert|--disable-assert], [Enable (default) or Disable assert calls at run-time])])

AC_ARG_ENABLE([debugging],
    [AC_HELP_STRING([--enable-debugging|--disable-debugging], [Enable or Disable (default) debugging info ("-g -O2")])])

AC_ARG_ENABLE(profiling,
    [AC_HELP_STRING([--enable-profiling|--disable-profiling], [Enable or Disable (default) profiling info ("-pg")])])

AC_ARG_ENABLE(caching,
    [AC_HELP_STRING([--enable-caching|--disable-caching], [Enable or Disable (default) internal caches to store probs and other info])])

AC_ARG_ENABLE(output,
    [AC_HELP_STRING([--enable-output|--disable-output], [Enable (default) or Disable part of the output])])

AC_ARG_ENABLE(interpolatedsearch,
    [AC_HELP_STRING([--enable-interpolatedsearch|--disable-interpolatedsearch], [Enable or Disable (default) interpolated search for n-grams])])

AC_ARG_ENABLE(optimization,
    [AC_HELP_STRING([--enable-optimization|--disable-optimization], [Enable or Disable (default) optimization info ("-O3")])])

# The c++0x dialect is on unless --disable-cxx0 is given (see the test on
# enable_cxx0 below), so the help text marks Enable as the default.
AC_ARG_ENABLE(cxx0,
    [AC_HELP_STRING([--enable-cxx0|--disable-cxx0], [Enable (default) or Disable c++0x dialect ("-std=c++0x")])])

AC_ARG_WITH(zlib,
    [AC_HELP_STRING([--with-zlib=PATH], [(optional) path to zlib])],
    [with_zlib=$withval],
    [with_zlib=no]
    )

AC_ARG_WITH(tracelevel,
    [AC_HELP_STRING([--with-tracelevel=VAL], [(optional) level of tracing; default 0, tracing disabled])],
    [with_tracelevel=$withval],
    [with_tracelevel=0]
    )

# Site named in the "documentation will not be created" notices; it can be
# overridden from the environment. The default is the project page given in
# the README.
WEBSITE=${WEBSITE:-https://sourceforge.net/projects/irstlm}

# Documentation is built only on request and only when both pdflatex and
# bibtex are found. The decision is collected in a shell variable so that the
# automake conditional is defined exactly once, outside any shell branch.
doc_compilation=no
if test "x$enable_doc" = 'xyes'
then
    AC_CHECK_TOOL(PDFLATEX,pdflatex,"no")
    AC_CHECK_TOOL(BIBTEX,bibtex,"no")
    if test "x$PDFLATEX" != "xno" && test "x$BIBTEX" != "xno"
    then
        doc_compilation=yes
        AC_MSG_NOTICE([pdflatex and bibtex are available])
        AC_MSG_NOTICE([documentation will be created])
    else
        AC_MSG_NOTICE([either pdflatex or bibtex is not available])
        AC_MSG_NOTICE([documentation will not be created (default); get it through the website $WEBSITE])
    fi
else
    AC_MSG_NOTICE([documentation will not be created (default); get it through the website $WEBSITE])
fi
AM_CONDITIONAL([DOC_COMPILATION],[test "x$doc_compilation" = 'xyes'])

#### Use this if you want that the default is yes
#### if test "x$enable_foo" != 'xno'
#### Use this if you want that the default is no
#### if test "x$enable_foo" = 'xyes'

# Trace level: an explicit non-zero --with-tracelevel wins unless tracing is
# disabled; with level 0 and tracing left enabled the level becomes 1.
if test "x$with_tracelevel" != 'x0'
then
    if test "x$enable_trace" != 'xno'
    then
        TRACELEVEL=${with_tracelevel}
        AC_MSG_NOTICE([trace enabled (default); trace level is $TRACELEVEL])
    else
        TRACELEVEL=0
        AC_MSG_NOTICE([trace disabled; trace level is overwritten to $TRACELEVEL; most regression tests will fail])
    fi
else
    if test "x$enable_trace" != 'xno'
    then
        TRACELEVEL=1
        AC_MSG_NOTICE([trace enabled (default); trace level is overwritten to default value $TRACELEVEL])
    else
        TRACELEVEL=0
        AC_MSG_NOTICE([trace disabled; trace level is $TRACELEVEL; most regression tests will fail])
    fi
fi
CPPFLAGS="$CPPFLAGS -DTRACE_LEVEL=$TRACELEVEL"

if test "x$enable_assert" != 'xno'
then
    AC_MSG_NOTICE([assert enabled (default)])
    CPPFLAGS="$CPPFLAGS -DMY_ASSERT_FLAG"
else
    AC_MSG_NOTICE([assert disabled])
    CPPFLAGS="$CPPFLAGS -UMY_ASSERT_FLAG"
fi

if test "x$enable_debugging" = 'xyes'
then
    AC_MSG_NOTICE([generation of debugging symbols enabled, compilation with "-g -O2"])
    CPPFLAGS="$CPPFLAGS -g -O2"
else
    AC_MSG_NOTICE([generation of debugging symbols disabled (default), compilation without "-g", only "-O2"])
fi

if test "x$enable_profiling" = 'xyes'
then
    AC_MSG_NOTICE([profiling enabled, compilation with "-pg"])
    CPPFLAGS="$CPPFLAGS -pg"
    LDFLAGS="$LDFLAGS -pg"
else
    AC_MSG_NOTICE([profiling disabled (default)])
fi

if test "x$enable_caching" = 'xyes'
then
    AC_MSG_NOTICE([caching enabled])
    CPPFLAGS="$CPPFLAGS -DPS_CACHE_ENABLE -DLMT_CACHE_ENABLE -DMDIADAPTLM_CACHE_ENABLE"
else
    AC_MSG_NOTICE([caching disabled (default)])
fi

# Output is produced in full unless --disable-output is given.
if test "x$enable_output" = 'xno'
then
    AC_MSG_NOTICE([part of the output on stdout is suppressed])
    CPPFLAGS="$CPPFLAGS -DOUTPUT_SUPPRESSED"
else
    AC_MSG_NOTICE([output enabled (default)])
fi

if test "x$enable_interpolatedsearch" = 'xyes'
then
    AC_MSG_NOTICE([interpolated search enabled])
    CPPFLAGS="$CPPFLAGS -DINTERP_SEARCH"
else
    AC_MSG_NOTICE([interpolated search disabled (default)])
fi

if test "x$enable_optimization" = 'xyes'
then
    AC_MSG_NOTICE([optimization enabled, compilation with "-O3"])
    CPPFLAGS="$CPPFLAGS -O3"
    LDFLAGS="$LDFLAGS -O3"
else
    AC_MSG_NOTICE([optimization disabled (default)])
fi

if test "x$enable_cxx0" != 'xno'
then
    AC_MSG_NOTICE([c++x0 dialect is enabled (default), compilation with "-DHAVE_CXX0 -std=c++0x "])
    CPPFLAGS="$CPPFLAGS -DHAVE_CXX0 -std=c++0x"
else
    AC_MSG_NOTICE([c++x0 dialect is disabled, compilation without "-std=c++0x" and with "-UHAVE_CXX0"])
    CPPFLAGS="$CPPFLAGS -UHAVE_CXX0"
fi

# An explicit zlib location adds its include and lib directories; the
# library itself is always linked.
if test "x$with_zlib" != 'xno'
then
    CPPFLAGS="$CPPFLAGS -I${with_zlib}/include"
    LDFLAGS="$LDFLAGS -L${with_zlib}/lib"
fi

LIBS="$LIBS -lz"

AC_CONFIG_FILES([
    Makefile
    src/Makefile
    scripts/Makefile
    doc/Makefile
])

AC_SUBST(transform,'s/_lm/-lm/')

AC_OUTPUT()

AC_MSG_NOTICE([The software will be installed into $prefix])
irstlm-6.00.05/doc/000077500000000000000000000000001263213470300137425ustar00rootroot00000000000000irstlm-6.00.05/doc/CMakeLists.txt000066400000000000000000000025141263213470300165040ustar00rootroot00000000000000# include specific modules INCLUDE(UseLATEX OPTIONAL) if
(PDFLATEX_COMPILER AND BIBTEX_COMPILER AND MAKEINDEX_COMPILER)
  # All three LaTeX tools are available: build the IRSTLM manual as a PDF,
  # copy it under a versioned name into the build tree and install it.
  message("PDFLATEX_COMPILER exists (${PDFLATEX_COMPILER})")
  message("BIBTEX_COMPILER exists (${BIBTEX_COMPILER})")
  message("MAKEINDEX_COMPILER exists (${MAKEINDEX_COMPILER})")

  SET(LATEX_OUTPUT_PATH build)

  # The version requirement is stated before the project it applies to.
  cmake_minimum_required(VERSION 2.8)
  PROJECT(irstlm-manual NONE)

  # Chapter files pulled in by irstlm-manual.tex.
  SET(IRSTLM_INPUT_TEX
    ClassAndChunkLMs.tex
    LMFileFormats.tex
    LMFiltering.tex
    LMInterface.tex
    LMInterpolation.tex
    LMPruning.tex
    LMQuantization.tex
    LMAdaptation.tex
    LMCompilation.tex
    LMPrune.tex
    LMSmoothing.tex
    compileLM.tex
    dict.tex
    gettingStarted.tex
    giganticLM.tex
    installation.tex
    interpolateLM.tex
    interpolatedLM.tex
    introduction.tex
    mixtureLM.tex
    ngt.tex
    parallelComputation.tex
    pruneLM.tex
    quantizeLM.tex
    referenceMaterial.tex
    regressionTests.tex
    releaseNotes.tex
    tlm.tex
  )

  ADD_LATEX_DOCUMENT(
    ./irstlm-manual.tex
    INPUTS ${IRSTLM_INPUT_TEX}
    DEFAULT_PDF
  )

  # Versioned copy of the manual. The same absolute path is used for the copy
  # and for the install rule: a bare file name in INSTALL() is looked up in
  # the source directory, which is not where the copy is written when
  # building out of source.
  SET(IRSTLM_MANUAL_PDF "${CMAKE_BINARY_DIR}/doc/irstlm-${IRSTLM_VERSION}-manual.pdf")

  add_custom_command(TARGET pdf POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E copy
            "${CMAKE_CURRENT_BINARY_DIR}/${LATEX_OUTPUT_PATH}/irstlm-manual.pdf"
            "${IRSTLM_MANUAL_PDF}"
    VERBATIM)

  INSTALL(PROGRAMS "${IRSTLM_MANUAL_PDF}"
    DESTINATION doc
    PERMISSIONS OWNER_READ
  )
ELSE()
  message("PDFLATEX_COMPILER, BIBTEX_COMPILER or MAKEINDEX_COMPILER not found: the manual will not be built")
ENDIF()
irstlm-6.00.05/doc/ClassAndChunkLMs.tex000066400000000000000000000167231263213470300175720ustar00rootroot00000000000000{\IRSTLM} allows the use of class and chunk LMs, and a special handling of input tokens which are concatenation of $N \ge 1$ fields separated by the character \#, e.g.
\begin{verbatim} word#lemma#part-of-speech#word-class \end{verbatim} \noindent The processing is guided by the format of the file passed to Moses or {\tt compile-lm}: if it contains just the LM, either in textual or binary format, it is treated as usual; otherwise, it is supposed to have the following format: \begin{verbatim} LMMACRO \end{verbatim} \noindent where: \begin{verbatim} LMMACRO is a reserved keyword is a positive integer is an integer >=-1 is a boolean value (true, false) is a file containing a LM (format compatible with {\IRSTLM}) is an (optional) file with a (one|many)-to-one map \end{verbatim} \noindent The various cases are discussed with examples in the following. Data used in those examples can be found in the directory {\tt example/chunkLM/} which represents the relative path for all the parameters of the referred commands. Note that texts with different tokens (words, POS, word\#POS pairs...) used either as input or for training LMs are all derived from the same multifield texts in order to allow direct comparison of results. \subsection{Field selection} The simplest case is that of the LM in {\tt } referring just to one specific field of the input tokens. In this case, it is possible to specify the field to be selected before querying the LM through the integer {\tt } ($0$ for the first field, $1$ for the second...). With the value $-1$, no selection is applied and the LM is queried with n-grams of whole strings. The other parameters are set as: \begin{verbatim} : set to the size of the LM in : false \end{verbatim} \noindent The third line optionally reserved to {\tt } does not exist.
\bigskip \noindent Examples: \bigskip \noindent \thesubsection.a) selection of the second field: \begin{verbatim} $> compile-lm --eval test/test.w-micro cfgfile/cfg.2ndfield %% Nw=126 PP=2.68 PPwp=0.00 Nbo=0 Noov=0 OOV=0.00% \end{verbatim} \noindent \thesubsection.b) selection of the first field: \begin{verbatim} $> compile-lm --eval test/test.w-micro cfgfile/cfg.1stfield %% Nw=126 PP=9.71 PPwp=0.00 Nbo=76 Noov=0 OOV=0.00% \end{verbatim} \noindent The result of the latter case is identical to that obtained with the standard configuration involving just words: \bigskip \noindent \thesubsection.c) usual case on words: \begin{verbatim} $> compile-lm --eval test/test.w lm/train.en.blm %% Nw=126 PP=9.71 PPwp=0.00 Nbo=76 Noov=0 OOV=0.00% \end{verbatim} \subsection{Class LMs} Possibly, a many-to-one or one-to-one map can be passed through the {\tt } parameter which has the simple format: \begin{verbatim} w1 class(w1) w2 class(w2) ... wM class(wM) \end{verbatim} \noindent The map is applied to each component of ngrams before the LM query. Examples: \bigskip \noindent \thesubsection.a) map applied to the second field: \begin{verbatim} $> compile-lm --eval test/test.w-micro cfgfile/cfg.2ndfld-map %% Nw=126 PP=16.40 PPwp=0.00 Nbo=33 Noov=0 OOV=0.00% \end{verbatim} \noindent \thesubsection.b) just to assess the correctness of the (16.2.a) result: \begin{verbatim} $> compile-lm --eval test/test.macro lm/train.macro.blm %% Nw=126 PP=16.40 PPwp=0.00 Nbo=33 Noov=0 OOV=0.00% \end{verbatim} \subsection{Chunk LMs} A particular processing is performed whenever fields are supposed to correspond to microtags, i.e. the per-word projections of chunk labels. By means of the {\tt } parameter, it is possible to activate a processing aiming at collapsing the sequence of microtags defining a chunk. The chunk LM is then queried with ngrams of chunk labels, in an asynchronous manner with respect to the sequence of words, as in general chunks consist of more words. 
\noindent The collapsing operation is automatically activated if the sequence of microtags is: \begin{verbatim} TAG( TAG+ TAG+ ... TAG+ TAG) \end{verbatim} \noindent Such a sequence is collapsed into a single chunk label (let us say {\tt CHNK}) as long as {\tt TAG(}, {\tt TAG+} and {\tt TAG)} are all mapped into the same label {\tt CHNK}. The map into different labels or a different use/position of characters $($, $+$ and $)$ in the lexicon of tags prevent the collapsing operation even if {\tt } is set to {\tt true}. Of course, if {\tt } is {\tt false}, no collapse is attempted. \paragraph{Warning:} In this context, the parameter {\tt } plays an important role: it defines the size of the n-gram before the collapsing operation, that is the number of microtags of the actually processed sequence. {\tt } should be large enough to ensure that after the collapsing operation, the resulting n-gram of chunks is at least of the size of the LM to be queried (the {\tt }). As an example, assuming {\tt =6}, {\tt =1}, {\tt =true} and 3 the size of the chunk LM, the following input \begin{verbatim} on#PP average#NP( 30#NP+ -#NP+ 40#NP+ cm#NP) \end{verbatim} \noindent will result in querying the LM with just the bigram {\tt (PP,NP)}, instead of a more informative trigram; for this particular case, the value 6 for {\tt } is not enough. On the other side, for efficiency reasons, it cannot be set to an unlimited value. A reasonable value could derive from the average number of microtags per chunk (2-3), which means setting {\tt } to two-three times the size of the LM in {\tt }. Examples: \bigskip \noindent \thesubsection.a) second field, micro$\rightarrow$macro map, collapse: \begin{verbatim} $> compile-lm --eval test/test.w-micro cfgfile/cfg.2ndfld-map-cllps %% Nw=126 PP=1.84 PPwp=0.00 Nbo=0 Noov=0 OOV=0.00% $> compile-lm --eval test/test.w-micro cfgfile/cfg.2ndfld-map-cllps -d=1 %% Nw=126 PP=1.83774013 ...
OOV=0.00% logPr=-33.29979642 \end{verbatim} \noindent \thesubsection.b) whole token, micro$\rightarrow$macro map, collapse: \begin{verbatim} $> compile-lm --eval test/test.micro cfgfile/cfg.token-map-cllps %% Nw=126 PP=1.84 PPwp=0.00 Nbo=0 Noov=0 OOV=0.00% \end{verbatim} \noindent \thesubsection.c) whole token, micro$\rightarrow$macro map, NO collapse: \begin{verbatim} $> compile-lm --eval test/test.micro cfgfile/cfg.token-map %% Nw=126 PP=16.40 PPwp=0.00 Nbo=0 Noov=0 OOV=0.00% \end{verbatim} \noindent Note that the configuration (16.3.c) gives the same result of that in example (16.2.b), as they are equivalent. \bigskip \noindent \thesubsection.d) As an actual example related to the ``warning'' note reported above, the following configuration with usual LM: \begin{verbatim} $> compile-lm --eval test/test.chunk lm/train.macro.blm -d=1 Nw=73 PP=2.85754443 ... OOV=0.00000000% logPr=-33.28748842 \end{verbatim} \noindent not necessarily yields the same log-likelihood ({\tt logPr}) nor the same perplexity ({\tt PP}) of case (16.3.a). In fact, concerning {\tt PP}, the length of the input sequence is definitely different (126 tokens before collapsing, 73 after that). Even the {\tt logPr} is different (-33.29979642 vs. -33.28748842) because in (16.3.a) some 6-grams ({\tt } is set to 6) after collapsing reduce to $n$-grams of size less than 3 (the size of lm/train.macro.blm). By setting {\tt } to a larger value (e.g. 8), the same {\tt logPr} will be computed. irstlm-6.00.05/doc/LMAdaptation.tex000066400000000000000000000065511263213470300170100ustar00rootroot00000000000000Language model adaptation can be applied when little training data is given for the task at hand, but much more data from other less related sources is available. {\tt tlm} supports two adaptation methods. \subsection{Minimum Discriminative Information Adaptation} MDI adaptation is used when domain related data is very little but enough to estimate a unigram LM. 
Basically, the n-gram probs of a general purpose (background) LM are scaled so that they match the target unigram distribution. \noindent Relevant parameters: \begin{itemize} \item {\tt -ar=value}: the adaptation {\tt rate}, a real number ranging from 0 (=no adaptation) to 1 (=strong adaptation). \item {\tt -ad=file}: the adaptation file, either a text or a unigram table. \item {\tt -ao=y}: open vocabulary mode, which must be set if the adaptation file might contain new words to be added to the basic dictionary. \end{itemize} \noindent As an example, we apply MDI adaptation on the ``adapt'' file: \begin{small} \begin{verbatim} $> tlm -tr=train.www -lm=wb -n=3 -te=test -dub=1000000 -ad=adapt -ar=0.8 -ao=yes n=49984 LP=326327.8053 PP=684.470312 OVVRate=0.04193341869 \end{verbatim} \end{small} \noindent \paragraph{Warning:} modified shift-beta smoothing cannot be applied in open vocabulary mode ({\tt -ao=yes}). If this is the case, you should either change smoothing method or simply add the adaptation text to the background LM (use {\tt -aug} parameter of {\tt ngt}). In general, this solution should provide better performance. \begin{small} \begin{verbatim} $> ngt -i=train.www -aug=adapt -o=train-adapt.www -n=3 -b=yes $> tlm -tr=train-adapt.www -lm=msb -n=3 -te=test -dub=1000000 -ad=adapt -ar=0.8 n=49984 LP=312276.1746 PP=516.7311396 OVVRate=0.04193341869 \end{verbatim} \end{small} \subsection{Mixture Adaptation} \noindent Mixture adaptation is useful when you have enough training data to estimate a bigram or trigram LM and you also have data collections from other domains. \noindent Relevant parameters: \begin{itemize} \item {\tt-lm=mix} : specifies mixture smoothing method \item {\tt -slmi=}: specifies filename with information about LMs to combine. 
\end{itemize} \noindent In the example directory, the file {\tt sublmi} contains the following lines: \begin{verbatim} 2 -slm=msb -str=adapt -sp=0 -slm=msb -str=train.www -sp=0 \end{verbatim} \noindent This means that we train a mixture model on the {\tt adapt} data set and combine it with the train data. For each data set the desired smoothing method is specified (disregard the parameter {\tt -sp}). The file used for adaptation is the one in FIRST position. \begin{verbatim} $> tlm -tr=train.www -lm=mix -slmi=sublm -n=3 -te=test -dub=1000000 n=49984 LP=307199.3273 PP=466.8244383 OVVRate=0.04193341869 \end{verbatim} \noindent {\bf Warning}: for computational reasons it is expected that the $n$-gram table specified by {\tt -tr} contains AT LEAST the $n$-grams of the last table specified in the slmi file, i.e. {\tt train.www} in the example. Faster computations are achieved by putting the largest dataset as the last sub-model in the list and the union of all data sets as training file. \noindent It is also IMPORTANT that a large {\tt -dub} value is specified so that probabilities of sub-LMs can be correctly computed in case of out-of-vocabulary words. irstlm-6.00.05/doc/LMCompilation.tex000066400000000000000000000035171263213470300172010ustar00rootroot00000000000000LMs in ARPA, iARPA, and qARPA format can be stored in a compact binary table through the command: \begin{verbatim} $> compile-lm train.lm train.blm \end{verbatim} \noindent which generates the binary file {\tt train.blm} that can be quickly loaded in memory. If the LM is really very large, {\tt compile-lm} can avoid to create the binary LM directly in memory through the option {\tt -memmap 1}, which exploits the {\em Memory Mapping} mechanism in order to work as much as possible on disk rather than in RAM. \\ \begin{verbatim} $> compile-lm --memmap 1 train.lm train.blm \end{verbatim} \noindent This option clearly pays a fee in terms of speed, but is often the only way to proceed.
It is also recommended that the hard disk for the LM storage belongs to the computer on which the compilation is performed. \noindent Notice that most of the functionalities of {\tt compile-lm} (see below) apply to binary and quantized models. \noindent By default, the command uses the directory ``/tmp'' for storing intermediate results. For huge LMs, the temporary files can grow dramatically causing a ``disk full'' system error. It is possible to explicitly set the directory used for temporary computation through the parameter ``--tmpdir''. \begin{verbatim} $> compile-lm --tmpdir= train.lm train.blm \end{verbatim} \subsection{Inverted order of ngrams} \label{sec:inverted-lm} For a faster access, the ngrams can be stored in inverted order with the following two commands: \begin{verbatim} $> sort-lm.pl -inv -ilm train.lm -olm train.inv.lm $> compile-lm train.inv.lm train.inv.blm --invert yes \end{verbatim} \paragraph{Warning:} The following pipeline is no more allowed!! \COMMENT{ or with the following pipeline: } \begin{verbatim} $> cat train.lm | sort-lm.pl -inv | \ compile-lm /dev/stdin train.inv.blm --invert yes \end{verbatim} irstlm-6.00.05/doc/LMFileFormats.tex000066400000000000000000000215001263213470300171260ustar00rootroot00000000000000{\IRSTLM} supports several types of input and output formats for handling LMs, $n$-gram counts, dictionaries. %{\IRSTLM} supports three output formats of LMs. These formats have the %purpose of permitting the use of LMs by external programs. \subsection{File Formats for Dictionary} The dictionary is the data structure exploited by {\IRSTLM} to store a set of terms. {\IRSTLM} saves a dictionary in textual file format consisting of: \begin{itemize} \item a header line specifying the most important information about the file itself: the keyword "dictionary", a fixed value 0, and the amount of terms the dictionary contains; \item a set of terms listed according to either their occurrence or their frequency in the data. 
\end{itemize} Here is an excerpt. \begin{verbatim} dictionary 0 7893 solemn ceremony marks .... \end{verbatim} \noindent Optionally, the occurrence frequencies of each term can be stored as well; in this case the keyword is "DICTIONARY". \noindent Here is an excerpt. \begin{verbatim} DICTIONARY 0 7893 5000 5001 solemn 7 ceremony 59 .... \end{verbatim} \IMPORTANT{The list order is used by {\IRSTLM} to define the internal codes of the terms. In the vast majority of cases, it is completely transparent and irrelevant to the user. Only in very few cases highlighted in this manual, this order is crucial.} \subsection{File Formats for $n$-gram Table} The $n$-gram table is the data structure exploited by {\IRSTLM} to store a set of $n$-grams. {\IRSTLM} stores an $n$-gram table either in textual or binary formats. \subsubsection{Textual format} The textual format consists of: \begin{itemize} \item a header line specifying the most important information about the file itself: the keyword "nGrAm", the order $n$ of the $n$-grams, the amount of $n$-grams the $n$-gram table contains, and a second keyword representing the table type; \item a second line reporting the size of the dictionary associated to the $n$-grams; \item the terms of the dictionary (one term per line) with their frequency; \item the list of all $n$-grams with their counts. \end{itemize} Here is an excerpt. \begin{verbatim} NgRaM 3 76857 ngram 7893 5000 5001 solemn 7 ceremony 59 ... 2 1 1 solemn ceremony 1 a solemn 1 \end{verbatim} \subsubsection{Binary format} The binary format is similar, but its main keyword is "{\tt NgRaM}" (different caseing), and the list of $n$-grams is binarized; hence, the last portion of the binary $n$-gram table is not user-readable. \subsubsection{Google $n$-gram format} {\IRSTLM} supports the Google $n$-gram format as well both for input and output. This format, always textual, simply consists of the list of all $n$-grams with their counts. Here is an excerpt. 
\begin{verbatim} 2 1 1 solemn ceremony 1 a solemn 1 ... \end{verbatim} \subsubsection{Table types} The table type keyword represents the way the $n$-grams are collected and the way they are exploited for further computation: \begin{itemize} \item{\tt ngram}: each entry is a standard $n$-gram, i.e. a contiguous sequence of $n$ terms; they are usually used to estimate a standard $n$-gram LM; \item{\tt co-occ$K$}: each entry is a xxxxxx, where $K$ is XXXXXX; \item{\tt hm$S$}: each entry is a xxxxxx, where $K$ is XXXXXX; \end{itemize} \subsection{File Formats for LM} {\IRSTLM} handles LM both in textual and binary formats. It provides facilities to save disk space storing probabilities as quantized values instead of floating point values, and to reduce access time saving $n$-grams in inverted order. \subsubsection{Textual Format} The textual format is the well-known ARPA format introduced in DARPA ASR evaluations to exchange LMs. ARPA format is supported by most third party LM toolkit, like SRILM and KenLM. The ARPA format consists of: \begin{itemize} \item one block reporting the amount $n$-grams stored for level $m$ of the LM ($m -0.826378 .... -1.20244 \2-grams: -3.29024 -0.221849 ... -0.289359 restructuring of \3-grams: -0.397606 -1.67881 a hong_kong ... -0.420213 seymf council , \end\ \end{verbatim} \noindent Empty lines can occur before and after each block. \noindent There is no limit to the order $n$ of $n$-grams. \IMPORTANT{Backoff log-probabilities are not reported if equal to 0; backoff log-probabilities do not exist for the largest order.} \subsubsection{Quantized Textual Format} This textual format extends the ARPA textual format including codebooks that quantize probabilities and back-off weights of each $n$-gram level. 
The quantized ARPA format consists of: \begin{itemize} \item a header line specifying the most important information about the file itself: the keyword "qARPA", the order $n$ of the LM, the size of the $n$ codebooks \item one block reporting the amount $n$-grams stored for level $m$ of the LM ($m 53 186 0 .... \2-grams: 256 -3.79901 -99 -3.62278 -3.01953 ... 7 255 65 255 .... \end\ \end{verbatim} \subsubsection{Intermediate Textual Format} This is an {\em intermediate} ARPA format used by {\IRSTLM} for optimizing computation of huge LM. It differs from the ARPA format in two aspects: \begin{itemize} \item the header line contains only the keyword {\tt iARPA}; \item the first field of each $n$-gram entry is its smoothed frequency of instead of its log-probability. \end{itemize} \COMMENT{ \noindent Nevertheless, iARPA format is properly managed by the {\tt compile-lm} command in order to generate a binary version or a standard ARPA version. } \subsubsection{Binary Format} The binary format supported by {\IRSTLM} allows for save disk space and upload the LM quicker. \noindent The binary format consists of: \begin{itemize} \item a header line specifying the most important information about the file itself: the keyword "blmt", the order $n$ of the LM, and the amount $n$-grams stored for level $m$ of the LM ($m 1 5001 solemn 7 ... _binary_data_ \end{verbatim} \subsubsection{Quantized Binary Format} The quantized binary format stores the quantized version of a LM. \noindent It consists of: \begin{itemize} \item a header line specifying the most important information about the file itself: the keyword "Qblmt", the order $n$ of the LM, and the amount $n$-grams stored for level $m$ of the LM ($m 1 5001 solemn 7 ... _binary_data_ \end{verbatim} \subsubsection{Inverted Binary Format} {\IRSTLM} can store the $n$-grams in inverted order to speed up access time. This applies to both standard and quantized binary formats, namely {\tt blmt} or {\tt Qblmt}. 
The keywords are {\tt blmtI} or {\tt QblmtI}, respectively. irstlm-6.00.05/doc/LMFiltering.tex000066400000000000000000000013011263213470300166330ustar00rootroot00000000000000A large LM can be filtered according to a word list through the command: \begin{verbatim} $> compile-lm train.lm --filter list filtered.lm \end{verbatim} The resulting LM will only contain n-grams inside the provided list of words, with the exception of the 1-gram level, which by default is preserved identical to the original LM. This behavior can be changed by setting the option {\tt --keepunigrams no}. LM filtering can be useful once very large LMs can be specialized in advance to work on a particular portion of language. \noindent If the original LM is in binary format and is very large, {\tt compile-lm} can avoid to load it in memory, through the memory mapping option {\tt -memmap 1}. irstlm-6.00.05/doc/LMInterface.tex000066400000000000000000000110131263213470300166110ustar00rootroot00000000000000LMs are useful when they can be queried through another application in order to compute perplexity scores or n-gram probabilities. {\IRSTLM} provides two possible interfaces: \begin{itemize} \item at the command level, through {\tt compile-lm} \item at the c++ library level, mainly through methods of the class {\tt lmtable} \end{itemize} \noindent In the following, we will only focus on the command level interface. Details about the c++ library interface will be provided in a future version of this manual. 
\subsection{Perplexity Computation} Assume we have estimated and saved the following LM: \begin{verbatim} $> tlm -tr=train.www -n=3 -lm=wb -te=test -o=train.lm -ps=no n=49984 LP=308057.0419 PP=474.9041687 OVVRate=0.05007602433 \end{verbatim} \noindent To compute the perplexity directly from the LM on disk, we can use the command: \begin{verbatim} $> compile-lm train.lm --eval test %% Nw=49984 PP=1064.40 PPwp=589.50 Nbo=38071 Noov=2503 OOV=5.01% \end{verbatim} Notice that {\tt PPwp} reports the contribution of OOV words to the perplexity. Each OOV word is indeed penalized by dividing the LM probability of the {\tt unk} word by the quantity \centerline{{\tt DictionaryUpperBound} - {\tt SizeOfDictionary}} \noindent The OOV penalty can be modified by changing the {\tt DictionaryUpperBound} with the parameter {\tt --dub} (whose default value is set to $10^7$). \\ \noindent The perplexity of the pruned LM can be computed with the command: \begin{verbatim} $> compile-lm train.plm --eval test --dub 10000000 %% Nw=49984 PP=1019.69 PPwp=564.73 Nbo=39907 Noov=2503 OOV=5.01% \end{verbatim} Interestingly, a slightly better value is obtained, which could be explained by the fact that pruning has removed many infrequent trigrams and has redistributed their probabilities over more frequent bigrams. \noindent Notice that {\tt PP} reports the perplexity with a fixed dictionary upper-bound of 10 million words. Indeed: \begin{verbatim} $> tlm -tr=train.www -n=3 -lm=wb -te=test -o=train.lm -ps=no -dub=10000000 n=49984 LP=348396.8632 PP=1064.401254 OVVRate=0.05007602433 \end{verbatim} \bigskip \noindent Again, if the LM is in binary format and is very large, {\tt compile-lm} can avoid loading it into memory, through the memory mapping option {\tt --memmap 1}. \bigskip \noindent By enabling the option ``{\tt --sentence yes}'', {\tt compile-lm} computes perplexity and related figures (OOV rate, number of backoffs, etc.) for each input sentence.
The end of a sentence is identified by a given symbol ({\tt } by default). \begin{verbatim} $> compile-lm train.plm --eval test --dub 10000000 --sentence yes \end{verbatim} {\small \begin{verbatim} %% sent_Nw=1 sent_PP=23.22 sent_PPwp=0.00 sent_Nbo=0 sent_Noov=0 sent_OOV=0.00% %% sent_Nw=8 sent_PP=7489.50 sent_PPwp=7356.27 sent_Nbo=7 sent_Noov=2 sent_OOV=25.00% %% sent_Nw=9 sent_PP=1231.44 sent_PPwp=0.00 sent_Nbo=14 sent_Noov=0 sent_OOV=0.00% %% sent_Nw=6 sent_PP=27759.10 sent_PPwp=25867.42 sent_Nbo=19 sent_Noov=1 sent_OOV=16.67% ..... %% sent_Nw=5 sent_PP=378.38 sent_PPwp=0.00 sent_Nbo=39893 sent_Noov=0 sent_OOV=0.00% %% sent_Nw=15 sent_PP=4300.44 sent_PPwp=2831.89 sent_Nbo=39907 sent_Noov=1 sent_OOV=6.67% %% Nw=49984 PP=1019.69 PPwp=564.73 Nbo=39907 Noov=2503 OOV=5.01% \end{verbatim} } \bigskip \noindent Finally, tracing information with the {\tt --eval } option are shown by setting debug levels from 1 to 4 ({\tt --debug}): \begin{enumerate} \item reports the back-off level for each word \item adds the log-prob \item adds the back-off weight \item check if probabilities sum up to 1. \end{enumerate} \subsection{Probability Computations} Word-by-word log-probabilities can be computed as well from standard input with the command: \begin{verbatim} $> compile-lm train.lm --score yes < test > 1 p= NULL > 1 p= NULL > of 1 p= -3.530047e+00 bo= 2 > of the 1 p= -1.250668e+00 bo= 1 > of the senate 1 p= -1.170901e+01 bo= 1 > the senate ( 1 p= -5.457265e+00 bo= 2 > senate ( 1 p= -2.166440e+01 bo= 2 .... .... \end{verbatim} \noindent the command reports the currently observed n-gram, including {\tt\_unk\_} words, a dummy constant frequency 1, the log-probability of the n-gram, and the number of back-offs performed by the LM. \paragraph{Warning:} All cross-sentence $n$-grams are skipped. The 1-grams with the sentence start symbol are also skipped. In a $n$-grams all words before the sentence start symbol are removed. 
For $n$-grams, whose size is smaller than the LM order, probability is not computed, but a {\tt NULL} value is returned. irstlm-6.00.05/doc/LMInterpolation.tex000066400000000000000000000051541263213470300175510ustar00rootroot00000000000000We provide a convenient tool to estimate mixtures of LMs that have been already created in one of the available formats. The tool permits to estimate interpolation weights through the EM algorithm, to compute the perplexity, and to query the interpolated LM. \noindent Data used in those examples can be found in the directory {\tt example/interpolateLM/}, which represents the relative path for all the parameters of the referred commands. \noindent Interpolated LMs are defined by a configuration file in the following format: \begin{verbatim} 3 0.3 lm-file1 0.3 lm-file2 0.4 lm-file3 \end{verbatim} \noindent The first number indicates the number of LMs to be interpolated, then each LM is specified by its weight and its file (either in ARPA or binary format). Notice that you can interpolate LMs with different orders\\ \noindent Given an initial configuration file {\tt lmlist.init} (with arbitrary weights), new weights can be estimated through Expectation-Maximization on some text sample {\tt test} by running the command: \begin{verbatim} $> interpolate-lm lmlist.init --learn test \end{verbatim} \noindent New weights will be written in the updated configuration file, called by default {\tt lmlist.init.out}. 
You can also specify the name of the updated configuration file as follows: \begin{verbatim} $> interpolate-lm lmlist.init --learn test lmlist.final \end{verbatim} \noindent Similarly to {\tt compile-lm}, interpolated LMs can be queried through the option {\tt --score} \begin{verbatim} $> interpolate-lm lmlist.final --score yes < test \end{verbatim} \noindent and can return the perplexity of a given input text (``{\tt --eval text-file}''), optionally at sentence level by enabling the option ``{\tt --sentence yes}'', \begin{verbatim} $> interpolate-lm lmlist.final --eval test $> interpolate-lm lmlist.final --eval test --sentence yes \end{verbatim} \bigskip \noindent If there are binary LMs in the list, {\tt interpolate-lm} can avoid to load them in memory through the memory mapping option {\tt -memmap 1}. \noindent The full list of options is: \begin{verbatim} --learn text-file learn optimal interpolation for text-file --order n order of n-grams used in --learn (optional) --eval text-file compute perplexity on text-file --dub dict-size dictionary upper bound (default 10^7) --score [yes|no] compute log-probs of n-grams from stdin --debug [1-3] verbose output for --eval option (see compile-lm) --sentence [yes|no] (compute perplexity at sentence level (identified through the end symbol) --memmap 1 use memory map to read a binary LM \end{verbatim} irstlm-6.00.05/doc/LMPrune.tex000066400000000000000000000000001263213470300157740ustar00rootroot00000000000000irstlm-6.00.05/doc/LMPruning.tex000066400000000000000000000036071263213470300163450ustar00rootroot00000000000000Large LMs files can be pruned in a smart way by means of the command {\tt prune-lm} that removes $n$-grams for which resorting to the back-off results in a small loss. {\IRSTLM} implements a method similar to the Weighted Difference Method described in the paper {\em Scalable Backoff Language Models} by Seymore and Rosenfeld. 
\noindent The syntax is as follows: \begin{verbatim} $> prune-lm --threshold=1e-6,1e-6 train.lm.gz train.plm \end{verbatim} Thresholds for each n-gram level, up from 2-grams, are based on empirical evidence. Threshold zero results in no pruning. If less thresholds are specified, the right most is applied to the higher levels. Hence, in the above example we could have just specified one threshold, namely {\tt --threshold=1e-6}. The effect of pruning is shown in the following messages of {\tt prune-lm}: \begin{verbatim}1-grams: reading 15059 entries 2-grams: reading 142684 entries 3-grams: reading 293685 entries done OOV code is 15058 OOV code is 15058 pruning LM with thresholds: 1e-06 1e-06 savetxt: train.plm save: 15059 1-grams save: 138252 2-grams save: 194194 3-grams \end{verbatim} \noindent The saved LM table {\tt train.plm} contains about 3\% less bigrams, and 34\% less trigrams. Notice that the output of prune-lm is an ARPA LM file, while the input can be either an ARPA or binary LM. In order to measure the loss in accuracy introduced by pruning, perplexity of the resulting LM can be computed (see below). \paragraph{Warning:} the possible quantization should be performed after pruning. \paragraph{Warning:} {\IRSTLM} does not provide a reliable probability for the special 1-gram composed by the ``sentence start symbol'' ({\tt }) , because none should ever ask for it. However, this pruning method requires the computation of the probability of this 1-gram. Hence, (only) in this case the probability of this special 1-gram is arbitrarily set to 1. irstlm-6.00.05/doc/LMQuantization.tex000066400000000000000000000012311263213470300174000ustar00rootroot00000000000000A language model file in ARPA format, created with the IRST LM toolkit or with other tools, can be quantized and stored in a compact data structure, called language model table. 
Quantization can be performed by the command: \begin{verbatim} $> quantize-lm train.lm train.qlm \end{verbatim} \noindent which generates the quantized version {\tt train.qlm} that encodes all probabilities and back-off weights in 8 bits. The output is a modified ARPA format, called qARPA. Notice that quantized LMs reduce memory consumptions at the cost of some loss in performance. Moreover, probabilities of quantized LMs are not supposed to be properly normalized. irstlm-6.00.05/doc/LMSmoothing.tex000066400000000000000000000000001263213470300166520ustar00rootroot00000000000000irstlm-6.00.05/doc/Makefile.am000066400000000000000000000005771263213470300160070ustar00rootroot00000000000000documentationdir = @prefix@/doc if DOC_COMPILATION irstlm-manual.pdf: irstlm-manual.tex pdflatex irstlm-manual ; \ bibtex irstlm-manual ; \ pdflatex irstlm-manual ; \ pdflatex irstlm-manual ; \ rm *.aux *.log *.bbl *.blg clean-local: rm -rf irstlm-manual.pdf all: irstlm-manual.pdf dist_documentation_SCRIPTS = irstlm-manual.pdf endif EXTRA_DIST = irstlm-manual.tex irstlm-6.00.05/doc/RELEASE000066400000000000000000000000101263213470300147340ustar00rootroot000000000000005.80.08 irstlm-6.00.05/doc/compileLM.tex000066400000000000000000000000001263213470300163330ustar00rootroot00000000000000irstlm-6.00.05/doc/dict.tex000066400000000000000000000111521263213470300154070ustar00rootroot00000000000000{\tt dict} is the command which copes with the dictionaries. \begin{itemize} \item It extracts the dictionary from a corpus or a dictionary; \item It computes and shows the dictionary growth curve; \item It computes and shows the out-of-vocabulary rate on a test corpus. 
\end{itemize} \subsubsection{Synopsis} \begin{tabular}{llll} \multicolumn{4}{l}{USAGE}\\ & \multicolumn{3}{l}{\tt dict -i=$<$inputfile$>$ [options]} \\ \\ \multicolumn{4}{l}{OPTIONS} \\ & {\tt Curve}& {\tt c} & show dictionary growth curve; default is false\\ & {\tt CurveSize} & {\tt cs} & default 10\\ & {\tt Freq} & {\tt f} & output word frequencies; default is false\\ & {\tt Help} & {\tt h} & print this help\\ & {\tt InputFile} & {\tt i} & input file (Mandatory)\\ & {\tt IntSymb} & {\tt is} & interruption symbol\\ & {\tt ListOOV} & {\tt oov} & print OOV words to stderr; default is false\\ & {\tt LoadFactor} & {\tt lf} & set the load factor for cache; it should be a positive real value; default is 0\\ & {\tt OutputFile} & {\tt o} & output file\\ & {\tt PruneFreq} & {\tt pf} & prune words with frequency below the specified value\\ & {\tt PruneRank} & {\tt pr} & prune words with frequency rank above the specified value\\ & {\tt Size} & {\tt s} & initial dictionary size; default is $10^6$\\ & {\tt sort} & & sort dictionary by frequency; default is false\\ & {\tt TestFile} & {\tt t} & compute OOV rates on the specified test corpus\\ \end{tabular} \subsubsection{Extraction of a dictionary} To extract the dictionary from a given a text and store it in a file, run the following command: \begin{verbatim} $> dict -i=train.txt.se -o=train.dict -f=true \end{verbatim} The input text can be also generated on the fly by passing a command as value of the parameter{\tt InputFile }; in this case the single or double quotation marks are required. \begin{verbatim} $> dict -i="cat train.txt | add-start-end.sh" -o=train.dict -f=true \end{verbatim} \noindent For some applications like speech recognition, it can be useful to limit the LM dictionary. 
You can obtain such a pruned list either by means of the parameter {\tt PruneRank}, which only stores the most frequent, let us say, 10K words: \begin{verbatim} $> dict -i=train.txt.se -o=train.dict.pr10k -pr=10000 \end{verbatim} \noindent or by means of the parameter {\tt PruneFreq}, which only stores the terms occurring more than a given number of times, let us say, 5: \begin{verbatim} $> dict -i=train.txt.se -o=train.dict.pf5 -pf=5 \end{verbatim} \noindent The two pruning strategies can be combined. \subsubsection{Dictionary growth curve} {\tt dict} can display the distribution of the terms according to their frequency in a text or in a pre-computed dictionary. This facility is enabled by the parameter {\tt Curve}; the maximum frequency taken into account is specified by the parameter {\tt CurveSize}. \begin{verbatim} $> dict -i=train.dict -c=yes -cs=50 \end{verbatim} \noindent The output looks as follows \begin{verbatim} Dict size: 7893 **************** DICTIONARY GROWTH CURVE **************** Freq Entries Percent >0 7893 100.00% >1 4880 61.83% >2 3721 47.14% >3 2990 37.88% ... >47 271 3.43% >48 264 3.34% >49 258 3.27% ********************************************************* \end{verbatim} \noindent Each row of the table reports, given the value in the first column, the number of terms (second column) having a frequency greater than the given value (first column), and its percentage (third column) with respect to the total number of entries. \subsubsection{Out-of-vocabulary rate statistics} {\tt dict} can compute the out-of-vocabulary rate of a test corpus with respect to a text or a pre-computed dictionary. This facility is enabled by the parameter {\tt TestFile}; the maximum frequency taken into account is specified by the parameter {\tt CurveSize}.
\begin{verbatim} $> dict -i=train.dict -t=test.txt.se -cs=50 \end{verbatim} \noindent The output looks as follows \begin{verbatim} Dict size: 7893 Words of test: 1009 **************** OOV RATE STATISTICS **************** Freq OOV_Entries OOV_Rate <1 119 11.79% <2 151 14.97% <3 191 18.93% ... <48 457 45.29% <49 457 45.29% <50 457 45.29% ********************************************************* \end{verbatim} \noindent Each row of the table reports, given the value in the first column, the out-of-vocabulary rate on the test set, assuming to prune the dictionary at the given frequency. In other words, 191 (18.93\%) of the running terms in the test set has a frequency smaller than 3 in the dictionary. irstlm-6.00.05/doc/gettingStarted.tex000066400000000000000000000073021263213470300174560ustar00rootroot00000000000000After a successful installation, you are ready to use {\IRSTLM}. \noindent In this Section, a basic 4-step procedure is given to estimate a LM and to compute its perplexity on a text. Many changes to this procedure can be done in order to optimize effectiveness and efficiency according to your needs. \noindent Please refer to Section~\ref{sec:commands} to learn more about each IRSTLM commands, and to Section~\ref{sec:functions} to get hints about IRSTLM functionalities. \IMPORTANT{All programs assume that the environment variable {\bf IRSTLM} is correctly set to {\tt /path/to/install/doc}, and that that environment variable {\bf PATH} includes the command directory {\tt /path/to/install/bin}. see above} \noindent Data used in the following usage examples can be found in an archive you can download from the official website of {\IRSTLM}. Most of them are very little, so the reported figures are not reliable. \subsection{Preparation of Training Data} In order to estimate a Language Model, you first need to prepare your training corpus. The corpus just consists of a text. 
We assume that the text is already preprocessed according to the user needs; this means that lowercasing, uppercasing, tokenization, and any other text transformation has to be performed beforehand with other tools. \noindent You only need to decide whether you want {\IRSTLM} to be aware of sentence boundaries, i.e. where a sentence starts and ends. Otherwise, it considers the corpus as a continuous stream of text, and does not identify sentence splits. \noindent The following script adds start and end symbols ({\tt <s>} and {\tt </s>}, respectively) to all lines in your training corpus. \begin{verbatim} $> cat train.txt | add-start-end.sh > train.txt.se \end{verbatim} \noindent {\IRSTLM} does not compute probabilities for cross-sentence $n$-grams, i.e. $n$-grams including the pair {\tt </s> <s>}. \IMPORTANT{{\IRSTLM} assumes that each line corresponds to a sentence, regardless of the presence of punctuation inside or at the end of the line.} \IMPORTANT{Start and end symbols ({\tt <s>} and {\tt </s>}) should be considered reserved symbols, and used only as sentence boundaries.} \subsection{Computation of $n$-gram statistics} \noindent You can now collect $n$-gram statistics for your training data (3-gram in this example) by running the command: \begin{verbatim} $> ngt -i=train.txt.se -n=3 -o=train.www -b=yes \end{verbatim} \noindent The $n$-gram counts are saved in the binary file "train.www". \subsection{Estimation of the LM} \noindent You can now estimate an $n$-gram LM (3-gram LM in this example) smoothed according to the Linear Witten Bell method by running the command: \begin{verbatim} $> tlm -tr=train.www -n=3 -lm=LinearWittenBell -obin=train.blm \end{verbatim} \noindent The estimated LM is saved in the binary file "train.blm". \subsection{Computation of the Perplexity} \noindent With the estimated LM, you can now compute the perplexity of any text contained in "test.txt" by running the commands below.
\noindent To be compliant with the training data actually used to estimate the LM, start and end symbols are added to the text as well. \begin{verbatim} $> cat test.txt | add-start-end.sh > test.txt.se $> compile-lm train.blm --eval=test.txt.se \end{verbatim} \noindent which produces the output: \begin{verbatim} %% Nw=1009 PP=8547.90 PPwp=6870.51 Nbo=983 Noov=119 OOV=11.79% \end{verbatim} \noindent The output shows the number of words ({\tt Nw}), the LM perplexity ({\tt PP}), the portion of PP due to the out-of-vocabulary words ({\tt PPwp}), the amount of backoff calls({\tt Nbo}) required for computing PP, the amount of out-of-vocabulary words ({\tt Noov}), and the out-of-vocabulary rate ({\tt OOV}). irstlm-6.00.05/doc/giganticLM.tex000066400000000000000000000064721263213470300165130ustar00rootroot00000000000000LM estimation starts with the collection of n-grams and their frequency counters. Then, smoothing parameters are estimated for each n-gram level; infrequent n-grams are possibly pruned and, finally, a LM file is created containing n-grams with probabilities and back-off weights. This procedure can be very demanding in terms of memory and time if it applied on huge corpora. We provide here a way to split LM training into smaller and independent steps, that can be easily distributed among independent processes. The procedure relies on a training scripts that makes little use of computer RAM and implements the Witten-Bell smoothing method in an exact way. \noindent Before starting, let us create a working directory under {\tt examples}, as many files will be created: \begin{verbatim} $> mkdir stat \end{verbatim} The script to generate the LM is: \begin{verbatim} $> build-lm.sh -i "gunzip -c train.gz" -n 3 -o train.ilm.gz -k 5 \end{verbatim} where the available options are: \begin{verbatim} -i Input training file e.g. 'gunzip -c train.gz' -o Output gzipped LM, e.g. 
lm.gz -k Number of splits (default 5) -n Order of language model (default 3) -t Directory for temporary files (default ./stat) -p Prune singleton n-grams (default false) -s Smoothing: witten-bell (default), kneser-ney, improved-kneser-ney -b Include sentence boundary n-grams (optional) -d Define subdictionary for n-grams (optional) -v Verbose \end{verbatim} \noindent The script splits the estimation procedure into 5 distinct jobs, which are explained in the following section. There are other options that can be used. We recommend, for instance, pruning singletons to get smaller LM files. Notice that {\tt build-lm.sh} produces a LM file {\tt train.ilm.gz} that is NOT in the final ARPA format, but in an intermediate format called {\tt iARPA}, that is recognized by the {\tt compile-lm} command and by the Moses SMT decoder running with {\IRSTLM}. To convert the file into the standard ARPA format you can use the command: \begin{verbatim} $> compile-lm train.ilm.gz --text yes train.lm \end{verbatim} This will create the proper ARPA file {\tt train.lm}. To create a gzipped file you might also use: \begin{verbatim} $> compile-lm train.ilm.gz --text yes /dev/stdout | gzip -c > train.lm.gz \end{verbatim} \noindent In the following sections, we will discuss LM file formats, compiling LMs into a more compact and efficient binary format, and querying LMs.
\subsection{Estimating a LM with a Partial Dictionary} A sub-dictionary can be defined by just taking words occurring more than 5 times ({\tt -pf=5}) and at most the top frequent 5000 words ({\tt -pr=5000}): \begin{verbatim} $>dict -i="gunzip -c train.gz" -o=sdict -pr=5000 -pf=5 \end{verbatim} \noindent The LM can be restricted to the defined sub-dictionary with the command {\tt build-lm.sh} by using the option {\tt -d}: \begin{verbatim} $> build-lm.sh -i "gunzip -c train.gz" -n 3 -o train.ilm.gz -k 5 -p -d sdict \end{verbatim} \noindent Notice that all words outside the sub-dictionary will be mapped into the {\tt } class, the probability of which will be directly estimated from the corpus statistics. A preferable alternative to this approach is to estimate a large LM and then to filter it according to a list of words (see Filtering a LM). irstlm-6.00.05/doc/installation.tex000066400000000000000000000063041263213470300171700ustar00rootroot00000000000000\IMPORTANT{The installation procedure has been tested using the {\tt bash} shell on the following operating systems: Mac OSx 10.6.8 (Snow Leopard), Ubuntu 14.04 LTS (trusty), Scientific Linux release 6.3 (carbon).} \noindent In order to install {\IRSTLM} on your machine, please perform the following steps. \subsection{Step 0: Preparation of the Configuration Scripts} Run the following command to prepare up-to-date configuration scripts. \begin{verbatim} $> ./regenerate-makefiles.sh [--force] \end{verbatim} \WARNING{Run with the "--force" parameter if you want to recreate all links to the autotools.} \subsection{Step 1: Configuration of the Compilation} Run the following command to prepare up-to-date compilation scripts, and to optionally set the installation directory (parameter "{\tt -prefix}". \begin{verbatim} $> ./configure [--prefix=/path/to/install] [optional-parameters] \end{verbatim} You can set other optional parameters to modify the standard compilation behavior. 
\begin{verbatim} --enable-doc|--disable-doc Enable or Disable (default) creation of documentation --enable-trace|--disable-trace Enable (default) or Disable trace info at run-time --enable-debugging|--disable-debugging Enable or Disable (default) debugging info ("-g -O2") --enable-profiling|--disable-profiling Enable or Disable (default) profiling info --enable-caching|--disable-caching Enable or Disable (default) internal caches to store probs and other info --enable-interpolatedsearch|--disable-interpolatedsearch Enable or Disable (default) interpolated search for n-grams --enable-optimization|--disable-optimization Enable or Disable (default) C++ optimization info ("-O3") \end{verbatim} \noindent Run the following command to get more details on the compilation options. \begin{verbatim} $> configure --help \end{verbatim} \subsection{Step 2: Compilation} \begin{verbatim} $> make clean $> make \end{verbatim} \subsection{Step 3: Installation} \begin{verbatim} $> make install \end{verbatim} \noindent Libraries and commands are generated, respectively, under the directories\newline {\tt /path/to/install/lib} and {\tt /path/to/install/bin}. \noindent If enabled and PdfLatex is installed, this user manual (in pdf) is generated under the directory\newline {\tt /path/to/install/doc}. \noindent Although caching is not enabled by default, it is highly recommended to activate through its compilation flag "{\tt --enable-caching}". %See Section~\ref{sec:caching} to learn more. \subsection{Step 4: Environment Settings} Set the environment variable {\tt IRSTLM} to {\tt /path/to/install}. \noindent Include the command directory {\tt /path/to/install/bin} into your environment variable {\tt PATH}. 
For instance, you can run the following commands: \begin{verbatim} $> export IRSTLM=/path/to/install/ $> export PATH=${IRSTLM}/bin:${PATH} \end{verbatim} \subsection{Step 5: Regression Tests} If the installation procedure succeeds, you can also run the regression tests to double-check the integrity of the software. Please go to Section~\ref{sec:regressionTests} to learn how to run the regression tests. \noindent Regression tests should also be run after any change made to the source code. irstlm-6.00.05/doc/interpolateLM.tex000066400000000000000000000000001263213470300172310ustar00rootroot00000000000000irstlm-6.00.05/doc/interpolatedLM.tex000066400000000000000000000000001263213470300173750ustar00rootroot00000000000000irstlm-6.00.05/doc/introduction.tex000066400000000000000000000033711263213470300172110ustar00rootroot00000000000000This manual illustrates the functionalities of the IRST Language Modeling (LM) toolkit ({\IRSTLM}). It should put you quickly in the condition of: \begin{itemize} \item extracting the dictionary from a corpus \item extracting n-gram statistics from it \item estimating n-gram LMs using different smoothing criteria \item saving a LM into several textual and binary file formats \item adapting a LM on task-specific data \item estimating and handling gigantic LMs \item pruning a LM \item reducing LM size through quantization \item querying a LM through a command or script \end{itemize} \noindent {\IRSTLM} features very efficient algorithms and data structures suitable to estimate, store, and access very large LMs. \noindent {\IRSTLM} provides adaptation methods to effectively adapt a generic LM to a specific task when only little task-related data are available. \noindent {\IRSTLM} provides standalone programs for all its functionalities, as well as a library for its exploitation in other software, such as speech recognizers, machine translation decoders, and POS taggers.
\noindent {\IRSTLM} has been integrated into a popular open source SMT decoder called {\tt Moses}\footnote{http://www.statmt.org/moses/}, and is compatible with LMs created with other tools, such as the SRILM Toolkit\footnote{http://www.speech.sri.com/projects/srilm}. \paragraph{Acknowledgments.}Users of this toolkit might cite in their publications: \begin{quote} M. Federico, N. Bertoldi, M. Cettolo, {\em IRSTLM: an Open Source Toolkit for Handling Large Scale Language Models}, Proceedings of Interspeech, Brisbane, Australia, pp. 1618-1621, 2008. \end{quote} \noindent References to introductory material on $n$-gram LMs are given in Appendix~\ref{sec:ReferenceMaterial}. irstlm-6.00.05/doc/irstlm-manual.log000066400000000000000000000325211263213470300172350ustar00rootroot00000000000000This is pdfTeXk, Version 3.1415926-1.40.9 (Web2C 7.5.7) (format=pdflatex 2009.2.11) 11 JAN 2015 20:10 entering extended mode %&-line parsing enabled. **irstlm-manual.tex (./irstlm-manual.tex LaTeX2e <2005/12/01> Babel and hyphenation patterns for english, usenglishmax, dumylang, noh yphenation, german-x-2008-06-18, ngerman-x-2008-06-18, ancientgreek, ibycus, ar abic, basque, bulgarian, catalan, pinyin, coptic, croatian, czech, danish, dutc h, esperanto, estonian, farsi, finnish, french, galician, german, ngerman, mono greek, greek, hungarian, icelandic, indonesian, interlingua, irish, italian, la tin, lithuanian, mongolian, mongolian2a, bokmal, nynorsk, polish, portuguese, r omanian, russian, sanskrit, serbian, slovak, slovenian, spanish, swedish, turki sh, ukenglish, ukrainian, uppersorbian, welsh, loaded.
(/usr/local/texlive/2008/texmf-dist/tex/latex/base/article.cls Document Class: article 2005/09/16 v1.4f Standard LaTeX document class (/usr/local/texlive/2008/texmf-dist/tex/latex/base/size11.clo File: size11.clo 2005/09/16 v1.4f Standard LaTeX file (size option) ) \c@part=\count79 \c@section=\count80 \c@subsection=\count81 \c@subsubsection=\count82 \c@paragraph=\count83 \c@subparagraph=\count84 \c@figure=\count85 \c@table=\count86 \abovecaptionskip=\skip41 \belowcaptionskip=\skip42 \bibindent=\dimen102 ) (/usr/local/texlive/2008/texmf-dist/tex/latex/preprint/fullpage.sty Package: fullpage 1999/02/23 1.1 (PWD) \FP@margin=\skip43 ) (/usr/local/texlive/2008/texmf-dist/tex/latex/psnfss/times.sty Package: times 2005/04/12 PSNFSS-v9.2a (SPQR) ) (/usr/local/texlive/2008/texmf-dist/tex/latex/base/latexsym.sty Package: latexsym 1998/08/17 v2.2e Standard LaTeX package (lasy symbols) \symlasy=\mathgroup4 LaTeX Font Info: Overwriting symbol font `lasy' in version `bold' (Font) U/lasy/m/n --> U/lasy/b/n on input line 47. 
) (/usr/local/texlive/2008/texmf-dist/tex/generic/epsf/epsf.sty This is `epsf.tex' v2.7.3 <23 July 2005> \epsffilein=\read1 \epsfframemargin=\dimen103 \epsfframethickness=\dimen104 \epsfrsize=\dimen105 \epsftmp=\dimen106 \epsftsize=\dimen107 \epsfxsize=\dimen108 \epsfysize=\dimen109 \pspoints=\dimen110 ) (/usr/local/texlive/2008/texmf-dist/tex/latex/graphics/graphicx.sty Package: graphicx 1999/02/16 v1.0f Enhanced LaTeX Graphics (DPC,SPQR) (/usr/local/texlive/2008/texmf-dist/tex/latex/graphics/keyval.sty Package: keyval 1999/03/16 v1.13 key=value parser (DPC) \KV@toks@=\toks14 ) (/usr/local/texlive/2008/texmf-dist/tex/latex/graphics/graphics.sty Package: graphics 2006/02/20 v1.0o Standard LaTeX Graphics (DPC,SPQR) (/usr/local/texlive/2008/texmf-dist/tex/latex/graphics/trig.sty Package: trig 1999/03/16 v1.09 sin cos tan (DPC) ) (/usr/local/texlive/2008/texmf/tex/latex/config/graphics.cfg File: graphics.cfg 2007/01/18 v1.5 graphics configuration of teTeX/TeXLive ) Package graphics Info: Driver file: pdftex.def on input line 90. (/usr/local/texlive/2008/texmf-dist/tex/latex/pdftex-def/pdftex.def File: pdftex.def 2008/09/08 v0.04l Graphics/color for pdfTeX \Gread@gobject=\count87 )) \Gin@req@height=\dimen111 \Gin@req@width=\dimen112 ) (/usr/local/texlive/2008/texmf-dist/tex/latex/ltxmisc/version.sty) (/usr/local/texlive/2008/texmf-dist/tex/latex/graphics/color.sty Package: color 2005/11/14 v1.0j Standard LaTeX Color (DPC) (/usr/local/texlive/2008/texmf/tex/latex/config/color.cfg File: color.cfg 2007/01/18 v1.5 color configuration of teTeX/TeXLive ) Package color Info: Driver file: pdftex.def on input line 130. 
(/usr/local/texlive/2008/texmf-dist/tex/latex/graphics/dvipsnam.def File: dvipsnam.def 1999/02/16 v3.0i Driver-dependant file (DPC,SPQR) )) (/usr/local/texlive/2008/texmf-dist/tex/latex/ltxmisc/framed.sty Package: framed 2007/10/04 v 0.95: framed or shaded text with page breaks \fb@frw=\dimen113 \fb@frh=\dimen114 \FrameRule=\dimen115 \FrameSep=\dimen116 ) No file irstlm-manual.aux. \openout1 = `irstlm-manual.aux'. LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 27. LaTeX Font Info: ... okay on input line 27. LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 27. LaTeX Font Info: ... okay on input line 27. LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 27. LaTeX Font Info: ... okay on input line 27. LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 27. LaTeX Font Info: ... okay on input line 27. LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 27. LaTeX Font Info: ... okay on input line 27. LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 27. LaTeX Font Info: ... okay on input line 27. LaTeX Font Info: Try loading font information for OT1+ptm on input line 27. (/usr/local/texlive/2008/texmf-dist/tex/latex/psnfss/ot1ptm.fd File: ot1ptm.fd 2001/06/04 font definitions for OT1/ptm. 
) (/usr/local/texlive/2008/texmf-dist/doc/pdftex/manual/samplepdf/supp-pdf.tex (/usr/local/texlive/2008/texmf-dist/doc/pdftex/manual/samplepdf/supp-mis.tex loading : Context Support Macros / Miscellaneous (2004.10.26) \protectiondepth=\count88 \scratchcounter=\count89 \scratchtoks=\toks15 \scratchdimen=\dimen117 \scratchskip=\skip44 \scratchmuskip=\muskip10 \scratchbox=\box26 \scratchread=\read2 \scratchwrite=\write3 \zeropoint=\dimen118 \onepoint=\dimen119 \onebasepoint=\dimen120 \minusone=\count90 \thousandpoint=\dimen121 \onerealpoint=\dimen122 \emptytoks=\toks16 \nextbox=\box27 \nextdepth=\dimen123 \everyline=\toks17 \!!counta=\count91 \!!countb=\count92 \recursecounter=\count93 ) loading : Context Support Macros / PDF (2004.03.26) \nofMPsegments=\count94 \nofMParguments=\count95 \MPscratchCnt=\count96 \MPscratchDim=\dimen124 \MPnumerator=\count97 \everyMPtoPDFconversion=\toks18 ) LaTeX Font Info: External font `cmex10' loaded for size (Font) <12> on input line 34. LaTeX Font Info: External font `cmex10' loaded for size (Font) <8> on input line 34. LaTeX Font Info: External font `cmex10' loaded for size (Font) <6> on input line 34. LaTeX Font Info: Try loading font information for U+lasy on input line 34. (/usr/local/texlive/2008/texmf-dist/tex/latex/base/ulasy.fd File: ulasy.fd 1998/08/17 v2.2e LaTeX symbol font definitions ) (..//RELEASE) LaTeX Font Info: Font shape `OT1/ptm/bx/n' in size <10.95> not available (Font) Font shape `OT1/ptm/b/n' tried instead on input line 40. LaTeX Font Info: External font `cmex10' loaded for size (Font) <10.95> on input line 51. LaTeX Font Info: External font `cmex10' loaded for size (Font) <9> on input line 51. LaTeX Font Info: External font `cmex10' loaded for size (Font) <5> on input line 51. LaTeX Font Info: Try loading font information for OT1+pcr on input line 51. (/usr/local/texlive/2008/texmf-dist/tex/latex/psnfss/ot1pcr.fd File: ot1pcr.fd 2001/06/04 font definitions for OT1/pcr. 
) [1 {/usr/local/texlive/2008/texmf-var/fonts/map/pdftex/updmap/pdftex.map}] LaTeX Font Info: Font shape `OT1/ptm/bx/n' in size <14.4> not available (Font) Font shape `OT1/ptm/b/n' tried instead on input line 64. No file irstlm-manual.toc. \tf@toc=\write4 \openout4 = `irstlm-manual.toc'. [2] (./introduction.tex LaTeX Font Info: Try loading font information for OMS+ptm on input line 4. (/usr/local/texlive/2008/texmf-dist/tex/latex/psnfss/omsptm.fd File: omsptm.fd ) LaTeX Font Info: Font shape `OMS/ptm/m/n' in size <10.95> not available (Font) Font shape `OMS/cmsy/m/n' tried instead on input line 4. LaTeX Warning: Reference `sec:ReferenceMaterial' on page 3 undefined on input l ine 34. ) [3] (./installation.tex LaTeX Font Info: Font shape `OT1/ptm/bx/n' in size <12> not available (Font) Font shape `OT1/ptm/b/n' tried instead on input line 6. [4] LaTeX Warning: Reference `sec:caching' on page 5 undefined on input line 65. LaTeX Warning: Reference `sec:regressionTests' on page 5 undefined on input lin e 84. ) [5] (./gettingStarted.tex LaTeX Warning: Reference `sec:commands' on page 6 undefined on input line 8. LaTeX Warning: Reference `sec:functions' on page 6 undefined on input line 9. Underfull \hbox (badness 10000) in paragraph at lines 14--14 \OT1/ptm/m/n/10.95 All pro-grams as-sume that the en-vi-ron-ment vari-able \OT1 /ptm/b/n/10.95 IRSTLM \OT1/ptm/m/n/10.95 is cor-rectly set to [] [6]) [7] (./LMFileFormats.tex [8] [9] LaTeX Font Info: Try loading font information for OMS+pcr on input line 107. (/usr/local/texlive/2008/texmf-dist/tex/latex/psnfss/omspcr.fd File: omspcr.fd ) LaTeX Font Info: Font shape `OMS/pcr/m/n' in size <10.95> not available (Font) Font shape `OMS/cmsy/m/n' tried instead on input line 107. 
[10] [11] [12]) [13] (./LMsmoothing.tex) (./mixtureLM.tex) (./interpolatedLM.tex) [14] (./ClassAndChunkLMs.tex Overfull \hbox (16.4244pt too wide) in paragraph at lines 28--28 [] \OT1/pcr/m/n/10.95 is a file containing a LM (format compatible with {\IRSTLM})[] [] [15] [16] Overfull \hbox (3.28441pt too wide) in paragraph at lines 175--175 []\OT1/pcr/m/n/10.95 $> compile-lm --eval test/test.w-micro cfgfile/cfg.2ndfld- map-cllps -d=1[] [] ) [17] (./dict.tex Overfull \hbox (11.94783pt too wide) in paragraph at lines 40--41 []\OT1/ptm/m/n/10.95 The in-put text can be also gen-er-ated on the fly by pass -ing a com-mand as value of the pa-ram-e-ter\OT1/pcr/m/n/10.95 InputFile [] [18] [19]) (./ngt.tex) (./tlm.tex [20] [21]) (./compileLM.tex) (./interpolateLM.tex) (./pruneLM.tex) (./quantizeLM.tex) (./LMAdaptation.tex Overfull \hbox (10.24498pt too wide) in paragraph at lines 29--29 []\OT1/pcr/m/n/10 $> tlm -tr=train.www -lm=wb -n=3 -te=test -dub=1000000 -ad=ad apt -ar=0.8 -ao=yes[] [] Overfull \hbox (4.24498pt too wide) in paragraph at lines 43--43 []\OT1/pcr/m/n/10 $> tlm -tr=train-adapt.www -lm=msb -n=3 -te=test -dub=1000000 -ad=adapt -ar=0.8[] [] [22]) (./giganticLM.tex [23] Overfull \hbox (9.8544pt too wide) in paragraph at lines 51--51 []\OT1/pcr/m/n/10.95 $> compile-lm train.ilm.gz --text yes /dev/stdout | gzip - c > train.lm.gz[] [] Overfull \hbox (42.70436pt too wide) in paragraph at lines 72--72 []\OT1/pcr/m/n/10.95 $> build-lm.sh -i "gunzip -c train.gz" -n 3 -o train.ilm .gz -k 5 -p -d sdict[] [] ) [24] (./LMPruning.tex) [25] (./LMQuantization.tex) [26] (./LMCompilation.tex Underfull \hbox (badness 10000) in paragraph at lines 7--12 [] ) [27] (./LMInterpolation.tex Underfull \hbox (badness 10000) in paragraph at lines 19--23 [] ) [28] (./LMFiltering.tex) [29] (./ParallelComputation.tex LaTeX Warning: Reference `sec:giganticLM' on page 30 undefined on input line 17 . 
) [30] (./LMInterface.tex Underfull \hbox (badness 10000) in paragraph at lines 33--35 [] Overfull \hbox (16.4244pt too wide) in paragraph at lines 51--51 []\OT1/pcr/m/n/10.95 $> tlm -tr=train.www -n=3 -lm=wb -te=test -o=train.lm -ps= no -dub=10000000[] [] Overfull \hbox (4.24498pt too wide) in paragraph at lines 74--74 []\OT1/pcr/m/n/10 %% sent_Nw=1 sent_PP=23.22 sent_PPwp=0.00 sent_Nbo=0 sent_Noo v=0 sent_OOV=0.00%[] [] Overfull \hbox (40.24498pt too wide) in paragraph at lines 74--74 []\OT1/pcr/m/n/10 %% sent_Nw=8 sent_PP=7489.50 sent_PPwp=7356.27 sent_Nbo=7 sen t_Noov=2 sent_OOV=25.00%[] [] Overfull \hbox (22.24498pt too wide) in paragraph at lines 74--74 []\OT1/pcr/m/n/10 %% sent_Nw=9 sent_PP=1231.44 sent_PPwp=0.00 sent_Nbo=14 sent_ Noov=0 sent_OOV=0.00%[] [] Overfull \hbox (58.24498pt too wide) in paragraph at lines 74--74 []\OT1/pcr/m/n/10 %% sent_Nw=6 sent_PP=27759.10 sent_PPwp=25867.42 sent_Nbo=19 sent_Noov=1 sent_OOV=16.67%[] [] [31] Overfull \hbox (34.24498pt too wide) in paragraph at lines 74--74 []\OT1/pcr/m/n/10 %% sent_Nw=5 sent_PP=378.38 sent_PPwp=0.00 sent_Nbo=39893 sen t_Noov=0 sent_OOV=0.00%[] [] Overfull \hbox (64.24498pt too wide) in paragraph at lines 74--74 []\OT1/pcr/m/n/10 %% sent_Nw=15 sent_PP=4300.44 sent_PPwp=2831.89 sent_Nbo=3990 7 sent_Noov=1 sent_OOV=6.67%[] [] ) [32] (./regressionTests.tex) [33] (./referenceMaterial.tex) [34] (./releaseNotes.tex [35] [36] [37] [38]) [39] (./irstlm-manual.aux) LaTeX Warning: There were undefined references. LaTeX Warning: Label(s) may have changed. Rerun to get cross-references right. 
) Here is how much of TeX's memory you used: 1619 strings out of 493876 22592 string characters out of 1150568 72741 words of memory out of 3000000 4790 multiletter control sequences out of 10000+50000 27471 words of font info for 63 fonts, out of 3000000 for 5000 714 hyphenation exceptions out of 8191 25i,8n,19p,394b,297s stack positions out of 5000i,500n,10000p,200000b,50000s {/usr/local/texlive/2008/texmf-dist/fonts/enc/dvips/base/8r.enc} Output written on irstlm-manual.pdf (39 pages, 155611 bytes). PDF statistics: 166 PDF objects out of 1000 (max. 8388607) 0 named destinations out of 1000 (max. 131072) 1 words of extra memory for PDF output out of 10000 (max. 10000000) irstlm-6.00.05/doc/irstlm-manual.tex000066400000000000000000000106161263213470300172550ustar00rootroot00000000000000\documentclass[11pt]{article} \usepackage{fullpage} \usepackage{times} \usepackage{latexsym} \usepackage{epsf} \usepackage{graphicx} \usepackage{version} \usepackage[usenames,dvipsnames]{color} %\usepackage{mdframed} \usepackage{framed} \newcommand{\IRSTLM}{{\bf IRSTLM Toolkit}} \newcommand*{\MyPath}{../} \newcommand{\versionnumber}{\input{\MyPath/RELEASE}} %\newcommand{\IMPORTANT}[1]{\begin{mdframed}[linecolor=red]\noindent #1\end{mdframed}} \newcommand{\IMPORTANT}[1]{\begin{framed}\noindent #1\end{framed}} \newcommand{\WARNING}[1]{\paragraph{Warning:} #1} \newcommand{\NOTE}[1]{\textcolor{red}{\bf Note}: #1} \newcommand{\COMMENT}[1]{} \def\thesubsubsection{\thesubsection.\alph{subsubsection}} \begin{document} \title{IRST Language Modeling Toolkit \\USER MANUAL} \author{M. Federico, N. Bertoldi, M. Cettolo\\FBK-irst, Trento, Italy} \date{\today} \maketitle \centerline{Version \versionnumber} %% INTRODUCTION %%%% \vspace*{3cm} \noindent The official website of {\IRSTLM} is \bigskip {\bf http://hlt.fbk.eu/en/irstlm} \bigskip \noindent It contains this manual, source code, examples and regression tests. 
\vspace*{1cm} \noindent {\IRSTLM} is distributed under the GNU General Public License version 3 (GPLv3).\footnote{\tt http://www.gnu.org/licenses/gpl-3.0.html} \vspace*{1cm} \noindent Users of {\IRSTLM} might cite in their publications: \begin{quote} M. Federico, N. Bertoldi, M. Cettolo, {\em IRSTLM: an Open Source Toolkit for Handling Large Scale Language Models}, Proceedings of Interspeech, Brisbane, Australia, pp. 1618-1621, 2008. \end{quote} \newpage \setcounter{tocdepth}{2} \tableofcontents %%%% INTRODUCTION %%%%%%%%% \newpage \section{Introduction} \label{sec:introduction} \input{introduction} %%%% INSTALLATION %%%%%%%%% \newpage \section{Installation} \label{sec:installation} \input{installation} %%%% GETTING STARTED %%%%%%%%% \newpage \section{Getting started} \label{sec:gettingStarted} \input{gettingStarted} %%%% LM FORMATS %%%%%%%%%%% \newpage \section{LM File Formats} \label{sec:LMFileFormats} \input{LMFileFormats} %%%% LM TYPES %%%%%%%%% \newpage \section{LM Types} \label{sec:LMTypes} %%%% LM SMOOTHING %%%%%%%%% \subsection{LM smoothing} \label{sec:LMSmoothing} \input{LMSmoothing} %%%% MIXTURE LM %%%%%%%%% \subsection{Mixture LM} \label{sec:mixtureLM} \input{mixtureLM} %%%% INTERPOLATED LM %%%%%%%%% \subsection{Interpolated LM} \label{sec:InterpolatedLM} \input{interpolatedLM} %%%% CHUNK LM %%%%%%%%% \newpage \subsection{Class and Chunk LMs} \label{sec:ClassAndChunkLMs} \input{ClassAndChunkLMs} %%%% IRSTLM COMMANDS %%%%%%%%% \newpage \section{IRSTLM commands} \label{sec:commands} \subsection{dict} \label{sec:dict} \input{dict} \subsection{ngt} \label{sec:ngt} \input{ngt} \subsection{tlm} \label{sec:tlm} \input{tlm} \subsection{compile-lm} \label{sec:compileLM} \input{compileLM} \subsection{interpolate-lm} \label{sec:interpolateLM} \input{interpolateLM} \subsection{prune-lm} \label{sec:pruneLM} \input{pruneLM} \subsection{quantize-lm} \label{sec:quantizeLM} \input{quantizeLM} %% LM ADAPTATION %%%% \section{IRSTLM functions} \label{sec:functions} 
\subsection{LM Adaptation} \label{sec:LMAdaptation} \input{LMAdaptation} %% ESTIMATING GIGANTIC LMs %%%% \subsection{Estimating Gigantic LMs} \label{sec:giganticLM} \input{giganticLM} %%%% LM PRUNING %%%%% \newpage \subsection{LM Pruning} \label{sec:LMPruning} \input{LMPruning} %%%% LM QUANTIZATION %%%%% \newpage \subsection{LM Quantization} \label{sec:LMQuantization} \input{LMQuantization} %%%% LM COMPILATION %%%%% \newpage \subsection{LM Compilation} \label{sec:LMCompilation} \input{LMCompilation} %%%% LM INTERPOLATION %%%%%%%%% \newpage \subsection{LM Interpolation} \label{sec:LMInterpolation} \input{LMInterpolation} \newpage \subsection{Filtering a LM} \label{sec:LMFiltering} \input{LMFiltering} %%%% PARALLEL COMPUTATION %%%%%%%%% \newpage \section{Parallel Computation} \label{sec:ParallelComputation} \input{parallelComputation} %%%% LM INTERFACE %%%%%%%%% \newpage \section{IRSTLM Interface} \label{sec:LMInterface} \input{LMInterface} %%%% REGRESSION TESTS %%%%%%%%% \newpage \section{Regression Tests} \label{sec:regressionTests} \input{regressionTests} %%%% APPENDIX %%%%%%%%% \appendix \newpage \section{Reference Material} \label{sec:ReferenceMaterial} \input{referenceMaterial} \newpage \section{Release Notes} \label{sec:releaseNotes} \input{releaseNotes} \end{document} irstlm-6.00.05/doc/mdframed.sty000066400000000000000000001367231263213470300162760ustar00rootroot00000000000000%% This is file `mdframed.sty', %% generated with the docstrip utility. %% %% The original source files were: %% %% mdframed.dtx (with options: `package') %% ---------------------------------------------------------------- %% Working with the command fbox or fcolorbox, one has to %% handle page breaks by hand. The present package defines the %% environment mdframed which automatically deals with page breaks. 
%%
%% Author's name: Marco Daniel and Elke Schubert (!new)
%% License type:  lppl
%%
%% ==================================================
%% ========Is based on the idea of framed.sty========
%% ==================================================
%% ===== Currently the package has a beta-Status ====
%% ==================================================
%% WITH THANKS TO (alphabetically):
%%    ROLF NIEPRASCHK
%%    HEIKO OBERDIEK
%%    HERBERT VOSS
%%
%% Copyright (c) 2010 Marco Daniel
%%
%% This package may be distributed under the terms of the LaTeX Project
%% Public License, as described in lppl.txt in the base LaTeX distribution.
%% Either version 1.0 or, at your option, any later version.
%%
%%
%% =================================================
%% Creates a frame that does not insert a horizontal
%% rule where the frame is split at a page break
%% >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
%%          _______________
%%          |    page 1   |
%%          |     Text    |
%%          |   __Text__  |
%%          |   | Text |  |
%%         P A G E B R E A K
%%          |   | Text |  |
%%          |   |_Text_|  |
%%          |     Text    |
%%          |____page 2___|
%%
%% >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
%% ==================================================
%%
% Package version string and package name; both are reused in the
% \ProvidesPackage banner and in every diagnostic message below.
\def\mdversion{v1.6b}
\def\mdframedpackagename{mdframed}
% Picks the date out of an SVN "$Id: file rev YYYY-MM-DD time user $" keyword
% and rewrites it as YYYY/MM/DD followed by a space. The spaces in the
% parameter text are delimiters and must match the keyword exactly.
\def\mdf@maindate@svn$#1: #2 #3 #4-#5-#6 #7 #8${#4/#5/#6\space }
\NeedsTeXFormat{LaTeX2e}
\ProvidesPackage{mdframed}%
         [\mdf@maindate@svn$Id: mdframed.dtx 426 2012-06-02 12:18:56Z marco $%
          \mdversion: \mdframedpackagename]
% Thin wrappers that route diagnostics through the LaTeX package message
% commands with the package name already filled in.
% NOTE(review): \PackageError is documented as taking three arguments
% (package, message, help text) while this wrapper forwards only two, so the
% token following a call would be consumed as the help text -- confirm
% against the LaTeX kernel before relying on what follows a call site.
\newcommand*\mdf@PackageError[1]{\PackageError{\mdframedpackagename}{#1}}
\newcommand*\mdf@PackageWarning[1]{\PackageWarning{\mdframedpackagename}{#1}}
\newcommand*\mdf@PackageInfo[1]{\PackageInfo{\mdframedpackagename}{#1}}
% \mdf@LoadFile@IfExist{<name>}
% Loads package <name> when <name>.sty can be found; otherwise only emits a
% warning instead of aborting.
\newcommand*\mdf@LoadFile@IfExist[1]{%
  \IfFileExists{#1.sty}{%
    \RequirePackage{#1}%
  }{%
    \mdf@PackageWarning{The file #1 does not exist\MessageBreak but needed by \mdframedpackagename\MessageBreak see documentation fo further information }%
  }
}
% Hard dependencies. etoolbox is requested with a minimum release date.
\RequirePackage{kvoptions}
\RequirePackage{xparse}
\RequirePackage{etoolbox}[2011/01/03]
\RequirePackage{zref-abspage} \RequirePackage{color} \SetupKeyvalOptions{family=mdf,prefix=mdf@} \newlength{\mdf@templength} \def\mdf@iflength#1{% \afterassignment\mdf@iflength@check% \mdf@templength=#1\mdf@defaultunit\relax\relax \expandafter\endgroup\next } \def\mdf@iflength@check#1{% \begingroup \ifx\relax#1\@empty \def\next{\@secondoftwo} \else \def\next{\@firstoftwo} \expandafter\mdf@iflength@cleanup \fi } \def\mdf@iflength@cleanup#1\relax{} \DeclareListParser*{\mdf@dolist}{,} \newrobustcmd*{\mdf@option@length}[2]{% \expandafter\newlength\csname mdf@#1@length\endcsname% \expandafter\setlength\csname mdf@#1@length\endcsname{#2}% } \newrobustcmd*{\mdf@define@key@length}[1]{% \define@key{mdf}{#1}{% \def\@tempa{##1} \mdf@iflength{\@tempa}% {\csxdef{mdfl@#1}{\the\mdf@templength}}% {\csxdef{mdfl@#1}{\the\mdf@templength}}% \setlength{\csname mdf@#1@length\endcsname}{\csname mdfl@#1\endcsname}% }% } \def\mdf@do@lengthoption#1{% \mdf@lengthoption@doubledo#1\@nil% } \def\mdf@lengthoption@doubledo#1==#2\@nil{% \mdf@option@length{#1}{#2}% \mdf@define@key@length{#1}% } \def\mdf@do@stringoption#1{% \mdf@stringoption@doubledo#1\@nil% } \def\mdf@stringoption@doubledo#1==#2\@nil{% \expandafter\gdef\csname mdf@#1\endcsname{#2}% \define@key{mdf}{#1}{% \csdef{mdf@#1}{##1}% }% } \def\mdf@do@booloption#1{% \mdf@booloption@doubledo#1\@nil% } \def\mdf@booloption@doubledo#1==#2\@nil{% \newbool{mdf@#1}\setbool{mdf@#1}{#2}% \define@key{mdf}{#1}[#2]{% \setbool{mdf@#1}{##1}% }% } \def\mdf@do@alignoption#1{% \mdf@alignoption@tripledo#1\@nil% } \def\mdf@alignoption@tripledo#1==#2==#3\@nil{% \csdef{mdf@align@#1@left}{\null\hspace*{#2}}% \csdef{mdf@align@#1@right}{\hspace*{#3}\null}% } \newcounter{mdf@globalstyle@cnt} \defcounter{mdf@globalstyle@cnt}{0} \newcommand*\mdfglobal@style{0} \define@key{mdf}{style}{% \mdf@PackageWarning{package option style is depreciated^^J use framemethod instead\MessageBreak}% \renewcommand*\mdfglobal@style{#1}% \defcounter{mdf@globalstyle@cnt}{#1}% 
\ifcase\value{mdf@globalstyle@cnt}\relax \or\mdf@LoadFile@IfExist{tikz}%=1 \or\mdf@LoadFile@IfExist{pstricks-add}%=2 \or\defcounter{mdf@globalstyle@cnt}{2}%=3 \mdf@LoadFile@IfExist{pst-node}% \or\mdf@LoadFile@IfExist{pst-node}%=4 \else%%>4 \mdf@PackageWarning{Unknown global style \value{mdf@globalstyle@cnt}}% \fi% } \providecommand*\mdf@framemethod{} \def\mdf@framemethod@i{}% \def\mdf@framemethod@ii{}% \def\mdf@framemethod@iii{}% \define@key{mdf}{framemethod}[default]{% \lowercase{\def\mdf@tempa{#1}}%lowercase not expandable \forcsvlist{\listadd\mdf@framemethod@i}{default,tex,latex,none,0} \forcsvlist{\listadd\mdf@framemethod@ii}{pgf,tikz,1} \forcsvlist{\listadd\mdf@framemethod@iii}{pstricks,ps,2,postscript} \xifinlist{\mdf@tempa}{\mdf@framemethod@i}% {\def\mdf@@framemethod{default}\defcounter{mdf@globalstyle@cnt}{0}}% {\xifinlist{\mdf@tempa}{\mdf@framemethod@ii}% {\def\mdf@@framemethod{tikz}\defcounter{mdf@globalstyle@cnt}{1}}% {\xifinlist{\mdf@tempa}{\mdf@framemethod@iii}% {\def\mdf@@framemethod{pstricks}\defcounter{mdf@globalstyle@cnt}{2}}% {\mdf@LoadFile@IfExist{#1}}% }% }% \ifcase\value{mdf@globalstyle@cnt}\relax% \or\mdf@LoadFile@IfExist{tikz}%=1 \or\mdf@LoadFile@IfExist{pst-node}%=2 \or\mdf@LoadFile@IfExist{pst-node}%=3 \fi% } \mdf@dolist{\mdf@do@lengthoption}{% {skipabove==\z@},% {skipbelow==\z@},% {leftmargin==\z@},% {rightmargin==\z@},% {innerleftmargin==10pt},% {innerrightmargin==10pt},% {innertopmargin==0.4\baselineskip},% {innerbottommargin==0.4\baselineskip},% {splittopskip==\z@},% {splitbottomskip==\z@},% {outermargin==\z@},% {innermargin==\z@},% {linewidth==0.4pt},% {innerlinewidth==\z@},% {middlelinewidth==\expandafter\mdf@linewidth@length},% {outerlinewidth==\z@},% {roundcorner==\z@},% {footenotedistance==\medskipamount}, {userdefinedwidth==\linewidth}, {frametitleaboveskip==5pt}, {frametitlebelowskip==5pt}, {frametitlerulewidth==.2pt}, {frametitleleftmargin==10pt},% {frametitlerightmargin==10pt},% {shadowsize==8pt},% {extratopheight==\z@},% 
{subtitleabovelinewidth==.8pt},% {subtitlebelowlinewidth==.6pt},% {subtitleaboveskip==\baselineskip},% {subtitlebelowskip==1.2\baselineskip},% {subtitleinneraboveskip==.5\baselineskip},% {subtitleinnerbelowskip==.5\baselineskip},% {subsubtitleabovelinewidth==.8pt},% {subsubtitlebelowlinewidth==.6pt},% {subsubtitleaboveskip==\baselineskip},% {subsubtitlebelowskip==1.2\baselineskip},% {subsubtitleinneraboveskip==.5\baselineskip},% {subsubtitleinnerbelowskip==.5\baselineskip},% } \mdf@dolist{\mdf@do@stringoption}{% {frametitle=={}},% {defaultunit==pt},% {linecolor==black},% {backgroundcolor==white},% {fontcolor==black},% {frametitlefontcolor==black},% {innerlinecolor==\mdf@linecolor},% {outerlinecolor==\mdf@linecolor},% {middlelinecolor==\mdf@linecolor},% {psroundlinecolor==\mdf@backgroundcolor},% {frametitlerulecolor==\mdf@linecolor}, {frametitlebackgroundcolor==\mdf@backgroundcolor},% {shadowcolor==black!50},% {settings=={}},% {frametitlesettings=={}},% {font=={}},% {frametitlefont==\normalfont\bfseries},% {printheight==none},% {alignment=={}},% {frametitlealignment=={}},% {theoremseparator=={:}},% {theoremcountersep=={.}},% {theoremtitlefont=={}},% {theoremspace=={\space}},% {singleextra=={}}, {firstextra=={}}, {middleextra=={}}, {secondextra=={}}, {subtitlefont==\normalfont\bfseries},% {subsubtitlefont==\normalfont},% {subtitlebackgroundcolor==white},% {subsubtitlebackgroundcolor==white},% {subtitleabovelinecolor==black},% {subtitlebelowlinecolor==black},% {subsubtitleabovelinecolor==black},% {subsubtitlebelowlinecolor==black},% } \mdf@dolist{\mdf@do@booloption}{% {ntheorem==false},% {topline==true},% {leftline==true},% {bottomline==true},% {rightline==true},% {frametitletopline==true},% {frametitleleftline==true},% {frametitlebottomline==true},% {frametitlerightline==true},% {frametitlerule==false},% {nobreak==false},% {footnoteinside==true},% {usetwoside==true},% {repeatframetitle==false},%Noch nicht richtig implementiert {shadow==false},% {everyline==false},% 
{ignorelastdescenders==false},% {subtitleaboveline==false}, {subtitlebelowline==false}, {subsubtitleaboveline==false}, {subsubtitlebelowline==false}, } %%special boolflag hidealllines: \newbool{mdf@hidealllines}% \define@key{mdf}{hidealllines}[false]{% \setbool{mdf@hidealllines}{#1}% \ifbool{mdf@hidealllines}{% \kvsetkeys{mdf}{leftline=false,topline=false,% rightline=false,bottomline=false}% }{}% } \mdf@dolist{\mdf@do@alignoption}{% {left==\mdf@leftmargin@length==\z@},% {center==\fill==\fill},% {right==\fill==\mdf@rightmargin@length},% {outer==\fill==\mdf@rightmargin@length},%not supported yet {outer==\mdf@leftmargin@length==\fill},%not supported yet } \newcommand*\mdf@align{}% \newcommand*\mdf@makeboxalign@left{\null\hspace*{\mdf@leftmargin@length}}% \newcommand*\mdf@makeboxalign@right{}% \define@key{mdf}{align}[left]{% \ifcsundef{mdf@align@#1@left}{% \mdf@PackageWarning{Unknown alignment #1\MessageBreak}% \letcs\mdf@makeboxalign@left{mdf@align@left@left}% \letcs\mdf@makeboxalign@right{mdf@align@left@right}% }{% \def\mdf@makeboxalign@left{\csuse{mdf@align@#1@left}}% \def\mdf@makeboxalign@right{\csuse{mdf@align@#1@right}}% }% } \def\mdf@tikzset@local{\tikzset{tikzsetting/.style={}}} \define@key{mdf}{tikzsetting}{% \def\mdf@tikzset@local{\tikzset{tikzsetting/.style={#1}}}% } \define@key{mdf}{apptotikzsetting}{% \appto\mdf@tikzset@local{#1}% } \def\mdf@psset@local{} \define@key{mdf}{pstrickssetting}{% \def\mdf@psset@local{#1} } \def\mdfpstricks@appendsettings{} \define@key{mdf}{pstricksappsetting}{% \def\mdfpstricks@appendsettings{#1}% } \def\mdf@xcolor{} \define@key{mdf}{xcolor}[]{% \def\@tempa{#1}% \@ifpackageloaded{xcolor}{% \let\mdf@xcolor\@empty %ignoriere die Eingabe der Optionen \def\@tempa{}% }{}% \ifx\relax\@tempa\relax\else \PassOptionsToPackage{\mdf@xcolor}{xcolor}% \RequirePackage{xcolor}% \fi% }% \define@key{mdf}{needspace}[\z@]{% \begingroup% \setlength{\dimen@}{#1}% \vskip\z@\@plus\dimen@% \penalty -100\vskip\z@\@plus -\dimen@% \vskip\dimen@% \penalty 
9999%
     \vskip -\dimen@%
     \vskip\z@skip % hide the previous |\vskip| from |\addvspace|
     \endgroup%
}
%% Any package option that is not a declared key of the mdf family
%% raises an error that names the offending option.
\DeclareDefaultOption{%
   \mdf@PackageError{Unknown Option '\CurrentOption' for mdframed}}
\ProcessKeyvalOptions*\relax
%% \mdfsetup{<key=value list>}: hand the list to the mdf key family.
\newrobustcmd*{\mdfsetup}{\kvsetkeys{mdf}}
%% Key style=<name>: apply the option list stored in
%% \mdf@definestyle@<name>; warn when no such style has been stored.
\define@key{mdf}{style}{%
   \ifcsundef{mdf@definestyle@#1}{%
     % The command that stores a style is \mdfdefinestyle; the message
     % previously pointed users at a non-existent \mdfdefinedstyle.
     \mdf@PackageWarning{Unknown definedstyle #1^^J
                         You have to define a style ^^J
                         via \string\mdfdefinestyle\MessageBreak
                         }%
   }%
   % Expand the stored option list once before \mdfsetup parses it.
   {\expandafter\expandafter\expandafter\mdfsetup\expandafter%
    \expandafter\expandafter{\csname mdf@definestyle@#1\endcsname}}%
}%
%% "No output" reporter: swallows its message argument.
\let\mdf@PackageNoInfo\@gobble
%% \mdf@ifstrequal@expand{<string>}{<true>}{<false>}:
%% \ifstrequal against the expansion of \mdf@printheight.
\newrobustcmd*\mdf@ifstrequal@expand{%
  \expandafter\ifstrequal\expandafter{\mdf@printheight}%
}
%% Map the value of \mdf@printheight (none|info|warning) to the reporter
%% \mdf@PackageInfoSpace = \mdf@Package<NoInfo|Info|Warning>.
\newrobustcmd*\mdf@print@space{%
   %case "none"
   \mdf@ifstrequal@expand{none}{\def\mdf@tempa{NoInfo}}{%
     %case "info"
     \mdf@ifstrequal@expand{info}{\def\mdf@tempa{Info}}{%
        %case "warning"
        \mdf@ifstrequal@expand{warning}{\def\mdf@tempa{Warning}}{%
          %case "unknown"
          \mdf@PackageWarning{Unknown key for printheight=\mdf@printheight^^J
                              use none, info or warning}%
          % Fall back to the gobbling reporter. The former value "none"
          % produced the undefined \mdf@Packagenone (= \relax), which left
          % the message argument behind to be typeset in the document.
          \def\mdf@tempa{NoInfo}%
        }%
     }%
   }%
   \def\mdf@PackageInfoSpace{\csname mdf@Package\mdf@tempa\endcsname}%
}
%% Save boxes: frame title, footnotes, and the boxes used for splitting.
\newsavebox\mdf@frametitlebox
\newsavebox\mdf@footnotebox
\newsavebox\mdf@splitbox@one
\newsavebox\mdf@splitbox@two
\newsavebox\mdf@splitbox@save
%% Length registers (split box, frame title box, footnote box, bounding box).
\newlength\mdfsplitboxwidth
\newlength\mdfsplitboxtotalwidth
\newlength\mdfsplitboxheight
\newlength\mdfsplitboxdepth
\newlength\mdfsplitboxtotalheight
\newlength\mdfframetitleboxwidth
\newlength\mdfframetitleboxtotalwidth
\newlength\mdfframetitleboxheight
\newlength\mdfframetitleboxdepth
\newlength\mdfframetitleboxtotalheight
\newlength\mdffootnoteboxwidth
\newlength\mdffootnoteboxtotalwidth
\newlength\mdffootnoteboxheight
\newlength\mdffootnoteboxdepth
\newlength\mdffootnoteboxtotalheight
\newlength\mdftotallinewidth
\newlength\mdfboundingboxwidth
\newlength\mdfboundingboxtotalwidth
\newlength\mdfboundingboxheight
\newlength\mdfboundingboxdepth
\newlength\mdfboundingboxtotalheight
\newlength\mdf@freevspace@length \newlength\mdf@horizontalwidthofbox@length \newlength\mdf@verticalmarginwhole@length \newtoggle{mdf@notfirstframetitle}% \togglefalse{mdf@notfirstframetitle}% \newrobustcmd\mdfcreateextratikz{} \def\mdf@lrbox#1{% %%patch to work with amsthm \mdf@patchamsthm %%%end patch \edef\mdf@restoreparams{% \parindent=\the\parindent\relax \parskip=\the\parskip\relax}% \setbox#1\vbox\bgroup% \color@begingroup% \mdf@horizontalmargin@equation% \columnwidth=\hsize% \textwidth=\hsize% \let\if@nobreak\iffalse% \let\if@noskipsec\iffalse% \let\par\@@par% \let\-\@dischyph% \let\'\@acci\let\`\@accii\let\=\@acciii% \parindent\z@ \parskip\z@skip% \linewidth\hsize% \@totalleftmargin\z@% \leftskip\z@skip \rightskip\z@skip \@rightskip\z@skip% \parfillskip\@flushglue \lineskip\normallineskip% \baselineskip\normalbaselineskip% %% \sloppy% \let\\\@normalcr% \mdf@restoreparams\relax% \@afterindentfalse% \@afterheading% } \def\endmdf@lrbox{\color@endgroup\egroup} \newrobustcmd*\mdf@ignorevbadness{% \edef\mdf@currentvbadness{\the\vbadness}% \vbadness=\@M% \afterassignment\mdf@restorevbadness} \newrobustcmd*\mdf@restorevbadness{\vbadness=\mdf@currentvbadness\relax} \@ifpackageloaded{amsthm}% {% \newrobustcmd\mdf@patchamsthm{% \let\mdf@deferred@thm@head\deferred@thm@head \patchcmd{\deferred@thm@head}{\indent}{}% {\mdf@PackageInfo{mdframed detected package amsthm ^^J changed the theoerem header of amsthm\MessageBreak}% }{% \mdf@PackageError{mdframed detected package amsthm ^^J changed the theoerem header of amsthm failed\MessageBreak}% }% }% }{\let\mdf@patchamsthm\relax}% \def\mdf@trivlist#1{% \setlength{\topsep}{#1}% \partopsep\z@% \parsep\z@% \@nmbrlistfalse% \@trivlist% \labelwidth\z@% \leftmargin\z@% \itemindent\z@% \let\@itemlabel\@empty% \def\makelabel##1{##1}% %% \item\leavevmode\hrule \@height\z@ \@width\linewidth\relax% %% \item\mbox{}\relax% second version \item\relax% first Version } \let\endmdf@trivlist\endtrivlist 
\patchcmd\endmdf@trivlist\@endparenv\mdf@endparenv{% \immediate\typeout{^^J****** mdframed patching \string\endmdf@trivlist}% \immediate\typeout{^^J****** -- success******^^J}% }{% \immediate\typeout{^^J****** mdframed patching \string\endmdf@trivlist}% \immediate\typeout{^^J****** -- failed******^^J}% } \def\mdf@endparenv{% \addpenalty\@endparpenalty\addvspace\mdf@skipbelow@length\@endpetrue} \newrobustcmd*\mdf@makebox@out[2][\linewidth]{% \noindent\hb@xt@\z@{% \noindent\makebox[\dimexpr #1\relax][l]{#2}% \hss}% }% \newrobustcmd*\mdf@makebox@in[2][\mdf@userdefinedwidth@length]{% \noindent\makebox[\dimexpr #1\relax][l]{#2}% } \newrobustcmd*\mdfdefinestyle[2]{% \csdef{mdf@definestyle@#1}{#2}% } \newrobustcmd*\mdfapptodefinestyle[2]{% \ifcsundef{mdf@definestyle@#1}% {\mdf@PackageWarning{Unknown style #1}}% {\csappto{mdf@definestyle@#1}{,#2}}% } \newrobustcmd*{\mdflength}[1]{\csuse{mdf@#1@length}} \newrobustcmd*{\surroundwithmdframed}[2][]{% \BeforeBeginEnvironment{#2}{\begin{mdframed}[#1]}% \AfterEndEnvironment{#2}{\end{mdframed}}% } \newrobustcmd*\newmdenv[2][]{% \newenvironment{#2}{% \mdfsetup{#1}% \begin{mdframed}% }{% \end{mdframed}% }% } \newrobustcmd*\renewmdenv[2][]{% \expandafter\let\csname #2\endcsname\relax% \expandafter\let\csname end#2\endcsname\relax% \newmdenv[#1]{#2}% }% \DeclareDocumentCommand\newmdtheoremenv{O{} m o m o }{% \ifboolexpr{ test {\IfNoValueTF {#3}} and test {\IfNoValueTF {#5}} }% {\newtheorem{#2}{#4}}{% \IfValueTF{#3}{\newtheorem{#2}[#3]{#4}}{}% \IfValueTF{#5}{\newtheorem{#2}{#4}[#5]}{}% }% \BeforeBeginEnvironment{#2}{% \begin{mdframed}[#1]}% \AfterEndEnvironment{#2}{% \end{mdframed}}% } \newrobustcmd*\mdf@thm@caption[2]{} \AtBeginDocument{% \@ifpackageloaded{ntheorem}% {\renewrobustcmd*\mdf@thm@caption{\thm@thmcaption}}{}% } \DeclareDocumentCommand{\mdtheorem}{ O{} m o m o }% {\ifcsdef{#2}% {\mdf@PackageWarning{Environment #2 already exits\MessageBreak}}% {% \IfNoValueTF {#3}% {%#3 not given -- number relationship \IfNoValueTF {#5}% 
{%#3+#5 not given \@definecounter{#2}% \expandafter\xdef\csname the#2\endcsname{\@thmcounter{#2}}% \newenvironment{#2}[1][]{% \refstepcounter{#2}% \ifstrempty{##1}% {\let\@temptitle\relax}% {% \def\@temptitle{\mdf@theoremseparator% \mdf@theoremspace% \mdf@theoremtitlefont% ##1}% \mdf@thm@caption{#2}{{#4}{\csname the#2\endcsname}{##1}}% }% \begin{mdframed}[#1,frametitle={\strut#4\ \csname the#2\endcsname% \@temptitle}]}% {\end{mdframed}}% \newenvironment{#2*}[1][]{% \ifstrempty{##1}{\let\@temptitle\relax}{\def\@temptitle{:\ ##1}}% \begin{mdframed}[#1,frametitle={\strut#4\@temptitle}]}% {\end{mdframed}}% }% {%#5 given -- reset counter \@definecounter{#2}\@newctr{#2}[#5]% \expandafter\xdef\csname the#2\endcsname{\@thmcounter{#2}}% \expandafter\xdef\csname the#2\endcsname{% \expandafter\noexpand\csname the#5\endcsname \@thmcountersep% \@thmcounter{#2}}% \newenvironment{#2}[1][]{% \refstepcounter{#2}% \ifstrempty{##1}% {\let\@temptitle\relax}% {% \def\@temptitle{\mdf@theoremseparator% \mdf@theoremspace% \mdf@theoremtitlefont% ##1}% \mdf@thm@caption{#2}{{#4}{\csname the#2\endcsname}{##1}}% } \begin{mdframed}[#1,frametitle={\strut#4\ \csname the#2\endcsname% \@temptitle}]}% {\end{mdframed}}% \newenvironment{#2*}[1][]{% \ifstrempty{##1}% {\let\@temptitle\relax}% {% \def\@temptitle{\mdf@theoremseparator% \mdf@theoremspace% \mdf@theoremtitlefont% ##1}% \mdf@thm@caption{#2}{{#4}{\csname the#2\endcsname}{##1}}% }% \begin{mdframed}[#1,frametitle={\strut#4\@temptitle}]}% {\end{mdframed}}% }% }% {%#3 given -- number relationship \global\@namedef{the#2}{\@nameuse{the#3}}% \newenvironment{#2}[1][]{% \refstepcounter{#3}% \ifstrempty{##1}% {\let\@temptitle\relax}% {% \def\@temptitle{\mdf@theoremseparator% \mdf@theoremspace% \mdf@theoremtitlefont% ##1}% \mdf@thm@caption{#2}{{#4}{\csname the#2\endcsname}{##1}}% } \begin{mdframed}[#1,frametitle={\strut#4\ \csname the#2\endcsname% \@temptitle}]}% {\end{mdframed}}% \newenvironment{#2*}[1][]{% 
\ifstrempty{##1}{\let\@temptitle\relax}{\def\@temptitle{:\ ##1}}% \begin{mdframed}[#1,frametitle={\strut#4\@temptitle}]}% {\end{mdframed}}% }% }% } \newrobustcmd\mdfframedtitleenv[1]{% \mdf@lrbox{\mdf@frametitlebox}% \mdf@frametitlealignment% \leavevmode\color{\mdf@frametitlefontcolor}% \normalfont\mdf@frametitlefont{#1} \ifbool{mdf@ignorelastdescenders}% {% \par\strut\par \unskip\unskip\setbox0=\lastbox \vspace*{\dimexpr\ht\strutbox-\baselineskip\relax}% }{}% \par\unskip\ifvmode\nointerlineskip\hrule \@height\z@ \@width\hsize\fi%% \endmdf@lrbox\relax% \mdf@ignorevbadness% \setbox\mdf@frametitlebox=\vbox{\unvbox\mdf@frametitlebox}% \mdfframetitleboxwidth=\wd\mdf@frametitlebox\relax% \mdfframetitleboxheight=\ht\mdf@frametitlebox\relax% \mdfframetitleboxdepth=\dp\mdf@frametitlebox\relax% \mdfframetitleboxtotalheight=\dimexpr \ht\mdf@frametitlebox +\dp\mdf@frametitlebox% +\mdf@frametitleaboveskip@length +\mdf@frametitlebelowskip@length \relax% } \newrobustcmd*\mdf@@frametitle{% \mdfframedtitleenv{\mdf@frametitle}% } \newrobustcmd*\mdf@@frametitle@use{% \parskip\z@\relax% \parindent\z@\relax% \offinterlineskip\relax% \mdf@ignorevbadness% \setbox\mdf@splitbox@one=\vbox{% \unvcopy\mdf@frametitlebox\relax% \mdf@@frametitlerule\relax% \unvbox\mdf@splitbox@one\relax% }% \mdf@ignorevbadness% \setbox\mdf@splitbox@one=\vbox{\unvbox\mdf@splitbox@one}% \mdfsetup{innertopmargin=\mdf@frametitleaboveskip@length}% } \newrobustcmd*\mdf@checkntheorem{% \ifbool{mdf@ntheorem}% {\ifundef{\theorempreskipamount}% {\mdf@PackageWarning{You have not loaded ntheorem yet}}% {\setlength{\theorempreskipamount}{\z@}% \setlength{\theorempostskipamount}{\z@}% }% }{}% } \newrobustcmd*\mdf@footnoterule{% \kern0\p@% \hrule \@width 1in \kern 2.6\p@} \newrobustcmd*\mdf@footnoteoutput{% \ifvoid\@mpfootins\else% \nobreak% \vskip\mdf@footenotedistance@length% \normalcolor% \mdf@footnoterule% \unvbox\@mpfootins% \fi% } \newrobustcmd*\mdf@footnoteinput{% \def\@mpfn{mpfootnote}% \def\thempfn{\thempfootnote}% 
\c@mpfootnote\z@%
    \let\@footnotetext\@mpfootnotetext%
}
%% Input the frame definition file md-frame-<n>.mdf, where <n> is the value
%% of the counter mdf@globalstyle@cnt. Styles 0-3 are input directly; any
%% other number is input only if the file exists, otherwise style 0 is
%% loaded and a warning is issued.
\newrobustcmd*\mdf@load@style{%
   \ifcase\value{mdf@globalstyle@cnt}\relax%
        \input{md-frame-0.mdf}%
      \or\input{md-frame-1.mdf}%
      \or\input{md-frame-2.mdf}%
      \or\input{md-frame-3.mdf}%
      \else%
        % \value{..} alone yields the count register token, which cannot
        % appear in a file name or message; \number\value{..} yields digits.
        \IfFileExists{md-frame-\number\value{mdf@globalstyle@cnt}.mdf}%
            {\input{md-frame-\number\value{mdf@globalstyle@cnt}.mdf}}%
            {%
             \input{md-frame-0.mdf}%
             \mdf@PackageWarning{The style number \number\value{mdf@globalstyle@cnt} does not exist^^J
                                 mdframed ues instead style=0 \mdframedpackagename}%
            }%
   \fi%
}%
\mdf@load@style
%% For style number 0 only: collapse the three-line model onto the middle
%% line (inner and outer widths become zero, middle width becomes linewidth)
%% and use linecolor for all three line colours.
\newrobustcmd*\mdf@styledefinition{%AVOID!!!Needed for framemethod=default
    \ifnumequal{\value{mdf@globalstyle@cnt}}{0}%
       {\deflength{\mdf@innerlinewidth@length}{\z@}%
        \deflength{\mdf@middlelinewidth@length}{\mdf@linewidth@length}%
        \deflength{\mdf@outerlinewidth@length}{\z@}%
        \let\mdf@innerlinecolor\mdf@linecolor%
        \let\mdf@middlelinecolor\mdf@linecolor%
        \let\mdf@outerlinecolor\mdf@linecolor%
       }{}%
}
\let\mdf@reserved@a\@empty
\newrobustcmd*\detected@mdf@put@frame{%
       \ifmdf@nobreak%Option nobreak=true?
\def\mdf@reserved@a{\mdf@put@frame@standalone}% \else \def\mdf@reserved@a{\mdf@put@frame}% \ifx\@captype\@undefined \def\mdf@reserved@a{\mdf@put@frame}% \else \mdf@PackageInfo{mdframed inside float ^^J mdframed uses option nobreak \mdframedpackagename}% \def\mdf@reserved@a{\mdf@put@frame@standalone}% \fi \if@minipage% \mdf@PackageInfo{mdframed inside minipage ^^J mdframed uses option nobreak \mdframedpackagename}% \def\mdf@reserved@a{\mdf@put@frame@standalone}% \fi% \ifinner% \mdf@PackageInfo{mdframed inside a box ^^J mdframed uses option nobreak \mdframedpackagename}% \def\mdf@reserved@a{\mdf@put@frame@standalone}% \fi% \fi% \mdf@reserved@a% } \newenvironment{mdframed}[1][]{% \color@begingroup% \mdfsetup{userdefinedwidth=\linewidth,#1}% \mdf@twoside@checklength% \let\width\z@% \let\height\z@% \mdf@checkntheorem% \mdf@styledefinition% \mdf@footnoteinput% \color{\mdf@fontcolor}% \mdf@font% \ifvmode\nointerlineskip\fi% \mdf@trivlist{\mdf@skipabove@length}%% \ifdefempty{\mdf@frametitle}{}{\mdf@@frametitle}% \mdf@settings% \mdf@lrbox{\mdf@splitbox@one}% }% {% \ifbool{mdf@ignorelastdescenders}% {% \par\strut\par \unskip\unskip\setbox0=\lastbox \vspace*{\dimexpr\ht\strutbox-\baselineskip\relax}% }{}% \par\unskip\ifvmode\nointerlineskip\hrule \@height\z@ \@width\hsize\fi%% \ifmdf@footnoteinside% \def\mdf@reserveda{% \mdf@footnoteoutput% \endmdf@lrbox% \ifdefempty{\mdf@frametitle}{}{\mdf@@frametitle@use}% \detected@mdf@put@frame}% \else% \def\mdf@reserveda{% \endmdf@lrbox% \ifdefempty{\mdf@frametitle}{}{\mdf@@frametitle@use}% \detected@mdf@put@frame% \mdf@footnoteoutput% }% \fi% \mdf@reserveda% \endmdf@trivlist% \color@endgroup\@doendpe% } \newtoggle{md:checktwoside} \settoggle{md:checktwoside}{false} \newrobustcmd*\mdf@twoside@checklength{% \if@twoside \ifbool{mdf@usetwoside}% {\mdf@PackageInfo{mdframed works in twoside mode}% \settoggle{md:checktwoside}{true}% \setlength\mdf@rightmargin@length{\mdf@outermargin@length}% 
\setlength\mdf@leftmargin@length{\mdf@innermargin@length}% }% {\mdf@PackageInfo{mdframed inside twoside mode but\MessageBreak works with oneside mode}% \settoggle{md:checktwoside}{false}% }% \fi% } \newcounter{mdf@zref@counter}%keine doppelten laebes \zref@newprop*{mdf@pagevalue}[0]{\number\value{page}} \zref@addprop{\ZREF@mainlist}{mdf@pagevalue} \newrobustcmd*\mdf@zref@label{% \stepcounter{mdf@zref@counter} \zref@label{mdf@pagelabel-\number\value{mdf@zref@counter}}% } \newrobustcmd*\if@mdf@pageodd{% \zref@refused{mdf@pagelabel-\the\value{mdf@zref@counter}}% \ifodd\zref@extract{mdf@pagelabel-\the\value{mdf@zref@counter}}% {mdf@pagevalue}% \setlength\mdf@rightmargin@length{\mdf@outermargin@length}% \setlength\mdf@leftmargin@length{\mdf@innermargin@length}% \else \setlength\mdf@rightmargin@length{\mdf@innermargin@length}% \setlength\mdf@leftmargin@length{\mdf@outermargin@length}% \fi% } \newrobustcmd*\mdf@@setzref{% \iftoggle{md:checktwoside}{\mdf@zref@label\if@mdf@pageodd}{}% } \newrobustcmd*\mdf@freepagevspace{% \bgroup\@nobreakfalse\addpenalty\z@\egroup%added 29.5.12 \penalty\@M\relax\vskip 2\baselineskip\relax% \penalty9999\relax\vskip -2\baselineskip\relax% \penalty9999% \ifdimequal{\pagegoal}{\maxdimen}% {\mdf@freevspace@length\vsize}% {\mdf@freevspace@length=\pagegoal\relax% \advance\mdf@freevspace@length by -\pagetotal\relax% \addtolength\mdf@freevspace@length{\dimexpr-\parskip\relax}\relax% }% } \newrobustcmd*\mdf@advancelength@horizontalmargin@sub[1]{% \advance\mdf@horizontalspaceofbox by -\csname mdf@#1@length\endcsname\relax% } \newlength\mdf@horizontalspaceofbox \newrobustcmd*\mdf@horizontalmargin@equation{% \setlength{\mdf@horizontalspaceofbox}{\mdf@userdefinedwidth@length}% \mdf@dolist{\mdf@advancelength@horizontalmargin@sub}{% leftmargin,outerlinewidth,middlelinewidth,% innerlinewidth,innerleftmargin,innerrightmargin,% innerlinewidth,middlelinewidth,outerlinewidth,% rightmargin}% \notbool{mdf@leftline}% {% \advance\mdf@horizontalspaceofbox by 
\mdf@innerlinewidth@length\relax% \advance\mdf@horizontalspaceofbox by \mdf@middlelinewidth@length\relax% \advance\mdf@horizontalspaceofbox by \mdf@outerlinewidth@length\relax% }{}% \notbool{mdf@rightline}% {% \advance\mdf@horizontalspaceofbox by \mdf@innerlinewidth@length\relax% \advance\mdf@horizontalspaceofbox by \mdf@middlelinewidth@length\relax% \advance\mdf@horizontalspaceofbox by \mdf@outerlinewidth@length\relax% }{}% \ifdimless{\mdf@horizontalspaceofbox}{3cm}% {\mdf@PackageWarning{You have only a width of 3cm}}{}% \hsize=\mdf@horizontalspaceofbox% } \newrobustcmd*\mdf@keeplines@single{% \notbool{mdf@topline}% {% \advance\mdf@verticalmarginwhole@length % by -\mdf@innerlinewidth@length\relax% \advance\mdf@verticalmarginwhole@length % by -\mdf@middlelinewidth@length\relax% \advance\mdf@verticalmarginwhole@length % by -\mdf@outerlinewidth@length\relax% }{}% \notbool{mdf@bottomline}% {% \advance\mdf@verticalmarginwhole@length % by -\mdf@innerlinewidth@length\relax% \advance\mdf@verticalmarginwhole@length % by -\mdf@middlelinewidth@length\relax% \advance\mdf@verticalmarginwhole@length % by -\mdf@outerlinewidth@length\relax% }{}% } \newrobustcmd*\mdf@advancelength@verticalmarginwhole[1]{% \advance\mdf@verticalmarginwhole@length % by \csname mdf@#1@length\endcsname\relax% } \newrobustcmd*\mdf@advancelength@freevspace@sub[1]{% \advance\dimen@ by -\csname mdf@#1@length\endcsname\relax% } \newrobustcmd*\mdf@advancelength@freevspace@add[1]{% \advance\dimen@ by \csname mdf@#1@length\endcsname\relax% } \protected@edef\mdf@reset{\boxmaxdepth\the\boxmaxdepth \splittopskip\the\splittopskip}% \newrobustcmd*\mdf@put@frame@standalone{\relax% \ifvoid\mdf@splitbox@one\relax \mdf@PackageWarning{The environment is empty\MessageBreak}% \let\mdf@reserved@a\relax% \else %Hier berechnung Box-Inhalt+Rahmen oben und unten \setlength{\mdf@verticalmarginwhole@length}% {\dimexpr\ht\mdf@splitbox@one+\dp\mdf@splitbox@one\relax}% \mdf@dolist{\mdf@advancelength@verticalmarginwhole}{% 
outerlinewidth,middlelinewidth,innerlinewidth,% innertopmargin,innerbottommargin,innerlinewidth,% middlelinewidth,outerlinewidth}% \mdf@keeplines@single% \def\mdf@reserved@a{\mdf@putbox@single}% \fi \mdf@reserved@a% } \def\mdf@put@frame{\relax% \ifvoid\mdf@splitbox@one\relax \mdf@PackageWarning{The environment is empty\MessageBreak}% \let\mdf@reserved@a\relax% \else \setlength\mdfboundingboxwidth{\wd\mdf@splitbox@one}% \mdf@print@space% \mdf@freepagevspace%gives \mdf@freevspace@length \mdf@PackageInfoSpace{\the\mdf@freevspace@length before the beginning of \MessageBreak the environment ending on input line \MessageBreak}% \ifdimless{\mdf@freevspace@length}{2\baselineskip} {% \mdf@PackageInfo{Not enough space on this page} \vfill\eject% \def\mdf@reserved@a{\mdf@put@frame}% }{% %Hier berechnung Box-Inhalt+Rahmen oben und unten \setlength{\mdf@verticalmarginwhole@length}% {\dimexpr\ht\mdf@splitbox@one+\dp\mdf@splitbox@one\relax}% \mdf@dolist{\mdf@advancelength@verticalmarginwhole}% {% outerlinewidth,middlelinewidth,innerlinewidth,% innertopmargin,innerbottommargin,% innerlinewidth,middlelinewidth,outerlinewidth}% \mdf@keeplines@single% \ifdimless{\mdf@verticalmarginwhole@length}{\mdf@freevspace@length}% {%passt auf Seite% \begingroup\mdf@@setzref\mdf@putbox@single\endgroup%Output no break \let\mdf@reserved@a\relax% }% {% \def\mdf@reserved@a{\mdf@put@frame@i}%passt nicht auf Seite } }% \fi \mdf@reserved@a% } \def\mdf@put@frame@i{%Box must be splitted \mdf@freepagevspace%gives \mdf@freevspace@length \dimen@=\the\mdf@freevspace@length\relax% \dimen@i=\mdf@innertopmargin@length\relax% \advance\dimen@i by \mdf@innerlinewidth@length\relax% \advance\dimen@i by \mdf@middlelinewidth@length\relax% \advance\dimen@i by \mdf@outerlinewidth@length\relax% \advance\dimen@i by 2\baselineskip\relax% \ifdimless{\dimen@}{\dimen@i}% {\hrule \@height\z@ \@width\hsize% \vfill\eject% \def\mdf@reserved@a{\mdf@put@frame}% }% {% \mdf@dolist{\mdf@advancelength@freevspace@sub}{%calculate with 
\dimen@ outerlinewidth,middlelinewidth,innerlinewidth,% innertopmargin,splitbottomskip}% \ifbool{mdf@everyline}% {% \ifbool{mdf@bottomline}% {% \advance\dimen@ by -\mdf@innerlinewidth@length% \advance\dimen@ by -\mdf@middlelinewidth@length% \advance\dimen@ by -\mdf@outerlinewidth@length% }{}% }{}% \notbool{mdf@topline}% {% \advance\dimen@ by \mdf@innerlinewidth@length% \advance\dimen@ by \mdf@middlelinewidth@length% \advance\dimen@ by \mdf@outerlinewidth@length% }{}% \advance\dimen@.8\pageshrink \ifdimless{\ht\mdf@splitbox@one+\dp\mdf@splitbox@one}{\dimen@}% {\mdf@PackageWarning{You got a bad break\MessageBreak because the last box will be empty\MessageBreak you have to change it manually\MessageBreak by changing the text, the space\MessageBreak or something else}% \advance\dimen@ by -1.8\baselineskip\relax%needed???????????????????? }{}% \setbox\mdf@splitbox@save=\vbox{\unvcopy\mdf@splitbox@one}% \splitmaxdepth\z@ \splittopskip\mdf@splittopskip@length% \mdf@ignorevbadness% \setbox\mdf@splitbox@two\vsplit\mdf@splitbox@one to \dimen@ \setbox\mdf@splitbox@two\vbox{\unvbox\mdf@splitbox@two}% \setbox\mdf@splitbox@one\vbox{\unvbox\mdf@splitbox@one}% \ifdimgreater{\ht\mdf@splitbox@two+\dp\mdf@splitbox@two}{\dimen@}% {%splitted wrong \mdf@PackageInfo{Box was splittet wrong^^M starting loop to iterate the splitting point\MessageBreak}% \setbox\mdf@splitbox@one=\vbox{\unvcopy\mdf@splitbox@save}% \dimen@i=\dimen@%\relax \@tempcnta=\z@\relax \loop \ifdim\dimexpr\ht\mdf@splitbox@two+\dp\mdf@splitbox@two\relax>\dimen@ \advance\dimen@i by -\p@\relax \advance\@tempcnta by \@ne\relax \ifnum\@tempcnta>100 \let\iterate\relax \mdf@PackageWarning{correct box splittet fails^^M It seems you are using a non splittable contents\MessageBreak} \fi \mdf@ignorevbadness% \setbox\mdf@splitbox@one=\vbox{\break\unvcopy\mdf@splitbox@save}% \splitmaxdepth\z@ \splittopskip\mdf@splittopskip@length% \mdf@ignorevbadness% \setbox\mdf@splitbox@two\vsplit\mdf@splitbox@one to \dimen@i\relax% 
\setbox\mdf@splitbox@two\vbox{\unvbox\mdf@splitbox@two}% \setbox\mdf@splitbox@one\vbox{\unvbox\mdf@splitbox@one}% \repeat% }{}% \ifvoid\mdf@splitbox@one\relax% \mdf@PackageWarning{You got a bad break because the splittet box is empty^^M You have to change the page settings^^M like enlargethispage or something else^^M the package increases do \enlargethispage{\baselineskip}\MessageBreak}% \setbox\mdf@splitbox@one=\vbox{\unvcopy\mdf@splitbox@save} \enlargethispage{\baselineskip}% \def\mdf@reserved@a{\mdf@put@frame}% \fi% \ifdim\wd\mdf@splitbox@two=\wd\mdf@splitbox@one\relax \else% \mdf@PackageInfo{You first box width is to small^^M mdframed fixed it\MessageBreak}% \setbox\mdf@splitbox@two=\vbox% {% \hrule \@height\z@ \@width\wd\mdf@splitbox@one\relax \unvcopy\mdf@splitbox@two% }% \fi% \ifvoid\mdf@splitbox@two\relax% {\hrule \@height\f@size pt \@width\z@% \hrule \@height\z@ \@width\hsize}% \setbox\mdf@splitbox@one=\vbox{\unvcopy\mdf@splitbox@save}% \def\mdf@reserved@a{\mdf@put@frame}% \else% \ifdimequal{\ht\mdf@splitbox@two}{0pt}% {\hrule \@height\z@ \@width\hsize% \vfill\eject% \setbox\mdf@splitbox@one=\vbox{\unvcopy\mdf@splitbox@save}% \def\mdf@reserved@a{\mdf@put@frame}% }% {% \begingroup\mdf@@setzref\mdf@putbox@first\endgroup% \hrule \@height\z@ \@width\hsize% \vfill\eject% \def\mdf@reserved@a{\mdf@put@frame@ii}% }% \fi% }% \mdf@reserved@a% } \def\mdf@put@frame@ii{% \setlength{\mdf@freevspace@length}{\vsize}% \ifbool{mdf@repeatframetitle}% {% \toggletrue{mdf@notfirstframetitle}% \splitmaxdepth\z@ \splittopskip\z@% \setbox\mdf@splitbox@one=\vbox{\break\unvbox\mdf@splitbox@one}% \mdf@ignorevbadness% \setbox0=\vsplit\mdf@splitbox@one to \z@\relax% \setbox\mdf@splitbox@one=\vbox{\unvbox\mdf@splitbox@one} \setbox\mdf@splitbox@one\vbox% {% \vbox to \mdf@frametitleaboveskip@length{} \unvcopy\mdf@frametitlebox\relax% \mdf@@frametitlerule\relax% \unvbox\mdf@splitbox@one\relax% }% \setbox\mdf@splitbox@one=\vbox{\unvbox\mdf@splitbox@one}% }{}% 
\setlength{\dimen@}{\dimexpr\ht\mdf@splitbox@one+\dp\mdf@splitbox@one\relax}% \mdf@dolist{\mdf@advancelength@freevspace@add}% {%used \dimen@ innerbottommargin,innerlinewidth,middlelinewidth,outerlinewidth,% }% \ifbool{mdf@everyline}% {% \ifbool{mdf@topline}% {% \advance\dimen@ by \mdf@innerlinewidth@length\relax% \advance\dimen@ by \mdf@middlelinewidth@length\relax% \advance\dimen@ by \mdf@outerlinewidth@length\relax% }{}% }{}% \notbool{mdf@bottomline}% {% \advance\dimen@ by -\mdf@innerlinewidth@length\relax% \advance\dimen@ by -\mdf@middlelinewidth@length\relax% \advance\dimen@ by -\mdf@outerlinewidth@length\relax% \relax% }{}% \ifdimgreater{\dimen@}{\mdf@freevspace@length}% {%have a middle box \advance\mdf@freevspace@length by -\mdf@splitbottomskip@length\relax% \ifbool{mdf@everyline}% {% \ifbool{mdf@topline}% {% \advance\mdf@freevspace@length by -\mdf@innerlinewidth@length\relax% \advance\mdf@freevspace@length by -\mdf@middlelinewidth@length\relax% \advance\mdf@freevspace@length by -\mdf@outerlinewidth@length\relax% }{}% \ifbool{mdf@bottomline}% {% \advance\mdf@freevspace@length by -\mdf@innerlinewidth@length\relax% \advance\mdf@freevspace@length by -\mdf@middlelinewidth@length\relax% \advance\mdf@freevspace@length by -\mdf@outerlinewidth@length\relax% \relax }{}% }{}% \setbox\mdf@splitbox@save=\vbox{\unvcopy\mdf@splitbox@one}% \splitmaxdepth\z@ \splittopskip\mdf@splittopskip@length% \mdf@ignorevbadness% \setbox\mdf@splitbox@two\vsplit\mdf@splitbox@one to \mdf@freevspace@length \setbox\mdf@splitbox@two\vbox{\unvbox\mdf@splitbox@two} \setbox\mdf@splitbox@one\vbox{\unvbox\mdf@splitbox@one} \ifdimgreater{\ht\mdf@splitbox@two+\dp\mdf@splitbox@two}{\dimen@}% {%splitted wrong \mdf@PackageInfo{Box was splittet wrong^^M starting loop to iterate the splitting point\MessageBreak}% \dimen@i=\mdf@freevspace@length%\relax \@tempcnta=\z@\relax \loop \ifdim\dimexpr\ht\mdf@splitbox@two+\dp\mdf@splitbox@two\relax>% \mdf@freevspace@length\relax \advance\dimen@i by -\p@\relax% 
\advance\@tempcnta by \@ne\relax% \ifnum\@tempcnta>100 \let\iterate\relax% \mdf@PackageWarning{correct box splittet fails^^M It seems you are using a non splittable contents\MessageBreak}% \fi \setbox\mdf@splitbox@one=\vbox{\break\unvcopy\mdf@splitbox@save}% \splitmaxdepth\z@ \splittopskip\mdf@splittopskip@length% \mdf@ignorevbadness% \setbox\mdf@splitbox@two\vsplit\mdf@splitbox@one to \dimen@i\relax% \setbox\mdf@splitbox@two\vbox{\unvbox\mdf@splitbox@two}% \setbox\mdf@splitbox@one\vbox{\unvbox\mdf@splitbox@one}% \repeat% }{}% \ifvoid\mdf@splitbox@one\relax% \mdf@PackageWarning{You got a bad break because the splittet box is empty^^M You have to change the page settings^^M like enlargethispage or something else^^M the package increases do \enlargethispage{\baselineskip}\MessageBreak}% \setbox\mdf@splitbox@one=\vbox{\unvcopy\mdf@splitbox@save}% \enlargethispage{\baselineskip}% \def\mdf@reserved@a{\mdf@put@frame@ii}% \else \begingroup\mdf@@setzref\mdf@putbox@middle\endgroup% \hrule \@height\z@ \@width\hsize% \vfill\eject% \def\mdf@reserved@a{\mdf@put@frame@ii}% \fi }%End middle box case {%start last box case \ifvoid\mdf@splitbox@one \mdf@PackageWarning{You got a bad break\MessageBreak because the last split box is empty\MessageBreak You have to change the settings}%% \setbox\mdf@splitbox@one=\vbox% {% \unvbox\mdf@splitbox@one% \hrule \@height\z@ \@width\mdfboundingboxwidth }% \fi% \ifdimless{\ht\mdf@splitbox@one}{1sp}% {% \mdf@PackageWarning{You got a bad break\MessageBreak because the last split box is empty\MessageBreak You have to change the settings}% \let\mdf@reserved@a\relax% \setbox\mdf@splitbox@one=\vbox% {% \unvbox\mdf@splitbox@one% \hrule \@height\z@ \@width\mdfboundingboxwidth }% }{}% \begingroup\mdf@@setzref\mdf@putbox@second\endgroup% \hrule \@height\z@ \@width\hsize% \let\mdf@reserved@a\relax% }% \mdf@reserved@a% } %%%% _____t_____ %%%% | | %%%% | | %%%% | | %%%% l| |r %%%% | | %%%% | | %%%% |___________| %%%% b %%Zusammenhaenge abfragen: 
%% Line-combination tests. Each \mdf@test@<sides> ends in \ifboolexpr, so it
%% takes the usual {<true>}{<false>} pair and selects <true> only when
%% exactly the named lines are switched on
%% (l = leftline, t = topline, r = rightline, b = bottomline).
\newrobustcmd*\mdf@test@ltrb{%
  \ifboolexpr{
        bool {mdf@topline}
    and bool {mdf@bottomline}
    and bool {mdf@leftline}
    and bool {mdf@rightline}
  }%
}
\newrobustcmd*\mdf@test@ltr{%
  \ifboolexpr{
        bool {mdf@topline}
    and not bool {mdf@bottomline}
    and bool {mdf@leftline}
    and bool {mdf@rightline}
  }%
}
\newrobustcmd*\mdf@test@ltb{%
  \ifboolexpr{
        bool {mdf@topline}
    and bool {mdf@bottomline}
    and bool {mdf@leftline}
    and not bool {mdf@rightline}
  }%
}
\newrobustcmd*\mdf@test@trb{%
  \ifboolexpr{
        bool {mdf@topline}
    and bool {mdf@bottomline}
    and not bool {mdf@leftline}
    and bool {mdf@rightline}
  }%
}
\newrobustcmd*\mdf@test@lrb{%
  \ifboolexpr{
        not bool {mdf@topline}
    and bool {mdf@bottomline}
    and bool {mdf@leftline}
    and bool {mdf@rightline}
  }%
}
\newrobustcmd*\mdf@test@lb{%
  \ifboolexpr{
        not bool {mdf@topline}
    and bool {mdf@bottomline}
    and bool {mdf@leftline}
    and not bool {mdf@rightline}
  }%
}
\newrobustcmd*\mdf@test@rb{%
  \ifboolexpr{
        not bool {mdf@topline}
    and bool {mdf@bottomline}
    and not bool {mdf@leftline}
    and bool {mdf@rightline}
  }%
}
\newrobustcmd*\mdf@test@tr{%
  \ifboolexpr{
        bool {mdf@topline}
    and not bool {mdf@bottomline}
    and not bool {mdf@leftline}
    and bool {mdf@rightline}
  }%
}
\newrobustcmd*\mdf@test@lt{%
  \ifboolexpr{
        bool {mdf@topline}
    and not bool {mdf@bottomline}
    and bool {mdf@leftline}
    and not bool {mdf@rightline}
  }%
}
\newrobustcmd*\mdf@test@lr{%
  \ifboolexpr{
        not bool {mdf@topline}
    and not bool {mdf@bottomline}
    and bool {mdf@leftline}
    and bool {mdf@rightline}
  }%
}
\newrobustcmd*\mdf@test@tb{%
  \ifboolexpr{
        bool {mdf@topline}
    and bool {mdf@bottomline}
    and not bool {mdf@leftline}
    and not bool {mdf@rightline}
  }%
}
\newrobustcmd*\mdf@test@l{%
  \ifboolexpr{
        not bool {mdf@topline}
    and not bool {mdf@bottomline}
    and bool {mdf@leftline}
    and not bool {mdf@rightline}
  }%
}
\newrobustcmd*\mdf@test@r{%
  \ifboolexpr{
        not bool {mdf@topline}
    and not bool {mdf@bottomline}
    and not bool {mdf@leftline}
    and bool {mdf@rightline}
  }%
}
\newrobustcmd*\mdf@test@t{% \ifboolexpr{ (bool {mdf@topline}) and not (bool {mdf@bottomline}) and not (bool {mdf@leftline}) and not (bool {mdf@rightline})}} \newrobustcmd*\mdf@test@b{% \ifboolexpr{ not (bool {mdf@topline}) and (bool {mdf@bottomline}) and not (bool {mdf@leftline}) and not (bool {mdf@rightline})}} \newrobustcmd*\mdf@test@noline{% \ifboolexpr{ not (bool {mdf@topline}) and not (bool {mdf@bottomline}) and not (bool {mdf@leftline}) and not (bool {mdf@rightline})}} \newrobustcmd*\mdf@test@single{% \ifboolexpr{ not (test {\mdf@test@ltrb} or test {\mdf@test@ltr} or test {\mdf@test@ltb} or test {\mdf@test@trb} or test {\mdf@test@lrb} or test {\mdf@test@lb} or test {\mdf@test@rb} or test {\mdf@test@tr} or test {\mdf@test@lt} ) }} \DisableKeyvalOption[action=warning,package=mdframed]{mdf}{framemethod}% \DisableKeyvalOption[action=warning,package=mdframed]{mdf}{xcolor}% \endinput %% %% ================================================================ %% Copyright (C) 2012 by Marco Daniel %% %% This work may be distributed and/or modified under the %% conditions of the LaTeX Project Public License (LPPL), either %% version 1.3c of this license or (at your option) any later %% version. The latest version of this license is in the file: %% %% http://www.latex-project.org/lppl.txt %% %% This work is "maintained" (as per LPPL maintenance status) by %% Marco Daniel. %% %% Have fun! %% %% ================================================================ %% %% End of file `mdframed.sty'. irstlm-6.00.05/doc/mixtureLM.tex000066400000000000000000000000001263213470300164000ustar00rootroot00000000000000irstlm-6.00.05/doc/ngt.tex000066400000000000000000000027451263213470300152640ustar00rootroot00000000000000{\tt ngt} is the command which copes with the $n$-gram counts. \begin{itemize} \item It extracts the $n$-gram counts and stores them into an $n$-gram table. \item It prunes $n$-gram tables. \item It merges $n$-gram tables. \item It transforms $n$-gram table formats.
\end{itemize} \noindent A new $n$-gram table for the limited dictionary can be computed with {\tt ngt} by specifying the sub-dictionary: \begin{verbatim} $> ngt -i=train.www -sd=top10k -n=3 -o=train.10k.www -b=yes \end{verbatim} The command replaces all words outside {\tt top10k} with the special out-of-vocabulary symbol {\tt \_unk\_}. {\tt dict} is the command which copes with the dictionaries. \noindent Another useful feature of {\tt ngt} is the merging of two $n$-gram tables. Assuming that we have split our training corpus into files {\tt text-a} and {\tt text-b} and have computed $n$-gram tables for both files, we can merge them with the option {\tt -aug}: \begin{verbatim} $> ngt -i="gunzip -c text-a.gz" -n=3 -o=text-a.www -b=yes $> ngt -i="gunzip -c text-b.gz" -n=3 -o=text-b.www -b=yes $> ngt -i=text-a.www -aug=text-b.www -n=3 -o=text.www -b=yes \end{verbatim} \paragraph{Warning:} Note that if the concatenation of {\tt text-a.gz} and {\tt text-b.gz} is equal to {\tt train.gz}, the resulting $n$-gram tables {\tt text.www} and {\tt train.www} can slightly differ. This happens because during the construction of each single $n$-gram table a few $n$-grams are automatically added to make it consistent for further computation. irstlm-6.00.05/doc/parallelComputation.tex000066400000000000000000000015711263213470300205070ustar00rootroot00000000000000This package provides facilities to build a gigantic LM in parallel in order to reduce computation time. The script implementing this feature is based on the {\tt SUN Grid Engine} software\footnote{http://www.sun.com/software/gridware}. \noindent To apply the parallel computation run the following script (instead of {\tt build-lm.sh}): \begin{verbatim} $> build-lm-qsub.sh -i "gunzip -c train.gz" -n 3 -o train.ilm.gz -k 5 \end{verbatim} Besides the options of {\tt build-lm.sh}, parameters for the SGE manager can be provided through the following one: \begin{verbatim} -q parameters for qsub, e.g.
"-q <queue>", "-l <resources>" \end{verbatim} \noindent The script performs the same {\em split-and-merge} policy described in Section~\ref{sec:giganticLM}, but some computation is performed in parallel (instead of sequentially), distributing the tasks on several machines. irstlm-6.00.05/doc/pruneLM.tex000066400000000000000000000000001263213470300160340ustar00rootroot00000000000000irstlm-6.00.05/doc/quantizeLM.tex000066400000000000000000000000001263213470300165430ustar00rootroot00000000000000irstlm-6.00.05/doc/referenceMaterial.tex000066400000000000000000000031731263213470300201050ustar00rootroot00000000000000The following books contain basic introductions to statistical language modeling: \begin{itemize} \item {\em Spoken Dialogues with Computers}, by Renato DeMori, chapter 7. \item {\em Speech and Language Processing}, by Dan Jurafsky and Jim Martin, chapter 6. \item {\em Foundations of Statistical Natural Language Processing}, by C. Manning and H. Schuetze. \item {\em Statistical Methods for Speech Recognition}, by Frederick Jelinek. \item {\em Spoken Language Processing}, by Huang, Acero and Hon. \end{itemize} \noindent The following papers describe the IRST LM toolkit: \begin{itemize} \item Efficient data structures to handle huge language models: \begin{quote} Marcello Federico and Mauro Cettolo, {\em Efficient Handling of N-gram Language Models for Statistical Machine Translation}, In Proc. of the Second Workshop on Statistical Machine Translation, pp. 88--95, ACL, Prague, Czech Republic, 2007. \end{quote} \item Language Model quantization: \begin{quote} Marcello Federico and Nicola Bertoldi, {\em How Many Bits Are Needed To Store Probabilities for Phrase-Based Translation?}, In Proc. of the Workshop on Statistical Machine Translation, pp. 94--101, NAACL, New York City, NY, 2006. \end{quote} \item Language Model adaptation with mixtures: \begin{quote} Marcello Federico and Nicola Bertoldi, {\em Broadcast news LM adaptation over time}, Computer Speech and Language, 18(4): pp.
417-435, October, 2004. \end{quote} \item Language Model adaptation with MDI: \begin{quote} Marcello Federico, {\em Efficient LM Adaptation through MDI Estimation}. In Proc. of Eurospeech, Budapest, Hungary, 1999. \end{quote} \end{itemize} irstlm-6.00.05/doc/regressionTests.tex000066400000000000000000000000001263213470300176550ustar00rootroot00000000000000irstlm-6.00.05/doc/releaseNotes.tex000066400000000000000000000156641263213470300171310ustar00rootroot00000000000000 \IMPORTANT{If present, the index in parentheses refers to the revision number in IRSTLM repository (until 5.60.02) or SourceForge repository (from 5.60.03).} \subsection{Version 3.2} \begin{itemize} \item Quantization of probabilities \item Efficient run-time data structure for LM querying \item Dismissal of MT output format \end{itemize} \subsection{Version 4.2} \begin{itemize} \item Distinction between open source and internal Irstlm tools \item More memory efficient versions of binarization and quantization commands \item Memory mapping of run-time LM \item Scripts and data structures for the estimation and handling of gigantic LMs \item Integration of {\IRSTLM} into Moses Decoder \end{itemize} \subsection{Version 5.00} \begin{itemize} \item Fixed bug in the documentation \item General script {\tt build-lm.sh} for the estimation of large LMs. \item Management of iARPA file format. \item Bug fixes \item Estimation of LM over a partial dictionary. \end{itemize} \subsection{Version 5.04} \begin{itemize} \item Extended documentation with ShiftBeta smoothing. \item Smoothing parameter of ShiftBeta can be set manually. \item Robust handling for smoothing parameters of ModifiedShiftBeta. \item Fixed probability checks in TLM. 
\item Parallel estimation of gigantic LM through SGE \item Better management of sub dictionary with build-lm.sh \item Minor bug fixes \end{itemize} \subsection{Version 5.05} \begin{itemize} \item (Optional) computation of OOV penalty in terms of single OOV word instead of OOV class \item Extended use of OOV penalty to the standard input LM scores of compile-lm. \item Minor bug fixes \end{itemize} \subsection{Version 5.10} \begin{itemize} \item Extended ngt to compute statistics for approximated Kneser-Ney smoothing \item New implementation of approximated Kneser-Ney smoothing method \item Minor bug fixes \item More to be added here .... \end{itemize} \subsection{Version 5.20} \begin{itemize} \item Improved tracing of back-offs \item Added command prune-lm (thanks to Fabio Brugnara) \item Extended lprob function to supply back-off weight/level information \item Improved back-off handling of OOV words with quantized LM \item Added more debug modalities to compile-lm \item Fixed minor bugs in regression tests \item Updated documentation \end{itemize} \subsection{Version 5.21} \begin{itemize} \item Addition of interpolate-lm \item Added LM filtering to compile-lm \item Improved regression tests \item Integration of interpolated LMs in Moses \item Extended tests on compilers and platforms \item Improved documentation with website \end{itemize} \subsection{Version 5.22} \begin{itemize} \item Use of AutoConf/AutoMake toolkit compilation and installation \end{itemize} \subsection{Version 5.30} \begin{itemize} \item Support for a safe management of LMs with a total amount of $n$-grams larger than 250 million \item Use of a new parameter to specify a directory for temporary computation because the default ("/tmp") could be too small \item Improved a safer method of concatenation of gzipped sub lms \item Improved management of log files \end{itemize} \subsection{Version 5.40} \begin{itemize} \item Merging of internal-only tlm code into the public version \item Updated 
documentation into the public version \item Included documentation into the public version \end{itemize} \subsection{Version 5.50} \begin{itemize} \item {\bf 5.50.01} \begin{itemize} \item Binary saving directly with tlm \item Speed improvement through \item Caching of probability and states of $n$-grams in the LM interface \item Storing of $n$-grams in inverted order \end{itemize} \item {\bf 5.50.02} \begin{itemize} \item Optional creation of documentation \item Improved documentation \item Optional computation of the perplexity at sentence-level \end{itemize} \end{itemize} \subsection{Version 5.60} \begin{itemize} \item {\bf 5.60.01} \begin{itemize} \item Handling of class/chunk LMs with both compile-lm and interpolate-lm \item Improved pruning strategy to handle with sentence-start symbols \item Improved documentation and examples \end{itemize} \item {\bf 5.60.02} \begin{itemize} \item Code cleanup \end{itemize} \item {\bf 5.60.03 (r404)} \begin{itemize} \item Xcode project \item import from IRSTLM repository (revision 4263) \end{itemize} \end{itemize} \subsection{Version 5.70} \begin{itemize} \item {\bf 5.70.01 (r454)} \begin{itemize} \item Class-based LM \item Added improved-kneser-ney smoothing for lm-build-qsub.sh \item Enabled different singleton pruning policy for each submodel of mixture LM \item Enabled the possibility to load an existing LM up to a specific level smaller than the actual LM order \item Code tracing \item Handling of error codes \item Handling of long filenames and parameters \item Improved parallel code compilation \item Improved documentation and examples \end{itemize} \item {\bf 5.70.02 (r469)} \begin{itemize} \item Code optimization \item Common interface for all LM types \end{itemize} \end{itemize} \subsection{Version 5.80} \begin{itemize} \item {\bf 5.80.01 (r501)} \begin{itemize} \item Facility to {\em beautify} source code \item Re-activation of filtering on a sub-dictionary \item Code optimization related to LM dumping \item 
Transformation of scripts into Bourne shell scripts \end{itemize} \item {\bf 5.80.03 (r579)} \begin{itemize} \item Data selection tool \item Handling of precision upper- and lower-bounds by means of constants. \item Improved Xcode project \item Improved code compilation \item Improved documentation \item Improved handling of help \item Facility to check whether IRSTLM is compiled with or without caching \end{itemize} \item {\bf 5.80.05 (r642)} \begin{itemize} \item Introduction of {\em namespace irstlm} \item Code optimization \item Code compliant with OsX Maverick \item Support for Redis output format to ngt \item Support CRC16 algorithm \item Improved plsa command and regression test \item Improved handling of tracing \item Improved handling of help \item Improved handling of error messages \end{itemize} \item {\bf 5.80.06 (r647)} \begin{itemize} \item Improved code compilation \end{itemize} \item {\bf 5.80.07} \begin{itemize} \item Changes to LM smoothing types; removed Good-Turing, added Approximated Modified ShiftBeta, renaming \item Added an additional pruning method, based on level-dependent pruning frequency \item Improved code compilation \item Improved documentation \item Code cleanup \item Added support for long names of parameters \item Improved output format \end{itemize} \item {\bf 5.80.08} \begin{itemize} \item Added functionality to score n-grams in isolation \item Added level-based caches for storing prob, state, and statesize \item Improved management of tracing assert \item Improved management of tracing/assert/caching AutoConf compilation flags \item Improved output format \item Improved code compilation \item Code cleanup \end{itemize} \end{itemize} \COMMENT{ \subsection{Version 5.xx} \begin{itemize} \item {\bf 5.xx.01} \begin{itemize} \item \end{itemize} \end{itemize} } irstlm-6.00.05/doc/tlm.tex000066400000000000000000000071701263213470300152650ustar00rootroot00000000000000Language models have to cope with out-of-vocabulary words, which are
internally represented with the word class {\tt \_unk\_}. In order to compare perplexity of LMs having different vocabulary sizes it is better to define a conventional dictionary size, or dictionary upper bound size, through the parameter ({\tt -dub}). In the following example, we compare the perplexity of the full vocabulary LM against the perplexity of the LM estimated over the 10K most frequent words. In our comparison, we assume a dictionary upper bound of one million words. \begin{verbatim} $>tlm -tr=train.10k.www -n=3 -lm=wb -te=test -dub=1000000 n=49984 LP=342160.8721 PP=939.5565162 OVVRate=0.07666453265 $>tlm -tr=train.www -n=3 -lm=wb -te=test -dub=1000000 n=49984 LP=336276.7842 PP=835.2144716 OVVRate=0.05007602433 \end{verbatim} \noindent The large difference in perplexity between the two LMs is explained by the significantly higher OOV rate of the 10K-word LM. \noindent N-gram LMs generally apply frequency smoothing techniques, and combine smoothed frequencies according to two main schemes: interpolation and back-off. The toolkit assumes interpolation as default. The back-off scheme is computationally more costly but often provides better performance. It can be activated with the option {\tt -bo=yes}, e.g.: \begin{verbatim} $>tlm -tr=train.10k.www -n=3 -lm=wb -te=test -dub=1000000 -bo=yes n=49984 LP=337278.3227 PP=852.1186066 OVVRate=0.07666453265 \end{verbatim} \noindent This toolkit implements several frequency smoothing methods, which are specified by the parameter {\tt -lm}. Three methods are particularly recommended: \begin{itemize} \item [a)] {\bf Modified shift-beta}, also known as ``improved kneser-ney smoothing''.
This smoothing scheme gives top performance when training data is not very sparse but it is more time and memory consuming during the estimation phase: \begin{verbatim} $>tlm -tr=train.www -n=3 -lm=msb -te=test -dub=1000000 -bo=yes n=49984 LP=321877.3411 PP=626.1609806 OVVRate=0.05007602433 \end{verbatim} \item [b)] {\bf Witten Bell smoothing}. This is an excellent smoothing method which works well in every data condition and is much less time and memory consuming: \begin{verbatim} $> tlm -tr=train.www -n=3 -lm=wb -te=test -dub=1000000 -bo=yes n=49984 LP=331577.2279 PP=760.2652095 OVVRate=0.05007602433 \end{verbatim} \item [c)] {\bf Shift-beta smoothing}. This smoothing method is a simpler and cheaper version of the Modified shift-beta method and sometimes works better than the Witten-Bell method: \begin{verbatim} $> tlm -tr=train.www -n=3 -lm=sb -te=test -dub=1000000 -bo=yes n=49984 LP=334724.5032 PP=809.6750442 OVVRate=0.05007602433 \end{verbatim} \noindent Moreover, the non linear smoothing parameter $\beta$ can be specified with the option {\tt -beta}: \begin{verbatim} $> tlm -tr=train.www -n=3 -lm=sb -beta=0.001 -te=test -dub=1000000 -bo=yes n=49984 LP=449339.8282 PP=8019.836058 OVVRate=0.05007602433 \end{verbatim} This could be helpful in case we need to use language models with very limited frequency smoothing. \end{itemize} \subsection*{Limited Vocabulary} \noindent Using an n-gram table with a fixed or limited dictionary will cause some performance degradation, as the LM smoothing statistics become slightly distorted. A valid alternative is to estimate the LM on the full dictionary of the training corpus and to use a limited dictionary just when saving the LM to a file.
This can be achieved with the option {\tt -d} (or {\tt -dictionary}): \begin{verbatim} $> tlm -tr=train.www -n=3 -lm=msb -bo=y -te=test -o=train.lm -d=top10k \end{verbatim}irstlm-6.00.05/regenerate-makefiles.sh000077500000000000000000000030551263213470300176160ustar00rootroot00000000000000#!/bin/bash # NOTE: # Versions 1.9 (or higher) of aclocal and automake are required. # Version 2.59 (or higher) of autoconf is required. # For Mac OSX users: # Standard distribution usually includes versions 1.6 for aclocal and automake. # Get versions 1.9 or higher # Set the following variable to the correct paths #ACLOCAL="/path/to/aclocal-1.9" #AUTOMAKE="/path/to/automake-1.9" force=$1; # set parameter force to the value "--force" if you want to recreate all links to the autotools die () { echo "$@" >&2 exit 1 } if [ -z "$ACLOCAL" ] then ACLOCAL=`which aclocal` fi if [ -z "$AUTOMAKE" ] then AUTOMAKE=`which automake` fi if [ -z "$AUTORECONF" ] then AUTORECONF=`which autoreconf` fi if [ -z "$AUTOCONF" ] then AUTOCONF=`which autoconf` fi if [ -z "$LIBTOOLIZE" ] then LIBTOOLIZE=`which libtoolize` if [ -z "$LIBTOOLIZE" ] then LIBTOOLIZE=`which glibtoolize` fi fi if [ ! -d m4 ] ; then mkdir m4 fi echo "Calling $AUTORECONF" $AUTORECONF ret=$? if [ $ret -ne 0 ] ; then echo "autoreconf FAILED" echo "trying '$LIBTOOLIZE --force; $AUTOMAKE --add-missing ; $AUTORECONF'" $LIBTOOLIZE --force $AUTOMAKE --add-missing $AUTORECONF if [ ! -e config.guess ] ; then $AUTOMAKE --add-missing $AUTORECONF fi fi #echo "Calling $LIBTOOLIZE $force" #$LIBTOOLIZE $force || die "libtoolize failed" #echo "Calling $ACLOCAL..." #$ACLOCAL -I m4 || die "aclocal failed" #echo "Calling $AUTOCONF..." #$AUTOCONF || die "autoconf failed" #echo "Calling $AUTOMAKE --add-missing..." 
#$AUTOMAKE --add-missing || die "automake failed" irstlm-6.00.05/scripts/000077500000000000000000000000001263213470300146645ustar00rootroot00000000000000irstlm-6.00.05/scripts/CMakeLists.txt000077500000000000000000000004461263213470300174330ustar00rootroot00000000000000INSTALL(PROGRAMS add-start-end.sh build-lm-qsub.sh build-lm.sh build-sublm.pl goograms2ngrams.pl lm-stat.pl mdtsel.sh merge-sublm.pl ngram-split.pl rm-start-end.sh sort-lm.pl split-dict.pl split-ngt.sh wrapper DESTINATION bin PERMISSIONS OWNER_EXECUTE OWNER_READ OWNER_WRITE ) irstlm-6.00.05/scripts/Makefile.am000066400000000000000000000004371263213470300167240ustar00rootroot00000000000000wrapperbindir = @prefix@/bin dist_wrapperbin_SCRIPTS = \ add-start-end.sh build-lm-qsub.sh build-lm.sh rm-start-end.sh split-ngt.sh mdtsel.sh \ build-sublm.pl goograms2ngrams.pl lm-stat.pl merge-sublm.pl ngram-split.pl sort-lm.pl split-dict.pl \ plsa.sh qplsa.sh EXTRA_DIST = wrapper irstlm-6.00.05/scripts/add-start-end.sh000077500000000000000000000037301263213470300176550ustar00rootroot00000000000000#! /bin/bash #***************************************************************************** # IrstLM: IRST Language Model Toolkit # Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
# You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #****************************************************************************** function usage() { cmnd=$(basename $0); cat<&2; exit 0; ;; r) repeat=$OPTARG ;; t) maxwordlen=$OPTARG ;; s) symbol=$OPTARG ;; esac done #adds start/end symbols to standard input and #trims words longer than 80 characters eos=""; bos=""; for i in `seq $repeat`; do bos="$bos<${symbol}> "; eos="$eos <\/${symbol}>";done (sed "s/^/$bos/" | sed "s/\$/ $eos/";) |\ sed "s/\([^ ]\{$maxwordlen\}\)\([^ ]\{1,\}\)/\1/g" irstlm-6.00.05/scripts/build-lm-qsub.sh000077500000000000000000000173631263213470300177120ustar00rootroot00000000000000#! /bin/bash function usage() { cmnd=$(basename $0); cat<", and any other) -s Smoothing methods: witten-bell (default), kneser-ney (approximated kneser-ney), improved-kneser-ney -b Include sentence boundary n-grams (optional) -d Define subdictionary for n-grams (optional) -v Verbose EOF } hostname=`uname -n` if [ $hostname == "voxgate" ] ; then echo "voxgate can not be used as submission host" echo "use any other cluster machine" exit fi if [ ! $IRSTLM ]; then echo "Set IRSTLM environment variable with path to irstlm" exit 2; fi #paths to scripts and commands in irstlm scr=$IRSTLM/bin bin=$IRSTLM/bin gzip=`which gzip 2> /dev/null`; gunzip=`which gunzip 2> /dev/null`; #check irstlm installation if [ ! -e $bin/dict -o ! 
-e $scr/split-dict.pl ]; then echo "$IRSTLM does not contain a proper installation of IRSTLM" exit 3; fi #default parameters logfile=/dev/null tmpdir=stat_$$ order=3 parts=3 inpfile=""; outfile="" verbose=""; smoothing="--witten-bell"; prune=""; boundaries=""; dictionary=""; uniform="-f=y"; queueparameters="" while getopts "hvi:o:n:k:t:s:q:pbl:d:u" OPTION do case $OPTION in h) usage exit 0 ;; v) verbose="--verbose"; ;; i) inpfile=$OPTARG ;; d) dictionary="-sd=$OPTARG" ;; u) uniform=" " ;; o) outfile=$OPTARG ;; n) order=$OPTARG ;; k) parts=$OPTARG ;; t) tmpdir=$OPTARG ;; s) smoothing=$OPTARG case $smoothing in witten-bell) smoothing="--witten-bell" ;; kneser-ney) smoothing="--kneser-ney" ;; improved-kneser-ney) smoothing="--improved-kneser-ney" ;; *) echo "wrong smoothing setting"; exit 4; esac ;; p) prune='--prune-singletons'; ;; q) queueparameters=$OPTARG; ;; b) boundaries='--cross-sentence'; ;; l) logfile=$OPTARG ;; ?) usage exit ;; esac done if [ $verbose ]; then echo inpfile=\"$inpfile\" outfile=$outfile order=$order parts=$parts tmpdir=$tmpdir prune=$prune smoothing=$smoothing dictionary=$dictionary verbose=$verbose fi if [ ! "$inpfile" -o ! "$outfile" ]; then usage exit 5 fi if [ -e $outfile ]; then echo "Output file $outfile already exists! either remove or rename it." exit 6; fi if [ -e $logfile -a $logfile != "/dev/null" -a $logfile != "/dev/stdout" ]; then echo "Logfile $logfile already exists! either remove or rename it." exit 7; fi #check tmpdir tmpdir_created=0; if [ ! -d $tmpdir ]; then echo "Temporary directory $tmpdir does not exist"; echo "creating $tmpdir"; mkdir -p $tmpdir; tmpdir_created=1; else echo "Cleaning temporary directory $tmpdir"; rm $tmpdir 2> /dev/null if [ $? 
!= 0 ]; then echo "Warning: some temporary files could not be removed" fi fi workingdir=`pwd | perl -pe 's/\/nfsmnt//g'` cd $workingdir qsubout="$workingdir/DICT-OUT$$" qsuberr="$workingdir/DICT-ERR$$" qsublog="$workingdir/DICT-LOG$$" qsubname="DICT" (\ qsub $queueparameters -b no -sync yes -o $qsubout -e $qsuberr -N $qsubname << EOF cd $workingdir echo exit status $? echo "Extracting dictionary from training corpus" $bin/dict -i="$inpfile" -o=$tmpdir/dictionary $uniform -sort=no echo exit status $? echo "Splitting dictionary into $parts lists" $scr/split-dict.pl --input $tmpdir/dictionary --output $tmpdir/dict. --parts $parts echo exit status $? EOF ) 2>&1 > $qsublog unset suffix #getting list of suffixes for file in `ls $tmpdir/dict.*` ; do sfx=`echo $file | perl -pe 's/^.+\.(\d+)$/$1/'` suffix[${#suffix[@]}]=$sfx done qsubout="$workingdir/NGT-OUT$$" qsuberr="$workingdir/NGT-ERR$$" qsublog="$workingdir/NGT-LOG$$" qsubname="NGT" unset getpids echo "Extracting n-gram statistics for each word list" echo "Important: dictionary must be ordered according to order of appearance of words in data" echo "used to generate n-gram blocks, so that sub language model blocks results ordered too" for sfx in ${suffix[@]} ; do (\ qsub $queueparameters -b no -j yes -sync no -o $qsubout.$sfx -e $qsuberr.$sfx -N $qsubname-$sfx << EOF cd $workingdir echo exit status $? $bin/ngt -i="$inpfile" -n=$order -gooout=y -o="$gzip -c > $tmpdir/ngram.dict.${sfx}.gz" -fd="$tmpdir/dict.${sfx}" $dictionary -iknstat="$tmpdir/ikn.stat.dict.${sfx}" echo exit status $? 
echo EOF ) 2>&1 > $qsublog.$sfx id=`cat $qsublog.$sfx | grep 'Your job' | awk '{print $3}'` sgepid[${#sgepid[@]}]=$id done waiting="" for id in ${sgepid[@]} ; do waiting="$waiting -hold_jid $id" ; done qsub $queueparameters -sync yes $waiting -j y -o /dev/null -e /dev/null -N $qsubname.W -b y /bin/ls 2>&1 > $qsubname.W.log rm $qsubname.W.log qsubout="$workingdir/SUBLM-OUT$$" qsuberr="$workingdir/SUBLM-ERR$$" qsublog="$workingdir/SUBLM-LOG$$" qsubname="SUBLM" unset getpids echo "Estimating language models for each word list" if [ $smoothing = "--kneser-ney" -o $smoothing = "--improved-kneser-ney" ]; then for sfx in ${suffix[@]} ; do (\ qsub $queueparameters -b no -j yes -sync no -o $qsubout.$sfx -e $qsuberr.$sfx -N $qsubname-$sfx << EOF cd $workingdir echo exit status $? $scr/build-sublm.pl $verbose $prune $smoothing "cat $tmpdir/ikn.stat.dict*" --size $order --ngrams "$gunzip -c $tmpdir/ngram.dict.${sfx}.gz" -sublm $tmpdir/lm.dict.${sfx} echo exit status $? echo EOF ) 2>&1 > $qsublog.$sfx id=`cat $qsublog.$sfx | grep 'Your job' | awk '{print $3}'` sgepid[${#sgepid[@]}]=$id done else for sfx in ${suffix[@]} ; do (\ qsub $queueparameters -b no -j yes -sync no -o $qsubout.$sfx -e $qsuberr.$sfx -N $qsubname-$sfx << EOF cd $workingdir echo exit status $? 
$scr/build-sublm.pl $verbose $prune $smoothing --size $order --ngrams "$gunzip -c $tmpdir/ngram.dict.${sfx}.gz" -sublm $tmpdir/lm.dict.${sfx} echo EOF ) 2>&1 > $qsublog.$sfx id=`cat $qsublog.$sfx | grep 'Your job' | awk '{print $3}'` sgepid[${#sgepid[@]}]=$id done fi waiting="" for id in ${sgepid[@]} ; do waiting="$waiting -hold_jid $id" ; done qsub $queueparameters -sync yes $waiting -o /dev/null -e /dev/null -N $qsubname.W -b yes /bin/ls 2>&1 > $qsubname.W.log rm $qsubname.W.log echo "Merging language models into $outfile" qsubout="$workingdir/MERGE-OUT$$" qsuberr="$workingdir/MERGE-ERR$$" qsublog="$workingdir/MERGE-LOG$$" qsubname="MERGE" (\ qsub $queueparameters -b no -j yes -sync yes -o $qsubout -e $qsuberr -N $qsubname << EOF cd $workingdir $scr/merge-sublm.pl --size $order --sublm $tmpdir/lm.dict -lm $outfile EOF ) 2>&1 > $qsublog echo "Cleaning temporary directory $tmpdir"; rm $tmpdir/* 2> /dev/null rm $qsubout* $qsuberr* $qsublog* 2> /dev/null if [ $tmpdir_created -eq 1 ]; then echo "Removing temporary directory $tmpdir"; rmdir $tmpdir 2> /dev/null if [ $? != 0 ]; then echo "Warning: the temporary directory could not be removed." fi fi exit 0 irstlm-6.00.05/scripts/build-lm.sh000077500000000000000000000210021263213470300167230ustar00rootroot00000000000000#! /bin/bash set -m # Enable Job Control function usage() { cmnd=$(basename $0); cat< /dev/null`; gunzip=`which gunzip 2> /dev/null`; #check irstlm installation if [ ! -e $bin/dict -o ! 
-e $scr/split-dict.pl ]; then echo "$IRSTLM does not contain a proper installation of IRSTLM" exit 3 fi #default parameters logfile=/dev/null tmpdir=stat_$$ order=3 parts=3 inpfile=""; outfile="" verbose=""; smoothing="witten-bell"; prune=""; prune_thr_str=""; boundaries=""; dictionary=""; uniform="-f=y"; backoff="" while [ "$1" != "" ]; do case $1 in -i | --InputFile ) shift; inpfile=$1; ;; -o | --OutputFile ) shift; outfile=$1; ;; -n | --NgramSize ) shift; order=$1; ;; -k | --Parts ) shift; parts=$1; ;; -d | --Dictionary ) shift; dictionary="-sd=$1"; ;; -s | --LanguageModelType ) shift; smoothing=$1; ;; -f | --PruneFrequencyThreshold ) shift; prune_thr_str="--PruneFrequencyThreshold=$1"; ;; -p | --PruneSingletons ) prune='--prune-singletons'; ;; -l | --LogFile ) shift; logfile=$1; ;; -t | --TmpDir ) shift; tmpdir=$1; ;; -u | --uniform ) uniform=' '; ;; -b | --boundaries ) boundaries='--cross-sentence'; ;; -v | --verbose ) verbose='--verbose'; ;; -h | -? | --help ) usage; exit 0; ;; * ) usage; exit 1; esac shift done case $smoothing in witten-bell) smoothing="--witten-bell"; ;; kneser-ney) ## kneser-ney still accepted for back-compatibility, but mapped into shift-beta smoothing="--shift-beta"; ;; improved-kneser-ney) ## improved-kneser-ney still accepted for back-compatibility, but mapped into improved-shift-beta smoothing="--improved-shift-beta"; ;; shift-beta) smoothing="--shift-beta"; ;; improved-shift-beta) smoothing="--improved-shift-beta"; ;; stupid-backoff) smoothing="--stupid-backoff"; backoff="--backoff" ;; *) echo "wrong smoothing setting; '$smoothing' does not exist"; exit 4 esac echo "LOGFILE:$logfile" if [ $verbose ] ; then echo inpfile='"'$inpfile'"' outfile=$outfile order=$order parts=$parts tmpdir=$tmpdir prune=$prune smoothing=$smoothing dictionary=$dictionary verbose=$verbose prune_thr_str=$prune_thr_str >> $logfile 2>&1 fi if [ ! "$inpfile" -o ! 
"$outfile" ] ; then usage exit 5 fi if [ -e $outfile ]; then echo "Output file $outfile already exists! either remove or rename it." exit 6 fi if [ -e $logfile -a $logfile != "/dev/null" -a $logfile != "/dev/stdout" ]; then echo "Logfile $logfile already exists! either remove or rename it." exit 7 fi echo "BIS LOGFILE:$logfile" >> $logfile 2>&1 #check tmpdir tmpdir_created=0; if [ ! -d $tmpdir ]; then echo "Temporary directory $tmpdir does not exist" >> $logfile 2>&1 echo "creating $tmpdir" >> $logfile 2>&1 mkdir -p $tmpdir tmpdir_created=1 else echo "Cleaning temporary directory $tmpdir" >> $logfile 2>&1 rm $tmpdir/* 2> /dev/null if [ $? != 0 ]; then echo "Warning: some temporary files could not be removed" >> $logfile 2>&1 fi fi echo "Extracting dictionary from training corpus" >> $logfile 2>&1 $bin/dict -i="$inpfile" -o=$tmpdir/dictionary $uniform -sort=no 2> $logfile echo "Splitting dictionary into $parts lists" >> $logfile 2>&1 $scr/split-dict.pl --input $tmpdir/dictionary --output $tmpdir/dict. 
--parts $parts >> $logfile 2>&1 echo "Extracting n-gram statistics for each word list" >> $logfile 2>&1 echo "Important: dictionary must be ordered according to order of appearance of words in data" >> $logfile 2>&1 echo "used to generate n-gram blocks, so that sub language model blocks results ordered too" >> $logfile 2>&1 for sdict in $tmpdir/dict.*;do sdict=`basename $sdict` echo "Extracting n-gram statistics for $sdict" >> $logfile 2>&1 if [ $smoothing = "--shift-beta" -o $smoothing = "--improved-shift-beta" ]; then additional_parameters="-iknstat=$tmpdir/ikn.stat.$sdict" else additional_parameters="" fi $bin/ngt -i="$inpfile" -n=$order -gooout=y -o="$gzip -c > $tmpdir/ngram.${sdict}.gz" -fd="$tmpdir/$sdict" $dictionary $additional_parameters >> $logfile 2>&1 & #$bin/ngt -i="$inpfile" -n=$order -gooout=y -o="$gzip -c > $tmpdir/ngram.${sdict}.gz" -fd="$tmpdir/$sdict" $dictionary -iknstat="$tmpdir/ikn.stat.$sdict" >> $logfile 2>&1 & #else #$bin/ngt -i="$inpfile" -n=$order -gooout=y -o="$gzip -c > $tmpdir/ngram.${sdict}.gz" -fd="$tmpdir/$sdict" $dictionary >> $logfile 2>&1 & #fi done # Wait for all parallel jobs to finish while [ 1 ]; do fg 2> /dev/null; [ $? 
== 1 ] && break; done echo "Estimating language models for each word list" >> $logfile 2>&1 for sdict in `ls $tmpdir/dict.*` ; do sdict=`basename $sdict` echo "Estimating language models for $sdict" >> $logfile 2>&1 if [ $smoothing = "--shift-beta" -o $smoothing = "--improved-shift-beta" ]; then additional_smoothing_parameters="cat $tmpdir/ikn.stat.dict.*" additional_parameters="$backoff" else additional_smoothing_parameters="" additional_parameters="" fi $scr/build-sublm.pl $verbose $prune $prune_thr_str $smoothing "$additional_smoothing_parameters" --size $order --ngrams "$gunzip -c $tmpdir/ngram.${sdict}.gz" -sublm $tmpdir/lm.$sdict $additional_parameters >> $logfile 2>&1 & #if [ $smoothing = "--shift-beta" -o $smoothing = "--improved-shift-beta" ]; then #$scr/build-sublm.pl $verbose $prune $prune_thr_str $smoothing "cat $tmpdir/ikn.stat.dict.*" --size $order --ngrams "$gunzip -c $tmpdir/ngram.${sdict}.gz" -sublm $tmpdir/lm.$sdict $backoff >> $logfile 2>&1 & #else #$scr/build-sublm.pl $verbose $prune $prune_thr_str $smoothing --size $order --ngrams "$gunzip -c $tmpdir/ngram.${sdict}.gz" -sublm $tmpdir/lm.$sdict >> $logfile 2>&1 & #fi done # Wait for all parallel jobs to finish while [ 1 ]; do fg 2> /dev/null; [ $? == 1 ] && break; done echo "Merging language models into $outfile" >> $logfile 2>&1 $scr/merge-sublm.pl --size $order --sublm $tmpdir/lm.dict -lm $outfile $backoff >> $logfile 2>&1 echo "Cleaning temporary directory $tmpdir" >> $logfile 2>&1 rm $tmpdir/* 2> /dev/null if [ $tmpdir_created -eq 1 ]; then echo "Removing temporary directory $tmpdir" >> $logfile 2>&1 rmdir $tmpdir 2> /dev/null if [ $? != 0 ]; then echo "Warning: the temporary directory could not be removed." >> $logfile 2>&1 fi fi exit 0 irstlm-6.00.05/scripts/build-sublm.pl000077500000000000000000000414051263213470300174470ustar00rootroot00000000000000#! 
/usr/bin/perl #***************************************************************************** # IrstLM: IRST Language Model Toolkit # Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #****************************************************************************** #first pass: read dictionary and generate 1-grams #second pass: #for n=2 to N # foreach n-1-grams # foreach n-grams with history n-1 # compute smoothing statistics # store successors # compute back-off probability # compute smoothing probability # write n-1 gram with back-off prob # write all n-grams with smoothed probability use strict; use Getopt::Long "GetOptions"; use File::Basename; my $gzip=`which gzip 2> /dev/null`; my $gunzip=`which gunzip 2> /dev/null`; chomp($gzip); chomp($gunzip); my $cutoffword=""; #special word for Google 1T-ngram cut-offs my $cutoffvalue=39; #cut-off threshold for Google 1T-ngram cut-offs #set defaults for optional parameters my ($verbose,$size,$ngrams,$sublm)=(0, 0, undef, undef); my ($witten_bell,$good_turing,$shift_beta,$improved_shift_beta,$stupid_backoff)=(0, 0, "", "", ""); my ($witten_bell_flag,$good_turing_flag,$shift_beta_flag,$improved_shift_beta_flag,$stupid_backoff_flag)=(0, 0, 0, 0, 0); my 
($freqshift,$prune_singletons,$prune_thr_str,$cross_sentence)=(0, 0, "", 0); my $help = 0; $help = 1 unless &GetOptions('size=i' => \$size, 'freq-shift=i' => \$freqshift, 'ngrams=s' => \$ngrams, 'sublm=s' => \$sublm, 'witten-bell' => \$witten_bell, 'good-turing' => \$good_turing, 'shift-beta=s' => \$shift_beta, 'improved-shift-beta=s' => \$improved_shift_beta, 'stupid-backoff' => \$stupid_backoff, 'prune-singletons' => \$prune_singletons, 'pft|PruneFrequencyThreshold=s' => \$prune_thr_str, 'cross-sentence' => \$cross_sentence, 'h|help' => \$help, 'verbose' => \$verbose); if ($help || !$size || !$ngrams || !$sublm) { my $cmnd = basename($0); print "\n$cmnd - estimates single LMs\n", "\nUSAGE:\n", " $cmnd [options]\n", "\nOPTIONS:\n", " --size maximum n-gram size for the language model\n", " --ngrams input file or command to read the ngram table\n", " --sublm output file prefix to write the sublm statistics \n", " --freq-shift (optional) value to be subtracted from all frequencies\n", " --witten-bell (optional) use Witten-Bell linear smoothing (default) \n", " --shift-beta (optional) use Shift-Beta smoothing with statistics in \n", " --improved-shift-beta (optional) use Improved Shift-Beta smoothing with statistics in , similar to Improved Kneser Ney but without corrected counts\n", " --good-turing (optional) use Good-Turing linear smoothing\n", " --stupid-backoff (optional) use Stupid-Backoff smoothing\n", " --prune-singletons (optional) remove n-grams occurring once, for n=3,4,5,... 
(disabled by default)\n", " -pft, --PruneFrequencyThreshold (optional) pruning frequency threshold for each level; comma-separated list of values; (default is \"0,0,...,0\", for all levels)\n", " --cross-sentence (optional) include cross-sentence bounds (disabled by default)\n", " --verbose (optional) print debugging info\n", " -h, --help (optional) print these instructions\n", "\n"; exit(1); } $good_turing_flag = 1 if ($good_turing); die "build-sublm: This LM is no more supported\n\n" if ($good_turing_flag==1); $witten_bell_flag = 1 if ($witten_bell); $shift_beta_flag = 1 if ($shift_beta); $stupid_backoff_flag = 1 if ($stupid_backoff); $improved_shift_beta_flag = 1 if ($improved_shift_beta); $witten_bell = $witten_bell_flag = 1 if ($witten_bell_flag + $shift_beta_flag + $improved_shift_beta_flag + $stupid_backoff_flag) == 0; print STDERR "build-sublm: size=$size ngrams=$ngrams sublm=$sublm witten-bell=$witten_bell shift-beta=$shift_beta improved-shift-beta=$improved_shift_beta stupid-backoff=$stupid_backoff prune-singletons=$prune_singletons cross-sentence=$cross_sentence PruneFrequencyThreshold=$prune_thr_str\n" if $verbose; die "build-sublm: choose only one smoothing method\n" if ($witten_bell_flag + $shift_beta_flag + $improved_shift_beta_flag + $stupid_backoff_flag) > 1; die "build-sublm: value of --size must be larger than 0\n" if $size<1; my @pruneFreqThr=(); my $i=0; while ($i<=$size){ $pruneFreqThr[$i++]=0; } print STDERR "Pruning frequency threshold values:$prune_thr_str\n" if ($verbose); my @v=split(/,/,$prune_thr_str); $i=0; while ($i=$size){ print STDERR "too many pruning frequency threshold values; kept the first values and skipped the others\n" if ($verbose); last; }; } $i=1; while ($i<=$size){ if ($pruneFreqThr[$i] < $pruneFreqThr[$i-1]){ $pruneFreqThr[$i]=$pruneFreqThr[$i-1]; print STDERR "the value of the pruning frequency threshold for level $i has been adjusted to value $pruneFreqThr[$i]\n" if ($verbose); } $i++; } if ($verbose){ $i=0; while 
($i<=$size){ print STDERR "pruneFreqThr[$i]=$pruneFreqThr[$i]\n"; $i++; } } my $log10=log(10.0); #service variable to convert log into log10 my $oldwrd=""; #variable to check if 1-gram changed my @cnt=(); #counter of n-grams my $totcnt=0; #total counter of n-grams my ($ng,@ng); #read ngrams my $ngcnt=0; #store ngram frequency my $n; print STDERR "Collecting 1-gram counts\n" if $verbose; open(INP,"$ngrams") || open(INP,"$ngrams|") || die "cannot open $ngrams\n"; open(GR,"|$gzip -c >${sublm}.1gr.gz") || die "cannot create ${sublm}.1gr.gz\n"; while ($ng=) { chomp($ng); @ng=split(/[ \t]+/,$ng); $ngcnt=(pop @ng) - $freqshift; # warn "ng: |@ng| ngcnt:$ngcnt\n"; if ($oldwrd ne $ng[0]) { # warn "$totcnt,$oldwrd,$ng[0]\n" if $oldwrd ne ''; printf (GR "%s\t%s\n",$totcnt,$oldwrd) if $oldwrd ne ''; $totcnt=0;$oldwrd=$ng[0]; } #update counter $totcnt+=$ngcnt; } printf GR "%s\t%s\n",$totcnt,$oldwrd; close(INP); close(GR); my (@h,$h,$hpr); #n-gram history my (@dict,$code); #sorted dictionary of history successors my ($diff,$singlediff,$diff1,$diff2,$diff3); #different successors of history my (@n1,@n2,@n3,@n4,@uno3); #IKN: n-grams occurring once or twice ... my (@beta,$beta); #IKN: n-grams occurring once or twice ... 
my $locfreq; #collect global statistics for (Improved) Shift-Beta smoothing if ($shift_beta_flag || $improved_shift_beta_flag) { my $statfile=$shift_beta || $improved_shift_beta; print STDERR "load \& merge IKN statistics from $statfile \n" if $verbose; open(IKN,"$statfile") || open(IKN,"$statfile|") || die "cannot open $statfile\n"; while () { my($lev,$n1,$n2,$n3,$n4,$uno3)=$_=~/level: (\d+) n1: (\d+) n2: (\d+) n3: (\d+) n4: (\d+) unover3: (\d+)/; $n1[$lev]+=$n1;$n2[$lev]+=$n2;$n3[$lev]+=$n3;$n4[$lev]+=$n4;$uno3[$lev]+=$uno3; print STDERR "from $statfile level $lev: n1:$n1 n2:$n2 n3:$n3 n4:$n4 uno3:$uno3\n"; print STDERR "level $lev: n1[$lev]:$n1[$lev] n3[$lev]:$n2[$lev] n3[$lev]:$n3[$lev] n4[$lev]:$n4[$lev] uno3[$lev]:$uno3[$lev]\n"; } if ($verbose){ for (my $lev=1;$lev<=$#n1;$lev++) { print STDERR "level $lev: n1[$lev]:$n1[$lev] n3[$lev]:$n2[$lev] n3[$lev]:$n3[$lev] n4[$lev]:$n4[$lev] uno3[$lev]:$uno3[$lev]\n"; } } close(IKN); } print STDERR "Computing n-gram probabilities:\n" if $verbose; foreach ($n=2;$n<=$size;$n++) { $code=-1;@cnt=(); @dict=(); $totcnt=0;$diff=0; $singlediff=1; $diff1=0; $diff2=0; $diff3=0; $oldwrd=""; #compute smothing statistics my (@beta,$beta); if ($stupid_backoff_flag) { $beta=0.4; print STDERR "Stupid-Backoff smoothing: beta $n: $beta\n" if $verbose; } if ($shift_beta_flag) { if ($n1[$n]==0 || $n2[$n]==0) { print STDERR "Error in Shift-Beta smoothing statistics: resorting to Witten-Bell\n" if $verbose; $beta=0; } else { $beta=$n1[$n]/($n1[$n] + 2 * $n2[$n]); print STDERR "Shift-Beta smoothing: beta $n: $beta\n" if $verbose; } } if ($improved_shift_beta_flag) { my $Y=$n1[$n]/($n1[$n] + 2 * $n2[$n]); if ($n3[$n] == 0 || $n4[$n] == 0 || $n2[$n] <= $n3[$n] || $n3[$n] <= $n4[$n]) { print STDERR "Warning: higher order count-of-counts are wrong\n" if $verbose; print STDERR "Fixing this problem by resorting only on the lower order count-of-counts\n" if $verbose; $beta[1] = $Y; $beta[2] = $Y; $beta[3] = $Y; } else { $beta[1] = 1 - 2 * $Y * 
$n2[$n] / $n1[$n]; $beta[2] = 2 - 3 * $Y * $n3[$n] / $n2[$n]; $beta[3] = 3 - 4 * $Y * $n4[$n] / $n3[$n]; } print STDERR "Improved-Shift-Beta smoothing: level:$n beta[1]:$beta[1] beta[2]:$beta[2] beta[3]:$beta[3]\n" if $verbose; } open(HGR,"$gunzip -c ${sublm}.".($n-1)."gr.gz |") || die "cannot open ${sublm}.".($n-1)."gr.gz\n"; open(INP,"$ngrams") || open(INP,"$ngrams |") || die "cannot open $ngrams\n"; open(GR,"| $gzip -c >${sublm}.${n}gr.gz"); open(NHGR,"| $gzip -c > ${sublm}.".($n-1)."ngr.gz") || die "cannot open ${sublm}.".($n-1)."ngr.gz"; my $ngram; my ($reduced_h, $reduced_ng) = ("", ""); $ng=; chomp($ng); @ng=split(/[ \t]+/,$ng); $ngcnt=(pop @ng) - $freqshift; $h=; chomp($h); @h=split(/[ \t]+/,$h); $hpr=shift @h; $reduced_ng=join(" ",@ng[0..$n-2]); $reduced_h=join(" ",@h[0..$n-2]); @cnt=(); @dict=(); $code=-1; $totcnt=0; $diff=0; $singlediff=0; $diff1=0; $diff2=0; $diff3=0; $oldwrd=""; do{ #load all n-grams starting with history h, and collect useful statistics while ($reduced_h eq $reduced_ng){ #must be true the first time! if ($oldwrd ne $ng[$n-1]) { #could this be otherwise? 
[Marcello 22/5/09] $oldwrd=$ng[$n-1]; ++$code; } $dict[$code]=$ng[$n-1]; $cnt[$code]+=$ngcnt; $totcnt+=$ngcnt; $ng=; if (defined($ng)){ chomp($ng); @ng=split(/[ \t]+/,$ng);$ngcnt=(pop @ng) - $freqshift; $reduced_ng=join(" ",@ng[0..$n-2]); } else{ last; } } $diff=scalar(@cnt); for (my $c=0;$c1 && $dict[$c] eq $cutoffword) { # in google n-grams #find estimates for remaining diff and singlediff #proportional estimate $diff--; #remove cutoffword my $concentration=1.0-($diff-1)/$totcnt; my $mass=1; #$totcnt/($totcnt+$ngcnt); my $index=(1-($concentration * $mass))/(1-1/$cutoffvalue) + (1/$cutoffvalue); my $cutoffdiff=int($ngcnt * $index); $cutoffdiff=1 if $cutoffdiff==0; print STDERR "diff $diff $totcnt cutofffreq $ngcnt -- cutoffdiff: $cutoffdiff\n"; print STDERR "concentration:",$concentration," mass:", $mass,"\n"; $diff+=$cutoffdiff; } } if ($improved_shift_beta) { for (my $c=0;$c<=$code;$c++) { $diff1++ if $cnt[$c]==1; $diff2++ if $cnt[$c]==2; $diff3++ if $cnt[$c]>=3; } } #print smoothed probabilities my $boprob=0; #accumulate pruned probabilities my $prob=0; my $boprob_correction=0; #prob for the correction due to singleton pruning if ($totcnt>0){ for (my $c=0;$c<=$code;$c++) { $ngram=join(" ",$reduced_h,$dict[$c]); print STDERR "totcnt:$totcnt diff:$diff singlediff:$singlediff\n" if $totcnt+$diff+$singlediff==0; if ($shift_beta && $beta>0) { $prob=($cnt[$c]-$beta)/$totcnt; } elsif ($improved_shift_beta) { my $b=($cnt[$c]>= 3? 
$beta[3]:$beta[$cnt[$c]]); $prob=($cnt[$c] - $b)/$totcnt; } elsif ($stupid_backoff) { $prob=$cnt[$c]/$totcnt; } else { ### other smoothing types, like Witten-Bell $prob=$cnt[$c]/($totcnt+$diff); } ## skip n-grams containing OOV ## if (&containsOOV($ngram)){ print STDERR "ngram:|$ngram| contains OOV --> hence skip\n"; next; } ## skip also n-grams containing eos symbols not at the final ## if (&CrossSentence($ngram)){ print STDERR "ngram:|$ngram| is Cross Sentence --> hence skip\n"; next; } #rm singleton n-grams for (n>=3), if flag is active #rm n-grams (n>=2) containing cross-sentence boundaries, if flag is not active #rm n-grams containing or except for 1-grams #warn "considering $size $n |$ngram|\n"; if (($prune_singletons && $n>=3 && $cnt[$c]==1) || (!$cross_sentence && &CrossSentence($ngram)) || (&containsOOV($dict[$c])) || ($n>=2 && &containsOOV($h)) || ($dict[$c] eq $cutoffword) ) { $boprob+=$prob; if ($n<$size) { #output this anyway because it will be an history for n+1 printf GR "%f\t%s %s\n",-10000,$reduced_h,$dict[$c]; } } else { if ($cnt[$c] > $pruneFreqThr[$n]){ # print unpruned n-1 gram my $logp=log($prob)/$log10; printf(GR "%f\t%s %s\n",($logp>0?0:$logp),$reduced_h,$dict[$c]); }else{ if ($n<$size) { #output this anyway because it will be an history for n+1 printf GR "%f\t%s %s\n",-10000,$reduced_h,$dict[$c]; } } } } }else{ $boprob=0; } if (($prune_singletons && $n>=3)){ if ($shift_beta && $beta>0) { # correction due to singleton pruning $boprob_correction += (1.0-$beta) * $singlediff / $totcnt; } elsif ($improved_shift_beta) { # correction due to singleton pruning $boprob_correction += (1-$beta[1]) * $singlediff / $totcnt; } elsif ($stupid_backoff) { # correction due to singleton pruning $boprob_correction += $singlediff/($totcnt); } else { # correction due to singleton pruning $boprob_correction += $singlediff/($totcnt+$diff); } } else{ $boprob_correction = 0; } $boprob=$boprob_correction; #rewrite history including back-off weight #check if history 
has to be pruned out if ($hpr==-10000) { #skip this history } elsif ($shift_beta && $beta>0) { print STDERR "wrong division: considering rewriting history --- h:|$h| --- hpr=$hpr --- totcnt:$totcnt -- denumerator:",($totcnt),"\n" if $totcnt==0; my $lambda=$beta * $diff/$totcnt; my $logp=log($boprob+$lambda)/$log10; printf NHGR "%s\t%f\n",$h,($logp>0?0:$logp); } elsif ($improved_shift_beta) { print STDERR "wrong division: considering rewriting history --- h:|$h| --- hpr=$hpr --- totcnt:$totcnt -- denumerator:",($totcnt),"\n" if $totcnt==0; my $lambda=($beta[1] * $diff1 + $beta[2] * $diff2 + $beta[3] * $diff3)/$totcnt; my $logp=log($boprob+$lambda)/$log10; printf NHGR "%s\t%f\n",$h,($logp>0?0:$logp); } elsif ($stupid_backoff) { print STDERR "wrong division: considering rewriting history --- h:|$h| --- hpr=$hpr --- totcnt:$totcnt -- denumerator:",($totcnt),"\n" if $totcnt==0; my $lambda=$beta; my $logp=log($lambda)/$log10; printf NHGR "%s\t%f\n",$h,($logp>0?0:$logp); } else { print STDERR "wrong division: considering rewriting history --- h:|$h| --- hpr=$hpr --- totcnt:$totcnt diff:$diff -- denumerator:",($totcnt+$diff),"\n" if $totcnt+$diff==0; my $lambda=$diff/($totcnt+$diff); my $logp=log($boprob+$lambda)/$log10; printf NHGR "%s\t%f\n",$h,($logp>0?0:$logp); } #reset smoothing statistics $code=-1;@cnt=(); @dict=(); $totcnt=0;$diff=0;$singlediff=0;$oldwrd="";$diff1=0;$diff2=0;$diff3=0;$locfreq=0; #read next history $h=; if (defined($h)){ chomp($h); @h=split(/[ \t]+/,$h); $hpr=shift @h; $reduced_h=join(" ",@h[0..$n-2]); }else{ die "ERROR: Something could be wrong: history are terminated before ngrams!" 
if defined($ng); } }until (!defined($ng)); #n-grams are over close(HGR); close(INP); close(GR); close(NHGR); rename("${sublm}.".($n-1)."ngr.gz","${sublm}.".($n-1)."gr.gz"); } #check if n-gram contains cross-sentence boundaries sub CrossSentence(){ my ($ngram) = @_; if ($ngram=~/<\/s> /i) { #if occurs not only in the last place print STDERR "check CrossSentence ngram:|$ngram| is CrossSentence\n" if $verbose; return 1; } return 0; } #check if n-gram contains OOV sub containsOOV(){ my ($ngram) = @_; if ($ngram=~//i){ print STDERR "check containsOOV ngram:|$ngram| contains OOV\n" if $verbose; return 1; } return 0; } irstlm-6.00.05/scripts/goograms2ngrams.pl000077500000000000000000000113121263213470300203320ustar00rootroot00000000000000#! /usr/bin/perl #***************************************************************************** # IrstLM: IRST Language Model Toolkit # Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
# You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #****************************************************************************** #transforms google n-grams into real n-grams so that counts are #consistent with respect to lower order n-grams use strict; use Getopt::Long "GetOptions"; my $gzip=`which gzip 2> /dev/null`; my $gunzip=`which gunzip 2> /dev/null`; chomp($gzip); chomp($gunzip); my $cutoffword=""; #special word for Google 1T-ngram cut-offs my $blocksize=10000000; #this is the blocksize of produced n-grams my $from=2; #starting n-gram level my($help,$verbose,$maxsize,$googledir,$ngramdir)=(); $help=1 unless &GetOptions('maxsize=i' => \$maxsize, 'startfrom=i' => \$from, 'googledir=s' => \$googledir, 'ngramdir=s' => \$ngramdir, 'h|help' => \$help, 'verbose' => \$verbose); if ($help || !$maxsize || !$googledir || !$ngramdir ) { my $cmnd = "goograms2ngrams.pl"; print "\n$cmnd - transforms google n-grams into real n-grams so that\n", " counts are consistent with respect to lower order n-grams\n", "\nUSAGE:\n", " $cmnd [options]\n", "\nOPTIONS:\n", " --maxsize maximum n-gram level of conversion\n", " --startfrom skip initial levels if already available (default 2)\n", " --googledir directory containing the google-grams dirs (1gms,2gms,...)\n", " --ngramdir directory where to write the n-grams \n", " --verbose (optional) very talktive output\n", " -h, --help (optional) print these instructions\n", "\n"; exit(1); } warn "goograms2ngrams: maxsize $maxsize from $from googledir $googledir ngramdir $ngramdir \n" if $verbose; die "goograms2ngrams: value of --maxsize must be between 2 and 5\n" if $maxsize<2 || $maxsize>5; die "goograms2ngrams: cannot find --googledir $googledir \n" if ! -d $googledir; die "goograms2ngrams: cannot find --ngramdir $ngramdir \n" if ! 
-d $ngramdir; my ($n,$hgrams,$ggrams,$ngrams)=(); my ($ggr,$hgr,$hgrcnt,$ggrcnt,$totggrcnt)=(); my (@ggr,@hgr)=(); foreach ($n=$from;$n<=$maxsize;$n++){ my $counter=0; warn "Converting google-$n-grams into $n-gram\n"; $hgrams=($n==2?"${googledir}/1gms/vocab.gz":"${ngramdir}/".($n-1)."grams-*.gz"); open(HGR,"$gunzip -c $hgrams |") || die "cannot open $hgrams\n"; $ggrams="${googledir}/".($n)."gms/".($n)."gm-*"; open(GGR,"$gunzip -c $ggrams |") || die "cannot open $ggrams\n"; my $id = sprintf("%04d", 0); $ngrams="${ngramdir}/".($n)."grams-${id}.gz"; next if -e $ngrams; #go to next step if file exists already; open(NGR,"|$gzip -c > $ngrams ") || die "cannot open $ngrams\n"; chop($ggr=); @ggr=split(/[ \t]/,$ggr);$ggrcnt=(pop @ggr); #warn "ggr: ",$ggrcnt," ",join(" ",@ggr[0..$n-1]),"\n"; while ($hgr=){ $counter++; printf(STDERR ".") if ($counter % 1000000)==0; chop($hgr); @hgr=split(/[ \t]/,$hgr); $hgrcnt=(pop @hgr); #warn "hgr: ",$hgrcnt," ",join(" ",@hgr[0..$n-2]),"\n"; if (join(" ",@hgr[0..$n-2]) eq join(" ",@ggr[0..$n-2])){ $totggrcnt=0; do{ $totggrcnt+=$ggrcnt; print NGR join(" ",@ggr[0..$n-1])," ",$ggrcnt,"\n"; chop($ggr=);@ggr=split(/[ \t]/,$ggr);$ggrcnt=(pop @ggr); }until (join(" ",@hgr[0..$n-2]) ne join(" ",@ggr[0..$n-2])); if ($hgrcnt > $totggrcnt){ #warn "difference: $hgrcnt $totggrcnt =",$hgrcnt-$totggrcnt,"\n"; print NGR join(" ",@hgr[0..$n-1])," ",$cutoffword," ",$hgrcnt-$totggrcnt,"\n"; } } else{ #warn "fully pruned context: $hgr\n"; print NGR join(" ",@hgr[0..$n-1])," ",$cutoffword," ",$hgrcnt,"\n"; } if (($counter % $blocksize)==0){ close(NGR); my $id = sprintf("%04d", int($counter / $blocksize)); $ngrams="${ngramdir}/".($n)."grams-${id}.gz"; open(NGR,"|$gzip -c > $ngrams ") || die "cannot open $ngrams\n"; } } close(HGR);close(NGR);close(GGR); } irstlm-6.00.05/scripts/lm-stat.pl000077500000000000000000000035061263213470300166110ustar00rootroot00000000000000#! 
/usr/bin/perl #***************************************************************************** # IrstLM: IRST Language Model Toolkit # Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #****************************************************************************** #computes LM statistics over a string use strict; use Getopt::Long "GetOptions"; use File::Basename; my ($help,$lm,$txt)=(); $help=1 unless &GetOptions('lm=s' => \$lm, 'txt=s' => \$txt, 'h|help' => \$help,); if ($help || !$lm || !$txt) { my $cmnd = basename($0); print "\n$cmnd - computes LM statistics over a string\n", "\nUSAGE:\n", " $cmnd [options]\n", "\nOPTIONS:\n", " --lm language model file \n", " --txt text file\n", " -h, --help (optional) print these instructions\n", "\n"; exit(1); } if (!$ENV{IRSTLM}){ print "Set environment variable IRSTLM with path to the irstlm directory\n"; exit(1); } my $clm="$ENV{IRSTLM}/bin/compile-lm"; open (OUT,"$clm $lm --eval $txt --debug 1|"); while (){ print; } close(OUT); irstlm-6.00.05/scripts/mdtsel.sh000077500000000000000000000121331263213470300165130ustar00rootroot00000000000000#! 
/bin/bash #/****************************************************************************** #IrstLM: IRST Language Model Toolkit #Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy # #This library is free software; you can redistribute it and/or #modify it under the terms of the GNU Lesser General Public #License as published by the Free Software Foundation; either #version 2.1 of the License, or (at your option) any later version. # #This library is distributed in the hope that it will be useful, # # #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #Lesser General Public License for more details. # #You should have received a copy of the GNU Lesser General Public #License along with this library; if not, write to the Free Software #Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # #******************************************************************************/ # mdtsel.sh # by M. Federico # Copyright Marcello Federico, Fondazione Bruno Kessler, 2012 set -m #enable job control usage() { cmnd=$(basename $0); cat << EOF $cmnd - performs data selection assuming an indomain corpus and a very large out of domain corpus. USAGE: $cmnd [options] DESCRIPTION. This command performs data selection assuming an indomain corpus and a very large out of domain corpus. Both corpora must contain one sentence in each line delimited with and . The process produces a file of scores. OPTIONS: -h Show this message -v Verbose -i In-domain corpus -o Out-domain corpus -s Scores output file -x Out-domain lines are indexed -w Temporary work directory (default /tmp) -j Number of jobs (default 6) -m Data selection model (1 or 2, default 2) -f Word frequency threshold (default 2) -n Ngram order to use (n>=1 default 3) -d Vocabulary size upper bound (default 10000000) -c Cross-validation parameter (cv>=1, default 1) EOF } if [ ! 
$IRSTLM ]; then echo "Set IRSTLM environment variable with path to irstlm" exit 2 fi #paths to scripts and commands in irstlm scr=$IRSTLM/bin bin=$IRSTLM/bin #check irstlm installation if [ ! -e $bin/dtsel ]; then echo "$IRSTLM does not contain a proper installation of IRSTLM" exit 3 fi #default parameters indomfile=""; outdomfile=""; scoresfile=""; workdir=/tmp logfile="/dev/null" jobs=6 model=2 minfreq=2 ngramorder=3 cv=1 dub=10000000 verbose=""; useindex=0; while getopts “hvi:o:s:l:w:j:m:f:n:c:d:x:” OPTION do case $OPTION in h) usage exit 1 ;; v) verbose="--verbose"; ;; i) indfile=$OPTARG ;; o) outdfile=$OPTARG ;; s) scorefile=$OPTARG ;; l) logfile=$OPTARG ;; w) workdir=$OPTARG ;; j) jobs=$OPTARG ;; m) model=$OPTARG ;; n) ngramorder=$OPTARG ;; f) minfreq=$OPTARG; ;; d) dub=$OPTARG; ;; x) useindex=$OPTARG; ;; ?) usage exit 1 ;; esac done if [ $verbose ];then echo indfile= $indfile outdfile= $outdfile scorefile= $scorefile useindex= $useindex echo logfile= $logfile workdir= $workdir echo jobs= $jobs model= $model ngramorder= $ngramorder minfreq= $minfreq dub=$dub fi if [ ! $indfile -o ! $outdfile -o ! $scorefile ]; then usage exit 5 fi if [ -e $scorefile ]; then echo "Output score file $outfile already exists! either remove or rename it." exit 6 fi if [ $logfile != "/dev/null" -a $logfile != "/dev/stdout" -a -e $logfile ]; then echo "Logfile $logfile already exists! either remove or rename it." exit 7 fi workdir_created=0 if [ ! 
-d $workdir ]; then echo "Temporary work directory $workdir does not exist"; echo "creating $workdir"; mkdir -p $workdir; workdir_created=1; fi #get process id to name process specific temporary files pid=$$ #compute size of out domain corpus and block size of split lines=`wc -l < $outdfile` size=`echo "( $lines + 1000 )" / $jobs | bc` #to avoid any small block #perform split split -l $size $outdfile $workdir/dtsel${pid}-files- for file in $workdir/dtsel${pid}-files-* do echo $file ( \ $bin/dtsel -x=$useindex -i=$indfile -o=$file -s=${file}.scores -n=$ngramorder -dub=$dub -f=$minfreq -m=$model ; \ cat ${file}.scores | perl -pe '/^nan /1000 /g;' | sort -g > ${file}.scores.tmp ; \ mv ${file}.scores.tmp ${file}.scores \ ) >>$logfile 2>&1 & done # Wait for all parallel jobs to finish while [ 1 ]; do fg 2> /dev/null; [ $? == 1 ] && break; done sort -g -m $workdir/dtsel${pid}-files-*.scores > $scorefile rm $workdir/dtsel${pid}-files-* if [ $workdir_created == 1 ] then rmdir $workdir fi irstlm-6.00.05/scripts/merge-sublm.pl000077500000000000000000000140011263213470300174370ustar00rootroot00000000000000#! /usr/bin/perl #***************************************************************************** # IrstLM: IRST Language Model Toolkit # Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
# You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #****************************************************************************** #merge prefix LMs into one single file use strict; use Getopt::Long "GetOptions"; use File::Basename; my ($help,$lm,$size,$sublm,$backoff)=(); $help=0; $backoff=0; &GetOptions('size=i' => \$size, 'lm=s' => \$lm, 'sublm=s' => \$sublm, 'backoff' => \$backoff, 'h|help' => \$help); if ($help || !$size || !$lm || !$sublm) { my $cmnd = basename($0); print "\n$cmnd - merge single LMs\n", "\nUSAGE:\n", " $cmnd [options]\n", "\nOPTIONS:\n", " --size maximum n-gram size for the language model\n", " --sublm path identifying all input prefix sub LMs\n", " --lm name of the output LM file (will be gzipped)\n", " --backoff (optional) create a backoff LM, output is directly in ARPA format (default is false, i.e. 
iARPA format) \n", " -h, --help (optional) print these instructions\n", "\n"; exit(1); } my $gzip=`which gzip 2> /dev/null`; my $gunzip=`which gunzip 2> /dev/null`; chomp($gzip); chomp($gunzip); warn "merge-sublm.pl --size $size --sublm $sublm --lm $lm --backoff $backoff\n"; warn "Compute total sizes of n-grams\n"; my @size=(); #number of n-grams for each level my $tot1gr=0; #total frequency of 1-grams my $unk=0; #frequency of my $pr; #probability of 1-grams my (@files,$files); #sublm files for a given n-gram size for (my $n=1;$n<=$size;$n++){ @files=map { glob($_) } "${sublm}*.${n}gr*"; $files=join(" ",@files); $files || die "cannot find sublm files\n"; warn "join files $files\n"; if ($n==1){ open(INP,"$gunzip -c $files|") || die "cannot open $files\n"; while(my $line = ){ $size[$n]++; chomp($line); warn "there is an empty line in any of these files ($files); this should not happen\n" if $line =~ /^$/; my @words = split(/[ \t]+/,$line); #cut down counts for sentence initial $words[0]=1 if $words[1]=~//; #there could be more independent words #generated by ngt with -sd option $size[$n]-- if $unk && $words[1] eq ""; $unk+=$words[0] if $words[1]=~//i; $tot1gr+=$words[0]; } close(INP); if ($unk==0){ warn "implicitely add word to counters\n"; $tot1gr+=$size[$n]; #equivalent to WB smoothing $size[$n]++; } }else{ for (my $j=0;$j wc$$") or die; open(INP,"wc$$") || die "cannot open wc$$\n"; my $wc = ; chomp($wc); $size[$n] += $wc; close(INP); unlink("wc$$"); } } warn "n:$n size:$size[$n] unk:$unk\n"; } warn "Merge all sub LMs\n"; $lm.=".gz" if $lm!~/.gz$/; open(LM,"|$gzip -c > $lm") || die "Cannot open $lm\n"; warn "Write LM Header\n"; if ($backoff){ printf LM "ARPA\n\n"; } else{ printf LM "iARPA\n\n"; } printf LM "\\data\\\n"; for (my $n=1;$n<=$size;$n++){ printf LM "ngram $n=\t$size[$n]\n"; } printf LM "\n"; close(LM); warn "Writing LM Tables\n"; for (my $n=1;$n<=$size;$n++){ warn "Level $n\n"; @files=map { glob($_) } "${sublm}*.${n}gr*"; $files=join(" ",@files); warn 
"input from: $files\n"; if ($n==1){ open(INP,"$gunzip -c $files|") || die "cannot open $files\n"; open(LM,"|$gzip -c >> $lm"); printf LM "\\$n-grams:\n"; while(my $line = ){ chomp($line); warn "there is an empty line in any of these files ($files); this should not happen\n" if $line =~ /^$/; #lowercase some expressions of google n-grams $line=~s///g; $line=~s/<\/S>/<\/s>/g; $line=~s///g; my @words = split(/[ \t]+/,$line); #always print unk a the eqnd next if $words[1]=~//i; #cut down counts for sentence initial $words[0]=1 if $words[1]=~//i; #apply witten-bell smoothing on 1-grams $pr=(log($words[0]+1)-log($tot1gr+$size[1]))/log(10.0); shift @words; printf LM "%f\t%s\t%f\n",$pr,$words[0],$words[1]; } close(INP); #print final #witten-bell smoothing of probability if ($unk){ $pr=(log($unk+1)-log($tot1gr+$size[1]))/log(10.0); }else{ $pr=(log($size[1]-1+1)-log($tot1gr+$size[1]))/log(10.0); } printf LM "%f \n",$pr; close(LM); }else{ open(LM,"|$gzip -c >> $lm"); printf LM "\\$n-grams:\n"; close(LM); for (my $j=0;$j> $lm") or die; } } } open(LM,"|$gzip -c >> $lm") || die "Cannot open $lm\n"; printf LM "\\end\\\n"; close(LM); sub safesystem { print STDERR "Executing: @_\n"; system(@_); if ($? == -1) { print STDERR "Failed to execute: @_\n $!\n"; exit(1); } elsif ($? & 127) { printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n", ($? & 127), ($? & 128) ? 'with' : 'without'; exit(1); } else { my $exitcode = $? >> 8; print STDERR "Exit code: $exitcode\n" if $exitcode; return ! $exitcode; } } irstlm-6.00.05/scripts/ngram-split.pl000077500000000000000000000051311263213470300174610ustar00rootroot00000000000000#! 
/usr/bin/perl #***************************************************************************** # IrstLM: IRST Language Model Toolkit # Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #****************************************************************************** #re-segment google n-gram count files into files so that #n-grams starting with a given word (prefix) are all #contained in one file. 
use Getopt::Long "GetOptions"; use File::Basename; my ($help,$lm,$size,$sublm)=(); $help=1 unless &GetOptions('h|help' => \$help); if ($help) { my $cmnd = basename($0); print "\n$cmnd - re-segment google n-gram count files so that n-grams\n", " starting with a given word (prefix) are all contained in one file\n", "\nUSAGE:\n", " $cmnd [options] []\n", "\nDESCRIPTION:\n", " Input is expected on STDIN.\n", " prefix of files to be created\n", "\nOPTIONS:\n", " -h, --help (optional) print these instructions\n", "\n"; exit(1); } $max_pref=10000; #number of prefixes to be put in one file $max_ngram=5000000;#number of n-grams to be put in one file $file_cnt=0; #counter of files $pref_cnt=0; #counter of prefixes in the current file $ngram_cnt=0; #counter of n-gram in the current file $path=($ARGV[0]?$ARGV[0]:"goong"); #path of files to be created $gzip=`which gzip`; chomp($gzip); $pwrd=""; open(OUT,sprintf("|$gzip -c > %s.%04d.gz",$path,++$file_cnt)); while ($ng=){ ($wrd)=$ng=~/^([^ ]+)/; #warn "$wrd\n"; if ($pwrd ne $wrd){ $pwrd=$wrd; if ($file_pref>$max_pref || $ngram_cnt>$max_ngram){ warn "it's time to change file\n"; close(OUT); open(OUT,sprintf("|$gzip -c > %s.%04d.gz",$path,++$file_cnt)); $pref_cnt=$ngram_cnt=0; } else{ $pref_cnt++; } } print OUT $ng; $ngram_cnt++; } close(OUT); irstlm-6.00.05/scripts/other/000077500000000000000000000000001263213470300160055ustar00rootroot00000000000000irstlm-6.00.05/scripts/other/beautify.perl000077500000000000000000000012561263213470300205100ustar00rootroot00000000000000#!/usr/bin/perl my $os=`uname | tr -d '\012'`; my $dir=`dirname $0 | tr -d '\012'`; my $astyle="$dir/astyle_$os"; opendir(DIR,".") or die "Can't open the current directory: $!\n"; # read file/directory names in that directory into @names @names = readdir(DIR) or die "Unable to read current dir:$!\n"; foreach $name (@names) { next if ($name eq "."); # skip the current directory entry next if ($name eq ".."); # skip the parent directory entry if (-d $name){ # is this 
a directory? `$astyle --style="k&r" -s2 --recursive -v "$name/*.h" "$name/*.cpp"`; next; # can skip to the next name in the for loop } } closedir(DIR); irstlm-6.00.05/scripts/plsa.sh000077500000000000000000000162501263213470300161660ustar00rootroot00000000000000#! /bin/bash set -m # Enable Job Control function usage() { cmnd=$(basename $0); cat< /dev/null`; gunzip=`which gunzip 2> /dev/null`; #default parameters tmpdir=stat_$$ data="" topics=100 splits=5 iter=20 prunefreq=2 spectopics=0 logfile="/dev/null" verbose="" unigram="" outtopic="" dict="dictionary" forcedict="" model="" txtfile="/dev/null" while getopts "hvfc:m:r:k:i:n:t:d:p:s:l:u:o:" OPTION do case $OPTION in h) usage exit 0 ;; v) verbose="--verbose"; ;; c) data=$OPTARG ;; m) model=$OPTARG ;; r) txtfile=$OPTARG ;; k) splits=$OPTARG ;; i) iter=$OPTARG ;; t) tmpdir=$OPTARG ;; d) dict=$OPTARG ;; f) forcedict="TRUE" ;; p) prunefreq=$OPTARG ;; s) spectopics=$OPTARG ;; n) topics=$OPTARG ;; l) logfile=$OPTARG ;; u) unigram=$OPTARG ;; o) outtopic=$OPTARG ;; ?) usage exit 1 ;; esac done if [ $verbose ]; then echo data=$data model=$model topics=$topics iter=$iter dict=$dict logfile="/dev/stdout" fi if [ "$unigram" == "" -a "$outtopic" == "" ]; then #training branch if [ ! "$data" -o ! "$model" ]; then usage exit 1 fi if [ -e $logfile -a $logfile != "/dev/null" -a $logfile != "/dev/stdout" ]; then echo "Logfile $logfile already exists! either remove or rename it." exit 1 fi if [ -e $model ]; then echo "Output file $model already exists! either remove or rename it." >> $logfile 2>&1 exit 1 fi if [ -e $txtfile -a $txtfile != "/dev/null" ]; then echo "Output file $txtfile already exists! either remove or rename it." >> $logfile 2>&1 exit 1 fi if [ -e $logfile -a $logfile != "/dev/null" -a $logfile != "/dev/stdout" ]; then echo "Logfile $logfile already exists! either remove or rename it." >> $logfile 2>&1 exit 1 fi #if [ ! -e "$data" ]; then #echo "Cannot find data $data." >> $logfile 2>&1 #exit 1; #fi if [ ! 
-e $dict ]; then echo extract dictionary >> $logfile $bin/dict -i="$data" -o=$dict -PruneFreq=$prunefreq -f=y >> $logfile 2>&1 if [ `head -n 1 $dict| cut -d " " -f 3` -lt 10 ]; then echo "Dictionary contains errors" exit 2; fi else echo "Warning: dictionary file already exists." >> $logfile 2>&1 if [ $forcedict ]; then echo "Warning: authorization to use it." >> $logfile 2>&1 else echo "No authorization to use it (see option -f)." >> $logfile 2>&1 exit 1 fi fi #check tmpdir tmpdir_created=0; if [ ! -d $tmpdir ]; then echo "Creating temporary working directory $tmpdir" >> $logfile 2>&1 mkdir -p $tmpdir; tmpdir_created=1; else echo "Cleaning temporary directory $tmpdir" >> $logfile 2>&1 rm $tmpdir/* 2> /dev/null if [ $? != 0 ]; then echo "Warning: some temporary files could not be removed" >> $logfile 2>&1 fi fi ##### echo split documents >> $logfile 2>&1 $bin/plsa -c="$data" -d=$dict -b=$tmpdir/data -sd=$splits >> $logfile 2>&1 machine=`uname -s` if [ $machine == "Darwin" ] ; then splitlist=`jot - 1 $splits` iterlist=`jot - 1 $iter` else splitlist=`seq 1 1 $splits` iterlist=`seq 1 1 $iter` fi #rm $tmpdir/Tlist for sp in $splitlist ; do echo $tmpdir/data.T.$sp >> $tmpdir/Tlist 2>&1; done #rm $model for it in $iterlist ; do echo "ITERATION $it" >> $logfile 2>&1 for sp in $splitlist ; do (date; echo it $it split $sp )>> $logfile 2>&1 $bin/plsa -c=$tmpdir/data.$sp -d=$dict -st=$spectopics -hf=$tmpdir/data.H.$sp -tf=$tmpdir/data.T.$sp -wf=$model -m=$model -t=$topics -it=1 -tit=$it >> $logfile 2>&1 & done while [ 1 ]; do fg 2> /dev/null; [ $? 
== 1 ] && break; done (date; echo recombination ) >> $logfile 2>&1 $bin/plsa -ct=$tmpdir/Tlist -c="$data" -d=$dict -hf=$tmpdir/data.H -m=$model -t=$topics -it=1 -txt=$txtfile >> $logfile 2>&1 done (date; echo End of training) >> $logfile 2>&1 echo "Cleaning temporary directory $tmpdir" >> $logfile 2>&1 rm $tmpdir/* 2> /dev/null if [ $tmpdir_created -eq 1 ]; then echo "Removing temporary directory $tmpdir" >> $logfile 2>&1 rmdir $tmpdir 2> /dev/null if [ $? != 0 ]; then echo "Warning: the temporary directory could not be removed." >> $logfile 2>&1 fi fi exit 0 #testing branch else if [ ! $model -o ! -e $model ]; then echo "Need to specify existing model" >> $logfile 2>&1 exit 1; fi if [ ! $dict -o ! -e $dict ]; then echo "Need to specify dictionary file of the model" >> $logfile 2>&1 exit 1; fi if [ $unigram ]; then $bin/plsa -inf="$data" -d=$dict -m=$model -hf=hfff.out$$ -t=$topics -it=$iter -wof=$unigram >> $logfile 2>&1 rm hfff.out$$ else #topic distribution #check tmpdir tmpdir_created=0; if [ ! -d $tmpdir ]; then echo "Creating temporary working directory $tmpdir" >> $logfile 2>&1 mkdir -p $tmpdir; tmpdir_created=1; else echo "Cleaning temporary directory $tmpdir" >> $logfile 2>&1 rm $tmpdir/* 2> /dev/null if [ $? != 0 ]; then echo "Warning: some temporary files could not be removed" >> $logfile 2>&1 fi fi ##### echo split documents >> $logfile 2>&1 $bin/plsa -c="$data" -d=$dict -b=$tmpdir/data -sd=$splits >> $logfile 2>&1 machine=`uname -s` if [ $machine == "Darwin" ] ; then splitlist=`jot - 1 $splits` else splitlist=`seq 1 1 $splits` fi #rm $tmpdir/Tlist for sp in $splitlist ; do echo $tmpdir/data.T.$sp >> $tmpdir/Tlist 2>&1; done #rm $model for sp in $splitlist ; do (date; echo split $sp )>> $logfile 2>&1 $bin/plsa -inf=$tmpdir/data.$sp -d=$dict -hf=$tmpdir/data.H.$sp -m=$model -t=$topics -it=$iter -tof=$tmpdir/topic.$sp >> $logfile 2>&1 & done while [ 1 ]; do fg 2> /dev/null; [ $? 
== 1 ] && break; done (date; echo recombination ) >> $logfile 2>&1 echo > $outtopic for sp in $splitlist ; do #makes sure that 1 < 2 < ... < 11 ... cat $tmpdir/topic.$sp >> $outtopic done (date; echo End of training) >> $logfile 2>&1 echo "Cleaning temporary directory $tmpdir" >> $logfile 2>&1 rm $tmpdir/* 2> /dev/null if [ $tmpdir_created -eq 1 ]; then echo "Removing temporary directory $tmpdir" >> $logfile 2>&1 rmdir $tmpdir 2> /dev/null if [ $? != 0 ]; then echo "Warning: the temporary directory could not be removed." >> $logfile 2>&1 fi fi fi fi exit 0 irstlm-6.00.05/scripts/qplsa.sh000077500000000000000000000101741263213470300163460ustar00rootroot00000000000000#! /bin/bash sDir=$(cd $(dirname $0) ; /bin/pwd) #Task data bin=/hltsrv0/federico/plsa/bin wdir=/panfs/panfem/test-hlt/federico/plsa/CC #/hltsrv0/federico/plsa/ted ldir=/scratch/federico data=doc_en.00.bin dict=ted.dict #ted-en topics=150 iter=2 prunefreq=5 spectopics=500 Tlist=$wdir/tlist splits=2 model=model.$splits txtfile=Wfile.$splits #parameters numSlots=1-3 ram=10G qL=bld.q,bld-ib.q #Preparation phase jName=PLSA.PRE #preparation ends when tlist is prepared rm $Tlist jName=PLSA.TRAIN range=`yes | head -n $splits | awk '{printf("%02d ",a);a++}'` iter=`seq 1 1 $iter| tr "\012" " "` qsub -cwd -N $jName -j y -q $qL -l mf=$ram -t $numSlots -o $wdir/log -S /bin/bash < $wdir/monitor.\$SGE_TASK_ID echo if [[ ! 
-d $ldir ]]; then mkdir $ldir; fi ################################# if [ \$me -eq \$lastid ] then (echo master starts ; uname -n ; date) > $wdir/monitor.\$SGE_TASK_ID #prepare Tlist file rm $Tlist for sp in $range; do echo $wdir/$data.T.\$sp >> $Tlist done #tell slaves to copy and binarize data for sp in $range; do (echo cp $wdir/$data.\$sp.gz $wdir/$dict $ldir \; ;\ echo $bin/plsa -c=\"gunzip -c $ldir/$data.\$sp.gz\" -d=$ldir/$dict -b=$ldir/$data.\$sp \; ;\ echo rm $ldir/$data.\$sp.gz ) > $wdir/taskfor_\$sp touch $wdir/doit_\$sp done (echo master prepare ; date) >> $wdir/monitor.\$SGE_TASK_ID #wait that all have finished while ls $wdir/doit_* &> /dev/null; do sleep 1; done (echo master start iteration ; date) >> $wdir/monitor.\$SGE_TASK_ID for it in $iter; do for sp in $range; do (echo master iteration \$it split \$sp; date) >> $wdir/monitor.\$SGE_TASK_ID echo tell slave to run an iteration (echo if [[ -e $wdir/$model ]] \; then cp $wdir/$model $ldir/$model \; fi ; echo $bin/plsa -c=$ldir/$data.\$sp -d=$ldir/$dict -st=$spectopics -hf=$ldir/$data.H.\$sp -tf=$ldir/$data.T.\$sp -wf=$ldir/$model -m=$ldir/$model -t=$topics -it=1 -tit=\$it ;\ echo cp $ldir/$data.T.\$sp $wdir ) > $wdir/taskfor_\$sp touch $wdir/taskfor_\$sp touch $wdir/doit_\$sp done (echo master start waiting \$it ; date) >> $wdir/monitor.\$SGE_TASK_ID #echo wait that all have finished while ls $wdir/doit_* &> /dev/null; do (echo master waiting \$it ; date) >> $wdir/monitor.\$SGE_TASK_ID ls $wdir/doit_* sleep 1; done (echo master start recombination \$it ; date) >> $wdir/monitor.\$SGE_TASK_ID echo recombine $bin/plsa -ct=$Tlist -c=dummy -d=$wdir/$dict -m=$wdir/$model -t=$topics -it=1 -txt=$wdir/$txtfile done (echo master tells slaves to remove data; date) >> $wdir/monitor.\$SGE_TASK_ID echo tell slaves to remove their local data for sp in $range; do echo rm $ldir/$dict $ldir/$data.\$sp $ldir/$model > $wdir/taskfor_\$sp touch $wdir/taskfor_\$sp touch $wdir/doit_\$sp done echo wait that all have finished 
(echo master waits for slaves; date) >> $wdir/monitor.\$SGE_TASK_ID while ls $wdir/doit_* &> /dev/null; do sleep 1; done echo tell slaves to exit (echo master tells slaves to exit; date) >> $wdir/monitor.\$SGE_TASK_ID for sp in $range; do echo exit > $wdir/taskfor_\$sp touch $wdir/taskfor_\$sp touch $wdir/doit_\$sp done (echo master waits for slaves; date) >> $wdir/monitor.\$SGE_TASK_ID while ls $wdir/doit_* &> /dev/null; do sleep 1; done (echo master ends; date) >> $wdir/monitor.\$SGE_TASK_ID rm $wdir/$data.H* $wdir/$model $wdir/$data.T* $wdir/taskfor_* ############################# else (echo slave starts ; uname -n ; date) > $wdir/monitor.\$SGE_TASK_ID while : do (echo slave \$me iteration \$it waits for job; echo \$cmd; date) >> $wdir/monitor.\$SGE_TASK_ID touch $wdir if [[ -e $wdir/doit_\$me ]]; then cmd=\`cat $wdir/taskfor_\$me\` (echo slave \$me starts executing; echo \$cmd; date) >> $wdir/monitor.\$SGE_TASK_ID if [[ \$cmd == *exit* ]]; then #rm before cmd execution rm $wdir/doit_\$me >& /dev/null exit 0 else /bin/sh $wdir/taskfor_\$me #rm after cmd execution rm $wdir/doit_\$me >& /dev/null fi (echo slave ended executing; date) >> $wdir/monitor.\$SGE_TASK_ID fi sleep 1 done fi (echo end;uname -a; date) >> $wdir/monitor.\$SGE_TASK_ID exit 0 EOF irstlm-6.00.05/scripts/rm-start-end.sh000066400000000000000000000006471263213470300175440ustar00rootroot00000000000000#! /bin/bash function usage() { cmnd=$(basename $0); cat<&2; exit 0; ;; esac done sed 's///g' | sed 's/<\/s>//g' | sed 's/^ *//' | sed 's/ *$//' | sed '/^$/d' irstlm-6.00.05/scripts/sort-lm.pl000077500000000000000000000065721263213470300166330ustar00rootroot00000000000000#! 
/usr/bin/perl #***************************************************************************** # IrstLM: IRST Language Model Toolkit # Copyright (C) 2010 Marcello Federico, FBK-irst Trento, Italy # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #****************************************************************************** #Sorts n-grams of an ARPA file according to lexicographic order. #Inverted sorting option is propedeutic to building a binary #lmtable with compile-lm with n-grams stored in reverted order. 
use strict; use Getopt::Long "GetOptions"; my ($help,$ilm,$olm,$inv,$tmpdir)=(); $help=0; $ilm="/dev/stdin"; $olm="/dev/stdout"; my $tmpdir="$ENV{TMP}"; &GetOptions('ilm=s' => \$ilm, 'olm=s' => \$olm, 'tmpdir=s' => \$tmpdir, 'inv' => \$inv, 'help' => \$help,); if ($help || !$ilm || !$olm){ print "sort-lm.pl [--ilm ] [--olm ] [--inv]\n", "-ilm input ARPA LM filename (default /dev/stdin)\n", "-olm output ARPA LM filename (default /dev/stdout)\n", "-tmpdir temporary directory for sorting (default is the enivronment variable TMP\n", "-inv inverted n-gram sort for compile-lm\n", "-help print these instructions\n"; exit(1); } warn "temporary directory for sorting is $tmpdir\n"; my $order=0; my $sortcmd=""; $ENV{'LC_ALL'}='C'; open (INP, "< $ilm") || die "cannot open input LM file: $ilm\n"; open (OUT, "> $olm") || die "cannot open output LM file: $olm\n"; warn "reading from standard input\n" if $ilm eq "/dev/stdin"; warn "writing to standard output\n" if $olm eq "/dev/stdout"; $_=; #sanity check die "Error: input cannot be an intermediate iARPA file. 
First convert it to ARPA format with compile-lm.\n" if $_=~/^iARPA/; my $isQuantized=0; $isQuantized=1 if $_=~/^qARPA/; while(!/^\\end\\/){ if (($order)=$_=~/^\\(\d+)-grams:/){ print(OUT $_);$_=; if ($isQuantized){ print(OUT $_); chop $_;#print centers my $centers=$_; $_=; warn "skip $centers centers\n"; for (my $c=1;$c<=$centers;$c++){ print(OUT $_);$_=; } } #sort command #$sortcmd="sort -b"; #does not seem to work properly $sortcmd="sort --temporary-directory=$tmpdir"; if ($inv){ warn "inverted sorting of $order-grams\n"; for (my $n=$order;$n>0;$n--){ $sortcmd.=" -k ".($n+1).",".($n+1); } }else{ warn "direct sorting of $order-grams\n"; for (my $n=1;$n<=$order;$n++){ $sortcmd.=" -k ".($n+1).",".($n+1); } } close(OUT);open (OUT,"|$sortcmd >> $olm"); do{ print(OUT $_);$_=; }until (/^\\/ || /^\n/); close(OUT); open(OUT, ">> $olm"); } else{ print(OUT $_);$_=; } } print(OUT $_); close(INP); close(OUT); irstlm-6.00.05/scripts/split-dict.pl000077500000000000000000000103101263213470300172730ustar00rootroot00000000000000#! /usr/bin/perl #***************************************************************************** # IrstLM: IRST Language Model Toolkit # Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
# You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #****************************************************************************** #usage: #split-dict.pl #It splits the dictionary into dictionaries #(named , ... ) #splitting is balanced wrt to frequency of the dictionary #if not available a frequency of 1 is considered use strict; use Getopt::Long "GetOptions"; use File::Basename; my ($help,$input,$output,$parts)=(); $help=1 unless &GetOptions('input=s' => \$input, 'output=s' => \$output, 'parts=i' => \$parts, 'h|help' => \$help,); if ($help || !$input || !$output || !$parts) { my $cmnd = basename($0); print "\n$cmnd - splits a dictionary into frequency-balanced partitions\n", "\nUSAGE:\n", " $cmnd [options]\n", "\nDESCRIPTION:\n", " $cmnd splits a dictionary into frequency-balanced partitions.\n", " The dictionary must be generated with IRSTLM command dict.\n", " If dictionary does not contain frequencies, then a frequency 1 is\n", " assumed for all words.\n", "\nOPTIONS:\n", " --input input dictionary with frequencies\n", " --output prefix of output dictionaries\n", " --parts number of partitions to create\n", " -h, --help (optional) print these instructions\n", "\n"; exit(1); } my $freqflag=0; my ($w,$f,$globf,$thr); my (@D,@F,%S,@C); open(IN,"$input"); chomp($_=); #if input is a dictionary. 
if (/^dictionary[ \t]+\d+[ \t]+\d+$/i){ my ($dummy,$size); ($dummy,$dummy,$size)=split(/[ \t]+/,$_); $freqflag=1 if /DICTIONARY/; } $globf=0; while(chomp($_=)){ if ($freqflag){ ($w,$f)=split(/[ \t]+/,$_); } else{ $w=$_; $f=1; } push @D, $w; push @F, $f; $globf+=$f; } close (IN); $thr=$globf/$parts; my $totf=0; print STDERR "Dictionary 0: (thr: $thr , $globf, $totf , $parts)\n"; my $sfx=0; my $w; for (my $i=0;$i<=$#D;$i++){ # if the remaining words are less than or equal to # the number of remaining sub-dictionaries to create # put only one word per each sub-dictionary. if (($totf>0) && ($#D+1-$i) <= ($parts-1-$sfx)){ # recompute threshold on the remaining global frequency # according to the number of remaining parts $sfx++; $globf-=$totf; $thr=($globf)/($parts-$sfx); print STDERR "Dictionary $sfx: (thr: $thr , $globf , $totf , ",($parts-$sfx),")\n"; $totf=0; } $totf+=$F[$i]; $w=$D[$i]; $S{$w}=$sfx; $C[$sfx]++; if ($totf>$thr){ # recompute threshold on the remaining global frequency # according to the number of remaining parts $sfx++; $globf-=$totf; $thr=($globf)/($parts-$sfx); print STDERR "Dictionary $sfx: (thr: $thr , $globf , $totf , ",($parts-$sfx),")\n"; $totf=0; } } my $oldsfx=-1; for (my $i=0;$i<=$#D;$i++){ $w=$D[$i]; $sfx="0000$S{$w}"; $sfx=~s/.+(\d{3})/$1/; if ($sfx != $oldsfx){ #print STDERR "opening $output$sfx\n"; close (OUT) if $oldsfx!= -1; open(OUT,">$output$sfx"); if ($freqflag){ print OUT "DICTIONARY 0 $C[$sfx]\n"; } else{ print OUT "dictionary 0 $C[$sfx]\n"; } $oldsfx=$sfx; } if ($freqflag){ print OUT "$w $F[$i]\n"; } else{ print OUT "$w\n"; } } close (OUT) if $oldsfx!= -1; my $numdict=$S{$D[$#D]}+1; die "Only $numdict dictionaries were crested instead of $parts!" if ($numdict != $parts); irstlm-6.00.05/scripts/split-ngt.sh000077500000000000000000000036441263213470300171530ustar00rootroot00000000000000#! 
/bin/bash function usage() { cmnd=$(basename $0); cat< DESCRIPTION: Input file name Partition files name prefix Order of the ngrams Number of partitions OPTIONS: -h Show this message EOF } # Parse options while getopts h OPT; do case "$OPT" in h) usage >&2; exit 0; ;; * ) usage; exit 1; ;; esac done #usage: #ngt-split.sh [options] #It creates files (named , ... ) #containing ngram statistics (of length) in Google format #These files are a partition of the whole set of ngrams basedir=$IRSTLM bindir=$basedir/bin scriptdir=$basedir/scripts unset par while [ $# -gt 0 ] do echo "$0: arg $1" par[${#par[@]}]="$1" shift done inputfile=${par[0]} outputfile=${par[1]} order=${par[2]} parts=${par[3]} dictfile=dict$$ echo "Extracting dictionary from training corpus" $bindir/dict -i="$inputfile" -o=$dictfile -f=y -sort=n echo "Splitting dictionary into $parts lists" $scriptdir/split-dict.pl --input $dictfile --output ${dictfile}. --parts $parts rm $dictfile echo "Extracting n-gram statistics for each word list" echo "Important: dictionary must be ordered according to order of appearance of words in data" echo "used to generate n-gram blocks, so that sub language model blocks results ordered too" for d in `ls ${dictfile}.*` ; do w=`echo $d | perl -pe 's/.+(\.[0-9]+)$/$1/i'` w="$outputfile$w" sdict=`basename $sdict` echo "Extracting n-gram statistics for $sdict" echo "$bindir/ngt -i="$inputfile" -n=$order -gooout=y -o=$w -fd=$d > /dev/null" $bindir/ngt -n=$order -gooout=y -o=$w -fd=$d -i="$inputfile" > /dev/null rm $d done exit 0 irstlm-6.00.05/scripts/wrapper000066400000000000000000000002041263213470300162630ustar00rootroot00000000000000#! 
/bin/sh #set machine type for compilation MY_ARCH=`uname -m` name=`basename $0` dir=`dirname $0`"/$MY_ARCH" $dir/$name "$@" irstlm-6.00.05/src/000077500000000000000000000000001263213470300137645ustar00rootroot00000000000000irstlm-6.00.05/src/CMakeLists.txt000066400000000000000000000046221263213470300165300ustar00rootroot00000000000000# Set output directory SET(EXECUTABLE_OUTPUT_PATH ${CMAKE_INSTALL_PREFIX}/bin) SET(LIBRARY_OUTPUT_PATH ${CMAKE_INSTALL_PREFIX}/lib) ADD_DEFINITIONS("-D_LARGE_FILES") ADD_DEFINITIONS("-D_FILE_OFFSET_BITS=64") ADD_DEFINITIONS("-DMYCODESIZE=3") ADD_DEFINITIONS("-DDEBUG") if (TRACE_LEVEL) ADD_DEFINITIONS("-DTRACE_LEVEL=${TRACE_LEVEL}") endif() if (OPTION) ADD_DEFINITIONS("-DOPTION_${OPTION}") endif() if (SOLUTION) ADD_DEFINITIONS("-DSOLUTION_${SOLUTION}") endif() if (CXX0) MESSAGE( STATUS "HAVE_CXX0=true; hence, variable HAVE_CXX0 is set" ) SET(STD_FLAG "-std=c++0x") ADD_DEFINITIONS("-DHAVE_CXX0") else() MESSAGE( STATUS "HAVE_CXX0=false; hence, variable HAVE_CXX0 is unset" ) SET(STD_FLAG "") ADD_DEFINITIONS("-UHAVE_CXX0") endif() SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g ${STD_FLAG} -isystem/usr/include -W -Wall -ffor-scope") SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") INCLUDE_DIRECTORIES("${PROJECT_SOURCE_DIR}/src") SET( LIB_IRSTLM_SRC cmd.h cmd.c thpool.h thpool.c gzfilebuf.h index.h dictionary.h dictionary.cpp htable.h htable.cpp lmContainer.h lmContainer.cpp lmclass.h lmclass.cpp lmmacro.h lmmacro.cpp lmtable.h lmtable.cpp lmInterpolation.h lmInterpolation.cpp mempool.h mempool.cpp mfstream.h mfstream.cpp n_gram.h n_gram.cpp ngramcache.h ngramcache.cpp ngramtable.h ngramtable.cpp timer.h timer.cpp util.h util.cpp crc.h crc.cpp interplm.h interplm.cpp linearlm.h linearlm.cpp mdiadapt.h mdiadapt.cpp mixture.h mixture.cpp normcache.h normcache.cpp shiftlm.h shiftlm.cpp cplsa.h cplsa.cpp cswam.h cswam.cpp doc.h doc.cpp ) ADD_LIBRARY(irstlm STATIC ${LIB_IRSTLM_SRC}) LINK_DIRECTORIES (${LIBRARY_OUTPUT_PATH}) FOREACH(CMD dict ngt 
tlm dtsel plsa cswa compile-lm interpolate-lm prune-lm quantize-lm score-lm verify-caching) ADD_EXECUTABLE(${CMD} ${CMD}.cpp) TARGET_LINK_LIBRARIES (${CMD} irstlm -lm -lz -lpthread) ENDFOREACH() #INSTALL INCLUDE FILES FILE(GLOB includes src *.h) INSTALL(FILES ${includes} DESTINATION include) add_custom_command(OUTPUT LNS COMMAND ln -s -f -n lib lib64 WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX} COMMENT "creating link from lib to lib64") add_custom_target(LinkToLib64 ALL DEPENDS LNS) irstlm-6.00.05/src/Makefile.am000066400000000000000000000041721263213470300160240ustar00rootroot00000000000000lib_LTLIBRARIES = libirstlm.la AM_CXXFLAGS = -static -isystem/usr/include -W -Wall -ffor-scope -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES $(BOOST_CPPFLAGS) -DMYCODESIZE=3 libirstlm_ladir = ${includedir} libirstlm_la_HEADERS = \ cmd.h \ thpool.h \ dictionary.h \ gzfilebuf.h \ htable.h \ index.h \ lmContainer.h \ lmclass.h \ lmmacro.h \ lmtable.h \ lmInterpolation.h \ mempool.h \ mfstream.h \ n_gram.h \ ngramcache.h \ ngramtable.h \ timer.h \ util.h \ crc.h \ interplm.h \ linearlm.h \ mdiadapt.h \ mixture.h \ normcache.h \ shiftlm.h \ cplsa.h \ cswam.h \ doc.h libirstlm_la_SOURCES = \ cmd.c \ thpool.c \ dictionary.cpp \ htable.cpp \ lmContainer.cpp \ lmclass.cpp \ lmmacro.cpp \ lmtable.cpp \ lmInterpolation.cpp \ mempool.cpp \ mfstream.cpp \ n_gram.cpp \ ngramcache.cpp \ ngramtable.cpp \ timer.cpp \ util.cpp \ crc.cpp \ interplm.cpp \ linearlm.cpp \ mdiadapt.cpp \ mixture.cpp \ normcache.cpp \ shiftlm.cpp \ cplsa.cpp \ cswam.cpp \ doc.cpp CLEANFILES = $(BUILT_SOURCES) libirstlm_la_LIBADD = $(BOOST_LDFLAGS) $(BOOST_THREAD_LIB) LDADD = -lirstlm -lpthread DEPENDENCIES = libirstlm.la bin_PROGRAMS = dict ngt dtsel compile-lm interpolate-lm prune-lm quantize-lm prune-lm score-lm tlm plsa verify-caching cswa dict_SOURCES = dict.cpp dict_DEPENDENCIES = $(DEPENDENCIES) ngt_SOURCES = ngt.cpp ngt_DEPENDENCIES = $(DEPENDENCIES) dtsel_SOURCES = dtsel.cpp dtsel_DEPENDENCIES = $(DEPENDENCIES) 
compile_lm_SOURCES = compile-lm.cpp compile_lm_DEPENDENCIES = $(DEPENDENCIES) interpolate_lm_SOURCES = interpolate-lm.cpp interpolate_lm_DEPENDENCIES = $(DEPENDENCIES) prune_lm_SOURCES = prune-lm.cpp prune_lm_DEPENDENCIES = $(DEPENDENCIES) quantize_lm_SOURCES = quantize-lm.cpp quantize_lm_DEPENDENCIES = $(DEPENDENCIES) score_lm_SOURCES = score-lm.cpp score_lm_DEPENDENCIES = $(DEPENDENCIES) tlm_SOURCES = tlm.cpp tlm_DEPENDENCIES = $(DEPENDENCIES) plsa_SOURCES = plsa.cpp plsa_DEPENDENCIES = $(DEPENDENCIES) verify_caching_SOURCES = verify-caching.cpp verify_caching_DEPENDENCIES = $(DEPENDENCIES) cswa_SOURCES = cswa.cpp cswa_DEPENDENCIES = $(DEPENDENCIES) irstlm-6.00.05/src/cmd.c000066400000000000000000000642301263213470300147000ustar00rootroot00000000000000/****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #ifndef _WIN32_WCE #include #endif #include #include #include #include #if defined(_WIN32) #include #else #include #endif #ifdef USE_UPIO #include "missing.h" #include "updef.h" #endif #include "cmd.h" #ifdef NEEDSTRDUP char *strdup(const char *s); #endif #define LINSIZ 10240 static Bool_T BoolEnum[] = { { (char*)"FALSE", FALSE}, { (char*)"TRUE", TRUE}, { (char*)"false", FALSE}, { (char*)"true", TRUE}, { (char*)"0", FALSE}, { (char*)"1", TRUE}, { (char*)"NO", FALSE}, { (char*)"YES", TRUE}, { (char*)"No", FALSE}, { (char*)"Yes", TRUE}, { (char*)"no", FALSE}, { (char*)"yes", TRUE}, { (char*)"N", FALSE}, { (char*)"Y", TRUE}, { (char*)"n", FALSE}, { (char*)"y", TRUE}, END_ENUM }; static char *GetLine(FILE *fp, int n, char *Line), **str2array(char *s, char *sep); static int str2narray(int type, char *s, char *sep, void **a); static int Scan(char *ProgName, Cmd_T *cmds, char *Line), SetParam(Cmd_T *cmd, char *s), SetEnum(Cmd_T *cmd, char *s), SetBool(Cmd_T *cmd, char *s), SetFlag(Cmd_T *cmd, char *s), SetSubrange(Cmd_T *cmd, char *s), SetStrArray(Cmd_T *cmd, char *s), SetNumArray(Cmd_T *cmd, char *s), SetGte(Cmd_T *cmd, char *s), SetLte(Cmd_T *cmd, char *s), CmdError(char *opt), EnumError(Cmd_T *cmd, char *s), BoolError(Cmd_T *cmd, char *s), SubrangeError(Cmd_T *cmd, int n), GteError(Cmd_T *cmd, int n), LteError(Cmd_T *cmd, int n), PrintParam(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp), PrintParams4(int TypeFlag, int ValFlag, int MsgFlag, FILE *fp), FreeParam(Cmd_T *cmd), PrintEnum(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp), PrintBool(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp), PrintFlag(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp), PrintStrArray(Cmd_T *cmd, int 
TypeFlag, int ValFlag, FILE *fp), PrintIntArray(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp), PrintDblArray(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp), BuildCmdList(Cmd_T **cmdp, int *cmdSz, char *ParName, va_list args), StoreCmdLine(char *s); static Cmd_T *pgcmds = 0; static int pgcmdN = 0; static int pgcmdSz = 0; static char *SepString = " \t\r\n"; static char *ProgName = 0; static char **CmdLines = 0; static int CmdLinesSz = 0, CmdLinesL = 0; int DeclareParams(char *ParName, ...) { va_list args; va_start(args, ParName); pgcmdN = BuildCmdList(&pgcmds, &pgcmdSz, ParName, args); va_end(args); return 0; } int GetParams(int *n, char ***a, char *DefCmd) { char *Line; int i, argc = *n; char **argv = *a, *s, *p, *defCmd; #if defined(MSDOS)||defined(_WIN32) char *dot = 0; #endif extern char **environ; if(!(Line=malloc(LINSIZ))) { fprintf(stderr, "GetParams(): Unable to alloc %d bytes\n", LINSIZ); exit(IRSTLM_CMD_ERROR_MEMORY); } for(ProgName=*argv+strlen(*argv); ProgName-->*argv && *ProgName!='/' && *ProgName!='\\';); ++ProgName; #if defined(MSDOS)||defined(_WIN32) if((dot=strchr(ProgName, '.'))) *dot=0; #endif --argc; ++argv; for(i=0; environ[i]; i++) { if(strncmp(environ[i], "cmd_", 4)) continue; strcpy(Line, environ[i]+4); if(!(p=strchr(Line, '='))) continue; *p=' '; StoreCmdLine(Line); if(Scan(ProgName, pgcmds, Line)) CmdError(environ[i]); } if((defCmd=DefCmd?(DefCmd=strdup(DefCmd)):0)) { defCmd += strspn(defCmd, "\n\r"); } for(;;) { char *CmdFile = NULL; if(argc && argv[0][0]=='-' && argv[0][1]=='=') { CmdFile = argv[0]+2; ++argv; --argc; defCmd = 0; } if(!CmdFile) { int i; char ch; if(!defCmd||!(i=strcspn(defCmd, "\n\r"))) break; ch = defCmd[i]; defCmd[i] = 0; CmdFile = defCmd; defCmd += i+!!ch; defCmd += strspn(defCmd, "\n\r"); } int IsPipe = !strncmp(CmdFile, "@@", 2); FILE *fp = IsPipe ? popen(CmdFile+2, "r") : strcmp(CmdFile, "-") ? 
fopen(CmdFile, "r") : stdin; if(!fp) { if(defCmd) continue; fprintf(stderr, "Unable to open command file %s\n", CmdFile); exit(IRSTLM_CMD_ERROR_IO); } while(GetLine(fp, LINSIZ, Line) && strcmp(Line, "\\End")) { StoreCmdLine(Line); if(Scan(ProgName, pgcmds, Line)) CmdError(Line); } if(fp!=stdin) { if(IsPipe) pclose(fp); else fclose(fp); } } if(DefCmd) free(DefCmd); // while(argc && **argv=='-'){ while(argc){ if (**argv=='-'){ s=strchr(*argv, '='); //allows double dash for parameters int dash_number=1; if (*(*argv+1) == '-') dash_number++; if (s){ *s = ' '; if((p=strchr(*argv+dash_number, '.'))&&pName; cmd++) n += !!cmd->ArgStr; a[0] = calloc(n, sizeof(char*)); for(n=0, cmd=pgcmds; cmd->Name; cmd++) { if(!cmd->ArgStr) continue; a[0][n] = malloc(strlen(cmd->Name)+strlen(cmd->ArgStr)+l+2); sprintf(a[0][n], "%s%s=%s", pfx, cmd->Name, cmd->ArgStr); ++n; } return n; } static int BuildCmdList(Cmd_T **cmdp, int *cmdSz, char *ParName, va_list args) { int j, c, cmdN=0; char *s; Cmd_T *cmd, *cmds; if(!*cmdSz) { if(!(cmds=*cmdp=malloc((1+(*cmdSz=BUFSIZ))*sizeof(Cmd_T)))) { fprintf(stderr, "BuildCmdList(): malloc() failed\n"); exit(IRSTLM_CMD_ERROR_MEMORY); } } else { for(cmds=*cmdp; cmds[cmdN].Name; ++cmdN); } while(ParName) { if(cmdN==*cmdSz) { cmds=*cmdp=realloc(cmds, (1+(*cmdSz+=BUFSIZ))*sizeof(Cmd_T)); if(!cmds) { fprintf(stderr, "BuildCmdList(): realloc() failed\n"); exit(IRSTLM_CMD_ERROR_MEMORY); } } for(j=0; jj; c--) cmds[c] = cmds[c-1]; cmd = cmds+j; cmd->Name = ParName; cmd->Type = va_arg(args, int); cmd->Val = va_arg(args, void*); cmd->Msg = 0; cmd->Flag = 0; cmd->p = 0; switch(cmd->Type&~CMDMSG) { case CMDENUMTYPE: /* get the pointer to Enum_T struct */ case CMDFLAGTYPE: cmd->p = va_arg(args, void*); break; case CMDSUBRANGETYPE: /* get the two limits */ cmd->p = (void*)calloc(2, sizeof(int)); ((int*)cmd->p)[0] = va_arg(args, int); ((int*)cmd->p)[1] = va_arg(args, int); break; case CMDGTETYPE: /* lower or upper bound */ case CMDLTETYPE: cmd->p = (void*)calloc(1, 
sizeof(int)); ((int*)cmd->p)[0] = va_arg(args, int); break; case CMDSTRARRAYTYPE: /* separator string */ cmd->p = (s=va_arg(args, char*)) ? (void*)strdup(s) : 0; break; case CMDDBLARRAYTYPE: case CMDINTARRAYTYPE: /* separator & pointer to length */ cmd->p = (void*)calloc(2, sizeof(void*)); s = va_arg(args, char*); ((char**)cmd->p)[0] = s ? strdup(s) : 0; ((int**)cmd->p)[1] = va_arg(args, int*); *((int**)cmd->p)[1] = 0; break; case CMDBOOLTYPE: cmd->p = BoolEnum; break; //cmd->p = (Bool_T*)calloc(1, sizeof(Bool_T)); // cmd->p = va_arg(args, void*); // cmd->p = BoolEnum; case CMDDOUBLETYPE: /* nothing else is needed */ case CMDFLOATTYPE: case CMDINTTYPE: case CMDSTRINGTYPE: break; default: fprintf(stderr, "%s: %s %d %s \"%s\"\n", "BuildCmdList()", "Unknown Type", cmd->Type&~CMDMSG, "for parameter", cmd->Name); exit(IRSTLM_CMD_ERROR_DATA); } if(cmd->Type&CMDMSG) { cmd->Type&=~CMDMSG; cmd->Msg = va_arg(args, char*); } cmdN++; ParName = va_arg(args, char*); } cmds[cmdN].Name = 0; return cmdN; } static int CmdError(char *opt) { fprintf(stderr, "Invalid option \"%s\"\n", opt); fprintf(stderr, "This program expects the following parameters:\n"); PrintParams4(TRUE, FALSE, TRUE, stderr); exit(IRSTLM_CMD_ERROR_DATA); return 0; } static int FreeParam(Cmd_T *cmd) { switch(cmd->Type) { case CMDBOOLTYPE2: case CMDSUBRANGETYPE: case CMDGTETYPE: case CMDLTETYPE: case CMDSTRARRAYTYPE: if(cmd->p) free(cmd->p); break; case CMDINTARRAYTYPE: case CMDDBLARRAYTYPE: if(!cmd->p) break; if(*(char**)cmd->p) free(*(char**)cmd->p); free(cmd->p); break; } return 0; } static int PrintParam(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp) { char ts[128]; *ts=0; fprintf(fp, "%4s", ""); switch(cmd->Type) { case CMDDOUBLETYPE: fprintf(fp, "%s", cmd->Name); if(TypeFlag) fprintf(fp, " [double]"); if(ValFlag) fprintf(fp, ": %22.15e", *(double*)cmd->Val); break; case CMDFLOATTYPE: fprintf(fp, "%s", cmd->Name); if(TypeFlag) fprintf(fp, " [float]"); if(ValFlag) fprintf(fp, ": %22.15e", *(float *)cmd->Val); 
break; case CMDBOOLTYPE2: case CMDBOOLTYPE: PrintBool(cmd, TypeFlag, ValFlag, fp); break; case CMDENUMTYPE: PrintEnum(cmd, TypeFlag, ValFlag, fp); break; case CMDFLAGTYPE: PrintFlag(cmd, TypeFlag, ValFlag, fp); break; case CMDINTTYPE: if(TypeFlag) sprintf(ts, " [int]"); case CMDSUBRANGETYPE: if(TypeFlag&&!*ts) sprintf(ts, " [int %d ... %d]", ((int*)cmd->p)[0], ((int*)cmd->p)[1]); case CMDGTETYPE: if(TypeFlag&&!*ts) sprintf(ts, " [int >= %d]", ((int*)cmd->p)[0]); case CMDLTETYPE: if(TypeFlag&&!*ts) sprintf(ts, " [int <= %d]", ((int*)cmd->p)[0]); fprintf(fp, "%s", cmd->Name); if(*ts) fprintf(fp, " %s", ts); if(ValFlag) fprintf(fp, ": %d", *(int*)cmd->Val); break; case CMDSTRINGTYPE: fprintf(fp, "%s", cmd->Name); if(TypeFlag) fprintf(fp, " [string]"); if(ValFlag) { if(*(char **)cmd->Val) { fprintf(fp, ": \"%s\"", *(char**)cmd->Val); } else { fprintf(fp, ": %s", "NULL"); } } break; case CMDSTRARRAYTYPE: PrintStrArray(cmd, TypeFlag, ValFlag, fp); break; case CMDINTARRAYTYPE: PrintIntArray(cmd, TypeFlag, ValFlag, fp); break; case CMDDBLARRAYTYPE: PrintDblArray(cmd, TypeFlag, ValFlag, fp); break; default: fprintf(stderr, "%s: %s %d %s \"%s\"\n", "PrintParam", "Unknown Type", cmd->Type, "for parameter", cmd->Name); exit(IRSTLM_CMD_ERROR_DATA); } fprintf(fp, ":"); // fprintf(fp, "\n"); fflush(fp); return 0; } static char * GetLine(FILE *fp, int n, char *Line) { int j, l, offs=0; for(;;) { if(!fgets(Line+offs, n-offs, fp)) { return 0; } if(Line[offs]=='#') continue; l = strlen(Line+offs)-1; Line[offs+l] = 0; for(j=offs; Line[j]&&isspace((unsigned char)Line[j]); j++,l--); if(l<1) continue; if(j > offs) { char *s = Line+offs, *q = Line+j; while((*s++=*q++)) ; } if(Line[offs+l-1]=='\\') { offs += l; Line[offs-1] = ' '; } else { break; } } return Line; } static int Scan(char *ProgName, Cmd_T *cmds, char *Line) { char *q, *p; int i, hl, HasToMatch = FALSE, c0, c; p = Line+strspn(Line, SepString); if(!(hl=strcspn(p, SepString))) return 0; if(ProgName&&(q=strchr(p, '/')) && 
q-pType==CMDENUMTYPE && cmd->Flag==1){ s=(char*) malloc(5); strcpy(s,"TRUE"); }else{ s=_s; } if (!*s || (s=='\0' && cmd->Flag==0)){ fprintf(stderr, "WARNING: No value specified for parameter \"%s\"\n", cmd->Name); return 0; } switch(cmd->Type) { case CMDDOUBLETYPE: if(sscanf(s, "%lf", (double*)cmd->Val)!=1) { fprintf(stderr, "Float value required for parameter \"%s\"\n", cmd->Name); exit(IRSTLM_CMD_ERROR_DATA); } break; case CMDFLOATTYPE: if(sscanf(s, "%f", (float*)cmd->Val)!=1) { fprintf(stderr, "Float value required for parameter \"%s\"\n", cmd->Name); exit(IRSTLM_CMD_ERROR_DATA); } break; case CMDBOOLTYPE2: case CMDBOOLTYPE: SetBool(cmd, s); break; case CMDENUMTYPE: SetEnum(cmd, s); break; case CMDFLAGTYPE: SetFlag(cmd, s); break; case CMDINTTYPE: /*They are the same when used for output, e.g. with printf, but different when used as input specifier e.g. with scanf, where %d scans an integer as a signed decimal number, but %i defaults to decimal but also allows hexadecimal (if preceded by "0x") and octal if preceded by "0". So "033" would be 27 with %i but 33 with %d. */ if(sscanf(s, "%d", (int*)cmd->Val)!=1) { fprintf(stderr, "Integer value required for parameter \"%s\"\n", cmd->Name); exit(IRSTLM_CMD_ERROR_DATA); } break; case CMDSTRINGTYPE: *(char **)cmd->Val = (strcmp(s, "") && strcmp(s, "NULL")) ? 
strdup(s) : 0; break; case CMDSTRARRAYTYPE: SetStrArray(cmd, s); break; case CMDINTARRAYTYPE: case CMDDBLARRAYTYPE: SetNumArray(cmd, s); break; case CMDGTETYPE: SetGte(cmd, s); break; case CMDLTETYPE: SetLte(cmd, s); break; case CMDSUBRANGETYPE: SetSubrange(cmd, s); break; default: fprintf(stderr, "%s: %s %d %s \"%s\"\n", "SetParam", "Unknown Type", cmd->Type, "for parameter", cmd->Name); exit(IRSTLM_CMD_ERROR_DATA); } cmd->ArgStr = strdup(s); if(!*_s && cmd->Type==CMDENUMTYPE && cmd->Flag==1){ free (s); } return 0; } static int SetBool(Cmd_T *cmd, char *s) { Bool_T *en; for(en=(Bool_T*)cmd->p; en->Name; en++) { if(*en->Name && !strcmp(s, en->Name)) { *(char*)cmd->Val = en->Idx; return 0; } } return BoolError(cmd, s); } static int SetEnum(Cmd_T *cmd, char *s) { Enum_T *en; for(en=(Enum_T*)cmd->p; en->Name; en++) { if(*en->Name && !strcmp(s, en->Name)) { *(int*)cmd->Val = en->Idx; return 0; } } return EnumError(cmd, s); } int EnumIdx(Enum_T *en, char *s) { if(en) for(; en->Name; en++) { if(*en->Name && !strcmp(s, en->Name)) return en->Idx; } return -1; } char BoolIdx(Bool_T *en, char *s) { if(en) for(; en->Name; en++) { if(*en->Name && !strcmp(s, en->Name)) return en->Idx; } return -1; } char * EnumStr(Enum_T *en, int i) { if(en) for(; en->Name; en++) if(en->Idx==i) return en->Name; return 0; } char * BoolStr(Bool_T *en, int i) { if(en) for(; en->Name; en++) if(en->Idx==i) return en->Name; return 0; } static int SetFlag(Cmd_T *cmd, char *s) { Enum_T *en; int l; for(; (l=strcspn(s, "+"))>0; s+=l,s+=!!*s) { for(en=(Enum_T*)cmd->p; en->Name&&(l!=strlen(en->Name)||strncmp(s, en->Name, l)); en++); if(!en->Name) return EnumError(cmd, s); *(int*)cmd->Val |= en->Idx; } return 0; } static int SetSubrange(Cmd_T *cmd, char *s) { int n; /*They are the same when used for output, e.g. with printf, but different when used as input specifier e.g. 
with scanf, where %d scans an integer as a signed decimal number, but %i defaults to decimal but also allows hexadecimal (if preceded by "0x") and octal if preceded by "0". So "033" would be 27 with %i but 33 with %d. */ if(sscanf(s, "%d", &n)!=1) { fprintf(stderr, "Integer value required for parameter \"%s\"\n", cmd->Name); exit(IRSTLM_CMD_ERROR_DATA); } if(n < *(int*)cmd->p || n > *((int*)cmd->p+1)) { return SubrangeError(cmd, n); } *(int*)cmd->Val = n; return 0; } static int SetGte(Cmd_T *cmd, char *s) { int n; /*They are the same when used for output, e.g. with printf, but different when used as input specifier e.g. with scanf, where %d scans an integer as a signed decimal number, but %i defaults to decimal but also allows hexadecimal (if preceded by "0x") and octal if preceded by "0". So "033" would be 27 with %i but 33 with %d. */ if(sscanf(s, "%d", &n)!=1) { fprintf(stderr, "Integer value required for parameter \"%s\"\n", cmd->Name); exit(IRSTLM_CMD_ERROR_DATA); } if(n<*(int*)cmd->p) { return GteError(cmd, n); } *(int*)cmd->Val = n; return 0; } static int SetStrArray(Cmd_T *cmd, char *s) { *(char***)cmd->Val = str2array(s, (char*)cmd->p); return 0; } static int SetNumArray(Cmd_T *cmd, char *s) { *((int**)cmd->p)[1] = str2narray(cmd->Type, s, *((char**)cmd->p), cmd->Val); return 0; } static int SetLte(Cmd_T *cmd, char *s) { int n; /*They are the same when used for output, e.g. with printf, but different when used as input specifier e.g. with scanf, where %d scans an integer as a signed decimal number, but %i defaults to decimal but also allows hexadecimal (if preceded by "0x") and octal if preceded by "0". So "033" would be 27 with %i but 33 with %d. 
*/ if(sscanf(s, "%d", &n)!=1) { fprintf(stderr, "Integer value required for parameter \"%s\"\n", cmd->Name); exit(IRSTLM_CMD_ERROR_DATA); } if(n > *(int*)cmd->p) { return LteError(cmd, n); } *(int*)cmd->Val = n; return 0; } static int EnumError(Cmd_T *cmd, char *s) { Enum_T *en; fprintf(stderr, "Invalid value \"%s\" for parameter \"%s\"\n", s, cmd->Name); fprintf(stderr, "Valid values are:\n"); for(en=(Enum_T*)cmd->p; en->Name; en++) { if(*en->Name) fprintf(stderr, " %s\n", en->Name); } fprintf(stderr, "\n"); exit(IRSTLM_CMD_ERROR_DATA); return 0; } static int BoolError(Cmd_T *cmd, char *s) { Bool_T *en; fprintf(stderr, "Invalid value \"%s\" for parameter \"%s\"\n", s, cmd->Name); fprintf(stderr, "Valid values are:\n"); for(en=(Bool_T*)cmd->p; en->Name; en++) { if(*en->Name) fprintf(stderr, " %s\n", en->Name); } fprintf(stderr, "\n"); exit(IRSTLM_CMD_ERROR_DATA); return 0; } static int GteError(Cmd_T *cmd, int n) { fprintf(stderr, "Value %d out of range for parameter \"%s\"\n", n, cmd->Name); fprintf(stderr, "Valid values must be greater than or equal to %d\n", *(int*)cmd->p); exit(IRSTLM_CMD_ERROR_DATA); return 0; } static int LteError(Cmd_T *cmd, int n) { fprintf(stderr, "Value %d out of range for parameter \"%s\"\n", n, cmd->Name); fprintf(stderr, "Valid values must be less than or equal to %d\n", *(int*)cmd->p); exit(IRSTLM_CMD_ERROR_DATA); return 0; } static int SubrangeError(Cmd_T *cmd, int n) { fprintf(stderr, "Value %d out of range for parameter \"%s\"\n", n, cmd->Name); fprintf(stderr, "Valid values range from %d to %d\n", *(int*)cmd->p, *((int*)cmd->p+1)); exit(IRSTLM_CMD_ERROR_DATA); return 0; } static int PrintEnum(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp) { Enum_T *en; fprintf(fp, "%s", cmd->Name); if(TypeFlag) { fprintf(fp, " [enum { "); char *sep=""; for(en=(Enum_T*)cmd->p; en->Name; en++) { if(*en->Name) { fprintf(fp, "%s%s", sep, en->Name); sep=", "; } } fprintf(fp, " }]"); } if(ValFlag) { for(en=(Enum_T*)cmd->p; en->Name; en++) { 
if(*en->Name && en->Idx==*(int*)cmd->Val) { fprintf(fp, ": %s", en->Name); } } } // fprintf(fp, "\n"); return 0; } static int PrintBool(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp) { Bool_T *en; fprintf(fp, "%s", cmd->Name); if(TypeFlag) { fprintf(fp, " [enum { "); char *sep=""; for(en=(Bool_T*)cmd->p; en->Name; en++) { if(*en->Name) { fprintf(fp, "%s%s", sep, en->Name); sep=", "; } } fprintf(fp, " }]"); } if(ValFlag) { for(en=(Bool_T*)cmd->p; en->Name; en++) { if(*en->Name && en->Idx==*(int*)cmd->Val) { fprintf(fp, ": %s", en->Name); } } } // fprintf(fp, "\n"); return 0; } static int PrintFlag(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp) { Enum_T *en; char *sep=""; fprintf(fp, "%s", cmd->Name); if(TypeFlag) { fprintf(fp, ": flag { "); for(en=(Enum_T*)cmd->p; en->Name; en++) { if(*en->Name) { fprintf(fp, "%s%s", sep, en->Name); sep=", "; } } fprintf(fp, " }"); } if(ValFlag) { fprintf(fp, ": "); for(en=(Enum_T*)cmd->p; en->Name; en++) { if(*en->Name && (en->Idx&*(int*)cmd->Val)==en->Idx) { fprintf(fp, "%s%s", sep, en->Name); sep="+"; } } } fprintf(fp, "\n"); return 0; } static int PrintStrArray(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp) { char *indent, **s = *(char***)cmd->Val; int l = 4+strlen(cmd->Name); fprintf(fp, "%s", cmd->Name); if(TypeFlag) { fprintf(fp, ": string array, separator \"%s\"", cmd->p?(char*)cmd->p:""); } indent = malloc(l+2); memset(indent, ' ', l+1); indent[l+1] = 0; if(ValFlag) { fprintf(fp, ": %s", s ? (*s ? 
*s++ : "NULL") : ""); if(s) while(*s) { fprintf(fp, "\n%s %s", indent, *s++); } } free(indent); fprintf(fp, "\n"); return 0; } static int PrintIntArray(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp) { char *indent; int l = 4+strlen(cmd->Name), n, *i = *(int**)cmd->Val; fprintf(fp, "%s", cmd->Name); if(TypeFlag) { fprintf(fp, ": int array, separator \"%s\"", *(char**)cmd->p?*(char**)cmd->p:""); } n = *((int**)cmd->p)[1]; indent = malloc(l+2); memset(indent, ' ', l+1); indent[l+1] = 0; if(ValFlag) { fprintf(fp, ":"); if(i&&n>0) { fprintf(fp, " %d", *i++); while(--n) fprintf(fp, "\n%s %d", indent, *i++); } } free(indent); fprintf(fp, "\n"); return 0; } static int PrintDblArray(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp) { char *indent; int l = 4+strlen(cmd->Name), n; double *x = *(double**)cmd->Val; fprintf(fp, "%s", cmd->Name); if(TypeFlag) { fprintf(fp, ": double array, separator \"%s\"", *(char**)cmd->p?*(char**)cmd->p:""); } n = *((int**)cmd->p)[1]; indent = malloc(l+2); memset(indent, ' ', l+1); indent[l+1] = 0; if(ValFlag) { fprintf(fp, ":"); if(x&&n>0) { fprintf(fp, " %e", *x++); while(--n) fprintf(fp, "\n%s %e", indent, *x++); } } free(indent); fprintf(fp, "\n"); return 0; } static char ** str2array(char *s, char *sep) { char *p, **a; int n = 0; if(!sep) sep = SepString; p = s += strspn(s, sep); if(!*p) return 0; while(*p) { p += strcspn(p, sep); p += strspn(p, sep); ++n; } a = calloc(n+1, sizeof(char*)); p = s; n = 0; while(*p) { int l = strcspn(p, sep); a[n] = malloc(l+1); memcpy(a[n], p, l); a[n][l] = 0; ++n; p += l; p += strspn(p, sep); } return a; } int str2narray(int type, char *s, char *sep, void **a) { char *p; double *x; int *i; int n = 0; if(!sep) sep=SepString; for(p=s; *p; ) { p += strcspn(p, sep); p += !!*p; ++n; } *a = 0; if(!n) return 0; *a = calloc(n, (type==CMDINTARRAYTYPE)?sizeof(int):sizeof(double)); i = (int*)*a; x = (double*)*a; p = s; n = 0; while(*p) { switch(type) { case CMDINTARRAYTYPE: *i++ = atoi(p); break; case 
CMDDBLARRAYTYPE: *x++ = atof(p); break; } ++n; p += strcspn(p, sep); p += !!*p; } return n; } static int StoreCmdLine(char *s) { s += strspn(s, SepString); if(!*s) return 0; if(CmdLinesL>=CmdLinesSz) { CmdLines=CmdLinesSz ? (char**)realloc(CmdLines, (CmdLinesSz+=BUFSIZ)*sizeof(char**)) : (char**)malloc((CmdLinesSz=BUFSIZ)*sizeof(char**)); if(!CmdLines) { fprintf(stderr, "%s\n", "StoreCmdLine(): malloc() failed"); exit(IRSTLM_CMD_ERROR_MEMORY); } } CmdLines[CmdLinesL++] = strdup(s); return 0; } irstlm-6.00.05/src/cmd.h000066400000000000000000000047701263213470300147100ustar00rootroot00000000000000// $Id: cmd.h 3626 2010-10-07 11:41:05Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #if !defined(CMD_H) #define CMD_H #define FALSE 0 #define TRUE 1 #define END_ENUM { (char*)0, 0 } #define IRSTLM_CMD_NO_ERROR 0 #define IRSTLM_CMD_ERROR_GENERIC 1 #define IRSTLM_CMD_ERROR_IO 2 #define IRSTLM_CMD_ERROR_MEMORY 3 #define IRSTLM_CMD_ERROR_DATA 4 #define IRSTLM_CMD_ERROR_MODEL 5 #define CMDDOUBLETYPE 1 #define CMDENUMTYPE 2 #define CMDINTTYPE 3 #define CMDSTRINGTYPE 4 #define CMDSUBRANGETYPE 5 #define CMDGTETYPE 6 #define CMDLTETYPE 7 #define CMDSTRARRAYTYPE 8 #define CMDBOOLTYPE 9 #define CMDBOOLTYPE2 19 #define CMDFLAGTYPE 10 #define CMDINTARRAYTYPE 11 #define CMDDBLARRAYTYPE 12 #define CMDFLOATTYPE 13 #define CMDMSG (1<<31) #include #ifdef __cplusplus extern "C" { #endif typedef struct { char *Name; int Idx; } Enum_T; typedef struct { char *Name; char Idx; } Bool_T; typedef struct { int Type; int Flag; void *Val; void *p; char *Name; char *ArgStr; char *Msg; } Cmd_T; int DeclareParams(char *, ...), GetParams(int *n, char ***a, char *CmdFileName), GetDotParams(char *, ...), SPrintParams(char ***a, char *pfx), PrintParams(int ValFlag, FILE *fp), FullPrintParams(int TypeFlag, int ValFlag, int MsgFlag, FILE *fp), EnumIdx(Enum_T *en, char *s); char BoolIdx(Bool_T *en, char *s); char *EnumStr(Enum_T *en, int i); char *BoolStr(Bool_T *en, int i); #ifdef __cplusplus } #endif #endif irstlm-6.00.05/src/compile-lm.cpp000066400000000000000000000442071263213470300165350ustar00rootroot00000000000000// $Id: compile-lm.cpp 3677 2010-10-13 09:06:51Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit, compile LM Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you 
can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include #include "cmd.h" #include "util.h" #include "math.h" #include "lmContainer.h" using namespace std; using namespace irstlm; /********************************/ void print_help(int TypeFlag=0){ std::cerr << std::endl << "compile-lm - compiles an ARPA format LM into an IRSTLM format one" << std::endl; std::cerr << std::endl << "USAGE:" << std::endl; std::cerr << " compile-lm [options] [output-file.blm]" << std::endl; std::cerr << std::endl << "DESCRIPTION:" << std::endl; std::cerr << " compile-lm reads a standard LM file in ARPA format and produces" << std::endl; std::cerr << " a compiled representation that the IRST LM toolkit can quickly" << std::endl; std::cerr << " read and process. LM file can be compressed." 
<< std::endl; std::cerr << std::endl << "OPTIONS:" << std::endl; FullPrintParams(TypeFlag, 0, 1, stderr); } void usage(const char *msg = 0) { if (msg) { std::cerr << msg << std::endl; } if (!msg){ print_help(); } } int main(int argc, char **argv) { char *seval=NULL; char *tmpdir=NULL; char *sfilter=NULL; bool textoutput = false; bool sent_PP_flag = false; bool invert = false; bool sscore = false; bool ngramscore = false; bool skeepunigrams = false; int debug = 0; bool memmap = false; int requiredMaxlev = IRSTLM_REQUIREDMAXLEV_DEFAULT; int dub = IRSTLM_DUB_DEFAULT; int randcalls = 0; float ngramcache_load_factor = 0.0; float dictionary_load_factor = 0.0; bool help=false; std::vector files; DeclareParams((char*) "text", CMDBOOLTYPE|CMDMSG, &textoutput, "output is again in text format; default is false", "t", CMDBOOLTYPE|CMDMSG, &textoutput, "output is again in text format; default is false", "filter", CMDSTRINGTYPE|CMDMSG, &sfilter, "filter a binary language model with a word list", "f", CMDSTRINGTYPE|CMDMSG, &sfilter, "filter a binary language model with a word list", "keepunigrams", CMDBOOLTYPE|CMDMSG, &skeepunigrams, "filter by keeping all unigrams in the table, default is true", "ku", CMDBOOLTYPE|CMDMSG, &skeepunigrams, "filter by keeping all unigrams in the table, default is true", "eval", CMDSTRINGTYPE|CMDMSG, &seval, "computes perplexity of the specified text file", "e", CMDSTRINGTYPE|CMDMSG, &seval, "computes perplexity of the specified text file", "randcalls", CMDINTTYPE|CMDMSG, &randcalls, "computes N random calls on the specified text file", "r", CMDINTTYPE|CMDMSG, &randcalls, "computes N random calls on the specified text file", "score", CMDBOOLTYPE|CMDMSG, &sscore, "computes log-prob scores of n-grams from standard input", "s", CMDBOOLTYPE|CMDMSG, &sscore, "computes log-prob scores of n-grams from standard input", "ngramscore", CMDBOOLTYPE|CMDMSG, &ngramscore, "computes log-prob scores of the last n-gram before an _END_NGRAM_ symbol from standard input", 
"ns", CMDBOOLTYPE|CMDMSG, &ngramscore, "computes log-prob scores of the last n-gram before an _END_NGRAM_ symbol from standard input", "debug", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0", "d", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0", "level", CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken", "l", CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken", "memmap", CMDBOOLTYPE|CMDMSG, &memmap, "uses memory map to read a binary LM", "mm", CMDBOOLTYPE|CMDMSG, &memmap, "uses memory map to read a binary LM", "dub", CMDINTTYPE|CMDMSG, &dub, "dictionary upperbound to compute OOV word penalty: default 10^7", "tmpdir", CMDSTRINGTYPE|CMDMSG, &tmpdir, "directory for temporary computation, default is either the environment variable TMP if defined or \"/tmp\")", "invert", CMDBOOLTYPE|CMDMSG, &invert, "builds an inverted n-gram binary table for fast access; default if false", "i", CMDBOOLTYPE|CMDMSG, &invert, "builds an inverted n-gram binary table for fast access; default if false", "sentence", CMDBOOLTYPE|CMDMSG, &sent_PP_flag, "computes perplexity at sentence level (identified through the end symbol)", "dict_load_factor", CMDFLOATTYPE|CMDMSG, &dictionary_load_factor, "sets the load factor for ngram cache; it should be a positive real value; default is 0", "ngram_load_factor", CMDFLOATTYPE|CMDMSG, &ngramcache_load_factor, "sets the load factor for ngram cache; it should be a positive real value; default is false", "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help", "h", CMDBOOLTYPE|CMDMSG, &help, "print this help", (char*)NULL ); if (argc == 1){ usage(); exit_error(IRSTLM_NO_ERROR); } for(int i=1; i < argc; i++) { if(argv[i][0] != '-'){ files.push_back(argv[i]); } } GetParams(&argc, &argv, (char*) NULL); if (help){ usage(); 
exit_error(IRSTLM_NO_ERROR); } if (files.size() > 2) { usage(); exit_error(IRSTLM_ERROR_DATA,"Warning: Too many arguments"); } if (files.size() < 1) { usage(); exit_error(IRSTLM_ERROR_DATA,"Warning: Please specify a LM file to read from"); } std::string infile = files[0]; std::string outfile = ""; if (files.size() == 1) { outfile=infile; //remove path information std::string::size_type p = outfile.rfind('/'); if (p != std::string::npos && ((p+1) < outfile.size())) outfile.erase(0,p+1); //eventually strip .gz if (outfile.compare(outfile.size()-3,3,".gz")==0) outfile.erase(outfile.size()-3,3); outfile+=(textoutput?".lm":".blm"); } else{ outfile = files[1]; } std::cerr << "inpfile: " << infile << std::endl; std::cerr << "outfile: " << outfile << std::endl; if (seval!=NULL) std::cerr << "evalfile: " << seval << std::endl; if (sscore==true) std::cerr << "interactive: " << sscore << std::endl; if (ngramscore==true) std::cerr << "interactive for ngrams only: " << ngramscore << std::endl; if (memmap) std::cerr << "memory mapping: " << memmap << std::endl; std::cerr << "loading up to the LM level " << requiredMaxlev << " (if any)" << std::endl; std::cerr << "dub: " << dub<< std::endl; if (tmpdir != NULL) { if (setenv("TMP",tmpdir,1)) std::cerr << "temporary directory has not been set" << std::endl; std::cerr << "tmpdir: " << tmpdir << std::endl; } //checking the language model type lmContainer* lmt = lmContainer::CreateLanguageModel(infile,ngramcache_load_factor,dictionary_load_factor); //let know that table has inverted n-grams if (invert) lmt->is_inverted(invert); lmt->setMaxLoadedLevel(requiredMaxlev); lmt->load(infile); lmt->print_table_stat(); //CHECK this part for sfilter to make it possible only for LMTABLE if (sfilter != NULL) { lmContainer* filtered_lmt = NULL; std::cerr << "BEFORE sublmC (" << (void*) filtered_lmt << ") (" << (void*) &filtered_lmt << ")\n"; // the function filter performs the filtering and returns true, only for specific lm type if 
(((lmContainer*) lmt)->filter(sfilter,filtered_lmt,skeepunigrams?"yes":"no")) { std::cerr << "BFR filtered_lmt (" << (void*) filtered_lmt << ") (" << (void*) &filtered_lmt << ")\n"; filtered_lmt->stat(); delete lmt; lmt=filtered_lmt; std::cerr << "AFTER filtered_lmt (" << (void*) filtered_lmt << ")\n"; filtered_lmt->stat(); std::cerr << "AFTER lmt (" << (void*) lmt << ")\n"; lmt->stat(); } } if (dub) lmt->setlogOOVpenalty((int)dub); //use caches to save time (only if PS_CACHE_ENABLE is defined through compilation flags) lmt->init_caches(lmt->maxlevel()); if (seval != NULL) { if (randcalls>0) { cerr << "perform random " << randcalls << " using dictionary of test set\n"; dictionary *dict; dict=new dictionary(seval); //build extensive histogram int histo[dict->totfreq()]; //total frequency int totfreq=0; for (int n=0; nsize(); n++) for (int m=0; mfreq(n); m++) histo[totfreq++]=n; ngram ng(lmt->getDict()); srand(1234); double bow; int bol=0; if (debug>1) ResetUserTime(); for (int n=0; ngetDict()->encode(dict->decode(w))); lmt->clprob(ng,&bow,&bol); //(using caches if available) if (debug==1) { std::cout << ng.dict->decode(*ng.wordp(1)) << " [" << lmt->maxlevel()-bol << "]" << " "; std::cout << std::endl; std::cout.flush(); } if ((n % 100000)==0) { std::cerr << "."; lmt->check_caches_levels(); } } std::cerr << "\n"; if (debug>1) PrintUserTime("Finished in"); if (debug>1) lmt->stat(); delete lmt; return 0; } else { if (lmt->getLanguageModelType() == _IRSTLM_LMINTERPOLATION) { debug = (debug>4)?4:debug; std::cerr << "Maximum debug value for this LM type: " << debug << std::endl; } if (lmt->getLanguageModelType() == _IRSTLM_LMMACRO) { debug = (debug>4)?4:debug; std::cerr << "Maximum debug value for this LM type: " << debug << std::endl; } if (lmt->getLanguageModelType() == _IRSTLM_LMCLASS) { debug = (debug>4)?4:debug; std::cerr << "Maximum debug value for this LM type: " << debug << std::endl; } std::cerr << "Start Eval" << std::endl; std::cerr << "OOV code: " << 
lmt->getDict()->oovcode() << std::endl; ngram ng(lmt->getDict()); std::cout.setf(ios::fixed); std::cout.precision(2); // if (debug>0) std::cout.precision(8); std::fstream inptxt(seval,std::ios::in); int Nbo=0, Nw=0,Noov=0; double logPr=0,PP=0,PPwp=0,Pr; // variables for storing sentence-based Perplexity int sent_Nbo=0, sent_Nw=0,sent_Noov=0; double sent_logPr=0,sent_PP=0,sent_PPwp=0; int bos=lmt->addWord(lmt->getDict()->BoS()); int eos=lmt->addWord(lmt->getDict()->EoS()); double bow; int bol=0; ngram_state_t msidx; char *msp; unsigned int statesize; lmt->dictionary_incflag(1); while(inptxt >> ng) { VERBOSE(3,"read ng:|" << ng << "|" << std::endl); if (ng.size>lmt->maxlevel()) ng.size=lmt->maxlevel(); // reset ngram at begin of sentence if (*ng.wordp(1)==bos) { ng.size=1; continue; } if (ng.size>=1) { VERBOSE(3,"computing clprob ng:|" << ng << "|" << std::endl); // Pr=lmt->clprob(ng,&bow,&bol,&msp,&statesize); Pr=lmt->clprob(ng,&bow,&bol,&msidx,&msp,&statesize); logPr+=Pr; sent_logPr+=Pr; if (debug==1) { std::cout << ng.dict->decode(*ng.wordp(1)) << " [" << ng.size-bol << "]" << " "; if (*ng.wordp(1)==eos) std::cout << std::endl; } else if (debug==2) { std::cout << ng << " [" << ng.size-bol << "-gram]" << " " << Pr; std::cout << std::endl; std::cout.flush(); } else if (debug==3) { std::cout << ng << " [" << ng.size-bol << "-gram]" << " " << Pr << " bow:" << bow; std::cout << std::endl; std::cout.flush(); } else if (debug==4) { std::cout << ng << " [" << ng.size-bol << "-gram: recombine:" << statesize << " ngramstate:" << msidx << " state:" << (void*) msp << "] [" << ng.size+1-((bol==0)?(1):bol) << "-gram: bol:" << bol << "] " << Pr << " bow:" << bow; std::cout << std::endl; std::cout.flush(); } else if (debug>4) { std::cout << ng << " [" << ng.size-bol << "-gram: recombine:" << statesize << " ngramstate:" << msidx << " state:" << (void*) msp << "] [" << ng.size+1-((bol==0)?(1):bol) << "-gram: bol:" << bol << "] " << Pr << " bow:" << bow; double totp=0.0; int 
oldw=*ng.wordp(1); double oovp=lmt->getlogOOVpenalty(); lmt->setlogOOVpenalty((double) 0); for (int c=0; csize(); c++) { *ng.wordp(1)=c; totp+=pow(10.0,lmt->clprob(ng)); //using caches if available } *ng.wordp(1)=oldw; if ( totp < (1.0 - 1e-5) || totp > (1.0 + 1e-5)) std::cout << " [t=" << totp << "] POSSIBLE ERROR"; std::cout << std::endl; std::cout.flush(); lmt->setlogOOVpenalty((double)oovp); } if (lmt->is_OOV(*ng.wordp(1))) { Noov++; sent_Noov++; } if (bol) { Nbo++; sent_Nbo++; } Nw++; sent_Nw++; if (sent_PP_flag && (*ng.wordp(1)==eos)) { sent_PP=exp((-sent_logPr * log(10.0)) /sent_Nw); sent_PPwp= sent_PP * (1 - 1/exp((sent_Noov * lmt->getlogOOVpenalty()) * log(10.0) / sent_Nw)); std::cout << "%% sent_Nw=" << sent_Nw << " sent_PP=" << sent_PP << " sent_PPwp=" << sent_PPwp << " sent_Nbo=" << sent_Nbo << " sent_Noov=" << sent_Noov << " sent_OOV=" << (float)sent_Noov/sent_Nw * 100.0 << "%" << std::endl; std::cout.flush(); //reset statistics for sentence based Perplexity sent_Nw=sent_Noov=sent_Nbo=0; sent_logPr=0.0; } if ((Nw % 100000)==0) { std::cerr << "."; lmt->check_caches_levels(); } VERBOSE(3,"computing clprob END" << std::endl); } VERBOSE(3,"read END" << std::endl); } PP=exp((-logPr * log(10.0)) /Nw); PPwp= PP * (1 - 1/exp((Noov * lmt->getlogOOVpenalty()) * log(10.0) / Nw)); std::cout << "%% Nw=" << Nw << " PP=" << PP << " PPwp=" << PPwp << " Nbo=" << Nbo << " Noov=" << Noov << " OOV=" << (float)Noov/Nw * 100.0 << "%"; if (debug) std::cout << " logPr=" << logPr; std::cout << std::endl; std::cout.flush(); if (debug>1) lmt->used_caches(); if (debug>1) lmt->stat(); delete lmt; return 0; }; } if (sscore == true) { ngram ng(lmt->getDict()); int bos=ng.dict->encode(ng.dict->BoS()); int bol; double bow; unsigned int n=0; std::cout.setf(ios::scientific); std::cout.setf(ios::fixed); std::cout.precision(2); std::cout << "> "; lmt->dictionary_incflag(1); while(std::cin >> ng) { //std::cout << ng << std::endl;; // reset ngram at begin of sentence if (*ng.wordp(1)==bos) 
{ ng.size=1; continue; } if (ng.size>=lmt->maxlevel()) { ng.size=lmt->maxlevel(); ++n; if ((n % 100000)==0) { std::cerr << "."; lmt->check_caches_levels(); } std::cout << ng << " p= " << lmt->clprob(ng,&bow,&bol) * M_LN10; std::cout << " bo= " << bol << std::endl; } else { std::cout << ng << " p= NULL" << std::endl; } std::cout << "> "; } std::cout << std::endl; std::cout.flush(); if (debug>1) lmt->used_caches(); if (debug>1) lmt->stat(); delete lmt; return 0; } if (ngramscore == true) { const char* _END_NGRAM_="_END_NGRAM_"; ngram ng(lmt->getDict()); double Pr; double bow; int bol=0; ngram_state_t msidx; char *msp; unsigned int statesize; std::cout.setf(ios::fixed); std::cout.precision(2); ng.dict->incflag(1); int endngram=ng.dict->encode(_END_NGRAM_); ng.dict->incflag(0); while(std::cin >> ng) { // compute score for the last ngram when endngram symbols is found // and reset ngram if (*ng.wordp(1)==endngram) { ng.shift(); if (ng.size>=lmt->maxlevel()) { ng.size=lmt->maxlevel(); } // Pr=lmt->clprob(ng,&bow,&bol,&msp,&statesize); Pr=lmt->clprob(ng,&bow,&bol,&msidx, &msp,&statesize); #ifndef OUTPUT_SUPPRESSED std::cout << ng << " [" << ng.size-bol << "-gram: recombine:" << statesize << " ngramstate:" << msidx << " state:" << (void*) msp << "] [" << ng.size+1-((bol==0)?(1):bol) << "-gram: bol:" << bol << "] " << Pr << " bow:" << bow; std::cout << std::endl; std::cout.flush(); #endif ng.size=0; } } if (debug>1) lmt->used_caches(); if (debug>1) lmt->stat(); delete lmt; return 0; } if (textoutput == true) { std::cerr << "Saving in txt format to " << outfile << std::endl; lmt->savetxt(outfile.c_str()); } else if (!memmap) { std::cerr << "Saving in bin format to " << outfile << std::endl; lmt->savebin(outfile.c_str()); } else { std::cerr << "Impossible to save to " << outfile << std::endl; } delete lmt; return 0; } 
irstlm-6.00.05/src/cplsa.cpp000077500000000000000000000432361263213470300156050ustar00rootroot00000000000000/****************************************************************************** IrstLM: IRST Language Model Toolkit, compile LM Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA **********************************************dou********************************/ #include #include #include #include #include #include #include "thpool.h" #include "mfstream.h" #include "mempool.h" #include "htable.h" #include "n_gram.h" #include "util.h" #include "dictionary.h" #include "ngramtable.h" #include "doc.h" #include "cplsa.h" using namespace std; namespace irstlm { plsa::plsa(dictionary* d,int top,char* wd,int th,bool mm){ dict=d; topics=top; tmpdir=wd; memorymap=mm; threads=th; MY_ASSERT (topics>0); //actual model structure W=NULL; //training support structure T=NULL; //allocate/free at training time// this is the huge table H=NULL; srandom(100); //consistent generation of random noise bucket=BUCKET; maxiter=0; } plsa::~plsa() { freeW(); freeH(); free(T); } int plsa::initW(char* modelfile,float noise,int spectopic){ //W needs a dictionary, either from an existing model or //from the training data assert(W==NULL); if (dict==NULL) loadW(modelfile); else{ cerr << "Allocating W 
table\n"; W=new float* [dict->size()]; for (int i=0; isize(); i++){ W[i]=new float [topics](); //initialized to zero since C++11 memset(W[i],0,sizeof(float)*topics); } cerr << "Initializing W table\n"; if (spectopic) { //special topic 0: first st most frequent //assume dictionary is sorted by frequency!!! float TotW=0; for (int i=0; ifreq(i); for (int i=0; isize(); i++) TotW+=W[i][t]=1 + noise * MY_RAND; for (int i=spectopic; i< dict->size(); i++) W[i][t]/=TotW; } } return 1; } int plsa::freeW(){ if (W!=NULL){ cerr << "Releasing memory of W table\n"; for (int i=0; isize(); i++) delete [] W[i]; delete [] W; W=NULL; } return 1; } int plsa::initH(){ assert(trset->numdoc()); //need a date set long long len=(unsigned long long)trset->numdoc() * topics; FILE *fd; if (H == NULL){ if (memorymap){ cerr << "Creating memory mapped H table\n"; //generate a name for the memory map file sprintf(Hfname,"/%s/Hfname%d",tmpdir,(int)getpid()); if ((fd=fopen(Hfname,"w+"))==0){ perror("Could not create file"); exit_error(IRSTLM_ERROR_IO, "plsa::initH fopen error"); } //H is aligned at integer ftruncate(fileno(fd),len * sizeof(float)); H = (float *)mmap( 0, len * sizeof(float) , PROT_READ|PROT_WRITE, MAP_PRIVATE,fileno(fd),0); fclose(fd); if (H == MAP_FAILED){ perror("Mmap error"); exit_error(IRSTLM_ERROR_IO, "plsa::initH MMAP error"); } } else{ cerr << "Allocating " << len << " entries for H table\n"; fprintf(stderr,"%llu\n",len); if ((H=new float[len])==NULL){ perror("memory allocation error"); exit_error(IRSTLM_ERROR_IO, "plsa::cannot allocate memory for H"); } } } cerr << "Initializing H table " << "\n"; float value=1/(float)topics; for (long long d=0; d< trset->numdoc(); d++) for (int t=0; tnumdoc()*topics*sizeof(float)); remove(Hfname); }else delete [] H; H=NULL; } return 1; } int plsa::initT(){ //keep double for counts collected over the whole training data if (T==NULL){ T=new double* [dict->size()]; for (int i=0; isize(); i++) T[i]=new double [topics]; } for (int i=0; isize(); 
i++) memset((void *)T[i],0,topics * sizeof(double)); return 1; } int plsa::freeT(){ if (T!=NULL){ cerr << "Releasing memory for T table\n"; for (int i=0; isize(); i++) delete [] T[i]; delete [] T; T=NULL; } return 1; } /* int plsa::saveWtxt2(char* fname){ cerr << "Writing text W table into: " << fname << "\n"; mfstream out(fname,ios::out); out.precision(5); // out << topics << "\n"; for (int i=0; isize(); i++) { out << dict->decode(i);// << " " << dict->freq(i); //double totW=0; //for (int t=0; t (*(mypairtype *)b).score ) return -1; return 0; } int plsa::saveWtxt(char* fname,int tw){ cerr << "Writing model W into: " << fname << "\n"; mfstream out(fname,ios::out); out.precision(5); mypairtype *vect=new mypairtype[dict->size()]; // out << topics << "\n"; for (int t=0; tsize(); i++){ vect[i].word=i; vect[i].score=W[i][t]; } vect[dict->oovcode()].score=0; qsort((void *)vect,dict->size(),sizeof(mypairtype),comparepair); out << "T" << t; for (int i=0;idecode(vect[i].word);// << " " << vect[i].score << " "; } out << "\n"; } delete [] vect; out.close(); return 1; } int plsa::saveW(char* fname){ cerr << "Saving model into: " << fname << " ..."; mfstream out(fname,ios::out); out << "PLSA " << topics << "\n"; dict->save(out); for (int i=0; isize(); i++) out.write((const char*)W[i],sizeof(float) * topics); out.close(); cerr << "\n"; return 1; } int plsa::loadW(char* fname){ assert(dict==NULL); cerr << "Loading model from: " << fname << "\n"; mfstream inp(fname,ios::in); char header[100]; inp.getline(header,100); cerr << header ; int r; sscanf(header,"PLSA %d\n",&r); if (topics>0 && r != topics) exit_error(IRSTLM_ERROR_DATA, "incompatible number of topics"); else topics=r; cerr << "Loading dictionary\n"; dict=new dictionary(NULL,1000000); dict->load(inp); dict->encode(dict->OOV()); cerr << "Allocating W table\n"; W=new float* [dict->size()]; for (int i=0; isize(); i++) W[i]=new float [topics]; cerr << "Reading W table .... 
"; for (int i=0; isize(); i++) inp.read((char *)W[i],sizeof(float) * topics); inp.close(); cerr << "\n"; return 1; } int plsa::saveWordFeatures(char* fname,long long d){ //extend this to save features for all adapation documents //compute distribution on doc 0 assert(trset !=NULL); if (d<100){ double *WH=new double [dict->size()]; char *outfname=new char[strlen(fname)+10]; sprintf(outfname,"%s.%03d",fname,(int)d+1); cerr << "Saving word features in " << fname << "\n"; for (int i=0; isize(); i++) { WH[i]=0; for (int t=0; tsize(); i++) if (WH[i]>maxp) maxp=WH[i]; cerr << "Get max prob" << maxp << "\n"; //save unigrams in google ngram format mfstream out(outfname,ios::out); for (int i=0; isize(); i++){ int freq=(int)floor((WH[i]/maxp) * 1000000); if (freq) out << dict->decode(i) <<" \t" << freq<<"\n"; } out.close(); delete [] outfname; delete [] WH; } return 1; } ///***** pthread_mutex_t cplsa_mut1; pthread_mutex_t cplsa_mut2; double cplsa_LL=0; //Log likelihood const float topicthreshold=0.00001; const float deltathreshold=0.0001; void plsa::expected_counts(void *argv){ long long d; d=(long long) argv; int frac=(d * 1000)/trset->numdoc(); if (!(frac % 10)) fprintf(stderr,"%2d\b\b",frac/10); //fprintf(stderr,"Thread: %lu Document: %d (out of %d)\n",(long)pthread_self(),d,trset->numdoc()); int r=topics; int m=trset->doclen(d); //actual length of document int N=m ; // doc length is the same of double totH=0; for (int t=0; t0) for (int i=0; idocword(d,i)][t] * H[d * r + t]); //UPDATE LOCAL Tia (for each word and topic) //seems more efficient perform local computation on complex structures //and perform exclusive computations on simpler structures. 
float *lT=new float[m * r]; memset(lT,0,sizeof(float)*m*r); for (int t=0; t0) for (int i=0; idocword(d,i)][t] * H[d * r + t]/WH[i]); //UPDATE GLOBAL T and cplsa_LL pthread_mutex_lock(&cplsa_mut1); for (int i=0; idocword(d,i)][t]+=(double)lT[i * r + t]; cplsa_LL+= log( WH[i] ); } pthread_mutex_unlock(&cplsa_mut1); //UPDATE Haj (topic a and document j) totH=0; for (int t=0; t0){ for (int i=0; i < m; i++) tmpHaj+=(W[trset->docword(d,i)][t] * H[d * r + t]/WH[i]); H[d * r + t]=tmpHaj/N; totH+=H[d * r + t]; } } if(totH>UPPER_SINGLE_PRECISION_OF_1 || totHnumdoc()]; pthread_mutex_init(&cplsa_mut1, NULL); //pthread_mutex_init(&cplsa_mut2, NULL); while (iter < maxiter){ cplsa_LL=0; cerr << "Iteration: " << ++iter << " "; //initialize T table initT(); for (long long d=0;dnumdoc();d++){ //prepare and assign tasks to threads t[d].ctx=this; t[d].argv=(void *)d; thpool_add_work(thpool, &plsa::expected_counts_helper, (void *)&t[d]); } //join all threads thpool_wait(thpool); //Recombination and normalization of expected counts for (int t=0; tsize(); i++) Tsum+=T[i][t]; for (int i=0; isize(); i++) W[i][t]=(float)(T[i][t]/Tsum); } cerr << " LL: " << cplsa_LL << "\n"; if (trset->numdoc()> 10) system("date"); saveW(modelfile); } //destroy thread pool thpool_destroy(thpool); freeH(); freeT(); freeW(); delete trset; delete [] t; return 1; } void plsa::single_inference(void *argv){ long long d; d=(long long) argv; int frac=(d * 1000)/trset->numdoc(); if (!(frac % 10)) fprintf(stderr,"%2d\b\b",frac/10); //fprintf(stderr,"Thread: %lu Document: %d (out of %d)\n",(long)pthread_self(),d,trset->numdoc()); float *WH=new float [dict->size()]; bool *Hflags=new bool[topics]; int M=trset->doclen(d); //vocabulary size of current documents with repetitions int N=M; //document length //initialize H: we estimate one H for each document for (int t=0; t deltathreshold){ maxdelta=0; iter++; //precompute denominator WH for (int t=0; tdocword(d,i)]=0; //initialized for (int t=0; 
tdocword(d,i)]+=W[trset->docword(d,i)][t] * H[(d % bucket) * topics + t]; } } //UPDATE H float totH=0; for (int t=0; tdocword(d,i)][t] * H[(d % bucket) * topics + t]/WH[trset->docword(d,i)]); delta=abs(H[(d % bucket) * topics + t]-tmpH/N); if (delta > maxdelta) maxdelta=delta; H[(d % bucket) * topics + t]=tmpH/N; totH+=H[(d % bucket) * topics + t]; //to check that sum is 1 } } if(totH>UPPER_SINGLE_PRECISION_OF_1 || totHnumdoc();d++){ t[d % bucket].ctx=this; t[d % bucket].argv=(void *)d; thpool_add_work(thpool, &plsa::single_inference_helper, (void *)&t[d % bucket]); if (((d % bucket) == (bucket-1)) || (d==(trset->numdoc()-1)) ){ //join all threads thpool_wait(thpool); if ((d % bucket) != (bucket-1)) bucket=trset->numdoc() % bucket; //last bucket at end of file if (topicfeatfile){ mfstream out(topicfeatfile,ios::out | ios::app); for (int b=0;bexpected_counts(t.argv);return NULL; }; static void *single_inference_helper(void *argv){ task t=*(task *)argv; ((plsa *)t.ctx)->single_inference(t.argv);return NULL; }; int train(char *trainfile,char* modelfile, int maxiter, float noiseW,int spectopic=0); int inference(char *trainfile, char* modelfile, int maxiter, char* topicfeatfile,char* wordfeatfile); void single_inference(void *argv); int saveWordFeatures(char* fname, long long d); }; } //namespace irstlm #endif irstlm-6.00.05/src/crc.cpp000066400000000000000000000073731263213470300152510ustar00rootroot00000000000000/* * Copyright 2001-2010 Georges Menie (www.menie.org) * All rights reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the University of California, Berkeley nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "crc.h" /* CRC16 implementation acording to CCITT standards */ static const unsigned short crc16tab[256]= { 0x0000,0x1021,0x2042,0x3063,0x4084,0x50a5,0x60c6,0x70e7, 0x8108,0x9129,0xa14a,0xb16b,0xc18c,0xd1ad,0xe1ce,0xf1ef, 0x1231,0x0210,0x3273,0x2252,0x52b5,0x4294,0x72f7,0x62d6, 0x9339,0x8318,0xb37b,0xa35a,0xd3bd,0xc39c,0xf3ff,0xe3de, 0x2462,0x3443,0x0420,0x1401,0x64e6,0x74c7,0x44a4,0x5485, 0xa56a,0xb54b,0x8528,0x9509,0xe5ee,0xf5cf,0xc5ac,0xd58d, 0x3653,0x2672,0x1611,0x0630,0x76d7,0x66f6,0x5695,0x46b4, 0xb75b,0xa77a,0x9719,0x8738,0xf7df,0xe7fe,0xd79d,0xc7bc, 0x48c4,0x58e5,0x6886,0x78a7,0x0840,0x1861,0x2802,0x3823, 0xc9cc,0xd9ed,0xe98e,0xf9af,0x8948,0x9969,0xa90a,0xb92b, 0x5af5,0x4ad4,0x7ab7,0x6a96,0x1a71,0x0a50,0x3a33,0x2a12, 0xdbfd,0xcbdc,0xfbbf,0xeb9e,0x9b79,0x8b58,0xbb3b,0xab1a, 0x6ca6,0x7c87,0x4ce4,0x5cc5,0x2c22,0x3c03,0x0c60,0x1c41, 0xedae,0xfd8f,0xcdec,0xddcd,0xad2a,0xbd0b,0x8d68,0x9d49, 0x7e97,0x6eb6,0x5ed5,0x4ef4,0x3e13,0x2e32,0x1e51,0x0e70, 0xff9f,0xefbe,0xdfdd,0xcffc,0xbf1b,0xaf3a,0x9f59,0x8f78, 0x9188,0x81a9,0xb1ca,0xa1eb,0xd10c,0xc12d,0xf14e,0xe16f, 0x1080,0x00a1,0x30c2,0x20e3,0x5004,0x4025,0x7046,0x6067, 0x83b9,0x9398,0xa3fb,0xb3da,0xc33d,0xd31c,0xe37f,0xf35e, 0x02b1,0x1290,0x22f3,0x32d2,0x4235,0x5214,0x6277,0x7256, 0xb5ea,0xa5cb,0x95a8,0x8589,0xf56e,0xe54f,0xd52c,0xc50d, 0x34e2,0x24c3,0x14a0,0x0481,0x7466,0x6447,0x5424,0x4405, 0xa7db,0xb7fa,0x8799,0x97b8,0xe75f,0xf77e,0xc71d,0xd73c, 0x26d3,0x36f2,0x0691,0x16b0,0x6657,0x7676,0x4615,0x5634, 0xd94c,0xc96d,0xf90e,0xe92f,0x99c8,0x89e9,0xb98a,0xa9ab, 0x5844,0x4865,0x7806,0x6827,0x18c0,0x08e1,0x3882,0x28a3, 0xcb7d,0xdb5c,0xeb3f,0xfb1e,0x8bf9,0x9bd8,0xabbb,0xbb9a, 0x4a75,0x5a54,0x6a37,0x7a16,0x0af1,0x1ad0,0x2ab3,0x3a92, 0xfd2e,0xed0f,0xdd6c,0xcd4d,0xbdaa,0xad8b,0x9de8,0x8dc9, 0x7c26,0x6c07,0x5c64,0x4c45,0x3ca2,0x2c83,0x1ce0,0x0cc1, 0xef1f,0xff3e,0xcf5d,0xdf7c,0xaf9b,0xbfba,0x8fd9,0x9ff8, 0x6e17,0x7e36,0x4e55,0x5e74,0x2e93,0x3eb2,0x0ed1,0x1ef0 }; unsigned short crc16_ccitt(const char *buf, int len) 
{ register int counter; register unsigned short crc = 0; for( counter = 0; counter < len; counter++) crc = (crc<<8) ^ crc16tab[((crc>>8) ^ *(char *)buf++)&0x00FF]; return crc; } irstlm-6.00.05/src/crc.h000066400000000000000000000033041263213470300147040ustar00rootroot00000000000000/* * Copyright 2001-2010 Georges Menie (www.menie.org) * All rights reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the University of California, Berkeley nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef _CRC16_H_ #define _CRC16_H_ unsigned short crc16_ccitt(const char *buf, int len); #endif /* _CRC16_H_ */ irstlm-6.00.05/src/cswa.cpp000077500000000000000000000220651263213470300154350ustar00rootroot00000000000000/****************************************************************************** IrstLM: IRST Language Model Toolkit, compile LM Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include "cmd.h" #include #include "thpool.h" #include "util.h" #include "mfstream.h" #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "ngramtable.h" #include "doc.h" #include "cswam.h" using namespace std; using namespace irstlm; void print_help(int TypeFlag=0){ std::cerr << std::endl << "cswa - continuous space word alignment model" << std::endl; std::cerr << std::endl << "USAGE:" << std::endl; std::cerr << " Training mode:" << std::endl; std::cerr << " cswa -sd= -td= -w2v= -m= -it= -th= [options]" << std::endl; std::cerr << " Alignment mode:" << std::endl; std::cerr << " cswa -sd= -td= -w2v= -m= -al= -th= [options]" << std::endl; std::cerr << " Data format:" << std::endl; std::cerr << " and must have an header with the number of following 
lines. " << std::endl; std::cerr << " Each text line must be sourrounded by the symbols and . " << std::endl; std::cerr << " Hint: (echo `wc -l < yourfile`; add-start-end.sh -s \"d\" < yourfile) > yourfile.doc " << std::endl; std::cerr << std::endl; FullPrintParams(TypeFlag, 0, 1, stderr); } void usage(const char *msg = 0) { if (msg){ std::cerr << msg << std::endl; } else{ print_help(); } } int main(int argc, char **argv){ char *srcdatafile=NULL; char *trgdatafile=NULL; char *w2vfile=NULL; char *modelfile=NULL; char *modeltxtfile=NULL; char *alignfile=NULL; bool forcemodel=false; int iterations=0; //number of EM iterations to run int threads=1; //current EM iteration for multi-thread training bool help=false; bool trainvar=true; bool normvectors=false; bool usenullword=true; double fixnullprob=0; bool verbosity=false; double minvar=0.2; bool distmean=true; bool distvar=true; bool distbeta=false; int model1iter=7; int distwin=8; DeclareParams((char*) "SrcData", CMDSTRINGTYPE|CMDMSG, &srcdatafile, " : source text collection ", "sd", CMDSTRINGTYPE|CMDMSG, &srcdatafile, " : source text collection ", "TrgData", CMDSTRINGTYPE|CMDMSG, &trgdatafile, " : target text collection ", "td", CMDSTRINGTYPE|CMDMSG, &trgdatafile, " : target text collection ", "Word2Vec", CMDSTRINGTYPE|CMDMSG, &w2vfile, " : word2vec file ", "w2v", CMDSTRINGTYPE|CMDMSG, &w2vfile, " : word2vec file ", "Model", CMDSTRINGTYPE|CMDMSG, &modelfile, " : model file", "m", CMDSTRINGTYPE|CMDMSG, &modelfile, " : model model file", "Iterations", CMDINTTYPE|CMDMSG, &iterations, " : training iterations", "it", CMDINTTYPE|CMDMSG, &iterations, " : training iterations", "Alignments", CMDSTRINGTYPE|CMDMSG, &alignfile, " : output alignment file", "al", CMDSTRINGTYPE|CMDMSG, &alignfile, " : output alignment file", "UseNullWord", CMDBOOLTYPE|CMDMSG, &usenullword, ": use null word (default true)", "unw", CMDBOOLTYPE|CMDMSG, &usenullword, ": use null word (default true)", "Threads", CMDINTTYPE|CMDMSG, &threads, ": number of 
threads (default 2)", "th", CMDINTTYPE|CMDMSG, &threads, ": number of threads (default 2)", "ForceModel", CMDBOOLTYPE|CMDMSG, &forcemodel, ": force to use existing model for training", "fm", CMDBOOLTYPE|CMDMSG, &forcemodel, ": force to use existing model for training", "TrainVariances", CMDBOOLTYPE|CMDMSG, &trainvar, ": train variances (default true)", "tv", CMDBOOLTYPE|CMDMSG, &trainvar, ": train variances (default true)", "FixNullProb", CMDDOUBLETYPE|CMDMSG, &fixnullprob, ": fix null probability (default estimate)", "fnp", CMDDOUBLETYPE|CMDMSG, &fixnullprob, ": fix null probability (default estimate)", "MinVariance", CMDDOUBLETYPE|CMDMSG, &minvar, ": minimum variance (default 0.01)", "mv", CMDDOUBLETYPE|CMDMSG, &minvar, ": minimum variance (default 0.01)", "NormalizeVectors", CMDBOOLTYPE|CMDMSG, &normvectors, ": normalize vectors (default false)", "nv", CMDBOOLTYPE|CMDMSG, &normvectors, ": normalize vectors (default false)", "DistVar", CMDBOOLTYPE|CMDMSG, &distvar, ": use distortion variance (default true)", "dv", CMDBOOLTYPE|CMDMSG, &distvar, ": use distortion variance (default true)", "DistMean", CMDBOOLTYPE|CMDMSG, &distmean, ": use distortion mean (default true)", "dm", CMDBOOLTYPE|CMDMSG, &distmean, ": use distortion mean (default true)", "DistBeta", CMDBOOLTYPE|CMDMSG, &distbeta, ": use beta distribution for distortion (default true)", "db", CMDBOOLTYPE|CMDMSG, &distbeta, ": use beta distribution for distortion (default true)", "TxtModel", CMDSTRINGTYPE|CMDMSG, &modeltxtfile, " : model in textual form", "txt", CMDSTRINGTYPE|CMDMSG, &modeltxtfile, " : model in readable form", "DistWin", CMDINTTYPE|CMDMSG, &distwin, ": distortion window (default 8)", "dw", CMDINTTYPE|CMDMSG, &distwin, ": distortion window (default 8)", "M1iter", CMDINTTYPE|CMDMSG, &model1iter, ": number of itereations with model 1 (default 7)", "m1", CMDINTTYPE|CMDMSG, &model1iter, ": number of itereations with model 1 (default 7)", "Verbosity", CMDBOOLTYPE|CMDMSG, &verbosity, "verbose 
output", "v", CMDBOOLTYPE|CMDMSG, &verbosity, "verbose output", "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help", "h", CMDBOOLTYPE|CMDMSG, &help, "print this help", (char *)NULL ); if (argc == 1){ usage(); exit_error(IRSTLM_NO_ERROR); } GetParams(&argc, &argv, (char*) NULL); if (help){ usage(); exit_error(IRSTLM_NO_ERROR); } if (!srcdatafile || !trgdatafile || !w2vfile || !modelfile ) { usage(); exit_error(IRSTLM_ERROR_DATA,"Missing parameters"); } //check if model is readable bool testmodel=false; FILE* f;if ((f=fopen(modelfile,"r"))!=NULL){fclose(f);testmodel=true;} if (iterations && testmodel && !forcemodel) exit_error(IRSTLM_ERROR_DATA,"Use -ForceModel=y option to update an existing model."); cswam *model=new cswam(srcdatafile,trgdatafile,w2vfile, forcemodel, usenullword,fixnullprob, normvectors, model1iter, trainvar,minvar, distwin,distbeta, distmean,distvar, verbosity); if (iterations) model->train(srcdatafile,trgdatafile,modelfile,iterations,threads); if (alignfile) model->test(srcdatafile,trgdatafile,modelfile,alignfile,threads); if (modeltxtfile){ model->loadModel(modelfile); model->saveModelTxt(modeltxtfile); } delete model; exit_error(IRSTLM_NO_ERROR); } irstlm-6.00.05/src/cswam.cpp000077500000000000000000001426041263213470300156140ustar00rootroot00000000000000/****************************************************************************** IrstLM: IRST Language Model Toolkit, compile LM Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *******************************************************************************/ #include #include #include #include #include #include #include #include "thpool.h" #include "crc.h" #include "mfstream.h" #include "mempool.h" #include "htable.h" #include "n_gram.h" #include "util.h" #include "dictionary.h" #include "ngramtable.h" #include "doc.h" #include #include #include "cswam.h" using namespace std; namespace irstlm { cswam::cswam(char* sdfile,char *tdfile, char* w2vfile, bool forcemodel, bool usenull,double fixnullprob, bool normvect, int model1iter, bool trainvar,float minvar, int distwin,bool distbeta,bool distmean,bool distvar, bool verbose){ //actual model structure TM=NULL; A=NULL; Den=NULL; friends=NULL; efcounts=NULL; ecounts=NULL; loc_efcounts=NULL; loc_ecounts=NULL; //setting incremental_train=forcemodel; normalize_vectors=normvect; train_variances=trainvar; use_null_word=usenull; min_variance=minvar; distortion_window=distwin; distortion_mean=distmean; distortion_var=distvar; use_beta_distortion=distbeta; fix_null_prob=fixnullprob; DistMean=DistVar=0; //distortion mean and variance DistA=DistB=0; //beta parameters NullProb=0; M1iter=model1iter; //set mininum word frequency to collect friends minfreq=10; cout << "cswam configuration.\n"; cout << "Vectors: normalize [" << normalize_vectors << "] \n"; cout << "Gaussian Variances: train [" << train_variances << "] min [" << min_variance << "] initial [" << min_variance * SSEED << "]\n"; cout << "Null word: active [" << use_null_word << "] fix_null_prob [" << fix_null_prob << "]\n"; cout << "Distortion model: window [" << distortion_window << "] use beta [" << use_beta_distortion << "] update mean [" << distortion_mean << "] update variance [" << distortion_var << "]\n"; srandom(100); //ensure 
repicable generation of random numbers bucket=BUCKET; threads=1; verbosity=verbose; //create dictionaries srcdict=new dictionary(NULL,100000); srcdict->generate(sdfile,true); trgdict=new dictionary(NULL,100000); trgdict->generate(tdfile,true); //make aware of oov word srcdict->encode(srcdict->OOV()); trgdict->encode(trgdict->OOV()); trgBoD = trgdict->encode(trgdict->BoD()); //codes for begin/end sentence markers trgEoD = trgdict->encode(trgdict->EoD()); srcBoD = srcdict->encode(srcdict->BoD()); //codes for begin/end sentence markers srcEoD = srcdict->encode(srcdict->EoD()); //load word2vec dictionary W2V=NULL; D=0; loadword2vec(w2vfile); //check consistency of word2vec with target vocabulary } cswam::~cswam() { assert(A==NULL); if (TM){ cerr << "Releasing memory of Translation Model\n"; for (int e=0;esize();e++){ for (int n=0;nsize();f++) if (W2V[f]!=NULL) delete [] W2V[f]; delete [] W2V; } if (friends) delete [] friends; cerr << "Releasing memory of srcdict\n"; delete srcdict; cerr << "Releasing memory of srcdict\n"; delete trgdict; } void cswam::randword2vec(const char* word,float* vec,int it){ //initialize random generator srandom(crc16_ccitt(word,strlen(word))+it); //generate random numbers between -1 and +1, //then scale and shift according to w2v for (int d=0;d> w2vsize; cerr << " size= " << w2vsize; inp >> D ; cout << " dim= " << D << "\n"; assert(D>0 && D<1000); int srcoov=srcdict->oovcode(); W2V=new float* [srcdict->size()]; for (int f=0;fsize();f++) W2V[f]=NULL; char word[100]; float dummy; int f; for (long long i=0;i> word; f=srcdict->encode(word); if (f != srcoov){ W2V[f]=new float[D]; for (int d=0;d> W2V[f][d]; } else //skip this word for (int d=0;d> dummy; if (!(i % 10000)) cerr<< "."; } cerr << "\n"; cerr << "looking for missing source words in w2v\n"; int newwords=0; for ( f=0;fsize();f++){ if (W2V[f]==NULL && f!=srcBoD && f!=srcEoD) { if (verbosity) cerr << "Missing src word in w2v: [" << f << "] " << srcdict->decode(f) << "\n"; W2V[f]=new 
float[D]; //generate random vectors with same distribution randword2vec(srcdict->decode(f),W2V[f]); newwords++; if (verbosity){ for (int d=0;dsize();f++) if (W2V[f]!=NULL){ float norm=0; for (int d=0;d0); TM[e].G=new Gaussian [TM[e].n];TM[e].W=new float[TM[e].n]; for (int n=0;ndecode(e),TM[e].G[n].M,n); } for (int d=0;dfreq(e)+1.1)); // // //some exceptions if // // assert(TM[e].n>0); // TM[e].G=new Gaussian [TM[e].n];TM[e].W=new float[TM[e].n]; // for (int n=0;nencode(trgdict->decode(e)); // float srcfreq=srcdict->freq(f);float trgfreq=trgdict->freq(e); // if (f!=srcdict->oovcode() && srcfreq/trgfreq < 1.1 && srcfreq/trgfreq > 0.9 && srcfreq < 10 && f!=srcBoD && f!=srcEoD){ // memcpy(TM[e].G[n].M,W2V[f],sizeof(float) * D); // for (int d=0;ddecode(f) << "\n"; // }else{ // //pick candidates from friends // // // randword2vec(trgdict->decode(e),TM[e].G[n].M,n); // // for (int d=0;dsize()]; friends=new FriendList[trgdict->size()]; findfriends(friends); for (int e=0; esize(); e++) initEntry(e); } //this can overwrite existing model if (use_null_word) NullProb=(fix_null_prob?fix_null_prob:0.05); //null word alignment probability } int cswam::saveModelTxt(char* fname){ cerr << "Writing model into: " << fname << "\n"; mfstream out(fname,ios::out); out << "=dist= " << DistMean << " " << DistVar << "\n"; out << "=nullprob= " << NullProb << "\n"; for (int e=0; esize(); e++){ out << "=h= " << trgdict->decode(e) << " sz= " << TM[e].n << "\n"; for (int n=0;ndecode(e) << " w= " << TM[e].W[n] << " eC= " << TM[e].G[n].eC << " mS= " << TM[e].G[n].mS << "\n"; out << "=m= " << trgdict->decode(e); for (int d=0;ddecode(e); for (int d=0;dsave(out); out.write((const char*)&DistMean,sizeof(float)); out.write((const char*)&DistVar,sizeof(float)); out.write((const char*)&NullProb,sizeof(float)); for (int e=0; esize(); e++){ out.write((const char*)&TM[e].n,sizeof(int)); out.write((const char*)TM[e].W,TM[e].n * sizeof(float)); for (int n=0;n0 && r != D) exit_error(IRSTLM_ERROR_DATA, 
"incompatible dimension in model"); else D=r; if (verbosity) cerr << "\nLoading dictionary ... "; dictionary* dict=new dictionary(NULL,1000000); dict->load(inp); dict->encode(dict->OOV()); int current_size=dict->size(); //expand the model for training or keep the model fixed for testing if (expand){ if (verbosity) cerr << "\nExpanding model to include targer dictionary"; dict->incflag(1); for (int code=0;codesize();code++) dict->encode(trgdict->decode(code)); dict->incflag(2); } //replace the trgdict with the model dictionary delete trgdict;trgdict=dict; trgdict->encode(trgdict->OOV()); //updated dictionary codes trgBoD = trgdict->encode(trgdict->BoD()); //codes for begin/end sentence markers trgEoD = trgdict->encode(trgdict->EoD()); TM=new TransModel [trgdict->size()]; if (verbosity) cerr << "\nReading parameters .... "; inp.read((char*)&DistMean, sizeof(float)); inp.read((char*)&DistVar, sizeof(float)); inp.read((char*)&NullProb,sizeof(float)); cerr << "DistMean: " << DistMean << " DistVar: " << DistVar << " NullProb: " << NullProb << "\n"; if (use_beta_distortion) EstimateBeta(DistA,DistB,DistMean,DistVar); for (int e=0; esize()-current_size << " new entries .... 
"; for (int e=current_size; esize(); e++) initEntry(e); cerr << "\nDone\n"; return 1; } void cswam::initAlphaDen(){ //install Alpha[s][i][j] to collect counts //allocate if empty if (A==NULL){ assert(trgdata->numdoc()==srcdata->numdoc()); A=new float ***[trgdata->numdoc()]; for (int s=0;snumdoc();s++){ A[s]=new float **[trgdata->doclen(s)]; for (int i=0;idoclen(s);i++){ A[s][i]=new float *[TM[trgdata->docword(s,i)].n]; for (int n=0;ndocword(s,i)].n;n++) A[s][i][n]=new float [srcdata->doclen(s)]; } } } //initialize for (int s=0;snumdoc();s++) for (int i=0;idoclen(s);i++) for (int n=0;ndocword(s,i)].n;n++) memset(A[s][i][n],0,sizeof(float) * srcdata->doclen(s)); //allocate if (Den==NULL){ Den=new float*[trgdict->size()]; for (int e=0;esize();e++) Den[e]=new float[TM[e].n]; } //initialize for (int e=0;esize();e++) memset(Den[e],0,sizeof(float)*TM[e].n); } void cswam::freeAlphaDen(){ if (A!=NULL){ for (int s=0;snumdoc();s++){ for (int i=0;idoclen(s);i++){ for (int n=0;ndocword(s,i)].n;n++) delete [] A[s][i][n]; delete [] A[s][i]; } delete [] A[s]; } delete [] A; A=NULL; } if (Den!=NULL){ for (int e=0;esize();e++) delete [] Den[e]; delete [] Den; Den=NULL; } } ///***** //pthread_mutex_t cswam_mut1; //pthread_mutex_t cswam_mut2; double cswam_LL=0; //Log likelihood float logsum(float a,float b){ if (b0); dist+=(x[i]-m[i])*(x[i]-m[i])/(s[i]); norm+=s[i]; } return -0.5 * (dist + dim * log2pi + logf(norm)); } float cswam::LogBeta( float x,float a,float b){ assert(x>0 && x <1); //disregard constant factor! 
return (a-1) * log(x) + (b-1) * log(1-x); } float cswam::Delta(int i,int j,int l,int m){ i-=(use_null_word?1:0); l-=(use_null_word?1:0); float d=((i - j)>0?(float)(i-j)/l:(float)(i-j)/m); //range is [-1,+1]; if (use_beta_distortion) d=(d+1)/2; //move in range [0,1]; //reduce length penalty for short sentences if (l<=6 || m<=6) d/=2; return d; } float cswam::LogDistortion(float d){ if (use_beta_distortion) return LogBeta(d,DistA,DistB); else return LogGauss(1,&d,&DistMean,&DistVar); } void cswam::expected_counts(void *argv){ long long s=(long long) argv; ShowProgress(s, srcdata->numdoc()); int trglen=trgdata->doclen(s); // length of target sentence int srclen=srcdata->doclen(s); //length of source sentence float den;float delta=0; //distortion //reset likelihood localLL[s]=0; //compute denominator for each source-target pair for (int j=0;jdecode(srcdata->docword(s,j)) << "\n"; den=0; for (int i=0;idocword(s,i)].n;n++){ if (!(TM[trgdata->docword(s,i)].W[n]>0)) cerr << trgdict->decode(trgdata->docword(s,i)) << " n:" << n << "\n"; assert(TM[trgdata->docword(s,i)].W[n]>0); //weight zero must be prevented!!! 
//global_i=i; //cout << "i: " << trgdict->decode(trgdata->docword(s,i)) << "\n"; A[s][i][n][j]=LogGauss(D, W2V[srcdata->docword(s,j)], TM[trgdata->docword(s,i)].G[n].M, TM[trgdata->docword(s,i)].G[n].S) +log(TM[trgdata->docword(s,i)].W[n]) +(i>0 || !use_null_word ?logf(1-NullProb):logf(NullProb)) +(i>0 || !use_null_word ?LogDistortion(delta):0); if (i==0 && n==0) //den must be initialized den=A[s][i][n][j]; else den=logsum(den,A[s][i][n][j]); } } //update local likelihood localLL[s]+=den; for (int i=0;idocword(s,i)].n;n++){ assert(A[s][i][n][j]<= den); A[s][i][n][j]=expf(A[s][i][n][j]-den); // A is now a regular expected count if (A[s][i][n][j]<0.000000001) A[s][i][n][j]=0; //take mall risk of wrong normalization if (A[s][i][n][j]>0) TM[trgdata->docword(s,i)].G[n].eC++; //increase support set size } } } void cswam::EstimateBeta(float &a, float &b, float m, float s){ b = (s * m -s + m * m * m - 2 * m * m + m)/s; a = ( m * b )/(1-m); } void cswam::maximization(void *argv){ long long d=(long long) argv; ShowProgress(d,D); if (d==D){ //this thread is to maximize the global distortion model //Maximization step: Mean and variance of distortion model //Mean double totwdist=0, totdistprob=0, totnullprob=0, delta=0; for (int s=0;snumdoc();s++){ for (int j=0;jdoclen(s);j++) for (int i=0;idoclen(s);i++) if ((use_null_word && i==0) || abs(i-j-1) <= distortion_window){ delta=Delta(i,j,trgdata->doclen(s),srcdata->doclen(s)); for (int n=0;ndocword(s,i)].n;n++) if (A[s][i][n][j]>0){ if (i>0 || !use_null_word){ totwdist+=A[s][i][n][j]*delta; totdistprob+=A[s][i][n][j]; } else{ totnullprob+=A[s][i][n][j]; } } } } if (use_null_word && fix_null_prob==0) NullProb=(float)totnullprob/(totdistprob+totnullprob); if (distortion_mean && iter >0) //then update the mean DistMean=totwdist/totdistprob; //Variance if (distortion_var && iter >0){ double totwdeltadist=0; for (int s=0;snumdoc();s++) for (int i=1;idoclen(s);i++) //exclude i=0! 
for (int j=0;jdoclen(s);j++) if (abs(i-j-1) <= distortion_window){ delta=Delta(i,j,trgdata->doclen(s),srcdata->doclen(s)); for (int n=0;ndocword(s,i)].n;n++) if (A[s][i][n][j]>0) totwdeltadist+=A[s][i][n][j] * (delta-DistMean) * (delta-DistMean); } DistVar=totwdeltadist/totdistprob; } cerr << "Dist: " << DistMean << " " << DistVar << "\n"; if (use_null_word) cerr << "NullProb: " << NullProb << "\n"; if (use_beta_distortion){ cerr << "Beta A: " << DistA << " Beta B: " << DistB << "\n"; EstimateBeta(DistA,DistB,DistMean,DistVar); } } else{ //Maximization step: Mean; for (int s=0;snumdoc();s++) for (int j=0;jdoclen(s);j++) for (int i=0;idoclen(s);i++) if ((use_null_word && i==0) || abs(i-j-1) <= distortion_window) for (int n=0;ndocword(s,i)].n;n++) if (A[s][i][n][j]>0) TM[trgdata->docword(s,i)].G[n].M[d]+=A[s][i][n][j] * W2V[srcdata->docword(s,j)][d]; //second pass for (int e=0;esize();e++) for (int n=0;n0) TM[e].G[n].M[d]/=Den[e][n]; //update the mean estimated if (train_variances){ //Maximization step: Variance; for (int s=0;snumdoc();s++) for (int j=0;jdoclen(s);j++) for (int i=0;idoclen(s);i++) if ((use_null_word && i==0) || abs(i-j-1) <= distortion_window) for (int n=0;ndocword(s,i)].n;n++) if (A[s][i][n][j]>0) TM[trgdata->docword(s,i)].G[n].S[d]+= (A[s][i][n][j] * (W2V[srcdata->docword(s,j)][d]-TM[trgdata->docword(s,i)].G[n].M[d]) * (W2V[srcdata->docword(s,j)][d]-TM[trgdata->docword(s,i)].G[n].M[d]) ); //second pass for (int e=0;esize();e++) for (int n=0;n0){ TM[e].G[n].S[d]/=Den[e][n]; //might be too aggressive? if (TM[e].G[n].S[d] < min_variance) TM[e].G[n].S[d]=min_variance; //improves generalization! 
} } } } void cswam::expansion(void *argv){ long long e=(long long) argv; for (int n=0;n= 0.95 && //mean variance does not reduce significantly TM[e].G[n].eC >= eCThresh && //population is large S > SThresh) { //variance is large if (verbosity) cerr << "\n" << trgdict->decode(e) << " n= " << n << " (" << TM[e].n << ") Counts: " << TM[e].G[n].eC << " mS: " << S << "\n"; //expand: create new Gaussian after Gaussian n Gaussian *nG=new Gaussian[TM[e].n+1]; float *nW=new float[TM[e].n+1]; memcpy((void *)nG,(const void *)TM[e].G, (n+1) * sizeof(Gaussian)); memcpy((void *)nW,(const void *)TM[e].W, (n+1) * sizeof(float)); if (n+1 < TM[e].n){ memcpy((void *)&nG[n+2],(const void*)&TM[e].G[n+1],(TM[e].n-n-1) * sizeof(Gaussian)); memcpy((void *)&nW[n+2],(const void*)&TM[e].W[n+1],(TM[e].n-n-1) * sizeof(float)); } //initialize mean and variance vectors nG[n+1].M=new float[D];nG[n+1].S=new float[D]; for (int d=0;d0?dist/norm:1); } float rl1(const float* a,const float*b, int d){ float maxreldist=0; float reldist; for (int i=0;imaxreldist) maxreldist=reldist; } return maxreldist; } float al1(const float* a,const float*b, int d){ float maxdist=0; float dist; for (int i=0;imaxdist) maxdist=dist; } return maxdist; } void cswam::contraction(void *argv){ long long e=(long long) argv; float min_std=sqrt(min_variance); float min_weight=0.01; for (int n=0;n1); if (verbosity) cerr << "\n" << trgdict->decode(e) << " n= " << n << " Weight: " << TM[e].W[n] << " Dist= " << max_dist << "\n"; //expand: create new mixture model with n-1 components Gaussian *nG=new Gaussian[TM[e].n-1]; float *nW=new float[TM[e].n-1]; if (n>0){ //copy all entries before n memcpy((void *)nG,(const void *)TM[e].G, n * sizeof(Gaussian)); memcpy((void *)nW,(const void *)TM[e].W, n * sizeof(float)); } if (n+1 < TM[e].n){ //copy all entries after memcpy((void *)&nG[n],(const void*)&TM[e].G[n+1],(TM[e].n-n-1) * sizeof(Gaussian)); memcpy((void *)&nW[n],(const void*)&TM[e].W[n+1],(TM[e].n-n-1) * sizeof(float)); } //don't 
need to normalized weights! if (max_dist < min_std)// this is the gaussian overlapping case nW[n1]+=TM[e].W[n]; //the left gaussian inherits the weight //update TM[e] structure TM[e].n--;n--; delete [] TM[e].G;TM[e].G=nG; delete [] TM[e].W; TM[e].W=nW; } } //re-normalize weights float totw=0; for (int n=0;n 0.0001);} for (int n=0;nsize()>trgdata->numdoc()?trgdict->size():trgdata->numdoc(); task *t=new task[numtasks]; assert(numtasks>D); //multi-threading also distributed over D //support variable to compute likelihood localLL=new float[srcdata->numdoc()]; while (iter < maxiter){ cerr << "\nIteration: " << ++iter << "\n"; initAlphaDen(); //reset support set size for (int e=0;esize();e++) for (int n=0;nnumdoc();s++){ //prepare and assign tasks to threads t[s].ctx=this; t[s].argv=(void *)s; thpool_add_work(thpool, &cswam::expected_counts_helper, (void *)&t[s]); } //join all threads thpool_wait(thpool); //Reset model before update for (int e=0;e size();e++) for (int n=0;nsize();e++) memset(Den[e],0,TM[e].n * sizeof(float)); cswam_LL=0; //compute LL of current model //compute normalization term for each target word for (int s=0;snumdoc();s++){ cswam_LL+=localLL[s]; for (int i=0;idoclen(s);i++) for (int n=0;ndocword(s,i)].n;n++) for (int j=0;jdoclen(s);j++) Den[trgdata->docword(s,i)][n]+=A[s][i][n][j]; } cerr << "LL = " << cswam_LL << "\n"; cerr << "M-step: "; for (long long d=0;d<=D;d++){ //include special job d=D for distortion model t[d].ctx=this; t[d].argv=(void *)d; thpool_add_work(thpool, &cswam::maximization_helper, (void *)&t[d]); } //join all threads thpool_wait(thpool); //some checks of the models: fix degenerate models for (int e=0;esize();e++) if (e!=trgEoD) for (int n=0;ndecode(e) << " n: " << n << " eC:" << TM[e].G[n].eC << "\n"; for (int d=0;dencode("bege")==e){ // cerr << "bege " << " mS: " << TM[e].G[0].mS << " n: " << TM[e].n << " eC " << TM[e].G[0].eC << "\n"; // cerr << "M:"; for (int d=0;d<10;d++) cerr << " " << TM[e].G[0].M[d]; cerr << "\n"; // cerr 
<< "S:"; for (int d=0;d<10;d++) cerr << " " << TM[e].G[0].S[d]; cerr << "\n"; // } // } //update the weight estimates: ne need of multithreading float totW; int ngauss=0; for (int e=0;esize();e++){ totW=0; for (int n=0;n0) for (int n=0;n 1 || incremental_train ){ freeAlphaDen(); //needs to be reallocated as models might change cerr << "\nP-step: "; for (long long e=0;esize();e++){ //check if to decrease number of gaussians per target word t[e].ctx=this; t[e].argv=(void *)e; thpool_add_work(thpool, &cswam::contraction_helper, (void *)&t[e]); } //join all threads thpool_wait(thpool); cerr << "\nS-step: "; for (long long e=0;esize();e++){ //check if to increase number of gaussians per target word t[e].ctx=this; t[e].argv=(void *)e; thpool_add_work(thpool, &cswam::expansion_helper, (void *)&t[e]); } //join all threads thpool_wait(thpool); } if (srcdata->numdoc()>10000) system("date"); saveModel(modelfile); } // for (int e=0;esize();e++) // for (int d=0;ddecode(e) << " S: " << S[e][d] << " M: " << M[e][d]<< "\n"; //destroy thread pool thpool_destroy(thpool); freeAlphaDen(); delete srcdata; delete trgdata; delete [] t; delete [] localLL; return 1; } void cswam::aligner(void *argv){ long long s=(long long) argv; static float maxfloat=std::numeric_limits::max(); if (! 
(s % 10000)) {cerr << ".";cerr.flush();} //fprintf(stderr,"Thread: %lu Document: %d (out of %d)\n",(long)pthread_self(),s,srcdata->numdoc()); int trglen=trgdata->doclen(s); // length of target sentence int srclen=srcdata->doclen(s); //length of source sentence assert(trglendecode(srcdata->docword(s,j)) << "\n"; best_score=-maxfloat;best_i=0; for (int i=first_target;idecode(trgdata->docword(s,i)) << " "; for (int n=0;ndocword(s,i)].n;n++){ score=LogGauss(D, W2V[srcdata->docword(s,j)], TM[trgdata->docword(s,i)].G[n].M, TM[trgdata->docword(s,i)].G[n].S)+log(TM[trgdata->docword(s,i)].W[n]); if (n==0) sum=score; else sum=logsum(sum,score); } //completed mixture score if (distortion_var || distortion_mean){ if (i>0 ||!use_null_word){ float d=Delta(i,j,trglen,srclen); sum+=logf(1-NullProb) + LogDistortion(d); } else if (use_null_word ) sum+=logf(NullProb); } else //use plain distortion model if (i>0){ if (i - (use_null_word?1:0) > j ) sum-=(i- (use_null_word?1:0) -j); else if (i - (use_null_word?1:0) < j ) sum-=(j - i + (use_null_word?1:0)); } //add distortion score now //cout << "score: " << sum << "\n"; // cout << "\t " << srcdict->decode(srcdata->docword(s,j)) << " " << dist << "\n"; //if (dist > -50) score=(float)exp(-dist)/norm; if (sum > best_score){ best_score=sum; best_i=i; if ((!use_null_word || best_i>0) && !some_not_null) some_not_null=true; } } //cout << "best_i: " << best_i << "\n"; alignments[s % bucket][j]=best_i; if (j==(srclen-1) && !some_not_null){ j=-1; //restart loop and remove null word from options first_target=1; some_not_null=true; //make sure to pass this check next time } } } int cswam::test(char *srctestfile, char *trgtestfile, char* modelfile, char* alignfile,int threads){ {mfstream out(alignfile,ios::out);} //empty the file initModel(modelfile); if (!distortion_mean){ if (use_beta_distortion){ cerr << "ERROR: cannot test with beta distribution without mean\n"; return 0; } DistMean=0; //force mean to zero } //Load training data srcdata=new 
doc(srcdict,srctestfile); trgdata=new doc(trgdict,trgtestfile,use_null_word); assert(srcdata->numdoc()==trgdata->numdoc()); bucket=BUCKET; //initialize the bucket size alignments=new int* [BUCKET]; for (int s=0;snumdoc();s++){ t[s % BUCKET].ctx=this; t[s % BUCKET].argv=(void *)s; thpool_add_work(thpool, &cswam::aligner_helper, (void *)&t[s % BUCKET]); if (((s % BUCKET) == (BUCKET-1)) || (s==(srcdata->numdoc()-1)) ){ //join all threads thpool_wait(thpool); //cerr << "Start printing\n"; if ((s % BUCKET) != (BUCKET-1)) bucket=srcdata->numdoc() % bucket; //last bucket at end of file mfstream out(alignfile,ios::out | ios::app); for (int b=0;bdoclen(s-bucket+1+b); j++) if (!use_null_word || alignments[b][j]>0){ //print target using 0 for first actual word out << (first?"":" ") << j << "-" << alignments[b][j]-(use_null_word?1:0); first=false; } out << "\n"; } } } //destroy thread pool thpool_destroy(thpool); delete [] t; for (int s=0;s mientry; //pair type containing src word and mi score bool myrank (Friend a,Friend b) { return (a.score > b.score ); } //void cswam::findfriends(FriendList* friends){ // // typedef std::unordered_map src_map; // src_map* table= new src_map[trgdict->size()]; // // // amap["def"][7] = 2.2; // // std::cout << amap["abc"][12] << '\n'; // // std::cout << amap["def"][7] << '\n'; // // // int *srcfreq=new int[srcdict->size()]; // int *trgfreq=new int[trgdict->size()]; // int totfreq=0; // int minfreq=10; // // cerr << "collecting co-occurrences\n"; // for (int s=0;snumdoc();s++){ // // int trglen=trgdata->doclen(s); // length of target sentence // int srclen=srcdata->doclen(s); //length of source sentence // // int frac=(s * 1000)/srcdata->numdoc(); // if (!(frac % 10)) fprintf(stderr,"%02d\b\b",frac/10); // // for (int i=0;idocword(s,i); // float trgdictfreq=trgdict->freq(trg); // if (trgdict->freq(trg)>=10){ // for (int j=0;jdocword(s,j); // float freqratio=srcdict->freq(src)/trgdictfreq; // if (srcdict->freq(src)>=minfreq && freqratio <= 10 && 
freqratio >= 0.1){ // table[trg][src]++; // totfreq++; // srcfreq[src]++; // trgfreq[trg]++; // } // } // } // } // } // // cerr << "computing mutual information\n"; // Friend mie; FriendList mivec; // // // for (int i = 0; i < trgdict->size(); i++){ // // int frac=(i * 1000)/trgdict->size(); // if (!(frac % 10)) fprintf(stderr,"%02d\b\b",frac/10); // // mivec.clear(); // for (auto jtr = table[i].begin(); jtr != table[i].end();jtr++){ // int j=(*jtr).first; int freq=(*jtr).second; // float freqratio=(float)srcdict->freq(j)/(float)trgdict->freq(i); // if (freq>minfreq){ // && freqratio < 10 && freqratio > 0.1){ // //compute mutual information // float mutualinfo= // logf(freq/(float)trgfreq[i]) - log((float)srcfreq[j]/totfreq); // mutualinfo/=log(2); // mie.word=j; mie.score=mutualinfo; // mivec.push_back(mie); // } // } // if (mivec.size()>0){ // std::sort(mivec.begin(),mivec.end(),myrank); // //sort the vector and take the top log(10) // int count=0; // for (auto jtr = mivec.begin(); jtr != mivec.end();jtr++){ // //int j=(*jtr).word; float mutualinfo=(*jtr).score; // friends[i].push_back(*jtr); // //cout << trgdict->decode(i) << " " << srcdict->decode(j) << " " << mutualinfo << endl; // //if (++count >=50) break; // } // // } // } // // //} void cswam::M1_ecounts(void *argv){ long long s=(long long) argv; int b=s % threads; //value of the actual bucket int trglen=trgdata->doclen(s); // length of target sentence int srclen=srcdata->doclen(s); //length of source sentence float pef=0; ShowProgress(s,srcdata->numdoc()); float lowprob=0.0000001; for (int j=0;jdocword(s,j); if (srcdict->freq(f)>=minfreq){ float t=0; for (int i=0;idocword(s,i); if (trgdict->freq(e)>=minfreq && (i==0 || abs(i-j-1) <= distortion_window) && prob[e][f]>lowprob) t+=prob[e][f]; } for (int i=0;idocword(s,i); if (trgdict->freq(e)>=minfreq && (i==0 || abs(i-j-1) <= distortion_window) && prob[e][f]>lowprob){ pef=prob[e][f]/t; loc_efcounts[b][e][f]+=pef; loc_ecounts[b][e]+=pef; } } } } } void 
cswam::M1_update(void *argv){ long long e=(long long) argv; ShowProgress(e,trgdict->size()); // for (auto jtr = efcounts[e].begin(); jtr != efcounts[e].end();jtr++){ for (src_map::iterator jtr = efcounts[e].begin(); jtr != efcounts[e].end();jtr++){ int f=(*jtr).first; prob[e][f]=efcounts[e][f]/ecounts[e]; } } void cswam::M1_collect(void *argv){ long long e=(long long) argv; ShowProgress(e,trgdict->size()); for (int b=0;bsize()]; loc_ecounts[b]=new float[trgdict->size()]; } cerr << "allocating global count structures\n"; //allocate the global count structures efcounts=new src_map[trgdict->size()]; ecounts=new float[trgdict->size()]; } if (clearmem){ for (int b=0;bsize(); e++){ efcounts[e].clear(); memset(ecounts,0,sizeof(int)*trgdict->size()); } //local expected counts are reset in main loop } } void cswam::findfriends(FriendList* friends){ //allocate the global prob table prob= new src_map[trgdict->size()]; //allocate thread safe structures M1_clearcounts(false); //prepare thread pool threadpool thpool=thpool_init(threads); task *t=new task[trgdict->size()>threads?trgdict->size():threads]; float minprob=0.01; cerr << "initializing M1\n"; for (int s=0;snumdoc();s++){ int trglen=trgdata->doclen(s); // length of target sentence int srclen=srcdata->doclen(s); //length of source sentence int frac=(s * 1000)/srcdata->numdoc(); if (!(frac % 10)) fprintf(stderr,"%02d\b\b",frac/10); for (int j=0;jdocword(s,j); if (srcdict->freq(f)>=minfreq){ for (int i=0;idocword(s,i); if (trgdict->freq(e)>=minfreq && (i==0 || abs(i-j-1) <= distortion_window)) prob[e][f]=1; } } } } cerr << "training M1\n"; for (int it=0;itnumdoc();s++){ t[s % threads].ctx=this; t[s % threads].argv=(void *)s; thpool_add_work(thpool, &cswam::M1_ecounts_helper,(void *)&t[s % threads]); if (((s % threads) == (threads-1)) || (s==(srcdata->numdoc()-1))) thpool_wait(thpool);//join all threads } //update the global counts for (long long e = 0; e < trgdict->size(); e++){ t[e].ctx=this; t[e].argv=(void *)e; 
thpool_add_work(thpool, &cswam::M1_collect_helper,(void *)&t[e]); } thpool_wait(thpool);//join all threads //update probabilities for (long long e = 0; e < trgdict->size(); e++){ t[e].ctx=this; t[e].argv=(void *)e; thpool_add_work(thpool, &cswam::M1_update_helper,(void *)&t[e]); } thpool_wait(thpool); //join all threads } cerr << "computing candidates\n"; Friend f;FriendList fv; for (int e = 0; e < trgdict->size(); e++){ ShowProgress(e,trgdict->size()); fv.clear(); //save in a vector and compute entropy float H=0; // for (auto jtr = prob[e].begin(); jtr != prob[e].end();jtr++){ for (src_map::iterator jtr = prob[e].begin(); jtr != prob[e].end();jtr++){ f.word=(*jtr).first; f.score=(*jtr).second; assert(f.score>=0 && f.score<=1); if (f.score>0) H-=f.score * logf(f.score); if (f.score >= minprob) //never include options with prob < minprob fv.push_back(f); } std::sort(fv.begin(),fv.end(),myrank); int PP=round(expf(H)); //compute perplexity cout << trgdict->decode(e) << " # friends: " << fv.size() << " PP " << PP << endl; int count=0; // for (auto jtr = fv.begin(); jtr != fv.end();jtr++){ for (FriendList::iterator jtr = fv.begin(); jtr != fv.end();jtr++){ friends[e].push_back(*jtr); //if (verbosity) cout << trgdict->decode(e) << " " << srcdict->decode((*jtr).word) << " " << (*jtr).score << endl; if (++count >= PP) break; } } //destroy thread pool thpool_destroy(thpool); delete [] t; M1_clearcounts(true); delete [] prob; } } //namespace irstlm irstlm-6.00.05/src/cswam.h000077500000000000000000000147471263213470300152670ustar00rootroot00000000000000/****************************************************************************** IrstLM: IRST Language Model Toolkit, compile LM Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later 
version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #ifndef MF_CSWAM_H #define MF_CSWAM_H #ifdef HAVE_CXX0 #include #else #include #endif #include namespace irstlm { typedef struct{ float* M; //mean vectors float* S; //variance vectors //training support items float eC; //support set size float mS; //mean variance } Gaussian; typedef struct{ int n; //number of Gaussians float *W; //weight vector Gaussian *G; //Gaussians } TransModel; typedef struct{ int word; //word code float score; //score (mutual information) } Friend; typedef std::vector FriendList; //list of word Friends #ifdef HAVE_CXX0 typedef std::unordered_map src_map; //target to source associative memory #else typedef std::map src_map; //target to source associative memory #endif class cswam { //data dictionary* srcdict; //source dictionary dictionary* trgdict; //target dictionary doc* srcdata; //source training data doc* trgdata; //target trainign data FriendList* friends; //prior list of translation candidates //word2vec float **W2V; //vector for each source word int D; //dimension of vector space //model TransModel *TM; float DistMean,DistVar; //distortion mean and variance float DistA,DistB; //gamma parameters float NullProb; //null probability //settings bool normalize_vectors; bool train_variances; double fix_null_prob; bool use_null_word; bool verbosity; float min_variance; int distortion_window; bool distortion_mean; bool distortion_var; bool use_beta_distortion; int minfreq; bool incremental_train; //private info 
shared among threads int trgBoD; //code of segment begin in target dict int trgEoD; //code of segment end in target dict int srcBoD; //code of segment begin in src dict int srcEoD; //code of segment end in src dict float ****A; //expected counts float **Den; //alignment probs float *localLL; //local log-likelihood int **alignments; //word alignment info int threads; //number of threads int bucket; //size of bucket int iter; //current iteration int M1iter; //iterations with model 1 //Model 1 initialization private variables src_map* prob; //model one probabilities src_map** loc_efcounts; //expected count probabilities float **loc_ecounts; //expected count probabilities src_map* efcounts; //expected count probabilities float *ecounts; //expected count probabilities struct task { //basic task info to run task void *ctx; void *argv; }; public: cswam(char* srcdatafile,char* trgdatafile, char* word2vecfile, bool forcemodel, bool usenull,double fix_null_prob, bool normv2w, int model1iter, bool trainvar,float minvar, int distwin,bool distbeta, bool distmean,bool distvar, bool verbose); ~cswam(); void loadword2vec(char* fname); void randword2vec(const char* word,float* vec,int it=0); void initModel(char* fname); void initEntry(int entry); int saveModel(char* fname); int saveModelTxt(char* fname); int loadModel(char* fname,bool expand=false); void initAlphaDen(); void freeAlphaDen(); float LogGauss(const int dim,const float* x,const float *m, const float *s); float LogDistortion(float d); float LogBeta(float x, float a, float b); void EstimateBeta(float &a, float &b, float m, float s); float Delta( int i, int j, int l=1, int m=1); void expected_counts(void *argv); static void *expected_counts_helper(void *argv){ task t=*(task *)argv; ((cswam *)t.ctx)->expected_counts(t.argv);return NULL; }; void maximization(void *argv); static void *maximization_helper(void *argv){ task t=*(task *)argv; ((cswam *)t.ctx)->maximization(t.argv);return NULL; }; void expansion(void *argv); 
static void *expansion_helper(void *argv){ task t=*(task *)argv; ((cswam *)t.ctx)->expansion(t.argv);return NULL; }; void contraction(void *argv); static void *contraction_helper(void *argv){ task t=*(task *)argv; ((cswam *)t.ctx)->contraction(t.argv);return NULL; }; void M1_ecounts(void *argv); static void *M1_ecounts_helper(void *argv){ task t=*(task *)argv; ((cswam *)t.ctx)->M1_ecounts(t.argv);return NULL; } void M1_collect(void *argv); static void *M1_collect_helper(void *argv){ task t=*(task *)argv; ((cswam *)t.ctx)->M1_collect(t.argv);return NULL; } void M1_update(void *argv); static void *M1_update_helper(void *argv){ task t=*(task *)argv; ((cswam *)t.ctx)->M1_update(t.argv);return NULL; } void M1_clearcounts(bool clearmem=false); void findfriends(FriendList* friends); int train(char *srctrainfile,char *trgtrainfile,char* modelfile, int maxiter,int threads=1); void aligner(void *argv); static void *aligner_helper(void *argv){ task t=*(task *)argv; ((cswam *)t.ctx)->aligner(t.argv);return NULL; }; int test(char *srctestfile, char* trgtestfile, char* modelfile,char* alignmentfile, int threads=1); }; } //namespace irstlm #endif irstlm-6.00.05/src/dict.cpp000066400000000000000000000135051263213470300154170ustar00rootroot00000000000000// $Id: dict.cpp 3677 2010-10-13 09:06:51Z bertoldi $ #include #include "cmd.h" #include "util.h" #include "mfstream.h" #include "mempool.h" #include "dictionary.h" using namespace std; void print_help(int TypeFlag=0){ std::cerr << std::endl << "dict - extracts a dictionary" << std::endl; std::cerr << std::endl << "USAGE:" << std::endl; std::cerr << " dict -i= [options]" << std::endl; std::cerr << std::endl << "DESCRIPTION:" << std::endl; std::cerr << " dict extracts a dictionary from a corpus or a dictionary." 
<< std::endl; std::cerr << std::endl << "OPTIONS:" << std::endl; FullPrintParams(TypeFlag, 0, 1, stderr); } void usage(const char *msg = 0) { if (msg){ std::cerr << msg << std::endl; } else{ print_help(); } } int main(int argc, char **argv) { char *inp=NULL; char *out=NULL; char *testfile=NULL; char *intsymb=NULL; //must be single characters int freqflag=0; //print frequency of words int sortflag=0; //sort dictionary by frequency int curveflag=0; //plot dictionary growth curve int curvesize=10; //size of curve int listflag=0; //print oov words in test file int size=1000000; //initial size of table .... float load_factor=0; //initial load factor, default LOAD_FACTOR int prunefreq=0; //pruning according to freq value int prunerank=0; //pruning according to freq rank bool help=false; DeclareParams((char*) "InputFile", CMDSTRINGTYPE|CMDMSG, &inp, "input file (Mandatory)", "i", CMDSTRINGTYPE|CMDMSG, &inp, "input file (Mandatory)", "OutputFile", CMDSTRINGTYPE|CMDMSG, &out, "output file", "o", CMDSTRINGTYPE|CMDMSG, &out, "output file", "f", CMDBOOLTYPE|CMDMSG, &freqflag,"output word frequencies; default is false", "Freq", CMDBOOLTYPE|CMDMSG, &freqflag,"output word frequencies; default is false", "sort", CMDBOOLTYPE|CMDMSG, &sortflag,"sort dictionary by frequency; default is false", "Size", CMDINTTYPE|CMDMSG, &size, "Initial dictionary size; default is 1000000", "s", CMDINTTYPE|CMDMSG, &size, "Initial dictionary size; default is 1000000", "LoadFactor", CMDFLOATTYPE|CMDMSG, &load_factor, "set the load factor for cache; it should be a positive real value; default is 0", "lf", CMDFLOATTYPE|CMDMSG, &load_factor, "set the load factor for cache; it should be a positive real value; default is 0", "IntSymb", CMDSTRINGTYPE|CMDMSG, &intsymb, "interruption symbol", "is", CMDSTRINGTYPE|CMDMSG, &intsymb, "interruption symbol", "PruneFreq", CMDINTTYPE|CMDMSG, &prunefreq, "prune words with frequency below the specified value", "pf", CMDINTTYPE|CMDMSG, &prunefreq, "prune words with 
frequency below the specified value", "PruneRank", CMDINTTYPE|CMDMSG, &prunerank, "prune words with frequency rank above the specified value", "pr", CMDINTTYPE|CMDMSG, &prunerank, "prune words with frequency rank above the specified value", "Curve", CMDBOOLTYPE|CMDMSG, &curveflag,"show dictionary growth curve; default is false", "c", CMDBOOLTYPE|CMDMSG, &curveflag,"show dictionary growth curve; default is false", "CurveSize", CMDINTTYPE|CMDMSG, &curvesize, "default 10", "cs", CMDINTTYPE|CMDMSG, &curvesize, "default 10", "TestFile", CMDSTRINGTYPE|CMDMSG, &testfile, "compute OOV rates on the specified test corpus", "t", CMDSTRINGTYPE|CMDMSG, &testfile, "compute OOV rates on the specified test corpus", "ListOOV", CMDBOOLTYPE|CMDMSG, &listflag, "print OOV words to stderr; default is false", "oov", CMDBOOLTYPE|CMDMSG, &listflag, "print OOV words to stderr; default is false", "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help", "h", CMDBOOLTYPE|CMDMSG, &help, "print this help", (char*)NULL ); if (argc == 1){ usage(); exit_error(IRSTLM_NO_ERROR); } GetParams(&argc, &argv, (char*) NULL); if (help){ usage(); exit_error(IRSTLM_NO_ERROR); } if (inp==NULL) { usage(); exit_error(IRSTLM_NO_ERROR, "Warning: no input file specified"); }; // options compatibility issues: if (curveflag && !freqflag) freqflag=1; if (testfile!=NULL && !freqflag) { freqflag=1; mfstream test(testfile,ios::in); if (!test) { usage(); std::string msg("Warning: cannot open testfile: "); msg.append(testfile); exit_error(IRSTLM_NO_ERROR, msg); } test.close(); } //create dictionary: generating it from training corpus, or loading it from a dictionary file dictionary *d = new dictionary(inp,size,load_factor); // sort dictionary if (prunefreq>0 || prunerank>0 || sortflag) { dictionary *sortd=new dictionary(d,false); sortd->sort(); delete d; d=sortd; } // show statistics on dictionary growth and OOV rates on test corpus if (testfile != NULL) d->print_curve_oov(curvesize, testfile, listflag); if (curveflag) 
d->print_curve_growth(curvesize); //prune words according to frequency and rank if (prunefreq>0 || prunerank>0) { cerr << "pruning dictionary prunefreq:" << prunefreq << " prunerank: " << prunerank <<" \n"; int count=0; int bos=d->encode(d->BoS()); int eos=d->encode(d->EoS()); for (int i=0; i< d->size() ; i++) { if (prunefreq && d->freq(i) <= prunefreq && i!=bos && i!=eos) { d->freq(i,0); continue; } if (prunerank>0 && count>=prunerank && i!=bos && i!=eos) { d->freq(i,0); continue; } count++; } } // if outputfile is provided, write the dictionary into it if(out!=NULL) d->save(out,freqflag); } irstlm-6.00.05/src/dictionary.cpp000066400000000000000000000313351263213470300166420ustar00rootroot00000000000000// $Id: dictionary.cpp 3640 2010-10-08 14:58:17Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include #include #include "mempool.h" #include "htable.h" #include "index.h" #include "util.h" #include "dictionary.h" #include "mfstream.h" using namespace std; dictionary::dictionary(char *filename,int size, float lf) { if (lf<=0.0) lf=DICTIONARY_LOAD_FACTOR; load_factor=lf; htb = new HASHTABLE_t((size_t) (size/load_factor)); tb = new dict_entry[size]; st = new strstack(size * 10); for (int i=0; i> setw(100) >> buffer; inp.close(); if ((strncmp(buffer,"dict",4)==0) || (strncmp(buffer,"DICT",4)==0)) load(filename); else generate(filename); cerr << "loaded \n"; } int dictionary::getword(fstream& inp , char* buffer) const { while(inp >> setw(MAX_WORD) >> buffer) { //warn if the word is very long if (strlen(buffer)==(MAX_WORD-1)) { cerr << "getword: a very long word was read (" << buffer << ")\n"; } //skip words of length zero chars: why should this happen? 
if (strlen(buffer)==0) { cerr << "zero length word!\n"; continue; } return 1; } return 0; } void dictionary::generate(char *filename,bool header) { char buffer[MAX_WORD]; int counter=0; mfstream inp(filename,ios::in); if (!inp) { std::stringstream ss_msg; ss_msg << "cannot open " << filename << "\n"; exit_error(IRSTLM_ERROR_IO, ss_msg.str()); } cerr << "dict:"; ifl=1; //skip header if (header) inp.getline(buffer,MAX_WORD); while (getword(inp,buffer)) { incfreq(encode(buffer),1); if (!(++counter % 1000000)) cerr << "."; } ifl=0; cerr << "\n"; inp.close(); } void dictionary::augment(dictionary *d) { incflag(1); for (int i=0; in; i++) encode(d->decode(i)); incflag(0); encode(OOV()); } // print_curve: show statistics on dictionary growth void dictionary::print_curve_growth(int curvesize) const { int* curve = new int[curvesize]; for (int i=0; i curvesize-1) curve[curvesize-1]++; else curve[tb[i].freq-1]++; } //cumulating results for (int i=curvesize-2; i>=0; i--) { curve[i] = curve[i] + curve[i+1]; } cout.setf(ios::fixed); cout << "Dict size: " << n << "\n"; cout << "**************** DICTIONARY GROWTH CURVE ****************\n"; cout << "Freq\tEntries\tPercent"; cout << "\n"; for (int i=0; i" << i << "\t" << curve[i] << "\t" << setprecision(2) << (float)curve[i]/n * 100.0 << "%"; cout << "\n"; } cout << "*********************************************************\n"; delete []curve; } // print_curve_oov: show OOV amount and OOV rates computed on test corpus void dictionary::print_curve_oov(int curvesize, const char *filename, int listflag) { int *OOVchart=new int[curvesize]; int NwTest; test(OOVchart, &NwTest, curvesize, filename, listflag); cout.setf(ios::fixed); cout << "Dict size: " << n << "\n"; cout << "Words of test: " << NwTest << "\n"; cout << "**************** OOV RATE STATISTICS ****************\n"; cout << "Freq\tOOV_Entries\tOOV_Rate"; cout << "\n"; for (int i=0; i" << buffer << "\n"; } } else { if(freq < curvesize) OOVchart[freq]++; } m_NwTest++; if (!(++k % 
1000000)) cerr << "."; } cerr << "\n"; inp.close(); // cumulating results for (int i=1; ipush(buffer); tb[n].code=n; if (freqflag) inp >> tb[n].freq; else tb[n].freq=0; //always insert without checking whether the word is already in if ((addr=htb->insert((char*)&tb[n].word))) { if (addr!=(char *)&tb[n].word) { cerr << "dictionary::loadtxt wrong entry was found (" << buffer << ") in position " << n << "\n"; // exit(1); continue; // continue loading dictionary } } N+=tb[n].freq; if (strcmp(buffer,OOV())==0) oov_code=n; if (++n==lim) grow(); } inp.close(); } void dictionary::load(std::istream& inp) { char buffer[MAX_WORD]; char *addr; int size; inp >> size; for (int i=0; i> setw(MAX_WORD) >> buffer; tb[n].word=st->push(buffer); tb[n].code=n; inp >> tb[n].freq; N+=tb[n].freq; //always insert without checking whether the word is already in if ((addr=htb->insert((char *)&tb[n].word))) { if (addr!=(char *)&tb[n].word) { std::stringstream ss_msg; ss_msg << "dictionary::loadtxt wrong entry was found (" << buffer << ") in position " << n; exit_error(IRSTLM_ERROR_DATA, ss_msg.str()); } } if (strcmp(tb[n].word,OOV())==0) oov_code=n; if (++n==lim) grow(); } inp.getline(buffer,MAX_WORD-1); } void dictionary::save(std::ostream& out) { out << n << "\n"; for (int i=0; ifreq-ae->freq) return be->freq-ae->freq; else return strcmp(ae->word,be->word); } dictionary::dictionary(dictionary* d,bool prune, int prunethresh) { MY_ASSERT(d!=NULL); //transfer values n=0; //total entries N=0; //total frequency load_factor=d->load_factor; //load factor lim=d->lim; //limit of entries oov_code=-1; //code od oov must be re-defined ifl=0; //increment flag=0; dubv=d->dubv; //dictionary upperbound transferred //creates a sorted copy of the table tb = new dict_entry[lim]; htb = new HASHTABLE_t((size_t) (lim/load_factor)); st = new strstack(lim * 10); //copy in the entries with frequency > threshold n=0; for (int i=0; in; i++) if (!prune || d->tb[i].freq>prunethresh){ tb[n].code=n; 
tb[n].freq=d->tb[i].freq; tb[n].word=st->push(d->tb[i].word); htb->insert((char*)&tb[n].word); if (d->oov_code==i) oov_code=n; //reassign oov_code N+=tb[n].freq; n++; } }; void dictionary::sort() { if (htb != NULL ) delete htb; htb = new HASHTABLE_t((int) (lim/load_factor)); //sort all entries according to frequency cerr << "sorting dictionary ..."; qsort(tb,n,sizeof(dict_entry),cmpdictentry); cerr << "done\n"; for (int i=0; iinsert((char*)&tb[i].word); }; } dictionary::~dictionary() { delete htb; delete st; delete [] tb; } void dictionary::stat() const { cout << "dictionary class statistics\n"; cout << "size " << n << " used memory " << (lim * sizeof(int) + htb->used() + st->used())/1024 << " Kb\n"; } void dictionary::grow() { delete htb; cerr << "+\b"; int newlim=(int) (lim*GROWTH_STEP); dict_entry *tb2=new dict_entry[newlim]; memcpy(tb2,tb,sizeof(dict_entry) * lim ); delete [] tb; tb=tb2; htb=new HASHTABLE_t((size_t) ((newlim)/load_factor)); for (int i=0; iinsert((char*)&tb[i].word); } for (int i=lim; ifind((char *)&w); if (ptr==NULL) return -1; return ptr->code; } int dictionary::encode(const char *w) { //case of strange characters if (strlen(w)==0) { cerr << "0"; w=OOV(); } dict_entry* ptr; if ((ptr=(dict_entry *)htb->find((char *)&w))!=NULL) return ptr->code; else { if (!ifl) { //do not extend dictionary if (oov_code==-1) { //did not use OOV yet cerr << "starting to use OOV words [" << w << "]\n"; tb[n].word=st->push(OOV()); htb->insert((char *)&tb[n].word); tb[n].code=n; tb[n].freq=0; oov_code=n; if (++n==lim) grow(); } return encode(OOV()); } else { //extend dictionary tb[n].word=st->push((char *)w); htb->insert((char*)&tb[n].word); tb[n].code=n; tb[n].freq=0; if (++n==lim) grow(); return n-1; } } } const char *dictionary::decode(int c) const { if (c>=0 && c < n) return tb[c].word; else { cerr << "decode: code out of boundary\n"; return OOV(); } } dictionary_iter::dictionary_iter(dictionary *dict) : m_dict(dict) { m_dict->scan(HT_INIT); } dict_entry* 
dictionary_iter::next() { return (dict_entry*) m_dict->scan(HT_CONT); } /* main(int argc,char **argv){ dictionary d(argv[1],40000); d.stat(); cout << "ROMA" << d.decode(0) << "\n"; cout << "ROMA:" << d.encode("ROMA") << "\n"; d.save(argv[2]); } */ irstlm-6.00.05/src/dictionary.h000066400000000000000000000127121263213470300163050ustar00rootroot00000000000000// $Id: dictionary.h 3679 2010-10-13 09:10:01Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #ifndef MF_DICTIONARY_H #define MF_DICTIONARY_H #include "mfstream.h" #include "htable.h" #include #include using namespace std; #define MAX_WORD 1000 #define DICTIONARY_LOAD_FACTOR 2.0 #ifndef GROWTH_STEP #define GROWTH_STEP 1.5 #endif #ifndef DICT_INITSIZE #define DICT_INITSIZE 100000 #endif //Begin of sentence symbol #ifndef BOS_ #define BOS_ "" #endif //End of sentence symbol #ifndef EOS_ #define EOS_ "" #endif //End of document symbol #ifndef BOD_ #define BOD_ "" #endif //End of document symbol #ifndef EOD_ #define EOD_ "" #endif //Out-Of-Vocabulary symbol #ifndef OOV_ #define OOV_ "" #endif typedef struct { const char *word; int code; long long freq; } dict_entry; typedef htable HASHTABLE_t; class strstack; class dictionary { strstack *st; //!< stack of strings dict_entry *tb; //!< entry table HASHTABLE_t *htb; //!< hash table int n; //!< number of entries long long N; //!< total frequency int lim; //!< limit of entries int oov_code; //!< code assigned to oov words char ifl; //!< increment flag int dubv; //!< dictionary size upper bound float load_factor; //!< dictionary loading factor char* oov_str; //!< oov string void test(int* OOVchart, int* NwTest, int curvesize, const char *filename, int listflag=0); // prepare into testOOV the OOV statistics computed on test set public: friend class dictionary_iter; dictionary* oovlex; //=0?v:oov_code); } inline int incflag() const { return ifl; } inline int incflag(int v) { return ifl=v; } int getword(fstream& inp , char* buffer) const; int isprintable(char* w) const { char buffer[MAX_WORD]; sprintf(buffer,"%s",w); return strcmp(w,buffer)==0; } inline void genoovcode() { int c=encode(OOV()); std::cerr << "OOV code is "<< c << 
std::endl; cerr << "OOV code is "<< c << std::endl; oovcode(c); } inline void genBoScode() { int c=encode(BoS()); std::cerr << "BoS code is "<< c << std::endl; } inline void genEoScode() { int c=encode(EoS()); std::cerr << "EoS code is "<< c << std::endl; } inline long long setoovrate(double oovrate) { encode(OOV()); //be sure OOV code exists long long oovfreq=(long long)(oovrate * totfreq()); std::cerr << "setting OOV rate to: " << oovrate << " -- freq= " << oovfreq << std::endl; return freq(oovcode(),oovfreq); } inline long long incfreq(int code,long long value) { N+=value; return tb[code].freq+=value; } inline long long multfreq(int code,double value) { N+=(long long)(value * tb[code].freq)-tb[code].freq; return tb[code].freq=(long long)(value * tb[code].freq); } inline long long freq(int code,long long value=-1) { if (value>=0) { N+=value-tb[code].freq; tb[code].freq=value; } return tb[code].freq; } inline long long totfreq() const { return N; } inline float set_load_factor(float value) { return load_factor=value; } void grow(); void sort(); dictionary(char *filename,int size=DICT_INITSIZE,float lf=DICTIONARY_LOAD_FACTOR); dictionary(dictionary* d, bool prune=false,int prunethresh=0); //make a copy and eventually filter out unfrequent words ~dictionary(); void generate(char *filename,bool header=false); void load(char *filename); void save(char *filename, int freqflag=0); void load(std::istream& fd); void save(std::ostream& fd); void augment(dictionary *d); int size() const { return n; } int getcode(const char *w); int encode(const char *w); const char *decode(int c) const; void stat() const; void print_curve_growth(int curvesize) const; void print_curve_oov(int curvesize, const char *filename, int listflag=0); void cleanfreq() { for (int i=0; iscan(action); } }; class dictionary_iter { public: dictionary_iter(dictionary *dict); dict_entry* next(); private: dictionary* m_dict; }; #endif 
irstlm-6.00.05/src/doc.cpp000077500000000000000000000055051263213470300152450ustar00rootroot00000000000000/****************************************************************************** IrstLM: IRST Language Model Toolkit, compile LM Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include "util.h" #include "mfstream.h" #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "doc.h" using namespace std; doc::doc(dictionary* d,char* docfname,bool use_null_word){ mfstream df(docfname,ios::in); char header[100]; df.getline(header,100); sscanf(header,"%d",&N); assert(N>0 && N < MAXDOCNUM); M=new int [N]; V=new int* [N]; int eod=d->encode(d->EoD()); int bod=d->encode(d->BoD()); ngram ng(d); int n=0; //track documents int m=0; //track document length int w=0; //track words in doc int tmp[MAXDOCLEN]; while (n> ng) if (ng.size>0){ w=*ng.wordp(1); if (w==bod){ if (use_null_word){ ng.size=1; //use as NULL word }else{ ng.size=0; //skip continue; } } if (w==eod && m>0){ M[n]=m; //length of n-th document V[n]=new int[m]; memcpy(V[n],tmp,m * sizeof(int)); m=0; n++; continue; } if (m < MAXDOCLEN) tmp[m++]=w; if (m==MAXDOCLEN) {cerr<< "warn: clipping long 
document (line " << n << " )\n";exit(1);}; } cerr << "uploaded " << n << " documents\n"; }; doc::~doc(){ cerr << "releasing document storage\n"; for (int i=0;i=0 && index < N); return M[index]; } int doc::docword( int docindex, int wordindex){ assert(wordindex>=0 && wordindex #include "util.h" #include #include "mfstream.h" #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "ngramtable.h" #include "cmd.h" using namespace std; #define YES 1 #define NO 0 void print_help(int TypeFlag=0){ std::cerr << std::endl << "dtsel - performs data selection" << std::endl; std::cerr << std::endl << "USAGE:" << std::endl << " dtsel -s= [options]" << std::endl; std::cerr << std::endl << "OPTIONS:" << std::endl; FullPrintParams(TypeFlag, 0, 1, stderr); } void usage(const char *msg = 0) { if (msg){ std::cerr << msg << std::endl; } else{ print_help(); } } double prob(ngramtable* ngt,ngram ng,int size,int cv){ MY_ASSERT(size<=ngt->maxlevel() && size<=ng.size); if (size>1){ ngram history=ng; if (ngt->get(history,size,size-1) && history.freq>cv){ double fstar=0.0; double lambda; if (ngt->get(ng,size,size)){ cv=(cv>ng.freq)?ng.freq:cv; if (ng.freq>cv){ fstar=(double)(ng.freq-cv)/(double)(history.freq -cv + history.succ); lambda=(double)history.succ/(double)(history.freq -cv + history.succ); }else //ng.freq==cv lambda=(double)(history.succ-1)/(double)(history.freq -cv + history.succ-1); } else lambda=(double)history.succ/(double)(history.freq -cv + history.succ); return fstar + lambda * prob(ngt,ng,size-1,cv); } else return prob(ngt,ng,size-1,cv); }else{ //unigram branch if (ngt->get(ng,1,1) && ng.freq>cv) return (double)(ng.freq-cv)/(ngt->totfreq()-1); else{ //cerr << "backoff to oov unigram " << ng.freq << " " << cv << "\n"; *ng.wordp(1)=ngt->dict->oovcode(); if (ngt->get(ng,1,1) && ng.freq>0) return (double)ng.freq/ngt->totfreq(); else //use an automatic estimate of Pr(oov) return (double)ngt->dict->size()/(ngt->totfreq()+ngt->dict->size()); } } 
} double computePP(ngramtable* train,ngramtable* test,double oovpenalty,double& oovrate,int cv=0){ ngram ng2(test->dict);ngram ng1(train->dict); int N=0; double H=0; oovrate=0; test->scan(ng2,INIT,test->maxlevel()); while(test->scan(ng2,CONT,test->maxlevel())) { ng1.trans(ng2); H-=log(prob(train,ng1,ng1.size,cv)); if (*ng1.wordp(1)==train->dict->oovcode()){ H-=oovpenalty; oovrate++; } N++; } oovrate/=N; return exp(H/N); } int main(int argc, char **argv) { char *indom=NULL; //indomain data: one sentence per line char *outdom=NULL; //domain data: one sentence per line char *scorefile=NULL; //score file char *evalset=NULL; //evalset to measure performance int minfreq=2; //frequency threshold for dictionary pruning (optional) int ngsz=0; // n-gram size int dub=IRSTLM_DUB_DEFAULT; //upper bound of true vocabulary int model=2; //data selection model: 1 only in-domain cross-entropy, //2 cross-entropy difference. int cv=1; //cross-validation parameter: 1 only in-domain cross-entropy, int blocksize=100000; //block-size in words int verbose=0; int useindex=0; //provided score file includes and index double convergence_treshold=0; bool help=false; DeclareParams((char*) "min-word-freq", CMDINTTYPE|CMDMSG, &minfreq, "frequency threshold for dictionary pruning, default: 2", "f", CMDINTTYPE|CMDMSG, &minfreq, "frequency threshold for dictionary pruning, default: 2", "ngram-order", CMDSUBRANGETYPE|CMDMSG, &ngsz, 1 , MAX_NGRAM, "n-gram default size, default: 0", "n", CMDSUBRANGETYPE|CMDMSG, &ngsz, 1 , MAX_NGRAM, "n-gram default size, default: 0", "in-domain-file", CMDSTRINGTYPE|CMDMSG, &indom, "indomain data file: one sentence per line", "i", CMDSTRINGTYPE|CMDMSG, &indom, "indomain data file: one sentence per line", "out-domain-file", CMDSTRINGTYPE|CMDMSG, &outdom, "domain data file: one sentence per line", "o", CMDSTRINGTYPE|CMDMSG, &outdom, "domain data file: one sentence per line", "score-file", CMDSTRINGTYPE|CMDMSG, &scorefile, "score output file", "s", CMDSTRINGTYPE|CMDMSG, 
&scorefile, "score output file", "dictionary-upper-bound", CMDINTTYPE|CMDMSG, &dub, "upper bound of true vocabulary, default: 10000000", "dub", CMDINTTYPE|CMDMSG, &dub, "upper bound of true vocabulary, default: 10000000", "model", CMDSUBRANGETYPE|CMDMSG, &model, 1 , 2, "data selection model: 1 only in-domain cross-entropy, 2 cross-entropy difference; default: 2", "m", CMDSUBRANGETYPE|CMDMSG, &model, 1 , 2, "data selection model: 1 only in-domain cross-entropy, 2 cross-entropy difference; default: 2", "cross-validation", CMDSUBRANGETYPE|CMDMSG, &cv, 1 , 3, "cross-validation parameter: 1 only in-domain cross-entropy; default: 1", "cv", CMDSUBRANGETYPE|CMDMSG, &cv, 1 , 3, "cross-validation parameter: 1 only in-domain cross-entropy; default: 1", "test", CMDSTRINGTYPE|CMDMSG, &evalset, "evaluation set file to measure performance", "t", CMDSTRINGTYPE|CMDMSG, &evalset, "evaluation set file to measure performance", "block-size", CMDINTTYPE|CMDMSG, &blocksize, "block-size in words, default: 100000", "bs", CMDINTTYPE|CMDMSG, &blocksize, "block-size in words, default: 100000", "convergence-threshold", CMDDOUBLETYPE|CMDMSG, &convergence_treshold, "convergence threshold, default: 0", "c", CMDDOUBLETYPE|CMDMSG, &convergence_treshold, "convergence threshold, default: 0", "index", CMDSUBRANGETYPE|CMDMSG, &useindex,0,1, "provided score file includes and index, default: 0", "x", CMDSUBRANGETYPE|CMDMSG, &useindex,0,1, "provided score file includes and index, default: 0", "verbose", CMDSUBRANGETYPE|CMDMSG, &verbose,0,2, "verbose level, default: 0", "v", CMDSUBRANGETYPE|CMDMSG, &verbose,0,2, "verbose level, default: 0", "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help", "h", CMDBOOLTYPE|CMDMSG, &help, "print this help", (char *)NULL ); GetParams(&argc, &argv, (char*) NULL); if (help){ usage(); exit_error(IRSTLM_NO_ERROR); } if (scorefile==NULL) { usage(); exit_error(IRSTLM_NO_ERROR); } if (!evalset && (!indom || !outdom)){ exit_error(IRSTLM_ERROR_DATA, "Must specify in-domain and 
out-domain data files"); }; //score file is always required: either as output or as input if (!scorefile){ exit_error(IRSTLM_ERROR_DATA, "Must specify score file"); }; if (!evalset && !model){ exit_error(IRSTLM_ERROR_DATA, "Must specify data selection model"); } if (evalset && (convergence_treshold<0 || convergence_treshold > 0.1)){ exit_error(IRSTLM_ERROR_DATA, "Convergence threshold must be between 0 and 0.1"); } TABLETYPE table_type=COUNT; if (!evalset){ //computed dictionary on indomain data dictionary *dict = new dictionary(indom,1000000,0); dictionary *pd=new dictionary(dict,true,minfreq); delete dict;dict=pd; //build in-domain table restricted to the given dictionary ngramtable *indngt=new ngramtable(indom,ngsz,NULL,dict,NULL,0,0,NULL,0,table_type); double indoovpenalty=-log(dub-indngt->dict->size()); ngram indng(indngt->dict); int indoovcode=indngt->dict->oovcode(); //build out-domain table restricted to the in-domain dictionary char command[1000]=""; if (useindex) sprintf(command,"cut -d \" \" -f 2- %s",outdom); else sprintf(command,"%s",outdom); ngramtable *outdngt=new ngramtable(command,ngsz,NULL,dict,NULL,0,0,NULL,0,table_type); double outdoovpenalty=-log(dub-outdngt->dict->size()); ngram outdng(outdngt->dict); int outdoovcode=outdngt->dict->oovcode(); cerr << "dict size idom: " << indngt->dict->size() << " odom: " << outdngt->dict->size() << "\n"; cerr << "oov penalty idom: " << indoovpenalty << " odom: " << outdoovpenalty << "\n"; //go through the odomain sentences int bos=dict->encode(dict->BoS()); mfstream inp(outdom,ios::in); ngram ng(dict); mfstream txt(outdom,ios::in); mfstream output(scorefile,ios::out); int linenumber=1; string line; int length=0; float deltaH=0; float deltaHoov=0; int words=0;string index; while (getline(inp,line)){ istringstream lninp(line); linenumber++; if (useindex) lninp >> index; // reset ngram at begin of sentence ng.size=1; deltaH=0;deltaHoov=0; length=0; while(lninp>>ng){ if (*ng.wordp(1)==bos) continue; length++; 
words++; if ((words % 1000000)==0) cerr << "."; if (ng.size>ngsz) ng.size=ngsz; indng.trans(ng);outdng.trans(ng); if (model==1){//compute cross-entropy deltaH-=log(prob(indngt,indng,indng.size,0)); deltaHoov-=(*indng.wordp(1)==indoovcode?indoovpenalty:0); } if (model==2){ //compute cross-entropy difference deltaH+=log(prob(outdngt,outdng,outdng.size,cv))-log(prob(indngt,indng,indng.size,0)); deltaHoov+=(*outdng.wordp(1)==outdoovcode?outdoovpenalty:0)-(*indng.wordp(1)==indoovcode?indoovpenalty:0); } } output << (deltaH + deltaHoov)/length << " " << line << "\n"; } } else{ //build in-domain LM from evaluation set ngramtable *tstngt=new ngramtable(evalset,ngsz,NULL,NULL,NULL,0,0,NULL,0,table_type); //build empty out-domain LM ngramtable *outdngt=new ngramtable(NULL,ngsz,NULL,NULL,NULL,0,0,NULL,0,table_type); //if indomain data is passed then limit comparison to its dictionary dictionary *dict = NULL; if (indom){ cerr << "dtsel: limit evaluation dict to indomain words with freq >=" << minfreq << "\n"; //computed dictionary on indomain data dict = new dictionary(indom,1000000,0); dictionary *pd=new dictionary(dict,true,minfreq); delete dict;dict=pd; outdngt->dict=dict; } dictionary* outddict=outdngt->dict; //get codes of , and UNK outddict->incflag(1); int bos=outddict->encode(outddict->BoS()); int oov=outddict->encode(outddict->OOV()); outddict->incflag(0); outddict->oovcode(oov); double oldPP=dub; double newPP=0; double oovrate=0; long totwords=0; long totlines=0; long nextstep=blocksize; double score; string index; mfstream outd(scorefile,ios::in); string line; //initialize n-gram ngram ng(outdngt->dict); for (int i=1;iincflag(1); while (getline(outd,line)){ istringstream lninp(line); //skip score and eventually the index lninp >> score; if (useindex) lninp >> index; while (lninp >> ng){ if (*ng.wordp(1) == bos) continue; if (ng.size>ngsz) ng.size=ngsz; outdngt->put(ng); totwords++; } totlines++; if (totwords>=nextstep){ //if block is complete if (!dict) 
outddict->incflag(0); newPP=computePP(outdngt,tstngt,-log(dub-outddict->size()),oovrate); if (!dict) outddict->incflag(1); cout << totwords << " " << newPP; if (verbose) cout << " " << totlines << " " << oovrate; cout << "\n"; if (convergence_treshold && (oldPP-newPP)/oldPP < convergence_treshold) return 1; oldPP=newPP; nextstep+=blocksize; } } if (!dict) outddict->incflag(0); newPP=computePP(outdngt,tstngt,-log(dub-outddict->size()),oovrate); cout << totwords << " " << newPP; if (verbose) cout << " " << totlines << " " << oovrate; } } irstlm-6.00.05/src/gzfilebuf.h000066400000000000000000000047441263213470300161230ustar00rootroot00000000000000// $Id: gzfilebuf.h 236 2009-02-03 13:25:19Z nicolabertoldi $ #ifndef _GZFILEBUF_H_ #define _GZFILEBUF_H_ #include #include #include #include #include class gzfilebuf : public std::streambuf { public: gzfilebuf(const char *filename) { _gzf = gzopen(filename, "rb"); setg (_buff+sizeof(int), // beginning of putback area _buff+sizeof(int), // read position _buff+sizeof(int)); // end position } ~gzfilebuf() { gzclose(_gzf); } protected: virtual int_type overflow (int_type /* unused parameter: c */) { std::cerr << "gzfilebuf::overflow is not implemented" << std::endl;; throw; } // write multiple characters virtual std::streamsize xsputn (const char* /* unused parameter: s */, std::streamsize /* unused parameter: num */) { std::cerr << "gzfilebuf::xsputn is not implemented" << std::endl;; throw; } virtual std::streampos seekpos ( std::streampos /* unused parameter: sp */, std::ios_base::openmode /* unused parameter: which */= std::ios_base::in | std::ios_base::out ) { std::cerr << "gzfilebuf::seekpos is not implemented" << std::endl;; throw; } //read one character virtual int_type underflow () { // is read position before end of _buff? 
if (gptr() < egptr()) { return traits_type::to_int_type(*gptr()); } /* process size of putback area * - use number of characters read * - but at most four */ unsigned int numPutback = gptr() - eback(); if (numPutback > sizeof(int)) { numPutback = sizeof(int); } /* copy up to four characters previously read into * the putback _buff (area of first four characters) */ std::memmove (_buff+(sizeof(int)-numPutback), gptr()-numPutback, numPutback); // read new characters int num = gzread(_gzf, _buff+sizeof(int), _buffsize-sizeof(int)); if (num <= 0) { // ERROR or EOF return EOF; } // reset _buff pointers setg (_buff+(sizeof(int)-numPutback), // beginning of putback area _buff+sizeof(int), // read position _buff+sizeof(int)+num); // end of buffer // return next character return traits_type::to_int_type(*gptr()); } std::streamsize xsgetn (char* s, std::streamsize num) { return gzread(_gzf,s,num); } private: gzFile _gzf; static const unsigned int _buffsize = 1024; char _buff[_buffsize]; }; #endif irstlm-6.00.05/src/htable.cpp000066400000000000000000000045401263213470300157320ustar00rootroot00000000000000// $Id: htable.cpp 3680 2010-10-13 09:10:21Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include "mempool.h" #include "htable.h" #include "util.h" using namespace std; template <> void htable::set_keylen(int kl) { keylen=kl/sizeof(int); return; } template <> void htable::set_keylen(int kl) { keylen=kl; return; } template <> address htable::Hash(int* key) { address h; register int i; //Thomas Wang's 32 bit Mix Function for (i=0,h=0; i> 10); h += (h << 3); h ^= (h >> 6); h += ~(h << 11); h ^= (h >> 16); }; return h; } template <> address htable::Hash(char* key) { //actually char* key is a char**, i.e. a pointer to a char* char *Key = *(char**)key; int length=strlen(Key); register address h=0; register int i; for (i=0,h=0; i int htable::Comp(int *key1, int *key2) const { MY_ASSERT(key1 && key2); register int i; for (i=0; i int htable::Comp(char *key1, char *key2) const { MY_ASSERT(key1 && key2); char *Key1 = *(char**)key1; char *Key2 = *(char**)key2; MY_ASSERT(Key1 && Key2); return (strcmp(Key1,Key2)); } irstlm-6.00.05/src/htable.h000066400000000000000000000124211263213470300153740ustar00rootroot00000000000000// $Id: htable.h 3680 2010-10-13 09:10:21Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. 
This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #ifndef MF_HTABLE_H #define MF_HTABLE_H using namespace std; #include #include #include #include "mempool.h" #define Prime1 37 #define Prime2 1048583 #define BlockSize 100 typedef unsigned int address; // Fast arithmetic, relying on powers of 2, // and on pre-processor concatenation property //use as template struct entry { T key; entry* next; // secret from user }; typedef enum {HT_FIND, //!< search: find an entry HT_ENTER, //!< search: enter an entry HT_INIT, //!< scan: start scan HT_CONT //!< scan: continue scan } HT_ACTION; //!T is the type of the key and should be (int*) or (char*) template class htable { int size; //!< table size int keylen; //!< key length entry **table; //!< hash table int scan_i; //!< scan support entry *scan_p; //!< scan support // statistics long keys; //!< # of entries long accesses; //!< # of accesses long collisions; //!< # of collisions mempool *memory; //!< memory pool public: //! Creates an hash table htable(int n,int kl=0); //! Destroys an and hash table ~htable(); void set_keylen(int kl); //! Computes the hash function address Hash(const T key); //! Compares the keys of two entries int Comp(const T Key1, const T Key2) const; //! Searches for an item T find(T item); T insert(T item); //! Scans the content T scan(HT_ACTION action); //! Prints statistics void stat() const ; //! Print a map of memory use void map(std::ostream& co=std::cout, int cols=80); //! 
Returns amount of used memory int used() const { return size * sizeof(entry **) + memory->used(); } }; template htable::htable(int n,int kl) { memory=new mempool( sizeof(entry) , BlockSize ); table = new entry* [ size=n ]; memset(table,0,sizeof(entry *) * n ); set_keylen(kl); keys = accesses = collisions = 0; } template htable::~htable() { delete []table; delete memory; } template T htable::find(T key) { // std::cerr << "T htable::find(T key) size:" << size << std::endl; address h; entry *q,**p; accesses++; h = Hash(key); // std::cerr << "T htable::find(T key) h:" << h << std::endl; p=&table[h%size]; q=*p; /* Follow collision chain */ while (q != NULL && Comp(q->key,key)) { p = &(q->next); q = q->next; collisions++; } if (q != NULL) return q->key; /* found */ return NULL; } template T htable::insert(T key) { address h; entry *q,**p; accesses++; h = Hash(key); p=&table[h%size]; q=*p; /* Follow collision chain */ while (q != NULL && Comp(q->key,key)) { p = &(q->next); q = q->next; collisions++; } if (q != NULL) return q->key; /* found */ /* not found */ if ((q = (entry *)memory->allocate()) == NULL) /* no room */ return NULL; /* link into chain */ *p = q; /* Initialize new element */ q->key = key; q->next = NULL; keys++; return q->key; } template T htable::scan(HT_ACTION action) { if (action == HT_INIT) { scan_i=0; scan_p=table[0]; return NULL; } // if scan_p==NULL go to the first non null pointer while ((scan_p==NULL) && (++scan_ikey; scan_p=(entry *)scan_p->next; return k; }; return NULL; } template void htable::map(ostream& co,int cols) { entry *p; char* img=new char[cols+1]; img[cols]='\0'; memset(img,'.',cols); co << "htable memory map: . 
(0 items), - (<5), # (>5)\n"; for (int i=0; i *)p->next; }; if (i && (i % cols)==0) { co << img << "\n"; memset(img,'.',cols); } if (n>0) img[i % cols]=n<=5?'-':'#'; } img[size % cols]='\0'; co << img << "\n"; delete []img; } template void htable::stat() const { cerr << "htable class statistics\n"; cerr << "size " << size << " keys " << keys << " acc " << accesses << " coll " << collisions << " used memory " << used()/1024 << "Kb\n"; }; #endif irstlm-6.00.05/src/index.h000066400000000000000000000004011263213470300152370ustar00rootroot00000000000000// $Id: index.h 236 2009-02-03 13:25:19Z nicolabertoldi $ #pragma once #ifdef WIN32 inline const char *index(const char *str, char search) { size_t i=0; while (i< strlen(str) ) { if (str[i]==search) return &str[i]; } return NULL; } #endif irstlm-6.00.05/src/interplm.cpp000066400000000000000000000277271263213470300163410ustar00rootroot00000000000000/****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include "util.h" #include "mfstream.h" #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "mempool.h" #include "ngramcache.h" #include "ngramtable.h" #include "normcache.h" #include "interplm.h" using namespace std; void interplm::trainunigr() { int oov=dict->getcode(dict->OOV()); cerr << "oovcode: " << oov << "\n"; if (oov>=0 && dict->freq(oov)>= dict->size()) { cerr << "Using current estimate of OOV frequency " << dict->freq(oov)<< "\n"; } else { oov=dict->encode(dict->OOV()); dict->oovcode(oov); //choose unigram smoothing method according to //sample size //if (dict->totfreq()>100){ //witten bell //cerr << "select unigram smoothing: " << dict->totfreq() << "\n"; if (unismooth) { dict->incfreq(oov,dict->size()-1); cerr << "Witten-Bell estimate of OOV freq:"<< (double)(dict->size()-1)/dict->totfreq() << "\n"; } else { if (dict->dub()) { cerr << "DUB estimate of OOV size\n"; dict->incfreq(oov,dict->dub()-dict->size()+1); } else { cerr << "1 = estimate of OOV size\n"; dict->incfreq(oov,1); } } } } double interplm::unigrWB(ngram ng) { return ((double)(dict->freq(*ng.wordp(1))+epsilon))/ ((double)dict->totfreq() + (double) dict->size() * epsilon); } interplm::interplm(char *ngtfile,int depth,TABLETYPE tabtype): ngramtable(ngtfile,depth,NULL,NULL,NULL,0,0,NULL,0,tabtype) { if (maxlevel() int BoS=dict->encode(dict->BoS()); if (BoS != dict->oovcode()) { cerr << "setting counter of Begin of Sentence to 1 ..." 
<< "\n"; dict->freq(BoS,1); cerr << "start_sent: " << (char *)dict->decode(BoS) << " " << dict->freq(BoS) << "\n"; } }; interplm::~interplm() { delete_prune_ngram(); } void interplm::delete_prune_ngram() { delete []prune_freq_threshold; } void interplm::init_prune_ngram(int sz) { prune_freq_threshold = new int[sz+1]; for (int i=0; i<=sz; ++i) { prune_freq_threshold[i] = 0; } } void interplm::print_prune_ngram() { for (int i=1; i<=lms; ++i) VERBOSE(0,"level " << i << " prune_freq_threshold[" << i << "]=" << prune_freq_threshold[i] << "\n"); } void interplm::set_prune_ngram(char* values) { char *s=strdup(values); char *tk; prune_freq_threshold[0]=0; int i=1; tk=strtok(s, ","); while (tk) { if (i<=lms) { prune_freq_threshold[i]=atoi(tk); VERBOSE(2,"prune_freq_threshold[" << i << "]=" << prune_freq_threshold[i] << "\n"); tk=strtok(NULL, ","); } else { VERBOSE(2,"too many pruning frequency threshold values; kept the first values and skipped the others\n"); break; } ++i; } for (int i=1; i<=lms; ++i) { if (prune_freq_threshold[i] 0) { prune_freq_threshold[lev] = val; } else { VERBOSE(2,"Value (" << val << ") must be larger than 0\n"); } } else { VERBOSE(2,"lev (" << lev << ") is larger than the lm order (" << lms<< ")\n"); } } void interplm::gensuccstat() { ngram hg(dict); int s1,s2; cerr << "Generating successor statistics\n"; for (int l=2; l<=lms; l++) { cerr << "level " << l << "\n"; scan(hg,INIT,l-1); while(scan(hg,CONT,l-1)) { s1=s2=0; ngram ng=hg; ng.pushc(0); succscan(hg,ng,INIT,l); while(succscan(hg,ng,CONT,l)) { if (corrcounts && l=1; l--) { cerr << "level " << l << "\n"; ngram ng(dict); int count=0; //now update counts scan(ng,INIT,l+1); while(scan(ng,CONT,l+1)) { ngram ng2=ng; ng2.size--; if (get(ng2,ng2.size,ng2.size)) { if (!ng2.containsWord(dict->BoS(),1)) //counts number of different n-grams setfreq(ng2.link,ng2.pinfo,1+getfreq(ng2.link,ng2.pinfo,1),1); else // use correct count for n-gram " w .. .. 
" //setfreq(ng2.link,ng2.pinfo,ng2.freq+getfreq(ng2.link,ng2.pinfo,1),1); setfreq(ng2.link,ng2.pinfo,ng2.freq,1); } else { MY_ASSERT(lms==l+1); cerr << "cannot find2 " << ng2 << "count " << count << "\n"; cerr << "inserting ngram and starting from scratch\n"; ng2.pushw(dict->BoS()); ng2.freq=100; put(ng2); cerr << "reset all counts at last level\n"; scan(ng2,INIT,lms-1); while(scan(ng2,CONT,lms-1)) { setfreq(ng2.link,ng2.pinfo,0,1); } gencorrcounts(); return; } } } cerr << "Updating history counts\n"; for (int l=lms-2; l>=1; l--) { cerr << "level " << l << "\n"; cerr << "reset counts\n"; ngram ng(dict); scan(ng,INIT,l); while(scan(ng,CONT,l)) { freq(ng.link,ng.pinfo,0); } scan(ng,INIT,l+1); while(scan(ng,CONT,l+1)) { ngram ng2=ng; get(ng2,l+1,l); freq(ng2.link,ng2.pinfo,freq(ng2.link,ng2.pinfo)+getfreq(ng.link,ng.pinfo,1)); } } cerr << "Adding unigram of OOV word if missing\n"; ngram ng(dict,maxlevel()); for (int i=1; i<=maxlevel(); i++) *ng.wordp(i)=dict->oovcode(); if (!get(ng,lms,1)) { // oov is missing in the ngram-table // f(oov) = dictionary size (Witten Bell) (excluding oov itself) ng.freq=dict->size()-1; cerr << "adding oov unigram |" << ng << "| with frequency " << ng.freq << "\n"; put(ng); get(ng,lms,1); setfreq(ng.link,ng.pinfo,ng.freq,1); } cerr << "Replacing unigram of BoS \n"; if (dict->encode(dict->BoS()) != dict->oovcode()) { ngram ng(dict,1); *ng.wordp(1)=dict->encode(dict->BoS()); if (get(ng,1,1)) { ng.freq=1; //putting Pr()=0 would create problems!! 
setfreq(ng.link,ng.pinfo,ng.freq,1); } } cerr << "compute unigram totfreq \n"; int totf=0; scan(ng,INIT,1); while(scan(ng,CONT,1)) { totf+=getfreq(ng.link,ng.pinfo,1); } btotfreq(totf); cerr << "compute unigram btotfreq(totf):" << btotfreq() << "\n"; corrcounts=1; } double interplm::zerofreq(int lev) { cerr << "Computing lambda: ..."; ngram ng(dict); double N=0,N1=0; scan(ng,INIT,lev); while(scan(ng,CONT,lev)) { if ((lev==1) && (*ng.wordp(1)==dict->oovcode())) continue; N+=ng.freq; if (ng.freq==1) N1++; } cerr << (double)(N1/N) << "\n"; return N1/N; } void interplm::test(char* filename,int size,bool backoff,bool checkpr,char* outpr) { if (size>lmsize()) { exit_error(IRSTLM_ERROR_DATA, "interplm::test: wrong ngram size"); } mfstream inp(filename,ios::in ); char header[100]; inp >> header; inp.close(); if (strncmp(header,"nGrAm",5)==0 || strncmp(header,"NgRaM",5)==0) { ngramtable ngt(filename,size,NULL,NULL,NULL,0,0,NULL,0,COUNT); test_ngt(ngt,size,backoff,checkpr); } else test_txt(filename,size,backoff,checkpr,outpr); } void interplm::test_txt(char* filename,int size,bool /* unused parameter: backoff */,bool checkpr,char* outpr) { cerr << "test text " << filename << " "; mfstream inp(filename,ios::in ); ngram ng(dict); double n=0,lp=0,pr; double oov=0; cout.precision(10); mfstream outp(outpr?outpr:"/dev/null",ios::out ); if (checkpr) cerr << "checking probabilities\n"; while(inp >> ng) if (ng.size>=1) { ng.size=ng.size>size?size:ng.size; if (dict->encode(dict->BoS()) != dict->oovcode()) { if (*ng.wordp(1) == dict->encode(dict->BoS())) { ng.size=1; //reset n-grams starting with BoS continue; } } pr=prob(ng,ng.size); if (outpr) outp << ng << "[" << ng.size << "-gram]" << " " << pr << " " << log(pr)/log(10.0) << std::endl; lp-=log(pr); n++; if (((int) n % 10000)==0) cerr << "."; if (*ng.wordp(1) == dict->oovcode()) oov++; if (checkpr) { double totp=0.0; int oldw=*ng.wordp(1); for (int c=0; csize(); c++) { *ng.wordp(1)=c; totp+=prob(ng,ng.size); } *ng.wordp(1)=oldw; if 
( totp < (1.0 - 1e-5) || totp > (1.0 + 1e-5)) cout << ng << " " << pr << " [t="<< totp << "] ***\n"; } } if (oov && dict->dub()>obswrd()) lp += oov * log(dict->dub() - obswrd()); cout << "n=" << (int) n << " LP=" << (double) lp << " PP=" << exp(lp/n) << " OVVRate=" << (oov)/n //<< " OVVLEXRate=" << (oov-in_oov_list)/n // << " OOVPP=" << exp((lp+oovlp)/n) << "\n"; outp.close(); inp.close(); } void interplm::test_ngt(ngramtable& ngt,int sz,bool /* unused parameter: backoff */,bool checkpr) { double pr; int n=0,c=0; double lp=0; double oov=0; cout.precision(10); if (sz > ngt.maxlevel()) { exit_error(IRSTLM_ERROR_DATA, "interplm::test_ngt: ngramtable has uncompatible size"); } if (checkpr) cerr << "checking probabilities\n"; cerr << "Computing PP:"; ngram ng(dict); ngram ng2(ngt.dict); ngt.scan(ng2,INIT,sz); while(ngt.scan(ng2,CONT,sz)) { ng.trans(ng2); if (dict->encode(dict->BoS()) != dict->oovcode()) { if (*ng.wordp(1) == dict->encode(dict->BoS())) { ng.size=1; //reset n-grams starting with BoS continue; } } n+=ng.freq; pr=prob(ng,sz); lp-=(ng.freq * log(pr)); if (*ng.wordp(1) == dict->oovcode()) oov+=ng.freq; if (checkpr) { double totp=0.0; for (c=0; csize(); c++) { *ng.wordp(1)=c; totp+=prob(ng,sz); } if ( totp < (1.0 - 1e-5) || totp > (1.0 + 1e-5)) cout << ng << " " << pr << " [t="<< totp << "] ***\n"; } if ((++c % 100000)==0) cerr << "."; } //double oovlp=oov * log((double)(dict->dub() - obswrd())); if (oov && dict->dub()>obswrd()) lp+=oov * log((dict->dub() - obswrd())); cout << "n=" << (int) n << " LP=" << (double) lp << " PP=" << exp(lp/n) << " OVVRate=" << (oov)/n //<< " OVVLEXRate=" << (oov-in_oov_list)/n // << " OOVPP=" << exp((lp+oovlp)/n) << "\n"; cout.flush(); } /* main(int argc, char** argv){ dictionary d(argv[1]); shiftbeta ilm(&d,argv[2],3); ngramtable test(&d,argv[2],3); ilm.train(); cerr << "PP " << ilm.test(test) << "\n"; ilm.savebin("newlm.lm",3); } */ 
irstlm-6.00.05/src/interplm.h000066400000000000000000000073701263213470300157760ustar00rootroot00000000000000/****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ // Basic Interpolated LM class #ifndef MF_INTERPLM_H #define MF_INTERPLM_H #define SHIFT_BETA 1 #define SHIFT_ONE 2 #define SHIFT_ZERO 3 #define LINEAR_STB 4 #define LINEAR_WB 5 #define LINEAR_GT 6 #define MIXTURE 7 #define MOD_SHIFT_BETA 8 #define IMPROVED_SHIFT_BETA 9 #define KNESER_NEY 10 #define IMPROVED_KNESER_NEY 11 class interplm:public ngramtable { int lms; double epsilon; //Bayes smoothing int unismooth; //0 Bayes, 1 Witten Bell int prune_singletons; int prune_top_singletons; int* prune_freq_threshold; public: int backoff; //0 interpolation, 1 Back-off interplm(char* ngtfile,int depth=0,TABLETYPE tt=FULL); virtual ~interplm(); int prunesingletons(int flag=-1) { return (flag==-1?prune_singletons:prune_singletons=flag); } int prunetopsingletons(int flag=-1) { return (flag==-1?prune_top_singletons:prune_top_singletons=flag); } inline bool prune_ngram(int lev, int freq) { return (freq > 
prune_freq_threshold[lev])?false:true; } void init_prune_ngram(int sz); void delete_prune_ngram(); void set_prune_ngram(int lev, int val); void set_prune_ngram(char* values); void print_prune_ngram(); void gencorrcounts(); void gensuccstat(); virtual int dub() { return dict->dub(); } virtual int dub(int value) { return dict->dub(value); } int setusmooth(int v=0) { return unismooth=v; } double setepsilon(double v=1.0) { return epsilon=v; } ngramtable *unitbl; void trainunigr(); double unigrWB(ngram ng); virtual double unigr(ngram ng){ return unigrWB(ng); }; double zerofreq(int lev); inline int lmsize() const { return lms; } inline int obswrd() const { return dict->size(); } virtual int train() { return 0; } virtual void adapt(char* /* unused parameter: ngtfile */, double /* unused parameter: w */) {} virtual double prob(ngram /* unused parameter: ng */,int /* unused parameter: size */) { return 0.0; } virtual double boprob(ngram /* unused parameter: ng */,int /* unused parameter: size */) { return 0.0; } void test_ngt(ngramtable& ngt,int sz=0,bool backoff=false,bool checkpr=false); void test_txt(char *filename,int sz=0,bool backoff=false,bool checkpr=false,char* outpr=NULL); void test(char* filename,int sz,bool backoff=false,bool checkpr=false,char* outpr=NULL); virtual int discount(ngram /* unused parameter: ng */,int /* unused parameter: size */,double& /* unused parameter: fstar */ ,double& /* unused parameter: lambda */,int /* unused parameter: cv*/=0) { return 0; } virtual int savebin(char* /* unused parameter: filename */,int /* unused parameter: lmsize=2 */) { return 0; } virtual int netsize() { return 0; } void lmstat(int level) { stat(level); } }; #endif irstlm-6.00.05/src/interpolate-lm.cpp000066400000000000000000000446561263213470300174430ustar00rootroot00000000000000/****************************************************************************** IrstLM: IRST Language Model Toolkit, compile LM Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy 
This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include #include #include #include "cmd.h" #include "util.h" #include "math.h" #include "lmContainer.h" #define MAX_N 100 /********************************/ using namespace std; using namespace irstlm; inline void error(const char* message) { std::cerr << message << "\n"; throw std::runtime_error(message); } lmContainer* load_lm(std::string file,int requiredMaxlev,int dub,int memmap, float nlf, float dlf); void print_help(int TypeFlag=0){ std::cerr << std::endl << "interpolate-lm - interpolates language models" << std::endl; std::cerr << std::endl << "USAGE:" << std::endl; std::cerr << " interpolate-lm [options] [lm-list-file.out]" << std::endl; std::cerr << std::endl << "DESCRIPTION:" << std::endl; std::cerr << " interpolate-lm reads a LM list file including interpolation weights " << std::endl; std::cerr << " with the format: N\\n w1 lm1 \\n w2 lm2 ...\\n wN lmN\n" << std::endl; std::cerr << " It estimates new weights on a development text, " << std::endl; std::cerr << " computes the perplexity on an evaluation text, " << std::endl; std::cerr << " computes probabilities of n-grams read from stdin." 
<< std::endl; std::cerr << " It reads LMs in ARPA and IRSTLM binary format." << std::endl; std::cerr << std::endl << "OPTIONS:" << std::endl; FullPrintParams(TypeFlag, 0, 1, stderr); } void usage(const char *msg = 0) { if (msg){ std::cerr << msg << std::endl; } else{ print_help(); } } int main(int argc, char **argv) { char *slearn = NULL; char *seval = NULL; bool learn=false; bool score=false; bool sent_PP_flag = false; int order = 0; int debug = 0; int memmap = 0; int requiredMaxlev = IRSTLM_REQUIREDMAXLEV_DEFAULT; int dub = IRSTLM_DUB_DEFAULT; float ngramcache_load_factor = 0.0; float dictionary_load_factor = 0.0; bool help=false; std::vector files; DeclareParams((char*) "learn", CMDSTRINGTYPE|CMDMSG, &slearn, "learn optimal interpolation for text-file; default is false", "l", CMDSTRINGTYPE|CMDMSG, &slearn, "learn optimal interpolation for text-file; default is false", "order", CMDINTTYPE|CMDMSG, &order, "order of n-grams used in --learn (optional)", "o", CMDINTTYPE|CMDMSG, &order, "order of n-grams used in --learn (optional)", "eval", CMDSTRINGTYPE|CMDMSG, &seval, "computes perplexity of the specified text file", "e", CMDSTRINGTYPE|CMDMSG, &seval, "computes perplexity of the specified text file", "DictionaryUpperBound", CMDINTTYPE|CMDMSG, &dub, "dictionary upperbound to compute OOV word penalty: default 10^7", "dub", CMDINTTYPE|CMDMSG, &dub, "dictionary upperbound to compute OOV word penalty: default 10^7", "score", CMDBOOLTYPE|CMDMSG, &score, "computes log-prob scores of n-grams from standard input", "s", CMDBOOLTYPE|CMDMSG, &score, "computes log-prob scores of n-grams from standard input", "debug", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0", "d", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0", "memmap", CMDINTTYPE|CMDMSG, &memmap, "uses memory map to read a binary LM", "mm", CMDINTTYPE|CMDMSG, &memmap, "uses memory map to read a binary LM", "sentence", CMDBOOLTYPE|CMDMSG, &sent_PP_flag, "computes 
perplexity at sentence level (identified through the end symbol)", "dict_load_factor", CMDFLOATTYPE|CMDMSG, &dictionary_load_factor, "sets the load factor for ngram cache; it should be a positive real value; default is 0", "ngram_load_factor", CMDFLOATTYPE|CMDMSG, &ngramcache_load_factor, "sets the load factor for ngram cache; it should be a positive real value; default is false", "level", CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken", "lev", CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken", "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help", "h", CMDBOOLTYPE|CMDMSG, &help, "print this help", (char *)NULL ); if (argc == 1){ usage(); exit_error(IRSTLM_NO_ERROR); } for(int i=1; i < argc; i++) { if(argv[i][0] != '-') files.push_back(argv[i]); } GetParams(&argc, &argv, (char*) NULL); if (help){ usage(); exit_error(IRSTLM_NO_ERROR); } if (files.size() > 2) { usage(); exit_error(IRSTLM_ERROR_DATA,"Too many arguments"); } if (files.size() < 1) { usage(); exit_error(IRSTLM_ERROR_DATA,"Must pecify a LM list file to read from"); } std::string infile = files[0]; std::string outfile=""; if (files.size() == 1) { outfile=infile; //remove path information std::string::size_type p = outfile.rfind('/'); if (p != std::string::npos && ((p+1) < outfile.size())) outfile.erase(0,p+1); outfile+=".out"; } else outfile = files[1]; std::cerr << "inpfile: " << infile << std::endl; learn = ((slearn != NULL)? 
true : false); if (learn) std::cerr << "outfile: " << outfile << std::endl; if (score) std::cerr << "interactive: " << score << std::endl; if (memmap) std::cerr << "memory mapping: " << memmap << std::endl; std::cerr << "loading up to the LM level " << requiredMaxlev << " (if any)" << std::endl; std::cerr << "order: " << order << std::endl; if (requiredMaxlev > 0) std::cerr << "loading up to the LM level " << requiredMaxlev << " (if any)" << std::endl; std::cerr << "dub: " << dub<< std::endl; lmContainer *lmt[MAX_N], *start_lmt[MAX_N]; //interpolated language models std::string lmf[MAX_N]; //lm filenames float w[MAX_N]; //interpolation weights int N; //Loading Language Models` std::cerr << "Reading " << infile << "..." << std::endl; std::fstream inptxt(infile.c_str(),std::ios::in); //std::string line; char line[BUFSIZ]; const char* words[3]; int tokenN; inptxt.getline(line,BUFSIZ,'\n'); tokenN = parseWords(line,words,3); if (tokenN != 2 || ((strcmp(words[0],"LMINTERPOLATION") != 0) && (strcmp(words[0],"lminterpolation")!=0))) error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1\nweight_of_LM_2 filename_of_LM_2"); N=atoi(words[1]); std::cerr << "Number of LMs: " << N << "..." 
<< std::endl; if(N > MAX_N) { exit_error(IRSTLM_ERROR_DATA,"Can't interpolate more than MAX_N language models"); } for (int i=0; i lmt[i]->maxlevel())?maxorder:lmt[i]->maxlevel(); } if (order <= 0) { order = maxorder; std::cerr << "order is not set or wrongly set to a non positive value; reset to the maximum order of LMs: " << order << std::endl; } else if (order > maxorder) { order = maxorder; std::cerr << "order is too high; reset to the maximum order of LMs" << order << std::endl; } //Learning mixture weights if (learn) { std::vector *p = new std::vector[N]; //LM probabilities float c[N]; //expected counts float den,norm; //inner denominator, normalization term float variation=1.0; // global variation between new old params dictionary* dict=new dictionary(slearn,1000000,dictionary_load_factor); ngram ng(dict); int bos=ng.dict->encode(ng.dict->BoS()); std::ifstream dev(slearn,std::ios::in); for(;;) { std::string line; getline(dev, line); if(dev.eof()) break; if(dev.fail()) { exit_error(IRSTLM_ERROR_IO,"Problem reading input file"); } std::istringstream lstream(line); if(line.substr(0, 29) == "###interpolate-lm:replace-lm ") { std::string token, newlm; int id; lstream >> token >> id >> newlm; if(id <= 0 || id > N) { std::cerr << "LM id out of range." 
<< std::endl; delete[] p; return 1; } id--; // count from 0 now if(lmt[id] != start_lmt[id]) delete lmt[id]; lmt[id] = load_lm(newlm,requiredMaxlev,dub,memmap,ngramcache_load_factor,dictionary_load_factor); continue; } while(lstream >> ng) { // reset ngram at begin of sentence if (*ng.wordp(1)==bos) { ng.size=1; continue; } if (order > 0 && ng.size > order) ng.size=order; for (int i=0; igetDict()); ong.trans(ng); double logpr; logpr = lmt[i]->clprob(ong); //LM log-prob (using caches if available) p[i].push_back(pow(10.0,logpr)); } } for (int i=0; icheck_caches_levels(); } dev.close(); while( variation > 0.01 ) { for (int i=0; ic[i]?(w[i]-c[i]):(c[i]-w[i])); w[i]=c[i]; //update weights } std::cerr << "Variation " << variation << std::endl; } //Saving results std::cerr << "Saving in " << outfile << "..." << std::endl; std::fstream outtxt(outfile.c_str(),std::ios::out); outtxt << "LMINTERPOLATION " << N << "\n"; for (int i=0; iincflag(1); ngram ng(dict); int bos=ng.dict->encode(ng.dict->BoS()); int eos=ng.dict->encode(ng.dict->EoS()); std::fstream inptxt(seval,std::ios::in); for(;;) { std::string line; getline(inptxt, line); if(inptxt.eof()) break; if(inptxt.fail()) { std::cerr << "Problem reading input file " << seval << std::endl; return 1; } std::istringstream lstream(line); if(line.substr(0, 26) == "###interpolate-lm:weights ") { std::string token; lstream >> token; for(int i = 0; i < N; i++) { if(lstream.eof()) { std::cerr << "Not enough weights!" << std::endl; return 1; } lstream >> w[i]; } continue; } if(line.substr(0, 29) == "###interpolate-lm:replace-lm ") { std::string token, newlm; int id; lstream >> token >> id >> newlm; if(id <= 0 || id > N) { std::cerr << "LM id out of range." 
<< std::endl; return 1; } id--; // count from 0 now delete lmt[id]; lmt[id] = load_lm(newlm,requiredMaxlev,dub,memmap,ngramcache_load_factor,dictionary_load_factor); continue; } double bow; int bol=0; ngram_state_t msidx; char *msp; unsigned int statesize; while(lstream >> ng) { // reset ngram at begin of sentence if (*ng.wordp(1)==bos) { ng.size=1; continue; } if (order > 0 && ng.size > order) ng.size=order; if (ng.size>=1) { int minbol=MAX_NGRAM; //minimum backoff level of the mixture bool OOV_all_flag=true; //OOV flag wrt all LM[i] bool OOV_any_flag=false; //OOV flag wrt any LM[i] float logpr; Pr = 0.0; for (i=0; igetDict()); ong.trans(ng); // logpr = lmt[i]->clprob(ong,&bow,&bol,&msp,&statesize); //actual prob of the interpolation logpr = lmt[i]->clprob(ong,&bow,&bol,&msidx,&msp,&statesize); //actual prob of the interpolation //logpr = lmt[i]->clprob(ong,&bow,&bol); //LM log-prob Pr+=w[i] * pow(10.0,logpr); //actual prob of the interpolation if (bol < minbol) minbol=bol; //backoff of LM[i] if (*ong.wordp(1) != lmt[i]->getDict()->oovcode()) OOV_all_flag=false; //OOV wrt LM[i] if (*ong.wordp(1) == lmt[i]->getDict()->oovcode()) OOV_any_flag=true; //OOV wrt LM[i] } lPr=log(Pr)/M_LN10; logPr+=lPr; sent_logPr+=lPr; if (debug==1) { std::cout << ng.dict->decode(*ng.wordp(1)) << " [" << ng.size-minbol << "]" << " "; if (*ng.wordp(1)==eos) std::cout << std::endl; } if (debug==2) std::cout << ng << " [" << ng.size-minbol << "-gram]" << " " << log(Pr) << std::endl; if (minbol) { Nbo++; //all LMs have back-offed by at least one sent_Nbo++; } if (OOV_all_flag) { Noov_all++; //word is OOV wrt all LM sent_Noov_all++; } if (OOV_any_flag) { Noov_any++; //word is OOV wrt any LM sent_Noov_any++; } Nw++; sent_Nw++; if (*ng.wordp(1)==eos && sent_PP_flag) { sent_PP=exp((-sent_logPr * log(10.0)) /sent_Nw); std::cout << "%% sent_Nw=" << sent_Nw << " sent_PP=" << sent_PP << " sent_Nbo=" << sent_Nbo << " sent_Noov=" << sent_Noov_all << " sent_OOV=" << (float)sent_Noov_all/sent_Nw * 100.0 
<< "%" << " sent_Noov_any=" << sent_Noov_any << " sent_OOV_any=" << (float)sent_Noov_any/sent_Nw * 100.0 << "%" << std::endl; //reset statistics for sentence based Perplexity sent_Nw=sent_Noov_any=sent_Noov_all=sent_Nbo=0; sent_logPr=0.0; } if ((Nw % 10000)==0) std::cerr << "."; } } } PP=exp((-logPr * M_LN10) /Nw); std::cout << "%% Nw=" << Nw << " PP=" << PP << " Nbo=" << Nbo << " Noov=" << Noov_all << " OOV=" << (float)Noov_all/Nw * 100.0 << "%" << " Noov_any=" << Noov_any << " OOV_any=" << (float)Noov_any/Nw * 100.0 << "%" << std::endl; }; if (score == true) { dictionary* dict=new dictionary(NULL,1000000,dictionary_load_factor); dict->incflag(1); // start generating the dictionary; ngram ng(dict); int bos=ng.dict->encode(ng.dict->BoS()); double Pr,logpr; double bow; int bol=0, maxbol=0; unsigned int maxstatesize, statesize; int i,n=0; std::cout << "> "; while(std::cin >> ng) { // reset ngram at begin of sentence if (*ng.wordp(1)==bos) { ng.size=1; continue; } if (ng.size>=maxorder) { if (order > 0 && ng.size > order) ng.size=order; n++; maxstatesize=0; maxbol=0; Pr=0.0; for (i=0; igetDict()); ong.trans(ng); // logpr = lmt[i]->clprob(ong,&bow,&bol,NULL,&statesize); //LM log-prob (using caches if available) logpr = lmt[i]->clprob(ong,&bow,&bol,NULL,NULL,&statesize); //LM log-prob (using caches if available) Pr+=w[i] * pow(10.0,logpr); //actual prob of the interpolation std::cout << "lm " << i << ":" << " logpr: " << logpr << " weight: " << w[i] << std::endl; if (maxbolcheck_caches_levels(); } } else { std::cout << ng << " p= NULL" << std::endl; } std::cout << "> "; } } for (int i=0; isetMaxLoadedLevel(requiredMaxlev); lmt->load(file,memmap); if (dub) lmt->setlogOOVpenalty((int)dub); //use caches to save time (only if PS_CACHE_ENABLE is defined through compilation flags) lmt->init_caches(lmt->maxlevel()); return lmt; } 
irstlm-6.00.05/src/linearlm.cpp000066400000000000000000000151201263213470300162720ustar00rootroot00000000000000/****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include "mfstream.h" #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "mempool.h" #include "ngramtable.h" #include "ngramcache.h" #include "normcache.h" #include "interplm.h" #include "mdiadapt.h" #include "linearlm.h" #include "util.h" namespace irstlm { // //Linear interpolated language model: Witten & Bell discounting scheme // linearwb::linearwb(char* ngtfile,int depth,int prunefreq,TABLETYPE tt): mdiadaptlm(ngtfile,depth,tt) { prunethresh=prunefreq; cerr << "PruneThresh: " << prunethresh << "\n"; }; int linearwb::train() { trainunigr(); gensuccstat(); return 1; } int linearwb::discount(ngram ng_,int size,double& fstar,double& lambda,int cv) { VERBOSE(3,"linearwb::discount(ngram ng_,int size,double& fstar,double& lambda,int cv) ng_:|" << ng_ << "| size:" << size << " cv:" << cv<< std::endl); ngram ng(dict); ng.trans(ng_); if (size > 
1) { ngram history=ng; if (ng.ckhisto(size) && get(history,size,size-1) && (history.freq>cv) && ((size < 3) || ((history.freq-cv) > prunethresh))) { // apply history pruning on trigrams only if (get(ng,size,size) && (!prunesingletons() || ng.freq>1 || size<3)) { // apply frequency pruning on trigrams only cv=(ng.freq=cv if (ng.freq>cv) { fstar=(double)(ng.freq-cv)/(double)(history.freq - cv + history.succ); lambda=(double)history.succ/(double)(history.freq - cv + history.succ); if (size>=3 && prunesingletons()){ // correction due to frequency pruning lambda+=(double)succ1(history.link)/(double)(history.freq - cv + history.succ); // succ1(history.link) is not affected when ng.freq > cv } } else { // ng.freq == cv fstar=0.0; lambda=(double)(history.succ - 1)/ (double)(history.freq - cv + history.succ - 1); // remove cv n-grams from data if (size>=3 && prunesingletons()){ // correction due to frequency pruning lambda+=(double)succ1(history.link)-(cv==1 && ng.freq==1?1:0)/(double)(history.freq - cv + history.succ - 1); } } } else { fstar=0.0; lambda=(double)history.succ/(double)(history.freq + history.succ); if (size>=3 && prunesingletons()){ // correction due to frequency pruning lambda+=(double)succ1(history.link)/(double)(history.freq + history.succ); } } //cerr << "ngram :" << ng << "\n"; // if current word is OOV then back-off to unigrams! if (*ng.wordp(1)==dict->oovcode()) { lambda+=fstar; fstar=0.0; MY_ASSERT(lambda<=1 && lambda>0); } else { // add f*(oov|...) 
to lambda *ng.wordp(1)=dict->oovcode(); if (get(ng,size,size) && (!prunesingletons() || ng.freq>1 || size<3)){ lambda+=(double)ng.freq/(double)(history.freq - cv + history.succ); } } } else { fstar=0; lambda=1; } } else { fstar=unigr(ng); lambda=0; } VERBOSE(3,"linearwb::discount(ngram ng_,int size,double& fstar,double& lambda,int cv) ng_:|" << ng_ << "| returning fstar:" << fstar << " lambda:" << lambda << std::endl); return 1; } linearstb::linearstb(char* ngtfile,int depth,int prunefreq,TABLETYPE tt): mdiadaptlm(ngtfile,depth,tt) { prunethresh=prunefreq; cerr << "PruneThresh: " << prunethresh << "\n"; }; int linearstb::train() { trainunigr(); gensuccstat(); return 1; } int linearstb::discount(ngram ng_,int size,double& fstar,double& lambda,int cv) { VERBOSE(3,"linearstb::discount(ngram ng_,int size,double& fstar,double& lambda,int cv) ng_:|" << ng_ << "| size:" << size << " cv:" << cv<< std::endl); ngram ng(dict); ng.trans(ng_); lambda = 0.4; if (size > 1) { ngram history=ng; if (ng.ckhisto(size) && get(history,size,size-1) && (history.freq>cv) && ((size < 3) || ((history.freq-cv) > prunethresh))) { // apply history pruning on trigrams only if (get(ng,size,size) && (!prunesingletons() || ng.freq>1 || size<3)) { // apply frequency pruning on trigrams only cv=(ng.freq=cv if (ng.freq>cv) { fstar=(double)(ng.freq-cv)/(double)(history.freq - cv); if (size>=3 && prunesingletons()){ // correction due to frequency pruning if (history.freq<=1 && size>3){ lambda = 1.0; } } } else { // ng.freq == cv fstar=0.0; if (size>=3 && prunesingletons()){ // correction due to frequency pruning if (history.freq<=1 && history.size>3){ lambda = 1.0; } } } } else { fstar=0.0; if (size>=3 && prunesingletons()){ // correction due to frequency pruning if (history.freq<=1 && history.size>3){ lambda = 1.0; } } } //cerr << "ngram :" << ng << "\n"; // if current word is OOV then back-off to unigrams! 
if (*ng.wordp(1)==dict->oovcode()) { fstar=0.0; } } else { fstar=0; lambda=1; } } else { fstar=unigr(ng); lambda=0; } VERBOSE(3,"linearstb::discount(ngram ng_,int size,double& fstar,double& lambda,int cv) ng_:|" << ng_ << "| returning fstar:" << fstar << " lambda:" << lambda << std::endl); return 1; } int linearstb::compute_backoff() { VERBOSE(3,"linearstb::compute_backoff() ... "); this->backoff=1; for (int size=1; size #include #include #include #include #include #include #include "util.h" #include "lmContainer.h" #include "lmtable.h" #include "lmmacro.h" #include "lmclass.h" #include "lmInterpolation.h" using namespace std; namespace irstlm { #ifdef PS_CACHE_ENABLE #if PS_CACHE_ENABLE==0 #undef PS_CACHE_ENABLE #endif #endif #ifdef LMT_CACHE_ENABLE #if LMT_CACHE_ENABLE==0 #undef LMT_CACHE_ENABLE #endif #endif #if PS_CACHE_ENABLE bool lmContainer::ps_cache_enabled=true; #else bool lmContainer::ps_cache_enabled=false; #endif #if LMT_CACHE_ENABLE bool lmContainer::lmt_cache_enabled=true; #else bool lmContainer::lmt_cache_enabled=false; #endif inline void error(const char* message) { std::cerr << message << "\n"; throw std::runtime_error(message); } lmContainer::lmContainer() { requiredMaxlev=IRSTLM_REQUIREDMAXLEV_DEFAULT; lmtype=_IRSTLM_LMUNKNOWN; maxlev=0; } int lmContainer::getLanguageModelType(std::string filename) { fstream inp(filename.c_str(),ios::in|ios::binary); if (!inp.good()) { std::stringstream ss_msg; ss_msg << "Failed to open " << filename; exit_error(IRSTLM_ERROR_IO, ss_msg.str()); } //give a look at the header to get informed about the language model type std::string header; inp >> header; inp.close(); VERBOSE(1,"LM header:|" << header << "|" << std::endl); int type=_IRSTLM_LMUNKNOWN; VERBOSE(1,"type: " << type << std::endl); if (header == "lmminterpolation" || header == "LMINTERPOLATION") { type = _IRSTLM_LMINTERPOLATION; } else if (header == "lmmacro" || header == "LMMACRO") { type = _IRSTLM_LMMACRO; } else if (header == "lmclass" || header == 
"LMCLASS") { type = _IRSTLM_LMCLASS; } else { type = _IRSTLM_LMTABLE; } VERBOSE(1,"type: " << type << std::endl); return type; }; lmContainer* lmContainer::CreateLanguageModel(const std::string infile, float nlf, float dlf) { int type = lmContainer::getLanguageModelType(infile); VERBOSE(1,"lmContainer* lmContainer::CreateLanguageModel(...) Language Model Type of " << infile << " is " << type << std::endl); return lmContainer::CreateLanguageModel(type, nlf, dlf); } lmContainer* lmContainer::CreateLanguageModel(int type, float nlf, float dlf) { VERBOSE(1,"Language Model Type is " << type << std::endl); lmContainer* lm=NULL; switch (type) { case _IRSTLM_LMTABLE: VERBOSE(1,"_IRSTLM_LMTABLE" << std::endl); lm = new lmtable(nlf, dlf); break; case _IRSTLM_LMMACRO: VERBOSE(1,"_IRSTLM_LMMACRO" << std::endl); lm = new lmmacro(nlf, dlf); break; case _IRSTLM_LMCLASS: VERBOSE(1,"_IRSTLM_LMCLASS" << std::endl); lm = new lmclass(nlf, dlf); break; case _IRSTLM_LMINTERPOLATION: VERBOSE(1,"_IRSTLM_LMINTERPOLATION" << std::endl); lm = new lmInterpolation(nlf, dlf); break; default: VERBOSE(1,"UNKNOWN" << std::endl); exit_error(IRSTLM_ERROR_DATA, "This language model type is unknown!"); } VERBOSE(1,"lmContainer* lmContainer::CreateLanguageModel(int type, float nlf, float dlf) lm:|" << (void*) lm << "|" << std::endl); lm->setLanguageModelType(type); VERBOSE(1,"lmContainer* lmContainer::CreateLanguageModel(int type, float nlf, float dlf) lm->getLanguageModelType:|" << lm->getLanguageModelType() << "|" << std::endl) return lm; } bool lmContainer::filter(const string sfilter, lmContainer*& sublmC, const string skeepunigrams) { if (lmtype == _IRSTLM_LMTABLE) { sublmC = lmContainer::CreateLanguageModel(lmtype,((lmtable*) this)->GetNgramcacheLoadFactor(),((lmtable*) this)->GetDictionaryLoadFactor()); //let know that table has inverted n-grams sublmC->is_inverted(is_inverted()); sublmC->setMaxLoadedLevel(getMaxLoadedLevel()); sublmC->maxlevel(maxlevel()); bool res=((lmtable*) 
this)->filter(sfilter, (lmtable*) sublmC, skeepunigrams); return res; } return false; }; }//namespace irstlm irstlm-6.00.05/src/lmContainer.h000066400000000000000000000250661263213470300164210ustar00rootroot00000000000000// $Id: lmContainer.h 3686 2010-10-15 11:55:32Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #ifndef MF_LMCONTAINER_H #define MF_LMCONTAINER_H #define _IRSTLM_LMUNKNOWN 0 #define _IRSTLM_LMTABLE 1 #define _IRSTLM_LMMACRO 2 #define _IRSTLM_LMCLASS 3 #define _IRSTLM_LMINTERPOLATION 4 #include #include #include #include "util.h" #include "n_gram.h" #include "dictionary.h" typedef enum {BINARY,TEXT,YRANIB,NONE} OUTFILE_TYPE; typedef enum {LMT_FIND, //!< search: find an entry LMT_ENTER, //!< search: enter an entry LMT_INIT, //!< scan: start scan LMT_CONT //!< scan: continue scan } LMT_ACTION; namespace irstlm { class lmContainer { static const bool debug=true; static bool ps_cache_enabled; static bool lmt_cache_enabled; protected: int lmtype; //auto reference to its own type int maxlev; //maximun order of sub LMs; int requiredMaxlev; //max loaded 
level, i.e. load up to requiredMaxlev levels public: lmContainer(); virtual ~lmContainer() {}; virtual void load(const std::string &filename, int mmap=0) { UNUSED(filename); UNUSED(mmap); }; virtual void savetxt(const char *filename) { UNUSED(filename); }; virtual void savebin(const char *filename) { UNUSED(filename); }; virtual double getlogOOVpenalty() const { return 0.0; }; virtual double setlogOOVpenalty(int dub) { UNUSED(dub); return 0.0; }; virtual double setlogOOVpenalty(double oovp) { UNUSED(oovp); return 0.0; }; inline virtual dictionary* getDict() const { return NULL; }; inline virtual void maxlevel(int lev) { maxlev = lev; }; inline virtual int maxlevel() const { return maxlev; }; inline virtual void stat(int lev=0) { UNUSED(lev); }; inline virtual void setMaxLoadedLevel(int lev) { requiredMaxlev=lev; }; inline virtual int getMaxLoadedLevel() { return requiredMaxlev; }; virtual bool is_inverted(const bool flag) { UNUSED(flag); return false; }; virtual bool is_inverted() { return false; }; virtual double clprob(ngram ng) { return clprob(ng, NULL, NULL, NULL, NULL, NULL, NULL, NULL); } virtual double clprob(ngram ng, double* bow) { return clprob(ng, bow, NULL, NULL, NULL, NULL, NULL, NULL); } virtual double clprob(ngram ng, double* bow, int* bol) { return clprob(ng, bow, bol, NULL, NULL, NULL, NULL, NULL); } virtual double clprob(ngram ng, double* bow, int* bol, char** maxsuffptr) { return clprob(ng, bow, bol, NULL, maxsuffptr, NULL, NULL, NULL); } virtual double clprob(ngram ng, double* bow, int* bol, char** maxsuffptr, unsigned int* statesize) { return clprob(ng, bow, bol, NULL, maxsuffptr, statesize, NULL, NULL); } virtual double clprob(ngram ng, double* bow, int* bol, char** maxsuffptr, unsigned int* statesize, bool* extendible) { return clprob(ng, bow, bol, NULL, maxsuffptr, statesize, extendible, NULL); }; virtual double clprob(ngram ng, double* bow, int* bol, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow) { return 
clprob(ng, bow, bol, NULL, maxsuffptr, statesize, extendible, lastbow); } virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx) { return clprob(ng, bow, bol, maxsuffidx, NULL, NULL, NULL, NULL); } virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr) { return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, NULL, NULL, NULL); } virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize) { return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, NULL, NULL); } virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible) { return clprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, NULL); }; virtual double clprob(int* ng, int ngsize){ return clprob(ng, ngsize, NULL, NULL, NULL, NULL, NULL, NULL, NULL); } virtual double clprob(int* ng, int ngsize, double* bow){ return clprob(ng, ngsize, bow, NULL, NULL, NULL, NULL, NULL, NULL); } virtual double clprob(int* ng, int ngsize, double* bow, int* bol){ return clprob(ng, ngsize, bow, bol, NULL, NULL, NULL, NULL, NULL); } virtual double clprob(int* ng, int ngsize, double* bow, int* bol, char** maxsuffptr, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL){ return clprob(ng, ngsize, bow, bol, NULL, maxsuffptr, statesize, extendible, lastbow); } virtual double clprob(int* ng, int ngsize, double* bow, int* bol, ngram_state_t* maxsuffidx){ return clprob(ng, ngsize, bow, bol, maxsuffidx, NULL, NULL, NULL, NULL); } virtual double clprob(int* ng, int ngsize, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL) { //create the actual ngram ngram ong(getDict()); ong.pushc(ng,ngsize); MY_ASSERT (ong.size == ngsize); return clprob(ong, bow, bol, maxsuffidx, maxsuffptr, statesize, 
extendible, lastbow); } virtual double clprob(int* ng, int ngsize, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL) { //create the actual ngram ngram ong(getDict()); ong.pushc(ng,ngsize); MY_ASSERT (ong.size == ngsize); return clprob(ong, topic_weights, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow); } virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow) { UNUSED(ng); UNUSED(bow); UNUSED(bol); UNUSED(maxsuffidx); UNUSED(maxsuffptr); UNUSED(statesize); UNUSED(extendible); UNUSED(lastbow); return 0.0; } //this is a function which could be overwritten virtual double clprob(ngram ng, topic_map_t& topic_weights, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL, double* lastbow=NULL) { UNUSED(topic_weights); UNUSED(ng); UNUSED(bow); UNUSED(bol); UNUSED(maxsuffidx); UNUSED(maxsuffptr); UNUSED(statesize); UNUSED(extendible); UNUSED(lastbow); return 0.0; } virtual const char *cmaxsuffptr(ngram ng, unsigned int* statesize=NULL) { UNUSED(ng); UNUSED(statesize); return NULL; } virtual const char *cmaxsuffptr(int* ng, int ngsize, unsigned int* statesize=NULL) { //create the actual ngram ngram ong(getDict()); ong.pushc(ng,ngsize); MY_ASSERT (ong.size == ngsize); return cmaxsuffptr(ong, statesize); } virtual ngram_state_t cmaxsuffidx(ngram ng, unsigned int* statesize=NULL) { UNUSED(ng); UNUSED(statesize); return 0; } virtual ngram_state_t cmaxsuffidx(int* ng, int ngsize, unsigned int* statesize=NULL) { //create the actual ngram ngram ong(getDict()); ong.pushc(ng,ngsize); MY_ASSERT (ong.size == ngsize); return cmaxsuffidx(ong,statesize); } virtual inline int get(ngram& ng) { UNUSED(ng); return 0; } virtual int get(ngram& ng,int 
n,int lev){ UNUSED(ng); UNUSED(n); UNUSED(lev); return 0; } virtual int succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev){ UNUSED(ng); UNUSED(h); UNUSED(action); UNUSED(lev); return 0; } virtual void used_caches() {}; virtual void init_caches(int uptolev) { UNUSED(uptolev); }; virtual void check_caches_levels() {}; virtual void reset_caches() {}; virtual void reset_mmap() {}; void inline setLanguageModelType(int type) { lmtype=type; }; int getLanguageModelType() const { return lmtype; }; static int getLanguageModelType(std::string filename); inline virtual void dictionary_incflag(const bool flag) { UNUSED(flag); }; virtual bool filter(const string sfilter, lmContainer*& sublmt, const string skeepunigrams); static lmContainer* CreateLanguageModel(const std::string infile, float nlf=0.0, float dlf=0.0); static lmContainer* CreateLanguageModel(int type, float nlf=0.0, float dlf=0.0); inline virtual bool is_OOV(int code) { UNUSED(code); return false; }; inline static bool is_lmt_cache_enabled(){ VERBOSE(3,"inline static bool is_lmt_cache_enabled() " << lmt_cache_enabled << std::endl); return lmt_cache_enabled; } inline static bool is_ps_cache_enabled(){ VERBOSE(3,"inline static bool is_ps_cache_enabled() " << ps_cache_enabled << std::endl); return ps_cache_enabled; } inline static bool is_cache_enabled(){ return is_lmt_cache_enabled() && is_ps_cache_enabled(); } virtual int addWord(const char *w){ getDict()->incflag(1); int c=getDict()->encode(w); getDict()->incflag(0); return c; } virtual void print_table_stat(){ VERBOSE(3,"virtual void lmContainer::print_table_stat() "<< std::endl); }; }; }//namespace irstlm #endif irstlm-6.00.05/src/lmInterpolation.cpp000066400000000000000000000250451263213470300176560ustar00rootroot00000000000000// $Id: lmInterpolation.cpp 3686 2010-10-15 11:55:32Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, 
Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include #include #include "lmContainer.h" #include "lmInterpolation.h" #include "util.h" using namespace std; namespace irstlm { lmInterpolation::lmInterpolation(float nlf, float dlf) { ngramcache_load_factor = nlf; dictionary_load_factor = dlf; order=0; memmap=0; isInverted=false; } void lmInterpolation::load(const std::string &filename,int mmap) { VERBOSE(2,"lmInterpolation::load(const std::string &filename,int memmap)" << std::endl); VERBOSE(2," filename:|" << filename << "|" << std::endl); dictionary_upperbound=1000000; int memmap=mmap; dict=new dictionary((char *)NULL,1000000,dictionary_load_factor); //get info from the configuration file fstream inp(filename.c_str(),ios::in|ios::binary); char line[MAX_LINE]; const char* words[LMINTERPOLATION_MAX_TOKEN]; int tokenN; inp.getline(line,MAX_LINE,'\n'); tokenN = parseWords(line,words,LMINTERPOLATION_MAX_TOKEN); if (tokenN != 2 || ((strcmp(words[0],"LMINTERPOLATION") != 0) && (strcmp(words[0],"lminterpolation")!=0))){ exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format: LMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1\nweight_of_LM_2 
filename_of_LM_2"); } m_number_lm = atoi(words[1]); m_weight.resize(m_number_lm); m_file.resize(m_number_lm); m_isinverted.resize(m_number_lm); m_lm.resize(m_number_lm); VERBOSE(2,"lmInterpolation::load(const std::string &filename,int mmap) m_number_lm:"<< m_number_lm << std::endl); dict->incflag(1); for (size_t i=0; i3) { exit_error(IRSTLM_ERROR_DATA, "ERROR: wrong header format of configuration file\ncorrect format: LMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1\nweight_of_LM_2 filename_of_LM_2"); } //check whether the (textual) LM has to be loaded as inverted m_isinverted[i] = false; if(tokenN == 3) { if (strcmp(words[2],"inverted") == 0) m_isinverted[i] = true; } VERBOSE(2,"i:" << i << " m_isinverted[i]:" << m_isinverted[i] << endl); m_weight[i] = (float) atof(words[0]); m_file[i] = words[1]; VERBOSE(2,"lmInterpolation::load(const std::string &filename,int mmap) m_file:"<< words[1] << std::endl); m_lm[i] = load_lm(i,memmap,ngramcache_load_factor,dictionary_load_factor); //set the actual value for inverted flag, which is known only after loading the lM m_isinverted[i] = m_lm[i]->is_inverted(); dictionary *_dict=m_lm[i]->getDict(); for (int j=0; j<_dict->size(); j++) { dict->encode(_dict->decode(j)); } } dict->genoovcode(); inp.close(); int maxorder = 0; for (size_t i=0; i m_lm[i]->maxlevel())?maxorder:m_lm[i]->maxlevel(); } if (order == 0) { order = maxorder; VERBOSE(3, "order is not set; reset to the maximum order of LMs: " << order << std::endl); } else if (order > maxorder) { order = maxorder; VERBOSE(3, "order is too high; reset to the maximum order of LMs: " << order << std::endl); } maxlev=order; } lmContainer* lmInterpolation::load_lm(int i,int memmap, float nlf, float dlf) { //checking the language model type lmContainer* lmt=lmContainer::CreateLanguageModel(m_file[i],nlf,dlf); //let know that table has inverted n-grams lmt->is_inverted(m_isinverted[i]); //set inverted flag for each LM lmt->setMaxLoadedLevel(requiredMaxlev); 
lmt->load(m_file[i], memmap); lmt->init_caches(lmt->maxlevel()); return lmt; } //return log10 prob of an ngram double lmInterpolation::clprob(ngram ng, double* bow,int* bol,ngram_state_t* maxsuffidx, char** maxsuffptr,unsigned int* statesize,bool* extendible, double* lastbow) { double pr=0.0; double _logpr; char* _maxsuffptr=NULL,*actualmaxsuffptr=NULL; ngram_state_t _maxsuffidx=0,actualmaxsuffidx=0; unsigned int _statesize=0,actualstatesize=0; int _bol=0,actualbol=MAX_NGRAM; double _bow=0.0,actualbow=0.0; double _lastbow=0.0,actuallastbow=0.0; bool _extendible=false,actualextendible=false; for (size_t i=0; i0.0){ ngram _ng(m_lm[i]->getDict()); _ng.trans(ng); // _logpr=m_lm[i]->clprob(_ng,&_bow,&_bol,&_maxsuffptr,&_statesize,&_extendible); _logpr=m_lm[i]->clprob(_ng,&_bow,&_bol,&_maxsuffidx,&_maxsuffptr,&_statesize,&_extendible, lastbow); IFVERBOSE(3){ //cerr.precision(10); VERBOSE(3," LM " << i << " weight:" << m_weight[i] << std::endl); VERBOSE(3," LM " << i << " log10 logpr:" << _logpr<< std::endl); VERBOSE(3," LM " << i << " pr:" << pow(10.0,_logpr) << std::endl); VERBOSE(3," _statesize:" << _statesize << std::endl); VERBOSE(3," _bow:" << _bow << std::endl); VERBOSE(3," _bol:" << _bol << std::endl); VERBOSE(3," _lastbow:" << _lastbow << std::endl); } /* //TO CHECK the following claims //What is the statesize of a LM interpolation? The largest _statesize among the submodels //What is the maxsuffptr of a LM interpolation? The _maxsuffptr of the submodel with the largest _statesize //What is the bol of a LM interpolation? The smallest _bol among the submodels //What is the bow of a LM interpolation? The weighted sum of the bow of the submodels //What is the prob of a LM interpolation? The weighted sum of the prob of the submodels //What is the extendible flag of a LM interpolation? true if the extendible flag is one for any LM //What is the lastbow of a LM interpolation? 
The weighted sum of the lastbow of the submodels */ pr+=m_weight[i]*pow(10.0,_logpr); actualbow+=m_weight[i]*pow(10.0,_bow); if(_statesize > actualstatesize || i == 0) { actualmaxsuffptr = _maxsuffptr; actualmaxsuffidx = _maxsuffidx; actualstatesize = _statesize; } if (_bol < actualbol) { actualbol=_bol; //backoff limit of LM[i] } if (_extendible) { actualextendible=true; //set extendible flag to true if the ngram is extendible for any LM } if (_lastbow < actuallastbow) { actuallastbow=_lastbow; //backoff limit of LM[i] } } } if (bol) *bol=actualbol; if (bow) *bow=log(actualbow); if (maxsuffptr) *maxsuffptr=actualmaxsuffptr; if (maxsuffidx) *maxsuffidx=actualmaxsuffidx; if (statesize) *statesize=actualstatesize; if (extendible) *extendible=actualextendible; if (lastbow) *bol=actuallastbow; if (statesize) VERBOSE(3, " statesize:" << *statesize << std::endl); if (bow) VERBOSE(3, " bow:" << *bow << std::endl); if (bol) VERBOSE(3, " bol:" << *bol << std::endl); if (lastbow) VERBOSE(3, " lastbow:" << *lastbow << std::endl); return log10(pr); } const char *lmInterpolation::cmaxsuffptr(ngram ng, unsigned int* statesize) { char *maxsuffptr=NULL; unsigned int _statesize=0,actualstatesize=0; for (size_t i=0; i0.0){ ngram _ng(m_lm[i]->getDict()); _ng.trans(ng); const char* _maxsuffptr = m_lm[i]->cmaxsuffptr(_ng,&_statesize); IFVERBOSE(3){ //cerr.precision(10); VERBOSE(3," LM " << i << " weight:" << m_weight[i] << std::endl); VERBOSE(3," _statesize:" << _statesize << std::endl); } /* //TO CHECK the following claims //What is the statesize of a LM interpolation? The largest _statesize among the submodels //What is the maxsuffptr of a LM interpolation? 
The _maxsuffptr of the submodel with the largest _statesize */ if(_statesize > actualstatesize || i == 0) { maxsuffptr = (char*) _maxsuffptr; actualstatesize = _statesize; } } } if (statesize) *statesize=actualstatesize; if (statesize) VERBOSE(3, " statesize:" << *statesize << std::endl); return maxsuffptr; } ngram_state_t lmInterpolation::cmaxsuffidx(ngram ng, unsigned int* statesize) { ngram_state_t maxsuffidx=0; unsigned int _statesize=0,actualstatesize=0; for (size_t i=0; i0.0){ ngram _ng(m_lm[i]->getDict()); _ng.trans(ng); ngram_state_t _maxsuffidx = m_lm[i]->cmaxsuffidx(_ng,&_statesize); IFVERBOSE(3){ //cerr.precision(10); VERBOSE(3," LM " << i << " weight:" << m_weight[i] << std::endl); VERBOSE(3," _statesize:" << _statesize << std::endl); } /* //TO CHECK the following claims //What is the statesize of a LM interpolation? The largest _statesize among the submodels //What is the maxsuffptr of a LM interpolation? The _maxsuffptr of the submodel with the largest _statesize */ if(_statesize > actualstatesize || i == 0) { maxsuffidx = _maxsuffidx; actualstatesize = _statesize; } } } if (statesize) *statesize=actualstatesize; if (statesize) VERBOSE(3, " statesize:" << *statesize << std::endl); return maxsuffidx; } double lmInterpolation::setlogOOVpenalty(int dub) { MY_ASSERT(dub > dict->size()); double _logpr; double OOVpenalty=0.0; for (size_t i=0; i0.0){ m_lm[i]->setlogOOVpenalty(dub); //set OOV Penalty for each LM _logpr=m_lm[i]->getlogOOVpenalty(); // logOOV penalty is in log10 // OOVpenalty+=m_weight[i]*exp(_logpr); OOVpenalty+=m_weight[i]*exp(_logpr*M_LN10); // logOOV penalty is in log10 } } // logOOVpenalty=log(OOVpenalty); logOOVpenalty=log10(OOVpenalty); return logOOVpenalty; } }//namespace irstlm irstlm-6.00.05/src/lmInterpolation.h000066400000000000000000000101301263213470300173100ustar00rootroot00000000000000// $Id: lmInterpolation.h 3686 2010-10-15 11:55:32Z bertoldi $ /****************************************************************************** 
IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #ifndef MF_LMINTERPOLATION_H #define MF_LMINTERPOLATION_H #include #include #include #include #include #include #include "util.h" #include "dictionary.h" #include "n_gram.h" #include "lmContainer.h" namespace irstlm { /* interpolation of several sub LMs */ #define LMINTERPOLATION_MAX_TOKEN 3 class lmInterpolation: public lmContainer { static const bool debug=true; size_t m_number_lm; int order; int dictionary_upperbound; //set by user double logOOVpenalty; //penalty for OOV words (default 0) bool isInverted; int memmap; //level from which n-grams are accessed via mmap std::vector m_weight; std::vector m_file; std::vector m_isinverted; std::vector m_lm; int maxlev; //maximun order of sub LMs; float ngramcache_load_factor; float dictionary_load_factor; dictionary *dict; // dictionary for all interpolated LMs public: lmInterpolation(float nlf=0.0, float dlfi=0.0); virtual ~lmInterpolation() {}; virtual void load(const std::string &filename,int mmap=0); lmContainer* load_lm(int i, int memmap, float nlf, float dlf); virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** 
maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow); virtual const char *cmaxsuffptr(ngram ong, unsigned int* size=NULL); virtual ngram_state_t cmaxsuffidx(ngram ong, unsigned int* size=NULL); int maxlevel() const { return maxlev; }; virtual inline void setDict(dictionary* d) { if (dict) delete dict; dict=d; }; virtual inline dictionary* getDict() const { return dict; }; //set penalty for OOV words virtual inline double getlogOOVpenalty() const { return logOOVpenalty; } virtual double setlogOOVpenalty(int dub); double inline setlogOOVpenalty(double oovp) { return logOOVpenalty=oovp; } //set the inverted flag (used to set the inverted flag of each subLM, when loading) inline bool is_inverted(const bool flag) { return isInverted = flag; } //for an interpolation LM this variable does not make sense //for compatibility, we return true if all subLM return true inline bool is_inverted() { for (size_t i=0; iincflag(flag); }; inline virtual bool is_OOV(int code) { //returns true if the word is OOV for each subLM for (size_t i=0; igetDict()->encode(getDict()->decode(code)); if (m_lm[i]->is_OOV(_code) == false) return false; } return true; } virtual int addWord(const char *w){ for (size_t i=0; igetDict()->incflag(1); m_lm[i]->getDict()->encode(w); m_lm[i]->getDict()->incflag(0); } getDict()->incflag(1); int c=getDict()->encode(w); getDict()->incflag(0); return c; } }; }//namespace irstlm #endif irstlm-6.00.05/src/lmclass.cpp000066400000000000000000000167171263213470300161420ustar00rootroot00000000000000// $Id: lmclass.cpp 3631 2010-10-07 12:04:12Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your 
option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include #include #include "math.h" #include "mempool.h" #include "htable.h" #include "ngramcache.h" #include "dictionary.h" #include "n_gram.h" #include "lmclass.h" #include "util.h" using namespace std; // local utilities: start int parseWords(char *sentence, const char **words, int max); inline void error(const char* message) { cerr << message << "\n"; throw runtime_error(message); } // local utilities: end namespace irstlm { lmclass::lmclass(float nlf, float dlfi):lmtable(nlf,dlfi) { MaxMapSize=1000000; MapScore= (double *)malloc(MaxMapSize*sizeof(double));// //array of probabilities memset(MapScore,0,MaxMapSize*sizeof(double)); MapScoreN=0; dict = new dictionary((char *)NULL,MaxMapSize); //word to cluster dictionary }; lmclass::~lmclass() { free (MapScore); delete dict; } void lmclass::load(const std::string &filename,int memmap) { VERBOSE(2,"lmclass::load(const std::string &filename,int memmap)" << std::endl); //get info from the configuration file fstream inp(filename.c_str(),ios::in|ios::binary); char line[MAX_LINE]; const char* words[LMCLASS_MAX_TOKEN]; int tokenN; inp.getline(line,MAX_LINE,'\n'); tokenN = parseWords(line,words,LMCLASS_MAX_TOKEN); if (tokenN != 2 || ((strcmp(words[0],"LMCLASS") != 0) && (strcmp(words[0],"lmclass")!=0))) error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCLASS LM_order\nfilename_of_LM\nfilename_of_map"); maxlev 
= atoi(words[1]); std::string lmfilename; if (inp.getline(line,MAX_LINE,'\n')) { tokenN = parseWords(line,words,LMCLASS_MAX_TOKEN); lmfilename = words[0]; } else { error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCLASS LM_order\nfilename_of_LM\nfilename_of_map"); } std::string W2Cdict = ""; if (inp.getline(line,MAX_LINE,'\n')) { tokenN = parseWords(line,words,LMCLASS_MAX_TOKEN); W2Cdict = words[0]; } else { error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCLASS LM_order\nfilename_of_LM\nfilename_of_map"); } inp.close(); std::cerr << "lmfilename:" << lmfilename << std::endl; if (W2Cdict != "") { std::cerr << "mapfilename:" << W2Cdict << std::endl; } else { error((char*)"ERROR: you must specify a map!"); } // Load the (possibly binary) LM inputfilestream inpLM(lmfilename.c_str()); if (!inpLM.good()) { std::cerr << "Failed to open " << lmfilename << "!" << std::endl; exit(1); } lmtable::load(inpLM,lmfilename.c_str(),NULL,memmap); inputfilestream inW2C(W2Cdict); if (!inW2C.good()) { std::cerr << "Failed to open " << W2Cdict << "!" 
<< std::endl; exit(1); } loadMap(inW2C); getDict()->genoovcode(); VERBOSE(2,"OOV code of lmclass is " << getDict()->oovcode() << " mapped into " << getMap(getDict()->oovcode())<< "\n"); getDict()->incflag(1); } void lmclass::loadMap(istream& inW2C) { double lprob=0.0; int howmany=0; const char* words[1 + LMTMAXLEV + 1 + 1]; //open input stream and prepare an input string char line[MAX_LINE]; dict->incflag(1); //can add to the map dictionary cerr<<"loadW2Cdict()...\n"; //save freq of EOS and BOS loadMapElement(dict->BoS(),lmtable::dict->BoS(),0.0); loadMapElement(dict->EoS(),lmtable::dict->EoS(),0.0); //should i add to the dict or just let the trans_freq handle loadMapElement(dict->OOV(),lmtable::dict->OOV(),0.0); while (inW2C.getline(line,MAX_LINE)) { if (strlen(line)==MAX_LINE-1) { cerr << "lmtable::loadW2Cdict: input line exceed MAXLINE (" << MAX_LINE << ") chars " << line << "\n"; exit(1); } howmany = parseWords(line, words, 4); //3 if(howmany == 3) { MY_ASSERT(sscanf(words[2], "%lf", &lprob)); lprob=(double)log10(lprob); } else if(howmany==2) { VERBOSE(3,"No score for the pair (" << words[0] << "," << words[1] << "); set to default 1.0\n"); lprob=0.0; } else { cerr << "parseline: not enough entries" << line << "\n"; exit(1); } loadMapElement(words[0],words[1],lprob); //check if the are available position in MapScore checkMap(); } VERBOSE(2,"There are " << MapScoreN << " entries in the map\n"); dict->incflag(0); //can NOT add to the dictionary of lmclass } void lmclass::checkMap() { if (MapScoreN > MaxMapSize) { MaxMapSize=2*MapScoreN; MapScore = (double*) reallocf(MapScore, sizeof(double)*(MaxMapSize)); VERBOSE(2,"In lmclass::checkMap(...) 
MaxMapSize=" << MaxMapSize << " MapScoreN=" << MapScoreN << "\n"); } } void lmclass::loadMapElement(const char* in, const char* out, double sc) { //freq of word (in) encodes the ID of the class (out) //save the probability associated with the pair (in,out) int wcode=dict->encode(in); dict->freq(wcode,lmtable::dict->encode(out)); MapScore[wcode]=sc; VERBOSE(3,"In lmclass::loadMapElement(...) in=" << in << " wcode=" << wcode << " out=" << out << " ccode=" << lmtable::dict->encode(out) << " MapScoreN=" << MapScoreN << "\n"); if (wcode >= MapScoreN) MapScoreN++; //increment size of the array MapScore if the element is new } //double lmclass::lprob(ngram ong,double* bow, int* bol, char** maxsuffptr,unsigned int* statesize,bool* extendible) double lmclass::lprob(ngram ong,double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow) { double lpr=getMapScore(*ong.wordp(1)); VERBOSE(3,"In lmclass::lprob(...) Mapscore = " << lpr << "\n"); //convert ong to it's clustered encoding ngram mapped_ng(lmtable::getDict()); // mapped_ng.trans_freq(ong); mapping(ong,mapped_ng); // lpr+=lmtable::clprob(mapped_ng,bow,bol,maxsuffptr,statesize, extendible); lpr+=lmtable::clprob(mapped_ng, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow); VERBOSE(3,"In lmclass::lprob(...) 
global prob = " << lpr << "\n"); return lpr; } void lmclass::mapping(ngram &in, ngram &out) { int insize = in.size; VERBOSE(3,"In lmclass::mapping(ngram &in, ngram &out) in = " << in << "\n"); // map the input sequence (in) into the corresponding output sequence (out), by applying the provided map for (int i=insize; i>0; i--) { out.pushc(getMap(*in.wordp(i))); } VERBOSE(3,"In lmclass::mapping(ngram &in, ngram &out) out = " << out << "\n"); return; } }//namespace irstlm irstlm-6.00.05/src/lmclass.h000066400000000000000000000056571263213470300156100ustar00rootroot00000000000000// $Id: lmclass.h 3461 2010-08-27 10:17:34Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #ifndef MF_LMCLASS_H #define MF_LMCLASS_H #ifndef WIN32 #include #include #endif #include "util.h" #include "ngramcache.h" #include "dictionary.h" #include "n_gram.h" #include "lmtable.h" #define LMCLASS_MAX_TOKEN 2 namespace irstlm { class lmclass: public lmtable { dictionary *dict; // dictionary (words - macro tags) double *MapScore; int MapScoreN; int MaxMapSize; protected: void loadMap(std::istream& inp); void loadMapElement(const char* in, const char* out, double sc); void mapping(ngram &in, ngram &out); inline double getMapScore(int wcode) { //the input word is un-known by the map, so I "transform" this word into the oov (of the words) if (wcode >= MapScoreN) { wcode = getDict()->oovcode(); } return MapScore[wcode]; }; inline size_t getMap(int wcode) { //the input word is un-known by the map, so I "transform" this word into the oov (of the words) if (wcode >= MapScoreN) { wcode = getDict()->oovcode(); } return dict->freq(wcode); }; void checkMap(); public: lmclass(float nlf=0.0, float dlfi=0.0); ~lmclass(); virtual void load(const std::string &filename,int mmap=0); virtual double lprob(ngram ng, double* bow,int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow); virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow) { return lprob(ng, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow); }; inline bool is_OOV(int code) { //a word is consisdered OOV if its mapped value is OOV return lmtable::is_OOV(getMap(code)); }; inline dictionary* getDict() const { return dict; } inline virtual void 
dictionary_incflag(const bool flag) { dict->incflag(flag); }; }; }//namespace irstlm #endif irstlm-6.00.05/src/lmmacro.cpp000066400000000000000000000724201263213470300161270ustar00rootroot00000000000000// $Id: lmmacro.cpp 3631 2010-10-07 12:04:12Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include #include #include "math.h" #include "mempool.h" #include "htable.h" #include "ngramcache.h" #include "dictionary.h" #include "n_gram.h" #include "lmtable.h" #include "lmmacro.h" #include "util.h" using namespace std; // local utilities: start inline void error(const char* message) { cerr << message << "\n"; throw runtime_error(message); } // local utilities: end namespace irstlm { lmmacro::lmmacro(float nlf, float dlfi):lmtable(nlf,dlfi) { dict = new dictionary((char *)NULL,1000000); // dict of micro tags getDict()->incflag(1); }; lmmacro::~lmmacro() { if (mapFlag) unloadmap(); } void lmmacro::load(const std::string &filename,int memmap) { VERBOSE(2,"lmmacro::load(const std::string &filename,int memmap)" << std::endl); //get info 
from the configuration file fstream inp(filename.c_str(),ios::in|ios::binary); char line[MAX_LINE]; const char* words[MAX_TOKEN_N_MAP]; int tokenN; inp.getline(line,MAX_LINE,'\n'); tokenN = parseWords(line,words,MAX_TOKEN_N_MAP); if (tokenN != 4 || ((strcmp(words[0],"LMMACRO") != 0) && (strcmp(words[0],"lmmacro")!=0))) error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMMACRO lmsize field [true|false]\nfilename_of_LM\nfilename_of_map (optional)"); maxlev = atoi(words[1]); selectedField = atoi(words[2]); if ((strcmp(words[3],"TRUE") == 0) || (strcmp(words[3],"true") == 0)) collapseFlag = true; else if ((strcmp(words[3],"FALSE") == 0) || (strcmp(words[3],"false") == 0)) collapseFlag = false; else error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMMACRO lmsize field [true|false]\nfilename_of_LM\nfilename_of_map (optional)"); #ifdef DLEXICALLM selectedFieldForLexicon = atoi(words[3]); collapseFlag = atoi(words[4]); #endif if (selectedField == -1) cerr << "no selected field: the whole string is used" << std::endl; else cerr << "selected field n. 
" << selectedField << std::endl; if (collapseFlag) cerr << "collapse is enabled" << std::endl; else cerr << "collapse is disabled" << std::endl; std::string lmfilename; if (inp.getline(line,MAX_LINE,'\n')) { tokenN = parseWords(line,words,MAX_TOKEN_N_MAP); lmfilename = words[0]; } else error((char*)"ERROR: wrong format of configuration file\ncorrect format: LMMACRO lmsize field [true|false]\nfilename_of_LM\nfilename_of_map (optional)"); std::string mapfilename = ""; if (inp.getline(line,MAX_LINE,'\n')) { tokenN = parseWords(line,words,MAX_TOKEN_N_MAP); mapfilename = words[0]; mapFlag = true; } else { mapFlag = false; } inp.close(); std::cerr << "lmfilename:" << lmfilename << std::endl; if (mapfilename != "") { std::cerr << "mapfilename:" << mapfilename << std::endl; } else { std::cerr << "no mapfilename" << std::endl; mapFlag = false; } //allow the dictionary to add new words getDict()->incflag(1); if ((!mapFlag) && (collapseFlag)) { error((char*)"ERROR: you must specify a map if you want to collapse a specific field!"); } #ifdef DLEXICALLM std::string lexicalclassesfilename = words[2]; if (lexicalclassesfilename != "NULL" && lexicalclassesfilename != "null") lexicalclassesfilename = ""; if (lexicalclassesfilename != "") std::cerr << "lexicalclassesfilename:" << lexicalclassesfilename << std::endl; else std::cerr << "no lexicalclassesfilename" << std::endl; // Load the classes of lexicalization tokens: if (lexicalclassesfilename != "") loadLexicalClasses(lexicalclassesfilename.c_str()); #endif // Load the (possibly binary) LM lmtable::load(lmfilename,memmap); getDict()->incflag(1); if (mapFlag) loadmap(mapfilename); getDict()->genoovcode(); }; void lmmacro::unloadmap() { delete dict; free(microMacroMap); if (collapseFlag) { free(collapsableMap); free(collapsatorMap); } #ifdef DLEXICALLM free(lexicaltoken2classMap); #endif } void lmmacro::loadmap(const std::string mapfilename) { microMacroMapN = 0; microMacroMap = NULL; collapsableMap = NULL; collapsatorMap = NULL; 
#ifdef DLEXICALLM lexicaltoken2classMap = NULL; lexicaltoken2classMapN = 0; #endif microMacroMap = (int *)calloc(BUFSIZ, sizeof(int)); if (collapseFlag) { collapsableMap = (bool *)calloc(BUFSIZ, sizeof(bool)); collapsatorMap = (bool *)calloc(BUFSIZ, sizeof(bool)); } getDict()->genoovcode(); microMacroMap[microMacroMapN] = lmtable::getDict()->oovcode(); MY_ASSERT(microMacroMapN == getDict()->oovcode()); microMacroMapN++; if (lmtable::getDict()->getcode(BOS_)==-1) { lmtable::getDict()->incflag(1); lmtable::getDict()->encode(BOS_); lmtable::getDict()->incflag(0); } if (lmtable::getDict()->getcode(EOS_)==-1) { lmtable::getDict()->incflag(1); lmtable::getDict()->encode(EOS_); lmtable::getDict()->incflag(0); } char line[MAX_LINE]; const char* words[MAX_TOKEN_N_MAP]; const char *macroW; const char *microW; int tokenN; bool bos=false,eos=false; // Load the dictionary of micro tags (to be put in "dict" of lmmacro class): inputfilestream inpMap(mapfilename.c_str()); std::cerr << "Reading map " << mapfilename << "..." 
<< std::endl; while (inpMap.getline(line,MAX_LINE,'\n')) { tokenN = parseWords(line,words,MAX_TOKEN_N_MAP); if (tokenN != 2) error((char*)"ERROR: wrong format of map file\n"); microW = words[0]; macroW = words[1]; int microW_c=getDict()->encode(microW); VERBOSE(4, "microW gets the code:" << microW_c << std::endl); if (microMacroMapN>0 && !(microMacroMapN % BUFSIZ)) { microMacroMap = (int *)reallocf(microMacroMap, sizeof(int)*(BUFSIZ*(1+microMacroMapN/BUFSIZ))); if (collapseFlag) { //create supporting info for collapse collapsableMap = (bool *)reallocf(collapsableMap, sizeof(bool)*(BUFSIZ*(1+microMacroMapN/BUFSIZ))); collapsatorMap = (bool *)reallocf(collapsatorMap, sizeof(bool)*(BUFSIZ*(1+microMacroMapN/BUFSIZ))); } } microMacroMap[microMacroMapN] = lmtable::getDict()->getcode(macroW); if (collapseFlag) { int len = strlen(microW)-1; if (microW[len] == '(') { collapsableMap[microMacroMapN] = false; collapsatorMap[microMacroMapN] = true; } else if (microW[len] == ')') { collapsableMap[microMacroMapN] = true; collapsatorMap[microMacroMapN] = false; } else if (microW[len] == '+') { collapsableMap[microMacroMapN] = true; collapsatorMap[microMacroMapN] = true; } else { collapsableMap[microMacroMapN] = false; collapsatorMap[microMacroMapN] = false; } } if (!bos && !strcmp(microW,BOS_)) bos=true; if (!eos && !strcmp(microW,EOS_)) eos=true; VERBOSE(2,"\nmicroW = " << microW << "\n" << "macroW = " << macroW << "\n" << "microMacroMapN = " << microMacroMapN << "\n" << "code of micro = " << getDict()->getcode(microW) << "\n" << "code of macro = " << lmtable::getDict()->getcode(macroW) << "\n"); microMacroMapN++; } if ((microMacroMapN == 0) && (selectedField == -1)) error((char*)"ERROR: with no field selection, a map for the whole string is mandatory\n"); if (microMacroMapN>0) { // Add -> to map if missing if (!bos) { getDict()->encode(BOS_); if (microMacroMapN && !(microMacroMapN%BUFSIZ)) microMacroMap = (int *)reallocf(microMacroMap, sizeof(int)*(microMacroMapN+BUFSIZ)); 
microMacroMap[microMacroMapN++] = lmtable::getDict()->getcode(BOS_); } // Add -> to map if missing if (!eos) { getDict()->encode(EOS_); if (microMacroMapN && !(microMacroMapN%BUFSIZ)) microMacroMap = (int *)reallocf(microMacroMap, sizeof(int)*(microMacroMapN+BUFSIZ)); microMacroMap[microMacroMapN++] = lmtable::getDict()->getcode(EOS_); } } // getDict()->incflag(0); VERBOSE(2,"oovcode(micro)=" << getDict()->oovcode() << "\n" << "oovcode(macro)=" << lmtable::getDict()->oovcode() << "\n" << "microMacroMapN = " << microMacroMapN << "\n" << "macrodictsize = " << getDict()->size() << "\n" << "microdictsize = " << lmtable::getDict()->size() << "\n"); IFVERBOSE(2) { for (int i=0; idecode(i) << "] {"<< i << "} -> " << lmtable::getDict()->decode(microMacroMap[i]) << " {" << microMacroMap[i]<< "}" << "\n"); } } std::cerr << "...done\n"; } // double lmmacro::lprob(ngram micro_ng) double lmmacro::lprob(ngram micro_ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow) { VERBOSE(2,"lmmacro::lprob, parameter = <" << micro_ng << ">\n"); ngram macro_ng(lmtable::getDict()); if (micro_ng.dict == macro_ng.dict) macro_ng.trans(micro_ng); // micro to macro mapping already done else map(µ_ng, ¯o_ng); // mapping required VERBOSE(3,"lmmacro::lprob: micro_ng = " << micro_ng << "\n" << "lmmacro::lprob: macro_ng = " << macro_ng << "\n"); // ask LM with macro double prob; prob = lmtable::lprob(macro_ng, bow, bol, maxsuffidx, maxsuffptr, statesize, extendible, lastbow); VERBOSE(3,"prob = " << prob << "\n"); return prob; }; double lmmacro::clprob(ngram micro_ng, double* bow, int* bol, ngram_state_t* ngramstate, char** state,unsigned int* statesize,bool* extendible, double* lastbow) { VERBOSE(3," lmmacro::clprob(ngram), parameter = <" << micro_ng << ">\n"); ngram transformed_ng(lmtable::getDict()); bool collapsed = transform(micro_ng, transformed_ng); VERBOSE(3,"lmmacro::clprob(ngram), transformed_ng = <" << 
transformed_ng << ">\n"); double logpr; if (collapsed) { // the last token of the ngram continues an already open "chunk" // the probability at chunk-level is not computed because it has been already computed when the actual"chunk" opens VERBOSE(3," SKIPPED call to lmtable::clprob because of collapse; logpr: 0.0\n"); logpr = 0.0; } else { VERBOSE(3," QUERY MACRO LM on (after transformation and size reduction) " << transformed_ng << "\n"); // logpr = lmtable::clprob(transformed_ng, bow, bol, state, statesize, extendible); logpr = lmtable::clprob(transformed_ng, bow, bol, ngramstate, state, statesize, extendible, lastbow); } VERBOSE(3," GET logpr: " << logpr << "\n"); return logpr; } bool lmmacro::transform(ngram &in, ngram &out) { VERBOSE(3,"lmmacro::transform(ngram &in, ngram &out), in = <" << in << ">\n"); //step 1: selection of the correct field ngram field_ng(getDict()); if (selectedField >= 0) field_selection(in, field_ng); else field_ng = in; //step 2: collapsing ngram collapsed_ng(getDict()); bool collapsed = false; if (collapseFlag) collapsed = collapse(field_ng, collapsed_ng); else collapsed_ng = field_ng; //step 3: mapping using the loaded map if (mapFlag) mapping(collapsed_ng, out); else out.trans(collapsed_ng); if (out.size>lmtable::maxlevel()) out.size=lmtable::maxlevel(); VERBOSE(3,"lmmacro::transform(ngram &in, ngram &out), out = <" << out << ">\n"); return collapsed; } void lmmacro::field_selection(ngram &in, ngram &out) { VERBOSE(3,"In lmmacro::field_selection(ngram &in, ngram &out) in = " << in << "\n"); int microsize = in.size; for (int i=microsize; i>0; i--) { char curr_token[BUFSIZ]; strcpy(curr_token, getDict()->decode(*in.wordp(i))); char *field; if (strcmp(curr_token,"") && strcmp(curr_token,"") && strcmp(curr_token,"_unk_")) { field = strtok(curr_token, "#"); int j=0; while (j1; i--) { curr_code = *in.wordp(i); if (microMacroMap[curr_code] != microMacroMap[prev_code]) { out.pushc(curr_code); } else { if (!(collapsableMap[curr_code] && 
collapsatorMap[prev_code])) { out.pushc(prev_code); } } prev_code = curr_code; } // and insert the most recent token out.pushc(*in.wordp(1)); VERBOSE(3,"In lmmacro::collapse(ngram &in, ngram &out) out = " << out << "\n"); return false; } void lmmacro::mapping(ngram &in, ngram &out) { VERBOSE(3,"In lmmacro::mapping(ngram &in, ngram &out) in = " << in << "\n"); int microsize = in.size; // map microtag sequence (in) into the corresponding sequence of macrotags (possibly shorter) (out) for (int i=microsize; i>0; i--) { int in_code = *in.wordp(i); int out_code; if (in_code < microMacroMapN) out_code = microMacroMap[in_code]; else out_code = lmtable::getDict()->oovcode(); out.pushc(out_code); } VERBOSE(3,"In lmmacro::mapping(ngram &in, ngram &out) out = " << out << "\n"); return; } //maxsuffptr returns the largest suffix of an n-gram that is contained //in the LM table. This can be used as a compact representation of the //(n-1)-gram state of a n-gram LM. if the input k-gram has k>=n then it //is trimmed to its n-1 suffix. 
const char *lmmacro::maxsuffptr(ngram micro_ng, unsigned int* size) { ngram macro_ng(lmtable::getDict()); if (micro_ng.dict == macro_ng.dict) macro_ng.trans(micro_ng); // micro to macro mapping already done else map(µ_ng, ¯o_ng); // mapping required VERBOSE(2,"lmmacro::lprob: micro_ng = " << micro_ng << "\n" << "lmmacro::lprob: macro_ng = " << macro_ng << "\n"); return lmtable::maxsuffptr(macro_ng,size); } /* const char *lmmacro::cmaxsuffptr(ngram micro_ng, unsigned int* size) { //cerr << "lmmacro::CMAXsuffptr\n"; //cerr << "micro_ng: " << micro_ng // << " -> micro_ng.size: " << micro_ng.size << "\n"; //the LM working on the selected field = 0 //contributes to the LM state // if (selectedField>0) return NULL; ngram macro_ng(lmtable::getDict()); if (micro_ng.dict == macro_ng.dict) macro_ng.trans(micro_ng); // micro to macro mapping already done else map(µ_ng, ¯o_ng); // mapping required VERBOSE(2,"lmmacro::lprob: micro_ng = " << micro_ng << "\n" << "lmmacro::lprob: macro_ng = " << macro_ng << "\n") return lmtable::cmaxsuffptr(macro_ng,size); } */ ngram_state_t lmmacro::maxsuffidx(ngram micro_ng, unsigned int* size) { //cerr << "lmmacro::CMAXsuffptr\n"; //cerr << "micro_ng: " << micro_ng // << " -> micro_ng.size: " << micro_ng.size << "\n"; //the LM working on the selected field = 0 //contributes to the LM state // if (selectedField>0) return NULL; ngram macro_ng(lmtable::getDict()); if (micro_ng.dict == macro_ng.dict) macro_ng.trans(micro_ng); // micro to macro mapping already done else map(µ_ng, ¯o_ng); // mapping required VERBOSE(2,"lmmacro::lprob: micro_ng = " << micro_ng << "\n" << "lmmacro::lprob: macro_ng = " << macro_ng << "\n") return lmtable::cmaxsuffidx(macro_ng,size); } /* ngram_state_t lmmacro::cmaxsuffidx(ngram micro_ng, unsigned int* size) { //cerr << "lmmacro::CMAXsuffptr\n"; //cerr << "micro_ng: " << micro_ng // << " -> micro_ng.size: " << micro_ng.size << "\n"; //the LM working on the selected field = 0 //contributes to the LM state // if 
(selectedField>0) return NULL; ngram macro_ng(lmtable::getDict()); if (micro_ng.dict == macro_ng.dict) macro_ng.trans(micro_ng); // micro to macro mapping already done else map(µ_ng, ¯o_ng); // mapping required VERBOSE(2,"lmmacro::lprob: micro_ng = " << micro_ng << "\n" << "lmmacro::lprob: macro_ng = " << macro_ng << "\n") return lmtable::cmaxsuffidx(macro_ng,size); } */ void lmmacro::map(ngram *in, ngram *out) { VERBOSE(2,"In lmmacro::map, in = " << *in << endl << " (selectedField = " << selectedField << " )\n"); if (selectedField==-2) // the whole token is compatible with the LM words One2OneMapping(in, out); else if (selectedField==-1) // the whole token has to be mapped before querying the LM Micro2MacroMapping(in, out); else if (selectedField<10) { // select the field "selectedField" from tokens (separator is assumed to be "#") ngram field_ng(((lmmacro *)this)->getDict()); int microsize = in->size; for (int i=microsize; i>0; i--) { char curr_token[BUFSIZ]; strcpy(curr_token, ((lmmacro *)this)->getDict()->decode(*(in->wordp(i)))); char *field; if (strcmp(curr_token,"") && strcmp(curr_token,"") && strcmp(curr_token,"_unk_")) { field = strtok(curr_token, "#"); int j=0; while (j0) Micro2MacroMapping(&field_ng, out); else out->trans(field_ng); } else { #ifdef DLEXICALLM // selectedField>=10: tens=idx of micro tag (possibly to be mapped to // macro tag), unidx=idx of lemma to be concatenated by "_" to the // (mapped) tag int tagIdx = selectedField/10; int lemmaIdx = selectedField%10; // micro (or mapped to macro) sequence construction: ngram tag_ng(getDict()); char *lemmas[BUFSIZ]; int microsize = in->size; for (int i=microsize; i>0; i--) { char curr_token[BUFSIZ]; strcpy(curr_token, getDict()->decode(*(in->wordp(i)))); char *tag = NULL, *lemma = NULL; if (strcmp(curr_token,"") && strcmp(curr_token,"") && strcmp(curr_token,"_unk_")) { if (tagIdx0) Micro2MacroMapping(&tag_ng, out, lemmas); else out->trans(tag_ng); // qui si dovrebbero sostituire i tag con tag_lemma, 
senza mappatura! #endif } VERBOSE(2,"In lmmacro::map, FINAL out = " << *out << endl); } void lmmacro::One2OneMapping(ngram *in, ngram *out) { int insize = in->size; // map each token of the sequence "in" into the same-length sequence "out" through the map for (int i=insize; i>0; i--) { int curr_code = *(in->wordp(i)); const char *outtoken = lmtable::getDict()->decode((curr_codeoovcode()); out->pushw(outtoken); } return; } void lmmacro::Micro2MacroMapping(ngram *in, ngram *out) { int microsize = in->size; VERBOSE(2,"In Micro2MacroMapping, in = " << *in << "\n"); // map microtag sequence (in) into the corresponding sequence of macrotags (possibly shorter) (out) for (int i=microsize; i>0; i--) { int curr_code = *(in->wordp(i)); const char *curr_macrotag = lmtable::getDict()->decode((curr_codeoovcode()); if (i==microsize) { out->pushw(curr_macrotag); } else { int prev_code = *(in->wordp(i+1)); const char *prev_microtag = getDict()->decode(prev_code); const char *curr_microtag = getDict()->decode(curr_code); const char *prev_macrotag = lmtable::getDict()->decode((prev_codeoovcode()); int prev_len = strlen(prev_microtag)-1; int curr_len = strlen(curr_microtag)-1; if (strcmp(curr_macrotag,prev_macrotag) != 0 || !( (( prev_microtag[prev_len]== '(' || ( prev_microtag[0]== '(' && prev_microtag[prev_len]!= ')' )) && ( curr_microtag[curr_len]==')' && curr_microtag[0]!='(')) || (( prev_microtag[prev_len]== '(' || ( prev_microtag[0]== '(' && prev_microtag[prev_len]!= ')' )) && curr_microtag[curr_len]=='+' ) || (prev_microtag[prev_len]== '+' && curr_microtag[curr_len]=='+' ) || (prev_microtag[prev_len]== '+' && ( curr_microtag[curr_len]==')' && curr_microtag[0]!='(' )))) out->pushw(curr_macrotag); } } return; } // DISMITTED ON FEB 2011 BECAUSE TOO MUCH PROBLEMATIC FROM A THEORETICAL POINT OF VIEW #ifdef DLEXICALLM void lmmacro::Micro2MacroMapping(ngram *in, ngram *out, char **lemmas) { VERBOSE(2,"In Micro2MacroMapping, in = " << *in << "\n") int microsize = in->size; IFVERBOSE(3) 
{ VERBOSE(3,"In Micro2MacroMapping, lemmas:\n"); if (lexicaltoken2classMap) for (int i=microsize; i>0; i--) VERBOSE(3,"lemmas[" << i << "]=" << lemmas[i] << " -> class -> " << lexicaltoken2classMap[lmtable::getDict()->encode(lemmas[i])] << endl); else for (int i=microsize; i>0; i--) VERBOSE(3,"lemmas[" << i << "]=" << lemmas[i] << endl); } // map microtag sequence (in) into the corresponding sequence of macrotags (possibly shorter) (out) char tag_lemma[BUFSIZ]; for (int i=microsize; i>0; i--) { int curr_code = *(in->wordp(i)); const char *curr_microtag = getDict()->decode(curr_code); const char *curr_lemma = lemmas[i]; const char *curr_macrotag = lmtable::getDict()->decode((curr_codeoovcode()); int curr_len = strlen(curr_microtag)-1; if (i==microsize) { if (( curr_microtag[curr_len]=='(' ) || ( curr_microtag[0]=='(' && curr_microtag[curr_len]!=')' ) || ( curr_microtag[curr_len]=='+' )) sprintf(tag_lemma, "%s", curr_macrotag); // non lessicalizzo il macrotag se sono ancora all''interno del chunk else if (lexicaltoken2classMap) sprintf(tag_lemma, "%s_class%d", curr_macrotag, lexicaltoken2classMap[lmtable::getDict()->encode(curr_lemma)]); else sprintf(tag_lemma, "%s_%s", curr_macrotag, lemmas[microsize]); VERBOSE(2,"In Micro2MacroMapping, starting tag_lemma = >" << tag_lemma << "<\n"); out->pushw(tag_lemma); free(lemmas[microsize]); } else { int prev_code = *(in->wordp(i+1)); const char *prev_microtag = getDict()->decode(prev_code); const char *prev_macrotag = lmtable::getDict()->decode((prev_codeoovcode()); int prev_len = strlen(prev_microtag)-1; if (( curr_microtag[curr_len]=='(' ) || ( curr_microtag[0]=='(' && curr_microtag[curr_len]!=')' ) || ( curr_microtag[curr_len]=='+' )) sprintf(tag_lemma, "%s", curr_macrotag); // non lessicalizzo il macrotag se sono ancora all''interno del chunk else if (lexicaltoken2classMap) sprintf(tag_lemma, "%s_class%d", curr_macrotag, lexicaltoken2classMap[lmtable::getDict()->encode(curr_lemma)]); else sprintf(tag_lemma, "%s_%s", 
curr_macrotag, curr_lemma); VERBOSE(2,"In Micro2MacroMapping, tag_lemma = >" << tag_lemma << "<\n"); if (strcmp(curr_macrotag,prev_macrotag) != 0 || !( (( prev_microtag[prev_len]== '(' || ( prev_microtag[0]== '(' && prev_microtag[prev_len]!=')' )) && curr_microtag[curr_len]==')' && curr_microtag[0]!='(') || (( prev_microtag[prev_len]== '(' || ( prev_microtag[0]== '(' && prev_microtag[prev_len]!= ')')) && curr_microtag[curr_len]=='+' ) || (prev_microtag[prev_len]== '+' && curr_microtag[curr_len]=='+' ) || (prev_microtag[prev_len]== '+' && curr_microtag[curr_len]==')' && curr_microtag[0]!='(' ))) { VERBOSE(2,"In Micro2MacroMapping, before pushw, out = " << *out << endl); out->pushw(tag_lemma); VERBOSE(2,"In Micro2MacroMapping, after pushw, out = " << *out << endl); } else { VERBOSE(2,"In Micro2MacroMapping, before shift, out = " << *out << endl); out->shift(); VERBOSE(2,"In Micro2MacroMapping, after shift, out = " << *out << endl); out->pushw(tag_lemma); VERBOSE(2,"In Micro2MacroMapping, after push, out = " << *out << endl); } free(lemmas[i]); } } return; } void lmmacro::loadLexicalClasses(const char *fn) { char line[MAX_LINE]; const char* words[MAX_TOKEN_N_MAP]; int tokenN; lexicaltoken2classMap = (int *)calloc(BUFSIZ, sizeof(int)); lexicaltoken2classMapN = BUFSIZ; lmtable::getDict()->incflag(1); inputfilestream inp(fn); while (inp.getline(line,MAX_LINE,'\n')) { tokenN = parseWords(line,words,MAX_TOKEN_N_MAP); if (tokenN != 2) error((char*)"ERROR: wrong format of lexical classes file\n"); else { int classIdx = atoi(words[1]); int wordCode = lmtable::getDict()->encode(words[0]); if (wordCode>=lexicaltoken2classMapN) { int r = (wordCode-lexicaltoken2classMapN)/BUFSIZ; lexicaltoken2classMapN += (r+1)*BUFSIZ; lexicaltoken2classMap = (int *)reallocf(lexicaltoken2classMap, sizeof(int)*lexicaltoken2classMapN); } lexicaltoken2classMap[wordCode] = classIdx; } } lmtable::getDict()->incflag(0); IFVERBOSE(3) { for (int x=0; xsize(); x++) VERBOSE(3,"class of <" << 
lmtable::getDict()->decode(x) << "> (code=" << x << ") = " << lexicaltoken2classMap[x] << endl); } return; } void lmmacro::cutLex(ngram *in, ngram *out) { *out=*in; const char *curr_macro = out->dict->decode(*(out->wordp(1))); out->shift(); const char *p = strrchr(curr_macro, '_'); int lexLen; if (p) lexLen=strlen(p); else lexLen=0; char curr_NoLexMacro[BUFSIZ]; memset(&curr_NoLexMacro,0,BUFSIZ); strncpy(curr_NoLexMacro,curr_macro,strlen(curr_macro)-lexLen); out->pushw(curr_NoLexMacro); return; } #endif }//namespace irstlm irstlm-6.00.05/src/lmmacro.h000066400000000000000000000103351263213470300155710ustar00rootroot00000000000000// $Id: lmmacro.h 3461 2010-08-27 10:17:34Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #ifndef MF_LMMACRO_H #define MF_LMMACRO_H #ifndef WIN32 #include #include #endif #include "util.h" #include "ngramcache.h" #include "dictionary.h" #include "n_gram.h" #include "lmtable.h" #define MAX_TOKEN_N_MAP 5 namespace irstlm { class lmmacro: public lmtable { dictionary *dict; int maxlev; //max level of table int selectedField; bool collapseFlag; //flag for the presence of collapse bool mapFlag; //flag for the presence of map int microMacroMapN; int *microMacroMap; bool *collapsableMap; bool *collapsatorMap; #ifdef DLEXICALLM int selectedFieldForLexicon; int *lexicaltoken2classMap; int lexicaltoken2classMapN; #endif void loadmap(const std::string mapfilename); void unloadmap(); bool transform(ngram &in, ngram &out); void field_selection(ngram &in, ngram &out); bool collapse(ngram &in, ngram &out); void mapping(ngram &in, ngram &out); public: lmmacro(float nlf=0.0, float dlfi=0.0); ~lmmacro(); virtual void load(const std::string &filename,int mmap=0); virtual double lprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow); virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow); virtual const char *maxsuffptr(ngram ong, unsigned int* size=NULL); virtual ngram_state_t maxsuffidx(ngram ong, unsigned int* size=NULL); void map(ngram *in, ngram *out); void One2OneMapping(ngram *in, ngram *out); void Micro2MacroMapping(ngram *in, ngram *out); #ifdef DLEXICALLM void Micro2MacroMapping(ngram *in, ngram *out, char **lemma); void loadLexicalClasses(const char *fn); void cutLex(ngram *in, ngram 
*out); #endif inline bool is_OOV(int code) { ngram word_ng(getDict()); ngram field_ng(getDict()); word_ng.pushc(code); if (selectedField >= 0) field_selection(word_ng, field_ng); else field_ng = word_ng; int field_code=*field_ng.wordp(1); VERBOSE(2,"inline virtual bool lmmacro::is_OOV(int code) word_ng:" << word_ng << " field_ng:" << field_ng << std::endl); //the selected field(s) of a token is considered OOV //either if unknown by the microMacroMap //or if its mapped macroW is OOV if (field_code >= microMacroMapN) return true; VERBOSE(2,"inline virtual bool lmmacro::is_OOV(int code)*field_code:" << field_code << " microMacroMap[field_code]:" << microMacroMap[field_code] << " lmtable::dict->oovcode():" << lmtable::dict->oovcode() << std::endl); return (microMacroMap[field_code] == lmtable::dict->oovcode()); }; inline dictionary* getDict() const { return dict; } inline int maxlevel() const { return maxlev; }; inline virtual void dictionary_incflag(const bool flag) { dict->incflag(flag); }; inline virtual bool filter(const string sfilter, lmContainer* sublmt, const string skeepunigrams) { UNUSED(sfilter); UNUSED(sublmt); UNUSED(skeepunigrams); return false; } }; }//namespace irstlm #endif irstlm-6.00.05/src/lmtable.cpp000066400000000000000000002336571263213470300161300ustar00rootroot00000000000000// $Id: lmtable.cpp 3686 2010-10-15 11:55:32Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include #include #include #include #include #include "math.h" #include "mempool.h" #include "htable.h" #include "ngramcache.h" #include "dictionary.h" #include "n_gram.h" #include "lmContainer.h" #include "lmtable.h" #include "util.h" //special value for pruned iprobs #define NOPROB ((float)-1.329227995784915872903807060280344576e36) using namespace std; inline void error(const char* message) { VERBOSE(2,message << std::endl); throw std::runtime_error(message); } void print(prob_and_state_t* pst, std::ostream& out) { if (pst != NULL) { out << "PST ["; out << "logpr:" << pst->logpr; out << ",state:" << (void*) pst->state; out << ",statesize:" << pst->statesize; out << ",bow:" << pst->bow; out << ",bol:" << pst->bol; out << "]"; out << std::endl; } else { out << "PST [NULL]" << std::endl; } } namespace irstlm { //instantiate an empty lm table lmtable::lmtable(float nlf, float dlf):lmContainer() { ngramcache_load_factor = nlf; dictionary_load_factor = dlf; isInverted=false; configure(1,false); dict=new dictionary((char *)NULL,1000000,dictionary_load_factor); delete_dict=true; memset(table, 0, sizeof(table)); memset(tableGaps, 0, sizeof(tableGaps)); memset(cursize, 0, sizeof(cursize)); memset(tbltype, 0, sizeof(tbltype)); memset(maxsize, 0, sizeof(maxsize)); memset(tb_offset, 0, sizeof(maxsize)); memset(info, 0, sizeof(info)); memset(NumCenters, 0, sizeof(NumCenters)); max_cache_lev=0; for (int i=0; iclose(); delete cacheout; #endif for (int l=1; l<=maxlev; l++) { if (table[l]) { if (memmap > 0 && l >= memmap) Munmap(table[l]-tableGaps[l],cursize[l]*nodesize(tbltype[l])+tableGaps[l],0); 
else delete [] table[l]; } if (isQtable) { if (Pcenters[l]) delete [] Pcenters[l]; if (lstat(); } } #endif } void lmtable::stat_lmtcaches() { #ifdef PS_CACHE_ENABLE for (int i=2; i<=max_cache_lev; i++) { std::cout << "void lmtable::stat_lmtcaches() level:" << i << std::endl; if (lmtcache[i]) { lmtcache[i]->stat(); } } #endif } void lmtable::stat_caches() { #ifdef PS_CACHE_ENABLE stat_prob_and_state_cache(); #endif #ifdef LMT_CACHE_ENABLE stat_lmtcaches(); #endif } void lmtable::used_prob_and_state_cache() const { #ifdef PS_CACHE_ENABLE for (int i=1; i<=max_cache_lev; i++) { if (prob_and_state_cache[i]) { prob_and_state_cache[i]->used(); } } #endif } void lmtable::used_lmtcaches() const { #ifdef LMT_CACHE_ENABLE for (int i=2; i<=max_cache_lev; i++) { if (lmtcache[i]) { lmtcache[i]->used(); } } #endif } void lmtable::used_caches() const { #ifdef PS_CACHE_ENABLE used_prob_and_state_cache(); #endif #ifdef LMT_CACHE_ENABLE used_lmtcaches(); #endif } void lmtable::check_prob_and_state_cache_levels() const { #ifdef PS_CACHE_ENABLE for (int i=1; i<=max_cache_lev; i++) { if (prob_and_state_cache[i] && prob_and_state_cache[i]->isfull()) { prob_and_state_cache[i]->reset(prob_and_state_cache[i]->cursize()); } } #endif } void lmtable::check_lmtcaches_levels() const { #ifdef LMT_CACHE_ENABLE for (int i=2; i<=max_cache_lev; i++) { if (lmtcache[i] && lmtcache[i]->isfull()) { lmtcache[i]->reset(lmtcache[i]->cursize()); } } #endif } void lmtable::check_caches_levels() const { #ifdef PS_CACHE_ENABLE check_prob_and_state_cache_levels(); #endif #ifdef LMT_CACHE_ENABLE check_lmtcaches_levels(); #endif } void lmtable::reset_prob_and_state_cache() { #ifdef PS_CACHE_ENABLE for (int i=1; i<=max_cache_lev; i++) { if (prob_and_state_cache[i]) { prob_and_state_cache[i]->reset(MAX(prob_and_state_cache[i]->cursize(),prob_and_state_cache[i]->maxsize())); } } #endif } void lmtable::reset_lmtcaches() { #ifdef LMT_CACHE_ENABLE for (int i=2; i<=max_cache_lev; i++) { if (lmtcache[i]) { 
lmtcache[i]->reset(MAX(lmtcache[i]->cursize(),lmtcache[i]->maxsize())); } } #endif } void lmtable::reset_caches() { VERBOSE(2,"void lmtable::reset_caches()" << std::endl); #ifdef PS_CACHE_ENABLE reset_prob_and_state_cache(); #endif #ifdef LMT_CACHE_ENABLE reset_lmtcaches(); #endif } bool lmtable::are_prob_and_state_cache_active() const { #ifdef PS_CACHE_ENABLE if (max_cache_lev < 1) { return false; } for (int i=1; i<=max_cache_lev; i++) { if (prob_and_state_cache[i]==NULL) { return false; } } return true; // return prob_and_state_cache!=NULL; #else return false; #endif } bool lmtable::are_lmtcaches_active() const { #ifdef LMT_CACHE_ENABLE if (max_cache_lev < 2) { return false; } for (int i=2; i<=max_cache_lev; i++) { if (lmtcache[i]==NULL) { return false; } } return true; #else return false; #endif } bool lmtable::are_caches_active() const { return (are_prob_and_state_cache_active() && are_lmtcaches_active()); } void lmtable::configure(int n,bool quantized) { VERBOSE(2,"void lmtable::configure(int n,bool quantized) with n:" << n << std::endl); maxlev=n; VERBOSE(2," maxlev:" << maxlev << " maxlevel():" << maxlevel() << " this->maxlevel():" << this->maxlevel() << std::endl); //The value for index 0 is never used for (int i=0; i0) { //check whether memory mapping can be used #ifdef WIN32 mmap=0; //don't use memory map #endif } load(inp,infile.c_str(),NULL,mmap); getDict()->incflag(0); } void lmtable::load(istream& inp,const char* filename,const char* outfilename,int keep_on_disk) { VERBOSE(2,"lmtable::load(istream& inp,...)" << std::endl); #ifdef WIN32 if (keep_on_disk>0) { VERBOSE(2, "lmtable::load memory mapping not yet available under WIN32" << std::endl); keep_on_disk = 0; } #endif //give a look at the header to select loading method char header[MAX_LINE]; inp >> header; VERBOSE(2, header << std::endl); if (strncmp(header,"Qblmt",5)==0 || strncmp(header,"blmt",4)==0) { loadbin(inp,header,filename,keep_on_disk); } else { //input is in textual form if (keep_on_disk 
&& outfilename==NULL) { VERBOSE(2, "Load Error: inconsistent setting. Passed input file: textual. Memory map: yes. Outfilename: not specified." << std::endl); exit(0); } loadtxt(inp,header,outfilename,keep_on_disk); } VERBOSE(2, "OOV code is " << lmtable::getDict()->oovcode() << std::endl); } //load language model on demand through a word-list file int lmtable::reload(std::set words) { //build dictionary dictionary dict(NULL,(int)words.size()); dict.incflag(1); std::set::iterator w; for (w = words.begin(); w != words.end(); ++w) dict.encode((*w).c_str()); return 1; } void lmtable::load_centers(istream& inp,int Order) { char line[MAX_LINE]; //first read the coodebook VERBOSE(2, Order << " read code book " << std::endl); inp >> NumCenters[Order]; Pcenters[Order]=new float[NumCenters[Order]]; Bcenters[Order]=(Order> Pcenters[Order][c]; if (Order> Bcenters[Order][c]; }; //empty the last line inp.getline((char*)line,MAX_LINE); } void lmtable::loadtxt(istream& inp,const char* header,const char* outfilename,int mmap) { if (mmap>0) loadtxt_mmap(inp,header,outfilename); else { loadtxt_ram(inp,header); lmtable::getDict()->genoovcode(); } } void lmtable::loadtxt_mmap(istream& inp,const char* header,const char* outfilename) { char nameNgrams[BUFSIZ]; char nameHeader[BUFSIZ]; FILE *fd = NULL; table_pos_t filesize=0; int Order,n; //char *SepString = " \t\n"; unused //open input stream and prepare an input string char line[MAX_LINE]; //prepare word dictionary //dict=(dictionary*) new dictionary(NULL,1000000,NULL,NULL); lmtable::getDict()->incflag(1); //check the header to decide if the LM is quantized or not isQtable=(strncmp(header,"qARPA",5)==0?true:false); //check the header to decide if the LM table is incomplete isItable=(strncmp(header,"iARPA",5)==0?true:false); if (isQtable) { int maxlevel_h; //check if header contains other infos inp >> line; if (!(maxlevel_h=atoi(line))) { VERBOSE(2, "loadtxt with mmap requires new qARPA header. Please regenerate the file." 
<< std::endl); exit(1); } for (n=1; n<=maxlevel_h; n++) { inp >> line; if (!(NumCenters[n]=atoi(line))) { VERBOSE(2, "loadtxt with mmap requires new qARPA header. Please regenerate the file." << std::endl); exit(0); } } } //we will configure the table later we we know the maxlev; bool yetconfigured=false; VERBOSE(2,"loadtxtmmap()" << std::endl); // READ ARPA Header while (inp.getline(line,MAX_LINE)) { if (strlen(line)==MAX_LINE-1) { VERBOSE(2,"lmtable::loadtxt_mmap: input line exceed MAXLINE (" << MAX_LINE << ") chars " << line << std::endl); exit(1); } bool backslash = (line[0] == '\\'); if (sscanf(line, "ngram %d=%d", &Order, &n) == 2) { maxsize[Order] = n; maxlev=Order; //upadte Order VERBOSE(2,"size[" << Order << "]=" << maxsize[Order] << std::endl); } VERBOSE(2,"maxlev" << maxlev << std::endl); if (maxlev>requiredMaxlev) maxlev=requiredMaxlev; VERBOSE(2,"maxlev" << maxlev << std::endl); VERBOSE(2,"lmtable:requiredMaxlev" << requiredMaxlev << std::endl); if (backslash && sscanf(line, "\\%d-grams", &Order) == 1) { //at this point we are sure about the size of the LM if (!yetconfigured) { configure(maxlev,isQtable); yetconfigured=true; //opening output file strcpy(nameNgrams,outfilename); strcat(nameNgrams, "-ngrams"); fd = fopen(nameNgrams, "w+"); // compute the size of file (only for tables and - possibly - centroids; no header nor dictionary) for (int l=1; l<=maxlev; l++) { if (l1) table[1]=table[0] + (table_pos_t) (2 * NumCenters[1] * sizeof(float)); else table[1]=table[0] + (table_pos_t) (NumCenters[1] * sizeof(float)); */ for (int l=1; l<=maxlev; l++) { if (lincflag(0); lmtable::getDict()->genoovcode(); // saving header + dictionary strcpy(nameHeader,outfilename); strcat(nameHeader, "-header"); VERBOSE(2,"saving header+dictionary in " << nameHeader << "\n"); fstream out(nameHeader,ios::out); // print header if (isQtable) { out << "Qblmt" << (isInverted?"I ":" ") << maxlev; for (int i=1; i<=maxlev; i++) out << " " << maxsize[i]; // not cursize[i] because the 
file was already allocated out << "\nNumCenters"; for (int i=1; i<=maxlev; i++) out << " " << NumCenters[i]; out << "\n"; } else { out << "blmt" << (isInverted?"I ":" ") << maxlev; for (int i=1; i<=maxlev; i++) out << " " << maxsize[i]; // not cursize[i] because the file was already allocated out << "\n"; } lmtable::getDict()->save(out); out.close(); VERBOSE(2,"done" << std::endl); // cat header+dictionary and n-grams files: char cmd[BUFSIZ]; sprintf(cmd,"cat %s >> %s", nameNgrams, nameHeader); VERBOSE(2,"run cmd <" << cmd << std::endl); system(cmd); sprintf(cmd,"mv %s %s", nameHeader, outfilename); VERBOSE(2,"run cmd <" << cmd << std::endl); system(cmd); removefile(nameNgrams); //no more operations are available, the file must be saved! exit(0); return; } void lmtable::loadtxt_ram(istream& inp,const char* header) { //open input stream and prepare an input string char line[MAX_LINE]; //prepare word dictionary lmtable::getDict()->incflag(1); //check the header to decide if the LM is quantized or not isQtable=(strncmp(header,"qARPA",5)==0?true:false); //check the header to decide if the LM table is incomplete isItable=(strncmp(header,"iARPA",5)==0?true:false); //we will configure the table later when we will know the maxlev; bool yetconfigured=false; VERBOSE(2,"loadtxt_ram()" << std::endl); // READ ARPA Header int Order; unsigned int n; while (inp.getline(line,MAX_LINE)) { if (strlen(line)==MAX_LINE-1) { VERBOSE(2,"lmtable::loadtxt_ram: input line exceed MAXLINE (" << MAX_LINE << ") chars " << line << std::endl); exit(1); } bool backslash = (line[0] == '\\'); if (sscanf(line, "ngram %d=%u", &Order, &n) == 2) { maxsize[Order] = n; maxlev=Order; //update Order } if (maxlev>requiredMaxlev) maxlev=requiredMaxlev; if (backslash && sscanf(line, "\\%d-grams", &Order) == 1) { //at this point we are sure about the size of the LM if (!yetconfigured) { configure(maxlev,isQtable); yetconfigured=true; //allocate space for loading the table of this level for (int i=1; i<=maxlev; 
i++) table[i] = new char[(table_pos_t) maxsize[i] * nodesize(tbltype[i])]; } loadtxt_level(inp,Order); // now we can fix table at level Order - 1 if (maxlev>1 && Order>1) { checkbounds(Order-1); } } } lmtable::getDict()->incflag(0); VERBOSE(2,"done" << std::endl); } void lmtable::loadtxt_level(istream& inp, int level) { VERBOSE(2, level << "-grams: reading " << std::endl); if (isQtable) { load_centers(inp,level); } //allocate support vector to manage badly ordered n-grams if (maxlev>1 && level1)) { ing.invert(ng); ng=ing; } //if table is in incomplete ARPA format prob is just the //discounted frequency, so we need to add bow * Pr(n-1 gram) if (isItable && (level>1)) { //get bow of lower context get(ng,ng.size,ng.size-1); float rbow=0.0; if (ng.lev==ng.size-1) { //found context rbow=ng.bow; } int tmp=maxlev; maxlev=level-1; prob= log(exp((double)prob * M_LN10) + exp(((double)rbow + lprob(ng)) * M_LN10))/M_LN10; maxlev=tmp; } //insert an n-gram into the TRIE table if (isQtable) add(ng, (qfloat_t)prob, (qfloat_t)bow); else add(ng, prob, bow); } } VERBOSE(2, "done level " << level << std::endl); } void lmtable::expand_level(int level, table_entry_pos_t size, const char* outfilename, int mmap) { if (mmap>0) expand_level_mmap(level, size, outfilename); else { expand_level_nommap(level, size); } } void lmtable::expand_level_mmap(int level, table_entry_pos_t size, const char* outfilename) { maxsize[level]=size; //getting the level-dependent filename char nameNgrams[BUFSIZ]; sprintf(nameNgrams,"%s-%dgrams",outfilename,level); //opening output file FILE *fd = NULL; fd = fopen(nameNgrams, "w+"); if (fd == NULL) { perror("Error opening file for writing"); exit_error(IRSTLM_ERROR_IO, "Error opening file for writing"); } table_pos_t filesize=(table_pos_t) maxsize[level] * nodesize(tbltype[level]); // set the file to the proper size: ftruncate(fileno(fd),filesize); /* Now the file is ready to be mmapped. 
*/ table[level]=(char *)(MMap(fileno(fd),PROT_READ|PROT_WRITE,0,filesize,&tableGaps[level])); if (table[level] == MAP_FAILED) { fclose(fd); perror("Error mmapping the file"); exit_error(IRSTLM_ERROR_IO, "Error mmapping the file"); } if (maxlev>1 && level1 && level0) // printEntryN=(printEntryN " << dict->decode(word(tbl)) << " bw:" << bw << " bnd:" << bnd << " " << start << " tb_offset:" << tb_offset[level+1] << std::endl); tbl+=ndsz; } }else{ float p; for (table_entry_pos_t c=0; c " << dict->decode(word(tbl)) << std::endl); tbl+=ndsz; } } return; } //Checkbound with sorting of n-gram table on disk void lmtable::checkbounds(int level) { VERBOSE(2,"lmtable::checkbounds START Level:" << level << endl); if (getCurrentSize(level) > 0 ){ char* tbl=table[level]; char* succtbl=table[level+1]; LMT_TYPE ndt=tbltype[level]; LMT_TYPE succndt=tbltype[level+1]; int ndsz=nodesize(ndt); int succndsz=nodesize(succndt); //re-order table at level+1 on disk //generate random filename to avoid collisions std::string filePath; // ofstream out; mfstream out; createtempfile(out, filePath, ios::out|ios::binary); if (out.fail()) { perror("checkbound creating out on filePath"); exit(4); } table_entry_pos_t start,end,newend; table_entry_pos_t succ; //re-order table at level l+1 char* found; for (table_entry_pos_t c=0; c0) newend=boundwithoffset(found-ndsz,ndt,level); else newend=0; //if start==BOUND_EMPTY1 there are no successors for this entry if (start==BOUND_EMPTY1){ succ=0; } else{ MY_ASSERT(end>start); succ=end-start; } startpos[level][c]=newend; newend += succ; MY_ASSERT(newend<=cursize[level+1]); if (succ>0) { out.write((char*)(succtbl + (table_pos_t) start * succndsz),(table_pos_t) succ * succndsz); if (!out.good()) { VERBOSE(2," Something went wrong while writing temporary file " << filePath << " Maybe there is not enough space on this filesystem" << endl); out.close(); exit(2); removefile(filePath); } } boundwithoffset(found,ndt,newend,level); } out.close(); if (out.fail()) { 
perror("error closing out"); exit(4); } fstream inp(filePath.c_str(),ios::in|ios::binary); if (inp.fail()) { perror("error opening inp"); exit(4); } inp.read(succtbl,(table_pos_t) cursize[level+1]*succndsz); inp.close(); if (inp.fail()) { perror("error closing inp"); exit(4); } removefile(filePath); } VERBOSE(2,"lmtable::checkbounds END Level:" << level << endl); } //Add method inserts n-grams in the table structure. It is ONLY used during //loading of LMs in text format. It searches for the prefix, then it adds the //suffix to the last level and updates the start-end positions. int lmtable::addwithoffset(ngram& ng, float iprob, float ibow) { char *found; LMT_TYPE ndt=tbltype[1]; //default initialization int ndsz=nodesize(ndt); //default initialization static int no_more_msg = 0; if (ng.size>1) { // find the prefix starting from the first level table_entry_pos_t start=0; table_entry_pos_t end=cursize[1]; table_entry_pos_t position; for (int l=1; l //int lmtable::add(ngram& ng, TA iprob,TB ibow) int lmtable::add(ngram& ng, float iprob, float ibow) { char *found; LMT_TYPE ndt=tbltype[1]; //default initialization int ndsz=nodesize(ndt); //default initialization static int no_more_msg = 0; if (ng.size>1) { // find the prefix starting from the first level table_entry_pos_t start=0; table_entry_pos_t end=cursize[1]; table_entry_pos_t position; for (int l=1; l=2) cout << "searching entry for codeword: " << ngp[0] << "..."; ***/ //assume 1-grams is a 1-1 map of the vocabulary //CHECK: explicit cast of n into float because table_pos_t could be unsigned and larger than MAXINT if (lev==1) return *found=(*ngp < (float) n ? 
table[1] + (table_pos_t)*ngp * sz:NULL); //prepare table to be searched with mybsearch char* tb; tb=table[lev] + (table_pos_t) offs * sz; //prepare search pattern char w[LMTCODESIZE]; putmem(w,ngp[0],0,LMTCODESIZE); table_entry_pos_t idx=0; // index returned by mybsearch *found=NULL; //initialize output variable totbsearch[lev]++; switch(action) { case LMT_FIND: // if (!tb || !mybsearch(tb,n,sz,(unsigned char *)w,&idx)) return NULL; if (!tb || !mybsearch(tb,n,sz,w,&idx)) { return NULL; } else { // return *found=tb + (idx * sz); return *found=tb + ((table_pos_t)idx * sz); } default: error((char*)"lmtable::search: this option is available"); }; return NULL; } /* returns idx with the first position in ar with entry >= key */ int lmtable::mybsearch(char *ar, table_entry_pos_t n, int size, char *key, table_entry_pos_t *idx) { if (n==0) return 0; *idx=0; register table_entry_pos_t low=0, high=n; register unsigned char *p; int result; #ifdef INTERP_SEARCH char *lp=NULL; char *hp=NULL; #endif while (low < high) { #ifdef INTERP_SEARCH //use interpolation search only for intervals with at least 4096 entries if ((high-low)>=10000) { lp=(char *) (ar + (low * size)); if (codecmp((char *)key,lp)<0) { *idx=low; return 0; } hp=(char *) (ar + ((high-1) * size)); if (codecmp((char *)key,hp)>0) { *idx=high; return 0; } *idx= low + ((high-1)-low) * codediff((char *)key,lp)/codediff(hp,(char *)lp); } else #endif *idx = (low + high) / 2; //after redefining the interval there is no guarantee //that wlp <= wkey <= whigh p = (unsigned char *) (ar + (*idx * size)); result=codecmp((char *)key,(char *)p); if (result < 0) high = *idx; else if (result > 0) low = ++(*idx); else return 1; } *idx=low; return 0; } // generates a LM copy for a smaller dictionary void lmtable::cpsublm(lmtable* slmt, dictionary* subdict,bool keepunigr) { //keepunigr=false; //let slmt inherit all features of this lmtable slmt->configure(maxlev,isQtable); slmt->dict=new dictionary((keepunigr?dict:subdict),false); if 
(isQtable) { for (int i=1; i<=maxlev; i++) { slmt->NumCenters[i]=NumCenters[i]; slmt->Pcenters[i]=new float [NumCenters[i]]; memcpy(slmt->Pcenters[i],Pcenters[i],NumCenters[i] * sizeof(float)); if (iBcenters[i]=new float [NumCenters[i]]; memcpy(slmt->Bcenters[i],Bcenters[i],NumCenters[i] * sizeof(float)); } } } //manage dictionary information //generate OOV codes and build dictionary lookup table dict->genoovcode(); slmt->dict->genoovcode(); subdict->genoovcode(); int* lookup=new int [dict->size()]; for (int c=0; csize(); c++) { lookup[c]=subdict->encode(dict->decode(c)); if (c != dict->oovcode() && lookup[c] == subdict->oovcode()) lookup[c]=-1; // words of this->dict that are not in slmt->dict } //variables useful to navigate in the lmtable structure LMT_TYPE ndt,pndt; int ndsz,pndsz; char *entry, *newentry; table_entry_pos_t start, end, origin; for (int l=1; l<=maxlev; l++) { slmt->cursize[l]=0; slmt->table[l]=NULL; if (l==1) { //1-gram level ndt=tbltype[l]; ndsz=nodesize(ndt); for (table_entry_pos_t p=0; pcursize[l] % slmt->dict->size()) ==0) slmt->table[l]=(char *)reallocf(slmt->table[l],((table_pos_t) slmt->cursize[l] + (table_pos_t) slmt->dict->size()) * ndsz); newentry=slmt->table[l] + (table_pos_t) slmt->cursize[l] * ndsz; memcpy(newentry,entry,ndsz); if (!keepunigr) //do not change encoding if keepunigr is true slmt->word(newentry,lookup[word(entry)]); if (lbound(newentry,ndt,p); //store in bound the entry itself (**) !!!! 
slmt->cursize[l]++; } } } else { //n-grams n>1: scan lower order table pndt=tbltype[l-1]; pndsz=nodesize(pndt); ndt=tbltype[l]; ndsz=nodesize(ndt); for (table_entry_pos_t p=0; pcursize[l-1]; p++) { //determine start and end of successors of this entry origin=slmt->bound(slmt->table[l-1] + (table_pos_t)p * pndsz,pndt); //position of n-1 gram in this table (**) if (origin == 0) start=0; //succ start at first pos in table[l] else start=bound(table[l-1] + (table_pos_t)(origin-1) * pndsz,pndt);//succ start after end of previous entry end=bound(table[l-1] + (table_pos_t)origin * pndsz,pndt); //succ end where indicated if (!keepunigr || lookup[word(table[l-1] + (table_pos_t)origin * pndsz)]!=-1) { while (start < end) { entry=table[l] + (table_pos_t) start * ndsz; if (lookup[word(entry)]!=-1) { if ((slmt->cursize[l] % slmt->dict->size()) ==0) slmt->table[l]=(char *)reallocf(slmt->table[l],(table_pos_t) (slmt->cursize[l]+slmt->dict->size()) * ndsz); newentry=slmt->table[l] + (table_pos_t) slmt->cursize[l] * ndsz; memcpy(newentry,entry,ndsz); if (!keepunigr) //do not change encoding if keepunigr is true slmt->word(newentry,lookup[word(entry)]); if (lbound(newentry,ndt,start); //store in bound the entry itself!!!! 
slmt->cursize[l]++; } start++; } } //updated bound information of incoming entry slmt->bound(slmt->table[l-1] + (table_pos_t) p * pndsz, pndt,slmt->cursize[l]); } } } return; } // saves a LM table in text format void lmtable::savetxt(const char *filename) { fstream out(filename,ios::out); table_entry_pos_t cnt[1+MAX_NGRAM]; int l; // out.precision(7); out.precision(6); if (isQtable) { out << "qARPA " << maxlev; for (l=1; l<=maxlev; l++) out << " " << NumCenters[l]; out << endl; } ngram ng(lmtable::getDict(),0); VERBOSE(2, "savetxt: " << filename << std::endl); if (isPruned) ngcnt(cnt); //check size of table by considering pruned n-grams out << "\n\\data\\\n"; char buff[100]; for (l=1; l<=maxlev; l++) { sprintf(buff,"ngram %2d=%10d\n",l,(isPruned?cnt[l]:cursize[l])); out << buff; } out << "\n"; for (l=1; l<=maxlev; l++) { out << "\n\\" << l << "-grams:\n"; VERBOSE(2, "save: " << (isPruned?cnt[l]:cursize[l]) << " " << l << "-grams" << std::endl); if (isQtable) { out << NumCenters[l] << "\n"; for (int c=0; csave(out); for (int i=1; i<=maxlev; i++) { if (isQtable) { out.write((char*)Pcenters[i],NumCenters[i] * sizeof(float)); if (isave(out); } void lmtable::appendbin_level(int level, fstream &out, int mmap) { if (getCurrentSize(level) > 0 ){ if (mmap>0) appendbin_level_mmap(level, out); else { appendbin_level_nommap(level, out); } } } void lmtable::appendbin_level_nommap(int level, fstream &out) { VERBOSE(2,"lmtable:appendbin_level_nommap START Level:" << level << std::endl); /* if (isPruned){ VERBOSE(2,"savebin_level (level " << level << "): pruned LM cannot be saved in binary form" << std::endl); exit(0); } */ MY_ASSERT(level<=maxlev); // print header if (isQtable) { //NOT IMPLEMENTED } else { //do nothing } VERBOSE(3,"appending " << cursize[level] << " (maxsize:" << maxsize[level] << ") " << level << "-grams" << " table " << (void*) table << " table[level] " << (void*) table[level] << endl); if (isQtable) { //NOT IMPLEMENTED } out.write(table[level],(table_pos_t) 
cursize[level]*nodesize(tbltype[level])); if (!out.good()) { perror("Something went wrong while writing"); out.close(); exit(2); } VERBOSE(2,"lmtable:appendbin_level_nommap END Level:" << level << std::endl); } void lmtable::appendbin_level_mmap(int level, fstream &out) { UNUSED(out); VERBOSE(2,"appending " << level << " (Actually do nothing)" << std::endl); } void lmtable::savebin_level(int level, const char* outfilename, int mmap) { if (mmap>0) savebin_level_mmap(level, outfilename); else { savebin_level_nommap(level, outfilename); } } void lmtable::savebin_level_nommap(int level, const char* outfilename) { VERBOSE(2,"lmtable:savebin_level_nommap START" << requiredMaxlev << std::endl); /* if (isPruned){ cerr << "savebin_level (level " << level << "): pruned LM cannot be saved in binary form\n"; exit(0); } */ MY_ASSERT(level<=maxlev); char nameNgrams[BUFSIZ]; sprintf(nameNgrams,"%s-%dgrams",outfilename,level); fstream out(nameNgrams, ios::out|ios::binary); if (out.fail()) { perror("cannot be opened"); exit(3); } // print header if (isQtable) { //NOT IMPLEMENTED } else { //do nothing } VERBOSE(3,"saving " << cursize[level] << "(maxsize:" << maxsize[level] << ") " << level << "-grams in " << nameNgrams << " table " << (void*) table << " table[level] " << (void*) table[level] << endl); if (isQtable) { //NOT IMPLEMENTED } out.write(table[level],(table_pos_t) cursize[level]*nodesize(tbltype[level])); if (!out.good()) { VERBOSE(2," Something went wrong while writing temporary file " << nameNgrams << endl); out.close(); removefile(nameNgrams); exit(2); } out.close(); if (out.fail()) { perror("cannot be closed"); exit(3); } VERBOSE(2,"lmtable:savebin_level_nommap END" << requiredMaxlev << std::endl); } void lmtable::savebin_level_mmap(int level, const char* outfilename) { char nameNgrams[BUFSIZ]; sprintf(nameNgrams,"%s-%dgrams",outfilename,level); VERBOSE(2,"saving " << level << "-grams probs in " << nameNgrams << " (Actually do nothing)" <> %s", fromnameNgrams, 
tonameNgrams); system(cmd); } //remove all single level files void lmtable::remove_all_levels(const char* filename){ //single level files should have a name derived from "filename" for (int i=1; i<=maxlevel(); i++) { remove_single_level(i,filename); } } //remove a single level file void lmtable::remove_single_level(int level, const char* filename){ //single level files should have a name derived from "filename" char nameNgrams[BUFSIZ]; sprintf(nameNgrams,"%s-%dgrams",filename,level); //removing temporary files removefile(nameNgrams); } //delete the table of a single level void lmtable::delete_level(int level, const char* outfilename, int mmap){ if (mmap>0) delete_level_mmap(level, outfilename); else { delete_level_nommap(level); } } void lmtable::delete_level_mmap(int level, const char* outfilename) { //getting the level-dependent filename char nameNgrams[BUFSIZ]; sprintf(nameNgrams,"%s-%dgrams",outfilename,level); //compute exact filesize table_pos_t filesize=(table_pos_t) cursize[level] * nodesize(tbltype[level]); // set the file to the proper size: Munmap(table[level]-tableGaps[level],(table_pos_t) filesize+tableGaps[level],0); maxsize[level]=cursize[level]=0; } void lmtable::delete_level_nommap(int level) { delete table[level]; maxsize[level]=cursize[level]=0; } void lmtable::compact_all_levels(const char* filename){ //single level files should have a name derived from "filename" for (int i=1; i<=maxlevel(); i++) { compact_single_level(i,filename); } } void lmtable::compact_single_level(int level, const char* filename) { char nameNgrams[BUFSIZ]; sprintf(nameNgrams,"%s-%dgrams",filename,level); VERBOSE(2,"concatenating " << level << "-grams probs from " << nameNgrams << " to " << filename<< std::endl); //concatenating of new table to the existing data char cmd[BUFSIZ]; sprintf(cmd,"cat %s >> %s", nameNgrams, filename); system(cmd); //removing temporary files removefile(nameNgrams); } void lmtable::resize_level(int level, const char* outfilename, int mmap) { if 
(getCurrentSize(level) > 0 ){ if (mmap>0) resize_level_mmap(level, outfilename); else { if (level> maxlev; //set the inverted falg to false, in order to rely on the header only isInverted=false; if (strncmp(header,"Qblmt",5)==0) { isQtable=true; if (strncmp(header,"QblmtI",6)==0) isInverted=true; } else if(strncmp(header,"blmt",4)==0) { isQtable=false; if (strncmp(header,"blmtI",5)==0) isInverted=true; } else error((char*)"loadbin: LM file is not in binary format"); configure(maxlev,isQtable); for (int l=1; l<=maxlev; l++) { inp >> cursize[l]; maxsize[l]=cursize[l]; } //update table offsets for (int l=2; l<=maxlev; l++) update_offset(l,tb_offset[l-1]+maxsize[l-1]); char header2[MAX_LINE]; if (isQtable) { inp >> header2; for (int i=1; i<=maxlev; i++) { inp >> NumCenters[i]; VERBOSE(2,"reading " << NumCenters[i] << " centers" << "\n"); } } inp.getline(header2, MAX_LINE); } //load codebook of level l void lmtable::loadbin_codebook(istream& inp,int l) { Pcenters[l]=new float [NumCenters[l]]; inp.read((char*)Pcenters[l],NumCenters[l] * sizeof(float)); if (lrequiredMaxlev) maxlev=requiredMaxlev; VERBOSE(3,"lmtable::maxlev:" << maxlev << std::endl); VERBOSE(3,"lmtable::requiredMaxlev" << requiredMaxlev << std::endl); //if MMAP is used, then open the file if (filename && mmap>0) { #ifdef WIN32 error("lmtable::loadbin mmap facility not yet supported under WIN32\n"); #else if (mmap <= maxlev) memmap=mmap; else error((char*)"keep_on_disk value is out of range\n"); if ((diskid=open(filename, O_RDONLY))<0) { VERBOSE(2,"cannot open " << filename << std::endl); error((char*)"dying"); } //check that the LM is uncompressed char miniheader[4]; read(diskid,miniheader,4); if (strncmp(miniheader,"Qblm",4) && strncmp(miniheader,"blmt",4)) error((char*)"mmap functionality does not work with compressed binary LMs\n"); #endif } for (int l=1; l<=maxlev; l++) { loadbin_level(inp,l); } VERBOSE(2,"done" << std::endl); } //load only the dictionary of a binary lmfile void 
lmtable::loadbin_dict(istream& inp) { VERBOSE(2,"lmtable::loadbin_dict()" << std::endl); lmtable::getDict()->load(inp); VERBOSE(2,"dict->size(): " << lmtable::getDict()->size() << std::endl); } //load ONE level of a binary lmfile void lmtable::loadbin_level(istream& inp, int level) { VERBOSE(2,"loadbin_level (level " << level << std::endl); if (isQtable) { loadbin_codebook(inp,level); } if ((memmap == 0) || (level < memmap)) { VERBOSE(2,"loading " << cursize[level] << " " << level << "-grams" << std::endl); table[level]=new char[(table_pos_t) cursize[level] * nodesize(tbltype[level])]; inp.read(table[level],(table_pos_t) cursize[level] * nodesize(tbltype[level])); } else { #ifdef WIN32 error((char*)"mmap not available under WIN32\n"); #else VERBOSE(2,"mapping " << cursize[level] << " " << level << "-grams" << std::endl); tableOffs[level]=inp.tellg(); table[level]=(char *)MMap(diskid,PROT_READ, tableOffs[level], (table_pos_t) cursize[level]*nodesize(tbltype[level]), &tableGaps[level]); table[level]+=(table_pos_t) tableGaps[level]; VERBOSE(2,"tableOffs " << tableOffs[level] << " tableGaps" << tableGaps[level] << "-grams" << std::endl); inp.seekg((table_pos_t) cursize[level]*nodesize(tbltype[level]),ios_base::cur); #endif } VERBOSE(2,"done (level " << level << std::endl); } int lmtable::get(ngram& ng,int n,int lev) { totget[lev]++; if (lev > maxlev) error((char*)"get: lev exceeds maxlevel"); if (n < lev) error((char*)"get: ngram is too small"); //set boudaries for 1-gram table_entry_pos_t offset=0,limit=cursize[1]; //information of table entries char* found; LMT_TYPE ndt; ng.link=NULL; ng.lev=0; for (int l=1; l<=lev; l++) { //initialize entry information found = NULL; ndt=tbltype[l]; #ifdef LMT_CACHE_ENABLE bool hit = false; if (lmtcache[l] && lmtcache[l]->get(ng.wordp(n),found)) { hit=true; } else { search(l, offset, (limit-offset), nodesize(ndt), ng.wordp(n-l+1), LMT_FIND, &found); } //insert both found and not found items!!! 
// if (lmtcache[l] && hit==true) { //insert only not found items!!! if (lmtcache[l] && hit==false) { const char* found2=found; lmtcache[l]->add(ng.wordp(n),found2); } #else search(l, offset, (limit-offset), nodesize(ndt), ng.wordp(n-l+1), LMT_FIND, &found); #endif if (!found) return 0; float pr = prob(found,ndt); if (pr==NOPROB) return 0; //pruned n-gram ng.path[l]=found; //store path of found entries ng.bow=(l0) { *cacheout << sentence_id << " miss " << ng << " " << ng.link << "\n"; } #endif return 1; } //recursively prints the language model table void lmtable::dumplm(fstream& out,ngram ng, int ilev, int elev, table_entry_pos_t ipos,table_entry_pos_t epos) { LMT_TYPE ndt=tbltype[ilev]; ngram ing(ng.dict); int ndsz=nodesize(ndt); MY_ASSERT(ng.size==ilev-1); //Note that ipos and epos are always larger than or equal to 0 because they are unsigned int MY_ASSERT(epos<=cursize[ilev]); MY_ASSERT(ipos0?bound(table[ilev]+ (table_pos_t) (i-1) * ndsz,ndt):0); table_entry_pos_t esucc=bound(found,ndt); if (isucc < esucc) //there are successors! dumplm(out,ng,ilev+1,elev,isucc,esucc); } else { out << ipr <<"\t"; // if table is inverted then revert n-gram if (isInverted && (ng.size>1)) { ing.invert(ng); for (int k=ing.size; k>=1; k--) { if (kdecode(*ing.wordp(k)); } } else { for (int k=ng.size; k>=1; k--) { if (kdecode(*ng.wordp(k)); } } if (ilevUPPER_SINGLE_PRECISION_OF_0 || ibo<-UPPER_SINGLE_PRECISION_OF_0)) out << "\t" << ibo; } } out << "\n"; } } } //succscan iteratively returns all successors of an ngram h for which //get(h,h.size,h.size) returned true. 
int lmtable::succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev) { MY_ASSERT(lev==h.lev+1 && h.size==lev && lev<=maxlev); LMT_TYPE ndt=tbltype[h.lev]; int ndsz=nodesize(ndt); table_entry_pos_t offset; switch (action) { case LMT_INIT: //reset ngram local indexes ng.size=lev; ng.trans(h); //get number of successors of h ng.midx[lev]=0; offset=(h.link>table[h.lev]?bound(h.link-ndsz,ndt):0); h.succ=bound(h.link,ndt)-offset; h.succlink=table[lev]+(table_pos_t) offset * nodesize(tbltype[lev]); return 1; case LMT_CONT: if (ng.midx[lev] < h.succ) { //put current word into ng *ng.wordp(1)=word(h.succlink+(table_pos_t) ng.midx[lev]*nodesize(tbltype[lev])); ng.midx[lev]++; return 1; } else return 0; default: exit_error(IRSTLM_ERROR_MODEL, "succscan: only permitted options are LMT_INIT and LMT_CONT"); } return 0; } ngram_state_t lmtable::convert(const char* suffptr, size_t lev){ int ndsz=nodesize(tbltype[lev]); ngram_state_t suffidx=0; if (suffptr){ suffidx = (ngram_state_t) ( ((table_pos_t) suffptr - (table_pos_t) table[lev]) / ndsz ) + tb_offset[lev] + 1; //added 1 to distinguish from zero-ngram } return suffidx; } //maxsuffptr returns the largest suffix of an n-gram that is contained //in the LM table. This can be used as a compact representation of the //(n-1)-gram state of a n-gram LM. If the input k-gram has k>=n then it //is trimmed to its n-1 suffix. //non recursive version const char *lmtable::maxsuffptr(ngram ong, unsigned int* size) { VERBOSE(3,"const char *lmtable::maxsuffptr(ngram ong, unsigned int* size) ong:|" << ong <<"|\n"); if (ong.size==0) { if (size!=NULL) *size=0; return (char*) NULL; } if (isInverted) { if (ong.size>maxlev) ong.size=maxlev; //if larger than maxlen reduce size ngram ing=ong; //inverted ngram ing.invert(ong); get(ing,ing.size,ing.size); // dig in the trie if (ing.lev > 0) { //found something? 
unsigned int isize = MIN(ing.lev,(ing.size-1)); //find largest n-1 gram suffix if (size!=NULL) *size=isize; return ing.path[isize]; } else { // means a real unknown word! if (size!=NULL) *size=0; //default statesize for zero-gram! return NULL; //default stateptr for zero-gram! } } else { if (ong.size>0) ong.size--; //always reduced by 1 word if (ong.size>=maxlev) ong.size=maxlev-1; //if still larger or equals to maxlen reduce again if (size!=NULL) *size=ong.size; //will return the largest found ong.size for (ngram ng=ong; ng.size>0; ng.size--) { if (get(ng,ng.size,ng.size)) { // if (ng.succ==0) (*size)--; // if (size!=NULL) *size=ng.size; if (size!=NULL) { if (ng.succ==0) *size=ng.size-1; else *size=ng.size; } return ng.link; } } if (size!=NULL) *size=0; return NULL; } } const char *lmtable::cmaxsuffptr(ngram ong, unsigned int* size) { VERBOSE(3,"const char *lmtable::cmaxsuffptr(ngram ong, unsigned int* size) ong:|" << ong << "|\n"); if (ong.size==0) { if (size!=NULL) *size=0; return (char*) NULL; } if (size!=NULL) *size=ong.size; //will return the largest found ong.size #ifdef PS_CACHE_ENABLE prob_and_state_t pst; size_t orisize=ong.size; if (ong.size>=maxlev) ong.size=maxlev; //cache hit // if (prob_and_state_cache && ong.size==maxlev && prob_and_state_cache->get(ong.wordp(maxlev),pst)) { if (prob_and_state_cache[ong.size] && prob_and_state_cache[ong.size]->get(ong.wordp(ong.size),pst)) { *size=pst.statesize; return pst.state; } ong.size = orisize; //cache miss unsigned int isize; //internal state size variable char* found=(char *)maxsuffptr(ong,&isize); ngram_state_t msidx = convert(found,isize); //cache insert //IMPORTANT: this function updates only two fields (state, statesize) of the entry of the cache; the reminaing fields (logpr, bow, bol, extendible) are undefined; hence, it should not be used before the corresponding clprob() if (ong.size>=maxlev) ong.size=maxlev; // if (prob_and_state_cache && ong.size==maxlev) { if (prob_and_state_cache[ong.size]) { 
pst.state=found; pst.ngramstate=msidx; pst.statesize=isize; // prob_and_state_cache->add(ong.wordp(maxlev),pst); prob_and_state_cache[ong.size]->add(ong.wordp(ong.size),pst); } if (size!=NULL) *size=isize; return found; #else return (char *)maxsuffptr(ong,size); #endif } //maxsuffidx returns an index of the largest of an n-gram that is contained //in the LM table. This can be used as a compact representation of the //(n-1)-gram state of a n-gram LM. If the input k-gram has k>=n then it //is trimmed to its n-1 suffix. //non recursive version //It relies on the computation of maxsuffptr ngram_state_t lmtable::maxsuffidx(ngram ong, unsigned int* size) { VERBOSE(3,"ngram_state_t lmtable::maxsuffidx(ngram ong, unsigned int* size) ong:|" << ong << "|\n"); unsigned int isize; const char* suffptr = cmaxsuffptr(ong,&isize); if (size) *size=isize; return convert(suffptr,isize); } ngram_state_t lmtable::cmaxsuffidx(ngram ong, unsigned int* size) { VERBOSE(3,"ngram_state_t lmtable::cmaxsuffidx(ngram ong, unsigned int* size) ong:|" << ong << "|\n"); if (ong.size==0) { if (size!=NULL) *size=0; return 0; } if (size!=NULL) *size=ong.size; //will return the largest found ong.size #ifdef PS_CACHE_ENABLE prob_and_state_t pst; size_t orisize=ong.size; if (ong.size>=maxlev) ong.size=maxlev; //cache hit // if (prob_and_state_cache && ong.size==maxlev && prob_and_state_cache->get(ong.wordp(maxlev),pst)) { if (prob_and_state_cache[ong.size] && prob_and_state_cache[ong.size]->get(ong.wordp(ong.size),pst)) { *size=pst.statesize; return pst.ngramstate; } ong.size = orisize; //cache miss unsigned int isize; //internal state size variable char* msptr = cmaxsuffptr(ong,&isize); ngram_state_t msidx = convert(suffptr,isize); //cache insert //IMPORTANT: this function updates only two fields (ngramstate, statesize) of the entry of the cache; the reminaing fields (logpr, bow, bol, extendible) are undefined; hence, it should not be used before the corresponding clprob() if (ong.size>=maxlev) 
ong.size=maxlev; // if (prob_and_state_cache && ong.size==maxlev) { if (prob_and_state_cache[ong.size]) { pst.state=found; pst.ngramstate=msidx; pst.statesize=isize; // prob_and_state_cache->add(ong.wordp(maxlev),pst); prob_and_state_cache[ong.size]->add(ong.wordp(ong.size),pst); } if (size!=NULL) *size=isize; return msidx; #else return maxsuffidx(ong,size); #endif } //returns log10prob of n-gram //bow: backoff weight //bol: backoff level //additional infos related to use in Moses: //maxsuffptr: recombination state after the LM call //statesize: lenght of the recombination state //extensible: true if the deepest found ngram has successors //lastbow: bow of the deepest found ngram //non recursive version, also includes maxsuffptr and maxsuffidx double lmtable::lprob(ngram ong,double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr,unsigned int* statesize,bool* extendible, double *lastbow) { VERBOSE(3," lmtable::lprob(ngram) ong |" << ong << "|\n" << std::endl); if (ong.size==0){ //sanity check if (maxsuffptr) *maxsuffptr=NULL; if (maxsuffidx) *maxsuffidx=0; return 0.0; } if (ong.size>maxlev) ong.size=maxlev; //adjust n-gram level to table size if (bow) *bow=0; //initialize back-off weight if (bol) *bol=0; //initialize bock-off level if (lastbow) *lastbow=0; //initialize back-off weight of the deepest found ngram double rbow=0,lpr=0; //output back-off weight and logprob float ibow,iprob; //internal back-off weight and logprob if (isInverted) { ngram ing=ong; //Inverted ngram TRIE ing.invert(ong); get(ing,ing.size,ing.size); // dig in the trie if (ing.lev >0) { //found something? 
iprob=ing.prob; lpr = (double)(isQtable?Pcenters[ing.lev][(qfloat_t)iprob]:iprob); if (*ong.wordp(1)==dict->oovcode()) lpr-=logOOVpenalty; //add OOV penalty size_t isize=MIN(ing.lev,(ing.size-1)); if (statesize) *statesize=isize; //find largest n-1 gram suffix char* suffptr=ing.path[isize]; if (maxsuffptr) *maxsuffptr=suffptr; if (maxsuffidx) *maxsuffidx = convert(suffptr,isize); if (extendible) *extendible=succrange(ing.path[ing.lev],ing.lev)>0; if (lastbow) *lastbow=(double) (isQtable?Bcenters[ing.lev][(qfloat_t)ing.bow]:ing.bow); } else { // means a real unknown word! lpr=-log(UNIGRAM_RESOLUTION)/M_LN10; if (statesize) *statesize=0; //default statesize for zero-gram! if (maxsuffptr) *maxsuffptr=NULL; //default stateptr for zero-gram! if (maxsuffidx) *maxsuffidx=0; //default state-value for zero-gram! } if (ing.lev < ing.size) { //compute backoff weight int depth=(ing.lev>0?ing.lev:1); //ing.lev=0 (real unknown word) is still a 1-gram if (bol) *bol=ing.size-depth; ing.size--; //get n-gram context get(ing,ing.size,ing.size); // dig in the trie if (ing.lev>0) { //found something? 
//collect back-off weights for (int l=depth; l<=ing.lev; l++) { //start from first back-off level MY_ASSERT(ing.path[l]!=NULL); //check consistency of table ibow=this->bow(ing.path[l],tbltype[l]); rbow+= (double) (isQtable?Bcenters[l][(qfloat_t)ibow]:ibow); //avoids bad quantization of bow of //if (isQtable && (*ing.wordp(1)==dict->oovcode())) { if (isQtable && (*ing.wordp(ing.size)==dict->oovcode())) { rbow-=(double)Bcenters[l][(qfloat_t)ibow]; } } } } if (bow) (*bow)=rbow; return rbow + lpr; } //Direct ngram TRIE else { MY_ASSERT((extendible == NULL) || (extendible && *extendible==false)); // MY_ASSERT(lastbow==NULL); for (ngram ng=ong; ng.size>0; ng.size--) { if (get(ng,ng.size,ng.size)) { iprob=ng.prob; lpr = (double)(isQtable?Pcenters[ng.size][(qfloat_t)iprob]:iprob); if (*ng.wordp(1)==dict->oovcode()) lpr-=logOOVpenalty; //add OOV penalty if (maxsuffptr || maxsuffidx || statesize) { //one extra step is needed if ng.size=ong.size if (ong.size==ng.size) { ng.size--; get(ng,ng.size,ng.size); } if (statesize) *statesize=ng.size; char* suffptr=ng.link; //we should check ng.link != NULL size_t isize=ng.size; if (maxsuffptr) *maxsuffptr=suffptr; if (maxsuffidx) *maxsuffidx = convert(suffptr,isize); } return rbow+lpr; } else { if (ng.size==1) { //means a real unknow word! if (statesize) *statesize=0; if (maxsuffptr) *maxsuffptr=NULL; //default stateptr for zero-gram! if (maxsuffidx) *maxsuffidx=0; //default state-value for zero-gram! return rbow -log(UNIGRAM_RESOLUTION)/M_LN10; } else { //compute backoff if (bol) (*bol)++; //increase backoff level if (ng.lev==(ng.size-1)) { //if search stopped at previous level ibow=ng.bow; rbow+= (double) (isQtable?Bcenters[ng.lev][(qfloat_t)ibow]:ibow); //avoids bad quantization of bow of if (isQtable && (*ng.wordp(2)==dict->oovcode())) { rbow-=(double)Bcenters[ng.lev][(qfloat_t)ibow]; } } if (bow) (*bow)=rbow; } } } } MY_ASSERT(0); //never pass here!!! 
return 1.0; } //return log10 probsL use cache memory double lmtable::clprob(ngram ong,double* bow, int* bol, ngram_state_t* ngramstate, char** state, unsigned int* statesize, bool* extendible, double* lastbow) { VERBOSE(3,"double lmtable::clprob(ngram ong,double* bow, int* bol, ngram_state_t* ngramstate, char** state, unsigned int* statesize, bool* extendible, double* lastbow) ong:|" << ong << "|\n"); #ifdef TRACE_CACHELM // if (probcache && ong.size==maxlev && sentence_id>0) { if (probcache && sentence_id>0) { *cacheout << sentence_id << " " << ong << "\n"; } #endif if (ong.size==0) { if (statesize!=NULL) *statesize=0; if (state!=NULL) *state=NULL; if (ngramstate!=NULL) *ngramstate=0; if (extendible!=NULL) *extendible=false; if (lastbow!=NULL) *lastbow=false; return 0.0; } if (ong.size>maxlev) ong.size=maxlev; //adjust n-gram level to table size #ifdef PS_CACHE_ENABLE double logpr = 0.0; //cache hit prob_and_state_t pst_get; // if (prob_and_state_cache && ong.size==maxlev && prob_and_state_cache->get(ong.wordp(maxlev),pst_get)) { if (prob_and_state_cache[ong.size] && prob_and_state_cache[ong.size]->get(ong.wordp(ong.size),pst_get)) { logpr=pst_get.logpr; if (bow) *bow = pst_get.bow; if (bol) *bol = pst_get.bol; if (state) *state = pst_get.state; if (ngramstate) *ngramstate = pst_get.ngramstate; if (statesize) *statesize = pst_get.statesize; if (extendible) *extendible = pst_get.extendible; if (lastbow) *lastbow = pst_get.lastbow; return logpr; } //cache miss prob_and_state_t pst_add; logpr = pst_add.logpr = lmtable::lprob(ong, &(pst_add.bow), &(pst_add.bol), &(pst_add.ngramstate), &(pst_add.state), &(pst_add.statesize), &(pst_add.extendible), &(pst_add.lastbow)); if (bow) *bow = pst_add.bow; if (bol) *bol = pst_add.bol; if (state) *state = pst_add.state; if (ngramstate) *ngramstate = pst_add.ngramstate; if (statesize) *statesize = pst_add.statesize; if (extendible) *extendible = pst_add.extendible; if (lastbow) *lastbow = pst_add.lastbow; // if 
(prob_and_state_cache && ong.size==maxlev) { // prob_and_state_cache->add(ong.wordp(maxlev),pst_add); // } if (prob_and_state_cache[ong.size]) { prob_and_state_cache[ong.size]->add(ong.wordp(ong.size),pst_add); } return logpr; #else return lmtable::lprob(ong, bow, bol, ngramstate, state, statesize, extendible, lastbow); #endif }; int lmtable::succrange(node ndp,int level,table_entry_pos_t* isucc,table_entry_pos_t* esucc) { table_entry_pos_t first,last; LMT_TYPE ndt=tbltype[level]; //get table boundaries for next level if (leveltable[level]? bound(ndp-nodesize(ndt), ndt) : 0; last = bound(ndp, ndt); } else { first=last=0; } if (isucc) *isucc=first; if (esucc) *esucc=last; return last-first; } void lmtable::stat(int level) { table_pos_t totmem=0,memory; float mega=1024 * 1024; cout.precision(2); cout << "lmtable class statistics\n"; cout << "levels " << maxlev << "\n"; for (int l=1; l<=maxlev; l++) { memory=(table_pos_t) cursize[l] * nodesize(tbltype[l]); cout << "lev " << l << " entries "<< cursize[l] << " used mem " << memory/mega << "Mb\n"; totmem+=memory; } cout << "total allocated mem " << totmem/mega << "Mb\n"; cout << "total number of get and binary search calls\n"; for (int l=1; l<=maxlev; l++) { cout << "level " << l << " get: " << totget[l] << " bsearch: " << totbsearch[l] << "\n"; } if (level >1 ) lmtable::getDict()->stat(); stat_caches(); } void lmtable::reset_mmap() { #ifndef WIN32 if (memmap>0 and memmap<=maxlev) for (int l=memmap; l<=maxlev; l++) { VERBOSE(2,"resetting mmap at level:" << l << std::endl); Munmap(table[l]-tableGaps[l],(table_pos_t) cursize[l]*nodesize(tbltype[l])+tableGaps[l],0); table[l]=(char *)MMap(diskid,PROT_READ, tableOffs[l], (table_pos_t)cursize[l]*nodesize(tbltype[l]), &tableGaps[l]); table[l]+=(table_pos_t)tableGaps[l]; } #endif } // ng: input n-gram // *lk: prob of n-(*bol) gram // *boff: backoff weight vector // *bol: backoff level double lmtable::lprobx(ngram ong, double *lkp, double *bop, int *bol) { double bo, lbo, pr; 
float ipr; //int ipr; ngram ng(dict), ctx(dict); if(bol) *bol=0; if(ong.size==0) { if(lkp) *lkp=0; return 0; // lprob ritorna 0, prima lprobx usava LOGZERO } if(ong.size>maxlev) ong.size=maxlev; ctx = ng = ong; bo=0; ctx.shift(); while(!get(ng)) { // back-off //OOV not included in dictionary if(ng.size==1) { pr = -log(UNIGRAM_RESOLUTION)/M_LN10; if(lkp) *lkp=pr; // this is the innermost probability pr += bo; //add all the accumulated back-off probability return pr; } // backoff-probability lbo = 0.0; //local back-off: default is logprob 0 if(get(ctx)) { //this can be replaced with (ng.lev==(ng.size-1)) ipr = ctx.bow; lbo = isQtable?Bcenters[ng.size][(qfloat_t)ipr]:ipr; //lbo = isQtable?Bcenters[ng.size][ipr]:*(float*)&ipr; } if(bop) *bop++=lbo; if(bol) ++*bol; bo += lbo; ng.size--; ctx.size--; } ipr = ng.prob; pr = isQtable?Pcenters[ng.size][(qfloat_t)ipr]:ipr; //pr = isQtable?Pcenters[ng.size][ipr]:*((float*)&ipr); if(lkp) *lkp=pr; pr += bo; return pr; } // FABIO table_entry_pos_t lmtable::wdprune(float *thr, int aflag) { //this function implements a method similar to the "Weighted Difference Method" //described in "Scalable Backoff Language Models" by Kristie Seymore and Ronald Rosenfeld int l; ngram ng(lmtable::getDict(),0); isPruned=true; //the table now might contain pruned n-grams ng.size=0; for(l=2; l<=maxlev; l++) wdprune(thr, aflag, ng, 1, l, 0, cursize[1]); return 0; } // FABIO: LM pruning method table_entry_pos_t lmtable::wdprune(float *thr, int aflag, ngram ng, int ilev, int elev, table_entry_pos_t ipos, table_entry_pos_t epos, double tlk, double bo, double *ts, double *tbs) { LMT_TYPE ndt=tbltype[ilev]; int ndsz=nodesize(ndt); char *ndp; float lk; float ipr, ibo; //int ipr, ibo; table_entry_pos_t i, k, nk; MY_ASSERT(ng.size==ilev-1); //Note that ipos and epos are always larger than or equal to 0 because they are unsigned int MY_ASSERT(epos<=cursize[ilev] && iposgetcode(BOS_))) { //the n-gram starts with the sentence start symbol //do not consider is 
actual probability because it is not reliable (its frequency is manually set) ipr = 0.0; } lk = ipr; if(ilev0 ? bound(ndp-ndsz, ndt) : 0; //table_entry_pos_t esucc = bound(ndp, ndt); if(isucc>=esucc) continue; // no successors //look for n-grams to be pruned with this context (see //back-off weight) prune: double nextlevel_ts=0, nextlevel_tbs=0; k = wdprune(thr, aflag, ng, ilev+1, elev, isucc, esucc, tlk+lk, bo, &nextlevel_ts, &nextlevel_tbs); //k is the number of pruned n-grams with this context if(ilev!=elev-1) continue; if(nextlevel_ts>=1 || nextlevel_tbs>=1) { VERBOSE(2, "ng: " << ng <<" nextlevel_ts=" << nextlevel_ts <<" nextlevel_tbs=" << nextlevel_tbs <<" k=" << k <<" ns=" << esucc-isucc << "\n"); if(nextlevel_ts>=1) { pscale(ilev+1, isucc, esucc, 0.999999/nextlevel_ts); goto prune; } } // adjusts backoff: // 1-sum_succ(pr(w|ng)) / 1-sum_succ(pr(w|bng)) bo = log((1-nextlevel_ts)/(1-nextlevel_tbs))/M_LN10; ibo=(float)bo; bow(ndp, ndt, ibo); } else { //we are at the highest level //get probability of lower order n-gram ngram bng = ng; bng.size--; double blk = lprob(bng); double wd = pow(10., tlk+lk) * (lk-bo-blk); if(aflag&&wd<0) wd=-wd; if(wd > thr[elev-1]) { // kept *ts += pow(10., lk); *tbs += pow(10., blk); } else { // discarded ++nk; prob(ndp, ndt, NOPROB); } } } return nk; } int lmtable::pscale(int lev, table_entry_pos_t ipos, table_entry_pos_t epos, double s) { LMT_TYPE ndt=tbltype[lev]; int ndsz=nodesize(ndt); char *ndp; float ipr; s=log(s)/M_LN10; ndp = table[lev]+ (table_pos_t) ipos*ndsz; for(table_entry_pos_t i=ipos; i #include #endif #include #include #include #include #include #include "util.h" #include "ngramcache.h" #include "dictionary.h" #include "n_gram.h" #include "lmContainer.h" #define MAX(a,b) (((a)>(b))?(a):(b)) #define MIN(a,b) (((a)<(b))?(a):(b)) #define LMTMAXLEV 20 #define MAX_LINE 100000 #ifndef LMTCODESIZE #define LMTCODESIZE (int)3 #endif #define SHORTSIZE (int)2 #define PTRSIZE (int)sizeof(char *) #define INTSIZE (int)4 #define 
CHARSIZE (int)1 #define PROBSIZE (int)4 //use float #define QPROBSIZE (int)1 //use qfloat_t //#define BOUNDSIZE (int)4 //use table_pos_t #define BOUNDSIZE (int)sizeof(table_entry_pos_t) //use table_pos_t #define UNIGRAM_RESOLUTION 10000000.0 typedef enum {INTERNAL,QINTERNAL,LEAF,QLEAF} LMT_TYPE; typedef char* node; typedef unsigned int ngram_state_t; //type for pointing to a full ngram in the table typedef unsigned int table_entry_pos_t; //type for pointing to a full ngram in the table typedef unsigned long table_pos_t; // type for pointing to a single char in the table typedef unsigned char qfloat_t; //type for quantized probabilities //CHECK this part to HERE #define BOUND_EMPTY1 (numeric_limits::max() - 2) #define BOUND_EMPTY2 (numeric_limits::max() - 1) namespace irstlm { class lmtable: public lmContainer { static const bool debug=true; void loadtxt(std::istream& inp,const char* header,const char* filename,int mmap); void loadtxt_ram(std::istream& inp,const char* header); void loadtxt_mmap(std::istream& inp,const char* header,const char* outfilename); void loadtxt_level(std::istream& inp,int l); void loadbin(std::istream& inp,const char* header,const char* filename,int mmap); void loadbin_header(std::istream& inp, const char* header); void loadbin_dict(std::istream& inp); void loadbin_codebook(std::istream& inp,int l); void loadbin_level(std::istream& inp,int l); ngram_state_t convert(const char* suffptr, size_t lev); protected: char* table[LMTMAXLEV+1]; //storage of all levels LMT_TYPE tbltype[LMTMAXLEV+1]; //table type for each levels table_entry_pos_t cursize[LMTMAXLEV+1]; //current size of levels //current offset for in-memory tables (different for each level //needed to manage partial tables // mempos = diskpos - offset[level] table_entry_pos_t tb_offset[LMTMAXLEV+1]; table_entry_pos_t maxsize[LMTMAXLEV+1]; //max size of levels table_entry_pos_t* startpos[LMTMAXLEV+1]; //support vector to store start positions char info[100]; //information put in the 
header //statistics int totget[LMTMAXLEV+1]; int totbsearch[LMTMAXLEV+1]; //probability quantization bool isQtable; //Incomplete LM table from distributed training bool isItable; //Table with reverted n-grams for fast access bool isInverted; //Table might contain pruned n-grams bool isPruned; int NumCenters[LMTMAXLEV+1]; float* Pcenters[LMTMAXLEV+1]; float* Bcenters[LMTMAXLEV+1]; double logOOVpenalty; //penalty for OOV words (default 0) int dictionary_upperbound; //set by user int backoff_state; //improve access speed int max_cache_lev; // NGRAMCACHE_t* prob_and_state_cache; NGRAMCACHE_t* prob_and_state_cache[LMTMAXLEV+1]; NGRAMCACHE_t* lmtcache[LMTMAXLEV+1]; float ngramcache_load_factor; float dictionary_load_factor; //memory map on disk int memmap; //level from which n-grams are accessed via mmap int diskid; off_t tableOffs[LMTMAXLEV+1]; off_t tableGaps[LMTMAXLEV+1]; // is this LM queried for knowing the matching order or (standard // case) for score? bool orderQuery; //flag to enable/disable deletion of dict in the destructor bool delete_dict; public: #ifdef TRACE_CACHELM std::fstream* cacheout; int sentence_id; #endif dictionary *dict; // dictionary (words - macro tags) lmtable(float nlf=0.0, float dlfi=0.0); virtual ~lmtable(); table_entry_pos_t wdprune(float *thr, int aflag=0); table_entry_pos_t wdprune(float *thr, int aflag, ngram ng, int ilev, int elev, table_entry_pos_t ipos, table_entry_pos_t epos, double lk=0, double bo=0, double *ts=0, double *tbs=0); double lprobx(ngram ong, double *lkp=0, double *bop=0, int *bol=0); table_entry_pos_t ngcnt(table_entry_pos_t *cnt); table_entry_pos_t ngcnt(table_entry_pos_t *cnt, ngram ng, int l, table_entry_pos_t ipos, table_entry_pos_t epos); int pscale(int lev, table_entry_pos_t ipos, table_entry_pos_t epos, double s); void init_prob_and_state_cache(); void init_probcache() { init_prob_and_state_cache(); }; //kept for back compatibility void init_statecache() {}; //kept for back compatibility void init_lmtcaches(); 
// void init_lmtcaches(int uptolev); void init_caches(int uptolev); void used_prob_and_state_cache() const; void used_lmtcaches() const; void used_caches() const; void delete_prob_and_state_cache(); void delete_probcache() { delete_prob_and_state_cache(); }; //kept for back compatibility void delete_statecache() {}; //kept for back compatibility void delete_lmtcaches(); void delete_caches(); void stat_prob_and_state_cache(); void stat_lmtcaches(); void stat_caches(); void check_prob_and_state_cache_levels() const; void check_probcache_levels() const { check_prob_and_state_cache_levels(); }; //kept for back compatibility void check_statecache_levels() const{}; //kept for back compatibility void check_lmtcaches_levels() const; void check_caches_levels() const; void reset_prob_and_state_cache(); void reset_probcache() { reset_prob_and_state_cache(); }; //kept for back compatibility void reset_statecache() {}; //kept for back compatibility void reset_lmtcaches(); void reset_caches(); bool are_prob_and_state_cache_active() const; bool is_probcache_active() const { return are_prob_and_state_cache_active(); }; //kept for back compatibility bool is_statecache_active() const { return are_prob_and_state_cache_active(); }; //kept for back compatibility bool are_lmtcaches_active() const; bool are_caches_active() const; void reset_mmap(); //set the inverted flag to load ngrams in an inverted order //this choice is disregarded if a binary LM is loaded, //because the info is stored into the header inline bool is_inverted(const bool flag) { return isInverted=flag; } inline bool is_inverted() const { return isInverted; } void configure(int n,bool quantized); //set penalty for OOV words inline double getlogOOVpenalty() const { return logOOVpenalty; } inline double setlogOOVpenalty(int dub) { MY_ASSERT(dub > dict->size()); dictionary_upperbound = dub; return logOOVpenalty=log((double)(dictionary_upperbound - dict->size()))/M_LN10; } inline double setlogOOVpenalty(double oovp) { 
return logOOVpenalty=oovp; } virtual int maxlevel() const { return maxlev; }; inline bool isQuantized() const { return isQtable; } void savetxt(const char *filename); void savebin(const char *filename); void appendbin_level(int level, fstream &out, int mmap); void appendbin_level_nommap(int level, fstream &out); void appendbin_level_mmap(int level, fstream &out); void savebin_level(int level, const char* filename, int mmap); void savebin_level_nommap(int level, const char* filename); void savebin_level_mmap(int level, const char* filename); void savebin_dict(std::fstream& out); void compact_all_levels(const char* filename); void compact_single_level(int level, const char* filename); void concatenate_all_levels(const char* fromfilename, const char* tofilename); void concatenate_single_level(int level, const char* fromfilename, const char* tofilename); void remove_all_levels(const char* filename); void remove_single_level(int level, const char* filename); void print_table_stat(); void print_table_stat(int level); void dumplm(std::fstream& out,ngram ng, int ilev, int elev, table_entry_pos_t ipos,table_entry_pos_t epos); void delete_level(int level, const char* outfilename, int mmap); void delete_level_nommap(int level); void delete_level_mmap(int level, const char* filename); void resize_level(int level, const char* outfilename, int mmap); void resize_level_nommap(int level); void resize_level_mmap(int level, const char* filename); inline void update_offset(int level, table_entry_pos_t value) { tb_offset[level]=value; }; virtual void load(const std::string &filename, int mmap=0); virtual void load(std::istream& inp,const char* filename=NULL,const char* outfilename=NULL,int mmap=0); void load_centers(std::istream& inp,int l); void expand_level(int level, table_entry_pos_t size, const char* outfilename, int mmap); void expand_level_nommap(int level, table_entry_pos_t size); void expand_level_mmap(int level, table_entry_pos_t size, const char* outfilename); void 
cpsublm(lmtable* sublmt, dictionary* subdict,bool keepunigr=true); int reload(std::set words); void filter(const char* /* unused parameter: lmfile */) {}; virtual double lprob(ngram ng, double* bow=NULL, int* bol=NULL, ngram_state_t* maxsuffidx=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL); virtual double clprob(ngram ng, double* bow, int* bol, ngram_state_t* maxsuffidx, char** maxsuffptr, unsigned int* statesize, bool* extendible, double* lastbow); virtual const char *maxsuffptr(ngram ong, unsigned int* size=NULL); virtual const char *cmaxsuffptr(ngram ong, unsigned int* size=NULL); virtual ngram_state_t maxsuffidx(ngram ong, unsigned int* size=NULL); virtual ngram_state_t cmaxsuffidx(ngram ong, unsigned int* size=NULL); void *search(int lev,table_entry_pos_t offs,table_entry_pos_t n,int sz,int *w, LMT_ACTION action,char **found=(char **)NULL); int mybsearch(char *ar, table_entry_pos_t n, int size, char *key, table_entry_pos_t *idx); int add(ngram& ng, float prob,float bow); //template int add(ngram& ng, TA prob,TB bow); int addwithoffset(ngram& ng, float prob,float bow); // template int addwithoffset(ngram& ng, TA prob,TB bow); void checkbounds(int level); virtual inline int get(ngram& ng) { return get(ng,ng.size,ng.size); } virtual int get(ngram& ng,int n,int lev); int succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev); inline void putmem(char* ptr,int value,int offs,int size) { MY_ASSERT(ptr!=NULL); for (int i=0; i> (8 * i)) & 0xff; }; inline void getmem(char* ptr,int* value,int offs,int size) { MY_ASSERT(ptr!=NULL); *value=ptr[offs] & 0xff; for (int i=1; i inline void putmem(char* ptr,T value,int offs) { MY_ASSERT(ptr!=NULL); memcpy(ptr+offs, &value, sizeof(T)); }; template inline void getmem(char* ptr,T* value,int offs) { MY_ASSERT(ptr!=NULL); memcpy((void*)value, ptr+offs, sizeof(T)); }; int nodesize(LMT_TYPE ndt) { switch (ndt) { case INTERNAL: return LMTCODESIZE + PROBSIZE + PROBSIZE + 
BOUNDSIZE; case QINTERNAL: return LMTCODESIZE + QPROBSIZE + QPROBSIZE + BOUNDSIZE; case LEAF: return LMTCODESIZE + PROBSIZE; case QLEAF: return LMTCODESIZE + QPROBSIZE; default: MY_ASSERT(0); return 0; } } inline int word(node nd,int value=-1) { int offset=0; if (value==-1) getmem(nd,&value,offset,LMTCODESIZE); else putmem(nd,value,offset,LMTCODESIZE); return value; }; int codecmp(node a,node b) { register int i,result; for (i=(LMTCODESIZE-1); i>=0; i--) { result=(unsigned char)a[i]-(unsigned char)b[i]; if(result) return result; } return 0; }; int codediff(node a,node b) { return word(a)-word(b); }; inline float prob(node nd,LMT_TYPE ndt) { int offs=LMTCODESIZE; float fv; unsigned char cv; switch (ndt) { case INTERNAL: getmem(nd,&fv,offs); return fv; case QINTERNAL: getmem(nd,&cv,offs); return (float) cv; case LEAF: getmem(nd,&fv,offs); return fv; case QLEAF: getmem(nd,&cv,offs); return (float) cv; default: MY_ASSERT(0); return 0; } }; template inline T prob(node nd, LMT_TYPE ndt, T value) { int offs=LMTCODESIZE; switch (ndt) { case INTERNAL: putmem(nd, value,offs); break; case QINTERNAL: putmem(nd,(unsigned char) value,offs); break; case LEAF: putmem(nd, value,offs); break; case QLEAF: putmem(nd,(unsigned char) value,offs); break; default: MY_ASSERT(0); return (T) 0; } return value; }; inline float bow(node nd,LMT_TYPE ndt) { int offs=LMTCODESIZE+(ndt==QINTERNAL?QPROBSIZE:PROBSIZE); float fv; unsigned char cv; switch (ndt) { case INTERNAL: getmem(nd,&fv,offs); return fv; case QINTERNAL: getmem(nd,&cv,offs); return (float) cv; case LEAF: getmem(nd,&fv,offs); return fv; case QLEAF: getmem(nd,&cv,offs); return (float) cv; default: MY_ASSERT(0); return 0; } }; template inline T bow(node nd,LMT_TYPE ndt, T value) { int offs=LMTCODESIZE+(ndt==QINTERNAL?QPROBSIZE:PROBSIZE); switch (ndt) { case INTERNAL: putmem(nd, value,offs); break; case QINTERNAL: putmem(nd,(unsigned char) value,offs); break; case LEAF: putmem(nd, value,offs); break; case QLEAF: putmem(nd,(unsigned 
char) value,offs); break; default: MY_ASSERT(0); return 0; } return value; }; inline table_entry_pos_t boundwithoffset(node nd,LMT_TYPE ndt, int level){ return bound(nd,ndt) - tb_offset[level+1]; } inline table_entry_pos_t boundwithoffset(node nd,LMT_TYPE ndt, table_entry_pos_t value, int level){ return bound(nd, ndt, value + tb_offset[level+1]); } // table_entry_pos_t bound(node nd,LMT_TYPE ndt, int level=0) { table_entry_pos_t bound(node nd,LMT_TYPE ndt) { int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE); table_entry_pos_t value; getmem(nd,&value,offs); // value -= tb_offset[level+1]; return value; }; // table_entry_pos_t bound(node nd,LMT_TYPE ndt, table_entry_pos_t value, int level=0) { table_entry_pos_t bound(node nd,LMT_TYPE ndt, table_entry_pos_t value) { int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE); // value += tb_offset[level+1]; putmem(nd,value,offs); return value; }; //template T boundwithoffset(node nd,LMT_TYPE ndt, T value, int level); /* table_entry_pos_t boundwithoffset(node nd,LMT_TYPE ndt, int level) { int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE); table_entry_pos_t value; getmem(nd,&value,offs); return value; // return value-tb_offset[level+1]; }; */ /* table_entry_pos_t boundwithoffset(node nd,LMT_TYPE ndt, table_entry_pos_t value, int level) { int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE); putmem(nd,value,offs); return value; // return value+tb_offset[level+1]; }; */ /* inline table_entry_pos_t bound(node nd,LMT_TYPE ndt) { int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE); table_entry_pos_t value; getmem(nd,&value,offs); return value; }; template inline T bound(node nd,LMT_TYPE ndt, T value) { int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE); putmem(nd,value,offs); return value; }; */ //returns the indexes of the successors of a node int succrange(node ndp,int level,table_entry_pos_t* isucc=NULL,table_entry_pos_t* esucc=NULL); void stat(int lev=0); void printTable(int level); 
virtual inline void setDict(dictionary* d) { if (delete_dict==true && dict) delete dict; dict=d; delete_dict=false; }; inline dictionary* getDict() const { return dict; }; inline table_entry_pos_t getCurrentSize(int l) const { return cursize[l]; }; inline void setOrderQuery(bool v) { orderQuery = v; } inline bool isOrderQuery() const { return orderQuery; } inline float GetNgramcacheLoadFactor() { return ngramcache_load_factor; } inline float GetDictionaryLoadFactor() { return ngramcache_load_factor; } //never allow the increment of the dictionary through this function inline virtual void dictionary_incflag(const bool flag) { UNUSED(flag); }; inline virtual bool filter(const string sfilter, lmtable* sublmt, const string skeepunigrams) { std::cerr << "filtering... \n"; dictionary *dict=new dictionary((char *)sfilter.c_str()); cpsublm(sublmt, dict,(skeepunigrams=="yes")); delete dict; std::cerr << "...done\n"; return true; } inline virtual bool is_OOV(int code) { return (code == dict->oovcode()); }; }; }//namespace irstlm #endif irstlm-6.00.05/src/mdiadapt.cpp000066400000000000000000001610301263213470300162540ustar00rootroot00000000000000/****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include "util.h" #include "mfstream.h" #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "mempool.h" #include "ngramcache.h" #include "ngramtable.h" #include "normcache.h" #include "interplm.h" #include "mdiadapt.h" #include "shiftlm.h" #include "lmtable.h" using namespace std; namespace irstlm { #ifdef MDIADAPTLM_CACHE_ENABLE #if MDIADAPTLM_CACHE_ENABLE==0 #undef MDIADAPTLM_CACHE_ENABLE #endif #endif #ifdef MDIADAPTLM_CACHE_ENABLE bool mdiadaptlm::mdiadaptlm_cache_enable=true; #else bool mdiadaptlm::mdiadaptlm_cache_enable=false; #endif // //Minimum discrimination adaptation for interplm // mdiadaptlm::mdiadaptlm(char* ngtfile,int depth,TABLETYPE tbtype): interplm(ngtfile,depth,tbtype) { adaptlev=0; forelm=NULL; cache=NULL; m_save_per_level=true; }; mdiadaptlm::~mdiadaptlm() { if (cache) delete cache; delete_caches(); }; void mdiadaptlm::delete_caches(int level) { if (probcache[level]) delete probcache[level]; if (backoffcache[level]) delete backoffcache[level]; }; void mdiadaptlm::delete_caches() { #ifdef MDIADAPTLM_CACHE_ENABLE for (int i=0; i<=max_caching_level; i++) delete_caches(i); delete [] probcache; delete [] backoffcache; #endif }; void mdiadaptlm::caches_stat() { #ifdef MDIADAPTLM_CACHE_ENABLE for (int i=1; i<=max_caching_level; i++) { if (probcache[i]) { cerr << "Statistics of probcache at level " << i << " (of " << maxlevel() << ") "; probcache[i]->stat(); } if (backoffcache[i]) { cerr << "Statistics of backoffcache at level " << i << " (of " << maxlevel() << ") "; backoffcache[i]->stat(); } } #endif }; void mdiadaptlm::create_caches(int mcl) { max_caching_level=(mcl>=0 && mclisfull()) 
probcache[level]->reset(probcache[level]->cursize()); if (backoffcache[level] && backoffcache[level]->isfull()) backoffcache[level]->reset(backoffcache[level]->cursize()); }; void mdiadaptlm::check_cache_levels() { #ifdef MDIADAPTLM_CACHE_ENABLE for (int i=1; i<=max_caching_level; i++) check_cache_levels(i); #endif }; void mdiadaptlm::reset_caches(int level) { if (probcache[level]) probcache[level]->reset(MAX(probcache[level]->cursize(),probcache[level]->maxsize())); if (backoffcache[level]) backoffcache[level]->reset(MAX(backoffcache[level]->cursize(),backoffcache[level]->maxsize())); }; void mdiadaptlm::reset_caches() { #ifdef MDIADAPTLM_CACHE_ENABLE for (int i=1; i<=max_caching_level; i++) reset_caches(i); #endif }; inline NGRAMCACHE_t* mdiadaptlm::get_probcache(int level) { return probcache[level]; } inline NGRAMCACHE_t* mdiadaptlm::get_backoffcache(int level) { return backoffcache[level]; } int mdiadaptlm::scalefact(char *ngtfile) { if (forelm!=NULL) delete forelm; if (cache!=NULL) delete cache; cache=new normcache(dict); forelm=new shiftbeta(ngtfile,1); forelm->train(); //compute oov scalefact term ngram fng(forelm->dict,1); ngram ng(dict,1); int* w=fng.wordp(1); oovscaling=1.0; for ((*w)=0; (*w)dict->size(); (*w)++) if ((*w) != forelm->dict->oovcode()) { ng.trans(fng); if (*ng.wordp(1)==dict->oovcode()) { cerr << "adaptation file contains new words: use -ao=yes option\n"; exit(1); } //forbidden situation oovscaling-=backunig(ng); } *w=forelm->dict->oovcode(); oovscaling=foreunig(fng)/oovscaling; return 1; }; int mdiadaptlm::savescalefactor(char* filename) { ngram ng(dict,1); int* w=ng.wordp(1); mfstream out(filename,ios::out); out << "\n\\data\\" << "\nngram 1=" << dict->size() << "\n\n1grams:\n"; for ((*w)=0; (*w)size(); (*w)++) { double ratio=scalefact(ng); out << (float) (ratio?log10(ratio):-99); if (*w==dict->oovcode()) out << "\t" << "\n"; else out << "\t" << (char *)dict->decode(*w) << "\n"; } out << "\\end\\\n"; return 1; } double 
mdiadaptlm::scalefact(ngram ng) { ngram fng(forelm->dict,1); fng.trans(ng); if (*fng.wordp(1)==forelm->dict->oovcode()) return pow(oovscaling,gis_step); else { double prback=backunig(ng); double prfore=foreunig(ng); return pow(prfore/prback,gis_step); } } double mdiadaptlm::foreunig(ngram ng) { double fstar,lambda; forelm->discount(ng,1,fstar,lambda); return fstar; } double mdiadaptlm::backunig(ngram ng) { double fstar,lambda; discount(ng,1,fstar,lambda,0); return fstar; }; int mdiadaptlm::adapt(char* ngtfile,int alev,double step) { if (alev > lmsize() || alev<=0) { cerr << "setting adaptation level to " << lmsize() << "\n"; alev=lmsize(); } adaptlev=alev; cerr << "adapt ...."; gis_step=step; if (ngtfile==NULL) { cerr << "adaptation file is missing\n"; exit(1); } //compute the scaling factor; scalefact(ngtfile); //compute 1-gram zeta ngram ng(dict,2); int* w=ng.wordp(1); cerr << "precomputing 1-gram normalization ...\n"; zeta0=0; for ((*w)=0; (*w)size(); (*w)++) zeta0+=scalefact(ng) * backunig(ng); if (alev==1) return 1 ; cerr << "precomputing 2-gram normalization:\n"; //precompute the bigram normalization w=ng.wordp(2); *ng.wordp(1)=0; for ((*w)=0; (*w)size(); (*w)++) { zeta(ng,2); if ((*w % 1000)==0) cerr << "."; } cerr << "done\n"; return 1; }; double mdiadaptlm::zeta(ngram ng,int size) { MY_ASSERT(size>=1); double z=0; // compute normalization term ng.size=size; if (size==1) return zeta0; else { //size>1 //check in the 2gr and 3gr cache if (size <=3 && cache->get(ng,size,z)) return z; double fstar,lambda; ngram histo=ng; int succ=0; discount(ng,size,fstar,lambda,(int)0); if ((lambda<1) && get(histo,size,size-1)) { ; //scan all its successors succ=0; succscan(histo,ng,INIT,size); while(succscan(histo,ng,CONT,size)) { discount(ng,size,fstar,lambda,0); if (fstar>0) { z+=(scalefact(ng) * fstar); succ++; //cerr << ng << "zeta= " << z << "\n"; } } } z+=lambda*zeta(ng,size-1); if (size<=3 && succ>1) cache->put(ng,size,z); return z; } } int mdiadaptlm::discount(ngram 
ng_,int size,double& fstar,double& lambda,int /* unused parameter: cv */) { VERBOSE(3,"mdiadaptlm::discount(ngram ng_,int size,double& fstar,double& lambda,int)) ng_:|" << ng_ << "| size:" << size << std::endl); ngram ng(dict); ng.trans(ng_); double __fstar, __lambda; bool lambda_cached=0; int size_lambda=size-1; ngram histo=ng; histo.shift(); if (size_lambda>0 && histo.size>=size_lambda) { #ifdef MDIADAPTLM_CACHE_ENABLE if (size_lambda<=max_caching_level) { //backoffcache hit if (backoffcache[size_lambda] && backoffcache[size_lambda]->get(histo.wordp(size_lambda),__lambda)) lambda_cached=1; } #endif } discount(ng,size,__fstar,__lambda,0); if ((size>0) && (size<=adaptlev) && (__lambda<1)) { if (size>1) { double numlambda, numfstar, den; numfstar=scalefact(ng); den=zeta(ng,size); __fstar=__fstar * numfstar/den; if (!lambda_cached) { numlambda=zeta(ng,size-1); __lambda=__lambda * numlambda/den; } } else if (size==1) { double ratio; ratio=scalefact(ng)/zeta0; __fstar=__fstar * ratio; if (!lambda_cached) { __lambda=__lambda * ratio; } } else { //size==0 do nothing } } #ifdef MDIADAPTLM_CACHE_ENABLE //backoffcache insert if (!lambda_cached && size_lambda>0 && size_lambda<=max_caching_level && histo.size>=size_lambda && backoffcache[size_lambda]) backoffcache[size_lambda]->add(histo.wordp(size_lambda),__lambda); #endif lambda=__lambda; fstar=__fstar; return 1; } int mdiadaptlm::compute_backoff() { VERBOSE(3,"mdiadaptlm::compute_backoff() "); if (m_save_per_level){ VERBOSE(3," per level ...\n"); return mdiadaptlm::compute_backoff_per_level(); }else{ VERBOSE(3," per word ...\n"); return mdiadaptlm::compute_backoff_per_word(); } } int mdiadaptlm::compute_backoff_per_level() { VERBOSE(3,"mdiadaptlm::compute_backoff_per_level()\n"); double fstar,lambda; this->backoff=1; for (int size=1; size0){ ng.size=ng.size-1; pr -= mdiadaptlm::prob(ng,size); } } MY_ASSERT(pr>=LOWER_SINGLE_PRECISION_OF_0 && pr<=UPPER_SINGLE_PRECISION_OF_1); boff(hg.link,pr); } } 
VERBOSE(3,"mdiadaptlm::compute_backoff_per_level() DONE\n"); return 1; } int mdiadaptlm::compute_backoff_per_word() { cerr << "Current implementation does not support the usage of backoff (-bo=yes) mixture models (-lm=mix) combined with the per-word saving (-saveperllevel=no)." << endl; cerr << "Please, either choose a per-level saving (-saveperllevel=yes) or do not use backoff (-bo=no) " << endl; exit(1); } double mdiadaptlm::prob2(ngram ng,int size,double& fstar) { double lambda; mdiadaptlm::discount(ng,size,fstar,lambda); if (size>1) return fstar + lambda * prob(ng,size-1); else return fstar; } //inline double mdiadaptlm::prob(ngram ng,int size){ double mdiadaptlm::prob(ngram ng,int size) { double fstar,lambda,bo; return prob(ng,size,fstar,lambda,bo); } double mdiadaptlm::prob(ngram ng,int size,double& fstar,double& lambda, double& bo) { VERBOSE(3,"mdiadaptlm::prob(ngram ng,int size,double& fstar,double& lambda, double& bo) ng:|" << ng << "| size:" << size << std::endl); double pr; #ifdef MDIADAPTLM_CACHE_ENABLE //probcache hit if (size<=max_caching_level && probcache[size] && ng.size>=size && probcache[size]->get(ng.wordp(size),pr)) return pr; #endif //probcache miss mdiadaptlm::bodiscount(ng,size,fstar,lambda,bo); VERBOSE(3,"mdiadaptlm::prob(ngram ng,int size,double& fstar,double& lambda, double& bo) after bodiscount @@@@@@@@@ ng:|" << ng << "| size:" << size << "| fstar:" << fstar << "| lambda:" << lambda << "| bo:" << bo << std::endl); if (fstar>UPPER_SINGLE_PRECISION_OF_1 || lambda>UPPER_SINGLE_PRECISION_OF_1) { cerr << "wrong probability: " << ng << " , size " << size << " , fstar " << fstar << " , lambda " << lambda << "\n"; fstar=(fstar>UPPER_SINGLE_PRECISION_OF_1?UPPER_SINGLE_PRECISION_OF_1:fstar); lambda=(lambda>UPPER_SINGLE_PRECISION_OF_1?UPPER_SINGLE_PRECISION_OF_1:lambda); //exit(1); } if (backoff) { if (size>1) { if (fstar>0){ pr=fstar; }else { if (lambda<1){ pr = lambda/bo * prob(ng,size-1); }else { MY_ASSERT(lambda1) pr = fstar + lambda * 
prob(ng,size-1); else pr = fstar; } #ifdef MDIADAPTLM_CACHE_ENABLE //probcache insert if (size<=max_caching_level && probcache[size] && ng.size>=size) probcache[size]->add(ng.wordp(size),pr); #endif VERBOSE(3,"mdiadaptlm::prob(ngram ng,int size,double& fstar,double& lambda, double& bo) returning ng:|" << ng << "| pr:" << pr << std::endl); return pr; } int mdiadaptlm::bodiscount(ngram ng_,int size,double& fstar,double& lambda,double& bo) { VERBOSE(3,"mdiadaptlm::bodiscount(ngram ng_,int size,double& fstar,double& lambda,double& bo) ng_:|" << ng_ << "| size:" << size << std::endl); ngram ng(dict); ng.trans(ng_); mdiadaptlm::discount(ng,size,fstar,lambda); bo=1.0; if (backoff) { //get back-off probability if (size>1 && lambda<1) { ngram hg=ng; // cerr<< "hg:|" << hg << "| size:|" << size << "|" << endl; if (! get(hg,size,size-1)){ cerr << "ERROR: int mdiadaptlm::bodiscount(ngram ng_,int size,double& fstar,double& lambda,double& bo) -> get(hg,size,size-1) returns NULL\n"; } MY_ASSERT(get(hg,size,size-1)); bo=boff(hg.link); // if (lambda > bo){ // cerr << " mdiadaptlm::bodiscount ERROR: " << " lambda:" << lambda << " bo:" << bo << "\n"; // exit(1); // } } } return 1; } double mdiadaptlm::txclprob(ngram ng,int size) { double fstar,lambda; if (size>1) { mdiadaptlm::discount(ng,size,fstar,lambda); return fstar + lambda * txclprob(ng,size-1); } else { double freq=1; if ((*ng.wordp(1)!=dict->oovcode()) && get(ng,1,1)) freq+=ng.freq; double N=totfreq()+dict->dub()-dict->size(); return freq/N; } } int mdiadaptlm::netsize() { double fstar,lambda; int size,totsize; ngram ng(dict); cerr << "Computing LM size:\n"; totsize=dict->size() * 2; cout << "1-gram " << totsize << "\n"; for (int i=2; i<=maxlevel(); i++) { size=0; scan(ng,INIT,i); while (scan(ng,CONT,i)) { mdiadaptlm::discount(ng,i,fstar,lambda); if (fstar>0) size++; } size+=size * (i dictionary length repeat [ dictionary length ] { word; } while [ first word != STOP ] { first word number of successors repeat [ number of 
successors ] { second word prob } } STOP while [ first word != STOP ] { first word number of successor sets repeat [ number of successor sets ] { second word number of successors repeat [ number of successors ] { third word prob } } } STOP */ //void writeNull(mfbstream& out,unsigned short nullCode,float nullProb){ // out.writex(&nullCode,sizeof(short)); // out.writex(&nullProb,sizeof(float)); //} int swapbytes(char *p, int sz, int n) { char c,*l,*h; if((n<1) ||(sz<2)) return 0; for(; n--; p+=sz) for(h=(l=p)+sz; --h>l; l++) { c=*h; *h=*l; *l=c; } return 0; }; void fwritex(char *p,int sz,int n,FILE* f) { if(*(short *)"AB"==0x4241) { swapbytes((char*)p, sz,n); } fwrite((char *)p,sz,n,f); if(*(short *)"AB"==0x4241) swapbytes((char*)p, sz,n); } void ifwrite(long loc,void *ptr,int size,int /* unused parameter: n */,FILE* f) { fflush(f); long pos=ftell(f); fseek(f,loc,SEEK_SET); fwritex((char *)ptr,size,1,f); fseek(f,pos,SEEK_SET); fflush(f); } void writeNull(unsigned short nullCode,float nullProb,FILE* f) { fwritex((char *)&nullCode,sizeof(short),1,f); fwritex((char *)&nullProb,sizeof(float),1,f); } int mdiadaptlm::saveASR(char *filename,int /* unused parameter: backoff */,char* subdictfile) { int totbg,tottr; dictionary* subdict; if (subdictfile) subdict=new dictionary(subdictfile); else subdict=dict; // default is subdict=dict typedef unsigned short code; system("date"); if (lmsize()>3 || lmsize()<1) { cerr << "wrong lmsize\n"; exit(1); } if (dict->size()>=0xffff && subdict->size()>=0xffff) { cerr << "save bin requires unsigned short codes\n"; exit(1); } FILE* f=fopen(filename,"w"); double fstar,lambda,boff; float pr; long succ1pos,succ2pos; code succ1,succ2,w,h1,h2; code stop=0xffff; //dictionary //#dictsize w1\n ..wN\n NULL\n code oovcode=subdict->oovcode(); //includes at least NULL code subdictsz=subdict->size()+1; fwritex((char *)&subdictsz,sizeof(code),1,f); subdictsz--; for (w=0; wdecode(w)); fprintf(f,"____\n"); //unigram part //NULL #succ w1 pr1 ..wN prN 
h1=subdictsz; fwritex((char *)&h1,sizeof(code),1,f); //NULL succ1=0; succ1pos=ftell(f); fwritex((char *)&succ1,sizeof(code),1,f); ngram ng(dict); ngram sng(subdict); ng.size=sng.size=1; scan(ng,INIT,1); while(scan(ng,CONT,1)) { sng.trans(ng); if (sng.containsWord(subdict->OOV(),1)) continue; pr=(float)mdiadaptlm::prob(ng,1); if (pr>1e-50) { //do not consider too low probabilities succ1++; w=*sng.wordp(1); fwritex((char *)&w,sizeof(code),1,f); fwritex((char *)&pr,sizeof(float),1,f); } else { cerr << "small prob word " << ng << "\n"; } } // update number of unigrams ifwrite(succ1pos,&succ1,sizeof(code),1,f); cerr << "finito unigrammi " << succ1 << "\n"; fflush(f); if (lmsize()==1) { fclose(f); return 1; } // rest of bigrams // w1 #succ w1 pr1 .. wN prN succ1=0; h1=subdictsz; totbg=subdictsz; ngram hg1(dict,1); ng.size=sng.size=2; scan(hg1,INIT,1); while(scan(hg1,CONT,1)) { if (hg1.containsWord(dict->OOV(),1)) continue; MY_ASSERT((*hg1.wordp(1))size()); *ng.wordp(2)=*hg1.wordp(1); *ng.wordp(1)=0; sng.trans(ng); if (sng.containsWord(dict->OOV(),1)) continue; mdiadaptlm::bodiscount(ng,2,fstar,lambda,boff); if (lambda < 1.0) { h1=*sng.wordp(2); fwritex((char *)&h1,sizeof(code),1,f); succ1=0; succ1pos=ftell(f); fwritex((char *)&succ1,sizeof(code),1,f); ngram shg=hg1; get(shg,1,1); succscan(shg,ng,INIT,2); while(succscan(shg,ng,CONT,2)) { if (*ng.wordp(1)==oovcode) continue; sng.trans(ng); if (sng.containsWord(dict->OOV(),2)) continue; mdiadaptlm::discount(ng,2,fstar,lambda); if (fstar>1e-50) { w=*sng.wordp(1); fwritex((char *)&w,sizeof(code),1,f); pr=(float)mdiadaptlm::prob(ng,2); //cerr << ng << " prob=" << log(pr) << "\n"; fwritex((char *)&pr,sizeof(float),1,f); succ1++; } } if (succ1) { lambda/=boff; //consider backoff writeNull(subdictsz,(float)lambda,f); succ1++; totbg+=succ1; ifwrite(succ1pos,&succ1,sizeof(code),1,f); } else { //go back one word fseek(f,succ1pos-(streampos)sizeof(code),SEEK_SET); } } } fwritex((char *)&stop,sizeof(code),1,f); cerr << " finito 
bigrammi! " << subdictsz << "\n"; fflush(f); system("date"); if (lmsize()<3) { fclose(f); return 1; } //TRIGRAM PART h1=subdictsz; h2=subdictsz; tottr=0; succ1=0; succ2=0; ngram hg2(dict,2); ng.size=sng.size=3; scan(hg1,INIT,1); while(scan(hg1,CONT,1)) { if ((*hg1.wordp(1)==oovcode)) continue; *ng.wordp(3)=*hg1.wordp(1); sng.trans(ng); if (sng.containsWord(dict->OOV(),1)) continue; MY_ASSERT((*sng.wordp(3))OOV(),2)) continue; mdiadaptlm::bodiscount(ng,3,fstar,lambda,boff); if (lambda < 1.0) { h2=*sng.wordp(2); fwritex((char *)&h2,sizeof(code),1,f); succ2=0; succ2pos=ftell(f); fwritex((char *)&succ2,sizeof(code),1,f); ngram shg2=ng; get(shg2,3,2); succscan(shg2,ng,INIT,3); while(succscan(shg2,ng,CONT,3)) { if (*ng.wordp(1)==oovcode) continue; sng.trans(ng); if (sng.containsWord(dict->OOV(),3)) continue; mdiadaptlm::discount(ng,3,fstar,lambda); //pr=(float)mdiadaptlm::prob2(ng,3,fstar); if (fstar>1e-50) { w=*sng.wordp(1); fwritex((char *)&w,sizeof(code),1,f); pr=(float)mdiadaptlm::prob(ng,3); // cerr << ng << " prob=" << log(pr) << "\n"; fwritex((char *)&pr,sizeof(float),1,f); succ2++; } } if (succ2) { lambda/=boff; writeNull(subdictsz,(float)lambda,f); succ2++; tottr+=succ2; ifwrite(succ2pos,&succ2,sizeof(code),1,f); succ1++; } else { //go back one word fseek(f,succ2pos-(long)sizeof(code),SEEK_SET); } } } if (succ1) ifwrite(succ1pos,&succ1,sizeof(code),1,f); else fseek(f,succ1pos-(long)sizeof(code),SEEK_SET); } fwritex((char *)&stop,sizeof(code),1,f); fclose(f); cerr << "Tot bg: " << totbg << " tg: " << tottr<< "\n"; system("date"); return 1; }; ///// Save in IRST MT format int mdiadaptlm::saveMT(char *filename,int backoff, char* subdictfile,int resolution,double decay) { double logalpha=log(decay); dictionary* subdict; if (subdictfile) subdict=new dictionary(subdictfile); else subdict=dict; // default is subdict=dict ngram ng(dict,lmsize()); ngram sng(subdict,lmsize()); cerr << "Adding unigram of OOV word if missing\n"; for (int i=1; i<=maxlevel(); i++) 
*ng.wordp(i)=dict->oovcode(); if (!get(ng,maxlevel(),1)) { cerr << "oov is missing in the ngram-table\n"; // f(oov) = dictionary size (Witten Bell) ng.freq=dict->freq(dict->oovcode()); cerr << "adding oov unigram " << ng << "\n"; put(ng); } cerr << "Eventually adding OOV symbol to subdictionary\n"; subdict->encode(OOV_); system("date"); mfstream out(filename,ios::out); //add special symbols subdict->incflag(1); int bo_code=subdict->encode(BACKOFF_); int du_code=subdict->encode(DUMMY_); subdict->incflag(0); out << "nGrAm " << lmsize() << " " << 0 << " " << "LM_ " << resolution << " " << decay << "\n"; subdict->save(out); //start writing ngrams cerr << "write unigram of oov probability\n"; ng.size=1; *ng.wordp(1)=dict->oovcode(); double pr=(float)mdiadaptlm::prob(ng,1); sng.trans(ng); sng.size=lmsize(); for (int s=2; s<=lmsize(); s++) *sng.wordp(s)=du_code; sng.freq=(int)ceil(pr * (double)10000000)-1; out << sng << "\n"; for (int i=1; i<=lmsize(); i++) { cerr << "LEVEL " << i << "\n"; double fstar,lambda,bo,dummy; scan(ng,INIT,i); while(scan(ng,CONT,i)) { sng.trans(ng); sng.size=lmsize(); for (int s=i+1; s<=lmsize(); s++) *sng.wordp(s)=du_code; if (i>=1 && sng.containsWord(subdict->OOV(),sng.size)) { cerr << "skipping : " << sng << "\n"; continue; } // skip also eos symbols not at the final //if (i>=1 && sng.containsWord(dict->EoS(),sng.size)) //continue; mdiadaptlm::discount(ng,i,fstar,dummy); //out << sng << " fstar " << fstar << " lambda " << lambda << "\n"; //if (i==1 && sng.containsWord(subdict->OOV(),i)){ // cerr << sng << " fstar " << fstar << "\n"; //} if (fstar>0) { double pr=(float)mdiadaptlm::prob(ng,i); if (i>1 && resolution<10000000) { sng.freq=resolution-(int)(log(pr)/logalpha)-1; sng.freq=(sng.freq>=0?sng.freq:0); } else sng.freq=(int)ceil(pr * (double)10000000)-1; out << sng << "\n"; } if (i=0?sng.freq:0); } else sng.freq=(int)ceil(lambda/bo * (double)10000000)-1; out << sng << "\n"; } } } cerr << "LEVEL " << i << "DONE \n"; } return 1; }; ///// Save 
in binary format forbackoff N-gram models int mdiadaptlm::saveBIN_per_word(char *filename,int backoff,char* subdictfile,int mmap) { VERBOSE(2,"mdiadaptlm::saveBIN_per_word START\n"); system("date"); //subdict dictionary* subdict; //accumulated unigram oov prob //CHECK why this is not used (differently from what happens in the other save functions // double oovprob=0; if (subdictfile) subdict=new dictionary(subdictfile); else subdict=dict; // default is subdict=dict if (mmap) { VERBOSE(2,"savebin with memory map: " << filename << "\n"); } else { VERBOSE(2,"savebin: " << filename << "\n"); } int maxlev=lmsize(); streampos pos[LMTMAXLEV+1]; char buff[100]; int isQuant=0; //savebin for quantized LM is not yet implemented //temporary filename to save the LM related to a single term char tmpfilename[BUFSIZ]; //create temporary output file stream to store single levels for all terms MY_ASSERT(strlen(filename)<1000); char tfilename[LMTMAXLEV+1][1000]; mfstream *tout[LMTMAXLEV+1]; tout[0]=NULL; for (int i=1; i<=maxlev; i++) { sprintf(tfilename[i],"%s-%dgrams",filename,i); tout[i]=new mfstream(tfilename[i],ios::out); } // print header in the main output file mfstream out(filename,ios::out); out << "blmt " << maxlev; for (int i=1; i<=maxlev; i++) { //reserve space for ngram statistics (which are not yet avalable) pos[i]=out.tellp(); sprintf(buff," %10d",0); out << buff; } out << "\n"; subdict->save(out); out.flush(); ngram ng(dict,maxlev); ngram oldng(dict,maxlev); ngram locng(dict,maxlev); ngram sng(subdict,maxlev); double fstar,lambda,bo,dummy,dummy2,pr,ibow; double oovprob=0.0; //accumulated unigram oov pro bool _OOV_unigram=false; //flag to check whether an OOV word is present or not //n-gram counters table_entry_pos_t num[LMTMAXLEV+1]; for (int i=1; i<=maxlev; i++) num[i]=0; lmtable* lmt = new lmtable(); lmt->configure(maxlev,isQuant); lmt->setDict(subdict); lmt->expand_level(1,dict->size(),filename,mmap); //main loop for (int w=0; wsize(); w++) { int i=1; //set the 
initial value of level sprintf(tmpfilename,"%s_tmp_%d",filename,w); if (!w % 10000) cerr << "."; //1-gram ngram ung(dict,1); *ung.wordp(1)=w; sng.trans(ung); // frequency pruning is not applied to unigrams /* //exclude words not occurring in the subdictionary if (sng.containsWord(subdict->OOV(),1) && !ung.containsWord(dict->OOV(),1)) continue; */ pr=mdiadaptlm::prob(ung,1); if (sng.containsWord(subdict->OOV(),1) || ung.containsWord(dict->OOV(),1)) { _OOV_unigram=true; oovprob+=pr; //accumulate oov probability continue; } pr=(pr?log10(pr):-99); if (iLOWER_SINGLE_PRECISION_OF_1){ //ngram must be skipped ibow = DONT_PRINT; }else{ if (backoff){ ibow=log10(lambda) - log10(bo); }else{ MY_ASSERT((lambdaLOWER_SINGLE_PRECISION_OF_1) || boaddwithoffset(ung,(float)pr,(float)ibow); } num[i]++; //manage n-grams if (get(ung,1,1)) { //create n-gram with history w *ng.wordp(lmsize())=w; //create sentinel n-gram for (int i=1; i<=maxlev; i++) *oldng.wordp(i)=-1; //create the table for all levels but the level 1, with the maximum number of possible entries for (int i=2; i<=maxlev; i++) lmt->expand_level(i,entries(i),tmpfilename,mmap); scan(ung.link,ung.info,1,ng,INIT,lmsize()); while(scan(ung.link,ung.info,1,ng,CONT,lmsize())) { sng.trans(ng); // convert to subdictionary // locng=ng; // make a local copy //find first internal level that changed int f=maxlev-1; //unigrams have been already covered while (f>1 && (*oldng.wordp(f)==*ng.wordp(f))){ f--; } for (int l=maxlev-(f-1); l<=lmsize(); l++){ locng=ng; // make a local copy if (lOOV(),l)) continue; // skip also n-grams containing eos symbols not at the final if (sng.containsWord(dict->EoS(),l-1)) continue; VERBOSE(3,"mdiadaptlm::saveBIN_per_word(char *filename,int backoff,char* subdictfile ) computing prob for locng:|" << locng << "| size:" << l << std::endl); pr=mdiadaptlm::prob(locng,l,fstar,dummy,dummy2); VERBOSE(3,"mdiadaptlm::saveBIN_per_word(char *filename,int backoff,char* subdictfile ) getting prob locng:|" << locng << "| 
size:" << l << " fstar:" << fstar << " pr:" << pr << std::endl); //PATCH by Nicola (16-04-2008) if (!(pr<=1.0 && pr > 1e-10)) { cerr << ng << " " << pr << "\n"; MY_ASSERT(pr<=1.0); cerr << "prob modified to 1e-10\n"; pr=1e-10; } if (lLOWER_SINGLE_PRECISION_OF_1){ //ngram must be skipped ibow = DONT_PRINT; }else{ if (backoff){ ibow = (float) (log10(lambda) - log10(bo)); }else{ MY_ASSERT((lambdaLOWER_SINGLE_PRECISION_OF_1) || bo=UPPER_SINGLE_PRECISION_OF_0 || ibow!=DONT_PRINT ) { if (lmt->addwithoffset(locng,(float)log10(pr),(float)ibow)){ num[l]++; }else{ continue; } } else{ continue; //skip n-grams with too small fstar } } oldng=ng; } } else{ //create empty tables for all levels but the level 1, to keep consistency with the rest of the code for (int i=2; i<=maxlev; i++) lmt->expand_level(i,0,tmpfilename,mmap); } //level 1 is not modified until everything is done //because it has to contain the full dictionary //which provides the direct access to the second level for (int i=2; i<=lmsize(); i++){ if (i>2) { lmt->checkbounds(i-1); lmt->appendbin_level(i-1, *tout[i-1], mmap); } // now we can resize table at level i lmt->resize_level(i, tmpfilename, mmap); } // now we can save table at level maxlev, if not equal to 1 if (maxlev>1){ lmt->appendbin_level(maxlev, *tout[maxlev], mmap); } //delete levels from 2 to lmsize(); for (int i=2; i<=maxlev; i++) lmt->delete_level(i, tmpfilename, mmap); //update table offsets for (int i=2; i<=maxlev; i++) lmt->update_offset(i,num[i]); } if (_OOV_unigram){ ngram ung(dict,1); *ung.wordp(1)=dict->oovcode(); ibow=0.0; pr=oovprob; pr=(pr?log10(pr):-99); lmt->addwithoffset(ung,(float)pr,(float)ibow); num[1]++; } //close levels from 2 to lmsize() for (int i=2; i<=maxlev; i++) tout[i]->close(); //now we can save level 1, which contains all unigrams //cerr << "saving level 1" << "...\n"; lmt->savebin_level(1, filename, mmap); //update headers for (int i=1; i<=maxlev; i++) { sprintf(buff," %10d",num[i]); out.seekp(pos[i]); out << buff; } 
out.close(); //concatenate files for each single level into one file //single level files should have a name derived from "filename" lmt->compact_all_levels(filename); cerr << "\n"; system("date"); VERBOSE(2,"mdiadaptlm::saveBIN_per_word END\n"); return 1; }; ///// Save in binary format forbackoff N-gram models int mdiadaptlm::saveBIN_per_level(char *filename,int backoff,char* subdictfile,int mmap) { VERBOSE(2,"mdiadaptlm::saveBIN_per_level START\n"); system("date"); //subdict dictionary* subdict; if (subdictfile) subdict=new dictionary(subdictfile); else subdict=dict; // default is subdict=dict if (mmap) { VERBOSE(2,"savebin with memory map: " << filename << "\n"); } else { VERBOSE(2,"savebin: " << filename << "\n"); } int maxlev=lmsize(); streampos pos[LMTMAXLEV+1]; char buff[100]; int isQuant=0; //savebin for quantized LM is not yet implemented // print header fstream out(filename,ios::out); out << "blmt " << maxlev; for (int i=1; i<=maxlev; i++) { //reserve space for ngram statistics (which are not yet avalable) pos[i]=out.tellp(); sprintf(buff," %10d",0); out << buff; } out << "\n"; lmtable* lmt = new lmtable(); lmt->configure(maxlev,isQuant); lmt->setDict(subdict); subdict->save(out); out.flush(); //start adding n-grams to lmtable for (int i=1; i<=maxlev; i++) { cerr << "saving level " << i << "...\n"; table_entry_pos_t numberofentries; if (i==1) { //unigram numberofentries = (table_entry_pos_t) subdict->size(); } else { numberofentries = (table_entry_pos_t) entries(i); } system("date"); lmt->expand_level(i,numberofentries,filename,mmap); double fstar,lambda,bo,dummy,dummy2,pr,ibow; ngram ng(dict,1); ngram ng2(dict); ngram sng(subdict,1); if (i==1) { //unigram case double oovprob=0.0; //accumulated unigram oov pro bool _OOV_unigram=false; //flag to check whether an OOV word is present or not //scan the dictionary for (int w=0; wsize(); w++) { *ng.wordp(1)=w; sng.trans(ng); // frequency pruning is not applied to unigrams pr=mdiadaptlm::prob(ng,i); if 
(sng.containsWord(subdict->OOV(),i) || ng.containsWord(dict->OOV(),i)) { _OOV_unigram=true; oovprob+=pr; //accumulate oov probability continue; } /* if (sng.containsWord(subdict->OOV(),i) && !ng.containsWord(dict->OOV(),i)) { oovprob+=pr; //accumulate oov probability continue; } if (ng.containsWord(dict->OOV(),i)) pr+=oovprob; */ //cerr << ng << " freq " << dict->freq(w) << " - Pr " << pr << "\n"; pr=(pr?log10(pr):-99); /* if (w==dict->oovcode()){ //CHECK whether we can avoid this reassignment because dict should be lmt->getDict() *ng.wordp(1)=lmt->getDict()->oovcode(); ibow=0.0; } else { // } //do nothing */ if (iLOWER_SINGLE_PRECISION_OF_1){ //ngram must be skipped ibow = DONT_PRINT; }else{ if (backoff){ ibow = log10(lambda) - log10(bo); } else{ MY_ASSERT((lambdaLOWER_SINGLE_PRECISION_OF_1) || bodecode(w) << "| pr:" << pr << " ibow:" << ibow << std::endl); if (ibow != DONT_PRINT ) { lmt->add(ng,(float)pr,(float)ibow); } } //add unigram with OOV and its accumulate oov probability if (_OOV_unigram){ *ng.wordp(1)=lmt->getDict()->oovcode(); ibow=0.0; pr=oovprob; pr=(pr?log10(pr):-99); lmt->add(ng,(float)pr,(float)ibow); } } else { //i>1 , bigrams, trigrams, fourgrams... 
*ng.wordp(1)=0; get(ng,1,1); //this scan(ng,INIT,i); while(scan(ng,CONT,i)) { sng.trans(ng); // frequency pruning: skip n-grams with low frequency if (prune_ngram(i,sng.freq)) continue; // skip n-grams containing OOV if (sng.containsWord(subdict->OOV(),i)) continue; // skip also n-grams containing eos symbols not at the final if (sng.containsWord(dict->EoS(),i-1)) continue; // mdiadaptlm::discount(ng,i,fstar,dummy); // pr=mdiadaptlm::prob(ng,i); pr=mdiadaptlm::prob(ng,i,fstar,dummy,dummy2); if (!(pr<=1.0 && pr > 1e-10)) { cerr << ng << " " << pr << "\n"; MY_ASSERT(pr<=1.0); cerr << "prob modified to 1e-10\n"; pr=1e-10; } if (iLOWER_SINGLE_PRECISION_OF_1){ //ngram must be skipped ibow=DONT_PRINT; }else{ if (backoff){ ibow=log10(lambda) - log10(bo); }else{ MY_ASSERT((lambdaLOWER_SINGLE_PRECISION_OF_1) || boadd(ng,(float)log10(pr),(float)ibow); } } } // now we can fix table at level i-1 // now we can save table at level i-1 // now we can remove table at level i-1 if (maxlev>1 && i>1) { lmt->checkbounds(i-1); lmt->savebin_level(i-1, filename, mmap); } // now we can resize table at level i lmt->resize_level(i, filename, mmap); } // now we can save table at level maxlev lmt->savebin_level(maxlev, filename, mmap); //update headers for (int i=1; i<=maxlev; i++) { sprintf(buff," %10d",lmt->getCurrentSize(i)); out.seekp(pos[i]); out << buff; } out.close(); //concatenate files for each single level into one file //single level files should have a name derived from "filename" lmt->compact_all_levels(filename); VERBOSE(2,"mdiadaptlm::saveBIN_per_level END\n"); return 1; } ///// Save in format for ARPA backoff N-gram models int mdiadaptlm::saveARPA_per_word(char *filename,int backoff,char* subdictfile ) { VERBOSE(2,"mdiadaptlm::saveARPA_per_word START\n"); system("date"); //subdict dictionary* subdict; if (subdictfile) subdict=new dictionary(subdictfile); else subdict=dict; // default is subdict=dict //main output file mfstream out(filename,ios::out); int maxlev=lmsize(); 
//create temporary output file stream MY_ASSERT(strlen(filename)<1000); char tfilename[LMTMAXLEV+1][1000]; mfstream *tout[LMTMAXLEV+1]; tout[0]=NULL; for (int i=1; i<=maxlev; i++) { sprintf(tfilename[i],"%s.%d",filename,i); tout[i]=new mfstream(tfilename[i],ios::out); *tout[i] << "\n\\" << i << "-grams:\n"; } ngram ng(dict,lmsize()); ngram oldng(dict,lmsize()); ngram locng(dict,lmsize()); ngram sng(subdict,lmsize()); double fstar,lambda,bo,dummy,dummy2,pr,outLambda; double oovprob=0.0; //accumulated unigram oov pro bool _OOV_unigram=false; //flag to check whether an OOV word is present or not //n-gram counters table_entry_pos_t num[LMTMAXLEV+1]; for (int i=1; i<=maxlev; i++) num[i]=0; //main loop for (int w=0; wsize(); w++) { int i=1; //set the initial value of level if (!w % 10000) cerr << "."; //1-gram ngram ung(dict,1); *ung.wordp(1)=w; sng.trans(ung); // frequency pruning is not applied to unigrams /* //exclude words not occurring in the subdictionary if (sng.containsWord(subdict->OOV(),1) && !ung.containsWord(dict->OOV(),1)) continue; */ pr=mdiadaptlm::prob(ung,1); pr=(pr?log10(pr):-99); //////CHECK if (sng.containsWord(subdict->OOV(),1) || ung.containsWord(dict->OOV(),1)) { _OOV_unigram=true; oovprob+=pr; //accumulate oov probability continue; } if (iLOWER_SINGLE_PRECISION_OF_1){ //ngram must be skipped outLambda = DONT_PRINT; }else{ if (backoff){ outLambda = (float) (log10(lambda) - log10(bo)); } else{ MY_ASSERT((lambdaLOWER_SINGLE_PRECISION_OF_1) || bofreq(w) << " - Pr " << pr << "\n"; *tout[i] << (float) (pr?log10(pr):-99); *tout[i] << "\t" << (char *)dict->decode(w); if (outLambda != DONT_PRINT){ *tout[i] << "\t" << outLambda; } *tout[i] << "\n"; num[i]++; //manage n-grams if (get(ung,1,1)) { //create n-gram with history w *ng.wordp(maxlev)=w; //create sentinel n-gram for (i=1; i<=maxlev; i++) *oldng.wordp(i)=-1; scan(ung.link,ung.info,1,ng,INIT,maxlev); while(scan(ung.link,ung.info,1,ng,CONT,maxlev)) { //cerr << ng << "\n"; sng.trans(ng); // convert to 
subdictionary locng=ng; // make a local copy //find first internal level that changed int f=maxlev-1; //unigrams have been already covered while (f>1 && (*oldng.wordp(f)==*ng.wordp(f))){ f--; } for (int l=maxlev; l>maxlev-f;l--){ if (lOOV(),l)) continue; // skip also n-grams containing eos symbols not at the final if (sng.containsWord(dict->EoS(),l-1)) continue; VERBOSE(3,"mdiadaptlm::saveARPA_per_word(char *filename,int backoff,char* subdictfile ) computing prob for locng:|" << locng << "| size:" << i << std::endl); pr=mdiadaptlm::prob(locng,l,fstar,dummy,dummy2); VERBOSE(3,"mdiadaptlm::saveARPA_per_word(char *filename,int backoff,char* subdictfile ) getting prob locng:|" << locng << "| size:" << i << " fstar:" << fstar << " pr:" << pr << std::endl); //PATCH by Nicola (16-04-2008) if (!(pr<=1.0 && pr > 1e-10)) { cerr << ng << " " << pr << "\n"; MY_ASSERT(pr<=1.0); cerr << "prob modified to 1e-10\n"; pr=1e-10; } if (lLOWER_SINGLE_PRECISION_OF_1){ //ngram must be skipped outLambda = DONT_PRINT; }else{ if (backoff){ outLambda = (float) (log10(lambda) - log10(bo)); }else{ MY_ASSERT((lambdaLOWER_SINGLE_PRECISION_OF_1) || bo=UPPER_SINGLE_PRECISION_OF_0 || outLambda!=DONT_PRINT ) { *tout[l] << (float) log10(pr); *tout[l] << "\t" << (char *)dict->decode(*ng.wordp(i)); for (int j=i-1; j>0; j--) *tout[l] << " " << (char *)dict->decode(*ng.wordp(j)); if (outLambda != DONT_PRINT){ *tout[l] << "\t" << outLambda; } *tout[l] << "\n"; num[l]++; } else{ continue; //skip n-grams with too small fstar } } oldng=ng; } } } if (_OOV_unigram){ pr=oovprob; num[1]++; out << (float) (pr?log10(pr):-99); out << "\t" << "\n"; } //print header out << "\n\\data\\" << "\n"; char buff[100]; for (int i=1; i<=maxlev; i++) { sprintf(buff,"ngram %2d=%10d\n",i,num[i]); out << buff; } out << "\n"; //append and remove temporary files for (int i=1; i<=maxlev; i++) { delete tout[i]; tout[i]=new mfstream(tfilename[i],ios::in); out << tout[i]->rdbuf(); delete tout[i]; removefile(tfilename[i]); } out << 
"\\end\\" << "\n"; cerr << "\n"; system("date"); VERBOSE(2,"mdiadaptlm::saveARPA_per_word END\n"); return 1; }; ///// Save in format for ARPA backoff N-gram models int mdiadaptlm::saveARPA_per_level(char *filename,int backoff,char* subdictfile ) { VERBOSE(2,"mdiadaptlm::saveARPA_per_level START\n"); system("date"); //subdict dictionary* subdict; if (subdictfile) { subdict=new dictionary(subdictfile); } else subdict=dict; // default is subdict=dict fstream out(filename,ios::out); // out.precision(15); int maxlev = lmsize(); streampos pos[LMTMAXLEV+1]; table_entry_pos_t num[LMTMAXLEV+1]; char buff[100]; //print header out << "\n\\data\\" << "\n"; for (int i=1; i<=maxlev; i++) { num[i]=0; pos[i]=out.tellp(); sprintf(buff,"ngram %2d=%10d\n",i,num[i]); out << buff; } out << "\n"; //start writing n-grams for (int i=1; i<=maxlev; i++) { cerr << "saving level " << i << "...\n"; out << "\n\\" << i << "-grams:\n"; double fstar,lambda,bo,dummy,dummy2,pr,outLambda; ngram ng(dict,1); ngram ng2(dict); ngram sng(subdict,1); if (i==1) { //unigram case double oovprob=0.0; //accumulated unigram oov pro bool _OOV_unigram=false; //flag to check whether an OOV word is present or not //scan the dictionary for (int w=0; wsize(); w++) { *ng.wordp(1)=w; sng.trans(ng); // frequency pruning is not applied to unigrams VERBOSE(3,"mdiadaptlm::saveARPA_per_level(char *filename,int backoff,char* subdictfile ) computing prob for ng:|" << ng << "| size:" << i << std::endl); pr=mdiadaptlm::prob(ng,i); VERBOSE(3,"mdiadaptlm::saveARPA_per_level(char *filename,int backoff,char* subdictfile ) getting prob for ng:|" << ng << "| pr:" << pr << std::endl); if (sng.containsWord(subdict->OOV(),i) || ng.containsWord(dict->OOV(),i)) { _OOV_unigram=true; oovprob+=pr; //accumulate oov probability continue; } /* if (sng.containsWord(subdict->OOV(),i) && !ng.containsWord(dict->OOV(),i)) { oovprob+=pr; //accumulate oov probability continue; } if (ng.containsWord(dict->OOV(),i)) pr+=oovprob; */ if 
(iLOWER_SINGLE_PRECISION_OF_1){ //ngram must be skipped outLambda = DONT_PRINT; }else{ if (backoff){ outLambda = (float) (log10(lambda) - log10(bo)); } else{ MY_ASSERT((lambdaLOWER_SINGLE_PRECISION_OF_1) || bodecode(w) << "| pr:" << pr << " outLambda:" << outLambda << std::endl); //cerr << ng << " freq " << dict->freq(w) << " - Pr " << pr << "\n"; out << (float) (pr?log10(pr):-99); out << "\t" << (char *)dict->decode(w); if (outLambda != DONT_PRINT){ out << "\t" << outLambda; } out << "\n"; num[i]++; } //add unigram with OOV and its accumulate oov probability if (_OOV_unigram){ pr=oovprob; num[i]++; out << (float) (pr?log10(pr):-99); out << "\t" << "\n"; } } else { //i>1 , bigrams, trigrams, fourgrams... *ng.wordp(1)=0; get(ng,1,1); //this scan(ng,INIT,i); while(scan(ng,CONT,i)) { sng.trans(ng); // frequency pruning: skip n-grams with low frequency if (prune_ngram(i,sng.freq)) continue; // skip n-grams containing OOV if (sng.containsWord(subdict->OOV(),i)) continue; // skip also n-grams containing eos symbols not at the final if (sng.containsWord(dict->EoS(),i-1)) continue; VERBOSE(3,"mdiadaptlm::saveARPA_per_level(char *filename,int backoff,char* subdictfile ) computing prob for ng:|" << ng << "| size:" << i << std::endl); pr=mdiadaptlm::prob(ng,i,fstar,dummy,dummy2); VERBOSE(3,"mdiadaptlm::saveARPA_per_level(char *filename,int backoff,char* subdictfile ) getting prob ng:|" << ng << "| size:" << i << " fstar:" << fstar << " pr:" << pr << std::endl); //PATCH by Nicola (16-04-2008) if (!(pr<=1.0 && pr > 1e-10)) { cerr << ng << " " << pr << "\n"; MY_ASSERT(pr<=1.0); cerr << "prob modified to 1e-10\n"; pr=1e-10; } if (iLOWER_SINGLE_PRECISION_OF_1){ //ngram must be skipped outLambda = DONT_PRINT; }else{ if (backoff){ outLambda = (float) (log10(lambda) - log10(bo)); }else{ MY_ASSERT((lambdaLOWER_SINGLE_PRECISION_OF_1) || bo=UPPER_SINGLE_PRECISION_OF_0 || outLambda!=DONT_PRINT ) { out << (float) log10(pr); out << "\t" << (char *)dict->decode(*ng.wordp(i)); for (int 
j=i-1; j>0; j--) out << " " << (char *)dict->decode(*ng.wordp(j)); if (outLambda != DONT_PRINT){ out << "\t" << outLambda; } out << "\n"; num[i]++; } } } cerr << i << "grams tot:" << num[i] << "\n"; } streampos last=out.tellp(); //update headers for (int i=1; i<=maxlev; i++) { sprintf(buff,"ngram %2d=%10u\n",i,num[i]); out.seekp(pos[i]); out << buff; } out.seekp(last); out << "\\end\\" << "\n"; system("date"); VERBOSE(2,"mdiadaptlm::saveARPA_per_level END\n"); return 1; }; }//namespace irstlm /* main(int argc,char** argv){ char* dictname=argv[1]; char* backngram=argv[2]; int depth=atoi(argv[3]); char* forengram=argv[4]; char* testngram=argv[5]; dictionary dict(dictname); ngramtable test(&dict,testngram,depth); shiftbeta lm2(&dict,backngram,depth); lm2.train(); //lm2.test(test,depth); mdi lm(&dict,backngram,depth); lm.train(); for (double w=0.0;w<=1.0;w+=0.1){ lm.getforelm(forengram); lm.adapt(w); lm.test(test,depth); } } */ irstlm-6.00.05/src/mdiadapt.h000066400000000000000000000104701263213470300157220ustar00rootroot00000000000000/****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ // Adapted LM classes: extension of interp classes #ifndef MF_MDIADAPTLM_H #define MF_MDIADAPTLM_H #include "ngramcache.h" #include "normcache.h" #include "interplm.h" #define DONT_PRINT 1000000 namespace irstlm { class mdiadaptlm:public interplm { int adaptlev; interplm* forelm; double zeta0; double oovscaling; bool m_save_per_level; static bool mdiadaptlm_cache_enable; protected: normcache *cache; //to improve access speed NGRAMCACHE_t** probcache; NGRAMCACHE_t** backoffcache; int max_caching_level; int saveARPA_per_word(char *filename,int backoff=0,char* subdictfile=NULL); int saveARPA_per_level(char *filename,int backoff=0,char* subdictfile=NULL); int saveBIN_per_word(char *filename,int backoff=0,char* subdictfile=NULL,int mmap=0); int saveBIN_per_level(char *filename,int backoff=0,char* subdictfile=NULL,int mmap=0); public: mdiadaptlm(char* ngtfile,int depth=0,TABLETYPE tt=FULL); inline normcache* get_zetacache() { return cache; } inline NGRAMCACHE_t* get_probcache(int level); inline NGRAMCACHE_t* get_backoffcache(int level); void create_caches(int mcl); void init_caches(); void init_caches(int level); void delete_caches(); void delete_caches(int level); void check_cache_levels(); void check_cache_levels(int level); void reset_caches(); void reset_caches(int level); void caches_stat(); double gis_step; double zeta(ngram ng,int size); int discount(ngram ng,int size,double& fstar,double& lambda,int cv=0); int bodiscount(ngram ng,int size,double& fstar,double& lambda,double& bo); virtual int compute_backoff(); virtual int compute_backoff_per_level(); virtual int compute_backoff_per_word(); double backunig(ngram ng); double foreunig(ngram ng); int adapt(char* ngtfile,int 
alev=1,double gis_step=0.4); int scalefact(char* ngtfile); int savescalefactor(char* filename); double scalefact(ngram ng); double prob(ngram ng,int size); double prob(ngram ng,int size,double& fstar,double& lambda, double& bo); double prob2(ngram ng,int size,double & fstar); double txclprob(ngram ng,int size); int saveASR(char *filename,int backoff,char* subdictfile=NULL); int saveMT(char *filename,int backoff,char* subdictfile=NULL,int resolution=10000000,double decay=0.999900); int saveARPA(char *filename,int backoff=0,char* subdictfile=NULL){ if (m_save_per_level){ cerr << " per level ..."; return saveARPA_per_level(filename, backoff, subdictfile); }else{ cerr << " per word ..."; return saveARPA_per_word(filename, backoff, subdictfile); } } int saveBIN(char *filename,int backoff=0,char* subdictfile=NULL,int mmap=0){ if (m_save_per_level){ cerr << " per level ..."; return saveBIN_per_level(filename, backoff, subdictfile, mmap); }else{ cerr << " per word ..."; return saveBIN_per_word(filename, backoff, subdictfile, mmap); } } inline void save_per_level(bool value){ m_save_per_level=value; } inline bool save_per_level() const { return m_save_per_level; } int netsize(); ~mdiadaptlm(); double myround(double x) { long int value = (long int) x; return (x-value)>0.500?value+1.0:(double)value; } inline static bool is_train_cache_enabled() { VERBOSE(3,"inline static bool is_train_cache_enabled() " << mdiadaptlm_cache_enable << std::endl); return mdiadaptlm_cache_enable; } }; }//namespace irstlm #endif irstlm-6.00.05/src/mempool.cpp000066400000000000000000000207301263213470300161420ustar00rootroot00000000000000// $Id: mempool.cpp 302 2009-08-25 13:04:13Z nicolabertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License 
as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ // An efficient memory pool manager // by M. Federico // Copyright Marcello Federico, ITC-irst, 1998 #include #include #include #include #include #include #include #include "util.h" #include "mempool.h" using namespace std; /*! The pool contains: - entries of size is - tables for bs entries */ mempool::mempool(int is, int bs) { // item size must be multiple of memory alignment step (4 bytes) // example: is is=9 becomes i=12 (9 + 4 - 9 %4 ) is=(is>(int)sizeof(char *)?is:0); is=is + sizeof(char *) - (is % sizeof(char *)); item_size = is; block_size = bs; true_size = is * bs; block_list = new memnode; block_list->block = new char[true_size]; memset(block_list->block,'0',true_size); block_list->next = 0; blocknum = 1; entries = 0; // build free list char *ptr = free_list = block_list->block; for (int i=0; iblock = new char[true_size]; //memset(new_block->block,'0',true_size); new_block->next = block_list; block_list=new_block; // update block list /* update free list */ ptr = free_list = block_list->block; for (int i=0; iblock) || (addr >= (list->block + true_size)))) list=list->next; if ((list==NULL) || (((addr - list->block) % item_size)!=0)) { //cerr << "mempool::free-> addr does not belong to this pool\n"; return 0; } */ *(char **)addr=free_list; free_list=addr; entries--; return 1; } mempool::~mempool() { memnode *ptr; while 
(block_list !=NULL) { ptr=block_list->next; delete [] block_list->block; delete block_list; block_list=ptr; } } void mempool::map (ostream& co) { co << "mempool memory map:\n"; //percorri piu` volte la lista libera memnode *bl=block_list; char *fl=free_list; char* img=new char[block_size+1]; img[block_size]='\0'; while (bl !=NULL) { memset(img,'#',block_size); fl=free_list; while (fl != NULL) { if ((fl >= bl->block) && (fl < bl->block + true_size)) { img[(fl-bl->block)/item_size]='-'; } fl=*(char **)fl; } co << img << "\n"; bl=bl->next; } delete [] img; } void mempool::stat() { VERBOSE(1, "mempool class statistics\n" << "entries " << entries << " blocks " << blocknum << " used memory " << (blocknum * true_size)/1024 << " Kb\n"); } strstack::strstack(int bs) { size=bs; list=new memnode; list->block=new char[size]; list->next=0; memset(list->block,'\0',size); idx=0; waste=0; memory=size; entries=0; blocknum=1; } void strstack::stat() { VERBOSE(1, "strstack class statistics\n" << "entries " << entries << " blocks " << blocknum << " used memory " << memory/1024 << " Kb\n"); } const char *strstack::push(const char *s) { int len=strlen(s); if ((len+1) >= size) { exit_error(IRSTLM_ERROR_DATA, "strstack::push string is too long"); }; if ((idx+len+1) >= size) { //append a new block //there must be space to //put the index after //the word waste+=size-idx; blocknum++; memory+=size; memnode* nd=new memnode; nd->block=new char[size]; nd->next=list; list=nd; memset(list->block,'\0',size); idx=0; } // append in current block strcpy(&list->block[idx],s); idx+=len+1; entries++; return &list->block[idx-len-1]; } const char *strstack::pop() { if (list==0) return 0; if (idx==0) { // free this block and go to next memnode *ptr=list->next; delete [] list->block; delete list; list=ptr; if (list==0) return 0; else idx=size-1; } //go back to first non \0 while (idx>0) if (list->block[idx--]!='\0') break; //go back to first \0 while (idx>0) if (list->block[idx--]=='\0') break; entries--; 
if (list->block[idx+1]=='\0') { idx+=2; memset(&list->block[idx],'\0',size-idx); return &list->block[idx]; } else { idx=0; memset(&list->block[idx],'\0',size); return &list->block[0]; } } const char *strstack::top() { int tidx=idx; memnode *tlist=list; if (tlist==0) return 0; if (idx==0) { tlist=tlist->next; if (tlist==0) return 0; tidx=size-1; } //go back to first non \0 while (tidx>0) if (tlist->block[tidx--]!='\0') break; //aaa\0bbb\0\0\0\0 //go back to first \0 while (tidx>0) if (tlist->block[tidx--]=='\0') break; if (tlist->block[tidx+1]=='\0') { tidx+=2; return &tlist->block[tidx]; } else { tidx=0; return &tlist->block[0]; } } strstack::~strstack() { memnode *ptr; while (list !=NULL) { ptr=list->next; delete [] list->block; delete list; list=ptr; } } storage::storage(int maxsize,int blocksize) { newmemory=0; newcalls=0; setsize=maxsize; poolsize=blocksize; //in bytes poolset=new mempool* [setsize+1]; for (int i=0; i<=setsize; i++) poolset[i]=NULL; } storage::~storage() { for (int i=0; i<=setsize; i++) if (poolset[i]) delete poolset[i]; delete [] poolset; } char *storage::allocate(int size) { if (size<=setsize) { if (!poolset[size]) { poolset[size]=new mempool(size,poolsize/size); } return poolset[size]->allocate(); } else { newmemory+=size+8; newcalls++; char* p=(char *)calloc(sizeof(char),size); if (p==NULL) { exit_error(IRSTLM_ERROR_MEMORY, "storage::alloc insufficient memory"); } return p; } } char *storage::reallocate(char *oldptr,int oldsize,int newsize) { char *newptr; MY_ASSERT(newsize>oldsize); if (oldsize<=setsize) { if (newsize<=setsize) { if (!poolset[newsize]) poolset[newsize]=new mempool(newsize,poolsize/newsize); newptr=poolset[newsize]->allocate(); memset((char*)newptr,0,newsize); } else newptr=(char *)calloc(sizeof(char),newsize); if (oldptr && oldsize) { memcpy(newptr,oldptr,oldsize); poolset[oldsize]->free(oldptr); } } else { newptr=(char *)realloc(oldptr,newsize); if (newptr==oldptr) cerr << "r\b"; else cerr << "a\b"; } if (newptr==NULL) { 
exit_error(IRSTLM_ERROR_MEMORY,"storage::realloc insufficient memory"); } return newptr; } int storage::free(char *addr,int size) { /* while(size<=setsize){ if (poolset[size] && poolset[size]->free(addr)) break; size++; } */ if (size>setsize) return free(addr),1; else { poolset[size] && poolset[size]->free(addr); } return 1; } void storage::stat() { IFVERBOSE(1){ int used=0; int memory=sizeof(char *) * setsize; int waste=0; for (int i=0; i<=setsize; i++) if (poolset[i]) { used++; memory+=poolset[i]->used(); waste+=poolset[i]->wasted(); } VERBOSE(1, "storage class statistics\n" << "alloc entries " << newcalls << " used memory " << newmemory/1024 << "Kb\n" << "mpools " << setsize << " active " << used << " used memory " << memory/1024 << "Kb" << " wasted " << waste/1024 << "Kb\n"); } } irstlm-6.00.05/src/mempool.h000066400000000000000000000114101263213470300156020ustar00rootroot00000000000000// $Id: mempool.h 383 2010-04-23 15:29:28Z nicolabertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ // An efficient memory manager // by M. 
Federico // Copyright Marcello Federico, ITC-irst, 1998 #ifndef MF_MEMPOOL_H #define MF_MEMPOOL_H #ifndef NULL const int NULL=0; #endif #include // std::ostream //! Memory block /*! This can be used by: - mempool to store items of fixed size - strstack to store strings of variable size */ #define MP_BLOCK_SIZE 1000000 class memnode { friend class mempool; //!< grant access friend class strstack; //!< grant access char *block; //!< block of memory memnode *next; //!< next block ptr public: //! Creates a memory node memnode():block(NULL), next(NULL){}; //! Destroys memory node ~memnode(){}; }; //! Memory pool /*! A memory pool is composed of: - a linked list of block_num memory blocks - each block might contain up to block_size items - each item is made of exactly item_size bytes */ class mempool { int block_size; //!< number of entries per block int item_size; //!< number of bytes per entry int true_size; //!< number of bytes per block memnode* block_list; //!< list of blocks char* free_list; //!< free entry list int entries; //!< number of stored entries int blocknum; //!< number of allocated blocks public: //! Creates a memory pool mempool(int is, int bs=MP_BLOCK_SIZE); //! Destroys memory pool ~mempool(); //! Prints a map of memory occupancy void map(std::ostream& co); //! Allocates a single memory entry char *allocate(); //! Frees a single memory entry int free(char* addr); //! Prints statistics about this mempool void stat(); //! Returns effectively used memory (bytes) /*! includes 8 bytes required by each call of new */ int used() const { return blocknum * (true_size + 8); } //! Returns amount of wasted memory (bytes) int wasted() const { return used()-(entries * item_size); } }; //! A stack to store strings /*! 
The stack is composed of - a list of blocks memnode of fixed size - attribute blocknum tells the block on top - attribute idx tells position of the top string */ class strstack { memnode* list; //!< list of memory blocks int size; //!< size of each block int idx; //!< index of last stored string int waste; //!< current waste of memory int memory; //!< current use of memory int entries; //!< current number of stored strings int blocknum; //!< current number of used blocks public: strstack(int bs=1000); ~strstack(); const char *push(const char *s); const char *pop(); const char *top(); void stat(); int used() const { return memory; } int wasted() const { return waste; } }; //! Manages multiple memory pools /*! This class permits to manage memory pools with items up to a specified size. - items within the allowed range are stored in memory pools - items larger than the limit are allocated with new */ class storage { mempool **poolset; //!< array of memory pools int setsize; //!< number of memory pools/maximum elem size int poolsize; //!< size of each block int newmemory; //!< stores amount of used memory int newcalls; //!< stores number of allocated blocks public: //! Creates storage storage(int maxsize,int blocksize); //! Destroys storage ~storage(); /* names of below functions have been changed so as not to interfere with macros for malloc/realloc/etc -- EVH */ //! Allocates memory char *allocate(int size); //! Realloc memory char *reallocate(char *oldptr,int oldsize,int newsize); //! Frees memory of an entry int free(char *addr,int size=0); //! 
Prints statistics about storage void stat(); }; #endif irstlm-6.00.05/src/mfstream.cpp000066400000000000000000000102461263213470300163110ustar00rootroot00000000000000// $Id: mfstream.cpp 294 2009-08-19 09:57:27Z mfederico $ /****************************************************************************** IrstLM: IRST Language Model Toolkit, compile LM Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include "util.h" #include "mfstream.h" #include "gzfilebuf.h" using namespace std; void mfstream::open(const char *name,openmode mode) { char cmode[10]; if (strchr(name,' ')!=0) { if (mode & ios::in) strcpy(cmode,"r"); else if (mode & ios::out) strcpy(cmode,"w"); else if (mode & ios::app) strcpy(cmode,"a"); else { exit_error(IRSTLM_ERROR_IO, "cannot open file"); } _cmd=1; strcpy(_cmdname,name); _FILE=popen(name,cmode); buf=new fdbuf(fileno(_FILE)); iostream::rdbuf((streambuf*) buf); } else { _cmd=0; fstream::open(name,mode); } } void mfstream::close() { if (_cmd==1) { pclose(_FILE); delete buf; } else { fstream::clear(); fstream::close(); } _cmd=2; } int mfstream::swapbytes(char *p, int sz, int n) { char c, *l, *h; if((n<1) ||(sz<2)) return 0; for(; n--; 
p+=sz) for(h=(l=p)+sz; --h>l; l++) { c=*h; *h=*l; *l=c; } return 0; }; mfstream& mfstream::iwritex(streampos loc,void *ptr,int size,int n) { streampos pos=tellp(); seekp(loc); writex(ptr,size,n); seekp(pos); return *this; } mfstream& mfstream::readx(void *p, int sz,int n) { if(!read((char *)p, sz * n)) return *this; if(*(short *)"AB"==0x4241) { swapbytes((char*)p, sz,n); } return *this; } mfstream& mfstream::writex(void *p, int sz,int n) { if(*(short *)"AB"==0x4241) { swapbytes((char*)p, sz,n); } write((char *)p, sz * n); if(*(short *)"AB"==0x4241) swapbytes((char*)p, sz,n); return *this; } //! Tells current position within a file streampos mfstream::tellp() { if (_cmd!=0) exit_error(IRSTLM_ERROR_IO, "mfstream::tellp tellp not allowed on commands"); return (streampos) fstream::tellg(); } //! Seeks a position within a file mfstream& mfstream::seekp(streampos loc) { if (_cmd==0) fstream::seekg(loc); else { exit_error(IRSTLM_ERROR_IO, "mfstream::seekp seekp not allowed on commands"); } return *this; } //! 
Reopens an input stream mfstream& mfstream::reopen() { if (_mode != in) { exit_error(IRSTLM_ERROR_IO, "mfstream::reopen() openmode must be ios:in"); } if (strlen(_cmdname)>0) { char *a=new char[strlen(_cmdname)+1]; strcpy(a,_cmdname); cerr << "close/open " << a <<"\n"; close(); open(a,ios::in); delete []a; } else{ seekp(0); } return *this; } inputfilestream::inputfilestream(const std::string &filePath) : std::istream(0), m_streambuf(0) { //check if file is readable std::filebuf* fb = new std::filebuf(); _good=(fb->open(filePath.c_str(), std::ios::in)!=NULL); if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz") { fb->close(); delete fb; m_streambuf = new gzfilebuf(filePath.c_str()); } else { m_streambuf = fb; } this->init(m_streambuf); } inputfilestream::~inputfilestream() { delete m_streambuf; m_streambuf = 0; } void inputfilestream::close() { } /* int main() { char word[1000]; mfstream inp("cat pp",ios::in); mfbstream outp("aa",ios::out,100); while (inp >> word){ outp << word << "\n"; cout << word << "\n"; } } */ irstlm-6.00.05/src/mfstream.h000066400000000000000000000126161263213470300157610ustar00rootroot00000000000000// $Id: mfstream.h 383 2010-04-23 15:29:28Z nicolabertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit, compile LM Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include #include #include #include #include #include using namespace std; #ifndef MF_STREAM_H #define MF_STREAM_H extern "C" { ssize_t write (int fd, const void* buf, size_t num); ssize_t read (int fd, void* buf, size_t num); FILE *popen(const char *command, const char *type); int pclose(FILE *stream); int fseek( FILE *stream, long offset, int whence); long ftell( FILE *stream); }; //! File description for I/O stream buffer class fdbuf : public std::streambuf { protected: int fd; // file descriptor // write one character virtual int_type overflow (int_type c) { char z = c; if (c != EOF) { if (write (fd, &z, 1) != 1) { return EOF; } } //cerr << "overflow: \n"; //cerr << "pptr: " << (int) pptr() << "\n"; return c; } // write multiple characters virtual std::streamsize xsputn (const char* s, std::streamsize num) { return write(fd,s,num); } virtual streampos seekpos ( streampos /* unused parameter: sp */, ios_base::openmode /* unused parameter: which */= ios_base::in | ios_base::out ) { std::cerr << "mfstream::seekpos is not implemented" << std::endl;; return (streampos) 0; } //read one character virtual int_type underflow () { // is read position before end of buffer? 
if (gptr() < egptr()) { return traits_type::to_int_type(*gptr()); } /* process size of putback area * - use number of characters read * - but at most four */ int numPutback; numPutback = gptr() - eback(); if (numPutback > 4) { numPutback = 4; } /* copy up to four characters previously read into * the putback buffer (area of first four characters) */ std::memmove (buffer+(4-numPutback), gptr()-numPutback, numPutback); // read new characters int num; num = read (fd, buffer+4, bufferSize-4); if (num <= 0) { // ERROR or EOF return EOF; } // reset buffer pointers setg (buffer+(4-numPutback), // beginning of putback area buffer+4, // read position buffer+4+num); // end of buffer // return next character return traits_type::to_int_type(*gptr()); } // read multiple characters virtual std::streamsize xsgetn (char* s, std::streamsize num) { return read(fd,s,num); } static const int bufferSize = 10; // size of the data buffer char buffer[bufferSize]; // data buffer public: // constructor fdbuf (int _fd) : fd(_fd) { setg (buffer+4, // beginning of putback area buffer+4, // read position buffer+4); // end position } }; //! Extension of fstream to commands class mfstream : public std::fstream { protected: fdbuf* buf; int _cmd; openmode _mode; FILE* _FILE; char _cmdname[500]; int swapbytes(char *p, int sz, int n); public: //! Creates and opens a file/command stream without a specified nmode mfstream () : std::fstream(), buf(NULL), _cmd(0), _FILE(NULL) { _cmdname[0]='\0'; } //! Creates and opens a file/command stream in a specified nmode mfstream (const char* name,openmode mode) : std::fstream() { _cmdname[0]='\0'; _mode=mode; open(name,mode); } //! Closes and destroys a file/command stream ~mfstream() { if (_cmd<2) close(); } //! Opens an existing mfstream void open(const char *name,openmode mode); //! Closes an existing mfstream void close(); //! Write function for machine-independent byte order mfstream& writex(void *p, int sz,int n=1); //! 
Read function for machine-independent byte order mfstream& readx(void *p, int sz,int n=1); //! Write function at a given stream position for machine-independent byte order mfstream& iwritex(streampos loc,void *ptr,int size,int n=1); //! Tells current position within a file streampos tellp(); //! Seeks a position within a file mfstream& seekp(streampos loc); //! Reopens an input stream mfstream& reopen(); }; class inputfilestream : public std::istream { protected: std::streambuf *m_streambuf; bool _good; public: inputfilestream(const std::string &filePath); ~inputfilestream(); inline bool good() { return _good; } void close(); }; #endif irstlm-6.00.05/src/mixture.cpp000066400000000000000000000417271263213470300162000ustar00rootroot00000000000000/****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include "mfstream.h" #include "mempool.h" #include "dictionary.h" #include "n_gram.h" #include "ngramtable.h" #include "interplm.h" #include "normcache.h" #include "ngramcache.h" #include "mdiadapt.h" #include "shiftlm.h" #include "linearlm.h" #include "mixture.h" #include "cmd.h" #include "util.h" using namespace std; namespace irstlm { // //Mixture interpolated language model // static Enum_T SLmTypeEnum [] = { { (char*)"ImprovedKneserNey", IMPROVED_KNESER_NEY }, { (char*)"ikn", IMPROVED_KNESER_NEY }, { (char*)"KneserNey", KNESER_NEY }, { (char*)"kn", KNESER_NEY }, { (char*)"ModifiedShiftBeta", MOD_SHIFT_BETA }, { (char*)"msb", MOD_SHIFT_BETA }, { (char*)"ImprovedShiftBeta", IMPROVED_SHIFT_BETA }, { (char*)"isb", IMPROVED_SHIFT_BETA }, { (char*)"InterpShiftBeta", SHIFT_BETA }, { (char*)"ShiftBeta", SHIFT_BETA }, { (char*)"sb", SHIFT_BETA }, { (char*)"InterpShiftOne", SHIFT_ONE }, { (char*)"ShiftOne", SHIFT_ONE }, { (char*)"s1", SHIFT_ONE }, { (char*)"InterpShiftZero", SHIFT_ZERO }, { (char*)"s0", SHIFT_ZERO }, { (char*)"LinearWittenBell", LINEAR_WB }, { (char*)"wb", LINEAR_WB }, { (char*)"Mixture", MIXTURE }, { (char*)"mix", MIXTURE }, END_ENUM }; mixture::mixture(bool fulltable,char* sublminfo,int depth,int prunefreq,char* ipfile,char* opfile): mdiadaptlm((char *)NULL,depth) { prunethresh=prunefreq; ipfname=ipfile; opfname=opfile; usefulltable=fulltable; mfstream inp(sublminfo,ios::in ); if (!inp) { std::stringstream ss_msg; ss_msg << "cannot open " << sublminfo; exit_error(IRSTLM_ERROR_IO, ss_msg.str()); } char line[MAX_LINE]; inp.getline(line,MAX_LINE); sscanf(line,"%d",&numslm); sublm=new interplm* [numslm]; cerr << "WARNING: Parameters PruneSingletons 
(ps) and PruneTopSingletons (pts) are not taken into account for this type of LM (mixture); please specify the singleton pruning policy for each submodel using parameters \"-sps\" and \"-spts\" in the configuraton file\n"; int max_npar=6; for (int i=0; imax_npar){ std::stringstream ss_msg; ss_msg << "Too many parameters (expected " << max_npar << ")"; exit_error(IRSTLM_ERROR_DATA, ss_msg.str()); } par[j] = new char[MAX_LINE]; strcpy(par[j],word); // std::cerr << "par[j]:|" << par[j] << "|" << std::endl; word = strtok(0, wordSeparators); j++; } int actual_npar = j; char *subtrainfile; int slmtype; bool subprunesingletons; bool subprunetopsingletons; char *subprune_thr_str=NULL; int subprunefreq; DeclareParams((char*) "SubLanguageModelType",CMDENUMTYPE|CMDMSG, &slmtype, SLmTypeEnum, "type of the sub LM", "slm",CMDENUMTYPE|CMDMSG, &slmtype, SLmTypeEnum, "type of the sub LM", "sTrainOn",CMDSTRINGTYPE|CMDMSG, &subtrainfile, "training file of the sub LM", "str",CMDSTRINGTYPE|CMDMSG, &subtrainfile, "training file of the sub LM", "sPruneThresh",CMDSUBRANGETYPE|CMDMSG, &subprunefreq, 0, 1000, "threshold for pruning the sub LM", "sp",CMDSUBRANGETYPE|CMDMSG, &subprunefreq, 0, 1000, "threshold for pruning the sub LM", "sPruneSingletons",CMDBOOLTYPE|CMDMSG, &subprunesingletons, "boolean flag for pruning of singletons of the sub LM (default is true)", "sps",CMDBOOLTYPE|CMDMSG, &subprunesingletons, "boolean flag for pruning of singletons of the sub LM (default is true)", "sPruneTopSingletons",CMDBOOLTYPE|CMDMSG, &subprunetopsingletons, "boolean flag for pruning of singletons at the top level of the sub LM (default is false)", "spts",CMDBOOLTYPE|CMDMSG, &subprunetopsingletons, "boolean flag for pruning of singletons at the top level of the sub LM (default is false)", "sPruneFrequencyThreshold",CMDSTRINGTYPE|CMDMSG, &subprune_thr_str, "pruning frequency threshold for each level of the sub LM; comma-separated list of values; (default is \"0 0 ... 
0\", for all levels)", "spft",CMDSTRINGTYPE|CMDMSG, &subprune_thr_str, "pruning frequency threshold for each level of the sub LM; comma-separated list of values; (default is \"0 0 ... 0\", for all levels)", (char *)NULL ); subtrainfile=NULL; slmtype=0; subprunefreq=0; subprunesingletons=true; subprunetopsingletons=false; GetParams(&actual_npar, &par, (char*) NULL); if (!slmtype) { std::stringstream ss_msg; ss_msg << "The type (-slm) for sub LM number " << i+1 << " is not specified" ; exit_error(IRSTLM_ERROR_DATA, ss_msg.str()); } if (!subtrainfile) { std::stringstream ss_msg; ss_msg << "The file (-str) for sub lm number " << i+1 << " is not specified"; exit_error(IRSTLM_ERROR_DATA, ss_msg.str()); } if (subprunefreq==-1) { std::stringstream ss_msg; ss_msg << "The prune threshold (-sp) for sub lm number " << i+1 << " is not specified"; exit_error(IRSTLM_ERROR_DATA, ss_msg.str()); } switch (slmtype) { case LINEAR_WB: sublm[i]=new linearwb(subtrainfile,depth,subprunefreq,IMPROVEDSHIFTBETA_I); break; case SHIFT_BETA: sublm[i]=new shiftbeta(subtrainfile,depth,subprunefreq,-1,SHIFTBETA_I); break; case KNESER_NEY: // lm=new kneserney(subtrainfile,depth,subprunefreq,-1,KNESERNEY_I); break; case MOD_SHIFT_BETA: case IMPROVED_KNESER_NEY: sublm[i]=new improvedkneserney(subtrainfile,depth,subprunefreq,IMPROVEDKNESERNEY_I); break; case IMPROVED_SHIFT_BETA: sublm[i]=new improvedshiftbeta(subtrainfile,depth,subprunefreq,IMPROVEDSHIFTBETA_I); break; case SHIFT_ONE: sublm[i]=new shiftone(subtrainfile,depth,subprunefreq,SIMPLE_I); break; case MIXTURE: sublm[i]=new mixture(usefulltable,subtrainfile,depth,subprunefreq); break; default: exit_error(IRSTLM_ERROR_DATA, "not implemented yet"); }; sublm[i]->prunesingletons(subprunesingletons==true); sublm[i]->prunetopsingletons(subprunetopsingletons==true); if (subprunetopsingletons==true) //apply most specific pruning method sublm[i]->prunesingletons(false); if (subprune_thr_str) sublm[i]->set_prune_ngram(subprune_thr_str); cerr << 
"eventually generate OOV code of sub lm[" << i << "]\n"; sublm[i]->dict->genoovcode(); //create super dictionary dict->augment(sublm[i]->dict); //creates the super n-gram table if(usefulltable) augment(sublm[i]); cerr << "super table statistics\n"; stat(2); } cerr << "eventually generate OOV code of the mixture\n"; dict->genoovcode(); cerr << "dict size of the mixture:" << dict->size() << "\n"; //tying parameters k1=2; k2=10; }; double mixture::reldist(double *l1,double *l2,int n) { double dist=0.0,size=0.0; for (int i=0; idict; cerr << "Computing parameters mapping: ..." << d->size() << " "; pm=new int[d->size()]; //initialize for (int i=0; isize(); i++) pm[i]=0; pmax=k2-k1+1; //update # of parameters for (int w=0; wsize(); w++) { int f=d->freq(w); if ((f>k1) && (f<=k2)) pm[w]=f-k1; else if (f>k2) { pm[w]=pmax++; } } cerr << "pmax " << pmax << " "; return 1; } int mixture::pmap(ngram ng,int lev) { ngram h(sublm[0]->dict); h.trans(ng); if (lev<=1) return 0; //get the last word of history if (!sublm[0]->get(h,2,1)) return 0; return (int) pm[*h.wordp(2)]; } int mixture::savepar(char* opf) { mfstream out(opf,ios::out); cerr << "saving parameters in " << opf << "\n"; out << lmsize() << " " << pmax << "\n"; for (int i=0; i<=lmsize(); i++) for (int j=0; jsize()) { std::stringstream ss_msg; ss_msg << "\nERROR: DUB value is too small: the LM will possibly compute wrong probabilities if sub-LMs have different vocabularies!\n"; ss_msg << "This exception should already have been handled before!!!\n"; exit_error(IRSTLM_ERROR_MODEL, ss_msg.str()); } cerr << "mixlm --> DUB: " << dub() << endl; for (int i=0; i DUB: " << sublm[i]->dub() << endl; cerr << "eventually generate OOV code "; cerr << sublm[i]->dict->encode(sublm[i]->dict->OOV()) << "\n"; sublm[i]->train(); } //initialize parameters for (int i=0; i<=lmsize(); i++) { l[i]=new double*[pmax]; for (int j=0; jdict); for (int lev=1; lev<=lmsize(); lev++) { zf=sublm[0]->zerofreq(lev); cerr << "Starting training at lev:" << lev 
<< "\n"; for (int i=0; iscan(ng,INIT,lev); while(sublm[0]->scan(ng,CONT,lev)) { //do not include oov for unigrams if ((lev==1) && (*ng.wordp(1)==sublm[0]->dict->oovcode())) continue; int par=pmap(ng,lev); used[par]=1; //controllo se aggiornare il parametro if (alive[par]) { double backoff=(lev>1?prob(ng,lev-1):1); //backoff double denom=0.0; double* numer = new double[numslm]; double fstar,lambda; //int cv=(int)floor(zf * (double)ng.freq + rand01()); //int cv=1; //old version of leaving-one-out int cv=(int)floor(zf * (double)ng.freq)+1; //int cv=1; //old version of leaving-one-out //if (lev==3)q //if (iter>10) // cout << ng // << " backoff " << backoff // << " level " << lev // << "\n"; for (int i=0; idiscount(ng,lev,fstar,lambda,(i==0)*(cv)); numer[i]=oldl[par][i]*(fstar + lambda * backoff); ngram ngslm(sublm[i]->dict); ngslm.trans(ng); if ((*ngslm.wordp(1)==sublm[i]->dict->oovcode()) && (dict->dub() > sublm[i]->dict->size())) numer[i]/=(double)(dict->dub() - sublm[i]->dict->size()); denom+=numer[i]; } for (int i=0; i10) //cout << ng << " l: " << l[lev][par][i] << "\n"; } delete []numer; } } //normalize all parameters totalive=0; for (int i=0; idiscount(ng,size,fstar2,lambda2,0); ngram ngslm(sublm[i]->dict); ngslm.trans(ng); if (dict->dub() > sublm[i]->dict->size()){ if (*ngslm.wordp(1) == sublm[i]->dict->oovcode()) { fstar2/=(double)(sublm[i]->dict->dub() - sublm[i]->dict->size()+1); } } fstar+=(l[size][p][i]*fstar2); lambda+=(l[size][p][i]*lambda2); lsum+=l[size][p][i]; } if (dict->dub() > dict->size()) if (*ng.wordp(1) == dict->oovcode()) { fstar*=(double)(dict->dub() - dict->size()+1); } MY_ASSERT((lsum>LOWER_DOUBLE_PRECISION_OF_1) && (lsum<=UPPER_DOUBLE_PRECISION_OF_1)); return 1; } //creates the ngramtable on demand from the sublm tables int mixture::get(ngram& ng,int n,int lev) { if (usefulltable) { return ngramtable::get(ng,n,lev); } //free current tree resetngramtable(); //get 1-word prefix from ng ngram ug(dict,1); *ug.wordp(1)=*ng.wordp(ng.size); 
//local ngram to upload entries ngram locng(dict,maxlevel()); //allocate subtrees from sublm for (int i=0; idict,1); subug.trans(ug); if (sublm[i]->get(subug,1,1)) { ngram subng(sublm[i]->dict,maxlevel()); *subng.wordp(maxlevel())=*subug.wordp(1); sublm[i]->scan(subug.link,subug.info,1,subng,INIT,maxlevel()); while(sublm[i]->scan(subug.link,subug.info,1,subng,CONT,maxlevel())) { locng.trans(subng); put(locng); } } } return ngramtable::get(ng,n,lev); } }//namespace irstlm irstlm-6.00.05/src/mixture.h000066400000000000000000000043121263213470300156320ustar00rootroot00000000000000/****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ // Mixture of linear interpolation LMs #ifndef LM_MIXTURE #define LM_MIXTURE namespace irstlm { class mixture: public mdiadaptlm { double** l[MAX_NGRAM]; //interpolation parameters int* pm; //parameter mappings int pmax; //#parameters int k1,k2; //two thresholds int numslm; int prunethresh; interplm** sublm; char *ipfname; char *opfname; double reldist(double *l1,double *l2,int n); int genpmap(); int pmap(ngram ng,int lev); public: bool usefulltable; mixture(bool fulltable,char *sublminfo,int depth,int prunefreq=0,char* ipfile=NULL,char* opfile=NULL); int train(); int savepar(char* opf); int loadpar(char* opf); inline int dub() { return dict->dub(); } inline int dub(int value) { for (int i=0; idub(value); } return dict->dub(value); } void settying(int a,int b) { k1=a; k2=b; } int discount(ngram ng,int size,double& fstar,double& lambda,int cv=0); ~mixture(){ for (int i=0;i<=lmsize();i++){ for (int j=0; j #include #include #include #include #include "util.h" #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "index.h" using namespace std; ngram::ngram(dictionary* d,int sz) { dict=d; size=sz; succ=0; freq=0; info=0; pinfo=0; link=NULL; isym=-1; memset(word,0,sizeof(int)*MAX_NGRAM); memset(midx,0,sizeof(int)*MAX_NGRAM); memset(path,0,sizeof(char *)*MAX_NGRAM); } ngram::ngram(const ngram& ng) { size=ng.size; freq=ng.freq; succ=0; info=0; pinfo=0; link=NULL; isym=-1; dict=ng.dict; memcpy(word,ng.word,sizeof(int)*MAX_NGRAM); memcpy(midx,ng.word,sizeof(int)*MAX_NGRAM); } int ngram::containsWord(const char* s,int lev) { int c=dict->encode(s); if (c == -1) return 0; MY_ASSERT(lev <= size); for (int i=0; iencode(ng.dict->decode(*ng.wordp(i))); } } 
void ngram::invert (const ngram& ng) { size=ng.size; for (int i=1; i<=size; i++) { *wordp(i)=*ng.wordp(size-i+1); } } void ngram::shift () { memmove((void *)&word[MAX_NGRAM-size+1],(void *)&word[MAX_NGRAM-size],(size-1) * sizeof(int)); size--; } void ngram::shift (int sz) { if (sz>size) sz=size; memmove((void *)&word[MAX_NGRAM-size+sz],(void *)&word[MAX_NGRAM-size],(size-sz) * sizeof(int)); size-=sz; } ifstream& operator>> ( ifstream& fi , ngram& ng) { char w[MAX_WORD]; memset(w,0,MAX_WORD); w[0]='\0'; if (!(fi >> setw(MAX_WORD) >> w)) return fi; if (strlen(w)==(MAX_WORD-1)) cerr << "ngram: a too long word was read (" << w << ")\n"; int c=ng.dict->encode(w); if (c == -1 ) { std::stringstream ss_msg; ss_msg << "ngram: " << w << " is OOV"; exit_error(IRSTLM_ERROR_MODEL, ss_msg.str()); } memcpy(ng.word,ng.word+1,(MAX_NGRAM-1)*sizeof(int)); ng.word[MAX_NGRAM-1]=(int)c; ng.freq=1; if (ng.sizeencode(w); if (c == -1 ) { cerr << "ngram: " << w << " is OOV \n"; exit(1); } pushc(c); return 1; } int ngram::pushc(int c) { size++; if (size>MAX_NGRAM) size=MAX_NGRAM; size_t len = size - 1; //i.e. 
if size==MAX_NGRAM, the farthest position is lost size_t src = MAX_NGRAM - len; memmove((void *)&word[src - 1],(void *)&word[src], len * sizeof(int)); word[MAX_NGRAM-1]=c; // fill the most recent position return 1; } int ngram::pushc(int* codes, int codes_len) { //copy the first codes_len elements from codes into the actual ngram; sz must be smaller than MAX_NGRAM //shift codes_len elements of the ngram backwards MY_ASSERT (codes_len <= MAX_NGRAM); size+=codes_len; if (size>MAX_NGRAM) size=MAX_NGRAM; size_t len = size - codes_len; size_t src = MAX_NGRAM - len; if (len > 0) memmove((void *)&word[src - codes_len],(void *)&word[src], len * sizeof(int)); memcpy((void *)&word[MAX_NGRAM - codes_len],(void*)&codes[0],codes_len*sizeof(int)); return 1; } int ngram::ckhisto(int sz) { for (int i=sz; i>1; i--) if (*wordp(i)==dict->oovcode()) return 0; return 1; } bool ngram::operator==(const ngram &compare) const { if ( size != compare.size || dict != compare.dict) return false; else for (int i=size; i>0; i--) if (word[MAX_NGRAM-i] != compare.word[MAX_NGRAM-i]) return false; return true; } bool ngram::operator!=(const ngram &compare) const { if ( size != compare.size || dict != compare.dict) return true; else for (int i=size; i>0; i--) if (word[MAX_NGRAM-i] != compare.word[MAX_NGRAM-i]) return true; return false; } istream& operator>> ( istream& fi , ngram& ng) { char w[MAX_WORD]; memset(w,0,MAX_WORD); w[0]='\0'; MY_ASSERT(ng.dict != NULL); if (!(fi >> setw(MAX_WORD) >> w)) return fi; if (strlen(w)==(MAX_WORD-1)) cerr << "ngram: a too long word was read (" << w << ")\n"; ng.pushw(w); ng.freq=1; return fi; } ofstream& operator<< (ofstream& fo,ngram& ng) { MY_ASSERT(ng.dict != NULL); for (int i=ng.size; i>0; i--) fo << ng.dict->decode(ng.word[MAX_NGRAM-i]) << (i>1?" ":""); fo << "\t" << ng.freq; return fo; } ostream& operator<< (ostream& fo,ngram& ng) { MY_ASSERT(ng.dict != NULL); for (int i=ng.size; i>0; i--) fo << ng.dict->decode(ng.word[MAX_NGRAM-i]) << (i>1?" 
":""); fo << "\t" << ng.freq; return fo; } /* main(int argc, char** argv){ dictionary d(argv[1]); ifstream txt(argv[1]); ngram ng(&d); while (txt >> ng){ std::cout << ng << "\n"; } ngram ng2=ng; cerr << "copy last =" << ng << "\n"; } */ irstlm-6.00.05/src/n_gram.h000066400000000000000000000072511263213470300154050ustar00rootroot00000000000000// $Id: n_gram.h 3461 2010-08-27 10:17:34Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ // n-gram tables // by M. 
Federico // Copyright Marcello Federico, ITC-irst, 1998 #ifndef MF_NGRAM_H #define MF_NGRAM_H #include #include "util.h" #include "dictionary.h" #ifndef MYMAXNGRAM #define MYMAXNGRAM 20 #endif #define MAX_NGRAM MYMAXNGRAM class dictionary; //typedef int code; class ngram { int word[MAX_NGRAM]; //encoded ngram public: dictionary *dict; // dictionary char* link; // ngram-tree pointer char* succlink; // pointer to the first successor int midx[MAX_NGRAM]; // ngram-tree scan pointer char* path[MAX_NGRAM]; // path in the ngram-trie float bowv[MAX_NGRAM]; // vector of bow found in the trie int lev; // ngram-tree level int size; // ngram size long long freq; // ngram frequency or integer prob int succ; // number of successors float bow; // back-off weight float prob; // probability unsigned char info; // ngram-tree info flags unsigned char pinfo; // ngram-tree parent info flags int isym; // last interruption symbol ngram(dictionary* d,int sz=0); ngram(const ngram& ng); inline int *wordp() { // n-gram pointer return wordp(size); } inline int *wordp(int k) { // n-gram pointer return size>=k?&word[MAX_NGRAM-k]:0; } inline const int *wordp() const { // n-gram pointer return wordp(size); } inline const int *wordp(int k) const { // n-gram pointer return size>=k?&word[MAX_NGRAM-k]:0; } int containsWord(const char* s,int lev); void trans(const ngram& ng); void invert (const ngram& ng); void shift (); void shift (int sz); friend std::ifstream& operator>> (std::ifstream& fi,ngram& ng); friend std::ofstream& operator<< (std::ofstream& fi,ngram& ng); friend std::istream& operator>> (std::istream& fi,ngram& ng); friend std::ostream& operator<< (std::ostream& fi,ngram& ng); bool operator==(const ngram &compare) const; bool operator!=(const ngram &compare) const; /* friend bool operator==(const ngram &compare) const { if ( size != compare.size || dict != compare.dict) return false; else for (int i=size; i>0; i--) if (word[MAX_NGRAM-i] != compare.word[MAX_NGRAM-i]) return false; return 
true; } inline bool operator!=(const ngram &compare) const { if ( size != compare.size || dict != compare.dict) return true; else for (int i=size; i>0; i--) if (word[MAX_NGRAM-i] != compare.word[MAX_NGRAM-i]) return true; return false; } */ int ckhisto(int sz); int pushc(int c); int pushc(int* codes, int sz); int pushw(const char* w); //~ngram(); }; #endif irstlm-6.00.05/src/ngramcache.cpp000066400000000000000000000102011263213470300165520ustar00rootroot00000000000000// $Id: ngramcache.cpp 3679 2010-10-13 09:10:01Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include #include #include #include #include "math.h" #include "mempool.h" #include "htable.h" #include "lmtable.h" #include "util.h" #include "ngramcache.h" using namespace std; void ngramcache::print (const int* ngp) { std::cerr << "ngp: size:" << ngsize << "|"; for (int i=0; i((size_t) (maxn/load_factor), ngsize * sizeof(int)); //decrease the lower load factor to reduce collision mp=new mempool(ngsize * sizeof(int)+infosize,MP_BLOCK_SIZE); accesses=0; hits=0; }; ngramcache::~ngramcache() { delete ht; delete mp; }; //resize cache to specified number of entries void ngramcache::reset(int n) { //ht->stat(); delete ht; delete mp; if (n>0) maxn=n; ht=new htable ((size_t) (maxn/load_factor), ngsize * sizeof(int)); //decrease the lower load factor to reduce collision mp=new mempool(ngsize * sizeof(int)+infosize,MP_BLOCK_SIZE); entries=0; }; char* ngramcache::get(const int* ngp,char*& info) { char* found; accesses++; if ((found=(char*) ht->find((int *)ngp))) { memcpy(&info,found+ngsize*sizeof(int),infosize); hits++; } return found; }; char* ngramcache::get(const int* ngp,double& info) { char *found; accesses++; if ((found=(char*) ht->find((int *)ngp))) { memcpy(&info,found+ngsize*sizeof(int),infosize); hits++; }; return found; }; char* ngramcache::get(const int* ngp,prob_and_state_t& info) { char *found; accesses++; if ((found=(char*) ht->find((int *)ngp))) { memcpy(&info,found+ngsize*sizeof(int),infosize); hits++; } return found; }; int ngramcache::add(const int* ngp,const char*& info) { char* entry=mp->allocate(); memcpy(entry,(char*) ngp,sizeof(int) * ngsize); memcpy(entry + ngsize * sizeof(int),&info,infosize); char* found=(char*)ht->insert((int 
*)entry); MY_ASSERT(found == entry); //false if key is already inside entries++; return 1; }; int ngramcache::add(const int* ngp,const double& info) { char* entry=mp->allocate(); memcpy(entry,(char*) ngp,sizeof(int) * ngsize); memcpy(entry + ngsize * sizeof(int),&info,infosize); char *found=(char*) ht->insert((int *)entry); MY_ASSERT(found == entry); //false if key is already inside entries++; return 1; }; int ngramcache::add(const int* ngp,const prob_and_state_t& info) { char* entry=mp->allocate(); memcpy(entry,(char*) ngp,sizeof(int) * ngsize); memcpy(entry + ngsize * sizeof(int),&info,infosize); char *found=(char*) ht->insert((int *)entry); MY_ASSERT(found == entry); //false if key is already inside entries++; return 1; }; void ngramcache::stat() const { std::cout << "ngramcache stats: entries=" << entries << " acc=" << accesses << " hits=" << hits << " ht.used= " << ht->used() << " mp.used= " << mp->used() << " mp.wasted= " << mp->wasted() << "\n"; }; irstlm-6.00.05/src/ngramcache.h000066400000000000000000000063421263213470300162320ustar00rootroot00000000000000// $Id: ngramcache.h 3679 2010-10-13 09:10:01Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #ifndef MF_NGRAMCACHE_H #define MF_NGRAMCACHE_H #include "mempool.h" #include "htable.h" #include "util.h" #define NGRAMCACHE_t ngramcache #define NGRAMCACHE_LOAD_FACTOR 0.5 typedef struct PROB_AND_STATE_ENTRY { double logpr; //!< probability value of an ngram ngram_state_t ngramstate; //!< index of the largest n-gram contained in the LM table. char* state; //!< the largest suffix of an n-gram contained in the LM table. unsigned int statesize; //!< LM statesize of an ngram double bow; //!< backoff weight int bol; //!< backoff level bool extendible; //!< flag for extendibility of the ngram double lastbow; //!< bow weight of the deepest found ngram // PROB_AND_STATE_ENTRY(double lp=0.0, char* st=NULL, unsigned int stsz=0, double bw=0.0, int bl=0, bool ex=false): logpr(lp), state(st), statesize(stsz), bow(bw), bol(bl), extendible(ex) {}; //initializer PROB_AND_STATE_ENTRY(double lp=0.0, ngram_state_t ngramst=0, char* st=NULL, unsigned int stsz=0, double bw=0.0, int bl=0, bool ex=false, double lstbw=0): logpr(lp), ngramstate(ngramst), state(st), statesize(stsz), bow(bw), bol(bl), extendible(ex), lastbow(lstbw) {}; //initializer } prob_and_state_t; void print(prob_and_state_t* pst, std::ostream& out=std::cout); class ngramcache { private: static const bool debug=true; htable* ht; mempool *mp; int maxn; int ngsize; int infosize; int accesses; int hits; int entries; float load_factor; //!< ngramcache loading factor void print(const int*); public: ngramcache(int n,int size,int maxentries,float lf=NGRAMCACHE_LOAD_FACTOR); ~ngramcache(); inline int cursize() const { return entries; } inline int maxsize() const { return maxn; } void reset(int n=0); char* get(const int* ngp,char*& info); 
char* get(const int* ngp,double& info); char* get(const int* ngp,prob_and_state_t& info); int add(const int* ngp,const char*& info); int add(const int* ngp,const double& info); int add(const int* ngp,const prob_and_state_t& info); inline int isfull() const { return (entries >= maxn); } void stat() const; inline void used() const { stat(); }; inline float set_load_factor(float value) { return load_factor=value; } }; #endif irstlm-6.00.05/src/ngramtable.cpp000066400000000000000000001202211263213470300166020ustar00rootroot00000000000000// $Id: ngramtable.cpp 35 2010-07-19 14:52:11Z nicolabertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit, compile LM Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include "util.h" #include "mfstream.h" #include "math.h" #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "ngramtable.h" #include "crc.h" using namespace std; tabletype::tabletype(TABLETYPE tt,int codesize) { if (codesize<=4 && codesize>0) CODESIZE=codesize; else { exit_error(IRSTLM_ERROR_DATA,"ngramtable wrong codesize"); } code_range[1]=255; code_range[2]=65535; code_range[3]=16777214; code_range[4]=2147483640; code_range[6]=140737488360000LL; //stay below true limit // code_range[6]=281474977000000LL; //stay below true limit //information which is useful to initialize //LEAFPROB tables L_FREQ_SIZE=FREQ1; WORD_OFFS =0; MSUCC_OFFS =CODESIZE; MTAB_OFFS =MSUCC_OFFS+CODESIZE; FLAGS_OFFS =MTAB_OFFS+PTRSIZE; switch (tt) { case COUNT: SUCC1_OFFS =0; SUCC2_OFFS =0; BOFF_OFFS =0; I_FREQ_OFFS=FLAGS_OFFS+CHARSIZE; I_FREQ_NUM=1; L_FREQ_NUM=1; ttype=tt; break; case FULL: case IMPROVEDKNESERNEY_B: case IMPROVEDSHIFTBETA_B: SUCC1_OFFS =FLAGS_OFFS+CHARSIZE; SUCC2_OFFS =SUCC1_OFFS+CODESIZE; BOFF_OFFS =SUCC2_OFFS+CODESIZE; I_FREQ_OFFS=BOFF_OFFS+INTSIZE; L_FREQ_OFFS=CODESIZE; I_FREQ_NUM=2; L_FREQ_NUM=1; ttype=tt; break; case IMPROVEDKNESERNEY_I: case IMPROVEDSHIFTBETA_I: SUCC1_OFFS =FLAGS_OFFS+CHARSIZE; SUCC2_OFFS =SUCC1_OFFS+CODESIZE; BOFF_OFFS =0; I_FREQ_OFFS=SUCC2_OFFS+CODESIZE; L_FREQ_OFFS=CODESIZE; I_FREQ_NUM=2; L_FREQ_NUM=1; ttype=tt; break; case SIMPLE_I: SUCC1_OFFS = 0; SUCC2_OFFS = 0; BOFF_OFFS = 0; I_FREQ_OFFS= FLAGS_OFFS+CHARSIZE; L_FREQ_OFFS=CODESIZE; I_FREQ_NUM=1; L_FREQ_NUM=1; ttype=tt; break; case SIMPLE_B: SUCC1_OFFS = 0; SUCC2_OFFS = 0; BOFF_OFFS = FLAGS_OFFS+CHARSIZE; I_FREQ_OFFS = BOFF_OFFS+INTSIZE; L_FREQ_OFFS = CODESIZE; 
I_FREQ_NUM = 1; L_FREQ_NUM = 1; ttype=tt; break; case KNESERNEY_I: case SHIFTBETA_I: SUCC1_OFFS = FLAGS_OFFS+CHARSIZE; SUCC2_OFFS = 0; BOFF_OFFS = 0; I_FREQ_OFFS= SUCC1_OFFS+CODESIZE; L_FREQ_OFFS=CODESIZE; I_FREQ_NUM=1; L_FREQ_NUM=1; ttype=tt; break; case KNESERNEY_B: case SHIFTBETA_B: SUCC1_OFFS = FLAGS_OFFS+CHARSIZE; SUCC2_OFFS = 0; BOFF_OFFS = SUCC1_OFFS+CODESIZE; I_FREQ_OFFS = BOFF_OFFS+INTSIZE; L_FREQ_OFFS = CODESIZE; I_FREQ_NUM = 1; L_FREQ_NUM = 1; ttype=tt; break; case LEAFPROB: case FLEAFPROB: SUCC1_OFFS = 0; SUCC2_OFFS = 0; BOFF_OFFS = 0; I_FREQ_OFFS = FLAGS_OFFS+CHARSIZE; I_FREQ_NUM = 0; L_FREQ_NUM = 1; ttype=tt; break; case LEAFPROB2: SUCC1_OFFS =0; SUCC2_OFFS =0; BOFF_OFFS =0; I_FREQ_OFFS=FLAGS_OFFS+CHARSIZE; I_FREQ_NUM=0; L_FREQ_NUM=2; ttype=LEAFPROB; break; case LEAFPROB3: SUCC1_OFFS =0; SUCC2_OFFS =0; BOFF_OFFS =0; I_FREQ_OFFS=FLAGS_OFFS+CHARSIZE; I_FREQ_NUM=0; L_FREQ_NUM=3; ttype=LEAFPROB; break; case LEAFPROB4: SUCC1_OFFS =0; SUCC2_OFFS =0; BOFF_OFFS =0; I_FREQ_OFFS=FLAGS_OFFS+CHARSIZE; I_FREQ_NUM=0; L_FREQ_NUM=4; ttype=LEAFPROB; break; default: MY_ASSERT(tt==COUNT); } L_FREQ_OFFS=CODESIZE; }; ngramtable::ngramtable(char* filename,int maxl,char* /* unused parameter: is */, dictionary* extdict /* external dictionary */,char* filterdictfile, int googletable,int dstco,char* hmask, int inplen,TABLETYPE ttype, int codesize): tabletype(ttype,codesize) { cerr << "[codesize " << CODESIZE << "]\n"; char header[100]; info[0]='\0'; corrcounts=0; if (filename) { int n; mfstream inp(filename,ios::in ); inp >> header; if (strncmp(header,"nGrAm",5)==0 || strncmp(header,"NgRaM",5)==0) { inp >> n; inp >> card; inp >> info; if (strcmp(info,"LM_")==0) { inp >> resolution; inp >> decay; char info2[100]; sprintf(info2,"%s %d %f",info,resolution,decay); strcpy(info, info2); } else { //default for old LM probs resolution=10000000; decay=0.9999; } maxl=n; //owerwrite maxl cerr << n << " " << card << " " << info << "\n"; } inp.close(); } if (!maxl) { 
exit_error(IRSTLM_ERROR_DATA,"ngramtable: ngram size must be specified"); } //distant co-occurreces works for bigrams and trigrams if (dstco && (maxl!=2) && (maxl!=3)) { exit_error(IRSTLM_ERROR_DATA,"distant co-occurrences work with 2-gram and 3-gram"); } maxlev=maxl; //Root not must have maximum frequency size treeflags=INODE | FREQ6; tree=(node) new char[inodesize(6)]; memset(tree,0,inodesize(6)); //1-gram table initial flags if (maxlev>1) mtflags(tree,INODE | FREQ4); else if (maxlev==1) mtflags(tree,LNODE | FREQ4); else { exit_error(IRSTLM_ERROR_DATA,"ngramtable: wrong level setting"); } word(tree,0); // dummy variable if (I_FREQ_NUM) freq(tree,treeflags,0); // frequency of all n-grams msucc(tree,0); // number of different n-grams mtable(tree,NULL); // table of n-gram mem=new storage(256,10000); mentr=new long long[maxlev+1]; memory= new long long[maxlev+1]; occupancy= new long long[maxlev+1]; //Book keeping of occupied memory mentr[0]=1; memory[0]=inodesize(6); // root is an inode with highest frequency occupancy[0]=inodesize(6); // root is an inode with highest frequency for (int i=1; i<=maxlev; i++) mentr[i]=memory[i]=occupancy[i]=0; dict=new dictionary(NULL,1000000); if (!filename) return ; filterdict=NULL; if (filterdictfile) { filterdict=new dictionary(filterdictfile,1000000); /* filterdict->incflag(1); filterdict->encode(BOS_); filterdict->encode(EOS_); filterdict->incflag(0); */ } // switch to specific loading methods if ((strncmp(header,"ngram",5)==0) || (strncmp(header,"NGRAM",5)==0)) { exit_error(IRSTLM_ERROR_DATA,"this ngram file format is no more supported"); } if (strncmp(header,"nGrAm",5)==0) loadtxt(filename); else if (strncmp(header,"NgRaM",5)==0) loadbin(filename); else if (dstco>0) generate_dstco(filename,dstco); else if (hmask != NULL) generate_hmask(filename,hmask,inplen); else if (googletable) loadtxt(filename,googletable); else generate(filename,extdict); if (tbtype()==LEAFPROB) { du_code=dict->encode(DUMMY_); 
bo_code=dict->encode(BACKOFF_); } } void ngramtable::savetxt(char *filename,int depth,bool googleformat,bool hashvalue,int startfrom) { char ngstring[10000]; if (depth>maxlev) { exit_error(IRSTLM_ERROR_DATA,"ngramtable::savetxt: wrong n-gram size"); } if (startfrom>0 && !googleformat) { exit_error(IRSTLM_ERROR_DATA, "ngramtable::savetxt: multilevel output only allowed in googleformat"); } depth=(depth>0?depth:maxlev); card=mentr[depth]; ngram ng(dict); if (googleformat) cerr << "savetxt in Google format: nGrAm " << depth << " " << card << " " << info << "\n"; else cerr << "savetxt: nGrAm " << depth << " " << card << " " << info << "\n"; mfstream out(filename,ios::out ); if (!googleformat){ out << "nGrAm " << depth << " " << card << " " << info << "\n"; dict->save(out); } if (startfrom<=0 || startfrom > depth) startfrom=depth; for (int d=startfrom;d<=depth;d++){ scan(ng,INIT,d); while(scan(ng,CONT,d)){ if (hashvalue){ strcpy(ngstring,ng.dict->decode(*ng.wordp(ng.size))); for (int i=ng.size-1; i>0; i--){ strcat(ngstring," "); strcat(ngstring,ng.dict->decode(*ng.wordp(i))); } out << ngstring << "\t" << ng.freq << "\t" << crc16_ccitt(ngstring,strlen(ngstring)) << "\n"; } else out << ng << "\n"; } } cerr << "\n"; out.close(); } void ngramtable::loadtxt(char *filename,int googletable) { ngram ng(dict);; cerr << "loadtxt:" << (googletable?"google format":"std table"); mfstream inp(filename,ios::in); int i,c=0; if (googletable) { dict->incflag(1); } else { char header[100]; inp.getline(header,100); cerr << header ; dict->load(inp); } while (!inp.eof()) { for (i=0; i> ng; inp >> ng.freq; if (ng.size==0) continue; //update dictionary frequency when loading from if (googletable) dict->incfreq(*ng.wordp(1),ng.freq); // if filtering dictionary exists // and if the first word of the ngram does not belong to it // do not insert the ngram if (filterdict) { int code=filterdict->encode(dict->decode(*ng.wordp(maxlev))); if (code!=filterdict->oovcode()) put(ng); } else put(ng); 
ng.size=0; if (!(++c % 1000000)) cerr << "."; } if (googletable) { dict->incflag(0); } cerr << "\n"; inp.close(); } void ngramtable::savebin(mfstream& out,node nd,NODETYPE ndt,int lev,int mlev) { out.write(nd+WORD_OFFS,CODESIZE); //write frequency int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS; int frnum=1; if (tbtype()==LEAFPROB && (ndt & LNODE)) frnum=L_FREQ_NUM; if ((ndt & LNODE) || I_FREQ_NUM) { //check if to write freq if (ndt & FREQ1) out.write(nd+offs,1 * frnum); else if (ndt & FREQ2) out.write(nd+offs,2 * frnum); else if (ndt & FREQ3) out.write(nd+offs,3 * frnum); else out.write(nd+offs,INTSIZE * frnum); } if ((lev maxlev) { exit_error(IRSTLM_ERROR_DATA,"ngramtable::savebin: wrong n-gram size"); } depth=(depth>0?depth:maxlev); card=mentr[depth]; cerr << "savebin NgRaM " << depth << " " << card; mfstream out(filename,ios::out ); if (dict->oovcode()!=-1) //there are OOV words out << "NgRaM_ " << depth << " " << card << " " << info << "\n"; else out << "NgRaM " << depth << " " << card << " " << info << "\n"; dict->save(out); out.writex((char *)&depth,INTSIZE); out.write((char *)&treeflags,CHARSIZE); savebin(out,tree,treeflags,0,depth); out.close(); cerr << "\n"; } void ngramtable::loadbin(mfstream& inp,node nd,NODETYPE ndt,int lev) { static int c=0; // read code inp.read(nd+WORD_OFFS,CODESIZE); // read frequency int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS; int frnum=1; if (tbtype()==LEAFPROB && (ndt & LNODE)) frnum=L_FREQ_NUM; if ((ndt & LNODE) || I_FREQ_NUM) { //check if to read freq if (ndt & FREQ1) inp.read(nd+offs,1 * frnum); else if (ndt & FREQ2) inp.read(nd+offs,2 * frnum); else if (ndt & FREQ3) inp.read(nd+offs,3 * frnum); else inp.read(nd+offs,4 * frnum); } if (ndt & INODE) { //read flags inp.read(nd+FLAGS_OFFS,CHARSIZE); unsigned char fl=mtflags(nd); //read #of multiple entries inp.read(nd+MSUCC_OFFS,CODESIZE); int m=msucc(nd); if (m>0) { //read multiple entries int msz=mtablesz(nd); table mtb=mtable(nd); //table entries increase 
grow(&mtb,INODE,lev+1,m,msz); for (int i=0; iload(inp); inp.readx((char *)&maxlev,INTSIZE); inp.read((char *)&treeflags,CHARSIZE); loadbin(inp,tree,treeflags,0); inp.close(); cerr << "\n"; } void ngramtable::generate(char *filename, dictionary* extdict) { mfstream inp(filename,ios::in); int i,c=0; if (!inp) { std::stringstream ss_msg; ss_msg << "cannot open " << filename; exit_error(IRSTLM_ERROR_IO, ss_msg.str()); } cerr << "load:"; ngram ng(extdict==NULL?dict:extdict); //use possible prescribed dictionary if (extdict) dict->genoovcode(); ngram ng2(dict); dict->incflag(1); cerr << "prepare initial n-grams to make table consistent\n"; for (i=1; iBoS()); ng.freq=1; }; while (inp >> ng) { if (ng.size>maxlev) ng.size=maxlev; //speeds up ng2.trans(ng); //reencode with new dictionary check_dictsize_bound(); if (ng2.size) dict->incfreq(*ng2.wordp(1),1); // if filtering dictionary exists // and if the first word of the ngram does not belong to it // do not insert the ngram if (filterdict) { int code=filterdict->encode(dict->decode(*ng2.wordp(maxlev))); if (code!=filterdict->oovcode()) put(ng2); } else put(ng2); if (!(++c % 1000000)) cerr << "."; } cerr << "adding some more n-grams to make table consistent\n"; for (i=1; i<=maxlev; i++) { ng2.pushw(dict->BoS()); ng2.freq=1; // if filtering dictionary exists // and if the first word of the ngram does not belong to it // do not insert the ngram if (filterdict) { int code=filterdict->encode(dict->decode(*ng2.wordp(maxlev))); if (code!=filterdict->oovcode()) put(ng2); } else put(ng2); }; dict->incflag(0); inp.close(); strcpy(info,"ngram"); cerr << "\n"; } void ngramtable::generate_hmask(char *filename,char* hmask,int inplen) { mfstream inp(filename,ios::in); if (!inp) { std::stringstream ss_msg; ss_msg << "cannot open " << filename; exit_error(IRSTLM_ERROR_IO, ss_msg.str()); } int selmask[MAX_NGRAM]; memset(selmask, 0, sizeof(int)*MAX_NGRAM); //parse hmask selmask[0]=1; int i=1; for (size_t c=0; cincflag(1); long c=0; while (inp 
>> ng) { if (inplen && ng.size= selmask[maxlev-1]) { for (int j=0; jincfreq(*ng2.wordp(1),1); if (!(++c % 1000000)) cerr << "."; }; dict->incflag(0); inp.close(); sprintf(info,"hm%s\n",hmask); cerr << "\n"; } int cmpint(const void *a,const void *b) { return (*(int *)b)-(*(int *)a); } void ngramtable::generate_dstco(char *filename,int dstco) { mfstream inp(filename,ios::in); int c=0; if (!inp) { std::stringstream ss_msg; ss_msg << "cannot open " << filename; exit_error(IRSTLM_ERROR_IO, ss_msg.str()); } cerr << "load distant co-occurrences:"; if (dstco>MAX_NGRAM) { inp.close(); std::stringstream ss_msg; ss_msg << "window size (" << dstco << ") exceeds MAXNGRAM"; exit_error(IRSTLM_ERROR_DATA, ss_msg.str()); } ngram ng(dict); ngram ng2(dict); ngram dng(dict); dict->incflag(1); while (inp >> ng) { if (ng.size) { ng2.trans(ng); //reencode with new dictionary if (ng2.size>dstco) ng2.size=dstco; //maximum distance check_dictsize_bound(); dict->incfreq(*ng2.wordp(1),1); if (maxlev == 1 ) cerr << "maxlev is wrong! 
(Possible values are 2 or 3)\n"; else if (maxlev == 2 ) { //maxlev ==2 dng.size=2; dng.freq=1; //cerr << "size=" << ng2.size << "\n"; for (int i=2; i<=ng2.size; i++) { if (*ng2.wordp(1)<*ng2.wordp(i)) { *dng.wordp(2)=*ng2.wordp(i); *dng.wordp(1)=*ng2.wordp(1); } else { *dng.wordp(1)=*ng2.wordp(i); *dng.wordp(2)=*ng2.wordp(1); } //cerr << dng << "\n"; put(dng); } if (!(++c % 1000000)) cerr << "."; } else { //maxlev ==3 dng.size=3; dng.freq=1; //cerr << "size=" << ng2.size << "\n"; int ar[3]; ar[0]=*ng2.wordp(1); for (int i=2; iincflag(0); inp.close(); sprintf(info,"co-occ%d\n",dstco); cerr << "\n"; } void ngramtable::augment(ngramtable* ngt) { if (ngt->maxlev != maxlev) { exit_error(IRSTLM_ERROR_DATA,"ngramtable::augment augmentation is not possible due to table incompatibility"); } if (ngt->dict->oovcode()!=-1) cerr <<"oov: " << ngt->dict->freq(ngt->dict->oovcode()) << "\n"; cerr <<"size: " << ngt->dict->size() << "\n"; if (dict->oovcode()!=-1) cerr <<"oov: " << dict->freq(dict->oovcode()) << "\n"; cerr <<"size: " << dict->size() << "\n"; dict->incflag(1); cerr << "augmenting ngram table\n"; ngram ng1(ngt->dict); ngram ng2(dict); ngt->scan(ng1,INIT); int c=0; while (ngt->scan(ng1,CONT)) { ng2.trans(ng1); put(ng2); if ((++c % 1000000) ==0) cerr <<"."; } cerr << "\n"; for (int i=0; idict->size(); i++) dict->incfreq(dict->encode(ngt->dict->decode(i)), ngt->dict->freq(i)); dict->incflag(0); int oov=dict->getcode(dict->OOV()); if (oov>=0) { dict->oovcode(oov); } cerr << "oov: " << dict->freq(dict->oovcode()) << "\n"; cerr << "size: " << dict->size() << "\n"; } void ngramtable::show() { ngram ng(dict); scan(ng,INIT); cout << "Stampo contenuto della tabella\n"; while (scan(ng)) { cout << ng << "\n"; } } int ngramtable::mybsearch(char *ar, int n, int size, unsigned char *key, int *idx) { if (n==0) return 0; register int low = 0, high = n; *idx=0; register unsigned char *p=NULL; int result; #ifdef INTERP_SEARCH char* lp; char* hp; #endif /* return idx with the first 
position equal or greater than key */ /* Warning("start bsearch \n"); */ while (low < high) { #ifdef INTERP_SEARCH //use interpolation search only for intervals with at least 4096 entries if ((high-low)>=10000) { lp=(char *) (ar + (low * size)); if (codecmp((char *)key,lp)<0) { *idx=low; return 0; } hp=(char *) (ar + ((high-1) * size)); if (codecmp((char *)key,hp)>0) { *idx=high; return 0; } *idx= low + ((high-1)-low) * codediff((char *)key,lp)/codediff(hp,(char *)lp); } else #endif *idx = (low + high) / 2; //after redefining the interval there is no guarantee //that wlp <= wkey <= whigh p = (unsigned char *) (ar + (*idx * size)); result=codecmp((char *)key,(char *)p); if (result < 0) { high = *idx; } else if (result > 0) { low = ++(*idx); } else return 1; } *idx=low; return 0; } void *ngramtable::search(table *tb,NODETYPE ndt,int lev,int n,int sz,int *ngp, ACTION action,char **found) { char w[CODESIZE]; putmem(w,ngp[0],0,CODESIZE); int wint=ngp[0]; // index returned by mybsearch if (found) *found=NULL; int idx=0; switch(action) { case ENTER: if (!*tb || !mybsearch(*tb,n,sz,(unsigned char *)w,&idx)) { // let possibly grow the table grow(tb,ndt,lev,n,sz); // devo aggiungere un elemento n+1 //shift table by one memmove(*tb + (idx+1) * sz, *tb + idx * sz, (n-idx) * sz); memset(*tb + idx * sz , 0 , sz); word(*tb + idx * sz, wint); } else if (found) *found=*tb + ( idx * sz ); return *tb + ( idx * sz ); break; case FIND: if (!*tb || !mybsearch(*tb,n,sz,(unsigned char *)w,&idx)) return 0; else if (found) *found=*tb + (idx * sz); return *tb + (idx * sz); break; case DELETE: if (*tb && mybsearch(*tb,n,sz,(unsigned char *)w,&idx)) { //shift table down by one static char buffer[100]; memcpy(buffer,*tb + idx * sz , sz); if (idx <(n-1)) memmove(*tb + idx * sz, *tb + (idx + 1) * sz, (n-idx-1) * sz); //put the deleted item after the last item memcpy(*tb + (n-1) * sz , buffer , sz); if (found) *found=*tb + (n-1) * sz ; return *tb + (n-1) * sz ; } else return NULL; break; default: 
cerr << "this option is not implemented yet\n"; break; } return NULL; } int ngramtable::comptbsize(int n) { if (n>16384) return(n/16384)*16384+(n % 16384?16384:0); else if (n>8192) return 16384; else if (n>4096) return 8192; else if (n>2048) return 4096; else if (n>1024) return 2048; else if (n>512) return 1024; else if (n>256) return 512; else if (n>128) return 256; else if (n>64) return 128; else if (n>32) return 64; else if (n>16) return 32; else if (n>8) return 16; else if (n>4) return 8; else if (n>2) return 4; else if (n>1) return 2; else return 1; } char **ngramtable::grow(table *tb,NODETYPE ndt,int lev, int n,int sz,NODETYPE oldndt) { int inc; int num; //memory pools for inode/lnode tables if (oldndt==0) { if ((*tb==NULL) && n>0) { // n is the target number of entries //first allocation if (n>16384) inc=(n/16384)*16384+(n % 16384?16384:0); else if (n>8192) inc=16384; else if (n>4096) inc=8192; else if (n>2048) inc=4096; else if (n>1024) inc=2048; else if (n>512) inc=1024; else if (n>256) inc=512; else if (n>128) inc=256; else if (n>64) inc=128; else if (n>32) inc=64; else if (n>16) inc=32; else if (n>8) inc=16; else if (n>4) inc=8; else if (n>2) inc=4; else if (n>1) inc=2; else inc=1; n=0; //inc is the correct target size } else { // table will be extended on demand // I'm sure that one entry will be // added next // check multiples of 1024 if ((n>=16384) && !(n % 16384)) inc=16384; else { switch (n) { case 0: inc=1; break; case 1: case 2: case 4: case 8: case 16: case 32: case 64: case 128: case 256: case 512: case 1024: case 2048: case 4096: case 8192: inc=n; break; default: return tb; } } } table ntb=(char *)mem->reallocate(*tb,n * sz,(n + inc) * sz); memory[lev]+= (inc * sz); *tb=ntb; } else { //change frequency type of table //no entries will be added now int oldsz=0; // guess the current memory size !!!! 
num=comptbsize(n); if ((ndt & INODE) && I_FREQ_NUM) { if (oldndt & FREQ1) oldsz=inodesize(1); else if (oldndt & FREQ2) oldsz=inodesize(2); else if (oldndt & FREQ3) oldsz=inodesize(3); else if (oldndt & FREQ4) oldsz=inodesize(4); else { exit_error(IRSTLM_ERROR_DATA,"ngramtable::grow functionality not available"); } } else if (ndt & LNODE) { if (oldndt & FREQ1) oldsz=lnodesize(1); else if (oldndt & FREQ2) oldsz=lnodesize(2); else if (oldndt & FREQ3) oldsz=lnodesize(3); else if (oldndt & FREQ4) oldsz=lnodesize(4); else { exit_error(IRSTLM_ERROR_DATA,"ngramtable::grow functionality not available"); } } table ntb=(char *)mem->allocate(num * sz); memset((char *)ntb,0,num * sz); if (ndt & INODE) for (int i=0; ifree(*tb,num * oldsz); //num is the correct size memory[lev]+=num * (sz - oldsz); occupancy[lev]+=n * (sz - oldsz); *tb=ntb; } return tb; }; int ngramtable::put(ngram& ng) { return ngramtable::put(ng,tree,treeflags,0); } int ngramtable::put(ngram& ng,node nd,NODETYPE ndt,int lev) { char *found; node subnd; if (ng.size65535?FREQ4:FREQ1); else //all leafprob with L_FREQ_NUM >=1 //do NOT have INTERNAL freqs //will have freq size specified //by the resolution parameter //to avoid expansion freq_flag=L_FREQ_SIZE; if ((l+1)255)) mtflags(nd,(mtflags(nd) & ~FREQ1) | FREQ2); //update flags if ((I_FREQ_NUM || (mtflags(nd) & LNODE)) && (mtflags(nd) & FREQ2) && ((freq(subnd,mtflags(nd))+ng.freq)>65535)) mtflags(nd,(mtflags(nd) & ~FREQ2) | FREQ3); //update flags if ((I_FREQ_NUM || (mtflags(nd) & LNODE)) && (mtflags(nd) & FREQ3) && ((freq(subnd,mtflags(nd))+ng.freq)>16777215)) mtflags(nd,(mtflags(nd) & ~FREQ3) | FREQ4); //update flags if ((I_FREQ_NUM || (mtflags(nd) & LNODE)) && (mtflags(nd) & FREQ4) && ((freq(subnd,mtflags(nd))+ng.freq)>4294967295LL)) mtflags(nd,(mtflags(nd) & ~FREQ4) | FREQ6); //update flags if (mtflags(nd)!=oldndt) { // flags have changed, table has to be expanded //expand subtable cerr << "+"<= n); if ((I_FREQ_NUM==0) && (lev < maxlev)) { 
exit_error(IRSTLM_ERROR_DATA,"ngramtable::get for this type of table ngram cannot be smaller than table size"); } if (ng.wordp(n)) { nd=tree; ndt=treeflags; for (int l=0; l(maxl-1)) return 0; if (ng.midx[lev]free(mtable(nd),msz*truem); } ngramtable::~ngramtable() { freetree(tree); delete [] tree; delete mem; delete [] memory; delete [] occupancy; delete [] mentr; delete dict; }; void ngramtable::stat(int level) { long long totmem=0; long long totwaste=0; float mega=1024 * 1024; cout.precision(2); cout << "ngramtable class statistics\n"; cout << "levels " << maxlev << "\n"; for (int l=0; l<=maxlev; l++) { cout << "lev " << l << " entries "<< mentr[l] << " allocated mem " << memory[l]/mega << "Mb " << " used mem " << occupancy[l]/mega << "Mb \n"; totmem+=memory[l]; totwaste+=(memory[l]-occupancy[l]); } cout << "total allocated mem " << totmem/mega << "Mb "; cout << "wasted mem " << totwaste/mega << "Mb\n\n\n"; if (level >1 ) dict->stat(); cout << "\n\n"; if (level >2) mem->stat(); } double ngramtable::prob(ngram ong) { if (ong.size==0) return 0.0; if (ong.size>maxlev) ong.size=maxlev; MY_ASSERT(tbtype()==LEAFPROB && ong.size<=maxlev); ngram ng(dict); ng.trans(ong); double bo; ng.size=maxlev; for (int s=ong.size+1; s<=maxlev; s++) *ng.wordp(s)=du_code; if (get(ng)) { if (ong.size>1 && resolution<10000000) return (double)pow(decay,(resolution-ng.freq)); else return (double)(ng.freq+1)/10000000.0; } else { // backoff-probability bo_state(1); //set backoff state to 1 *ng.wordp(1)=bo_code; if (get(ng)) bo=resolution<10000000 ?(double)pow(decay,(resolution-ng.freq)) :(double)(ng.freq+1)/10000000.0; else bo=1.0; ong.size--; return bo * prob(ong); } } bool ngramtable::check_dictsize_bound() { if (dict->size() >= code_range[CODESIZE]) { std::stringstream ss_msg; ss_msg << "dictionary size overflows code range " << code_range[CODESIZE]; exit_error(IRSTLM_ERROR_MODEL, ss_msg.str()); } return true; } int ngramtable::update(ngram ng) { if (!get(ng,ng.size,ng.size)) { 
std::stringstream ss_msg; ss_msg << "cannot find " << ng; exit_error(IRSTLM_ERROR_MODEL, ss_msg.str()); } freq(ng.link,ng.pinfo,ng.freq); return 1; } void ngramtable::resetngramtable() { //clean up all memory and restart from an empty table freetree(); //clean memory pool memset(tree,0,inodesize(6)); //reset tree //1-gram table initial flags if (maxlev>1) mtflags(tree,INODE | FREQ4); else if (maxlev==1) mtflags(tree,LNODE | FREQ4); word(tree,0); //dummy word msucc(tree,0); // number of different n-grams mtable(tree,NULL); // table of n-gram for (int i=1; i<=maxlev; i++) mentr[i]=memory[i]=occupancy[i]=0; } int ngramtable::putmem(char* ptr,int value,int offs,int size) { MY_ASSERT(ptr!=NULL); for (int i=0; i> (8 * i)) & 0xff; return value; } int ngramtable::getmem(char* ptr,int* value,int offs,int size) { MY_ASSERT(ptr!=NULL); *value=ptr[offs] & 0xff; for (int i=1; i> (8 * i)) & 0xffLL; return value; } long ngramtable::getmem(char* ptr,long long* value,int offs,int size) { MY_ASSERT(ptr!=NULL); *value=ptr[offs] & 0xff; for (int i=1; i=0; i--) { result=(unsigned char)a[i]-(unsigned char)b[i]; if(result) return result; } return 0; }; long long ngramtable::freq(node nd,NODETYPE ndt,long long value) { int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS; if (ndt & FREQ1) putmem(nd,value,offs,1); else if (ndt & FREQ2) putmem(nd,value,offs,2); else if (ndt & FREQ3) putmem(nd,value,offs,3); else if (ndt & FREQ4) putmem(nd,value,offs,4); else putmem(nd,value,offs,6); return value; } long long ngramtable::freq(node nd,NODETYPE ndt) { int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS; long long value; if (ndt & FREQ1) getmem(nd,&value,offs,1); else if (ndt & FREQ2) getmem(nd,&value,offs,2); else if (ndt & FREQ3) getmem(nd,&value,offs,3); else if (ndt & FREQ4) getmem(nd,&value,offs,4); else getmem(nd,&value,offs,6); return value; } long long ngramtable::setfreq(node nd,NODETYPE ndt,long long value,int index) { int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS; if (ndt & FREQ1) 
putmem(nd,value,offs+index * 1,1); else if (ndt & FREQ2) putmem(nd,value,offs+index * 2,2); else if (ndt & FREQ3) putmem(nd,value,offs+index * 3,3); else if (ndt & FREQ4) putmem(nd,value,offs+index * 4,4); else putmem(nd,value,offs+index * 6,6); return value; } long long ngramtable::getfreq(node nd,NODETYPE ndt,int index) { int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS; long long value; if (ndt & FREQ1) getmem(nd,&value,offs+ index * 1,1); else if (ndt & FREQ2) getmem(nd,&value,offs+ index * 2,2); else if (ndt & FREQ3) getmem(nd,&value,offs+ index * 3,3); else if (ndt & FREQ4) getmem(nd,&value,offs+ index * 4,4); else getmem(nd,&value,offs+ index * 6,6); return value; } table ngramtable::mtable(node nd) { char v[PTRSIZE];; for (int i=0; i0.500?i+1.0:(double)i; } int boff(node nd,double value) { int v=(int)myround(value * 1000000000.0); putmem(nd,v,BOFF_OFFS,INTSIZE); return 1; } int succ2(node nd,int value) { putmem(nd,value,SUCC2_OFFS,CODESIZE); return value; } int succ2(node nd) { int value=0; getmem(nd,&value,SUCC2_OFFS,CODESIZE); return value; } int succ1(node nd,int value) { putmem(nd,value,SUCC1_OFFS,CODESIZE); return value; } int succ1(node nd) { int value=0; getmem(nd,&value,SUCC1_OFFS,CODESIZE); return value; } int msucc(node nd,int value) { putmem(nd,value,MSUCC_OFFS,CODESIZE); return value; } int msucc(node nd) { int value; getmem(nd,&value,MSUCC_OFFS,CODESIZE); return value; } table mtable(node nd); table mtable(node nd,table value); int mtablesz(node nd); inline int bo_state() { return backoff_state; } inline int bo_state(int value) { return backoff_state=value; } }; #endif irstlm-6.00.05/src/ngt.cpp000066400000000000000000000405051263213470300152640ustar00rootroot00000000000000// $Id: ngt.cpp 245 2009-04-02 14:05:40Z fabio_brugnara $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can 
redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ // ngt // by M. Federico // Copyright Marcello Federico, ITC-irst, 1998 #include #include #include #include "util.h" #include "cmd.h" #include "mfstream.h" #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "ngramtable.h" using namespace std; void print_help(int TypeFlag=0){ std::cerr << std::endl << "ngt - collects n-grams" << std::endl; std::cerr << std::endl << "USAGE:" << std::endl; std::cerr << " ngt -i= [options]" << std::endl; std::cerr << std::endl << "OPTIONS:" << std::endl; FullPrintParams(TypeFlag, 0, 1, stderr); } void usage(const char *msg = 0) { if (msg){ std::cerr << msg << std::endl; } else{ print_help(); } } int main(int argc, char **argv) { char *inp=NULL; char *out=NULL; char *dic=NULL; // dictionary filename char *subdic=NULL; // subdictionary filename char *filterdict=NULL; // subdictionary filename char *filtertable=NULL; // ngramtable filename char *iknfile=NULL; // filename to save IKN statistics double filter_hit_rate=1.0; // minimum hit rate of filter char *aug=NULL; // augmentation data char *hmask=NULL; // historymask bool inputgoogleformat=false; //reads ngrams in Google format bool outputgoogleformat=false; //print ngrams in Google format bool outputredisformat=false; 
//print ngrams in Redis format int ngsz=0; // n-gram default size int dstco=0; // compute distance co-occurrences bool bin=false; bool ss=false; //generate single table bool LMflag=false; //work with LM table bool saveeach=false; //save all n-gram orders int inplen=0; //input length for mask generation bool tlm=false; //test lm table char* ftlm=NULL; //file to test LM table bool memuse=false; bool help=false; DeclareParams((char*) "Dictionary", CMDSTRINGTYPE|CMDMSG, &dic, "dictionary filename", "d", CMDSTRINGTYPE|CMDMSG, &dic, "dictionary filename", "NgramSize", CMDSUBRANGETYPE|CMDMSG, &ngsz, 1, MAX_NGRAM, "n-gram default size; default is 0", "n", CMDSUBRANGETYPE|CMDMSG, &ngsz, 1, MAX_NGRAM, "n-gram default size; default is 0", "InputFile", CMDSTRINGTYPE|CMDMSG, &inp, "input file", "i", CMDSTRINGTYPE|CMDMSG, &inp, "input file", "OutputFile", CMDSTRINGTYPE|CMDMSG, &out, "output file", "o", CMDSTRINGTYPE|CMDMSG, &out, "output file", "InputGoogleFormat", CMDBOOLTYPE|CMDMSG, &inputgoogleformat, "the input file contains data in the n-gram Google format; default is false", "gooinp", CMDBOOLTYPE|CMDMSG, &inputgoogleformat, "the input file contains data in the n-gram Google format; default is false", "OutputGoogleFormat", CMDBOOLTYPE|CMDMSG, &outputgoogleformat, "the output file contains data in the n-gram Google format; default is false", "gooout", CMDBOOLTYPE|CMDMSG, &outputgoogleformat, "the output file contains data in the n-gram Google format; default is false", "OutputRedisFormat", CMDBOOLTYPE|CMDMSG, &outputredisformat, "as Goolge format plus corresponding CRC.16 hash values; default is false", "redisout", CMDBOOLTYPE|CMDMSG, &outputredisformat, "as Goolge format plus corresponding CRC.16 hash values; default is false", "SaveEach", CMDBOOLTYPE|CMDMSG, &saveeach, "save all ngram orders; default is false", "saveeach", CMDBOOLTYPE|CMDMSG, &saveeach, "save all ngram orders; default is false", "SaveBinaryTable", CMDBOOLTYPE|CMDMSG, &bin, "saves into binary format; 
default is false", "b", CMDBOOLTYPE|CMDMSG, &bin, "saves into binary format; default is false", "LmTable", CMDBOOLTYPE|CMDMSG, &LMflag, "works with LM table; default is false", "lm", CMDBOOLTYPE|CMDMSG, &LMflag, "works with LM table; default is false", "DistCo", CMDINTTYPE|CMDMSG, &dstco, "computes distance co-occurrences at the specified distance; default is 0", "dc", CMDINTTYPE|CMDMSG, &dstco, "computes distance co-occurrences at the specified distance; default is 0", "AugmentFile", CMDSTRINGTYPE|CMDMSG, &aug, "augmentation data", "aug", CMDSTRINGTYPE|CMDMSG, &aug, "augmentation data", "SaveSingle", CMDBOOLTYPE|CMDMSG, &ss, "generates single table; default is false", "ss", CMDBOOLTYPE|CMDMSG, &ss, "generates single table; default is false", "SubDict", CMDSTRINGTYPE|CMDMSG, &subdic, "subdictionary", "sd", CMDSTRINGTYPE|CMDMSG, &subdic, "subdictionary", "FilterDict", CMDSTRINGTYPE|CMDMSG, &filterdict, "filter dictionary", "fd", CMDSTRINGTYPE|CMDMSG, &filterdict, "filter dictionary", "ConvDict", CMDSTRINGTYPE|CMDMSG, &subdic, "subdictionary", "cd", CMDSTRINGTYPE|CMDMSG, &subdic, "subdictionary", "FilterTable", CMDSTRINGTYPE|CMDMSG, &filtertable, "ngramtable filename", "ftr", CMDDOUBLETYPE|CMDMSG, &filter_hit_rate, "ngramtable filename", "FilterTableRate", CMDDOUBLETYPE|CMDMSG, &filter_hit_rate, "minimum hit rate of filter; default is 1.0", "ft", CMDSTRINGTYPE|CMDMSG, &filtertable, "minimum hit rate of filter; default is 1.0", "HistoMask",CMDSTRINGTYPE|CMDMSG, &hmask, "history mask", "hm",CMDSTRINGTYPE|CMDMSG, &hmask, "history mask", "InpLen",CMDINTTYPE|CMDMSG, &inplen, "input length for mask generation; default is 0", "il",CMDINTTYPE|CMDMSG, &inplen, "input length for mask generation; default is 0", "tlm", CMDBOOLTYPE|CMDMSG, &tlm, "test LM table; default is false", "ftlm", CMDSTRINGTYPE|CMDMSG, &ftlm, "file to test LM table", "memuse", CMDBOOLTYPE|CMDMSG, &memuse, "default is false", "iknstat", CMDSTRINGTYPE|CMDMSG, &iknfile, "filename to save IKN statistics", 
"Help", CMDBOOLTYPE|CMDMSG, &help, "print this help", "h", CMDBOOLTYPE|CMDMSG, &help, "print this help", (char *)NULL ); if (argc == 1){ usage(); exit_error(IRSTLM_NO_ERROR); } GetParams(&argc, &argv, (char*) NULL); if (help){ usage(); exit_error(IRSTLM_NO_ERROR); } if (inp==NULL) { usage(); exit_error(IRSTLM_ERROR_DATA,"Warning: no input file specified"); }; if (out==NULL) { cerr << "Warning: no output file specified!\n"; } TABLETYPE table_type=COUNT; if (LMflag) { cerr << "Working with LM table\n"; table_type=LEAFPROB; } // check word order of subdictionary if (filtertable) { { ngramtable ngt(filtertable,1,NULL,NULL,NULL,0,0,NULL,0,table_type); mfstream inpstream(inp,ios::in); //google input table mfstream outstream(out,ios::out); //google output table cerr << "Filtering table " << inp << " assumed to be in Google Format with size " << ngsz << "\n"; cerr << "with table " << filtertable << " of size " << ngt.maxlevel() << "\n"; cerr << "with hit rate " << filter_hit_rate << "\n"; //order of filter table must be smaller than that of input n-grams MY_ASSERT(ngt.maxlevel() <= ngsz); //read input googletable of ngrams of size ngsz //output entries made of at least X% n-grams contained in filtertable // words are not accepted ngram ng(ngt.dict), ng2(ng.dict); double hits=0; double maxhits=(double)(ngsz-ngt.maxlevel()+1); long c=0; while(inpstream >> ng) { if (ng.size>= ngt.maxlevel()) { //need to make a copy ng2=ng; ng2.size=ngt.maxlevel(); //cerr << "check if " << ng2 << " is contained: "; hits+=(ngt.get(ng2)?1:0); } if (ng.size==ngsz) { if (!(++c % 1000000)) cerr << "."; //cerr << ng << " -> " << is_included << "\n"; //you reached the last word before freq inpstream >> ng.freq; //consistency check of n-gram if (((hits/maxhits)>=filter_hit_rate) && (!ng.containsWord(ngt.dict->OOV(),ng.size)) ) outstream << ng << "\n"; hits=0; ng.size=0; } } outstream.flush(); inpstream.flush(); } exit_error(IRSTLM_NO_ERROR); } //ngramtable* ngt=new 
ngramtable(inp,ngsz,NULL,dic,dstco,hmask,inplen,table_type); ngramtable* ngt=new ngramtable(inp,ngsz,NULL,NULL,filterdict,inputgoogleformat,dstco,hmask,inplen,table_type); if (aug) { ngt->dict->incflag(1); // ngramtable ngt2(aug,ngsz,isym,NULL,0,NULL,0,table_type); ngramtable ngt2(aug,ngsz,NULL,NULL,NULL,0,0,NULL,0,table_type); ngt->augment(&ngt2); ngt->dict->incflag(0); } if (subdic) { ngramtable *ngt2=new ngramtable(NULL,ngsz,NULL,NULL,NULL,0,0,NULL,0,table_type); // enforce the subdict to follow the same word order of the main dictionary dictionary tmpdict(subdic); ngt2->dict->incflag(1); for (int j=0; jdict->size(); j++) { if (tmpdict.encode(ngt->dict->decode(j)) != tmpdict.oovcode()) { ngt2->dict->encode(ngt->dict->decode(j)); } } ngt2->dict->incflag(0); ngt2->dict->cleanfreq(); //possibly include standard symbols if (ngt->dict->encode(ngt->dict->EoS())!=ngt->dict->oovcode()) { ngt2->dict->incflag(1); ngt2->dict->encode(ngt2->dict->EoS()); ngt2->dict->incflag(0); } if (ngt->dict->encode(ngt->dict->BoS())!=ngt->dict->oovcode()) { ngt2->dict->incflag(1); ngt2->dict->encode(ngt2->dict->BoS()); ngt2->dict->incflag(0); } ngram ng(ngt->dict); ngram ng2(ngt2->dict); ngt->scan(ng,INIT,ngsz); long c=0; while (ngt->scan(ng,CONT,ngsz)) { ng2.trans(ng); ngt2->put(ng2); if (!(++c % 1000000)) cerr << "."; } //makes ngt2 aware of oov code int oov=ngt2->dict->getcode(ngt2->dict->OOV()); if(oov>=0) ngt2->dict->oovcode(oov); for (int j=0; jdict->size(); j++) { ngt2->dict->incfreq(ngt2->dict->encode(ngt->dict->decode(j)), ngt->dict->freq(j)); } cerr <<" oov: " << ngt2->dict->freq(ngt2->dict->oovcode()) << "\n"; delete ngt; ngt=ngt2; } if (ngsz < ngt->maxlevel() && hmask) { cerr << "start projection of ngramtable " << inp << " according to hmask\n"; int selmask[MAX_NGRAM]; memset(selmask, 0, sizeof(int)*MAX_NGRAM); //parse hmask selmask[0]=1; int i=1; for (size_t c=0; cdict->incflag(1); ngram ng(ngt->dict); ngram png(ngt->dict,ngsz); ngram ng2(ngt2->dict,ngsz); 
ngt->scan(ng,INIT,ngt->maxlevel()); long c=0; while (ngt->scan(ng,CONT,ngt->maxlevel())) { //projection for (int j=0; jput(ng2); if (!(++c % 1000000)) cerr << "."; } char info[100]; sprintf(info,"hm%s",hmask); ngt2->ngtype(info); //makes ngt2 aware of oov code int oov=ngt2->dict->getcode(ngt2->dict->OOV()); if(oov>=0) ngt2->dict->oovcode(oov); for (int j=0; jdict->size(); j++) { ngt2->dict->incfreq(ngt2->dict->encode(ngt->dict->decode(j)), ngt->dict->freq(j)); } cerr <<" oov: " << ngt2->dict->freq(ngt2->dict->oovcode()) << "\n"; delete ngt; ngt=ngt2; } if (tlm && table_type==LEAFPROB) { ngram ng(ngt->dict); cout.setf(ios::scientific); cout << "> "; while(cin >> ng) { ngt->bo_state(0); if (ng.size>=ngsz) { cout << ng << " p= " << log(ngt->prob(ng)); cout << " bo= " << ngt->bo_state() << "\n"; } else cout << ng << " p= NULL\n"; cout << "> "; } } if (ftlm && table_type==LEAFPROB) { ngram ng(ngt->dict); cout.setf(ios::fixed); cout.precision(2); mfstream inptxt(ftlm,ios::in); int Nbo=0,Nw=0,Noov=0; float logPr=0,PP=0,PPwp=0; int bos=ng.dict->encode(ng.dict->BoS()); while(inptxt >> ng) { // reset ngram at begin of sentence if (*ng.wordp(1)==bos) { ng.size=1; continue; } ngt->bo_state(0); if (ng.size>=1) { logPr+=log(ngt->prob(ng)); if (*ng.wordp(1) == ngt->dict->oovcode()) Noov++; Nw++; if (ngt->bo_state()) Nbo++; } } PP=exp(-logPr/Nw); PPwp= PP * exp(Noov * log(10000000.0-ngt->dict->size())/Nw); cout << "%%% NGT TEST OF SMT LM\n"; cout << "%% LM=" << inp << " SIZE="<< ngt->maxlevel(); cout << " TestFile="<< ftlm << "\n"; cout << "%% OOV PENALTY = 1/" << 10000000.0-ngt->dict->size() << "\n"; cout << "%% Nw=" << Nw << " PP=" << PP << " PPwp=" << PPwp << " Nbo=" << Nbo << " Noov=" << Noov << " OOV=" << (float)Noov/Nw * 100.0 << "%\n"; } if (memuse) ngt->stat(0); if (iknfile) { //compute and save statistics of Improved Kneser Ney smoothing ngram ng(ngt->dict); int n1,n2,n3,n4; int unover3=0; mfstream iknstat(iknfile,ios::out); //output of ikn statistics for (int l=1; 
l<=ngt->maxlevel(); l++) { cerr << "level " << l << "\n"; iknstat << "level: " << l << " "; cerr << "computing statistics\n"; n1=0; n2=0; n3=0,n4=0; ngt->scan(ng,INIT,l); while(ngt->scan(ng,CONT,l)) { //skip ngrams containing _OOV if (l>1 && ng.containsWord(ngt->dict->OOV(),l)) { //cerr << "skp ngram" << ng << "\n"; continue; } //skip n-grams containing in context if (l>1 && ng.containsWord(ngt->dict->EoS(),l-1)) { //cerr << "skp ngram" << ng << "\n"; continue; } //skip 1-grams containing if (l==1 && ng.containsWord(ngt->dict->BoS(),l)) { //cerr << "skp ngram" << ng << "\n"; continue; } if (ng.freq==1) n1++; else if (ng.freq==2) n2++; else if (ng.freq==3) n3++; else if (ng.freq==4) n4++; if (l==1 && ng.freq >=3) unover3++; } cerr << " n1: " << n1 << " n2: " << n2 << " n3: " << n3 << " n4: " << n4 << "\n"; iknstat << " n1: " << n1 << " n2: " << n2 << " n3: " << n3 << " n4: " << n4 << " unover3: " << unover3 << "\n"; } } if (out){ if (bin) ngt->savebin(out,ngsz); else if (outputredisformat) ngt->savetxt(out,ngsz,true,true, 1); else if (outputgoogleformat) ngt->savetxt(out,ngsz,true,false); else ngt->savetxt(out,ngsz,false,false); } } irstlm-6.00.05/src/normcache.cpp000066400000000000000000000057651263213470300164440ustar00rootroot00000000000000/****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include "mfstream.h" #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "ngramtable.h" #include "normcache.h" using namespace std; // Normalization factors cache normcache::normcache(dictionary* d) { dict=d; //trigram and bigram normalization cache //ngt=new ngramtable(NULL,2,NULL,NULL,0,0,NULL,0,LEAFPROB); ngt=new ngramtable(NULL,2,NULL,NULL,NULL,0,0,NULL,0,LEAFPROB); maxcache[0]=d->size();//unigram cache maxcache[1]=d->size();//bigram cache cache[0]=new double[maxcache[0]]; cache[1]=new double[maxcache[1]]; for (int i=0; isize(); i++) cache[0][i]=cache[1][i]=0.0; cachesize[0]=cachesize[1]=0; hit=miss=0; } void normcache::expand(int n) { int step=100000; cerr << "Expanding cache ...\n"; double *newcache=new double[maxcache[n]+step]; memcpy(newcache,cache[n],sizeof(double)*maxcache[n]); delete [] cache[n]; cache[n]=newcache; for (int i=0; iget(ng,size,size-1)) { hit++; // cerr << "hit " << ng << "\n"; return value=cache[1][ng.freq]; } else { miss++; return value=0; } } return 0; } double normcache::put(ngram ng,int size,double value) { if (size==2) { if (*ng.wordp(2)>= maxcache[0]) expand(0); cache[0][*ng.wordp(2)]=value; cachesize[0]++; return value; } else if (size==3) { if (ngt->get(ng,size,size-1)) return cache[1][ng.freq]=value; else { ngram histo(dict,2); *histo.wordp(1)=*ng.wordp(2); *histo.wordp(2)=*ng.wordp(3); histo.freq=cachesize[1]++; if (cachesize[1]==maxcache[1]) expand(1); ngt->put(histo); return cache[1][histo.freq]=value; } } return 0; } void normcache::stat() { std::cout << "misses " << miss << ", hits " << hit << "\n"; } 
irstlm-6.00.05/src/normcache.h000066400000000000000000000027261263213470300161030ustar00rootroot00000000000000/****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #ifndef MF_NORMCACHE_H #define MF_NORMCACHE_H #include "dictionary.h" #include "ngramtable.h" // Normalization factors cache class normcache { dictionary* dict; ngramtable *ngt; double* cache[2]; int cachesize[2]; int maxcache[2]; int hit; int miss; public: normcache(dictionary* d); ~normcache() { delete [] cache[0]; delete [] cache[1]; delete ngt; } void expand(int i); double get(ngram ng,int size,double& value); double put(ngram ng,int size,double value); void stat(); }; #endif irstlm-6.00.05/src/plsa.cpp000077500000000000000000000250101263213470300154300ustar00rootroot00000000000000/****************************************************************************** IrstLM: IRST Language Model Toolkit, compile LM Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the 
Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include "cmd.h" #include #include "thpool.h" #include "util.h" #include "mfstream.h" #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "ngramtable.h" #include "doc.h" #include "cplsa.h" using namespace std; using namespace irstlm; void print_help(int TypeFlag=0){ std::cerr << std::endl << "plsa - probabilistic latent semantic analysis modeling" << std::endl; std::cerr << std::endl << "USAGE:" << std::endl; std::cerr << " plsa -tr|te= -m= -t= [options]" << std::endl; std::cerr << std::endl << "DESCRIPTION:" << std::endl; std::cerr << " Train a PLSA model from a corpus and test it to infer topic or word " << std::endl; std::cerr << " distributions from other texts." << std::endl; std::cerr << " Notice: multithreading is available both for training and inference." << std::endl; std::cerr << std::endl << "OPTIONS:" << std::endl; FullPrintParams(TypeFlag, 0, 1, stderr); std::cerr << std::endl << "EXAMPLES:" << std::endl; std::cerr <<" (1) plsa -tr= -t= -m= " << std::endl; std::cerr <<" Train a PLSA model with topics on text " << std::endl; std::cerr <<" Example of content:" << std::endl; std::cerr <<" 3" << std::endl; std::cerr <<" hello world ! 
" << std::endl; std::cerr <<" good morning good afternoon " << std::endl; std::cerr <<" welcome aboard " << std::endl; std::cerr <<" (2) plsa -m= -te= -tf=" << std::endl; std::cerr <<" Infer topic distribution with model for each doc in " << std::endl; std::cerr <<" (3) plsa -m= -te= -wf=" << std::endl; std::cerr <<" Infer word distribution with model for each doc in " << std::endl; std::cerr << std::endl; } void usage(const char *msg = 0) { if (msg){ std::cerr << msg << std::endl; } else{ print_help(); } } int main(int argc, char **argv){ char *dictfile=NULL; char *trainfile=NULL; char *testfile=NULL; char *topicfeaturefile=NULL; char *wordfeaturefile=NULL; char *modelfile=NULL; char *tmpdir = getenv("TMP"); char *txtfile=NULL; bool forcemodel=false; int topics=0; //number of topics int specialtopic=0; //special topic: first st dict words int iterations=10; //number of EM iterations to run int threads=1; //current EM iteration for multi-thread training bool help=false; bool memorymap=true; int prunethreshold=3; int topwords=20; DeclareParams((char*) "Train", CMDSTRINGTYPE|CMDMSG, &trainfile, " : training text collection ", "tr", CMDSTRINGTYPE|CMDMSG, &trainfile, " : training text collection ", "Model", CMDSTRINGTYPE|CMDMSG, &modelfile, " : model file", "m", CMDSTRINGTYPE|CMDMSG, &modelfile, " : model file", "TopWordsFile", CMDSTRINGTYPE|CMDMSG, &txtfile, " to write top words per topic", "twf", CMDSTRINGTYPE|CMDMSG, &txtfile, " to write top words per topic", "PruneFreq", CMDINTTYPE|CMDMSG, &prunethreshold, ": prune words with freq <= count (default 3)", "pf", CMDINTTYPE|CMDMSG, &prunethreshold, ": : prune words with freq <= count (default 3)", "TopWordsNum", CMDINTTYPE|CMDMSG, &topwords, ": number of top words per topic ", "twn", CMDINTTYPE|CMDMSG, &topwords, ": number of top words per topic", "Test", CMDSTRINGTYPE|CMDMSG, &testfile, " : inference text collection file", "te", CMDSTRINGTYPE|CMDMSG, &testfile, " : inference text collection file", "WordFeatures", 
CMDSTRINGTYPE|CMDMSG, &wordfeaturefile, " : unigram feature file", "wf", CMDSTRINGTYPE|CMDMSG, &wordfeaturefile," : unigram feature file", "TopicFeatures", CMDSTRINGTYPE|CMDMSG, &topicfeaturefile, " : topic feature file", "tf", CMDSTRINGTYPE|CMDMSG, &topicfeaturefile, " : topic feature file", "Topics", CMDINTTYPE|CMDMSG, &topics, " : number of topics (default 0)", "t", CMDINTTYPE|CMDMSG, &topics," : number of topics (default 0)", "SpecialTopic", CMDINTTYPE|CMDMSG, &specialtopic, " : put top- frequent words in a special topic (default 0)", "st", CMDINTTYPE|CMDMSG, &specialtopic, " : put top- frequent words in a special topic (default 0)", "Iterations", CMDINTTYPE|CMDMSG, &iterations, " : training/inference iterations (default 10)", "it", CMDINTTYPE|CMDMSG, &iterations, " : training/inference iterations (default 10)", "Threads", CMDINTTYPE|CMDMSG, &threads, ": number of threads (default 2)", "th", CMDINTTYPE|CMDMSG, &threads, ": number of threads (default 2)", "ForceModel", CMDBOOLTYPE|CMDMSG, &forcemodel, ": force to use existing model for training", "fm", CMDBOOLTYPE|CMDMSG, &forcemodel, ": force to use existing model for training", "MemoryMap", CMDBOOLTYPE|CMDMSG, &memorymap, ": use memory mapping (default true)", "mm", CMDBOOLTYPE|CMDMSG, &memorymap, ": use memory mapping (default true)", "Dictionary", CMDSTRINGTYPE|CMDMSG, &dictfile, " : specify a training dictionary (optional)", "d", CMDSTRINGTYPE|CMDMSG, &dictfile, " : specify training a dictionary (optional)", "TmpDir", CMDSTRINGTYPE|CMDMSG, &tmpdir, ": tmp directory for memory map (default /tmp)", "tmp", CMDSTRINGTYPE|CMDMSG, &tmpdir, ": tmp directory for memory map (default /tmp )", "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help", "h", CMDBOOLTYPE|CMDMSG, &help, "print this help", (char *)NULL ); if (argc == 1){ usage(); exit_error(IRSTLM_NO_ERROR); } GetParams(&argc, &argv, (char*) NULL); if (help){ usage(); exit_error(IRSTLM_NO_ERROR); } if (trainfile && ( !topics || !modelfile )) { usage(); 
exit_error(IRSTLM_ERROR_DATA,"Missing training parameters"); } if (testfile && (!modelfile || !(topicfeaturefile || wordfeaturefile))) { usage(); exit_error(IRSTLM_ERROR_DATA,"Missing inference parameters"); } dictionary *dict=NULL; //Training phase //test if model is readable bool testmodel=false; FILE* f;if ((f=fopen(modelfile,"r"))!=NULL){fclose(f);testmodel=true;} if (trainfile){ if (testmodel){ if (!forcemodel) //training with pretrained model: no need of dictionary exit_error(IRSTLM_ERROR_DATA,"Use -ForceModel=y option to use and update an existing model."); } else{//training with empty model and no dictionary: dictionary must be first extracted if (!dictfile){ // exit_error(IRSTLM_ERROR_DATA,"Missing dictionary. Provide a dictionary with option -d."); cerr << "Extracting dictionary from training data (word with freq>=" << prunethreshold << ")\n"; dict=new dictionary(NULL,10000); dict->generate(trainfile,true); dictionary *sortd=new dictionary(dict,true,prunethreshold); sortd->sort(); delete dict; dict=sortd; } else dict=new dictionary(dictfile,10000); dict->encode(dict->OOV()); } plsa tc(dict,topics,tmpdir,threads,memorymap); tc.train(trainfile,modelfile,iterations,0.5,specialtopic); if (dict!=NULL) delete dict; } //Training phase //test if model is readable: notice test could be executed after training testmodel=false; if ((f=fopen(modelfile,"r"))!=NULL){fclose(f);testmodel=true;} if (testfile){ if (!testmodel) exit_error(IRSTLM_ERROR_DATA,"Cannot read model file to run test inference."); if (dictfile) cerr << "Will rely on model dictionary."; dict=NULL; plsa tc(dict,topics,tmpdir,threads,memorymap); tc.inference(testfile,modelfile,iterations,topicfeaturefile,wordfeaturefile); if (dict!=NULL) delete dict; } //save/convert model in text format if (txtfile){ if (!testmodel) exit_error(IRSTLM_ERROR_DATA,"Cannot open model to be printed in readable format."); dict=NULL; plsa tc(dict,topics,tmpdir,threads,memorymap); tc.initW(modelfile,1,0); 
tc.saveWtxt(txtfile,topwords); tc.freeW(); } exit_error(IRSTLM_NO_ERROR); } irstlm-6.00.05/src/prune-lm.cpp000066400000000000000000000117421263213470300162340ustar00rootroot00000000000000// $Id: prune-lm.cpp 27 2010-05-03 14:33:51Z nicolabertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit, prune LM Copyright (C) 2008 Fabio Brugnara, FBK-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include #include #include "cmd.h" #include "util.h" #include "math.h" #include "lmtable.h" /********************************/ using namespace std; using namespace irstlm; void print_help(int TypeFlag=0){ std::cerr << std::endl << "prune-lm - prunes language models" << std::endl; std::cerr << std::endl << "USAGE:" << std::endl; std::cerr << " prune-lm [options] []" << std::endl; std::cerr << std::endl << "DESCRIPTION:" << std::endl; std::cerr << " prune-lm reads a LM in either ARPA or compiled format and" << std::endl; std::cerr << " prunes out n-grams (n=2,3,..) for which backing-off to the" << std::endl; std::cerr << " lower order n-gram results in a small difference in probability." 
<< std::endl; std::cerr << " The pruned LM is saved in ARPA format" << std::endl; std::cerr << std::endl << "OPTIONS:" << std::endl; FullPrintParams(TypeFlag, 0, 1, stderr); } void usage(const char *msg = 0) { if (msg){ std::cerr << msg << std::endl; } if (!msg){ print_help(); } } void s2t(string cps, float *thr) { int i; char *s=strdup(cps.c_str()); char *tk; thr[0]=0; for(i=1,tk=strtok(s, ","); tk; tk=strtok(0, ","),i++) thr[i]=atof(tk); for(; i files; bool help=false; DeclareParams((char*) "threshold", CMDSTRINGTYPE|CMDMSG, &spthr, "pruning thresholds for 2-grams, 3-grams, 4-grams,...; if less thresholds are specified, the last one is applied to all following n-gram levels; default is 0", "t", CMDSTRINGTYPE|CMDMSG, &spthr, "pruning thresholds for 2-grams, 3-grams, 4-grams,...; if less thresholds are specified, the last one is applied to all following n-gram levels; default is 0", "abs", CMDBOOLTYPE|CMDMSG, &aflag, "uses absolute value of weighted difference; default is 0", "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help", "h", CMDBOOLTYPE|CMDMSG, &help, "print this help", (char *)NULL ); if (argc == 1){ usage(); exit_error(IRSTLM_NO_ERROR); } int first_file=1; for (int i=1; i < argc; i++) { if (strcmp(argv[i],"-") == 0){ //handles /dev/stdin or /dev/stdout if (first_file == 1){ files.push_back("/dev/stdin"); }else if (first_file == 2){ files.push_back("/dev/stdout"); }else{ usage("Warning: You can use the value for the input or output file only"); } first_file++; }else if(argv[i][0] != '-'){ files.push_back(argv[i]); first_file++; } } GetParams(&argc, &argv, (char*) NULL); if (help){ usage(); exit_error(IRSTLM_NO_ERROR); } if (files.size() > 2) { usage(); exit_error(IRSTLM_ERROR_DATA,"Too many arguments"); } if (files.size() < 1) { usage(); exit_error(IRSTLM_ERROR_DATA,"Specify a LM file to read from"); } memset(thr, 0, sizeof(thr)); if(spthr != NULL) s2t(spthr, thr); std::string infile = files[0]; std::string outfile= ""; if (files.size() == 1) { 
outfile=infile; //remove path information std::string::size_type p = outfile.rfind('/'); if (p != std::string::npos && ((p+1) < outfile.size())) outfile.erase(0,p+1); //eventually strip .gz if (outfile.compare(outfile.size()-3,3,".gz")==0) outfile.erase(outfile.size()-3,3); outfile+=".plm"; } else outfile = files[1]; lmtable lmt; inputfilestream inp(infile.c_str()); if (!inp.good()) { std::stringstream ss_msg; ss_msg << "Failed to open " << infile; exit_error(IRSTLM_ERROR_IO, ss_msg.str()); } lmt.load(inp,infile.c_str(),outfile.c_str(),0); std::cerr << "pruning LM with thresholds: \n"; for (int i=1; i #include #include #include #include #include #include "cmd.h" #include "math.h" #include "util.h" #include "mfstream.h" using namespace std; //---------------------------------------------------------------------- // Special type and global variable for the BIN CLUSTERING algorithm // // //---------------------------------------------------------------------- typedef struct { float pt; unsigned int idx; unsigned short code; } DataItem; int cmpFloatEntry(const void* a,const void* b) { if (*(float *)a > *(float*)b) return 1; else if (*(float *)a < *(float *)b) return -1; else return 0; } //---------------------------------------------------------------------- // Global entry points //---------------------------------------------------------------------- int ComputeCluster(int nc, double* cl,unsigned int N,DataItem* Pts); //---------------------------------------------------------------------- // Global parameters (some are set in getArgs()) //---------------------------------------------------------------------- int k = 256; // number of centers const int MAXLEV = 11; //maximum n-gram size //---------------------------------------------------------------------- // Main program //---------------------------------------------------------------------- void print_help(int TypeFlag=0){ std::cerr << std::endl << "quantize-lm - quantizes probabilities and back-off weights" << 
std::endl; std::cerr << std::endl << "USAGE:" << std::endl; std::cerr << " quantize-lm [ []]" << std::endl; std::cerr << std::endl << "DESCRIPTION:" << std::endl; std::cerr << " quantize-lm reads a standard LM file in ARPA format and produces" << std::endl; std::cerr << " a version of it with quantized probabilities and back-off weights"<< std::endl; std::cerr << " that the IRST LM toolkit can compile. Accepts LMs with .gz suffix." << std::endl; std::cerr << " You can specify the output file to be created and also the pathname" << std::endl; std::cerr << " of a temporary file used by the program. As default, the temporary " << std::endl; std::cerr << " file is created in the /tmp directory." << std::endl; std::cerr << " Output file can be written to standard output by using the special name -." << std::endl; std::cerr << std::endl << "OPTIONS:" << std::endl; FullPrintParams(TypeFlag, 0, 1, stderr); } void usage(const char *msg = 0) { if (msg){ std::cerr << msg << std::endl; } else{ print_help(); } } int main(int argc, char **argv) { std::vector files; bool help=false; DeclareParams((char*) "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help", "h", CMDBOOLTYPE|CMDMSG, &help, "print this help", (char *)NULL ); if (argc == 1){ usage(); } int first_file=1; for (int i=1; i < argc; i++) { if (strcmp(argv[i],"-") == 0){ //handles /dev/stdin or /dev/stdout if (first_file == 1){ files.push_back("/dev/stdin"); }else if (first_file == 2){ files.push_back("/dev/stdout"); }else{ usage("Warning: You can use the value for the input and/or output file only"); } first_file++; }else if(argv[i][0] != '-'){ files.push_back(argv[i]); first_file++; } } GetParams(&argc, &argv, (char*) NULL); if (help){ usage(); exit_error(IRSTLM_NO_ERROR); } if (files.size() > 3) { exit_error(IRSTLM_ERROR_DATA,"Too many arguments"); } if (files.size() < 1) { usage(); exit_error(IRSTLM_ERROR_DATA,"Please specify a LM file to read from"); } std::string infile = files[0]; std::string outfile=""; std::string 
tmpfile=""; if (files.size() == 1) { outfile=infile; //remove path information std::string::size_type p = outfile.rfind('/'); if (p != std::string::npos && ((p+1) < outfile.size())) outfile.erase(0,p+1); //eventually strip .gz if (outfile.compare(outfile.size()-3,3,".gz")==0) outfile.erase(outfile.size()-3,3); outfile+=".qlm"; } else outfile = files[1]; if (files.size()==3) { //create temporary file tmpfile = files[2]; mfstream dummy(tmpfile.c_str(),ios::out); dummy.close(); } else { //create temporary internal file in /tmp mfstream dummy; createtempfile(dummy,tmpfile,ios::out); dummy.close(); } std::cerr << "Reading " << infile << "..." << std::endl; inputfilestream inp(infile.c_str()); if (!inp.good()) { std::stringstream ss_msg; ss_msg << "Failed to open " << infile;; exit_error(IRSTLM_ERROR_IO, ss_msg.str()); } std::ofstream* out; if (outfile == "-") out = (ofstream *)&std::cout; else { out=new std::ofstream; out->open(outfile.c_str()); } if (!out->good()) { std::stringstream ss_msg; ss_msg << "Failed to open " << outfile; exit_error(IRSTLM_ERROR_IO, ss_msg.str()); } std::cerr << "Writing " << outfile << "..." << std::endl; //prepare temporary file to save n-gram blocks for multiple reads //this avoids using seeks which do not work with inputfilestream //it's odd but i need a bidirectional filestream! 
std::cerr << "Using temporary file " << tmpfile << std::endl; fstream filebuff(tmpfile.c_str(),ios::out|ios::in|ios::binary); unsigned int nPts = 0; // actual number of points // *** Read ARPA FILE ** unsigned int numNgrams[MAXLEV + 1]; /* # n-grams for each order */ int Order=0,MaxOrder=0; int n=0; float logprob,logbow; DataItem* dataPts; double* centersP=NULL; double* centersB=NULL; //maps from point index to code unsigned short* mapP=NULL; unsigned short* mapB=NULL; int centers[MAXLEV + 1]; streampos iposition; for (int i=1; i<=MAXLEV; i++) numNgrams[i]=0; for (int i=1; i<=MAXLEV; i++) centers[i]=k; /* all levels 256 centroids; in case read them as parameters */ char line[MAX_LINE]; while (inp.getline(line,MAX_LINE)) { bool backslash = (line[0] == '\\'); if (sscanf(line, "ngram %d=%d", &Order, &n) == 2) { numNgrams[Order] = n; MaxOrder=Order; continue; } if (!strncmp(line, "\\data\\", 6) || strlen(line)==0) continue; if (backslash && sscanf(line, "\\%d-grams", &Order) == 1) { // print output header: if (Order == 1) { *out << "qARPA " << MaxOrder; for (int i=1; i<=MaxOrder; i++) *out << " " << centers[i]; *out << "\n\n\\data\\\n"; for (int i=1; i<=MaxOrder; i++) *out << "ngram " << i << "= " << numNgrams[i] << "\n"; } *out << "\n"; *out << line << "\n"; cerr << "-- Start processing of " << Order << "-grams\n"; MY_ASSERT(Order <= MAXLEV); unsigned int N=numNgrams[Order]; const char* words[MAXLEV+3]; dataPts=new DataItem[N]; // allocate data //reset tempout file to start writing filebuff.seekg((streampos)0); for (nPts=0; nPtsflush(); out->close(); inp.close(); removefile(tmpfile.c_str()); } // Compute Clusters int ComputeCluster(int centers,double* ctrs,unsigned int N,DataItem* bintable) { //cerr << "\nExecuting Clutering Algorithm: k=" << centers<< "\n"; double log10=log(10.0); for (unsigned int i=0; i0) { currcode++; } } if (bintable[i].pt == bintable[i-1].pt) bintable[i].code=bintable[i-1].code; else { bintable[i].code=currcode; species[currcode]++; } 
population[bintable[i].code]++; MY_ASSERT(bintable[i].code < centers); ctrs[bintable[i].code]=ctrs[bintable[i].code]+exp(bintable[i].pt * log10); } for (int i=0; i0) ctrs[i]=log(ctrs[i]/population[i])/log10; else ctrs[i]=-99; if (ctrs[i]<-99) { cerr << "Warning: adjusting center with too small prob " << ctrs[i] << "\n"; ctrs[i]=-99; } cerr << i << " ctr " << ctrs[i] << " population " << population[i] << " species " << species[i] <<"\n"; } cout.flush(); delete [] population; delete [] species; return 1; } //---------------------------------------------------------------------- // Reading/Printing utilities // readPt - read a point from input stream into data storage // at position i. Returns false on error or EOF. // printPt - prints a points to output file //---------------------------------------------------------------------- irstlm-6.00.05/src/score-lm.cpp000066400000000000000000000102431263213470300162110ustar00rootroot00000000000000/****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2010 Christian Hardmeier, FBK-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include #include #include "cmd.h" #include "util.h" #include "lmtable.h" #include "n_gram.h" using namespace irstlm; void print_help(int TypeFlag=0){ std::cerr << std::endl << "score-lm - scores sentences with a language model" << std::endl; std::cerr << std::endl << "USAGE:" << std::endl << " score-lm -lm [options]" << std::endl; std::cerr << std::endl << "OPTIONS:" << std::endl; std::cerr << " -lm language model to use (must be specified)" << std::endl; std::cerr << " -dub dictionary upper bound (default: 10000000" << std::endl; std::cerr << " -level max level to load from the language models (default: 1000," << std::endl; std::cerr << " meaning the actual LM order)" << std::endl; std::cerr << " -mm 1 memory-mapped access to lm (default: 0)" << std::endl; std::cerr << std::endl; FullPrintParams(TypeFlag, 0, 1, stderr); } void usage(const char *msg = 0) { if (msg){ std::cerr << msg << std::endl; } else{ print_help(); } } int main(int argc, char **argv) { int mmap = 0; int dub = IRSTLM_DUB_DEFAULT; int requiredMaxlev = IRSTLM_REQUIREDMAXLEV_DEFAULT; char *lm = NULL; bool help=false; DeclareParams((char*) "lm", CMDSTRINGTYPE|CMDMSG, &lm, "language model to use (must be specified)", "DictionaryUpperBound", CMDINTTYPE|CMDMSG, &dub, "dictionary upperbound to compute OOV word penalty: default 10^7", "dub", CMDINTTYPE|CMDMSG, &dub, "dictionary upperbound to compute OOV word penalty: default 10^7", "memmap", CMDINTTYPE|CMDMSG, &mmap, "uses memory map to read a binary LM", "mm", CMDINTTYPE|CMDMSG, &mmap, "uses memory map to read a binary LM", "level", CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the 
actual LM order, the latter is taken", "lev", CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken", "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help", "h", CMDBOOLTYPE|CMDMSG, &help, "print this help", (char *)NULL ); if (argc == 1){ usage(); exit_error(IRSTLM_NO_ERROR); } GetParams(&argc, &argv, (char*) NULL); if (help){ usage(); exit_error(IRSTLM_NO_ERROR); } if(lm == NULL){ usage(); exit_error(IRSTLM_ERROR_DATA,"Missing parameter: please, specify the LM to use (-lm)"); } std::ifstream lmstr(lm); lmtable lmt; lmt.setMaxLoadedLevel(requiredMaxlev); lmt.load(lmstr, lm, NULL, mmap); lmt.setlogOOVpenalty(dub); for(;;) { std::string line; std::getline(std::cin, line); if(!std::cin.good()) return !std::cin.eof(); std::istringstream linestr(line); ngram ng(lmt.dict); double logprob = .0; while((linestr >> ng)) logprob += lmt.lprob(ng); std::cout << logprob << std::endl; } } irstlm-6.00.05/src/shiftlm.cpp000066400000000000000000000501671263213470300161470ustar00rootroot00000000000000/****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include #include "util.h" #include "mfstream.h" #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "ngramtable.h" #include "ngramcache.h" #include "normcache.h" #include "interplm.h" #include "mdiadapt.h" #include "shiftlm.h" namespace irstlm { // //Shiftone interpolated language model // shiftone::shiftone(char* ngtfile,int depth,int prunefreq,TABLETYPE tt): mdiadaptlm(ngtfile,depth,tt) { cerr << "Creating LM with ShiftOne smoothing\n"; prunethresh=prunefreq; cerr << "PruneThresh: " << prunethresh << "\n"; beta=1.0; }; int shiftone::train() { trainunigr(); return 1; } int shiftone::discount(ngram ng_,int size,double& fstar,double& lambda, int cv) { ngram ng(dict); ng.trans(ng_); //cerr << "size:" << size << " ng:|" << ng <<"|\n"; if (size > 1) { ngram history=ng; if (ng.ckhisto(size) && get(history,size,size-1) && (history.freq>cv) && ((size < 3) || ((history.freq-cv) > prunethresh))) { // this history is not pruned out get(ng,size,size); cv=(cv>ng.freq)?ng.freq:cv; if (ng.freq > cv) { fstar=(double)((double)(ng.freq - cv) - beta)/(double)(history.freq-cv); lambda=beta * ((double)history.succ/(double)(history.freq-cv)); } else { // ng.freq == cv: do like if ng was deleted from the table fstar=0.0; lambda=beta * ((double)(history.succ-1)/ //one successor has disappeared! 
(double)(history.freq-cv)); } //cerr << "ngram :" << ng << "\n"; //check if the last word is OOV if (*ng.wordp(1)==dict->oovcode()) { lambda+=fstar; fstar=0.0; } else { //complete lambda with oovcode probability *ng.wordp(1)=dict->oovcode(); if (get(ng,size,size)) lambda+=(double)((double)ng.freq - beta)/(double)(history.freq-cv); } } else { fstar=0; lambda=1; } } else { fstar=unigr(ng); lambda=0.0; } return 1; } // //Shiftbeta interpolated language model // shiftbeta::shiftbeta(char* ngtfile,int depth,int prunefreq,double b,TABLETYPE tt): mdiadaptlm(ngtfile,depth,tt) { cerr << "Creating LM with ShiftBeta smoothing\n"; if (b==-1.0 || (b < 1.0 && b >0.0)) { beta=new double[lmsize()+1]; for (int l=lmsize(); l>1; l--) beta[l]=b; } else { exit_error(IRSTLM_ERROR_DATA,"shiftbeta::shiftbeta beta must be < 1.0 and > 0"); } prunethresh=prunefreq; cerr << "PruneThresh: " << prunethresh << "\n"; }; int shiftbeta::train() { ngram ng(dict); int n1,n2; trainunigr(); beta[1]=0.0; for (int l=2; l<=lmsize(); l++) { cerr << "level " << l << "\n"; n1=0; n2=0; scan(ng,INIT,l); while(scan(ng,CONT,l)) { if (l1 && ng.containsWord(dict->OOV(),l)) { //cerr << "skp ngram" << ng << "\n"; continue; } //skip n-grams containing in context if (l>1 && ng.containsWord(dict->EoS(),l-1)) { //cerr << "skp ngram" << ng << "\n"; continue; } //skip 1-grams containing if (l==1 && ng.containsWord(dict->BoS(),l)) { //cerr << "skp ngram" << ng << "\n"; continue; } if (ng.freq==1) n1++; else if (ng.freq==2) n2++; } //compute statistics of shiftbeta smoothing if (beta[l]==-1) { if (n1>0) beta[l]=(double)n1/(double)(n1 + 2 * n2); else { cerr << "no singletons! 
\n"; beta[l]=1.0; } } cerr << beta[l] << "\n"; } return 1; }; int shiftbeta::discount(ngram ng_,int size,double& fstar,double& lambda, int cv) { ngram ng(dict); ng.trans(ng_); if (size > 1) { ngram history=ng; if (ng.ckhisto(size) && get(history,size,size-1) && (history.freq>cv) && ((size < 3) || ((history.freq-cv) > prunethresh ))) { // apply history pruning on trigrams only if (get(ng,size,size) && (!prunesingletons() || ng.freq >1 || size<3)) { cv=(cv>ng.freq)?ng.freq:cv; if (ng.freq>cv) { fstar=(double)((double)(ng.freq - cv) - beta[size])/(double)(history.freq-cv); lambda=beta[size]*((double)history.succ/(double)(history.freq-cv)); if (size>=3 && prunesingletons()) // correction due to frequency pruning lambda+=(1.0-beta[size]) * (double)succ1(history.link)/(double)(history.freq-cv); // succ1(history.link) is not affected if ng.freq > cv } else { // ng.freq == cv fstar=0.0; lambda=beta[size]*((double)(history.succ-1)/ //e` sparito il successore (double)(history.freq-cv)); if (size>=3 && prunesingletons()) //take into account single event pruning lambda+=(1.0-beta[size]) * (double)(succ1(history.link)-(cv==1 && ng.freq==1?1:0)) /(double)(history.freq-cv); } } else { fstar=0.0; lambda=beta[size]*(double)history.succ/(double)history.freq; if (size>=3 && prunesingletons()) // correction due to frequency pruning lambda+=(1.0-beta[size]) * (double)succ1(history.link)/(double)history.freq; } //cerr << "ngram :" << ng << "\n"; if (*ng.wordp(1)==dict->oovcode()) { lambda+=fstar; fstar=0.0; } else { *ng.wordp(1)=dict->oovcode(); if (get(ng,size,size) && (!prunesingletons() || ng.freq >1 || size<3)) lambda+=(double)((double)ng.freq - beta[size])/(double)(history.freq-cv); } } else { fstar=0; lambda=1; } } else { fstar=unigr(ng); lambda=0.0; } return 1; } // //Improved Kneser-Ney language model (previously ModifiedShiftBeta) // improvedkneserney::improvedkneserney(char* ngtfile,int depth,int prunefreq,TABLETYPE tt): mdiadaptlm(ngtfile,depth,tt) { cerr << "Creating LM with 
Improved Kneser-Ney smoothing\n"; prunethresh=prunefreq; cerr << "PruneThresh: " << prunethresh << "\n"; beta[1][0]=0.0; beta[1][1]=0.0; beta[1][2]=0.0; }; int improvedkneserney::train() { trainunigr(); gencorrcounts(); gensuccstat(); ngram ng(dict); int n1,n2,n3,n4; int unover3=0; oovsum=0; for (int l=1; l<=lmsize(); l++) { cerr << "level " << l << "\n"; cerr << "computing statistics\n"; n1=0; n2=0; n3=0,n4=0; scan(ng,INIT,l); while(scan(ng,CONT,l)) { //skip ngrams containing _OOV if (l>1 && ng.containsWord(dict->OOV(),l)) { continue; } //skip n-grams containing in context if (l>1 && ng.containsWord(dict->EoS(),l-1)) { continue; } //skip 1-grams containing if (l==1 && ng.containsWord(dict->BoS(),l)) { continue; } ng.freq=mfreq(ng,l); if (ng.freq==1) n1++; else if (ng.freq==2) n2++; else if (ng.freq==3) n3++; else if (ng.freq==4) n4++; if (l==1 && ng.freq >=3) unover3++; } if (l==1) { cerr << " n1: " << n1 << " n2: " << n2 << " n3: " << n3 << " n4: " << n4 << " unover3: " << unover3 << "\n"; } else { cerr << " n1: " << n1 << " n2: " << n2 << " n3: " << n3 << " n4: " << n4 << "\n"; } if (n1 == 0 || n2 == 0 || n1 <= n2) { std::stringstream ss_msg; ss_msg << "Error: lower order count-of-counts cannot be estimated properly\n"; ss_msg << "Hint: use another smoothing method with this corpus.\n"; exit_error(IRSTLM_ERROR_DATA,ss_msg.str()); } double Y=(double)n1/(double)(n1 + 2 * n2); beta[0][l] = Y; //equivalent to 1 - 2 * Y * n2 / n1 if (n3 == 0 || n4 == 0 || n2 <= n3 || n3 <= n4 ){ cerr << "Warning: higher order count-of-counts cannot be estimated properly\n"; cerr << "Fixing this problem by resorting only on the lower order count-of-counts\n"; beta[1][l] = Y; beta[2][l] = Y; } else{ beta[1][l] = 2 - 3 * Y * n3 / n2; beta[2][l] = 3 - 4 * Y * n4 / n3; } if (beta[1][l] < 0){ cerr << "Warning: discount coefficient is negative \n"; cerr << "Fixing this problem by setting beta to 0 \n"; beta[1][l] = 0; } if (beta[2][l] < 0){ cerr << "Warning: discount coefficient is negative 
\n"; cerr << "Fixing this problem by setting beta to 0 \n"; beta[2][l] = 0; } if (l==1) oovsum=beta[0][l] * (double) n1 + beta[1][l] * (double)n2 + beta[2][l] * (double)unover3; cerr << beta[0][l] << " " << beta[1][l] << " " << beta[2][l] << "\n"; } return 1; }; int improvedkneserney::discount(ngram ng_,int size,double& fstar,double& lambda, int cv) { ngram ng(dict); ng.trans(ng_); //cerr << "size:" << size << " ng:|" << ng <<"|\n"; if (size > 1) { ngram history=ng; //singleton pruning only on real counts!! if (ng.ckhisto(size) && get(history,size,size-1) && (history.freq > cv) && ((size < 3) || ((history.freq-cv) > prunethresh ))) { // no history pruning with corrected counts! int suc[3]; suc[0]=succ1(history.link); suc[1]=succ2(history.link); suc[2]=history.succ-suc[0]-suc[1]; if (get(ng,size,size) && (!prunesingletons() || mfreq(ng,size)>1 || size<3) && (!prunetopsingletons() || mfreq(ng,size)>1 || sizeng.freq)?ng.freq:cv; if (ng.freq>cv) { double b=(ng.freq-cv>=3?beta[2][size]:beta[ng.freq-cv-1][size]); fstar=(double)((double)(ng.freq - cv) - b)/(double)(history.freq-cv); lambda=(beta[0][size] * suc[0] + beta[1][size] * suc[1] + beta[2][size] * suc[2]) / (double)(history.freq-cv); if ((size>=3 && prunesingletons()) || (size==maxlevel() && prunetopsingletons())) // correction due to frequency pruning lambda+=(double)(suc[0] * (1-beta[0][size])) / (double)(history.freq-cv); } else { // ng.freq==cv ng.freq>=3?suc[2]--:suc[ng.freq-1]--; //update successor stat fstar=0.0; lambda=(beta[0][size] * suc[0] + beta[1][size] * suc[1] + beta[2][size] * suc[2]) / (double)(history.freq-cv); if ((size>=3 && prunesingletons()) || (size==maxlevel() && prunetopsingletons())) // correction due to frequency pruning lambda+=(double)(suc[0] * (1-beta[0][size])) / (double)(history.freq-cv); ng.freq>=3?suc[2]++:suc[ng.freq-1]++; //resume successor stat } } else { fstar=0.0; lambda=(beta[0][size] * suc[0] + beta[1][size] * suc[1] + beta[2][size] * suc[2]) / (double)(history.freq-cv); if 
((size>=3 && prunesingletons()) || (size==maxlevel() && prunetopsingletons())) // correction due to frequency pruning lambda+=(double)(suc[0] * (1-beta[0][size])) / (double)(history.freq-cv); } //cerr << "ngram :" << ng << "\n"; if (*ng.wordp(1)==dict->oovcode()) { lambda+=fstar; fstar=0.0; } else { *ng.wordp(1)=dict->oovcode(); if (get(ng,size,size)) { ng.freq=mfreq(ng,size); if ((!prunesingletons() || ng.freq>1 || size<3) && (!prunetopsingletons() || ng.freq>1 || size=3?beta[2][size]:beta[ng.freq-1][size]); lambda+=(double)(ng.freq - b)/(double)(history.freq-cv); } } } } else { fstar=0; lambda=1; } } else { // unigram case, no cross-validation fstar=unigrIKN(ng); lambda=0.0; } return 1; } double improvedkneserney::unigrIKN(ngram ng) { int unigrtotfreq=(lmsize()>1)?btotfreq():totfreq(); double fstar=0.0; if (get(ng,1,1)) fstar=(double) mfreq(ng,1)/(double)unigrtotfreq; else { std::stringstream ss_msg; ss_msg << "Missing probability for word: " << dict->decode(*ng.wordp(1)); exit_error(IRSTLM_ERROR_DATA,ss_msg.str()); } return fstar; } // //Improved Shiftbeta language model (similar to Improved Kneser-Ney without corrected counts) // improvedshiftbeta::improvedshiftbeta(char* ngtfile,int depth,int prunefreq,TABLETYPE tt): mdiadaptlm(ngtfile,depth,tt) { cerr << "Creating LM with Improved ShiftBeta smoothing\n"; prunethresh=prunefreq; cerr << "PruneThresh: " << prunethresh << "\n"; beta[1][0]=0.0; beta[1][1]=0.0; beta[1][2]=0.0; }; int improvedshiftbeta::train() { trainunigr(); gensuccstat(); ngram ng(dict); int n1,n2,n3,n4; int unover3=0; oovsum=0; for (int l=1; l<=lmsize(); l++) { cerr << "level " << l << "\n"; cerr << "computing statistics\n"; n1=0; n2=0; n3=0,n4=0; scan(ng,INIT,l); while(scan(ng,CONT,l)) { //skip ngrams containing _OOV if (l>1 && ng.containsWord(dict->OOV(),l)) { continue; } //skip n-grams containing in context if (l>1 && ng.containsWord(dict->EoS(),l-1)) { continue; } //skip 1-grams containing if (l==1 && ng.containsWord(dict->BoS(),l)) { 
continue; } ng.freq=mfreq(ng,l); if (ng.freq==1) n1++; else if (ng.freq==2) n2++; else if (ng.freq==3) n3++; else if (ng.freq==4) n4++; if (l==1 && ng.freq >=3) unover3++; } if (l==1) { cerr << " n1: " << n1 << " n2: " << n2 << " n3: " << n3 << " n4: " << n4 << " unover3: " << unover3 << "\n"; } else { cerr << " n1: " << n1 << " n2: " << n2 << " n3: " << n3 << " n4: " << n4 << "\n"; } if (n1 == 0 || n2 == 0 || n1 <= n2) { std::stringstream ss_msg; ss_msg << "Error: lower order count-of-counts cannot be estimated properly\n"; ss_msg << "Hint: use another smoothing method with this corpus.\n"; exit_error(IRSTLM_ERROR_DATA,ss_msg.str()); } double Y=(double)n1/(double)(n1 + 2 * n2); beta[0][l] = Y; //equivalent to 1 - 2 * Y * n2 / n1 if (n3 == 0 || n4 == 0 || n2 <= n3 || n3 <= n4 ){ cerr << "Warning: higher order count-of-counts cannot be estimated properly\n"; cerr << "Fixing this problem by resorting only on the lower order count-of-counts\n"; beta[1][l] = Y; beta[2][l] = Y; } else{ beta[1][l] = 2 - 3 * Y * n3 / n2; beta[2][l] = 3 - 4 * Y * n4 / n3; } if (beta[1][l] < 0){ cerr << "Warning: discount coefficient is negative \n"; cerr << "Fixing this problem by setting beta to 0 \n"; beta[1][l] = 0; } if (beta[2][l] < 0){ cerr << "Warning: discount coefficient is negative \n"; cerr << "Fixing this problem by setting beta to 0 \n"; beta[2][l] = 0; } if (l==1) oovsum=beta[0][l] * (double) n1 + beta[1][l] * (double)n2 + beta[2][l] * (double)unover3; cerr << beta[0][l] << " " << beta[1][l] << " " << beta[2][l] << "\n"; } return 1; }; int improvedshiftbeta::discount(ngram ng_,int size,double& fstar,double& lambda, int cv) { ngram ng(dict); ng.trans(ng_); //cerr << "size:" << size << " ng:|" << ng <<"|\n"; if (size > 1) { ngram history=ng; //singleton pruning only on real counts!! if (ng.ckhisto(size) && get(history,size,size-1) && (history.freq > cv) && ((size < 3) || ((history.freq-cv) > prunethresh ))) { // no history pruning with corrected counts! 
int suc[3]; suc[0]=succ1(history.link); suc[1]=succ2(history.link); suc[2]=history.succ-suc[0]-suc[1]; if (get(ng,size,size) && (!prunesingletons() || mfreq(ng,size)>1 || size<3) && (!prunetopsingletons() || mfreq(ng,size)>1 || sizeng.freq)?ng.freq:cv; if (ng.freq>cv) { double b=(ng.freq-cv>=3?beta[2][size]:beta[ng.freq-cv-1][size]); fstar=(double)((double)(ng.freq - cv) - b)/(double)(history.freq-cv); lambda=(beta[0][size] * suc[0] + beta[1][size] * suc[1] + beta[2][size] * suc[2]) / (double)(history.freq-cv); if ((size>=3 && prunesingletons()) || (size==maxlevel() && prunetopsingletons())) // correction due to frequency pruning lambda+=(double)(suc[0] * (1-beta[0][size])) / (double)(history.freq-cv); } else { // ng.freq==cv ng.freq>=3?suc[2]--:suc[ng.freq-1]--; //update successor stat fstar=0.0; lambda=(beta[0][size] * suc[0] + beta[1][size] * suc[1] + beta[2][size] * suc[2]) / (double)(history.freq-cv); if ((size>=3 && prunesingletons()) || (size==maxlevel() && prunetopsingletons())) // correction due to frequency pruning lambda+=(double)(suc[0] * (1-beta[0][size])) / (double)(history.freq-cv); ng.freq>=3?suc[2]++:suc[ng.freq-1]++; //resume successor stat } } else { fstar=0.0; lambda=(beta[0][size] * suc[0] + beta[1][size] * suc[1] + beta[2][size] * suc[2]) / (double)(history.freq-cv); if ((size>=3 && prunesingletons()) || (size==maxlevel() && prunetopsingletons())) // correction due to frequency pruning lambda+=(double)(suc[0] * (1-beta[0][size])) / (double)(history.freq-cv); } //cerr << "ngram :" << ng << "\n"; if (*ng.wordp(1)==dict->oovcode()) { lambda+=fstar; fstar=0.0; } else { *ng.wordp(1)=dict->oovcode(); if (get(ng,size,size)) { ng.freq=mfreq(ng,size); if ((!prunesingletons() || ng.freq>1 || size<3) && (!prunetopsingletons() || ng.freq>1 || size=3?beta[2][size]:beta[ng.freq-1][size]); lambda+=(double)(ng.freq - b)/(double)(history.freq-cv); } } } } else { fstar=0; lambda=1; } } else { // unigram case, no cross-validation fstar=unigr(ng); lambda=0; } 
return 1; } //Symmetric Shiftbeta int symshiftbeta::discount(ngram ng_,int size,double& fstar,double& lambda, int /* unused parameter: cv */) { ngram ng(dict); ng.trans(ng_); //cerr << "size:" << size << " ng:|" << ng <<"|\n"; // Pr(x/y)= max{(c([x,y])-beta)/(N Pr(y)),0} + lambda Pr(x) // lambda=#bigrams/N MY_ASSERT(size<=2); // only works with bigrams // if (size == 2) { //compute unigram probability of denominator ngram unig(dict,1); *unig.wordp(1)=*ng.wordp(2); double prunig=unigr(unig); //create symmetric bigram if (*ng.wordp(1) > *ng.wordp(2)) { int tmp=*ng.wordp(1); *ng.wordp(1)=*ng.wordp(2); *ng.wordp(2)=tmp; } lambda=beta[2] * (double) entries(2)/(double)totfreq(); if (get(ng,2,2)) { fstar=(double)((double)ng.freq - beta[2])/ (totfreq() * prunig); } else { fstar=0; } } else { fstar=unigr(ng); lambda=0.0; } return 1; } }//namespace irstlm /* main(int argc, char** argv){ dictionary d(argv[1]); shiftbeta ilm(&d,argv[2],3); ngramtable test(&d,argv[2],3); ilm.train(); cerr << "PP " << ilm.test(test) << "\n"; ilm.savebin("newlm.lm",3); } */ irstlm-6.00.05/src/shiftlm.h000066400000000000000000000055511263213470300156110ustar00rootroot00000000000000/****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ namespace irstlm { // Non linear Shift based interpolated LMs class shiftone: public mdiadaptlm { protected: int prunethresh; double beta; public: shiftone(char* ngtfile,int depth=0,int prunefreq=0,TABLETYPE tt=SHIFTBETA_B); int train(); int discount(ngram ng,int size,double& fstar,double& lambda,int cv=0); ~shiftone() {} }; class shiftbeta: public mdiadaptlm { protected: int prunethresh; double* beta; public: shiftbeta(char* ngtfile,int depth=0,int prunefreq=0,double beta=-1,TABLETYPE tt=SHIFTBETA_B); int train(); int discount(ngram ng,int size,double& fstar,double& lambda,int cv=0); ~shiftbeta() { delete [] beta; } }; class symshiftbeta: public shiftbeta { public: symshiftbeta(char* ngtfile,int depth=0,int prunefreq=0,double beta=-1): shiftbeta(ngtfile,depth,prunefreq,beta) {} int discount(ngram ng,int size,double& fstar,double& lambda,int cv=0); }; class improvedkneserney: public mdiadaptlm { protected: int prunethresh; double beta[3][MAX_NGRAM]; ngramtable* tb[MAX_NGRAM]; double oovsum; public: improvedkneserney(char* ngtfile,int depth=0,int prunefreq=0,TABLETYPE tt=IMPROVEDKNESERNEY_B); int train(); int discount(ngram ng,int size,double& fstar,double& lambda,int cv=0); ~improvedkneserney() {} int mfreq(ngram& ng,int l) { return (l #include #include "mfstream.h" #include #include #include #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "mempool.h" #include "ngramcache.h" #include "ngramtable.h" #include "interplm.h" #include "normcache.h" #include "mdiadapt.h" #include "shiftlm.h" #include "linearlm.h" #include "mixture.h" #include "cmd.h" #include "lmtable.h" #define YES 1 #define NO 0 #define NGRAM 1 #define SEQUENCE 2 #define 
ADAPT 3 #define TURN 4 #define TEXT 5 #define END_ENUM { (char*)0, 0 } static Enum_T BooleanEnum [] = { { "Yes", YES }, { "No", NO}, { "yes", YES }, { "no", NO}, { "y", YES }, { "n", NO}, END_ENUM }; static Enum_T LmTypeEnum [] = { { "ImprovedKneserNey", IMPROVED_KNESER_NEY }, { "ikn", IMPROVED_KNESER_NEY }, { "KneserNey", KNESER_NEY }, { "kn", KNESER_NEY }, { "ModifiedShiftBeta", MOD_SHIFT_BETA }, { "msb", MOD_SHIFT_BETA }, { "ImprovedShiftBeta", IMPROVED_SHIFT_BETA }, { "isb", IMPROVED_SHIFT_BETA }, { "InterpShiftBeta", SHIFT_BETA }, { "ShiftBeta", SHIFT_BETA }, { "sb", SHIFT_BETA }, { "InterpShiftOne", SHIFT_ONE }, { "ShiftOne", SHIFT_ONE }, { "s1", SHIFT_ONE }, { "LinearWittenBell", LINEAR_WB }, { "wb", LINEAR_WB }, { "LinearGoodTuring", LINEAR_GT }, { "Mixture", MIXTURE }, { "mix", MIXTURE }, END_ENUM }; #define RESET 1 #define SAVE 2 #define LOAD 3 #define INIT 4 #define STOP 5 #define BIN 11 #define ARPA 12 #define ASR 13 #define TXT 14 #define NGT 15 int init(mdiadaptlm** lm, int lmtype, char *trainfile, int size, int prunefreq, double beta, int backoff, int dub, double oovrate, int mcl); int deinit(mdiadaptlm** lm); int main(int argc, char **argv) { char *dictfile=NULL; char *trainfile=NULL; char *BINfile=NULL; char *ARPAfile=NULL; char *ASRfile=NULL; int backoff=0; //back-off or interpolation int lmtype=0; int dub=0; //dictionary upper bound int size=0; //lm size int statistics=0; int prunefreq=NO; int prunesingletons=YES; int prunetopsingletons=NO; double beta=-1; int compsize=NO; int checkpr=NO; double oovrate=0; int max_caching_level=0; char *outpr=NULL; int memmap = 0; //write binary format with/without memory map, default is 0 DeclareParams( "Back-off",CMDENUMTYPE, &backoff, BooleanEnum, "bo",CMDENUMTYPE, &backoff, BooleanEnum, "Dictionary", CMDSTRINGTYPE, &dictfile, "d", CMDSTRINGTYPE, &dictfile, "DictionaryUpperBound", CMDINTTYPE, &dub, "dub", CMDINTTYPE, &dub, "NgramSize", CMDSUBRANGETYPE, &size, 1 , MAX_NGRAM, "n", CMDSUBRANGETYPE, &size, 1 , 
MAX_NGRAM, "Ngram", CMDSTRINGTYPE, &trainfile, "TrainOn", CMDSTRINGTYPE, &trainfile, "tr", CMDSTRINGTYPE, &trainfile, "oASR", CMDSTRINGTYPE, &ASRfile, "oasr", CMDSTRINGTYPE, &ASRfile, "o", CMDSTRINGTYPE, &ARPAfile, "oARPA", CMDSTRINGTYPE, &ARPAfile, "oarpa", CMDSTRINGTYPE, &ARPAfile, "oBIN", CMDSTRINGTYPE, &BINfile, "obin", CMDSTRINGTYPE, &BINfile, "LanguageModelType",CMDENUMTYPE, &lmtype, LmTypeEnum, "lm",CMDENUMTYPE, &lmtype, LmTypeEnum, "Statistics",CMDSUBRANGETYPE, &statistics, 1 , 3, "s",CMDSUBRANGETYPE, &statistics, 1 , 3, "PruneThresh",CMDSUBRANGETYPE, &prunefreq, 1 , 1000, "p",CMDSUBRANGETYPE, &prunefreq, 1 , 1000, "PruneSingletons",CMDENUMTYPE, &prunesingletons, BooleanEnum, "ps",CMDENUMTYPE, &prunesingletons, BooleanEnum, "PruneTopSingletons",CMDENUMTYPE, &prunetopsingletons, BooleanEnum, "pts",CMDENUMTYPE, &prunetopsingletons, BooleanEnum, "ComputeLMSize",CMDENUMTYPE, &compsize, BooleanEnum, "sz",CMDENUMTYPE, &compsize, BooleanEnum, "MaximumCachingLevel", CMDINTTYPE , &max_caching_level, "mcl", CMDINTTYPE, &max_caching_level, "MemoryMap", CMDENUMTYPE, &memmap, BooleanEnum, "memmap", CMDENUMTYPE, &memmap, BooleanEnum, "mm", CMDENUMTYPE, &memmap, BooleanEnum, "CheckProb",CMDENUMTYPE, &checkpr, BooleanEnum, "cp",CMDENUMTYPE, &checkpr, BooleanEnum, "OutProb",CMDSTRINGTYPE, &outpr, "op",CMDSTRINGTYPE, &outpr, "SetOovRate", CMDDOUBLETYPE, &oovrate, "or", CMDDOUBLETYPE, &oovrate, "Beta", CMDDOUBLETYPE, &beta, "beta", CMDDOUBLETYPE, &beta, (char *)NULL ); GetParams(&argc, &argv, (char*) NULL); if (!lmtype) { cerr <<"Missing parameters\n"; exit(1); } cerr <<"LM size: " << size << "\n"; char header[BUFSIZ]; char filename[BUFSIZ]; int cmdcounter=0; mdiadaptlm *lm=NULL; int cmdtype=INIT; int filetype=0; int BoSfreq=0; init(&lm, lmtype, trainfile, size, prunefreq, beta, backoff, dub, oovrate, max_caching_level); ngram ng(lm->dict), ng2(lm->dict); cerr << "filling the initial n-grams with BoS\n"; for (int i=1; imaxlevel(); i++) { ng.pushw(lm->dict->BoS()); ng.freq=1; 
} mfstream inp("/dev/stdin",ios::in ); int c=0; while (inp >> header) { if (strncmp(header,"@CMD@",5)==0) { cmdcounter++; inp >> header; cerr << "Read |@CMD@| |" << header << "|"; cmdtype=INIT; filetype=BIN; if (strncmp(header,"RESET",5)==0) cmdtype=RESET; else if (strncmp(header,"INIT",4)==0) cmdtype=INIT; else if (strncmp(header,"SAVEBIN",7)==0) { cmdtype=SAVE; filetype=BIN; } else if (strncmp(header,"SAVEARPA",8)==0) { cmdtype=SAVE; filetype=ARPA; } else if (strncmp(header,"SAVEASR",7)==0) { cmdtype=SAVE; filetype=ASR; } else if (strncmp(header,"SAVENGT",7)==0) { cmdtype=SAVE; filetype=NGT; } else if (strncmp(header,"LOADNGT",7)==0) { cmdtype=LOAD; filetype=NGT; } else if (strncmp(header,"LOADTXT",7)==0) { cmdtype=LOAD; filetype=TXT; } else if (strncmp(header,"STOP",4)==0) cmdtype=STOP; else { cerr << "CMD " << header << " is unknown\n"; exit(1); } char** lastwords; char *isym; switch (cmdtype) { case STOP: cerr << "\n"; exit(1); break; case SAVE: inp >> filename; //storing the output filename cerr << " |" << filename << "|\n"; //save actual ngramtable char tmpngtfile[BUFSIZ]; sprintf(tmpngtfile,"%s.ngt%d",filename,cmdcounter); cerr << "saving temporary ngramtable (binary)..." << tmpngtfile << "\n"; ((ngramtable*) lm)->ngtype("ngram"); ((ngramtable*) lm)->savetxt(tmpngtfile,size); //get the actual frequency of BoS symbol, because the constructor of LM will reset to 1; BoSfreq=lm->dict->freq(lm->dict->encode(lm->dict->BoS())); lm->train(); lm->prunesingletons(prunesingletons==YES); lm->prunetopsingletons(prunetopsingletons==YES); if (prunetopsingletons==YES) //keep most specific lm->prunesingletons(NO); switch (filetype) { case BIN: cerr << "saving lm (binary) ... " << filename << "\n"; lm->saveBIN(filename,backoff,dictfile,memmap); cerr << "\n"; break; case ARPA: cerr << "save lm (ARPA)... " << filename << "\n"; lm->saveARPA(filename,backoff,dictfile); cerr << "\n"; break; case ASR: cerr << "save lm (ASR)... 
" << filename << "\n"; lm->saveASR(filename,backoff,dictfile); cerr << "\n"; break; case NGT: cerr << "save the ngramtable on ... " << filename << "\n"; { ifstream ifs(tmpngtfile, ios::binary); std::ofstream ofs(filename, std::ios::binary); ofs << ifs.rdbuf(); } cerr << "\n"; break; default: cerr << "Saving type is unknown\n"; exit(1); }; //store last words up to the LM order (filling with BoS if needed) ng.size=(ng.size>lm->maxlevel())?lm->maxlevel():ng.size; lastwords = new char*[lm->maxlevel()]; for (int i=1; imaxlevel(); i++) { lastwords[i] = new char[BUFSIZ]; if (i<=ng.size) strcpy(lastwords[i],lm->dict->decode(*ng.wordp(i))); else strcpy(lastwords[i],lm->dict->BoS()); } deinit(&lm); init(&lm, lmtype, tmpngtfile, size, prunefreq, beta, backoff, dub, oovrate, max_caching_level); if (remove(tmpngtfile) != 0) cerr << "Error deleting file " << tmpngtfile << endl; else cerr << "File " << tmpngtfile << " successfully deleted" << endl; //re-set the dictionaries of the working ngrams and re-encode the actual ngram ng.dict=ng2.dict=lm->dict; ng.size=lm->maxlevel(); //restore the last words re-encoded wrt to the new dictionary for (int i=1; imaxlevel(); i++) { *ng.wordp(i)=lm->dict->encode(lastwords[i]); delete []lastwords[i]; } delete []lastwords; //re-set the actual frequency of BoS symbol, because the constructor of LM deleted it; lm->dict->freq(lm->dict->encode(lm->dict->BoS()), BoSfreq); break; case RESET: //restart from scratch deinit(&lm); init(&lm, lmtype, NULL, size, prunefreq, beta, backoff, dub, oovrate, max_caching_level); ng.dict=ng2.dict=lm->dict; cerr << "filling the initial n-grams with BoS\n"; for (int i=1; imaxlevel(); i++) { ng.pushw(lm->dict->BoS()); ng.freq=1; } break; case INIT: cerr << "CMD " << header << " not yet implemented\n"; exit(1); break; case LOAD: inp >> filename; //storing the input filename cerr << " |" << filename << "|\n"; isym=new char[BUFSIZ]; strcpy(isym,lm->dict->EoS()); ngramtable* ngt; switch (filetype) { case NGT: cerr << 
"loading an ngramtable..." << filename << "\n"; ngt = new ngramtable(filename,size,isym,NULL,NULL); ((ngramtable*) lm)->augment(ngt); cerr << "\n"; break; case TXT: cerr << "loading from text..." << filename << "\n"; ngt= new ngramtable(filename,size,isym,NULL,NULL); ((ngramtable*) lm)->augment(ngt); cerr << "\n"; break; default: cerr << "This file type is unknown\n"; exit(1); }; break; default: cerr << "CMD " << header << " is unknown\n"; exit(1); }; } else { ng.pushw(header); // CHECK: serve questa trans() ng2.trans(ng); //reencode with new dictionary lm->check_dictsize_bound(); //CHECK: e' corretto ng.size? non dovrebbe essere ng2.size? if (ng.size) lm->dict->incfreq(*ng2.wordp(1),1); //CHECK: what about filtering dictionary??? /* if (filterdict){ int code=filterdict->encode(dict->decode(*ng2.wordp(maxlev))); if (code!=filterdict->oovcode()) put(ng2); } else put(ng2); */ lm->put(ng2); if (!(++c % 1000000)) cerr << "."; } } if (statistics) { cerr << "TLM: lm stat ..."; lm->lmstat(statistics); cerr << "\n"; } cerr << "TLM: deleting lm ..."; //delete lm; cerr << "\n"; exit(0); } int init(mdiadaptlm** lm, int lmtype, char *trainfile, int size, int prunefreq, double beta, int backoff, int dub, double oovrate, int mcl) { cerr << "initializing lm... 
\n"; if (trainfile) cerr << "creating lm from " << trainfile << "\n"; else cerr << "creating an empty lm\n"; switch (lmtype) { case SHIFT_BETA: if (beta==-1 || (beta<1.0 && beta>0)) *lm=new shiftbeta(trainfile,size,prunefreq,beta,(backoff?SHIFTBETA_B:SHIFTBETA_I)); else { cerr << "ShiftBeta: beta must be >0 and <1\n"; exit(1); } break; case KNESER_NEY: if (size>1){ if (beta==-1 || (beta<1.0 && beta>0)){ // lm=new kneserney(trainfile,size,prunefreq,beta,(backoff?KNESERNEY_B:KNESERNEY_I)); } else { exit_error(IRSTLM_ERROR_DATA,"ShiftBeta: beta must be >0 and <1"); } } else { exit_error(IRSTLM_ERROR_DATA,"Kneser-Ney requires size >1"); } break; case MOD_SHIFT_BETA: cerr << "ModifiedShiftBeta (msb) is the old name for ImprovedKneserNey (ikn); this name is not supported anymore, but it is mapped into ImprovedKneserNey for back-compatibility"; case IMPROVED_KNESER_NEY: if (size>1){ lm=new improvedkneserney(trainfile,size,prunefreq,(backoff?IMPROVEDKNESERNEY_B:IMPROVEDKNESERNEY_I)); } else { exit_error(IRSTLM_ERROR_DATA,"Improved Kneser-Ney requires size >1"); } break; case IMPROVED_SHIFT_BETA: lm=new improvedshiftbeta(trainfile,size,prunefreq,(backoff?IMPROVEDSHIFTBETA_B:IMPROVEDSHIFTBETA_I)); break; case SHIFT_ONE: *lm=new shiftone(trainfile,size,prunefreq,(backoff?SIMPLE_B:SIMPLE_I)); break; case LINEAR_WB: *lm=new linearwb(trainfile,size,prunefreq,(backoff?MSHIFTBETA_B:MSHIFTBETA_I)); break; case LINEAR_GT: cerr << "This LM is no more supported\n"; break; case MIXTURE: cerr << "not implemented yet\n"; break; default: cerr << "not implemented yet\n"; exit(1); }; if (dub) (*lm)->dub(dub); (*lm)->create_caches(mcl); cerr << "eventually generate OOV code\n"; (*lm)->dict->genoovcode(); if (oovrate) (*lm)->dict->setoovrate(oovrate); (*lm)->dict->incflag(1); if (!trainfile) { cerr << "adding the initial dummy n-grams to make table consistent\n"; ngram dummyng((*lm)->dict); cerr << "preparing initial dummy n-grams\n"; for (int i=1; i<(*lm)->maxlevel(); i++) { 
dummyng.pushw((*lm)->dict->BoS()); dummyng.freq=1; } cerr << "inside init: dict: " << (*lm)->dict << " dictsize: " << (*lm)->dict->size() << "\n"; cerr << "dummyng: |" << dummyng << "\n"; (*lm)->put(dummyng); cerr << "inside init: dict: " << (*lm)->dict << " dictsize: " << (*lm)->dict->size() << "\n"; } cerr << "lm initialized \n"; return 1; } int deinit(mdiadaptlm** lm) { delete *lm; return 1; } irstlm-6.00.05/src/thpool.c000066400000000000000000000316521263213470300154440ustar00rootroot00000000000000/* ******************************** * Author: Johan Hanssen Seferidis * License: MIT * Description: Library providing a threading pool where you can add * work. For usage, check the thpool.h file or README.md * *//** @file thpool.h *//* * ********************************/ #include #include #include #include #include #include #include #include "thpool.h" #ifdef THPOOL_DEBUG #define THPOOL_DEBUG 1 #else #define THPOOL_DEBUG 0 #endif #define MAX_NANOSEC 999999999 #define CEIL(X) ((X-(int)(X)) > 0 ? 
(int)(X+1) : (int)(X)) static volatile int threads_keepalive; static volatile int threads_on_hold; /* ========================== STRUCTURES ============================ */ /* Binary semaphore */ typedef struct bsem { pthread_mutex_t mutex; pthread_cond_t cond; int v; } bsem; /* Job */ typedef struct job{ struct job* prev; /* pointer to previous job */ void* (*function)(void* arg); /* function pointer */ void* arg; /* function's argument */ } job; /* Job queue */ typedef struct jobqueue{ pthread_mutex_t rwmutex; /* used for queue r/w access */ job *front; /* pointer to front of queue */ job *rear; /* pointer to rear of queue */ bsem *has_jobs; /* flag as binary semaphore */ int len; /* number of jobs in queue */ } jobqueue; /* Thread */ typedef struct thread{ int id; /* friendly id */ pthread_t pthread; /* pointer to actual thread */ struct thpool_* thpool_p; /* access to thpool */ } thread; /* Threadpool */ typedef struct thpool_{ thread** threads; /* pointer to threads */ volatile int num_threads_alive; /* threads currently alive */ volatile int num_threads_working; /* threads currently working */ pthread_mutex_t thcount_lock; /* used for thread count etc */ jobqueue* jobqueue_p; /* pointer to the job queue */ } thpool_; /* ========================== PROTOTYPES ============================ */ static void thread_init(thpool_* thpool_p, struct thread** thread_p, int id); static void* thread_do(struct thread* thread_p); static void thread_hold(); static void thread_destroy(struct thread* thread_p); static int jobqueue_init(thpool_* thpool_p); static void jobqueue_clear(thpool_* thpool_p); static void jobqueue_push(thpool_* thpool_p, struct job* newjob_p); static struct job* jobqueue_pull(thpool_* thpool_p); static void jobqueue_destroy(thpool_* thpool_p); static void bsem_init(struct bsem *bsem_p, int value); static void bsem_reset(struct bsem *bsem_p); static void bsem_post(struct bsem *bsem_p); static void bsem_post_all(struct bsem *bsem_p); static void 
bsem_wait(struct bsem *bsem_p); /* ========================== THREADPOOL ============================ */ /* Initialise thread pool */ struct thpool_* thpool_init(int num_threads){ threads_on_hold = 0; threads_keepalive = 1; if ( num_threads < 0){ num_threads = 0; } /* Make new thread pool */ thpool_* thpool_p=NULL; thpool_p = (struct thpool_*)calloc(1,sizeof(struct thpool_)); if (thpool_p==NULL){ fprintf(stderr, "thpool_init(): Could not allocate memory for thread pool\n"); exit(1); } pthread_mutex_init(&(thpool_p->thcount_lock), NULL); thpool_p->num_threads_alive = 0; thpool_p->num_threads_working = 0; /* Initialise the job queue */ if (jobqueue_init(thpool_p)==-1){ fprintf(stderr, "thpool_init(): Could not allocate memory for job queue\n"); exit(1); } /* Make threads in pool */ thpool_p->threads = (struct thread**)calloc(num_threads,sizeof(struct thread)); if (thpool_p->threads==NULL){ fprintf(stderr, "thpool_init(): Could not allocate memory for threads\n"); exit(1); } /* Thread init */ int n; for (n=0; nthreads[n], n); if (THPOOL_DEBUG) printf("THPOOL_DEBUG: Created thread %d in pool \n", n); } /* Wait for threads to initialize */ while (thpool_p->num_threads_alive != num_threads) {} return thpool_p; } /* Add work to the thread pool */ int thpool_add_work(thpool_* thpool_p, void *(*function_p)(void*), void* arg_p){ job* newjob=NULL; newjob=(struct job*)calloc(1,sizeof(struct job)); if (newjob==NULL){ fprintf(stderr, "thpool_add_work(): Could not allocate memory for new job\n"); return -1; } /* add function and argument */ newjob->function=function_p; newjob->arg=arg_p; /* add job to queue */ pthread_mutex_lock(&thpool_p->jobqueue_p->rwmutex); jobqueue_push(thpool_p, newjob); pthread_mutex_unlock(&thpool_p->jobqueue_p->rwmutex); return 0; } /* Wait until all jobs have finished */ void thpool_wait(thpool_* thpool_p){ /* Continuous polling */ double timeout = 1.0; time_t start, end; double tpassed = 0.0; time (&start); while (tpassed < timeout && 
(thpool_p->jobqueue_p->len || thpool_p->num_threads_working)) { time (&end); tpassed = difftime(end,start); } /* Exponential polling */ long init_nano = 1; /* MUST be above 0 */ long new_nano; double multiplier = 1.01; int max_secs = 20; struct timespec polling_interval; polling_interval.tv_sec = 0; polling_interval.tv_nsec = init_nano; while (thpool_p->jobqueue_p->len || thpool_p->num_threads_working) { nanosleep(&polling_interval, NULL); if ( polling_interval.tv_sec < max_secs ){ new_nano = CEIL(polling_interval.tv_nsec * multiplier); polling_interval.tv_nsec = new_nano % MAX_NANOSEC; if ( new_nano > MAX_NANOSEC ) { polling_interval.tv_sec ++; } } else break; } /* Fall back to max polling */ while (thpool_p->jobqueue_p->len || thpool_p->num_threads_working){ sleep(max_secs); } } /* Destroy the threadpool */ void thpool_destroy(thpool_* thpool_p){ volatile int threads_total = thpool_p->num_threads_alive; /* End each thread 's infinite loop */ threads_keepalive = 0; /* Give one second to kill idle threads */ double TIMEOUT = 1.0; time_t start, end; double tpassed = 0.0; time (&start); while (tpassed < TIMEOUT && thpool_p->num_threads_alive){ bsem_post_all(thpool_p->jobqueue_p->has_jobs); time (&end); tpassed = difftime(end,start); } /* Poll remaining threads */ while (thpool_p->num_threads_alive){ bsem_post_all(thpool_p->jobqueue_p->has_jobs); sleep(1); } /* Job queue cleanup */ jobqueue_destroy(thpool_p); free(thpool_p->jobqueue_p); /* Deallocs */ int n; for (n=0; n < threads_total; n++){ thread_destroy(thpool_p->threads[n]); } free(thpool_p->threads); free(thpool_p); } /* Pause all threads in threadpool */ void thpool_pause(thpool_* thpool_p) { int n; for (n=0; n < thpool_p->num_threads_alive; n++){ pthread_kill(thpool_p->threads[n]->pthread, SIGUSR1); } } /* Resume all threads in threadpool */ void thpool_resume(thpool_* thpool_p) { threads_on_hold = 0; } /* ============================ THREAD ============================== */ /* Initialize a thread in the 
thread pool * * @param thread address to the pointer of the thread to be created * @param id id to be given to the thread * */ static void thread_init (thpool_* thpool_p, struct thread** thread_p, int id){ *thread_p = (struct thread*)calloc(1,sizeof(struct thread)); if (thread_p == NULL){ fprintf(stderr, "thpool_init(): Could not allocate memory for thread\n"); exit(1); } (*thread_p)->thpool_p = thpool_p; (*thread_p)->id = id; pthread_create(&(*thread_p)->pthread, NULL, (void *)thread_do, (*thread_p)); pthread_detach((*thread_p)->pthread); } /* Sets the calling thread on hold */ static void thread_hold () { threads_on_hold = 1; while (threads_on_hold){ sleep(1); } } /* What each thread is doing * * In principle this is an endless loop. The only time this loop gets interuppted is once * thpool_destroy() is invoked or the program exits. * * @param thread thread that will run this function * @return nothing */ static void* thread_do(struct thread* thread_p){ /* Assure all threads have been created before starting serving */ thpool_* thpool_p = thread_p->thpool_p; /* Register signal handler */ struct sigaction act; act.sa_handler = thread_hold; if (sigaction(SIGUSR1, &act, NULL) == -1) { fprintf(stderr, "thread_do(): cannot handle SIGUSR1"); } /* Mark thread as alive (initialized) */ pthread_mutex_lock(&thpool_p->thcount_lock); thpool_p->num_threads_alive += 1; pthread_mutex_unlock(&thpool_p->thcount_lock); while(threads_keepalive){ bsem_wait(thpool_p->jobqueue_p->has_jobs); if (threads_keepalive){ pthread_mutex_lock(&thpool_p->thcount_lock); thpool_p->num_threads_working++; pthread_mutex_unlock(&thpool_p->thcount_lock); /* Read job from queue and execute it */ void*(*func_buff)(void* arg); void* arg_buff; job* job_p; pthread_mutex_lock(&thpool_p->jobqueue_p->rwmutex); job_p = jobqueue_pull(thpool_p); pthread_mutex_unlock(&thpool_p->jobqueue_p->rwmutex); if (job_p) { func_buff = job_p->function; arg_buff = job_p->arg; func_buff(arg_buff); free(job_p); } 
pthread_mutex_lock(&thpool_p->thcount_lock); thpool_p->num_threads_working--; pthread_mutex_unlock(&thpool_p->thcount_lock); } } pthread_mutex_lock(&thpool_p->thcount_lock); thpool_p->num_threads_alive --; pthread_mutex_unlock(&thpool_p->thcount_lock); return NULL; } /* Frees a thread */ static void thread_destroy (thread* thread_p){ free(thread_p); } /* ============================ JOB QUEUE =========================== */ /* Initialize queue */ static int jobqueue_init(thpool_* thpool_p){ thpool_p->jobqueue_p = (struct jobqueue*)calloc(1,sizeof(struct jobqueue)); pthread_mutex_init(&(thpool_p->jobqueue_p->rwmutex), NULL); if (thpool_p->jobqueue_p == NULL){ return -1; } thpool_p->jobqueue_p->has_jobs = (struct bsem*)calloc(1,sizeof(struct bsem)); if (thpool_p->jobqueue_p->has_jobs == NULL){ return -1; } bsem_init(thpool_p->jobqueue_p->has_jobs, 0); jobqueue_clear(thpool_p); return 0; } /* Clear the queue */ static void jobqueue_clear(thpool_* thpool_p){ while(thpool_p->jobqueue_p->len){ free(jobqueue_pull(thpool_p)); } thpool_p->jobqueue_p->front = NULL; thpool_p->jobqueue_p->rear = NULL; bsem_reset(thpool_p->jobqueue_p->has_jobs); thpool_p->jobqueue_p->len = 0; } /* Add (allocated) job to queue * * Notice: Caller MUST hold a mutex */ static void jobqueue_push(thpool_* thpool_p, struct job* newjob){ newjob->prev = NULL; switch(thpool_p->jobqueue_p->len){ case 0: /* if no jobs in queue */ thpool_p->jobqueue_p->front = newjob; thpool_p->jobqueue_p->rear = newjob; break; default: /* if jobs in queue */ thpool_p->jobqueue_p->rear->prev = newjob; thpool_p->jobqueue_p->rear = newjob; } thpool_p->jobqueue_p->len++; bsem_post(thpool_p->jobqueue_p->has_jobs); } /* Get first job from queue(removes it from queue) * * Notice: Caller MUST hold a mutex */ static struct job* jobqueue_pull(thpool_* thpool_p){ job* job_p; job_p = thpool_p->jobqueue_p->front; switch(thpool_p->jobqueue_p->len){ case 0: /* if no jobs in queue */ return NULL; case 1: /* if one job in queue */ 
thpool_p->jobqueue_p->front = NULL; thpool_p->jobqueue_p->rear = NULL; break; default: /* if >1 jobs in queue */ thpool_p->jobqueue_p->front = job_p->prev; } thpool_p->jobqueue_p->len--; /* Make sure has_jobs has right value */ if (thpool_p->jobqueue_p->len > 0) { bsem_post(thpool_p->jobqueue_p->has_jobs); } return job_p; } /* Free all queue resources back to the system */ static void jobqueue_destroy(thpool_* thpool_p){ jobqueue_clear(thpool_p); free(thpool_p->jobqueue_p->has_jobs); } /* ======================== SYNCHRONISATION ========================= */ /* Init semaphore to 1 or 0 */ static void bsem_init(bsem *bsem_p, int value) { if (value < 0 || value > 1) { fprintf(stderr, "bsem_init(): Binary semaphore can take only values 1 or 0"); exit(1); } pthread_mutex_init(&(bsem_p->mutex), NULL); pthread_cond_init(&(bsem_p->cond), NULL); bsem_p->v = value; } /* Reset semaphore to 0 */ static void bsem_reset(bsem *bsem_p) { bsem_init(bsem_p, 0); } /* Post to at least one thread */ static void bsem_post(bsem *bsem_p) { pthread_mutex_lock(&bsem_p->mutex); bsem_p->v = 1; pthread_cond_signal(&bsem_p->cond); pthread_mutex_unlock(&bsem_p->mutex); } /* Post to all threads */ static void bsem_post_all(bsem *bsem_p) { pthread_mutex_lock(&bsem_p->mutex); bsem_p->v = 1; pthread_cond_broadcast(&bsem_p->cond); pthread_mutex_unlock(&bsem_p->mutex); } /* Wait on semaphore until semaphore has value 0 */ static void bsem_wait(bsem* bsem_p) { pthread_mutex_lock(&bsem_p->mutex); while (bsem_p->v != 1) { pthread_cond_wait(&bsem_p->cond, &bsem_p->mutex); } bsem_p->v = 0; pthread_mutex_unlock(&bsem_p->mutex); } irstlm-6.00.05/src/thpool.h000066400000000000000000000100541263213470300154420ustar00rootroot00000000000000/********************************** * @author Johan Hanssen Seferidis * License: MIT * **********************************/ #ifndef _THPOOL_ #define _THPOOL_ #ifdef __cplusplus extern "C" { #endif /* =================================== API 
======================================= */ typedef struct thpool_* threadpool; /** * @brief Initialize threadpool * * Initializes a threadpool. This function will not return untill all * threads have initialized successfully. * * @example * * .. * threadpool thpool; //First we declare a threadpool * thpool = thpool_init(4); //then we initialize it to 4 threads * .. * * @param num_threads number of threads to be created in the threadpool * @return threadpool created threadpool on success, * NULL on error */ threadpool thpool_init(int num_threads); /** * @brief Add work to the job queue * * Takes an action and its argument and adds it to the threadpool's job queue. * If you want to add to work a function with more than one arguments then * a way to implement this is by passing a pointer to a structure. * * NOTICE: You have to cast both the function and argument to not get warnings. * * @example * * void print_num(int num){ * printf("%d\n", num); * } * * int main() { * .. * int a = 10; * thpool_add_work(thpool, (void*)print_num, (void*)a); * .. * } * * @param threadpool threadpool to which the work will be added * @param function_p pointer to function to add as work * @param arg_p pointer to an argument * @return nothing */ int thpool_add_work(threadpool, void *(*function_p)(void*), void* arg_p); /** * @brief Wait for all queued jobs to finish * * Will wait for all jobs - both queued and currently running to finish. * Once the queue is empty and all work has completed, the calling thread * (probably the main program) will continue. * * Smart polling is used in wait. The polling is initially 0 - meaning that * there is virtually no polling at all. If after 1 seconds the threads * haven't finished, the polling interval starts growing exponentially * untill it reaches max_secs seconds. Then it jumps down to a maximum polling * interval assuming that heavy processing is being used in the threadpool. * * @example * * .. * threadpool thpool = thpool_init(4); * .. 
* // Add a bunch of work * .. * thpool_wait(thpool); * puts("All added work has finished"); * .. * * @param threadpool the threadpool to wait for * @return nothing */ void thpool_wait(threadpool); /** * @brief Pauses all threads immediately * * The threads will be paused no matter if they are idle or working. * The threads return to their previous states once thpool_resume * is called. * * While the thread is being paused, new work can be added. * * @example * * threadpool thpool = thpool_init(4); * thpool_pause(thpool); * .. * // Add a bunch of work * .. * thpool_resume(thpool); // Let the threads start their magic * * @param threadpool the threadpool where the threads should be paused * @return nothing */ void thpool_pause(threadpool); /** * @brief Unpauses all threads if they are paused * * @example * .. * thpool_pause(thpool); * sleep(10); // Delay execution 10 seconds * thpool_resume(thpool); * .. * * @param threadpool the threadpool where the threads should be unpaused * @return nothing */ void thpool_resume(threadpool); /** * @brief Destroy the threadpool * * This will wait for the currently active threads to finish and then 'kill' * the whole threadpool to free up memory. * * @example * int main() { * threadpool thpool1 = thpool_init(2); * threadpool thpool2 = thpool_init(2); * .. * thpool_destroy(thpool1); * .. * return 0; * } * * @param threadpool the threadpool to destroy * @return nothing */ void thpool_destroy(threadpool); #ifdef __cplusplus } #endif #endif irstlm-6.00.05/src/timer.cpp000066400000000000000000000052651263213470300156200ustar00rootroot00000000000000#include #include #include #include "util.h" #include "timer.h" /*** * Return the total time that the timer has been in the "running" * state since it was first "started" or last "restarted". For * "short" time periods (less than an hour), the actual cpu time * used is reported instead of the elapsed time. 
*/ double Timer::elapsed_time() { time_t now; time(&now); return difftime(now, start_time); } /*** * Return the total time that the timer has been in the "running" * state since it was first "started" or last "restarted". For * "short" time periods (less than an hour), the actual cpu time * used is reported instead of the elapsed time. * This function is the public version of elapsed_time() */ double Timer::get_elapsed_time() { return elapsed_time(); } /*** * Start a timer. If it is already running, let it continue running. * Print an optional message. */ void Timer::start(const char* msg) { // Print an optional message, something like "Starting timer t"; if (msg) VERBOSE(0, msg << std::endl); // Return immediately if the timer is already running if (running) return; // Change timer status to running running = true; // Set the start time; time(&start_time); } /*** * Turn the timer off and start it again from 0. Print an optional message. */ /* inline void Timer::restart(const char* msg) { // Print an optional message, something like "Restarting timer t"; if (msg) VERBOSE(0, msg << std::endl; // Set the timer status to running running = true; // Set the accumulated time to 0 and the start time to now acc_time = 0; start_clock = clock(); start_time = time(0); } */ /*** * Stop the timer and print an optional message. */ /* inline void Timer::stop(const char* msg) { // Print an optional message, something like "Stopping timer t"; check(msg); // Recalculate and store the total accumulated time up until now if (running) acc_time += elapsed_time(); running = false; } */ /*** * Print out an optional message followed by the current timer timing. */ void Timer::check(const char* msg) { // Print an optional message, something like "Checking timer t"; if (msg) VERBOSE(0, msg << " : "); VERBOSE(0, "[" << (running ? elapsed_time() : 0) << "] seconds\n"); } /*** * Allow timers to be printed to ostreams using the syntax 'os << t' * for an ostream 'os' and a timer 't'. 
For example, "cout << t" will * print out the total amount of time 't' has been "running". */ std::ostream& operator<<(std::ostream& os, Timer& t) { //os << std::setprecision(2) << std::setiosflags(std::ios::fixed) << (t.running ? t.elapsed_time() : 0); os << (t.running ? t.elapsed_time() : 0); return os; } irstlm-6.00.05/src/timer.h000066400000000000000000000012141263213470300152530ustar00rootroot00000000000000#ifndef TIMER_H #define TIMER_H #include #include #include #include "util.h" class Timer { friend std::ostream& operator<<(std::ostream& os, Timer& t); private: bool running; time_t start_time; //TODO in seconds? double elapsed_time(); public: /*** * 'running' is initially false. A timer needs to be explicitly started * using 'start' or 'restart' */ Timer() : running(false), start_time(0) { } void start(const char* msg = 0); // void restart(const char* msg = 0); // void stop(const char* msg = 0); void check(const char* msg = 0); double get_elapsed_time(); }; #endif // TIMER_H irstlm-6.00.05/src/tlm.cpp000066400000000000000000000450511263213470300152710ustar00rootroot00000000000000 /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include "cmd.h" #include "mfstream.h" #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "mempool.h" #include "ngramtable.h" #include "interplm.h" #include "normcache.h" #include "ngramcache.h" #include "mdiadapt.h" #include "shiftlm.h" #include "linearlm.h" #include "mixture.h" #include "lmtable.h" /********************************/ using namespace std; using namespace irstlm; #define NGRAM 1 #define SEQUENCE 2 #define ADAPT 3 #define TURN 4 #define TEXT 5 static Enum_T LmTypeEnum [] = { { (char*)"ImprovedKneserNey", IMPROVED_KNESER_NEY }, { (char*)"ikn", IMPROVED_KNESER_NEY }, { (char*)"KneserNey", KNESER_NEY }, { (char*)"kn", KNESER_NEY }, { (char*)"ModifiedShiftBeta", MOD_SHIFT_BETA }, { (char*)"msb", MOD_SHIFT_BETA }, { (char*)"ImprovedShiftBeta", IMPROVED_SHIFT_BETA }, { (char*)"isb", IMPROVED_SHIFT_BETA }, { (char*)"InterpShiftBeta", SHIFT_BETA }, { (char*)"ShiftBeta", SHIFT_BETA }, { (char*)"sb", SHIFT_BETA }, { (char*)"InterpShiftOne", SHIFT_ONE }, { (char*)"ShiftOne", SHIFT_ONE }, { (char*)"s1", SHIFT_ONE }, { (char*)"LinearWittenBell", LINEAR_WB }, { (char*)"wb", LINEAR_WB }, { (char*)"StupidBackoff", LINEAR_STB }, { (char*)"stb", LINEAR_STB }, { (char*)"LinearGoodTuring", LINEAR_GT }, { (char*)"Mixture", MIXTURE }, { (char*)"mix", MIXTURE }, END_ENUM }; static Enum_T InteractiveModeEnum [] = { { (char*)"Ngram", NGRAM }, { (char*)"Sequence", SEQUENCE }, { (char*)"Adapt", ADAPT }, { (char*)"Turn", TURN }, { (char*)"Text", TEXT }, { (char*)"Yes", NGRAM }, END_ENUM }; void print_help(int TypeFlag=0){ std::cerr << std::endl << "tlm - estimates a language model" << std::endl; std::cerr << std::endl << 
"USAGE:" << std::endl; std::cerr << " not yet available" << std::endl; std::cerr << std::endl << "DESCRIPTION:" << std::endl; std::cerr << " tlm is a tool for the estimation of language model" << std::endl; std::cerr << std::endl << "OPTIONS:" << std::endl; std::cerr << " -Help|-h this help" << std::endl; std::cerr << std::endl; FullPrintParams(TypeFlag, 0, 1, stderr); } void usage(const char *msg = 0) { if (msg){ std::cerr << msg << std::endl; } else{ print_help(); } } int main(int argc, char **argv) { char *dictfile=NULL; char *trainfile=NULL; char *testfile=NULL; char *adaptfile=NULL; char *slminfo=NULL; char *imixpar=NULL; char *omixpar=NULL; char *BINfile=NULL; char *ARPAfile=NULL; bool SavePerLevel=true; //save-per-level or save-for-word char *ASRfile=NULL; char* scalefactorfile=NULL; bool backoff=false; //back-off or interpolation int lmtype=0; int dub=IRSTLM_DUB_DEFAULT; //dictionary upper bound int size=0; //lm size int interactive=0; int statistics=0; int prunefreq=0; bool prunesingletons=true; bool prunetopsingletons=false; char *prune_thr_str=NULL; double beta=-1; bool compsize=false; bool checkpr=false; double oovrate=0; int max_caching_level=0; char *outpr=NULL; bool memmap = false; //write binary format with/without memory map, default is 0 int adaptlevel=0; //adaptation level double adaptrate=1.0; bool adaptoov=false; //do not increment the dictionary bool help=false; DeclareParams((char*) "Back-off",CMDBOOLTYPE|CMDMSG, &backoff, "boolean flag for backoff LM (default is false, i.e. interpolated LM)", "bo",CMDBOOLTYPE|CMDMSG, &backoff, "boolean flag for backoff LM (default is false, i.e. 
interpolated LM)", "Dictionary", CMDSTRINGTYPE|CMDMSG, &dictfile, "dictionary to filter the LM (default is NULL)", "d", CMDSTRINGTYPE|CMDMSG, &dictfile, "dictionary to filter the LM (default is NULL)", "DictionaryUpperBound", CMDINTTYPE|CMDMSG, &dub, "dictionary upperbound to compute OOV word penalty: default 10^7", "dub", CMDINTTYPE|CMDMSG, &dub, "dictionary upperbound to compute OOV word penalty: default 10^7", "NgramSize", CMDSUBRANGETYPE|CMDMSG, &size, 1, MAX_NGRAM, "order of the LM", "n", CMDSUBRANGETYPE|CMDMSG, &size, 1, MAX_NGRAM, "order of the LM", "Ngram", CMDSTRINGTYPE|CMDMSG, &trainfile, "training file", "TrainOn", CMDSTRINGTYPE|CMDMSG, &trainfile, "training file", "tr", CMDSTRINGTYPE|CMDMSG, &trainfile, "training file", "oASR", CMDSTRINGTYPE|CMDMSG, &ASRfile, "output file in ASR format", "oasr", CMDSTRINGTYPE|CMDMSG, &ASRfile, "output file in ASR format", "o", CMDSTRINGTYPE|CMDMSG, &ARPAfile, "output file in ARPA format", "oARPA", CMDSTRINGTYPE|CMDMSG, &ARPAfile, "output file in ARPA format", "oarpa", CMDSTRINGTYPE|CMDMSG, &ARPAfile, "output file in ARPA format", "oBIN", CMDSTRINGTYPE|CMDMSG, &BINfile, "output file in binary format", "obin", CMDSTRINGTYPE|CMDMSG, &BINfile, "output file in binary format", "SavePerLevel",CMDBOOLTYPE|CMDMSG, &SavePerLevel, "saving type of the LM (true: per level (default), false: per word)", "spl",CMDBOOLTYPE|CMDMSG, &SavePerLevel, "saving type of the LM (true: per level (default), false: per word)", "TestOn", CMDSTRINGTYPE|CMDMSG, &testfile, "file for testing", "te", CMDSTRINGTYPE|CMDMSG, &testfile, "file for testing", "AdaptOn", CMDSTRINGTYPE|CMDMSG, &adaptfile, "file for adaptation", "ad", CMDSTRINGTYPE|CMDMSG, &adaptfile, "file for adaptation", "AdaptRate",CMDDOUBLETYPE|CMDMSG , &adaptrate, "adaptation rate", "ar", CMDDOUBLETYPE|CMDMSG, &adaptrate, "adaptation rate", "AdaptLevel", CMDSUBRANGETYPE|CMDMSG, &adaptlevel, 1 , MAX_NGRAM, "adaptation level", "al",CMDSUBRANGETYPE|CMDMSG, &adaptlevel, 1, MAX_NGRAM, "adaptation 
level", "AdaptOOV", CMDBOOLTYPE|CMDMSG, &adaptoov, "boolean flag for increasing the dictionary during adaptation (default is false)", "ao", CMDBOOLTYPE|CMDMSG, &adaptoov, "boolean flag for increasing the dictionary during adaptation (default is false)", "SaveScaleFactor", CMDSTRINGTYPE|CMDMSG, &scalefactorfile, "output file for the scale factors", "ssf", CMDSTRINGTYPE|CMDMSG, &scalefactorfile, "output file for the scale factors", "LanguageModelType",CMDENUMTYPE|CMDMSG, &lmtype, LmTypeEnum, "type of the LM", "lm",CMDENUMTYPE|CMDMSG, &lmtype, LmTypeEnum, "type of the LM", "Interactive",CMDENUMTYPE|CMDMSG, &interactive, InteractiveModeEnum, "type of interaction", "i",CMDENUMTYPE|CMDMSG, &interactive, InteractiveModeEnum, "type of interaction", "Statistics",CMDSUBRANGETYPE|CMDMSG, &statistics, 1, 3, "output statistics of the LM of increasing detail (default is 0)", "s",CMDSUBRANGETYPE|CMDMSG, &statistics, 1, 3, "output statistics of the LM of increasing detail (default is 0)", "PruneThresh",CMDSUBRANGETYPE|CMDMSG, &prunefreq, 0, 1000, "threshold for pruning (default is 0)", "p",CMDSUBRANGETYPE|CMDMSG, &prunefreq, 0, 1000, "threshold for pruning (default is 0)", "PruneSingletons",CMDBOOLTYPE|CMDMSG, &prunesingletons, "boolean flag for pruning of singletons (default is true)", "ps",CMDBOOLTYPE|CMDMSG, &prunesingletons, "boolean flag for pruning of singletons (default is true)", "PruneTopSingletons",CMDBOOLTYPE|CMDMSG, &prunetopsingletons, "boolean flag for pruning of singletons at the top level (default is false)", "pts",CMDBOOLTYPE|CMDMSG, &prunetopsingletons, "boolean flag for pruning of singletons at the top level (default is false)", "PruneFrequencyThreshold",CMDSTRINGTYPE|CMDMSG, &prune_thr_str, "pruning frequency threshold for each level; comma-separated list of values; (default is \"0,0,...,0\", for all levels)", "pft",CMDSTRINGTYPE|CMDMSG, &prune_thr_str, "pruning frequency threshold for each level; comma-separated list of values; (default is \"0,0,...,0\", for 
all levels)", "ComputeLMSize",CMDBOOLTYPE|CMDMSG, &compsize, "boolean flag for output the LM size (default is false)", "sz",CMDBOOLTYPE|CMDMSG, &compsize, "boolean flag for output the LM size (default is false)", "MaximumCachingLevel", CMDINTTYPE|CMDMSG , &max_caching_level, "maximum level for caches (default is: LM order - 1)", "mcl", CMDINTTYPE|CMDMSG, &max_caching_level, "maximum level for caches (default is: LM order - 1)", "MemoryMap", CMDBOOLTYPE|CMDMSG, &memmap, "use memory mapping for bianry saving (default is false)", "memmap", CMDBOOLTYPE|CMDMSG, &memmap, "use memory mapping for bianry saving (default is false)", "mm", CMDBOOLTYPE|CMDMSG, &memmap, "use memory mapping for bianry saving (default is false)", "CheckProb",CMDBOOLTYPE|CMDMSG, &checkpr, "boolean flag for checking probability distribution during test (default is false)", "cp",CMDBOOLTYPE|CMDMSG, &checkpr, "boolean flag for checking probability distribution during test (default is false)", "OutProb",CMDSTRINGTYPE|CMDMSG, &outpr, "output file for debugging during test (default is \"/dev/null\")", "op",CMDSTRINGTYPE|CMDMSG, &outpr, "output file for debugging during test (default is \"/dev/null\")", "SubLMInfo", CMDSTRINGTYPE|CMDMSG, &slminfo, "configuration file for the mixture LM", "slmi", CMDSTRINGTYPE|CMDMSG, &slminfo, "configuration file for the mixture LM", "SaveMixParam", CMDSTRINGTYPE|CMDMSG, &omixpar, "output file for weights of the mixture LM", "smp", CMDSTRINGTYPE|CMDMSG, &omixpar, "output file for weights of the mixture LM", "LoadMixParam", CMDSTRINGTYPE|CMDMSG, &imixpar, "input file for weights of the mixture LM", "lmp", CMDSTRINGTYPE|CMDMSG, &imixpar, "input file for weights of the mixture LM", "SetOovRate", CMDDOUBLETYPE|CMDMSG, &oovrate, "rate for computing the OOV frequency (=oovrate*totfreq if oovrate>0) (default is 0)", "or", CMDDOUBLETYPE|CMDMSG, &oovrate, "rate for computing the OOV frequency (=oovrate*totfreq if oovrate>0) (default is 0)", "Beta", CMDDOUBLETYPE|CMDMSG, &beta, 
"beta value for Shift-Beta and Kneser-Ney LMs (default is -1, i.e. automatic estimation)", "beta", CMDDOUBLETYPE|CMDMSG, &beta, "beta value for Shift-Beta and Kneser-Ney LMs (default is -1, i.e. automatic estimation)", "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help", "h", CMDBOOLTYPE|CMDMSG, &help, "print this help", (char *)NULL ); if (argc == 1){ usage(); exit_error(IRSTLM_NO_ERROR); } GetParams(&argc, &argv, (char*) NULL); if (help){ usage(); exit_error(IRSTLM_NO_ERROR); } if (!lmtype) { exit_error(IRSTLM_ERROR_DATA,"The lm type (-lm) is not specified"); } if (!trainfile && lmtype!=MIXTURE) { exit_error(IRSTLM_ERROR_DATA,"The LM file (-tr) is not specified"); } if (SavePerLevel == false && backoff == true){ cerr << "WARNING: Current implementation does not support the usage of backoff (-bo=true) mixture models (-lm=mix) combined with the per-word saving (-saveperllevel=false)." << endl; cerr << "WARNING: The usage of backoff is disabled, i.e. -bo=no is forced" << endl; backoff=false; } mdiadaptlm *lm=NULL; switch (lmtype) { case SHIFT_BETA: if (beta==-1 || (beta<1.0 && beta>0)){ lm=new shiftbeta(trainfile,size,prunefreq,beta,(backoff?SHIFTBETA_B:SHIFTBETA_I)); } else { exit_error(IRSTLM_ERROR_DATA,"ShiftBeta: beta must be >0 and <1"); } break; case KNESER_NEY: if (size>1){ if (beta==-1 || (beta<1.0 && beta>0)){ // lm=new kneserney(trainfile,size,prunefreq,beta,(backoff?KNESERNEY_B:KNESERNEY_I)); } else { exit_error(IRSTLM_ERROR_DATA,"Kneser-Ney: beta must be >0 and <1"); } } else { exit_error(IRSTLM_ERROR_DATA,"Kneser-Ney requires size >1"); } break; case MOD_SHIFT_BETA: cerr << "ModifiedShiftBeta (msb) is the old name for ImprovedKneserNey (ikn); this name is not supported anymore, but it is mapped into ImprovedKneserNey for back-compatibility"; case IMPROVED_KNESER_NEY: if (size>1){ lm=new improvedkneserney(trainfile,size,prunefreq,(backoff?IMPROVEDKNESERNEY_B:IMPROVEDKNESERNEY_I)); } else { exit_error(IRSTLM_ERROR_DATA,"Improved Kneser-Ney requires size 
>1"); } break; case IMPROVED_SHIFT_BETA: lm=new improvedshiftbeta(trainfile,size,prunefreq,(backoff?IMPROVEDSHIFTBETA_B:IMPROVEDSHIFTBETA_I)); break; case SHIFT_ONE: lm=new shiftone(trainfile,size,prunefreq,(backoff?SIMPLE_B:SIMPLE_I)); break; case LINEAR_STB: lm=new linearstb(trainfile,size,prunefreq,IMPROVEDSHIFTBETA_B); break; case LINEAR_WB: lm=new linearwb(trainfile,size,prunefreq,(backoff?IMPROVEDSHIFTBETA_B:IMPROVEDSHIFTBETA_I)); break; case LINEAR_GT: cerr << "This LM is no more supported\n"; break; case MIXTURE: //temporary check: so far unable to proper handle this flag in sub LMs //no ngramtable is created lm=new mixture(SavePerLevel,slminfo,size,prunefreq,imixpar,omixpar); break; default: cerr << "not implemented yet\n"; return 1; }; if (dub < lm->dict->size()){ cerr << "dub (" << dub << ") is not set or too small. dub is re-set to the dictionary size (" << lm->dict->size() << ")" << endl; dub = lm->dict->size(); } lm->dub(dub); lm->create_caches(max_caching_level); cerr << "eventually generate OOV code\n"; lm->dict->genoovcode(); if (oovrate) lm->dict->setoovrate(oovrate); lm->save_per_level(SavePerLevel); lm->train(); //it never occurs that both prunetopsingletons and prunesingletons are true if (prunetopsingletons==true) { //keep most specific lm->prunetopsingletons(true); lm->prunesingletons(false); } else { lm->prunetopsingletons(false); if (prunesingletons==true) { lm->prunesingletons(true); } else { lm->prunesingletons(false); } } if (prune_thr_str) lm->set_prune_ngram(prune_thr_str); if (adaptoov) lm->dict->incflag(1); if (adaptfile) lm->adapt(adaptfile,adaptlevel,adaptrate); if (adaptoov) lm->dict->incflag(0); if (scalefactorfile) lm->savescalefactor(scalefactorfile); if (backoff) lm->compute_backoff(); if (size>lm->maxlevel()) { exit_error(IRSTLM_ERROR_DATA,"lm size is too large"); } if (!size) size=lm->maxlevel(); if (testfile) { cerr << "TLM: test ..."; lm->test(testfile,size,backoff,checkpr,outpr); if (adaptfile) ((mdiadaptlm 
*)lm)->get_zetacache()->stat(); cerr << "\n"; }; if (compsize) cout << "LM size " << (int)lm->netsize() << "\n"; if (interactive) { ngram ng(lm->dict); int nsize=0; cout.setf(ios::scientific); switch (interactive) { case NGRAM: cout << "> "; while(cin >> ng) { if (ng.wordp(size)) { cout << ng << " p=" << (double)log(lm->prob(ng,size)) << "\n"; ng.size=0; cout << "> "; } } break; case SEQUENCE: { char c; double p=0; cout << "> "; while(cin >> ng) { nsize=ng.sizeprob(ng,nsize)); cout << ng << " p=" << p << "\n"; while((c=cin.get())==' ') { cout << c; } cin.putback(c); //cout << "-" << c << "-"; if (c=='\n') { ng.size=0; cout << "> "; p=0; } } } break; case TURN: { int n=0; double lp=0; double oov=0; while(cin >> ng) { if (ng.size>0) { nsize=ng.sizeprob(ng,nsize)); n++; if (*ng.wordp(1) == lm->dict->oovcode()) oov++; } else { if (n>0) cout << n << " " << lp/(log(2.0) * n) << " " << oov/n << "\n"; n=0; lp=0; oov=0; } } break; } case TEXT: { int order; int n=0; double lp=0; double oov=0; while (!cin.eof()) { cin >> order; if (order>size) cerr << "Warning: order > lm size\n"; order=order>size?size:order; while (cin >> ng) { if (ng.size>0) { nsize=ng.sizeprob(ng,nsize)); n++; if (*ng.wordp(1) == lm->dict->oovcode()) oov++; } else { if (n>0) cout << n << " " << lp/(log(2.0)*n) << " " << oov/n << "\n"; n=0; lp=0; oov=0; if (ng.isym>0) break; } } } } break; case ADAPT: { if (backoff) { exit_error(IRSTLM_ERROR_DATA,"This modality is not supported with backoff LMs"); } char afile[50],tfile[50]; while (!cin.eof()) { cin >> afile >> tfile; system("echo > .tlmlock"); cerr << "interactive adaptation: " << afile << " " << tfile << "\n"; if (adaptoov) lm->dict->incflag(1); lm->adapt(afile,adaptlevel,adaptrate); if (adaptoov) lm->dict->incflag(0); if (scalefactorfile) lm->savescalefactor(scalefactorfile); if (ASRfile) lm->saveASR(ASRfile,backoff,dictfile); if (ARPAfile) lm->saveARPA(ARPAfile,backoff,dictfile); if (BINfile) lm->saveBIN(BINfile,backoff,dictfile,memmap); 
lm->test(tfile,size,checkpr); cout.flush(); system("rm .tlmlock"); } } break; } exit_error(IRSTLM_NO_ERROR); } if (ASRfile) { cerr << "TLM: save lm (ASR)..."; lm->saveASR(ASRfile,backoff,dictfile); cerr << "\n"; } if (ARPAfile) { cerr << "TLM: save lm (ARPA)..."; lm->saveARPA(ARPAfile,backoff,dictfile); cerr << "\n"; } if (BINfile) { cerr << "TLM: save lm (binary)..."; lm->saveBIN(BINfile,backoff,dictfile,memmap); cerr << "\n"; } if (statistics) { cerr << "TLM: lm stat ..."; lm->lmstat(statistics); cerr << "\n"; } // lm->cache_stat(); cerr << "TLM: deleting lm ..."; delete lm; cerr << "\n"; exit_error(IRSTLM_NO_ERROR); } irstlm-6.00.05/src/util.cpp000066400000000000000000000174241263213470300154550ustar00rootroot00000000000000// $Id: util.cpp 363 2010-02-22 15:02:45Z mfederico $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #ifdef WIN32 #include #include #include #else #include #include #include #include #include #include #endif #include "gzfilebuf.h" #include "timer.h" #include "util.h" #include "n_gram.h" #include "mfstream.h" using namespace std; string gettempfolder() { #ifdef _WIN32 char *tmpPath = getenv("TMP"); string str(tmpPath); if (str.substr(str.size() - 1, 1) != "\\") str += "\\"; return str; #else char *tmpPath = getenv("TMP"); if (!tmpPath || !*tmpPath) return "/tmp/"; string str(tmpPath); if (str.substr(str.size() - 1, 1) != "/") str += "/"; return str; #endif } string createtempName() { string tmpfolder = gettempfolder(); #ifdef _WIN32 char buffer[BUFSIZ]; //To check whether the following function open the stream as well //In this case it is mandatory to close it immediately ::GetTempFileNameA(tmpfolder.c_str(), "", 0, buffer); #else char buffer[tmpfolder.size() + 16]; strcpy(buffer, tmpfolder.c_str()); strcat(buffer, "dskbuff--XXXXXX"); int fd=mkstemp(buffer); close(fd); #endif return (string) buffer; } void createtempfile(mfstream &fileStream, string &filePath, std::ios_base::openmode flags) { filePath = createtempName(); fileStream.open(filePath.c_str(), flags); } void removefile(const std::string &filePath) { #ifdef _WIN32 ::DeleteFileA(filePath.c_str()); #else if (remove(filePath.c_str()) != 0) { perror("Error deleting file" ); exit_error(IRSTLM_ERROR_IO); } #endif } /* MemoryMap Management Code kindly provided by Fabio Brugnara, ITC-irst Trento. 
How to use it: - call MMap with offset and required size (psgz): pg->b = MMap(fd, rdwr,offset,pgsz,&g); - correct returned pointer with the alignment gap and save the gap: pg->b += pg->gap = g; - when releasing mapped memory, subtract the gap from the pointer and add the gap to the requested dimension Munmap(pg->b-pg->gap, pgsz+pg->gap, 0); */ void *MMap(int fd, int access, off_t offset, size_t len, off_t *gap) { void *p=NULL; #ifdef _WIN32 /* int g=0; // code for windows must be checked HANDLE fh, mh; fh = (HANDLE)_get_osfhandle(fd); if(offset) { // bisogna accertarsi che l'offset abbia la granularita` //corretta, MAI PROVATA! SYSTEM_INFO si; GetSystemInfo(&si); g = *gap = offset % si.dwPageSize; } else if(gap) { *gap=0; } if(!(mh=CreateFileMapping(fh, NULL, PAGE_READWRITE, 0, len+g, NULL))) { return 0; } p = (char*)MapViewOfFile(mh, FILE_MAP_ALL_ACCESS, 0, offset-*gap, len+*gap); CloseHandle(mh); */ #else int pgsz,g=0; if(offset) { pgsz = sysconf(_SC_PAGESIZE); g = *gap = offset%pgsz; } else if(gap) { *gap=0; } p = mmap((void*)0, len+g, access, MAP_SHARED|MAP_FILE, fd, offset-g); if((long)p==-1L) { perror("mmap failed"); p=0; } #endif return p; } int Munmap(void *p,size_t len,int sync) { int r=0; #ifdef _WIN32 /* //code for windows must be checked if(sync) FlushViewOfFile(p, len); UnmapViewOfFile(p); */ #else cerr << "len = " << len << endl; cerr << "sync = " << sync << endl; cerr << "running msync..." << endl; if(sync) msync(p, len, MS_SYNC); cerr << "done. Running munmap..." 
<< endl; if((r=munmap((void*)p, len))) { perror("munmap() failed"); } cerr << "done" << endl; #endif return r; } //global variable Timer g_timer; void ResetUserTime() { g_timer.start(); }; void PrintUserTime(const std::string &message) { g_timer.check(message.c_str()); } double GetUserTime() { return g_timer.get_elapsed_time(); } void ShowProgress(long long current, long long target){ int frac=(current * 1000)/target; if (!(frac % 10)) fprintf(stderr,"%02d\b\b",frac/10); } int parseWords(char *sentence, const char **words, int max) { char *word; int i = 0; const char *const wordSeparators = " \t\r\n"; for (word = strtok(sentence, wordSeparators); i < max && word != 0; i++, word = strtok(0, wordSeparators)) { words[i] = word; } if (i < max) { words[i] = 0; } return i; } //Load a LM as a text file. LM could have been generated either with the //IRST LM toolkit or with the SRILM Toolkit. In the latter we are not //sure that n-grams are lexically ordered (according to the 1-grams). //However, we make the following assumption: //"all successors of any prefix are sorted and written in contiguous lines!" 
//This method also loads files processed with the quantization //tool: qlm int parseline(istream& inp, int Order,ngram& ng,float& prob,float& bow) { const char* words[1+ LMTMAXLEV + 1 + 1]; int howmany; char line[MAX_LINE]; inp.getline(line,MAX_LINE); if (strlen(line)==MAX_LINE-1) { std::stringstream ss_msg; ss_msg << "parseline: input line exceed MAXLINE (" << MAX_LINE << ") chars " << line << "\n"; exit_error(IRSTLM_ERROR_DATA, ss_msg.str()); } howmany = parseWords(line, words, Order + 3); if (!(howmany == (Order+ 1) || howmany == (Order + 2))){ MY_ASSERT(howmany == (Order+ 1) || howmany == (Order + 2)); } //read words ng.size=0; for (int i=1; i<=Order; i++) ng.pushw(strcmp(words[i],"")?words[i]:ng.dict->OOV()); //read logprob/code and logbow/code MY_ASSERT(sscanf(words[0],"%f",&prob)); if (howmany==(Order+2)){ MY_ASSERT(sscanf(words[Order+1],"%f",&bow)); }else{ bow=0.0; //this is log10prob=0 for implicit backoff } return 1; } void exit_error(int err, const std::string &msg){ if (msg != "") { VERBOSE(0,msg+"\n";); } else{ switch(err){ case IRSTLM_NO_ERROR: VERBOSE(0,"Everything OK\n"); break; case IRSTLM_ERROR_GENERIC: VERBOSE(0,"Generic error\n"); break; case IRSTLM_ERROR_IO: VERBOSE(0,"Input/Output error\n"); break; case IRSTLM_ERROR_MEMORY: VERBOSE(0,"Allocation memory error\n"); break; case IRSTLM_ERROR_DATA: VERBOSE(0,"Data format error\n"); break; case IRSTLM_ERROR_MODEL: VERBOSE(0,"Model computation error\n"); break; default: VERBOSE(0,"Undefined error\n"); break; } } exit(err); }; /* #ifdef MY_ASSERT_FLAG #if MY_ASSERT_FLAG>0 #undef MY_ASSERT(x) #define MY_ASSERT(x) do { assert(x); } while (0) #else #define MY_ASSERT(x) { UNUSED(x); } #endif #else #define MY_ASSERT(x) { UNUSED(x); } #endif */ /** assert macros e functions**/ #ifdef MY_ASSERT_FLAG #if MY_ASSERT_FLAG==0 #undef MY_ASSERT_FLAG #endif #endif #ifdef MY_ASSERT_FLAG void MY_ASSERT(bool x) { assert(x); } #else void MY_ASSERT(bool x) { UNUSED(x); } #endif /** trace macros and functions**/ /** 
verbose macros and functions**/ #ifdef TRACE_LEVEL //int tracelevel=TRACE_LEVEL; const int tracelevel=TRACE_LEVEL; #else //int tracelevel=0; const int tracelevel=0; #endif namespace irstlm { void* reallocf(void *ptr, size_t size){ void *p=realloc(ptr,size); if (p) { return p; } else { free(ptr); return NULL; } } } irstlm-6.00.05/src/util.h000066400000000000000000000053571263213470300151240ustar00rootroot00000000000000// $Id: util.h 363 2010-02-22 15:02:45Z mfederico $ #ifndef IRSTLM_UTIL_H #define IRSTLM_UTIL_H #include #include #include #include #include using namespace std; #define MAX(a,b) (((a)>(b))?(a):(b)) #define MIN(a,b) (((a)<(b))?(a):(b)) //random values between -1 and +1 #define MY_RAND (((float)random()/RAND_MAX)* 2.0 - 1.0) #define UNUSED(x) { (void) x; } #define LMTMAXLEV 20 #define MAX_LINE 100000 #define IRSTLM_DUB_DEFAULT 10000000 #define IRSTLM_REQUIREDMAXLEV_DEFAULT 1000 //0.000001 = 10^(-6) //0.000000000001 = 10^(-12) //1.000001 = 1+10^(-6) //1.000000000001 = 1+10^(-12) //0.999999 = 1-10^(-6) //0.999999999999 = 1-10^(-12) #define LOWER_SINGLE_PRECISION_OF_0 -0.000001 #define UPPER_SINGLE_PRECISION_OF_0 0.000001 #define LOWER_DOUBLE_PRECISION_OF_0 -0.000000000001 #define UPPER_DOUBLE_PRECISION_OF_0 0.000000000001 #define UPPER_SINGLE_PRECISION_OF_1 1.000001 #define LOWER_SINGLE_PRECISION_OF_1 0.999999 #define UPPER_DOUBLE_PRECISION_OF_1 1.000000000001 #define LOWER_DOUBLE_PRECISION_OF_1 0.999999999999 #define IRSTLM_NO_ERROR 0 #define IRSTLM_ERROR_GENERIC 1 #define IRSTLM_ERROR_IO 2 #define IRSTLM_ERROR_MEMORY 3 #define IRSTLM_ERROR_DATA 4 #define IRSTLM_ERROR_MODEL 5 #define BUCKET 10000 #define SSEED 50 typedef std::map< std::string, float > topic_map_t; class ngram; typedef unsigned int ngram_state_t; //type for pointing to a full ngram in the table class mfstream; std::string gettempfolder(); std::string createtempName(); void createtempfile(mfstream &fileStream, std::string &filePath, std::ios_base::openmode flags); void removefile(const 
std::string &filePath); void *MMap(int fd, int access, off_t offset, size_t len, off_t *gap); int Munmap(void *p,size_t len,int sync); // A couple of utilities to measure access time void ResetUserTime(); void PrintUserTime(const std::string &message); double GetUserTime(); void ShowProgress(long long current,long long total); int parseWords(char *, const char **, int); int parseline(istream& inp, int Order,ngram& ng,float& prob,float& bow); void exit_error(int err, const std::string &msg=""); namespace irstlm{ void* reallocf(void *ptr, size_t size); } //extern int tracelevel; extern const int tracelevel; #define TRACE_ERR(str) { std::cerr << str; } #define VERBOSE(level,str) { if (tracelevel > level) { TRACE_ERR("DEBUG_LEVEL:" << level << "/" << tracelevel << " "); TRACE_ERR(str); } } #define IFVERBOSE(level) if (tracelevel > level) /* #define _DEBUG_LEVEL TRACE_LEVEL #define TRACE_ERR(str) { std::cerr << str; } #define VERBOSE(level,str) { if (_DEBUG_LEVEL > level) { TRACE_ERR("DEBUG_LEVEL:" <<_DEBUG_LEVEL << " "); TRACE_ERR(str); } } #define IFVERBOSE(level) if (_DEBUG_LEVEL > level) */ void MY_ASSERT(bool x); #endif irstlm-6.00.05/src/verify-caching.cpp000066400000000000000000000047631263213470300174000ustar00rootroot00000000000000// $Id: verify_caching.cpp 3677 2010-10-13 09:06:51Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit, compile LM Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include "cmd.h" #include "util.h" #include "mdiadapt.h" #include "lmContainer.h" /********************************/ using namespace std; using namespace irstlm; void print_help(int TypeFlag=0){ std::cerr << std::endl << "verify_caching - verify whether caching is enabled or disabled" << std::endl; std::cerr << std::endl << "USAGE:" << std::endl; std::cerr << " verify_caching" << std::endl; std::cerr << std::endl << "DESCRIPTION:" << std::endl; std::cerr << std::endl << "OPTIONS:" << std::endl; FullPrintParams(TypeFlag, 0, 1, stderr); } void usage(const char *msg = 0) { if (msg) { std::cerr << msg << std::endl; } if (!msg){ print_help(); } } int main(int argc, char **argv) { bool help=false; DeclareParams((char*) "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help", "h", CMDBOOLTYPE|CMDMSG, &help, "print this help", (char *)NULL ); if (argc > 1){ usage(); exit_error(IRSTLM_NO_ERROR); } GetParams(&argc, &argv, (char*) NULL); if (help){ usage(); exit_error(IRSTLM_NO_ERROR); } if (lmContainer::is_cache_enabled()){ std::cout << " caching is ENABLED" << std::endl; }else{ std::cout << " caching is DISABLED" << std::endl; } if (mdiadaptlm::is_train_cache_enabled()){ std::cout << " train-caching is ENABLED" << std::endl; }else{ std::cout << " train-caching is DISABLED" << std::endl; } }