irstlm-5.80.03/000755 000766 000024 00000000000 12137531466 015347 5ustar00nicolabertoldistaff000000 000000 irstlm-5.80.03/config.h.in000644 000766 000024 00000003135 12042554746 017375 0ustar00nicolabertoldistaff000000 000000 /* config.h.in. Generated from configure.in by autoheader. */ /* Define to 1 if you have the header file. */ #undef HAVE_DLFCN_H /* Define to 1 if you have the header file. */ #undef HAVE_GETOPT_H /* Define to 1 if you have the header file. */ #undef HAVE_INTTYPES_H /* Define to 1 if you have the header file. */ #undef HAVE_MEMORY_H /* Define to 1 if you have the header file. */ #undef HAVE_STDINT_H /* Define to 1 if you have the header file. */ #undef HAVE_STDLIB_H /* Define to 1 if you have the header file. */ #undef HAVE_STRINGS_H /* Define to 1 if you have the header file. */ #undef HAVE_STRING_H /* Define to 1 if you have the header file. */ #undef HAVE_SYS_STAT_H /* Define to 1 if you have the header file. */ #undef HAVE_SYS_TYPES_H /* Define to 1 if you have the header file. */ #undef HAVE_UNISTD_H /* Define to the sub-directory in which libtool stores uninstalled libraries. */ #undef LT_OBJDIR /* Name of package */ #undef PACKAGE /* Define to the address where bug reports for this package should be sent. */ #undef PACKAGE_BUGREPORT /* Define to the full name of this package. */ #undef PACKAGE_NAME /* Define to the full name and version of this package. */ #undef PACKAGE_STRING /* Define to the one symbol short name of this package. */ #undef PACKAGE_TARNAME /* Define to the version of this package. */ #undef PACKAGE_VERSION /* Define to 1 if you have the ANSI C header files. 
*/ #undef STDC_HEADERS /* Version number of package */ #undef VERSION irstlm-5.80.03/configure.in000644 000766 000024 00000010002 12116035653 017643 0ustar00nicolabertoldistaff000000 000000 AC_INIT(src) CFLAGS="$CFLAGS" CPPFLAGS="$CPPFLAGS" CXXFLAGS="$CXXFLAGS" AC_PREFIX_DEFAULT(/usr/local/irstlm) AM_CONFIG_HEADER(config.h) AM_INIT_AUTOMAKE(irstlm, 5.70.03) WEBSITE="http://hlt.fbk.eu/en/irstlm" AC_CONFIG_MACRO_DIR([m4]) AC_PROG_CXX AC_PROG_CXXCPP AC_LANG_CPLUSPLUS AC_DISABLE_SHARED AC_PROG_LIBTOOL # Shared library are disabled for default #LT_INIT([disable-shared]) AC_CHECK_TOOL(PDFLATEX,pdflatex,"no") AC_CHECK_TOOL(BIBTEX,bibtex,"no") AC_ARG_ENABLE([trace], AS_HELP_STRING([--enable-trace|--disable-trace], [Enable or Disable (default) trace])) AC_ARG_ENABLE([debugging], AS_HELP_STRING([--enable-debugging|--disable-debugging], [Enable or Disable (default) compilation with debugging "-g -O2" info])) AC_ARG_ENABLE(profiling, [AC_HELP_STRING([--enable-profiling|--disable-profiling], [Enable or Disable (default) profiling info])]) AC_ARG_ENABLE(traincaching, [AC_HELP_STRING([--enable-traincaching|--disable-traincaching], [Enable or Disable (default) the usage of caching in training])]) AC_ARG_ENABLE(caching, [AC_HELP_STRING([--enable-caching|--disable-caching], [Enable or Disable (default) the usage prob caches to store probs and other info])]) AC_ARG_ENABLE(interpsearch, [AC_HELP_STRING([--enable-interpsearch|--disable-interpsearch], [Enable or Disable (default) the use interpolated search to retrieve n-grams])]) AC_ARG_ENABLE(optimization, [AC_HELP_STRING([--enable-optimization|--disable-optimization], [Enable or Disable (default) compilation with optimization -O3 flag])]) AC_ARG_WITH(zlib, [AC_HELP_STRING([--with-zlib=PATH], [(optional) path to zlib])], [with_zlib=$withval], [with_zlib=no] ) AM_CONDITIONAL([am__fastdepCC], false) AM_CONDITIONAL([WITH_THREADS],false) #### Use this if you want that the default is yes #### if test "x$enable_foo" != 'xno' #### Use this 
if you want that the default is no #### if test "x$enable_foo" = 'xyes' if test "x$enable_trace" = 'xyes' then AC_MSG_NOTICE([trace enabled]) CPPFLAGS="$CPPFLAGS -DTRACE_ENABLE=1" else AC_MSG_NOTICE([trace disabled (default), most regression tests will fail]) fi if test "x$enable_debugging" = 'xyes' then AC_MSG_NOTICE([generation of debugging symbols enabled, compilation with "-g -O2"]) CPPFLAGS="$CPPFLAGS -g -O2" else AC_MSG_NOTICE([generation of debugging symbols disabled (default), compilation without "-g", only "-O2"]) fi if test "x$enable_profiling" = 'xyes' then AC_MSG_NOTICE([profiling enabled]) CPPFLAGS="$CPPFLAGS -pg" LDFLAGS="$LDFLAGS -pg" else AC_MSG_NOTICE([profiling disabled (default)]) fi if test "x$enable_traincaching" = 'xyes' then AC_MSG_NOTICE([train-caching enabled]) CPPFLAGS="$CPPFLAGS -DMDIADAPTLM_CACHE_ENABLE=1"; LDFLAGS="$LDFLAGS" else AC_MSG_NOTICE([train-caching disabled (default)]) fi if test "x$enable_caching" = 'xyes' then AC_MSG_NOTICE([caching enabled]) CPPFLAGS="$CPPFLAGS -DPS_CACHE_ENABLE=1 -DLMT_CACHE_ENABLE=1"; LDFLAGS="$LDFLAGS" else AC_MSG_NOTICE([caching disabled (default)]) fi if test "x$enable_interpsearch" = 'xyes' then AC_MSG_NOTICE([interpolated search enabled]) CPPFLAGS="$CPPFLAGS -DINTERP_SEARCH=1"; LDFLAGS="$LDFLAGS" else AC_MSG_NOTICE([interpolated search disabled (default)]) fi if test "x$enable_optimization" = 'xyes' then AC_MSG_NOTICE([optimization enabled]) CPPFLAGS="$CPPFLAGS -O3"; LDFLAGS="$LDFLAGS -O3" else AC_MSG_NOTICE([optimization disabled (default)]) fi AC_MSG_NOTICE([Building non-threaded irstlm.]) AC_CHECK_HEADERS([getopt.h], [AM_CONDITIONAL([WITH_MERT],true)], [AC_MSG_WARN([Cannot find getopt.h - disabling new mert])]) if test "x$with_zlib" != 'xno' then CPPFLAGS="$CPPFLAGS -I${with_zlib}/include" LDFLAGS="$LDFLAGS -L${with_zlib}/lib" fi LIBS="$LIBS -lz" AC_CONFIG_FILES(Makefile src/Makefile scripts/Makefile) AC_SUBST(transform,'s/_lm/-lm/') AC_OUTPUT() AC_MSG_NOTICE([The software will be installed into 
$prefix]) irstlm-5.80.03/Copyright000644 000766 000024 00000001772 11552037751 017247 0ustar00nicolabertoldistaff000000 000000 // $Id: Copyright 3686 2010-10-15 11:55:32Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ irstlm-5.80.03/Makefile.am000644 000766 000024 00000000330 12035745550 017375 0ustar00nicolabertoldistaff000000 000000 # not a GNU package. 
You can remove this line, if # have all needed files, that a GNU package needs AUTOMAKE_OPTIONS = foreign SUBDIRS = src scripts EXTRA_DIST = README RELEASE Copyright ACLOCAL_AMFLAGS = -I m4 irstlm-5.80.03/README000644 000766 000024 00000002066 11637144404 016227 0ustar00nicolabertoldistaff000000 000000 IRSTLM Toolkit CONTENT: - src: source code - scripts: supporting scripts - doc: documentation (in Latex) and in pdf (to be generated) - bin: binaries (to be generated) and scripts - lib: libraries (to be generated) - readme: this file DOCUMENTATION A User Manual is available under https://sourceforge.net/projects/irstlm The data for the examples described in the User Manual are available under http://sourceforge.net/projects/irstlm/files/irstlm/sampledata/ HOW TO INSTALL step 0: sh regenerate-makefiles.sh [--force] # set parameter force to the value "--force" if you want to recreate all links to the autotools step 1: ./configure [--prefix=/path/where/to/install] ... # run "configure --help" to get more details on the compilation options step 2: make step 3: make install These steps will generate the irstlm library and commands, respectively, under the specified path where to install. HOW TO CONTRIBUTE If you wish to contribute to the Open Source IRSTLM toolkit just tell us! Marcello Federico FBK-irst, Trento, ITALY email: federico AT fbk DOT eu irstlm-5.80.03/regenerate-makefiles.sh000755 000766 000024 00000002213 11746325041 021756 0ustar00nicolabertoldistaff000000 000000 #!/bin/sh # NOTE: # Versions 1.9 (or higher) of aclocal and automake are required. # Version 2.59 (or higher) of autoconf is required. # For Mac OSX users: # Standard distribution usually includes versions 1.6 for aclocal and automake. 
# Get versions 1.9 or higher # Set the following variable to the correct paths #ACLOCAL="/path/to/aclocal-1.9" #AUTOMAKE="/path/to/automake-1.9" force=$1; # set parameter force to the value "--force" if you want to recreate all links to the autotools die () { echo "$@" >&2 exit 1 } if [ -z "$ACLOCAL" ] then ACLOCAL=`which aclocal` fi if [ -z "$AUTOMAKE" ] then AUTOMAKE=`which automake` fi if [ -z "$AUTOCONF" ] then AUTOCONF=`which autoconf` fi if [ -z "$LIBTOOLIZE" ] then LIBTOOLIZE=`which libtoolize` if [ -z "$LIBTOOLIZE" ] then LIBTOOLIZE=`which glibtoolize` fi fi echo "Calling $LIBTOOLIZE $force" $LIBTOOLIZE $force || die "libtoolize failed" echo "Calling $ACLOCAL..." $ACLOCAL -I m4 || die "aclocal failed" echo "Calling $AUTOCONF..." $AUTOCONF || die "autoconf failed" echo "Calling $AUTOMAKE --add-missing..." $AUTOMAKE --add-missing || die "automake failed" irstlm-5.80.03/RELEASE000644 000766 000024 00000000010 12137531403 016330 0ustar00nicolabertoldistaff000000 000000 5.80.03 irstlm-5.80.03/scripts/000755 000766 000024 00000000000 12137531466 017036 5ustar00nicolabertoldistaff000000 000000 irstlm-5.80.03/src/000755 000766 000024 00000000000 12137531466 016136 5ustar00nicolabertoldistaff000000 000000 irstlm-5.80.03/src/cmd.c000644 000766 000024 00000055655 12033355375 017063 0ustar00nicolabertoldistaff000000 000000 /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #ifndef _WIN32_WCE #include #endif #include #include #include #include #if defined(_WIN32) #include #else #include #endif #ifdef USE_UPIO #include "missing.h" #include "updef.h" #endif #include "cmd.h" #define FALSE 0 #define TRUE 1 static Enum_T BoolEnum[] = { { (char*)"FALSE", FALSE}, { (char*)"TRUE", TRUE}, { (char*)"false", FALSE}, { (char*)"true", TRUE}, { (char*)"0", FALSE}, { (char*)"1", TRUE}, { (char*)"NO", FALSE}, { (char*)"YES", TRUE}, { (char*)"No", FALSE}, { (char*)"Yes", TRUE}, { (char*)"no", FALSE}, { (char*)"yes", TRUE}, { (char*)"N", FALSE}, { (char*)"Y", TRUE}, { (char*)"n", FALSE}, { (char*)"y", TRUE}, { 0, 0 } }; #ifdef NEEDSTRDUP char *strdup(const char *s); #endif #define LINSIZ 10240 static char *GetLine(FILE *fp, int n, char *Line), **str2array(char *s, char *sep); static int str2narray(int type, char *s, char *sep, void **a); static int Scan(char *ProgName, Cmd_T *cmds, char *Line), SetParam(Cmd_T *cmd, char *s), SetEnum(Cmd_T *cmd, char *s), SetFlag(Cmd_T *cmd, char *s), SetSubrange(Cmd_T *cmd, char *s), SetStrArray(Cmd_T *cmd, char *s), SetNumArray(Cmd_T *cmd, char *s), SetGte(Cmd_T *cmd, char *s), SetLte(Cmd_T *cmd, char *s), CmdError(char *opt), EnumError(Cmd_T *cmd, char *s), SubrangeError(Cmd_T *cmd, int n), GteError(Cmd_T *cmd, int n), LteError(Cmd_T *cmd, int n), PrintParam(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp), PrintParams4(int TypeFlag, int ValFlag, int MsgFlag, FILE *fp), FreeParam(Cmd_T *cmd), PrintEnum(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp), PrintFlag(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp), PrintStrArray(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp), 
PrintIntArray(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp), PrintDblArray(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp), BuildCmdList(Cmd_T **cmdp, int *cmdSz, char *ParName, va_list args), StoreCmdLine(char *s); static Cmd_T *pgcmds = 0; static int pgcmdN = 0; static int pgcmdSz = 0; static char *SepString = " \t\r\n"; static char *ProgName = 0; static char **CmdLines = 0; static int CmdLinesSz = 0, CmdLinesL = 0; int DeclareParams(char *ParName, ...) { va_list args; va_start(args, ParName); pgcmdN = BuildCmdList(&pgcmds, &pgcmdSz, ParName, args); va_end(args); return 0; } int GetParams(int *n, char ***a, char *DefCmd) { char *Line; int i, argc = *n; char **argv = *a, *s, *p, *CmdFile, *defCmd; FILE *fp; int IsPipe; #if defined(MSDOS)||defined(_WIN32) char *dot = 0; #endif extern char **environ; if(!(Line=malloc(LINSIZ))) { fprintf(stderr, "GetParams(): Unable to alloc %d bytes\n", LINSIZ); exit(1); } for(ProgName=*argv+strlen(*argv); ProgName-->*argv && *ProgName!='/' && *ProgName!='\\';); ++ProgName; #if defined(MSDOS)||defined(_WIN32) if((dot=strchr(ProgName, '.'))) *dot=0; #endif --argc; ++argv; for(i=0; environ[i]; i++) { if(strncmp(environ[i], "cmd_", 4)) continue; strcpy(Line, environ[i]+4); if(!(p=strchr(Line, '='))) continue; *p=' '; StoreCmdLine(Line); if(Scan(ProgName, pgcmds, Line)) CmdError(environ[i]); } if((defCmd=DefCmd?(DefCmd=strdup(DefCmd)):0)) { defCmd += strspn(defCmd, "\n\r"); } for(;;) { CmdFile=0; if(argc && argv[0][0]=='-' && argv[0][1]=='=') { CmdFile = argv[0]+2; ++argv; --argc; defCmd = 0; } if(!CmdFile) { int i; char ch; if(!defCmd||!(i=strcspn(defCmd, "\n\r"))) break; ch = defCmd[i]; defCmd[i] = 0; CmdFile = defCmd; defCmd += i+!!ch; defCmd += strspn(defCmd, "\n\r"); } IsPipe = !strncmp(CmdFile, "@@", 2); fp = IsPipe ? popen(CmdFile+2, "r") : strcmp(CmdFile, "-") ? 
fopen(CmdFile, "r") : stdin; if(!fp) { if(defCmd) continue; fprintf(stderr, "Unable to open command file %s\n", CmdFile); exit(1); } while(GetLine(fp, LINSIZ, Line) && strcmp(Line, "\\End")) { StoreCmdLine(Line); if(Scan(ProgName, pgcmds, Line)) CmdError(Line); } if(fp!=stdin) { if(IsPipe) pclose(fp); else fclose(fp); } CmdFile = NULL; } if(DefCmd) free(DefCmd); // while(argc && **argv=='-'){ while(argc){ if (**argv=='-'){ s=strchr(*argv, '='); //allows double dash for parameters int dash_number=1; if (*(*argv+1) == '-') dash_number++; if (s){ *s = ' '; if((p=strchr(*argv+dash_number, '.'))&&pName; cmd++) n += !!cmd->ArgStr; a[0] = calloc(n, sizeof(char*)); for(n=0, cmd=pgcmds; cmd->Name; cmd++) { if(!cmd->ArgStr) continue; a[0][n] = malloc(strlen(cmd->Name)+strlen(cmd->ArgStr)+l+2); sprintf(a[0][n], "%s%s=%s", pfx, cmd->Name, cmd->ArgStr); ++n; } return n; } static int BuildCmdList(Cmd_T **cmdp, int *cmdSz, char *ParName, va_list args) { int j, c, cmdN=0; char *s; Cmd_T *cmd, *cmds; if(!*cmdSz) { if(!(cmds=*cmdp=malloc((1+(*cmdSz=BUFSIZ))*sizeof(Cmd_T)))) { fprintf(stderr, "BuildCmdList(): malloc() failed\n"); exit(-1); } } else { for(cmds=*cmdp; cmds[cmdN].Name; ++cmdN); } while(ParName) { if(cmdN==*cmdSz) { cmds=*cmdp=realloc(cmds, (1+(*cmdSz+=BUFSIZ))*sizeof(Cmd_T)); if(!cmds) { fprintf(stderr, "BuildCmdList(): realloc() failed\n"); exit(-1); } } for(j=0; jj; c--) cmds[c] = cmds[c-1]; cmd = cmds+j; cmd->Name = ParName; cmd->Type = va_arg(args, int); cmd->Val = va_arg(args, void*); cmd->Msg = 0; cmd->Flag = 0; switch(cmd->Type&~CMDMSG) { case CMDENUMTYPE: /* get the pointer to Enum_T struct */ case CMDFLAGTYPE: cmd->p = va_arg(args, void*); break; case CMDSUBRANGETYPE: /* get the two limits */ cmd->p = (void*)calloc(2, sizeof(int)); ((int*)cmd->p)[0] = va_arg(args, int); ((int*)cmd->p)[1] = va_arg(args, int); break; case CMDGTETYPE: /* lower or upper bound */ case CMDLTETYPE: cmd->p = (void*)calloc(1, sizeof(int)); ((int*)cmd->p)[0] = va_arg(args, int); break; 
case CMDSTRARRAYTYPE: /* separator string */ cmd->p = (s=va_arg(args, char*)) ? (void*)strdup(s) : 0; break; case CMDDBLARRAYTYPE: case CMDINTARRAYTYPE: /* separator & pointer to length */ cmd->p = (void*)calloc(2, sizeof(void*)); s = va_arg(args, char*); ((char**)cmd->p)[0] = s ? strdup(s) : 0; ((int**)cmd->p)[1] = va_arg(args, int*); *((int**)cmd->p)[1] = 0; break; case CMDBOOLTYPE: cmd->Type = CMDENUMTYPE|(cmd->Type&CMDMSG); cmd->p = BoolEnum; cmd->Flag = 1; break; case CMDDOUBLETYPE: /* nothing else is needed */ case CMDFLOATTYPE: case CMDINTTYPE: case CMDSTRINGTYPE: break; default: fprintf(stderr, "%s: %s %d %s \"%s\"\n", "BuildCmdList()", "Unknown Type", cmd->Type&~CMDMSG, "for parameter", cmd->Name); exit(1); } if(cmd->Type&CMDMSG) { cmd->Type&=~CMDMSG; cmd->Msg = va_arg(args, char*); } cmdN++; ParName = va_arg(args, char*); } cmds[cmdN].Name = 0; return cmdN; } static int CmdError(char *opt) { fprintf(stderr, "Invalid option \"%s\"\n", opt); fprintf(stderr, "This program expects the following parameters:\n"); PrintParams4(TRUE, FALSE, TRUE, stderr); exit(0); return 0; } static int FreeParam(Cmd_T *cmd) { switch(cmd->Type) { case CMDSUBRANGETYPE: case CMDGTETYPE: case CMDLTETYPE: case CMDSTRARRAYTYPE: if(cmd->p) free(cmd->p); break; case CMDINTARRAYTYPE: case CMDDBLARRAYTYPE: if(!cmd->p) break; if(*(char**)cmd->p) free(*(char**)cmd->p); free(cmd->p); break; } return 0; } static int PrintParam(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp) { char ts[128]; *ts=0; fprintf(fp, "%4s", ""); switch(cmd->Type) { case CMDDOUBLETYPE: fprintf(fp, "%s", cmd->Name); if(TypeFlag) fprintf(fp, " [double]"); if(ValFlag) fprintf(fp, ": %22.15e", *(double*)cmd->Val); break; case CMDFLOATTYPE: fprintf(fp, "%s", cmd->Name); if(TypeFlag) fprintf(fp, " [float]"); if(ValFlag) fprintf(fp, ": %22.15e", *(float *)cmd->Val); break; case CMDENUMTYPE: PrintEnum(cmd, TypeFlag, ValFlag, fp); break; case CMDFLAGTYPE: PrintFlag(cmd, TypeFlag, ValFlag, fp); break; case CMDINTTYPE: 
if(TypeFlag) sprintf(ts, " [int]"); case CMDSUBRANGETYPE: if(TypeFlag&&!*ts) sprintf(ts, " [int %d ... %d]", ((int*)cmd->p)[0], ((int*)cmd->p)[1]); case CMDGTETYPE: if(TypeFlag&&!*ts) sprintf(ts, " [int >= %d]", ((int*)cmd->p)[0]); case CMDLTETYPE: if(TypeFlag&&!*ts) sprintf(ts, " [int <= %d]", ((int*)cmd->p)[0]); fprintf(fp, "%s", cmd->Name); if(*ts) fprintf(fp, " %s", ts); if(ValFlag) fprintf(fp, ": %d", *(int*)cmd->Val); break; case CMDSTRINGTYPE: fprintf(fp, "%s", cmd->Name); if(TypeFlag) fprintf(fp, " [string]"); if(ValFlag) { if(*(char **)cmd->Val) { fprintf(fp, ": \"%s\"", *(char**)cmd->Val); } else { fprintf(fp, ": %s", "NULL"); } } break; case CMDSTRARRAYTYPE: PrintStrArray(cmd, TypeFlag, ValFlag, fp); break; case CMDINTARRAYTYPE: PrintIntArray(cmd, TypeFlag, ValFlag, fp); break; case CMDDBLARRAYTYPE: PrintDblArray(cmd, TypeFlag, ValFlag, fp); break; default: fprintf(stderr, "%s: %s %d %s \"%s\"\n", "PrintParam", "Unknown Type", cmd->Type, "for parameter", cmd->Name); exit(1); } fprintf(fp, ":"); // fprintf(fp, "\n"); fflush(fp); return 0; } static char * GetLine(FILE *fp, int n, char *Line) { int j, l, offs=0; for(;;) { if(!fgets(Line+offs, n-offs, fp)) { return 0; } if(Line[offs]=='#') continue; l = strlen(Line+offs)-1; Line[offs+l] = 0; for(j=offs; Line[j]&&isspace((unsigned char)Line[j]); j++,l--); if(l<1) continue; if(j > offs) { char *s = Line+offs, *q = Line+j; while((*s++=*q++)) ; } if(Line[offs+l-1]=='\\') { offs += l; Line[offs-1] = ' '; } else { break; } } return Line; } static int Scan(char *ProgName, Cmd_T *cmds, char *Line) { char *q, *p; int i, hl, HasToMatch = FALSE, c0, c; p = Line+strspn(Line, SepString); if(!(hl=strcspn(p, SepString))) return 0; if(ProgName&&(q=strchr(p, '/')) && q-pType==CMDENUMTYPE && cmd->Flag==1){ s=(char*) malloc(5); strcpy(s,"TRUE"); }else{ s=_s; } if (!*s || (s=='\0' && cmd->Flag==0)){ fprintf(stderr, "WARNING: No value specified for parameter \"%s\"\n", cmd->Name); return 0; } switch(cmd->Type) { case 
CMDDOUBLETYPE: if(sscanf(s, "%lf", (double*)cmd->Val)!=1) { fprintf(stderr, "Float value required for parameter \"%s\"\n", cmd->Name); exit(1); } break; case CMDFLOATTYPE: if(sscanf(s, "%f", (float*)cmd->Val)!=1) { fprintf(stderr, "Float value required for parameter \"%s\"\n", cmd->Name); exit(1); } break; case CMDENUMTYPE: SetEnum(cmd, s); break; case CMDFLAGTYPE: SetFlag(cmd, s); break; case CMDINTTYPE: if(sscanf(s, "%i", (int*)cmd->Val)!=1) { fprintf(stderr, "Integer value required for parameter \"%s\"\n", cmd->Name); exit(1); } break; case CMDSTRINGTYPE: *(char **)cmd->Val = (strcmp(s, "") && strcmp(s, "NULL")) ? strdup(s) : 0; break; case CMDSTRARRAYTYPE: SetStrArray(cmd, s); break; case CMDINTARRAYTYPE: case CMDDBLARRAYTYPE: SetNumArray(cmd, s); break; case CMDGTETYPE: SetGte(cmd, s); break; case CMDLTETYPE: SetLte(cmd, s); break; case CMDSUBRANGETYPE: SetSubrange(cmd, s); break; default: fprintf(stderr, "%s: %s %d %s \"%s\"\n", "SetParam", "Unknown Type", cmd->Type, "for parameter", cmd->Name); exit(1); } cmd->ArgStr = strdup(s); if(!*_s && cmd->Type==CMDENUMTYPE && cmd->Flag==1){ free (s); } return 0; } static int SetEnum(Cmd_T *cmd, char *s) { Enum_T *en; for(en=(Enum_T*)cmd->p; en->Name; en++) { if(*en->Name && !strcmp(s, en->Name)) { *(int*)cmd->Val = en->Idx; return 0; } } return EnumError(cmd, s); } int EnumIdx(Enum_T *en, char *s) { if(en) for(; en->Name; en++) { if(*en->Name && !strcmp(s, en->Name)) return en->Idx; } return -1; } char * EnumStr(Enum_T *en, int i) { if(en) for(; en->Name; en++) if(en->Idx==i) return en->Name; return 0; } static int SetFlag(Cmd_T *cmd, char *s) { Enum_T *en; int l; for(; (l=strcspn(s, "+"))>0; s+=l,s+=!!*s) { for(en=(Enum_T*)cmd->p; en->Name&&(l!=strlen(en->Name)||strncmp(s, en->Name, l)); en++); if(!en->Name) return EnumError(cmd, s); *(int*)cmd->Val |= en->Idx; } return 0; } static int SetSubrange(Cmd_T *cmd, char *s) { int n; if(sscanf(s, "%i", &n)!=1) { fprintf(stderr, "Integer value required for parameter 
\"%s\"\n", cmd->Name); exit(1); } if(n < *(int*)cmd->p || n > *((int*)cmd->p+1)) { return SubrangeError(cmd, n); } *(int*)cmd->Val = n; return 0; } static int SetGte(Cmd_T *cmd, char *s) { int n; if(sscanf(s, "%i", &n)!=1) { fprintf(stderr, "Integer value required for parameter \"%s\"\n", cmd->Name); exit(1); } if(n<*(int*)cmd->p) { return GteError(cmd, n); } *(int*)cmd->Val = n; return 0; } static int SetStrArray(Cmd_T *cmd, char *s) { *(char***)cmd->Val = str2array(s, (char*)cmd->p); return 0; } static int SetNumArray(Cmd_T *cmd, char *s) { *((int**)cmd->p)[1] = str2narray(cmd->Type, s, *((char**)cmd->p), cmd->Val); return 0; } static int SetLte(Cmd_T *cmd, char *s) { int n; if(sscanf(s, "%i", &n)!=1) { fprintf(stderr, "Integer value required for parameter \"%s\"\n", cmd->Name); exit(1); } if(n > *(int*)cmd->p) { return LteError(cmd, n); } *(int*)cmd->Val = n; return 0; } static int EnumError(Cmd_T *cmd, char *s) { Enum_T *en; fprintf(stderr, "Invalid value \"%s\" for parameter \"%s\"\n", s, cmd->Name); fprintf(stderr, "Valid values are:\n"); for(en=(Enum_T*)cmd->p; en->Name; en++) { if(*en->Name) fprintf(stderr, " %s\n", en->Name); } fprintf(stderr, "\n"); exit(1); return 0; } static int GteError(Cmd_T *cmd, int n) { fprintf(stderr, "Value %d out of range for parameter \"%s\"\n", n, cmd->Name); fprintf(stderr, "Valid values must be greater than or equal to %d\n", *(int*)cmd->p); exit(1); return 0; } static int LteError(Cmd_T *cmd, int n) { fprintf(stderr, "Value %d out of range for parameter \"%s\"\n", n, cmd->Name); fprintf(stderr, "Valid values must be less than or equal to %d\n", *(int*)cmd->p); exit(1); return 0; } static int SubrangeError(Cmd_T *cmd, int n) { fprintf(stderr, "Value %d out of range for parameter \"%s\"\n", n, cmd->Name); fprintf(stderr, "Valid values range from %d to %d\n", *(int*)cmd->p, *((int*)cmd->p+1)); exit(1); return 0; } static int PrintEnum(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp) { Enum_T *en; char *sep=""; fprintf(fp, 
"%s", cmd->Name); if(TypeFlag) { fprintf(fp, " [enum { "); for(en=(Enum_T*)cmd->p; en->Name; en++) { if(*en->Name) { fprintf(fp, "%s%s", sep, en->Name); sep=", "; } } fprintf(fp, " }]"); } if(ValFlag) { for(en=(Enum_T*)cmd->p; en->Name; en++) { if(*en->Name && en->Idx==*(int*)cmd->Val) { fprintf(fp, ": %s", en->Name); } } } // fprintf(fp, "\n"); return 0; } static int PrintFlag(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp) { Enum_T *en; char *sep=""; fprintf(fp, "%s", cmd->Name); if(TypeFlag) { fprintf(fp, ": flag { "); for(en=(Enum_T*)cmd->p; en->Name; en++) { if(*en->Name) { fprintf(fp, "%s%s", sep, en->Name); sep=", "; } } fprintf(fp, " }"); } if(ValFlag) { fprintf(fp, ": "); for(en=(Enum_T*)cmd->p; en->Name; en++) { if(*en->Name && (en->Idx&*(int*)cmd->Val)==en->Idx) { fprintf(fp, "%s%s", sep, en->Name); sep="+"; } } } fprintf(fp, "\n"); return 0; } static int PrintStrArray(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp) { char *indent, **s = *(char***)cmd->Val; int l = 4+strlen(cmd->Name); fprintf(fp, "%s", cmd->Name); if(TypeFlag) { fprintf(fp, ": string array, separator \"%s\"", cmd->p?(char*)cmd->p:""); } indent = malloc(l+2); memset(indent, ' ', l+1); indent[l+1] = 0; if(ValFlag) { fprintf(fp, ": %s", s ? (*s ? 
*s++ : "NULL") : ""); if(s) while(*s) { fprintf(fp, "\n%s %s", indent, *s++); } } free(indent); fprintf(fp, "\n"); return 0; } static int PrintIntArray(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp) { char *indent; int l = 4+strlen(cmd->Name), n, *i = *(int**)cmd->Val; fprintf(fp, "%s", cmd->Name); if(TypeFlag) { fprintf(fp, ": int array, separator \"%s\"", *(char**)cmd->p?*(char**)cmd->p:""); } n = *((int**)cmd->p)[1]; indent = malloc(l+2); memset(indent, ' ', l+1); indent[l+1] = 0; if(ValFlag) { fprintf(fp, ":"); if(i&&n>0) { fprintf(fp, " %d", *i++); while(--n) fprintf(fp, "\n%s %d", indent, *i++); } } free(indent); fprintf(fp, "\n"); return 0; } static int PrintDblArray(Cmd_T *cmd, int TypeFlag, int ValFlag, FILE *fp) { char *indent; int l = 4+strlen(cmd->Name), n; double *x = *(double**)cmd->Val; fprintf(fp, "%s", cmd->Name); if(TypeFlag) { fprintf(fp, ": double array, separator \"%s\"", *(char**)cmd->p?*(char**)cmd->p:""); } n = *((int**)cmd->p)[1]; indent = malloc(l+2); memset(indent, ' ', l+1); indent[l+1] = 0; if(ValFlag) { fprintf(fp, ":"); if(x&&n>0) { fprintf(fp, " %e", *x++); while(--n) fprintf(fp, "\n%s %e", indent, *x++); } } free(indent); fprintf(fp, "\n"); return 0; } static char ** str2array(char *s, char *sep) { char *p, **a; int n = 0, l; if(!sep) sep = SepString; p = s += strspn(s, sep); if(!*p) return 0; while(*p) { p += strcspn(p, sep); p += strspn(p, sep); ++n; } a = calloc(n+1, sizeof(char*)); p = s; n = 0; while(*p) { l = strcspn(p, sep); a[n] = malloc(l+1); memcpy(a[n], p, l); a[n][l] = 0; ++n; p += l; p += strspn(p, sep); } return a; } int str2narray(int type, char *s, char *sep, void **a) { char *p; double *x; int *i; int n = 0; if(!sep) sep=SepString; for(p=s; *p; ) { p += strcspn(p, sep); p += !!*p; ++n; } *a = 0; if(!n) return 0; *a = calloc(n, (type==CMDINTARRAYTYPE)?sizeof(int):sizeof(double)); i = (int*)*a; x = (double*)*a; p = s; n = 0; while(*p) { switch(type) { case CMDINTARRAYTYPE: *i++ = atoi(p); break; case 
CMDDBLARRAYTYPE: *x++ = atof(p); break; } ++n; p += strcspn(p, sep); p += !!*p; } return n; } static int StoreCmdLine(char *s) { s += strspn(s, SepString); if(!*s) return 0; if(CmdLinesL>=CmdLinesSz) { CmdLines=CmdLinesSz ? (char**)realloc(CmdLines, (CmdLinesSz+=BUFSIZ)*sizeof(char**)) : (char**)malloc((CmdLinesSz=BUFSIZ)*sizeof(char**)); if(!CmdLines) { fprintf(stderr, "%s\n", "StoreCmdLine(): malloc() failed"); exit(-1); } } CmdLines[CmdLinesL++] = strdup(s); return 0; } irstlm-5.80.03/src/cmd.h000644 000766 000024 00000004026 12041267671 017052 0ustar00nicolabertoldistaff000000 000000 // $Id: cmd.h 3626 2010-10-07 11:41:05Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #if !defined(CMD_H) #define CMD_H #define CMDDOUBLETYPE 1 #define CMDENUMTYPE 2 #define CMDINTTYPE 3 #define CMDSTRINGTYPE 4 #define CMDSUBRANGETYPE 5 #define CMDGTETYPE 6 #define CMDLTETYPE 7 #define CMDSTRARRAYTYPE 8 #define CMDBOOLTYPE 9 #define CMDFLAGTYPE 10 #define CMDINTARRAYTYPE 11 #define CMDDBLARRAYTYPE 12 #define CMDFLOATTYPE 13 #define CMDMSG (1<<31) #include #ifdef __cplusplus extern "C" { #endif typedef struct { char *Name; int Idx; } Enum_T; typedef struct { int Type; int Flag; char *Name, *ArgStr; char *Msg; void *Val, *p; } Cmd_T; int DeclareParams(char *, ...), GetParams(int *n, char ***a, char *CmdFileName), GetDotParams(char *, ...), SPrintParams(char ***a, char *pfx), PrintParams(int ValFlag, FILE *fp), FullPrintParams(int TypeFlag, int ValFlag, int MsgFlag, FILE *fp), EnumIdx(Enum_T *en, char *s); char *EnumStr(Enum_T *en, int i); #ifdef __cplusplus } #endif #endif irstlm-5.80.03/src/compile-lm.cpp000644 000766 000024 00000037003 12033356002 020665 0ustar00nicolabertoldistaff000000 000000 // $Id: compile-lm.cpp 3677 2010-10-13 09:06:51Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit, compile LM Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ using namespace std; #include #include #include #include #include #include "cmd.h" #include "util.h" #include "math.h" #include "lmContainer.h" /********************************/ void print_help(int TypeFlag=0){ std::cerr << std::endl << "compile-lm - compiles an ARPA format LM into an IRSTLM format one" << std::endl; std::cerr << std::endl << "USAGE:" << std::endl; std::cerr << " compile-lm [options] [output-file.blm]" << std::endl; std::cerr << std::endl << "DESCRIPTION:" << std::endl; std::cerr << " compile-lm reads a standard LM file in ARPA format and produces" << std::endl; std::cerr << " a compiled representation that the IRST LM toolkit can quickly" << std::endl; std::cerr << " read and process. LM file can be compressed." 
<< std::endl; std::cerr << std::endl << "OPTIONS:" << std::endl; FullPrintParams(TypeFlag, 0, 1, stderr); } void usage(const char *msg = 0) { if (msg) { std::cerr << msg << std::endl; } if (!msg){ print_help(); } exit(1); } int main(int argc, char **argv) { char *seval=NULL; char *tmpdir=NULL; char *sfilter=NULL; bool textoutput = false; bool sent_PP_flag = false; bool invert = false; bool sscore = false; bool skeepunigrams = false; int debug = 0; bool memmap = false; int requiredMaxlev = 1000; int dub = 10000000; int randcalls = 0; float ngramcache_load_factor = 0.0; float dictionary_load_factor = 0.0; bool help=false; std::vector files; DeclareParams((char*) "text", CMDBOOLTYPE|CMDMSG, &textoutput, "output is again in text format; default is false", "t", CMDBOOLTYPE|CMDMSG, &textoutput, "output is again in text format; default is false", "filter", CMDSTRINGTYPE|CMDMSG, &sfilter, "filter a binary language model with a word list", "f", CMDSTRINGTYPE|CMDMSG, &sfilter, "filter a binary language model with a word list", "keepunigrams", CMDBOOLTYPE|CMDMSG, &skeepunigrams, "filter by keeping all unigrams in the table, default is true", "ku", CMDBOOLTYPE|CMDMSG, &skeepunigrams, "filter by keeping all unigrams in the table, default is true", "eval", CMDSTRINGTYPE|CMDMSG, &seval, "computes perplexity of the specified text file", "e", CMDSTRINGTYPE|CMDMSG, &seval, "computes perplexity of the specified text file", "randcalls", CMDINTTYPE|CMDMSG, &randcalls, "computes N random calls on the specified text file", "r", CMDINTTYPE|CMDMSG, &randcalls, "computes N random calls on the specified text file", "score", CMDBOOLTYPE|CMDMSG, &sscore, "computes log-prob scores of n-grams from standard input", "s", CMDBOOLTYPE|CMDMSG, &sscore, "computes log-prob scores of n-grams from standard input", "debug", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0", "d", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0", "level", 
CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken", "l", CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken", "memmap", CMDBOOLTYPE|CMDMSG, &memmap, "uses memory map to read a binary LM", "mm", CMDBOOLTYPE|CMDMSG, &memmap, "uses memory map to read a binary LM", "dub", CMDINTTYPE|CMDMSG, &dub, "dictionary upperbound to compute OOV word penalty: default 10^7", "tmpdir", CMDSTRINGTYPE|CMDMSG, &tmpdir, "directory for temporary computation, default is either the environment variable TMP if defined or \"/tmp\")", "invert", CMDBOOLTYPE|CMDMSG, &invert, "builds an inverted n-gram binary table for fast access; default if false", "i", CMDBOOLTYPE|CMDMSG, &invert, "builds an inverted n-gram binary table for fast access; default if false", "sentence", CMDBOOLTYPE|CMDMSG, &sent_PP_flag, "computes perplexity at sentence level (identified through the end symbol)", "dict_load_factor", CMDFLOATTYPE|CMDMSG, &dictionary_load_factor, "sets the load factor for ngram cache; it should be a positive real value; default is 0", "ngram_load_factor", CMDFLOATTYPE|CMDMSG, &ngramcache_load_factor, "sets the load factor for ngram cache; it should be a positive real value; default is false", "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help", "h", CMDBOOLTYPE|CMDMSG, &help, "print this help", (char*)NULL ); if (argc == 1){ usage(); } for(int i=1; i < argc; i++) { if(argv[i][0] != '-'){ files.push_back(argv[i]); } } GetParams(&argc, &argv, (char*) NULL); if (help){ usage(); } if (files.size() > 2) { usage("Warning: Too many arguments"); } if (files.size() < 1) { usage("Warning: Please specify a LM file to read from"); } //Define output type of table OUTFILE_TYPE outtype; if (textoutput) outtype=TEXT; else if (seval != NULL || sscore) outtype=NONE; else outtype=BINARY; std::string infile = files[0]; std::string outfile = ""; 
if (files.size() == 1) { outfile=infile; //remove path information std::string::size_type p = outfile.rfind('/'); if (p != std::string::npos && ((p+1) < outfile.size())) outfile.erase(0,p+1); //eventually strip .gz if (outfile.compare(outfile.size()-3,3,".gz")==0) outfile.erase(outfile.size()-3,3); outfile+=(textoutput?".lm":".blm"); } else{ outfile = files[1]; } std::cerr << "inpfile: " << infile << std::endl; std::cerr << "outfile: " << outfile << std::endl; if (seval!=NULL) std::cerr << "evalfile: " << seval << std::endl; if (sscore==true) std::cerr << "interactive: " << sscore << std::endl; if (memmap) std::cerr << "memory mapping: " << memmap << std::endl; std::cerr << "loading up to the LM level " << requiredMaxlev << " (if any)" << std::endl; std::cerr << "dub: " << dub<< std::endl; if (tmpdir != NULL) { if (setenv("TMP",tmpdir,1)) std::cerr << "temporary directory has not been set" << std::endl; std::cerr << "tmpdir: " << tmpdir << std::endl; } //checking the language model type lmContainer* lmt=NULL; lmt = lmt->CreateLanguageModel(infile,ngramcache_load_factor,dictionary_load_factor); //let know that table has inverted n-grams if (invert) lmt->is_inverted(invert); lmt->setMaxLoadedLevel(requiredMaxlev); lmt->load(infile); //CHECK this part for sfilter to make it possible only for LMTABLE if (sfilter != NULL) { lmContainer* filtered_lmt = NULL; std::cerr << "BEFORE sublmC (" << (void*) filtered_lmt << ") (" << (void*) &filtered_lmt << ")\n"; // the function filter performs the filtering and returns true, only for specific lm type if (((lmContainer*) lmt)->filter(sfilter,filtered_lmt,skeepunigrams?"yes":"no")) { std::cerr << "BFR filtered_lmt (" << (void*) filtered_lmt << ") (" << (void*) &filtered_lmt << ")\n"; filtered_lmt->stat(); delete lmt; lmt=filtered_lmt; std::cerr << "AFTER filtered_lmt (" << (void*) filtered_lmt << ")\n"; filtered_lmt->stat(); std::cerr << "AFTER lmt (" << (void*) lmt << ")\n"; lmt->stat(); } } if (dub) 
lmt->setlogOOVpenalty((int)dub); //use caches to save time (only if PS_CACHE_ENABLE is defined through compilation flags) lmt->init_caches(lmt->maxlevel()); if (seval != NULL) { if (randcalls>0) { cerr << "perform random " << randcalls << " using dictionary of test set\n"; dictionary *dict; dict=new dictionary(seval); //build extensive histogram int histo[dict->totfreq()]; //total frequency int totfreq=0; for (int n=0; nsize(); n++) for (int m=0; mfreq(n); m++) histo[totfreq++]=n; ngram ng(lmt->getDict()); srand(1234); double bow; int bol=0; if (debug>1) ResetUserTime(); for (int n=0; ngetDict()->encode(dict->decode(w))); lmt->clprob(ng,&bow,&bol); //(using caches if available) if (debug==1) { std::cout << ng.dict->decode(*ng.wordp(1)) << " [" << lmt->maxlevel()-bol << "]" << " "; std::cout << std::endl; } if ((n % 100000)==0) { std::cerr << "."; lmt->check_caches_levels(); } } std::cerr << "\n"; if (debug>1) PrintUserTime("Finished in"); if (debug>1) lmt->stat(); delete lmt; return 0; } else { if (lmt->getLanguageModelType() == _IRSTLM_LMINTERPOLATION) { debug = (debug>4)?4:debug; std::cerr << "Maximum debug value for this LM type: " << debug << std::endl; } std::cerr << "Start Eval" << std::endl; std::cerr << "OOV code: " << lmt->getDict()->oovcode() << std::endl; ngram ng(lmt->getDict()); std::cout.setf(ios::fixed); std::cout.precision(2); // if (debug>0) std::cout.precision(8); std::fstream inptxt(seval,std::ios::in); int Nbo=0, Nw=0,Noov=0; double logPr=0,PP=0,PPwp=0,Pr; // variables for storing sentence-based Perplexity int sent_Nbo=0, sent_Nw=0,sent_Noov=0; double sent_logPr=0,sent_PP=0,sent_PPwp=0; ng.dict->incflag(1); int bos=ng.dict->encode(ng.dict->BoS()); int eos=ng.dict->encode(ng.dict->EoS()); ng.dict->incflag(0); double bow; int bol=0; char *msp; unsigned int statesize; lmt->dictionary_incflag(1); while(inptxt >> ng) { if (ng.size>lmt->maxlevel()) ng.size=lmt->maxlevel(); // reset ngram at begin of sentence if (*ng.wordp(1)==bos) { ng.size=1; 
continue; } if (ng.size>=1) { Pr=lmt->clprob(ng,&bow,&bol,&msp,&statesize); logPr+=Pr; sent_logPr+=Pr; if (debug==1) { std::cout << ng.dict->decode(*ng.wordp(1)) << " [" << ng.size-bol << "]" << " "; if (*ng.wordp(1)==eos) std::cout << std::endl; } if (debug==2) { std::cout << ng << " [" << ng.size-bol << "-gram]" << " " << Pr; std::cout << std::endl; } if (debug==3) { std::cout << ng << " [" << ng.size-bol << "-gram]" << " " << Pr << " bow:" << bow; std::cout << std::endl; } if (debug==4) { std::cout << ng << " [" << ng.size-bol << "-gram: recombine:" << statesize << " state:" << (void*) msp << "] [" << ng.size+1-((bol==0)?(1):bol) << "-gram: bol:" << bol << "] " << Pr << " bow:" << bow; std::cout << std::endl; } if (debug>4) { std::cout << ng << " [" << ng.size-bol << "-gram: recombine:" << statesize << " state:" << (void*) msp << "] [" << ng.size+1-((bol==0)?(1):bol) << "-gram: bol:" << bol << "] " << Pr << " bow:" << bow; double totp=0.0; int oldw=*ng.wordp(1); double oovp=lmt->getlogOOVpenalty(); lmt->setlogOOVpenalty((double) 0); for (int c=0; csize(); c++) { *ng.wordp(1)=c; totp+=pow(10.0,lmt->clprob(ng)); //using caches if available } *ng.wordp(1)=oldw; if ( totp < (1.0 - 1e-5) || totp > (1.0 + 1e-5)) std::cout << " [t=" << totp << "] POSSIBLE ERROR"; std::cout << std::endl; lmt->setlogOOVpenalty((double)oovp); } if (lmt->is_OOV(*ng.wordp(1))) { Noov++; sent_Noov++; } if (bol) { Nbo++; sent_Nbo++; } Nw++; sent_Nw++; if (sent_PP_flag && (*ng.wordp(1)==eos)) { sent_PP=exp((-sent_logPr * log(10.0)) /sent_Nw); sent_PPwp= sent_PP * (1 - 1/exp((sent_Noov * lmt->getlogOOVpenalty()) * log(10.0) / sent_Nw)); std::cout << "%% sent_Nw=" << sent_Nw << " sent_PP=" << sent_PP << " sent_PPwp=" << sent_PPwp << " sent_Nbo=" << sent_Nbo << " sent_Noov=" << sent_Noov << " sent_OOV=" << (float)sent_Noov/sent_Nw * 100.0 << "%" << std::endl; //reset statistics for sentence based Perplexity sent_Nw=sent_Noov=sent_Nbo=0; sent_logPr=0.0; } if ((Nw % 100000)==0) { std::cerr << "."; 
lmt->check_caches_levels(); } } } PP=exp((-logPr * log(10.0)) /Nw); PPwp= PP * (1 - 1/exp((Noov * lmt->getlogOOVpenalty()) * log(10.0) / Nw)); std::cout << "%% Nw=" << Nw << " PP=" << PP << " PPwp=" << PPwp << " Nbo=" << Nbo << " Noov=" << Noov << " OOV=" << (float)Noov/Nw * 100.0 << "%"; if (debug) std::cout << " logPr=" << logPr; std::cout << std::endl; lmt->used_caches(); lmt->stat(); if (debug>1) lmt->stat(); delete lmt; return 0; }; } if (sscore == true) { ngram ng(lmt->getDict()); int bos=ng.dict->encode(ng.dict->BoS()); int bol; double bow; unsigned int n=0; std::cout.setf(ios::scientific); std::cout << "> "; lmt->dictionary_incflag(1); while(std::cin >> ng) { //std::cout << ng << std::endl;; // reset ngram at begin of sentence if (*ng.wordp(1)==bos) { ng.size=1; continue; } if (ng.size>=lmt->maxlevel()) { ng.size=lmt->maxlevel(); ++n; if ((n % 100000)==0) { std::cerr << "."; lmt->check_caches_levels(); } std::cout << ng << " p= " << lmt->clprob(ng,&bow,&bol) * M_LN10; std::cout << " bo= " << bol << std::endl; } else { std::cout << ng << " p= NULL" << std::endl; } std::cout << "> "; } std::cout << std::endl; if (debug>1) lmt->stat(); delete lmt; return 0; } if (textoutput == true) { std::cerr << "Saving in txt format to " << outfile << std::endl; lmt->savetxt(outfile.c_str()); } else if (!memmap) { std::cerr << "Saving in bin format to " << outfile << std::endl; lmt->savebin(outfile.c_str()); } else { std::cerr << "Impossible to save to " << outfile << std::endl; } delete lmt; return 0; } irstlm-5.80.03/src/cplsa.cpp000755 000766 000024 00000022017 12032511222 017727 0ustar00nicolabertoldistaff000000 000000 /****************************************************************************** IrstLM: IRST Language Model Toolkit, compile LM Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software 
Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA **********************************************dou********************************/ using namespace std; #include #include "mfstream.h" #include "mempool.h" #include "htable.h" #include "n_gram.h" #include "util.h" #include "dictionary.h" #include "ngramtable.h" #include "doc.h" #include "cplsa.h" #define MY_RAND (((double)rand()/RAND_MAX)* 2.0 - 1.0) plsa::plsa(dictionary* dictfile,int top, char* baseFile,char* featFile,char* hFile,char* wFile,char* tFile) { dict = dictfile; topics=top; assert (topics>0); W=new double* [dict->size()+1]; for (int i=0; i<(dict->size()+1); i++) W[i]=new double [topics]; T=new double* [dict->size()+1]; for (int i=0; i<(dict->size()+1); i++) T[i]=new double [topics]; H=new double [topics]; basefname=baseFile; featfname=featFile; tfname=tFile; wfname=wFile; hinfname=new char[BUFSIZ]; sprintf(hinfname,"%s",hFile); houtfname=new char[BUFSIZ]; sprintf(houtfname,"%s.out",hinfname); cerr << "Hfile in:" << hinfname << " out:" << houtfname << "\n"; } int plsa::initW(double noise,int spectopic) { FILE *f; if (wfname && ((f=fopen(wfname,"r"))!=NULL)) { fclose(f); loadW(wfname); } else { if (spectopic) { //special topic 0: first st words from dict double TotW=0; for (int i=0; ifreq(i); for (int i=0; i<(dict->size()+1); i++) W[i][0]/=TotW; } for (int t=(spectopic?1:0); tsize()+1); i++) TotW+=W[i][t]=1 + noise * MY_RAND; for (int i=0; i<(dict->size()+1); i++) W[i][t]/=TotW; } } return 1; } int plsa::initH(double noise,int 
n) { FILE *f; if ((f=fopen(hinfname,"r"))==NULL) { mfstream hinfd(hinfname,ios::out); for (int j=0; jsize(); i++) { out << dict->decode(i) << " " << dict->freq(i); double totW=0; for (int t=0; tsize(); i++) out.write((const char*)W[i],sizeof(double)*topics); out.close(); return 1; } int plsa::saveT(char* fname) { mfstream out(fname,ios::out); out.write((const char*)&topics,sizeof(int)); for (int i=0; isize(); i++) { double totT=0.0; for (int t=0; t0.00001) { out.write((const char*)&i,sizeof(int)); out.write((const char*)T[i],sizeof(double)*topics); } } out.close(); return 1; } int plsa::combineT(char* tlist) { double *tvec=new double[topics]; int w; int to; char fname[1000]; for (int i=0; isize(); i++) for (int t=0; t> fname) { mfstream tin(fname,ios::in); tin.read((char *)&to,sizeof(int)); assert(to==topics); while(!tin.eof()) { tin.read((char *)&w,sizeof(int)); tin.read((char *)tvec,sizeof(double)*topics); for (int t=0; tsize(); i++) { if (T[i][t]==0.0) T[i][t]=1e-10; //add some noise Tsum+=T[i][t]; } for (int i=0; isize(); i++) W[i][t]=T[i][t]/Tsum; } //check return 1; } int plsa::loadW(char* fname) { int r; mfstream inp(fname,ios::in); inp.read((char *)&r,sizeof(int)); //number of topics if (topics>0 && r != topics) { cerr << "incompatible number of topics: " << r << "\n"; exit(2); } else topics=r; for (int i=0; isize(); i++) inp.read((char *)W[i],sizeof(double)*topics); return 1; } int plsa::saveFeat(char* fname) { //compute distribution on doc 0 double *WH=new double [dict->size()]; for (int i=0; isize(); i++) { WH[i]=0; for (int t=0; tsize(); i++) if (WH[i]>maxp) maxp=WH[i]; cerr << "Get max prob" << maxp << "\n"; mfstream out(fname,ios::out); ngramtable ngt(NULL,1,NULL,NULL,NULL,0,0,NULL,0,COUNT); ngt.dict->incflag(1); ngram ng(dict,1); ngram ng2(ngt.dict,1); for (int i=0; isize(); i++) { *ng.wordp(1)=i; ng.freq=(int)floor((WH[i]/maxp) * 1000000); if (ng.freq) { ng2.trans(ng); ng2.freq=ng.freq; //cout << ng << "\n" << ng2 << "\n"; ngt.put(ng2); 
ngt.dict->incfreq(*ng2.wordp(1),ng2.freq); } } ngt.dict->incflag(0); ngt.savetxt(fname,1,1);// save in google format return 1; } int plsa::train(char *trainfile,int maxiter,double noiseH,int flagW,double noiseW,int spectopic) { int dsize=dict->size(); //includes possible OOV srand(100); if (flagW) { //intialize W initW(noiseW,spectopic); } doc trset(dict,trainfile); trset.open(); //n is known initH(noiseH,trset.n); //support array double *WH=new double [dsize]; //command char cmd[100]; sprintf(cmd,"mv %s %s",houtfname,hinfname); //start of training double lastLL=10; double LL=-1e+99; int iter=0; int r=topics; while (iter < maxiter) //while ( (iter < maxiter) && (((lastLL-LL)/lastLL)>0.00001)) { lastLL=LL; LL=0; if (flagW) //reset support arrays for (int i=0; isize(); i++) for (int t=0; tUPPER_SINGLE_PRECISION_OF_1 || totHsize(); i++) { delete W[i]; delete T[i]; } delete [] W; delete [] H; delete [] T; } int saveW(char* fname); int saveT(char* fname); int combineT(char* tlist); int saveWtxt(char* fname); int loadW(char* fname); int initW(double noise,int spectopic); int initH(double noise,int maxdoc); int train(char *trainfile,int maxiter,double noiseH,int flagW=0,double noiseW=0,int spectopic=0); int saveFeat(char* fname); }; irstlm-5.80.03/src/dict.cpp000644 000766 000024 00000013212 12032511222 017542 0ustar00nicolabertoldistaff000000 000000 // $Id: dict.cpp 3677 2010-10-13 09:06:51Z bertoldi $ using namespace std; #include #include "cmd.h" #include "mfstream.h" #include "mempool.h" #include "dictionary.h" void print_help(int TypeFlag=0){ std::cerr << std::endl << "dict - extracts a dictionary" << std::endl; std::cerr << std::endl << "USAGE:" << std::endl; std::cerr << " dict -i= [options]" << std::endl; std::cerr << std::endl << "DESCRIPTION:" << std::endl; std::cerr << " dict extracts a dictionary from a corpus or a dictionary." 
<< std::endl; std::cerr << std::endl << "OPTIONS:" << std::endl; FullPrintParams(TypeFlag, 0, 1, stderr); } void usage(const char *msg = 0) { if (msg){ std::cerr << msg << std::endl; } else{ print_help(); } exit(1); } int main(int argc, char **argv) { char *inp=NULL; char *out=NULL; char *testfile=NULL; char *intsymb=NULL; //must be single characters int freqflag=0; //print frequency of words int sortflag=0; //sort dictionary by frequency int curveflag=0; //plot dictionary growth curve int curvesize=10; //size of curve int listflag=0; //print oov words in test file int size=1000000; //initial size of table .... float load_factor=0; //initial load factor, default LOAD_FACTOR int prunefreq=0; //pruning according to freq value int prunerank=0; //pruning according to freq rank bool help=false; DeclareParams((char*) "InputFile", CMDSTRINGTYPE|CMDMSG, &inp, "input file (Mandatory)", "i", CMDSTRINGTYPE|CMDMSG, &inp, "input file (Mandatory)", "OutputFile", CMDSTRINGTYPE|CMDMSG, &out, "output file", "o", CMDSTRINGTYPE|CMDMSG, &out, "output file", "f", CMDBOOLTYPE|CMDMSG, &freqflag,"output word frequencies; default is false", "Freq", CMDBOOLTYPE|CMDMSG, &freqflag,"output word frequencies; default is false", "sort", CMDBOOLTYPE|CMDMSG, &sortflag,"sort dictionary by frequency; default is false", "Size", CMDINTTYPE|CMDMSG, &size, "Initial dictionary size; default is 1000000", "s", CMDINTTYPE|CMDMSG, &size, "Initial dictionary size; default is 1000000", "LoadFactor", CMDFLOATTYPE|CMDMSG, &load_factor, "set the load factor for cache; it should be a positive real value; default is 0", "lf", CMDFLOATTYPE|CMDMSG, &load_factor, "set the load factor for cache; it should be a positive real value; default is 0", "IntSymb", CMDSTRINGTYPE|CMDMSG, &intsymb, "interruption symbol", "is", CMDSTRINGTYPE|CMDMSG, &intsymb, "interruption symbol", "PruneFreq", CMDINTTYPE|CMDMSG, &prunefreq, "prune words with frequency below the specified value", "pf", CMDINTTYPE|CMDMSG, &prunefreq, "prune words 
with frequency below the specified value", "PruneRank", CMDINTTYPE|CMDMSG, &prunerank, "prune words with frequency rank above the specified value", "pr", CMDINTTYPE|CMDMSG, &prunerank, "prune words with frequency rank above the specified value", "Curve", CMDBOOLTYPE|CMDMSG, &curveflag,"show dictionary growth curve; default is false", "c", CMDBOOLTYPE|CMDMSG, &curveflag,"show dictionary growth curve; default is false", "CurveSize", CMDINTTYPE|CMDMSG, &curvesize, "default 10", "cs", CMDINTTYPE|CMDMSG, &curvesize, "default 10", "TestFile", CMDSTRINGTYPE|CMDMSG, &testfile, "compute OOV rates on the specified test corpus", "t", CMDSTRINGTYPE|CMDMSG, &testfile, "compute OOV rates on the specified test corpus", "ListOOV", CMDBOOLTYPE|CMDMSG, &listflag, "print OOV words to stderr; default is false", "oov", CMDBOOLTYPE|CMDMSG, &listflag, "print OOV words to stderr; default is false", "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help", "h", CMDBOOLTYPE|CMDMSG, &help, "print this help", (char*)NULL ); if (argc == 1){ usage(); } GetParams(&argc, &argv, (char*) NULL); if (help){ usage(); } if (inp==NULL) { usage("Warning: no input file specified"); }; // options compatibility issues: if (curveflag && !freqflag) freqflag=1; if (testfile!=NULL && !freqflag) { freqflag=1; mfstream test(testfile,ios::in); if (!test) { usage(strcat((char*) "Warning: cannot open testfile: ", testfile)); exit(1); } test.close(); } //create dictionary: generating it from training corpus, or loading it from a dictionary file dictionary *d = new dictionary(inp,size,load_factor); // sort dictionary if (prunefreq>0 || prunerank>0 || sortflag) { dictionary *sortd=new dictionary(d,false); sortd->sort(); delete d; d=sortd; } // show statistics on dictionary growth and OOV rates on test corpus if (testfile != NULL) d->print_curve(curvesize, d->test(curvesize, testfile, listflag)); else if (curveflag) d->print_curve(curvesize); //prune words according to frequency and rank if (prunefreq>0 || prunerank>0) { 
cerr << "pruning dictionary prunefreq:" << prunefreq << " prunerank: " << prunerank <<" \n"; int count=0; int bos=d->encode(d->BoS()); int eos=d->encode(d->EoS()); for (int i=0; i< d->size() ; i++) { if (prunefreq && d->freq(i) <= prunefreq && i!=bos && i!=eos) { d->freq(i,0); continue; } if (prunerank>0 && count>=prunerank && i!=bos && i!=eos) { d->freq(i,0); continue; } count++; } } // if outputfile is provided, write the dictionary into it if(out!=NULL) d->save(out,freqflag); } irstlm-5.80.03/src/dictionary.cpp000644 000766 000024 00000027040 12032511222 020770 0ustar00nicolabertoldistaff000000 000000 // $Id: dictionary.cpp 3640 2010-10-08 14:58:17Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include "mfstream.h" #include #include #include #include #include #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "index.h" #include "util.h" using namespace std; dictionary::dictionary(char *filename,int size, float lf) { if (lf<=0.0) lf=DICTIONARY_LOAD_FACTOR; load_factor=lf; htb = new HASHTABLE_t((size_t) (size/load_factor)); tb = new dict_entry[size]; st = new strstack(size * 10); for (int i=0; i> setw(100) >> buffer; inp.close(); if ((strncmp(buffer,"dict",4)==0) || (strncmp(buffer,"DICT",4)==0)) load(filename); else generate(filename); cerr << "loaded \n"; } int dictionary::getword(fstream& inp , char* buffer) { while(inp >> setw(MAX_WORD) >> buffer) { //warn if the word is very long if (strlen(buffer)==(MAX_WORD-1)) { cerr << "getword: a very long word was read (" << buffer << ")\n"; } //skip words of length zero chars: why should this happen? 
if (strlen(buffer)==0) { cerr << "zero length word!\n"; continue; } return 1; } return 0; } void dictionary::generate(char *filename) { char buffer[MAX_WORD]; int counter=0; mfstream inp(filename,ios::in); if (!inp) { cerr << "cannot open " << filename << "\n"; exit(1); } cerr << "dict:"; ifl=1; while (getword(inp,buffer)) { incfreq(encode(buffer),1); if (!(++counter % 1000000)) cerr << "."; } ifl=0; cerr << "\n"; inp.close(); } void dictionary::augment(dictionary *d) { incflag(1); for (int i=0; in; i++) encode(d->decode(i)); incflag(0); encode(OOV()); } // print_curve: show statistics on dictionary growth and (optionally) on // OOV rates computed on test corpus void dictionary::print_curve(int curvesize, float* testOOV) { int* curve = new int[curvesize]; for (int i=0; i curvesize-1) curve[curvesize-1]++; else curve[tb[i].freq-1]++; } //cumulating results for (int i=curvesize-2; i>=0; i--) { curve[i] = curve[i] + curve[i+1]; } cout.setf(ios::fixed); cout << "Dict size: " << n << "\n"; cout << "**************** DICTIONARY GROWTH CURVE ****************\n"; cout << "Freq\tEntries\tPercent"; if(testOOV!=NULL) cout << "\t\tFreq\tOOV onTest"; cout << "\n"; for (int i=0; i" << i << "\t" << curve[i] << "\t" << setprecision(2) << (float)curve[i]/n * 100.0 << "%"; // display OOV rates on test if(testOOV!=NULL) cout << "\t\t<" << i+1<< "\t" << testOOV[i] << "%"; cout << "\n"; } cout << "*********************************************************\n"; } // // test : compute OOV rates on test corpus using dictionaries of different sizes // float* dictionary::test(int curvesize, const char *filename, int listflag) { int NwTest=0; int* OOVchart = new int[curvesize]; for (int j=0; j" << buffer << "\n"; } } else { if(freq < curvesize) OOVchart[freq]++; } NwTest++; if (!(++k % 1000000)) cerr << "."; } cerr << "\n"; inp.close(); cout << "nb words of test: " << NwTest << "\n"; // cumulating results for (int i=1; ipush(buffer); tb[n].code=n; if (freqflag) inp >> tb[n].freq; else 
tb[n].freq=0; //always insert without checking whether the word is already in if ((addr=htb->insert((char*)&tb[n].word))) { if (addr!=(char *)&tb[n].word) { cerr << "dictionary::loadtxt wrong entry was found (" << buffer << ") in position " << n << "\n"; // exit(1); continue; // continue loading dictionary } } N+=tb[n].freq; if (strcmp(buffer,OOV())==0) oov_code=n; if (++n==lim) grow(); } inp.close(); } void dictionary::load(std::istream& inp) { char buffer[MAX_WORD]; char *addr; int size; inp >> size; for (int i=0; i> setw(MAX_WORD) >> buffer; tb[n].word=st->push(buffer); tb[n].code=n; inp >> tb[n].freq; N+=tb[n].freq; //always insert without checking whether the word is already in if ((addr=htb->insert((char *)&tb[n].word))) { if (addr!=(char *)&tb[n].word) { cerr << "dictionary::loadtxt wrong entry was found (" << buffer << ") in position " << n << "\n"; exit(1); } } if (strcmp(tb[n].word,OOV())==0) oov_code=n; if (++n==lim) grow(); } inp.getline(buffer,MAX_WORD-1); } void dictionary::save(std::ostream& out) { out << n << "\n"; for (int i=0; ifreq-ae->freq) return be->freq-ae->freq; else return strcmp(ae->word,be->word); } dictionary::dictionary(dictionary* d,bool prune, int prunethresh) { assert(d!=NULL); //transfer values n=0; //total entries N=0; //total frequency load_factor=d->load_factor; //load factor lim=d->lim; //limit of entries oov_code=-1; //code od oov must be re-defined ifl=0; //increment flag=0; dubv=d->dubv; //dictionary upperbound transferred //creates a sorted copy of the table tb = new dict_entry[lim]; htb = new HASHTABLE_t((size_t) (lim/load_factor)); st = new strstack(lim * 10); //copy in the entries with frequency > threshold n=0; for (int i=0; in; i++) if (!prune || d->tb[i].freq>=prunethresh){ tb[n].code=n; tb[n].freq=d->tb[i].freq; tb[n].word=st->push(d->tb[i].word); htb->insert((char*)&tb[n].word); if (d->oov_code==i) oov_code=n; //reassign oov_code N+=tb[n].freq; n++; } }; void dictionary::sort() { if (htb != NULL ) delete htb; htb = 
new HASHTABLE_t((int) (lim/load_factor)); //sort all entries according to frequency cerr << "sorting dictionary ..."; qsort(tb,n,sizeof(dict_entry),cmpdictentry); cerr << "done\n"; for (int i=0; iinsert((char*)&tb[i].word); }; } dictionary::~dictionary() { delete htb; delete st; delete [] tb; } void dictionary::stat() { cout << "dictionary class statistics\n"; cout << "size " << n << " used memory " << (lim * sizeof(int) + htb->used() + st->used())/1024 << " Kb\n"; } void dictionary::grow() { delete htb; cerr << "+\b"; int newlim=(int) (lim*GROWTH_STEP); dict_entry *tb2=new dict_entry[newlim]; memcpy(tb2,tb,sizeof(dict_entry) * lim ); delete [] tb; tb=tb2; htb=new HASHTABLE_t((size_t) ((newlim)/load_factor)); for (int i=0; iinsert((char*)&tb[i].word); } for (int i=lim; ifind((char *)&w); if (ptr==NULL) return -1; return ptr->code; } int dictionary::encode(const char *w) { //case of strange characters if (strlen(w)==0) { cerr << "0"; w=OOV(); } dict_entry* ptr; if ((ptr=(dict_entry *)htb->find((char *)&w))!=NULL) return ptr->code; else { if (!ifl) { //do not extend dictionary if (oov_code==-1) { //did not use OOV yet cerr << "starting to use OOV words [" << w << "]\n"; tb[n].word=st->push(OOV()); htb->insert((char *)&tb[n].word); tb[n].code=n; tb[n].freq=0; oov_code=n; if (++n==lim) grow(); } return encode(OOV()); } else { //extend dictionary tb[n].word=st->push((char *)w); htb->insert((char*)&tb[n].word); tb[n].code=n; tb[n].freq=0; if (++n==lim) grow(); return n-1; } } } const char *dictionary::decode(int c) { if (c>=0 && c < n) return tb[c].word; else { cerr << "decode: code out of boundary\n"; return OOV(); } } dictionary_iter::dictionary_iter(dictionary *dict) : m_dict(dict) { m_dict->scan(HT_INIT); } dict_entry* dictionary_iter::next() { return (dict_entry*) m_dict->scan(HT_CONT); } /* main(int argc,char **argv){ dictionary d(argv[1],40000); d.stat(); cout << "ROMA" << d.decode(0) << "\n"; cout << "ROMA:" << d.encode("ROMA") << "\n"; d.save(argv[2]); } */ 
irstlm-5.80.03/src/dictionary.h000644 000766 000024 00000012425 12030777462 020460 0ustar00nicolabertoldistaff000000 000000 // $Id: dictionary.h 3679 2010-10-13 09:10:01Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #ifndef MF_DICTIONARY_H #define MF_DICTIONARY_H #include "mfstream.h" #include "htable.h" #include #include #define MAX_WORD 1000 #define DICTIONARY_LOAD_FACTOR 2.0 #ifndef GROWTH_STEP #define GROWTH_STEP 1.5 #endif #ifndef DICT_INITSIZE #define DICT_INITSIZE 100000 #endif //Begin of sentence symbol #ifndef BOS_ #define BOS_ "" #endif //End of sentence symbol #ifndef EOS_ #define EOS_ "" #endif //End of document symbol #ifndef BOD_ #define BOD_ "" #endif //End of document symbol #ifndef EOD_ #define EOD_ "" #endif //Out-Of-Vocabulary symbol #ifndef OOV_ #define OOV_ "" #endif typedef struct { const char *word; int code; long long freq; } dict_entry; typedef htable HASHTABLE_t; class strstack; class dictionary { strstack *st; //!< stack of strings dict_entry *tb; //!< entry table HASHTABLE_t *htb; //!< hash table int n; //!< number of entries long 
long N; //!< total frequency int lim; //!< limit of entries int oov_code; //!< code assigned to oov words char ifl; //!< increment flag int dubv; //!< dictionary size upper bound float load_factor; //!< dictionary loading factor char* oov_str; //!< oov string public: friend class dictionary_iter; dictionary* oovlex; //=0?v:oov_code); } inline int incflag() { return ifl; } inline int incflag(int v) { return ifl=v; } int getword(fstream& inp , char* buffer); int isprintable(char* w) { char buffer[MAX_WORD]; sprintf(buffer,"%s",w); return strcmp(w,buffer)==0; } inline void genoovcode() { int c=encode(OOV()); std::cerr << "OOV code is "<< c << std::endl; oovcode(c); } inline void genBoScode() { int c=encode(BoS()); std::cerr << "BoS code is "<< c << std::endl; } inline void genEoScode() { int c=encode(EoS()); std::cerr << "EoS code is "<< c << std::endl; } inline int setoovrate(double oovrate) { encode(OOV()); //be sure OOV code exists int oovfreq=(int)(oovrate * totfreq()); std::cerr << "setting OOV rate to: " << oovrate << " -- freq= " << oovfreq << std::endl; return freq(oovcode(),oovfreq); } inline long long incfreq(int code,long long value) { N+=value; return tb[code].freq+=value; } inline long long multfreq(int code,double value) { N+=(long long)(value * tb[code].freq)-tb[code].freq; return tb[code].freq=(long long)(value * tb[code].freq); } inline long freq(int code,long long value=-1) { if (value>=0) { N+=value-tb[code].freq; tb[code].freq=value; } return tb[code].freq; } inline long long totfreq() { return N; } inline float set_load_factor(float value) { return load_factor=value; } void grow(); void sort(); dictionary(char *filename,int size=DICT_INITSIZE,float lf=DICTIONARY_LOAD_FACTOR); dictionary(dictionary* d, bool prune=false,int prunethresh=0); //make a copy and eventually filter out unfrequent words ~dictionary(); void generate(char *filename); void load(char *filename); void save(char *filename, int freqflag=0); void load(std::istream& fd); void 
save(std::ostream& fd); void augment(dictionary *d); int size() { return n; } int getcode(const char *w); int encode(const char *w); const char *decode(int c); void stat(); void print_curve(int curvesize, float* testOOV=NULL); float* test(int curvesize, const char *filename, int listflag=0); // return OOV statistics computed on test set void cleanfreq() { for (int i=0; iscan(action); } }; class dictionary_iter { public: dictionary_iter(dictionary *dict); dict_entry* next(); private: dictionary* m_dict; }; #endif irstlm-5.80.03/src/doc.cpp000755 000766 000024 00000011122 12013405172 017372 0ustar00nicolabertoldistaff000000 000000 /****************************************************************************** IrstLM: IRST Language Model Toolkit, compile LM Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ using namespace std; #include #include #include "mfstream.h" #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "doc.h" doc::doc(dictionary* d,char* docfname) { dict=d; n=0; m=0; V=new int[dict->size()]; N=new int[dict->size()]; T=new int[dict->size()]; cd=-1; dfname=docfname; df=NULL; }; doc::~doc() { delete [] V; delete [] N; delete [] T; } int doc::open() { df=new mfstream(dfname,ios::in); char header[100]; df->getline(header,100); if (sscanf(header,"DoC %d",&n) && n>0) binary=true; else if (sscanf(header,"%d",&n) && n>0) binary=false; else { cerr << "doc::open error wrong header\n"; exit(0); } cerr << "opening: " << n << (binary?" bin-":" txt-") << "docs\n"; cd=-1; return 1; } int doc::reset() { cd=-1; m=0; df->close(); delete df; open(); return 1; } int doc::read() { if (cd >=(n-1)) return 0; m=0; for (int i=0; isize(); i++) N[i]=0; if (binary) { df->read((char *)&m,sizeof(int)); df->read((char *)V,m * sizeof(int)); df->read((char *)T,m * sizeof(int)); for (int i=0; iencode(dict->EoD()); int bod=dict->encode(dict->BoD()); ngram ng(dict); while((*df) >> ng) { if (ng.size>0) { if (*ng.wordp(1)==bod) { ng.size=0; continue; } if (*ng.wordp(1)==eod) { ng.size=0; break; } N[*ng.wordp(1)]++; if (N[*ng.wordp(1)]==1)V[m++]=*ng.wordp(1); } } } cd++; return 1; } int doc::savernd(char* fname,int num) { assert((df!=NULL) && (cd==-1)); srand(100); mfstream out(fname,ios::out); out << "DoC\n"; out.write((const char*) &n,sizeof(int)); cerr << "n=" << n << "\n"; //first select num random docs char taken[n]; int r; for (int i=0; i #include "util.h" #include #include "mfstream.h" #include "mempool.h" #include "htable.h" #include "dictionary.h" #include 
"n_gram.h" #include "ngramtable.h" #include "cmd.h" #define YES 1 #define NO 0 void print_help(int TypeFlag=0){ std::cerr << std::endl << "dtsel - performs data selection" << std::endl; std::cerr << std::endl << "USAGE:" << std::endl << " dtsel -s= [options]" << std::endl; std::cerr << std::endl << "OPTIONS:" << std::endl; FullPrintParams(TypeFlag, 0, 1, stderr); } void usage(const char *msg = 0) { if (msg){ std::cerr << msg << std::endl; } else{ print_help(); } exit(1); } double prob(ngramtable* ngt,ngram ng,int size,int cv){ double fstar,lambda; assert(size<=ngt->maxlevel() && size<=ng.size); if (size>1){ ngram history=ng; if (ngt->get(history,size,size-1) && history.freq>cv){ fstar=0.0; if (ngt->get(ng,size,size)){ cv=(cv>ng.freq)?ng.freq:cv; if (ng.freq>cv){ fstar=(double)(ng.freq-cv)/(double)(history.freq -cv + history.succ); lambda=(double)history.succ/(double)(history.freq -cv + history.succ); }else //ng.freq==cv lambda=(double)(history.succ-1)/(double)(history.freq -cv + history.succ-1); } else lambda=(double)history.succ/(double)(history.freq -cv + history.succ); return fstar + lambda * prob(ngt,ng,size-1,cv); } else return prob(ngt,ng,size-1,cv); }else{ //unigram branch if (ngt->get(ng,1,1) && ng.freq>cv) return (double)(ng.freq-cv)/(ngt->totfreq()-1); else{ //cerr << "backoff to oov unigram " << ng.freq << " " << cv << "\n"; *ng.wordp(1)=ngt->dict->oovcode(); if (ngt->get(ng,1,1) && ng.freq>0) return (double)ng.freq/ngt->totfreq(); else //use an automatic estimate of Pr(oov) return (double)ngt->dict->size()/(ngt->totfreq()+ngt->dict->size()); } } } double computePP(ngramtable* train,ngramtable* test,double oovpenalty,double& oovrate,int cv=0){ ngram ng2(test->dict);ngram ng1(train->dict); int N=0; double H=0; oovrate=0; test->scan(ng2,INIT,test->maxlevel()); while(test->scan(ng2,CONT,test->maxlevel())) { ng1.trans(ng2); H-=log(prob(train,ng1,ng1.size,cv)); if (*ng1.wordp(1)==train->dict->oovcode()){ H-=oovpenalty; oovrate++; } N++; } oovrate/=N; return 
exp(H/N); } int main(int argc, char **argv) { char *indom=NULL; //indomain data: one sentence per line char *outdom=NULL; //domain data: one sentence per line char *scorefile=NULL; //score file char *evalset=NULL; //evalset to measure performance int minfreq=2; //frequency threshold for dictionary pruning (optional) int ngsz=0; // n-gram size int dub=10000000; //upper bound of true vocabulary int model=2; //data selection model: 1 only in-domain cross-entropy, //2 cross-entropy difference. int cv=1; //cross-validation parameter: 1 only in-domain cross-entropy, int blocksize=100000; //block-size in words int verbose=0; int useindex=0; //provided score file includes and index double convergence_treshold=0; bool help=false; DeclareParams((char*) "min-word-freq", CMDINTTYPE|CMDMSG, &minfreq, "frequency threshold for dictionary pruning, default: 2", "f", CMDINTTYPE|CMDMSG, &minfreq, "frequency threshold for dictionary pruning, default: 2", "ngram-order", CMDSUBRANGETYPE|CMDMSG, &ngsz, 1 , MAX_NGRAM, "n-gram default size, default: 0", "n", CMDSUBRANGETYPE|CMDMSG, &ngsz, 1 , MAX_NGRAM, "n-gram default size, default: 0", "in-domain-file", CMDSTRINGTYPE|CMDMSG, &indom, "indomain data file: one sentence per line", "i", CMDSTRINGTYPE|CMDMSG, &indom, "indomain data file: one sentence per line", "out-domain-file", CMDSTRINGTYPE|CMDMSG, &outdom, "domain data file: one sentence per line", "o", CMDSTRINGTYPE|CMDMSG, &outdom, "domain data file: one sentence per line", "score-file", CMDSTRINGTYPE|CMDMSG, &scorefile, "score output file", "s", CMDSTRINGTYPE|CMDMSG, &scorefile, "score output file", "dictionary-upper-bound", CMDINTTYPE|CMDMSG, &dub, "upper bound of true vocabulary, default: 10000000", "dub", CMDINTTYPE|CMDMSG, &dub, "upper bound of true vocabulary, default: 10000000", "model", CMDSUBRANGETYPE|CMDMSG, &model, 1 , 2, "data selection model: 1 only in-domain cross-entropy, 2 cross-entropy difference; default: 2", "m", CMDSUBRANGETYPE|CMDMSG, &model, 1 , 2, "data selection 
model: 1 only in-domain cross-entropy, 2 cross-entropy difference; default: 2", "cross-validation", CMDSUBRANGETYPE|CMDMSG, &cv, 1 , 3, "cross-validation parameter: 1 only in-domain cross-entropy; default: 1", "cv", CMDSUBRANGETYPE|CMDMSG, &cv, 1 , 3, "cross-validation parameter: 1 only in-domain cross-entropy; default: 1", "test", CMDSTRINGTYPE|CMDMSG, &evalset, "evaluation set file to measure performance", "t", CMDSTRINGTYPE|CMDMSG, &evalset, "evaluation set file to measure performance", "block-size", CMDINTTYPE|CMDMSG, &blocksize, "block-size in words, default: 100000", "bs", CMDINTTYPE|CMDMSG, &blocksize, "block-size in words, default: 100000", "convergence-threshold", CMDDOUBLETYPE|CMDMSG, &convergence_treshold, "convergence threshold, default: 0", "c", CMDDOUBLETYPE|CMDMSG, &convergence_treshold, "convergence threshold, default: 0", "index", CMDSUBRANGETYPE|CMDMSG, &useindex,0,1, "provided score file includes and index, default: 0", "x", CMDSUBRANGETYPE|CMDMSG, &useindex,0,1, "provided score file includes and index, default: 0", "verbose", CMDSUBRANGETYPE|CMDMSG, &verbose,0,2, "verbose level, default: 0", "v", CMDSUBRANGETYPE|CMDMSG, &verbose,0,2, "verbose level, default: 0", "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help", "h", CMDBOOLTYPE|CMDMSG, &help, "print this help", (char *)NULL ); GetParams(&argc, &argv, (char*) NULL); if (help){ usage(); } if (scorefile==NULL) { usage(); } if (!evalset && (!indom || !outdom)){ cerr <<"Must specify in-domain and out-domain data files\n"; exit(1); }; //score file is always required: either as output or as input if (!scorefile){ cerr <<"Must specify score file\n"; exit(1); }; if (!evalset && !model){ cerr <<"Must specify data selection model\n"; exit(1); } if (evalset && (convergence_treshold<0 || convergence_treshold > 0.1)){ cerr <<"Convergence threshold must be between 0 and 0.1. 
\n"; exit(1); } TABLETYPE table_type=COUNT; if (!evalset){ //computed dictionary on indomain data dictionary *dict = new dictionary(indom,1000000,0); dictionary *pd=new dictionary(dict,true,minfreq); delete dict;dict=pd; //build in-domain table restricted to the given dictionary ngramtable *indngt=new ngramtable(indom,ngsz,NULL,dict,NULL,0,0,NULL,0,table_type); double indoovpenalty=-log(dub-indngt->dict->size()); ngram indng(indngt->dict); int indoovcode=indngt->dict->oovcode(); //build out-domain table restricted to the in-domain dictionary char command[1000]=""; if (useindex) sprintf(command,"cut -d \" \" -f 2- %s",outdom); else sprintf(command,"%s",outdom); ngramtable *outdngt=new ngramtable(command,ngsz,NULL,dict,NULL,0,0,NULL,0,table_type); double outdoovpenalty=-log(dub-outdngt->dict->size()); ngram outdng(outdngt->dict); int outdoovcode=outdngt->dict->oovcode(); cerr << "dict size idom: " << indngt->dict->size() << " odom: " << outdngt->dict->size() << "\n"; cerr << "oov penalty idom: " << indoovpenalty << " odom: " << outdoovpenalty << "\n"; //go through the odomain sentences int bos=dict->encode(dict->BoS()); mfstream inp(outdom,ios::in); ngram ng(dict); mfstream txt(outdom,ios::in); mfstream output(scorefile,ios::out); int linenumber=1; string line; int lenght=0;float deltaH=0; float deltaHoov=0; int words=0;string index; while (getline(inp,line)){ istringstream lninp(line); linenumber++; if (useindex) lninp >> index; // reset ngram at begin of sentence ng.size=1; deltaH=0;deltaHoov=0; lenght=0; while(lninp>>ng){ if (*ng.wordp(1)==bos) continue; lenght++; words++; if ((words % 1000000)==0) cerr << "."; if (ng.size>ngsz) ng.size=ngsz; indng.trans(ng);outdng.trans(ng); if (model==1){//compute cross-entropy deltaH-=log(prob(indngt,indng,indng.size,0)); deltaHoov-=(*indng.wordp(1)==indoovcode?indoovpenalty:0); } if (model==2){ //compute cross-entropy difference deltaH+=log(prob(outdngt,outdng,outdng.size,cv))-log(prob(indngt,indng,indng.size,0)); 
deltaHoov+=(*outdng.wordp(1)==outdoovcode?outdoovpenalty:0)-(*indng.wordp(1)==indoovcode?indoovpenalty:0); } } output << (deltaH + deltaHoov)/lenght << " " << line << "\n"; } } else{ //build in-domain LM from evaluation set ngramtable *tstngt=new ngramtable(evalset,ngsz,NULL,NULL,NULL,0,0,NULL,0,table_type); //build empty out-domain LM ngramtable *outdngt=new ngramtable(NULL,ngsz,NULL,NULL,NULL,0,0,NULL,0,table_type); //if indomain data is passed then limit comparison to its dictionary dictionary *dict = NULL; if (indom){ cerr << "dtsel: limit evaluation dict to indomain words with freq >=" << minfreq << "\n"; //computed dictionary on indomain data dict = new dictionary(indom,1000000,0); dictionary *pd=new dictionary(dict,true,minfreq); delete dict;dict=pd; outdngt->dict=dict; } dictionary* outddict=outdngt->dict; //get codes of , and UNK outddict->incflag(1); int bos=outddict->encode(outddict->BoS()); int oov=outddict->encode(outddict->OOV()); outddict->incflag(0); outddict->oovcode(oov); double oldPP=dub; double newPP=0; double oovrate=0; long totwords=0; long totlines=0; long nextstep=blocksize; double score; string index; mfstream outd(scorefile,ios::in); string line; //initialize n-gram ngram ng(outdngt->dict); for (int i=1;iincflag(1); while (getline(outd,line)){ istringstream lninp(line); //skip score and eventually the index lninp >> score; if (useindex) lninp >> index; while (lninp >> ng){ if (*ng.wordp(1) == bos) continue; if (ng.size>ngsz) ng.size=ngsz; outdngt->put(ng); totwords++; } totlines++; if (totwords>=nextstep){ //if block is complete if (!dict) outddict->incflag(0); newPP=computePP(outdngt,tstngt,-log(dub-outddict->size()),oovrate); if (!dict) outddict->incflag(1); cout << totwords << " " << newPP; if (verbose) cout << " " << totlines << " " << oovrate; cout << "\n"; if (convergence_treshold && (oldPP-newPP)/oldPP < convergence_treshold) return 1; oldPP=newPP; nextstep+=blocksize; } } if (!dict) outddict->incflag(0); 
newPP=computePP(outdngt,tstngt,-log(dub-outddict->size()),oovrate); cout << totwords << " " << newPP; if (verbose) cout << " " << totlines << " " << oovrate; } } irstlm-5.80.03/src/gzfilebuf.h000644 000766 000024 00000004722 12013405172 020254 0ustar00nicolabertoldistaff000000 000000 // $Id: gzfilebuf.h 236 2009-02-03 13:25:19Z nicolabertoldi $ #ifndef _GZFILEBUF_H_ #define _GZFILEBUF_H_ #include #include #include #include class gzfilebuf : public std::streambuf { public: gzfilebuf(const char *filename) { _gzf = gzopen(filename, "rb"); setg (_buff+sizeof(int), // beginning of putback area _buff+sizeof(int), // read position _buff+sizeof(int)); // end position } ~gzfilebuf() { gzclose(_gzf); } protected: virtual int_type overflow (int_type /* unused parameter: c */) { std::cerr << "gzfilebuf::overflow is not implemented" << std::endl;; throw; } // write multiple characters virtual std::streamsize xsputn (const char* /* unused parameter: s */, std::streamsize /* unused parameter: num */) { std::cerr << "gzfilebuf::xsputn is not implemented" << std::endl;; throw; } virtual std::streampos seekpos ( std::streampos /* unused parameter: sp */, std::ios_base::openmode /* unused parameter: which */= std::ios_base::in | std::ios_base::out ) { std::cerr << "gzfilebuf::seekpos is not implemented" << std::endl;; throw; } //read one character virtual int_type underflow () { // is read position before end of _buff? 
if (gptr() < egptr()) { return traits_type::to_int_type(*gptr()); } /* process size of putback area * - use number of characters read * - but at most four */ unsigned int numPutback = gptr() - eback(); if (numPutback > sizeof(int)) { numPutback = sizeof(int); } /* copy up to four characters previously read into * the putback _buff (area of first four characters) */ std::memmove (_buff+(sizeof(int)-numPutback), gptr()-numPutback, numPutback); // read new characters int num = gzread(_gzf, _buff+sizeof(int), _buffsize-sizeof(int)); if (num <= 0) { // ERROR or EOF return EOF; } // reset _buff pointers setg (_buff+(sizeof(int)-numPutback), // beginning of putback area _buff+sizeof(int), // read position _buff+sizeof(int)+num); // end of buffer // return next character return traits_type::to_int_type(*gptr()); } std::streamsize xsgetn (char* s, std::streamsize num) { return gzread(_gzf,s,num); } private: gzFile _gzf; static const unsigned int _buffsize = 1024; char _buff[_buffsize]; }; #endif irstlm-5.80.03/src/htable.cpp000644 000766 000024 00000004514 12013405172 020070 0ustar00nicolabertoldistaff000000 000000 // $Id: htable.cpp 3680 2010-10-13 09:10:21Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include #include "mempool.h" #include "htable.h" using namespace std; template <> void htable::set_keylen(int kl) { keylen=kl/sizeof(int); return; } template <> void htable::set_keylen(int kl) { keylen=kl; return; } template <> address htable::Hash(int* key) { address h; register int i; //Thomas Wang's 32 bit Mix Function for (i=0,h=0; i> 10); h += (h << 3); h ^= (h >> 6); h += ~(h << 11); h ^= (h >> 16); }; return h; } template <> address htable::Hash(char* key) { //actually char* key is a char**, i.e. a pointer to a char* char *Key = *(char**)key; int length=strlen(Key); register address h=0; register int i; for (i=0,h=0; i int htable::Comp(int *key1, int *key2) { assert(key1 && key2); register int i; for (i=0; i int htable::Comp(char *key1, char *key2) { assert(key1 && key2); char *Key1 = *(char**)key1; char *Key2 = *(char**)key2; assert(Key1 && Key2); return (strcmp(Key1,Key2)); } irstlm-5.80.03/src/htable.h000644 000766 000024 00000012431 12013405172 017532 0ustar00nicolabertoldistaff000000 000000 // $Id: htable.h 3680 2010-10-13 09:10:21Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. 
This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #ifndef MF_HTABLE_H #define MF_HTABLE_H using namespace std; #include #include #include #include "mempool.h" #define Prime1 37 #define Prime2 1048583 #define BlockSize 100 typedef unsigned int address; // Fast arithmetic, relying on powers of 2, // and on pre-processor concatenation property //use as template struct entry { T key; entry* next; // secret from user }; typedef enum {HT_FIND, //!< search: find an entry HT_ENTER, //!< search: enter an entry HT_INIT, //!< scan: start scan HT_CONT //!< scan: continue scan } HT_ACTION; //!T is the type of the key and should be (int*) or (char*) template class htable { int size; //!< table size int keylen; //!< key length entry **table; //!< hash table int scan_i; //!< scan support entry *scan_p; //!< scan support // statistics long keys; //!< # of entries long accesses; //!< # of accesses long collisions; //!< # of collisions mempool *memory; //!< memory pool public: //! Creates an hash table htable(int n,int kl=0); //! Destroys an and hash table ~htable(); void set_keylen(int kl); //! Computes the hash function address Hash(const T key); //! Compares the keys of two entries int Comp(const T Key1, const T Key2); //! Searches for an item T find(T item); T insert(T item); //! Scans the content T scan(HT_ACTION action); //! Prints statistics void stat(); //! Print a map of memory use void map(std::ostream& co=std::cout, int cols=80); //! 
Returns amount of used memory int used() { return size * sizeof(entry **) + memory->used(); } }; template htable::htable(int n,int kl) { memory=new mempool( sizeof(entry) , BlockSize ); table = new entry* [ size=n ]; memset(table,0,sizeof(entry *) * n ); set_keylen(kl); keys = accesses = collisions = 0; } template htable::~htable() { delete []table; delete memory; } template T htable::find(T key) { address h; entry *q,**p; accesses++; h = Hash(key); p=&table[h%size]; q=*p; /* Follow collision chain */ while (q != NULL && Comp(q->key,key)) { p = &(q->next); q = q->next; collisions++; } if (q != NULL) return q->key; /* found */ return NULL; } template T htable::insert(T key) { address h; entry *q,**p; accesses++; h = Hash(key); p=&table[h%size]; q=*p; /* Follow collision chain */ while (q != NULL && Comp(q->key,key)) { p = &(q->next); q = q->next; collisions++; } if (q != NULL) return q->key; /* found */ /* not found */ if ((q = (entry *)memory->allocate()) == NULL) /* no room */ return NULL; /* link into chain */ *p = q; /* Initialize new element */ q->key = key; q->next = NULL; keys++; return q->key; } template T htable::scan(HT_ACTION action) { T k; if (action == HT_INIT) { scan_i=0; scan_p=table[0]; return NULL; } // if scan_p==NULL go to the first non null pointer while ((scan_p==NULL) && (++scan_ikey; scan_p=(entry *)scan_p->next; return k; }; return NULL; } template void htable::map(ostream& co,int cols) { entry *p; char* img=new char[cols+1]; img[cols]='\0'; memset(img,'.',cols); co << "htable memory map: . 
(0 items), - (<5), # (>5)\n"; for (int i=0; i *)p->next; }; if (i && (i % cols)==0) { co << img << "\n"; memset(img,'.',cols); } if (n>0) img[i % cols]=n<=5?'-':'#'; } img[size % cols]='\0'; co << img << "\n"; delete []img; } template void htable::stat() { cerr << "htable class statistics\n"; cerr << "size " << size << " keys " << keys << " acc " << accesses << " coll " << collisions << " used memory " << used()/1024 << "Kb\n"; }; #endif irstlm-5.80.03/src/index.h000644 000766 000024 00000000402 12013405172 017375 0ustar00nicolabertoldistaff000000 000000 // $Id: index.h 236 2009-02-03 13:25:19Z nicolabertoldi $ #pragma once #ifdef WIN32 inline const char *index(const char *str, char search) { size_t i=0; while (i< strlen(str) ) { if (str[i]==search) return &str[i]; } return NULL; } #endif irstlm-5.80.03/src/interplm.cpp000644 000766 000024 00000026413 12114671302 020467 0ustar00nicolabertoldistaff000000 000000 /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ using namespace std; #include #include "mfstream.h" #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "mempool.h" #include "ngramcache.h" #include "ngramtable.h" #include "normcache.h" #include "interplm.h" void interplm::trainunigr() { int oov=dict->getcode(dict->OOV()); cerr << "oovcode: " << oov << "\n"; if (oov>=0 && dict->freq(oov)>= dict->size()) { cerr << "Using current estimate of OOV frequency " << dict->freq(oov)<< "\n"; } else { oov=dict->encode(dict->OOV()); dict->oovcode(oov); //choose unigram smoothing method according to //sample size //if (dict->totfreq()>100){ //witten bell //cerr << "select unigram smoothing: " << dict->totfreq() << "\n"; if (unismooth) { dict->incfreq(oov,dict->size()-1); cerr << "Witten-Bell estimate of OOV freq:"<< (double)(dict->size()-1)/dict->totfreq() << "\n"; } else { if (dict->dub()) { cerr << "DUB estimate of OOV size\n"; dict->incfreq(oov,dict->dub()-dict->size()+1); } else { cerr << "1 = estimate of OOV size\n"; dict->incfreq(oov,1); } } } } double interplm::unigr(ngram ng) { return ((double)(dict->freq(*ng.wordp(1))+epsilon))/ ((double)dict->totfreq() + (double) dict->size() * epsilon); } interplm::interplm(char *ngtfile,int depth,TABLETYPE tabtype): ngramtable(ngtfile,depth,NULL,NULL,NULL,0,0,NULL,0,tabtype) { if (maxlevel() int BoS=dict->encode(dict->BoS()); if (BoS != dict->oovcode()) { cerr << "setting counter of Begin of Sentence to 1 ..." 
<< "\n"; dict->freq(BoS,1); cerr << "start_sent: " << (char *)dict->decode(BoS) << " " << dict->freq(BoS) << "\n"; } }; void interplm::gensuccstat() { ngram hg(dict); int s1,s2; cerr << "Generating successor statistics\n"; for (int l=2; l<=lms; l++) { cerr << "level " << l << "\n"; scan(hg,INIT,l-1); while(scan(hg,CONT,l-1)) { s1=s2=0; ngram ng=hg; ng.pushc(0); succscan(hg,ng,INIT,l); while(succscan(hg,ng,CONT,l)) { // cerr << ng << "\n"; if (corrcounts && l=1; l--) { cerr << "level " << l << "\n"; ngram ng(dict); int count=0; //now update counts scan(ng,INIT,l+1); while(scan(ng,CONT,l+1)) { ngram ng2=ng; ng2.size--; if (get(ng2,ng2.size,ng2.size)) { if (!ng2.containsWord(dict->BoS(),1)) //counts number of different n-grams setfreq(ng2.link,ng2.pinfo,1+getfreq(ng2.link,ng2.pinfo,1),1); else // use correct count for n-gram " w .. .. " //setfreq(ng2.link,ng2.pinfo,ng2.freq+getfreq(ng2.link,ng2.pinfo,1),1); setfreq(ng2.link,ng2.pinfo,ng2.freq,1); } else { assert(lms==l+1); cerr << "cannot find2 " << ng2 << "count " << count << "\n"; cerr << "inserting ngram and starting from scratch\n"; ng2.pushw(dict->BoS()); ng2.freq=100; put(ng2); cerr << "reset all counts at last level\n"; scan(ng2,INIT,lms-1); while(scan(ng2,CONT,lms-1)) { setfreq(ng2.link,ng2.pinfo,0,1); } gencorrcounts(); return; } } } cerr << "Updating history counts\n"; for (int l=lms-2; l>=1; l--) { cerr << "level " << l << "\n"; cerr << "reset counts\n"; ngram ng(dict); scan(ng,INIT,l); while(scan(ng,CONT,l)) { freq(ng.link,ng.pinfo,0); } scan(ng,INIT,l+1); while(scan(ng,CONT,l+1)) { ngram ng2=ng; get(ng2,l+1,l); freq(ng2.link,ng2.pinfo,freq(ng2.link,ng2.pinfo)+getfreq(ng.link,ng.pinfo,1)); } } cerr << "Adding unigram of OOV word if missing\n"; ngram ng(dict,maxlevel()); for (int i=1; i<=maxlevel(); i++) *ng.wordp(i)=dict->oovcode(); if (!get(ng,lms,1)) { // oov is missing in the ngram-table // f(oov) = dictionary size (Witten Bell) ng.freq=dict->size(); cerr << "adding oov unigram " << ng << "\n"; put(ng); 
get(ng,lms,1); setfreq(ng.link,ng.pinfo,ng.freq,1); } cerr << "Replacing unigram of BoS \n"; if (dict->encode(dict->BoS()) != dict->oovcode()) { ngram ng(dict,1); *ng.wordp(1)=dict->encode(dict->BoS()); if (get(ng,1,1)) { ng.freq=1; //putting Pr()=0 would create problems!! setfreq(ng.link,ng.pinfo,ng.freq,1); } } cerr << "compute unigram totfreq \n"; int totf=0; scan(ng,INIT,1); while(scan(ng,CONT,1)) { totf+=getfreq(ng.link,ng.pinfo,1); } btotfreq(totf); corrcounts=1; } /* void gencorrcounts2(){ cerr << "Generating corrected n-gram tables\n"; for (int l=lms-1;l>=1;l--){ cerr << "level " << l << "\n"; // tb[l]=new ngramtable(NULL,l,NULL,NULL); // tb[l]->dict=dict; //dict must be the same ngram ng(dict); int count=0; tb[l+1]->scan(ng,INIT,l+1); while(tb[l+1]->scan(ng,CONT,l+1)){ count++; //generate tables according to Chen & Goodman, 1998 // cerr << ng << "\n"; ng.size--; if (!ng.containsWord(dict->BoS(),1)) ng.freq=1; //tb[l]->put(ng); //cerr << ng << "\n"; //tb[l]->update(ng); ngram ng2=ng; if (tb[l]->get(ng2,ng2.size,ng2.size)){ tb[l]->freq(ng2.link,ng2.info,0); } else{ cerr << "cannot find " << ng2 << "count " << count << "\n"; exit(1); } ng.size++; } //add unigram of OOV word if missing if (l==1){ ngram oovw(dict,1); *oovw.wordp(1)=dict->oovcode(); if (!tb[1]->get(oovw,1,1)){ oovw.freq=dict->freq(dict->oovcode()); cerr << "adding oov unigram " << oovw << "\n"; tb[1]->put(oovw); } } } exit(1); } */ double interplm::zerofreq(int lev) { cerr << "Computing lambda: ..."; ngram ng(dict); double N=0,N1=0; scan(ng,INIT,lev); while(scan(ng,CONT,lev)) { if ((lev==1) && (*ng.wordp(1)==dict->oovcode())) continue; N+=ng.freq; if (ng.freq==1) N1++; } cerr << (double)(N1/N) << "\n"; return N1/N; } void interplm::test(char* filename,int size,int backoff,int checkpr,char* outpr) { if (size>lmsize()) { cerr << "test: wrong ngram size\n"; exit(1); } mfstream inp(filename,ios::in ); char header[100]; inp >> header; inp.close(); if (strncmp(header,"nGrAm",5)==0 || 
strncmp(header,"NgRaM",5)==0) { ngramtable ngt(filename,size,NULL,NULL,NULL,0,0,NULL,0,COUNT); test_ngt(ngt,size,backoff,checkpr); } else test_txt(filename,size,backoff,checkpr,outpr); } void interplm::test_txt(char* filename,int size,int /* unused parameter: backoff */,int checkpr,char* outpr) { cerr << "test text " << filename << " "; mfstream inp(filename,ios::in ); ngram ng(dict); double n=0,lp=0,pr; double oov=0; cout.precision(10); mfstream outp(outpr?outpr:"/dev/null",ios::out ); if (checkpr) cerr << "checking probabilities\n"; while(inp >> ng) if (ng.size>=1) { ng.size=ng.size>size?size:ng.size; if (dict->encode(dict->BoS()) != dict->oovcode()) { if (*ng.wordp(1) == dict->encode(dict->BoS())) { ng.size=1; //reset n-grams starting with BoS continue; } } pr=prob(ng,ng.size); if (outpr) outp << ng << "[" << ng.size << "-gram]" << " " << pr << " " << log(pr)/log(10.0) << std::endl; lp-=log(pr); n++; if (((int) n % 10000)==0) cerr << "."; if (*ng.wordp(1) == dict->oovcode()) oov++; if (checkpr) { double totp=0.0; int oldw=*ng.wordp(1); for (int c=0; csize(); c++) { *ng.wordp(1)=c; totp+=prob(ng,ng.size); } *ng.wordp(1)=oldw; if ( totp < (1.0 - 1e-5) || totp > (1.0 + 1e-5)) cout << ng << " " << pr << " [t="<< totp << "] ***\n"; } } if (oov && dict->dub()>obswrd()) lp += oov * log(dict->dub() - obswrd()); cout << "n=" << (int) n << " LP=" << (double) lp << " PP=" << exp(lp/n) << " OVVRate=" << (oov)/n //<< " OVVLEXRate=" << (oov-in_oov_list)/n // << " OOVPP=" << exp((lp+oovlp)/n) << "\n"; outp.close(); inp.close(); } void interplm::test_ngt(ngramtable& ngt,int sz,int /* unused parameter: backoff */,int checkpr) { double pr; int n=0,c=0; double lp=0; double oov=0; cout.precision(10); if (sz > ngt.maxlevel()) { cerr << "test_ngt: ngramtable has uncompatible size\n"; exit(1); } if (checkpr) cerr << "checking probabilities\n"; cerr << "Computing PP:"; ngram ng(dict); ngram ng2(ngt.dict); ngt.scan(ng2,INIT,sz); while(ngt.scan(ng2,CONT,sz)) { ng.trans(ng2); if 
(dict->encode(dict->BoS()) != dict->oovcode()) { if (*ng.wordp(1) == dict->encode(dict->BoS())) { ng.size=1; //reset n-grams starting with BoS continue; } } n+=ng.freq; pr=prob(ng,sz); lp-=(ng.freq * log(pr)); if (*ng.wordp(1) == dict->oovcode()) oov+=ng.freq; if (checkpr) { double totp=0.0; for (c=0; csize(); c++) { *ng.wordp(1)=c; totp+=prob(ng,sz); } if ( totp < (1.0 - 1e-5) || totp > (1.0 + 1e-5)) cout << ng << " " << pr << " [t="<< totp << "] ***\n"; } if ((++c % 100000)==0) cerr << "."; } //double oovlp=oov * log((double)(dict->dub() - obswrd())); if (oov && dict->dub()>obswrd()) lp+=oov * log((dict->dub() - obswrd())); cout << "n=" << (int) n << " LP=" << (double) lp << " PP=" << exp(lp/n) << " OVVRate=" << (oov)/n //<< " OVVLEXRate=" << (oov-in_oov_list)/n // << " OOVPP=" << exp((lp+oovlp)/n) << "\n"; cout.flush(); } /* main(int argc, char** argv){ dictionary d(argv[1]); shiftbeta ilm(&d,argv[2],3); ngramtable test(&d,argv[2],3); ilm.train(); cerr << "PP " << ilm.test(test) << "\n"; ilm.savebin("newlm.lm",3); } */ irstlm-5.80.03/src/interplm.h000644 000766 000024 00000006370 12114671302 020134 0ustar00nicolabertoldistaff000000 000000 /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

******************************************************************************/

// Basic Interpolated LM class

#ifndef MF_INTERPLM_H
#define MF_INTERPLM_H

// NOTE(review): presumably identifiers of the available smoothing schemes;
// none of them is used inside this header -- confirm against the callers.
#define SHIFT_BETA 1
#define SHIFT_ONE 2
#define SHIFT_ZERO 3
#define SHIFT_ONE_BETA 4
#define LINEAR_WB 5
#define LINEAR_GT 6
#define MIXTURE 7
#define MOD_SHIFT_BETA 8

// Base class of the interpolated language models. It extends ngramtable and
// provides default (do-nothing) implementations of the virtual hooks
// train/adapt/prob/boprob/discount/savebin/netsize that derived models
// override.
class interplm:public ngramtable
{

  int lms;

  double epsilon; //Bayes smoothing

  int unismooth; //0 Bayes, 1 Witten Bell

  int prune_singletons;
  int prune_top_singletons;

public:

  int backoff; //0 interpolation, 1 Back-off

  interplm(char* ngtfile,int depth=0,TABLETYPE tt=FULL);

  // Getter/setter in one: with the default flag (-1) the current value is
  // returned; any other flag is stored and then returned.
  int prunesingletons(int flag=-1) {
    return (flag==-1?prune_singletons:prune_singletons=flag);
  }

  // Same getter/setter convention as prunesingletons().
  int prunetopsingletons(int flag=-1) {
    return (flag==-1?prune_top_singletons:prune_top_singletons=flag);
  }

  void gencorrcounts();

  void gensuccstat();

  // Both overloads forward to the dictionary's dub().
  virtual int dub() {
    return dict->dub();
  }

  virtual int dub(int value) {
    return dict->dub(value);
  }

  // Stores v in unismooth and returns it.
  int setusmooth(int v=0) {
    return unismooth=v;
  }

  // Stores v in epsilon and returns it.
  double setepsilon(double v=1.0) {
    return epsilon=v;
  }

  ngramtable *unitbl;

  void trainunigr();

  double unigr(ngram ng);

  double zerofreq(int lev);

  // Returns lms.
  inline int lmsize() {
    return lms;
  }

  // Returns the size of the dictionary.
  inline int obswrd() {
    return dict->size();
  }

  // Default hooks: the base class does nothing and returns 0 / 0.0.
  virtual int train() {
    return 0;
  }

  virtual void adapt(char* /* unused parameter: ngtfile */, double /* unused parameter: w */) {}

  virtual double prob(ngram /* unused parameter: ng */,int /* unused parameter: size */) {
    return 0.0;
  }

  virtual double boprob(ngram /* unused parameter: ng */,int /* unused parameter: size */) {
    return 0.0;
  }

  void test_ngt(ngramtable& ngt,int sz=0,int backoff=0,int checkpr=0);

  void test_txt(char *filename,int sz=0,int backoff=0,int checkpr=0,char* outpr=NULL);

  void test(char* filename,int sz,int backoff=0,int checkpr=0,char* outpr=NULL);

  virtual int discount(ngram /* unused parameter: ng */,int /* unused parameter: size */,double& /* unused parameter: fstar */ ,double& /* unused parameter: lambda */,int /* unused parameter: cv*/=0) {
    return 0;
  }

  virtual int savebin(char* /* unused parameter: filename */,int /* unused parameter: lmsize=2 */) {
    return 0;
  }

  virtual int netsize() {
    return 0;
  }

  // Forwards to stat(level).
  void lmstat(int level) {
    stat(level);
  }

  virtual ~interplm() {}

};

#endif
irstlm-5.80.03/src/interpolate-lm.cpp000644 000766 000024 00000043722 12032511222 021564 0ustar00nicolabertoldistaff000000 000000
/******************************************************************************
IrstLM: IRST Language Model Toolkit, compile LM
Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ using namespace std; #include #include #include #include #include #include #include #include "cmd.h" #include "util.h" #include "math.h" #include "lmContainer.h" /********************************/ inline void error(const char* message) { std::cerr << message << "\n"; throw std::runtime_error(message); } lmContainer* load_lm(std::string file,int requiredMaxlev,int dub,int memmap, float nlf, float dlf); void print_help(int TypeFlag=0){ std::cerr << std::endl << "interpolate-lm - interpolates language models" << std::endl; std::cerr << std::endl << "USAGE:" << std::endl; std::cerr << " interpolate-lm [options] [lm-list-file.out]" << std::endl; std::cerr << std::endl << "DESCRIPTION:" << std::endl; std::cerr << " interpolate-lm reads a LM list file including interpolation weights " << std::endl; std::cerr << " with the format: N\\n w1 lm1 \\n w2 lm2 ...\\n wN lmN\n" << std::endl; std::cerr << " It estimates new weights on a development text, " << std::endl; std::cerr << " computes the perplexity on an evaluation text, " << std::endl; std::cerr << " computes probabilities of n-grams read from stdin." << std::endl; std::cerr << " It reads LMs in ARPA and IRSTLM binary format." 
<< std::endl; std::cerr << std::endl << "OPTIONS:" << std::endl; FullPrintParams(TypeFlag, 0, 1, stderr); } void usage(const char *msg = 0) { if (msg){ std::cerr << msg << std::endl; } else{ print_help(); } exit(1); } int main(int argc, char **argv) { char *slearn = NULL; char *seval = NULL; bool learn=false; bool score=false; bool sent_PP_flag = false; int order = 0; int debug = 0; int memmap = 0; int requiredMaxlev = 1000; int dub = 10000000; float ngramcache_load_factor = 0.0; float dictionary_load_factor = 0.0; bool help=false; std::vector files; DeclareParams((char*) "learn", CMDSTRINGTYPE|CMDMSG, &slearn, "learn optimal interpolation for text-file; default is false", "l", CMDSTRINGTYPE|CMDMSG, &slearn, "learn optimal interpolation for text-file; default is false", "order", CMDINTTYPE|CMDMSG, &order, "order of n-grams used in --learn (optional)", "o", CMDINTTYPE|CMDMSG, &order, "order of n-grams used in --learn (optional)", "eval", CMDSTRINGTYPE|CMDMSG, &seval, "computes perplexity of the specified text file", "e", CMDSTRINGTYPE|CMDMSG, &seval, "computes perplexity of the specified text file", "dub", CMDINTTYPE|CMDMSG, &dub, "dictionary upperbound to compute OOV word penalty: default 10^7", "score", CMDBOOLTYPE|CMDMSG, &score, "computes log-prob scores of n-grams from standard input", "s", CMDBOOLTYPE|CMDMSG, &score, "computes log-prob scores of n-grams from standard input", "debug", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0", "d", CMDINTTYPE|CMDMSG, &debug, "verbose output for --eval option; default is 0", "memmap", CMDINTTYPE|CMDMSG, &memmap, "uses memory map to read a binary LM", "mm", CMDINTTYPE|CMDMSG, &memmap, "uses memory map to read a binary LM", "sentence", CMDBOOLTYPE|CMDMSG, &sent_PP_flag, "computes perplexity at sentence level (identified through the end symbol)", "dict_load_factor", CMDFLOATTYPE|CMDMSG, &dictionary_load_factor, "sets the load factor for ngram cache; it should be a positive real value; default is 
0", "ngram_load_factor", CMDFLOATTYPE|CMDMSG, &ngramcache_load_factor, "sets the load factor for ngram cache; it should be a positive real value; default is false", "level", CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken", "lev", CMDINTTYPE|CMDMSG, &requiredMaxlev, "maximum level to load from the LM; if value is larger than the actual LM order, the latter is taken", "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help", "h", CMDBOOLTYPE|CMDMSG, &help, "print this help", (char *)NULL ); if (argc == 1){ usage(); } for(int i=1; i < argc; i++) { if(argv[i][0] != '-') files.push_back(argv[i]); } GetParams(&argc, &argv, (char*) NULL); if (help){ usage(); } if (files.size() > 2) { usage("Warning: Too many arguments"); } if (files.size() < 1) { usage("Warning: specify a LM list file to read from"); } std::string infile = files[0]; std::string outfile=""; if (files.size() == 1) { outfile=infile; //remove path information std::string::size_type p = outfile.rfind('/'); if (p != std::string::npos && ((p+1) < outfile.size())) outfile.erase(0,p+1); outfile+=".out"; } else outfile = files[1]; std::cerr << "inpfile: " << infile << std::endl; learn = ((slearn != NULL)? true : false); if (learn) std::cerr << "outfile: " << outfile << std::endl; if (score) std::cerr << "interactive: " << score << std::endl; if (memmap) std::cerr << "memory mapping: " << memmap << std::endl; std::cerr << "loading up to the LM level " << requiredMaxlev << " (if any)" << std::endl; std::cerr << "order: " << order << std::endl; if (requiredMaxlev > 0) std::cerr << "loading up to the LM level " << requiredMaxlev << " (if any)" << std::endl; std::cerr << "dub: " << dub<< std::endl; lmContainer *lmt[100], *start_lmt[100]; //interpolated language models std::string lmf[100]; //lm filenames float w[100]; //interpolation weights int N; //Loading Language Models` std::cerr << "Reading " << infile << "..." 
<< std::endl; std::fstream inptxt(infile.c_str(),std::ios::in); //std::string line; char line[BUFSIZ]; const char* words[3]; int tokenN; inptxt.getline(line,BUFSIZ,'\n'); tokenN = parseWords(line,words,3); if (tokenN != 2 || ((strcmp(words[0],"LMINTERPOLATION") != 0) && (strcmp(words[0],"lminterpolation")!=0))) error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1\nweight_of_LM_2 filename_of_LM_2"); N=atoi(words[1]); std::cerr << "Number of LMs: " << N << "..." << std::endl; if(N > 100) { std::cerr << "Can't interpolate more than 100 language models." << std::endl; exit(1); } for (int i=0; i lmt[i]->maxlevel())?maxorder:lmt[i]->maxlevel(); } if (order <= 0) { order = maxorder; std::cerr << "order is not set or wrongly set to a non positive value; reset to the maximum order of LMs: " << order << std::endl; } else if (order > maxorder) { order = maxorder; std::cerr << "order is too high; reset to the maximum order of LMs" << order << std::endl; } //Learning mixture weights if (learn) { std::vector p[N]; //LM probabilities float c[N]; //expected counts float den,norm; //inner denominator, normalization term float variation=1.0; // global variation between new old params dictionary* dict=new dictionary(slearn,1000000,dictionary_load_factor); ngram ng(dict); int bos=ng.dict->encode(ng.dict->BoS()); std::ifstream dev(slearn,std::ios::in); for(;;) { std::string line; getline(dev, line); if(dev.eof()) break; if(dev.fail()) { std::cerr << "Problem reading input file " << seval << std::endl; exit(1); } std::istringstream lstream(line); if(line.substr(0, 29) == "###interpolate-lm:replace-lm ") { std::string token, newlm; int id; lstream >> token >> id >> newlm; if(id <= 0 || id > N) { std::cerr << "LM id out of range." 
<< std::endl; return 1; } id--; // count from 0 now if(lmt[id] != start_lmt[id]) delete lmt[id]; lmt[id] = load_lm(newlm,requiredMaxlev,dub,memmap,ngramcache_load_factor,dictionary_load_factor); continue; } while(lstream >> ng) { // reset ngram at begin of sentence if (*ng.wordp(1)==bos) { ng.size=1; continue; } if (order > 0 && ng.size > order) ng.size=order; for (int i=0; igetDict()); ong.trans(ng); double logpr; logpr = lmt[i]->clprob(ong); //LM log-prob (using caches if available) p[i].push_back(pow(10.0,logpr)); } } for (int i=0; icheck_caches_levels(); } dev.close(); while( variation > 0.01 ) { for (int i=0; ic[i]?(w[i]-c[i]):(c[i]-w[i])); w[i]=c[i]; //update weights } std::cerr << "Variation " << variation << std::endl; } //Saving results std::cerr << "Saving in " << outfile << "..." << std::endl; //saving result std::fstream outtxt(outfile.c_str(),std::ios::out); outtxt << "LMINTERPOLATION " << N << "\n"; for (int i=0; iincflag(1); ngram ng(dict); int bos=ng.dict->encode(ng.dict->BoS()); int eos=ng.dict->encode(ng.dict->EoS()); std::fstream inptxt(seval,std::ios::in); for(;;) { std::string line; getline(inptxt, line); if(inptxt.eof()) break; if(inptxt.fail()) { std::cerr << "Problem reading input file " << seval << std::endl; return 1; } std::istringstream lstream(line); if(line.substr(0, 26) == "###interpolate-lm:weights ") { std::string token; lstream >> token; for(int i = 0; i < N; i++) { if(lstream.eof()) { std::cerr << "Not enough weights!" << std::endl; return 1; } lstream >> w[i]; } continue; } if(line.substr(0, 29) == "###interpolate-lm:replace-lm ") { std::string token, newlm; int id; lstream >> token >> id >> newlm; if(id <= 0 || id > N) { std::cerr << "LM id out of range." 
<< std::endl; return 1; } id--; // count from 0 now delete lmt[id]; lmt[id] = load_lm(newlm,requiredMaxlev,dub,memmap,ngramcache_load_factor,dictionary_load_factor); continue; } double bow; int bol=0; char *msp; unsigned int statesize; while(lstream >> ng) { // reset ngram at begin of sentence if (*ng.wordp(1)==bos) { ng.size=1; continue; } if (order > 0 && ng.size > order) ng.size=order; if (ng.size>=1) { int minbol=MAX_NGRAM; //minimum backoff level of the mixture bool OOV_all_flag=true; //OOV flag wrt all LM[i] bool OOV_any_flag=false; //OOV flag wrt any LM[i] float logpr; Pr = 0.0; for (i=0; igetDict()); ong.trans(ng); logpr = lmt[i]->clprob(ong,&bow,&bol,&msp,&statesize); //actual prob of the interpolation //logpr = lmt[i]->clprob(ong,&bow,&bol); //LM log-prob Pr+=w[i] * pow(10.0,logpr); //actual prob of the interpolation if (bol < minbol) minbol=bol; //backoff of LM[i] if (*ong.wordp(1) != lmt[i]->getDict()->oovcode()) OOV_all_flag=false; //OOV wrt LM[i] if (*ong.wordp(1) == lmt[i]->getDict()->oovcode()) OOV_any_flag=true; //OOV wrt LM[i] } lPr=log(Pr)/M_LN10; logPr+=lPr; sent_logPr+=lPr; if (debug==1) { std::cout << ng.dict->decode(*ng.wordp(1)) << " [" << ng.size-minbol << "]" << " "; if (*ng.wordp(1)==eos) std::cout << std::endl; } if (debug==2) std::cout << ng << " [" << ng.size-minbol << "-gram]" << " " << log(Pr) << std::endl; if (minbol) { Nbo++; //all LMs have back-offed by at least one sent_Nbo++; } if (OOV_all_flag) { Noov_all++; //word is OOV wrt all LM sent_Noov_all++; } if (OOV_any_flag) { Noov_any++; //word is OOV wrt any LM sent_Noov_any++; } Nw++; sent_Nw++; if (*ng.wordp(1)==eos && sent_PP_flag) { sent_PP=exp((-sent_logPr * log(10.0)) /sent_Nw); std::cout << "%% sent_Nw=" << sent_Nw << " sent_PP=" << sent_PP << " sent_Nbo=" << sent_Nbo << " sent_Noov=" << sent_Noov_all << " sent_OOV=" << (float)sent_Noov_all/sent_Nw * 100.0 << "%" << " sent_Noov_any=" << sent_Noov_any << " sent_OOV_any=" << (float)sent_Noov_any/sent_Nw * 100.0 << "%" << 
std::endl; //reset statistics for sentence based Perplexity sent_Nw=sent_Noov_any=sent_Noov_all=sent_Nbo=0; sent_logPr=0.0; } if ((Nw % 10000)==0) std::cerr << "."; } } } PP=exp((-logPr * M_LN10) /Nw); std::cout << "%% Nw=" << Nw << " PP=" << PP << " Nbo=" << Nbo << " Noov=" << Noov_all << " OOV=" << (float)Noov_all/Nw * 100.0 << "%" << " Noov_any=" << Noov_any << " OOV_any=" << (float)Noov_any/Nw * 100.0 << "%" << std::endl; }; if (score == true) { dictionary* dict=new dictionary(NULL,1000000,dictionary_load_factor); dict->incflag(1); // start generating the dictionary; ngram ng(dict); int bos=ng.dict->encode(ng.dict->BoS()); double Pr,logpr; double bow; int bol=0, maxbol=0; unsigned int maxstatesize, statesize; int i,n=0; std::cout << "> "; while(std::cin >> ng) { // reset ngram at begin of sentence if (*ng.wordp(1)==bos) { ng.size=1; continue; } if (ng.size>=maxorder) { if (order > 0 && ng.size > order) ng.size=order; n++; maxstatesize=0; maxbol=0; Pr=0.0; for (i=0; igetDict()); ong.trans(ng); logpr = lmt[i]->clprob(ong,&bow,&bol,NULL,&statesize); //LM log-prob (using caches if available) Pr+=w[i] * pow(10.0,logpr); //actual prob of the interpolation std::cout << "lm " << i << ":" << " logpr: " << logpr << " weight: " << w[i] << std::endl; if (maxbolcheck_caches_levels(); } } else { std::cout << ng << " p= NULL" << std::endl; } std::cout << "> "; } } for (int i=0; iCreateLanguageModel(file,nlf,dlf); lmt->setMaxLoadedLevel(requiredMaxlev); lmt->load(file,memmap); if (dub) lmt->setlogOOVpenalty((int)dub); //use caches to save time (only if PS_CACHE_ENABLE is defined through compilation flags) lmt->init_caches(lmt->maxlevel()); return lmt; } irstlm-5.80.03/src/linearlm.cpp000644 000766 000024 00000007513 12013405172 020436 0ustar00nicolabertoldistaff000000 000000 /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free 
software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include #include "mfstream.h" #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "mempool.h" #include "ngramtable.h" #include "ngramcache.h" #include "normcache.h" #include "interplm.h" #include "mdiadapt.h" #include "linearlm.h" // //Linear interpolated language model: Witten & Bell discounting scheme // linearwb::linearwb(char* ngtfile,int depth,int prunefreq,TABLETYPE tt): mdiadaptlm(ngtfile,depth,tt) { prunethresh=prunefreq; cerr << "PruneThresh: " << prunethresh << "\n"; }; int linearwb::train() { trainunigr(); gensuccstat(); return 1; } int linearwb::discount(ngram ng_,int size,double& fstar,double& lambda,int cv) { ngram ng(dict); ng.trans(ng_); if (size > 1) { ngram history=ng; if (ng.ckhisto(size) && get(history,size,size-1) && (history.freq>cv) && ((size < 3) || ((history.freq-cv) > prunethresh))) { // apply history pruning on trigrams only if (get(ng,size,size) && (!prunesingletons() || ng.freq>1 || size<3)) { // apply frequency pruning on trigrams only cv=(cv>ng.freq)?ng.freq:cv; if (ng.freq >cv) { fstar=(double)(ng.freq-cv)/(double)(history.freq -cv + history.succ); lambda=(double)history.succ/(double)(history.freq -cv + 
history.succ); if (size>=3 && prunesingletons()) // correction due to frequency pruning lambda+=(double)succ1(history.link)/(double)(history.freq -cv + history.succ); // succ1(history.link) is not affected when ng.freq > cv } else { // ng.freq == cv fstar=0.0; lambda=(double)(history.succ-1)/ // remove cv n-grams from data (double)(history.freq - cv + history.succ - 1); if (size>=3 && prunesingletons()) // correction due to frequency pruning lambda+=(double)succ1(history.link)-(cv==1 && ng.freq==1?1:0)/(double)(history.freq -cv + history.succ -1); } } else { fstar=0.0; lambda=(double)history.succ/(double)(history.freq + history.succ); if (size>=3 && prunesingletons()) // correction due to frequency pruning lambda+=(double)succ1(history.link)/(double)(history.freq + history.succ); } //cerr << "ngram :" << ng << "\n"; // if current word is OOV then back-off to unigrams! if (*ng.wordp(1)==dict->oovcode()) { lambda+=fstar; fstar=0.0; assert(lambda<=1 && lambda>0); } else { // add f*(oov|...) to lambda *ng.wordp(1)=dict->oovcode(); if (get(ng,size,size) && (!prunesingletons() || ng.freq>1 || size<3)) lambda+=(double)ng.freq/(double)(history.freq - cv + history.succ); } } else { fstar=0; lambda=1; } } else { fstar=unigr(ng); lambda=0; } return 1; } irstlm-5.80.03/src/linearlm.h000644 000766 000024 00000002533 12013405172 020100 0ustar00nicolabertoldistaff000000 000000 /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

******************************************************************************/

// Linear discounting interpolated LMs

//Witten and Bell linear discounting
// The class derives from mdiadaptlm and supplies train() and discount().
class linearwb: public mdiadaptlm
{
  int prunethresh;   // history-pruning threshold; the constructor copies prunefreq into it
  int minfreqthresh; // NOTE(review): not assigned by the constructor in linearlm.cpp -- confirm whether it is still needed
public:
  linearwb(char* ngtfile,int depth=0,int prunefreq=0,TABLETYPE tt=SHIFTBETA_B);
  // Runs trainunigr() and gensuccstat(); returns 1.
  int train();
  // Writes the discounted frequency into fstar and the back-off weight into
  // lambda for the n-gram ng evaluated at order `size`; returns 1.
  int discount(ngram ng,int size,double& fstar,double& lambda,int cv=0);
  ~linearwb() {}
};


//Good Turing linear discounting
//no more supported
irstlm-5.80.03/src/lmclass.cpp000644 000766 000024 00000016126 12013405172 020271 0ustar00nicolabertoldistaff000000 000000
// $Id: lmclass.cpp 3631 2010-10-07 12:04:12Z bertoldi $

/******************************************************************************
IrstLM: IRST Language Model Toolkit
Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include #include #include #include "math.h" #include "mempool.h" #include "htable.h" #include "ngramcache.h" #include "dictionary.h" #include "n_gram.h" #include "lmtable.h" #include "lmclass.h" #include "util.h" using namespace std; // local utilities: start int parseWords(char *sentence, const char **words, int max); inline void error(const char* message) { cerr << message << "\n"; throw runtime_error(message); } // local utilities: end lmclass::lmclass(float nlf, float dlfi):lmtable(nlf,dlfi) { MaxMapSize=1000000; MapScore= (double *)malloc(MaxMapSize*sizeof(double));// //array of probabilities memset(MapScore,0,MaxMapSize*sizeof(double)); MapScoreN=0; dict = new dictionary((char *)NULL,MaxMapSize); //word to cluster dictionary }; lmclass::~lmclass() { free (MapScore); delete dict; } void lmclass::load(const std::string filename,int memmap) { VERBOSE(2,"lmclass::load(const std::string filename,int memmap)" << std::endl); //get info from the configuration file fstream inp(filename.c_str(),ios::in|ios::binary); char line[MAX_LINE]; const char* words[MAX_TOKEN]; int tokenN; inp.getline(line,MAX_LINE,'\n'); tokenN = parseWords(line,words,MAX_TOKEN); if (tokenN != 2 || ((strcmp(words[0],"LMCLASS") != 0) && (strcmp(words[0],"lmclass")!=0))) error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCLASS LM_order\nfilename_of_LM\nfilename_of_map"); maxlev = atoi(words[1]); std::string lmfilename; if (inp.getline(line,MAX_LINE,'\n')) { tokenN = parseWords(line,words,MAX_TOKEN); lmfilename = words[0]; } else { error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCLASS 
LM_order\nfilename_of_LM\nfilename_of_map"); } std::string W2Cdict = ""; if (inp.getline(line,MAX_LINE,'\n')) { tokenN = parseWords(line,words,MAX_TOKEN); W2Cdict = words[0]; } else { error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMCLASS LM_order\nfilename_of_LM\nfilename_of_map"); } inp.close(); std::cerr << "lmfilename:" << lmfilename << std::endl; if (W2Cdict != "") { std::cerr << "mapfilename:" << W2Cdict << std::endl; } else { error((char*)"ERROR: you must specify a map!"); } // Load the (possibly binary) LM inputfilestream inpLM(lmfilename.c_str()); if (!inpLM.good()) { std::cerr << "Failed to open " << lmfilename << "!" << std::endl; exit(1); } lmtable::load(inpLM,lmfilename.c_str(),NULL,memmap); inputfilestream inW2C(W2Cdict); if (!inW2C.good()) { std::cerr << "Failed to open " << W2Cdict << "!" << std::endl; exit(1); } loadMap(inW2C); getDict()->genoovcode(); VERBOSE(2,"OOV code of lmclass is " << getDict()->oovcode() << " mapped into " << getMap(getDict()->oovcode())<< "\n"); getDict()->incflag(1); } void lmclass::loadMap(istream& inW2C) { double lprob=0.0; int howmany=0; const char* words[1 + LMTMAXLEV + 1 + 1]; //open input stream and prepare an input string char line[MAX_LINE]; dict->incflag(1); //can add to the map dictionary cerr<<"loadW2Cdict()...\n"; //save freq of EOS and BOS loadMapElement(dict->BoS(),lmtable::dict->BoS(),0.0); loadMapElement(dict->EoS(),lmtable::dict->EoS(),0.0); //should i add to the dict or just let the trans_freq handle loadMapElement(dict->OOV(),lmtable::dict->OOV(),0.0); while (inW2C.getline(line,MAX_LINE)) { if (strlen(line)==MAX_LINE-1) { cerr << "lmtable::loadW2Cdict: input line exceed MAXLINE (" << MAX_LINE << ") chars " << line << "\n"; exit(1); } howmany = parseWords(line, words, 4); //3 if(howmany == 3) { assert(sscanf(words[2], "%lf", &lprob)); lprob=(double)log10(lprob); } else if(howmany==2) { VERBOSE(3,"No score for the pair (" << words[0] << "," << words[1] << "); set to default 
1.0\n"); lprob=0.0; } else { cerr << "parseline: not enough entries" << line << "\n"; exit(1); } loadMapElement(words[0],words[1],lprob); //check if the are available position in MapScore checkMap(); } VERBOSE(2,"There are " << MapScoreN << " entries in the map\n"); dict->incflag(0); //can NOT add to the dictionary of lmclass } void lmclass::checkMap() { if (MapScoreN > MaxMapSize) { MaxMapSize=2*MapScoreN; MapScore = (double*) realloc(MapScore, sizeof(double)*(MaxMapSize)); VERBOSE(2,"In lmclass::checkMap(...) MaxMapSize=" << MaxMapSize << " MapScoreN=" << MapScoreN << "\n"); } } void lmclass::loadMapElement(const char* in, const char* out, double sc) { //freq of word (in) encodes the ID of the class (out) //save the probability associated with the pair (in,out) int wcode=dict->encode(in); dict->freq(wcode,lmtable::dict->encode(out)); MapScore[wcode]=sc; VERBOSE(3,"In lmclass::loadMapElement(...) in=" << in << " wcode=" << wcode << " out=" << out << " ccode=" << lmtable::dict->encode(out) << " MapScoreN=" << MapScoreN << "\n"); if (wcode >= MapScoreN) MapScoreN++; //increment size of the array MapScore if the element is new } double lmclass::lprob(ngram ong,double* bow, int* bol, char** maxsuffptr,unsigned int* statesize,bool* extendible) { double lpr=getMapScore(*ong.wordp(1)); VERBOSE(3,"In lmclass::lprob(...) Mapscore = " << lpr << "\n"); //convert ong to it's clustered encoding ngram mapped_ng(lmtable::getDict()); // mapped_ng.trans_freq(ong); mapping(ong,mapped_ng); lpr+=lmtable::clprob(mapped_ng,bow,bol,maxsuffptr,statesize, extendible); VERBOSE(3,"In lmclass::lprob(...) 
global prob = " << lpr << "\n"); return lpr; } void lmclass::mapping(ngram &in, ngram &out) { int insize = in.size; VERBOSE(3,"In lmclass::mapping(ngram &in, ngram &out) in = " << in << "\n"); // map the input sequence (in) into the corresponding output sequence (out), by applying the provided map for (int i=insize; i>0; i--) { out.pushc(getMap(*in.wordp(i))); } VERBOSE(3,"In lmclass::mapping(ngram &in, ngram &out) out = " << out << "\n"); return; } irstlm-5.80.03/src/lmclass.h000644 000766 000024 00000005615 12013405172 017737 0ustar00nicolabertoldistaff000000 000000 // $Id: lmclass.h 3461 2010-08-27 10:17:34Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #ifndef MF_LMCLASS_H #define MF_LMCLASS_H #ifndef WIN32 #include #include #endif #include "util.h" #include "ngramcache.h" #include "dictionary.h" #include "n_gram.h" #include "lmtable.h" #define MAX_TOKEN 2 class lmclass: public lmtable { dictionary *dict; // dictionary (words - macro tags) double *MapScore; int MapScoreN; int MaxMapSize; protected: void loadMap(std::istream& inp); void loadMapElement(const char* in, const char* out, double sc); void mapping(ngram &in, ngram &out); inline double getMapScore(int wcode) { //the input word is un-known by the map, so I "transform" this word into the oov (of the words) if (wcode >= MapScoreN) { wcode = getDict()->oovcode(); } return MapScore[wcode]; }; inline size_t getMap(int wcode) { //the input word is un-known by the map, so I "transform" this word into the oov (of the words) if (wcode >= MapScoreN) { wcode = getDict()->oovcode(); } return dict->freq(wcode); }; void checkMap(); public: lmclass(float nlf=0.0, float dlfi=0.0); ~lmclass(); void load(const std::string filename,int mmap=0); double lprob(ngram ng, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL); inline double clprob(ngram ng,double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL) { return lprob(ng,bow,bol,maxsuffptr,statesize,extendible); }; inline double clprob(int* ng, int ngsize, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL) { ngram ong(getDict()); ong.pushc(ng,ngsize); return lprob(ong,bow,bol,maxsuffptr,statesize,extendible); }; inline dictionary* getDict() const { return dict; } inline virtual void 
dictionary_incflag(const bool flag) { dict->incflag(flag); }; }; #endif irstlm-5.80.03/src/lmContainer.cpp000644 000766 000024 00000007361 12013405172 021107 0ustar00nicolabertoldistaff000000 000000 // $Id: lmContainer.cpp 3686 2010-10-15 11:55:32Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include #include #include #include "lmContainer.h" #include "lmtable.h" #include "lmmacro.h" #include "lmclass.h" #include "lmInterpolation.h" using namespace std; inline void error(const char* message) { std::cerr << message << "\n"; throw std::runtime_error(message); } lmContainer::lmContainer() { requiredMaxlev=1000; } int lmContainer::getLanguageModelType(std::string filename) { fstream inp(filename.c_str(),ios::in|ios::binary); if (!inp.good()) { std::cerr << "Failed to open " << filename << "!" 
<< std::endl; exit(1); } //give a look at the header to get informed about the language model type std::string header; inp >> header; inp.close(); VERBOSE(1,"LM header:|" << header << "|" << std::endl); int type=_IRSTLM_LMUNKNOWN; VERBOSE(1,"type: " << type << std::endl); if (header == "lmminterpolation" || header == "LMINTERPOLATION") { type = _IRSTLM_LMINTERPOLATION; } else if (header == "lmmacro" || header == "LMMACRO") { type = _IRSTLM_LMMACRO; } else if (header == "lmclass" || header == "LMCLASS") { type = _IRSTLM_LMCLASS; } else { type = _IRSTLM_LMTABLE; } VERBOSE(1,"type: " << type << std::endl); return type; }; lmContainer* lmContainer::CreateLanguageModel(const std::string infile, float nlf, float dlf) { int type = getLanguageModelType(infile); std::cerr << "Language Model Type of " << infile << " is " << type << std::endl; return CreateLanguageModel(type, nlf, dlf); } lmContainer* lmContainer::CreateLanguageModel(int type, float nlf, float dlf) { std::cerr << "Language Model Type is " << type << std::endl; lmContainer* lm=NULL; switch (type) { case _IRSTLM_LMTABLE: lm = new lmtable(nlf, dlf); break; case _IRSTLM_LMMACRO: lm = new lmmacro(nlf, dlf); break; case _IRSTLM_LMCLASS: lm = new lmclass(nlf, dlf); break; case _IRSTLM_LMINTERPOLATION: lm = new lmInterpolation(nlf, dlf); break; } if (lm == NULL) { std::cerr << "This language model type is unknown!" 
<< std::endl; exit(1); } lm->setLanguageModelType(type); return lm; } bool lmContainer::filter(const string sfilter, lmContainer*& sublmC, const string skeepunigrams) { if (lmtype == _IRSTLM_LMTABLE) { sublmC = sublmC->CreateLanguageModel(lmtype,((lmtable*) this)->GetNgramcacheLoadFactor(),((lmtable*) this)->GetDictioanryLoadFactor()); //let know that table has inverted n-grams sublmC->is_inverted(is_inverted()); sublmC->setMaxLoadedLevel(getMaxLoadedLevel()); sublmC->maxlevel(maxlevel()); bool res=((lmtable*) this)->filter(sfilter, (lmtable*) sublmC, skeepunigrams); return res; } return false; }; irstlm-5.80.03/src/lmContainer.h000644 000766 000024 00000010663 12114670667 020572 0ustar00nicolabertoldistaff000000 000000 // $Id: lmContainer.h 3686 2010-10-15 11:55:32Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #ifndef MF_LMCONTAINER_H #define MF_LMCONTAINER_H #define _IRSTLM_LMUNKNOWN 0 #define _IRSTLM_LMTABLE 1 #define _IRSTLM_LMMACRO 2 #define _IRSTLM_LMCLASS 3 #define _IRSTLM_LMINTERPOLATION 4 #include #include #include #include "util.h" #include "n_gram.h" #include "dictionary.h" typedef enum {BINARY,TEXT,YRANIB,NONE} OUTFILE_TYPE; class lmContainer { static const bool debug=true; protected: int lmtype; //auto reference to its own type int maxlev; //maximun order of sub LMs; int requiredMaxlev; //max loaded level, i.e. load up to requiredMaxlev levels public: lmContainer(); virtual ~lmContainer() {}; virtual void load(const std::string filename, int mmap=0) { UNUSED(filename); UNUSED(mmap); }; virtual void savetxt(const char *filename) { UNUSED(filename); }; virtual void savebin(const char *filename) { UNUSED(filename); }; virtual double getlogOOVpenalty() const { return 0.0; }; virtual double setlogOOVpenalty(int dub) { UNUSED(dub); return 0.0; }; virtual double setlogOOVpenalty(double oovp) { UNUSED(oovp); return 0.0; }; inline virtual dictionary* getDict() const { return NULL; }; inline virtual void maxlevel(int lev) { maxlev = lev; }; inline virtual int maxlevel() const { return maxlev; }; inline virtual void stat(int lev=0) { UNUSED(lev); }; inline virtual void setMaxLoadedLevel(int lev) { requiredMaxlev=lev; }; inline virtual int getMaxLoadedLevel() { return requiredMaxlev; }; virtual bool is_inverted(const bool flag) { UNUSED(flag); return false; }; virtual bool is_inverted() { return false; }; virtual double clprob(ngram ng, double* bow=NULL, int* bol=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL) { UNUSED(ng); UNUSED(bow); UNUSED(bol); 
UNUSED(maxsuffptr); UNUSED(statesize); UNUSED(extendible); return 0.0; }; virtual double clprob(int* ng, int ngsize, double* bow=NULL, int* bol=NULL, char** maxsuffptr=NULL, unsigned int* statesize=NULL,bool* extendible=NULL) { UNUSED(ng); UNUSED(ngsize); UNUSED(bow); UNUSED(bol); UNUSED(maxsuffptr); UNUSED(statesize); UNUSED(extendible); return 0.0; }; virtual void used_caches() {}; virtual void init_caches(int uptolev) { UNUSED(uptolev); }; virtual void check_caches_levels() {}; virtual void reset_caches() {}; virtual void reset_mmap() {}; inline void setLanguageModelType(int type) { lmtype=type; }; inline int getLanguageModelType() { return lmtype; }; int getLanguageModelType(std::string filename); inline virtual void dictionary_incflag(const bool flag) { UNUSED(flag); }; virtual bool filter(const string sfilter, lmContainer*& sublmt, const string skeepunigrams); lmContainer* CreateLanguageModel(const std::string infile, float nlf=0.0, float dlf=0.0); lmContainer* CreateLanguageModel(int type, float nlf=0.0, float dlf=0.0); inline virtual bool is_OOV(int code) { UNUSED(code); return false; }; inline bool is_lmt_cache_enabled(){ #ifdef LMT_CACHE_ENABLE return true; #endif return false; } inline bool is_ps_cache_enabled(){ #ifdef PS_CACHE_ENABLE return true; #endif return false; } inline bool is_cache_enabled(){ return is_lmt_cache_enabled() && is_ps_cache_enabled(); } }; #endif irstlm-5.80.03/src/lmInterpolation.cpp000644 000766 000024 00000020001 12013405172 021776 0ustar00nicolabertoldistaff000000 000000 // $Id: lmInterpolation.cpp 3686 2010-10-15 11:55:32Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your 
option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include #include #include #include "lmContainer.h" #include "lmInterpolation.h" using namespace std; inline void error(const char* message) { std::cerr << message << "\n"; throw std::runtime_error(message); } lmInterpolation::lmInterpolation(float nlf, float dlf) { ngramcache_load_factor = nlf; dictionary_load_factor = dlf; order=0; memmap=0; isInverted=false; } void lmInterpolation::load(const std::string filename,int mmap) { VERBOSE(2,"lmInterpolation::load(const std::string filename,int memmap)" << std::endl); VERBOSE(2," filename:|" << filename << "|" << std::endl); dictionary_upperbound=1000000; int memmap=mmap; dict=new dictionary((char *)NULL,1000000,dictionary_load_factor); //get info from the configuration file fstream inp(filename.c_str(),ios::in|ios::binary); char line[MAX_LINE]; const char* words[MAX_TOKEN]; int tokenN; inp.getline(line,MAX_LINE,'\n'); tokenN = parseWords(line,words,MAX_TOKEN); if (tokenN != 2 || ((strcmp(words[0],"LMINTERPOLATION") != 0) && (strcmp(words[0],"lminterpolation")!=0))) error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1\nweight_of_LM_2 filename_of_LM_2"); m_number_lm = atoi(words[1]); m_weight.resize(m_number_lm); m_file.resize(m_number_lm); m_isinverted.resize(m_number_lm); m_lm.resize(m_number_lm); VERBOSE(2,"lmInterpolation::load(const 
std::string filename,int mmap) m_number_lm:"<< m_number_lm << std::endl;); dict->incflag(1); for (int i=0; i3) { error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMINTERPOLATION number_of_models\nweight_of_LM_1 filename_of_LM_1\nweight_of_LM_2 filename_of_LM_2"); } //check whether the (textual) LM has to be loaded as inverted m_isinverted[i] = false; if(tokenN == 3) { if (strcmp(words[2],"inverted") == 0) m_isinverted[i] = true; } VERBOSE(2,"i:" << i << " m_isinverted[i]:" << m_isinverted[i] << endl); m_weight[i] = (float) atof(words[0]); m_file[i] = words[1]; VERBOSE(2,"lmInterpolation::load(const std::string filename,int mmap) m_file:"<< words[1] << std::endl;); m_lm[i] = load_lm(i,memmap,ngramcache_load_factor,dictionary_load_factor); //set the actual value for inverted flag, which is known only after loading the lM m_isinverted[i] = m_lm[i]->is_inverted(); dictionary *_dict=m_lm[i]->getDict(); for (int j=0; j<_dict->size(); j++) { dict->encode(_dict->decode(j)); } } getDict()->genoovcode(); getDict()->incflag(1); inp.close(); int maxorder = 0; for (int i=0; i m_lm[i]->maxlevel())?maxorder:m_lm[i]->maxlevel(); } if (order == 0) { order = maxorder; std::cerr << "order is not set; reset to the maximum order of LMs: " << order << std::endl; } else if (order > maxorder) { order = maxorder; std::cerr << "order is too high; reset to the maximum order of LMs: " << order << std::endl; } maxlev=order; } lmContainer* lmInterpolation::load_lm(int i,int memmap, float nlf, float dlf) { //checking the language model type lmContainer* lmt=NULL; lmt = lmt->CreateLanguageModel(m_file[i],nlf,dlf); //let know that table has inverted n-grams lmt->is_inverted(m_isinverted[i]); //set inverted flag for each LM lmt->setMaxLoadedLevel(requiredMaxlev); lmt->load(m_file[i], memmap); lmt->init_caches(lmt->maxlevel()); return lmt; } double lmInterpolation::clprob(ngram ng, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible) { 
double pr=0.0; double _logpr; char* _maxsuffptr=NULL,*actualmaxsuffptr=NULL; unsigned int _statesize=0,actualstatesize=0; int _bol=0,actualbol=MAX_NGRAM; double _bow=0.0,actualbow=0.0; // bool _extendible=false,actualextendible=false; bool* _extendible=NULL,actualextendible=false; if (extendible) { _extendible=new bool; _extendible=false; } for (size_t i=0; igetDict()); _ng.trans(ng); // _logpr=m_lm[i]->clprob(_ng,&_bow,&_bol,&_maxsuffptr,&_statesize,&_extendible); _logpr=m_lm[i]->clprob(_ng,&_bow,&_bol,&_maxsuffptr,&_statesize,_extendible); // assert(_statesize != InvalidContextLength); /* cerr.precision(10); std::cerr << " LM " << i << " weight:" << m_weight[i] << std::endl; std::cerr << " LM " << i << " log10 logpr:" << _logpr<< std::endl; std::cerr << " LM " << i << " pr:" << pow(10.0,_logpr) << std::endl; std::cerr << " _statesize:" << _statesize << std::endl; std::cerr << " _bow:" << _bow << std::endl; std::cerr << " _bol:" << _bol << std::endl; */ //TO CHECK the following claims //What is the statesize of a LM interpolation? The largest _statesize among the submodels //What is the maxsuffptr of a LM interpolation? The _maxsuffptr of the submodel with the largest _statesize //What is the bol of a LM interpolation? The smallest _bol among the submodels //What is the bow of a LM interpolation? The weighted sum of the bow of the submodels //What is the prob of a LM interpolation? The weighted sum of the prob of the submodels //What is the extendible flag of a LM interpolation? 
true if the extendible flag is one for any LM pr+=m_weight[i]*pow(10.0,_logpr); actualbow+=m_weight[i]*pow(10.0,_bow); if(_statesize > actualstatesize || i == 0) { actualmaxsuffptr = _maxsuffptr; actualstatesize = _statesize; } if (_bol < actualbol) { actualbol=_bol; //backoff limit of LM[i] } if (_extendible) { actualextendible=true; //set extendible flag to true if the ngram is extendible for any LM } } if (bol) *bol=actualbol; if (bow) *bow=log(actualbow); if (maxsuffptr) *maxsuffptr=actualmaxsuffptr; if (statesize) *statesize=actualstatesize; if (extendible) { *extendible=actualextendible; delete _extendible; } /* if (statesize) std::cerr << " statesize:" << *statesize << std::endl; if (bow) std::cerr << " bow:" << *bow << std::endl; if (bol) std::cerr << " bol:" << *bol << std::endl; */ return log(pr)/M_LN10; } double lmInterpolation::clprob(int* codes, int sz, double* bow,int* bol,char** maxsuffptr,unsigned int* statesize,bool* extendible) { //create the actual ngram ngram ong(dict); ong.pushc(codes,sz); assert (ong.size == sz); return clprob(ong, bow, bol, maxsuffptr, statesize, extendible); } double lmInterpolation::setlogOOVpenalty(int dub) { assert(dub > dict->size()); double _logpr; double OOVpenalty=0.0; for (int i=0; isetlogOOVpenalty(dub); //set OOV Penalty for each LM _logpr=m_lm[i]->getlogOOVpenalty(); OOVpenalty+=m_weight[i]*exp(_logpr); } logOOVpenalty=log(OOVpenalty); return logOOVpenalty; } irstlm-5.80.03/src/lmInterpolation.h000644 000766 000024 00000007252 12042554746 021476 0ustar00nicolabertoldistaff000000 000000 // $Id: lmInterpolation.h 3686 2010-10-15 11:55:32Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of 
the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #ifndef MF_LMINTERPOLATION_H #define MF_LMINTERPOLATION_H #include #include #include #include #include #include #include "util.h" #include "dictionary.h" #include "n_gram.h" #include "lmContainer.h" /* interpolation of several sub LMs */ #define MAX_TOKEN 2 class lmInterpolation: public lmContainer { static const bool debug=true; int m_number_lm; int order; int dictionary_upperbound; //set by user double logOOVpenalty; //penalty for OOV words (default 0) bool isInverted; int memmap; //level from which n-grams are accessed via mmap std::vector m_weight; std::vector m_file; std::vector m_isinverted; std::vector m_lm; int maxlev; //maximun order of sub LMs; float ngramcache_load_factor; float dictionary_load_factor; dictionary *dict; // dictionary for all interpolated LMs public: lmInterpolation(float nlf=0.0, float dlfi=0.0); virtual ~lmInterpolation() {}; void load(const std::string filename,int mmap=0); lmContainer* load_lm(int i, int memmap, float nlf, float dlf); virtual double clprob(ngram ng, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL); virtual double clprob(int* ng, int ngsize, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL); int maxlevel() const { return maxlev; }; virtual inline void setDict(dictionary* d) { if (dict) delete dict; dict=d; }; virtual inline dictionary* 
getDict() const { return dict; }; //set penalty for OOV words virtual inline double getlogOOVpenalty() const { return logOOVpenalty; } virtual double setlogOOVpenalty(int dub); double inline setlogOOVpenalty(double oovp) { return logOOVpenalty=oovp; } //set the inverted flag (used to set the inverted flag of each subLM, when loading) inline bool is_inverted(const bool flag) { return isInverted = flag; } //for an interpolation LM this variable does not make sense //for compatibility, we return true if all subLM return true inline bool is_inverted() { for (int i=0; iincflag(flag); }; inline virtual bool is_OOV(int code) { //returns true if the word is OOV for each subLM for (int i=0; igetDict()->encode(getDict()->decode(code)); if (m_lm[i]->is_OOV(_code) == false) return false; } return true; } }; #endif irstlm-5.80.03/src/lmmacro.cpp000644 000766 000024 00000067130 12013405172 020266 0ustar00nicolabertoldistaff000000 000000 // $Id: lmmacro.cpp 3631 2010-10-07 12:04:12Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include #include #include #include "math.h" #include "mempool.h" #include "htable.h" #include "ngramcache.h" #include "dictionary.h" #include "n_gram.h" #include "lmtable.h" #include "lmmacro.h" #include "util.h" using namespace std; // local utilities: start inline void error(const char* message) { cerr << message << "\n"; throw runtime_error(message); } // local utilities: end lmmacro::lmmacro(float nlf, float dlfi):lmtable(nlf,dlfi) { dict = new dictionary((char *)NULL,1000000); // dict of micro tags getDict()->incflag(1); }; lmmacro::~lmmacro() { if (mapFlag) unloadmap(); } void lmmacro::load(const std::string filename,int memmap) { VERBOSE(2,"lmmacro::load(const std::string filename,int memmap)" << std::endl); //get info from the configuration file fstream inp(filename.c_str(),ios::in|ios::binary); char line[MAX_LINE]; const char* words[MAX_TOKEN_N_MAP]; int tokenN; inp.getline(line,MAX_LINE,'\n'); tokenN = parseWords(line,words,MAX_TOKEN_N_MAP); if (tokenN != 4 || ((strcmp(words[0],"LMMACRO") != 0) && (strcmp(words[0],"lmmacro")!=0))) error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMMACRO lmsize field [true|false]\nfilename_of_LM\nfilename_of_map (optional)"); maxlev = atoi(words[1]); selectedField = atoi(words[2]); if ((strcmp(words[3],"TRUE") == 0) || (strcmp(words[3],"true") == 0)) collapseFlag = true; else if ((strcmp(words[3],"FALSE") == 0) || (strcmp(words[3],"false") == 0)) collapseFlag = false; else error((char*)"ERROR: wrong header format of configuration file\ncorrect format: LMMACRO lmsize field [true|false]\nfilename_of_LM\nfilename_of_map (optional)"); #ifdef DLEXICALLM selectedFieldForLexicon 
= atoi(words[3]); collapseFlag = atoi(words[4]); #endif if (selectedField == -1) cerr << "no selected field: the whole string is used" << std::endl; else cerr << "selected field n. " << selectedField << std::endl; if (collapseFlag) cerr << "collapse is enabled" << std::endl; else cerr << "collapse is disabled" << std::endl; std::string lmfilename; if (inp.getline(line,MAX_LINE,'\n')) { tokenN = parseWords(line,words,MAX_TOKEN_N_MAP); lmfilename = words[0]; } else error((char*)"ERROR: wrong format of configuration file\ncorrect format: LMMACRO lmsize field [true|false]\nfilename_of_LM\nfilename_of_map (optional)"); std::string mapfilename = ""; if (inp.getline(line,MAX_LINE,'\n')) { tokenN = parseWords(line,words,MAX_TOKEN_N_MAP); mapfilename = words[0]; mapFlag = true; } else { mapFlag = false; } inp.close(); std::cerr << "lmfilename:" << lmfilename << std::endl; if (mapfilename != "") { std::cerr << "mapfilename:" << mapfilename << std::endl; } else { std::cerr << "no mapfilename" << std::endl; mapFlag = false; } //allow the dictionary to add new words getDict()->incflag(1); if ((!mapFlag) && (collapseFlag)) { error((char*)"ERROR: you must specify a map if you want to collapse a specific field!"); } #ifdef DLEXICALLM std::string lexicalclassesfilename = lexicalclassesfilename = words[2]; if (lexicalclassesfilename != "NULL" && lexicalclassesfilename != "null") lexicalclassesfilename = ""; if (lexicalclassesfilename != "") std::cerr << "lexicalclassesfilename:" << lexicalclassesfilename << std::endl; else std::cerr << "no lexicalclassesfilename" << std::endl; // Load the classes of lexicalization tokens: if (lexicalclassesfilename != "") loadLexicalClasses(lexicalclassesfilename.c_str()); #endif // Load the (possibly binary) LM lmtable::load(lmfilename,memmap); if (mapFlag) loadmap(mapfilename); getDict()->genoovcode(); getDict()->incflag(1); }; void lmmacro::unloadmap() { delete dict; free(microMacroMap); if (collapseFlag) { free(collapsableMap); 
free(collapsatorMap); } #ifdef DLEXICALLM free(lexicaltoken2classMap); #endif } void lmmacro::loadmap(const std::string mapfilename) { microMacroMapN = 0; microMacroMap = NULL; collapsableMap = NULL; collapsatorMap = NULL; #ifdef DLEXICALLM lexicaltoken2classMap = NULL; lexicaltoken2classMapN = 0; #endif microMacroMap = (int *)calloc(BUFSIZ, sizeof(int)); if (collapseFlag) { collapsableMap = (bool *)calloc(BUFSIZ, sizeof(bool)); collapsatorMap = (bool *)calloc(BUFSIZ, sizeof(bool)); } if (lmtable::getDict()->getcode(BOS_)==-1) { lmtable::getDict()->incflag(1); lmtable::getDict()->encode(BOS_); lmtable::getDict()->incflag(0); } if (lmtable::getDict()->getcode(EOS_)==-1) { lmtable::getDict()->incflag(1); lmtable::getDict()->encode(EOS_); lmtable::getDict()->incflag(0); } char line[MAX_LINE]; const char* words[MAX_TOKEN_N_MAP]; const char *macroW; const char *microW; int tokenN; bool bos=false,eos=false; // Load the dictionary of micro tags (to be put in "dict" of lmmacro class): inputfilestream inpMap(mapfilename.c_str()); std::cerr << "Reading map " << mapfilename << "..." 
<< std::endl; while (inpMap.getline(line,MAX_LINE,'\n')) { tokenN = parseWords(line,words,MAX_TOKEN_N_MAP); if (tokenN != 2) error((char*)"ERROR: wrong format of map file\n"); microW = words[0]; macroW = words[1]; getDict()->encode(microW); if (microMacroMapN>0 && !(microMacroMapN % BUFSIZ)) { microMacroMap = (int *)realloc(microMacroMap, sizeof(int)*(BUFSIZ*(1+microMacroMapN/BUFSIZ))); if (collapseFlag) { //create supporting info for collapse collapsableMap = (bool *)realloc(collapsableMap, sizeof(bool)*(BUFSIZ*(1+microMacroMapN/BUFSIZ))); collapsatorMap = (bool *)realloc(collapsatorMap, sizeof(bool)*(BUFSIZ*(1+microMacroMapN/BUFSIZ))); } } microMacroMap[microMacroMapN] = lmtable::getDict()->getcode(macroW); if (collapseFlag) { int len = strlen(microW)-1; if (microW[len] == '(') { collapsableMap[microMacroMapN] = false; collapsatorMap[microMacroMapN] = true; } else if (microW[len] == ')') { collapsableMap[microMacroMapN] = true; collapsatorMap[microMacroMapN] = false; } else if (microW[len] == '+') { collapsableMap[microMacroMapN] = true; collapsatorMap[microMacroMapN] = true; } else { collapsableMap[microMacroMapN] = false; collapsatorMap[microMacroMapN] = false; } } if (!bos && !strcmp(microW,BOS_)) bos=true; if (!eos && !strcmp(microW,EOS_)) eos=true; VERBOSE(2,"\nmicroW = " << microW << "\n" << "macroW = " << macroW << "\n" << "microMacroMapN = " << microMacroMapN << "\n" << "code of micro = " << getDict()->getcode(microW) << "\n" << "code of macro = " << lmtable::getDict()->getcode(macroW) << "\n"); microMacroMapN++; } if ((microMacroMapN == 0) && (selectedField == -1)) error((char*)"ERROR: with no field selection, a map for the whole string is mandatory\n"); if (microMacroMapN>0) { // Add -> to map if missing if (!bos) { getDict()->encode(BOS_); if (microMacroMapN && !(microMacroMapN%BUFSIZ)) microMacroMap = (int *)realloc(microMacroMap, sizeof(int)*(microMacroMapN+BUFSIZ)); microMacroMap[microMacroMapN++] = lmtable::getDict()->getcode(BOS_); } // Add -> to 
map if missing if (!eos) { getDict()->encode(EOS_); if (microMacroMapN && !(microMacroMapN%BUFSIZ)) microMacroMap = (int *)realloc(microMacroMap, sizeof(int)*(microMacroMapN+BUFSIZ)); microMacroMap[microMacroMapN++] = lmtable::getDict()->getcode(EOS_); } } // getDict()->incflag(0); VERBOSE(2,"oovcode(micro)=" << getDict()->oovcode() << "\n" << "oovcode(macro)=" << lmtable::getDict()->oovcode() << "\n" << "microMacroMapN = " << microMacroMapN << "\n" << "macrodictsize = " << getDict()->size() << "\n" << "microdictsize = " << lmtable::getDict()->size() << "\n"); IFVERBOSE(2) { for (int i=0; idecode(i) << "] -> " << lmtable::getDict()->decode(microMacroMap[i]) << "\n"); } } std::cerr << "...done\n"; } double lmmacro::lprob(ngram micro_ng) { VERBOSE(2,"lmmacro::lprob, parameter = <" << micro_ng << ">\n"); ngram macro_ng(lmtable::getDict()); if (micro_ng.dict == macro_ng.dict) macro_ng.trans(micro_ng); // micro to macro mapping already done else map(µ_ng, ¯o_ng); // mapping required VERBOSE(3,"lmmacro::lprob: micro_ng = " << micro_ng << "\n" << "lmmacro::lprob: macro_ng = " << macro_ng << "\n"); // ask LM with macro double prob; prob = lmtable::lprob(macro_ng); VERBOSE(3,"prob = " << prob << "\n"); return prob; }; double lmmacro::clprob(int* codes, int sz, double* bow, int* bol, char** state,unsigned int* statesize,bool* extendible) { ngram micro_ng(getDict()); micro_ng.pushc(codes,sz); return clprob(micro_ng,bow,bol,state,statesize,extendible); } double lmmacro::clprob(ngram micro_ng, double* bow, int* bol, char** state,unsigned int* statesize,bool* extendible) { VERBOSE(3," lmmacro::clprob(ngram), parameter = <" << micro_ng << ">\n"); ngram transformed_ng(lmtable::getDict()); bool collapsed = transform(micro_ng, transformed_ng); VERBOSE(3,"lmmacro::clprob(ngram), transformed_ng = <" << transformed_ng << ">\n"); double logpr; if (collapsed) { // the last token of the ngram continues an already open "chunk" // the probability at chunk-level is not computed because it 
has been already computed when the actual"chunk" opens VERBOSE(3," SKIPPED call to lmtable::clprob because of collapse; logpr: 0.0\n"); logpr = 0.0; } else { VERBOSE(3," QUERY MACRO LM on (after transformation and size reduction) " << transformed_ng << "\n"); logpr = lmtable::clprob(transformed_ng, bow, bol, state, statesize, extendible); } VERBOSE(3," GET logpr: " << logpr << "\n"); return logpr; } bool lmmacro::transform(ngram &in, ngram &out) { VERBOSE(3,"lmmacro::transform(ngram &in, ngram &out), in = <" << in << ">\n"); //step 1: selection of the correct field ngram field_ng(getDict()); if (selectedField >= 0) field_selection(in, field_ng); else field_ng = in; //step 2: collapsing ngram collapsed_ng(getDict()); bool collapsed = false; if (collapseFlag) collapsed = collapse(field_ng, collapsed_ng); else collapsed_ng = field_ng; //step 3: mapping using the loaded map if (mapFlag) mapping(collapsed_ng, out); else out.trans(collapsed_ng); if (out.size>lmtable::maxlevel()) out.size=lmtable::maxlevel(); VERBOSE(3,"lmmacro::transform(ngram &in, ngram &out), out = <" << out << ">\n"); return collapsed; } void lmmacro::field_selection(ngram &in, ngram &out) { VERBOSE(3,"In lmmacro::field_selection(ngram &in, ngram &out) in = " << in << "\n"); int microsize = in.size; for (int i=microsize; i>0; i--) { char curr_token[BUFSIZ]; strcpy(curr_token, getDict()->decode(*in.wordp(i))); char *field; if (strcmp(curr_token,"") && strcmp(curr_token,"") && strcmp(curr_token,"_unk_")) { field = strtok(curr_token, "#"); int j=0; while (j1; i--) { curr_code = *in.wordp(i); if (microMacroMap[curr_code] != microMacroMap[prev_code]) { out.pushc(curr_code); } else { if (!(collapsableMap[curr_code] && collapsatorMap[prev_code])) { out.pushc(prev_code); } } prev_code = curr_code; } // and insert the most recent token out.pushc(*in.wordp(1)); VERBOSE(3,"In lmmacro::collapse(ngram &in, ngram &out) out = " << out << "\n"); return false; } void lmmacro::mapping(ngram &in, ngram &out) { 
VERBOSE(3,"In lmmacro::mapping(ngram &in, ngram &out) in = " << in << "\n"); int microsize = in.size; // map microtag sequence (in) into the corresponding sequence of macrotags (possibly shorter) (out) for (int i=microsize; i>0; i--) { int in_code = *in.wordp(i); int out_code; if (in_code < microMacroMapN) out_code = microMacroMap[in_code]; else out_code = lmtable::getDict()->oovcode(); out.pushc(out_code); } VERBOSE(3,"In lmmacro::mapping(ngram &in, ngram &out) out = " << out << "\n"); return; } //maxsuffptr returns the largest suffix of an n-gram that is contained //in the LM table. This can be used as a compact representation of the //(n-1)-gram state of a n-gram LM. if the input k-gram has k>=n then it //is trimmed to its n-1 suffix. const char *lmmacro::maxsuffptr(ngram micro_ng, unsigned int* size) { ngram macro_ng(lmtable::getDict()); if (micro_ng.dict == macro_ng.dict) macro_ng.trans(micro_ng); // micro to macro mapping already done else map(µ_ng, ¯o_ng); // mapping required VERBOSE(2,"lmmacro::lprob: micro_ng = " << micro_ng << "\n" << "lmmacro::lprob: macro_ng = " << macro_ng << "\n"); return lmtable::maxsuffptr(macro_ng,size); } const char *lmmacro::cmaxsuffptr(ngram micro_ng, unsigned int* size) { //cerr << "lmmacro::CMAXsuffptr\n"; //cerr << "micro_ng: " << micro_ng // << " -> micro_ng.size: " << micro_ng.size << "\n"; //the LM working on the selected field = 0 //contributes to the LM state // if (selectedField>0) return NULL; ngram macro_ng(lmtable::getDict()); if (micro_ng.dict == macro_ng.dict) macro_ng.trans(micro_ng); // micro to macro mapping already done else map(µ_ng, ¯o_ng); // mapping required VERBOSE(2,"lmmacro::lprob: micro_ng = " << micro_ng << "\n" << "lmmacro::lprob: macro_ng = " << macro_ng << "\n") return lmtable::cmaxsuffptr(macro_ng,size); } void lmmacro::map(ngram *in, ngram *out) { VERBOSE(2,"In lmmacro::map, in = " << *in << endl << " (selectedField = " << selectedField << " )\n"); if (selectedField==-2) // the whole token is 
compatible with the LM words One2OneMapping(in, out); else if (selectedField==-1) // the whole token has to be mapped before querying the LM Micro2MacroMapping(in, out); else if (selectedField<10) { // select the field "selectedField" from tokens (separator is assumed to be "#") ngram field_ng(((lmmacro *)this)->getDict()); int microsize = in->size; for (int i=microsize; i>0; i--) { char curr_token[BUFSIZ]; strcpy(curr_token, ((lmmacro *)this)->getDict()->decode(*(in->wordp(i)))); char *field; if (strcmp(curr_token,"") && strcmp(curr_token,"") && strcmp(curr_token,"_unk_")) { field = strtok(curr_token, "#"); int j=0; while (j0) Micro2MacroMapping(&field_ng, out); else out->trans(field_ng); } else { #ifdef DLEXICALLM // selectedField>=10: tens=idx of micro tag (possibly to be mapped to // macro tag), unidx=idx of lemma to be concatenated by "_" to the // (mapped) tag int tagIdx = selectedField/10; int lemmaIdx = selectedField%10; // micro (or mapped to macro) sequence construction: ngram tag_ng(getDict()); char *lemmas[BUFSIZ]; int microsize = in->size; for (int i=microsize; i>0; i--) { char curr_token[BUFSIZ]; strcpy(curr_token, getDict()->decode(*(in->wordp(i)))); char *tag = NULL, *lemma = NULL; if (strcmp(curr_token,"") && strcmp(curr_token,"") && strcmp(curr_token,"_unk_")) { if (tagIdx0) Micro2MacroMapping(&tag_ng, out, lemmas); else out->trans(tag_ng); // qui si dovrebbero sostituire i tag con tag_lemma, senza mappatura! 
#endif } VERBOSE(2,"In lmmacro::map, FINAL out = " << *out << endl); } void lmmacro::One2OneMapping(ngram *in, ngram *out) { int insize = in->size; // map each token of the sequence "in" into the same-length sequence "out" through the map for (int i=insize; i>0; i--) { int curr_code = *(in->wordp(i)); const char *outtoken = lmtable::getDict()->decode((curr_codeoovcode()); out->pushw(outtoken); } return; } void lmmacro::Micro2MacroMapping(ngram *in, ngram *out) { int microsize = in->size; VERBOSE(2,"In Micro2MacroMapping, in = " << *in << "\n"); // map microtag sequence (in) into the corresponding sequence of macrotags (possibly shorter) (out) for (int i=microsize; i>0; i--) { int curr_code = *(in->wordp(i)); const char *curr_macrotag = lmtable::getDict()->decode((curr_codeoovcode()); if (i==microsize) { out->pushw(curr_macrotag); } else { int prev_code = *(in->wordp(i+1)); const char *prev_microtag = getDict()->decode(prev_code); const char *curr_microtag = getDict()->decode(curr_code); const char *prev_macrotag = lmtable::getDict()->decode((prev_codeoovcode()); int prev_len = strlen(prev_microtag)-1; int curr_len = strlen(curr_microtag)-1; if (strcmp(curr_macrotag,prev_macrotag) != 0 || !( (( prev_microtag[prev_len]== '(' || ( prev_microtag[0]== '(' && prev_microtag[prev_len]!= ')' )) && ( curr_microtag[curr_len]==')' && curr_microtag[0]!='(')) || (( prev_microtag[prev_len]== '(' || ( prev_microtag[0]== '(' && prev_microtag[prev_len]!= ')' )) && curr_microtag[curr_len]=='+' ) || (prev_microtag[prev_len]== '+' && curr_microtag[curr_len]=='+' ) || (prev_microtag[prev_len]== '+' && ( curr_microtag[curr_len]==')' && curr_microtag[0]!='(' )))) out->pushw(curr_macrotag); } } return; } // DISMITTED ON FEB 2011 BECAUSE TOO MUCH PROBLEMATIC FROM A THEORETICAL POINT OF VIEW #ifdef DLEXICALLM void lmmacro::Micro2MacroMapping(ngram *in, ngram *out, char **lemmas) { VERBOSE(2,"In Micro2MacroMapping, in = " << *in << "\n") int microsize = in->size; IFVERBOSE(3) { VERBOSE(3,"In 
Micro2MacroMapping, lemmas:\n"); if (lexicaltoken2classMap) for (int i=microsize; i>0; i--) VERBOSE(3,"lemmas[" << i << "]=" << lemmas[i] << " -> class -> " << lexicaltoken2classMap[lmtable::getDict()->encode(lemmas[i])] << endl); else for (int i=microsize; i>0; i--) VERBOSE(3,"lemmas[" << i << "]=" << lemmas[i] << endl); } // map microtag sequence (in) into the corresponding sequence of macrotags (possibly shorter) (out) char tag_lemma[BUFSIZ]; for (int i=microsize; i>0; i--) { int curr_code = *(in->wordp(i)); const char *curr_microtag = getDict()->decode(curr_code); const char *curr_lemma = lemmas[i]; const char *curr_macrotag = lmtable::getDict()->decode((curr_codeoovcode()); int curr_len = strlen(curr_microtag)-1; if (i==microsize) { if (( curr_microtag[curr_len]=='(' ) || ( curr_microtag[0]=='(' && curr_microtag[curr_len]!=')' ) || ( curr_microtag[curr_len]=='+' )) sprintf(tag_lemma, "%s", curr_macrotag); // non lessicalizzo il macrotag se sono ancora all''interno del chunk else if (lexicaltoken2classMap) sprintf(tag_lemma, "%s_class%d", curr_macrotag, lexicaltoken2classMap[lmtable::getDict()->encode(curr_lemma)]); else sprintf(tag_lemma, "%s_%s", curr_macrotag, lemmas[microsize]); VERBOSE(2,"In Micro2MacroMapping, starting tag_lemma = >" << tag_lemma << "<\n"); out->pushw(tag_lemma); free(lemmas[microsize]); } else { int prev_code = *(in->wordp(i+1)); const char *prev_microtag = getDict()->decode(prev_code); const char *prev_macrotag = lmtable::getDict()->decode((prev_codeoovcode()); int prev_len = strlen(prev_microtag)-1; if (( curr_microtag[curr_len]=='(' ) || ( curr_microtag[0]=='(' && curr_microtag[curr_len]!=')' ) || ( curr_microtag[curr_len]=='+' )) sprintf(tag_lemma, "%s", curr_macrotag); // non lessicalizzo il macrotag se sono ancora all''interno del chunk else if (lexicaltoken2classMap) sprintf(tag_lemma, "%s_class%d", curr_macrotag, lexicaltoken2classMap[lmtable::getDict()->encode(curr_lemma)]); else sprintf(tag_lemma, "%s_%s", curr_macrotag, 
curr_lemma); VERBOSE(2,"In Micro2MacroMapping, tag_lemma = >" << tag_lemma << "<\n"); if (strcmp(curr_macrotag,prev_macrotag) != 0 || !( (( prev_microtag[prev_len]== '(' || ( prev_microtag[0]== '(' && prev_microtag[prev_len]!=')' )) && curr_microtag[curr_len]==')' && curr_microtag[0]!='(') || (( prev_microtag[prev_len]== '(' || ( prev_microtag[0]== '(' && prev_microtag[prev_len]!= ')')) && curr_microtag[curr_len]=='+' ) || (prev_microtag[prev_len]== '+' && curr_microtag[curr_len]=='+' ) || (prev_microtag[prev_len]== '+' && curr_microtag[curr_len]==')' && curr_microtag[0]!='(' ))) { VERBOSE(2,"In Micro2MacroMapping, before pushw, out = " << *out << endl); out->pushw(tag_lemma); VERBOSE(2,"In Micro2MacroMapping, after pushw, out = " << *out << endl); } else { VERBOSE(2,"In Micro2MacroMapping, before shift, out = " << *out << endl); out->shift(); VERBOSE(2,"In Micro2MacroMapping, after shift, out = " << *out << endl); out->pushw(tag_lemma); VERBOSE(2,"In Micro2MacroMapping, after push, out = " << *out << endl); } free(lemmas[i]); } } return; } void lmmacro::loadLexicalClasses(const char *fn) { char line[MAX_LINE]; const char* words[MAX_TOKEN_N_MAP]; int tokenN; lexicaltoken2classMap = (int *)calloc(BUFSIZ, sizeof(int)); lexicaltoken2classMapN = BUFSIZ; lmtable::getDict()->incflag(1); inputfilestream inp(fn); while (inp.getline(line,MAX_LINE,'\n')) { tokenN = parseWords(line,words,MAX_TOKEN_N_MAP); if (tokenN != 2) error((char*)"ERROR: wrong format of lexical classes file\n"); else { int classIdx = atoi(words[1]); int wordCode = lmtable::getDict()->encode(words[0]); if (wordCode>=lexicaltoken2classMapN) { int r = (wordCode-lexicaltoken2classMapN)/BUFSIZ; lexicaltoken2classMapN += (r+1)*BUFSIZ; lexicaltoken2classMap = (int *)realloc(lexicaltoken2classMap, sizeof(int)*lexicaltoken2classMapN); } lexicaltoken2classMap[wordCode] = classIdx; } } lmtable::getDict()->incflag(0); IFVERBOSE(3) { for (int x=0; xsize(); x++) VERBOSE(3,"class of <" << lmtable::getDict()->decode(x) 
<< "> (code=" << x << ") = " << lexicaltoken2classMap[x] << endl); } return; } void lmmacro::cutLex(ngram *in, ngram *out) { *out=*in; const char *curr_macro = out->dict->decode(*(out->wordp(1))); out->shift(); const char *p = strrchr(curr_macro, '_'); int lexLen; if (p) lexLen=strlen(p); else lexLen=0; char curr_NoLexMacro[BUFSIZ]; memset(&curr_NoLexMacro,0,BUFSIZ); strncpy(curr_NoLexMacro,curr_macro,strlen(curr_macro)-lexLen); out->pushw(curr_NoLexMacro); return; } #endif irstlm-5.80.03/src/lmmacro.h000644 000766 000024 00000006370 12013405172 017732 0ustar00nicolabertoldistaff000000 000000 // $Id: lmmacro.h 3461 2010-08-27 10:17:34Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/

#ifndef MF_LMMACRO_H
#define MF_LMMACRO_H

// NOTE(review): the header names of the two #include lines below were lost
// in extraction -- restore them from the upstream file before building.
#ifndef WIN32
#include
#include
#endif

#include "util.h"
#include "ngramcache.h"
#include "dictionary.h"
#include "n_gram.h"
#include "lmtable.h"

// Maximum number of tokens parsed from one line of a lexical-classes file
// (see lmmacro::loadLexicalClasses).
#define MAX_TOKEN_N_MAP 4

// Language model queried through a token mapping: incoming n-grams are
// converted by map() to the dictionary of the underlying lmtable before the
// lmtable lookup is performed (see maxsuffptr / cmaxsuffptr).
class lmmacro: public lmtable
{

  // Dictionary returned by lmmacro::getDict(); map() decodes the incoming
  // tokens with it. The LM's own dictionary is reached via lmtable::getDict().
  dictionary *dict;
  int maxlev; //max level of table
  // Mapping mode used by map(): -2 one-to-one mapping, -1 micro-to-macro
  // mapping of the whole token, 0..9 index of the '#'-separated field to
  // select, >=10 tag/lemma combination (only with DLEXICALLM).
  int selectedField;

  bool collapseFlag; //flag for the presence of collapse
  bool mapFlag; //flag for the presence of map

  // microMacroMap[code] is the LM code a token code maps to; codes that are
  // >= microMacroMapN are mapped to the OOV code.
  int microMacroMapN;
  int *microMacroMap;
  bool *collapsableMap;
  bool *collapsatorMap;

#ifdef DLEXICALLM
  int selectedFieldForLexicon;
  // lexicaltoken2classMap[code] is the class index of a word; the table has
  // lexicaltoken2classMapN entries (filled by loadLexicalClasses).
  int *lexicaltoken2classMap;
  int lexicaltoken2classMapN;
#endif

  void loadmap(const std::string mapfilename);
  void unloadmap();

  bool transform(ngram &in, ngram &out);
  void field_selection(ngram &in, ngram &out);
  bool collapse(ngram &in, ngram &out);
  void mapping(ngram &in, ngram &out);

public:

  lmmacro(float nlf=0.0, float dlfi=0.0);
  ~lmmacro();

  void load(const std::string filename,int mmap=0);

  double lprob(ngram ng);
  double clprob(ngram ng,double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL);
  double clprob(int* ng, int ngsize, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL);

  // Both convert the n-gram with map() when it does not already use the LM
  // dictionary, then delegate to the lmtable method of the same name.
  const char *maxsuffptr(ngram ong, unsigned int* size=NULL);
  const char *cmaxsuffptr(ngram ong, unsigned int* size=NULL);

  // Converts "in" into "out" according to selectedField.
  void map(ngram *in, ngram *out);
  void One2OneMapping(ngram *in, ngram *out);
  void Micro2MacroMapping(ngram *in, ngram *out);
#ifdef DLEXICALLM
  void Micro2MacroMapping(ngram *in, ngram *out, char **lemma);
  void loadLexicalClasses(const char *fn);
  void cutLex(ngram *in, ngram *out);
#endif

  // Returns this class's own dictionary member, not lmtable's.
  inline dictionary* getDict() const {
    return dict;
  }
  inline int maxlevel() const {
    return maxlev;
  };

  inline virtual void dictionary_incflag(const bool flag) {
    dict->incflag(flag);
  };

  // Filtering is not supported by this class: always returns false.
  inline virtual bool filter(const string sfilter, lmContainer* sublmt, const string skeepunigrams) {
    UNUSED(sfilter);
    UNUSED(sublmt);
    UNUSED(skeepunigrams);
    return false;
  }
};

#endif
irstlm-5.80.03/src/lmtable.cpp000644 000766 000024 00000225177 12042554746 020301 0ustar00nicolabertoldistaff000000 000000 // $Id: lmtable.cpp 3686 2010-10-15 11:55:32Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include #include #include #include #include #include #include "math.h" #include "mempool.h" #include "htable.h" #include "ngramcache.h" #include "dictionary.h" #include "n_gram.h" #include "lmContainer.h" #include "lmtable.h" #include "util.h" //special value for pruned iprobs #define NOPROB ((float)-1.329227995784915872903807060280344576e36) using namespace std; inline void error(const char* message) { std::cerr << message << "\n"; throw std::runtime_error(message); } void print(prob_and_state_t* pst, std::ostream& out) { if (pst != NULL) { out << "PST ["; out << "logpr:" << pst->logpr; out << ",state:" << (void*) pst->state; out << ",statesize:" << pst->statesize; out << ",bow:" << pst->bow; out << ",bol:" << pst->bol; out << "]"; out << std::endl; } else { out << "PST [NULL]" << std::endl; } } //instantiate an empty lm table lmtable::lmtable(float nlf, float dlf):lmContainer() { ngramcache_load_factor = nlf; dictionary_load_factor = dlf; isInverted=false; configure(1,false); dict=new dictionary((char *)NULL,1000000,dictionary_load_factor); delete_dict=true; memset(table, 0, sizeof(table)); memset(tableGaps, 0, sizeof(tableGaps)); memset(cursize, 0, sizeof(cursize)); memset(tbltype, 0, sizeof(tbltype)); memset(maxsize, 0, sizeof(maxsize)); memset(tb_offset, 0, sizeof(maxsize)); memset(info, 0, sizeof(info)); memset(NumCenters, 0, sizeof(NumCenters)); max_cache_lev=0; for (int i=0; iclose(); delete cacheout; #endif for (int l=1; l<=maxlev; l++) { if (table[l]) { if (memmap > 0 && l >= memmap) Munmap(table[l]-tableGaps[l],cursize[l]*nodesize(tbltype[l])+tableGaps[l],0); else delete [] table[l]; } if (isQtable) { if (Pcenters[l]) delete [] 
Pcenters[l]; if (lused(); #endif } void lmtable::used_lmtcaches() { #ifdef LMT_CACHE_ENABLE for (int i=2; i<=max_cache_lev; i++) { std::cerr << "lmtcaches with order " << i << " "; if (lmtcache[i]) lmtcache[i]->used(); } #endif } void lmtable::used_caches() { used_prob_and_state_cache(); used_lmtcaches(); } void lmtable::check_prob_and_state_cache_levels() { #ifdef PS_CACHE_ENABLE if (prob_and_state_cache && prob_and_state_cache->isfull()) prob_and_state_cache->reset(prob_and_state_cache->cursize()); #endif } void lmtable::check_lmtcaches_levels() { #ifdef LMT_CACHE_ENABLE for (int i=2; i<=max_cache_lev; i++) if (lmtcache[i]->isfull()) lmtcache[i]->reset(lmtcache[i]->cursize()); #endif } void lmtable::check_caches_levels() { check_prob_and_state_cache_levels(); check_lmtcaches_levels(); } void lmtable::reset_prob_and_state_cache() { #ifdef PS_CACHE_ENABLE if (prob_and_state_cache) prob_and_state_cache->reset(MAX(prob_and_state_cache->cursize(),prob_and_state_cache->maxsize())); #endif } void lmtable::reset_lmtcaches() { #ifdef LMT_CACHE_ENABLE for (int i=2; i<=max_cache_lev; i++) lmtcache[i]->reset(MAX(lmtcache[i]->cursize(),lmtcache[i]->maxsize())); #endif } void lmtable::reset_caches() { reset_prob_and_state_cache(); reset_lmtcaches(); } bool lmtable::are_prob_and_state_cache_active() { #ifdef PS_CACHE_ENABLE return prob_and_state_cache!=NULL; #else return false; #endif } bool lmtable::are_lmtcaches_active() { #ifdef LMT_CACHE_ENABLE if (max_cache_lev < 2) return false; for (int i=2; i<=max_cache_lev; i++) if (lmtcache[i]==NULL) return false; return true; #else return false; #endif } bool lmtable::are_caches_active() { return (are_prob_and_state_cache_active() && are_lmtcaches_active()); } void lmtable::configure(int n,bool quantized) { VERBOSE(2,"void lmtable::configure(int n,bool quantized) with n:" << n << std::endl); maxlev=n; VERBOSE(2," maxlev:" << maxlev << " maxlevel():" << maxlevel() << " this->maxlevel():" << this->maxlevel() << std::endl); //The value 
for index 0 is never used for (int i=0; i0) { //check whether memory mapping can be used #ifdef WIN32 mmap=0; //don't use memory map #endif } load(inp,infile.c_str(),NULL,mmap,NONE); getDict()->incflag(0); } void lmtable::load(istream& inp,const char* filename,const char* outfilename,int keep_on_disk, OUTFILE_TYPE /* unused parameter: outtype */) { VERBOSE(2,"lmtable::load(istream& inp,...)" << std::endl); #ifdef WIN32 if (keep_on_disk>0) { std::cerr << "lmtable::load memory mapping not yet available under WIN32\n"; keep_on_disk = 0; } #endif //give a look at the header to select loading method char header[MAX_LINE]; inp >> header; std::cerr << header << "\n"; if (strncmp(header,"Qblmt",5)==0 || strncmp(header,"blmt",4)==0) { loadbin(inp,header,filename,keep_on_disk); } else { //input is in textual form if (keep_on_disk && outfilename==NULL) { cerr << "Load Error: inconsistent setting. Passed input file: textual. Memory map: yes. Outfilename: not specified.\n"; exit(0); } loadtxt(inp,header,outfilename,keep_on_disk); } cerr << "OOV code is " << lmtable::getDict()->oovcode() << "\n"; } //load language model on demand through a word-list file int lmtable::reload(std::set words) { //build dictionary dictionary dict(NULL,(int)words.size()); dict.incflag(1); std::set::iterator w; for (w = words.begin(); w != words.end(); ++w) dict.encode((*w).c_str()); return 1; } void lmtable::load_centers(istream& inp,int Order) { char line[MAX_LINE]; //first read the coodebook cerr << Order << " read code book "; inp >> NumCenters[Order]; Pcenters[Order]=new float[NumCenters[Order]]; Bcenters[Order]=(Order> Pcenters[Order][c]; if (Order> Bcenters[Order][c]; }; //empty the last line inp.getline((char*)line,MAX_LINE); } void lmtable::loadtxt(istream& inp,const char* header,const char* outfilename,int mmap) { if (mmap>0) loadtxt_mmap(inp,header,outfilename); else { loadtxt_ram(inp,header); lmtable::getDict()->genoovcode(); } } void lmtable::loadtxt_mmap(istream& inp,const char* 
header,const char* outfilename) { char nameNgrams[BUFSIZ]; char nameHeader[BUFSIZ]; FILE *fd = NULL; table_pos_t filesize=0; int Order,n; int maxlevel_h; //char *SepString = " \t\n"; unused //open input stream and prepare an input string char line[MAX_LINE]; //prepare word dictionary //dict=(dictionary*) new dictionary(NULL,1000000,NULL,NULL); lmtable::getDict()->incflag(1); //check the header to decide if the LM is quantized or not isQtable=(strncmp(header,"qARPA",5)==0?true:false); //check the header to decide if the LM table is incomplete isItable=(strncmp(header,"iARPA",5)==0?true:false); if (isQtable) { //check if header contains other infos inp >> line; if (!(maxlevel_h=atoi(line))) { cerr << "loadtxt with mmap requires new qARPA header. Please regenerate the file.\n"; exit(1); } for (n=1; n<=maxlevel_h; n++) { inp >> line; if (!(NumCenters[n]=atoi(line))) { cerr << "loadtxt with mmap requires new qARPA header. Please regenerate the file.\n"; exit(0); } } } //we will configure the table later we we know the maxlev; bool yetconfigured=false; cerr << "loadtxtmmap()\n"; // READ ARPA Header while (inp.getline(line,MAX_LINE)) { if (strlen(line)==MAX_LINE-1) { cerr << "lmtable::loadtxt_mmap: input line exceed MAXLINE (" << MAX_LINE << ") chars " << line << "\n"; exit(1); } bool backslash = (line[0] == '\\'); if (sscanf(line, "ngram %d=%d", &Order, &n) == 2) { maxsize[Order] = n; maxlev=Order; //upadte Order cerr << "size[" << Order << "]=" << maxsize[Order] << "\n"; } VERBOSE(2,"maxlev" << maxlev << std::endl); if (maxlev>requiredMaxlev) maxlev=requiredMaxlev; VERBOSE(2,"maxlev" << maxlev << std::endl); VERBOSE(2,"lmtable:requiredMaxlev" << requiredMaxlev << std::endl); if (backslash && sscanf(line, "\\%d-grams", &Order) == 1) { //at this point we are sure about the size of the LM if (!yetconfigured) { configure(maxlev,isQtable); yetconfigured=true; //opening output file strcpy(nameNgrams,outfilename); strcat(nameNgrams, "-ngrams"); fd = fopen(nameNgrams, "w+"); // 
compute the size of file (only for tables and - possibly - centroids; no header nor dictionary) for (int l=1; l<=maxlev; l++) { if (l1) table[1]=table[0] + (table_pos_t) (2 * NumCenters[1] * sizeof(float)); else table[1]=table[0] + (table_pos_t) (NumCenters[1] * sizeof(float)); */ for (int l=1; l<=maxlev; l++) { if (l1 && Order>1) { checkbounds(Order-1); delete startpos[Order-1]; } } } cerr << "closing output file: " << nameNgrams << "\n"; for (int i=1; i<=maxlev; i++) { if (maxsize[i] != cursize[i]) { for (int l=1; l<=maxlev; l++) cerr << "Level " << l << ": starting ngrams=" << maxsize[l] << " - actual stored ngrams=" << cursize[l] << "\n"; break; } } Munmap(table[0],filesize,MS_SYNC); for (int l=1; l<=maxlev; l++) table[l]=0; // to avoid wrong free in ~lmtable() cerr << "running fclose...\n"; fclose(fd); cerr << "done\n"; lmtable::getDict()->incflag(0); lmtable::getDict()->genoovcode(); // saving header + dictionary strcpy(nameHeader,outfilename); strcat(nameHeader, "-header"); VERBOSE(2,"saving header+dictionary in " << nameHeader << "\n"); fstream out(nameHeader,ios::out); // print header if (isQtable) { out << "Qblmt" << (isInverted?"I ":" ") << maxlev; for (int i=1; i<=maxlev; i++) out << " " << maxsize[i]; // not cursize[i] because the file was already allocated out << "\nNumCenters"; for (int i=1; i<=maxlev; i++) out << " " << NumCenters[i]; out << "\n"; } else { out << "blmt" << (isInverted?"I ":" ") << maxlev; for (int i=1; i<=maxlev; i++) out << " " << maxsize[i]; // not cursize[i] because the file was already allocated out << "\n"; } lmtable::getDict()->save(out); out.close(); cerr << "done\n"; // cat header+dictionary and n-grams files: char cmd[BUFSIZ]; sprintf(cmd,"cat %s >> %s", nameNgrams, nameHeader); cerr << "run cmd <" << cmd << ">\n"; system(cmd); sprintf(cmd,"mv %s %s", nameHeader, outfilename); cerr << "run cmd <" << cmd << ">\n"; system(cmd); removefile(nameNgrams); //no more operations are available, the file must be saved! 
exit(0); return; } void lmtable::loadtxt_ram(istream& inp,const char* header) { //open input stream and prepare an input string char line[MAX_LINE]; //prepare word dictionary lmtable::getDict()->incflag(1); //check the header to decide if the LM is quantized or not isQtable=(strncmp(header,"qARPA",5)==0?true:false); //check the header to decide if the LM table is incomplete isItable=(strncmp(header,"iARPA",5)==0?true:false); //we will configure the table later when we will know the maxlev; bool yetconfigured=false; cerr << "loadtxt_ram()\n"; // READ ARPA Header int Order,n; while (inp.getline(line,MAX_LINE)) { if (strlen(line)==MAX_LINE-1) { cerr << "lmtable::loadtxt_ram: input line exceed MAXLINE (" << MAX_LINE << ") chars " << line << "\n"; exit(1); } bool backslash = (line[0] == '\\'); if (sscanf(line, "ngram %d=%d", &Order, &n) == 2) { maxsize[Order] = n; maxlev=Order; //update Order } if (maxlev>requiredMaxlev) maxlev=requiredMaxlev; if (backslash && sscanf(line, "\\%d-grams", &Order) == 1) { //at this point we are sure about the size of the LM if (!yetconfigured) { configure(maxlev,isQtable); yetconfigured=true; //allocate space for loading the table of this level for (int i=1; i<=maxlev; i++) table[i] = new char[(table_pos_t) maxsize[i] * nodesize(tbltype[i])]; } loadtxt_level(inp,Order); // cerr << "START print_table_stat level" << Order << "\n"; // printTable(Order); // cerr << "END print_table_stat level" << Order << "\n"; // now we can fix table at level Order - 1 if (maxlev>1 && Order>1) { // cerr << "before checkbounds START print_table_stat level" << Order-1 << "\n"; // printTable(Order-1); // cerr << "before checkbounds END print_table_stat level" << Order-1 << "\n"; checkbounds(Order-1); // cerr << "after checkbounds START print_table_stat level" << Order-1 << "\n"; // printTable(Order-1); // cerr << "after checkbounds END print_table_stat level" << Order-1 << "\n"; // delete startpos[Order-1]; } } } lmtable::getDict()->incflag(0); cerr << "done\n"; 
} void lmtable::loadtxt_level(istream& inp, int level) { cerr << level << "-grams: reading "; if (isQtable) { load_centers(inp,level); } //allocate support vector to manage badly ordered n-grams if (maxlev>1 && level1) { ing.invert(ng); ng=ing; } //if table is in incomplete ARPA format prob is just the //discounted frequency, so we need to add bow * Pr(n-1 gram) if (isItable && level>1) { //get bow of lower context get(ng,ng.size,ng.size-1); float rbow=0.0; if (ng.lev==ng.size-1) { //found context rbow=ng.bow; } int tmp=maxlev; maxlev=level-1; prob= log(exp((double)prob * M_LN10) + exp(((double)rbow + lprob(ng)) * M_LN10))/M_LN10; maxlev=tmp; } //insert an n-gram into the TRIE table if (isQtable) add(ng, (qfloat_t)prob, (qfloat_t)bow); else add(ng, prob, bow); } } cerr << "done level " << level << "\n"; } void lmtable::expand_level(int level, table_entry_pos_t size, const char* outfilename, int mmap) { // cerr << "expanding level: " << level << " with " << size << " entries ...\n"; if (mmap>0) expand_level_mmap(level, size, outfilename); else { expand_level_nommap(level, size); } } void lmtable::expand_level_mmap(int level, table_entry_pos_t size, const char* outfilename) { maxsize[level]=size; //getting the level-dependent filename char nameNgrams[BUFSIZ]; sprintf(nameNgrams,"%s-%dgrams",outfilename,level); // cerr << level << "-grams: creating level of size " << maxsize[level] << " in memory map on "<< nameNgrams << std::endl; //opening output file FILE *fd = NULL; fd = fopen(nameNgrams, "w+"); if (fd == NULL) { perror("Error opening file for writing"); exit(EXIT_FAILURE); } table_pos_t filesize=(table_pos_t) maxsize[level] * nodesize(tbltype[level]); // set the file to the proper size: ftruncate(fileno(fd),filesize); /* Now the file is ready to be mmapped. 
*/ table[level]=(char *)(MMap(fileno(fd),PROT_READ|PROT_WRITE,0,filesize,&tableGaps[level])); if (table[level] == MAP_FAILED) { fclose(fd); perror("Error mmapping the file"); exit(EXIT_FAILURE); } if (maxlev>1 && level1 && level0) // printEntryN=(printEntryN " << dict->decode(word(tbl)) << " bw:" << bw << " bnd:" << bnd << " " << start << " tb_offset:" << tb_offset[level+1] << "\n"; //cout << *(float *)&p << " " << word(tbl) << "\n"; tbl+=ndsz; } }else{ for (table_entry_pos_t c=0; c " << dict->decode(word(tbl)) << "\n"; //cout << *(float *)&p << " " << word(tbl) << "\n"; tbl+=ndsz; } } return; } //Checkbound with sorting of n-gram table on disk void lmtable::checkbounds(int level) { VERBOSE(2,"lmtable::checkbounds START Level:" << level << endl); if (getCurrentSize(level) > 0 ){ char* tbl=table[level]; char* succtbl=table[level+1]; LMT_TYPE ndt=tbltype[level]; LMT_TYPE succndt=tbltype[level+1]; int ndsz=nodesize(ndt); int succndsz=nodesize(succndt); //re-order table at level+1 on disk //generate random filename to avoid collisions std::string filePath; // ofstream out; mfstream out; createtempfile(out, filePath, ios::out|ios::binary); if (out.fail()) { perror("checkbound creating out on filePath"); exit(4); } table_entry_pos_t start,end,newend; table_entry_pos_t succ; //re-order table at level l+1 char* found; for (table_entry_pos_t c=0; c0) newend=boundwithoffset(found-ndsz,ndt,level); else newend=0; //if start==BOUND_EMPTY1 there are no successors for this entry if (start==BOUND_EMPTY1){ succ=0; } else{ assert(end>start); succ=end-start; } startpos[level][c]=newend; newend += succ; assert(newend<=cursize[level+1]); // cerr << "checkbound HERE c:" << c << endl; if (succ>0) { // cerr << "checkbound HERE2 c:" << c << endl; out.write((char*)(succtbl + (table_pos_t) start * succndsz),(table_pos_t) succ * succndsz); if (!out.good()) { std::cerr << " Something went wrong while writing temporary file " << filePath << " Maybe there is not enough space on this 
filesystem\n"; out.close(); exit(2); removefile(filePath); } } boundwithoffset(found,ndt,newend,level); } out.close(); if (out.fail()) { perror("error closing out"); exit(4); } fstream inp(filePath.c_str(),ios::in|ios::binary); if (inp.fail()) { perror("error opening inp"); exit(4); } inp.read(succtbl,(table_pos_t) cursize[level+1]*succndsz); inp.close(); if (inp.fail()) { perror("error closing inp"); exit(4); } removefile(filePath); } VERBOSE(2,"lmtable::checkbounds END Level:" << level << endl); } //Add method inserts n-grams in the table structure. It is ONLY used during //loading of LMs in text format. It searches for the prefix, then it adds the //suffix to the last level and updates the start-end positions. int lmtable::addwithoffset(ngram& ng, float iprob, float ibow) { char *found; LMT_TYPE ndt=tbltype[1]; //default initialization int ndsz=nodesize(ndt); //default initialization static int no_more_msg = 0; if (ng.size>1) { // find the prefix starting from the first level table_entry_pos_t start=0; table_entry_pos_t end=cursize[1]; table_entry_pos_t position; for (int l=1; l //int lmtable::add(ngram& ng, TA iprob,TB ibow) int lmtable::add(ngram& ng, float iprob, float ibow) { char *found; LMT_TYPE ndt=tbltype[1]; //default initialization int ndsz=nodesize(ndt); //default initialization static int no_more_msg = 0; if (ng.size>1) { // find the prefix starting from the first level table_entry_pos_t start=0; table_entry_pos_t end=cursize[1]; table_entry_pos_t position; for (int l=1; l=2) cout << "searching entry for codeword: " << ngp[0] << "..."; ***/ //assume 1-grams is a 1-1 map of the vocabulary //CHECK: explicit cast of n into float because table_pos_t could be unsigned and larger than MAXINT if (lev==1) return *found=(*ngp < (float) n ? 
table[1] + (table_pos_t)*ngp * sz:NULL); //prepare table to be searched with mybsearch char* tb; tb=table[lev] + (table_pos_t) offs * sz; //prepare search pattern char w[LMTCODESIZE]; putmem(w,ngp[0],0,LMTCODESIZE); table_entry_pos_t idx=0; // index returned by mybsearch *found=NULL; //initialize output variable totbsearch[lev]++; switch(action) { case LMT_FIND: // if (!tb || !mybsearch(tb,n,sz,(unsigned char *)w,&idx)) return NULL; if (!tb || !mybsearch(tb,n,sz,w,&idx)) { return NULL; } else { // return *found=tb + (idx * sz); return *found=tb + ((table_pos_t)idx * sz); } default: error((char*)"lmtable::search: this option is available"); }; return NULL; } /* returns idx with the first position in ar with entry >= key */ int lmtable::mybsearch(char *ar, table_entry_pos_t n, int size, char *key, table_entry_pos_t *idx) { if (n==0) return 0; *idx=0; register table_entry_pos_t low=0, high=n; register unsigned char *p; int result; #ifdef INTERP_SEARCH char *lp=NULL; char *hp=NULL; #endif for (unsigned int i=0;i=10000) { lp=(char *) (ar + (low * size)); if (codecmp((char *)key,lp)<0) { *idx=low; return 0; } hp=(char *) (ar + ((high-1) * size)); if (codecmp((char *)key,hp)>0) { *idx=high; return 0; } *idx= low + ((high-1)-low) * codediff((char *)key,lp)/codediff(hp,(char *)lp); } else #endif *idx = (low + high) / 2; //after redefining the interval there is no guarantee //that wlp <= wkey <= whigh p = (unsigned char *) (ar + (*idx * size)); result=codecmp((char *)key,(char *)p); if (result < 0) high = *idx; else if (result > 0) low = ++(*idx); else return 1; } *idx=low; return 0; } // generates a LM copy for a smaller dictionary void lmtable::cpsublm(lmtable* slmt, dictionary* subdict,bool keepunigr) { //keepunigr=false; //let slmt inherit all features of this lmtable slmt->configure(maxlev,isQtable); slmt->dict=new dictionary((keepunigr?dict:subdict),false); if (isQtable) { for (int i=1; i<=maxlev; i++) { slmt->NumCenters[i]=NumCenters[i]; slmt->Pcenters[i]=new float 
[NumCenters[i]]; memcpy(slmt->Pcenters[i],Pcenters[i],NumCenters[i] * sizeof(float)); if (iBcenters[i]=new float [NumCenters[i]]; memcpy(slmt->Bcenters[i],Bcenters[i],NumCenters[i] * sizeof(float)); } } } //manage dictionary information //generate OOV codes and build dictionary lookup table dict->genoovcode(); slmt->dict->genoovcode(); subdict->genoovcode(); int* lookup=new int [dict->size()]; for (int c=0; csize(); c++) { lookup[c]=subdict->encode(dict->decode(c)); if (c != dict->oovcode() && lookup[c] == subdict->oovcode()) lookup[c]=-1; // words of this->dict that are not in slmt->dict } //variables useful to navigate in the lmtable structure LMT_TYPE ndt,pndt; int ndsz,pndsz; char *entry, *newentry; table_entry_pos_t start, end, origin; for (int l=1; l<=maxlev; l++) { slmt->cursize[l]=0; slmt->table[l]=NULL; if (l==1) { //1-gram level ndt=tbltype[l]; ndsz=nodesize(ndt); for (table_entry_pos_t p=0; pcursize[l] % slmt->dict->size()) ==0) slmt->table[l]=(char *)realloc(slmt->table[l],((table_pos_t) slmt->cursize[l] + (table_pos_t) slmt->dict->size()) * ndsz); newentry=slmt->table[l] + (table_pos_t) slmt->cursize[l] * ndsz; memcpy(newentry,entry,ndsz); if (!keepunigr) //do not change encoding if keepunigr is true slmt->word(newentry,lookup[word(entry)]); if (lbound(newentry,ndt,p); //store in bound the entry itself (**) !!!! 
slmt->cursize[l]++; } } } else { //n-grams n>1: scan lower order table pndt=tbltype[l-1]; pndsz=nodesize(pndt); ndt=tbltype[l]; ndsz=nodesize(ndt); for (table_entry_pos_t p=0; pcursize[l-1]; p++) { //determine start and end of successors of this entry origin=slmt->bound(slmt->table[l-1] + (table_pos_t)p * pndsz,pndt); //position of n-1 gram in this table (**) if (origin == 0) start=0; //succ start at first pos in table[l] else start=bound(table[l-1] + (table_pos_t)(origin-1) * pndsz,pndt);//succ start after end of previous entry end=bound(table[l-1] + (table_pos_t)origin * pndsz,pndt); //succ end where indicated if (!keepunigr || lookup[word(table[l-1] + (table_pos_t)origin * pndsz)]!=-1) { while (start < end) { entry=table[l] + (table_pos_t) start * ndsz; if (lookup[word(entry)]!=-1) { if ((slmt->cursize[l] % slmt->dict->size()) ==0) slmt->table[l]=(char *)realloc(slmt->table[l],(table_pos_t) (slmt->cursize[l]+slmt->dict->size()) * ndsz); newentry=slmt->table[l] + (table_pos_t) slmt->cursize[l] * ndsz; memcpy(newentry,entry,ndsz); if (!keepunigr) //do not change encoding if keepunigr is true slmt->word(newentry,lookup[word(entry)]); if (lbound(newentry,ndt,start); //store in bound the entry itself!!!! 
slmt->cursize[l]++; } start++; } } //updated bound information of incoming entry slmt->bound(slmt->table[l-1] + (table_pos_t) p * pndsz, pndt,slmt->cursize[l]); } } } return; } // saves a LM table in text format void lmtable::savetxt(const char *filename) { fstream out(filename,ios::out); table_entry_pos_t cnt[1+MAX_NGRAM]; int l; // out.precision(7); out.precision(6); if (isQtable) { out << "qARPA " << maxlev; for (l=1; l<=maxlev; l++) out << " " << NumCenters[l]; out << endl; } ngram ng(lmtable::getDict(),0); cerr << "savetxt: " << filename << "\n"; if (isPruned) ngcnt(cnt); //check size of table by considering pruned n-grams out << "\n\\data\\\n"; char buff[100]; for (l=1; l<=maxlev; l++) { sprintf(buff,"ngram %2d=%10d\n",l,(isPruned?cnt[l]:cursize[l])); out << buff; //out << "ngram " << l << "= " << (isPruned?cnt[l]:cursize[l]) << "\n"; } out << "\n"; for (l=1; l<=maxlev; l++) { out << "\n\\" << l << "-grams:\n"; cerr << "save: " << (isPruned?cnt[l]:cursize[l]) << " " << l << "-grams\n"; if (isQtable) { out << NumCenters[l] << "\n"; for (int c=0; csave(out); for (int i=1; i<=maxlev; i++) { if (isQtable) { out.write((char*)Pcenters[i],NumCenters[i] * sizeof(float)); if (isave(out); } void lmtable::appendbin_level(int level, fstream &out, int mmap) { if (getCurrentSize(level) > 0 ){ if (mmap>0) appendbin_level_mmap(level, out); else { appendbin_level_nommap(level, out); } } } void lmtable::appendbin_level_nommap(int level, fstream &out) { VERBOSE(2,"lmtable:appendbin_level_nommap START Level:" << level << std::endl); /* if (isPruned){ cerr << "savebin_level (level " << level << "): pruned LM cannot be saved in binary form\n"; exit(0); } */ assert(level<=maxlev); // print header if (isQtable) { //NOT IMPLEMENTED } else { //do nothing } VERBOSE(3,"appending " << cursize[level] << " (maxsize:" << maxsize[level] << ") " << level << "-grams" << " table " << (void*) table << " table[level] " << (void*) table[level] << " out:" << (void*) out << endl); if (isQtable) { 
//NOT IMPLEMENTED } out.write(table[level],(table_pos_t) cursize[level]*nodesize(tbltype[level])); if (!out.good()) { perror("Something went wrong while writing"); out.close(); exit(2); } VERBOSE(2,"lmtable:appendbin_level_nommap END Level:" << level << std::endl); } void lmtable::appendbin_level_mmap(int level, fstream &out) { UNUSED(out); cerr << "appending " << level << " (Actually do nothing)" <0) savebin_level_mmap(level, outfilename); else { savebin_level_nommap(level, outfilename); } } void lmtable::savebin_level_nommap(int level, const char* outfilename) { VERBOSE(2,"lmtable:savebin_level_nommap START" << requiredMaxlev << std::endl); /* if (isPruned){ cerr << "savebin_level (level " << level << "): pruned LM cannot be saved in binary form\n"; exit(0); } */ assert(level<=maxlev); char nameNgrams[BUFSIZ]; sprintf(nameNgrams,"%s-%dgrams",outfilename,level); // mfstream out(nameNgrams, ios::out|ios::binary); fstream out(nameNgrams, ios::out|ios::binary); if (out.fail()){ // cerr << " out:" << (void*) *out << " cannot be opened" << endl; perror("cannot be opened"); exit(3); } // print header if (isQtable) { //NOT IMPLEMENTED } else { //do nothing } VERBOSE(3,"saving " << cursize[level] << "(maxsize:" << maxsize[level] << ") " << level << "-grams in " << nameNgrams << " table " << (void*) table << " table[level] " << (void*) table[level] << " out:" << (void*) out << endl); if (isQtable) { //NOT IMPLEMENTED } out.write(table[level],(table_pos_t) cursize[level]*nodesize(tbltype[level])); if (!out.good()) { std::cerr << " Something went wrong while writing temporary file " << nameNgrams << "\n"; out.close(); removefile(nameNgrams); exit(2); } out.close(); if (out.fail()){ // cerr << " out:" << (void*) *out << " cannot be opened" << endl; perror("cannot be closed"); exit(3); } VERBOSE(2,"lmtable:savebin_level_nommap END" << requiredMaxlev << std::endl); } void lmtable::savebin_level_mmap(int level, const char* outfilename) { char nameNgrams[BUFSIZ]; 
sprintf(nameNgrams,"%s-%dgrams",outfilename,level); VERBOSE(2,"saving " << level << "-grams probs in " << nameNgrams << " (Actually do nothing)" <> %s", fromnameNgrams, tonameNgrams); system(cmd); } //remove all single level files void lmtable::remove_all_levels(const char* filename){ //single level files should have a name derived from "filename" for (int i=1; i<=maxlevel(); i++) { remove_single_level(i,filename); } } //remove a single level file void lmtable::remove_single_level(int level, const char* filename){ //single level files should have a name derived from "filename" char nameNgrams[BUFSIZ]; sprintf(nameNgrams,"%s-%dgrams",filename,level); //removing temporary files removefile(nameNgrams); } //delete the table of a single level void lmtable::delete_level(int level, const char* outfilename, int mmap){ if (mmap>0) delete_level_mmap(level, outfilename); else { delete_level_nommap(level); } } void lmtable::delete_level_mmap(int level, const char* outfilename) { //getting the level-dependent filename char nameNgrams[BUFSIZ]; sprintf(nameNgrams,"%s-%dgrams",outfilename,level); //compute exact filesize table_pos_t filesize=(table_pos_t) cursize[level] * nodesize(tbltype[level]); // set the file to the proper size: Munmap(table[level]-tableGaps[level],(table_pos_t) filesize+tableGaps[level],0); maxsize[level]=cursize[level]=0; } void lmtable::delete_level_nommap(int level) { delete table[level]; maxsize[level]=cursize[level]=0; } void lmtable::compact_all_levels(const char* filename){ //single level files should have a name derived from "filename" for (int i=1; i<=maxlevel(); i++) { compact_single_level(i,filename); } } void lmtable::compact_single_level(int level, const char* filename) { char nameNgrams[BUFSIZ]; sprintf(nameNgrams,"%s-%dgrams",filename,level); VERBOSE(2,"concatenating " << level << "-grams probs from " << nameNgrams << " to " << filename<< std::endl); //concatenating of new table to the existing data char cmd[BUFSIZ]; sprintf(cmd,"cat %s >> %s", 
nameNgrams, filename); system(cmd); //removing temporary files removefile(nameNgrams); } void lmtable::resize_level(int level, const char* outfilename, int mmap) { if (getCurrentSize(level) > 0 ){ if (mmap>0) resize_level_mmap(level, outfilename); else { if (level> maxlev; //set the inverted falg to false, in order to rely on the header only isInverted=false; if (strncmp(header,"Qblmt",5)==0) { isQtable=true; if (strncmp(header,"QblmtI",6)==0) isInverted=true; } else if(strncmp(header,"blmt",4)==0) { isQtable=false; if (strncmp(header,"blmtI",5)==0) isInverted=true; } else error((char*)"loadbin: LM file is not in binary format"); configure(maxlev,isQtable); for (int l=1; l<=maxlev; l++) { inp >> cursize[l]; maxsize[l]=cursize[l]; } char header2[MAX_LINE]; if (isQtable) { inp >> header2; for (int i=1; i<=maxlev; i++) { inp >> NumCenters[i]; cerr << "reading " << NumCenters[i] << " centers\n"; } } inp.getline(header2, MAX_LINE); } //load codebook of level l void lmtable::loadbin_codebook(istream& inp,int l) { Pcenters[l]=new float [NumCenters[l]]; inp.read((char*)Pcenters[l],NumCenters[l] * sizeof(float)); if (lrequiredMaxlev) maxlev=requiredMaxlev; VERBOSE(3,"lmtable::maxlev:" << maxlev << std::endl); VERBOSE(3,"lmtable::requiredMaxlev" << requiredMaxlev << std::endl); //if MMAP is used, then open the file if (filename && mmap>0) { #ifdef WIN32 error("lmtable::loadbin mmap facility not yet supported under WIN32\n"); #else if (mmap <= maxlev) memmap=mmap; else error((char*)"keep_on_disk value is out of range\n"); if ((diskid=open(filename, O_RDONLY))<0) { std::cerr << "cannot open " << filename << "\n"; error((char*)"dying"); } //check that the LM is uncompressed char miniheader[4]; read(diskid,miniheader,4); if (strncmp(miniheader,"Qblm",4) && strncmp(miniheader,"blmt",4)) error((char*)"mmap functionality does not work with compressed binary LMs\n"); #endif } for (int l=1; l<=maxlev; l++) { loadbin_level(inp,l); } cerr << "done\n"; } //load only the dictionary of a 
binary lmfile void lmtable::loadbin_dict(istream& inp) { cerr << "lmtable::loadbin_dict()\n"; lmtable::getDict()->load(inp); cerr << "dict->size(): " << lmtable::getDict()->size() << "\n"; } //load ONE level of a binary lmfile void lmtable::loadbin_level(istream& inp, int level) { cerr << "loadbin_level (level " << level << ")\n"; if (isQtable) loadbin_codebook(inp,level); if ((memmap == 0) || (level < memmap)) { cerr << "loading " << cursize[level] << " " << level << "-grams\n"; table[level]=new char[(table_pos_t) cursize[level] * nodesize(tbltype[level])]; inp.read(table[level],(table_pos_t) cursize[level] * nodesize(tbltype[level])); } else { #ifdef WIN32 error((char*)"mmap not available under WIN32\n"); #else cerr << "mapping " << cursize[level] << " " << level << "-grams\n"; tableOffs[level]=inp.tellg(); table[level]=(char *)MMap(diskid,PROT_READ, tableOffs[level], (table_pos_t) cursize[level]*nodesize(tbltype[level]), &tableGaps[level]); table[level]+=(table_pos_t) tableGaps[level]; cerr << "tableOffs " << tableOffs[level] << " tableGaps" << tableGaps[level] << "-grams\n"; inp.seekg((table_pos_t) cursize[level]*nodesize(tbltype[level]),ios_base::cur); #endif } cerr << "done (level " << level <<")\n"; } int lmtable::get(ngram& ng,int n,int lev) { totget[lev]++; if (lev > maxlev) error((char*)"get: lev exceeds maxlevel"); if (n < lev) error((char*)"get: ngram is too small"); //set boudaries for 1-gram table_entry_pos_t offset=0,limit=cursize[1]; //information of table entries table_entry_pos_t hit; char* found; LMT_TYPE ndt; ng.link=NULL; ng.lev=0; for (int l=1; l<=lev; l++) { //initialize entry information hit = 0 ; found = NULL; ndt=tbltype[l]; #ifdef LMT_CACHE_ENABLE if (lmtcache[l] && lmtcache[l]->get(ng.wordp(n),found)) { hit=1; } else { search(l, offset, (limit-offset), nodesize(ndt), ng.wordp(n-l+1), LMT_FIND, &found); } //insert both found and not found items!!! //insert only not found items!!! 
if (lmtcache[l] && hit==0) { const char* found2=found; lmtcache[l]->add(ng.wordp(n),found2); } #else search(l, offset, (limit-offset), nodesize(ndt), ng.wordp(n-l+1), LMT_FIND, &found); #endif if (!found) return 0; float pr = prob(found,ndt); if (pr==NOPROB) return 0; //pruned n-gram ng.path[l]=found; //store path of found entries ng.bow=(l0) { *cacheout << sentence_id << " miss " << ng << " " << ng.link << "\n"; } #endif return 1; } //recursively prints the language model table void lmtable::dumplm(fstream& out,ngram ng, int ilev, int elev, table_entry_pos_t ipos,table_entry_pos_t epos) { LMT_TYPE ndt=tbltype[ilev]; ngram ing(ng.dict); int ndsz=nodesize(ndt); assert(ng.size==ilev-1); //Note that ipos and epos are always larger than or equal to 0 because they are unsigned int assert(epos<=cursize[ilev]); assert(ipos0?bound(table[ilev]+ (table_pos_t) (i-1) * ndsz,ndt):0); table_entry_pos_t esucc=bound(found,ndt); // cerr << "looking for successors for ng:" << ng << " cursize[ilev]:" << cursize[ilev] << " isucc:" << isucc << " esucc:" << esucc << endl; if (isucc < esucc) //there are successors! dumplm(out,ng,ilev+1,elev,isucc,esucc); } else { out << ipr <<"\t"; // if table is inverted then revert n-gram if (isInverted & ng.size>1) { ing.invert(ng); for (int k=ing.size; k>=1; k--) { if (kdecode(*ing.wordp(k)); } } else { for (int k=ng.size; k>=1; k--) { if (kdecode(*ng.wordp(k)); } } if (ilevUPPER_SINGLE_PRECISION_OF_0 || ibo<-UPPER_SINGLE_PRECISION_OF_0)) out << "\t" << ibo; } } out << "\n"; } } } //succscan iteratively returns all successors of an ngram h for which //get(h,h.size,h.size) returned true. 
int lmtable::succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev) { assert(lev==h.lev+1 && h.size==lev && lev<=maxlev); LMT_TYPE ndt=tbltype[h.lev]; int ndsz=nodesize(ndt); table_entry_pos_t offset; switch (action) { case LMT_INIT: //reset ngram local indexes ng.size=lev; ng.trans(h); //get number of successors of h ng.midx[lev]=0; offset=(h.link>table[h.lev]?bound(h.link-ndsz,ndt):0); h.succ=bound(h.link,ndt)-offset; h.succlink=table[lev]+(table_pos_t) offset * nodesize(tbltype[lev]); return 1; case LMT_CONT: if (ng.midx[lev] < h.succ) { //put current word into ng *ng.wordp(1)=word(h.succlink+(table_pos_t) ng.midx[lev]*nodesize(tbltype[lev])); ng.midx[lev]++; return 1; } else return 0; default: cerr << "succscan: only permitted options are LMT_INIT and LMT_CONT\n"; exit(0); } } //maxsuffptr returns the largest suffix of an n-gram that is contained //in the LM table. This can be used as a compact representation of the //(n-1)-gram state of a n-gram LM. if the input k-gram has k>=n then it //is trimmed to its n-1 suffix. //non recursive version const char *lmtable::maxsuffptr(ngram ong, unsigned int* size) { // cerr << "lmtable::maxsuffptr\n"; // cerr << "ong: " << ong << " -> ong.size: " << ong.size << "\n"; if (ong.size==0) { if (size!=NULL) *size=0; return (char*) NULL; } if (isInverted) { if (ong.size>maxlev) ong.size=maxlev; //if larger mthan maxlen reduce size ngram ing=ong; //inverted ngram ing.invert(ong); //cout << "ngram:" << ing << "\n"; get(ing,ing.size,ing.size); // dig in the trie if (ing.lev > 0) { //found something? unsigned int isize = MIN(ing.lev,(ing.size-1)); //find largest n-1 gram suffix if (size!=NULL) *size=isize; return ing.path[isize]; } else { // means a real unknown word! if (size!=NULL) *size=0; //default statesize for zero-gram! return NULL; //default stateptr for zero-gram! 
} } else { if (ong.size>0) ong.size--; //always reduced by 1 word if (ong.size>=maxlev) ong.size=maxlev-1; //if still larger or equals to maxlen reduce again if (size!=NULL) *size=ong.size; //will return the largest found ong.size for (ngram ng=ong; ng.size>0; ng.size--) { if (get(ng,ng.size,ng.size)) { if (ng.succ==0) (*size)--; if (size!=NULL) *size=ng.size; return ng.link; } } if (size!=NULL) *size=0; return NULL; } } const char *lmtable::cmaxsuffptr(ngram ong, unsigned int* size) { //cerr << "lmtable::CMAXsuffptr\n"; //cerr << "ong: " << ong // << " -> ong.size: " << ong.size << "\n"; if (size!=NULL) *size=ong.size; //will return the largest found ong.size if (ong.size==0) return (char*) NULL; char* found; unsigned int isize; //internal state size variable #ifdef PS_CACHE_ENABLE prob_and_state_t pst; size_t orisize=ong.size; if (ong.size>=maxlev) ong.size=maxlev-1; //cache hit if (prob_and_state_cache && (ong.size==maxlev-1) && prob_and_state_cache->get(ong.wordp(maxlev-1),pst)) { *size=pst.statesize; return pst.state; } ong.size = orisize; #endif //cache miss found=(char *)maxsuffptr(ong,&isize); #ifdef PS_CACHE_ENABLE //cache insert if (ong.size>=maxlev) ong.size=maxlev-1; if (prob_and_state_cache && ong.size==maxlev-1) { pst.state=found; pst.statesize=isize; prob_and_state_cache->add(ong.wordp(maxlev-1),pst); } #endif if (size!=NULL) *size=isize; return found; } //returns log10prob of n-gram //bow: backoff weight //bol: backoff level //additional infos related to use in Moses: //maxsuffptr: recombination state after the LM call //statesize: lenght of the recombination state //extensible: true if the deepest found ngram has successors //lastbow: bow of the deepest found ngram //non recursive version, also includes maxsuffptr double lmtable::lprob(ngram ong,double* bow, int* bol, char** maxsuffptr,unsigned int* statesize, bool* extendible, double *lastbow) { VERBOSE(3," lmtable::lprob(ngram) ong " << ong << "\n"); if (ong.size==0) return 0.0; //sanity check if 
(ong.size>maxlev) ong.size=maxlev; //adjust n-gram level to table size if (bow) *bow=0; //initialize back-off weight if (bol) *bol=0; //initialize bock-off level double rbow=0,lpr=0; //output back-off weight and logprob float ibow,iprob; //internal back-off weight and logprob if (isInverted) { ngram ing=ong; //Inverted ngram TRIE ing.invert(ong); get(ing,ing.size,ing.size); // dig in the trie if (ing.lev >0) { //found something? iprob=ing.prob; lpr = (double)(isQtable?Pcenters[ing.lev][(qfloat_t)iprob]:iprob); if (*ong.wordp(1)==dict->oovcode()) lpr-=logOOVpenalty; //add OOV penalty if (statesize) *statesize=MIN(ing.lev,(ing.size-1)); //find largest n-1 gram suffix if (maxsuffptr) *maxsuffptr=ing.path[MIN(ing.lev,(ing.size-1))]; if (extendible) *extendible=succrange(ing.path[ing.lev],ing.lev)>0; if (lastbow) *lastbow=(double) (isQtable?Bcenters[ing.lev][(qfloat_t)ing.bow]:ing.bow); } else { // means a real unknown word! lpr=-log(UNIGRAM_RESOLUTION)/M_LN10; if (statesize) *statesize=0; //default statesize for zero-gram! if (maxsuffptr) *maxsuffptr=NULL; //default stateptr for zero-gram! } if (ing.lev < ing.size) { //compute backoff weight int depth=(ing.lev>0?ing.lev:1); //ing.lev=0 (real unknown word) is still a 1-gram if (bol) *bol=ing.size-depth; ing.size--; //get n-gram context get(ing,ing.size,ing.size); // dig in the trie if (ing.lev>0) { //found something? 
//collect back-off weights for (int l=depth; l<=ing.lev; l++) { //start from first back-off level assert(ing.path[l]!=NULL); //check consistency of table ibow=this->bow(ing.path[l],tbltype[l]); rbow+= (double) (isQtable?Bcenters[l][(qfloat_t)ibow]:ibow); //avoids bad quantization of bow of // if (isQtable && (*ing.wordp(1)==dict->oovcode())) { if (isQtable && (*ing.wordp(ing.size)==dict->oovcode())) { rbow-=(double)Bcenters[l][(qfloat_t)ibow]; } } } } if (bow) (*bow)=rbow; return rbow + lpr; } //Direct ngram TRIE else { assert(extendible==NULL && lastbow==NULL); for (ngram ng=ong; ng.size>0; ng.size--) { if (get(ng,ng.size,ng.size)) { iprob=ng.prob; lpr = (double)(isQtable?Pcenters[ng.size][(qfloat_t)iprob]:iprob); if (*ng.wordp(1)==dict->oovcode()) lpr-=logOOVpenalty; //add OOV penalty if (maxsuffptr || statesize) { //one extra step is needed if ng.size=ong.size if (ong.size==ng.size) { ng.size--; get(ng,ng.size,ng.size); } if (statesize) *statesize=ng.size; if (maxsuffptr) *maxsuffptr=ng.link; //we should check ng.link != NULL } return rbow+lpr; } else { if (ng.size==1) { //means a real unknow word! if (maxsuffptr) *maxsuffptr=NULL; //default stateptr for zero-gram! if (statesize) *statesize=0; return rbow -log(UNIGRAM_RESOLUTION)/M_LN10; } else { //compute backoff if (bol) (*bol)++; //increase backoff level if (ng.lev==(ng.size-1)) { //if search stopped at previous level ibow=ng.bow; rbow+= (double) (isQtable?Bcenters[ng.lev][(qfloat_t)ibow]:ibow); //avoids bad quantization of bow of if (isQtable && (*ng.wordp(2)==dict->oovcode())) { rbow-=(double)Bcenters[ng.lev][(qfloat_t)ibow]; } } if (bow) (*bow)=rbow; } } } } assert(0); //never pass here!!! 
return 1.0; } //return log10 probsL use cache memory double lmtable::clprob(ngram ong,double* bow, int* bol, char** state,unsigned int* statesize,bool* extendible) { VERBOSE(3," lmtable::clprob(ngram), parameter = <" << ong << ">\n"); #ifdef TRACE_CACHELM if (probcache && ong.size==maxlev && sentence_id>0) { *cacheout << sentence_id << " " << ong << "\n"; } #endif if (ong.size==0) { if (statesize!=NULL) *statesize=0; if (state!=NULL) *state=NULL; if (extendible!=NULL) *extendible=false; return 0.0; } if (ong.size>maxlev) ong.size=maxlev; //adjust n-gram level to table size #ifdef PS_CACHE_ENABLE double logpr = 0.0; //cache hit prob_and_state_t pst_get; if (prob_and_state_cache && ong.size==maxlev && prob_and_state_cache->get(ong.wordp(maxlev),pst_get)) { logpr=pst_get.logpr; if (bow) *bow = pst_get.bow; if (bol) *bol = pst_get.bol; if (state) *state = pst_get.state; if (statesize) *statesize = pst_get.statesize; if (extendible) *extendible = pst_get.extendible; return logpr; } //cache miss prob_and_state_t pst_add; logpr = pst_add.logpr = lmtable::lprob(ong, &(pst_add.bow), &(pst_add.bol), &(pst_add.state), &(pst_add.statesize), &(pst_add.extendible)); if (bow) *bow = pst_add.bow; if (bol) *bol = pst_add.bol; if (state) *state = pst_add.state; if (statesize) *statesize = pst_add.statesize; if (extendible) *extendible = pst_add.extendible; if (prob_and_state_cache && ong.size==maxlev) { prob_and_state_cache->add(ong.wordp(maxlev),pst_add); } return logpr; #else return lmtable::lprob(ong, bow, bol, state, statesize, extendible); #endif }; //return log10 probsL use cache memory //this functions simulates the clprob(ngram, ...) 
but it takes as input an array of codes instead of the ngram double lmtable::clprob(int* codes, int sz, double* bow, int* bol, char** state,unsigned int* statesize,bool* extendible) { VERBOSE(3," lmmacro::clprob(int*, int,...)\n"); #ifdef TRACE_CACHELM if (probcache && sz==maxlev && sentence_id>0) { *cacheout << sentence_id << "\n"; //print the codes of the vector ng } #endif if (sz==0) { if (statesize!=NULL) *statesize=0; if (state!=NULL) *state=NULL; if (extendible!=NULL) *extendible=false; return 0.0; } if (sz>maxlev) sz=maxlev; //adjust n-gram level to table size double logpr = 0.0; #ifdef PS_CACHE_ENABLE //cache hit prob_and_state_t pst_get; if (prob_and_state_cache && sz==maxlev && prob_and_state_cache->get(codes,pst_get)) { logpr=pst_get.logpr; if (bow) *bow = pst_get.bow; if (bol) *bol = pst_get.bol; if (state) *state = pst_get.state; if (statesize) *statesize = pst_get.statesize; if (extendible) *extendible = pst_get.extendible; return logpr; } //create the actual ngram ngram ong(dict); ong.pushc(codes,sz); assert (ong.size == sz); //cache miss prob_and_state_t pst_add; logpr = pst_add.logpr = lmtable::lprob(ong, &(pst_add.bow), &(pst_add.bol), &(pst_add.state), &(pst_add.statesize), &(pst_add.extendible)); if (bow) *bow = pst_add.bow; if (bol) *bol = pst_add.bol; if (state) *state = pst_add.state; if (statesize) *statesize = pst_add.statesize; if (extendible) *extendible = pst_add.extendible; if (prob_and_state_cache && ong.size==maxlev) { prob_and_state_cache->add(ong.wordp(maxlev),pst_add); } return logpr; #else //create the actual ngram ngram ong(dict); ong.pushc(codes,sz); assert (ong.size == sz); logpr = lmtable::lprob(ong, bow, bol, state, statesize, extendible); return logpr; #endif }; int lmtable::succrange(node ndp,int level,table_entry_pos_t* isucc,table_entry_pos_t* esucc) { table_entry_pos_t first,last; LMT_TYPE ndt=tbltype[level]; //get table boundaries for next level if (leveltable[level]? 
bound(ndp-nodesize(ndt), ndt) : 0; last = bound(ndp, ndt); } else { first=last=0; } if (isucc) *isucc=first; if (esucc) *esucc=last; return last-first; } void lmtable::stat(int level) { table_pos_t totmem=0,memory; float mega=1024 * 1024; cout.precision(2); cout << "lmtable class statistics\n"; cout << "levels " << maxlev << "\n"; for (int l=1; l<=maxlev; l++) { memory=(table_pos_t) cursize[l] * nodesize(tbltype[l]); cout << "lev " << l << " entries "<< cursize[l] << " used mem " << memory/mega << "Mb\n"; totmem+=memory; } cout << "total allocated mem " << totmem/mega << "Mb\n"; cout << "total number of get and binary search calls\n"; for (int l=1; l<=maxlev; l++) { cout << "level " << l << " get: " << totget[l] << " bsearch: " << totbsearch[l] << "\n"; } if (level >1 ) lmtable::getDict()->stat(); } void lmtable::reset_mmap() { #ifndef WIN32 if (memmap>0 and memmap<=maxlev) for (int l=memmap; l<=maxlev; l++) { //std::cerr << "resetting mmap at level:" << l << "\n"; Munmap(table[l]-tableGaps[l],(table_pos_t) cursize[l]*nodesize(tbltype[l])+tableGaps[l],0); table[l]=(char *)MMap(diskid,PROT_READ, tableOffs[l], (table_pos_t)cursize[l]*nodesize(tbltype[l]), &tableGaps[l]); table[l]+=(table_pos_t)tableGaps[l]; } #endif } // ng: input n-gram // *lk: prob of n-(*bol) gram // *boff: backoff weight vector // *bol: backoff level double lmtable::lprobx(ngram ong, double *lkp, double *bop, int *bol) { double bo, lbo, pr; float ipr; //int ipr; ngram ng(dict), ctx(dict); if(bol) *bol=0; if(ong.size==0) { if(lkp) *lkp=0; return 0; // lprob ritorna 0, prima lprobx usava LOGZERO } if(ong.size>maxlev) ong.size=maxlev; ctx = ng = ong; bo=0; ctx.shift(); while(!get(ng)) { // back-off //OOV not included in dictionary if(ng.size==1) { pr = -log(UNIGRAM_RESOLUTION)/M_LN10; if(lkp) *lkp=pr; // this is the innermost probability pr += bo; //add all the accumulated back-off probability return pr; } // backoff-probability lbo = 0.0; //local back-off: default is logprob 0 if(get(ctx)) { //this 
can be replaced with (ng.lev==(ng.size-1)) ipr = ctx.bow; lbo = isQtable?Bcenters[ng.size][(qfloat_t)ipr]:ipr; //lbo = isQtable?Bcenters[ng.size][ipr]:*(float*)&ipr; } if(bop) *bop++=lbo; if(bol) ++*bol; bo += lbo; ng.size--; ctx.size--; } ipr = ng.prob; pr = isQtable?Pcenters[ng.size][(qfloat_t)ipr]:ipr; //pr = isQtable?Pcenters[ng.size][ipr]:*((float*)&ipr); if(lkp) *lkp=pr; pr += bo; return pr; } // FABIO table_entry_pos_t lmtable::wdprune(float *thr, int aflag) { //this function implements a method similar to the "Weighted Difference Method" //described in "Scalable Backoff Language Models" by Kristie Seymore and Ronald Rosenfeld int l; ngram ng(lmtable::getDict(),0); isPruned=true; //the table now might contain pruned n-grams ng.size=0; double tlk, bo, ts, tbs; tlk = bo = ts = tbs = 0; for(l=2; l<=maxlev; l++) wdprune(thr, aflag, ng, 1, l, 0, cursize[1]); return 0; } // FABIO: LM pruning method table_entry_pos_t lmtable::wdprune(float *thr, int aflag, ngram ng, int ilev, int elev, table_entry_pos_t ipos, table_entry_pos_t epos, double tlk, double bo, double *ts, double *tbs) { LMT_TYPE ndt=tbltype[ilev]; int ndsz=nodesize(ndt); char *ndp; float lk; float ipr, ibo; //int ipr, ibo; table_entry_pos_t i, k, nk; assert(ng.size==ilev-1); //Note that ipos and epos are always larger than or equal to 0 because they are unsigned int assert(epos<=cursize[ilev] && iposgetcode(BOS_))) { //the n-gram starts with the sentence start symbol //do not consider is actual probability because it is not reliable (its frequency is manually set) ipr = 0.0; } lk = ipr; if(ilev0 ? 
bound(ndp-ndsz, ndt) : 0; //table_entry_pos_t esucc = bound(ndp, ndt); if(isucc>=esucc) continue; // no successors //look for n-grams to be pruned with this context (see //back-off weight) prune: double nextlevel_ts=0, nextlevel_tbs=0; k = wdprune(thr, aflag, ng, ilev+1, elev, isucc, esucc, tlk+lk, bo, &nextlevel_ts, &nextlevel_tbs); //k is the number of pruned n-grams with this context if(ilev!=elev-1) continue; if(nextlevel_ts>=1 || nextlevel_tbs>=1) { cerr << "ng: " << ng <<" nextlevel_ts=" << nextlevel_ts <<" nextlevel_tbs=" << nextlevel_tbs <<" k=" << k <<" ns=" << esucc-isucc << "\n"; if(nextlevel_ts>=1) { pscale(ilev+1, isucc, esucc, 0.999999/nextlevel_ts); goto prune; } } // adjusts backoff: // 1-sum_succ(pr(w|ng)) / 1-sum_succ(pr(w|bng)) bo = log((1-nextlevel_ts)/(1-nextlevel_tbs))/M_LN10; ibo=(float)bo; bow(ndp, ndt, ibo); } else { //we are at the highest level //get probability of lower order n-gram ngram bng = ng; bng.size--; double blk = lprob(bng); double wd = pow(10., tlk+lk) * (lk-bo-blk); if(aflag&&wd<0) wd=-wd; if(wd > thr[elev-1]) { // kept *ts += pow(10., lk); *tbs += pow(10., blk); } else { // discarded ++nk; prob(ndp, ndt, NOPROB); } } } return nk; } int lmtable::pscale(int lev, table_entry_pos_t ipos, table_entry_pos_t epos, double s) { LMT_TYPE ndt=tbltype[lev]; int ndsz=nodesize(ndt); char *ndp; float ipr; s=log(s)/M_LN10; ndp = table[lev]+ (table_pos_t) ipos*ndsz; for(table_entry_pos_t i=ipos; i #include #endif #include #include #include #include #include #include "util.h" #include "ngramcache.h" #include "dictionary.h" #include "n_gram.h" #include "lmContainer.h" #define MAX(a,b) (((a)>(b))?(a):(b)) #define MIN(a,b) (((a)<(b))?(a):(b)) #define LMTMAXLEV 20 #define MAX_LINE 100000 #ifndef LMTCODESIZE #define LMTCODESIZE (int)3 #endif #define SHORTSIZE (int)2 #define PTRSIZE (int)sizeof(char *) #define INTSIZE (int)4 #define CHARSIZE (int)1 #define PROBSIZE (int)4 //use float #define QPROBSIZE (int)1 //use qfloat_t //#define BOUNDSIZE 
(int)4 //use table_pos_t #define BOUNDSIZE (int)sizeof(table_entry_pos_t) //use table_pos_t #define UNIGRAM_RESOLUTION 10000000.0 typedef enum {INTERNAL,QINTERNAL,LEAF,QLEAF} LMT_TYPE; //typedef enum {BINARY,TEXT,YRANIB,NONE} OUTFILE_TYPE; typedef char* node; typedef enum {LMT_FIND, //!< search: find an entry LMT_ENTER, //!< search: enter an entry LMT_INIT, //!< scan: start scan LMT_CONT //!< scan: continue scan } LMT_ACTION; typedef unsigned int table_entry_pos_t; //type for pointing to a full ngram in the table typedef unsigned long table_pos_t; // type for pointing to a single char in the table typedef unsigned char qfloat_t; //type for quantized probabilities //CHECK this part to HERE #define BOUND_EMPTY1 (numeric_limits::max() - 2) #define BOUND_EMPTY2 (numeric_limits::max() - 1) class lmtable: public lmContainer { static const bool debug=true; void loadtxt(std::istream& inp,const char* header,const char* filename,int mmap); void loadtxt_ram(std::istream& inp,const char* header); void loadtxt_mmap(std::istream& inp,const char* header,const char* outfilename); void loadtxt_level(std::istream& inp,int l); void loadbin(std::istream& inp,const char* header,const char* filename,int mmap); void loadbin_header(std::istream& inp, const char* header); void loadbin_dict(std::istream& inp); void loadbin_codebook(std::istream& inp,int l); void loadbin_level(std::istream& inp,int l); protected: char* table[LMTMAXLEV+1]; //storage of all levels LMT_TYPE tbltype[LMTMAXLEV+1]; //table type for each levels table_entry_pos_t cursize[LMTMAXLEV+1]; //current size of levels //current offset for in-memory tables (different for each level //needed to manage partial tables // mempos = diskpos - offset[level] table_entry_pos_t tb_offset[LMTMAXLEV+1]; table_entry_pos_t maxsize[LMTMAXLEV+1]; //max size of levels table_entry_pos_t* startpos[LMTMAXLEV+1]; //support vector to store start positions char info[100]; //information put in the header //statistics int totget[LMTMAXLEV+1]; int 
totbsearch[LMTMAXLEV+1]; //probability quantization bool isQtable; //Incomplete LM table from distributed training bool isItable; //Table with reverted n-grams for fast access bool isInverted; //Table might contain pruned n-grams bool isPruned; int NumCenters[LMTMAXLEV+1]; float* Pcenters[LMTMAXLEV+1]; float* Bcenters[LMTMAXLEV+1]; double logOOVpenalty; //penalty for OOV words (default 0) int dictionary_upperbound; //set by user int backoff_state; //improve access speed int max_cache_lev; NGRAMCACHE_t* prob_and_state_cache; NGRAMCACHE_t* lmtcache[LMTMAXLEV+1]; float ngramcache_load_factor; float dictionary_load_factor; //memory map on disk int memmap; //level from which n-grams are accessed via mmap int diskid; off_t tableOffs[LMTMAXLEV+1]; off_t tableGaps[LMTMAXLEV+1]; // is this LM queried for knowing the matching order or (standard // case) for score? bool orderQuery; //flag to enable/disable deletion of dict in the destructor bool delete_dict; public: #ifdef TRACE_CACHELM std::fstream* cacheout; int sentence_id; #endif dictionary *dict; // dictionary (words - macro tags) lmtable(float nlf=0.0, float dlfi=0.0); virtual ~lmtable(); table_entry_pos_t wdprune(float *thr, int aflag=0); table_entry_pos_t wdprune(float *thr, int aflag, ngram ng, int ilev, int elev, table_entry_pos_t ipos, table_entry_pos_t epos, double lk=0, double bo=0, double *ts=0, double *tbs=0); double lprobx(ngram ong, double *lkp=0, double *bop=0, int *bol=0); table_entry_pos_t ngcnt(table_entry_pos_t *cnt); table_entry_pos_t ngcnt(table_entry_pos_t *cnt, ngram ng, int l, table_entry_pos_t ipos, table_entry_pos_t epos); int pscale(int lev, table_entry_pos_t ipos, table_entry_pos_t epos, double s); void init_prob_and_state_cache(); void init_probcache() { init_prob_and_state_cache(); }; //kept for back compatibility void init_statecache() {}; //kept for back compatibility void init_lmtcaches(int uptolev); void init_caches(int uptolev); void used_prob_and_state_cache(); void used_lmtcaches(); 
void used_caches(); void delete_prob_and_state_cache(); void delete_probcache() { delete_prob_and_state_cache(); }; //kept for back compatibility void delete_statecache() {}; //kept for back compatibility void delete_lmtcaches(); void delete_caches(); void check_prob_and_state_cache_levels(); void check_probcache_levels() { check_prob_and_state_cache_levels(); }; //kept for back compatibility void check_statecache_levels() {}; //kept for back compatibility void check_lmtcaches_levels(); void check_caches_levels(); void reset_prob_and_state_cache(); void reset_probcache() { reset_prob_and_state_cache(); }; //kept for back compatibility void reset_statecache() {}; //kept for back compatibility void reset_lmtcaches(); void reset_caches(); bool are_prob_and_state_cache_active(); bool is_probcache_active() { return are_prob_and_state_cache_active(); }; //kept for back compatibility bool is_statecache_active() { return are_prob_and_state_cache_active(); }; //kept for back compatibility bool are_lmtcaches_active(); bool are_caches_active(); void reset_mmap(); //set the inverted flag to load ngrams in an inverted order //this choice is disregarded if a binary LM is loaded, //because the info is stored into the header bool is_inverted(const bool flag) { return isInverted=flag; } bool is_inverted() { return isInverted; } void configure(int n,bool quantized); //set penalty for OOV words double getlogOOVpenalty() const { return logOOVpenalty; } double setlogOOVpenalty(int dub) { assert(dub > dict->size()); dictionary_upperbound = dub; return logOOVpenalty=log((double)(dictionary_upperbound - dict->size()))/M_LN10; } double setlogOOVpenalty(double oovp) { return logOOVpenalty=oovp; } virtual int maxlevel() const { return maxlev; }; bool isQuantized() const { return isQtable; } void savetxt(const char *filename); void savebin(const char *filename); void appendbin_level(int level, fstream &out, int mmap); void appendbin_level_nommap(int level, fstream &out); void 
appendbin_level_mmap(int level, fstream &out); void savebin_level(int level, const char* filename, int mmap); void savebin_level_nommap(int level, const char* filename); void savebin_level_mmap(int level, const char* filename); void savebin_dict(std::fstream& out); void compact_all_levels(const char* filename); void compact_single_level(int level, const char* filename); void concatenate_all_levels(const char* fromfilename, const char* tofilename); void concatenate_single_level(int level, const char* fromfilename, const char* tofilename); void remove_all_levels(const char* filename); void remove_single_level(int level, const char* filename); void print_table_stat(); void print_table_stat(int level); void dumplm(std::fstream& out,ngram ng, int ilev, int elev, table_entry_pos_t ipos,table_entry_pos_t epos); void delete_level(int level, const char* outfilename, int mmap); void delete_level_nommap(int level); void delete_level_mmap(int level, const char* filename); void resize_level(int level, const char* outfilename, int mmap); void resize_level_nommap(int level); void resize_level_mmap(int level, const char* filename); inline void update_offset(int level, table_entry_pos_t value) { tb_offset[level]=value; }; void load(const std::string filename, int mmap=0); void load(std::istream& inp,const char* filename=NULL,const char* outfilename=NULL,int mmap=0,OUTFILE_TYPE outtype=NONE); void load_centers(std::istream& inp,int l); void expand_level(int level, table_entry_pos_t size, const char* outfilename, int mmap); void expand_level_nommap(int level, table_entry_pos_t size); void expand_level_mmap(int level, table_entry_pos_t size, const char* outfilename); void cpsublm(lmtable* sublmt, dictionary* subdict,bool keepunigr=true); int reload(std::set words); void filter(const char* /* unused parameter: lmfile */) {}; virtual double lprob(ngram ng, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL, bool* extendible=NULL, double* lastbow=NULL); 
virtual double clprob(ngram ng, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL); virtual double clprob(int* ng, int ngsize, double* bow=NULL,int* bol=NULL,char** maxsuffptr=NULL,unsigned int* statesize=NULL,bool* extendible=NULL); void *search(int lev,table_entry_pos_t offs,table_entry_pos_t n,int sz,int *w, LMT_ACTION action,char **found=(char **)NULL); int mybsearch(char *ar, table_entry_pos_t n, int size, char *key, table_entry_pos_t *idx); int add(ngram& ng, float prob,float bow); //template int add(ngram& ng, TA prob,TB bow); int addwithoffset(ngram& ng, float prob,float bow); // template int addwithoffset(ngram& ng, TA prob,TB bow); void checkbounds(int level); inline int get(ngram& ng) { return get(ng,ng.size,ng.size); } int get(ngram& ng,int n,int lev); int succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev); virtual const char *maxsuffptr(ngram ong, unsigned int* size=NULL); virtual const char *cmaxsuffptr(ngram ong, unsigned int* size=NULL); inline void putmem(char* ptr,int value,int offs,int size) { assert(ptr!=NULL); for (int i=0; i> (8 * i)) & 0xff; }; inline void getmem(char* ptr,int* value,int offs,int size) { assert(ptr!=NULL); *value=ptr[offs] & 0xff; for (int i=1; i inline void putmem(char* ptr,T value,int offs) { assert(ptr!=NULL); memcpy(ptr+offs, &value, sizeof(T)); }; template inline void getmem(char* ptr,T* value,int offs) { assert(ptr!=NULL); memcpy((void*)value, ptr+offs, sizeof(T)); }; int nodesize(LMT_TYPE ndt) { switch (ndt) { case INTERNAL: return LMTCODESIZE + PROBSIZE + PROBSIZE + BOUNDSIZE; case QINTERNAL: return LMTCODESIZE + QPROBSIZE + QPROBSIZE + BOUNDSIZE; case LEAF: return LMTCODESIZE + PROBSIZE; case QLEAF: return LMTCODESIZE + QPROBSIZE; default: assert(0); return 0; } } inline int word(node nd,int value=-1) { int offset=0; if (value==-1) getmem(nd,&value,offset,LMTCODESIZE); else putmem(nd,value,offset,LMTCODESIZE); return value; }; int codecmp(node a,node b) { 
register int i,result; for (i=(LMTCODESIZE-1); i>=0; i--) { result=(unsigned char)a[i]-(unsigned char)b[i]; if(result) return result; } return 0; }; int codediff(node a,node b) { return word(a)-word(b); }; inline float prob(node nd,LMT_TYPE ndt) { int offs=LMTCODESIZE; float fv; unsigned char cv; switch (ndt) { case INTERNAL: getmem(nd,&fv,offs); return fv; case QINTERNAL: getmem(nd,&cv,offs); return (float) cv; case LEAF: getmem(nd,&fv,offs); return fv; case QLEAF: getmem(nd,&cv,offs); return (float) cv; default: assert(0); return 0; } }; template inline T prob(node nd, LMT_TYPE ndt, T value) { int offs=LMTCODESIZE; switch (ndt) { case INTERNAL: putmem(nd, value,offs); break; case QINTERNAL: putmem(nd,(unsigned char) value,offs); break; case LEAF: putmem(nd, value,offs); break; case QLEAF: putmem(nd,(unsigned char) value,offs); break; default: assert(0); return (T) 0; } return value; }; inline float bow(node nd,LMT_TYPE ndt) { int offs=LMTCODESIZE+(ndt==QINTERNAL?QPROBSIZE:PROBSIZE); float fv; unsigned char cv; switch (ndt) { case INTERNAL: getmem(nd,&fv,offs); return fv; case QINTERNAL: getmem(nd,&cv,offs); return (float) cv; case LEAF: getmem(nd,&fv,offs); return fv; case QLEAF: getmem(nd,&cv,offs); return (float) cv; default: assert(0); return 0; } }; template inline T bow(node nd,LMT_TYPE ndt, T value) { int offs=LMTCODESIZE+(ndt==QINTERNAL?QPROBSIZE:PROBSIZE); switch (ndt) { case INTERNAL: putmem(nd, value,offs); break; case QINTERNAL: putmem(nd,(unsigned char) value,offs); break; case LEAF: putmem(nd, value,offs); break; case QLEAF: putmem(nd,(unsigned char) value,offs); break; default: assert(0); return 0; } return value; }; inline table_entry_pos_t boundwithoffset(node nd,LMT_TYPE ndt, int level){ return bound(nd,ndt) - tb_offset[level+1]; } inline table_entry_pos_t boundwithoffset(node nd,LMT_TYPE ndt, table_entry_pos_t value, int level){ return bound(nd, ndt, value + tb_offset[level+1]); } // table_entry_pos_t bound(node nd,LMT_TYPE ndt, int level=0) { 
table_entry_pos_t bound(node nd,LMT_TYPE ndt) { int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE); table_entry_pos_t value; getmem(nd,&value,offs); // value -= tb_offset[level+1]; return value; }; // table_entry_pos_t bound(node nd,LMT_TYPE ndt, table_entry_pos_t value, int level=0) { table_entry_pos_t bound(node nd,LMT_TYPE ndt, table_entry_pos_t value) { int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE); // value += tb_offset[level+1]; putmem(nd,value,offs); return value; }; //template T boundwithoffset(node nd,LMT_TYPE ndt, T value, int level); /* table_entry_pos_t boundwithoffset(node nd,LMT_TYPE ndt, int level) { int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE); table_entry_pos_t value; getmem(nd,&value,offs); return value; // return value-tb_offset[level+1]; }; */ /* table_entry_pos_t boundwithoffset(node nd,LMT_TYPE ndt, table_entry_pos_t value, int level) { int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE); putmem(nd,value,offs); return value; // return value+tb_offset[level+1]; }; */ /* inline table_entry_pos_t bound(node nd,LMT_TYPE ndt) { int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE); table_entry_pos_t value; getmem(nd,&value,offs); return value; }; template inline T bound(node nd,LMT_TYPE ndt, T value) { int offs=LMTCODESIZE+2*(ndt==QINTERNAL?QPROBSIZE:PROBSIZE); putmem(nd,value,offs); return value; }; */ //returns the indexes of the successors of a node int succrange(node ndp,int level,table_entry_pos_t* isucc=NULL,table_entry_pos_t* esucc=NULL); void stat(int lev=0); void printTable(int level); virtual inline void setDict(dictionary* d) { if (delete_dict==true && dict) delete dict; dict=d; delete_dict=false; }; virtual inline dictionary* getDict() const { return dict; }; inline table_entry_pos_t getCurrentSize(int l) const { return cursize[l]; }; inline void setOrderQuery(bool v) { orderQuery = v; } inline bool isOrderQuery() const { return orderQuery; } inline float GetNgramcacheLoadFactor() { return 
ngramcache_load_factor; } inline float GetDictioanryLoadFactor() { return ngramcache_load_factor; } //never allow the increment of the dictionary through this function inline virtual void dictionary_incflag(const bool flag) { UNUSED(flag); }; inline virtual bool filter(const string sfilter, lmtable* sublmt, const string skeepunigrams) { std::cerr << "filtering... \n"; dictionary *dict=new dictionary((char *)sfilter.c_str()); cpsublm(sublmt, dict,(skeepunigrams=="yes")); delete dict; std::cerr << "...done\n"; return true; } inline virtual bool is_OOV(int code) { return (code == dict->oovcode()); }; }; #endif irstlm-5.80.03/src/Makefile.am000644 000766 000024 00000004343 12114670667 020200 0ustar00nicolabertoldistaff000000 000000 lib_LTLIBRARIES = libirstlm.la AM_CXXFLAGS = -isystem/usr/include -W -Wall -ffor-scope -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES $(BOOST_CPPFLAGS) -DMYCODESIZE=3 libirstlm_ladir = ${includedir} libirstlm_la_HEADERS = \ cmd.h \ dictionary.h \ gzfilebuf.h \ htable.h \ index.h \ lmContainer.h \ lmclass.h \ lmmacro.h \ lmtable.h \ lmInterpolation.h \ mempool.h \ mfstream.h \ n_gram.h \ ngramcache.h \ ngramtable.h \ timer.h \ util.h \ interplm.h \ linearlm.h \ mdiadapt.h \ mixture.h \ normcache.h \ shiftlm.h \ cplsa.h \ doc.h libirstlm_la_SOURCES = \ cmd.c \ dictionary.cpp \ htable.cpp \ lmContainer.cpp \ lmclass.cpp \ lmmacro.cpp \ lmtable.cpp \ lmInterpolation.cpp \ mempool.cpp \ mfstream.cpp \ n_gram.cpp \ ngramcache.cpp \ ngramtable.cpp \ timer.cpp \ util.cpp \ interplm.cpp \ linearlm.cpp \ mdiadapt.cpp \ mixture.cpp \ normcache.cpp \ shiftlm.cpp \ cplsa.cpp \ doc.cpp CLEANFILES = $(BUILT_SOURCES) libirstlm_la_LIBADD = $(BOOST_LDFLAGS) $(BOOST_THREAD_LIB) LDADD = -lirstlm DEPENDENCIES = libirstlm.la bin_PROGRAMS = dict ngt dtsel compile-lm interpolate-lm prune-lm quantize-lm prune-lm score-lm tlm plsa verify-caching dict_SOURCES = dict.cpp dict_DEPENDENCIES = $(DEPENDENCIES) ngt_SOURCES = ngt.cpp ngt_DEPENDENCIES = $(DEPENDENCIES) dtsel_SOURCES = 
dtsel.cpp dtsel_DEPENDENCIES = $(DEPENDENCIES) compile_lm_SOURCES = compile-lm.cpp compile_lm_DEPENDENCIES = $(DEPENDENCIES) interpolate_lm_SOURCES = interpolate-lm.cpp interpolate_lm_DEPENDENCIES = $(DEPENDENCIES) prune_lm_SOURCES = prune-lm.cpp prune_lm_DEPENDENCIES = $(DEPENDENCIES) quantize_lm_SOURCES = quantize-lm.cpp quantize_lm_DEPENDENCIES = $(DEPENDENCIES) score_lm_SOURCES = score-lm.cpp score_lm_DEPENDENCIES = $(DEPENDENCIES) tlm_SOURCES = tlm.cpp tlm_DEPENDENCIES = $(DEPENDENCIES) plsa_SOURCES = plsa.cpp plsa_DEPENDENCIES = $(DEPENDENCIES) verify_caching_SOURCES = verify-caching.cpp verify_caching_DEPENDENCIES = $(DEPENDENCIES) irstlm-5.80.03/src/mdiadapt.cpp000644 000766 000024 00000133501 12042554746 020431 0ustar00nicolabertoldistaff000000 000000 /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include "util.h" #include "mfstream.h" #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "mempool.h" #include "ngramcache.h" #include "ngramtable.h" #include "normcache.h" #include "interplm.h" #include "mdiadapt.h" #include "shiftlm.h" #include "lmtable.h" using namespace std; // //Minimum discrimination adaptation for interplm // mdiadaptlm::mdiadaptlm(char* ngtfile,int depth,TABLETYPE tbtype): interplm(ngtfile,depth,tbtype) { adaptlev=0; forelm=NULL; cache=NULL; m_save_per_level=true; }; mdiadaptlm::~mdiadaptlm() { if (cache) delete cache; delete_caches(); }; void mdiadaptlm::delete_caches(int level) { if (probcache[level]) delete probcache[level]; if (backoffcache[level]) delete backoffcache[level]; }; void mdiadaptlm::delete_caches() { #ifdef MDIADAPTLM_CACHE_ENABLE for (int i=0; i<=max_caching_level; i++) delete_caches(i); delete [] probcache; delete [] backoffcache; #endif }; void mdiadaptlm::caches_stat() { #ifdef MDIADAPTLM_CACHE_ENABLE for (int i=1; i<=max_caching_level; i++) { if (probcache[i]) { cerr << "Statistics of probcache at level " << i << " (of " << lmsize() << ") "; probcache[i]->stat(); } if (backoffcache[i]) { cerr << "Statistics of backoffcache at level " << i << " (of " << lmsize() << ") "; backoffcache[i]->stat(); } } #endif }; void mdiadaptlm::create_caches(int mcl) { max_caching_level=(mcl>=0 && mclisfull()) probcache[level]->reset(probcache[level]->cursize()); if (backoffcache[level] && backoffcache[level]->isfull()) backoffcache[level]->reset(backoffcache[level]->cursize()); }; void mdiadaptlm::check_cache_levels() { #ifdef MDIADAPTLM_CACHE_ENABLE for (int i=1; 
i<=max_caching_level; i++) check_cache_levels(i); #endif }; void mdiadaptlm::reset_caches(int level) { if (probcache[level]) probcache[level]->reset(MAX(probcache[level]->cursize(),probcache[level]->maxsize())); if (backoffcache[level]) backoffcache[level]->reset(MAX(backoffcache[level]->cursize(),backoffcache[level]->maxsize())); }; void mdiadaptlm::reset_caches() { #ifdef MDIADAPTLM_CACHE_ENABLE for (int i=1; i<=max_caching_level; i++) reset_caches(i); #endif }; inline NGRAMCACHE_t* mdiadaptlm::get_probcache(int level) { return probcache[level]; } inline NGRAMCACHE_t* mdiadaptlm::get_backoffcache(int level) { return backoffcache[level]; } int mdiadaptlm::scalefact(char *ngtfile) { if (forelm!=NULL) delete forelm; if (cache!=NULL) delete cache; cache=new normcache(dict); forelm=new shiftbeta(ngtfile,1); forelm->train(); //compute oov scalefact term ngram fng(forelm->dict,1); ngram ng(dict,1); int* w=fng.wordp(1); oovscaling=1.0; for ((*w)=0; (*w)dict->size(); (*w)++) if ((*w) != forelm->dict->oovcode()) { ng.trans(fng); if (*ng.wordp(1)==dict->oovcode()) { cerr << "adaptation file contains new words: use -ao=yes option\n"; exit(1); } //forbidden situation oovscaling-=backunig(ng); } *w=forelm->dict->oovcode(); oovscaling=foreunig(fng)/oovscaling; return 1; }; int mdiadaptlm::savescalefactor(char* filename) { ngram ng(dict,1); int* w=ng.wordp(1); mfstream out(filename,ios::out); out << "\n\\data\\" << "\nngram 1=" << dict->size() << "\n\n1grams:\n"; for ((*w)=0; (*w)size(); (*w)++) { double ratio=scalefact(ng); out << (float) (ratio?log10(ratio):-99); if (*w==dict->oovcode()) out << "\t" << "\n"; else out << "\t" << (char *)dict->decode(*w) << "\n"; } out << "\\end\\\n"; return 1; } double mdiadaptlm::scalefact(ngram ng) { ngram fng(forelm->dict,1); fng.trans(ng); if (*fng.wordp(1)==forelm->dict->oovcode()) return pow(oovscaling,gis_step); else { double prback=backunig(ng); double prfore=foreunig(ng); return pow(prfore/prback,gis_step); } } double 
mdiadaptlm::foreunig(ngram ng) { double fstar,lambda; forelm->discount(ng,1,fstar,lambda); return fstar; } double mdiadaptlm::backunig(ngram ng) { double fstar,lambda; discount(ng,1,fstar,lambda,0); return fstar; }; int mdiadaptlm::adapt(char* ngtfile,int alev,double step) { if (alev > lmsize() || alev<=0) { cerr << "setting adaptation level to " << lmsize() << "\n"; alev=lmsize(); } adaptlev=alev; cerr << "adapt ...."; gis_step=step; if (ngtfile==NULL) { cerr << "adaptation file is missing\n"; exit(1); } //compute the scaling factor; scalefact(ngtfile); //compute 1-gram zeta ngram ng(dict,2); int* w=ng.wordp(1); cerr << "precomputing 1-gram normalization ...\n"; zeta0=0; for ((*w)=0; (*w)size(); (*w)++) zeta0+=scalefact(ng) * backunig(ng); if (alev==1) return 1 ; cerr << "precomputing 2-gram normalization:\n"; //precompute the bigram normalization w=ng.wordp(2); *ng.wordp(1)=0; for ((*w)=0; (*w)size(); (*w)++) { zeta(ng,2); if ((*w % 1000)==0) cerr << "."; } cerr << "done\n"; return 1; }; double mdiadaptlm::zeta(ngram ng,int size) { assert(size>=1); double z=0; // compute normalization term ng.size=size; if (size==1) return zeta0; else { //size>1 //check in the 2gr and 3gr cache if (size <=3 && cache->get(ng,size,z)) return z; double fstar,lambda; ngram histo=ng; int succ=0; discount(ng,size,fstar,lambda,(int)0); if ((lambda<1) && get(histo,size,size-1)) { ; //scan all its successors succ=0; succscan(histo,ng,INIT,size); while(succscan(histo,ng,CONT,size)) { discount(ng,size,fstar,lambda,0); if (fstar>0) { z+=(scalefact(ng) * fstar); succ++; //cerr << ng << "zeta= " << z << "\n"; } } } z+=lambda*zeta(ng,size-1); if (size<=3 && succ>1) cache->put(ng,size,z); return z; } } int mdiadaptlm::discount(ngram ng_,int size,double& fstar,double& lambda,int /* unused parameter: cv */) { ngram ng(dict); ng.trans(ng_); double __fstar, __lambda; bool lambda_cached=0; int size_lambda=size-1; ngram histo=ng; histo.shift(); if (size_lambda>0 && histo.size>=size_lambda) { #ifdef 
MDIADAPTLM_CACHE_ENABLE if (size_lambda<=max_caching_level) { //backoffcache hit if (backoffcache[size_lambda] && backoffcache[size_lambda]->get(histo.wordp(size_lambda),__lambda)) lambda_cached=1; } #endif } discount(ng,size,__fstar,__lambda,0); if ((size>0) && (size<=adaptlev) && (__lambda<1)) { if (size>1) { double numlambda, numfstar, den; numfstar=scalefact(ng); den=zeta(ng,size); __fstar=__fstar * numfstar/den; if (!lambda_cached) { numlambda=zeta(ng,size-1); __lambda=__lambda * numlambda/den; } } else if (size==1) { double ratio; ratio=scalefact(ng)/zeta0; __fstar=__fstar * ratio; if (!lambda_cached) { __lambda=__lambda * ratio; } } else { //size==0 do nothing } } #ifdef MDIADAPTLM_CACHE_ENABLE //backoffcache insert if (!lambda_cached && size_lambda>0 && size_lambda<=max_caching_level && histo.size>=size_lambda && backoffcache[size_lambda]) backoffcache[size_lambda]->add(histo.wordp(size_lambda),__lambda); #endif lambda=__lambda; fstar=__fstar; return 1; } int mdiadaptlm::compute_backoff_per_level() { double fstar,lambda; this->backoff=1; for (int size=1; size0){ ng.size=ng.size-1; pr -= mdiadaptlm::prob(ng,size); } } assert(pr>0 && pr<=1); boff(hg.link,pr); } } cerr << "done\n"; return 1; } int mdiadaptlm::compute_backoff_per_word() { cerr << "Current implementation does not support the usage of backoff (-bo=yes) mixture models (-lm=mix) combined with the per-word saving (-saveperllevel=no)." 
<< endl; cerr << "Please, either choose a per-level saving (-saveperllevel=yes) or do not use backoff (-bo=no) " << endl; exit(1); } double mdiadaptlm::prob2(ngram ng,int size,double& fstar) { double lambda; mdiadaptlm::discount(ng,size,fstar,lambda); if (size>1) return fstar + lambda * prob(ng,size-1); else return fstar; } //inline double mdiadaptlm::prob(ngram ng,int size){ double mdiadaptlm::prob(ngram ng,int size) { double fstar,lambda,bo; return prob(ng,size,fstar,lambda,bo); } double mdiadaptlm::prob(ngram ng,int size,double& fstar,double& lambda, double& bo) { double pr; #ifdef MDIADAPTLM_CACHE_ENABLE //probcache hit if (size<=max_caching_level && probcache[size] && ng.size>=size && probcache[size]->get(ng.wordp(size),pr)) return pr; #endif //probcache miss mdiadaptlm::bodiscount(ng,size,fstar,lambda,bo); if (fstar>UPPER_SINGLE_PRECISION_OF_1 || lambda>UPPER_SINGLE_PRECISION_OF_1) { cerr << "wrong probability: " << ng << " , size " << size << " , fstar " << fstar << " , lambda " << lambda << "\n"; exit(1); } if (backoff) { if (size>1) { if (fstar>0){ pr=fstar; }else { if (lambda<1){ pr = lambda/bo * prob(ng,size-1); }else { assert(lambda1) pr = fstar + lambda * prob(ng,size-1); else pr = fstar; } #ifdef MDIADAPTLM_CACHE_ENABLE //probcache insert if (size<=max_caching_level && probcache[size] && ng.size>=size) probcache[size]->add(ng.wordp(size),pr); #endif return pr; } int mdiadaptlm::bodiscount(ngram ng_,int size,double& fstar,double& lambda,double& bo) { ngram ng(dict); ng.trans(ng_); mdiadaptlm::discount(ng,size,fstar,lambda); bo=1.0; if (backoff) { //get back-off probability if (size>1 && lambda<1) { ngram hg=ng; // cerr<< "hg:|" << hg << "| size:|" << size << "|" << endl; if (! 
get(hg,size,size-1)){ cerr << "ERROR: int mdiadaptlm::bodiscount(ngram ng_,int size,double& fstar,double& lambda,double& bo) -> get(hg,size,size-1) returns NULL\n"; } assert(get(hg,size,size-1)); bo=boff(hg.link); // if (lambda > bo){ // cerr << " mdiadaptlm::bodiscount ERROR: " << " lambda:" << lambda << " bo:" << bo << "\n"; // exit(1); // } } } return 1; } double mdiadaptlm::txclprob(ngram ng,int size) { double fstar,lambda; if (size>1) { mdiadaptlm::discount(ng,size,fstar,lambda); return fstar + lambda * txclprob(ng,size-1); } else { double freq=1; if ((*ng.wordp(1)!=dict->oovcode()) && get(ng,1,1)) freq+=ng.freq; double N=totfreq()+dict->dub()-dict->size(); return freq/N; } } int mdiadaptlm::netsize() { double fstar,lambda; int size,totsize; ngram ng(dict); cerr << "Computing LM size:\n"; totsize=dict->size() * 2; cout << "1-gram " << totsize << "\n"; for (int i=2; i<=maxlevel(); i++) { size=0; scan(ng,INIT,i); while (scan(ng,CONT,i)) { mdiadaptlm::discount(ng,i,fstar,lambda); if (fstar>0) size++; } size+=size * (i dictionary length repeat [ dictionary length ] { word; } while [ first word != STOP ] { first word number of successors repeat [ number of successors ] { second word prob } } STOP while [ first word != STOP ] { first word number of successor sets repeat [ number of successor sets ] { second word number of successors repeat [ number of successors ] { third word prob } } } STOP */ //void writeNull(mfbstream& out,unsigned short nullCode,float nullProb){ // out.writex(&nullCode,sizeof(short)); // out.writex(&nullProb,sizeof(float)); //} int swapbytes(char *p, int sz, int n) { char c,*l,*h; if((n<1) ||(sz<2)) return 0; for(; n--; p+=sz) for(h=(l=p)+sz; --h>l; l++) { c=*h; *h=*l; *l=c; } return 0; }; void fwritex(char *p,int sz,int n,FILE* f) { if(*(short *)"AB"==0x4241) { swapbytes((char*)p, sz,n); } fwrite((char *)p,sz,n,f); if(*(short *)"AB"==0x4241) swapbytes((char*)p, sz,n); } void ifwrite(long loc,void *ptr,int size,int /* unused parameter: n 
*/,FILE* f) { fflush(f); long pos=ftell(f); fseek(f,loc,SEEK_SET); fwritex((char *)ptr,size,1,f); fseek(f,pos,SEEK_SET); fflush(f); } void writeNull(unsigned short nullCode,float nullProb,FILE* f) { fwritex((char *)&nullCode,sizeof(short),1,f); fwritex((char *)&nullProb,sizeof(float),1,f); } int mdiadaptlm::saveASR(char *filename,int /* unused parameter: backoff */,char* subdictfile) { int totbg,tottr; dictionary* subdict; if (subdictfile) subdict=new dictionary(subdictfile); else subdict=dict; // default is subdict=dict typedef unsigned short code; system("date"); if (lmsize()>3 || lmsize()<1) { cerr << "wrong lmsize\n"; exit(1); } if (dict->size()>=0xffff && subdict->size()>=0xffff) { cerr << "save bin requires unsigned short codes\n"; exit(1); } FILE* f=fopen(filename,"w"); double fstar,lambda,boff; float pr; long succ1pos,succ2pos; code succ1,succ2,w,h1,h2; code stop=0xffff; //dictionary //#dictsize w1\n ..wN\n NULL\n code oovcode=subdict->oovcode(); //includes at least NULL code subdictsz=subdict->size()+1; fwritex((char *)&subdictsz,sizeof(code),1,f); subdictsz--; for (w=0; wdecode(w)); fprintf(f,"____\n"); //unigram part //NULL #succ w1 pr1 ..wN prN h1=subdictsz; fwritex((char *)&h1,sizeof(code),1,f); //NULL succ1=0; succ1pos=ftell(f); fwritex((char *)&succ1,sizeof(code),1,f); ngram ng(dict); ngram sng(subdict); ng.size=sng.size=1; scan(ng,INIT,1); while(scan(ng,CONT,1)) { sng.trans(ng); if (sng.containsWord(subdict->OOV(),1)) continue; pr=(float)mdiadaptlm::prob(ng,1); if (pr>1e-50) { //do not consider too low probabilities succ1++; w=*sng.wordp(1); fwritex((char *)&w,sizeof(code),1,f); fwritex((char *)&pr,sizeof(float),1,f); } else { cerr << "small prob word " << ng << "\n"; } } // update number of unigrams ifwrite(succ1pos,&succ1,sizeof(code),1,f); cerr << "finito unigrammi " << succ1 << "\n"; fflush(f); if (lmsize()==1) { fclose(f); return 1; } // rest of bigrams // w1 #succ w1 pr1 .. 
wN prN succ1=0; h1=subdictsz; totbg=subdictsz; ngram hg1(dict,1); ng.size=sng.size=2; scan(hg1,INIT,1); while(scan(hg1,CONT,1)) { if (hg1.containsWord(dict->OOV(),1)) continue; assert((*hg1.wordp(1))size()); *ng.wordp(2)=*hg1.wordp(1); *ng.wordp(1)=0; sng.trans(ng); if (sng.containsWord(dict->OOV(),1)) continue; mdiadaptlm::bodiscount(ng,2,fstar,lambda,boff); if (lambda < 1.0) { h1=*sng.wordp(2); fwritex((char *)&h1,sizeof(code),1,f); succ1=0; succ1pos=ftell(f); fwritex((char *)&succ1,sizeof(code),1,f); ngram shg=hg1; get(shg,1,1); succscan(shg,ng,INIT,2); while(succscan(shg,ng,CONT,2)) { if (*ng.wordp(1)==oovcode) continue; sng.trans(ng); if (sng.containsWord(dict->OOV(),2)) continue; mdiadaptlm::discount(ng,2,fstar,lambda); if (fstar>1e-50) { w=*sng.wordp(1); fwritex((char *)&w,sizeof(code),1,f); pr=(float)mdiadaptlm::prob(ng,2); //cerr << ng << " prob=" << log(pr) << "\n"; fwritex((char *)&pr,sizeof(float),1,f); succ1++; } } if (succ1) { lambda/=boff; //consider backoff writeNull(subdictsz,(float)lambda,f); succ1++; totbg+=succ1; ifwrite(succ1pos,&succ1,sizeof(code),1,f); } else { //go back one word fseek(f,succ1pos-(streampos)sizeof(code),SEEK_SET); } } } fwritex((char *)&stop,sizeof(code),1,f); cerr << " finito bigrammi! 
" << subdictsz << "\n"; fflush(f); system("date"); if (lmsize()<3) { fclose(f); return 1; } //TRIGRAM PART h1=subdictsz; h2=subdictsz; tottr=0; succ1=0; succ2=0; ngram hg2(dict,2); ng.size=sng.size=3; scan(hg1,INIT,1); while(scan(hg1,CONT,1)) { if ((*hg1.wordp(1)==oovcode)) continue; *ng.wordp(3)=*hg1.wordp(1); sng.trans(ng); if (sng.containsWord(dict->OOV(),1)) continue; assert((*sng.wordp(3))OOV(),2)) continue; mdiadaptlm::bodiscount(ng,3,fstar,lambda,boff); if (lambda < 1.0) { h2=*sng.wordp(2); fwritex((char *)&h2,sizeof(code),1,f); succ2=0; succ2pos=ftell(f); fwritex((char *)&succ2,sizeof(code),1,f); ngram shg2=ng; get(shg2,3,2); succscan(shg2,ng,INIT,3); while(succscan(shg2,ng,CONT,3)) { if (*ng.wordp(1)==oovcode) continue; sng.trans(ng); if (sng.containsWord(dict->OOV(),3)) continue; mdiadaptlm::discount(ng,3,fstar,lambda); //pr=(float)mdiadaptlm::prob2(ng,3,fstar); if (fstar>1e-50) { w=*sng.wordp(1); fwritex((char *)&w,sizeof(code),1,f); pr=(float)mdiadaptlm::prob(ng,3); // cerr << ng << " prob=" << log(pr) << "\n"; fwritex((char *)&pr,sizeof(float),1,f); succ2++; } } if (succ2) { lambda/=boff; writeNull(subdictsz,(float)lambda,f); succ2++; tottr+=succ2; ifwrite(succ2pos,&succ2,sizeof(code),1,f); succ1++; } else { //go back one word fseek(f,succ2pos-(long)sizeof(code),SEEK_SET); } } } if (succ1) ifwrite(succ1pos,&succ1,sizeof(code),1,f); else fseek(f,succ1pos-(long)sizeof(code),SEEK_SET); } fwritex((char *)&stop,sizeof(code),1,f); fclose(f); cerr << "Tot bg: " << totbg << " tg: " << tottr<< "\n"; system("date"); return 1; }; ///// Save in IRST MT format int mdiadaptlm::saveMT(char *filename,int backoff, char* subdictfile,int resolution,double decay) { double logalpha=log(decay); dictionary* subdict; if (subdictfile) subdict=new dictionary(subdictfile); else subdict=dict; // default is subdict=dict ngram ng(dict,lmsize()); ngram sng(subdict,lmsize()); cerr << "Adding unigram of OOV word if missing\n"; for (int i=1; i<=maxlevel(); i++) 
*ng.wordp(i)=dict->oovcode(); if (!get(ng,maxlevel(),1)) { cerr << "oov is missing in the ngram-table\n"; // f(oov) = dictionary size (Witten Bell) ng.freq=dict->freq(dict->oovcode()); cerr << "adding oov unigram " << ng << "\n"; put(ng); } cerr << "Eventually adding OOV symbol to subdictionary\n"; subdict->encode(OOV_); system("date"); mfstream out(filename,ios::out); //add special symbols subdict->incflag(1); int bo_code=subdict->encode(BACKOFF_); int du_code=subdict->encode(DUMMY_); subdict->incflag(0); out << "nGrAm " << lmsize() << " " << 0 << " " << "LM_ " << resolution << " " << decay << "\n"; subdict->save(out); //start writing ngrams cerr << "write unigram of oov probability\n"; ng.size=1; *ng.wordp(1)=dict->oovcode(); double pr=(float)mdiadaptlm::prob(ng,1); sng.trans(ng); sng.size=lmsize(); for (int s=2; s<=lmsize(); s++) *sng.wordp(s)=du_code; sng.freq=(int)ceil(pr * (double)10000000)-1; out << sng << "\n"; for (int i=1; i<=lmsize(); i++) { cerr << "LEVEL " << i << "\n"; double fstar,lambda,bo,dummy; scan(ng,INIT,i); while(scan(ng,CONT,i)) { sng.trans(ng); sng.size=lmsize(); for (int s=i+1; s<=lmsize(); s++) *sng.wordp(s)=du_code; if (i>=1 && sng.containsWord(subdict->OOV(),sng.size)) { cerr << "skipping : " << sng << "\n"; continue; } // skip also eos symbols not at the final //if (i>=1 && sng.containsWord(dict->EoS(),sng.size)) //continue; mdiadaptlm::discount(ng,i,fstar,dummy); //out << sng << " fstar " << fstar << " lambda " << lambda << "\n"; //if (i==1 && sng.containsWord(subdict->OOV(),i)){ // cerr << sng << " fstar " << fstar << "\n"; //} if (fstar>0) { double pr=(float)mdiadaptlm::prob(ng,i); if (i>1 && resolution<10000000) { sng.freq=resolution-(int)(log(pr)/logalpha)-1; sng.freq=(sng.freq>=0?sng.freq:0); } else sng.freq=(int)ceil(pr * (double)10000000)-1; out << sng << "\n"; } if (i=0?sng.freq:0); } else sng.freq=(int)ceil(lambda/bo * (double)10000000)-1; out << sng << "\n"; } } } cerr << "LEVEL " << i << "DONE \n"; } return 1; }; ///// Save 
in binary format forbackoff N-gram models int mdiadaptlm::saveBIN_per_word(char *filename,int backoff,char* subdictfile,int mmap) { VERBOSE(2,"mdiadaptlm::saveBIN_per_word START\n"); system("date"); //subdict dictionary* subdict; //accumulated unigram oov prob //CHECK why this is not used (differently from what happens in the other save functions // double oovprob=0; if (subdictfile) subdict=new dictionary(subdictfile); else subdict=dict; // default is subdict=dict if (mmap) { VERBOSE(2,"savebin with memory map: " << filename << "\n"); } else { VERBOSE(2,"savebin: " << filename << "\n"); } streampos pos[lmsize()+1]; int maxlev=lmsize(); char buff[100]; int isQuant=0; //savebin for quantized LM is not yet implemented //temporary filename to save the LM related to a single term char tmpfilename[BUFSIZ]; //create temporary output file stream to store single levels for all terms assert(strlen(filename)<1000); char tfilename[MAX_NGRAM][1000]; mfstream *tout[MAX_NGRAM]; for (int i=1; i<=lmsize(); i++) { sprintf(tfilename[i],"%s-%dgrams",filename,i); tout[i]=new mfstream(tfilename[i],ios::out); } // print header in the main output file mfstream out(filename,ios::out); out << "blmt " << maxlev; for (int i=1; i<=maxlev; i++) { //reserve space for ngram statistics (which are not yet avalable) pos[i]=out.tellp(); sprintf(buff," %10d",0); out << buff; } out << "\n"; subdict->save(out); out.flush(); ngram ng(dict,lmsize()); ngram oldng(dict,lmsize()); ngram locng(dict,lmsize()); ngram sng(subdict,lmsize()); double fstar,lambda,bo,dummy,dummy2,pr,ibow; //n-gram counters table_entry_pos_t num[lmsize()+1]; for (int i=1; i<=lmsize(); i++) num[i]=0; lmtable* lmt = new lmtable(); lmt->configure(maxlev,isQuant); lmt->setDict(subdict); lmt->expand_level(1,dict->size(),filename,mmap); //main loop for (int w=0; wsize(); w++) { sprintf(tmpfilename,"%s_tmp_%d",filename,w); if (!w % 10000) cerr << "."; //1-gram ngram ung(dict,1); *ung.wordp(1)=w; sng.trans(ung); //exclude words not 
occurring in the subdictionary if (sng.containsWord(subdict->OOV(),1) && !ung.containsWord(dict->OOV(),1)) continue; pr=mdiadaptlm::prob(ung,1); pr=(pr?log10(pr):-99); if (lmsize()>1) { //compute back-off ung.pushc(0); //extend by one mdiadaptlm::bodiscount(ung,2,fstar,lambda,bo); ung.shift();//shrink by one assert(!backoff || ((lambdaLOWER_SINGLE_PRECISION_OF_1) || boaddwithoffset(ung,(float)pr,(float)ibow); num[1]++; //manage n-grams if (get(ung,1,1)) { //create n-gram with history w *ng.wordp(lmsize())=w; //create sentinel n-gram for (int i=1; i<=lmsize(); i++) *oldng.wordp(i)=-1; //create the table for all levels but the level 1, with the maximum number of possible entries for (int i=2; i<=lmsize(); i++) lmt->expand_level(i,entries(i),tmpfilename,mmap); scan(ung.link,ung.info,1,ng,INIT,lmsize()); while(scan(ung.link,ung.info,1,ng,CONT,lmsize())) { sng.trans(ng); // convert to subdictionary locng=ng; // make a local copy //find first internal level that changed int f=lmsize()-1; //unigrams have been already covered while (f>1 && (*oldng.wordp(f)==*ng.wordp(f))){ f--; } for (int l=lmsize()-(f-1); l<=lmsize(); l++){ locng=ng; // make a local copy if (lOOV(),l)) continue; // skip also eos symbols not at the final if (sng.containsWord(dict->EoS(),l-1)) continue; pr=mdiadaptlm::prob(locng,l,fstar,dummy,dummy2); //PATCH by Nicola (16-04-2008) if (!(pr<=1.0 && pr > 1e-10)) { cerr << ng << " " << pr << "\n"; assert(pr<=1.0); cerr << "prob modified to 1e-10\n"; pr=1e-10; } if (l=UPPER_SINGLE_PRECISION_OF_0 || lambda <= LOWER_SINGLE_PRECISION_OF_1) { ibow=log10(lambda) - log10(bo); if (lmt->addwithoffset(locng,(float)log10(pr),(float)ibow)){ num[l]++; }else{ continue; } } else{ continue; //skip n-grams with too small fstar } } else { if (fstar>=UPPER_SINGLE_PRECISION_OF_0) { ibow=0.0; //value for backoff weight at the highest level if (lmt->addwithoffset(locng,(float)log10(pr),(float)ibow)){ num[l]++; }else{ continue; } } else{ continue; //skip n-grams with too small 
fstar } } } oldng=ng; } } else{ //create empty tables for all levels but the level 1, to keep consistency with the rest of the code for (int i=2; i<=lmsize(); i++) lmt->expand_level(i,0,tmpfilename,mmap); } //level 1 is not modified until everything is done //because it has to contain the full dictionary //which provides the direct access to the second level for (int i=2; i<=lmsize(); i++){ if (i>2) { lmt->checkbounds(i-1); lmt->appendbin_level(i-1, *tout[i-1], mmap); } // now we can resize table at level i lmt->resize_level(i, tmpfilename, mmap); } // now we can save table at level maxlev, if not equal to 1 if (lmsize()>1){ lmt->appendbin_level(maxlev, *tout[maxlev], mmap); } //delete levels from 2 to lmsize(); for (int i=2; i<=lmsize(); i++) lmt->delete_level(i, tmpfilename, mmap); //update table offsets for (int i=2; i<=lmsize(); i++) lmt->update_offset(i,num[i]); } //close levels from 2 to lmsize() for (int i=2; i<=lmsize(); i++) tout[i]->close(); //now we can save level 1, which contains all unigrams //cerr << "saving level 1" << "...\n"; lmt->savebin_level(1, filename, mmap); //update headers for (int i=1; i<=lmsize(); i++) { sprintf(buff," %10d",num[i]); out.seekp(pos[i]); out << buff; } out.close(); //concatenate files for each single level into one file //single level files should have a name derived from "filename" lmt->compact_all_levels(filename); cerr << "\n"; system("date"); VERBOSE(2,"mdiadaptlm::saveBIN_per_word END\n"); return 1; }; ///// Save in binary format forbackoff N-gram models int mdiadaptlm::saveBIN_per_level(char *filename,int backoff,char* subdictfile,int mmap) { VERBOSE(2,"mdiadaptlm::saveBIN_per_level START\n"); system("date"); //subdict dictionary* subdict; //accumulated unigram oov prob double oovprob=0; if (subdictfile) subdict=new dictionary(subdictfile); else subdict=dict; // default is subdict=dict if (mmap) { VERBOSE(2,"savebin with memory map: " << filename << "\n"); } else { VERBOSE(2,"savebin: " << filename << "\n"); } 
streampos pos[lmsize()+1]; int maxlev=lmsize(); char buff[100]; int isQuant=0; //savebin for quantized LM is not yet implemented // print header fstream out(filename,ios::out); out << "blmt " << maxlev; for (int i=1; i<=maxlev; i++) { //reserve space for ngram statistics (which are not yet avalable) pos[i]=out.tellp(); sprintf(buff," %10d",0); out << buff; } out << "\n"; lmtable* lmt = new lmtable(); lmt->configure(maxlev,isQuant); lmt->setDict(subdict); subdict->save(out); out.flush(); //start adding n-grams to lmtable for (int i=1; i<=lmsize(); i++) { cerr << "saving level " << i << "...\n"; table_entry_pos_t numberofentries; if (i==1) { //unigram numberofentries = (table_entry_pos_t) subdict->size(); } else { numberofentries = (table_entry_pos_t) entries(i); } system("date"); lmt->expand_level(i,numberofentries,filename,mmap); double totp=0; double fstar,lambda,bo,dummy,dummy2,pr,ibow; ngram ng(dict,1); ngram ng2(dict); ngram sng(subdict,1); if (i==1) { //unigram case //scan the dictionary for (int w=0; wsize(); w++) { *ng.wordp(1)=w; sng.trans(ng); pr=mdiadaptlm::prob(ng,1); totp+=pr; if (sng.containsWord(subdict->OOV(),i) && !ng.containsWord(dict->OOV(),i)) { oovprob+=pr; //accumulate oov probability continue; } if (ng.containsWord(dict->OOV(),i)) pr+=oovprob; //cerr << ng << " freq " << dict->freq(w) << " - Pr " << pr << "\n"; pr=(pr?log10(pr):-99); if (w==dict->oovcode()){ //CHECK whether we can avoid this reassignment because dict should be lmt->getDict() *ng.wordp(1)=lmt->getDict()->oovcode(); ibow=0.0; } else { // } //do nothing if (lmsize()>1) { ngram ng2=ng; ng2.pushc(0); //extend by one //cerr << ng2 << "\n"; mdiadaptlm::bodiscount(ng2,i+1,fstar,lambda,bo); assert(!backoff || ((lambdaLOWER_SINGLE_PRECISION_OF_1) || boadd(ng,(float)pr,(float)ibow); } //cerr << "totprob = " << totp << "\n"; } else { //i>1 , bigrams, trigrams, fourgrams... 
*ng.wordp(1)=0; get(ng,1,1); //this scan(ng,INIT,i); while(scan(ng,CONT,i)) { sng.trans(ng); if (sng.containsWord(subdict->OOV(),i)) continue; // skip also eos symbols not at the final if (sng.containsWord(dict->EoS(),i-1)) continue; // mdiadaptlm::discount(ng,i,fstar,dummy); // pr=mdiadaptlm::prob(ng,i); pr=mdiadaptlm::prob(ng,i,fstar,dummy,dummy2); if (!(pr<=1.0 && pr > 1e-10)) { cerr << ng << " " << pr << "\n"; assert(pr<=1.0); cerr << "prob modified to 1e-10\n"; pr=1e-10; } if (i=UPPER_SINGLE_PRECISION_OF_0 || lambda <= LOWER_SINGLE_PRECISION_OF_1) { ibow=log10(lambda) - log10(bo); lmt->add(ng,(float)log10(pr),(float)ibow); } } else { if (fstar >= UPPER_SINGLE_PRECISION_OF_0) { ibow=0.0; //value for backoff weight at the highest level lmt->add(ng,(float)log10(pr),(float)ibow); } } } } // now we can fix table at level i-1 // now we can save table at level i-1 // now we can remove table at level i-1 if (maxlev>1 && i>1) { lmt->checkbounds(i-1); lmt->savebin_level(i-1, filename, mmap); } // now we can resize table at level i lmt->resize_level(i, filename, mmap); } // now we can save table at level maxlev lmt->savebin_level(maxlev, filename, mmap); //update headers for (int i=1; i<=lmsize(); i++) { sprintf(buff," %10d",lmt->getCurrentSize(i)); out.seekp(pos[i]); out << buff; } out.close(); //concatenate files for each single level into one file //single level files should have a name derived from "filename" lmt->compact_all_levels(filename); VERBOSE(2,"mdiadaptlm::saveBIN_per_level END\n"); return 1; } ///// Save in format for ARPA backoff N-gram models int mdiadaptlm::saveARPA_per_word(char *filename,int backoff,char* subdictfile ) { VERBOSE(2,"mdiadaptlm::saveARPA_per_word START\n"); system("date"); //subdict dictionary* subdict; //accumulated unigram oov prob //CHECK why this is not used (differently from what happens in the other save functions // double oovprob=0; if (subdictfile) subdict=new dictionary(subdictfile); else subdict=dict; // default is 
subdict=dict //main output file mfstream out(filename,ios::out); //create temporary output file stream assert(strlen(filename)<1000); char tfilename[MAX_NGRAM][1000]; mfstream *tout[MAX_NGRAM]; for (int i=1; i<=lmsize(); i++) { sprintf(tfilename[i],"%s.%d",filename,i); tout[i]=new mfstream(tfilename[i],ios::out); *tout[i] << "\n\\" << i << "-grams:\n"; } ngram ng(dict,lmsize()); ngram oldng(dict,lmsize()); ngram locng(dict,lmsize()); ngram sng(subdict,lmsize()); double fstar,lambda,bo,dummy,dummy2, pr; //n-gram counters table_entry_pos_t num[lmsize()+1]; for (int i=1; i<=lmsize(); i++) num[i]=0; //main loop for (int w=0; wsize(); w++) { if (!(w % 10000)) cerr << "."; /* progress dot every 10000 words; "!w % 10000" parsed as "(!w) % 10000" and fired only for w==0 */ //1-gram ngram ung(dict,1); *ung.wordp(1)=w; sng.trans(ung); //exclude words not occurring in the subdictionary if (sng.containsWord(subdict->OOV(),1) && !ung.containsWord(dict->OOV(),1)) continue; pr=mdiadaptlm::prob(ung,1); pr=(pr?log10(pr):-99); if (w==dict->oovcode()) *tout[1] << (float) pr << "\t" << ""; else *tout[1] << (float) pr << "\t" << (char *)dict->decode(w); num[1]++; if (lmsize()>1) { //print back-off ung.pushc(0); //extend by one mdiadaptlm::bodiscount(ung,2,fstar,lambda,bo); ung.shift();//shrink by one assert(!backoff || ((lambdaLOWER_SINGLE_PRECISION_OF_1) || bo1 && (*oldng.wordp(f)==*ng.wordp(f))){ f--; } for (int l=lmsize(); l>lmsize()-f;l--){ if (lOOV(),l)) continue; // skip also eos symbols not at the final if (sng.containsWord(dict->EoS(),l-1)) continue; pr=mdiadaptlm::prob(locng,l,fstar,dummy,dummy2); //PATCH by Nicola (16-04-2008) if (!(pr<=1.0 && pr > 1e-10)) { cerr << ng << " " << pr << "\n"; assert(pr<=1.0); cerr << "prob modified to 1e-10\n"; pr=1e-10; } if (l=UPPER_SINGLE_PRECISION_OF_0 || lambda <= LOWER_SINGLE_PRECISION_OF_1) { *tout[l] << (float) log10(pr); *tout[l] << "\t" << (char *)dict->decode(*locng.wordp(l)); for (int j=l-1; j>0; j--) *tout[l] << " " << (char *)dict->decode(*locng.wordp(j)); if (lambda < LOWER_SINGLE_PRECISION_OF_1) //output back-off prob *tout[l] 
<< "\t" << (float) (log10(lambda) -log10(bo)); *tout[l] << "\n"; num[l]++; } else continue; //skip n-grams with too small fstar } else { if (fstar>=UPPER_SINGLE_PRECISION_OF_0 ) { *tout[l] << (float) log10(pr); *tout[l] << "\t" << (char *)dict->decode(*locng.wordp(l)); for (int j=l-1; j>0; j--) *tout[l] << " " << (char *)dict->decode(*locng.wordp(j)); *tout[l] << "\n"; num[l]++; } else continue; //skip n-grams with too small fstar } } oldng=ng; } } } //print header out << "\n\\data\\" << "\n"; char buff[100]; for (int i=1; i<=lmsize(); i++) { sprintf(buff,"ngram %2d=%10d\n",i,num[i]); out << buff; } out << "\n"; //append and remove temporary files for (int i=1; i<=lmsize(); i++) { delete tout[i]; tout[i]=new mfstream(tfilename[i],ios::in); out << tout[i]->rdbuf(); delete tout[i]; removefile(tfilename[i]); } out << "\\end\\" << "\n"; cerr << "\n"; system("date"); VERBOSE(2,"mdiadaptlm::saveARPA_per_word END\n"); return 1; }; ///// Save in format for ARPA backoff N-gram models int mdiadaptlm::saveARPA_per_level(char *filename,int backoff,char* subdictfile ) { VERBOSE(2,"mdiadaptlm::saveARPA_per_level START\n"); system("date"); //subdict dictionary* subdict; //accumulated unigram oov prob double oovprob=0; if (subdictfile) { subdict=new dictionary(subdictfile); } else subdict=dict; // default is subdict=dict fstream out(filename,ios::out); // out.precision(15); streampos pos[lmsize()+1]; table_entry_pos_t num[lmsize()+1]; char buff[100]; //print header out << "\n\\data\\" << "\n"; for (int i=1; i<=lmsize(); i++) { num[i]=0; pos[i]=out.tellp(); sprintf(buff,"ngram %2d=%10d\n",i,num[i]); out << buff; } out << "\n"; //start writing n-grams for (int i=1; i<=lmsize(); i++) { cerr << "saving level " << i << "...\n"; out << "\n\\" << i << "-grams:\n"; double totp=0; double fstar,lambda,bo,dummy,dummy2,pr; ngram ng(dict,1); ngram ng2(dict); ngram sng(subdict,1); if (i==1) { //unigram case //scan the dictionary for (int w=0; wsize(); w++) { *ng.wordp(1)=w; sng.trans(ng); 
pr=mdiadaptlm::prob(ng,1); totp+=pr; if (sng.containsWord(subdict->OOV(),i) && !ng.containsWord(dict->OOV(),i)) { oovprob+=pr; //accumulate oov probability continue; } if (ng.containsWord(dict->OOV(),i)) pr+=oovprob; //cerr << ng << " freq " << dict->freq(w) << " - Pr " << pr << "\n"; out << (float) (pr?log10(pr):-99); num[i]++; if (w==dict->oovcode()) out << "\t" << "\n"; else { out << "\t" << (char *)dict->decode(w); if (lmsize()>1) { ngram ng2=ng; ng2.pushc(0); //extend by one mdiadaptlm::bodiscount(ng2,i+1,fstar,lambda,bo); assert(!backoff || ((lambdaLOWER_SINGLE_PRECISION_OF_1) || bo1 , bigrams, trigrams, fourgrams... *ng.wordp(1)=0; get(ng,1,1); //this scan(ng,INIT,i); while(scan(ng,CONT,i)) { sng.trans(ng); if (sng.containsWord(subdict->OOV(),i)) continue; // skip also eos symbols not at the final if (sng.containsWord(dict->EoS(),i-1)) continue; pr=mdiadaptlm::prob(ng,i,fstar,dummy,dummy2); //PATCH by Nicola (16-04-2008) if (!(pr<=1.0 && pr > 1e-10)) { cerr << ng << " " << pr << "\n"; assert(pr<=1.0); cerr << "prob modified to 1e-10\n"; pr=1e-10; } if (i=UPPER_SINGLE_PRECISION_OF_0 || lambda <= LOWER_SINGLE_PRECISION_OF_1) { out << (float) log10(pr); out << "\t" << (char *)dict->decode(*ng.wordp(i)); for (int j=i-1; j>0; j--) out << " " << (char *)dict->decode(*ng.wordp(j)); if (backoff){ out << "\t" << (float) (log10(lambda) - log10(bo)); }else{ if (lambda=UPPER_SINGLE_PRECISION_OF_0) { out << (float) log10(pr); out << "\t" << (char *)dict->decode(*ng.wordp(i)); for (int j=i-1; j>0; j--) out << " " << (char *)dict->decode(*ng.wordp(j)); out << "\n"; num[i]++; } } } } cerr << i << "grams tot:" << num[i] << "\n"; } streampos last=out.tellp(); //update headers for (int i=1; i<=lmsize(); i++) { sprintf(buff,"ngram %2d=%10d\n",i,num[i]); out.seekp(pos[i]); out << buff; } out.seekp(last); out << "\\end\\" << "\n"; system("date"); VERBOSE(2,"mdiadaptlm::saveARPA_per_level END\n"); return 1; }; /* main(int argc,char** argv){ char* dictname=argv[1]; char* 
backngram=argv[2]; int depth=atoi(argv[3]); char* forengram=argv[4]; char* testngram=argv[5]; dictionary dict(dictname); ngramtable test(&dict,testngram,depth); shiftbeta lm2(&dict,backngram,depth); lm2.train(); //lm2.test(test,depth); mdi lm(&dict,backngram,depth); lm.train(); for (double w=0.0;w<=1.0;w+=0.1){ lm.getforelm(forengram); lm.adapt(w); lm.test(test,depth); } } */ irstlm-5.80.03/src/mdiadapt.h000644 000766 000024 00000010520 12114671302 020055 0ustar00nicolabertoldistaff000000 000000 /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ // Adapted LM classes: extension of interp classes #ifndef MF_MDIADAPTLM_H #define MF_MDIADAPTLM_H #include "ngramcache.h" #include "normcache.h" #include "interplm.h" class mdiadaptlm:public interplm { int adaptlev; interplm* forelm; double zeta0; double oovscaling; bool m_save_per_level; protected: normcache *cache; //to improve access speed NGRAMCACHE_t** probcache; NGRAMCACHE_t** backoffcache; int max_caching_level; int saveARPA_per_word(char *filename,int backoff=0,char* subdictfile=NULL); int saveARPA_per_level(char *filename,int backoff=0,char* subdictfile=NULL); int saveBIN_per_word(char *filename,int backoff=0,char* subdictfile=NULL,int mmap=0); int saveBIN_per_level(char *filename,int backoff=0,char* subdictfile=NULL,int mmap=0); public: mdiadaptlm(char* ngtfile,int depth=0,TABLETYPE tt=FULL); inline normcache* get_zetacache() { return cache; } inline NGRAMCACHE_t* get_probcache(int level); inline NGRAMCACHE_t* get_backoffcache(int level); void create_caches(int mcl); void init_caches(); void init_caches(int level); void delete_caches(); void delete_caches(int level); void check_cache_levels(); void check_cache_levels(int level); void reset_caches(); void reset_caches(int level); void caches_stat(); double gis_step; double zeta(ngram ng,int size); int discount(ngram ng,int size,double& fstar,double& lambda,int cv=0); int bodiscount(ngram ng,int size,double& fstar,double& lambda,double& bo); int compute_backoff() { cerr << "compute backoff probabilities ..."; if (m_save_per_level){ cerr << " per level ..."; return compute_backoff_per_level(); }else{ cerr << " per word ..."; return compute_backoff_per_word(); } } int compute_backoff_per_level(); int 
compute_backoff_per_word(); double backunig(ngram ng); double foreunig(ngram ng); int adapt(char* ngtfile,int alev=1,double gis_step=0.4); int scalefact(char* ngtfile); int savescalefactor(char* filename); double scalefact(ngram ng); double prob(ngram ng,int size); double prob(ngram ng,int size,double& fstar,double& lambda, double& bo); double prob2(ngram ng,int size,double & fstar); double txclprob(ngram ng,int size); int saveASR(char *filename,int backoff,char* subdictfile=NULL); int saveMT(char *filename,int backoff,char* subdictfile=NULL,int resolution=10000000,double decay=0.999900); int saveARPA(char *filename,int backoff=0,char* subdictfile=NULL){ if (m_save_per_level){ cerr << " per level ..."; return saveARPA_per_level(filename, backoff, subdictfile); }else{ cerr << " per word ..."; return saveARPA_per_word(filename, backoff, subdictfile); } } int saveBIN(char *filename,int backoff=0,char* subdictfile=NULL,int mmap=0){ if (m_save_per_level){ cerr << " per level ..."; return saveBIN_per_level(filename, backoff, subdictfile, mmap); }else{ cerr << " per word ..."; return saveBIN_per_word(filename, backoff, subdictfile, mmap); } } inline void save_per_level(bool value){ m_save_per_level=value; } inline bool save_per_level(){ return m_save_per_level; } int netsize(); ~mdiadaptlm(); double myround(double x) { long int value = (long int) x; return (x-value)>0.500?value+1.0:(double)value; } inline bool is_train_cache_enabled(){ #ifdef MDIADAPTLM_CACHE_ENABLE return true; #endif return false; } }; #endif irstlm-5.80.03/src/mempool.cpp000644 000766 000024 00000021416 12013405172 020301 0ustar00nicolabertoldistaff000000 000000 // $Id: mempool.cpp 302 2009-08-25 13:04:13Z nicolabertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser 
General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ // An efficient memory pool manager // by M. Federico // Copyright Marcello Federico, ITC-irst, 1998 #include #include #include #include #include #include #include #include #include "util.h" #include "mempool.h" using namespace std; /*! The pool contains: - entries of size is - tables for bs entries */ mempool::mempool(int is, int bs) { // item size must be multiple of memory alignment step (4 bytes) // example: is is=9 becomes i=12 (9 + 4 - 9 %4 ) is=(is>(int)sizeof(char *)?is:0); is=is + sizeof(char *) - (is % sizeof(char *)); item_size = is; block_size = bs; true_size = is * bs; block_list = new memnode; block_list->block = new char[true_size]; memset(block_list->block,'0',true_size); block_list->next = 0; blocknum = 1; entries = 0; // build free list char *ptr = free_list = block_list->block; for (int i=0; iblock = new char[true_size]; //memset(new_block->block,'0',true_size); new_block->next = block_list; block_list=new_block; // update block list /* update free list */ ptr = free_list = block_list->block; for (int i=0; iblock) || (addr >= (list->block + true_size)))) list=list->next; if ((list==NULL) || (((addr - list->block) % item_size)!=0)) { //cerr << "mempool::free-> addr does not belong to this pool\n"; return 0; } */ *(char **)addr=free_list; free_list=addr; entries--; return 1; } 
mempool::~mempool() { memnode *ptr; while (block_list !=NULL) { ptr=block_list->next; delete [] block_list->block; delete block_list; block_list=ptr; } } void mempool::map (ostream& co) { co << "mempool memory map:\n"; //percorri piu` volte la lista libera memnode *bl=block_list; char *fl=free_list; char* img=new char[block_size+1]; img[block_size]='\0'; while (bl !=NULL) { memset(img,'#',block_size); fl=free_list; while (fl != NULL) { if ((fl >= bl->block) && (fl < bl->block + true_size)) { img[(fl-bl->block)/item_size]='-'; } fl=*(char **)fl; } co << img << "\n"; bl=bl->next; } delete [] img; } void mempool::stat() { VERBOSE(1, "mempool class statistics\n" << "entries " << entries << " blocks " << blocknum << " used memory " << (blocknum * true_size)/1024 << " Kb\n"); } strstack::strstack(int bs) { size=bs; list=new memnode; list->block=new char[size]; list->next=0; memset(list->block,'\0',size); idx=0; waste=0; memory=size; entries=0; blocknum=1; } void strstack::stat() { VERBOSE(1, "strstack class statistics\n" << "entries " << entries << " blocks " << blocknum << " used memory " << memory/1024 << " Kb\n"); } const char *strstack::push(const char *s) { int len=strlen(s); if ((len+1) >= size) { cerr << "strstack::push string is too long\n"; exit(1); }; if ((idx+len+1) >= size) { //append a new block //there must be space to //put the index after //the word waste+=size-idx; blocknum++; memory+=size; memnode* nd=new memnode; nd->block=new char[size]; nd->next=list; list=nd; memset(list->block,'\0',size); idx=0; } // append in current block strcpy(&list->block[idx],s); idx+=len+1; entries++; return &list->block[idx-len-1]; } const char *strstack::pop() { if (list==0) return 0; if (idx==0) { // free this block and go to next memnode *ptr=list->next; delete [] list->block; delete list; list=ptr; if (list==0) return 0; else idx=size-1; } //go back to first non \0 while (idx>0) if (list->block[idx--]!='\0') break; //go back to first \0 while (idx>0) if 
(list->block[idx--]=='\0') break; entries--; if (list->block[idx+1]=='\0') { idx+=2; memset(&list->block[idx],'\0',size-idx); return &list->block[idx]; } else { idx=0; memset(&list->block[idx],'\0',size); return &list->block[0]; } } const char *strstack::top() { int tidx=idx; memnode *tlist=list; if (tlist==0) return 0; if (idx==0) { tlist=tlist->next; if (tlist==0) return 0; tidx=size-1; } //go back to first non \0 while (tidx>0) if (tlist->block[tidx--]!='\0') break; //aaa\0bbb\0\0\0\0 //go back to first \0 while (tidx>0) if (tlist->block[tidx--]=='\0') break; if (tlist->block[tidx+1]=='\0') { tidx+=2; return &tlist->block[tidx]; } else { tidx=0; return &tlist->block[0]; } } strstack::~strstack() { memnode *ptr; while (list !=NULL) { ptr=list->next; delete [] list->block; delete list; list=ptr; } } storage::storage(int maxsize,int blocksize) { newmemory=0; newcalls=0; setsize=maxsize; poolsize=blocksize; //in bytes poolset=new mempool* [setsize+1]; for (int i=0; i<=setsize; i++) poolset[i]=NULL; } storage::~storage() { for (int i=0; i<=setsize; i++) if (poolset[i]) delete poolset[i]; delete [] poolset; } char *storage::allocate(int size) { if (size<=setsize) { if (!poolset[size]) { poolset[size]=new mempool(size,poolsize/size); } return poolset[size]->allocate(); } else { newmemory+=size+8; newcalls++; char* p=(char *)calloc(sizeof(char),size); if (p==NULL) { cerr << "storage::alloc insufficient memory\n"; exit(1); } return p; } } char *storage::reallocate(char *oldptr,int oldsize,int newsize) { char *newptr; assert(newsize>oldsize); if (oldsize<=setsize) { if (newsize<=setsize) { if (!poolset[newsize]) poolset[newsize]=new mempool(newsize,poolsize/newsize); newptr=poolset[newsize]->allocate(); memset((char*)newptr,0,newsize); } else newptr=(char *)calloc(sizeof(char),newsize); if (oldptr && oldsize) { memcpy(newptr,oldptr,oldsize); poolset[oldsize]->free(oldptr); } } else { newptr=(char *)realloc(oldptr,newsize); if (newptr==oldptr) cerr << "r\b"; else cerr << 
"a\b"; } if (newptr==NULL) { cerr << "storage::realloc insufficient memory\n"; exit(1); } return newptr; } int storage::free(char *addr,int size) { /* while(size<=setsize){ if (poolset[size] && poolset[size]->free(addr)) break; size++; } */ if (size>setsize) return free(addr),1; else { poolset[size] && poolset[size]->free(addr); } return 1; } void storage::stat() { int used=0; int memory=sizeof(char *) * setsize; int waste=0; for (int i=0; i<=setsize; i++) if (poolset[i]) { used++; memory+=poolset[i]->used(); waste+=poolset[i]->wasted(); } VERBOSE(1, "storage class statistics\n" << "alloc entries " << newcalls << " used memory " << newmemory/1024 << "Kb\n" << "mpools " << setsize << " active " << used << " used memory " << memory/1024 << "Kb" << " wasted " << waste/1024 << "Kb\n"); } irstlm-5.80.03/src/mempool.h000644 000766 000024 00000011166 12030777462 017764 0ustar00nicolabertoldistaff000000 000000 // $Id: mempool.h 383 2010-04-23 15:29:28Z nicolabertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ // An efficient memory manager // by M. 
Federico // Copyright Marcello Federico, ITC-irst, 1998 #ifndef MF_MEMPOOL_H #define MF_MEMPOOL_H #ifndef NULL const int NULL=0; #endif #define MP_BLOCK_SIZE 1000000 #include // std::ostream //! Memory block /*! This can be used by: - mempool to store items of fixed size - strstack to store strings of variable size */ class memnode { friend class mempool; //!< grant access friend class strstack; //!< grant access char *block; //!< block of memory memnode *next; //!< next block ptr }; //! Memory pool /*! A memory pool is composed of: - a linked list of block_num memory blocks - each block might contain up to block_size items - each item is made of exactly item_size bytes */ class mempool { int block_size; //!< number of entries per block int item_size; //!< number of bytes per entry int true_size; //!< number of bytes per block memnode* block_list; //!< list of blocks char* free_list; //!< free entry list int entries; //!< number of stored entries int blocknum; //!< number of allocated blocks public: //! Creates a memory pool mempool(int is, int bs=MP_BLOCK_SIZE); //! Destroys memory pool ~mempool(); //! Prints a map of memory occupancy void map(std::ostream& co); //! Allocates a single memory entry char *allocate(); //! Frees a single memory entry int free(char* addr); //! Prints statistics about this mempool void stat(); //! Returns effectively used memory (bytes) /*! includes 8 bytes required by each call of new */ int used() { return blocknum * (true_size + 8); } //! Returns amount of wasted memory (bytes) int wasted() { return used()-(entries * item_size); } }; //! A stack to store strings /*! 
The stack is composed of - a list of blocks memnode of fixed size - attribute blocknum tells the block on top - attribute idx tells position of the top string */ class strstack { memnode* list; //!< list of memory blocks int size; //!< size of each block int idx; //!< index of last stored string int waste; //!< current waste of memory int memory; //!< current use of memory int entries; //!< current number of stored strings int blocknum; //!< current number of used blocks public: strstack(int bs=1000); ~strstack(); const char *push(const char *s); const char *pop(); const char *top(); void stat(); int used() { return memory; } int wasted() { return waste; } }; //! Manages multiple memory pools /*! This class permits to manage memory pools with items up to a specified size. - items within the allowed range are stored in memory pools - items larger than the limit are allocated with new */ class storage { mempool **poolset; //!< array of memory pools int setsize; //!< number of memory pools/maximum elem size int poolsize; //!< size of each block int newmemory; //!< stores amount of used memory int newcalls; //!< stores number of allocated blocks public: //! Creates storage storage(int maxsize,int blocksize); //! Destroys storage ~storage(); /* names of below functions have been changed so as not to interfere with macros for malloc/realloc/etc -- EVH */ //! Allocates memory char *allocate(int size); //! Realloc memory char *reallocate(char *oldptr,int oldsize,int newsize); //! Frees memory of an entry int free(char *addr,int size=0); //! 
Prints statistics about storage void stat(); }; #endif irstlm-5.80.03/src/mfstream.cpp000644 000766 000024 00000005443 12013405172 020451 0ustar00nicolabertoldistaff000000 000000 // $Id: mfstream.cpp 294 2009-08-19 09:57:27Z mfederico $ /****************************************************************************** IrstLM: IRST Language Model Toolkit, compile LM Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include "mfstream.h" using namespace std; void mfstream::open(const char *name,openmode mode) { char cmode[10]; if (strchr(name,' ')!=0) { if (mode & ios::in) strcpy(cmode,"r"); else if (mode & ios::out) strcpy(cmode,"w"); else if (mode & ios::app) strcpy(cmode,"a"); else { cerr << "cannot open file\n"; exit(1); } _cmd=1; strcpy(_cmdname,name); _FILE=popen(name,cmode); buf=new fdbuf(fileno(_FILE)); iostream::rdbuf((streambuf*) buf); } else { _cmd=0; fstream::open(name,mode); } } void mfstream::close() { if (_cmd==1) { pclose(_FILE); delete buf; } else { fstream::clear(); fstream::close(); } _cmd=2; } int mfstream::swapbytes(char *p, int sz, int n) { char c, *l, *h; if((n<1) ||(sz<2)) return 0; for(; n--; p+=sz) for(h=(l=p)+sz; --h>l; l++) { c=*h; 
*h=*l; *l=c; } return 0; }; mfstream& mfstream::iwritex(streampos loc,void *ptr,int size,int n) { streampos pos=tellp(); seekp(loc); writex(ptr,size,n); seekp(pos); return *this; } mfstream& mfstream::readx(void *p, int sz,int n) { if(!read((char *)p, sz * n)) return *this; if(*(short *)"AB"==0x4241) { swapbytes((char*)p, sz,n); } return *this; } mfstream& mfstream::writex(void *p, int sz,int n) { if(*(short *)"AB"==0x4241) { swapbytes((char*)p, sz,n); } write((char *)p, sz * n); if(*(short *)"AB"==0x4241) swapbytes((char*)p, sz,n); return *this; } /* int main() { char word[1000]; mfstream inp("cat pp",ios::in); mfbstream outp("aa",ios::out,100); while (inp >> word){ outp << word << "\n"; cout << word << "\n"; } } */ irstlm-5.80.03/src/mfstream.h000644 000766 000024 00000013325 12042554746 020132 0ustar00nicolabertoldistaff000000 000000 // $Id: mfstream.h 383 2010-04-23 15:29:28Z nicolabertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit, compile LM Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include #include #include #include #include #include using namespace std; #ifndef MF_STREAM_H #define MF_STREAM_H extern "C" { ssize_t write (int fd, const void* buf, size_t num); ssize_t read (int fd, void* buf, size_t num); FILE *popen(const char *command, const char *type); int pclose(FILE *stream); int fseek( FILE *stream, long offset, int whence); long ftell( FILE *stream); }; //! File description for I/O stream buffer class fdbuf : public std::streambuf { protected: int fd; // file descriptor // write one character virtual int_type overflow (int_type c) { char z = c; if (c != EOF) { if (write (fd, &z, 1) != 1) { return EOF; } } //cerr << "overflow: \n"; //cerr << "pptr: " << (int) pptr() << "\n"; return c; } // write multiple characters virtual std::streamsize xsputn (const char* s, std::streamsize num) { return write(fd,s,num); } virtual streampos seekpos ( streampos /* unused parameter: sp */, ios_base::openmode /* unused parameter: which */= ios_base::in | ios_base::out ) { std::cerr << "mfstream::seekpos is not implemented" << std::endl;; return (streampos) 0; } //read one character virtual int_type underflow () { // is read position before end of buffer? 
if (gptr() < egptr()) { return traits_type::to_int_type(*gptr()); } /* process size of putback area * - use number of characters read * - but at most four */ int numPutback; numPutback = gptr() - eback(); if (numPutback > 4) { numPutback = 4; } /* copy up to four characters previously read into * the putback buffer (area of first four characters) */ std::memmove (buffer+(4-numPutback), gptr()-numPutback, numPutback); // read new characters int num; num = read (fd, buffer+4, bufferSize-4); if (num <= 0) { // ERROR or EOF return EOF; } // reset buffer pointers setg (buffer+(4-numPutback), // beginning of putback area buffer+4, // read position buffer+4+num); // end of buffer // return next character return traits_type::to_int_type(*gptr()); } // read multiple characters virtual std::streamsize xsgetn (char* s, std::streamsize num) { return read(fd,s,num); } static const int bufferSize = 10; // size of the data buffer char buffer[bufferSize]; // data buffer public: // constructor fdbuf (int _fd) : fd(_fd) { setg (buffer+4, // beginning of putback area buffer+4, // read position buffer+4); // end position } }; //! Extension of fstream to commands class mfstream : public std::fstream { protected: fdbuf* buf; int _cmd; openmode _mode; FILE* _FILE; int swapbytes(char *p, int sz, int n); public: char _cmdname[500]; //! Creates and opens a file/command stream without a specified nmode mfstream () : std::fstream(), _cmd(0) { } //! Creates and opens a file/command stream in a specified nmode mfstream (const char* name,openmode mode) : std::fstream() { _cmdname[0]='\0'; _mode=mode; open(name,mode); } //! Closes and destroys a file/command stream ~mfstream() { if (_cmd<2) close(); } //! Opens an existing mfstream void open(const char *name,openmode mode); //! Closes an existing mfstream void close(); //! Write function for machine-independent byte order mfstream& writex(void *p, int sz,int n=1); //! 
Read function for machine-independent byte order mfstream& readx(void *p, int sz,int n=1); //! Write function at a given stream position for machine-independent byte order mfstream& iwritex(streampos loc,void *ptr,int size,int n=1); //! Tells current position within a file streampos tellp() { if (_cmd==0) return (streampos) fstream::tellg(); cerr << "tellp not allowed on commands\n"; exit(1); } //! Seeks a position within a file mfstream& seekp(streampos loc) { if (_cmd==0) fstream::seekg(loc); else { cerr << "seekp not allowed on commands\n"; exit(1); } return *this; } //! Reopens an input stream mfstream& reopen() { if (_mode != in) { cerr << "mfstream::reopen() openmode must be ios:in\n"; exit(1); } if (strlen(_cmdname)>0) { char *a=new char[strlen(_cmdname)+1]; strcpy(a,_cmdname); cerr << "close/open " << a <<"\n"; close(); open(a,ios::in); } else seekp(0); return *this; } }; #endif irstlm-5.80.03/src/mixture.cpp000644 000766 000024 00000030362 12137531327 020337 0ustar00nicolabertoldistaff000000 000000 /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ using namespace std; #include #include "mfstream.h" #include "mempool.h" #include "dictionary.h" #include "n_gram.h" #include "ngramtable.h" #include "interplm.h" #include "normcache.h" #include "ngramcache.h" #include "mdiadapt.h" #include "shiftlm.h" #include "linearlm.h" #include "mixture.h" #include "cmd.h" #include "util.h" // //Mixture interpolated language model // static Enum_T SLmTypeEnum [] = { { (char*)"ModifiedShiftBeta", MOD_SHIFT_BETA }, { (char*)"msb", MOD_SHIFT_BETA }, { (char*)"InterpShiftBeta", SHIFT_BETA }, { (char*)"sb", SHIFT_BETA }, { (char*)"InterpShiftOne", SHIFT_ONE }, { (char*)"s1", SHIFT_ONE }, { (char*)"InterpShiftZero", SHIFT_ZERO }, { (char*)"s0", SHIFT_ZERO }, { (char*)"LinearWittenBell", LINEAR_WB }, { (char*)"wb", LINEAR_WB }, { (char*)"Mixture", MIXTURE}, END_ENUM }; mixture::mixture(bool fulltable,char* sublminfo,int depth,int prunefreq,char* ipfile,char* opfile): mdiadaptlm((char *)NULL,depth) { prunethresh=prunefreq; ipfname=ipfile; opfname=opfile; usefulltable=fulltable; mfstream inp(sublminfo,ios::in ); if (!inp) { cerr << "cannot open " << sublminfo << "\n"; exit(1); } inp >> numslm; sublm=new interplm* [numslm]; cerr << "WARNING: Parameters PruneTopSingletons (ps) and PruneSingletons (pts) are not taken into account for this type of LM (mixture); please specify the singleton pruning policy for each submodel using parameters \"-sps\" and \"-spts\" in the configuraton file\n"; for (int i=0; i> par[j]; } subtrainfile=NULL; slmtype=0; subprunefreq=-1; subprunesingletons=true; subprunetopsingletons=false; GetParams(&npar, &par, (char*) NULL); if (!slmtype || !subtrainfile || !subprunefreq==-1) { cerr << "slm incomplete parameters\n"; exit(1); } 
switch (slmtype) { case LINEAR_WB: sublm[i]=new linearwb(subtrainfile,depth,subprunefreq,MSHIFTBETA_I); break; case SHIFT_BETA: sublm[i]=new shiftbeta(subtrainfile,depth,subprunefreq,-1,SHIFTBETA_I); break; case SHIFT_ONE: sublm[i]=new shiftbeta(subtrainfile,depth,subprunefreq,SIMPLE_I); break; case MOD_SHIFT_BETA: sublm[i]=new mshiftbeta(subtrainfile,depth,subprunefreq,MSHIFTBETA_I); break; case MIXTURE: sublm[i]=new mixture(usefulltable,subtrainfile,depth,subprunefreq); break; default: cerr << "not implemented yet\n"; exit(1); }; sublm[i]->prunesingletons(subprunesingletons==true); sublm[i]->prunetopsingletons(subprunetopsingletons==true); if (subprunetopsingletons==true) //apply most specific pruning method sublm[i]->prunesingletons(false); cerr << "eventually generate OOV code of sub lm[" << i << "]\n"; sublm[i]->dict->genoovcode(); //create super dictionary dict->augment(sublm[i]->dict); //creates the super n-gram table if(usefulltable) augment(sublm[i]); } cerr << "eventually generate OOV code of the mixture\n"; dict->genoovcode(); cerr << "dict size of the mixture:" << dict->size() << "\n"; //tying parameters k1=2; k2=10; }; double mixture::reldist(double *l1,double *l2,int n) { double dist=0.0,size=0.0; for (int i=0; idict; cerr << "Computing parameters mapping: ..." 
<< d->size() << " "; pm=new int[d->size()]; //initialize for (int i=0; isize(); i++) pm[i]=0; pmax=k2-k1+1; //update # of parameters for (int w=0; wsize(); w++) { int f=d->freq(w); if ((f>k1) && (f<=k2)) pm[w]=f-k1; else if (f>k2) { pm[w]=pmax++; } } cerr << "pmax " << pmax << " "; return 1; } int mixture::pmap(ngram ng,int lev) { ngram h(sublm[0]->dict); h.trans(ng); if (lev<=1) return 0; //get the last word of history if (!sublm[0]->get(h,2,1)) return 0; return (int) pm[*h.wordp(2)]; } int mixture::savepar(char* opf) { mfstream out(opf,ios::out); cerr << "saving parameters in " << opf << "\n"; out << lmsize() << " " << pmax << "\n"; for (int i=0; i<=lmsize(); i++) for (int j=0; jsize()) { cerr << "\nERROR: DUB value is too small: the LM will possibly compute wrong probabilities if sub-LMs have different vocabularies!\n"; cerr << "This exception should already have been handled before!!!\n"; exit(1); } cerr << "mixlm --> DUB: " << dub() << endl; for (int i=0; i DUB: " << sublm[i]->dub() << endl; cerr << "eventually generate OOV code "; cerr << sublm[i]->dict->encode(sublm[i]->dict->OOV()) << "\n"; sublm[i]->train(); } //initialize parameters for (int i=0; i<=lmsize(); i++) { l[i]=new double*[pmax]; for (int j=0; jdict); for (int lev=1; lev<=lmsize(); lev++) { zf=sublm[0]->zerofreq(lev); cerr << "Starting training at lev:" << lev << "\n"; for (int i=0; iscan(ng,INIT,lev); while(sublm[0]->scan(ng,CONT,lev)) { //do not include oov for unigrams if ((lev==1) && (*ng.wordp(1)==sublm[0]->dict->oovcode())) continue; int par=pmap(ng,lev); used[par]=1; //controllo se aggiornare il parametro if (alive[par]) { double backoff=(lev>1?prob(ng,lev-1):1); //backoff double denom=0.0; double* numer = new double[numslm]; double fstar,lambda; //int cv=(int)floor(zf * (double)ng.freq + rand01()); //int cv=1; //old version of leaving-one-out int cv=(int)floor(zf * (double)ng.freq)+1; //int cv=1; //old version of leaving-one-out //if (lev==3)q //if (iter>10) // cout << ng // << " backoff 
" << backoff // << " level " << lev // << "\n"; for (int i=0; idiscount(ng,lev,fstar,lambda,(i==0)*(cv)); numer[i]=oldl[par][i]*(fstar + lambda * backoff); ngram ngslm(sublm[i]->dict); ngslm.trans(ng); if ((*ngslm.wordp(1)==sublm[i]->dict->oovcode()) && (dict->dub() > sublm[i]->dict->size())) numer[i]/=(double)(dict->dub() - sublm[i]->dict->size()); denom+=numer[i]; } for (int i=0; i10) //cout << ng << " l: " << l[lev][par][i] << "\n"; } delete []numer; } } //normalize all parameters totalive=0; for (int i=0; idiscount(ng,size,fstar2,lambda2,0); ngram ngslm(sublm[i]->dict); ngslm.trans(ng); if (dict->dub() > sublm[i]->dict->size()){ if (*ngslm.wordp(1) == sublm[i]->dict->oovcode()) { fstar2/=(double)(sublm[i]->dict->dub() - sublm[i]->dict->size()+1); } } fstar+=(l[size][p][i]*fstar2); lambda+=(l[size][p][i]*lambda2); lsum+=l[size][p][i]; } if (dict->dub() > dict->size()) if (*ng.wordp(1) == dict->oovcode()) { fstar*=(double)(dict->dub() - dict->size()+1); } assert((lsum>LOWER_DOUBLE_PRECISION_OF_1) && (lsum<=UPPER_DOUBLE_PRECISION_OF_1)); return 1; } //creates the ngramtable on demand from the sublm tables int mixture::get(ngram& ng,int n,int lev) { if (usefulltable) { return ngramtable::get(ng,n,lev); } //free current tree resetngramtable(); //get 1-word prefix from ng ngram ug(dict,1); *ug.wordp(1)=*ng.wordp(ng.size); //local ngram to upload entries ngram locng(dict,maxlevel()); //allocate subtrees from sublm for (int i=0; idict,1); subug.trans(ug); if (sublm[i]->get(subug,1,1)) { ngram subng(sublm[i]->dict,maxlevel()); *subng.wordp(maxlevel())=*subug.wordp(1); sublm[i]->scan(subug.link,subug.info,1,subng,INIT,maxlevel()); while(sublm[i]->scan(subug.link,subug.info,1,subng,CONT,maxlevel())) { locng.trans(subng); put(locng); } } } return ngramtable::get(ng,n,lev); } irstlm-5.80.03/src/mixture.h000644 000766 000024 00000004310 12030777462 020002 0ustar00nicolabertoldistaff000000 000000 /****************************************************************************** 
IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ // Mixture of linear interpolation LMs #ifndef LM_MIXTURE #define LM_MIXTURE #define END_ENUM { (char*)0, 0 } class mixture: public mdiadaptlm { double** l[MAX_NGRAM]; //interpolation parameters int* pm; //parameter mappings int pmax; //#parameters int k1,k2; //two thresholds int numslm; int prunethresh; interplm** sublm; char *ipfname; char *opfname; double reldist(double *l1,double *l2,int n); int genpmap(); int pmap(ngram ng,int lev); public: bool usefulltable; mixture(bool fulltable,char *sublminfo,int depth,int prunefreq=0,char* ipfile=NULL,char* opfile=NULL); int train(); int savepar(char* opf); int loadpar(char* opf); inline int dub() { return dict->dub(); } inline int dub(int value) { for (int i=0; idub(value); } return dict->dub(value); } void settying(int a,int b) { k1=a; k2=b; } int discount(ngram ng,int size,double& fstar,double& lambda,int cv=0); ~mixture(){ for (int i=0;i<=lmsize();i++){ for (int j=0; j #include #include #include #include #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "index.h" using namespace std; ngram::ngram(dictionary* d,int 
sz) { dict=d; size=sz; succ=0; freq=0; info=0; pinfo=0; link=NULL; isym=-1; memset(word,0,sizeof(int)*MAX_NGRAM); memset(midx,0,sizeof(int)*MAX_NGRAM); memset(path,0,sizeof(char *)*MAX_NGRAM); } ngram::ngram(ngram& ng) { size=ng.size; freq=ng.freq; succ=0; info=0; pinfo=0; link=NULL; isym=-1; dict=ng.dict; memcpy(word,ng.word,sizeof(int)*MAX_NGRAM); memcpy(midx,ng.word,sizeof(int)*MAX_NGRAM); } void ngram::trans (const ngram& ng) { size=ng.size; freq=ng.freq; if (dict == ng.dict) { info=ng.info; isym=ng.isym; memcpy(word,ng.word,sizeof(int)*MAX_NGRAM); memcpy(midx,ng.midx,sizeof(int)*MAX_NGRAM); } else { info=0; memset(midx,0,sizeof(int)*MAX_NGRAM); isym=-1; for (int i=1; i<=size; i++) word[MAX_NGRAM-i]=dict->encode(ng.dict->decode(*ng.wordp(i))); } } void ngram::invert (const ngram& ng) { size=ng.size; for (int i=1; i<=size; i++) { *wordp(i)=*ng.wordp(size-i+1); } } void ngram::shift () { memmove((void *)&word[MAX_NGRAM-size+1],(void *)&word[MAX_NGRAM-size],(size-1) * sizeof(int)); size--; } void ngram::shift (int sz) { if (sz>size) sz=size; memmove((void *)&word[MAX_NGRAM-size+sz],(void *)&word[MAX_NGRAM-size],(size-sz) * sizeof(int)); size-=sz; } ifstream& operator>> ( ifstream& fi , ngram& ng) { char w[MAX_WORD]; memset(w,0,MAX_WORD); w[0]='\0'; if (!(fi >> setw(MAX_WORD) >> w)) return fi; if (strlen(w)==(MAX_WORD-1)) cerr << "ngram: a too long word was read (" << w << ")\n"; int c=ng.dict->encode(w); if (c == -1 ) { cerr << "ngram: " << w << " is OOV \n"; exit(1); } memcpy(ng.word,ng.word+1,(MAX_NGRAM-1)*sizeof(int)); ng.word[MAX_NGRAM-1]=(int)c; ng.freq=1; if (ng.sizeencode(w); if (c == -1 ) { cerr << "ngram: " << w << " is OOV \n"; exit(1); } pushc(c); return 1; } int ngram::pushc(int c) { size++; if (size>MAX_NGRAM) size=MAX_NGRAM; size_t len = size - 1; //i.e. 
if size==MAX_NGRAM, the farthest position is lost size_t src = MAX_NGRAM - len; memmove((void *)&word[src - 1],(void *)&word[src], len * sizeof(int)); /* int buff[MAX_NGRAM-1]; memcpy(buff,word+1,(MAX_NGRAM-1)*sizeof(int)); memcpy(word,buff,(MAX_NGRAM-1)*sizeof(int)); */ word[MAX_NGRAM-1]=c; // fill the most recent position return 1; } int ngram::pushc(int* codes, int codes_len) { //copy the first codes_len elements from codes into the actual ngram; sz must be smaller than MAX_NGRAM //shift codes_len elements of the ngram backwards assert (codes_len <= MAX_NGRAM); size+=codes_len; /* std::cout << " codes_len:" << codes_len << " size:" << size << std::endl; */ if (size>MAX_NGRAM) size=MAX_NGRAM; size_t len = size - codes_len; size_t src = MAX_NGRAM - len; /* std::cout << " codes_len:" << codes_len << " size:" << size << " len:" << len << " src:" << src << std::endl; */ if (len > 0) memmove((void *)&word[src - codes_len],(void *)&word[src], len * sizeof(int)); // memcpy((void *)&word[MAX_NGRAM - codes_len],(void*)&codes[MAX_NGRAM - codes_len],codes_len*sizeof(int)); memcpy((void *)&word[MAX_NGRAM - codes_len],(void*)&codes[0],codes_len*sizeof(int)); return 1; } istream& operator>> ( istream& fi , ngram& ng) { char w[MAX_WORD]; memset(w,0,MAX_WORD); w[0]='\0'; assert(ng.dict != NULL); if (!(fi >> setw(MAX_WORD) >> w)) return fi; if (strlen(w)==(MAX_WORD-1)) cerr << "ngram: a too long word was read (" << w << ")\n"; ng.pushw(w); ng.freq=1; return fi; } ofstream& operator<< (ofstream& fo,ngram& ng) { assert(ng.dict != NULL); for (int i=ng.size; i>0; i--) fo << ng.dict->decode(ng.word[MAX_NGRAM-i]) << (i>1?" ":""); fo << "\t" << ng.freq; return fo; } ostream& operator<< (ostream& fo,ngram& ng) { assert(ng.dict != NULL); for (int i=ng.size; i>0; i--) fo << ng.dict->decode(ng.word[MAX_NGRAM-i]) << (i>1?" 
":""); fo << "\t" << ng.freq; return fo; } /* main(int argc, char** argv){ dictionary d(argv[1]); ifstream txt(argv[1]); ngram ng(&d); while (txt >> ng){ cout << ng << "\n"; } ngram ng2=ng; cerr << "copia l'ultimo =" << ng << "\n"; } */ irstlm-5.80.03/src/n_gram.h000644 000766 000024 00000007520 12042554746 017557 0ustar00nicolabertoldistaff000000 000000 // $Id: n_gram.h 3461 2010-08-27 10:17:34Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ // n-gram tables // by M. 
Federico // Copyright Marcello Federico, ITC-irst, 1998 #ifndef MF_NGRAM_H #define MF_NGRAM_H #include #include #include "dictionary.h" #ifdef MYMAXNGRAM #define MAX_NGRAM MYMAXNGRAM #else #define MAX_NGRAM 20 #endif class dictionary; //typedef int code; class ngram { int word[MAX_NGRAM]; //encoded ngram public: dictionary *dict; // dictionary char* link; // ngram-tree pointer char* succlink; // pointer to the first successor int midx[MAX_NGRAM]; // ngram-tree scan pointer char* path[MAX_NGRAM]; // path in the ngram-trie float bowv[MAX_NGRAM]; // vector of bow found in the trie int lev; // ngram-tree level int size; // ngram size long long freq; // ngram frequency or integer prob int succ; // number of successors float bow; // back-off weight float prob; // probability unsigned char info; // ngram-tree info flags unsigned char pinfo; // ngram-tree parent info flags int isym; // last interruption symbol ngram(dictionary* d,int sz=0); ngram(ngram& ng); int *wordp() { // n-gram pointer return wordp(size); } int *wordp(int k) { // n-gram pointer return size>=k?&word[MAX_NGRAM-k]:0; } const int *wordp() const { // n-gram pointer return wordp(size); } const int *wordp(int k) const { // n-gram pointer return size>=k?&word[MAX_NGRAM-k]:0; } int containsWord(const char* s,int lev) { int c=dict->encode(s); if (c == -1) return 0; assert(lev <= size); for (int i=0; i> (std::ifstream& fi,ngram& ng); friend std::ofstream& operator<< (std::ofstream& fi,ngram& ng); friend std::istream& operator>> (std::istream& fi,ngram& ng); friend std::ostream& operator<< (std::ostream& fi,ngram& ng); inline bool operator==(const ngram &compare) const { if ( size != compare.size || dict != compare.dict) return false; else for (int i=size; i>0; i--) if (word[MAX_NGRAM-i] != compare.word[MAX_NGRAM-i]) return false; return true; } inline bool operator!=(const ngram &compare) const { if ( size != compare.size || dict != compare.dict) return true; else for (int i=size; i>0; i--) if (word[MAX_NGRAM-i] 
!= compare.word[MAX_NGRAM-i]) return true; return false; } inline int ckhisto(int sz) { for (int i=sz; i>1; i--) if (*wordp(i)==dict->oovcode()) return 0; return 1; } int pushc(int c); int pushc(int* codes, int sz); int pushw(const char* w); //~ngram(); }; #endif irstlm-5.80.03/src/ngramcache.cpp000644 000766 000024 00000010217 12030777462 020733 0ustar00nicolabertoldistaff000000 000000 // $Id: ngramcache.cpp 3679 2010-10-13 09:10:01Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include #include #include #include #include #include "math.h" #include "mempool.h" #include "htable.h" #include "lmtable.h" #include "ngramcache.h" using namespace std; void ngramcache::print (const int* ngp) { std::cerr << "ngp: size:" << ngsize << "|"; for (int i=0; i((size_t) (maxn/load_factor), ngsize * sizeof(int)); //decrease the lower load factor to reduce collision mp=new mempool(ngsize * sizeof(int)+infosize,MP_BLOCK_SIZE); accesses=0; hits=0; }; ngramcache::~ngramcache() { //ht->stat(); //mp->stat(); delete ht; delete mp; }; //resize cache to specified number of entries void ngramcache::reset(int n) { //ht->stat(); delete ht; delete mp; if (n>0) maxn=n; ht=new htable ((size_t) (maxn/load_factor), ngsize * sizeof(int)); //decrease the lower load factor to reduce collision mp=new mempool(ngsize * sizeof(int)+infosize,MP_BLOCK_SIZE); entries=0; }; char* ngramcache::get(const int* ngp,char*& info) { char* found; accesses++; if ((found=(char*) ht->find((int *)ngp))) { memcpy(&info,found+ngsize*sizeof(int),infosize); hits++; } return found; }; char* ngramcache::get(const int* ngp,double& info) { char *found; accesses++; if ((found=(char*) ht->find((int *)ngp))) { memcpy(&info,found+ngsize*sizeof(int),infosize); hits++; }; return found; }; char* ngramcache::get(const int* ngp,prob_and_state_t& info) { char *found; accesses++; if ((found=(char*) ht->find((int *)ngp))) { memcpy(&info,found+ngsize*sizeof(int),infosize); hits++; }; return found; }; int ngramcache::add(const int* ngp,const char*& info) { char* entry=mp->allocate(); memcpy(entry,(char*) ngp,sizeof(int) * ngsize); memcpy(entry + ngsize * sizeof(int),&info,infosize); char* 
found=(char*)ht->insert((int *)entry); assert(found == entry); //false if key is already inside entries++; return 1; }; int ngramcache::add(const int* ngp,const double& info) { char* entry=mp->allocate(); memcpy(entry,(char*) ngp,sizeof(int) * ngsize); memcpy(entry + ngsize * sizeof(int),&info,infosize); char *found=(char*) ht->insert((int *)entry); assert(found == entry); //false if key is already inside entries++; return 1; }; int ngramcache::add(const int* ngp,const prob_and_state_t& info) { char* entry=mp->allocate(); memcpy(entry,(char*) ngp,sizeof(int) * ngsize); memcpy(entry + ngsize * sizeof(int),&info,infosize); char *found=(char*) ht->insert((int *)entry); assert(found == entry); //false if key is already inside entries++; return 1; }; void ngramcache::stat() { cerr << "ngramcache stats: entries=" << entries << " acc=" << accesses << " hits=" << hits << " ht.used= " << ht->used() << " mp.used= " << mp->used() << " mp.wasted= " << mp->wasted() << "\n"; }; irstlm-5.80.03/src/ngramcache.h000644 000766 000024 00000005344 12013405172 020370 0ustar00nicolabertoldistaff000000 000000 // $Id: ngramcache.h 3679 2010-10-13 09:10:01Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #ifndef MF_NGRAMCACHE_H #define MF_NGRAMCACHE_H #include "mempool.h" #include "htable.h" #define NGRAMCACHE_t ngramcache #define NGRAMCACHE_LOAD_FACTOR 0.5 typedef struct PROB_AND_STATE_ENTRY { double logpr; //!< probability value of an ngram char* state; //!< the largest suffix of an n-gram contained in the LM table. unsigned int statesize; //!< LM statesize of an ngram double bow; //!< backoff weight int bol; //!< backoff level bool extendible; //!< flag for extendibility of the ngram PROB_AND_STATE_ENTRY(double lp=0.0, char* st=NULL, unsigned int stsz=0, double bw=0.0, int bl=0, bool ex=false): logpr(lp), state(st), statesize(stsz), bow(bw), bol(bl), extendible(ex) {}; //initializer } prob_and_state_t; void print(prob_and_state_t* pst, std::ostream& out=std::cout); class ngramcache { private: static const bool debug=true; htable* ht; mempool *mp; int maxn; int ngsize; int infosize; int accesses; int hits; int entries; float load_factor; //!< ngramcache loading factor void print(const int*); public: ngramcache(int n,int size,int maxentries,float lf=NGRAMCACHE_LOAD_FACTOR); ~ngramcache(); int cursize() { return entries; } int maxsize() { return maxn; } void reset(int n=0); char* get(const int* ngp,char*& info); char* get(const int* ngp,double& info); char* get(const int* ngp,prob_and_state_t& info); int add(const int* ngp,const char*& info); int add(const int* ngp,const double& info); int add(const int* ngp,const prob_and_state_t& info); int isfull() { return (entries >= maxn); } void stat(); inline void used() { stat(); }; inline float set_load_factor(float value) { return load_factor=value; } }; #endif irstlm-5.80.03/src/ngramtable.cpp000644 000766 000024 00000074311 
12026703050 020747 0ustar00nicolabertoldistaff000000 000000 // $Id: ngramtable.cpp 35 2010-07-19 14:52:11Z nicolabertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit, compile LM Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ using namespace std; #include "mfstream.h" #include "math.h" #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "ngramtable.h" ngramtable::ngramtable(char* filename,int maxl,char* /* unused parameter: is */, dictionary* extdict /* external dictionary */,char* filterdictfile, int googletable,int dstco,char* hmask, int inplen,TABLETYPE ttype, int codesize): tabletype(ttype,codesize) { cerr << "[codesize " << CODESIZE << "]\n"; char header[100]; info[0]='\0'; corrcounts=0; if (filename) { int n; mfstream inp(filename,ios::in ); inp >> header; if (strncmp(header,"nGrAm",5)==0 || strncmp(header,"NgRaM",5)==0) { inp >> n; inp >> card; inp >> info; if (strcmp(info,"LM_")==0) { inp >> resolution; inp >> decay; sprintf(info,"%s %d %f",info,resolution,decay); } else { //default for old LM probs resolution=10000000; decay=0.9999; } maxl=n; //owerwrite maxl cerr 
<< n << " " << card << " " << info << "\n"; } inp.close(); } if (!maxl) { cerr << "ngramtable: ngram size must be specified\n"; exit(1); } //distant co-occurreces works for bigrams and trigrams if (dstco && (maxl!=2) && (maxl!=3)) { cerr << "distant co-occurrences work with 2-gram and 3-gram!\n"; exit(1); } maxlev=maxl; //Root not must have maximum frequency size treeflags=INODE | FREQ6; tree=(node) new char[inodesize(6)]; memset(tree,0,inodesize(6)); //1-gram table initial flags if (maxlev>1) mtflags(tree,INODE | FREQ4); else if (maxlev==1) mtflags(tree,LNODE | FREQ4); else { cerr << "ngramtable: wrong level setting\n"; exit(1); } word(tree,0); // dummy variable if (I_FREQ_NUM) freq(tree,treeflags,0); // frequency of all n-grams msucc(tree,0); // number of different n-grams mtable(tree,NULL); // table of n-gram mem=new storage(256,10000); mentr=new long long[maxlev+1]; memory= new int[maxlev+1]; occupancy= new int[maxlev+1]; //Book keeping of occupied memory mentr[0]=1; memory[0]=inodesize(6); // root is an inode with highest frequency occupancy[0]=inodesize(6); // root is an inode with highest frequency for (int i=1; i<=maxlev; i++) mentr[i]=memory[i]=occupancy[i]=0; dict=new dictionary(NULL,1000000); if (!filename) return ; filterdict=NULL; if (filterdictfile) { filterdict=new dictionary(filterdictfile,1000000); /* filterdict->incflag(1); filterdict->encode(BOS_); filterdict->encode(EOS_); filterdict->incflag(0); */ } // switch to specific loading methods if ((strncmp(header,"ngram",5)==0) || (strncmp(header,"NGRAM",5)==0)) { cerr << "this ngram file format is no more supported!\n"; exit(1); } if (strncmp(header,"nGrAm",5)==0) loadtxt(filename); else if (strncmp(header,"NgRaM",5)==0) loadbin(filename); else if (dstco>0) generate_dstco(filename,dstco); else if (hmask != NULL) generate_hmask(filename,hmask,inplen); else if (googletable) loadtxt(filename,googletable); else generate(filename,extdict); if (tbtype()==LEAFPROB) { du_code=dict->encode(DUMMY_); 
bo_code=dict->encode(BACKOFF_); } } void ngramtable::savetxt(char *filename,int depth,int googleformat) { if (depth>maxlev) { cerr << "savetxt: wrong n-gram size\n"; exit(1); } depth=(depth>0?depth:maxlev); card=mentr[depth]; ngram ng(dict); if (googleformat) cerr << "savetxt in Google format: nGrAm " << depth << " " << card << " " << info << "\n"; else cerr << "savetxt: nGrAm " << depth << " " << card << " " << info << "\n"; mfstream out(filename,ios::out ); if (!googleformat) out << "nGrAm " << depth << " " << card << " " << info << "\n"; if (!googleformat) dict->save(out); scan(ng,INIT,depth); while(scan(ng,CONT,depth)) out << ng <<"\n"; cerr << "\n"; out.close(); } void ngramtable::loadtxt(char *filename,int googletable) { ngram ng(dict);; cerr << "loadtxt:" << (googletable?"google format":"std table"); mfstream inp(filename,ios::in); int i,c=0; if (googletable) { dict->incflag(1); } else { char header[100]; inp.getline(header,100); cerr << header ; dict->load(inp); } while (!inp.eof()) { for (i=0; i> ng; inp >> ng.freq; if (ng.size==0) continue; //update dictionary frequency when loading from if (googletable) dict->incfreq(*ng.wordp(1),ng.freq); // if filtering dictionary exists // and if the first word of the ngram does not belong to it // do not insert the ngram if (filterdict) { int code=filterdict->encode(dict->decode(*ng.wordp(maxlev))); if (code!=filterdict->oovcode()) put(ng); } else put(ng); ng.size=0; if (!(++c % 1000000)) cerr << "."; } if (googletable) { dict->incflag(0); } cerr << "\n"; inp.close(); } void ngramtable::savebin(mfstream& out,node nd,NODETYPE ndt,int lev,int mlev) { out.write(nd+WORD_OFFS,CODESIZE); //write frequency int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS; int frnum=1; if (tbtype()==LEAFPROB && (ndt & LNODE)) frnum=L_FREQ_NUM; if ((ndt & LNODE) || I_FREQ_NUM) { //check if to write freq if (ndt & FREQ1) out.write(nd+offs,1 * frnum); else if (ndt & FREQ2) out.write(nd+offs,2 * frnum); else if (ndt & FREQ3) out.write(nd+offs,3 * 
frnum); else out.write(nd+offs,INTSIZE * frnum); } if ((lev maxlev) { cerr << "savebin: wrong n-gram size\n"; exit(1); } depth=(depth>0?depth:maxlev); card=mentr[depth]; cerr << "savebin NgRaM " << depth << " " << card; mfstream out(filename,ios::out ); if (dict->oovcode()!=-1) //there are OOV words out << "NgRaM_ " << depth << " " << card << " " << info << "\n"; else out << "NgRaM " << depth << " " << card << " " << info << "\n"; dict->save(out); out.writex((char *)&depth,INTSIZE); out.write((char *)&treeflags,CHARSIZE); savebin(out,tree,treeflags,0,depth); out.close(); cerr << "\n"; } void ngramtable::loadbin(mfstream& inp,node nd,NODETYPE ndt,int lev) { static int c=0; // read code inp.read(nd+WORD_OFFS,CODESIZE); // read frequency int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS; int frnum=1; if (tbtype()==LEAFPROB && (ndt & LNODE)) frnum=L_FREQ_NUM; if ((ndt & LNODE) || I_FREQ_NUM) { //check if to read freq if (ndt & FREQ1) inp.read(nd+offs,1 * frnum); else if (ndt & FREQ2) inp.read(nd+offs,2 * frnum); else if (ndt & FREQ3) inp.read(nd+offs,3 * frnum); else inp.read(nd+offs,4 * frnum); } if (ndt & INODE) { //read flags inp.read(nd+FLAGS_OFFS,CHARSIZE); unsigned char fl=mtflags(nd); //read #of multiple entries inp.read(nd+MSUCC_OFFS,CODESIZE); int m=msucc(nd); if (m>0) { //read multiple entries int msz=mtablesz(nd); table mtb=mtable(nd); //table entries increase grow(&mtb,INODE,lev+1,m,msz); for (int i=0; iload(inp); inp.readx((char *)&maxlev,INTSIZE); inp.read((char *)&treeflags,CHARSIZE); loadbin(inp,tree,treeflags,0); inp.close(); cerr << "\n"; } void ngramtable::generate(char *filename, dictionary* extdict) { mfstream inp(filename,ios::in); int i,c=0; if (!inp) { cerr << "cannot open " << filename << "\n"; exit(1); } cerr << "load:"; ngram ng(extdict==NULL?dict:extdict); //use possible prescribed dictionary if (extdict) dict->genoovcode(); ngram ng2(dict); dict->incflag(1); cerr << "prepare initial n-grams to make table consistent\n"; for (i=1; iBoS()); 
ng.freq=1; }; while (inp >> ng) { if (ng.size>maxlev) ng.size=maxlev; //speeds up ng2.trans(ng); //reencode with new dictionary check_dictsize_bound(); if (ng2.size) dict->incfreq(*ng2.wordp(1),1); // if filtering dictionary exists // and if the first word of the ngram does not belong to it // do not insert the ngram if (filterdict) { int code=filterdict->encode(dict->decode(*ng2.wordp(maxlev))); if (code!=filterdict->oovcode()) put(ng2); } else put(ng2); if (!(++c % 1000000)) cerr << "."; } cerr << "adding some more n-grams to make table consistent\n"; for (i=1; i<=maxlev; i++) { ng2.pushw(dict->BoS()); ng2.freq=1; // if filtering dictionary exists // and if the first word of the ngram does not belong to it // do not insert the ngram if (filterdict) { int code=filterdict->encode(dict->decode(*ng2.wordp(maxlev))); if (code!=filterdict->oovcode()) put(ng2); } else put(ng2); }; dict->incflag(0); inp.close(); strcpy(info,"ngram"); cerr << "\n"; } void ngramtable::generate_hmask(char *filename,char* hmask,int inplen) { mfstream inp(filename,ios::in); int i,c=0; int selmask[MAX_NGRAM]; if (!inp) { cerr << "cannot open " << filename << "\n"; exit(1); } //parse hmask i=0; selmask[i++]=1; for (c=0; c< (int)strlen(hmask); c++) { cerr << hmask[c] << "\n"; if (hmask[c] == '1') selmask[i++]=c+2; } if (i!= maxlev) { cerr << "wrong mask: 1 bits=" << i << " maxlev=" << maxlev << "\n"; exit(1); } // for (i=0;i " << selmask[i] << "\n"; cerr << "load:"; ngram ng(dict); ngram ng2(dict); dict->incflag(1); while (inp >> ng) { if (inplen && ng.size= selmask[maxlev-1]) { for (i=0; iincfreq(*ng2.wordp(1),1); if (!(++c % 1000000)) cerr << "."; }; dict->incflag(0); inp.close(); sprintf(info,"hm%s\n",hmask); cerr << "\n"; } int cmpint(const void *a,const void *b) { return (*(int *)b)-(*(int *)a); } void ngramtable::generate_dstco(char *filename,int dstco) { mfstream inp(filename,ios::in); int c=0; if (!inp) { cerr << "cannot open " << filename << "\n"; exit(1); } cerr << "load distant 
co-occurrences:"; if (dstco>MAX_NGRAM) { cerr << "window size (" << dstco << ") exceeds MAXNGRAM\n"; inp.close(); exit (1); } ngram ng(dict); ngram ng2(dict); ngram dng(dict); dict->incflag(1); while (inp >> ng) { if (ng.size) { ng2.trans(ng); //reencode with new dictionary if (ng2.size>dstco) ng2.size=dstco; //maximum distance check_dictsize_bound(); dict->incfreq(*ng2.wordp(1),1); if (maxlev == 1 ) cerr << "maxlev is wrong! (Possible values are 2 or 3)\n"; else if (maxlev == 2 ) { //maxlev ==2 dng.size=2; dng.freq=1; //cerr << "size=" << ng2.size << "\n"; for (int i=2; i<=ng2.size; i++) { if (*ng2.wordp(1)<*ng2.wordp(i)) { *dng.wordp(2)=*ng2.wordp(i); *dng.wordp(1)=*ng2.wordp(1); } else { *dng.wordp(1)=*ng2.wordp(i); *dng.wordp(2)=*ng2.wordp(1); } //cerr << dng << "\n"; put(dng); } if (!(++c % 1000000)) cerr << "."; } else { //maxlev ==3 dng.size=3; dng.freq=1; //cerr << "size=" << ng2.size << "\n"; int ar[3]; ar[0]=*ng2.wordp(1); for (int i=2; iincflag(0); inp.close(); sprintf(info,"co-occ%d\n",dstco); cerr << "\n"; } void ngramtable::augment(ngramtable* ngt) { if (ngt->maxlev != maxlev) { cerr << "ngt augmentation is not possible " << "due to table incompatibility!"; exit(1); } if (ngt->dict->oovcode()!=-1) cerr <<"oov: " << ngt->dict->freq(ngt->dict->oovcode()) << "\n"; cerr <<"size: " << ngt->dict->size() << "\n"; if (dict->oovcode()!=-1) cerr <<"oov: " << dict->freq(dict->oovcode()) << "\n"; cerr <<"size: " << dict->size() << "\n"; dict->incflag(1); cerr << "augmenting ngram table\n"; ngram ng1(ngt->dict); ngram ng2(dict); ngt->scan(ng1,INIT); int c=0; while (ngt->scan(ng1,CONT)) { ng2.trans(ng1); put(ng2); if ((++c % 1000000) ==0) cerr <<"."; } cerr << "\n"; for (int i=0; idict->size(); i++) dict->incfreq(dict->encode(ngt->dict->decode(i)), ngt->dict->freq(i)); dict->incflag(0); int oov=dict->getcode(dict->OOV()); if (oov>=0) { dict->oovcode(oov); } cerr << "oov: " << dict->freq(dict->oovcode()) << "\n"; cerr << "size: " << dict->size() << "\n"; } void 
ngramtable::show() { ngram ng(dict); scan(ng,INIT); cout << "Stampo contenuto della tabella\n"; while (scan(ng)) { cout << ng << "\n"; } } int ngramtable::mybsearch(char *ar, int n, int size, unsigned char *key, int *idx) { if (n==0) return 0; register int low = 0, high = n; *idx=0; register unsigned char *p=NULL; int result; #ifdef INTERP_SEARCH char* lp; char* hp; #endif /* return idx with the first position equal or greater than key */ /* Warning("start bsearch \n"); */ while (low < high) { #ifdef INTERP_SEARCH //use interpolation search only for intervals with at least 4096 entries if ((high-low)>=10000) { lp=(char *) (ar + (low * size)); if (codecmp((char *)key,lp)<0) { *idx=low; return 0; } hp=(char *) (ar + ((high-1) * size)); if (codecmp((char *)key,hp)>0) { *idx=high; return 0; } *idx= low + ((high-1)-low) * codediff((char *)key,lp)/codediff(hp,(char *)lp); } else #endif *idx = (low + high) / 2; //after redefining the interval there is no guarantee //that wlp <= wkey <= whigh p = (unsigned char *) (ar + (*idx * size)); result=codecmp((char *)key,(char *)p); if (result < 0) { high = *idx; } else if (result > 0) { low = ++(*idx); } else return 1; } *idx=low; return 0; } void *ngramtable::search(table *tb,NODETYPE ndt,int lev,int n,int sz,int *ngp, ACTION action,char **found) { char w[CODESIZE]; putmem(w,ngp[0],0,CODESIZE); int wint=ngp[0]; // index returned by mybsearch if (found) *found=NULL; int idx=0; switch(action) { case ENTER: if (!*tb || !mybsearch(*tb,n,sz,(unsigned char *)w,&idx)) { // let possibly grow the table grow(tb,ndt,lev,n,sz); // devo aggiungere un elemento n+1 //shift table by one memmove(*tb + (idx+1) * sz, *tb + idx * sz, (n-idx) * sz); memset(*tb + idx * sz , 0 , sz); word(*tb + idx * sz, wint); } else if (found) *found=*tb + ( idx * sz ); return *tb + ( idx * sz ); break; case FIND: if (!*tb || !mybsearch(*tb,n,sz,(unsigned char *)w,&idx)) return 0; else if (found) *found=*tb + (idx * sz); return *tb + (idx * sz); break; case DELETE: 
if (*tb && mybsearch(*tb,n,sz,(unsigned char *)w,&idx)) { //shift table down by one static char buffer[100]; memcpy(buffer,*tb + idx * sz , sz); if (idx <(n-1)) memmove(*tb + idx * sz, *tb + (idx + 1) * sz, (n-idx-1) * sz); //put the deleted item after the last item memcpy(*tb + (n-1) * sz , buffer , sz); if (found) *found=*tb + (n-1) * sz ; return *tb + (n-1) * sz ; } else return NULL; break; default: cerr << "this option is not implemented yet\n"; break; } return NULL; } int ngramtable::comptbsize(int n) { if (n>16384) return(n/16384)*16384+(n % 16384?16384:0); else if (n>8192) return 16384; else if (n>4096) return 8192; else if (n>2048) return 4096; else if (n>1024) return 2048; else if (n>512) return 1024; else if (n>256) return 512; else if (n>128) return 256; else if (n>64) return 128; else if (n>32) return 64; else if (n>16) return 32; else if (n>8) return 16; else if (n>4) return 8; else if (n>2) return 4; else if (n>1) return 2; else return 1; } char **ngramtable::grow(table *tb,NODETYPE ndt,int lev, int n,int sz,NODETYPE oldndt) { int inc; int num; //memory pools for inode/lnode tables if (oldndt==0) { if ((*tb==NULL) && n>0) { // n is the target number of entries //first allocation if (n>16384) inc=(n/16384)*16384+(n % 16384?16384:0); else if (n>8192) inc=16384; else if (n>4096) inc=8192; else if (n>2048) inc=4096; else if (n>1024) inc=2048; else if (n>512) inc=1024; else if (n>256) inc=512; else if (n>128) inc=256; else if (n>64) inc=128; else if (n>32) inc=64; else if (n>16) inc=32; else if (n>8) inc=16; else if (n>4) inc=8; else if (n>2) inc=4; else if (n>1) inc=2; else inc=1; n=0; //inc is the correct target size } else { // table will be extended on demand // I'm sure that one entry will be // added next // check multiples of 1024 if ((n>=16384) && !(n % 16384)) inc=16384; else { switch (n) { case 0: inc=1; break; case 1: case 2: case 4: case 8: case 16: case 32: case 64: case 128: case 256: case 512: case 1024: case 2048: case 4096: case 8192: 
inc=n; break; default: return tb; } } } table ntb=(char *)mem->reallocate(*tb,n * sz,(n + inc) * sz); memory[lev]+= (inc * sz); *tb=ntb; } else { //change frequency type of table //no entries will be added now int oldsz=0; // guess the current memory size !!!! num=comptbsize(n); if ((ndt & INODE) && I_FREQ_NUM) { if (oldndt & FREQ1) oldsz=inodesize(1); else if (oldndt & FREQ2) oldsz=inodesize(2); else if (oldndt & FREQ3) oldsz=inodesize(3); else if (oldndt & FREQ4) oldsz=inodesize(4); else { cerr << "funzione non prevista\n"; exit(1); } } else if (ndt & LNODE) { if (oldndt & FREQ1) oldsz=lnodesize(1); else if (oldndt & FREQ2) oldsz=lnodesize(2); else if (oldndt & FREQ3) oldsz=lnodesize(3); else if (oldndt & FREQ4) oldsz=lnodesize(4); else { cerr << "funzione non prevista\n"; exit(1); } } table ntb=(char *)mem->allocate(num * sz); memset((char *)ntb,0,num * sz); if (ndt & INODE) for (int i=0; ifree(*tb,num * oldsz); //num is the correct size memory[lev]+=num * (sz - oldsz); occupancy[lev]+=n * (sz - oldsz); *tb=ntb; } return tb; }; int ngramtable::put(ngram& ng) { return ngramtable::put(ng,tree,treeflags,0); } int ngramtable::put(ngram& ng,node nd,NODETYPE ndt,int lev) { char *found; node subnd; if (ng.size65535?FREQ4:FREQ1); else //all leafprob with L_FREQ_NUM >=1 //do NOT have INTERNAL freqs //will have freq size specified //by the resolution parameter //to avoid expansion freq_flag=L_FREQ_SIZE; if ((l+1)255)) mtflags(nd,(mtflags(nd) & ~FREQ1) | FREQ2); //update flags if ((I_FREQ_NUM || (mtflags(nd) & LNODE)) && (mtflags(nd) & FREQ2) && ((freq(subnd,mtflags(nd))+ng.freq)>65535)) mtflags(nd,(mtflags(nd) & ~FREQ2) | FREQ3); //update flags if ((I_FREQ_NUM || (mtflags(nd) & LNODE)) && (mtflags(nd) & FREQ3) && ((freq(subnd,mtflags(nd))+ng.freq)>16777215)) mtflags(nd,(mtflags(nd) & ~FREQ3) | FREQ4); //update flags if ((I_FREQ_NUM || (mtflags(nd) & LNODE)) && (mtflags(nd) & FREQ4) && ((freq(subnd,mtflags(nd))+ng.freq)>4294967295LL)) mtflags(nd,(mtflags(nd) & ~FREQ4) | 
FREQ6); //update flags if (mtflags(nd)!=oldndt) { // flags have changed, table has to be expanded //expand subtable cerr << "+"<= n); if ((I_FREQ_NUM==0) && (lev < maxlev)) { cerr << "get: for this type of table ngram cannot be smaller than table size\n"; exit(1); } if (ng.wordp(n)) { nd=tree; ndt=treeflags; for (int l=0; l(maxl-1)) return 0; if (ng.midx[lev]free(mtable(nd),msz*truem); } ngramtable::~ngramtable() { freetree(tree); delete [] tree; delete mem; delete [] memory; delete [] occupancy; delete [] mentr; delete dict; }; void ngramtable::stat(int level) { int totmem=0; int totwaste=0; float mega=1024 * 1024; cout.precision(2); cout << "ngramtable class statistics\n"; cout << "levels " << maxlev << "\n"; for (int l=0; l<=maxlev; l++) { cout << "lev " << l << " entries "<< mentr[l] << " allocated mem " << memory[l]/mega << "Mb " << " used mem " << occupancy[l]/mega << "Mb \n"; totmem+=memory[l]; totwaste+=(memory[l]-occupancy[l]); } cout << "total allocated mem " << totmem/mega << "Mb "; cout << "wasted mem " << totwaste/mega << "Mb\n\n\n"; if (level >1 ) dict->stat(); cout << "\n\n"; if (level >2) mem->stat(); } double ngramtable::prob(ngram ong) { if (ong.size==0) return 0.0; if (ong.size>maxlev) ong.size=maxlev; assert(tbtype()==LEAFPROB && ong.size<=maxlev); ngram ng(dict); ng.trans(ong); double bo; ng.size=maxlev; for (int s=ong.size+1; s<=maxlev; s++) *ng.wordp(s)=du_code; if (get(ng)) { if (ong.size>1 && resolution<10000000) return (double)pow(decay,(resolution-ng.freq)); else return (double)(ng.freq+1)/10000000.0; } else { // backoff-probability bo_state(1); //set backoff state to 1 *ng.wordp(1)=bo_code; if (get(ng)) bo=resolution<10000000 ?(double)pow(decay,(resolution-ng.freq)) :(double)(ng.freq+1)/10000000.0; else bo=1.0; ong.size--; return bo * prob(ong); } } bool ngramtable::check_dictsize_bound() { if (dict->size() >= code_range[CODESIZE]) { cerr << "dictionary size overflows code range " << code_range[CODESIZE] << "\n"; exit(1); } return true; 
} /* main(int argc, char** argv){ dictionary d(argv[1]); ngram ng(&d); cerr << "caricato dizionario da " << argv[1] << "\n"; ngramtable t(&d,argv[2],1); t.stat(1); t.savetxt(argv[3]); } */ irstlm-5.80.03/src/ngramtable.h000644 000766 000024 00000041716 12022422255 020420 0ustar00nicolabertoldistaff000000 000000 // $Id: ngramtable.h 34 2010-06-03 09:19:34Z nicolabertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit, compile LM Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #ifndef MF_NGRAMTABLE_H #define MF_NGRAMTABLE_H //Backoff symbol #ifndef BACKOFF_ #define BACKOFF_ "_backoff_" #endif //Dummy symbol #ifndef DUMMY_ #define DUMMY_ "_dummy_" #endif // internal data structure #ifdef MYCODESIZE #define DEFCODESIZE MYCODESIZE #else #define DEFCODESIZE (int)2 #endif #define SHORTSIZE (int)2 #define PTRSIZE (int)sizeof(char *) #define INTSIZE (int)4 #define CHARSIZE (int)1 //Node flags #define FREQ1 (unsigned char) 1 #define FREQ2 (unsigned char) 2 #define FREQ4 (unsigned char) 4 #define INODE (unsigned char) 8 #define LNODE (unsigned char) 16 #define SNODE (unsigned char) 32 #define FREQ6 (unsigned char) 64 #define FREQ3 (unsigned char) 128 typedef char* node; //inodes, lnodes, snodes typedef char* table; //inode table, lnode table, singleton table typedef unsigned char NODETYPE; typedef enum {FIND, //!< search: find an entry ENTER, //!< search: enter an entry DELETE, //!< search: find and remove entry INIT, //!< scan: start scan CONT //!< scan: continue scan } ACTION; typedef enum {COUNT, //!< table: only counters LEAFPROB, //!< table: only probs on leafs FLEAFPROB, //!< table: only probs on leafs and FROZEN LEAFPROB2, //!< table: only probs on leafs LEAFPROB3, //!< table: only probs on leafs LEAFPROB4, //!< table: only probs on leafs LEAFCODE, //!< table: only codes on leafs SIMPLE_I, //!< table: simple interpolated LM SIMPLE_B, //!< table: simple backoff LM SHIFTBETA_I, //!< table: interpolated shiftbeta SHIFTBETA_B, //!< table: backoff shiftbeta MSHIFTBETA_I,//!< table: interp modified shiftbeta MSHIFTBETA_B,//!< table: backoff modified shiftbeta FULL, //!< table: full fledged table } TABLETYPE; class tabletype { TABLETYPE ttype; public: int 
CODESIZE; //sizeof word codes long long code_range[7]; //max code for each size //Offsets of internal node fields int WORD_OFFS; //word code position int MSUCC_OFFS; //number of successors int MTAB_OFFS; //pointer to successors int FLAGS_OFFS; //flag table int SUCC1_OFFS; //number of successors with freq=1 int SUCC2_OFFS; //number of successors with freq=2 int BOFF_OFFS; //back-off probability int I_FREQ_OFFS; //frequency offset int I_FREQ_NUM; //number of internal frequencies int L_FREQ_NUM; //number of leaf frequencies int L_FREQ_SIZE; //minimum size for leaf frequencies //Offsets of leaf node fields int L_FREQ_OFFS; //frequency offset TABLETYPE tbtype() { return ttype; } tabletype(TABLETYPE tt,int codesize=DEFCODESIZE) { if (codesize<=4 && codesize>0) CODESIZE=codesize; else { cerr << "ngramtable wrong codesize\n"; exit(1); } code_range[1]=255; code_range[2]=65535; code_range[3]=16777214; code_range[4]=2147483640; code_range[6]=140737488360000LL; //stay below true limit // code_range[6]=281474977000000LL; //stay below true limit //information which is useful to initialize //LEAFPROB tables L_FREQ_SIZE=FREQ1; WORD_OFFS =0; MSUCC_OFFS =CODESIZE; MTAB_OFFS =MSUCC_OFFS+CODESIZE; FLAGS_OFFS =MTAB_OFFS+PTRSIZE; switch (tt) { case COUNT: SUCC1_OFFS =0; SUCC2_OFFS =0; BOFF_OFFS =0; I_FREQ_OFFS=FLAGS_OFFS+CHARSIZE; I_FREQ_NUM=1; L_FREQ_NUM=1; ttype=tt; break; case FULL: case MSHIFTBETA_B: SUCC1_OFFS =FLAGS_OFFS+CHARSIZE; SUCC2_OFFS =SUCC1_OFFS+CODESIZE; BOFF_OFFS =SUCC2_OFFS+CODESIZE; I_FREQ_OFFS=BOFF_OFFS+INTSIZE; L_FREQ_OFFS=CODESIZE; I_FREQ_NUM=2; L_FREQ_NUM=1; ttype=tt; break; case MSHIFTBETA_I: SUCC1_OFFS =FLAGS_OFFS+CHARSIZE; SUCC2_OFFS =SUCC1_OFFS+CODESIZE; BOFF_OFFS =0; I_FREQ_OFFS=SUCC2_OFFS+CODESIZE; L_FREQ_OFFS=CODESIZE; I_FREQ_NUM=2; L_FREQ_NUM=1; ttype=tt; break; case SIMPLE_I: SUCC1_OFFS = 0; SUCC2_OFFS = 0; BOFF_OFFS = 0; I_FREQ_OFFS= FLAGS_OFFS+CHARSIZE; L_FREQ_OFFS=CODESIZE; I_FREQ_NUM=1; L_FREQ_NUM=1; ttype=tt; break; case SIMPLE_B: SUCC1_OFFS = 0; 
SUCC2_OFFS = 0; BOFF_OFFS = FLAGS_OFFS+CHARSIZE; I_FREQ_OFFS = BOFF_OFFS+INTSIZE; L_FREQ_OFFS = CODESIZE; I_FREQ_NUM = 1; L_FREQ_NUM = 1; ttype=tt; break; case SHIFTBETA_I: SUCC1_OFFS = FLAGS_OFFS+CHARSIZE; SUCC2_OFFS = 0; BOFF_OFFS = 0; I_FREQ_OFFS= SUCC1_OFFS+CODESIZE; L_FREQ_OFFS=CODESIZE; I_FREQ_NUM=1; L_FREQ_NUM=1; ttype=tt; break; case SHIFTBETA_B: SUCC1_OFFS = FLAGS_OFFS+CHARSIZE; SUCC2_OFFS = 0; BOFF_OFFS = SUCC1_OFFS+CODESIZE; I_FREQ_OFFS = BOFF_OFFS+INTSIZE; L_FREQ_OFFS = CODESIZE; I_FREQ_NUM = 1; L_FREQ_NUM = 1; ttype=tt; break; case LEAFPROB: case FLEAFPROB: SUCC1_OFFS = 0; SUCC2_OFFS = 0; BOFF_OFFS = 0; I_FREQ_OFFS = FLAGS_OFFS+CHARSIZE; I_FREQ_NUM = 0; L_FREQ_NUM = 1; ttype=tt; break; case LEAFPROB2: SUCC1_OFFS =0; SUCC2_OFFS =0; BOFF_OFFS =0; I_FREQ_OFFS=FLAGS_OFFS+CHARSIZE; I_FREQ_NUM=0; L_FREQ_NUM=2; ttype=LEAFPROB; break; case LEAFPROB3: SUCC1_OFFS =0; SUCC2_OFFS =0; BOFF_OFFS =0; I_FREQ_OFFS=FLAGS_OFFS+CHARSIZE; I_FREQ_NUM=0; L_FREQ_NUM=3; ttype=LEAFPROB; break; case LEAFPROB4: SUCC1_OFFS =0; SUCC2_OFFS =0; BOFF_OFFS =0; I_FREQ_OFFS=FLAGS_OFFS+CHARSIZE; I_FREQ_NUM=0; L_FREQ_NUM=4; ttype=LEAFPROB; break; default: assert(tt==COUNT); } L_FREQ_OFFS=CODESIZE; } int inodesize(int s) { return I_FREQ_OFFS + I_FREQ_NUM * s; } int lnodesize(int s) { return L_FREQ_OFFS + L_FREQ_NUM * s; } }; class ngramtable:tabletype { node tree; // ngram table root int maxlev; // max storable n-gram NODETYPE treeflags; char info[100]; //information put in the header int resolution; //max resolution for probabilities double decay; //decay constant storage* mem; //memory storage class int* memory; // memory load per level int* occupancy; // memory occupied per level long long* mentr; // multiple entries per level long long card; //entries at maxlev int idx[MAX_NGRAM+1]; int oov_code,oov_size,du_code, bo_code; //used by prob; int backoff_state; //used by prob; public: int corrcounts; //corrected counters flag dictionary *dict; // dictionary // filtering dictionary: // if the 
first word of the ngram does not belong to filterdict // do not insert the ngram dictionary *filterdict; ngramtable(char* filename,int maxl,char* is, dictionary* extdict, char* filterdictfile, int googletable=0, int dstco=0,char* hmask=NULL,int inplen=0, TABLETYPE tt=FULL,int codesize=DEFCODESIZE); inline char* ngtype(char *str=NULL) { if (str!=NULL) strcpy(info,str); return info; } virtual ~ngramtable(); inline void freetree() { freetree(tree); }; void freetree(node nd); void resetngramtable() { //clean up all memory and restart from an empty table freetree(); //clean memory pool memset(tree,0,inodesize(6)); //reset tree //1-gram table initial flags if (maxlev>1) mtflags(tree,INODE | FREQ4); else if (maxlev==1) mtflags(tree,LNODE | FREQ4); word(tree,0); //dummy word msucc(tree,0); // number of different n-grams mtable(tree,NULL); // table of n-gram for (int i=1; i<=maxlev; i++) mentr[i]=memory[i]=occupancy[i]=0; } void stat(int level=4); inline long long totfreq(long long v=-1) { return (v==-1?freq(tree,INODE):freq(tree,INODE,v)); } inline long long btotfreq(long long v=-1) { return (v==-1?getfreq(tree,treeflags,1):setfreq(tree,treeflags,v,1)); } inline long long entries(int lev) { return mentr[lev]; } int maxlevel() { return maxlev; } // void savetxt(char *filename,int sz=0); void savetxt(char *filename,int sz=0,int googleformat=0); void loadtxt(char *filename,int googletable=0); void savebin(char *filename,int sz=0); void savebin(mfstream& out); void savebin(mfstream& out,node nd,NODETYPE ndt,int lev,int mlev); void loadbin(const char *filename); void loadbin(mfstream& inp); void loadbin(mfstream& inp,node nd,NODETYPE ndt,int lev); void loadbinold(char *filename); void loadbinold(mfstream& inp,node nd,NODETYPE ndt,int lev); void generate(char *filename,dictionary *extdict=NULL); void generate_dstco(char *filename,int dstco); void generate_hmask(char *filename,char* hmask,int inplen=0); void augment(ngramtable* ngt); int scan(ngram& ng,ACTION action=CONT,int 
maxlev=-1) { return scan(tree,INODE,0,ng,action,maxlev); } int succscan(ngram& h,ngram& ng,ACTION action,int lev) { //return scan(h.link,h.info,h.lev,ng,action,lev); return scan(h.link,h.info,lev-1,ng,action,lev); } double prob(ngram ng); int scan(node nd,NODETYPE ndt,int lev,ngram& ng,ACTION action=CONT,int maxl=-1); void show(); void *search(table *tb,NODETYPE ndt,int lev,int n,int sz,int *w, ACTION action,char **found=(char **)NULL); int mybsearch(char *ar, int n, int size, unsigned char *key, int *idx); int put(ngram& ng); int put(ngram& ng,node nd,NODETYPE ndt,int lev); inline int get(ngram& ng) { return get(ng,maxlev,maxlev); } virtual int get(ngram& ng,int n,int lev); int comptbsize(int n); table *grow(table *tb,NODETYPE ndt,int lev,int n,int sz,NODETYPE oldndt=0); bool check_dictsize_bound(); inline int putmem(char* ptr,int value,int offs,int size) { assert(ptr!=NULL); for (int i=0; i> (8 * i)) & 0xff; return value; } inline int getmem(char* ptr,int* value,int offs,int size) { assert(ptr!=NULL); *value=ptr[offs] & 0xff; for (int i=1; i> (8 * i)) & 0xffLL; return value; } inline long getmem(char* ptr,long long* value,int offs,int size) { assert(ptr!=NULL); *value=ptr[offs] & 0xff; for (int i=1; i=0; i--) { result=(unsigned char)a[i]-(unsigned char)b[i]; if(result) return result; } return 0; }; int codediff(node a,node b) { return word(a)-word(b); }; int update(ngram ng) { if (!get(ng,ng.size,ng.size)) { cerr << "cannot find " << ng << "\n"; exit (1); } freq(ng.link,ng.pinfo,ng.freq); return 1; } long long freq(node nd,NODETYPE ndt,long long value) { int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS; if (ndt & FREQ1) putmem(nd,value,offs,1); else if (ndt & FREQ2) putmem(nd,value,offs,2); else if (ndt & FREQ3) putmem(nd,value,offs,3); else if (ndt & FREQ4) putmem(nd,value,offs,4); else putmem(nd,value,offs,6); return value; } long long freq(node nd,NODETYPE ndt) { int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS; long long value; if (ndt & FREQ1) 
getmem(nd,&value,offs,1); else if (ndt & FREQ2) getmem(nd,&value,offs,2); else if (ndt & FREQ3) getmem(nd,&value,offs,3); else if (ndt & FREQ4) getmem(nd,&value,offs,4); else getmem(nd,&value,offs,6); return value; } long long setfreq(node nd,NODETYPE ndt,long long value,int index=0) { int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS; if (ndt & FREQ1) putmem(nd,value,offs+index * 1,1); else if (ndt & FREQ2) putmem(nd,value,offs+index * 2,2); else if (ndt & FREQ3) putmem(nd,value,offs+index * 3,3); else if (ndt & FREQ4) putmem(nd,value,offs+index * 4,4); else putmem(nd,value,offs+index * 6,6); return value; } long long getfreq(node nd,NODETYPE ndt,int index=0) { int offs=(ndt & LNODE)?L_FREQ_OFFS:I_FREQ_OFFS; long long value; if (ndt & FREQ1) getmem(nd,&value,offs+ index * 1,1); else if (ndt & FREQ2) getmem(nd,&value,offs+ index * 2,2); else if (ndt & FREQ3) getmem(nd,&value,offs+ index * 3,3); else if (ndt & FREQ4) getmem(nd,&value,offs+ index * 4,4); else getmem(nd,&value,offs+ index * 6,6); return value; } double boff(node nd) { int value=0; getmem(nd,&value,BOFF_OFFS,INTSIZE); return double (value/(double)1000000000.0); } double myround(double x) { long int i=(long int)(x); return (x-i)>0.500?i+1.0:(double)i; } int boff(node nd,double value) { int v=(int)myround(value * 1000000000.0); putmem(nd,v,BOFF_OFFS,INTSIZE); return 1; } int succ2(node nd,int value) { putmem(nd,value,SUCC2_OFFS,CODESIZE); return value; } int succ2(node nd) { int value=0; getmem(nd,&value,SUCC2_OFFS,CODESIZE); return value; } int succ1(node nd,int value) { putmem(nd,value,SUCC1_OFFS,CODESIZE); return value; } int succ1(node nd) { int value=0; getmem(nd,&value,SUCC1_OFFS,CODESIZE); return value; } int msucc(node nd,int value) { putmem(nd,value,MSUCC_OFFS,CODESIZE); return value; } int msucc(node nd) { int value; getmem(nd,&value,MSUCC_OFFS,CODESIZE); return value; } table mtable(node nd) { char v[PTRSIZE];; for (int i=0; i #include #include "cmd.h" #include "mfstream.h" #include "mempool.h" 
#include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "ngramtable.h" void print_help(int TypeFlag=0){ std::cerr << std::endl << "ngt - collects n-grams" << std::endl; std::cerr << std::endl << "USAGE:" << std::endl; std::cerr << " ngt -i= [options]" << std::endl; std::cerr << std::endl << "OPTIONS:" << std::endl; FullPrintParams(TypeFlag, 0, 1, stderr); } void usage(const char *msg = 0) { if (msg){ std::cerr << msg << std::endl; } else{ print_help(); } exit(1); } int main(int argc, char **argv) { char *inp=NULL; char *out=NULL; char *dic=NULL; // dictionary filename char *subdic=NULL; // subdictionary filename char *filterdict=NULL; // subdictionary filename char *filtertable=NULL; // ngramtable filename char *iknfile=NULL; // filename to save IKN statistics double filter_hit_rate=1.0; // minimum hit rate of filter char *aug=NULL; // augmentation data char *hmask=NULL; // historymask bool inputgoogleformat=false; //reads ngrams in Google format bool outputgoogleformat=false; //print ngrams in Google format int ngsz=0; // n-gram default size int dstco=0; // compute distance co-occurrences bool bin=false; bool ss=false; //generate single table bool LMflag=false; //work with LM table int inplen=0; //input length for mask generation bool tlm=false; //test lm table char* ftlm=NULL; //file to test LM table bool memuse=false; bool help=false; DeclareParams((char*) "Dictionary", CMDSTRINGTYPE|CMDMSG, &dic, "dictionary filename", "d", CMDSTRINGTYPE|CMDMSG, &dic, "dictionary filename", "NgramSize", CMDSUBRANGETYPE|CMDMSG, &ngsz, 1, MAX_NGRAM, "n-gram default size; default is 0", "n", CMDSUBRANGETYPE|CMDMSG, &ngsz, 1, MAX_NGRAM, "n-gram default size; default is 0", "InputFile", CMDSTRINGTYPE|CMDMSG, &inp, "input file", "i", CMDSTRINGTYPE|CMDMSG, &inp, "input file", "OutputFile", CMDSTRINGTYPE|CMDMSG, &out, "output file", "o", CMDSTRINGTYPE|CMDMSG, &out, "output file", "InputGoogleFormat", CMDBOOLTYPE|CMDMSG, &inputgoogleformat, "the input file contains data 
in the n-gram Google format; default is false", "gooinp", CMDBOOLTYPE|CMDMSG, &inputgoogleformat, "the input file contains data in the n-gram Google format; default is false", "OutputGoogleFormat", CMDBOOLTYPE|CMDMSG, &outputgoogleformat, "the output file contains data in the n-gram Google format; default is false", "gooout", CMDBOOLTYPE|CMDMSG, &outputgoogleformat, "the output file contains data in the n-gram Google format; default is false", "SaveBinaryTable", CMDBOOLTYPE|CMDMSG, &bin, "saves into binary format; default is false", "b", CMDBOOLTYPE|CMDMSG, &bin, "saves into binary format; default is false", "LmTable", CMDBOOLTYPE|CMDMSG, &LMflag, "works with LM table; default is false", "lm", CMDBOOLTYPE|CMDMSG, &LMflag, "works with LM table; default is false", "DistCo", CMDINTTYPE|CMDMSG, &dstco, "computes distance co-occurrences at the specified distance; default is 0", "dc", CMDINTTYPE|CMDMSG, &dstco, "computes distance co-occurrences at the specified distance; default is 0", "AugmentFile", CMDSTRINGTYPE|CMDMSG, &aug, "augmentation data", "aug", CMDSTRINGTYPE|CMDMSG, &aug, "augmentation data", "SaveSingle", CMDBOOLTYPE|CMDMSG, &ss, "generates single table; default is false", "ss", CMDBOOLTYPE|CMDMSG, &ss, "generates single table; default is false", "SubDict", CMDSTRINGTYPE|CMDMSG, &subdic, "subdictionary", "sd", CMDSTRINGTYPE|CMDMSG, &subdic, "subdictionary", "FilterDict", CMDSTRINGTYPE|CMDMSG, &filterdict, "filter dictionary", "fd", CMDSTRINGTYPE|CMDMSG, &filterdict, "filter dictionary", "ConvDict", CMDSTRINGTYPE|CMDMSG, &subdic, "subdictionary", "cd", CMDSTRINGTYPE|CMDMSG, &subdic, "subdictionary", "FilterTable", CMDSTRINGTYPE|CMDMSG, &filtertable, "ngramtable filename", "ftr", CMDDOUBLETYPE|CMDMSG, &filter_hit_rate, "ngramtable filename", "FilterTableRate", CMDDOUBLETYPE|CMDMSG, &filter_hit_rate, "minimum hit rate of filter; default is 1.0", "ft", CMDSTRINGTYPE|CMDMSG, &filtertable, "minimum hit rate of filter; default is 1.0", 
"HistoMask",CMDSTRINGTYPE|CMDMSG, &hmask, "history mask", "hm",CMDSTRINGTYPE|CMDMSG, &hmask, "history mask", "InpLen",CMDINTTYPE|CMDMSG, &inplen, "input length for mask generation; default is 0", "il",CMDINTTYPE|CMDMSG, &inplen, "input length for mask generation; default is 0", "tlm", CMDBOOLTYPE|CMDMSG, &tlm, "test LM table; default is false", "ftlm", CMDSTRINGTYPE|CMDMSG, &ftlm, "file to test LM table", "memuse", CMDBOOLTYPE|CMDMSG, &memuse, "default is false", "iknstat", CMDSTRINGTYPE|CMDMSG, &iknfile, "filename to save IKN statistics", "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help", "h", CMDBOOLTYPE|CMDMSG, &help, "print this help", (char *)NULL ); if (argc == 1){ usage(); } GetParams(&argc, &argv, (char*) NULL); if (help){ usage(); } if (inp==NULL) { usage("Warning: no input file specified\n"); }; if (out==NULL) { cerr << "Warning: no output file specified!\n"; } TABLETYPE table_type=COUNT; if (LMflag) { cerr << "Working with LM table\n"; table_type=LEAFPROB; } // check word order of subdictionary if (filtertable) { { ngramtable ngt(filtertable,1,NULL,NULL,NULL,0,0,NULL,0,table_type); mfstream inpstream(inp,ios::in); //google input table mfstream outstream(out,ios::out); //google output table cerr << "Filtering table " << inp << " assumed to be in Google Format with size " << ngsz << "\n"; cerr << "with table " << filtertable << " of size " << ngt.maxlevel() << "\n"; cerr << "with hit rate " << filter_hit_rate << "\n"; //order of filter table must be smaller than that of input n-grams assert(ngt.maxlevel() <= ngsz); //read input googletable of ngrams of size ngsz //output entries made of at least X% n-grams contained in filtertable // words are not accepted ngram ng(ngt.dict), ng2(ng.dict); double hits=0; double maxhits=(double)(ngsz-ngt.maxlevel()+1); long c=0; while(inpstream >> ng) { if (ng.size>= ngt.maxlevel()) { //need to make a copy ng2=ng; ng2.size=ngt.maxlevel(); //cerr << "check if " << ng2 << " is contained: "; hits+=(ngt.get(ng2)?1:0); } if 
(ng.size==ngsz) { if (!(++c % 1000000)) cerr << "."; //cerr << ng << " -> " << is_included << "\n"; //you reached the last word before freq inpstream >> ng.freq; //consistency check of n-gram if (((hits/maxhits)>=filter_hit_rate) && (!ng.containsWord(ngt.dict->OOV(),ng.size)) ) outstream << ng << "\n"; hits=0; ng.size=0; } } outstream.flush(); inpstream.flush(); } exit(1); } //ngramtable* ngt=new ngramtable(inp,ngsz,NULL,dic,dstco,hmask,inplen,table_type); ngramtable* ngt=new ngramtable(inp,ngsz,NULL,NULL,filterdict,inputgoogleformat,dstco,hmask,inplen,table_type); if (aug) { ngt->dict->incflag(1); // ngramtable ngt2(aug,ngsz,isym,NULL,0,NULL,0,table_type); ngramtable ngt2(aug,ngsz,NULL,NULL,NULL,0,0,NULL,0,table_type); ngt->augment(&ngt2); ngt->dict->incflag(0); } if (subdic) { int c=0; ngramtable *ngt2=new ngramtable(NULL,ngsz,NULL,NULL,NULL,0,0,NULL,0,table_type); // enforce the subdict to follow the same word order of the main dictionary dictionary tmpdict(subdic); ngt2->dict->incflag(1); for (int i=0; idict->size(); i++) { if (tmpdict.encode(ngt->dict->decode(i)) != tmpdict.oovcode()) { ngt2->dict->encode(ngt->dict->decode(i)); } } ngt2->dict->incflag(0); ngt2->dict->cleanfreq(); //possibly include standard symbols if (ngt->dict->encode(ngt->dict->EoS())!=ngt->dict->oovcode()) { ngt2->dict->incflag(1); ngt2->dict->encode(ngt2->dict->EoS()); ngt2->dict->incflag(0); } if (ngt->dict->encode(ngt->dict->BoS())!=ngt->dict->oovcode()) { ngt2->dict->incflag(1); ngt2->dict->encode(ngt2->dict->BoS()); ngt2->dict->incflag(0); } ngram ng(ngt->dict); ngram ng2(ngt2->dict); ngt->scan(ng,INIT,ngsz); while (ngt->scan(ng,CONT,ngsz)) { ng2.trans(ng); ngt2->put(ng2); if (!(++c % 1000000)) cerr << "."; } //makes ngt2 aware of oov code int oov=ngt2->dict->getcode(ngt2->dict->OOV()); if(oov>=0) ngt2->dict->oovcode(oov); for (int i=0; idict->size(); i++) { ngt2->dict->incfreq(ngt2->dict->encode(ngt->dict->decode(i)), ngt->dict->freq(i)); } cerr <<" oov: " << 
ngt2->dict->freq(ngt2->dict->oovcode()) << "\n"; delete ngt; ngt=ngt2; } if (ngsz < ngt->maxlevel() && hmask) { cerr << "start projection of ngramtable " << inp << " according to hmask\n"; int i,c; int selmask[MAX_NGRAM]; //parse hmask i=0; selmask[i++]=1; for (c=0; c< (int)strlen(hmask); c++) { cerr << hmask[c] << "\n"; if (hmask[c] == '1') selmask[i++]=c+2; } if (i!= ngsz) { cerr << "wrong mask: 1 bits=" << i << " maxlev=" << ngsz << "\n"; exit(1); } if (selmask[ngsz-1] > ngt->maxlevel()) { cerr << "wrong mask: farest bits=" << selmask[ngsz-1] << " maxlev=" << ngt->maxlevel() << "\n"; exit(1); } //ngramtable* ngt2=new ngramtable(NULL,ngsz,NULL,NULL,0,NULL,0,table_type); ngramtable* ngt2=new ngramtable(NULL,ngsz,NULL,NULL,NULL,0,0,NULL,0,table_type); ngt2->dict->incflag(1); ngram ng(ngt->dict); ngram png(ngt->dict,ngsz); ngram ng2(ngt2->dict,ngsz); ngt->scan(ng,INIT,ngt->maxlevel()); while (ngt->scan(ng,CONT,ngt->maxlevel())) { //projection for (i=0; iput(ng2); if (!(++c % 1000000)) cerr << "."; } char info[100]; sprintf(info,"hm%s",hmask); ngt2->ngtype(info); //makes ngt2 aware of oov code int oov=ngt2->dict->getcode(ngt2->dict->OOV()); if(oov>=0) ngt2->dict->oovcode(oov); for (int i=0; idict->size(); i++) { ngt2->dict->incfreq(ngt2->dict->encode(ngt->dict->decode(i)), ngt->dict->freq(i)); } cerr <<" oov: " << ngt2->dict->freq(ngt2->dict->oovcode()) << "\n"; delete ngt; ngt=ngt2; } if (tlm && table_type==LEAFPROB) { ngram ng(ngt->dict); cout.setf(ios::scientific); cout << "> "; while(cin >> ng) { ngt->bo_state(0); if (ng.size>=ngsz) { cout << ng << " p= " << log(ngt->prob(ng)); cout << " bo= " << ngt->bo_state() << "\n"; } else cout << ng << " p= NULL\n"; cout << "> "; } } if (ftlm && table_type==LEAFPROB) { ngram ng(ngt->dict); cout.setf(ios::fixed); cout.precision(2); mfstream inptxt(ftlm,ios::in); int Nbo=0,Nw=0,Noov=0; float logPr=0,PP=0,PPwp=0; int bos=ng.dict->encode(ng.dict->BoS()); while(inptxt >> ng) { // reset ngram at begin of sentence if 
(*ng.wordp(1)==bos) { ng.size=1; continue; } ngt->bo_state(0); if (ng.size>=1) { logPr+=log(ngt->prob(ng)); if (*ng.wordp(1) == ngt->dict->oovcode()) Noov++; Nw++; if (ngt->bo_state()) Nbo++; } } PP=exp(-logPr/Nw); PPwp= PP * exp(Noov * log(10000000.0-ngt->dict->size())/Nw); cout << "%%% NGT TEST OF SMT LM\n"; cout << "%% LM=" << inp << " SIZE="<< ngt->maxlevel(); cout << " TestFile="<< ftlm << "\n"; cout << "%% OOV PENALTY = 1/" << 10000000.0-ngt->dict->size() << "\n"; cout << "%% Nw=" << Nw << " PP=" << PP << " PPwp=" << PPwp << " Nbo=" << Nbo << " Noov=" << Noov << " OOV=" << (float)Noov/Nw * 100.0 << "%\n"; } if (memuse) ngt->stat(0); if (iknfile) { //compute and save statistics of Improved Kneser Ney smoothing ngram ng(ngt->dict); int n1,n2,n3,n4; int unover3=0; mfstream iknstat(iknfile,ios::out); //output of ikn statistics for (int l=1; l<=ngt->maxlevel(); l++) { cerr << "level " << l << "\n"; iknstat << "level: " << l << " "; cerr << "computing statistics\n"; n1=0; n2=0; n3=0,n4=0; ngt->scan(ng,INIT,l); while(ngt->scan(ng,CONT,l)) { //skip ngrams containing _OOV if (l>1 && ng.containsWord(ngt->dict->OOV(),l)) { //cerr << "skp ngram" << ng << "\n"; continue; } //skip n-grams containing in context if (l>1 && ng.containsWord(ngt->dict->EoS(),l-1)) { //cerr << "skp ngram" << ng << "\n"; continue; } //skip 1-grams containing if (l==1 && ng.containsWord(ngt->dict->BoS(),l)) { //cerr << "skp ngram" << ng << "\n"; continue; } if (ng.freq==1) n1++; else if (ng.freq==2) n2++; else if (ng.freq==3) n3++; else if (ng.freq==4) n4++; if (l==1 && ng.freq >=3) unover3++; } cerr << " n1: " << n1 << " n2: " << n2 << " n3: " << n3 << " n4: " << n4 << "\n"; iknstat << " n1: " << n1 << " n2: " << n2 << " n3: " << n3 << " n4: " << n4 << " unover3: " << unover3 << "\n"; } } if (out) bin?ngt->savebin(out,ngsz): ngt->savetxt(out,ngsz,outputgoogleformat); } irstlm-5.80.03/src/normcache.cpp000644 000766 000024 00000005760 12013405172 020574 0ustar00nicolabertoldistaff000000 000000 
/****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ using namespace std; #include "mfstream.h" #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "ngramtable.h" #include "normcache.h" // Normalization factors cache normcache::normcache(dictionary* d) { dict=d; //trigram and bigram normalization cache //ngt=new ngramtable(NULL,2,NULL,NULL,0,0,NULL,0,LEAFPROB); ngt=new ngramtable(NULL,2,NULL,NULL,NULL,0,0,NULL,0,LEAFPROB); maxcache[0]=d->size();//unigram cache maxcache[1]=d->size();//bigram cache cache[0]=new double[maxcache[0]]; cache[1]=new double[maxcache[1]]; for (int i=0; isize(); i++) cache[0][i]=cache[1][i]=0.0; cachesize[0]=cachesize[1]=0; hit=miss=0; } void normcache::expand(int n) { int step=100000; cerr << "Expanding cache ...\n"; double *newcache=new double[maxcache[n]+step]; memcpy(newcache,cache[n],sizeof(double)*maxcache[n]); delete [] cache[n]; cache[n]=newcache; for (int i=0; iget(ng,size,size-1)) { hit++; // cerr << "hit " << ng << "\n"; return value=cache[1][ng.freq]; } else { miss++; return value=0; } } return 0; } 
double normcache::put(ngram ng,int size,double value) { if (size==2) { if (*ng.wordp(2)>= maxcache[0]) expand(0); cache[0][*ng.wordp(2)]=value; cachesize[0]++; return value; } else if (size==3) { if (ngt->get(ng,size,size-1)) return cache[1][ng.freq]=value; else { ngram histo(dict,2); *histo.wordp(1)=*ng.wordp(2); *histo.wordp(2)=*ng.wordp(3); histo.freq=cachesize[1]++; if (cachesize[1]==maxcache[1]) expand(1); ngt->put(histo); return cache[1][histo.freq]=value; } } return 0; } void normcache::stat() { cerr << "misses " << miss << ", hits " << hit << "\n"; } irstlm-5.80.03/src/normcache.h000644 000766 000024 00000002727 12114671302 020243 0ustar00nicolabertoldistaff000000 000000 /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #ifndef MF_NORMCACHE_H #define MF_NORMCACHE_H #include "dictionary.h" #include "ngramtable.h" // Normalization factors cache class normcache { dictionary* dict; ngramtable *ngt; double* cache[2]; int cachesize[2]; int maxcache[2]; int hit; int miss; public: normcache(dictionary* d); ~normcache() { delete [] cache[0]; delete [] cache[1]; delete ngt; } void expand(int i); double get(ngram ng,int size,double& value); double put(ngram ng,int size,double value); void stat(); }; #endif irstlm-5.80.03/src/plsa.cpp000755 000766 000024 00000020425 12032511222 017565 0ustar00nicolabertoldistaff000000 000000 /****************************************************************************** IrstLM: IRST Language Model Toolkit, compile LM Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ using namespace std; #include #include "cmd.h" #include "mfstream.h" #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "ngramtable.h" #include "doc.h" #include "cplsa.h" void print_help(int TypeFlag=0){ std::cerr << std::endl << "plsa - performs probabilistic latent semantic analysis LM inference" << std::endl; std::cerr << std::endl << "USAGE:" << std::endl; std::cerr << " plsa -c= -d= -m= -t= -it= [options]" << std::endl; std::cerr << " plsa -c= -d= -b= [options]" << std::endl; std::cerr << " plsa -d= -m= -t= -inf= -f= -it= [options]" << std::endl; std::cerr << std::endl << "DESCRIPTION:" << std::endl; std::cerr << " plsa is a tool for probabilistic latent semantic analysis" << std::endl; std::cerr << " LM inference. It can be used to train a PLSA model, to binarize" << std::endl; std::cerr << " a textual document collection to speed-up training or to" << std::endl; std::cerr << " infer a full n-gram distribution from a model and a small text." << std::endl; std::cerr << std::endl << "OPTIONS:" << std::endl; FullPrintParams(TypeFlag, 0, 1, stderr); std::cerr << std::endl << "EXAMPLES:" << std::endl; std::cerr <<" (1) plsa -c= -d= -m= -t= -it=" << std::endl; std::cerr <<" Train a PLSA model, , from the text collection" << std::endl; std::cerr <<" using the dictionary . The" << std::endl; std::cerr <<" number of EM iterations is specified by and the" << std::endl; std::cerr <<" number of topics is specified by ." << std::endl; std::cerr <<" The content must begin with the number of" << std::endl; std::cerr <<" documents and documents should be separated with the tag." << std::endl; std::cerr <<" The begin document tag is not considered." 
<< std::endl; std::cerr <<" Example of content:" << std::endl; std::cerr <<" 3" << std::endl; std::cerr <<" hello world ! " << std::endl; std::cerr <<" good morning good afternoon " << std::endl; std::cerr <<" welcome aboard " << std::endl; std::cerr <<" (2) plsa -c= -d= -b=" << std::endl; std::cerr <<" Binarize a textual document collection to speed-up training (1)" << std::endl; std::cerr <<" (3) plsa -d= -m= -t= -inf= -f= -it=" << std::endl; std::cerr <<" Infer a full 1-gram distribution from a model and a small" << std::endl; std::cerr <<" text. The 1-gram is saved in the feature file. The 1-gram" << std::endl; std::cerr << std::endl; } void usage(const char *msg = 0) { if (msg){ std::cerr << msg << std::endl; } else{ print_help(); } exit(1); } int main(int argc, char **argv) { char *dictfile=NULL; char *trainfile=NULL; char *adafile=NULL; char *featurefile=NULL; char *basefile=NULL; char *hfile=NULL; char *tmphfile=NULL; char *tfile=NULL; char *wfile=NULL; char *ctfile=NULL; char *txtfile=NULL; char *binfile=NULL; int binsize=0; int topics=0; //number of topics int st=0; //special topic: first st dict words int it=0; bool help=false; DeclareParams((char*) "Dictionary", CMDSTRINGTYPE|CMDMSG, &dictfile, "dictionary file", "d", CMDSTRINGTYPE|CMDMSG, &dictfile, "dictionary file", "Binary", CMDSTRINGTYPE|CMDMSG, &binfile, "binary file", "b", CMDSTRINGTYPE|CMDMSG, &binfile, "binary file", "SplitData", CMDINTTYPE|CMDMSG, &binsize, "size of binary file; default is unlimited", "sd", CMDINTTYPE|CMDMSG, &binsize, "size of binary file; default is unlimited", "Collection", CMDSTRINGTYPE|CMDMSG, &trainfile, "text collection file", "c", CMDSTRINGTYPE|CMDMSG, &trainfile, "text collection file", "Model", CMDSTRINGTYPE|CMDMSG, &basefile, "model file", "m", CMDSTRINGTYPE|CMDMSG, &basefile, "model file", "HFile", CMDSTRINGTYPE, &tmphfile, "hf", CMDSTRINGTYPE, &tmphfile, "WFile", CMDSTRINGTYPE, &wfile, "wf", CMDSTRINGTYPE, &wfile, "TFile", CMDSTRINGTYPE, &tfile, "tf", 
CMDSTRINGTYPE, &tfile, "CombineTFile", CMDSTRINGTYPE, &ctfile, "ct", CMDSTRINGTYPE, &ctfile, "TxtFile", CMDSTRINGTYPE, &txtfile, "txt", CMDSTRINGTYPE, &txtfile, "Inference", CMDSTRINGTYPE, &adafile, "inf", CMDSTRINGTYPE, &adafile, "Features", CMDSTRINGTYPE, &featurefile, "f", CMDSTRINGTYPE, &featurefile, "Topics", CMDINTTYPE|CMDMSG, &topics, "number of topics; default is 0", "t", CMDINTTYPE|CMDMSG, &topics,"number of topics; default is 0", "SpecialTopic", CMDINTTYPE|CMDMSG, &st, "special topic: first dictionary words; default is 0", "st", CMDINTTYPE|CMDMSG, &st, "special topic: first dictionary words; default is 0", "Iterations", CMDINTTYPE|CMDMSG, &it, "number of EM iterations; default is 0", "it", CMDINTTYPE|CMDMSG, &it, "number of EM iterations; default is 0", "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help", "h", CMDBOOLTYPE|CMDMSG, &help, "print this help", (char *)NULL ); if (argc == 1){ usage(); } GetParams(&argc, &argv, (char*) NULL); if (help){ usage(); } if (!dictfile) { usage("Missing parameters dictionary"); }; if (!adafile & (!trainfile || !binfile) && (!trainfile || !it || !topics || !basefile)) { usage("Missing parameters for training"); } if ((!trainfile && basefile) && (!featurefile || !adafile || !it || !topics)) { usage("Missing parameters for adapting"); } if ((adafile) && (!featurefile)) { usage("Missing parameters for adapting 2"); } if (!tmphfile) { //set default value hfile=new char[4+1]; strcpy(hfile,"hfff"); } else { //set the value of the parameter hfile=new char[strlen(tmphfile)+1]; strcpy(hfile,tmphfile); } dictionary dict(dictfile); cout << dict.size() << "\n"; dict.incflag(1); dict.encode(dict.BoD()); dict.encode(dict.EoD()); dict.incflag(0); if (dict.oovcode()==-1) { dict.oovcode(dict.encode(dict.OOV())); } cout << dict.size() << "\n"; if (binfile) { cout << "opening collection\n"; doc col(&dict,trainfile); col.open(); if (binsize) col.save(binfile,binsize); else col.save(binfile); exit(1); } system("rm -f hfff"); plsa 
tc(&dict,topics,basefile,featurefile,hfile,wfile,tfile); if (ctfile) { //combine t tc.combineT(ctfile); tc.saveW(basefile); exit(1); } if (trainfile) { tc.train(trainfile,it,.5,1,0.5,st); if (txtfile) tc.saveWtxt(txtfile); } if (adafile) { tc.loadW(basefile); tc.train(adafile,it,.0); } if (strcmp(hfile,"hfff")==0) system("rm -f hfff"); delete hfile; exit(1); } irstlm-5.80.03/src/prune-lm.cpp000644 000766 000024 00000011471 12033356002 020367 0ustar00nicolabertoldistaff000000 000000 // $Id: prune-lm.cpp 27 2010-05-03 14:33:51Z nicolabertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit, prune LM Copyright (C) 2008 Fabio Brugnara, FBK-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ using namespace std; #include #include #include #include #include #include "cmd.h" #include "util.h" #include "math.h" #include "lmtable.h" /********************************/ void print_help(int TypeFlag=0){ std::cerr << std::endl << "prune-lm - prunes language models" << std::endl; std::cerr << std::endl << "USAGE:" << std::endl; std::cerr << " prune-lm [options] []" << std::endl; std::cerr << std::endl << "DESCRIPTION:" << std::endl; std::cerr << " prune-lm reads a LM in either ARPA or compiled format and" << std::endl; std::cerr << " prunes out n-grams (n=2,3,..) for which backing-off to the" << std::endl; std::cerr << " lower order n-gram results in a small difference in probability." << std::endl; std::cerr << " The pruned LM is saved in ARPA format" << std::endl; std::cerr << std::endl << "OPTIONS:" << std::endl; FullPrintParams(TypeFlag, 0, 1, stderr); } void usage(const char *msg = 0) { if (msg){ std::cerr << msg << std::endl; } if (!msg){ print_help(); } exit(1); } void s2t(string cps, float *thr) { int i; char *s=strdup(cps.c_str()), *tk; thr[0]=0; for(i=1,tk=strtok(s, ","); tk; tk=strtok(0, ","),i++) thr[i]=atof(tk); for(; i files; bool help=false; DeclareParams((char*) "threshold", CMDSTRINGTYPE|CMDMSG, &spthr, "pruning thresholds for 2-grams, 3-grams, 4-grams,...; if less thresholds are specified, the last one is applied to all following n-gram levels; default is 0", "t", CMDSTRINGTYPE|CMDMSG, &spthr, "pruning thresholds for 2-grams, 3-grams, 4-grams,...; if less thresholds are specified, the last one is applied to all following n-gram levels; default is 0", "abs", CMDBOOLTYPE|CMDMSG, &aflag, "uses absolute value of weighted difference; default is 0", "Help", 
CMDBOOLTYPE|CMDMSG, &help, "print this help", "h", CMDBOOLTYPE|CMDMSG, &help, "print this help", (char *)NULL ); if (argc == 1){ usage(); } int first_file=1; for (int i=1; i < argc; i++) { if (strcmp(argv[i],"-") == 0){ //handles /dev/stdin or /dev/stdout if (first_file == 1){ files.push_back("/dev/stdin"); }else if (first_file == 2){ files.push_back("/dev/stdout"); }else{ usage("Warning: You can use the value for the input or output file only"); } first_file++; }else if(argv[i][0] != '-'){ files.push_back(argv[i]); first_file++; } } GetParams(&argc, &argv, (char*) NULL); if (help){ usage(); } if (files.size() > 2) { usage("Warning: Too many arguments"); } if (files.size() < 1) { usage("Warning: Specify a LM file to read from"); } memset(thr, 0, sizeof(thr)); if(spthr != NULL) s2t(spthr, thr); std::string infile = files[0]; std::string outfile= ""; if (files.size() == 1) { outfile=infile; //remove path information std::string::size_type p = outfile.rfind('/'); if (p != std::string::npos && ((p+1) < outfile.size())) outfile.erase(0,p+1); //eventually strip .gz if (outfile.compare(outfile.size()-3,3,".gz")==0) outfile.erase(outfile.size()-3,3); outfile+=".plm"; } else outfile = files[1]; lmtable lmt; inputfilestream inp(infile.c_str()); if (!inp.good()) { std::cerr << "Failed to open " << infile << "!" 
//----------------------------------------------------------------------
// Special type and global variable for the BIN CLUSTERING algorithm
//----------------------------------------------------------------------

// One data point handled by the clustering code.
typedef struct {
  float pt;
  unsigned int idx;
  unsigned short code;
} DataItem;

// qsort-style comparator: both arguments point to a float.
// Returns 1 when *a > *b, -1 when *a < *b and 0 otherwise.
int cmpFloatEntry(const void* a,const void* b)
{
  const float lhs = *(const float *)a;
  const float rhs = *(const float *)b;
  return (lhs > rhs) - (lhs < rhs);
}

//----------------------------------------------------------------------
// Global entry points
//----------------------------------------------------------------------
int ComputeCluster(int nc, double* cl,unsigned int N,DataItem* Pts);

//----------------------------------------------------------------------
// Global parameters (some are set in getArgs())
//----------------------------------------------------------------------
int k = 256; // number of centers
const int MAXLEV = 11; //maximum n-gram size
<< std::endl; std::cerr << " You can specify the output file to be created and also the pathname" << std::endl; std::cerr << " of a temporary file used by the program. As default, the temporary " << std::endl; std::cerr << " file is created in the /tmp directory." << std::endl; std::cerr << " Output file can be written to standard output by using the special name -." << std::endl; std::cerr << std::endl << "OPTIONS:" << std::endl; FullPrintParams(TypeFlag, 0, 1, stderr); } void usage(const char *msg = 0) { if (msg){ std::cerr << msg << std::endl; } else{ print_help(); } exit(1); } int main(int argc, char **argv) { std::vector files; bool help=false; DeclareParams((char*) "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help", "h", CMDBOOLTYPE|CMDMSG, &help, "print this help", (char *)NULL ); if (argc == 1){ usage(); } int first_file=1; for (int i=1; i < argc; i++) { if (strcmp(argv[i],"-") == 0){ //handles /dev/stdin or /dev/stdout if (first_file == 1){ files.push_back("/dev/stdin"); }else if (first_file == 2){ files.push_back("/dev/stdout"); }else{ usage("Warning: You can use the value for the input and/or output file only"); } first_file++; }else if(argv[i][0] != '-'){ files.push_back(argv[i]); first_file++; } } GetParams(&argc, &argv, (char*) NULL); if (help){ usage(); } if (files.size() > 3) { usage("Warning: Too many arguments"); } if (files.size() < 1) { usage("Warning: Please specify a LM file to read from"); } std::string infile = files[0]; std::string outfile=""; std::string tmpfile=""; if (files.size() == 1) { outfile=infile; //remove path information std::string::size_type p = outfile.rfind('/'); if (p != std::string::npos && ((p+1) < outfile.size())) outfile.erase(0,p+1); //eventually strip .gz if (outfile.compare(outfile.size()-3,3,".gz")==0) outfile.erase(outfile.size()-3,3); outfile+=".qlm"; } else outfile = files[1]; if (files.size()==3) { //create temporary file tmpfile = files[2]; mfstream dummy(tmpfile.c_str(),ios::out); dummy.close(); } else { 
//create temporary internal file in /tmp mfstream dummy; createtempfile(dummy,tmpfile,ios::out); dummy.close(); } std::cerr << "Reading " << infile << "..." << std::endl; inputfilestream inp(infile.c_str()); if (!inp.good()) { std::cerr << "Failed to open " << infile << "!\n"; exit(1); } std::ofstream* out; if (outfile == "-") out = (ofstream *)&std::cout; else { out=new std::ofstream; out->open(outfile.c_str()); } if (!out->good()) { std::cerr << "Failed to open " << outfile << "!\n"; exit(1); } std::cerr << "Writing " << outfile << "..." << std::endl; //prepare temporary file to save n-gram blocks for multiple reads //this avoids using seeks which do not work with inputfilestream //it's odd but i need a bidirectional filestream! std::cerr << "Using temporary file " << tmpfile << std::endl; fstream filebuff(tmpfile.c_str(),ios::out|ios::in|ios::binary); unsigned int nPts = 0; // actual number of points // *** Read ARPA FILE ** unsigned int numNgrams[MAXLEV + 1]; /* # n-grams for each order */ int Order=0,MaxOrder=0; int n=0; float logprob,logbow; DataItem* dataPts; double* centersP=NULL; double* centersB=NULL; //maps from point index to code unsigned short* mapP=NULL; unsigned short* mapB=NULL; int centers[MAXLEV + 1]; streampos iposition; for (int i=1; i<=MAXLEV; i++) numNgrams[i]=0; for (int i=1; i<=MAXLEV; i++) centers[i]=k; /* all levels 256 centroids; in case read them as parameters */ char line[MAX_LINE]; while (inp.getline(line,MAX_LINE)) { bool backslash = (line[0] == '\\'); if (sscanf(line, "ngram %d=%d", &Order, &n) == 2) { numNgrams[Order] = n; MaxOrder=Order; continue; } if (!strncmp(line, "\\data\\", 6) || strlen(line)==0) continue; if (backslash && sscanf(line, "\\%d-grams", &Order) == 1) { // print output header: if (Order == 1) { *out << "qARPA " << MaxOrder; for (int i=1; i<=MaxOrder; i++) *out << " " << centers[i]; *out << "\n\n\\data\\\n"; for (int i=1; i<=MaxOrder; i++) *out << "ngram " << i << "= " << numNgrams[i] << "\n"; } *out << "\n"; *out 
<< line << "\n"; cerr << "-- Start processing of " << Order << "-grams\n"; assert(Order <= MAXLEV); unsigned int N=numNgrams[Order]; const char* words[MAXLEV+3]; dataPts=new DataItem[N]; // allocate data //reset tempout file to start writing filebuff.seekg((streampos)0); for (nPts=0; nPtsflush(); out->close(); inp.close(); removefile(tmpfile.c_str()); } // Compute Clusters int ComputeCluster(int centers,double* ctrs,unsigned int N,DataItem* bintable) { //cerr << "\nExecuting Clutering Algorithm: k=" << centers<< "\n"; double log10=log(10.0); for (unsigned int i=0; i0) { currcode++; } } if (bintable[i].pt == bintable[i-1].pt) bintable[i].code=bintable[i-1].code; else { bintable[i].code=currcode; species[currcode]++; } population[bintable[i].code]++; assert(bintable[i].code < centers); ctrs[bintable[i].code]=ctrs[bintable[i].code]+exp(bintable[i].pt * log10); } for (int i=0; i0) ctrs[i]=log(ctrs[i]/population[i])/log10; else ctrs[i]=-99; if (ctrs[i]<-99) { cerr << "Warning: adjusting center with too small prob " << ctrs[i] << "\n"; ctrs[i]=-99; } cerr << i << " ctr " << ctrs[i] << " population " << population[i] << " species " << species[i] <<"\n"; } cout.flush(); delete [] population; delete [] species; return 1; } //---------------------------------------------------------------------- // Reading/Printing utilities // readPt - read a point from input stream into data storage // at position i. Returns false on error or EOF. 
// printPt - prints a points to output file //---------------------------------------------------------------------- irstlm-5.80.03/src/score-lm.cpp000644 000766 000024 00000005660 12032511222 020350 0ustar00nicolabertoldistaff000000 000000 /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2010 Christian Hardmeier, FBK-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include #include #include #include void usage() { std::cerr << std::endl << "score-lm - scores sentences with a language model" << std::endl; std::cerr << std::endl << "USAGE:" << std::endl << " score-lm -lm [options]" << std::endl; std::cerr << std::endl << "OPTIONS:" << std::endl; std::cerr << " -lm language model to use (must be specified)" << std::endl; std::cerr << " -dub dictionary upper bound (default: 10000000" << std::endl; std::cerr << " -level max level to load from the language models (default: 1000," << std::endl; std::cerr << " meaning the actual LM order)" << std::endl; std::cerr << " -mm 1 memory-mapped access to lm (default: 0)" << std::endl; std::cerr << std::endl; exit(1); } int main(int argc, char **argv) { int mmap = 0; int dub = 
10000000; int requiredMaxlev = 1000; char *lm = NULL; for(int i = 1; i < argc; i++) { if(!strcmp(argv[i], "-mm")) { if(++i == argc) usage(); mmap = atoi(argv[i]); } else if(!strcmp(argv[i], "-dub")) { if(++i == argc) usage(); dub = atoi(argv[i]); } else if(!strcmp(argv[i], "-lm")) { if(++i == argc) usage(); lm = argv[i]; } else if(!strcmp(argv[i], "-level")) { if(++i == argc) usage(); requiredMaxlev = atoi(argv[i]); } else usage(); } if(lm == NULL) usage(); std::ifstream lmstr(lm); lmtable lmt; lmt.setMaxLoadedLevel(requiredMaxlev); lmt.load(lmstr, lm, NULL, mmap); lmt.setlogOOVpenalty(dub); for(;;) { std::string line; std::getline(std::cin, line); if(!std::cin.good()) return !std::cin.eof(); std::istringstream linestr(line); ngram ng(lmt.dict); double logprob = .0; while((linestr >> ng)) logprob += lmt.lprob(ng); std::cout << logprob << std::endl; } } irstlm-5.80.03/src/shiftlm.cpp000644 000766 000024 00000034143 12030110631 020270 0ustar00nicolabertoldistaff000000 000000 /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include #include #include "mfstream.h" #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "ngramtable.h" #include "ngramcache.h" #include "normcache.h" #include "interplm.h" #include "mdiadapt.h" #include "shiftlm.h" // //Shiftone interpolated language model // shiftone::shiftone(char* ngtfile,int depth,int prunefreq,TABLETYPE tt): mdiadaptlm(ngtfile,depth,tt) { cerr << "Creating LM with ShiftOne smoothing\n"; prunethresh=prunefreq; cerr << "PruneThresh: " << prunethresh << "\n"; beta=1.0; }; int shiftone::train() { trainunigr(); return 1; } int shiftone::discount(ngram ng_,int size,double& fstar,double& lambda, int cv) { ngram ng(dict); ng.trans(ng_); // cout << "size :" << size << " " << ng <<"\n"; if (size > 1) { ngram history=ng; if (ng.ckhisto(size) && get(history,size,size-1) && (history.freq>cv) && ((size < 3) || ((history.freq-cv) > prunethresh))) { // this history is not pruned out get(ng,size,size); cv=(cv>ng.freq)?ng.freq:cv; if (ng.freq > cv) { fstar=(double)((double)(ng.freq - cv) - beta)/(double)(history.freq-cv); lambda=beta * ((double)history.succ/(double)(history.freq-cv)); } else { // ng.freq == cv: do like if ng was deleted from the table fstar=0.0; lambda=beta * ((double)(history.succ-1)/ //one successor has disappeared! 
(double)(history.freq-cv)); } //cerr << "ngram :" << ng << "\n"; //check if the last word is OOV if (*ng.wordp(1)==dict->oovcode()) { lambda+=fstar; fstar=0.0; } else { //complete lambda with oovcode probability *ng.wordp(1)=dict->oovcode(); if (get(ng,size,size)) lambda+=(double)((double)ng.freq - beta)/(double)(history.freq-cv); } } else { fstar=0; lambda=1; } } else { fstar=unigr(ng); lambda=0.0; } return 1; } // //Shiftbeta interpolated language model // shiftbeta::shiftbeta(char* ngtfile,int depth,int prunefreq,double b,TABLETYPE tt): mdiadaptlm(ngtfile,depth,tt) { cerr << "Creating LM with ShiftBeta smoothing\n"; if (b==-1.0 || (b < 1.0 && b >0.0)) { beta=new double[lmsize()+1]; for (int l=lmsize(); l>1; l--) beta[l]=b; } else { cerr << "shiftbeta: beta must be < 1.0 and > 0\n"; exit (1); } prunethresh=prunefreq; cerr << "PruneThresh: " << prunethresh << "\n"; }; int shiftbeta::train() { ngram ng(dict); int n1,n2; trainunigr(); beta[1]=0.0; for (int l=2; l<=lmsize(); l++) { cerr << "level " << l << "\n"; n1=0; n2=0; scan(ng,INIT,l); while(scan(ng,CONT,l)) { if (l1 && ng.containsWord(dict->OOV(),l)) { //cerr << "skp ngram" << ng << "\n"; continue; } //skip n-grams containing in context if (l>1 && ng.containsWord(dict->EoS(),l-1)) { //cerr << "skp ngram" << ng << "\n"; continue; } //skip 1-grams containing if (l==1 && ng.containsWord(dict->BoS(),l)) { //cerr << "skp ngram" << ng << "\n"; continue; } if (ng.freq==1) n1++; else if (ng.freq==2) n2++; } //compute statistics of shiftbeta smoothing if (beta[l]==-1) { if (n1>0) beta[l]=(double)n1/(double)(n1 + 2 * n2); else { cerr << "no singletons! 
// Compute the shift-beta estimate for one n-gram.
//
//   ng_    : the n-gram to evaluate; it is re-encoded into a local copy
//   size   : order at which the n-gram is evaluated
//   fstar  : (out) discounted relative frequency of the n-gram
//   lambda : (out) weight left over for the lower-order distribution
//   cv     : count removed from the n-gram and from its history before the
//            estimate is computed (presumably the cross-validation /
//            leave-out count -- confirm against the callers)
//
// Always returns 1.
int shiftbeta::discount(ngram ng_,int size,double& fstar,double& lambda, int cv)
{
  // Work on a copy translated through this model's dictionary.
  ngram ng(dict);
  ng.trans(ng_);

  if (size > 1) {

    ngram history=ng;

    // The history is used only if it passes ckhisto(), is found in the
    // table, still has a positive count once cv is removed and, from
    // order 3 up, its remaining count exceeds the pruning threshold.
    if (ng.ckhisto(size) && get(history,size,size-1) && (history.freq>cv) &&
        ((size < 3) || ((history.freq-cv) > prunethresh ))) { // history pruning applies only from order 3 up

      // The n-gram itself counts as observed unless singleton pruning is on,
      // the order is at least 3 and its frequency is 1.
      if (get(ng,size,size) && (!prunesingletons() || ng.freq >1 || size<3)) {

        // Never remove more than the n-gram actually has.
        cv=(cv>ng.freq)?ng.freq:cv;

        if (ng.freq>cv) {

          fstar=(double)((double)(ng.freq - cv) - beta[size])/(double)(history.freq-cv);

          lambda=beta[size]*((double)history.succ/(double)(history.freq-cv));

          if (size>=3 && prunesingletons())  // correction due to frequency pruning
            lambda+=(1.0-beta[size]) * (double)succ1(history.link)/(double)(history.freq-cv);

          // succ1(history.link) is not affected if ng.freq > cv

        } else { // ng.freq == cv

          fstar=0.0;

          lambda=beta[size]*((double)(history.succ-1)/ // this successor has disappeared
                             (double)(history.freq-cv));

          if (size>=3 && prunesingletons()) // take into account single event pruning
            lambda+=(1.0-beta[size]) * (double)(succ1(history.link)-(cv==1 && ng.freq==1?1:0))
                    /(double)(history.freq-cv);
        }
      } else {

        // N-gram not found (or treated as pruned): everything goes to lambda.
        // NOTE(review): this branch divides by history.freq while the other
        // branches divide by history.freq-cv -- confirm this is intended.
        fstar=0.0;

        lambda=beta[size]*(double)history.succ/(double)history.freq;

        if (size>=3 && prunesingletons()) // correction due to frequency pruning
          lambda+=(1.0-beta[size]) * (double)succ1(history.link)/(double)history.freq;
      }

      //cerr << "ngram :" << ng << "\n";

      if (*ng.wordp(1)==dict->oovcode()) {
        // The predicted word is the OOV symbol: move its mass into lambda.
        lambda+=fstar;
        fstar=0.0;
      } else {
        // Otherwise add to lambda the discounted mass of the n-gram obtained
        // by replacing the predicted word with the OOV symbol, if present.
        *ng.wordp(1)=dict->oovcode();
        if (get(ng,size,size) && (!prunesingletons() || ng.freq >1 || size<3))
          lambda+=(double)((double)ng.freq - beta[size])/(double)(history.freq-cv);
      }
    } else {
      // Unusable history: rely entirely on the lower-order distribution.
      fstar=0;
      lambda=1;
    }
  } else {
    // Order 1: take the value of unigr() and leave nothing to lower orders.
    fstar=unigr(ng);
    lambda=0.0;
  }

  return 1;
}
// Estimate the three discount coefficients of every level from the
// count-of-counts of that level.
//
// For each level l from 1 to lmsize() the n-grams are scanned and n1..n4
// (number of n-grams whose mfreq() is 1, 2, 3, 4) are collected; then
//   beta[0][l] = Y = n1 / (n1 + 2*n2)
//   beta[1][l] = 2 - 3*Y*n3/n2
//   beta[2][l] = 3 - 4*Y*n4/n3
// with fall-backs when the higher counts are unusable. oovsum is computed
// from the level-1 coefficients.
//
// Exits with status 1 when n1 or n2 is zero or n1 <= n2. Returns 1.
//
// NOTE(review): beta is declared as double beta[3][MAX_NGRAM] and is indexed
// here with l up to lmsize(); if lmsize() can reach MAX_NGRAM the last index
// is out of range -- confirm the bound.
int mshiftbeta::train()
{
  trainunigr();

  gencorrcounts();
  gensuccstat();

  ngram ng(dict);
  int n1,n2,n3,n4;
  int unover3=0;   // number of 1-grams whose frequency is at least 3

  oovsum=0;

  for (int l=1; l<=lmsize(); l++) {

    cerr << "level " << l << "\n";

    cerr << "computing statistics\n";

    n1=0;
    n2=0;
    n3=0,n4=0;

    scan(ng,INIT,l);

    while(scan(ng,CONT,l)) {

      //skip n-grams containing the OOV symbol (dict->OOV())
      if (l>1 && ng.containsWord(dict->OOV(),l)) {
        //cerr << "skp ngram" << ng << "\n";
        continue;
      }

      //skip n-grams containing the end-of-sentence symbol (dict->EoS()) in context
      if (l>1 && ng.containsWord(dict->EoS(),l-1)) {
        //cerr << "skp ngram" << ng << "\n";
        continue;
      }

      //skip 1-grams containing the begin-of-sentence symbol (dict->BoS())
      if (l==1 && ng.containsWord(dict->BoS(),l)) {
        //cerr << "skp ngram" << ng << "\n";
        continue;
      }

      // Use the frequency reported by mfreq() rather than the stored one.
      ng.freq=mfreq(ng,l);

      if (ng.freq==1) n1++;
      else if (ng.freq==2) n2++;
      else if (ng.freq==3) n3++;
      else if (ng.freq==4) n4++;
      if (l==1 && ng.freq >=3) unover3++;
    }

    if (l==1) {
      cerr << " n1: " << n1 << " n2: " << n2 << " n3: " << n3 << " n4: " << n4 << " unover3: " << unover3 << "\n";
    } else {
      cerr << " n1: " << n1 << " n2: " << n2 << " n3: " << n3 << " n4: " << n4 << "\n";
    }

    // Without usable n1 and n2 no coefficient can be computed at all.
    if (n1 == 0 || n2 == 0 || n1 <= n2) {
      cerr << "Error: lower order count-of-counts cannot be estimated properly\n";
      cerr << "Hint: use another smoothing method with this corpus.\n";
      exit(1);
    }

    double Y=(double)n1/(double)(n1 + 2 * n2);
    beta[0][l] = Y; //equivalent to 1 - 2 * Y * n2 / n1

    // When n3/n4 are missing or not decreasing, reuse Y for all three
    // coefficients.
    if (n3 ==0 || n4 == 0 || n2 <= n3 || n3 <= n4 ){
      cerr << "Warning: higher order count-of-counts cannot be estimated properly\n";
      cerr << "Fixing this problem by resorting only on the lower order count-of-counts\n";

      beta[1][l] = Y;
      beta[2][l] = Y;
    }
    else{
      beta[1][l] = 2 - 3 * Y * n3 / n2;
      beta[2][l] = 3 - 4 * Y * n4 / n3;
    }

    // Negative coefficients are clamped to zero.
    if (beta[1][l] < 0){
      cerr << "Warning: discount coefficient is negative \n";
      cerr << "Fixing this problem by setting beta to 0 \n";
      beta[1][l] = 0;
    }

    if (beta[2][l] < 0){
      cerr << "Warning: discount coefficient is negative \n";
      cerr << "Fixing this problem by setting beta to 0 \n";
      beta[2][l] = 0;
    }

    // Computed at level 1 only, from the level-1 counts and coefficients.
    if (l==1)
      oovsum=beta[0][l] * (double) n1 + beta[1][l] * (double)n2 + beta[2][l] * (double)unover3;

    cerr << beta[0][l] << " " << beta[1][l] << " " << beta[2][l] << "\n";
  }

  return 1;
};
&& prunetopsingletons())) //correction lambda+=(double)(suc[0] * (1-beta[0][size])) / (double)(history.freq-cv); } //cerr << "ngram :" << ng << "\n"; if (*ng.wordp(1)==dict->oovcode()) { lambda+=fstar; fstar=0.0; } else { *ng.wordp(1)=dict->oovcode(); if (get(ng,size,size)) { ng.freq=mfreq(ng,size); if ((!prunesingletons() || mfreq(ng,size)>1 || size<3) && (!prunetopsingletons() || mfreq(ng,size)>1 || size=3?beta[2][size]:beta[ng.freq-1][size]); lambda+=(double)(ng.freq - b)/(double)(history.freq-cv); } } } } else { fstar=0; lambda=1; } } else { // unigram case, no cross-validation lambda=0.0; int unigrtotfreq=(sizedecode(*ng.wordp(1)) << "\n"; exit(1); } } return 1; } //Symmetric Shiftbeta int symshiftbeta::discount(ngram ng_,int size,double& fstar,double& lambda, int /* unused parameter: cv */) { ngram ng(dict); ng.trans(ng_); // cout << "size :" << size << " " << ng <<"\n"; // Pr(x/y)= max{(c([x,y])-beta)/(N Pr(y)),0} + lambda Pr(x) // lambda=#bigrams/N assert(size<=2); // only works with bigrams // if (size == 3) { ngram history=ng; } if (size == 2) { //compute unigram probability of denominator ngram unig(dict,1); *unig.wordp(1)=*ng.wordp(2); double prunig=unigr(unig); //create symmetric bigram if (*ng.wordp(1) > *ng.wordp(2)) { int tmp=*ng.wordp(1); *ng.wordp(1)=*ng.wordp(2); *ng.wordp(2)=tmp; } lambda=beta[2] * (double) entries(2)/(double)totfreq(); if (get(ng,2,2)) { fstar=(double)((double)ng.freq - beta[2])/ (totfreq() * prunig); } else { fstar=0; } } else { fstar=unigr(ng); lambda=0.0; } return 1; } /* main(int argc, char** argv){ dictionary d(argv[1]); shiftbeta ilm(&d,argv[2],3); ngramtable test(&d,argv[2],3); ilm.train(); cerr << "PP " << ilm.test(test) << "\n"; ilm.savebin("newlm.lm",3); } */ irstlm-5.80.03/src/shiftlm.h000644 000766 000024 00000004376 12013405172 017752 0ustar00nicolabertoldistaff000000 000000 /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 
// Non linear Shift based interpolated LMs

// Shift-based model with a single discount constant shared by all levels.
class shiftone: public mdiadaptlm
{
protected:
  int prunethresh;  // pruning threshold compared against history counts in discount()
  double beta;      // discount constant
public:
  // ngtfile, depth and tt are passed on to the mdiadaptlm constructor;
  // prunefreq initialises prunethresh.
  shiftone(char* ngtfile,int depth=0,int prunefreq=0,TABLETYPE tt=SHIFTBETA_B);
  int train();
  // Writes the discounted frequency to fstar and the lower-order weight to
  // lambda for the n-gram ng evaluated at order size.
  int discount(ngram ng,int size,double& fstar,double& lambda,int cv=0);
  ~shiftone() {}
};


// Shift-based model with one discount per level.
class shiftbeta: public mdiadaptlm
{
protected:
  int prunethresh;  // pruning threshold compared against history counts in discount()
  // Heap array of per-level discounts, indexed by level; released in the
  // destructor.
  // NOTE(review): no copy constructor / assignment is declared here, so a
  // copy of this object would delete the same array twice -- confirm that
  // instances are never copied.
  double* beta;

public:
  // beta is the initial discount for every level above 1 (default -1).
  shiftbeta(char* ngtfile,int depth=0,int prunefreq=0,double beta=-1,TABLETYPE tt=SHIFTBETA_B);
  int train();
  int discount(ngram ng,int size,double& fstar,double& lambda,int cv=0);
  ~shiftbeta() {
    delete [] beta;
  }

};


// Variant of shiftbeta that only replaces discount(); construction is
// forwarded unchanged, with the table type left at shiftbeta's default.
class symshiftbeta: public shiftbeta
{
public:
  symshiftbeta(char* ngtfile,int depth=0,int prunefreq=0,double beta=-1):
    shiftbeta(ngtfile,depth,prunefreq,beta) {}
  int discount(ngram ng,int size,double& fstar,double& lambda,int cv=0);
};
mfreq(ngram& ng,int l) { return (l #include #include "mfstream.h" #include #include #include #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "mempool.h" #include "ngramcache.h" #include "ngramtable.h" #include "interplm.h" #include "normcache.h" #include "mdiadapt.h" #include "shiftlm.h" #include "linearlm.h" #include "mixture.h" #include "cmd.h" #include "lmtable.h" #define YES 1 #define NO 0 #define NGRAM 1 #define SEQUENCE 2 #define ADAPT 3 #define TURN 4 #define TEXT 5 #define END_ENUM { (char*)0, 0 } static Enum_T BooleanEnum [] = { { "Yes", YES }, { "No", NO}, { "yes", YES }, { "no", NO}, { "y", YES }, { "n", NO}, END_ENUM }; static Enum_T LmTypeEnum [] = { { "ModifiedShiftBeta", MOD_SHIFT_BETA }, { "msb", MOD_SHIFT_BETA }, { "InterpShiftBeta", SHIFT_BETA }, { "ShiftBeta", SHIFT_BETA }, { "sb", SHIFT_BETA }, { "InterpShiftOne", SHIFT_ONE }, { "ShiftOne", SHIFT_ONE }, { "s1", SHIFT_ONE }, { "LinearWittenBell", LINEAR_WB }, { "wb", LINEAR_WB }, { "LinearGoodTuring", LINEAR_GT }, { "Mixture", MIXTURE }, { "mix", MIXTURE }, END_ENUM }; #define RESET 1 #define SAVE 2 #define LOAD 3 #define INIT 4 #define STOP 5 #define BIN 11 #define ARPA 12 #define ASR 13 #define TXT 14 #define NGT 15 int init(mdiadaptlm** lm, int lmtype, char *trainfile, int size, int prunefreq, double beta, int backoff, int dub, double oovrate, int mcl); int deinit(mdiadaptlm** lm); int main(int argc, char **argv) { char *dictfile=NULL; char *trainfile=NULL; char *BINfile=NULL; char *ARPAfile=NULL; char *ASRfile=NULL; int backoff=0; //back-off or interpolation int lmtype=0; int dub=0; //dictionary upper bound int size=0; //lm size int statistics=0; int prunefreq=NO; int prunesingletons=YES; int prunetopsingletons=NO; double beta=-1; int compsize=NO; int checkpr=NO; double oovrate=0; int max_caching_level=0; char *outpr=NULL; int memmap = 0; //write binary format with/without memory map, default is 0 DeclareParams( "Back-off",CMDENUMTYPE, &backoff, 
BooleanEnum, "bo",CMDENUMTYPE, &backoff, BooleanEnum, "Dictionary", CMDSTRINGTYPE, &dictfile, "d", CMDSTRINGTYPE, &dictfile, "DictionaryUpperBound", CMDINTTYPE, &dub, "dub", CMDINTTYPE, &dub, "NgramSize", CMDSUBRANGETYPE, &size, 1 , MAX_NGRAM, "n", CMDSUBRANGETYPE, &size, 1 , MAX_NGRAM, "Ngram", CMDSTRINGTYPE, &trainfile, "TrainOn", CMDSTRINGTYPE, &trainfile, "tr", CMDSTRINGTYPE, &trainfile, "oASR", CMDSTRINGTYPE, &ASRfile, "oasr", CMDSTRINGTYPE, &ASRfile, "o", CMDSTRINGTYPE, &ARPAfile, "oARPA", CMDSTRINGTYPE, &ARPAfile, "oarpa", CMDSTRINGTYPE, &ARPAfile, "oBIN", CMDSTRINGTYPE, &BINfile, "obin", CMDSTRINGTYPE, &BINfile, "LanguageModelType",CMDENUMTYPE, &lmtype, LmTypeEnum, "lm",CMDENUMTYPE, &lmtype, LmTypeEnum, "Statistics",CMDSUBRANGETYPE, &statistics, 1 , 3, "s",CMDSUBRANGETYPE, &statistics, 1 , 3, "PruneThresh",CMDSUBRANGETYPE, &prunefreq, 1 , 1000, "p",CMDSUBRANGETYPE, &prunefreq, 1 , 1000, "PruneSingletons",CMDENUMTYPE, &prunesingletons, BooleanEnum, "ps",CMDENUMTYPE, &prunesingletons, BooleanEnum, "PruneTopSingletons",CMDENUMTYPE, &prunetopsingletons, BooleanEnum, "pts",CMDENUMTYPE, &prunetopsingletons, BooleanEnum, "ComputeLMSize",CMDENUMTYPE, &compsize, BooleanEnum, "sz",CMDENUMTYPE, &compsize, BooleanEnum, "MaximumCachingLevel", CMDINTTYPE , &max_caching_level, "mcl", CMDINTTYPE, &max_caching_level, "MemoryMap", CMDENUMTYPE, &memmap, BooleanEnum, "memmap", CMDENUMTYPE, &memmap, BooleanEnum, "mm", CMDENUMTYPE, &memmap, BooleanEnum, "CheckProb",CMDENUMTYPE, &checkpr, BooleanEnum, "cp",CMDENUMTYPE, &checkpr, BooleanEnum, "OutProb",CMDSTRINGTYPE, &outpr, "op",CMDSTRINGTYPE, &outpr, "SetOovRate", CMDDOUBLETYPE, &oovrate, "or", CMDDOUBLETYPE, &oovrate, "Beta", CMDDOUBLETYPE, &beta, "beta", CMDDOUBLETYPE, &beta, (char *)NULL ); GetParams(&argc, &argv, (char*) NULL); if (!lmtype) { cerr <<"Missing parameters\n"; exit(1); } cerr <<"LM size: " << size << "\n"; char header[BUFSIZ]; char filename[BUFSIZ]; int cmdcounter=0; mdiadaptlm *lm=NULL; int cmdtype=INIT; int 
filetype=0; int BoSfreq=0; init(&lm, lmtype, trainfile, size, prunefreq, beta, backoff, dub, oovrate, max_caching_level); ngram ng(lm->dict), ng2(lm->dict); cerr << "filling the initial n-grams with BoS\n"; for (int i=1; imaxlevel(); i++) { ng.pushw(lm->dict->BoS()); ng.freq=1; } mfstream inp("/dev/stdin",ios::in ); int c=0; while (inp >> header) { if (strncmp(header,"@CMD@",5)==0) { cmdcounter++; inp >> header; cerr << "Read |@CMD@| |" << header << "|"; cmdtype=INIT; filetype=BIN; if (strncmp(header,"RESET",5)==0) cmdtype=RESET; else if (strncmp(header,"INIT",4)==0) cmdtype=INIT; else if (strncmp(header,"SAVEBIN",7)==0) { cmdtype=SAVE; filetype=BIN; } else if (strncmp(header,"SAVEARPA",8)==0) { cmdtype=SAVE; filetype=ARPA; } else if (strncmp(header,"SAVEASR",7)==0) { cmdtype=SAVE; filetype=ASR; } else if (strncmp(header,"SAVENGT",7)==0) { cmdtype=SAVE; filetype=NGT; } else if (strncmp(header,"LOADNGT",7)==0) { cmdtype=LOAD; filetype=NGT; } else if (strncmp(header,"LOADTXT",7)==0) { cmdtype=LOAD; filetype=TXT; } else if (strncmp(header,"STOP",4)==0) cmdtype=STOP; else { cerr << "CMD " << header << " is unknown\n"; exit(1); } char** lastwords; char *isym; switch (cmdtype) { case STOP: cerr << "\n"; exit(1); break; case SAVE: inp >> filename; //storing the output filename cerr << " |" << filename << "|\n"; //save actual ngramtable char tmpngtfile[BUFSIZ]; sprintf(tmpngtfile,"%s.ngt%d",filename,cmdcounter); cerr << "saving temporary ngramtable (binary)..." << tmpngtfile << "\n"; ((ngramtable*) lm)->ngtype("ngram"); ((ngramtable*) lm)->savetxt(tmpngtfile,size); //get the actual frequency of BoS symbol, because the constructor of LM will reset to 1; BoSfreq=lm->dict->freq(lm->dict->encode(lm->dict->BoS())); lm->train(); lm->prunesingletons(prunesingletons==YES); lm->prunetopsingletons(prunetopsingletons==YES); if (prunetopsingletons==YES) //keep most specific lm->prunesingletons(NO); switch (filetype) { case BIN: cerr << "saving lm (binary) ... 
" << filename << "\n"; lm->saveBIN(filename,backoff,dictfile,memmap); cerr << "\n"; break; case ARPA: cerr << "save lm (ARPA)... " << filename << "\n"; lm->saveARPA(filename,backoff,dictfile); cerr << "\n"; break; case ASR: cerr << "save lm (ASR)... " << filename << "\n"; lm->saveASR(filename,backoff,dictfile); cerr << "\n"; break; case NGT: cerr << "save the ngramtable on ... " << filename << "\n"; { ifstream ifs(tmpngtfile, ios::binary); std::ofstream ofs(filename, std::ios::binary); ofs << ifs.rdbuf(); } cerr << "\n"; break; default: cerr << "Saving type is unknown\n"; exit(1); }; //store last words up to the LM order (filling with BoS if needed) ng.size=(ng.size>lm->maxlevel())?lm->maxlevel():ng.size; lastwords = new char*[lm->maxlevel()]; for (int i=1; imaxlevel(); i++) { lastwords[i] = new char[BUFSIZ]; if (i<=ng.size) strcpy(lastwords[i],lm->dict->decode(*ng.wordp(i))); else strcpy(lastwords[i],lm->dict->BoS()); } deinit(&lm); init(&lm, lmtype, tmpngtfile, size, prunefreq, beta, backoff, dub, oovrate, max_caching_level); if (remove(tmpngtfile) != 0) cerr << "Error deleting file " << tmpngtfile << endl; else cerr << "File " << tmpngtfile << " successfully deleted" << endl; //re-set the dictionaries of the working ngrams and re-encode the actual ngram ng.dict=ng2.dict=lm->dict; ng.size=lm->maxlevel(); //restore the last words re-encoded wrt to the new dictionary for (int i=1; imaxlevel(); i++) { *ng.wordp(i)=lm->dict->encode(lastwords[i]); delete []lastwords[i]; } delete []lastwords; //re-set the actual frequency of BoS symbol, because the constructor of LM deleted it; lm->dict->freq(lm->dict->encode(lm->dict->BoS()), BoSfreq); break; case RESET: //restart from scratch deinit(&lm); init(&lm, lmtype, NULL, size, prunefreq, beta, backoff, dub, oovrate, max_caching_level); ng.dict=ng2.dict=lm->dict; cerr << "filling the initial n-grams with BoS\n"; for (int i=1; imaxlevel(); i++) { ng.pushw(lm->dict->BoS()); ng.freq=1; } break; case INIT: cerr << "CMD " << header 
<< " not yet implemented\n"; exit(1); break; case LOAD: inp >> filename; //storing the input filename cerr << " |" << filename << "|\n"; isym=new char[BUFSIZ]; strcpy(isym,lm->dict->EoS()); ngramtable* ngt; switch (filetype) { case NGT: cerr << "loading an ngramtable..." << filename << "\n"; ngt = new ngramtable(filename,size,isym,NULL,NULL); ((ngramtable*) lm)->augment(ngt); cerr << "\n"; break; case TXT: cerr << "loading from text..." << filename << "\n"; ngt= new ngramtable(filename,size,isym,NULL,NULL); ((ngramtable*) lm)->augment(ngt); cerr << "\n"; break; default: cerr << "This file type is unknown\n"; exit(1); }; break; default: cerr << "CMD " << header << " is unknown\n"; exit(1); }; } else { ng.pushw(header); // CHECK: serve questa trans() ng2.trans(ng); //reencode with new dictionary lm->check_dictsize_bound(); //CHECK: e' corretto ng.size? non dovrebbe essere ng2.size? if (ng.size) lm->dict->incfreq(*ng2.wordp(1),1); //CHECK: what about filtering dictionary??? /* if (filterdict){ int code=filterdict->encode(dict->decode(*ng2.wordp(maxlev))); if (code!=filterdict->oovcode()) put(ng2); } else put(ng2); */ lm->put(ng2); if (!(++c % 1000000)) cerr << "."; } } if (statistics) { cerr << "TLM: lm stat ..."; lm->lmstat(statistics); cerr << "\n"; } cerr << "TLM: deleting lm ..."; //delete lm; cerr << "\n"; exit(0); } int init(mdiadaptlm** lm, int lmtype, char *trainfile, int size, int prunefreq, double beta, int backoff, int dub, double oovrate, int mcl) { cerr << "initializing lm... 
\n"; if (trainfile) cerr << "creating lm from " << trainfile << "\n"; else cerr << "creating an empty lm\n"; switch (lmtype) { case SHIFT_BETA: if (beta==-1 || (beta<1.0 && beta>0)) *lm=new shiftbeta(trainfile,size,prunefreq,beta,(backoff?SHIFTBETA_B:SHIFTBETA_I)); else { cerr << "ShiftBeta: beta must be >0 and <1\n"; exit(1); } break; case MOD_SHIFT_BETA: if (size>1) *lm=new mshiftbeta(trainfile,size,prunefreq,(backoff?MSHIFTBETA_B:MSHIFTBETA_I)); else { cerr << "Modified Shift Beta requires size > 1!\n"; exit(1); } break; case SHIFT_ONE: *lm=new shiftone(trainfile,size,prunefreq,(backoff?SIMPLE_B:SIMPLE_I)); break; case LINEAR_WB: *lm=new linearwb(trainfile,size,prunefreq,(backoff?MSHIFTBETA_B:MSHIFTBETA_I)); break; case LINEAR_GT: cerr << "This LM is no more supported\n"; break; case MIXTURE: cerr << "not implemented yet\n"; break; default: cerr << "not implemented yet\n"; exit(1); }; if (dub) (*lm)->dub(dub); (*lm)->create_caches(mcl); cerr << "eventually generate OOV code\n"; (*lm)->dict->genoovcode(); if (oovrate) (*lm)->dict->setoovrate(oovrate); (*lm)->dict->incflag(1); if (!trainfile) { cerr << "adding the initial dummy n-grams to make table consistent\n"; ngram dummyng((*lm)->dict); cerr << "preparing initial dummy n-grams\n"; for (int i=1; i<(*lm)->maxlevel(); i++) { dummyng.pushw((*lm)->dict->BoS()); dummyng.freq=1; } cerr << "inside init: dict: " << (*lm)->dict << " dictsize: " << (*lm)->dict->size() << "\n"; cerr << "dummyng: |" << dummyng << "\n"; (*lm)->put(dummyng); cerr << "inside init: dict: " << (*lm)->dict << " dictsize: " << (*lm)->dict->size() << "\n"; } cerr << "lm initialized \n"; return 1; } int deinit(mdiadaptlm** lm) { delete *lm; return 1; } irstlm-5.80.03/src/timer.cpp000644 000766 000024 00000005265 12013405172 017755 0ustar00nicolabertoldistaff000000 000000 #include #include #include #include "util.h" #include "timer.h" /*** * Return the total time that the timer has been in the "running" * state since it was first "started" or last 
"restarted". For * "short" time periods (less than an hour), the actual cpu time * used is reported instead of the elapsed time. */ double Timer::elapsed_time() { time_t now; time(&now); return difftime(now, start_time); } /*** * Return the total time that the timer has been in the "running" * state since it was first "started" or last "restarted". For * "short" time periods (less than an hour), the actual cpu time * used is reported instead of the elapsed time. * This function is the public version of elapsed_time() */ double Timer::get_elapsed_time() { return elapsed_time(); } /*** * Start a timer. If it is already running, let it continue running. * Print an optional message. */ void Timer::start(const char* msg) { // Print an optional message, something like "Starting timer t"; if (msg) VERBOSE(0, msg << std::endl); // Return immediately if the timer is already running if (running) return; // Change timer status to running running = true; // Set the start time; time(&start_time); } /*** * Turn the timer off and start it again from 0. Print an optional message. */ /* inline void Timer::restart(const char* msg) { // Print an optional message, something like "Restarting timer t"; if (msg) VERBOSE(0, msg << std::endl; // Set the timer status to running running = true; // Set the accumulated time to 0 and the start time to now acc_time = 0; start_clock = clock(); start_time = time(0); } */ /*** * Stop the timer and print an optional message. */ /* inline void Timer::stop(const char* msg) { // Print an optional message, something like "Stopping timer t"; check(msg); // Recalculate and store the total accumulated time up until now if (running) acc_time += elapsed_time(); running = false; } */ /*** * Print out an optional message followed by the current timer timing. */ void Timer::check(const char* msg) { // Print an optional message, something like "Checking timer t"; if (msg) VERBOSE(0, msg << " : "); VERBOSE(0, "[" << (running ? 
elapsed_time() : 0) << "] seconds\n"); } /*** * Allow timers to be printed to ostreams using the syntax 'os << t' * for an ostream 'os' and a timer 't'. For example, "cout << t" will * print out the total amount of time 't' has been "running". */ std::ostream& operator<<(std::ostream& os, Timer& t) { //os << std::setprecision(2) << std::setiosflags(std::ios::fixed) << (t.running ? t.elapsed_time() : 0); os << (t.running ? t.elapsed_time() : 0); return os; } irstlm-5.80.03/src/timer.h000644 000766 000024 00000001214 12013405172 017410 0ustar00nicolabertoldistaff000000 000000 #ifndef TIMER_H #define TIMER_H #include #include #include #include "util.h" class Timer { friend std::ostream& operator<<(std::ostream& os, Timer& t); private: bool running; time_t start_time; //TODO in seconds? double elapsed_time(); public: /*** * 'running' is initially false. A timer needs to be explicitly started * using 'start' or 'restart' */ Timer() : running(false), start_time(0) { } void start(const char* msg = 0); // void restart(const char* msg = 0); // void stop(const char* msg = 0); void check(const char* msg = 0); double get_elapsed_time(); }; #endif // TIMER_H irstlm-5.80.03/src/tlm.cpp000644 000766 000024 00000032701 12042554746 017442 0ustar00nicolabertoldistaff000000 000000 /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ using namespace std; #include #include #include #include "cmd.h" #include "mfstream.h" #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "n_gram.h" #include "mempool.h" #include "ngramtable.h" #include "interplm.h" #include "normcache.h" #include "ngramcache.h" #include "mdiadapt.h" #include "shiftlm.h" #include "linearlm.h" #include "mixture.h" #include "lmtable.h" #define NGRAM 1 #define SEQUENCE 2 #define ADAPT 3 #define TURN 4 #define TEXT 5 static Enum_T LmTypeEnum [] = { { (char*)"ModifiedShiftBeta", MOD_SHIFT_BETA }, { (char*)"msb", MOD_SHIFT_BETA }, { (char*)"InterpShiftBeta", SHIFT_BETA }, { (char*)"ShiftBeta", SHIFT_BETA }, { (char*)"sb", SHIFT_BETA }, { (char*)"InterpShiftOne", SHIFT_ONE }, { (char*)"ShiftOne", SHIFT_ONE }, { (char*)"s1", SHIFT_ONE }, { (char*)"LinearWittenBell", LINEAR_WB }, { (char*)"wb", LINEAR_WB }, { (char*)"LinearGoodTuring", LINEAR_GT }, { (char*)"Mixture", MIXTURE }, { (char*)"mix", MIXTURE }, END_ENUM }; static Enum_T InteractiveModeEnum [] = { { (char*)"Ngram", NGRAM }, { (char*)"Sequence", SEQUENCE }, { (char*)"Adapt", ADAPT }, { (char*)"Turn", TURN }, { (char*)"Text", TEXT }, { (char*)"Yes", NGRAM }, END_ENUM }; void print_help(int TypeFlag=0){ std::cerr << std::endl << "tlm - estimates a language model" << std::endl; std::cerr << std::endl << "USAGE:" << std::endl; std::cerr << " not yet available" << std::endl; std::cerr << std::endl << "DESCRIPTION:" << std::endl; std::cerr << " tlm is a tool for the estimation of language model" << std::endl; std::cerr << std::endl << "OPTIONS:" << std::endl; std::cerr << " -Help|-h this help" << std::endl; std::cerr << std::endl; FullPrintParams(TypeFlag, 0, 1, stderr); } void 
usage(const char *msg = 0) { if (msg){ std::cerr << msg << std::endl; } else{ print_help(); } exit(1); } int main(int argc, char **argv) { char *dictfile=NULL; char *trainfile=NULL; char *testfile=NULL; char *adaptfile=NULL; char *slminfo=NULL; char *imixpar=NULL; char *omixpar=NULL; char *BINfile=NULL; char *ARPAfile=NULL; bool SavePerLevel=true; //save-per-level or save-for-word char *ASRfile=NULL; char* scalefactorfile=NULL; int backoff=0; //back-off or interpolation int lmtype=0; int dub=0; //dictionary upper bound int size=0; //lm size int interactive=0; int statistics=0; bool prunefreq=false; bool prunesingletons=true; bool prunetopsingletons=false; double beta=-1; bool compsize=false; bool checkpr=false; double oovrate=0; int max_caching_level=0; char *outpr=NULL; bool memmap = false; //write binary format with/without memory map, default is 0 int adaptlevel=0; //adaptation level double adaptrate=1.0; bool adaptoov=false; //do not increment the dictionary bool help=false; DeclareParams((char*) "Back-off",CMDBOOLTYPE, &backoff, "bo",CMDBOOLTYPE, &backoff, "Dictionary", CMDSTRINGTYPE, &dictfile, "d", CMDSTRINGTYPE, &dictfile, "DictionaryUpperBound", CMDINTTYPE, &dub, "dub", CMDINTTYPE, &dub, "NgramSize", CMDSUBRANGETYPE, &size, 1, MAX_NGRAM, "n", CMDSUBRANGETYPE, &size, 1, MAX_NGRAM, "Ngram", CMDSTRINGTYPE, &trainfile, "TrainOn", CMDSTRINGTYPE, &trainfile, "tr", CMDSTRINGTYPE, &trainfile, "oASR", CMDSTRINGTYPE, &ASRfile, "oasr", CMDSTRINGTYPE, &ASRfile, "o", CMDSTRINGTYPE, &ARPAfile, "oARPA", CMDSTRINGTYPE, &ARPAfile, "oarpa", CMDSTRINGTYPE, &ARPAfile, "oBIN", CMDSTRINGTYPE, &BINfile, "obin", CMDSTRINGTYPE, &BINfile, "SavePerLevel",CMDBOOLTYPE, &SavePerLevel, "TestOn", CMDSTRINGTYPE, &testfile, "te", CMDSTRINGTYPE, &testfile, "AdaptOn", CMDSTRINGTYPE, &adaptfile, "ad", CMDSTRINGTYPE, &adaptfile, "AdaptRate",CMDDOUBLETYPE , &adaptrate, "ar", CMDDOUBLETYPE, &adaptrate, "AdaptLevel", CMDSUBRANGETYPE, &adaptlevel, 1 , MAX_NGRAM, "al",CMDSUBRANGETYPE , &adaptlevel, 
1, MAX_NGRAM, "AdaptOOV", CMDBOOLTYPE, &adaptoov, "ao", CMDBOOLTYPE, &adaptoov, "SaveScaleFactor", CMDSTRINGTYPE, &scalefactorfile, "ssf", CMDSTRINGTYPE, &scalefactorfile, "LanguageModelType",CMDENUMTYPE, &lmtype, LmTypeEnum, "lm",CMDENUMTYPE, &lmtype, LmTypeEnum, "Interactive",CMDENUMTYPE, &interactive, InteractiveModeEnum, "i",CMDENUMTYPE, &interactive, InteractiveModeEnum, "Statistics",CMDSUBRANGETYPE, &statistics, 1, 3, "s",CMDSUBRANGETYPE, &statistics, 1, 3, "PruneThresh",CMDSUBRANGETYPE, &prunefreq, 1, 1000, "p",CMDSUBRANGETYPE, &prunefreq, 1, 1000, "PruneSingletons",CMDBOOLTYPE, &prunesingletons, "ps",CMDBOOLTYPE, &prunesingletons, "PruneTopSingletons",CMDBOOLTYPE, &prunetopsingletons, "pts",CMDBOOLTYPE, &prunetopsingletons, "ComputeLMSize",CMDBOOLTYPE, &compsize, "sz",CMDBOOLTYPE, &compsize, "MaximumCachingLevel", CMDINTTYPE , &max_caching_level, "mcl", CMDINTTYPE, &max_caching_level, "MemoryMap", CMDBOOLTYPE, &memmap, "memmap", CMDBOOLTYPE, &memmap, "mm", CMDBOOLTYPE, &memmap, "CheckProb",CMDBOOLTYPE, &checkpr, "cp",CMDBOOLTYPE, &checkpr, "OutProb",CMDSTRINGTYPE, &outpr, "op",CMDSTRINGTYPE, &outpr, "SubLMInfo", CMDSTRINGTYPE, &slminfo, "slmi", CMDSTRINGTYPE, &slminfo, "SaveMixParam", CMDSTRINGTYPE, &omixpar, "smp", CMDSTRINGTYPE, &omixpar, "LoadMixParam", CMDSTRINGTYPE, &imixpar, "lmp", CMDSTRINGTYPE, &imixpar, "SetOovRate", CMDDOUBLETYPE, &oovrate, "or", CMDDOUBLETYPE, &oovrate, "Beta", CMDDOUBLETYPE, &beta, "beta", CMDDOUBLETYPE, &beta, "Help", CMDBOOLTYPE|CMDMSG, &help, "print this help", "h", CMDBOOLTYPE|CMDMSG, &help, "print this help", (char *)NULL ); if (argc == 1){ usage(); } GetParams(&argc, &argv, (char*) NULL); if (help){ usage(); } if (!lmtype || (!trainfile && lmtype!=MIXTURE)) { usage("Warning: Missing parameters"); } if (SavePerLevel == false && backoff == true){ cerr << "WARNING: Current implementation does not support the usage of backoff (-bo=true) mixture models (-lm=mix) combined with the per-word saving (-saveperllevel=false)." 
<< endl; cerr << "WARNING: The usage of backoff is disabled, i.e. -bo=no is forced" << endl; backoff=false; } mdiadaptlm *lm=NULL; switch (lmtype) { case SHIFT_BETA: if (beta==-1 || (beta<1.0 && beta>0)) lm=new shiftbeta(trainfile,size,prunefreq,beta,(backoff?SHIFTBETA_B:SHIFTBETA_I)); else { cerr << "ShiftBeta: beta must be >0 and <1\n"; exit(1); } break; case MOD_SHIFT_BETA: if (size>1) lm=new mshiftbeta(trainfile,size,prunefreq,(backoff?MSHIFTBETA_B:MSHIFTBETA_I)); else { cerr << "Modified Shift Beta requires size > 1!\n"; exit(1); } break; case SHIFT_ONE: lm=new shiftone(trainfile,size,prunefreq,(backoff?SIMPLE_B:SIMPLE_I)); break; case LINEAR_WB: lm=new linearwb(trainfile,size,prunefreq,(backoff?MSHIFTBETA_B:MSHIFTBETA_I)); break; case LINEAR_GT: cerr << "This LM is no more supported\n"; break; case MIXTURE: //temporary check: so far unable to proper handle this flag in sub LMs //no ngramtable is created lm=new mixture(SavePerLevel,slminfo,size,prunefreq,imixpar,omixpar); break; default: cerr << "not implemented yet\n"; return 1; }; if (dub < lm->dict->size()){ cerr << "dub (" << dub << ") is not set or too small. 
dub is re-set to the dictionary size (" << lm->dict->size() << ")" << endl; dub = lm->dict->size(); } lm->dub(dub); lm->create_caches(max_caching_level); cerr << "eventually generate OOV code\n"; lm->dict->genoovcode(); if (oovrate) lm->dict->setoovrate(oovrate); lm->save_per_level(SavePerLevel); lm->train(); //it never occurs that both prunetopsingletons and prunesingletons are true if (prunetopsingletons==true) { //keep most specific lm->prunetopsingletons(true); lm->prunesingletons(false); } else { lm->prunetopsingletons(false); if (prunesingletons==true) { lm->prunesingletons(true); } else { lm->prunesingletons(false); } } if (adaptoov) lm->dict->incflag(1); if (adaptfile) lm->adapt(adaptfile,adaptlevel,adaptrate); if (adaptoov) lm->dict->incflag(0); if (scalefactorfile) lm->savescalefactor(scalefactorfile); if (backoff) lm->compute_backoff(); if (size>lm->maxlevel()) { cerr << "lm size is too large\n"; exit(1); } if (!size) size=lm->maxlevel(); if (testfile) { cerr << "TLM: test ..."; lm->test(testfile,size,backoff,checkpr,outpr); if (adaptfile) ((mdiadaptlm *)lm)->get_zetacache()->stat(); cerr << "\n"; }; if (compsize) cout << "LM size " << (int)lm->netsize() << "\n"; if (interactive) { ngram ng(lm->dict); int nsize=0; cout.setf(ios::scientific); switch (interactive) { case NGRAM: cout << "> "; while(cin >> ng) { if (ng.wordp(size)) { cout << ng << " p=" << (double)log(lm->prob(ng,size)) << "\n"; ng.size=0; cout << "> "; } } break; case SEQUENCE: { char c; double p=0; cout << "> "; while(cin >> ng) { nsize=ng.sizeprob(ng,nsize)); cout << ng << " p=" << p << "\n"; while((c=cin.get())==' ') { cout << c; } cin.putback(c); //cout << "-" << c << "-"; if (c=='\n') { ng.size=0; cout << "> "; p=0; } } } break; case TURN: { int n=0; double lp=0; double oov=0; while(cin >> ng) { if (ng.size>0) { nsize=ng.sizeprob(ng,nsize)); n++; if (*ng.wordp(1) == lm->dict->oovcode()) oov++; } else { if (n>0) cout << n << " " << lp/(log(2.0) * n) << " " << oov/n << "\n"; n=0; lp=0; 
oov=0; } } break; } case TEXT: { int order; int n=0; double lp=0; double oov=0; while (!cin.eof()) { cin >> order; if (order>size) cerr << "Warning: order > lm size\n"; order=order>size?size:order; while (cin >> ng) { if (ng.size>0) { nsize=ng.sizeprob(ng,nsize)); n++; if (*ng.wordp(1) == lm->dict->oovcode()) oov++; } else { if (n>0) cout << n << " " << lp/(log(2.0)*n) << " " << oov/n << "\n"; n=0; lp=0; oov=0; if (ng.isym>0) break; } } } } break; case ADAPT: { if (backoff) { cerr << "This modality is not supported with backoff LMs\n"; exit(1); } char afile[50],tfile[50]; while (!cin.eof()) { cin >> afile >> tfile; system("echo > .tlmlock"); cerr << "interactive adaptation: " << afile << " " << tfile << "\n"; if (adaptoov) lm->dict->incflag(1); lm->adapt(afile,adaptlevel,adaptrate); if (adaptoov) lm->dict->incflag(0); if (scalefactorfile) lm->savescalefactor(scalefactorfile); if (ASRfile) lm->saveASR(ASRfile,backoff,dictfile); if (ARPAfile) lm->saveARPA(ARPAfile,backoff,dictfile); if (BINfile) lm->saveBIN(BINfile,backoff,dictfile,memmap); lm->test(tfile,size,checkpr); cout.flush(); system("rm .tlmlock"); } } break; } exit(1); } if (ASRfile) { cerr << "TLM: save lm (ASR)..."; lm->saveASR(ASRfile,backoff,dictfile); cerr << "\n"; } if (ARPAfile) { cerr << "TLM: save lm (ARPA)..."; lm->saveARPA(ARPAfile,backoff,dictfile); cerr << "\n"; } if (BINfile) { cerr << "TLM: save lm (binary)..."; lm->saveBIN(BINfile,backoff,dictfile,memmap); cerr << "\n"; } if (statistics) { cerr << "TLM: lm stat ..."; lm->lmstat(statistics); cerr << "\n"; } // lm->cache_stat(); cerr << "TLM: deleting lm ..."; delete lm; cerr << "\n"; exit(0); } irstlm-5.80.03/src/util.cpp000644 000766 000024 00000015445 12042554746 017631 0ustar00nicolabertoldistaff000000 000000 // $Id: util.cpp 363 2010-02-22 15:02:45Z mfederico $ /****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This 
library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #ifdef WIN32 #include #include #include #else #include #include #include #include #include #endif #include "timer.h" #include "util.h" using namespace std; string gettempfolder() { #ifdef _WIN32 char *tmpPath = getenv("TMP"); string str(tmpPath); if (str.substr(str.size() - 1, 1) != "\\") str += "\\"; return str; #else char *tmpPath = getenv("TMP"); if (!tmpPath || !*tmpPath) return "/tmp/"; string str(tmpPath); if (str.substr(str.size() - 1, 1) != "/") str += "/"; return str; #endif } string createtempName() { string tmpfolder = gettempfolder(); #ifdef _WIN32 char buffer[BUFSIZ]; //To check whether the following function open the stream as well //In this case it is mandatory to close it immediately ::GetTempFileNameA(tmpfolder.c_str(), "", 0, buffer); #else char buffer[tmpfolder.size() + 16]; strcpy(buffer, tmpfolder.c_str()); strcat(buffer, "dskbuff--XXXXXX"); int fd=mkstemp(buffer); close(fd); #endif return (string) buffer; } void createtempfile(mfstream &fileStream, string &filePath, std::ios_base::openmode flags) { filePath = createtempName(); fileStream.open(filePath.c_str(), flags); if (fileStream == 0) { perror("error creating file"); exit(4); } } void removefile(const std::string &filePath) { 
#ifdef _WIN32 ::DeleteFileA(filePath.c_str()); #else if (remove(filePath.c_str()) != 0) { perror("Error deleting file" ); exit(2); } #endif } inputfilestream::inputfilestream(const std::string &filePath) : std::istream(0), m_streambuf(0) { //check if file is readable std::filebuf* fb = new std::filebuf(); _good=(fb->open(filePath.c_str(), std::ios::in)!=NULL); if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz") { fb->close(); delete fb; m_streambuf = new gzfilebuf(filePath.c_str()); } else { m_streambuf = fb; } this->init(m_streambuf); } inputfilestream::~inputfilestream() { delete m_streambuf; m_streambuf = 0; } void inputfilestream::close() { } /* MemoryMap Management Code kindly provided by Fabio Brugnara, ITC-irst Trento. How to use it: - call MMap with offset and required size (psgz): pg->b = MMap(fd, rdwr,offset,pgsz,&g); - correct returned pointer with the alignment gap and save the gap: pg->b += pg->gap = g; - when releasing mapped memory, subtract the gap from the pointer and add the gap to the requested dimension Munmap(pg->b-pg->gap, pgsz+pg->gap, 0); */ void *MMap(int fd, int access, off_t offset, size_t len, off_t *gap) { void *p; int pgsz,g=0; #ifdef _WIN32 /* // code for windows must be checked HANDLE fh, mh; fh = (HANDLE)_get_osfhandle(fd); if(offset) { // bisogna accertarsi che l'offset abbia la granularita` //corretta, MAI PROVATA! 
SYSTEM_INFO si; GetSystemInfo(&si); g = *gap = offset % si.dwPageSize; } else if(gap) { *gap=0; } if(!(mh=CreateFileMapping(fh, NULL, PAGE_READWRITE, 0, len+g, NULL))) { return 0; } p = (char*)MapViewOfFile(mh, FILE_MAP_ALL_ACCESS, 0, offset-*gap, len+*gap); CloseHandle(mh); */ #else if(offset) { pgsz = sysconf(_SC_PAGESIZE); g = *gap = offset%pgsz; } else if(gap) { *gap=0; } p = mmap((void*)0, len+g, access, MAP_SHARED|MAP_FILE, fd, offset-g); if((long)p==-1L) { perror("mmap failed"); p=0; } #endif return p; } int Munmap(void *p,size_t len,int sync) { int r=0; #ifdef _WIN32 /* //code for windows must be checked if(sync) FlushViewOfFile(p, len); UnmapViewOfFile(p); */ #else cerr << "len = " << len << endl; cerr << "sync = " << sync << endl; cerr << "running msync..." << endl; if(sync) msync(p, len, MS_SYNC); cerr << "done. Running munmap..." << endl; if((r=munmap((void*)p, len))) { perror("munmap() failed"); } cerr << "done" << endl; #endif return r; } //global variable Timer g_timer; void ResetUserTime() { g_timer.start(); }; void PrintUserTime(const std::string &message) { g_timer.check(message.c_str()); } double GetUserTime() { return g_timer.get_elapsed_time(); } int parseWords(char *sentence, const char **words, int max) { char *word; int i = 0; const char *const wordSeparators = " \t\r\n"; for (word = strtok(sentence, wordSeparators); i < max && word != 0; i++, word = strtok(0, wordSeparators)) { words[i] = word; } if (i < max) { words[i] = 0; } return i; } //Load a LM as a text file. LM could have been generated either with the //IRST LM toolkit or with the SRILM Toolkit. In the latter we are not //sure that n-grams are lexically ordered (according to the 1-grams). //However, we make the following assumption: //"all successors of any prefix are sorted and written in contiguous lines!" 
//This method also loads files processed with the quantization //tool: qlm int parseline(istream& inp, int Order,ngram& ng,float& prob,float& bow) { const char* words[1+ LMTMAXLEV + 1 + 1]; int howmany; char line[MAX_LINE]; inp.getline(line,MAX_LINE); if (strlen(line)==MAX_LINE-1) { cerr << "parseline: input line exceed MAXLINE (" << MAX_LINE << ") chars " << line << "\n"; exit(1); } howmany = parseWords(line, words, Order + 3); if (!(howmany == (Order+ 1) || howmany == (Order + 2))) assert(howmany == (Order+ 1) || howmany == (Order + 2)); //read words ng.size=0; for (int i=1; i<=Order; i++) ng.pushw(strcmp(words[i],"")?words[i]:ng.dict->OOV()); //read logprob/code and logbow/code assert(sscanf(words[0],"%f",&prob)); if (howmany==(Order+2)) assert(sscanf(words[Order+1],"%f",&bow)); else bow=0.0; //this is log10prob=0 for implicit backoff return 1; } irstlm-5.80.03/src/util.h000644 000766 000024 00000003763 12042554746 017276 0ustar00nicolabertoldistaff000000 000000 // $Id: util.h 363 2010-02-22 15:02:45Z mfederico $ #ifndef IRSTLM_UTIL_H #define IRSTLM_UTIL_H #include #include #include "gzfilebuf.h" #include "n_gram.h" #define MAX(a,b) (((a)>(b))?(a):(b)) #define MIN(a,b) (((a)<(b))?(a):(b)) #define UNUSED(x) { (void) x; } #define _DEBUG_LEVEL 2 /** trace macros **/ /** verbose macros **/ #ifdef TRACE_ENABLE #define TRACE_ERR(str) { std::cerr << str; } #define VERBOSE(level,str) { if (_DEBUG_LEVEL){ if (_DEBUG_LEVEL >= level) { TRACE_ERR("DEBUG_LEVEL:" <<_DEBUG_LEVEL << " "); TRACE_ERR(str); } } } #define IFVERBOSE(level) if (_DEBUG_LEVEL) if (_DEBUG_LEVEL >= level) #else #define VERBOSE(level,str) { } #define IFVERBOSE(level) { } #endif #define LMTMAXLEV 20 #define MAX_LINE 100000 //0.000001 = 10^(-6) //0.000000000001 = 10^(-12) //1.000001 = 1+10^(-6) //1.000000000001 = 1+10^(-12) //0.999999 = 1-10^(-6) //0.999999999999 = 1-10^(-12) #define UPPER_SINGLE_PRECISION_OF_0 0.000001 #define UPPER_DOUBLE_PRECISION_OF_0 0.000000000001 #define UPPER_SINGLE_PRECISION_OF_1 
1.000001 #define LOWER_SINGLE_PRECISION_OF_1 0.999999 #define UPPER_DOUBLE_PRECISION_OF_1 1.000000000001 #define LOWER_DOUBLE_PRECISION_OF_1 0.999999999999 std::string gettempfolder(); std::string createtempName(); void createtempfile(mfstream &fileStream, std::string &filePath, std::ios_base::openmode flags); void removefile(const std::string &filePath); class inputfilestream : public std::istream { protected: std::streambuf *m_streambuf; bool _good; public: inputfilestream(const std::string &filePath); ~inputfilestream(); bool good() { return _good; } void close(); }; void *MMap(int fd, int access, off_t offset, size_t len, off_t *gap); int Munmap(void *p,size_t len,int sync); // A couple of utilities to measure access time void ResetUserTime(); void PrintUserTime(const std::string &message); double GetUserTime(); int parseWords(char *, const char **, int); int parseline(istream& inp, int Order,ngram& ng,float& prob,float& bow); #endif irstlm-5.80.03/src/verify-caching.cpp000644 000766 000024 00000004176 12116036445 021542 0ustar00nicolabertoldistaff000000 000000 // $Id: compile-lm.cpp 3677 2010-10-13 09:06:51Z bertoldi $ /****************************************************************************** IrstLM: IRST Language Model Toolkit, compile LM Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ using namespace std; #include #include #include #include "cmd.h" #include "util.h" #include "mdiadapt.h" #include "lmContainer.h" /********************************/ void print_help(int TypeFlag=0){ std::cerr << std::endl << "verify_caching - verify whether caching is enabled or disabled" << std::endl; std::cerr << std::endl << "USAGE:" << std::endl; std::cerr << " verify_caching" << std::endl; std::cerr << std::endl << "DESCRIPTION:" << std::endl; std::cerr << std::endl << "OPTIONS:" << std::endl; FullPrintParams(TypeFlag, 0, 1, stderr); } void usage(const char *msg = 0) { if (msg) { std::cerr << msg << std::endl; } if (!msg){ print_help(); } exit(1); } int main(int argc, char **argv) { lmContainer* lmC; if (lmC->is_cache_enabled()){ std::cout << " caching is ENABLED" << std::endl; }else{ std::cout << " caching is DISABLED" << std::endl; } mdiadaptlm* lm; if (lm->is_train_cache_enabled()){ std::cout << " train-caching is ENABLED" << std::endl; }else{ std::cout << " train-caching is DISABLED" << std::endl; } } irstlm-5.80.03/scripts/add-start-end.sh000755 000766 000024 00000001071 12032513324 022006 0ustar00nicolabertoldistaff000000 000000 #! /bin/bash function usage() { cmnd=$(basename $0); cat<&2; exit 0; ;; esac done #adds sentence start/end symbols to standard input and #trims words longer than 80 characters (sed 's/^/ /' | sed 's/$/ <\/s>/';) |\ sed 's/\([^ ]\{80\}\)\([^ ]\{1,\}\)/\1/g' irstlm-5.80.03/scripts/build-lm-qsub.sh000755 000766 000024 00000017366 12127567154 022071 0ustar00nicolabertoldistaff000000 000000 #! 
/bin/bash function usage() { cmnd=$(basename $0); cat<", and any other) -s Smoothing methods: witten-bell (default), kneser-ney (approximated kneser-ney), improved-kneser-ney -b Include sentence boundary n-grams (optional) -d Define subdictionary for n-grams (optional) -v Verbose EOF } hostname=`uname -n` if [ $hostname == "voxgate" ] ; then echo "voxgate can not be used as submission host" echo "use any other cluster machine" exit fi if [ ! $IRSTLM ]; then echo "Set IRSTLM environment variable with path to irstlm" exit 2; fi #paths to scripts and commands in irstlm scr=$IRSTLM/bin bin=$IRSTLM/bin gzip=`which gzip 2> /dev/null`; gunzip=`which gunzip 2> /dev/null`; #check irstlm installation if [ ! -e $bin/dict -o ! -e $scr/split-dict.pl ]; then echo "$IRSTLM does not contain a proper installation of IRSTLM" exit 3; fi #default parameters logfile=/dev/null tmpdir=stat_$$ order=3 parts=3 inpfile=""; outfile="" verbose=""; smoothing="--witten-bell"; prune=""; boundaries=""; dictionary=""; uniform="-f=y"; queueparameters="" while getopts “hvi:o:n:k:t:s:q:pbl:d:u” OPTION do case $OPTION in h) usage exit 0 ;; v) verbose="--verbose"; ;; i) inpfile=$OPTARG ;; d) dictionary="-sd=$OPTARG" ;; u) uniform=" " ;; o) outfile=$OPTARG ;; n) order=$OPTARG ;; k) parts=$OPTARG ;; t) tmpdir=$OPTARG ;; s) smoothing=$OPTARG case $smoothing in witten-bell) smoothing="--witten-bell" ;; kneser-ney) smoothing="--kneser-ney" ;; improved-kneser-ney) smoothing="--improved-kneser-ney" ;; *) echo "wrong smoothing setting"; exit 4; esac ;; p) prune='--prune-singletons'; ;; q) queueparameters=$OPTARG; ;; b) boundaries='--cross-sentence'; ;; l) logfile=$OPTARG ;; ?) usage exit ;; esac done if [ $verbose ]; then echo inpfile=\"$inpfile\" outfile=$outfile order=$order parts=$parts tmpdir=$tmpdir prune=$prune smoothing=$smoothing dictionary=$dictionary verbose=$verbose fi if [ ! "$inpfile" -o ! "$outfile" ]; then usage exit 5 fi if [ -e $outfile ]; then echo "Output file $outfile already exists! 
either remove or rename it." exit 6; fi if [ -e $logfile -a $logfile != "/dev/null" -a $logfile != "/dev/stdout"]; then echo "Logfile $logfile already exists! either remove or rename it." exit 7; fi #check tmpdir tmpdir_created=0; if [ ! -d $tmpdir ]; then echo "Temporary directory $tmpdir does not exist"; echo "creating $tmpdir"; mkdir -p $tmpdir; tmpdir_created=1; else echo "Cleaning temporary directory $tmpdir"; rm $tmpdir 2> /dev/null if [ $? != 0 ]; then echo "Warning: some temporary files could not be removed" fi fi workingdir=`pwd | perl -pe 's/\/nfsmnt//g'` cd $workingdir qsubout="$workingdir/DICT-OUT$$" qsuberr="$workingdir/DICT-ERR$$" qsublog="$workingdir/DICT-LOG$$" qsubname="DICT" (\ qsub $queueparameters -b no -sync yes -o $qsubout -e $qsuberr -N $qsubname << EOF cd $workingdir echo exit status $? echo "Extracting dictionary from training corpus" $bin/dict -i="$inpfile" -o=$tmpdir/dictionary $uniform -sort=no echo exit status $? echo "Splitting dictionary into $parts lists" $scr/split-dict.pl --input $tmpdir/dictionary --output $tmpdir/dict. --parts $parts echo exit status $? EOF ) 2>&1 > $qsublog unset suffix #getting list of suffixes for file in `ls $tmpdir/dict.*` ; do sfx=`echo $file | perl -pe 's/^.+\.(\d+)$/$1/'` suffix[${#suffix[@]}]=$sfx done qsubout="$workingdir/NGT-OUT$$" qsuberr="$workingdir/NGT-ERR$$" qsublog="$workingdir/NGT-LOG$$" qsubname="NGT" unset getpids echo "Extracting n-gram statistics for each word list" echo "Important: dictionary must be ordered according to order of appearance of words in data" echo "used to generate n-gram blocks, so that sub language model blocks results ordered too" for sfx in ${suffix[@]} ; do (\ qsub $queueparameters -b no -j yes -sync no -o $qsubout.$sfx -e $qsuberr.$sfx -N $qsubname-$sfx << EOF cd $workingdir echo exit status $? 
$bin/ngt -i="$inpfile" -n=$order -gooout=y -o="$gzip -c > $tmpdir/ngram.dict.${sfx}.gz" -fd="$tmpdir/dict.${sfx}" $dictionary -iknstat="$tmpdir/ikn.stat.dict.${sfx}" echo exit status $? echo EOF ) 2>&1 > $qsublog.$sfx id=`cat $qsublog.$sfx | grep 'Your job' | awk '{print $3}'` sgepid[${#sgepid[@]}]=$id done waiting="" for id in ${sgepid[@]} ; do waiting="$waiting -hold_jid $id" ; done qsub $queueparameters -sync yes $waiting -j y -o /dev/null -e /dev/null -N $qsubname.W -b y /bin/ls 2>&1 > $qsubname.W.log rm $qsubname.W.log qsubout="$workingdir/SUBLM-OUT$$" qsuberr="$workingdir/SUBLM-ERR$$" qsublog="$workingdir/SUBLM-LOG$$" qsubname="SUBLM" unset getpids echo "Estimating language models for each word list" if [ $smoothing = "--kneser-ney" -o $smoothing = "--improved-kneser-ney" ]; then for sfx in ${suffix[@]} ; do (\ qsub $queueparameters -b no -j yes -sync no -o $qsubout.$sfx -e $qsuberr.$sfx -N $qsubname-$sfx << EOF cd $workingdir echo exit status $? $scr/build-sublm.pl $verbose $prune $smoothing "cat $tmpdir/ikn.stat.dict*" --size $order --ngrams "$gunzip -c $tmpdir/ngram.dict.${sfx}.gz" -sublm $tmpdir/lm.dict.${sfx} echo exit status $? echo EOF ) 2>&1 > $qsublog.$sfx id=`cat $qsublog.$sfx | grep 'Your job' | awk '{print $3}'` sgepid[${#sgepid[@]}]=$id done else for sfx in ${suffix[@]} ; do (\ qsub $queueparameters -b no -j yes -sync no -o $qsubout.$sfx -e $qsuberr.$sfx -N $qsubname-$sfx << EOF cd $workingdir echo exit status $? 
$scr/build-sublm.pl $verbose $prune $smoothing --size $order --ngrams "$gunzip -c $tmpdir/ngram.dict.${sfx}.gz" -sublm $tmpdir/lm.dict.${sfx} echo EOF ) 2>&1 > $qsublog.$sfx id=`cat $qsublog.$sfx | grep 'Your job' | awk '{print $3}'` sgepid[${#sgepid[@]}]=$id done fi waiting="" for id in ${sgepid[@]} ; do waiting="$waiting -hold_jid $id" ; done qsub $queueparameters -sync yes $waiting -o /dev/null -e /dev/null -N $qsubname.W -b yes /bin/ls 2>&1 > $qsubname.W.log rm $qsubname.W.log echo "Merging language models into $outfile" qsubout="$workingdir/MERGE-OUT$$" qsuberr="$workingdir/MERGE-ERR$$" qsublog="$workingdir/MERGE-LOG$$" qsubname="MERGE" (\ qsub $queueparameters -b no -j yes -sync yes -o $qsubout -e $qsuberr -N $qsubname << EOF cd $workingdir $scr/merge-sublm.pl --size $order --sublm $tmpdir/lm.dict -lm $outfile EOF ) 2>&1 > $qsublog echo "Cleaning temporary directory $tmpdir"; rm $tmpdir/* 2> /dev/null rm $qsubout* $qsuberr* $qsublog* 2> /dev/null if [ $tmpdir_created -eq 1 ]; then echo "Removing temporary directory $tmpdir"; rmdir $tmpdir 2> /dev/null if [ $? != 0 ]; then echo "Warning: the temporary directory could not be removed." fi fi exit 0 irstlm-5.80.03/scripts/build-lm.sh000755 000766 000024 00000012710 12032513324 021066 0ustar00nicolabertoldistaff000000 000000 #! /bin/bash set -m # Enable Job Control function usage() { cmnd=$(basename $0); cat< /dev/null`; gunzip=`which gunzip 2> /dev/null`; #check irstlm installation if [ ! -e $bin/dict -o ! 
-e $scr/split-dict.pl ]; then echo "$IRSTLM does not contain a proper installation of IRSTLM" exit 3 fi #default parameters logfile=/dev/null tmpdir=stat_$$ order=3 parts=3 inpfile=""; outfile="" verbose=""; smoothing="--witten-bell"; prune=""; boundaries=""; dictionary=""; uniform="-f=y"; while getopts “hvi:o:n:k:t:s:pbl:d:u” OPTION do case $OPTION in h) usage exit 0 ;; v) verbose="--verbose"; ;; i) inpfile=$OPTARG ;; d) dictionary="-sd=$OPTARG" ;; u) uniform=" " ;; o) outfile=$OPTARG ;; n) order=$OPTARG ;; k) parts=$OPTARG ;; t) tmpdir=$OPTARG ;; s) smoothing=$OPTARG case $smoothing in witten-bell) smoothing="--witten-bell" ;; kneser-ney) smoothing="--kneser-ney" ;; improved-kneser-ney) smoothing="--improved-kneser-ney" ;; *) echo "wrong smoothing setting"; exit 4 esac ;; p) prune='--prune-singletons'; ;; b) boundaries='--cross-sentence'; ;; l) logfile=$OPTARG ;; ?) usage exit 1 ;; esac done if [ $verbose ];then echo inpfile=\"$inpfile\" outfile=$outfile order=$order parts=$parts tmpdir=$tmpdir prune=$prune smoothing=$smoothing dictionary=$dictionary verbose=$verbose fi if [ ! "$inpfile" -o ! "$outfile" ]; then usage exit 5 fi if [ -e $outfile ]; then echo "Output file $outfile already exists! either remove or rename it." exit 6 fi if [ -e $logfile -a $logfile != "/dev/null" -a $logfile != "/dev/stdout" ]; then echo "Logfile $logfile already exists! either remove or rename it." exit 7 fi #check tmpdir tmpdir_created=0; if [ ! -d $tmpdir ]; then echo "Temporary directory $tmpdir does not exist"; echo "creating $tmpdir"; mkdir -p $tmpdir; tmpdir_created=1; else echo "Cleaning temporary directory $tmpdir"; rm $tmpdir/* 2> /dev/null if [ $? != 0 ]; then echo "Warning: some temporary files could not be removed" fi fi echo "Extracting dictionary from training corpus" $bin/dict -i="$inpfile" -o=$tmpdir/dictionary $uniform -sort=no 2> $logfile echo "Splitting dictionary into $parts lists" $scr/split-dict.pl --input $tmpdir/dictionary --output $tmpdir/dict. 
--parts $parts >> $logfile 2>&1 echo "Extracting n-gram statistics for each word list" echo "Important: dictionary must be ordered according to order of appearance of words in data" echo "used to generate n-gram blocks, so that sub language model blocks results ordered too" for sdict in $tmpdir/dict.*;do sdict=`basename $sdict` echo $sdict; $bin/ngt -i="$inpfile" -n=$order -gooout=y -o="$gzip -c > $tmpdir/ngram.${sdict}.gz" -fd="$tmpdir/$sdict" $dictionary -iknstat="$tmpdir/ikn.stat.$sdict" >> $logfile 2>&1 & done # Wait for all parallel jobs to finish while [ 1 ]; do fg 2> /dev/null; [ $? == 1 ] && break; done echo "Estimating language models for each word list" for sdict in `ls $tmpdir/dict.*` ; do sdict=`basename $sdict` echo $sdict; if [ $smoothing = "--kneser-ney" -o $smoothing = "--improved-kneser-ney" ]; then $scr/build-sublm.pl $verbose $prune $smoothing "cat $tmpdir/ikn.stat.dict.*" --size $order --ngrams "$gunzip -c $tmpdir/ngram.${sdict}.gz" -sublm $tmpdir/lm.$sdict >> $logfile 2>&1 & else $scr/build-sublm.pl $verbose $prune $smoothing --size $order --ngrams "$gunzip -c $tmpdir/ngram.${sdict}.gz" -sublm $tmpdir/lm.$sdict >> $logfile 2>&1 & fi done # Wait for all parallel jobs to finish while [ 1 ]; do fg 2> /dev/null; [ $? == 1 ] && break; done echo "Merging language models into $outfile" $scr/merge-sublm.pl --size $order --sublm $tmpdir/lm.dict -lm $outfile >> $logfile 2>&1 echo "Cleaning temporary directory $tmpdir"; rm $tmpdir/* 2> /dev/null if [ $tmpdir_created -eq 1 ]; then echo "Removing temporary directory $tmpdir"; rmdir $tmpdir 2> /dev/null if [ $? != 0 ]; then echo "Warning: the temporary directory could not be removed." fi fi exit 0 irstlm-5.80.03/scripts/build-sublm.pl000755 000766 000024 00000030626 12032513324 021607 0ustar00nicolabertoldistaff000000 000000 #! 
/usr/bin/perl #***************************************************************************** # IrstLM: IRST Language Model Toolkit # Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #****************************************************************************** #first pass: read dictionary and generate 1-grams #second pass: #for n=2 to N # foreach n-1-grams # foreach n-grams with history n-1 # compute smoothing statistics # store successors # compute back-off probability # compute smoothing probability # write n-1 gram with back-off prob # write all n-grams with smoothed probability use strict; use Getopt::Long "GetOptions"; use File::Basename; my $gzip=`which gzip 2> /dev/null`; my $gunzip=`which gunzip 2> /dev/null`; chomp($gzip); chomp($gunzip); my $cutoffword=""; #special word for Google 1T-ngram cut-offs my $cutoffvalue=39; #cut-off threshold for Google 1T-ngram cut-offs #set defaults for optional parameters my ($verbose,$size,$ngrams,$sublm)=(0, 0, undef, undef); my ($witten_bell,$good_turing,$kneser_ney,$improved_kneser_ney)=(0, 0, "", ""); my ($witten_bell_flag,$good_turing_flag,$kneser_ney_flag,$improved_kneser_ney_flag)=(0, 0, 0, 0); my ($freqshift,$prune_singletons,$cross_sentence)=(0, 0, 0); my $help = 0; $help = 1 unless 
&GetOptions('size=i' => \$size, 'freq-shift=i' => \$freqshift, 'ngrams=s' => \$ngrams, 'sublm=s' => \$sublm, 'witten-bell' => \$witten_bell, 'good-turing' => \$good_turing, 'kneser-ney=s' => \$kneser_ney, 'improved-kneser-ney=s' => \$improved_kneser_ney, 'prune-singletons' => \$prune_singletons, 'cross-sentence' => \$cross_sentence, 'h|help' => \$help, 'verbose' => \$verbose); if ($help || !$size || !$ngrams || !$sublm) { my $cmnd = basename($0); print "\n$cmnd - estimates single LMs\n", "\nUSAGE:\n", " $cmnd [options]\n", "\nOPTIONS:\n", " --size maximum n-gram size for the language model\n", " --ngrams input file or command to read the ngram table\n", " --sublm output file prefix to write the sublm statistics \n", " --freq-shift (optional) value to be subtracted from all frequencies\n", " --witten-bell (optional) use witten bell linear smoothing (default)\n", " --kneser-ney (optional) use kneser-ney smoothing with statistics in \n", " --improved-kneser-ney (optional) use improved kneser-ney smoothing with statistics in \n", " --good-turing (optional) use good-turing linear smoothing\n", " --prune-singletons (optional) remove n-grams occurring once, for n=3,4,5,... 
(disabled by default)\n", " --cross-sentence (optional) include cross-sentence bounds (disabled by default)\n", " --verbose (optional) print debugging info\n", " -h, --help (optional) print these instructions\n", "\n"; exit(1); } $witten_bell_flag = 1 if ($witten_bell); $good_turing_flag = 1 if ($good_turing); $kneser_ney_flag = 1 if ($kneser_ney); $improved_kneser_ney_flag = 1 if ($improved_kneser_ney); $witten_bell = $witten_bell_flag = 1 if ($witten_bell_flag + $kneser_ney_flag + $improved_kneser_ney_flag + $good_turing_flag) == 0; warn "build-sublm: size $size ngrams $ngrams sublm $sublm witten-bell $witten_bell kneser-ney $kneser_ney improved-kneser-ney $improved_kneser_ney good-turing $good_turing prune-singletons $prune_singletons cross-sentence $cross_sentence\n" if $verbose; die "build-sublm: value of --size must be larger than 0\n" if $size<1; die "build-sublm: choose only one smoothing method\n" if ($witten_bell_flag + $kneser_ney_flag + $improved_kneser_ney_flag + $good_turing_flag) > 1; my $log10=log(10.0); #service variable to convert log into log10 my $oldwrd=""; #variable to check if 1-gram changed my @cnt=(); #counter of n-grams my $totcnt=0; #total counter of n-grams my ($ng,@ng); #read ngrams my $ngcnt=0; #store ngram frequency my $n; warn "Collecting 1-gram counts\n"; open(INP,"$ngrams") || open(INP,"$ngrams|") || die "cannot open $ngrams\n"; open(GR,"|$gzip -c >${sublm}.1gr.gz") || die "cannot create ${sublm}.1gr.gz\n"; while ($ng=) { chomp($ng); @ng=split(/[ \t]+/,$ng); $ngcnt=(pop @ng) - $freqshift; if ($oldwrd ne $ng[0]) { printf (GR "%s %s\n",$totcnt,$oldwrd) if $oldwrd ne ''; $totcnt=0;$oldwrd=$ng[0]; } #update counter $totcnt+=$ngcnt; } printf GR "%s %s\n",$totcnt,$oldwrd; close(INP); close(GR); my (@h,$h,$hpr); #n-gram history my (@dict,$code); #sorted dictionary of history successors my ($diff,$singlediff,$diff1,$diff2,$diff3); #different successors of history my (@n1,@n2,@n3,@n4,@uno3); #IKN: n-grams occurring once or twice ... 
my (@beta,$beta); #IKN: n-grams occurring once or twice ... my $locfreq; #collect global statistics for (Improved) Kneser-Ney smoothing if ($kneser_ney || $improved_kneser_ney) { my $statfile=$kneser_ney || $improved_kneser_ney; warn "load \& merge IKN statistics from $statfile \n"; open(IKN,"$statfile") || open(IKN,"$statfile|") || die "cannot open $statfile\n"; while () { my($lev,$n1,$n2,$n3,$n4,$uno3)=$_=~/level: (\d+) n1: (\d+) n2: (\d+) n3: (\d+) n4: (\d+) unover3: (\d+)/; $n1[$lev]+=$n1;$n2[$lev]+=$n2;$n3[$lev]+=$n3;$n4[$lev]+=$n4;$uno3[$lev]+=$uno3; } for (my $lev=1;$lev<=$#n1;$lev++) { warn "level $lev: $n1[$lev] $n2[$lev] $n3[$lev] $n4[$lev] $uno3[$lev]\n"; } close(IKN); } warn "Computing n-gram probabilities:\n"; foreach ($n=2;$n<=$size;$n++) { $code=-1;@cnt=(); @dict=(); $totcnt=0;$diff=0; $singlediff=1; $diff1=0; $diff2=0; $diff3=0; $oldwrd=""; #compute smothing statistics my (@beta,$beta); if ($kneser_ney) { if ($n1[$n]==0 || $n2[$n]==0) { warn "Error in Kneser-Ney smoothing statistics: resorting to Witten-Bell\n"; $beta=0; } else { $beta=$n1[$n]/($n1[$n] + 2 * $n2[$n]); warn "beta $n: $beta\n"; } } if ($improved_kneser_ney) { my $Y=$n1[$n]/($n1[$n] + 2 * $n2[$n]); if ($n3[$n] == 0 || $n4[$n] == 0 || $n2[$n] <= $n3[$n] || $n3[$n] <= $n4[$n]) { warn "Warning: higher order count-of-counts are wrong\n"; warn "Fixing this problem by resorting only on the lower order count-of-counts\n"; $beta[1] = $Y; $beta[2] = $Y; $beta[3] = $Y; } else { $beta[1] = 1 - 2 * $Y * $n2[$n] / $n1[$n]; $beta[2] = 2 - 3 * $Y * $n3[$n] / $n2[$n]; $beta[3] = 3 - 4 * $Y * $n4[$n] / $n3[$n]; } } open(HGR,"$gunzip -c ${sublm}.".($n-1)."gr.gz|") || die "cannot open ${sublm}.".($n-1)."gr.gz\n"; open(INP,"$ngrams") || open(INP,"$ngrams|") || die "cannot open $ngrams\n"; open(GR,"|$gzip -c >${sublm}.${n}gr.gz"); open(NHGR,"|$gzip -c > ${sublm}.".($n-1)."ngr.gz") || die "cannot open ${sublm}.".($n-1)."ngr.gz"; my $ngram; my ($reduced_h, $reduced_ng) = ("", ""); $ng=; chomp($ng); 
@ng=split(/[ \t]+/,$ng); $ngcnt=(pop @ng) - $freqshift; $h=; chomp($h); @h=split(/ +/,$h); $hpr=shift @h; $reduced_ng=join(" ",@ng[0..$n-2]); $reduced_h=join(" ",@h[0..$n-2]); @cnt=(); @dict=(); $code=-1; $totcnt=0; $diff=0; $singlediff=1; $diff1=0; $diff2=0; $diff3=0; $oldwrd=""; do{ #load all n-grams starting with history h, and collect useful statistics while ($reduced_h eq $reduced_ng){ #must be true the first time! #print join(" ",@h[0..$n-2]),"--",join(" ",@ng[0..$n-1]),"--\n"; #print "oldwrd $oldwrd -- code $code\n"; if ($oldwrd ne $ng[$n-1]) { #could this be otherwise? [Marcello 22/5/09] $dict[++$code]=$oldwrd=$ng[$n-1]; $diff++; $singlediff++ if $ngcnt==1; } if ($diff>1 && $ng[$n-1] eq $cutoffword) { # in google n-grams #find estimates for remaining diff and singlediff #proportional estimate $diff--; #remove cutoffword my $concentration=1.0-($diff-1)/$totcnt; my $mass=1; #$totcnt/($totcnt+$ngcnt); my $index=(1-($concentration * $mass))/(1-1/$cutoffvalue) + (1/$cutoffvalue); my $cutoffdiff=int($ngcnt * $index); $cutoffdiff=1 if $cutoffdiff==0; #print "diff $diff $totcnt cutofffreq $ngcnt -- cutoffdiff: $cutoffdiff\n"; #print "concentration:",$concentration," mass:", $mass,"\n"; $diff+=$cutoffdiff; } $cnt[$code]+=$ngcnt; $totcnt+=$ngcnt; $ng=; if (defined($ng)){ chomp($ng); @ng=split(/[ \t]+/,$ng);$ngcnt=(pop @ng) - $freqshift; $reduced_ng=join(" ",@ng[0..$n-2]); }else{ last; } } if ($improved_kneser_ney) { for (my $c=0;$c<=$code;$c++) { $diff1++ if $cnt[$c]==1; $diff2++ if $cnt[$c]==2; $diff3++ if $cnt[$c]>=3; } } #print smoothed probabilities my $boprob=0; #accumulate pruned probabilities my $prob=0; for (my $c=0;$c<=$code;$c++) { if ($kneser_ney && $beta>0) { $prob=($cnt[$c]-$beta)/$totcnt; } elsif ($improved_kneser_ney) { my $b=($cnt[$c]>= 3? 
$beta[3]:$beta[$cnt[$c]]); $prob=($cnt[$c] - $b)/$totcnt; } elsif ($good_turing && $singlediff>0) { $prob=$cnt[$c]/($totcnt+$singlediff); } else { $prob=$cnt[$c]/($totcnt+$diff); } $ngram=join(" ",$reduced_h,$dict[$c]); #rm singleton n-grams for (n>=3), if flag is active #rm n-grams (n>=2) containing cross-sentence boundaries, if flag is not active #rm n-grams containing or except for 1-grams #warn "considering $size $n |$ngram|\n"; if (($prune_singletons && $n>=3 && $cnt[$c]==1) || (!$cross_sentence && $n>=2 && &CrossSentence($ngram)) || ($dict[$c]=~//i) || ($n>=2 && $h=~//i) || ($dict[$c] eq $cutoffword) ) { $boprob+=$prob; if ($n<$size) { #output this anyway because it will be an history for n+1 printf GR "%f %s %s\n",-10000,$reduced_h,$dict[$c]; } } else { # print unpruned n-1 gram printf(GR "%f %s %s\n",log($prob)/$log10,$reduced_h,$dict[$c]); } } #rewrite history including back-off weight print "$reduced_h --- $h --- $reduced_ng --- $ng --- $totcnt $diff \n" if $totcnt+$diff==0 && defined($ng); #check if history has to be pruned out if ($hpr==-10000) { #skip this history } elsif ($kneser_ney && $beta>0) { printf NHGR "%s %f\n",$h,log($boprob+($beta * $diff/$totcnt))/$log10; } elsif ($improved_kneser_ney) { my $lambda=($beta[1] * $diff1 + $beta[2] * $diff2 + $beta[3] * $diff3)/$totcnt; printf NHGR "%s %f\n",$h,log($boprob+$lambda)/$log10; } elsif ($good_turing && $singlediff>0) { printf NHGR "%s %f\n",$h,log($boprob+($singlediff/($totcnt+$singlediff)))/$log10; } else { printf NHGR "%s %f\n",$h,log($boprob+($diff/($totcnt+$diff)))/$log10; } #reset smoothing statistics $code=-1;@cnt=(); @dict=(); $totcnt=0;$diff=0;$singlediff=0;$oldwrd="";$diff1=0;$diff2=0;$diff3=0;$locfreq=0; #read next history $h=; if (defined($h)){ chomp($h); @h=split(/ +/,$h); $hpr=shift @h; $reduced_h=join(" ",@h[0..$n-2]); }else{ die "ERROR: Somehing could be wrong: history are terminated before ngrams!" 
if defined($ng);
    }

  }until (!defined($ng)); #n-grams are over

  close(HGR); close(INP); close(GR); close(NHGR);
  rename("${sublm}.".($n-1)."ngr.gz","${sublm}.".($n-1)."gr.gz");
}

# Check whether an n-gram contains a cross-sentence boundary.
# This happens if
#   either <s> occurs anywhere but in the first position
#   or </s> occurs anywhere but in the last position.
#
# Argument: the n-gram as a single string of space-separated words.
# Returns:  1 if the n-gram crosses a sentence boundary, 0 otherwise.
#
# The sub is declared without a prototype: it takes one argument, and the
# callers in this script invoke it as &CrossSentence($ngram).
sub CrossSentence {
  my ($ngram) = @_;

  # warn "check CrossSentence |$ngram|\n";

  # " <s>"  : a sentence-start marker preceded by another word
  # "</s> " : a sentence-end marker followed by another word
  # (a bare / / here would match every n-gram with two or more words)
  if (($ngram =~ / <s>/i) || ($ngram =~ /<\/s> /i)) {
    # warn "delete $ngram\n";
    return 1;
  }
  return 0;
}
irstlm-5.80.03/scripts/goograms2ngrams.pl000755 000766 000024 00000011312 12032513324 022467 0ustar00nicolabertoldistaff000000 000000 #! /usr/bin/perl #***************************************************************************** # IrstLM: IRST Language Model Toolkit # Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
# You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #****************************************************************************** #transforms google n-grams into real n-grams so that counts are #consistent with respect to lower order n-grams use strict; use Getopt::Long "GetOptions"; my $gzip=`which gzip 2> /dev/null`; my $gunzip=`which gunzip 2> /dev/null`; chomp($gzip); chomp($gunzip); my $cutoffword=""; #special word for Google 1T-ngram cut-offs my $blocksize=10000000; #this is the blocksize of produced n-grams my $from=2; #starting n-gram level my($help,$verbose,$maxsize,$googledir,$ngramdir)=(); $help=1 unless &GetOptions('maxsize=i' => \$maxsize, 'startfrom=i' => \$from, 'googledir=s' => \$googledir, 'ngramdir=s' => \$ngramdir, 'h|help' => \$help, 'verbose' => \$verbose); if ($help || !$maxsize || !$googledir || !$ngramdir ) { my $cmnd = "goograms2ngrams.pl"; print "\n$cmnd - transforms google n-grams into real n-grams so that\n", " counts are consistent with respect to lower order n-grams\n", "\nUSAGE:\n", " $cmnd [options]\n", "\nOPTIONS:\n", " --maxsize maximum n-gram level of conversion\n", " --startfrom skip initial levels if already available (default 2)\n", " --googledir directory containing the google-grams dirs (1gms,2gms,...)\n", " --ngramdir directory where to write the n-grams \n", " --verbose (optional) very talktive output\n", " -h, --help (optional) print these instructions\n", "\n"; exit(1); } warn "goograms2ngrams: maxsize $maxsize from $from googledir $googledir ngramdir $ngramdir \n" if $verbose; die "goograms2ngrams: value of --maxsize must be between 2 and 5\n" if $maxsize<2 || $maxsize>5; die "goograms2ngrams: cannot find --googledir $googledir \n" if ! -d $googledir; die "goograms2ngrams: cannot find --ngramdir $ngramdir \n" if ! 
-d $ngramdir; my ($n,$hgrams,$ggrams,$ngrams)=(); my ($ggr,$hgr,$hgrcnt,$ggrcnt,$totggrcnt)=(); my (@ggr,@hgr)=(); foreach ($n=$from;$n<=$maxsize;$n++){ my $counter=0; warn "Converting google-$n-grams into $n-gram\n"; $hgrams=($n==2?"${googledir}/1gms/vocab.gz":"${ngramdir}/".($n-1)."grams-*.gz"); open(HGR,"$gunzip -c $hgrams |") || die "cannot open $hgrams\n"; $ggrams="${googledir}/".($n)."gms/".($n)."gm-*"; open(GGR,"$gunzip -c $ggrams |") || die "cannot open $ggrams\n"; my $id = sprintf("%04d", 0); $ngrams="${ngramdir}/".($n)."grams-${id}.gz"; next if -e $ngrams; #go to next step if file exists already; open(NGR,"|$gzip -c > $ngrams ") || die "cannot open $ngrams\n"; chop($ggr=); @ggr=split(/[ \t]/,$ggr);$ggrcnt=(pop @ggr); #warn "ggr: ",$ggrcnt," ",join(" ",@ggr[0..$n-1]),"\n"; while ($hgr=){ $counter++; printf(STDERR ".") if ($counter % 1000000)==0; chop($hgr); @hgr=split(/[ \t]/,$hgr); $hgrcnt=(pop @hgr); #warn "hgr: ",$hgrcnt," ",join(" ",@hgr[0..$n-2]),"\n"; if (join(" ",@hgr[0..$n-2]) eq join(" ",@ggr[0..$n-2])){ $totggrcnt=0; do{ $totggrcnt+=$ggrcnt; print NGR join(" ",@ggr[0..$n-1])," ",$ggrcnt,"\n"; chop($ggr=);@ggr=split(/[ \t]/,$ggr);$ggrcnt=(pop @ggr); }until (join(" ",@hgr[0..$n-2]) ne join(" ",@ggr[0..$n-2])); if ($hgrcnt > $totggrcnt){ #warn "difference: $hgrcnt $totggrcnt =",$hgrcnt-$totggrcnt,"\n"; print NGR join(" ",@hgr[0..$n-1])," ",$cutoffword," ",$hgrcnt-$totggrcnt,"\n"; } } else{ #warn "fully pruned context: $hgr\n"; print NGR join(" ",@hgr[0..$n-1])," ",$cutoffword," ",$hgrcnt,"\n"; } if (($counter % $blocksize)==0){ close(NGR); my $id = sprintf("%04d", int($counter / $blocksize)); $ngrams="${ngramdir}/".($n)."grams-${id}.gz"; open(NGR,"|$gzip -c > $ngrams ") || die "cannot open $ngrams\n"; } } close(HGR);close(NGR);close(GGR); } irstlm-5.80.03/scripts/lm-stat.pl000755 000766 000024 00000003506 12032513324 020746 0ustar00nicolabertoldistaff000000 000000 #! 
/usr/bin/perl

#*****************************************************************************
# IrstLM: IRST Language Model Toolkit
# Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy

# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.

# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#******************************************************************************

#computes LM statistics over a string
#
# Runs "$IRSTLM/bin/compile-lm <lm> --eval <txt> --debug 1" and copies
# everything that command writes on its standard output to our standard output.
# Exits with status 1 when an argument or the IRSTLM variable is missing.

use strict;
use warnings;
use Getopt::Long "GetOptions";
use File::Basename;

my ($help,$lm,$txt)=();
$help=1 unless GetOptions('lm=s' => \$lm,
                          'txt=s' => \$txt,
                          'h|help' => \$help,);

if ($help || !$lm || !$txt) {
  my $cmnd = basename($0);
  print "\n$cmnd - computes LM statistics over a string\n",
        "\nUSAGE:\n",
        " $cmnd [options]\n",
        "\nOPTIONS:\n",
        " --lm language model file \n",
        " --txt text file\n",
        " -h, --help (optional) print these instructions\n",
        "\n";
  exit(1);
}

if (!$ENV{IRSTLM}){
  print "Set environment variable IRSTLM with path to the irstlm directory\n";
  exit(1);
}

my $clm="$ENV{IRSTLM}/bin/compile-lm";

# Read from the command through a pipe. The command is still given as one
# string, so it is interpreted exactly as before; only the handle is lexical
# and a failure to start the command is now reported instead of ignored.
open(my $out, '-|', "$clm $lm --eval $txt --debug 1")
  or die "cannot run $clm: $!\n";

# The handle must be named in the readline operator: a bare "while ()" is an
# endless loop that never reads anything.
while (my $line = <$out>){
  print $line;
}

# For a piped handle, close() reports a failure of the child as well;
# keep the original exit status of this script and only tell the user.
close($out) or warn "$clm did not terminate cleanly\n";
irstlm-5.80.03/scripts/Makefile.am000644 000766 000024 00000000414 12035633727 021071 0ustar00nicolabertoldistaff000000 000000 wrapperbindir = @prefix@/bin dist_wrapperbin_SCRIPTS = \ add-start-end.sh build-lm-qsub.sh build-lm.sh rm-start-end.sh split-ngt.sh mdtsel.sh \
build-sublm.pl goograms2ngrams.pl lm-stat.pl merge-sublm.pl ngram-split.pl sort-lm.pl split-dict.pl EXTRA_DIST = wrapper irstlm-5.80.03/scripts/mdtsel.sh000755 000766 000024 00000012133 12042551465 020661 0ustar00nicolabertoldistaff000000 000000
#! /bin/bash

#/******************************************************************************
#IrstLM: IRST Language Model Toolkit
#Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
#
#This library is free software; you can redistribute it and/or
#modify it under the terms of the GNU Lesser General Public
#License as published by the Free Software Foundation; either
#version 2.1 of the License, or (at your option) any later version.
#
#This library is distributed in the hope that it will be useful,
#
#
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
#Lesser General Public License for more details.
#
#You should have received a copy of the GNU Lesser General Public
#License along with this library; if not, write to the Free Software
#Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
#******************************************************************************/

# mdtsel.sh
# by M. Federico
# Copyright Marcello Federico, Fondazione Bruno Kessler, 2012
#
# Splits the out-of-domain corpus into blocks, scores every block with
# $IRSTLM/bin/dtsel in a background job, and merges the sorted per-block
# score files into the single scores output file.

set -m #enable job control

# Print the command-line help on standard output.
usage()
{
cmnd=$(basename $0);
cat << EOF

$cmnd - performs data selection assuming an indomain corpus and
a very large out of domain corpus.

USAGE:
$cmnd [options]

DESCRIPTION.
This command performs data selection assuming an indomain
corpus and a very large out of domain corpus.
Both corpora must contain one sentence in each line delimited
with <s> and </s>. The process produces a file of scores.

OPTIONS:
-h Show this message
-v Verbose
-i In-domain corpus
-o Out-domain corpus
-s Scores output file
-x Out-domain lines are indexed
-l Log file (default /dev/null)
-w Temporary work directory (default /tmp)
-j Number of jobs (default 6)
-m Data selection model (1 or 2, default 2)
-f Word frequency threshold (default 2)
-n Ngram order to use (n>=1 default 3)
-d Vocabulary size upper bound (default 10000000)
-c Cross-validation parameter (cv>=1, default 1)

EOF
}

if [ -z "$IRSTLM" ]; then
    echo "Set IRSTLM environment variable with path to irstlm"
    exit 2
fi

#paths to scripts and commands in irstlm
scr=$IRSTLM/bin
bin=$IRSTLM/bin

#check irstlm installation
if [ ! -e "$bin/dtsel" ]; then
    echo "$IRSTLM does not contain a proper installation of IRSTLM"
    exit 3
fi

#default parameters
# The three file variables are initialised under the names the rest of the
# script reads, so that values inherited from the environment cannot leak in.
indfile="";
outdfile="";
scorefile="";
workdir=/tmp
logfile="/dev/null"
jobs=6
model=2
minfreq=2
ngramorder=3
cv=1
dub=10000000

verbose="";
useindex=0;

# The option string uses plain ASCII quotes; every option listed in it has a
# matching case arm below.
while getopts "hvi:o:s:l:w:j:m:f:n:c:d:x:" OPTION
do
    case $OPTION in
        h)
            usage
            exit 1
            ;;
        v)
            verbose="--verbose";
            ;;
        i)
            indfile=$OPTARG
            ;;
        o)
            outdfile=$OPTARG
            ;;
        s)
            scorefile=$OPTARG
            ;;
        l)
            logfile=$OPTARG
            ;;
        w)
            workdir=$OPTARG
            ;;
        j)
            jobs=$OPTARG
            ;;
        m)
            model=$OPTARG
            ;;
        n)
            ngramorder=$OPTARG
            ;;
        f)
            minfreq=$OPTARG;
            ;;
        d)
            dub=$OPTARG;
            ;;
        c)
            # Accept the documented option instead of aborting with the usage
            # text. NOTE(review): the value is only stored; the dtsel call
            # below does not receive it because the corresponding dtsel flag
            # is not visible in this script - confirm against dtsel.
            cv=$OPTARG;
            ;;
        x)
            useindex=$OPTARG;
            ;;
        *)
            usage
            exit 1
            ;;
    esac
done

if [ -n "$verbose" ]; then
    echo indfile= $indfile outdfile= $outdfile scorefile= $scorefile useindex= $useindex
    echo logfile= $logfile workdir= $workdir
    echo jobs= $jobs model= $model ngramorder= $ngramorder minfreq= $minfreq dub=$dub cv=$cv
fi

if [ -z "$indfile" ] || [ -z "$outdfile" ] || [ -z "$scorefile" ]; then
    usage
    exit 5
fi

if [ -e "$scorefile" ]; then
    echo "Output score file $scorefile already exists! either remove or rename it."
    exit 6
fi

if [ "$logfile" != "/dev/null" -a "$logfile" != "/dev/stdout" -a -e "$logfile" ]; then
    echo "Logfile $logfile already exists! either remove or rename it."
    exit 7
fi

workdir_created=0

if [ ! -d "$workdir" ]; then
    echo "Temporary work directory $workdir does not exist";
    echo "creating $workdir";
    mkdir -p "$workdir";
    workdir_created=1;
fi

#get process id to name process specific temporary files
pid=$$

#compute size of out domain corpus and block size of split
lines=`wc -l < "$outdfile"`
size=$(( ( lines + 1000 ) / jobs )) #to avoid any small block

#perform split
split -l $size "$outdfile" "$workdir/dtsel${pid}-files-"

for file in "$workdir"/dtsel${pid}-files-*
do
    echo $file

    # Score one block, map "nan" scores to 1000 and sort the block numerically.
    # The perl one-liner is a substitution: without the leading "s" it does not
    # compile and the block's score file would be replaced by an empty one.
    ( \
    "$bin/dtsel" -x=$useindex -i="$indfile" -o="$file" -s="${file}.scores" -n=$ngramorder -dub=$dub -f=$minfreq -m=$model ; \
    cat "${file}.scores" | perl -pe 's/^nan /1000 /g;' | sort -g > "${file}.scores.tmp" ; \
    mv "${file}.scores.tmp" "${file}.scores" \
    ) >>"$logfile" 2>&1 &

done

# Wait for all parallel jobs to finish
wait

sort -g -m "$workdir"/dtsel${pid}-files-*.scores > "$scorefile"

rm "$workdir"/dtsel${pid}-files-*

if [ $workdir_created == 1 ]
then
    rmdir "$workdir"
fi
irstlm-5.80.03/scripts/merge-sublm.pl000755 000766 000024 00000013412 12032513324 021601 0ustar00nicolabertoldistaff000000 000000 #! /usr/bin/perl #***************************************************************************** # IrstLM: IRST Language Model Toolkit # Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
# You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #****************************************************************************** #merge prefix LMs into one single file use strict; use Getopt::Long "GetOptions"; use File::Basename; my ($help,$lm,$size,$sublm)=(); $help=1 unless &GetOptions('size=i' => \$size, 'lm=s' => \$lm, 'sublm=s' => \$sublm, 'h|help' => \$help,); if ($help || !$size || !$lm || !$sublm) { my $cmnd = basename($0); print "\n$cmnd - merge single LMs\n", "\nUSAGE:\n", " $cmnd [options]\n", "\nOPTIONS:\n", " --size maximum n-gram size for the language model\n", " --sublm path identifying all input prefix sub LMs\n", " --lm name of the output LM file (will be gzipped)\n", " -h, --help (optional) print these instructions\n", "\n"; exit(1); } my $gzip=`which gzip 2> /dev/null`; my $gunzip=`which gunzip 2> /dev/null`; chomp($gzip); chomp($gunzip); warn "merge-sublm.pl --size $size --sublm $sublm --lm $lm\n"; warn "Compute total sizes of n-grams\n"; my @size=(); #number of n-grams for each level my $tot1gr=0; #total frequency of 1-grams my $unk=0; #frequency of my $pr; #probability of 1-grams my (@files,$files); #sublm files for a given n-gram size for (my $n=1;$n<=$size;$n++){ @files=map { glob($_) } "${sublm}*.${n}gr*"; $files=join(" ",@files); $files || die "cannot find sublm files\n"; warn "join files $files\n"; if ($n==1){ open(INP,"$gunzip -c $files|") || die "cannot open $files\n"; while(my $line = ){ $size[$n]++; chomp($line); warn "there is an empty line in any of these files ($files); this should not happen\n" if $line =~ /^$/; my @words = split(/ +/,$line); #cut down counts for sentence initial $words[0]=1 if $words[1]=~//; #there could be more independent words #generated by ngt with -sd option $size[$n]-- if $unk && $words[1] eq ""; $unk+=$words[0] if $words[1]=~//i; 
$tot1gr+=$words[0]; } close(INP); if ($unk==0){ warn "implicitely add word to counters\n"; $tot1gr+=$size[$n]; #equivalent to WB smoothing $size[$n]++; } }else{ for (my $j=0;$j wc$$") or die; open(INP,"wc$$") || die "cannot open wc$$\n"; my $wc = ; chomp($wc); $size[$n] += $wc; close(INP); unlink("wc$$"); } } warn "n:$n size:$size[$n] unk:$unk\n"; } warn "Merge all sub LMs\n"; $lm.=".gz" if $lm!~/.gz$/; open(LM,"|$gzip -c > $lm") || die "Cannot open $lm\n"; warn "Write LM Header\n"; printf LM "iARPA\n"; printf LM "\n\\data\\\n"; for (my $n=1;$n<=$size;$n++){ printf LM "ngram $n= $size[$n]\n"; } printf LM "\n\n"; close(LM); warn "Writing LM Tables\n"; for (my $n=1;$n<=$size;$n++){ warn "Level $n\n"; @files=map { glob($_) } "${sublm}*.${n}gr*"; $files=join(" ",@files); warn "input from: $files\n"; if ($n==1){ open(INP,"$gunzip -c $files|") || die "cannot open $files\n"; open(LM,"|$gzip -c >> $lm"); printf LM "\\$n-grams:\n"; while(my $line = ){ chomp($line); warn "there is an empty line in any of these files ($files); this should not happen\n" if $line =~ /^$/; #lowercase some expressions of google n-grams $line=~s///g; $line=~s/<\/S>/<\/s>/g; $line=~s///g; my @words = split(/ +/,$line); #always print unk a the eqnd next if $words[1]=~//i; #cut down counts for sentence initial $words[0]=1 if $words[1]=~//i; #apply witten-bell smoothing on 1-grams $pr=(log($words[0]+1)-log($tot1gr+$size[1]))/log(10.0); shift @words; printf LM "%f %s\n",$pr,join(" ",@words); } close(INP); #print final #witten-bell smoothing of probability if ($unk){ $pr=(log($unk+1)-log($tot1gr+$size[1]))/log(10.0); }else{ $pr=(log($size[1]-1+1)-log($tot1gr+$size[1]))/log(10.0); } printf LM "%f \n",$pr; close(LM); }else{ open(LM,"|$gzip -c >> $lm"); printf LM "\\$n-grams:\n"; close(LM); for (my $j=0;$j> $lm") or die; } } } open(LM,"|$gzip -c >> $lm") || die "Cannot open $lm\n"; printf LM "\\end\\\n"; close(LM); sub safesystem { print STDERR "Executing: @_\n"; system(@_); if ($? 
== -1) { print STDERR "Failed to execute: @_\n $!\n"; exit(1); } elsif ($? & 127) { printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n", ($? & 127), ($? & 128) ? 'with' : 'without'; exit(1); } else { my $exitcode = $? >> 8; print STDERR "Exit code: $exitcode\n" if $exitcode; return ! $exitcode; } } irstlm-5.80.03/scripts/ngram-split.pl000755 000766 000024 00000005131 12032513324 021616 0ustar00nicolabertoldistaff000000 000000 #! /usr/bin/perl #***************************************************************************** # IrstLM: IRST Language Model Toolkit # Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #****************************************************************************** #re-segment google n-gram count files into files so that #n-grams starting with a given word (prefix) are all #contained in one file. 
use Getopt::Long "GetOptions"; use File::Basename; my ($help,$lm,$size,$sublm)=(); $help=1 unless &GetOptions('h|help' => \$help); if ($help) { my $cmnd = basename($0); print "\n$cmnd - re-segment google n-gram count files so that n-grams\n", " starting with a given word (prefix) are all contained in one file\n", "\nUSAGE:\n", " $cmnd [options] []\n", "\nDESCRIPTION:\n", " Input is expected on STDIN.\n", " prefix of files to be created\n", "\nOPTIONS:\n", " -h, --help (optional) print these instructions\n", "\n"; exit(1); } $max_pref=10000; #number of prefixes to be put in one file $max_ngram=5000000;#number of n-grams to be put in one file $file_cnt=0; #counter of files $pref_cnt=0; #counter of prefixes in the current file $ngram_cnt=0; #counter of n-gram in the current file $path=($ARGV[0]?$ARGV[0]:"goong"); #path of files to be created $gzip=`which gzip`; chomp($gzip); $pwrd=""; open(OUT,sprintf("|$gzip -c > %s.%04d.gz",$path,++$file_cnt)); while ($ng=){ ($wrd)=$ng=~/^([^ ]+)/; #warn "$wrd\n"; if ($pwrd ne $wrd){ $pwrd=$wrd; if ($file_pref>$max_pref || $ngram_cnt>$max_ngram){ warn "it's time to change file\n"; close(OUT); open(OUT,sprintf("|$gzip -c > %s.%04d.gz",$path,++$file_cnt)); $pref_cnt=$ngram_cnt=0; } else{ $pref_cnt++; } } print OUT $ng; $ngram_cnt++; } close(OUT); irstlm-5.80.03/scripts/rm-start-end.sh000644 000766 000024 00000000647 12032513324 021701 0ustar00nicolabertoldistaff000000 000000 #! /bin/bash function usage() { cmnd=$(basename $0); cat<&2; exit 0; ;; esac done sed 's///g' | sed 's/<\/s>//g' | sed 's/^ *//' | sed 's/ *$//' | sed '/^$/d' irstlm-5.80.03/scripts/sort-lm.pl000755 000766 000024 00000007123 12032513324 020761 0ustar00nicolabertoldistaff000000 000000 #! 
/usr/bin/perl #***************************************************************************** # IrstLM: IRST Language Model Toolkit # Copyright (C) 2010 Marcello Federico, FBK-irst Trento, Italy # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #****************************************************************************** #Sorts n-grams of an ARPA file according to lexicographic order. #Inverted sorting option is propedeutic to building a binary #lmtable with compile-lm with n-grams stored in reverted order. 
use strict; use Getopt::Long "GetOptions"; use File::Basename; my ($help,$ilm,$olm,$inv)=(); $help=1 unless $ilm="/dev/stdin"; $olm="/dev/stdout"; &GetOptions('ilm=s' => \$ilm, 'olm=s' => \$olm, 'inv' => \$inv, 'h|help' => \$help,); if ($help || !$ilm || !$olm) { my $cmnd = basename($0); print "\n$cmnd - sorts n-grams according to lexicographic order\n", "\nUSAGE:\n", " $cmnd [options]\n", "\nDESCRIPTION:\n", " $cmnd sorts n-grams of an ARPA file according to lexicographic order.\n", " Inverted sorting option is propedeutic to building a binary\n", " lmtable with compile-lm with n-grams stored in reverted order.\n", "\nOPTIONS:\n", " -ilm input ARPA LM filename (default /dev/stdin) \n", " -olm output ARPA LM filename (default /dev/stdout)\n", " -inv inverted n-gram sort for compile-lm \n", " -h, --help (optional) print these instructions\n", "\n"; exit(1); } my $order=0; my $sortcmd=""; $ENV{'LC_ALL'}='C'; open (INP, "< $ilm") || die "cannot open input LM file: $ilm\n"; open (OUT, "> $olm") || die "cannot open output LM file: $olm\n"; warn "reading from standard input\n" if $ilm eq "/dev/stdin"; warn "writing to standard output\n" if $olm eq "/dev/stdout"; $_=; #sanity check die "Error: input cannot be an intermediate iARPA file. 
First convert it to ARPA format with compile-lm.\n" if $_=~/^iARPA/; my $isQuantized=0; $isQuantized=1 if $_=~/^qARPA/; while(!/^\\end\\/){ if (($order)=$_=~/^\\(\d+)-grams:/){ print(OUT $_);$_=; if ($isQuantized){ print(OUT $_); chop $_;#print centers my $centers=$_; $_=; warn "skip $centers centers\n"; for (my $c=1;$c<=$centers;$c++){ print(OUT $_);$_=; } } #sort command #$sortcmd="sort -b"; #does not seem to work properly $sortcmd="sort "; if ($inv){ warn "inverted sorting of $order-grams\n"; for (my $n=$order;$n>0;$n--){ $sortcmd.=" -k ".($n+1).",".($n+1); } }else{ warn "direct sorting of $order-grams\n"; for (my $n=1;$n<=$order;$n++){ $sortcmd.=" -k ".($n+1).",".($n+1); } } close(OUT);open (OUT,"|$sortcmd >> $olm"); do{ print(OUT $_);$_=; }until (/^\\/ || /^\n/); close(OUT); open(OUT, ">> $olm"); } else{ print(OUT $_);$_=; } } print(OUT $_); close(INP); close(OUT); irstlm-5.80.03/scripts/split-dict.pl000755 000766 000024 00000010310 12032513324 021430 0ustar00nicolabertoldistaff000000 000000 #! /usr/bin/perl #***************************************************************************** # IrstLM: IRST Language Model Toolkit # Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
# You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #****************************************************************************** #usage: #split-dict.pl #It splits the dictionary into dictionaries #(named , ... ) #splitting is balanced wrt to frequency of the dictionary #if not available a frequency of 1 is considered use strict; use Getopt::Long "GetOptions"; use File::Basename; my ($help,$input,$output,$parts)=(); $help=1 unless &GetOptions('input=s' => \$input, 'output=s' => \$output, 'parts=i' => \$parts, 'h|help' => \$help,); if ($help || !$input || !$output || !$parts) { my $cmnd = basename($0); print "\n$cmnd - splits a dictionary into frequency-balanced partitions\n", "\nUSAGE:\n", " $cmnd [options]\n", "\nDESCRIPTION:\n", " $cmnd splits a dictionary into frequency-balanced partitions.\n", " The dictionary must be generated with IRSTLM command dict.\n", " If dictionary does not contain frequencies, then a frequency 1 is\n", " assumed for all words.\n", "\nOPTIONS:\n", " --input input dictionary with frequencies\n", " --output prefix of output dictionaries\n", " --parts number of partitions to create\n", " -h, --help (optional) print these instructions\n", "\n"; exit(1); } my $freqflag=0; my ($w,$f,$globf,$thr); my (@D,@F,%S,@C); open(IN,"$input"); chomp($_=); #if input is a dictionary. 
if (/^dictionary[ \t]+\d+[ \t]+\d+$/i){ my ($dummy,$size); ($dummy,$dummy,$size)=split(/[ \t]+/,$_); $freqflag=1 if /DICTIONARY/; } $globf=0; while(chomp($_=)){ if ($freqflag){ ($w,$f)=split(/[ \t]+/,$_); } else{ $w=$_; $f=1; } push @D, $w; push @F, $f; $globf+=$f; } close (IN); $thr=$globf/$parts; my $totf=0; print STDERR "Dictionary 0: (thr: $thr , $globf, $totf , $parts)\n"; my $sfx=0; my $w; for (my $i=0;$i<=$#D;$i++){ # if the remaining words are less than or equal to # the number of remaining sub-dictionaries to create # put only one word per each sub-dictionary. if (($totf>0) && ($#D+1-$i) <= ($parts-1-$sfx)){ # recompute threshold on the remaining global frequency # according to the number of remaining parts $sfx++; $globf-=$totf; $thr=($globf)/($parts-$sfx); print STDERR "Dictionary $sfx: (thr: $thr , $globf , $totf , ",($parts-$sfx),")\n"; $totf=0; } $totf+=$F[$i]; $w=$D[$i]; $S{$w}=$sfx; $C[$sfx]++; if ($totf>$thr){ # recompute threshold on the remaining global frequency # according to the number of remaining parts $sfx++; $globf-=$totf; $thr=($globf)/($parts-$sfx); print STDERR "Dictionary $sfx: (thr: $thr , $globf , $totf , ",($parts-$sfx),")\n"; $totf=0; } } my $oldsfx=-1; for (my $i=0;$i<=$#D;$i++){ $w=$D[$i]; $sfx="0000$S{$w}"; $sfx=~s/.+(\d{3})/$1/; if ($sfx != $oldsfx){ #print STDERR "opening $output$sfx\n"; close (OUT) if $oldsfx!= -1; open(OUT,">$output$sfx"); if ($freqflag){ print OUT "DICTIONARY 0 $C[$sfx]\n"; } else{ print OUT "dictionary 0 $C[$sfx]\n"; } $oldsfx=$sfx; } if ($freqflag){ print OUT "$w $F[$i]\n"; } else{ print OUT "$w\n"; } } close (OUT) if $oldsfx!= -1; my $numdict=$S{$D[$#D]}+1; die "Only $numdict dictionaries were crested instead of $parts!" if ($numdict != $parts); irstlm-5.80.03/scripts/split-ngt.sh000755 000766 000024 00000002710 12032513324 021301 0ustar00nicolabertoldistaff000000 000000 #! 
/bin/bash function usage() { cmnd=$(basename $0); cat< DESCRIPTION: Input file name Partition files name prefix Order of the ngrams Number of partitions OPTIONS: -h Show this message EOF } # Parse options while getopts h OPT; do case "$OPT" in h) usage >&2; exit 0; ;; esac done #usage: #ngt-split.sh #It creates files (named , ... ) #containing ngram statistics (of length) in Google format #These files are a partition of the whole set of ngrams basedir=$IRSTLM bindir=$basedir/bin scriptdir=$basedir/scripts unset par while [ $# -gt 0 ] do echo "$0: arg $1" par[${#par[@]}]="$1" shift done inputfile=${par[0]} outputfile=${par[1]} order=${par[2]} parts=${par[3]} dictfile=dict$$ $bindir/dict -i="$inputfile" -o=$dictfile -f=y -sort=n $scriptdir/split-dict.pl --input $dictfile --output ${dictfile}. --parts $parts rm $dictfile for d in `ls ${dictfile}.*` ; do w=`echo $d | perl -pe 's/.+(\.[0-9]+)$/$1/i'` w="$outputfile$w" echo "$bindir/ngt -i="$inputfile" -n=$order -gooout=y -o=$w -fd=$d > /dev/null" $bindir/ngt -n=$order -gooout=y -o=$w -fd=$d -i="$inputfile" > /dev/null rm $d done exit irstlm-5.80.03/scripts/wrapper000644 000766 000024 00000000204 11552037751 020433 0ustar00nicolabertoldistaff000000 000000 #! /bin/sh #set machine type for compilation MY_ARCH=`uname -m` name=`basename $0` dir=`dirname $0`"/$MY_ARCH" $dir/$name "$@"