ocamlagrep-1.0/0040755004340400511750000000000007427724604013104 5ustar xleroyformelocamlagrep-1.0/.depend0100644004340400511750000000005407426540256014336 0ustar xleroyformelagrep.cmo: agrep.cmi agrep.cmx: agrep.cmi ocamlagrep-1.0/Makefile0100644004340400511750000000222207427450332014531 0ustar xleroyformelCAMLSTDLIB=`ocamlc -where` DESTDIR=$(CAMLSTDLIB)/agrep OCAMLC=ocamlc -g OCAMLOPT=ocamlopt OCAMLMKLIB=ocamlmklib OCAMLDEP=ocamldep CFLAGS=-O -D_XOPEN_SOURCE=500 C_OBJS=engine.o CAML_OBJS=agrep.cmo all: libagrep.a agrep.cma agrep.cmxa agrep.cma: $(CAML_OBJS) $(OCAMLMKLIB) -o agrep $(CAML_OBJS) agrep.cmxa: $(CAML_OBJS:.cmo=.cmx) $(OCAMLMKLIB) -o agrep $(CAML_OBJS:.cmo=.cmx) libagrep.a: $(C_OBJS) $(OCAMLMKLIB) -o agrep $(C_OBJS) install: mkdir -p $(DESTDIR) cp agrep.cmi agrep.cma agrep.cmxa $(DESTDIR) cp libagrep.a $(DESTDIR) if test -f dllagrep.so; then cp dllagrep.so $(DESTDIR); fi destdir=$(DESTDIR); ldconf=$(CAMLSTDLIB)/ld.conf; \ if test `grep -s -c '^'$$destdir'$$' $$ldconf || :` = 0; \ then echo $$destdir >> $$ldconf; fi testagrep: testagrep.ml agrep.cma libagrep.a $(OCAMLC) -I . -custom -o $@ agrep.cma testagrep.ml clean:: rm -f testagrep .SUFFIXES: .ml .mli .cmo .cmi .cmx .mli.cmi: $(OCAMLC) -c $< .ml.cmo: $(OCAMLC) -c $< .ml.cmx: $(OCAMLOPT) -c $< .c.o: $(OCAMLC) -ccopt "$(CFLAGS)" -c $< clean:: rm -f *.cm* *.o *.a *.so depend: $(OCAMLDEP) *.ml *.mli > .depend engine.o: skeleton.h include .depend ocamlagrep-1.0/README0100644004340400511750000000415307426730535013763 0ustar xleroyformelDESCRIPTION ----------- This library implements the Wu-Manber algorithm for string searching with errors, popularized by the "agrep" Unix command and the "glimpse" file indexing tool. It was developed as part of a search engine for a largish MP3 collection; the "with error" searching comes handy for those who can't spell Liszt or Shostakovitch. Given a search pattern and a string, this algorithm determines whether the string contains a substring that matches the pattern up to a parameterizable number N of "errors". An "error" is either a substitution (replace a character of the string with another character), a deletion (remove a character) or an insertion (add a character to the string). In more scientific terms, the number of errors is the Levenshtein edit distance between the pattern and the matched substring. The search patterns are roughly those of the Unix shell, including one-character wildcard (?), character classes ([0-9]) and multi-character wildcard (*). In addition, conjunction (&) and alternative (|) are supported. General regular expressions are not supported, however. Performance is quite good: for short patterns (less than 31 characters) and no errors, this library is about 8 times faster than OCaml's "Str" regular expression library. Speed decreases with the number of errors allowed, but even with 3 errors we are still faster than "Str". The algorithm is described in S. Wu and U. Manber, ``Fast Text Searching With Errors", tech. rep. TR 91-11, University of Arizona, 1991. It's a nice exercise in dynamic programming and bit-parallel implementation. LICENSING --------- LGPL: This code is distributed under the terms of the GNU Library General Public License version 2. INSTALLATION ------------ OCaml 3.04 and up is required. Do "make". Become superuser, and do "make install" or "make DESTDIR=... install" to specify the installation directory (by default: the subdirectory "agrep" of OCaml's standard library directory). USAGE ----- ocamlc -I +agrep ... agrep.cma ... or ocamlopt -I +agrep ... agrep.cmxa ... See the commented interface agrep.mli for API documentation. ocamlagrep-1.0/agrep.ml0100644004340400511750000002240707426730535014535 0ustar xleroyformel(***********************************************************************) (* *) (* The "agrep" library for Objective Caml *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. All rights reserved. This file is distributed *) (* under the terms of the GNU Library General Public License. *) (* *) (***********************************************************************) (* $Id: agrep.ml,v 1.2 2002/02/02 09:29:01 xleroy Exp $ *) type bitmatrix external new_bitmatrix : int -> int -> bitmatrix = "caml_agrep_new_bitmatrix" external set_bit : bitmatrix -> int -> int -> int -> unit = "caml_agrep_set_bit" (* Deep syntax for patterns *) type pattern = CBase of int * bitmatrix | CAnd of pattern * pattern | COr of pattern * pattern (* String matching *) external match_: string -> int -> int -> int -> bitmatrix -> int -> bool -> int = "caml_agrep_match_bytecode" "caml_agrep_match" let string_match pat ?(numerrs = 0) ?(wholeword = false) text = if numerrs < 0 then invalid_arg "Agrep.string_match"; let rec do_match = function CBase(len, bm) -> match_ text 0 (String.length text) len bm numerrs wholeword < max_int | CAnd(p1, p2) -> do_match p1 && do_match p2 | COr(p1, p2) -> do_match p1 || do_match p2 in do_match pat let substring_match pat ?(numerrs = 0) ?(wholeword = false) text ~pos ~len = if pos < 0 || pos + len > String.length text then invalid_arg "Agrep.substring_match"; if numerrs < 0 then invalid_arg "Agrep.substring_match"; let rec do_match = function CBase(plen, bm) -> match_ text pos len plen bm numerrs wholeword < max_int | CAnd(p1, p2) -> do_match p1 && do_match p2 | COr(p1, p2) -> do_match p1 || do_match p2 in do_match pat let errors_substring_match pat ?(numerrs = 0) ?(wholeword = false) text ~pos ~len = if pos < 0 || pos + len > String.length text then invalid_arg "Agrep.errors_substring_match"; if numerrs < 0 then invalid_arg "Agrep.errors_substring_match"; let rec do_match = function CBase(plen, bm) -> match_ text pos len plen bm numerrs wholeword | CAnd(p1, p2) -> max (do_match p1) (do_match p2) | COr(p1, p2) -> min (do_match p1) (do_match p2) in do_match pat (* Representation of character sets *) module Charset = struct type t = string (* of length 32 *) let new_empty () = String.make 32 '\000' let all = String.make 32 '\255' let add s c = let i = Char.code c in s.[i lsr 3] <- Char.chr(Char.code s.[i lsr 3] lor (1 lsl (i land 7))) let add_range s c1 c2 = for i = Char.code c1 to Char.code c2 do add s (Char.chr i) done let complement s = let r = String.create 32 in for i = 0 to 31 do r.[i] <- Char.chr(Char.code s.[i] lxor 0xFF) done; r let iter fn s = for i = 0 to 31 do let c = Char.code s.[i] in for j = 0 to 7 do if c land (1 lsl j) <> 0 then fn (Char.chr ((i lsl 3) lor j)) done done end (* Shallow syntax for patterns *) type simple_pattern = Char of char | String of string | Char_class of Charset.t | Wildcard type complex_pattern = Simple of simple_pattern list | And of complex_pattern * complex_pattern | Or of complex_pattern * complex_pattern (* Compilation of shallow syntax into deep syntax *) let add_char transl bm len c r = match transl with None -> set_bit bm len (Char.code c) r | Some tr -> let t = tr.[Char.code c] in for i = 0 to 255 do if tr.[i] = t then set_bit bm len i r done let simple_pattern_len sp = List.fold_left (fun len p -> match p with Char c -> 1 + len | String s -> String.length s + len | Char_class s -> 1 + len | Wildcard -> len) 0 sp let compile_simple_pattern transl sp = let len = simple_pattern_len sp in let bm = new_bitmatrix len 257 in let rec fill pos = function [] -> () | Char c :: rem -> add_char transl bm len c pos; fill (pos + 1) rem | String s :: rem -> for i = 0 to String.length s - 1 do add_char transl bm len s.[i] (pos + i) done; fill (pos + String.length s) rem | Char_class cls :: rem -> Charset.iter (fun c -> add_char transl bm len c pos) cls; fill (pos + 1) rem | Wildcard :: rem -> set_bit bm len 256 pos; fill pos rem in fill 0 sp; CBase(len, bm) let rec compile_pattern ?transl = function Simple sp -> compile_simple_pattern transl sp | And(p1, p2) -> CAnd(compile_pattern ?transl p1, compile_pattern ?transl p2) | Or(p1, p2) -> COr(compile_pattern ?transl p1, compile_pattern ?transl p2) (* From concrete syntax to shallow abstract syntax *) exception Syntax_error of int let parse_pattern s = let rec parse_or i = let (p1, i1) = parse_and i in parse_ors p1 i1 and parse_ors p1 i = if i >= String.length s then (p1, i) else match s.[i] with ')' -> (p1, i) | '|' -> let (p2, i2) = parse_and (i + 1) in parse_ors (Or(p1, p2)) i2 | _ -> raise (Syntax_error i) and parse_and i = let (p1, i1) = parse_base i in parse_ands p1 i1 and parse_ands p1 i = if i >= String.length s then (p1, i) else match s.[i] with ')' | '|' -> (p1, i) | '&' -> let (p2, i2) = parse_base (i + 1) in parse_ands (And(p1, p2)) i2 | _ -> raise (Syntax_error i) and parse_base i = if i >= String.length s then (Simple [], i) else match s.[i] with ')' | '|' | '&' -> (Simple [], i) | '(' -> let (p, j) = parse_or (i + 1) in if j >= String.length s || s.[j] <> ')' then raise (Syntax_error j); (p, j + 1) | _ -> let (sl, j) = parse_simple_list [] i in (Simple (List.rev sl), j) and parse_simple_list sl i = if i >= String.length s then (sl, i) else match s.[i] with ')' | '&' | '|' -> (sl, i) | '(' -> raise (Syntax_error i) | '?' -> parse_simple_list (Char_class Charset.all :: sl) (i + 1) | '*' -> parse_simple_list (Wildcard :: sl) (i + 1) | '\\' when i + 1 < String.length s -> parse_simple_list (Char s.[i+1] :: sl) (i + 2) | '[' -> let (cls, i1) = parse_char_class (i + 1) in parse_simple_list (Char_class cls :: sl) i1 | c -> parse_simple_list (Char c :: sl) (i + 1) and parse_char_class i = let cls = Charset.new_empty() in if i < String.length s && s.[i] = '^' then begin let j = parse_class cls (i+1) in (Charset.complement cls, j) end else begin let j = parse_class cls i in (cls, j) end and parse_class cls i = if i >= String.length s then raise (Syntax_error i) else if s.[i] = ']' then i + 1 else if s.[i] = '\\' && i + 1 < String.length s then (Charset.add cls s.[i+1]; parse_class cls (i+2)) else if i + 2 < String.length s && s.[i+1] = '-' && s.[i+2] <> ']' then (Charset.add_range cls s.[i] s.[i+2]; parse_class cls (i+3)) else (Charset.add cls s.[i]; parse_class cls (i+1)) in let (p, i) = parse_or 0 in assert (i = String.length s); p (* All together *) let pattern ?transl s = compile_pattern ?transl (parse_pattern s) let pattern_string ?transl s = compile_pattern ?transl (Simple[String s]) (* Translation tables for ISO 8859-15 (Latin 1 with Euro) *) module Iso8859_15 = struct let case_insensitive = "\000\001\002\003\004\005\006\007\008\t\n\011\012\013\014\015\016\017\018\019\020\021\022\023\024\025\026\027\028\029\030\031 !\"#$%&'()*+,-./0123456789:;<=>?@abcdefghijklmnopqrstuvwxyz[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\127\128\129\130\131\132\133\134\135\136\137\138\139\140\141\142\143\144\145\146\147\148\149\150\151\152\153\154\155\156\157\158\159 ¡¢£¤¥¨§¨©ª«¬­®¯°±²³¸µ¶·¸¹º»½½ÿ¿àáâãäåæçèéêëìíîïðñòóôõö×øùúûüýþßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ" let accent_insensitive = "\000\001\002\003\004\005\006\007\008\t\n\011\012\013\014\015\016\017\018\019\020\021\022\023\024\025\026\027\028\029\030\031 !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\127\128\129\130\131\132\133\134\135\136\137\138\139\140\141\142\143\144\145\146\147\148\149\150\151\152\153\154\155\156\157\158\159 ¡¢£¤¥S§s©ª«¬­®¯°±²³Zµ¶·z¹º»OoY¿AAAAAAACEEEEIIIIÐNOOOOO×OUUUUYÞsaaaaaaaceeeeiiiiðnooooo÷ouuuuyþy" let case_and_accent_insensitive = "\000\001\002\003\004\005\006\007\008\t\n\011\012\013\014\015\016\017\018\019\020\021\022\023\024\025\026\027\028\029\030\031 !\"#$%&'()*+,-./0123456789:;<=>?@abcdefghijklmnopqrstuvwxyz[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\127\128\129\130\131\132\133\134\135\136\137\138\139\140\141\142\143\144\145\146\147\148\149\150\151\152\153\154\155\156\157\158\159 ¡¢£¤¥s§s©ª«¬­®¯°±²³zµ¶·z¹º»ooy¿aaaaaaaceeeeiiiiðnooooo×ouuuuyþsaaaaaaaceeeeiiiiðnooooo÷ouuuuyþy" end ocamlagrep-1.0/agrep.mli0100644004340400511750000001131307426730535014700 0ustar xleroyformel(***********************************************************************) (* *) (* The "agrep" library for Objective Caml *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) (* Copyright 2002 Institut National de Recherche en Informatique et *) (* en Automatique. All rights reserved. This file is distributed *) (* under the terms of the GNU Library General Public License. *) (* *) (***********************************************************************) (* $Id: agrep.mli,v 1.2 2002/02/02 09:29:01 xleroy Exp $ *) (** String searching with errors *) type pattern (** The type of compiled search patterns *) val pattern: ?transl:string -> string -> pattern (** Compile a search pattern. The syntax for patterns is similar to that of the Unix shell. The following constructs are recognized: - [? ] match any single character - [* ] match any sequence of characters - [[..] ] character set: ranges are denoted with [-], as in [[a-z]]; an initial [^], as in [[^0-9]], complements the set - [& ] conjunction (e.g. [sweet&sour]) - [| ] alternative (e.g. [high|low]) - [(..) ] grouping - [\ ] escape special characters; the special characters are [\?*[]&|()]. The optional argument [transl] is a character translation table. This is a string [s] of length 256 that ``translates'' a character [c] to the character [s.(Char.code c)]. A character of the text matches a character of the pattern if they both translate to the same character according to [transl]. If [transl] is not provided, the identity translation (two characters match iff they are equal) is assumed. Useful predefined translation tables are provided in {!Agrep.Iso8859_15}. *) exception Syntax_error of int (** Exception thrown by {!Agrep.pattern} when the given pattern is syntactically incorrect. The integer argument is the character number where the syntax error occurs. *) val pattern_string: ?transl:string -> string -> pattern (** [Agrep.pattern_string s] returns a pattern that matches exactly the string [s] and nothing else. The optional parameter [transl] is as in {!Agrep.pattern}. *) val string_match: pattern -> ?numerrs:int -> ?wholeword: bool -> string -> bool (** [string_match pat text] tests whether the string [text] matches the compiled pattern [pat]. The optional parameter [numerrs] is the number of errors permitted. One error corresponds to a substitution, an insertion or a deletion of a character. [numerrs] default to 0 (exact match). The optional parameter [wholeword] is [true] if the pattern must match a whole word, [false] if it can match inside a word. [wholeword] defaults to [false] (match inside words). *) val substring_match: pattern -> ?numerrs:int -> ?wholeword: bool -> string -> pos:int -> len:int -> bool (** Same as {!Agrep.string_match}, but restrict the match to the substring of the given string starting at character number [pos] and extending [len] characters. *) val errors_substring_match: pattern -> ?numerrs:int -> ?wholeword: bool -> string -> pos:int -> len:int -> int (** Same as {!Agrep.substring_match}, but return the smallest number of errors such that the substring matches the pattern. That is, it returns [0] if the substring matches exactly, [1] if the substring matches with one error, etc. Return [max_int] if the substring does not match the pattern with at most [numerrs] errors. *) module Iso8859_15: sig val case_insensitive: string (** Translation table identifying uppercase and lowercase letters. *) val accent_insensitive: string (** Translation table identifying accented letters with the corresponding non-accented letters, while still preserving case. *) val case_and_accent_insensitive: string (** Translation table identifying accented letters with the corresponding non-accented letters, and uppercase and lowercase letters. *) end (** Useful translation tables for the ISO 8859-15 (Latin-1 with Euro) character set. *) ocamlagrep-1.0/engine.c0100644004340400511750000001702607426730536014520 0ustar xleroyformel/***********************************************************************/ /* */ /* The "agrep" library for Objective Caml */ /* */ /* Xavier Leroy, projet Cristal, INRIA Rocquencourt */ /* */ /* Copyright 2002 Institut National de Recherche en Informatique et */ /* en Automatique. All rights reserved. This file is distributed */ /* under the terms of the GNU Library General Public License. */ /* */ /***********************************************************************/ /* $Id: engine.c,v 1.2 2002/02/02 09:29:02 xleroy Exp $ */ #include #include #include #include #include typedef unsigned char uchar; typedef unsigned int uint; typedef unsigned long ulong; #define BITS_PER_WORD (8 * sizeof(ulong)) #define Setbit(ptr,nbit) \ ((ptr)[(nbit) / BITS_PER_WORD] |= (1UL << ((nbit) % BITS_PER_WORD))) #define CAML_MAX_INT ((1L << (8 * sizeof(value) - 2)) - 1) CAMLprim value caml_agrep_new_bitmatrix(value v_patlen, value v_nentries) { ulong nwords = (Long_val(v_patlen) + BITS_PER_WORD - 1) / BITS_PER_WORD; ulong size = nwords * Long_val(v_nentries); value res = alloc(size, Abstract_tag); memset((ulong *) res, 0, size * sizeof(ulong)); return res; } CAMLprim value caml_agrep_set_bit(value v_matrix, value v_patlen, value v_index, value v_bitnum) { ulong nwords = (Long_val(v_patlen) + BITS_PER_WORD - 1) / BITS_PER_WORD; ulong bitnum = Long_val(v_bitnum); Setbit((ulong *) v_matrix + nwords * Long_val(v_index), bitnum); return Val_unit; } unsigned char word_constituent[256] = { /* 0 - 31 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, /* @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ */ 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \127 */ 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, /* 128-159 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ¡ ¢ £ ¤ ¥ ¦ § ¨ © ª « ¬ ­ ® ¯ ° ± ² ³ ´ µ ¶ · ¸ ¹ º » ¼ ½ ¾ ¿ */ 0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,1,0, /* À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1, /* à á â ã ä å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö ÷ ø ù ú û ü ý þ ÿ */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1 }; /* Specialized versions of string matching code */ #undef WHOLE_WORD #define FUNCTION_NAME match_0errs #define NERRS 0 #include "skeleton.h" #undef FUNCTION_NAME #undef NERRS #define FUNCTION_NAME match_1errs #define NERRS 1 #include "skeleton.h" #undef FUNCTION_NAME #undef NERRS #define FUNCTION_NAME match_2errs #define NERRS 2 #include "skeleton.h" #undef FUNCTION_NAME #undef NERRS #define FUNCTION_NAME match_3errs #define NERRS 3 #include "skeleton.h" #undef FUNCTION_NAME #undef NERRS #define WHOLE_WORD #define FUNCTION_NAME match_word_0errs #define NERRS 0 #include "skeleton.h" #undef FUNCTION_NAME #undef NERRS #define FUNCTION_NAME match_word_1errs #define NERRS 1 #include "skeleton.h" #undef FUNCTION_NAME #undef NERRS #define FUNCTION_NAME match_word_2errs #define NERRS 2 #include "skeleton.h" #undef FUNCTION_NAME #undef NERRS #define FUNCTION_NAME match_word_3errs #define NERRS 3 #include "skeleton.h" #undef FUNCTION_NAME #undef NERRS /* General code: arbitrary errors, arbitrary pattern length */ static value match_general(ulong * table, ulong m, ulong nerrs, int wholeword, uchar * text, mlsize_t length) { ulong nwords, n, j; ulong ** R; ulong * Rpbefore; ulong Found_offset, Found_mask; ulong * Ssharp; ulong * Rc, * Rp; ulong carry; ulong match_empty; long retcode; nwords = (m + BITS_PER_WORD - 1) / BITS_PER_WORD; R = stat_alloc((nerrs + 1) * sizeof(ulong *)); for (n = 0; n <= nerrs; n++) R[n] = stat_alloc(nwords * sizeof(ulong)); Rpbefore = stat_alloc(nwords * sizeof(ulong)); /* Initialize Found */ Found_offset = m / BITS_PER_WORD; Found_mask = 1UL << (m % BITS_PER_WORD); /* Initialize R */ for (n = 0; n <= nerrs; n++) { memset(R[n], 0, nwords * sizeof(ulong)); for (j = 0; j <= n; j++) Setbit(R[n], j); } /* Initialize Ssharp & match_empty */ Ssharp = table + 256 * nwords; match_empty = 1; /* Main loop */ for (/*nothing*/; length > 0; length--, text++) { ulong * S = table + (*text) * nwords; if (wholeword) match_empty = word_constituent[text[0]] ^ word_constituent[text[1]]; /* Special case for 0 errors */ Rc = R[0]; carry = match_empty; for (j = 0; j < nwords; j++) { ulong Rcbefore = Rc[j]; ulong toshift = Rcbefore & S[j]; Rc[j] = (toshift << 1) | (Rcbefore & Ssharp[j]) | carry; carry = toshift >> (BITS_PER_WORD - 1); Rpbefore[j] = Rcbefore; } if (Rc[Found_offset] & Found_mask && match_empty) { retcode = 0; goto exit; } /* General case for > 0 errors */ for (n = 1; n <= nerrs; n++) { Rp = Rc; Rc = R[n]; carry = match_empty; for (j = 0; j < nwords; j++) { ulong Rcbefore = Rc[j]; ulong toshift = (Rcbefore & S[j]) | Rpbefore[j] | Rp[j]; Rc[j] = (toshift << 1) | Rpbefore[j] | (Rcbefore & Ssharp[j]) | carry; carry = toshift >> (BITS_PER_WORD - 1); Rpbefore[j] = Rcbefore; } if (Rc[Found_offset] & Found_mask && match_empty) { retcode = n; goto exit; } } } /* Not found */ retcode = CAML_MAX_INT; /* Cleanup */ exit: for (n = 0; n <= nerrs; n++) free(R[n]); free(R); free(Rpbefore); return Val_long(retcode); } /* Entry point */ CAMLprim value caml_agrep_match(value v_text, value v_ofs, value v_len, value v_patlen, value v_table, value v_nerrs, value v_wholeword) { uchar * text = &Byte_u(v_text, Long_val(v_ofs)); mlsize_t len = Long_val(v_len); ulong patlen = Long_val(v_patlen); if (patlen < BITS_PER_WORD) { switch (((Long_val(v_nerrs)) << 1) | Int_val(v_wholeword)) { case 2*0+0: return match_0errs((ulong *) v_table, patlen, text, len); case 2*0+1: return match_word_0errs((ulong *) v_table, patlen, text, len); case 2*1+0: return match_1errs((ulong *) v_table, patlen, text, len); case 2*1+1: return match_word_1errs((ulong *) v_table, patlen, text, len); case 2*2+0: return match_2errs((ulong *) v_table, patlen, text, len); case 2*2+1: return match_word_2errs((ulong *) v_table, patlen, text, len); case 2*3+0: return match_3errs((ulong *) v_table, patlen, text, len); case 2*3+1: return match_word_3errs((ulong *) v_table, patlen, text, len); } } return match_general((ulong *) v_table, patlen, Long_val(v_nerrs), Int_val(v_wholeword), text, len); } CAMLprim value caml_agrep_match_bytecode(value * argv, int argn) { return caml_agrep_match(argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6]); } ocamlagrep-1.0/skeleton.h0100644004340400511750000000477707426730536015115 0ustar xleroyformel/***********************************************************************/ /* */ /* The "agrep" library for Objective Caml */ /* */ /* Xavier Leroy, projet Cristal, INRIA Rocquencourt */ /* */ /* Copyright 2002 Institut National de Recherche en Informatique et */ /* en Automatique. All rights reserved. This file is distributed */ /* under the terms of the GNU Library General Public License. */ /* */ /***********************************************************************/ /* $Id: skeleton.h,v 1.2 2002/02/02 09:29:02 xleroy Exp $ */ static value FUNCTION_NAME(ulong * table, uint pattern_length, uchar * text, mlsize_t length) { ulong R0; #if NERRS >= 1 ulong R1, R0before; #endif #if NERRS >= 2 ulong R2, R1before; #endif #if NERRS == 3 ulong R3, R2before; #endif ulong Found, Ssharp; #ifdef WHOLE_WORD ulong word_boundary; #endif Ssharp = table[256]; Found = 1UL << pattern_length; R0 = 1; #if NERRS >= 1 R1 = 3; #endif #if NERRS >= 2 R2 = 7; #endif #if NERRS == 3 R3 = 0xF; #endif for (/*nothing*/; length > 0; length--, text++) { ulong S = table[*text]; #if NERRS >= 1 R0before = R0; #endif #if NERRS >= 2 R1before = R1; #endif #if NERRS == 3 R2before = R2; #endif #ifdef WHOLE_WORD word_boundary = word_constituent[text[0]] ^ word_constituent[text[1]]; # define MATCH_EMPTY word_boundary #else # define MATCH_EMPTY 1 #endif R0 = ((R0 & S) << 1) | (R0 & Ssharp) | MATCH_EMPTY; #if NERRS >= 1 R1 = (((R1 & S) | R0before | R0) << 1) | R0before | (R1 & Ssharp) | MATCH_EMPTY; #endif #if NERRS >= 2 R2 = (((R2 & S) | R1before | R1) << 1) | R1before | (R2 & Ssharp) | MATCH_EMPTY; #endif #if NERRS == 3 R3 = (((R3 & S) | R2before | R2) << 1) | R2before | (R3 & Ssharp) | MATCH_EMPTY; #endif if ((R0 & Found) && MATCH_EMPTY) return Val_int(0); #if NERRS >= 1 if ((R1 & Found) && MATCH_EMPTY) return Val_int(1); #endif #if NERRS >= 2 if ((R2 & Found) && MATCH_EMPTY) return Val_int(2); #endif #if NERRS == 3 if ((R3 & Found) && MATCH_EMPTY) return Val_int(3); #endif } return Val_long(CAML_MAX_INT); } #undef MATCH_EMPTY ocamlagrep-1.0/testagrep.ml0100644004340400511750000000310007427523312015414 0ustar xleroyformel(* Test harness *) open Printf let numerrs = ref 0 and wholeword = ref false and verbatim = ref false and fromfile = ref false let do_search p s = if !fromfile then begin let ic = open_in s in begin try while true do let l = input_line ic in let n = Agrep.errors_substring_match p ~numerrs:!numerrs ~wholeword:!wholeword l ~pos:0 ~len:(String.length l) in if n < max_int then printf "%d %s\n" n l done with End_of_file -> () end; close_in ic end else begin let n = Agrep.errors_substring_match p ~numerrs:!numerrs ~wholeword:!wholeword s ~pos:0 ~len:(String.length s) in if n = max_int then printf "No match" else printf "Match, with %d error(s)" n; print_newline() end let _ = let pattern = ref None in Arg.parse ["-1", Arg.Unit(fun () -> numerrs := 1), " one error"; "-2", Arg.Unit(fun () -> numerrs := 2), " two error"; "-3", Arg.Unit(fun () -> numerrs := 3), " three error"; "-e", Arg.Int(fun n -> numerrs := n), " n errors"; "-f", Arg.Set fromfile, " search in given file rather than in string"; "-w", Arg.Set wholeword, " match entire words"; "-v", Arg.Set verbatim, " match string verbatim (no special chars)"] (fun s -> match !pattern with None -> pattern := Some(if !verbatim then Agrep.pattern_string s else Agrep.pattern s) | Some p -> do_search p s) "Usage: testagrep [options] \nOptions are:"