pax_global_header00006660000000000000000000000064134060000220014475gustar00rootroot0000000000000052 comment=9ed2e0373376b3852ef7d1ee79707965408fd5e6 ulex-1.2-camlp5/000077500000000000000000000000001340600002200134735ustar00rootroot00000000000000ulex-1.2-camlp5/CHANGES000066400000000000000000000021751340600002200144730ustar00rootroot00000000000000-camlp5 version * Backported from camlp4 to camlp5 1.1 * Generate (more) globally unique identifiers to avoid conflicts when open'ing another module processed by ulex (issue reported by Gerd Stolpmann) 1.0 * Update to the new Camlp4 and to ocamlbuild (release for OCaml 3.10 only), by Nicolas Pouillard. 0.8 * Really make it work with OCaml 3.09. * Support for Utf-16. 0.7 released May 24 2005 * Bug fixes * Update to OCaml 3.09 (currently CVS). Still works with OCaml 3.08. * MIT-like license (used to LGPL) 0.5 release Jul. 8 2004 * Document how to use a custom implementation for lex buffers * Update to OCaml 3.08 0.4 released Jan. 10 2004 * Bug fix (accept 1114111 as valid Unicode code point) * Add the rollback function 0.3 released Oct. 8 2003 * Bug fix * Add a new predefined class for ISO identifiers 0.2 released Sep. 22 2003 * Changed the names of predefined regexp * Fix max_code = 0x10ffff * Lexers that changes encoding on the fly * Documentation of the interface Ulexing 0.1 released Sep. 20 2003 * Initial release ulex-1.2-camlp5/LICENSE000066400000000000000000000021461340600002200145030ustar00rootroot00000000000000The package ulex is released under the terms of an MIT-like license. Copyright 2005 by Alain Frisch. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ulex-1.2-camlp5/META000066400000000000000000000003551340600002200141470ustar00rootroot00000000000000version = "1.2" requires = "camlp5" description = "Runtime support for ulex" archive(byte) = "ulexing.cma" archive(native) = "ulexing.cmxa" archive(syntax,toploop) = "pa_ulex.cma ulexing.cma" archive(syntax,preprocessor) = "pa_ulex.cma" ulex-1.2-camlp5/Makefile000066400000000000000000000021361340600002200151350ustar00rootroot00000000000000ALL=pa_ulex.cma ulexing.cma OCAMLBUILD=ocamlbuild -byte-plugin -use-ocamlfind all:: $(OCAMLBUILD) $(ALL) all.opt:: $(OCAMLBUILD) $(ALL) $(ALL:.cma=.cmxa) MODS=ulexing utf16 utf8 install: all cd _build && $(MAKE) -f ../Makefile realinstall realinstall: ocamlfind install ulex-camlp5 ../META $(wildcard $(MODS:=.mli) $(MODS:=.cmi) $(MODS:=.cmx) pa_ulex.cma ulexing.a ulexing.cma ulexing.cmxa) uninstall: ocamlfind remove ulex-camlp5 clean: $(OCAMLBUILD) -clean rm -f *~ *.html *.css *.tar.gz view_test: all camlp5o pr_o.cmo ./_build/pa_ulex.cma test.ml run_test: $(OCAMLBUILD) test.byte ./test.byte custom_ulexing.byte: $(OCAMLBUILD) custom_ulexing.byte doc: ocamldoc -html ulexing.mli VERSION := $$(opam query --version) NAME_VERSION := $$(opam query --name-version) ARCHIVE := $$(opam query --archive) release: git tag -a v$(VERSION) -m "Version $(VERSION)." git push origin v$(VERSION) opam publish prepare $(NAME_VERSION) $(ARCHIVE) cp -t $(NAME_VERSION) descr grep -Ev '^(name|version):' opam >$(NAME_VERSION)/opam opam publish submit $(NAME_VERSION) rm -rf $(NAME_VERSION) ulex-1.2-camlp5/README000066400000000000000000000123131340600002200143530ustar00rootroot00000000000000Ulex-camlp5: a OCaml lexer generator for Unicode compiled with camlp5 License: Copyright (C) 2003, 2004, 2005, 2006 Alain Frisch distributed under the terms of an MIT-like license : see LICENSE Author: email: Alain.Frisch@inria.fr web: http://www.eleves.ens.fr/home/frisch, http://www.cduce.org -------------------------------------------------------------------------- Overview - ulex-camlp5 is a version of ulex compile with camlp5. - ulex is a lexer generator. - it is implemented as an OCaml syntax extension: lexer specifications are embedded in regular OCaml code. - the lexers work with a new kind of "lexbuf" that supports Unicode; a single lexer can work with arbitrary encodings of the input stream. -------------------------------------------------------------------------- Lexer specifications ulex adds a new kind of expression to OCaml: lexer definitions. The syntax for the new construction is: lexer R1 -> e1 | R2 -> e2 ... | Rn -> en where the Ri are regular expressions and the ei are OCaml expressions (called actions). The keyword lexer can optionally be followed by a vertical var. The type of this expression if Ulexing.lexbuf -> t where t is the common type of all the actions. Unlike ocamllex, lexers work on stream of Unicode code points, not bytes. The actions have access to a variable "lexbuf", of type Ulexing.lexbuf. They can call function from the Ulexing module to extract (parts of) the matched lexeme, in the desired encoding. It is legal to define mutually recursive lexers with additional arguments: let rec lex1 x y = lexer 'a' -> lex2 0 1 lexbuf | _ -> ... and lex2 a b = lexer ... The syntax of regular expressions is derived from ocamllex. Additional features: - integer literals, where character literal are expected. They represent a Unicode code point. E.g.: [ 'a'-'z' 1000-1500 ] 65 - inside square brackets, a string represents the union of all its characters Note: OCaml source files are supposed to be encoded in Latin1. It is possible to define named regular expressions with the following construction, that can appear in place of of structure item: let regexp n = R where n is the regexp name to be defined. Because ulex is implemented as a syntax extension, it can deal with both original and revised syntax (and possibly others). Note that lexer specifications need not be named. Here is an example: (lexer ("#!" [^ '\n']* "\n")? -> ()) lexbuf -------------------------------------------------------------------------- Scoping rules for regular expressions declarations Regexp declarations look pretty-much like regular OCaml let-bindings. However, they can only be used as structure item (no local binding let regexp n = R in ...). Moreover, they don't respect OCaml scoping rule. Indeed, the lexical scope of a given "let regexp" is the whole source file following the declaration. Also the regexps names are not exported by a module, and you cannot use qualified names A.n (where n is a regexp defined in module A). -------------------------------------------------------------------------- Predefined regexps ulex provides a set of predefined regexps: - eof: the virtual end-of-file character - xml_letter, xml_digit, xml_extender, xml_base_char, xml_ideographic, xml_combining_char, xml_blank: as defined by the XML recommandation - tr8876_ident_char: characters names in identifiers from ISO TR8876 -------------------------------------------------------------------------- Running a lexer To run a lexer, you must call it on a Ulexing.lexbuf. Such an object represents a Unicode buffer. It can be created from Latin1-encoded or utf8-encoded strings, stream, or channels, or from integer arrays or streams (which represent Unicode code points). See the interface of the module Ulexing. There is also some support for parsing Utf-16 encoded streams and manipulating utf16 strings. See the interface of the module Utf16. It is possible to work with a custom implementation for lex buffers. To do this, you just have to ensure that a module called Ulexing is in scope of your lexer specifications. See custom_ulexing.ml in the distribution for an example, and the interface of Ulexing for a specification of what the module should export. -------------------------------------------------------------------------- Using ulex The first thing to do is to compile and install ulex. You need recent versions of OCaml. make all make all.opt (* optional *) 1. With findlib If you have findlib, you can use it to install and use ulex. The name of the findlib package is "ulex". Installation: make install Compilation of OCaml files with lexer specifications: ocamlfind ocamlc -c -package ulex -syntax camlp5o my_file.ml When linking, you must also include the ulex package: ocamlfind ocamlc -o my_prog -linkpkg -package ulex my_file.cmo 2. Without findlib You can use ulex without findlib. To compile, you need to run the source file through the Camlp5 syntax extension pa_ulex.cma. Moreover, you need to link the application with the runtime support library for ulex (ulexing.cma / ulexing.cmxa). -------------------------------------------------------------------------- Acknowledgments Thanks to Benus Becker for contributing an implementation of Utf16. ulex-1.2-camlp5/_tags000066400000000000000000000001441340600002200145120ustar00rootroot00000000000000"pa_ulex.ml": package(camlp5), camlp5orf or "custom_ulexing.ml": use_ulex # true: use_ulex ulex-1.2-camlp5/cset.ml000066400000000000000000000252611340600002200147710ustar00rootroot00000000000000(* Character sets are represented as lists of intervals. The intervals must be non-overlapping and not collapsable, and the list must be ordered in increasing order. *) type t = (int * int) list let max_code = 0x10ffff (* must be < max_int *) let min_code = -1 let empty = [] let singleton i = [i,i] let is_empty = function [] -> true | _ -> false let interval i j = if i <= j then [i,j] else [j,i] let eof = singleton (-1) let any = interval 0 max_code let print ppf l = Format.fprintf ppf "[ "; List.iter (fun (i,j) -> Format.fprintf ppf "%i-%i " i j) l; Format.fprintf ppf "]" let dump l = print Format.std_formatter l let rec union c1 c2 = match c1,c2 with | [], _ -> c2 | _, [] -> c1 | ((i1,j1) as s1)::r1, (i2,j2)::r2 -> if (i1 <= i2) then if j1 + 1 < i2 then s1::(union r1 c2) else if (j1 < j2) then union r1 ((i1,j2)::r2) else union c1 r2 else union c2 c1 let complement c = let rec aux start = function | [] -> if start <= max_code then [start,max_code] else [] | (i,j)::l -> (start,i-1)::(aux (succ j) l) in match c with | (-1,j)::l -> aux (succ j) l | l -> aux (-1) l let intersection c1 c2 = complement (union (complement c1) (complement c2)) let difference c1 c2 = complement (union (complement c1) c2) (* Unicode classes from XML *) let base_char = [ 0x0041,0x005A; 0x0061,0x007A; 0x00C0,0x00D6; 0x00D8,0x00F6; 0x00F8,0x00FF; 0x0100,0x0131; 0x0134,0x013E; 0x0141,0x0148; 0x014A,0x017E; 0x0180,0x01C3; 0x01CD,0x01F0; 0x01F4,0x01F5; 0x01FA,0x0217; 0x0250,0x02A8; 0x02BB,0x02C1; 0x0386,0x0386; 0x0388,0x038A; 0x038C,0x038C; 0x038E,0x03A1; 0x03A3,0x03CE; 0x03D0,0x03D6; 0x03DA,0x03DA; 0x03DC,0x03DC; 0x03DE,0x03DE; 0x03E0,0x03E0; 0x03E2,0x03F3; 0x0401,0x040C; 0x040E,0x044F; 0x0451,0x045C; 0x045E,0x0481; 0x0490,0x04C4; 0x04C7,0x04C8; 0x04CB,0x04CC; 0x04D0,0x04EB; 0x04EE,0x04F5; 0x04F8,0x04F9; 0x0531,0x0556; 0x0559,0x0559; 0x0561,0x0586; 0x05D0,0x05EA; 0x05F0,0x05F2; 0x0621,0x063A; 0x0641,0x064A; 0x0671,0x06B7; 0x06BA,0x06BE; 0x06C0,0x06CE; 0x06D0,0x06D3; 0x06D5,0x06D5; 0x06E5,0x06E6; 0x0905,0x0939; 0x093D,0x093D; 0x0958,0x0961; 0x0985,0x098C; 0x098F,0x0990; 0x0993,0x09A8; 0x09AA,0x09B0; 0x09B2,0x09B2; 0x09B6,0x09B9; 0x09DC,0x09DD; 0x09DF,0x09E1; 0x09F0,0x09F1; 0x0A05,0x0A0A; 0x0A0F,0x0A10; 0x0A13,0x0A28; 0x0A2A,0x0A30; 0x0A32,0x0A33; 0x0A35,0x0A36; 0x0A38,0x0A39; 0x0A59,0x0A5C; 0x0A5E,0x0A5E; 0x0A72,0x0A74; 0x0A85,0x0A8B; 0x0A8D,0x0A8D; 0x0A8F,0x0A91; 0x0A93,0x0AA8; 0x0AAA,0x0AB0; 0x0AB2,0x0AB3; 0x0AB5,0x0AB9; 0x0ABD,0x0ABD; 0x0AE0,0x0AE0; 0x0B05,0x0B0C; 0x0B0F,0x0B10; 0x0B13,0x0B28; 0x0B2A,0x0B30; 0x0B32,0x0B33; 0x0B36,0x0B39; 0x0B3D,0x0B3D; 0x0B5C,0x0B5D; 0x0B5F,0x0B61; 0x0B85,0x0B8A; 0x0B8E,0x0B90; 0x0B92,0x0B95; 0x0B99,0x0B9A; 0x0B9C,0x0B9C; 0x0B9E,0x0B9F; 0x0BA3,0x0BA4; 0x0BA8,0x0BAA; 0x0BAE,0x0BB5; 0x0BB7,0x0BB9; 0x0C05,0x0C0C; 0x0C0E,0x0C10; 0x0C12,0x0C28; 0x0C2A,0x0C33; 0x0C35,0x0C39; 0x0C60,0x0C61; 0x0C85,0x0C8C; 0x0C8E,0x0C90; 0x0C92,0x0CA8; 0x0CAA,0x0CB3; 0x0CB5,0x0CB9; 0x0CDE,0x0CDE; 0x0CE0,0x0CE1; 0x0D05,0x0D0C; 0x0D0E,0x0D10; 0x0D12,0x0D28; 0x0D2A,0x0D39; 0x0D60,0x0D61; 0x0E01,0x0E2E; 0x0E30,0x0E30; 0x0E32,0x0E33; 0x0E40,0x0E45; 0x0E81,0x0E82; 0x0E84,0x0E84; 0x0E87,0x0E88; 0x0E8A,0x0E8A; 0x0E8D,0x0E8D; 0x0E94,0x0E97; 0x0E99,0x0E9F; 0x0EA1,0x0EA3; 0x0EA5,0x0EA5; 0x0EA7,0x0EA7; 0x0EAA,0x0EAB; 0x0EAD,0x0EAE; 0x0EB0,0x0EB0; 0x0EB2,0x0EB3; 0x0EBD,0x0EBD; 0x0EC0,0x0EC4; 0x0F40,0x0F47; 0x0F49,0x0F69; 0x10A0,0x10C5; 0x10D0,0x10F6; 0x1100,0x1100; 0x1102,0x1103; 0x1105,0x1107; 0x1109,0x1109; 0x110B,0x110C; 0x110E,0x1112; 0x113C,0x113C; 0x113E,0x113E; 0x1140,0x1140; 0x114C,0x114C; 0x114E,0x114E; 0x1150,0x1150; 0x1154,0x1155; 0x1159,0x1159; 0x115F,0x1161; 0x1163,0x1163; 0x1165,0x1165; 0x1167,0x1167; 0x1169,0x1169; 0x116D,0x116E; 0x1172,0x1173; 0x1175,0x1175; 0x119E,0x119E; 0x11A8,0x11A8; 0x11AB,0x11AB; 0x11AE,0x11AF; 0x11B7,0x11B8; 0x11BA,0x11BA; 0x11BC,0x11C2; 0x11EB,0x11EB; 0x11F0,0x11F0; 0x11F9,0x11F9; 0x1E00,0x1E9B; 0x1EA0,0x1EF9; 0x1F00,0x1F15; 0x1F18,0x1F1D; 0x1F20,0x1F45; 0x1F48,0x1F4D; 0x1F50,0x1F57; 0x1F59,0x1F59; 0x1F5B,0x1F5B; 0x1F5D,0x1F5D; 0x1F5F,0x1F7D; 0x1F80,0x1FB4; 0x1FB6,0x1FBC; 0x1FBE,0x1FBE; 0x1FC2,0x1FC4; 0x1FC6,0x1FCC; 0x1FD0,0x1FD3; 0x1FD6,0x1FDB; 0x1FE0,0x1FEC; 0x1FF2,0x1FF4; 0x1FF6,0x1FFC; 0x2126,0x2126; 0x212A,0x212B; 0x212E,0x212E; 0x2180,0x2182; 0x3041,0x3094; 0x30A1,0x30FA; 0x3105,0x312C; 0xAC00,0xD7A3 ] let ideographic = [ 0x3007,0x3007; 0x3021,0x3029; 0x4E00,0x9FA5 ] let combining_char = [ 0x0300,0x0345; 0x0360,0x0361; 0x0483,0x0486; 0x0591,0x05A1; 0x05A3,0x05B9; 0x05BB,0x05BD; 0x05BF,0x05BF; 0x05C1,0x05C2; 0x05C4,0x05C4; 0x064B,0x0652; 0x0670,0x0670; 0x06D6,0x06DC; 0x06DD,0x06DF; 0x06E0,0x06E4; 0x06E7,0x06E8; 0x06EA,0x06ED; 0x0901,0x0903; 0x093C,0x093C; 0x093E,0x094C; 0x094D,0x094D; 0x0951,0x0954; 0x0962,0x0963; 0x0981,0x0983; 0x09BC,0x09BC; 0x09BE,0x09BE; 0x09BF,0x09BF; 0x09C0,0x09C4; 0x09C7,0x09C8; 0x09CB,0x09CD; 0x09D7,0x09D7; 0x09E2,0x09E3; 0x0A02,0x0A02; 0x0A3C,0x0A3C; 0x0A3E,0x0A3E; 0x0A3F,0x0A3F; 0x0A40,0x0A42; 0x0A47,0x0A48; 0x0A4B,0x0A4D; 0x0A70,0x0A71; 0x0A81,0x0A83; 0x0ABC,0x0ABC; 0x0ABE,0x0AC5; 0x0AC7,0x0AC9; 0x0ACB,0x0ACD; 0x0B01,0x0B03; 0x0B3C,0x0B3C; 0x0B3E,0x0B43; 0x0B47,0x0B48; 0x0B4B,0x0B4D; 0x0B56,0x0B57; 0x0B82,0x0B83; 0x0BBE,0x0BC2; 0x0BC6,0x0BC8; 0x0BCA,0x0BCD; 0x0BD7,0x0BD7; 0x0C01,0x0C03; 0x0C3E,0x0C44; 0x0C46,0x0C48; 0x0C4A,0x0C4D; 0x0C55,0x0C56; 0x0C82,0x0C83; 0x0CBE,0x0CC4; 0x0CC6,0x0CC8; 0x0CCA,0x0CCD; 0x0CD5,0x0CD6; 0x0D02,0x0D03; 0x0D3E,0x0D43; 0x0D46,0x0D48; 0x0D4A,0x0D4D; 0x0D57,0x0D57; 0x0E31,0x0E31; 0x0E34,0x0E3A; 0x0E47,0x0E4E; 0x0EB1,0x0EB1; 0x0EB4,0x0EB9; 0x0EBB,0x0EBC; 0x0EC8,0x0ECD; 0x0F18,0x0F19; 0x0F35,0x0F35; 0x0F37,0x0F37; 0x0F39,0x0F39; 0x0F3E,0x0F3E; 0x0F3F,0x0F3F; 0x0F71,0x0F84; 0x0F86,0x0F8B; 0x0F90,0x0F95; 0x0F97,0x0F97; 0x0F99,0x0FAD; 0x0FB1,0x0FB7; 0x0FB9,0x0FB9; 0x20D0,0x20DC; 0x20E1,0x20E1; 0x302A,0x302F; 0x3099,0x3099; 0x309A,0x309A ] let digit = [ 0x0030,0x0039; 0x0660,0x0669; 0x06F0,0x06F9; 0x0966,0x096F; 0x09E6,0x09EF; 0x0A66,0x0A6F; 0x0AE6,0x0AEF; 0x0B66,0x0B6F; 0x0BE7,0x0BEF; 0x0C66,0x0C6F; 0x0CE6,0x0CEF; 0x0D66,0x0D6F; 0x0E50,0x0E59; 0x0ED0,0x0ED9; 0x0F20,0x0F29 ] let extender = [ 0x00B7,0x00B7; 0x02D0,0x02D1; 0x0387,0x0387; 0x0640,0x0640; 0x0E46,0x0E46; 0x0EC6,0x0EC6; 0x3005,0x3005; 0x3031,0x3035; 0x309D,0x309E; 0x30FC,0x30FE ] let blank = [ 0x0009,0x000A; 0x000D,0x000D; 0x0020,0x0020 ] let letter = union base_char ideographic (* Letters to be used in identifiers, as specified by ISO .... Data provided by John M. Skaller *) let tr8876_ident_char = [ (* ASCII *) (0x0041,0x005a); (0x0061,0x007a); (* Latin *) (0x00c0,0x00d6); (0x00d8,0x00f6); (0x00f8,0x01f5); (0x01fa,0x0217); (0x0250,0x02a8); (* Greek *) (0x0384,0x0384); (0x0388,0x038a); (0x038c,0x038c); (0x038e,0x03a1); (0x03a3,0x03ce); (0x03d0,0x03d6); (0x03da,0x03da); (0x03dc,0x03dc); (0x03de,0x03de); (0x03e0,0x03e0); (0x03e2,0x03f3); (* Cyrillic *) (0x0401,0x040d); (0x040f,0x044f); (0x0451,0x045c); (0x045e,0x0481); (0x0490,0x04c4); (0x04c7,0x04c4); (0x04cb,0x04cc); (0x04d0,0x04eb); (0x04ee,0x04f5); (0x04f8,0x04f9); (* Armenian *) (0x0531,0x0556); (0x0561,0x0587); (0x04d0,0x04eb); (* Hebrew *) (0x05d0,0x05ea); (0x05f0,0x05f4); (* Arabic *) (0x0621,0x063a); (0x0640,0x0652); (0x0670,0x06b7); (0x06ba,0x06be); (0x06c0,0x06ce); (0x06e5,0x06e7); (* Devanagari *) (0x0905,0x0939); (0x0958,0x0962); (* Bengali *) (0x0985,0x098c); (0x098f,0x0990); (0x0993,0x09a8); (0x09aa,0x09b0); (0x09b2,0x09b2); (0x09b6,0x09b9); (0x09dc,0x09dd); (0x09df,0x09e1); (0x09f0,0x09f1); (* Gurmukhi *) (0x0a05,0x0a0a); (0x0a0f,0x0a10); (0x0a13,0x0a28); (0x0a2a,0x0a30); (0x0a32,0x0a33); (0x0a35,0x0a36); (0x0a38,0x0a39); (0x0a59,0x0a5c); (0x0a5e,0x0a5e); (* Gunjarati *) (0x0a85,0x0a8b); (0x0a8d,0x0a8d); (0x0a8f,0x0a91); (0x0a93,0x0aa8); (0x0aaa,0x0ab0); (0x0ab2,0x0ab3); (0x0ab5,0x0ab9); (0x0ae0,0x0ae0); (* Oriya *) (0x0b05,0x0b0c); (0x0b0f,0x0b10); (0x0b13,0x0b28); (0x0b2a,0x0b30); (0x0b32,0x0b33); (0x0b36,0x0b39); (0x0b5c,0x0b5d); (0x0b5f,0x0b61); (* Tamil *) (0x0b85,0x0b8a); (0x0b8e,0x0b90); (0x0b92,0x0b95); (0x0b99,0x0b9a); (0x0b9c,0x0b9c); (0x0b9e,0x0b9f); (0x0ba3,0x0ba4); (0x0ba8,0x0baa); (0x0bae,0x0bb5); (0x0bb7,0x0bb9); (* Telugu *) (0x0c05,0x0c0c); (0x0c0e,0x0c10); (0x0c12,0x0c28); (0x0c2a,0x0c33); (0x0c35,0x0c39); (0x0c60,0x0c61); (* Kannada *) (0x0c85,0x0c8c); (0x0c8e,0x0c90); (0x0c92,0x0ca8); (0x0caa,0x0cb3); (0x0cb5,0x0cb9); (0x0ce0,0x0ce1); (* Malayam *) (0x0d05,0x0d0c); (0x0d0e,0x0d10); (0x0d12,0x0d28); (0x0d2a,0x0d39); (0x0d60,0x0d61); (* Thai *) (0x0e01,0x0e30); (0x0e32,0x0e33); (0x0e40,0x0e46); (0x0e4f,0x0e5b); (* Lao *) (0x0e81,0x0e82); (0x0e84,0x0e84); (0x0e87,0x0e88); (0x0e8a,0x0e8a); (0x0e0d,0x0e0d); (0x0e94,0x0e97); (0x0e99,0x0e9f); (0x0ea1,0x0ea3); (0x0ea5,0x0ea5); (0x0ea7,0x0ea7); (0x0eaa,0x0eab); (0x0ead,0x0eb0); (0x0eb2,0x0eb3); (0x0ebd,0x0ebd); (0x0ec0,0x0ec4); (0x0ec6,0x0ec6); (* Georgian *) (0x10a0,0x10c5); (0x10d0,0x10f6); (* Hangul Jamo *) (0x1100,0x1159); (0x1161,0x11a2); (0x11a8,0x11f9); (0x11d0,0x11f6); (* Latin extensions *) (0x1e00,0x1e9a); (0x1ea0,0x1ef9); (* Greek extended *) (0x1f00,0x1f15); (0x1f18,0x1f1d); (0x1f20,0x1f45); (0x1f48,0x1f4d); (0x1f50,0x1f57); (0x1f59,0x1f59); (0x1f5b,0x1f5b); (0x1f5d,0x1f5d); (0x1f5f,0x1f7d); (0x1f80,0x1fb4); (0x1fb6,0x1fbc); (0x1fc2,0x1fc4); (0x1fc6,0x1fcc); (0x1fd0,0x1fd3); (0x1fd6,0x1fdb); (0x1fe0,0x1fec); (0x1ff2,0x1ff4); (0x1ff6,0x1ffc); (* Hiragana *) (0x3041,0x3094); (0x309b,0x309e); (* Katakana *) (0x30a1,0x30fe); (* Bopmofo *) (0x3105,0x312c); (* CJK Unified Ideographs *) (0x4e00,0x9fa5); (* CJK Compatibility Ideographs *) (0xf900,0xfa2d); (* Arabic Presentation Forms *) (0xfb1f,0xfb36); (0xfb38,0xfb3c); (0xfb3e,0xfb3e); (0xfb40,0xfb41); (0xfb42,0xfb44); (0xfb46,0xfbb1); (0xfbd3,0xfd35); (* Arabic Presentation Forms-A *) (0xfd50,0xfd85); (0xfd92,0xfbc7); (0xfdf0,0xfdfb); (* Arabic Presentation Forms-B *) (0xfe70,0xfe72); (0xfe74,0xfe74); (0xfe76,0xfefc); (* Half width and Fullwidth Forms *) (0xff21,0xff3a); (0xff41,0xff5a); (0xff66,0xffbe); (0xffc2,0xffc7); (0xffca,0xffcf); (0xffd2,0xffd7); (0xffd2,0xffd7); (0xffda,0xffdc) ] ulex-1.2-camlp5/custom_ulexing.ml000066400000000000000000000017471340600002200171030ustar00rootroot00000000000000(* This example shows how to use ulex with a custom implementation for lex buffers. *) module Ulexing = struct exception Error type t = { buf : string; mutable pos : int; mutable mark_pos : int; mutable mark_val : int; mutable start : int; } let from_immutable_string s = { buf = s; pos = 0; mark_pos = 0; mark_val = 0; start = 0 } let start b = b.mark_pos <- b.pos; b.mark_val <- (-1); b.start <- b.pos let mark b i = b.mark_pos <- b.pos; b.mark_val <- i let backtrack b = b.pos <- b.mark_pos; b.mark_val let next b = if b.pos < String.length b.buf then let c = Char.code b.buf.[b.pos] in b.pos <- b.pos + 1; c else (-1) let lexeme b = String.sub b.buf b.start (b.pos - b.start) end let () = let rec split = lexer | ['a'-'z' 'A'-'Z']* -> print_endline (Ulexing.lexeme lexbuf); split lexbuf | eof -> () | _ -> split lexbuf in split (Ulexing.from_immutable_string "Hello, world !") ulex-1.2-camlp5/descr000066400000000000000000000000461340600002200145160ustar00rootroot00000000000000lexer generator for Unicode and OCaml ulex-1.2-camlp5/myocamlbuild.ml000066400000000000000000000005721340600002200165120ustar00rootroot00000000000000open Ocamlbuild_plugin;; open Command;; dispatch begin function | After_rules -> flag ["ocaml"; "pp"; "use_ulex"] (S[A"camlp5o"; A"pa_ulex.cma"]); flag ["ocaml"; "pp"; "camlp5orf"] (S[A"camlp5o"; A"pa_macro.cmo"; A"pa_extend.cmo"; A"q_MLast.cmo"]); dep ["ocaml"; "ocamldep"; "use_ulex"] ["pa_ulex.cma"]; ocaml_lib ~tag_name:"use_ulex" "ulexing"; | _ -> () end;; ulex-1.2-camlp5/opam000066400000000000000000000006651340600002200143610ustar00rootroot00000000000000opam-version: "1.2" name: "ulex-camlp5" version: "1.2" maintainer: "claudio.sacerdoticoen@unibo.it" authors: ["Alain.Frisch@inria.fr"] homepage: "https://github.com/whitequark/ulex" description: "A OCaml lexer generator for Unicode (backported to camlp5)" build: [ [make] [make "all.opt"] ] install: [make "install"] remove: [["ocamlfind" "remove" "ulex"]] depends: [ "base-bytes" "ocamlfind" "camlp5" "ocamlbuild" {build} ] ulex-1.2-camlp5/pa_ulex.ml000066400000000000000000000210021340600002200154550ustar00rootroot00000000000000(*open Camlp5.PreCast*) (*open Syntax*) let loc = Stdpp.dummy_loc (* Named regexp *) let named_regexps = (Hashtbl.create 13 : (string, Ulex.regexp) Hashtbl.t) let () = List.iter (fun (n,c) -> Hashtbl.add named_regexps n (Ulex.chars c)) [ "eof", Cset.eof; "xml_letter", Cset.letter; "xml_digit", Cset.digit; "xml_extender", Cset.extender; "xml_base_char", Cset.base_char; "xml_ideographic", Cset.ideographic; "xml_combining_char", Cset.combining_char; "xml_blank", Cset.blank; "tr8876_ident_char", Cset.tr8876_ident_char; ] (* Decision tree for partitions *) type decision_tree = | Lte of int * decision_tree * decision_tree | Table of int * int array | Return of int let decision l = let l = List.map (fun (a,b,i) -> (a,b,Return i)) l in let rec merge2 = function | (a1,b1,d1) :: (a2,b2,d2) :: rest -> let x = if b1 + 1 = a2 then d2 else Lte (a2 - 1,Return (-1), d2) in (a1,b2, Lte (b1,d1, x)) :: (merge2 rest) | rest -> rest in let rec aux = function | _::_::_ as l -> aux (merge2 l) | [(a,b,d)] -> Lte (a - 1, Return (-1), Lte (b, d, Return (-1))) | _ -> Return (-1) in aux l let limit = 8192 let decision_table l = let rec aux m accu = function | ((a,b,i) as x)::rem when (b < limit && i < 255)-> aux (min a m) (x::accu) rem | rem -> m,accu,rem in match (aux max_int [] l : int * 'a list * 'b list) with | _,[], _ -> decision l | min,((_,max,_)::_ as l1), l2 -> let arr = Array.make (max-min+1) 0 in List.iter (fun (a,b,i) -> for j = a to b do arr.(j-min) <- i + 1 done) l1; Lte (min-1, Return (-1), Lte (max, Table (min,arr), decision l2)) let rec simplify min max = function | Lte (i,yes,no) -> if i >= max then simplify min max yes else if i < min then simplify min max no else Lte (i, simplify min i yes, simplify (i+1) max no) | x -> x let tables = Hashtbl.create 31 let tables_counter = ref 0 let get_tables () = let t = Hashtbl.fold (fun key x accu -> (x,key)::accu) tables [] in Hashtbl.clear tables; t let table_name t = try Hashtbl.find tables t with Not_found -> incr tables_counter; let n = Printf.sprintf "__ulex_table_%i" !tables_counter in Hashtbl.add tables t n; n let output_byte buf b = Buffer.add_char buf '\\'; Buffer.add_char buf (Char.chr(48 + b / 100)); Buffer.add_char buf (Char.chr(48 + (b / 10) mod 10)); Buffer.add_char buf (Char.chr(48 + b mod 10)) let output_byte_array v = let b = Buffer.create (Array.length v * 5) in for i = 0 to Array.length v - 1 do output_byte b (v.(i) land 0xFF); if i land 15 = 15 then Buffer.add_string b "\\\n " done; let s = Buffer.contents b in <:expr< $str:s$ >> let table (n,t) = <:str_item< value $lid:n$ = $output_byte_array t$ >> let partition_name i = Printf.sprintf "__ulex_partition_%i" i let partition (i,p) = let rec gen_tree = function | Lte (i,yes,no) -> <:expr< if (c <= $int: string_of_int i$) then $gen_tree yes$ else $gen_tree no$ >> | Return i -> <:expr< $int: string_of_int i$ >> | Table (offset, t) -> let c = if offset = 0 then <:expr< c >> else <:expr< (c - $int: string_of_int offset$) >> in <:expr< Char.code ($lid: table_name t$.[$c$]) - 1>> in let body = gen_tree (simplify (-1) (Cset.max_code) (decision_table p)) in let f = partition_name i in <:str_item< value $lid:f$ = fun c -> $body$ >> (* Code generation for the automata *) let best_final final = let fin = ref None in Array.iteri (fun i b -> if b && (!fin = None) then fin := Some i) final; !fin let call_state auto state = match auto.(state) with (_,trans,final) -> if Array.length trans = 0 then match best_final final with | Some i -> <:expr< $int:string_of_int i$ >> | None -> assert false else let f = Printf.sprintf "__ulex_state_%i" state in <:expr< $lid:f$ lexbuf >> let vala_None = IFDEF STRICT THEN Ploc.VaVal None ELSE None END let gen_state auto loc i (part,trans,final) = let f = Printf.sprintf "__ulex_state_%i" i in let p = partition_name part in let cases = Array.mapi (fun i j -> <:patt< $int:string_of_int i$ >>, vala_None, call_state auto j ) trans in let cases = Array.to_list cases in let cases = cases @ [<:patt< _ >>, vala_None, <:expr< Ulexing.backtrack lexbuf >>] in let body = <:expr< match ($lid:p$ (Ulexing.next lexbuf)) with [ $list:cases$ ] >> in let ret body = [<:patt< $lid:f$ >>, <:expr< fun lexbuf -> $body$ >>] in match best_final final with | None -> ret body | Some i -> if Array.length trans = 0 then [] else ret <:expr< do { Ulexing.mark lexbuf $int:string_of_int i$; $body$ } >> let gen_definition loc l = let brs = Array.of_list l in let rs = Array.map fst brs in let auto = Ulex.compile rs in let cases = Array.mapi (fun i (_,e) -> <:patt< $int:string_of_int i$ >>, vala_None, e) brs in let cases = Array.to_list cases in let cases = cases @ [<:patt< _ >>, vala_None, <:expr< raise Ulexing.Error >>] in let actions = <:expr< match __ulex_state_0 lexbuf with [ $list:cases$ ] >> in let states = Array.mapi (gen_state auto loc) auto in let states = List.flatten (Array.to_list states) in let body = <:expr< let rec $list:states$ in do { Ulexing.start lexbuf; $actions$ } >> in <:expr< fun lexbuf -> $body$ >> (* Lexer specification parser *) let char s = Char.code (Token.eval_char s) let char_int s = let i = int_of_string s in if (i >=0) && (i <= Cset.max_code) then i else failwith ("Invalid Unicode code point: " ^ s) let regexp_for_string s = let rec aux n = if n = String.length s then Ulex.eps else Ulex.seq (Ulex.chars (Cset.singleton (Char.code s.[n]))) (aux (succ n)) in aux 0 EXTEND GLOBAL: Pcaml.expr Pcaml.str_item; Pcaml.expr: [ [ "lexer"; OPT "|"; l = LIST0 [ r=regexp; "->"; a=Pcaml.expr -> (r,a) ] SEP "|" -> gen_definition loc l ] ]; Pcaml.str_item: [ [ "let"; LIDENT "regexp"; x = LIDENT; "="; r = regexp -> if Hashtbl.mem named_regexps x then Printf.eprintf "pa_ulex (warning): multiple definition of named regexp '%s'\n" x; Hashtbl.add named_regexps x r; <:str_item< declare $list: []$ end >> ] ]; regexp: [ [ r1 = regexp; "|"; r2 = regexp -> Ulex.alt r1 r2 ] | [ r1 = regexp; r2 = regexp -> Ulex.seq r1 r2 ] | [ r1 = regexp; "#"; r2 = regexp -> try Ulex.diff r1 r2 with Not_found -> failwith ("pa_ulex (error): operands of # must be bare character sets") ] | [ r = regexp; "*" -> Ulex.rep r | r = regexp; "+" -> Ulex.plus r | r = regexp; "?" -> Ulex.alt Ulex.eps r | "("; r = regexp; ")" -> r | "_" -> Ulex.chars Cset.any | c = chr -> Ulex.chars (Cset.singleton c) | s = STRING -> regexp_for_string (Token.eval_string loc s) | "["; cc = ch_class; "]" -> Ulex.chars cc | x = LIDENT -> try Hashtbl.find named_regexps x with Not_found -> failwith ("pa_ulex (error): reference to unbound regexp name `"^x^"'") ] ]; chr: [ [ c = CHAR -> char c | i = INT -> char_int i ] ]; ch_class: [ [ "^"; cc = ch_class -> Cset.difference Cset.any cc] | [ c1 = chr; "-"; c2 = chr -> Cset.interval c1 c2 | c = chr -> Cset.singleton c | cc1 = ch_class; cc2 = ch_class -> Cset.union cc1 cc2 | s = STRING -> let s = Token.eval_string loc s in let c = ref Cset.empty in for i = 0 to String.length s - 1 do c := Cset.union !c (Cset.singleton (Char.code s.[i])) done; !c ] ]; END let () = let old_parse_implem = !Pcaml.parse_implem in let new_parse_implem s = let (items,d) = old_parse_implem s in let parts = List.map partition (Ulex.partitions ()) in let tables = List.map table (get_tables ()) in (<:str_item< declare $list:tables@parts$ end >>, loc) :: items, d in Pcaml.parse_implem := new_parse_implem (*let change_ids suffix = object inherit Ast.map as super method ident = function | Ast.IdLid (loc, s) when String.length s > 6 && String.sub s 0 6 = "__ulex" -> Ast.IdLid (loc, s ^ suffix) | i -> i end let () = let first = ref true in AstFilters.register_str_item_filter (fun s -> assert(!first); first := false; let parts = List.map partition (Ulex.partitions ()) in let tables = List.map table (get_tables ()) in let suffix = "__" ^ Digest.to_hex (Digest.string (Marshal.to_string (parts, tables) [])) in (change_ids suffix) # str_item <:str_item< declare $list:tables$ end >> ) (* $list:parts$; $s$ *) *) ulex-1.2-camlp5/ulex.ml000066400000000000000000000067731340600002200150170ustar00rootroot00000000000000(* NFA *) type node = { id : int; mutable eps : node list; mutable trans : (Cset.t * node) list; } (* Compilation regexp -> NFA *) type regexp = node -> node let cur_id = ref 0 let new_node () = incr cur_id; { id = !cur_id; eps = []; trans = [] } let seq r1 r2 succ = r1 (r2 succ) let alt r1 r2 succ = let n = new_node () in n.eps <- [r1 succ; r2 succ]; n let diff r1 r2 = let cset r = match r (new_node ()) with | { eps = []; trans = [cset, _] } -> cset | _ -> raise Not_found in let c1, c2 = cset r1, cset r2 in fun succ -> let n = new_node () in n.trans <- [Cset.difference c1 c2, succ]; n let rep r succ = let n = new_node () in n.eps <- [r n; succ]; n let plus r succ = let n = new_node () in let nr = r n in n.eps <- [nr; succ]; nr let eps succ = succ let chars c succ = let n = new_node () in n.trans <- [c,succ]; n let compile_re re = let final = new_node () in (re final, final) (* Determinization *) type state = node list let rec add_node state node = if List.memq node state then state else add_nodes (node::state) node.eps and add_nodes state nodes = List.fold_left add_node state nodes let transition state = (* Merge transition with the same target *) let rec norm = function | (c1,n1)::((c2,n2)::q as l) -> if n1 == n2 then norm ((Cset.union c1 c2,n1)::q) else (c1,n1)::(norm l) | l -> l in let t = List.concat (List.map (fun n -> n.trans) state) in let t = norm (List.sort (fun (c1,n1) (c2,n2) -> n1.id - n2.id) t) in (* Split char sets so as to make them disjoint *) let rec split (all,t) ((c0 : Cset.t),n0) = let t = [(Cset.difference c0 all, [n0])] @ List.map (fun (c,ns) -> (Cset.intersection c c0, n0::ns)) t @ List.map (fun (c,ns) -> (Cset.difference c c0, ns)) t in (Cset.union all c0, List.filter (fun (c,ns) -> not (Cset.is_empty c)) t) in let (_,t) = List.fold_left split (Cset.empty,[]) t in (* Epsilon closure of targets *) let t = List.map (fun (c,ns) -> (c,add_nodes [] ns)) t in (* Canonical ordering *) let t = Array.of_list t in Array.sort (fun (c1,ns1) (c2,ns2) -> compare c1 c2) t; Array.map fst t, Array.map snd t let find_alloc tbl counter x = try Hashtbl.find tbl x with Not_found -> let i = !counter in incr counter; Hashtbl.add tbl x i; i let part_tbl = Hashtbl.create 31 let part_id = ref 0 let get_part (t : Cset.t array) = find_alloc part_tbl part_id t let compile rs = let rs = Array.map compile_re rs in let counter = ref 0 in let states = Hashtbl.create 31 in let states_def = ref [] in let rec aux state = try Hashtbl.find states state with Not_found -> let i = !counter in incr counter; Hashtbl.add states state i; let (part,targets) = transition state in let part = get_part part in let targets = Array.map aux targets in let finals = Array.map (fun (_,f) -> List.mem f state) rs in states_def := (i, (part,targets,finals)) :: !states_def; i in let init = ref [] in Array.iter (fun (i,_) -> init := add_node !init i) rs; ignore (aux !init); Array.init !counter (fun id -> List.assoc id !states_def) let partitions () = let aux part = let seg = ref [] in Array.iteri (fun i c -> List.iter (fun (a,b) -> seg := (a,b,i) :: !seg) c) part; List.sort (fun (a1,_,_) (a2,_,_) -> compare a1 a2) !seg in let res = ref [] in Hashtbl.iter (fun part i -> res := (i, aux part) :: !res) part_tbl; Hashtbl.clear part_tbl; !res ulex-1.2-camlp5/ulex.mli000066400000000000000000000005321340600002200151530ustar00rootroot00000000000000type regexp val chars: Cset.t -> regexp val seq: regexp -> regexp -> regexp val alt: regexp -> regexp -> regexp val diff: regexp -> regexp -> regexp val rep: regexp -> regexp val plus: regexp -> regexp val eps: regexp val compile: regexp array -> (int * int array * bool array) array val partitions: unit -> (int * (int * int * int) list) list ulex-1.2-camlp5/ulexing.ml000066400000000000000000000116421340600002200155040ustar00rootroot00000000000000exception Error exception InvalidCodepoint of int let eof = -1 (* Absolute position from the beginning of the stream *) type apos = int type lexbuf = { refill: (int array -> int -> int -> int); mutable buf: int array; mutable len: int; (* Number of meaningful char in buffer *) mutable offset: apos; (* Position of the first char in buffer in the input stream *) mutable pos : int; mutable start : int; (* First char we need to keep visible *) mutable marked_pos : int; mutable marked_val : int; mutable finished: bool; } let get_buf lb = lb.buf let get_pos lb = lb.pos let get_start lb = lb.start let chunk_size = 512 let empty_lexbuf = { refill = (fun _ _ _ -> assert false); buf = [| |]; len = 0; offset = 0; pos = 0; start = 0; marked_pos = 0; marked_val = 0; finished = false; } let create f = { empty_lexbuf with refill = f; buf = Array.make chunk_size 0; } let from_stream s = create (fun buf pos len -> try buf.(pos) <- Stream.next s; 1 with Stream.Failure -> 0) let from_latin1_stream s = create (fun buf pos len -> try buf.(pos) <- Char.code (Stream.next s); 1 with Stream.Failure -> 0) let from_utf8_stream s = create (fun buf pos len -> try buf.(pos) <- Utf8.from_stream s; 1 with Stream.Failure -> 0) type enc = Ascii | Latin1 | Utf8 exception MalFormed let from_var_enc_stream enc s = create (fun buf pos len -> try buf.(pos) <- (match !enc with | Ascii -> let c = Char.code (Stream.next s) in if c > 127 then raise (InvalidCodepoint c); c | Latin1 -> Char.code (Stream.next s) | Utf8 -> Utf8.from_stream s); 1 with Stream.Failure -> 0) let from_var_enc_string enc s = from_var_enc_stream enc (Stream.of_string s) let from_var_enc_channel enc ic = from_var_enc_stream enc (Stream.of_channel ic) let from_latin1_string s = let len = String.length s in { empty_lexbuf with buf = Array.init len (fun i -> Char.code s.[i]); len = len; finished = true; } let from_latin1_channel ic = from_latin1_stream (Stream.of_channel ic) let from_utf8_channel ic = from_stream (Utf8.stream_from_char_stream (Stream.of_channel ic)) let from_int_array a = let len = Array.length a in { empty_lexbuf with buf = Array.init len (fun i -> a.(i)); len = len; finished = true; } let from_utf8_string s = from_int_array (Utf8.to_int_array s 0 (String.length s)) let refill lexbuf = if lexbuf.len + chunk_size > Array.length lexbuf.buf then begin let s = lexbuf.start in let ls = lexbuf.len - s in if ls + chunk_size <= Array.length lexbuf.buf then Array.blit lexbuf.buf s lexbuf.buf 0 ls else begin let newlen = (Array.length lexbuf.buf + chunk_size) * 2 in let newbuf = Array.make newlen 0 in Array.blit lexbuf.buf s newbuf 0 ls; lexbuf.buf <- newbuf end; lexbuf.len <- ls; lexbuf.offset <- lexbuf.offset + s; lexbuf.pos <- lexbuf.pos - s; lexbuf.marked_pos <- lexbuf.marked_pos - s; lexbuf.start <- 0 end; let n = lexbuf.refill lexbuf.buf lexbuf.pos chunk_size in if (n = 0) then begin lexbuf.buf.(lexbuf.len) <- eof; lexbuf.len <- lexbuf.len + 1; end else lexbuf.len <- lexbuf.len + n let next lexbuf = let i = if lexbuf.pos = lexbuf.len then if lexbuf.finished then eof else (refill lexbuf; lexbuf.buf.(lexbuf.pos)) else lexbuf.buf.(lexbuf.pos) in if i = eof then lexbuf.finished <- true else lexbuf.pos <- lexbuf.pos + 1; i let start lexbuf = lexbuf.start <- lexbuf.pos; lexbuf.marked_pos <- lexbuf.pos; lexbuf.marked_val <- (-1) let mark lexbuf i = lexbuf.marked_pos <- lexbuf.pos; lexbuf.marked_val <- i let backtrack lexbuf = lexbuf.pos <- lexbuf.marked_pos; lexbuf.marked_val let rollback lexbuf = lexbuf.pos <- lexbuf.start let lexeme_start lexbuf = lexbuf.start + lexbuf.offset let lexeme_end lexbuf = lexbuf.pos + lexbuf.offset let loc lexbuf = (lexbuf.start + lexbuf.offset, lexbuf.pos + lexbuf.offset) let lexeme_length lexbuf = lexbuf.pos - lexbuf.start let sub_lexeme lexbuf pos len = Array.sub lexbuf.buf (lexbuf.start + pos) len let lexeme lexbuf = Array.sub lexbuf.buf (lexbuf.start) (lexbuf.pos - lexbuf.start) let lexeme_char lexbuf pos = lexbuf.buf.(lexbuf.start + pos) let to_latin1 c = if (c >= 0) && (c < 256) then Char.chr c else raise (InvalidCodepoint c) let latin1_lexeme_char lexbuf pos = to_latin1 (lexeme_char lexbuf pos) let latin1_sub_lexeme lexbuf pos len = let s = Bytes.create len in for i = 0 to len - 1 do Bytes.set s i (to_latin1 lexbuf.buf.(lexbuf.start + pos + i)) done; Bytes.to_string s let latin1_lexeme lexbuf = latin1_sub_lexeme lexbuf 0 (lexbuf.pos - lexbuf.start) let utf8_sub_lexeme lexbuf pos len = Utf8.from_int_array lexbuf.buf (lexbuf.start + pos) len let utf8_lexeme lexbuf = utf8_sub_lexeme lexbuf 0 (lexbuf.pos - lexbuf.start) ulex-1.2-camlp5/ulexing.mli000066400000000000000000000204421340600002200156530ustar00rootroot00000000000000(** Runtime support for lexers generated by [ulex]. This module is roughly equivalent to the module Lexing from the OCaml standard library, except that its lexbuffers handles Unicode code points (OCaml type: [int] in the range [0..0x10ffff]) instead of bytes (OCaml type: [char]). It is possible to have ulex-generated lexers work on a custom implementation for lex buffers. To do this, define a module [L] which implements the [start], [next], [mark] and [backtrack] functions (See the Internal Interface section below for a specification), and the [Error] exception. They need not work on a type named [lexbuf]: you can use the type name you want. Then, just do in your ulex-processed source, before the first lexer specification: [module Ulexing = L] Of course, you'll probably want to define functions like [lexeme] to be used in the lexers semantic actions. *) type lexbuf (** The type of lexer buffers. A lexer buffer is the argument passed to the scanning functions defined by the generated lexers. The lexer buffer holds the internal information for the scanners, including the code points of the token currently scanned, its position from the beginning of the input stream, and the current position of the lexer. *) exception Error (** Raised by a lexer when it cannot parse a token from the lexbuf. The functions [Ulexing.lexeme_start] (resp. [Ulexing.lexeme_end]) can be used to find to positions of the first code point of the current matched substring (resp. the first code point that yield the error). *) exception InvalidCodepoint of int (** Raised by some functions to signal that some code point is not compatible with a specified encoding. *) (** {6 Clients interface} *) val create: (int array -> int -> int -> int) -> lexbuf (** Create a generic lexer buffer. When the lexer needs more characters, it will call the given function, giving it an array of integers [a], a position [pos] and a code point count [n]. The function should put [n] code points or less in [a], starting at position [pos], and return the number of characters provided. A return value of 0 means end of input. *) val from_stream: int Stream.t -> lexbuf (** Create a lexbuf from a stream of Unicode code points. *) val from_int_array: int array -> lexbuf (** Create a lexbuf from an array of Unicode code points. *) val from_latin1_stream: char Stream.t -> lexbuf (** Create a lexbuf from a Latin1 encoded stream (ie a stream of Unicode code points in the range [0..255]) *) val from_latin1_channel: in_channel -> lexbuf (** Create a lexbuf from a Latin1 encoded input channel. The client is responsible for closing the channel. *) val from_latin1_string: string -> lexbuf (** Create a lexbuf from a Latin1 encoded string. *) val from_utf8_stream: char Stream.t -> lexbuf (** Create a lexbuf from a UTF-8 encoded stream. *) val from_utf8_channel: in_channel -> lexbuf (** Create a lexbuf from a UTF-8 encoded input channel. *) val from_utf8_string: string -> lexbuf (** Create a lexbuf from a UTF-8 encoded string. *) type enc = Ascii | Latin1 | Utf8 val from_var_enc_stream: enc ref -> char Stream.t -> lexbuf (** Create a lexbuf from a stream whose encoding is subject to change during lexing. The reference can be changed at any point. Note that bytes that have been consumed by the lexer buffer are not re-interpreted with the new encoding. In [Ascii] mode, non-ASCII bytes (ie [>127]) in the stream raise an [InvalidCodepoint] exception. *) val from_var_enc_string: enc ref -> string -> lexbuf (** Same as [Ulexing.from_var_enc_stream] with a string as input. *) val from_var_enc_channel: enc ref -> in_channel -> lexbuf (** Same as [Ulexing.from_var_enc_stream] with a channel as input. *) (** {6 Interface for lexers semantic actions} *) (** The following functions can be called from the semantic actions of lexer definitions. They give access to the character string matched by the regular expression associated with the semantic action. These functions must be applied to the argument [lexbuf], which, in the code generated by [ulex], is bound to the lexer buffer passed to the parsing function. These functions can also be called when capturing a [Ulexing.Error] exception to retrieve the problematic string. *) val lexeme_start: lexbuf -> int (** [Ulexing.lexeme_start lexbuf] returns the offset in the input stream of the first code point of the matched string. The first code point of the stream has offset 0. *) val lexeme_end: lexbuf -> int (** [Ulexing.lexeme_end lexbuf] returns the offset in the input stream of the character following the last code point of the matched string. The first character of the stream has offset 0. *) val loc: lexbuf -> int * int (** [Ulexing.loc lexbuf] returns the pair [(Ulexing.lexeme_start lexbuf,Ulexing.lexeme_end lexbuf)]. *) val lexeme_length: lexbuf -> int (** [Ulexing.loc lexbuf] returns the difference [(Ulexing.lexeme_end lexbuf) - (Ulexing.lexeme_start lexbuf)], that is, the length (in code points) of the matched string. *) val lexeme: lexbuf -> int array (** [Ulexing.lexeme lexbuf] returns the string matched by the regular expression as an array of Unicode code point. *) val get_buf: lexbuf -> int array (** Direct access to the internal buffer. *) val get_start: lexbuf -> int (** Direct access to the starting position of the lexeme in the internal buffer. *) val get_pos: lexbuf -> int (** Direct access to the current position (end of lexeme) in the internal buffer. *) val lexeme_char: lexbuf -> int -> int (** [Ulexing.lexeme_char lexbuf pos] returns code point number [pos] in the matched string. *) val sub_lexeme: lexbuf -> int -> int -> int array (** [Ulexing.lexeme lexbuf pos len] returns a substring of the string matched by the regular expression as an array of Unicode code point. *) val latin1_lexeme: lexbuf -> string (** As [Ulexing.lexeme] with a result encoded in Latin1. This function throws an exception [InvalidCodepoint] if it is not possible to encode the result in Latin1. *) val latin1_sub_lexeme: lexbuf -> int -> int -> string (** As [Ulexing.sub_lexeme] with a result encoded in Latin1. This function throws an exception [InvalidCodepoint] if it is not possible to encode the result in Latin1. *) val latin1_lexeme_char: lexbuf -> int -> char (** As [Ulexing.lexeme_char] with a result encoded in Latin1. This function throws an exception [InvalidCodepoint] if it is not possible to encode the result in Latin1. *) val utf8_lexeme: lexbuf -> string (** As [Ulexing.lexeme] with a result encoded in UTF-8. *) val utf8_sub_lexeme: lexbuf -> int -> int -> string (** As [Ulexing.sub_lexeme] with a result encoded in UTF-8. *) val rollback: lexbuf -> unit (** [Ulexing.rollback lexbuf] puts [lexbuf] back in its configuration before the last lexeme was matched. It is then possible to use another lexer to parse the same characters again. The other functions above in this section should not be used in the semantic action after a call to [Ulexing.rollback]. *) (** {6 Internal interface} *) (** These functions are used internally by the lexers. They could be used to write lexers by hand, or with a lexer generator different from [ulex]. The lexer buffers have a unique internal slot that can store an integer. They also store a "backtrack" position. *) val start: lexbuf -> unit (** [Ulexing.start lexbuf] informs the lexer buffer that any code points until the current position can be discarded. The current position become the "start" position as returned by [Ulexing.lexeme_start]. Moreover, the internal slot is set to [-1] and the backtrack position is set to the current position. *) val next: lexbuf -> int (** [Ulexing.next lexbuf next] extracts the next code point from the lexer buffer and increments to current position. If the input stream is exhausted, the function returns [-1]. *) val mark: lexbuf -> int -> unit (** [Ulexing.mark lexbuf i] stores the integer [i] in the internal slot. The backtrack position is set to the current position. *) val backtrack: lexbuf -> int (** [Ulexing.backtrack lexbuf] returns the value stored in the internal slot of the buffer, and performs backtracking (the current position is set to the value of the backtrack position). *) ulex-1.2-camlp5/utf16.ml000066400000000000000000000104341340600002200147740ustar00rootroot00000000000000 exception MalFormed exception InvalidCodepoint of int type byte_order = Little_endian | Big_endian let get_byte_order c0 c1 = match (Char.code c0, Char.code c1) with | (0xfe,0xff) -> Big_endian | (0xff,0xfe) -> Little_endian | _ -> raise MalFormed let number_of_char_pair bo c1 c2 = match bo with | Little_endian -> ((Char.code c2) lsl 8) + (Char.code c1) | Big_endian -> ((Char.code c1) lsl 8) + (Char.code c2) let char_pair_of_number bo num = match bo with | Little_endian -> (Char.chr (num land 0xFF), Char.chr ((num lsr 8) land 0xFF )) | Big_endian -> (Char.chr ((num lsr 8) land 0xFF), Char.chr (num land 0xFF)) let next_in_string bo s pos bytes = if (pos + 1 >= bytes) then raise MalFormed; number_of_char_pair bo s.[pos] s.[pos+1] let next_code bo s pos bytes = let w1 = next_in_string bo s pos bytes in if w1 = 0xfffe then raise (InvalidCodepoint w1); if w1 < 0xd800 || 0xdfff < w1 then (w1, pos+2) else if w1 <= 0xdbff then let w2 = next_in_string bo s (pos + 2) bytes in if w2 < 0xdc00 || w2 > 0xdfff then raise MalFormed; let upper10 = (w1 land 0x3ff) lsl 10 and lower10 = w2 land 0x3ff in (0x10000 + upper10 + lower10, pos + 4) else raise MalFormed let next_in_stream bo s = let c1 = Stream.next s in let c2 = Stream.next s in number_of_char_pair bo c1 c2 let from_stream bo s w1 = if w1 = 0xfffe then raise (InvalidCodepoint w1); if w1 < 0xd800 || 0xdfff < w1 then w1 else if w1 <= 0xdbff then let w2 = next_in_stream bo s in if w2 < 0xdc00 || w2 > 0xdfff then raise MalFormed; let upper10 = (w1 land 0x3ff) lsl 10 and lower10 = w2 land 0x3ff in 0x10000 + upper10 + lower10 else raise MalFormed let stream_from_char_stream opt_bo s = let bo = ref opt_bo in Stream.from (fun _ -> try let c1 = Stream.next s in let c2 = Stream.next s in let o = match !bo with | Some o -> o | None -> let o = match (Char.code c1, Char.code c2) with | (0xff,0xfe) -> Little_endian | _ -> Big_endian in bo := Some o; o in Some (from_stream o s (number_of_char_pair o c1 c2)) with Stream.Failure -> None) let compute_len opt_bo str pos bytes = let s = stream_from_char_stream opt_bo (Stream.from (fun i -> if i + pos >= bytes then None else Some (str.[i + pos]))) in let l = ref 0 in Stream.iter (fun _ -> incr l) s ; !l let rec blit_to_int opt_bo s spos a apos bytes = let s = stream_from_char_stream opt_bo (Stream.from (fun i -> if i+spos >= bytes then None else Some (s.[i + spos]))) in let p = ref apos in try while true do a.(!p) <- Stream.next s ; incr p done; assert false with Stream.Failure -> () let to_int_array opt_bo s pos bytes = let len = compute_len opt_bo s pos bytes in let a = Array.create len 0 in blit_to_int opt_bo s pos a 0 bytes ; a let store bo buf code = if code < 0x10000 then ( let (c1,c2) = char_pair_of_number bo code in Buffer.add_char buf c1; Buffer.add_char buf c2 ) else ( let u' = code - 0x10000 in let w1 = 0xd800 + (u' lsr 10) and w2 = 0xdc00 + (u' land 0x3ff) in let (c1,c2) = char_pair_of_number bo w1 and (c3,c4) = char_pair_of_number bo w2 in Buffer.add_char buf c1; Buffer.add_char buf c2; Buffer.add_char buf c3; Buffer.add_char buf c4 ) let from_int_array bo a apos len bom = let b = Buffer.create (len * 4) in if bom then store bo b 0xfeff ; (* first, store the BOM *) let rec aux apos len = if len > 0 then (store bo b a.(apos); aux (succ apos) (pred len)) else Buffer.contents b in aux apos len let from_stream bo s = from_stream bo s (next_in_stream bo s) let from_utf16_stream s opt_bo = Ulexing.from_stream (stream_from_char_stream opt_bo s) let from_utf16_channel ic opt_bo = from_utf16_stream ((Stream.of_channel ic)) opt_bo let from_utf16_string s opt_bo = let a = to_int_array opt_bo s 0 (String.length s) in Ulexing.from_int_array a let utf16_sub_lexeme lb pos len bo bom = from_int_array bo (Ulexing.get_buf lb) (Ulexing.get_start lb + pos) len bom let utf16_lexeme lb bo bom = utf16_sub_lexeme lb 0 (Ulexing.get_pos lb - Ulexing.get_start lb) bo bom ulex-1.2-camlp5/utf16.mli000066400000000000000000000102111340600002200151360ustar00rootroot00000000000000 (** UTF-16 support for Ulex. Implementation as described in "http://www.ietf.org/rfc/rfc2781.txt". *) exception MalFormed (** UTF-16 can be encoded in little endian format (0xabcd -> (0xcd|0xab)) or big endian format (0xabcd -> (0xab|0xcd). *) type byte_order = Little_endian | Big_endian (** {6 Interface } *) (** [to_int_array opt_bo str spos bytes] decodes the string [str] of length [bytes] starting in position [spos]. If [opt_bo] matches with [None] the functions tries to detect a BOM, if it can't it assumes big endian byte order. If [opt_bo] matches with [Some bo] byte order [bo] is assumed and potential byte order marks are interpreted as code points 0xfeff. *) val to_int_array: byte_order option -> string -> int -> int -> int array (** [from_int_array bo a apos len bom] encodes an int array [a] containing [len] code points from position [apos] into a string with byte order [bo]. The results starts with a BOM if [bom = true]. *) val from_int_array: byte_order -> int array -> int -> int -> bool -> string (** [stream_from_char_stream opt_stro] creates a new int stream containing the code points encoded in [str]. Treats [opt_bo] as [to_int_array]. *) val stream_from_char_stream: byte_order option -> char Stream.t -> int Stream.t (** {6 Low level} *) (** [get_byte_order c1 c2] determines the byte order by a pair of bytes/characters [c1] and [c2]. *) val get_byte_order: char -> char -> byte_order (** [from_stream bo s] reads the next code point from a stream encoded in byte order [bo]. *) val from_stream: byte_order -> char Stream.t -> int (** [number_of_char_pair bo c1 c2] returns the code point encoded in [c1] and [c2] following byte order [bo]. *) val number_of_char_pair: byte_order -> char -> char -> int (** [char_pair_of_number bo cp] encodes code point [cp] into two characters with byte order [bo]. *) val char_pair_of_number: byte_order -> int -> char * char (** [next_code bo s pos bytes bo] reads the code point starting at position [pos] in a string [s] of total length [bytes]. *) val next_code: byte_order -> string -> int -> int -> int * int (** [compute_len opt_bo str pos len] computes the number of encoded code points in string [str] from position [pos] to [pos+len-1]. *) val compute_len: byte_order option -> string -> int -> int -> int (** [blit_to_int bo str spos a apos n] decode [len] bytes from string [str] starting at position [spos] into array [a], at position [apos]. *) val blit_to_int: byte_order option -> string -> int -> int array -> int -> int -> unit (** [store bo buf cp] adds a codepoint [cp] to a buffer [buf] following the byte order [bo]. *) val store: byte_order -> Buffer.t -> int -> unit val from_utf16_stream: char Stream.t -> byte_order option -> Ulexing.lexbuf (** [from_utf16_stream s opt_bo] creates a lexbuf from an UTF-16 encoded stream. If [opt_bo] matches with [None] the function expects a BOM (Byte Order Mark), and takes the byte order as [Utf16.Big_endian] if it cannot find one. When [opt_bo] matches with [Some bo], [bo] is taken as byte order. In this case a leading BOM is kept in the stream - the lexer has to ignore it and a `wrong' BOM ([0xfffe]) will raise Utf16.InvalidCodepoint. *) val from_utf16_channel: in_channel -> byte_order option-> Ulexing.lexbuf (** Works as [from_utf16_stream] with an [in_channel]. *) val from_utf16_string: string -> byte_order option -> Ulexing.lexbuf (** Works as [from_utf16_stream] with a [string]. *) val utf16_lexeme: Ulexing.lexbuf -> byte_order -> bool -> string (** [utf16_lexeme lb bo bom] as [Ulexing.lexeme] with a result encoded in UTF-16 in byte_order [bo] and starting with a BOM if [bom = true]. *) val utf16_sub_lexeme: Ulexing.lexbuf -> int -> int -> byte_order -> bool -> string (** [utf16_sub_lexeme lb pos len bo bom] as [Ulexing.sub_lexeme] with a result encoded in UTF-16 with byte order [bo] and starting with a BOM if [bom=true] *) ulex-1.2-camlp5/utf8.ml000066400000000000000000000102641340600002200147160ustar00rootroot00000000000000exception MalFormed (* cf http://www.faqs.org/rfcs/rfc3629.html *) let width = Array.make 256 (-1) let () = for i = 0 to 127 do width.(i) <- 1 done; for i = 192 to 223 do width.(i) <- 2 done; for i = 224 to 239 do width.(i) <- 3 done; for i = 240 to 247 do width.(i) <- 4 done let next s i = match s.[i] with | '\000'..'\127' as c -> Char.code c | '\192'..'\223' as c -> let n1 = Char.code c in let n2 = Char.code s.[i+1] in if (n2 lsr 6 != 0b10) then raise MalFormed; ((n1 land 0x1f) lsl 6) lor (n2 land 0x3f) | '\224'..'\239' as c -> let n1 = Char.code c in let n2 = Char.code s.[i+1] in let n3 = Char.code s.[i+2] in if (n2 lsr 6 != 0b10) || (n3 lsr 6 != 0b10) then raise MalFormed; let p = ((n1 land 0x0f) lsl 12) lor ((n2 land 0x3f) lsl 6) lor (n3 land 0x3f) in if (p >= 0xd800) && (p <= 0xdf00) then raise MalFormed; p | '\240'..'\247' as c -> let n1 = Char.code c in let n2 = Char.code s.[i+1] in let n3 = Char.code s.[i+2] in let n4 = Char.code s.[i+3] in if (n2 lsr 6 != 0b10) || (n3 lsr 6 != 0b10) || (n4 lsr 6 != 0b10) then raise MalFormed; ((n1 land 0x07) lsl 18) lor ((n2 land 0x3f) lsl 12) lor ((n3 land 0x3f) lsl 6) lor (n4 land 0x3f) | _ -> raise MalFormed (* With this implementation, a truncated code point will result in Stream.Failure, not in MalFormed. *) let from_stream s = match Stream.next s with | '\000'..'\127' as c -> Char.code c | '\192'..'\223' as c -> let n1 = Char.code c in let n2 = Char.code (Stream.next s) in if (n2 lsr 6 != 0b10) then raise MalFormed; ((n1 land 0x1f) lsl 6) lor (n2 land 0x3f) | '\224'..'\239' as c -> let n1 = Char.code c in let n2 = Char.code (Stream.next s) in let n3 = Char.code (Stream.next s) in if (n2 lsr 6 != 0b10) || (n3 lsr 6 != 0b10) then raise MalFormed; ((n1 land 0x0f) lsl 12) lor ((n2 land 0x3f) lsl 6) lor (n3 land 0x3f) | '\240'..'\247' as c -> let n1 = Char.code c in let n2 = Char.code (Stream.next s) in let n3 = Char.code (Stream.next s) in let n4 = Char.code (Stream.next s) in if (n2 lsr 6 != 0b10) || (n3 lsr 6 != 0b10) || (n4 lsr 6 != 0b10) then raise MalFormed; ((n1 land 0x07) lsl 18) lor ((n2 land 0x3f) lsl 12) lor ((n3 land 0x3f) lsl 6) lor (n4 land 0x3f) | _ -> raise MalFormed let compute_len s pos bytes = let rec aux n i = if i >= pos + bytes then if i = pos + bytes then n else raise MalFormed else let w = width.(Char.code s.[i]) in if w > 0 then aux (succ n) (i + w) else raise MalFormed in aux 0 pos let rec blit_to_int s spos a apos n = if n > 0 then begin a.(apos) <- next s spos; blit_to_int s (spos + width.(Char.code s.[spos])) a (succ apos) (pred n) end let to_int_array s pos bytes = let n = compute_len s pos bytes in let a = Array.make n 0 in blit_to_int s pos a 0 n; a (**************************) let width_code_point p = if p <= 0x7f then 1 else if p <= 0x7ff then 2 else if p <= 0xffff then 3 else if p <= 0x10ffff then 4 else raise MalFormed let store b p = if p <= 0x7f then Buffer.add_char b (Char.chr p) else if p <= 0x7ff then ( Buffer.add_char b (Char.chr (0xc0 lor (p lsr 6))); Buffer.add_char b (Char.chr (0x80 lor (p land 0x3f))) ) else if p <= 0xffff then ( if (p >= 0xd800 && p < 0xe000) then raise MalFormed; Buffer.add_char b (Char.chr (0xe0 lor (p lsr 12))); Buffer.add_char b (Char.chr (0x80 lor ((p lsr 6) land 0x3f))); Buffer.add_char b (Char.chr (0x80 lor (p land 0x3f))) ) else if p <= 0x10ffff then ( Buffer.add_char b (Char.chr (0xf0 lor (p lsr 18))); Buffer.add_char b (Char.chr (0x80 lor ((p lsr 12) land 0x3f))); Buffer.add_char b (Char.chr (0x80 lor ((p lsr 6) land 0x3f))); Buffer.add_char b (Char.chr (0x80 lor (p land 0x3f))) ) else raise MalFormed let from_int_array a apos len = let b = Buffer.create (len * 4) in let rec aux apos len = if len > 0 then (store b a.(apos); aux (succ apos) (pred len)) else Buffer.contents b in aux apos len let stream_from_char_stream s = Stream.from (fun _ -> try Some (from_stream s) with Stream.Failure -> None) ulex-1.2-camlp5/utf8.mli000066400000000000000000000006501340600002200150650ustar00rootroot00000000000000exception MalFormed val width: int array val next: string -> int -> int val compute_len: string -> int -> int -> int val blit_to_int: string -> int -> int array -> int -> int -> unit val to_int_array: string -> int -> int -> int array val store: Buffer.t -> int -> unit val from_int_array: int array -> int -> int -> string val from_stream: char Stream.t -> int val stream_from_char_stream: char Stream.t -> int Stream.t ulex-1.2-camlp5/utf8_test.ml000066400000000000000000000010271340600002200157520ustar00rootroot00000000000000#load "utf8.cmo";; let () = let b = Buffer.create 10 in for i = 0 to 0x10ffff do if (i >= 0xd800) && (i <= 0xdfff) then () else ( (try Utf8.store b i with Utf8.MalFormed -> Printf.eprintf "Conversion failure %x\n" i; exit 1); let s = Buffer.contents b in Buffer.clear b; let j = try Utf8.next s 0 with Utf8.MalFormed -> Printf.eprintf "Deconversion failure %x (%S)\n" i s; exit 1 in if (i != j) then (Printf.eprintf "Conversion/deconversion error. %x->%x\n" i j; exit 1) ) done