cl-regex-1/0042755000175000001440000000000007562750063011443 5ustar mrduserscl-regex-1/license.txt0100755000175000001440000000263307540243102013613 0ustar mrdusersCopyright (c) 2000,2001,2002 Kenneth Michael Parker All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cl-regex-1/regex.translations0100755000175000001440000000024207520370374015211 0ustar mrdusers ;;; change to your own path (setf (logical-pathname-translations "REGEX") '(("REGEX:SYS;*" "d:/lisp/*") ("REGEX:SRC;*" "d:/lisp/regex/*"))) cl-regex-1/regex.system0100755000175000001440000000230207550106714014011 0ustar mrdusers;;; -*- Mode: Lisp; Syntax: ANSI-Common-lisp; Package: CL-USER; Base: 10 -*- (in-package "CL-USER") (load-logical-pathname-translations "REGEX") (mk:defsystem "REGEX" :source-extension "lisp" :source-pathname (translate-logical-pathname "REGEX:SRC;") :components ( (:file "packages") (:file "macs" :depends-on ("packages")) (:file "parser" :depends-on ("packages" "macs")) (:file "optimize" :depends-on ("packages" "macs")) (:file "gen" :depends-on ("packages" "macs")) (:file "closure" :depends-on ("packages" "macs")) ; (:file "expand" :depends-on ("packages" "macs")) (:file "regex" :depends-on ("packages" "macs" "parser" "optimize" "gen" "closure")) (:file "regexp-test-suite" :depends-on ("packages" "regex")) (:file "retest" :depends-on ("packages" "regex" "regexp-test-suite")))) (defun lc-regex () (mk:compile-system "REGEX")) (defun ld-regex () (mk:load-system "REGEX")) cl-regex-1/packages.lisp0100755000175000001440000000667007540241160014106 0ustar mrdusers;;; -*- Mode: Lisp; Syntax: Common-Lisp; Base: 10 -*- (defpackage REGEX #+:Genera (:use COMMON-LISP CLOS) #-:Genera (:use COMMON-LISP) (:export ;; compiler "PARSE-STR" "COMPILE-STR" "COMPILE-EXPR" "MACROEXPAND-REGEX-STR" "MACROEXPAND-REGEX-EXPR" "DEFREGEX" ;; match/scan "MATCH-STR" "SCAN-STR" ;; matcher structure "MATCHER" "MATCHER-P" "MATCHER-MATCHFN" "MATCHER-NUMREGS" ;; Accessors into match registers. Overloaded on string, symbol, ;; matcher, and register array. "REGISTER-MATCHED-P" "REGISTER-START" "REGISTER-END" "MAKE-REGS" ;; housekeeping "CLEAR-PATTERN-CACHE" ;; tweak the syntax, speed, etc. "COMPILE-VERBOSE" "MATCH-SIMPLE-STRINGS-ONLY" "ESCAPE-SPECIAL-CHARS" "FORCE-SAFE-MATCH" "DOT-MATCHES-NEWLINE" "ALLOW-BACKMATCH" "ALLOW-RANGEMATCH" "ALLOW-NONGREEDY-QUANTIFIERS" "ALLOW-NONREGISTER-GROUPS" "REGISTERS-MATCH-RIGHTMOST" ;; regex as sexpr "MAKE-CHAR-NODE" "CHAR-NODE-P" "CHAR-NODE-CHAR" "MAKE-STRING-NODE" "STRING-NODE-P" "STRING-NODE-STRING" "MAKE-BACKMATCH-NODE" "BACKMATCH-NODE-P" "BACKMATCH-NODE-REGNUM" "MAKE-SEQ-NODE-LIST" "MAKE-SEQ-NODE-ARGS" "SEQ-NODE-P" "SEQ-NODE-CHILDREN" "SEQ-NODE-NUMCHILDREN" "SEQ-NODE-CHILD" "MAKE-KLEENE-NODE" "KLEENE-NODE-P" "KLEENE-NODE-GREEDY-P" "KLEENE-NODE-NONGREEDY-P" "KLEENE-NODE-CHILD" "MAKE-PKLEENE-NODE" "PKLEENE-NODE-P" "PKLEENE-NODE-GREEDY-P" "PKLEENE-NODE-NONGREEDY-P" "PKLEENE-NODE-CHILD" "MAKE-OPTIONAL-NODE" "OPTIONAL-NODE-P" "OPTIONAL-NODE-GREEDY-P" "OPTIONAL-NODE-NONGREEDY-P" "OPTIONAL-NODE-CHILD" "MAKE-RANGE-NODE" "RANGE-NODE-P" "RANGE-NODE-GREEDY-P" "RANGE-NODE-NONGREEDY-P" "RANGE-NODE-MIN" "RANGE-NODE-MAX" "RANGE-NODE-CHILD" "MAKE-ALT-NODE-LIST" "MAKE-ALT-NODE-ARGS" "ALT-NODE-P" "ALT-NODE-CHILDREN" "ALT-NODE-NUMCHILDREN" "ALT-NODE-FIRST" "ALT-NODE-SECOND" "ALT-NODE-CHILD" "MAKE-REGISTER-NODE" "REGISTER-NODE-P" "REGISTER-NODE-REGNUM" "REGISTER-NODE-CHILD" "MAKE-REGSTART-NODE" "REGSTART-NODE-P" "REGSTART-NODE-REGNUM" "MAKE-REGEND-NODE" "REGEND-NODE-P" "REGEND-NODE-REGNUM" "MAKE-CHARCLASS-NODE" "CHARCLASS-NODE-P" "CHARCLASS-NODE-NEGATED-P" "CHARCLASS-NODE-CHARS" "MAKE-SPECCLASS-NODE" "SPECCLASS-NODE-P" "SPECCLASS-NODE-NEGATED-P" "SPECCLASS-NODE-CLASS" "MAKE-START-ANCHOR-NODE" "START-ANCHOR-NODE-P" "MAKE-END-ANCHOR-NODE" "END-ANCHOR-NODE-P" "MAKE-ANY" "ANY-P" "MAKE-HOOK-NODE" "HOOK-NODE-P" "HOOK-NODE-FUNCTION" "HOOK-NODE-SYMBOL-P" "HOOK-NODE-FUNCTION-P" "HOOK-NODE-INDEX-P" "MAKE-SUCCESS-NODE" "SUCCESS-NODE-P" "SUCCESS-NODE-RC" ;; special char classes "ALPHA" "UPPER" "LOWER" "DIGIT" "ALNUM" "XDIGIT" "ODIGIT" "PUNCT" "SPACE" )) (defpackage REGEX-TEST (:use COMMON-LISP REGEX)) (defun delete-regex () (delete-package :REGEX-TEST) (delete-package :REGEX)) cl-regex-1/regex.lisp0100755000175000001440000004650707550101470013445 0ustar mrdusers;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: REGEX; Base: 10 -*- (in-package :REGEX) ; ; Rewrite to use parse-tree functions. ; Separate optimization from parsing. ; Add separate canonicalization and rewrite passes. ; ; Partly because I'm about to allow the caller to manipulate ; the parse trees himself, so I can't depend on the parse ; tree being in any particular format. Partly because the ; lexer needs some beefier optimizations that I can easily ; provide in the current ad-hoc scheme. And partly because ; compile-greedy-star/compile-greedy-plus are a mess and ; about to get messier. ; ; Planned organization: ; pass 1. Lexical analysis ; pass 2. Parsing ; pass 3. Canonicalize parse tree (needed because we allow ; program interaction at the parse tree level). Also ; convert (reg n ) to (seq (rstart n) (rend n)). ; Either are valid parse-tree inputs, but the (reg ...) ; syntax is nearly always more convenient, but obscures ; the cps form, complicating some optimizations. ; pass 4. Instruction selection (fast-xxx ops) ; pass 5. CPS conversion to tuple list. ; For incremental use: ; pass 6. Code (closure) generation. ; pass 7. Linking (resolve target labels to functions). ; ; For batch use (i.e. deflexer, defregex et al) ; pass 6: Sexpr generation ; ; Passes 1&2 are coroutines. ; Pass 3 should be iterated until it reaches a fixedpoint. ; Passes 4-7 are currently all rolled into one pass, which is ; a PITA to maintain, and doesn't facilitate the generation of both ; closures and code. ; ; ; Planned optimizations: ; ; Merge ALT heads and tails. ; (alt (seq a A) (seq a B) (seq b A) (seq b B)) ; --> (alt (seq a (alt A B)) (seq b (alt A B)) ; --> (seq (alt (seq a) (seq b)) (alt (seq A) (seq B))) ; DONE. ; ; For n-way ALTs, support merging subsets of the children, and n-way ; branches on leading char. This should be a big win for the lexer. ; (alt (seq a) (seq a c) (seq c)) -> (alt (seq a (alt nil (seq c))) (seq c)) ; --> (guarded-alt (a (alt nil (seq c))) ; (c )) ; Since this involves pushing down alts, it conflicts with the ; alt-merging logic, so we need a way to turn off the merging logic ; temporarily (we need to group char type branches into a sub-alt, ; canonicalize them, then re-run the alt-merge at the higher level ; to re-integrate anything that didn't migrate out. Then at simplify ; time, we look for runs of alt branches that start with char-type ; nodes, and group these into guarded-alts. ; DONE ; ; Merge single-char/char-class alt clauses into char-class. ; (seq (alt (seq a) (seq b)) (alt (seq A) (seq B))) ; --> (seq (charclass "ab") (charclass "AB")) ; DONE. ; ; Merge lists of chars and strings into strings. ; DONE. ; ; Support lists of 2-char classes (common for case-insensitive matches) ; DONE ; ; Eliminate null states in sequences (caused by fully merging ; alt clause heads, and possibly others). ; DONE. ; ; Support fast alt of (|node), (|node), ; (|node), (|node) ; ; Support merging sequences of 1-valued nodes into a specialized matcher ; that takes an array of "match" functions that return either nil or the ; new pos. If any of them fail, the whole sequence fails. ; ; If the child of + is something trivial like char, seq of char, ; any, charclass, specclass, or seq of charclass, leave it as a ; + node and specialize it during the instruction selection pass. ; ; \d \D \w \W \s \S \< \> ; DONE ; ; Match hooks ; DONE (in sexpr form only) ; ; Acceptance functions ; DONE (in sexpr form only) ; ; Additional features: ; Forward lookahead, negative lookahead ; Named captures ; ;;; ;;; Code Generator ;;; (defun compile-expr-to-matcher (parse-tree &optional str) (multiple-value-bind (matchfn numregs simplified-tree) (compile-expr-to-matchfn parse-tree) (cond (*match-simple-strings-only* (make-matcher :simple-string-matchfn matchfn :string-matchfn nil :numregs numregs :matchstr str :matchexpr simplified-tree)) (t (make-matcher :simple-string-matchfn nil :string-matchfn matchfn :numregs numregs :matchstr str :matchexpr simplified-tree))))) (defun compile-expr-to-matchfn (parse-tree &key (simplifyp t)) (let* ((numregs (1+ (compute-max-regnum parse-tree))) (simple-tree (if simplifyp (optimize-regex-tree parse-tree) parse-tree)) (instr-tree (select-instructions simple-tree))) (multiple-value-bind (start-instr cps-instrs) (gen-instr-list instr-tree) (when *regex-compile-verbose* (format t "~&~%Numregs: ~D" numregs) (format t "~&~%Simplified tree:") (pprint simple-tree) (format t "~&~%Instruction tree:") (pprint instr-tree) (format t "~&~%CPS instruction list (start = ~D):" start-instr) (pprint cps-instrs)) (let ((closure-info (gen-closures cps-instrs))) (link-closures closure-info) (let ((matchfn (make-init-closure (remove-if #'null (map 'list #'closure-info-initfn closure-info)) (resolve-instr closure-info start-instr)))) (values (make-anchored-matcher matchfn) numregs simple-tree)))))) (defun make-anchored-matcher (matchfn) #'(lambda (*str* *regs* *start* *end* *start-is-anchor* *end-is-anchor* *acceptfn* *hooks*) #-:debug-regex (declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (special *str* *regs* *start* *end* *start-is-anchor* *end-is-anchor* *acceptfn* *hooks*)) (declare (ftype (function (fixnum) t) matchfn)) (catch 'cease-matching (funcall matchfn *start*)))) ;;; ;;; code-expansion version... ;;; ;(defun expand-anchored-matcher (matchfn vars funs) ; `(lambda (%str% %regs% %start% %end% ; %start-is-anchor% %end-is-anchor% ; %acceptfn% %hooks%) ; #-:debug-regex (declare (optimize (speed 3) (safety 1) (space 0) (debug 0))) ; (declare (ignorable %str% %regs% %start% %end% ; %start-is-anchor% %end-is-anchor% ; %acceptfn% %hooks%)) ; (declare (special %str% %regs% %start% %end% ; %start-is-anchor% %end-is-anchor% ; %acceptfn% %hooks%)) ; `(let ,vars ; (labels ,funs ; ;;(declare (inline ,@(mapcar #'first funs))) ; (catch 'cease-matching ; (,matchfn %start%)))))) ;(defun expand-anchored-matcher (matchfn vars funs) ; `(locally ; (declare (special *str* *regs* *start* *end* ; *start-is-anchor* *end-is-anchor* ; *acceptfn* *hooks*)) ; (labels ,funs ; #'(lambda (*str* *regs* *start* *end* ; *start-is-anchor* *end-is-anchor* ; *acceptfn* *hooks*) ; (declare (special *str* *regs* *start* *end* ; *start-is-anchor* *end-is-anchor* ; *acceptfn* *hooks*)) ; (let ,vars ; #-:debug-regex (declare (optimize (speed 3) (safety 1) (space 0) (debug 0))) ; (catch 'cease-matching ; (,matchfn *start*))))))) ;(defun expand-expr-to-matchfn (parse-tree &key charfn (simplifyp t)) ; (let* ((numregs (1+ (compute-max-regnum parse-tree))) ; (simple-tree (if simplifyp ; (optimize-regex-tree parse-tree) ; parse-tree)) ; (instr-tree (select-instructions simple-tree))) ; (multiple-value-bind (start-instr cps-instrs) ; (gen-instr-list instr-tree) ; (when *regex-compile-verbose* ; (format t "~&~%Numregs: ~D" numregs) ; (format t "~&~%Simplified tree:") ; (pprint simple-tree) ; (format t "~&~%Instruction tree:") ; (pprint instr-tree) ; (format t "~&~%CPS instruction list (start = ~D):" start-instr) ; (pprint cps-instrs)) ; (let ((code-info (expand-code cps-instrs charfn))) ; (let ((matchfn ; (expand-anchored-matcher ; (fnname-for-state start-instr) ; (remove-if #'null (mapcar #'second code-info)) ; (mapcar #'(lambda (inf) ; `(,(first inf) ,@(rest (third inf)))) ; code-info)))) ; (values matchfn numregs simple-tree)))))) ; search the parse tree, looking for the highest register in use. (defun compute-max-regnum (node) (cond ((seq-node-p node) (reduce #'max (mapcar #'compute-max-regnum (seq-node-children node)) :initial-value -1)) ((alt-node-p node) (reduce #'max (mapcar #'compute-max-regnum (alt-node-children node)) :initial-value -1)) ((kleene-node-p node) (compute-max-regnum (kleene-node-child node))) ((pkleene-node-p node) (compute-max-regnum (pkleene-node-child node))) ((optional-node-p node) (compute-max-regnum (optional-node-child node))) ((range-node-p node) (compute-max-regnum (range-node-child node))) ((backmatch-node-p node) (backmatch-node-regnum node)) ((register-node-p node) (max (register-node-regnum node) (compute-max-regnum (register-node-child node)))) ((regstart-node-p node) (regstart-node-regnum node)) ((regend-node-p node) (regend-node-regnum node)) ((lookahead-node-p node) (compute-max-regnum (lookahead-node-expr node))) ((nlookahead-node-p node) (compute-max-regnum (nlookahead-node-expr node))) (t 0))) ;;; ;;; Pass 7 - Linking ;;; ; resolve the target labels with the actual target closures (defun link-closures (link-info) (loop for info across link-info for linkfn = (closure-info-linkfn info) when (functionp linkfn) do (funcall linkfn link-info))) ;;;; ;;;; Compiled Matcher structure, and high-level functions ;;;; (defun compile-str (patstr) "Parse a string regex expression, and compile it into matcher object. Uses the pattern cache." ; (format t "~A entries in cache" (hash-table-count *pattern-cache*)) (let ((cached-machine (gethash patstr *pattern-cache*))) (or cached-machine (newly-compiled-str-matcher patstr)))) (defun compile-expr (regexpr) "Parse a string regex expression, and compile it into matcher object. Uses the pattern cache." ; (format t "~A entries in cache" (hash-table-count *pattern-cache*)) (let ((cached-machine (gethash regexpr *pattern-cache*))) (or cached-machine (newly-compiled-expr-matcher regexpr)))) (defun macroexpand-regex-str (patstr) `(compile-expr-to-matcher ',(parse-str patstr) ,patstr)) (defmacro defregex (name patstr &rest rest) `(defparameter ,name ,(macroexpand-regex-str patstr) ,@rest)) (defmacro macroexpand-regex-expr (regex-expr) `(compile-expr-to-matcher ,regex-expr)) ;;; ;;; code-expansion version... ;;; ;(defun macroexpand-regex-expr (regex-expr) ; "Parse a string regex expression, and translate it to lisp code." ; (multiple-value-bind (matchfn numregs simplified-tree) ; (expand-expr-to-matchfn regex-expr ; :charfn (if *match-simple-strings-only* 'schar 'char)) ; (declare (ignore simplified-tree)) ; (cond (*match-simple-strings-only* ; `(make-matcher :simple-string-matchfn ,matchfn ; :string-matchfn nil ; :numregs ,numregs ; :matchstr nil ; :matchstr nil)) ; (t `(make-matcher :simple-string-matchfn ,matchfn ; :string-matchfn nil ; :numregs ,numregs ; :matchstr nil ; :matchstr nil))))) ;(defun macroexpand-regex-str (patstr) ; "Parse a string regex expression, and translate it to lisp code." ; (macroexpand-regex-expr (parse-str patstr))) ;(defmacro defregex (name patstr &rest rest) ; `(defparameter ,name ,(macroexpand-regex-str patstr) ,@rest)) ; Try to use the quickest matcher for the input string. If the ; candidate string isn't a simple string, then match with the slower ; string-matcher. Since this isn't compiled by default, it may need ; to be compiled from the saved expr. (defun match-str-all-parms (matcher candstr regs start length start-is-anchor end-is-anchor acceptfn hooks) (dotimes (i (length regs)) (let ((reg (aref regs i))) (setf (car reg) nil) (setf (cdr reg) nil))) (cond ((simple-string-p candstr) (cond ((functionp (matcher-simple-string-matchfn matcher)) (funcall (matcher-simple-string-matchfn matcher) candstr regs start (+ start length) start-is-anchor end-is-anchor acceptfn hooks)) ((functionp (matcher-string-matchfn matcher)) (funcall (matcher-string-matchfn matcher) candstr regs start (+ start length) start-is-anchor end-is-anchor acceptfn hooks)) (t (error "REGEX Error: ~S is not a valid regex matcher" matcher)))) ((stringp candstr) (cond ((functionp (matcher-string-matchfn matcher)) (funcall (matcher-string-matchfn matcher) candstr regs start (+ start length) start-is-anchor end-is-anchor acceptfn hooks)) ((functionp (matcher-matchexpr matcher)) (let ((*match-simple-strings-only* nil)) (setf (matcher-string-matchfn matcher) (compile-expr-to-matcher (matcher-matchexpr matcher)))) (unless (matcher-string-matchfn matcher) (error "REGEX Error: ~S does not have a valid match function for class STRING" matcher)) (funcall (matcher-string-matchfn matcher) candstr regs start (+ start length) start-is-anchor end-is-anchor acceptfn hooks)))) (t (error "REGEX Error: ~S is not a string" candstr)))) (defun match-str (matcher candstr &key (regs (make-regs (matcher-numregs matcher))) (start 0) (length (- (length candstr) start)) (start-is-anchor (= start 0)) (end-is-anchor (= length (length candstr))) acceptfn hooks) "Run a matcher against a candidate string, without scanning \(so it is implicitly anchored\). Returns \(values t start end regs\) on success, nil on failure." (match-str-all-parms matcher candstr regs start length start-is-anchor end-is-anchor acceptfn hooks)) (define-compiler-macro match-str (matcher candstr &key (regs `(make-regs (matcher-numregs ,matcher))) (start 0) (length `(- (length ,candstr) ,start)) (start-is-anchor `(= ,start 0)) (end-is-anchor `(= ,length (length ,candstr))) acceptfn hooks) `(match-str-all-parms ,matcher ,candstr ,regs ,start ,length ,start-is-anchor ,end-is-anchor ,acceptfn ,hooks)) ;;; This really needs a prefix-map array in the matcher structure so we can ;;; quickly find potential beginnings to the string (defun scan-str-all-parms (matcher str regs start length start-is-anchor end-is-anchor acceptfn hooks) (declare (type matcher matcher) (string str) (fixnum start length)) (let ((matchedp t) match-start (match-start-pos start) (len-remaining length) (match-len length) (match-regs nil)) (loop (multiple-value-setq (matchedp match-start match-len match-regs) (match-str-all-parms matcher str regs match-start-pos len-remaining (and start-is-anchor (= match-start-pos start)) end-is-anchor acceptfn hooks)) (cond (matchedp (return-from scan-str-all-parms (values matchedp match-start match-len match-regs))) ((>= match-start-pos (+ start length)) (return-from scan-str-all-parms nil)) (t (incf match-start-pos) (decf len-remaining)))))) (defun scan-str (matcher candstr &key (regs (make-regs (matcher-numregs matcher))) (start 0) (length (length candstr)) (start-is-anchor (= start 0)) (end-is-anchor (= length (length candstr))) acceptfn hooks) "Run a matcher against a candidate string, scanning forward if necessary. Returns \(values t start end regs\) on success, nil on failure." (scan-str-all-parms matcher candstr regs start length start-is-anchor end-is-anchor acceptfn hooks)) (define-compiler-macro scan-str (matcher candstr &key (regs `(make-regs (matcher-numregs ,matcher))) (start 0) (length `(length ,candstr)) (start-is-anchor `(= ,start 0)) (end-is-anchor `(= ,length (length ,candstr))) acceptfn hooks) `(scan-str-all-parms ,matcher ,candstr ,regs ,start ,length ,start-is-anchor ,end-is-anchor ,acceptfn ,hooks)) (defun uncached-compile-str (patstr) (let ((result (catch 'regex-parse-error (compile-expr-to-matcher (parse-str patstr) patstr)))) (cond ((matcher-p result) result) (t (apply #'format (cons t result)) nil)))) (defun uncached-compile-expr (regexpr &optional str) (let ((result (catch 'regex-parse-error (compile-expr-to-matcher regexpr str)))) (cond ((matcher-p result) result) (t (apply #'format (cons t result)) nil)))) (defun newly-compiled-str-matcher (patstr) (when (>= (hash-table-count *pattern-cache*) +max-regex-str-cache+) (clrhash *pattern-cache*)) (setf (gethash patstr *pattern-cache*) (uncached-compile-str patstr))) (defun newly-compiled-expr-matcher (regexpr) (when (>= (hash-table-count *pattern-cache*) +max-regex-str-cache+) (clrhash *pattern-cache*)) (setf (gethash regexpr *pattern-cache*) (uncached-compile-expr regexpr))) ;;; ;;; Testing ;;; (defun testcomp (str) (compile-expr-to-matcher (parse-str str) str)) (defun testmatch (str pat &key hooks) (clear-pattern-cache) (match-str (compile-str pat) str :hooks hooks)) cl-regex-1/macs.lisp0100755000175000001440000004157507550100466013262 0ustar mrdusers;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: REGEX; Base: 10 -*- (in-package :REGEX) ; disable inlining and the optimizations in the match functions. ;(eval-when (eval compile load) ; (pushnew :debug-regex *features*) ) ;;;; ;;;; Regex Configuration ;;;; (defvar *regex-compile-verbose* nil) (defun compile-verbose (&optional (flag t)) (setq *regex-compile-verbose* flag)) (defvar *match-simple-strings-only* t) (defun match-simple-strings-only (&optional (flag t)) (setq *match-simple-strings-only* flag) (clear-pattern-cache)) ;; this should always be set to t (defvar *force-safe-match* t) (defun force-safe-match (&optional (flag t)) (setq *force-safe-match* flag) (clear-pattern-cache)) (defvar *escape-special-chars* nil) (defun escape-special-chars (&optional (flag t)) (setq *escape-special-chars* flag) (clear-pattern-cache)) (defvar *dot-matches-newline* nil) (defun dot-matches-newline (&optional (flag t)) (setq *dot-matches-newline* flag)) (defvar *allow-backmatch* t) (defun allow-backmatch (&optional (flag t)) (setq *allow-backmatch* flag) (clear-pattern-cache)) (defvar *allow-rangematch* t) (defun allow-rangematch (&optional (flag t)) (setq *allow-rangematch* flag) (clear-pattern-cache)) (defvar *allow-nongreedy-quantifiers* t) (defun allow-nongreedy-quantifiers (&optional (flag t)) (setq *allow-nongreedy-quantifiers* flag) (clear-pattern-cache)) (defvar *allow-nonregister-groups* t) (defun allow-nonregister-groups (&optional (flag t)) (setq *allow-nonregister-groups* flag) (clear-pattern-cache)) (defvar *registers-match-rightmost* nil) (defun registers-match-rightmost (&optional (flag t)) (setq *registers-match-rightmost* flag) (clear-pattern-cache)) ;;;; ;;;; Regex parse tree. ;;;; #-:debug-regex(declaim (inline make-char-node char-node-p char-node-char)) (defun make-char-node (chr) chr) (defun char-node-p (node) (characterp node)) (defun char-node-char (char-node) char-node) #-:debug-regex(declaim (inline make-string-node string-node-p string-node-string)) (defun make-string-node (str) str) (defun string-node-p (node) (stringp node)) (defun text-node-p (node) (or (char-node-p node) (string-node-p node))) (defun string-node-string (string-node) string-node) #-:debug-regex(declaim (inline make-startword-node startword-node-p)) (defun make-startword-node () '(startword)) (defun startword-node-p (node) (equalp node '(startword))) #-:debug-regex(declaim (inline make-endword-node endword-node-p)) (defun make-endword-node () '(endword)) (defun endword-node-p (node) (equalp node '(endword))) #-:debug-regex(declaim (inline make-classseq-node classseq-node-p classseq-node-seq)) (defun make-classseq-node (seq) `(classseq ,seq)) (defun classseq-node-p (node) (and (listp node) (eq (first node) 'classseq))) (defun classseq-node-seq (classseq-node) (second classseq-node)) #-:debug-regex(declaim (inline make-backmatch-node backmatch-node-p backmatch-node-regnum)) (defun make-backmatch-node (regnum) `(backmatch ,regnum)) (defun backmatch-node-p (node) (and (listp node) (eq (first node) 'backmatch))) (defun backmatch-node-regnum (backmatch-node) (second backmatch-node)) #-:debug-regex(declaim (inline make-seq-node-list make-seq-node-args seq-node-p seq-node-children seq-node-numchildren seq-node-child)) (defun make-seq-node-list (child-nodes) (let ((numchildren (length child-nodes))) (cond ((zerop numchildren) nil) ((= numchildren 1) (first child-nodes)) (t `(seq ,@child-nodes))))) (defun make-seq-node-args (&rest child-nodes) (make-seq-node-list child-nodes)) (defun seq-node-p (node) (and (listp node) (eq (first node) 'seq))) (defun seq-node-children (seq-node) (rest seq-node)) (defun seq-node-numchildren (seq-node) (length (seq-node-children seq-node))) (defun seq-node-child (seq-node idx) (elt (seq-node-children seq-node) idx)) #-:debug-regex(declaim (inline make-kleene-node kleene-node-greedy-p kleene-node-nongreedy-p kleene-node-child)) (defun make-kleene-node (child-node greedyp) (cond (greedyp `(* ,child-node)) (t `(*? ,child-node)))) (defun kleene-node-p (node) (cond ((not (listp node)) nil) ((eq (first node) '*) t) ((eq (first node) '*?) t) (t nil))) (defun kleene-node-greedy-p (node) (eq (first node) '*)) (defun kleene-node-nongreedy-p (node) (eq (first node) '*?)) (defun kleene-node-child (kleene-node) (second kleene-node)) #-:debug-regex(declaim (inline make-pkleene-node pkleene-node-greedy-p pkleene-node-nongreedy-p pkleene-node-child)) (defun make-pkleene-node (child-node greedyp) (cond (greedyp `(+ ,child-node)) (t `(+? ,child-node)))) (defun pkleene-node-p (node) (cond ((not (listp node)) nil) ((eq (first node) '+) t) ((eq (first node) '+?) t) (t nil))) (defun pkleene-node-greedy-p (node) (eq (first node) '+)) (defun pkleene-node-nongreedy-p (node) (eq (first node) '+?)) (defun pkleene-node-child (pkleene-node) (second pkleene-node)) #-:debug-regex(declaim (inline make-optional-node optional-node-p optional-node-greedy-p optional-node-nongreedy-p optional-child-node)) (defun make-optional-node (child-node greedyp) (cond (greedyp `(? ,child-node)) (t `(?? ,child-node)))) (defun optional-node-p (node) (cond ((not (listp node)) nil) ((eq (first node) '?) t) ((eq (first node) '??) t) (t nil))) (defun optional-node-greedy-p (node) (eq (first node) '?)) (defun optional-node-nongreedy-p (node) (eq (first node) '??)) (defun optional-node-child (optional-node) (second optional-node)) #-:debug-regex(declaim (inline make-range-node range-node-greedy-p range-node-nongreedy-p range-node-min range-node-max range-node-child)) (defun make-range-node (child-node min max greedyp) (unless (numberp min) (setq min 0)) (unless (numberp max) (setq max nil)) (let ((min (if (and (numberp min) (numberp max)) (min min max) min)) (max (if (and (numberp min) (numberp max)) (max min max) max))) (cond (greedyp `(range (,min . ,max) ,child-node)) (t `(ngrange (,min . ,max) ,child-node))))) (defun range-node-p (node) (cond ((not (listp node)) nil) ((eq (first node) 'range) t) ((eq (first node) 'ngrange) t) (t nil))) (defun range-node-greedy-p (node) (eq (first node) 'range)) (defun range-node-nongreedy-p (node) (eq (first node) 'ngrange)) (defun range-node-min (range-node) (car (second range-node))) (defun range-node-max (range-node) (cdr (second range-node))) (defun range-node-child (range-node) (third range-node)) #-:debug-regex(declaim (inline make-alt-node-list make-alt-node-args alt-node-p alt-node-children alt-node-numchildren alt-node-first alt-node-second alt-node-child)) (defun make-alt-node-list (child-nodes) `(alt ,@child-nodes)) (defun make-alt-node-args (&rest child-nodes) `(alt ,@child-nodes)) (defun alt-node-p (node) (and (listp node) (eq (first node) 'alt))) (defun alt-node-children (alt-node) (rest alt-node)) (defun alt-node-numchildren (alt-node) (length (alt-node-children alt-node))) (defun alt-node-first (alt-node) (first (alt-node-children alt-node))) (defun alt-node-second (alt-node) (second (alt-node-children alt-node))) (defun alt-node-child (alt-node idx) (elt (alt-node-children alt-node) idx)) #-:debug-regex(declaim (inline make-casealt-node-list make-casealt-node-args casealt-node-p casealt-node-children casealt-node-numchildren)) (defun make-casealt-node-list (child-nodes) `(casealt ,child-nodes)) (defun make-casealt-node-args (&rest child-nodes) `(casealt ,child-nodes)) (defun casealt-node-p (node) (and (listp node) (eq (first node) 'casealt))) (defun casealt-node-children (casealt-node) (second casealt-node)) (defun casealt-node-numchildren (casealt-node) (length (casealt-node-children casealt-node))) #-:debug-regex(declaim (inline make-register-node register-node-p register-node-regnum register-node-child)) (defun make-register-node (regnum child) `(reg ,regnum ,child)) (defun register-node-p (node) (and (listp node) (eq (first node) 'reg))) (defun register-node-regnum (reg-node) (second reg-node)) (defun register-node-child (reg-node) (third reg-node)) #-:debug-regex(declaim (inline make-regstart-node regstart-node-p regstart-node-regnum)) (defun make-regstart-node (regnum) `(regstart ,regnum)) (defun regstart-node-p (node) (and (listp node) (eq (first node) 'regstart))) (defun regstart-node-regnum (rstart-node) (second rstart-node)) #-:debug-regex(declaim (inline make-regend-node regend-node-p regend-node-regnum)) (defun make-regend-node (regnum) `(regend ,regnum)) (defun regend-node-p (node) (and (listp node) (eq (first node) 'regend))) (defun regend-node-regnum (rend-node) (second rend-node)) #-:debug-regex(declaim (inline make-lookahead-node lookahead-node-p lookahead-node-expr)) (defun make-lookahead-node (expr) `(lookahead ,expr)) (defun lookahead-node-p (node) (and (listp node) (eq (first node) 'lookahead))) (defun lookahead-node-expr (lookahead-node) (second lookahead-node)) #-:debug-regex(declaim (inline make-nlookahead-node nlookahead-node-p nlookahead-node-expr)) (defun make-nlookahead-node (expr) `(nlookahead ,expr)) (defun nlookahead-node-p (node) (and (listp node) (eq (first node) 'nlookahead))) (defun nlookahead-node-expr (nlookahead-node) (second nlookahead-node)) #-:debug-regex(declaim (inline make-charclass-node charclass-node-negated-p charclass-node-chars)) (defun make-charclass-node (chars &key negated) (cond (negated `(ncharclass ,(expand-char-class (coerce chars 'list)))) (t `(charclass ,(expand-char-class (coerce chars 'list)))))) (defun positive-charclass-node-p (node) (and (listp node) (eq (first node) 'charclass))) (defun charclass-node-p (node) (cond ((not (listp node)) nil) ((eq (first node) 'charclass) t) ((eq (first node) 'ncharclass) t) (t nil))) (defun charclass-node-negated-p (charclass-node) (eq (first charclass-node) 'ncharclass)) (defun charclass-node-chars (charclass-node) (second charclass-node)) (defun char-or-class-node-p (node) (or (char-node-p node) (charclass-node-p node))) #-:debug-regex(declaim (inline make-specclass-node specclass-node-negated-p specclass-node-class)) (defun make-specclass-node (class &key negated) (cond (negated `(nspecclass ,class)) (t `(specclass ,class)))) (defun specclass-node-p (node) (cond ((not (listp node)) nil) ((eq (first node) 'specclass) t) ((eq (first node) 'nspecclass) t) (t nil))) (defun specclass-node-negated-p (specclass-node) (eq (first specclass-node) 'nspecclass)) (defun specclass-node-class (specclass-node) (second specclass-node)) #-:debug-regex(declaim (inline make-start-anchor-node start-anchor-node-p)) (defun make-start-anchor-node () '(start)) (defun start-anchor-node-p (node) (equalp node '(start))) #-:debug-regex(declaim (inline make-end-anchor-node end-anchor-node-p)) (defun make-end-anchor-node () '(end)) (defun end-anchor-node-p (node) (equalp node '(end))) #-:debug-regex(declaim (inline make-any-node any-node-p)) (defun make-any-node () '(any)) (defun any-node-p (node) (equalp node '(any))) #-:debug-regex(declaim (inline make-hook-node hook-node-p hook-node-function hook-node-symbol-p hook-node-function-p hook-node-index-p)) (defun make-hook-node (fxn-id) `(hook ,fxn-id)) (defun hook-node-p (node) (and (listp node) (eq (first node) 'hook))) (defun hook-node-function (hook-node) (second hook-node)) (defun hook-node-symbol-p (hook-node) (symbolp (hook-node-function hook-node))) (defun hook-node-function-p (hook-node) (functionp (hook-node-function hook-node))) (defun hook-node-index-p (hook-node) (integerp (hook-node-function hook-node))) #-:debug-regex(declaim (inline make-success-node success-node-p success-node-rc)) (defun make-success-node (rc) `(succeed ,rc)) (defun success-node-p (node) (and (listp node) (eq (first node) 'succeed))) (defun success-node-rc (success-node) (second success-node)) ;;;; ;;;; Misc handy macros ;;;; (defun make-regs (n) (let ((regs (make-array n))) (dotimes (i n regs) (setf (svref regs i) (cons nil nil))))) (declaim (inline register-start)) (defun register-start (regs n) (car (svref regs n))) (define-compiler-macro register-start (regs n) `(car (svref ,regs ,n))) (declaim (inline register-end)) (defun register-end (regs n) (cdr (svref regs n))) (define-compiler-macro register-end (regs n) `(cdr (svref ,regs ,n))) (defsetf register-start (regs n) (val) `(setf (car (svref ,regs ,n)) ,val)) (defsetf register-end (regs n) (val) `(setf (cdr (svref ,regs ,n)) ,val)) (defun register-matched-p (regs n) (and (numberp (register-start regs n)) (numberp (register-end regs n)) (<= (register-start regs n) (register-end regs n)))) (defstruct re-scanner (str "" :type string) (pos 0 :type fixnum) (end 0 :type fixnum) (mode 'in-regex :type symbol) (regnum 0 :type fixnum) (ungot-token nil) (ungot-value nil) ) (defstruct closure-info matchfn initfn linkfn) (defun resolve-instr (closure-vec instr-num) (closure-info-matchfn (aref closure-vec instr-num))) (defun resolve-linkfn (closure-vec instr-num) (closure-info-linkfn (aref closure-vec instr-num))) (defun resolve-initfn (closure-vec instr-num) (closure-info-initfn (aref closure-vec instr-num))) (defstruct matcher simple-string-matchfn string-matchfn numregs matchstr matchexpr acceptfn) (defconstant +max-regex-str-cache+ 500 "Max number of entries in the regex compiler cache.") (defvar *pattern-cache* (make-hash-table :test #'equal)) (defun clear-pattern-cache () (clrhash *pattern-cache*) nil) (defconstant +special-class-names+ '((":alpha:" alpha) (":upper:" upper) (":lower:" lower) (":digit:" digit) (":alnum:" alnum) (":xdigit:" xdigit) (":odigit:" odigit) (":punct:" punct) (":space:" space) (":word:" wordchar))) (defun expand-char-class (chars) "Expand an encoded char class into an explicit enumeration of all the chars, e.g. 'a-f\A-F' --> 'abcdefABCDEF'." (parse-std-char-class chars)) (defun parse-std-char-class (in) (let ((out (make-string-output-stream))) (do ((chr (pop in) (pop in)) (prv nil chr)) ((null chr) (progn (when prv (write-char prv out)) (get-output-stream-string out))) (case chr (dash (let ((nxt (pop in))) (cond ((and (null nxt) (null prv)) (write-char #\- out)) ((null nxt) (write-char prv out) (write-char #\- out)) ((null prv) (write-char #\- out) (write-char nxt out)) (t (generate-char-range out prv nxt) (setq prv nil))) (setq chr (pop in)))) (t (when prv (write-char prv out))))))) (defun generate-char-range (strm start end) (when (< (char-int end) (char-int start)) (rotatef start end)) (loop for ic from (char-int start) to (char-int end) do (write-char (code-char ic) strm))) cl-regex-1/parser.lisp0100755000175000001440000006426707550100334013630 0ustar mrdusers;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: REGEX; Base: 10 -*- (in-package :REGEX) ;;; ;;; Pass 1 - Lexer ;;; (defun re-scanner (str &optional (start 0) (length (length str))) (make-re-scanner :str str :pos start :end (+ start length))) (defun nextchar (scanner) (let ((ch (char (re-scanner-str scanner) (re-scanner-pos scanner)))) (incf (re-scanner-pos scanner)) ch)) (defun ungetchar (scanner) (decf (re-scanner-pos scanner))) (defun fix-escape-char (chr) (case chr ((#\t #\T) #\Tab) ((#\n #\N) #\Newline) ((#\r #\R) #\Return) ((#\h #\H) #\Backspace) ((#\p #\P) #\Page) (t chr))) (defun scan-num (scanner) (let ((ch (nextchar scanner))) (cond ((digit-char-p ch) (cond ((< (re-scanner-pos scanner) (re-scanner-end scanner)) (let ((ch2 (nextchar scanner))) (cond ((digit-char-p ch2) (parse-integer (concatenate 'string (string ch) (string ch2)))) (t (ungetchar scanner) (parse-integer (string ch)))))) (t (parse-integer (string ch))))) (t (ungetchar scanner) ch)))) (defun next (scanner) (declare (type re-scanner scanner)) (cond ((re-scanner-ungot-token scanner) (let ((token (re-scanner-ungot-token scanner)) (value (re-scanner-ungot-value scanner))) (setf (re-scanner-ungot-token scanner) nil (re-scanner-ungot-value scanner) nil) (values token value))) ((= (re-scanner-pos scanner) (re-scanner-end scanner)) (values nil nil)) (t (let ((ch (nextchar scanner))) (if *escape-special-chars* ;; escaped magic chars (case (re-scanner-mode scanner) (in-regex (if (and (char= ch #\\) (< (re-scanner-pos scanner) (re-scanner-end scanner))) (let ((ch (nextchar scanner))) (case ch (#\\ (if (< (re-scanner-pos scanner) (re-scanner-end scanner)) (if *allow-backmatch* (let ((item (scan-num scanner))) (print item) (if (numberp item) (values 'backmatch item) (let ((nc (fix-escape-char (nextchar scanner)))) (case nc (#\< (values 'startword nil)) (#\> (values 'endword nil)) (#\w (values 'wordchar nil)) (#\W (values 'nonwordchar nil)) (#\d (values 'digitchar nil)) (#\D (values 'nondigitchar nil)) (#\s (values 'spacechar nil)) (#\S (values 'nonspacechar nil)) (t (values 'char nc)))))) (let ((nc (fix-escape-char (nextchar scanner)))) (case nc (#\< (values 'startword nil)) (#\> (values 'endword nil)) (#\w (values 'wordchar nil)) (#\W (values 'nonwordchar nil)) (#\d (values 'digitchar nil)) (#\D (values 'nondigitchar nil)) (#\s (values 'spacechar nil)) (#\S (values 'nonspacechar nil)) (t (values 'char nc))))) (values 'char ch))) (#\* (if (and *allow-nongreedy-quantifiers* (< (re-scanner-pos scanner) (re-scanner-end scanner))) (let ((ch2 (nextchar scanner))) (cond ((char= ch2 #\?) (values 'kleene 'non-greedy)) (t (ungetchar scanner) (values 'kleene 'greedy)))) (values 'kleene 'greedy))) (#\+ (if (and *allow-nongreedy-quantifiers* (< (re-scanner-pos scanner) (re-scanner-end scanner))) (let ((ch2 (nextchar scanner))) (cond ((char= ch2 #\?) (values 'plus 'non-greedy)) (t (ungetchar scanner) (values 'plus 'greedy)))) (values 'plus 'greedy))) (#\? (if (and *allow-nongreedy-quantifiers* (< (re-scanner-pos scanner) (re-scanner-end scanner))) (let ((ch2 (nextchar scanner))) (cond ((char= ch2 #\?) (values 'optional 'non-greedy)) (t (ungetchar scanner) (values 'optional 'greedy)))) (values 'optional 'greedy))) (#\( (if (and *allow-nonregister-groups* (< (re-scanner-pos scanner) (re-scanner-end scanner))) (let ((ch2 (nextchar scanner))) (cond ((char= ch2 #\?) (if (< (re-scanner-pos scanner) (re-scanner-end scanner)) (let ((ch3 (nextchar scanner))) (cond ((char= ch3 #\=) (values 'lparen 'lookahead)) ((char= ch3 #\!) (values 'lparen 'nlookahead)) (t (ungetchar scanner) (values 'lparen nil)))) (values 'lparen nil))) (t (ungetchar scanner) (values 'lparen (incf (re-scanner-regnum scanner)))))) (values 'lparen (incf (re-scanner-regnum scanner))))) (#\) (values 'rparen nil)) (#\| (values 'or nil)) (#\^ (values 'startanchor nil)) (#\$ (values 'endanchor nil)) (#\[ (setf (re-scanner-mode scanner) 'start-class) (values 'startclass nil)) (#\] (values 'char ch)) (#\. (values 'any nil)) (#\{ (if *allow-rangematch* (let ((rangebounds (scan-range-bounds scanner))) (if (and *allow-nongreedy-quantifiers* (< (re-scanner-pos scanner) (re-scanner-end scanner))) (let ((ch2 (nextchar scanner))) (cond ((char= ch2 #\?) (values 'ngrange rangebounds)) (t (ungetchar scanner) (values 'range rangebounds)))) (values 'range rangebounds))) (values 'char ch))) (t (values 'char ch))) ) (values 'char ch))) (start-class (if (and (char= ch #\\) (< (re-scanner-pos scanner) (re-scanner-end scanner))) (let ((ch (nextchar scanner))) (case ch (#\\ (setf (re-scanner-mode scanner) 'in-class) (values 'char (fix-escape-char (nextchar scanner)))) (#\] (setf (re-scanner-mode scanner) 'in-regex) (values 'endclass nil)) (#\^ (setf (re-scanner-mode scanner) 'in-class) (values 'nclass nil)) (t (setf (re-scanner-mode scanner) 'in-class) (values 'char ch)) )) (values 'char ch))) (in-class (if (and (char= ch #\\) (< (re-scanner-pos scanner) (re-scanner-end scanner))) (let ((ch (nextchar scanner))) (case ch (#\\ (values 'char (fix-escape-char (nextchar scanner)))) (#\- (values 'dash nil)) (#\] (setf (re-scanner-mode scanner) 'in-regex) (values 'endclass nil)) (t (values 'char ch)))) (values 'char ch)))) ;; non-escaped magic chars (case (re-scanner-mode scanner) (in-regex (case ch (#\\ (if (< (re-scanner-pos scanner) (re-scanner-end scanner)) (if *allow-backmatch* (let ((item (scan-num scanner))) (if (numberp item) (values 'backmatch item) (let ((nc (fix-escape-char (nextchar scanner)))) (case nc (#\< (values 'startword nil)) (#\> (values 'endword nil)) (#\w (values 'wordchar nil)) (#\W (values 'nonwordchar nil)) (#\d (values 'digitchar nil)) (#\D (values 'nondigitchar nil)) (#\s (values 'spacechar nil)) (#\S (values 'nonspacechar nil)) (t (values 'char nc)))))) (let ((nc (fix-escape-char (nextchar scanner)))) (case nc (#\< (values 'startword nil)) (#\> (values 'endword nil)) (#\w (values 'wordchar nil)) (#\W (values 'nonwordchar nil)) (#\d (values 'digitchar nil)) (#\D (values 'nondigitchar nil)) (#\s (values 'spacechar nil)) (#\S (values 'nonspacechar nil)) (t (values 'char nc))))) (values 'char ch))) (#\* (if (and *allow-nongreedy-quantifiers* (< (re-scanner-pos scanner) (re-scanner-end scanner))) (let ((ch2 (nextchar scanner))) (cond ((char= ch2 #\?) (values 'kleene 'non-greedy)) (t (ungetchar scanner) (values 'kleene 'greedy)))) (values 'kleene 'greedy))) (#\+ (if (and *allow-nongreedy-quantifiers* (< (re-scanner-pos scanner) (re-scanner-end scanner))) (let ((ch2 (nextchar scanner))) (cond ((char= ch2 #\?) (values 'plus 'non-greedy)) (t (ungetchar scanner) (values 'plus 'greedy)))) (values 'plus 'greedy))) (#\( (if (and *allow-nonregister-groups* (< (re-scanner-pos scanner) (re-scanner-end scanner))) (let ((ch2 (nextchar scanner))) (cond ((char= ch2 #\?) (if (< (re-scanner-pos scanner) (re-scanner-end scanner)) (let ((ch3 (nextchar scanner))) (cond ((char= ch3 #\=) (values 'lparen 'lookahead)) ((char= ch3 #\!) (values 'lparen 'nlookahead)) (t (ungetchar scanner) (values 'lparen nil)))) (values 'lparen nil))) (t (ungetchar scanner) (values 'lparen (incf (re-scanner-regnum scanner)))))) (values 'lparen (incf (re-scanner-regnum scanner))))) (#\) (values 'rparen nil)) (#\? (if (and *allow-nongreedy-quantifiers* (< (re-scanner-pos scanner) (re-scanner-end scanner))) (let ((ch2 (nextchar scanner))) (cond ((char= ch2 #\?) (values 'optional 'non-greedy)) (t (ungetchar scanner) (values 'optional 'greedy)))) (values 'optional 'greedy))) (#\| (values 'or nil)) (#\^ (values 'startanchor nil)) (#\$ (values 'endanchor nil)) (#\[ (setf (re-scanner-mode scanner) 'start-class) (values 'startclass nil)) (#\] (values 'char ch)) (#\. (values 'any nil)) (#\{ (if *allow-rangematch* (let ((rangebounds (scan-range-bounds scanner))) (if (and *allow-nongreedy-quantifiers* (< (re-scanner-pos scanner) (re-scanner-end scanner))) (let ((ch2 (nextchar scanner))) (cond ((char= ch2 #\?) (values 'ngrange rangebounds)) (t (ungetchar scanner) (values 'range rangebounds)))) (values 'range rangebounds))) (values 'char ch))) (t (values 'char ch))) ) (start-class (case ch (#\\ (setf (re-scanner-mode scanner) 'in-class) (if (< (re-scanner-pos scanner) (re-scanner-end scanner)) (values 'char (fix-escape-char (nextchar scanner))) (throw 'regex-parse-error (list "Invalid character class")))) (#\] (setf (re-scanner-mode scanner) 'in-regex) (values 'endclass nil)) (#\^ (setf (re-scanner-mode scanner) 'in-class) (values 'nclass nil)) (t (setf (re-scanner-mode scanner) 'in-class) (values 'char ch)) )) (in-class (case ch (#\\ (if (< (re-scanner-pos scanner) (re-scanner-end scanner)) (values 'char (fix-escape-char (nextchar scanner))) (throw 'regex-parse-error (list "Invalid character class")))) (#\- (values 'dash nil)) (#\] (setf (re-scanner-mode scanner) 'in-regex) (values 'endclass nil)) (t (values 'char ch)))))))))) (defun scan-range-bounds (scanner) (let ((lowbound (scan-num scanner))) (unless (< (re-scanner-pos scanner) (re-scanner-end scanner)) (throw 'regex-parse-error (list "Range pattern {nn,nn}: Unexpected end of pattern ~S" (re-scanner-str scanner)))) (when (not (numberp lowbound)) (setq lowbound 0)) (let ((ch (nextchar scanner))) (unless (or (char= ch #\,) (char= ch #\})) (throw 'regex-parse-error (list "Range pattern {nn,nn}: ',' or '}' expected in pattern ~S at ~D, encountered ~S" (re-scanner-str scanner) (re-scanner-pos scanner) ch))) (cond ((char= ch #\}) (cons lowbound nil)) (t (let ((highbound (scan-num scanner))) (cond ((not (numberp highbound)) (unless (char= highbound #\}) (list "Range pattern {nn,nn}: '}' expected in pattern ~S at ~D" (re-scanner-str scanner) (re-scanner-pos scanner))) (nextchar scanner) ; skip over the '}' (cons lowbound nil)) (t (unless (< (re-scanner-pos scanner) (re-scanner-end scanner)) (throw 'regex-parse-error (list "Range pattern {nn,nn}: Unexpected end of pattern ~S" (re-scanner-str scanner)))) (let ((ch (nextchar scanner))) (unless (char= ch #\}) (throw 'regex-parse-error (list "Range pattern {nn,nn}: '}' expected in pattern ~S at ~D" (re-scanner-str scanner) (re-scanner-pos scanner)))) (cons (min lowbound highbound) (max lowbound highbound))))))))))) (defun unget (scanner token value) (declare (type re-scanner scanner)) (setf (re-scanner-ungot-token scanner) token (re-scanner-ungot-value scanner) value)) (defun show-tokens (str) "Parse a string regex expression into a regex parse tree." (let ((scanner (re-scanner str))) (labels ((getnext () (multiple-value-bind (token value) (next scanner) (when token (cons (list token value) (getnext)))))) (getnext)))) ;;; ;;; Pass 2 - Parser ;;; ; ::= ; ::= "|" | ; ::= | ; ::= "*" | "+" | "?" | "{""}" | ; ::= "("")" | ; ::= "." | "$" | "^" | | ; ::= | "\" | \b | \B | \< | \> ; ::= | "," ; ::= "[" "]" | "[^" "]" (defun parse-str (str) "Parse a string into a parse tree." (let ((scanner (re-scanner str))) (multiple-value-bind (token value) (next scanner) (let ((regex (parse-regex token value scanner))) (multiple-value-bind (token value) (next scanner) (cond ((null token) (make-register-node 0 regex)) (t (throw 'regex-parse-error (list "Regex parse error at ~S ~S" token value))))))))) ; ::= (defun parse-regex (token value scanner) (parse-union token value scanner)) ; ::= "|" | (defun parse-union (token value scanner) (let ((concat (parse-concat token value scanner))) (multiple-value-bind (token value) (next scanner) (cond ((eq token 'or) (multiple-value-bind (token value) (next scanner) (cond ((or (null token) (eq token 'rparen)) (unget scanner token value) (make-alt-node-args concat nil)) (t (let ((other-concat (parse-union token value scanner))) (make-alt-node-args concat other-concat)))))) (t (unget scanner token value) concat))))) ; ::= | (defun parse-concat (token value scanner) (let ((seq) (quant (parse-quant token value scanner))) (setq seq (list quant)) (multiple-value-bind (token value) (next scanner) (loop until (member token '(or rparen nil)) do (progn (setq seq (nconc seq (list (parse-quant token value scanner)))) (multiple-value-setq (token value) (next scanner))) finally (progn (unget scanner token value) (return-from parse-concat (make-seq-node-list seq))))))) ; ::= "*" | "+" | "?" | "{""}" | (defun parse-quant (token value scanner) (let ((group (parse-group token value scanner))) (multiple-value-bind (token value) (next scanner) (loop while (member token '(kleene plus optional range ngrange)) do (progn (setq group (quantify token value group)) (multiple-value-setq (token value) (next scanner))) finally (progn (unget scanner token value) (return-from parse-quant group)))))) (defun quantify (token value expr) (cond ((eq token 'kleene) (make-kleene-node expr (eq value 'greedy))) ((eq token 'plus) (make-pkleene-node expr (eq value 'greedy))) ((eq token 'optional) (make-optional-node expr (eq value 'greedy))) ((eq token 'range) (make-range-node expr (car value) (cdr value) t)) ((eq token 'ngrange) (make-range-node expr (car value) (cdr value) nil)) (t (throw 'regex-parse-error (list "quantify: Unexpected quantifier ~S ~S" token value))))) ; ::= "("")" | (defun parse-group (token value scanner) (cond ((eq token 'lparen) (multiple-value-bind (token2 value2) (next scanner) (let ((regex (parse-regex token2 value2 scanner))) (multiple-value-bind (token3 value3) (next scanner) (unless (eq token3 'rparen) (throw 'regex-parse-error (list "parse-group: Expected ')' at token ~S ~S" token3 value3))) (cond ((numberp value) (make-register-node value regex)) ((eq value 'lookahead) (make-lookahead-node regex)) ((eq value 'nlookahead) (make-nlookahead-node regex)) (t regex)))))) ((member token '(any startanchor endanchor char backmatch startclass startword endword digitchar nondigitchar wordchar nonwordchar spacechar nonspacechar)) (parse-term token value scanner)) ((eq token 'or) (unget scanner token value) nil) (t (throw 'regex-parse-error (list "parse-group: Unexpected token ~S ~S" token value))))) ; ::= "." | "$" | "^" | | (defun parse-term (token value scanner) (cond ((eq token 'any) (make-any-node)) ((eq token 'startanchor) (make-start-anchor-node)) ((eq token 'endanchor) (make-end-anchor-node)) ((eq token 'char) (make-char-node value)) ((eq token 'backmatch) (make-backmatch-node value)) ((eq token 'startclass) (multiple-value-bind (token value) (next scanner) (parse-char-class token value scanner))) ((eq token 'startword) (make-startword-node)) ((eq token 'endword) (make-endword-node)) ((eq token 'wordchar) '(specclass alnum)) ((eq token 'nonwordchar) '(nspecclass alnum)) ((eq token 'digitchar) '(specclass digit)) ((eq token 'spacechar) '(specclass space)) ((eq token 'nonspacechar) '(nspecclass space)) (t (throw 'regex-parse-error (list "parse-term: Unexpected token ~S ~S" token value))))) (defun parse-char-class (token value scanner) (case token (nclass (multiple-value-bind (token2 value2) (next scanner) (let* ((chars (parse-char-class-contents token2 value2 scanner)) (specclass (special-class chars))) (cond (specclass (make-specclass-node specclass :negated t)) (t (make-charclass-node chars :negated t)))))) (t (let* ((chars (parse-char-class-contents token value scanner)) (specclass (special-class chars))) (cond (specclass (make-specclass-node specclass :negated nil)) (t (make-charclass-node chars :negated nil))))))) (defun parse-char-class-contents (token value scanner &aux lst) (loop while (or (eq token 'char) (eq token 'dash)) do (progn (cond ((eq token 'char) (push value lst)) ((eq token 'dash) (push token lst))) (multiple-value-setq (token value) (next scanner)))) (cond ((eq token 'endclass) (reverse lst)) (t (throw 'regex-parse-error (list "character class improperly terminated by ~S" token))))) (defun special-class (chars) (let* ((len (length chars)) (firstchar (first chars)) (lastchar (elt chars (1- len)))) (when (and (characterp firstchar) (char= firstchar #\:) (characterp lastchar) (not (char= lastchar #\:))) (throw 'regex-parse-error (list "Parse error: Special character class not terminated by ':'"))) (when (and (characterp firstchar) (char= firstchar #\:) (characterp lastchar) (char= lastchar #\:)) (let ((scname (string-downcase (coerce chars 'string)))) (second (assoc scname +special-class-names+ :test #'string=)))))) cl-regex-1/optimize.lisp0100755000175000001440000010563707550104072014174 0ustar mrdusers;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: REGEX; Base: 10 -*- (in-package :REGEX) ;;; ;;; Pass 3 - Canonicalization/tree rewrites/simplification ;;; ; (class "c") --> #\c ; (seq ... ...) --> (seq ...) ; (seq a b (seq c d) e f) --> (seq a b c d e f) ; (alt a b (alt c d) e f) --> (alt a b c d e f) ; (reg n ) --> (seq (rstart n) (rend n)) ; (opt a) --> (alt a nil) or (alt nil a) ; Iteratively canonicalize the tree, until it stabilizes, then simplify (defun optimize-regex-tree (tree) (let* ((canonical (canonicalize tree)) (better (improve canonical)) (even-better (split-alts better)) (simple (simplify even-better))) simple)) (defun canonicalize (tree) (when *regex-compile-verbose* (format t "~&~%Canonicalize Start:") (pprint tree)) (loop with prev-tree = tree for pass from 1 for new-tree = (canonicalize-once prev-tree) when (or (equal prev-tree new-tree) (> pass 10)) return new-tree do (progn (when *regex-compile-verbose* (format t "~&Canonicalize Pass ~D:" pass) (pprint new-tree)) (setq prev-tree new-tree)))) (defun improve (tree) (when *regex-compile-verbose* (format t "~&~%Optimize Start:") (pprint tree)) (loop with prev-tree = tree for pass from 1 for new-tree = (improve-once prev-tree) when (or (equal prev-tree new-tree) (> pass 10)) return new-tree do (progn (when *regex-compile-verbose* (format t "~&Optimize Pass ~D:" pass) (pprint new-tree)) (setq prev-tree new-tree)))) (defun split-alts (tree) (when *regex-compile-verbose* (format t "~&~%Split Alts Start:") (pprint tree)) (split-alts-aux tree)) (defun simplify (tree) (when *regex-compile-verbose* (format t "~&~%Simplify Start:") (pprint tree)) (loop with prev-tree = tree for pass from 1 for new-tree = (simplify-once prev-tree) when (or (equal prev-tree new-tree) (> pass 10)) return new-tree do (progn (when *regex-compile-verbose* (format t "~&Simplify Pass ~D:" pass) (pprint new-tree)) (setq prev-tree new-tree)))) ; for ALT's, after hoisting, try partitioning the set on the leading element, ; and see if we can't reduce things further: ; (alt (seq a) (seq a c) (seq c)) -> (alt (seq a (alt nil (seq c))) (seq c)) ; --> (alt-case (a (alt nil (seq c))) ; (c )) ; ; for Kleene's, check to see if the child node is also a kleene w/ no ; registers, and if so remove the inner kleene. ; (defun canonicalize-once (node) (cond ;; expand strings into sequences of char to enable more optimizations ((string-node-p node) (make-seq-node-list (coerce (string-node-string node) 'list))) ;; expand class-sequences into sequences of charclass ((classseq-node-p node) (make-seq-node-list (mapcar #'make-charclass-node (classseq-node-seq node)))) ((seq-node-p node) (let* ((canonicalchildren (mapcar #'canonicalize-once (seq-node-children node))) (denullchildren (remove-if #'null canonicalchildren)) (flatchildren (flatten-sequence denullchildren)) (children flatchildren) (numchildren (length flatchildren))) (cond ((zerop numchildren) nil) ((= numchildren 1) (first children)) (t (make-seq-node-list children))))) ((alt-node-p node) (let* ((children (mapcar #'canonicalize-once (alt-node-children node))) (flatchildren (flatten-alt children)) (uniquechildren (remove-duplicates flatchildren :from-end t)) (children uniquechildren)) (make-alt-node-list children))) ((kleene-node-p node) (make-kleene-node (canonicalize-once (kleene-node-child node)) (kleene-node-greedy-p node))) ;; If the child node is something trivial like char, seq of char, ;; any, charclass, specclass,or seq of charclass, may want to go ;; ahead and leave it as a + node and specialize it during the ;; instruction selection pass. ((pkleene-node-p node) (let ((greedyp (pkleene-node-greedy-p node)) (canonical-child (canonicalize-once (pkleene-node-child node)))) (make-seq-node-args canonical-child (make-kleene-node (cond (*registers-match-rightmost* canonical-child) (t (canonicalize-once (unregister canonical-child)))) greedyp)))) ((optional-node-p node) (let ((greedyp (optional-node-greedy-p node)) (canonical-child (canonicalize-once (optional-node-child node)))) (cond (greedyp (make-alt-node-args canonical-child nil)) (t (make-alt-node-args nil canonical-child))))) ((charclass-node-p node) (let* ((negp (charclass-node-negated-p node)) (chars (charclass-node-chars node)) (cclen (length chars))) (cond ((zerop cclen) nil) ((and (= cclen 1) (not negp)) (make-char-node (char chars 0))) (t node)))) ((register-node-p node) (let ((regnum (register-node-regnum node))) (make-seq-node-args (make-regstart-node regnum) (canonicalize-once (register-node-child node)) (make-regend-node regnum)))) ((range-node-p node) (expand-range (range-node-greedy-p node) (range-node-min node) (range-node-max node) (canonicalize-once (range-node-child node)))) ((lookahead-node-p node) (make-lookahead-node (canonicalize-once (lookahead-node-expr node)))) ((nlookahead-node-p node) (make-nlookahead-node (canonicalize-once (nlookahead-node-expr node)))) (t node))) (defun improve-once (node) (cond ((alt-node-p node) (let ((children (mapcar #'improve-once (alt-node-children node)))) (multiple-value-bind (prefix altbody suffix) (hoist-alt-ends children) (cond (altbody (make-seq-node-list `(,@prefix ,(make-alt-node-list altbody) ,@suffix))) ((and (or prefix suffix) (null altbody)) (make-seq-node-list `(,@prefix ,@suffix))) (t (make-alt-node-list altbody)))))) ((seq-node-p node) (make-seq-node-list (mapcar #'improve-once (seq-node-children node)))) ((kleene-node-p node) (let* ((greedyp (kleene-node-greedy-p node)) (child (improve-once (kleene-node-child node))) (hasregs (contains-registers-p child))) (cond ((or *registers-match-rightmost* (not hasregs)) (make-kleene-node child greedyp)) (greedyp (let ((more (make-kleene-node (unregister child) greedyp))) (make-alt-node-args (make-seq-node-args child more) nil))) (t ;; not greedy (let ((more (make-kleene-node (unregister child) greedyp))) (make-alt-node-args nil (make-seq-node-args child more))))))) ((lookahead-node-p node) (make-lookahead-node (improve-once (lookahead-node-expr node)))) ((nlookahead-node-p node) (make-nlookahead-node (improve-once (nlookahead-node-expr node)))) (t node))) (defun split-alts-aux (node) (cond ((seq-node-p node) (let ((children (mapcar #'split-alts-aux (seq-node-children node)))) (make-seq-node-list children))) ((alt-node-p node) (let ((children (mapcar #'split-alts-aux (alt-node-children node))) (num-children (alt-node-numchildren node))) (multiple-value-bind (unknown-char known-char-sets) (partition-on-leading-char children) (let* ((num-unknown-char (length unknown-char)) (num-known-char-sets (length known-char-sets)) (worth-partitioning-p (worth-alt-case-partitioning-p num-children num-unknown-char num-known-char-sets))) (cond ((and unknown-char known-char-sets worth-partitioning-p) (make-alt-node-list `(,(make-casealt-node-list (mapcar #'subalt-if-necessary known-char-sets)) ,@unknown-char))) ((and (not unknown-char) known-char-sets worth-partitioning-p) (make-casealt-node-list (mapcar #'subalt-if-necessary known-char-sets))) (t node)))))) ((kleene-node-p node) (make-kleene-node (split-alts-aux (kleene-node-child node)) (kleene-node-greedy-p node))) ((lookahead-node-p node) (make-lookahead-node (split-alts-aux (lookahead-node-expr node)))) ((nlookahead-node-p node) (make-nlookahead-node (split-alts-aux (nlookahead-node-expr node)))) (t node))) (defun simplify-once (node) (cond ((seq-node-p node) (let ((newchildren (combine-sequence-text (mapcar #'simplify-once (seq-node-children node))))) (cond ((= (length newchildren) 1) (first newchildren)) (t (make-seq-node-list newchildren))))) ((alt-node-p node) (let ((newalts (combine-alt-charclass (mapcar #'simplify-once (alt-node-children node))))) (cond ((= (length newalts) 1) (first newalts)) (t (make-alt-node-list newalts))))) ((casealt-node-p node) (make-casealt-node-list (mapcar #'(lambda (arm) (list (first arm) (simplify-once (second arm)))) (casealt-node-children node)))) ((kleene-node-p node) (make-kleene-node (simplify-once (kleene-node-child node)) (kleene-node-greedy-p node))) ((lookahead-node-p node) (make-lookahead-node (simplify-once (lookahead-node-expr node)))) ((nlookahead-node-p node) (make-nlookahead-node (simplify-once (nlookahead-node-expr node)))) (t node))) ; expand out a range (defun expand-range (greedyp lowbound highbound node) (cond ((and *registers-match-rightmost* (numberp highbound)) (let* ((reqd (loop for i from 0 below lowbound collect node)) (opt (loop with tmp = nil for i from lowbound below highbound when (= i lowbound) do (setq tmp (make-optional-node node greedyp)) when (> i lowbound) do (let ((seq (make-seq-node-args node tmp))) (setq tmp (cond (greedyp (make-alt-node-args seq nil)) (t (make-alt-node-args nil seq))))) finally (return tmp)))) (make-seq-node-list `(,@reqd ,opt)))) ((and *registers-match-rightmost* (null highbound)) (let* ((reqd (loop for i from 0 below lowbound collect node))) (make-seq-node-list `(,@reqd ,(make-kleene-node node greedyp))))) ((and (not *registers-match-rightmost*) (numberp highbound)) (let* ((registerless-node (unregister node)) (reqd (loop for i from 0 below lowbound when (zerop i) collect node when (> i 0) collect registerless-node)) (opt (loop with tmp = nil for i from lowbound below highbound when (= i lowbound) do (setq tmp (make-optional-node (if (zerop lowbound) node registerless-node) greedyp)) when (> i lowbound) do (let ((seq (make-seq-node-args registerless-node tmp))) (setq tmp (cond (greedyp (make-alt-node-args seq nil)) (t (make-alt-node-args nil seq))))) finally (return tmp)))) (make-seq-node-list `(,@reqd ,opt)))) (t (let* ((registerless-node (unregister node)) (reqd (loop for i from 0 below lowbound when (zerop i) collect node when (> i 0) collect registerless-node))) (cond ((zerop lowbound) (make-kleene-node node greedyp)) (t (make-seq-node-list `(,@reqd ,(make-kleene-node registerless-node greedyp))))))))) (defun coercetostring (x) (cond ((stringp x) x) ((characterp x) (string x)))) ; unnest sequences where possible (defun flatten-sequence (nodes) (cond ((null nodes) nil) ((seq-node-p (first nodes)) (flatten-sequence (append (seq-node-children (first nodes)) (rest nodes)))) (t (cons (first nodes) (flatten-sequence (rest nodes)))))) ; combine runs of chars and strings into strings ; combine runs of character classes into charclass-sequence's (defun combine-sequence-text (nodes) (cond ((null nodes) nil) ((string-seq-p nodes) (multiple-value-bind (str restseq) (partition-string-sequence nodes) (cons (make-string-node str) (combine-sequence-text restseq)))) ((char-class-seq-p nodes) (multiple-value-bind (classseq restseq) (partition-charclass-sequence nodes) (cons (make-classseq-node classseq) (combine-sequence-text restseq)))) (t (cons (first nodes) (combine-sequence-text (rest nodes)))))) ; combine multiple character classes in an ALT into one character class (defun combine-alt-charclass (nodes) (cond ((>= (count-if #'char-or-class-node-p nodes) 2) (multiple-value-bind (chars othernodes) (partition-charclass-alt nodes) (cons (make-charclass-node chars) othernodes))) (t nodes))) ;; does this sequence start out with a run of chars? (defun string-seq-p (seq) (loop for item in seq for i from 0 while (text-node-p item) when (>= i 1) return t) ) ;; does this sequence start out with a run of char-class? (defun char-class-seq-p (seq) (loop for item in seq for i from 0 while (positive-charclass-node-p item) when (>= i 1) return t) ) ;; partitions sequences into a string representing the leading run, ;; and the rest of the sequence (defun partition-string-sequence (seq) (loop with strseq = nil for item = (pop seq) when (text-node-p item) do (push item strseq) when (not (text-node-p item)) return (values (append-strings (mapcar #'coercetostring (reverse strseq))) (cond ((and (null item) (null seq)) nil) ((null seq) (list item)) (t (cons item seq)))))) (defun append-strings (strings) (cond ((null strings) "") (t (loop for str in strings for result = str then (concatenate 'simple-string result str) finally (return result))))) (defun partition-charclass-sequence (seq) (loop with classseq = nil for item = (pop seq) when (positive-charclass-node-p item) do (push item classseq) when (not (charclass-node-p item)) return (values (mapcar #'classseq-node-seq (reverse classseq)) (cond ((and (null item) (null seq)) nil) ((null seq) (list item)) (t (cons item seq)))))) (defun partition-charclass-alt (nodes) (loop with chars = "" for node = (pop nodes) when (charclass-node-p node) do (setq chars (concatenate 'string chars (charclass-node-chars node))) when (char-node-p node) do (setq chars (concatenate 'string chars (string (char-node-char node)))) when (not (char-or-class-node-p node)) return (values chars (cond ((and (null node) (null nodes)) nil) ((null nodes) (list node)) (t (cons node nodes)))))) (defun flatten-alt (nodes) (cond ((null nodes) nil) ((alt-node-p (first nodes)) (flatten-alt (append (alt-node-children (first nodes)) (rest nodes)))) (t (cons (first nodes) (flatten-alt (rest nodes)))))) (defun hoist-alt-ends (nodes) (multiple-value-bind (prefix restnodes) (hoist-alt-prefix nodes) (multiple-value-bind (altbody suffix) (hoist-alt-suffix restnodes) (values prefix altbody suffix)))) (defun seq-first (node) (cond ((seq-node-p node) (first (seq-node-children node))) (t node))) (defun seq-first-char (node) (cond ((seq-node-p node) (let ((children (seq-node-children node))) (loop for child in children when (or (alt-node-p child) (kleene-node-p child) (specclass-node-p child) (hook-node-p child) (backmatch-node-p child) (lookahead-node-p child) (nlookahead-node-p child)) return nil when (or (charclass-node-p child) (char-node-p child)) return child))) (t nil))) (defun seq-rest (node) (cond ((seq-node-p node) (make-seq-node-list (rest (seq-node-children node)))) (t nil))) ;; unlike CL's last, this returns the last car, not the last cons cell (defun seq-last (node) (cond ((seq-node-p node) (car (last (seq-node-children node)))) (t node))) (defun seq-butlast (node) (cond ((seq-node-p node) (make-seq-node-list (butlast (seq-node-children node)))) (t nil))) (defun hoist-alt-prefix (nodes) (let ((prefixes (mapcar #'seq-first nodes)) (rests (mapcar #'seq-rest nodes))) (cond ((or (null prefixes) (some #'null prefixes)) (values nil nodes)) ((every #'equal prefixes (rest prefixes)) (cond ((or (null rests) (every #'null rests)) (values (list (first prefixes)) nil)) ((or (null rests) (some #'null rests)) (values (list (first prefixes)) rests)) (t (multiple-value-bind (other-prefixes altnodes) (hoist-alt-prefix rests) (values (cons (first prefixes) other-prefixes) altnodes))))) (t (values nil nodes))))) (defun hoist-alt-suffix (nodes) (let ((suffixes (mapcar #'seq-last nodes)) (butlasts (mapcar #'seq-butlast nodes))) (cond ((or (null suffixes) (some #'null suffixes)) (values nodes nil)) ((every #'equal suffixes (rest suffixes)) (cond ((or (null butlasts) (some #'null butlasts)) (values butlasts (list (first suffixes)))) (t (multiple-value-bind (altnodes other-suffixes) (hoist-alt-suffix butlasts) (values altnodes (cons (first suffixes) other-suffixes)))))) (t (values nodes nil))))) ; (alt "aaa" "bbb" "bbc" "ccc" "ccd" (* #\e)) ; --> ; ((* #\e)) ; ((#\a "aaa") ; (#\b "bbb" "bbc") ; (#\c "ccc" "ccd")) ;; Now handles character classes as well (defun partition-on-leading-char (children) (let* ((leading-char-alist (make-discriminant-char-alist children)) (children-without-leading-char (mapcar #'second (remove-if #'first leading-char-alist))) (children-with-leading-char (remove-if-not #'first leading-char-alist)) (sorted-lc-children (sort children-with-leading-char #'sort-lc-pred :key #'first))) (loop with partition = nil for (prev-leading-char prev-childnode) in sorted-lc-children for (leading-char childnode) in (rest sorted-lc-children) for current-set = (list prev-childnode) then current-set when (not (equalp prev-leading-char leading-char)) do (progn (push (cons prev-leading-char (reverse current-set)) partition) (setq current-set (list childnode))) when (equalp prev-leading-char leading-char) do (push childnode current-set) finally (return (values children-without-leading-char (reverse (cons (cons leading-char (reverse current-set)) partition))))))) (defun sort-lc-pred (a b) (cond ((and (characterp a) (characterp b)) (char< a b)) ((and (characterp a) (stringp b)) (char< a (char b 0))) ((and (stringp a) (characterp b)) (char< (char a 0) b)))) (defun make-discriminant-char-alist (nodes &aux alist) (dolist (node nodes alist) (setq alist (nconc alist (get-discriminant-chars-alist node))))) ;; if first in sequence is a positive charclass, return all chars as the discriminant (defun get-discriminant-chars-alist (node) (let ((first (seq-first-char node))) (cond ((char-node-p first) (list (list (char-node-char first) node))) ((charclass-node-p first) (loop for x across (charclass-node-chars first) collect (list x node))) (t (list (list nil node)))))) (defun worth-alt-case-partitioning-p (num-children num-unknown-char-sets num-known-char-sets) "Is it worth partitioning an alt into a case-alt?" (and ;; Partition must split out at least 4 alternatives (>= (- num-children num-unknown-char-sets) 4) ;; Must be more than one set in the alt-case (the case of 1 should ;; be handled separately) (> num-known-char-sets 1))) (defun subalt-if-necessary (casealt-arm) (let ((len (length (rest casealt-arm)))) (cond ((zerop len) (error "sub-alt must have at least one clause ~S" casealt-arm)) ((= len 1) `(,(first casealt-arm) ,(make-seq-node-args (second casealt-arm)))) (t `(,(first casealt-arm) ,(make-alt-node-list (rest casealt-arm))))))) (defun obviously-nullable-pattern (tree) (and (listp tree) (or (not (null (member (first tree) '(* *?)))) (and (eq (first tree) 'alt) (<= (length tree) 2))))) (defun contains-looping-pattern-p (tree) (tree-any #'(lambda (x) (member x '(* + *? +?) :test #'eq)) tree)) (defun contains-registers-p (tree) (tree-any #'(lambda (x) (member x '(reg regstart regend))) tree)) (defun tree-any (fxn tree) (labels ((tree-any-aux (tree) (cond ((null tree) nil) ((atom tree) (funcall fxn tree)) (t (or (funcall fxn tree) (safer-some #'tree-any-aux tree)))))) (tree-any-aux tree))) ;; Similar to CL's SOME, but doesn't barf on improper lists. (defun safer-some (fxn lst) (loop while (consp lst) for x = (pop lst) when (funcall fxn x) return t finally (return (cond ((null lst) nil) (t (funcall fxn lst)))))) (defun unregister (node) (cond ((null node) nil) ((char-node-p node) node) ((string-node-p node) node) ((classseq-node-p node) node) ((backmatch-node-p node) node) ((seq-node-p node) (make-seq-node-list (remove-if #'null (mapcar #'unregister (seq-node-children node))))) ((kleene-node-p node) (make-kleene-node (unregister (kleene-node-child node)) (kleene-node-greedy-p node))) ((pkleene-node-p node) (make-pkleene-node (unregister (pkleene-node-child node)) (pkleene-node-greedy-p node))) ((optional-node-p node) (make-optional-node (unregister (optional-node-child node)) (optional-node-greedy-p node))) ((range-node-p node) (make-range-node (unregister (range-node-child node)) (range-node-min node) (range-node-max node) (range-node-greedy-p node))) ((alt-node-p node) ; don't descend into alt nodes -- we may match one branch one iter, and another ; branch the next. ;(make-alt-node-list (mapcar #'unregister (alt-node-children node)))) node) ((start-anchor-node-p node) node) ((end-anchor-node-p node) node) ((register-node-p node) (unregister (register-node-child node))) ((regstart-node-p node) nil) ((regend-node-p node) nil) ((charclass-node-p node) (make-charclass-node (charclass-node-chars node) :negated (charclass-node-negated-p node))) ((specclass-node-p node) (make-specclass-node (specclass-node-class node) :negated (specclass-node-negated-p node))) ((any-node-p node) node) ((hook-node-p node) node) ((lookahead-node-p node) (make-lookahead-node (unregister (lookahead-node-expr node)))) ((nlookahead-node-p node) (make-nlookahead-node (unregister (nlookahead-node-expr node)))) (t ;; once we're done, this should throw the :invalid-parse-tree tag (throw 'regex-parse-error (list "unregister: Unrecognized regex parse tree node ~S" node)))) ) ;;; ;;; Pass 4 - Instruction selection ;;; ; replace *, +, etc with the actual instructions to be used... (defun select-instructions (node) (cond ((null node) nil) ((char-node-p node) (select-char-instr (char-node-char node))) ((string-node-p node) (select-string-instr (string-node-string node))) ((classseq-node-p node) (select-classseq-instr (classseq-node-seq node))) ((backmatch-node-p node) node) ((seq-node-p node) (select-sequence-instrs (seq-node-children node))) ((kleene-node-p node) (cond ((kleene-node-greedy-p node) (select-greedy-kleene-instr (kleene-node-child node))) (t (select-nongreedy-kleene-instr (kleene-node-child node))))) ;; pkleene-nodes have been removed by the simplification process ((optional-node-p node) (cond ((optional-node-greedy-p node) (select-greedy-optional-instr (optional-node-child node))) (t (select-nongreedy-optional-instr (optional-node-child node))))) ;; range-nodes have been removed by simplification process ((alt-node-p node) (select-alt-instrs (alt-node-children node))) ((casealt-node-p node) (select-casealt-instr (casealt-node-children node))) ((start-anchor-node-p node) node) ((end-anchor-node-p node) node) ;; register-nodes have been removed by simplification process ((regstart-node-p node) (let ((regnum (regstart-node-regnum node))) (cond (*registers-match-rightmost* `(right-rstart ,regnum)) (t `(left-rstart ,regnum))))) ((regend-node-p node) node) ((charclass-node-p node) (let ((chars (charclass-node-chars node))) (cond ((not (charclass-node-negated-p node)) (select-charclass-instr chars)) (t (select-negated-charclass-instr chars))))) ((specclass-node-p node) (cond ((not (specclass-node-negated-p node)) (select-specclass-instr (specclass-node-class node))) (t (select-negated-specclass-instr (specclass-node-class node))))) ((any-node-p node) node) ((hook-node-p node) node) ((success-node-p node) node) ((startword-node-p node) node) ((endword-node-p node) node) ((lookahead-node-p node) (make-lookahead-node (select-instructions (lookahead-node-expr node)))) ((nlookahead-node-p node) (make-nlookahead-node (select-instructions (nlookahead-node-expr node)))) (t ;; once we're done, this should throw the :invalid-parse-tree tag (throw 'regex-parse-error (list "select-instructions: Unhandled regex parse tree node ~S" node))))) (defun select-char-instr (chr) `(char ,chr)) (defun select-string-instr (str) ;; the pattern string must always be a simple-string (let ((simple-pat-str (coerce str 'simple-string))) `(string ,simple-pat-str))) (defun select-classseq-instr (classseq) `(classseq ,classseq)) (defun select-sequence-instrs (children) (make-seq-node-list (mapcar #'select-instructions children))) ;; optimize (alt-2 ) ;; optimize (alt-2 ) ;; optimize (alt-2 ) ;; optimize (alt-2 ) (defun select-alt-instrs (children) (make-alt-node-list (mapcar #'select-instructions children))) (defun select-casealt-instr (children) (make-casealt-node-list (mapcar #'(lambda (arm) (list (first arm) (select-instructions (second arm)))) children))) ;; By this point, we have already been unrolled to move registers out ;; of loops, so we can just worry about the special cases. (defun select-greedy-kleene-instr (child &aux (nullpat (nullable-pattern-p child)) (looppat (contains-looping-pattern-p child))) (cond ((char-node-p child) `(char-greedy-kleene ,(char-node-char child))) ((string-node-p child) ;; the pattern string must always be a simple-string (let ((simple-pat-str (coerce (string-node-string child) 'simple-string))) `(str-greedy-kleene ,simple-pat-str))) ((charclass-node-p child) (let* ((negp (charclass-node-negated-p child)) (chars (charclass-node-chars child)) (ccsize (length chars))) (case ccsize (0 '(any-greedy-kleene)) (1 (cond ((not negp) `(char-greedy-kleene ,(char chars 0))) (negp `(not-char-greedy-kleene ,(char chars 0))))) (2 (cond ((not negp) `(cclass-2-greedy-kleene ,(char chars 0) ,(char chars 1))) (negp `(not-cclass-2-greedy-kleene ,(char chars 0) ,(char chars 1))))) (t (let ((schars (coerce chars 'simple-string))) (cond ((not negp) `(cclass-greedy-kleene ,schars)) (t `(not-cclass-greedy-kleene ,schars)))))))) ((specclass-node-p child) (let* ((negp (specclass-node-negated-p child)) (specclass (specclass-node-class child))) (cond ((not negp) `(specclass-greedy-kleene ,specclass)) (negp `(not-specclass-greedy-kleene ,specclass))))) ((not nullpat) `(greedy-kleene-no-termcheck ,(select-instructions child))) ((or (and (not nullpat) looppat) (and nullpat (not looppat))) `(greedy-kleene-simple-termcheck ,(select-instructions child))) (t `(greedy-kleene-full-termcheck ,(select-instructions child)))) ) (defun select-nongreedy-kleene-instr (child) (let ((nullpat (nullable-pattern-p child)) (looppat (contains-looping-pattern-p child))) (cond ((not nullpat) `(ngkleene-no-termcheck ,(select-instructions child))) ((or (and (not nullpat) looppat) (and nullpat (not looppat))) `(ngkleene-simple-termcheck ,(select-instructions child))) (t `(ngkleene-full-termcheck ,(select-instructions child))))) ) (defun select-greedy-optional-instr (child) (make-alt-node-args (select-instructions child) nil)) (defun select-nongreedy-optional-instr (child) (make-alt-node-args nil (select-instructions child))) (defun select-charclass-instr (chars) (let ((ccsize (length chars))) (case ccsize (1 (select-char-instr (char chars 0))) (2 `(cclass-2 ,(char chars 0) ,(char chars 1))) (t (let ((simple-chars (coerce chars 'simple-string))) `(cclass ,simple-chars)))))) (defun select-negated-charclass-instr (chars) (let ((ccsize (length chars))) (case ccsize (1 `(not-char ,(char chars 0))) (2 `(not-cclass-2 ,(char chars 0) ,(char chars 1))) (t (let ((simple-chars (coerce chars 'simple-string))) `(not-cclass ,simple-chars)))))) (defun select-specclass-instr (class) `(specclass ,class)) (defun select-negated-specclass-instr (class) `(not-specclass ,class) ) (defun nullable-pattern-p (node) (cond ((null node) t) ((char-node-p node) nil) ((string-node-p node) nil) ((classseq-node-p node) nil) ((backmatch-node-p node) t) ((seq-node-p node) (every #'nullable-pattern-p (seq-node-children node))) ((kleene-node-p node) t) ((pkleene-node-p node) (nullable-pattern-p (pkleene-node-child node))) ((optional-node-p node) t) ((range-node-p node) (zerop (range-node-min node))) ((alt-node-p node) (some #'nullable-pattern-p (alt-node-children node))) ((start-anchor-node-p node) t) ((end-anchor-node-p node) t) ((register-node-p node) (nullable-pattern-p (register-node-child node))) ((regstart-node-p node) t) ((regend-node-p node) t) ((charclass-node-p node) nil) ((specclass-node-p node) nil) ((any-node-p node) nil) ((hook-node-p node) t) ((lookahead-node-p node) t) ((nlookahead-node-p node) t) (t ;; once we're done, this should throw the :invalid-parse-tree tag (throw 'regex-parse-error (list "nullable-pattern-p: Unrecognized regex parse tree node ~S" node)))) ) cl-regex-1/gen.lisp0100755000175000001440000002361607550103336013103 0ustar mrdusers;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: REGEX; Base: 10 -*- (in-package :REGEX) ;;; ;;; Pass 5 - Generate list of CPS instructions ;;; ; Generate a flattened list of instructions, with continuation ; targets, and (init) and (success) in the proper places... (defun gen-instr-list (tree) (let* ((*instrs*) (*next-instr* 0)) (declare (special *instrs* *next-instr*)) (let* ((success-instr (alloc-instr)) (start-instr (emit-instr tree success-instr))) (add-instr success-instr '(success t)) (values start-instr (sort *instrs* #'< :key #'first))))) (defun add-instr (statenum instr) (declare (special *instrs*)) (push (cons statenum instr) *instrs*) statenum) (defun alloc-instr () (declare (special *next-instr*)) (let ((instr *next-instr*)) (incf *next-instr*) instr)) ; similar to old compile-state-machine (defun emit-instr (node next) (cond ((null node) next) ((eq (first node) 'char) (add-instr (alloc-instr) `(char ,(second node) ,next))) ((eq (first node) 'string) (add-instr (alloc-instr) `(string ,(second node) ,next))) ((classseq-node-p node) (add-instr (alloc-instr) `(classseq ,(classseq-node-seq node) ,next))) ((backmatch-node-p node) (add-instr (alloc-instr) `(backmatch ,(backmatch-node-regnum node) ,next))) ((seq-node-p node) (emit-sequence (seq-node-children node) next)) ;; kleene-nodes have been optimized to specialized instructions ((eq (first node) 'str-greedy-kleene) (add-instr (alloc-instr) `(str-greedy-kleene ,(second node) ,next))) ((eq (first node) 'char-greedy-kleene) (add-instr (alloc-instr) `(char-greedy-kleene ,(second node) ,next))) ((eq (first node) 'not-char-greedy-kleene) (add-instr (alloc-instr) `(not-char-greedy-kleene ,(second node) ,next))) ((eq (first node) 'cclass-2-greedy-kleene) (add-instr (alloc-instr) `(cclass-2-greedy-kleene ,(second node) ,(third node) ,next))) ((eq (first node) 'not-cclass-2-greedy-kleene) (add-instr (alloc-instr) `(not-cclass-2-greedy-kleene ,(second node) ,(third node) ,next))) ((eq (first node) 'cclass-greedy-kleene) (add-instr (alloc-instr) `(cclass-greedy-kleene ,(second node) ,next))) ((eq (first node) 'not-cclass-greedy-kleene) (add-instr (alloc-instr) `(not-cclass-greedy-kleene ,(second node) ,next))) ((eq (first node) 'specclass-greedy-kleene) (add-instr (alloc-instr) `(specclass-greedy-kleene ,(second node) ,next))) ((eq (first node) 'not-specclass-greedy-kleene) (add-instr (alloc-instr) `(not-specclass-greedy-kleene ,(second node) ,next))) ((eq (first node) 'greedy-kleene-no-termcheck) (emit-greedy-kleene-no-termcheck (second node) next)) ((eq (first node) 'greedy-kleene-simple-termcheck) (emit-greedy-kleene-simple-termcheck (second node) next)) ((eq (first node) 'greedy-kleene-full-termcheck) (emit-greedy-kleene-full-termcheck (second node) next)) ((eq (first node) 'ngkleene-no-termcheck) (emit-ngkleene-no-termcheck (second node) next)) ((eq (first node) 'ngkleene-simple-termcheck) (emit-ngkleene-simple-termcheck (second node) next)) ((eq (first node) 'ngkleene-full-termcheck) (emit-ngkleene-full-termcheck (second node) next)) ;; pkleene-nodes are long since converted to opt and kleene nodes ;; optional nodes are long since converted to alt nodes ;; range-nodes are long since gone... ((alt-node-p node) (emit-alt (alt-node-children node) next)) ((casealt-node-p node) (emit-casealt (casealt-node-children node) next)) ((start-anchor-node-p node) (add-instr (alloc-instr) `(startanchor ,next))) ((end-anchor-node-p node) (add-instr (alloc-instr) `(endanchor ,next))) ;; register-nodes are long since gone, converted to regstart/regend ((eq (first node) 'right-rstart) (add-instr (alloc-instr) `(right-rstart ,(second node) ,next))) ((eq (first node) 'left-rstart) (add-instr (alloc-instr) `(left-rstart ,(second node) ,next))) ((regend-node-p node) (add-instr (alloc-instr) `(rend ,(regend-node-regnum node) ,next))) ;; charclass-nodes are long since converted to specialized instr nodes ((eq (first node) 'cclass-2) (add-instr (alloc-instr) `(cclass-2 ,(second node) ,(third node) ,next))) ((eq (first node) 'cclass) (add-instr (alloc-instr) `(cclass ,(second node) ,next))) ((eq (first node) 'not-char) (add-instr (alloc-instr) `(not-char ,(second node) ,next))) ((eq (first node) 'not-cclass-2) (add-instr (alloc-instr) `(not-cclass-2 ,(second node) ,(third node) ,next))) ((eq (first node) 'not-cclass) (add-instr (alloc-instr) `(not-cclass ,(second node) ,next))) ;; specclass-nodes are long since gone, converted to specialized instr nodes ((eq (first node) 'specclass) (add-instr (alloc-instr) `(specclass ,(second node) ,next))) ((eq (first node) 'not-specclass) (add-instr (alloc-instr) `(not-specclass ,(second node) ,next))) ((any-node-p node) (add-instr (alloc-instr) `(any ,next))) ((startword-node-p node) (add-instr (alloc-instr) `(startword ,next))) ((endword-node-p node) (add-instr (alloc-instr) `(endword ,next))) ((lookahead-node-p node) (emit-lookahead (second node) next)) ((nlookahead-node-p node) (emit-nlookahead (second node) next)) ((hook-node-p node) (add-instr (alloc-instr) `(hook ,(hook-node-function node) ,next))) ((success-node-p node) (add-instr (alloc-instr) `(success ,(success-node-rc node) ,next))) (t ;; once we're done, this should throw the :invalid-parse-tree tag (throw 'regex-parse-error (list "codegen: Unhandled intermediate node ~S" node))))) (defun emit-sequence (children next) (cond ((null children) next) (t (emit-instr (first children) (emit-sequence (rest children) next))))) (defun emit-lookahead (child-node next) (let* ((childsuccess-instr-num (alloc-instr)) (child-instr-num (emit-instr child-node childsuccess-instr-num))) (add-instr childsuccess-instr-num '(success t)) (add-instr (alloc-instr) `(lookahead ,child-instr-num ,next)) )) (defun emit-nlookahead (child-node next) (let* ((childsuccess-instr-num (alloc-instr)) (child-instr-num (emit-instr child-node childsuccess-instr-num))) (add-instr childsuccess-instr-num '(success t)) (add-instr (alloc-instr) `(nlookahead ,child-instr-num ,next)) )) (defun emit-greedy-kleene-no-termcheck (child-node next) (let* ((loop-instr-num (alloc-instr)) (body-instr-num (emit-instr child-node loop-instr-num))) (add-instr loop-instr-num `(alt-2-no-termcheck ,body-instr-num ,next)))) (defun emit-greedy-kleene-simple-termcheck (child-node next) (let* ((loop-instr-num (alloc-instr)) (body-instr-num (emit-instr child-node loop-instr-num))) (add-instr loop-instr-num `(alt-2-simple-termcheck-1 ,body-instr-num ,next)))) (defun emit-greedy-kleene-full-termcheck (child-node next) (let* ((loop-instr-num (alloc-instr)) (body-instr-num (emit-instr child-node loop-instr-num))) (add-instr loop-instr-num `(alt-2-full-termcheck ,body-instr-num ,next)))) (defun emit-ngkleene-no-termcheck (child-node next) (let* ((loop-instr-num (alloc-instr)) (body-instr-num (emit-instr child-node loop-instr-num))) (add-instr loop-instr-num `(alt-2-no-termcheck ,next ,body-instr-num)))) (defun emit-ngkleene-simple-termcheck (child-node next) (let* ((loop-instr-num (alloc-instr)) (body-instr-num (emit-instr child-node loop-instr-num))) (add-instr loop-instr-num `(alt-2-simple-termcheck-2 ,next ,body-instr-num)))) (defun emit-ngkleene-full-termcheck (child-node next) (let* ((loop-instr-num (alloc-instr)) (body-instr-num (emit-instr child-node loop-instr-num))) (add-instr loop-instr-num `(alt-2-full-termcheck ,next ,body-instr-num)))) (defun emit-alt (child-nodes next) (let* ((alt-instr-num (alloc-instr)) (child-instr-nums (mapcar (lambda (child-node) (emit-instr child-node next)) child-nodes)) (num-children (length child-nodes))) (cond ((< num-children 2) (error "Too few child nodes for alt ~S" child-nodes)) ((= (length child-nodes) 2) (add-instr alt-instr-num `(alt-2-no-termcheck ,@child-instr-nums))) (t (add-instr alt-instr-num `(alt-no-termcheck ,child-instr-nums)))))) ;; The arms may be replicated many times, so we need to be careful to re-use ;; them when necessary (defun emit-casealt (child-nodes next) (let* ((alt-instr-num (alloc-instr)) (arm-reuse-cache (make-hash-table :test 'equalp)) (child-instr-branches (mapcar (lambda (arm) (let* ((guard (first arm)) (consequent-node (second arm)) (prev-lbl (gethash consequent-node arm-reuse-cache))) (if prev-lbl (list guard prev-lbl) (list guard (setf (gethash consequent-node arm-reuse-cache) (emit-instr consequent-node next)))))) child-nodes)) (num-children (length child-nodes))) (cond ((< num-children 2) (error "Too few child nodes for alt ~S" child-nodes)) (t (add-instr alt-instr-num `(casealt ,child-instr-branches)))))) cl-regex-1/closure.lisp0100755000175000001440000012166307550106060014004 0ustar mrdusers;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: REGEX; Base: 10 -*- (in-package :REGEX) ;;; ;;; Pass 6 - Code generation to closures ;;; ; returns array of info structures (defun gen-closures (cpscode) (let ((info (make-array (length cpscode)))) (loop for instr in cpscode for offset = (first instr) for opcode = (second instr) for args = (cddr instr) do (setf (aref info offset) (gen-closure opcode args))) info)) (defun gen-closure (opcode args) (destructuring-bind (&optional arg1 arg2 arg3) args (case opcode (char (gen-char-closure arg1 arg2)) (string (gen-string-closure (length arg1) arg1 arg2)) (classseq (let ((num-classes (length arg1))) (gen-classseq-closure num-classes (make-array num-classes :initial-contents arg1) arg2))) (backmatch (gen-backmatch-closure arg1 arg2)) (str-greedy-kleene (gen-str-greedy-kleene-closure (length arg1) arg1 arg2)) (char-greedy-kleene (gen-char-greedy-kleene-closure arg1 arg2)) (not-char-greedy-kleene (gen-not-char-greedy-kleene-closure arg1 arg2)) (cclass-2-greedy-kleene (gen-cclass-2-greedy-kleene-closure arg1 arg2 arg3)) (not-cclass-2-greedy-kleene (gen-not-cclass-2-greedy-kleene-closure arg1 arg2 arg3)) (cclass-greedy-kleene (gen-cclass-greedy-kleene-closure arg1 arg2)) (not-cclass-greedy-kleene (gen-not-cclass-greedy-kleene-closure arg1 arg2)) (specclass-greedy-kleene (gen-specclass-greedy-kleene-closure (get-spec-pat-fxn arg1) arg2)) (not-specclass-greedy-kleene (gen-not-specclass-greedy-kleene-closure (get-spec-pat-fxn arg1) arg2)) (alt-2-simple-termcheck-1 (gen-alt-2-simple-termcheck-1-closure arg1 arg2)) (alt-2-simple-termcheck-2 (gen-alt-2-simple-termcheck-2-closure arg1 arg2)) (alt-2-full-termcheck (gen-alt-2-full-termcheck-closure arg1 arg2)) (alt-2-no-termcheck (gen-alt-2-no-termcheck-closure arg1 arg2)) (alt-no-termcheck (let ((num-branches (length arg1))) (gen-alt-no-termcheck-closure num-branches (make-array num-branches :initial-contents arg1)))) (casealt (multiple-value-bind (numbranches jmptbl) (make-casealt-jmptable arg1) (gen-casealt-closure numbranches jmptbl))) (startanchor (gen-startanchor-closure arg1)) (endanchor (gen-endanchor-closure arg1)) (left-rstart (gen-left-rstart-closure arg1 arg2)) (right-rstart (gen-right-rstart-closure arg1 arg2)) (rend (gen-rend-closure arg1 arg2)) (cclass-2 (gen-cclass-2-closure arg1 arg2 arg3)) (cclass (gen-cclass-closure arg1 arg2)) (not-char (gen-not-char-closure arg1 arg2)) (not-cclass-2 (gen-not-cclass-2-closure arg1 arg2 arg3)) (not-cclass (gen-not-cclass-closure arg1 arg2)) (specclass (gen-specclass-closure (get-spec-pat-fxn arg1) arg2)) (not-specclass (gen-not-specclass-closure (get-spec-pat-fxn arg1) arg2)) (any (gen-any-closure arg1)) (startword (gen-startword-closure arg1)) (endword (gen-endword-closure arg1)) (lookahead (gen-lookahead-closure arg1 arg2)) (nlookahead (gen-nlookahead-closure arg1 arg2)) (hook (gen-hook-closure arg1 arg2)) (success (gen-success-closure arg1)) (t (error "gen-closure: Unknown instruction ~S ~S" opcode args)))) ) (defun make-casealt-jmptable (jumps) (let* ((num-jump-targets (length jumps)) (jump-tbl (make-array (* 2 num-jump-targets)))) (loop for discriminator-idx from 0 by 2 for destination-idx from 1 by 2 for (discriminator destination) in jumps do (progn (setf (aref jump-tbl discriminator-idx) discriminator) (setf (aref jump-tbl destination-idx) destination)) finally (return (values num-jump-targets jump-tbl))))) (defmacro make-text-closure (&key matcher initializer linker) `(make-closure-info :matchfn (if *match-simple-strings-only* (macrolet ((re-char (str idx) `(the character (schar (the simple-string ,str) (the fixnum ,idx))))) ,matcher) (macrolet ((re-char (str idx) `(the character (char (the string ,str) (the fixnum ,idx))))) ,matcher)) :initfn ,initializer :linkfn ,linker)) (defmacro make-nontext-closure (&key matcher initializer linker) `(make-closure-info :matchfn ,matcher :initfn ,initializer :linkfn ,linker)) (defun gen-char-closure (chr next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (character chr) (ftype (function (fixnum) t) next) (special *str* *end*) (string *str*) (fixnum *end*)) (cond ((and (< pos *end*) (char= chr (re-char *str* pos))) (funcall next (1+ pos))))) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next)) ))) (defun gen-string-closure (len patstr next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0) )) (declare (fixnum pos) (fixnum len) (string patstr) (ftype (function (fixnum) t) next) (special *str* *end*) (string *str*) (fixnum *end*)) (let ((end (+ pos len))) (declare (fixnum end)) (when (<= end *end*) (loop for i fixnum from pos below end for j fixnum from 0 below len when (char/= (re-char *str* i) (re-char patstr j)) return nil finally (funcall next i))))) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-classseq-closure (len chrclasses next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0) )) (declare (fixnum pos) (fixnum len) (simple-vector chrclasses) (ftype (function (fixnum) t) next) (special *str* *end*) (string *str*) (fixnum *end*)) (let ((reg-str-end (+ pos len))) (declare (fixnum reg-str-end)) (when (<= reg-str-end *end*) (loop for i fixnum from pos below reg-str-end for j fixnum from 0 below len unless (find (re-char *str* i) (svref chrclasses j)) return nil finally (funcall next reg-str-end))))) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-backmatch-closure (regnum next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (fixnum regnum) (ftype (function (fixnum) t) next) (special *str* *regs* *end*) (string *str*) (type simple-vector *regs*) (fixnum *end*)) (let ((reg-start (register-start *regs* regnum)) (reg-end (register-end *regs* regnum))) (cond ((numberp reg-start) ;; If reg-start is set, but reg-end isn't, then we're matching ;; inside that register's context. So use POS as the end. (unless (integerp reg-end) (setq reg-end pos)) (let* ((reg-len (- reg-end reg-start)) (reg-str-end (+ pos reg-len))) (declare (fixnum reg-len reg-str-end)) (when (<= reg-str-end *end*) (loop for i fixnum from pos below reg-str-end for j fixnum from reg-start below reg-end when (char/= (re-char *str* i) (re-char *str* j)) return nil finally (funcall next i))))) ;; backmatching an unmatched register is ok -- it may mean ;; that the register was for ()* or ()? or something, so treat ;; it as a 0-length register (t (funcall next pos))))) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-str-greedy-kleene-closure (len patstr next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (fixnum len) (string patstr) (character chr) (ftype (function (fixnum) t) next) (special *str* *end*) (string *str*) (fixnum *end*)) (let ((firstend (+ pos len)) (lastpos pos) (end (- *end* len))) (declare (fixnum firstend lastpos end)) (when (<= firstend *end*) (loop for testpos fixnum from pos upto end by len while (loop for j fixnum from 0 below len for i fixnum from testpos when (char/= (re-char *str* i) (re-char patstr j)) do (return nil) finally (return t)) do (incf lastpos len))) (loop for testpos fixnum from lastpos downto pos by len do (funcall next testpos)))) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))))) (defun gen-char-greedy-kleene-closure (chr next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (character chr) (ftype (function (fixnum) t) next) (special *str* *end*) (string *str*) (fixnum *end*)) (let ((newpos pos)) (declare (fixnum newpos)) (loop while (and (< newpos *end*) (char= chr (re-char *str* newpos))) do (incf newpos)) (loop while (>= newpos pos) do (progn (funcall next newpos) (decf newpos))) nil)) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-not-char-greedy-kleene-closure (chr next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (ftype (function (fixnum) t) next) (character chr) (special *str* *end*) (string *str*) (fixnum *end*)) (let ((newpos pos)) (declare (fixnum newpos)) (loop while (and (< newpos *end*) (char/= chr (re-char *str* newpos))) do (incf newpos)) (loop while (>= newpos pos) do (progn (funcall next newpos) (decf newpos))) nil)) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-cclass-2-greedy-kleene-closure (chr1 chr2 next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (character chr1 chr2) (ftype (function (fixnum) t) next) (special *str* *end*) (fixnum *start* *end*) (string *str*)) (let ((newpos pos)) (declare (fixnum newpos)) (loop while (and (< newpos *end*) (let ((chr (re-char *str* newpos))) (declare (character chr)) (or (char= chr1 chr) (char= chr2 chr)))) do (incf newpos)) (loop while (>= newpos pos) do (progn (funcall next newpos) (decf newpos))) nil)) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-not-cclass-2-greedy-kleene-closure (chr1 chr2 next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (character chr1 chr2) (ftype (function (fixnum) t) next) (special *str* *end*) (fixnum *start* *end*) (string *str*)) (let ((newpos pos)) (declare (fixnum newpos)) (loop while (and (< newpos *end*) (not (let ((chr (re-char *str* newpos))) (declare (character chr)) (or (char= chr1 chr) (char= chr2 chr))))) do (incf newpos)) (loop while (>= newpos pos) do (progn (funcall next newpos) (decf newpos))) nil)) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-cclass-greedy-kleene-closure (chrs next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (simple-string chrs) (ftype (function (fixnum) t) next) (special *str* *end*) (fixnum *end*) (string *str*)) (let ((newpos pos)) (declare (fixnum newpos)) (loop while (and (< newpos *end*) (find (re-char *str* newpos) chrs)) do (incf newpos)) (loop while (>= newpos pos) do (progn (funcall next newpos) (decf newpos))) nil)) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-not-cclass-greedy-kleene-closure (chrs next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (simple-string chrs) (ftype (function (fixnum) t) next) (special *str* *end*) (fixnum *end*) (string *str*)) (let ((newpos pos)) (declare (fixnum newpos)) (loop while (and (< newpos *end*) (not (find (re-char *str* newpos) chrs))) do (incf newpos)) (loop while (>= newpos pos) do (progn (funcall next newpos) (decf newpos))) nil)) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-specclass-greedy-kleene-closure (classfn next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (ftype (function (character) t) classfn) (ftype (function (fixnum) t) next) (special *str* *end*) (string *str*) (fixnum *end*)) (let ((newpos pos)) (declare (fixnum newpos)) (loop while (and (< newpos *end*) (funcall classfn (re-char *str* newpos))) do (incf newpos)) (loop while (>= newpos pos) do (progn (funcall next newpos) (decf newpos))) nil)) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-not-specclass-greedy-kleene-closure (classfn next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (ftype (function (character) t) classfn) (ftype (function (fixnum) t) next) (special *str* *end*) (string *str*) (fixnum *end*)) (let ((newpos pos)) (declare (fixnum newpos)) (loop while (and (< newpos *end*) (not (funcall classfn (re-char *str* newpos)))) do (incf newpos)) (loop while (>= newpos pos) do (progn (funcall next newpos) (decf newpos))) nil)) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-alt-2-simple-termcheck-1-closure (next1 next2 &aux (oldpos -1)) (make-nontext-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (ftype (function (fixnum) t) next1 next2) (fixnum oldpos) (special *str* *end*) (string *str*) (fixnum *end*)) (when (> pos oldpos) (setq oldpos pos) (funcall next1 pos)) (funcall next2 pos) (setq oldpos -1) nil) :initializer #'(lambda () (setq oldpos -1)) :linker #'(lambda (link-info) (setq next1 (resolve-instr link-info next1)) (setq next2 (resolve-instr link-info next2))) )) (defun gen-alt-2-simple-termcheck-2-closure (next1 next2 &aux (oldpos -1)) (make-nontext-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (ftype (function (fixnum) t) next1 next2) (fixnum oldpos)) (funcall next1 pos) (when (> pos oldpos) (setq oldpos pos) (funcall next2 pos)) (setq oldpos -1) nil) :initializer #'(lambda () (setq oldpos -1)) :linker #'(lambda (link-info) (setq next1 (resolve-instr link-info next1)) (setq next2 (resolve-instr link-info next2))) )) (defun gen-alt-2-full-termcheck-closure (next1 next2 &aux (firsttimep t) (seen1 (make-hash-table)) (seen2 (make-hash-table))) (make-nontext-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (fixnum firstpos) (ftype (function (fixnum) t) next1 next2)) (cond (firsttimep (setq firsttimep nil) (unless (gethash pos seen1) (setf (gethash pos seen1) t) (funcall next1 pos)) (unless (gethash pos seen2) (setf (gethash pos seen2) t) (funcall next2 pos)) (clrhash seen1) (clrhash seen2) (setq firsttimep t) nil) (t (unless (gethash pos seen1) (setf (gethash pos seen1) t) (funcall next1 pos)) (unless (gethash pos seen2) (setf (gethash pos seen2) t) (funcall next2 pos))))) :initializer #'(lambda () (setq firsttimep t)) :linker #'(lambda (link-info) (setq next1 (resolve-instr link-info next1)) (setq next2 (resolve-instr link-info next2))) )) (defun gen-alt-2-no-termcheck-closure (next1 next2) (make-nontext-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (ftype (function (fixnum) t) next1 next2)) (funcall next1 pos) (funcall next2 pos)) :initializer nil :linker #'(lambda (link-info) (setq next1 (resolve-instr link-info next1)) (setq next2 (resolve-instr link-info next2))) )) (defun gen-alt-no-termcheck-closure (num-nexts nexts) (make-nontext-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos num-nexts)) (dotimes (i num-nexts) (funcall (svref nexts i) pos))) :initializer nil :linker #'(lambda (link-info) (dotimes (i num-nexts nil) (declare (fixnum i)) (setf (svref nexts i) (resolve-instr link-info (svref nexts i))))) )) (defun gen-casealt-closure (num-jump-entries jmptbl) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (special *str* *end*) (string *str*) (fixnum *end*)) (when (< pos *end*) (let ((chr (re-char *str* pos))) (declare (character chr)) (loop for discriminator-idx fixnum from 0 by 2 below (* 2 num-jump-entries) when (char= chr (svref jmptbl discriminator-idx)) do (funcall (svref jmptbl (1+ discriminator-idx)) pos) finally (return nil))))) :initializer nil :linker #'(lambda (link-info) (loop for destination-idx from 1 by 2 below (* 2 num-jump-entries) do (setf (svref jmptbl destination-idx) (resolve-instr link-info (svref jmptbl destination-idx))))) )) (defun gen-startanchor-closure (next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (ftype (function (fixnum) t) next) (special *str* *start* *start-is-anchor*) (string *str*) (fixnum *start*)) (if (or (and *start-is-anchor* (= pos *start*)) (and (> pos 0) (char= (re-char *str* (1- pos)) #\newline))) (funcall next pos))) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-endanchor-closure (next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (ftype (function (fixnum) t) next) (special *str* *end* *end-is-anchor*) (fixnum *end*) (string *str*)) (if (or (and *end-is-anchor* (= pos *end*)) (and (< pos *end*) (char= (re-char *str* pos) #\newline))) (funcall next pos))) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) ;(defun gen-left-rstart-closure (regnum next) ; (make-nontext-closure ; :matcher ; #'(lambda (pos) ; #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) ; #+:lispworks (hcl:fixnum-safety 0))) ; (declare (fixnum pos) ; (fixnum regnum) ; (ftype (function (fixnum) t) next) ; (special *regs*) ; (type simple-vector *regs*)) ; (let ((reg (svref *regs* regnum))) ; (declare (cons reg)) ; (setf (car reg) pos (cdr reg) nil) ; (funcall next pos) ; (setf (car reg) nil) ; nil)) ; :initializer ; nil ; :linker ; #'(lambda (link-info) ; (setq next (resolve-instr link-info next))) )) (defun gen-left-rstart-closure (regnum next) (make-nontext-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (fixnum regnum) (ftype (function (fixnum) t) next) (special *regs*) (type simple-vector *regs*)) (let ((reg (svref *regs* regnum))) (declare (cons reg)) (cond ((and (car reg) (cdr reg)) (funcall next pos)) (t (setf (car reg) pos (cdr reg) nil) (funcall next pos) (setf (car reg) nil) nil)))) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-right-rstart-closure (regnum next) (make-nontext-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (fixnum regnum) (ftype (function (fixnum) t) next) (special *regs*) (type simple-vector *regs*)) (let ((reg (svref *regs* regnum))) (declare (cons reg)) (let ((prevstart (car reg)) (prevend (cdr reg))) (setf (car reg) pos (cdr reg) nil) (funcall next pos) (setf (car reg) prevstart (cdr reg) prevend))) nil) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-rend-closure (regnum next) (make-nontext-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (fixnum regnum) (ftype (function (fixnum) t) next) (special *regs*) (type simple-vector *regs*)) (cond ((register-end *regs* regnum) (funcall next pos)) (t (setf (register-end *regs* regnum) pos) (funcall next pos) (setf (register-end *regs* regnum) nil)))) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-cclass-2-closure (chr1 chr2 next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (character chr1 chr2) (ftype (function (fixnum) t) next-fn) (special *str* *end*) (string *str*) (fixnum *end*)) (if (< pos *end*) (let ((chr (re-char *str* pos))) (declare (character chr)) (if (or (char= chr chr1) (char= chr chr2)) (funcall next (1+ pos)))))) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-cclass-closure (chrs next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (simple-string chrs) (ftype (function (fixnum) t) next) (special *str* *end*) (string *str*) (fixnum *end*)) (if (and (< pos *end*) (find (re-char *str* pos) chrs)) (funcall next (1+ pos)))) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-not-char-closure (chr next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (character chr) (ftype (function (fixnum) t) next) (special *str* *end*) (string *str*) (fixnum *end*)) (if (and (< pos *end*) (char/= (re-char *str* pos) chr)) (funcall next (1+ pos)))) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-not-cclass-2-closure (chr1 chr2 next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (character chr1 chr2) (ftype (function (fixnum) t) next) (special *str* *end*) (string *str*) (fixnum *end*)) (if (< pos *end*) (let ((chr (re-char *str* pos))) (declare (character chr)) (if (not (or (char= chr chr1) (char= chr chr2))) (funcall next (1+ pos)))))) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-not-cclass-closure (chrs next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (simple-string chrs) (ftype (function (fixnum) t) next) (special *str* *end*) (string *str*) (fixnum *end*)) (if (and (< pos *end*) (not (find (re-char *str* pos) chrs))) (funcall next (1+ pos)))) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-specclass-closure (classfn next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (ftype (function (character) t) classfn) (ftype (function (fixnum) t) next) (special *str* *end*) (string *str*) (fixnum *end*)) (if (and (< pos *end*) (funcall classfn (re-char *str* pos))) (funcall next (1+ pos)))) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-not-specclass-closure (classfn next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (ftype (function (character) t) classfn) (ftype (function (fixnum) t) next) (special *str* *end*) (string *str*) (fixnum *end*)) (if (and (< pos *end*) (not (funcall classfn (re-char *str* pos)))) (funcall next (1+ pos)))) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-any-closure (next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (ftype (function (fixnum) t) next) (special *str* *end*) (fixnum pos *start* *end*) (string *str*) (type simple-vector *regs*)) (if (and (< pos *end*) (or *dot-matches-newline* (char/= (re-char *str* pos) #\newline))) (funcall next (1+ pos)))) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-startword-closure (next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (ftype (function (fixnum) t) next) (special *str* *end* *start*) (fixnum pos *start* *end* *start*) (string *str*) (type simple-vector *regs*)) ;; at start-of-word if previous char is nonword (or ;; start-of-str) and this char is word (if (and (or (= pos *start*) (and (> pos *start*) (not (wordcharp (re-char *str* (1- pos)))))) (< pos *end*) (wordcharp (re-char *str* pos))) (funcall next pos))) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-endword-closure (next) (make-text-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (fixnum pos) (ftype (function (fixnum) t) next) (special *str* *end* *start*) (fixnum pos *start* *end* *start*) (string *str*) (type simple-vector *regs*)) ;; at end-of-word if previous char is word and this char is ;; nonword or end-of-string (if (and (> pos *start*) (wordcharp (re-char *str* (1- pos))) (or (= pos *end*) (and (< pos *end*) (not (wordcharp (re-char *str* pos)))))) (funcall next pos))) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-hook-closure (hookfn next) (make-nontext-closure :matcher #'(lambda (pos) (declare (fixnum pos) (ftype (function (fixnum) t) next) (special *end* *hooks*) (fixnum *end*)) (let ((rc (cond ((integerp hookfn) (funcall (aref *hooks* hookfn) pos)) ((or (functionp hookfn) (symbolp hookfn)) (funcall hookfn pos)) (t nil)))) (cond ;; user hook returned an integer --> new matching position ((integerp rc) (funcall next rc)) ;; user hook returned t --> continue matching at pos (rc (funcall next pos)) ;; user hook returned nil --> fail (t nil)))) :initializer nil :linker #'(lambda (link-info) (setq next (resolve-instr link-info next))) )) (defun gen-lookahead-closure (childfn next) (make-nontext-closure :matcher #'(lambda (pos) (declare (fixnum pos) (ftype (function (fixnum) t) next) (special *end* *hooks*) (fixnum *end*)) (let ((*acceptfn* nil)) (declare (special *acceptfn*)) (if (catch 'cease-matching (funcall childfn pos)) (funcall next pos) nil))) :initializer nil :linker #'(lambda (link-info) (setq childfn (resolve-instr link-info childfn)) (setq next (resolve-instr link-info next))) )) (defun gen-nlookahead-closure (childfn next) (make-nontext-closure :matcher #'(lambda (pos) (declare (fixnum pos) (ftype (function (fixnum) t) next) (special *end* *hooks*) (fixnum *end*)) (let ((*acceptfn* nil)) (declare (special *acceptfn*)) (if (not (catch 'cease-matching (funcall childfn pos))) (funcall next pos) nil))) :initializer nil :linker #'(lambda (link-info) (setq childfn (resolve-instr link-info childfn)) (setq next (resolve-instr link-info next))) )) (defun gen-success-closure (rc) (make-nontext-closure :matcher #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (declare (ignore pos) (special *regs* *acceptfn*) (type simple-vector *regs*)) (let* ((match-start (register-start *regs* 0)) (match-end (register-end *regs* 0)) (valid-acceptfn-p (or (functionp *acceptfn*) (and (symbolp *acceptfn*) (fboundp *acceptfn*) (functionp (symbol-function *acceptfn*))))) (really-succeeded-p (if valid-acceptfn-p (funcall *acceptfn* match-start match-end) t))) (declare (fixnum match-start match-end)) (when really-succeeded-p (throw 'cease-matching (values rc match-start (- match-end match-start) *regs*))))) :initializer nil :linker nil )) (defun make-init-closure (reset-fns next) #'(lambda (pos) #-:debug-regex(declare (optimize (speed 3) (safety 0) (space 0) (debug 0) #+:lispworks (hcl:fixnum-safety 0))) (dolist (reset-fn reset-fns) (funcall reset-fn)) (funcall next pos)) ) ;;; ;;; Special pattern support ;;; (defun get-spec-pat-fxn (patclass) "Return the function to test for the special pattern?" (case patclass (alpha #'alpha-char-p) (upper #'upper-case-p) (lower #'lower-case-p) (digit #'digit-char-p) (alnum #'alphanumericp) (xdigit #'xdigitp) (odigit #'odigitp) (punct #'punctp) (space #'spacep) (t (error "Couldn't find special class ~S" patclass)))) (defun xdigitp (ch) "Is this character a hexidecimal digit?" (or (digit-char-p ch) (find (char-upcase ch) "ABCDEF"))) (defun odigitp (ch) "Is this character an octal digit?" (find ch "01234567")) (defun punctp (ch) "Is this character a punctuation mark?" (find ch "!.,;:'\"?`") ) (defun spacep (ch) "Is this character some type of whitespace?" (or (char= ch #\tab) (char= ch #\Space) (char= ch #\newline) (char= ch #\return))) (defun wordcharp (ch) (or (alphanumericp ch) (char= ch #\_))) cl-regex-1/retest.lisp0100755000175000001440000001654707542740006013647 0ustar mrdusers;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: REGEX-TEST; Base: 10 -*- (in-package :REGEX-TEST) ;;; ;;; testing... ;;; (defun test (patstr candstr shouldmatchp &optional (verbosep nil)) (when verbosep (format t "~%Testing pattern ~S against string ~S" patstr candstr) (format t "~%Compiling...")) (let ((matcher (compile-str patstr))) (unless matcher (format t "~%Error compiling pattern") (return-from test nil)) (multiple-value-bind (matchedp start len regs) (match-str matcher candstr) (format t "~%matched=~A start=~A len=~A regs=~A" matchedp start len regs) (when verbosep (cond ((and shouldmatchp (not matchedp)) (format t "~%***** Error: Should have matched, but didn't *****")) ((and (not shouldmatchp) matchedp) (format t "~%***** Error: Shouldn't have matched, but did *****")) (matchedp (format t "~%Success: Matched")) (t (format t "~%Success: Didn't match")))) (when matchedp (dotimes (i (array-dimension regs 0)) (let ((start (register-start regs i)) (end (register-end regs i))) (format t "~%REG ~D start=~D end=~D" i start end) (when (register-matched-p regs i) (format t " substr = \"") (loop for j from start below end do (princ (char candstr j))) (format t "\"")))))))) (defun coveragetest () (test "AB" "AB" t t) (test "A*" "" t t) (test "A*" "A" t t) (test "A*" "AA" t t) (test "A+" "" nil t) (test "A+" "A" t t) (test "A+" "AA" t t) ;; test '.' and '?' (test ".BC" "ABC" t t) (test ".BC" "BC" nil t) (test "A?BC" "ABC" t t) (test "A?BC" "BC" t t) ;; test alternation (test "A|B" "A" t t) (test "(A)|(B)" "B" t t) ;; more complicated test (test "((A*B)|(AC))D" "BD" t t) (test "((A*B)|(AC))D" "ABD" t t) (test "((A*B)|(AC))D" "AABD" t t) (test "((A*B)|(AC))D" "AAABD" t t) (test "((A*B)|(AC))D" "AAABC" nil t) (test "((A*B)|(AC))D" "ACD" t t) (test "(ABC)*DEF" "ABCABCABCDEF" t t) ;; test character patterns and anchors (test "[a-z][0-9][z-a][9-0]" "a0a0" t t) (test "[a-z][0-9][z-a][9-0]" "A0A0" nil t) (test "[^a-z][0-9]" "A0" t t) (test "[^a-z][0-9]" "a0" nil t) (test "^[abcdefg]*$" "abcdefg" t t) (test "^[abcdefg]*$" "abcdefgh" nil t) (test "^[abcdefg]*$" "ABCDEFG" nil t) ;; test special character patterns (test "[:lower:][:digit:][:upper:][:xdigit:]" "a0A0" t t) (test "[:lower:][:digit:][:upper:][:xdigit:]" "a0Aa" t t) (test "[:lower:][:digit:][:upper:][:xdigit:]" "a0AA" t t) (test "[:lower:][:digit:][:upper:][:xdigit:]" "a0Af" t t) (test "[:lower:][:digit:][:upper:][:xdigit:]" "a0AF" t t) ;; test compiler errors (format t "~%~%All of the following should generate compiler errors!") (test "(abc" "(abc" nil t) (test "(abc" "abc" nil t) (test "abc)def" "abc)def" nil t) (test "abc)def" "abc" nil t) (test "[abc" "[abc" nil t) (test "[abc" "abc" nil t) ;; Unlike the C++ parser, this one treats unattached ] as a normal character ;; (test "abc]def" "abc]def" nil t) ;; (test "abc]def" "abc" nil t) (test "[:digit]*" "012345" nil t) ) (defun respeedtest-closure (numreps patstr candstr) (let* ((matcher (compile-str patstr)) (regs (make-regs (matcher-numregs matcher))) (matchedp nil)) (when (null matcher) (format t "Error compiling pattern ~A" patstr) (return-from respeedtest-closure nil)) (format t "~%~%Timing ~S (closure)" patstr) (let ((starttime (get-internal-run-time))) (dotimes (rep numreps) (setq matchedp (match-str matcher candstr :regs regs))) (let* ((endtime (get-internal-run-time)) (elapsed (- endtime starttime))) (format t "~%~T: ~D secs, ~D/sec, ~S --> ~S~%" (round (/ elapsed internal-time-units-per-second)) (round (/ numreps (/ elapsed internal-time-units-per-second))) patstr candstr))) (when (not matchedp) (format t "~%Didn't match")))) (defun respeedtest-comp (numreps patstr matcher candstr) (let* ((regs (make-regs (matcher-numregs matcher))) (matchedp nil)) (when (null matcher) (format t "Error compiling pattern ~A" patstr) (return-from respeedtest-comp nil)) (format t "~%~%Timing ~S (compiled)" patstr) (let ((starttime (get-internal-run-time))) (dotimes (rep numreps) (setq matchedp (match-str matcher candstr :regs regs))) (let* ((endtime (get-internal-run-time)) (elapsed (- endtime starttime))) (format t "~%~T: ~D secs, ~D/sec, ~S --> ~S~%" (round (/ elapsed internal-time-units-per-second)) (round (/ numreps (/ elapsed internal-time-units-per-second))) patstr candstr))) (when (not matchedp) (format t "~%Didn't match")))) (defun strcmpspeedtest (numreps patstr candstr compname compfxn) (format t "~%~%Timing ~S ~S" compname patstr) (let ((matchedp nil) (starttime (get-internal-run-time))) (dotimes (rep numreps) (setq matchedp (funcall compfxn patstr candstr))) (let* ((endtime (get-internal-run-time)) (elapsed (- endtime starttime))) (format t "~%~T: ~D secs, ~D/sec, ~S --> ~S~%" (round (/ elapsed internal-time-units-per-second)) (round (/ numreps (/ elapsed internal-time-units-per-second))) patstr candstr)) (when (not matchedp) (format t "~%Didn't match")))) ;;; ;;; Speeds are under Win NT, P3-600mhz. GNU speeds are w/ MSVC6.0. ;;; (defregex test1 "A*BD") (defregex test2 "(?A|A)*BD") (defregex test3 "(A|A)*BD") (defregex test4 "(A|B)*BD") (defregex test5 "(B|A)*BD") (defregex test6 "((A*B)|(AC))D") (defregex test7 "((A*B)|(A*C))D") (defregex test8 "[Aa]*[Bb][Dd]") (defun speedtest () (let ((numreps #-:Genera 1000000 #+:Genera 250000) (candstr "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABD")) (regex::clear-pattern-cache) ;; CLAWK: 9 secs; MSVC/GNU: 53 secs (respeedtest-closure numreps "A*BD" candstr) ;; CLAWK: 10 secs; MSVC/GNU: No known equivalent (respeedtest-closure numreps "(?A|A)*BD" candstr) ;; CLAWK: 10 secs; MSVC/GNU: 171 secs (respeedtest-closure numreps "(A|A)*BD" candstr) ;; CLAWK: 34 secs; MSVC/GNU: 176 secs (respeedtest-closure numreps "(A|B)*BD" candstr) ;; CLAWK: 55 secs; MSVC/GNU: 178 secs (respeedtest-closure numreps "(B|A)*BD" candstr) ;; CLAWK: 10 secs; MSVC/GNU: 71 secs (respeedtest-closure numreps "((A*B)|(AC))D" candstr) ;; CLAWK: 11 secs; MSVC/GNU: 72 secs (respeedtest-closure numreps "((A*B)|(A*C))D" candstr) ;; CLAWK: 9 secs; MSVC/GNU: 63 secs (respeedtest-closure numreps "[Aa]*[Bb][Dd]" candstr) ;; LWW: 27 secs; MSVC/MSVC: 1 secs (strcmpspeedtest numreps candstr candstr "string=" #'string=) ;; LWW: 65 secs; MSVC/MSVC: 2 secs (strcmpspeedtest numreps candstr candstr "string-equal" #'string-equal) )) (defun run-tests () (format t "~%Starting coverage test~%") (coveragetest) (format t "~%Starting Sebastien's coverage test~%") (run-sebastien-tests) (format t "~%Starting speed test~%") (speedtest) (format t "~%Done~%") ) cl-regex-1/regexp-test-suite.lisp0100755000175000001440000007430007445552646015743 0ustar mrdusers;; ***************************************************************************** ;; FILE ;; Name : regexp-test-suite.cl ;; Date : 2002-03-01 ;; Author : Sébastien SAINT-SEVIN ;; Purpose : testing module for regular expressions ;; ;; Modified by KMP to run under both rightmost and leftmost matches. ;; ----------------------------------------------------------------------------- (in-package :REGEX-TEST) (allow-nonregister-groups) (allow-nongreedy-quantifiers) (allow-rangematch) (allow-backmatch) ;; rightmost matches are more canonical, but much slower. ;(registers-match-rightmost t) ;(pushnew :regex-right *features*) ;; leftmost matches are usually a lot faster with this engine. (registers-match-rightmost nil) (setq *features* (remove :regex-right *features*)) (defparameter *regexp-tests* '( ;; ********************************************************************* ;; ::= ( ) ;; ::= ( *) ;; --------------------------------------------------------------------- ;; ******************************************************* ;; the tests that follows are from: ;; ------------------------------------------------------- ;; (c) Sudhir Shenoy, 1996 ;; ;; The tests here are from: ;; ;; (a) Tom Lord's GNU rx package ;; (b) from the Zebu parser generator package ;; (modified to use new syntax) ;; ------------------------------------------------------- ;; All have been slightly modified to follow the syntax ;; I use in this module - Sébastien Saint-Sevin, 2002 ;; ------------------------------------------------------- ("a*a*" "aaaaaa" t t ("aaaaaa")) ("a*a*a*" "aaaaaa" t t ("aaaaaa")) ("a*a*a*a*" "aaaaaa" t t ("aaaaaa")) ("a*a*a*a*a*" "aaaaaa" t t ("aaaaaa")) ("a*a*a*a*a*a*" "aaaaaa" t t ("aaaaaa")) ("a*a*a*a*a*a*a*" "aaaaaa" t t ("aaaaaa")) ("" "" nil nil ()) ("b{0,6}" "" t t ("")) ("ab{0,0}c" "abc" t nil ()) ("ab{1,1}c" "abbc" t nil ()) ("ab{3,7}c" "abbbbbbbbc" t nil ()) ("ab{3,7}c" "abbbbbbbbbc" t nil ()) ("ab{3,7}c" "abbbbbbbbbbc" t nil ()) ("ab{3,7}c" "abbbbbbbbbbbc" t nil ()) ("b{2,7}" "bb" t t ("bb")) ("b{1,6}" "" t nil ()) ("b{1,6}" "b" t t ("b")) ("b{2,7}" "b" t nil ()) ("ab{0,7}c" "ac" t t ("ac")) ("ab{1,7}c" "abc" t t ("abc")) ("ab{2,7}c" "abbc" t t ("abbc")) ("ab{3,7}c" "abbbc" t t ("abbbc")) ("ab{3,7}c" "abbbbc" t t ("abbbbc")) ("ab{3,7}c" "abbbbbc" t t ("abbbbbc")) ("ab{3,7}c" "abbbbbbc" t t ("abbbbbbc")) ("ab{3,7}c" "abbbbbbbc" t t ("abbbbbbbc")) ("ab{3,7}c" "abbbbbbbbc" t nil ()) ("ab{3,7}c" "abbc" t nil ()) ("ab{3,7}c" "abc" t nil ()) ("(a|b)*c|(a|ab)*c" "xc" t t ("c" "" "")) ("(a)*" "b" t t ("" "")) ("(..)*(...)*" "a" t t ("" "" "")) ;;the following fails coz sshenoy's engine is a posix NFA ;;("(..)*(...)*" "abc" t t ("abc" "" "abc")) ("(..)*(...)*" "abc" t t ("ab" "ab" "")) ("^" "" t t ("")) ("$" "" t t ("")) ("^$" "" t t ("")) ("^a$" "a" t t ("a")) ("abc" "abc" t t ("abc")) ("abc" "xbc" t nil ()) ("abc" "axc" t nil ()) ("abc" "abx" t nil ()) ("abc" "xabcy" t t ("abc")) ("abc" "ababc" t t ("abc")) ("ab*c" "abc" t t ("abc")) ("ab*bc" "abc" t t ("abc")) ("ab*bc" "abbc" t t ("abbc")) ("ab*bc" "abbbbc" t t ("abbbbc")) ("ab+bc" "abbc" t t ("abbc")) ("ab+bc" "abc" t nil ()) ("ab+bc" "abq" t nil ()) ("ab+bc" "abbbbc" t t ("abbbbc")) ("ab?bc" "abbc" t t ("abbc")) ("ab?bc" "abc" t t ("abc")) ("ab?bc" "abbbbc" t nil ()) ("ab?c" "abc" t t ("abc")) ("^abc$" "abc" t t ("abc")) ("^abc$" "abcc" t nil ()) ("^abc" "abcc" t t ("abc")) ("^abc$" "aabc" t nil ()) ("abc$" "aabc" t t ("abc")) ("^" "abc" t t ("")) ("$" "abc" t t ("")) ("a.c" "abc" t t ("abc")) ("a.c" "axc" t t ("axc")) ("a.*c" "axyzc" t t ("axyzc")) ("a.*c" "axyzd" t nil ()) ("a[bc]d" "abc" t nil ()) ("a[bc]d" "abd" t t ("abd")) ("a[b-d]e" "abd" t nil ()) ("a[b-d]e" "ace" t t ("ace")) ("a[b-d]" "aac" t t ("ac")) ("a[-b]" "a-" t t ("a-")) ("a[b-]" "a-" t t ("a-")) ;;*** following is supposed to compile but what should it match ? ;;*** I don't know and that is why I reject the pattern. ;("a[b-a]" "-" t NIL NIL) ;("a[]b" "-" NIL NIL NIL) ;("a[" "-" NIL NIL NIL) ;("a]" "a]" t "a]" NIL) ;("a[]]b" "a]b" t "a]b" NIL) ("a[^bc]d" "aed" t t ("aed")) ("a[^bc]d" "abd" t nil ()) ("a[^-b]c" "adc" t t ("adc")) ("a[^-b]c" "a-c" t nil ()) ("a[^\\]b]c" "a]c" t nil ()) ("a[^\\]b]c" "adc" t t ("adc")) ("ab|cd" "abc" t t ("ab")) ("ab|cd" "abcd" t t ("ab")) ;;FAILED ("()ef" "def" t t ("ef" "")) ;;FAILED ("()*" "-" t t ("" "")) ;;FAILED ("*a" "-" t t ("")) ("^*" "-" t t ("")) ("$*" "-" t t ("")) ;;FAILED ("(*)b" "-" t t ("" "")) ("$b" "b" t nil ()) ("a\\(b" "a(b" t t ("a(b")) ("a\\(*b" "ab" t t ("ab")) ("a\\(*b" "a((b" t t ("a((b")) ("a\\\\b" "a\\b" t t ("a\\b")) ("(abc" "-" nil nil ()) ("((a))" "abc" t t ("a" "a" "a")) ("(a)b(c)" "abc" t t ("abc" "a" "c")) ("a+b+c" "aabbabc" t t ("abc")) ("a**" "-" t t ("")) ("a*?" "-" t t ("")) ("(a*)*" "-" t t ("" "")) ("(a*)+" "-" t t ("" "")) ("(a|)*" "-" t t ("" "")) ("(a*|b)*" "-" t t ("" "")) #+:regex-right("(a+|b)*" "ab" t t ("ab" "b")) #-:regex-right("(a+|b)*" "ab" t t ("ab" "a")) #+:regex-right("(a+|b)+" "ab" t t ("ab" "b")) #-:regex-right("(a+|b)+" "ab" t t ("ab" "a")) ("(a+|b)?" "ab" t t ("a" "a")) ("[^ab]*" "cde" t t ("cde")) ("(^)*" "-" t t ("" "")) ("(ab|)*" "-" t t ("" "")) (")(" "-" nil nil ()) ("" "abc" nil nil ()) ("abc" "" t nil ()) ("a*" "" t t ("")) #+:regex-right("([abc])*d" "abbbcd" t t ("abbbcd" "c")) #-:regex-right("([abc])*d" "abbbcd" t t ("abbbcd" "a")) ("([abc])*bcd" "abcd" t t ("abcd" "a")) ("a|b|c|d|e" "e" t t ("e")) ("(a|b|c|d|e)f" "ef" t t ("ef" "e")) ("((a*|b))*" "-" t t ("" "" "")) ("abcd*efg" "abcdefg" t t ("abcdefg")) ("ab*" "xabyabbbz" t t ("ab")) ("ab*" "xayabbbz" t t ("a")) ("(ab|cd)e" "abcde" t t ("cde" "cd")) ("[abhgefdc]ij" "hij" t t ("hij")) ("^(ab|cd)e" "abcde" t nil ()) ("(abc|)ef" "abcdef" t t ("ef" "")) ("(a|b)c*d" "abcd" t t ("bcd" "b")) ("(ab|ab*)bc" "abc" t t ("abc" "a")) ("a([bc]*)c*" "abc" t t ("abc" "bc")) ("a([bc]*)(c*d)" "abcd" t t ("abcd" "bc" "d")) ("a([bc]+)(c*d)" "abcd" t t ("abcd" "bc" "d")) ("a([bc]*)(c+d)" "abcd" t t ("abcd" "b" "cd")) ("a[bcd]*dcdcde" "adcdcde" t t ("adcdcde")) ("a[bcd]+dcdcde" "adcdcde" t nil ()) ("(ab|a)b*c" "abc" t t ("abc" "ab")) ("((a)(b)c)(d)" "abcd" t t ("abcd" "abc" "a" "b" "d")) ("[a-zA-Z_][a-zA-Z0-9_]*" "alpha" t t ("alpha")) ("^a(bc+|b[eh])g|.h$" "abh" t t ("bh" "")) ("(bc+d$|ef*g.|h?i(j|k))" "effgz" t t ("effgz" "effgz" "")) ("(bc+d$|ef*g.|h?i(j|k))" "ij" t t ("ij" "ij" "j")) ("(bc+d$|ef*g.|h?i(j|k))" "effg" t nil ()) ("(bc+d$|ef*g.|h?i(j|k))" "bcdd" t nil ()) ("(bc+d$|ef*g.|h?i(j|k))" "reffgz" t t ("effgz" "effgz" "")) ("((((((((((a))))))))))" "a" t t ("a" "a" "a" "a" "a" "a" "a" "a" "a" "a" "a")) ("(((((((((a)))))))))" "a" t t ("a" "a" "a" "a" "a" "a" "a" "a" "a" "a")) ("multiple words of text" "uh-uh" t nil ()) ("multiple words" "multiple words, yeah" t t ("multiple words")) ("(.*)c(.*)" "abcde" t t ("abcde" "ab" "de")) ("\\((.*), (.*)\\)" "(a, b)" t t ("(a, b)" "a" "b")) ("[k]" "ab" t nil ()) ("abcd" "abcd" t t ("abcd")) ("a(bc)d" "abcd" t t ("abcd" "bc")) ("a[-]?c" "ac" t t ("ac")) ("a[-]?c" "ac" t t ("ac")) ("a[-]?c" "ac" t t ("ac")) ("[ -~]*" "abc" t t ("abc")) ("[ -~ -~]*" "abc" t t ("abc")) ("[ -~ -~ -~]*" "abc" t t ("abc")) ("[ -~ -~ -~ -~]*" "abc" t t ("abc")) ("[ -~ -~ -~ -~ -~]*" "abc" t t ("abc")) ("[ -~ -~ -~ -~ -~ -~]*" "abc" t t ("abc")) ("[ -~ -~ -~ -~ -~ -~ -~]*" "abc" t t ("abc")) ;; ;; Tests from from the Zebu package (originally for nregex.lisp) ;; ("(na)x+" "naxna" t t ("nax" "na")) ("(na)x+na" "naxna123" t t ("naxna" "na")) ("(na)x+" "naxxos" t t ("naxx" "na")) ("(na)x+" "naxos" t t ("nax" "na")) ("(na)x+" "naos" t nil ()) ("(na)x*" "naxxos" t t ("naxx" "na")) ("(na)x*" "naxos" t t ("nax" "na")) ("(na)x*" "naos" t t ("na" "na")) ("[0-9]+" "123ab" t t ("123")) ("[a-zA-Z]+" "aAbb123" t t ("aAbb")) ("[0-9a-z]+" "1234&&*" t t ("1234")) ("[0-9a-z]+" "1234a&&*" t t ("1234a")) ("[0-9a-zA-Z]+" "a1234a" t t ("a1234a")) ("[0-9a-zA-Z&]+" "aAbb123&&*" t t ("aAbb123&&")) ("[0-9]+\\.[0-9]*" "0.123cm" t t ("0.123")) ; ("{[^} ;]*}" "{M.D. Harrison and A. Monk (Ed.)} \n\t foo: 2" ;T "{M.D. Harrison and A. Monk (Ed.)}" NIL) ; ("{[^} ;]*}" "{M.D. Harrison and ;A. Monk (Ed.)} \n\t foo: 2" t NIL NIL) ; ("{[^} ;]*}" ; "{M.D. Harrison and {A. Monk} (Ed.)} \n\t foo: 2" ; t "{M.D. Harrison and {A. Monk}" NIL) ("ca?r" "car" t t ("car")) ("ca?r" "cr" t t ("cr")) ("c[ad]+r" "caaar" t t ("caaar")) ("c[ad]+r" "caaar aa1" t t ("caaar")) ("c[ad]+r$" "caaar" t t ("caaar")) (".*" "" t t ("")) (".*" "aa" t t ("aa")) ("c[ad]?r" "cr" t t ("cr")) ("c[ad]?r" "car" t t ("car")) ("c[ad]?r" "cdr" t t ("cdr")) ("c[0-9]?r" "cr" t t ("cr")) ("c[0-9]?r" "c9rxx" t t ("c9r")) ("c[0-9]?r" "crxx" t t ("cr")) ("a|b" "a" t t ("a")) ("ab.yz" "ab yz" t t ("ab yz")) ;("ab.yz" "ab ;yz" t t ("ab ;yz")) ("(abc){1,2}" "abcabc" t t ("abcabc" "abc")) ;("(abc){1,2}x*(def)y*def" "abcabcxxxxdefyyyyyyydef$%%%%%" ; t "abcabcxxxxdefyyyyyyydef" #("abc" "def")) ("a|bc*" "a" t t ("a")) ("[A-Z]+" "ABCY" t t ("ABCY")) ("[0-9]+\\.[0-9]*(e[+-]?[0-9]+)" "12.3e4 k" t t ("12.3e4" "e4")) ("[0-9]+\\.[0-9]*(e[+-]?[0-9]+)" "12.3e-4 k" t t ("12.3e-4" "e-4")) ("[0-9]+\\.[0-9]*(e[+-]?[0-9]+)?" "12.3 k" t t ("12.3" "")) ;; ;; The Gadaffi tests ;; Note that the first group matches NULL because it is always sucked ;; up by the preceding .* in case of a successful match. ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Muammar Qaddafi" t t ("Muammar Qaddafi" "" "dd")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Mo'ammar Gadhafi" t t ("Mo'ammar Gadhafi" "" "dh")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Muammar Kaddafi" t t ("Muammar Kaddafi" "" "dd")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Muammar Qadhafi" t t ("Muammar Qadhafi" "" "dh")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Moammar El Kadhafi" t t ("Moammar El Kadhafi" "" "dh")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Muammar Gadafi" t t ("Muammar Gadafi" "" "d")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Mu'ammar al-Qadafi" t t ("Mu'ammar al-Qadafi" "" "d")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Moamer El Kazzafi" t t ("Moamer El Kazzafi" "" "zz")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Moamar al-Gaddafi" t t ("Moamar al-Gaddafi" "" "dd")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Mu'ammar Al Qathafi" t t ("Mu'ammar Al Qathafi" "" "th")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Muammar Al Qathafi" t t ("Muammar Al Qathafi" "" "th")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Mo'ammar el-Gadhafi" t t ("Mo'ammar el-Gadhafi" "" "dh")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Moamar El Kadhafi" t t ("Moamar El Kadhafi" "" "dh")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Muammar al-Qadhafi" t t ("Muammar al-Qadhafi" "" "dh")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Mu'ammar al-Qadhdhafi" t t ("Mu'ammar al-Qadhdhafi" "" "dh")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Mu'ammar Qadafi" t t ("Mu'ammar Qadafi" "" "d")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Moamar Gaddafi" t t ("Moamar Gaddafi" "" "dd")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Mu'ammar Qadhdhafi" t t ("Mu'ammar Qadhdhafi" "" "dh")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Muammar Khaddafi" t t ("Muammar Khaddafi" "" "dd")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Muammar al-Khaddafi" t t ("Muammar al-Khaddafi" "" "dd")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Mu'amar al-Kadafi" t t ("Mu'amar al-Kadafi" "" "d")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Muammar Ghaddafy" t t ("Muammar Ghaddafy" "" "dd")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Muammar Ghadafi" t t ("Muammar Ghadafi" "" "d")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Muammar Ghaddafi" t t ("Muammar Ghaddafi" "" "dd")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Muamar Kaddafi" t t ("Muamar Kaddafi" "" "dd")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Muammar Quathafi" t t ("Muammar Quathafi" "" "th")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Muammar Gheddafi" t t ("Muammar Gheddafi" "" "dd")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Muamar Al-Kaddafi" t t ("Muamar Al-Kaddafi" "" "dd")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Moammar Khadafy " t t ("Moammar Khadafy" "" "d")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Moammar Qudhafi" t t ("Moammar Qudhafi" "" "dh")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Mu'ammar al-Qaddafi" t t ("Mu'ammar al-Qaddafi" "" "dd")) ("M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]" "Mulazim Awwal Mu'ammar Muhammad Abu Minyar al-Qadhafi" t t ("Mu'ammar Muhammad Abu Minyar al-Qadhafi" "" "dh")) ;; ;; tests involving back-refs #+:regex-right("((a|b{10,11})|(b))*-\\2" "aaab-a" t t ("aaab-a" "b" "a" "b")) #-:regex-right("((a|b{10,11})|(b))*-\\2" "aaab-a" t t ("aaab-a" "a" "a" "b")) ("(a)*-\\1" "aaa-a" t t ("aaa-a" "a")) ("(a)*-\\1b" "aaa-b" t t ("-b" "")) ("([xyz])(-\\2)" "x-y" t nil ()) ("(([xyz])(-\\2))" "x-y" t nil ()) ("(([xyz])(-\\2)*)*" "x-y" t t ("x" "x" "x" "")) ("(([xyz])(-\\2)*)*" "x-" t t ("x" "x" "x" "")) #+:regex-right("(([xyz])(-\\2)*)*" "xy-yz-y" t t ("xy-yz" "z" "z" "-y")) ;; kmp haven't fixed this one yet... ;#-:regex-right("(([xyz])(-\\2)*)*" "xy-xz-x" t t ("xy-xz" "z" "z" "-y")) ;; kmp -- this *should* match ; ("((.*)\\1)+" "xxxxxx" t nil ()) ; for rightmost register match: #+:regex-right("((.*)\\1)+" "xxxxxx" t t ("xxxxxx" "" "")) ; for leftmost register match: #-:regex-right("((.*)\\1)+" "xxxxxx" t t ("xxxxxx" "xxxxxx" "xxx")) ("(a*)\\1\\1(a*)\\2\\2\\2" "aaaaaa" t t ("aaaaaa" "aa" "")) ("(a*)(a*)\\1\\2" "aaaa" t t ("aaaa" "aa" "")) ("(a*)\\1(a*)\\2\\2" "aaaa" t t ("aaaa" "aa" "")) ("(a*)\\1\\1(a*)" "aaaaaa" t t ("aaaaaa" "aa" "")) ("(a*)\\1\\1(a*)\\2" "aaaaaa" t t ("aaaaaa" "aa" "")) ("(a*)\\1\\1(a*)\\2\\2" "aaaaaa" t t ("aaaaaa" "aa" "")) ("(.*)\\1\\1(.*)\\2\\2\\2" "aaaaaa" t t ("aaaaaa" "aa" "")) ;;the following fails coz sshenoy's engine is a posix NFA ;("(.*)\\1\\1(.*)\\2\\2\\2" "aaaaaaa" t t ("aaaaaaa" "a" "a")) ("(.*)\\1\\1(.*)\\2\\2\\2" "aaaaaaa" t t ("aaaaaa" "aa" "")) ("(.*)\\1\\1(.*)\\2\\2\\2" "aaaaaa" t t ("aaaaaa" "aa" "")) ;;the following fails coz sshenoy's engine is a posix NFA ;("(.*)\\1\\1(.*)\\2\\2\\2" "aaaaa" t t ("aaaa" "" "a")) ("(.*)\\1\\1(.*)\\2\\2\\2" "aaaaa" t t ("aaa" "a" "")) ("(.*)\\1\\1" "aaa" t t ("aaa" "a")) #+:regex-right("(.*)*\\1" "xx" t t ("xx" "")) #-:regex-right("(.*)*\\1" "xx" t t ("xx" "x")) ("(....).*\\1" "beriberi" t t ("beriberi" "beri")) ; ;; ; ;; Some tests for class matches (my own) ; ;; ; ("[[:alpha:]_][[:alnum:]_]*" "c_identifier" t "c_identifier" NIL) ; ("[[:xdigit:]]*" "12aBcD89" t "12aBcD89" NIL) ; ;; In the following pattern, because :] is missing, the pattern is ; ;; interpreted as an ordinary range ; ("[[:xdigit]+" "0[x:dig" t "[x:dig" NIL))) ;; ******************************************************* ;; the tests that follows are from: ;; ------------------------------------------------------- ;; Sébastien Saint-Sevin, 2002 ;; ------------------------------------------------------- ;; some basics ;; ----------- (".*" "aa" t t ("aa")) (".+" "aa" t t ("aa")) ;; anchor ;; ------ ;; alternate ;; --------- ("(hello|man|)" "" t t ("" "")) ("(a+|b)" "aaa" t t ("aaa" "aaa")) ("(a+|b)" "b" t t ("b" "b")) ;; character classes ;; ----------------- ("[abc]{1,3}" "bcaa" t t ("bca")) ("a[\\-]?c" "ac" t t ("ac")) ("a[\\-]?c" "a-c" t t ("a-c")) ("a[-]?c" "ac" t t ("ac")) ("a[-]?c" "a-c" t t ("a-c")) ("a[-b]?c" "abc" t t ("abc")) ("a[b-]?c" "acc" t t ("ac")) ; "a[\\[]c" ; "a[\\^]c" ; "a[\\]]c" ; ("a[^\\-]?c" "ac" t t ("ac")) ; ("a[^\\-]?c" "a-c" t nil ()) ; ("a[^-]?c" "ac" t t ("ac")) ; ("a[^-]?c" "a-c" t t ("a-c")) ; ("a[^-b]?c" "abc" t t ("abc")) ; ("a[^b-]?c" "acc" t t ("ac")) ; "a[^\\[]c" ; "a[^\\^]c" ; "a[^\\]]c" ;; posix character classes ;; ----------------------- ;; greedy quantifiers ;; ------------------ ("a*" "aaaa" t t ("aaaa")) ("a+" "aaaa" t t ("aaaa")) ("a{2,3}" "aaaa" t t ("aaa")) ;; nongreedy quantifiers ;; --------------------- ("a*?" "aaaa" t t ("")) ("a+?" "aaaa" t t ("a")) ("a{2,3}?" "aaaa" t t ("aa")) ("a+?bb*?" "baaaabaaabbbaaaaa" t t ("aaaab")) ("a+?bb+?" "baaaabaaabbbaaaaa" t t ("aaabb")) ("[abc]{10,20}?" "xxxbcbcbabcaabcbabcbcbabcbcaabcabxxx" t t ("bcbcbabcaa")) ;; grouping ;; -------- ;; nonregister grouping ;; -------------------- ; "((?a+)|b)" ;; greedy quantifiers + backrefs ;; ----------------------------- ("^(x)+$" "xx" t t ("xx" "x")) ("^(x)+\\1$" "xx" t t ("xx" "x")) ("^(x){1,2}$" "xx" t t ("xx" "x")) ("^(x){1,2}\\1$" "xx" t t ("xx" "x")) ("^(x)+[^x]+\\1$" "xxaax" t t ("xxaax" "x")) ("^x*(x)[^x]+\\1$" "xxaax" t t ("xxaax" "x")) ("(x)+\\1" "xxxx" t t ("xxxx" "x")) ("(x){1,2}" "xxxx" t t ("xx" "x")) ;; kmp By the letter, (x) can only match one character. To get this ;; affect, the pattern should be "(x{1,2})\\1" ; ("(x){1,2}\\1" "xxxx" t t ("xxxx" "x")) ("(x){1,2}\\1" "xxxx" t t ("xxx" "x")) ("(x)+[^x]+\\1" "xxaax" t t ("xxaax" "x")) ("x*(x)[^x]+\\1" "xxaax" t t ("xxaax" "x")) ;; nongreedy quantifiers + backrefs ;; -------------------------------- ("(x)+?\\1" "xxxx" t t ("xx" "x")) ("(x){1,2}?" "xxxx" t t ("x" "x")) ("(x){1,2}?\\1" "xxxx" t t ("xx" "x")) ("(x)+?[^x]+\\1" "xxaax" t t ("xxaax" "x")) ("x*?(x)[^x]+\\1" "xxaax" t t ("xxaax" "x")) ;; misc ;; ---- ;; kmp it is legal for a* to match nothing ; ("(a*)*" "aaaa" t t ("aaaa" "aaaa")) #+:regex-right("(a*)*" "aaaa" t t ("aaaa" "")) #-:regex-right("(a*)*" "aaaa" t t ("aaaa" "aaaa")) ;; kmp it is legal for a* to match nothing ; ("(a*)+" "aaaa" t t ("aaaa" "aaaa")) #+:regex-right("(a*)+" "aaaa" t t ("aaaa" "")) #-:regex-right("(a*)+" "aaaa" t t ("aaaa" "aaaa")) ("(a+)*" "aaaa" t t ("aaaa" "aaaa")) ("(a+)*" "aaaa" t t ("aaaa" "aaaa")) )) ;; ***************************************************************************** ;; FUNCTION ;; Name : run-sebastien-tests ;; Date : 2002-03-08 ;; Author : 3S ;; Arguments : none ;; Side Effects : print testing results ;; Purpose : guess it ;; ----------------------------------------------------------------------------- (defun run-sebastien-tests () (regex::clear-pattern-cache) ;; kmp helps for when I'm debugging the compiler (print ";; *****************************************************************************") (print ";; BEGIN OF TEST") (print ";; -----------------------------------------------------------------------------") (dolist (test *regexp-tests*) (destructuring-bind (pattern str expected-compile-p expected-matched-p expected-results) test (format t "~%pattern: ~A ~%string: ~A" pattern str) (let ((matcher (compile-str pattern))) (cond ((and matcher (not expected-compile-p)) (format t "~%Shouldn't have compiled, but did ******************** TEST FAILED")) ((and (not matcher) expected-compile-p) (format t "~%Should have compiled, but didn't ******************** TEST FAILED")) ) (when matcher (multiple-value-bind (matched-p start len regs) (scan-str matcher str) (cond ((and expected-matched-p (not matched-p)) (format t "~%Should have matched, but didn't ******************** TEST FAILED")) ((and (not expected-matched-p) matched-p) (format t "~%Shouldn't have matched, but did ******************** TEST FAILED")) ) (when matched-p (if (string= (car expected-results) (subseq str start (+ start len))) (format t "~%Global match OK" ) (format t "~%Global match ******************** TEST FAILED") ) (let ((num-groups (array-dimension regs 0)) ) (if (/= (length expected-results) num-groups) (format t "~%Number of groups ******************** TEST FAILED") (dotimes (i num-groups) (let* ((group-start (register-start regs i)) (group-end (register-end regs i)) (expected-value (nth i expected-results)) (calculated-value (if (register-matched-p regs i) (subseq str group-start group-end) "")) ) (if (string= expected-value calculated-value) (format t "~%Group ~A OK ==> ~A" i calculated-value) (format t "~%Group ~A ==> ~A instead of ~A ******************** TEST FAILED" i calculated-value expected-value) ) )) ))) ))) ) (terpri)) (print ";; *****************************************************************************") (print ";; END OF TEST") (print ";; -----------------------------------------------------------------------------") ) ;; ***************************************************************************** ;; END OF FILE ;; ----------------------------------------------------------------------------- cl-regex-1/speedtest.c0100755000175000001440000000473407542737550013621 0ustar mrdusers#include #include #include #include "regex.h" void respeedtest(int numreps, const char* patstr, const char* candstr) { regex_t comp; regmatch_t matches[10]; int nomatch, err; time_t start, finish; int bMatched = -1; int candlen; float elapsed; int i; if (err = regcomp(&comp, patstr, REG_EXTENDED)) { char error[256]; regerror(err,&comp,error,255); printf("Error in regcomp: %s\n",error); return; } fprintf(stdout, "\nTiming %s\n", patstr); fflush(stdout); time(&start); candlen = strlen(candstr); for (i = 0; i < numreps; ++i) bMatched=!regexec(&comp,candstr,10,matches,0); time(&finish); elapsed = difftime(finish, start); fprintf(stdout, "\nRE match\t: %d secs, %d/sec, \"%s\" --> \"%s\"\n", (int)elapsed, (int)((float)numreps/elapsed), patstr, candstr); regfree(&comp); if (!bMatched) fprintf(stdout, "didn't match!\n"); } int foo; void strcmpspeedtest(int numreps, const char* patstr, const char* candstr, char* cmpname, int (*cmpfxn)(const char* a, const char* b)) { time_t start, finish; float elapsed; int i, j; fprintf(stdout, "\nTiming %s\n", cmpname); fflush(stdout); time(&start); for (i = 0; i < numreps; ++i) foo = (*cmpfxn)(patstr, candstr); time(&finish); elapsed = difftime(finish, start); fprintf(stdout, "\n%s\t: %d secs, %d/sec, \"%s\" --> \"%s\"\n", cmpname, (int)elapsed, (int)((float)numreps/elapsed), patstr, candstr); if (foo != 0) fprintf(stdout, "didn't match!\n"); } void speedtest() { char* candstr = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABD"; /* char* candstr = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABD"; */ const int numreps = 1000000; respeedtest(numreps, "A*BD", candstr); respeedtest(numreps, "(A|A)*BD", candstr); respeedtest(numreps, "(A|B)*BD", candstr); respeedtest(numreps, "(B|A)*BD", candstr); respeedtest(numreps, "((A*B)|(AC))D", candstr); respeedtest(numreps, "((A*B)|(A*C))D", candstr); respeedtest(numreps, "[Aa]*[Bb][Dd]", candstr); strcmpspeedtest(numreps, candstr, candstr, "strcmp", strcmp); strcmpspeedtest(numreps, candstr, candstr, "stricmp", stricmp); } void main() { fprintf(stdout, "\nStarting speed test\n"); speedtest(); fprintf(stdout, "\ndone\n"); }