regex-posix-0.95.2/0000755000000000000000000000000011756216545012263 5ustar0000000000000000regex-posix-0.95.2/LICENSE0000644000000000000000000000274411756216545013277 0ustar0000000000000000This module is under this "3 clause" BSD license: Copyright (c) 2007, Christopher Kuklewicz All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The names of the contributors may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
regex-posix-0.95.2/regex-posix.cabal0000644000000000000000000000732211756216545015525 0ustar0000000000000000
Name:                   regex-posix
-- Keep the Version below in sync with ./Text/Regex/Posix.hs value
-- getVersion_Text_Regex_Posix :: Version
Version:                0.95.2
Cabal-Version:          >=1.2
License:                BSD3
License-File:           LICENSE
Copyright:              Copyright (c) 2007-2010, Christopher Kuklewicz
Author:                 Christopher Kuklewicz
Maintainer:             TextRegexLazy@personal.mightyreason.com
Stability:              Seems to work, passes a few tests
Homepage:               http://sourceforge.net/projects/lazy-regex
Package-URL:            http://code.haskell.org/regex-posix/
Synopsis:               Replaces/Enhances Text.Regex
Description:            The posix regex backend for regex-base
Category:               Text
Tested-With:            GHC
-- The bundled Setup.hs is the stock `defaultMain` from Distribution.Simple,
-- so the build type is Simple.  The field must be declared exactly once; an
-- earlier, conflicting "Build-Type: Custom" declaration has been removed.
Build-Type:             Simple
Extra-Source-Files:     cbits/engine.c
                        cbits/cclass.h
                        cbits/cname.h
                        cbits/regex2.h
                        cbits/regex.h
                        cbits/utils.h
                        c-finalizer/myfree.h
                        c-finalizer/myfree.c

flag newBase
  description: Choose base >= 4
  default: True

flag splitBase
  description: Choose the new smaller, split-up base package.
default: True Library if flag(newBase) Build-Depends: regex-base >= 0.93, base >= 4 && < 5, array, containers, bytestring -- Need the next symbol for using CPP to get Data.ByteString.Base|Unsafe in -- ./Text/Regex/Posix/ByteString.hs and ./Text/Regex/Posix/ByteString/Lazy.hs CPP-Options: "-DSPLIT_BASE=1" Extensions: MultiParamTypeClasses, FunctionalDependencies, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, FlexibleContexts, TypeSynonymInstances, FlexibleInstances else if flag(splitBase) Build-Depends: regex-base >= 0.93, base >= 3.0, array, containers, bytestring -- Need the next symbol for using CPP to get Data.ByteString.Base|Unsafe -- in ./Text/Regex/Posix/ByteString.hs and -- ./Text/Regex/Posix/ByteString/Lazy.hs CPP-Options: "-DSPLIT_BASE=1" Extensions: MultiParamTypeClasses, FunctionalDependencies, CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving, FlexibleContexts, TypeSynonymInstances, FlexibleInstances else Build-Depends: regex-base >= 0.93, base < 3.0 Extensions: MultiParamTypeClasses, FunctionalDependencies, CPP C-Sources: c-finalizer/myfree.c Include-Dirs: c-finalizer if !os(windows) CC-Options: -DHAVE_REGEX_H else C-Sources: cbits/reallocf.c cbits/regcomp.c cbits/regerror.c cbits/regexec.c cbits/regfree.c Include-Dirs: cbits -- Data-Files: -- Extra-Tmp-Files: -- This is the library Exposed-Modules: Text.Regex.Posix Text.Regex.Posix.Wrap Text.Regex.Posix.String Text.Regex.Posix.Sequence Text.Regex.Posix.ByteString Text.Regex.Posix.ByteString.Lazy -- Further fields Buildable: True -- Other-Modules: -- HS-Source-Dirs: "." 
-- The CPP is for using -DSPLIT_BASE=1 to get Data.ByteString.Base|Unsafe GHC-Options: -Wall -O2 -- GHC-Options: -Wall -Werror -O2 -- GHC-Options: -Wall -ddump-minimal-imports -- GHC-Prof-Options: -- Hugs-Options: -- NHC-Options: -- C-Sources: -- LD-Options: -- Frameworks: -- Includes: -- Include-Dirs: include -- Extra-Libraries: -- Extra-Lib-Dirs: regex-posix-0.95.2/Setup.hs0000644000000000000000000000005611756216545013720 0ustar0000000000000000import Distribution.Simple main = defaultMain regex-posix-0.95.2/c-finalizer/0000755000000000000000000000000011756216545014466 5ustar0000000000000000regex-posix-0.95.2/c-finalizer/myfree.c0000644000000000000000000000026511756216545016124 0ustar0000000000000000#include "myfree.h" #include "regex.h" #include "stdlib.h" /* void free(void *ptr); void regfree(regex_t *preg); */ void myregfree(void *preg) { regfree(preg); free(preg); } regex-posix-0.95.2/c-finalizer/myfree.h0000644000000000000000000000003111756216545016120 0ustar0000000000000000void myregfree(void *); regex-posix-0.95.2/cbits/0000755000000000000000000000000011756216545013367 5ustar0000000000000000regex-posix-0.95.2/cbits/cclass.h0000644000000000000000000000501611756216545015012 0ustar0000000000000000/*- * Copyright (c) 1992, 1993, 1994 Henry Spencer. * Copyright (c) 1992, 1993, 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Henry Spencer. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)cclass.h 8.3 (Berkeley) 3/20/94 */ typedef enum {CALNUM, CALPHA, CBLANK, CCNTRL, CDIGIT, CGRAPH, CLOWER, CPRINT, CPUNCT, CSPACE, CUPPER, CXDIGIT} citype; /* character-class table */ static struct cclass { char *name; citype fidx; } cclasses[] = { {"alnum", CALNUM}, {"alpha", CALPHA}, {"blank", CBLANK}, {"cntrl", CCNTRL}, {"digit", CDIGIT}, {"graph", CGRAPH}, {"lower", CLOWER}, {"print", CPRINT}, {"punct", CPUNCT}, {"space", CSPACE}, {"upper", CUPPER}, {"xdigit", CXDIGIT}, {NULL, } }; regex-posix-0.95.2/cbits/cname.h0000644000000000000000000001010211756216545014615 0ustar0000000000000000/*- * Copyright (c) 1992, 1993, 1994 Henry Spencer. * Copyright (c) 1992, 1993, 1994 * The Regents of the University of California. All rights reserved. 
* * This code is derived from software contributed to Berkeley by * Henry Spencer. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)cname.h 8.3 (Berkeley) 3/20/94 */ /* character-name table */ static struct cname { char *name; char code; } cnames[] = { {"NUL", '\0'}, {"SOH", '\001'}, {"STX", '\002'}, {"ETX", '\003'}, {"EOT", '\004'}, {"ENQ", '\005'}, {"ACK", '\006'}, {"BEL", '\007'}, {"alert", '\007'}, {"BS", '\010'}, {"backspace", '\b'}, {"HT", '\011'}, {"tab", '\t'}, {"LF", '\012'}, {"newline", '\n'}, {"VT", '\013'}, {"vertical-tab", '\v'}, {"FF", '\014'}, {"form-feed", '\f'}, {"CR", '\015'}, {"carriage-return", '\r'}, {"SO", '\016'}, {"SI", '\017'}, {"DLE", '\020'}, {"DC1", '\021'}, {"DC2", '\022'}, {"DC3", '\023'}, {"DC4", '\024'}, {"NAK", '\025'}, {"SYN", '\026'}, {"ETB", '\027'}, {"CAN", '\030'}, {"EM", '\031'}, {"SUB", '\032'}, {"ESC", '\033'}, {"IS4", '\034'}, {"FS", '\034'}, {"IS3", '\035'}, {"GS", '\035'}, {"IS2", '\036'}, {"RS", '\036'}, {"IS1", '\037'}, {"US", '\037'}, {"space", ' '}, {"exclamation-mark", '!'}, {"quotation-mark", '"'}, {"number-sign", '#'}, {"dollar-sign", '$'}, {"percent-sign", '%'}, {"ampersand", '&'}, {"apostrophe", '\''}, {"left-parenthesis", '('}, {"right-parenthesis", ')'}, {"asterisk", '*'}, {"plus-sign", '+'}, {"comma", ','}, {"hyphen", '-'}, {"hyphen-minus", '-'}, {"period", '.'}, {"full-stop", '.'}, {"slash", '/'}, {"solidus", '/'}, {"zero", '0'}, {"one", '1'}, {"two", '2'}, {"three", '3'}, {"four", '4'}, {"five", '5'}, {"six", '6'}, {"seven", '7'}, {"eight", '8'}, {"nine", '9'}, {"colon", ':'}, {"semicolon", ';'}, {"less-than-sign", '<'}, {"equals-sign", '='}, {"greater-than-sign", '>'}, {"question-mark", '?'}, {"commercial-at", '@'}, {"left-square-bracket", '['}, {"backslash", '\\'}, {"reverse-solidus", '\\'}, {"right-square-bracket",']'}, {"circumflex", '^'}, {"circumflex-accent", '^'}, {"underscore", '_'}, {"low-line", '_'}, {"grave-accent", '`'}, {"left-brace", '{'}, {"left-curly-bracket", '{'}, {"vertical-line", '|'}, {"right-brace", '}'}, {"right-curly-bracket", '}'}, {"tilde", '~'}, {"DEL", '\177'}, {NULL, 0} }; 
regex-posix-0.95.2/cbits/engine.c0000644000000000000000000007253311756216545015012 0ustar0000000000000000/*- * Copyright (c) 1992, 1993, 1994 Henry Spencer. * Copyright (c) 1992, 1993, 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Henry Spencer. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)engine.c 8.5 (Berkeley) 3/20/94 * * $FreeBSD: src/lib/libc/regex/engine.c,v 1.5.8.1 2000/07/31 06:30:37 dcs Exp $ */ /* * The matching engine and friends. This file is #included by regexec.c * after suitable #defines of a variety of macros used herein, so that * different state representations can be used without duplicating masses * of code. */ #ifdef SNAMES #define matcher smatcher #define fast sfast #define slow sslow #define dissect sdissect #define backref sbackref #define step sstep #define print sprint #define at sat #define match smat #endif #ifdef LNAMES #define matcher lmatcher #define fast lfast #define slow lslow #define dissect ldissect #define backref lbackref #define step lstep #define print lprint #define at lat #define match lmat #endif /* another structure passed up and down to avoid zillions of parameters */ struct match { struct re_guts *g; int eflags; regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ char *offp; /* offsets work from here */ char *beginp; /* start of string -- virtual NUL precedes */ char *endp; /* end of string -- virtual NUL here */ char *coldp; /* can be no match starting before here */ char **lastpos; /* [nplus+1] */ STATEVARS; states st; /* current states */ states fresh; /* states for a fresh start */ states tmp; /* temporary */ states empty; /* empty set of states */ }; /* ========= begin header generated by ./mkh ========= */ #ifdef __cplusplus extern "C" { #endif /* === engine.c === */ static int 
matcher (struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[], int eflags); static char *dissect (struct match *m, char *start, char *stop, sopno startst, sopno stopst); static char *backref (struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev); static char *fast (struct match *m, char *start, char *stop, sopno startst, sopno stopst); static char *slow (struct match *m, char *start, char *stop, sopno startst, sopno stopst); static states step (struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft); #define BOL (OUT+1) #define EOL (BOL+1) #define BOLEOL (BOL+2) #define NOTHING (BOL+3) #define BOW (BOL+4) #define EOW (BOL+5) #define CODEMAX (BOL+5) /* highest code used */ #define NONCHAR(c) ((c) > CHAR_MAX) #define NNONCHAR (CODEMAX-CHAR_MAX) #ifdef REDEBUG static void print (struct match *m, char *caption, states st, int ch, FILE *d); #endif #ifdef REDEBUG static void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst)); #endif #ifdef REDEBUG static char *pchar __P((int ch)); #endif #ifdef __cplusplus } #endif /* ========= end header generated by ./mkh ========= */ #ifdef REDEBUG #define SP(t, s, c) print(m, t, s, c, stdout) #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } #else #define SP(t, s, c) /* nothing */ #define AT(t, p1, p2, s1, s2) /* nothing */ #define NOTE(s) /* nothing */ #endif /* - matcher - the actual matching engine == static int matcher(register struct re_guts *g, char *string, \ == size_t nmatch, regmatch_t pmatch[], int eflags); */ static int /* 0 success, REG_NOMATCH failure */ matcher(g, string, nmatch, pmatch, eflags) register struct re_guts *g; char *string; size_t nmatch; regmatch_t pmatch[]; int eflags; { register char *endp; register int i; struct match mv; register struct match *m = &mv; register char *dp; register const sopno gf = g->firststate+1; /* +1 for OEND 
*/ register const sopno gl = g->laststate; char *start; char *stop; /* Boyer-Moore algorithms variables */ register char *pp; int cj, mj; register char *mustfirst; register char *mustlast; register int *matchjump; register int *charjump; /* simplify the situation where possible */ if (g->cflags®_NOSUB) nmatch = 0; if (eflags®_STARTEND) { start = string + pmatch[0].rm_so; stop = string + pmatch[0].rm_eo; } else { start = string; stop = start + strlen(start); } if (stop < start) return(REG_INVARG); /* prescreening; this does wonders for this rather slow code */ if (g->must != NULL) { if (g->charjump != NULL && g->matchjump != NULL) { mustfirst = g->must; mustlast = g->must + g->mlen - 1; charjump = g->charjump; matchjump = g->matchjump; pp = mustlast; for (dp = start+g->mlen-1; dp < stop;) { /* Fast skip non-matches */ while (dp < stop && charjump[*dp]) dp += charjump[*dp]; if (dp >= stop) break; /* Greedy matcher */ /* We depend on not being used for * for strings of length 1 */ while (*--dp == *--pp && pp != mustfirst); if (*dp == *pp) break; /* Jump to next possible match */ mj = matchjump[pp - mustfirst]; cj = charjump[*dp]; dp += (cj < mj ? mj : cj); pp = mustlast; } if (pp != mustfirst) return(REG_NOMATCH); } else { for (dp = start; dp < stop; dp++) if (*dp == g->must[0] && stop - dp >= g->mlen && memcmp(dp, g->must, (size_t)g->mlen) == 0) break; if (dp == stop) /* we didn't find g->must */ return(REG_NOMATCH); } } /* match struct setup */ m->g = g; m->eflags = eflags; m->pmatch = NULL; m->lastpos = NULL; m->offp = string; m->beginp = start; m->endp = stop; STATESETUP(m, 4); SETUP(m->st); SETUP(m->fresh); SETUP(m->tmp); SETUP(m->empty); CLEAR(m->empty); /* Adjust start according to moffset, to speed things up */ if (g->moffset > -1) start = ((dp - g->moffset) < start) ? 
start : dp - g->moffset; /* this loop does only one repetition except for backrefs */ for (;;) { endp = fast(m, start, stop, gf, gl); if (endp == NULL) { /* a miss */ STATETEARDOWN(m); return(REG_NOMATCH); } if (nmatch == 0 && !g->backrefs) break; /* no further info needed */ /* where? */ assert(m->coldp != NULL); for (;;) { NOTE("finding start"); endp = slow(m, m->coldp, stop, gf, gl); if (endp != NULL) break; assert(m->coldp < m->endp); m->coldp++; } if (nmatch == 1 && !g->backrefs) break; /* no further info needed */ /* oh my, he wants the subexpressions... */ if (m->pmatch == NULL) m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * sizeof(regmatch_t)); if (m->pmatch == NULL) { STATETEARDOWN(m); return(REG_ESPACE); } for (i = 1; i <= m->g->nsub; i++) m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; if (!g->backrefs && !(m->eflags®_BACKR)) { NOTE("dissecting"); dp = dissect(m, m->coldp, endp, gf, gl); } else { if (g->nplus > 0 && m->lastpos == NULL) m->lastpos = (char **)malloc((g->nplus+1) * sizeof(char *)); if (g->nplus > 0 && m->lastpos == NULL) { free(m->pmatch); STATETEARDOWN(m); return(REG_ESPACE); } NOTE("backref dissect"); dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); } if (dp != NULL) break; /* uh-oh... 
we couldn't find a subexpression-level match */ assert(g->backrefs); /* must be back references doing it */ assert(g->nplus == 0 || m->lastpos != NULL); for (;;) { if (dp != NULL || endp <= m->coldp) break; /* defeat */ NOTE("backoff"); endp = slow(m, m->coldp, endp-1, gf, gl); if (endp == NULL) break; /* defeat */ /* try it on a shorter possibility */ #ifndef NDEBUG for (i = 1; i <= m->g->nsub; i++) { assert(m->pmatch[i].rm_so == -1); assert(m->pmatch[i].rm_eo == -1); } #endif NOTE("backoff dissect"); dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); } assert(dp == NULL || dp == endp); if (dp != NULL) /* found a shorter one */ break; /* despite initial appearances, there is no match here */ NOTE("false alarm"); start = m->coldp + 1; /* recycle starting later */ assert(start <= stop); } /* fill in the details if requested */ if (nmatch > 0) { pmatch[0].rm_so = m->coldp - m->offp; pmatch[0].rm_eo = endp - m->offp; } if (nmatch > 1) { assert(m->pmatch != NULL); for (i = 1; i < nmatch; i++) if (i <= m->g->nsub) pmatch[i] = m->pmatch[i]; else { pmatch[i].rm_so = -1; pmatch[i].rm_eo = -1; } } if (m->pmatch != NULL) free((char *)m->pmatch); if (m->lastpos != NULL) free((char *)m->lastpos); STATETEARDOWN(m); return(0); } /* - dissect - figure out what matched what, no back references == static char *dissect(register struct match *m, char *start, \ == char *stop, sopno startst, sopno stopst); */ static char * /* == stop (success) always */ dissect(m, start, stop, startst, stopst) register struct match *m; char *start; char *stop; sopno startst; sopno stopst; { register int i; register sopno ss; /* start sop of current subRE */ register sopno es; /* end sop of current subRE */ register char *sp; /* start of string matched by it */ register char *stp; /* string matched by it cannot pass here */ register char *rest; /* start of rest of string */ register char *tail; /* string unmatched by rest of RE */ register sopno ssub; /* start sop of subsubRE */ register sopno esub; /* 
end sop of subsubRE */ register char *ssp; /* start of string matched by subsubRE */ register char *sep; /* end of string matched by subsubRE */ register char *oldssp; /* previous ssp */ register char *dp; AT("diss", start, stop, startst, stopst); sp = start; for (ss = startst; ss < stopst; ss = es) { /* identify end of subRE */ es = ss; switch (OP(m->g->strip[es])) { case OPLUS_: case OQUEST_: es += OPND(m->g->strip[es]); break; case OCH_: while (OP(m->g->strip[es]) != O_CH) es += OPND(m->g->strip[es]); break; } es++; /* figure out what it matched */ switch (OP(m->g->strip[ss])) { case OEND: assert(nope); break; case OCHAR: sp++; break; case OBOL: case OEOL: case OBOW: case OEOW: break; case OANY: case OANYOF: sp++; break; case OBACK_: case O_BACK: assert(nope); break; /* cases where length of match is hard to find */ case OQUEST_: stp = stop; for (;;) { /* how long could this one be? */ rest = slow(m, sp, stp, ss, es); assert(rest != NULL); /* it did match */ /* could the rest match the rest? */ tail = slow(m, rest, stop, es, stopst); if (tail == stop) break; /* yes! */ /* no -- try a shorter match for this one */ stp = rest - 1; assert(stp >= sp); /* it did work */ } ssub = ss + 1; esub = es - 1; /* did innards match? */ if (slow(m, sp, rest, ssub, esub) != NULL) { dp = dissect(m, sp, rest, ssub, esub); assert(dp == rest); } else /* no */ assert(sp == rest); sp = rest; break; case OPLUS_: stp = stop; for (;;) { /* how long could this one be? */ rest = slow(m, sp, stp, ss, es); assert(rest != NULL); /* it did match */ /* could the rest match the rest? */ tail = slow(m, rest, stop, es, stopst); if (tail == stop) break; /* yes! 
*/ /* no -- try a shorter match for this one */ stp = rest - 1; assert(stp >= sp); /* it did work */ } ssub = ss + 1; esub = es - 1; ssp = sp; oldssp = ssp; for (;;) { /* find last match of innards */ sep = slow(m, ssp, rest, ssub, esub); if (sep == NULL || sep == ssp) break; /* failed or matched null */ oldssp = ssp; /* on to next try */ ssp = sep; } if (sep == NULL) { /* last successful match */ sep = ssp; ssp = oldssp; } assert(sep == rest); /* must exhaust substring */ assert(slow(m, ssp, sep, ssub, esub) == rest); dp = dissect(m, ssp, sep, ssub, esub); assert(dp == sep); sp = rest; break; case OCH_: stp = stop; for (;;) { /* how long could this one be? */ rest = slow(m, sp, stp, ss, es); assert(rest != NULL); /* it did match */ /* could the rest match the rest? */ tail = slow(m, rest, stop, es, stopst); if (tail == stop) break; /* yes! */ /* no -- try a shorter match for this one */ stp = rest - 1; assert(stp >= sp); /* it did work */ } ssub = ss + 1; esub = ss + OPND(m->g->strip[ss]) - 1; assert(OP(m->g->strip[esub]) == OOR1); for (;;) { /* find first matching branch */ if (slow(m, sp, rest, ssub, esub) == rest) break; /* it matched all of it */ /* that one missed, try next one */ assert(OP(m->g->strip[esub]) == OOR1); esub++; assert(OP(m->g->strip[esub]) == OOR2); ssub = esub + 1; esub += OPND(m->g->strip[esub]); if (OP(m->g->strip[esub]) == OOR2) esub--; else assert(OP(m->g->strip[esub]) == O_CH); } dp = dissect(m, sp, rest, ssub, esub); assert(dp == rest); sp = rest; break; case O_PLUS: case O_QUEST: case OOR1: case OOR2: case O_CH: assert(nope); break; case OLPAREN: i = OPND(m->g->strip[ss]); assert(0 < i && i <= m->g->nsub); m->pmatch[i].rm_so = sp - m->offp; break; case ORPAREN: i = OPND(m->g->strip[ss]); assert(0 < i && i <= m->g->nsub); m->pmatch[i].rm_eo = sp - m->offp; break; default: /* uh oh */ assert(nope); break; } } assert(sp == stop); return(sp); } /* - backref - figure out what matched what, figuring in back references == static char 
*backref(register struct match *m, char *start, \ == char *stop, sopno startst, sopno stopst, sopno lev); */ static char * /* == stop (success) or NULL (failure) */ backref(m, start, stop, startst, stopst, lev) register struct match *m; char *start; char *stop; sopno startst; sopno stopst; sopno lev; /* PLUS nesting level */ { register int i; register sopno ss; /* start sop of current subRE */ register char *sp; /* start of string matched by it */ register sopno ssub; /* start sop of subsubRE */ register sopno esub; /* end sop of subsubRE */ register char *ssp; /* start of string matched by subsubRE */ register char *dp; register size_t len; register int hard; register sop s; register regoff_t offsave; register cset *cs; AT("back", start, stop, startst, stopst); sp = start; /* get as far as we can with easy stuff */ hard = 0; for (ss = startst; !hard && ss < stopst; ss++) switch (OP(s = m->g->strip[ss])) { case OCHAR: if (sp == stop || *sp++ != (char)OPND(s)) return(NULL); break; case OANY: if (sp == stop) return(NULL); sp++; break; case OANYOF: cs = &m->g->sets[OPND(s)]; if (sp == stop || !CHIN(cs, *sp++)) return(NULL); break; case OBOL: if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || (sp < m->endp && *(sp-1) == '\n' && (m->g->cflags®_NEWLINE)) ) { /* yes */ } else return(NULL); break; case OEOL: if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || (sp < m->endp && *sp == '\n' && (m->g->cflags®_NEWLINE)) ) { /* yes */ } else return(NULL); break; case OBOW: if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || (sp < m->endp && *(sp-1) == '\n' && (m->g->cflags®_NEWLINE)) || (sp > m->beginp && !ISWORD(*(sp-1))) ) && (sp < m->endp && ISWORD(*sp)) ) { /* yes */ } else return(NULL); break; case OEOW: if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || (sp < m->endp && *sp == '\n' && (m->g->cflags®_NEWLINE)) || (sp < m->endp && !ISWORD(*sp)) ) && (sp > m->beginp && ISWORD(*(sp-1))) ) { /* yes */ } else return(NULL); break; case O_QUEST: break; case OOR1: /* matches null but needs 
to skip */ ss++; s = m->g->strip[ss]; do { assert(OP(s) == OOR2); ss += OPND(s); } while (OP(s = m->g->strip[ss]) != O_CH); /* note that the ss++ gets us past the O_CH */ break; default: /* have to make a choice */ hard = 1; break; } if (!hard) { /* that was it! */ if (sp != stop) return(NULL); return(sp); } ss--; /* adjust for the for's final increment */ /* the hard stuff */ AT("hard", sp, stop, ss, stopst); s = m->g->strip[ss]; switch (OP(s)) { case OBACK_: /* the vilest depths */ i = OPND(s); assert(0 < i && i <= m->g->nsub); if (m->pmatch[i].rm_eo == -1) return(NULL); assert(m->pmatch[i].rm_so != -1); len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; assert(stop - m->beginp >= len); if (sp > stop - len) return(NULL); /* not enough left to match */ ssp = m->offp + m->pmatch[i].rm_so; if (memcmp(sp, ssp, len) != 0) return(NULL); while (m->g->strip[ss] != SOP(O_BACK, i)) ss++; return(backref(m, sp+len, stop, ss+1, stopst, lev)); break; case OQUEST_: /* to null or not */ dp = backref(m, sp, stop, ss+1, stopst, lev); if (dp != NULL) return(dp); /* not */ return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev)); break; case OPLUS_: assert(m->lastpos != NULL); assert(lev+1 <= m->g->nplus); m->lastpos[lev+1] = sp; return(backref(m, sp, stop, ss+1, stopst, lev+1)); break; case O_PLUS: if (sp == m->lastpos[lev]) /* last pass matched null */ return(backref(m, sp, stop, ss+1, stopst, lev-1)); /* try another pass */ m->lastpos[lev] = sp; dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev); if (dp == NULL) return(backref(m, sp, stop, ss+1, stopst, lev-1)); else return(dp); break; case OCH_: /* find the right one, if any */ ssub = ss + 1; esub = ss + OPND(s) - 1; assert(OP(m->g->strip[esub]) == OOR1); for (;;) { /* find first matching branch */ dp = backref(m, sp, stop, ssub, esub, lev); if (dp != NULL) return(dp); /* that one missed, try next one */ if (OP(m->g->strip[esub]) == O_CH) return(NULL); /* there is none */ esub++; assert(OP(m->g->strip[esub]) == OOR2); ssub = esub + 
1; esub += OPND(m->g->strip[esub]); if (OP(m->g->strip[esub]) == OOR2) esub--; else assert(OP(m->g->strip[esub]) == O_CH); } break; case OLPAREN: /* must undo assignment if rest fails */ i = OPND(s); assert(0 < i && i <= m->g->nsub); offsave = m->pmatch[i].rm_so; m->pmatch[i].rm_so = sp - m->offp; dp = backref(m, sp, stop, ss+1, stopst, lev); if (dp != NULL) return(dp); m->pmatch[i].rm_so = offsave; return(NULL); break; case ORPAREN: /* must undo assignment if rest fails */ i = OPND(s); assert(0 < i && i <= m->g->nsub); offsave = m->pmatch[i].rm_eo; m->pmatch[i].rm_eo = sp - m->offp; dp = backref(m, sp, stop, ss+1, stopst, lev); if (dp != NULL) return(dp); m->pmatch[i].rm_eo = offsave; return(NULL); break; default: /* uh oh */ assert(nope); break; } /* "can't happen" */ assert(nope); /* NOTREACHED */ return "shut up gcc"; } /* - fast - step through the string at top speed == static char *fast(register struct match *m, char *start, \ == char *stop, sopno startst, sopno stopst); */ static char * /* where tentative match ended, or NULL */ fast(m, start, stop, startst, stopst) register struct match *m; char *start; char *stop; sopno startst; sopno stopst; { register states st = m->st; register states fresh = m->fresh; register states tmp = m->tmp; register char *p = start; register int c = (start == m->beginp) ? OUT : *(start-1); register int lastc; /* previous c */ register int flagch; register int i; register char *coldp; /* last p after which no match was underway */ CLEAR(st); SET1(st, startst); st = step(m->g, startst, stopst, st, NOTHING, st); ASSIGN(fresh, st); SP("start", st, *p); coldp = NULL; for (;;) { /* next character */ lastc = c; c = (p == m->endp) ? OUT : *p; if (EQ(st, fresh)) coldp = p; /* is there an EOL and/or BOL between lastc and c? 
*/ flagch = '\0'; i = 0; if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || (lastc == OUT && !(m->eflags®_NOTBOL)) ) { flagch = BOL; i = m->g->nbol; } if ( (c == '\n' && m->g->cflags®_NEWLINE) || (c == OUT && !(m->eflags®_NOTEOL)) ) { flagch = (flagch == BOL) ? BOLEOL : EOL; i += m->g->neol; } if (i != 0) { for (; i > 0; i--) st = step(m->g, startst, stopst, st, flagch, st); SP("boleol", st, c); } /* how about a word boundary? */ if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && (c != OUT && ISWORD(c)) ) { flagch = BOW; } if ( (lastc != OUT && ISWORD(lastc)) && (flagch == EOL || (c != OUT && !ISWORD(c))) ) { flagch = EOW; } if (flagch == BOW || flagch == EOW) { st = step(m->g, startst, stopst, st, flagch, st); SP("boweow", st, c); } /* are we done? */ if (ISSET(st, stopst) || p == stop) break; /* NOTE BREAK OUT */ /* no, we must deal with this character */ ASSIGN(tmp, st); ASSIGN(st, fresh); assert(c != OUT); st = step(m->g, startst, stopst, tmp, c, st); SP("aft", st, c); assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); p++; } assert(coldp != NULL); m->coldp = coldp; if (ISSET(st, stopst)) return(p+1); else return(NULL); } /* - slow - step through the string more deliberately == static char *slow(register struct match *m, char *start, \ == char *stop, sopno startst, sopno stopst); */ static char * /* where it ended */ slow(m, start, stop, startst, stopst) register struct match *m; char *start; char *stop; sopno startst; sopno stopst; { register states st = m->st; register states empty = m->empty; register states tmp = m->tmp; register char *p = start; register int c = (start == m->beginp) ? 
OUT : *(start-1); register int lastc; /* previous c */ register int flagch; register int i; register char *matchp; /* last p at which a match ended */ AT("slow", start, stop, startst, stopst); CLEAR(st); SET1(st, startst); SP("sstart", st, *p); st = step(m->g, startst, stopst, st, NOTHING, st); matchp = NULL; for (;;) { /* next character */ lastc = c; c = (p == m->endp) ? OUT : *p; /* is there an EOL and/or BOL between lastc and c? */ flagch = '\0'; i = 0; if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || (lastc == OUT && !(m->eflags®_NOTBOL)) ) { flagch = BOL; i = m->g->nbol; } if ( (c == '\n' && m->g->cflags®_NEWLINE) || (c == OUT && !(m->eflags®_NOTEOL)) ) { flagch = (flagch == BOL) ? BOLEOL : EOL; i += m->g->neol; } if (i != 0) { for (; i > 0; i--) st = step(m->g, startst, stopst, st, flagch, st); SP("sboleol", st, c); } /* how about a word boundary? */ if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && (c != OUT && ISWORD(c)) ) { flagch = BOW; } if ( (lastc != OUT && ISWORD(lastc)) && (flagch == EOL || (c != OUT && !ISWORD(c))) ) { flagch = EOW; } if (flagch == BOW || flagch == EOW) { st = step(m->g, startst, stopst, st, flagch, st); SP("sboweow", st, c); } /* are we done? 
*/ if (ISSET(st, stopst)) matchp = p; if (EQ(st, empty) || p == stop) break; /* NOTE BREAK OUT */ /* no, we must deal with this character */ ASSIGN(tmp, st); ASSIGN(st, empty); assert(c != OUT); st = step(m->g, startst, stopst, tmp, c, st); SP("saft", st, c); assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); p++; } return(matchp); } /* - step - map set of states reachable before char to set reachable after == static states step(register struct re_guts *g, sopno start, sopno stop, \ == register states bef, int ch, register states aft); == #define BOL (OUT+1) == #define EOL (BOL+1) == #define BOLEOL (BOL+2) == #define NOTHING (BOL+3) == #define BOW (BOL+4) == #define EOW (BOL+5) == #define CODEMAX (BOL+5) // highest code used == #define NONCHAR(c) ((c) > CHAR_MAX) == #define NNONCHAR (CODEMAX-CHAR_MAX) */ static states step(g, start, stop, bef, ch, aft) register struct re_guts *g; sopno start; /* start state within strip */ sopno stop; /* state after stop state within strip */ register states bef; /* states reachable before */ int ch; /* character or NONCHAR code */ register states aft; /* states already known reachable after */ { register cset *cs; register sop s; register sopno pc; register onestate here; /* note, macros know this name */ register sopno look; register int i; for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { s = g->strip[pc]; switch (OP(s)) { case OEND: assert(pc == stop-1); break; case OCHAR: /* only characters can match */ assert(!NONCHAR(ch) || ch != (char)OPND(s)); if (ch == (char)OPND(s)) FWD(aft, bef, 1); break; case OBOL: if (ch == BOL || ch == BOLEOL) FWD(aft, bef, 1); break; case OEOL: if (ch == EOL || ch == BOLEOL) FWD(aft, bef, 1); break; case OBOW: if (ch == BOW) FWD(aft, bef, 1); break; case OEOW: if (ch == EOW) FWD(aft, bef, 1); break; case OANY: if (!NONCHAR(ch)) FWD(aft, bef, 1); break; case OANYOF: cs = &g->sets[OPND(s)]; if (!NONCHAR(ch) && CHIN(cs, ch)) FWD(aft, bef, 1); break; case OBACK_: /* ignored 
here */ case O_BACK: FWD(aft, aft, 1); break; case OPLUS_: /* forward, this is just an empty */ FWD(aft, aft, 1); break; case O_PLUS: /* both forward and back */ FWD(aft, aft, 1); i = ISSETBACK(aft, OPND(s)); BACK(aft, aft, OPND(s)); if (!i && ISSETBACK(aft, OPND(s))) { /* oho, must reconsider loop body */ pc -= OPND(s) + 1; INIT(here, pc); } break; case OQUEST_: /* two branches, both forward */ FWD(aft, aft, 1); FWD(aft, aft, OPND(s)); break; case O_QUEST: /* just an empty */ FWD(aft, aft, 1); break; case OLPAREN: /* not significant here */ case ORPAREN: FWD(aft, aft, 1); break; case OCH_: /* mark the first two branches */ FWD(aft, aft, 1); assert(OP(g->strip[pc+OPND(s)]) == OOR2); FWD(aft, aft, OPND(s)); break; case OOR1: /* done a branch, find the O_CH */ if (ISSTATEIN(aft, here)) { for (look = 1; OP(s = g->strip[pc+look]) != O_CH; look += OPND(s)) assert(OP(s) == OOR2); FWD(aft, aft, look); } break; case OOR2: /* propagate OCH_'s marking */ FWD(aft, aft, 1); if (OP(g->strip[pc+OPND(s)]) != O_CH) { assert(OP(g->strip[pc+OPND(s)]) == OOR2); FWD(aft, aft, OPND(s)); } break; case O_CH: /* just empty */ FWD(aft, aft, 1); break; default: /* ooooops... */ assert(nope); break; } } return(aft); } #ifdef REDEBUG /* - print - print a set of states == #ifdef REDEBUG == static void print(struct match *m, char *caption, states st, \ == int ch, FILE *d); == #endif */ static void print(m, caption, st, ch, d) struct match *m; char *caption; states st; int ch; FILE *d; { register struct re_guts *g = m->g; register int i; register int first = 1; if (!(m->eflags®_TRACE)) return; fprintf(d, "%s", caption); if (ch != '\0') fprintf(d, " %s", pchar(ch)); for (i = 0; i < g->nstates; i++) if (ISSET(st, i)) { fprintf(d, "%s%d", (first) ? 
"\t" : ", ", i); first = 0; } fprintf(d, "\n"); } /* - at - print current situation == #ifdef REDEBUG == static void at(struct match *m, char *title, char *start, char *stop, \ == sopno startst, sopno stopst); == #endif */ static void at(m, title, start, stop, startst, stopst) struct match *m; char *title; char *start; char *stop; sopno startst; sopno stopst; { if (!(m->eflags®_TRACE)) return; printf("%s %s-", title, pchar(*start)); printf("%s ", pchar(*stop)); printf("%ld-%ld\n", (long)startst, (long)stopst); } #ifndef PCHARDONE #define PCHARDONE /* never again */ /* - pchar - make a character printable == #ifdef REDEBUG == static char *pchar(int ch); == #endif * * Is this identical to regchar() over in debug.c? Well, yes. But a * duplicate here avoids having a debugging-capable regexec.o tied to * a matching debug.o, and this is convenient. It all disappears in * the non-debug compilation anyway, so it doesn't matter much. */ static char * /* -> representation */ pchar(ch) int ch; { static char pbuf[10]; if (isprint((uch)ch) || ch == ' ') sprintf(pbuf, "%c", ch); else sprintf(pbuf, "\\%o", ch); return(pbuf); } #endif #endif #undef matcher #undef fast #undef slow #undef dissect #undef backref #undef step #undef print #undef at #undef match regex-posix-0.95.2/cbits/reallocf.c0000644000000000000000000000311711756216545015324 0ustar0000000000000000/*- * Copyright (c) 1998, M. Warner Losh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD: src/lib/libc/stdlib/reallocf.c,v 1.3 1999/08/28 00:01:37 peter Exp $ */ #include void * reallocf(void *ptr, size_t size) { void *nptr; nptr = realloc(ptr, size); if (!nptr && ptr) free(ptr); return (nptr); } regex-posix-0.95.2/cbits/regcomp.c0000644000000000000000000014425311756216545015200 0ustar0000000000000000/*- * Copyright (c) 1992, 1993, 1994 Henry Spencer. * Copyright (c) 1992, 1993, 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Henry Spencer. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 * * $FreeBSD: src/lib/libc/regex/regcomp.c,v 1.13.2.1 2000/07/31 06:30:37 dcs Exp $ */ #if defined(LIBC_SCCS) && !defined(lint) static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; #endif /* LIBC_SCCS and not lint */ #include #include #include #include #include #include #include "regex.h" // removed collate stuff --SDM // #include "collate.h" #ifdef _WIN32 #define isblank(c) ((c) == ' ' || (c) == '\t') #endif #include "utils.h" #include "regex2.h" #include "cclass.h" #include "cname.h" /* * parse structure, passed up and down to avoid global variables and * other clumsinesses */ struct parse { char *next; /* next character in RE */ char *end; /* end of string (-> NUL normally) */ int error; /* has an error been seen? 
*/ sop *strip; /* malloced strip */ sopno ssize; /* malloced strip size (allocated) */ sopno slen; /* malloced strip length (used) */ int ncsalloc; /* number of csets allocated */ struct re_guts *g; # define NPAREN 10 /* we need to remember () 1-9 for back refs */ sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ sopno pend[NPAREN]; /* -> ) ([0] unused) */ }; /* === reallocf.c === */ void * reallocf(void *ptr, size_t size); /* ========= begin header generated by ./mkh ========= */ #ifdef __cplusplus extern "C" { #endif /* === regcomp.c === */ static void p_ere (struct parse *p, int stop); static void p_ere_exp (struct parse *p); static void p_str (struct parse *p); static void p_bre (struct parse *p, int end1, int end2); static int p_simp_re (struct parse *p, int starordinary); static int p_count (struct parse *p); static void p_bracket (struct parse *p); static void p_b_term (struct parse *p, cset *cs); static void p_b_cclass (struct parse *p, cset *cs); static void p_b_eclass (struct parse *p, cset *cs); static char p_b_symbol (struct parse *p); static char p_b_coll_elem (struct parse *p, int endc); static char othercase (int ch); static void bothcases (struct parse *p, int ch); static void ordinary (struct parse *p, int ch); static void nonnewline (struct parse *p); static void repeat (struct parse *p, sopno start, int from, int to); static int seterr (struct parse *p, int e); static cset *allocset (struct parse *p); static void freeset (struct parse *p, cset *cs); static int freezeset (struct parse *p, cset *cs); static int firstch (struct parse *p, cset *cs); static int nch (struct parse *p, cset *cs); static void mcadd (struct parse *p, cset *cs, char *cp); #if used static void mcsub (cset *cs, char *cp); static int mcin (cset *cs, char *cp); static char *mcfind (cset *cs, char *cp); #endif static void mcinvert (struct parse *p, cset *cs); static void mccase (struct parse *p, cset *cs); static int isinsets (struct re_guts *g, int c); static int samesets (struct 
re_guts *g, int c1, int c2); static void categorize (struct parse *p, struct re_guts *g); static sopno dupl (struct parse *p, sopno start, sopno finish); static void doemit (struct parse *p, sop op, size_t opnd); static void doinsert (struct parse *p, sop op, size_t opnd, sopno pos); static void dofwd (struct parse *p, sopno pos, sop value); static void enlarge (struct parse *p, sopno size); static void stripsnug (struct parse *p, struct re_guts *g); static void findmust (struct parse *p, struct re_guts *g); static int altoffset (sop *scan, int offset, int mccs); static void computejumps (struct parse *p, struct re_guts *g); static void computematchjumps (struct parse *p, struct re_guts *g); static sopno pluscount (struct parse *p, struct re_guts *g); #ifdef __cplusplus } #endif /* ========= end header generated by ./mkh ========= */ static char nuls[10]; /* place to point scanner in event of error */ /* * macros for use with parse structure * BEWARE: these know that the parse structure is named `p' !!! */ #define PEEK() (*p->next) #define PEEK2() (*(p->next+1)) #define MORE() (p->next < p->end) #define MORE2() (p->next+1 < p->end) #define SEE(c) (MORE() && PEEK() == (c)) #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) #define EATTWO(a, b) ((SEETWO(a, b)) ? 
(NEXT2(), 1) : 0) #define NEXT() (p->next++) #define NEXT2() (p->next += 2) #define NEXTn(n) (p->next += (n)) #define GETNEXT() (*p->next++) #define SETERROR(e) seterr(p, (e)) #define REQUIRE(co, e) ((co) || SETERROR(e)) #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) #define HERE() (p->slen) #define THERE() (p->slen - 1) #define THERETHERE() (p->slen - 2) #define DROP(n) (p->slen -= (n)) #ifndef NDEBUG static int never = 0; /* for use in asserts; shuts lint up */ #else #define never 0 /* some s have bugs too */ #endif /* Macro used by computejump()/computematchjump() */ #define MIN(a,b) ((a)<(b)?(a):(b)) /* - regcomp - interface for parser and compilation = extern int regcomp(regex_t *, const char *, int); = #define REG_BASIC 0000 = #define REG_EXTENDED 0001 = #define REG_ICASE 0002 = #define REG_NOSUB 0004 = #define REG_NEWLINE 0010 = #define REG_NOSPEC 0020 = #define REG_PEND 0040 = #define REG_DUMP 0200 */ int /* 0 success, otherwise REG_something */ regcomp(preg, pattern, cflags) regex_t *preg; const char *pattern; int cflags; { struct parse pa; register struct re_guts *g; register struct parse *p = &pa; register int i; register size_t len; #ifdef REDEBUG # define GOODFLAGS(f) (f) #else # define GOODFLAGS(f) ((f)&~REG_DUMP) #endif cflags = GOODFLAGS(cflags); if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) return(REG_INVARG); if (cflags®_PEND) { if (preg->re_endp < pattern) return(REG_INVARG); len = preg->re_endp - pattern; } else len = strlen((char *)pattern); /* do the mallocs early so failure handling is easy */ g = (struct re_guts *)malloc(sizeof(struct re_guts) + (NC-1)*sizeof(cat_t)); if (g == NULL) 
return(REG_ESPACE); p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ p->strip = (sop *)malloc(p->ssize * sizeof(sop)); p->slen = 0; if (p->strip == NULL) { free((char *)g); return(REG_ESPACE); } /* set things up */ p->g = g; p->next = (char *)pattern; /* convenience; we do not modify it */ p->end = p->next + len; p->error = 0; p->ncsalloc = 0; for (i = 0; i < NPAREN; i++) { p->pbegin[i] = 0; p->pend[i] = 0; } g->csetsize = NC; g->sets = NULL; g->setbits = NULL; g->ncsets = 0; g->cflags = cflags; g->iflags = 0; g->nbol = 0; g->neol = 0; g->must = NULL; g->moffset = -1; g->charjump = NULL; g->matchjump = NULL; g->mlen = 0; g->nsub = 0; g->ncategories = 1; /* category 0 is "everything else" */ g->categories = &g->catspace[-(CHAR_MIN)]; (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); g->backrefs = 0; /* do it */ EMIT(OEND, 0); g->firststate = THERE(); if (cflags®_EXTENDED) p_ere(p, OUT); else if (cflags®_NOSPEC) p_str(p); else p_bre(p, OUT, OUT); EMIT(OEND, 0); g->laststate = THERE(); /* tidy up loose ends and fill things in */ categorize(p, g); stripsnug(p, g); findmust(p, g); /* only use Boyer-Moore algorithm if the pattern is bigger * than three characters */ if(g->mlen > 3) { computejumps(p, g); computematchjumps(p, g); if(g->matchjump == NULL && g->charjump != NULL) { free(g->charjump); g->charjump = NULL; } } g->nplus = pluscount(p, g); g->magic = MAGIC2; preg->re_nsub = g->nsub; preg->re_g = g; preg->re_magic = MAGIC1; #ifndef REDEBUG /* not debugging, so can't rely on the assert() in regexec() */ if (g->iflags&BAD) SETERROR(REG_ASSERT); #endif /* win or lose, we're done */ if (p->error != 0) /* lose */ regfree(preg); return(p->error); } /* - p_ere - ERE parser top level, concatenation and alternation == static void p_ere(register struct parse *p, int stop); */ static void p_ere(p, stop) register struct parse *p; int stop; /* character this ERE should end at */ { register char c; register sopno prevback; register sopno prevfwd; register sopno 
conc; register int first = 1; /* is this the first alternative? */ for (;;) { /* do a bunch of concatenated expressions */ conc = HERE(); while (MORE() && (c = PEEK()) != '|' && c != stop) p_ere_exp(p); (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ if (!EAT('|')) break; /* NOTE BREAK OUT */ if (first) { INSERT(OCH_, conc); /* offset is wrong */ prevfwd = conc; prevback = conc; first = 0; } ASTERN(OOR1, prevback); prevback = THERE(); AHEAD(prevfwd); /* fix previous offset */ prevfwd = HERE(); EMIT(OOR2, 0); /* offset is very wrong */ } if (!first) { /* tail-end fixups */ AHEAD(prevfwd); ASTERN(O_CH, prevback); } assert(!MORE() || SEE(stop)); } /* - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op == static void p_ere_exp(register struct parse *p); */ static void p_ere_exp(p) register struct parse *p; { register char c; register sopno pos; register int count; register int count2; register sopno subno; int wascaret = 0; assert(MORE()); /* caller should have ensured this */ c = GETNEXT(); pos = HERE(); switch (c) { case '(': (void)REQUIRE(MORE(), REG_EPAREN); p->g->nsub++; subno = p->g->nsub; if (subno < NPAREN) p->pbegin[subno] = HERE(); EMIT(OLPAREN, subno); if (!SEE(')')) p_ere(p, ')'); if (subno < NPAREN) { p->pend[subno] = HERE(); assert(p->pend[subno] != 0); } EMIT(ORPAREN, subno); (void)MUSTEAT(')', REG_EPAREN); break; #ifndef POSIX_MISTAKE case ')': /* happens only if no current unmatched ( */ /* * You may ask, why the ifndef? Because I didn't notice * this until slightly too late for 1003.2, and none of the * other 1003.2 regular-expression reviewers noticed it at * all. So an unmatched ) is legal POSIX, at least until * we can get it fixed. 
*/ SETERROR(REG_EPAREN); break; #endif case '^': EMIT(OBOL, 0); p->g->iflags |= USEBOL; p->g->nbol++; wascaret = 1; break; case '$': EMIT(OEOL, 0); p->g->iflags |= USEEOL; p->g->neol++; break; case '|': SETERROR(REG_EMPTY); break; case '*': case '+': case '?': SETERROR(REG_BADRPT); break; case '.': if (p->g->cflags®_NEWLINE) nonnewline(p); else EMIT(OANY, 0); break; case '[': p_bracket(p); break; case '\\': (void)REQUIRE(MORE(), REG_EESCAPE); c = GETNEXT(); ordinary(p, c); break; case '{': /* okay as ordinary except if digit follows */ (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); /* FALLTHROUGH */ default: ordinary(p, c); break; } if (!MORE()) return; c = PEEK(); /* we call { a repetition if followed by a digit */ if (!( c == '*' || c == '+' || c == '?' || (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) return; /* no repetition, we're done */ NEXT(); (void)REQUIRE(!wascaret, REG_BADRPT); switch (c) { case '*': /* implemented as +? */ /* this case does not require the (y|) trick, noKLUDGE */ INSERT(OPLUS_, pos); ASTERN(O_PLUS, pos); INSERT(OQUEST_, pos); ASTERN(O_QUEST, pos); break; case '+': INSERT(OPLUS_, pos); ASTERN(O_PLUS, pos); break; case '?': /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ INSERT(OCH_, pos); /* offset slightly wrong */ ASTERN(OOR1, pos); /* this one's right */ AHEAD(pos); /* fix the OCH_ */ EMIT(OOR2, 0); /* offset very wrong... */ AHEAD(THERE()); /* ...so fix it */ ASTERN(O_CH, THERETHERE()); break; case '{': count = p_count(p); if (EAT(',')) { if (isdigit((uch)PEEK())) { count2 = p_count(p); (void)REQUIRE(count <= count2, REG_BADBR); } else /* single number with comma */ count2 = INFINITY; } else /* just a single number */ count2 = count; repeat(p, pos, count, count2); if (!EAT('}')) { /* error heuristics */ while (MORE() && PEEK() != '}') NEXT(); (void)REQUIRE(MORE(), REG_EBRACE); SETERROR(REG_BADBR); } break; } if (!MORE()) return; c = PEEK(); if (!( c == '*' || c == '+' || c == '?' 
|| (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) return; SETERROR(REG_BADRPT); } /* - p_str - string (no metacharacters) "parser" == static void p_str(register struct parse *p); */ static void p_str(p) register struct parse *p; { (void)REQUIRE(MORE(), REG_EMPTY); while (MORE()) ordinary(p, GETNEXT()); } /* - p_bre - BRE parser top level, anchoring and concatenation == static void p_bre(register struct parse *p, register int end1, \ == register int end2); * Giving end1 as OUT essentially eliminates the end1/end2 check. * * This implementation is a bit of a kludge, in that a trailing $ is first * taken as an ordinary character and then revised to be an anchor. The * only undesirable side effect is that '$' gets included as a character * category in such cases. This is fairly harmless; not worth fixing. * The amount of lookahead needed to avoid this kludge is excessive. */ static void p_bre(p, end1, end2) register struct parse *p; register int end1; /* first terminating character */ register int end2; /* second terminating character */ { register sopno start = HERE(); register int first = 1; /* first subexpression? */ register int wasdollar = 0; if (EAT('^')) { EMIT(OBOL, 0); p->g->iflags |= USEBOL; p->g->nbol++; } while (MORE() && !SEETWO(end1, end2)) { wasdollar = p_simp_re(p, first); first = 0; } if (wasdollar) { /* oops, that was a trailing anchor */ DROP(1); EMIT(OEOL, 0); p->g->iflags |= USEEOL; p->g->neol++; } (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ } /* - p_simp_re - parse a simple RE, an atom possibly followed by a repetition == static int p_simp_re(register struct parse *p, int starordinary); */ static int /* was the simple RE an unbackslashed $? */ p_simp_re(p, starordinary) register struct parse *p; int starordinary; /* is a leading * an ordinary character? 
*/ { register int c; register int count; register int count2; register sopno pos; register int i; register sopno subno; # define BACKSL (1<g->cflags®_NEWLINE) nonnewline(p); else EMIT(OANY, 0); break; case '[': p_bracket(p); break; case BACKSL|'{': SETERROR(REG_BADRPT); break; case BACKSL|'(': p->g->nsub++; subno = p->g->nsub; if (subno < NPAREN) p->pbegin[subno] = HERE(); EMIT(OLPAREN, subno); /* the MORE here is an error heuristic */ if (MORE() && !SEETWO('\\', ')')) p_bre(p, '\\', ')'); if (subno < NPAREN) { p->pend[subno] = HERE(); assert(p->pend[subno] != 0); } EMIT(ORPAREN, subno); (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); break; case BACKSL|')': /* should not get here -- must be user */ case BACKSL|'}': SETERROR(REG_EPAREN); break; case BACKSL|'1': case BACKSL|'2': case BACKSL|'3': case BACKSL|'4': case BACKSL|'5': case BACKSL|'6': case BACKSL|'7': case BACKSL|'8': case BACKSL|'9': i = (c&~BACKSL) - '0'; assert(i < NPAREN); if (p->pend[i] != 0) { assert(i <= p->g->nsub); EMIT(OBACK_, i); assert(p->pbegin[i] != 0); assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); assert(OP(p->strip[p->pend[i]]) == ORPAREN); (void) dupl(p, p->pbegin[i]+1, p->pend[i]); EMIT(O_BACK, i); } else SETERROR(REG_ESUBREG); p->g->backrefs = 1; break; case '*': (void)REQUIRE(starordinary, REG_BADRPT); /* FALLTHROUGH */ default: ordinary(p, (char)c); break; } if (EAT('*')) { /* implemented as +? 
*/ /* this case does not require the (y|) trick, noKLUDGE */ INSERT(OPLUS_, pos); ASTERN(O_PLUS, pos); INSERT(OQUEST_, pos); ASTERN(O_QUEST, pos); } else if (EATTWO('\\', '{')) { count = p_count(p); if (EAT(',')) { if (MORE() && isdigit((uch)PEEK())) { count2 = p_count(p); (void)REQUIRE(count <= count2, REG_BADBR); } else /* single number with comma */ count2 = INFINITY; } else /* just a single number */ count2 = count; repeat(p, pos, count, count2); if (!EATTWO('\\', '}')) { /* error heuristics */ while (MORE() && !SEETWO('\\', '}')) NEXT(); (void)REQUIRE(MORE(), REG_EBRACE); SETERROR(REG_BADBR); } } else if (c == '$') /* $ (but not \$) ends it */ return(1); return(0); } /* - p_count - parse a repetition count == static int p_count(register struct parse *p); */ static int /* the value */ p_count(p) register struct parse *p; { register int count = 0; register int ndigits = 0; while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { count = count*10 + (GETNEXT() - '0'); ndigits++; } (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); return(count); } /* - p_bracket - parse a bracketed character list == static void p_bracket(register struct parse *p); * * Note a significant property of this code: if the allocset() did SETERROR, * no set operations are done. 
*/ static void p_bracket(p) register struct parse *p; { register cset *cs = allocset(p); register int invert = 0; /* Dept of Truly Sickening Special-Case Kludges */ if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { EMIT(OBOW, 0); NEXTn(6); return; } if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { EMIT(OEOW, 0); NEXTn(6); return; } if (EAT('^')) invert++; /* make note to invert set at end */ if (EAT(']')) CHadd(cs, ']'); else if (EAT('-')) CHadd(cs, '-'); while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) p_b_term(p, cs); if (EAT('-')) CHadd(cs, '-'); (void)MUSTEAT(']', REG_EBRACK); if (p->error != 0) /* don't mess things up further */ return; if (p->g->cflags®_ICASE) { register int i; register int ci; for (i = p->g->csetsize - 1; i >= 0; i--) if (CHIN(cs, i) && isalpha(i)) { ci = othercase(i); if (ci != i) CHadd(cs, ci); } if (cs->multis != NULL) mccase(p, cs); } if (invert) { register int i; for (i = p->g->csetsize - 1; i >= 0; i--) if (CHIN(cs, i)) CHsub(cs, i); else CHadd(cs, i); if (p->g->cflags®_NEWLINE) CHsub(cs, '\n'); if (cs->multis != NULL) mcinvert(p, cs); } assert(cs->multis == NULL); /* xxx */ if (nch(p, cs) == 1) { /* optimize singleton sets */ ordinary(p, firstch(p, cs)); freeset(p, cs); } else EMIT(OANYOF, freezeset(p, cs)); } /* - p_b_term - parse one term of a bracketed character list == static void p_b_term(register struct parse *p, register cset *cs); */ static void p_b_term(p, cs) register struct parse *p; register cset *cs; { register char c; register char start, finish; register int i; /* classify what we've got */ switch ((MORE()) ? PEEK() : '\0') { case '[': c = (MORE2()) ? 
PEEK2() : '\0'; break; case '-': SETERROR(REG_ERANGE); return; /* NOTE RETURN */ break; default: c = '\0'; break; } switch (c) { case ':': /* character class */ NEXT2(); (void)REQUIRE(MORE(), REG_EBRACK); c = PEEK(); (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); p_b_cclass(p, cs); (void)REQUIRE(MORE(), REG_EBRACK); (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); break; case '=': /* equivalence class */ NEXT2(); (void)REQUIRE(MORE(), REG_EBRACK); c = PEEK(); (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); p_b_eclass(p, cs); (void)REQUIRE(MORE(), REG_EBRACK); (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); break; default: /* symbol, ordinary character, or range */ /* xxx revision needed for multichar stuff */ start = p_b_symbol(p); if (SEE('-') && MORE2() && PEEK2() != ']') { /* range */ NEXT(); if (EAT('-')) finish = '-'; else finish = p_b_symbol(p); } else finish = start; if (start == finish) CHadd(cs, start); else { // remove collate stuff --SDM #if 0 if (__collate_load_error) { (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE); #endif for (i = (uch)start; i <= (uch)finish; i++) CHadd(cs, i); #if 0 } else { (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE); for (i = CHAR_MIN; i <= CHAR_MAX; i++) { if ( __collate_range_cmp(start, i) <= 0 && __collate_range_cmp(i, finish) <= 0 ) CHadd(cs, i); } } #endif } break; } } /* - p_b_cclass - parse a character-class name and deal with it == static void p_b_cclass(register struct parse *p, register cset *cs); */ static void p_b_cclass(p, cs) register struct parse *p; register cset *cs; { register int c; register char *sp = p->next; register struct cclass *cp; register size_t len; while (MORE() && isalpha((uch)PEEK())) NEXT(); len = p->next - sp; for (cp = cclasses; cp->name != NULL; cp++) if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') break; if (cp->name == NULL) { /* oops, didn't find it */ SETERROR(REG_ECTYPE); return; } switch (cp->fidx) { case CALNUM: for (c = CHAR_MIN; c <= CHAR_MAX; 
c++) if (isalnum((uch)c)) CHadd(cs, c); break; case CALPHA: for (c = CHAR_MIN; c <= CHAR_MAX; c++) if (isalpha((uch)c)) CHadd(cs, c); break; case CBLANK: for (c = CHAR_MIN; c <= CHAR_MAX; c++) if (isblank((uch)c)) CHadd(cs, c); break; case CCNTRL: for (c = CHAR_MIN; c <= CHAR_MAX; c++) if (iscntrl((uch)c)) CHadd(cs, c); break; case CDIGIT: for (c = CHAR_MIN; c <= CHAR_MAX; c++) if (isdigit((uch)c)) CHadd(cs, c); break; case CGRAPH: for (c = CHAR_MIN; c <= CHAR_MAX; c++) if (isgraph((uch)c)) CHadd(cs, c); break; case CLOWER: for (c = CHAR_MIN; c <= CHAR_MAX; c++) if (islower((uch)c)) CHadd(cs, c); break; case CPRINT: for (c = CHAR_MIN; c <= CHAR_MAX; c++) if (isprint((uch)c)) CHadd(cs, c); break; case CPUNCT: for (c = CHAR_MIN; c <= CHAR_MAX; c++) if (ispunct((uch)c)) CHadd(cs, c); break; case CSPACE: for (c = CHAR_MIN; c <= CHAR_MAX; c++) if (isspace((uch)c)) CHadd(cs, c); break; case CUPPER: for (c = CHAR_MIN; c <= CHAR_MAX; c++) if (isupper((uch)c)) CHadd(cs, c); break; case CXDIGIT: for (c = CHAR_MIN; c <= CHAR_MAX; c++) if (isxdigit((uch)c)) CHadd(cs, c); break; } #if 0 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) MCadd(p, cs, u); #endif } /* - p_b_eclass - parse an equivalence-class name and deal with it == static void p_b_eclass(register struct parse *p, register cset *cs); * * This implementation is incomplete. 
xxx */ static void p_b_eclass(p, cs) register struct parse *p; register cset *cs; { register char c; c = p_b_coll_elem(p, '='); CHadd(cs, c); } /* - p_b_symbol - parse a character or [..]ed multicharacter collating symbol == static char p_b_symbol(register struct parse *p); */ static char /* value of symbol */ p_b_symbol(p) register struct parse *p; { register char value; (void)REQUIRE(MORE(), REG_EBRACK); if (!EATTWO('[', '.')) return(GETNEXT()); /* collating symbol */ value = p_b_coll_elem(p, '.'); (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); return(value); } /* - p_b_coll_elem - parse a collating-element name and look it up == static char p_b_coll_elem(register struct parse *p, int endc); */ static char /* value of collating element */ p_b_coll_elem(p, endc) register struct parse *p; int endc; /* name ended by endc,']' */ { register char *sp = p->next; register struct cname *cp; register int len; while (MORE() && !SEETWO(endc, ']')) NEXT(); if (!MORE()) { SETERROR(REG_EBRACK); return(0); } len = p->next - sp; for (cp = cnames; cp->name != NULL; cp++) if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') return(cp->code); /* known name */ if (len == 1) return(*sp); /* single character */ SETERROR(REG_ECOLLATE); /* neither */ return(0); } /* - othercase - return the case counterpart of an alphabetic == static char othercase(int ch); */ static char /* if no counterpart, return ch */ othercase(ch) int ch; { ch = (uch)ch; assert(isalpha(ch)); if (isupper(ch)) return(tolower(ch)); else if (islower(ch)) return(toupper(ch)); else /* peculiar, but could happen */ return(ch); } /* - bothcases - emit a dualcase version of a two-case character == static void bothcases(register struct parse *p, int ch); * * Boy, is this implementation ever a kludge... 
*/ static void bothcases(p, ch) register struct parse *p; int ch; { register char *oldnext = p->next; register char *oldend = p->end; char bracket[3]; ch = (uch)ch; assert(othercase(ch) != ch); /* p_bracket() would recurse */ p->next = bracket; p->end = bracket+2; bracket[0] = ch; bracket[1] = ']'; bracket[2] = '\0'; p_bracket(p); assert(p->next == bracket+2); p->next = oldnext; p->end = oldend; } /* - ordinary - emit an ordinary character == static void ordinary(register struct parse *p, register int ch); */ static void ordinary(p, ch) register struct parse *p; register int ch; { register cat_t *cap = p->g->categories; if ((p->g->cflags®_ICASE) && isalpha((uch)ch) && othercase(ch) != ch) bothcases(p, ch); else { EMIT(OCHAR, (uch)ch); if (cap[ch] == 0) cap[ch] = p->g->ncategories++; } } /* - nonnewline - emit REG_NEWLINE version of OANY == static void nonnewline(register struct parse *p); * * Boy, is this implementation ever a kludge... */ static void nonnewline(p) register struct parse *p; { register char *oldnext = p->next; register char *oldend = p->end; char bracket[4]; p->next = bracket; p->end = bracket+3; bracket[0] = '^'; bracket[1] = '\n'; bracket[2] = ']'; bracket[3] = '\0'; p_bracket(p); assert(p->next == bracket+3); p->next = oldnext; p->end = oldend; } /* - repeat - generate code for a bounded repetition, recursively if needed == static void repeat(register struct parse *p, sopno start, int from, int to); */ static void repeat(p, start, from, to) register struct parse *p; sopno start; /* operand from here to end of strip */ int from; /* repeated from this number */ int to; /* to this number of times (maybe INFINITY) */ { register sopno finish = HERE(); # define N 2 # define INF 3 # define REP(f, t) ((f)*8 + (t)) # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? 
INF : N) register sopno copy; if (p->error != 0) /* head off possible runaway recursion */ return; assert(from <= to); switch (REP(MAP(from), MAP(to))) { case REP(0, 0): /* must be user doing this */ DROP(finish-start); /* drop the operand */ break; case REP(0, 1): /* as x{1,1}? */ case REP(0, N): /* as x{1,n}? */ case REP(0, INF): /* as x{1,}? */ /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ INSERT(OCH_, start); /* offset is wrong... */ repeat(p, start+1, 1, to); ASTERN(OOR1, start); AHEAD(start); /* ... fix it */ EMIT(OOR2, 0); AHEAD(THERE()); ASTERN(O_CH, THERETHERE()); break; case REP(1, 1): /* trivial case */ /* done */ break; case REP(1, N): /* as x?x{1,n-1} */ /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ INSERT(OCH_, start); ASTERN(OOR1, start); AHEAD(start); EMIT(OOR2, 0); /* offset very wrong... */ AHEAD(THERE()); /* ...so fix it */ ASTERN(O_CH, THERETHERE()); copy = dupl(p, start+1, finish+1); assert(copy == finish+4); repeat(p, copy, 1, to-1); break; case REP(1, INF): /* as x+ */ INSERT(OPLUS_, start); ASTERN(O_PLUS, start); break; case REP(N, N): /* as xx{m-1,n-1} */ copy = dupl(p, start, finish); repeat(p, copy, from-1, to-1); break; case REP(N, INF): /* as xx{n-1,INF} */ copy = dupl(p, start, finish); repeat(p, copy, from-1, to); break; default: /* "can't happen" */ SETERROR(REG_ASSERT); /* just in case */ break; } } /* - seterr - set an error condition == static int seterr(register struct parse *p, int e); */ static int /* useless but makes type checking happy */ seterr(p, e) register struct parse *p; int e; { if (p->error == 0) /* keep earliest error condition */ p->error = e; p->next = nuls; /* try to bring things to a halt */ p->end = nuls; return(0); /* make the return value well-defined */ } /* - allocset - allocate a set of characters for [] == static cset *allocset(register struct parse *p); */ static cset * allocset(p) register struct parse *p; { register int no = p->g->ncsets++; register size_t nc; register size_t 
nbytes; register cset *cs; register size_t css = (size_t)p->g->csetsize; register int i; if (no >= p->ncsalloc) { /* need another column of space */ p->ncsalloc += CHAR_BIT; nc = p->ncsalloc; assert(nc % CHAR_BIT == 0); nbytes = nc / CHAR_BIT * css; if (p->g->sets == NULL) p->g->sets = (cset *)malloc(nc * sizeof(cset)); else p->g->sets = (cset *)reallocf((char *)p->g->sets, nc * sizeof(cset)); if (p->g->setbits == NULL) p->g->setbits = (uch *)malloc(nbytes); else { p->g->setbits = (uch *)reallocf((char *)p->g->setbits, nbytes); /* xxx this isn't right if setbits is now NULL */ for (i = 0; i < no; i++) p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); } if (p->g->sets != NULL && p->g->setbits != NULL) (void) memset((char *)p->g->setbits + (nbytes - css), 0, css); else { no = 0; SETERROR(REG_ESPACE); /* caller's responsibility not to do set ops */ } } assert(p->g->sets != NULL); /* xxx */ cs = &p->g->sets[no]; cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); cs->mask = 1 << ((no) % CHAR_BIT); cs->hash = 0; cs->smultis = 0; cs->multis = NULL; return(cs); } /* - freeset - free a now-unused set == static void freeset(register struct parse *p, register cset *cs); */ static void freeset(p, cs) register struct parse *p; register cset *cs; { register int i; register cset *top = &p->g->sets[p->g->ncsets]; register size_t css = (size_t)p->g->csetsize; for (i = 0; i < css; i++) CHsub(cs, i); if (cs == top-1) /* recover only the easy case */ p->g->ncsets--; } /* - freezeset - final processing on a set of characters == static int freezeset(register struct parse *p, register cset *cs); * * The main task here is merging identical sets. This is usually a waste * of time (although the hash code minimizes the overhead), but can win * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash * is done using addition rather than xor -- all ASCII [aA] sets xor to * the same value! 
*/ static int /* set number */ freezeset(p, cs) register struct parse *p; register cset *cs; { register short h = cs->hash; register int i; register cset *top = &p->g->sets[p->g->ncsets]; register cset *cs2; register size_t css = (size_t)p->g->csetsize; /* look for an earlier one which is the same */ for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) if (cs2->hash == h && cs2 != cs) { /* maybe */ for (i = 0; i < css; i++) if (!!CHIN(cs2, i) != !!CHIN(cs, i)) break; /* no */ if (i == css) break; /* yes */ } if (cs2 < top) { /* found one */ freeset(p, cs); cs = cs2; } return((int)(cs - p->g->sets)); } /* - firstch - return first character in a set (which must have at least one) == static int firstch(register struct parse *p, register cset *cs); */ static int /* character; there is no "none" value */ firstch(p, cs) register struct parse *p; register cset *cs; { register int i; register size_t css = (size_t)p->g->csetsize; for (i = 0; i < css; i++) if (CHIN(cs, i)) return((char)i); assert(never); return(0); /* arbitrary */ } /* - nch - number of characters in a set == static int nch(register struct parse *p, register cset *cs); */ static int nch(p, cs) register struct parse *p; register cset *cs; { register int i; register size_t css = (size_t)p->g->csetsize; register int n = 0; for (i = 0; i < css; i++) if (CHIN(cs, i)) n++; return(n); } /* - mcadd - add a collating element to a cset == static void mcadd(register struct parse *p, register cset *cs, \ == register char *cp); */ static void mcadd(p, cs, cp) register struct parse *p; register cset *cs; register char *cp; { register size_t oldend = cs->smultis; cs->smultis += strlen(cp) + 1; if (cs->multis == NULL) cs->multis = malloc(cs->smultis); else cs->multis = reallocf(cs->multis, cs->smultis); if (cs->multis == NULL) { SETERROR(REG_ESPACE); return; } (void) strcpy(cs->multis + oldend - 1, cp); cs->multis[cs->smultis - 1] = '\0'; } #if used /* - mcsub - subtract a collating element from a cset == static void mcsub(register 
cset *cs, register char *cp); */ static void mcsub(cs, cp) register cset *cs; register char *cp; { register char *fp = mcfind(cs, cp); register size_t len = strlen(fp); assert(fp != NULL); (void) memmove(fp, fp + len + 1, cs->smultis - (fp + len + 1 - cs->multis)); cs->smultis -= len; if (cs->smultis == 0) { free(cs->multis); cs->multis = NULL; return; } cs->multis = reallocf(cs->multis, cs->smultis); assert(cs->multis != NULL); } /* - mcin - is a collating element in a cset? == static int mcin(register cset *cs, register char *cp); */ static int mcin(cs, cp) register cset *cs; register char *cp; { return(mcfind(cs, cp) != NULL); } /* - mcfind - find a collating element in a cset == static char *mcfind(register cset *cs, register char *cp); */ static char * mcfind(cs, cp) register cset *cs; register char *cp; { register char *p; if (cs->multis == NULL) return(NULL); for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) if (strcmp(cp, p) == 0) return(p); return(NULL); } #endif /* - mcinvert - invert the list of collating elements in a cset == static void mcinvert(register struct parse *p, register cset *cs); * * This would have to know the set of possibilities. Implementation * is deferred. */ static void mcinvert(p, cs) register struct parse *p; register cset *cs; { assert(cs->multis == NULL); /* xxx */ } /* - mccase - add case counterparts of the list of collating elements in a cset == static void mccase(register struct parse *p, register cset *cs); * * This would have to know the set of possibilities. Implementation * is deferred. */ static void mccase(p, cs) register struct parse *p; register cset *cs; { assert(cs->multis == NULL); /* xxx */ } /* - isinsets - is this character in any sets? 
== static int isinsets(register struct re_guts *g, int c); */ static int /* predicate */ isinsets(g, c) register struct re_guts *g; int c; { register uch *col; register int i; register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; register unsigned uc = (uch)c; for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) if (col[uc] != 0) return(1); return(0); } /* - samesets - are these two characters in exactly the same sets? == static int samesets(register struct re_guts *g, int c1, int c2); */ static int /* predicate */ samesets(g, c1, c2) register struct re_guts *g; int c1; int c2; { register uch *col; register int i; register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; register unsigned uc1 = (uch)c1; register unsigned uc2 = (uch)c2; for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) if (col[uc1] != col[uc2]) return(0); return(1); } /* - categorize - sort out character categories == static void categorize(struct parse *p, register struct re_guts *g); */ static void categorize(p, g) struct parse *p; register struct re_guts *g; { register cat_t *cats = g->categories; register int c; register int c2; register cat_t cat; /* avoid making error situations worse */ if (p->error != 0) return; for (c = CHAR_MIN; c <= CHAR_MAX; c++) if (cats[c] == 0 && isinsets(g, c)) { cat = g->ncategories++; cats[c] = cat; for (c2 = c+1; c2 <= CHAR_MAX; c2++) if (cats[c2] == 0 && samesets(g, c, c2)) cats[c2] = cat; } } /* - dupl - emit a duplicate of a bunch of sops == static sopno dupl(register struct parse *p, sopno start, sopno finish); */ static sopno /* start of duplicate */ dupl(p, start, finish) register struct parse *p; sopno start; /* from here */ sopno finish; /* to this less one */ { register sopno ret = HERE(); register sopno len = finish - start; assert(finish >= start); if (len == 0) return(ret); enlarge(p, p->ssize + len); /* this many unexpected additions */ assert(p->ssize >= p->slen + len); (void) memcpy((char *)(p->strip + p->slen), 
(char *)(p->strip + start), (size_t)len*sizeof(sop)); p->slen += len; return(ret); } /* - doemit - emit a strip operator == static void doemit(register struct parse *p, sop op, size_t opnd); * * It might seem better to implement this as a macro with a function as * hard-case backup, but it's just too big and messy unless there are * some changes to the data structures. Maybe later. */ static void doemit(p, op, opnd) register struct parse *p; sop op; size_t opnd; { /* avoid making error situations worse */ if (p->error != 0) return; /* deal with oversize operands ("can't happen", more or less) */ assert(opnd < 1<slen >= p->ssize) enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ assert(p->slen < p->ssize); /* finally, it's all reduced to the easy case */ p->strip[p->slen++] = SOP(op, opnd); } /* - doinsert - insert a sop into the strip == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos); */ static void doinsert(p, op, opnd, pos) register struct parse *p; sop op; size_t opnd; sopno pos; { register sopno sn; register sop s; register int i; /* avoid making error situations worse */ if (p->error != 0) return; sn = HERE(); EMIT(op, opnd); /* do checks, ensure space */ assert(HERE() == sn+1); s = p->strip[sn]; /* adjust paren pointers */ assert(pos > 0); for (i = 1; i < NPAREN; i++) { if (p->pbegin[i] >= pos) { p->pbegin[i]++; } if (p->pend[i] >= pos) { p->pend[i]++; } } memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], (HERE()-pos-1)*sizeof(sop)); p->strip[pos] = s; } /* - dofwd - complete a forward reference == static void dofwd(register struct parse *p, sopno pos, sop value); */ static void dofwd(p, pos, value) register struct parse *p; register sopno pos; sop value; { /* avoid making error situations worse */ if (p->error != 0) return; assert(value < 1<strip[pos] = OP(p->strip[pos]) | value; } /* - enlarge - enlarge the strip == static void enlarge(register struct parse *p, sopno size); */ static void enlarge(p, size) register struct 
parse *p; register sopno size; { register sop *sp; if (p->ssize >= size) return; sp = (sop *)realloc(p->strip, size*sizeof(sop)); if (sp == NULL) { SETERROR(REG_ESPACE); return; } p->strip = sp; p->ssize = size; } /* - stripsnug - compact the strip == static void stripsnug(register struct parse *p, register struct re_guts *g); */ static void stripsnug(p, g) register struct parse *p; register struct re_guts *g; { g->nstates = p->slen; g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); if (g->strip == NULL) { SETERROR(REG_ESPACE); g->strip = p->strip; } } /* - findmust - fill in must and mlen with longest mandatory literal string == static void findmust(register struct parse *p, register struct re_guts *g); * * This algorithm could do fancy things like analyzing the operands of | * for common subsequences. Someday. This code is simple and finds most * of the interesting cases. * * Note that must and mlen got initialized during setup. */ static void findmust(p, g) struct parse *p; register struct re_guts *g; { register sop *scan; sop *start; register sop *newstart; register sopno newlen; register sop s; register char *cp; register sopno i; int offset; int cs, mccs; /* avoid making error situations worse */ if (p->error != 0) return; /* Find out if we can handle OANYOF or not */ mccs = 0; for (cs = 0; cs < g->ncsets; cs++) if (g->sets[cs].multis != NULL) mccs = 1; /* find the longest OCHAR sequence in strip */ newlen = 0; offset = 0; g->moffset = 0; scan = g->strip + 1; do { s = *scan++; switch (OP(s)) { case OCHAR: /* sequence member */ if (newlen == 0) /* new sequence */ newstart = scan - 1; newlen++; break; case OPLUS_: /* things that don't break one */ case OLPAREN: case ORPAREN: break; case OQUEST_: /* things that must be skipped */ case OCH_: offset = altoffset(scan, offset, mccs); scan--; do { scan += OPND(s); s = *scan; /* assert() interferes w debug printouts */ if (OP(s) != O_QUEST && OP(s) != O_CH && OP(s) != OOR2) { g->iflags |= BAD; 
return; } } while (OP(s) != O_QUEST && OP(s) != O_CH); /* fallthrough */ case OBOW: /* things that break a sequence */ case OEOW: case OBOL: case OEOL: case O_QUEST: case O_CH: case OEND: if (newlen > g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; if (offset > -1) { g->moffset += offset; offset = newlen; } else g->moffset = offset; } else { if (offset > -1) offset += newlen; } newlen = 0; break; case OANY: if (newlen > g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; if (offset > -1) { g->moffset += offset; offset = newlen; } else g->moffset = offset; } else { if (offset > -1) offset += newlen; } if (offset > -1) offset++; newlen = 0; break; case OANYOF: /* may or may not invalidate offset */ /* First, everything as OANY */ if (newlen > g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; if (offset > -1) { g->moffset += offset; offset = newlen; } else g->moffset = offset; } else { if (offset > -1) offset += newlen; } if (offset > -1) offset++; newlen = 0; /* And, now, if we found out we can't deal with * it, make offset = -1. */ if (mccs) offset = -1; break; default: /* Anything here makes it impossible or too hard * to calculate the offset -- so we give up; * save the last known good offset, in case the * must sequence doesn't occur later. 
*/ if (newlen > g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; if (offset > -1) g->moffset += offset; else g->moffset = offset; } offset = -1; newlen = 0; break; } } while (OP(s) != OEND); if (g->mlen == 0) { /* there isn't one */ g->moffset = -1; return; } /* turn it into a character string */ g->must = malloc((size_t)g->mlen + 1); if (g->must == NULL) { /* argh; just forget it */ g->mlen = 0; g->moffset = -1; return; } cp = g->must; scan = start; for (i = g->mlen; i > 0; i--) { while (OP(s = *scan++) != OCHAR) continue; assert(cp < g->must + g->mlen); *cp++ = (char)OPND(s); } assert(cp == g->must + g->mlen); *cp++ = '\0'; /* just on general principles */ } /* - altoffset - choose biggest offset among multiple choices == static int altoffset(sop *scan, int offset, int mccs); * * Compute, recursively if necessary, the largest offset among multiple * re paths. */ static int altoffset(scan, offset, mccs) sop *scan; int offset; int mccs; { int largest; int try; sop s; /* If we gave up already on offsets, return */ if (offset == -1) return -1; largest = 0; try = 0; s = *scan++; while (OP(s) != O_QUEST && OP(s) != O_CH) { switch (OP(s)) { case OOR1: if (try > largest) largest = try; try = 0; break; case OQUEST_: case OCH_: try = altoffset(scan, try, mccs); if (try == -1) return -1; scan--; do { scan += OPND(s); s = *scan; if (OP(s) != O_QUEST && OP(s) != O_CH && OP(s) != OOR2) return -1; } while (OP(s) != O_QUEST && OP(s) != O_CH); /* We must skip to the next position, or we'll * leave altoffset() too early. 
*/ scan++; break; case OANYOF: if (mccs) return -1; case OCHAR: case OANY: try++; case OBOW: case OEOW: case OLPAREN: case ORPAREN: case OOR2: break; default: try = -1; break; } if (try == -1) return -1; s = *scan++; } if (try > largest) largest = try; return largest+offset; } /* - computejumps - compute char jumps for BM scan == static void computejumps(register struct parse *p, register struct re_guts *g); * * This algorithm assumes g->must exists and is has size greater than * zero. It's based on the algorithm found on Computer Algorithms by * Sara Baase. * * A char jump is the number of characters one needs to jump based on * the value of the character from the text that was mismatched. */ static void computejumps(p, g) struct parse *p; struct re_guts *g; { int ch; int mindex; /* Avoid making errors worse */ if (p->error != 0) return; g->charjump = (int*) malloc((NC + 1) * sizeof(int)); if (g->charjump == NULL) /* Not a fatal error */ return; /* Adjust for signed chars, if necessary */ g->charjump = &g->charjump[-(CHAR_MIN)]; /* If the character does not exist in the pattern, the jump * is equal to the number of characters in the pattern. */ for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) g->charjump[ch] = g->mlen; /* If the character does exist, compute the jump that would * take us to the last character in the pattern equal to it * (notice that we match right to left, so that last character * is the first one that would be matched). */ for (mindex = 0; mindex < g->mlen; mindex++) g->charjump[g->must[mindex]] = g->mlen - mindex - 1; } /* - computematchjumps - compute match jumps for BM scan == static void computematchjumps(register struct parse *p, register struct re_guts *g); * * This algorithm assumes g->must exists and is has size greater than * zero. It's based on the algorithm found on Computer Algorithms by * Sara Baase. * * A match jump is the number of characters one needs to advance based * on the already-matched suffix. 
* Notice that all values here are minus (g->mlen-1), because of the way * the search algorithm works. */ static void computematchjumps(p, g) struct parse *p; struct re_guts *g; { int mindex; /* General "must" iterator */ int suffix; /* Keeps track of matching suffix */ int ssuffix; /* Keeps track of suffixes' suffix */ int* pmatches; /* pmatches[k] points to the next i * such that i+1...mlen is a substring * of k+1...k+mlen-i-1 */ /* Avoid making errors worse */ if (p->error != 0) return; pmatches = (int*) malloc(g->mlen * sizeof(unsigned int)); if (pmatches == NULL) { g->matchjump = NULL; return; } g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int)); if (g->matchjump == NULL) /* Not a fatal error */ return; /* Set maximum possible jump for each character in the pattern */ for (mindex = 0; mindex < g->mlen; mindex++) g->matchjump[mindex] = 2*g->mlen - mindex - 1; /* Compute pmatches[] */ for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; mindex--, suffix--) { pmatches[mindex] = suffix; /* If a mismatch is found, interrupting the substring, * compute the matchjump for that position. If no * mismatch is found, then a text substring mismatched * against the suffix will also mismatch against the * substring. */ while (suffix < g->mlen && g->must[mindex] != g->must[suffix]) { g->matchjump[suffix] = MIN(g->matchjump[suffix], g->mlen - mindex - 1); suffix = pmatches[suffix]; } } /* Compute the matchjump up to the last substring found to jump * to the beginning of the largest must pattern prefix matching * it's own suffix. 
*/ for (mindex = 0; mindex <= suffix; mindex++) g->matchjump[mindex] = MIN(g->matchjump[mindex], g->mlen + suffix - mindex); ssuffix = pmatches[suffix]; while (suffix < g->mlen) { while (suffix <= ssuffix && suffix < g->mlen) { g->matchjump[suffix] = MIN(g->matchjump[suffix], g->mlen + ssuffix - suffix); suffix++; } if (suffix < g->mlen) ssuffix = pmatches[ssuffix]; } free(pmatches); } /* - pluscount - count + nesting == static sopno pluscount(register struct parse *p, register struct re_guts *g); */ static sopno /* nesting depth */ pluscount(p, g) struct parse *p; register struct re_guts *g; { register sop *scan; register sop s; register sopno plusnest = 0; register sopno maxnest = 0; if (p->error != 0) return(0); /* there may not be an OEND */ scan = g->strip + 1; do { s = *scan++; switch (OP(s)) { case OPLUS_: plusnest++; break; case O_PLUS: if (plusnest > maxnest) maxnest = plusnest; plusnest--; break; } } while (OP(s) != OEND); if (plusnest != 0) g->iflags |= BAD; return(maxnest); } regex-posix-0.95.2/cbits/regerror.c0000644000000000000000000001262711756216545015372 0ustar0000000000000000/*- * Copyright (c) 1992, 1993, 1994 Henry Spencer. * Copyright (c) 1992, 1993, 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Henry Spencer. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)regerror.c 8.4 (Berkeley) 3/20/94 */ #if defined(LIBC_SCCS) && !defined(lint) static char sccsid[] = "@(#)regerror.c 8.4 (Berkeley) 3/20/94"; #endif /* LIBC_SCCS and not lint */ #include #include #include #include #include #include "regex.h" #include "utils.h" /* ========= begin header generated by ./mkh ========= */ #ifdef __cplusplus extern "C" { #endif /* === regerror.c === */ static char *regatoi (const regex_t *preg, char *localbuf); #ifdef __cplusplus } #endif /* ========= end header generated by ./mkh ========= */ /* = #define REG_NOMATCH 1 = #define REG_BADPAT 2 = #define REG_ECOLLATE 3 = #define REG_ECTYPE 4 = #define REG_EESCAPE 5 = #define REG_ESUBREG 6 = #define REG_EBRACK 7 = #define REG_EPAREN 8 = #define REG_EBRACE 9 = #define REG_BADBR 10 = #define REG_ERANGE 11 = #define REG_ESPACE 12 = #define REG_BADRPT 13 = #define REG_EMPTY 14 = #define REG_ASSERT 15 = #define REG_INVARG 16 = #define REG_ATOI 255 // convert name to number (!) = #define REG_ITOA 0400 // convert number to name (!) 
*/ static struct rerr { int code; char *name; char *explain; } rerrs[] = { {REG_NOMATCH, "REG_NOMATCH", "regexec() failed to match"}, {REG_BADPAT, "REG_BADPAT", "invalid regular expression"}, {REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element"}, {REG_ECTYPE, "REG_ECTYPE", "invalid character class"}, {REG_EESCAPE, "REG_EESCAPE", "trailing backslash (\\)"}, {REG_ESUBREG, "REG_ESUBREG", "invalid backreference number"}, {REG_EBRACK, "REG_EBRACK", "brackets ([ ]) not balanced"}, {REG_EPAREN, "REG_EPAREN", "parentheses not balanced"}, {REG_EBRACE, "REG_EBRACE", "braces not balanced"}, {REG_BADBR, "REG_BADBR", "invalid repetition count(s)"}, {REG_ERANGE, "REG_ERANGE", "invalid character range"}, {REG_ESPACE, "REG_ESPACE", "out of memory"}, {REG_BADRPT, "REG_BADRPT", "repetition-operator operand invalid"}, {REG_EMPTY, "REG_EMPTY", "empty (sub)expression"}, {REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug"}, {REG_INVARG, "REG_INVARG", "invalid argument to regex routine"}, {0, "", "*** unknown regexp error code ***"} }; /* - regerror - the interface to error numbers = extern size_t regerror(int, const regex_t *, char *, size_t); */ /* ARGSUSED */ size_t regerror(errcode, preg, errbuf, errbuf_size) int errcode; const regex_t *preg; char *errbuf; size_t errbuf_size; { register struct rerr *r; register size_t len; register int target = errcode &~ REG_ITOA; register char *s; char convbuf[50]; if (errcode == REG_ATOI) s = regatoi(preg, convbuf); else { for (r = rerrs; r->code != 0; r++) if (r->code == target) break; if (errcode®_ITOA) { if (r->code != 0) (void) strcpy(convbuf, r->name); else sprintf(convbuf, "REG_0x%x", target); assert(strlen(convbuf) < sizeof(convbuf)); s = convbuf; } else s = r->explain; } len = strlen(s) + 1; if (errbuf_size > 0) { if (errbuf_size > len) (void) strcpy(errbuf, s); else { (void) strncpy(errbuf, s, errbuf_size-1); errbuf[errbuf_size-1] = '\0'; } } return(len); } /* - regatoi - internal routine to implement REG_ATOI == static 
char *regatoi(const regex_t *preg, char *localbuf);
 *
 * Scans rerrs for the entry whose name equals the string held in
 * preg->re_endp and writes that entry's numeric code, in decimal, into
 * localbuf.  Returns localbuf on a hit, or the string literal "0" when no
 * name matches.  localbuf has to be large enough for a decimal int.
 */
static char *
regatoi(preg, localbuf)
const regex_t *preg;
char *localbuf;
{
    register struct rerr *r;

    /* the table ends with a code-0 sentinel, so r is always valid here */
    for (r = rerrs; r->code != 0; r++)
        if (strcmp(r->name, preg->re_endp) == 0)
            break;
    if (r->code == 0)
        return("0");    /* unknown name */

    sprintf(localbuf, "%d", r->code);
    return(localbuf);
}
regex-posix-0.95.2/cbits/regex.h0000644000000000000000000000717511756216545014664 0ustar0000000000000000/*- * Copyright (c) 1992 Henry Spencer. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Henry Spencer of the University of Toronto. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)regex.h 8.2 (Berkeley) 1/3/94 */ #ifndef _REGEX_H_ #define _REGEX_H_ #include /* types */ typedef off_t regoff_t; typedef struct { int re_magic; size_t re_nsub; /* number of parenthesized subexpressions */ __const char *re_endp; /* end pointer for REG_PEND */ struct re_guts *re_g; /* none of your business :-) */ } regex_t; typedef struct { regoff_t rm_so; /* start of match */ regoff_t rm_eo; /* end of match */ } regmatch_t; /* regcomp() flags */ #define REG_BASIC 0000 #define REG_EXTENDED 0001 #define REG_ICASE 0002 #define REG_NOSUB 0004 #define REG_NEWLINE 0010 #define REG_NOSPEC 0020 #define REG_PEND 0040 #define REG_DUMP 0200 /* regerror() flags */ #define REG_NOMATCH 1 #define REG_BADPAT 2 #define REG_ECOLLATE 3 #define REG_ECTYPE 4 #define REG_EESCAPE 5 #define REG_ESUBREG 6 #define REG_EBRACK 7 #define REG_EPAREN 8 #define REG_EBRACE 9 #define REG_BADBR 10 #define REG_ERANGE 11 #define REG_ESPACE 12 #define REG_BADRPT 13 #define REG_EMPTY 14 #define REG_ASSERT 15 #define REG_INVARG 16 #define REG_ATOI 255 /* convert name to number (!) */ #define REG_ITOA 0400 /* convert number to name (!) 
*/ /* regexec() flags */ #define REG_NOTBOL 00001 #define REG_NOTEOL 00002 #define REG_STARTEND 00004 #define REG_TRACE 00400 /* tracing of execution */ #define REG_LARGE 01000 /* force large representation */ #define REG_BACKR 02000 /* force use of backref code */ #ifdef __cplusplus extern "C" { #endif int regcomp (regex_t *, const char *, int); size_t regerror (int, const regex_t *, char *, size_t); int regexec (const regex_t *, const char *, size_t, regmatch_t [], int); void regfree (regex_t *); #ifdef __cplusplus } #endif #endif /* !_REGEX_H_ */ regex-posix-0.95.2/cbits/regex2.h0000644000000000000000000001704211756216545014740 0ustar0000000000000000/*- * Copyright (c) 1992, 1993, 1994 Henry Spencer. * Copyright (c) 1992, 1993, 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Henry Spencer. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)regex2.h 8.4 (Berkeley) 3/20/94 * * $FreeBSD: src/lib/libc/regex/regex2.h,v 1.3.6.1 2000/07/31 06:30:37 dcs Exp $ */ /* * First, the stuff that ends up in the outside-world include file = typedef off_t regoff_t; = typedef struct { = int re_magic; = size_t re_nsub; // number of parenthesized subexpressions = const char *re_endp; // end pointer for REG_PEND = struct re_guts *re_g; // none of your business :-) = } regex_t; = typedef struct { = regoff_t rm_so; // start of match = regoff_t rm_eo; // end of match = } regmatch_t; */ /* * internals of regex_t */ #define MAGIC1 ((('r'^0200)<<8) | 'e') /* * The internal representation is a *strip*, a sequence of * operators ending with an endmarker. (Some terminology etc. is a * historical relic of earlier versions which used multiple strips.) * Certain oddities in the representation are there to permit running * the machinery backwards; in particular, any deviation from sequential * flow must be marked at both its source and its destination. Some * fine points: * * - OPLUS_ and O_PLUS are *inside* the loop they create. * - OQUEST_ and O_QUEST are *outside* the bypass they create. 
* - OCH_ and O_CH are *outside* the multi-way branch they create, while * OOR1 and OOR2 are respectively the end and the beginning of one of * the branches. Note that there is an implicit OOR2 following OCH_ * and an implicit OOR1 preceding O_CH. * * In state representations, an operator's bit is on to signify a state * immediately *preceding* "execution" of that operator. */ typedef unsigned long sop; /* strip operator */ typedef long sopno; #define OPRMASK 0xf8000000L #define OPDMASK 0x07ffffffL #define OPSHIFT ((unsigned)27) #define OP(n) ((n)&OPRMASK) #define OPND(n) ((n)&OPDMASK) #define SOP(op, opnd) ((op)|(opnd)) /* operators meaning operand */ /* (back, fwd are offsets) */ #define OEND (1L< uch [csetsize] */ uch mask; /* bit within array */ short hash; /* hash code */ size_t smultis; char *multis; /* -> char[smulti] ab\0cd\0ef\0\0 */ } cset; /* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */ #define CHadd(cs, c) ((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (uch)(c)) #define CHsub(cs, c) ((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (uch)(c)) #define CHIN(cs, c) ((cs)->ptr[(uch)(c)] & (cs)->mask) #define MCadd(p, cs, cp) mcadd(p, cs, cp) /* regcomp() internal fns */ #define MCsub(p, cs, cp) mcsub(p, cs, cp) #define MCin(p, cs, cp) mcin(p, cs, cp) /* stuff for character categories */ typedef unsigned char cat_t; /* * main compiled-expression structure */ struct re_guts { int magic; # define MAGIC2 ((('R'^0200)<<8)|'E') sop *strip; /* malloced area for strip */ int csetsize; /* number of bits in a cset vector */ int ncsets; /* number of csets in use */ cset *sets; /* -> cset [ncsets] */ uch *setbits; /* -> uch[csetsize][ncsets/CHAR_BIT] */ int cflags; /* copy of regcomp() cflags argument */ sopno nstates; /* = number of sops */ sopno firststate; /* the initial OEND (normally 0) */ sopno laststate; /* the final OEND */ int iflags; /* internal flags */ # define USEBOL 01 /* used ^ */ # define USEEOL 02 /* used $ */ # define BAD 04 
/* something wrong */ int nbol; /* number of ^ used */ int neol; /* number of $ used */ int ncategories; /* how many character categories */ cat_t *categories; /* ->catspace[-CHAR_MIN] */ char *must; /* match must contain this string */ int moffset; /* latest point at which must may be located */ int *charjump; /* Boyer-Moore char jump table */ int *matchjump; /* Boyer-Moore match jump table */ int mlen; /* length of must */ size_t nsub; /* copy of re_nsub */ int backrefs; /* does it use back references? */ sopno nplus; /* how deep does it nest +s? */ /* catspace must be last */ cat_t catspace[1]; /* actually [NC] */ }; /* misc utilities */ #define OUT (CHAR_MAX+1) /* a non-character value */ #define ISWORD(c) (isalnum((uch)(c)) || (c) == '_') regex-posix-0.95.2/cbits/regexec.c0000644000000000000000000001445611756216545015167 0ustar0000000000000000/*- * Copyright (c) 1992, 1993, 1994 Henry Spencer. * Copyright (c) 1992, 1993, 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Henry Spencer. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)regexec.c 8.3 (Berkeley) 3/20/94 */ #if defined(LIBC_SCCS) && !defined(lint) static char sccsid[] = "@(#)regexec.c 8.3 (Berkeley) 3/20/94"; #endif /* LIBC_SCCS and not lint */ /* * the outer shell of regexec() * * This file includes engine.c *twice*, after muchos fiddling with the * macros that code uses. This lets the same code operate on two different * representations for state sets. 
*/ #include #include #include #include #include #include #include "regex.h" #include "utils.h" #include "regex2.h" static int nope = 0; /* for use in asserts; shuts lint up */ /* macros for manipulating states, small version */ #define states long #define states1 states /* for later use in regexec() decision */ #define CLEAR(v) ((v) = 0) #define SET0(v, n) ((v) &= ~((unsigned long)1 << (n))) #define SET1(v, n) ((v) |= (unsigned long)1 << (n)) #define ISSET(v, n) (((v) & ((unsigned long)1 << (n))) != 0) #define ASSIGN(d, s) ((d) = (s)) #define EQ(a, b) ((a) == (b)) #define STATEVARS long dummy /* dummy version */ #define STATESETUP(m, n) /* nothing */ #define STATETEARDOWN(m) /* nothing */ #define SETUP(v) ((v) = 0) #define onestate long #define INIT(o, n) ((o) = (unsigned long)1 << (n)) #define INC(o) ((o) <<= 1) #define ISSTATEIN(v, o) (((v) & (o)) != 0) /* some abbreviations; note that some of these know variable names! */ /* do "if I'm here, I can also be there" etc without branches */ #define FWD(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) << (n)) #define BACK(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) >> (n)) #define ISSETBACK(v, n) (((v) & ((unsigned long)here >> (n))) != 0) /* function names */ #define SNAMES /* engine.c looks after details */ #include "engine.c" /* now undo things */ #undef states #undef CLEAR #undef SET0 #undef SET1 #undef ISSET #undef ASSIGN #undef EQ #undef STATEVARS #undef STATESETUP #undef STATETEARDOWN #undef SETUP #undef onestate #undef INIT #undef INC #undef ISSTATEIN #undef FWD #undef BACK #undef ISSETBACK #undef SNAMES /* macros for manipulating states, large version */ #define states char * #define CLEAR(v) memset(v, 0, m->g->nstates) #define SET0(v, n) ((v)[n] = 0) #define SET1(v, n) ((v)[n] = 1) #define ISSET(v, n) ((v)[n]) #define ASSIGN(d, s) memcpy(d, s, m->g->nstates) #define EQ(a, b) (memcmp(a, b, m->g->nstates) == 0) #define STATEVARS long vn; char *space #define STATESETUP(m, nv) { (m)->space = 
malloc((nv)*(m)->g->nstates); \ if ((m)->space == NULL) return(REG_ESPACE); \ (m)->vn = 0; } #define STATETEARDOWN(m) { free((m)->space); } #define SETUP(v) ((v) = &m->space[m->vn++ * m->g->nstates]) #define onestate long #define INIT(o, n) ((o) = (n)) #define INC(o) ((o)++) #define ISSTATEIN(v, o) ((v)[o]) /* some abbreviations; note that some of these know variable names! */ /* do "if I'm here, I can also be there" etc without branches */ #define FWD(dst, src, n) ((dst)[here+(n)] |= (src)[here]) #define BACK(dst, src, n) ((dst)[here-(n)] |= (src)[here]) #define ISSETBACK(v, n) ((v)[here - (n)]) /* function names */ #define LNAMES /* flag */ #include "engine.c" /* - regexec - interface for matching = extern int regexec(const regex_t *, const char *, size_t, \ = regmatch_t [], int); = #define REG_NOTBOL 00001 = #define REG_NOTEOL 00002 = #define REG_STARTEND 00004 = #define REG_TRACE 00400 // tracing of execution = #define REG_LARGE 01000 // force large representation = #define REG_BACKR 02000 // force use of backref code * * We put this here so we can exploit knowledge of the state representation * when choosing which matcher to call. Also, by this point the matchers * have been prototyped. 
*/ int /* 0 success, REG_NOMATCH failure */ regexec(preg, string, nmatch, pmatch, eflags) const regex_t *preg; const char *string; size_t nmatch; regmatch_t pmatch[]; int eflags; { register struct re_guts *g = preg->re_g; #ifdef REDEBUG # define GOODFLAGS(f) (f) #else # define GOODFLAGS(f) ((f)&(REG_NOTBOL|REG_NOTEOL|REG_STARTEND)) #endif if (preg->re_magic != MAGIC1 || g->magic != MAGIC2) return(REG_BADPAT); assert(!(g->iflags&BAD)); if (g->iflags&BAD) /* backstop for no-debug case */ return(REG_BADPAT); eflags = GOODFLAGS(eflags); if (g->nstates <= CHAR_BIT*sizeof(states1) && !(eflags®_LARGE)) return(smatcher(g, (char *)string, nmatch, pmatch, eflags)); else return(lmatcher(g, (char *)string, nmatch, pmatch, eflags)); } regex-posix-0.95.2/cbits/regfree.c0000644000000000000000000000603011756216545015151 0ustar0000000000000000/*- * Copyright (c) 1992, 1993, 1994 Henry Spencer. * Copyright (c) 1992, 1993, 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Henry Spencer. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)regfree.c 8.3 (Berkeley) 3/20/94 * * $FreeBSD: src/lib/libc/regex/regfree.c,v 1.1.1.1.14.1 2000/07/31 06:30:37 dcs Exp $ */ #if defined(LIBC_SCCS) && !defined(lint) static char sccsid[] = "@(#)regfree.c 8.3 (Berkeley) 3/20/94"; #endif /* LIBC_SCCS and not lint */ #include #include #include #include #include "regex.h" #include "utils.h" #include "regex2.h" /* - regfree - free everything = extern void regfree(regex_t *); */ void regfree(preg) regex_t *preg; { register struct re_guts *g; if (preg->re_magic != MAGIC1) /* oops */ return; /* nice to complain, but hard */ g = preg->re_g; if (g == NULL || g->magic != MAGIC2) /* oops again */ return; preg->re_magic = 0; /* mark it invalid */ g->magic = 0; /* mark it invalid */ if (g->strip != NULL) free((char *)g->strip); if (g->sets != NULL) free((char *)g->sets); if (g->setbits != NULL) free((char *)g->setbits); if (g->must != NULL) free(g->must); if (g->charjump != NULL) free(&g->charjump[CHAR_MIN]); if (g->matchjump != NULL) free(g->matchjump); free((char *)g); } regex-posix-0.95.2/cbits/utils.h0000644000000000000000000000510111756216545014675 0ustar0000000000000000/*- * Copyright (c) 1992, 1993, 1994 Henry Spencer. 
* Copyright (c) 1992, 1993, 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Henry Spencer. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)utils.h 8.3 (Berkeley) 3/20/94 */ /* Added by SDM 15/2/2002: apparently mingw doesn't define this constant */ #ifndef _POSIX2_RE_DUP_MAX #define _POSIX2_RE_DUP_MAX 255 #endif /* utility definitions */ #define DUPMAX _POSIX2_RE_DUP_MAX /* xxx is this right? */ #define INFINITY (DUPMAX + 1) #define NC (CHAR_MAX - CHAR_MIN + 1) typedef unsigned char uch; /* switch off assertions (if not already off) if no REDEBUG */ #ifndef REDEBUG #ifndef NDEBUG #define NDEBUG /* no assertions please */ #endif #endif #include /* for old systems with bcopy() but no memmove() */ #ifdef USEBCOPY #define memmove(d, s, c) bcopy(s, d, c) #endif regex-posix-0.95.2/Text/0000755000000000000000000000000011756216545013207 5ustar0000000000000000regex-posix-0.95.2/Text/Regex/0000755000000000000000000000000011756216545014261 5ustar0000000000000000regex-posix-0.95.2/Text/Regex/Posix.hs0000644000000000000000000000625511756216545015727 0ustar0000000000000000{- OPTIONS_GHC -fno-warn-unused-imports -} ----------------------------------------------------------------------------- -- | -- -- Module : Text.Regex.Posix -- Copyright : (c) Chris Kuklewicz 2006 -- License : BSD-style (see the file LICENSE) -- -- Maintainer : libraries@haskell.org, textregexlazy@personal.mightyreason.com -- Stability : experimental -- Portability : non-portable (regex-base needs MPTC+FD) -- -- Module that provides the Regex backend that wraps the c posix regex api. -- This is the backend being used by the regex-compat package to replace -- Text.Regex -- -- The "Text.Regex.Posix" module provides a backend for regular -- expressions. If you import this along with other backends, then -- you should do so with qualified imports, perhaps renamed for -- convenience. -- -- If the '=~' and '=~~' functions are too high level, you can use the -- compile, regexec, and execute functions from importing either -- "Text.Regex.Posix.String" or "Text.Regex.Posix.ByteString". 
If you -- want to use a low-level 'Foreign.C.CString' interface to the library, -- then import "Text.Regex.Posix.Wrap" and use the wrap* functions. -- -- This module is efficient with a 'Data.ByteString.ByteString' only -- if it is NUL terminated, i.e. @(ByteString.last bs)==0@. Otherwise the -- library must make a temporary copy of the 'Data.ByteString.ByteString' -- and append the NUL byte. -- -- A 'String' will be converted into a 'Foreign.C.CString' for processing. -- Doing this repeatedly will be very inefficient. -- -- Note that the posix library works with single byte characters, and -- does not understand Unicode. If you need Unicode support you will -- have to use a different backend. -- -- When offsets are reported for subexpression captures, a subexpression -- that did not match anything (as opposed to matching an empty string) -- will have its offset set to the 'unusedRegOffset' value, which is (-1). -- -- Benchmarking shows the default regex library on many platforms is very -- inefficient. You might increase performance by an order of magnitude -- by obtaining libpcre and regex-pcre or libtre and regex-tre. If you -- do not need the captured substrings then you can also get great -- performance from regex-dfa. If you do need the captured substrings -- then you may be able to use regex-parsec to improve performance.
-----------------------------------------------------------------------------

module Text.Regex.Posix(getVersion_Text_Regex_Posix
  ,module Text.Regex.Base
  -- ** Wrap, for '=~' and '=~~', types and constants
  ,module Text.Regex.Posix.Wrap) where

import Text.Regex.Posix.Wrap(Regex, CompOption(CompOption),
  ExecOption(ExecOption), (=~), (=~~),
  unusedRegOffset,
  compBlank, compExtended, compIgnoreCase, compNoSub, compNewline,
  execBlank, execNotBOL, execNotEOL)
import Text.Regex.Posix.String()
import Text.Regex.Posix.Sequence()
import Text.Regex.Posix.ByteString()
import Text.Regex.Posix.ByteString.Lazy()
import Data.Version(Version(..))
import Text.Regex.Base

-- | The version of this package as a 'Version' value.
--
-- 'versionBranch' has to mirror the @Version:@ field of
-- @regex-posix.cabal@ (currently 0.95.2); it had been left at 0.94.4.
getVersion_Text_Regex_Posix :: Version
getVersion_Text_Regex_Posix =
  Version { versionBranch = [0,95,2]  -- Keep in sync with regex-posix.cabal
          , versionTags = ["unstable"]
          }
regex-posix-0.95.2/Text/Regex/Posix/ByteString.hs0000644000000000000000000001345411756216545020020 0ustar0000000000000000{-# OPTIONS_GHC -fno-warn-orphans #-} ----------------------------------------------------------------------------- -- | -- Module : Text.Regex.Posix.ByteString -- Copyright : (c) Chris Kuklewicz 2006 -- License : BSD-style (see the file LICENSE) -- -- Maintainer : libraries@haskell.org, textregexlazy@personal.mightyreason.com -- Stability : experimental -- Portability : non-portable (regex-base needs MPTC+FD) -- -- This provides 'ByteString' instances for RegexMaker and RegexLike -- based on "Text.Regex.Posix.Wrap", and a (RegexContext Regex -- ByteString ByteString) instance. -- -- To use these instance, you would normally import -- "Text.Regex.Posix". You only need to import this module to use -- the medium level API of the compile, regexec, and execute -- functions. All of these report error by returning Left values -- instead of undefined or error or fail.
--
-- The ByteString will only be passed to the library efficiently (as a
-- pointer) if it ends in a NUL byte.  Otherwise a temporary copy must
-- be made with the 0 byte appended.
-----------------------------------------------------------------------------

module Text.Regex.Posix.ByteString(
  -- ** Types
  Regex,
  MatchOffset,
  MatchLength,
  ReturnCode,
  WrapError,
  -- ** Miscellaneous
  unusedOffset,
  -- ** Medium level API functions
  compile,
  execute,
  regexec,
  -- ** Compilation options
  CompOption(CompOption),
  compBlank,
  compExtended,   -- use extended regex syntax
  compIgnoreCase, -- ignore case when matching
  compNoSub,      -- no substring matching needed
  compNewline,    -- '.' doesn't match newline
  -- ** Execution options
  ExecOption(ExecOption),
  execBlank,
  execNotBOL,     -- not at beginning of line
  execNotEOL      -- not at end of line
  ) where

import Data.Array(Array,listArray)
import Data.ByteString(ByteString)
import qualified Data.ByteString as B(empty,useAsCString,last,take,drop,null)
#ifdef SPLIT_BASE
import qualified Data.ByteString.Unsafe as B(unsafeUseAsCString)
#else
import qualified Data.ByteString.Base as B(unsafeUseAsCString)
#endif
import System.IO.Unsafe(unsafePerformIO)
import Text.Regex.Base.RegexLike(RegexMaker(..),RegexContext(..),RegexLike(..),MatchOffset,MatchLength)
import Text.Regex.Posix.Wrap -- all
import Text.Regex.Base.Impl(polymatch,polymatchM)
import Foreign.C.String(CString)

instance RegexContext Regex ByteString ByteString where
  match = polymatch
  matchM = polymatchM

-- Turns a 'Left' into a failing 'IO' action whose message names this
-- module, and a 'Right' into its payload.  Used by the class instances,
-- which have no way to return the error.
unwrap :: (Show e) => Either e v -> IO v
unwrap x = case x of Left err -> fail ("Text.Regex.Posix.ByteString died: "++ show err)
                     Right v -> return v

instance RegexMaker Regex CompOption ExecOption ByteString where
  makeRegexOpts c e pattern = unsafePerformIO $ compile c e pattern >>= unwrap
  makeRegexOptsM c e pattern = either (fail.show) return $ unsafePerformIO $ compile c e pattern

instance RegexLike Regex ByteString where
  matchTest regex bs = unsafePerformIO $ asCString bs (wrapTest regex) >>= unwrap
  matchOnce regex bs = unsafePerformIO $ execute regex bs >>= unwrap
  matchAll regex bs = unsafePerformIO $ asCString bs (wrapMatchAll regex) >>= unwrap
  matchCount regex bs = unsafePerformIO $ asCString bs (wrapCount regex) >>= unwrap

-- ---------------------------------------------------------------------
-- | Compiles a regular expression
--
compile :: CompOption  -- ^ Flags (summed together)
        -> ExecOption  -- ^ Flags (summed together)
        -> ByteString  -- ^ The regular expression to compile
        -> IO (Either WrapError Regex) -- ^ Returns: the compiled regular expression
compile c e pattern = asCString pattern (wrapCompile c e)

-- ---------------------------------------------------------------------
-- | Matches a regular expression against a buffer, returning the buffer
-- indices of the match, and any submatches.  The (start,end) pairs
-- produced by 'wrapMatch' are converted to (offset,length) pairs.
execute :: Regex      -- ^ Compiled regular expression
        -> ByteString -- ^ String to match against
        -> IO (Either WrapError (Maybe (Array Int (MatchOffset,MatchLength))))
                -- ^ Returns: 'Nothing' if the regex did not match the
                -- string, or:
                --   'Just' an array of (offset,length) pairs where index 0 is the whole match, and the rest are the captured subexpressions.
execute regex bs = do
  maybeStartEnd <- asCString bs (wrapMatch regex)
  case maybeStartEnd of
    Right Nothing -> return (Right Nothing)
--  Right (Just []) -> ...
    Right (Just parts) ->
      return . Right . Just . listArray (0,pred (length parts))
      . map (\(s,e)->(fromIntegral s, fromIntegral (e-s))) $ parts
    Left err -> return (Left err)

-- | Matches a regular expression against a 'ByteString' and returns the
-- pieces of the input rather than offsets: the text before the match,
-- the matched text, the text after the match, and the text of each
-- captured subexpression.  A subexpression whose start offset is
-- 'unusedRegOffset' is reported as an empty 'ByteString'.
regexec :: Regex      -- ^ Compiled regular expression
        -> ByteString -- ^ String to match against
        -> IO (Either WrapError (Maybe (ByteString, ByteString, ByteString, [ByteString])))
regexec regex bs = do
  let getSub (start,stop) | start == unusedRegOffset = B.empty
                          | otherwise = B.take (fi (stop-start)) . B.drop (fi start) $ bs
      matchedParts [] = (B.empty,B.empty,bs,[]) -- no information
      matchedParts (matchedStartStop@(start,stop):subStartStop) =
        (B.take (fi start) bs
        ,getSub matchedStartStop
        ,B.drop (fi stop) bs
        ,map getSub subStartStop)
  maybeStartEnd <- asCString bs (wrapMatch regex)
  case maybeStartEnd of
    Right Nothing -> return (Right Nothing)
--  Right (Just []) -> ...
    Right (Just parts) -> return . Right . Just . matchedParts $ parts
    Left err -> return (Left err)

-- | 'unusedRegOffset' converted to 'Int'.
unusedOffset :: Int
unusedOffset = fromIntegral unusedRegOffset

-- Shorthand for 'fromIntegral'.
fi :: (Integral i,Num n) => i->n
fi = fromIntegral

-- Runs an action on the 'ByteString' viewed as a 'CString'.  A non-empty
-- input whose last byte is 0 goes through 'B.unsafeUseAsCString';
-- anything else goes through 'B.useAsCString'.
asCString :: ByteString -> (CString -> IO a) -> IO a
asCString bs = if (not (B.null bs)) && (0==B.last bs)
                 then B.unsafeUseAsCString bs
                 else B.useAsCString bs
regex-posix-0.95.2/Text/Regex/Posix/Sequence.hs0000644000000000000000000001441111756216545017470 0ustar0000000000000000{-# OPTIONS_GHC -fno-warn-orphans #-}
-----------------------------------------------------------------------------
-- |
-- Module      :  Text.Regex.Posix.Sequence
-- Copyright   :  (c) Chris Kuklewicz 2006
-- License     :  BSD-style (see the file LICENSE)
--
-- Maintainer  :  libraries@haskell.org, textregexlazy@personal.mightyreason.com
-- Stability   :  experimental
-- Portability :  non-portable (regex-base needs MPTC+FD)
--
-- This provides ('Seq' 'Char') instances for 'RegexMaker' and 'RegexLike'
-- based on "Text.Regex.Posix.Wrap", and a
-- ('RegexContext' 'Regex' ('Seq' 'Char') ('Seq' 'Char')) instance.
--
-- To use these instances, you would normally import
-- "Text.Regex.Posix".  You only need to import this module to use
-- the medium level API of the compile, regexec, and execute
-- functions.  All of these report error by returning Left values
-- instead of undefined or error or fail.
-- ----------------------------------------------------------------------------- module Text.Regex.Posix.Sequence( -- ** Types Regex, MatchOffset, MatchLength, ReturnCode, WrapError, -- ** Miscellaneous unusedOffset, -- ** Medium level API functions compile, regexec, execute, -- ** Compilation options CompOption(CompOption), compBlank, compExtended, -- use extended regex syntax compIgnoreCase, -- ignore case when matching compNoSub, -- no substring matching needed compNewline, -- '.' doesn't match newline ExecOption(ExecOption), execBlank, execNotBOL, -- not at begining of line execNotEOL -- not at end of line ) where import Data.Array(listArray, Array) import System.IO.Unsafe(unsafePerformIO) import Text.Regex.Base.RegexLike(RegexContext(..),RegexMaker(..),RegexLike(..),MatchOffset,MatchLength,Extract(..)) import Text.Regex.Posix.Wrap import Text.Regex.Base.Impl(polymatch,polymatchM) import Data.Sequence as S hiding (length) import qualified Data.Sequence as S (length) import Foreign.C.String import Foreign.Marshal.Array import Foreign.Marshal.Alloc import Foreign.Storable instance RegexContext Regex (Seq Char) (Seq Char) where match = polymatch matchM = polymatchM unusedOffset :: Int unusedOffset = fromIntegral unusedRegOffset unwrap :: (Show e) => Either e v -> IO v unwrap x = case x of Left err -> fail ("Text.Regex.Posix.Sequence died: "++ show err) Right v -> return v instance RegexMaker Regex CompOption ExecOption (Seq Char) where makeRegexOpts c e pattern = unsafePerformIO $ (compile c e pattern >>= unwrap) makeRegexOptsM c e pattern = either (fail.show) return $ unsafePerformIO $ (compile c e pattern) instance RegexLike Regex (Seq Char) where matchTest regex str = unsafePerformIO $ do withSeq str (wrapTest regex) >>= unwrap matchOnce regex str = unsafePerformIO $ execute regex str >>= unwrap matchAll regex str = unsafePerformIO $ withSeq str (wrapMatchAll regex) >>= unwrap matchCount regex str = unsafePerformIO $ withSeq str (wrapCount regex) >>= unwrap -- 
compile compile :: CompOption -- ^ Flags (summed together) -> ExecOption -- ^ Flags (summed together) -> (Seq Char) -- ^ The regular expression to compile (ASCII only, no null bytes) -> IO (Either WrapError Regex) -- ^ Returns: the compiled regular expression compile flags e pattern = withSeq pattern (wrapCompile flags e) -- ----------------------------------------------------------------------------- -- regexec -- | Matches a regular expression against a string execute :: Regex -- ^ Compiled regular expression -> (Seq Char) -- ^ (Seq Char) to match against -> IO (Either WrapError (Maybe (Array Int (MatchOffset,MatchLength)))) -- ^ Returns: 'Nothing' if the regex did not match the -- string, or: -- -- @ -- 'Just' (array of offset length pairs) -- @ execute regex str = do maybeStartEnd <- withSeq str (wrapMatch regex) case maybeStartEnd of Right Nothing -> return (Right Nothing) -- Right (Just []) -> fail "got [] back!" -- return wierd array instead Right (Just parts) -> return . Right . Just . listArray (0,pred (length parts)) . 
map (\(s,e)->(fromIntegral s, fromIntegral (e-s))) $ parts Left err -> return (Left err) -- ----------------------------------------------------------------------------- -- regexec -- | Matches a regular expression against a string regexec :: Regex -- ^ Compiled regular expression -> (Seq Char) -- ^ (Seq Char) to match against -> IO (Either WrapError (Maybe ((Seq Char), (Seq Char), (Seq Char), [(Seq Char)]))) -- ^ Returns: 'Nothing' if the regex did not match the -- string, or: -- -- @ -- 'Just' (everything before match, -- matched portion, -- everything after match, -- subexpression matches) -- @ regexec regex str = do let getSub :: (RegOffset,RegOffset) -> (Seq Char) getSub (start,stop) | start == unusedRegOffset = S.empty | otherwise = extract (fromEnum start,fromEnum $ stop-start) $ str matchedParts :: [(RegOffset,RegOffset)] -> ((Seq Char), (Seq Char), (Seq Char), [(Seq Char)]) matchedParts [] = (str,S.empty,S.empty,[]) -- no information matchedParts (matchedStartStop@(start,stop):subStartStop) = (before (fromEnum start) str ,getSub matchedStartStop ,after (fromEnum stop) str ,map getSub subStartStop) maybeStartEnd <- withSeq str (wrapMatch regex) case maybeStartEnd of Right Nothing -> return (Right Nothing) Right (Just parts) -> return . Right . Just . 
matchedParts $ parts Left err -> return (Left err) withSeq :: Seq Char -> (CString -> IO a) -> IO a withSeq s f = let -- Ensure null at end of s s' = case viewr s of -- bang !s EmptyR -> singleton '\0' _ :> '\0' -> s _ -> s |> '\0' pokes p a = case viewl a of -- bang pokes !p !a EmptyL -> return () c :< a' -> poke p (castCharToCChar c) >> pokes (advancePtr p 1) a' in allocaBytes (S.length s') (\ptr -> pokes ptr s' >> f ptr) regex-posix-0.95.2/Text/Regex/Posix/String.hs0000644000000000000000000001274511756216545017176 0ustar0000000000000000{-# OPTIONS_GHC -fno-warn-orphans #-} ----------------------------------------------------------------------------- -- | -- Module : Text.Regex.Posix.String -- Copyright : (c) Chris Kuklewicz 2006 -- License : BSD-style (see the file LICENSE) -- -- Maintainer : libraries@haskell.org, textregexlazy@personal.mightyreason.com -- Stability : experimental -- Portability : non-portable (regex-base needs MPTC+FD) -- -- This provides 'String' instances for 'RegexMaker' and 'RegexLike' based -- on "Text.Regex.Posix.Wrap", and a ('RegexContext' 'Regex' 'String' 'String') -- instance. -- -- To use these instance, you would normally import -- "Text.Regex.Posix". You only need to import this module to use -- the medium level API of the compile, regexec, and execute -- functions. All of these report error by returning Left values -- instead of undefined or error or fail. -- ----------------------------------------------------------------------------- module Text.Regex.Posix.String( -- ** Types Regex, MatchOffset, MatchLength, ReturnCode, WrapError, -- ** Miscellaneous unusedOffset, -- ** Medium level API functions compile, regexec, execute, -- ** Compilation options CompOption(CompOption), compBlank, compExtended, -- use extended regex syntax compIgnoreCase, -- ignore case when matching compNoSub, -- no substring matching needed compNewline, -- '.' 
doesn't match newline -- ** Execution options ExecOption(ExecOption), execBlank, execNotBOL, -- not at begining of line execNotEOL -- not at end of line ) where import Data.Array(listArray, Array) import Data.List(genericDrop, genericTake) import Foreign.C.String(withCAString) import System.IO.Unsafe(unsafePerformIO) import Text.Regex.Base.RegexLike(RegexContext(..),RegexMaker(..),RegexLike(..),MatchOffset,MatchLength) import Text.Regex.Posix.Wrap import Text.Regex.Base.Impl(polymatch,polymatchM) instance RegexContext Regex String String where match = polymatch matchM = polymatchM unusedOffset :: Int unusedOffset = fromIntegral unusedRegOffset unwrap :: (Show e) => Either e v -> IO v unwrap x = case x of Left err -> fail ("Text.Regex.Posix.String died: "++ show err) Right v -> return v instance RegexMaker Regex CompOption ExecOption String where makeRegexOpts c e pattern = unsafePerformIO $ (compile c e pattern >>= unwrap) makeRegexOptsM c e pattern = either (fail.show) return $ unsafePerformIO $ (compile c e pattern) instance RegexLike Regex String where matchTest regex str = unsafePerformIO $ do withCAString str (wrapTest regex) >>= unwrap matchOnce regex str = unsafePerformIO $ execute regex str >>= unwrap matchAll regex str = unsafePerformIO $ withCAString str (wrapMatchAll regex) >>= unwrap matchCount regex str = unsafePerformIO $ withCAString str (wrapCount regex) >>= unwrap -- compile compile :: CompOption -- ^ Flags (summed together) -> ExecOption -- ^ Flags (summed together) -> String -- ^ The regular expression to compile (ASCII only, no null bytes) -> IO (Either WrapError Regex) -- ^ Returns: the compiled regular expression compile flags e pattern = withCAString pattern (wrapCompile flags e) -- ----------------------------------------------------------------------------- -- regexec -- | Matches a regular expression against a string execute :: Regex -- ^ Compiled regular expression -> String -- ^ String to match against -> IO (Either WrapError (Maybe 
(Array Int (MatchOffset,MatchLength)))) -- ^ Returns: 'Nothing' if the regex did not match the -- string, or: -- -- @ -- 'Just' (array of offset length pairs) -- @ execute regex str = do maybeStartEnd <- withCAString str (wrapMatch regex) case maybeStartEnd of Right Nothing -> return (Right Nothing) -- Right (Just []) -> fail "got [] back!" -- return wierd array instead Right (Just parts) -> return . Right . Just . listArray (0,pred (length parts)) . map (\(s,e)->(fromIntegral s, fromIntegral (e-s))) $ parts Left err -> return (Left err) -- ----------------------------------------------------------------------------- -- regexec -- | Matches a regular expression against a string regexec :: Regex -- ^ Compiled regular expression -> String -- ^ String to match against -> IO (Either WrapError (Maybe (String, String, String, [String]))) -- ^ Returns: 'Nothing' if the regex did not match the -- string, or: -- -- @ -- 'Just' (everything before match, -- matched portion, -- everything after match, -- subexpression matches) -- @ regexec regex str = do let getSub (start,stop) | start == unusedRegOffset = "" | otherwise = genericTake (stop-start) . genericDrop start $ str matchedParts [] = (str,"","",[]) -- no information matchedParts (matchedStartStop@(start,stop):subStartStop) = (genericTake start str ,getSub matchedStartStop ,genericDrop stop str ,map getSub subStartStop) maybeStartEnd <- withCAString str (wrapMatch regex) case maybeStartEnd of Right Nothing -> return (Right Nothing) Right (Just parts) -> return . Right . Just . 
matchedParts $ parts Left err -> return (Left err) regex-posix-0.95.2/Text/Regex/Posix/Wrap.hsc0000644000000000000000000006225211756216545017002 0ustar0000000000000000{-# OPTIONS_GHC -fno-warn-unused-imports #-} ----------------------------------------------------------------------------- -- | -- Module : Text.Regex.Posix.Wrap -- Copyright : (c) Chris Kuklewicz 2006,2007,2008 derived from (c) The University of Glasgow 2002 -- License : BSD-style (see the file LICENSE) -- -- Maintainer : libraries@haskell.org, textregexlazy@personal.mightyreason.com -- Stability : experimental -- Portability : non-portable (regex-base needs MPTC+FD) -- -- WrapPosix.hsc exports a wrapped version of the ffi imports. To -- increase type safety, the flags are newtype'd. The other important -- export is a 'Regex' type that is specific to the Posix library -- backend. The flags are documented in "Text.Regex.Posix". The -- 'defaultCompOpt' is @(compExtended .|. compNewline)@. -- -- The 'Regex', 'CompOption', and 'ExecOption' types and their 'RegexOptions' -- instance is declared. The '=~' and '=~~' convenience functions are -- defined. -- -- The exported symbols are the same whether HAVE_REGEX_H is defined, but -- when it is not defined then @getVersion == Nothing@ and all other -- exported values will call error or fail. -- -- This module will fail or error only if allocation fails or a nullPtr -- is passed in. -- -- 2009-January : wrapMatchAll and wrapCount now adjust the execution -- option execNotBOL after the first result to take into account '\n' -- in the text immediately before the next matches. (version 0.93.3) -- -- 2009-January : wrapMatchAll and wrapCount have been changed to -- return all non-overlapping matches, including empty matches even if -- they coincide with the end of the previous non-empty match. The -- change is that the first non-empty match no longer terminates the -- search. 
One can filter the results to obtain the old behavior or -- to obtain the behavior of "sed", where "sed" eliminates the empty -- matches which coincide with the end of non-empty matches. (version -- 0.94.0) ----------------------------------------------------------------------------- module Text.Regex.Posix.Wrap( -- ** High-level API Regex, RegOffset, RegOffsetT, (=~), (=~~), -- ** Low-level API WrapError, wrapCompile, wrapTest, wrapMatch, wrapMatchAll, wrapCount, -- ** Miscellaneous unusedRegOffset, -- ** Compilation options CompOption(CompOption), compBlank, compExtended, -- use extended regex syntax compIgnoreCase, -- ignore case when matching compNoSub, -- no substring matching needed compNewline, -- '.' doesn't match newline -- ** Execution options ExecOption(ExecOption), execBlank, execNotBOL, -- not at begining of line execNotEOL, -- not at end of line -- ** Return codes ReturnCode(ReturnCode), retBadbr, retBadpat, retBadrpt, retEcollate, retEctype, retEescape, retEsubreg, retEbrack, retEparen, retEbrace, retErange, retEspace ) where #ifdef HAVE_REGEX_H #define HAVE_REGCOMP 1 #else #ifdef __NHC__ #define HAVE_REGEX_H 1 #define HAVE_REGCOMP 1 #endif #endif #include -- string.h is needed for memset #include "myfree.h" #include "string.h" #ifndef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 1 #endif #if HAVE_REGEX_H && HAVE_REGCOMP #include "regex.h" #else #include "regex.h" -- CFILES stuff is Hugs only {-# CFILES cbits/reallocf.c #-} {-# CFILES cbits/regcomp.c #-} {-# CFILES cbits/regerror.c #-} {-# CFILES cbits/regexec.c #-} {-# CFILES cbits/regfree.c #-} #endif import Control.Monad(liftM) import Data.Array(Array,listArray) import Data.Bits(Bits(..)) import Data.Int(Int32,Int64) -- need whatever RegeOffset or #regoff_t type will be import Data.Word(Word32,Word64) -- need whatever RegeOffset or #regoff_t type will be import Foreign(Ptr, FunPtr, nullPtr, newForeignPtr, addForeignPtrFinalizer, Storable(peekByteOff), allocaArray, allocaBytes, 
withForeignPtr,ForeignPtr,plusPtr,peekElemOff) import Foreign.Marshal.Alloc(mallocBytes) import Foreign.C(CChar) #if __GLASGOW_HASKELL__ >= 703 import Foreign.C(CSize(CSize),CInt(CInt)) #else import Foreign.C(CSize,CInt) #endif import Foreign.C.String(peekCAString, CString) import Text.Regex.Base.RegexLike(RegexOptions(..),RegexMaker(..),RegexContext(..),MatchArray) -- deprecated: import qualified System.IO.Error as IOERROR(try) import qualified Control.Exception(try,IOException) try :: IO a -> IO (Either Control.Exception.IOException a) try = Control.Exception.try type CRegex = () -- dummy regex_t used below to read out nsub value -- | RegOffset is "typedef int regoff_t" on Linux and ultimately "typedef -- long long __int64_t" on Max OS X. So rather than saying -- 2,147,483,647 is all the length you need, I'll take the larger: -- 9,223,372,036,854,775,807 should be enough bytes for anyone, no -- need for Integer. The alternative is to compile to different sizes -- in a platform dependent manner with "type RegOffset = (#type -- regoff_t)", which I do not want to do. -- -- There is also a special value 'unusedRegOffset' :: 'RegOffset' which is -- (-1) and as a starting index means that the subgroup capture was -- unused. Otherwise the RegOffset indicates a character boundary that -- is before the character at that index offset, with the first -- character at index offset 0. So starting at 1 and ending at 2 means -- to take only the second character. type RegOffset = Int64 --debugging 64-bit ubuntu type RegOffsetT = (#type regoff_t) -- | A bitmapped 'CInt' containing options for compilation of regular -- expressions. Option values (and their man 3 regcomp names) are -- -- * 'compBlank' which is a completely zero value for all the flags. -- This is also the 'blankCompOpt' value. -- -- * 'compExtended' (REG_EXTENDED) which can be set to use extended instead -- of basic regular expressions. -- This is set in the 'defaultCompOpt' value. 
-- -- * 'compNewline' (REG_NEWLINE) turns on newline sensitivity: The dot (.) -- and inverted set @[^ ]@ never match newline, and ^ and $ anchors do -- match after and before newlines. -- This is set in the 'defaultCompOpt' value. -- -- * 'compIgnoreCase' (REG_ICASE) which can be set to match ignoring upper -- and lower distinctions. -- -- * 'compNoSub' (REG_NOSUB) which turns off all information from matching -- except whether a match exists. #ifdef __GLASGOW_HASKELL__ newtype CompOption = CompOption CInt deriving (Eq,Show,Num,Bits) #else newtype CompOption = CompOption CInt deriving (Eq,Show) instance Num CompOption where CompOption x + CompOption y = CompOption (x + y) CompOption x - CompOption y = CompOption (x - y) CompOption x * CompOption y = CompOption (x * y) abs (CompOption x) = CompOption (abs x) signum (CompOption x) = CompOption (signum x) fromInteger n = CompOption (fromInteger n) instance Bits CompOption where CompOption x .&. CompOption y = CompOption (x .&. y) CompOption x .|. CompOption y = CompOption (x .|. y) CompOption x `xor` CompOption y = CompOption (x `xor` y) complement (CompOption x) = CompOption (complement x) shift (CompOption x) n = CompOption (shift x n) rotate (CompOption x) n = CompOption (rotate x n) bitSize (CompOption x) = bitSize x isSigned (CompOption x) = isSigned x #endif -- | A bitmapped 'CInt' containing options for execution of compiled -- regular expressions. Option values (and their man 3 regexec names) are -- -- * 'execBlank' which is a complete zero value for all the flags. This is -- the blankExecOpt value. -- -- * 'execNotBOL' (REG_NOTBOL) can be set to prevent ^ from matching at the -- start of the input. -- -- * 'execNotEOL' (REG_NOTEOL) can be set to prevent $ from matching at the -- end of the input (before the terminating NUL). 
#ifdef __GLASGOW_HASKELL__ newtype ExecOption = ExecOption CInt deriving (Eq,Show,Num,Bits) #else newtype ExecOption = ExecOption CInt deriving (Eq,Show) instance Num ExecOption where ExecOption x + ExecOption y = ExecOption (x + y) ExecOption x - ExecOption y = ExecOption (x - y) ExecOption x * ExecOption y = ExecOption (x * y) abs (ExecOption x) = ExecOption (abs x) signum (ExecOption x) = ExecOption (signum x) fromInteger n = ExecOption (fromInteger n) instance Bits ExecOption where ExecOption x .&. ExecOption y = ExecOption (x .&. y) ExecOption x .|. ExecOption y = ExecOption (x .|. y) ExecOption x `xor` ExecOption y = ExecOption (x `xor` y) complement (ExecOption x) = ExecOption (complement x) shift (ExecOption x) n = ExecOption (shift x n) rotate (ExecOption x) n = ExecOption (rotate x n) bitSize (ExecOption x) = bitSize x isSigned (ExecOption x) = isSigned x #endif -- | ReturnCode is an enumerated 'CInt', corresponding to the error codes -- from @man 3 regex@: -- -- * 'retBadbr' (@REG_BADBR@) invalid repetition count(s) in @{ }@ -- -- * 'retBadpat' (@REG_BADPAT@) invalid regular expression -- -- * 'retBadrpt' (@REG_BADRPT@) @?@, @*@, or @+@ operand invalid -- -- * 'retEcollate' (@REG_ECOLLATE@) invalid collating element -- -- * 'retEctype' (@REG_ECTYPE@) invalid character class -- -- * 'retEescape' (@REG_EESCAPE@) @\\@ applied to unescapable character -- -- * 'retEsubreg' (@REG_ESUBREG@) invalid backreference number -- -- * 'retEbrack' (@REG_EBRACK@) brackets @[ ]@ not balanced -- -- * 'retEparen' (@REG_EPAREN@) parentheses @( )@ not balanced -- -- * 'retEbrace' (@REG_EBRACE@) braces @{ }@ not balanced -- -- * 'retErange' (@REG_ERANGE@) invalid character range in @[ ]@ -- -- * 'retEspace' (@REG_ESPACE@) ran out of memory -- -- * 'retNoMatch' (@REG_NOMATCH@) The regexec() function failed to match -- newtype ReturnCode = ReturnCode CInt deriving (Eq,Show) -- | A compiled regular expression. 
data Regex = Regex (ForeignPtr CRegex) CompOption ExecOption -- | A completely zero value for all the flags. -- This is also the 'blankCompOpt' value. compBlank :: CompOption compBlank = CompOption 0 -- | A completely zero value for all the flags. -- This is also the 'blankExecOpt' value. execBlank :: ExecOption execBlank = ExecOption 0 unusedRegOffset :: RegOffset unusedRegOffset = (-1) -- | The return code will be retOk when it is the Haskell wrapper and -- not the underlying library generating the error message. type WrapError = (ReturnCode,String) wrapCompile :: CompOption -- ^ Flags (bitmapped) -> ExecOption -- ^ Flags (bitmapped) -> CString -- ^ The regular expression to compile (ASCII only, no null bytes) -> IO (Either WrapError Regex) -- ^ Returns: the compiled regular expression wrapTest :: Regex -> CString -> IO (Either WrapError Bool) -- | wrapMatch returns offsets for the begin and end of each capture. -- Unused captures have offsets of unusedRegOffset which is (-1) wrapMatch :: Regex -> CString -> IO (Either WrapError (Maybe [(RegOffset,RegOffset)])) -- | wrapMatchAll returns the offset and length of each capture. -- Unused captures have an offset of unusedRegOffset which is (-1) and -- length of 0. wrapMatchAll :: Regex -> CString -> IO (Either WrapError [MatchArray]) wrapCount :: Regex -> CString -> IO (Either WrapError Int) (=~) :: (RegexMaker Regex CompOption ExecOption source,RegexContext Regex source1 target) => source1 -> source -> target (=~~) :: (RegexMaker Regex CompOption ExecOption source,RegexContext Regex source1 target,Monad m) => source1 -> source -> m target instance RegexOptions Regex CompOption ExecOption where blankCompOpt = compBlank blankExecOpt = execBlank defaultCompOpt = compExtended .|. 
compNewline defaultExecOpt = execBlank setExecOpts e' (Regex r c _) = Regex r c e' getExecOpts (Regex _ _ e) = e -- (=~) ::(RegexMaker Regex CompOption ExecOption source,RegexContext Regex source1 target) => source1 -> source -> target (=~) x r = let make :: RegexMaker Regex CompOption ExecOption a => a -> Regex make = makeRegex in match (make r) x -- (=~~) ::(RegexMaker Regex CompOption ExecOption source,RegexContext Regex source1 target,Monad m) => source1 -> source -> m target (=~~) x r = let make :: RegexMaker Regex CompOption ExecOption a => a -> Regex make = makeRegex in matchM (make r) x type CRegMatch = () -- dummy regmatch_t used below to read out so and eo values -- ----------------------------------------------------------------------------- -- The POSIX regex C interface -- string.h foreign import ccall unsafe "memset" c_memset :: Ptr CRegex -> CInt -> CSize -> IO (Ptr CRegex) -- c-finalizer/myfree.h and c-finalizer/myfree.c foreign import ccall unsafe "&myregfree" c_myregfree :: FunPtr (Ptr CRegex -> IO ()) #if __GLASGOW_HASKELL__ || __HUGS__ foreign import ccall unsafe "regcomp" c_regcomp :: Ptr CRegex -> CString -> CompOption -> IO ReturnCode {- NOT USED foreign import ccall unsafe "®free" c_regfree :: FunPtr (Ptr CRegex -> IO ()) -} foreign import ccall unsafe "regexec" c_regexec :: Ptr CRegex -> CString -> CSize -> Ptr CRegMatch -> ExecOption -> IO ReturnCode foreign import ccall unsafe "regerror" c_regerror :: ReturnCode -> Ptr CRegex -> CString -> CSize -> IO CSize #elif HAVE_REGEX_H && HAVE_REGCOMP foreign import ccall unsafe "regex.h regcomp" c_regcomp :: Ptr CRegex -> CString -> CompOption -> IO ReturnCode foreign import ccall unsafe "regex.h ®free" c_regfree :: FunPtr (Ptr CRegex -> IO ()) foreign import ccall unsafe "regex.h regexec" c_regexec :: Ptr CRegex -> CString -> CSize -> Ptr CRegMatch -> ExecOption -> IO ReturnCode foreign import ccall unsafe "regex.h regerror" c_regerror :: ReturnCode -> Ptr CRegex -> CString -> CSize -> IO CSize 
#else foreign import ccall unsafe "regex/regex.h regcomp" c_regcomp :: Ptr CRegex -> CString -> CompOption -> IO ReturnCode foreign import ccall unsafe "regex/regex.h ®free" c_regfree :: FunPtr (Ptr CRegex -> IO ()) foreign import ccall unsafe "regex/regex.h regexec" c_regexec :: Ptr CRegex -> CString -> CSize -> Ptr CRegMatch -> ExecOption -> IO ReturnCode foreign import ccall unsafe "regex/regex.h regerror" c_regerror :: ReturnCode -> Ptr CRegex -> CString -> CSize -> IO CSize #endif retOk :: ReturnCode retOk = ReturnCode 0 -- Flags for regexec #enum ExecOption,ExecOption, \ execNotBOL = REG_NOTBOL, \ execNotEOL = REG_NOTEOL -- Flags for regcomp #enum CompOption,CompOption, \ compExtended = REG_EXTENDED, \ compIgnoreCase = REG_ICASE, \ compNoSub = REG_NOSUB, \ compNewline = REG_NEWLINE -- Return values from regexec (REG_NOMATCH, REG_ESPACE,...) -- Error codes from regcomp (not REG_NOMATCH) -- Though calling retNoMatch an error is rather missing the point... #enum ReturnCode,ReturnCode, \ retNoMatch = REG_NOMATCH, \ retBadbr = REG_BADBR, \ retBadpat = REG_BADPAT, \ retBadrpt = REG_BADRPT, \ retEcollate = REG_ECOLLATE, \ retEctype = REG_ECTYPE, \ retEescape = REG_EESCAPE, \ retEsubreg = REG_ESUBREG, \ retEbrack = REG_EBRACK, \ retEparen = REG_EPAREN, \ retEbrace = REG_EBRACE, \ retErange = REG_ERANGE, \ retEspace = REG_ESPACE ---- -- error helpers nullTest :: Ptr a -> String -> IO (Either WrapError b) -> IO (Either WrapError b) {-# INLINE nullTest #-} nullTest ptr msg io = do if nullPtr == ptr then return (Left (retOk,"Ptr parameter was nullPtr in Text.Regex.TRE.Wrap."++msg)) else io isNewline,isNull :: Ptr CChar -> Int -> IO Bool isNewline cstr pos = liftM (newline ==) (peekElemOff cstr pos) where newline = toEnum 10 isNull cstr pos = liftM (nullChar ==) (peekElemOff cstr pos) where nullChar = toEnum 0 {- wrapRC :: ReturnCode -> IO (Either WrapError b) {-# INLINE wrapRC #-} wrapRC r = return (Left (r,"Error in Text.Regex.Posix.Wrap: "++show r)) -} wrapError :: 
ReturnCode -> Ptr CRegex -> IO (Either WrapError b) wrapError errCode regex_ptr = do -- Call once to compute the error message buffer size errBufSize <- c_regerror errCode regex_ptr nullPtr 0 -- Allocate a temporary buffer to hold the error message allocaArray (fromIntegral errBufSize) $ \errBuf -> do nullTest errBuf "wrapError errBuf" $ do _ <- c_regerror errCode regex_ptr errBuf errBufSize msg <- peekCAString errBuf :: IO String return (Left (errCode, msg)) ---------- wrapCompile flags e pattern = do nullTest pattern "wrapCompile pattern" $ do e_regex_ptr <- try $ mallocBytes (#const sizeof(regex_t)) -- ioError called if nullPtr case e_regex_ptr of Left ioerror -> return (Left (retOk,"Text.Regex.Posix.Wrap.wrapCompile: IOError from mallocBytes(regex_t) : "++show ioerror)) Right raw_regex_ptr -> do zero_regex_ptr <- c_memset raw_regex_ptr 0 (#const sizeof(regex_t)) -- no calloc, so clear the new area to zero regex_fptr <- newForeignPtr c_myregfree zero_regex_ptr -- once pointed-to area is clear it should be safe to add finalizer withForeignPtr regex_fptr $ \regex_ptr -> do -- withForeignPtr is best hygiene here errCode <- c_regcomp regex_ptr pattern flags if (errCode == retOk) then return . Right $ Regex regex_fptr flags e else wrapError errCode regex_ptr --------- wrapTest (Regex regex_fptr _ flags) cstr = do nullTest cstr "wrapTest" $ do withForeignPtr regex_fptr $ \regex_ptr -> do r <- c_regexec regex_ptr cstr 0 nullPtr flags if r == retOk then return (Right True) else if r == retNoMatch then return (Right False) else wrapError r regex_ptr --------- wrapMatch regex@(Regex regex_fptr compileOptions flags) cstr = do nullTest cstr "wrapMatch cstr" $ do if (0 /= compNoSub .&. compileOptions) then do r <- wrapTest regex cstr case r of Right True -> return (Right (Just [])) -- Source of much "wtf?" 
crap Right False -> return (Right Nothing) Left err -> return (Left err) else do withForeignPtr regex_fptr $ \regex_ptr -> do nsub <- (#peek regex_t, re_nsub) regex_ptr :: IO CSize let nsub_int,nsub_bytes :: Int nsub_int = fromIntegral nsub nsub_bytes = ((1 + nsub_int) * (#const sizeof(regmatch_t))) -- add one because index zero covers the whole match allocaBytes nsub_bytes $ \p_match -> do nullTest p_match "wrapMatch allocaBytes" $ do doMatch regex_ptr cstr nsub p_match flags -- Very very thin wrapper -- Requires, but does not check, that nsub>=0 -- Cannot return (Right (Just [])) doMatch :: Ptr CRegex -> CString -> CSize -> Ptr CRegMatch -> ExecOption -> IO (Either WrapError (Maybe [(RegOffset,RegOffset)])) {-# INLINE doMatch #-} doMatch regex_ptr cstr nsub p_match flags = do r <- c_regexec regex_ptr cstr (1 + nsub) p_match flags if r == retOk then do regions <- mapM getOffsets . take (1+fromIntegral nsub) . iterate (`plusPtr` (#const sizeof(regmatch_t))) $ p_match return (Right (Just regions)) -- regions will not be [] else if r == retNoMatch then return (Right Nothing) else wrapError r regex_ptr where getOffsets :: Ptr CRegMatch -> IO (RegOffset,RegOffset) {-# INLINE getOffsets #-} getOffsets pmatch' = do start <- (#peek regmatch_t, rm_so) pmatch' :: IO (#type regoff_t) end <- (#peek regmatch_t, rm_eo) pmatch' :: IO (#type regoff_t) return (fromIntegral start,fromIntegral end) wrapMatchAll regex@(Regex regex_fptr compileOptions flags) cstr = do nullTest cstr "wrapMatchAll cstr" $ do if (0 /= compNoSub .&. compileOptions) then do r <- wrapTest regex cstr case r of Right True -> return (Right [(toMA 0 [])]) -- Source of much "wtf?" 
crap Right False -> return (Right []) Left err -> return (Left err) else do withForeignPtr regex_fptr $ \regex_ptr -> do nsub <- (#peek regex_t, re_nsub) regex_ptr :: IO CSize let nsub_int,nsub_bytes :: Int nsub_int = fromIntegral nsub nsub_bytes = ((1 + nsub_int) * (#const sizeof(regmatch_t))) -- add one because index zero covers the whole match allocaBytes nsub_bytes $ \p_match -> do nullTest p_match "wrapMatchAll p_match" $ do let flagsBOL = (complement execNotBOL) .&. flags flagsMIDDLE = execNotBOL .|. flags atBOL pos = doMatch regex_ptr (plusPtr cstr pos) nsub p_match flagsBOL atMIDDLE pos = doMatch regex_ptr (plusPtr cstr pos) nsub p_match flagsMIDDLE loop acc old (s,e) | acc `seq` old `seq` False = undefined | s == e = do let pos = old + fromIntegral e -- pos may be 0 atEnd <- isNull cstr pos if atEnd then return (Right (acc [])) else loop acc old (s,succ e) | otherwise = do let pos = old + fromIntegral e -- pos must be greater than 0 (tricky but true) prev'newline <- isNewline cstr (pred pos) -- safe result <- if prev'newline then atBOL pos else atMIDDLE pos case result of Right Nothing -> return (Right (acc [])) Right (Just parts@(whole:_)) -> let ma = toMA pos parts in loop (acc.(ma:)) pos whole Left err -> return (Left err) Right (Just []) -> return (Right (acc [(toMA pos [])])) -- should never happen result <- doMatch regex_ptr cstr nsub p_match flags case result of Right Nothing -> return (Right []) Right (Just parts@(whole:_)) -> let ma = toMA 0 parts in loop (ma:) 0 whole Left err -> return (Left err) Right (Just []) -> return (Right [(toMA 0 [])]) -- should never happen where toMA :: Int -> [(RegOffset,RegOffset)] -> Array Int (Int,Int) toMA pos [] = listArray (0,0) [(pos,0)] -- wtf? toMA pos parts = listArray (0,pred (length parts)) . 
map (\(s,e)-> if s>=0 then (pos+fromIntegral s, fromIntegral (e-s)) else (-1,0)) $ parts --------- wrapCount regex@(Regex regex_fptr compileOptions flags) cstr = do nullTest cstr "wrapCount cstr" $ do if (0 /= compNoSub .&. compileOptions) then do r <- wrapTest regex cstr case r of Right True -> return (Right 1) Right False -> return (Right 0) Left err -> return (Left err) else do withForeignPtr regex_fptr $ \regex_ptr -> do let nsub_bytes = (#size regmatch_t) allocaBytes nsub_bytes $ \p_match -> do nullTest p_match "wrapCount p_match" $ do let flagsBOL = (complement execNotBOL) .&. flags flagsMIDDLE = execNotBOL .|. flags atBOL pos = doMatch regex_ptr (plusPtr cstr pos) 0 p_match flagsBOL atMIDDLE pos = doMatch regex_ptr (plusPtr cstr pos) 0 p_match flagsMIDDLE loop acc old (s,e) | acc `seq` old `seq` False = undefined | s == e = do let pos = old + fromIntegral e -- 0 <= pos atEnd <- isNull cstr pos if atEnd then return (Right acc) else loop acc old (s,succ e) | otherwise = do let pos = old + fromIntegral e -- 0 < pos prev'newline <- isNewline cstr (pred pos) -- safe result <- if prev'newline then atBOL pos else atMIDDLE pos case result of Right Nothing -> return (Right acc) Right (Just (whole:_)) -> loop (succ acc) pos whole Left err -> return (Left err) Right (Just []) -> return (Right acc) -- should never happen result <- doMatch regex_ptr cstr 0 p_match flags case result of Right Nothing -> return (Right 0) Right (Just (whole:_)) -> loop 1 0 whole Left err -> return (Left err) Right (Just []) -> return (Right 0) -- should never happen {- -- This is the slower 0.66 version of the code (91s instead of 79s on 10^6 bytes) wrapMatchAll regex cstr = do let regex' = setExecOpts (execNotBOL .|. (getExecOpts regex)) regex at pos = wrapMatch regex' (plusPtr cstr pos) loop old (s,e) | s == e = return [] | otherwise = do let pos = old + fromIntegral e result <- at pos case unwrap result of Nothing -> return [] Just [] -> return ((toMA pos []):[]) -- wtf? 
Just parts@(whole:_) -> do rest <- loop pos whole return ((toMA pos parts) : rest) result <- wrapMatch regex cstr case unwrap result of Nothing -> return [] Just [] -> return ((toMA 0 []):[]) -- wtf? Just parts@(whole:_) -> do rest <- loop 0 whole return ((toMA 0 parts) : rest) --------- -- This was also changed to match wrapMatchAll after 0.66 wrapCount regex cstr = do let regex' = setExecOpts (execNotBOL .|. (getExecOpts regex)) regex at pos = wrapMatch regex' (plusPtr cstr pos) loop acc old (s,e) | acc `seq` old `seq` False = undefined | s == e = return acc | otherwise = do let pos = old + fromIntegral e result <- at pos case unwrap result of Nothing -> return acc Just [] -> return (succ acc) -- wtf? Just (whole:_) -> loop (succ acc) pos whole result <- wrapMatch regex cstr case unwrap result of Nothing -> return 0 Just [] -> return 1 -- wtf? Just (whole:_) -> loop 1 0 whole -} regex-posix-0.95.2/Text/Regex/Posix/ByteString/0000755000000000000000000000000011756216545017455 5ustar0000000000000000regex-posix-0.95.2/Text/Regex/Posix/ByteString/Lazy.hs0000644000000000000000000001320411756216545020730 0ustar0000000000000000{-# OPTIONS_GHC -fno-warn-orphans #-} ----------------------------------------------------------------------------- -- | -- Module : Text.Regex.Posix.ByteString.Lazy -- Copyright : (c) Chris Kuklewicz 2007 -- License : BSD-style (see the file LICENSE) -- -- Maintainer : libraries@haskell.org, textregexlazy@personal.mightyreason.com -- Stability : experimental -- Portability : non-portable (regex-base needs MPTC+FD) -- -- This provides 'ByteString.Lazy' instances for RegexMaker and RegexLike -- based on "Text.Regex.Posix.Wrap", and a (RegexContext Regex -- ByteString ByteString) instance. -- -- To use these instance, you would normally import -- "Text.Regex.Posix". You only need to import this module to use -- the medium level API of the compile, regexec, and execute -- functions. 
-- All of these report errors by returning Left values
-- instead of undefined or error or fail.
--
-- A Lazy ByteString with more than one chunk cannot be passed to
-- the library efficiently (as a pointer).  It will have to be converted
-- via a full copy to a temporary normal bytestring (with a null byte
-- appended if necessary).
-----------------------------------------------------------------------------
module Text.Regex.Posix.ByteString.Lazy(
  -- ** Types
  Regex,
  MatchOffset,
  MatchLength,
  ReturnCode,
  WrapError,
  -- ** Miscellaneous
  unusedOffset,
  -- ** Medium level API functions
  compile,
  execute,
  regexec,
  -- ** Compilation options
  CompOption(CompOption),
  compBlank,
  compExtended,   -- use extended regex syntax
  compIgnoreCase, -- ignore case when matching
  compNoSub,      -- no substring matching needed
  compNewline,    -- '.' doesn't match newline
  -- ** Execution options
  ExecOption(ExecOption),
  execBlank,
  execNotBOL,     -- not at beginning of line
  execNotEOL      -- not at end of line
  ) where

import Data.Array(Array)
import qualified Data.ByteString.Lazy as L (ByteString,null,toChunks,fromChunks,last,snoc,init)
import qualified Data.ByteString as B(ByteString,concat)
#ifdef SPLIT_BASE
import qualified Data.ByteString.Unsafe as B(unsafeUseAsCString)
#else
import qualified Data.ByteString.Base as B(unsafeUseAsCString)
#endif
import System.IO.Unsafe(unsafePerformIO)
import Text.Regex.Base.RegexLike(RegexMaker(..),RegexContext(..),RegexLike(..),MatchOffset,MatchLength)
import Text.Regex.Posix.Wrap -- all
import qualified Text.Regex.Posix.ByteString as BS(execute,regexec)
import Text.Regex.Base.Impl(polymatch,polymatchM)
import Foreign.C.String(CString)

instance RegexContext Regex L.ByteString L.ByteString where
  match = polymatch
  matchM = polymatchM

-- | Flatten a lazy ByteString into one strict ByteString by
-- concatenating its chunks.
fromLazy :: L.ByteString -> B.ByteString
fromLazy = B.concat . L.toChunks

-- | Wrap a strict ByteString as a lazy ByteString built from that
-- single chunk.
toLazy :: B.ByteString -> L.ByteString
toLazy strict = L.fromChunks [strict]

-- | Turn a 'Left' into a call to 'fail' (with the shown error appended
-- to a fixed prefix) and a 'Right' into its payload.
unwrap :: (Show e) => Either e v -> IO v
unwrap x = case x of
  Left err -> fail ("Text.Regex.Posix.ByteString.Lazy died: "++ show err)
  Right v -> return v

-- | True when the string is non-empty and its last byte is 0.
endsWithNul :: L.ByteString -> Bool
endsWithNul s = not (L.null s) && (0 == L.last s)

-- | Strict copy of the string that always ends in a 0 byte: the input
-- is used as it is when it already ends in 0, otherwise a single 0 byte
-- is appended first.  Shared by 'asCString', 'execute' and 'regexec',
-- which all need the same padding rule.
nulTerminated :: L.ByteString -> B.ByteString
nulTerminated s
  | endsWithNul s = fromLazy s
  | otherwise     = fromLazy (L.snoc s 0)

-- | Run an action on a pointer to the NUL-terminated strict copy of
-- the string.
{-# INLINE asCString #-}
asCString :: L.ByteString -> (CString -> IO a) -> IO a
asCString s = B.unsafeUseAsCString (nulTerminated s)

instance RegexMaker Regex CompOption ExecOption L.ByteString where
  -- compile failures surface through 'unwrap', i.e. via 'fail' in IO
  makeRegexOpts c e pattern = unsafePerformIO $ compile c e pattern >>= unwrap
  -- compile failures surface through 'fail' in the caller's monad
  makeRegexOptsM c e pattern = either (fail.show) return $ unsafePerformIO $ compile c e pattern

instance RegexLike Regex L.ByteString where
  -- every method runs the IO-level function and unwraps its Either
  matchTest regex bs = unsafePerformIO $ asCString bs (wrapTest regex) >>= unwrap
  matchOnce regex bs = unsafePerformIO $ execute regex bs >>= unwrap
  matchAll regex bs = unsafePerformIO $ asCString bs (wrapMatchAll regex) >>= unwrap
  matchCount regex bs = unsafePerformIO $ asCString bs (wrapCount regex) >>= unwrap

-- ---------------------------------------------------------------------
-- | Compiles a regular expression
--
compile :: CompOption   -- ^ Flags (summed together)
        -> ExecOption   -- ^ Flags (summed together)
        -> L.ByteString -- ^ The regular expression to compile
        -> IO (Either WrapError Regex) -- ^ Returns: the compiled regular expression
compile c e pattern = asCString pattern (wrapCompile c e)

-- ---------------------------------------------------------------------
-- | Matches a regular expression against a buffer, returning the buffer
-- indices of the match, and any submatches
--
execute :: Regex        -- ^ Compiled regular expression
        -> L.ByteString -- ^ String to match against
        -> IO (Either WrapError (Maybe (Array Int (MatchOffset,MatchLength))))
                -- ^ Returns: 'Nothing' if the regex did not match the
                -- string, or:
                --   'Just' an array of (offset,length) pairs where index 0 is whole match, and the rest are the captured subexpressions.
execute regex bs = BS.execute regex (nulTerminated bs)

-- | Matches a regular expression against a string and hands back the
-- pieces as lazy ByteStrings in the order the strict 'BS.regexec'
-- yields them: three strings followed by a list of strings
-- (presumably text before the match, the match, text after the match,
-- and the captured subexpressions -- confirm against
-- "Text.Regex.Posix.ByteString").
--
-- The string given to 'BS.regexec' may carry a 0 byte that this module
-- appended.  NOTE(review): 'BS.regexec' appears to cut its results out
-- of the buffer it receives, which would leave that extra byte at the
-- end of the third component.  It is removed again here so the caller
-- never sees a byte that was not in the input.
regexec :: Regex        -- ^ Compiled regular expression
        -> L.ByteString -- ^ String to match against
        -> IO (Either WrapError (Maybe (L.ByteString, L.ByteString, L.ByteString, [L.ByteString])))
regexec regex bs = do
  found <- BS.regexec regex (nulTerminated bs)
  return $ case found of
    Left e -> Left e
    Right Nothing -> Right Nothing
    Right (Just (before,matched,after,subs)) ->
      Right (Just (toLazy before, toLazy matched, dropPadding (toLazy after), map toLazy subs))
  where
    -- Strip only when a 0 byte was appended (the input did not end in
    -- one) and the remainder really ends in 0.  A suffix of the caller's
    -- own data cannot end in 0 in that situation, so nothing the caller
    -- supplied is ever dropped; if the padding is not there this is a
    -- no-op.
    dropPadding rest
      | not (endsWithNul bs) && endsWithNul rest = L.init rest
      | otherwise = rest

-- | 'unusedRegOffset' converted to 'Int'.
unusedOffset :: Int
unusedOffset = fromIntegral unusedRegOffset