hxt-regex-xmlschema-9.0.4/0000755000000000000000000000000011701313414013577 5ustar0000000000000000hxt-regex-xmlschema-9.0.4/Setup.hs0000644000000000000000000000011011701313414015223 0ustar0000000000000000#!/usr/bin/env runhaskell import Distribution.Simple main = defaultMain hxt-regex-xmlschema-9.0.4/LICENSE0000644000000000000000000000212011701313414014577 0ustar0000000000000000The MIT License Copyright (c) 2005 Uwe Schmidt, Martin Schmidt, Torben Kuseler Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. hxt-regex-xmlschema-9.0.4/hxt-regex-xmlschema.cabal0000644000000000000000000000413411701313414020457 0ustar0000000000000000Name: hxt-regex-xmlschema Version: 9.0.4 Synopsis: A regular expression library for W3C XML Schema regular expressions Description: This library supports full W3C XML Schema regular expressions inclusive all Unicode character sets and blocks. The complete grammar can be found under . It is implemented by the technique of derivations of regular expressions. The W3C syntax is extended to support not only union of regular sets, but also intersection, set difference, exor. Matching of subexpressions is also supported. The library can be used for constricting lightweight scanners and tokenizers. It is a standalone library, no external regex libraries are used. This package is a substitute for the old regex-xmlschema package. 
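-- A few example uses of the main module Text.Regex.XMLSchema.String,
-- copied from its Haddock documentation, shown here only as an illustration:
--
-- > match "x*" "xxx"                               = True
-- > tokenize "a*b" "abaab ab"                      = ["ab","aab","ab"]
-- > matchSubex "({w}[0-9]+)x({h}[0-9]+)" "800x600" = [("w","800"),("h","600")]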
License: MIT License-file: LICENSE Author: Uwe Schmidt Maintainer: Uwe Schmidt Copyright: Copyright (c) 2010- Uwe Schmidt Stability: stable Category: Text Homepage: http://www.haskell.org/haskellwiki/Regular_expressions_for_XML_Schema Build-type: Simple Cabal-version: >=1.6 extra-source-files: test/Main.hs test/Makefile test/.ghci examples/colorizeProgs/ColorizeSourceCode.hs examples/colorizeProgs/Makefile examples/performance/REtest.hs examples/performance/Makefile examples/RegexXMLSchema/Makefile examples/RegexXMLSchema/REtest.hs Library Exposed-modules: Text.Regex.XMLSchema.String.RegexParser Text.Regex.XMLSchema.String.Regex Text.Regex.XMLSchema.String Text.Regex.Glob.String.RegexParser Text.Regex.Glob.String hs-source-dirs: src ghc-options: -Wall ghc-prof-options: -auto-all -caf-all build-depends: base >= 4 && < 5, parsec >= 2.1 && < 4, hxt-charproperties >= 9 && < 10 hxt-regex-xmlschema-9.0.4/src/0000755000000000000000000000000011701313414014366 5ustar0000000000000000hxt-regex-xmlschema-9.0.4/src/Text/0000755000000000000000000000000011701313414015312 5ustar0000000000000000hxt-regex-xmlschema-9.0.4/src/Text/Regex/0000755000000000000000000000000011701313414016364 5ustar0000000000000000hxt-regex-xmlschema-9.0.4/src/Text/Regex/XMLSchema/0000755000000000000000000000000011701313414020145 5ustar0000000000000000hxt-regex-xmlschema-9.0.4/src/Text/Regex/XMLSchema/String.hs0000644000000000000000000004044311701313414021754 0ustar0000000000000000-- ------------------------------------------------------------ {- | Module : Text.Regex.XMLSchema.String Copyright : Copyright (C) 2010- Uwe Schmidt License : MIT Maintainer : Uwe Schmidt Stability : experimental Portability: portable Convenient functions for W3C XML Schema Regular Expression Matcher. For internals see 'Text.Regex.XMLSchema.String.Regex' Grammar can be found under -} -- ------------------------------------------------------------ module Text.Regex.XMLSchema.String ( GenRegex , Regex , grep , grepExt , match , matchExt , matchSubex , sed , sedExt , split , splitExt , splitSubex , tokenize , tokenizeExt , tokenize' , tokenizeExt' , tokenizeSubex , matchRE , matchSubexRE , sedRE , splitRE , splitSubexRE , tokenizeRE , tokenizeRE' , tokenizeSubexRE , mkZero , mkUnit , mkSym1 , mkSymRng , mkWord , mkDot , mkStar , mkAll , mkAlt , mkElse , mkSeq , mkSeqs , mkRep , mkRng , mkOpt , mkDiff , mkIsect , mkExor , mkCompl , mkBr , isZero , errRegex , parseRegex -- re-export of Text.Regex.XMLSchema.String.RegexParser , parseRegexExt ) where import Control.Arrow import Data.List import Data.Maybe import Text.Regex.XMLSchema.String.Regex import Text.Regex.XMLSchema.String.RegexParser -- ------------------------------------------------------------ -- | split a string by taking the longest prefix matching a regular expression -- -- @Nothing@ is returned in case there is no matching prefix, -- else the pair of prefix and rest is returned splitRE :: (Eq l, Show l) => GenRegex l -> String -> Maybe (String, String) splitRE re input = do (sms, rest) <- splitWithRegex re input return (snd . 
head $ sms, rest) -- | convenient function for 'splitRE' -- -- examples: -- -- > split "a*b" "abc" = ("ab","c") -- > split "a*" "bc" = ("", "bc") -- "a*" matches "" -- > split "a+" "bc" = ("", "bc") -- "a+" does not match, no split -- > split "[" "abc" = ("", "abc") -- "[" syntax error, no split split :: String -> String -> (String, String) split = split' parseRegex -- | split with extended syntax splitExt :: String -> String -> (String, String) splitExt = split' parseRegexExt split' :: (String -> Regex) -> String -> String -> (String, String) split' parseRegex' re input = fromMaybe ("", input) . (splitRE . parseRegex' $ re) $ input -- ------------------------------------------------------------ -- | split a string by removing the longest prefix matching a regular expression -- and then return the list of subexpressions found in the matching part -- -- @Nothing@ is returned in case of no matching prefix, -- else the list of pairs of labels and submatches and the -- rest is returned splitSubexRE :: (Eq l, Show l) => GenRegex l -> String -> Maybe ([(l, String)], String) splitSubexRE re input = do (sms, rest) <- splitWithRegex re input return (map (first fromJust) . drop 1 $ sms, rest) -- | convenient function for 'splitSubex', uses extended syntax -- -- examples: -- -- > splitSubex "({1}a*)b" "abc" = ([("1","a")],"c") -- > splitSubex "({2}a*)" "bc" = ([("2","")], "bc") -- > splitSubex "({1}a|b)+" "abc" = ([("1","a"),("1","b")],"c") -- subex 1 matches 2 times -- > -- > splitSubex ".*({x}a*)" "aa" = ([("x",""),("x","a"),("x","aa")],"") -- > -- nondeterminism: 3 matches for a* -- > -- > splitSubex "({1}do)|({2}[a-z]+)" "do you know" -- > = ([("1","do"),("2","do")]," you know") -- > -- nondeterminism: 2 matches for do -- > -- > splitSubex "({1}do){|}({2}[a-z]+)" "do you know" -- > = ([("1","do")]," you know") -- > -- no nondeterminism with {|}: 1. match for do -- > -- > splitSubex "({1}a+)" "bcd" = ([], "bcd") -- no match -- > splitSubex "[" "abc" = ([], "abc") -- syntax error splitSubex :: String -> String -> ([(String,String)], String) splitSubex re inp = fromMaybe ([], inp) . (splitSubexRE . parseRegexExt $ re) $ inp -- ------------------------------------------------------------ -- | The function, that does the real work for 'tokenize' tokenizeRE :: (Eq l, Show l) => GenRegex l -> String -> [String] tokenizeRE re = token'' where re1 = mkDiff re mkUnit token'' = token' re fcs token1'' = token' re1 fcs fcs = firstChars re -- token' :: (Eq l, Show l) => GenRegex l -> CharSet -> String -> [String] token' re' fcs' inp | null inp = [] | otherwise = evalRes . splitWithRegexCS re' fcs' $ inp where evalRes Nothing = token'' (tail inp) -- re does not match any prefix evalRes (Just (toks, rest)) | null tok = tok : token'' (tail rest) -- re is nullable and only the empty prefix matches -- discard one char and try again | otherwise = tok : token1'' rest -- real token found, next token must not be empty where tok = snd . head $ toks -- | split a string into tokens (words) by giving a regular expression -- which all tokens must match. -- -- Convenient function for 'tokenizeRE' -- -- This can be used for simple tokenizers. -- It is recommended to use regular expressions where the empty word does not match. -- Else there will appear a lot of probably useless empty tokens in the output. -- All none matching chars are discarded. 
If the given regex contains syntax errors, -- @Nothing@ is returned -- -- examples: -- -- > tokenize "a" "aabba" = ["a","a","a"] -- > tokenize "a*" "aaaba" = ["aaa","a"] -- > tokenize "a*" "bbb" = ["","",""] -- > tokenize "a+" "bbb" = [] -- > -- > tokenize "a*b" "" = [] -- > tokenize "a*b" "abc" = ["ab"] -- > tokenize "a*b" "abaab ab" = ["ab","aab","ab"] -- > -- > tokenize "[a-z]{2,}|[0-9]{2,}|[0-9]+[.][0-9]+" "ab123 456.7abc" -- > = ["ab","123","456.7","abc"] -- > -- > tokenize "[a-z]*|[0-9]{2,}|[0-9]+[.][0-9]+" "cab123 456.7abc" -- > = ["cab","123","456.7","abc"] -- > -- > tokenize "[^ \t\n\r]*" "abc def\t\n\rxyz" -- > = ["abc","def","xyz"] -- > -- > tokenize ".*" "\nabc\n123\n\nxyz\n" -- > = ["","abc","123","","xyz"] -- > -- > tokenize ".*" = lines -- > -- > tokenize "[^ \t\n\r]*" = words tokenize :: String -> String -> [String] tokenize = tokenizeRE . parseRegex -- | tokenize with extended syntax tokenizeExt :: String -> String -> [String] tokenizeExt = tokenizeRE . parseRegexExt -- ------------------------------------------------------------ -- | split a string into tokens and delimierter by giving a regular expression -- wich all tokens must match -- -- This is a generalisation of the above 'tokenizeRE' functions. -- The none matching char sequences are marked with @Left@, the matching ones are marked with @Right@ -- -- If the regular expression contains syntax errors @Nothing@ is returned -- -- The following Law holds: -- -- > concat . map (either id id) . tokenizeRE' re == id tokenizeRE' :: (Eq l, Show l) => GenRegex l -> String -> [Either String String] tokenizeRE' re = token'' "" where re1 = mkDiff re mkUnit token'' = token' re fcs token1'' = token' re1 fcs fcs = firstChars re -- token' :: (Eq l, Show l) => GenRegex l -> CharSet -> String -> String -> [Either String String] token' re' fcs' unmatched inp | null inp = addUnmatched [] | otherwise = evalRes . splitWithRegexCS re' fcs' $ inp where addUnmatched | null unmatched = id | otherwise = ((Left . reverse $ unmatched) :) addMatched t = addUnmatched . ((Right t) :) evalRes Nothing = token'' ((head inp) : unmatched) (tail inp) -- re does not match any prefix evalRes (Just (toks, rest)) | null tok = addMatched tok $ token'' (take 1 rest) (tail rest) -- re is nullable and only the empty prefix matches -- discard one char and try again | otherwise = addMatched tok $ token1'' "" rest -- real token found, next token must not be empty where tok = snd . head $ toks -- | convenient function for 'tokenizeRE'' -- -- When the regular expression parses as Zero, @[Left input]@ is returned, that means no tokens are found tokenize' :: String -> String -> [Either String String] tokenize' = tokenizeRE' . parseRegex tokenizeExt' :: String -> String -> [Either String String] tokenizeExt' = tokenizeRE' . parseRegexExt -- ------------------------------------------------------------ -- | split a string into tokens (pair of labels and words) by giving a regular expression -- containing labeled subexpressions. -- -- This function should not be called with regular expressions -- without any labeled subexpressions. This does not make sense, because the result list -- will always be empty. -- -- Result is the list of matching subexpressions -- This can be used for simple tokenizers. -- At least one char is consumed by parsing a token. -- The pairs in the result list contain the matching substrings. -- All none matching chars are discarded. 
If the given regex contains syntax errors, -- @Nothing@ is returned tokenizeSubexRE :: (Eq l, Show l) => GenRegex l -> String -> [(l, String)] tokenizeSubexRE re = token'' where re1 = mkDiff re mkUnit token'' = token' re fcs token1'' = token' re1 fcs fcs = firstChars re -- token' :: (Eq l, Show l) => GenRegex l -> CharSet -> String -> [(l,String)] token' re' fcs' inp | null inp = [] | otherwise = evalRes . splitWithRegexCS re' fcs' $ inp where evalRes Nothing = token'' (tail inp) -- re does not match any prefix evalRes (Just (toks, rest)) | null tok = res ++ token'' (tail rest) -- re is nullable and only the empty prefix matches | otherwise = res ++ token1'' rest -- token found, tokenize the rest where res = map (first fromJust) . tail $ toks tok = snd . head $ toks -- | convenient function for 'tokenizeSubexRE' a string -- -- examples: -- -- > tokenizeSubex "({name}[a-z]+)|({num}[0-9]{2,})|({real}[0-9]+[.][0-9]+)" -- > "cab123 456.7abc" -- > = [("name","cab") -- > ,("num","123") -- > ,("real","456.7") -- > ,("name","abc")] -- > -- > tokenizeSubex "({real}({n}[0-9]+)([.]({f}[0-9]+))?)" -- > "12.34" = [("real","12.34") -- > ,("n","12") -- > ,("f","34")] -- > -- > tokenizeSubex "({real}({n}[0-9]+)([.]({f}[0-9]+))?)" -- > "12 34" = [("real","12"),("n","12") -- > ,("real","34"),("n","34")] -- > -- > tokenizeSubex "({real}({n}[0-9]+)(([.]({f}[0-9]+))|({f})))" -- > "12 34.56" = [("real","12"),("n","12"),("f","") -- > ,("real","34.56"),("n","34"),("f","56")] tokenizeSubex :: String -> String -> [(String,String)] tokenizeSubex = tokenizeSubexRE . parseRegexExt -- ------------------------------------------------------------ -- | sed like editing function -- -- All matching tokens are edited by the 1. argument, the editing function, -- all other chars remain as they are sedRE :: (Eq l, Show l) => (String -> String) -> GenRegex l -> String -> String sedRE edit re = concatMap (either id edit) . tokenizeRE' re -- | convenient function for 'sedRE' -- -- examples: -- -- > sed (const "b") "a" "xaxax" = "xbxbx" -- > sed (\ x -> x ++ x) "a" "xax" = "xaax" -- > sed undefined "[" "xxx" = "xxx" sed :: (String -> String) -> String -> String -> String sed edit = sedRE edit . parseRegex sedExt :: (String -> String) -> String -> String -> String sedExt edit = sedRE edit . parseRegexExt -- ------------------------------------------------------------ -- | match a string with a regular expression matchRE :: (Eq l, Show l) => GenRegex l -> String -> Bool matchRE = matchWithRegex -- | convenient function for 'matchRE' -- -- Examples: -- -- > match "x*" "xxx" = True -- > match "x" "xxx" = False -- > match "[" "xxx" = False match :: String -> String -> Bool match = matchWithRegex . parseRegex -- | match with extended regular expressions matchExt :: String -> String -> Bool matchExt = matchWithRegex . parseRegexExt -- ------------------------------------------------------------ -- | match a string with a regular expression -- and extract subexpression matches matchSubexRE :: (Eq l, Show l) => GenRegex l -> String -> [(l, String)] matchSubexRE re = map (first fromJust) . fromMaybe [] . matchWithRegex' re -- | convenient function for 'matchRE' -- -- Examples: -- -- > matchSubex "({1}x*)" "xxx" = [("1","xxx")] -- > matchSubex "({1}x*)" "y" = [] -- > matchSubex "({w}[0-9]+)x({h}[0-9]+)" "800x600" = [("w","800"),("h","600")] -- > matchSubex "[" "xxx" = [] matchSubex :: String -> String -> [(String, String)] matchSubex = matchSubexRE . 
parseRegexExt -- ------------------------------------------------------------ -- | grep like filter for lists of strings -- -- The regular expression may be prefixed with the usual context spec \"^\" for start of string, -- and "\\<" for start of word. -- and suffixed with \"$\" for end of text and "\\>" end of word. -- Word chars are defined by the multi char escape sequence "\\w" -- -- Examples -- -- > grep "a" ["_a_", "_a", "a_", "a", "_"] => ["_a_", "_a", "a_", "a"] -- > grep "^a" ["_a_", "_a", "a_", "a", "_"] => ["a_", "a"] -- > grep "a$" ["_a_", "_a", "a_", "a", "_"] => ["_a", "a"] -- > grep "^a$" ["_a_", "_a", "a_", "a", "_"] => ["a"] -- > grep "\\ ["x a b", " ax "] -- > grep "a\\>" ["x a b", " ax ", " xa ", "xab"] => ["x a b", " xa "] grep :: String -> [String] -> [String] grep = grep' parseRegex -- | grep with extended regular expressions grepExt :: String -> [String] -> [String] grepExt = grep' parseRegexExt grep' :: (String -> Regex) -> String -> [String] -> [String] grep' parseRegex' re = filter (matchRE re') where re' = mkSeqs . concat $ [ startContext , (:[]) . parseRegex' $ re2 , endContext ] (startContext, re1) | "^" `isPrefixOf` re = ([], tail re) | "\\<" `isPrefixOf` re = ([parseRegexExt "(\\A\\W)?"], drop 2 re) | otherwise = ([mkStar mkDot], re) (endContext, re2) | "$" `isSuffixOf` re1 = ([], init re1) | "\\>" `isSuffixOf` re1 = ([parseRegexExt "(\\W\\A)?"], init . init $ re1) | otherwise = ([mkStar mkDot], re1) -- ------------------------------------------------------------ hxt-regex-xmlschema-9.0.4/src/Text/Regex/XMLSchema/String/0000755000000000000000000000000011701313414021413 5ustar0000000000000000hxt-regex-xmlschema-9.0.4/src/Text/Regex/XMLSchema/String/Regex.hs0000644000000000000000000006754511701313414023042 0ustar0000000000000000-- ------------------------------------------------------------ {- | Module : Text.Regex.XMLSchema.String.Regex Copyright : Copyright (C) 2010 - Uwe Schmidt License : MIT Maintainer : Uwe Schmidt Stability : stable Portability: portable W3C XML Schema Regular Expression Matcher Grammar can be found under -} -- ------------------------------------------------------------ module Text.Regex.XMLSchema.String.Regex ( Regex , GenRegex , mkZero , mkUnit , mkSym , mkSym1 , mkSymRng , mkWord , mkDot , mkStar , mkAll , mkAlt , mkElse , mkSeq , mkSeqs , mkRep , mkRng , mkOpt , mkDiff , mkIsect , mkExor , mkInterleave , mkCompl , mkBr , isZero , errRegex , nullable , nullable' , delta1 , delta , firstChars , matchWithRegex , matchWithRegex' , splitWithRegex , splitWithRegex' , splitWithRegexCS , splitWithRegexCS' ) where import Data.List -- ( intercalate ) import Data.Set.CharSet -- ------------------------------------------------------------ data GenRegex l = Zero String | Unit | Sym CharSet | Dot | Star (GenRegex l) | Alt (GenRegex l) (GenRegex l) | Else (GenRegex l) (GenRegex l) | Seq (GenRegex l) (GenRegex l) | Rep Int (GenRegex l) -- 1 or more repetitions | Rng Int Int (GenRegex l) -- n..m repetitions | Diff (GenRegex l) (GenRegex l) -- r1 - r2 | Isec (GenRegex l) (GenRegex l) -- r1 n r2 | Exor (GenRegex l) (GenRegex l) -- r1 xor r2 | Intl (GenRegex l) (GenRegex l) -- r1 interleavedWith r2 | Br (Label l) (GenRegex l) String -- currently parsed (...) | Cbr (GenRegex l) [(Label l, String)] --already completely parsed (...) 
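-- Note: in 'Br' the last field accumulates the characters matched so far
-- for the labeled subexpression in reverse order (see 'mkBr0' and 'delta1');
-- 'Cbr' carries the (label, match) pairs of subexpressions that are already
-- completely matched, together with the remaining regex. The label 'Nothing'
-- is reserved for the implicit whole-match bracket used by 'splitWithRegex'.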
deriving (Eq, Ord {-, Show -}) type Regex = GenRegex String type Label l = Maybe l -- we need one special label for the whole expression -- see splitWithRegex type Nullable l = (Bool, [(Label l, String)]) -- ------------------------------------------------------------ {- just for documentation class Inv a where inv :: a -> Bool instance Inv (GenRegex l) where inv (Zero _) = True inv Unit = True inv (Sym p) = not (nulCS p) && not (fullCS p) inv Dot = True inv (Star e) = inv e inv (Alt e1 e2) = inv e1 && inv e2 inv (Seq e1 e2) = inv e1 && inv e2 inv (Rep i e) = i > 0 && inv e inv (Rng i j e) = (i < j || (i == j && i > 1)) && inv e inv (Diff e1 e2) = inv e1 && inv e2 inv (Isec e1 e2) = inv e1 && inv e2 inv (Exor e1 e2) = inv e1 && inv e2 -} -- ------------------------------------------------------------ -- -- smart constructors -- | construct the r.e. for the empty set. -- An (error-) message may be attached mkZero :: String -> GenRegex l mkZero = Zero {-# INLINE mkZero #-} -- | construct the r.e. for the set containing the empty word mkUnit :: GenRegex l mkUnit = Unit {-# INLINE mkUnit #-} -- | construct the r.e. for a set of chars mkSym :: CharSet -> GenRegex l mkSym s | nullCS s = mkZero $ "empty char range" | fullCS s = mkDot | otherwise = Sym s {-# INLINE mkSym #-} -- | construct an r.e. for a single char set mkSym1 :: Char -> GenRegex l mkSym1 = mkSym . singleCS {-# INLINE mkSym1 #-} -- | construct an r.e. for an intervall of chars mkSymRng :: Char -> Char -> GenRegex l mkSymRng c1 c2 = mkSym $ rangeCS c1 c2 {-# INLINE mkSymRng #-} -- | mkSym generaized for strings mkWord :: [Char] -> GenRegex l mkWord = mkSeqs . map mkSym1 -- | construct an r.e. for the set of all Unicode chars mkDot :: GenRegex l mkDot = Dot {-# INLINE mkDot #-} -- | construct an r.e. for the set of all Unicode words mkAll :: Eq l => GenRegex l mkAll = mkStar mkDot {-# INLINE mkAll #-} -- | construct r.e. for r* mkStar :: Eq l => GenRegex l -> GenRegex l mkStar (Zero _) = mkUnit -- {}* == () mkStar e@Unit = e -- ()* == () mkStar e@(Star _e1) = e -- (r*)* == r* mkStar (Rep 1 e1) = mkStar e1 -- (r+)* == r* mkStar (Rep i e1) | i == 1 || nullable e1 = mkStar e1 -- (r{i,})* == r* when i == 1 or nullable r mkStar e@(Rng _ _ e1) | nullable e = mkStar e1 -- (r{i,j})* == r* when i == 0 or nullable r mkStar e@(Alt _ _) = Star (rmStar e) -- (a*|b)* == (a|b)* {- this is wrong, not generally applicable mkStar (Br l r s) = mkBr0 l (mkStar r) s -- ({l}r)* == ({l}r*) because we want the longest match as result for the subexpression -} mkStar e = Star e rmStar :: Eq l => GenRegex l -> GenRegex l rmStar (Alt e1 e2) = mkAlt (rmStar e1) (rmStar e2) rmStar (Star e1) = rmStar e1 rmStar (Rep 1 e1) = rmStar e1 rmStar e1 = e1 -- | construct the r.e for r1|r2 mkAlt :: Eq l => GenRegex l -> GenRegex l -> GenRegex l mkAlt e1 (Zero _) = e1 -- e1 u {} = e1 mkAlt (Zero _) e2 = e2 -- {} u e2 = e2 mkAlt (Sym p1) (Sym p2) = mkSym $ p1 `unionCS` p2 -- melting of predicates mkAlt e1 e2@(Sym _) = mkAlt e2 e1 -- symmetry: predicates always first mkAlt e1@(Sym _) (Alt e2@(Sym _) e3) = mkAlt (mkAlt e1 e2) e3 -- prepare melting of predicates mkAlt (Sym _) e2@Dot = e2 -- c|. = . for a c's mkAlt e1@(Star Dot) _e2 = e1 -- A* u e1 = A* mkAlt _e1 e2@(Star Dot) = e2 -- e1 u A* = A* mkAlt (Alt e1 e2) e3 = mkAlt e1 (mkAlt e2 e3) -- associativity mkAlt e1 e2 | e1 == e2 = e1 | otherwise = Alt e1 e2 -- | construct the r.e. for r1{|}r2 (r1 orElse r2). -- -- This represents the same r.e. as r1|r2, but when -- collecting the results of subexpressions in (...) 
and r1 succeeds, the -- subexpressions of r2 are discarded, so r1 matches are prioritized -- -- example -- -- > splitSubex "({1}x)|({2}.)" "x" = ([("1","x"),("2","x")], "") -- > -- > splitSubex "({1}x){|}({2}.)" "x" = ([("1","x")], "") mkElse :: Eq l => GenRegex l -> GenRegex l -> GenRegex l mkElse e1 (Zero _) = e1 -- e1 u {} = e1 mkElse (Zero _) e2 = e2 -- {} u e2 = e2 mkElse (Sym p1) (Sym p2) = mkSym $ p1 `unionCS` p2 -- melting of predicates -- no symmetry allowed mkElse e1@(Sym _) (Else e2@(Sym _) e3) = mkElse (mkElse e1 e2) e3 -- prepare melting of predicates mkElse (Sym _) e2@Dot = e2 -- c|. = . for a c's mkElse e1@(Star Dot) _e2 = e1 -- A* u e1 = A* mkElse _e1 e2@(Star Dot) = e2 -- e1 u A* = A* mkElse (Else e1 e2) e3 = mkElse e1 (mkElse e2 e3) -- associativity mkElse e1 e2 | e1 == e2 = e1 | otherwise = Else e1 e2 -- | Construct the sequence r.e. r1.r2 mkSeq :: GenRegex l -> GenRegex l -> GenRegex l mkSeq e1@(Zero _) _e2 = e1 mkSeq _e1 e2@(Zero _) = e2 mkSeq Unit e2 = e2 mkSeq (Cbr e1 ss1) e2 = mkCbr (mkSeq e1 e2) ss1 -- move finished submatches upwards mkSeq e1 Unit = e1 mkSeq (Seq e1 e2) e3 = mkSeq e1 (mkSeq e2 e3) mkSeq e1 e2 = Seq e1 e2 -- | mkSeq extened to lists mkSeqs :: [GenRegex l] -> GenRegex l mkSeqs = foldr mkSeq mkUnit -- | Construct repetition r{i,} mkRep :: Eq l => Int -> GenRegex l -> GenRegex l mkRep 0 e = mkStar e mkRep _ e@(Zero _) = e mkRep _ e | nullable e = mkStar e mkRep i (Rep j e) = mkRep (i * j) e mkRep i e = Rep i e -- | Construct range r{i,j} mkRng :: Int -> Int -> GenRegex l -> GenRegex l mkRng 0 0 _e = mkUnit mkRng 1 1 e = e mkRng lb ub _e | lb > ub = Zero $ "illegal range " ++ show lb ++ ".." ++ show ub mkRng _l _u e@(Zero _) = e mkRng _l _u e@Unit = e mkRng lb ub e = Rng lb ub e -- | Construct option r? mkOpt :: GenRegex l -> GenRegex l mkOpt = mkRng 0 1 {-# INLINE mkOpt #-} -- | Construct difference r.e.: r1 {\\} r2 -- -- example -- -- > match "[a-z]+{\\}bush" "obama" = True -- > match "[a-z]+{\\}bush" "clinton" = True -- > match "[a-z]+{\\}bush" "bush" = False -- not important any more mkDiff :: Eq l => GenRegex l -> GenRegex l -> GenRegex l mkDiff e1@(Zero _) _e2 = e1 -- {} - r2 = {} mkDiff e1 (Zero _) = e1 -- r1 - {} = r1 mkDiff _e1 (Star Dot) = mkZero "empty set in difference expr" -- r1 - .* = {} mkDiff Dot (Sym p) = mkSym $ compCS p -- . - s = ~s mkDiff (Sym _) Dot = mkZero "empty set in difference expr" -- x - . = {} mkDiff (Sym p1) (Sym p2) = mkSym $ p1 `diffCS` p2 -- set diff mkDiff e1 e2 | e1 == e2 = mkZero "empty set in difference expr" -- r1 - r1 = {} | otherwise = Diff e1 e2 -- | Construct the Complement of an r.e.: whole set of words - r mkCompl :: Eq l => GenRegex l -> GenRegex l mkCompl (Zero _) = mkAll mkCompl (Star Dot) = mkZero "empty set in compl expr" mkCompl e = mkDiff (mkStar mkDot) e -- | Construct r.e. for intersection: r1 {&} r2 -- -- example -- -- > match ".*a.*{&}.*b.*" "-a-b-" = True -- > match ".*a.*{&}.*b.*" "-b-a-" = True -- > match ".*a.*{&}.*b.*" "-a-a-" = False -- > match ".*a.*{&}.*b.*" "---b-" = False mkIsect :: Eq l => GenRegex l -> GenRegex l -> GenRegex l mkIsect e1@(Zero _) _e2 = e1 -- {} n r2 = {} mkIsect _e1 e2@(Zero _) = e2 -- r1 n {} = {} mkIsect e1@(Unit) e2 -- () n r2 = () if nullable r2 | nullable e2 = e1 -- () n r2 = {} if not nullable r2 | otherwise = mkZero "intersection empty" mkIsect e1 e2@(Unit) = mkIsect e2 e1 -- symmetric version of las 2 laws mkIsect (Sym p1) (Sym p2) = mkSym $ p1 `intersectCS` p2 -- intersect sets mkIsect e1@(Sym _) Dot = e1 -- x n . = x mkIsect Dot e2@(Sym _) = e2 -- . 
n x = x mkIsect (Star Dot) e2 = e2 -- .* n r2 = r2 mkIsect e1 (Star Dot) = e1 -- r1 n .* = r1 mkIsect e1 e2 | e1 == e2 = e1 -- r1 n r1 = r1 | otherwise = Isec e1 e2 -- | Construct r.e. for exclusive or: r1 {^} r2 -- -- example -- -- > match "[a-c]+{^}[c-d]+" "abc" = True -- > match "[a-c]+{^}[c-d]+" "acdc" = False -- > match "[a-c]+{^}[c-d]+" "ccc" = False -- > match "[a-c]+{^}[c-d]+" "cdc" = True mkExor :: Eq l => GenRegex l -> GenRegex l -> GenRegex l mkExor (Zero _) e2 = e2 mkExor e1 (Zero _) = e1 mkExor (Star Dot) _e2 = mkZero "empty set in exor expr" mkExor _e1 (Star Dot) = mkZero "empty set in exor expr" mkExor (Sym p1) (Sym p2) = mkSym $ p1 `exorCS` p2 mkExor (Sym p1) Dot = mkSym $ compCS p1 mkExor Dot (Sym p2) = mkSym $ compCS p2 mkExor e1 e2 | e1 == e2 = mkZero "empty set in exor expr" -- r1 xor r1 = {} | otherwise = Exor e1 e2 mkInterleave :: GenRegex l -> GenRegex l -> GenRegex l mkInterleave e1@(Zero _) _ = e1 mkInterleave _ e2@(Zero _) = e2 mkInterleave (Unit) e2 = e2 mkInterleave e1 (Unit) = e1 mkInterleave e1 e2 = Intl e1 e2 mkBr0 :: Label l -> GenRegex l -> String -> GenRegex l mkBr0 _ e@(Zero _) _ = e mkBr0 l Unit s = mkCbr mkUnit [(l,reverse s)] mkBr0 l e s = Br l e s -- | Construct a labeled subexpression: ({label}r) mkBr :: l -> GenRegex l -> GenRegex l mkBr l e = mkBr0 (Just l) e "" mkBr' :: GenRegex l -> GenRegex l mkBr' e = mkBr0 Nothing e "" mkCbr :: GenRegex l -> [(Label l, String)] -> GenRegex l mkCbr e@(Zero _) _ = e -- dead end, throw away subexpr matches mkCbr (Cbr e ss1) ss = mkCbr e (ss ++ ss1) -- join inner and this subexpr match mkCbr e ss = Cbr e ss -- ------------------------------------------------------------ instance Show l => Show (GenRegex l) where show (Zero e) = "{" ++ e ++ "}" show Unit = "()" show (Sym p) | p == compCS (stringCS "\n\r") = "." | null (tail cs) && rng1 (head cs) = escRng . head $ cs | otherwise = "[" ++ concat cs' ++ "]" where rng1 (x,y) = x == y cs = p -- charRngs . chars $ p cs' = map escRng p escRng (x, y) | x == y = esc x | succ x == y = esc x ++ esc y | otherwise = esc x ++ "-" ++ esc y esc x | x `elem` "\\-[]{}()*+?.^" = '\\':x:"" | x >= ' ' && x <= '~' = x:"" | otherwise = "&#" ++ show (fromEnum x) ++ ";" show Dot = "\\a" show (Star Dot) = "\\A" show (Star e) = "(" ++ show e ++ "*)" show (Alt e1 e2) = "(" ++ show e1 ++ "|" ++ show e2 ++ ")" show (Else e1 e2) = "(" ++ show e1 ++ "{|}" ++ show e2 ++ ")" show (Seq e1 e2) = "(" ++ show e1 ++ show e2 ++ ")" show (Rep 1 e) = "(" ++ show e ++ "+)" show (Rep i e) = "(" ++ show e ++ "{" ++ show i ++ ",})" show (Rng 0 1 e) = "(" ++ show e ++ "?)" show (Rng i j e) = "(" ++ show e ++ "{" ++ show i ++ "," ++ show j ++ "})" show (Diff e1 e2) = "(" ++ show e1 ++ "{\\}" ++ show e2 ++ ")" show (Isec e1 e2) = "(" ++ show e1 ++ "{&}" ++ show e2 ++ ")" show (Exor e1 e2) = "(" ++ show e1 ++ "{^}" ++ show e2 ++ ")" show (Intl e1 e2) = "(" ++ show e1 ++ "{:}" ++ show e2 ++ ")" show (Br l e s) = "({" ++ showL l ++ (if null s then "" else "=" ++ reverse s ) ++ "}" ++ show e ++ ")" show (Cbr e ss) = "([" ++ intercalate "," (map (\(l,s) -> showL l ++ "=" ++ s) ss) ++ "]" ++ show e ++ ")" showL :: Show l => Label l -> String showL = rmq . 
maybe "" show where rmq ('\"':xs) = init xs rmq xs = xs -- ------------------------------------------------------------ isZero :: GenRegex l -> Bool isZero (Zero _) = True isZero _ = False {-# INLINE isZero #-} errRegex :: GenRegex l -> String errRegex (Zero e) = e errRegex _ = "" -- ------------------------------------------------------------ nullable :: GenRegex l -> Bool nullable = fst . nullable' {-# INLINE nullable #-} nullable' :: GenRegex l -> Nullable l nullable' (Zero _) = (False, []) nullable' Unit = (True, []) nullable' Dot = (False, []) nullable' (Sym _x) = (False, []) nullable' (Star _e) = (True, []) nullable' (Rep _i e) = nullable' e nullable' (Rng i _ e) = (i == 0, []) `unionN` nullable' e nullable' (Seq e1 e2) = nullable' e1 `isectN` nullable' e2 nullable' (Alt e1 e2) = nullable' e1 `unionN` nullable' e2 nullable' (Else e1 e2) = nullable' e1 `orElseN` nullable' e2 nullable' (Isec e1 e2) = nullable' e1 `isectN` nullable' e2 nullable' (Diff e1 e2) = nullable' e1 `diffN` nullable' e2 nullable' (Exor e1 e2) = nullable' e1 `exorN` nullable' e2 nullable' (Intl e1 e2) = nullable' e1 `isectN` nullable' e2 nullable' (Br l e s) = (True, [(l, reverse s)]) `isectN` nullable' e nullable' (Cbr e ss) = (True, ss) `isectN` nullable' e isectN :: Nullable l -> Nullable l -> Nullable l isectN (True, ws1) (True, ws2) = (True, ws1 ++ ws2) isectN _ _ = (False, []) unionN :: Nullable l -> Nullable l -> Nullable l unionN (False, _) (False, _) = (False, []) unionN (_, ws1) (_, ws2) = (True, ws1 ++ ws2) orElseN :: Nullable l -> Nullable l -> Nullable l orElseN e1@(True, _ws1) _ = e1 orElseN _ e2 = e2 diffN :: Nullable l -> Nullable l -> Nullable l diffN n1 (False, _) = n1 diffN _ _ = (False, []) exorN :: Nullable l -> Nullable l -> Nullable l exorN n1@(True, _) (False, _) = n1 exorN (False, _) n2@(True, _) = n2 exorN _ _ = (False, []) -- ------------------------------------------------------------ -- | FIRST for regular expressions -- -- this is only an approximation, the real set of char may be smaller, -- when the expression contains intersection, set difference or exor operators firstChars :: GenRegex l -> CharSet firstChars (Zero _) = emptyCS firstChars Unit = emptyCS firstChars (Sym p) = p firstChars Dot = allCS firstChars (Star e1) = firstChars e1 firstChars (Alt e1 e2) = firstChars e1 `unionCS` firstChars e2 firstChars (Else e1 e2) = firstChars e1 `unionCS` firstChars e2 firstChars (Seq e1 e2) | nullable e1 = firstChars e1 `unionCS` firstChars e2 | otherwise = firstChars e1 firstChars (Rep _i e) = firstChars e firstChars (Rng _i _j e) = firstChars e firstChars (Diff e1 _e2) = firstChars e1 -- this is an approximation firstChars (Isec e1 e2) = firstChars e1 `intersectCS` firstChars e2 -- this is an approximation firstChars (Exor e1 e2) = firstChars e1 `unionCS` firstChars e2 -- this is an approximation firstChars (Intl e1 e2) = firstChars e1 `unionCS` firstChars e2 firstChars (Br _l e _s) = firstChars e firstChars (Cbr e _ss) = firstChars e -- ------------------------------------------------------------ delta1 :: Eq l => GenRegex l -> Char -> GenRegex l delta1 e@(Zero _) _ = e delta1 Unit c = mkZero $ "unexpected char " ++ show c delta1 (Sym p) c | c `elemCS` p = mkUnit | otherwise = mkZero $ "unexpected char " ++ show c delta1 Dot _ = mkUnit delta1 e@(Star Dot) _ = e delta1 e@(Star e1) c = mkSeq (delta1 e1 c) e delta1 (Alt e1 e2) c = mkAlt (delta1 e1 c) (delta1 e2 c) delta1 (Else e1 e2) c = mkElse (delta1 e1 c) (delta1 e2 c) delta1 (Seq e1@(Br l e1' s) e2) c | n = mkAlt (mkSeq (delta1 e1 c) 
e2) (mkCbr (delta1 e2 c) ((l, reverse s) : ws)) where (n, ws) = nullable' e1' delta1 (Seq e1 e2) c | nullable e1 = mkAlt (mkSeq (delta1 e1 c) e2) (delta1 e2 c) | otherwise = mkSeq (delta1 e1 c) e2 delta1 (Rep i e) c = mkSeq (delta1 e c) (mkRep (i-1) e) delta1 (Rng i j e) c = mkSeq (delta1 e c) (mkRng ((i-1) `max` 0) (j-1) e) delta1 (Diff e1 e2) c = mkDiff (delta1 e1 c) (delta1 e2 c) delta1 (Isec e1 e2) c = mkIsect (delta1 e1 c) (delta1 e2 c) delta1 (Exor e1 e2) c = mkExor (delta1 e1 c) (delta1 e2 c) delta1 (Intl e1 e2) c = mkAlt (mkInterleave (delta1 e1 c) e2 ) (mkInterleave e1 (delta1 e2 c)) delta1 (Br l e s) c = mkBr0 l (delta1 e c) (c:s) delta1 (Cbr e ss) c = mkCbr (delta1 e c) ss -- ------------------------------------------------------------ delta :: Eq l => GenRegex l -> String -> GenRegex l delta = foldl' delta1 matchWithRegex :: Eq l => GenRegex l -> String -> Bool matchWithRegex e = nullable . delta e matchWithRegex' :: Eq l => GenRegex l -> String -> Maybe [(Label l,String)] matchWithRegex' e = (\ (r, l) -> if r then Just l else Nothing) . nullable' . delta e -- ------------------------------------------------------------ -- | This function wraps the whole regex in a subexpression before starting -- the parse. This is done for getting acces to -- the whole parsed string. Therfore we need one special label, this label -- is the Nothing value, all explicit labels are Just labels. splitWithRegex :: Eq l => GenRegex l -> String -> Maybe ([(Label l,String)], String) splitWithRegex re inp = do (re', rest) <- splitWithRegex' (mkBr' re) inp return ( snd . nullable' $ re', rest) splitWithRegexCS :: Eq l => GenRegex l -> CharSet -> String -> Maybe ([(Label l,String)], String) splitWithRegexCS re cs inp = do (re', rest) <- splitWithRegexCS' (mkBr' re) cs inp return ( snd . nullable' $ re', rest) -- ---------------------------------------- -- -- | The main scanner function {- linear recursive function, can lead to stack overflow splitWithRegex' :: Eq l => GenRegex l -> String -> Maybe (GenRegex l, String) splitWithRegex' re "" | nullable re = Just (re, "") | otherwise = Nothing splitWithRegex' re inp@(c : inp') | isZero re = Nothing | otherwise = evalRes . splitWithRegex' (delta1 re c) $ inp' where evalRes Nothing | nullable re = Just (re, inp) | otherwise = Nothing evalRes res = res -} -- tail recursive version of above function splitWithRegex' :: Eq l => GenRegex l -> String -> Maybe (GenRegex l, String) splitWithRegex' re inp = splitWithRegex'' ( if nullable re then Just (re, inp) -- first possible result: empty prefix else Nothing -- empty prefix not a result ) re inp splitWithRegex'' :: Eq l => Maybe (GenRegex l, String) -> GenRegex l -> String -> Maybe (GenRegex l, String) splitWithRegex'' lastRes _re "" = lastRes splitWithRegex'' lastRes re (c : inp') | isZero re = lastRes | otherwise = splitWithRegex'' nextRes re' $ inp' where re' = delta1 re c nextRes | nullable re' = Just (re', inp') | otherwise = lastRes -- ---------------------------------------- -- -- | speedup version for splitWithRegex' -- -- This function checks whether the input starts with a char from FIRST re. -- If this is not the case, the split fails. 
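--
-- A small usage sketch (not part of the module's documented examples; the
-- expected result follows from the definitions above):
--
-- > let re  = mkRep 1 (mkSymRng 'a' 'z')      -- the regex [a-z]+
-- >     fcs = firstChars re
-- > in  splitWithRegexCS re fcs "abc 123"     -- Just ([(Nothing,"abc")], " 123")
--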
The FIRST set can be computed once -- for a whole tokenizer and reused by every call of split splitWithRegexCS' :: Eq l => GenRegex l -> CharSet -> String -> Maybe (GenRegex l, String) splitWithRegexCS' re cs inp@(c : _) | c `elemCS` cs = splitWithRegex' re inp splitWithRegexCS' re _cs inp | nullable re = Just (re, inp) | otherwise = Nothing -- ------------------------------------------------------------ hxt-regex-xmlschema-9.0.4/src/Text/Regex/XMLSchema/String/RegexParser.hs0000644000000000000000000003102711701313414024201 0ustar0000000000000000-- ------------------------------------------------------------ {- | Module : Text.Regex.XMLSchema.String.RegexParser Copyright : Copyright (C) 2010- Uwe Schmidt License : MIT Maintainer : Uwe Schmidt (uwe@fh-wedel.de) Stability : stable Portability: portable W3C XML Schema Regular Expression Parser This parser supports the full W3C standard, the complete grammar can be found under and extensions for all missing set operations, intersection, difference, exclusive or, interleave, complement -} -- ------------------------------------------------------------ module Text.Regex.XMLSchema.String.RegexParser ( parseRegex , parseRegexExt ) where import Data.Char.Properties.UnicodeBlocks import Data.Char.Properties.UnicodeCharProps import Data.Char.Properties.XMLCharProps import Data.Maybe import Data.Set.CharSet import Text.ParserCombinators.Parsec import Text.Regex.XMLSchema.String.Regex -- ------------------------------------------------------------ -- | parse a standard W3C XML Schema regular expression parseRegex :: String -> Regex parseRegex = parseRegex' regExpStd -- | parse an extended syntax W3C XML Schema regular expression -- -- The Syntax of the W3C XML Schema spec is extended by -- further useful set operations, like intersection, difference, exor. -- Subexpression match becomes possible with \"named\" pairs of parentheses. -- The multi char escape sequence \\a represents any Unicode char, -- The multi char escape sequence \\A represents any Unicode word, (\\A = \\a*). -- All syntactically wrong inputs are mapped to the Zero expression representing the -- empty set of words. Zero contains as data field a string for an error message. -- So error checking after parsing becomes possible by checking against Zero ('isZero' predicate) parseRegexExt :: String -> Regex parseRegexExt = parseRegex' regExpExt parseRegex' :: Parser Regex -> String -> Regex parseRegex' regExp' = either (mkZero . ("syntax error: " ++) . show) id . parse ( do r <- regExp' eof return r ) "" -- ------------------------------------------------------------ regExpExt :: Parser Regex regExpExt = branchList orElseList regExpStd :: Parser Regex regExpStd = branchList seqListStd branchList :: Parser Regex -> Parser Regex branchList exParser = do r1 <- exParser rs <- many branchList1 return (foldr1 mkAlt $ r1:rs) -- union is associative, so we use right ass. -- as with seq, alt and exor where branchList1 = do _ <- char '|' exParser orElseList :: Parser Regex orElseList = do r1 <- interleaveList rs <- many orElseList1 return (foldr1 mkElse $ r1:rs) -- orElse is associative, so we choose right ass. -- as with seq and alt ops where orElseList1 = do _ <- try (string "{|}") interleaveList interleaveList :: Parser Regex interleaveList = do r1 <- exorList rs <- many interleaveList1 return (foldr1 mkInterleave $ r1:rs) -- interleave is associative, so we choose right ass. 
-- as with seq and alt ops where interleaveList1 = do _ <- try (string "{:}") exorList exorList :: Parser Regex exorList = do r1 <- diffList rs <- many exorList1 return (foldr1 mkExor $ r1:rs) -- exor is associative, so we choose right ass. where exorList1 = do _ <- try (string "{^}") diffList diffList :: Parser Regex diffList = do r1 <- intersectList rs <- many diffList1 return (foldl1 mkDiff $ r1:rs) -- diff is not associative, so we choose left ass. where diffList1 = do _ <- try (string "{\\}") intersectList intersectList :: Parser Regex intersectList = do r1 <- seqListExt rs <- many intersectList1 return (foldr1 mkIsect $ r1:rs) where intersectList1 = do _ <- try (string "{&}") seqListExt seqListExt :: Parser Regex seqListExt = seqList' regExpLabel multiCharEscExt seqListStd :: Parser Regex seqListStd = seqList' regExpStd multiCharEsc seqList' :: Parser Regex -> Parser Regex -> Parser Regex seqList' regExp' multiCharEsc' = do rs <- many piece return $ mkSeqs rs where piece :: Parser Regex piece = do r <- atom quantifier r atom :: Parser Regex atom = char1 <|> charClass <|> between (char '(') (char ')') regExp' charClass :: Parser Regex charClass = charClassEsc multiCharEsc' <|> charClassExpr multiCharEsc' <|> wildCardEsc quantifier :: Regex -> Parser Regex quantifier r = ( do _ <- char '?' return $ mkOpt r ) <|> ( do _ <- char '*' return $ mkStar r ) <|> ( do _ <- char '+' return $ mkRep 1 r ) <|> try ( do _ <- char '{' res <- quantity r _ <- char '}' return res ) <|> ( return r ) quantity :: Regex -> Parser Regex quantity r = do lb <- many1 digit quantityRest r (read lb) quantityRest :: Regex -> Int -> Parser Regex quantityRest r lb = ( do _ <- char ',' ub <- many digit return ( if null ub then mkRep lb r else mkRng lb (read ub) r ) ) <|> ( return $ mkRng lb lb r) regExpLabel :: Parser Regex regExpLabel = do lab <- option id (between (char '{') (char '}') label') r <- regExpExt return $ lab r where label' = do l <- many1 (satisfy isXmlNameChar) return $ mkBr l char1 :: Parser Regex char1 = do c <- satisfy (`notElem` ".\\?*+{}()|[]") return $ mkSym1 c charClassEsc :: Parser Regex -> Parser Regex charClassEsc multiCharEsc' = do _ <- char '\\' ( singleCharEsc <|> multiCharEsc' <|> catEsc <|> complEsc ) singleCharEsc :: Parser Regex singleCharEsc = do c <- singleCharEsc' return $ mkSym1 c singleCharEsc' :: Parser Char singleCharEsc' = do c <- satisfy (`elem` "nrt\\|.?*+(){}-[]^") return $ maybe c id . lookup c . zip "ntr" $ "\n\r\t" multiCharEscExt :: Parser Regex multiCharEscExt = multiCharEsc <|> ( do -- extension: \a represents the whole alphabet inclusive newline chars: \a == .|\n|\r _ <- char 'a' return mkDot ) <|> ( do -- extension: \A represents all words: \A == \a* or \A == (.|\n|\r)* _ <- char 'A' return mkAll ) multiCharEsc :: Parser Regex multiCharEsc = ( do c <- satisfy (`elem` es) return $ mkSym . fromJust . 
lookup c $ pm ) where es = map fst pm pm = [ ('s', charPropXmlSpaceChar ) , ('S', compCS charPropXmlSpaceChar ) , ('i', charPropXmlNameStartChar ) , ('I', compCS charPropXmlNameStartChar ) , ('c', charPropXmlNameChar ) , ('C', compCS charPropXmlNameChar ) , ('d', charPropDigit ) , ('D', compCS charPropDigit ) , ('w', compCS charPropNotWord ) , ('W', charPropNotWord ) ] charPropDigit = rangeCS '0' '9' charPropNotWord = charPropUnicodeP `unionCS` charPropUnicodeZ `unionCS` charPropUnicodeC catEsc :: Parser Regex catEsc = do _ <- char 'p' s <- between (char '{') (char '}') charProp return $ mkSym s charProp :: Parser CharSet charProp = isCategory <|> isBlock isBlock :: Parser CharSet isBlock = do _ <- string "Is" name <- many1 (satisfy legalChar) case lookup name codeBlocks of Just b -> return $ uncurry rangeCS b Nothing -> fail $ "unknown Unicode code block " ++ show name where legalChar c = 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9' || '-' == c isCategory :: Parser CharSet isCategory = do pr <- isCategory' return $ fromJust (lookup pr categories) categories :: [(String, CharSet)] categories = [ ("C", charPropUnicodeC ) , ("Cc", charPropUnicodeCc) , ("Cf", charPropUnicodeCf) , ("Co", charPropUnicodeCo) , ("Cs", charPropUnicodeCs) , ("L", charPropUnicodeL ) , ("Ll", charPropUnicodeLl) , ("Lm", charPropUnicodeLm) , ("Lo", charPropUnicodeLo) , ("Lt", charPropUnicodeLt) , ("Lu", charPropUnicodeLu) , ("M", charPropUnicodeM ) , ("Mc", charPropUnicodeMc) , ("Me", charPropUnicodeMe) , ("Mn", charPropUnicodeMn) , ("N", charPropUnicodeN ) , ("Nd", charPropUnicodeNd) , ("Nl", charPropUnicodeNl) , ("No", charPropUnicodeNo) , ("P", charPropUnicodeP ) , ("Pc", charPropUnicodePc) , ("Pd", charPropUnicodePd) , ("Pe", charPropUnicodePe) , ("Pf", charPropUnicodePf) , ("Pi", charPropUnicodePi) , ("Po", charPropUnicodePo) , ("Ps", charPropUnicodePs) , ("S", charPropUnicodeS ) , ("Sc", charPropUnicodeSc) , ("Sk", charPropUnicodeSk) , ("Sm", charPropUnicodeSm) , ("So", charPropUnicodeSo) , ("Z", charPropUnicodeZ ) , ("Zl", charPropUnicodeZl) , ("Zp", charPropUnicodeZp) , ("Zs", charPropUnicodeZs) ] isCategory' :: Parser String isCategory' = ( foldr1 (<|>) . 
map (uncurry prop) $ [ ('L', "ultmo") , ('M', "nce") , ('N', "dlo") , ('P', "cdseifo") , ('Z', "slp") , ('S', "mcko") , ('C', "cfon") ] ) "illegal Unicode character property" where prop c1 cs2 = do _ <- char c1 s2 <- option "" ( do c2 <- satisfy (`elem` cs2) return [c2] ) return $ c1:s2 complEsc :: Parser Regex complEsc = do _ <- char 'P' s <- between (char '{') (char '}') charProp return $ mkSym $ compCS s charClassExpr :: Parser Regex -> Parser Regex charClassExpr multiCharEsc' = between (char '[') (char ']') charGroup where charGroup :: Parser Regex charGroup = do r <- ( negCharGroup -- a ^ at beginning denotes negation, not start of posCharGroup <|> posCharGroup ) s <- option (mkZero "") -- charClassSub ( do _ <- char '-' charClassExpr multiCharEsc' ) return $ mkDiff r s posCharGroup :: Parser Regex posCharGroup = do rs <- many1 (charRange <|> charClassEsc multiCharEsc') return $ foldr1 mkAlt rs negCharGroup :: Parser Regex negCharGroup = do _ <- char '^' r <- posCharGroup return $ mkDiff mkDot r charRange :: Parser Regex charRange = try seRange <|> xmlCharIncDash seRange :: Parser Regex seRange = do c1 <- charOrEsc' _ <- char '-' c2 <- charOrEsc' return $ mkSymRng c1 c2 charOrEsc' :: Parser Char charOrEsc' = ( do _ <- char '\\' singleCharEsc' ) <|> satisfy (`notElem` "\\-[]") xmlCharIncDash :: Parser Regex xmlCharIncDash = try ( do -- dash is only allowed if not followed by a [, else charGroup differences do not parse correctly _ <- char '-' notFollowedBy (char '[') return $ mkSym1 '-' ) <|> ( do c <- satisfy (`notElem` "-\\[]") return $ mkSym1 c ) wildCardEsc :: Parser Regex wildCardEsc = do _ <- char '.' return . mkSym . compCS $ stringCS "\n\r" -- ------------------------------------------------------------ hxt-regex-xmlschema-9.0.4/src/Text/Regex/Glob/0000755000000000000000000000000011701313414017247 5ustar0000000000000000hxt-regex-xmlschema-9.0.4/src/Text/Regex/Glob/String.hs0000644000000000000000000000160611701313414021054 0ustar0000000000000000-- ------------------------------------------------------------ {- | Module : Text.Regex.Glob.String Copyright : Copyright (C) 2011- Uwe Schmidt License : MIT Maintainer : Uwe Schmidt Stability : stable Portability: portable csh glob style pattern matcher -} -- ------------------------------------------------------------ module Text.Regex.Glob.String ( Regex , match , matchNoCase , parseRegex , parseRegexNoCase ) where import Text.Regex.XMLSchema.String.Regex import Text.Regex.Glob.String.RegexParser -- ------------------------------------------------------------ match :: String -> String -> Bool match = matchWithRegex . parseRegex matchNoCase :: String -> String -> Bool matchNoCase = matchWithRegex . 
parseRegexNoCase -- ------------------------------------------------------------ hxt-regex-xmlschema-9.0.4/src/Text/Regex/Glob/String/0000755000000000000000000000000011701313414020515 5ustar0000000000000000hxt-regex-xmlschema-9.0.4/src/Text/Regex/Glob/String/RegexParser.hs0000644000000000000000000000576211701313414023312 0ustar0000000000000000-- ------------------------------------------------------------ {- | Module : Text.Regex.Glob.String.RegexParser Copyright : Copyright (C) 2010- Uwe Schmidt License : MIT Maintainer : Uwe Schmidt (uwe@fh-wedel.de) Stability : stable Portability: portable csh style Glob Pattern Parser for Regular Expressions -} -- ------------------------------------------------------------ module Text.Regex.Glob.String.RegexParser ( parseRegex , parseRegexNoCase ) where import Data.Char ( isLower , isUpper , toLower , toUpper ) import Text.ParserCombinators.Parsec import Text.Regex.XMLSchema.String.Regex -- ------------------------------------------------------------ -- | parse a glob pattern parseRegex :: String -> Regex parseRegex = parseRegex' mkSymRng parseRegexNoCase :: String -> Regex parseRegexNoCase = parseRegex' mkNoCaseSymRng parseRegex' :: (Char -> Char -> Regex) -> String -> Regex parseRegex' mkS = either (mkZero . ("syntax error: " ++) . show) id . parse ( do r <- pattern mkS eof return r ) "" -- ------------------------------------------------------------ pattern :: (Char -> Char -> Regex) -> Parser Regex pattern mkS = many part >>= return . mkSeqs where part :: Parser Regex part = ( many1 (noneOf "\\?*[{") >>= return . mkWord' ) <|> ( char '?' >> return mkDot ) <|> ( char '*' >> return mkAll ) <|> ( between (char '{') (char '}') wordList ) <|> ( between (char '[') (char ']') charSet ) <|> ( do c <- char '\\' >> anyChar return $ mkS c c ) mkWord' = mkSeqs . map (\ c -> mkS c c) wordList :: Parser Regex wordList = sepBy (many1 (noneOf ",}")) (char ',') >>= return . foldr mkAlt (mkZero "") . map mkWord' charSet :: Parser Regex charSet = ( do p1 <- charSet' anyChar ps <- many $ charSet' (noneOf "]") return $ foldr mkAlt (mkZero "") (p1 : ps) ) where charSet' cp = do c1 <- cp c2 <- rest c1 return $ mkS c1 c2 rest c1 = option c1 (char '-' >> anyChar) -- ------------------------------------------------------------ mkNoCaseSymRng :: Char -> Char -> Regex mkNoCaseSymRng c1 c2 | isLower c1 && isLower c2 = mkAlt (mkSymRng (toUpper c1) (toUpper c2)) (mkSymRng c1 c2) | isUpper c1 && isUpper c2 = mkAlt (mkSymRng (toLower c1) (toLower c2)) (mkSymRng c1 c2) | otherwise = mkSymRng c1 c2 -- ------------------------------------------------------------ hxt-regex-xmlschema-9.0.4/examples/0000755000000000000000000000000011701313414015415 5ustar0000000000000000hxt-regex-xmlschema-9.0.4/examples/RegexXMLSchema/0000755000000000000000000000000011701313414020171 5ustar0000000000000000hxt-regex-xmlschema-9.0.4/examples/RegexXMLSchema/Makefile0000644000000000000000000000513311701313414021633 0ustar0000000000000000# $Id: Makefile,v 1.9 2006/11/11 15:36:03 hxml Exp $ HXT_HOME = ../../.. 
PKGFLAGS = GHCFLAGS = -Wall -O2 GHC = ghc $(GHCFLAGS) $(PKGFLAGS) DIST = $(HXT_HOME)/dist/examples/arrows DIST_DIR = $(DIST)/RegexXMLSchema CNT = 3 ropts = +RTS -s -RTS prog = ./REtest prog2 = ./Lines prog3 = ./RElines prog4 = ./Words prog5 = ./REwords prog0 = ./Copy progs = $(prog) $(prog0) $(prog2) $(prog3) $(prog4) $(prog5) all : $(progs) $(prog) : $(prog).hs $(GHC) --make -o $@ $< local : $(GHC) --make -o $(prog) -fglasgow-exts -ignore-package hxt -i../../../src $(prog).hs $(prog2) : $(prog) ln -f $(prog) $(prog2) $(prog3) : $(prog) ln -f $(prog) $(prog3) $(prog4) : $(prog) ln -f $(prog) $(prog4) $(prog5) : $(prog) ln -f $(prog) $(prog5) $(prog0) : $(prog) ln -f $(prog) $(prog0) # generate and read documents containing a binary tree # with 2^i leaf nodes containing the numbers 1 to 2^i # for i up to at least 22 (8M XML elements) output works fine # for i up to 19 (1M XML elements) input works without swapping # with i=20 swapping starts, but the program it still terminates # the size of the XML file for i=20 is about 36Mb # these tests have run on a box with 1Gb memory tests = 18 test : $(prog) $(MAKE) genfiles tests="$(tests)" $(MAKE) copy tests="$(tests)" $(MAKE) lines tests="$(tests)" $(MAKE) relines tests="$(tests)" $(MAKE) words tests="$(tests)" $(MAKE) rewords tests="$(tests)" perftest : $(prog) $(MAKE) test tests="2 3 10 11 12 13 14 15 16 17 18 19 20" genfiles : @for i in $(tests) ; \ do \ echo time $(prog) $(ropts) $$i ; \ time $(prog) $(ropts) $$i ; \ ls -l tree-*$$i.xml ; \ echo ; \ done copy : @for i in $(tests) ; \ do \ echo time $(prog0) $(ropts) $$i ; \ time $(prog0) $(ropts) $$i ; \ ls -l tree-*$$i.xml.copy ; \ echo ; \ done lines : @for i in $(tests) ; \ do \ echo time $(prog2) $(ropts) $$i ; \ time $(prog2) $(ropts) $$i ; \ ls -l tree-*$$i.xml.lines ; \ echo ; \ done relines : @for i in $(tests) ; \ do \ echo time $(prog3) $(ropts) $$i ; \ time $(prog3) $(ropts) $$i ; \ ls -l tree-*$$i.xml.relines ; \ echo ; \ done words : @for i in $(tests) ; \ do \ echo time $(prog4) $(ropts) $$i ; \ time $(prog4) $(ropts) $$i ; \ ls -l tree-*$$i.xml.words ; \ echo ; \ done rewords : @for i in $(tests) ; \ do \ echo time $(prog5) $$i ; \ time $(prog5) $(ropts) $$i ; \ ls -l tree-*$$i.xml.rewords ; \ echo ; \ done dist : [ -d $(DIST_DIR) ] || mkdir -p $(DIST_DIR) cp Makefile REtest.hs $(DIST_DIR) clean : rm -f $(progs) *.o *.hi *.xml *.xml.* hxt-regex-xmlschema-9.0.4/examples/RegexXMLSchema/REtest.hs0000644000000000000000000000751311701313414021741 0ustar0000000000000000{-# LANGUAGE BangPatterns#-} -- ---------------------------------------- module Main(main) where import Text.XML.HXT.Core import Text.Regex.XMLSchema.String import Data.String.Unicode ( unicodeToXmlEntity ) import Control.Monad.State.Strict hiding (when) import Data.Maybe import System.IO -- import the IO and commandline option stuff import System.Environment -- ---------------------------------------- main :: IO () main = do p <- getProgName al <- getArgs let i = if null al then 4 else (read . head $ al)::Int main' p i where main' p' = fromMaybe main1 . 
lookup (pn p') $ mpt mpt = [ ("REtest", main1) , ("Copy", main2 "copy" (:[])) , ("Lines", main2 "lines" lines) , ("RElines", main2 "relines" relines) , ("Words", main2 "words" words) , ("REwords", main2 "rewords" rewords) ] -- ---------------------------------------- -- generate a document containing a binary tree of 2^i leafs (= 2^(i-1) XML elements) main1 :: Int -> IO () main1 i = runX (genDoc i (fn i)) >> return () -- ---------------------------------------- -- read a document containing a binary tree of 2^i leafs main2 :: String -> (String -> [String]) -> Int -> IO () main2 ext lines' i = do hPutStrLn stderr "start processing" h <- openBinaryFile (fn i) ReadMode c <- hGetContents h let ls = lines' c o <- openBinaryFile (fn i ++ "." ++ ext) WriteMode mapM_ (hPutStrLn o) ls hClose o hClose h hPutStrLn stderr "end processing" relines :: String -> [String] relines = tokenize "[^\n\r]*" rewords :: String -> [String] rewords = tokenize "[^ \t\n\r]+" -- ---------------------------------------- pn :: String -> String pn = reverse . takeWhile (/= '/') . reverse fn :: Int -> String fn = ("tree-" ++) . (++ ".xml") . reverse . take 4 . reverse . ((replicate 4 '0') ++ ) . show -- ---------------------------------------- genDoc :: Int -> String -> IOSArrow b XmlTree genDoc d out = constA (mkBTree d) >>> xpickleVal xpickle >>> indentDoc >>> putDoc out -- ---------------------------------------- type Counter a = State Int a incr :: Counter Int incr = do modify (+1) get -- ---------------------------------------- data BTree = Leaf Int | Fork BTree BTree deriving (Show) instance XmlPickler BTree where xpickle = xpAlt tag ps where tag (Leaf _ ) = 0 tag (Fork _ _ ) = 1 ps = [ xpWrap ( Leaf, \ (Leaf i) -> i) ( xpElem "leaf" $ xpAttr "value" $ xpickle ) , xpWrap ( uncurry Fork, \ (Fork l r) -> (l, r)) ( xpElem "fork" $ xpPair xpickle xpickle ) ] -- ---------------------------------------- mkBTree :: Int -> BTree mkBTree depth = evalState (mkT depth) 0 mkT :: Int -> Counter BTree mkT 0 = do i <- incr return (Leaf i) mkT n = do l <- mkT (n-1) r <- mkT (n-1) return (Fork l r) -- ---------------------------------------- -- output is done with low level ops to write the -- document i a lazy manner -- adding an xml pi and encoding is done "by hand" -- latin1 decoding is the identity, so please generate the -- docs with latin1 encoding. 
Here ist done even with ASCCI -- every none ASCII char is represented by a char ref (&nnn;) putDoc :: String -> IOStateArrow s XmlTree XmlTree putDoc dst = addXmlPi >>> addXmlPiEncoding isoLatin1 >>> xshow getChildren >>> arr unicodeToXmlEntity >>> arrIO (\ s -> hPutDocument (\h -> hPutStrLn h s)) >>> none where isStdout = null dst || dst == "-" hPutDocument :: (Handle -> IO()) -> IO() hPutDocument action | isStdout = action stdout | otherwise = do handle <- openBinaryFile dst WriteMode action handle hClose handle -- ---------------------------------------- hxt-regex-xmlschema-9.0.4/examples/colorizeProgs/0000755000000000000000000000000011701313414020256 5ustar0000000000000000hxt-regex-xmlschema-9.0.4/examples/colorizeProgs/Makefile0000644000000000000000000000027211701313414021717 0ustar0000000000000000all : colorize install : all sudo cp colorize /usr/local/bin clean : rm -f *.o *.hi distclean : clean rm -f colorize colorize : ColorizeSourceCode.hs ghc -o $@ -O2 --make $< hxt-regex-xmlschema-9.0.4/examples/colorizeProgs/ColorizeSourceCode.hs0000644000000000000000000010326611701313414024364 0ustar0000000000000000-- ------------------------------------------------------------ {- | Module : ColorizeSourceCode Copyright : Copyright (C) 2009 Uwe Schmidt License : BSD3 Maintainer : Uwe Schmidt (uwe@fh-wedel.de) Portability: portable Colorize Source Code Supports Java and Haskell -} -- ------------------------------------------------------------ module Main where import Control.Arrow import Data.List import Data.Maybe import System import System.IO -- import the IO and commandline option stuff import System.Environment import System.Console.GetOpt import System.Exit import Text.Regex.XMLSchema.String import Text.XML.HXT.Arrow import Text.XML.HXT.Parser.XhtmlEntities -- ------------------------------------------------------------ data Process = P { inFilter :: String -> String , tokenRE :: Regex , markupRE :: Regex -> Regex , formatToken :: (String, String) -> String , formatDoc :: [String] -> String , outFilter :: String -> String , input :: Handle , output :: Handle , inputFile :: String } defaultProcess = P { inFilter = id , tokenRE = plainRE , markupRE = id , formatToken = uncurry (++) , formatDoc = unlines , outFilter = id , input = stdin , output = stdout , inputFile = " " } -- ------------------------------------------------------------ main :: IO () main = do argv <- getArgs p <- evalArgs (getOpt Permute options argv) s <- hGetContents (input p) hPutStr (output p) (process p s) hFlush (output p) hClose (output p) exitWith ExitSuccess options :: [OptDescr (String, String)] options = [ Option "h?" 
["help"] (NoArg ("help", "1")) "this message" , Option "" ["plain"] (NoArg ("plain", "1")) "don't colorize lines" , Option "" ["haskell"] (NoArg ("haskell", "1")) "colorize haskell" , Option "" ["java"] (NoArg ("java", "1")) "colorize java" , Option "" ["cpp"] (NoArg ("cpp", "1")) "colorize C or C++" , Option "" ["sh"] (NoArg ("sh", "1")) "colorize sh or bash" , Option "" ["ruby"] (NoArg ("ruby", "1")) "colorize ruby" , Option "" ["bnf"] (NoArg ("bnf", "1")) "colorize extended BNF grammar rules" , Option "" ["ppl"] (NoArg ("ppl", "1")) "colorize ppl" , Option "" ["pplass"] (NoArg ("pplass", "1")) "colorize ppl assembler" , Option "n" ["number"] (NoArg ("number", "1")) "with line numbers" , Option "t" ["tabs"] (NoArg ("tabs", "1")) "substitute tabs by blanks" , Option "m" ["markup"] (NoArg ("markup", "1")) "text contains embedded markup" , Option "e" ["erefs"] (NoArg ("erefs", "1")) "resolve HTML entity refs before processing" , Option "o" ["output"] (ReqArg ((,) "output") "FILE") "output file, \"-\" stands for stdout" , Option "s" ["scan"] (NoArg ("scan", "1")) "just scan input, for testing" , Option "x" ["html"] (NoArg ("html", "1")) "html output" , Option "f" ["full"] (NoArg ("full", "1")) "full HTML document with header and css" ] exitErr :: String -> IO a exitErr msg = do hPutStrLn stderr msg usage exitWith (ExitFailure (-1)) evalArgs (opts, files, errs) | not (null errs) = exitErr ("illegal arguments " ++ show errs) | null files = evalOpts opts defaultProcess | not (null fns) = exitErr ("only one input file allowed") | otherwise = do inp <- openFile fn ReadMode evalOpts opts (defaultProcess { input = inp , inputFile = fn } ) where (fn:fns) = files evalOpts :: [(String, String)] -> Process -> IO Process evalOpts [] res = return res evalOpts (o:os) res = do res' <- evalOpt o res evalOpts os res' evalOpt :: (String, String) -> Process -> IO Process evalOpt ("help","1") _ = do usage exitWith ExitSuccess evalOpt ("output", "-") p = return $ p {output = stdout} evalOpt ("output", fn) p = do outp <- openFile fn WriteMode return $ p {output = outp} evalOpt ("haskell","1") p = return $ p { tokenRE = haskellRE } evalOpt ("java", "1") p = return $ p { tokenRE = javaRE } evalOpt ("cpp", "1") p = return $ p { tokenRE = cppRE } evalOpt ("sh", "1") p = return $ p { tokenRE = shRE } evalOpt ("ruby", "1") p = return $ p { tokenRE = rubyRE } evalOpt ("bnf", "1") p = return $ p { tokenRE = bnfRE } evalOpt ("ppl", "1") p = return $ p { tokenRE = pplRE } evalOpt ("pplass", "1") p = return $ p { tokenRE = pplassRE } evalOpt ("plain", "1") p = return $ p { tokenRE = plainRE } evalOpt ("scan", "1") p = return $ p { tokenRE = plainRE , formatToken = uncurry formatTok , formatDoc = formatHList } evalOpt ("number", "1") p = return $ p { formatDoc = numberLines >>> formatDoc p } evalOpt ("tabs", "1") p = return $ p { inFilter = inFilter p >>> substTabs } evalOpt ("erefs", "1") p = return $ p { inFilter = resolveHtmlEntities >>> inFilter p } evalOpt ("markup", "1") p = return $ p { markupRE = addMarkup } evalOpt ("html", "1") p = return $ p { formatToken = formatHtmlTok , formatDoc = formatHtmlDoc } evalOpt ("full", "1") p = return $ p { outFilter = outFilter p >>> fullHtml (inputFile p) } usage :: IO () usage = hPutStrLn stderr use where use = usageInfo header options header = "colorizeSourceCode - colorize source code with HTML, version 0.1.1" -- ------------------------------------------------------------ process :: Process -> String -> String process p = inFilter p >>> tokenizeSubexRE (markupRE p (tokenRE p)) 
>>> map (formatToken p) >>> concat >>> lines >>> formatDoc p >>> outFilter p addMarkup :: Regex -> Regex addMarkup = mkElse (parseRegex . mkLE $ markupT) tokenizeLines :: String -> [(String, String)] tokenizeLines = map (\ l -> ("",l ++ "\n")) . lines numberLines :: [String] -> [String] numberLines = zipWith addNum [1..] where addNum i l = "" ++ fmt 4 i ++ "" ++ l fmt l = sed (const " ") " " . reverse . take l . reverse . (replicate l ' ' ++) . show substTabs :: String -> String substTabs = subs 0 subs _ "" = "" subs i (x:xs) | x == '\t' = replicate (8 - (i `mod` 8)) ' ' ++ subs 0 xs | x == '\n' = x : subs 0 xs | otherwise = x : subs (i+1) xs -- ------------------------------------------------------------ resolveHtmlEntities :: String -> String resolveHtmlEntities = sed (replaceEntity . drop 1 . init) "&\\i\\c*;" where replaceEntity e = maybe ("&" ++ e ++ ";") ((:[]) . toEnum) . lookup e $ xhtmlEntities -- ------------------------------------------------------------ formatHList :: [String] -> String formatHList = ("[" ++) . (++ "\n]") . intercalate "\n, " formatTok :: String -> String -> String formatTok kw tok = " (" ++ show kw ++ ",\t" ++ show tok ++ "\t)\n" formatHtmlDoc = map (("
" ++) . (++ "
") . preserveEmptyLines) >>> ("
" :) >>> (++ ["
"]) >>> unlines where preserveEmptyLines "" = " " preserveEmptyLines l = l formatHtmlTok :: (String, String) -> String formatHtmlTok (m, t) | m == "markup" = t | otherwise = colorizeTokens m (escapeText >>> sed (const " ") " " $ t) escapeText :: String -> String escapeText = concat . runLA (xshow (mkText >>> escapeHtmlDoc)) fullHtml :: String -> String -> String fullHtml fn s = unlines [ "" , "" , "" ++ fn ++ "" , "" , "" , "" , s , "" , "" ] css :: String css = unlines [ ".typename { color: #0000dd; }" , ".varname { color: #000000; }" , ".opname { color: #770000; }" , ".operator { color: #770000; /* font-weight:bold; */ }" , ".keyglyph { color: #3070A0; /* font-weight:bold; */ }" , ".par { }" , "" , ".keyword { color: #3070A0; /* font-weight:bold; */ }" , ".typekeyword { color: #3070A0; /* font-weight:bold; */ }" , ".strconst { color: #228B22; }" , ".charconst { color: #228B22; }" , ".labelname { color: #FF00FF; font-weight:bold; }" , ".cppcommand { color: #0000CD; }" , ".specialword { color: #c80000; }" , ".classname { color: #8B2323; }" , ".comment { color: #00008B; }" , ".bnfnt { color: #0000CD; }" , ".bnfmeta { color: #ff0000; font-weight:bold; }" , ".bnfterminal { color: #008800; font-weight:bold; }" , ".tclproc { color: #FF6000; }" , ".tclvar { color: #0000CD; }" , ".tclcomment { color: #c80000; }" , "" , ".linenr { color: #909090; padding-right: 2em; }" , "div.codeline { font-family: monospace; width: 100%; white-space: pre; border-width: 1px; border-style: solid; border-color: transparent; padding-left: 0.3em; }" , "div.codeline:hover { background-color:#ddddff; color:#c80000; border-width: 1px; border-style: solid; border-color: #c80000; }" ] -- ------------------------------------------------------------ colorizeTokens :: String -> String -> String colorizeTokens tok | tok `elem` [ "comment" , "keyword" , "keyglyph" , "typekeyword" , "varname", "typename", "labelname", "instancename", "globalname" , "opname" , "par" , "operator" , "strconst", "charconst" , "bnfnt", "bnfmeta" , "cppcommand" , "specialword" ] = wrap | tok == "longcomment" = wrap' "comment" . mlc | tok == "bnfterminal" = wrap . drop 1 . init -- | tok == "markupstart" = (("")) . drop 4 . init -- | tok == "markupend" = const "" | null tok = const "" | otherwise = id where wrap = wrap' tok wrap' tok' = (("") ++) . (++ "") mlc = sed (("" ++) . (++ "")) "(\\n\r?)" -- ------------------------------------------------------------ buildRegex :: [(String, String)] -> Regex buildRegex = foldr1 mkElse . map (uncurry mkBr') . 
map (second parseRegex) where mkBr' "" = id mkBr' l = mkBr l buildKeywords :: [String] -> String buildKeywords = intercalate "|" untilRE :: String -> String untilRE re = "(\\A{" ++ "\\}\\A" ++ re ++ "\\A)" ++ re mkLE :: (String, String) -> String mkLE (l, re) = "({" ++ l ++ "}(" ++ re ++ "))" ws1RE = "\\s+" ws1RE' = "[ \t]+" ws0RE = "[ \t]*" javacmt1, javacmt, strconst, charconst, number, par, xxx :: (String, String) -- markupS = ("markupstart", "<[a-zA-Z0-9]+>" ) -- markupE = ("markupend", "" ) markupT = ("markup", ( "" ++ "|" ++ "&" ++ xname ++ ";" ) ) where xname = "[A-Za-z][-_:A-Za-z0-9]*" xattr = ws1RE ++ xname ++ eq ++ "(" ++ dq ++ "|" ++ sq ++ ")" eq = "\\s*=\\s*" dq = "\"[^\"]*\"" sq = "\'[^\']*\'" ws = ("ws", ws1RE ) ws' = ("ws", ws1RE' ) javacmt1 = ("comment", "//.*" ) javacmt = ("longcomment", "/\\*" ++ untilRE "\\*/" ) shcmt1 = ("comment", "#.*" ) strconst = ("strconst", "\"([^\"\\\\\n\r]|\\\\.)*\"" ) charconst = ("charconst", "\'([^\'\\\\\n\r]|\\\\.)*\'" ) number = ("number", "[0-9]+(\\.[0-9]*([eE][-+]?[0-9]+)?)?" ) par = ("par", "[\\(\\)\\[\\]\\{\\}]" ) xxx = ("xxx", "." ) -- ------------------------------------------------------------ plainRE :: Regex plainRE = buildRegex [ ("xxx", "[^<&\n]+" ) , ("xxx", "[<&\n]" ) ] -- ------------------------------------------------------------ haskellRE :: Regex haskellRE = buildRegex [ ws , ("comment", "(-)- .*" ) , ("longcomment", "\\{" ++ untilRE "-\\}" ) , ("keyword", buildKeywords [ "case", "class" , "data", "default", "deriving", "do" , "else" , "forall" , "if", "import", "in" , "infix", "infixl", "infixr" , "instance" , "let" , "module" , "newtype" , "of" , "qualified" , "then", "type" , "where" , "_" , "as", "ccall", "foreign", "hiding", "proc", "safe", "unsafe" ] ) , ("keyglyph", buildKeywords ["\\.\\.","::","=","\\\\","\\|","<-","->","-<","@","~","=>","!",",",";"] ) , ("varname" , varname ) , ("typename", "[A-Z_][a-zA-Z0-9_]*[']*" ) , ("opname", "`" ++ varname ++ "`" ) , strconst , charconst , number , par , ("operator", "[-!#$%&\\*\\+./<=>\\?@\\\\^\\|~]+") , xxx ] where varname = "[a-z_][a-zA-Z0-9_]*[']*" -- ------------------------------------------------------------ javaRE :: Regex javaRE = buildRegex [ ws , javacmt1 , javacmt , ("keyword", buildKeywords [ "abstract", "assert" , "break" , "case", "catch", "class", "continue" , "default", "do" , "else", "extends" , "final", "finally", "for" , "if", "implements", "import", "instanceof", "interface" , "native", "new" , "package", "private", "protected", "public" , "return" , "static", "super", "switch", "synchronized" , "this", "throw", "throws", "transient", "try" , "volatile" , "while" ] ) , ("typekeyword", buildKeywords [ "boolean", "byte" , "char" , "double" , "false", "float" , "int" , "long" , "null" , "short" , "true" , "void" ] ) , ("labelname", "(" ++ varname ++ "{\\}default):" ) , ("", ( mkLE ("keyword", "break|continue") ++ mkLE ws ++ mkLE ("labelname", varname) ) ) , ("varname", varname ) , ("typename", "[A-Z][a-zA-Z0-9_]*" ) , strconst , charconst , number , par , ("delimiter", "[.,;]" ) , ("operator", "[-+!%&/=\\*\\?~|<>:]+" ) , xxx ] where varname = "[a-z][a-zA-Z0-9_]*" -- ------------------------------------------------------------ bnfRE = buildRegex [ ws , ("bnfnt" , "[A-Z][a-zA-Z0-9_]*" ) , ("bnfterminal", "\"([^\"\\\\\n\r]|\\\\.)*\"" ) , ("bnfmeta", buildKeywords [ "\\[" , "\\]" , "::=" , "\\|" , "\\{" , "\\}" ] ) , xxx ] -- ------------------------------------------------------------ cppRE :: Regex cppRE = buildRegex [ ws , javacmt1 , javacmt , ("keyword", 
buildKeywords [ "asm" , "auto" , "break" , "case" , "catch" , "class" , "const" , "continue" , "default" , "delete" , "do" , "else" , "extern" , "for" , "friend" , "goto" , "if" , "inline" , "new" , "operator" , "overload" , "private" , "protected" , "public" , "register" , "return" , "sizeof" , "static" , "switch" , "template" , "this" , "typedef" , "throw" , "try" , "virtual" , "volatile" , "while" ] ) , ("typekeyword", buildKeywords [ "char" , "double" , "enum" , "float" , "int" , "long" , "short" , "signed" , "struct" , "union" , "unsigned" , "void" ] ) , ("cppcommand", ( "#" ++ ws0RE ++ "(" ++ buildKeywords [ "define" , "else" , "endif" , "if" , "ifdef" , "ifndef" , "(include[ \t].*)" , "undef" ] ++ ")" ) ) , ("specialword", buildKeywords [ "assert" , "exit" , "free" , "main" , "malloc" ] ) , ("varname", varname ) , ("typename", "[A-Z][a-zA-Z0-9_]*" ) , strconst , charconst , number , par , ("delimiter", "[.,;]" ) , ("operator", "[-+!%&/=\\*\\?~|<>:]+" ) , xxx ] where varname = "[a-z][a-zA-Z0-9_]*" -- ------------------------------------------------------------ shRE = buildRegex [ ws , shcmt1 , ("keyword", buildKeywords [ "alias" , "break" , "bg" , "case" , "cd" , "continue" , "declare" , "do" , "done" , "echo" , "elif" , "else" , "env" , "esac" , "eval" , "exec" , "exit" , "export" , "false" , "fg" , "fi" , "for" , "function" , "if" , "in" , "jobs" , "kill" , "local" , "pwd" , "return" , "set" , "shift" , "test" , "then" , "trap" , "true" , "unalias" , "unset" , "while" , "wait" ] ) , ("varname", "[A-Za-z_][a-zA-Z0-9_]*" ) , ("operator", "[-+!%&=\\\\\\*\\?~|<>:@$]+" ) , ("operator", "[\\(\\)\\[\\]\\{\\}]+" ) , strconst , charconst , xxx ] -- ------------------------------------------------------------ rubyRE = buildRegex [ ws , rubycmt , ("keyword", buildKeywords [ "begin" , "break" , "catch" , "case" , "class" , "def" , "do" , "else" , "elif" , "end" , "ensure" , "false" , "for" , "if" , "in" , "include" , "initialize" , "loop" , "module" , "new" , "nil" , "raise" , "require" , "rescue" , "self" , "then" , "true" , "type" , "until" , "when" , "while" , "yield" ] ) , ("typename", "[A-Z][A-Za-z0-9]*" ) , ("varname", "[A-Za-z_][a-zA-Z0-9_]*(!|\\?)?" 
) , ("instancename", "(@{1,2}|$)[A-Za-z_][a-zA-Z0-9_]*" ) , ("strconst", "%[qQx]\\{.*\\}" ) , ("strconst", "#\\{.*\\}" ) , ("strconst", ":[a-z][A-Za-z0-9]*" ) , strconst , charconst , regex , xxx ] where rubycmt = ("comment", "#(.{\\}\\{)*" ) regex = ("strconst", "/([^/\\\\\n\r]|\\\\.)*/" ) -- ------------------------------------------------------------ pplRE :: Regex pplRE = buildRegex [ ws , ("comment", "(-)- .*" ) , ("keyword", buildKeywords [ "and" , "begin" , "div" , "do" , "else" , "elseif" , "endif" , "endwhile" , "end" , "function" , "if" , "max" , "min" , "mod" , "not" , "of" , "or" , "procedure" , "repeat" , "return" , "then" , "until" , "var" , "while" , "xor" ] ) , ("typekeyword", buildKeywords [ "boolean" , "false" , "float" , "int" , "list" , "picture" , "string" , "true" ] ) , ("varname", "[A-Za-z_][a-zA-Z0-9_]*" ) , strconst , number , xxx ] -- ------------------------------------------------------------ pplassRE :: Regex pplassRE = buildRegex [ ws , ("comment", "(-)- .*" ) , ("keyword", buildKeywords [ "loadi" , "loadf" , "loads" , "emptyl" , "undef" , "load" ] ) , ("typename", buildKeywords [ "store" , "pop" ] ) , ("typekeyword", buildKeywords [ "jmp" , "brfalse" , "brtrue" , "pushj" , "popj" , "svc" ] ) , ("labelname", "(l[0-9]+:?)|([se]?_[A-Za-z0-9]*:?)" ) , ("varname", "[A-Za-z_][a-zA-Z0-9_]*" ) , strconst , xxx ] -- ------------------------------------------------------------ hxt-regex-xmlschema-9.0.4/examples/performance/0000755000000000000000000000000011701313414017716 5ustar0000000000000000hxt-regex-xmlschema-9.0.4/examples/performance/Makefile0000644000000000000000000000526111701313414021362 0ustar0000000000000000PKGFLAGS = GHCFLAGS = -Wall -O2 GHC = ghc $(GHCFLAGS) $(PKGFLAGS) CNT = 3 ropts = +RTS -s -RTS prog = ./REtest prog2 = ./Lines prog3 = ./RElines prog3a = ./SElines prog4 = ./Words prog5 = ./REwords prog5a = ./SEwords prog0 = ./Copy progs = $(prog) $(prog0) $(prog2) $(prog3) $(prog3a) $(prog4) $(prog5) $(prog5a) all : $(progs) $(prog) : $(prog).hs $(GHC) --make -o $@ $< local : $(GHC) --make -o $(prog) -fglasgow-exts $(prog).hs $(prog2) : $(prog) ln -f $(prog) $(prog2) $(prog3) : $(prog) ln -f $(prog) $(prog3) $(prog3a) : $(prog) ln -f $(prog) $(prog3a) $(prog4) : $(prog) ln -f $(prog) $(prog4) $(prog5) : $(prog) ln -f $(prog) $(prog5) $(prog5a) : $(prog) ln -f $(prog) $(prog5a) $(prog0) : $(prog) ln -f $(prog) $(prog0) # generate and read documents containing text # with 2^i characters tests = 25 test : $(prog) $(MAKE) genfiles tests="$(tests)" $(MAKE) copy tests="$(tests)" $(MAKE) lines tests="$(tests)" $(MAKE) relines tests="$(tests)" $(MAKE) selines tests="$(tests)" $(MAKE) words tests="$(tests)" $(MAKE) rewords tests="$(tests)" $(MAKE) sewords tests="$(tests)" perftest : $(prog) $(MAKE) test tests="2 3 10 11 12 13 14 15 16 17 18 19 20" genfiles : @for i in $(tests) ; \ do \ echo time $(prog) $(ropts) $$i ; \ time $(prog) $(ropts) $$i ; \ ls -l lines-*$$i.txt ; \ echo ; \ done copy : @for i in $(tests) ; \ do \ echo time $(prog0) $(ropts) $$i ; \ time $(prog0) $(ropts) $$i ; \ ls -l lines-*$$i.txt.copy ; \ echo ; \ done lines : @for i in $(tests) ; \ do \ echo time $(prog2) $(ropts) $$i ; \ time $(prog2) $(ropts) $$i ; \ ls -l lines-*$$i.txt.lines ; \ echo ; \ done relines : @for i in $(tests) ; \ do \ echo time $(prog3) $(ropts) $$i ; \ time $(prog3) $(ropts) $$i ; \ ls -l lines-*$$i.txt.relines ; \ echo ; \ done selines : @for i in $(tests) ; \ do \ echo time $(prog3a) $(ropts) $$i ; \ time $(prog3a) $(ropts) $$i ; \ ls -l lines-*$$i.txt.selines ; \ 
echo ; \ done words : @for i in $(tests) ; \ do \ echo time $(prog4) $(ropts) $$i ; \ time $(prog4) $(ropts) $$i ; \ ls -l lines-*$$i.txt.words ; \ echo ; \ done rewords : @for i in $(tests) ; \ do \ echo time $(prog5) $(ropts) $$i ; \ time $(prog5) $(ropts) $$i ; \ ls -l lines-*$$i.txt.rewords ; \ echo ; \ done sewords : @for i in $(tests) ; \ do \ echo time $(prog5a) $(ropts) $$i ; \ time $(prog5a) $(ropts) $$i ; \ ls -l lines-*$$i.txt.sewords ; \ echo ; \ done distclean : clean clean : rm -f $(progs) *.o *.hi *.txt *.txt.* .PHONY : clean distclean test perftest all local words rewords sewords lines relines selines copy genfiles hxt-regex-xmlschema-9.0.4/examples/performance/REtest.hs0000644000000000000000000000503511701313414021463 0ustar0000000000000000{-# LANGUAGE BangPatterns #-} -- ---------------------------------------- module Main(main) where import Text.Regex.XMLSchema.String import Control.Arrow import Data.Maybe import System.IO -- import the IO and commandline option stuff import System.Environment -- ---------------------------------------- main :: IO () main = do p <- getProgName al <- getArgs let i = if null al then 4 else (read . head $ al)::Int main' p i where main' p' = fromMaybe main1 . lookup (pn p') $ mpt mpt = [ ("REtest", main1) , ("Copy", main2 "copy" (:[])) , ("Lines", main2 "lines" lines) , ("RElines", main2 "relines" relines) , ("SElines", main2 "selines" relines') , ("Words", main2 "words" words) , ("REwords", main2 "rewords" rewords) , ("SEwords", main2 "sewords" rewords') ] -- ---------------------------------------- -- generate a text file of 2^i characters by cyclically repeating this source file main1 :: Int -> IO () main1 i = do genDoc i "REtest.hs" (fn i) return () -- ---------------------------------------- -- read a text file of 2^i characters and split it into lines or words main2 :: String -> (String -> [String]) -> Int -> IO () main2 ext lines' i = do hPutStrLn stderr "start processing" h <- openFile (fn i) ReadMode c <- hGetContents h let ls = lines' c o <- openFile (fn i ++ "." ++ ext) WriteMode mapM_ (hPutStrLn o) ls hClose o hClose h hPutStrLn stderr "end processing" relines :: String -> [String] relines = tokenize ".*" relines' :: String -> [String] relines' = tokenizeSubex "({line}.*)" >>> map snd rewords :: String -> [String] rewords = tokenize "\\S+" rewords' :: String -> [String] rewords' = tokenizeSubex "({word}\\S+)" >>> map snd -- ---------------------------------------- pn :: String -> String pn = reverse . takeWhile (/= '/') . reverse fn :: Int -> String fn = ("lines-" ++) . (++ ".txt") . reverse . take 4 . reverse . ((replicate 4 '0') ++ ) . show -- ---------------------------------------- genDoc :: Int -> String -> String -> IO () genDoc d inp outp = do s <- readFile inp let s' = take (2^d) . concat . 
repeat $ s writeFile outp s' -- ---------------------------------------- hxt-regex-xmlschema-9.0.4/test/0000755000000000000000000000000011701313414014556 5ustar0000000000000000hxt-regex-xmlschema-9.0.4/test/Makefile0000644000000000000000000000024211701313414016214 0ustar0000000000000000prog = ./main all : $(prog) $(prog) : Main.hs ghc --make -Wall -O2 -o $@ $< test : $(prog) $(prog) clean : rm -f $(prog) *.hi *.o .PHONY : all test clean hxt-regex-xmlschema-9.0.4/test/Main.hs0000644000000000000000000003107611701313414016005 0ustar0000000000000000module Main where import Control.Arrow import System import Text.Regex.XMLSchema.String import Text.Regex.XMLSchema.String.Regex import Test.HUnit -- ------------------------------------------------------------ parseTestsStd :: Test parseTestsStd = TestLabel "standard XML parse tests" $ TestList $ map parseTest $ tests where parseTest (re, rep) = TestCase $ assertEqual (show re ++ " must be parsed as " ++ show rep) rep (show . parseRegexExt $ re) tests = [ ("", "()") , (".", ".") , (".*", "(.*)") , ("(())", "()") , ("(a*)*", "(a*)") , ("(a*)+", "(a*)") , ("(a+)*", "(a*)") , ("(a+)+", "(a+)") , ("(a?){2,}", "(a*)") , ("((a?){2,}){0,}", "(a*)") , ("((a?){2,}){3,}", "(a*)") , ("(a{0,}){2,}", "(a*)") , ("(a{2,}){3,}", "(a{6,})") , ("[9-0]", "{empty char range}") , ("[0-9]", "[0-9]") , ("[0-99-0]", "[0-9]") , ("[abc]", "[a-c]") , ("[\0-\1114111]", "\\a") , ("[\0-\1114111]|[0-9]", "\\a") , ("[\0-\1114110]", "[�-􏿾]" ) , ("[abc-[b]]", "[ac]" ) , ("a|b|c|d", "[a-d]" ) , ("(a|b)|c", "[a-c]" ) , ("a|(b|c)", "[a-c]" ) , ("abc", "(a(bc))" ) -- seq is right ass , ("({1}a+)", "({1}(a+))" ) -- extension: labeled subexpressions , ("({1}({2}({3}a)))", "({1}({2}({3}a)))" ) ] parseTestsExt :: Test parseTestsExt = TestLabel "extended parse tests" $ TestList $ map parseTest $ tests where parseTest (re, rep) = TestCase $ assertEqual (show re ++ " must be parsed as " ++ show rep) rep (show . 
parseRegexExt $ re) tests = [ ("", "()") , (".", ".") , (".*", "(.*)") , ("\\a", "\\a") , ("\\A", "\\A") , ("(())", "()") , ("(a*)*", "(a*)") , ("(a*)+", "(a*)") , ("(a+)*", "(a*)") , ("(a+)+", "(a+)") , ("(a?){2,}", "(a*)") , ("((a?){2,}){0,}", "(a*)") , ("((a?){2,}){3,}", "(a*)") , ("(a{0,}){2,}", "(a*)") , ("(a{2,}){3,}", "(a{6,})") , ("[9-0]", "{empty char range}") , ("[0-9]", "[0-9]") , ("[0-99-0]", "[0-9]") , ("[abc]", "[a-c]") , ("[\0-\1114111]", "\\a") , ("[\0-\1114111]|[0-9]", "\\a") , ("[\0-\1114110]", "[�-􏿾]" ) , ("[abc-[b]]", "[ac]" ) , ("a|b|c|d", "[a-d]" ) , ("(a|b)|c", "[a-c]" ) , ("a|(b|c)", "[a-c]" ) , ("abc", "(a(bc))" ) -- seq is right ass , ("a*{^}b*", "((a*){^}(b*))" ) -- extension: exor , ("a*{^}b*{^}c*", "((a*){^}((b*){^}(c*)))") , ("a*{&}b*", "((a*){&}(b*))" ) -- extension: intersection , ("a*{&}b*{&}c*", "((a*){&}((b*){&}(c*)))") , ("a*{\\}b*", "((a*){\\}(b*))" ) -- extension: set difference , ("a*{\\}b*{\\}c*", "(((a*){\\}(b*)){\\}(c*))" ) , ("(a|b)*{\\}(.*aa.*)", "(([ab]*){\\}((.*)(a(a(.*)))))" ) , ("({1}a+)", "({1}(a+))" ) -- extension: labeled subexpressions , ("({1}({2}({3}a)))", "({1}({2}({3}a)))" ) , ("({1}do){|}({2}[a-z]+)", "(({1}(do)){|}({2}([a-z]+)))" ) -- deterministic choice of submatches , ("a{:}b{:}c", "(a{:}(b{:}c))" ) -- interleave ] simpleMatchTests :: Test simpleMatchTests = TestLabel "simple match tests" $ TestList $ concatMap matchTest $ tests where matchTest (re, ok, er) = map (matchOK re) ok ++ map (matchErr re) er matchOK re s = TestCase $ assertBool (show s ++ " must match " ++ show re) (matchExt re s) matchErr re s = TestCase $ assertBool (show s ++ " must not match " ++ show re) (not (matchExt re s)) tests = [ ( "" , [""] , ["a"] ) , ( "a" , ["a"] , ["", "b", "ab"] ) , ( "()" , [""] , ["a"] ) , ( "ab" , ["ab"] , ["", "b", "abc"] ) , ( "." , [".","a","\0","\1114111"] , ["\n","\r","",".."] ) , ( "\\a" , [".","a","\n","\r","\0","\1114111"] , ["",".."] ) , ( "\\A" , ["",".","a","\n","\r","\0","\1114111",".."] , [] ) , ( "a*" , ["", "a", "aa"] , ["b", "ab", "aab"] ) , ( "a+" , ["a", "aa", "aaa"] , ["", "b", "ab"] ) , ( "a?" 
, ["", "a"] , ["b", "ab"] ) , ( "a{2}" , ["aa"] , ["", "a", "aaa"] ) , ( "a{2,}" , ["aa","aaa"] , ["", "a", "aaab"] ) , ( "a{2,4}" , ["aa", "aaa", "aaaa"] , ["", "a", "aaaaa", "ab"] ) , ( "a|b" , ["a", "b"] , ["", "c", "ab", "abc"] ) , ( "[0-9]" , ["0", "5", "9"] , ["", "a", "00"] ) , ( "[^0-9]" , ["a"] , ["", "0", "9", "00"]) , ( "\32" , [" "] , [] ) , ( "[\0-\1114111]" , ["\0","\1114111","a"] , ["","aaa"] ) , ( "[^\0-\1114111]" , [] , ["","aaa","\0","\1114111","a"] ) , ( ".*a.*|.*b.*|.*c.*" , ["a", "abc", "acdc"] , ["", "dddd"] ) , ( ".*a.*{&}.*b.*{&}.*c.*" , ["abc", "abcd", "abcabcd"] , ["", "a", "bc", "acdc", "dddd"] ) , ( ".*a.*{&}.*b.*{&}.*c.*{&}.{3}" -- all permutations of "abc" , ["abc", "acb", "bac", "bca", "cab", "cba"] , ["", "a", "bc", "acd", "aaaa", "aba"] ) , ( ".*a.*{&}.*b.*{&}.*c.*" -- all words containing at least 1 a, 1 b and 1 c , ["abc", "acb", "bac", "bca", "cab", "cba", "abcd", "abcabc"] , ["", "a", "bc", "acd", "aaaa"] ) , ( ".*a.*{^}.*b.*" -- all words containing at least 1 a or 1 b but not both a's and b's , ["a", "b", "ac", "bc", "aaaa", "bbb", "aacc", "ccbb", "acdc"] , ["", "ab", "abc", "dddd"] ) , ( "/[*](.*{\\}(.*[*]/.*))[*]/" -- single line C comment of form /*...*/, but without any */ in the comment body -- this is the way to specify none greedy expessions -- if multi-line comment are required, substitute .* by \A, so newlines are allowed , ["/**/","/***/","/*x*/","/*///*/"] , ["", "/", "/*", "/*/", "/**/*/", "/*xxx*/xxx*/"] ) , ( "a{:}b{:}c" , ["abc", "acb", "bac", "bca", "cab", "cba"] , ["", "a", "ab", "abcc", "abca", "aba"] ) ] -- ------------------------------------------------------------ simpleSplitTests :: Test simpleSplitTests = TestLabel "simple split tests" $ TestList $ map splitTest $ tests where splitTest (re, inp, tok, rest) = TestCase $ assertEqual ("split " ++ show re ++ " " ++ show inp ++ " = " ++ show (tok, rest)) (tok, rest) (split re inp) tests = [ ("", "a", "", "a" ) , ("a*b", "abc", "ab", "c" ) , ("a*", "bc", "", "bc" ) , ("a+", "bc", "", "bc" ) , ("[", "bc", "", "bc" ) , ("a{2}", "aaa", "aa", "a" ) , ("a{2,}", "aaa", "aaa", "" ) , ("a|b", "ab", "a", "b" ) , ("a|b*", "bbba", "bbb", "a" ) , ("abc", "abcd", "abc", "d" ) ] -- ------------------------------------------------------------ simpleTokenTests :: Test simpleTokenTests = TestLabel "simple token tests" $ TestList $ map tokenTest $ tests where tokenTest (re, inp, toks) = TestCase $ assertEqual ("tokenize " ++ show re ++ " " ++ show inp ++ " = " ++ show toks) toks (tokenize re inp) tests = [ ("", "", [] ) , ("a", "aba", ["a", "a"] ) , ("a", "b", [] ) , ("a", "ba", ["a"] ) , ("a*", "a", ["a"] ) , ("a*", "ba", ["","a"] ) , ("a*", "aba", ["a", "a"] ) , ("a*", "abba", ["a", "", "a"] ) , ("a+", "abba", ["a", "a"] ) , ("a*b", "abba", ["ab", "b"] ) , (".*", "a\n\nb", ["a", "", "b"] ) , (".*", "a\n\nb\n", ["a", "", "b"] ) , ("\\w+", "a\n\nb\n", ["a", "b"] ) , ("\\w|ab", "aaa\n\nabc\n", ["a", "a", "a", "ab", "c"] ) , ("\\w|ab", "aaa abc", ["a", "a", "a", "ab", "c"] ) ] -- ------------------------------------------------------------ allTests :: Test allTests = TestList [ parseTestsStd , parseTestsExt , simpleMatchTests , simpleSplitTests , simpleTokenTests ] main :: IO () main = do c <- runTestTT allTests putStrLn $ show c let errs = errors c fails = failures c System.exitWith (codeGet errs fails) codeGet :: Int -> Int -> ExitCode codeGet errs fails | fails > 0 = ExitFailure 2 | errs > 0 = ExitFailure 1 | otherwise = ExitSuccess -- ------------------------------------------------------------ 
deltaTrc :: Regex -> String -> [(String, Regex)] deltaTrc re "" = [("", re)] deltaTrc re s@(c:cs) = (s, re) : ( if isZero re' then [("",re')] else deltaTrc re' cs ) where re' = delta1 re c matchTrc :: String -> String -> (Bool, [(String, Regex)]) matchTrc re = ( parseRegex >>> deltaTrc$ re ) >>> ( (last >>> snd >>> nullable) &&& id ) trcMatch :: String -> String -> IO() trcMatch re = putStrLn . showTrc . matchTrc re where showTrc = ( (show >>> (++ "\n")) *** (concatMap ( ((++ "\t") *** (show >>> (++"\n"))) >>> uncurry (++) ) ) ) >>> uncurry (flip (++)) -- ------------------------------------------------------------ hxt-regex-xmlschema-9.0.4/test/.ghci0000644000000000000000000000003411701313414015466 0ustar0000000000000000:set -i../../src :load Main