hxt-regex-xmlschema-9.2.0.3/examples/0000755000000000000000000000000012752557013015570 5ustar0000000000000000hxt-regex-xmlschema-9.2.0.3/examples/RegexXMLSchema/0000755000000000000000000000000012752557013020344 5ustar0000000000000000hxt-regex-xmlschema-9.2.0.3/examples/colorizeProgs/0000755000000000000000000000000013001405044020411 5ustar0000000000000000hxt-regex-xmlschema-9.2.0.3/examples/performance/0000755000000000000000000000000012752557013020071 5ustar0000000000000000hxt-regex-xmlschema-9.2.0.3/src/0000755000000000000000000000000012752557013014541 5ustar0000000000000000hxt-regex-xmlschema-9.2.0.3/src/Text/0000755000000000000000000000000012752557013015465 5ustar0000000000000000hxt-regex-xmlschema-9.2.0.3/src/Text/Regex/0000755000000000000000000000000012752557013016537 5ustar0000000000000000hxt-regex-xmlschema-9.2.0.3/src/Text/Regex/Glob/0000755000000000000000000000000012752557013017422 5ustar0000000000000000hxt-regex-xmlschema-9.2.0.3/src/Text/Regex/Glob/Generic/0000755000000000000000000000000012752557013020776 5ustar0000000000000000hxt-regex-xmlschema-9.2.0.3/src/Text/Regex/XMLSchema/0000755000000000000000000000000012752557013020320 5ustar0000000000000000hxt-regex-xmlschema-9.2.0.3/src/Text/Regex/XMLSchema/Generic/0000755000000000000000000000000012752557013021674 5ustar0000000000000000hxt-regex-xmlschema-9.2.0.3/test/0000755000000000000000000000000012752557013014731 5ustar0000000000000000hxt-regex-xmlschema-9.2.0.3/src/Text/Regex/Glob/String.hs0000644000000000000000000000213012752557013021220 0ustar0000000000000000-- ------------------------------------------------------------ {- | Module : Text.Regex.Glob.String Copyright : Copyright (C) 2011- Uwe Schmidt License : MIT Maintainer : Uwe Schmidt Stability : stable Portability: portable csh glob style pattern matcher -} -- ------------------------------------------------------------ module Text.Regex.Glob.String {-# DEPRECATED "use the more general 'Text.Regex.Glob.Generic' instead" #-} ( Regex , match , matchNoCase , parseRegex , parseRegexNoCase ) where import Text.Regex.Glob.Generic (Regex) import qualified Text.Regex.Glob.Generic as G -- ------------------------------------------------------------ match :: String -> String -> Bool match = G.match matchNoCase :: String -> String -> Bool matchNoCase = G.matchNoCase parseRegex :: String -> Regex parseRegex = G.parseRegex parseRegexNoCase :: String -> Regex parseRegexNoCase = G.parseRegexNoCase -- ------------------------------------------------------------ hxt-regex-xmlschema-9.2.0.3/src/Text/Regex/Glob/Generic.hs0000644000000000000000000000300712752557013021332 0ustar0000000000000000-- ------------------------------------------------------------ {- | Module : Text.Regex.Glob.String Copyright : Copyright (C) 2011- Uwe Schmidt License : MIT Maintainer : Uwe Schmidt Stability : stable Portability: portable csh glob style pattern matcher -} -- ------------------------------------------------------------ module Text.Regex.Glob.Generic ( GenRegex , Regex , RegexText , RegexTextLazy , RegexByteString , RegexByteStringLazy , match , matchNoCase , parseRegex , parseRegexNoCase ) where import Text.Regex.Glob.Generic.RegexParser (parseRegex, parseRegexNoCase) import Text.Regex.XMLSchema.Generic.Regex (matchWithRegex) import Text.Regex.XMLSchema.Generic.StringLike import Text.Regex.XMLSchema.Generic (GenRegex, Regex, RegexText, RegexTextLazy, RegexByteString, RegexByteStringLazy ) -- ------------------------------------------------------------ match :: StringLike s => s -> s -> Bool match = matchWithRegex . parseRegex matchNoCase :: StringLike s => s -> s -> Bool matchNoCase = matchWithRegex . parseRegexNoCase -- ------------------------------------------------------------ hxt-regex-xmlschema-9.2.0.3/src/Text/Regex/Glob/Generic/RegexParser.hs0000644000000000000000000000617612752557013023573 0ustar0000000000000000{-# LANGUAGE FlexibleContexts #-} -- ------------------------------------------------------------ {- | Copyright : Copyright (C) 2014- Uwe Schmidt License : MIT Maintainer : Uwe Schmidt (uwe@fh-wedel.de) Stability : stable Portability: portable csh style Glob Pattern Parser for Regular Expressions -} -- ------------------------------------------------------------ module Text.Regex.Glob.Generic.RegexParser ( parseRegex , parseRegexNoCase ) where import Data.Char (isLower, isUpper, toLower, toUpper) import Text.ParserCombinators.Parsec import Text.Regex.XMLSchema.Generic.Regex import Text.Regex.XMLSchema.Generic.StringLike -- ------------------------------------------------------------ -- | parse a glob pattern parseRegex :: StringLike s => s -> GenRegex s parseRegex = parseRegex' mkSymRng . toString parseRegexNoCase :: StringLike s => s -> GenRegex s parseRegexNoCase = parseRegex' mkNoCaseSymRng . toString parseRegex' :: StringLike s => (Char -> Char -> GenRegex s) -> String -> GenRegex s parseRegex' mkS = either (mkZero' . ("syntax error: " ++) . show) id . parse ( do r <- pattern mkS eof return r ) "" -- ------------------------------------------------------------ pattern :: StringLike s => (Char -> Char -> GenRegex s) -> Parser (GenRegex s) pattern mkS = many part >>= return . mkSeqs where -- part :: Parser (GenRegex s) part = ( many1 (noneOf "\\?*[{") >>= return . mkWord' ) <|> ( char '?' >> return mkDot ) <|> ( char '*' >> return mkAll ) <|> ( between (char '{') (char '}') wordList ) <|> ( between (char '[') (char ']') charSet ) <|> ( do c <- char '\\' >> anyChar return $ mkS c c ) mkWord' = mkSeqs . map (\ c -> mkS c c) -- wordList :: Parser (GenRegex s) wordList = sepBy (many1 (noneOf ",}")) (char ',') >>= return . foldr mkAlt (mkZero' "") . map mkWord' -- charSet :: Parser (GenRegex s) charSet = ( do p1 <- charSet' anyChar ps <- many $ charSet' (noneOf "]") return $ foldr mkAlt (mkZero' "") (p1 : ps) ) where charSet' cp = do c1 <- cp c2 <- rest c1 return $ mkS c1 c2 rest c1 = option c1 (char '-' >> anyChar) -- ------------------------------------------------------------ mkNoCaseSymRng :: StringLike s => Char -> Char -> GenRegex s mkNoCaseSymRng c1 c2 | isLower c1 && isLower c2 = mkAlt (mkSymRng (toUpper c1) (toUpper c2)) (mkSymRng c1 c2) | isUpper c1 && isUpper c2 = mkAlt (mkSymRng (toLower c1) (toLower c2)) (mkSymRng c1 c2) | otherwise = mkSymRng c1 c2 -- ------------------------------------------------------------ hxt-regex-xmlschema-9.2.0.3/src/Text/Regex/XMLSchema/String.hs0000644000000000000000000003004712752557013022126 0ustar0000000000000000-- ------------------------------------------------------------ {- | Copyright : Copyright (C) 2010- Uwe Schmidt License : MIT Maintainer : Uwe Schmidt Stability : stable Portability: portable Convenient functions for W3C XML Schema Regular Expression Matcher for Strings. A specialisation of Text.Regex.XMLSchema.Generic as compatibility module to old non generic version Grammar can be found under -} -- ------------------------------------------------------------ module Text.Regex.XMLSchema.String {-# DEPRECATED "use the more general 'Text.Regex.XMLSchema.Generic' instead" #-} ( Regex , grep , grepExt , grepRE , grepREwithLineNum , match , matchExt , matchSubex , sed , sedExt , split , splitExt , splitSubex , tokenize , tokenizeExt , tokenize' , tokenizeExt' , tokenizeSubex , matchRE , matchSubexRE , sedRE , splitRE , splitSubexRE , tokenizeRE , tokenizeRE' , tokenizeSubexRE -- Text.Regex.XMLSchema.Generic.Regex , mkZero , mkUnit , mkSym1 , mkSymRng , mkWord , mkDot , mkStar , mkAll , mkAlt , mkElse , mkSeq , mkSeqs , mkRep , mkRng , mkOpt , mkDiff , mkIsect , mkExor , mkCompl , mkBr , isZero , errRegex -- Text.Regex.XMLSchema.Generic.RegexParser , parseRegex , parseRegexExt , parseContextRegex ) where import Text.Regex.XMLSchema.Generic (Regex) import qualified Text.Regex.XMLSchema.Generic as G import Text.Regex.XMLSchema.Generic.Regex import Text.Regex.XMLSchema.Generic.RegexParser -- ------------------------------------------------------------ -- | split a string by taking the longest prefix matching a regular expression -- -- @Nothing@ is returned in case there is no matching prefix, -- else the pair of prefix and rest is returned splitRE :: Regex -> String -> Maybe (String, String) splitRE = G.splitRE -- | convenient function for 'splitRE' -- -- examples: -- -- > split "a*b" "abc" = ("ab","c") -- > split "a*" "bc" = ("", "bc") -- "a*" matches "" -- > split "a+" "bc" = ("", "bc") -- "a+" does not match, no split -- > split "[" "abc" = ("", "abc") -- "[" syntax error, no split split :: String -> String -> (String, String) split = G.split -- | split with extended syntax splitExt :: String -> String -> (String, String) splitExt = G.splitExt -- ------------------------------------------------------------ -- | split a string by removing the longest prefix matching a regular expression -- and then return the list of subexpressions found in the matching part -- -- @Nothing@ is returned in case of no matching prefix, -- else the list of pairs of labels and submatches and the -- rest is returned splitSubexRE :: Regex -> String -> Maybe ([(String, String)], String) splitSubexRE = G.splitSubexRE -- | convenient function for 'splitSubex', uses extended syntax -- -- examples: -- -- > splitSubex "({1}a*)b" "abc" = ([("1","a")],"c") -- > splitSubex "({2}a*)" "bc" = ([("2","")], "bc") -- > splitSubex "({1}a|b)+" "abc" = ([("1","a"),("1","b")],"c") -- subex 1 matches 2 times -- > -- > splitSubex ".*({x}a*)" "aa" = ([("x",""),("x","a"),("x","aa")],"") -- > -- nondeterminism: 3 matches for a* -- > -- > splitSubex "({1}do)|({2}[a-z]+)" "do you know" -- > = ([("1","do"),("2","do")]," you know") -- > -- nondeterminism: 2 matches for do -- > -- > splitSubex "({1}do){|}({2}[a-z]+)" "do you know" -- > = ([("1","do")]," you know") -- > -- no nondeterminism with {|}: 1. match for do -- > -- > splitSubex "({1}a+)" "bcd" = ([], "bcd") -- no match -- > splitSubex "[" "abc" = ([], "abc") -- syntax error splitSubex :: String -> String -> ([(String,String)], String) splitSubex = G.splitSubex -- ------------------------------------------------------------ -- | The function, that does the real work for 'tokenize' tokenizeRE :: Regex -> String -> [String] tokenizeRE = G.tokenizeRE -- | split a string into tokens (words) by giving a regular expression -- which all tokens must match. -- -- Convenient function for 'tokenizeRE' -- -- This can be used for simple tokenizers. -- It is recommended to use regular expressions where the empty word does not match. -- Else there will appear a lot of probably useless empty tokens in the output. -- All none matching chars are discarded. If the given regex contains syntax errors, -- @Nothing@ is returned -- -- examples: -- -- > tokenize "a" "aabba" = ["a","a","a"] -- > tokenize "a*" "aaaba" = ["aaa","a"] -- > tokenize "a*" "bbb" = ["","",""] -- > tokenize "a+" "bbb" = [] -- > -- > tokenize "a*b" "" = [] -- > tokenize "a*b" "abc" = ["ab"] -- > tokenize "a*b" "abaab ab" = ["ab","aab","ab"] -- > -- > tokenize "[a-z]{2,}|[0-9]{2,}|[0-9]+[.][0-9]+" "ab123 456.7abc" -- > = ["ab","123","456.7","abc"] -- > -- > tokenize "[a-z]*|[0-9]{2,}|[0-9]+[.][0-9]+" "cab123 456.7abc" -- > = ["cab","123","456.7","abc"] -- > -- > tokenize "[^ \t\n\r]*" "abc def\t\n\rxyz" -- > = ["abc","def","xyz"] -- > -- > tokenize ".*" "\nabc\n123\n\nxyz\n" -- > = ["","abc","123","","xyz"] -- > -- > tokenize ".*" = lines -- > -- > tokenize "[^ \t\n\r]*" = words tokenize :: String -> String -> [String] tokenize = G.tokenize -- | tokenize with extended syntax tokenizeExt :: String -> String -> [String] tokenizeExt = G.tokenizeExt -- ------------------------------------------------------------ -- | split a string into tokens and delimierter by giving a regular expression -- wich all tokens must match -- -- This is a generalisation of the above 'tokenizeRE' functions. -- The none matching char sequences are marked with @Left@, the matching ones are marked with @Right@ -- -- If the regular expression contains syntax errors @Nothing@ is returned -- -- The following Law holds: -- -- > concat . map (either id id) . tokenizeRE' re == id tokenizeRE' :: Regex -> String -> [Either String String] tokenizeRE' = G.tokenizeRE' -- | convenient function for 'tokenizeRE'' -- -- When the regular expression parses as Zero, -- @[Left input]@ is returned, that means no tokens are found tokenize' :: String -> String -> [Either String String] tokenize' = G.tokenize' tokenizeExt' :: String -> String -> [Either String String] tokenizeExt' = G.tokenizeExt' -- ------------------------------------------------------------ -- | split a string into tokens (pair of labels and words) by giving a regular expression -- containing labeled subexpressions. -- -- This function should not be called with regular expressions -- without any labeled subexpressions. This does not make sense, because the result list -- will always be empty. -- -- Result is the list of matching subexpressions -- This can be used for simple tokenizers. -- At least one char is consumed by parsing a token. -- The pairs in the result list contain the matching substrings. -- All none matching chars are discarded. If the given regex contains syntax errors, -- @Nothing@ is returned tokenizeSubexRE :: Regex -> String -> [(String, String)] tokenizeSubexRE = G.tokenizeSubexRE -- | convenient function for 'tokenizeSubexRE' a string -- -- examples: -- -- > tokenizeSubex "({name}[a-z]+)|({num}[0-9]{2,})|({real}[0-9]+[.][0-9]+)" -- > "cab123 456.7abc" -- > = [("name","cab") -- > ,("num","123") -- > ,("real","456.7") -- > ,("name","abc")] -- > -- > tokenizeSubex "({real}({n}[0-9]+)([.]({f}[0-9]+))?)" -- > "12.34" = [("real","12.34") -- > ,("n","12") -- > ,("f","34")] -- > -- > tokenizeSubex "({real}({n}[0-9]+)([.]({f}[0-9]+))?)" -- > "12 34" = [("real","12"),("n","12") -- > ,("real","34"),("n","34")] -- > -- > tokenizeSubex "({real}({n}[0-9]+)(([.]({f}[0-9]+))|({f})))" -- > "12 34.56" = [("real","12"),("n","12"),("f","") -- > ,("real","34.56"),("n","34"),("f","56")] tokenizeSubex :: String -> String -> [(String,String)] tokenizeSubex = G.tokenizeSubex -- ------------------------------------------------------------ -- | sed like editing function -- -- All matching tokens are edited by the 1. argument, the editing function, -- all other chars remain as they are sedRE :: (String -> String) -> Regex -> String -> String sedRE = G.sedRE -- | convenient function for 'sedRE' -- -- examples: -- -- > sed (const "b") "a" "xaxax" = "xbxbx" -- > sed (\ x -> x ++ x) "a" "xax" = "xaax" -- > sed undefined "[" "xxx" = "xxx" sed :: (String -> String) -> String -> String -> String sed = G.sed sedExt :: (String -> String) -> String -> String -> String sedExt = G.sedExt -- ------------------------------------------------------------ -- | match a string with a regular expression matchRE :: Regex -> String -> Bool matchRE = G.matchRE -- | convenient function for 'matchRE' -- -- Examples: -- -- > match "x*" "xxx" = True -- > match "x" "xxx" = False -- > match "[" "xxx" = False match :: String -> String -> Bool match = G.match -- | match with extended regular expressions matchExt :: String -> String -> Bool matchExt = G.matchExt -- ------------------------------------------------------------ -- | match a string with a regular expression -- and extract subexpression matches matchSubexRE :: Regex -> String -> [(String, String)] matchSubexRE = G.matchSubexRE -- | convenient function for 'matchRE' -- -- Examples: -- -- > matchSubex "({1}x*)" "xxx" = [("1","xxx")] -- > matchSubex "({1}x*)" "y" = [] -- > matchSubex "({w}[0-9]+)x({h}[0-9]+)" "800x600" = [("w","800"),("h","600")] -- > matchSubex "[" "xxx" = [] matchSubex :: String -> String -> [(String, String)] matchSubex = G.matchSubex -- ------------------------------------------------------------ -- | grep like filter for lists of strings -- -- The regular expression may be prefixed with the usual context spec \"^\" for start of string, -- and "\\<" for start of word. -- and suffixed with \"$\" for end of text and "\\>" end of word. -- Word chars are defined by the multi char escape sequence "\\w" -- -- Examples -- -- > grep "a" ["_a_", "_a", "a_", "a", "_"] => ["_a_", "_a", "a_", "a"] -- > grep "^a" ["_a_", "_a", "a_", "a", "_"] => ["a_", "a"] -- > grep "a$" ["_a_", "_a", "a_", "a", "_"] => ["_a", "a"] -- > grep "^a$" ["_a_", "_a", "a_", "a", "_"] => ["a"] -- > grep "\\ ["x a b", " ax "] -- > grep "a\\>" ["x a b", " ax ", " xa ", "xab"] => ["x a b", " xa "] grep :: String -> [String] -> [String] grep = G.grep -- | grep with extended regular expressions grepExt :: String -> [String] -> [String] grepExt = G.grepExt -- | grep with already prepared Regex (ususally with 'parseContextRegex') grepRE :: Regex -> [String] -> [String] grepRE = G.grepRE -- | grep with Regex and line numbers grepREwithLineNum :: Regex -> [String] -> [(Int,String)] grepREwithLineNum = G.grepREwithLineNum -- ------------------------------------------------------------ hxt-regex-xmlschema-9.2.0.3/src/Text/Regex/XMLSchema/Generic.hs0000644000000000000000000000417112752557013022233 0ustar0000000000000000-- ------------------------------------------------------------ {- | Module : Text.Regex.XMLSchema.Generic Copyright : Copyright (C) 2014- Uwe Schmidt License : MIT Maintainer : Uwe Schmidt Stability : stable Portability: portable Convenient functions for W3C XML Schema Regular Expression Matcher. For internals see 'Text.Regex.XMLSchema.Generic.Regex' and 'Text.Regex.XMLSchema.Generic.Matching' Grammar can be found under -} -- ------------------------------------------------------------ module Text.Regex.XMLSchema.Generic ( GenRegex , Regex , RegexText , RegexTextLazy , RegexByteString , RegexByteStringLazy , grep , grepExt , grepRE , grepREwithLineNum , match , matchExt , matchSubex , sed , sedExt , split , splitExt , splitSubex , tokenize , tokenizeExt , tokenize' , tokenizeExt' , tokenizeSubex , matchRE , matchSubexRE , sedRE , splitRE , splitSubexRE , tokenizeRE , tokenizeRE' , tokenizeSubexRE , mkZero , mkZero' , mkUnit , mkSym1 , mkSymRng , mkWord , mkDot , mkStar , mkAll , mkAlt , mkElse , mkSeq , mkSeqs , mkRep , mkRng , mkOpt , mkDiff , mkIsect , mkExor , mkCompl , mkBr , mkBr' , isZero , errRegex , parseRegex , parseRegexExt , parseContextRegex ) where import Text.Regex.XMLSchema.Generic.Matching import Text.Regex.XMLSchema.Generic.Regex import Text.Regex.XMLSchema.Generic.RegexParser import qualified Data.Text as T import qualified Data.Text.Lazy as TL import qualified Data.ByteString.Char8 as B import qualified Data.ByteString.Lazy.Char8 as BL type Regex = GenRegex String type RegexText = GenRegex T.Text type RegexTextLazy = GenRegex TL.Text type RegexByteString = GenRegex B.ByteString type RegexByteStringLazy = GenRegex BL.ByteString -- ------------------------------------------------------------ hxt-regex-xmlschema-9.2.0.3/src/Text/Regex/XMLSchema/Generic/Matching.hs0000644000000000000000000003650612752557013023774 0ustar0000000000000000{-# LANGUAGE BangPatterns #-} -- ------------------------------------------------------------ {- | Module : Text.Regex.XMLSchema.Generic Copyright : Copyright (C) 2014- Uwe Schmidt License : MIT Maintainer : Uwe Schmidt Stability : stable Portability: portable Convenient functions for W3C XML Schema Regular Expression Matcher. For internals see 'Text.Regex.XMLSchema.Regex' Grammar can be found under -} -- ------------------------------------------------------------ module Text.Regex.XMLSchema.Generic.Matching ( grep , grepExt , grepRE , grepREwithLineNum , match , matchExt , matchSubex , sed , sedExt , split , splitExt , splitSubex , tokenize , tokenizeExt , tokenize' , tokenizeExt' , tokenizeSubex , matchRE , matchSubexRE , sedRE , splitRE , splitSubexRE , tokenizeRE , tokenizeRE' , tokenizeSubexRE ) where import Control.Arrow import Data.Maybe import Text.Regex.XMLSchema.Generic.Regex import Text.Regex.XMLSchema.Generic.RegexParser import Text.Regex.XMLSchema.Generic.StringLike {- import Debug.Trace (traceShow) trc :: Show a => String -> a -> a trc msg x = traceShow (msg, x) x -- -} -- ------------------------------------------------------------ -- | split a string by taking the longest prefix matching a regular expression -- -- @Nothing@ is returned in case there is no matching prefix, -- else the pair of prefix and rest is returned splitRE :: StringLike s => GenRegex s -> s -> Maybe (s, s) splitRE re input = do (sms, rest) <- splitWithRegex re input return (snd . head $ sms, rest) -- | convenient function for 'splitRE' -- -- examples: -- -- > split "a*b" "abc" = ("ab","c") -- > split "a*" "bc" = ("", "bc") -- "a*" matches "" -- > split "a+" "bc" = ("", "bc") -- "a+" does not match, no split -- > split "[" "abc" = ("", "abc") -- "[" syntax error, no split split :: StringLike s => s -> s -> (s, s) split = split' parseRegex -- | split with extended syntax splitExt :: StringLike s => s -> s -> (s, s) splitExt = split' parseRegexExt split' :: StringLike s => (s -> GenRegex s) -> s -> s -> (s, s) split' parseRe re input = fromMaybe (emptyS, input) . (splitRE . parseRe $ re) $ input -- ------------------------------------------------------------ -- | split a string by removing the longest prefix matching a regular expression -- and then return the list of subexpressions found in the matching part -- -- @Nothing@ is returned in case of no matching prefix, -- else the list of pairs of labels and submatches and the -- rest is returned splitSubexRE :: StringLike s => GenRegex s -> s -> Maybe ([(s, s)], s) splitSubexRE re input = do (sms, rest) <- splitWithRegex re input return (map (first fromJust) . drop 1 $ sms, rest) -- | convenient function for 'splitSubex', uses extended syntax -- -- examples: -- -- > splitSubex "({1}a*)b" "abc" = ([("1","a")],"c") -- > splitSubex "({2}a*)" "bc" = ([("2","")], "bc") -- > splitSubex "({1}a|b)+" "abc" = ([("1","a"),("1","b")],"c") -- subex 1 matches 2 times -- > -- > splitSubex ".*({x}a*)" "aa" = ([("x",""),("x","a"),("x","aa")],"") -- > -- nondeterminism: 3 matches for a* -- > -- > splitSubex "({1}do)|({2}[a-z]+)" "do you know" -- > = ([("1","do"),("2","do")]," you know") -- > -- nondeterminism: 2 matches for do -- > -- > splitSubex "({1}do){|}({2}[a-z]+)" "do you know" -- > = ([("1","do")]," you know") -- > -- no nondeterminism with {|}: 1. match for do -- > -- > splitSubex "({1}a+)" "bcd" = ([], "bcd") -- no match -- > splitSubex "[" "abc" = ([], "abc") -- syntax error splitSubex :: StringLike s => s -> s -> ([(s, s)], s) splitSubex re inp = fromMaybe ([], inp) . (splitSubexRE . parseRegexExt $ re) $ inp -- ------------------------------------------------------------ -- | The function, that does the real work for 'tokenize' tokenizeRE :: StringLike s => GenRegex s -> s -> [s] tokenizeRE re = token'' where fcs = firstChars re re1 = mkDiff re mkUnit token'' = token' re fcs token1'' = token' re1 fcs -- token' :: StringLike s => GenRegex s -> CharSet -> s -> [s] token' re' fcs' inp | nullS inp = [] | otherwise = evalRes . splitWithRegexCS re' fcs' $ inp where evalRes Nothing = token'' (dropS 1 inp) -- re does not match any prefix evalRes (Just (toks, rest)) | nullS tok = tok : token'' (dropS 1 rest) -- re is nullable and only the empty prefix matches -- discard one char and try again | otherwise = tok : token1'' rest -- real token found, next token must not be empty where tok = snd . head $ toks -- | split a string into tokens (words) by giving a regular expression -- which all tokens must match. -- -- Convenient function for 'tokenizeRE' -- -- This can be used for simple tokenizers. -- It is recommended to use regular expressions where the empty word does not match. -- Else there will appear a lot of probably useless empty tokens in the output. -- All none matching chars are discarded. If the given regex contains syntax errors, -- @Nothing@ is returned -- -- examples: -- -- > tokenize "a" "aabba" = ["a","a","a"] -- > tokenize "a*" "aaaba" = ["aaa","a"] -- > tokenize "a*" "bbb" = ["","",""] -- > tokenize "a+" "bbb" = [] -- > -- > tokenize "a*b" "" = [] -- > tokenize "a*b" "abc" = ["ab"] -- > tokenize "a*b" "abaab ab" = ["ab","aab","ab"] -- > -- > tokenize "[a-z]{2,}|[0-9]{2,}|[0-9]+[.][0-9]+" "ab123 456.7abc" -- > = ["ab","123","456.7","abc"] -- > -- > tokenize "[a-z]*|[0-9]{2,}|[0-9]+[.][0-9]+" "cab123 456.7abc" -- > = ["cab","123","456.7","abc"] -- > -- > tokenize "[^ \t\n\r]*" "abc def\t\n\rxyz" -- > = ["abc","def","xyz"] -- > -- > tokenize ".*" "\nabc\n123\n\nxyz\n" -- > = ["","abc","123","","xyz"] -- > -- > tokenize ".*" = lines -- > -- > tokenize "[^ \t\n\r]*" = words tokenize :: StringLike s => s -> s -> [s] tokenize = tokenizeRE . parseRegex -- | tokenize with extended syntax tokenizeExt :: StringLike s => s -> s -> [s] tokenizeExt = tokenizeRE . parseRegexExt -- ------------------------------------------------------------ -- | split a string into tokens and delimierter by giving a regular expression -- which all tokens must match -- -- This is a generalisation of the above 'tokenizeRE' functions. -- The none matching char sequences are marked with @Left@, the matching ones are marked with @Right@ -- -- If the regular expression contains syntax errors @Nothing@ is returned -- -- The following Law holds: -- -- > concat . map (either id id) . tokenizeRE' re == id tokenizeRE' :: StringLike s => GenRegex s -> s -> [Either s s] tokenizeRE' re inp0 = token'' (inp0, 0) inp0 where fcs = firstChars re re1 = mkDiff re mkUnit token'' = token' re fcs token1'' = token' re1 fcs -- token' :: StringLike s => GenRegex s -> CharSet -> (s, Int) -> s -> [Either s s] token' re' fcs' (uns, ! n) inp | nullS inp = addUnmatched [] | otherwise = evalRes . splitWithRegexCS re' fcs' $ inp where addUnmatched | n == 0 = id | otherwise = ((Left $ takeS n uns) :) addMatched t = addUnmatched . ((Right t) :) evalRes Nothing = token'' (uns, n + 1) (dropS 1 inp) -- re does not match any prefix evalRes (Just (toks, rest)) | nullS tok = addMatched tok -- re is nullable and only the empty prefix matches $ token'' (rest, 1) (dropS 1 rest) -- discard one char and try again | otherwise = addMatched tok $ token1'' (rest, 0) rest -- real token found, next token must not be empty where tok = snd . head $ toks -- | convenient function for 'tokenizeRE'' -- -- When the regular expression parses as Zero, @[Left input]@ is returned, that means no tokens are found tokenize' :: StringLike s => s -> s -> [Either s s] tokenize' = tokenizeRE' . parseRegex tokenizeExt' :: StringLike s => s -> s -> [Either s s] tokenizeExt' = tokenizeRE' . parseRegexExt -- ------------------------------------------------------------ -- | split a string into tokens (pair of labels and words) by giving a regular expression -- containing labeled subexpressions. -- -- This function should not be called with regular expressions -- without any labeled subexpressions. This does not make sense, because the result list -- will always be empty. -- -- Result is the list of matching subexpressions -- This can be used for simple tokenizers. -- At least one char is consumed by parsing a token. -- The pairs in the result list contain the matching substrings. -- All none matching chars are discarded. If the given regex contains syntax errors, -- @Nothing@ is returned tokenizeSubexRE :: StringLike s => GenRegex s -> s -> [(s, s)] tokenizeSubexRE re = token'' where fcs = firstChars re re1 = mkDiff re mkUnit token'' = token' re fcs token1'' = token' re1 fcs -- token' :: StringLike s => GenRegex s -> CharSet -> s -> [(s, s)] token' re' fcs' inp | nullS inp = [] | otherwise = evalRes . splitWithRegexCS re' fcs' $ inp where evalRes Nothing = token'' (dropS 1 inp) -- re does not match any prefix evalRes (Just (toks, rest)) | nullS tok = res ++ token'' (dropS 1 rest) -- re is nullable and only the empty prefix matches | otherwise = res ++ token1'' rest -- token found, tokenize the rest where res = map (first fromJust) . tail $ toks tok = snd . head $ toks -- | convenient function for 'tokenizeSubexRE' a string -- -- examples: -- -- > tokenizeSubex "({name}[a-z]+)|({num}[0-9]{2,})|({real}[0-9]+[.][0-9]+)" -- > "cab123 456.7abc" -- > = [("name","cab") -- > ,("num","123") -- > ,("real","456.7") -- > ,("name","abc")] -- > -- > tokenizeSubex "({real}({n}[0-9]+)([.]({f}[0-9]+))?)" -- > "12.34" = [("real","12.34") -- > ,("n","12") -- > ,("f","34")] -- > -- > tokenizeSubex "({real}({n}[0-9]+)([.]({f}[0-9]+))?)" -- > "12 34" = [("real","12"),("n","12") -- > ,("real","34"),("n","34")] -- > -- > tokenizeSubex "({real}({n}[0-9]+)(([.]({f}[0-9]+))|({f})))" -- > "12 34.56" = [("real","12"),("n","12"),("f","") -- > ,("real","34.56"),("n","34"),("f","56")] tokenizeSubex :: StringLike s => s -> s -> [(s, s)] tokenizeSubex = tokenizeSubexRE . parseRegexExt -- ------------------------------------------------------------ -- | sed like editing function -- -- All matching tokens are edited by the 1. argument, the editing function, -- all other chars remain as they are sedRE :: StringLike s => (s -> s) -> GenRegex s -> s -> s sedRE edit re = concatS . map (either id edit) . tokenizeRE' re -- | convenient function for 'sedRE' -- -- examples: -- -- > sed (const "b") "a" "xaxax" = "xbxbx" -- > sed (\ x -> x ++ x) "a" "xax" = "xaax" -- > sed undefined "[" "xxx" = "xxx" sed :: StringLike s => (s -> s) -> s -> s -> s sed edit = sedRE edit . parseRegex sedExt :: StringLike s => (s -> s) -> s -> s -> s sedExt edit = sedRE edit . parseRegexExt -- ------------------------------------------------------------ -- | match a string with a regular expression matchRE :: StringLike s => GenRegex s -> s -> Bool matchRE = matchWithRegex -- | convenient function for 'matchRE' -- -- Examples: -- -- > match "x*" "xxx" = True -- > match "x" "xxx" = False -- > match "[" "xxx" = False match :: StringLike s => s -> s -> Bool match = matchWithRegex . parseRegex -- | match with extended regular expressions matchExt :: StringLike s => s -> s -> Bool matchExt = matchWithRegex . parseRegexExt -- ------------------------------------------------------------ -- | match a string with a regular expression -- and extract subexpression matches matchSubexRE :: StringLike s => GenRegex s -> s -> [(s, s)] matchSubexRE re = map (first fromJust) . fromMaybe [] . matchWithRegex' re -- | convenient function for 'matchRE' -- -- Examples: -- -- > matchSubex "({1}x*)" "xxx" = [("1","xxx")] -- > matchSubex "({1}x*)" "y" = [] -- > matchSubex "({w}[0-9]+)x({h}[0-9]+)" "800x600" = [("w","800"),("h","600")] -- > matchSubex "[" "xxx" = [] matchSubex :: StringLike s => s -> s -> [(s, s)] matchSubex = matchSubexRE . parseRegexExt -- ------------------------------------------------------------ -- | grep like filter for lists of strings -- -- The regular expression may be prefixed with the usual context spec \"^\" for start of string, -- and "\\<" for start of word. -- and suffixed with \"$\" for end of text and "\\>" end of word. -- Word chars are defined by the multi char escape sequence "\\w" -- -- Examples -- -- > grep "a" ["_a_", "_a", "a_", "a", "_"] => ["_a_", "_a", "a_", "a"] -- > grep "^a" ["_a_", "_a", "a_", "a", "_"] => ["a_", "a"] -- > grep "a$" ["_a_", "_a", "a_", "a", "_"] => ["_a", "a"] -- > grep "^a$" ["_a_", "_a", "a_", "a", "_"] => ["a"] -- > grep "\\ ["x a b", " ax "] -- > grep "a\\>" ["x a b", " ax ", " xa ", "xab"] => ["x a b", " xa "] grep :: StringLike s => s -> [s] -> [s] grep = grep' parseRegex' -- | grep with extended regular expressions grepExt :: StringLike s => s -> [s] -> [s] grepExt = grep' parseRegexExt' grep' :: StringLike s => (String -> GenRegex s) -> s -> [s] -> [s] grep' parseRe = grepRE . parseContextRegex parseRe -- | grep with already prepared Regex (ususally with 'parseContextRegex') grepRE :: StringLike s => GenRegex s-> [s] -> [s] grepRE re = filter (matchRE re) -- | grep with Regex and line numbers grepREwithLineNum :: StringLike s => GenRegex s -> [s] -> [(Int, s)] grepREwithLineNum re = filter (matchRE re . snd) . zip [(1::Int)..] -- ------------------------------------------------------------ hxt-regex-xmlschema-9.2.0.3/src/Text/Regex/XMLSchema/Generic/RegexParser.hs0000644000000000000000000003611212752557013024462 0ustar0000000000000000{-# LANGUAGE FlexibleContexts #-} -- ------------------------------------------------------------ {- | Module : Text.Regex.XMLSchema.RegexParser Copyright : Copyright (C) 2014- Uwe Schmidt License : MIT Maintainer : Uwe Schmidt (uwe@fh-wedel.de) Stability : stable Portability: portable W3C XML Schema Regular Expression Parser This parser supports the full W3C standard, the complete grammar can be found under and extensions for all missing set operations, intersection, difference, exclusive or, interleave, complement -} -- ------------------------------------------------------------ module Text.Regex.XMLSchema.Generic.RegexParser ( parseRegex , parseRegexExt , parseRegex' , parseRegexExt' , parseContextRegex ) where import Data.Char.Properties.UnicodeBlocks import Data.Char.Properties.UnicodeCharProps import Data.Char.Properties.XMLCharProps import Data.List (isPrefixOf, isSuffixOf) import Data.Maybe import Data.Set.CharSet import Text.ParserCombinators.Parsec import Text.Regex.XMLSchema.Generic.Regex import Text.Regex.XMLSchema.Generic.StringLike -- ------------------------------------------------------------ -- | parse a standard W3C XML Schema regular expression parseRegex :: StringLike s => s -> GenRegex s parseRegex = parseRegex' . toString parseRegex' :: StringLike s => String -> GenRegex s parseRegex' = parseRegex'' regExpStd -- | parse an extended syntax W3C XML Schema regular expression -- -- The Syntax of the W3C XML Schema spec is extended by -- further useful set operations, like intersection, difference, exor. -- Subexpression match becomes possible with \"named\" pairs of parentheses. -- The multi char escape sequence \\a represents any Unicode char, -- The multi char escape sequence \\A represents any Unicode word, (\\A = \\a*). -- All syntactically wrong inputs are mapped to the Zero expression representing the -- empty set of words. Zero contains as data field a string for an error message. -- So error checking after parsing becomes possible by checking against Zero ('isZero' predicate) parseRegexExt :: StringLike s => s -> GenRegex s parseRegexExt = parseRegexExt' . toString parseRegexExt' :: StringLike s => String -> GenRegex s parseRegexExt' = parseRegex'' regExpExt parseRegex'' :: StringLike s => Parser (GenRegex s) -> String -> GenRegex s parseRegex'' regExp' = either (mkZero' . ("syntax error: " ++) . show) id . parse ( do r <- regExp' eof return r ) "" -- ------------------------------------------------------------ -- | parse a regular expression surrounded by contenxt spec -- -- a leading @^@ denotes start of text, -- a trailing @$@ denotes end of text, -- a leading @\\<@ denotes word start, -- a trailing @\\>@ denotes word end. -- -- The 1. param ist the regex parser ('parseRegex' or 'parseRegexExt') parseContextRegex :: StringLike s => (String -> GenRegex s) -> s -> GenRegex s parseContextRegex parseRe re0 = re' where parseAW = parseRegexExt' "(\\A\\W)?" parseWA = parseRegexExt' "(\\W\\A)?" re = toString re0 re' = mkSeqs . concat $ [ startContext , (:[]) . parseRe $ re2 , endContext ] (startContext, re1) | "^" `isPrefixOf` re = ([], tail re) | "\\<" `isPrefixOf` re = ([parseAW], drop 2 re) | otherwise = ([mkStar mkDot], re) (endContext, re2) | "$" `isSuffixOf` re1 = ([], init re1) | "\\>" `isSuffixOf` re1 = ([parseWA], init . init $ re1) | otherwise = ([mkStar mkDot], re1) -- ------------------------------------------------------------ regExpExt :: StringLike s => Parser (GenRegex s) regExpExt = branchList orElseList regExpStd :: StringLike s => Parser (GenRegex s) regExpStd = branchList seqListStd branchList :: StringLike s => Parser (GenRegex s) -> Parser (GenRegex s) branchList exParser = do r1 <- exParser rs <- many branchList1 return (foldr1 mkAlt $ r1:rs) -- union is associative, so we use right ass. -- as with seq, alt and exor where branchList1 = do _ <- char '|' exParser orElseList :: StringLike s => Parser (GenRegex s) orElseList = do r1 <- interleaveList rs <- many orElseList1 return (foldr1 mkElse $ r1:rs) -- orElse is associative, so we choose right ass. -- as with seq and alt ops where orElseList1 = do _ <- try (string "{|}") interleaveList interleaveList :: StringLike s => Parser (GenRegex s) interleaveList = do r1 <- exorList rs <- many interleaveList1 return (foldr1 mkInterleave $ r1:rs) -- interleave is associative, so we choose right ass. -- as with seq and alt ops where interleaveList1 = do _ <- try (string "{:}") exorList exorList :: StringLike s => Parser (GenRegex s) exorList = do r1 <- diffList rs <- many exorList1 return (foldr1 mkExor $ r1:rs) -- exor is associative, so we choose right ass. where exorList1 = do _ <- try (string "{^}") diffList diffList :: StringLike s => Parser (GenRegex s) diffList = do r1 <- intersectList rs <- many diffList1 return (foldl1 mkDiff $ r1:rs) -- diff is not associative, so we choose left ass. where diffList1 = do _ <- try (string "{\\}") intersectList intersectList :: StringLike s => Parser (GenRegex s) intersectList = do r1 <- seqListExt rs <- many intersectList1 return (foldr1 mkIsect $ r1:rs) where intersectList1 = do _ <- try (string "{&}") seqListExt seqListExt :: StringLike s => Parser (GenRegex s) seqListExt = seqList' regExpLabel multiCharEscExt seqListStd :: StringLike s => Parser (GenRegex s) seqListStd = seqList' regExpStd multiCharEsc seqList' :: StringLike s => Parser (GenRegex s) -> Parser (GenRegex s) -> Parser (GenRegex s) seqList' regExp' multiCharEsc' = do rs <- many piece return $ mkSeqs rs where -- piece :: StringLike s => Parser (GenRegex s) piece = do r <- atom quantifier r -- atom :: StringLike s => Parser (GenRegex s) atom = char1 <|> charClass <|> between (char '(') (char ')') regExp' -- charClass :: StringLike s => Parser (GenRegex s) charClass = charClassEsc multiCharEsc' <|> charClassExpr multiCharEsc' <|> wildCardEsc quantifier :: StringLike s => GenRegex s -> Parser (GenRegex s) quantifier r = ( do _ <- char '?' return $ mkOpt r ) <|> ( do _ <- char '*' return $ mkStar r ) <|> ( do _ <- char '+' return $ mkRep 1 r ) <|> try ( do _ <- char '{' res <- quantity r _ <- char '}' return res ) <|> ( return r ) quantity :: StringLike s => GenRegex s -> Parser (GenRegex s) quantity r = do lb <- many1 digit quantityRest r (read lb) quantityRest :: StringLike s => GenRegex s -> Int -> Parser (GenRegex s) quantityRest r lb = ( do _ <- char ',' ub <- many digit return ( if null ub then mkRep lb r else mkRng lb (read ub) r ) ) <|> ( return $ mkRng lb lb r) regExpLabel :: StringLike s => Parser (GenRegex s) regExpLabel = do lab <- option id (between (char '{') (char '}') label') r <- regExpExt return $ lab r where label' = do l <- many1 (satisfy isXmlNameChar) return $ mkBr' l char1 :: StringLike s => Parser (GenRegex s) char1 = do c <- satisfy (`notElem` ".\\?*+{}()|[]") return $ mkSym1 c charClassEsc :: StringLike s => Parser (GenRegex s) -> Parser (GenRegex s) charClassEsc multiCharEsc' = do _ <- char '\\' ( singleCharEsc <|> multiCharEsc' <|> catEsc <|> complEsc ) singleCharEsc :: StringLike s => Parser (GenRegex s) singleCharEsc = do c <- singleCharEsc' return $ mkSym1 c singleCharEsc' :: Parser Char singleCharEsc' = do c <- satisfy (`elem` "nrt\\|.?*+(){}-[]^") return $ maybe c id . lookup c . zip "ntr" $ "\n\r\t" multiCharEscExt :: StringLike s => Parser (GenRegex s) multiCharEscExt = multiCharEsc <|> ( do -- extension: \a represents the whole alphabet inclusive newline chars: \a == .|\n|\r _ <- char 'a' return mkDot ) <|> ( do -- extension: \A represents all words: \A == \a* or \A == (.|\n|\r)* _ <- char 'A' return mkAll ) multiCharEsc :: StringLike s => Parser (GenRegex s) multiCharEsc = ( do c <- satisfy (`elem` es) return $ mkSym . fromJust . lookup c $ pm ) where es = map fst pm pm = [ ('s', charPropXmlSpaceChar ) , ('S', compCS charPropXmlSpaceChar ) , ('i', charPropXmlNameStartChar ) , ('I', compCS charPropXmlNameStartChar ) , ('c', charPropXmlNameChar ) , ('C', compCS charPropXmlNameChar ) , ('d', charPropDigit ) , ('D', compCS charPropDigit ) , ('w', compCS charPropNotWord ) , ('W', charPropNotWord ) ] charPropDigit = rangeCS '0' '9' charPropNotWord = charPropUnicodeP `unionCS` charPropUnicodeZ `unionCS` charPropUnicodeC catEsc :: StringLike s => Parser (GenRegex s) catEsc = do _ <- char 'p' s <- between (char '{') (char '}') charProp return $ mkSym s charProp :: Parser CharSet charProp = isCategory <|> isBlock isBlock :: Parser CharSet isBlock = do _ <- string "Is" name <- many1 (satisfy legalChar) case lookup name codeBlocks of Just b -> return $ uncurry rangeCS b Nothing -> fail $ "unknown Unicode code block " ++ show name where legalChar c = 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9' || '-' == c isCategory :: Parser CharSet isCategory = do pr <- isCategory' return $ fromJust (lookup pr categories) categories :: [(String, CharSet)] categories = [ ("C", charPropUnicodeC ) , ("Cc", charPropUnicodeCc) , ("Cf", charPropUnicodeCf) , ("Co", charPropUnicodeCo) , ("Cs", charPropUnicodeCs) , ("L", charPropUnicodeL ) , ("Ll", charPropUnicodeLl) , ("Lm", charPropUnicodeLm) , ("Lo", charPropUnicodeLo) , ("Lt", charPropUnicodeLt) , ("Lu", charPropUnicodeLu) , ("M", charPropUnicodeM ) , ("Mc", charPropUnicodeMc) , ("Me", charPropUnicodeMe) , ("Mn", charPropUnicodeMn) , ("N", charPropUnicodeN ) , ("Nd", charPropUnicodeNd) , ("Nl", charPropUnicodeNl) , ("No", charPropUnicodeNo) , ("P", charPropUnicodeP ) , ("Pc", charPropUnicodePc) , ("Pd", charPropUnicodePd) , ("Pe", charPropUnicodePe) , ("Pf", charPropUnicodePf) , ("Pi", charPropUnicodePi) , ("Po", charPropUnicodePo) , ("Ps", charPropUnicodePs) , ("S", charPropUnicodeS ) , ("Sc", charPropUnicodeSc) , ("Sk", charPropUnicodeSk) , ("Sm", charPropUnicodeSm) , ("So", charPropUnicodeSo) , ("Z", charPropUnicodeZ ) , ("Zl", charPropUnicodeZl) , ("Zp", charPropUnicodeZp) , ("Zs", charPropUnicodeZs) ] isCategory' :: Parser String isCategory' = ( foldr1 (<|>) . map (uncurry prop) $ [ ('L', "ultmo") , ('M', "nce") , ('N', "dlo") , ('P', "cdseifo") , ('Z', "slp") , ('S', "mcko") , ('C', "cfon") ] ) "illegal Unicode character property" where prop c1 cs2 = do _ <- char c1 s2 <- option "" ( do c2 <- satisfy (`elem` cs2) return [c2] ) return $ c1:s2 complEsc :: StringLike s => Parser (GenRegex s) complEsc = do _ <- char 'P' s <- between (char '{') (char '}') charProp return $ mkSym $ compCS s charClassExpr :: StringLike s => Parser (GenRegex s) -> Parser (GenRegex s) charClassExpr multiCharEsc' = between (char '[') (char ']') charGroup where -- charGroup :: StringLike s => Parser (GenRegex s) charGroup = do r <- ( negCharGroup -- a ^ at beginning denotes negation, not start of posCharGroup <|> posCharGroup ) s <- option (mkZero' "") -- charClassSub ( do _ <- char '-' charClassExpr multiCharEsc' ) return $ mkDiff r s -- posCharGroup :: StringLike s => Parser (GenRegex s) posCharGroup = do rs <- many1 (charRange <|> charClassEsc multiCharEsc') return $ foldr1 mkAlt rs -- negCharGroup :: StringLike s => Parser (GenRegex s) negCharGroup = do _ <- char '^' r <- posCharGroup return $ mkDiff mkDot r charRange :: StringLike s => Parser (GenRegex s) charRange = try seRange <|> xmlCharIncDash seRange :: StringLike s => Parser (GenRegex s) seRange = do c1 <- charOrEsc' _ <- char '-' c2 <- charOrEsc' return $ mkSymRng c1 c2 charOrEsc' :: Parser Char charOrEsc' = ( do _ <- char '\\' singleCharEsc' ) <|> satisfy (`notElem` "\\-[]") xmlCharIncDash :: StringLike s => Parser (GenRegex s) xmlCharIncDash = try ( do -- dash is only allowed if not followed by a [, else charGroup differences do not parse correctly _ <- char '-' notFollowedBy (char '[') return $ mkSym1 '-' ) <|> ( do c <- satisfy (`notElem` "-\\[]") return $ mkSym1 c ) wildCardEsc :: StringLike s => Parser (GenRegex s) wildCardEsc = do _ <- char '.' return . mkSym . compCS $ stringCS "\n\r" -- ------------------------------------------------------------ hxt-regex-xmlschema-9.2.0.3/src/Text/Regex/XMLSchema/Generic/Regex.hs0000644000000000000000000007307712752557013023320 0ustar0000000000000000{-# LANGUAGE ViewPatterns #-} {-# LANGUAGE FlexibleInstances #-} -- ------------------------------------------------------------ {- | Copyright : Copyright (C) 2014 - Uwe Schmidt License : MIT Maintainer : Uwe Schmidt Stability : stable Portability: portable W3C XML Schema Regular Expression Matcher Grammar can be found under -} -- ------------------------------------------------------------ module Text.Regex.XMLSchema.Generic.Regex ( GenRegex , mkZero , mkZero' , mkUnit , mkSym , mkSym1 , mkSymRng , mkWord , mkDot , mkStar , mkAll , mkAlt , mkElse , mkSeq , mkSeqs , mkRep , mkRng , mkOpt , mkDiff , mkIsect , mkExor , mkInterleave , mkCompl , mkBr , mkBr' , isZero , errRegex , nullable , nullable' , delta1 , delta , firstChars , matchWithRegex , matchWithRegex' , splitWithRegex , splitWithRegex' , splitWithRegexCS , splitWithRegexCS' ) where import Data.List (intercalate) import Data.Monoid ((<>)) import Data.Set.CharSet import Data.String (IsString(..)) import Text.Regex.XMLSchema.Generic.StringLike {- import Debug.Trace (traceShow) trc :: Show a => String -> a -> a trc msg x = traceShow (msg, x) x -- -} -- ------------------------------------------------------------ data GenRegex s = Zero s | Unit | Sym CharSet | Dot | Star (GenRegex s) | Alt (GenRegex s) (GenRegex s) | Else (GenRegex s) (GenRegex s) | Seq (GenRegex s) (GenRegex s) | Rep Int (GenRegex s) -- 1 or more repetitions | Rng Int Int (GenRegex s) -- n..m repetitions | Diff (GenRegex s) (GenRegex s) -- r1 - r2 | Isec (GenRegex s) (GenRegex s) -- r1 n r2 | Exor (GenRegex s) (GenRegex s) -- r1 xor r2 | Intl (GenRegex s) (GenRegex s) -- r1 interleavedWith r2 | Br (Label s) (GenRegex s) -- (...) not yet parsed | Obr (Label s) s !Int (GenRegex s) -- currently parsed (...) | Cbr [(Label s, s)] (GenRegex s) -- already completely parsed (...) deriving (Eq, Ord {-, Show -}) type Label s = Maybe s -- we need one special label for the whole expression -- see splitWithRegex type SubexResults s = [(Label s, s)] type Nullable s = (Bool, SubexResults s) -- ------------------------------------------------------------ {- just for documentation class Inv a where inv :: a -> Bool instance Inv (GenRegex s) where inv (Zero _) = True inv Unit = True inv (Sym p) = not (nulCS p) && not (fullCS p) inv Dot = True inv (Star e) = inv e inv (Alt e1 e2) = inv e1 && inv e2 inv (Seq e1 e2) = inv e1 && inv e2 inv (Rep i e) = i > 0 && inv e inv (Rng i j e) = (i < j || (i == j && i > 1)) && inv e inv (Diff e1 e2) = inv e1 && inv e2 inv (Isec e1 e2) = inv e1 && inv e2 inv (Exor e1 e2) = inv e1 && inv e2 -} -- ------------------------------------------------------------ -- -- smart constructors -- | construct the r.e. for the empty set. -- An (error-) message may be attached mkZero :: s -> GenRegex s mkZero = Zero {-# INLINE mkZero #-} mkZero' :: (StringLike s) => String -> GenRegex s mkZero' = Zero . fromString {-# INLINE mkZero' #-} -- | construct the r.e. for the set containing the empty word mkUnit :: GenRegex s mkUnit = Unit {-# INLINE mkUnit #-} -- | construct the r.e. for a set of chars mkSym :: (StringLike s) => CharSet -> GenRegex s mkSym s | nullCS s = mkZero' "empty char range" | fullCS s = mkDot | otherwise = Sym s {-# INLINE mkSym #-} -- | construct an r.e. for a single char set mkSym1 :: (StringLike s) => Char -> GenRegex s mkSym1 = mkSym . singleCS {-# INLINE mkSym1 #-} -- | construct an r.e. for an intervall of chars mkSymRng :: (StringLike s) => Char -> Char -> GenRegex s mkSymRng c1 c2 = mkSym $ rangeCS c1 c2 {-# INLINE mkSymRng #-} -- | mkSym generaized for strings mkWord :: (StringLike s) => [Char] -> GenRegex s mkWord = mkSeqs . map mkSym1 -- | construct an r.e. for the set of all Unicode chars mkDot :: GenRegex s mkDot = Dot {-# INLINE mkDot #-} -- | construct an r.e. for the set of all Unicode words mkAll :: (StringLike s) => GenRegex s mkAll = mkStar mkDot {-# INLINE mkAll #-} -- | construct r.e. for r* mkStar :: (StringLike s) => GenRegex s -> GenRegex s mkStar (Zero _) = mkUnit -- {}* == () mkStar e@Unit = e -- ()* == () mkStar e@(Star _e1) = e -- (r*)* == r* mkStar (Rep 1 e1) = mkStar e1 -- (r+)* == r* mkStar (Rep i e1) | i == 1 || nullable e1 = mkStar e1 -- (r{i,})* == r* when i == 1 or nullable r mkStar e@(Rng _ _ e1) | nullable e = mkStar e1 -- (r{i,j})* == r* when i == 0 or nullable r mkStar e@(Alt _ _) = Star (rmStar e) -- (a*|b)* == (a|b)* {- this is wrong, not generally applicable mkStar (Br l r s) = mkBr0 l (mkStar r) s -- ({l}r)* == ({l}r*) because we want the longest match as result for the subexpression -} mkStar e = Star e rmStar :: (StringLike s) => GenRegex s -> GenRegex s rmStar (Alt e1 e2) = mkAlt (rmStar e1) (rmStar e2) rmStar (Star e1) = rmStar e1 rmStar (Rep 1 e1) = rmStar e1 rmStar e1 = e1 -- | construct the r.e for r1|r2 mkAlt :: (StringLike s) => GenRegex s -> GenRegex s -> GenRegex s mkAlt e1 (Zero _) = e1 -- e1 u {} = e1 mkAlt (Zero _) e2 = e2 -- {} u e2 = e2 mkAlt (Sym p1) (Sym p2) = mkSym $ p1 `unionCS` p2 -- melting of predicates mkAlt e1 e2@(Sym _) = mkAlt e2 e1 -- symmetry: predicates always first mkAlt e1@(Sym _) (Alt e2@(Sym _) e3) = mkAlt (mkAlt e1 e2) e3 -- prepare melting of predicates mkAlt (Sym _) e2@Dot = e2 -- c|. = . for a c's mkAlt e1@(Star Dot) _e2 = e1 -- A* u e1 = A* mkAlt _e1 e2@(Star Dot) = e2 -- e1 u A* = A* mkAlt (Alt e1 e2) e3 = mkAlt e1 (mkAlt e2 e3) -- associativity mkAlt e1 e2 | e1 == e2 = e1 | otherwise = Alt e1 e2 -- | construct the r.e. for r1{|}r2 (r1 orElse r2). -- -- This represents the same r.e. as r1|r2, but when -- collecting the results of subexpressions in (...) and r1 succeeds, the -- subexpressions of r2 are discarded, so r1 matches are prioritized -- -- example -- -- > splitSubex "({1}x)|({2}.)" "x" = ([("1","x"),("2","x")], "") -- > -- > splitSubex "({1}x){|}({2}.)" "x" = ([("1","x")], "") mkElse :: (StringLike s) => GenRegex s -> GenRegex s -> GenRegex s mkElse e1 (Zero _) = e1 -- e1 u {} = e1 mkElse (Zero _) e2 = e2 -- {} u e2 = e2 mkElse (Sym p1) (Sym p2) = mkSym $ p1 `unionCS` p2 -- melting of predicates -- no symmetry allowed mkElse e1@(Sym _) (Else e2@(Sym _) e3) = mkElse (mkElse e1 e2) e3 -- prepare melting of predicates mkElse (Sym _) e2@Dot = e2 -- c|. = . for a c's mkElse e1@(Star Dot) _e2 = e1 -- A* u e1 = A* mkElse _e1 e2@(Star Dot) = e2 -- e1 u A* = A* mkElse (Else e1 e2) e3 = mkElse e1 (mkElse e2 e3) -- associativity mkElse e1 e2 | e1 == e2 = e1 | otherwise = Else e1 e2 -- | Construct the sequence r.e. r1.r2 mkSeq :: GenRegex s -> GenRegex s -> GenRegex s mkSeq e1@(Zero _) _e2 = e1 mkSeq _e1 e2@(Zero _) = e2 mkSeq Unit e2 = e2 mkSeq (Cbr ss1 e1) e2 = mkCbr ss1 (mkSeq e1 e2) -- move finished submatches upwards mkSeq e1 Unit = e1 mkSeq (Seq e1 e2) e3 = mkSeq e1 (mkSeq e2 e3) mkSeq e1 e2 = Seq e1 e2 -- | mkSeq extened to lists mkSeqs :: [GenRegex s] -> GenRegex s mkSeqs = foldr mkSeq mkUnit -- | Construct repetition r{i,} mkRep :: (StringLike s) => Int -> GenRegex s -> GenRegex s mkRep 0 e = mkStar e mkRep _ e@(Zero _) = e mkRep _ e | nullable e = mkStar e mkRep i (Rep j e) = mkRep (i * j) e mkRep i e = Rep i e -- | Construct range r{i,j} mkRng :: (StringLike s) => Int -> Int -> GenRegex s -> GenRegex s mkRng 0 0 _e = mkUnit mkRng 1 1 e = e mkRng lb ub _e | lb > ub = mkZero' $ "illegal range " ++ show lb ++ ".." ++ show ub mkRng _l _u e@(Zero _) = e mkRng _l _u e@Unit = e mkRng lb ub e = Rng lb ub e -- | Construct option r? mkOpt :: (StringLike s) => GenRegex s -> GenRegex s mkOpt = mkRng 0 1 {-# INLINE mkOpt #-} -- | Construct difference r.e.: r1 {\\} r2 -- -- example -- -- > match "[a-z]+{\\}bush" "obama" = True -- > match "[a-z]+{\\}bush" "clinton" = True -- > match "[a-z]+{\\}bush" "bush" = False -- not important any more mkDiff :: (StringLike s) => GenRegex s -> GenRegex s -> GenRegex s mkDiff e1@(Zero _) _e2 = e1 -- {} - r2 = {} mkDiff e1 (Zero _) = e1 -- r1 - {} = r1 mkDiff _e1 (Star Dot) = mkZero' "empty set in difference expr" -- r1 - .* = {} mkDiff Dot (Sym p) = mkSym $ compCS p -- . - s = ~s mkDiff (Sym _) Dot = mkZero' "empty set in difference expr" -- x - . = {} mkDiff (Sym p1) (Sym p2) = mkSym $ p1 `diffCS` p2 -- set diff mkDiff e1 e2 | e1 == e2 = mkZero' "empty set in difference expr" -- r1 - r1 = {} | otherwise = Diff e1 e2 -- | Construct the Complement of an r.e.: whole set of words - r mkCompl :: (StringLike s) => GenRegex s -> GenRegex s mkCompl (Zero _) = mkAll mkCompl (Star Dot) = mkZero' "empty set in compl expr" mkCompl e = mkDiff (mkStar mkDot) e -- | Construct r.e. for intersection: r1 {&} r2 -- -- example -- -- > match ".*a.*{&}.*b.*" "-a-b-" = True -- > match ".*a.*{&}.*b.*" "-b-a-" = True -- > match ".*a.*{&}.*b.*" "-a-a-" = False -- > match ".*a.*{&}.*b.*" "---b-" = False mkIsect :: (StringLike s) => GenRegex s -> GenRegex s -> GenRegex s mkIsect e1@(Zero _) _e2 = e1 -- {} n r2 = {} mkIsect _e1 e2@(Zero _) = e2 -- r1 n {} = {} mkIsect e1@(Unit) e2 -- () n r2 = () if nullable r2 | nullable e2 = e1 -- () n r2 = {} if not nullable r2 | otherwise = mkZero' "intersection empty" mkIsect e1 e2@(Unit) = mkIsect e2 e1 -- symmetric version of las 2 laws mkIsect (Sym p1) (Sym p2) = mkSym $ p1 `intersectCS` p2 -- intersect sets mkIsect e1@(Sym _) Dot = e1 -- x n . = x mkIsect Dot e2@(Sym _) = e2 -- . n x = x mkIsect (Star Dot) e2 = e2 -- .* n r2 = r2 mkIsect e1 (Star Dot) = e1 -- r1 n .* = r1 mkIsect e1 e2 | e1 == e2 = e1 -- r1 n r1 = r1 | otherwise = Isec e1 e2 -- | Construct r.e. for exclusive or: r1 {^} r2 -- -- example -- -- > match "[a-c]+{^}[c-d]+" "abc" = True -- > match "[a-c]+{^}[c-d]+" "acdc" = False -- > match "[a-c]+{^}[c-d]+" "ccc" = False -- > match "[a-c]+{^}[c-d]+" "cdc" = True mkExor :: (StringLike s) => GenRegex s -> GenRegex s -> GenRegex s mkExor (Zero _) e2 = e2 mkExor e1 (Zero _) = e1 mkExor (Star Dot) _e2 = mkZero' "empty set in exor expr" mkExor _e1 (Star Dot) = mkZero' "empty set in exor expr" mkExor (Sym p1) (Sym p2) = mkSym $ p1 `exorCS` p2 mkExor (Sym p1) Dot = mkSym $ compCS p1 mkExor Dot (Sym p2) = mkSym $ compCS p2 mkExor e1 e2 | e1 == e2 = mkZero' "empty set in exor expr" -- r1 xor r1 = {} | otherwise = Exor e1 e2 mkInterleave :: GenRegex s -> GenRegex s -> GenRegex s mkInterleave e1@(Zero _) _ = e1 mkInterleave _ e2@(Zero _) = e2 mkInterleave (Unit) e2 = e2 mkInterleave e1 (Unit) = e1 mkInterleave e1 e2 = Intl e1 e2 -- | Construct a labeled subexpression: ({label}r) mkBr :: s -> GenRegex s -> GenRegex s mkBr l e = Br (Just l) e mkBr' :: StringLike s => String -> GenRegex s -> GenRegex s mkBr' l e = Br (Just $ fromString l) e mkBrN :: GenRegex s -> GenRegex s mkBrN e = Br Nothing e mkObr :: StringLike s => Label s -> s -> Int -> GenRegex s -> GenRegex s mkObr _ _ _ e@(Zero _) = e mkObr l s n Unit = mkCbr [(l, takeS n s)] mkUnit mkObr l s n e = Obr l s n e mkCbr :: SubexResults s -> GenRegex s -> GenRegex s mkCbr _ e@(Zero _) = e -- dead end, throw away subexpr matches mkCbr ss (Cbr ss1 e) = mkCbr (ss <> ss1) e -- join inner and this subexpr match mkCbr ss e = Cbr ss e -- ------------------------------------------------------------ instance (StringLike s) => Show (GenRegex s) where show (Zero e) = "{" ++ toString e ++ "}" show Unit = "()" show (Sym p) | p == compCS (stringCS "\n\r") = "." | null (tail cs) && rng1 (head cs) = escRng . head $ cs | otherwise = "[" ++ concat cs' ++ "]" where rng1 (x,y) = x == y cs = p -- charRngs . chars $ p cs' = map escRng p escRng (x, y) | x == y = esc x | succ x == y = esc x ++ esc y | otherwise = esc x ++ "-" ++ esc y esc x | x `elem` "\\-[]{}()*+?.^" = '\\':x:"" | x >= ' ' && x <= '~' = x:"" | otherwise = "&#" ++ show (fromEnum x) ++ ";" show Dot = "\\a" show (Star Dot) = "\\A" show (Star e) = "(" ++ show e ++ "*)" show (Alt e1 e2) = "(" ++ show e1 ++ "|" ++ show e2 ++ ")" show (Else e1 e2) = "(" ++ show e1 ++ "{|}" ++ show e2 ++ ")" show (Seq e1 e2) = "(" ++ show e1 ++ show e2 ++ ")" show (Rep 1 e) = "(" ++ show e ++ "+)" show (Rep i e) = "(" ++ show e ++ "{" ++ show i ++ ",})" show (Rng 0 1 e) = "(" ++ show e ++ "?)" show (Rng i j e) = "(" ++ show e ++ "{" ++ show i ++ "," ++ show j ++ "})" show (Diff e1 e2) = "(" ++ show e1 ++ "{\\}" ++ show e2 ++ ")" show (Isec e1 e2) = "(" ++ show e1 ++ "{&}" ++ show e2 ++ ")" show (Exor e1 e2) = "(" ++ show e1 ++ "{^}" ++ show e2 ++ ")" show (Intl e1 e2) = "(" ++ show e1 ++ "{:}" ++ show e2 ++ ")" show (Br l e) = "({" ++ showL l ++ "}" ++ show e ++ ")" show (Obr l s n e) = "({" ++ showL l ++ "=" ++ toString (takeS n s) ++ "}" ++ show e ++ ")" show (Cbr ss e) = "([" ++ intercalate "," (map (\ (l, s) -> showL l ++ "=" ++ toString s) ss) ++ "]" ++ show e ++ ")" showL :: Show s => Label s -> String showL = rmq . maybe "" show where rmq ('\"':xs) = init xs rmq xs = xs -- ------------------------------------------------------------ isZero :: GenRegex s -> Bool isZero (Zero _) = True isZero _ = False {-# INLINE isZero #-} errRegex :: (StringLike s) => GenRegex s -> s errRegex (Zero e) = e errRegex _ = emptyS -- ------------------------------------------------------------ nullable :: (StringLike s) => GenRegex s -> Bool nullable = fst . nullable' {-# INLINE nullable #-} nullable' :: (StringLike s) => GenRegex s -> Nullable s nullable' (Zero _) = (False, []) nullable' Unit = (True, []) nullable' Dot = (False, []) nullable' (Sym _x) = (False, []) nullable' (Star _e) = (True, []) nullable' (Rep _i e) = nullable' e nullable' (Rng i _ e) = (i == 0, []) `unionN` nullable' e nullable' (Seq e1 e2) = nullable' e1 `isectN` nullable' e2 nullable' (Alt e1 e2) = nullable' e1 `unionN` nullable' e2 nullable' (Else e1 e2) = nullable' e1 `orElseN` nullable' e2 nullable' (Isec e1 e2) = nullable' e1 `isectN` nullable' e2 nullable' (Diff e1 e2) = nullable' e1 `diffN` nullable' e2 nullable' (Exor e1 e2) = nullable' e1 `exorN` nullable' e2 nullable' (Intl e1 e2) = nullable' e1 `isectN` nullable' e2 nullable' (Br l e) = (True, [(l, emptyS )]) `isectN` nullable' e nullable' (Obr l s n e) = (True, [(l, takeS n s)]) `isectN` nullable' e nullable' (Cbr ss e) = (True, ss) `isectN` nullable' e isectN :: Nullable s -> Nullable s -> Nullable s isectN (True, ws1) (True, ws2) = (True, ws1 ++ ws2) isectN _ _ = (False, []) unionN :: Nullable s -> Nullable s -> Nullable s unionN (False, _) (False, _) = (False, []) unionN (_, ws1) (_, ws2) = (True, ws1 ++ ws2) orElseN :: Nullable s -> Nullable s -> Nullable s orElseN e1@(True, _ws1) _ = e1 orElseN _ e2 = e2 diffN :: Nullable s -> Nullable s -> Nullable s diffN n1 (False, _) = n1 diffN _ _ = (False, []) exorN :: Nullable s -> Nullable s -> Nullable s exorN n1@(True, _) (False, _) = n1 exorN (False, _) n2@(True, _) = n2 exorN _ _ = (False, []) -- ------------------------------------------------------------ -- | FIRST for regular expressions -- -- this is only an approximation, the real set of char may be smaller, -- when the expression contains intersection, set difference or exor operators firstChars :: (StringLike s) => GenRegex s -> CharSet firstChars (Zero _) = emptyCS firstChars Unit = emptyCS firstChars (Sym p) = p firstChars Dot = allCS firstChars (Star e1) = firstChars e1 firstChars (Alt e1 e2) = firstChars e1 `unionCS` firstChars e2 firstChars (Else e1 e2) = firstChars e1 `unionCS` firstChars e2 firstChars (Seq e1 e2) | nullable e1 = firstChars e1 `unionCS` firstChars e2 | otherwise = firstChars e1 firstChars (Rep _i e) = firstChars e firstChars (Rng _i _j e) = firstChars e firstChars (Diff e1 _e2) = firstChars e1 -- this is an approximation firstChars (Isec e1 e2) = firstChars e1 `intersectCS` firstChars e2 -- this is an approximation firstChars (Exor e1 e2) = firstChars e1 `unionCS` firstChars e2 -- this is an approximation firstChars (Intl e1 e2) = firstChars e1 `unionCS` firstChars e2 firstChars (Br _l e) = firstChars e firstChars (Obr _l _s _n e) = firstChars e firstChars (Cbr _ss e) = firstChars e -- ------------------------------------------------------------ delta1 :: (StringLike s) => Char -> s -> GenRegex s -> GenRegex s delta1 c inp e0 = d' e0 where d' e@(Zero _) = e d' Unit = mkZero' $ "unexpected char " ++ show c d' (Sym p) | c `elemCS` p = mkUnit | otherwise = mkZero' $ "unexpected char " ++ show c d' Dot = mkUnit d' e@(Star Dot) = e d' e@(Star e1) = mkSeq (d' e1) e d' (Alt e1 e2) = mkAlt (d' e1) (d' e2) d' (Else e1 e2) = mkElse (d' e1) (d' e2) d' (Seq e1@(Obr l s n e1') e2) | nu = mkAlt (mkSeq (d' e1) e2) (mkCbr ((l, takeS n s) : ws) (d' e2)) where (nu, ws) = nullable' e1' d' (Seq e1 e2) | nullable e1 = mkAlt (mkSeq (d' e1) e2) (d' e2) | otherwise = mkSeq (d' e1) e2 d' (Rep i e) = mkSeq (d' e) (mkRep (i-1) e) d' (Rng i j e) = mkSeq (d' e) (mkRng ((i-1) `max` 0) (j-1) e) d' (Diff e1 e2) = mkDiff (d' e1) (d' e2) d' (Isec e1 e2) = mkIsect (d' e1) (d' e2) d' (Exor e1 e2) = mkExor (d' e1) (d' e2) d' (Intl e1 e2) = mkAlt (mkInterleave (d' e1) e2 ) (mkInterleave e1 (d' e2)) d' (Br l e) = d' (mkObr l inp 0 e) -- a subex parse starts d' (Obr l s n e) = mkObr l s (n + 1) (d' e) -- a subex parse cont. d' (Cbr ss e) = mkCbr ss (d' e) -- the results of a subex parse -- ------------------------------------------------------------ delta :: (StringLike s) => s -> GenRegex s -> GenRegex s delta inp@(uncons -> Just (c, inp')) e0 = d' e0 where d' e@(Zero _) = e -- don't process whole input, parse has failed d' e@(Star Dot) = e -- don't process input, derivative does not change d' e = delta inp' ( -- trc "delta1=" $ delta1 c inp e) delta _empty e = e matchWithRegex :: (StringLike s) => GenRegex s -> s -> Bool matchWithRegex e s = nullable $ delta s e matchWithRegex' :: (StringLike s) => GenRegex s -> s -> Maybe (SubexResults s) matchWithRegex' e s = (\ (r, l) -> if r then Just l else Nothing) . nullable' $ delta s e -- ------------------------------------------------------------ -- | This function wraps the whole regex in a subexpression before starting -- the parse. This is done for getting access to -- the whole parsed string. Therfore we need one special label, this label -- is the Nothing value, all explicit labels are Just labels. splitWithRegex :: (StringLike s) => GenRegex s -> s -> Maybe (SubexResults s, s) splitWithRegex re inp = do (re', rest) <- splitWithRegex' (mkBrN re) inp return ( snd . nullable' $ re', rest) splitWithRegexCS :: (StringLike s) => GenRegex s -> CharSet -> s -> Maybe (SubexResults s, s) splitWithRegexCS re cs inp = do (re', rest) <- splitWithRegexCS' (mkBrN re) cs inp return ( snd . nullable' $ re', rest) -- ---------------------------------------- -- -- | The main scanner function {- linear recursive function, can lead to stack overflow splitWithRegex' :: Eq l => GenRegex s -> String -> Maybe (GenRegex s, String) splitWithRegex' re "" | nullable re = Just (re, "") | otherwise = Nothing splitWithRegex' re inp@(c : inp') | isZero re = Nothing | otherwise = evalRes . splitWithRegex' (delta1 re c) $ inp' where evalRes Nothing | nullable re = Just (re, inp) | otherwise = Nothing evalRes res = res -} -- tail recursive version of above function splitWithRegex' :: (StringLike s) => GenRegex s -> s -> Maybe (GenRegex s, s) splitWithRegex' re inp = splitWithRegex'' ( if nullable re then Just (re, inp) -- first possible result: empty prefix else Nothing -- empty prefix not a result ) re inp splitWithRegex'' :: (StringLike s) => Maybe (GenRegex s, s) -> GenRegex s -> s -> Maybe (GenRegex s, s) splitWithRegex'' lastRes re inp@(uncons -> Just (c, inp')) | isZero re = lastRes | otherwise = splitWithRegex'' nextRes re' $ inp' where re' = delta1 c inp re nextRes | nullable re' = Just (re', inp') | otherwise = lastRes splitWithRegex'' lastRes _re _empty = lastRes -- ---------------------------------------- -- -- | speedup version for splitWithRegex' -- -- This function checks whether the input starts with a char from FIRST re. -- If this is not the case, the split fails. The FIRST set can be computed once -- for a whole tokenizer and reused by every call of split splitWithRegexCS' :: (StringLike s) => GenRegex s -> CharSet -> s -> Maybe (GenRegex s, s) splitWithRegexCS' re cs inp@(uncons -> Just (c, _inp')) | c `elemCS` cs = splitWithRegex' re inp splitWithRegexCS' re _cs inp | nullable re = Just (re, inp) | otherwise = Nothing -- ------------------------------------------------------------ hxt-regex-xmlschema-9.2.0.3/src/Text/Regex/XMLSchema/Generic/StringLike.hs0000644000000000000000000001101312752557013024277 0ustar0000000000000000{-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE TypeSynonymInstances #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ViewPatterns #-} {-# OPTIONS_GHC -fno-warn-orphans #-} -- ------------------------------------------------------------ {- | Copyright : Copyright (C) 2014- Uwe Schmidt License : MIT Maintainer : Uwe Schmidt Stability : stable -} -- ------------------------------------------------------------ module Text.Regex.XMLSchema.Generic.StringLike where import Data.Maybe import Data.String (IsString(..)) import qualified Data.Text as T import qualified Data.Text.Lazy as TL import qualified Data.ByteString.Char8 as B import qualified Data.ByteString.Lazy.Char8 as BL -- ------------------------------------------------------------ -- | /WARNING/: This StringLike class is /not/ intended for use outside this regex library. -- It provides an abstraction for String's as used inside this library. -- It allows the library to work with String (list of Char), -- ByteString.Char8, ByteString.Lazy.Char8, -- Data.Text and Data.Text.Lazy. -- -- The class is similar to the StringLike class in the tagsoup package class (Eq a, IsString a, Show a) => StringLike a where emptyS :: a uncons :: a -> Maybe (Char, a) nullS :: a -> Bool headS :: a -> Char takeS :: Int -> a -> a dropS :: Int -> a -> a appendS :: a -> a -> a concatS :: [a] -> a toString :: a -> String nullS = isNothing . uncons headS (uncons -> Just (c, _)) = c headS _ = error "headS: empty StringLike" concatS = foldl appendS emptyS {-# INLINE nullS #-} {-# INLINE headS #-} {-# INLINE concatS #-} -- ------------------------------------------------------------ instance StringLike String where emptyS = [] uncons (x : xs) = Just (x, xs) uncons "" = Nothing nullS = null headS = head takeS = take dropS = drop appendS = (++) concatS = concat toString = id {-# INLINE emptyS #-} {-# INLINE uncons #-} {-# INLINE nullS #-} {-# INLINE takeS #-} {-# INLINE dropS #-} {-# INLINE appendS #-} {-# INLINE concatS #-} {-# INLINE toString #-} -- ------------------------------------------------------------ instance StringLike T.Text where emptyS = T.empty uncons = T.uncons nullS = T.null headS = T.head takeS = T.take dropS = T.drop appendS = T.append concatS = T.concat toString = T.unpack {-# INLINE emptyS #-} {-# INLINE uncons #-} {-# INLINE nullS #-} {-# INLINE takeS #-} {-# INLINE dropS #-} {-# INLINE appendS #-} {-# INLINE concatS #-} {-# INLINE toString #-} -- ------------------------------------------------------------ instance StringLike TL.Text where emptyS = TL.empty uncons = TL.uncons nullS = TL.null headS = TL.head takeS = TL.take . toEnum dropS = TL.drop . toEnum appendS = TL.append concatS = TL.concat toString = TL.unpack {-# INLINE emptyS #-} {-# INLINE uncons #-} {-# INLINE nullS #-} {-# INLINE takeS #-} {-# INLINE dropS #-} {-# INLINE appendS #-} {-# INLINE concatS #-} {-# INLINE toString #-} -- ------------------------------------------------------------ instance StringLike B.ByteString where emptyS = B.empty uncons = B.uncons nullS = B.null headS = B.head takeS = B.take dropS = B.drop appendS = B.append concatS = B.concat toString = B.unpack {-# INLINE emptyS #-} {-# INLINE uncons #-} {-# INLINE nullS #-} {-# INLINE takeS #-} {-# INLINE dropS #-} {-# INLINE appendS #-} {-# INLINE concatS #-} {-# INLINE toString #-} -- ------------------------------------------------------------ instance StringLike BL.ByteString where emptyS = BL.empty uncons = BL.uncons nullS = BL.null headS = BL.head takeS = BL.take . toEnum dropS = BL.drop . toEnum appendS = BL.append concatS = BL.concat toString = BL.unpack {-# INLINE emptyS #-} {-# INLINE uncons #-} {-# INLINE nullS #-} {-# INLINE takeS #-} {-# INLINE dropS #-} {-# INLINE appendS #-} {-# INLINE concatS #-} {-# INLINE toString #-} -- ------------------------------------------------------------ hxt-regex-xmlschema-9.2.0.3/test/Date.hs0000644000000000000000000005625612752557013016160 0ustar0000000000000000{-# LANGUAGE ExplicitForAll #-} {-# LANGUAGE Rank2Types #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ViewPatterns #-} module Main where import Control.Arrow ( (***), second ) import Data.Char ( toLower, toUpper ) import Data.List ( isPrefixOf ) import qualified Data.ByteString.Char8 as B import qualified Data.ByteString.Lazy.Char8 as BL import Data.String (IsString (..)) import qualified Data.Text as T import qualified Data.Text.Lazy as TL import System.Exit (ExitCode (..), exitWith) import Test.HUnit import Text.Parsec import Text.Regex.XMLSchema.Generic import Text.Regex.XMLSchema.Generic.StringLike -- ------------------------------------------------------------ newtype Test' a = Test' {unTest' :: Test} type BS = B.ByteString type BL = BL.ByteString type Text = T.Text type TextL = TL.Text -- ------------------------------------------------------------ -- some little helpers for building r.e.s star :: String -> String star = (++ "*") . pars plus :: String -> String plus = (++ "+") . pars opt :: String -> String opt = (++ "?") . pars dot :: String -> String dot = (++ "\\.") pars :: String -> String pars = ("(" ++) . (++ ")") orr :: String -> String -> String orr x y = pars $ pars x ++ "|" ++ pars y xor :: String -> String -> String xor x y = pars $ pars x ++ "{|}" ++ pars y nocase :: String -> String nocase (x:xs) = '[' : toUpper x : toLower x : ']' : xs nocase [] = error "nocase with empty list" alt :: [String] -> String alt = pars . foldr1 orr altNC :: [String] -> String altNC = pars . alt . map nocase subex :: String -> String -> String subex n e = pars $ "{" ++ n ++ "}" ++ pars e ws :: String ws = "\\s" ws0 :: String ws0 = star ws ws1 :: String ws1 = plus ws s0 :: String -> String -> String s0 x y = x ++ ws0 ++ y -- the date and time r.e.s day :: String day = "(0?[1-9]|[12][0-9]|3[01])" month :: String month = "(0?[1-9]|1[0-2])" year2 :: String year2 = "[0-5][0-9]" year4 :: String year4 = "20" ++ year2 year :: String year = year4 `orr` year2 year' :: String year' = "'" ++ year2 dayD :: String dayD = dot day monthD :: String monthD = dot month dayMonthYear :: String dayMonthYear = dayD `s0` monthD `s0` year dayMonth :: String dayMonth = dayD `s0` monthD dayOfWeekL :: String dayOfWeekL = altNC [ "montag" , "dienstag" , "mittwoch" , "donnerstag" , "freitag" , "samstag" , "sonnabend" , "sonntag" ] dayOfWeekA :: String dayOfWeekA = alt . map dot $ [ "Mo", "Di", "Mi", "Do", "Fr", "Sa", "So"] dayOfWeek :: String dayOfWeek = dayOfWeekL `orr` dayOfWeekA monthL :: String monthL = altNC [ "januar" , "februar" , "märz" , "april" , "mai" , "juni" , "juli" , "august" , "september" , "oktober" , "november" , "dezember" ] monthA :: String monthA = altNC . map dot $ map snd monthAbr monthAbr :: [(Integer, String)] monthAbr = (9, "sept") : zip [1..12] [ "jan", "feb", "mär", "apr", "mai", "jun", "jul", "aug", "sep", "okt", "nov", "dez"] monthN :: String monthN = pars $ monthL `orr` monthA hour :: String hour = pars "([0-1]?[0-9])|(2[0-4])" minute :: String minute = pars "(0?[0-9])|([1-5][0-9])" uhr :: String uhr = ws0 ++ nocase "uhr" hourMin :: String hourMin = hour ++ ":" ++ minute ++ opt uhr wsyear :: String wsyear = year ++ "/[0-9]{2}" wsem :: String wsem = ("Wi?Se?" `orr` nocase "Wintersemester") ++ ws0 ++ wsyear ssem :: String ssem = ("So?Se?" `orr` nocase "Sommersemester") ++ ws0 ++ year sem :: String sem = wsem `orr` ssem num :: String num = "\\d+" -- the token types tokenRE :: String tokenRE = foldr1 xor $ map (uncurry subex) $ [ ( "ddmmyyyy", dayMonthYear ) , ( "ddMonthyyyy", dayD `s0` monthN `s0` (year `orr` year') ) , ( "ddmm", dayMonth) , ( "ddMonth", dayD `s0` monthN ) , ( "yyyymmdd", year ++ "[-/]" ++ month ++ "[-/]" ++ day ) , ( "yyyy", year4 `orr` ("'" ++ year2) ) , ( "month", monthN ) , ( "weekday", dayOfWeek ) , ( "HHMM", hourMin ++ opt uhr ) , ( "HH", hour ++ uhr ) , ( "wsem", wsem) , ( "ssem", ssem) , ( "word", "[\\w\\d]+") , ( "del", "[^\\w\\d]+") ] -- ------------------------------------------------------------ type Token = (String, String) type TokenStream = [Token] type DateParser a = Parsec [(String, String)] () a type StringFct = String -> String -- for fast concatenation -- must be extended for weekday or semester, if neccessay data DateVal = DT { _year :: ! Int , _month :: ! Int , _day :: ! Int , _hour :: ! Int , _min :: ! Int } deriving (Eq, Show) data DateParse = DP { _pre :: StringFct , _rep :: StringFct , _dat :: ! DateVal } -- just a helper for result output data DateRep = DR { _p :: String , _r :: String , _d :: ! DateVal } deriving (Eq, Show) -- ------------------------------------------------------------ emptyText :: StringFct emptyText = id mkText :: String -> StringFct mkText = (++) concText :: StringFct -> StringFct -> StringFct concText = (.) textToString :: StringFct -> String textToString = ($ []) emptyDateVal :: DateVal emptyDateVal = DT { _year = -1 , _month = -1 , _day = -1 , _hour = -1 , _min = -1 } emptyDateParse :: DateParse emptyDateParse = DP { _pre = emptyText , _rep = emptyText , _dat = emptyDateVal } appPre :: String -> DateParse -> DateParse appPre s d = d { _pre = _pre d `concText` mkText s } appRep :: String -> DateParse -> DateParse appRep s d = d { _rep = _rep d `concText` mkText s } setDay :: Int -> Int -> Int -> DateParse -> DateParse setDay j m t d = d { _dat = setDateVal j m t (-1) (-1) (_dat d) } setHour :: Int -> Int -> DateParse -> DateParse setHour h m d = d { _dat = setDateVal (-1) (-1) (-1) h m (_dat d) } setDateVal :: Int -> Int -> Int -> Int -> Int -> DateVal -> DateVal setDateVal j m t s i (DT j' m' t' s' i' ) = DT j'' m'' t'' s'' i'' where j'' | j < 0 = j' -- year not there | j < 100 = j + 2000 -- 2 digit year | otherwise = j -- 4 digit year m'' = m `max` m' t'' = t `max` t' s'' = s `max` s' i'' = i `max` i' datePToDateRep :: DateParse -> DateRep datePToDateRep dp = DR { _p = textToString $ _pre dp , _r = textToString $ _rep dp , _d = _dat dp } -- ------------------------------------------------------------ -- a simple helper for showing the results dateSearch' :: TokenStream -> [DateRep] dateSearch' = map datePToDateRep . dateSearch -- look for a sequence of date specs, the last entry in the list -- does not contain a valid date, but just the context behind the last real date dateSearch :: TokenStream -> [DateParse] dateSearch = either (const []) id . parse (many (dateParser emptyDateParse)) "" -- all date parsers thread a state the subparsers to accumulate -- the parts of a date, the context, the external representation and -- the pure data, year, month, day, ... dateParser :: DateParse -> DateParser DateParse dateParser d = ( do s <- fillTok dateParser0 (appPre s d) ) <|> parseDate d -- here is the hook for the real date parser <|> ( do s <- textTok -- the default case: if parseDate fails dateParser0 (appPre s d) -- the token is handled like a normal word ) dateParser0 :: DateParse -> DateParser DateParse dateParser0 d = dateParser d <|> return d parseDate :: DateParse -> DateParser DateParse parseDate d = parseDate0 d <|> try ( do d1 <- parseWeekDay d lookAheadN 3 parseDate0 d1 -- Freitag, den 13. ) -- parse a date optionally followed by a time parseDate0 :: DateParse -> DateParser DateParse parseDate0 d = ( do d1 <- parseDay d option d1 (parseFollowingHour d1) ) -- parse a simple token for a day parseDay :: DateParse -> DateParser DateParse parseDay d = ( do (s, d') <- parseDateTok "ddmmyyyy" d let [t, m, j] = tokenize num s return $ setDay (read j) (read m) (read t) d' ) <|> ( do (s, d') <- parseDateTok "ddMonthyyyy" d let s' = sed ((++ ".") . monthToM) monthN s let [t, m, j] = tokenize num s' return $ setDay (read j) (read m) (read t) d' ) <|> ( do (s, d') <- parseDateTok "ddmm" d let [t, m] = tokenize num s return $ setDay (-1) (read m) (read t) d' ) <|> ( do (s, d') <- parseDateTok "ddMonth" d let s' = sed ((++ ".") . monthToM) monthN s let [t, m] = tokenize num s' return $ setDay (-1) (read m) (read t) d' ) <|> ( do (s, d') <- parseDateTok "yyyymmdd" d let [j, m, t] = tokenize num s return $ setDay (read j) (read m) (read t) d' ) parseYear :: DateParse -> DateParser DateParse parseYear d = ( do (s, d') <- parseDateTok "yyyy" d let [j] = tokenize num s return $ setDay (read j) (-1) (-1) d' ) -- parse a weekday and add it to the external rep. parseWeekDay :: DateParse -> DateParser DateParse parseWeekDay d = ( do (_s, d') <- parseDateTok "weekday" d return d' ) -- parse a following hour spec, 5 fill tokens, words or delimiters are possible parseFollowingHour :: DateParse -> DateParser DateParse parseFollowingHour = try . -- backtracking becomes neccessary lookAheadN 5 parseHour -- max 2 words and 3 delimiters -- parse the simple time formats parseHour :: DateParse -> DateParser DateParse parseHour d = ( do (s, d') <- parseDateTok "HHMM" d let [h, m] = tokenize num s return $ setHour (read h) (read m) d' ) <|> ( do (s, d') <- parseDateTok "HH" d let [h] = tokenize num s return $ setHour (read h) 0 d' ) -- ------------------------------------------------------------ -- -- auxiliary parser combinators -- parse a token of a given type and add the text to the external rep. parseDateTok :: String -> DateParse -> DateParser (String, DateParse) parseDateTok tty d = dateTok (isTokType (== tty)) d dateTok :: DateParser String -> DateParse -> DateParser (String, DateParse) dateTok t d = ( do s <- t return (s, appRep s d) ) -- try to apply a parser, but first skip a given # of fill tokens lookAheadN :: Int -> (DateParse -> DateParser DateParse) -> DateParse -> DateParser DateParse lookAheadN n p d | n <= 0 = p d | otherwise = do (_, d1) <- dateTok fillTok d ( lookAheadN (n - 1) p d1 <|> p d1 ) -- ------------------------------------------------------------ -- -- basic token parsers -- the interface to the primitive parsec token parser tok :: (Token -> Bool) -> DateParser Token tok prd = tokenPrim showTok nextPos testTok where showTok = show . fst nextPos pos _tok _ts = incSourceColumn pos 1 testTok tk = if prd tk then Just tk else Nothing -- check for specific token type and in case of success return the text value isTokType :: (String -> Bool) -> DateParser String isTokType isT = tok (isT . fst) >>= return . snd -- parse an arbitrary token and return the text value textTok :: DateParser String textTok = isTokType (const True) -- a word wordTok :: DateParser String wordTok = isTokType (== "word") -- a delimiter, whitespace is normalized, sequences are reduced to a single space char delTok :: DateParser String delTok = isTokType (== "del") >>= return . sed (const " ") ws1 -- tokens that don't contain date info fillTok :: DateParser String fillTok = delTok <|> wordTok -- semester tokens, not yet interpreted semTok' :: String -> DateParser (String, Int, Bool) semTok' sem' = do v <- isTokType (== sem') return (v, read . head . tokenizeExt year $ v, sem' == "ssem") semTok :: DateParser (String, Int, Bool) semTok = semTok' "ssem" <|> semTok' "wsem" -- ------------------------------------------------------------ -- conversion from month names to 1..12 monthToM :: String -> String monthToM m = show . (\ l -> if null l then 99 else head l) . map fst . filter ((== True) . snd) . map (second (`isPrefixOf` map toLower m)) $ monthAbr -- ------------------------------------------------------------ ts :: String ts = "Am Sonntag, dem 17. Februar '03 findet um 9 Uhr ein wichtiger Termin für das Sommersemester 2000 statt. " ++ "Dieser wird allerdings auf Montag verschoben. Und zwar auf den ersten Montag im Wintersemester 11/12, 12:30. " ++ "Ein wichtiger Termin findet im SoSe 2011 statt. Im Jahr '12 gibt es Termine, aber auch in WS 2010/11. " ++ "Ein weiterer Termin ist am 2.4.11 um 12 Uhr. Oder war es doch Di. der 3.4.? Egal. " ++ "Ein weiterer wichtiger Termin findet am 2001-3-4 statt bzw. generell zwischen 01/3/4 - 01/6/4 um 13 Uhr. " ++ "Am kommenden Mittwoch findet Changemanagement in HS5 statt. Dies gilt dann auch für den 7. Juni " ++ "des Jahres 2011. Noch ein wichtiger Termin findet um 16:15 Uhr am Do., 1.2.03 statt. " ++ "Freitag, der 13. Juli ist kein Glückstag" ++ "und Freitag, der 13. Juli um 11:55 Uhr ist es zu spät." rrr :: [String] rrr = map _r . dateSearch' . tokenizeSubex tokenRE $ ts ddd :: [DateVal] ddd = map _d . dateSearch' . tokenizeSubex tokenRE $ ts aaa :: [DateRep] aaa = dateSearch' . tokenizeSubex tokenRE $ ts tt :: String -> [(String, String)] tt = tokenizeSubex tokenRE dd :: String -> [DateVal] dd = map _d . dateSearch' . tt rr :: String -> [String] rr = map _r . dateSearch' . tt pp :: String -> [String] pp = map _p . dateSearch' . tt -- ------------------------------------------------------------ testDate :: forall a . StringLike a => Test' a testDate = Test' $ TestLabel "date and time extraction from free text" $ TestList $ zipWith parseT toks exx where parseT res ok = TestCase $ assertEqual (show res ++ " == " ++ show ok) res ok toks :: [(a, a)] toks = tokenizeSubex (fromString tokenRE) (fromString ts) exx :: [(a, a)] exx = map (fromString *** fromString) $ [("word","Am"),("del"," "),("weekday","Sonntag"),("del",", "),("word","dem"),("del"," ") ,("ddMonthyyyy","17. Februar '03"),("del"," "),("word","findet"),("del"," "),("word","um") ,("del"," "),("HH","9 Uhr"),("del"," "),("word","ein"),("del"," "),("word","wichtiger") ,("del"," "),("word","Termin"),("del"," "),("word","f\252r"),("del"," "),("word","das") ,("del"," "),("ssem","Sommersemester 2000"),("del"," "),("word","statt"),("del",". ") ,("word","Dieser"),("del"," "),("word","wird"),("del"," "),("word","allerdings") ,("del"," "),("word","auf"),("del"," "),("weekday","Montag"),("del"," ") ,("word","verschoben"),("del",". "),("word","Und"),("del"," "),("word","zwar") ,("del"," "),("word","auf"),("del"," "),("word","den"),("del"," "),("word","ersten") ,("del"," "),("weekday","Montag"),("del"," "),("word","im"),("del"," ") ,("wsem","Wintersemester 11/12"),("del",", "),("HHMM","12:30"),("del",". ") ,("word","Ein"),("del"," "),("word","wichtiger"),("del"," "),("word","Termin") ,("del"," "),("word","findet"),("del"," "),("word","im"),("del"," "),("ssem","SoSe 2011") ,("del"," "),("word","statt"),("del",". "),("word","Im"),("del"," "),("word","Jahr") ,("del"," '"),("word","12"),("del"," "),("word","gibt"),("del"," "),("word","es") ,("del"," "),("word","Termine"),("del",", "),("word","aber"),("del"," "),("word","auch") ,("del"," "),("word","in"),("del"," "),("wsem","WS 2010/11"),("del",". "),("word","Ein") ,("del"," "),("word","weiterer"),("del"," "),("word","Termin"),("del"," "),("word","ist") ,("del"," "),("word","am"),("del"," "),("ddmmyyyy","2.4.11"),("del"," "),("word","um") ,("del"," "),("HH","12 Uhr"),("del",". "),("word","Oder"),("del"," "),("word","war") ,("del"," "),("word","es"),("del"," "),("word","doch"),("del"," "),("weekday","Di.") ,("del"," "),("word","der"),("del"," "),("ddmm","3.4."),("del","? "),("word","Egal") ,("del",". "),("word","Ein"),("del"," "),("word","weiterer"),("del"," ") ,("word","wichtiger"),("del"," "),("word","Termin"),("del"," "),("word","findet") ,("del"," "),("word","am"),("del"," "),("yyyymmdd","2001-3-4"),("del"," ") ,("word","statt"),("del"," "),("word","bzw"),("del",". "),("word","generell") ,("del"," "),("word","zwischen"),("del"," "),("yyyymmdd","01/3/4"),("del"," - ") ,("yyyymmdd","01/6/4"),("del"," "),("word","um"),("del"," "),("HH","13 Uhr") ,("del",". "),("word","Am"),("del"," "),("word","kommenden"),("del"," ") ,("weekday","Mittwoch"),("del"," "),("word","findet"),("del"," ") ,("word","Changemanagement"),("del"," "),("word","in"),("del"," "),("word","HS5") ,("del"," "),("word","statt"),("del",". "),("word","Dies"),("del"," "),("word","gilt") ,("del"," "),("word","dann"),("del"," "),("word","auch"),("del"," "),("word","f\252r") ,("del"," "),("word","den"),("del"," "),("ddMonth","7. Juni"),("del"," "),("word","des") ,("del"," "),("word","Jahres"),("del"," "),("yyyy","2011"),("del",". "),("word","Noch") ,("del"," "),("word","ein"),("del"," "),("word","wichtiger"),("del"," "),("word","Termin") ,("del"," "),("word","findet"),("del"," "),("word","um"),("del"," "),("HHMM","16:15 Uhr") ,("del"," "),("word","am"),("del"," "),("weekday","Do."),("del",", "),("ddmmyyyy","1.2.03") ,("del"," "),("word","statt"),("del",". "),("weekday","Freitag"),("del",", ") ,("word","der"),("del"," "),("ddMonth","13. Juli"),("del"," "),("word","ist"),("del"," ") ,("word","kein"),("del"," "),("word","Gl\252ckstagund"),("del"," "),("weekday","Freitag") ,("del",", "),("word","der"),("del"," "),("ddMonth","13. Juli"),("del"," "),("word","um") ,("del"," "),("HHMM","11:55 Uhr"),("del"," "),("word","ist"),("del"," "),("word","es") ,("del"," "),("word","zu"),("del"," "),("word","sp\228t"),("del",".") ] -- ------------------------------------------------------------ genericTest :: (forall a . StringLike a => Test' a) -> Test genericTest t = TestList $ [ TestLabel "Test with 'String'" $ unTest' (t :: Test' String) , TestLabel "Test with 'Text'" $ unTest' (t :: Test' Text) , TestLabel "Test with 'Text.Lazy'" $ unTest' (t :: Test' TextL) , TestLabel "Test with 'ByteString'" $ unTest' (t :: Test' BS) , TestLabel "Test with 'ByteString.Lazy'" $ unTest' (t :: Test' BL) ] allTests :: Test allTests = TestList [ genericTest testDate ] main :: IO () main = do c <- runTestTT allTests putStrLn $ show c let errs = errors c fails = failures c exitWith (codeGet errs fails) codeGet :: Int -> Int -> ExitCode codeGet errs fails | fails > 0 = ExitFailure 2 | errs > 0 = ExitFailure 1 | otherwise = ExitSuccess -- ------------------------------------------------------------ hxt-regex-xmlschema-9.2.0.3/test/SimpleTest.hs0000644000000000000000000004063512752557013017366 0ustar0000000000000000{-# LANGUAGE RankNTypes #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ViewPatterns #-} -- ------------------------------------------------------------ module Main where import Control.Arrow import qualified Data.ByteString.Char8 as B import qualified Data.ByteString.Lazy.Char8 as BL import Data.String (IsString (..)) import qualified Data.Text as T import qualified Data.Text.Lazy as TL import System.Exit (ExitCode (..), exitWith) import Text.Regex.XMLSchema.Generic import Text.Regex.XMLSchema.Generic.Regex import Text.Regex.XMLSchema.Generic.StringLike import Test.HUnit -- ------------------------------------------------------------ newtype Test' a = Test' {unTest' :: Test} type BS = B.ByteString type BL = BL.ByteString type Text = T.Text type TextL = TL.Text -- ------------------------------------------------------------ parseTestsStdLatin1 :: forall s . StringLike s => Test' s parseTestsStdLatin1 = parseTestsStd' testsLatin1 parseTestsStdUnicode :: forall s . StringLike s => Test' s parseTestsStdUnicode = parseTestsStd' testsUnicode parseTestsStd' :: forall s . StringLike s => [(String, String)] -> Test' s parseTestsStd' tests = Test' $ TestLabel "standard XML parse tests" $ TestList $ map parseTest $ tests where parseTest (re0, rep) = TestCase $ assertEqual (show re ++ " must be parsed as " ++ show rep) rep (show . parseRegexExt $ re) where re :: s re = fromString re0 testsLatin1 :: [(String, String)] testsLatin1 = [ ("", "()") , (".", ".") , (".*", "(.*)") , ("(())", "()") , ("(a*)*", "(a*)") , ("(a*)+", "(a*)") , ("(a+)*", "(a*)") , ("(a+)+", "(a+)") , ("(a?){2,}", "(a*)") , ("((a?){2,}){0,}", "(a*)") , ("((a?){2,}){3,}", "(a*)") , ("(a{0,}){2,}", "(a*)") , ("(a{2,}){3,}", "(a{6,})") , ("[9-0]", "{empty char range}") , ("[0-9]", "[0-9]") , ("[0-99-0]", "[0-9]") , ("[abc]", "[a-c]") , ("[abc-[b]]", "[ac]" ) , ("a|b|c|d", "[a-d]" ) , ("(a|b)|c", "[a-c]" ) , ("a|(b|c)", "[a-c]" ) , ("abc", "(a(bc))" ) -- seq is right ass , ("({1}a+)", "({1}(a+))" ) -- extension: labeled subexpressions , ("({1}({2}({3}a)))", "({1}({2}({3}a)))" ) ] testsUnicode :: [(String, String)] testsUnicode = [ ("[\0-\1114111]", "\\a") , ("[\0-\1114111]|[0-9]", "\\a") , ("[\0-\1114110]", "[�-􏿾]" ) ] parseTestsExtLatin1 :: forall s . StringLike s => Test' s parseTestsExtLatin1 = parseTestsExt' testsExtLatin1 parseTestsExtUnicode :: forall s . StringLike s => Test' s parseTestsExtUnicode = parseTestsExt' testsExtUnicode parseTestsExt' :: forall s . StringLike s => [(String, String)] -> Test' s parseTestsExt' tests = Test' $ TestLabel "extended parse tests" $ TestList $ map parseTest $ tests where parseTest (re0, rep) = TestCase $ assertEqual (show re ++ " must be parsed as " ++ show rep) rep (show . parseRegexExt $ re) where re :: s re = fromString re0 testsExtLatin1 :: [(String, String)] testsExtLatin1 = [ ("", "()") , (".", ".") , (".*", "(.*)") , ("\\a", "\\a") , ("\\A", "\\A") , ("(())", "()") , ("(a*)*", "(a*)") , ("(a*)+", "(a*)") , ("(a+)*", "(a*)") , ("(a+)+", "(a+)") , ("(a?){2,}", "(a*)") , ("((a?){2,}){0,}", "(a*)") , ("((a?){2,}){3,}", "(a*)") , ("(a{0,}){2,}", "(a*)") , ("(a{2,}){3,}", "(a{6,})") , ("[9-0]", "{empty char range}") , ("[0-9]", "[0-9]") , ("[0-99-0]", "[0-9]") , ("[abc]", "[a-c]") , ("[abc-[b]]", "[ac]" ) , ("a|b|c|d", "[a-d]" ) , ("(a|b)|c", "[a-c]" ) , ("a|(b|c)", "[a-c]" ) , ("abc", "(a(bc))" ) -- seq is right ass , ("a*{^}b*", "((a*){^}(b*))" ) -- extension: exor , ("a*{^}b*{^}c*", "((a*){^}((b*){^}(c*)))") , ("a*{&}b*", "((a*){&}(b*))" ) -- extension: intersection , ("a*{&}b*{&}c*", "((a*){&}((b*){&}(c*)))") , ("a*{\\}b*", "((a*){\\}(b*))" ) -- extension: set difference , ("a*{\\}b*{\\}c*", "(((a*){\\}(b*)){\\}(c*))" ) , ("(a|b)*{\\}(.*aa.*)", "(([ab]*){\\}((.*)(a(a(.*)))))" ) , ("({1}a+)", "({1}(a+))" ) -- extension: labeled subexpressions , ("({1}({2}({3}a)))", "({1}({2}({3}a)))" ) , ("({1}do){|}({2}[a-z]+)", "(({1}(do)){|}({2}([a-z]+)))" ) -- deterministic choice of submatches , ("a{:}b{:}c", "(a{:}(b{:}c))" ) -- interleave ] testsExtUnicode :: [(String, String)] testsExtUnicode = [ ("[\0-\1114111]", "\\a") , ("[\0-\1114111]|[0-9]", "\\a") , ("[\0-\1114110]", "[�-􏿾]" ) , ("[abc-[b]]", "[ac]" ) , ("a|b|c|d", "[a-d]" ) , ("(a|b)|c", "[a-c]" ) , ("a|(b|c)", "[a-c]" ) , ("abc", "(a(bc))" ) -- seq is right ass , ("a*{^}b*", "((a*){^}(b*))" ) -- extension: exor , ("a*{^}b*{^}c*", "((a*){^}((b*){^}(c*)))") , ("a*{&}b*", "((a*){&}(b*))" ) -- extension: intersection , ("a*{&}b*{&}c*", "((a*){&}((b*){&}(c*)))") , ("a*{\\}b*", "((a*){\\}(b*))" ) -- extension: set difference , ("a*{\\}b*{\\}c*", "(((a*){\\}(b*)){\\}(c*))" ) , ("(a|b)*{\\}(.*aa.*)", "(([ab]*){\\}((.*)(a(a(.*)))))" ) , ("({1}a+)", "({1}(a+))" ) -- extension: labeled subexpressions , ("({1}({2}({3}a)))", "({1}({2}({3}a)))" ) , ("({1}do){|}({2}[a-z]+)", "(({1}(do)){|}({2}([a-z]+)))" ) -- deterministic choice of submatches , ("a{:}b{:}c", "(a{:}(b{:}c))" ) -- interleave ] simpleMatchTests :: forall a . StringLike a => Test' a simpleMatchTests = Test' $ TestLabel "simple match tests" $ TestList $ concatMap matchTest $ testsMatch where matchTest :: (String, [String], [String]) -> [Test] matchTest (re0, ok, er) = map (matchOK re . fromString) ok ++ map (matchErr re . fromString) er where re :: a re = fromString re0 matchOK :: a -> a -> Test matchOK re xs = TestCase $ assertBool (show xs ++ " must match " ++ show re) (matchExt re xs) matchErr re xs = TestCase $ assertBool (show xs ++ " must not match " ++ show re) (not (matchExt re xs)) testsMatch :: [(String, [String], [String])] testsMatch = [ ( "" , [""] , ["a"] ) , ( "a" , ["a"] , ["", "b", "ab"] ) , ( "()" , [""] , ["a"] ) , ( "ab" , ["ab"] , ["", "b", "abc"] ) , ( "." , [".","a","\0","\1114111"] , ["\n","\r","",".."] ) , ( "\\a" , [".","a","\n","\r","\0","\1114111"] , ["",".."] ) , ( "\\A" , ["",".","a","\n","\r","\0","\1114111",".."] , [] ) , ( "a*" , ["", "a", "aa"] , ["b", "ab", "aab"] ) , ( "a+" , ["a", "aa", "aaa"] , ["", "b", "ab"] ) , ( "a?" , ["", "a"] , ["b", "ab"] ) , ( "a{2}" , ["aa"] , ["", "a", "aaa"] ) , ( "a{2,}" , ["aa","aaa"] , ["", "a", "aaab"] ) , ( "a{2,4}" , ["aa", "aaa", "aaaa"] , ["", "a", "aaaaa", "ab"] ) , ( "a|b" , ["a", "b"] , ["", "c", "ab", "abc"] ) , ( "[0-9]" , ["0", "5", "9"] , ["", "a", "00"] ) , ( "[^0-9]" , ["a"] , ["", "0", "9", "00"]) , ( "\32" , [" "] , [] ) , ( "[\0-\1114111]" , ["\0","\1114111","a"] , ["","aaa"] ) , ( "[^\0-\1114111]" , [] , ["","aaa","\0","\1114111","a"] ) , ( ".*a.*|.*b.*|.*c.*" , ["a", "abc", "acdc"] , ["", "dddd"] ) , ( ".*a.*{&}.*b.*{&}.*c.*" , ["abc", "abcd", "abcabcd"] , ["", "a", "bc", "acdc", "dddd"] ) , ( ".*a.*{&}.*b.*{&}.*c.*{&}.{3}" -- all permutations of "abc" , ["abc", "acb", "bac", "bca", "cab", "cba"] , ["", "a", "bc", "acd", "aaaa", "aba"] ) , ( ".*a.*{&}.*b.*{&}.*c.*" -- all words containing at least 1 a, 1 b and 1 c , ["abc", "acb", "bac", "bca", "cab", "cba", "abcd", "abcabc"] , ["", "a", "bc", "acd", "aaaa"] ) , ( ".*a.*{^}.*b.*" -- all words containing at least 1 a or 1 b but not both a's and b's , ["a", "b", "ac", "bc", "aaaa", "bbb", "aacc", "ccbb", "acdc"] , ["", "ab", "abc", "dddd"] ) , ( "/[*](.*{\\}(.*[*]/.*))[*]/" -- single line C comment of form /*...*/, but without any */ in the comment body -- this is the way to specify none greedy expessions -- if multi-line comment are required, substitute .* by \A, so newlines are allowed , ["/**/","/***/","/*x*/","/*///*/"] , ["", "/", "/*", "/*/", "/**/*/", "/*xxx*/xxx*/"] ) , ( "a{:}b{:}c" , ["abc", "acb", "bac", "bca", "cab", "cba"] , ["", "a", "ab", "abcc", "abca", "aba"] ) ] -- ------------------------------------------------------------ simpleSplitTests :: forall a . StringLike a => Test' a simpleSplitTests = Test' $ TestLabel "simple split tests" $ TestList $ map splitTest $ testsSplit where splitTest (re0, inp0, tok0, rest0) = TestCase $ assertEqual ("split " ++ show re ++ " " ++ show inp0 ++ " = " ++ show (tok, rest)) (tok, rest) (split re (fromString inp0)) where re, tok, rest :: a re = fromString re0 tok = fromString tok0 rest = fromString rest0 testsSplit :: [(String, String, String, String)] testsSplit = [ ("", "a", "", "a" ) , ("a*b", "abc", "ab", "c" ) , ("a*", "bc", "", "bc" ) , ("a+", "bc", "", "bc" ) , ("[", "bc", "", "bc" ) , ("a{2}", "aaa", "aa", "a" ) , ("a{2,}", "aaa", "aaa", "" ) , ("a|b", "ab", "a", "b" ) , ("a|b*", "bbba", "bbb", "a" ) , ("abc", "abcd", "abc", "d" ) ] -- ------------------------------------------------------------ simpleTokenTests :: forall a . StringLike a => Test' a simpleTokenTests = Test' $ TestLabel "simple token tests" $ TestList $ map tokenTest $ testsToken where tokenTest (re0, inp0, toks0) = TestCase $ assertEqual ("tokenize " ++ show re ++ " " ++ show inp ++ " = " ++ show toks) toks (tokenize re inp) where re, inp :: a re = fromString re0 inp = fromString inp0 toks :: [a] toks = map fromString toks0 testsToken :: [(String, String, [String])] testsToken = [ ("", "", [] ) , ("a", "aba", ["a", "a"] ) , ("a", "b", [] ) , ("a", "ba", ["a"] ) , ("a*", "a", ["a"] ) , ("a*", "ba", ["","a"] ) , ("a*", "aba", ["a", "a"] ) , ("a*", "abba", ["a", "", "a"] ) , ("a+", "abba", ["a", "a"] ) , ("a*b", "abba", ["ab", "b"] ) , (".*", "a\n\nb", ["a", "", "b"] ) , (".*", "a\n\nb\n", ["a", "", "b"] ) , ("\\w+", "a\n\nb\n", ["a", "b"] ) , ("\\w|ab", "aaa\n\nabc\n", ["a", "a", "a", "ab", "c"] ) , ("\\w|ab", "aaa abc", ["a", "a", "a", "ab", "c"] ) ] -- ------------------------------------------------------------ genericTest :: (forall a . StringLike a => Test' a) -> Test genericTest t = TestList $ [ TestLabel "Test with 'String'" $ unTest' (t :: Test' String) , TestLabel "Test with 'Text'" $ unTest' (t :: Test' Text) , TestLabel "Test with 'Text.Lazy'" $ unTest' (t :: Test' TextL) , TestLabel "Test with 'ByteString'" $ unTest' (t :: Test' BS) , TestLabel "Test with 'ByteString.Lazy'" $ unTest' (t :: Test' BL) ] unicodeTest :: (forall a . StringLike a => Test' a) -> Test unicodeTest t = TestList $ [ TestLabel "Test with 'String'" $ unTest' (t :: Test' String) , TestLabel "Test with 'Text'" $ unTest' (t :: Test' Text) , TestLabel "Test with 'Text.Lazy'" $ unTest' (t :: Test' TextL) ] allTests :: Test allTests = TestList [ genericTest parseTestsStdLatin1 , unicodeTest parseTestsStdUnicode , genericTest parseTestsExtLatin1 , unicodeTest parseTestsExtUnicode , genericTest simpleMatchTests , genericTest simpleSplitTests , genericTest simpleTokenTests ] main :: IO () main = do c <- runTestTT allTests putStrLn $ show c let errs = errors c fails = failures c exitWith (codeGet errs fails) codeGet :: Int -> Int -> ExitCode codeGet errs fails | fails > 0 = ExitFailure 2 | errs > 0 = ExitFailure 1 | otherwise = ExitSuccess -- ------------------------------------------------------------ deltaTrc :: StringLike s => s -> GenRegex s -> [(s, GenRegex s)] deltaTrc s@(uncons -> Just (c, cs)) re = (s, re) : ( if isZero re' then [(emptyS,re')] else deltaTrc cs re' ) where re' = delta1 c s re deltaTrc _ re = [(emptyS, re)] matchTrc :: StringLike s => s -> s -> (Bool, [(s, GenRegex s)]) matchTrc re s = (nullable . snd . last $ res, res) where res = deltaTrc s (parseRegex re) trcMatch :: StringLike s => s -> s -> IO() trcMatch re = putStrLn . showTrc . matchTrc re where showTrc = ( (show >>> (++ "\n")) *** (concatMap ( ( (toString >>> (++ "\t")) *** (show >>> (++"\n")) ) >>> uncurry (++) ) ) ) >>> uncurry (flip (++)) -- ------------------------------------------------------------ hxt-regex-xmlschema-9.2.0.3/test/Benchmark.hs0000644000000000000000000000422212752557013017157 0ustar0000000000000000{-# LANGUAGE RankNTypes #-} {-# LANGUAGE ScopedTypeVariables #-} -- ------------------------------------------------------------ module Main where import Control.DeepSeq import Criterion.Main import qualified Data.ByteString.Char8 as B import qualified Data.ByteString.Lazy.Char8 as BL import Data.String (IsString (..)) import qualified Data.Text as T import qualified Data.Text.Lazy as TL import Text.Regex.XMLSchema.Generic import Text.Regex.XMLSchema.Generic.StringLike -- import Debug.Trace -- ------------------------------------------------------------ type BS = B.ByteString type BL = BL.ByteString type Text = T.Text type TextL = TL.Text -- ------------------------------------------------------------ benchSTB :: String -> (forall s . (NFData s, StringLike s) => s -> [s]) -> (String, String -> [String]) -> String -> Benchmark benchSTB name fct ref inp = benchSTB' name fct ref $! mkInput inp benchSTB' :: String -> (forall s . (NFData s, StringLike s) => s -> [s]) -> (String, String -> [String]) -> (String, Text, TextL, BS, BL) -> Benchmark benchSTB' name fct (refName, ref) (s, t, tl, bs, bl) = bgroup name [ bench refName $ nf ref s , bench "String" $ nf fct s , bench "Text" $ nf fct t , bench "Text.Lazy" $ nf fct tl , bench "ByteString" $ nf fct bs , bench "ByteString.Lazy" $ nf fct bl ] mkInput :: String -> (String, Text, TextL, BS, BL) mkInput s = rnf t5 `seq` t5 where t5 = (s, fromString s, fromString s, fromString s, fromString s) words' :: StringLike s => s -> [s] words' inp = tokenize (fromString "\\w+") inp main :: IO () main = do defaultMain [ benchSTB "100,000-words" words' ("words", words) $ unwords (replicate 100000 "1234567890") ] -- ------------------------------------------------------------ hxt-regex-xmlschema-9.2.0.3/LICENSE0000644000000000000000000000212012752557013014752 0ustar0000000000000000The MIT License Copyright (c) 2005 Uwe Schmidt, Martin Schmidt, Torben Kuseler Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. hxt-regex-xmlschema-9.2.0.3/Setup.hs0000644000000000000000000000011012752557013015376 0ustar0000000000000000#!/usr/bin/env runhaskell import Distribution.Simple main = defaultMain hxt-regex-xmlschema-9.2.0.3/hxt-regex-xmlschema.cabal0000644000000000000000000000750313001410352020613 0ustar0000000000000000Name: hxt-regex-xmlschema Version: 9.2.0.3 Synopsis: A regular expression library for W3C XML Schema regular expressions Description: This library supports full W3C XML Schema regular expressions inclusive all Unicode character sets and blocks. The complete grammar can be found under . It is implemented by the technique of derivations of regular expressions. . The W3C syntax is extended to support not only union of regular sets, but also intersection, set difference, exor. Matching of subexpressions is also supported. . The library can be used for constricting lightweight scanners and tokenizers. It is a standalone library, no external regex libraries are used. . Extensions in 9.2: The library does nor only support String's, but also ByteString's and Text in strict and lazy variants License: MIT License-file: LICENSE Author: Uwe Schmidt Maintainer: Uwe Schmidt Copyright: Copyright (c) 2010- Uwe Schmidt Stability: stable Category: Text Homepage: http://www.haskell.org/haskellwiki/Regular_expressions_for_XML_Schema Build-type: Simple Cabal-version: >=1.10 extra-source-files: examples/colorizeProgs/ColorizeSourceCode.hs examples/colorizeProgs/Makefile examples/performance/REtest.hs examples/performance/Makefile examples/RegexXMLSchema/Makefile examples/RegexXMLSchema/REtest.hs flag profile description: turn profiling on default: False Library Exposed-modules: Text.Regex.Glob.String Text.Regex.Glob.Generic Text.Regex.Glob.Generic.RegexParser Text.Regex.XMLSchema.String Text.Regex.XMLSchema.Generic Text.Regex.XMLSchema.Generic.Matching Text.Regex.XMLSchema.Generic.RegexParser Text.Regex.XMLSchema.Generic.Regex Text.Regex.XMLSchema.Generic.StringLike hs-source-dirs: src default-language: Haskell2010 ghc-options: -Wall -fwarn-tabs -funbox-strict-fields if flag(profile) ghc-prof-options: -caf-all build-depends: base >= 4 && < 5 , bytestring >= 0.10 , hxt-charproperties >= 9 && < 10 , parsec >= 2.1 && < 4 , text >= 0.10 test-suite SimpleMatch hs-source-dirs: test main-is: SimpleTest.hs type: exitcode-stdio-1.0 default-language: Haskell2010 ghc-options: -Wall build-depends: base , bytestring , hxt-regex-xmlschema , text , HUnit test-suite Date hs-source-dirs: test main-is: Date.hs type: exitcode-stdio-1.0 default-language: Haskell2010 ghc-options: -Wall build-depends: base , bytestring , hxt-regex-xmlschema , parsec >= 2 , text , HUnit Benchmark Benchmark hs-source-dirs: test main-is: Benchmark.hs type: exitcode-stdio-1.0 default-language: Haskell2010 ghc-options: -Wall -O2 build-depends: base , bytestring , criterion >= 1 , deepseq >= 1.2 , hxt-regex-xmlschema , parsec >= 2 , text Source-Repository head Type: git Location: git://github.com/UweSchmidt/hxt.git hxt-regex-xmlschema-9.2.0.3/examples/colorizeProgs/ColorizeSourceCode.hs0000644000000000000000000010431312752557013024531 0ustar0000000000000000-- ------------------------------------------------------------ {- | Module : ColorizeSourceCode Copyright : Copyright (C) 2009 Uwe Schmidt License : BSD3 Maintainer : Uwe Schmidt (uwe@fh-wedel.de) Portability: portable Colorize Source Code Supports Java and Haskell -} -- ------------------------------------------------------------ module Main where import Control.Arrow import Data.List import System.Environment import System.IO -- import the IO and commandline option stuff import System.Console.GetOpt import System.Exit import Text.Regex.XMLSchema.Generic import Text.XML.HXT.Core import Text.XML.HXT.Parser.XhtmlEntities -- ------------------------------------------------------------ data Process = P { inFilter :: String -> String , tokenRE :: Regex , markupRE :: Regex -> Regex , formatToken :: (String, String) -> String , formatDoc :: [String] -> String , outFilter :: String -> String , input :: Handle , output :: Handle , inputFile :: String } defaultProcess :: Process defaultProcess = P { inFilter = id , tokenRE = plainRE , markupRE = id , formatToken = uncurry (++) , formatDoc = unlines , outFilter = id , input = stdin , output = stdout , inputFile = " " } -- ------------------------------------------------------------ main :: IO () main = do argv <- getArgs p <- evalArgs (getOpt Permute options argv) s <- hGetContents (input p) hPutStr (output p) (process p s) hFlush (output p) hClose (output p) exitWith ExitSuccess options :: [OptDescr (String, String)] options = [ Option "h?" ["help"] (NoArg ("help", "1")) "this message" , Option "" ["plain"] (NoArg ("plain", "1")) "don't colorize lines" , Option "" ["haskell"] (NoArg ("haskell", "1")) "colorize haskell" , Option "" ["java"] (NoArg ("java", "1")) "colorize java" , Option "" ["cpp"] (NoArg ("cpp", "1")) "colorize C or C++" , Option "" ["sh"] (NoArg ("sh", "1")) "colorize sh or bash" , Option "" ["ruby"] (NoArg ("ruby", "1")) "colorize ruby" , Option "" ["bnf"] (NoArg ("bnf", "1")) "colorize extended BNF grammar rules" , Option "" ["ppl"] (NoArg ("ppl", "1")) "colorize ppl" , Option "" ["pplass"] (NoArg ("pplass", "1")) "colorize ppl assembler" , Option "n" ["number"] (NoArg ("number", "1")) "with line numbers" , Option "t" ["tabs"] (NoArg ("tabs", "1")) "substitute tabs by blanks" , Option "m" ["markup"] (NoArg ("markup", "1")) "text contains embedded markup" , Option "e" ["erefs"] (NoArg ("erefs", "1")) "resolve HTML entity refs before processing" , Option "o" ["output"] (ReqArg ((,) "output") "FILE") "output file, \"-\" stands for stdout" , Option "s" ["scan"] (NoArg ("scan", "1")) "just scan input, for testing" , Option "x" ["html"] (NoArg ("html", "1")) "html output" , Option "f" ["full"] (NoArg ("full", "1")) "full HTML document with header and css" ] exitErr :: String -> IO a exitErr msg = do hPutStrLn stderr msg usage exitWith (ExitFailure 1) evalArgs :: ([(String, String)], [FilePath], [String]) -> IO Process evalArgs (opts, files, errs) | not (null errs) = exitErr ("illegal arguments " ++ show errs) | null files = evalOpts opts defaultProcess | not (null fns) = exitErr ("only one input file allowed") | otherwise = do inp <- openFile fn ReadMode evalOpts opts (defaultProcess { input = inp , inputFile = fn } ) where (fn:fns) = files evalOpts :: [(String, String)] -> Process -> IO Process evalOpts [] res = return res evalOpts (o:os) res = do res' <- evalOpt o res evalOpts os res' evalOpt :: (String, String) -> Process -> IO Process evalOpt ("help","1") _ = do usage exitWith ExitSuccess evalOpt ("output", "-") p = return $ p {output = stdout} evalOpt ("output", fn) p = do outp <- openFile fn WriteMode return $ p {output = outp} evalOpt ("haskell","1") p = return $ p { tokenRE = haskellRE } evalOpt ("java", "1") p = return $ p { tokenRE = javaRE } evalOpt ("cpp", "1") p = return $ p { tokenRE = cppRE } evalOpt ("sh", "1") p = return $ p { tokenRE = shRE } evalOpt ("ruby", "1") p = return $ p { tokenRE = rubyRE } evalOpt ("bnf", "1") p = return $ p { tokenRE = bnfRE } evalOpt ("ppl", "1") p = return $ p { tokenRE = pplRE } evalOpt ("pplass", "1") p = return $ p { tokenRE = pplassRE } evalOpt ("plain", "1") p = return $ p { tokenRE = plainRE } evalOpt ("scan", "1") p = return $ p { tokenRE = plainRE , formatToken = uncurry formatTok , formatDoc = formatHList } evalOpt ("number", "1") p = return $ p { formatDoc = numberLines >>> formatDoc p } evalOpt ("tabs", "1") p = return $ p { inFilter = inFilter p >>> substTabs } evalOpt ("erefs", "1") p = return $ p { inFilter = resolveHtmlEntities >>> inFilter p } evalOpt ("markup", "1") p = return $ p { markupRE = addMarkup } evalOpt ("html", "1") p = return $ p { formatToken = formatHtmlTok , formatDoc = formatHtmlDoc } evalOpt ("full", "1") p = return $ p { outFilter = outFilter p >>> fullHtml (inputFile p) } evalOpt (opt, _v ) p = exitErr ("illegal option " ++ show opt) >> return p usage :: IO () usage = hPutStrLn stderr use where use = usageInfo header options header = "colorizeSourceCode - colorize source code with HTML, version 0.1.1" -- ------------------------------------------------------------ process :: Process -> String -> String process p = inFilter p >>> tokenizeSubexRE (markupRE p (tokenRE p)) >>> map (formatToken p) >>> concat >>> lines >>> formatDoc p >>> outFilter p addMarkup :: Regex -> Regex addMarkup = mkElse (parseRegexExt . mkLE $ markupT) tokenizeLines :: String -> [(String, String)] tokenizeLines = map (\ l -> ("",l ++ "\n")) . lines numberLines :: [String] -> [String] numberLines = zipWith addNum [(1::Int)..] where addNum i l = "" ++ fmt 4 i ++ "" ++ l fmt l = sed (const " ") " " . reverse . take l . reverse . (replicate l ' ' ++) . show substTabs :: String -> String substTabs = subs 0 subs :: Int -> String -> String subs _ "" = "" subs i (x:xs) | x == '\t' = replicate (8 - (i `mod` 8)) ' ' ++ subs 0 xs | x == '\n' = x : subs 0 xs | otherwise = x : subs (i+1) xs -- ------------------------------------------------------------ resolveHtmlEntities :: String -> String resolveHtmlEntities = sed (replaceEntity . drop 1 . init) "&\\i\\c*;" where replaceEntity e = maybe ("&" ++ e ++ ";") ((:[]) . toEnum) . lookup e $ xhtmlEntities -- ------------------------------------------------------------ formatHList :: [String] -> String formatHList = ("[" ++) . (++ "\n]") . intercalate "\n, " formatTok :: String -> String -> String formatTok kw tok = " (" ++ show kw ++ ",\t" ++ show tok ++ "\t)\n" formatHtmlDoc :: [String] -> String formatHtmlDoc = map (("
" ++) . (++ "
") . preserveEmptyLines) >>> ("
" :) >>> (++ ["
"]) >>> unlines where preserveEmptyLines "" = " " preserveEmptyLines l = l formatHtmlTok :: (String, String) -> String formatHtmlTok ("markup", t@(x:_)) | x `elem` "<&" = t formatHtmlTok (m, t) | otherwise = colorizeTokens m (escapeText >>> sed (const " ") " " $ t) escapeText :: String -> String escapeText = foldr cquote "" where cquote = fst escapeHtmlRefs -- escapeText = concat . runLA (xshowEscapeXml mkText) fullHtml :: String -> String -> String fullHtml fn s = unlines [ "" , "" , "" ++ fn ++ "" , "" , "" , "" , s , "" , "" ] css :: String css = unlines [ ".typename { color: #0000dd; }" , ".varname { color: #000000; }" , ".opname { color: #770000; }" , ".operator { color: #770000; /* font-weight:bold; */ }" , ".keyglyph { color: #3070A0; /* font-weight:bold; */ }" , ".par { }" , "" , ".keyword { color: #3070A0; /* font-weight:bold; */ }" , ".typekeyword { color: #3070A0; /* font-weight:bold; */ }" , ".strconst { color: #228B22; }" , ".charconst { color: #228B22; }" , ".labelname { color: #FF00FF; font-weight:bold; }" , ".cppcommand { color: #0000CD; }" , ".specialword { color: #c80000; }" , ".classname { color: #8B2323; }" , ".comment { color: #00008B; }" , ".bnfnt { color: #0000CD; }" , ".bnfmeta { color: #ff0000; font-weight:bold; }" , ".bnfterminal { color: #008800; font-weight:bold; }" , ".tclproc { color: #FF6000; }" , ".tclvar { color: #0000CD; }" , ".tclcomment { color: #c80000; }" , "" , ".linenr { color: #909090; padding-right: 2em; }" , "div.codeline { font-family: monospace; width: 100%; white-space: pre; border-width: 1px; border-style: solid; border-color: transparent; padding-left: 0.3em; }" , "div.codeline:hover { background-color:#ddddff; color:#c80000; border-width: 1px; border-style: solid; border-color: #c80000; }" ] -- ------------------------------------------------------------ colorizeTokens :: String -> String -> String colorizeTokens tok | tok `elem` [ "comment" , "keyword" , "keyglyph" , "typekeyword" , "varname", "typename", "labelname", "instancename", "globalname" , "opname" , "par" , "operator" , "strconst", "charconst" , "bnfnt", "bnfmeta" , "cppcommand" , "specialword" ] = wrap | tok == "longcomment" = wrap' "comment" . mlc | tok == "bnfterminal" = wrap . drop 1 . init -- | tok == "markupstart" = (("")) . drop 4 . init -- | tok == "markupend" = const "" | null tok = const "" | otherwise = id where wrap = wrap' tok wrap' tok' = (("") ++) . (++ "") mlc = sed (("" ++) . (++ "")) "(\\n\r?)" -- ------------------------------------------------------------ buildRegex :: [(String, String)] -> Regex buildRegex = foldr1 mkElse . map (uncurry mkBr') . map (second parseRegexExt) where mkBr' "" = id mkBr' l = mkBr l buildKeywords :: [String] -> String buildKeywords = intercalate "|" untilRE :: String -> String untilRE re = "(\\A{" ++ "\\}\\A" ++ re ++ "\\A)" ++ re mkLE :: (String, String) -> String mkLE (l, re) = "({" ++ l ++ "}(" ++ re ++ "))" ws1RE, ws1RE',ws0RE :: String ws1RE = "\\s+" ws1RE' = "[ \t]+" ws0RE = "[ \t]*" ws, ws', javacmt1, javacmt, shcmt1, strconst, markupT, charconst, number, par, xxx :: (String, String) -- markupS = ("markupstart", "<[a-zA-Z0-9]+>" ) -- markupE = ("markupend", "" ) markupT = ("markup", ( "" ++ "|" ++ "&" ++ xname ++ ";" ) ) where xname = "[A-Za-z][-_:A-Za-z0-9]*" xattr = ws1RE ++ xname ++ eq ++ "(" ++ dq ++ "|" ++ sq ++ ")" eq = "\\s*=\\s*" dq = "\"[^\"]*\"" sq = "\'[^\']*\'" ws = ("ws", ws1RE ) ws' = ("ws", ws1RE' ) javacmt1 = ("comment", "//.*" ) javacmt = ("longcomment", "/\\*" ++ untilRE "\\*/" ) shcmt1 = ("comment", "#.*" ) strconst = ("strconst", "\"([^\"\\\\\n\r]|\\\\.)*\"" ) charconst = ("charconst", "\'([^\'\\\\\n\r]|\\\\.)*\'" ) number = ("number", "[0-9]+(\\.[0-9]*([eE][-+]?[0-9]+)?)?" ) par = ("par", "[\\(\\)\\[\\]\\{\\}]" ) xxx = ("xxx", "." ) -- ------------------------------------------------------------ plainRE :: Regex plainRE = buildRegex [ ("xxx", "[^<&\n]+" ) , ("xxx", "[<&\n]" ) ] -- ------------------------------------------------------------ haskellRE :: Regex haskellRE = buildRegex [ ws , ("comment", "(-)- .*" ) , ("longcomment", "\\{" ++ untilRE "-\\}" ) , ("keyword", buildKeywords [ "case", "class" , "data", "default", "deriving", "do" , "else" , "forall" , "if", "import", "in" , "infix", "infixl", "infixr" , "instance" , "let" , "module" , "newtype" , "of" , "qualified" , "then", "type" , "where" , "_" , "as", "ccall", "foreign", "hiding", "proc", "safe", "unsafe" ] ) , ("keyglyph", buildKeywords ["\\.\\.","::","=","\\\\","\\|","<-","->","-<","@","~","=>","!",",",";"] ) , ("varname" , varname ) , ("typename", "[A-Z_][a-zA-Z0-9_]*[']*" ) , ("opname", "`" ++ varname ++ "`" ) , strconst , charconst , number , par , ("operator", "[-!#$%&\\*\\+./<=>\\?@\\\\^\\|~]+") , xxx ] where varname = "[a-z_][a-zA-Z0-9_]*[']*" -- ------------------------------------------------------------ javaRE :: Regex javaRE = buildRegex [ ws , javacmt1 , javacmt , ("keyword", buildKeywords [ "abstract", "assert" , "break" , "case", "catch", "class", "continue" , "default", "do" , "else", "extends" , "final", "finally", "for" , "if", "implements", "import", "instanceof", "interface" , "native", "new" , "package", "private", "protected", "public" , "return" , "static", "super", "switch", "synchronized" , "this", "throw", "throws", "transient", "try" , "volatile" , "while" ] ) , ("typekeyword", buildKeywords [ "boolean", "byte" , "char" , "double" , "false", "float" , "int" , "long" , "null" , "short" , "true" , "void" ] ) , ("labelname", "(" ++ varname ++ "{\\}default):" ) , ("", ( mkLE ("keyword", "break|continue") ++ mkLE ws ++ mkLE ("labelname", varname) ) ) , ("varname", varname ) , ("typename", "[A-Z][a-zA-Z0-9_]*" ) , strconst , charconst , number , par , ("delimiter", "[.,;]" ) , ("operator", "[-+!%&/=\\*\\?~|<>:]+" ) , xxx ] where varname = "[a-z][a-zA-Z0-9_]*" -- ------------------------------------------------------------ bnfRE :: Regex bnfRE = buildRegex [ ws , ("bnfnt" , "[A-Z][a-zA-Z0-9_]*" ) , ("bnfterminal", "\"([^\"\\\\\n\r]|\\\\.)*\"" ) , ("bnfmeta", buildKeywords [ "\\[" , "\\]" , "::=" , "\\|" , "\\{" , "\\}" ] ) , xxx ] -- ------------------------------------------------------------ cppRE :: Regex cppRE = buildRegex [ ws , javacmt1 , javacmt , ("keyword", buildKeywords [ "asm" , "auto" , "break" , "case" , "catch" , "class" , "const" , "continue" , "default" , "delete" , "do" , "else" , "extern" , "for" , "friend" , "goto" , "if" , "inline" , "new" , "operator" , "overload" , "private" , "protected" , "public" , "register" , "return" , "sizeof" , "static" , "switch" , "template" , "this" , "typedef" , "throw" , "try" , "virtual" , "volatile" , "while" ] ) , ("typekeyword", buildKeywords [ "char" , "double" , "enum" , "float" , "int" , "long" , "short" , "signed" , "struct" , "union" , "unsigned" , "void" ] ) , ("cppcommand", ( "#" ++ ws0RE ++ "(" ++ buildKeywords [ "define" , "else" , "endif" , "if" , "ifdef" , "ifndef" , "(include[ \t].*)" , "undef" ] ++ ")" ) ) , ("specialword", buildKeywords [ "assert" , "exit" , "free" , "main" , "malloc" ] ) , ("varname", varname ) , ("typename", "[A-Z][a-zA-Z0-9_]*" ) , strconst , charconst , number , par , ("delimiter", "[.,;]" ) , ("operator", "[-+!%&/=\\*\\?~|<>:]+" ) , xxx ] where varname = "[a-z][a-zA-Z0-9_]*" -- ------------------------------------------------------------ shRE :: Regex shRE = buildRegex [ ws , shcmt1 , ("keyword", buildKeywords [ "alias" , "break" , "bg" , "case" , "cd" , "continue" , "declare" , "do" , "done" , "echo" , "elif" , "else" , "env" , "esac" , "eval" , "exec" , "exit" , "export" , "false" , "fg" , "fi" , "for" , "function" , "if" , "in" , "jobs" , "kill" , "local" , "pwd" , "return" , "set" , "shift" , "test" , "then" , "trap" , "true" , "unalias" , "unset" , "while" , "wait" ] ) , ("varname", "[A-Za-z_][a-zA-Z0-9_]*" ) , ("operator", "[-+!%&=\\\\\\*\\?~|<>:@$]+" ) , ("operator", "[\\(\\)\\[\\]\\{\\}]+" ) , strconst , charconst , xxx ] -- ------------------------------------------------------------ rubyRE :: Regex rubyRE = buildRegex [ ws , rubycmt , ("keyword", buildKeywords [ "begin" , "break" , "catch" , "case" , "class" , "def" , "do" , "else" , "elif" , "end" , "ensure" , "false" , "for" , "if" , "in" , "include" , "initialize" , "loop" , "module" , "new" , "nil" , "raise" , "require" , "rescue" , "self" , "then" , "true" , "type" , "until" , "when" , "while" , "yield" ] ) , ("typename", "[A-Z][A-Za-z0-9]*" ) , ("varname", "[A-Za-z_][a-zA-Z0-9_]*(!|\\?)?" ) , ("instancename", "(@{1,2}|$)[A-Za-z_][a-zA-Z0-9_]*" ) , ("strconst", "%[qQx]\\{.*\\}" ) , ("strconst", "#\\{.*\\}" ) , ("strconst", ":[a-z][A-Za-z0-9]*" ) , strconst , charconst , regex , xxx ] where rubycmt = ("comment", "#(.{\\}\\{)*" ) regex = ("strconst", "/([^/\\\\\n\r]|\\\\.)*/" ) -- ------------------------------------------------------------ pplRE :: Regex pplRE = buildRegex [ ws , ("comment", "(-)- .*" ) , ("keyword", buildKeywords [ "and" , "begin" , "div" , "do" , "else" , "elseif" , "endif" , "endwhile" , "end" , "function" , "if" , "max" , "min" , "mod" , "not" , "of" , "or" , "procedure" , "repeat" , "return" , "then" , "until" , "var" , "while" , "xor" ] ) , ("typekeyword", buildKeywords [ "boolean" , "false" , "float" , "int" , "list" , "picture" , "string" , "true" ] ) , ("varname", "[A-Za-z_][a-zA-Z0-9_]*" ) , strconst , number , xxx ] -- ------------------------------------------------------------ pplassRE :: Regex pplassRE = buildRegex [ ws , ("comment", "(-)- .*" ) , ("keyword", buildKeywords [ "loadi" , "loadf" , "loads" , "emptyl" , "undef" , "load" ] ) , ("typename", buildKeywords [ "store" , "pop" ] ) , ("typekeyword", buildKeywords [ "jmp" , "brfalse" , "brtrue" , "pushj" , "popj" , "svc" ] ) , ("labelname", "(l[0-9]+:?)|([se]?_[A-Za-z0-9]*:?)" ) , ("varname", "[A-Za-z_][a-zA-Z0-9_]*" ) , strconst , xxx ] -- ------------------------------------------------------------ hxt-regex-xmlschema-9.2.0.3/examples/colorizeProgs/Makefile0000644000000000000000000000022713001405044022052 0ustar0000000000000000all : colorize install : all sudo cp colorize /usr/local/bin clean : cabal clean colorize: cabal install .PHONY : all install clean colorize hxt-regex-xmlschema-9.2.0.3/examples/performance/REtest.hs0000644000000000000000000000503612752557013021637 0ustar0000000000000000{-# LANGUAGE BangPatterns#-} -- ---------------------------------------- module Main(main) where import Text.Regex.XMLSchema.Generic import Control.Arrow import Data.Maybe import System.IO -- import the IO and commandline option stuff import System.Environment -- ---------------------------------------- main :: IO () main = do p <- getProgName al <- getArgs let i = if null al then 4 else (read . head $ al)::Int main' p i where main' p' = fromMaybe main1 . lookup (pn p') $ mpt mpt = [ ("REtest", main1) , ("Copy", main2 "copy" (:[])) , ("Lines", main2 "lines" lines) , ("RElines", main2 "relines" relines) , ("SElines", main2 "selines'" relines') , ("Words", main2 "words" words) , ("REwords", main2 "rewords" rewords) , ("SEwords", main2 "sewords" rewords') ] -- ---------------------------------------- -- generate a document containing a binary tree of 2^i leafs (= 2^(i-1) XML elements) main1 :: Int -> IO () main1 i = do genDoc i "REtest.hs" (fn i) return () -- ---------------------------------------- -- read a document containing a binary tree of 2^i leafs main2 :: String -> (String -> [String]) -> Int -> IO () main2 ext lines' i = do hPutStrLn stderr "start processing" h <- openFile (fn i) ReadMode c <- hGetContents h let ls = lines' c o <- openFile (fn i ++ "." ++ ext) WriteMode mapM_ (hPutStrLn o) ls hClose o hClose h hPutStrLn stderr "end processing" relines :: String -> [String] relines = tokenize ".*" relines' :: String -> [String] relines' = tokenizeSubex "({line}.*)" >>> map snd rewords :: String -> [String] rewords = tokenize "\\S+" rewords' :: String -> [String] rewords' = tokenizeSubex "({word}\\S+)" >>> map snd -- ---------------------------------------- pn :: String -> String pn = reverse . takeWhile (/= '/') . reverse fn :: Int -> String fn = ("lines-" ++) . (++ ".txt") . reverse . take 4 . reverse . ((replicate 4 '0') ++ ) . show -- ---------------------------------------- genDoc :: Int -> String -> String -> IO () genDoc d inp outp = do s <- readFile inp let s' = take (2^d) . concat . repeat $ s writeFile outp s' -- ---------------------------------------- hxt-regex-xmlschema-9.2.0.3/examples/performance/Makefile0000644000000000000000000000526112752557013021535 0ustar0000000000000000PKGFLAGS = GHCFLAGS = -Wall -O2 GHC = ghc $(GHCFLAGS) $(PKGFLAGS) CNT = 3 ropts = +RTS -s -RTS prog = ./REtest prog2 = ./Lines prog3 = ./RElines prog3a = ./SElines prog4 = ./Words prog5 = ./REwords prog5a = ./SEwords prog0 = ./Copy progs = $(prog) $(prog0) $(prog2) $(prog3) $(prog3a) $(prog4) $(prog5) $(prog5a) all : $(progs) $(prog) : $(prog).hs $(GHC) --make -o $@ $< local : $(GHC) --make -o $(prog) -fglasgow-exts $(prog).hs $(prog2) : $(prog) ln -f $(prog) $(prog2) $(prog3) : $(prog) ln -f $(prog) $(prog3) $(prog3a) : $(prog) ln -f $(prog) $(prog3a) $(prog4) : $(prog) ln -f $(prog) $(prog4) $(prog5) : $(prog) ln -f $(prog) $(prog5) $(prog5a) : $(prog) ln -f $(prog) $(prog5a) $(prog0) : $(prog) ln -f $(prog) $(prog0) # generate and read documents containing text # with 2^i characters tests = 25 test : $(prog) $(MAKE) genfiles tests="$(tests)" $(MAKE) copy tests="$(tests)" $(MAKE) lines tests="$(tests)" $(MAKE) relines tests="$(tests)" $(MAKE) selines tests="$(tests)" $(MAKE) words tests="$(tests)" $(MAKE) rewords tests="$(tests)" $(MAKE) sewords tests="$(tests)" perftest : $(prog) $(MAKE) test tests="2 3 10 11 12 13 14 15 16 17 18 19 20" genfiles : @for i in $(tests) ; \ do \ echo time $(prog) $(ropts) $$i ; \ time $(prog) $(ropts) $$i ; \ ls -l lines-*$$i.txt ; \ echo ; \ done copy : @for i in $(tests) ; \ do \ echo time $(prog0) $(ropts) $$i ; \ time $(prog0) $(ropts) $$i ; \ ls -l lines-*$$i.txt.copy ; \ echo ; \ done lines : @for i in $(tests) ; \ do \ echo time $(prog2) $(ropts) $$i ; \ time $(prog2) $(ropts) $$i ; \ ls -l lines-*$$i.txt.lines ; \ echo ; \ done relines : @for i in $(tests) ; \ do \ echo time $(prog3) $(ropts) $$i ; \ time $(prog3) $(ropts) $$i ; \ ls -l lines-*$$i.txt.relines ; \ echo ; \ done selines : @for i in $(tests) ; \ do \ echo time $(prog3a) $(ropts) $$i ; \ time $(prog3a) $(ropts) $$i ; \ ls -l lines-*$$i.txt.selines ; \ echo ; \ done words : @for i in $(tests) ; \ do \ echo time $(prog4) $(ropts) $$i ; \ time $(prog4) $(ropts) $$i ; \ ls -l lines-*$$i.txt.words ; \ echo ; \ done rewords : @for i in $(tests) ; \ do \ echo time $(prog5) $$i ; \ time $(prog5) $(ropts) $$i ; \ ls -l lines-*$$i.txt.rewords ; \ echo ; \ done sewords : @for i in $(tests) ; \ do \ echo time $(prog5a) $$i ; \ time $(prog5a) $(ropts) $$i ; \ ls -l lines-*$$i.txt.sewords ; \ echo ; \ done distclean : clean clean : rm -f $(progs) *.o *.hi *.txt *.txt.* .PHONY : clean distclean test perftest all local words rewords sewords lines relines selines copy genfiles hxt-regex-xmlschema-9.2.0.3/examples/RegexXMLSchema/Makefile0000644000000000000000000000513312752557013022006 0ustar0000000000000000# $Id: Makefile,v 1.9 2006/11/11 15:36:03 hxml Exp $ HXT_HOME = ../../.. PKGFLAGS = GHCFLAGS = -Wall -O2 GHC = ghc $(GHCFLAGS) $(PKGFLAGS) DIST = $(HXT_HOME)/dist/examples/arrows DIST_DIR = $(DIST)/RegexXMLSchema CNT = 3 ropts = +RTS -s -RTS prog = ./REtest prog2 = ./Lines prog3 = ./RElines prog4 = ./Words prog5 = ./REwords prog0 = ./Copy progs = $(prog) $(prog0) $(prog2) $(prog3) $(prog4) $(prog5) all : $(progs) $(prog) : $(prog).hs $(GHC) --make -o $@ $< local : $(GHC) --make -o $(prog) -fglasgow-exts -ignore-package hxt -i../../../src $(prog).hs $(prog2) : $(prog) ln -f $(prog) $(prog2) $(prog3) : $(prog) ln -f $(prog) $(prog3) $(prog4) : $(prog) ln -f $(prog) $(prog4) $(prog5) : $(prog) ln -f $(prog) $(prog5) $(prog0) : $(prog) ln -f $(prog) $(prog0) # generate and read documents containing a binary tree # with 2^i leaf nodes containing the numbers 1 to 2^i # for i up to at least 22 (8M XML elements) output works fine # for i up to 19 (1M XML elements) input works without swapping # with i=20 swapping starts, but the program it still terminates # the size of the XML file for i=20 is about 36Mb # these tests have run on a box with 1Gb memory tests = 18 test : $(prog) $(MAKE) genfiles tests="$(tests)" $(MAKE) copy tests="$(tests)" $(MAKE) lines tests="$(tests)" $(MAKE) relines tests="$(tests)" $(MAKE) words tests="$(tests)" $(MAKE) rewords tests="$(tests)" perftest : $(prog) $(MAKE) test tests="2 3 10 11 12 13 14 15 16 17 18 19 20" genfiles : @for i in $(tests) ; \ do \ echo time $(prog) $(ropts) $$i ; \ time $(prog) $(ropts) $$i ; \ ls -l tree-*$$i.xml ; \ echo ; \ done copy : @for i in $(tests) ; \ do \ echo time $(prog0) $(ropts) $$i ; \ time $(prog0) $(ropts) $$i ; \ ls -l tree-*$$i.xml.copy ; \ echo ; \ done lines : @for i in $(tests) ; \ do \ echo time $(prog2) $(ropts) $$i ; \ time $(prog2) $(ropts) $$i ; \ ls -l tree-*$$i.xml.lines ; \ echo ; \ done relines : @for i in $(tests) ; \ do \ echo time $(prog3) $(ropts) $$i ; \ time $(prog3) $(ropts) $$i ; \ ls -l tree-*$$i.xml.relines ; \ echo ; \ done words : @for i in $(tests) ; \ do \ echo time $(prog4) $(ropts) $$i ; \ time $(prog4) $(ropts) $$i ; \ ls -l tree-*$$i.xml.words ; \ echo ; \ done rewords : @for i in $(tests) ; \ do \ echo time $(prog5) $$i ; \ time $(prog5) $(ropts) $$i ; \ ls -l tree-*$$i.xml.rewords ; \ echo ; \ done dist : [ -d $(DIST_DIR) ] || mkdir -p $(DIST_DIR) cp Makefile REtest.hs $(DIST_DIR) clean : rm -f $(progs) *.o *.hi *.xml *.xml.* hxt-regex-xmlschema-9.2.0.3/examples/RegexXMLSchema/REtest.hs0000644000000000000000000000751412752557013022115 0ustar0000000000000000{-# LANGUAGE BangPatterns#-} -- ---------------------------------------- module Main(main) where import Text.XML.HXT.Core import Text.Regex.XMLSchema.Generic import Data.String.Unicode ( unicodeToXmlEntity ) import Control.Monad.State.Strict hiding (when) import Data.Maybe import System.IO -- import the IO and commandline option stuff import System.Environment -- ---------------------------------------- main :: IO () main = do p <- getProgName al <- getArgs let i = if null al then 4 else (read . head $ al)::Int main' p i where main' p' = fromMaybe main1 . lookup (pn p') $ mpt mpt = [ ("REtest", main1) , ("Copy", main2 "copy" (:[])) , ("Lines", main2 "lines" lines) , ("RElines", main2 "relines" relines) , ("Words", main2 "words" words) , ("REwords", main2 "rewords" rewords) ] -- ---------------------------------------- -- generate a document containing a binary tree of 2^i leafs (= 2^(i-1) XML elements) main1 :: Int -> IO () main1 i = runX (genDoc i (fn i)) >> return () -- ---------------------------------------- -- read a document containing a binary tree of 2^i leafs main2 :: String -> (String -> [String]) -> Int -> IO () main2 ext lines' i = do hPutStrLn stderr "start processing" h <- openBinaryFile (fn i) ReadMode c <- hGetContents h let ls = lines' c o <- openBinaryFile (fn i ++ "." ++ ext) WriteMode mapM_ (hPutStrLn o) ls hClose o hClose h hPutStrLn stderr "end processing" relines :: String -> [String] relines = tokenize "[^\n\r]*" rewords :: String -> [String] rewords = tokenize "[^ \t\n\r]+" -- ---------------------------------------- pn :: String -> String pn = reverse . takeWhile (/= '/') . reverse fn :: Int -> String fn = ("tree-" ++) . (++ ".xml") . reverse . take 4 . reverse . ((replicate 4 '0') ++ ) . show -- ---------------------------------------- genDoc :: Int -> String -> IOSArrow b XmlTree genDoc d out = constA (mkBTree d) >>> xpickleVal xpickle >>> indentDoc >>> putDoc out -- ---------------------------------------- type Counter a = State Int a incr :: Counter Int incr = do modify (+1) get -- ---------------------------------------- data BTree = Leaf Int | Fork BTree BTree deriving (Show) instance XmlPickler BTree where xpickle = xpAlt tag ps where tag (Leaf _ ) = 0 tag (Fork _ _ ) = 1 ps = [ xpWrap ( Leaf, \ (Leaf i) -> i) ( xpElem "leaf" $ xpAttr "value" $ xpickle ) , xpWrap ( uncurry Fork, \ (Fork l r) -> (l, r)) ( xpElem "fork" $ xpPair xpickle xpickle ) ] -- ---------------------------------------- mkBTree :: Int -> BTree mkBTree depth = evalState (mkT depth) 0 mkT :: Int -> Counter BTree mkT 0 = do i <- incr return (Leaf i) mkT n = do l <- mkT (n-1) r <- mkT (n-1) return (Fork l r) -- ---------------------------------------- -- output is done with low level ops to write the -- document i a lazy manner -- adding an xml pi and encoding is done "by hand" -- latin1 decoding is the identity, so please generate the -- docs with latin1 encoding. Here ist done even with ASCCI -- every none ASCII char is represented by a char ref (&nnn;) putDoc :: String -> IOStateArrow s XmlTree XmlTree putDoc dst = addXmlPi >>> addXmlPiEncoding isoLatin1 >>> xshow getChildren >>> arr unicodeToXmlEntity >>> arrIO (\ s -> hPutDocument (\h -> hPutStrLn h s)) >>> none where isStdout = null dst || dst == "-" hPutDocument :: (Handle -> IO()) -> IO() hPutDocument action | isStdout = action stdout | otherwise = do handle <- openBinaryFile dst WriteMode action handle hClose handle -- ----------------------------------------