regex-tdfa-utf8-1.0/0000755000000000000000000000000011161454104012447 5ustar0000000000000000regex-tdfa-utf8-1.0/LICENSE0000644000000000000000000000275111161454104013461 0ustar0000000000000000This module is under this "3 clause" BSD license: Copyright (c) 2007-2009, Christopher Kuklewicz All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * The names of the contributors may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. regex-tdfa-utf8-1.0/regex-tdfa-utf8.cabal0000644000000000000000000000141611161454104016347 0ustar0000000000000000name: regex-tdfa-utf8 Cabal-Version: >= 1.2 version: 1.0 synopsis: This combines regex-tdfa with utf8-string to allow searching over UTF8 encoded lazy bytestrings. 
description: This combines regex-tdfa with utf8-string to allow searching over UTF8 encoded lazy bytestrings. category: Text license: BSD3 license-file: LICENSE author: Chris Kuklewicz maintainer: haskell@list.mightyreason.com build-depends: base build-type: Simple library Build-Depends: base,array,bytestring,utf8-string,regex-base,regex-tdfa >= 1.1.1 Exposed-Modules: Text.Regex.TDFA.UTF8 Buildable: True Extensions: MultiParamTypeClasses Ghc-Options: -O2 regex-tdfa-utf8-1.0/Setup.lhs0000644000000000000000000000011411161454104014253 0ustar0000000000000000#!/usr/bin/env runhaskell > import Distribution.Simple > main = defaultMain regex-tdfa-utf8-1.0/Text/0000755000000000000000000000000011161454104013373 5ustar0000000000000000regex-tdfa-utf8-1.0/Text/Regex/0000755000000000000000000000000011161454104014445 5ustar0000000000000000regex-tdfa-utf8-1.0/Text/Regex/TDFA/0000755000000000000000000000000011161454104015163 5ustar0000000000000000regex-tdfa-utf8-1.0/Text/Regex/TDFA/UTF8.hs0000644000000000000000000000517011161454104016250 0ustar0000000000000000module Text.Regex.TDFA.UTF8(Utf8(Utf8,utf8)) where import Data.Array.IArray((!)) import Data.Maybe(listToMaybe) import qualified Data.ByteString.Lazy.Char8 as L(ByteString,empty) import qualified Data.ByteString.Lazy.UTF8 as U(take,drop,uncons,toString) import Text.Regex.Base(RegexLike(..),RegexMaker(..),Extract(..),MatchArray,RegexContext(..)) import Text.Regex.Base.Impl(polymatch,polymatchM) import Text.Regex.TDFA.String() -- instances only import Text.Regex.TDFA.Common(Regex(..),CompOption,ExecOption(captureGroups),Position) import Text.Regex.TDFA.NewDFA.Uncons(Uncons(uncons)) import qualified Text.Regex.TDFA.NewDFA.Engine as Engine(execMatch) import qualified Text.Regex.TDFA.NewDFA.Tester as Tester(matchTest) -- This is a newtype for the instances we are making. -- You will likely want to use a pre-existing newtype from your code. 
-- | Wrapper marking a lazy 'L.ByteString' as UTF-8 encoded text.  The regex
-- class instances in this module are all defined on this type.
newtype Utf8 = Utf8 { utf8 :: L.ByteString }
  deriving (Show, Read, Eq, Ord)

-- | Slicing primitives used to turn match offsets into sub-texts.  Offsets
-- are passed to 'U.take' and 'U.drop' of the UTF-8 aware bytestring module.
instance Extract Utf8 where
  {-# INLINE empty #-}
  empty = Utf8 L.empty

  {-# INLINE before #-}
  before count = Utf8 . U.take charCount . utf8
    where charCount = fromIntegral count

  {-# INLINE after #-}
  after count = Utf8 . U.drop charCount . utf8
    where charCount = fromIntegral count

-- | Lets the matching engine consume the text one decoded 'Char' at a time
-- by delegating to 'U.uncons' and re-wrapping the remainder.
instance Uncons Utf8 where
  {-# INLINE uncons #-}
  uncons = fmap rewrap . U.uncons . utf8
    where rewrap (c, rest) = (c, Utf8 rest)

-- | A pattern supplied as 'Utf8' is decoded with 'U.toString' and compiled
-- through the 'String' instance of 'RegexMaker'.
instance RegexMaker Regex CompOption ExecOption Utf8 where
  makeRegexOptsM compOpts execOpts (Utf8 bytes) =
    makeRegexOptsM compOpts execOpts (U.toString bytes)

-- | Local alias of the engine's @execMatch@, kept so that a copy
-- specialised to 'Utf8' can be requested from the compiler.
execMatch :: Uncons text => Regex -> Position -> Char -> text -> [MatchArray]
{-# SPECIALIZE execMatch :: Regex -> Position -> Char -> Utf8 -> [MatchArray] #-}
execMatch = Engine.execMatch

-- | Local alias of the tester's @matchTest@, kept so that a copy
-- specialised to 'Utf8' can be requested from the compiler.
myMatchTest :: Uncons text => Regex -> text -> Bool
{-# SPECIALIZE myMatchTest :: Regex -> Utf8 -> Bool #-}
myMatchTest = Tester.matchTest

-- | Matching operations over 'Utf8' text.
instance RegexLike Regex Utf8 where
  -- The first element of the full match list, if there is one.
  matchOnce regex text = listToMaybe (matchAll regex text)

  -- Run the engine from position 0; the '\n' is the 'Char' argument the
  -- engine expects (presumably the character preceding the start -- confirm
  -- against the engine's documentation).
  matchAll regex text = execMatch regex 0 '\n' text

  -- Count matches with capture groups switched off in the exec options.
  matchCount regex text = length (matchAll countingRegex text)
    where
      execOpts = regex_execOptions regex
      countingRegex = regex { regex_execOptions = execOpts { captureGroups = False } }

  matchTest = myMatchTest

  -- Split the source around the first match: the text before it, every
  -- group paired with its extracted text, and the text after it.
  matchOnceText regex source =
    case matchOnce regex source of
      Nothing -> Nothing
      Just groups ->
        let (start, size) = groups ! 0
            withText loc = (extract loc source, loc)
        in Just ( before start source
                , fmap withText groups
                , after (start + size) source
                )

  -- Walk the match list while dropping the already-passed prefix of the
  -- source, so each extraction starts from the end of the previous match.
  matchAllText regex source = walk 0 source (matchAll regex source)
    where
      -- This clause never succeeds; it only forces the running offset.
      walk consumed _ _ | consumed `seq` False = undefined
      walk _ _ [] = []
      walk consumed remaining (m : ms) =
        fmap withText m : (remaining' `seq` walk (start + size) remaining' ms)
        where
          -- Offsets inside a match array are relative to the original
          -- source, hence the subtraction of what was already dropped.
          (start, size) = m ! 0
          withText loc@(off, len) = (extract (off - consumed, len) remaining, loc)
          remaining' = after (start + (size - consumed)) remaining

-- | Enables the polymorphic @match@ / @matchM@ interface for 'Utf8' text.
instance RegexContext Regex Utf8 Utf8 where
  match = polymatch
  matchM = polymatchM