charset-0.3.7.1/0000755000000000000000000000000012507023225011475 5ustar0000000000000000charset-0.3.7.1/.travis.yml0000644000000000000000000000002212507023225013600 0ustar0000000000000000language: haskell charset-0.3.7.1/charset.cabal0000644000000000000000000000251612507023225014116 0ustar0000000000000000name: charset version: 0.3.7.1 license: BSD3 license-File: LICENSE copyright: (c) Edward Kmett 2010-2012 author: Edward Kmett maintainer: ekmett@gmail.com cabal-version: >= 1.6 stability: Experimental category: Data homepage: http://github.com/ekmett/charset bug-reports: http://github.com/ekmett/charset/issues synopsis: Fast unicode character sets based on complemented PATRICIA tries description: Fast unicode character sets based on complemented PATRICIA tries build-type: Simple extra-source-files: .travis.yml source-repository head type: git location: git://github.com/ekmett/charset.git library extensions: CPP other-extensions: MagicHash, BangPatterns build-depends: base >= 4 && < 5, array >= 0.2 && < 0.6, bytestring >= 0.9 && < 0.11, containers >= 0.2 && < 0.6, semigroups >= 0.8.3.1 && < 1, unordered-containers >= 0.1.4.6 && < 0.3 exposed-modules: Data.CharSet Data.CharSet.Common Data.CharSet.Posix Data.CharSet.Posix.Ascii Data.CharSet.Posix.Unicode Data.CharSet.Unicode Data.CharSet.Unicode.Block Data.CharSet.Unicode.Category Data.CharSet.ByteSet hs-source-dirs: src ghc-options: -Wall -fspec-constr -fdicts-cheap -O2 charset-0.3.7.1/LICENSE0000644000000000000000000000276212507023225012511 0ustar0000000000000000Copyright (c) 2010, Edward Kmett All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Edward Kmett nor the names of other contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. charset-0.3.7.1/Setup.lhs0000644000000000000000000000017412507023225013307 0ustar0000000000000000#!/usr/bin/env runhaskell \begin{code} import Distribution.Simple main = defaultMainWithHooks simpleUserHooks \end{code} charset-0.3.7.1/src/0000755000000000000000000000000012507023225012264 5ustar0000000000000000charset-0.3.7.1/src/Data/0000755000000000000000000000000012507023225013135 5ustar0000000000000000charset-0.3.7.1/src/Data/CharSet.hs0000644000000000000000000002504512507023225015030 0ustar0000000000000000{-# OPTIONS_GHC -fspec-constr #-} {-# LANGUAGE CPP #-} {-# LANGUAGE DeriveDataTypeable #-} #if defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ >= 702 {-# LANGUAGE Trustworthy #-} #endif ----------------------------------------------------------------------------- -- | -- Module : Data.CharSet -- Copyright : (c) Edward Kmett 2010-2011 -- License : BSD3 -- Maintainer : ekmett@gmail.com -- Stability : experimental -- Portability : portable -- -- Fast set membership tests for 'Char' values -- -- Stored as a (possibly negated) IntMap and a fast set used for the head byte. -- -- The set of valid (possibly negated) head bytes is stored unboxed as a 32-byte -- bytestring-based lookup table. -- -- Designed to be imported qualified: -- -- > import Data.CharSet (CharSet) -- > import qualified Data.CharSet as CharSet -- ------------------------------------------------------------------------------- module Data.CharSet ( -- * Set type CharSet(..) -- * Operators , (\\) -- * Query , null , size , member , notMember , overlaps, isSubsetOf , isComplemented -- * Construction , build , empty , singleton , full , insert , delete , complement , range -- * Combine , union , intersection , difference -- * Filter , filter , partition -- * Map , map -- * Fold , fold -- * Conversion -- ** List , toList , fromList -- ** Ordered list , toAscList , fromAscList , fromDistinctAscList -- ** IntMaps , fromCharSet , toCharSet -- ** Array , toArray ) where import Data.Array.Unboxed hiding (range) import Data.Data import Data.Function (on) import Data.IntSet (IntSet) import Data.CharSet.ByteSet (ByteSet) import qualified Data.CharSet.ByteSet as ByteSet import Data.Bits hiding (complement) import Data.Word import Data.ByteString.Internal (c2w) import Data.Semigroup import qualified Data.IntSet as I import qualified Data.List as L import Prelude hiding (filter, map, null) import qualified Prelude as P import Text.Read data CharSet = CharSet !Bool {-# UNPACK #-} !ByteSet !IntSet deriving Typeable charSet :: Bool -> IntSet -> CharSet charSet b s = CharSet b (ByteSet.fromList (fmap headByte (I.toAscList s))) s headByte :: Int -> Word8 headByte i | i <= 0x7f = toEnum i | i <= 0x7ff = toEnum $ 0x80 + (i `shiftR` 6) | i <= 0xffff = toEnum $ 0xe0 + (i `shiftR` 12) | otherwise = toEnum $ 0xf0 + (i `shiftR` 18) pos :: IntSet -> CharSet pos = charSet True neg :: IntSet -> CharSet neg = charSet False (\\) :: CharSet -> CharSet -> CharSet (\\) = difference build :: (Char -> Bool) -> CharSet build p = fromDistinctAscList $ P.filter p [minBound .. maxBound] {-# INLINE build #-} map :: (Char -> Char) -> CharSet -> CharSet map f (CharSet True _ i) = pos (I.map (fromEnum . f . toEnum) i) map f (CharSet False _ i) = fromList $ P.map f $ P.filter (\x -> fromEnum x `I.notMember` i) [ul..uh] {-# INLINE map #-} isComplemented :: CharSet -> Bool isComplemented (CharSet True _ _) = False isComplemented (CharSet False _ _) = True {-# INLINE isComplemented #-} toList :: CharSet -> String toList (CharSet True _ i) = P.map toEnum (I.toList i) toList (CharSet False _ i) = P.filter (\x -> fromEnum x `I.notMember` i) [ul..uh] {-# INLINE toList #-} toAscList :: CharSet -> String toAscList (CharSet True _ i) = P.map toEnum (I.toAscList i) toAscList (CharSet False _ i) = P.filter (\x -> fromEnum x `I.notMember` i) [ul..uh] {-# INLINE toAscList #-} empty :: CharSet empty = pos I.empty singleton :: Char -> CharSet singleton = pos . I.singleton . fromEnum {-# INLINE singleton #-} full :: CharSet full = neg I.empty -- | /O(n)/ worst case null :: CharSet -> Bool null (CharSet True _ i) = I.null i null (CharSet False _ i) = I.size i == numChars {-# INLINE null #-} -- | /O(n)/ size :: CharSet -> Int size (CharSet True _ i) = I.size i size (CharSet False _ i) = numChars - I.size i {-# INLINE size #-} insert :: Char -> CharSet -> CharSet insert c (CharSet True _ i) = pos (I.insert (fromEnum c) i) insert c (CharSet False _ i) = neg (I.delete (fromEnum c) i) {-# INLINE insert #-} range :: Char -> Char -> CharSet range a b | a <= b = fromDistinctAscList [a..b] | otherwise = empty delete :: Char -> CharSet -> CharSet delete c (CharSet True _ i) = pos (I.delete (fromEnum c) i) delete c (CharSet False _ i) = neg (I.insert (fromEnum c) i) {-# INLINE delete #-} complement :: CharSet -> CharSet complement (CharSet True s i) = CharSet False s i complement (CharSet False s i) = CharSet True s i {-# INLINE complement #-} union :: CharSet -> CharSet -> CharSet union (CharSet True _ i) (CharSet True _ j) = pos (I.union i j) union (CharSet True _ i) (CharSet False _ j) = neg (I.difference j i) union (CharSet False _ i) (CharSet True _ j) = neg (I.difference i j) union (CharSet False _ i) (CharSet False _ j) = neg (I.intersection i j) {-# INLINE union #-} intersection :: CharSet -> CharSet -> CharSet intersection (CharSet True _ i) (CharSet True _ j) = pos (I.intersection i j) intersection (CharSet True _ i) (CharSet False _ j) = pos (I.difference i j) intersection (CharSet False _ i) (CharSet True _ j) = pos (I.difference j i) intersection (CharSet False _ i) (CharSet False _ j) = neg (I.union i j) {-# INLINE intersection #-} difference :: CharSet -> CharSet -> CharSet difference (CharSet True _ i) (CharSet True _ j) = pos (I.difference i j) difference (CharSet True _ i) (CharSet False _ j) = pos (I.intersection i j) difference (CharSet False _ i) (CharSet True _ j) = neg (I.union i j) difference (CharSet False _ i) (CharSet False _ j) = pos (I.difference j i) {-# INLINE difference #-} member :: Char -> CharSet -> Bool member c (CharSet True b i) | c <= toEnum 0x7f = ByteSet.member (c2w c) b | otherwise = I.member (fromEnum c) i member c (CharSet False b i) | c <= toEnum 0x7f = not (ByteSet.member (c2w c) b) | otherwise = I.notMember (fromEnum c) i {-# INLINE member #-} notMember :: Char -> CharSet -> Bool notMember c s = not (member c s) {-# INLINE notMember #-} fold :: (Char -> b -> b) -> b -> CharSet -> b fold f z (CharSet True _ i) = I.fold (f . toEnum) z i fold f z (CharSet False _ i) = foldr f z $ P.filter (\x -> fromEnum x `I.notMember` i) [ul..uh] {-# INLINE fold #-} filter :: (Char -> Bool) -> CharSet -> CharSet filter p (CharSet True _ i) = pos (I.filter (p . toEnum) i) filter p (CharSet False _ i) = neg $ foldr (I.insert) i $ P.filter (\x -> (x `I.notMember` i) && not (p (toEnum x))) [ol..oh] {-# INLINE filter #-} partition :: (Char -> Bool) -> CharSet -> (CharSet, CharSet) partition p (CharSet True _ i) = (pos l, pos r) where (l,r) = I.partition (p . toEnum) i partition p (CharSet False _ i) = (neg (foldr I.insert i l), neg (foldr I.insert i r)) where (l,r) = L.partition (p . toEnum) $ P.filter (\x -> x `I.notMember` i) [ol..oh] {-# INLINE partition #-} overlaps :: CharSet -> CharSet -> Bool overlaps (CharSet True _ i) (CharSet True _ j) = not (I.null (I.intersection i j)) overlaps (CharSet True _ i) (CharSet False _ j) = not (I.isSubsetOf j i) overlaps (CharSet False _ i) (CharSet True _ j) = not (I.isSubsetOf i j) overlaps (CharSet False _ i) (CharSet False _ j) = any (\x -> I.notMember x i && I.notMember x j) [ol..oh] -- not likely {-# INLINE overlaps #-} isSubsetOf :: CharSet -> CharSet -> Bool isSubsetOf (CharSet True _ i) (CharSet True _ j) = I.isSubsetOf i j isSubsetOf (CharSet True _ i) (CharSet False _ j) = I.null (I.intersection i j) isSubsetOf (CharSet False _ i) (CharSet True _ j) = all (\x -> I.member x i && I.member x j) [ol..oh] -- not bloody likely isSubsetOf (CharSet False _ i) (CharSet False _ j) = I.isSubsetOf j i {-# INLINE isSubsetOf #-} fromList :: String -> CharSet fromList = pos . I.fromList . P.map fromEnum {-# INLINE fromList #-} fromAscList :: String -> CharSet fromAscList = pos . I.fromAscList . P.map fromEnum {-# INLINE fromAscList #-} fromDistinctAscList :: String -> CharSet fromDistinctAscList = pos . I.fromDistinctAscList . P.map fromEnum {-# INLINE fromDistinctAscList #-} -- isProperSubsetOf :: CharSet -> CharSet -> Bool -- isProperSubsetOf (P i) (P j) = I.isProperSubsetOf i j -- isProperSubsetOf (P i) (N j) = null (I.intersection i j) && ... -- isProperSubsetOf (N i) (N j) = I.isProperSubsetOf j i ul, uh :: Char ul = minBound uh = maxBound {-# INLINE ul #-} {-# INLINE uh #-} ol, oh :: Int ol = fromEnum ul oh = fromEnum uh {-# INLINE ol #-} {-# INLINE oh #-} numChars :: Int numChars = oh - ol + 1 {-# INLINE numChars #-} instance Data CharSet where gfoldl k z set | isComplemented set = z complement `k` complement set | otherwise = z fromList `k` toList set toConstr set | isComplemented set = complementConstr | otherwise = fromListConstr dataTypeOf _ = charSetDataType gunfold k z c = case constrIndex c of 1 -> k (z fromList) 2 -> k (z complement) _ -> error "gunfold" fromListConstr :: Constr fromListConstr = mkConstr charSetDataType "fromList" [] Prefix {-# NOINLINE fromListConstr #-} complementConstr :: Constr complementConstr = mkConstr charSetDataType "complement" [] Prefix {-# NOINLINE complementConstr #-} charSetDataType :: DataType charSetDataType = mkDataType "Data.CharSet.CharSet" [fromListConstr, complementConstr] {-# NOINLINE charSetDataType #-} -- returns an intset and if the charSet is positive fromCharSet :: CharSet -> (Bool, IntSet) fromCharSet (CharSet b _ i) = (b, i) {-# INLINE fromCharSet #-} toCharSet :: IntSet -> CharSet toCharSet = pos {-# INLINE toCharSet #-} instance Eq CharSet where (==) = (==) `on` toAscList instance Ord CharSet where compare = compare `on` toAscList instance Bounded CharSet where minBound = empty maxBound = full -- TODO return a tighter bounded array perhaps starting from the least element present to the last element present? toArray :: CharSet -> UArray Char Bool toArray set = array (minBound, maxBound) $ fmap (\x -> (x, x `member` set)) [minBound .. maxBound] instance Show CharSet where showsPrec d i | isComplemented i = showParen (d > 10) $ showString "complement " . showsPrec 11 (complement i) | otherwise = showParen (d > 10) $ showString "fromDistinctAscList " . showsPrec 11 (toAscList i) instance Read CharSet where readPrec = parens $ complemented +++ normal where complemented = prec 10 $ do Ident "complement" <- lexP complement `fmap` step readPrec normal = prec 10 $ do Ident "fromDistinctAscList" <- lexP fromDistinctAscList `fmap` step readPrec instance Semigroup CharSet where (<>) = union instance Monoid CharSet where mempty = empty mappend = union charset-0.3.7.1/src/Data/CharSet/0000755000000000000000000000000012507023225014466 5ustar0000000000000000charset-0.3.7.1/src/Data/CharSet/ByteSet.hs0000644000000000000000000000406712507023225016410 0ustar0000000000000000{-# LANGUAGE BangPatterns, MagicHash #-} {-# LANGUAGE CPP #-} #if defined(__GLASGOW_HASKELL__) && __GLASGOW_HASKELL__ >= 702 {-# LANGUAGE Trustworthy #-} #endif ----------------------------------------------------------------------------- -- | -- Module : Data.CharSet.ByteSet -- Copyright : Edward Kmett 2011 -- Bryan O'Sullivan 2008 -- License : BSD3 -- -- Maintainer : ekmett@gmail.com -- Stability : experimental -- Portability : non-portable (BangPatterns, MagicHash) -- -- Fast set membership tests for byte values, The set representation is -- unboxed for efficiency and uses a lookup table. This is a fairly minimal -- API. You probably want to use CharSet. ----------------------------------------------------------------------------- module Data.CharSet.ByteSet ( -- * Data type ByteSet(..) -- * Construction , fromList -- * Lookup , member ) where import Data.Bits ((.&.), (.|.)) import Foreign.Storable (peekByteOff, pokeByteOff) import GHC.Base (Int(I#), iShiftRA#, narrow8Word#, shiftL#) import GHC.Word (Word8(W8#)) import qualified Data.ByteString as B import qualified Data.ByteString.Internal as I import qualified Data.ByteString.Unsafe as U newtype ByteSet = ByteSet B.ByteString deriving (Eq, Ord, Show) data I = I {-# UNPACK #-} !Int {-# UNPACK #-} !Word8 shiftR :: Int -> Int -> Int shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#) shiftL :: Word8 -> Int -> Word8 shiftL (W8# x#) (I# i#) = W8# (narrow8Word# (x# `shiftL#` i#)) index :: Int -> I index i = I (i `shiftR` 3) (1 `shiftL` (i .&. 7)) {-# INLINE index #-} fromList :: [Word8] -> ByteSet fromList s0 = ByteSet $ I.unsafeCreate 32 $ \t -> do _ <- I.memset t 0 32 let go [] = return () go (c:cs) = do prev <- peekByteOff t byte :: IO Word8 pokeByteOff t byte (prev .|. bit) go cs where I byte bit = index (fromIntegral c) go s0 -- | Check the set for membership. member :: Word8 -> ByteSet -> Bool member w (ByteSet t) = U.unsafeIndex t byte .&. bit /= 0 where I byte bit = index (fromIntegral w) charset-0.3.7.1/src/Data/CharSet/Common.hs0000644000000000000000000000277512507023225016265 0ustar0000000000000000----------------------------------------------------------------------------- -- | -- Module : Data.CharSet.Common -- Copyright : (c) Edward Kmett 2010-2012 -- License : BSD3 -- Maintainer : ekmett@gmail.com -- Stability : experimental -- Portability : portable -- -- The various character classifications from "Data.Char" as 'CharSet's ------------------------------------------------------------------------------- module Data.CharSet.Common ( -- ** Data.Char classes control , space , lower , upper , alpha , alphaNum , print , digit , octDigit , letter , mark , number , punctuation , symbol , separator , ascii , latin1 , asciiUpper , asciiLower ) where import Prelude () import Data.Char import Data.CharSet -- Haskell character classes from Data.Char control, space, lower, upper, alpha, alphaNum, print, digit, octDigit, letter, mark, number, punctuation, symbol, separator, ascii, latin1 , asciiUpper, asciiLower :: CharSet control = build isControl space = build isSpace lower = build isLower upper = build isUpper alpha = build isAlpha alphaNum = build isAlphaNum print = build isPrint digit = build isDigit octDigit = build isOctDigit letter = build isLetter mark = build isMark number = build isNumber punctuation = build isPunctuation symbol = build isSymbol separator = build isSeparator ascii = build isAscii latin1 = build isLatin1 asciiUpper = build isAsciiUpper asciiLower = build isAsciiLower charset-0.3.7.1/src/Data/CharSet/Posix.hs0000644000000000000000000000111212507023225016117 0ustar0000000000000000----------------------------------------------------------------------------- -- | -- Module : Data.CharSet.Posix -- Copyright : (c) Edward Kmett 2011-2012 -- License : BSD3 -- Maintainer : ekmett@gmail.com -- Stability : experimental -- Portability : portable -- ------------------------------------------------------------------------------- module Data.CharSet.Posix ( posixAscii , lookupPosixAsciiCharSet , posixUnicode , lookupPosixUnicodeCharSet ) where import Data.CharSet.Posix.Ascii import Data.CharSet.Posix.Unicode import Prelude () charset-0.3.7.1/src/Data/CharSet/Unicode.hs0000644000000000000000000002117412507023225016415 0ustar0000000000000000{-# LANGUAGE DeriveDataTypeable #-} ----------------------------------------------------------------------------- -- | -- Module : Data.CharSet.Unicode -- Copyright : (c) Edward Kmett 2010 -- License : BSD3 -- Maintainer : ekmett@gmail.com -- Stability : experimental -- Portability : portable -- -- Provides unicode general categories, which are typically connoted by -- @\\p{Ll}@ or @\\p{Modifier_Letter}@. Lookups can be constructed using 'categories' -- or individual character sets can be used directly. ------------------------------------------------------------------------------- module Data.CharSet.Unicode ( -- * Unicode General Category UnicodeCategory(..) -- * Lookup , unicodeCategories -- * CharSets by UnicodeCategory -- ** Letter , modifierLetter, otherLetter, letter -- *** Letter\& , lowercaseLetter, uppercaseLetter, titlecaseLetter, letterAnd -- ** Mark , nonSpacingMark, spacingCombiningMark, enclosingMark, mark -- ** Separator , space, lineSeparator, paragraphSeparator, separator -- ** Symbol , mathSymbol, currencySymbol, modifierSymbol, otherSymbol, symbol -- ** Number , decimalNumber, letterNumber, otherNumber, number -- ** Punctuation , dashPunctuation, openPunctuation, closePunctuation, initialQuote , finalQuote, connectorPunctuation, otherPunctuation, punctuation -- ** Other , control, format, privateUse, surrogate, notAssigned, other ) where import Data.Char import Data.Data import Data.CharSet data UnicodeCategory = UnicodeCategory String String CharSet String deriving (Show, Data, Typeable) -- \p{Letter} or \p{Mc} unicodeCategories :: [UnicodeCategory] unicodeCategories = [ UnicodeCategory "Letter" "L" letter "any kind of letter from any language." , UnicodeCategory "Lowercase_Letter" "Ll" lowercaseLetter "a lowercase letter that has an uppercase variant" , UnicodeCategory "Uppercase_Letter" "Lu" uppercaseLetter "an uppercase letter that has a lowercase variant" , UnicodeCategory "Titlecase_Letter" "Lt" titlecaseLetter "a letter that appears at the start of a word when only the first letter of the word is capitalized" , UnicodeCategory "Letter&" "L&" letterAnd "a letter that exists in lowercase and uppercase variants (combination of Ll, Lu and Lt)" , UnicodeCategory "Modifier_Letter" "Lm" modifierLetter "a special character that is used like a letter" , UnicodeCategory "Other_Letter" "Lo" otherLetter "a letter or ideograph that does not have lowercase and uppercase variants" , UnicodeCategory "Mark" "M" mark "a character intended to be combined with another character (e.g. accents, umlauts, enclosing boxes, etc.)" , UnicodeCategory "Non_Spacing_Mark" "Mn" nonSpacingMark "a character intended to be combined with another character without taking up extra space (e.g. accents, umlauts, etc.)" , UnicodeCategory "Spacing_Combining_Mark" "Mc" spacingCombiningMark "a character intended to be combined with another character that takes up extra space (vowel signs in many Eastern languages)" , UnicodeCategory "Enclosing_Mark" "Me" enclosingMark "a character that encloses the character is is combined with (circle, square, keycap, etc.)" , UnicodeCategory "Separator" "Z" separator "any kind of whitespace or invisible separator" , UnicodeCategory "Space_Separator" "Zs" space "a whitespace character that is invisible, but does take up space" , UnicodeCategory "Line_Separator" "Zl" lineSeparator "line separator character U+2028" , UnicodeCategory "Paragraph_Separator" "Zp" paragraphSeparator "paragraph separator character U+2029" , UnicodeCategory "Symbol" "S" symbol "math symbols, currency signs, dingbats, box-drawing characters, etc." , UnicodeCategory "Math_Symbol" "Sm" mathSymbol "any mathematical symbol" , UnicodeCategory "Currency_Symbol" "Sc" currencySymbol "any currency sign" , UnicodeCategory "Modifier_Symbol" "Sk" modifierSymbol "a combining character (mark) as a full character on its own" , UnicodeCategory "Other_Symbol" "So" otherSymbol "various symbols that are not math symbols, currency signs, or combining characters" , UnicodeCategory "Number" "N" number "any kind of numeric character in any script" , UnicodeCategory "Decimal_Digit_Number" "Nd" decimalNumber "a digit zero through nine in any script except ideographic scripts" , UnicodeCategory "Letter_Number" "Nl" letterNumber "a number that looks like a letter, such as a Roman numeral" , UnicodeCategory "Other_Number" "No" otherNumber "a superscript or subscript digit, or a number that is not a digit 0..9 (excluding numbers from ideographic scripts)" , UnicodeCategory "Punctuation" "P" punctuation "any kind of punctuation character" , UnicodeCategory "Dash_Punctuation" "Pd" dashPunctuation "any kind of hyphen or dash" , UnicodeCategory "Open_Punctuation" "Ps" openPunctuation "any kind of opening bracket" , UnicodeCategory "Close_Punctuation" "Pe" closePunctuation "any kind of closing bracket" , UnicodeCategory "Initial_Punctuation" "Pi" initialQuote "any kind of opening quote" , UnicodeCategory "Final_Punctuation" "Pf" finalQuote "any kind of closing quote" , UnicodeCategory "Connector_Punctuation" "Pc" connectorPunctuation "a punctuation character such as an underscore that connects words" , UnicodeCategory "Other_Punctuation" "Po" otherPunctuation "any kind of punctuation character that is not a dash, bracket, quote or connector" , UnicodeCategory "Other" "C" other "invisible control characters and unused code points" , UnicodeCategory "Control" "Cc" control "an ASCII 0x00..0x1F or Latin-1 0x80..0x9F control character" , UnicodeCategory "Format" "Cf" format "invisible formatting indicator" , UnicodeCategory "Private_Use" "Co" privateUse "any code point reserved for private use" , UnicodeCategory "Surrogate" "Cs" surrogate "one half of a surrogate pair in UTF-16 encoding" , UnicodeCategory "Unassigned" "Cn" notAssigned "any code point to which no character has been assigned.properties" ] cat :: GeneralCategory -> CharSet cat category = build ((category ==) . generalCategory) -- Letter lowercaseLetter, uppercaseLetter, titlecaseLetter, letterAnd, modifierLetter, otherLetter, letter :: CharSet lowercaseLetter = cat LowercaseLetter uppercaseLetter = cat UppercaseLetter titlecaseLetter = cat TitlecaseLetter letterAnd = lowercaseLetter `union` uppercaseLetter `union` titlecaseLetter modifierLetter = cat ModifierLetter otherLetter = cat OtherLetter letter = letterAnd `union` modifierLetter `union` otherLetter -- Marks nonSpacingMark, spacingCombiningMark, enclosingMark, mark :: CharSet nonSpacingMark = cat NonSpacingMark spacingCombiningMark = cat SpacingCombiningMark enclosingMark = cat EnclosingMark mark = nonSpacingMark `union` spacingCombiningMark `union` enclosingMark space, lineSeparator, paragraphSeparator, separator :: CharSet space = cat Space lineSeparator = cat LineSeparator paragraphSeparator = cat ParagraphSeparator separator = space `union` lineSeparator `union` paragraphSeparator mathSymbol, currencySymbol, modifierSymbol, otherSymbol, symbol :: CharSet mathSymbol = cat MathSymbol currencySymbol = cat CurrencySymbol modifierSymbol = cat ModifierSymbol otherSymbol = cat OtherSymbol symbol = mathSymbol `union` currencySymbol `union` modifierSymbol `union` otherSymbol decimalNumber, letterNumber, otherNumber, number :: CharSet decimalNumber = cat DecimalNumber letterNumber = cat LetterNumber otherNumber = cat OtherNumber number = decimalNumber `union` letterNumber `union` otherNumber dashPunctuation, openPunctuation, closePunctuation, initialQuote, finalQuote, connectorPunctuation, otherPunctuation, punctuation :: CharSet dashPunctuation = cat DashPunctuation openPunctuation = cat OpenPunctuation closePunctuation = cat ClosePunctuation initialQuote = cat InitialQuote finalQuote = cat FinalQuote connectorPunctuation = cat ConnectorPunctuation otherPunctuation = cat OtherPunctuation punctuation = dashPunctuation `union` openPunctuation `union` closePunctuation `union` initialQuote `union` finalQuote `union` connectorPunctuation `union` otherPunctuation control, format, privateUse, surrogate, notAssigned, other :: CharSet control = cat Control format = cat Format privateUse = cat PrivateUse surrogate = cat Surrogate notAssigned = cat NotAssigned other = control `union` format `union` privateUse `union` surrogate `union` notAssigned charset-0.3.7.1/src/Data/CharSet/Posix/0000755000000000000000000000000012507023225015570 5ustar0000000000000000charset-0.3.7.1/src/Data/CharSet/Posix/Ascii.hs0000644000000000000000000000347112507023225017161 0ustar0000000000000000----------------------------------------------------------------------------- -- | -- Module : Data.CharSet.Posix.Ascii -- Copyright : (c) Edward Kmett 2010 -- License : BSD3 -- Maintainer : ekmett@gmail.com -- Stability : experimental -- Portability : portable -- ------------------------------------------------------------------------------- module Data.CharSet.Posix.Ascii ( posixAscii , lookupPosixAsciiCharSet -- * Traditional POSIX ASCII \"classes\" , alnum, alpha, ascii, blank, cntrl, digit, graph, print, word, punct, space, upper, lower, xdigit ) where import Prelude hiding (print) import Data.Char import Data.CharSet import Data.HashMap.Lazy (HashMap) import qualified Data.HashMap.Lazy as HashMap alnum, alpha, ascii, blank, cntrl, digit, graph, print, word, punct, space, upper, lower, xdigit :: CharSet alnum = alpha `union` digit alpha = lower `union` upper ascii = range '\x00' '\x7f' blank = fromList " \t" cntrl = insert '\x7f' $ range '\x00' '\x1f' digit = range '0' '9' lower = range 'a' 'z' upper = range 'A' 'Z' graph = range '\x21' '\x7e' print = insert '\x20' graph word = insert '_' alnum punct = fromList "-!\"#$%&'()*+,./:;<=>?@[\\]^_`{|}~" space = fromList " \t\r\n\v\f" xdigit = digit `union` range 'a' 'f' `union` range 'A' 'F' -- :digit:, etc. posixAscii :: HashMap String CharSet posixAscii = HashMap.fromList [ ("alnum", alnum) , ("alpha", alpha) , ("ascii", ascii) , ("blank", blank) , ("cntrl", cntrl) , ("digit", digit) , ("graph", graph) , ("print", print) , ("word", word) , ("punct", punct) , ("space", space) , ("upper", upper) , ("lower", lower) , ("xdigit", xdigit) ] lookupPosixAsciiCharSet :: String -> Maybe CharSet lookupPosixAsciiCharSet s = HashMap.lookup (Prelude.map toLower s) posixAscii charset-0.3.7.1/src/Data/CharSet/Posix/Unicode.hs0000644000000000000000000000410612507023225017513 0ustar0000000000000000----------------------------------------------------------------------------- -- | -- Module : Data.CharSet.Posix.Unicode -- Copyright : (c) Edward Kmett 2010 -- License : BSD3 -- Maintainer : ekmett@gmail.com -- Stability : experimental -- Portability : portable -- ------------------------------------------------------------------------------- module Data.CharSet.Posix.Unicode ( posixUnicode , lookupPosixUnicodeCharSet -- * POSIX ASCII \"classes\" , alnum, alpha, ascii, blank, cntrl, digit, graph, print, word, punct, space, upper, lower, xdigit ) where import Prelude hiding (print) import Data.Char import Data.CharSet import qualified Data.CharSet.Unicode.Category as Category import qualified Data.CharSet.Unicode.Block as Block import Data.HashMap.Lazy (HashMap) import qualified Data.HashMap.Lazy as HashMap alnum, alpha, ascii, blank, cntrl, digit, graph, print, word, punct, space, upper, lower, xdigit :: CharSet alnum = alpha `union` digit ascii = Block.basicLatin alpha = Category.letterAnd blank = insert '\t' Category.space cntrl = Category.control digit = Category.decimalNumber lower = Category.lowercaseLetter upper = Category.uppercaseLetter graph = complement (Category.separator `union` Category.other) print = complement (Category.other) word = Category.letter `union` Category.number `union` Category.connectorPunctuation punct = Category.punctuation `union` Category.symbol space = fromList " \t\r\n\v\f" `union` Category.separator xdigit = digit `union` range 'a' 'f' `union` range 'A' 'F' -- :digit:, etc. posixUnicode :: HashMap String CharSet posixUnicode = HashMap.fromList [ ("alnum", alnum) , ("alpha", alpha) , ("ascii", ascii) , ("blank", blank) , ("cntrl", cntrl) , ("digit", digit) , ("graph", graph) , ("print", print) , ("word", word) , ("punct", punct) , ("space", space) , ("upper", upper) , ("lower", lower) , ("xdigit", xdigit) ] lookupPosixUnicodeCharSet :: String -> Maybe CharSet lookupPosixUnicodeCharSet s = HashMap.lookup (Prelude.map toLower s) posixUnicode charset-0.3.7.1/src/Data/CharSet/Unicode/0000755000000000000000000000000012507023225016054 5ustar0000000000000000charset-0.3.7.1/src/Data/CharSet/Unicode/Block.hs0000644000000000000000000003215212507023225017445 0ustar0000000000000000{-# LANGUAGE DeriveDataTypeable #-} {-# OPTIONS_GHC -fno-warn-missing-signatures #-} ----------------------------------------------------------------------------- -- | -- Module : Data.CharSet.Unicode.Block -- Copyright : (c) Edward Kmett 2010-2011 -- License : BSD3 -- Maintainer : ekmett@gmail.com -- Stability : experimental -- Portability : portable -- -- Provides unicode general categories, which are typically connoted by -- @\p{InBasicLatin}@ or @\p{InIPA_Extensions}@. Lookups can be constructed using 'categories' -- or individual character sets can be used directly. ------------------------------------------------------------------------------- module Data.CharSet.Unicode.Block ( -- * Unicode General Category Block(..) -- * Lookup , blocks , lookupBlock , lookupBlockCharSet -- * CharSets by Block , basicLatin , latin1Supplement , latinExtendedA , latinExtendedB , ipaExtensions , spacingModifierLetters , combiningDiacriticalMarks , greekAndCoptic , cyrillic , cyrillicSupplementary , armenian , hebrew , arabic , syriac , thaana , devanagari , bengali , gurmukhi , gujarati , oriya , tamil , telugu , kannada , malayalam , sinhala , thai , lao , tibetan , myanmar , georgian , hangulJamo , ethiopic , cherokee , unifiedCanadianAboriginalSyllabics , ogham , runic , tagalog , hanunoo , buhid , tagbanwa , khmer , mongolian , limbu , taiLe , khmerSymbols , phoneticExtensions , latinExtendedAdditional , greekExtended , generalPunctuation , superscriptsAndSubscripts , currencySymbols , combiningDiacriticalMarksForSymbols , letterlikeSymbols , numberForms , arrows , mathematicalOperators , miscellaneousTechnical , controlPictures , opticalCharacterRecognition , enclosedAlphanumerics , boxDrawing , blockElements , geometricShapes , miscellaneousSymbols , dingbats , miscellaneousMathematicalSymbolsA , supplementalArrowsA , braillePatterns , supplementalArrowsB , miscellaneousMathematicalSymbolsB , supplementalMathematicalOperators , miscellaneousSymbolsAndArrows , cjkRadicalsSupplement , kangxiRadicals , ideographicDescriptionCharacters , cjkSymbolsAndPunctuation , hiragana , katakana , bopomofo , hangulCompatibilityJamo , kanbun , bopomofoExtended , katakanaPhoneticExtensions , enclosedCjkLettersAndMonths , cjkCompatibility , cjkUnifiedIdeographsExtensionA , yijingHexagramSymbols , cjkUnifiedIdeographs , yiSyllables , yiRadicals , hangulSyllables , highSurrogates , highPrivateUseSurrogates , lowSurrogates , privateUseArea , cjkCompatibilityIdeographs , alphabeticPresentationForms , arabicPresentationFormsA , variationSelectors , combiningHalfMarks , cjkCompatibilityForms , smallFormVariants , arabicPresentationFormsB , halfwidthAndFullwidthForms , specials ) where import Data.Char import Data.CharSet import Data.Data import Data.HashMap.Lazy (HashMap) import qualified Data.HashMap.Lazy as HashMap data Block = Block { blockName :: String , blockCharSet :: CharSet } deriving (Show, Data, Typeable) blocks :: [Block] blocks = [ Block "Basic_Latin" basicLatin , Block "Latin-1_Supplement" latin1Supplement , Block "Latin_Extended-A" latinExtendedA , Block "Latin_Extended-B" latinExtendedB , Block "IPA_Extensions" ipaExtensions , Block "Spacing_Modifier_Letters" spacingModifierLetters , Block "Combining_Diacritical_Marks" combiningDiacriticalMarks , Block "Greek_and_Coptic" greekAndCoptic , Block "Cyrillic" cyrillic , Block "Cyrillic_Supplementary" cyrillicSupplementary , Block "Armenian" armenian , Block "Hebrew" hebrew , Block "Arabic" arabic , Block "Syriac" syriac , Block "Thaana" thaana , Block "Devanagari" devanagari , Block "Bengali" bengali , Block "Gurmukhi" gurmukhi , Block "Gujarati" gujarati , Block "Oriya" oriya , Block "Tamil" tamil , Block "Telugu" telugu , Block "Kannada" kannada , Block "Malayalam" malayalam , Block "Sinhala" sinhala , Block "Thai" thai , Block "Lao" lao , Block "Tibetan" tibetan , Block "Myanmar" myanmar , Block "Georgian" georgian , Block "Hangul_Jamo" hangulJamo , Block "Ethiopic" ethiopic , Block "Cherokee" cherokee , Block "Unified_Canadian_Aboriginal_Syllabics" unifiedCanadianAboriginalSyllabics , Block "Ogham" ogham , Block "Runic" runic , Block "Tagalog" tagalog , Block "Hanunoo" hanunoo , Block "Buhid" buhid , Block "Tagbanwa" tagbanwa , Block "Khmer" khmer , Block "Mongolian" mongolian , Block "Limbu" limbu , Block "Tai_Le" taiLe , Block "Khmer_Symbols" khmerSymbols , Block "Phonetic_Extensions" phoneticExtensions , Block "Latin_Extended_Additional" latinExtendedAdditional , Block "Greek_Extended" greekExtended , Block "General_Punctuation" generalPunctuation , Block "Superscripts_and_Subscripts" superscriptsAndSubscripts , Block "Currency_Symbols" currencySymbols , Block "Combining_Diacritical_Marks_for_Symbols" combiningDiacriticalMarksForSymbols , Block "Letterlike_Symbols" letterlikeSymbols , Block "Number_Forms" numberForms , Block "Arrows" arrows , Block "Mathematical_Operators" mathematicalOperators , Block "Miscellaneous_Technical" miscellaneousTechnical , Block "Control_Pictures" controlPictures , Block "Optical_Character_Recognition" opticalCharacterRecognition , Block "Enclosed_Alphanumerics" enclosedAlphanumerics , Block "Box_Drawing" boxDrawing , Block "Block_Elements" blockElements , Block "Geometric_Shapes" geometricShapes , Block "Miscellaneous_Symbols" miscellaneousSymbols , Block "Dingbats" dingbats , Block "Miscellaneous_Mathematical_Symbols-A" miscellaneousMathematicalSymbolsA , Block "Supplemental_Arrows-A" supplementalArrowsA , Block "Braille_Patterns" braillePatterns , Block "Supplemental_Arrows-B" supplementalArrowsB , Block "Miscellaneous_Mathematical_Symbols-B" miscellaneousMathematicalSymbolsB , Block "Supplemental_Mathematical_Operators" supplementalMathematicalOperators , Block "Miscellaneous_Symbols_and_Arrows" miscellaneousSymbolsAndArrows , Block "CJK_Radicals_Supplement" cjkRadicalsSupplement , Block "Kangxi_Radicals" kangxiRadicals , Block "Ideographic_Description_Characters" ideographicDescriptionCharacters , Block "CJK_Symbols_and_Punctuation" cjkSymbolsAndPunctuation , Block "Hiragana" hiragana , Block "Katakana" katakana , Block "Bopomofo" bopomofo , Block "Hangul_Compatibility_Jamo" hangulCompatibilityJamo , Block "Kanbun" kanbun , Block "Bopomofo_Extended" bopomofoExtended , Block "Katakana_Phonetic_Extensions" katakanaPhoneticExtensions , Block "Enclosed_CJK_Letters_and_Months" enclosedCjkLettersAndMonths , Block "CJK_Compatibility" cjkCompatibility , Block "CJK_Unified_Ideographs_Extension_A" cjkUnifiedIdeographsExtensionA , Block "Yijing_Hexagram_Symbols" yijingHexagramSymbols , Block "CJK_Unified_Ideographs" cjkUnifiedIdeographs , Block "Yi_Syllables" yiSyllables , Block "Yi_Radicals" yiRadicals , Block "Hangul_Syllables" hangulSyllables , Block "High_Surrogates" highSurrogates , Block "High_Private_Use_Surrogates" highPrivateUseSurrogates , Block "Low_Surrogates" lowSurrogates , Block "Private_Use_Area" privateUseArea , Block "CJK_Compatibility_Ideographs" cjkCompatibilityIdeographs , Block "Alphabetic_Presentation_Forms" alphabeticPresentationForms , Block "Arabic_Presentation_Forms-A" arabicPresentationFormsA , Block "Variation_Selectors" variationSelectors , Block "Combining_Half_Marks" combiningHalfMarks , Block "CJK_Compatibility_Forms" cjkCompatibilityForms , Block "Small_Form_Variants" smallFormVariants , Block "Arabic_Presentation_Forms-B" arabicPresentationFormsB , Block "Halfwidth_and_Fullwidth_Forms" halfwidthAndFullwidthForms , Block "Specials" specials ] lookupTable :: HashMap String Block lookupTable = HashMap.fromList $ Prelude.map (\y@(Block x _) -> (canonicalize x, y)) blocks canonicalize :: String -> String canonicalize s = case Prelude.map toLower s of 'i': 'n' : xs -> go xs xs -> go xs where go ('-':xs) = go xs go ('_':xs) = go xs go (' ':xs) = go xs go (x:xs) = x : go xs go [] = [] lookupBlock :: String -> Maybe Block lookupBlock s = HashMap.lookup (canonicalize s) lookupTable lookupBlockCharSet :: String -> Maybe CharSet lookupBlockCharSet = fmap blockCharSet . lookupBlock basicLatin = range '\x0000' '\x007f' latin1Supplement = range '\x0080' '\x00ff' latinExtendedA = range '\x0100' '\x017F' latinExtendedB = range '\x0180' '\x024F' ipaExtensions = range '\x0250' '\x02AF' spacingModifierLetters = range '\x02B0' '\x02FF' combiningDiacriticalMarks = range '\x0300' '\x036F' greekAndCoptic = range '\x0370' '\x03FF' cyrillic = range '\x0400' '\x04FF' cyrillicSupplementary = range '\x0500' '\x052F' armenian = range '\x0530' '\x058F' hebrew = range '\x0590' '\x05FF' arabic = range '\x0600' '\x06FF' syriac = range '\x0700' '\x074F' thaana = range '\x0780' '\x07BF' devanagari = range '\x0900' '\x097F' bengali = range '\x0980' '\x09FF' gurmukhi = range '\x0A00' '\x0A7F' gujarati = range '\x0A80' '\x0AFF' oriya = range '\x0B00' '\x0B7F' tamil = range '\x0B80' '\x0BFF' telugu = range '\x0C00' '\x0C7F' kannada = range '\x0C80' '\x0CFF' malayalam = range '\x0D00' '\x0D7F' sinhala = range '\x0D80' '\x0DFF' thai = range '\x0E00' '\x0E7F' lao = range '\x0E80' '\x0EFF' tibetan = range '\x0F00' '\x0FFF' myanmar = range '\x1000' '\x109F' georgian = range '\x10A0' '\x10FF' hangulJamo = range '\x1100' '\x11FF' ethiopic = range '\x1200' '\x137F' cherokee = range '\x13A0' '\x13FF' unifiedCanadianAboriginalSyllabics = range '\x1400' '\x167F' ogham = range '\x1680' '\x169F' runic = range '\x16A0' '\x16FF' tagalog = range '\x1700' '\x171F' hanunoo = range '\x1720' '\x173F' buhid = range '\x1740' '\x175F' tagbanwa = range '\x1760' '\x177F' khmer = range '\x1780' '\x17FF' mongolian = range '\x1800' '\x18AF' limbu = range '\x1900' '\x194F' taiLe = range '\x1950' '\x197F' khmerSymbols = range '\x19E0' '\x19FF' phoneticExtensions = range '\x1D00' '\x1D7F' latinExtendedAdditional = range '\x1E00' '\x1EFF' greekExtended = range '\x1F00' '\x1FFF' generalPunctuation = range '\x2000' '\x206F' superscriptsAndSubscripts = range '\x2070' '\x209F' currencySymbols = range '\x20A0' '\x20CF' combiningDiacriticalMarksForSymbols = range '\x20D0' '\x20FF' letterlikeSymbols = range '\x2100' '\x214F' numberForms = range '\x2150' '\x218F' arrows = range '\x2190' '\x21FF' mathematicalOperators = range '\x2200' '\x22FF' miscellaneousTechnical = range '\x2300' '\x23FF' controlPictures = range '\x2400' '\x243F' opticalCharacterRecognition = range '\x2440' '\x245F' enclosedAlphanumerics = range '\x2460' '\x24FF' boxDrawing = range '\x2500' '\x257F' blockElements = range '\x2580' '\x259F' geometricShapes = range '\x25A0' '\x25FF' miscellaneousSymbols = range '\x2600' '\x26FF' dingbats = range '\x2700' '\x27BF' miscellaneousMathematicalSymbolsA = range '\x27C0' '\x27EF' supplementalArrowsA = range '\x27F0' '\x27FF' braillePatterns = range '\x2800' '\x28FF' supplementalArrowsB = range '\x2900' '\x297F' miscellaneousMathematicalSymbolsB = range '\x2980' '\x29FF' supplementalMathematicalOperators = range '\x2A00' '\x2AFF' miscellaneousSymbolsAndArrows = range '\x2B00' '\x2BFF' cjkRadicalsSupplement = range '\x2E80' '\x2EFF' kangxiRadicals = range '\x2F00' '\x2FDF' ideographicDescriptionCharacters = range '\x2FF0' '\x2FFF' cjkSymbolsAndPunctuation = range '\x3000' '\x303F' hiragana = range '\x3040' '\x309F' katakana = range '\x30A0' '\x30FF' bopomofo = range '\x3100' '\x312F' hangulCompatibilityJamo = range '\x3130' '\x318F' kanbun = range '\x3190' '\x319F' bopomofoExtended = range '\x31A0' '\x31BF' katakanaPhoneticExtensions = range '\x31F0' '\x31FF' enclosedCjkLettersAndMonths = range '\x3200' '\x32FF' cjkCompatibility = range '\x3300' '\x33FF' cjkUnifiedIdeographsExtensionA = range '\x3400' '\x4DBF' yijingHexagramSymbols = range '\x4DC0' '\x4DFF' cjkUnifiedIdeographs = range '\x4E00' '\x9FFF' yiSyllables = range '\xA000' '\xA48F' yiRadicals = range '\xA490' '\xA4CF' hangulSyllables = range '\xAC00' '\xD7AF' highSurrogates = range '\xD800' '\xDB7F' highPrivateUseSurrogates = range '\xDB80' '\xDBFF' lowSurrogates = range '\xDC00' '\xDFFF' privateUseArea = range '\xE000' '\xF8FF' cjkCompatibilityIdeographs = range '\xF900' '\xFAFF' alphabeticPresentationForms = range '\xFB00' '\xFB4F' arabicPresentationFormsA = range '\xFB50' '\xFDFF' variationSelectors = range '\xFE00' '\xFE0F' combiningHalfMarks = range '\xFE20' '\xFE2F' cjkCompatibilityForms = range '\xFE30' '\xFE4F' smallFormVariants = range '\xFE50' '\xFE6F' arabicPresentationFormsB = range '\xFE70' '\xFEFF' halfwidthAndFullwidthForms = range '\xFF00' '\xFFEF' specials = range '\xFFF0' '\xFFFF' charset-0.3.7.1/src/Data/CharSet/Unicode/Category.hs0000644000000000000000000002252212507023225020170 0ustar0000000000000000{-# LANGUAGE DeriveDataTypeable #-} ----------------------------------------------------------------------------- -- | -- Module : Data.CharSet.Unicode.Category -- Copyright : (c) Edward Kmett 2010-2012 -- License : BSD3 -- Maintainer : ekmett@gmail.com -- Stability : experimental -- Portability : DeriveDataTypeable -- -- Provides unicode general categories, which are typically connoted by -- @\\p{Ll}@ or @\\p{Modifier_Letter}@. Lookups can be constructed using 'categories' -- or individual character sets can be used directly. -- -- A case, @_@ and @-@ insensitive lookup is provided by 'lookupCategory' -- and can be used to provide behavior similar to that of Perl or PCRE. ------------------------------------------------------------------------------- module Data.CharSet.Unicode.Category ( -- * Unicode General Category Category(..) -- * Lookup , categories , lookupCategory , lookupCategoryCharSet -- * CharSets by Category -- ** Letter , modifierLetter, otherLetter, letter -- *** Letter\& , lowercaseLetter, uppercaseLetter, titlecaseLetter, letterAnd -- ** Mark , nonSpacingMark, spacingCombiningMark, enclosingMark, mark -- ** Separator , space, lineSeparator, paragraphSeparator, separator -- ** Symbol , mathSymbol, currencySymbol, modifierSymbol, otherSymbol, symbol -- ** Number , decimalNumber, letterNumber, otherNumber, number -- ** Punctuation , dashPunctuation, openPunctuation, closePunctuation, initialQuote , finalQuote, connectorPunctuation, otherPunctuation, punctuation -- ** Other , control, format, privateUse, surrogate, notAssigned, other ) where import Data.Char import Data.CharSet import Data.Data import Data.HashMap.Lazy (HashMap) import qualified Data.HashMap.Lazy as HashMap data Category = Category { categoryName :: String , categoryAbbreviation :: String , categoryCharSet :: CharSet , categoryDescription :: String } deriving (Show, Data, Typeable) -- \p{Letter} or \p{Mc} categories :: [Category] categories = [ Category "Letter" "L" letter "any kind of letter from any language." , Category "Lowercase_Letter" "Ll" lowercaseLetter "a lowercase letter that has an uppercase variant" , Category "Uppercase_Letter" "Lu" uppercaseLetter "an uppercase letter that has a lowercase variant" , Category "Titlecase_Letter" "Lt" titlecaseLetter "a letter that appears at the start of a word when only the first letter of the word is capitalized" , Category "Letter&" "L&" letterAnd "a letter that exists in lowercase and uppercase variants (combination of Ll, Lu and Lt)" , Category "Modifier_Letter" "Lm" modifierLetter "a special character that is used like a letter" , Category "Other_Letter" "Lo" otherLetter "a letter or ideograph that does not have lowercase and uppercase variants" , Category "Mark" "M" mark "a character intended to be combined with another character (e.g. accents, umlauts, enclosing boxes, etc.)" , Category "Non_Spacing_Mark" "Mn" nonSpacingMark "a character intended to be combined with another character without taking up extra space (e.g. accents, umlauts, etc.)" , Category "Spacing_Combining_Mark" "Mc" spacingCombiningMark "a character intended to be combined with another character that takes up extra space (vowel signs in many Eastern languages)" , Category "Enclosing_Mark" "Me" enclosingMark "a character that encloses the character is is combined with (circle, square, keycap, etc.)" , Category "Separator" "Z" separator "any kind of whitespace or invisible separator" , Category "Space_Separator" "Zs" space "a whitespace character that is invisible, but does take up space" , Category "Line_Separator" "Zl" lineSeparator "line separator character U+2028" , Category "Paragraph_Separator" "Zp" paragraphSeparator "paragraph separator character U+2029" , Category "Symbol" "S" symbol "math symbols, currency signs, dingbats, box-drawing characters, etc." , Category "Math_Symbol" "Sm" mathSymbol "any mathematical symbol" , Category "Currency_Symbol" "Sc" currencySymbol "any currency sign" , Category "Modifier_Symbol" "Sk" modifierSymbol "a combining character (mark) as a full character on its own" , Category "Other_Symbol" "So" otherSymbol "various symbols that are not math symbols, currency signs, or combining characters" , Category "Number" "N" number "any kind of numeric character in any script" , Category "Decimal_Digit_Number" "Nd" decimalNumber "a digit zero through nine in any script except ideographic scripts" , Category "Letter_Number" "Nl" letterNumber "a number that looks like a letter, such as a Roman numeral" , Category "Other_Number" "No" otherNumber "a superscript or subscript digit, or a number that is not a digit 0..9 (excluding numbers from ideographic scripts)" , Category "Punctuation" "P" punctuation "any kind of punctuation character" , Category "Dash_Punctuation" "Pd" dashPunctuation "any kind of hyphen or dash" , Category "Open_Punctuation" "Ps" openPunctuation "any kind of opening bracket" , Category "Close_Punctuation" "Pe" closePunctuation "any kind of closing bracket" , Category "Initial_Punctuation" "Pi" initialQuote "any kind of opening quote" , Category "Final_Punctuation" "Pf" finalQuote "any kind of closing quote" , Category "Connector_Punctuation" "Pc" connectorPunctuation "a punctuation character such as an underscore that connects words" , Category "Other_Punctuation" "Po" otherPunctuation "any kind of punctuation character that is not a dash, bracket, quote or connector" , Category "Other" "C" other "invisible control characters and unused code points" , Category "Control" "Cc" control "an ASCII 0x00..0x1F or Latin-1 0x80..0x9F control character" , Category "Format" "Cf" format "invisible formatting indicator" , Category "Private_Use" "Co" privateUse "any code point reserved for private use" , Category "Surrogate" "Cs" surrogate "one half of a surrogate pair in UTF-16 encoding" , Category "Unassigned" "Cn" notAssigned "any code point to which no character has been assigned.properties" ] lookupTable :: HashMap String Category lookupTable = HashMap.fromList [ (canonicalize x, category) | category@(Category l s _ _) <- categories , x <- [l,s] ] lookupCategory :: String -> Maybe Category lookupCategory s = HashMap.lookup (canonicalize s) lookupTable lookupCategoryCharSet :: String -> Maybe CharSet lookupCategoryCharSet = fmap categoryCharSet . lookupCategory canonicalize :: String -> String canonicalize s = case Prelude.map toLower s of 'i' : 's' : xs -> go xs xs -> go xs where go ('-':xs) = go xs go ('_':xs) = go xs go (' ':xs) = go xs go (x:xs) = x : go xs go [] = [] cat :: GeneralCategory -> CharSet cat category = build ((category ==) . generalCategory) -- Letter lowercaseLetter, uppercaseLetter, titlecaseLetter, letterAnd, modifierLetter, otherLetter, letter :: CharSet lowercaseLetter = cat LowercaseLetter uppercaseLetter = cat UppercaseLetter titlecaseLetter = cat TitlecaseLetter letterAnd = lowercaseLetter `union` uppercaseLetter `union` titlecaseLetter modifierLetter = cat ModifierLetter otherLetter = cat OtherLetter letter = letterAnd `union` modifierLetter `union` otherLetter -- Marks nonSpacingMark, spacingCombiningMark, enclosingMark, mark :: CharSet nonSpacingMark = cat NonSpacingMark spacingCombiningMark = cat SpacingCombiningMark enclosingMark = cat EnclosingMark mark = nonSpacingMark `union` spacingCombiningMark `union` enclosingMark space, lineSeparator, paragraphSeparator, separator :: CharSet space = cat Space lineSeparator = cat LineSeparator paragraphSeparator = cat ParagraphSeparator separator = space `union` lineSeparator `union` paragraphSeparator mathSymbol, currencySymbol, modifierSymbol, otherSymbol, symbol :: CharSet mathSymbol = cat MathSymbol currencySymbol = cat CurrencySymbol modifierSymbol = cat ModifierSymbol otherSymbol = cat OtherSymbol symbol = mathSymbol `union` currencySymbol `union` modifierSymbol `union` otherSymbol decimalNumber, letterNumber, otherNumber, number :: CharSet decimalNumber = cat DecimalNumber letterNumber = cat LetterNumber otherNumber = cat OtherNumber number = decimalNumber `union` letterNumber `union` otherNumber dashPunctuation, openPunctuation, closePunctuation, initialQuote, finalQuote, connectorPunctuation, otherPunctuation, punctuation :: CharSet dashPunctuation = cat DashPunctuation openPunctuation = cat OpenPunctuation closePunctuation = cat ClosePunctuation initialQuote = cat InitialQuote finalQuote = cat FinalQuote connectorPunctuation = cat ConnectorPunctuation otherPunctuation = cat OtherPunctuation punctuation = dashPunctuation `union` openPunctuation `union` closePunctuation `union` initialQuote `union` finalQuote `union` connectorPunctuation `union` otherPunctuation control, format, privateUse, surrogate, notAssigned, other :: CharSet control = cat Control format = cat Format privateUse = cat PrivateUse surrogate = cat Surrogate notAssigned = cat NotAssigned other = control `union` format `union` privateUse `union` surrogate `union` notAssigned