text-icu-0.8.0.4/0000755000000000000000000000000007346545000011614 5ustar0000000000000000text-icu-0.8.0.4/Data/Text/0000755000000000000000000000000007346545000013411 5ustar0000000000000000text-icu-0.8.0.4/Data/Text/ICU.hs0000644000000000000000000001521607346545000014372 0ustar0000000000000000{-# LANGUAGE CPP, NoImplicitPrelude #-} -- | -- Module : Data.Text.ICU -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Commonly used functions for Unicode, implemented as bindings to the -- International Components for Unicode (ICU) libraries. -- -- This module contains only the most commonly used types and -- functions. Other modules in this package expose richer interfaces. module Data.Text.ICU ( -- * Data representation -- $data -- * Types LocaleName(..) -- * Locales , availableLocales -- * Boundary analysis -- $break , Breaker , Break , brkPrefix , brkBreak , brkSuffix , brkStatus , Line(..) , Word(..) , breakCharacter , breakLine , breakSentence , breakWord , breaks , breaksRight -- * Case mapping , toCaseFold , toLower , toUpper -- * Iteration , CharIterator , fromString , fromText , fromUtf8 -- * Normalization -- $compat -- ** Normalize unicode strings , nfc, nfd, nfkc, nfkd, nfkcCasefold -- ** Checks for normalization , quickCheck, isNormalized -- * String comparison -- ** Normalization-sensitive string comparison , CompareOption(..) , compareUnicode -- ** Locale-sensitive string collation -- $collate , Collator , collator , collatorWith , collatorFromRules , collatorFromRulesWith , collate , collateIter , sortKey , uca -- * Regular expressions , MatchOption(..) , ParseError(errError, errLine, errOffset) , Match , Regex , Regular -- ** Construction , regex , regex' -- ** Inspection , pattern -- ** Searching , find , findAll -- ** Match groups -- $group , groupCount , unfold , span , group , prefix , suffix -- * Spoof checking -- $spoof , Spoof , SpoofParams(..) , S.SpoofCheck(..) , S.RestrictionLevel(..) , S.SpoofCheckResult(..) -- ** Construction , spoof , spoofWithParams , spoofFromSource , spoofFromSerialized -- ** String checking , areConfusable , spoofCheck , getSkeleton -- ** Configuration , getChecks , getAllowedLocales , getRestrictionLevel -- ** Persistence , serialize -- * Calendars , Calendar, CalendarType(..), SystemTimeZoneType(..), CalendarField(..), -- ** Construction calendar, -- ** Operations on calendars roll, add, set1, set, get, -- * Number formatting NumberFormatter, numberFormatter, formatIntegral, formatIntegral', formatDouble, formatDouble', -- * Date formatting DateFormatter, FormatStyle(..), DateFormatSymbolType(..), standardDateFormatter, patternDateFormatter, dateSymbols, formatCalendar, ) where import Data.Text.ICU.Break.Pure import Data.Text.ICU.Calendar import Data.Text.ICU.Collate.Pure import Data.Text.ICU.DateFormatter import Data.Text.ICU.Internal import Data.Text.ICU.Iterator import Data.Text.ICU.Locale import Data.Text.ICU.Normalize2 import Data.Text.ICU.NumberFormatter import Data.Text.ICU.Regex.Pure import qualified Data.Text.ICU.Spoof as S import Data.Text.ICU.Spoof.Pure import Data.Text.ICU.Text #if defined(__HADDOCK__) import Data.Text.Foreign import Data.Text (Text) #endif -- $data -- -- The Haskell 'Text' type is implemented as an array in the Haskell -- heap. This means that its location is not pinned; it may be copied -- during a garbage collection pass. ICU, on the other hand, works -- with strings that are allocated in the normal system heap and have -- a fixed address. -- -- To accommodate this need, these bindings use the functions from -- "Data.Text.Foreign" to copy data between the Haskell heap and the -- system heap. The copied strings are still managed automatically, -- but the need to duplicate data does add some performance and memory -- overhead. -- $break -- -- Text boundary analysis is the process of locating linguistic -- boundaries while formatting and handling text. Examples of this -- process include: -- -- * Locating appropriate points to word-wrap text to fit within -- specific margins while displaying or printing. -- -- * Counting characters, words, sentences, or paragraphs. -- -- * Making a list of the unique words in a document. -- -- * Figuring out if a given range of text contains only whole words. -- -- * Capitalizing the first letter of each word. -- -- * Locating a particular unit of the text (For example, finding the -- third word in the document). -- -- The 'Breaker' type was designed to support these kinds of -- tasks. -- -- For the impure boundary analysis API (which is richer, but less -- easy to use than the pure API), see the "Data.Text.ICU.Break" -- module. The impure API supports some uses that may be less -- efficient via the pure API, including: -- -- * Locating the beginning of a word that the user has selected. -- -- * Determining how far to move the text cursor when the user hits an -- arrow key (Some characters require more than one position in the -- text store and some characters in the text store do not display -- at all). -- $collate -- -- For the impure collation API (which is richer, but less easy to -- use than the pure API), see the "Data.Text.ICU.Collate" -- module. -- $group -- -- Capturing groups are numbered starting from zero. Group zero is -- always the entire matching text. Groups greater than zero contain -- the text matching each capturing group in a regular expression. -- $spoof -- -- The 'Spoof' type performs security checks on visually confusable -- (spoof) strings. For the impure spoof checking API (which is -- richer, but less easy to use than the pure API), see the -- "Data.Text.ICU.Spoof" module. -- -- See and -- for detailed information -- about the underlying algorithms and databases used by these functions. -- $formatting -- -- You create a 'NumberFormat' with 'numberFormatter' according to a locale -- and a choice of pre-defined formats. A 'NumberFormat' provides a formatting -- facility that 'format's numbers -- according to the chosen locale. Alternatively create and apply a 'NumberFormat' -- in a single step with 'formatNumber'' (it may be faster to re-use a NumberFormat though). -- See the section \"Patterns\" at -- for further details regarding pattern strings. -- $compat -- See module 'Data.Text.ICU.Normalization2' for the full interface which provides some compatibility with the former API. text-icu-0.8.0.4/Data/Text/ICU/0000755000000000000000000000000007346545000014031 5ustar0000000000000000text-icu-0.8.0.4/Data/Text/ICU/BiDi.hsc0000644000000000000000000001705207346545000015344 0ustar0000000000000000{-# LANGUAGE ForeignFunctionInterface #-} -- | -- Module : Data.Text.ICU.BiDi -- Copyright : (c) 2018 Ondrej Palkovsky -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Implementation of the Unicode Bidirectional Algorithm. See the documentation -- of the libicu library for additional details. -- -- -- /Note/: this module is not thread safe. /Do not/ call the -- functions on one BiDi object from more than one thread! module Data.Text.ICU.BiDi ( BiDi -- ** Basic functions , open , openSized -- ** Set data , setPara , setLine -- ** Access the BiDi object , countParagraphs , getParagraphByIndex , getProcessedLength -- ** Output text , writeReordered , WriteOption(..) -- ** High-level functions , reorderParagraphs ) where #include import Data.Text.ICU.BiDi.Internal import Foreign.Marshal.Utils (with) import Foreign.Storable (peek) import Foreign.Ptr (FunPtr, Ptr) import Data.Int (Int32, Int16) import Data.Text.ICU.Error.Internal (UErrorCode, handleError, handleOverflowError) import Data.Text (Text) import Data.Text.ICU.Internal (UChar, useAsUCharPtr, fromUCharPtr, newICUPtr) import Foreign.C.Types (CInt(..)) import Data.List (foldl') import Data.Bits ((.|.)) import System.IO.Unsafe (unsafePerformIO) import Data.Traversable (for) -- | Allocate a BiDi structure. open :: IO BiDi open = newICUPtr BiDi ubidi_close ubidi_open -- | Allocate a BiDi structure with preallocated memory for internal structures. openSized :: Int32 -- ^ is the maximum text or line length that internal memory will be preallocated for. -- An attempt to associate this object with a longer text will fail, unless this value is 0. -> Int32 -- ^ is the maximum anticipated number of same-level runs that internal memory will be preallocated for. -- An attempt to access visual runs on an object that was not preallocated for as many runs as the text was actually resolved to will fail, unless this value is 0. -> IO BiDi openSized maxlen maxruncount = newICUPtr BiDi ubidi_close $ handleError (ubidi_openSized maxlen maxruncount) -- | Perform the Unicode Bidi algorithm. It is defined in the Unicode Standard Annex #9, version 13, -- also described in The Unicode Standard, Version 4.0. -- This function takes a piece of plain text containing one or more paragraphs, -- with or without externally specified embedding levels from styled text and -- computes the left-right-directionality of each character. setPara :: BiDi -> Text -> Int32 -- ^ specifies the default level for the text; it is typically 0 (LTR) or 1 (RTL) -> IO () setPara bidi t paraLevel = withBiDi bidi $ \bptr -> useAsUCharPtr t $ \sptr slen -> handleError (ubidi_setPara bptr sptr (fromIntegral slen) paraLevel) -- | Sets a BiDi to contain the reordering information, especially the resolved levels, -- for all the characters in a line of text setLine :: BiDi -- ^ the parent paragraph object. It must have been set by a successful call to 'setPara'. -> Int32 -- ^ is the line's first index into the text -> Int32 -- ^ is just behind the line's last index into the text (its last index +1). -> BiDi -- ^ is the object that will now represent a line of the text -> IO () setLine paraBidi start limit lineBidi = withBiDi paraBidi $ \paraptr -> withBiDi lineBidi $ \lineptr -> handleError (ubidi_setLine paraptr start limit lineptr) -- | Get the number of paragraphs. countParagraphs :: BiDi -> IO Int32 countParagraphs bidi = withBiDi bidi ubidi_countParagraphs -- | Get a paragraph, given the index of this paragraph. getParagraphByIndex :: BiDi -> Int32 -- ^ is the number of the paragraph, in the range [0..ubidi_countParagraphs(pBiDi)-1]. -> IO (Int32, Int32) -- ^ index of the first character of the paragraph in the text and limit of the paragraph getParagraphByIndex bidi paraIndex = withBiDi bidi $ \bptr -> with 0 $ \pstart -> with 0 $ \pend -> do handleError (ubidi_getParagraphByIndex bptr paraIndex pstart pend) (,) <$> (fromIntegral <$> peek pstart) <*> (fromIntegral <$> peek pend) -- | Get the length of the source text processed by the last call to 'setPara'. getProcessedLength :: BiDi -> IO Int32 getProcessedLength bidi = withBiDi bidi ubidi_getProcessedLength data WriteOption = DoMirroring -- ^ replace characters with the "mirrored" property in RTL runs by their mirror-image mappings | InsertLrmForNumeric -- ^ surround the run with LRMs if necessary; this is part of the approximate "inverse Bidi" algorithm | KeepBaseCombining -- ^ keep combining characters after their base characters in RTL runs | OutputReverse -- ^ write the output in reverse order | RemoveBidiControls -- ^ remove Bidi control characters (this does not affect InsertLrmForNumeric) deriving (Show) reduceWriteOpts :: [WriteOption] -> Int16 reduceWriteOpts = foldl' orO 0 where a `orO` b = a .|. fromWriteOption b fromWriteOption :: WriteOption -> Int16 fromWriteOption DoMirroring = #const UBIDI_DO_MIRRORING fromWriteOption InsertLrmForNumeric = #const UBIDI_INSERT_LRM_FOR_NUMERIC fromWriteOption KeepBaseCombining = #const UBIDI_KEEP_BASE_COMBINING fromWriteOption OutputReverse = #const UBIDI_OUTPUT_REVERSE fromWriteOption RemoveBidiControls = #const UBIDI_REMOVE_BIDI_CONTROLS -- | Take a BiDi object containing the reordering information for a piece of text -- (one or more paragraphs) set by 'setPara' or for a line of text set by 'setLine' -- and write a reordered string to the destination buffer. writeReordered :: BiDi -> [WriteOption] -> IO Text writeReordered bidi opts = do destLen <- getProcessedLength bidi let options' = reduceWriteOpts opts withBiDi bidi $ \bptr -> handleOverflowError (fromIntegral destLen) (\dptr dlen -> ubidi_writeReordered bptr dptr (fromIntegral dlen) options') (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen)) foreign import ccall unsafe "hs_text_icu.h __hs_ubidi_open" ubidi_open :: IO (Ptr UBiDi) foreign import ccall unsafe "hs_text_icu.h __hs_ubidi_openSized" ubidi_openSized :: Int32 -> Int32 -> Ptr UErrorCode -> IO (Ptr UBiDi) foreign import ccall unsafe "hs_text_icu.h &__hs_ubidi_close" ubidi_close :: FunPtr (Ptr UBiDi -> IO ()) foreign import ccall unsafe "hs_text_icu.h __hs_ubidi_setPara" ubidi_setPara :: Ptr UBiDi -> Ptr UChar -> Int32 -> Int32 -> Ptr UErrorCode -> IO () foreign import ccall unsafe "hs_text_icu.h __hs_ubidi_countParagraphs" ubidi_countParagraphs :: Ptr UBiDi -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ubidi_getParagraphByIndex" ubidi_getParagraphByIndex :: Ptr UBiDi -> Int32 -> Ptr CInt -> Ptr CInt -> Ptr UErrorCode -> IO () foreign import ccall unsafe "hs_text_icu.h __hs_ubidi_getProcessedLength" ubidi_getProcessedLength :: Ptr UBiDi -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ubidi_writeReordered" ubidi_writeReordered :: Ptr UBiDi -> Ptr UChar -> Int32 -> Int16 -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ubidi_setLine" ubidi_setLine :: Ptr UBiDi -> Int32 -> Int32 -> Ptr UBiDi -> Ptr UErrorCode -> IO () -- | Helper function to reorder a text to a series of paragraphs. reorderParagraphs :: [WriteOption] -> Text -> [Text] reorderParagraphs options input = unsafePerformIO $ do bidi <- open setPara bidi input 0 pcount <- countParagraphs bidi lineBidi <- open for [0..pcount-1] $ \pidx -> do (start,limit) <- getParagraphByIndex bidi pidx setLine bidi start limit lineBidi writeReordered lineBidi options text-icu-0.8.0.4/Data/Text/ICU/BiDi/0000755000000000000000000000000007346545000014640 5ustar0000000000000000text-icu-0.8.0.4/Data/Text/ICU/BiDi/Internal.hs0000644000000000000000000000164207346545000016753 0ustar0000000000000000{-# LANGUAGE DeriveDataTypeable, EmptyDataDecls #-} -- | -- Module : Data.Text.ICU.Bidi.Internal -- Copyright : (c) Ondrej Palkovsky 2018 -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Internal types for Unicode bidirectional algorithm module Data.Text.ICU.BiDi.Internal ( BiDi(..) , UBiDi , withBiDi ) where import Data.Typeable (Typeable) import Foreign.ForeignPtr (ForeignPtr, withForeignPtr) import Foreign.Ptr (Ptr) data UBiDi -- | BiDi object. /Note/: this structure is not -- thread safe. It is /not/ safe to use value of this type -- simultaneously from multiple threads. newtype BiDi = BiDi (ForeignPtr UBiDi) deriving (Eq, Typeable) instance Show BiDi where show _ = "BiDi" withBiDi :: BiDi -> (Ptr UBiDi -> IO a) -> IO a {-# INLINE withBiDi #-} withBiDi (BiDi cnv) = withForeignPtr cnv text-icu-0.8.0.4/Data/Text/ICU/BitMask.hs0000644000000000000000000000161407346545000015721 0ustar0000000000000000{-# LANGUAGE DefaultSignatures, ScopedTypeVariables #-} -- From http://stackoverflow.com/a/15911213 module Data.Text.ICU.BitMask ( -- * Bit mask twiddling API -- $api -- * Types ToBitMask(..) -- * Functions , fromBitMask , highestValueInBitMask ) where import Data.Bits ((.&.), (.|.)) import Data.Maybe (listToMaybe) -- $api -- Conversion to and from enumerated types representable as -- a compact bitmask. class ToBitMask a where toBitMask :: a -> Int instance (ToBitMask a) => ToBitMask [a] where toBitMask = foldr ((.|.) . toBitMask) 0 fromBitMask :: (Enum a, Bounded a, ToBitMask a) => Int -> [a] fromBitMask bm = filter inBitMask $ enumFrom minBound where inBitMask val = (bm .&. toBitMask val) == toBitMask val highestValueInBitMask :: (Enum a, Bounded a, ToBitMask a) => Int -> Maybe a highestValueInBitMask = listToMaybe . reverse . fromBitMask text-icu-0.8.0.4/Data/Text/ICU/Break.hsc0000644000000000000000000002643707346545000015570 0ustar0000000000000000{-# LANGUAGE BangPatterns, ForeignFunctionInterface, RecordWildCards #-} -- | -- Module : Data.Text.ICU.Break -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- String breaking functions for Unicode, implemented as bindings to -- the International Components for Unicode (ICU) libraries. -- -- The text boundary positions are found according to the rules described in -- Unicode Standard Annex #29, Text Boundaries, and Unicode Standard Annex -- #14, Line Breaking Properties. These are available at -- and -- . module Data.Text.ICU.Break ( -- * Types BreakIterator , Line(..) , Data.Text.ICU.Break.Word(..) -- * Breaking functions , breakCharacter , breakLine , breakSentence , breakWord , clone , setText -- * Iteration functions -- $indices , current , first , last , next , previous , preceding , following , isBoundary -- * Iterator status , getStatus , getStatuses -- * Locales , available ) where #include import Control.DeepSeq (NFData(..)) import Control.Monad (forM) import Data.IORef (newIORef, writeIORef) import Data.Int (Int32) import Data.Text (Text) import Data.Text.ICU.Break.Types (BreakIterator(..), UBreakIterator) import Data.Text.ICU.Error.Internal (UErrorCode, handleError) import Data.Text.ICU.Internal (LocaleName(..), UBool, UChar, asBool, withLocaleName, TextI, UText, asUTextPtr, withUTextPtr, newICUPtr) import Foreign.C.String (CString, peekCString) import Foreign.C.Types (CInt(..)) import Foreign.ForeignPtr (withForeignPtr) import Foreign.Marshal.Array (allocaArray, peekArray) import Foreign.Marshal.Utils (with) import Foreign.Ptr (FunPtr, Ptr, nullPtr) import Prelude hiding (last) import System.IO.Unsafe (unsafePerformIO) -- $indices -- -- /Important note/: All of the indices accepted and returned by -- functions in this module are offsets into the raw UTF-16 text -- array, /not/ a count of codepoints. -- | Line break status. data Line = Soft -- ^ A soft line break is a position at -- which a line break is acceptable, but not -- required. | Hard deriving (Eq, Show, Enum) instance NFData Line where rnf !_ = () -- | Word break status. data Word = Uncategorized -- ^ A \"word\" that does not fit into another -- category. Includes spaces and most -- punctuation. | Number -- ^ A word that appears to be a number. | Letter -- ^ A word containing letters, excluding -- hiragana, katakana or ideographic -- characters. | Kana -- ^ A word containing kana characters. | Ideograph -- ^ A word containing ideographic characters. deriving (Eq, Show, Enum) instance NFData Data.Text.ICU.Break.Word where rnf !_ = () -- | Break a string on character boundaries. -- -- Character boundary analysis identifies the boundaries of \"Extended -- Grapheme Clusters\", which are groupings of codepoints that should be -- treated as character-like units for many text operations. Please see -- Unicode Standard Annex #29, Unicode Text Segmentation, -- for additional information on -- grapheme clusters and guidelines on their use. breakCharacter :: LocaleName -> Text -> IO (BreakIterator ()) breakCharacter = open (#const UBRK_CHARACTER) (const ()) -- | Break a string on line boundaries. -- -- Line boundary analysis determines where a text string can be broken when -- line wrapping. The mechanism correctly handles punctuation and hyphenated -- words. breakLine :: LocaleName -> Text -> IO (BreakIterator Line) breakLine = open (#const UBRK_LINE) asLine where asLine i | i < (#const UBRK_LINE_SOFT_LIMIT) = Soft | i < (#const UBRK_LINE_HARD_LIMIT) = Hard | otherwise = error $ "unknown line break status " ++ show i -- | Break a string on sentence boundaries. -- -- Sentence boundary analysis allows selection with correct interpretation -- of periods within numbers and abbreviations, and trailing punctuation -- marks such as quotation marks and parentheses. breakSentence :: LocaleName -> Text -> IO (BreakIterator ()) breakSentence = open (#const UBRK_SENTENCE) (const ()) -- | Break a string on word boundaries. -- -- Word boundary analysis is used by search and replace functions, as well -- as within text editing applications that allow the user to select words -- with a double click. Word selection provides correct interpretation of -- punctuation marks within and following words. Characters that are not -- part of a word, such as symbols or punctuation marks, have word breaks on -- both sides. breakWord :: LocaleName -> Text -> IO (BreakIterator Data.Text.ICU.Break.Word) breakWord = open (#const UBRK_WORD) asWord where asWord i | i < (#const UBRK_WORD_NONE_LIMIT) = Uncategorized | i < (#const UBRK_WORD_NUMBER_LIMIT) = Number | i < (#const UBRK_WORD_LETTER_LIMIT) = Letter | i < (#const UBRK_WORD_KANA_LIMIT) = Kana | i < (#const UBRK_WORD_IDEO_LIMIT) = Ideograph | otherwise = error $ "unknown word break status " ++ show i -- | Create a new 'BreakIterator' for locating text boundaries in the -- specified locale. open :: UBreakIteratorType -> (Int32 -> a) -> LocaleName -> Text -> IO (BreakIterator a) open brk f loc t = withLocaleName loc $ \locale -> do r <- newIORef undefined b <- newICUPtr (BR r f) ubrk_close $ handleError $ ubrk_open brk locale nullPtr 0 setText b t return b -- | Point an existing 'BreakIterator' at a new piece of text. setText :: BreakIterator a -> Text -> IO () setText BR{..} t = do fp <- asUTextPtr t withUTextPtr fp $ \ ptr -> do withForeignPtr brIter $ \p -> handleError $ ubrk_setUText p ptr writeIORef brText fp -- | Thread safe cloning operation. This is substantially faster than -- creating a new 'BreakIterator' from scratch. clone :: BreakIterator a -> IO (BreakIterator a) clone BR{..} = newICUPtr (BR brText brStatus) ubrk_close $ withForeignPtr brIter $ \p -> with 1 $ handleError . ubrk_safeClone p nullPtr asIndex :: (Ptr UBreakIterator -> IO Int32) -> BreakIterator a -> IO (Maybe TextI) asIndex act BR{..} = do i <- withForeignPtr brIter act return $! if i == (#const UBRK_DONE) then Nothing else Just $! fromIntegral i -- | Reset the breaker to the beginning of the text to be scanned. first :: BreakIterator a -> IO TextI first BR{..} = fromIntegral `fmap` withForeignPtr brIter ubrk_first -- | Reset the breaker to the end of the text to be scanned. last :: BreakIterator a -> IO TextI last BR{..} = fromIntegral `fmap` withForeignPtr brIter ubrk_last -- | Advance the iterator and break at the text boundary that follows the -- current text boundary. next :: BreakIterator a -> IO (Maybe TextI) next = asIndex ubrk_next -- | Advance the iterator and break at the text boundary that precedes the -- current text boundary. previous :: BreakIterator a -> IO (Maybe TextI) previous = asIndex ubrk_previous -- | Determine the text boundary preceding the specified offset. preceding :: BreakIterator a -> Int -> IO (Maybe TextI) preceding bi i = asIndex (flip ubrk_preceding (fromIntegral i)) bi -- | Determine the text boundary following the specified offset. following :: BreakIterator a -> Int -> IO (Maybe TextI) following bi i = asIndex (flip ubrk_following (fromIntegral i)) bi -- | Return the character index most recently returned by 'next', -- 'previous', 'first', or 'last'. current :: BreakIterator a -> IO (Maybe TextI) current = asIndex ubrk_current -- | Return the status from the break rule that determined the most recently -- returned break position. For rules that do not specify a status, a -- default value of @()@ is returned. getStatus :: BreakIterator a -> IO a getStatus BR{..} = brStatus `fmap` withForeignPtr brIter ubrk_getRuleStatus -- | Return statuses from all of the break rules that determined the most -- recently returned break position. getStatuses :: BreakIterator a -> IO [a] getStatuses BR{..} = withForeignPtr brIter $ \brk -> do n <- handleError $ ubrk_getRuleStatusVec brk nullPtr 0 allocaArray (fromIntegral n) $ \ptr -> do _ <- handleError $ ubrk_getRuleStatusVec brk ptr n map brStatus `fmap` peekArray (fromIntegral n) ptr -- | Determine whether the specified position is a boundary position. -- As a side effect, leaves the iterator pointing to the first -- boundary position at or after the given offset. isBoundary :: BreakIterator a -> Int -> IO Bool isBoundary BR{..} i = asBool `fmap` withForeignPtr brIter (flip ubrk_isBoundary (fromIntegral i)) -- | Locales for which text breaking information is available. A -- 'BreakIterator' in a locale in this list will perform the correct -- text breaking for the locale. available :: [LocaleName] available = unsafePerformIO $ do n <- ubrk_countAvailable forM [0..n-1] $ \i -> ubrk_getAvailable i >>= fmap Locale . peekCString {-# NOINLINE available #-} type UBreakIteratorType = CInt foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_open" ubrk_open :: UBreakIteratorType -> CString -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO (Ptr UBreakIterator) foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_setUText" ubrk_setUText :: Ptr UBreakIterator -> Ptr UText -> Ptr UErrorCode -> IO () foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_safeClone" ubrk_safeClone :: Ptr UBreakIterator -> Ptr a -> Ptr Int32 -> Ptr UErrorCode -> IO (Ptr UBreakIterator) foreign import ccall unsafe "hs_text_icu.h &__hs_ubrk_close" ubrk_close :: FunPtr (Ptr UBreakIterator -> IO ()) foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_current" ubrk_current :: Ptr UBreakIterator -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_first" ubrk_first :: Ptr UBreakIterator -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_last" ubrk_last :: Ptr UBreakIterator -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_next" ubrk_next :: Ptr UBreakIterator -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_previous" ubrk_previous :: Ptr UBreakIterator -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_preceding" ubrk_preceding :: Ptr UBreakIterator -> Int32 -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_following" ubrk_following :: Ptr UBreakIterator -> Int32 -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_getRuleStatus" ubrk_getRuleStatus :: Ptr UBreakIterator -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_getRuleStatusVec" ubrk_getRuleStatusVec :: Ptr UBreakIterator -> Ptr Int32 -> Int32 -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_isBoundary" ubrk_isBoundary :: Ptr UBreakIterator -> Int32 -> IO UBool foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_countAvailable" ubrk_countAvailable :: IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_getAvailable" ubrk_getAvailable :: Int32 -> IO CString text-icu-0.8.0.4/Data/Text/ICU/Break/0000755000000000000000000000000007346545000015055 5ustar0000000000000000text-icu-0.8.0.4/Data/Text/ICU/Break/Pure.hs0000644000000000000000000001120607346545000016324 0ustar0000000000000000{-# LANGUAGE BangPatterns, RecordWildCards, CPP #-} -- | -- Module : Data.Text.ICU.Break.Pure -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- String breaking functions for Unicode, implemented as bindings to -- the International Components for Unicode (ICU) libraries. -- -- The text boundary positions are found according to the rules described in -- Unicode Standard Annex #29, Text Boundaries, and Unicode Standard Annex -- #14, Line Breaking Properties. These are available at -- and -- . module Data.Text.ICU.Break.Pure ( -- * Types Breaker , Break , brkPrefix , brkBreak , brkSuffix , brkStatus , Line(..) , Data.Text.ICU.Break.Word(..) -- * Breaking functions , breakCharacter , breakLine , breakSentence , breakWord -- * Iteration , breaks , breaksRight ) where import Control.DeepSeq (NFData(..)) import Data.Text (Text, empty) import Data.Text.ICU.Break (Line, Word) import Data.Text.ICU.Break.Types (BreakIterator(..)) import Data.Text.ICU.Internal (LocaleName, takeWord, dropWord) import System.IO.Unsafe (unsafeInterleaveIO, unsafePerformIO) import qualified Data.Text.ICU.Break as IO -- | A boundary analyser. newtype Breaker a = B (BreakIterator a) new :: (LocaleName -> Text -> IO (BreakIterator a)) -> LocaleName -> Breaker a new act loc = unsafePerformIO $ B `fmap` act loc empty -- | Break a string on character boundaries. -- -- Character boundary analysis identifies the boundaries of "Extended -- Grapheme Clusters", which are groupings of codepoints that should be -- treated as character-like units for many text operations. Please see -- Unicode Standard Annex #29, Unicode Text Segmentation, -- for additional information on -- grapheme clusters and guidelines on their use. breakCharacter :: LocaleName -> Breaker () breakCharacter = new IO.breakCharacter -- | Break a string on line boundaries. -- -- Line boundary analysis determines where a text string can be broken when -- line wrapping. The mechanism correctly handles punctuation and hyphenated -- words. breakLine :: LocaleName -> Breaker Line breakLine = new IO.breakLine -- | Break a string on sentence boundaries. -- -- Sentence boundary analysis allows selection with correct interpretation -- of periods within numbers and abbreviations, and trailing punctuation -- marks such as quotation marks and parentheses. breakSentence :: LocaleName -> Breaker () breakSentence = new IO.breakSentence -- | Break a string on word boundaries. -- -- Word boundary analysis is used by search and replace functions, as well -- as within text editing applications that allow the user to select words -- with a double click. Word selection provides correct interpretation of -- punctuation marks within and following words. Characters that are not -- part of a word, such as symbols or punctuation marks, have word breaks on -- both sides. breakWord :: LocaleName -> Breaker Data.Text.ICU.Break.Word breakWord = new IO.breakWord -- | A break in a string. data Break a = Break { brkPrefix :: {-# UNPACK #-} !Text -- ^ Prefix of the current break. , brkBreak :: {-# UNPACK #-} !Text -- ^ Text of the current break. , brkSuffix :: {-# UNPACK #-} !Text -- ^ Suffix of the current break. , brkStatus :: !a -- ^ Status of the current break (only meaningful if 'Line' or 'Word'). } deriving (Eq, Show) instance (NFData a) => NFData (Break a) where rnf Break{..} = rnf brkStatus -- | Return a list of all breaks in a string, from left to right. breaks :: Breaker a -> Text -> [Break a] breaks (B b) t = unsafePerformIO $ do bi <- IO.clone b IO.setText bi t let go p = do mix <- IO.next bi case mix of Nothing -> return [] Just n -> do s <- IO.getStatus bi let d = n-p u = dropWord p t (Break (takeWord p t) (takeWord d u) (dropWord d u) s :) `fmap` go n unsafeInterleaveIO $ go =<< IO.first bi -- | Return a list of all breaks in a string, from right to left. breaksRight :: Breaker a -> Text -> [Break a] breaksRight (B b) t = unsafePerformIO $ do bi <- IO.clone b IO.setText bi t let go p = do mix <- IO.previous bi case mix of Nothing -> return [] Just n -> do s <- IO.getStatus bi let d = p-n u = dropWord n t (Break (takeWord n t) (takeWord d u) (dropWord d u) s :) `fmap` go n unsafeInterleaveIO $ go =<< IO.last bi text-icu-0.8.0.4/Data/Text/ICU/Break/Types.hs0000644000000000000000000000121407346545000016513 0ustar0000000000000000{-# LANGUAGE EmptyDataDecls #-} -- | -- Module : Data.Text.ICU.Break.Internal -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC module Data.Text.ICU.Break.Types ( BreakIterator(..) , UBreakIterator ) where import Data.IORef (IORef) import Data.Int (Int32) import Foreign.ForeignPtr (ForeignPtr) import Data.Text.ICU.Internal (UTextPtr) -- A boundary breaker. data BreakIterator a = BR { brText :: IORef UTextPtr , brStatus :: Int32 -> a , brIter :: ForeignPtr UBreakIterator } data UBreakIterator text-icu-0.8.0.4/Data/Text/ICU/Calendar.hsc0000644000000000000000000006214307346545000016247 0ustar0000000000000000{-# LANGUAGE RankNTypes, BangPatterns, ForeignFunctionInterface, RecordWildCards #-} -- | -- Module : Data.Text.ICU.Calendar -- Copyright : (c) 2021 Torsten Kemps-Benedix -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Calendar functions implemented as bindings to -- the International Components for Unicode (ICU) libraries. module Data.Text.ICU.Calendar ( -- * Data Calendar(..), CalendarType(..), SystemTimeZoneType(..), CalendarField(..), UCalendar, -- * High-level interface -- ** Operations on calendars roll, add, set1, set, get, -- ** Calendar field getters era, year, month, dayOfMonth, dayOfYear, dayOfWeek, dayOfWeekInMonth, amPm, hour, hourOfDay, minute, second, millisecond, zoneOffset, dstOffset, yearWoY, doWLocal, extendedYear, julianDay, millisecondsInDay, isLeapMonth, day, utcTime, -- ** Calendar field setters setEra, setYear, setMonth, setDayOfMonth, setDayOfYear, setDayOfWeek, setDayOfWeekInMonth, setAmPm, setHour, setHourOfDay, setMinute, setSecond, setMillisecond, setZoneOffset, setDstOffset, setYearWoY, setDoWLocal, setExtendedYear, setJulianDay, setMillisecondsInDay, setDay, -- ** Lenses _era, _year, _month, _dayOfMonth, _dayOfYear, _dayOfWeek, _dayOfWeekInMonth, _amPm, _hour, _hourOfDay, _minute, _second, _millisecond, _zoneOffset, _dstOffset, _day, -- ** Time zone functions timeZoneIDs, timeZones, -- * Low-level interface calendar, openTimeZoneIDEnumeration, openTimeZones, getField, setField, setDate, setDateTime, rollField, addField, setTimeZone, ) where #include import Control.Monad (forM_) import Data.Int (Int32) import Data.Text (Text, pack) import Data.Text.Foreign (withCStringLen) import Data.Text.ICU.Enumerator import Data.Text.ICU.Error.Internal (UErrorCode, handleError) import Data.Text.ICU.Internal (LocaleName(..), UChar, withLocaleName, newICUPtr, useAsUCharPtr) import qualified Data.Time.Calendar as Cal import qualified Data.Time.Clock as Clock import Foreign.C.String (CString) import Foreign.C.Types (CInt(..)) import Foreign.ForeignPtr (withForeignPtr, ForeignPtr) import Foreign.Ptr (FunPtr, Ptr) import Prelude hiding (last) import System.IO.Unsafe (unsafePerformIO) type UCalendar = CInt -- A 'Calendar' is an abstract data type that contains a foreign pointer to the ICU internal data structure. data Calendar = Calendar {calendarForeignPtr :: ForeignPtr UCalendar} -- | All the fields that comprise a 'Calendar'. data CalendarField = Era -- ^ Field indicating the era, e.g., AD or BC in the Gregorian (Julian) calendar. This is a calendar-specific value. | Year -- ^ Field indicating the year. This is a calendar-specific value. | Month -- ^ Field indicating the month. This is a calendar-specific value. The first month of the year is JANUARY; the last depends on the number of months in a year. Note: Calendar month is 0-based. | WeekOfYear -- ^ Field indicating the week number within the current year. The first week of the year, as defined by UCAL_FIRST_DAY_OF_WEEK and UCAL_MINIMAL_DAYS_IN_FIRST_WEEK attributes, has value 1. Subclasses define the value of UCAL_WEEK_OF_YEAR for days before the first week of the year. | WeekOfMonth -- ^ Field indicating the week number within the current month. The first week of the month, as defined by UCAL_FIRST_DAY_OF_WEEK and UCAL_MINIMAL_DAYS_IN_FIRST_WEEK attributes, has value 1. Subclasses define the value of WEEK_OF_MONTH for days before the first week of the month. | DayOfMonth -- ^ Field indicating the day of the month. This is a synonym for DAY_OF_MONTH. The first day of the month has value 1. | DayOfYear -- ^ Field indicating the day number within the current year. The first day of the year has value 1. | DayOfWeek -- ^ Field indicating the day of the week. This field takes values SUNDAY, MONDAY, TUESDAY, WEDNESDAY, THURSDAY, FRIDAY, and SATURDAY. Note: Calendar day-of-week is 1-based. Clients who create locale resources for the field of first-day-of-week should be aware of this. For instance, in US locale, first-day-of-week is set to 1, i.e., UCAL_SUNDAY. | DayOfWeekInMonth -- ^ Field indicating the ordinal number of the day of the week within the current month. Together with the DAY_OF_WEEK field, this uniquely specifies a day within a month. Unlike WEEK_OF_MONTH and WEEK_OF_YEAR, this field's value does not depend on getFirstDayOfWeek() or getMinimalDaysInFirstWeek(). DAY_OF_MONTH 1 through 7 always correspond to DAY_OF_WEEK_IN_MONTH 1; 8 through 15 correspond to DAY_OF_WEEK_IN_MONTH 2, and so on. DAY_OF_WEEK_IN_MONTH 0 indicates the week before DAY_OF_WEEK_IN_MONTH 1. Negative values count back from the end of the month, so the last Sunday of a month is specified as DAY_OF_WEEK = SUNDAY, DAY_OF_WEEK_IN_MONTH = -1. Because negative values count backward they will usually be aligned differently within the month than positive values. For example, if a month has 31 days, DAY_OF_WEEK_IN_MONTH -1 will overlap DAY_OF_WEEK_IN_MONTH 5 and the end of 4. | AmPm -- ^ Field indicating whether the HOUR is before or after noon. E.g., at 10:04:15.250 PM the AM_PM is PM. | Hour -- ^ Field indicating the hour of the morning or afternoon. HOUR is used for the 12-hour clock. E.g., at 10:04:15.250 PM the HOUR is 10. | HourOfDay -- ^ Field indicating the hour of the day. HOUR_OF_DAY is used for the 24-hour clock. E.g., at 10:04:15.250 PM the HOUR_OF_DAY is 22. | Minute -- ^ Field indicating the minute within the hour. E.g., at 10:04:15.250 PM the UCAL_MINUTE is 4. | Second -- ^ Field indicating the second within the minute. E.g., at 10:04:15.250 PM the UCAL_SECOND is 15. | Millisecond -- ^ Field indicating the millisecond within the second. E.g., at 10:04:15.250 PM the UCAL_MILLISECOND is 250. | ZoneOffset -- ^ Field indicating the raw offset from GMT in milliseconds. | DstOffset -- ^ Field indicating the daylight savings offset in milliseconds. | YearWoY -- ^ Field indicating the extended year corresponding to the UCAL_WEEK_OF_YEAR field. This may be one greater or less than the value of UCAL_EXTENDED_YEAR. | DoWLocal -- ^ Field indicating the localized day of week. This will be a value from 1 to 7 inclusive, with 1 being the localized first day of the week. | ExtendedYear -- ^ Year of this calendar system, encompassing all supra-year fields. For example, in Gregorian/Julian calendars, positive Extended Year values indicate years AD, 1 BC = 0 extended, 2 BC = -1 extended, and so on. | JulianDay -- ^ Field indicating the modified Julian day number. This is different from the conventional Julian day number in two regards. First, it demarcates days at local zone midnight, rather than noon GMT. Second, it is a local number; that is, it depends on the local time zone. It can be thought of as a single number that encompasses all the date-related fields. | MillisecondsInDay -- ^ Ranges from 0 to 23:59:59.999 (regardless of DST). This field behaves exactly like a composite of all time-related fields, not including the zone fields. As such, it also reflects discontinuities of those fields on DST transition days. On a day of DST onset, it will jump forward. On a day of DST cessation, it will jump backward. This reflects the fact that it must be combined with the DST_OFFSET field to obtain a unique local time value. | IsLeapMonth -- ^ Whether or not the current month is a leap month (0 or 1). See the Chinese calendar for an example of this. deriving (Show, Read, Eq) type UCalendarDateFields = CInt toUCalendarDateFields :: CalendarField -> UCalendarDateFields toUCalendarDateFields Era = #const UCAL_ERA toUCalendarDateFields Year = #const UCAL_YEAR toUCalendarDateFields Month = #const UCAL_MONTH toUCalendarDateFields WeekOfYear = #const UCAL_WEEK_OF_YEAR toUCalendarDateFields WeekOfMonth = #const UCAL_WEEK_OF_MONTH toUCalendarDateFields DayOfMonth = #const UCAL_DAY_OF_MONTH toUCalendarDateFields DayOfYear = #const UCAL_DAY_OF_YEAR toUCalendarDateFields DayOfWeek = #const UCAL_DAY_OF_WEEK toUCalendarDateFields DayOfWeekInMonth = #const UCAL_DAY_OF_WEEK_IN_MONTH toUCalendarDateFields AmPm = #const UCAL_AM_PM toUCalendarDateFields Hour = #const UCAL_HOUR toUCalendarDateFields HourOfDay = #const UCAL_HOUR_OF_DAY toUCalendarDateFields Minute = #const UCAL_MINUTE toUCalendarDateFields Second = #const UCAL_SECOND toUCalendarDateFields Millisecond = #const UCAL_MILLISECOND toUCalendarDateFields ZoneOffset = #const UCAL_ZONE_OFFSET toUCalendarDateFields DstOffset = #const UCAL_DST_OFFSET toUCalendarDateFields YearWoY = #const UCAL_YEAR_WOY toUCalendarDateFields DoWLocal = #const UCAL_DOW_LOCAL toUCalendarDateFields ExtendedYear = #const UCAL_EXTENDED_YEAR toUCalendarDateFields JulianDay = #const UCAL_JULIAN_DAY toUCalendarDateFields MillisecondsInDay = #const UCAL_MILLISECONDS_IN_DAY toUCalendarDateFields IsLeapMonth = #const UCAL_IS_LEAP_MONTH data CalendarType = TraditionalCalendarType | DefaultCalendarType | GregorianCalendarType deriving (Show, Read, Eq) type UCalendarType = CInt toUCalendarType :: CalendarType -> UCalendarType toUCalendarType TraditionalCalendarType = #const UCAL_TRADITIONAL toUCalendarType DefaultCalendarType = #const UCAL_TRADITIONAL toUCalendarType GregorianCalendarType = #const UCAL_GREGORIAN data SystemTimeZoneType = AnyTimeZone | CanonicalTimeZone | CanonicalLocationTimeZone deriving (Show, Read, Eq) toUSystemTimeZoneType :: SystemTimeZoneType -> USystemTimeZoneType toUSystemTimeZoneType AnyTimeZone = #const UCAL_ZONE_TYPE_ANY toUSystemTimeZoneType CanonicalTimeZone = #const UCAL_ZONE_TYPE_CANONICAL toUSystemTimeZoneType CanonicalLocationTimeZone = #const UCAL_ZONE_TYPE_CANONICAL_LOCATION type USystemTimeZoneType = CInt -- | Open a Calendar. -- -- A Calendar may be used to convert a millisecond value to a year, -- month, and day. -- -- Note: When unknown TimeZone ID is specified or if the TimeZone ID -- specified is "Etc/Unknown", the Calendar returned by the function -- is initialized with GMT zone with TimeZone ID UCAL_UNKNOWN_ZONE_ID -- ("Etc/Unknown") without any errors/warnings. If you want to check -- if a TimeZone ID is valid prior to this function, use -- ucal_getCanonicalTimeZoneID. -- -- >>> import qualified Data.Text as T -- >>> c <- calendar (T.pack "CET") (Locale "de_DE") TraditionalCalendarType -- >>> show c -- 2021-10-12 17:37:43 calendar :: Text -> LocaleName -> CalendarType -> IO Calendar calendar zoneId loc typ = withLocaleName loc $ \locale -> useAsUCharPtr zoneId $ \zoneIdPtr zoneIdLen -> newICUPtr Calendar ucal_close $ handleError $ ucal_open zoneIdPtr (fromIntegral zoneIdLen) locale (toUCalendarType typ) setTimeZone :: Calendar -> Text -> IO () setTimeZone cal zoneId = withForeignPtr (calendarForeignPtr cal) $ \calPtr -> do withCStringLen zoneId $ \(zoneIdPtr, zoneIdLen) -> do handleError $ ucal_setTimeZone calPtr zoneIdPtr (fromIntegral zoneIdLen) clone :: Calendar -> IO Calendar clone cal = withForeignPtr (calendarForeignPtr cal) $ \calPtr -> do newICUPtr Calendar ucal_close $ handleError $ ucal_clone calPtr -- | List of all time zones. timeZones :: IO [Text] timeZones = do tzEnum <- openTimeZones tzs <- toList tzEnum pure tzs timeZoneIDs :: SystemTimeZoneType -> IO [Text] timeZoneIDs typ = do tzEnum <- openTimeZoneIDEnumeration typ tzs <- toList tzEnum pure tzs openTimeZoneIDEnumeration :: SystemTimeZoneType -> IO Enumerator openTimeZoneIDEnumeration typ = createEnumerator $ handleError $ ucal_openTimeZoneIDEnumeration (toUSystemTimeZoneType typ) -- | Create an enumeration over all time zones. openTimeZones :: IO Enumerator openTimeZones = createEnumerator $ handleError $ ucal_openTimeZones -- | Get the value of a specific calendar field. -- -- >>> import qualified Data.Text as T -- >>> c <- calendar (T.pack "CET") (Locale "de_DE") TraditionalCalendarType -- >>> getField c Year -- 2021 getField :: Calendar -> CalendarField -> IO Int getField cal fld = withForeignPtr (calendarForeignPtr cal) $ \calPtr -> do n <- handleError $ ucal_get calPtr (toUCalendarDateFields fld) pure (fromIntegral n) setField :: Calendar -> CalendarField -> Int -> IO () setField cal fld n = withForeignPtr (calendarForeignPtr cal) $ \calPtr -> ucal_set calPtr (toUCalendarDateFields fld) (fromIntegral n) setDate :: Calendar -> Int -> Int -> Int -> IO () setDate cal y m d = withForeignPtr (calendarForeignPtr cal) $ \calPtr -> handleError $ ucal_setDate calPtr (fromIntegral y) (fromIntegral m) (fromIntegral d) setDateTime :: Calendar -> Int -> Int -> Int -> Int -> Int -> Int -> IO () setDateTime cal y m d hr mn sec = withForeignPtr (calendarForeignPtr cal) $ \calPtr -> handleError $ ucal_setDateTime calPtr (fromIntegral y) (fromIntegral m) (fromIntegral d) (fromIntegral hr) (fromIntegral mn) (fromIntegral sec) -- | Add a specified signed amount to a particular field in a Calendar. -- -- This will not modify more significant fields in the calendar. Rolling by a -- positive value always means moving forward in time (unless the limit of the -- field is reached, in which case it may pin or wrap), so for Gregorian -- calendar, starting with 100 BC and rolling the year by +1 results in 99 BC. -- When eras have a definite beginning and end (as in the Chinese calendar, or -- as in most eras in the Japanese calendar) then rolling the year past either -- limit of the era will cause the year to wrap around. When eras only have a -- limit at one end, then attempting to roll the year past that limit will --result in pinning the year at that limit. Note that for most calendars in -- which era 0 years move forward in time (such as Buddhist, Hebrew, or Islamic), -- it is possible for add or roll to result in negative years for era 0 (that -- is the only way to represent years before the calendar epoch). rollField :: Calendar -> CalendarField -> Int -> IO () rollField cal fld n = withForeignPtr (calendarForeignPtr cal) $ \calPtr -> handleError $ ucal_roll calPtr (toUCalendarDateFields fld) (fromIntegral n) -- | Add a specified signed amount to a particular field in a Calendar. -- -- See 'rollField' for further details. -- -- >>> import qualified Data.Text as T -- >>> c1 <- calendar (T.pack "CET") (Locale "de_DE") TraditionalCalendarType -- >>> show c1 -- 2021-10-12 17:53:26 -- >>> let c2 = roll c1 [(Hour, 2)] -- >>> show c2 -- 2021-10-12 19:53:26 -- >>> let c3 = roll c1 [(Hour, 12)] -- >>> show c3 -- 2021-10-12 17:53:26 -- >>> let c4 = add c1 [(Hour, 12)] -- >>> show c4 -- 2021-10-13 5:53:26 roll :: Calendar -> [(CalendarField, Int)] -- ^ The field and the signed amount to add to this field. If the amount causes the value to exceed to maximum or minimum values for that field, the field is pinned to a permissible value. -> Calendar roll cal lst = unsafePerformIO $ do cal' <- clone cal forM_ lst (\(fld,n) -> rollField cal' fld n) pure cal' -- | Add a specified signed amount to a particular field in a Calendar. -- -- This can modify more significant fields in the calendar. Adding a positive -- value always means moving forward in time, so for the Gregorian calendar, -- starting with 100 BC and adding +1 to year results in 99 BC (even though -- this actually reduces the numeric value of the field itself). addField :: Calendar -> CalendarField -> Int -> IO () addField cal fld n = withForeignPtr (calendarForeignPtr cal) $ \calPtr -> handleError $ ucal_add calPtr (toUCalendarDateFields fld) (fromIntegral n) -- | Add a specified signed amount to a particular field in a Calendar. -- -- See 'addField' for further details and see 'rollField' for examples and differences -- compared to rolling. add :: Calendar -- ^ The 'Calendar' to which to add. -> [(CalendarField, Int)] -- ^ Field type and the signed amount to add to field. If the amount causes the value to exceed to maximum or minimum values for that field, other fields are modified to preserve the magnitude of the change. -> Calendar add cal lst = unsafePerformIO $ do cal' <- clone cal forM_ lst (\(fld,n) -> addField cal' fld n) pure cal' -- | Set the value of one field of a calendar to a certain value. All fields are -- represented as 32-bit integers. set1 :: Calendar -> CalendarField -> Int -> Calendar set1 cal fld n = unsafePerformIO $ do cal' <- clone cal setField cal' fld n pure cal' -- | Set the value of a list of fields of a calendar to certain values. All fields are -- represented as 32-bit integers. set :: Calendar -> [(CalendarField, Int)] -> Calendar set cal lst = unsafePerformIO $ do cal' <- clone cal forM_ lst (\(fld,n) -> setField cal' fld n) pure cal' -- | Convert the day part of the calendar to a 'Day'. -- -- >>> import qualified Data.Text as T -- >>> c1 <- calendar (T.pack "CET") (Locale "de_DE") TraditionalCalendarType -- >>> show c1 -- 2021-10-12 18:00:50 -- >>> day c1 -- 2021-10-12 day :: Calendar -> Cal.Day day cal = unsafePerformIO $ do y <- getField cal Year m <- getField cal Month d <- getField cal DayOfMonth pure (Cal.fromGregorian (fromIntegral y) (m+1) d) -- | Set the day part of the calendar from a 'Day'. setDay :: Calendar -> Cal.Day -> Calendar setDay cal aDay = unsafePerformIO $ do cal' <- clone cal let (y,m,d) = Cal.toGregorian aDay setDate cal' (fromIntegral y) (m-1) d pure cal' -- | Convert the day and time part of the calendar to a 'UTCTime'. -- -- >>> import qualified Data.Text as T -- >>> c1 <- calendar (T.pack "CET") (Locale "de_DE") TraditionalCalendarType -- >>> show c1 -- 2021-10-12 18:00:50 -- >>> utcTime c1 -- 2021-10-12 16:00:50.544999999998 UTC utcTime :: Calendar -> Clock.UTCTime utcTime cal = unsafePerformIO $ do cal' <- clone cal setTimeZone cal' (pack "UTC") y <- getField cal' Year m <- getField cal' Month d <- getField cal' DayOfMonth let day' = Cal.fromGregorian (fromIntegral y) (m+1) d ms <- getField cal' MillisecondsInDay let dt = realToFrac ((fromIntegral ms :: Double) / 1000) pure $ Clock.UTCTime day' dt -- | Get the value of a specific field in the calendar. get :: Calendar -> CalendarField -> Int get cal fld = unsafePerformIO $ getField cal fld -- | Return the era of the calendar. The values are calendar specific and are usually 0 (some ancient era) and 1 (current era). -- -- >>> import qualified Data.Text as T -- >>> c1 <- calendar (T.pack "CET") (Locale "de_DE") TraditionalCalendarType -- >>> era c1 -- 1 era :: Calendar -> Int era cal = unsafePerformIO $ getField cal Era setEra :: Calendar -> Int -> Calendar setEra cal = set1 cal Era -- | Return the year of the calendar. The values are calendar specific. -- -- >>> import qualified Data.Text as T -- >>> c1 <- calendar (T.pack "CET") (Locale "de_DE") TraditionalCalendarType -- >>> year c1 -- 2021 year :: Calendar -> Int year cal = unsafePerformIO $ getField cal Year setYear :: Calendar -> Int -> Calendar setYear cal = set1 cal Year -- | Return the month of the calendar. The values are calendar specific and 0-based. -- -- >>> import qualified Data.Text as T -- >>> c1 <- calendar (T.pack "CET") (Locale "de_DE") TraditionalCalendarType -- >>> month c1 -- 9 -- >>> day c1 -- 2021-10-12 month :: Calendar -> Int month cal = unsafePerformIO $ getField cal Month setMonth :: Calendar -> Int -> Calendar setMonth cal = set1 cal Month dayOfMonth :: Calendar -> Int dayOfMonth cal = unsafePerformIO $ getField cal DayOfMonth setDayOfMonth :: Calendar -> Int -> Calendar setDayOfMonth cal = set1 cal DayOfMonth dayOfYear :: Calendar -> Int dayOfYear cal = unsafePerformIO $ getField cal DayOfYear setDayOfYear :: Calendar -> Int -> Calendar setDayOfYear cal = set1 cal DayOfYear dayOfWeek :: Calendar -> Int dayOfWeek cal = unsafePerformIO $ getField cal DayOfWeek setDayOfWeek :: Calendar -> Int -> Calendar setDayOfWeek cal = set1 cal DayOfWeek dayOfWeekInMonth :: Calendar -> Int dayOfWeekInMonth cal = unsafePerformIO $ getField cal DayOfWeekInMonth setDayOfWeekInMonth :: Calendar -> Int -> Calendar setDayOfWeekInMonth cal = set1 cal DayOfWeekInMonth amPm :: Calendar -> Int amPm cal = unsafePerformIO $ getField cal AmPm setAmPm :: Calendar -> Int -> Calendar setAmPm cal = set1 cal AmPm hour :: Calendar -> Int hour cal = unsafePerformIO $ getField cal Hour setHour :: Calendar -> Int -> Calendar setHour cal = set1 cal Hour hourOfDay :: Calendar -> Int hourOfDay cal = unsafePerformIO $ getField cal HourOfDay setHourOfDay :: Calendar -> Int -> Calendar setHourOfDay cal = set1 cal HourOfDay minute :: Calendar -> Int minute cal = unsafePerformIO $ getField cal Minute setMinute :: Calendar -> Int -> Calendar setMinute cal = set1 cal Minute second :: Calendar -> Int second cal = unsafePerformIO $ getField cal Second setSecond :: Calendar -> Int -> Calendar setSecond cal = set1 cal Second millisecond :: Calendar -> Int millisecond cal = unsafePerformIO $ getField cal Millisecond setMillisecond :: Calendar -> Int -> Calendar setMillisecond cal = set1 cal Millisecond zoneOffset :: Calendar -> Int zoneOffset cal = unsafePerformIO $ getField cal ZoneOffset setZoneOffset :: Calendar -> Int -> Calendar setZoneOffset cal = set1 cal ZoneOffset dstOffset :: Calendar -> Int dstOffset cal = unsafePerformIO $ getField cal DstOffset setDstOffset :: Calendar -> Int -> Calendar setDstOffset cal = set1 cal DstOffset yearWoY :: Calendar -> Int yearWoY cal = unsafePerformIO $ getField cal YearWoY setYearWoY :: Calendar -> Int -> Calendar setYearWoY cal = set1 cal YearWoY doWLocal :: Calendar -> Int doWLocal cal = unsafePerformIO $ getField cal DoWLocal setDoWLocal :: Calendar -> Int -> Calendar setDoWLocal cal = set1 cal DoWLocal extendedYear :: Calendar -> Int extendedYear cal = unsafePerformIO $ getField cal ExtendedYear setExtendedYear :: Calendar -> Int -> Calendar setExtendedYear cal = set1 cal ExtendedYear julianDay :: Calendar -> Int julianDay cal = unsafePerformIO $ getField cal JulianDay setJulianDay :: Calendar -> Int -> Calendar setJulianDay cal = set1 cal JulianDay millisecondsInDay :: Calendar -> Int millisecondsInDay cal = unsafePerformIO $ getField cal MillisecondsInDay setMillisecondsInDay :: Calendar -> Int -> Calendar setMillisecondsInDay cal = set1 cal MillisecondsInDay isLeapMonth :: Calendar -> Bool isLeapMonth cal = 0 /= (unsafePerformIO $ getField cal IsLeapMonth) -- Copied from the lens package in order not to generate a dependency. type Lens s t a b = forall f. Functor f => (a -> f b) -> s -> f t type Lens' s a = Lens s s a a lens :: (s -> a) -> (s -> b -> t) -> Lens s t a b lens sa sbt afb s = sbt s <$> afb (sa s) {-# INLINE lens #-} _era :: Lens' Calendar Int _era = lens era setEra _year :: Lens' Calendar Int _year = lens year setYear _month :: Lens' Calendar Int _month = lens month setMonth _dayOfYear :: Lens' Calendar Int _dayOfYear = lens dayOfYear setDayOfYear _dayOfMonth :: Lens' Calendar Int _dayOfMonth = lens dayOfMonth setDayOfMonth _dayOfWeek :: Lens' Calendar Int _dayOfWeek = lens dayOfWeek setDayOfWeek _dayOfWeekInMonth :: Lens' Calendar Int _dayOfWeekInMonth = lens dayOfWeekInMonth setDayOfWeekInMonth _amPm :: Lens' Calendar Int _amPm = lens amPm setAmPm _hour :: Lens' Calendar Int _hour = lens hour setHour _hourOfDay :: Lens' Calendar Int _hourOfDay = lens hour setHourOfDay _minute :: Lens' Calendar Int _minute = lens minute setMinute _second :: Lens' Calendar Int _second = lens second setSecond _millisecond :: Lens' Calendar Int _millisecond = lens millisecond setMillisecond _zoneOffset :: Lens' Calendar Int _zoneOffset = lens zoneOffset setZoneOffset _dstOffset :: Lens' Calendar Int _dstOffset = lens dstOffset setDstOffset _day :: Lens' Calendar Cal.Day _day = lens day setDay instance Show Calendar where show cal = show (utcTime cal) foreign import ccall unsafe "hs_text_icu.h __hs_ucal_open" ucal_open :: Ptr UChar -> Int32 -> CString -> UCalendar -> Ptr UErrorCode -> IO (Ptr UCalendar) foreign import ccall unsafe "hs_text_icu.h __hs_ucal_clone" ucal_clone :: Ptr UCalendar -> Ptr UErrorCode -> IO (Ptr UCalendar) foreign import ccall unsafe "hs_text_icu.h &__hs_ucal_close" ucal_close :: FunPtr (Ptr UCalendar -> IO ()) foreign import ccall unsafe "hs_text_icu.h __hs_ucal_get" ucal_get :: Ptr UCalendar -> UCalendarDateFields -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ucal_set" ucal_set :: Ptr UCalendar -> UCalendarDateFields -> Int32 -> IO () foreign import ccall unsafe "hs_text_icu.h __hs_ucal_setDate" ucal_setDate :: Ptr UCalendar -> Int32 -> Int32 -> Int32 -> Ptr UErrorCode -> IO () foreign import ccall unsafe "hs_text_icu.h __hs_ucal_setDateTime" ucal_setDateTime :: Ptr UCalendar -> Int32 -> Int32 -> Int32 -> Int32 -> Int32 -> Int32 -> Ptr UErrorCode -> IO () foreign import ccall unsafe "hs_text_icu.h __hs_ucal_add" ucal_add :: Ptr UCalendar -> UCalendarDateFields -> Int32 -> Ptr UErrorCode -> IO () foreign import ccall unsafe "hs_text_icu.h __hs_ucal_roll" ucal_roll :: Ptr UCalendar -> UCalendarDateFields -> Int32 -> Ptr UErrorCode -> IO () foreign import ccall unsafe "hs_text_icu.h _hs__ucal_openTimeZoneIDEnumeration" ucal_openTimeZoneIDEnumeration :: USystemTimeZoneType -> Ptr UErrorCode -> IO (Ptr UEnumerator) foreign import ccall unsafe "hs_text_icu.h __hs_ucal_openTimeZones" ucal_openTimeZones :: Ptr UErrorCode -> IO (Ptr UEnumerator) foreign import ccall unsafe "hs_text_icu.h __hs_ucal_setTimeZone" ucal_setTimeZone :: Ptr UCalendar -> CString -> Int32 -> Ptr UErrorCode -> IO () text-icu-0.8.0.4/Data/Text/ICU/Char.hsc0000644000000000000000000010277207346545000015416 0ustar0000000000000000{-# LANGUAGE BangPatterns, DeriveDataTypeable, FlexibleInstances, ForeignFunctionInterface, FunctionalDependencies, MultiParamTypeClasses #-} -- | -- Module : Data.Text.ICU.Char -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Access to the Unicode Character Database, implemented as bindings -- to the International Components for Unicode (ICU) libraries. -- -- Unicode assigns each codepoint (not just assigned character) values for -- many properties. Most are simple boolean flags, or constants from a -- small enumerated list. For some, values are relatively more complex -- types. -- -- For more information see \"About the Unicode Character Database\" -- and the ICU User Guide chapter on -- Properties . module Data.Text.ICU.Char ( -- * Working with character properties -- $properties Property -- * Property identifier types , BidiClass_(..) , Block_(..) , Bool_(..) , Decomposition_(..) , EastAsianWidth_(..) , GeneralCategory_(..) , HangulSyllableType_(..) , JoiningGroup_(..) , JoiningType_(..) , NumericType_(..) -- ** Combining class , CanonicalCombiningClass_(..) , LeadCanonicalCombiningClass_(..) , TrailingCanonicalCombiningClass_(..) -- ** Normalization checking , NFCQuickCheck_(..) , NFDQuickCheck_(..) , NFKCQuickCheck_(..) , NFKDQuickCheck_(..) -- ** Text boundaries , GraphemeClusterBreak_(..) , LineBreak_(..) , SentenceBreak_(..) , WordBreak_(..) , BidiPairedBracketType_(..) -- * Property value types , BlockCode(..) , Direction(..) , Decomposition(..) , EastAsianWidth(..) , GeneralCategory(..) , HangulSyllableType(..) , JoiningGroup(..) , JoiningType(..) , NumericType(..) -- ** Text boundaries , GraphemeClusterBreak(..) , LineBreak(..) , SentenceBreak(..) , WordBreak(..) , BidiPairedBracketType(..) -- * Functions , blockCode , charFullName , charName , charFromFullName , charFromName , combiningClass , direction , property , isMirrored , mirror -- ** Conversion to numbers , digitToInt , numericValue ) where #include import Control.DeepSeq (NFData(..)) import Data.Char (chr, ord) import Data.Int (Int32) import Data.Text.ICU.Error (u_INVALID_CHAR_FOUND) import Data.Text.ICU.Error.Internal (UErrorCode, handleOverflowError, withError) import Data.Text.ICU.Internal (UBool, UChar32, asBool) import Data.Text.ICU.Normalize.Internal (toNCR) import Data.Typeable (Typeable) import Data.Word (Word8) import Foreign.C.String (CString, peekCStringLen, withCString) import Foreign.C.Types (CInt(..)) import Foreign.Ptr (Ptr) import System.IO.Unsafe (unsafePerformIO) -- $properties -- -- The 'property' function provides the main view onto the Unicode Character -- Database. Because Unicode character properties have a variety of types, -- the 'property' function is polymorphic. The type of its first argument -- dictates the type of its result, by use of the 'Property' typeclass. -- -- For instance, @'property' 'Alphabetic'@ returns a 'Bool', while @'property' -- 'NFCQuickCheck'@ returns a @'Maybe' 'Bool'@. -- | The language directional property of a character set. data Direction = LeftToRight | RightToLeft | EuropeanNumber | EuropeanNumberSeparator | EuropeanNumberTerminator | ArabicNumber | CommonNumberSeparator | BlockSeparator | SegmentSeparator | WhiteSpaceNeutral | OtherNeutral | LeftToRightEmbedding | LeftToRightOverride | RightToLeftArabic | RightToLeftEmbedding | RightToLeftOverride | PopDirectionalFormat | DirNonSpacingMark | BoundaryNeutral | FirstStrongIsolate | LeftToRightIsolate | RightToLeftIsolate | PopDirectionalIsolate deriving (Eq, Enum, Show, Typeable) instance NFData Direction where rnf !_ = () -- | Descriptions of Unicode blocks. data BlockCode = NoBlock | BasicLatin | Latin1Supplement | LatinExtendedA | LatinExtendedB | IPAExtensions | SpacingModifierLetters | CombiningDiacriticalMarks | GreekAndCoptic | Cyrillic | Armenian | Hebrew | Arabic | Syriac | Thaana | Devanagari | Bengali | Gurmukhi | Gujarati | Oriya | Tamil | Telugu | Kannada | Malayalam | Sinhala | Thai | Lao | Tibetan | Myanmar | Georgian | HangulJamo | Ethiopic | Cherokee | UnifiedCanadianAboriginalSyllabics | Ogham | Runic | Khmer | Mongolian | LatinExtendedAdditional | GreekExtended | GeneralPunctuation | SuperscriptsAndSubscripts | CurrencySymbols | CombiningDiacriticalMarksForSymbols | LetterlikeSymbols | NumberForms | Arrows | MathematicalOperators | MiscellaneousTechnical | ControlPictures | OpticalCharacterRecognition | EnclosedAlphanumerics | BoxDrawing | BlockElements | GeometricShapes | MiscellaneousSymbols | Dingbats | BraillePatterns | CJKRadicalsSupplement | KangxiRadicals | IdeographicDescriptionCharacters | CJKSymbolsAndPunctuation | Hiragana | Katakana | Bopomofo | HangulCompatibilityJamo | Kanbun | BopomofoExtended | EnclosedCJKLettersAndMonths | CJKCompatibility | CJKUnifiedIdeographsExtensionA | CJKUnifiedIdeographs | YiSyllables | YiRadicals | HangulSyllables | HighSurrogates | HighPrivateUseSurrogates | LowSurrogates | PrivateUseArea | CJKCompatibilityIdeographs | AlphabeticPresentationForms | ArabicPresentationFormsA | CombiningHalfMarks | CJKCompatibilityForms | SmallFormVariants | ArabicPresentationFormsB | Specials | HalfwidthAndFullwidthForms | OldItalic | Gothic | Deseret | ByzantineMusicalSymbols | MusicalSymbols | MathematicalAlphanumericSymbols | CJKUnifiedIdeographsExtensionB | CJKCompatibilityIdeographsSupplement | Tags | CyrillicSupplement | Tagalog | Hanunoo | Buhid | Tagbanwa | MiscellaneousMathematicalSymbolsA | SupplementalArrowsA | SupplementalArrowsB | MiscellaneousMathematicalSymbolsB | SupplementalMathematicalOperators | KatakanaPhoneticExtensions | VariationSelectors | SupplementaryPrivateUseAreaA | SupplementaryPrivateUseAreaB | Limbu | TaiLe | KhmerSymbols | PhoneticExtensions | MiscellaneousSymbolsAndArrows | YijingHexagramSymbols | LinearBSyllabary | LinearBIdeograms | AegeanNumbers | Ugaritic | Shavian | Osmanya | CypriotSyllabary | TaiXuanJingSymbols | VariationSelectorsSupplement | AncientGreekMusicalNotation | AncientGreekNumbers | ArabicSupplement | Buginese | CJKStrokes | CombiningDiacriticalMarksSupplement | Coptic | EthiopicExtended | EthiopicSupplement | GeorgianSupplement | Glagolitic | Kharoshthi | ModifierToneLetters | NewTaiLue | OldPersian | PhoneticExtensionsSupplement | SupplementalPunctuation | SylotiNagri | Tifinagh | VerticalForms | N'Ko | Balinese | LatinExtendedC | LatinExtendedD | PhagsPa | Phoenician | Cuneiform | CuneiformNumbersAndPunctuation | CountingRodNumerals | Sundanese | Lepcha | OlChiki | CyrillicExtendedA | Vai | CyrillicExtendedB | Saurashtra | KayahLi | Rejang | Cham | AncientSymbols | PhaistosDisc | Lycian | Carian | Lydian | MahjongTiles | DominoTiles | Samaritan | UnifiedCanadianAboriginalSyllabicsExtended | TaiTham | VedicExtensions | Lisu | Bamum | CommonIndicNumberForms | DevanagariExtended | HangulJamoExtendedA | Javanese | MyanmarExtendedA | TaiViet | MeeteiMayek | HangulJamoExtendedB | ImperialAramaic | OldSouthArabian | Avestan | InscriptionalParthian | InscriptionalPahlavi | OldTurkic | RumiNumeralSymbols | Kaithi | EgyptianHieroglyphs | EnclosedAlphanumericSupplement | EnclosedIdeographicSupplement | CJKUnifiedIdeographsExtensionC | Mandaic | Batak | EthiopicExtendedA | Brahmi | BamumSupplement | KanaSupplement | PlayingCards | MiscellaneousSymbolsAndPictographs | Emoticons | TransportAndMapSymbols | AlchemicalSymbols | CJKUnifiedIdeographsExtensionD | ArabicExtendedA | ArabicMathematicalAlphabeticSymbols | Chakma | MeeteiMayekExtensions | MeroiticCursive | MeroiticHieroglyphs | Miao | Sharada | SoraSompeng | SundaneseSupplement | Takri | BassaVah | CaucasianAlbanian | CopticEpactNumbers | CombiningDiacriticalMarksExtended | Duployan | Elbasan | GeometricShapesExtended | Grantha | Khojki | Khudawadi | LatinExtendedE | LinearA | Mahajani | Manichaean | MendeKikakui | Modi | Mro | MyanmarExtendedB | Nabataean | OldNorthArabian | OldPermic | OrnamentalDingbats | PahawhHmong | Palmyrene | PauCinHau | PsalterPahlavi | ShorthandFormatControls | Siddham | SinhalaArchaicNumbers | SupplementalArrowsC | Tirhuta | WarangCiti | Ahom | AnatolianHieroglyphs | CherokeeSupplement | CJKUnifiedIdeographsExtensionE | EarlyDynasticCuneiform | Hatran | Multani | OldHungarian | SupplementalSymbolsAndPictographs | SuttonSignwriting -- New blocks in Unicode 9.0 (ICU 58) | Adlam | Bhaiksuki | CyrillicExtendedC | GlagoliticSupplement | IdeographicSymbolsAndPunctuation | Marchen | MongolianSupplement | Newa | Osage | Tangut | TangutComponents -- New blocks in Unicode 10.0 (ICU 60) | CjkUnifiedIdeographsExtensionF | KanaExtendedA | MasaramGondi | Nushu | Soyombo | SyriacSupplement | ZanabazarSquare -- New blocks in Unicode 11.0 (ICU 62) | ChessSymbols | Dogra | GeorgianExtended | GunjalaGondi | HanifiRohingya | IndicSiyaqNumbers | Makasar | MayanNumerals | Medefaidrin | OldSogdian | Sogdian -- New blocks in Unicode 12.0 (ICU 64) | EgyptianHieroglyphFormatControls | Elymaic | Nandinagari | NyiakengPuachueHmong | OttomanSiyaqNumbers | SmallKanaExtension | SymbolsAndPictographsExtendedA | TamilSupplement | Wancho -- New blocks in Unicode 13.0 (ICU 66) | Chorasmian | CjkUnifiedIdeographsExtensionG | DivesAkuru | KhitanSmallScript | LisuSupplement | SymbolsForLegacyComputing | TangutSupplement | Yezidi -- New blocks in Unicode 14.0 (ICU 70) | ArabicExtendedB | CyproMinoan | EthiopicExtendedB | KanaExtendedB | LatinExtendedF | LatinExtendedG | OldUyghur | Tangsa | Toto | UnifiedCanadianAboriginalSyllabicsExtendedA | Vithkuqi | ZnamennyMusicalNotation -- New blocks in Unicode 15.0 (ICU 72) | ArabicExtendedC | CjkUnifiedIdeographsExtensionH | CyrillicExtendedD | DevanagariExtendedA | KaktovikNumerals | Kawi | NagMundari deriving (Eq, Enum, Bounded, Show, Typeable) instance NFData BlockCode where rnf !_ = () data Bool_ = Alphabetic | ASCIIHexDigit -- ^ 0-9, A-F, a-f | BidiControl -- ^ Format controls which have specific functions in the Bidi Algorithm. | BidiMirrored -- ^ Characters that may change display in RTL text. | Dash -- ^ Variations of dashes. | DefaultIgnorable -- ^ Ignorable in most processing. | Deprecated -- ^ The usage of deprecated characters is strongly discouraged. | Diacritic -- ^ Characters that linguistically modify the meaning of another -- character to which they apply. | Extender -- ^ Extend the value or shape of a preceding alphabetic character, -- e.g. length and iteration marks. | FullCompositionExclusion | GraphemeBase -- ^ For programmatic determination of grapheme cluster boundaries. | GraphemeExtend -- ^ For programmatic determination of grapheme cluster boundaries. | GraphemeLink -- ^ For programmatic determination of grapheme cluster boundaries. | HexDigit -- ^ Characters commonly used for hexadecimal numbers. | Hyphen -- ^ Dashes used to mark connections between pieces of words, plus the -- Katakana middle dot. | IDContinue -- ^ Characters that can continue an identifier. | IDStart -- ^ Characters that can start an identifier. | Ideographic -- ^ CJKV ideographs. | IDSBinaryOperator -- ^ For programmatic determination of Ideographic Description Sequences. | IDSTrinaryOperator | JoinControl -- ^ Format controls for cursive joining and ligation. | LogicalOrderException -- ^ Characters that do not use logical order and require special handling -- in most processing. | Lowercase | Math | NonCharacter -- ^ Code points that are explicitly defined as illegal for the encoding -- of characters. | QuotationMark | Radical -- ^ For programmatic determination of Ideographic Description Sequences. | SoftDotted -- ^ Characters with a "soft dot", like i or j. An accent placed on these -- characters causes the dot to disappear. | TerminalPunctuation -- ^ Punctuation characters that generally mark the end of textual units. | UnifiedIdeograph -- ^ For programmatic determination of Ideographic Description Sequences. | Uppercase | WhiteSpace | XidContinue -- ^ 'IDContinue' modified to allow closure under normalization forms -- NFKC and NFKD. | XidStart -- ^ 'IDStart' modified to allow closure under normalization forms NFKC -- and NFKD. | CaseSensitive -- ^ Either the source of a case mapping or /in/ the target of a case -- mapping. Not the same as the general category @Cased_Letter@. | STerm -- ^ Sentence Terminal. Used in UAX #29: Text Boundaries -- . | VariationSelector -- ^ Indicates all those characters that qualify as Variation -- Selectors. For details on the behavior of these characters, see -- and 15.6 -- Variation Selectors. | NFDInert -- ^ ICU-specific property for characters that are inert under NFD, i.e. -- they do not interact with adjacent characters. Used for example in -- normalizing transforms in incremental mode to find the boundary of -- safely normalizable text despite possible text additions. | NFKDInert -- ^ ICU-specific property for characters that are inert under NFKD, i.e. -- they do not interact with adjacent characters. | NFCInert -- ^ ICU-specific property for characters that are inert under NFC, -- i.e. they do not interact with adjacent characters. | NFKCInert -- ^ ICU-specific property for characters that are inert under NFKC, -- i.e. they do not interact with adjacent characters. | SegmentStarter -- ^ ICU-specific property for characters that are starters in terms of -- Unicode normalization and combining character sequences. | PatternSyntax -- ^ See UAX #31 Identifier and Pattern Syntax -- . | PatternWhiteSpace -- ^ See UAX #31 Identifier and Pattern Syntax -- . | POSIXAlNum -- ^ Alphanumeric character class. | POSIXBlank -- ^ Blank character class. | POSIXGraph -- ^ Graph character class. | POSIXPrint -- ^ Printable character class. | POSIXXDigit -- ^ Hex digit character class. | Cased -- ^ Cased character class. For lowercase, uppercase and titlecase characters. | CaseIgnorable -- ^ Used in context-sensitive case mappings. | ChangesWhenLowercased | ChangesWhenUppercased | ChangesWhenTitlecased | ChangesWhenCasefolded | ChangesWhenCasemapped | ChangesWhenNFKCCasefolded | Emoji -- ^ See http://www.unicode.org/reports/tr51/#Emoji_Properties | EmojiPresentation -- ^ See http://www.unicode.org/reports/tr51/#Emoji_Properties | EmojiModifier -- ^ See http://www.unicode.org/reports/tr51/#Emoji_Properties | EmojiModifierBase -- ^ See http://www.unicode.org/reports/tr51/#Emoji_Properties | EmojiComponent -- ^ See http://www.unicode.org/reports/tr51/#Emoji_Properties | RegionalIndicator | PrependedConcatenationMark | ExtendedPictographic -- ICU 70 | BasicEmoji -- ^ See https://www.unicode.org/reports/tr51/#Emoji_Sets | EmojiKeycapSequence -- ^ See https://www.unicode.org/reports/tr51/#Emoji_Sets | RgiEmojiModifierSequence -- ^ See https://www.unicode.org/reports/tr51/#Emoji_Sets | RgiEmojiFlagSequence -- ^ See https://www.unicode.org/reports/tr51/#Emoji_Sets | RgiEmojiTagSequence -- ^ See https://www.unicode.org/reports/tr51/#Emoji_Sets | RgiEmojiZwjSequence -- ^ See https://www.unicode.org/reports/tr51/#Emoji_Sets | RgiEmoji -- ^ See https://www.unicode.org/reports/tr51/#Emoji_Sets deriving (Eq, Enum, Show, Typeable) instance NFData Bool_ where rnf !_ = () class Property p v | p -> v where fromNative :: p -> Int32 -> v toUProperty :: p -> UProperty data BidiClass_ = BidiClass deriving (Show, Typeable) instance NFData BidiClass_ where rnf !_ = () instance Property BidiClass_ Direction where fromNative _ = toEnum . fromIntegral toUProperty _ = (#const UCHAR_BIDI_CLASS) data Block_ = Block instance NFData Block_ where rnf !_ = () instance Property Block_ BlockCode where fromNative _ = toEnum . fromIntegral toUProperty _ = (#const UCHAR_BLOCK) data CanonicalCombiningClass_ = CanonicalCombiningClass deriving (Show,Typeable) instance NFData CanonicalCombiningClass_ where rnf !_ = () instance Property CanonicalCombiningClass_ Int where fromNative _ = fromIntegral toUProperty _ = (#const UCHAR_CANONICAL_COMBINING_CLASS) data Decomposition_ = Decomposition deriving (Show, Typeable) instance NFData Decomposition_ where rnf !_ = () data Decomposition = Canonical | Compat | Circle | Final | Font | Fraction | Initial | Isolated | Medial | Narrow | NoBreak | Small | Square | Sub | Super | Vertical | Wide | Count deriving (Eq, Enum, Show, Typeable) instance NFData Decomposition where rnf !_ = () instance Property Decomposition_ (Maybe Decomposition) where fromNative _ = maybeEnum toUProperty _ = (#const UCHAR_DECOMPOSITION_TYPE) data EastAsianWidth_ = EastAsianWidth deriving (Show, Typeable) instance NFData EastAsianWidth_ where rnf !_ = () data EastAsianWidth = EANeutral | EAAmbiguous | EAHalf | EAFull | EANarrow | EAWide | EACount deriving (Eq, Enum, Show, Typeable) instance NFData EastAsianWidth where rnf !_ = () instance Property EastAsianWidth_ EastAsianWidth where fromNative _ = toEnum . fromIntegral toUProperty _ = (#const UCHAR_EAST_ASIAN_WIDTH) instance Property Bool_ Bool where fromNative _ = (/=0) toUProperty = fromIntegral . fromEnum data GeneralCategory_ = GeneralCategory deriving (Show, Typeable) instance NFData GeneralCategory_ where rnf !_ = () data GeneralCategory = GeneralOtherType -- ^ U_GENERAL_OTHER_TYPES is the same as U_UNASSIGNED | UppercaseLetter | LowercaseLetter | TitlecaseLetter | ModifierLetter | OtherLetter | NonSpacingMark | EnclosingMark | CombiningSpacingMark | DecimalDigitNumber | LetterNumber | OtherNumber | SpaceSeparator | LineSeparator | ParagraphSeparator | ControlChar | FormatChar | PrivateUseChar | Surrogate | DashPunctuation | StartPunctuation | EndPunctuation | ConnectorPunctuation | OtherPunctuation | MathSymbol | CurrencySymbol | ModifierSymbol | OtherSymbol | InitialPunctuation | FinalPunctuation deriving (Eq, Enum, Show, Typeable) instance NFData GeneralCategory where rnf !_ = () instance Property GeneralCategory_ GeneralCategory where fromNative _ = toEnum . fromIntegral toUProperty _ = (#const UCHAR_GENERAL_CATEGORY) data JoiningGroup_ = JoiningGroup deriving (Show, Typeable) instance NFData JoiningGroup_ where rnf !_ = () maybeEnum :: Enum a => Int32 -> Maybe a maybeEnum 0 = Nothing maybeEnum n = Just $! toEnum (fromIntegral n-1) data JoiningGroup = Ain | Alaph | Alef | Beh | Beth | Dal | DalathRish | E | Feh | FinalSemkath | Gaf | Gamal | Hah | HamzaOnHehGoal | He | Heh | HehGoal | Heth | Kaf | Kaph | KnottedHeh | Lam | Lamadh | Meem | Mim | Noon | Nun | Pe | Qaf | Qaph | Reh | ReversedPe | Sad | Sadhe | Seen | Semkath | Shin | SwashKaf | SyriacWaw | Tah | Taw | TehMarbuta | Teth | Waw | Yeh | YehBarree | YehWithTail | Yudh | YudhHe | Zain | Fe | Khaph | Zhain | BurushaskiYehBarree | FarsiYeh | Nya | RohingyaYeh | ManichaeanAleph | ManichaeanAyin | ManichaeanBeth | ManichaeanDaleth | ManichaeanDhamedh | ManichaeanFive | ManichaeanGimel | ManichaeanHeth | ManichaeanHundred | ManichaeanKaph | ManichaeanLamedh | ManichaeanMem | ManichaeanNun | ManichaeanOne | ManichaeanPe | ManichaeanQoph | ManichaeanResh | ManichaeanSadhe | ManichaeanSamekh | ManichaeanTaw | ManichaeanTen | ManichaeanTeth | ManichaeanThamedh | ManichaeanTwenty | ManichaeanWaw | ManichaeanYodh | ManichaeanZayin | StraightWaw deriving (Eq, Enum, Show, Typeable) instance NFData JoiningGroup where rnf !_ = () instance Property JoiningGroup_ (Maybe JoiningGroup) where fromNative _ = maybeEnum toUProperty _ = (#const UCHAR_JOINING_GROUP) data JoiningType_ = JoiningType deriving (Show, Typeable) instance NFData JoiningType_ where rnf !_ = () data JoiningType = JoinCausing | DualJoining | LeftJoining | RightJoining | Transparent deriving (Eq, Enum, Show, Typeable) instance NFData JoiningType where rnf !_ = () instance Property JoiningType_ (Maybe JoiningType) where fromNative _ = maybeEnum toUProperty _ = (#const UCHAR_JOINING_TYPE) data LineBreak_ = LineBreak deriving (Show, Typeable) instance NFData LineBreak_ where rnf !_ = () data LineBreak = Ambiguous | LBAlphabetic | BreakBoth | BreakAfter | BreakBefore | MandatoryBreak | ContingentBreak | ClosePunctuation | CombiningMark | CarriageReturn | Exclamation | Glue | LBHyphen | LBIdeographic | Inseparable | InfixNumeric | LineFeed | Nonstarter | Numeric | OpenPunctuation | PostfixNumeric | PrefixNumeric | Quotation | ComplexContext | LBSurrogate | Space | BreakSymbols | Zwspace | NextLine | WordJoiner | H2 | H3 | JL | JT | JV | CloseParenthesis | ConditionalJapaneseStarter | LBHebrewLetter | LBRegionalIndicator | EBase | EModifier | ZWJ deriving (Eq, Enum, Show, Typeable) instance NFData LineBreak where rnf !_ = () instance Property LineBreak_ (Maybe LineBreak) where fromNative _ = maybeEnum toUProperty _ = (#const UCHAR_LINE_BREAK) data NumericType_ = NumericType deriving (Show, Typeable) instance NFData NumericType_ where rnf !_ = () data NumericType = NTDecimal | NTDigit | NTNumeric deriving (Eq, Enum, Show, Typeable) instance NFData NumericType where rnf !_ = () instance Property NumericType_ (Maybe NumericType) where fromNative _ = maybeEnum toUProperty _ = (#const UCHAR_NUMERIC_TYPE) data HangulSyllableType_ = HangulSyllableType deriving (Show, Typeable) instance NFData HangulSyllableType_ where rnf !_ = () data HangulSyllableType = LeadingJamo | VowelJamo | TrailingJamo | LVSyllable | LVTSyllable deriving (Eq, Enum, Show, Typeable) instance NFData HangulSyllableType where rnf !_ = () instance Property HangulSyllableType_ (Maybe HangulSyllableType) where fromNative _ = maybeEnum toUProperty _ = (#const UCHAR_HANGUL_SYLLABLE_TYPE) data NFCQuickCheck_ = NFCQuickCheck deriving (Show, Typeable) data NFDQuickCheck_ = NFDQuickCheck deriving (Show, Typeable) data NFKCQuickCheck_ = NFKCQuickCheck deriving (Show, Typeable) data NFKDQuickCheck_ = NFKDQuickCheck deriving (Show, Typeable) instance NFData NFCQuickCheck_ where rnf !_ = () instance NFData NFDQuickCheck_ where rnf !_ = () instance NFData NFKCQuickCheck_ where rnf !_ = () instance NFData NFKDQuickCheck_ where rnf !_ = () instance Property NFCQuickCheck_ (Maybe Bool) where fromNative _ = toNCR . fromIntegral toUProperty _ = (#const UCHAR_NFC_QUICK_CHECK) instance Property NFDQuickCheck_ (Maybe Bool) where fromNative _ = toNCR . fromIntegral toUProperty _ = (#const UCHAR_NFD_QUICK_CHECK) instance Property NFKCQuickCheck_ (Maybe Bool) where fromNative _ = toNCR . fromIntegral toUProperty _ = (#const UCHAR_NFKC_QUICK_CHECK) instance Property NFKDQuickCheck_ (Maybe Bool) where fromNative _ = toNCR . fromIntegral toUProperty _ = (#const UCHAR_NFKD_QUICK_CHECK) data LeadCanonicalCombiningClass_ = LeadCanonicalCombiningClass deriving (Show, Typeable) instance NFData LeadCanonicalCombiningClass_ where rnf !_ = () instance Property LeadCanonicalCombiningClass_ Int where fromNative _ = fromIntegral toUProperty _ = (#const UCHAR_LEAD_CANONICAL_COMBINING_CLASS) data TrailingCanonicalCombiningClass_ = TrailingCanonicalCombiningClass deriving (Show, Typeable) instance NFData TrailingCanonicalCombiningClass_ where rnf !_ = () instance Property TrailingCanonicalCombiningClass_ Int where fromNative _ = fromIntegral toUProperty _ = (#const UCHAR_TRAIL_CANONICAL_COMBINING_CLASS) data GraphemeClusterBreak_ = GraphemeClusterBreak deriving (Show, Typeable) instance NFData GraphemeClusterBreak_ where rnf !_ = () data GraphemeClusterBreak = GCBControl | GCBCR | GCBExtend | GCBL | GCBLF | GCBLV | GCBLVT | GCBT | GCBV | GCBSpacingMark | GCBPrepend | GCBRegionalIndicator | GCBEBase | GCBEBaseGAZ | GCBEModifier | GCBGlueAfterZWJ | GCBZWJ deriving (Eq, Enum, Show, Typeable) instance NFData GraphemeClusterBreak where rnf !_ = () instance Property GraphemeClusterBreak_ (Maybe GraphemeClusterBreak) where fromNative _ = maybeEnum toUProperty _ = (#const UCHAR_GRAPHEME_CLUSTER_BREAK) data SentenceBreak_ = SentenceBreak deriving (Show, Typeable) instance NFData SentenceBreak_ where rnf !_ = () data SentenceBreak = SBATerm | SBClose | SBFormat | SBLower | SBNumeric | SBOLetter | SBSep | SBSP | SBSTerm | SBUpper | SBCR | SBExtend | SBLF | SBSContinue deriving (Eq, Enum, Show, Typeable) instance NFData SentenceBreak where rnf !_ = () instance Property SentenceBreak_ (Maybe SentenceBreak) where fromNative _ = maybeEnum toUProperty _ = (#const UCHAR_SENTENCE_BREAK) data WordBreak_ = WordBreak deriving (Show, Typeable) instance NFData WordBreak_ where rnf !_ = () data WordBreak = WBALetter | WBFormat | WBKatakana | WBMidLetter | WBMidNum | WBNumeric | WBExtendNumLet | WBCR | WBExtend | WBLF | WBMidNumLet | WBNewline | WBRegionalIndicator | WBHebrewLetter | WBSingleQuote | WBDoubleQuote deriving (Eq, Enum, Show, Typeable) instance NFData WordBreak where rnf !_ = () instance Property WordBreak_ (Maybe WordBreak) where fromNative _ = maybeEnum toUProperty _ = (#const UCHAR_WORD_BREAK) data BidiPairedBracketType_ = BidiPairedBracketType deriving (Show, Typeable) instance NFData BidiPairedBracketType_ where rnf !_ = () data BidiPairedBracketType = BPTNone | BPTOpen | BPTClose deriving (Eq, Enum, Show, Typeable) instance NFData BidiPairedBracketType where rnf !_ = () instance Property BidiPairedBracketType_ (Maybe BidiPairedBracketType) where fromNative _ = maybeEnum toUProperty _ = (#const UCHAR_BIDI_PAIRED_BRACKET_TYPE) property :: Property p v => p -> Char -> v property p c = fromNative p . u_getIntPropertyValue (fromIntegral (ord c)) . toUProperty $ p {-# INLINE property #-} -- | Return the Unicode allocation block that contains the given -- character. blockCode :: Char -> BlockCode blockCode = toEnum . fromIntegral . ublock_getCode . fromIntegral . ord {-# INLINE blockCode #-} -- | Return the bidirectional category value for the codepoint, -- which is used in the Unicode bidirectional algorithm (UAX #9 -- ). direction :: Char -> Direction direction = toEnum . fromIntegral . u_charDirection . fromIntegral . ord {-# INLINE direction #-} -- | Determine whether the codepoint has the 'BidiMirrored' property. This -- property is set for characters that are commonly used in Right-To-Left -- contexts and need to be displayed with a "mirrored" glyph. isMirrored :: Char -> Bool isMirrored = asBool . u_isMirrored . fromIntegral . ord {-# INLINE isMirrored #-} -- Map the specified character to a "mirror-image" character. -- -- For characters with the 'BidiMirrored' property, implementations -- sometimes need a "poor man's" mapping to another Unicode (codepoint) -- such that the default glyph may serve as the mirror image of the default -- glyph of the specified character. This is useful for text conversion to -- and from code pages with visual order, and for displays without glyph -- selection capabilities. -- -- The return value is another Unicode codepoint that may serve as a -- mirror-image substitute, or the original character itself if there -- is no such mapping or the character lacks the 'BidiMirrored' -- property. mirror :: Char -> Char mirror = chr . fromIntegral . u_charMirror . fromIntegral . ord {-# INLINE mirror #-} combiningClass :: Char -> Int combiningClass = fromIntegral . u_getCombiningClass . fromIntegral . ord {-# INLINE combiningClass #-} -- | Return the decimal digit value of a decimal digit character. -- Such characters have the general category @Nd@ (decimal digit -- numbers) and a 'NumericType' of 'NTDecimal'. -- -- No digit values are returned for any Han characters, because Han -- number characters are often used with a special Chinese-style -- number format (with characters for powers of 10 in between) instead -- of in decimal-positional notation. Unicode 4 explicitly assigns -- Han number characters a 'NumericType' of 'NTNumeric' instead of -- 'NTDecimal'. digitToInt :: Char -> Maybe Int digitToInt c | i == -1 = Nothing | otherwise = Just $! fromIntegral i where i = u_charDigitValue . fromIntegral . ord $ c -- | Return the numeric value for a Unicode codepoint as defined in the -- Unicode Character Database. -- -- A 'Double' return type is necessary because some numeric values are -- fractions, negative, or too large to fit in a fixed-width integral type. numericValue :: Char -> Maybe Double numericValue c | v == (#const U_NO_NUMERIC_VALUE) = Nothing | otherwise = Just v where v = u_getNumericValue . fromIntegral . ord $ c -- | Return the name of a Unicode character. -- -- The names of all unassigned characters are empty. -- -- The name contains only "invariant" characters like A-Z, 0-9, space, -- and \'-\'. charName :: Char -> String charName = charName' (#const U_UNICODE_CHAR_NAME) -- | Return the full name of a Unicode character. -- -- Compared to 'charName', this function gives each Unicode codepoint -- a unique extended name. Extended names are lowercase followed by an -- uppercase hexadecimal number, within angle brackets. charFullName :: Char -> String charFullName = charName' (#const U_EXTENDED_CHAR_NAME) -- | Find a Unicode character by its full name, and return its code -- point value. -- -- The name is matched exactly and completely. -- -- A Unicode 1.0 name is matched only if it differs from the modern -- name. Unicode names are all uppercase. charFromName :: String -> Maybe Char charFromName = charFromName' (#const U_UNICODE_CHAR_NAME) -- | Find a Unicode character by its full or extended name, and return -- its codepoint value. -- -- The name is matched exactly and completely. -- -- A Unicode 1.0 name is matched only if it differs from the modern -- name. -- -- Compared to 'charFromName', this function gives each Unicode code -- point a unique extended name. Extended names are lowercase followed -- by an uppercase hexadecimal number, within angle brackets. charFromFullName :: String -> Maybe Char charFromFullName = charFromName' (#const U_EXTENDED_CHAR_NAME) charFromName' :: UCharNameChoice -> String -> Maybe Char charFromName' choice name = unsafePerformIO . withCString name $ \ptr -> do (err,r) <- withError $ u_charFromName choice ptr return $! if err == u_INVALID_CHAR_FOUND || r == 0xffff then Nothing else Just $! chr (fromIntegral r) charName' :: UCharNameChoice -> Char -> String charName' choice c = fillString $ u_charName (fromIntegral (ord c)) choice fillString :: (CString -> Int32 -> Ptr UErrorCode -> IO Int32) -> String fillString act = unsafePerformIO $ handleOverflowError 83 act (curry peekCStringLen) type UBlockCode = CInt type UCharDirection = CInt type UCharNameChoice = CInt type UProperty = CInt foreign import ccall unsafe "hs_text_icu.h __hs_ublock_getCode" ublock_getCode :: UChar32 -> UBlockCode foreign import ccall unsafe "hs_text_icu.h __hs_u_charDirection" u_charDirection :: UChar32 -> UCharDirection foreign import ccall unsafe "hs_text_icu.h __hs_u_isMirrored" u_isMirrored :: UChar32 -> UBool foreign import ccall unsafe "hs_text_icu.h __hs_u_charMirror" u_charMirror :: UChar32 -> UChar32 foreign import ccall unsafe "hs_text_icu.h __hs_u_getCombiningClass" u_getCombiningClass :: UChar32 -> Word8 foreign import ccall unsafe "hs_text_icu.h __hs_u_charDigitValue" u_charDigitValue :: UChar32 -> Int32 foreign import ccall unsafe "hs_text_icu.h __hs_u_charName" u_charName :: UChar32 -> UCharNameChoice -> CString -> Int32 -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_u_charFromName" u_charFromName :: UCharNameChoice -> CString -> Ptr UErrorCode -> IO UChar32 foreign import ccall unsafe "hs_text_icu.h __hs_u_getIntPropertyValue" u_getIntPropertyValue :: UChar32 -> UProperty -> Int32 foreign import ccall unsafe "hs_text_icu.h __hs_u_getNumericValue" u_getNumericValue :: UChar32 -> Double text-icu-0.8.0.4/Data/Text/ICU/CharsetDetection.hsc0000644000000000000000000001053207346545000017761 0ustar0000000000000000{-# LANGUAGE ForeignFunctionInterface #-} -- | -- Module : Data.Text.ICU.CharsetDetection -- Copyright : (c) 2017 Zac Slade -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Access to the Unicode Character Set Detection facilities, implemented in -- the International Components for Unicode (ICU) libraries. -- -- For more information see the \"Character Set Detection\" chapter -- in the ICU User Guide -- . module Data.Text.ICU.CharsetDetection ( setText , detect , mkCharsetDetector , withCharsetDetector , wrapUCharsetMatch , CharsetMatch , CharsetDetector , getConfidence , getName , getLanguage ) where import Foreign.Ptr (Ptr) import Foreign.C.String (CString) import Foreign.C.Types (CChar) import qualified Data.ByteString as BS import Data.ByteString (ByteString) import qualified Data.Text.Encoding as TE import Data.Text (Text) import Data.Text.ICU.Error.Internal (UErrorCode, handleError) import Data.Text.ICU.CharsetDetection.Internal (UCharsetMatch, UCharsetDetector, CharsetDetector, CharsetMatch, mkCharsetDetector, withCharsetDetector, withCharsetMatch, wrapUCharsetMatch) #include -- | From the ICU C API documentation: -- "Character set detection is at best an imprecise operation. The -- detection process will attempt to identify the charset that best matches -- the characteristics of the byte data, but the process is partly statistical -- in nature, and the results can not be guaranteed to always be correct. -- -- For best accuracy in charset detection, the input data should be primarily -- in a single language, and a minimum of a few hundred bytes worth of plain -- text in the language are needed. The detection process will attempt to -- ignore html or xml style markup that could otherwise obscure the content." -- | Use the first 512 bytes, if available, as the text in the -- 'CharsetDetector' object. This function is low-level and used by the more -- high-level 'detect' function. setText :: ByteString -> CharsetDetector -> IO () setText bs ucsd = withCharsetDetector ucsd go where go u = if BS.length bs < 512 then BS.useAsCStringLen bs (\(text,size) -> handleError $ ucsdet_setText u text size) else BS.useAsCStringLen (BS.take 512 bs) (\(text,size) -> handleError $ ucsdet_setText u text size) -- | Attempt to perform a detection without an input filter. The best match -- will be returned. detect :: ByteString -> IO CharsetMatch detect bs = do ucsd <- mkCharsetDetector setText bs ucsd wrapUCharsetMatch ucsd $ withCharsetDetector ucsd (handleError . ucsdet_detect) -- | See the confidence score from 0-100 of the 'CharsetMatch' object. getConfidence :: CharsetMatch -> IO Int getConfidence ucm = withCharsetMatch ucm $ handleError . ucsdet_getConfidence -- | Extract the character set encoding name from the 'CharsetMatch' -- object. getName :: CharsetMatch -> IO Text getName ucsm = do bs <- withCharsetMatch ucsm (handleError . ucsdet_getName) >>= BS.packCString return $ TE.decodeUtf8 bs -- | Extracts the three letter ISO code for the language encoded in the -- 'CharsetMatch'. getLanguage :: CharsetMatch -> IO Text getLanguage ucsm = do bs <- withCharsetMatch ucsm (handleError . ucsdet_getLanguage) >>= BS.packCString return $ TE.decodeUtf8 bs foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_setText" ucsdet_setText :: Ptr UCharsetDetector -> Ptr CChar -> Int -> Ptr UErrorCode -> IO () foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_detect" ucsdet_detect :: Ptr UCharsetDetector -> Ptr UErrorCode -> IO (Ptr UCharsetMatch) foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_getConfidence" ucsdet_getConfidence :: Ptr UCharsetMatch -> Ptr UErrorCode -> IO Int foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_getName" ucsdet_getName :: Ptr UCharsetMatch -> Ptr UErrorCode -> IO CString foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_getLanguage" ucsdet_getLanguage :: Ptr UCharsetMatch -> Ptr UErrorCode -> IO CString text-icu-0.8.0.4/Data/Text/ICU/CharsetDetection/0000755000000000000000000000000007346545000017261 5ustar0000000000000000text-icu-0.8.0.4/Data/Text/ICU/CharsetDetection/Internal.hsc0000644000000000000000000000636007346545000021541 0ustar0000000000000000{-# LANGUAGE DeriveDataTypeable, ForeignFunctionInterface, EmptyDataDecls #-} -- | -- Module : Data.Text.ICU.CharsetDetection.Internal -- Copyright : (c) 2017 Zac Slade -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Access to the Unicode Character Set Detection facilities, implemented in -- the International Components for Unicode (ICU) libraries. -- -- For more information see the \"Character Set Detection\" chapter -- in the ICU User Guide -- . module Data.Text.ICU.CharsetDetection.Internal ( UCharsetDetector , UCharsetMatch , CharsetMatch(..) , CharsetDetector(..) , withCharsetDetector , wrapUCharsetDetector , wrapUCharsetMatch , mkCharsetDetector , withCharsetMatch ) where import Data.Typeable (Typeable) import Foreign.ForeignPtr (ForeignPtr, withForeignPtr) import Foreign.Ptr (FunPtr, Ptr) import Data.Text.ICU.Error.Internal (UErrorCode, handleError) import Data.Text.ICU.Internal (newICUPtr) #include -- | Opaque handle to a character set detector data UCharsetDetector -- | Handy wrapper for the pointer to the 'UCharsetDetector'. We must -- always call ucsdet_close on any UCharsetDetector when we are done. The -- 'withCharsetDetector' and 'wrapUCharsetDetector' functions simplify -- management of the pointers. data CharsetDetector = CharsetDetector { charsetDetectorPtr :: {-# UNPACK #-} !(ForeignPtr UCharsetDetector) } deriving (Typeable) mkCharsetDetector :: IO CharsetDetector mkCharsetDetector = wrapUCharsetDetector $ handleError ucsdet_open -- | Temporarily unwraps an 'CharsetDetector' to perform operations on its -- raw 'UCharsetDetector' handle. withCharsetDetector :: CharsetDetector -> (Ptr UCharsetDetector -> IO a) -> IO a withCharsetDetector (CharsetDetector ucsd) = withForeignPtr ucsd {-# INLINE withCharsetDetector #-} -- | Wraps a raw 'UCharsetDetector' in an 'CharsetDetector', closing the -- handle when the last reference to the object is dropped. wrapUCharsetDetector :: IO (Ptr UCharsetDetector) -> IO CharsetDetector wrapUCharsetDetector = newICUPtr CharsetDetector ucsdet_close {-# INLINE wrapUCharsetDetector #-} -- | Opaque handle to a character set match data UCharsetMatch -- | Opaque character set match handle. The memory backing these objects is -- managed entirely by the ICU C library. -- TODO: UCharsetMatch is reset after the setText call. We need to handle it. data CharsetMatch = CharsetMatch { charsetMatchPtr :: {-# UNPACK #-} !(Ptr UCharsetMatch) , charsetMatchDetector :: CharsetDetector -- ^ keep reference since UCharsetMatch object is owned -- by the UCharsetDetector. } deriving (Typeable) wrapUCharsetMatch :: CharsetDetector -> IO (Ptr UCharsetMatch) -> IO CharsetMatch wrapUCharsetMatch cd = fmap $ flip CharsetMatch cd withCharsetMatch :: CharsetMatch -> (Ptr UCharsetMatch -> IO a) -> IO a withCharsetMatch (CharsetMatch ucsm _) f = f ucsm foreign import ccall unsafe "hs_text_icu.h &__hs_ucsdet_close" ucsdet_close :: FunPtr (Ptr UCharsetDetector -> IO ()) foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_open" ucsdet_open :: Ptr UErrorCode -> IO (Ptr UCharsetDetector) text-icu-0.8.0.4/Data/Text/ICU/Collate.hsc0000644000000000000000000003756207346545000016130 0ustar0000000000000000{-# LANGUAGE BangPatterns, CPP, DeriveDataTypeable, ForeignFunctionInterface #-} -- | -- Module : Data.Text.ICU.Collate -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- String collation functions for Unicode, implemented as bindings to -- the International Components for Unicode (ICU) libraries. module Data.Text.ICU.Collate ( -- * Unicode collation API -- $api -- * Types MCollator , Attribute(..) , AlternateHandling(..) , CaseFirst(..) , Strength(..) -- * Functions , open , openRules , collate , collateIter -- ** Utility functions , getRules , getAttribute , setAttribute , sortKey , clone , freeze ) where #include import Control.DeepSeq (NFData(..)) import Data.ByteString (empty) import Data.ByteString.Internal (ByteString(..), create, mallocByteString, memcpy) import Data.Int (Int32) import Data.Text (Text) import qualified Data.Text as T import Data.Text.Foreign (useAsPtr) import Data.Text.ICU.Collate.Internal (Collator(..), MCollator, UCollator, withCollator, wrap) import Data.Text.ICU.Error (u_INVALID_FORMAT_ERROR) import Data.Text.ICU.Error.Internal (UErrorCode, UParseError, handleError, handleParseError) import Data.Text.ICU.Internal (LocaleName, UChar, CharIterator, UCharIterator, asOrdering, fromUCharPtr, withCharIterator, withLocaleName, useAsUCharPtr) import Data.Typeable (Typeable) import Data.Word (Word8) import Foreign.C.String (CString) import Foreign.C.Types (CInt(..)) import Foreign.ForeignPtr (withForeignPtr) import Foreign.Marshal.Alloc (alloca) import Foreign.Marshal.Utils (with) import Foreign.Ptr (Ptr, nullPtr) import Foreign.Storable (peek) -- $api -- -- | Control the handling of variable weight elements. data AlternateHandling = NonIgnorable -- ^ Treat all codepoints with non-ignorable primary -- weights in the same way. | Shifted -- ^ Cause codepoints with primary weights that are -- equal to or below the variable top value to be -- ignored on primary level and moved to the -- quaternary level. deriving (Eq, Bounded, Enum, Show, Typeable) instance NFData AlternateHandling where rnf !_ = () -- | Control the ordering of upper and lower case letters. data CaseFirst = UpperFirst -- ^ Force upper case letters to sort before -- lower case. | LowerFirst -- ^ Force lower case letters to sort before -- upper case. deriving (Eq, Bounded, Enum, Show, Typeable) instance NFData CaseFirst where rnf !_ = () -- | The strength attribute. The usual strength for most locales (except -- Japanese) is tertiary. Quaternary strength is useful when combined with -- shifted setting for alternate handling attribute and for JIS x 4061 -- collation, when it is used to distinguish between Katakana and Hiragana -- (this is achieved by setting 'HiraganaQuaternaryMode' mode to -- 'True'). Otherwise, quaternary level is affected only by the number of -- non ignorable codepoints in the string. Identical strength is rarely -- useful, as it amounts to codepoints of the 'NFD' form of the string. data Strength = Primary | Secondary | Tertiary | Quaternary | Identical deriving (Eq, Bounded, Enum, Show, Typeable) instance NFData Strength where rnf !_ = () data Attribute = French Bool -- ^ Direction of secondary weights, used in French. 'True', -- results in secondary weights being considered backwards, -- while 'False' treats secondary weights in the order in -- which they appear. | AlternateHandling AlternateHandling -- ^ For handling variable elements. 'NonIgnorable' is -- default. | CaseFirst (Maybe CaseFirst) -- ^ Control the ordering of upper and lower case letters. -- 'Nothing' (the default) orders upper and lower case -- letters in accordance to their tertiary weights. | CaseLevel Bool -- ^ Controls whether an extra case level (positioned -- before the third level) is generated or not. When -- 'False' (default), case level is not generated; when -- 'True', the case level is generated. Contents of the -- case level are affected by the value of the 'CaseFirst' -- attribute. A simple way to ignore accent differences in -- a string is to set the strength to 'Primary' and enable -- case level. | NormalizationMode Bool -- ^ Controls whether the normalization check and necessary -- normalizations are performed. When 'False' (default) no -- normalization check is performed. The correctness of the -- result is guaranteed only if the input data is in -- so-called 'FCD' form (see users manual for more info). -- When 'True', an incremental check is performed to see -- whether the input data is in 'FCD' form. If the data is -- not in 'FCD' form, incremental 'NFD' normalization is -- performed. | Strength Strength | HiraganaQuaternaryMode Bool -- ^ When turned on, this attribute positions Hiragana -- before all non-ignorables on quaternary level. This is a -- sneaky way to produce JIS sort order. | Numeric Bool -- ^ When enabled, this attribute generates a collation key -- for the numeric value of substrings of digits. This is -- a way to get '100' to sort /after/ '2'. deriving (Eq, Show, Typeable) instance NFData Attribute where rnf (French !_) = () rnf (AlternateHandling !_) = () rnf (CaseFirst c) = rnf c rnf (CaseLevel !_) = () rnf (NormalizationMode !_) = () rnf (Strength !_) = () rnf (HiraganaQuaternaryMode !_) = () rnf (Numeric !_) = () type UColAttribute = CInt type UColAttributeValue = CInt type UCollationStrength = UColAttributeValue toUAttribute :: Attribute -> (UColAttribute, UColAttributeValue) toUAttribute (French v) = ((#const UCOL_FRENCH_COLLATION), toOO v) toUAttribute (AlternateHandling v) = ((#const UCOL_ALTERNATE_HANDLING), toAH v) toUAttribute (CaseFirst v) = ((#const UCOL_CASE_FIRST), toCF v) toUAttribute (CaseLevel v) = ((#const UCOL_CASE_LEVEL), toOO v) toUAttribute (NormalizationMode v) = ((#const UCOL_NORMALIZATION_MODE), toOO v) toUAttribute (Strength v) = ((#const UCOL_STRENGTH), toS v) toUAttribute (HiraganaQuaternaryMode v) = ((#const UCOL_HIRAGANA_QUATERNARY_MODE), toOO v) toUAttribute (Numeric v) = ((#const UCOL_NUMERIC_COLLATION), toOO v) toOO :: Bool -> UColAttributeValue toOO False = #const UCOL_OFF toOO True = #const UCOL_ON toDefaultOO :: (Maybe Bool) -> UColAttributeValue toDefaultOO (Just False) = #const UCOL_OFF toDefaultOO (Just True) = #const UCOL_ON toDefaultOO Nothing = #const UCOL_DEFAULT toAH :: AlternateHandling -> UColAttributeValue toAH NonIgnorable = #const UCOL_NON_IGNORABLE toAH Shifted = #const UCOL_SHIFTED toCF :: Maybe CaseFirst -> UColAttributeValue toCF Nothing = #const UCOL_OFF toCF (Just UpperFirst) = #const UCOL_UPPER_FIRST toCF (Just LowerFirst) = #const UCOL_LOWER_FIRST toS :: Strength -> UColAttributeValue toS Primary = #const UCOL_PRIMARY toS Secondary = #const UCOL_SECONDARY toS Tertiary = #const UCOL_TERTIARY toS Quaternary = #const UCOL_QUATERNARY toS Identical = #const UCOL_IDENTICAL toDefaultS :: Maybe Strength -> UColAttributeValue toDefaultS (Just s) = toS s toDefaultS Nothing = #const UCOL_DEFAULT_STRENGTH fromOO :: UColAttributeValue -> Bool fromOO (#const UCOL_OFF) = False fromOO (#const UCOL_ON) = True fromOO bad = valueError "fromOO" bad fromAH :: UColAttributeValue -> AlternateHandling fromAH (#const UCOL_NON_IGNORABLE) = NonIgnorable fromAH (#const UCOL_SHIFTED) = Shifted fromAH bad = valueError "fromAH" bad fromCF :: UColAttributeValue -> Maybe CaseFirst fromCF (#const UCOL_OFF) = Nothing fromCF (#const UCOL_UPPER_FIRST) = Just UpperFirst fromCF (#const UCOL_LOWER_FIRST) = Just LowerFirst fromCF bad = valueError "fromCF" bad fromS :: UColAttributeValue -> Strength fromS (#const UCOL_PRIMARY) = Primary fromS (#const UCOL_SECONDARY) = Secondary fromS (#const UCOL_TERTIARY) = Tertiary fromS (#const UCOL_QUATERNARY) = Quaternary fromS (#const UCOL_IDENTICAL) = Identical fromS bad = valueError "fromS" bad fromUAttribute :: UColAttribute -> UColAttributeValue -> Attribute fromUAttribute key val = case key of (#const UCOL_FRENCH_COLLATION) -> French (fromOO val) (#const UCOL_ALTERNATE_HANDLING) -> AlternateHandling (fromAH val) (#const UCOL_CASE_FIRST) -> CaseFirst (fromCF val) (#const UCOL_CASE_LEVEL) -> CaseLevel (fromOO val) (#const UCOL_NORMALIZATION_MODE) -> NormalizationMode (fromOO val) (#const UCOL_STRENGTH) -> Strength (fromS val) (#const UCOL_HIRAGANA_QUATERNARY_MODE) -> HiraganaQuaternaryMode (fromOO val) (#const UCOL_NUMERIC_COLLATION) -> Numeric (fromOO val) _ -> valueError "fromUAttribute" key valueError :: Show a => String -> a -> z valueError func bad = error ("Data.Text.ICU.Collate." ++ func ++ ": invalid value " ++ show bad) type UCollationResult = CInt -- | Open a 'Collator' for comparing strings. open :: LocaleName -- ^ The locale containing the required collation rules. -> IO MCollator open loc = wrap $ withLocaleName loc (handleError . ucol_open) -- | Produce a 'Collator' instance according to the rules supplied. openRules :: Text -- ^ A string describing the collation rules. -> Maybe Bool -- ^ The normalization mode: One of 'Just False' (expect the text to not need normalization) -- 'Just True' (normalize), or 'Nothing' (set the mode according to the rules) -> Maybe Strength -- ^ The default collation strength; one of 'Just Primary', 'Just Secondary', 'Just Tertiary', 'Just Identical', 'Nothing' (default strength) - can be also set in the rules. -> IO MCollator openRules r n s = wrap $ useAsUCharPtr r $ \rPtr rLen -> do let len = fromIntegral rLen handleParseError (== u_INVALID_FORMAT_ERROR) $ ucol_openRules rPtr len (toDefaultOO n) (toDefaultS s) -- | Get the rules of an 'MCollator' attribute. getRules :: MCollator -> IO Text getRules c = withCollator c $ \cPtr -> alloca $ \lenPtr -> do textPtr <- ucol_getRules cPtr lenPtr (fromUCharPtr textPtr . fromIntegral) =<< peek lenPtr -- | Set the value of an 'MCollator' attribute. setAttribute :: MCollator -> Attribute -> IO () setAttribute c a = withCollator c $ \cptr -> handleError $ uncurry (ucol_setAttribute cptr) (toUAttribute a) -- | Get the value of an 'MCollator' attribute. -- -- It is safe to provide a dummy argument to an 'Attribute' constructor when -- using this function, so the following will work: -- -- > getAttribute mcol (NormalizationMode undefined) getAttribute :: MCollator -> Attribute -> IO Attribute getAttribute c a = do let name = fst (toUAttribute a) val <- withCollator c $ \cptr -> handleError $ ucol_getAttribute cptr name return $! fromUAttribute name val -- | Compare two strings. collate :: MCollator -> Text -> Text -> IO Ordering collate c a b = withCollator c $ \cptr -> useAsPtr a $ \aptr alen -> useAsPtr b $ \bptr blen -> fmap asOrdering . handleError $ #if MIN_VERSION_text(2,0,0) ucol_strcollUTF8 #else ucol_strcoll #endif cptr aptr (fromIntegral alen) bptr (fromIntegral blen) -- | Compare two 'CharIterator's. -- -- If either iterator was constructed from a 'ByteString', it does not need -- to be copied or converted internally, so this function can be quite -- cheap. collateIter :: MCollator -> CharIterator -> CharIterator -> IO Ordering collateIter c a b = fmap asOrdering . withCollator c $ \cptr -> withCharIterator a $ \ai -> withCharIterator b $ handleError . ucol_strcollIter cptr ai -- | Create a key for sorting the 'Text' using the given 'Collator'. -- The result of comparing two 'ByteString's that have been -- transformed with 'sortKey' will be the same as the result of -- 'collate' on the two untransformed 'Text's. sortKey :: MCollator -> Text -> IO ByteString sortKey c t | T.null t = return empty | otherwise = do withCollator c $ \cptr -> useAsUCharPtr t $ \tptr tlen -> do let len = fromIntegral tlen loop n = do fp <- mallocByteString (fromIntegral n) i <- withForeignPtr fp $ \p -> ucol_getSortKey cptr tptr len p n let j = fromIntegral i case undefined of _ | i == 0 -> error "Data.Text.ICU.Collate.sortKey: internal error" | i > n -> loop i | i <= n `div` 2 -> create j $ \p -> withForeignPtr fp $ \op -> memcpy p op (fromIntegral i) | otherwise -> return $! PS fp 0 j loop (min (len * 4) 8) -- | Make a safe copy of a mutable 'MCollator' for use in pure code. -- Subsequent changes to the 'MCollator' will not affect the state of -- the returned 'Collator'. freeze :: MCollator -> IO Collator freeze = fmap C . clone -- | Make a copy of a mutable 'MCollator'. -- Subsequent changes to the input 'MCollator' will not affect the state of -- the returned 'MCollator'. clone :: MCollator -> IO MCollator clone c = wrap $ withCollator c $ \cptr -> with (#const U_COL_SAFECLONE_BUFFERSIZE) (handleError . ucol_safeClone cptr nullPtr) foreign import ccall unsafe "hs_text_icu.h __hs_ucol_open" ucol_open :: CString -> Ptr UErrorCode -> IO (Ptr UCollator) foreign import ccall unsafe "hs_text_icu.h __hs_ucol_openRules" ucol_openRules :: Ptr UChar -> Int32 -> UColAttributeValue -> UCollationStrength -> Ptr UParseError -> Ptr UErrorCode -> IO (Ptr UCollator) foreign import ccall unsafe "hs_text_icu.h __hs_ucol_getAttribute" ucol_getAttribute :: Ptr UCollator -> UColAttribute -> Ptr UErrorCode -> IO UColAttributeValue foreign import ccall unsafe "hs_text_icu.h __hs_ucol_getRules" ucol_getRules :: Ptr UCollator -> Ptr Int32 -> IO (Ptr UChar) foreign import ccall unsafe "hs_text_icu.h __hs_ucol_setAttribute" ucol_setAttribute :: Ptr UCollator -> UColAttribute -> UColAttributeValue -> Ptr UErrorCode -> IO () #if MIN_VERSION_text(2,0,0) foreign import ccall unsafe "hs_text_icu.h __hs_ucol_strcollUTF8" ucol_strcollUTF8 :: Ptr UCollator -> Ptr Word8 -> Int32 -> Ptr Word8 -> Int32 -> Ptr UErrorCode -> IO UCollationResult #else foreign import ccall unsafe "hs_text_icu.h __hs_ucol_strcoll" ucol_strcoll :: Ptr UCollator -> Ptr UChar -> Int32 -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO UCollationResult #endif foreign import ccall unsafe "hs_text_icu.h __hs_ucol_getSortKey" ucol_getSortKey :: Ptr UCollator -> Ptr UChar -> Int32 -> Ptr Word8 -> Int32 -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ucol_strcollIter" ucol_strcollIter :: Ptr UCollator -> Ptr UCharIterator -> Ptr UCharIterator -> Ptr UErrorCode -> IO UCollationResult foreign import ccall unsafe "hs_text_icu.h __hs_ucol_safeClone" ucol_safeClone :: Ptr UCollator -> Ptr a -> Ptr Int32 -> Ptr UErrorCode -> IO (Ptr UCollator) text-icu-0.8.0.4/Data/Text/ICU/Collate/0000755000000000000000000000000007346545000015414 5ustar0000000000000000text-icu-0.8.0.4/Data/Text/ICU/Collate/Internal.hs0000644000000000000000000000241507346545000017526 0ustar0000000000000000{-# LANGUAGE DeriveDataTypeable, EmptyDataDecls, ForeignFunctionInterface #-} -- | -- Module : Data.Text.ICU.Collate.Internal -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Internals of the string collation infrastructure. module Data.Text.ICU.Collate.Internal ( -- * Unicode collation API MCollator(..) , Collator(..) , UCollator , withCollator , wrap ) where import Data.Typeable (Typeable) import Foreign.ForeignPtr (ForeignPtr, withForeignPtr) import Foreign.Ptr (FunPtr, Ptr) import Data.Text.ICU.Internal (newICUPtr) -- $api -- data UCollator -- | String collator type. data MCollator = MCollator {-# UNPACK #-} !(ForeignPtr UCollator) deriving (Typeable) -- | String collator type. newtype Collator = C MCollator deriving (Typeable) withCollator :: MCollator -> (Ptr UCollator -> IO a) -> IO a withCollator (MCollator col) action = withForeignPtr col action {-# INLINE withCollator #-} wrap :: IO (Ptr UCollator) -> IO MCollator wrap = newICUPtr MCollator ucol_close {-# INLINE wrap #-} foreign import ccall unsafe "hs_text_icu.h &__hs_ucol_close" ucol_close :: FunPtr (Ptr UCollator -> IO ()) text-icu-0.8.0.4/Data/Text/ICU/Collate/Pure.hs0000644000000000000000000000651107346545000016666 0ustar0000000000000000{-# LANGUAGE DeriveDataTypeable, ForeignFunctionInterface, ScopedTypeVariables #-} -- | -- Module : Data.Text.ICU.Collate.Pure -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Pure string collation functions for Unicode, implemented as -- bindings to the International Components for Unicode (ICU) -- libraries. -- -- For the impure collation API (which is richer, but less easy to -- use), see the "Data.Text.ICU.Collate" module. module Data.Text.ICU.Collate.Pure ( -- * Unicode collation API -- $api Collator , collator , collatorWith , collatorFromRules , collatorFromRulesWith , collate , collateIter , rules , sortKey , uca ) where import qualified Control.Exception as E import Control.Monad (forM_) import Data.ByteString (ByteString) import Data.Text (Text) import Data.Text.ICU.Error.Internal (ParseError(..)) import Data.Text.ICU.Collate.Internal (Collator(..)) import Data.Text.ICU.Internal (CharIterator, LocaleName(..)) import System.IO.Unsafe (unsafePerformIO) import qualified Data.Text.ICU.Collate as IO -- $api -- -- | Create an immutable 'Collator' for comparing strings. -- -- If 'Root' is passed as the locale, UCA collation rules will be -- used. collator :: LocaleName -> Collator collator loc = unsafePerformIO $ C `fmap` IO.open loc -- | Create an immutable 'Collator' with the given 'Attribute's. collatorWith :: LocaleName -> [IO.Attribute] -> Collator collatorWith loc atts = unsafePerformIO $ do mc <- IO.open loc forM_ atts $ IO.setAttribute mc return (C mc) -- | Create an immutable 'Collator' from the given collation rules. collatorFromRules :: Text -> Either ParseError Collator collatorFromRules rul = collatorFromRulesWith rul [] -- | Create an immutable 'Collator' from the given collation rules with the given 'Attribute's. collatorFromRulesWith :: Text -> [IO.Attribute] -> Either ParseError Collator collatorFromRulesWith rul atts = unsafePerformIO $ (Right `fmap` openAndSetAtts) `E.catch` \(err::ParseError) -> return (Left err) where openAndSetAtts = do mc <- IO.openRules rul Nothing Nothing forM_ atts $ IO.setAttribute mc return (C mc) -- | Get rules for the given 'Collator'. rules :: Collator -> Text rules (C c) = unsafePerformIO $ IO.getRules c -- | Compare two strings. collate :: Collator -> Text -> Text -> Ordering collate (C c) a b = unsafePerformIO $ IO.collate c a b {-# INLINE collate #-} -- | Compare two 'CharIterator's. -- -- If either iterator was constructed from a 'ByteString', it does not -- need to be copied or converted beforehand, so this function can be -- quite cheap. collateIter :: Collator -> CharIterator -> CharIterator -> Ordering collateIter (C c) a b = unsafePerformIO $ IO.collateIter c a b {-# INLINE collateIter #-} -- | Create a key for sorting the 'Text' using the given 'Collator'. -- The result of comparing two 'ByteString's that have been -- transformed with 'sortKey' will be the same as the result of -- 'collate' on the two untransformed 'Text's. sortKey :: Collator -> Text -> ByteString sortKey (C c) = unsafePerformIO . IO.sortKey c {-# INLINE sortKey #-} -- | A 'Collator' that uses the Unicode Collation Algorithm (UCA). uca :: Collator uca = collator Root {-# NOINLINE uca #-} text-icu-0.8.0.4/Data/Text/ICU/Convert.hs0000644000000000000000000002416507346545000016015 0ustar0000000000000000{-# LANGUAGE ForeignFunctionInterface, CPP #-} -- | -- Module : Data.Text.ICU.Convert -- Copyright : (c) 2009, 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Character set conversion functions for Unicode, implemented as -- bindings to the International Components for Unicode (ICU) -- libraries. module Data.Text.ICU.Convert ( -- * Character set conversion Converter -- ** Basic functions , open , fromUnicode , toUnicode -- ** Converter metadata , getName , usesFallback , isAmbiguous -- * Functions for controlling global behavior , getDefaultName , setDefaultName -- * Miscellaneous functions , compareNames , aliases -- * Metadata , converterNames , standardNames ) where import Data.ByteString.Internal (ByteString, createAndTrim) import Data.ByteString.Unsafe (unsafeUseAsCStringLen) import Data.Int (Int32) import Data.Text (Text) import Data.Text.Foreign (fromPtr, useAsPtr) #if !MIN_VERSION_text(2,0,0) import Data.Text.ICU.Internal (UChar) #endif import Data.Text.ICU.Internal (lengthWord) import Data.Text.ICU.Convert.Internal import Data.Text.ICU.Error.Internal (UErrorCode, handleError) import Data.Word (Word8, Word16) import Foreign.C.String (CString, peekCString, withCString) import Foreign.C.Types (CInt(..)) import Foreign.Marshal.Array (allocaArray) import Foreign.Ptr (FunPtr, Ptr) import System.IO.Unsafe (unsafePerformIO) import Data.Text.ICU.Internal (UBool, asBool, asOrdering, withName, newICUPtr) -- | Do a fuzzy compare of two converter/alias names. The comparison -- is case-insensitive, ignores leading zeroes if they are not -- followed by further digits, and ignores all but letters and digits. -- Thus the strings @\"UTF-8\"@, @\"utf_8\"@, @\"u*T\@f08\"@ and -- @\"Utf 8\"@ are exactly equivalent. See section 1.4, Charset Alias -- Matching in Unicode Technical Standard #22 at -- compareNames :: String -> String -> Ordering compareNames a b = unsafePerformIO . withCString a $ \aptr -> fmap asOrdering . withCString b $ ucnv_compareNames aptr -- | Create a 'Converter' with the name of a coded character set -- specified as a string. The actual name will be resolved with the -- alias file using a case-insensitive string comparison that ignores -- leading zeroes and all non-alphanumeric characters. E.g., the -- names @\"UTF8\"@, @\"utf-8\"@, @\"u*T\@f08\"@ and @\"Utf 8\"@ are -- all equivalent (see also 'compareNames'). If an empty string is -- passed for the converter name, it will create one with the -- 'getDefaultName' return value. -- -- A converter name may contain options like a locale specification to -- control the specific behavior of the newly instantiated converter. -- The meaning of the options depends on the particular converter. If -- an option is not defined for or recognized by a given converter, -- then it is ignored. -- -- Options are appended to the converter name string, with a comma -- between the name and the first option and also between adjacent -- options. -- -- If the alias is ambiguous, then the preferred converter is used. -- -- The conversion behavior and names can vary between platforms. ICU -- may convert some characters differently from other -- platforms. Details on this topic are in the ICU User's Guide at -- . Aliases -- starting with a @\"cp\"@ prefix have no specific meaning other than -- its an alias starting with the letters @\"cp\"@. Please do not -- associate any meaning to these aliases. open :: String -- ^ Name of the converter to use. -> Maybe Bool -- ^ Whether to use fallback mappings -- (see 'usesFallback' for details). -> IO Converter open name mf = do c <- newICUPtr Converter ucnv_close $ withName name (handleError . ucnv_open) case mf of Just f -> withConverter c $ \p -> ucnv_setFallback p . fromIntegral . fromEnum $ f _ -> return () return c -- | Encode a Unicode string into a code page string using the given converter. fromUnicode :: Converter -> Text -> ByteString fromUnicode cnv t = unsafePerformIO . useAsPtr t $ \tptr tlen -> withConverter cnv $ \cptr -> do let capacity = fromIntegral . ucnv_max_bytes_for_string cptr . fromIntegral $ lengthWord t createAndTrim (fromIntegral capacity) $ \sptr -> fmap fromIntegral . handleError $ #if MIN_VERSION_text(2,0,0) ucnv_fromAlgorithmic_UTF8 #else ucnv_fromUChars #endif cptr sptr capacity tptr (fromIntegral tlen) -- | Decode an encoded string into a Unicode string using the given converter. toUnicode :: Converter -> ByteString -> Text toUnicode cnv bs = unsafePerformIO . unsafeUseAsCStringLen bs $ \(sptr, slen) -> withConverter cnv $ \cptr -> do let (capacity, conv) = #if MIN_VERSION_text(2,0,0) (slen * 4, ucnv_toAlgorithmic_UTF8) #else (slen * 2, ucnv_toUChars) #endif allocaArray capacity $ \tptr -> fromPtr tptr =<< (fmap fromIntegral . handleError $ conv cptr tptr (fromIntegral capacity) sptr (fromIntegral slen)) -- | Determines whether the converter uses fallback mappings or not. -- This flag has restrictions. Regardless of this flag, the converter -- will always use fallbacks from Unicode Private Use codepoints, as -- well as reverse fallbacks (to Unicode). For details see \".ucm -- File Format\" in the Conversion Data chapter of the ICU User Guide: -- usesFallback :: Converter -> Bool usesFallback cnv = unsafePerformIO $ asBool `fmap` withConverter cnv ucnv_usesFallback -- | Returns the current default converter name. If you want to 'open' -- a default converter, you do not need to use this function. It is -- faster to pass the empty string to 'open' the default converter. getDefaultName :: IO String getDefaultName = peekCString =<< ucnv_getDefaultName -- | Indicates whether the converter contains ambiguous mappings of -- the same character or not. isAmbiguous :: Converter -> Bool isAmbiguous cnv = asBool . unsafePerformIO $ withConverter cnv ucnv_isAmbiguous -- | Sets the current default converter name. If this function needs -- to be called, it should be called during application -- initialization. Most of the time, the results from 'getDefaultName' -- or 'open' with an empty string argument is sufficient for your -- application. -- -- /Note/: this function is not thread safe. /Do not/ call this -- function when /any/ ICU function is being used from more than one -- thread! setDefaultName :: String -> IO () setDefaultName s = withCString s $ ucnv_setDefaultName -- | A list of the canonical names of all available converters. converterNames :: [String] {-# NOINLINE converterNames #-} converterNames = unsafePerformIO $ mapM ((peekCString =<<) . ucnv_getAvailableName) [0..ucnv_countAvailable-1] -- | The list of supported standard names. standardNames :: [String] {-# NOINLINE standardNames #-} standardNames = filter (not . null) . unsafePerformIO $ mapM ((peekCString =<<) . handleError . ucnv_getStandard) [0..ucnv_countStandards-1] -- | Return the aliases for a given converter or alias name. aliases :: String -> [String] aliases name = unsafePerformIO . withCString name $ \ptr -> do count <- handleError $ ucnv_countAliases ptr if count == 0 then return [] else mapM ((peekCString =<<) . handleError . ucnv_getAlias ptr) [0..count-1] foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_open" ucnv_open :: CString -> Ptr UErrorCode -> IO (Ptr UConverter) foreign import ccall unsafe "hs_text_icu.h &__hs_ucnv_close" ucnv_close :: FunPtr (Ptr UConverter -> IO ()) foreign import ccall unsafe "__hs_ucnv_get_max_bytes_for_string" ucnv_max_bytes_for_string :: Ptr UConverter -> CInt -> CInt #if MIN_VERSION_text(2,0,0) foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_toAlgorithmic_UTF8" ucnv_toAlgorithmic_UTF8 :: Ptr UConverter -> Ptr Word8 -> Int32 -> CString -> Int32 -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_fromAlgorithmic_UTF8" ucnv_fromAlgorithmic_UTF8 :: Ptr UConverter -> Ptr Word8 -> Int32 -> Ptr Word8 -> Int32 -> Ptr UErrorCode -> IO Int32 #else foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_toUChars" ucnv_toUChars :: Ptr UConverter -> Ptr UChar -> Int32 -> CString -> Int32 -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_fromUChars" ucnv_fromUChars :: Ptr UConverter -> Ptr Word8 -> Int32 -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO Int32 #endif foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_compareNames" ucnv_compareNames :: CString -> CString -> IO CInt foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_getDefaultName" ucnv_getDefaultName :: IO CString foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_setDefaultName" ucnv_setDefaultName :: CString -> IO () foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_countAvailable" ucnv_countAvailable :: Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_getAvailableName" ucnv_getAvailableName :: Int32 -> IO CString foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_countAliases" ucnv_countAliases :: CString -> Ptr UErrorCode -> IO Word16 foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_getAlias" ucnv_getAlias :: CString -> Word16 -> Ptr UErrorCode -> IO CString foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_countStandards" ucnv_countStandards :: Word16 foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_getStandard" ucnv_getStandard :: Word16 -> Ptr UErrorCode -> IO CString foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_usesFallback" ucnv_usesFallback :: Ptr UConverter -> IO UBool foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_setFallback" ucnv_setFallback :: Ptr UConverter -> UBool -> IO () foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_isAmbiguous" ucnv_isAmbiguous :: Ptr UConverter -> IO UBool text-icu-0.8.0.4/Data/Text/ICU/Convert/0000755000000000000000000000000007346545000015451 5ustar0000000000000000text-icu-0.8.0.4/Data/Text/ICU/Convert/Internal.hs0000644000000000000000000000307107346545000017562 0ustar0000000000000000{-# LANGUAGE DeriveDataTypeable, EmptyDataDecls, ForeignFunctionInterface #-} -- | -- Module : Data.Text.ICU.Convert.Internal -- Copyright : (c) Bryan O'Sullivan 2009 -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Low-level character set types and functions. module Data.Text.ICU.Convert.Internal ( Converter(..) , UConverter , getName , withConverter ) where import Data.Text.ICU.Error.Internal (UErrorCode, handleError) import Data.Typeable (Typeable) import Foreign.C.String (CString, peekCString) import Foreign.ForeignPtr (ForeignPtr, withForeignPtr) import Foreign.Ptr (Ptr) import System.IO.Unsafe (unsafePerformIO) data UConverter -- | Character set converter type. /Note/: this structure is not -- thread safe. It is /not/ safe to use value of this type -- simultaneously from multiple threads. data Converter = Converter {-# UNPACK #-} !(ForeignPtr UConverter) deriving (Eq, Typeable) instance Show Converter where show c = "Converter " ++ show (getName c) withConverter :: Converter -> (Ptr UConverter -> IO a) -> IO a {-# INLINE withConverter #-} withConverter (Converter cnv) action = withForeignPtr cnv action -- | Gets the internal, canonical name of the converter. getName :: Converter -> String getName cnv = unsafePerformIO . withConverter cnv $ \ptr -> peekCString =<< handleError (ucnv_getName ptr) foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_getName" ucnv_getName :: Ptr UConverter -> Ptr UErrorCode -> IO CString text-icu-0.8.0.4/Data/Text/ICU/DateFormatter.hsc0000644000000000000000000003673307346545000017305 0ustar0000000000000000{-# LANGUAGE EmptyDataDecls, RankNTypes, BangPatterns, ForeignFunctionInterface, RecordWildCards #-} -- | -- Module : Data.Text.ICU.DateFormatter -- Copyright : (c) 2021 Torsten Kemps-Benedix -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Calendar formatter implemented as bindings to -- the International Components for Unicode (ICU) libraries. -- You display or print a Date by first converting it to a locale-specific string that conforms -- to the conventions of the end user’s Locale. For example, Germans recognize 20.4.98 as a valid -- date, and Americans recognize 4/20/98. -- -- 👉 Note: The appropriate Calendar support is required for different locales. For example, the -- Buddhist calendar is the official calendar in Thailand so the typical assumption of Gregorian -- Calendar usage should not be used. ICU will pick the appropriate Calendar based on the locale -- you supply when opening a Calendar or DateFormat. -- -- Date and time formatters are used to convert dates and times from their internal representations -- to textual form in a language-independent manner. module Data.Text.ICU.DateFormatter (DateFormatter, FormatStyle(..), DateFormatSymbolType(..), standardDateFormatter, patternDateFormatter, dateSymbols, formatCalendar ) where #include #include import Control.Monad (forM) import Data.Int (Int32) import Data.Text (Text) import qualified Data.Text as T import Data.Text.ICU.Error.Internal (UErrorCode, handleError, handleOverflowError) import Data.Text.ICU.Internal (LocaleName(..), UChar, withLocaleName, newICUPtr, fromUCharPtr, useAsUCharPtr) import Data.Text.ICU.Calendar import Foreign.C.String (CString) import Foreign.C.Types (CInt(..)) import Foreign.ForeignPtr (withForeignPtr, ForeignPtr) import Foreign.Ptr (FunPtr, Ptr, nullPtr) import Prelude hiding (last) import System.IO.Unsafe (unsafePerformIO) -- | The possible date/time format styles. data FormatStyle = FullFormatStyle -- ^ Full style, such as Tuesday, April 12, 1952 AD or 3:30:42pm PST | LongFormatStyle -- ^ Long style, such as January 12, 1952 or 3:30:32pm | MediumFormatStyle -- ^ Medium style, such as Jan. 12, 1952 | ShortFormatStyle -- ^ Short style, such as 12/13/52 or 3:30pm | DefaultFormatStyle -- ^ Default style | RelativeFormatStyle -- ^ Relative style: ICU currently provides limited support for formatting dates using a “relative” style, specified using RELATIVE_SHORT, RELATIVE_MEDIUM, RELATIVE_LONG or RELATIVE_FULL. As currently implemented, relative date formatting only affects the formatting of dates within a limited range of calendar days before or after the current date, based on the CLDR / data: For example, in English, “Yesterday”, “Today”, and “Tomorrow”. Within this range, the specific relative style currently makes no difference. Outside of this range, relative dates are formatted using the corresponding non-relative style (SHORT, MEDIUM, etc.). Relative time styles are not currently supported, and behave just like the corresponding non-relative style. | NoFormatStyle -- ^ No style. deriving (Eq, Enum, Show) toUDateFormatStyle :: FormatStyle -> CInt toUDateFormatStyle FullFormatStyle = #const UDAT_FULL toUDateFormatStyle LongFormatStyle = #const UDAT_LONG toUDateFormatStyle MediumFormatStyle = #const UDAT_MEDIUM toUDateFormatStyle ShortFormatStyle = #const UDAT_SHORT toUDateFormatStyle DefaultFormatStyle = #const UDAT_DEFAULT toUDateFormatStyle RelativeFormatStyle = #const UDAT_RELATIVE toUDateFormatStyle NoFormatStyle = #const UDAT_NONE -- | The possible types of date format symbols. data DateFormatSymbolType = Eras -- ^ The era names, for example AD. | Months -- ^ The month names, for example February. | ShortMonths -- ^ The short month names, for example Feb. | Weekdays -- ^ The CLDR-style format "wide" weekday names, for example Monday. | ShortWeekdays -- ^ The CLDR-style format "abbreviated" (not "short") weekday names, for example "Mon." For the CLDR-style format "short" weekday names, use UDAT_SHORTER_WEEKDAYS. | AmPms -- ^ The AM/PM names, for example AM. | LocalizedChars -- ^ The localized characters. | EraNames -- ^ The long era names, for example Anno Domini. | NarrowMonths -- ^ The narrow month names, for example F. | NarrowWeekdays -- ^ The CLDR-style format "narrow" weekday names, for example "M". | StandaloneMonths -- ^ Standalone context versions of months. | StandaloneWeekdays -- ^ The CLDR-style stand-alone "wide" weekday names. | StandaoneShortWeekdays -- ^ The CLDR-style stand-alone "abbreviated" (not "short") weekday names. For the CLDR-style stand-alone "short" weekday names, use UDAT_STANDALONE_SHORTER_WEEKDAYS. | StandaloneNarrowWeekdays -- ^ The CLDR-style stand-alone "narrow" weekday names. | Quarters -- ^ The quarters, for example 1st Quarter. | ShortQuarters -- ^ The short quarter names, for example Q1. | StandaloneQuarters -- ^ Standalone context versions of quarters. | ShorterWeekdays -- ^ The CLDR-style short weekday names, e.g. "Su", Mo", etc. These are named "SHORTER" to contrast with the constants using SHORT above, which actually get the CLDR-style abbreviated versions of the corresponding names. | StandaloneShorterWeekdays -- ^ Standalone version of UDAT_SHORTER_WEEKDAYS. | CyclicYearsWide -- ^ Cyclic year names (only supported for some calendars, and only for FORMAT usage; udat_setSymbols not supported for UDAT_CYCLIC_YEARS_WIDE) | CyclicYearsAbbreviated -- ^ Cyclic year names (only supported for some calendars, and only for FORMAT usage) | CyclicYearsNarrow -- ^ Cyclic year names (only supported for some calendars, and only for FORMAT usage; udat_setSymbols not supported for UDAT_CYCLIC_YEARS_NARROW) | ZodiacNamesWide -- ^ Calendar zodiac names (only supported for some calendars, and only for FORMAT usage; udat_setSymbols not supported for UDAT_ZODIAC_NAMES_WIDE) | ZodiacNamesAbbreviated -- ^ Calendar zodiac names (only supported for some calendars, and only for FORMAT usage) | ZodiacNamesNarrow -- ^ Calendar zodiac names (only supported for some calendars, and only for FORMAT usage; udat_setSymbols not supported for UDAT_ZODIAC_NAMES_NARROW) #if U_ICU_VERSION_MAJOR_NUM >= 70 | NarrowQuarters -- ^ The narrow quarter names, for example 1. | StandaloneNarrowQuarters -- ^ The narrow standalone quarter names, for example 1. #endif toUDateFormatSymbolType :: DateFormatSymbolType -> CInt toUDateFormatSymbolType Eras = #const UDAT_ERAS toUDateFormatSymbolType Months = #const UDAT_MONTHS toUDateFormatSymbolType ShortMonths = #const UDAT_SHORT_MONTHS toUDateFormatSymbolType Weekdays = #const UDAT_WEEKDAYS toUDateFormatSymbolType ShortWeekdays = #const UDAT_SHORT_WEEKDAYS toUDateFormatSymbolType AmPms = #const UDAT_AM_PMS toUDateFormatSymbolType LocalizedChars = #const UDAT_LOCALIZED_CHARS toUDateFormatSymbolType EraNames = #const UDAT_ERA_NAMES toUDateFormatSymbolType NarrowMonths = #const UDAT_NARROW_MONTHS toUDateFormatSymbolType NarrowWeekdays = #const UDAT_NARROW_WEEKDAYS toUDateFormatSymbolType StandaloneMonths = #const UDAT_STANDALONE_MONTHS toUDateFormatSymbolType StandaloneWeekdays = #const UDAT_STANDALONE_WEEKDAYS toUDateFormatSymbolType StandaoneShortWeekdays = #const UDAT_STANDALONE_SHORT_WEEKDAYS toUDateFormatSymbolType StandaloneNarrowWeekdays = #const UDAT_STANDALONE_NARROW_WEEKDAYS toUDateFormatSymbolType Quarters = #const UDAT_QUARTERS toUDateFormatSymbolType ShortQuarters = #const UDAT_SHORT_QUARTERS toUDateFormatSymbolType StandaloneQuarters = #const UDAT_STANDALONE_QUARTERS toUDateFormatSymbolType ShorterWeekdays = #const UDAT_SHORTER_WEEKDAYS toUDateFormatSymbolType StandaloneShorterWeekdays = #const UDAT_STANDALONE_SHORTER_WEEKDAYS toUDateFormatSymbolType CyclicYearsWide = #const UDAT_CYCLIC_YEARS_WIDE toUDateFormatSymbolType CyclicYearsAbbreviated = #const UDAT_CYCLIC_YEARS_ABBREVIATED toUDateFormatSymbolType CyclicYearsNarrow = #const UDAT_CYCLIC_YEARS_NARROW toUDateFormatSymbolType ZodiacNamesWide = #const UDAT_ZODIAC_NAMES_WIDE toUDateFormatSymbolType ZodiacNamesAbbreviated = #const UDAT_ZODIAC_NAMES_ABBREVIATED toUDateFormatSymbolType ZodiacNamesNarrow = #const UDAT_ZODIAC_NAMES_NARROW #if U_ICU_VERSION_MAJOR_NUM >= 70 toUDateFormatSymbolType NarrowQuarters = #const UDAT_NARROW_QUARTERS toUDateFormatSymbolType StandaloneNarrowQuarters = #const UDAT_STANDALONE_NARROW_QUARTERS #endif type UDateFormatStyle = CInt type UFieldPosition = CInt type UDateFormatSymbolType = CInt data UDateFormat -- | This is an abstract data type holding a reference to the ICU date format object. Create a 'DateFormatter' -- with either 'standardDateFormatter' or 'patternDateFormatter' and use it in order to format 'Calendar' -- objects with the function 'formatCalendar'. newtype DateFormatter = DateFormatter (ForeignPtr UDateFormat) -- | Create a new 'DateFormatter' from the standard styles. -- -- >>> import Data.Text -- >>> dfDe <- standardDateFormatter LongFormatStyle LongFormatStyle (Locale "de_DE") (pack "CET") standardDateFormatter :: FormatStyle -> FormatStyle -> LocaleName -> Text -> IO DateFormatter standardDateFormatter timeStyle dateStyle loc timeZoneId = withLocaleName loc $ \locale -> useAsUCharPtr timeZoneId $ \tzPtr tzLen -> newICUPtr DateFormatter udat_close $ handleError $ udat_open (toUDateFormatStyle timeStyle) (toUDateFormatStyle dateStyle) locale tzPtr (fromIntegral tzLen) nullPtr (0 :: Int32) -- | Create a new 'DateFormatter' using a custom pattern as described at -- https://unicode-org.github.io/icu/userguide/format_parse/datetime/#datetime-format-syntax. For examples -- the pattern "yyyy.MM.dd G 'at' HH:mm:ss zzz" produces “1996.07.10 AD at 15:08:56 PDT” in English for -- the PDT time zone. -- -- A date pattern is a string of characters, where specific strings of characters are replaced with date and --time data from a calendar when formatting or used to generate data for a calendar when parsing. -- -- The [Date Field Symbol Table](https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table) -- contains the characters used in patterns to show the appropriate formats -- for a given locale, such as yyyy for the year. Characters may be used multiple times. For example, if y is -- used for the year, "yy" might produce “99”, whereas "yyyy" produces “1999”. For most numerical fields, the -- number of characters specifies the field width. For example, if h is the hour, "h" might produce “5”, but -- "hh" produces “05”. For some characters, the count specifies whether an abbreviated or full form should be -- used, but may have other choices, as given below. -- -- Two single quotes represents a literal single quote, either inside or outside single quotes. Text within -- single quotes is not interpreted in any way (except for two adjacent single quotes). Otherwise all ASCII -- letter from a to z and A to Z are reserved as syntax characters, and require quoting if they are to represent -- literal characters. In addition, certain ASCII punctuation characters may become variable in the future (eg -- ':' being interpreted as the time separator and '/' as a date separator, and replaced by respective locale-sensitive -- characters in display). -- -- “Stand-alone” values refer to those designed to stand on their own independently, as opposed to being with -- other formatted values. “2nd quarter” would use the wide stand-alone format "qqqq", whereas “2nd quarter 2007” -- would use the regular format "QQQQ yyyy". For more information about format and stand-alone forms, see -- [CLDR Calendar Elements](https://www.unicode.org/reports/tr35/tr35-dates.html#months_days_quarters_eras). -- -- The pattern characters used in the Date Field Symbol Table are defined by CLDR; for more information see -- [CLDR Date Field Symbol Table](https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table). -- -- 👉 Note that the examples may not reflect current CLDR data. patternDateFormatter :: Text -> LocaleName -> Text -> IO DateFormatter patternDateFormatter pattern loc timeZoneId = withLocaleName loc $ \locale -> useAsUCharPtr timeZoneId $ \tzPtr tzLen -> useAsUCharPtr pattern $ \patPtr patLen -> newICUPtr DateFormatter udat_close $ handleError $ udat_open (fromIntegral ((#const UDAT_PATTERN) :: Int32)) (fromIntegral ((#const UDAT_PATTERN) :: Int32)) locale tzPtr (fromIntegral tzLen) patPtr (fromIntegral patLen) -- | Get relevant date related symbols, e.g. month and weekday names. -- -- >>> import Data.Text -- >>> dfDe <- standardDateFormatter LongFormatStyle LongFormatStyle (Locale "de_DE") (pack "CET") -- >>> dateSymbols dfDe Months -- ["Januar","Februar","M\228rz","April","Mai","Juni","Juli","August","September","Oktober","November","Dezember"] -- >>> dfAt <- standardDateFormatter LongFormatStyle LongFormatStyle (Locale "de_AT") (pack "CET") -- >>> dateSymbols dfAt Months -- ["J\228nner","Februar","M\228rz","April","Mai","Juni","Juli","August","September","Oktober","November","Dezember"] dateSymbols :: DateFormatter -> DateFormatSymbolType -> [Text] dateSymbols (DateFormatter df) symType = unsafePerformIO $ do withForeignPtr df $ \dfPtr -> do n <- udat_countSymbols dfPtr (toUDateFormatSymbolType symType) syms <- forM [0..(n-1)] $ \i -> do handleOverflowError (fromIntegral (64 :: Int)) (\dptr dlen -> udat_getSymbols dfPtr (toUDateFormatSymbolType symType) (fromIntegral i) dptr dlen) (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen)) pure $ filter (not . T.null) syms -- | Format a 'Calendar' using a 'DateFormatter'. -- -- >>> import Data.Text -- >>> dfDe <- standardDateFormatter LongFormatStyle LongFormatStyle (Locale "de_DE") (pack "CET") -- >>> c <- calendar (pack "CET") (Locale "de_DE") TraditionalCalendarType -- >>> formatCalendar dfDe c -- "13. Oktober 2021 um 12:44:09 GMT+2" formatCalendar :: DateFormatter -> Calendar -> Text formatCalendar (DateFormatter df) (Calendar cal) = unsafePerformIO $ withForeignPtr df $ \dfPtr -> do withForeignPtr cal $ \calPtr -> do handleOverflowError (fromIntegral (64 :: Int)) (\dptr dlen -> udat_formatCalendar dfPtr calPtr dptr dlen nullPtr) (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen)) foreign import ccall unsafe "hs_text_icu.h __hs_udat_open" udat_open :: UDateFormatStyle -> UDateFormatStyle -> CString -> Ptr UChar -> Int32 -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO (Ptr UDateFormat) foreign import ccall unsafe "hs_text_icu.h &__hs_udat_close" udat_close :: FunPtr (Ptr UDateFormat -> IO ()) foreign import ccall unsafe "hs_text_icu.h __hs_udat_formatCalendar" udat_formatCalendar :: Ptr UDateFormat -> Ptr UCalendar -> Ptr UChar -> Int32 -> Ptr UFieldPosition -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_udat_getSymbols" udat_getSymbols :: Ptr UDateFormat -> UDateFormatSymbolType -> Int32 -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_udat_countSymbols" udat_countSymbols :: Ptr UDateFormat -> UDateFormatSymbolType -> IO Int32 text-icu-0.8.0.4/Data/Text/ICU/Enumerator.hsc0000644000000000000000000000361607346545000016657 0ustar0000000000000000{-# LANGUAGE EmptyDataDecls, BangPatterns, ForeignFunctionInterface, RecordWildCards #-} -- | -- Module : Data.Text.ICU.Calendar -- Copyright : (c) 2021 Torsten Kemps-Benedix -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Calendar functions implemented as bindings to -- the International Components for Unicode (ICU) libraries. module Data.Text.ICU.Enumerator (next, toList, createEnumerator, Enumerator, UEnumerator, ) where #include import Data.Int (Int32) import Data.Text (Text) import Data.Text.ICU.Error.Internal (UErrorCode, handleError) import Data.Text.ICU.Internal (UChar, newICUPtr, fromUCharPtr) import Foreign.ForeignPtr (withForeignPtr, ForeignPtr) import Foreign.Marshal.Alloc (alloca) import Foreign.Ptr (FunPtr, Ptr, nullPtr) import Foreign.Storable (peek) import Prelude hiding (last) data UEnumerator newtype Enumerator = Enumerator {enumeratorToForeignPtr :: ForeignPtr UEnumerator} createEnumerator :: IO (Ptr UEnumerator) -> IO Enumerator createEnumerator = newICUPtr Enumerator uenum_close next :: Enumerator -> IO (Maybe Text) next enum = withForeignPtr (enumeratorToForeignPtr enum) $ \enumPtr -> alloca $ \lenPtr -> do textPtr <- handleError $ uenum_unext enumPtr lenPtr if textPtr == nullPtr then pure Nothing else do n <- peek lenPtr t <- fromUCharPtr textPtr (fromIntegral n) pure $ Just t toList :: Enumerator -> IO [Text] toList enum = reverse <$> go [] where go l = do mx <- next enum case mx of Nothing -> pure l Just x -> go (x:l) foreign import ccall unsafe "hs_text_icu.h &__hs_uenum_close" uenum_close :: FunPtr (Ptr UEnumerator -> IO ()) foreign import ccall unsafe "hs_text_icu.h __hs_uenum_unext" uenum_unext :: Ptr UEnumerator -> Ptr Int32 -> Ptr UErrorCode -> IO (Ptr UChar) text-icu-0.8.0.4/Data/Text/ICU/Error.hsc0000644000000000000000000002577107346545000015635 0ustar0000000000000000-- | -- Module : Data.Text.ICU.Error -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Errors thrown by bindings to the International Components for -- Unicode (ICU) libraries. -- -- Most ICU functions can throw an 'ICUError' value as an exception. -- Some can additionally throw a 'ParseError', if more detailed error -- information is necessary. module Data.Text.ICU.Error ( -- * Types ICUError, ParseError(errError, errLine, errOffset), -- * Functions isSuccess, isFailure, errorName, isRegexError, -- * Warnings u_USING_FALLBACK_WARNING, u_USING_DEFAULT_WARNING, u_SAFECLONE_ALLOCATED_WARNING, u_STATE_OLD_WARNING, u_STRING_NOT_TERMINATED_WARNING, u_SORT_KEY_TOO_SHORT_WARNING, u_AMBIGUOUS_ALIAS_WARNING, u_DIFFERENT_UCA_VERSION, -- * Errors u_ILLEGAL_ARGUMENT_ERROR, u_MISSING_RESOURCE_ERROR, u_INVALID_FORMAT_ERROR, u_FILE_ACCESS_ERROR, u_INTERNAL_PROGRAM_ERROR, u_MESSAGE_PARSE_ERROR, u_MEMORY_ALLOCATION_ERROR, u_INDEX_OUTOFBOUNDS_ERROR, u_PARSE_ERROR, u_INVALID_CHAR_FOUND, u_TRUNCATED_CHAR_FOUND, u_ILLEGAL_CHAR_FOUND, u_INVALID_TABLE_FORMAT, u_INVALID_TABLE_FILE, u_BUFFER_OVERFLOW_ERROR, u_UNSUPPORTED_ERROR, u_RESOURCE_TYPE_MISMATCH, u_ILLEGAL_ESCAPE_SEQUENCE, u_UNSUPPORTED_ESCAPE_SEQUENCE, u_NO_SPACE_AVAILABLE, u_CE_NOT_FOUND_ERROR, u_PRIMARY_TOO_LONG_ERROR, u_STATE_TOO_OLD_ERROR, u_TOO_MANY_ALIASES_ERROR, u_ENUM_OUT_OF_SYNC_ERROR, u_INVARIANT_CONVERSION_ERROR, u_INVALID_STATE_ERROR, u_COLLATOR_VERSION_MISMATCH, u_USELESS_COLLATOR_ERROR, u_NO_WRITE_PERMISSION, -- ** Transliterator errors u_BAD_VARIABLE_DEFINITION, u_MALFORMED_RULE, u_MALFORMED_SET, u_MALFORMED_UNICODE_ESCAPE, u_MALFORMED_VARIABLE_DEFINITION, u_MALFORMED_VARIABLE_REFERENCE, u_MISPLACED_CURSOR_OFFSET, u_MISPLACED_QUANTIFIER, u_MISSING_OPERATOR, u_MULTIPLE_ANTE_CONTEXTS, u_MULTIPLE_CURSORS, u_MULTIPLE_POST_CONTEXTS, u_TRAILING_BACKSLASH, u_UNDEFINED_SEGMENT_REFERENCE, u_UNDEFINED_VARIABLE, u_UNQUOTED_SPECIAL, u_UNTERMINATED_QUOTE, u_RULE_MASK_ERROR, u_MISPLACED_COMPOUND_FILTER, u_MULTIPLE_COMPOUND_FILTERS, u_INVALID_RBT_SYNTAX, u_MALFORMED_PRAGMA, u_UNCLOSED_SEGMENT, u_VARIABLE_RANGE_EXHAUSTED, u_VARIABLE_RANGE_OVERLAP, u_ILLEGAL_CHARACTER, u_INTERNAL_TRANSLITERATOR_ERROR, u_INVALID_ID, u_INVALID_FUNCTION, -- ** Formatting API parsing errors u_UNEXPECTED_TOKEN, u_MULTIPLE_DECIMAL_SEPARATORS, u_MULTIPLE_EXPONENTIAL_SYMBOLS, u_MALFORMED_EXPONENTIAL_PATTERN, u_MULTIPLE_PERCENT_SYMBOLS, u_MULTIPLE_PERMILL_SYMBOLS, u_MULTIPLE_PAD_SPECIFIERS, u_PATTERN_SYNTAX_ERROR, u_ILLEGAL_PAD_POSITION, u_UNMATCHED_BRACES, u_ARGUMENT_TYPE_MISMATCH, u_DUPLICATE_KEYWORD, u_UNDEFINED_KEYWORD, u_DEFAULT_KEYWORD_MISSING, -- ** Break iterator errors u_BRK_INTERNAL_ERROR, u_BRK_HEX_DIGITS_EXPECTED, u_BRK_SEMICOLON_EXPECTED, u_BRK_RULE_SYNTAX, u_BRK_UNCLOSED_SET, u_BRK_ASSIGN_ERROR, u_BRK_VARIABLE_REDFINITION, u_BRK_MISMATCHED_PAREN, u_BRK_NEW_LINE_IN_QUOTED_STRING, u_BRK_UNDEFINED_VARIABLE, u_BRK_INIT_ERROR, u_BRK_RULE_EMPTY_SET, u_BRK_UNRECOGNIZED_OPTION, u_BRK_MALFORMED_RULE_TAG, -- ** Regular expression errors u_REGEX_INTERNAL_ERROR, u_REGEX_RULE_SYNTAX, u_REGEX_INVALID_STATE, u_REGEX_BAD_ESCAPE_SEQUENCE, u_REGEX_PROPERTY_SYNTAX, u_REGEX_UNIMPLEMENTED, u_REGEX_MISMATCHED_PAREN, u_REGEX_NUMBER_TOO_BIG, u_REGEX_BAD_INTERVAL, u_REGEX_MAX_LT_MIN, u_REGEX_INVALID_BACK_REF, u_REGEX_INVALID_FLAG, u_REGEX_SET_CONTAINS_STRING, u_REGEX_OCTAL_TOO_BIG, u_REGEX_INVALID_RANGE, u_REGEX_STACK_OVERFLOW, u_REGEX_TIME_OUT, u_REGEX_STOPPED_BY_CALLER, -- ** IDNA errors u_IDNA_PROHIBITED_ERROR, u_IDNA_UNASSIGNED_ERROR, u_IDNA_CHECK_BIDI_ERROR, u_IDNA_STD3_ASCII_RULES_ERROR, u_IDNA_ACE_PREFIX_ERROR, u_IDNA_VERIFICATION_ERROR, u_IDNA_LABEL_TOO_LONG_ERROR, u_IDNA_ZERO_LENGTH_LABEL_ERROR, u_IDNA_DOMAIN_NAME_TOO_LONG_ERROR ) where #ifdef mingw32_HOST_OS #define U_HAVE_INTTYPES_H 1 #endif #include import Data.Text.ICU.Error.Internal #{enum ICUError, ICUError, u_USING_FALLBACK_WARNING = U_USING_FALLBACK_WARNING, u_USING_DEFAULT_WARNING = U_USING_DEFAULT_WARNING, u_SAFECLONE_ALLOCATED_WARNING = U_SAFECLONE_ALLOCATED_WARNING, u_STATE_OLD_WARNING = U_STATE_OLD_WARNING, u_STRING_NOT_TERMINATED_WARNING = U_STRING_NOT_TERMINATED_WARNING, u_SORT_KEY_TOO_SHORT_WARNING = U_SORT_KEY_TOO_SHORT_WARNING, u_AMBIGUOUS_ALIAS_WARNING = U_AMBIGUOUS_ALIAS_WARNING, u_DIFFERENT_UCA_VERSION = U_DIFFERENT_UCA_VERSION, u_ILLEGAL_ARGUMENT_ERROR = U_ILLEGAL_ARGUMENT_ERROR, u_MISSING_RESOURCE_ERROR = U_MISSING_RESOURCE_ERROR, u_INVALID_FORMAT_ERROR = U_INVALID_FORMAT_ERROR, u_FILE_ACCESS_ERROR = U_FILE_ACCESS_ERROR, u_INTERNAL_PROGRAM_ERROR = U_INTERNAL_PROGRAM_ERROR, u_MESSAGE_PARSE_ERROR = U_MESSAGE_PARSE_ERROR, u_MEMORY_ALLOCATION_ERROR = U_MEMORY_ALLOCATION_ERROR, u_INDEX_OUTOFBOUNDS_ERROR = U_INDEX_OUTOFBOUNDS_ERROR, u_PARSE_ERROR = U_PARSE_ERROR, u_INVALID_CHAR_FOUND = U_INVALID_CHAR_FOUND, u_TRUNCATED_CHAR_FOUND = U_TRUNCATED_CHAR_FOUND, u_ILLEGAL_CHAR_FOUND = U_ILLEGAL_CHAR_FOUND, u_INVALID_TABLE_FORMAT = U_INVALID_TABLE_FORMAT, u_INVALID_TABLE_FILE = U_INVALID_TABLE_FILE, u_BUFFER_OVERFLOW_ERROR = U_BUFFER_OVERFLOW_ERROR, u_UNSUPPORTED_ERROR = U_UNSUPPORTED_ERROR, u_RESOURCE_TYPE_MISMATCH = U_RESOURCE_TYPE_MISMATCH, u_ILLEGAL_ESCAPE_SEQUENCE = U_ILLEGAL_ESCAPE_SEQUENCE, u_UNSUPPORTED_ESCAPE_SEQUENCE = U_UNSUPPORTED_ESCAPE_SEQUENCE, u_NO_SPACE_AVAILABLE = U_NO_SPACE_AVAILABLE, u_CE_NOT_FOUND_ERROR = U_CE_NOT_FOUND_ERROR, u_PRIMARY_TOO_LONG_ERROR = U_PRIMARY_TOO_LONG_ERROR, u_STATE_TOO_OLD_ERROR = U_STATE_TOO_OLD_ERROR, u_TOO_MANY_ALIASES_ERROR = U_TOO_MANY_ALIASES_ERROR, u_ENUM_OUT_OF_SYNC_ERROR = U_ENUM_OUT_OF_SYNC_ERROR, u_INVARIANT_CONVERSION_ERROR = U_INVARIANT_CONVERSION_ERROR, u_INVALID_STATE_ERROR = U_INVALID_STATE_ERROR, u_COLLATOR_VERSION_MISMATCH = U_COLLATOR_VERSION_MISMATCH, u_USELESS_COLLATOR_ERROR = U_USELESS_COLLATOR_ERROR, u_NO_WRITE_PERMISSION = U_NO_WRITE_PERMISSION, u_BAD_VARIABLE_DEFINITION = U_BAD_VARIABLE_DEFINITION, u_MALFORMED_RULE = U_MALFORMED_RULE, u_MALFORMED_SET = U_MALFORMED_SET, u_MALFORMED_UNICODE_ESCAPE = U_MALFORMED_UNICODE_ESCAPE, u_MALFORMED_VARIABLE_DEFINITION = U_MALFORMED_VARIABLE_DEFINITION, u_MALFORMED_VARIABLE_REFERENCE = U_MALFORMED_VARIABLE_REFERENCE, u_MISPLACED_CURSOR_OFFSET = U_MISPLACED_CURSOR_OFFSET, u_MISPLACED_QUANTIFIER = U_MISPLACED_QUANTIFIER, u_MISSING_OPERATOR = U_MISSING_OPERATOR, u_MULTIPLE_ANTE_CONTEXTS = U_MULTIPLE_ANTE_CONTEXTS, u_MULTIPLE_CURSORS = U_MULTIPLE_CURSORS, u_MULTIPLE_POST_CONTEXTS = U_MULTIPLE_POST_CONTEXTS, u_TRAILING_BACKSLASH = U_TRAILING_BACKSLASH, u_UNDEFINED_SEGMENT_REFERENCE = U_UNDEFINED_SEGMENT_REFERENCE, u_UNDEFINED_VARIABLE = U_UNDEFINED_VARIABLE, u_UNQUOTED_SPECIAL = U_UNQUOTED_SPECIAL, u_UNTERMINATED_QUOTE = U_UNTERMINATED_QUOTE, u_RULE_MASK_ERROR = U_RULE_MASK_ERROR, u_MISPLACED_COMPOUND_FILTER = U_MISPLACED_COMPOUND_FILTER, u_MULTIPLE_COMPOUND_FILTERS = U_MULTIPLE_COMPOUND_FILTERS, u_INVALID_RBT_SYNTAX = U_INVALID_RBT_SYNTAX, u_MALFORMED_PRAGMA = U_MALFORMED_PRAGMA, u_UNCLOSED_SEGMENT = U_UNCLOSED_SEGMENT, u_VARIABLE_RANGE_EXHAUSTED = U_VARIABLE_RANGE_EXHAUSTED, u_VARIABLE_RANGE_OVERLAP = U_VARIABLE_RANGE_OVERLAP, u_ILLEGAL_CHARACTER = U_ILLEGAL_CHARACTER, u_INTERNAL_TRANSLITERATOR_ERROR = U_INTERNAL_TRANSLITERATOR_ERROR, u_INVALID_ID = U_INVALID_ID, u_INVALID_FUNCTION = U_INVALID_FUNCTION, u_UNEXPECTED_TOKEN = U_UNEXPECTED_TOKEN, u_MULTIPLE_DECIMAL_SEPARATORS = U_MULTIPLE_DECIMAL_SEPARATORS, u_MULTIPLE_EXPONENTIAL_SYMBOLS = U_MULTIPLE_EXPONENTIAL_SYMBOLS, u_MALFORMED_EXPONENTIAL_PATTERN = U_MALFORMED_EXPONENTIAL_PATTERN, u_MULTIPLE_PERCENT_SYMBOLS = U_MULTIPLE_PERCENT_SYMBOLS, u_MULTIPLE_PERMILL_SYMBOLS = U_MULTIPLE_PERMILL_SYMBOLS, u_MULTIPLE_PAD_SPECIFIERS = U_MULTIPLE_PAD_SPECIFIERS, u_PATTERN_SYNTAX_ERROR = U_PATTERN_SYNTAX_ERROR, u_ILLEGAL_PAD_POSITION = U_ILLEGAL_PAD_POSITION, u_UNMATCHED_BRACES = U_UNMATCHED_BRACES, u_ARGUMENT_TYPE_MISMATCH = U_ARGUMENT_TYPE_MISMATCH, u_DUPLICATE_KEYWORD = U_DUPLICATE_KEYWORD, u_UNDEFINED_KEYWORD = U_UNDEFINED_KEYWORD, u_DEFAULT_KEYWORD_MISSING = U_DEFAULT_KEYWORD_MISSING, u_BRK_INTERNAL_ERROR = U_BRK_INTERNAL_ERROR, u_BRK_HEX_DIGITS_EXPECTED = U_BRK_HEX_DIGITS_EXPECTED, u_BRK_SEMICOLON_EXPECTED = U_BRK_SEMICOLON_EXPECTED, u_BRK_RULE_SYNTAX = U_BRK_RULE_SYNTAX, u_BRK_UNCLOSED_SET = U_BRK_UNCLOSED_SET, u_BRK_ASSIGN_ERROR = U_BRK_ASSIGN_ERROR, u_BRK_VARIABLE_REDFINITION = U_BRK_VARIABLE_REDFINITION, u_BRK_MISMATCHED_PAREN = U_BRK_MISMATCHED_PAREN, u_BRK_NEW_LINE_IN_QUOTED_STRING = U_BRK_NEW_LINE_IN_QUOTED_STRING, u_BRK_UNDEFINED_VARIABLE = U_BRK_UNDEFINED_VARIABLE, u_BRK_INIT_ERROR = U_BRK_INIT_ERROR, u_BRK_RULE_EMPTY_SET = U_BRK_RULE_EMPTY_SET, u_BRK_UNRECOGNIZED_OPTION = U_BRK_UNRECOGNIZED_OPTION, u_BRK_MALFORMED_RULE_TAG = U_BRK_MALFORMED_RULE_TAG, u_REGEX_INTERNAL_ERROR = U_REGEX_INTERNAL_ERROR, u_REGEX_RULE_SYNTAX = U_REGEX_RULE_SYNTAX, u_REGEX_INVALID_STATE = U_REGEX_INVALID_STATE, u_REGEX_BAD_ESCAPE_SEQUENCE = U_REGEX_BAD_ESCAPE_SEQUENCE, u_REGEX_PROPERTY_SYNTAX = U_REGEX_PROPERTY_SYNTAX, u_REGEX_UNIMPLEMENTED = U_REGEX_UNIMPLEMENTED, u_REGEX_MISMATCHED_PAREN = U_REGEX_MISMATCHED_PAREN, u_REGEX_NUMBER_TOO_BIG = U_REGEX_NUMBER_TOO_BIG, u_REGEX_BAD_INTERVAL = U_REGEX_BAD_INTERVAL, u_REGEX_MAX_LT_MIN = U_REGEX_MAX_LT_MIN, u_REGEX_INVALID_BACK_REF = U_REGEX_INVALID_BACK_REF, u_REGEX_INVALID_FLAG = U_REGEX_INVALID_FLAG, u_REGEX_SET_CONTAINS_STRING = U_REGEX_SET_CONTAINS_STRING, u_REGEX_OCTAL_TOO_BIG = U_REGEX_OCTAL_TOO_BIG, u_REGEX_INVALID_RANGE = U_REGEX_INVALID_RANGE, u_REGEX_STACK_OVERFLOW = U_REGEX_STACK_OVERFLOW, u_REGEX_TIME_OUT = U_REGEX_TIME_OUT, u_REGEX_STOPPED_BY_CALLER = U_REGEX_STOPPED_BY_CALLER, u_IDNA_PROHIBITED_ERROR = U_IDNA_PROHIBITED_ERROR, u_IDNA_UNASSIGNED_ERROR = U_IDNA_UNASSIGNED_ERROR, u_IDNA_CHECK_BIDI_ERROR = U_IDNA_CHECK_BIDI_ERROR, u_IDNA_STD3_ASCII_RULES_ERROR = U_IDNA_STD3_ASCII_RULES_ERROR, u_IDNA_ACE_PREFIX_ERROR = U_IDNA_ACE_PREFIX_ERROR, u_IDNA_VERIFICATION_ERROR = U_IDNA_VERIFICATION_ERROR, u_IDNA_LABEL_TOO_LONG_ERROR = U_IDNA_LABEL_TOO_LONG_ERROR, u_IDNA_ZERO_LENGTH_LABEL_ERROR = U_IDNA_ZERO_LENGTH_LABEL_ERROR, u_IDNA_DOMAIN_NAME_TOO_LONG_ERROR = U_IDNA_DOMAIN_NAME_TOO_LONG_ERROR } isRegexError :: ICUError -> Bool isRegexError (ICUError err) = err >= #{const U_REGEX_ERROR_START} && err < #{const U_REGEX_ERROR_LIMIT} text-icu-0.8.0.4/Data/Text/ICU/Error/0000755000000000000000000000000007346545000015122 5ustar0000000000000000text-icu-0.8.0.4/Data/Text/ICU/Error/Internal.hsc0000644000000000000000000001347507346545000017407 0ustar0000000000000000{-# LANGUAGE BangPatterns, DeriveDataTypeable, ForeignFunctionInterface, RecordWildCards, ScopedTypeVariables #-} module Data.Text.ICU.Error.Internal ( -- * Types ICUError(..) -- ** Low-level types , UErrorCode , ParseError(errError, errLine, errOffset) , UParseError -- * Functions , isFailure , isSuccess , errorName , handleError , handleOverflowError , handleParseError , throwOnError , withError ) where import Control.DeepSeq (NFData(..)) import Control.Exception (Exception, throwIO) import Data.Function (fix) import Foreign.Ptr (Ptr) import Foreign.Marshal.Alloc (alloca, allocaBytes) import Foreign.Marshal.Utils (with) import Foreign.Marshal.Array (allocaArray) import Data.Int (Int32) import Data.Typeable (Typeable) import Foreign.C.String (CString, peekCString) import Foreign.C.Types (CInt(..)) import Foreign.Storable (Storable(..)) import System.IO.Unsafe (unsafePerformIO) #include #include type UErrorCode = CInt -- | ICU error type. This is an instance of the 'Exception' type -- class. A value of this type may be thrown as an exception by most -- ICU functions. newtype ICUError = ICUError { fromErrorCode :: UErrorCode } deriving (Eq, Typeable) instance Show ICUError where show code = "ICUError " ++ errorName code instance Exception ICUError instance NFData ICUError where rnf !_ = () -- | Detailed information about parsing errors. Used by ICU parsing -- engines that parse long rules, patterns, or programs, where the -- text being parsed is long enough that more information than an -- 'ICUError' is needed to localize the error. data ParseError = ParseError { errError :: ICUError , errLine :: !(Maybe Int) -- ^ The line on which the error occured. If the parser uses this -- field, it sets it to the line number of the source text line on -- which the error appears, which will be be a positive value. If -- the parser does not support line numbers, the value will be -- 'Nothing'. , errOffset :: !(Maybe Int) -- ^ The character offset to the error. If the 'errLine' field is -- 'Just' some value, then this field contains the offset from the -- beginning of the line that contains the error. Otherwise, it -- represents the offset from the start of the text. If the -- parser does not support this field, it will have a value of -- 'Nothing'. } deriving (Show, Typeable) instance NFData ParseError where rnf ParseError{..} = rnf errError `seq` rnf errLine `seq` rnf errOffset type UParseError = ParseError instance Exception ParseError -- | Indicate whether the given error code is a success. isSuccess :: ICUError -> Bool {-# INLINE isSuccess #-} isSuccess = (<= 0) . fromErrorCode -- | Indicate whether the given error code is a failure. isFailure :: ICUError -> Bool {-# INLINE isFailure #-} isFailure = (> 0) . fromErrorCode -- | Throw an exception if the given code is actually an error. throwOnError :: UErrorCode -> IO () {-# INLINE throwOnError #-} throwOnError code = do let err = (ICUError code) if isFailure err then throwIO err else return () withError :: (Ptr UErrorCode -> IO a) -> IO (ICUError, a) {-# INLINE withError #-} withError action = with 0 $ \errPtr -> do ret <- action errPtr err <- peek errPtr return (ICUError err, ret) handleError :: (Ptr UErrorCode -> IO a) -> IO a {-# INLINE handleError #-} handleError action = with 0 $ \errPtr -> do ret <- action errPtr throwOnError =<< peek errPtr return ret -- | Deal with ICU functions that report a buffer overflow error if we -- give them an insufficiently large buffer. Our first call will -- report a buffer overflow, in which case we allocate a correctly -- sized buffer and try again. handleOverflowError :: (Storable a) => Int -- ^ Initial guess at buffer size. -> (Ptr a -> Int32 -> Ptr UErrorCode -> IO Int32) -- ^ Function that retrieves data. -> (Ptr a -> Int -> IO b) -- ^ Function that fills destination buffer if no -- overflow occurred. -> IO b handleOverflowError guess fill retrieve = alloca $ \uerrPtr -> flip fix guess $ \loop n -> (either (loop . fromIntegral) return =<<) . allocaArray n $ \ptr -> do poke uerrPtr 0 ret <- fill ptr (fromIntegral n) uerrPtr err <- peek uerrPtr case undefined of _| err == (#const U_BUFFER_OVERFLOW_ERROR) -> return (Left ret) | err > 0 -> throwIO (ICUError err) | otherwise -> Right `fmap` retrieve ptr (fromIntegral ret) handleParseError :: (ICUError -> Bool) -> (Ptr UParseError -> Ptr UErrorCode -> IO a) -> IO a handleParseError isParseError action = with 0 $ \uerrPtr -> allocaBytes (#{size UParseError}) $ \perrPtr -> do ret <- action perrPtr uerrPtr err <- ICUError `fmap` peek uerrPtr case undefined of _| isParseError err -> throwParseError perrPtr err | isFailure err -> throwIO err | otherwise -> return ret throwParseError :: Ptr UParseError -> ICUError -> IO a throwParseError ptr err = do (line::Int32) <- #{peek UParseError, line} ptr (offset::Int32) <- #{peek UParseError, offset} ptr let wrap k = if k == -1 then Nothing else Just $! fromIntegral k throwIO $! ParseError err (wrap line) (wrap offset) -- | Return a string representing the name of the given error code. errorName :: ICUError -> String errorName code = unsafePerformIO $ peekCString (u_errorName (fromErrorCode code)) foreign import ccall unsafe "hs_text_icu.h __hs_u_errorName" u_errorName :: UErrorCode -> CString text-icu-0.8.0.4/Data/Text/ICU/Internal.hsc0000644000000000000000000001765407346545000016321 0ustar0000000000000000{-# LANGUAGE EmptyDataDecls, ForeignFunctionInterface, GeneralizedNewtypeDeriving, TupleSections #-} module Data.Text.ICU.Internal ( LocaleName(..) , UBool , UChar , UChar32 , UCharIterator , CharIterator(..) , UText, UTextPtr , asBool , asOrdering , withCharIterator , withLocaleName , withName , useAsUCharPtr, fromUCharPtr, I16, asUCharForeignPtr , asUTextPtr, withUTextPtr, withUTextPtrText, emptyUTextPtr, utextPtrLength , TextI, takeWord, dropWord, lengthWord , newICUPtr ) where #include import Control.Exception (mask_) import Control.DeepSeq (NFData(..)) import Data.ByteString.Internal (ByteString(..)) import Data.Int (Int8, Int32, Int64) import Data.String (IsString(..)) import Data.Text (Text, empty) import Data.Text.Encoding (decodeUtf8) import Data.Text.Foreign (useAsPtr, asForeignPtr, fromPtr) #if MIN_VERSION_text(2,0,0) import Data.Text.Foreign (I8, dropWord8, takeWord8, lengthWord8) import Foreign.ForeignPtr (mallocForeignPtrArray) import Foreign.Marshal.Array (allocaArray) import Foreign.Storable (peek) #else import Data.Text.Foreign (I16, dropWord16, takeWord16, lengthWord16) #endif import Data.Word (Word8, Word16, Word32) import Foreign.C.String (CString, withCString) import Foreign.ForeignPtr (withForeignPtr, ForeignPtr, newForeignPtr, FinalizerPtr) import Foreign.Marshal.Alloc (allocaBytes) import Foreign.Marshal.Utils (with) import Foreign.Ptr (Ptr, nullPtr, FunPtr) import Data.Text.ICU.Error.Internal (UErrorCode) import System.IO.Unsafe (unsafePerformIO) -- | A type that supports efficient iteration over Unicode characters. -- -- As an example of where this may be useful, a function using this -- type may be able to iterate over a UTF-8 'ByteString' directly, -- rather than first copying and converting it to an intermediate -- form. This type also allows e.g. comparison between 'Text' and -- 'ByteString', with minimal overhead. data CharIterator = CIText !Text | CIUTF8 !ByteString instance Show CharIterator where show (CIText t) = show t show (CIUTF8 bs) = show (decodeUtf8 bs) data UCharIterator -- | Temporarily allocate a 'UCharIterator' and use it with the -- contents of the to-be-iterated-over string. withCharIterator :: CharIterator -> (Ptr UCharIterator -> IO a) -> IO a withCharIterator (CIUTF8 (PS fp _ l)) act = allocaBytes (#{size UCharIterator}) $ \i -> withForeignPtr fp $ \p -> uiter_setUTF8 i p (fromIntegral l) >> act i withCharIterator (CIText t) act = allocaBytes (#{size UCharIterator}) $ \i -> useAsPtr t $ \p l -> #if MIN_VERSION_text(2,0,0) uiter_setUTF8 i p (fromIntegral l) >> act i #else uiter_setString i p (fromIntegral l) >> act i #endif type UBool = Int8 type UChar = Word16 type UChar32 = Word32 asBool :: Integral a => a -> Bool {-# INLINE asBool #-} asBool = (/=0) asOrdering :: Integral a => a -> Ordering {-# INLINE asOrdering #-} asOrdering i | i < 0 = LT | i == 0 = EQ | otherwise = GT withName :: String -> (CString -> IO a) -> IO a withName name act | null name = act nullPtr | otherwise = withCString name act -- | The name of a locale. data LocaleName = Root -- ^ The root locale. For a description of resource bundles -- and the root resource, see -- . | Locale String -- ^ A specific locale. | Current -- ^ The program's current locale. deriving (Eq, Ord, Read, Show) instance NFData LocaleName where rnf Root = () rnf (Locale l) = rnf l rnf Current = () instance IsString LocaleName where fromString = Locale withLocaleName :: LocaleName -> (CString -> IO a) -> IO a withLocaleName Current act = act nullPtr withLocaleName Root act = withCString "" act withLocaleName (Locale n) act = withCString n act #if !MIN_VERSION_text(2,0,0) foreign import ccall unsafe "hs_text_icu.h __hs_uiter_setString" uiter_setString :: Ptr UCharIterator -> Ptr UChar -> Int32 -> IO () #endif foreign import ccall unsafe "hs_text_icu.h __hs_uiter_setUTF8" uiter_setUTF8 :: Ptr UCharIterator -> Ptr Word8 -> Int32 -> IO () data UText -- | Pointer to UText which also keeps pointer to source text so it won't be -- garbage collected. data UTextPtr = UTextPtr { utextPtr :: ForeignPtr UText , utextPtrText :: ForeignPtr TextChar , utextPtrLength :: TextI } emptyUTextPtr :: UTextPtr emptyUTextPtr = unsafePerformIO $ asUTextPtr empty {-# NOINLINE emptyUTextPtr #-} withUTextPtr :: UTextPtr -> (Ptr UText -> IO a) -> IO a withUTextPtr = withForeignPtr . utextPtr withUTextPtrText :: UTextPtr -> (Ptr TextChar -> IO a) -> IO a withUTextPtrText = withForeignPtr . utextPtrText -- | Returns UTF-8 UText for text >= 2.0 or UTF-16 UText for previous versions. asUTextPtr :: Text -> IO UTextPtr asUTextPtr t = do (fp,l) <- asForeignPtr t with 0 $ \ e -> withForeignPtr fp $ \ p -> newICUPtr (\ ut -> UTextPtr ut fp l) utext_close $ #if MIN_VERSION_text(2,0,0) utext_openUTF8 #else utext_openUChars #endif nullPtr p (fromIntegral l) e foreign import ccall unsafe "hs_text_icu.h &__hs_utext_close" utext_close :: FunPtr (Ptr UText -> IO ()) useAsUCharPtr :: Text -> (Ptr UChar -> I16 -> IO a) -> IO a asUCharForeignPtr :: Text -> IO (ForeignPtr UChar, I16) fromUCharPtr :: Ptr UChar -> I16 -> IO Text dropWord, takeWord :: TextI -> Text -> Text lengthWord :: Text -> Int #if MIN_VERSION_text(2,0,0) newtype I16 = I16 Int deriving (Bounded, Enum, Eq, Integral, Num, Ord, Read, Real, Show) type TextChar = Word8 type TextI = I8 useAsUCharPtr t act = useAsPtr t $ \tptr tlen -> allocaArray (fromIntegral tlen) $ \ dst -> act dst =<< fromUtf8 dst tptr tlen asUCharForeignPtr t = useAsPtr t $ \tptr tlen -> do fp <- mallocForeignPtrArray (fromIntegral tlen) withForeignPtr fp $ \ dst -> (fp,) <$> fromUtf8 dst tptr tlen fromUtf8 :: Ptr UChar -> Ptr Word8 -> I8 -> IO I16 fromUtf8 dst tptr tlen = with 0 $ \ err -> with 0 $ \ dstLen -> do _ <- u_strFromUTF8Lenient dst (fromIntegral tlen) dstLen tptr (fromIntegral tlen) err fromIntegral <$> peek dstLen fromUCharPtr p l = with 0 $ \ err -> with 0 $ \ dstLen -> allocaArray capacity $ \ dst -> do _ <- u_strToUTF8 dst (fromIntegral capacity) dstLen p (fromIntegral l) err dl <- peek dstLen fromPtr dst (fromIntegral dl) where capacity = fromIntegral l * 3 dropWord = dropWord8 takeWord = takeWord8 lengthWord = lengthWord8 foreign import ccall unsafe "hs_text_icu.h __hs_u_strFromUTF8Lenient" u_strFromUTF8Lenient :: Ptr UChar -> Int32 -> Ptr Int32 -> Ptr Word8 -> Int32 -> Ptr UErrorCode -> IO CString foreign import ccall unsafe "hs_text_icu.h __hs_u_strToUTF8" u_strToUTF8 :: Ptr Word8 -> Int32 -> Ptr Int32 -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO CString foreign import ccall unsafe "hs_text_icu.h __hs_utext_openUTF8" utext_openUTF8 :: Ptr UText -> Ptr Word8 -> Int64 -> Ptr UErrorCode -> IO (Ptr UText) #else type TextChar = UChar type TextI = I16 -- text < 2.0 has UChar as internal representation. useAsUCharPtr = useAsPtr asUCharForeignPtr = asForeignPtr fromUCharPtr = fromPtr dropWord = dropWord16 takeWord = takeWord16 lengthWord = lengthWord16 foreign import ccall unsafe "hs_text_icu.h __hs_utext_openUChars" utext_openUChars :: Ptr UText -> Ptr UChar -> Int64 -> Ptr UErrorCode -> IO (Ptr UText) #endif -- | Allocate new ICU data structure (usually via @*_open@ call), -- add finalizer (@*_close@ call) and wrap resulting 'ForeignPtr'. -- -- Exceptions are masked since the memory leak is possible if any -- asynchronous exception (such as a timeout) is raised between -- allocating C data and 'newForeignPtr' call. newICUPtr :: (ForeignPtr a -> i) -> FinalizerPtr a -> IO (Ptr a) -> IO i newICUPtr wrap close open = fmap wrap $ mask_ $ newForeignPtr close =<< open text-icu-0.8.0.4/Data/Text/ICU/Iterator.hs0000644000000000000000000000426507346545000016165 0ustar0000000000000000{-# LANGUAGE ForeignFunctionInterface #-} {-# OPTIONS_GHC -fno-warn-orphans #-} -- | -- Module : Data.Text.ICU.Iterator -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Iteration functions for Unicode, implemented as bindings to the -- International Components for Unicode (ICU) libraries. -- -- Unlike the C and C++ @UCharIterator@ type, the Haskell -- 'CharIterator' type is immutable, and can safely be used in pure -- code. -- -- Functions using these iterators may be more efficient than their -- counterparts. For instance, the 'CharIterator' type allows a UTF-8 -- 'ByteString' to be compared against a 'Text', without first -- converting the 'ByteString': -- -- > fromUtf8 bs == fromText t module Data.Text.ICU.Iterator ( -- * Types and constructors CharIterator , fromString , fromText , fromUtf8 ) where import Data.ByteString (ByteString) import Data.Int (Int32) import Data.Text (Text, pack) import Data.Text.ICU.Internal (CharIterator(..), UCharIterator, asOrdering, withCharIterator) import Foreign.Ptr (Ptr) import System.IO.Unsafe (unsafePerformIO) instance Eq CharIterator where a == b = compareIter a b == EQ instance Ord CharIterator where compare = compareIter -- | Compare two 'CharIterator's. compareIter :: CharIterator -> CharIterator -> Ordering compareIter a b = unsafePerformIO . fmap asOrdering . withCharIterator a $ withCharIterator b . u_strCompareIter -- | Construct a 'CharIterator' from a Unicode string. fromString :: String -> CharIterator fromString = CIText . pack {-# INLINE fromString #-} -- | Construct a 'CharIterator' from a Unicode string. fromText :: Text -> CharIterator fromText = CIText {-# INLINE fromText #-} -- | Construct a 'CharIterator' from a Unicode string encoded as a -- UTF-8 'ByteString'. The validity of the encoded string is *not* -- checked. fromUtf8 :: ByteString -> CharIterator fromUtf8 = CIUTF8 {-# INLINE fromUtf8 #-} foreign import ccall unsafe "hs_text_icu.h __hs_u_strCompareIter" u_strCompareIter :: Ptr UCharIterator -> Ptr UCharIterator -> IO Int32 text-icu-0.8.0.4/Data/Text/ICU/Locale.hsc0000644000000000000000000000214707346545000015733 0ustar0000000000000000{-# LANGUAGE RankNTypes, BangPatterns, ForeignFunctionInterface, RecordWildCards #-} -- | -- Module : Data.Text.ICU.Locale -- Copyright : (c) 2021 Torsten Kemps-Benedix -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Locale functions implemented as bindings to -- the International Components for Unicode (ICU) libraries. module Data.Text.ICU.Locale (availableLocales ) where #include import Control.Monad (forM) import Data.Int (Int32) import Foreign.C.String (CString, peekCString) import Prelude hiding (last) -- | Get the available default locales, i.e. locales that return data when passed to ICU -- APIs, but not including legacy or alias locales. availableLocales :: IO [String] availableLocales = do n <- uloc_countAvailable forM [0..n-1] $ \i -> uloc_getAvailable i >>= peekCString foreign import ccall unsafe "hs_text_icu.h __hs_uloc_getAvailable" uloc_getAvailable :: Int32 -> IO CString foreign import ccall unsafe "hs_text_icu.h __hs_uloc_countAvailable" uloc_countAvailable :: IO Int32 text-icu-0.8.0.4/Data/Text/ICU/Normalize.hsc0000644000000000000000000002677507346545000016511 0ustar0000000000000000{-# LANGUAGE CPP, DeriveDataTypeable, ForeignFunctionInterface #-} -- | -- Module : Data.Text.ICU.Normalize -- Copyright : (c) 2009, 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Character set normalization functions for Unicode, implemented as -- bindings to the International Components for Unicode (ICU) -- libraries. -- -- This module is based on the now deprecated "unorm.h" functions. -- Please use Data.Text.ICU.Normalize2 instead. module Data.Text.ICU.Normalize {-# DEPRECATED "Use Data.Text.ICU.Normalize2 instead" #-} ( -- * Unicode normalization API -- $api NormalizationMode(..) -- * Normalization functions , normalize -- * Normalization checks , quickCheck , isNormalized -- * Normalization-sensitive comparison , CompareOption(..) , compare ) where #ifdef mingw32_HOST_OS #define U_HAVE_INTTYPES_H 1 #endif #include #include import Data.Text (Text) import Data.Text.ICU.Error.Internal (UErrorCode, handleError, handleOverflowError) import Data.Text.ICU.Internal (UBool, UChar, asBool, asOrdering, useAsUCharPtr, fromUCharPtr) import Data.Text.ICU.Normalize.Internal (UNormalizationCheckResult, toNCR) import Data.Typeable (Typeable) import Data.Int (Int32) import Data.Word (Word32) import Foreign.C.Types (CInt(..)) import Foreign.Ptr (Ptr) import System.IO.Unsafe (unsafePerformIO) import Prelude hiding (compare) import Data.List (foldl') import Data.Bits ((.|.)) -- $api -- -- The 'normalize' function transforms Unicode text into an equivalent -- composed or decomposed form, allowing for easier sorting and -- searching of text. 'normalize' supports the standard normalization -- forms described in , -- Unicode Standard Annex #15: Unicode Normalization Forms. -- -- Characters with accents or other adornments can be encoded in -- several different ways in Unicode. For example, take the character A-acute. -- In Unicode, this can be encoded as a single character (the -- \"composed\" form): -- -- @ -- 00C1 LATIN CAPITAL LETTER A WITH ACUTE -- @ -- -- or as two separate characters (the \"decomposed\" form): -- -- @ -- 0041 LATIN CAPITAL LETTER A -- 0301 COMBINING ACUTE ACCENT -- @ -- -- To a user of your program, however, both of these sequences should -- be treated as the same \"user-level\" character \"A with acute -- accent\". When you are searching or comparing text, you must -- ensure that these two sequences are treated equivalently. In -- addition, you must handle characters with more than one accent. -- Sometimes the order of a character's combining accents is -- significant, while in other cases accent sequences in different -- orders are really equivalent. -- -- Similarly, the string \"ffi\" can be encoded as three separate letters: -- -- @ -- 0066 LATIN SMALL LETTER F -- 0066 LATIN SMALL LETTER F -- 0069 LATIN SMALL LETTER I -- @ -- -- or as the single character -- -- @ -- FB03 LATIN SMALL LIGATURE FFI -- @ -- -- The \"ffi\" ligature is not a distinct semantic character, and -- strictly speaking it shouldn't be in Unicode at all, but it was -- included for compatibility with existing character sets that -- already provided it. The Unicode standard identifies such -- characters by giving them \"compatibility\" decompositions into the -- corresponding semantic characters. When sorting and searching, you -- will often want to use these mappings. -- -- 'normalize' helps solve these problems by transforming text into -- the canonical composed and decomposed forms as shown in the first -- example above. In addition, you can have it perform compatibility -- decompositions so that you can treat compatibility characters the -- same as their equivalents. Finally, 'normalize' rearranges accents -- into the proper canonical order, so that you do not have to worry -- about accent rearrangement on your own. -- -- Form 'FCD', \"Fast C or D\", is also designed for collation. It -- allows to work on strings that are not necessarily normalized with -- an algorithm (like in collation) that works under \"canonical -- closure\", i.e., it treats precomposed characters and their -- decomposed equivalents the same. -- -- It is not a normalization form because it does not provide for -- uniqueness of representation. Multiple strings may be canonically -- equivalent (their NFDs are identical) and may all conform to 'FCD' -- without being identical themselves. -- -- The form is defined such that the \"raw decomposition\", the -- recursive canonical decomposition of each character, results in a -- string that is canonically ordered. This means that precomposed -- characters are allowed for as long as their decompositions do not -- need canonical reordering. -- -- Its advantage for a process like collation is that all 'NFD' and -- most 'NFC' texts - and many unnormalized texts - already conform to -- 'FCD' and do not need to be normalized ('NFD') for such a -- process. The 'FCD' 'quickCheck' will return 'Yes' for most strings -- in practice. -- -- @'normalize' 'FCD'@ may be implemented with 'NFD'. -- -- For more details on 'FCD' see the collation design document: -- -- -- ICU collation performs either 'NFD' or 'FCD' normalization -- automatically if normalization is turned on for the collator -- object. Beyond collation and string search, normalized strings may -- be useful for string equivalence comparisons, -- transliteration/transcription, unique representations, etc. -- -- The W3C generally recommends to exchange texts in 'NFC'. Note also -- that most legacy character encodings use only precomposed forms and -- often do not encode any combining marks by themselves. For -- conversion to such character encodings the Unicode text needs to be -- normalized to 'NFC'. For more usage examples, see the Unicode -- Standard Annex. type UCompareOption = Word32 -- | Options to 'compare'. data CompareOption = InputIsFCD -- ^ The caller knows that both strings fulfill the -- 'FCD' conditions. If /not/ set, 'compare' will -- 'quickCheck' for 'FCD' and normalize if -- necessary. | CompareIgnoreCase -- ^ Compare strings case-insensitively using case -- folding, instead of case-sensitively. If set, -- then the following case folding options are -- used. | FoldCaseExcludeSpecialI -- ^ When case folding, exclude the special I -- character. For use with Turkic -- (Turkish/Azerbaijani) text data. deriving (Eq, Show, Enum, Typeable) fromCompareOption :: CompareOption -> UCompareOption fromCompareOption InputIsFCD = #const UNORM_INPUT_IS_FCD fromCompareOption CompareIgnoreCase = #const U_COMPARE_IGNORE_CASE fromCompareOption FoldCaseExcludeSpecialI = #const U_FOLD_CASE_EXCLUDE_SPECIAL_I reduceCompareOptions :: [CompareOption] -> UCompareOption reduceCompareOptions = foldl' orO (#const U_COMPARE_CODE_POINT_ORDER) where a `orO` b = a .|. fromCompareOption b type UNormalizationMode = CInt -- | Normalization modes. data NormalizationMode = None -- ^ No decomposition/composition. | NFD -- ^ Canonical decomposition. | NFKD -- ^ Compatibility decomposition. | NFC -- ^ Canonical decomposition followed by canonical composition. | NFKC -- ^ Compatibility decomposition followed by canonical composition. | FCD -- ^ \"Fast C or D\" form. deriving (Eq, Show, Enum, Typeable) toNM :: NormalizationMode -> UNormalizationMode toNM None = #const UNORM_NONE toNM NFD = #const UNORM_NFD toNM NFKD = #const UNORM_NFKD toNM NFC = #const UNORM_NFC toNM NFKC = #const UNORM_NFKC toNM FCD = #const UNORM_FCD -- | Normalize a string according to the specified normalization mode. normalize :: NormalizationMode -> Text -> Text normalize mode t = unsafePerformIO . useAsUCharPtr t $ \sptr slen -> let slen' = fromIntegral slen mode' = toNM mode in handleOverflowError (fromIntegral slen) (\dptr dlen -> unorm_normalize sptr slen' mode' 0 dptr (fromIntegral dlen)) (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen)) -- | Perform an efficient check on a string, to quickly determine if -- the string is in a particular normalization form. -- -- A 'Nothing' result indicates that a definite answer could not be -- determined quickly, and a more thorough check is required, -- e.g. with 'isNormalized'. The user may have to convert the string -- to its normalized form and compare the results. -- -- A result of 'Just' 'True' or 'Just' 'False' indicates that the -- string definitely is, or is not, in the given normalization form. quickCheck :: NormalizationMode -> Text -> Maybe Bool quickCheck mode t = unsafePerformIO . useAsUCharPtr t $ \ptr len -> fmap toNCR . handleError $ unorm_quickCheck ptr (fromIntegral len) (toNM mode) -- | Indicate whether a string is in a given normalization form. -- -- Unlike 'quickCheck', this function returns a definitive result. -- For 'NFD', 'NFKD', and 'FCD' normalization forms, both functions -- work in exactly the same ways. For 'NFC' and 'NFKC' forms, where -- 'quickCheck' may return 'Nothing', this function will perform -- further tests to arrive at a definitive result. isNormalized :: NormalizationMode -> Text -> Bool isNormalized mode t = unsafePerformIO . useAsUCharPtr t $ \ptr len -> fmap asBool . handleError $ unorm_isNormalized ptr (fromIntegral len) (toNM mode) -- | Compare two strings for canonical equivalence. Further options -- include case-insensitive comparison and codepoint order (as -- opposed to code unit order). -- -- Canonical equivalence between two strings is defined as their -- normalized forms ('NFD' or 'NFC') being identical. This function -- compares strings incrementally instead of normalizing (and -- optionally case-folding) both strings entirely, improving -- performance significantly. -- -- Bulk normalization is only necessary if the strings do not fulfill -- the 'FCD' conditions. Only in this case, and only if the strings -- are relatively long, is memory allocated temporarily. For 'FCD' -- strings and short non-'FCD' strings there is no memory allocation. compare :: [CompareOption] -> Text -> Text -> Ordering compare opts a b = unsafePerformIO . useAsUCharPtr a $ \aptr alen -> useAsUCharPtr b $ \bptr blen -> fmap asOrdering . handleError $ unorm_compare aptr (fromIntegral alen) bptr (fromIntegral blen) (reduceCompareOptions opts) foreign import ccall unsafe "hs_text_icu.h __hs_unorm_compare" unorm_compare :: Ptr UChar -> Int32 -> Ptr UChar -> Int32 -> Word32 -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_unorm_quickCheck" unorm_quickCheck :: Ptr UChar -> Int32 -> UNormalizationMode -> Ptr UErrorCode -> IO UNormalizationCheckResult foreign import ccall unsafe "hs_text_icu.h __hs_unorm_isNormalized" unorm_isNormalized :: Ptr UChar -> Int32 -> UNormalizationMode -> Ptr UErrorCode -> IO UBool foreign import ccall unsafe "hs_text_icu.h __hs_unorm_normalize" unorm_normalize :: Ptr UChar -> Int32 -> UNormalizationMode -> Int32 -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO Int32 text-icu-0.8.0.4/Data/Text/ICU/Normalize/0000755000000000000000000000000007346545000015771 5ustar0000000000000000text-icu-0.8.0.4/Data/Text/ICU/Normalize/Internal.hsc0000644000000000000000000000131107346545000020240 0ustar0000000000000000{-# LANGUAGE CPP, DeriveDataTypeable, ForeignFunctionInterface #-} -- | -- Module : Data.Text.ICU.Normalize.Internal -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC module Data.Text.ICU.Normalize.Internal ( UNormalizationCheckResult , toNCR ) where #include import Foreign.C.Types (CInt) type UNormalizationCheckResult = CInt toNCR :: UNormalizationCheckResult -> Maybe Bool toNCR (#const UNORM_NO) = Just False toNCR (#const UNORM_MAYBE) = Nothing toNCR (#const UNORM_YES) = Just True toNCR _ = error "toNormalizationCheckResult" text-icu-0.8.0.4/Data/Text/ICU/Normalize2.hsc0000644000000000000000000004127607346545000016564 0ustar0000000000000000{-# LANGUAGE EmptyDataDecls, CPP, DeriveDataTypeable, ForeignFunctionInterface #-} -- | -- Module : Data.Text.ICU.Normalize -- Copyright : (c) 2009, 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Character set normalization functions for Unicode, implemented as -- bindings to the International Components for Unicode (ICU) -- libraries. See http://www.unicode.org/reports/tr15/ for a description -- of Unicode normalization modes and why these are needed. module Data.Text.ICU.Normalize2 ( -- * Unicode normalization API -- $api -- * Create normalizers NormalizationMode(..), normalizer, nfcNormalizer, nfdNormalizer, nfkcNormalizer, nfkdNormalizer, nfkcCasefoldNormalizer, -- * Normalize unicode strings nfc, nfd, nfkc, nfkd, nfkcCasefold, normalize, normalizeWith, -- * Checks for normalization quickCheck, isNormalized, isNormalizedWith, -- * Comparison of unicode strings compareUnicode, compareUnicode', CompareOption(..), ) where #ifdef mingw32_HOST_OS #define U_HAVE_INTTYPES_H 1 #endif #include #include import Data.Text (Text) import Data.Text.ICU.Error.Internal (UErrorCode, handleError, handleOverflowError) import Data.Text.ICU.Internal (UBool, UChar, asBool, asOrdering, fromUCharPtr, useAsUCharPtr) import Data.Text.ICU.Normalize.Internal (UNormalizationCheckResult, toNCR) import Data.Typeable (Typeable) import Data.Int (Int32) import Data.Word (Word32) import Foreign.C.Types (CInt(..)) import Foreign.Ptr (Ptr) import System.IO.Unsafe (unsafePerformIO) import Prelude hiding (compare) import Data.List (foldl') import Data.Bits ((.|.)) -- $api -- -- The 'normalize' function transforms Unicode text into an equivalent -- composed or decomposed form, allowing for easier sorting and -- searching of text. 'normalize' supports the standard normalization -- forms described in , -- Unicode Standard Annex #15: Unicode Normalization Forms. -- -- Characters with accents or other adornments can be encoded in -- several different ways in Unicode. For example, take the character A-acute. -- In Unicode, this can be encoded as a single character (the -- \"composed\" form): -- -- @ -- 00C1 LATIN CAPITAL LETTER A WITH ACUTE -- @ -- -- or as two separate characters (the \"decomposed\" form): -- -- @ -- 0041 LATIN CAPITAL LETTER A -- 0301 COMBINING ACUTE ACCENT -- @ -- -- To a user of your program, however, both of these sequences should -- be treated as the same \"user-level\" character \"A with acute -- accent\". When you are searching or comparing text, you must -- ensure that these two sequences are treated equivalently. In -- addition, you must handle characters with more than one accent. -- Sometimes the order of a character's combining accents is -- significant, while in other cases accent sequences in different -- orders are really equivalent. -- -- Similarly, the string \"ffi\" can be encoded as three separate letters: -- -- @ -- 0066 LATIN SMALL LETTER F -- 0066 LATIN SMALL LETTER F -- 0069 LATIN SMALL LETTER I -- @ -- -- or as the single character -- -- @ -- FB03 LATIN SMALL LIGATURE FFI -- @ -- -- The \"ffi\" ligature is not a distinct semantic character, and -- strictly speaking it shouldn't be in Unicode at all, but it was -- included for compatibility with existing character sets that -- already provided it. The Unicode standard identifies such -- characters by giving them \"compatibility\" decompositions into the -- corresponding semantic characters. When sorting and searching, you -- will often want to use these mappings. -- -- 'normalize' helps solve these problems by transforming text into -- the canonical composed and decomposed forms as shown in the first -- example above. In addition, you can have it perform compatibility -- decompositions so that you can treat compatibility characters the -- same as their equivalents. Finally, 'normalize' rearranges accents -- into the proper canonical order, so that you do not have to worry -- about accent rearrangement on your own. -- -- Form 'FCD', \"Fast C or D\", is also designed for collation. It -- allows to work on strings that are not necessarily normalized with -- an algorithm (like in collation) that works under \"canonical -- closure\", i.e., it treats precomposed characters and their -- decomposed equivalents the same. -- -- It is not a normalization form because it does not provide for -- uniqueness of representation. Multiple strings may be canonically -- equivalent (their NFDs are identical) and may all conform to 'FCD' -- without being identical themselves. -- -- The form is defined such that the \"raw decomposition\", the -- recursive canonical decomposition of each character, results in a -- string that is canonically ordered. This means that precomposed -- characters are allowed for as long as their decompositions do not -- need canonical reordering. -- -- Its advantage for a process like collation is that all 'NFD' and -- most 'NFC' texts - and many unnormalized texts - already conform to -- 'FCD' and do not need to be normalized ('NFD') for such a -- process. The 'FCD' 'quickCheck' will return 'Yes' for most strings -- in practice. -- -- @'normalize' 'FCD'@ may be implemented with 'NFD'. -- -- For more details on 'FCD' see the collation design document: -- -- -- ICU collation performs either 'NFD' or 'FCD' normalization -- automatically if normalization is turned on for the collator -- object. Beyond collation and string search, normalized strings may -- be useful for string equivalence comparisons, -- transliteration/transcription, unique representations, etc. -- -- The W3C generally recommends to exchange texts in 'NFC'. Note also -- that most legacy character encodings use only precomposed forms and -- often do not encode any combining marks by themselves. For -- conversion to such character encodings the Unicode text needs to be -- normalized to 'NFC'. For more usage examples, see the Unicode -- Standard Annex. -- | This is an abstract data type holding a reference to the ICU `UNormalizer2` object. newtype Normalizer = Normalizer (Ptr UNormalizer2) data UNormalizer2 -- | Normalization modes analog (but not identical) to the ones in the -- 'Data.Text.ICU.Normalize' module. data NormalizationMode = NFD -- ^ Canonical decomposition. | NFKD -- ^ Compatibility decomposition. | NFC -- ^ Canonical decomposition followed by canonical composition. | NFKC -- ^ Compatibility decomposition followed by canonical composition. | NFKCCasefold -- ^ NFKC with Casefold. deriving (Eq, Show, Enum, Typeable) createNormalizerWith :: (Ptr UErrorCode -> IO (Ptr UNormalizer2)) -> IO Normalizer createNormalizerWith f = Normalizer <$> handleError f -- from the ICU documentation: "Returns an unmodifiable singleton instance of `unorm2_getInstance()`. Do not delete it." -- Thats why we use raw pointer here. -- | Create a normalizer for a given normalization mode. This function is more similar to -- the interface in the 'Data.Text.ICU.Normalize' module. normalizer :: NormalizationMode -> IO Normalizer normalizer NFD = nfdNormalizer normalizer NFKD = nfkdNormalizer normalizer NFC = nfcNormalizer normalizer NFKC = nfkcNormalizer normalizer NFKCCasefold = nfkcCasefoldNormalizer -- | Create an NFC normalizer. nfcNormalizer :: IO Normalizer nfcNormalizer = createNormalizerWith unorm2_getNFCInstance -- | Create an NFD normalizer. nfdNormalizer :: IO Normalizer nfdNormalizer = createNormalizerWith unorm2_getNFDInstance -- | Create an NFKC normalizer. nfkcNormalizer :: IO Normalizer nfkcNormalizer = createNormalizerWith unorm2_getNFKCInstance -- | Create an NFKD normalizer. nfkdNormalizer :: IO Normalizer nfkdNormalizer = createNormalizerWith unorm2_getNFKDInstance -- | Create an NFKCCasefold normalizer. nfkcCasefoldNormalizer :: IO Normalizer nfkcCasefoldNormalizer = createNormalizerWith unorm2_getNFKCCasefoldInstance -- * Normalization -- | Normalize a string with the given normalizer. normalizeWith :: Normalizer -> Text -> Text normalizeWith (Normalizer nfPtr) t = unsafePerformIO $ useAsUCharPtr t $ \sptr slen -> let slen' = fromIntegral slen in handleOverflowError (fromIntegral slen) (\dptr dlen -> unorm2_normalize nfPtr sptr slen' dptr (fromIntegral dlen)) (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen)) -- | Normalize a string using the given normalization mode. normalize :: NormalizationMode -> Text -> Text normalize NFC = nfc normalize NFD = nfd normalize NFKC = nfkc normalize NFKD = nfkd normalize NFKCCasefold = nfkcCasefold -- | Create an NFC normalizer and apply this to the given text. -- -- Let's have a look at a concrete example that contains the letter a with an acute accent twice. -- First as a combination of two codepoints and second as a canonical composite or precomposed -- character. Both look exactly the same but one character consists of two and one of only one -- codepoint. A bytewise comparison does not give equality of these. -- -- >>> import Data.Text -- >>> let t = pack "a\x301á" -- >>> t -- "a\769\225" -- >>> putStr t -- áá -- pack "a\x301" == pack "á" -- False -- -- But now lets apply some normalization functions and see how these characters coincide afterwards -- in two different ways: -- -- >>> nfc t -- "\225\225" -- >>> nfd t -- "a\769a\769" -- -- That is exactly what 'compareUnicode'' does: -- -- >>> pack "a\x301" `compareUnicode'` pack "á" nfc :: Text -> Text nfc t = unsafePerformIO $ do nf <- nfcNormalizer pure $ normalizeWith nf t -- | Create an NFKC normalizer and apply this to the given text. nfkc :: Text -> Text nfkc t = unsafePerformIO $ do nf <- nfkcNormalizer pure $ normalizeWith nf t -- | Create an NFD normalizer and apply this to the given text. nfd :: Text -> Text nfd t = unsafePerformIO $ do nf <- nfdNormalizer pure $ normalizeWith nf t -- | Create an NFC normalizer and apply this to the given text. nfkd :: Text -> Text nfkd t = unsafePerformIO $ do nf <- nfkdNormalizer pure $ normalizeWith nf t -- | Create an NFKCCasefold normalizer and apply this to the given text. nfkcCasefold :: Text -> Text nfkcCasefold t = unsafePerformIO $ do nf <- nfkcCasefoldNormalizer pure $ normalizeWith nf t -- * Checks for normalization -- | Perform an efficient check on a string, to quickly determine if -- the string is in a particular normalization form. -- -- A 'Nothing' result indicates that a definite answer could not be -- determined quickly, and a more thorough check is required, -- e.g. with 'isNormalized'. The user may have to convert the string -- to its normalized form and compare the results. -- -- A result of 'Just' 'True' or 'Just' 'False' indicates that the -- string definitely is, or is not, in the given normalization form. quickCheckWith :: Normalizer -> Text -> Maybe Bool quickCheckWith (Normalizer nfPtr) t = unsafePerformIO $ useAsUCharPtr t $ \sptr slen -> fmap toNCR . handleError $ unorm2_quickCheck nfPtr sptr (fromIntegral slen) quickCheck :: NormalizationMode -> Text -> Maybe Bool quickCheck NFD t = unsafePerformIO $ do nf <- nfdNormalizer pure $ quickCheckWith nf t quickCheck NFC t = unsafePerformIO $ do nf <- nfcNormalizer pure $ quickCheckWith nf t quickCheck NFKD t = unsafePerformIO $ do nf <- nfkdNormalizer pure $ quickCheckWith nf t quickCheck NFKC t = unsafePerformIO $ do nf <- nfkcNormalizer pure $ quickCheckWith nf t quickCheck NFKCCasefold t = unsafePerformIO $ do nf <- nfkcCasefoldNormalizer pure $ quickCheckWith nf t -- | Indicate whether a string is in a given normalization form. -- -- Unlike 'quickCheck', this function returns a definitive result. -- For 'NFD' and 'NFKD' normalization forms, both functions -- work in exactly the same ways. For 'NFC' and 'NFKC' forms, where -- 'quickCheck' may return 'Nothing', this function will perform -- further tests to arrive at a definitive result. isNormalizedWith :: Normalizer -> Text -> Bool isNormalizedWith (Normalizer nfPtr) t = unsafePerformIO $ useAsUCharPtr t $ \sptr slen -> fmap asBool . handleError $ unorm2_isNormalized nfPtr sptr (fromIntegral slen) isNormalized :: NormalizationMode -> Text -> Bool isNormalized NFD t = unsafePerformIO $ do nf <- nfdNormalizer pure $ isNormalizedWith nf t isNormalized NFC t = unsafePerformIO $ do nf <- nfcNormalizer pure $ isNormalizedWith nf t isNormalized NFKD t = unsafePerformIO $ do nf <- nfkdNormalizer pure $ isNormalizedWith nf t isNormalized NFKC t = unsafePerformIO $ do nf <- nfkcNormalizer pure $ isNormalizedWith nf t isNormalized NFKCCasefold t = unsafePerformIO $ do nf <- nfkcCasefoldNormalizer pure $ isNormalizedWith nf t -- * Comparison type UCompareOption = Word32 -- | Options to 'compare'. data CompareOption = InputIsFCD -- ^ The caller knows that both strings fulfill the -- 'FCD' conditions. If /not/ set, 'compare' will -- 'quickCheck' for 'FCD' and normalize if -- necessary. | CompareIgnoreCase -- ^ Compare strings case-insensitively using case -- folding, instead of case-sensitively. If set, -- then the following case folding options are -- used. | FoldCaseExcludeSpecialI -- ^ When case folding, exclude the special I -- character. For use with Turkic -- (Turkish/Azerbaijani) text data. deriving (Eq, Show, Enum, Typeable) fromCompareOption :: CompareOption -> UCompareOption fromCompareOption InputIsFCD = #const UNORM_INPUT_IS_FCD fromCompareOption CompareIgnoreCase = #const U_COMPARE_IGNORE_CASE fromCompareOption FoldCaseExcludeSpecialI = #const U_FOLD_CASE_EXCLUDE_SPECIAL_I reduceCompareOptions :: [CompareOption] -> UCompareOption reduceCompareOptions = foldl' orO (#const U_COMPARE_CODE_POINT_ORDER) where a `orO` b = a .|. fromCompareOption b -- | Compare two strings for canonical equivalence. Further options -- include case-insensitive comparison and codepoint order (as -- opposed to code unit order). -- -- Canonical equivalence between two strings is defined as their -- normalized forms ('NFD' or 'NFC') being identical. This function -- compares strings incrementally instead of normalizing (and -- optionally case-folding) both strings entirely, improving -- performance significantly. -- -- Bulk normalization is only necessary if the strings do not fulfill -- the 'FCD' conditions. Only in this case, and only if the strings -- are relatively long, is memory allocated temporarily. For 'FCD' -- strings and short non-'FCD' strings there is no memory allocation. compareUnicode :: [CompareOption] -> Text -> Text -> Ordering compareUnicode opts a b = unsafePerformIO $ do useAsUCharPtr a $ \aptr alen -> useAsUCharPtr b $ \bptr blen -> fmap asOrdering . handleError $ unorm_compare aptr (fromIntegral alen) bptr (fromIntegral blen) (reduceCompareOptions opts) -- | This is equivalent to `compareUnicode []`. compareUnicode' :: Text -> Text -> Ordering compareUnicode' = compareUnicode [] foreign import ccall unsafe "hs_text_icu.h __hs_unorm2_getNFCInstance" unorm2_getNFCInstance :: Ptr UErrorCode -> IO (Ptr UNormalizer2) foreign import ccall unsafe "hs_text_icu.h __hs_unorm2_getNFDInstance" unorm2_getNFDInstance :: Ptr UErrorCode -> IO (Ptr UNormalizer2) foreign import ccall unsafe "hs_text_icu.h __hs_unorm2_getNFKCInstance" unorm2_getNFKCInstance :: Ptr UErrorCode -> IO (Ptr UNormalizer2) foreign import ccall unsafe "hs_text_icu.h __hs_unorm2_getNFKDInstance" unorm2_getNFKDInstance :: Ptr UErrorCode -> IO (Ptr UNormalizer2) foreign import ccall unsafe "hs_text_icu.h __hs_unorm2_getNFKCCasefoldInstance" unorm2_getNFKCCasefoldInstance :: Ptr UErrorCode -> IO (Ptr UNormalizer2) foreign import ccall unsafe "hs_text_icu.h __hs_unorm_compare" unorm_compare :: Ptr UChar -> Int32 -> Ptr UChar -> Int32 -> Word32 -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_unorm2_quickCheck" unorm2_quickCheck :: Ptr UNormalizer2 -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO UNormalizationCheckResult foreign import ccall unsafe "hs_text_icu.h __hs_unorm2_isNormalized" unorm2_isNormalized :: Ptr UNormalizer2 -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO UBool foreign import ccall unsafe "hs_text_icu.h __hs_unorm2_normalize" unorm2_normalize :: Ptr UNormalizer2 -> Ptr UChar -> Int32 -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO Int32 text-icu-0.8.0.4/Data/Text/ICU/Number.hsc0000644000000000000000000002435507346545000015771 0ustar0000000000000000{-# LANGUAGE CPP, DeriveDataTypeable, ForeignFunctionInterface #-} -- | -- Module : Data.Text.ICU.Number -- Copyright : (c) 2020 Torsten Kemps-Benedix -- -- License : BSD-style -- Maintainer : tkx68@icloud.com -- Stability : experimental -- Portability : GHC -- -- New users with are strongly encouraged to see -- if Data.Text.ICU.NumberFormatter fits their use case. -- Although not deprecated, this header is provided for backwards -- compatibility only. module Data.Text.ICU.Number ( -- * Unicode number formatting API -- $api numberFormatter , FormattableNumber, formatNumber, formatNumber' , NumberFormatStyle(..) , NumberFormat ) where #ifdef mingw32_HOST_OS #define U_HAVE_INTTYPES_H 1 #endif #include #include import GHC.Natural import Data.Text (Text) import qualified Data.Text as T import Data.Text.ICU.Error import Data.Text.ICU.Error.Internal (UErrorCode, UParseError, handleParseError, handleOverflowError) import Data.Text.ICU.Internal (UChar, useAsUCharPtr, fromUCharPtr) import Data.Text.ICU.Internal (LocaleName, withLocaleName) import Data.Typeable (Typeable) import Data.Int (Int32) import Foreign.C.Types (CInt(..), CDouble(..)) import Foreign.Ptr (Ptr) import System.IO.Unsafe (unsafePerformIO) import Prelude hiding (compare) import Foreign.C.String (CString) import Data.Text.ICU.Number.Internal -- $api -- -- This module helps you to format and parse numbers for any locale. Your code -- can be completely independent of the locale conventions for decimal points, -- thousands-separators, or even the particular decimal digits used, or whether -- the number format is even decimal. There are different number format styles -- like decimal, currency, percent and spelled-out. -- -- Use 'formatter' to create a formatter and 'format' to format numbers. -- | The possible number format styles. data NumberFormatStyle = NUM_PATTERN_DECIMAL Text -- ^ Decimal format defined by a pattern string. See the section \"Patterns\" at for further details regarding pattern strings. | NUM_DECIMAL -- ^ Decimal format ("normal" style). | NUM_CURRENCY -- ^ Currency format (generic). Defaults to UNUM_CURRENCY_STANDARD style (using currency symbol, e.g., "$1.00", with non-accounting style for negative values e.g. using minus sign). The specific style may be specified using the -cf- locale key. | NUM_PERCENT -- ^ Percent format. | NUM_SCIENTIFIC -- ^ Scientific format. | NUM_SPELLOUT -- ^ Spellout rule-based format. The default ruleset can be specified/changed using unum_setTextAttribute with UNUM_DEFAULT_RULESET; the available public rulesets can be listed using unum_getTextAttribute with UNUM_PUBLIC_RULESETS. | NUM_ORDINAL -- ^ Ordinal rule-based format. The default ruleset can be specified/changed using unum_setTextAttribute with UNUM_DEFAULT_RULESET; the available public rulesets can be listed using unum_getTextAttribute with UNUM_PUBLIC_RULESETS. | NUM_DURATION -- ^ Duration rule-based format. | NUM_NUMBERING_SYSTEM -- ^ Numbering system rule-based format. | NUM_PATTERN_RULEBASED Text -- ^ Rule-based format defined by a pattern string. See the section \"Patterns\" at for further details regarding pattern strings. | NUM_CURRENCY_ISO -- ^ Currency format with an ISO currency code, e.g., "USD1.00". | NUM_CURRENCY_PLURAL -- ^ Currency format with a pluralized currency name, e.g., "1.00 US dollar" and "3.00 US dollars". | NUM_CURRENCY_ACCOUNTING -- ^ Currency format for accounting, e.g., "($3.00)" for negative currency amount instead of "-$3.00" (UNUM_CURRENCY). Overrides any style specified using -cf- key in locale. | NUM_CASH_CURRENCY -- ^ Currency format with a currency symbol given CASH usage, e.g., "NT$3" instead of "NT$3.23". | NUM_DECIMAL_COMPACT_SHORT -- ^ Decimal format expressed using compact notation (short form, corresponds to UNumberCompactStyle=UNUM_SHORT) e.g. "23K", "45B" | NUM_DECIMAL_COMPACT_LONG -- ^ Decimal format expressed using compact notation (long form, corresponds to UNumberCompactStyle=UNUM_LONG) e.g. "23 thousand", "45 billion" | NUM_CURRENCY_STANDARD -- ^ Currency format with a currency symbol, e.g., "$1.00", using non-accounting style for negative values (e.g. minus sign). Overrides any style specified using -cf- key in locale. | NUM_FORMAT_STYLE_COUNT -- ^ One more than the highest normal UNumberFormatStyle value. Deprecated: ICU 58 The numeric value may change over time, see ICU ticket #12420. | NUM_DEFAULT -- ^ Default format. | NUM_IGNORE -- ^ Alias for NUM_PATTERN_DECIMAL. deriving (Eq, Show, Typeable) type UNumberFormatStyle = CInt toNFS :: NumberFormatStyle -> UNumberFormatStyle toNFS (NUM_PATTERN_DECIMAL _) = #const UNUM_PATTERN_DECIMAL toNFS NUM_DECIMAL = #const UNUM_DECIMAL toNFS NUM_CURRENCY = #const UNUM_CURRENCY toNFS NUM_PERCENT = #const UNUM_PERCENT toNFS NUM_SCIENTIFIC = #const UNUM_SCIENTIFIC toNFS NUM_SPELLOUT = #const UNUM_SPELLOUT toNFS NUM_ORDINAL = #const UNUM_ORDINAL toNFS NUM_DURATION = #const UNUM_DURATION toNFS NUM_NUMBERING_SYSTEM = #const UNUM_NUMBERING_SYSTEM toNFS (NUM_PATTERN_RULEBASED _) = #const UNUM_PATTERN_RULEBASED toNFS NUM_CURRENCY_ISO = #const UNUM_CURRENCY_ISO toNFS NUM_CURRENCY_PLURAL = #const UNUM_CURRENCY_PLURAL toNFS NUM_CURRENCY_ACCOUNTING = #const UNUM_CURRENCY_ACCOUNTING toNFS NUM_CASH_CURRENCY = #const UNUM_CASH_CURRENCY toNFS NUM_DECIMAL_COMPACT_SHORT = #const UNUM_DECIMAL_COMPACT_SHORT toNFS NUM_DECIMAL_COMPACT_LONG = #const UNUM_DECIMAL_COMPACT_LONG toNFS NUM_CURRENCY_STANDARD = #const UNUM_CURRENCY_STANDARD toNFS NUM_FORMAT_STYLE_COUNT = #const UNUM_FORMAT_STYLE_COUNT toNFS NUM_DEFAULT = #const UNUM_DEFAULT toNFS NUM_IGNORE = #const UNUM_IGNORE -- | Create and return a new NumberFormat for formatting and parsing numbers. -- -- A NumberFormat may be used to format numbers by calling unum_format, and -- to parse numbers by calling unum_parse. The caller must call unum_close when -- done to release resources used by this object. numberFormatter :: NumberFormatStyle -- ^ The type of number format to open. If NUM_PATTERN_DECIMAL or NUM_PATTERN_RULEBASED is passed then the number format is opened using the given pattern, which must conform to the syntax described in DecimalFormat or RuleBasedNumberFormat, respectively. -> LocaleName -- ^ A locale identifier to use to determine formatting and parsing conventions, or NULL to use the default locale, e.g. "de_DE". -> NumberFormat numberFormatter sty@(NUM_PATTERN_DECIMAL pattern) loc = numberFormatter' (toNFS sty) pattern loc numberFormatter sty@(NUM_PATTERN_RULEBASED pattern) loc = numberFormatter' (toNFS sty) pattern loc numberFormatter style loc = numberFormatter' (toNFS style) T.empty loc numberFormatter' :: UNumberFormatStyle -- ^ The type of number format to open. If NUM_PATTERN_DECIMAL or NUM_PATTERN_RULEBASED is passed then the number format is opened using the given pattern, which must conform to the syntax described in DecimalFormat or RuleBasedNumberFormat, respectively. -> Text -> LocaleName -- ^ A locale identifier to use to determine formatting and parsing conventions, or NULL to use the default locale, e.g. "de_DE". -> NumberFormat numberFormatter' style pattern loc = System.IO.Unsafe.unsafePerformIO $ fmap C $ wrap $ useAsUCharPtr pattern $ \patternPtr patternLen -> withLocaleName loc $ handleParseError (== u_PARSE_ERROR) . (unum_open style patternPtr (fromIntegral patternLen)) foreign import ccall unsafe "hs_text_icu.h __hs_unum_open" unum_open :: UNumberFormatStyle -> Ptr UChar -> Int32 -> CString -> Ptr UParseError -> Ptr UErrorCode -> IO (Ptr UNumberFormat) -- | Format an integer using a NumberFormat. -- -- The integer will be formatted according to the UNumberFormat's locale. class FormattableNumber n where formatNumber :: NumberFormat -- ^ The formatter to use. -> n -- ^ The number to format. -> Text -- | Create a formatter and apply it in one step. formatNumber' :: (FormattableNumber n) => NumberFormatStyle -- ^ The type of number format to open. If NUM_PATTERN_DECIMAL or NUM_PATTERN_RULEBASED is passed then the number format is opened using the given pattern, which must conform to the syntax described in DecimalFormat or RuleBasedNumberFormat, respectively. -> LocaleName -- ^ A locale identifier to use to determine formatting and parsing conventions, or NULL to use the default locale, e.g. "de_DE". -> n -- ^ The number to format. -> Text formatNumber' style loc x = formatNumber (numberFormatter style loc) x instance FormattableNumber Integer where formatNumber (C nf) x = numberFormatInt nf (fromIntegral x) instance FormattableNumber Natural where formatNumber (C nf) x = numberFormatInt nf (fromIntegral x) instance FormattableNumber Int where formatNumber (C nf) x = numberFormatInt nf x instance FormattableNumber Double where formatNumber (C nf) x = numberFormatDouble nf x instance FormattableNumber Float where formatNumber (C nf) x = numberFormatDouble nf (fromRational $ toRational x) -- | Create a number format. numberFormatInt :: MNumberFormat -> Int -> Text numberFormatInt nf x = System.IO.Unsafe.unsafePerformIO $ withNumberFormat nf $ \nptr -> handleOverflowError 100 (\dptr dlen ec -> unum_formatInt64 nptr (fromIntegral x) dptr (fromIntegral dlen) ec) (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen)) -- | Format a number. numberFormatDouble :: MNumberFormat -> Double -> Text numberFormatDouble nf x = System.IO.Unsafe.unsafePerformIO $ withNumberFormat nf $ \nptr -> handleOverflowError 100 (\dptr dlen ec -> unum_formatDouble nptr (CDouble x) dptr (fromIntegral dlen) ec) (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen)) foreign import ccall unsafe "hs_text_icu.h __hs_unum_formatInt64" unum_formatInt64 :: Ptr UNumberFormat -> Int -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_unum_formatDouble" unum_formatDouble :: Ptr UNumberFormat -> CDouble -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO Int32 text-icu-0.8.0.4/Data/Text/ICU/Number/0000755000000000000000000000000007346545000015261 5ustar0000000000000000text-icu-0.8.0.4/Data/Text/ICU/Number/Internal.hs0000644000000000000000000000303607346545000017373 0ustar0000000000000000{-# LANGUAGE DeriveDataTypeable, EmptyDataDecls, ForeignFunctionInterface #-} -- | -- Module : Data.Text.ICU.Collate.Internal -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Internals of the string collation infrastructure. module Data.Text.ICU.Number.Internal ( -- * Unicode collation API MNumberFormat(..) , NumberFormat(..) , UNumberFormat , withNumberFormat , wrap ) where import Data.Typeable (Typeable) import Foreign.ForeignPtr (ForeignPtr, withForeignPtr) import Foreign.Ptr (FunPtr, Ptr) import Data.Text.ICU.Internal (newICUPtr) -- $api -- data UNumberFormat -- | This is the number formatter. It can be created with 'formatter'. Use it to format numbers with the 'format' function. data MNumberFormat = MNumberFormat {-# UNPACK #-} !(ForeignPtr UNumberFormat) deriving (Typeable) -- | This is the number formatter. It can be created with 'formatter'. Use it to format numbers with the 'format' function. newtype NumberFormat = C MNumberFormat deriving (Typeable) withNumberFormat :: MNumberFormat -> (Ptr UNumberFormat -> IO a) -> IO a withNumberFormat (MNumberFormat col) action = withForeignPtr col action {-# INLINE withNumberFormat #-} wrap :: IO (Ptr UNumberFormat) -> IO MNumberFormat wrap = newICUPtr MNumberFormat unum_close {-# INLINE wrap #-} foreign import ccall unsafe "hs_text_icu.h &__hs_unum_close" unum_close :: FunPtr (Ptr UNumberFormat -> IO ()) text-icu-0.8.0.4/Data/Text/ICU/NumberFormatter.hsc0000644000000000000000000001774607346545000017663 0ustar0000000000000000{-# LANGUAGE EmptyDataDecls, RankNTypes, BangPatterns, ForeignFunctionInterface, RecordWildCards #-} -- | -- Module : Data.Text.ICU.NumberFormatter -- Copyright : (c) 2021 Torsten Kemps-Benedix -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Number formatter implemented as bindings to -- the International Components for Unicode (ICU) libraries. module Data.Text.ICU.NumberFormatter ( -- * Data NumberFormatter, -- * Formatter numberFormatter, -- $skeleton -- * Formatting functions formatIntegral, formatIntegral', formatDouble, formatDouble' ) where #include import Data.Int (Int32, Int64) import Data.Text (Text) import Data.Text.ICU.Error.Internal (UErrorCode, handleError, handleOverflowError) import Data.Text.ICU.Internal (LocaleName(..), UChar, withLocaleName, newICUPtr, fromUCharPtr, useAsUCharPtr) import Foreign.C.String (CString) import Foreign.C.Types (CDouble(..)) import Foreign.ForeignPtr (withForeignPtr, ForeignPtr) import Foreign.Ptr (FunPtr, Ptr) import Prelude hiding (last) import System.IO.Unsafe (unsafePerformIO) -- $skeleton -- -- Here are some examples for number skeletons, see -- https://unicode-org.github.io/icu/userguide/format_parse/numbers/skeletons.html#examples for more: -- -- +----------------------------+-----------------+--------+--------------+-------------------------------------------------------------+ -- | Long Skeleton | Concise Skeleton | Input | en-US Output | Comments | -- +============================+==================+=======+==============+=============================================================+ -- | percent | % | 25 | 25% | | -- | .00 |.00 | 25 | 25.00 | Equivalent to Precision::fixedFraction(2) | -- | percent .00 | % .00 | 25 | 25.00% | | -- | scale/100 | scale/100 | 0.3 | 30 | Multiply by 100 before formatting | -- | percent scale/100 | %x100 | 0.3 | 30% | | -- | measure-unit/length-meter | unit/meter | 5 | 5 m | UnitWidth defaults to Short | -- | unit-width-full-name | unit/meter | 5 | 5 meters | | -- | compact-short | K | 5000 | 5K | | -- | compact-long | KK | 5000 | 5 thousand | | -- | group-min2 | ,? | 5000 | 5000 | Require 2 digits in group for separator | -- | group-min2 | ,? | 15000 | 15,000 | | -- | sign-always | +! | 60 | +60 | Show sign on all numbers | -- | sign-always | +! | 0 | +0 | | -- | sign-except-zero | +? | 60 | +60 | Show sign on all numbers except 0 | -- | sign-except-zero | +? | 0 | 0 | | -- +----------------------------+-----------------+--------+--------------+-------------------------------------------------------------+ data UNumberFormatter data UFormattedNumber newtype NumberFormatter = NumberFormatter (ForeignPtr UNumberFormatter) -- | Create a new 'NumberFormatter'. -- -- See https://unicode-org.github.io/icu/userguide/format_parse/numbers/skeletons.html for how to specify -- the number skeletons. And use 'availableLocales' in order to find the allowed locale names. These -- usuallly look like "en", "de", "de_AT" etc. See 'formatIntegral' and 'formatDouble' for some examples. numberFormatter :: Text -> LocaleName -> IO NumberFormatter numberFormatter skel loc = withLocaleName loc $ \locale -> useAsUCharPtr skel $ \skelPtr skelLen -> newICUPtr NumberFormatter unumf_close $ handleError $ unumf_openForSkeletonAndLocale skelPtr (fromIntegral skelLen) locale -- | Format an integral number. -- -- See https://unicode-org.github.io/icu/userguide/format_parse/numbers/skeletons.html for how to specify -- the number skeletons. -- -- >>> import Data.Text -- >>> nf <- numberFormatter (pack "precision-integer") (Locale "de") -- >>> formatIntegral nf 12345 -- "12.345" -- >>> nf2 <- numberFormatter (pack "precision-integer") (Locale "fr") -- >>> formatIntegral nf2 12345 -- "12\8239\&345" formatIntegral :: Integral a => NumberFormatter -> a -> Text formatIntegral (NumberFormatter nf) x = unsafePerformIO $ do withForeignPtr nf $ \nfPtr -> do resultPtr <- newResult withForeignPtr resultPtr $ \resPtr -> do handleError $ unumf_formatInt nfPtr (fromIntegral x) resPtr t <- handleOverflowError (fromIntegral (64 :: Int)) (\dptr dlen -> unumf_resultToString resPtr dptr dlen) (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen)) pure t -- | Create a number formatter and apply it to an integral number. formatIntegral' :: (Integral a) => Text -> LocaleName -> a -> Text formatIntegral' skel loc x = unsafePerformIO $ do nf <- numberFormatter skel loc pure $ formatIntegral nf x -- | Format a Double. -- -- See https://unicode-org.github.io/icu/userguide/format_parse/numbers/skeletons.html for how to specify -- the number skeletons. -- -- >>> import Data.Text -- >>> nf3 <- numberFormatter (pack "precision-currency-cash") (Locale "it") -- >>> formatDouble nf3 12345.6789 -- "12.345,68" formatDouble :: NumberFormatter -> Double -> Text formatDouble (NumberFormatter nf) x = unsafePerformIO $ do withForeignPtr nf $ \nfPtr -> do resultPtr <- newResult withForeignPtr resultPtr $ \resPtr -> do handleError $ unumf_formatDouble nfPtr (CDouble x) resPtr t <- handleOverflowError (fromIntegral (64 :: Int)) (\dptr dlen -> unumf_resultToString resPtr dptr dlen) (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen)) pure t -- | Create a number formatter and apply it to a Double. formatDouble' :: Text -> LocaleName -> Double -> Text formatDouble' skel loc x = unsafePerformIO $ do nf <- numberFormatter skel loc pure $ formatDouble nf x newResult :: IO (ForeignPtr UFormattedNumber) newResult = newICUPtr id unumf_closeResult $ handleError unumf_openResult foreign import ccall unsafe "hs_text_icu.h __hs_unumf_openForSkeletonAndLocale" unumf_openForSkeletonAndLocale :: Ptr UChar -> Int32 -> CString -> Ptr UErrorCode -> IO (Ptr UNumberFormatter) foreign import ccall unsafe "hs_text_icu.h &__hs_unumf_close" unumf_close :: FunPtr (Ptr UNumberFormatter -> IO ()) foreign import ccall unsafe "hs_text_icu.h __hs_unumf_openResult" unumf_openResult :: Ptr UErrorCode -> IO (Ptr UFormattedNumber) foreign import ccall unsafe "hs_text_icu.h &__hs_unumf_closeResult" unumf_closeResult :: FunPtr (Ptr UFormattedNumber -> IO ()) foreign import ccall unsafe "hs_text_icu.h __hs_unumf_formatInt" unumf_formatInt :: Ptr UNumberFormatter -> Int64 -> Ptr UFormattedNumber -> Ptr UErrorCode -> IO () foreign import ccall unsafe "hs_text_icu.h __hs_unumf_formatDouble" unumf_formatDouble :: Ptr UNumberFormatter -> CDouble -> Ptr UFormattedNumber -> Ptr UErrorCode -> IO () foreign import ccall unsafe "hs_text_icu.h __hs_unumf_resultToString" unumf_resultToString :: Ptr UFormattedNumber -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO Int32 text-icu-0.8.0.4/Data/Text/ICU/Regex.hs0000644000000000000000000001525107346545000015443 0ustar0000000000000000{-# LANGUAGE BangPatterns, EmptyDataDecls, MagicHash, RecordWildCards, ScopedTypeVariables #-} {-# OPTIONS_GHC -fno-warn-orphans #-} -- | -- Module : Data.Text.ICU.Regex -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Regular expression support for Unicode, implemented as bindings to -- the International Components for Unicode (ICU) libraries. -- -- The syntax and behaviour of ICU regular expressions are Perl-like. -- For complete details, see the ICU User Guide entry at -- . -- -- /Note/: The functions in this module are not thread safe. For -- thread safe use, see 'clone' below, or use the pure functions in -- "Data.Text.ICU". module Data.Text.ICU.Regex ( -- * Types MatchOption(..) , ParseError(errError, errLine, errOffset) , Regex -- * Functions -- ** Construction , regex , regex' , clone -- ** Managing text to search , setText , getUTextPtr -- ** Inspection , pattern -- ** Searching , find , findNext -- ** Match groups -- $groups , groupCount , start , end , start_ , end_ ) where import Data.Text.ICU.Regex.Internal import qualified Control.Exception as E import Data.IORef (newIORef, readIORef, writeIORef) import Data.Text (Text) import Data.Text.ICU.Internal (asBool, UTextPtr, asUTextPtr, emptyUTextPtr, TextI, withUTextPtr, fromUCharPtr, newICUPtr) import Data.Text.ICU.Error.Internal (ParseError(..), handleError) import Foreign.ForeignPtr (withForeignPtr) import Foreign.Marshal.Alloc (alloca) import Foreign.Storable (peek) import System.IO.Unsafe (unsafePerformIO) instance Show Regex where show re = "Regex " ++ show (pattern re) -- $groups -- -- Capturing groups are numbered starting from zero. Group zero is -- always the entire matching text. Groups greater than zero contain -- the text matching each capturing group in a regular expression. -- | Compile a regular expression with the given options. This is -- safest to use when the pattern is constructed at run time. regex' :: [MatchOption] -> Text -> IO (Either ParseError Regex) regex' opts pat = (Right `fmap` regex opts pat) `E.catch` \(err::ParseError) -> return (Left err) -- | Set the subject text string upon which the regular expression -- will look for matches. This function may be called any number of -- times, allowing the regular expression pattern to be applied to -- different strings. setText :: Regex -> Text -> IO () setText Regex{..} t = do hayfp <- asUTextPtr t withForeignPtr reRe $ \rePtr -> withUTextPtr hayfp $ \hayPtr -> handleError $ uregex_setUText rePtr hayPtr writeIORef reText hayfp -- | Get the subject text that is currently associated with this -- regular expression object. getUTextPtr :: Regex -> IO UTextPtr getUTextPtr Regex{..} = readIORef reText -- | Return the source form of the pattern used to construct this -- regular expression or match. pattern :: Regex -> Text pattern Regex{..} = unsafePerformIO . withForeignPtr reRe $ \rePtr -> alloca $ \lenPtr -> do textPtr <- handleError $ uregex_pattern rePtr lenPtr (fromUCharPtr textPtr . fromIntegral) =<< peek lenPtr -- | Find the first matching substring of the input string that -- matches the pattern. -- -- If /n/ is non-negative, the search for a match begins at the -- specified index, and any match region is reset. -- -- If /n/ is -1, the search begins at the start of the input region, -- or at the start of the full string if no region has been specified. -- -- If a match is found, 'start', 'end', and 'group' will provide more -- information regarding the match. find :: Regex -> TextI -> IO Bool find Regex{..} n = fmap asBool . withForeignPtr reRe $ \rePtr -> handleError $ uregex_find rePtr (fromIntegral n) -- | Find the next pattern match in the input string. Begin searching -- the input at the location following the end of he previous match, -- or at the start of the string (or region) if there is no previous -- match. -- -- If a match is found, 'start', 'end', and 'group' will provide more -- information regarding the match. findNext :: Regex -> IO Bool findNext Regex{..} = fmap asBool . withForeignPtr reRe $ handleError . uregex_findNext -- | Make a copy of a compiled regular expression. Cloning a regular -- expression is faster than opening a second instance from the source -- form of the expression, and requires less memory. -- -- Note that the current input string and the position of any matched -- text within it are not cloned; only the pattern itself and and the -- match mode flags are copied. -- -- Cloning can be particularly useful to threaded applications that -- perform multiple match operations in parallel. Each concurrent RE -- operation requires its own instance of a 'Regex'. clone :: Regex -> IO Regex {-# INLINE clone #-} clone Regex{..} = do newICUPtr Regex uregex_close (withForeignPtr reRe (handleError . uregex_clone)) <*> newIORef emptyUTextPtr -- | Return the number of capturing groups in this regular -- expression's pattern. groupCount :: Regex -> IO Int groupCount Regex{..} = fmap fromIntegral . withForeignPtr reRe $ handleError . uregex_groupCount -- | Returns the index in the input string of the start of the text -- matched by the specified capture group during the previous match -- operation. Returns @-1@ if the capture group was not part of the -- last match. start_ :: Regex -> Int -> IO TextI start_ Regex{..} n = fmap fromIntegral . withForeignPtr reRe $ \rePtr -> handleError $ uregex_start rePtr (fromIntegral n) -- | Returns the index in the input string of the end of the text -- matched by the specified capture group during the previous match -- operation. Returns @-1@ if the capture group was not part of -- the last match. end_ :: Regex -> Int -> IO TextI end_ Regex{..} n = fmap fromIntegral . withForeignPtr reRe $ \rePtr -> handleError $ uregex_end rePtr (fromIntegral n) -- | Returns the index in the input string of the start of the text -- matched by the specified capture group during the previous match -- operation. Returns 'Nothing' if the capture group was not part of -- the last match. start :: Regex -> Int -> IO (Maybe TextI) start r n = check `fmap` start_ r n -- | Returns the index in the input string of the end of the text -- matched by the specified capture group during the previous match -- operation. Returns 'Nothing' if the capture group was not part of -- the last match. end :: Regex -> Int -> IO (Maybe TextI) end r n = check `fmap` end_ r n check :: TextI -> Maybe TextI check (-1) = Nothing check k = Just $! fromIntegral k text-icu-0.8.0.4/Data/Text/ICU/Regex/0000755000000000000000000000000007346545000015103 5ustar0000000000000000text-icu-0.8.0.4/Data/Text/ICU/Regex/Internal.hsc0000644000000000000000000002172207346545000017362 0ustar0000000000000000{-# LANGUAGE BangPatterns, DeriveDataTypeable, EmptyDataDecls, ForeignFunctionInterface, MagicHash, RecordWildCards, ScopedTypeVariables #-} -- | -- Module : Data.Text.ICU.Regex.Internal -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Regular expression support for Unicode, implemented as bindings to -- the International Components for Unicode (ICU) libraries. -- -- The syntax and behaviour of ICU regular expressions are Perl-like. -- For complete details, see the ICU User Guide entry at -- . module Data.Text.ICU.Regex.Internal ( -- * Types MatchOption(..) , Regex(..) , URegularExpression -- * Functions , regex , uregex_clone , uregex_close , uregex_end , uregex_find , uregex_findNext , uregex_getText , uregex_group , uregex_groupCount , uregex_pattern , uregex_setUText , uregex_start ) where import Control.Monad (when) import Data.IORef (IORef, newIORef) import Data.Int (Int32) import Data.Text (Text) import Data.Text.ICU.Internal (UBool, UChar, UTextPtr, UText, useAsUCharPtr, withUTextPtr, emptyUTextPtr, newICUPtr) import Data.Text.ICU.Error (isRegexError) import Data.Text.ICU.Error.Internal (UParseError, UErrorCode, handleError, handleParseError) import Data.Typeable (Typeable) import Data.Word (Word32) import Foreign.ForeignPtr (ForeignPtr) import Foreign.Ptr (FunPtr, Ptr) #include -- | Options for controlling matching behaviour. data MatchOption = CaseInsensitive -- ^ Enable case insensitive matching. | Comments -- ^ Allow comments and white space within patterns. | DotAll -- ^ If set, @\'.\'@ matches line terminators. Otherwise @\'.\'@ -- matching stops at line end. | Literal -- ^ If set, treat the entire pattern as a literal string. -- Metacharacters or escape sequences in the input sequence will -- be given no special meaning. -- -- The option 'CaseInsensitive' retains its meanings on matching -- when used in conjunction with this option. Other options -- become superfluous. | Multiline -- ^ Control behaviour of @\'$\'@ and @\'^\'@. If set, recognize -- line terminators within string, Otherwise, match only at start -- and end of input string. | HaskellLines -- ^ Haskell-only line endings. When this mode is enabled, only -- @\'\\n\'@ is recognized as a line ending in the behavior of -- @\'.\'@, @\'^\'@, and @\'$\'@. | UnicodeWord -- ^ Unicode word boundaries. If set, @\'\\\\b\'@ uses the -- Unicode TR 29 definition of word boundaries. -- -- /Warning/: Unicode word boundaries are quite different from -- traditional regular expression word boundaries. See -- . | ErrorOnUnknownEscapes -- ^ Throw an error on unrecognized backslash escapes. If set, -- fail with an error on patterns that contain backslash-escaped -- ASCII letters without a known special meaning. If this flag is -- not set, these escaped letters represent themselves. | WorkLimit Int -- ^ Set a processing limit for match operations. -- -- Some patterns, when matching certain strings, can run in -- exponential time. For practical purposes, the match operation -- may appear to be in an infinite loop. When a limit is set a -- match operation will fail with an error if the limit is -- exceeded. -- -- The units of the limit are steps of the match engine. -- Correspondence with actual processor time will depend on the -- speed of the processor and the details of the specific pattern, -- but will typically be on the order of milliseconds. -- -- By default, the matching time is not limited. | StackLimit Int -- ^ Set the amount of heap storage available for use by the match -- backtracking stack. -- -- ICU uses a backtracking regular expression engine, with the -- backtrack stack maintained on the heap. This function sets the -- limit to the amount of memory that can be used for this -- purpose. A backtracking stack overflow will result in an error -- from the match operation that caused it. -- -- A limit is desirable because a malicious or poorly designed -- pattern can use excessive memory, potentially crashing the -- process. A limit is enabled by default. deriving (Eq, Show, Typeable) -- | A compiled regular expression. -- -- 'Regex' values are usually constructed using the 'regex' or -- 'regex'' functions. This type is also an instance of 'IsString', -- so if you have the @OverloadedStrings@ language extension enabled, -- you can construct a 'Regex' by simply writing the pattern in -- quotes (though this does not allow you to specify any 'Option's). data Regex = Regex { reRe :: ForeignPtr URegularExpression , reText :: IORef UTextPtr } -- | Compile a regular expression with the given options. This -- function throws a 'ParseError' if the pattern is invalid. -- -- The 'Regex' is initialized with empty text to search against. regex :: [MatchOption] -> Text -> IO Regex regex opts pat = useAsUCharPtr pat $ \pptr plen -> newICUPtr Regex uregex_close (do ptr <- handleParseError isRegexError $ uregex_open pptr (fromIntegral plen) flags withUTextPtr hayfp $ \hayPtr -> handleError $ uregex_setUText ptr hayPtr when (workLimit > -1) . handleError $ uregex_setTimeLimit ptr (fromIntegral workLimit) when (stackLimit > -1) . handleError $ uregex_setStackLimit ptr (fromIntegral stackLimit) return ptr) <*> newIORef hayfp where (flags,workLimit,stackLimit) = toURegexpOpts opts hayfp = emptyUTextPtr data URegularExpression type URegexpFlag = Word32 toURegexpOpts :: [MatchOption] -> (URegexpFlag,Int,Int) toURegexpOpts = foldl go (0,-1,-1) where go (!flag,work,stack) opt = (flag+flag',work',stack') where flag' = case opt of CaseInsensitive -> #const UREGEX_CASE_INSENSITIVE Comments -> #const UREGEX_COMMENTS DotAll -> #const UREGEX_DOTALL Literal -> #const UREGEX_LITERAL Multiline -> #const UREGEX_MULTILINE HaskellLines -> #const UREGEX_UNIX_LINES UnicodeWord -> #const UREGEX_UWORD ErrorOnUnknownEscapes -> #const UREGEX_ERROR_ON_UNKNOWN_ESCAPES _ -> 0 work' = case opt of WorkLimit limit -> limit _ -> work stack' = case opt of StackLimit limit -> limit _ -> stack foreign import ccall unsafe "hs_text_icu.h __hs_uregex_open" uregex_open :: Ptr UChar -> Int32 -> Word32 -> Ptr UParseError -> Ptr UErrorCode -> IO (Ptr URegularExpression) foreign import ccall unsafe "hs_text_icu.h &__hs_uregex_close" uregex_close :: FunPtr (Ptr URegularExpression -> IO ()) foreign import ccall unsafe "hs_text_icu.h __hs_uregex_clone" uregex_clone :: Ptr URegularExpression -> Ptr UErrorCode -> IO (Ptr URegularExpression) foreign import ccall unsafe "hs_text_icu.h __hs_uregex_pattern" uregex_pattern :: Ptr URegularExpression -> Ptr Int32 -> Ptr UErrorCode -> IO (Ptr UChar) foreign import ccall unsafe "hs_text_icu.h __hs_uregex_setUText" uregex_setUText :: Ptr URegularExpression -> Ptr UText -> Ptr UErrorCode -> IO () foreign import ccall unsafe "hs_text_icu.h __hs_uregex_getText" uregex_getText :: Ptr URegularExpression -> Ptr Int32 -> Ptr UErrorCode -> IO (Ptr UChar) foreign import ccall unsafe "hs_text_icu.h __hs_uregex_find" uregex_find :: Ptr URegularExpression -> Int32 -> Ptr UErrorCode -> IO UBool foreign import ccall unsafe "hs_text_icu.h __hs_uregex_findNext" uregex_findNext :: Ptr URegularExpression -> Ptr UErrorCode -> IO UBool foreign import ccall unsafe "hs_text_icu.h __hs_uregex_start" uregex_start :: Ptr URegularExpression -> Int32 -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_uregex_end" uregex_end :: Ptr URegularExpression -> Int32 -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_uregex_groupCount" uregex_groupCount :: Ptr URegularExpression -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_uregex_group" uregex_group :: Ptr URegularExpression -> Int32 -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_uregex_setTimeLimit" uregex_setTimeLimit :: Ptr URegularExpression -> Int32 -> Ptr UErrorCode -> IO () foreign import ccall unsafe "hs_text_icu.h __hs_uregex_setStackLimit" uregex_setStackLimit :: Ptr URegularExpression -> Int32 -> Ptr UErrorCode -> IO () text-icu-0.8.0.4/Data/Text/ICU/Regex/Pure.hs0000644000000000000000000001627607346545000016366 0ustar0000000000000000{-# LANGUAGE BangPatterns, EmptyDataDecls, ScopedTypeVariables #-} -- | -- Module : Data.Text.ICU.Regex.Pure -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Regular expression support for Unicode, implemented as bindings to -- the International Components for Unicode (ICU) libraries. -- -- The functions in this module are pure and hence thread safe, but -- may not be as fast or as flexible as those in the -- "Data.Text.ICU.Regex" module. -- -- The syntax and behaviour of ICU regular expressions are Perl-like. -- For complete details, see the ICU User Guide entry at -- . module Data.Text.ICU.Regex.Pure ( -- * Types MatchOption(..) , ParseError(errError, errLine, errOffset) , Match , Regex , Regular -- * Functions -- ** Construction , regex , regex' -- ** Inspection , pattern -- ** Searching , find , findAll -- ** Match groups -- $group , groupCount , unfold , span , group , prefix , suffix ) where import qualified Control.Exception as E import Data.String (IsString(..)) import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Foreign as T import Data.Text.ICU.Internal (TextI, fromUCharPtr, lengthWord, withUTextPtrText, utextPtrLength) import Data.Text.ICU.Error.Internal (ParseError(..), handleError) import qualified Data.Text.ICU.Regex as IO import Data.Text.ICU.Regex.Internal hiding (Regex(..), regex) import qualified Data.Text.ICU.Regex.Internal as Internal import Foreign.ForeignPtr (ForeignPtr, withForeignPtr) import Foreign.Marshal.Alloc (alloca) import Foreign.Marshal.Array (advancePtr) import Foreign.Storable (peek) import Prelude hiding (span) import System.IO.Unsafe (unsafeInterleaveIO, unsafePerformIO) -- | A compiled regular expression. -- -- 'Regex' values are usually constructed using the 'regex' or -- 'regex'' functions. This type is also an instance of 'IsString', -- so if you have the @OverloadedStrings@ language extension enabled, -- you can construct a 'Regex' by simply writing the pattern in -- quotes (though this does not allow you to specify any 'Option's). newtype Regex = Regex { reRe :: Internal.Regex } instance Show Regex where show re = "Regex " ++ show (pattern re) instance IsString Regex where fromString = regex [] . T.pack -- | A match for a regular expression. data Match = Match { matchRe :: Internal.Regex , _matchPrev :: TextI } instance Show Match where show m = "Match " ++ show (unfold group m) -- | A typeclass for functions common to both 'Match' and 'Regex' -- types. class Regular r where regRe :: r -> Internal.Regex regFp :: r -> ForeignPtr URegularExpression regFp = Internal.reRe . regRe {-# INLINE regFp #-} instance Regular Match where regRe = matchRe instance Regular Regex where regRe = reRe -- | Compile a regular expression with the given options. This -- function throws a 'ParseError' if the pattern is invalid, so it is -- best for use when the pattern is statically known. regex :: [MatchOption] -> Text -> Regex regex opts pat = Regex . unsafePerformIO $ IO.regex opts pat -- | Compile a regular expression with the given options. This is -- safest to use when the pattern is constructed at run time. regex' :: [MatchOption] -> Text -> Either ParseError Regex regex' opts pat = unsafePerformIO $ ((Right . Regex) `fmap` Internal.regex opts pat) `E.catch` \(err::ParseError) -> return (Left err) -- | Return the source form of the pattern used to construct this -- regular expression or match. pattern :: Regular r => r -> Text pattern r = unsafePerformIO . withForeignPtr (regFp r) $ \rePtr -> alloca $ \lenPtr -> do textPtr <- handleError $ uregex_pattern rePtr lenPtr (fromUCharPtr textPtr . fromIntegral) =<< peek lenPtr -- | Find the first match for the regular expression in the given text. find :: Regex -> Text -> Maybe Match find re0 haystack = unsafePerformIO . matching re0 haystack $ \re -> do m <- IO.findNext re return $! if m then Just (Match re 0) else Nothing -- | Lazily find all matches for the regular expression in the given -- text. findAll :: Regex -> Text -> [Match] findAll re0 haystack = unsafePerformIO . unsafeInterleaveIO $ go 0 where len = fromIntegral . lengthWord $ haystack go !n | n >= len = return [] | otherwise = matching re0 haystack $ \re -> do found <- IO.find re n if found then do n' <- IO.end_ re 0 (Match re n:) `fmap` go n' else return [] matching :: Regex -> Text -> (IO.Regex -> IO a) -> IO a matching (Regex re0) haystack act = do re <- IO.clone re0 IO.setText re haystack act re -- $group -- -- Capturing groups are numbered starting from zero. Group zero is -- always the entire matching text. Groups greater than zero contain -- the text matching each capturing group in a regular expression. -- | Return the number of capturing groups in this regular -- expression or match's pattern. groupCount :: Regular r => r -> Int groupCount = unsafePerformIO . IO.groupCount . regRe {-# INLINE groupCount #-} -- | A combinator for returning a list of all capturing groups on a -- 'Match'. unfold :: (Int -> Match -> Maybe Text) -> Match -> [Text] unfold f m = go 0 where go !n = case f n m of Nothing -> [] Just z -> z : go (n+1) -- | Return the /n/th capturing group in a match, or 'Nothing' if /n/ -- is out of bounds. group :: Int -> Match -> Maybe Text group n m = grouping n m $ \re -> do let n' = fromIntegral n start <- fromIntegral `fmap` IO.start_ re n' end <- fromIntegral `fmap` IO.end_ re n' ut <- IO.getUTextPtr re withUTextPtrText ut $ \ptr -> T.fromPtr (ptr `advancePtr` fromIntegral start) (end - start) -- | Return the prefix of the /n/th capturing group in a match (the -- text from the start of the string to the start of the match), or -- 'Nothing' if /n/ is out of bounds. prefix :: Int -> Match -> Maybe Text prefix n m = grouping n m $ \re -> do start <- fromIntegral `fmap` IO.start_ re n ut <- IO.getUTextPtr re withUTextPtrText ut (`T.fromPtr` start) -- | Return the span of text between the end of the previous match and -- the beginning of the current match. span :: Match -> Text span (Match re p) = unsafePerformIO $ do start <- IO.start_ re 0 ut <- IO.getUTextPtr re withUTextPtrText ut $ \ptr -> T.fromPtr (ptr `advancePtr` fromIntegral p) (start - p) -- | Return the suffix of the /n/th capturing group in a match (the -- text from the end of the match to the end of the string), or -- 'Nothing' if /n/ is out of bounds. suffix :: Int -> Match -> Maybe Text suffix n m = grouping n m $ \re -> do end <- fromIntegral `fmap` IO.end_ re n ut <- IO.getUTextPtr re withUTextPtrText ut $ \ptr -> do T.fromPtr (ptr `advancePtr` fromIntegral end) (utextPtrLength ut - end) grouping :: Int -> Match -> (Internal.Regex -> IO a) -> Maybe a grouping n (Match m _) act = unsafePerformIO $ do count <- IO.groupCount m let n' = fromIntegral n if n' == 0 || (n' >= 0 && n' <= count) then Just `fmap` act m else return Nothing text-icu-0.8.0.4/Data/Text/ICU/Shape.hsc0000644000000000000000000001214207346545000015570 0ustar0000000000000000{-# LANGUAGE CPP, ForeignFunctionInterface #-} -- | -- Module : Data.Text.ICU.Shape -- Copyright : (c) 2018 Ondrej Palkovsky -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Bindings for arabic shaping, implemented as bindings to -- the International Components for Unicode (ICU) libraries. -- module Data.Text.ICU.Shape ( shapeArabic , ShapeOption(..) ) where #include import Data.List (foldl') import Data.Text.ICU.Error.Internal (UErrorCode, handleOverflowError) import Data.Bits ((.|.)) import Data.Int (Int32) import Foreign.Ptr (Ptr) import Data.Text.ICU.Internal (UChar, useAsUCharPtr, fromUCharPtr) import Data.Text (Text) import System.IO.Unsafe (unsafePerformIO) -- | Options for the 'shapeArabic' function. data ShapeOption = AggregateTaskheel -- ^ Tashkeel aggregation option: Replaces any combination of U+0651 with one of U+064C, U+064D, U+064E, U+064F, U+0650 with U+FC5E, U+FC5F, U+FC60, U+FC61, U+FC62 consecutively. | DigitTypeAnExtended -- ^ Digit type option: Use Eastern (Extended) Arabic-Indic digits (U+06f0...U+06f9). | DigitsAlen2AnInitAl -- ^ Replace European digits (U+0030...) by Arabic-Indic digits if the most recent strongly -- directional character is an Arabic letter (u_charDirection() result U_RIGHT_TO_LEFT_ARABIC [AL]). | DigitsAlen2AnInitLr -- ^ Digit shaping option: Replace European digits (U+0030...) by Arabic-Indic digits if the most recent strongly directional character is an Arabic letter (u_charDirection() result U_RIGHT_TO_LEFT_ARABIC [AL]). | DigitsAn2En -- ^ Digit shaping option: Replace Arabic-Indic digits by European digits (U+0030...). | DigitsEn2An -- ^ Digit shaping option: Replace European digits (U+0030...) by Arabic-Indic digits. | LengthFixedSpacesAtBeginning -- ^ If more room is necessary, then try to consume spaces at the beginning of the text. | LengthFixedSpacesAtEnd -- ^ If more room is necessary, then try to consume spaces at the end of the text. | LengthFixedSpacesNear -- ^ If more room is necessary, then try to consume spaces next to modified characters. | LettersShape -- ^ Letter shaping option: replace abstract letter characters by "shaped" ones. | LettersUnshape -- ^ Letter shaping option: replace "shaped" letter characters by abstract ones. | LettersShapeTashkeelIsolated -- ^ The only difference with LettersShape is that Tashkeel letters are always "shaped" into the isolated form instead of the medial form (selecting codepoints from the Arabic Presentation Forms-B block). | PreservePresentation -- ^ Presentation form option: Don't replace Arabic Presentation Forms-A and Arabic Presentation Forms-B characters with 0+06xx characters, before shaping. | TextDirectionVisualLTR -- ^ Direction indicator: the source is in visual LTR order, the leftmost displayed character stored first. deriving (Show) reduceShapeOpts :: [ShapeOption] -> Int32 reduceShapeOpts = foldl' orO 0 where a `orO` b = a .|. fromShapeOption b fromShapeOption :: ShapeOption -> Int32 fromShapeOption AggregateTaskheel = #const U_SHAPE_AGGREGATE_TASHKEEL fromShapeOption DigitTypeAnExtended = #const U_SHAPE_DIGIT_TYPE_AN_EXTENDED fromShapeOption DigitsAlen2AnInitAl = #const U_SHAPE_DIGITS_ALEN2AN_INIT_AL fromShapeOption DigitsAlen2AnInitLr = #const U_SHAPE_DIGITS_ALEN2AN_INIT_LR fromShapeOption DigitsAn2En = #const U_SHAPE_DIGITS_AN2EN fromShapeOption DigitsEn2An = #const U_SHAPE_DIGITS_EN2AN fromShapeOption LengthFixedSpacesAtBeginning = #const U_SHAPE_LENGTH_FIXED_SPACES_AT_BEGINNING fromShapeOption LengthFixedSpacesAtEnd = #const U_SHAPE_LENGTH_FIXED_SPACES_AT_END fromShapeOption LengthFixedSpacesNear = #const U_SHAPE_LENGTH_FIXED_SPACES_NEAR fromShapeOption LettersShape = #const U_SHAPE_LETTERS_SHAPE fromShapeOption LettersUnshape = #const U_SHAPE_LETTERS_UNSHAPE fromShapeOption LettersShapeTashkeelIsolated = #const U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED fromShapeOption PreservePresentation = #const U_SHAPE_PRESERVE_PRESENTATION fromShapeOption TextDirectionVisualLTR = #const U_SHAPE_TEXT_DIRECTION_VISUAL_LTR -- | Shape Arabic text on a character basis. -- -- Text-based shaping means that some character codepoints in the text are replaced by -- others depending on the context. It transforms one kind of text into another. -- In comparison, modern displays for Arabic text select appropriate, context-dependent font -- glyphs for each text element, which means that they transform text into a glyph vector. -- -- You probably want to call this with the LettersShape option in the default case. shapeArabic :: [ShapeOption] -> Text -> Text shapeArabic options t = unsafePerformIO . useAsUCharPtr t $ \sptr slen -> let slen' = fromIntegral slen options' = reduceShapeOpts options in handleOverflowError (fromIntegral slen) (\dptr dlen -> u_shapeArabic sptr slen' dptr (fromIntegral dlen) options') (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen)) foreign import ccall unsafe "hs_text_icu.h __hs_u_shapeArabic" u_shapeArabic :: Ptr UChar -> Int32 -> Ptr UChar -> Int32 -> Int32 -> Ptr UErrorCode -> IO Int32 text-icu-0.8.0.4/Data/Text/ICU/Spoof.hsc0000644000000000000000000004726307346545000015632 0ustar0000000000000000{-# LANGUAGE BangPatterns, CPP, DeriveDataTypeable, ForeignFunctionInterface, OverloadedStrings, RecordWildCards, ScopedTypeVariables #-} -- | -- Module : Data.Text.ICU.Spoof -- Copyright : (c) 2015 Ben Hamilton -- -- License : BSD-style -- Maintainer : bgertzfield@gmail.com -- Stability : experimental -- Portability : GHC -- -- String spoofing (confusability) checks for Unicode, implemented as -- bindings to the International Components for Unicode (ICU) uspoof -- library. -- -- See and -- for detailed information -- about the underlying algorithms and databases used by this module. module Data.Text.ICU.Spoof ( -- * Unicode spoof checking API -- $api -- * Types MSpoof , OpenFromSourceParseError(..) , SpoofCheck(..) , SpoofCheckResult(..) , RestrictionLevel(..) , SkeletonTypeOverride(..) -- * Functions , open , openFromSerialized , openFromSource , getSkeleton , getChecks , setChecks , getRestrictionLevel , setRestrictionLevel , getAllowedLocales , setAllowedLocales , areConfusable , spoofCheck , serialize ) where #include #include #include import Control.DeepSeq (NFData(..)) import Control.Exception (Exception, throwIO, catchJust) import Data.Bits ((.&.)) import Data.ByteString (ByteString) import Data.ByteString.Internal (create, memcpy, toForeignPtr) import Data.ByteString.Unsafe (unsafeUseAsCStringLen) import Data.Int (Int32) import Data.List (intercalate) import Data.Text (Text, pack, splitOn, strip, unpack) import Data.Text.Foreign (useAsPtr) import Data.Text.ICU.BitMask (ToBitMask, fromBitMask, highestValueInBitMask, toBitMask) import Data.Text.ICU.Spoof.Internal (MSpoof, USpoof, withSpoof, wrap, wrapWithSerialized) import Data.Text.ICU.Error (u_PARSE_ERROR) import Data.Text.ICU.Error.Internal (UErrorCode, UParseError, ParseError(..), handleError, handleOverflowError, handleParseError) #if MIN_VERSION_text(2,0,0) import Data.Text.Foreign (fromPtr) #else import Data.Text.ICU.Internal (UChar) import Data.Text.ICU.Internal (fromUCharPtr) #endif import Data.Typeable (Typeable) import Data.Word (Word8) import Foreign.C.String (CString, peekCString, withCString) import Foreign.Marshal.Utils (with) import Foreign.Ptr (Ptr, nullPtr, plusPtr) import Foreign.Storable (peek) import Foreign.ForeignPtr (withForeignPtr) -- $api -- -- The 'spoofCheck', 'areConfusable', and 'getSkeleton' functions analyze -- Unicode text for visually confusable (or \"spoof\") characters. -- -- For example, Latin, Cyrillic, and Greek all contain unique Unicode -- values which appear nearly identical on-screen: -- -- @ -- A 0041 LATIN CAPITAL LETTER A -- Α 0391 GREEK CAPITAL LETTER ALPHA -- А 0410 CYRILLIC CAPITAL LETTER A -- Ꭺ 13AA CHEROKEE LETTER GO -- ᴀ 1D00 LATIN LETTER SMALL CAPITAL A -- ᗅ 15C5 CANADIAN SYLLABICS CARRIER GHO -- A FF21 FULLWIDTH LATIN CAPITAL LETTER A -- 𐊠 102A0 CARIAN LETTER A -- 𝐀 1D400 MATHEMATICAL BOLD CAPITAL A -- @ -- -- and so on. To check a string for visually confusable characters: -- -- 1. 'open' an 'MSpoof' -- 2. optionally configure it with 'setChecks', 'setRestrictionLevel', -- and/or 'setAllowedLocales', then -- 3. 'spoofCheck' a single string, use 'areConfusable' to check if two -- strings could be confused for each other, or use 'getSkeleton' to precompute -- a \"skeleton\" string (similar to a hash code) which can be cached -- and re-used to quickly check (using Unicode string comparison) if -- two strings are confusable. -- -- By default, these methods will use ICU's bundled copy of -- -- and , -- which could be out of date. To provide your own confusables databases, use -- 'openFromSource'. (To avoid repeatedly parsing these databases, you -- can then 'serialize' your configured 'MSpoof' and later -- 'openFromSerialized' to load the pre-parsed databases.) data SpoofCheck -- | Makes 'areConfusable' report if both identifiers are both from the -- same script and are visually confusable. Does not affect 'spoofCheck'. = SingleScriptConfusable -- | Makes 'areConfusable' report if both identifiers are visually -- confusable and at least one identifier contains characters from more -- than one script. -- -- Makes 'spoofCheck' report if the identifier contains multiple scripts, -- and is confusable with some other identifier in a single script. | MixedScriptConfusable -- | Makes 'areConfusable' report if each identifier is of a different -- single script, and the identifiers are visually confusable. | WholeScriptConfusable -- | By default, spoof checks assume the strings have been processed -- through 'toCaseFold' and only check lower-case identifiers. If -- this is set, spoof checks will check both upper and lower case -- identifiers. | AnyCase -- | Checks that identifiers are no looser than the specified -- level passed to 'setRestrictionLevel'. | RestrictionLevel -- | Checks the identifier for the presence of invisible characters, -- such as zero-width spaces, or character sequences that are likely -- not to display, such as multiple occurrences of the same -- non-spacing mark. | Invisible -- | Checks whether the identifier contains only characters from a -- specified set (for example, via 'setAllowedLocales'). | CharLimit -- | Checks that the identifier contains numbers from only a -- single script. | MixedNumbers -- | Enables all checks. | AllChecks -- | Enables returning a 'RestrictionLevel' in the 'SpoofCheckResult'. | AuxInfo deriving (Bounded, Enum, Eq, Show) instance ToBitMask SpoofCheck where toBitMask SingleScriptConfusable = #const USPOOF_SINGLE_SCRIPT_CONFUSABLE toBitMask MixedScriptConfusable = #const USPOOF_MIXED_SCRIPT_CONFUSABLE toBitMask WholeScriptConfusable = #const USPOOF_WHOLE_SCRIPT_CONFUSABLE toBitMask AnyCase = #const USPOOF_ANY_CASE toBitMask RestrictionLevel = #const USPOOF_RESTRICTION_LEVEL toBitMask Invisible = #const USPOOF_INVISIBLE toBitMask CharLimit = #const USPOOF_CHAR_LIMIT toBitMask MixedNumbers = #const USPOOF_MIXED_NUMBERS toBitMask AllChecks = #const USPOOF_ALL_CHECKS toBitMask AuxInfo = #const USPOOF_AUX_INFO type USpoofCheck = Int32 data RestrictionLevel -- | Checks that the string contains only Unicode values in the range -- #0000#ߝ#007F# inclusive. = ASCII -- | Checks that the string contains only characters from a single script. | SingleScriptRestrictive -- | Checks that the string contains only characters from a single script, -- or from the combinations (Latin + Han + Hiragana + Katakana), -- (Latin + Han + Bopomofo), or (Latin + Han + Hangul). | HighlyRestrictive -- | Checks that the string contains only characters from the combinations -- (Latin + Cyrillic + Greek + Cherokee), (Latin + Han + Hiragana + Katakana), -- (Latin + Han + Bopomofo), or (Latin + Han + Hangul). | ModeratelyRestrictive -- | Allows arbitrary mixtures of scripts. | MinimallyRestrictive -- | Allows any valid identifiers, including characters outside of the -- Identifier Profile. | Unrestrictive deriving (Bounded, Enum, Eq, Show) instance ToBitMask RestrictionLevel where toBitMask ASCII = #const USPOOF_ASCII toBitMask SingleScriptRestrictive = #const USPOOF_SINGLE_SCRIPT_RESTRICTIVE toBitMask HighlyRestrictive = #const USPOOF_HIGHLY_RESTRICTIVE toBitMask ModeratelyRestrictive = #const USPOOF_MODERATELY_RESTRICTIVE toBitMask MinimallyRestrictive = #const USPOOF_MINIMALLY_RESTRICTIVE toBitMask Unrestrictive = #const USPOOF_UNRESTRICTIVE type URestrictionLevel = Int32 data SpoofCheckResult -- | The string passed all configured spoof checks. = CheckOK -- | The string failed one or more spoof checks. | CheckFailed [SpoofCheck] -- | The string failed one or more spoof checks, and -- failed to pass the configured restriction level. | CheckFailedWithRestrictionLevel { -- | The spoof checks which the string failed. failedChecks :: [SpoofCheck] -- | The restriction level which the string failed to pass. , failedLevel :: RestrictionLevel } deriving (Eq, Show) data SkeletonTypeOverride -- | By default, 'getSkeleton' builds skeletons which catch -- visually confusable characters across multiple scripts. -- Pass this flag to override that behavior and build skeletons -- which catch visually confusable characters across single scripts. = SkeletonSingleScript -- | By default, 'getSkeleton' assumes the input string has already -- been passed through 'toCaseFold' and is lower-case. Pass this -- flag to override that behavior and allow upper and lower-case strings. | SkeletonAnyCase deriving (Bounded, Enum, Eq, Show) instance ToBitMask SkeletonTypeOverride where toBitMask SkeletonSingleScript = #const USPOOF_SINGLE_SCRIPT_CONFUSABLE toBitMask SkeletonAnyCase = #const USPOOF_ANY_CASE type USkeletonTypeOverride = Int32 makeSpoofCheckResult :: USpoofCheck -> SpoofCheckResult makeSpoofCheckResult c = case c of 0 -> CheckOK _ -> case restrictionLevel of Nothing -> CheckFailed spoofChecks Just l -> CheckFailedWithRestrictionLevel spoofChecks l where spoofChecks = fromBitMask $ fromIntegral $ c .&. #const USPOOF_ALL_CHECKS restrictionValue = c .&. #const USPOOF_RESTRICTION_LEVEL_MASK restrictionLevel = highestValueInBitMask $ fromIntegral $ restrictionValue -- | Indicates which input file to 'openFromSource' failed to parse upon error. data OpenFromSourceParseErrorFile = ConfusablesTxtError | ConfusablesWholeScriptTxtError deriving (Eq, Show) instance NFData OpenFromSourceParseErrorFile where rnf !_ = () -- | Exception thrown with 'openFromSource' fails to parse one of the input files. data OpenFromSourceParseError = OpenFromSourceParseError { -- | The file which could not be parsed. errFile :: OpenFromSourceParseErrorFile -- | Parse error encountered opening a spoof checker from source. , parseError :: ParseError } deriving (Show, Typeable) instance NFData OpenFromSourceParseError where rnf OpenFromSourceParseError{..} = rnf parseError `seq` rnf errFile instance Exception OpenFromSourceParseError -- | Open a spoof checker for checking Unicode strings for lookalike -- security issues with default options (all 'SpoofCheck's except -- 'CharLimit'). open :: IO MSpoof open = wrap $ handleError uspoof_open isParseError :: ParseError -> Maybe ParseError isParseError = Just -- | Open a spoof checker with custom rules given the UTF-8 encoded -- contents of the @confusables.txt@ and @confusablesWholeScript.txt@ -- files as described in . openFromSource :: (ByteString, ByteString) -> IO MSpoof openFromSource (confusables, confusablesWholeScript) = unsafeUseAsCStringLen confusables $ \(cptr, clen) -> unsafeUseAsCStringLen confusablesWholeScript $ \(wptr, wlen) -> with 0 $ \errTypePtr -> catchJust isParseError (wrap $ handleParseError (== u_PARSE_ERROR) (uspoof_openFromSource cptr (fromIntegral clen) wptr (fromIntegral wlen) errTypePtr)) (throwOpenFromSourceParseError errTypePtr) throwOpenFromSourceParseError :: Ptr Int32 -> ParseError -> IO a throwOpenFromSourceParseError errTypePtr parseErr = do errType <- peek errTypePtr let errFile = if errType == #{const USPOOF_SINGLE_SCRIPT_CONFUSABLE} then ConfusablesTxtError -- N.B.: ICU as of 55.1 actually leaves errFile set to 0 in this case. else ConfusablesWholeScriptTxtError throwIO $! OpenFromSourceParseError errFile parseErr -- | Open a spoof checker previously serialized to bytes using 'serialize'. -- The returned 'MSpoof' will retain a reference to the 'ForeignPtr' inside -- the ByteString, so ensure its contents do not change for the lifetime -- of the lifetime of the returned value. openFromSerialized :: ByteString -> IO MSpoof openFromSerialized b = case toForeignPtr b of (ptr, off, len) -> withForeignPtr ptr $ \p -> wrapWithSerialized ptr $ handleError (uspoof_openFromSerialized (p `plusPtr` off) (fromIntegral len) nullPtr) -- | Get the checks performed by a spoof checker. getChecks :: MSpoof -> IO [SpoofCheck] getChecks s = withSpoof s $ \sptr -> (fromBitMask . fromIntegral . (.&.) #{const USPOOF_ALL_CHECKS}) <$> handleError (uspoof_getChecks sptr) -- | Configure the checks performed by a spoof checker. setChecks :: MSpoof -> [SpoofCheck] -> IO () setChecks s c = withSpoof s $ \sptr -> handleError $ uspoof_setChecks sptr . fromIntegral $ toBitMask c -- | Get the restriction level of a spoof checker. getRestrictionLevel :: MSpoof -> IO (Maybe RestrictionLevel) getRestrictionLevel s = withSpoof s $ \sptr -> (highestValueInBitMask . fromIntegral) <$> uspoof_getRestrictionLevel sptr -- | Configure the restriction level of a spoof checker. setRestrictionLevel :: MSpoof -> RestrictionLevel -> IO () setRestrictionLevel s l = withSpoof s $ \sptr -> uspoof_setRestrictionLevel sptr . fromIntegral $ toBitMask l -- | Get the list of locale names allowed to be used with a spoof checker. -- (We don't use 'LocaleName' since the root and default locales have no -- meaning here.) getAllowedLocales :: MSpoof -> IO [String] getAllowedLocales s = withSpoof s $ \sptr -> splitLocales <$> (peekCString =<< handleError (uspoof_getAllowedLocales sptr)) where splitLocales = fmap (unpack . strip) . splitOn "," . pack -- | Get the list of locale names allowed to be used with a spoof checker. -- (We don't use 'LocaleName' since the root and default locales have no -- meaning here.) setAllowedLocales :: MSpoof -> [String] -> IO () setAllowedLocales s locs = withSpoof s $ \sptr -> withCString (intercalate "," locs) $ \lptr -> handleError (uspoof_setAllowedLocales sptr lptr) -- | Check if two strings could be confused with each other. areConfusable :: MSpoof -> Text -> Text -> IO SpoofCheckResult areConfusable s t1 t2 = withSpoof s $ \sptr -> useAsPtr t1 $ \t1ptr t1len -> useAsPtr t2 $ \t2ptr t2len -> makeSpoofCheckResult <$> handleError ( #if MIN_VERSION_text(2,0,0) uspoof_areConfusableUTF8 #else uspoof_areConfusable #endif sptr t1ptr (fromIntegral t1len) t2ptr (fromIntegral t2len)) -- | Generates re-usable "skeleton" strings which can be used (via -- Unicode equality) to check if an identifier is confusable -- with some large set of existing identifiers. -- -- If you cache the returned strings in storage, you /must/ invalidate -- your cache any time the underlying confusables database changes -- (i.e., on ICU upgrade). -- -- By default, assumes all input strings have been passed through -- 'toCaseFold' and are lower-case. To change this, pass -- 'SkeletonAnyCase'. -- -- By default, builds skeletons which catch visually confusable -- characters across multiple scripts. Pass 'SkeletonSingleScript' to -- override that behavior and build skeletons which catch visually -- confusable characters across single scripts. getSkeleton :: MSpoof -> Maybe SkeletonTypeOverride -> Text -> IO Text getSkeleton s o t = withSpoof s $ \sptr -> useAsPtr t $ \tptr tlen -> handleOverflowError (fromIntegral tlen) (\dptr dlen -> getS sptr oflags tptr (fromIntegral tlen) dptr (fromIntegral dlen)) (\dptr dlen -> from dptr (fromIntegral dlen)) where oflags = maybe 0 (fromIntegral . toBitMask) o (getS, from) = #if MIN_VERSION_text(2,0,0) (uspoof_getSkeletonUTF8, fromPtr) #else (uspoof_getSkeleton, fromUCharPtr) #endif -- | Checks if a string could be confused with any other. spoofCheck :: MSpoof -> Text -> IO SpoofCheckResult spoofCheck s t = withSpoof s $ \sptr -> useAsPtr t $ \tptr tlen -> makeSpoofCheckResult <$> handleError ( #if MIN_VERSION_text(2,0,0) uspoof_checkUTF8 #else uspoof_check #endif sptr tptr (fromIntegral tlen) nullPtr) -- | Serializes the rules in this spoof checker to a byte array, -- suitable for re-use by 'openFromSerialized'. -- -- Only includes any data provided to 'openFromSource'. Does not -- include any other state or configuration. serialize :: MSpoof -> IO ByteString serialize s = withSpoof s $ \sptr -> handleOverflowError 0 (\dptr dlen -> (uspoof_serialize sptr dptr (fromIntegral dlen))) (\dptr dlen -> create (fromIntegral dlen) $ \bptr -> memcpy dptr bptr (fromIntegral dlen)) foreign import ccall unsafe "hs_text_icu.h __hs_uspoof_open" uspoof_open :: Ptr UErrorCode -> IO (Ptr USpoof) foreign import ccall unsafe "hs_text_icu.h __hs_uspoof_openFromSerialized" uspoof_openFromSerialized :: Ptr Word8 -> Int32 -> Ptr Int32 -> Ptr UErrorCode -> IO (Ptr USpoof) foreign import ccall unsafe "hs_text_icu.h __hs_uspoof_openFromSource" uspoof_openFromSource :: CString -> Int32 -> CString -> Int32 -> Ptr Int32 -> Ptr UParseError -> Ptr UErrorCode -> IO (Ptr USpoof) foreign import ccall unsafe "hs_text_icu.h __hs_uspoof_getChecks" uspoof_getChecks :: Ptr USpoof -> Ptr UErrorCode -> IO USpoofCheck foreign import ccall unsafe "hs_text_icu.h __hs_uspoof_setChecks" uspoof_setChecks :: Ptr USpoof -> USpoofCheck -> Ptr UErrorCode -> IO () foreign import ccall unsafe "hs_text_icu.h __hs_uspoof_getRestrictionLevel" uspoof_getRestrictionLevel :: Ptr USpoof -> IO URestrictionLevel foreign import ccall unsafe "hs_text_icu.h __hs_uspoof_setRestrictionLevel" uspoof_setRestrictionLevel :: Ptr USpoof -> URestrictionLevel -> IO () foreign import ccall unsafe "hs_text_icu.h __hs_uspoof_getAllowedLocales" uspoof_getAllowedLocales :: Ptr USpoof -> Ptr UErrorCode -> IO CString foreign import ccall unsafe "hs_text_icu.h __hs_uspoof_setAllowedLocales" uspoof_setAllowedLocales :: Ptr USpoof -> CString -> Ptr UErrorCode -> IO () #if MIN_VERSION_text(2,0,0) foreign import ccall unsafe "hs_text_icu.h __hs_uspoof_areConfusableUTF8" uspoof_areConfusableUTF8 :: Ptr USpoof -> Ptr Word8 -> Int32 -> Ptr Word8 -> Int32 -> Ptr UErrorCode -> IO USpoofCheck foreign import ccall unsafe "hs_text_icu.h __hs_uspoof_checkUTF8" uspoof_checkUTF8 :: Ptr USpoof -> Ptr Word8 -> Int32 -> Ptr Int32 -> Ptr UErrorCode -> IO USpoofCheck foreign import ccall unsafe "hs_text_icu.h __hs_uspoof_getSkeletonUTF8" uspoof_getSkeletonUTF8 :: Ptr USpoof -> USkeletonTypeOverride -> Ptr Word8 -> Int32 -> Ptr Word8 -> Int32 -> Ptr UErrorCode -> IO Int32 #else foreign import ccall unsafe "hs_text_icu.h __hs_uspoof_areConfusable" uspoof_areConfusable :: Ptr USpoof -> Ptr UChar -> Int32 -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO USpoofCheck foreign import ccall unsafe "hs_text_icu.h __hs_uspoof_check" uspoof_check :: Ptr USpoof -> Ptr UChar -> Int32 -> Ptr Int32 -> Ptr UErrorCode -> IO USpoofCheck foreign import ccall unsafe "hs_text_icu.h __hs_uspoof_getSkeleton" uspoof_getSkeleton :: Ptr USpoof -> USkeletonTypeOverride -> Ptr UChar -> Int32 -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO Int32 #endif foreign import ccall unsafe "hs_text_icu.h __hs_uspoof_serialize" uspoof_serialize :: Ptr USpoof -> Ptr Word8 -> Int32 -> Ptr UErrorCode -> IO Int32 text-icu-0.8.0.4/Data/Text/ICU/Spoof/0000755000000000000000000000000007346545000015117 5ustar0000000000000000text-icu-0.8.0.4/Data/Text/ICU/Spoof/Internal.hs0000644000000000000000000000414207346545000017230 0ustar0000000000000000{-# LANGUAGE DeriveDataTypeable, EmptyDataDecls, ForeignFunctionInterface #-} -- | -- Module : Data.Text.ICU.Spoof.Internal -- Copyright : (c) 2015 Ben Hamilton -- -- License : BSD-style -- Maintainer : bgertzfield@gmail.com -- Stability : experimental -- Portability : GHC -- -- Internals of the spoofability check infrastructure. module Data.Text.ICU.Spoof.Internal ( -- * Unicode spoof checking API -- $api -- * Types MSpoof(..) , Spoof(..) , USpoof -- * Functions , withSpoof , wrap , wrapWithSerialized ) where import Data.Typeable (Typeable) import Data.Word (Word8) import Foreign.ForeignPtr (ForeignPtr, withForeignPtr) import Foreign.Ptr (FunPtr, Ptr) import Data.Text.ICU.Internal (newICUPtr) -- $api -- Low-level operations on spoof checkers. -- | Opaque handle to a configurable spoof checker. data USpoof -- | Configurable spoof checker wrapping an opaque handle -- and optionally wrapping a previously serialized instance. data MSpoof = MSpoof { serializedBuf :: Maybe (ForeignPtr Word8) , spoofPtr :: {-# UNPACK #-} !(ForeignPtr USpoof) } deriving (Typeable) -- | Spoof checker type. newtype Spoof = S MSpoof deriving (Typeable) -- | Temporarily unwraps an 'MSpoof' to perform operations on its raw 'USpoof' -- handle. withSpoof :: MSpoof -> (Ptr USpoof -> IO a) -> IO a withSpoof (MSpoof _ spoof) = withForeignPtr spoof {-# INLINE withSpoof #-} -- | Wraps a raw 'USpoof' handle in an 'MSpoof', closing the handle when -- the last reference to the object is dropped. wrap :: IO (Ptr USpoof) -> IO MSpoof wrap = newICUPtr (MSpoof Nothing) uspoof_close {-# INLINE wrap #-} -- | Wraps a previously serialized spoof checker and raw 'USpoof' handle -- in an 'MSpoof', closing the handle and releasing the 'ForeignPtr' when -- the last reference to the object is dropped. wrapWithSerialized :: ForeignPtr Word8 -> IO (Ptr USpoof) -> IO MSpoof wrapWithSerialized s = newICUPtr (MSpoof $ Just s) uspoof_close {-# INLINE wrapWithSerialized #-} foreign import ccall unsafe "hs_text_icu.h &__hs_uspoof_close" uspoof_close :: FunPtr (Ptr USpoof -> IO ()) text-icu-0.8.0.4/Data/Text/ICU/Spoof/Pure.hs0000644000000000000000000001247407346545000016376 0ustar0000000000000000-- | -- Module : Data.Text.ICU.Spoof.Pure -- Copyright : (c) 2015 Ben Hamilton -- -- License : BSD-style -- Maintainer : bgertzfield@gmail.com -- Stability : experimental -- Portability : GHC -- -- Pure string spoof checking functions for Unicode, implemented as -- bindings to the International Components for Unicode (ICU) -- libraries. -- -- For the impure spoof checking API (which is richer, but less easy to -- use), see the "Data.Text.ICU.Spoof" module. module Data.Text.ICU.Spoof.Pure ( -- * Types Spoof , SpoofParams(..) , spoof , spoofWithParams , spoofFromSource , spoofFromSerialized -- * String spoof checks , areConfusable , getSkeleton , spoofCheck -- * Configuration , getAllowedLocales , getChecks , getRestrictionLevel -- * Persistence , serialize ) where import Data.ByteString (ByteString) import Data.Foldable (forM_) import Data.Text (Text) import Data.Text.ICU.Spoof.Internal (Spoof(..)) import System.IO.Unsafe (unsafePerformIO) import qualified Data.Text.ICU.Spoof as S data SpoofParams -- | Used to configure a 'Spoof' checker via 'spoofWithParams'. = SpoofParams { -- | Optional 'S.SpoofCheck's to perform on a string. By default, performs -- all checks except 'CharLimit'. spoofChecks :: Maybe [S.SpoofCheck] -- | Optional 'S.RestrictionLevel' to which characters in the string will -- be limited. By default, uses 'HighlyRestrictive'. , level :: Maybe S.RestrictionLevel -- | Optional locale(s) whose scripts will be used to limit the -- set of allowed characters in a string. If set, automatically -- enables the 'CharLimit' spoof check. , locales :: Maybe [String] } deriving (Show, Eq) applySpoofParams :: SpoofParams -> S.MSpoof -> S.MSpoof applySpoofParams (SpoofParams c lev loc) s = unsafePerformIO $ do forM_ c (S.setChecks s) forM_ lev (S.setRestrictionLevel s) forM_ loc (S.setAllowedLocales s) return s -- | Open an immutable 'Spoof' checker with default options (all -- 'S.SpoofCheck's except 'CharLimit'). spoof :: Spoof spoof = unsafePerformIO $ S `fmap` S.open {-# NOINLINE spoof #-} -- | Open an immutable 'Spoof' checker with specific 'SpoofParams' -- to control its behavior. spoofWithParams :: SpoofParams -> Spoof spoofWithParams p = unsafePerformIO $ do s <- S.open return (S $ applySpoofParams p s) -- | Open a immutable 'Spoof' checker with specific 'SpoofParams' -- to control its behavior and custom rules given the UTF-8 encoded -- contents of the @confusables.txt@ and @confusablesWholeScript.txt@ -- files as described in . spoofFromSource :: (ByteString, ByteString) -> SpoofParams -> Spoof spoofFromSource (confusables, confusablesWholeScript) p = unsafePerformIO $ do s <- S.openFromSource (confusables, confusablesWholeScript) return (S $ applySpoofParams p s) -- | Create an immutable spoof checker with specific 'SpoofParams' -- to control its behavior and custom rules previously returned -- by 'serialize'. spoofFromSerialized :: ByteString -> SpoofParams -> Spoof spoofFromSerialized b p = unsafePerformIO $ do s <- S.openFromSerialized b return (S $ applySpoofParams p s) -- | Check two strings for confusability. areConfusable :: Spoof -> Text -> Text -> S.SpoofCheckResult areConfusable (S s) t1 t2 = unsafePerformIO $ S.areConfusable s t1 t2 -- | Check a string for spoofing issues. spoofCheck :: Spoof -> Text -> S.SpoofCheckResult spoofCheck (S s) t = unsafePerformIO $ S.spoofCheck s t -- | Generates re-usable \"skeleton\" strings which can be used (via -- Unicode equality) to check if an identifier is confusable -- with some large set of existing identifiers. -- -- If you cache the returned strings in storage, you /must/ invalidate -- your cache any time the underlying confusables database changes -- (i.e., on ICU upgrade). -- -- By default, assumes all input strings have been passed through -- 'toCaseFold' and are lower-case. To change this, pass -- 'SkeletonAnyCase'. -- -- By default, builds skeletons which catch visually confusable -- characters across multiple scripts. Pass 'SkeletonSingleScript' to -- override that behavior and build skeletons which catch visually -- confusable characters across single scripts. getSkeleton :: Spoof -> Maybe S.SkeletonTypeOverride -> Text -> Text getSkeleton (S s) o t = unsafePerformIO $ S.getSkeleton s o t -- | Gets the restriction level currently configured in the spoof -- checker, if present. getRestrictionLevel :: Spoof -> Maybe S.RestrictionLevel getRestrictionLevel (S s) = unsafePerformIO $ S.getRestrictionLevel s -- | Gets the checks currently configured in the spoof checker. getChecks :: Spoof -> [S.SpoofCheck] getChecks (S s) = unsafePerformIO $ S.getChecks s -- | Gets the locales whose scripts are currently allowed by the spoof -- checker. (We don't use 'LocaleName' since the root and default -- locales have no meaning here.) getAllowedLocales :: Spoof -> [String] getAllowedLocales (S s) = unsafePerformIO $ S.getAllowedLocales s -- | Serializes the rules in this spoof checker to a byte array, -- suitable for re-use by 'spoofFromSerialized'. -- -- Only includes any data provided to 'openFromSource'. Does not -- include any other state or configuration. serialize :: Spoof -> ByteString serialize (S s) = unsafePerformIO $ S.serialize s {-# INLINE spoofCheck #-} text-icu-0.8.0.4/Data/Text/ICU/Text.hs0000644000000000000000000000600607346545000015313 0ustar0000000000000000{-# LANGUAGE ForeignFunctionInterface #-} -- | -- Module : Data.Text.ICU.Text -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Functions for manipulating Unicode text, implemented as bindings to -- the International Components for Unicode (ICU) libraries. module Data.Text.ICU.Text ( -- * Case conversion -- $case toCaseFold , toLower , toUpper ) where import Data.Int (Int32) import Data.Text (Text) import Data.Text.ICU.Error.Internal (UErrorCode, handleOverflowError) import Data.Text.ICU.Internal (LocaleName, UChar, withLocaleName, useAsUCharPtr, fromUCharPtr) import Data.Word (Word32) import Foreign.C.String (CString) import Foreign.Ptr (Ptr) import System.IO.Unsafe (unsafePerformIO) -- $case -- -- In some languages, case conversion is a locale- and -- context-dependent operation. The case conversion functions in this -- module are locale and context sensitive. -- | Case-fold the characters in a string. -- -- Case folding is locale independent and not context sensitive, but -- there is an option for treating the letter I specially for Turkic -- languages. The result may be longer or shorter than the original. toCaseFold :: Bool -- ^ Whether to include or exclude mappings for -- dotted and dotless I and i that are marked with -- 'I' in @CaseFolding.txt@. -> Text -> Text toCaseFold excludeI s = unsafePerformIO . useAsUCharPtr s $ \sptr slen -> do let opts = fromIntegral . fromEnum $ excludeI handleOverflowError (fromIntegral slen) (\dptr dlen -> u_strFoldCase dptr dlen sptr (fromIntegral slen) opts) (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen)) type CaseMapper = Ptr UChar -> Int32 -> Ptr UChar -> Int32 -> CString -> Ptr UErrorCode -> IO Int32 caseMap :: CaseMapper -> LocaleName -> Text -> Text caseMap mapFn loc s = unsafePerformIO . withLocaleName loc $ \locale -> useAsUCharPtr s $ \sptr slen -> handleOverflowError (fromIntegral slen) (\dptr dlen -> mapFn dptr dlen sptr (fromIntegral slen) locale) (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen)) -- | Lowercase the characters in a string. -- -- Casing is locale dependent and context sensitive. The result may -- be longer or shorter than the original. toLower :: LocaleName -> Text -> Text toLower = caseMap u_strToLower -- | Uppercase the characters in a string. -- -- Casing is locale dependent and context sensitive. The result may -- be longer or shorter than the original. toUpper :: LocaleName -> Text -> Text toUpper = caseMap u_strToUpper foreign import ccall unsafe "hs_text_icu.h __hs_u_strFoldCase" u_strFoldCase :: Ptr UChar -> Int32 -> Ptr UChar -> Int32 -> Word32 -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_u_strToLower" u_strToLower :: CaseMapper foreign import ccall unsafe "hs_text_icu.h __hs_u_strToUpper" u_strToUpper :: CaseMapper text-icu-0.8.0.4/Data/Text/ICU/Types.hs0000644000000000000000000000110207346545000015463 0ustar0000000000000000-- | -- Module : Data.Text.ICU.Types -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Types for use when manipulating Unicode text, using the bindings to -- the International Components for Unicode (ICU) libraries. module Data.Text.ICU.Types ( -- * Widely used types LocaleName(..) , ParseError(errError, errLine, errOffset) ) where import Data.Text.ICU.Error.Internal (ParseError(..)) import Data.Text.ICU.Internal (LocaleName(..)) text-icu-0.8.0.4/LICENSE0000644000000000000000000000245407346545000012626 0ustar0000000000000000Copyright (c) 2009, Bryan O'Sullivan All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. text-icu-0.8.0.4/README.markdown0000644000000000000000000000377107346545000014325 0ustar0000000000000000# Text-ICU: Comprehensive support for string manipulation This package provides the Data.Text.ICU library, for performing complex manipulation of Unicode text. It provides features such as the following: * Unicode normalization * Conversion to and from many common and obscure encodings * Date and number formatting * Comparison and collation ## Prerequisites This library is implemented as bindings to the well-respected [ICU library](https://icu.unicode.org/) (which is not bundled, and must be installed separately). ### macOS brew install icu4c brew link icu4c --force You might need: export PKG_CONFIG_PATH="$(brew --prefix)/opt/icu4c/lib/pkgconfig" ### Debian/Ubuntu sudo apt-get update sudo apt-get install libicu-dev ### Fedora/CentOS sudo dnf install unzip libicu-devel ### Nix/NixOS nix-shell --packages icu ### Windows/MSYS2 Under MSYS2, `ICU` can be installed via `pacman`. pacman --noconfirm -S mingw-w64-x86_64-icu Depending on the age of the MSYS2 installation, the keyring might need to be updated to avoid certification issues, and `pkg-config` might need to be added. In this case, do this first: pacman --noconfirm -Sy msys2-keyring pacman --noconfirm -S mingw-w64-x86_64-pkgconf ### Windows/stack With `stack` on Windows, which comes with its _own_ bundled MSYS2, the following commands give up-to-date system dependencies for `text-icu-0.8.0` (tested 2023-09-30): stack exec -- pacman --noconfirm -Sy msys2-keyring stack exec -- pacman --noconfirm -S mingw-w64-x86_64-pkgconf stack exec -- pacman --noconfirm -S mingw-w64-x86_64-icu ## Compatibility Upstream ICU occasionally introduces backwards-incompatible API breaks. This package tries to stay up to date with upstream, and is currently more or less in sync with ICU 72. Minimum required version is ICU 62. ## Get involved! Please report bugs via the [github issue tracker](https://github.com/haskell/text-icu/issues). ## Authors This library was written by Bryan O'Sullivan. text-icu-0.8.0.4/Setup.lhs0000644000000000000000000000011407346545000013420 0ustar0000000000000000#!/usr/bin/env runhaskell > import Distribution.Simple > main = defaultMain text-icu-0.8.0.4/benchmarks/0000755000000000000000000000000007346545000013731 5ustar0000000000000000text-icu-0.8.0.4/benchmarks/Breaker.hs0000644000000000000000000000207107346545000015640 0ustar0000000000000000-- Estimate the time difference between creating a breaker. {-# LANGUAGE OverloadedStrings #-} import Control.Monad import qualified Data.Text as T import Data.Text.IO as T import Data.Text.ICU.Break as IO import Data.Text.ICU as ICU import System.Environment consume b = go where go = do m <- next b case m of Nothing -> return () Just _ -> go manyBreakers (t:ts) = do b <- IO.breakWord "en" t consume b manyBreakers ts manyBreakers _ = return () oneBreaker ts = do b <- IO.breakWord "en" "" forM_ ts $ \t -> do setText b t consume b cloneBreakers ts = do b <- IO.breakWord "en" "" forM_ ts $ \t -> do b' <- clone b setText b' t consume b' pureBreaker ts = do let b = ICU.breakWord "en" forM_ ts $ \t -> length (breaks b t) `seq` return () main = do (kind:files) <- getArgs let act = case kind of "one" -> oneBreaker "many" -> manyBreakers "clone" -> cloneBreakers "pure" -> pureBreaker forM_ files $ \f -> T.readFile f >>= act . T.lines text-icu-0.8.0.4/cbits/0000755000000000000000000000000007346545000012720 5ustar0000000000000000text-icu-0.8.0.4/cbits/text_icu.c0000644000000000000000000007104107346545000014713 0ustar0000000000000000#include "hs_text_icu.h" #include "stdbool.h" UDateFormat *__hs_udat_open(UDateFormatStyle timeStyle, UDateFormatStyle dateStyle, const char *locale, const UChar *tzID, int32_t tzIDLength, const UChar *pattern, int32_t patternLength, UErrorCode *status) { return udat_open(timeStyle, dateStyle, locale, tzID, tzIDLength, pattern, patternLength, status); } void __hs_udat_close(UDateFormat *format) { udat_close(format); } UDateFormat *__hs_udat_clone(const UDateFormat *fmt, UErrorCode *status) { return udat_clone(fmt, status); } int32_t __hs_udat_formatCalendar(const UDateFormat *format, UCalendar *calendar, UChar *result, int32_t capacity, UFieldPosition *position, UErrorCode *status) { return udat_formatCalendar(format, calendar, result, capacity, position, status); } int32_t __hs_udat_getSymbols(const UDateFormat *fmt, UDateFormatSymbolType type, int32_t symbolIndex, UChar *result, int32_t resultLength, UErrorCode *status) { return udat_getSymbols(fmt, type, symbolIndex, result, resultLength, status); } int32_t __hs_udat_countSymbols(const UDateFormat *fmt, UDateFormatSymbolType type) { return udat_countSymbols(fmt, type); } UNumberFormatter *__hs_unumf_openForSkeletonAndLocale(const UChar *skeleton, int32_t skeletonLen, const char *locale, UErrorCode *ec) { return unumf_openForSkeletonAndLocale(skeleton, skeletonLen, locale, ec); } void __hs_unumf_close(UNumberFormatter *uformatter) { unumf_close(uformatter); } UFormattedNumber *__hs_unumf_openResult(UErrorCode *ec) { return unumf_openResult(ec); } void __hs_unumf_closeResult(UFormattedNumber *uresult) { unumf_closeResult(uresult); } void __hs_unumf_formatInt(const UNumberFormatter *uformatter, int64_t value, UFormattedNumber *uresult, UErrorCode *ec) { unumf_formatInt(uformatter, value, uresult, ec); } void __hs_unumf_formatDouble(const UNumberFormatter *uformatter, double value, UFormattedNumber *uresult, UErrorCode *ec) { unumf_formatDouble(uformatter, value, uresult, ec); } int32_t __hs_unumf_resultToString(const UFormattedNumber *uresult, UChar *buffer, int32_t bufferCapacity, UErrorCode *ec) { return unumf_resultToString(uresult, buffer, bufferCapacity, ec); } const char *__hs_uloc_getAvailable(int32_t n) { return uloc_getAvailable(n); } int32_t __hs_uloc_countAvailable(void) { return uloc_countAvailable(); } void __hs_uenum_close(UEnumeration *en) { uenum_close(en); } const UChar *__hs_uenum_unext(UEnumeration *en, int32_t *resultLength, UErrorCode *status) { return uenum_unext(en, resultLength, status); } UCalendar *__hs_ucal_open(const UChar *zoneID, int32_t len, const char *locale, UCalendarType type, UErrorCode *status) { return ucal_open(zoneID, len, locale, type, status); } UCalendar *__hs_ucal_clone(const UCalendar *cal, UErrorCode *status) { return ucal_clone(cal, status); } int32_t __hs_ucal_get(const UCalendar *cal, UCalendarDateFields field, UErrorCode *status) { return ucal_get(cal, field, status); } void __hs_ucal_set(UCalendar *cal, UCalendarDateFields field, int32_t value) { ucal_set(cal, field, value); } void __hs_ucal_setDate(UCalendar *cal, int32_t year, int32_t month, int32_t date, UErrorCode *status) { ucal_setDate(cal, year, month, date, status); } void __hs_ucal_setDateTime(UCalendar *cal, int32_t year, int32_t month, int32_t date, int32_t hr, int32_t min, int32_t sec, UErrorCode *status) { ucal_setDateTime(cal, year, month, date, hr, min, sec, status); } void __hs_ucal_add(UCalendar *cal, UCalendarDateFields field, int32_t value, UErrorCode *status) { ucal_add(cal, field, value, status); } void __hs_ucal_roll(UCalendar *cal, UCalendarDateFields field, int32_t value, UErrorCode *status) { ucal_roll(cal, field, value, status); } void __hs_ucal_close(UCalendar *cal) { ucal_close(cal); } UEnumeration *_hs__ucal_openTimeZoneIDEnumeration(USystemTimeZoneType zoneType, UErrorCode *ec) { return ucal_openTimeZoneIDEnumeration(zoneType, NULL, NULL, ec); } UEnumeration *__hs_ucal_openTimeZones(UErrorCode *ec) { return ucal_openTimeZones(ec); } void __hs_ucal_setTimeZone(UCalendar *cal, const UChar *zoneID, int32_t len, UErrorCode *status) { ucal_setTimeZone(cal, zoneID, len, status); } UBreakIterator *__hs_ubrk_open(UBreakIteratorType type, const char *locale, const UChar *text, int32_t textLength, UErrorCode *status) { return ubrk_open(type, locale, text, textLength, status); } void __hs_ubrk_close(UBreakIterator *bi) { ubrk_close(bi); } void __hs_ubrk_setUText(UBreakIterator* bi, UText *text, UErrorCode *status) { ubrk_setUText(bi, text, status); } UBreakIterator *__hs_ubrk_safeClone(const UBreakIterator *bi, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status) { return ubrk_safeClone(bi, stackBuffer, pBufferSize, status); } int32_t __hs_ubrk_current(const UBreakIterator *bi) { return ubrk_current(bi); } int32_t __hs_ubrk_first(UBreakIterator *bi) { return ubrk_first(bi); } int32_t __hs_ubrk_last(UBreakIterator *bi) { return ubrk_last(bi); } int32_t __hs_ubrk_next(UBreakIterator *bi) { return ubrk_next(bi); } int32_t __hs_ubrk_previous(UBreakIterator *bi) { return ubrk_previous(bi); } int32_t __hs_ubrk_preceding(UBreakIterator *bi, int32_t offset) { return ubrk_preceding(bi, offset); } int32_t __hs_ubrk_following(UBreakIterator *bi, int32_t offset) { return ubrk_following(bi, offset); } int32_t __hs_ubrk_getRuleStatus(UBreakIterator *bi) { return ubrk_getRuleStatus(bi); } int32_t __hs_ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status) { return ubrk_getRuleStatusVec(bi, fillInVec, capacity, status); } UBool __hs_ubrk_isBoundary(UBreakIterator *bi, int32_t offset) { return ubrk_isBoundary(bi, offset); } int32_t __hs_ubrk_countAvailable(void) { return ubrk_countAvailable(); } const char *__hs_ubrk_getAvailable(int32_t index) { return ubrk_getAvailable(index); } UCollator *__hs_ucol_open(const char *loc, UErrorCode *status) { return ucol_open(loc, status); } UCollator* __hs_ucol_openRules(const UChar *rules, int32_t rulesLength, UColAttributeValue normalizationMode, UCollationStrength strength, UParseError *parseError, UErrorCode *status) { return ucol_openRules(rules, rulesLength, normalizationMode, strength, parseError, status); } void __hs_ucol_close(UCollator *coll) { ucol_close(coll); } const UChar *__hs_ucol_getRules(const UCollator *coll, int32_t *length) { return ucol_getRules(coll, length); } void __hs_ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) { ucol_setAttribute(coll, attr, value, status); } UColAttributeValue __hs_ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) { return ucol_getAttribute(coll, attr, status); } UCollationResult __hs_ucol_strcoll(const UCollator *coll, const UChar *source, int32_t sourceLength, const UChar *target, int32_t targetLength) { return ucol_strcoll(coll, source, sourceLength, target, targetLength); } UCollationResult __hs_ucol_strcollUTF8( const UCollator *coll, const char *source, int32_t sourceLength, const char *target, int32_t targetLength, UErrorCode *status) { return ucol_strcollUTF8(coll, source, sourceLength, target, targetLength, status); } UCollationResult __hs_ucol_strcollIter(const UCollator *coll, UCharIterator *sIter, UCharIterator *tIter, UErrorCode *status) { return ucol_strcollIter(coll, sIter, tIter, status); } UCollator *__hs_ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status) { return ucol_safeClone(coll, stackBuffer, pBufferSize, status); } int32_t __hs_ucol_getSortKey(const UCollator *coll, const UChar *source, int32_t sourceLength, uint8_t *result, int32_t resultLength) { return ucol_getSortKey(coll, source, sourceLength, result, resultLength); } int __hs_ucnv_get_max_bytes_for_string(UConverter *cnv, int src_length) { return UCNV_GET_MAX_BYTES_FOR_STRING(src_length, ucnv_getMaxCharSize(cnv)); } const char *__hs_u_errorName(UErrorCode code) { return u_errorName(code); } UBiDi* __hs_ubidi_open() { return ubidi_open(); } UBiDi* __hs_ubidi_openSized(int32_t maxLength, int32_t maxRunCount, UErrorCode *err) { return ubidi_openSized(maxLength, maxRunCount, err); } void __hs_ubidi_close(UBiDi * bidi) { ubidi_close(bidi); } void __hs_ubidi_setPara(UBiDi *pBiDi, const UChar *text, int32_t length, UBiDiLevel paraLevel, UErrorCode *pErrorCode) { ubidi_setPara(pBiDi, text, length, paraLevel, NULL, pErrorCode); } void __hs_ubidi_setLine(const UBiDi *pParaBiDi, int32_t start, int32_t limit, UBiDi *pLineBiDi, UErrorCode *pErrorCode) { ubidi_setLine(pParaBiDi, start, limit, pLineBiDi, pErrorCode); } int32_t __hs_ubidi_countParagraphs(UBiDi *pBiDi) { return ubidi_countParagraphs(pBiDi); } void __hs_ubidi_getParagraphByIndex(const UBiDi *pBiDi, int32_t paraIndex, int32_t *pParaStart, int32_t *pParaLimit, UErrorCode *pErrorCode) { ubidi_getParagraphByIndex(pBiDi, paraIndex, pParaStart, pParaLimit, NULL, pErrorCode); } int32_t __hs_ubidi_getProcessedLength(const UBiDi *pBiDi) { return ubidi_getProcessedLength(pBiDi); } int32_t __hs_ubidi_writeReordered(UBiDi *pBiDi, UChar *dest, int32_t destSize, uint16_t options, UErrorCode *pErrorCode) { return ubidi_writeReordered(pBiDi, dest, destSize, options, pErrorCode); } const char *__hs_ucnv_getName(const UConverter *converter, UErrorCode *err) { return ucnv_getName(converter, err); } UConverter *__hs_ucnv_open(const char *converterName, UErrorCode *err) { return ucnv_open(converterName, err); } void __hs_ucnv_close(UConverter *converter) { ucnv_close(converter); } int32_t __hs_ucnv_toUChars(UConverter *cnv, UChar *dest, int32_t destCapacity, const char *src, int32_t srcLength, UErrorCode *pErrorCode) { return ucnv_toUChars(cnv, dest, destCapacity, src, srcLength, pErrorCode); } int32_t __hs_ucnv_toAlgorithmic_UTF8( UConverter *cnv, char *dest, int32_t destCapacity, const char *src, int32_t srcLength, UErrorCode *pErrorCode) { return ucnv_toAlgorithmic(UCNV_UTF8, cnv, dest, destCapacity, src, srcLength, pErrorCode); } int32_t __hs_ucnv_fromUChars(UConverter *cnv, char *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UErrorCode *pErrorCode) { return ucnv_fromUChars(cnv, dest, destCapacity, src, srcLength, pErrorCode); } int32_t __hs_ucnv_fromAlgorithmic_UTF8( UConverter *cnv, char *dest, int32_t destCapacity, const char *src, int32_t srcLength, UErrorCode *pErrorCode) { return ucnv_fromAlgorithmic(cnv, UCNV_UTF8, dest, destCapacity, src, srcLength, pErrorCode); } int __hs_ucnv_compareNames(const char *name1, const char *name2) { return ucnv_compareNames(name1, name2); } const char *__hs_ucnv_getDefaultName(void) { return ucnv_getDefaultName(); } void __hs_ucnv_setDefaultName(const char *name) { ucnv_setDefaultName(name); } int32_t __hs_ucnv_countAvailable(void) { return ucnv_countAvailable(); } const char *__hs_ucnv_getAvailableName(int32_t n) { return ucnv_getAvailableName(n); } uint16_t __hs_ucnv_countAliases(const char *alias, UErrorCode *pErrorCode) { return ucnv_countAliases(alias, pErrorCode); } const char *__hs_ucnv_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode) { return ucnv_getAlias(alias, n, pErrorCode); } uint16_t __hs_ucnv_countStandards(void) { return ucnv_countStandards(); } const char *__hs_ucnv_getStandard(uint16_t n, UErrorCode *pErrorCode) { return ucnv_getStandard(n, pErrorCode); } UBool __hs_ucnv_usesFallback(const UConverter *cnv) { return ucnv_usesFallback(cnv); } void __hs_ucnv_setFallback(UConverter *cnv, UBool usesFallback) { ucnv_setFallback(cnv, usesFallback); } UBool __hs_ucnv_isAmbiguous(const UConverter *cnv) { return ucnv_isAmbiguous(cnv); } void __hs_uiter_setString(UCharIterator *iter, const UChar *s, int32_t length) { uiter_setString(iter, s, length); } void __hs_uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) { uiter_setUTF8(iter, s, length); } const UNormalizer2 *__hs_unorm2_getNFCInstance(UErrorCode *pErrorCode) { return unorm2_getNFCInstance(pErrorCode); } const UNormalizer2 *__hs_unorm2_getNFDInstance(UErrorCode *pErrorCode) { return unorm2_getNFDInstance(pErrorCode); } const UNormalizer2 *__hs_unorm2_getNFKCInstance(UErrorCode *pErrorCode) { return unorm2_getNFKCInstance(pErrorCode); } const UNormalizer2 *__hs_unorm2_getNFKDInstance(UErrorCode *pErrorCode) { return unorm2_getNFKDInstance(pErrorCode); } const UNormalizer2 *__hs_unorm2_getNFKCCasefoldInstance(UErrorCode *pErrorCode) { return unorm2_getNFKCCasefoldInstance(pErrorCode); } int32_t __hs_unorm2_normalize(const UNormalizer2 *norm2, const UChar *src, int32_t length, UChar *dest, int32_t capacity, UErrorCode *pErrorCode) { return unorm2_normalize(norm2, src, length, dest, capacity, pErrorCode); } UBool __hs_unorm2_isNormalized(const UNormalizer2 *norm2, const UChar *s, int32_t length, UErrorCode *pErrorCode) { return unorm2_isNormalized(norm2, s, length, pErrorCode); } UNormalizationCheckResult __hs_unorm2_quickCheck(const UNormalizer2 *norm2, const UChar *s, int32_t length, UErrorCode *pErrorCode) { return unorm2_quickCheck(norm2, s, length, pErrorCode); } int32_t __hs_unorm_compare(const UChar *s1, int32_t length1, const UChar *s2, int32_t length2, uint32_t options, UErrorCode *pErrorCode) { return unorm_compare(s1, length1, s2, length2, options, pErrorCode); } UNormalizationCheckResult __hs_unorm_quickCheck(const UChar *source, int32_t sourcelength, UNormalizationMode mode, UErrorCode *status) { return unorm_quickCheck(source, sourcelength, mode, status); } UBool __hs_unorm_isNormalized(const UChar *src, int32_t srcLength, UNormalizationMode mode, UErrorCode *pErrorCode) { return unorm_isNormalized(src, srcLength, mode, pErrorCode); } int32_t __hs_unorm_normalize(const UChar *source, int32_t sourceLength, UNormalizationMode mode, int32_t options, UChar *result, int32_t resultLength, UErrorCode *status) { return unorm_normalize(source, sourceLength, mode, options, result, resultLength, status); } int32_t __hs_u_shapeArabic(const UChar *source, int32_t sourceLength, UChar *result, int32_t resultLength, int32_t options, UErrorCode *status) { return u_shapeArabic(source, sourceLength, result, resultLength, options, status); } int32_t __hs_u_strToUpper(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, const char *locale, UErrorCode *pErrorCode) { return u_strToUpper(dest, destCapacity, src, srcLength, locale, pErrorCode); } int32_t __hs_u_strToLower(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, const char *locale, UErrorCode *pErrorCode) { return u_strToLower(dest, destCapacity, src, srcLength, locale, pErrorCode); } int32_t __hs_u_strFoldCase(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, uint32_t options, UErrorCode *pErrorCode) { return u_strFoldCase(dest, destCapacity, src, srcLength, options, pErrorCode); } int32_t __hs_u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2) { return u_strCompareIter(iter1, iter2, true); } UChar* __hs_u_strFromUTF8Lenient( UChar *dest, int32_t destCapacity, int32_t *pDestLength, const char *src, int32_t srcLength, UErrorCode *pErrorCode) { return u_strFromUTF8Lenient(dest, destCapacity, pDestLength, src, srcLength, pErrorCode); } char* __hs_u_strToUTF8( char *dest, int32_t destCapacity, int32_t *pDestLength, const UChar *src, int32_t srcLength, UErrorCode *pErrorCode) { return u_strToUTF8(dest, destCapacity, pDestLength, src, srcLength, pErrorCode); } UBlockCode __hs_ublock_getCode(UChar32 c) { return ublock_getCode(c); } UCharDirection __hs_u_charDirection(UChar32 c) { return u_charDirection(c); } UBool __hs_u_isMirrored(UChar32 c) { return u_isMirrored(c); } UChar32 __hs_u_charMirror(UChar32 c) { return u_charMirror(c); } uint8_t __hs_u_getCombiningClass(UChar32 c) { return u_getCombiningClass(c); } int32_t __hs_u_charDigitValue(UChar32 c) { return u_charDigitValue(c); } int32_t __hs_u_charName(UChar32 code, UCharNameChoice nameChoice, char *buffer, int32_t bufferLength, UErrorCode *pErrorCode) { return u_charName(code, nameChoice, buffer, bufferLength, pErrorCode); } UChar32 __hs_u_charFromName(UCharNameChoice nameChoice, const char *name, UErrorCode *pErrorCode) { return u_charFromName(nameChoice, name, pErrorCode); } int32_t __hs_u_getIntPropertyValue(UChar32 c, UProperty which) { return u_getIntPropertyValue(c, which); } double __hs_u_getNumericValue(UChar32 c) { return u_getNumericValue(c); } URegularExpression *__hs_uregex_open(const UChar *pattern, int32_t patternLength, uint32_t flags, UParseError *pe, UErrorCode *status) { return uregex_open(pattern, patternLength, flags, pe, status); } void __hs_uregex_setTimeLimit(URegularExpression *regexp, int32_t limit, UErrorCode *status) { uregex_setTimeLimit(regexp, limit, status); } void __hs_uregex_setStackLimit(URegularExpression *regexp, int32_t limit, UErrorCode *status) { uregex_setStackLimit(regexp, limit, status); } void __hs_uregex_close(URegularExpression *regexp) { uregex_close(regexp); } URegularExpression *__hs_uregex_clone(const URegularExpression *regexp, UErrorCode *pErrorCode) { return uregex_clone(regexp, pErrorCode); } const UChar *__hs_uregex_pattern(const URegularExpression *regexp, int32_t *patLength, UErrorCode *status) { return uregex_pattern(regexp, patLength, status); } int32_t __hs_uregex_flags(const URegularExpression *regexp, UErrorCode *status) { return uregex_flags(regexp, status); } void __hs_uregex_setUText(URegularExpression *regexp, UText *text, UErrorCode *status) { uregex_setUText(regexp, text, status); } const UChar *__hs_uregex_getText(URegularExpression *regexp, int32_t *textLength, UErrorCode *status) { return uregex_getText(regexp, textLength, status); } UBool __hs_uregex_find(URegularExpression *regexp, int32_t startIndex, UErrorCode *status) { return uregex_find(regexp, startIndex, status); } UBool __hs_uregex_findNext(URegularExpression *regexp, UErrorCode *status) { return uregex_findNext(regexp, status); } int32_t __hs_uregex_start(URegularExpression *regexp, int32_t groupNum, UErrorCode *status) { return uregex_start(regexp, groupNum, status); } int32_t __hs_uregex_end(URegularExpression *regexp, int32_t groupNum, UErrorCode *status) { return uregex_end(regexp, groupNum, status); } int32_t __hs_uregex_groupCount(URegularExpression *regexp, UErrorCode *status) { return uregex_groupCount(regexp, status); } int32_t __hs_uregex_group(URegularExpression *regexp, int32_t groupNum, UChar *dest, int32_t destCapacity, UErrorCode *status) { return uregex_group(regexp, groupNum, dest, destCapacity, status); } USpoofChecker *__hs_uspoof_open(UErrorCode *status) { return uspoof_open(status); } USpoofChecker *__hs_uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength, UErrorCode *status) { return uspoof_openFromSerialized(data, length, pActualLength, status); } USpoofChecker *__hs_uspoof_openFromSource(const char *confusables, int32_t confusablesLen, const char *confusablesWholeScript, int32_t confusablesWholeScriptLen, int32_t *errType, UParseError *parseError, UErrorCode *status) { // Work around missing call to umtx_initOnce() in uspoof_openFromSource() // causing crash when gNfdNormalizer is accessed uspoof_getInclusionSet(status); return uspoof_openFromSource(confusables, confusablesLen, confusablesWholeScript, confusablesWholeScriptLen, errType, parseError, status); } void __hs_uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status) { uspoof_setChecks(sc, checks, status); } int32_t __hs_uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status) { return uspoof_getChecks(sc, status); } void __hs_uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel level) { uspoof_setRestrictionLevel(sc, level); } URestrictionLevel __hs_uspoof_getRestrictionLevel(const USpoofChecker *sc) { return uspoof_getRestrictionLevel(sc); } void __hs_uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status) { uspoof_setAllowedLocales(sc, localesList, status); } const char *__hs_uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status) { return uspoof_getAllowedLocales(sc, status); } int32_t __hs_uspoof_check(USpoofChecker *sc, const UChar *id, int32_t length, int32_t *position, UErrorCode *status) { return uspoof_check(sc, id, length, position, status); } int32_t __hs_uspoof_areConfusable(USpoofChecker *sc, const UChar *id1, int32_t length1, const UChar *id2, int32_t length2, UErrorCode *status) { return uspoof_areConfusable(sc, id1, length1, id2, length2, status); } int32_t __hs_uspoof_getSkeleton(USpoofChecker *sc, int32_t type, const UChar *id, int32_t length, UChar *dest, int32_t destCapacity, UErrorCode *status) { return uspoof_getSkeleton(sc, type, id, length, dest, destCapacity, status); } int32_t __hs_uspoof_checkUTF8( USpoofChecker *sc, const char *id, int32_t length, int32_t *position, UErrorCode *status) { return uspoof_checkUTF8(sc, id, length, position, status); } int32_t __hs_uspoof_areConfusableUTF8( USpoofChecker *sc, const char *id1, int32_t length1, const char *id2, int32_t length2, UErrorCode *status) { return uspoof_areConfusableUTF8(sc, id1, length1, id2, length2, status); } int32_t __hs_uspoof_getSkeletonUTF8( USpoofChecker *sc, int32_t type, const char *id, int32_t length, char *dest, int32_t destCapacity, UErrorCode *status) { return uspoof_getSkeletonUTF8(sc, type, id, length, dest, destCapacity, status); } int32_t __hs_uspoof_serialize(USpoofChecker *sc, void *data, int32_t capacity, UErrorCode *status) { return uspoof_serialize(sc, data, capacity, status); } void __hs_uspoof_close(USpoofChecker *sc) { uspoof_close(sc); } UText* __hs_utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode * status) { return utext_openUChars(ut, s, length, status); } UText* __hs_utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode * status) { return utext_openUTF8(ut, s, length, status); } void __hs_utext_close(UText *ut) { utext_close(ut); } UCharsetDetector *__hs_ucsdet_open(UErrorCode *status) { return ucsdet_open(status); } void __hs_ucsdet_close(UCharsetDetector *ucsd) { ucsdet_close(ucsd); } void __hs_ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t length, UErrorCode *status) { ucsdet_setText(ucsd, textIn, length, status); } void __hs_ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status) { ucsdet_setDeclaredEncoding(ucsd, encoding, length, status); } const UCharsetMatch *__hs_ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status) { return ucsdet_detect(ucsd, status); } const UCharsetMatch **__hs_ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status) { return ucsdet_detectAll(ucsd, matchesFound, status); } const char *__hs_ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status) { return ucsdet_getName(ucsm, status); } int32_t __hs_ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status) { return ucsdet_getConfidence(ucsm, status); } const char *__hs_ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status) { return ucsdet_getLanguage(ucsm, status); } int32_t __hs_ucsdet_getUChars(const UCharsetMatch *ucsm, UChar *buf, int32_t capacity, UErrorCode *status) { return ucsdet_getUChars(ucsm, buf, capacity, status); } UEnumeration *__hs_ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status) { return ucsdet_getAllDetectableCharsets(ucsd, status); } UBool __hs_ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd) { return ucsdet_isInputFilterEnabled(ucsd); } UBool __hs_ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter) { return ucsdet_enableInputFilter(ucsd, filter); } UEnumeration *__hs_ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status) { return ucsdet_getDetectableCharsets(ucsd, status); } void __hs_ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status) { ucsdet_setDetectableCharset(ucsd, encoding, enabled, status); } UNumberFormat *__hs_unum_open(UNumberFormatStyle style, const UChar *pattern, int32_t patternLength, const char *loc, UParseError *parseErr, UErrorCode *status) { if (patternLength == 0) return unum_open(style, 0, 0, loc, parseErr, status); else return unum_open(style, pattern, patternLength, loc, parseErr, status); } void __hs_unum_close(UNumberFormat *fmt) { unum_close(fmt); } int32_t __hs_unum_formatInt64(const UNumberFormat *fmt, int64_t value, UChar *result, int32_t resultLength, UErrorCode *ec) { return unum_formatInt64(fmt, value, result, resultLength, 0, ec); } int32_t __hs_unum_formatDouble(const UNumberFormat *fmt, double value, UChar *result, int32_t resultLength, UErrorCode *ec) { return unum_formatDouble(fmt, value, result, resultLength, 0, ec); } text-icu-0.8.0.4/changelog.md0000644000000000000000000000175707346545000014077 0ustar00000000000000000.8.0.4 * Fixed tests to work with ICU < 72 (#94) 0.8.0.3 * Support for ICU 72 (#94) 0.8.0.2 * Support for creating a collator from custom rules (#76) 0.8.0.1 * Restore build with GHC 7.10 - 8.8 (#61) * New CI for Linux, macOS and Windows (#63, #64, #66, #69) 0.8.0 * Support for text-2.0 (#57) * Support for ICU 69 and new features (#55) * Add lib/include dirs for newer homebrew (#54) * basic number formatting added (#46) * Declare pkg-config dependencies (#43) * Added support for arabic shaping and BiDi (#41) * Include icuio lib (#36) * Character Set Detection (#27) 0.7.1.0 * Add fix for undefined TRUE value in cbits (#52) * Improve CI and documentation (#20) Thanks to everyone who contributed! 0.7.0.0 * Built and tested against ICU 53. * The isoComment function has been deprecated, and will be removed in the next major release. * The Collator type is no longer an instance of Eq, as this functionality has been removed from ICU 53. * Many NFData instances have been added. text-icu-0.8.0.4/include/0000755000000000000000000000000007346545000013237 5ustar0000000000000000text-icu-0.8.0.4/include/hs_text_icu.h0000644000000000000000000004752107346545000015737 0ustar0000000000000000#ifdef WIN32 #define U_HAVE_INTTYPES_H 1 #endif #include "unicode/utypes.h" #include "unicode/ubidi.h" #include "unicode/ubrk.h" #include "unicode/ucal.h" #include "unicode/uchar.h" #include "unicode/unum.h" #include "unicode/ucnv.h" #include "unicode/ucol.h" #include "unicode/udat.h" #include "unicode/uenum.h" #include "unicode/uiter.h" #include "unicode/uloc.h" #include "unicode/unorm.h" #include "unicode/unumberformatter.h" #include "unicode/uregex.h" #include "unicode/ushape.h" #include "unicode/uspoof.h" #include "unicode/ustring.h" #include "unicode/utext.h" #include "unicode/ucsdet.h" #include /* udat.h */ UDateFormat *__hs_udat_open(UDateFormatStyle timeStyle, UDateFormatStyle dateStyle, const char *locale, const UChar *tzID, int32_t tzIDLength, const UChar *pattern, int32_t patternLength, UErrorCode *status); void __hs_udat_close(UDateFormat *format); UDateFormat *__hs_udat_clone(const UDateFormat *fmt, UErrorCode *status); int32_t __hs_udat_formatCalendar(const UDateFormat *format, UCalendar *calendar, UChar *result, int32_t capacity, UFieldPosition *position, UErrorCode *status); int32_t __hs_udat_getSymbols(const UDateFormat *fmt, UDateFormatSymbolType type, int32_t symbolIndex, UChar *result, int32_t resultLength, UErrorCode *status); int32_t __hs_udat_countSymbols(const UDateFormat *fmt, UDateFormatSymbolType type); /* unumberformatter.h */ UNumberFormatter *__hs_unumf_openForSkeletonAndLocale(const UChar *skeleton, int32_t skeletonLen, const char *locale, UErrorCode *ec); void __hs_unumf_close(UNumberFormatter *uformatter); UFormattedNumber *__hs_unumf_openResult(UErrorCode *ec); void __hs_unumf_closeResult(UFormattedNumber *uresult); void __hs_unumf_formatInt(const UNumberFormatter *uformatter, int64_t value, UFormattedNumber *uresult, UErrorCode *ec); void __hs_unumf_formatDouble(const UNumberFormatter *uformatter, double value, UFormattedNumber *uresult, UErrorCode *ec); int32_t __hs_unumf_resultToString(const UFormattedNumber *uresult, UChar *buffer, int32_t bufferCapacity, UErrorCode *ec); /* uenum.h */ void __hs_uenum_close(UEnumeration *en); const UChar *__hs_uenum_unext(UEnumeration *en, int32_t *resultLength, UErrorCode *status); /* uloc.h */ const char *__hs_uloc_getAvailable(int32_t n); int32_t __hs_uloc_countAvailable(void); /* ucal.h */ UCalendar *__hs_ucal_open(const UChar *zoneID, int32_t len, const char *locale, UCalendarType type, UErrorCode *status); UCalendar *__hs_ucal_clone(const UCalendar *cal, UErrorCode *status); int32_t __hs_ucal_get(const UCalendar *cal, UCalendarDateFields field, UErrorCode *status); void __hs_ucal_set(UCalendar *cal, UCalendarDateFields field, int32_t value); void __hs_ucal_setDate(UCalendar *cal, int32_t year, int32_t month, int32_t date, UErrorCode *status); void __hs_ucal_setDateTime(UCalendar *cal, int32_t year, int32_t month, int32_t date, int32_t hr, int32_t min, int32_t sec, UErrorCode *status); void __hs_ucal_add(UCalendar *cal, UCalendarDateFields field, int32_t value, UErrorCode *status); void __hs_ucal_roll(UCalendar *cal, UCalendarDateFields field, int32_t value, UErrorCode *status); UEnumeration *__hs_ucal_openTimeZones(UErrorCode *ec); UEnumeration *__hs_ucal_openTimeZoneIDEnumeration(USystemTimeZoneType zoneType, UErrorCode *ec); void __hs_ucal_setTimeZone(UCalendar *cal, const UChar *zoneID, int32_t len, UErrorCode *status); /* ubrk.h */ UBreakIterator *__hs_ubrk_open(UBreakIteratorType type, const char *locale, const UChar *text, int32_t textLength, UErrorCode *status); void __hs_ubrk_close(UBreakIterator *bi); void __hs_ubrk_setUText(UBreakIterator* bi, UText *text, UErrorCode *status); UBreakIterator *__hs_ubrk_safeClone(const UBreakIterator *bi, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); int32_t __hs_ubrk_current(const UBreakIterator *bi); int32_t __hs_ubrk_first(UBreakIterator *bi); int32_t __hs_ubrk_last(UBreakIterator *bi); int32_t __hs_ubrk_next(UBreakIterator *bi); int32_t __hs_ubrk_previous(UBreakIterator *bi); int32_t __hs_ubrk_preceding(UBreakIterator *bi, int32_t offset); int32_t __hs_ubrk_following(UBreakIterator *bi, int32_t offset); int32_t __hs_ubrk_getRuleStatus(UBreakIterator *bi); int32_t __hs_ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status); UBool __hs_ubrk_isBoundary(UBreakIterator *bi, int32_t offset); int32_t __hs_ubrk_countAvailable(void); const char *__hs_ubrk_getAvailable(int32_t index); /* uchar.h */ UBlockCode __hs_ublock_getCode(UChar32 c); UCharDirection __hs_u_charDirection(UChar32 c); UBool __hs_u_isMirrored(UChar32 c); UChar32 __hs_u_charMirror(UChar32 c); uint8_t __hs_u_getCombiningClass(UChar32 c); int32_t __hs_u_charDigitValue(UChar32 c); int32_t __hs_u_charName(UChar32 code, UCharNameChoice nameChoice, char *buffer, int32_t bufferLength, UErrorCode *pErrorCode); UChar32 __hs_u_charFromName(UCharNameChoice nameChoice, const char *name, UErrorCode *pErrorCode); int32_t __hs_u_getIntPropertyValue(UChar32 c, UProperty which); double __hs_u_getNumericValue(UChar32 c); /* ucol.h */ UCollator *__hs_ucol_open(const char *loc, UErrorCode *status); UCollator* __hs_ucol_openRules(const UChar *rules, int32_t rulesLength, UColAttributeValue normalizationMode, UCollationStrength strength, UParseError *parseError, UErrorCode *status); void __hs_ucol_close(UCollator *coll); const UChar *__hs_ucol_getRules(const UCollator *coll, int32_t *length); void __hs_ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status); UColAttributeValue __hs_ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status); UCollationResult __hs_ucol_strcoll(const UCollator *coll, const UChar *source, int32_t sourceLength, const UChar *target, int32_t targetLength); UCollationResult __hs_ucol_strcollUTF8( const UCollator *coll, const char *source, int32_t sourceLength, const char *target, int32_t targetLength, UErrorCode *status); UCollationResult __hs_ucol_strcollIter(const UCollator *coll, UCharIterator *sIter, UCharIterator *tIter, UErrorCode *status); UCollator *__hs_ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); int32_t __hs_ucol_getSortKey(const UCollator *coll, const UChar *source, int32_t sourceLength, uint8_t *result, int32_t resultLength); /* ubidi.h */ UBiDi* __hs_ubidi_open(); UBiDi* __hs_ubidi_openSized(int32_t maxLength, int32_t maxRunCount, UErrorCode *err); void __hs_ubidi_setPara(UBiDi *pBiDi, const UChar *text, int32_t length, UBiDiLevel paraLevel, UErrorCode *pErrorCode); void __hs_ubidi_close(UBiDi * converter); int32_t __hs_ubidi_countParagraphs(UBiDi *pBiDi); void __hs_ubidi_getParagraphByIndex(const UBiDi *pBiDi, int32_t paraIndex, int32_t *pParaStart, int32_t *pParaLimit, UErrorCode *pErrorCode); int32_t __hs_ubidi_getProcessedLength(const UBiDi *pBiDi); int32_t __hs_ubidi_writeReordered(UBiDi *pBiDi, UChar *dest, int32_t destSize, uint16_t options, UErrorCode *pErrorCode); void __hs_ubidi_setLine(const UBiDi *pParaBiDi, int32_t start, int32_t limit, UBiDi *pLineBiDi, UErrorCode *pErrorCode); /* ucnv.h */ int __hs_ucnv_get_max_bytes_for_string(UConverter *cnv, int src_length); const char *__hs_u_errorName(UErrorCode code); const char *__hs_ucnv_getName(const UConverter *converter, UErrorCode *err); UConverter *__hs_ucnv_open(const char *converterName, UErrorCode *err); void __hs_ucnv_close(UConverter *converter); int32_t __hs_ucnv_toUChars(UConverter *cnv, UChar *dest, int32_t destCapacity, const char *src, int32_t srcLength, UErrorCode *pErrorCode); int32_t __hs_ucnv_fromUChars(UConverter *cnv, char *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UErrorCode *pErrorCode); int32_t __hs_ucnv_toAlgorithmic_UTF8( UConverter *cnv, char *dest, int32_t destCapacity, const char *src, int32_t srcLength, UErrorCode *pErrorCode); int32_t __hs_ucnv_fromAlgorithmic_UTF8( UConverter *cnv, char *dest, int32_t destCapacity, const char *src, int32_t srcLength, UErrorCode *pErrorCode); int __hs_ucnv_compareNames(const char *name1, const char *name2); const char *__hs_ucnv_getDefaultName(void); void __hs_ucnv_setDefaultName(const char *name); int32_t __hs_ucnv_countAvailable(void); const char *__hs_ucnv_getAvailableName(int32_t n); uint16_t __hs_ucnv_countAliases(const char *alias, UErrorCode *pErrorCode); const char *__hs_ucnv_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode); uint16_t __hs_ucnv_countStandards(void); const char *__hs_ucnv_getStandard(uint16_t n, UErrorCode *pErrorCode); UBool __hs_ucnv_usesFallback(const UConverter *cnv); void __hs_ucnv_setFallback(UConverter *cnv, UBool usesFallback); UBool __hs_ucnv_isAmbiguous(const UConverter *cnv); /* uiter.h */ void __hs_uiter_setString(UCharIterator *iter, const UChar *s, int32_t length); void __hs_uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length); /* unorm2.h */ const UNormalizer2 *__hs_unorm2_getNFCInstance(UErrorCode *pErrorCode); const UNormalizer2 *__hs_unorm2_getNFDInstance(UErrorCode *pErrorCode); const UNormalizer2 *__hs_unorm2_getNFKCInstance(UErrorCode *pErrorCode); const UNormalizer2 *__hs_unorm2_getNFKDInstance(UErrorCode *pErrorCode); const UNormalizer2 *__hs_unorm2_getNFKCCasefoldInstance(UErrorCode *pErrorCode); int32_t __hs_unorm2_normalize(const UNormalizer2 *norm2, const UChar *src, int32_t length, UChar *dest, int32_t capacity, UErrorCode *pErrorCode); UBool __hs_unorm2_isNormalized(const UNormalizer2 *norm2, const UChar *s, int32_t length, UErrorCode *pErrorCode); UNormalizationCheckResult __hs_unorm2_quickCheck(const UNormalizer2 *norm2, const UChar *s, int32_t length, UErrorCode *pErrorCode); /* unorm.h DEPRECATED */ int32_t __hs_unorm_compare(const UChar *s1, int32_t length1, const UChar *s2, int32_t length2, uint32_t options, UErrorCode *pErrorCode); UNormalizationCheckResult __hs_unorm_quickCheck(const UChar *source, int32_t sourcelength, UNormalizationMode mode, UErrorCode *status); UBool __hs_unorm_isNormalized(const UChar *src, int32_t srcLength, UNormalizationMode mode, UErrorCode *pErrorCode); int32_t __hs_unorm_normalize(const UChar *source, int32_t sourceLength, UNormalizationMode mode, int32_t options, UChar *result, int32_t resultLength, UErrorCode *status); /* uregex.h */ URegularExpression *__hs_uregex_open(const UChar *pattern, int32_t patternLength, uint32_t flags, UParseError *pe, UErrorCode *status); void __hs_uregex_setTimeLimit(URegularExpression *regexp, int32_t limit, UErrorCode *status); void __hs_uregex_setStackLimit(URegularExpression *regexp, int32_t limit, UErrorCode *status); void __hs_uregex_close(URegularExpression *regexp); URegularExpression *__hs_uregex_clone(const URegularExpression *regexp, UErrorCode *pErrorCode); const UChar *__hs_uregex_pattern(const URegularExpression *regexp, int32_t *patLength, UErrorCode *status); int32_t __hs_uregex_flags(const URegularExpression *regexp, UErrorCode *status); void __hs_uregex_setUText(URegularExpression *regexp, UText *text, UErrorCode *status); const UChar *__hs_uregex_getText(URegularExpression *regexp, int32_t *textLength, UErrorCode *status); UBool __hs_uregex_find(URegularExpression *regexp, int32_t startIndex, UErrorCode *status); UBool __hs_uregex_findNext(URegularExpression *regexp, UErrorCode *status); int32_t __hs_uregex_start(URegularExpression *regexp, int32_t groupNum, UErrorCode *status); int32_t __hs_uregex_end(URegularExpression *regexp, int32_t groupNum, UErrorCode *status); int32_t __hs_uregex_groupCount(URegularExpression *regexp, UErrorCode *status); int32_t __hs_uregex_group(URegularExpression *regexp, int32_t groupNum, UChar *dest, int32_t destCapacity, UErrorCode *status); /* ushape.h */ int32_t __hs_u_shapeArabic(const UChar *source, int32_t sourceLength, UChar *result, int32_t resultLength, int32_t options, UErrorCode *status); /* ustring.h */ int32_t __hs_u_strFoldCase(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, uint32_t options, UErrorCode *pErrorCode); int32_t __hs_u_strToUpper(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, const char *locale, UErrorCode *pErrorCode); int32_t __hs_u_strToLower(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, const char *locale, UErrorCode *pErrorCode); int32_t __hs_u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2); UChar* __hs_u_strFromUTF8Lenient( UChar *dest, int32_t destCapacity, int32_t *pDestLength, const char *src, int32_t srcLength, UErrorCode *pErrorCode); char* __hs_u_strToUTF8( char *dest, int32_t destCapacity, int32_t *pDestLength, const UChar *src, int32_t srcLength, UErrorCode *pErrorCode); /* uspoof.h */ USpoofChecker *__hs_uspoof_open(UErrorCode *status); USpoofChecker *__hs_uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength, UErrorCode *status); USpoofChecker *__hs_uspoof_openFromSource(const char *confusables, int32_t confusablesLen, const char *confusablesWholeScript, int32_t confusablesWholeScriptLen, int32_t *errType, UParseError *parseError, UErrorCode *status); void __hs_uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status); int32_t __hs_uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status); // Yes, these really don't take UErrorCode *.. void __hs_uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel); URestrictionLevel __hs_uspoof_getRestrictionLevel(const USpoofChecker *sc); void __hs_uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status); const char *__hs_uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status); int32_t __hs_uspoof_check(USpoofChecker *sc, const UChar *id, int32_t length, int32_t *position, UErrorCode *status); int32_t __hs_uspoof_areConfusable(USpoofChecker *sc, const UChar *id1, int32_t length1, const UChar *id2, int32_t length2, UErrorCode *status); int32_t __hs_uspoof_getSkeleton(USpoofChecker *sc, int32_t checks, const UChar *id, int32_t length, UChar *dest, int32_t destCapacity, UErrorCode *status); int32_t __hs_uspoof_checkUTF8( USpoofChecker *sc, const char *id, int32_t length, int32_t *position, UErrorCode *status); int32_t __hs_uspoof_areConfusableUTF8( USpoofChecker *sc, const char *id1, int32_t length1, const char *id2, int32_t length2, UErrorCode *status); int32_t __hs_uspoof_getSkeletonUTF8( USpoofChecker *sc, int32_t checks, const char *id, int32_t length, char *dest, int32_t destCapacity, UErrorCode *status); int32_t __hs_uspoof_serialize(USpoofChecker *sc, void *data, int32_t capacity, UErrorCode *status); void __hs_uspoof_close(USpoofChecker *sc); /* utext.t */ UText* __hs_utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode * status); UText* __hs_utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode * status); void __hs_utext_close(UText *ut); /* ucsdet.h */ UCharsetDetector *__hs_ucsdet_open(UErrorCode *status); void __hs_ucsdet_close(UCharsetDetector *ucsd); void __hs_ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t length, UErrorCode *status); void __hs_ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status); const UCharsetMatch *__hs_ucsdet_detect (UCharsetDetector *ucsd, UErrorCode *status); const UCharsetMatch **__hs_ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status); const char *__hs_ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status); int32_t __hs_ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status); const char *__hs_ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status); int32_t __hs_ucsdet_getUChars(const UCharsetMatch *ucsm, UChar *buf, int32_t capacity, UErrorCode *status); UEnumeration *__hs_ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); UBool __hs_ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd); UBool __hs_ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter); UEnumeration *__hs_ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); void __hs_ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status); /* unum.h */ UNumberFormat *__hs_unum_open(UNumberFormatStyle style, const UChar *pattern, int32_t patternLength, const char *loc, UParseError *parseErr, UErrorCode *status); void __hs_unum_close(UNumberFormat *fmt); int32_t __hs_unum_formatInt64(const UNumberFormat *fmt, int64_t value, UChar *result, int32_t resultLength, UErrorCode *ec); int32_t __hs_unum_formatDouble(const UNumberFormat *fmt, double value, UChar *result, int32_t resultLength, UErrorCode *ec); text-icu-0.8.0.4/tests/0000755000000000000000000000000007346545000012756 5ustar0000000000000000text-icu-0.8.0.4/tests/Properties.hs0000644000000000000000000002307607346545000015456 0ustar0000000000000000-- Tester beware! -- -- Many of the tests below are "weak", i.e. they ensure that functions -- return results, without checking whether the results are correct. -- Weak tests are described as such. {-# LANGUAGE CPP #-} {-# LANGUAGE OverloadedStrings, LambdaCase #-} {-# OPTIONS_GHC -fno-warn-missing-signatures #-} module Properties (propertyTests, testCases) where import Control.Monad (unless) import qualified Control.Exception as E import Control.DeepSeq (NFData(..)) import Data.Function (on) import Data.Maybe (fromMaybe) import Data.Text (Text) import Data.Text.ICU (LocaleName(..), ParseError(..)) import QuickCheckUtils (NonEmptyText(..), LatinSpoofableText(..), NonSpoofableText(..), Utf8Text(..)) import Data.Text.ICU.Normalize2 (NormalizationMode(..)) import qualified Data.Text.ICU.Normalize2 as I import Test.Framework (Test, testGroup) import Test.Framework.Providers.QuickCheck2 (testProperty) import Test.Framework.Providers.HUnit (hUnitTestToTests) import Test.HUnit ((~?=), (@?=), (~:)) import qualified Test.HUnit (Test(..), assertFailure) import Test.HUnit.Lang (HUnitFailure (..), FailureReason (..)) import Test.QuickCheck.Monadic (monadicIO, run, assert) import qualified Data.Text as T import qualified Data.Text.Encoding as T import qualified Data.Text.ICU as I import qualified Data.Text.ICU.BiDi as BiDi import qualified Data.Text.ICU.Calendar as Cal import qualified Data.Text.ICU.Convert as I import qualified Data.Text.ICU.Char as I import qualified Data.Text.ICU.CharsetDetection as CD import qualified Data.Text.ICU.Error as Err import qualified Data.Text.ICU.Number as N import qualified Data.Text.ICU.Shape as S import System.IO.Unsafe (unsafePerformIO) #if !MIN_VERSION_base(4,11,0) import Data.Semigroup ((<>)) #endif {-# ANN module ("HLint: use camelCase"::String) #-} t_rnf :: (NFData b) => (a -> b) -> a -> Bool t_rnf f t = rnf (f t) == () t_nonEmpty :: (Text -> Text) -> Text -> Bool t_nonEmpty f t | T.null t = T.null ft | otherwise = T.length ft > 0 where ft = f t -- Case mapping -- These tests are all fairly weak. t_toCaseFold bool = t_nonEmpty $ I.toCaseFold bool t_toLower locale = t_nonEmpty $ I.toLower locale t_toUpper locale = t_nonEmpty $ I.toUpper locale -- Iteration t_charIterator_String a b = (compare `on` I.fromString) a b == compare a b t_charIterator_Text a b = (compare `on` I.fromText) a b == compare a b t_charIterator_Utf8 a b = (compare `on` I.fromUtf8) ba bb == compare ba bb where ba = T.encodeUtf8 a; bb = T.encodeUtf8 b -- Normalization t_quickCheck_isNormalized mode normMode txt | mode `elem` [NFD, NFKD] = quickCheck == Just isNormalized | otherwise = fromMaybe isNormalized quickCheck == isNormalized where quickCheck = I.quickCheck mode normTxt isNormalized = I.isNormalized mode normTxt normTxt = I.normalize normMode txt -- Collation t_collate a b = c a b == flipOrdering (c b a) where c = I.collate I.uca t_collate_emptyRule a b = I.collate cUca a b == I.collate cEmpty a b where cUca = I.uca cEmpty = either (error "Can’t create empty collator") id $ I.collatorFromRules "" flipOrdering :: Ordering -> Ordering flipOrdering = \ case GT -> LT LT -> GT EQ -> EQ -- Convert converter e = unsafePerformIO $ I.open e Nothing t_convert a = I.toUnicode c (I.fromUnicode c a) == a where c = converter "UTF-32" -- Unicode character database -- These tests are weak. t_blockCode = t_rnf I.blockCode t_charFullName c = I.charFromFullName (I.charFullName c) == Just c t_charName c = maybe True (==c) $ I.charFromName (I.charName c) t_combiningClass = t_rnf I.combiningClass t_direction = t_rnf I.direction -- t_property p = t_rnf $ I.property p t_isMirrored = t_rnf $ I.isMirrored t_mirror = t_rnf $ I.mirror t_digitToInt = t_rnf $ I.digitToInt t_numericValue = t_rnf $ I.numericValue -- Spoofing t_nonspoofable (NonSpoofableText t) = I.spoofCheck I.spoof t == I.CheckOK t_spoofable (LatinSpoofableText t) = I.spoofCheck I.spoof t == I.CheckFailed [I.RestrictionLevel] t_confusable (NonEmptyText t) = I.areConfusable I.spoof t t `elem` [I.CheckFailed [I.MixedScriptConfusable] ,I.CheckFailed [I.SingleScriptConfusable]] -- Encoding Guessing t_Utf8IsUtf8 a = monadicIO $ do val <- run $ CD.detect (utf8Text a) >>= CD.getName assert $ T.isPrefixOf "UTF-8" val propertyTests :: Test propertyTests = testGroup "Properties" [ testProperty "t_toCaseFold" t_toCaseFold , testProperty "t_toLower" t_toLower , testProperty "t_toUpper" t_toUpper , testProperty "t_charIterator_String" t_charIterator_String , testProperty "t_charIterator_Text" t_charIterator_Text , testProperty "t_charIterator_Utf8" t_charIterator_Utf8 , testProperty "t_quickCheck_isNormalized" t_quickCheck_isNormalized , testProperty "t_collate" t_collate , testProperty "t_collate_emptyRule" t_collate_emptyRule , testProperty "t_convert" t_convert , testProperty "t_blockCode" t_blockCode , testProperty "t_charFullName" t_charFullName , testProperty "t_charName" t_charName , testProperty "t_combiningClass" t_combiningClass , testProperty "t_direction" $ t_direction --, testProperty "t_property" t_property , testProperty "t_isMirrored" t_isMirrored , testProperty "t_mirror" t_mirror , testProperty "t_digitToInt" t_digitToInt , testProperty "t_numericValue" t_numericValue , testProperty "t_spoofable" t_spoofable , testProperty "t_nonspoofable" t_nonspoofable , testProperty "t_confusable" t_confusable , testProperty "t_Utf8IsUtf8" t_Utf8IsUtf8 ] testCases :: Test testCases = testGroup "Test cases" $ hUnitTestToTests $ Test.HUnit.TestList $ [I.normalize NFC "Ame\x0301lie" ~?= "Amélie" ,I.normalize NFC "(⊃。•́︵•̀。)⊃" ~?= "(⊃。•́︵•̀。)⊃" ,map I.brkBreak (I.breaks (I.breakWord (Locale "en_US")) "Hi, Amélie!") ~?= ["Hi",","," ","Amélie","!"] ,map I.brkBreak (I.breaksRight (I.breakLine (Locale "ru")) "Привет, мир!") ~?= ["мир!","Привет, "] ,(I.unfold I.group <$> I.findAll "[abc]+" "xx b yy ac") ~?= [["b"],["ac"]] ,I.toUpper (Locale "de-DE") "ß" ~?= "SS" ,I.toCaseFold False "flag" ~?= "flag" ,map I.blockCode ['\x1FA50', '\203257', '\73494'] `oneOf` [[I.ChessSymbols, I.CjkUnifiedIdeographsExtensionH, I.Kawi] ,[I.ChessSymbols, I.NoBlock, I.NoBlock] -- ICU < 72 does not have last two codes ] ,I.direction '\x2068' ~?= I.FirstStrongIsolate ,I.getSkeleton I.spoof Nothing "\1089\1072t" ~?= "cat" ,S.shapeArabic [S.LettersShape] (nosp "ا ب ت ث") ~?= (nosp "ﺍ ﺑ ﺘ ﺚ") ,BiDi.reorderParagraphs [] (nosp "abc ا ب ت ث def\n123") ~?= ["abc" <> T.reverse (nosp "ا ب ت ث") <> "def\n", "123"] ,N.formatNumber (N.numberFormatter N.NUM_CURRENCY_PLURAL "en_US") (12.5 :: Double) ~?= "12.50 US dollars" ,do dfDe <- I.standardDateFormatter I.LongFormatStyle I.LongFormatStyle (Locale "de_DE") "" c <- cal "CET" 2000 00 01 02 03 00 return $ I.formatCalendar dfDe (Cal.add c [(Cal.Hour, 25), (Cal.Second, 65)]) `ioEq` "2. Januar 2000 um 03:04:05 GMT+1" ,do dfAt <- I.standardDateFormatter I.LongFormatStyle I.LongFormatStyle (Locale "de_AT") "CET" return $ I.dateSymbols dfAt I.Months `ioEq` ["Jänner","Februar","März","April","Mai","Juni" ,"Juli","August","September","Oktober","November","Dezember"] ,do dfP <- I.patternDateFormatter "MMMM dd, yyyy GGGG, hh 'o''clock' a, VVVV" (Locale "en_US") "" c <- cal "America/Los_Angeles" 2000 00 02 03 04 05 return $ I.formatCalendar dfP c `ioEq` "January 02, 2000 Anno Domini, 03 o'clock AM, Los Angeles Time" ,(flip Cal.getField Cal.Year =<< cal "UTC" 1999 01 02 03 04 05) `ioEq` 1999 ,(elem "en_US" <$> I.availableLocales) `ioEq` True ,(flip I.formatIntegral (12345 :: Int) <$> I.numberFormatter "precision-integer" (Locale "fr")) `ioEq` "12\8239\&345" ,(flip I.formatDouble 12345.6789 <$> I.numberFormatter "precision-currency-cash currency/EUR" (Locale "it")) `ioEq` "12.345,68\160€" , Test.HUnit.TestLabel "collate" testCases_collate ] <> concat [conv "ISO-2022-CN" "程序設計" "\ESC$)A\SO3LPr\ESC$)G]CSS\SI" ,conv "cp1251" "Привет, мир!" "\207\240\232\226\229\242, \236\232\240!" ] where conv n f t = [I.fromUnicode c f ~?= t, I.toUnicode c t ~?= f] where c = converter n nosp = T.filter (/= ' ') cal tz y m d h mn s = do c <- Cal.calendar tz (Locale "en_US") Cal.TraditionalCalendarType Cal.setDateTime c y m d h mn s return c ioEq io a = Test.HUnit.TestCase $ do x <- io x @?= a oneOf actual expected = Test.HUnit.TestCase $ unless (actual `elem` expected) $ E.throwIO $ HUnitFailure Nothing $ ExpectedButGot Nothing (unlines $ "one of:" : map show expected) (show actual) testCases_collate :: Test.HUnit.Test testCases_collate = Test.HUnit.TestList $ [ Test.HUnit.TestLabel "invalid format" $ assertParseError (I.collatorFromRules "& a < <") Err.u_INVALID_FORMAT_ERROR (Just 0) (Just 4) , Test.HUnit.TestLabel "custom collator" $ Test.HUnit.TestCase $ do let c = either (error "Can’t create b)) #endif instance Arbitrary T.Text where arbitrary = T.pack `fmap` arbitrary shrink = map T.pack . shrink . T.unpack instance Arbitrary BS.ByteString where arbitrary = BS.pack <$> arbitrary shrink xs = BS.pack <$> shrink (BS.unpack xs) instance Arbitrary LocaleName where arbitrary = elements (Root:available) instance Arbitrary NormalizationMode where arbitrary = elements [NFD .. NFKCCasefold] instance Arbitrary Collator where arbitrary = I.collator <$> arbitrary newtype NonEmptyText = NonEmptyText { nonEmptyText :: T.Text } deriving Show instance Arbitrary NonEmptyText where arbitrary = NonEmptyText . T.pack <$> listOf1 arbitrary newtype LatinSpoofableText = LatinSpoofableText { latinSpoofableText :: T.Text } deriving Show instance Arbitrary LatinSpoofableText where arbitrary = LatinSpoofableText <$> T.pack . (<>) "latin" <$> listOf1 genCyrillicLatinSpoofableChar genCyrillicLatinSpoofableChar :: Gen Char genCyrillicLatinSpoofableChar = elements ( "\x043A\x043E\x0433\x0435\x043A\x043C" ++ ['\x043E'..'\x0443'] ++ ['\x0445'..'\x0446'] ++ "\x044A" ++ ['\x0454'..'\x0456'] ++ "\x0458\x045B\x048D\x0491\x0493\x049B\x049F\x04AB\x04AD\x04AF\x04B1\x04BB" ++ "\x04BD\x04BF" ++ ['\x04CE'..'\x04CF'] ++ "\x04D5\x04D9\x04E9\x0501\x0511\x051B\x051D") newtype NonSpoofableText = NonSpoofableText { nonSpoofableText :: T.Text } deriving Show instance Arbitrary NonSpoofableText where arbitrary = NonSpoofableText <$> T.pack <$> listOf1 genNonSpoofableChar genNonSpoofableChar :: Gen Char genNonSpoofableChar = elements "QDFRz" newtype Utf8Text = Utf8Text { utf8Text :: BS.ByteString } deriving Show instance Arbitrary Utf8Text where arbitrary = Utf8Text . BS.pack <$> vectorOf 300 (suchThat (arbitrary :: Gen Word8) (`elem` ([0x41..0x5A] ++ [0x61..0x7A])) ) text-icu-0.8.0.4/tests/Tests.hs0000644000000000000000000000025607346545000014417 0ustar0000000000000000module Main (main) where import Test.Framework (defaultMain) import qualified Properties main :: IO () main = defaultMain [Properties.propertyTests, Properties.testCases] text-icu-0.8.0.4/text-icu.cabal0000644000000000000000000001106607346545000014346 0ustar0000000000000000cabal-version: 1.18 -- 1.18 introduced extra-doc-files name: text-icu version: 0.8.0.4 synopsis: Bindings to the ICU library homepage: https://github.com/haskell/text-icu bug-reports: https://github.com/haskell/text-icu/issues description: Haskell bindings to the International Components for Unicode (ICU) libraries. These libraries provide robust and full-featured Unicode services on a wide variety of platforms. . Features include: . * Both pure and impure bindings, to allow for fine control over efficiency and ease of use. . * Breaking of strings on character, word, sentence, and line boundaries. . * Access to the Unicode Character Database (UCD) of character metadata. . * String collation functions, for locales where the conventions for lexicographic ordering differ from the simple numeric ordering of character codes. . * Character set conversion functions, allowing conversion between Unicode and over 220 character encodings. . * Unicode normalization. (When implementations keep strings in a normalized form, they can be assured that equivalent strings have a unique binary representation.) . * Regular expression search and replace. . * Security checks for visually confusable (spoofable) strings. . * Bidirectional Unicode algorithm . * Calendar objects holding dates and times. . * Number and calendar formatting. maintainer: Vladimir Shabanov copyright: 2009-2015 Bryan O'Sullivan category: Data, Text license: BSD3 license-file: LICENSE build-type: Simple extra-doc-files: README.markdown changelog.md extra-source-files: benchmarks/Breaker.hs include/hs_text_icu.h tested-with: GHC == 9.8.0 GHC == 9.6.2 GHC == 9.4.7 GHC == 9.2.8 GHC == 9.0.2 GHC == 8.10.7 GHC == 8.8.4 GHC == 8.6.5 GHC == 8.4.4 GHC == 8.2.2 GHC == 8.0.2 GHC == 7.10.3 library default-language: Haskell98 build-depends: base >= 4.8 && < 5, bytestring >= 0.9 && < 0.13, deepseq >= 1.4.2.0 && < 1.6, text >= 0.9.1.0 && < 1.3 || >= 2.0 && < 2.2, time >= 1.5 && < 1.13 pkgconfig-depends: icu-i18n >= 62.1 exposed-modules: Data.Text.ICU Data.Text.ICU.BiDi Data.Text.ICU.Calendar Data.Text.ICU.Break Data.Text.ICU.Char Data.Text.ICU.CharsetDetection Data.Text.ICU.Collate Data.Text.ICU.Convert Data.Text.ICU.DateFormatter Data.Text.ICU.Error Data.Text.ICU.Locale Data.Text.ICU.Normalize Data.Text.ICU.Number Data.Text.ICU.Normalize2 Data.Text.ICU.NumberFormatter Data.Text.ICU.Regex Data.Text.ICU.Shape Data.Text.ICU.Spoof Data.Text.ICU.Types other-modules: Data.Text.ICU.BiDi.Internal Data.Text.ICU.BitMask Data.Text.ICU.Break.Pure Data.Text.ICU.Break.Types Data.Text.ICU.CharsetDetection.Internal Data.Text.ICU.Collate.Internal Data.Text.ICU.Collate.Pure Data.Text.ICU.Number.Internal Data.Text.ICU.Convert.Internal Data.Text.ICU.Enumerator Data.Text.ICU.Error.Internal Data.Text.ICU.Internal Data.Text.ICU.Iterator Data.Text.ICU.Normalize.Internal Data.Text.ICU.Regex.Internal Data.Text.ICU.Regex.Pure Data.Text.ICU.Spoof.Internal Data.Text.ICU.Spoof.Pure Data.Text.ICU.Text c-sources: cbits/text_icu.c cc-options: -Wall -Wextra -pedantic -Wno-deprecated include-dirs: include if os(darwin) extra-lib-dirs: /usr/local/opt/icu4c/lib /opt/homebrew/opt/icu4c/lib include-dirs: /usr/local/opt/icu4c/include /opt/homebrew/opt/icu4c/include extra-libraries: icuuc if os(mingw32) extra-libraries: icuin icudt icuio else extra-libraries: icui18n icudata ghc-options: -Wall if impl(ghc >= 8.0) ghc-options: -Wcompat test-suite tests default-language: Haskell98 type: exitcode-stdio-1.0 hs-source-dirs: tests main-is: Tests.hs other-modules: Properties QuickCheckUtils ghc-options: -Wall -threaded -rtsopts build-depends: HUnit >= 1.2, QuickCheck >= 2.4, array, base, bytestring >= 0.9 && < 0.13, deepseq >= 1.4.2.0 && < 1.6, directory, ghc-prim, random, test-framework >= 0.4, test-framework-hunit >= 0.2, test-framework-quickcheck2 >= 0.2, text, text-icu if impl(ghc <= 8.2) build-depends: semigroups source-repository head type: git location: https://github.com/haskell/text-icu