text-icu-0.6.3.7/0000755000000000000000000000000012234226032011611 5ustar0000000000000000text-icu-0.6.3.7/LICENSE0000644000000000000000000000245412234226032012623 0ustar0000000000000000Copyright (c) 2009, Bryan O'Sullivan All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. text-icu-0.6.3.7/README.markdown0000644000000000000000000000171312234226032014314 0ustar0000000000000000# Text-ICU: Comprehensive support for string manipulation This package provides the Data.Text.ICU library, for performing complex manipulation of Unicode text. It provides features such as the following: * Unicode normalization * Conversion to and from many common and obscure encodings # Prerequisites This library is implemented as bindings to the well-respected [ICU library](http://www.icu-project.org/), which is not included. The versions of ICU currently supported are 4.0 and newer. # Get involved! Please report bugs via the [bitbucket issue tracker](http://bitbucket.org/bos/text-icu/issues). Master [Mercurial repository](http://bitbucket.org/bos/text-icu): * `hg clone http://bitbucket.org/bos/text-icu` There's also a [git mirror](http://github.com/bos/text-icu): * `git clone git://github.com/bos/text-icu.git` (You can create and contribute changes using either Mercurial or git.) # Authors This library was written by Bryan O'Sullivan. text-icu-0.6.3.7/Setup.lhs0000644000000000000000000000011412234226032013415 0ustar0000000000000000#!/usr/bin/env runhaskell > import Distribution.Simple > main = defaultMain text-icu-0.6.3.7/text-icu.cabal0000644000000000000000000000620712234226032014344 0ustar0000000000000000name: text-icu version: 0.6.3.7 synopsis: Bindings to the ICU library homepage: https://github.com/bos/text-icu bug-reports: https://github.com/bos/text-icu/issues description: Haskell bindings to the International Components for Unicode (ICU) libraries. These libraries provide robust and full-featured Unicode services on a wide variety of platforms. . Features include: . * Both pure and impure bindings, to allow for fine control over efficiency and ease of use. . * Breaking of strings on character, word, sentence, and line boundaries. . * Access to the Unicode Character Database (UCD) of character metadata. . * String collation functions, for locales where the conventions for lexicographic ordering differ from the simple numeric ordering of character codes. . * Character set conversion functions, allowing conversion between Unicode and over 220 character encodings. . * Unicode normalization. (When implementations keep strings in a normalized form, they can be assured that equivalent strings have a unique binary representation.) . * Regular expression search and replace. maintainer: Bryan O'Sullivan copyright: 2009-2013 Bryan O'Sullivan category: Data, Text license: BSD3 license-file: LICENSE build-type: Simple cabal-version: >= 1.10 extra-source-files: README.markdown include/hs_text_icu.h benchmarks/Breaker.hs library default-language: Haskell98 build-depends: base >= 4 && < 5, bytestring, text >= 0.9.1.0 exposed-modules: Data.Text.ICU Data.Text.ICU.Break Data.Text.ICU.Char Data.Text.ICU.Collate Data.Text.ICU.Convert Data.Text.ICU.Error Data.Text.ICU.Normalize Data.Text.ICU.Regex Data.Text.ICU.Types other-modules: Data.Text.ICU.Break.Pure Data.Text.ICU.Break.Types Data.Text.ICU.Collate.Internal Data.Text.ICU.Collate.Pure Data.Text.ICU.Convert.Internal Data.Text.ICU.Error.Internal Data.Text.ICU.Internal Data.Text.ICU.Iterator Data.Text.ICU.Normalize.Internal Data.Text.ICU.Regex.Internal Data.Text.ICU.Regex.Pure Data.Text.ICU.Text c-sources: cbits/text_icu.c include-dirs: include extra-libraries: icuuc if os(mingw32) extra-libraries: icuin icudt else extra-libraries: icui18n icudata ghc-options: -Wall if impl(ghc >= 6.8) ghc-options: -fwarn-tabs test-suite tests default-language: Haskell98 type: exitcode-stdio-1.0 hs-source-dirs: tests main-is: Tests.hs other-modules: Properties QuickCheckUtils ghc-options: -Wall -threaded -O0 -rtsopts build-depends: HUnit >= 1.2, QuickCheck >= 2.4, array, base >= 4 && < 5, bytestring, deepseq, directory, ghc-prim, random, test-framework >= 0.4, test-framework-hunit >= 0.2, test-framework-quickcheck2 >= 0.2, text, text-icu source-repository head type: git location: https://github.com/bos/text-icu source-repository head type: mercurial location: https://bitbucket.org/bos/text-icu text-icu-0.6.3.7/benchmarks/0000755000000000000000000000000012234226032013726 5ustar0000000000000000text-icu-0.6.3.7/benchmarks/Breaker.hs0000644000000000000000000000207112234226032015635 0ustar0000000000000000-- Estimate the time difference between creating a breaker. {-# LANGUAGE OverloadedStrings #-} import Control.Monad import qualified Data.Text as T import Data.Text.IO as T import Data.Text.ICU.Break as IO import Data.Text.ICU as ICU import System.Environment consume b = go where go = do m <- next b case m of Nothing -> return () Just _ -> go manyBreakers (t:ts) = do b <- IO.breakWord "en" t consume b manyBreakers ts manyBreakers _ = return () oneBreaker ts = do b <- IO.breakWord "en" "" forM_ ts $ \t -> do setText b t consume b cloneBreakers ts = do b <- IO.breakWord "en" "" forM_ ts $ \t -> do b' <- clone b setText b' t consume b' pureBreaker ts = do let b = ICU.breakWord "en" forM_ ts $ \t -> length (breaks b t) `seq` return () main = do (kind:files) <- getArgs let act = case kind of "one" -> oneBreaker "many" -> manyBreakers "clone" -> cloneBreakers "pure" -> pureBreaker forM_ files $ \f -> T.readFile f >>= act . T.lines text-icu-0.6.3.7/cbits/0000755000000000000000000000000012234226032012715 5ustar0000000000000000text-icu-0.6.3.7/cbits/text_icu.c0000644000000000000000000002600112234226032014704 0ustar0000000000000000#include "hs_text_icu.h" UBreakIterator* __hs_ubrk_open(UBreakIteratorType type, const char *locale, const UChar *text, int32_t textLength, UErrorCode *status) { return ubrk_open(type, locale, text, textLength, status); } void __hs_ubrk_close(UBreakIterator *bi) { ubrk_close(bi); } void __hs_ubrk_setText(UBreakIterator* bi, const UChar *text, int32_t textLength, UErrorCode *status) { ubrk_setText(bi, text, textLength, status); } UBreakIterator * __hs_ubrk_safeClone(const UBreakIterator *bi, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status) { return ubrk_safeClone(bi, stackBuffer, pBufferSize, status); } int32_t __hs_ubrk_current(UBreakIterator *bi) { return ubrk_current(bi); } int32_t __hs_ubrk_first(UBreakIterator *bi) { return ubrk_first(bi); } int32_t __hs_ubrk_last(UBreakIterator *bi) { return ubrk_last(bi); } int32_t __hs_ubrk_next(UBreakIterator *bi) { return ubrk_next(bi); } int32_t __hs_ubrk_previous(UBreakIterator *bi) { return ubrk_previous(bi); } int32_t __hs_ubrk_preceding(UBreakIterator *bi, int32_t offset) { return ubrk_preceding(bi, offset); } int32_t __hs_ubrk_following(UBreakIterator *bi, int32_t offset) { return ubrk_following(bi, offset); } int32_t __hs_ubrk_getRuleStatus(UBreakIterator *bi) { return ubrk_getRuleStatus(bi); } int32_t __hs_ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status) { return ubrk_getRuleStatusVec(bi, fillInVec, capacity, status); } UBool __hs_ubrk_isBoundary(UBreakIterator *bi, int32_t offset) { return ubrk_isBoundary(bi, offset); } int32_t __hs_ubrk_countAvailable(void) { return ubrk_countAvailable(); } const char* __hs_ubrk_getAvailable(int32_t index) { return ubrk_getAvailable(index); } UCollator* __hs_ucol_open(const char *loc, UErrorCode *status) { return ucol_open(loc, status); } void __hs_ucol_close(UCollator *coll) { ucol_close(coll); } void __hs_ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) { ucol_setAttribute(coll, attr, value, status); } UColAttributeValue __hs_ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) { return ucol_getAttribute(coll, attr, status); } UCollationResult __hs_ucol_strcoll(const UCollator *coll, const UChar *source, int32_t sourceLength, const UChar *target, int32_t targetLength) { return ucol_strcoll(coll, source, sourceLength, target, targetLength); } UCollationResult __hs_ucol_strcollIter(const UCollator *coll, UCharIterator *sIter, UCharIterator *tIter, UErrorCode *status) { return ucol_strcollIter(coll, sIter, tIter, status); } UCollator* __hs_ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status) { return ucol_safeClone(coll, stackBuffer, pBufferSize, status); } int32_t __hs_ucol_getSortKey(const UCollator *coll, const UChar *source, int32_t sourceLength, uint8_t *result, int32_t resultLength) { return ucol_getSortKey(coll, source, sourceLength, result, resultLength); } UBool __hs_ucol_equals(const UCollator *source, const UCollator *target) { return ucol_equals(source, target); } int __get_max_bytes_for_string(UConverter *cnv, int src_length) { return UCNV_GET_MAX_BYTES_FOR_STRING(src_length, ucnv_getMaxCharSize(cnv)); } const char *__hs_u_errorName(UErrorCode code) { return u_errorName(code); } const char *__hs_ucnv_getName(const UConverter *converter, UErrorCode *err) { return ucnv_getName(converter, err); } UConverter* __hs_ucnv_open(const char *converterName, UErrorCode *err) { return ucnv_open(converterName, err); } void __hs_ucnv_close(UConverter * converter) { ucnv_close(converter); } int32_t __hs_ucnv_toUChars(UConverter *cnv, UChar *dest, int32_t destCapacity, const char *src, int32_t srcLength, UErrorCode *pErrorCode) { return ucnv_toUChars(cnv, dest, destCapacity, src, srcLength, pErrorCode); } int32_t __hs_ucnv_fromUChars(UConverter *cnv, char *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UErrorCode *pErrorCode) { return ucnv_fromUChars(cnv, dest, destCapacity, src, srcLength, pErrorCode); } int __hs_ucnv_compareNames(const char *name1, const char *name2) { return ucnv_compareNames(name1, name2); } const char *__hs_ucnv_getDefaultName(void) { return ucnv_getDefaultName(); } void __hs_ucnv_setDefaultName(const char *name) { ucnv_setDefaultName(name); } int32_t __hs_ucnv_countAvailable(void) { return ucnv_countAvailable(); } const char* __hs_ucnv_getAvailableName(int32_t n) { return ucnv_getAvailableName(n); } uint16_t __hs_ucnv_countAliases(const char *alias, UErrorCode *pErrorCode) { return ucnv_countAliases(alias, pErrorCode); } const char *__hs_ucnv_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode) { return ucnv_getAlias(alias, n, pErrorCode); } uint16_t __hs_ucnv_countStandards(void) { return ucnv_countStandards(); } const char *__hs_ucnv_getStandard(uint16_t n, UErrorCode *pErrorCode) { return ucnv_getStandard(n, pErrorCode); } UBool __hs_ucnv_usesFallback(const UConverter *cnv) { return ucnv_usesFallback(cnv); } void __hs_ucnv_setFallback(UConverter *cnv, UBool usesFallback) { ucnv_setFallback(cnv, usesFallback); } UBool __hs_ucnv_isAmbiguous(const UConverter *cnv) { return ucnv_isAmbiguous(cnv); } void __hs_uiter_setString(UCharIterator *iter, const UChar *s, int32_t length) { uiter_setString(iter, s, length); } void __hs_uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) { uiter_setUTF8(iter, s, length); } int32_t __hs_unorm_compare(const UChar *s1, int32_t length1, const UChar *s2, int32_t length2, uint32_t options, UErrorCode *pErrorCode) { return unorm_compare(s1, length1, s2, length2, options, pErrorCode); } UNormalizationCheckResult __hs_unorm_quickCheck(const UChar *source, int32_t sourcelength, UNormalizationMode mode, UErrorCode *status) { return unorm_quickCheck(source, sourcelength, mode, status); } UBool __hs_unorm_isNormalized(const UChar *src, int32_t srcLength, UNormalizationMode mode, UErrorCode *pErrorCode) { return unorm_isNormalized(src, srcLength, mode, pErrorCode); } int32_t __hs_unorm_normalize(const UChar *source, int32_t sourceLength, UNormalizationMode mode, int32_t options, UChar *result, int32_t resultLength, UErrorCode *status) { return unorm_normalize(source, sourceLength, mode, options, result, resultLength, status); } int32_t __hs_u_strToUpper(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, const char *locale, UErrorCode *pErrorCode) { return u_strToUpper(dest, destCapacity, src, srcLength, locale, pErrorCode); } int32_t __hs_u_strToLower(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, const char *locale, UErrorCode *pErrorCode) { return u_strToLower(dest, destCapacity, src, srcLength, locale, pErrorCode); } int32_t __hs_u_strFoldCase(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, uint32_t options, UErrorCode *pErrorCode) { return u_strFoldCase(dest, destCapacity, src, srcLength, options, pErrorCode); } int32_t __hs_u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2) { return u_strCompareIter(iter1, iter2, TRUE); } UBlockCode __hs_ublock_getCode(UChar32 c) { return ublock_getCode(c); } UCharDirection __hs_u_charDirection(UChar32 c) { return u_charDirection(c); } UBool __hs_u_isMirrored(UChar32 c) { return u_isMirrored(c); } UChar32 __hs_u_charMirror(UChar32 c) { return u_charMirror(c); } uint8_t __hs_u_getCombiningClass(UChar32 c) { return u_getCombiningClass(c); } int32_t __hs_u_charDigitValue(UChar32 c) { return u_charDigitValue(c); } int32_t __hs_u_charName(UChar32 code, UCharNameChoice nameChoice, char *buffer, int32_t bufferLength, UErrorCode *pErrorCode) { return u_charName(code, nameChoice, buffer, bufferLength, pErrorCode); } UChar32 __hs_u_charFromName(UCharNameChoice nameChoice, const char *name, UErrorCode *pErrorCode) { return u_charFromName(nameChoice, name, pErrorCode); } int32_t __hs_u_getISOComment(UChar32 c, char *dest, int32_t destCapacity, UErrorCode *pErrorCode) { return u_getISOComment(c, dest, destCapacity, pErrorCode); } int32_t __hs_u_getIntPropertyValue(UChar32 c, UProperty which) { return u_getIntPropertyValue(c, which); } double __hs_u_getNumericValue(UChar32 c) { return u_getNumericValue(c); } URegularExpression * __hs_uregex_open(const UChar *pattern, int32_t patternLength, uint32_t flags, UParseError *pe, UErrorCode *status) { return uregex_open(pattern, patternLength, flags, pe, status); } void __hs_uregex_setTimeLimit(URegularExpression *regexp, int32_t limit, UErrorCode *status) { return uregex_setTimeLimit(regexp, limit, status); } void __hs_uregex_setStackLimit(URegularExpression *regexp, int32_t limit, UErrorCode *status) { return uregex_setStackLimit(regexp, limit, status); } void __hs_uregex_close(URegularExpression *regexp) { return uregex_close(regexp); } URegularExpression *__hs_uregex_clone(URegularExpression *regexp, UErrorCode *pErrorCode) { return uregex_clone(regexp, pErrorCode); } const UChar *__hs_uregex_pattern(const URegularExpression *regexp, int32_t *patLength, UErrorCode *status) { return uregex_pattern(regexp, patLength, status); } int32_t __hs_uregex_flags(const URegularExpression *regexp, UErrorCode *status) { return uregex_flags(regexp, status); } void __hs_uregex_setText(URegularExpression *regexp, const UChar *text, int32_t textLength, UErrorCode *status) { return uregex_setText(regexp, text, textLength, status); } const UChar *__hs_uregex_getText(URegularExpression *regexp, int32_t *textLength, UErrorCode *status) { return uregex_getText(regexp, textLength, status); } UBool __hs_uregex_find(URegularExpression *regexp, int32_t startIndex, UErrorCode *status) { return uregex_find(regexp, startIndex, status); } UBool __hs_uregex_findNext(URegularExpression *regexp, UErrorCode *status) { return uregex_findNext(regexp, status); } int32_t __hs_uregex_start(URegularExpression *regexp, int32_t groupNum, UErrorCode *status) { return uregex_start(regexp, groupNum, status); } int32_t __hs_uregex_end(URegularExpression *regexp, int32_t groupNum, UErrorCode *status) { return uregex_end(regexp, groupNum, status); } int32_t __hs_uregex_groupCount(URegularExpression *regexp, UErrorCode *status) { return uregex_groupCount(regexp, status); } int32_t __hs_uregex_group(URegularExpression *regexp, int32_t groupNum, UChar *dest, int32_t destCapacity, UErrorCode *status) { return uregex_group(regexp, groupNum, dest, destCapacity, status); } text-icu-0.6.3.7/Data/0000755000000000000000000000000012234226032012462 5ustar0000000000000000text-icu-0.6.3.7/Data/Text/0000755000000000000000000000000012234226032013406 5ustar0000000000000000text-icu-0.6.3.7/Data/Text/ICU.hs0000644000000000000000000001034012234226032014360 0ustar0000000000000000{-# LANGUAGE CPP, NoImplicitPrelude #-} -- | -- Module : Data.Text.ICU -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Commonly used functions for Unicode, implemented as bindings to the -- International Components for Unicode (ICU) libraries. -- -- This module contains only the most commonly used types and -- functions. Other modules in this package expose richer interfaces. module Data.Text.ICU ( -- * Data representation -- $data -- * Types LocaleName(..) -- * Boundary analysis -- $break , Breaker , Break , brkPrefix , brkBreak , brkSuffix , brkStatus , Line(..) , Word(..) , breakCharacter , breakLine , breakSentence , breakWord , breaks , breaksRight -- * Case mapping , toCaseFold , toLower , toUpper -- * Iteration , CharIterator , fromString , fromText , fromUtf8 -- * Normalization , NormalizationMode(..) , normalize , quickCheck , isNormalized -- * String comparison -- ** Normalization-sensitive string comparison , CompareOption(..) , compare -- ** Locale-sensitive string collation -- $collate , Collator , collator , collatorWith , collate , collateIter , sortKey , uca -- * Regular expressions , MatchOption(..) , ParseError(errError, errLine, errOffset) , Match , Regex , Regular -- ** Construction , regex , regex' -- ** Inspection , pattern -- ** Searching , find , findAll -- ** Match groups -- $group , groupCount , unfold , span , group , prefix , suffix ) where import Data.Text.ICU.Break.Pure import Data.Text.ICU.Collate.Pure import Data.Text.ICU.Internal import Data.Text.ICU.Iterator import Data.Text.ICU.Normalize import Data.Text.ICU.Regex.Pure import Data.Text.ICU.Text #if defined(__HADDOCK__) import Data.Text.Foreign import Data.Text (Text) #endif -- $data -- -- The Haskell 'Text' type is implemented as an array in the Haskell -- heap. This means that its location is not pinned; it may be copied -- during a garbage collection pass. ICU, on the other hand, works -- with strings that are allocated in the normal system heap and have -- a fixed address. -- -- To accommodate this need, these bindings use the functions from -- 'Data.Text.Foreign' to copy data between the Haskell heap and the -- system heap. The copied strings are still managed automatically, -- but the need to duplicate data does add some performance and memory -- overhead. -- $break -- -- Text boundary analysis is the process of locating linguistic -- boundaries while formatting and handling text. Examples of this -- process include: -- -- * Locating appropriate points to word-wrap text to fit within -- specific margins while displaying or printing. -- -- * Counting characters, words, sentences, or paragraphs. -- -- * Making a list of the unique words in a document. -- -- * Figuring out if a given range of text contains only whole words. -- -- * Capitalizing the first letter of each word. -- -- * Locating a particular unit of the text (For example, finding the -- third word in the document). -- -- The 'Breaker' type was designed to support these kinds of -- tasks. -- -- For the impure boundary analysis API (which is richer, but less -- easy to use than the pure API), see the "Data.Text.ICU.Break" -- module. The impure API supports some uses that may be less -- efficient via the pure API, including: -- -- * Locating the beginning of a word that the user has selected. -- -- * Determining how far to move the text cursor when the user hits an -- arrow key (Some characters require more than one position in the -- text store and some characters in the text store do not display -- at all). -- $collate -- -- For the impure collation API (which is richer, but less easy to -- use than the pure API), see the "Data.Text.ICU.Collate" -- module. -- $group -- -- Capturing groups are numbered starting from zero. Group zero is -- always the entire matching text. Groups greater than zero contain -- the text matching each capturing group in a regular expression. text-icu-0.6.3.7/Data/Text/ICU/0000755000000000000000000000000012234226032014026 5ustar0000000000000000text-icu-0.6.3.7/Data/Text/ICU/Break.hsc0000644000000000000000000002625312234226032015561 0ustar0000000000000000{-# LANGUAGE BangPatterns, ForeignFunctionInterface, RecordWildCards #-} -- | -- Module : Data.Text.ICU.Break -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- String breaking functions for Unicode, implemented as bindings to -- the International Components for Unicode (ICU) libraries. -- -- The text boundary positions are found according to the rules described in -- Unicode Standard Annex #29, Text Boundaries, and Unicode Standard Annex -- #14, Line Breaking Properties. These are available at -- and -- . module Data.Text.ICU.Break ( -- * Types BreakIterator , Line(..) , Word(..) -- * Breaking functions , breakCharacter , breakLine , breakSentence , breakWord , clone , setText -- * Iteration functions -- $indices , current , first , last , next , previous , preceding , following , isBoundary -- * Iterator status , getStatus , getStatuses -- * Locales , available ) where #include import Control.Monad (forM) import Data.IORef (newIORef, writeIORef) import Data.Int (Int32) import Data.Text (Text) import Data.Text.Foreign (I16, useAsPtr) import Data.Text.ICU.Break.Types (BreakIterator(..), UBreakIterator) import Data.Text.ICU.Error.Internal (UErrorCode, handleError) import Data.Text.ICU.Internal (LocaleName(..), UBool, UChar, asBool, withLocaleName) import Foreign.C.String (CString, peekCString) import Foreign.C.Types (CInt(..)) import Foreign.ForeignPtr (newForeignPtr, withForeignPtr) import Foreign.Marshal.Array (allocaArray, peekArray) import Foreign.Marshal.Utils (with) import Foreign.Ptr (FunPtr, Ptr, nullPtr) import Prelude hiding (last) import System.IO.Unsafe (unsafePerformIO) -- $indices -- -- /Important note/: All of the indices accepted and returned by -- functions in this module are offsets into the raw UTF-16 text -- array, /not/ a count of code points. -- | Line break status. data Line = Soft -- ^ A soft line break is a position at -- which a line break is acceptable, but not -- required. | Hard deriving (Eq, Show, Enum) -- | Word break status. data Word = Uncategorized -- ^ A \"word\" that does not fit into another -- category. Includes spaces and most -- punctuation. | Number -- ^ A word that appears to be a number. | Letter -- ^ A word containing letters, excluding -- hiragana, katakana or ideographic -- characters. | Kana -- ^ A word containing kana characters. | Ideograph -- ^ A word containing ideographic characters. deriving (Eq, Show, Enum) -- | Break a string on character boundaries. -- -- Character boundary analysis identifies the boundaries of \"Extended -- Grapheme Clusters\", which are groupings of codepoints that should be -- treated as character-like units for many text operations. Please see -- Unicode Standard Annex #29, Unicode Text Segmentation, -- for additional information on -- grapheme clusters and guidelines on their use. breakCharacter :: LocaleName -> Text -> IO (BreakIterator ()) breakCharacter = open (#const UBRK_CHARACTER) (const ()) -- | Break a string on line boundaries. -- -- Line boundary analysis determines where a text string can be broken when -- line wrapping. The mechanism correctly handles punctuation and hyphenated -- words. breakLine :: LocaleName -> Text -> IO (BreakIterator Line) breakLine = open (#const UBRK_LINE) asLine where asLine i | i < (#const UBRK_LINE_SOFT_LIMIT) = Soft | i < (#const UBRK_LINE_HARD_LIMIT) = Hard | otherwise = error $ "unknown line break status " ++ show i -- | Break a string on sentence boundaries. -- -- Sentence boundary analysis allows selection with correct interpretation -- of periods within numbers and abbreviations, and trailing punctuation -- marks such as quotation marks and parentheses. breakSentence :: LocaleName -> Text -> IO (BreakIterator ()) breakSentence = open (#const UBRK_SENTENCE) (const ()) -- | Break a string on word boundaries. -- -- Word boundary analysis is used by search and replace functions, as well -- as within text editing applications that allow the user to select words -- with a double click. Word selection provides correct interpretation of -- punctuation marks within and following words. Characters that are not -- part of a word, such as symbols or punctuation marks, have word breaks on -- both sides. breakWord :: LocaleName -> Text -> IO (BreakIterator Word) breakWord = open (#const UBRK_WORD) asWord where asWord i | i < (#const UBRK_WORD_NONE_LIMIT) = Uncategorized | i < (#const UBRK_WORD_NUMBER_LIMIT) = Number | i < (#const UBRK_WORD_LETTER_LIMIT) = Letter | i < (#const UBRK_WORD_KANA_LIMIT) = Kana | i < (#const UBRK_WORD_IDEO_LIMIT) = Ideograph | otherwise = error $ "unknown word break status " ++ show i -- | Create a new 'BreakIterator' for locating text boundaries in the -- specified locale. open :: UBreakIteratorType -> (Int32 -> a) -> LocaleName -> Text -> IO (BreakIterator a) open brk f loc t = withLocaleName loc $ \locale -> useAsPtr t $ \ptr len -> do bi <- handleError $ ubrk_open brk locale ptr (fromIntegral len) r <- newIORef t BR r f `fmap` newForeignPtr ubrk_close bi -- | Point an existing 'BreakIterator' at a new piece of text. setText :: BreakIterator a -> Text -> IO () setText BR{..} t = useAsPtr t $ \ptr len -> do withForeignPtr brIter $ \p -> handleError $ ubrk_setText p ptr (fromIntegral len) writeIORef brText t -- | Thread safe cloning operation. This is substantially faster than -- creating a new 'BreakIterator' from scratch. clone :: BreakIterator a -> IO (BreakIterator a) clone BR{..} = do bi <- withForeignPtr brIter $ \p -> with 1 $ handleError . ubrk_safeClone p nullPtr BR brText brStatus `fmap` newForeignPtr ubrk_close bi asIndex :: (Ptr UBreakIterator -> IO Int32) -> BreakIterator a -> IO (Maybe I16) asIndex act BR{..} = do i <- withForeignPtr brIter act return $! if i == (#const UBRK_DONE) then Nothing else Just $! fromIntegral i -- | Reset the breaker to the beginning of the text to be scanned. first :: BreakIterator a -> IO I16 first BR{..} = fromIntegral `fmap` withForeignPtr brIter ubrk_first -- | Reset the breaker to the end of the text to be scanned. last :: BreakIterator a -> IO I16 last BR{..} = fromIntegral `fmap` withForeignPtr brIter ubrk_last -- | Advance the iterator and break at the text boundary that follows the -- current text boundary. next :: BreakIterator a -> IO (Maybe I16) next = asIndex ubrk_next -- | Advance the iterator and break at the text boundary that precedes the -- current text boundary. previous :: BreakIterator a -> IO (Maybe I16) previous = asIndex ubrk_previous -- | Determine the text boundary preceding the specified offset. preceding :: BreakIterator a -> Int -> IO (Maybe I16) preceding bi i = asIndex (flip ubrk_preceding (fromIntegral i)) bi -- | Determine the text boundary following the specified offset. following :: BreakIterator a -> Int -> IO (Maybe I16) following bi i = asIndex (flip ubrk_following (fromIntegral i)) bi -- | Return the character index most recently returned by 'next', -- 'previous', 'first', or 'last'. current :: BreakIterator a -> IO (Maybe I16) current = asIndex ubrk_current -- | Return the status from the break rule that determined the most recently -- returned break position. For rules that do not specify a status, a -- default value of @()@ is returned. getStatus :: BreakIterator a -> IO a getStatus BR{..} = brStatus `fmap` withForeignPtr brIter ubrk_getRuleStatus -- | Return statuses from all of the break rules that determined the most -- recently returned break position. getStatuses :: BreakIterator a -> IO [a] getStatuses BR{..} = withForeignPtr brIter $ \brk -> do n <- handleError $ ubrk_getRuleStatusVec brk nullPtr 0 allocaArray (fromIntegral n) $ \ptr -> do _ <- handleError $ ubrk_getRuleStatusVec brk ptr n map brStatus `fmap` peekArray (fromIntegral n) ptr -- | Determine whether the specfied position is a boundary position. -- As a side effect, leaves the iterator pointing to the first -- boundary position at or after the given offset. isBoundary :: BreakIterator a -> Int -> IO Bool isBoundary BR{..} i = asBool `fmap` withForeignPtr brIter (flip ubrk_isBoundary (fromIntegral i)) -- | Locales for which text breaking information is available. A -- 'BreakIterator' in a locale in this list will perform the correct -- text breaking for the locale. available :: [LocaleName] available = unsafePerformIO $ do n <- ubrk_countAvailable forM [0..n-1] $ \i -> ubrk_getAvailable i >>= fmap Locale . peekCString {-# NOINLINE available #-} type UBreakIteratorType = CInt foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_open" ubrk_open :: UBreakIteratorType -> CString -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO (Ptr UBreakIterator) foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_setText" ubrk_setText :: Ptr UBreakIterator -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO () foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_safeClone" ubrk_safeClone :: Ptr UBreakIterator -> Ptr a -> Ptr Int32 -> Ptr UErrorCode -> IO (Ptr UBreakIterator) foreign import ccall unsafe "hs_text_icu.h &__hs_ubrk_close" ubrk_close :: FunPtr (Ptr UBreakIterator -> IO ()) foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_current" ubrk_current :: Ptr UBreakIterator -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_first" ubrk_first :: Ptr UBreakIterator -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_last" ubrk_last :: Ptr UBreakIterator -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_next" ubrk_next :: Ptr UBreakIterator -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_previous" ubrk_previous :: Ptr UBreakIterator -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_preceding" ubrk_preceding :: Ptr UBreakIterator -> Int32 -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_following" ubrk_following :: Ptr UBreakIterator -> Int32 -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_getRuleStatus" ubrk_getRuleStatus :: Ptr UBreakIterator -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_getRuleStatusVec" ubrk_getRuleStatusVec :: Ptr UBreakIterator -> Ptr Int32 -> Int32 -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_isBoundary" ubrk_isBoundary :: Ptr UBreakIterator -> Int32 -> IO UBool foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_countAvailable" ubrk_countAvailable :: IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_getAvailable" ubrk_getAvailable :: Int32 -> IO CString text-icu-0.6.3.7/Data/Text/ICU/Char.hsc0000644000000000000000000006401612234226032015411 0ustar0000000000000000{-# LANGUAGE BangPatterns, DeriveDataTypeable, FlexibleInstances, ForeignFunctionInterface, FunctionalDependencies, MultiParamTypeClasses #-} -- | -- Module : Data.Text.ICU.Char -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Access to the Unicode Character Database, implemented as bindings -- to the International Components for Unicode (ICU) libraries. -- -- Unicode assigns each code point (not just assigned character) values for -- many properties. Most are simple boolean flags, or constants from a -- small enumerated list. For some, values are relatively more complex -- types. -- -- For more information see \"About the Unicode Character Database\" -- and the ICU User Guide chapter on -- Properties . module Data.Text.ICU.Char ( -- * Working with character properties -- $properties Property -- * Property identifier types , BidiClass_(..) , Block_(..) , Bool_(..) , Decomposition_(..) , EastAsianWidth_(..) , GeneralCategory_(..) , HangulSyllableType_(..) , JoiningGroup_(..) , JoiningType_(..) , NumericType_(..) -- ** Combining class , CanonicalCombiningClass_(..) , LeadCanonicalCombiningClass_(..) , TrailingCanonicalCombiningClass_(..) -- ** Normalization checking , NFCQuickCheck_(..) , NFDQuickCheck_(..) , NFKCQuickCheck_(..) , NFKDQuickCheck_(..) -- ** Text boundaries , GraphemeClusterBreak_(..) , LineBreak_(..) , SentenceBreak_(..) , WordBreak_(..) -- * Property value types , BlockCode(..) , Direction(..) , Decomposition(..) , EastAsianWidth(..) , GeneralCategory(..) , HangulSyllableType(..) , JoiningGroup(..) , JoiningType(..) , NumericType(..) -- ** Text boundaries , GraphemeClusterBreak(..) , LineBreak(..) , SentenceBreak(..) , WordBreak(..) -- * Functions , blockCode , charFullName , charName , charFromFullName , charFromName , combiningClass , direction , property , isoComment , isMirrored , mirror -- ** Conversion to numbers , digitToInt , numericValue ) where #include import Data.Char (chr, ord) import Data.Int (Int32) import Data.Text.ICU.Error (u_INVALID_CHAR_FOUND) import Data.Text.ICU.Error.Internal (UErrorCode, handleOverflowError, withError) import Data.Text.ICU.Internal (UBool, UChar32, asBool) import Data.Text.ICU.Normalize.Internal (toNCR) import Data.Typeable (Typeable) import Data.Word (Word8) import Foreign.C.String (CString, peekCStringLen, withCString) import Foreign.C.Types (CInt(..)) import Foreign.Ptr (Ptr) import System.IO.Unsafe (unsafePerformIO) -- $properties -- -- The 'property' function provides the main view onto the Unicode Character -- Database. Because Unicode character properties have a variety of types, -- the 'property' function is polymorphic. The type of its first argument -- dictates the type of its result, by use of the 'Property' typeclass. -- -- For instance, @'property' 'Alphabetic'@ returns a 'Bool', while @'property' -- 'NFCQuickCheck'@ returns a @'Maybe' 'Bool'@. -- | The language directional property of a character set. data Direction = LeftToRight | RightToLeft | EuropeanNumber | EuropeanNumberSeparator | EuropeanNumberTerminator | ArabicNumber | CommonNumberSeparator | BlockSeparator | SegmentSeparator | WhiteSpaceNeutral | OtherNeutral | LeftToRightEmbedding | LeftToRightOverride | RightToLeftArabic | RightToLeftEmbedding | RightToLeftOverride | PopDirectionalFormat | DirNonSpacingMark | BoundaryNeutral deriving (Eq, Enum, Show, Typeable) -- | Descriptions of Unicode blocks. data BlockCode = NoBlock | BasicLatin | Latin1Supplement | LatinExtendedA | LatinExtendedB | IPAExtensions | SpacingModifierLetters | CombiningDiacriticalMarks | GreekAndCoptic | Cyrillic | Armenian | Hebrew | Arabic | Syriac | Thaana | Devanagari | Bengali | Gurmukhi | Gujarati | Oriya | Tamil | Telugu | Kannada | Malayalam | Sinhala | Thai | Lao | Tibetan | Myanmar | Georgian | HangulJamo | Ethiopic | Cherokee | UnifiedCanadianAboriginalSyllabics | Ogham | Runic | Khmer | Mongolian | LatinExtendedAdditional | GreekExtended | GeneralPunctuation | SuperscriptsAndSubscripts | CurrencySymbols | CombiningDiacriticalMarksForSymbols | LetterlikeSymbols | NumberForms | Arrows | MathematicalOperators | MiscellaneousTechnical | ControlPictures | OpticalCharacterRecognition | EnclosedAlphanumerics | BoxDrawing | BlockElements | GeometricShapes | MiscellaneousSymbols | Dingbats | BraillePatterns | CJKRadicalsSupplement | KangxiRadicals | IdeographicDescriptionCharacters | CJKSymbolsAndPunctuation | Hiragana | Katakana | Bopomofo | HangulCompatibilityJamo | Kanbun | BopomofoExtended | EnclosedCJKLettersAndMonths | CJKCompatibility | CJKUnifiedIdeographsExtensionA | CJKUnifiedIdeographs | YiSyllables | YiRadicals | HangulSyllables | HighSurrogates | HighPrivateUseSurrogates | LowSurrogates | PrivateUseArea | CJKCompatibilityIdeographs | AlphabeticPresentationForms | ArabicPresentationFormsA | CombiningHalfMarks | CJKCompatibilityForms | SmallFormVariants | ArabicPresentationFormsB | Specials | HalfwidthAndFullwidthForms | OldItalic | Gothic | Deseret | ByzantineMusicalSymbols | MusicalSymbols | MathematicalAlphanumericSymbols | CJKUnifiedIdeographsExtensionB | CJKCompatibilityIdeographsSupplement | Tags | CyrillicSupplement | Tagalog | Hanunoo | Buhid | Tagbanwa | MiscellaneousMathematicalSymbolsA | SupplementalArrowsA | SupplementalArrowsB | MiscellaneousMathematicalSymbolsB | SupplementalMathematicalOperators | KatakanaPhoneticExtensions | VariationSelectors | SupplementaryPrivateUseAreaA | SupplementaryPrivateUseAreaB | Limbu | TaiLe | KhmerSymbols | PhoneticExtensions | MiscellaneousSymbolsAndArrows | YijingHexagramSymbols | LinearBSyllabary | LinearBIdeograms | AegeanNumbers | Ugaritic | Shavian | Osmanya | CypriotSyllabary | TaiXuanJingSymbols | VariationSelectorsSupplement | AncientGreekMusicalNotation | AncientGreekNumbers | ArabicSupplement | Buginese | CJKStrokes | CombiningDiacriticalMarksSupplement | Coptic | EthiopicExtended | EthiopicSupplement | GeorgianSupplement | Glagolitic | Kharoshthi | ModifierToneLetters | NewTaiLue | OldPersian | PhoneticExtensionsSupplement | SupplementalPunctuation | SylotiNagri | Tifinagh | VerticalForms | N'Ko | Balinese | LatinExtendedC | LatinExtendedD | PhagsPa | Phoenician | Cuneiform | CuneiformNumbersAndPunctuation | CountingRodNumerals | Sundanese | Lepcha | OlChiki | CyrillicExtendedA | Vai | CyrillicExtendedB | Saurashtra | KayahLi | Rejang | Cham | AncientSymbols | PhaistosDisc | Lycian | Carian | Lydian | MahjongTiles | DominoTiles deriving (Eq, Enum, Show, Typeable) data Bool_ = Alphabetic | ASCIIHexDigit -- ^ 0-9, A-F, a-f | BidiControl -- ^ Format controls which have specific functions in the Bidi Algorithm. | BidiMirrored -- ^ Characters that may change display in RTL text. | Dash -- ^ Variations of dashes. | DefaultIgnorable -- ^ Ignorable in most processing. | Deprecated -- ^ The usage of deprecated characters is strongly discouraged. | Diacritic -- ^ Characters that linguistically modify the meaning of another -- character to which they apply. | Extender -- ^ Extend the value or shape of a preceding alphabetic character, -- e.g. length and iteration marks. | FullCompositionExclusion | GraphemeBase -- ^ For programmatic determination of grapheme cluster boundaries. | GraphemeExtend -- ^ For programmatic determination of grapheme cluster boundaries. | GraphemeLink -- ^ For programmatic determination of grapheme cluster boundaries. | HexDigit -- ^ Characters commonly used for hexadecimal numbers. | Hyphen -- ^ Dashes used to mark connections between pieces of words, plus the -- Katakana middle dot. | IDContinue -- ^ Characters that can continue an identifier. | IDStart -- ^ Characters that can start an identifier. | Ideographic -- ^ CJKV ideographs. | IDSBinaryOperator -- ^ For programmatic determination of Ideographic Description Sequences. | IDSTrinaryOperator | JoinControl -- ^ Format controls for cursive joining and ligation. | LogicalOrderException -- ^ Characters that do not use logical order and require special handling -- in most processing. | Lowercase | Math | NonCharacter -- ^ Code points that are explicitly defined as illegal for the encoding -- of characters. | QuotationMark | Radical -- ^ For programmatic determination of Ideographic Description Sequences. | SoftDotted -- ^ Characters with a "soft dot", like i or j. An accent placed on these -- characters causes the dot to disappear. | TerminalPunctuation -- ^ Punctuation characters that generally mark the end of textual units. | UnifiedIdeograph -- ^ For programmatic determination of Ideographic Description Sequences. | Uppercase | WhiteSpace | XidContinue -- ^ 'IDContinue' modified to allow closure under normalization forms -- NFKC and NFKD. | XidStart -- ^ 'IDStart' modified to allow closure under normalization forms NFKC -- and NFKD. | CaseSensitive -- ^ Either the source of a case mapping or /in/ the target of a case -- mapping. Not the same as the general category @Cased_Letter@. | STerm -- ^ Sentence Terminal. Used in UAX #29: Text Boundaries -- . | VariationSelector -- ^ Indicates all those characters that qualify as Variation -- Selectors. For details on the behavior of these characters, see -- and 15.6 -- Variation Selectors. | NFDInert -- ^ ICU-specific property for characters that are inert under NFD, i.e. -- they do not interact with adjacent characters. Used for example in -- normalizing transforms in incremental mode to find the boundary of -- safely normalizable text despite possible text additions. | NFKDInert -- ^ ICU-specific property for characters that are inert under NFKD, i.e. -- they do not interact with adjacent characters. | NFCInert -- ^ ICU-specific property for characters that are inert under NFC, -- i.e. they do not interact with adjacent characters. | NFKCInert -- ^ ICU-specific property for characters that are inert under NFKC, -- i.e. they do not interact with adjacent characters. | SegmentStarter -- ^ ICU-specific property for characters that are starters in terms of -- Unicode normalization and combining character sequences. | PatternSyntax -- ^ See UAX #31 Identifier and Pattern Syntax -- . | PatternWhiteSpace -- ^ See UAX #31 Identifier and Pattern Syntax -- . | POSIXAlNum -- ^ Alphanumeric character class. | POSIXBlank -- ^ Blank character class. | POSIXGraph -- ^ Graph character class. | POSIXPrint -- ^ Printable character class. | POSIXXDigit -- ^ Hex digit character class. deriving (Eq, Enum, Show, Typeable) class Property p v | p -> v where fromNative :: p -> Int32 -> v toUProperty :: p -> UProperty data BidiClass_ = BidiClass deriving (Show, Typeable) instance Property BidiClass_ Direction where fromNative _ = toEnum . fromIntegral toUProperty _ = (#const UCHAR_BIDI_CLASS) data Block_ = Block instance Property Block_ BlockCode where fromNative _ = toEnum . fromIntegral toUProperty _ = (#const UCHAR_BLOCK) data CanonicalCombiningClass_ = CanonicalCombiningClass deriving (Show,Typeable) instance Property CanonicalCombiningClass_ Int where fromNative _ = fromIntegral toUProperty _ = (#const UCHAR_CANONICAL_COMBINING_CLASS) data Decomposition_ = Decomposition deriving (Show, Typeable) data Decomposition = Canonical | Compat | Circle | Final | Font | Fraction | Initial | Isolated | Medial | Narrow | NoBreak | Small | Square | Sub | Super | Vertical | Wide | Count deriving (Eq, Enum, Show, Typeable) instance Property Decomposition_ (Maybe Decomposition) where fromNative _ = maybeEnum toUProperty _ = (#const UCHAR_DECOMPOSITION_TYPE) data EastAsianWidth_ = EastAsianWidth deriving (Show, Typeable) data EastAsianWidth = EANeutral | EAAmbiguous | EAHalf | EAFull | EANarrow | EAWide | EACount deriving (Eq, Enum, Show, Typeable) instance Property EastAsianWidth_ EastAsianWidth where fromNative _ = toEnum . fromIntegral toUProperty _ = (#const UCHAR_EAST_ASIAN_WIDTH) instance Property Bool_ Bool where fromNative _ = (/=0) toUProperty = fromIntegral . fromEnum data GeneralCategory_ = GeneralCategory deriving (Show, Typeable) data GeneralCategory = GeneralOtherType | UppercaseLetter | LowercaseLetter | TitlecaseLetter | ModifierLetter | OtherLetter | NonSpacingMark | EnclosingMark | CombiningSpacingMark | DecimalDigitNumber | LetterNumber | OtherNumber | SpaceSeparator | LineSeparator | ParagraphSeparator | ControlChar | FormatChar | PrivateUseChar | Surrogate | DashPunctuation | StartPunctuation | EndPunctuation | ConnectorPunctuation | OtherPunctuation | MathSymbol | CurrencySymbol | ModifierSymbol | OtherSymbol | InitialPunctuation | FinalPunctuation deriving (Eq, Enum, Show, Typeable) instance Property GeneralCategory_ GeneralCategory where fromNative _ = toEnum . fromIntegral toUProperty _ = (#const UCHAR_GENERAL_CATEGORY) data JoiningGroup_ = JoiningGroup deriving (Show, Typeable) maybeEnum :: Enum a => Int32 -> Maybe a maybeEnum 0 = Nothing maybeEnum n = Just $! toEnum (fromIntegral n-1) data JoiningGroup = Ain | Alaph | Alef | Beh | Beth | Dal | DalathRish | E | Feh | FinalSemkath | Gaf | Gamal | Hah | HamzaOnHehGoal | He | Heh | HehGoal | Heth | Kaf | Kaph | KnottedHeh | Lam | Lamadh | Meem | Mim | Noon | Nun | Pe | Qaf | Qaph | Reh | ReversedPe | Sad | Sadhe | Seen | Semkath | Shin | SwashKaf | SyriacWaw | Tah | Taw | TehMarbuta | Teth | Waw | Yeh | YehBarree | YehWithTail | Yudh | YudhHe | Zain | Fe | Khaph | Zhain | BurushaskiYehBarree deriving (Eq, Enum, Show, Typeable) instance Property JoiningGroup_ (Maybe JoiningGroup) where fromNative _ = maybeEnum toUProperty _ = (#const UCHAR_JOINING_GROUP) data JoiningType_ = JoiningType deriving (Show, Typeable) data JoiningType = JoinCausing | DualJoining | LeftJoining | RightJoining | Transparent deriving (Eq, Enum, Show, Typeable) instance Property JoiningType_ (Maybe JoiningType) where fromNative _ = maybeEnum toUProperty _ = (#const UCHAR_JOINING_TYPE) data LineBreak_ = LineBreak deriving (Show, Typeable) data LineBreak = Ambiguous | LBAlphabetic | BreakBoth | BreakAfter | BreakBefore | MandatoryBreak | ContingentBreak | ClosePunctuation | CombiningMark | CarriageReturn | Exclamation | Glue | LBHyphen | LBIdeographic | Inseparable | InfixNumeric | LineFeed | Nonstarter | Numeric | OpenPunctuation | PostfixNumeric | PrefixNumeric | Quotation | ComplexContext | LBSurrogate | Space | BreakSymbols | Zwspace | NextLine | WordJoiner | H2 | H3 | JL | JT | JV deriving (Eq, Enum, Show, Typeable) instance Property LineBreak_ (Maybe LineBreak) where fromNative _ = maybeEnum toUProperty _ = (#const UCHAR_LINE_BREAK) data NumericType_ = NumericType deriving (Show, Typeable) data NumericType = NTDecimal | NTDigit | NTNumeric deriving (Eq, Enum, Show, Typeable) instance Property NumericType_ (Maybe NumericType) where fromNative _ = maybeEnum toUProperty _ = (#const UCHAR_NUMERIC_TYPE) data HangulSyllableType_ = HangulSyllableType deriving (Show, Typeable) data HangulSyllableType = LeadingJamo | VowelJamo | TrailingJamo | LVSyllable | LVTSyllable deriving (Eq, Enum, Show, Typeable) instance Property HangulSyllableType_ (Maybe HangulSyllableType) where fromNative _ = maybeEnum toUProperty _ = (#const UCHAR_HANGUL_SYLLABLE_TYPE) data NFCQuickCheck_ = NFCQuickCheck deriving (Show, Typeable) data NFDQuickCheck_ = NFDQuickCheck deriving (Show, Typeable) data NFKCQuickCheck_ = NFKCQuickCheck deriving (Show, Typeable) data NFKDQuickCheck_ = NFKDQuickCheck deriving (Show, Typeable) instance Property NFCQuickCheck_ (Maybe Bool) where fromNative _ = toNCR . fromIntegral toUProperty _ = (#const UCHAR_NFC_QUICK_CHECK) instance Property NFDQuickCheck_ (Maybe Bool) where fromNative _ = toNCR . fromIntegral toUProperty _ = (#const UCHAR_NFD_QUICK_CHECK) instance Property NFKCQuickCheck_ (Maybe Bool) where fromNative _ = toNCR . fromIntegral toUProperty _ = (#const UCHAR_NFKC_QUICK_CHECK) instance Property NFKDQuickCheck_ (Maybe Bool) where fromNative _ = toNCR . fromIntegral toUProperty _ = (#const UCHAR_NFKD_QUICK_CHECK) data LeadCanonicalCombiningClass_ = LeadCanonicalCombiningClass deriving (Show, Typeable) instance Property LeadCanonicalCombiningClass_ Int where fromNative _ = fromIntegral toUProperty _ = (#const UCHAR_LEAD_CANONICAL_COMBINING_CLASS) data TrailingCanonicalCombiningClass_ = TrailingCanonicalCombiningClass deriving (Show, Typeable) instance Property TrailingCanonicalCombiningClass_ Int where fromNative _ = fromIntegral toUProperty _ = (#const UCHAR_TRAIL_CANONICAL_COMBINING_CLASS) data GraphemeClusterBreak_ = GraphemeClusterBreak deriving (Show, Typeable) data GraphemeClusterBreak = Control | CR | Extend | L | LF | LV | LVT | T | V | SpacingMark | Prepend deriving (Eq, Enum, Show, Typeable) instance Property GraphemeClusterBreak_ (Maybe GraphemeClusterBreak) where fromNative _ = maybeEnum toUProperty _ = (#const UCHAR_GRAPHEME_CLUSTER_BREAK) data SentenceBreak_ = SentenceBreak deriving (Show, Typeable) data SentenceBreak = SBATerm | SBClose | SBFormat | SBLower | SBNumeric | SBOLetter | SBSep | SBSP | SBSTerm | SBUpper | SBCR | SBExtend | SBLF | SBSContinue deriving (Eq, Enum, Show, Typeable) instance Property SentenceBreak_ (Maybe SentenceBreak) where fromNative _ = maybeEnum toUProperty _ = (#const UCHAR_SENTENCE_BREAK) data WordBreak_ = WordBreak deriving (Show, Typeable) data WordBreak = WBALetter | WBFormat | WBKatakana | WBMidLetter | WBMidNum | WBNumeric | WBExtendNumLet | WBCR | WBExtend | WBLF | WBMidNumLet | WBNewline deriving (Eq, Enum, Show, Typeable) instance Property WordBreak_ (Maybe WordBreak) where fromNative _ = maybeEnum toUProperty _ = (#const UCHAR_WORD_BREAK) property :: Property p v => p -> Char -> v property p c = fromNative p . u_getIntPropertyValue (fromIntegral (ord c)) . toUProperty $ p {-# INLINE property #-} -- | Return the Unicode allocation block that contains the given -- character. blockCode :: Char -> BlockCode blockCode = toEnum . fromIntegral . ublock_getCode . fromIntegral . ord {-# INLINE blockCode #-} -- | Return the bidirectional category value for the code point, -- which is used in the Unicode bidirectional algorithm (UAX #9 -- ). direction :: Char -> Direction direction = toEnum . fromIntegral . u_charDirection . fromIntegral . ord {-# INLINE direction #-} -- | Determine whether the code point has the 'BidiMirrored' property. This -- property is set for characters that are commonly used in Right-To-Left -- contexts and need to be displayed with a "mirrored" glyph. isMirrored :: Char -> Bool isMirrored = asBool . u_isMirrored . fromIntegral . ord {-# INLINE isMirrored #-} -- Map the specified character to a "mirror-image" character. -- -- For characters with the 'BidiMirrored' property, implementations -- sometimes need a "poor man's" mapping to another Unicode (code point) -- such that the default glyph may serve as the mirror image of the default -- glyph of the specified character. This is useful for text conversion to -- and from codepages with visual order, and for displays without glyph -- selection capabilities. -- -- The return value is another Unicode code point that may serve as a -- mirror-image substitute, or the original character itself if there -- is no such mapping or the character lacks the 'BidiMirrored' -- property. mirror :: Char -> Char mirror = chr . fromIntegral . u_charMirror . fromIntegral . ord {-# INLINE mirror #-} combiningClass :: Char -> Int combiningClass = fromIntegral . u_getCombiningClass . fromIntegral . ord {-# INLINE combiningClass #-} -- | Return the decimal digit value of a decimal digit character. -- Such characters have the general category @Nd@ (decimal digit -- numbers) and a 'NumericType' of 'NTDecimal'. -- -- No digit values are returned for any Han characters, because Han -- number characters are often used with a special Chinese-style -- number format (with characters for powers of 10 in between) instead -- of in decimal-positional notation. Unicode 4 explicitly assigns -- Han number characters a 'NumericType' of 'NTNumeric' instead of -- 'NTDecimal'. digitToInt :: Char -> Maybe Int digitToInt c | i == -1 = Nothing | otherwise = Just $! fromIntegral i where i = u_charDigitValue . fromIntegral . ord $ c -- | Return the numeric value for a Unicode code point as defined in the -- Unicode Character Database. -- -- A 'Double' return type is necessary because some numeric values are -- fractions, negative, or too large to fit in a fixed-width integral type. numericValue :: Char -> Maybe Double numericValue c | v == (#const U_NO_NUMERIC_VALUE) = Nothing | otherwise = Just v where v = u_getNumericValue . fromIntegral . ord $ c -- | Return the name of a Unicode character. -- -- The names of all unassigned characters are empty. -- -- The name contains only "invariant" characters like A-Z, 0-9, space, -- and \'-\'. charName :: Char -> String charName = charName' (#const U_UNICODE_CHAR_NAME) -- | Return the full name of a Unicode character. -- -- Compared to 'charName', this function gives each Unicode code point -- a unique extended name. Extended names are lowercase followed by an -- uppercase hexadecimal number, within angle brackets. charFullName :: Char -> String charFullName = charName' (#const U_EXTENDED_CHAR_NAME) -- | Find a Unicode character by its full name, and return its code -- point value. -- -- The name is matched exactly and completely. -- -- A Unicode 1.0 name is matched only if it differs from the modern -- name. Unicode names are all uppercase. charFromName :: String -> Maybe Char charFromName = charFromName' (#const U_UNICODE_CHAR_NAME) -- | Find a Unicode character by its full or extended name, and return -- its code point value. -- -- The name is matched exactly and completely. -- -- A Unicode 1.0 name is matched only if it differs from the modern -- name. -- -- Compared to 'charFromName', this function gives each Unicode code -- point a unique extended name. Extended names are lowercase followed -- by an uppercase hexadecimal number, within angle brackets. charFromFullName :: String -> Maybe Char charFromFullName = charFromName' (#const U_EXTENDED_CHAR_NAME) charFromName' :: UCharNameChoice -> String -> Maybe Char charFromName' choice name = unsafePerformIO . withCString name $ \ptr -> do (err,r) <- withError $ u_charFromName choice ptr return $! if err == u_INVALID_CHAR_FOUND || r == 0xffff then Nothing else Just $! chr (fromIntegral r) -- | Return the ISO 10646 comment for a character. -- -- If a character does not have an associated comment, the empty -- string is returned. -- -- The ISO 10646 comment is an informative field in the Unicode -- Character Database (@UnicodeData.txt@ field 11) and is from the ISO -- 10646 names list. isoComment :: Char -> String isoComment c = fillString $ u_getISOComment (fromIntegral (ord c)) charName' :: UCharNameChoice -> Char -> String charName' choice c = fillString $ u_charName (fromIntegral (ord c)) choice fillString :: (CString -> Int32 -> Ptr UErrorCode -> IO Int32) -> String fillString act = unsafePerformIO $ handleOverflowError 83 act (curry peekCStringLen) type UBlockCode = CInt type UCharDirection = CInt type UCharNameChoice = CInt type UProperty = CInt foreign import ccall unsafe "hs_text_icu.h __hs_ublock_getCode" ublock_getCode :: UChar32 -> UBlockCode foreign import ccall unsafe "hs_text_icu.h __hs_u_charDirection" u_charDirection :: UChar32 -> UCharDirection foreign import ccall unsafe "hs_text_icu.h __hs_u_isMirrored" u_isMirrored :: UChar32 -> UBool foreign import ccall unsafe "hs_text_icu.h __hs_u_charMirror" u_charMirror :: UChar32 -> UChar32 foreign import ccall unsafe "hs_text_icu.h __hs_u_getCombiningClass" u_getCombiningClass :: UChar32 -> Word8 foreign import ccall unsafe "hs_text_icu.h __hs_u_charDigitValue" u_charDigitValue :: UChar32 -> Int32 foreign import ccall unsafe "hs_text_icu.h __hs_u_charName" u_charName :: UChar32 -> UCharNameChoice -> CString -> Int32 -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_u_charFromName" u_charFromName :: UCharNameChoice -> CString -> Ptr UErrorCode -> IO UChar32 foreign import ccall unsafe "hs_text_icu.h __hs_u_getISOComment" u_getISOComment :: UChar32 -> CString -> Int32 -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_u_getIntPropertyValue" u_getIntPropertyValue :: UChar32 -> UProperty -> Int32 foreign import ccall unsafe "hs_text_icu.h __hs_u_getNumericValue" u_getNumericValue :: UChar32 -> Double text-icu-0.6.3.7/Data/Text/ICU/Collate.hsc0000644000000000000000000003221012234226032016106 0ustar0000000000000000{-# LANGUAGE CPP, DeriveDataTypeable, ForeignFunctionInterface #-} -- | -- Module : Data.Text.ICU.Collate -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- String collation functions for Unicode, implemented as bindings to -- the International Components for Unicode (ICU) libraries. module Data.Text.ICU.Collate ( -- * Unicode collation API -- $api -- * Types MCollator , Attribute(..) , AlternateHandling(..) , CaseFirst(..) , Strength(..) -- * Functions , open , collate , collateIter -- ** Utility functions , equals , getAttribute , setAttribute , sortKey , clone , freeze ) where #include import Data.ByteString (empty) import Data.ByteString.Internal (ByteString(..), create, mallocByteString, memcpy) import Data.Int (Int32) import Data.Text (Text) import qualified Data.Text as T import Data.Text.Foreign (useAsPtr) import Data.Text.ICU.Collate.Internal (Collator(..), MCollator, UCollator, equals, withCollator, wrap) import Data.Text.ICU.Error.Internal (UErrorCode, handleError) import Data.Text.ICU.Internal (LocaleName, UChar, CharIterator, UCharIterator, asOrdering, withCharIterator, withLocaleName) import Data.Typeable (Typeable) import Data.Word (Word8) import Foreign.C.String (CString) import Foreign.C.Types (CInt(..)) import Foreign.ForeignPtr (withForeignPtr) import Foreign.Marshal.Utils (with) import Foreign.Ptr (Ptr, nullPtr) -- $api -- -- | Control the handling of variable weight elements. data AlternateHandling = NonIgnorable -- ^ Treat all codepoints with non-ignorable primary -- weights in the same way. | Shifted -- ^ Cause codepoints with primary weights that are -- equal to or below the variable top value to be -- ignored on primary level and moved to the -- quaternary level. deriving (Eq, Bounded, Enum, Show, Typeable) -- | Control the ordering of upper and lower case letters. data CaseFirst = UpperFirst -- ^ Force upper case letters to sort before -- lower case. | LowerFirst -- ^ Force lower case letters to sort before -- upper case. deriving (Eq, Bounded, Enum, Show, Typeable) -- | The strength attribute. The usual strength for most locales (except -- Japanese) is tertiary. Quaternary strength is useful when combined with -- shifted setting for alternate handling attribute and for JIS x 4061 -- collation, when it is used to distinguish between Katakana and Hiragana -- (this is achieved by setting 'HiraganaQuaternaryMode' mode to -- 'True'). Otherwise, quaternary level is affected only by the number of -- non ignorable code points in the string. Identical strength is rarely -- useful, as it amounts to codepoints of the 'NFD' form of the string. data Strength = Primary | Secondary | Tertiary | Quaternary | Identical deriving (Eq, Bounded, Enum, Show, Typeable) data Attribute = French Bool -- ^ Direction of secondary weights, used in French. 'True', -- results in secondary weights being considered backwards, -- while 'False' treats secondary weights in the order in -- which they appear. | AlternateHandling AlternateHandling -- ^ For handling variable elements. 'NonIgnorable' is -- default. | CaseFirst (Maybe CaseFirst) -- ^ Control the ordering of upper and lower case letters. -- 'Nothing' (the default) orders upper and lower case -- letters in accordance to their tertiary weights. | CaseLevel Bool -- ^ Controls whether an extra case level (positioned -- before the third level) is generated or not. When -- 'False' (default), case level is not generated; when -- 'True', the case level is generated. Contents of the -- case level are affected by the value of the 'CaseFirst' -- attribute. A simple way to ignore accent differences in -- a string is to set the strength to 'Primary' and enable -- case level. | NormalizationMode Bool -- ^ Controls whether the normalization check and necessary -- normalizations are performed. When 'False' (default) no -- normalization check is performed. The correctness of the -- result is guaranteed only if the input data is in -- so-called 'FCD' form (see users manual for more info). -- When 'True', an incremental check is performed to see -- whether the input data is in 'FCD' form. If the data is -- not in 'FCD' form, incremental 'NFD' normalization is -- performed. | Strength Strength | HiraganaQuaternaryMode Bool -- ^ When turned on, this attribute positions Hiragana -- before all non-ignorables on quaternary level. This is a -- sneaky way to produce JIS sort order. | Numeric Bool -- ^ When enabled, this attribute generates a collation key -- for the numeric value of substrings of digits. This is -- a way to get '100' to sort /after/ '2'. deriving (Eq, Show, Typeable) type UColAttribute = CInt type UColAttributeValue = CInt toUAttribute :: Attribute -> (UColAttribute, UColAttributeValue) toUAttribute (French v) = ((#const UCOL_FRENCH_COLLATION), toOO v) toUAttribute (AlternateHandling v) = ((#const UCOL_ALTERNATE_HANDLING), toAH v) toUAttribute (CaseFirst v) = ((#const UCOL_CASE_FIRST), toCF v) toUAttribute (CaseLevel v) = ((#const UCOL_CASE_LEVEL), toOO v) toUAttribute (NormalizationMode v) = ((#const UCOL_NORMALIZATION_MODE), toOO v) toUAttribute (Strength v) = ((#const UCOL_STRENGTH), toS v) toUAttribute (HiraganaQuaternaryMode v) = ((#const UCOL_HIRAGANA_QUATERNARY_MODE), toOO v) toUAttribute (Numeric v) = ((#const UCOL_NUMERIC_COLLATION), toOO v) toOO :: Bool -> UColAttributeValue toOO False = #const UCOL_OFF toOO True = #const UCOL_ON toAH :: AlternateHandling -> UColAttributeValue toAH NonIgnorable = #const UCOL_NON_IGNORABLE toAH Shifted = #const UCOL_SHIFTED toCF :: Maybe CaseFirst -> UColAttributeValue toCF Nothing = #const UCOL_OFF toCF (Just UpperFirst) = #const UCOL_UPPER_FIRST toCF (Just LowerFirst) = #const UCOL_LOWER_FIRST toS :: Strength -> UColAttributeValue toS Primary = #const UCOL_PRIMARY toS Secondary = #const UCOL_SECONDARY toS Tertiary = #const UCOL_TERTIARY toS Quaternary = #const UCOL_QUATERNARY toS Identical = #const UCOL_IDENTICAL fromOO :: UColAttributeValue -> Bool fromOO (#const UCOL_OFF) = False fromOO (#const UCOL_ON) = True fromOO bad = valueError "fromOO" bad fromAH :: UColAttributeValue -> AlternateHandling fromAH (#const UCOL_NON_IGNORABLE) = NonIgnorable fromAH (#const UCOL_SHIFTED) = Shifted fromAH bad = valueError "fromAH" bad fromCF :: UColAttributeValue -> Maybe CaseFirst fromCF (#const UCOL_OFF) = Nothing fromCF (#const UCOL_UPPER_FIRST) = Just UpperFirst fromCF (#const UCOL_LOWER_FIRST) = Just LowerFirst fromCF bad = valueError "fromCF" bad fromS :: UColAttributeValue -> Strength fromS (#const UCOL_PRIMARY) = Primary fromS (#const UCOL_SECONDARY) = Secondary fromS (#const UCOL_TERTIARY) = Tertiary fromS (#const UCOL_QUATERNARY) = Quaternary fromS (#const UCOL_IDENTICAL) = Identical fromS bad = valueError "fromS" bad fromUAttribute :: UColAttribute -> UColAttributeValue -> Attribute fromUAttribute key val = case key of (#const UCOL_FRENCH_COLLATION) -> French (fromOO val) (#const UCOL_ALTERNATE_HANDLING) -> AlternateHandling (fromAH val) (#const UCOL_CASE_FIRST) -> CaseFirst (fromCF val) (#const UCOL_CASE_LEVEL) -> CaseLevel (fromOO val) (#const UCOL_NORMALIZATION_MODE) -> NormalizationMode (fromOO val) (#const UCOL_STRENGTH) -> Strength (fromS val) (#const UCOL_HIRAGANA_QUATERNARY_MODE) -> HiraganaQuaternaryMode (fromOO val) (#const UCOL_NUMERIC_COLLATION) -> Numeric (fromOO val) _ -> valueError "fromUAttribute" key valueError :: Show a => String -> a -> z valueError func bad = error ("Data.Text.ICU.Collate.IO." ++ func ++ ": invalid value " ++ show bad) type UCollationResult = CInt -- | Open a 'Collator' for comparing strings. open :: LocaleName -- ^ The locale containing the required collation rules. -> IO MCollator open loc = wrap =<< withLocaleName loc (handleError . ucol_open) -- | Set the value of an 'MCollator' attribute. setAttribute :: MCollator -> Attribute -> IO () setAttribute c a = withCollator c $ \cptr -> handleError $ uncurry (ucol_setAttribute cptr) (toUAttribute a) -- | Get the value of an 'MCollator' attribute. -- -- It is safe to provide a dummy argument to an 'Attribute' constructor when -- using this function, so the following will work: -- -- > getAttribute mcol (NormalizationMode undefined) getAttribute :: MCollator -> Attribute -> IO Attribute getAttribute c a = do let name = fst (toUAttribute a) val <- withCollator c $ \cptr -> handleError $ ucol_getAttribute cptr name return $! fromUAttribute name val -- | Compare two strings. collate :: MCollator -> Text -> Text -> IO Ordering collate c a b = withCollator c $ \cptr -> useAsPtr a $ \aptr alen -> useAsPtr b $ \bptr blen -> fmap asOrdering . handleError $ ucol_strcoll cptr aptr (fromIntegral alen) bptr (fromIntegral blen) -- | Compare two 'CharIterator's. -- -- If either iterator was constructed from a 'ByteString', it does not need -- to be copied or converted internally, so this function can be quite -- cheap. collateIter :: MCollator -> CharIterator -> CharIterator -> IO Ordering collateIter c a b = fmap asOrdering . withCollator c $ \cptr -> withCharIterator a $ \ai -> withCharIterator b $ handleError . ucol_strcollIter cptr ai -- | Create a key for sorting the 'Text' using the given 'Collator'. -- The result of comparing two 'ByteString's that have been -- transformed with 'sortKey' will be the same as the result of -- 'collate' on the two untransformed 'Text's. sortKey :: MCollator -> Text -> IO ByteString sortKey c t | T.null t = return empty | otherwise = do withCollator c $ \cptr -> useAsPtr t $ \tptr tlen -> do let len = fromIntegral tlen loop n = do fp <- mallocByteString (fromIntegral n) i <- withForeignPtr fp $ \p -> ucol_getSortKey cptr tptr len p n let j = fromIntegral i case undefined of _ | i == 0 -> error "Data.Text.ICU.Collate.IO.sortKey: internal error" | i > n -> loop i | i <= n `div` 2 -> create j $ \p -> withForeignPtr fp $ \op -> memcpy p op (fromIntegral i) | otherwise -> return $! PS fp 0 j loop (min (len * 4) 8) -- | Make a safe copy of a mutable 'MCollator' for use in pure code. -- Subsequent changes to the 'MCollator' will not affect the state of -- the returned 'Collator'. freeze :: MCollator -> IO Collator freeze = fmap C . clone -- | Make a copy of a mutable 'MCollator'. -- Subsequent changes to the input 'MCollator' will not affect the state of -- the returned 'MCollator'. clone :: MCollator -> IO MCollator clone c = do p <- withCollator c $ \cptr -> with (#const U_COL_SAFECLONE_BUFFERSIZE) (handleError . ucol_safeClone cptr nullPtr) wrap p foreign import ccall unsafe "hs_text_icu.h __hs_ucol_open" ucol_open :: CString -> Ptr UErrorCode -> IO (Ptr UCollator) foreign import ccall unsafe "hs_text_icu.h __hs_ucol_getAttribute" ucol_getAttribute :: Ptr UCollator -> UColAttribute -> Ptr UErrorCode -> IO UColAttributeValue foreign import ccall unsafe "hs_text_icu.h __hs_ucol_setAttribute" ucol_setAttribute :: Ptr UCollator -> UColAttribute -> UColAttributeValue -> Ptr UErrorCode -> IO () foreign import ccall unsafe "hs_text_icu.h __hs_ucol_strcoll" ucol_strcoll :: Ptr UCollator -> Ptr UChar -> Int32 -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO UCollationResult foreign import ccall unsafe "hs_text_icu.h __hs_ucol_getSortKey" ucol_getSortKey :: Ptr UCollator -> Ptr UChar -> Int32 -> Ptr Word8 -> Int32 -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ucol_strcollIter" ucol_strcollIter :: Ptr UCollator -> Ptr UCharIterator -> Ptr UCharIterator -> Ptr UErrorCode -> IO UCollationResult foreign import ccall unsafe "hs_text_icu.h __hs_ucol_safeClone" ucol_safeClone :: Ptr UCollator -> Ptr a -> Ptr Int32 -> Ptr UErrorCode -> IO (Ptr UCollator) text-icu-0.6.3.7/Data/Text/ICU/Convert.hs0000644000000000000000000002266612234226032016016 0ustar0000000000000000{-# LANGUAGE ForeignFunctionInterface #-} -- | -- Module : Data.Text.ICU.Convert -- Copyright : (c) 2009, 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Character set conversion functions for Unicode, implemented as -- bindings to the International Components for Unicode (ICU) -- libraries. module Data.Text.ICU.Convert ( -- * Character set conversion Converter -- ** Basic functions , open , fromUnicode , toUnicode -- ** Converter metadata , getName , usesFallback , isAmbiguous -- * Functions for controlling global behavior , getDefaultName , setDefaultName -- * Miscellaneous functions , compareNames , aliases -- * Metadata , converterNames , standardNames ) where import Data.ByteString.Internal (ByteString, createAndTrim) import Data.ByteString.Unsafe (unsafeUseAsCStringLen) import Data.Int (Int32) import Data.Text (Text) import Data.Text.Foreign (fromPtr, lengthWord16, useAsPtr) import Data.Text.ICU.Convert.Internal import Data.Text.ICU.Error.Internal (UErrorCode, handleError) import Data.Word (Word16) import Foreign.C.String (CString, peekCString, withCString) import Foreign.C.Types (CInt(..)) import Foreign.ForeignPtr (newForeignPtr) import Foreign.Marshal.Array (allocaArray) import Foreign.Ptr (FunPtr, Ptr, castPtr) import System.IO.Unsafe (unsafePerformIO) import Data.Text.ICU.Internal (UBool, UChar, asBool, asOrdering, withName) -- | Do a fuzzy compare of two converter/alias names. The comparison -- is case-insensitive, ignores leading zeroes if they are not -- followed by further digits, and ignores all but letters and digits. -- Thus the strings @\"UTF-8\"@, @\"utf_8\"@, @\"u*T\@f08\"@ and -- @\"Utf 8\"@ are exactly equivalent. See section 1.4, Charset Alias -- Matching in Unicode Technical Standard #22 at -- compareNames :: String -> String -> Ordering compareNames a b = unsafePerformIO . withCString a $ \aptr -> fmap asOrdering . withCString b $ ucnv_compareNames aptr -- | Create a 'Converter' with the name of a coded character set -- specified as a string. The actual name will be resolved with the -- alias file using a case-insensitive string comparison that ignores -- leading zeroes and all non-alphanumeric characters. E.g., the -- names @\"UTF8\"@, @\"utf-8\"@, @\"u*T\@f08\"@ and @\"Utf 8\"@ are -- all equivalent (see also 'compareNames'). If an empty string is -- passed for the converter name, it will create one with the -- 'getDefaultName' return value. -- -- A converter name may contain options like a locale specification to -- control the specific behavior of the newly instantiated converter. -- The meaning of the options depends on the particular converter. If -- an option is not defined for or recognized by a given converter, -- then it is ignored. -- -- Options are appended to the converter name string, with a comma -- between the name and the first option and also between adjacent -- options. -- -- If the alias is ambiguous, then the preferred converter is used. -- -- The conversion behavior and names can vary between platforms. ICU -- may convert some characters differently from other -- platforms. Details on this topic are in the ICU User's Guide at -- . Aliases -- starting with a @\"cp\"@ prefix have no specific meaning other than -- its an alias starting with the letters @\"cp\"@. Please do not -- associate any meaning to these aliases. open :: String -- ^ Name of the converter to use. -> Maybe Bool -- ^ Whether to use fallback mappings -- (see 'usesFallback' for details). -> IO Converter open name mf = do c <- fmap Converter . newForeignPtr ucnv_close =<< withName name (handleError . ucnv_open) case mf of Just f -> withConverter c $ \p -> ucnv_setFallback p . fromIntegral . fromEnum $ f _ -> return () return c -- | Convert the Unicode string into a codepage string using the given -- converter. fromUnicode :: Converter -> Text -> ByteString fromUnicode cnv t = unsafePerformIO . useAsPtr t $ \tptr tlen -> withConverter cnv $ \cptr -> do let capacity = fromIntegral . max_bytes_for_string cptr . fromIntegral $ lengthWord16 t createAndTrim (fromIntegral capacity) $ \sptr -> fmap fromIntegral . handleError $ ucnv_fromUChars cptr (castPtr sptr) capacity tptr (fromIntegral tlen) -- | Convert the codepage string into a Unicode string using the given -- converter. toUnicode :: Converter -> ByteString -> Text toUnicode cnv bs = unsafePerformIO . unsafeUseAsCStringLen bs $ \(sptr, slen) -> withConverter cnv $ \cptr -> do let capacity = slen * 2 allocaArray capacity $ \tptr -> fromPtr tptr =<< (fmap fromIntegral . handleError $ ucnv_toUChars cptr tptr (fromIntegral capacity) sptr (fromIntegral slen)) -- | Determines whether the converter uses fallback mappings or not. -- This flag has restrictions. Regardless of this flag, the converter -- will always use fallbacks from Unicode Private Use code points, as -- well as reverse fallbacks (to Unicode). For details see \".ucm -- File Format\" in the Conversion Data chapter of the ICU User Guide: -- usesFallback :: Converter -> Bool usesFallback cnv = unsafePerformIO $ asBool `fmap` withConverter cnv ucnv_usesFallback -- | Returns the current default converter name. If you want to 'open' -- a default converter, you do not need to use this function. It is -- faster to pass the empty string to 'open' the default converter. getDefaultName :: IO String getDefaultName = peekCString =<< ucnv_getDefaultName -- | Indicates whether the converter contains ambiguous mappings of -- the same character or not. isAmbiguous :: Converter -> Bool isAmbiguous cnv = asBool . unsafePerformIO $ withConverter cnv ucnv_isAmbiguous -- | Sets the current default converter name. If this function needs -- to be called, it should be called during application -- initialization. Most of the time, the results from 'getDefaultName' -- or 'open' with an empty string argument is sufficient for your -- application. -- -- /Note/: this function is not thread safe. /Do not/ call this -- function when /any/ ICU function is being used from more than one -- thread! setDefaultName :: String -> IO () setDefaultName s = withCString s $ ucnv_setDefaultName -- | A list of the canonical names of all available converters. converterNames :: [String] {-# NOINLINE converterNames #-} converterNames = unsafePerformIO $ mapM ((peekCString =<<) . ucnv_getAvailableName) [0..ucnv_countAvailable-1] -- | The list of supported standard names. standardNames :: [String] {-# NOINLINE standardNames #-} standardNames = filter (not . null) . unsafePerformIO $ mapM ((peekCString =<<) . handleError . ucnv_getStandard) [0..ucnv_countStandards-1] -- | Return the aliases for a given converter or alias name. aliases :: String -> [String] aliases name = unsafePerformIO . withCString name $ \ptr -> do count <- handleError $ ucnv_countAliases ptr if count == 0 then return [] else mapM ((peekCString =<<) . handleError . ucnv_getAlias ptr) [0..count-1] foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_open" ucnv_open :: CString -> Ptr UErrorCode -> IO (Ptr UConverter) foreign import ccall unsafe "hs_text_icu.h &__hs_ucnv_close" ucnv_close :: FunPtr (Ptr UConverter -> IO ()) foreign import ccall unsafe "__get_max_bytes_for_string" max_bytes_for_string :: Ptr UConverter -> CInt -> CInt foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_toUChars" ucnv_toUChars :: Ptr UConverter -> Ptr UChar -> Int32 -> CString -> Int32 -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_fromUChars" ucnv_fromUChars :: Ptr UConverter -> CString -> Int32 -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_compareNames" ucnv_compareNames :: CString -> CString -> IO CInt foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_getDefaultName" ucnv_getDefaultName :: IO CString foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_setDefaultName" ucnv_setDefaultName :: CString -> IO () foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_countAvailable" ucnv_countAvailable :: Int32 foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_getAvailableName" ucnv_getAvailableName :: Int32 -> IO CString foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_countAliases" ucnv_countAliases :: CString -> Ptr UErrorCode -> IO Word16 foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_getAlias" ucnv_getAlias :: CString -> Word16 -> Ptr UErrorCode -> IO CString foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_countStandards" ucnv_countStandards :: Word16 foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_getStandard" ucnv_getStandard :: Word16 -> Ptr UErrorCode -> IO CString foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_usesFallback" ucnv_usesFallback :: Ptr UConverter -> IO UBool foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_setFallback" ucnv_setFallback :: Ptr UConverter -> UBool -> IO () foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_isAmbiguous" ucnv_isAmbiguous :: Ptr UConverter -> IO UBool text-icu-0.6.3.7/Data/Text/ICU/Error.hsc0000644000000000000000000002577112234226032015632 0ustar0000000000000000-- | -- Module : Data.Text.ICU.Error -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Errors thrown by bindings to the International Components for -- Unicode (ICU) libraries. -- -- Most ICU functions can throw an 'ICUError' value as an exception. -- Some can additionally throw a 'ParseError', if more detailed error -- information is necessary. module Data.Text.ICU.Error ( -- * Types ICUError, ParseError(errError, errLine, errOffset), -- * Functions isSuccess, isFailure, errorName, isRegexError, -- * Warnings u_USING_FALLBACK_WARNING, u_USING_DEFAULT_WARNING, u_SAFECLONE_ALLOCATED_WARNING, u_STATE_OLD_WARNING, u_STRING_NOT_TERMINATED_WARNING, u_SORT_KEY_TOO_SHORT_WARNING, u_AMBIGUOUS_ALIAS_WARNING, u_DIFFERENT_UCA_VERSION, -- * Errors u_ILLEGAL_ARGUMENT_ERROR, u_MISSING_RESOURCE_ERROR, u_INVALID_FORMAT_ERROR, u_FILE_ACCESS_ERROR, u_INTERNAL_PROGRAM_ERROR, u_MESSAGE_PARSE_ERROR, u_MEMORY_ALLOCATION_ERROR, u_INDEX_OUTOFBOUNDS_ERROR, u_PARSE_ERROR, u_INVALID_CHAR_FOUND, u_TRUNCATED_CHAR_FOUND, u_ILLEGAL_CHAR_FOUND, u_INVALID_TABLE_FORMAT, u_INVALID_TABLE_FILE, u_BUFFER_OVERFLOW_ERROR, u_UNSUPPORTED_ERROR, u_RESOURCE_TYPE_MISMATCH, u_ILLEGAL_ESCAPE_SEQUENCE, u_UNSUPPORTED_ESCAPE_SEQUENCE, u_NO_SPACE_AVAILABLE, u_CE_NOT_FOUND_ERROR, u_PRIMARY_TOO_LONG_ERROR, u_STATE_TOO_OLD_ERROR, u_TOO_MANY_ALIASES_ERROR, u_ENUM_OUT_OF_SYNC_ERROR, u_INVARIANT_CONVERSION_ERROR, u_INVALID_STATE_ERROR, u_COLLATOR_VERSION_MISMATCH, u_USELESS_COLLATOR_ERROR, u_NO_WRITE_PERMISSION, -- ** Transliterator errors u_BAD_VARIABLE_DEFINITION, u_MALFORMED_RULE, u_MALFORMED_SET, u_MALFORMED_UNICODE_ESCAPE, u_MALFORMED_VARIABLE_DEFINITION, u_MALFORMED_VARIABLE_REFERENCE, u_MISPLACED_CURSOR_OFFSET, u_MISPLACED_QUANTIFIER, u_MISSING_OPERATOR, u_MULTIPLE_ANTE_CONTEXTS, u_MULTIPLE_CURSORS, u_MULTIPLE_POST_CONTEXTS, u_TRAILING_BACKSLASH, u_UNDEFINED_SEGMENT_REFERENCE, u_UNDEFINED_VARIABLE, u_UNQUOTED_SPECIAL, u_UNTERMINATED_QUOTE, u_RULE_MASK_ERROR, u_MISPLACED_COMPOUND_FILTER, u_MULTIPLE_COMPOUND_FILTERS, u_INVALID_RBT_SYNTAX, u_MALFORMED_PRAGMA, u_UNCLOSED_SEGMENT, u_VARIABLE_RANGE_EXHAUSTED, u_VARIABLE_RANGE_OVERLAP, u_ILLEGAL_CHARACTER, u_INTERNAL_TRANSLITERATOR_ERROR, u_INVALID_ID, u_INVALID_FUNCTION, -- ** Formatting API parsing errors u_UNEXPECTED_TOKEN, u_MULTIPLE_DECIMAL_SEPARATORS, u_MULTIPLE_EXPONENTIAL_SYMBOLS, u_MALFORMED_EXPONENTIAL_PATTERN, u_MULTIPLE_PERCENT_SYMBOLS, u_MULTIPLE_PERMILL_SYMBOLS, u_MULTIPLE_PAD_SPECIFIERS, u_PATTERN_SYNTAX_ERROR, u_ILLEGAL_PAD_POSITION, u_UNMATCHED_BRACES, u_ARGUMENT_TYPE_MISMATCH, u_DUPLICATE_KEYWORD, u_UNDEFINED_KEYWORD, u_DEFAULT_KEYWORD_MISSING, -- ** Break iterator errors u_BRK_INTERNAL_ERROR, u_BRK_HEX_DIGITS_EXPECTED, u_BRK_SEMICOLON_EXPECTED, u_BRK_RULE_SYNTAX, u_BRK_UNCLOSED_SET, u_BRK_ASSIGN_ERROR, u_BRK_VARIABLE_REDFINITION, u_BRK_MISMATCHED_PAREN, u_BRK_NEW_LINE_IN_QUOTED_STRING, u_BRK_UNDEFINED_VARIABLE, u_BRK_INIT_ERROR, u_BRK_RULE_EMPTY_SET, u_BRK_UNRECOGNIZED_OPTION, u_BRK_MALFORMED_RULE_TAG, -- ** Regular expression errors u_REGEX_INTERNAL_ERROR, u_REGEX_RULE_SYNTAX, u_REGEX_INVALID_STATE, u_REGEX_BAD_ESCAPE_SEQUENCE, u_REGEX_PROPERTY_SYNTAX, u_REGEX_UNIMPLEMENTED, u_REGEX_MISMATCHED_PAREN, u_REGEX_NUMBER_TOO_BIG, u_REGEX_BAD_INTERVAL, u_REGEX_MAX_LT_MIN, u_REGEX_INVALID_BACK_REF, u_REGEX_INVALID_FLAG, u_REGEX_SET_CONTAINS_STRING, u_REGEX_OCTAL_TOO_BIG, u_REGEX_INVALID_RANGE, u_REGEX_STACK_OVERFLOW, u_REGEX_TIME_OUT, u_REGEX_STOPPED_BY_CALLER, -- ** IDNA errors u_IDNA_PROHIBITED_ERROR, u_IDNA_UNASSIGNED_ERROR, u_IDNA_CHECK_BIDI_ERROR, u_IDNA_STD3_ASCII_RULES_ERROR, u_IDNA_ACE_PREFIX_ERROR, u_IDNA_VERIFICATION_ERROR, u_IDNA_LABEL_TOO_LONG_ERROR, u_IDNA_ZERO_LENGTH_LABEL_ERROR, u_IDNA_DOMAIN_NAME_TOO_LONG_ERROR ) where #ifdef mingw32_HOST_OS #define U_HAVE_INTTYPES_H 1 #endif #include import Data.Text.ICU.Error.Internal #{enum ICUError, ICUError, u_USING_FALLBACK_WARNING = U_USING_FALLBACK_WARNING, u_USING_DEFAULT_WARNING = U_USING_DEFAULT_WARNING, u_SAFECLONE_ALLOCATED_WARNING = U_SAFECLONE_ALLOCATED_WARNING, u_STATE_OLD_WARNING = U_STATE_OLD_WARNING, u_STRING_NOT_TERMINATED_WARNING = U_STRING_NOT_TERMINATED_WARNING, u_SORT_KEY_TOO_SHORT_WARNING = U_SORT_KEY_TOO_SHORT_WARNING, u_AMBIGUOUS_ALIAS_WARNING = U_AMBIGUOUS_ALIAS_WARNING, u_DIFFERENT_UCA_VERSION = U_DIFFERENT_UCA_VERSION, u_ILLEGAL_ARGUMENT_ERROR = U_ILLEGAL_ARGUMENT_ERROR, u_MISSING_RESOURCE_ERROR = U_MISSING_RESOURCE_ERROR, u_INVALID_FORMAT_ERROR = U_INVALID_FORMAT_ERROR, u_FILE_ACCESS_ERROR = U_FILE_ACCESS_ERROR, u_INTERNAL_PROGRAM_ERROR = U_INTERNAL_PROGRAM_ERROR, u_MESSAGE_PARSE_ERROR = U_MESSAGE_PARSE_ERROR, u_MEMORY_ALLOCATION_ERROR = U_MEMORY_ALLOCATION_ERROR, u_INDEX_OUTOFBOUNDS_ERROR = U_INDEX_OUTOFBOUNDS_ERROR, u_PARSE_ERROR = U_PARSE_ERROR, u_INVALID_CHAR_FOUND = U_INVALID_CHAR_FOUND, u_TRUNCATED_CHAR_FOUND = U_TRUNCATED_CHAR_FOUND, u_ILLEGAL_CHAR_FOUND = U_ILLEGAL_CHAR_FOUND, u_INVALID_TABLE_FORMAT = U_INVALID_TABLE_FORMAT, u_INVALID_TABLE_FILE = U_INVALID_TABLE_FILE, u_BUFFER_OVERFLOW_ERROR = U_BUFFER_OVERFLOW_ERROR, u_UNSUPPORTED_ERROR = U_UNSUPPORTED_ERROR, u_RESOURCE_TYPE_MISMATCH = U_RESOURCE_TYPE_MISMATCH, u_ILLEGAL_ESCAPE_SEQUENCE = U_ILLEGAL_ESCAPE_SEQUENCE, u_UNSUPPORTED_ESCAPE_SEQUENCE = U_UNSUPPORTED_ESCAPE_SEQUENCE, u_NO_SPACE_AVAILABLE = U_NO_SPACE_AVAILABLE, u_CE_NOT_FOUND_ERROR = U_CE_NOT_FOUND_ERROR, u_PRIMARY_TOO_LONG_ERROR = U_PRIMARY_TOO_LONG_ERROR, u_STATE_TOO_OLD_ERROR = U_STATE_TOO_OLD_ERROR, u_TOO_MANY_ALIASES_ERROR = U_TOO_MANY_ALIASES_ERROR, u_ENUM_OUT_OF_SYNC_ERROR = U_ENUM_OUT_OF_SYNC_ERROR, u_INVARIANT_CONVERSION_ERROR = U_INVARIANT_CONVERSION_ERROR, u_INVALID_STATE_ERROR = U_INVALID_STATE_ERROR, u_COLLATOR_VERSION_MISMATCH = U_COLLATOR_VERSION_MISMATCH, u_USELESS_COLLATOR_ERROR = U_USELESS_COLLATOR_ERROR, u_NO_WRITE_PERMISSION = U_NO_WRITE_PERMISSION, u_BAD_VARIABLE_DEFINITION = U_BAD_VARIABLE_DEFINITION, u_MALFORMED_RULE = U_MALFORMED_RULE, u_MALFORMED_SET = U_MALFORMED_SET, u_MALFORMED_UNICODE_ESCAPE = U_MALFORMED_UNICODE_ESCAPE, u_MALFORMED_VARIABLE_DEFINITION = U_MALFORMED_VARIABLE_DEFINITION, u_MALFORMED_VARIABLE_REFERENCE = U_MALFORMED_VARIABLE_REFERENCE, u_MISPLACED_CURSOR_OFFSET = U_MISPLACED_CURSOR_OFFSET, u_MISPLACED_QUANTIFIER = U_MISPLACED_QUANTIFIER, u_MISSING_OPERATOR = U_MISSING_OPERATOR, u_MULTIPLE_ANTE_CONTEXTS = U_MULTIPLE_ANTE_CONTEXTS, u_MULTIPLE_CURSORS = U_MULTIPLE_CURSORS, u_MULTIPLE_POST_CONTEXTS = U_MULTIPLE_POST_CONTEXTS, u_TRAILING_BACKSLASH = U_TRAILING_BACKSLASH, u_UNDEFINED_SEGMENT_REFERENCE = U_UNDEFINED_SEGMENT_REFERENCE, u_UNDEFINED_VARIABLE = U_UNDEFINED_VARIABLE, u_UNQUOTED_SPECIAL = U_UNQUOTED_SPECIAL, u_UNTERMINATED_QUOTE = U_UNTERMINATED_QUOTE, u_RULE_MASK_ERROR = U_RULE_MASK_ERROR, u_MISPLACED_COMPOUND_FILTER = U_MISPLACED_COMPOUND_FILTER, u_MULTIPLE_COMPOUND_FILTERS = U_MULTIPLE_COMPOUND_FILTERS, u_INVALID_RBT_SYNTAX = U_INVALID_RBT_SYNTAX, u_MALFORMED_PRAGMA = U_MALFORMED_PRAGMA, u_UNCLOSED_SEGMENT = U_UNCLOSED_SEGMENT, u_VARIABLE_RANGE_EXHAUSTED = U_VARIABLE_RANGE_EXHAUSTED, u_VARIABLE_RANGE_OVERLAP = U_VARIABLE_RANGE_OVERLAP, u_ILLEGAL_CHARACTER = U_ILLEGAL_CHARACTER, u_INTERNAL_TRANSLITERATOR_ERROR = U_INTERNAL_TRANSLITERATOR_ERROR, u_INVALID_ID = U_INVALID_ID, u_INVALID_FUNCTION = U_INVALID_FUNCTION, u_UNEXPECTED_TOKEN = U_UNEXPECTED_TOKEN, u_MULTIPLE_DECIMAL_SEPARATORS = U_MULTIPLE_DECIMAL_SEPARATORS, u_MULTIPLE_EXPONENTIAL_SYMBOLS = U_MULTIPLE_EXPONENTIAL_SYMBOLS, u_MALFORMED_EXPONENTIAL_PATTERN = U_MALFORMED_EXPONENTIAL_PATTERN, u_MULTIPLE_PERCENT_SYMBOLS = U_MULTIPLE_PERCENT_SYMBOLS, u_MULTIPLE_PERMILL_SYMBOLS = U_MULTIPLE_PERMILL_SYMBOLS, u_MULTIPLE_PAD_SPECIFIERS = U_MULTIPLE_PAD_SPECIFIERS, u_PATTERN_SYNTAX_ERROR = U_PATTERN_SYNTAX_ERROR, u_ILLEGAL_PAD_POSITION = U_ILLEGAL_PAD_POSITION, u_UNMATCHED_BRACES = U_UNMATCHED_BRACES, u_ARGUMENT_TYPE_MISMATCH = U_ARGUMENT_TYPE_MISMATCH, u_DUPLICATE_KEYWORD = U_DUPLICATE_KEYWORD, u_UNDEFINED_KEYWORD = U_UNDEFINED_KEYWORD, u_DEFAULT_KEYWORD_MISSING = U_DEFAULT_KEYWORD_MISSING, u_BRK_INTERNAL_ERROR = U_BRK_INTERNAL_ERROR, u_BRK_HEX_DIGITS_EXPECTED = U_BRK_HEX_DIGITS_EXPECTED, u_BRK_SEMICOLON_EXPECTED = U_BRK_SEMICOLON_EXPECTED, u_BRK_RULE_SYNTAX = U_BRK_RULE_SYNTAX, u_BRK_UNCLOSED_SET = U_BRK_UNCLOSED_SET, u_BRK_ASSIGN_ERROR = U_BRK_ASSIGN_ERROR, u_BRK_VARIABLE_REDFINITION = U_BRK_VARIABLE_REDFINITION, u_BRK_MISMATCHED_PAREN = U_BRK_MISMATCHED_PAREN, u_BRK_NEW_LINE_IN_QUOTED_STRING = U_BRK_NEW_LINE_IN_QUOTED_STRING, u_BRK_UNDEFINED_VARIABLE = U_BRK_UNDEFINED_VARIABLE, u_BRK_INIT_ERROR = U_BRK_INIT_ERROR, u_BRK_RULE_EMPTY_SET = U_BRK_RULE_EMPTY_SET, u_BRK_UNRECOGNIZED_OPTION = U_BRK_UNRECOGNIZED_OPTION, u_BRK_MALFORMED_RULE_TAG = U_BRK_MALFORMED_RULE_TAG, u_REGEX_INTERNAL_ERROR = U_REGEX_INTERNAL_ERROR, u_REGEX_RULE_SYNTAX = U_REGEX_RULE_SYNTAX, u_REGEX_INVALID_STATE = U_REGEX_INVALID_STATE, u_REGEX_BAD_ESCAPE_SEQUENCE = U_REGEX_BAD_ESCAPE_SEQUENCE, u_REGEX_PROPERTY_SYNTAX = U_REGEX_PROPERTY_SYNTAX, u_REGEX_UNIMPLEMENTED = U_REGEX_UNIMPLEMENTED, u_REGEX_MISMATCHED_PAREN = U_REGEX_MISMATCHED_PAREN, u_REGEX_NUMBER_TOO_BIG = U_REGEX_NUMBER_TOO_BIG, u_REGEX_BAD_INTERVAL = U_REGEX_BAD_INTERVAL, u_REGEX_MAX_LT_MIN = U_REGEX_MAX_LT_MIN, u_REGEX_INVALID_BACK_REF = U_REGEX_INVALID_BACK_REF, u_REGEX_INVALID_FLAG = U_REGEX_INVALID_FLAG, u_REGEX_SET_CONTAINS_STRING = U_REGEX_SET_CONTAINS_STRING, u_REGEX_OCTAL_TOO_BIG = U_REGEX_OCTAL_TOO_BIG, u_REGEX_INVALID_RANGE = U_REGEX_INVALID_RANGE, u_REGEX_STACK_OVERFLOW = U_REGEX_STACK_OVERFLOW, u_REGEX_TIME_OUT = U_REGEX_TIME_OUT, u_REGEX_STOPPED_BY_CALLER = U_REGEX_STOPPED_BY_CALLER, u_IDNA_PROHIBITED_ERROR = U_IDNA_PROHIBITED_ERROR, u_IDNA_UNASSIGNED_ERROR = U_IDNA_UNASSIGNED_ERROR, u_IDNA_CHECK_BIDI_ERROR = U_IDNA_CHECK_BIDI_ERROR, u_IDNA_STD3_ASCII_RULES_ERROR = U_IDNA_STD3_ASCII_RULES_ERROR, u_IDNA_ACE_PREFIX_ERROR = U_IDNA_ACE_PREFIX_ERROR, u_IDNA_VERIFICATION_ERROR = U_IDNA_VERIFICATION_ERROR, u_IDNA_LABEL_TOO_LONG_ERROR = U_IDNA_LABEL_TOO_LONG_ERROR, u_IDNA_ZERO_LENGTH_LABEL_ERROR = U_IDNA_ZERO_LENGTH_LABEL_ERROR, u_IDNA_DOMAIN_NAME_TOO_LONG_ERROR = U_IDNA_DOMAIN_NAME_TOO_LONG_ERROR } isRegexError :: ICUError -> Bool isRegexError (ICUError err) = err >= #{const U_REGEX_ERROR_START} && err < #{const U_REGEX_ERROR_LIMIT} text-icu-0.6.3.7/Data/Text/ICU/Internal.hsc0000644000000000000000000000642112234226032016304 0ustar0000000000000000{-# LANGUAGE EmptyDataDecls, ForeignFunctionInterface #-} module Data.Text.ICU.Internal ( LocaleName(..) , UBool , UChar , UChar32 , UCharIterator , CharIterator(..) , asBool , asOrdering , withCharIterator , withLocaleName , withName ) where #include import Data.ByteString.Internal (ByteString(..)) import Data.Int (Int8, Int32) import Data.String (IsString(..)) import Data.Text (Text) import Data.Text.Encoding (decodeUtf8) import Data.Text.Foreign (useAsPtr) import Data.Word (Word16, Word32) import Foreign.C.String (CString, withCString) import Foreign.C.Types (CChar) import Foreign.Marshal.Alloc (alloca) import Foreign.ForeignPtr (withForeignPtr) import Foreign.Ptr (Ptr, castPtr, nullPtr) import Foreign.Storable (Storable(..)) -- | A type that supports efficient iteration over Unicode characters. -- -- As an example of where this may be useful, a function using this -- type may be able to iterate over a UTF-8 'ByteString' directly, -- rather than first copying and converting it to an intermediate -- form. This type also allows e.g. comparison between 'Text' and -- 'ByteString', with minimal overhead. data CharIterator = CIText !Text | CIUTF8 !ByteString instance Show CharIterator where show (CIText t) = show t show (CIUTF8 bs) = show (decodeUtf8 bs) data UCharIterator instance Storable UCharIterator where sizeOf _ = #{size UCharIterator} alignment _ = alignment (undefined :: CString) -- | Temporarily allocate a 'UCharIterator' and use it with the -- contents of the to-be-iterated-over string. withCharIterator :: CharIterator -> (Ptr UCharIterator -> IO a) -> IO a withCharIterator (CIUTF8 (PS fp _ l)) act = alloca $ \i -> withForeignPtr fp $ \p -> uiter_setUTF8 i (castPtr p) (fromIntegral l) >> act i withCharIterator (CIText t) act = alloca $ \i -> useAsPtr t $ \p l -> uiter_setString i p (fromIntegral l) >> act i type UBool = Int8 type UChar = Word16 type UChar32 = Word32 asBool :: Integral a => a -> Bool {-# INLINE asBool #-} asBool = (/=0) asOrdering :: Integral a => a -> Ordering {-# INLINE asOrdering #-} asOrdering i | i < 0 = LT | i == 0 = EQ | otherwise = GT withName :: String -> (CString -> IO a) -> IO a withName name act | null name = act nullPtr | otherwise = withCString name act -- | The name of a locale. data LocaleName = Root -- ^ The root locale. For a description of resource bundles -- and the root resource, see -- . | Locale String -- ^ A specific locale. | Current -- ^ The program's current locale. deriving (Eq, Ord, Read, Show) instance IsString LocaleName where fromString = Locale withLocaleName :: LocaleName -> (CString -> IO a) -> IO a withLocaleName Current act = act nullPtr withLocaleName Root act = withCString "" act withLocaleName (Locale n) act = withCString n act foreign import ccall unsafe "hs_text_icu.h __hs_uiter_setString" uiter_setString :: Ptr UCharIterator -> Ptr UChar -> Int32 -> IO () foreign import ccall unsafe "hs_text_icu.h __hs_uiter_setUTF8" uiter_setUTF8 :: Ptr UCharIterator -> Ptr CChar -> Int32 -> IO () text-icu-0.6.3.7/Data/Text/ICU/Iterator.hs0000644000000000000000000000426512234226032016162 0ustar0000000000000000{-# LANGUAGE ForeignFunctionInterface #-} {-# OPTIONS_GHC -fno-warn-orphans #-} -- | -- Module : Data.Text.ICU.Iterator -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Iteration functions for Unicode, implemented as bindings to the -- International Components for Unicode (ICU) libraries. -- -- Unlike the C and C++ @UCharIterator@ type, the Haskell -- 'CharIterator' type is immutable, and can safely be used in pure -- code. -- -- Functions using these iterators may be more efficient than their -- counterparts. For instance, the 'CharIterator' type allows a UTF-8 -- 'ByteString' to be compared against a 'Text', without first -- converting the 'ByteString': -- -- > fromUtf8 bs == fromText t module Data.Text.ICU.Iterator ( -- * Types and constructors CharIterator , fromString , fromText , fromUtf8 ) where import Data.ByteString (ByteString) import Data.Int (Int32) import Data.Text (Text, pack) import Data.Text.ICU.Internal (CharIterator(..), UCharIterator, asOrdering, withCharIterator) import Foreign.Ptr (Ptr) import System.IO.Unsafe (unsafePerformIO) instance Eq CharIterator where a == b = compareIter a b == EQ instance Ord CharIterator where compare = compareIter -- | Compare two 'CharIterator's. compareIter :: CharIterator -> CharIterator -> Ordering compareIter a b = unsafePerformIO . fmap asOrdering . withCharIterator a $ withCharIterator b . u_strCompareIter -- | Construct a 'CharIterator' from a Unicode string. fromString :: String -> CharIterator fromString = CIText . pack {-# INLINE fromString #-} -- | Construct a 'CharIterator' from a Unicode string. fromText :: Text -> CharIterator fromText = CIText {-# INLINE fromText #-} -- | Construct a 'CharIterator' from a Unicode string encoded as a -- UTF-8 'ByteString'. The validity of the encoded string is *not* -- checked. fromUtf8 :: ByteString -> CharIterator fromUtf8 = CIUTF8 {-# INLINE fromUtf8 #-} foreign import ccall unsafe "hs_text_icu.h __hs_u_strCompareIter" u_strCompareIter :: Ptr UCharIterator -> Ptr UCharIterator -> IO Int32 text-icu-0.6.3.7/Data/Text/ICU/Normalize.hsc0000644000000000000000000002642412234226032016475 0ustar0000000000000000{-# LANGUAGE CPP, DeriveDataTypeable, ForeignFunctionInterface #-} -- | -- Module : Data.Text.ICU.Normalize -- Copyright : (c) 2009, 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Character set normalization functions for Unicode, implemented as -- bindings to the International Components for Unicode (ICU) -- libraries. module Data.Text.ICU.Normalize ( -- * Unicode normalization API -- $api NormalizationMode(..) -- * Normalization functions , normalize -- * Normalization checks , quickCheck , isNormalized -- * Normalization-sensitive comparison , CompareOption(..) , compare ) where #ifdef mingw32_HOST_OS #define U_HAVE_INTTYPES_H 1 #endif #include #include import Data.Text (Text) import Data.Text.Foreign (fromPtr, useAsPtr) import Data.Text.ICU.Error.Internal (UErrorCode, handleError, handleOverflowError) import Data.Text.ICU.Internal (UBool, UChar, asBool, asOrdering) import Data.Text.ICU.Normalize.Internal (UNormalizationCheckResult, toNCR) import Data.Typeable (Typeable) import Data.Int (Int32) import Data.Word (Word32) import Foreign.C.Types (CInt(..)) import Foreign.Ptr (Ptr, castPtr) import System.IO.Unsafe (unsafePerformIO) import Prelude hiding (compare) import Data.List (foldl') import Data.Bits ((.|.)) -- $api -- -- The 'normalize' function transforms Unicode text into an equivalent -- composed or decomposed form, allowing for easier sorting and -- searching of text. 'normalize' supports the standard normalization -- forms described in , -- Unicode Standard Annex #15: Unicode Normalization Forms. -- -- Characters with accents or other adornments can be encoded in -- several different ways in Unicode. For example, take the character A-acute. -- In Unicode, this can be encoded as a single character (the -- \"composed\" form): -- -- @ -- 00C1 LATIN CAPITAL LETTER A WITH ACUTE -- @ -- -- or as two separate characters (the \"decomposed\" form): -- -- @ -- 0041 LATIN CAPITAL LETTER A -- 0301 COMBINING ACUTE ACCENT -- @ -- -- To a user of your program, however, both of these sequences should -- be treated as the same \"user-level\" character \"A with acute -- accent\". When you are searching or comparing text, you must -- ensure that these two sequences are treated equivalently. In -- addition, you must handle characters with more than one accent. -- Sometimes the order of a character's combining accents is -- significant, while in other cases accent sequences in different -- orders are really equivalent. -- -- Similarly, the string \"ffi\" can be encoded as three separate letters: -- -- @ -- 0066 LATIN SMALL LETTER F -- 0066 LATIN SMALL LETTER F -- 0069 LATIN SMALL LETTER I -- @ -- -- or as the single character -- -- @ -- FB03 LATIN SMALL LIGATURE FFI -- @ -- -- The \"ffi\" ligature is not a distinct semantic character, and -- strictly speaking it shouldn't be in Unicode at all, but it was -- included for compatibility with existing character sets that -- already provided it. The Unicode standard identifies such -- characters by giving them \"compatibility\" decompositions into the -- corresponding semantic characters. When sorting and searching, you -- will often want to use these mappings. -- -- 'normalize' helps solve these problems by transforming text into -- the canonical composed and decomposed forms as shown in the first -- example above. In addition, you can have it perform compatibility -- decompositions so that you can treat compatibility characters the -- same as their equivalents. Finally, 'normalize' rearranges accents -- into the proper canonical order, so that you do not have to worry -- about accent rearrangement on your own. -- -- Form 'FCD', \"Fast C or D\", is also designed for collation. It -- allows to work on strings that are not necessarily normalized with -- an algorithm (like in collation) that works under \"canonical -- closure\", i.e., it treats precomposed characters and their -- decomposed equivalents the same. -- -- It is not a normalization form because it does not provide for -- uniqueness of representation. Multiple strings may be canonically -- equivalent (their NFDs are identical) and may all conform to 'FCD' -- without being identical themselves. -- -- The form is defined such that the \"raw decomposition\", the -- recursive canonical decomposition of each character, results in a -- string that is canonically ordered. This means that precomposed -- characters are allowed for as long as their decompositions do not -- need canonical reordering. -- -- Its advantage for a process like collation is that all 'NFD' and -- most 'NFC' texts - and many unnormalized texts - already conform to -- 'FCD' and do not need to be normalized ('NFD') for such a -- process. The 'FCD' 'quickCheck' will return 'Yes' for most strings -- in practice. -- -- @'normalize' 'FCD'@ may be implemented with 'NFD'. -- -- For more details on 'FCD' see the collation design document: -- -- -- ICU collation performs either 'NFD' or 'FCD' normalization -- automatically if normalization is turned on for the collator -- object. Beyond collation and string search, normalized strings may -- be useful for string equivalence comparisons, -- transliteration/transcription, unique representations, etc. -- -- The W3C generally recommends to exchange texts in 'NFC'. Note also -- that most legacy character encodings use only precomposed forms and -- often do not encode any combining marks by themselves. For -- conversion to such character encodings the Unicode text needs to be -- normalized to 'NFC'. For more usage examples, see the Unicode -- Standard Annex. type UCompareOption = Word32 -- | Options to 'compare'. data CompareOption = InputIsFCD -- ^ The caller knows that both strings fulfill the -- 'FCD' conditions. If /not/ set, 'compare' will -- 'quickCheck' for 'FCD' and normalize if -- necessary. | CompareIgnoreCase -- ^ Compare strings case-insensitively using case -- folding, instead of case-sensitively. If set, -- then the following case folding options are -- used. | FoldCaseExcludeSpecialI -- ^ When case folding, exclude the special I -- character. For use with Turkic -- (Turkish/Azerbaijani) text data. deriving (Eq, Show, Enum, Typeable) fromCompareOption :: CompareOption -> UCompareOption fromCompareOption InputIsFCD = #const UNORM_INPUT_IS_FCD fromCompareOption CompareIgnoreCase = #const U_COMPARE_IGNORE_CASE fromCompareOption FoldCaseExcludeSpecialI = #const U_FOLD_CASE_EXCLUDE_SPECIAL_I reduceCompareOptions :: [CompareOption] -> UCompareOption reduceCompareOptions = foldl' orO (#const U_COMPARE_CODE_POINT_ORDER) where a `orO` b = a .|. fromCompareOption b type UNormalizationMode = CInt -- | Normalization modes. data NormalizationMode = None -- ^ No decomposition/composition. | NFD -- ^ Canonical decomposition. | NFKD -- ^ Compatibility decomposition. | NFC -- ^ Canonical decomposition followed by canonical composition. | NFKC -- ^ Compatibility decomposition followed by canonical composition. | FCD -- ^ \"Fast C or D\" form. deriving (Eq, Show, Enum, Typeable) toNM :: NormalizationMode -> UNormalizationMode toNM None = #const UNORM_NONE toNM NFD = #const UNORM_NFD toNM NFKD = #const UNORM_NFKD toNM NFC = #const UNORM_NFC toNM NFKC = #const UNORM_NFKC toNM FCD = #const UNORM_FCD -- | Normalize a string according the specified normalization mode. normalize :: NormalizationMode -> Text -> Text normalize mode t = unsafePerformIO . useAsPtr t $ \sptr slen -> let slen' = fromIntegral slen mode' = toNM mode in handleOverflowError (fromIntegral slen) (\dptr dlen -> unorm_normalize sptr slen' mode' 0 dptr (fromIntegral dlen)) (\dptr dlen -> fromPtr (castPtr dptr) (fromIntegral dlen)) -- | Perform an efficient check on a string, to quickly determine if -- the string is in a particular normalization form. -- -- A 'Nothing' result indicates that a definite answer could not be -- determined quickly, and a more thorough check is required, -- e.g. with 'isNormalized'. The user may have to convert the string -- to its normalized form and compare the results. -- -- A result of 'Just' 'True' or 'Just' 'False' indicates that the -- string definitely is, or is not, in the given normalization form. quickCheck :: NormalizationMode -> Text -> Maybe Bool quickCheck mode t = unsafePerformIO . useAsPtr t $ \ptr len -> fmap toNCR . handleError $ unorm_quickCheck ptr (fromIntegral len) (toNM mode) -- | Indicate whether a string is in a given normalization form. -- -- Unlike 'quickCheck', this function returns a definitive result. -- For 'NFD', 'NFKD', and 'FCD' normalization forms, both functions -- work in exactly the same ways. For 'NFC' and 'NFKC' forms, where -- 'quickCheck' may return 'Nothing', this function will perform -- further tests to arrive at a definitive result. isNormalized :: NormalizationMode -> Text -> Bool isNormalized mode t = unsafePerformIO . useAsPtr t $ \ptr len -> fmap asBool . handleError $ unorm_isNormalized ptr (fromIntegral len) (toNM mode) -- | Compare two strings for canonical equivalence. Further options -- include case-insensitive comparison and code point order (as -- opposed to code unit order). -- -- Canonical equivalence between two strings is defined as their -- normalized forms ('NFD' or 'NFC') being identical. This function -- compares strings incrementally instead of normalizing (and -- optionally case-folding) both strings entirely, improving -- performance significantly. -- -- Bulk normalization is only necessary if the strings do not fulfill -- the 'FCD' conditions. Only in this case, and only if the strings -- are relatively long, is memory allocated temporarily. For 'FCD' -- strings and short non-'FCD' strings there is no memory allocation. compare :: [CompareOption] -> Text -> Text -> Ordering compare opts a b = unsafePerformIO . useAsPtr a $ \aptr alen -> useAsPtr b $ \bptr blen -> fmap asOrdering . handleError $ unorm_compare aptr (fromIntegral alen) bptr (fromIntegral blen) (reduceCompareOptions opts) foreign import ccall unsafe "hs_text_icu.h __hs_unorm_compare" unorm_compare :: Ptr UChar -> Int32 -> Ptr UChar -> Int32 -> Word32 -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_unorm_quickCheck" unorm_quickCheck :: Ptr UChar -> Int32 -> UNormalizationMode -> Ptr UErrorCode -> IO UNormalizationCheckResult foreign import ccall unsafe "hs_text_icu.h __hs_unorm_isNormalized" unorm_isNormalized :: Ptr UChar -> Int32 -> UNormalizationMode -> Ptr UErrorCode -> IO UBool foreign import ccall unsafe "hs_text_icu.h __hs_unorm_normalize" unorm_normalize :: Ptr UChar -> Int32 -> UNormalizationMode -> Int32 -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO Int32 text-icu-0.6.3.7/Data/Text/ICU/Regex.hs0000644000000000000000000001545212234226032015443 0ustar0000000000000000{-# LANGUAGE BangPatterns, EmptyDataDecls, MagicHash, RecordWildCards, ScopedTypeVariables #-} {-# OPTIONS_GHC -fno-warn-orphans #-} -- | -- Module : Data.Text.ICU.Regex -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Regular expression support for Unicode, implemented as bindings to -- the International Components for Unicode (ICU) libraries. -- -- The syntax and behaviour of ICU regular expressions are Perl-like. -- For complete details, see the ICU User Guide entry at -- . -- -- /Note/: The functions in this module are not thread safe. For -- thread safe use, see 'clone' below, or use the pure functions in -- 'Data.Text.ICU'. module Data.Text.ICU.Regex ( -- * Types MatchOption(..) , ParseError(errError, errLine, errOffset) , Regex -- * Functions -- ** Construction , regex , regex' , clone -- ** Managing text to search , setText , getText -- ** Inspection , pattern -- ** Searching , find , findNext -- ** Match groups -- $groups , groupCount , start , end , start_ , end_ ) where import Data.Text.ICU.Regex.Internal import qualified Control.Exception as E import Data.IORef (newIORef, readIORef, writeIORef) import Data.Text (Text) import qualified Data.Text.Foreign as T import Data.Text.Foreign (I16) import Data.Text.ICU.Internal (asBool) import Data.Text.ICU.Error.Internal (ParseError(..), handleError) import Data.Word (Word16) import Foreign.ForeignPtr (ForeignPtr, newForeignPtr, withForeignPtr) import Foreign.Marshal.Alloc (alloca) import Foreign.Storable (peek) import System.IO.Unsafe (unsafePerformIO) instance Show Regex where show re = "Regex " ++ show (pattern re) -- $groups -- -- Capturing groups are numbered starting from zero. Group zero is -- always the entire matching text. Groups greater than zero contain -- the text matching each capturing group in a regular expression. -- | Compile a regular expression with the given options. This is -- safest to use when the pattern is constructed at run time. regex' :: [MatchOption] -> Text -> IO (Either ParseError Regex) regex' opts pat = (Right `fmap` regex opts pat) `E.catch` \(err::ParseError) -> return (Left err) -- | Set the subject text string upon which the regular expression -- will look for matches. This function may be called any number of -- times, allowing the regular expression pattern to be applied to -- different strings. setText :: Regex -> Text -> IO () setText Regex{..} t = do (hayfp, hayLen) <- T.asForeignPtr t withForeignPtr reRe $ \rePtr -> withForeignPtr hayfp $ \hayPtr -> handleError $ uregex_setText rePtr hayPtr (fromIntegral hayLen) writeIORef reText $! H hayfp hayLen -- | Get the subject text that is currently associated with this -- regular expression object. getText :: Regex -> IO (ForeignPtr Word16, I16) getText Regex{..} = do H fp len <- readIORef reText return (fp, len) -- | Return the source form of the pattern used to construct this -- regular expression or match. pattern :: Regex -> Text pattern Regex{..} = unsafePerformIO . withForeignPtr reRe $ \rePtr -> alloca $ \lenPtr -> do textPtr <- handleError $ uregex_pattern rePtr lenPtr (T.fromPtr textPtr . fromIntegral) =<< peek lenPtr -- | Find the first matching substring of the input string that -- matches the pattern. -- -- If /n/ is non-negative, the search for a match begins at the -- specified index, and any match region is reset. -- -- If /n/ is -1, the search begins at the start of the input region, -- or at the start of the full string if no region has been specified. -- -- If a match is found, 'start', 'end', and 'group' will provide more -- information regarding the match. find :: Regex -> I16 -> IO Bool find Regex{..} n = fmap asBool . withForeignPtr reRe $ \rePtr -> handleError $ uregex_find rePtr (fromIntegral n) -- | Find the next pattern match in the input string. Begin searching -- the input at the location following the end of he previous match, -- or at the start of the string (or region) if there is no previous -- match. -- -- If a match is found, 'start', 'end', and 'group' will provide more -- information regarding the match. findNext :: Regex -> IO Bool findNext Regex{..} = fmap asBool . withForeignPtr reRe $ handleError . uregex_findNext -- | Make a copy of a compiled regular expression. Cloning a regular -- expression is faster than opening a second instance from the source -- form of the expression, and requires less memory. -- -- Note that the current input string and the position of any matched -- text within it are not cloned; only the pattern itself and and the -- match mode flags are copied. -- -- Cloning can be particularly useful to threaded applications that -- perform multiple match operations in parallel. Each concurrent RE -- operation requires its own instance of a 'Regex'. clone :: Regex -> IO Regex {-# INLINE clone #-} clone Regex{..} = do fp <- newForeignPtr uregex_close =<< withForeignPtr reRe (handleError . uregex_clone) Regex fp `fmap` newIORef (H emptyForeignPtr 0) -- | Return the number of capturing groups in this regular -- expression's pattern. groupCount :: Regex -> IO Int groupCount Regex{..} = fmap fromIntegral . withForeignPtr reRe $ handleError . uregex_groupCount -- | Returns the index in the input string of the start of the text -- matched by the specified capture group during the previous match -- operation. Returns @-1@ if the capture group was not part of the -- last match. start_ :: Regex -> Int -> IO I16 start_ Regex{..} n = fmap fromIntegral . withForeignPtr reRe $ \rePtr -> handleError $ uregex_start rePtr (fromIntegral n) -- | Returns the index in the input string of the end of the text -- matched by the specified capture group during the previous match -- operation. Returns @-1@ if the capture group was not part of -- the last match. end_ :: Regex -> Int -> IO I16 end_ Regex{..} n = fmap fromIntegral . withForeignPtr reRe $ \rePtr -> handleError $ uregex_end rePtr (fromIntegral n) -- | Returns the index in the input string of the start of the text -- matched by the specified capture group during the previous match -- operation. Returns 'Nothing' if the capture group was not part of -- the last match. start :: Regex -> Int -> IO (Maybe I16) start r n = check `fmap` start_ r n -- | Returns the index in the input string of the end of the text -- matched by the specified capture group during the previous match -- operation. Returns 'Nothing' if the capture group was not part of -- the last match. end :: Regex -> Int -> IO (Maybe I16) end r n = check `fmap` end_ r n check :: I16 -> Maybe I16 check (-1) = Nothing check k = Just $! fromIntegral k text-icu-0.6.3.7/Data/Text/ICU/Text.hs0000644000000000000000000000603712234226032015314 0ustar0000000000000000{-# LANGUAGE ForeignFunctionInterface #-} -- | -- Module : Data.Text.ICU.Text -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Functions for manipulating Unicode text, implemented as bindings to -- the International Components for Unicode (ICU) libraries. module Data.Text.ICU.Text ( -- * Case conversion -- $case toCaseFold , toLower , toUpper ) where import Data.Int (Int32) import Data.Text (Text) import Data.Text.Foreign (fromPtr, useAsPtr) import Data.Text.ICU.Error.Internal (UErrorCode, handleOverflowError) import Data.Text.ICU.Internal (LocaleName, UChar, withLocaleName) import Data.Word (Word32) import Foreign.C.String (CString) import Foreign.Ptr (Ptr, castPtr) import System.IO.Unsafe (unsafePerformIO) -- $case -- -- In some languages, case conversion is a locale- and -- context-dependent operation. The case conversion functions in this -- module are locale and context sensitive. -- | Case-fold the characters in a string. -- -- Case folding is locale independent and not context sensitive, but -- there is an option for treating the letter I specially for Turkic -- languages. The result may be longer or shorter than the original. toCaseFold :: Bool -- ^ Whether to include or exclude mappings for -- dotted and dotless I and i that are marked with -- 'I' in @CaseFolding.txt@. -> Text -> Text toCaseFold excludeI s = unsafePerformIO . useAsPtr s $ \sptr slen -> do let opts = fromIntegral . fromEnum $ excludeI handleOverflowError (fromIntegral slen) (\dptr dlen -> u_strFoldCase dptr dlen sptr (fromIntegral slen) opts) (\dptr dlen -> fromPtr (castPtr dptr) (fromIntegral dlen)) type CaseMapper = Ptr UChar -> Int32 -> Ptr UChar -> Int32 -> CString -> Ptr UErrorCode -> IO Int32 caseMap :: CaseMapper -> LocaleName -> Text -> Text caseMap mapFn loc s = unsafePerformIO . withLocaleName loc $ \locale -> useAsPtr s $ \sptr slen -> handleOverflowError (fromIntegral slen) (\dptr dlen -> mapFn dptr dlen sptr (fromIntegral slen) locale) (\dptr dlen -> fromPtr (castPtr dptr) (fromIntegral dlen)) -- | Lowercase the characters in a string. -- -- Casing is locale dependent and context sensitive. The result may -- be longer or shorter than the original. toLower :: LocaleName -> Text -> Text toLower = caseMap u_strToLower -- | Uppercase the characters in a string. -- -- Casing is locale dependent and context sensitive. The result may -- be longer or shorter than the original. toUpper :: LocaleName -> Text -> Text toUpper = caseMap u_strToUpper foreign import ccall unsafe "hs_text_icu.h __hs_u_strFoldCase" u_strFoldCase :: Ptr UChar -> Int32 -> Ptr UChar -> Int32 -> Word32 -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_u_strToLower" u_strToLower :: CaseMapper foreign import ccall unsafe "hs_text_icu.h __hs_u_strToUpper" u_strToUpper :: CaseMapper text-icu-0.6.3.7/Data/Text/ICU/Types.hs0000644000000000000000000000110212234226032015460 0ustar0000000000000000-- | -- Module : Data.Text.ICU.Types -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Types for use when manipulating Unicode text, using the bindings to -- the International Components for Unicode (ICU) libraries. module Data.Text.ICU.Types ( -- * Widely used types LocaleName(..) , ParseError(errError, errLine, errOffset) ) where import Data.Text.ICU.Error.Internal (ParseError(..)) import Data.Text.ICU.Internal (LocaleName(..)) text-icu-0.6.3.7/Data/Text/ICU/Break/0000755000000000000000000000000012234226032015052 5ustar0000000000000000text-icu-0.6.3.7/Data/Text/ICU/Break/Pure.hs0000644000000000000000000001102212234226032016315 0ustar0000000000000000{-# LANGUAGE BangPatterns, RecordWildCards #-} -- | -- Module : Data.Text.ICU.Break.Pure -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- String breaking functions for Unicode, implemented as bindings to -- the International Components for Unicode (ICU) libraries. -- -- The text boundary positions are found according to the rules described in -- Unicode Standard Annex #29, Text Boundaries, and Unicode Standard Annex -- #14, Line Breaking Properties. These are available at -- and -- . module Data.Text.ICU.Break.Pure ( -- * Types Breaker , Break , brkPrefix , brkBreak , brkSuffix , brkStatus , Line(..) , Word(..) -- * Breaking functions , breakCharacter , breakLine , breakSentence , breakWord -- * Iteration , breaks , breaksRight ) where import Data.Text (Text, empty) import Data.Text.Foreign (dropWord16, takeWord16) import Data.Text.ICU.Break (Line, Word) import Data.Text.ICU.Break.Types (BreakIterator(..)) import Data.Text.ICU.Internal (LocaleName) import System.IO.Unsafe (unsafeInterleaveIO, unsafePerformIO) import qualified Data.Text.ICU.Break as IO -- | A boundary analyser. newtype Breaker a = B (BreakIterator a) new :: (LocaleName -> Text -> IO (BreakIterator a)) -> LocaleName -> Breaker a new act loc = unsafePerformIO $ B `fmap` act loc empty -- | Break a string on character boundaries. -- -- Character boundary analysis identifies the boundaries of "Extended -- Grapheme Clusters", which are groupings of codepoints that should be -- treated as character-like units for many text operations. Please see -- Unicode Standard Annex #29, Unicode Text Segmentation, -- for additional information on -- grapheme clusters and guidelines on their use. breakCharacter :: LocaleName -> Breaker () breakCharacter = new IO.breakCharacter -- | Break a string on line boundaries. -- -- Line boundary analysis determines where a text string can be broken when -- line wrapping. The mechanism correctly handles punctuation and hyphenated -- words. breakLine :: LocaleName -> Breaker Line breakLine = new IO.breakLine -- | Break a string on sentence boundaries. -- -- Sentence boundary analysis allows selection with correct interpretation -- of periods within numbers and abbreviations, and trailing punctuation -- marks such as quotation marks and parentheses. breakSentence :: LocaleName -> Breaker () breakSentence = new IO.breakSentence -- | Break a string on word boundaries. -- -- Word boundary analysis is used by search and replace functions, as well -- as within text editing applications that allow the user to select words -- with a double click. Word selection provides correct interpretation of -- punctuation marks within and following words. Characters that are not -- part of a word, such as symbols or punctuation marks, have word breaks on -- both sides. breakWord :: LocaleName -> Breaker Word breakWord = new IO.breakWord -- | A break in a string. data Break a = Break { brkPrefix :: {-# UNPACK #-} !Text -- ^ Prefix of the current break. , brkBreak :: {-# UNPACK #-} !Text -- ^ Text of the current break. , brkSuffix :: {-# UNPACK #-} !Text -- ^ Suffix of the current break. , brkStatus :: !a -- ^ Status of the current break (only meaningful if 'Line' or 'Word'). } deriving (Eq, Show) -- | Return a list of all breaks in a string, from left to right. breaks :: Breaker a -> Text -> [Break a] breaks (B b) t = unsafePerformIO $ do bi <- IO.clone b IO.setText bi t let go p = do mix <- IO.next bi case mix of Nothing -> return [] Just n -> do s <- IO.getStatus bi let d = n-p u = dropWord16 p t (Break (takeWord16 p t) (takeWord16 d u) (dropWord16 d u) s :) `fmap` go n unsafeInterleaveIO $ go =<< IO.first bi -- | Return a list of all breaks in a string, from right to left. breaksRight :: Breaker a -> Text -> [Break a] breaksRight (B b) t = unsafePerformIO $ do bi <- IO.clone b IO.setText bi t let go p = do mix <- IO.previous bi case mix of Nothing -> return [] Just n -> do s <- IO.getStatus bi let d = p-n u = dropWord16 n t (Break (takeWord16 n t) (takeWord16 d u) (dropWord16 d u) s :) `fmap` go n unsafeInterleaveIO $ go =<< IO.last bi text-icu-0.6.3.7/Data/Text/ICU/Break/Types.hs0000644000000000000000000000116712234226032016517 0ustar0000000000000000{-# LANGUAGE EmptyDataDecls #-} -- | -- Module : Data.Text.ICU.Break.Internal -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC module Data.Text.ICU.Break.Types ( BreakIterator(..) , UBreakIterator ) where import Data.IORef (IORef) import Data.Int (Int32) import Data.Text (Text) import Foreign.ForeignPtr (ForeignPtr) -- A boundary breaker. data BreakIterator a = BR { brText :: IORef Text , brStatus :: Int32 -> a , brIter :: ForeignPtr UBreakIterator } data UBreakIterator text-icu-0.6.3.7/Data/Text/ICU/Collate/0000755000000000000000000000000012234226032015411 5ustar0000000000000000text-icu-0.6.3.7/Data/Text/ICU/Collate/Internal.hs0000644000000000000000000000363712234226032017532 0ustar0000000000000000{-# LANGUAGE DeriveDataTypeable, EmptyDataDecls, ForeignFunctionInterface #-} -- | -- Module : Data.Text.ICU.Collate.Internal -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Internals of the string collation infrastructure. module Data.Text.ICU.Collate.Internal ( -- * Unicode collation API MCollator(..) , Collator(..) , UCollator , equals , withCollator , wrap ) where import Data.Text.ICU.Internal (UBool, asBool) import Data.Typeable (Typeable) import Foreign.ForeignPtr (ForeignPtr, newForeignPtr, withForeignPtr) import Foreign.Ptr (FunPtr, Ptr) import System.IO.Unsafe (unsafePerformIO) -- $api -- data UCollator -- | String collator type. data MCollator = MCollator {-# UNPACK #-} !(ForeignPtr UCollator) deriving (Typeable) -- | String collator type. 'Collator's are considered equal if they -- will sort strings identically. newtype Collator = C MCollator deriving (Typeable) instance Eq Collator where (C a) == (C b) = unsafePerformIO $ equals a b withCollator :: MCollator -> (Ptr UCollator -> IO a) -> IO a withCollator (MCollator col) action = withForeignPtr col action {-# INLINE withCollator #-} wrap :: Ptr UCollator -> IO MCollator wrap = fmap MCollator . newForeignPtr ucol_close {-# INLINE wrap #-} -- | 'MCollator's are considered equal if they will sort strings -- identically. This means that both the current attributes and the rules -- must be equivalent. equals :: MCollator -> MCollator -> IO Bool equals a b = fmap asBool . withCollator a $ \aptr -> withCollator b $ ucol_equals aptr foreign import ccall unsafe "hs_text_icu.h &__hs_ucol_close" ucol_close :: FunPtr (Ptr UCollator -> IO ()) foreign import ccall unsafe "hs_text_icu.h __hs_ucol_equals" ucol_equals :: Ptr UCollator -> Ptr UCollator -> IO UBool text-icu-0.6.3.7/Data/Text/ICU/Collate/Pure.hs0000644000000000000000000000466512234226032016673 0ustar0000000000000000{-# LANGUAGE DeriveDataTypeable, ForeignFunctionInterface #-} -- | -- Module : Data.Text.ICU.Collate.Pure -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Pure string collation functions for Unicode, implemented as -- bindings to the International Components for Unicode (ICU) -- libraries. -- -- For the impure collation API (which is richer, but less easy to -- use), see the "Data.Text.ICU.Collate" module. module Data.Text.ICU.Collate.Pure ( -- * Unicode collation API -- $api Collator , collator , collatorWith , collate , collateIter , sortKey , uca ) where import Control.Monad (forM_) import Data.ByteString (ByteString) import Data.Text (Text) import Data.Text.ICU.Collate.Internal (Collator(..)) import Data.Text.ICU.Internal (CharIterator, LocaleName(..)) import System.IO.Unsafe (unsafePerformIO) import qualified Data.Text.ICU.Collate as IO -- $api -- -- | Create an immutable 'Collator' for comparing strings. -- -- If 'Root' is passed as the locale, UCA collation rules will be -- used. collator :: LocaleName -> Collator collator loc = unsafePerformIO $ C `fmap` IO.open loc -- | Create an immutable 'Collator' with the given 'Attribute's. collatorWith :: LocaleName -> [IO.Attribute] -> Collator collatorWith loc atts = unsafePerformIO $ do mc <- IO.open loc forM_ atts $ IO.setAttribute mc return (C mc) -- | Compare two strings. collate :: Collator -> Text -> Text -> Ordering collate (C c) a b = unsafePerformIO $ IO.collate c a b {-# INLINE collate #-} -- | Compare two 'CharIterator's. -- -- If either iterator was constructed from a 'ByteString', it does not -- need to be copied or converted beforehand, so this function can be -- quite cheap. collateIter :: Collator -> CharIterator -> CharIterator -> Ordering collateIter (C c) a b = unsafePerformIO $ IO.collateIter c a b {-# INLINE collateIter #-} -- | Create a key for sorting the 'Text' using the given 'Collator'. -- The result of comparing two 'ByteString's that have been -- transformed with 'sortKey' will be the same as the result of -- 'collate' on the two untransformed 'Text's. sortKey :: Collator -> Text -> ByteString sortKey (C c) = unsafePerformIO . IO.sortKey c {-# INLINE sortKey #-} -- | A 'Collator' that uses the Unicode Collation Algorithm (UCA). uca :: Collator uca = collator Root {-# NOINLINE uca #-} text-icu-0.6.3.7/Data/Text/ICU/Convert/0000755000000000000000000000000012234226032015446 5ustar0000000000000000text-icu-0.6.3.7/Data/Text/ICU/Convert/Internal.hs0000644000000000000000000000307112234226032017557 0ustar0000000000000000{-# LANGUAGE DeriveDataTypeable, EmptyDataDecls, ForeignFunctionInterface #-} -- | -- Module : Data.Text.ICU.Convert.Internal -- Copyright : (c) Bryan O'Sullivan 2009 -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Low-level character set types and functions. module Data.Text.ICU.Convert.Internal ( Converter(..) , UConverter , getName , withConverter ) where import Data.Text.ICU.Error.Internal (UErrorCode, handleError) import Data.Typeable (Typeable) import Foreign.C.String (CString, peekCString) import Foreign.ForeignPtr (ForeignPtr, withForeignPtr) import Foreign.Ptr (Ptr) import System.IO.Unsafe (unsafePerformIO) data UConverter -- | Character set converter type. /Note/: this structure is not -- thread safe. It is /not/ safe to use value of this type -- simultaneously from multiple threads. data Converter = Converter {-# UNPACK #-} !(ForeignPtr UConverter) deriving (Eq, Typeable) instance Show Converter where show c = "Converter " ++ show (getName c) withConverter :: Converter -> (Ptr UConverter -> IO a) -> IO a {-# INLINE withConverter #-} withConverter (Converter cnv) action = withForeignPtr cnv action -- | Gets the internal, canonical name of the converter. getName :: Converter -> String getName cnv = unsafePerformIO . withConverter cnv $ \ptr -> peekCString =<< handleError (ucnv_getName ptr) foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_getName" ucnv_getName :: Ptr UConverter -> Ptr UErrorCode -> IO CString text-icu-0.6.3.7/Data/Text/ICU/Error/0000755000000000000000000000000012234226032015117 5ustar0000000000000000text-icu-0.6.3.7/Data/Text/ICU/Error/Internal.hsc0000644000000000000000000001332112234226032017372 0ustar0000000000000000{-# LANGUAGE DeriveDataTypeable, ForeignFunctionInterface, ScopedTypeVariables #-} module Data.Text.ICU.Error.Internal ( -- * Types ICUError(..) -- ** Low-level types , UErrorCode , ParseError(errError, errLine, errOffset) , UParseError -- * Functions , isFailure , isSuccess , errorName , handleError , handleOverflowError , handleParseError , throwOnError , withError ) where import Control.Exception (Exception, throwIO) import Data.Function (fix) import Foreign.Ptr (Ptr) import Foreign.Marshal.Alloc (alloca) import Foreign.Marshal.Utils (with) import Foreign.Marshal.Array (allocaArray) import Data.Int (Int32) import Data.Typeable (Typeable) import Foreign.C.String (CString, peekCString) import Foreign.C.Types (CInt(..)) import Foreign.Storable (Storable(..)) import System.IO.Unsafe (unsafePerformIO) #include #include type UErrorCode = CInt -- | ICU error type. This is an instance of the 'Exception' type -- class. A value of this type may be thrown as an exception by most -- ICU functions. newtype ICUError = ICUError { fromErrorCode :: UErrorCode } deriving (Eq, Typeable) instance Show ICUError where show code = "ICUError " ++ errorName code instance Exception ICUError -- | Detailed information about parsing errors. Used by ICU parsing -- engines that parse long rules, patterns, or programs, where the -- text being parsed is long enough that more information than an -- 'ICUError' is needed to localize the error. data ParseError = ParseError { errError :: ICUError , errLine :: !(Maybe Int) -- ^ The line on which the error occured. If the parser uses this -- field, it sets it to the line number of the source text line on -- which the error appears, which will be be a positive value. If -- the parser does not support line numbers, the value will be -- 'Nothing'. , errOffset :: !(Maybe Int) -- ^ The character offset to the error. If the 'errLine' field is -- 'Just' some value, then this field contains the offset from the -- beginning of the line that contains the error. Otherwise, it -- represents the offset from the start of the text. If the -- parser does not support this field, it will have a value of -- 'Nothing'. } deriving (Show, Typeable) type UParseError = ParseError instance Exception ParseError instance Storable ParseError where sizeOf _ = #{size UParseError} alignment _ = alignment (undefined :: CString) peek ptr = do (line::Int32) <- #{peek UParseError, line} ptr (offset::Int32) <- #{peek UParseError, offset} ptr let wrap k = if k == -1 then Nothing else Just $! fromIntegral k return $! ParseError undefined (wrap line) (wrap offset) -- | Indicate whether the given error code is a success. isSuccess :: ICUError -> Bool {-# INLINE isSuccess #-} isSuccess = (<= 0) . fromErrorCode -- | Indicate whether the given error code is a failure. isFailure :: ICUError -> Bool {-# INLINE isFailure #-} isFailure = (> 0) . fromErrorCode -- | Throw an exception if the given code is actually an error. throwOnError :: UErrorCode -> IO () {-# INLINE throwOnError #-} throwOnError code = do let err = (ICUError code) if isFailure err then throwIO err else return () withError :: (Ptr UErrorCode -> IO a) -> IO (ICUError, a) {-# INLINE withError #-} withError action = with 0 $ \errPtr -> do ret <- action errPtr err <- peek errPtr return (ICUError err, ret) handleError :: (Ptr UErrorCode -> IO a) -> IO a {-# INLINE handleError #-} handleError action = with 0 $ \errPtr -> do ret <- action errPtr throwOnError =<< peek errPtr return ret -- | Deal with ICU functions that report a buffer overflow error if we -- give them an insufficiently large buffer. Our first call will -- report a buffer overflow, in which case we allocate a correctly -- sized buffer and try again. handleOverflowError :: (Storable a) => Int -- ^ Initial guess at buffer size. -> (Ptr a -> Int32 -> Ptr UErrorCode -> IO Int32) -- ^ Function that retrieves data. -> (Ptr a -> Int -> IO b) -- ^ Function that fills destination buffer if no -- overflow occurred. -> IO b handleOverflowError guess fill retrieve = alloca $ \uerrPtr -> flip fix guess $ \loop n -> (either (loop . fromIntegral) return =<<) . allocaArray n $ \ptr -> do poke uerrPtr 0 ret <- fill ptr (fromIntegral n) uerrPtr err <- peek uerrPtr case undefined of _| err == (#const U_BUFFER_OVERFLOW_ERROR) -> return (Left ret) | err > 0 -> throwIO (ICUError err) | otherwise -> Right `fmap` retrieve ptr (fromIntegral ret) handleParseError :: (ICUError -> Bool) -> (Ptr UParseError -> Ptr UErrorCode -> IO a) -> IO a handleParseError isParseError action = with 0 $ \uerrPtr -> alloca $ \perrPtr -> do ret <- action perrPtr uerrPtr err <- ICUError `fmap` peek uerrPtr case undefined of _| isParseError err -> do perr <- peek perrPtr throwIO perr { errError = err } | isFailure err -> throwIO err | otherwise -> return ret -- | Return a string representing the name of the given error code. errorName :: ICUError -> String errorName code = unsafePerformIO $ peekCString (u_errorName (fromErrorCode code)) foreign import ccall unsafe "hs_text_icu.h __hs_u_errorName" u_errorName :: UErrorCode -> CString text-icu-0.6.3.7/Data/Text/ICU/Normalize/0000755000000000000000000000000012234226032015766 5ustar0000000000000000text-icu-0.6.3.7/Data/Text/ICU/Normalize/Internal.hsc0000644000000000000000000000131112234226032020235 0ustar0000000000000000{-# LANGUAGE CPP, DeriveDataTypeable, ForeignFunctionInterface #-} -- | -- Module : Data.Text.ICU.Normalize.Internal -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC module Data.Text.ICU.Normalize.Internal ( UNormalizationCheckResult , toNCR ) where #include import Foreign.C.Types (CInt) type UNormalizationCheckResult = CInt toNCR :: UNormalizationCheckResult -> Maybe Bool toNCR (#const UNORM_NO) = Just False toNCR (#const UNORM_MAYBE) = Nothing toNCR (#const UNORM_YES) = Just True toNCR _ = error "toNormalizationCheckResult" text-icu-0.6.3.7/Data/Text/ICU/Regex/0000755000000000000000000000000012234226032015100 5ustar0000000000000000text-icu-0.6.3.7/Data/Text/ICU/Regex/Internal.hsc0000644000000000000000000002261112234226032017355 0ustar0000000000000000{-# LANGUAGE BangPatterns, DeriveDataTypeable, EmptyDataDecls, ForeignFunctionInterface, MagicHash, RecordWildCards, ScopedTypeVariables #-} -- | -- Module : Data.Text.ICU.Regex.Internal -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Regular expression support for Unicode, implemented as bindings to -- the International Components for Unicode (ICU) libraries. -- -- The syntax and behaviour of ICU regular expressions are Perl-like. -- For complete details, see the ICU User Guide entry at -- . module Data.Text.ICU.Regex.Internal ( -- * Types MatchOption(..) , Haystack(..) , Regex(..) , URegularExpression -- * Functions , emptyForeignPtr , regex , uregex_clone , uregex_close , uregex_end , uregex_find , uregex_findNext , uregex_getText , uregex_group , uregex_groupCount , uregex_pattern , uregex_setText , uregex_start ) where import Control.Monad (when) import Data.IORef (IORef, newIORef) import Data.Int (Int32) import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Foreign as T import Data.Text.ICU.Internal (UBool, UChar) import Data.Text.ICU.Error (isRegexError) import Data.Text.ICU.Error.Internal (UParseError, UErrorCode, handleError, handleParseError) import Data.Typeable (Typeable) import Data.Word (Word16, Word32) import Foreign.ForeignPtr (ForeignPtr, newForeignPtr, touchForeignPtr, withForeignPtr) import Foreign.Ptr (FunPtr, Ptr) import System.IO.Unsafe (unsafePerformIO) #include -- | Options for controlling matching behaviour. data MatchOption = CaseInsensitive -- ^ Enable case insensitive matching. | Comments -- ^ Allow comments and white space within patterns. | DotAll -- ^ If set, @\'.\'@ matches line terminators. Otherwise @\'.\'@ -- matching stops at line end. | Literal -- ^ If set, treat the entire pattern as a literal string. -- Metacharacters or escape sequences in the input sequence will -- be given no special meaning. -- -- The option 'CaseInsensitive' retains its meanings on matching -- when used in conjunction with this option. Other options -- become superfluous. | Multiline -- ^ Control behaviour of @\'$\'@ and @\'^\'@. If set, recognize -- line terminators within string, Otherwise, match only at start -- and end of input string. | HaskellLines -- ^ Haskell-only line endings. When this mode is enabled, only -- @\'\\n\'@ is recognized as a line ending in the behavior of -- @\'.\'@, @\'^\'@, and @\'$\'@. | UnicodeWord -- ^ Unicode word boundaries. If set, @\'\\\\b\'@ uses the -- Unicode TR 29 definition of word boundaries. -- -- /Warning/: Unicode word boundaries are quite different from -- traditional regular expression word boundaries. See -- . | ErrorOnUnknownEscapes -- ^ Throw an error on unrecognized backslash escapes. If set, -- fail with an error on patterns that contain backslash-escaped -- ASCII letters without a known special meaning. If this flag is -- not set, these escaped letters represent themselves. | WorkLimit Int -- ^ Set a processing limit for match operations. -- -- Some patterns, when matching certain strings, can run in -- exponential time. For practical purposes, the match operation -- may appear to be in an infinite loop. When a limit is set a -- match operation will fail with an error if the limit is -- exceeded. -- -- The units of the limit are steps of the match engine. -- Correspondence with actual processor time will depend on the -- speed of the processor and the details of the specific pattern, -- but will typically be on the order of milliseconds. -- -- By default, the matching time is not limited. | StackLimit Int -- ^ Set the amount of heap storage avaliable for use by the match -- backtracking stack. -- -- ICU uses a backtracking regular expression engine, with the -- backtrack stack maintained on the heap. This function sets the -- limit to the amount of memory that can be used for this -- purpose. A backtracking stack overflow will result in an error -- from the match operation that caused it. -- -- A limit is desirable because a malicious or poorly designed -- pattern can use excessive memory, potentially crashing the -- process. A limit is enabled by default. deriving (Eq, Show, Typeable) data Haystack = H (ForeignPtr Word16) {-# UNPACK #-} !T.I16 -- | A compiled regular expression. -- -- 'Regex' values are usually constructed using the 'regex' or -- 'regex'' functions. This type is also an instance of 'IsString', -- so if you have the @OverloadedStrings@ language extension enabled, -- you can construct a 'Regex' by simply writing the pattern in -- quotes (though this does not allow you to specify any 'Option's). data Regex = Regex { reRe :: ForeignPtr URegularExpression , reText :: IORef Haystack } emptyForeignPtr :: ForeignPtr Word16 emptyForeignPtr = unsafePerformIO $ fst `fmap` T.asForeignPtr T.empty {-# NOINLINE emptyForeignPtr #-} -- | Compile a regular expression with the given options. This -- function throws a 'ParseError' if the pattern is invalid. -- -- The 'Regex' is initialized with empty text to search against. regex :: [MatchOption] -> Text -> IO Regex regex opts pat = T.useAsPtr pat $ \pptr plen -> do let (flags,workLimit,stackLimit) = toURegexpOpts opts ptr <- handleParseError isRegexError $ uregex_open pptr (fromIntegral plen) flags refp <- newForeignPtr uregex_close ptr (hayfp, hayLen) <- T.asForeignPtr T.empty withForeignPtr refp $ \rePtr -> withForeignPtr hayfp $ \hayPtr -> handleError $ uregex_setText rePtr hayPtr (fromIntegral hayLen) when (workLimit > -1) . handleError $ uregex_setTimeLimit ptr (fromIntegral workLimit) when (stackLimit > -1) . handleError $ uregex_setStackLimit ptr (fromIntegral stackLimit) touchForeignPtr refp Regex refp `fmap` newIORef (H hayfp 0) data URegularExpression type URegexpFlag = Word32 toURegexpOpts :: [MatchOption] -> (URegexpFlag,Int,Int) toURegexpOpts = foldl go (0,-1,-1) where go (!flag,work,stack) opt = (flag+flag',work',stack') where flag' = case opt of CaseInsensitive -> #const UREGEX_CASE_INSENSITIVE Comments -> #const UREGEX_COMMENTS DotAll -> #const UREGEX_DOTALL Literal -> #const UREGEX_LITERAL Multiline -> #const UREGEX_MULTILINE HaskellLines -> #const UREGEX_UNIX_LINES UnicodeWord -> #const UREGEX_UWORD ErrorOnUnknownEscapes -> #const UREGEX_ERROR_ON_UNKNOWN_ESCAPES _ -> 0 work' = case opt of WorkLimit limit -> limit _ -> work stack' = case opt of StackLimit limit -> limit _ -> stack foreign import ccall unsafe "hs_text_icu.h __hs_uregex_open" uregex_open :: Ptr UChar -> Int32 -> Word32 -> Ptr UParseError -> Ptr UErrorCode -> IO (Ptr URegularExpression) foreign import ccall unsafe "hs_text_icu.h &__hs_uregex_close" uregex_close :: FunPtr (Ptr URegularExpression -> IO ()) foreign import ccall unsafe "hs_text_icu.h __hs_uregex_clone" uregex_clone :: Ptr URegularExpression -> Ptr UErrorCode -> IO (Ptr URegularExpression) foreign import ccall unsafe "hs_text_icu.h __hs_uregex_pattern" uregex_pattern :: Ptr URegularExpression -> Ptr Int32 -> Ptr UErrorCode -> IO (Ptr UChar) foreign import ccall unsafe "hs_text_icu.h __hs_uregex_setText" uregex_setText :: Ptr URegularExpression -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO () foreign import ccall unsafe "hs_text_icu.h __hs_uregex_getText" uregex_getText :: Ptr URegularExpression -> Ptr Int32 -> Ptr UErrorCode -> IO (Ptr UChar) foreign import ccall unsafe "hs_text_icu.h __hs_uregex_find" uregex_find :: Ptr URegularExpression -> Int32 -> Ptr UErrorCode -> IO UBool foreign import ccall unsafe "hs_text_icu.h __hs_uregex_findNext" uregex_findNext :: Ptr URegularExpression -> Ptr UErrorCode -> IO UBool foreign import ccall unsafe "hs_text_icu.h __hs_uregex_start" uregex_start :: Ptr URegularExpression -> Int32 -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_uregex_end" uregex_end :: Ptr URegularExpression -> Int32 -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_uregex_groupCount" uregex_groupCount :: Ptr URegularExpression -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_uregex_group" uregex_group :: Ptr URegularExpression -> Int32 -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO Int32 foreign import ccall unsafe "hs_text_icu.h __hs_uregex_setTimeLimit" uregex_setTimeLimit :: Ptr URegularExpression -> Int32 -> Ptr UErrorCode -> IO () foreign import ccall unsafe "hs_text_icu.h __hs_uregex_setStackLimit" uregex_setStackLimit :: Ptr URegularExpression -> Int32 -> Ptr UErrorCode -> IO () text-icu-0.6.3.7/Data/Text/ICU/Regex/Pure.hs0000644000000000000000000001611412234226032016352 0ustar0000000000000000{-# LANGUAGE BangPatterns, EmptyDataDecls, ScopedTypeVariables #-} -- | -- Module : Data.Text.ICU.Regex.Pure -- Copyright : (c) 2010 Bryan O'Sullivan -- -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : GHC -- -- Regular expression support for Unicode, implemented as bindings to -- the International Components for Unicode (ICU) libraries. -- -- The functions in this module are pure and hence thread safe, but -- may not be as fast or as flexible as those in the -- 'Data.Text.ICU.Regex.IO' module. -- -- The syntax and behaviour of ICU regular expressions are Perl-like. -- For complete details, see the ICU User Guide entry at -- . module Data.Text.ICU.Regex.Pure ( -- * Types MatchOption(..) , ParseError(errError, errLine, errOffset) , Match , Regex , Regular -- * Functions -- ** Construction , regex , regex' -- ** Inspection , pattern -- ** Searching , find , findAll -- ** Match groups -- $group , groupCount , unfold , span , group , prefix , suffix ) where import qualified Control.Exception as E import Data.String (IsString(..)) import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Foreign as T import Data.Text.ICU.Error.Internal (ParseError(..), handleError) import qualified Data.Text.ICU.Regex as IO import Data.Text.ICU.Regex.Internal hiding (Regex(..), regex) import qualified Data.Text.ICU.Regex.Internal as Internal import Foreign.ForeignPtr (ForeignPtr, withForeignPtr) import Foreign.Marshal.Alloc (alloca) import Foreign.Marshal.Array (advancePtr) import Foreign.Storable (peek) import Prelude hiding (span) import System.IO.Unsafe (unsafeInterleaveIO, unsafePerformIO) -- | A compiled regular expression. -- -- 'Regex' values are usually constructed using the 'regex' or -- 'regex'' functions. This type is also an instance of 'IsString', -- so if you have the @OverloadedStrings@ language extension enabled, -- you can construct a 'Regex' by simply writing the pattern in -- quotes (though this does not allow you to specify any 'Option's). newtype Regex = Regex { reRe :: Internal.Regex } instance Show Regex where show re = "Regex " ++ show (pattern re) instance IsString Regex where fromString = regex [] . T.pack -- | A match for a regular expression. data Match = Match { matchRe :: Internal.Regex , _matchPrev :: T.I16 } instance Show Match where show m = "Match " ++ show (unfold group m) -- | A typeclass for functions common to both 'Match' and 'Regex' -- types. class Regular r where regRe :: r -> Internal.Regex regFp :: r -> ForeignPtr URegularExpression regFp = Internal.reRe . regRe {-# INLINE regFp #-} instance Regular Match where regRe = matchRe instance Regular Regex where regRe = reRe -- | Compile a regular expression with the given options. This -- function throws a 'ParseError' if the pattern is invalid, so it is -- best for use when the pattern is statically known. regex :: [MatchOption] -> Text -> Regex regex opts pat = Regex . unsafePerformIO $ IO.regex opts pat -- | Compile a regular expression with the given options. This is -- safest to use when the pattern is constructed at run time. regex' :: [MatchOption] -> Text -> Either ParseError Regex regex' opts pat = unsafePerformIO $ ((Right . Regex) `fmap` Internal.regex opts pat) `E.catch` \(err::ParseError) -> return (Left err) -- | Return the source form of the pattern used to construct this -- regular expression or match. pattern :: Regular r => r -> Text pattern r = unsafePerformIO . withForeignPtr (regFp r) $ \rePtr -> alloca $ \lenPtr -> do textPtr <- handleError $ uregex_pattern rePtr lenPtr (T.fromPtr textPtr . fromIntegral) =<< peek lenPtr -- | Find the first match for the regular expression in the given text. find :: Regex -> Text -> Maybe Match find re0 haystack = unsafePerformIO . matching re0 haystack $ \re -> do m <- IO.findNext re return $! if m then Just (Match re 0) else Nothing -- | Lazily find all matches for the regular expression in the given -- text. findAll :: Regex -> Text -> [Match] findAll re0 haystack = unsafePerformIO . unsafeInterleaveIO $ go 0 where len = fromIntegral . T.lengthWord16 $ haystack go !n | n >= len = return [] | otherwise = matching re0 haystack $ \re -> do found <- IO.find re n if found then do n' <- IO.end_ re 0 (Match re n:) `fmap` go n' else return [] matching :: Regex -> Text -> (IO.Regex -> IO a) -> IO a matching (Regex re0) haystack act = do re <- IO.clone re0 IO.setText re haystack act re -- $group -- -- Capturing groups are numbered starting from zero. Group zero is -- always the entire matching text. Groups greater than zero contain -- the text matching each capturing group in a regular expression. -- | Return the number of capturing groups in this regular -- expression or match's pattern. groupCount :: Regular r => r -> Int groupCount = unsafePerformIO . IO.groupCount . regRe {-# INLINE groupCount #-} -- | A combinator for returning a list of all capturing groups on a -- 'Match'. unfold :: (Int -> Match -> Maybe Text) -> Match -> [Text] unfold f m = go 0 where go !n = case f n m of Nothing -> [] Just z -> z : go (n+1) -- | Return the /n/th capturing group in a match, or 'Nothing' if /n/ -- is out of bounds. group :: Int -> Match -> Maybe Text group n m = grouping n m $ \re -> do let n' = fromIntegral n start <- fromIntegral `fmap` IO.start_ re n' end <- fromIntegral `fmap` IO.end_ re n' (fp,_) <- IO.getText re withForeignPtr fp $ \ptr -> T.fromPtr (ptr `advancePtr` fromIntegral start) (end - start) -- | Return the prefix of the /n/th capturing group in a match (the -- text from the start of the string to the start of the match), or -- 'Nothing' if /n/ is out of bounds. prefix :: Int -> Match -> Maybe Text prefix n m = grouping n m $ \re -> do start <- fromIntegral `fmap` IO.start_ re n (fp,_) <- IO.getText re withForeignPtr fp (`T.fromPtr` start) -- | Return the span of text between the end of the previous match and -- the beginning of the current match. span :: Match -> Text span (Match re p) = unsafePerformIO $ do start <- IO.start_ re 0 (fp,_) <- IO.getText re withForeignPtr fp $ \ptr -> T.fromPtr (ptr `advancePtr` fromIntegral p) (start - p) -- | Return the suffix of the /n/th capturing group in a match (the -- text from the end of the match to the end of the string), or -- 'Nothing' if /n/ is out of bounds. suffix :: Int -> Match -> Maybe Text suffix n m = grouping n m $ \re -> do end <- fromIntegral `fmap` IO.end_ re n (fp,len) <- IO.getText re withForeignPtr fp $ \ptr -> do T.fromPtr (ptr `advancePtr` fromIntegral end) (len - end) grouping :: Int -> Match -> (Internal.Regex -> IO a) -> Maybe a grouping n (Match m _) act = unsafePerformIO $ do count <- IO.groupCount m let n' = fromIntegral n if n' == 0 || (n' >= 0 && n' <= count) then Just `fmap` act m else return Nothing text-icu-0.6.3.7/include/0000755000000000000000000000000012234226032013234 5ustar0000000000000000text-icu-0.6.3.7/include/hs_text_icu.h0000644000000000000000000001653012234226032015730 0ustar0000000000000000#ifdef WIN32 #define U_HAVE_INTTYPES_H 1 #endif #include "unicode/utypes.h" #include "unicode/ubrk.h" #include "unicode/uchar.h" #include "unicode/ucol.h" #include "unicode/ucnv.h" #include "unicode/uiter.h" #include "unicode/unorm.h" #include "unicode/uregex.h" #include "unicode/ustring.h" #include /* ubrk.h */ UBreakIterator* __hs_ubrk_open(UBreakIteratorType type, const char *locale, const UChar *text, int32_t textLength, UErrorCode *status); void __hs_ubrk_close(UBreakIterator *bi); void __hs_ubrk_setText(UBreakIterator* bi, const UChar *text, int32_t textLength, UErrorCode *status); UBreakIterator * __hs_ubrk_safeClone(const UBreakIterator *bi, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); int32_t __hs_ubrk_current(UBreakIterator *bi); int32_t __hs_ubrk_first(UBreakIterator *bi); int32_t __hs_ubrk_last(UBreakIterator *bi); int32_t __hs_ubrk_next(UBreakIterator *bi); int32_t __hs_ubrk_previous(UBreakIterator *bi); int32_t __hs_ubrk_preceding(UBreakIterator *bi, int32_t offset); int32_t __hs_ubrk_following(UBreakIterator *bi, int32_t offset); int32_t __hs_ubrk_getRuleStatus(UBreakIterator *bi); int32_t __hs_ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status); UBool __hs_ubrk_isBoundary(UBreakIterator *bi, int32_t offset); int32_t __hs_ubrk_countAvailable(void); const char* __hs_ubrk_getAvailable(int32_t index); /* uchar.h */ UBlockCode __hs_ublock_getCode(UChar32 c); UCharDirection __hs_u_charDirection(UChar32 c); UBool __hs_u_isMirrored(UChar32 c); UChar32 __hs_u_charMirror(UChar32 c); uint8_t __hs_u_getCombiningClass(UChar32 c); int32_t __hs_u_charDigitValue(UChar32 c); int32_t __hs_u_charName(UChar32 code, UCharNameChoice nameChoice, char *buffer, int32_t bufferLength, UErrorCode *pErrorCode); UChar32 __hs_u_charFromName(UCharNameChoice nameChoice, const char *name, UErrorCode *pErrorCode); int32_t __hs_u_getISOComment(UChar32 c, char *dest, int32_t destCapacity, UErrorCode *pErrorCode); int32_t __hs_u_getIntPropertyValue(UChar32 c, UProperty which); double __hs_u_getNumericValue(UChar32 c); /* ucol.h */ UCollator* __hs_ucol_open(const char *loc, UErrorCode *status); void __hs_ucol_close(UCollator *coll); void __hs_ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status); UColAttributeValue __hs_ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status); UCollationResult __hs_ucol_strcoll(const UCollator *coll, const UChar *source, int32_t sourceLength, const UChar *target, int32_t targetLength); UCollationResult __hs_ucol_strcollIter(const UCollator *coll, UCharIterator *sIter, UCharIterator *tIter, UErrorCode *status); UCollator* __hs_ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); int32_t __hs_ucol_getSortKey(const UCollator *coll, const UChar *source, int32_t sourceLength, uint8_t *result, int32_t resultLength); UBool __hs_ucol_equals(const UCollator *source, const UCollator *target); /* ucnv.h */ int __get_max_bytes_for_string(UConverter *cnv, int src_length); const char *__hs_u_errorName(UErrorCode code); const char *__hs_ucnv_getName(const UConverter *converter, UErrorCode *err); UConverter* __hs_ucnv_open(const char *converterName, UErrorCode *err); void __hs_ucnv_close(UConverter * converter); int32_t __hs_ucnv_toUChars(UConverter *cnv, UChar *dest, int32_t destCapacity, const char *src, int32_t srcLength, UErrorCode *pErrorCode); int32_t __hs_ucnv_fromUChars(UConverter *cnv, char *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, UErrorCode *pErrorCode); int __hs_ucnv_compareNames(const char *name1, const char *name2); const char *__hs_ucnv_getDefaultName(void); void __hs_ucnv_setDefaultName(const char *name); int32_t __hs_ucnv_countAvailable(void); const char* __hs_ucnv_getAvailableName(int32_t n); uint16_t __hs_ucnv_countAliases(const char *alias, UErrorCode *pErrorCode); const char *__hs_ucnv_getAlias(const char *alias, uint16_t n, UErrorCode *pErrorCode); uint16_t __hs_ucnv_countStandards(void); const char *__hs_ucnv_getStandard(uint16_t n, UErrorCode *pErrorCode); UBool __hs_ucnv_usesFallback(const UConverter *cnv); void __hs_ucnv_setFallback(UConverter *cnv, UBool usesFallback); UBool __hs_ucnv_isAmbiguous(const UConverter *cnv); /* uiter.h */ void __hs_uiter_setString(UCharIterator *iter, const UChar *s, int32_t length); void __hs_uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length); /* unorm.h */ int32_t __hs_unorm_compare(const UChar *s1, int32_t length1, const UChar *s2, int32_t length2, uint32_t options, UErrorCode *pErrorCode); UNormalizationCheckResult __hs_unorm_quickCheck(const UChar *source, int32_t sourcelength, UNormalizationMode mode, UErrorCode *status); UBool __hs_unorm_isNormalized(const UChar *src, int32_t srcLength, UNormalizationMode mode, UErrorCode *pErrorCode); int32_t __hs_unorm_normalize(const UChar *source, int32_t sourceLength, UNormalizationMode mode, int32_t options, UChar *result, int32_t resultLength, UErrorCode *status); /* uregex.h */ URegularExpression * __hs_uregex_open(const UChar *pattern, int32_t patternLength, uint32_t flags, UParseError *pe, UErrorCode *status); void __hs_uregex_setTimeLimit(URegularExpression *regexp, int32_t limit, UErrorCode *status); void __hs_uregex_setStackLimit(URegularExpression *regexp, int32_t limit, UErrorCode *status); void __hs_uregex_close(URegularExpression *regexp); URegularExpression *__hs_uregex_clone(URegularExpression *regexp, UErrorCode *pErrorCode); const UChar *__hs_uregex_pattern(const URegularExpression *regexp, int32_t *patLength, UErrorCode *status); int32_t __hs_uregex_flags(const URegularExpression *regexp, UErrorCode *status); void __hs_uregex_setText(URegularExpression *regexp, const UChar *text, int32_t textLength, UErrorCode *status); const UChar *__hs_uregex_getText(URegularExpression *regexp, int32_t *textLength, UErrorCode *status); UBool __hs_uregex_find(URegularExpression *regexp, int32_t startIndex, UErrorCode *status); UBool __hs_uregex_findNext(URegularExpression *regexp, UErrorCode *status); int32_t __hs_uregex_start(URegularExpression *regexp, int32_t groupNum, UErrorCode *status); int32_t __hs_uregex_end(URegularExpression *regexp, int32_t groupNum, UErrorCode *status); int32_t __hs_uregex_groupCount(URegularExpression *regexp, UErrorCode *status); int32_t __hs_uregex_group(URegularExpression *regexp, int32_t groupNum, UChar *dest, int32_t destCapacity, UErrorCode *status); /* ustring.h */ int32_t __hs_u_strFoldCase(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, uint32_t options, UErrorCode *pErrorCode); int32_t __hs_u_strToUpper(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, const char *locale, UErrorCode *pErrorCode); int32_t __hs_u_strToLower(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, const char *locale, UErrorCode *pErrorCode); int32_t __hs_u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2); text-icu-0.6.3.7/tests/0000755000000000000000000000000012234226032012753 5ustar0000000000000000text-icu-0.6.3.7/tests/Properties.hs0000644000000000000000000000464112234226032015450 0ustar0000000000000000-- Tester beware! -- -- Many of the tests below are "weak", i.e. they ensure that functions -- return results, without checking whether the results are correct. -- Weak tests are described as such. {-# LANGUAGE OverloadedStrings #-} {-# OPTIONS_GHC -fno-warn-missing-signatures #-} module Properties (tests) where import Control.DeepSeq (NFData(..)) import Data.Function (on) import Data.Maybe (fromMaybe) import Data.Text (Text) import Data.Text.ICU (NormalizationMode(..)) import QuickCheckUtils () import Test.Framework (Test, testGroup) import Test.Framework.Providers.QuickCheck2 (testProperty) import qualified Data.Text as T import qualified Data.Text.Encoding as T import qualified Data.Text.ICU as I t_rnf :: (NFData a) => (Text -> a) -> Text -> Bool t_rnf f t = rnf (f t) == () t_nonEmpty :: (Text -> Text) -> Text -> Bool t_nonEmpty f t | T.null t = T.null ft | otherwise = T.length ft > 0 where ft = f t -- Case mapping -- These tests are all fairly weak. t_toCaseFold bool = t_nonEmpty $ I.toCaseFold bool t_toLower locale = t_nonEmpty $ I.toLower locale t_toUpper locale = t_nonEmpty $ I.toUpper locale -- Iteration t_charIterator_String a b = (compare `on` I.fromString) a b == compare a b t_charIterator_Text a b = (compare `on` I.fromText) a b == compare a b t_charIterator_Utf8 a b = (compare `on` I.fromUtf8) ba bb == compare ba bb where ba = T.encodeUtf8 a; bb = T.encodeUtf8 b -- Normalization t_normalize mode = t_nonEmpty $ I.normalize mode t_quickCheck_isNormalized mode normMode txt | mode `elem` [NFD, NFKD, FCD] = quickCheck == Just isNormalized | otherwise = fromMaybe isNormalized quickCheck == isNormalized where quickCheck = I.quickCheck mode normTxt isNormalized = I.isNormalized mode normTxt normTxt = I.normalize normMode txt -- Collation -- This test is weak. t_collate_root txt = t_rnf $ I.collate I.uca txt tests :: Test tests = testGroup "Properties" [ testProperty "t_toCaseFold" t_toCaseFold , testProperty "t_toLower" t_toLower , testProperty "t_toUpper" t_toUpper , testProperty "t_charIterator_String" t_charIterator_String , testProperty "t_charIterator_Text" t_charIterator_Text , testProperty "t_charIterator_Utf8" t_charIterator_Utf8 , testProperty "t_normalize" t_normalize , testProperty "t_quickCheck_isNormalized" t_quickCheck_isNormalized , testProperty "t_collate_root" t_collate_root ] text-icu-0.6.3.7/tests/QuickCheckUtils.hs0000644000000000000000000000141412234226032016342 0ustar0000000000000000{-# OPTIONS_GHC -fno-warn-orphans #-} module QuickCheckUtils () where import Control.Applicative ((<$>)) import Control.DeepSeq (NFData(..)) import Data.Text.ICU (Collator, LocaleName(..), NormalizationMode(..)) import Data.Text.ICU.Break (available) import Test.QuickCheck (Arbitrary(..), elements) import qualified Data.Text as T import qualified Data.Text.ICU as I instance NFData Ordering where rnf v = v `seq` () instance Arbitrary T.Text where arbitrary = T.pack `fmap` arbitrary shrink = map T.pack . shrink . T.unpack instance Arbitrary LocaleName where arbitrary = elements (Root:available) instance Arbitrary NormalizationMode where arbitrary = elements [None ..FCD] instance Arbitrary Collator where arbitrary = I.collator <$> arbitrary text-icu-0.6.3.7/tests/Tests.hs0000644000000000000000000000022012234226032014403 0ustar0000000000000000module Main (main) where import Test.Framework (defaultMain) import qualified Properties main :: IO () main = defaultMain [Properties.tests]