iconv-0.4.1.1/0000755000000000000000000000000012044312414011152 5ustar0000000000000000iconv-0.4.1.1/README0000644000000000000000000000224312044312414012033 0ustar0000000000000000 Codec.Text.IConv ================ This is a Haskell binding to the iconv() C library function. The only module exported is Codec.Text.IConv, which provides a single function: -- | Convert fromCharset toCharset input output convert :: String -> String -> Lazy.ByteString -> Lazy.ByteString where fromCharset and toCharset are the names of the input and output character set encodings, and input and output are the input and output text as lazy ByteStrings. An example program to convert the encoding of an input file, similar to the iconv program, is given in examples/hiconv.hs Character set encodings ----------------------- To see a list of encoding names which are known by your operating system, run "iconv --list" in a shell. Likely encodings are listed on the libiconv web page: http://www.gnu.org/software/libiconv/ Availability of iconv() ----------------------- The iconv(3) function conforms to POSIX.1-2001. It is provided by the GNU C library: http://www.gnu.org/software/libc/manual/html_node/Character-Set-Handling.html On systems which do not have a native iconv() implementation you may need to install libiconv: http://www.gnu.org/software/libiconv/ iconv-0.4.1.1/Setup.hs0000644000000000000000000000010412044312414012601 0ustar0000000000000000#!/usr/bin/runhaskell import Distribution.Simple main = defaultMain iconv-0.4.1.1/iconv.cabal0000644000000000000000000000222712044312414013257 0ustar0000000000000000name: iconv version: 0.4.1.1 copyright: (c) 2006-2008 Duncan Coutts license: BSD3 license-file: LICENSE author: Duncan Coutts maintainer: Duncan Coutts category: Text synopsis: String encoding conversion description: Provides an interface to the POSIX iconv library functions for string encoding conversion. build-type: Simple cabal-version: >= 1.6 extra-source-files: README examples/hiconv.hs cbits/hsiconv.h source-repository head type: darcs location: http://code.haskell.org/iconv/ library exposed-modules: Codec.Text.IConv other-modules: Codec.Text.IConv.Internal build-depends: base >= 3 && < 5, bytestring == 0.9.* || ==0.10.* extensions: ForeignFunctionInterface includes: hsiconv.h include-dirs: cbits c-sources: cbits/hsiconv.c if os(darwin) || os(freebsd) -- on many systems the iconv api is part of the standard C library -- but on some others we have to link to an external libiconv: extra-libraries: iconv ghc-options: -Wall iconv-0.4.1.1/LICENSE0000644000000000000000000000243012044312414012156 0ustar0000000000000000Copyright (c) 2006-2007, Duncan Coutts All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. iconv-0.4.1.1/examples/0000755000000000000000000000000012044312414012770 5ustar0000000000000000iconv-0.4.1.1/examples/hiconv.hs0000644000000000000000000001015212044312414014611 0ustar0000000000000000{- - This example is similar to the commandline iconv program. - Author: Conrad Parker, July 2007 Usage: hiconv [options] filename -h, -? --help, --usage Display this help and exit -f encoding --from-code=encoding Convert characters from encoding -t encoding --to-code=encoding Convert characters to encoding -c --discard Discard invalid characters from output --transliterate Transliterate unconvertable characters -o file --output=file Specify output file (instead of stdout) -} module Main where import Control.Monad (when) import System.Environment (getArgs, getProgName) import System.Console.GetOpt (getOpt, usageInfo, OptDescr(..), ArgDescr(..), ArgOrder(..)) import System.Exit (exitFailure) import qualified Data.ByteString.Lazy as Lazy import qualified Codec.Text.IConv as IConv ------------------------------------------------------------ -- main -- main :: IO () main = do args <- getArgs (config, filenames) <- processArgs args let inputFile = head filenames input <- case inputFile of "-" -> Lazy.getContents _ -> Lazy.readFile inputFile let convert = case fuzzyConvert config of Nothing -> IConv.convert Just fuzz -> IConv.convertFuzzy fuzz output = convert (fromEncoding config) (toEncoding config) input o = outputFile config case o of "-" -> Lazy.putStr output _ -> Lazy.writeFile o output ------------------------------------------------------------ -- Option handling -- data Config = Config { fromEncoding :: String, toEncoding :: String, fuzzyConvert :: Maybe IConv.Fuzzy, outputFile :: FilePath } defaultConfig = Config { fromEncoding = "", toEncoding = "", fuzzyConvert = Nothing, outputFile = "-" } data Option = Help | FromEncoding String | ToEncoding String | Discard | Translit | OutputFile String deriving Eq options :: [OptDescr Option] options = [ Option ['h', '?'] ["help", "usage"] (NoArg Help) "Display this help and exit" , Option ['f'] ["from-code"] (ReqArg FromEncoding "encoding") "Convert characters from encoding" , Option ['t'] ["to-code"] (ReqArg ToEncoding "encoding") "Convert characters to encoding" , Option ['c'] ["discard"] (NoArg Discard) "Discard invalid characters from output" , Option [] ["transliterate"] (NoArg Translit) "Transliterate unconvertable characters" , Option ['o'] ["output"] (ReqArg OutputFile "file") "Specify output file (instead of stdout)" ] processArgs :: [String] -> IO (Config, [String]) processArgs args = do case getOpt Permute options args of (opts, args, errs) -> do processHelp opts let config = processConfig defaultConfig opts checkConfig errs config args return (config, args) checkConfig :: [String] -> Config -> [String] -> IO () checkConfig errs config filenames = do when (any null [fromEncoding config, toEncoding config] || null filenames) $ processHelp [Help] when (not (null errs)) $ do mapM_ putStr errs processHelp [Help] processHelp :: [Option] -> IO () processHelp opts = do name <- getProgName let header = "\nUsage: " ++ name ++ " [options] filename\n" when (Help `elem` opts) $ do putStrLn $ usageInfo header options exitFailure processConfig :: Config -> [Option] -> Config processConfig = foldl processOneOption where processOneOption config (FromEncoding f) = config {fromEncoding = f} processOneOption config (ToEncoding t) = config {toEncoding = t} processOneOption config (OutputFile o) = config {outputFile = o} processOneOption config Discard = config {fuzzyConvert = Just IConv.Discard} processOneOption config Translit = config {fuzzyConvert = Just IConv.Transliterate} iconv-0.4.1.1/cbits/0000755000000000000000000000000012044312414012256 5ustar0000000000000000iconv-0.4.1.1/cbits/hsiconv.c0000644000000000000000000000117612044312414014100 0ustar0000000000000000#include "hsiconv.h" /* On some platforms (notably darwin) the iconv functions are defined as * a macro rather than a real C function. Doh! That means we need these * wrappers to get a real C functions we can import via the Haskell FFI. */ iconv_t hs_wrap_iconv_open(const char *tocode, const char *fromcode) { return iconv_open(tocode, fromcode); } size_t hs_wrap_iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { return iconv(cd, inbuf, inbytesleft, outbuf, outbytesleft); } int hs_wrap_iconv_close(iconv_t cd) { return iconv_close(cd); } iconv-0.4.1.1/cbits/hsiconv.h0000644000000000000000000000042512044312414014101 0ustar0000000000000000#include iconv_t hs_wrap_iconv_open(const char *tocode, const char *fromcode); size_t hs_wrap_iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft); int hs_wrap_iconv_close(iconv_t cd); iconv-0.4.1.1/Codec/0000755000000000000000000000000012044312414012167 5ustar0000000000000000iconv-0.4.1.1/Codec/Text/0000755000000000000000000000000012044312414013113 5ustar0000000000000000iconv-0.4.1.1/Codec/Text/IConv.hs0000644000000000000000000004673512044312414014504 0ustar0000000000000000{-# LANGUAGE CPP #-} ----------------------------------------------------------------------------- -- | -- Copyright : (c) 2006-2007 Duncan Coutts -- License : BSD-style -- -- Maintainer : duncan@haskell.org -- Portability : portable (H98 + FFI) -- -- String encoding conversion -- ----------------------------------------------------------------------------- module Codec.Text.IConv ( -- | This module provides pure functions for converting the string encoding -- of strings represented by lazy 'ByteString's. This makes it easy to use -- either in memory or with disk or network IO. -- -- For example, a simple Latin1 to UTF-8 conversion program is just: -- -- > import Codec.Text.IConv as IConv -- > import Data.ByteString.Lazy as ByteString -- > -- > main = ByteString.interact (convert "LATIN1" "UTF-8") -- -- Or you could lazily read in and convert a UTF-8 file to UTF-32 using: -- -- > content <- fmap (IConv.convert "UTF-8" "UTF-32") (readFile file) -- -- This module uses the POSIX @iconv()@ library function. The primary -- advantage of using iconv is that it is widely available, most systems -- have a wide range of supported string encodings and the conversion speed -- it typically good. The iconv library is available on all unix systems -- (since it is required by the POSIX.1 standard) and GNU libiconv is -- available as a standalone library for other systems, including Windows. -- * Simple conversion API convert, EncodingName, -- * Variant that is lax about conversion errors convertFuzzy, Fuzzy(..), -- * Variants that are pedantic about conversion errors convertStrictly, convertLazily, ConversionError(..), reportConversionError, Span(..), ) where import Prelude hiding (length, span) import Control.Exception (assert) import qualified Control.Exception as Exception import Foreign.C.Error as C.Error (Errno, errnoToIOError) import qualified Data.ByteString.Lazy as L (ByteString, toChunks, fromChunks) import qualified Data.ByteString.Lazy.Internal as L (defaultChunkSize) import qualified Data.ByteString as S import qualified Codec.Text.IConv.Internal as IConv import Codec.Text.IConv.Internal (IConv) -- | A string encoding name, eg @\"UTF-8\"@ or @\"LATIN1\"@. -- -- The range of string encodings available is determined by the capabilities -- of the underlying iconv implementation. -- -- When using the GNU C or libiconv libraries, the permitted values are listed -- by the @iconv --list@ command, and all combinations of the listed values -- are supported. -- type EncodingName = String -- | Output spans from encoding conversion. When nothing goes wrong we -- expect just a bunch of 'Span's. If there are conversion errors we get other -- span types. -- data Span = -- | An ordinary output span in the target encoding Span !S.ByteString -- | An error in the conversion process. If this occurs it will be the -- last span. | ConversionError !ConversionError data ConversionError = -- | The conversion from the input to output string encoding is not -- supported by the underlying iconv implementation. This is usually -- because a named encoding is not recognised or support for it -- was not enabled on this system. -- -- The POSIX standard does not guarantee that all possible combinations -- of recognised string encoding are supported, however most common -- implementations do support all possible combinations. -- UnsuportedConversion EncodingName EncodingName -- | This covers two possible conversion errors: -- -- * There is a byte sequence in the input that is not valid in the input -- encoding. -- -- * There is a valid character in the input that has no corresponding -- character in the output encoding. -- -- Unfortunately iconv does not let us distinguish these two cases. In -- either case, the Int parameter gives the byte offset in the input of -- the unrecognised bytes or unconvertable character. -- | InvalidChar Int -- | This error covers the case where the end of the input has trailing -- bytes that are the initial bytes of a valid character in the input -- encoding. In other words, it looks like the input ended in the middle of -- a multi-byte character. This would often be an indication that the input -- was somehow truncated. Again, the Int parameter is the byte offset in -- the input where the incomplete character starts. -- | IncompleteChar Int -- | An unexpected iconv error. The iconv spec lists a number of possible -- expected errors but does not guarantee that there might not be other -- errors. -- -- This error can occur either immediately, which might indicate that the -- iconv installation is messed up somehow, or it could occur later which -- might indicate resource exhaustion or some other internal iconv error. -- -- Use 'Foreign.C.Error.errnoToIOError' to get slightly more information -- on what the error could possibly be. -- | UnexpectedError C.Error.Errno reportConversionError :: ConversionError -> IOError reportConversionError conversionError = case conversionError of UnsuportedConversion fromEncoding toEncoding -> err $ "cannot convert from string encoding " ++ show fromEncoding ++ " to string encoding " ++ show toEncoding InvalidChar inputPos -> err $ "invalid input sequence at byte offset " ++ show inputPos IncompleteChar inputPos -> err $ "incomplete input sequence at byte offset " ++ show inputPos UnexpectedError errno -> C.Error.errnoToIOError "Codec.Text.IConv: unexpected error" errno Nothing Nothing where err msg = userError $ "Codec.Text.IConv: " ++ msg {-# NOINLINE convert #-} -- | Convert text from one named string encoding to another. -- -- * The conversion is done lazily. -- -- * An exception is thrown if conversion between the two encodings is not -- supported. -- -- * An exception is thrown if there are any encoding conversion errors. -- convert :: EncodingName -- ^ Name of input string encoding -> EncodingName -- ^ Name of output string encoding -> L.ByteString -- ^ Input text -> L.ByteString -- ^ Output text convert fromEncoding toEncoding = -- lazily convert the list of spans into an ordinary lazy ByteString: L.fromChunks . foldr span [] . convertLazily fromEncoding toEncoding where span (Span c) cs = c : cs span (ConversionError e) _ = #if MIN_VERSION_base(4,0,0) Exception.throw (reportConversionError e) #else Exception.throw (Exception.IOException (reportConversionError e)) #endif data Fuzzy = Transliterate | Discard -- | Convert text ignoring encoding conversion problems. -- -- If invalid byte sequences are found in the input they are ignored and -- conversion continues if possible. This is not always possible especially -- with stateful encodings. No placeholder character is inserted into the -- output so there will be no indication that invalid byte sequences were -- encountered. -- -- If there are characters in the input that have no direct corresponding -- character in the output encoding then they are dealt in one of two ways, -- depending on the 'Fuzzy' argument. We can try and 'Transliterate' them into -- the nearest corresponding character(s) or use a replacement character -- (typically @\'?\'@ or the Unicode replacement character). Alternatively they -- can simply be 'Discard'ed. -- -- In either case, no exceptions will occur. In the case of unrecoverable -- errors, the output will simply be truncated. This includes the case of -- unrecognised or unsupported encoding names; the output will be empty. -- -- * This function only works with the GNU iconv implementation which provides -- this feature beyond what is required by the iconv specification. -- convertFuzzy :: Fuzzy -- ^ Whether to try and transliterate or -- discard characters with no direct conversion -> EncodingName -- ^ Name of input string encoding -> EncodingName -- ^ Name of output string encoding -> L.ByteString -- ^ Input text -> L.ByteString -- ^ Output text convertFuzzy fuzzy fromEncoding toEncoding = -- lazily convert the list of spans into an ordinary lazy ByteString: L.fromChunks . foldr span [] . convertInternal IgnoreInvalidChar fromEncoding (toEncoding ++ mode) where mode = case fuzzy of Transliterate -> "//IGNORE,TRANSLIT" Discard -> "//IGNORE" span (Span c) cs = c : cs span (ConversionError _) cs = cs {-# NOINLINE convertStrictly #-} -- | This variant does the conversion all in one go, so it is able to report -- any conversion errors up front. It exposes all the possible error conditions -- and never throws exceptions -- -- The disadvantage is that no output can be produced before the whole input -- is consumed. This might be problematic for very large inputs. -- convertStrictly :: EncodingName -- ^ Name of input string encoding -> EncodingName -- ^ Name of output string encoding -> L.ByteString -- ^ Input text -> Either L.ByteString ConversionError -- ^ Output text or conversion error convertStrictly fromEncoding toEncoding = -- strictly convert the list of spans into an ordinary lazy ByteString -- or an error strictify [] . convertLazily fromEncoding toEncoding where strictify :: [S.ByteString] -> [Span] -> Either L.ByteString ConversionError strictify cs [] = Left (L.fromChunks (reverse cs)) strictify cs (Span c : ss) = strictify (c:cs) ss strictify _ (ConversionError e:_) = Right e {-# NOINLINE convertLazily #-} -- | This version provides a more complete but less convenient conversion -- interface. It exposes all the possible error conditions and never throws -- exceptions. -- -- The conversion is still lazy. It returns a list of spans, where a span may -- be an ordinary span of output text or a conversion error. This somewhat -- complex interface allows both for lazy conversion and for precise reporting -- of conversion problems. The other functions 'convert' and 'convertStrictly' -- are actually simple wrappers on this function. -- convertLazily :: EncodingName -- ^ Name of input string encoding -> EncodingName -- ^ Name of output string encoding -> L.ByteString -- ^ Input text -> [Span] -- ^ Output text spans convertLazily = convertInternal StopOnInvalidChar data InvalidCharBehaviour = StopOnInvalidChar | IgnoreInvalidChar convertInternal :: InvalidCharBehaviour -> EncodingName -> EncodingName -> L.ByteString -> [Span] convertInternal ignore fromEncoding toEncoding input = IConv.run fromEncoding toEncoding $ \status -> case status of IConv.InitOk -> do IConv.newOutputBuffer outChunkSize fillInputBuffer ignore (L.toChunks input) IConv.UnsupportedConversion -> failConversion (UnsuportedConversion fromEncoding toEncoding) IConv.UnexpectedInitError errno -> failConversion (UnexpectedError errno) fillInputBuffer :: InvalidCharBehaviour -> [S.ByteString] -> IConv [Span] fillInputBuffer ignore (inChunk : inChunks) = do IConv.pushInputBuffer inChunk drainBuffers ignore inChunks fillInputBuffer _ignore [] = do outputBufferBytesAvailable <- IConv.outputBufferBytesAvailable IConv.finalise if outputBufferBytesAvailable > 0 then do outChunk <- IConv.popOutputBuffer return [Span outChunk] else return [] drainBuffers :: InvalidCharBehaviour -> [S.ByteString] -> IConv [Span] drainBuffers ignore inChunks = do inputBufferEmpty_ <- IConv.inputBufferEmpty outputBufferFull <- IConv.outputBufferFull assert (not outputBufferFull && not inputBufferEmpty_) $ return () -- this invariant guarantees we can always make forward progress status <- IConv.iconv case status of IConv.InputEmpty -> do inputBufferEmpty <- IConv.inputBufferEmpty assert inputBufferEmpty $ fillInputBuffer ignore inChunks IConv.OutputFull -> do outChunk <- IConv.popOutputBuffer outChunks <- IConv.unsafeInterleave $ do IConv.newOutputBuffer outChunkSize drainBuffers ignore inChunks return (Span outChunk : outChunks) IConv.InvalidChar -> invalidChar ignore inChunks IConv.IncompleteChar -> fixupBoundary ignore inChunks IConv.UnexpectedError errno -> failConversion (UnexpectedError errno) -- | The posix iconv api looks like it's designed specifically for streaming -- and it is, except for one really really annoying corner case... -- -- Suppose you're converting a stream, say by reading a file in 4k chunks. This -- would seem to be the canonical use case for iconv, reading and converting an -- input file. However suppose the 4k read chunk happens to split a multi-byte -- character. Then iconv will stop just before that char and tell us that its -- an incomplete char. So far so good. Now what we'd like to do is have iconv -- remember those last few bytes in its conversion state so we can carry on -- with the next 4k block. Sadly it does not. It requires us to fix things up -- so that it can carry on with the next block starting with a complete multi- -- byte character. Do do that we have to somehow copy those few trailing bytes -- to the beginning of the next block. That's perhaps not too bad in an -- imperitive context using a mutable input buffer - we'd just copy the few -- trailing bytes to the beginning of the buffer and do a short read (ie 4k-n -- the number of trailing bytes). That's not terribly nice since it means the -- OS has to do IO on non-page aligned buffers which tends to be slower. It's -- worse for us though since we're not using a mutable input buffer, we're -- using a lazy bytestring which is a sequence of immutable buffers. -- -- So we have to do more cunning things. We could just prepend the trailing -- bytes to the next block, but that would mean alocating and copying the whole -- next block just to prepend a couple bytes. This probably happens quite -- frequently so would be pretty slow. So we have to be even more cunning. -- -- The solution is to create a very small buffer to cover the few bytes making -- up the character spanning the block boundary. So we copy the trailing bytes -- plus a few from the beginning of the next block. Then we run iconv again on -- that small buffer. How many bytes from the next block to copy is a slightly -- tricky issue. If we copy too few there's no guarantee that we have enough to -- give a complete character. We opt for a maximum size of 16, 'tmpChunkSize' -- on the theory that no encoding in existance uses that many bytes to encode a -- single character, so it ought to be enough. Yeah, it's a tad dodgey. -- -- Having papered over the block boundary, we still have to cross the boundary -- of this small buffer. It looks like we've still got the same problem, -- however this time we should have crossed over into bytes that are wholly -- part of the large following block so we can abandon our small temp buffer -- an continue with the following block, with a slight offset for the few bytes -- taken up by the chars that fit into the small buffer. -- -- So yeah, pretty complex. Check out the proof below of the tricky case. -- fixupBoundary :: InvalidCharBehaviour -> [S.ByteString] -> IConv [Span] fixupBoundary _ignore [] = do inputPos <- IConv.inputPosition failConversion (IncompleteChar inputPos) fixupBoundary ignore inChunks@(inChunk : inChunks') = do inSize <- IConv.inputBufferSize assert (inSize < tmpChunkSize) $ return () let extraBytes = tmpChunkSize - inSize if S.length inChunk <= extraBytes then do IConv.replaceInputBuffer (`S.append` inChunk) drainBuffers ignore inChunks' else do IConv.replaceInputBuffer (`S.append` S.take extraBytes inChunk) before <- IConv.inputBufferSize assert (before == tmpChunkSize) $ return () status <- IConv.iconv after <- IConv.inputBufferSize let consumed = before - after case status of IConv.InputEmpty -> assert (consumed == tmpChunkSize) $ fillInputBuffer ignore (S.drop extraBytes inChunk : inChunks') IConv.OutputFull -> do outChunk <- IConv.popOutputBuffer outChunks <- IConv.unsafeInterleave $ do IConv.newOutputBuffer outChunkSize drainBuffers ignore inChunks return (Span outChunk : outChunks) IConv.InvalidChar -> invalidChar ignore inChunks IConv.IncompleteChar -> assert (inSize < consumed && consumed < tmpChunkSize) $ -- inSize < consumed < tmpChunkSize -- => { subtract inSize from each side } -- 0 < consumed - inSize < tmpChunkSize - inSize -- => { by definition that extraBytes = tmpChunkSize - inSize } -- 0 < consumed - inSize < extraBytes -- => { since we're in the False case of the if, we know: -- not (S.length inChunk <= extraBytes) -- = S.length inChunk > extraBytes -- = extraBytes < S.length inChunk } -- 0 < consumed - inSize < extraBytes < S.length inChunk -- -- And we're done! We know it's safe to drop (consumed - inSize) from -- inChunk since it's more than 0 and less than the inChunk size, so -- we're not being left with an empty chunk (which is not allowed). drainBuffers ignore (S.drop (consumed - inSize) inChunk : inChunks') IConv.UnexpectedError errno -> failConversion (UnexpectedError errno) invalidChar :: InvalidCharBehaviour -> [S.ByteString] -> IConv [Span] invalidChar StopOnInvalidChar _ = do inputPos <- IConv.inputPosition failConversion (InvalidChar inputPos) invalidChar IgnoreInvalidChar inChunks = do inputPos <- IConv.inputPosition let invalidCharError = ConversionError (InvalidChar inputPos) outputBufferBytesAvailable <- IConv.outputBufferBytesAvailable if outputBufferBytesAvailable > 0 then do outChunk <- IConv.popOutputBuffer outChunks <- IConv.unsafeInterleave $ do IConv.newOutputBuffer outChunkSize inputBufferEmpty <- IConv.inputBufferEmpty if inputBufferEmpty then fillInputBuffer IgnoreInvalidChar inChunks else drainBuffers IgnoreInvalidChar inChunks return (Span outChunk : invalidCharError : outChunks) else do outChunks <- IConv.unsafeInterleave $ do IConv.newOutputBuffer outChunkSize inputBufferEmpty <- IConv.inputBufferEmpty if inputBufferEmpty then fillInputBuffer IgnoreInvalidChar inChunks else drainBuffers IgnoreInvalidChar inChunks return (invalidCharError : outChunks) failConversion :: ConversionError -> IConv [Span] failConversion err = do outputBufferBytesAvailable <- IConv.outputBufferBytesAvailable IConv.finalise if outputBufferBytesAvailable > 0 then do outChunk <- IConv.popOutputBuffer return [Span outChunk, ConversionError err] else return [ ConversionError err] outChunkSize :: Int outChunkSize = L.defaultChunkSize tmpChunkSize :: Int tmpChunkSize = 16 iconv-0.4.1.1/Codec/Text/IConv/0000755000000000000000000000000012044312414014131 5ustar0000000000000000iconv-0.4.1.1/Codec/Text/IConv/Internal.hs0000644000000000000000000002547012044312414016251 0ustar0000000000000000----------------------------------------------------------------------------- -- | -- Copyright : (c) 2006-2007 Duncan Coutts -- License : BSD-style -- -- Maintainer : duncan@haskell.org -- Portability : portable (H98 + FFI) -- -- IConv wrapper layer -- ----------------------------------------------------------------------------- module Codec.Text.IConv.Internal ( -- * The iconv state monad IConv, run, InitStatus(..), unsafeInterleave, unsafeLiftIO, finalise, -- * The buisness iconv, Status(..), -- * Buffer management -- ** Input buffer pushInputBuffer, inputBufferSize, inputBufferEmpty, inputPosition, replaceInputBuffer, -- ** Output buffer newOutputBuffer, popOutputBuffer, outputBufferBytesAvailable, outputBufferFull, -- * Debugging -- consistencyCheck, dump, trace ) where import Foreign import Foreign.C import qualified Data.ByteString.Internal as S import System.IO.Unsafe (unsafeInterleaveIO) import System.IO (hPutStrLn, stderr) import Control.Exception (assert) import Prelude hiding (length) pushInputBuffer :: S.ByteString -> IConv () pushInputBuffer (S.PS inBuffer' inOffset' inLength') = do -- must not push a new input buffer if the last one is not used up inAvail <- gets inLength assert (inAvail == 0) $ return () -- now set the available input buffer ptr and length modify $ \bufs -> bufs { inBuffer = inBuffer', inOffset = inOffset', inLength = inLength' } inputBufferEmpty :: IConv Bool inputBufferEmpty = gets ((==0) . inLength) inputBufferSize :: IConv Int inputBufferSize = gets inLength inputPosition :: IConv Int inputPosition = gets inTotal replaceInputBuffer :: (S.ByteString -> S.ByteString) -> IConv () replaceInputBuffer replace = modify $ \bufs -> case replace (S.PS (inBuffer bufs) (inOffset bufs) (inLength bufs)) of S.PS inBuffer' inOffset' inLength' -> bufs { inBuffer = inBuffer', inOffset = inOffset', inLength = inLength' } newOutputBuffer :: Int -> IConv () newOutputBuffer size = do --must not push a new buffer if there is still data in the old one outAvail <- gets outLength assert (outAvail == 0) $ return () -- Note that there may still be free space in the output buffer, that's ok, -- you might not want to bother completely filling the output buffer say if -- there's only a few free bytes left. -- now set the available output buffer ptr and length outBuffer' <- unsafeLiftIO $ S.mallocByteString size modify $ \bufs -> bufs { outBuffer = outBuffer', outOffset = 0, outLength = 0, outFree = size } -- get that part of the output buffer that is currently full -- (might be 0, use outputBufferBytesAvailable to check) -- this may leave some space remaining in the buffer popOutputBuffer :: IConv S.ByteString popOutputBuffer = do bufs <- get -- there really should be something to pop, otherwise it's silly assert (outLength bufs > 0) $ return () modify $ \buf -> buf { outOffset = outOffset bufs + outLength bufs, outLength = 0 } return (S.PS (outBuffer bufs) (outOffset bufs) (outLength bufs)) -- this is the number of bytes available in the output buffer outputBufferBytesAvailable :: IConv Int outputBufferBytesAvailable = gets outLength -- you only need to supply a new buffer when there is no more output buffer -- space remaining outputBufferFull :: IConv Bool outputBufferFull = gets ((==0) . outFree) ---------------------------- -- IConv buffer layout -- data Buffers = Buffers { inBuffer :: {-# UNPACK #-} !(ForeignPtr Word8), -- ^ Current input buffer inOffset :: {-# UNPACK #-} !Int, -- ^ Current read offset inLength :: {-# UNPACK #-} !Int, -- ^ Input bytes left inTotal :: {-# UNPACK #-} !Int, -- ^ Total read offset outBuffer :: {-# UNPACK #-} !(ForeignPtr Word8), -- ^ Current output buffer outOffset :: {-# UNPACK #-} !Int, -- ^ Base out offset outLength :: {-# UNPACK #-} !Int, -- ^ Available output bytes outFree :: {-# UNPACK #-} !Int -- ^ Free output space } deriving Show nullBuffers :: Buffers nullBuffers = Buffers S.nullForeignPtr 0 0 0 S.nullForeignPtr 0 0 0 {- - For the output buffer we have this setup: - - +-------------+-------------+----------+ - |### poped ###|** current **| free | - +-------------+-------------+----------+ - \ / \ / \ / - outOffset outLength outFree - - The output buffer is allocated by us and pointer to by the outBuf ForeignPtr. - An initial prefix of the buffer that we have already poped/yielded. This bit - is immutable, it's already been handed out to the caller, we cannot touch it. - When we yield we increment the outOffset. The next part of the buffer between - outBuf + outOffset and outBuf + outOffset + outLength is the current bit that - has had output data written into it but we have not yet yielded it to the - caller. Finally, we have the free part of the buffer. This is the bit we - provide to iconv to be filled. When it is written to, we increase the - outLength and decrease the outLeft by the number of bytes written. - The input buffer layout is much simpler, it's basically just a bytestring: - - +------------+------------+ - |### done ###| remaining | - +------------+------------+ - \ / \ / - inOffset inLength - - So when we iconv we increase the inOffset and decrease the inLength by the - number of bytes read. -} ---------------------------- -- IConv monad -- newtype IConv a = I { unI :: ConversionDescriptor -> Buffers -> IO (Buffers, a) } instance Monad IConv where (>>=) = bindI -- m >>= f = (m `bindI` \a -> consistencyCheck `thenI` returnI a) `bindI` f (>>) = thenI return = returnI returnI :: a -> IConv a returnI a = I $ \_ bufs -> return (bufs, a) {-# INLINE returnI #-} bindI :: IConv a -> (a -> IConv b) -> IConv b bindI m f = I $ \cd bufs -> do (bufs', a) <- unI m cd bufs unI (f a) cd bufs' {-# INLINE bindI #-} thenI :: IConv a -> IConv b -> IConv b thenI m f = I $ \cd bufs -> do (bufs', _) <- unI m cd bufs unI f cd bufs' {-# INLINE thenI #-} data InitStatus = InitOk | UnsupportedConversion | UnexpectedInitError Errno {-# NOINLINE run #-} run :: String -> String -> (InitStatus -> IConv a) -> a run from to m = unsafePerformIO $ do ptr <- withCString from $ \fromPtr -> withCString to $ \toPtr -> c_iconv_open toPtr fromPtr -- note arg reversal (cd, status) <- if ptrToIntPtr ptr /= (-1) then do cd <- newForeignPtr c_iconv_close ptr return (cd, InitOk) else do errno <- getErrno cd <- newForeignPtr_ nullPtr if errno == eINVAL then return (cd, UnsupportedConversion) else return (cd, UnexpectedInitError errno) (_,a) <- unI (m status) (ConversionDescriptor cd) nullBuffers return a unsafeLiftIO :: IO a -> IConv a unsafeLiftIO m = I $ \_ bufs -> do a <- m return (bufs, a) -- It's unsafe because we discard the values here, so if you mutate anything -- between running this and forcing the result then you'll get an inconsistent -- iconv state. unsafeInterleave :: IConv a -> IConv a unsafeInterleave m = I $ \cd st -> do res <- unsafeInterleaveIO (unI m cd st) return (st, snd res) get :: IConv Buffers get = I $ \_ buf -> return (buf, buf) gets :: (Buffers -> a) -> IConv a gets getter = I $ \_ buf -> return (buf, getter buf) modify :: (Buffers -> Buffers) -> IConv () modify change = I $ \_ buf -> return (change buf, ()) ---------------------------- -- Debug stuff -- trace :: String -> IConv () trace = unsafeLiftIO . hPutStrLn stderr dump :: IConv () dump = do bufs <- get unsafeLiftIO $ hPutStrLn stderr $ show bufs ---------------------------- -- iconv wrapper layer -- data Status = InputEmpty | OutputFull | IncompleteChar | InvalidChar | UnexpectedError Errno iconv :: IConv Status iconv = I $ \(ConversionDescriptor cdfptr) bufs -> assert (outFree bufs > 0) $ --TODO: optimise all this allocation withForeignPtr cdfptr $ \cdPtr -> withForeignPtr (inBuffer bufs) $ \inBufPtr -> with (inBufPtr `plusPtr` inOffset bufs) $ \inBufPtrPtr -> with (fromIntegral (inLength bufs)) $ \inLengthPtr -> withForeignPtr (outBuffer bufs) $ \outBufPtr -> let outBufPtr' = outBufPtr `plusPtr` (outOffset bufs + outLength bufs) in with outBufPtr' $ \outBufPtrPtr -> with (fromIntegral (outFree bufs)) $ \outFreePtr -> do result <- c_iconv cdPtr inBufPtrPtr inLengthPtr outBufPtrPtr outFreePtr inLength' <- fromIntegral `fmap` peek inLengthPtr outFree' <- fromIntegral `fmap` peek outFreePtr let inByteCount = inLength bufs - inLength' outByteCount = outFree bufs - outFree' bufs' = bufs { inOffset = inOffset bufs + inByteCount, inLength = inLength', inTotal = inTotal bufs + inByteCount, outLength = outLength bufs + outByteCount, outFree = outFree' } if result /= errVal then return (bufs', InputEmpty) else do errno <- getErrno case () of _ | errno == e2BIG -> return (bufs', OutputFull) | errno == eINVAL -> return (bufs', IncompleteChar) | errno == eILSEQ -> return (bufs', InvalidChar) | otherwise -> return (bufs', UnexpectedError errno) where errVal :: CSize errVal = (-1) -- (size_t)(-1) -- | This never needs to be used as the iconv descriptor will be released -- automatically when no longer needed, however this can be used to release -- it early. Only use this when you can guarantee that the iconv will no -- longer be needed, for example if an error occurs or if the input stream -- ends. -- finalise :: IConv () finalise = I $ \(ConversionDescriptor cd) bufs -> do finalizeForeignPtr cd return (bufs, ()) ---------------------- -- The foreign imports newtype ConversionDescriptor = ConversionDescriptor (ForeignPtr ConversionDescriptor) -- iconv_t foreign import ccall unsafe "hsiconv.h hs_wrap_iconv_open" c_iconv_open :: CString -- to code -> CString -- from code -> IO (Ptr ConversionDescriptor) foreign import ccall unsafe "hsiconv.h hs_wrap_iconv" c_iconv :: Ptr ConversionDescriptor -> Ptr (Ptr CChar) -- in buf -> Ptr CSize -- in buf bytes left -> Ptr (Ptr CChar) -- out buf -> Ptr CSize -- out buf bytes left -> IO CSize foreign import ccall unsafe "hsiconv.h &hs_wrap_iconv_close" c_iconv_close :: FinalizerPtr ConversionDescriptor