cassava-megaparsec-2.0.4/Data/0000755000000000000000000000000013726413775014321 5ustar0000000000000000cassava-megaparsec-2.0.4/Data/Csv/0000755000000000000000000000000013726413775015054 5ustar0000000000000000cassava-megaparsec-2.0.4/Data/Csv/Parser/0000755000000000000000000000000014134352763016301 5ustar0000000000000000cassava-megaparsec-2.0.4/Data/Csv/Parser/Megaparsec/0000755000000000000000000000000014134352763020350 5ustar0000000000000000cassava-megaparsec-2.0.4/tests/0000755000000000000000000000000013726413775014612 5ustar0000000000000000cassava-megaparsec-2.0.4/Data/Csv/Parser/Megaparsec.hs0000644000000000000000000000663014134352763020711 0ustar0000000000000000-- | -- Module : Data.Csv.Parser.Megaparsec -- Copyright : © 2016–2021 Stack Builders -- License : MIT -- -- Maintainer : Mark Karpov -- Stability : experimental -- Portability : portable -- -- A CSV parser. The parser here is RFC 4180 compliant, with the following -- extensions: -- -- * Non-escaped fields may contain any characters except double-quotes, -- commas (or generally delimiter characters), carriage returns, and -- newlines. -- * Escaped fields may contain any characters, but double-quotes need -- to be escaped. -- -- The parser provides better error messages than the parser that comes with -- Cassava library, while being compatible with the rest of the library. module Data.Csv.Parser.Megaparsec ( ConversionError (..) , decode , decodeWith , decodeByName , decodeByNameWith) where import Data.Csv hiding ( decode , decodeWith , decodeByName , decodeByNameWith ) import Data.Vector (Vector) import Text.Megaparsec import qualified Data.ByteString.Lazy as BL import Data.Csv.Parser.Megaparsec.Internals ( ConversionError (..) , csv , csvWithHeader , decodeWithC) ---------------------------------------------------------------------------- -- Top level interface -- | Deserialize CSV records form a lazy 'BL.ByteString'. If this fails due -- to incomplete or invalid input, 'Left' is returned. Equivalent to -- 'decodeWith' 'defaultDecodeOptions'. decode :: FromRecord a => HasHeader -- ^ Whether the data contains header that should be skipped -> FilePath -- ^ File name (only for displaying in parse error messages, use empty -- string if you have none) -> BL.ByteString -- ^ CSV data -> Either (ParseErrorBundle BL.ByteString ConversionError) (Vector a) decode = decodeWith defaultDecodeOptions {-# INLINE decode #-} -- | Like 'decode', but lets you customize how the CSV data is parsed. decodeWith :: FromRecord a => DecodeOptions -- ^ Decoding options -> HasHeader -- ^ Whether the data contains header that should be skipped -> FilePath -- ^ File name (only for displaying in parse error messages, use empty -- string if you have none) -> BL.ByteString -- ^ CSV data -> Either (ParseErrorBundle BL.ByteString ConversionError) (Vector a) decodeWith = decodeWithC csv {-# INLINE decodeWith #-} -- | Deserialize CSV records from a lazy 'BL.ByteString'. If this fails due -- to incomplete or invalid input, 'Left' is returned. The data is assumed -- to be preceded by a header. Equivalent to 'decodeByNameWith' -- 'defaultDecodeOptions'. decodeByName :: FromNamedRecord a => FilePath -- ^ File name (only for displaying in parse error messages, use empty -- string if you have none) -> BL.ByteString -- ^ CSV data -> Either (ParseErrorBundle BL.ByteString ConversionError) (Header, Vector a) decodeByName = decodeByNameWith defaultDecodeOptions {-# INLINE decodeByName #-} -- | Like 'decodeByName', but lets you customize how the CSV data is parsed. decodeByNameWith :: FromNamedRecord a => DecodeOptions -- ^ Decoding options -> FilePath -- ^ File name (only for displaying in parse error messages, use empty -- string if you have none) -> BL.ByteString -- ^ CSV data -> Either (ParseErrorBundle BL.ByteString ConversionError) (Header, Vector a) decodeByNameWith opts = parse (csvWithHeader opts) {-# INLINE decodeByNameWith #-}cassava-megaparsec-2.0.4/Data/Csv/Parser/Megaparsec/Internals.hs0000644000000000000000000001222714134352763022647 0ustar0000000000000000-- | -- Module : Data.Csv.Parser.Megaparsec.Internals -- Copyright : © 2016–2021 Stack Builders -- License : MIT -- {-# LANGUAGE BangPatterns #-} {-# LANGUAGE DeriveDataTypeable #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RecordWildCards #-} module Data.Csv.Parser.Megaparsec.Internals ( ConversionError (..) , Parser , csv , csvWithHeader , decodeWithC , toNamedRecord , header , name , record , field , escapedField , unescapedField) where import Control.Monad import Data.ByteString (ByteString) import Data.Csv hiding ( Parser , record , header , toNamedRecord ) import Data.Data import Data.Vector (Vector) import Data.Word (Word8) import Text.Megaparsec import Text.Megaparsec.Byte import qualified Data.ByteString as B import qualified Data.ByteString.Lazy as BL import qualified Data.Csv as C import qualified Data.HashMap.Strict as H import qualified Data.Vector as V ---------------------------------------------------------------------------- -- Custom error component and other types -- | Custom error component for CSV parsing. It allows typed reporting of -- conversion errors. newtype ConversionError = ConversionError String deriving (Eq, Data, Typeable, Ord, Read, Show) instance ShowErrorComponent ConversionError where showErrorComponent (ConversionError msg) = "conversion error: " ++ msg -- | Parser type that uses “custom error component” 'ConversionError'. type Parser = Parsec ConversionError BL.ByteString ---------------------------------------------------------------------------- -- The parser -- | Parse a CSV file that does not include a header. csv :: FromRecord a => DecodeOptions -- ^ Decoding options -> Parser (Vector a) -- ^ The parser that parses collection of records csv DecodeOptions {..} = do xs <- sepEndBy1 (record decDelimiter parseRecord) eol eof return $! V.fromList xs -- | Parse a CSV file that includes a header. csvWithHeader :: FromNamedRecord a => DecodeOptions -- ^ Decoding options -> Parser (Header, Vector a) -- ^ The parser that parser collection of named records csvWithHeader DecodeOptions {..} = do !hdr <- header decDelimiter let f = parseNamedRecord . toNamedRecord hdr xs <- sepEndBy1 (record decDelimiter f) eol eof return $ let !v = V.fromList xs in (hdr, v) -- | Decode CSV data using the provided parser, skipping a leading header if -- necessary. decodeWithC :: (DecodeOptions -> Parser a) -- ^ Parsing function parametrized by 'DecodeOptions' -> DecodeOptions -- ^ Decoding options -> HasHeader -- ^ Whether to expect a header in the input -> FilePath -- ^ File name (only for displaying in parse error messages, use empty -- string if you have none) -> BL.ByteString -- ^ CSV data -> Either (ParseErrorBundle BL.ByteString ConversionError) a decodeWithC p opts@DecodeOptions {..} hasHeader = parse parser where parser = case hasHeader of HasHeader -> header decDelimiter *> p opts NoHeader -> p opts {-# INLINE decodeWithC #-} -- | Convert a 'Record' to a 'NamedRecord' by attaching column names. The -- 'Header' and 'Record' must be of the same length. toNamedRecord :: Header -> Record -> NamedRecord toNamedRecord hdr v = H.fromList . V.toList $ V.zip hdr v {-# INLINE toNamedRecord #-} -- | Parse a header, including the terminating line separator. header :: Word8 -> Parser Header header del = V.fromList <$!> p <* eol where p = sepBy1 (name del) (void $ char del) "file header" {-# INLINE header #-} -- | Parse a header name. Header names have the same format as regular -- 'field's. name :: Word8 -> Parser Name name del = field del "name in header" {-# INLINE name #-} -- | Parse a record, not including the terminating line separator. The -- terminating line separate is not included as the last record in a CSV -- file is allowed to not have a terminating line separator. record :: Word8 -- ^ Field delimiter -> (Record -> C.Parser a) -- ^ How to “parse” record to get the data of interest -> Parser a record del f = do notFollowedBy eof -- to prevent reading empty line at the end of file r <- V.fromList <$!> (sepBy1 (field del) (void $ char del) "record") case C.runParser (f r) of Left msg -> customFailure (ConversionError msg) Right x -> return x {-# INLINE record #-} -- | Parse a field. The field may be in either the escaped or non-escaped -- format. The returned value is unescaped. field :: Word8 -> Parser Field field del = label "field" (escapedField <|> unescapedField del) {-# INLINE field #-} -- | Parse an escaped field. escapedField :: Parser ByteString escapedField = B.pack <$!> between (char 34) (char 34) (many $ normalChar <|> escapedDq) where normalChar = anySingleBut 34 "unescaped character" escapedDq = label "escaped double-quote" (34 <$ string "\"\"") {-# INLINE escapedField #-} -- | Parse an unescaped field. unescapedField :: Word8 -> Parser ByteString unescapedField del = BL.toStrict <$> takeWhileP (Just "unescaped character") f where f x = x /= del && x /= 34 && x /= 10 && x /= 13 {-# INLINE unescapedField #-}cassava-megaparsec-2.0.4/tests/Spec.hs0000644000000000000000000001074513726413775016047 0ustar0000000000000000{-# LANGUAGE CPP #-} {-# LANGUAGE OverloadedStrings #-} module Main (main) where import Data.ByteString (ByteString) import Data.Csv hiding (decode, decodeWith, decodeByName, decodeByNameWith) import Data.Csv.Parser.Megaparsec import Data.Vector (Vector) import Test.Hspec import Test.Hspec.Megaparsec import qualified Data.ByteString.Lazy as BL import qualified Data.Vector as V #if !MIN_VERSION_base(4,8,0) import Control.Applicative ((<$>)) #endif main :: IO () main = hspec spec spec :: Spec spec = do describe "decode" decodeSpec describe "decodeWith" decodeWithSpec describe "decodeByName" decodeByNameSpec describe "decodeByNameWith" decodeByNameWithSpec decodeSpec :: Spec decodeSpec = do let dec = decode NoHeader "" it "decodes simple data" $ dec "a,b,c\n" `shouldParse` φ [["a", "b", "c"]] it "decodes simple data skipping header" $ decode HasHeader "" "field1,field2,field3\na,b,c\n" `shouldParse` φ [["a", "b", "c"]] it "handles CRLF sequence correctly" $ dec "a,b\r\nc,d\r\n" `shouldParse` φ [["a", "b"], ["c", "d"]] it "handles missing end of line" $ dec "a,b,c" `shouldParse` φ [["a", "b", "c"]] it "parses the empty line all right" $ dec "a,b,c\n\nd,e,f\n\n" `shouldParse` φ [["a", "b", "c"], [""], ["d", "e", "f"], [""]] it "handles leading space" $ dec " a, b, c\n" `shouldParse` φ [[" a", " b", " c"]] it "parses the RFC 4180 test data correctly" $ dec rfc4180Input `shouldParse` φ rfc4180Output decodeWithSpec :: Spec decodeWithSpec = context "using tab as delimiter" $ do let dec = decodeWith defaultDecodeOptions { decDelimiter = 9 } NoHeader "" it "works fine with tab as delimiter" $ dec "a\tb\tc\n" `shouldParse` φ [["a", "b", "c"]] it "handles CRLF sequence correctly" $ dec "a\tb\r\nc\td\r\n" `shouldParse` φ [["a", "b"], ["c", "d"]] it "handles missing end of line" $ dec "a\tb\tc" `shouldParse` φ [["a", "b", "c"]] it "parses the empty line all right" $ dec "a\tb\tc\n\nd\te\tf\n\n" `shouldParse` φ [["a", "b", "c"], [""], ["d", "e", "f"], [""]] it "handles leading space" $ dec " a\t b\t c\n" `shouldParse` φ [[" a", " b", " c"]] decodeByNameSpec :: Spec decodeByNameSpec = do let dec = decodeByName "" it "decodes simple data" $ dec "field\r\nabc\r\n" `shouldParse` χ ["field"] [[("field", "abc")]] it "decodes entries with two fields" $ dec "field1,field2\r\nabc,def\r\n" `shouldParse` χ ["field1", "field2"] [[("field1", "abc"), ("field2", "def")]] it "decodes header ending with CRLF" $ dec "field\r\nabc" `shouldParse` χ ["field"] [[("field", "abc")]] it "decodes data with training CRLF" $ dec "field\r\nabc\r\n" `shouldParse` χ ["field"] [[("field", "abc")]] it "decodes multiply entries (CRLF separated)" $ dec "field\r\nabc\r\ndef" `shouldParse` χ ["field"] [[("field", "abc")],[("field","def")]] decodeByNameWithSpec :: Spec decodeByNameWithSpec = context "using tab as delimiter" $ do let dec = decodeByNameWith defaultDecodeOptions { decDelimiter = 9 } "" it "decodes simple data" $ dec "field\r\nabc\r\n" `shouldParse` χ ["field"] [[("field", "abc")]] it "decodes entries with two fields" $ dec "field1\tfield2\r\nabc\tdef\r\n" `shouldParse` χ ["field1", "field2"] [[("field1", "abc"), ("field2", "def")]] it "decodes header ending with CRLF" $ dec "field\r\nabc" `shouldParse` χ ["field"] [[("field", "abc")]] it "decodes data with training CRLF" $ dec "field\r\nabc\r\n" `shouldParse` χ ["field"] [[("field", "abc")]] it "decodes multiply entries (CRLF separated)" $ dec "field\r\nabc\r\ndef" `shouldParse` χ ["field"] [[("field", "abc")],[("field","def")]] ---------------------------------------------------------------------------- -- Helpers φ :: [[ByteString]] -> Vector [ByteString] φ = V.fromList χ :: [ByteString] -> [[(ByteString, ByteString)]] -> (Header, Vector NamedRecord) χ h xs = (header h, V.fromList (namedRecord <$> xs)) rfc4180Input :: BL.ByteString rfc4180Input = BL.concat [ "#field1,field2,field3\n" , "\"aaa\",\"bb\n" , "b\",\"ccc\"\n" , "\"a,a\",\"b\"\"bb\",\"ccc\"\n" , "zzz,yyy,xxx\n" ] rfc4180Output :: [[ByteString]] rfc4180Output = [ ["#field1", "field2", "field3"] , ["aaa", "bb\nb", "ccc"] , ["a,a", "b\"bb", "ccc"] , ["zzz", "yyy", "xxx"] ] cassava-megaparsec-2.0.4/CHANGELOG.md0000644000000000000000000000075214134353201015242 0ustar0000000000000000## Cassava Megaparsec 2.0.4 * Allow bytestring-0.11 ## Cassava Megaparsec 2.0.3 * Exposing Internals ## Cassava Megaparsec 2.0.2 * Add suppport for `megaparsec-9.0.0` ## Cassava Megaparsec 2.0.1 * Add suppport for `megaparsec-8.0.0` ## Cassava Megaparsec 2.0.0 * Works with Megaparsec 7. ## Cassava Megaparsec 1.0.0 * Works with Megaparsec 6. * Instead of `Cec` we have `ConversionError` as custom component of error messages. ## Cassava Megaparsec 0.1.0 * Initial release. cassava-megaparsec-2.0.4/README.md0000644000000000000000000000504214134352763014721 0ustar0000000000000000# Cassava Megaparsec [![License MIT](https://img.shields.io/badge/license-MIT-brightgreen.svg)](http://opensource.org/licenses/MIT) [![Hackage](https://img.shields.io/hackage/v/cassava-megaparsec.svg?style=flat)](https://hackage.haskell.org/package/cassava-megaparsec) [![Stackage Nightly](http://stackage.org/package/cassava-megaparsec/badge/nightly)](http://stackage.org/nightly/package/cassava-megaparsec) [![Stackage LTS](http://stackage.org/package/cassava-megaparsec/badge/lts)](http://stackage.org/lts/package/cassava-megaparsec) [![Build Status](https://travis-ci.org/stackbuilders/cassava-megaparsec.svg?branch=master)](https://travis-ci.org/stackbuilders/cassava-megaparsec) The package provides alternative parser for the [Cassava](https://hackage.haskell.org/package/cassava) package written with [Megaparsec](https://hackage.haskell.org/package/megaparsec) so you can get better error messages at expense of some speed. ## Quick start The package works seamlessly with Cassava by replacing the following functions: * `decode` * `decodeWith` * `decodeByName` * `decodeByNameWith` The functions work just the same as Cassava's equivalents, but also take name of file they parse (to include into error messages) and return typed high-quality error messages produced by [Megaparsec](https://hackage.haskell.org/package/megaparsec). The import section typically looks like this: ```haskell import Data.Csv hiding (decode, decodeWith, decodeByName, decodeByNameWith) import Data.Csv.Parser.Megaparsec (decode, decodeWith, decodeByName, decodeByNameWith) ``` Next you call appropriate function and get either result of parsing identical to that of Cassava or error message. The error message is well-typed so you can examine it in Haskell code easily. Conversion error are considered parsing errors by the `cassava-megaparsec` package and are reported via custom error message component `Cec` supported by Megaparsec 5. Since Cassava's conversion errors are plain strings, we have no choice but to represent them as strings too: ```haskell -- | Custom error component for CSV parsing. It allows typed reporting of -- conversion errors. data Cec = CecFail String | CecIndentation Ordering Pos Pos | CecConversionError String deriving (Eq, Data, Typeable, Ord, Read, Show) ``` To pretty print a error message use the `parseErrorPretty` function from `Text.Megaparsec`. This should be enough to start using the package, please consult Haddocks for detailed per-function documentation. ## License Copyright © 2016–2021 Stack Builders Distributed under MIT license. cassava-megaparsec-2.0.4/LICENSE.md0000644000000000000000000000206714134352763015052 0ustar0000000000000000# MIT License Copyright © 2016–2021 Stack Builders Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. cassava-megaparsec-2.0.4/Setup.hs0000644000000000000000000000012713726413775015104 0ustar0000000000000000module Main (main) where import Distribution.Simple main :: IO () main = defaultMain cassava-megaparsec-2.0.4/cassava-megaparsec.cabal0000644000000000000000000000500314134353201020135 0ustar0000000000000000name: cassava-megaparsec version: 2.0.4 cabal-version: 1.18 tested-with: GHC==7.10.3, GHC==8.0.2, GHC==8.2.2, GHC==8.4.3, GHC==8.6.1 license: MIT license-file: LICENSE.md author: Mark Karpov maintainer: Mark Karpov homepage: https://github.com/stackbuilders/cassava-megaparsec bug-reports: https://github.com/stackbuilders/cassava-megaparsec/issues category: Text, Web, CSV, Parsing synopsis: Megaparsec parser of CSV files that plays nicely with Cassava build-type: Simple description: Megaparsec parser of CSV files that plays nicely with Cassava. extra-doc-files: CHANGELOG.md , README.md source-repository head type: git location: https://github.com/stackbuilders/cassava-megaparsec.git flag dev description: Turn on development settings. manual: True default: False library build-depends: base >= 4.8 && < 5.0 , bytestring >= 0.9 && < 0.12 , cassava >= 0.4.2 && < 0.6 , megaparsec >= 7.0 && < 10.0 , unordered-containers >= 0.2.7 && < 0.3 , vector >= 0.11 && < 0.13 exposed-modules: Data.Csv.Parser.Megaparsec , Data.Csv.Parser.Megaparsec.Internals if flag(dev) ghc-options: -Wall -Werror else ghc-options: -O2 -Wall if flag(dev) && impl(ghc >= 8.0) ghc-options: -Wcompat -Wincomplete-record-updates -Wincomplete-uni-patterns -Wnoncanonical-monad-instances -Wnoncanonical-monadfail-instances default-language: Haskell2010 test-suite tests main-is: Spec.hs hs-source-dirs: tests type: exitcode-stdio-1.0 build-depends: base >= 4.8 && < 5.0 , bytestring >= 0.9 && < 0.12 , cassava >= 0.4.2 && < 0.6 , cassava-megaparsec , hspec >= 2.0 && < 3.0 , hspec-megaparsec >= 2.0 && < 3.0 , vector >= 0.11 && < 0.13 if flag(dev) ghc-options: -Wall -Werror else ghc-options: -O2 -Wall default-language: Haskell2010