lazy-csv-0.5.1/LICENCE-BSD3

This library (lazy-csv) is (c) copyright 2009-2015, Malcolm Wallace,
Ian Lynagh, and Well-Typed LLP, and distributable under a BSD-style
3-clause license (see below).

-----------------------------------------------------------------------------
Copyright 2009-2015, Malcolm Wallace, Ian Lynagh, and Well-Typed LLP.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

- Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.

- Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

- Neither name of the University nor the names of its contributors may be
  used to endorse or promote products derived from this software without
  specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND THE CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR THE CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
DAMAGE.
-----------------------------------------------------------------------------

lazy-csv-0.5.1/Setup.hs

import Distribution.Simple
main = defaultMain

lazy-csv-0.5.1/changelog.html

lazy-csv change log

Release 0.5: fixes a bug when handling (rare) CR-only line-endings.

Release 0.4: first public release.

Release 0.3: adds duplicate-header detection and repair.

Release 0.2: adds repairing of blank lines and short rows.

Release 0.1: first (non-public) release of lazy-csv.

lazy-csv-0.5.1/csvSelect.hs

module Main where

import Text.CSV.Lazy.ByteString
import qualified Data.ByteString.Lazy.Char8 as BS
import System.Environment (getArgs)
import System.Console.GetOpt
import System.Exit
import Control.Monad (when, unless)
import System.IO
import Data.Char (isDigit)
import Data.List (elemIndex)
import Data.Maybe (fromJust)

version :: String
version = "0.5.1"

-- lazily read a CSV file, select some columns, and print it out again.
main :: IO ()
main = do
  opts <- cmdlineOpts =<< getArgs
  let delim = head $ [ c | Delimiter c <- opts ] ++ ","
  when (Version `elem` opts) $ do
      hPutStrLn stderr $ "csvSelect " ++ version
      exitSuccess
  unless (Unchecked `elem` opts) $ do
      content <- lazyRead opts
      case csvErrors (parseDSV True delim content) of
        errs@(_:_) -> do hPutStrLn stderr (unlines (map ppCSVError errs))
                         exitWith (ExitFailure 2)
        []         -> return ()
  out <- case [ f | Output f <- opts ] of
           []     -> return stdout
           [file] -> openBinaryFile file WriteMode
           _      -> do hPutStrLn stderr "Too many outputs: only one allowed"
                        exitWith (ExitFailure 3)
  content <- lazyRead opts
  case selectFieldMix [ e | Select e <- opts ]
                      (csvTableFull (parseDSV True delim content)) of
    Left err        -> do hPutStrLn stderr $ "CSV missing fields: "
                                             ++ unwords err
                          exitWith (ExitFailure 4)
    Right selection -> do BS.hPut out $ ppCSVTable selection
                          hClose out

-- | The standard Data.CSV.Lazy.selectFields chooses only by field name.
--   This version chooses with any mixture of numeric index or field name.
selectFieldMix :: [Either Int String] -> CSVTable -> Either [String] CSVTable
selectFieldMix fields table
    | null table         = Left (map (either show id) fields)
    | not (null missing) = Left missing
    | otherwise          = Right (map select table)
  where
    header     = map (BS.unpack . csvFieldContent) (head table)
    lenheader  = length header
    missing    = map show (filter (>lenheader) [ i | Left i <- fields ])
                 ++ filter (`notElem` header) [ name | Right name <- fields ]
    reordering = map (\e-> case e of
                             Left i  -> i-1   -- columns are numbered from 1
                             Right s -> fromJust $ elemIndex s header)
                     fields
    select fs  = map (fs!!) reordering

-- | Read a single input file, or stdin.
lazyRead :: [Flag] -> IO BS.ByteString
lazyRead opts =
    case [ f | Input f <- opts ] of
      []     -> BS.hGetContents stdin
      [file] -> BS.readFile file
      _      -> do hPutStrLn stderr "Too many input files: only one allowed"
                   exitWith (ExitFailure 1)

-- Command-line options
data Flag = Version | Input String | Output String | Unchecked
          | Delimiter Char | Select (Either Int String)
    deriving (Show,Eq)

options :: [OptDescr Flag]
options =
  [ Option ['v','V'] ["version"]   (NoArg Version)
                                   "show version number"
  , Option ['o']     ["output"]    (ReqArg Output "FILE")  "output FILE"
  , Option ['i']     ["input"]     (ReqArg Input "FILE")   "input FILE"
  , Option ['u']     ["unchecked"] (NoArg Unchecked)
                                   "ignore CSV format errors"
  , Option ['d']     ["delimiter"] (ReqArg (Delimiter . head) "@")
                                   "delimiter char is @"
  ]

cmdlineOpts :: [String] -> IO [Flag]
cmdlineOpts argv =
    case getOpt Permute options argv of
      (o,fs,[]  ) -> return (o ++ map field fs)
      (_,_, errs) -> ioError (userError (concat errs
                                         ++ usageInfo header options))
  where
    header = "Usage: csvSelect [OPTION...] (num|fieldname)...\n"
             ++ "    select numbered/named columns from a CSV file"
    field :: String -> Flag
    field str | all isDigit str = Select (Left (read str))
              | otherwise       = Select (Right str)

lazy-csv-0.5.1/index.html

lazy-csv: fast, space-efficient, CSV parsing

lazy-csv

What is lazy-csv?
Cmdline tool: csvSelect
Downloads
Recent news
Contacts
Related Work

What is lazy-csv?

lazy-csv is a Haskell library for reading CSV (comma-separated value) data. It is lazier, faster, more space-efficient, and more flexible in its treatment of errors than any other extant Haskell CSV library on Hackage.

Detailed documentation of the lazy-csv API is generated automatically by Haddock directly from the source code.

You can choose between String and ByteString variants of the API, just by picking the appropriate import. The API is identical modulo the type. Here is an example program:

module Main where

import Text.CSV.Lazy.String
import System.Environment (getArgs)

-- read a CSV file, select the 3rd column, and print it out again.

main = do
  [file]  <- getArgs
  content <- readFile file
  let csv = parseCSV content
  case csvErrors csv of
    errs@(_:_)  -> putStr (unlines (map ppCSVError errs))
    []          -> do content <- readFile file
                      let selection = map (take 1 . drop 2)
                                          (csvTable (parseCSV content))
                      putStr $ ppCSVTable selection

There are two useful things to note about the API, arising out of this example. First, parseCSV does not directly give you the value of the CSV table, but rather gives a CSVResult. You must project out either the errors (with csvErrors) or the values (with csvTable). Secondly, because the result of parseCSV is lazy, it is in fact more space-efficient (and also faster) to get hold of the valid table contents by reopening and reparsing the file after checking for errors. This also means, of course, that you can simply omit the step of checking for errors and ignore them if you wish.
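For comparison, here is the same program again using the ByteString variant. This is a minimal sketch of my own rather than code shipped with the package, but it uses only functions from the documented API; the only changes are the import and the I/O functions:

module Main where

import Text.CSV.Lazy.ByteString
import qualified Data.ByteString.Lazy.Char8 as BS
import System.Environment (getArgs)

-- Same structure as the String example: check for errors first, then
-- lazily re-read and re-parse the file to stream out the good data.
main = do
  [file]  <- getArgs
  content <- BS.readFile file
  case csvErrors (parseCSV content) of
    errs@(_:_)  -> putStr (unlines (map ppCSVError errs))
    []          -> do content <- BS.readFile file
                      let selection = map (take 1 . drop 2)
                                          (csvTable (parseCSV content))
                      BS.putStr $ ppCSVTable selection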

To illustrate the performance of lazy-csv, here is a micro-benchmark. We compare the same program (the example above) recoded with all of the CSV libraries available on Hackage. The libraries are:

library         string type  parsing/lexing  results  error-reporting
csv             String       Parsec          strict   first error
bytestring-csv  ByteString   Alex            strict   Nothing (Maybe)
spreadsheet     String       custom parser   lazy     first error
lazy-csv        String       custom lexer    lazy     all errors
lazy-csv        ByteString   custom lexer    lazy     all errors
lazy-csv        ByteString   custom lexer    lazy     discarding errors

The main differences are shown in the table. As far as error-reporting goes, the Parsec-based csv library reports only the first error encountered. The Alex-based bytestring-csv stops at the first error, but gives you no information about it. The spreadsheet library has a lazy parser, allowing you to retrieve the initial portion of valid data, as far as the first error. The lazy-csv library reports all errors in the input, in addition to returning all the well-formed data it can find. Many possible CSV formatting errors are easily recoverable, such as an incorrect number of fields in a row, bad quoting, etc. Thus, with the lazy-csv library one can choose to halt on errors, to display them as warnings whilst continuing with the good data, or to ignore the errors completely, continuing to process the retrievable data.
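To make the "continue regardless" style concrete, here is a one-line sketch (the helper name recoverAll is mine, not the library's): csvTableFull repairs short rows and blank lines with padding and de-duplicates headers, so this pipeline never halts on a formatting error.

import Text.CSV.Lazy.ByteString
import qualified Data.ByteString.Lazy.Char8 as BS

-- Recover everything usable from possibly-malformed CSV input.
recoverAll :: BS.ByteString -> BS.ByteString
recoverAll = ppCSVTable . csvTableFull . parseCSV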

The choice of lazy vs strict input is extremely important when it comes to large file sizes. Here are some indicative performance figures, using as input a series of files of increasing size: 1Mb, 10Mb, 100Mb, 1Gb. For the purposes of comparison, I include three sets of numbers for the lazy library - two with error-reporting using each of String and ByteString types, the third for ByteString but ignoring errors. In all cases the good data is processed anyway, but the difference in reporting leads to significant performance differences.

Finally, the nearest non-Haskell comparison I could think of is to use the Unix tool 'cut'. Of course, 'cut' (with comma as delimiter) is not a correct CSV-parser, but it does have the benefit of simplicity, and most closely resembles the lazy-csv library in terms of ignoring errors and continuing with good data. It also has lazy streaming behaviour on very large files.

library                                1Mb              10Mb             100Mb            1Gb
spreadsheet                            runtime failure  runtime failure  runtime failure  runtime failure
csv                                    0.542            20.483           stack overflow   stack overflow
bytestring-csv                         0.273            2.656            27.187           out of memory
lazy-csv (String)                      0.196            1.890            18.845           189.978
lazy-csv (ByteString)                  0.148            1.399            13.936           139.379
lazy-csv (ByteString, discard errors)  0.087            0.817            8.102            80.835
cut -d',' -f3                          0.052            0.462            4.576            45.726

All timings are in seconds, measured best-of-3 with the unix time command, on a 2.26GHz Intel Core 2 Duo MacBook with 4Gb RAM, compiled with ghc-6.10.4 using -O optimisation.

How much maximum live heap do these implementations use, for different input sizes? (All measured from ghc heap profiles.)

library                1Mb            10Mb   100Mb           1Gb
csv                    8Mb            120Mb  stack overflow  stack overflow
bytestring-csv         empty profile  52Mb   700Mb           out of memory
lazy-csv (String)      12kb           12kb   12kb            12kb
lazy-csv (ByteString)  3kb            3kb    3kb             3kb

My conclusions are these:

 * Laziness matters for large inputs: the strict libraries fail outright (stack overflow, out of memory) somewhere between 10Mb and 1Gb, whilst the lazy parsers run in small constant space at every size.
 * The ByteString variant of lazy-csv is consistently faster and more compact than the String variant.
 * Error-reporting has a measurable cost, but even with it enabled, lazy-csv runs within a small factor of 'cut', which does no real CSV parsing at all.

Cmdline tool: csvSelect

The package distribution contains a command-line tool called csvSelect. It is a fuller and more useful version of the demo program used to illustrate performance above. csvSelect chooses and re-arranges the columns of a CSV file as specified by its command-line arguments. Columns can be chosen by number (counting from 1) or by name (as in the header row of the input). Columns appear in the output in the same order as the arguments. A different delimiter than comma can be specified. If input or output files are not specified, then stdin/stdout are used.

Usage: csvSelect [OPTION...] (num|fieldname)...
    select numbered/named columns from a CSV file
  -v, -V   --version      show version number
  -o FILE  --output=FILE  output FILE
  -i FILE  --input=FILE   input FILE
  -u       --unchecked    ignore CSV format errors
  -d @     --delimiter=@  delimiter char is @
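
For example, a hypothetical invocation (the file names are invented for illustration):

    csvSelect -d';' -i people.csv -o picked.csv surname 2

writes the column headed 'surname', followed by the second column, of the semicolon-delimited file people.csv to picked.csv.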

Downloads

Development version:

darcs get http://code.haskell.org/lazy-csv

Current released version:
lazy-csv-0.5.1 - on Hackage

Older versions:
lazy-csv-0.5, release date 2013.05.24 - Fifth release, public.
lazy-csv-0.4, release date 2013.02.25 - Fourth release, first public.
lazy-csv-0.3, release date 2011.12.12 - Third (non-public) release.
lazy-csv-0.2, release date 2011.10.11 - Second (non-public) release.
lazy-csv-0.1, release date 2009.11.20 - First (non-public) release.


Recent news

Version 0.5 fixes a bug when handling (rare) CR-only line-endings.

Version 0.4 is the first public release.

Version 0.3 adds duplicate-header detection and repair.

Version 0.2 adds repairing of blank lines and short rows.

Version 0.1 is the first (but non-public) release of lazy-csv.
Complete Changelog


Contacts

Licence: The library is Free and Open Source Software, i.e., copyright to us, but freely licensed for your use, modification, and re-distribution. The lazy-csv library is distributed under a BSD-like 3-clause Licence - see file LICENCE-BSD3 for more details.


Related work
The other Haskell CSV libraries compared above (csv, bytestring-csv, and spreadsheet) are all available on Hackage.

lazy-csv-0.5.1/lazy-csv.cabal

Name:            lazy-csv
Version:         0.5.1
License:         BSD3
License-file:    LICENCE-BSD3
Copyright:       Malcolm Wallace, Ian Lynagh, and Well Typed LLP
Author:          Malcolm Wallace, Ian Lynagh
Maintainer:      Malcolm Wallace
Synopsis:        Efficient lazy parsers for CSV (comma-separated values).
Description:     The CSV format is defined by RFC 4180.  These efficient
                 lazy parsers (String and ByteString variants) can report
                 all CSV formatting errors, whilst also returning all the
                 valid data, so the user can choose whether to continue,
                 to show warnings, or to halt on error.  Valid fields
                 retain information about their original location in the
                 input, so a secondary parser from textual fields to typed
                 values can give intelligent error messages.
Category:        Text
Cabal-Version:   >= 1.6
Build-Type:      Simple
Homepage:        http://code.haskell.org/lazy-csv
Extra-source-files: index.html changelog.html

Source-repository head
  type:     darcs
  location: http://code.haskell.org/lazy-csv

library
  Build-Depends:   base < 5, bytestring
  Exposed-Modules: Text.CSV.Lazy.String
                   Text.CSV.Lazy.ByteString

executable csvSelect
  build-depends: base < 5, bytestring
  main-is:       csvSelect.hs
  other-modules: Text.CSV.Lazy.ByteString

lazy-csv-0.5.1/Text/CSV/Lazy/ByteString.hs

-- | The CSV (comma-separated value) format is defined by RFC 4180,
--   \"Common Format and MIME Type for Comma-Separated Values (CSV) Files\".
--
-- This lazy parser can report all CSV formatting errors, whilst also
-- returning all the valid data, so the user can choose whether to
-- continue, to show warnings, or to halt on error.
--
-- Valid fields retain information about their original location in the
-- input, so a secondary parser from textual fields to typed values
-- can give intelligent error messages.
--
-- In a valid CSV file, all rows must have the same number of columns.
-- This parser will flag a row with the wrong number of columns as an error.
-- (But the error type contains the actual data, so the user can recover
-- it if desired.)  Completely blank lines are also treated as errors,
-- and again the user is free either to filter these out or convert them
-- to a row of actual null fields.
module Text.CSV.Lazy.ByteString
        ( -- * CSV types
          CSVTable
        , CSVRow
        , CSVField(..)
          -- * CSV parsing
        , CSVError(..)
        , CSVResult
        , csvErrors
        , csvTable
        , csvTableFull
        , csvTableHeader
        , parseCSV
        , parseDSV
          -- * Pretty-printing
        , ppCSVError
        , ppCSVField
        , ppCSVTable
        , ppDSVTable
          -- * Conversion between standard and simple representations
        , fromCSVTable
        , toCSVTable
          -- * Selection, validation, and algebra of CSV tables
        , selectFields
        , expectFields
        , mkEmptyColumn
        , joinCSV
        , mkCSVField
        ) where
        -- , ppCSVTableAsTuples

import Data.List (groupBy, partition, elemIndex, intercalate, takeWhile
                 ,deleteFirstsBy, nub)
import Data.Function (on)
import Data.Maybe (fromJust)
import qualified Data.ByteString.Lazy.Char8 as BS
import Data.ByteString.Lazy.Char8 (ByteString)

-- | A CSV table is a sequence of rows.  All rows have the same number
--   of fields.
type CSVTable = [CSVRow]

-- | A CSV row is just a sequence of fields.
type CSVRow = [CSVField]

-- | A CSV field's content is stored with its logical row and column number,
--   as well as its textual extent.  This information is necessary if you
--   want to generate good error messages in a secondary parsing stage,
--   should you choose to convert the textual fields to typed data values.
data CSVField = CSVField      { csvRowNum       :: !Int
                              , csvColNum       :: !Int
                              , csvTextStart    :: !(Int,Int)
                              , csvTextEnd      :: !(Int,Int)
                              , csvFieldContent :: !ByteString
                              , csvFieldQuoted  :: !Bool }
              | CSVFieldError { csvRowNum       :: !Int
                              , csvColNum       :: !Int
                              , csvTextStart    :: !(Int,Int)
                              , csvTextEnd      :: !(Int,Int)
                              , csvFieldError   :: !String }
        deriving (Eq,Show)

-- | A structured error type for CSV formatting mistakes.
data CSVError = IncorrectRow    { csvRow          :: Int
                                , csvColsExpected :: Int
                                , csvColsActual   :: Int
                                , csvFields       :: [CSVField] }
              | BlankLine       { csvRow          :: !Int
                                , csvColsExpected :: !Int
                                , csvColsActual   :: !Int
                                , csvField        :: CSVField }
              | FieldError      { csvField        :: CSVField }
              | DuplicateHeader { csvColsExpected :: !Int
                                , csvHeaderSerial :: !Int
                                , csvDuplicate    :: !String }
              | NoData
        deriving (Eq,Show)

-- | The result of parsing a CSV input is a mixed collection of errors
--   and valid rows.  This way of representing things is crucial to the
--   ability to parse lazily whilst still catching format errors.
type CSVResult = [ Either [CSVError] [CSVField] ]

-- | Extract just the valid portions of a CSV parse.
csvTable :: CSVResult -> CSVTable
csvTable r = [ row | Right row <- r ]

-- | Extract just the errors from a CSV parse.
csvErrors :: CSVResult -> [CSVError]
csvErrors r = concat [ err | Left err <- r ]

-- | Extract the full table, including invalid rows, with padding, and
--   de-duplicated headers.
csvTableFull :: CSVResult -> CSVTable
csvTableFull = map beCareful . deduplicate
  where
    beCareful (Right row) = row
    beCareful (Left (r@IncorrectRow{}:_)) =
        csvFields r ++
        replicate (csvColsExpected r - csvColsActual r)
                  (mkCSVField (csvRow r) 0 BS.empty)
    beCareful (Left (r@BlankLine{}:_)) =
        replicate (csvColsExpected r)
                  (mkCSVField (csvRow r) 0 BS.empty)
    beCareful (Left (r@DuplicateHeader{}:_)) =     -- obsolete with deduping
        replicate (csvColsExpected r)
                  (mkCSVField 0 0 BS.empty)
    beCareful (Left (FieldError{}:r)) = beCareful (Left r)
    beCareful (Left (NoData:_))       = []
    beCareful (Left [])               = []

    deduplicate (Left (errs@(DuplicateHeader{}:_)):Right heads:rows) =
     -- Right (reverse $ foldl replace [] heads)
        Right (replaceInOrder errs (zip heads [0..]))
        : rows
    deduplicate rows = rows
{-
    replace output header
        | headerName `elem` map csvFieldContent output
                    = header{ csvFieldContent =
                                  headerName `BS.append` BS.pack "_duplicate" }
                      : output
        | otherwise = header: output
          where headerName = csvFieldContent header
-}
    replaceInOrder []       headers = map fst headers
    replaceInOrder _        []      = []
    replaceInOrder (d:dups) ((h,n):headers)
        | csvHeaderSerial d == n =
              h{ csvFieldContent = BS.pack (csvDuplicate d++"_"++show n) }
              : replaceInOrder dups headers
        | otherwise              = h: replaceInOrder (d:dups) headers

-- | The header row of the CSV table, assuming it is non-empty.
csvTableHeader :: CSVResult -> [String]
csvTableHeader = map (BS.unpack . csvFieldContent) . firstRow
    where firstRow (Left _: rest) = firstRow rest
          firstRow (Right x: _)   = x

-- | A first-stage parser for CSV (comma-separated values) data.
--   The individual fields remain as text, but errors in CSV formatting
--   are reported.  Errors (containing unrecognisable rows/fields) are
--   interspersed with the valid rows/fields.
parseCSV :: ByteString -> CSVResult
parseCSV = parseDSV True ','

-- | Sometimes CSV is not comma-separated, but delimiter-separated
--   values (DSV).  The choice of delimiter is arbitrary, but semi-colon
--   is common in locales where comma is used as a decimal point, and tab
--   is also common.  The Boolean argument is whether newlines should be
--   accepted within quoted fields.  The CSV RFC says newlines can occur
--   in quotes, but other DSV formats might say otherwise.  You can often
--   get better error messages if newlines are disallowed.
parseDSV :: Bool -> Char -> ByteString -> CSVResult
parseDSV qn delim = validate
                  . groupBy ((==)`on`csvRowNum)
                  . lexCSV qn delim

validate :: [CSVRow] -> CSVResult
validate []        = [Left [NoData]]
validate xs@(x:_)  = checkDuplicateHeaders x $ map (extractErrs (length x)) xs

extractErrs :: Int -> CSVRow -> Either [CSVError] CSVRow
extractErrs size row
    | length row0 == size && null errs0    = Right row0
    | length row0 == 1    && empty field0  = Left [blankLine field0]
    | otherwise                            = Left (map convert errs0
                                                   ++ validateColumns row0)
  where
    (row0,errs0) = partition isField row
    (field0:_)   = row0

    isField (CSVField{})      = True
    isField (CSVFieldError{}) = False

    empty f@(CSVField{}) = BS.null (csvFieldContent f)
    empty _              = False

    convert err = FieldError {csvField = err}

    validateColumns r =
        if length r == size then []
        else [ IncorrectRow{ csvRow = if null r then 0
                                      else csvRowNum (head r)
                           , csvColsExpected = size
                           , csvColsActual   = length r
                           , csvFields       = r } ]
    blankLine f = BlankLine{ csvRow          = csvRowNum f
                           , csvColsExpected = size
                           , csvColsActual   = 1
                           , csvField        = f }

checkDuplicateHeaders :: CSVRow -> CSVResult -> CSVResult
checkDuplicateHeaders row result =
    let headers = [ f | f@(CSVField{}) <- row ]
        dups    = deleteFirstsBy ((==)`on`csvFieldContent)
                                 headers (nub headers)
        n       = length headers
    in if null dups then result
       else Left (map (\d-> DuplicateHeader
                              { csvColsExpected = n
                              , csvHeaderSerial = csvColNum d
                              , csvDuplicate = BS.unpack (csvFieldContent d)})
                      dups)
            : result

-- Reading CSV data is essentially lexical, and can be implemented with a
-- simple finite state machine.  We keep track of logical row number,
-- logical column number (in tabular terms), and textual position (row,col)
-- to enable good error messages.
-- Positional data is retained even after successful lexing, in case a
-- second-stage field parser wants to complain.
--
-- A double-quoted CSV field may contain commas, newlines, and double quotes.
data CSVState = CSVState { tableRow, tableCol :: !Int
                         , textRow,  textCol  :: !Int }
  deriving Show

incTableRow, incTableCol, incTextRow :: CSVState -> CSVState
incTableRow st = st { tableRow = tableRow st + 1 , tableCol = 1 }
incTableCol st = st { tableCol = tableCol st + 1 }
incTextRow  st = st { textRow  = textRow st + 1 , textCol = 1 }

incTextCol :: Int -> CSVState -> CSVState
incTextCol n st = st { textCol = textCol st + n }

here :: CSVState -> (Int,Int)
here st = (textRow st, textCol st)

-- Lexer is a small finite state machine.
lexCSV :: Bool -> Char -> ByteString -> [CSVField]
lexCSV qn delim =
    getFields qn delim
              (CSVState{tableRow=1,tableCol=1,textRow=1,textCol=1}) (1,1)

getFields :: Bool -> Char -> CSVState -> (Int, Int) -> ByteString
             -> [CSVField]
getFields q d state begin bs0 =
    case BS.uncons bs0 of
      Nothing         -> []
      Just ('"', bs1) ->
          doStringFieldContent q d (incTextCol 1 state) begin BS.empty bs1
      _ ->
        case BS.break interestingChar bs0 of
          (fieldBs, bs1) ->
            let field   = mkField end begin fieldBs False
                end     = incTextCol (len-1) $ state
                state'  = incTableCol $ incTextCol 2 end
                stateNL = incTableRow . incTextRow $ state
                len     = fromIntegral $ BS.length fieldBs
            in case BS.uncons bs1 of
                 Just (c,bs2) | c==d ->
                     field: getFields q d state' (here state') bs2
                 Just ('\r',bs2) ->
                     case BS.uncons bs2 of
                       Just ('\n',bs3) ->
                           field: getFields q d stateNL (here stateNL) bs3
                       -- XXX This could be an error instead:
                       _ -> field: getFields q d stateNL (here stateNL) bs2
                 Just ('\n',bs2) ->
                     field: getFields q d stateNL (here stateNL) bs2
                 Just ('"', _) ->
                     field:
                     mkError state' begin "unexpected quote, resync at EOL":
                     getFields q d stateNL (here stateNL)
                               (BS.dropWhile (/='\n') bs1)
                 Just _  -> [mkError state' begin "XXX Can't happen"]
                 Nothing -> field: getFields q d stateNL (here stateNL) bs1
  where
    interestingChar '\r'    = True
    interestingChar '\n'    = True
    interestingChar '"'     = True
    interestingChar c | c==d = True
    interestingChar _       = False

doStringFieldContent :: Bool -> Char -> CSVState -> (Int, Int) -> ByteString
                        -> ByteString -> [CSVField]
doStringFieldContent q d state begin acc bs1 =
    case BS.break interestingCharInsideString bs1 of
      (newBs, bs2) ->
        let fieldBs = acc `BS.append` newBs
            field   = mkField end begin fieldBs True
            end     = incTextCol (len-1) state
            state'  = incTableCol $ incTextCol 3 end
            stateNL = incTableRow . incTextRow $ state
            len     = fromIntegral $ BS.length newBs
        in case BS.uncons bs2 of
             Just ('\r',bs3) ->
                 case BS.uncons bs3 of
                   Just ('\n',bs4) | q ->
                       doStringFieldContent q d (incTextRow end) begin
                           (fieldBs `BS.append` BS.singleton '\n') bs4
                   _ -> doStringFieldContent q d end begin
                            (fieldBs `BS.append` BS.singleton '\r') bs3
             Just ('\n',bs3) | q ->
                 doStringFieldContent q d (incTextRow end) begin
                     (fieldBs `BS.append` BS.singleton '\n') bs3
             Just ('\n',bs3) ->
                 field:
                 mkError end begin "Found newline within quoted field":
                 getFields q d stateNL (here stateNL) bs3
             Just ('"', bs3) ->
                 case BS.uncons bs3 of
                   Just (c,bs4) | c==d ->
                       field: getFields q d state' (here state') bs4
                   Just ('\r',bs4) ->
                       case BS.uncons bs4 of
                         Just ('\n',bs5) ->
                             field: getFields q d stateNL (here stateNL) bs5
                         -- XXX This could be an error instead:
                         _ -> field: getFields q d stateNL (here stateNL) bs4
                   Just ('\n',bs4) ->
                       field: getFields q d stateNL (here stateNL) bs4
                   Just ('"',bs4) ->
                       doStringFieldContent q d (incTextCol 3 end) begin
                           (fieldBs `BS.append` BS.singleton '"') bs4
                   Just _ ->
                       field:
                       mkError state' begin "End-quote not followed by comma":
                       getFields q d state' (here state') bs3
                   Nothing ->
                       field: getFields q d stateNL (here stateNL) bs3
             Just _  -> [mkError state' begin "XXX Can't happen (string field)"]
             Nothing -> field:
                        mkError state' begin
                                "CSV data ends within a quoted string"
                        :[]
  where
    interestingCharInsideString '\r' = True
    interestingCharInsideString '\n' = True
    interestingCharInsideString '"'  = True
    interestingCharInsideString _    = False

mkField :: CSVState -> (Int, Int) -> ByteString -> Bool -> CSVField
mkField st begin bs q = CSVField { csvRowNum      = tableRow st
                                 , csvColNum      = tableCol st
                                 , csvTextStart   = begin
                                 , csvTextEnd     = (textRow st,textCol st)
                                 , csvFieldContent = bs
                                 , csvFieldQuoted  = q }

mkError :: CSVState -> (Int, Int) -> String -> CSVField
mkError st begin e = CSVFieldError { csvRowNum    = tableRow st
                                   , csvColNum    = tableCol st
                                   , csvTextStart = begin
                                   , csvTextEnd   = (textRow st,textCol st)
                                   , csvFieldError = e }

-- Some pretty-printing for structured CSV errors.
ppCSVError :: CSVError -> String
ppCSVError (err@IncorrectRow{}) =
    "\nRow "++show (csvRow err)++" has wrong number of fields."++
    "\n Expected "++show (csvColsExpected err)++" but got "++
    show (csvColsActual err)++"."++
    "\n The fields are:"++
    indent 8 (concatMap ppCSVField (csvFields err))
ppCSVError (err@BlankLine{}) =
    "\nRow "++show (csvRow err)++" is blank."++
    "\n Expected "++show (csvColsExpected err)++" fields."
ppCSVError (err@FieldError{}) = ppCSVField (csvField err)
ppCSVError (err@DuplicateHeader{}) =
    "\nThere are two (or more) identical column headers: "++
    show (csvDuplicate err)++"."++
    "\n Column number "++show (csvHeaderSerial err)
ppCSVError (NoData{}) =
    "\nNo usable data (after accounting for any other errors)."

-- | Pretty-printing for CSV fields, shows positional information in addition
--   to the textual content.
ppCSVField :: CSVField -> String
ppCSVField (f@CSVField{}) =
    "\n"++BS.unpack (quoted (csvFieldQuoted f) (csvFieldContent f))++
    "\nin row "++show (csvRowNum f)++" at column "++show (csvColNum f)++
    " (textually from "++show (csvTextStart f)++" to "++
    show (csvTextEnd f)++")"
ppCSVField (f@CSVFieldError{}) =
    "\n"++csvFieldError f++
    "\nin row "++show (csvRowNum f)++" at column "++show (csvColNum f)++
    " (textually from "++show (csvTextStart f)++" to "++
    show (csvTextEnd f)++")"

-- | Output a table back to a lazily-constructed string.  There are lots of
--   possible design decisions one could take, e.g. to re-arrange columns
--   back into something resembling their original order, but here we just
--   take the given table without looking at Row and Field numbers etc.
ppCSVTable :: CSVTable -> ByteString
ppCSVTable = BS.unlines . map (BS.intercalate (BS.pack ",") . map ppField)
    where ppField f = quoted (csvFieldQuoted f) (csvFieldContent f)

-- | Output a table back to a lazily-constructed bytestring, using the given
--   delimiter char.  The Boolean argument is to repair fields containing
--   newlines, by replacing the nl with a space.
ppDSVTable :: Bool -> Char -> CSVTable -> ByteString
ppDSVTable nl d = BS.unlines . map (BS.intercalate (BS.pack [d]) . map ppField)
    where ppField f = quoted (csvFieldQuoted f) (doNL $ csvFieldContent f)
          doNL | nl        = replaceNL
               | otherwise = id

{-
-- | Output a table back to a string, but using Haskell list-of-tuple notation
--   rather than CSV.
ppCSVTableAsTuples :: CSVTable -> String
ppCSVTableAsTuples = indent 4 . unlines
                     . map ( (", ("++) . (++")") . intercalate ", "
                             . map ppField )
    where ppField f = quoted (csvFieldQuoted f) (BS.unpack (csvFieldContent f))
-}

-- Some pp helpers - indent and quoted - should live elsewhere, in a
-- pretty-printing package.
indent :: Int -> String -> String
indent n = unlines . map (replicate n ' ' ++) . lines

quoted :: Bool -> ByteString -> ByteString
quoted False s = s
quoted True  s = BS.concat [BS.pack "\"", escape s, BS.pack "\""]
    where escape s = let (good,next) = BS.span (/='"') s
                     in if BS.null next then good
                        else BS.concat [ good, BS.pack "\"\""
                                       , escape (BS.tail next) ]

replaceNL :: ByteString -> ByteString
replaceNL s = let (good,next) = BS.span (/='\n') s
              in if BS.null next then good
                 else if BS.null good then replaceNL (BS.tail next)
                 else BS.concat [ good, BS.pack " ", replaceNL next ]

-- | Convert a CSV table to a simpler representation, by dropping all
--   the original location information.
fromCSVTable :: CSVTable -> [[ByteString]]
fromCSVTable = map (map csvFieldContent)

-- | Convert a simple list of lists into a CSVTable by the addition of
--   logical locations.  (Textual locations are not so useful.)
--   Rows of varying lengths generate errors.  Fields that need
--   quotation marks are automatically marked as such.
toCSVTable :: [[ByteString]] -> ([CSVError], CSVTable)
toCSVTable []         = ([NoData], [])
toCSVTable rows@(r:_) = (\ (a,b)-> (concat a, b)) $
                        unzip (zipWith walk [1..] rows)
  where
    n = length r
    walk :: Int -> [ByteString] -> ([CSVError], CSVRow)
    walk rnum [] = ( [blank rnum]
                   , map (\c-> mkCSVField rnum c BS.empty) [1..n])
    walk rnum cs = ( if length cs /= n then [bad rnum cs] else []
                   , zipWith (mkCSVField rnum) [1..n] cs )
    blank rnum = BlankLine{ csvRow          = rnum
                          , csvColsExpected = n
                          , csvColsActual   = 0
                          , csvField        = mkCSVField rnum 0 BS.empty }
    bad r cs   = IncorrectRow{ csvRow          = r
                             , csvColsExpected = n
                             , csvColsActual   = length cs
                             , csvFields = zipWith (mkCSVField r) [1..] cs }

-- | Select and/or re-arrange columns from a CSV table, based on names in the
--   header row of the table.  The original header row is re-arranged too.
--   The result is either a list of column names that were not present, or
--   the (possibly re-arranged) sub-table.
selectFields :: [String] -> CSVTable -> Either [String] CSVTable
selectFields names table
    | null table         = Left names
    | not (null missing) = Left missing
    | otherwise          = Right (map select table)
  where
    header     = map (BS.unpack . csvFieldContent) (head table)
    missing    = filter (`notElem` header) names
    reordering = map (fromJust . (\n-> elemIndex n header)) names
    select fields = map (fields!!) reordering

-- | Validate that the columns of a table have exactly the names and
--   ordering given in the argument.
expectFields :: [String] -> CSVTable -> Either [String] CSVTable
expectFields names table
    | null table         = Left ["CSV table is empty"]
    | not (null missing) = Left (map ("CSV table is missing field: "++)
                                     missing)
    | header /= names    = Left ["CSV columns are in the wrong order"
                                ,"Expected: "++intercalate ", " names
                                ,"Found: "++intercalate ", " header]
    | otherwise          = Right table
  where
    header  = map (BS.unpack . csvFieldContent) (head table)
    missing = filter (`notElem` header) names

-- | A join operator, adds the columns of two tables together.
--   Precondition: the tables have the same number of rows.
joinCSV :: CSVTable -> CSVTable -> CSVTable
joinCSV = zipWith (++)

-- | A generator for a new CSV column, of arbitrary length.
--   The result can be joined to an existing table if desired.
mkEmptyColumn :: String -> CSVTable
mkEmptyColumn header = [headField] : map ((:[]).emptyField) [2..]
  where
    headField    = (emptyField 1) { csvFieldContent = BS.pack header
                                  , csvFieldQuoted  = True }
    emptyField n = CSVField { csvRowNum       = n
                            , csvColNum       = 0
                            , csvTextStart    = (0,0)
                            , csvTextEnd      = (0,0)
                            , csvFieldContent = BS.empty
                            , csvFieldQuoted  = False }

-- | Generate a fresh field with the given textual content.
--   The quoting flag is set automatically based on the text.
--   Textual extents are not particularly useful, since there was no original
--   input to refer to.
mkCSVField :: Int -> Int -> ByteString -> CSVField
mkCSVField n c text =
    CSVField { csvRowNum    = n
             , csvColNum    = c
             , csvTextStart = (0,0)
             , csvTextEnd   = ( fromIntegral . BS.length
                                . BS.filter (=='\n') $ text
                              , fromIntegral . BS.length
                                . BS.takeWhile (/='\n') . BS.reverse $ text )
             , csvFieldContent = text
             , csvFieldQuoted  = any (`elem`"\",\n\r") (BS.unpack text) }

lazy-csv-0.5.1/Text/CSV/Lazy/String.hs

-- | The CSV (comma-separated value) format is defined by RFC 4180,
--   \"Common Format and MIME Type for Comma-Separated Values (CSV) Files\".
--
-- This lazy parser can report all CSV formatting errors, whilst also
-- returning all the valid data, so the user can choose whether to
-- continue, to show warnings, or to halt on error.
--
-- Valid fields retain information about their original location in the
-- input, so a secondary parser from textual fields to typed values
-- can give intelligent error messages.
--
-- In a valid CSV file, all rows must have the same number of columns.
-- This parser will flag a row with the wrong number of columns as an error.
-- (But the error type contains the actual data, so the user can recover
-- it if desired.)  Completely blank lines are also treated as errors,
-- and again the user is free either to filter these out or convert them
-- to a row of actual null fields.
module Text.CSV.Lazy.String
        ( -- * CSV types
          CSVTable
        , CSVRow
        , CSVField(..)
          -- * CSV parsing
        , CSVError(..)
        , CSVResult
        , csvErrors
        , csvTable
        , csvTableFull
        , csvTableHeader
        , parseCSV
        , parseDSV
          -- * Pretty-printing
        , ppCSVError
        , ppCSVField
        , ppCSVTable
        , ppDSVTable
          -- * Conversion between standard and simple representations
        , fromCSVTable
        , toCSVTable
          -- * Selection, validation, and algebra of CSV tables
        , selectFields
        , expectFields
        , mkEmptyColumn
        , joinCSV
        , mkCSVField
        ) where

import Data.List (groupBy, partition, elemIndex, intercalate, takeWhile
                 ,deleteFirstsBy, nub)
import Data.Function (on)
import Data.Maybe (fromJust)

-- | A CSV table is a sequence of rows.  All rows have the same number
--   of fields.
type CSVTable = [CSVRow]

-- | A CSV row is just a sequence of fields.
type CSVRow = [CSVField]

-- | A CSV field's content is stored with its logical row and column number,
--   as well as its textual extent.  This information is necessary if you
--   want to generate good error messages in a secondary parsing stage,
--   should you choose to convert the textual fields to typed data values.
data CSVField = CSVField      { csvRowNum       :: !Int
                              , csvColNum       :: !Int
                              , csvTextStart    :: !(Int,Int)
                              , csvTextEnd      :: !(Int,Int)
                              , csvFieldContent :: !String
                              , csvFieldQuoted  :: !Bool }
              | CSVFieldError { csvRowNum       :: !Int
                              , csvColNum       :: !Int
                              , csvTextStart    :: !(Int,Int)
                              , csvTextEnd      :: !(Int,Int)
                              , csvFieldError   :: !String }
        deriving (Eq,Show)

-- | A structured error type for CSV formatting mistakes.
data CSVError = IncorrectRow    { csvRow          :: !Int
                                , csvColsExpected :: !Int
                                , csvColsActual   :: !Int
                                , csvFields       :: [CSVField] }
              | BlankLine       { csvRow          :: !Int
                                , csvColsExpected :: !Int
                                , csvColsActual   :: !Int
                                , csvField        :: CSVField }
              | FieldError      { csvField        :: CSVField }
              | DuplicateHeader { csvColsExpected :: !Int
                                , csvHeaderSerial :: !Int
                                , csvDuplicate    :: !String }
              | NoData
        deriving (Eq,Show)

-- | The result of parsing a CSV input is a mixed collection of errors
--   and valid rows.  This way of representing things is crucial to the
--   ability to parse lazily whilst still catching format errors.
type CSVResult = [Either [CSVError] CSVRow]

-- | Extract just the valid portions of a CSV parse.
csvTable :: CSVResult -> CSVTable
csvTable r = [ v | Right v <- r ]

-- | Extract just the errors from a CSV parse.
csvErrors :: CSVResult -> [CSVError]
csvErrors r = concat [ v | Left v <- r ]

-- | Extract the full table, including invalid rows, repaired with padding,
--   and de-duplicated headers.
csvTableFull :: CSVResult -> CSVTable
csvTableFull = map beCareful . deduplicate
  where
    beCareful (Right row) = row
    beCareful (Left (r@IncorrectRow{}:_)) =
        csvFields r ++
        replicate (csvColsExpected r - csvColsActual r)
                  (mkCSVField (csvRow r) 0 "")
    beCareful (Left (r@BlankLine{}:_)) =
        replicate (csvColsExpected r)
                  (mkCSVField (csvRow r) 0 "")
    beCareful (Left (r@DuplicateHeader{}:_)) =     -- obsolete with deduping
        replicate (csvColsExpected r)
                  (mkCSVField 0 0 "")
    beCareful (Left (FieldError{}:r)) = beCareful (Left r)
    beCareful (Left (NoData:_))       = []
    beCareful (Left [])               = []

    deduplicate (Left (errs@(DuplicateHeader{}:_)):Right heads:rows) =
     -- Right (reverse $ foldl replace [] heads)
        Right (replaceInOrder errs (zip heads [0..]))
        : rows
    deduplicate rows = rows
{-
    replace output header
        | headerName `elem` map csvFieldContent output
                    = header{ csvFieldContent = headerName++"_duplicate" }
                      : output
        | otherwise = header: output
          where headerName = csvFieldContent header
-}
    replaceInOrder []       headers = map fst headers
    replaceInOrder _        []      = []
    replaceInOrder (d:dups) ((h,n):headers)
        | csvHeaderSerial d == n =
              h{ csvFieldContent = (csvDuplicate d++"_"++show n) }
              : replaceInOrder dups headers
        | otherwise              = h: replaceInOrder (d:dups) headers

-- | The header row of the CSV table, assuming it is non-empty.
csvTableHeader :: CSVResult -> [String]
csvTableHeader = map csvFieldContent . firstRow
    where firstRow (Left _: rest) = firstRow rest
          firstRow (Right x: _)   = x

-- | A first-stage parser for CSV (comma-separated values) data.
--   The individual fields remain as text, but errors in CSV formatting
--   are reported.  Errors (containing unrecognisable rows/fields) are
--   interspersed with the valid rows/fields.
parseCSV :: String -> CSVResult
parseCSV = parseDSV True ','

-- | Sometimes CSV is not comma-separated, but delimiter-separated
--   values (DSV).  The choice of delimiter is arbitrary, but semi-colon
--   is common in locales where comma is used as a decimal point, and tab
--   is also common.  The Boolean argument is whether newlines should be
--   accepted within quoted fields.  The CSV RFC says newlines can occur
--   in quotes, but other DSV formats might say otherwise.  You can often
--   get better error messages if newlines are disallowed.
parseDSV :: Bool -> Char -> String -> CSVResult
parseDSV qn delim = validate
                  . groupBy ((==)`on`csvRowNum)
                  . lexCSV qn delim

validate :: [CSVRow] -> CSVResult
validate []        = [Left [NoData]]
validate xs@(x:_)  = checkDuplicateHeaders x $ map (extractErrs (length x)) xs

extractErrs :: Int -> CSVRow -> Either [CSVError] CSVRow
extractErrs size row
    | length row0 == size && null errs0    = Right row0
    | length row0 == 1    && empty field0  = Left [blankLine field0]
    | otherwise                            = Left (map convert errs0
                                                   ++ validateColumns row0)
  where
    (row0,errs0) = partition isField row
    (field0:_)   = row0

    isField (CSVField{})      = True
    isField (CSVFieldError{}) = False

    empty f@(CSVField{}) = null (csvFieldContent f)
    empty _              = False

    convert err = FieldError {csvField = err}

    validateColumns r =
        if length r == size then []
        else [ IncorrectRow{ csvRow = if null r then 0
                                      else csvRowNum (head r)
                           , csvColsExpected = size
                           , csvColsActual   = length r
                           , csvFields       = r } ]
    blankLine f = BlankLine{ csvRow          = csvRowNum f
                           , csvColsExpected = size
                           , csvColsActual   = 1
                           , csvField        = f }

checkDuplicateHeaders :: CSVRow -> CSVResult -> CSVResult
checkDuplicateHeaders row result =
    let headers = [ f | f@(CSVField{}) <- row ]
        dups    = deleteFirstsBy ((==)`on`csvFieldContent)
                                 headers (nub headers)
        n       = length headers
    in if null dups then result
       else Left (map (\d-> DuplicateHeader
                              { csvColsExpected = n
                              , csvHeaderSerial = csvColNum d
                              , csvDuplicate    = csvFieldContent d })
                      dups)
            : result

-- Reading CSV data is essentially lexical, and can be implemented with a
-- simple finite state machine.  We keep track of logical row number,
-- logical column number (in tabular terms), and textual position (row,col)
-- to enable good error messages.
-- Positional data is retained even after successful lexing, in case a
-- second-stage field parser wants to complain.
--
-- A double-quoted CSV field may contain commas, newlines, and double quotes.
data CSVState = CSVState { tableRow, tableCol :: !Int
                         , textRow,  textCol  :: !Int }

incTableRow, incTableCol, incTextRow, incTextCol :: CSVState -> CSVState
incTableRow st = st { tableRow = tableRow st + 1 }
incTableCol st = st { tableCol = tableCol st + 1 }
incTextRow  st = st { textRow  = textRow st + 1 }
incTextCol  st = st { textCol  = textCol st + 1 }

-- Lexer is a small finite state machine.
lexCSV :: Bool -> Char -> [Char] -> [CSVField]
lexCSV quotedNewline delim =
    simple CSVState{tableRow=1,tableCol=1,textRow=1,textCol=1} (1,1) []
  where

  -- 'simple' recognises an unquoted field, and delimiter char as separator
  simple :: CSVState -> (Int,Int) -> String -> String -> [CSVField]
  simple _ _     []  []  = []
  simple s begin acc []  = mkField s begin acc False : []
  simple s begin acc (c:cs)
      | not (interesting c) = simple (incTextCol $! s) begin (c:acc) cs
  simple s begin acc (c:'"':cs)
      | c==delim            = mkField s begin acc False :
                              string s' (textRow s',textCol s') [] cs
      where s' = incTextCol . incTextCol . incTableCol $! s
  simple s begin acc (c:cs)
      | c==delim            = mkField s begin acc False :
                              simple s' (textRow s',textCol s') [] cs
      where s' = incTableCol . incTextCol $! s
  simple s begin acc ('\r':'\n':cs) = mkField s begin acc False :
                                      simple s' (textRow s',1) [] cs
      where s' = incTableRow . incTextRow $! s {tableCol=1, textCol=1}
  simple s begin acc ('\n' :cs)     = mkField s begin acc False :
                                      simple s' (textRow s',1) [] cs
      where s' = incTableRow . incTextRow $! s {tableCol=1, textCol=1}
  simple s begin acc ('\r' :cs)     = mkField s begin acc False :
                                      simple s' (textRow s',1) [] cs
      where s' = incTableRow . incTextRow $! s {tableCol=1, textCol=1}
  simple s begin []  ('"' :cs)      = string (incTextCol $! s) begin [] cs
  simple s begin acc ('"' :cs)      = mkError s begin
                                          "Start-quote not next to comma":
                                      string (incTextCol $! s) begin acc cs

  -- 'string' recognises a double-quoted field containing commas and newlines
  string :: CSVState -> (Int,Int) -> String -> String -> [CSVField]
  string s begin []  []  = mkError s begin "Data ends at start-quote": []
  string s begin acc []  = mkError s begin "Data ends in quoted field": []
  string s begin acc (c:cs)
      | not (interestingInString c) = string (incTextCol $! s) begin (c:acc) cs
  string s begin acc ('"':'"':cs)   = string s' begin ('"':acc) cs
      where s' = incTextCol . incTextCol $! s
  string s begin acc ('"':c:'"':cs)
      | c==delim  = mkField s begin acc True :
                    string s' (textRow s',textCol s') [] cs
      where s' = incTextCol . incTextCol . incTextCol . incTableCol $! s
  string s begin acc ('"':c:cs)
      | c==delim  = mkField s begin acc True :
                    simple s' (textRow s',textCol s') [] cs
      where s' = incTextCol . incTextCol . incTableCol $! s
  string s begin acc ('"':'\n':cs)  = mkField s begin acc True :
                                      simple s' (textRow s',1) [] cs
      where s' = incTableRow . incTextRow $! s {tableCol=1, textCol=1}
  string s begin acc ('"':'\r':'\n':cs) = mkField s begin acc True :
                                          simple s' (textRow s',1) [] cs
      where s' = incTableRow . incTextRow $! s {tableCol=1, textCol=1}
  string s begin acc ('"':[])       = mkField s begin acc True : []
  string s begin acc ('"':cs)       = mkError s begin
                                          "End-quote not followed by comma":
                                      simple (incTextCol $! s) begin acc cs
  string s begin acc ('\r':'\n':cs)
      | quotedNewline = string s' begin ('\n':acc) cs
      | otherwise     = mkError s begin "Found newline within quoted field":
                        simple s'' (textRow s'',textCol s'') [] cs
      where s'  = incTextRow $! s {textCol=1}
            s'' = incTableRow . incTextRow $! s {textCol=1, tableCol=1}
  string s begin acc ('\n' :cs)
      | quotedNewline = string s' begin ('\n':acc) cs
      | otherwise     = mkError s begin "Found newline within quoted field":
                        simple s'' (textRow s'',textCol s'') [] cs
      where s'  = incTextRow $! s {textCol=1}
            s'' = incTableRow . incTextRow $! s {textCol=1, tableCol=1}

  interesting :: Char -> Bool
  interesting '\n' = True
  interesting '\r' = True
  interesting '"'  = True
  interesting c    = c==delim

  interestingInString :: Char -> Bool
  interestingInString '\n' = True
  interestingInString '\r' = True
  interestingInString '"'  = True
  interestingInString _    = False

  -- generate the lexical tokens representing either a field or an error
  mkField st begin f q = CSVField { csvRowNum      = tableRow st
                                  , csvColNum      = tableCol st
                                  , csvTextStart   = begin
                                  , csvTextEnd     = (textRow st,textCol st)
                                  , csvFieldContent = reverse f
                                  , csvFieldQuoted  = q }
  mkError st begin e = CSVFieldError { csvRowNum    = tableRow st
                                     , csvColNum    = tableCol st
                                     , csvTextStart = begin
                                     , csvTextEnd   = (textRow st,textCol st)
                                     , csvFieldError = e }

-- | Some pretty-printing for structured CSV errors.
ppCSVError :: CSVError -> String
ppCSVError (err@IncorrectRow{}) =
    "\nRow "++show (csvRow err)++" has wrong number of fields."++
    "\n Expected "++show (csvColsExpected err)++" but got "++
    show (csvColsActual err)++"."++
    "\n The fields are:"++
    indent 8 (concatMap ppCSVField (csvFields err))
ppCSVError (err@BlankLine{}) =
    "\nRow "++show (csvRow err)++" is blank."++
    "\n Expected "++show (csvColsExpected err)++" fields."
ppCSVError (err@FieldError{}) = ppCSVField (csvField err)
ppCSVError (err@DuplicateHeader{}) =
    "\nThere are two (or more) identical column headers: "++
    show (csvDuplicate err)++"."++
    "\n Column number "++show (csvHeaderSerial err)
ppCSVError (err@NoData{}) =
    "\nNo usable data (after accounting for any other errors)."

-- | Pretty-printing for CSV fields, shows positional information in addition
--   to the textual content.
ppCSVField :: CSVField -> String
ppCSVField (f@CSVField{}) =
    "\n"++quoted (csvFieldQuoted f) (csvFieldContent f)++
    "\nin row "++show (csvRowNum f)++" at column "++show (csvColNum f)++
    " (textually from "++show (csvTextStart f)++" to "++
    show (csvTextEnd f)++")"
ppCSVField (f@CSVFieldError{}) =
    "\n"++csvFieldError f++
    "\nin row "++show (csvRowNum f)++" at column "++show (csvColNum f)++
    " (textually from "++show (csvTextStart f)++" to "++
    show (csvTextEnd f)++")"

-- | Turn a full CSV table back into text, as much like the original
--   input as possible, e.g. preserving quoted/unquoted format of fields.
ppCSVTable :: CSVTable -> String
ppCSVTable = unlines . map (intercalate "," . map ppField)
    where ppField f = quoted (csvFieldQuoted f) (csvFieldContent f)

-- | Turn a full CSV table back into text, using the given delimiter
--   character.  Quoted/unquoted formatting of the original is preserved.
--   The Boolean argument is to repair fields containing newlines, by
--   replacing the nl with a space.
ppDSVTable :: Bool -> Char -> CSVTable -> String
ppDSVTable nl delim = unlines . map (intercalate [delim] . map ppField)
    where ppField f = quoted (csvFieldQuoted f) (doNL $ csvFieldContent f)
          doNL | nl        = replaceNL
               | otherwise = id

-- Some pp helpers - indent and quoted - should live elsewhere, in a
-- pretty-printing package.
indent :: Int -> String -> String
indent n = unlines . map (replicate n ' ' ++) . lines

quoted :: Bool -> String -> String
quoted False s = s
quoted True  s = '"': escape s ++ "\""
    where escape ('"':cs) = '"':'"': escape cs
          escape (c:cs)   = c: escape cs
          escape []       = []

replaceNL :: String -> String
replaceNL ('\n':s) = ' ': replaceNL s
replaceNL (c:s)    = c: replaceNL s
replaceNL []       = []

-- | Convert a CSV table to a simpler representation, by dropping all
--   the original location information.
fromCSVTable :: CSVTable -> [[String]]
fromCSVTable = map (map csvFieldContent)

-- | Convert a simple list of lists into a CSVTable by the addition of
--   logical locations.  (Textual locations are not so useful.)
--   Rows of varying lengths generate errors.  Fields that need
--   quotation marks are automatically marked as such.
toCSVTable :: [[String]] -> ([CSVError], CSVTable)
toCSVTable []         = ([NoData], [])
toCSVTable rows@(r:_) = (\ (a,b)-> (concat a, b)) $
                        unzip (zipWith walk [1..] rows)
  where
    n = length r
    walk :: Int -> [String] -> ([CSVError], CSVRow)
    walk rnum [] = ( [blank rnum]
                   , map (\c-> mkCSVField rnum c "") [1..n])
    walk rnum cs = ( if length cs /= n then [bad rnum cs] else []
                   , zipWith (mkCSVField rnum) [1..n] cs )
    blank rnum = BlankLine{ csvRow          = rnum
                          , csvColsExpected = n
                          , csvColsActual   = 0
                          , csvField        = mkCSVField rnum 0 "" }
    bad r cs   = IncorrectRow{ csvRow          = r
                             , csvColsExpected = n
                             , csvColsActual   = length cs
                             , csvFields = zipWith (mkCSVField r) [1..] cs }

-- | Select and/or re-arrange columns from a CSV table, based on names in the
--   header row of the table.  The original header row is re-arranged too.
--   The result is either a list of column names that were not present, or
--   the (possibly re-arranged) sub-table.
selectFields :: [String] -> CSVTable -> Either [String] CSVTable
selectFields names table
    | null table         = Left names
    | not (null missing) = Left missing
    | otherwise          = Right (map select table)
  where
    header     = map csvFieldContent (head table)
    missing    = filter (`notElem` header) names
    reordering = map (fromJust . (\n-> elemIndex n header)) names
    select fields = map (fields!!) reordering

-- | Validate that the named columns of a table have exactly the names and
--   ordering given in the argument.
expectFields :: [String] -> CSVTable -> Either [String] CSVTable
expectFields names table
    | null table         = Left ["CSV table is empty"]
    | not (null missing) = Left (map ("CSV table is missing field: "++)
                                     missing)
    | header /= names    = Left ["CSV columns are in the wrong order"
                                ,"Expected: "++intercalate ", " names
                                ,"Found: "++intercalate ", " header]
    | otherwise          = Right table
  where
    header  = map csvFieldContent (head table)
    missing = filter (`notElem` header) names

-- | A join operator, adds the columns of two tables together.
--   Precondition: the tables have the same number of rows.
joinCSV :: CSVTable -> CSVTable -> CSVTable
joinCSV = zipWith (++)

-- | A generator for a new CSV column, of arbitrary length.
--   The result can be joined to an existing table if desired.
mkEmptyColumn :: String -> CSVTable
mkEmptyColumn header = [mkCSVField 1 0 header]
                       : map (\n-> [mkCSVField n 0 ""]) [2..]

-- | Generate a fresh field with the given textual content.
--   The quoting flag is set automatically based on the text.
--   Textual extents are not particularly useful, since there was no original
--   input to refer to.
mkCSVField :: Int -> Int -> String -> CSVField
mkCSVField n c text =
    CSVField { csvRowNum    = n
             , csvColNum    = c
             , csvTextStart = (0,0)
             , csvTextEnd   = ( length (filter (=='\n') text)
                              , length . takeWhile (/='\n')
                                . reverse $ text )
             , csvFieldContent = text
             , csvFieldQuoted  = any (`elem`"\",\n\r") text }
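
Finally, a small usage sketch of my own (not part of the package) exercising the simple-representation API of Text.CSV.Lazy.String: toCSVTable invents logical locations for a plain list of lists, and selectFields re-arranges columns by header name.

module Main where

import Text.CSV.Lazy.String

-- Build a tiny table from plain lists, then select and re-order columns.
main :: IO ()
main = do
  let (errs, table) = toCSVTable [ ["name","age"]
                                 , ["Alice","41"]
                                 , ["Bob","29"] ]
  mapM_ (putStrLn . ppCSVError) errs          -- none, for this input
  case selectFields ["age","name"] table of
    Left missing -> putStrLn ("Missing columns: " ++ unwords missing)
    Right sub    -> putStr (ppCSVTable sub)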