xeno-0.6/app/0000755000000000000000000000000014124710313011272 5ustar0000000000000000xeno-0.6/bench/0000755000000000000000000000000014124710313011571 5ustar0000000000000000xeno-0.6/data/0000755000000000000000000000000014124710313011423 5ustar0000000000000000xeno-0.6/src/0000755000000000000000000000000014124710313011301 5ustar0000000000000000xeno-0.6/src/Control/0000755000000000000000000000000014124710313012721 5ustar0000000000000000xeno-0.6/src/Xeno/0000755000000000000000000000000014303077203012215 5ustar0000000000000000xeno-0.6/src/Xeno/DOM/0000755000000000000000000000000014210366106012634 5ustar0000000000000000xeno-0.6/test/0000755000000000000000000000000014210366106011474 5ustar0000000000000000xeno-0.6/src/Xeno/SAX.hs0000644000000000000000000004377614303077203013225 0ustar0000000000000000{-# LANGUAGE CPP #-} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE BinaryLiterals #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ScopedTypeVariables #-} -- | SAX parser and API for XML. module Xeno.SAX ( process , Process(..) , StringLike(..) , fold , validate , validateEx , dump , skipDoctype ) where import Control.Exception (throw) import Control.Monad (unless) import Control.Monad.ST (ST, runST) import Control.Monad.State.Strict (State, evalStateT, execState, modify', lift, get, put) import Control.Spork (spork) import Data.Bits (testBit) import Data.ByteString (ByteString) import qualified Data.ByteString as S import qualified Data.ByteString.Char8 as S8 import qualified Data.ByteString.Unsafe as SU import Data.Char (isSpace) import Data.Functor.Identity (Identity(..)) import Data.Semigroup () import Data.STRef (newSTRef, modifySTRef', readSTRef) import Data.Word (Word8, Word64) import Xeno.Types class StringLike str where s_index' :: str -> Int -> Word8 elemIndexFrom' :: Word8 -> str -> Int -> Maybe Int drop' :: Int -> str -> str substring' :: str -> Int -> Int -> ByteString toBS :: str -> ByteString instance StringLike ByteString where s_index' = s_index {-# INLINE s_index' #-} elemIndexFrom' = elemIndexFrom {-# INLINE elemIndexFrom' #-} drop' = S.drop {-# INLINE drop' #-} substring' = substring {-# INLINE substring' #-} toBS = id {-# INLINE toBS #-} instance StringLike ByteStringZeroTerminated where s_index' (BSZT ps) n = ps `SU.unsafeIndex` n {-# INLINE s_index' #-} elemIndexFrom' w (BSZT bs) i = elemIndexFrom w bs i {-# INLINE elemIndexFrom' #-} drop' i (BSZT bs) = BSZT $ S.drop i bs {-# INLINE drop' #-} substring' (BSZT bs) s t = substring bs s t {-# INLINE substring' #-} toBS (BSZT bs) = bs {-# INLINE toBS #-} -- | Parameters to the 'process' function data Process a = Process { openF :: !(ByteString -> a) -- ^ Open tag. , attrF :: !(ByteString -> ByteString -> a) -- ^ Tag attribute. , endOpenF :: !(ByteString -> a) -- ^ End open tag. , textF :: !(ByteString -> a) -- ^ Text. , closeF :: !(ByteString -> a) -- ^ Close tag. , cdataF :: !(ByteString -> a) -- ^ CDATA. } -------------------------------------------------------------------------------- -- Helpful interfaces to the parser -- | Parse the XML but return no result, process no events. -- -- N.B.: Only the lexical correctness of the input string is checked, not its XML semantics (e.g. only if tags are well formed, not whether tags are properly closed) -- -- > > :set -XOverloadedStrings -- > > validate "" -- > True -- -- > > validate " False validate :: (StringLike str) => str -> Bool validate s = case spork (runIdentity (process Process { openF = \_ -> pure () , attrF = \_ _ -> pure () , endOpenF = \_ -> pure () , textF = \_ -> pure () , closeF = \_ -> pure () , cdataF = \_ -> pure () } s)) of Left (_ :: XenoException) -> False Right _ -> True -- It must be inlined or specialised to ByteString/ByteStringZeroTerminated {-# INLINE validate #-} {-# SPECIALISE validate :: ByteString -> Bool #-} {-# SPECIALISE validate :: ByteStringZeroTerminated -> Bool #-} -- | Parse the XML and checks tags nesting. -- validateEx :: (StringLike str) => str -> Bool validateEx s = case spork (runST $ do tags <- newSTRef [] (process Process { openF = \tag -> modifySTRef' tags (tag:) , attrF = \_ _ -> pure () , endOpenF = \_ -> pure () , textF = \_ -> pure () , closeF = \tag -> modifySTRef' tags $ \case [] -> fail $ "Unexpected close tag \"" ++ show tag ++ "\"" (expectedTag:tags') -> if expectedTag == tag then tags' else fail $ "Unexpected close tag. Expected \"" ++ show expectedTag ++ "\", but got \"" ++ show tag ++ "\"" , cdataF = \_ -> pure () } s) readSTRef tags >>= \case [] -> return () tags' -> error $ "Not all tags closed: " ++ show tags' ) of Left (_ :: XenoException) -> False Right _ -> True {-# INLINE validateEx #-} {-# SPECIALISE validateEx :: ByteString -> Bool #-} {-# SPECIALISE validateEx :: ByteStringZeroTerminated -> Bool #-} -- | Parse the XML and pretty print it to stdout. dump :: ByteString -> IO () dump str = evalStateT (process Process { openF = \name -> do level <- get lift (S8.putStr (S8.replicate level ' ' <> "<" <> name <> "")) , attrF = \key value -> lift (S8.putStr (" " <> key <> "=\"" <> value <> "\"")) , endOpenF = \_ -> do level <- get let !level' = level + 2 put level' lift (S8.putStrLn (">")) , textF = \text -> do level <- get lift (S8.putStrLn (S8.replicate level ' ' <> S8.pack (show text))) , closeF = \name -> do level <- get let !level' = level - 2 put level' lift (S8.putStrLn (S8.replicate level' ' ' <> " name <> ">")) , cdataF = \cdata -> do level <- get lift (S8.putStrLn (S8.replicate level ' ' <> "CDATA: " <> S8.pack (show cdata))) } str) (0 :: Int) -- | Fold over the XML input. fold :: (s -> ByteString -> s) -- ^ Open tag. -> (s -> ByteString -> ByteString -> s) -- ^ Attribute key/value. -> (s -> ByteString -> s) -- ^ End of open tag. -> (s -> ByteString -> s) -- ^ Text. -> (s -> ByteString -> s) -- ^ Close tag. -> (s -> ByteString -> s) -- ^ CDATA. -> s -> ByteString -> Either XenoException s fold openF attrF endOpenF textF closeF cdataF s str = spork (execState (process Process { openF = \name -> modify' (\s' -> openF s' name) , attrF = \key value -> modify' (\s' -> attrF s' key value) , endOpenF = \name -> modify' (\s' -> endOpenF s' name) , textF = \text -> modify' (\s' -> textF s' text) , closeF = \name -> modify' (\s' -> closeF s' name) , cdataF = \cdata -> modify' (\s' -> cdataF s' cdata) } str) s) -------------------------------------------------------------------------------- -- Main parsing function -- | Process events with callbacks in the XML input. process :: (Monad m, StringLike str) => Process (m ()) -> str -> m () process !(Process {openF, attrF, endOpenF, textF, closeF, cdataF}) str = findLT 0 where findLT index = case elemIndexFrom' openTagChar str index of Nothing -> unless (S.null text) (textF text) where text = toBS $ drop' index str Just fromLt -> do unless (S.null text) (textF text) checkOpenComment (fromLt + 1) where text = substring' str index fromLt -- Find open comment, CDATA or tag name. checkOpenComment index | s_index' this 0 == bangChar -- ! , s_index' this 1 == commentChar -- - , s_index' this 2 == commentChar -- - = findCommentEnd (index + 3) | s_index' this 0 == bangChar -- ! , s_index' this 1 == openAngleBracketChar -- [ , s_index' this 2 == 67 -- C , s_index' this 3 == 68 -- D , s_index' this 4 == 65 -- A , s_index' this 5 == 84 -- T , s_index' this 6 == 65 -- A , s_index' this 7 == openAngleBracketChar -- [ = findCDataEnd (index + 8) (index + 8) | otherwise = findTagName index where this = drop' index str findCommentEnd index = case elemIndexFrom' commentChar str index of Nothing -> throw $ XenoParseError index "Couldn't find the closing comment dash." Just fromDash -> if s_index' this 0 == commentChar && s_index' this 1 == closeTagChar then findLT (fromDash + 2) else findCommentEnd (fromDash + 1) where this = drop' index str findCDataEnd cdata_start index = case elemIndexFrom' closeAngleBracketChar str index of Nothing -> throw $ XenoParseError index "Couldn't find closing angle bracket for CDATA." Just fromCloseAngleBracket -> if s_index' str (fromCloseAngleBracket + 1) == closeAngleBracketChar then do cdataF (substring' str cdata_start fromCloseAngleBracket) findLT (fromCloseAngleBracket + 3) -- Start after ]]> else -- We only found one ], that means that we need to keep searching. findCDataEnd cdata_start (fromCloseAngleBracket + 1) findTagName index0 | s_index' str index0 == questionChar = case elemIndexFrom' closeTagChar str spaceOrCloseTag of Nothing -> throw $ XenoParseError index "Couldn't find the end of the tag." Just fromGt -> do findLT (fromGt + 1) | s_index' str spaceOrCloseTag == closeTagChar = do let tagname = substring' str index spaceOrCloseTag if s_index' str index0 == slashChar then closeF tagname else do openF tagname endOpenF tagname findLT (spaceOrCloseTag + 1) | otherwise = do let tagname = substring' str index spaceOrCloseTag openF tagname result <- findAttributes spaceOrCloseTag endOpenF tagname case result of Right closingTag -> findLT (closingTag + 1) Left closingPair -> do closeF tagname findLT (closingPair + 2) where index = if s_index' str index0 == slashChar then index0 + 1 else index0 spaceOrCloseTag = parseName str index findAttributes index0 -- case: /> | s_index' str index == slashChar , s_index' str (index + 1) == closeTagChar = pure (Left index) -- case: > | s_index' str index == closeTagChar = pure (Right index) -- case: attr=' or attr=" | s_index' str afterAttrName == equalChar , usedChar == quoteChar || usedChar == doubleQuoteChar = case elemIndexFrom' usedChar str (quoteIndex + 1) of Nothing -> throw (XenoParseError index "Couldn't find the matching quote character.") Just endQuoteIndex -> do attrF (substring' str index afterAttrName) (substring' str (quoteIndex + 1) (endQuoteIndex)) findAttributes (endQuoteIndex + 1) -- case: attr= without following quote | s_index' str afterAttrName == equalChar = throw (XenoParseError index("Expected ' or \", got: " <> S.singleton usedChar)) | otherwise = throw (XenoParseError index ("Expected =, got: " <> S.singleton (s_index' str afterAttrName) <> " at character index: " <> (S8.pack . show) afterAttrName)) where index = skipSpaces str index0 #ifdef WHITESPACE_AROUND_EQUALS afterAttrName = skipSpaces str (parseName str index) quoteIndex = skipSpaces str (afterAttrName + 1) #else afterAttrName = parseName str index quoteIndex = afterAttrName + 1 #endif usedChar = s_index' str quoteIndex {-# INLINE process #-} {-# SPECIALISE process :: Process (Identity ()) -> ByteString -> Identity () #-} {-# SPECIALISE process :: Process (State s ()) -> ByteString -> State s () #-} {-# SPECIALISE process :: Process (ST s ()) -> ByteString -> ST s () #-} {-# SPECIALISE process :: Process (IO ()) -> ByteString -> IO () #-} {-# SPECIALISE process :: Process (Identity ()) -> ByteStringZeroTerminated -> Identity () #-} {-# SPECIALISE process :: Process (State s ()) -> ByteStringZeroTerminated -> State s () #-} {-# SPECIALISE process :: Process (ST s ()) -> ByteStringZeroTerminated -> ST s () #-} {-# SPECIALISE process :: Process (IO ()) -> ByteStringZeroTerminated -> IO () #-} -------------------------------------------------------------------------------- -- ByteString utilities -- | /O(1)/ 'ByteString' index (subscript) operator, starting from 0. s_index :: ByteString -> Int -> Word8 s_index ps n | n < 0 = throw (XenoStringIndexProblem n ps) | n >= S.length ps = throw (XenoStringIndexProblem n ps) | otherwise = ps `SU.unsafeIndex` n {-# INLINE s_index #-} -- | A fast space skipping function. skipSpaces :: (StringLike str) => str -> Int -> Int skipSpaces str i = if isSpaceChar (s_index' str i) then skipSpaces str (i + 1) else i {-# INLINE skipSpaces #-} -- | Get a substring of a string. substring :: ByteString -> Int -> Int -> ByteString substring s start end = S.take (end - start) (S.drop start s) {-# INLINE substring #-} -- | Basically @findIndex (not . isNameChar)@, but doesn't allocate. parseName :: (StringLike str) => str -> Int -> Int parseName str index = if not (isNameChar1 (s_index' str index)) then index else parseName' str (index + 1) {-# INLINE parseName #-} -- | Basically @findIndex (not . isNameChar)@, but doesn't allocate. parseName' :: (StringLike str) => str -> Int -> Int parseName' str index = if not (isNameChar (s_index' str index)) then index else parseName' str (index + 1) {-# INLINE parseName' #-} -- | Get index of an element starting from offset. elemIndexFrom :: Word8 -> ByteString -> Int -> Maybe Int elemIndexFrom c str offset = fmap (+ offset) (S.elemIndex c (S.drop offset str)) -- Without the INLINE below, the whole function is twice as slow and -- has linear allocation. See git commit with this comment for -- results. {-# INLINE elemIndexFrom #-} -------------------------------------------------------------------------------- -- Character types isSpaceChar :: Word8 -> Bool isSpaceChar = testBit (0b100000000000000000010011000000000 :: Int) . fromIntegral -- | | || bits: -- | | |+-- 9 -- | | +--- 10 -- | +------ 13 -- +------------------------- 32 {-# INLINE isSpaceChar #-} -- | Is the character a valid first tag/attribute name constituent? -- 'a'-'z', 'A'-'Z', '_', ':' isNameChar1 :: Word8 -> Bool isNameChar1 c = (c >= 97 && c <= 122) || (c >= 65 && c <= 90) || c == 95 || c == 58 {-# INLINE isNameChar1 #-} -- isNameCharOriginal :: Word8 -> Bool -- isNameCharOriginal c = -- (c >= 97 && c <= 122) || (c >= 65 && c <= 90) || c == 95 || c == 58 || -- c == 45 || c == 46 || (c >= 48 && c <= 57) -- {-# INLINE isNameCharOriginal #-} -- -- TODO Strange, but highMaskIsNameChar, lowMaskIsNameChar don't calculate fast... FIX IT -- highMaskIsNameChar, lowMaskIsNameChar :: Word64 -- (highMaskIsNameChar, lowMaskIsNameChar) = -- foldl (\(hi,low) char -> (hi `setBit` (char - 64), low `setBit` char)) -- NB: `setBit` can process overflowed values (where char < 64; -- TODO fix it -- (0::Word64, 0::Word64) -- (map fromIntegral (filter isNameCharOriginal [0..128])) -- {-# INLINE highMaskIsNameChar #-} -- {-# INLINE lowMaskIsNameChar #-} -- | Is the character a valid tag/attribute name constituent? -- isNameChar1 + '-', '.', '0'-'9' isNameChar :: Word8 -> Bool isNameChar char = (lowMaskIsNameChar `testBit` char'low) || (highMaskIsNameChar `testBit` char'high) -- TODO 1) change code to use W# instead of Word64 -- 2) Document `ii - 64` -- there is underflow, but `testBit` can process this! where char'low = fromIntegral char char'high = fromIntegral (char - 64) highMaskIsNameChar :: Word64 highMaskIsNameChar = 0b11111111111111111111111111010000111111111111111111111111110 -- ------------+------------- | ------------+------------- -- | | | bits: -- | | +-- 65-90 -- | +------------------- 95 -- +---------------------------------- 97-122 lowMaskIsNameChar :: Word64 lowMaskIsNameChar = 0b11111111111011000000000000000000000000000000000000000000000 -- -----+----- || -- | || bits: -- | |+-- 45 -- | +--- 46 -- +---------- 48-58 {-# INLINE isNameChar #-} -- | Char for '\''. quoteChar :: Word8 quoteChar = 39 -- | Char for '"'. doubleQuoteChar :: Word8 doubleQuoteChar = 34 -- | Char for '='. equalChar :: Word8 equalChar = 61 -- | Char for '?'. questionChar :: Word8 questionChar = 63 -- | Char for '/'. slashChar :: Word8 slashChar = 47 -- | Exclaimation character !. bangChar :: Word8 bangChar = 33 -- | The dash character. commentChar :: Word8 commentChar = 45 -- '-' -- | Open tag character. openTagChar :: Word8 openTagChar = 60 -- '<' -- | Close tag character. closeTagChar :: Word8 closeTagChar = 62 -- '>' -- | Open angle bracket character. openAngleBracketChar :: Word8 openAngleBracketChar = 91 -- | Close angle bracket character. closeAngleBracketChar :: Word8 closeAngleBracketChar = 93 -- | Skip initial DOCTYPE declaration skipDoctype :: ByteString -> ByteString skipDoctype arg = if "" `S8.breakSubstring` bs in skipBlanks $ S8.drop 1 rest else bs where bs = skipBlanks arg skipBlanks = S8.dropWhile isSpace xeno-0.6/src/Xeno/DOM.hs0000644000000000000000000002046114210366106013173 0ustar0000000000000000{-# LANGUAGE BangPatterns #-} {-# LANGUAGE DeriveAnyClass #-} {-# LANGUAGE DeriveDataTypeable #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE CPP #-} -- | DOM parser and API for XML. module Xeno.DOM ( parse , Node , Content(..) , name , attributes , contents , children ) where import Control.Monad.ST import Control.Spork import Data.ByteString (ByteString) #if MIN_VERSION_bytestring(0,11,0) import Data.ByteString.Internal (ByteString(BS)) #else import Data.ByteString.Internal (ByteString(PS)) #endif import qualified Data.ByteString as S import Data.Mutable import Data.STRef import qualified Data.Vector.Unboxed as UV import qualified Data.Vector.Unboxed.Mutable as UMV #if MIN_VERSION_bytestring(0,11,0) import Foreign.Ptr (minusPtr) import Foreign.ForeignPtr (ForeignPtr, withForeignPtr) import System.IO.Unsafe (unsafeDupablePerformIO) #endif import Xeno.SAX import Xeno.Types import Xeno.DOM.Internal -- | Parse a complete Nodes document. parse :: ByteString -> Either XenoException Node parse str = maybe (Left XenoExpectRootNode) Right . findRootNode =<< spork node where findRootNode r = go 0 where go n = case r UV.!? n of Just 0x0 -> Just (Node str n r) -- skipping text assuming that it contains only white space -- characters Just 0x1 -> go (n+3) _ -> Nothing #if MIN_VERSION_bytestring(0,11,0) BS offset0 _ = str #else PS _ offset0 _ = str #endif node = let !initialSize = max 1000 (S.length str `div` 8) in runST (do nil <- UMV.unsafeNew initialSize vecRef <- newSTRef nil sizeRef <- fmap asURef (newRef 0) parentRef <- fmap asURef (newRef 0) process Process { #if MIN_VERSION_bytestring(0,11,0) openF = \(BS name_start name_len) -> do #else openF = \(PS _ name_start name_len) -> do #endif let tag = 0x00 tag_end = -1 index <- readRef sizeRef v' <- do v <- readSTRef vecRef if index + 5 < UMV.length v then pure v else do v' <- UMV.unsafeGrow v (predictGrowSize name_start name_len (index + 5) (UMV.length v)) writeSTRef vecRef v' return v' tag_parent <- readRef parentRef do writeRef parentRef index writeRef sizeRef (index + 5) do UMV.unsafeWrite v' index tag UMV.unsafeWrite v' (index + 1) tag_parent UMV.unsafeWrite v' (index + 2) (distance name_start offset0) UMV.unsafeWrite v' (index + 3) name_len UMV.unsafeWrite v' (index + 4) tag_end #if MIN_VERSION_bytestring(0,11,0) , attrF = \(BS key_start key_len) (BS value_start value_len) -> do #else , attrF = \(PS _ key_start key_len) (PS _ value_start value_len) -> do #endif index <- readRef sizeRef v' <- do v <- readSTRef vecRef if index + 5 < UMV.length v then pure v else do v' <- UMV.unsafeGrow v (predictGrowSize value_start value_len (index + 5) (UMV.length v)) writeSTRef vecRef v' return v' let tag = 0x02 do writeRef sizeRef (index + 5) do UMV.unsafeWrite v' index tag UMV.unsafeWrite v' (index + 1) (distance key_start offset0) UMV.unsafeWrite v' (index + 2) key_len UMV.unsafeWrite v' (index + 3) (distance value_start offset0) UMV.unsafeWrite v' (index + 4) value_len , endOpenF = \_ -> return () #if MIN_VERSION_bytestring(0,11,0) , textF = \(BS text_start text_len) -> do #else , textF = \(PS _ text_start text_len) -> do #endif let tag = 0x01 index <- readRef sizeRef v' <- do v <- readSTRef vecRef if index + 3 < UMV.length v then pure v else do v' <- UMV.unsafeGrow v (predictGrowSize text_start text_len (index + 3) (UMV.length v)) writeSTRef vecRef v' return v' do writeRef sizeRef (index + 3) do UMV.unsafeWrite v' index tag UMV.unsafeWrite v' (index + 1) (distance text_start offset0) UMV.unsafeWrite v' (index + 2) text_len , closeF = \_ -> do v <- readSTRef vecRef -- Set the tag_end slot of the parent. parent <- readRef parentRef index <- readRef sizeRef UMV.unsafeWrite v (parent + 4) index -- Pop the stack and return to the parent element. previousParent <- UMV.unsafeRead v (parent + 1) writeRef parentRef previousParent #if MIN_VERSION_bytestring(0,11,0) , cdataF = \(BS cdata_start cdata_len) -> do #else , cdataF = \(PS _ cdata_start cdata_len) -> do #endif let tag = 0x03 index <- readRef sizeRef v' <- do v <- readSTRef vecRef if index + 3 < UMV.length v then pure v else do v' <- UMV.unsafeGrow v (predictGrowSize cdata_start cdata_len (index + 3) (UMV.length v)) writeSTRef vecRef v' return v' writeRef sizeRef (index + 3) UMV.unsafeWrite v' index tag UMV.unsafeWrite v' (index + 1) (distance cdata_start offset0) UMV.unsafeWrite v' (index + 2) cdata_len } str wet <- readSTRef vecRef arr <- UV.unsafeFreeze wet size <- readRef sizeRef return (UV.unsafeSlice 0 size arr)) where -- Growing a large vector is slow, so we need to do it less times. -- We can predict final array size after processing some part (i.e. 1/4) of input XML. -- -- predictGrowSize _bsStart _bsLen _index vecLen = round $ fromIntegral vecLen * (1.25 :: Double) predictGrowSize bsStart bsLen index vecLen = let -- at least 1 so we don't divide by zero below and end up with -- a negative grow size if (bsStart + bsLen - offset0) == 0 processedLen = max 1 (distance bsStart offset0 + bsLen) -- 1. Using integral operations, such as -- "predictedTotalSize = (index * S.length str) `div` processedLen" -- cause overflow, so we use float. -- 2. Slightly enlarge predicted size to compensite copy on vector grow -- if prediction is incorrect k = (1.25 :: Double) * fromIntegral (S.length str) / fromIntegral processedLen predictedTotalSize = round $ fromIntegral index * k growSize = predictedTotalSize - vecLen in growSize #if MIN_VERSION_bytestring(0,11,0) minusForeignPtr :: ForeignPtr a -> ForeignPtr b -> Int minusForeignPtr fpA fpB = unsafeDupablePerformIO $ withForeignPtr fpA $ \ptrA -> withForeignPtr fpB $ \ptrB -> pure (minusPtr ptrA ptrB) distance :: ForeignPtr a -> ForeignPtr b -> Int distance = minusForeignPtr #else distance :: Int -> Int -> Int distance a b = a - b #endif xeno-0.6/src/Xeno/DOM/Internal.hs0000644000000000000000000000672214124710313014750 0ustar0000000000000000{-# LANGUAGE BangPatterns #-} {-# LANGUAGE DeriveAnyClass #-} {-# LANGUAGE DeriveDataTypeable #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} -- | Efficient DOM data structure module Xeno.DOM.Internal ( Node(..) , Content(..) , name , attributes , contents , children ) where import Control.DeepSeq import Data.ByteString (ByteString) import qualified Data.ByteString as S import Data.Data (Data, Typeable) import Data.Vector.Unboxed ((!)) import qualified Data.Vector.Unboxed as UV --import Debug.Trace --trace _ a = a -- | Some XML nodes. data Node = Node !ByteString !Int !(UV.Vector Int) deriving (Eq, Data, Typeable) instance NFData Node where rnf !_ = () instance Show Node where show n = "(Node " ++ show (name n) ++ " " ++ show (attributes n) ++ " " ++ show (contents n) ++ ")" -- | Content of a node. data Content = Element {-# UNPACK #-}!Node | Text {-# UNPACK #-}!ByteString | CData {-# UNPACK #-}!ByteString deriving (Eq, Show, Data, Typeable) instance NFData Content where rnf !_ = () -- | Get just element children of the node (no text). children :: Node -> [Node] children (Node str start offsets) = collect firstChild where collect i | i < endBoundary = case offsets ! i of 0x00 -> Node str i offsets : collect (offsets ! (i + 4)) 0x01 -> collect (i + 3) _off -> [] -- trace ("Offsets " <> show i <> " is " <> show off) [] | otherwise = [] firstChild = go (start + 5) where go i | i < endBoundary = case offsets ! i of 0x02 -> go (i + 5) _ -> i | otherwise = i endBoundary = offsets ! (start + 4) -- | Contents of a node. contents :: Node -> [Content] contents (Node str start offsets) = collect firstChild where collect i | i < endBoundary = case offsets ! i of 0x00 -> Element (Node str i offsets) : collect (offsets ! (i + 4)) 0x01 -> Text (substring str (offsets ! (i + 1)) (offsets ! (i + 2))) : collect (i + 3) 0x03 -> CData (substring str (offsets ! (i + 1)) (offsets ! (i + 2))) : collect (i + 3) _ -> [] | otherwise = [] firstChild = go (start + 5) where go i | i < endBoundary = case offsets ! i of 0x02 -> go (i + 5) _ -> i | otherwise = i endBoundary = offsets ! (start + 4) -- | Attributes of a node. attributes :: Node -> [(ByteString,ByteString)] attributes (Node str start offsets) = collect (start + 5) where collect i | i < endBoundary = case offsets ! i of 0x02 -> ( substring str (offsets ! (i + 1)) (offsets ! (i + 2)) , substring str (offsets ! (i + 3)) (offsets ! (i + 4))) : collect (i + 5) _ -> [] | otherwise = [] endBoundary = offsets ! (start + 4) -- | Name of the element. name :: Node -> ByteString name (Node str start offsets) = case offsets ! start of 0x00 -> substring str (offsets ! (start + 2)) (offsets ! (start + 3)) _ -> error "Node cannot have empty name" -- mempty -- | Get a substring of the BS. substring :: ByteString -> Int -> Int -> ByteString substring s' start len = S.take len (S.drop start s') {-# INLINE substring #-} xeno-0.6/src/Xeno/DOM/Robust.hs0000644000000000000000000001715514210366106014457 0ustar0000000000000000{-# LANGUAGE BangPatterns #-} {-# LANGUAGE DeriveAnyClass #-} {-# LANGUAGE DeriveDataTypeable #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE CPP #-} -- | DOM parser and API for XML. -- Slightly slower DOM parsing, -- but add missing close tags. module Xeno.DOM.Robust ( parse , Node , Content(..) , name , attributes , contents , children ) where import Control.Monad.ST import Control.Spork #if MIN_VERSION_bytestring(0,11,0) import Data.ByteString.Internal as BS (ByteString(..), plusForeignPtr) #else import Data.ByteString.Internal(ByteString(..)) #endif import Data.STRef import qualified Data.Vector.Unboxed as UV import qualified Data.Vector.Unboxed.Mutable as UMV import Data.Mutable(asURef, newRef, readRef, writeRef) #if MIN_VERSION_bytestring(0,11,0) import Foreign.Ptr (minusPtr) import Foreign.ForeignPtr (ForeignPtr, withForeignPtr) import System.IO.Unsafe (unsafeDupablePerformIO) #endif import Xeno.SAX import Xeno.Types import Xeno.DOM.Internal(Node(..), Content(..), name, attributes, contents, children) -- | Parse a complete Nodes document. parse :: ByteString -> Either XenoException Node parse inp = case spork node of Left e -> Left e Right r -> case findRootNode r of Just n -> Right n Nothing -> Left XenoExpectRootNode where findRootNode r = go 0 where go n = case r UV.!? n of Just 0x0 -> Just (Node str n r) -- skipping text assuming that it contains only white space -- characters Just 0x1 -> go (n+3) _ -> Nothing #if MIN_VERSION_bytestring(0,11,0) BS offset0 _ = str #else PS _ offset0 _ = str #endif str = skipDoctype inp node = runST (do nil <- UMV.new 1000 vecRef <- newSTRef nil sizeRef <- fmap asURef $ newRef 0 parentRef <- fmap asURef $ newRef 0 process Process { #if MIN_VERSION_bytestring(0,11,0) openF = \(BS name_start name_len) -> do #else openF = \(PS _ name_start name_len) -> do #endif let tag = 0x00 tag_end = -1 index <- readRef sizeRef v' <- do v <- readSTRef vecRef if index + 5 < UMV.length v then pure v else do v' <- UMV.grow v (UMV.length v) writeSTRef vecRef v' return v' tag_parent <- readRef parentRef do writeRef parentRef index writeRef sizeRef (index + 5) UMV.write v' index tag UMV.write v' (index + 1) tag_parent UMV.write v' (index + 2) (distance name_start offset0) UMV.write v' (index + 3) name_len UMV.write v' (index + 4) tag_end #if MIN_VERSION_bytestring(0,11,0) , attrF = \(BS key_start key_len) (BS value_start value_len) -> do #else , attrF = \(PS _ key_start key_len) (PS _ value_start value_len) -> do #endif index <- readRef sizeRef v' <- do v <- readSTRef vecRef if index + 5 < UMV.length v then pure v else do v' <- UMV.grow v (UMV.length v) writeSTRef vecRef v' return v' let tag = 0x02 do writeRef sizeRef (index + 5) do UMV.write v' index tag UMV.write v' (index + 1) (distance key_start offset0) UMV.write v' (index + 2) key_len UMV.write v' (index + 3) (distance value_start offset0) UMV.write v' (index + 4) value_len , endOpenF = \_ -> return () #if MIN_VERSION_bytestring(0,11,0) , textF = \(BS text_start text_len) -> do #else , textF = \(PS _ text_start text_len) -> do #endif let tag = 0x01 index <- readRef sizeRef v' <- do v <- readSTRef vecRef if index + 3 < UMV.length v then pure v else do v' <- UMV.grow v (UMV.length v) writeSTRef vecRef v' return v' do writeRef sizeRef (index + 3) do UMV.write v' index tag UMV.write v' (index + 1) (distance text_start offset0) UMV.write v' (index + 2) text_len #if MIN_VERSION_bytestring(0,11,0) , closeF = \closeTag@(BS _ _) -> do #else , closeF = \closeTag@(PS s _ _) -> do #endif v <- readSTRef vecRef -- Set the tag_end slot of the parent. index <- readRef sizeRef untilM $ do parent <- readRef parentRef correctTag <- if parent == 0 then return True -- no more tags to close!!! else do parent_name <- UMV.read v (parent + 2) parent_len <- UMV.read v (parent + 3) #if MIN_VERSION_bytestring(0,11,0) let openTag = BS (BS.plusForeignPtr offset0 parent_name) parent_len #else let openTag = PS s (parent_name+offset0) parent_len #endif return $ openTag == closeTag UMV.write v (parent + 4) index -- Pop the stack and return to the parent element. previousParent <- UMV.read v (parent + 1) writeRef parentRef previousParent return correctTag -- continue closing tags, until matching one is found #if MIN_VERSION_bytestring(0,11,0) , cdataF = \(BS cdata_start cdata_len) -> do #else , cdataF = \(PS _ cdata_start cdata_len) -> do #endif let tag = 0x03 index <- readRef sizeRef v' <- do v <- readSTRef vecRef if index + 3 < UMV.length v then pure v else do v' <- UMV.grow v (UMV.length v) writeSTRef vecRef v' return v' do writeRef sizeRef (index + 3) do UMV.write v' index tag UMV.write v' (index + 1) (distance cdata_start offset0) UMV.write v' (index + 2) cdata_len } str wet <- readSTRef vecRef arr <- UV.unsafeFreeze wet size <- readRef sizeRef return (UV.unsafeSlice 0 size arr)) untilM :: Monad m => m Bool -> m () untilM loop = do cond <- loop case cond of True -> return () False -> untilM loop #if MIN_VERSION_bytestring(0,11,0) minusForeignPtr :: ForeignPtr a -> ForeignPtr b -> Int minusForeignPtr fpA fpB = unsafeDupablePerformIO $ withForeignPtr fpA $ \ptrA -> withForeignPtr fpB $ \ptrB -> pure (minusPtr ptrA ptrB) distance :: ForeignPtr a -> ForeignPtr b -> Int distance = minusForeignPtr #else distance :: Int -> Int -> Int distance a b = a - b #endif xeno-0.6/src/Xeno/Types.hs0000644000000000000000000000200414124710313013646 0ustar0000000000000000{-# LANGUAGE CPP #-} {-# LANGUAGE DeriveAnyClass #-} {-# LANGUAGE DeriveDataTypeable #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE FlexibleInstances #-} -- | Shared types. module Xeno.Types where import Control.DeepSeq import Control.Exception import Data.ByteString.Char8 (ByteString, pack) import Data.Data import GHC.Generics #if MIN_VERSION_base(4,9,0) import Control.Monad.Fail -- It is recommended to use more specific `failHere` instead instance MonadFail (Either Xeno.Types.XenoException) where fail = Left . XenoParseError 0 . pack #endif data XenoException = XenoStringIndexProblem { stringIndex :: Int, inputString :: ByteString } | XenoParseError { inputIndex :: Int, message :: ByteString } | XenoExpectRootNode deriving (Show, Data, Typeable, NFData, Generic) instance Exception XenoException where displayException = show -- | ByteString wich guaranted have '\NUL' at the end newtype ByteStringZeroTerminated = BSZT ByteString deriving (Generic, NFData) xeno-0.6/src/Xeno/Errors.hs0000644000000000000000000000433014210366106014025 0ustar0000000000000000{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE Strict #-} {-# LANGUAGE ViewPatterns #-} {-# LANGUAGE CPP #-} -- | Simplifies raising and presenting localized exceptions to the user. module Xeno.Errors(printExceptions ,displayException ,getStartIndex ,failHere ) where import Data.Semigroup((<>)) import qualified Data.ByteString.Char8 as BS hiding (elem) import Data.ByteString.Internal(ByteString(..)) import System.IO(stderr) import Xeno.Types {-# NOINLINE failHere #-} failHere :: BS.ByteString -> BS.ByteString -> Either XenoException a failHere msg here = Left $ XenoParseError (getStartIndex here) msg -- | Print schema errors with excerpts printExceptions :: BS.ByteString -> [XenoException] -> IO () printExceptions i s = (BS.hPutStrLn stderr . displayException i) `mapM_` s -- | Find line number of the error from ByteString index. lineNo :: Int -> BS.ByteString -> Int lineNo index bs = BS.count '\n' $ BS.take index bs -- | Show for ByteStrings bshow :: Show a => a -> BS.ByteString bshow = BS.pack . show {-# INLINE CONLIKE getStartIndex #-} -- FIXME remove this; there's no offset in the bytestring. getStartIndex :: BS.ByteString -> Int getStartIndex (PS _ from _) = from displayException :: BS.ByteString -> XenoException -> BS.ByteString displayException input (XenoParseError i msg) = "Parse error in line " <> bshow (lineNo i input) <> ": " <> msg <> " at:\n" <> lineContentBeforeError <> lineContentAfterError <> "\n" <> pointer where lineContentBeforeError = snd $ BS.spanEnd eoln $ revTake 40 $ BS.take i input lineContentAfterError = BS.takeWhile eoln $ BS.take 40 $ BS.drop i input pointer = BS.replicate (BS.length lineContentBeforeError) ' ' <> "^" eoln ch = ch /= '\n' && ch /= '\r' displayException _ err = bshow err -- | Take n last bytes. revTake :: Int -> BS.ByteString -> BS.ByteString revTake i bs = if i >= len then bs else BS.drop (len - i) bs where len = fromIntegral (BS.length bs) xeno-0.6/src/Control/Spork.hs0000644000000000000000000000065214124710313014356 0ustar0000000000000000-- | Like the spoon package, but for catching one specific exception -- type and returning it. module Control.Spork ( spork ) where import Control.Exception import System.IO.Unsafe -- | Evaluate `a` and return left if it throws a pure exception. spork :: Exception e => a -> Either e a spork a = unsafePerformIO $ (Right `fmap` evaluate a) `catches` [Handler (\e -> pure (Left e))] {-# INLINEABLE spork #-} xeno-0.6/test/Main.hs0000644000000000000000000001725414210366106012725 0ustar0000000000000000{-# LANGUAGE CPP #-} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE OverloadedStrings #-} -- | Simple test suite. module Main where import Data.Either (isRight) import Data.ByteString (ByteString) import qualified Data.ByteString as BS import Test.Hspec import Xeno.SAX (validate, skipDoctype) import Xeno.DOM (Node, Content(..), parse, name, contents, attributes, children) import qualified Xeno.DOM.Robust as RDOM import Xeno.Types import qualified Debug.Trace as Debug(trace) main :: IO () main = hspec spec spec :: SpecWith () spec = do describe "Xeno.DOM tests" $ do it "test 1" $ do xml <- BS.readFile "data/books-4kb.xml" let (Right dom) = parse xml (name dom) `shouldBe` "catalog" (length $ contents dom) `shouldBe` 25 (length $ children dom) `shouldBe` 12 (length $ allChildrens dom) `shouldBe` 84 (length $ concatMap attributes $ allChildrens dom) `shouldBe` 12 (concatMap attributes $ allChildrens dom) `shouldBe` [("id","bk101"),("id","bk102"),("id","bk103"),("id","bk104") ,("id","bk105"),("id","bk106"),("id","bk107"),("id","bk108") ,("id","bk109"),("id","bk110"),("id","bk111"),("id","bk112")] (map name $ allChildrens dom) `shouldBe` (replicate 12 "book" ++ (concat $ replicate 12 ["author","title","genre","price","publish_date","description"])) describe "Xeno.DOM tests" $ do it "DOM from bytestring substring" $ do let substr = BS.drop 5 "5<8& xml" parsedRoot = fromRightE $ parse substr name parsedRoot `shouldBe` "valid" it "Leading whitespace characters are accepted by parse" $ isRight (parse "\n") `shouldBe` True let doc = parse "\n" it "children test" $ map name (children $ fromRightE doc) `shouldBe` ["test", "test", "b", "test", "test"] it "attributes" $ attributes (head (children $ fromRightE doc)) `shouldBe` [("id", "1"), ("extra", "2")] it "xml prologue test" $ do let docWithPrologue = "\nHello, world!" parsedRoot = fromRightE $ Xeno.DOM.parse docWithPrologue name parsedRoot `shouldBe` "greeting" describe "hexml tests" (do mapM_ (\(v, i) -> it (show i) (shouldBe (validate i) v)) $ concat [ hexml_examples_sax , extra_examples_sax #ifdef WHITESPACE_AROUND_EQUALS , ws_around_equals_sax #endif ] mapM_ (\(v, i) -> it (show i) (shouldBe (either (Left . show) (Right . id) (contents <$> parse i)) v)) cdata_tests -- If this works without crashing we're happy. let nsdoc = ("Content." :: ByteString) it "namespaces" $ validate nsdoc `shouldBe` True ) describe "robust XML tests" $ do it "DOM from bytestring substring" $ do let substr = BS.drop 5 "5<8& xml" parsedRoot = fromRightE $ RDOM.parse substr name parsedRoot `shouldBe` "valid" it "Leading whitespace characters are accepted by parse" $ isRight (RDOM.parse "\n") `shouldBe` True let doc = RDOM.parse "\n" it "children test" $ map name (children $ fromRightE doc) `shouldBe` ["test", "test", "b", "test", "test"] it "attributes" $ attributes (head (children $ fromRightE doc)) `shouldBe` [("id", "1"), ("extra", "2")] it "xml prologue test" $ do let docWithPrologue = "\nHello, world!" parsedRoot = fromRightE $ RDOM.parse docWithPrologue name parsedRoot `shouldBe` "greeting" it "html doctype test" $ do let docWithPrologue = "\nHello, world!" parsedRoot = fromRightE $ RDOM.parse docWithPrologue name parsedRoot `shouldBe` "greeting" describe "hexml tests" (do mapM_ (\(v, i) -> it (show i) (shouldBe (validate i) v)) (hexml_examples_sax ++ extra_examples_sax) mapM_ (\(v, i) -> it (show i) (shouldBe (either (Left . show) (Right . id) (contents <$> parse i)) v)) cdata_tests -- If this works without crashing we're happy. let nsdoc = ("Content." :: ByteString) it "namespaces" $ validate nsdoc `shouldBe` True ) it "recovers unclosed tag" $ do let parsed = RDOM.parse "" Debug.trace (show parsed) $ do name (fromRightE parsed) `shouldBe` "a" RDOM.attributes (fromRightE parsed) `shouldBe` [("attr", "a")] map name (RDOM.children $ fromRightE parsed) `shouldBe` ["img"] it "ignores too many closing tags" $ do let parsed = RDOM.parse "" isRight parsed `shouldBe` True describe "skipDoctype" $ do it "strips initial doctype declaration" $ do skipDoctype "Hello" `shouldBe` "Hello" it "strips doctype after spaces" $ do skipDoctype " \nHello" `shouldBe` "Hello" it "does not strip anything after or inside element" $ do let insideElt = "Hello" skipDoctype insideElt `shouldBe` insideElt hexml_examples_sax :: [(Bool, ByteString)] hexml_examples_sax = [(True, "herethere") ,(True, "") ,(True, "") ,(True, "here more text at the end") ,(True, "") -- SAX doesn't care about tag balancing ,(False, "\nHello, world!") ] extra_examples_sax :: [(Bool, ByteString)] extra_examples_sax = [(True, "") ,(True, "") ,(True, "") ] ws_around_equals_sax :: [(Bool, ByteString)] ws_around_equals_sax = [(True, "") ] -- | We want to make sure that the parser doesn't jump out of the CDATA -- area prematurely because it encounters a single ]. cdata_tests :: [(Either a [Content], ByteString)] cdata_tests = [ ( Right [CData "Oneliner CDATA."] , "") , ( Right [CData "This is strong but not XML tags."] , "This is strong but not XML tags.]]>") , ( Right [CData "A lonely ], sad isn't it?"] , "") ] -- | Horrible hack. Don't try this at home. fromRightE :: Either XenoException a -> a fromRightE = either (error . show) id mapLeft :: Applicative f => (a -> f b) -> Either a b -> f b mapLeft f = either f pure mapRight :: Applicative f => (b -> f a) -> Either a b -> f a mapRight = either pure allChildrens :: Node -> [Node] allChildrens n = allChildrens' [n] where allChildrens' :: [Node] -> [Node] allChildrens' [] = [] allChildrens' ns = let nextNodes = concatMap children ns in nextNodes ++ (allChildrens' nextNodes) xeno-0.6/app/Bench.hs0000644000000000000000000000306214124710313012646 0ustar0000000000000000{-# LANGUAGE BangPatterns #-} {-# LANGUAGE QuasiQuotes #-} import Control.Monad import Data.Time.Clock import System.IO.Posix.MMap import System.Mem import Xeno.DOM import qualified Data.ByteString as BS main :: IO () main = do let prefix = "ex-data/" files' = map (prefix ++) [ {- 921 Mb -} "1htq.xml" , {- 190 Mb -} "enwiki-20190901-abstract10.xml" , {- 1.6 Gb -} "enwiki-20190901-pages-logging1.xml" , {- 4.0 Gb -} "enwiki-20190901-pages-meta-current24.xml-p30503451p32003451.xml" -- , {- 21 Gb -} "enwiki-20190901-pages-meta-history2.xml" ] files = concat $ replicate 5 files' -- deltas <- forM files $ \fn -> do putStrLn $ "Processing file '" ++ show fn ++ "'" -- -- NOTE: It is need to cache file in memory BEFORE start test. -- It can be done with `vmtouch` utility for example (`vmtouch -vtL *`). -- bs <- unsafeMMapFile fn -- bs <- BS.readFile fn putStrLn $ " size: " ++ show (BS.length bs `div` (1024*1024)) ++ " Mb" performGC start <- getCurrentTime -- SAX: -- let res = validate bs -- putStrLn [qc| process result: {res}|] -- DOM: (\(Right !_node) -> putStrLn " processed!") (parse bs) finish <- getCurrentTime let delta = finish `diffUTCTime` start putStrLn $ " processing time: " ++ show delta return delta -- putStrLn "------" putStrLn $ "Total: " ++ show (sum deltas) xeno-0.6/bench/SpeedBigFiles.hs0000644000000000000000000001024214124710313014571 0ustar0000000000000000{-# LANGUAGE BangPatterns #-} {-# LANGUAGE CPP #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE StandaloneDeriving #-} {-# OPTIONS_GHC -fno-warn-orphans -Wno-unused-imports #-} -- | Benchmark speed with big files module Main where import Codec.Compression.BZip import Control.DeepSeq import Criterion import Criterion.Main import Data.ByteString (ByteString) import qualified Data.ByteString.Lazy as L import Data.List (delete) import GHC.Generics import System.FilePath.Posix import qualified Text.XML.Expat.SAX as Hexpat import qualified Text.XML.Expat.Tree as HexpatTree import qualified Text.XML.Hexml as Hexml import Text.XML.Light as XML import Text.XML.Light.Input as XML import qualified Xeno.Types import qualified Xeno.SAX import qualified Xeno.DOM import qualified Xeno.DOM.Robust import qualified Data.ByteString as S #ifdef LIBXML2 import qualified Text.XML.LibXML.Parser as Libxml2 #endif main :: IO () main = defaultMain [ benchFile allTests "46MB" "enwiki-20190901-pages-articles14.xml-p7697599p7744799.bz2" , benchFile allTests "624MB" "enwiki-20190901-pages-articles-multistream1.xml-p10p30302.bz2" , benchFile allTests "921MB" "1HTQ.xml.bz2" , benchFile allTests "1.6Gb" "enwiki-20190901-pages-meta-current6.xml-p565314p892912.bz2" , benchFile allExceptHexml "4Gb" "enwiki-20190901-pages-meta-current24.xml-p30503451p32003451.bz2" -- , benchFile allExceptHexml "21Gb" "enwiki-20190901-pages-meta-history2.xml-p31255p31720.bz2" ] allTests :: [String] allTests = [ "hexml-dom" , "xeno-sax" , "xeno-sax-z" -- , "xeno-sax-ex" -- , "xeno-sax-ex-z" , "xeno-dom" , "xeno-dom-with-recovery" -- XXX: "hexpact", "xml-dom" library don't work with big files; require too much memory -- , "hexpat-sax" -- , "hexpat-dom" -- , "xml-dom" -- , "libxml2-dom" ] allExceptHexml :: [String] allExceptHexml = "hexml-dom" `delete` allTests benchFile :: [String] -> String -> FilePath -> Benchmark benchFile enabledTests size fn = env (readBZip2File fn) (\ ~(input, inputz) -> bgroup size $ benchMethods enabledTests input inputz) benchMethods :: [String] -> ByteString -> Xeno.Types.ByteStringZeroTerminated -> [Benchmark] benchMethods enabledTests input inputz = runBench "hexml-dom" (whnf Hexml.parse input) ++ runBench "xeno-sax" (whnf Xeno.SAX.validate input) ++ runBench "xeno-sax-z" (whnf Xeno.SAX.validate inputz) ++ runBench "xeno-sax-ex " (whnf Xeno.SAX.validateEx input) ++ runBench "xeno-sax-ex-z" (whnf Xeno.SAX.validateEx inputz) ++ runBench "xeno-dom" (whnf Xeno.DOM.parse input) ++ runBench "xeno-dom-with-recovery" (whnf Xeno.DOM.Robust.parse input) ++ runBench "hexpat-sax" (whnf ((Hexpat.parseThrowing Hexpat.defaultParseOptions :: L.ByteString -> [Hexpat.SAXEvent ByteString ByteString]) . L.fromStrict) input) ++ runBench "hexpat-dom" (whnf ((HexpatTree.parse' HexpatTree.defaultParseOptions :: ByteString -> Either HexpatTree.XMLParseError (HexpatTree.Node ByteString ByteString))) input) ++ runBench "xml-dom" (nf XML.parseXMLDoc input) #ifdef LIBXML2 ++ runBench "libxml2-dom" (whnfIO (Libxml2.parseMemory input)) #endif where runBench name act | name `elem` enabledTests = [bench name act] | otherwise = [] readBZip2File :: FilePath -> IO (ByteString, Xeno.Types.ByteStringZeroTerminated) readBZip2File fn = do file <- L.readFile ("data" "ex" fn) let !bs = L.toStrict $ decompress file !bsz = Xeno.Types.BSZT $ bs `S.snoc` 0 return (bs, bsz) deriving instance Generic Content deriving instance Generic Element deriving instance Generic CData deriving instance Generic CDataKind deriving instance Generic QName deriving instance Generic Attr instance NFData Content instance NFData Element instance NFData CData instance NFData CDataKind instance NFData QName instance NFData Attr xeno-0.6/bench/Memory.hs0000644000000000000000000000231714124710313013400 0ustar0000000000000000{-# OPTIONS_GHC -fno-warn-orphans #-} {-# LANGUAGE BangPatterns #-} -- | Benchmark memory allocations. module Main where import Control.DeepSeq import qualified Data.ByteString as S import qualified Text.XML.Hexml as Hexml import Weigh import qualified Xeno.DOM import qualified Xeno.DOM.Robust import qualified Xeno.SAX main :: IO () main = do f4kb <- S.readFile "data/books-4kb.xml" f31kb <- S.readFile "data/text-31kb.xml" f211kb <- S.readFile "data/fabricated-211kb.xml" mainWith (do func "4kb_hexml_dom" Hexml.parse f4kb func "4kb_xeno_sax" Xeno.SAX.validate f4kb func "4kb_xeno_dom" Xeno.DOM.parse f4kb func "4kb_xeno_dom-with-recovery" Xeno.DOM.Robust.parse f4kb func "31kb_hexml_dom" Hexml.parse f31kb func "31kb_xeno_sax" Xeno.SAX.validate f31kb func "31kb_xeno_dom" Xeno.DOM.parse f31kb func "31kb_xeno_dom-with-recovery" Xeno.DOM.Robust.parse f31kb func "211kb_hexml_dom" Hexml.parse f211kb func "211kb_xeno_sax" Xeno.SAX.validate f211kb func "211kb_xeno_dom" Xeno.DOM.parse f211kb func "211kb_xeno_dom-with-recovery" Xeno.DOM.Robust.parse f211kb) instance NFData Hexml.Node where rnf !_ = () xeno-0.6/bench/Speed.hs0000644000000000000000000000547414124710313013177 0ustar0000000000000000{-# LANGUAGE CPP #-} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE StandaloneDeriving, DeriveGeneric #-} {-# OPTIONS_GHC -fno-warn-orphans #-} -- | Benchmark speed. module Main where import Control.DeepSeq import Criterion import Criterion.Main import Data.ByteString (ByteString) import qualified Data.ByteString as S import qualified Data.ByteString.Lazy as L import GHC.Generics import qualified Text.XML.Expat.SAX as Hexpat import qualified Text.XML.Expat.Tree as HexpatTree import qualified Text.XML.Hexml as Hexml import Text.XML.Light as XML import qualified Xeno.SAX import qualified Xeno.Types import qualified Xeno.DOM import qualified Xeno.DOM.Robust #ifdef LIBXML2 import qualified Text.XML.LibXML.Parser as Libxml2 #endif readFileZ :: FilePath -> IO (ByteString, Xeno.Types.ByteStringZeroTerminated) readFileZ fn = do !s <- S.readFile fn let !sz = Xeno.Types.BSZT (s `S.snoc` 0) return (s, sz) main :: IO () main = defaultMain $ (flip map) [ ("4KB", "data/books-4kb.xml") , ("31KB", "data/text-31kb.xml") , ("211KB", "data/fabricated-211kb.xml") ] $ \(group, fn) -> env (readFileZ fn) (\ ~(!input, !inputz) -> bgroup group [ bench "hexml-dom" (whnf Hexml.parse input) , bench "xeno-sax" (whnf Xeno.SAX.validate input) , bench "xeno-sax-z" (whnf Xeno.SAX.validate inputz) , bench "xeno-sax-ex" (whnf Xeno.SAX.validateEx input) , bench "xeno-sax-ex-z" (whnf Xeno.SAX.validateEx inputz) , bench "xeno-dom" (whnf Xeno.DOM.parse input) , bench "xeno-dom-with-recovery" (whnf Xeno.DOM.Robust.parse input) , bench "hexpat-sax" (whnf ((Hexpat.parseThrowing Hexpat.defaultParseOptions :: L.ByteString -> [Hexpat.SAXEvent ByteString ByteString]) . L.fromStrict) input) , bench "hexpat-dom" (whnf ((HexpatTree.parse' HexpatTree.defaultParseOptions :: ByteString -> Either HexpatTree.XMLParseError (HexpatTree.Node ByteString ByteString))) input) , bench "xml-dom" (nf XML.parseXMLDoc input) #ifdef LIBXML2 , bench "libxml2-dom" (whnfIO (Libxml2.parseMemory input)) #endif ]) deriving instance Generic Content deriving instance Generic Element deriving instance Generic CData deriving instance Generic CDataKind deriving instance Generic QName deriving instance Generic Attr instance NFData Content instance NFData Element instance NFData CData instance NFData CDataKind instance NFData QName instance NFData Attr xeno-0.6/data/books-4kb.xml0000644000000000000000000001047414124710313013746 0ustar0000000000000000 Gambardella, Matthew XML Developer's Guide Computer 44.95 2000-10-01 An in-depth look at creating applications with XML. Ralls, Kim Midnight Rain Fantasy 5.95 2000-12-16 A former architect battles corporate zombies, an evil sorceress, and her own childhood to become queen of the world. Corets, Eva Maeve Ascendant Fantasy 5.95 2000-11-17 After the collapse of a nanotechnology society in England, the young survivors lay the foundation for a new society. Corets, Eva Oberon's Legacy Fantasy 5.95 2001-03-10 In post-apocalypse England, the mysterious agent known only as Oberon helps to create a new life for the inhabitants of London. Sequel to Maeve Ascendant. Corets, Eva The Sundered Grail Fantasy 5.95 2001-09-10 The two daughters of Maeve, half-sisters, battle one another for control of England. Sequel to Oberon's Legacy. Randall, Cynthia Lover Birds Romance 4.95 2000-09-02 When Carla meets Paul at an ornithology conference, tempers fly as feathers get ruffled. Thurman, Paula Splish Splash Romance 4.95 2000-11-02 A deep sea diver finds true love twenty thousand leagues beneath the sea. Knorr, Stefan Creepy Crawlies Horror 4.95 2000-12-06 An anthology of horror stories about roaches, centipedes, scorpions and other insects. Kress, Peter Paradox Lost Science Fiction 6.95 2000-11-02 After an inadvertant trip through a Heisenberg Uncertainty Device, James Salway discovers the problems of being quantum. O'Brien, Tim Microsoft .NET: The Programming Bible Computer 36.95 2000-12-09 Microsoft's .NET initiative is explored in detail in this deep programmer's reference. O'Brien, Tim MSXML3: A Comprehensive Guide Computer 36.95 2000-12-01 The Microsoft MSXML3 parser is covered in detail, with attention to XML DOM interfaces, XSLT processing, SAX and more. Galos, Mike Visual Studio 7: A Comprehensive Guide Computer 49.95 2001-04-16 Microsoft Visual Studio 7 is explored in depth, looking at how Visual Basic, Visual C++, C#, and ASP+ are integrated into a comprehensive development environment. xeno-0.6/LICENSE0000644000000000000000000000264514124710313011526 0ustar0000000000000000Copyright (c) 2016-2017, Chris Done All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of hindent nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CHRIS DONE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xeno-0.6/Setup.hs0000644000000000000000000000005614124710313012147 0ustar0000000000000000import Distribution.Simple main = defaultMain xeno-0.6/xeno.cabal0000644000000000000000000000630014303077203012451 0ustar0000000000000000name: xeno version: 0.6 synopsis: A fast event-based XML parser in pure Haskell description: A fast, low-memory use, event-based XML parser in pure Haskell. build-type: Simple category: XML, Parser cabal-version: >=1.10 homepage: https://github.com/ocramz/xeno license: BSD3 license-file: LICENSE author: Christopher Done maintainer: Marco Zocca (ocramz fripost org) tested-with: GHC == 8.0.1, GHC == 8.2.2, GHC == 8.4.2, GHC == 8.4.4, GHC == 8.6.5, GHC == 9.0.1, GHC == 9.0.4 extra-source-files: README.md CHANGELOG.markdown CONTRIBUTORS.md data-files: data/books-4kb.xml source-repository head type: git location: https://github.com/ocramz/xeno flag libxml2 description: Include libxml2 in the benchmarks default: False flag whitespace-around-equals description: Correctly parse whitespace around the = characters in attribute definitions default: False library hs-source-dirs: src ghc-options: -Wall -O2 exposed-modules: Xeno.SAX, Xeno.DOM, Xeno.DOM.Internal, Xeno.DOM.Robust, Xeno.Types, Xeno.Errors other-modules: Control.Spork build-depends: base >= 4.7 && < 5 , bytestring >= 0.10.8 , vector >= 0.11 , deepseq >= 1.4.2 , array >= 0.5.1 , mutable-containers >= 0.3.3 , mtl >= 2.2.1 if flag(whitespace-around-equals) cpp-options: -DWHITESPACE_AROUND_EQUALS default-language: Haskell2010 test-suite xeno-test type: exitcode-stdio-1.0 hs-source-dirs: test main-is: Main.hs build-depends: base, xeno, hexml, hspec, bytestring -- | DEBUG , hspec ghc-options: -Wall -threaded -rtsopts -with-rtsopts=-N if flag(whitespace-around-equals) cpp-options: -DWHITESPACE_AROUND_EQUALS default-language: Haskell2010 benchmark xeno-speed-bench type: exitcode-stdio-1.0 hs-source-dirs: bench main-is: Speed.hs build-depends: base, xeno, hexml, criterion, bytestring, deepseq, ghc-prim, xml, hexpat if flag(libxml2) build-depends: libxml ghc-options: -Wall -rtsopts -O2 if flag(libxml2) cpp-options: -DLIBXML2 -- ghc-options: -DLIBXML2 -- Hackage started complaining about this default-language: Haskell2010 benchmark xeno-memory-bench type: exitcode-stdio-1.0 hs-source-dirs: bench main-is: Memory.hs build-depends: base, xeno, weigh, bytestring, deepseq, hexml ghc-options: -Wall -threaded -O2 -rtsopts -with-rtsopts=-N default-language: Haskell2010 benchmark xeno-speed-big-files-bench type: exitcode-stdio-1.0 hs-source-dirs: bench main-is: SpeedBigFiles.hs build-depends: base, xeno, hexml, criterion, bytestring, deepseq, ghc-prim, xml, hexpat, bzlib, filepath if flag(libxml2) build-depends: libxml ghc-options: -Wall -O2 -rtsopts "-with-rtsopts=-H8G -AL1G -A256m -M25G" if flag(libxml2) cpp-options: -DLIBXML2 default-language: Haskell2010 benchmark xeno-bench type: exitcode-stdio-1.0 main-is: Bench.hs hs-source-dirs: app build-depends: base, xeno, weigh, bytestring, deepseq, hexml, bytestring-mmap, time ghc-options: -O2 -threaded -rtsopts "-with-rtsopts=-N" default-language: Haskell2010 xeno-0.6/README.md0000644000000000000000000001562214210366106012002 0ustar0000000000000000# xeno [![Github actions build status](https://img.shields.io/github/workflow/status/ocramz/xeno/Stack)](https://github.com/ocramz/xeno/actions) [![Hackage version](https://img.shields.io/hackage/v/xeno.svg?label=Hackage)](https://hackage.haskell.org/package/xeno) [![Stackage version](https://www.stackage.org/package/xeno/badge/lts?label=Stackage)](https://www.stackage.org/package/xeno) A fast event-based XML parser. [Blog post](http://chrisdone.com/posts/fast-haskell-c-parsing-xml). ## Features * SAX-style/fold parser which triggers events for open/close tags, attributes, text, etc. * Low memory use (see memory benchmarks below). * Very fast (see speed benchmarks below). * It [cheats like Hexml does](http://neilmitchell.blogspot.co.uk/2016/12/new-xml-parser-hexml.html) (doesn't expand entities, or most of the XML standard). * Written in pure Haskell. * CDATA is supported as of version 0.2. Please see the bottom of this file for guidelines on contributing to this library. ## Performance goals The [hexml](https://github.com/ndmitchell/hexml) Haskell library uses an XML parser written in C, so that is the baseline we're trying to beat or match roughly. ![Imgur](http://i.imgur.com/XgdZoQ9.png) The `Xeno.SAX` module is faster than Hexml for simply walking the document. Hexml actually does more work, allocating a DOM. `Xeno.DOM` is slighly slower or faster than Hexml depending on the document, although it is 2x slower on a 211KB document. Memory benchmarks for Xeno: Case Bytes GCs Check 4kb/xeno/sax 2,376 0 OK 31kb/xeno/sax 1,824 0 OK 211kb/xeno/sax 56,832 0 OK 4kb/xeno/dom 11,360 0 OK 31kb/xeno/dom 10,352 0 OK 211kb/xeno/dom 1,082,816 0 OK I memory benchmarked Hexml, but most of its allocation happens in C, which GHC doesn't track. So the data wasn't useful to compare. Speed benchmarks: benchmarking 4KB/hexml/dom time 6.317 μs (6.279 μs .. 6.354 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 6.333 μs (6.307 μs .. 6.362 μs) std dev 97.15 ns (77.15 ns .. 125.3 ns) variance introduced by outliers: 13% (moderately inflated) benchmarking 4KB/xeno/sax time 5.152 μs (5.131 μs .. 5.179 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 5.139 μs (5.128 μs .. 5.161 μs) std dev 58.02 ns (41.25 ns .. 85.41 ns) benchmarking 4KB/xeno/dom time 10.93 μs (10.83 μs .. 11.14 μs) 0.994 R² (0.983 R² .. 0.999 R²) mean 11.35 μs (11.12 μs .. 11.91 μs) std dev 1.188 μs (458.7 ns .. 2.148 μs) variance introduced by outliers: 87% (severely inflated) benchmarking 31KB/hexml/dom time 9.405 μs (9.348 μs .. 9.480 μs) 0.999 R² (0.998 R² .. 0.999 R²) mean 9.745 μs (9.599 μs .. 10.06 μs) std dev 745.3 ns (598.6 ns .. 902.4 ns) variance introduced by outliers: 78% (severely inflated) benchmarking 31KB/xeno/sax time 2.736 μs (2.723 μs .. 2.753 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 2.757 μs (2.742 μs .. 2.791 μs) std dev 76.93 ns (43.62 ns .. 136.1 ns) variance introduced by outliers: 35% (moderately inflated) benchmarking 31KB/xeno/dom time 5.767 μs (5.735 μs .. 5.814 μs) 0.999 R² (0.999 R² .. 1.000 R²) mean 5.759 μs (5.728 μs .. 5.810 μs) std dev 127.3 ns (79.02 ns .. 177.2 ns) variance introduced by outliers: 24% (moderately inflated) benchmarking 211KB/hexml/dom time 260.3 μs (259.8 μs .. 260.8 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 259.9 μs (259.7 μs .. 260.3 μs) std dev 959.9 ns (821.8 ns .. 1.178 μs) benchmarking 211KB/xeno/sax time 249.2 μs (248.5 μs .. 250.1 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 251.5 μs (250.6 μs .. 253.0 μs) std dev 3.944 μs (3.032 μs .. 5.345 μs) benchmarking 211KB/xeno/dom time 543.1 μs (539.4 μs .. 547.0 μs) 0.999 R² (0.999 R² .. 1.000 R²) mean 550.0 μs (545.3 μs .. 553.6 μs) std dev 14.39 μs (12.45 μs .. 17.12 μs) variance introduced by outliers: 17% (moderately inflated) ## DOM Example Easy as running the parse function: ``` haskell > parse "


hisuphi

" Right (Node "p" [("key", "val"), ("x", "foo"), ("k", "")] [ Element (Node "a" [] [Element (Node "hr" [] []), Text "hi"]) , Element (Node "b" [] [Text "sup"]) , Text "hi" ]) ``` ## SAX Example Quickly dumping XML: ``` haskell > let input = "TextHello, World!Content!Trailing." > dump input "Text" "Hello, World!" "Content!" "Trailing." ``` Folding over XML: ``` haskell > fold const (\m _ _ -> m + 1) const const const const 0 input -- Count attributes. Right 2 ``` ``` haskell > fold (\m _ -> m + 1) (\m _ _ -> m) const const const const 0 input -- Count elements. Right 3 ``` Most general XML processor: ``` haskell process :: Monad m => (ByteString -> m ()) -- ^ Open tag. -> (ByteString -> ByteString -> m ()) -- ^ Tag attribute. -> (ByteString -> m ()) -- ^ End open tag. -> (ByteString -> m ()) -- ^ Text. -> (ByteString -> m ()) -- ^ Close tag. -> ByteString -- ^ Input string. -> m () ``` You can use any monad you want. IO, State, etc. For example, `fold` is implemented like this: ``` haskell fold openF attrF endOpenF textF closeF s str = execState (process (\name -> modify (\s' -> openF s' name)) (\key value -> modify (\s' -> attrF s' key value)) (\name -> modify (\s' -> endOpenF s' name)) (\text -> modify (\s' -> textF s' text)) (\name -> modify (\s' -> closeF s' name)) str) s ``` The `process` is marked as INLINE, which means use-sites of it will inline, and your particular monad's type will be potentially erased for great performance. ## Contributors See CONTRIBUTORS.md ## Contribution guidelines All contributions and bug fixes are welcome and will be credited appropriately, as long as they are aligned with the goals of this library: speed and memory efficiency. In practical terms, patches and additional features should not introduce significant performance regressions. xeno-0.6/CHANGELOG.markdown0000644000000000000000000000315114303077203013550 0ustar0000000000000000 0.6 * GHC 9.4 compatibility : * get rid of MonadFail (ST s) instance use * mtl-2.3 compatibility 0.4.3 * Ensure we don't grow with a negative size in DOM parser (#48) * Flatten code nesting in process function (#45) * Introduce a whitespace-around-equals CPP flag (#44) * Use modify' instead of modify in fold (#42) 0.4.2 * all benchmarks marked as such in the Cabal file 0.4 * A number of optimizations and some changes in ergonomics. Thanks to Dmitry Krylov (dmalkr) and Michal Gajda (mgajda) ! * breaking API changes : * The parameters to function 'Xeno.SAX.process' are now wrapped in a Process type * Speed optimizations : * function 'Xeno.DOM.predictGrowSize' * Xeno.DOM.Robust * Benchmark improvements : * Added benchmarks for ByteStringZeroTerminated * Added benchmarks for big files (bench/SpeedBigFiles.hs) * Benchmarks run non-threaded 0.3.5.2 * Fix dependency lower bounds (GHC 8.0.1 is the earliest version currently supported) 0.3.5 * Improve error handling (#24 #26, mgajda) 0.3.4 * Fixed #14 and add test for #15 * Fixed typos in the examples (unhammer) 0.3.2 * Fixed DOM parsing from bystrings with non-zero offset (#11, qrilka) 0.3 * Fixed name parsing (for attributes and tags) so it conforms with the XML spec (qrilka) * Fixed parsing failure when root tag is preceded by white space (though without checking for white space characters specifically) (qrilka) * Added contribution guidelines (ocramz) 0.2 * Added CDATA support (Rembane) 0.1 * First Hackage release xeno-0.6/CONTRIBUTORS.md0000644000000000000000000000035014124710313012767 0ustar0000000000000000# Contributors Author : Chris Done ( chrisdone ) Maintainer : Marco Zocca ( ocramz ) ## In chronological order : Andreas Ekeroot ( Rembane ) Kirill Zaborsky ( qrilka ) Kevin Brubeck Unhammer ( unhammer ) Michal Gajda (mgajda)