html-conduit-1.3.2/src/0000755000000000000000000000000013313644714013107 5ustar0000000000000000html-conduit-1.3.2/src/Text/0000755000000000000000000000000013313644714014033 5ustar0000000000000000html-conduit-1.3.2/src/Text/HTML/0000755000000000000000000000000013362660273014601 5ustar0000000000000000html-conduit-1.3.2/test/0000755000000000000000000000000013362660273013301 5ustar0000000000000000html-conduit-1.3.2/src/Text/HTML/DOM.hs0000644000000000000000000001065313313644714015557 0ustar0000000000000000{-# LANGUAGE OverloadedStrings #-} module Text.HTML.DOM ( eventConduit , sinkDoc , readFile , parseLBS , parseBSChunks , eventConduitText , sinkDocText , parseLT , parseSTChunks ) where import Control.Monad.Trans.Resource import Prelude hiding (readFile) import qualified Data.ByteString as S import qualified Text.HTML.TagStream as TS import qualified Data.XML.Types as XT import qualified Data.Text as T import qualified Data.Text.Lazy as TL import qualified Data.Conduit.List as CL import Control.Arrow ((***)) import qualified Data.Set as Set import qualified Text.XML as X import Conduit import qualified Data.ByteString.Lazy as L import Data.Maybe (mapMaybe) import qualified Data.Map.Strict as Map -- | Converts a stream of bytes to a stream of properly balanced @Event@s. -- -- Note that there may be multiple (or not) root elements. @sinkDoc@ addresses -- that case. eventConduit :: Monad m => ConduitT S.ByteString XT.Event m () eventConduit = decodeUtf8LenientC .| eventConduit' eventConduitText :: Monad m => ConduitT T.Text XT.Event m () eventConduitText = eventConduit' eventConduit' :: Monad m => ConduitT T.Text XT.Event m () eventConduit' = TS.tokenStream .| go [] where go stack = do mx <- await case mx of Nothing -> closeStack stack -- Ignore processing instructions (or pseudo-instructions) Just (TS.TagOpen local _ _) | "?" `T.isPrefixOf` local -> go stack Just (TS.TagOpen local attrs isClosed) -> do let name = toName local attrs' = map (toName *** return . XT.ContentText) $ Map.toList attrs yield $ XT.EventBeginElement name attrs' if isClosed || isVoid local then yield (XT.EventEndElement name) >> go stack else go $ name : stack Just (TS.TagClose name) | toName name `elem` stack -> let loop [] = go [] loop (n:ns) = do yield $ XT.EventEndElement n if n == toName name then go ns else loop ns in loop stack | otherwise -> go stack Just (TS.Text t) -> do yield $ XT.EventContent $ XT.ContentText t go stack Just (TS.Comment t) -> do yield $ XT.EventComment t go stack Just TS.Special{} -> go stack Just TS.Incomplete{} -> go stack toName l = XT.Name l Nothing Nothing closeStack = mapM_ (yield . XT.EventEndElement) isVoid = flip Set.member $ Set.fromList [ "area" , "base" , "br" , "col" , "command" , "embed" , "hr" , "img" , "input" , "keygen" , "link" , "meta" , "param" , "source" , "track" , "wbr" ] sinkDoc :: MonadThrow m => ConduitT S.ByteString o m X.Document sinkDoc = sinkDoc' eventConduit sinkDocText :: MonadThrow m => ConduitT T.Text o m X.Document sinkDocText = sinkDoc' eventConduitText sinkDoc' :: MonadThrow m => ConduitT a XT.Event m () -> ConduitT a o m X.Document sinkDoc' f = fmap stripDummy $ mapOutput ((,) Nothing) f .| addDummyWrapper .| X.fromEvents where addDummyWrapper = do yield (Nothing, XT.EventBeginElement "html" []) awaitForever yield yield (Nothing, XT.EventEndElement "html") stripDummy doc@(X.Document pro (X.Element _ _ nodes) epi) = case mapMaybe toElement nodes of [root] -> X.Document pro root epi _ -> doc toElement (X.NodeElement e) = Just e toElement _ = Nothing readFile :: FilePath -> IO X.Document readFile fp = withSourceFile fp $ \src -> runConduit $ src .| sinkDoc parseLBS :: L.ByteString -> X.Document parseLBS = parseBSChunks . L.toChunks parseBSChunks :: [S.ByteString] -> X.Document parseBSChunks tss = case runConduit $ CL.sourceList tss .| sinkDoc of Left e -> error $ "Unexpected exception in parseBSChunks: " ++ show e Right x -> x parseLT :: TL.Text -> X.Document parseLT = parseSTChunks . TL.toChunks parseSTChunks :: [T.Text] -> X.Document parseSTChunks tss = case runConduit $ CL.sourceList tss .| sinkDocText of Left e -> error $ "Unexpected exception in parseSTChunks: " ++ show e Right x -> x html-conduit-1.3.2/src/Text/HTML/TagStream.hs0000644000000000000000000002260013362660273017024 0ustar0000000000000000{-# LANGUAGE OverloadedStrings, TupleSections, ViewPatterns #-} {-# LANGUAGE TypeFamilies #-} {-# LANGUAGE RecordWildCards #-} module Text.HTML.TagStream ( Token (..) , tokenStream ) where import Control.Applicative import Control.Monad (unless) import Control.Monad.Trans.Resource (MonadThrow) import Data.Char import qualified Data.Conduit.List as CL import Data.Attoparsec.Text import Data.Conduit import qualified Data.Conduit.Attoparsec as CA import Data.Maybe (fromMaybe, isJust) import Data.Monoid ((<>)) import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Lazy as L import qualified Data.Text.Lazy.Builder as B import qualified Text.XML.Stream.Parse as XML import Data.Map (Map) import qualified Data.Map.Strict as Map import Control.Arrow (first) data Token = TagOpen Text (Map Text Text) Bool | TagClose Text | Text Text | Comment Text | Special Text Text | Incomplete Text deriving (Eq, Show) data TagType = TagTypeClose | TagTypeSpecial | TagTypeNormal {-- - match quoted string, can fail. -} quoted :: Char -> Parser Text quoted q = T.append <$> takeTill (in2 ('\\',q)) <*> ( char q *> pure "" <|> char '\\' *> atLeast 1 (quoted q) ) quotedOr :: Parser Text -> Parser Text quotedOr p = maybeP (satisfy (in2 ('"','\''))) >>= maybe p quoted {-- - attribute value, can't fail. -} attrValue :: Parser Text attrValue = quotedOr $ takeTill ((=='>') ||. isSpace) {-- - attribute name, at least one char, can fail when meet tag end. - might match self-close tag end "/>" , make sure match `tagEnd' first. -} attrName :: Parser Text attrName = quotedOr $ T.cons <$> satisfy (/='>') <*> takeTill (in3 ('/','>','=') ||. isSpace) {-- - tag end, return self-close or not, can fail. -} tagEnd :: Parser Bool tagEnd = char '>' *> pure False <|> string "/>" *> pure True {-- - attribute pair or tag end, can fail if tag end met. -} attr :: Parser (Text, Text) attr = (,) <$> attrName <* skipSpace <*> ( boolP (char '=') >>= cond (skipSpace *> attrValue) (pure "") ) {-- - all attributes before tag end. can't fail. -} attrs :: Parser (Map Text Text, Bool) attrs = loop Map.empty where loop acc = skipSpace *> (Left <$> tagEnd <|> Right <$> attr) >>= either (return . (acc,)) (\(key, value) -> loop $ Map.insert key value acc) {-- - comment tag without prefix. -} comment :: Parser Token comment = Comment <$> comment' where comment' = T.append <$> takeTill (=='-') <*> ( string "-->" *> return "" <|> atLeast 1 comment' ) {-- - tags begine with -} special :: Parser Token special = Special <$> ( T.cons <$> satisfy (not . ((=='-') ||. isSpace)) <*> takeTill ((=='>') ||. isSpace) <* skipSpace ) <*> takeTill (=='>') <* char '>' {-- - parse a tag, can fail. -} tag :: Parser Token tag = do t <- string "/" *> return TagTypeClose <|> string "!" *> return TagTypeSpecial <|> return TagTypeNormal case t of TagTypeClose -> TagClose <$> takeTill (=='>') <* char '>' TagTypeSpecial -> boolP (string "--") >>= cond comment special TagTypeNormal -> do name <- takeTill (in3 ('<','>','/') ||. isSpace) (as, close) <- attrs return $ TagOpen name (Map.map decodeString as) close {-- - record incomplete tag for streamline processing. -} incomplete :: Parser Token incomplete = Incomplete . T.cons '<' <$> takeText {-- - parse text node. consume at least one char, to make sure progress. -} text :: Parser Token text = Text <$> atLeast 1 (takeTill (=='<')) decodeEntity :: MonadThrow m => Text -> m Text decodeEntity entity = runConduit $ CL.sourceList ["&",entity,";"] .| XML.parseText' XML.def { XML.psDecodeEntities = XML.decodeHtmlEntities } .| XML.content token :: Parser Token token = char '<' *> (tag <|> incomplete) <|> text {-- - treat script tag specially, can't fail. -} tillScriptEnd :: Token -> Parser [Token] tillScriptEnd open = loop mempty where loop acc = do chunk <- takeTill (== '<') let acc' = acc <> B.fromText chunk finish = pure [open, Text $ L.toStrict $ B.toLazyText acc', TagClose "script"] hasContent = (string "/script>" *> finish) <|> loop (acc' <> "<") (char '<' *> hasContent) <|> finish tokens :: Parser [Token] tokens = do t <- token case t of TagOpen "script" _ False -> tillScriptEnd t Text text0 -> do let parseText = do Text text <- token pure text texts <- many parseText pure [Text $ decodeString $ T.concat $ text0 : texts] _ -> pure [t] {-- - Utils {{{ -} atLeast :: Int -> Parser Text -> Parser Text atLeast 0 p = p atLeast n p = T.cons <$> anyChar <*> atLeast (n-1) p cond :: a -> a -> Bool -> a cond a1 a2 b = if b then a1 else a2 (||.) :: Applicative f => f Bool -> f Bool -> f Bool (||.) = liftA2 (||) in2 :: Eq a => (a,a) -> a -> Bool in2 (a1,a2) a = a==a1 || a==a2 in3 :: Eq a => (a,a,a) -> a -> Bool in3 (a1,a2,a3) a = a==a1 || a==a2 || a==a3 boolP :: Parser a -> Parser Bool boolP p = p *> pure True <|> pure False maybeP :: Parser a -> Parser (Maybe a) maybeP p = Just <$> p <|> return Nothing -- }}} -- {{{ Stream tokenStream :: Monad m => ConduitT Text Token m () tokenStream = CL.filter (not . T.null) .| CA.conduitParserEither tokens .| CL.concatMap go where go (Left e) = error $ "html-conduit: parse error that should never happen occurred! " ++ show e go (Right (_, tokens')) = tokens' splitAccum :: [Token] -> (Text, [Token]) splitAccum [] = (mempty, []) splitAccum (reverse -> (Incomplete s : xs)) = (s, reverse xs) splitAccum tokens = (mempty, tokens) -- Entities -- | A conduit to decode entities from a stream of tokens into a new stream of tokens. decodeEntities :: Monad m => ConduitT Token Token m () decodeEntities = start where start = await >>= maybe (return ()) (\token' -> start' token' >> start) start' (Text t) = (yield t >> yieldWhileText) .| decodeEntities' .| CL.mapMaybe go start' (TagOpen name attrs' bool) = yield (TagOpen name (Map.map decodeString attrs') bool) start' token' = yield token' go t | t == "" = Nothing | otherwise = Just (Text t) -- | Decode entities in a complete string. decodeString :: Text -> Text decodeString input = case makeEntityDecoder input of (value', remainder) | value' /= mempty -> value' <> decodeString remainder | otherwise -> input decodeEntities' :: Monad m => ConduitT Text Text m () decodeEntities' = loop id where loop accum = do mchunk <- await let chunk = accum $ fromMaybe mempty mchunk (newStr, remainder) = makeEntityDecoder chunk yield newStr if isJust mchunk then loop (mappend remainder) else yield remainder -- | Yield contiguous text tokens as strings. yieldWhileText :: Monad m => ConduitT Token Text m () yieldWhileText = loop where loop = await >>= maybe (return ()) go go (Text t) = yield t >> loop go token' = leftover token' -- | Decode the entities in a string type with a decoder. makeEntityDecoder :: Text -> (Text, Text) makeEntityDecoder = first (L.toStrict . B.toLazyText) . go where go s = case T.break (=='&') s of (_,"") -> (B.fromText s, "") (before,restPlusAmp@(T.drop 1 -> rest)) -> case T.break (not . (\c -> isNameChar c || c == '#')) rest of (_,"") -> (B.fromText before, restPlusAmp) (entity,after) -> (before1 <> before2, after') where before1 = B.fromText before (before2, after') = case mdecoded of Nothing -> first (("&" <> B.fromText entity) <>) (go after) Just (B.fromText -> decoded) -> case T.uncons after of Just (';',validAfter) -> first (decoded <>) (go validAfter) Just (_invalid,_rest) -> first (decoded <>) (go after) Nothing -> (mempty, s) mdecoded = if entity == mempty then Nothing else decodeEntity entity -- | Is the character a valid Name starter? isNameStart :: Char -> Bool isNameStart c = c == ':' || c == '_' || isAsciiUpper c || isAsciiLower c || (c >= '\xC0' && c <= '\xD6') || (c >= '\xD8' && c <= '\xF6') || (c >= '\xF8' && c <= '\x2FF') || (c >= '\x370' && c <= '\x37D') || (c >= '\x37F' && c <= '\x1FFF') || (c >= '\x200C' && c <= '\x200D') || (c >= '\x2070' && c <= '\x218F') || (c >= '\x2C00' && c <= '\x2FEF') || (c >= '\x3001' && c <= '\xD7FF') || (c >= '\xF900' && c <= '\xFDCF') || (c >= '\xFDF0' && c <= '\xFFFD') || (c >= '\x10000' && c <= '\xEFFFF') -- | Is the character valid in a Name? isNameChar :: Char -> Bool isNameChar c = c == '-' || c == '.' || c == '\xB7' || isDigit c || isNameStart c || (c >= '\x0300' && c <= '\x036F') || (c >= '\x203F' && c <= '\x2040') html-conduit-1.3.2/test/main.hs0000644000000000000000000001770213362660273014570 0ustar0000000000000000{-# LANGUAGE OverloadedStrings #-} import Test.HUnit hiding (Test) import Test.Hspec import Test.Hspec.QuickCheck import Data.ByteString.Lazy.Char8 () import qualified Text.HTML.DOM as H import qualified Text.XML as X import qualified Data.Map as Map import qualified Data.Text as T import Control.Exception (evaluate) import Control.DeepSeq (($!!)) import Control.Monad (void) main :: IO () main = hspec $ do describe "parses" $ do it "well-formed document" $ X.parseLBS_ X.def "baz" @=? H.parseLBS "baz" it "adds missing close tags" $ X.parseLBS_ X.def "baz" @=? H.parseLBS "baz" it "void tags" $ X.parseLBS_ X.def "foo" @=? H.parseLBS "foo" it "xml entities" $ X.parseLBS_ X.def "baz>" @=? H.parseLBS "baz>" it "html entities" $ X.parseLBS_ X.def "baz " @=? H.parseLBS "baz " it "decimal entities" $ X.parseLBS_ X.def "baz " @=? H.parseLBS "baz " it "hex entities" $ X.parseLBS_ X.def "bazŠ" @=? H.parseLBS "bazŠ" it "invalid entities" $ X.parseLBS_ X.def "baz&foobar;" @=? H.parseLBS "baz&foobar;" it "multiple root elements" $ X.parseLBS_ X.def "baz&foobar;" @=? H.parseLBS "baz&foobar;" it "doesn't strip whitespace" $ X.parseLBS_ X.def " hello" @=? H.parseLBS " hello" it "split code-points" $ X.parseLBS_ X.def " " @=? H.parseBSChunks ["\xc2", "\xa0"] it "latin1 codes" $ X.parseText_ X.def "\232" @=? H.parseSTChunks ["\232"] it "latin1 codes strict vs lazy" $ H.parseLT "\232" @=? H.parseSTChunks ["\232"] describe "HTML parsing" $ do it "XHTML" $ let html = "foo

Hello World

" doc = X.Document (X.Prologue [] Nothing []) root [] root = X.Element "html" Map.empty [ X.NodeElement $ X.Element "head" Map.empty [ X.NodeElement $ X.Element "title" Map.empty [X.NodeContent "foo"] ] , X.NodeElement $ X.Element "body" Map.empty [ X.NodeElement $ X.Element "p" Map.empty [X.NodeContent "Hello World"] ] ] in H.parseLBS html @?= doc it "XHTML with doctype and \n\nfoo

Hello World

" doc = X.Document (X.Prologue [] Nothing []) root [] root = X.Element "html" Map.empty [ X.NodeElement $ X.Element "head" Map.empty [ X.NodeElement $ X.Element "title" Map.empty [X.NodeContent "foo"] ] , X.NodeElement $ X.Element "body" Map.empty [ X.NodeElement $ X.Element "p" Map.empty [X.NodeContent "Hello World"] ] ] in H.parseLBS html @?= doc it "HTML" $ let html = "foo

Hello World

" doc = X.Document (X.Prologue [] Nothing []) root [] root = X.Element "html" Map.empty [ X.NodeElement $ X.Element "head" Map.empty [ X.NodeElement $ X.Element "title" Map.empty [X.NodeContent "foo"] ] , X.NodeElement $ X.Element "body" Map.empty [ X.NodeElement $ X.Element "br" Map.empty [] , X.NodeElement $ X.Element "p" Map.empty [X.NodeContent "Hello World"] ] ] in H.parseLBS html @?= doc it "doesn't double unescape" $ let html = "

Hello &gt; World

" doc = X.Document (X.Prologue [] Nothing []) root [] root = X.Element "p" Map.empty [ X.NodeContent "Hello > World" ] in H.parseLBS html @?= doc it "handles entities in attributes" $ let html = "
" doc = X.Document (X.Prologue [] Nothing []) root [] root = X.Element "br" (Map.singleton "title" "Mac & Cheese") [] in H.parseLBS html @?= doc it "doesn't double escape entities in attributes" $ let html = "
" doc = X.Document (X.Prologue [] Nothing []) root [] root = X.Element "br" (Map.singleton "title" "Mac & Cheese") [] in H.parseLBS html @?= doc describe "script tags" $ do it "ignores funny characters" $ let html = "" doc = X.Document (X.Prologue [] Nothing []) root [] root = X.Element "script" Map.empty [X.NodeContent "hello <> world"] in H.parseLBS html @?= doc {- Would be nice... doesn't work with tagstream-conduit original code. Not even sure if the HTML5 parser spec discusses this case. it "ignores inside string" $ let html = "\" world" doc = X.Document (X.Prologue [] Nothing []) root [] root = X.Element "script" Map.empty [X.NodeContent "hello \"\" world"] in H.parseLBS html @?= doc -} it "unterminated" $ let html = "