html-conduit-1.3.2.2/src/ 0000755 0000000 0000000 00000000000 14106362034 013240 5 ustar 00 0000000 0000000 html-conduit-1.3.2.2/src/Text/ 0000755 0000000 0000000 00000000000 14106362034 014164 5 ustar 00 0000000 0000000 html-conduit-1.3.2.2/src/Text/HTML/ 0000755 0000000 0000000 00000000000 14106362600 014727 5 ustar 00 0000000 0000000 html-conduit-1.3.2.2/test/ 0000755 0000000 0000000 00000000000 14106362034 013430 5 ustar 00 0000000 0000000 html-conduit-1.3.2.2/src/Text/HTML/DOM.hs 0000644 0000000 0000000 00000010747 14106362600 015713 0 ustar 00 0000000 0000000 {-# LANGUAGE OverloadedStrings #-}
module Text.HTML.DOM
( eventConduit
, sinkDoc
, readFile
, parseLBS
, parseBSChunks
, eventConduitText
, sinkDocText
, parseLT
, parseSTChunks
) where
import Control.Monad.Trans.Resource
import Prelude hiding (readFile)
import qualified Data.ByteString as S
import qualified Text.HTML.TagStream as TS
import qualified Data.XML.Types as XT
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Conduit.List as CL
import Control.Arrow ((***))
import qualified Data.Set as Set
import qualified Text.XML as X
import Conduit
import qualified Data.ByteString.Lazy as L
import Data.Maybe (mapMaybe)
import qualified Data.Map.Strict as Map
-- | Converts a stream of bytes to a stream of properly balanced @Event@s.
--
-- Note that there may be multiple (or not) root elements. @sinkDoc@ addresses
-- that case.
eventConduit :: Monad m => ConduitT S.ByteString XT.Event m ()
eventConduit = decodeUtf8LenientC .| eventConduit'
eventConduitText :: Monad m => ConduitT T.Text XT.Event m ()
eventConduitText = eventConduit'
eventConduit' :: Monad m => ConduitT T.Text XT.Event m ()
eventConduit' =
TS.tokenStream .| go []
where
go stack = do
mx <- await
case mx of
Nothing -> closeStack stack
-- Ignore processing instructions (or pseudo-instructions)
Just (TS.TagOpen local _ _) | "?" `T.isPrefixOf` local -> go stack
Just (TS.TagOpen local attrs isClosed) -> do
let name = toName local
attrs' = map (toName *** return . XT.ContentText) $ Map.toList attrs
yield $ XT.EventBeginElement name attrs'
if isClosed || isVoid local
then yield (XT.EventEndElement name) >> go stack
else go $ name : stack
Just (TS.TagClose name)
| toName name `elem` stack ->
let loop [] = go []
loop (n:ns) = do
yield $ XT.EventEndElement n
if n == toName name
then go ns
else loop ns
in loop stack
| otherwise -> go stack
Just (TS.Text t) -> do
yield $ XT.EventContent $ XT.ContentText t
go stack
Just (TS.Comment t) -> do
yield $ XT.EventComment t
go stack
Just TS.Special{} -> go stack
Just TS.Incomplete{} -> go stack
toName l = XT.Name l Nothing Nothing
closeStack = mapM_ (yield . XT.EventEndElement)
isVoid name = Set.member (T.toLower name) voidSet
voidSet :: Set.Set T.Text
voidSet = Set.fromList
[ "area"
, "base"
, "br"
, "col"
, "command"
, "embed"
, "hr"
, "img"
, "input"
, "keygen"
, "link"
, "meta"
, "param"
, "source"
, "track"
, "wbr"
]
sinkDoc :: MonadThrow m => ConduitT S.ByteString o m X.Document
sinkDoc = sinkDoc' eventConduit
sinkDocText :: MonadThrow m => ConduitT T.Text o m X.Document
sinkDocText = sinkDoc' eventConduitText
sinkDoc'
:: MonadThrow m
=> ConduitT a XT.Event m ()
-> ConduitT a o m X.Document
sinkDoc' f =
fmap stripDummy $ mapOutput ((,) Nothing) f .| addDummyWrapper .| X.fromEvents
where
addDummyWrapper = do
yield (Nothing, XT.EventBeginElement "html" [])
awaitForever yield
yield (Nothing, XT.EventEndElement "html")
stripDummy doc@(X.Document pro (X.Element _ _ nodes) epi) =
case mapMaybe toElement nodes of
[root] -> X.Document pro root epi
_ -> doc
toElement (X.NodeElement e) = Just e
toElement _ = Nothing
readFile :: FilePath -> IO X.Document
readFile fp = withSourceFile fp $ \src -> runConduit $ src .| sinkDoc
parseLBS :: L.ByteString -> X.Document
parseLBS = parseBSChunks . L.toChunks
parseBSChunks :: [S.ByteString] -> X.Document
parseBSChunks tss =
case runConduit $ CL.sourceList tss .| sinkDoc of
Left e -> error $ "Unexpected exception in parseBSChunks: " ++ show e
Right x -> x
parseLT :: TL.Text -> X.Document
parseLT = parseSTChunks . TL.toChunks
parseSTChunks :: [T.Text] -> X.Document
parseSTChunks tss =
case runConduit $ CL.sourceList tss .| sinkDocText of
Left e -> error $ "Unexpected exception in parseSTChunks: " ++ show e
Right x -> x
html-conduit-1.3.2.2/src/Text/HTML/TagStream.hs 0000644 0000000 0000000 00000023032 14106362034 017153 0 ustar 00 0000000 0000000 {-# LANGUAGE OverloadedStrings, TupleSections, ViewPatterns #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE CPP #-}
module Text.HTML.TagStream
( Token (..)
, tokenStream
) where
import Control.Applicative
import Control.Monad (unless)
import Control.Monad.Trans.Resource (MonadThrow)
import Data.Char
import qualified Data.Conduit.List as CL
import Data.Attoparsec.Text
import Data.Conduit
import qualified Data.Conduit.Attoparsec as CA
import Data.Maybe (fromMaybe, isJust)
import Data.Monoid ((<>))
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.Builder as B
import qualified Text.XML.Stream.Parse as XML
import Data.Map (Map)
import qualified Data.Map.Strict as Map
import Control.Arrow (first)
data Token
= TagOpen Text (Map Text Text) Bool
| TagClose Text
| Text Text
| Comment Text
| Special Text Text
| Incomplete Text
deriving (Eq, Show)
data TagType
= TagTypeClose
| TagTypeSpecial
| TagTypeNormal
{--
- match quoted string, can fail.
-}
quoted :: Char -> Parser Text
quoted q = T.append <$> takeTill (in2 ('\\',q))
<*> ( char q *> pure ""
<|> char '\\' *> atLeast 1 (quoted q) )
quotedOr :: Parser Text -> Parser Text
quotedOr p = maybeP (satisfy (in2 ('"','\''))) >>=
maybe p quoted
{--
- attribute value, can't fail.
-}
attrValue :: Parser Text
attrValue = quotedOr $ takeTill ((=='>') ||. isSpace)
{--
- attribute name, at least one char, can fail when meet tag end.
- might match self-close tag end "/>" , make sure match `tagEnd' first.
-}
attrName :: Parser Text
attrName = quotedOr $
T.cons <$> satisfy (/='>')
<*> takeTill (in3 ('/','>','=') ||. isSpace)
{--
- tag end, return self-close or not, can fail.
-}
tagEnd :: Parser Bool
tagEnd = char '>' *> pure False
<|> string "/>" *> pure True
{--
- attribute pair or tag end, can fail if tag end met.
-}
attr :: Parser (Text, Text)
attr = (,) <$> attrName <* skipSpace
<*> ( boolP (char '=') >>=
cond (skipSpace *> attrValue)
(pure "")
)
{--
- all attributes before tag end. can't fail.
-}
attrs :: Parser (Map Text Text, Bool)
attrs = loop Map.empty
where
loop acc = skipSpace *> (Left <$> tagEnd <|> Right <$> attr) >>=
either
(return . (acc,))
(\(key, value) -> loop $ Map.insert key value acc)
{--
- comment tag without prefix.
-}
comment :: Parser Token
comment = Comment <$> comment'
where comment' = T.append <$> takeTill (=='-')
<*> ( string "-->" *> return ""
<|> atLeast 1 comment' )
{--
- tags begine with
-}
special :: Parser Token
special = Special
<$> ( T.cons <$> satisfy (not . ((=='-') ||. isSpace))
<*> takeTill ((=='>') ||. isSpace)
<* skipSpace )
<*> takeTill (=='>') <* char '>'
{--
- parse a tag, can fail.
-}
tag :: Parser Token
tag = do
t <- string "/" *> return TagTypeClose
<|> string "!" *> return TagTypeSpecial
<|> return TagTypeNormal
case t of
TagTypeClose ->
TagClose <$> takeTill (=='>')
<* char '>'
TagTypeSpecial -> boolP (string "--") >>=
cond comment special
TagTypeNormal -> do
name <- takeTill (in3 ('<','>','/') ||. isSpace)
(as, close) <- attrs
return $ TagOpen name (Map.map decodeString as) close
{--
- record incomplete tag for streamline processing.
-}
incomplete :: Parser Token
incomplete = Incomplete . T.cons '<' <$> takeText
{--
- parse text node. consume at least one char, to make sure progress.
-}
text :: Parser Token
text = Text <$> atLeast 1 (takeTill (=='<'))
decodeEntity :: MonadThrow m => Text -> m Text
decodeEntity entity =
runConduit
$ CL.sourceList ["&",entity,";"]
#if MIN_VERSION_xml_conduit(1,9,0)
.| XML.parseText XML.def { XML.psDecodeEntities = XML.decodeHtmlEntities }
#else
.| XML.parseText' XML.def { XML.psDecodeEntities = XML.decodeHtmlEntities }
#endif
.| XML.content
token :: Parser Token
token = char '<' *> (tag <|> incomplete)
<|> text
{--
- treat script tag specially, can't fail.
-}
tillScriptEnd :: Token -> Parser [Token]
tillScriptEnd open =
loop mempty
where
loop acc = do
chunk <- takeTill (== '<')
let acc' = acc <> B.fromText chunk
finish = pure [open, Text $ L.toStrict $ B.toLazyText acc', TagClose "script"]
hasContent = (string "/script>" *> finish) <|> loop (acc' <> "<")
(char '<' *> hasContent) <|> finish
tokens :: Parser [Token]
tokens = do
t <- token
case t of
TagOpen "script" _ False -> tillScriptEnd t
Text text0 -> do
let parseText = do
Text text <- token
pure text
texts <- many parseText
pure [Text $ decodeString $ T.concat $ text0 : texts]
_ -> pure [t]
{--
- Utils {{{
-}
atLeast :: Int -> Parser Text -> Parser Text
atLeast 0 p = p
atLeast n p = T.cons <$> anyChar <*> atLeast (n-1) p
cond :: a -> a -> Bool -> a
cond a1 a2 b = if b then a1 else a2
(||.) :: Applicative f => f Bool -> f Bool -> f Bool
(||.) = liftA2 (||)
in2 :: Eq a => (a,a) -> a -> Bool
in2 (a1,a2) a = a==a1 || a==a2
in3 :: Eq a => (a,a,a) -> a -> Bool
in3 (a1,a2,a3) a = a==a1 || a==a2 || a==a3
boolP :: Parser a -> Parser Bool
boolP p = p *> pure True <|> pure False
maybeP :: Parser a -> Parser (Maybe a)
maybeP p = Just <$> p <|> return Nothing
-- }}}
-- {{{ Stream
tokenStream :: Monad m
=> ConduitT Text Token m ()
tokenStream =
CL.filter (not . T.null) .| CA.conduitParserEither tokens .| CL.concatMap go
where
go (Left e) = error $ "html-conduit: parse error that should never happen occurred! " ++ show e
go (Right (_, tokens')) = tokens'
splitAccum :: [Token] -> (Text, [Token])
splitAccum [] = (mempty, [])
splitAccum (reverse -> (Incomplete s : xs)) = (s, reverse xs)
splitAccum tokens = (mempty, tokens)
-- Entities
-- | A conduit to decode entities from a stream of tokens into a new stream of tokens.
decodeEntities :: Monad m => ConduitT Token Token m ()
decodeEntities =
start
where
start = await >>= maybe (return ()) (\token' -> start' token' >> start)
start' (Text t) = (yield t >> yieldWhileText) .| decodeEntities' .| CL.mapMaybe go
start' (TagOpen name attrs' bool) = yield (TagOpen name (Map.map decodeString attrs') bool)
start' token' = yield token'
go t
| t == "" = Nothing
| otherwise = Just (Text t)
-- | Decode entities in a complete string.
decodeString :: Text -> Text
decodeString input =
case makeEntityDecoder input of
(value', remainder)
| value' /= mempty -> value' <> decodeString remainder
| otherwise -> input
decodeEntities' :: Monad m => ConduitT Text Text m ()
decodeEntities' =
loop id
where
loop accum = do
mchunk <- await
let chunk = accum $ fromMaybe mempty mchunk
(newStr, remainder) = makeEntityDecoder chunk
yield newStr
if isJust mchunk
then loop (mappend remainder)
else yield remainder
-- | Yield contiguous text tokens as strings.
yieldWhileText :: Monad m => ConduitT Token Text m ()
yieldWhileText =
loop
where
loop = await >>= maybe (return ()) go
go (Text t) = yield t >> loop
go token' = leftover token'
-- | Decode the entities in a string type with a decoder.
makeEntityDecoder :: Text -> (Text, Text)
makeEntityDecoder = first (L.toStrict . B.toLazyText) . go
where
go s =
case T.break (=='&') s of
(_,"") -> (B.fromText s, "")
(before,restPlusAmp@(T.drop 1 -> rest)) ->
case T.break (not . (\c -> isNameChar c || c == '#')) rest of
(_,"") -> (B.fromText before, restPlusAmp)
(entity,after) -> (before1 <> before2, after')
where
before1 = B.fromText before
(before2, after') =
case mdecoded of
Nothing -> first (("&" <> B.fromText entity) <>) (go after)
Just (B.fromText -> decoded) ->
case T.uncons after of
Just (';',validAfter) -> first (decoded <>) (go validAfter)
Just (_invalid,_rest) -> first (decoded <>) (go after)
Nothing -> (mempty, s)
mdecoded =
if entity == mempty
then Nothing
else decodeEntity entity
-- | Is the character a valid Name starter?
isNameStart :: Char -> Bool
isNameStart c =
c == ':' ||
c == '_' ||
isAsciiUpper c ||
isAsciiLower c ||
(c >= '\xC0' && c <= '\xD6') ||
(c >= '\xD8' && c <= '\xF6') ||
(c >= '\xF8' && c <= '\x2FF') ||
(c >= '\x370' && c <= '\x37D') ||
(c >= '\x37F' && c <= '\x1FFF') ||
(c >= '\x200C' && c <= '\x200D') ||
(c >= '\x2070' && c <= '\x218F') ||
(c >= '\x2C00' && c <= '\x2FEF') ||
(c >= '\x3001' && c <= '\xD7FF') ||
(c >= '\xF900' && c <= '\xFDCF') ||
(c >= '\xFDF0' && c <= '\xFFFD') ||
(c >= '\x10000' && c <= '\xEFFFF')
-- | Is the character valid in a Name?
isNameChar :: Char -> Bool
isNameChar c =
c == '-' ||
c == '.' ||
c == '\xB7' ||
isDigit c ||
isNameStart c ||
(c >= '\x0300' && c <= '\x036F') ||
(c >= '\x203F' && c <= '\x2040')
html-conduit-1.3.2.2/test/main.hs 0000644 0000000 0000000 00000021556 14106362460 014724 0 ustar 00 0000000 0000000 {-# LANGUAGE OverloadedStrings #-}
import Test.HUnit hiding (Test)
import Test.Hspec
import Test.Hspec.QuickCheck
import Data.ByteString.Lazy.Char8 ()
import qualified Text.HTML.DOM as H
import qualified Text.XML as X
import qualified Data.Map as Map
import qualified Data.Text as T
import Control.Exception (evaluate)
import Control.DeepSeq (($!!))
import Control.Monad (void)
main :: IO ()
main = hspec $ do
describe "parses" $ do
it "well-formed document" $
X.parseLBS_ X.def "baz" @=?
H.parseLBS "baz"
it "adds missing close tags" $
X.parseLBS_ X.def "baz" @=?
H.parseLBS "baz"
it "void tags" $
X.parseLBS_ X.def "
foo" @=?
H.parseLBS "
foo"
it "xml entities" $
X.parseLBS_ X.def "baz>" @=?
H.parseLBS "baz>"
it "html entities" $
X.parseLBS_ X.def "baz " @=?
H.parseLBS "baz "
it "decimal entities" $
X.parseLBS_ X.def "baz " @=?
H.parseLBS "baz "
it "hex entities" $
X.parseLBS_ X.def "bazŠ" @=?
H.parseLBS "bazŠ"
it "invalid entities" $
X.parseLBS_ X.def "baz&foobar;" @=?
H.parseLBS "baz&foobar;"
it "multiple root elements" $
X.parseLBS_ X.def "baz&foobar;" @=?
H.parseLBS "baz&foobar;"
it "doesn't strip whitespace" $
X.parseLBS_ X.def " hello" @=?
H.parseLBS " hello"
it "split code-points" $
X.parseLBS_ X.def " " @=?
H.parseBSChunks ["\xc2", "\xa0"]
it "latin1 codes" $
X.parseText_ X.def "\232" @=?
H.parseSTChunks ["\232"]
it "latin1 codes strict vs lazy" $
H.parseLT "\232" @=?
H.parseSTChunks ["\232"]
describe "HTML parsing" $ do
it "XHTML" $
let html = "fooHello World
"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "html" Map.empty
[ X.NodeElement $ X.Element "head" Map.empty
[ X.NodeElement $ X.Element "title" Map.empty
[X.NodeContent "foo"]
]
, X.NodeElement $ X.Element "body" Map.empty
[ X.NodeElement $ X.Element "p" Map.empty
[X.NodeContent "Hello World"]
]
]
in H.parseLBS html @?= doc
it "XHTML with doctype and \n\nfooHello World
"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "html" Map.empty
[ X.NodeElement $ X.Element "head" Map.empty
[ X.NodeElement $ X.Element "title" Map.empty
[X.NodeContent "foo"]
]
, X.NodeElement $ X.Element "body" Map.empty
[ X.NodeElement $ X.Element "p" Map.empty
[X.NodeContent "Hello World"]
]
]
in H.parseLBS html @?= doc
it "HTML" $
let html = "foo
Hello World
"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "html" Map.empty
[ X.NodeElement $ X.Element "head" Map.empty
[ X.NodeElement $ X.Element "title" Map.empty
[X.NodeContent "foo"]
]
, X.NodeElement $ X.Element "body" Map.empty
[ X.NodeElement $ X.Element "br" Map.empty []
, X.NodeElement $ X.Element "p" Map.empty
[X.NodeContent "Hello World"]
]
]
in H.parseLBS html @?= doc
it "Mixed case br #167" $
let html = "foo
Hello World
done"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "html" Map.empty
[ X.NodeElement $ X.Element "head" Map.empty
[ X.NodeElement $ X.Element "title" Map.empty
[X.NodeContent "foo"]
]
, X.NodeElement $ X.Element "body" Map.empty
[ X.NodeElement $ X.Element "bR" Map.empty []
, X.NodeElement $ X.Element "p" Map.empty
[X.NodeContent "Hello World"]
, X.NodeElement $ X.Element "BR" Map.empty []
, X.NodeContent "done"
]
]
in H.parseLBS html @?= doc
it "doesn't double unescape" $
let html = "Hello > World
"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "p" Map.empty
[ X.NodeContent "Hello > World"
]
in H.parseLBS html @?= doc
it "handles entities in attributes" $
let html = "
"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "br" (Map.singleton "title" "Mac & Cheese") []
in H.parseLBS html @?= doc
it "doesn't double escape entities in attributes" $
let html = "
"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "br" (Map.singleton "title" "Mac & Cheese") []
in H.parseLBS html @?= doc
describe "script tags" $ do
it "ignores funny characters" $
let html = ""
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "script" Map.empty [X.NodeContent "hello <> world"]
in H.parseLBS html @?= doc
{-
Would be nice... doesn't work with tagstream-conduit original
code. Not even sure if the HTML5 parser spec discusses this
case.
it "ignores inside string" $
let html = "\" world"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "script" Map.empty [X.NodeContent "hello \"\" world"]
in H.parseLBS html @?= doc
-}
it "unterminated" $
let html = "