html-conduit-1.1.0/ 0000755 0000000 0000000 00000000000 12110322757 012306 5 ustar 00 0000000 0000000 html-conduit-1.1.0/Setup.hs 0000644 0000000 0000000 00000000056 12110322757 013743 0 ustar 00 0000000 0000000 import Distribution.Simple
main = defaultMain
html-conduit-1.1.0/LICENSE 0000644 0000000 0000000 00000002075 12110322757 013317 0 ustar 00 0000000 0000000 Copyright (c) 2012 Michael Snoyman, http://www.yesodweb.com/
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
html-conduit-1.1.0/html-conduit.cabal 0000644 0000000 0000000 00000003660 12110322757 015706 0 ustar 00 0000000 0000000 Name: html-conduit
Version: 1.1.0
Synopsis: Parse HTML documents using xml-conduit datatypes.
Description: This package uses tagstream-conduit for its parser. It automatically balances mismatched tags, so that there shouldn't be any parse failures. It does not handle a full HTML document rendering, such as adding missing html and head tags.
Homepage: https://github.com/snoyberg/xml
License: MIT
License-file: LICENSE
Author: Michael Snoyman
Maintainer: michael@snoyman.com
Category: Web, Text, Conduit
Build-type: Simple
Extra-source-files: test/main.hs
Cabal-version: >=1.8
Library
Exposed-modules: Text.HTML.DOM
Build-depends: base >= 4 && < 5
, transformers
, bytestring
, containers
, text
, resourcet >= 0.3 && < 0.5
, conduit >= 1.0 && < 1.1
, filesystem-conduit >= 1.0
, system-filepath >= 0.4 && < 0.5
, xml-conduit >= 1.1 && < 1.2
, tagstream-conduit >= 0.4 && < 0.6
, xml-types >= 0.3 && < 0.4
test-suite test
type: exitcode-stdio-1.0
main-is: main.hs
hs-source-dirs: test
build-depends: base
, hspec >= 1.3
, HUnit
, xml-conduit
, html-conduit
, bytestring
, containers
source-repository head
type: git
location: git://github.com/snoyberg/xml.conduit
html-conduit-1.1.0/Text/ 0000755 0000000 0000000 00000000000 12110322757 013232 5 ustar 00 0000000 0000000 html-conduit-1.1.0/Text/HTML/ 0000755 0000000 0000000 00000000000 12110322757 013776 5 ustar 00 0000000 0000000 html-conduit-1.1.0/Text/HTML/DOM.hs 0000644 0000000 0000000 00000011521 12110322757 014751 0 ustar 00 0000000 0000000 {-# LANGUAGE OverloadedStrings, CPP #-}
module Text.HTML.DOM
( eventConduit
, sinkDoc
, readFile
, parseLBS
) where
import Prelude hiding (readFile)
import qualified Data.ByteString as S
#if MIN_VERSION_tagstream_conduit(0,5,0)
import qualified Text.HTML.TagStream.ByteString as TS
#endif
import qualified Text.HTML.TagStream as TS
import qualified Data.XML.Types as XT
import Data.Conduit
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Conduit.List as CL
import Control.Arrow ((***), second)
import Data.Text.Encoding (decodeUtf8With)
import Data.Text.Encoding.Error (lenientDecode)
import qualified Data.Set as Set
import qualified Text.XML as X
import Text.XML.Stream.Parse (decodeHtmlEntities)
import qualified Filesystem.Path.CurrentOS as F
import Data.Conduit.Filesystem (sourceFile)
import qualified Data.ByteString.Lazy as L
import Control.Monad.Trans.Resource (runExceptionT_)
import Data.Functor.Identity (runIdentity)
import Data.Maybe (mapMaybe)
-- | Converts a stream of bytes to a stream of properly balanced @Event@s.
--
-- Note that there may be multiple (or not) root elements. @sinkDoc@ addresses
-- that case.
eventConduit :: Monad m => Conduit S.ByteString m XT.Event
eventConduit =
TS.tokenStream =$= go []
where
go stack = do
mx <- await
case fmap (entities . fmap' (decodeUtf8With lenientDecode)) mx of
Nothing -> closeStack stack
Just (TS.TagOpen local attrs isClosed) -> do
let name = toName local
attrs' = map (toName *** return . XT.ContentText) attrs
yield $ XT.EventBeginElement name attrs'
if isClosed || isVoid local
then yield (XT.EventEndElement name) >> go stack
else go $ name : stack
Just (TS.TagClose name)
| toName name `elem` stack ->
let loop [] = go []
loop (n:ns) = do
yield $ XT.EventEndElement n
if n == toName name
then go ns
else loop ns
in loop stack
| otherwise -> go stack
Just (TS.Text t) -> do
yield $ XT.EventContent $ XT.ContentText t
go stack
Just (TS.Comment t) -> do
yield $ XT.EventComment t
go stack
Just TS.Special{} -> go stack
Just TS.Incomplete{} -> go stack
toName l = XT.Name l Nothing Nothing
closeStack = mapM_ (yield . XT.EventEndElement)
fmap' :: (a -> b) -> TS.Token' a -> TS.Token' b
fmap' f (TS.TagOpen x pairs b) = TS.TagOpen (f x) (map (f *** f) pairs) b
fmap' f (TS.TagClose x) = TS.TagClose (f x)
fmap' f (TS.Text x) = TS.Text (f x)
fmap' f (TS.Comment x) = TS.Comment (f x)
fmap' f (TS.Special x y) = TS.Special (f x) (f y)
fmap' f (TS.Incomplete x) = TS.Incomplete (f x)
entities :: TS.Token' Text -> TS.Token' Text
entities (TS.TagOpen x pairs b) = TS.TagOpen x (map (second entities') pairs) b
entities (TS.Text x) = TS.Text $ entities' x
entities ts = ts
entities' :: Text -> Text
entities' t =
case T.break (== '&') t of
(_, "") -> t
(before, t') ->
case T.break (== ';') $ T.drop 1 t' of
(_, "") -> t
(entity, rest') ->
let rest = T.drop 1 rest'
in case decodeHtmlEntities entity of
XT.ContentText entity' -> T.concat [before, entity', entities' rest]
XT.ContentEntity _ -> T.concat [before, "&", entity, entities' rest']
isVoid = flip Set.member $ Set.fromList
[ "area"
, "base"
, "br"
, "col"
, "command"
, "embed"
, "hr"
, "img"
, "input"
, "keygen"
, "link"
, "meta"
, "param"
, "source"
, "track"
, "wbr"
]
sinkDoc :: MonadThrow m => Sink S.ByteString m X.Document
sinkDoc =
fmap stripDummy $ mapOutput ((,) Nothing) eventConduit =$ addDummyWrapper =$ X.fromEvents
where
addDummyWrapper = do
yield (Nothing, XT.EventBeginElement "html" [])
awaitForever yield
yield (Nothing, XT.EventEndElement "html")
stripDummy doc@(X.Document pro (X.Element _ _ nodes) epi) =
case mapMaybe toElement nodes of
[root] -> X.Document pro root epi
_ -> doc
toElement (X.NodeElement e) = Just e
toElement _ = Nothing
readFile :: F.FilePath -> IO X.Document
readFile fp = runResourceT $ sourceFile fp $$ sinkDoc
parseLBS :: L.ByteString -> X.Document
parseLBS lbs = runIdentity $ runExceptionT_ $ CL.sourceList (L.toChunks lbs) $$ sinkDoc
html-conduit-1.1.0/test/ 0000755 0000000 0000000 00000000000 12110322757 013265 5 ustar 00 0000000 0000000 html-conduit-1.1.0/test/main.hs 0000644 0000000 0000000 00000006324 12110322757 014552 0 ustar 00 0000000 0000000 {-# LANGUAGE OverloadedStrings #-}
import Test.HUnit hiding (Test)
import Test.Hspec
import Data.ByteString.Lazy.Char8 ()
import qualified Text.HTML.DOM as H
import qualified Text.XML as X
import qualified Data.Map as Map
main :: IO ()
main = hspec $ do
describe "parses" $ do
it "well-formed document" $
X.parseLBS_ X.def "baz" @=?
H.parseLBS "baz"
it "adds missing close tags" $
X.parseLBS_ X.def "baz" @=?
H.parseLBS "baz"
it "void tags" $
X.parseLBS_ X.def "
foo" @=?
H.parseLBS "
foo"
it "xml entities" $
X.parseLBS_ X.def "baz>" @=?
H.parseLBS "baz>"
it "html entities" $
X.parseLBS_ X.def "baz " @=?
H.parseLBS "baz "
it "decimal entities" $
X.parseLBS_ X.def "baz " @=?
H.parseLBS "baz "
it "hex entities" $
X.parseLBS_ X.def "bazŠ" @=?
H.parseLBS "bazŠ"
it "invalid entities" $
X.parseLBS_ X.def "baz&foobar;" @=?
H.parseLBS "baz&foobar;"
it "multiple root elements" $
X.parseLBS_ X.def "baz&foobar;" @=?
H.parseLBS "baz&foobar;"
describe "HTML parsing" $ do
it "XHTML" $
let html = "fooHello World
"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "html" Map.empty
[ X.NodeElement $ X.Element "head" Map.empty
[ X.NodeElement $ X.Element "title" Map.empty
[X.NodeContent "foo"]
]
, X.NodeElement $ X.Element "body" Map.empty
[ X.NodeElement $ X.Element "p" Map.empty
[X.NodeContent "Hello World"]
]
]
in H.parseLBS html @?= doc
it "HTML" $
let html = "foo
Hello World
"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "html" Map.empty
[ X.NodeElement $ X.Element "head" Map.empty
[ X.NodeElement $ X.Element "title" Map.empty
[X.NodeContent "foo"]
]
, X.NodeElement $ X.Element "body" Map.empty
[ X.NodeElement $ X.Element "br" Map.empty []
, X.NodeElement $ X.Element "p" Map.empty
[X.NodeContent "Hello World"]
]
]
in H.parseLBS html @?= doc