html-conduit-1.1.0/0000755000000000000000000000000012110322757012306 5ustar0000000000000000html-conduit-1.1.0/Setup.hs0000644000000000000000000000005612110322757013743 0ustar0000000000000000import Distribution.Simple main = defaultMain html-conduit-1.1.0/LICENSE0000644000000000000000000000207512110322757013317 0ustar0000000000000000Copyright (c) 2012 Michael Snoyman, http://www.yesodweb.com/ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. html-conduit-1.1.0/html-conduit.cabal0000644000000000000000000000366012110322757015706 0ustar0000000000000000Name: html-conduit Version: 1.1.0 Synopsis: Parse HTML documents using xml-conduit datatypes. Description: This package uses tagstream-conduit for its parser. It automatically balances mismatched tags, so that there shouldn't be any parse failures. It does not handle a full HTML document rendering, such as adding missing html and head tags. Homepage: https://github.com/snoyberg/xml License: MIT License-file: LICENSE Author: Michael Snoyman Maintainer: michael@snoyman.com Category: Web, Text, Conduit Build-type: Simple Extra-source-files: test/main.hs Cabal-version: >=1.8 Library Exposed-modules: Text.HTML.DOM Build-depends: base >= 4 && < 5 , transformers , bytestring , containers , text , resourcet >= 0.3 && < 0.5 , conduit >= 1.0 && < 1.1 , filesystem-conduit >= 1.0 , system-filepath >= 0.4 && < 0.5 , xml-conduit >= 1.1 && < 1.2 , tagstream-conduit >= 0.4 && < 0.6 , xml-types >= 0.3 && < 0.4 test-suite test type: exitcode-stdio-1.0 main-is: main.hs hs-source-dirs: test build-depends: base , hspec >= 1.3 , HUnit , xml-conduit , html-conduit , bytestring , containers source-repository head type: git location: git://github.com/snoyberg/xml.conduit html-conduit-1.1.0/Text/0000755000000000000000000000000012110322757013232 5ustar0000000000000000html-conduit-1.1.0/Text/HTML/0000755000000000000000000000000012110322757013776 5ustar0000000000000000html-conduit-1.1.0/Text/HTML/DOM.hs0000644000000000000000000001152112110322757014751 0ustar0000000000000000{-# LANGUAGE OverloadedStrings, CPP #-} module Text.HTML.DOM ( eventConduit , sinkDoc , readFile , parseLBS ) where import Prelude hiding (readFile) import qualified Data.ByteString as S #if MIN_VERSION_tagstream_conduit(0,5,0) import qualified Text.HTML.TagStream.ByteString as TS #endif import qualified Text.HTML.TagStream as TS import qualified Data.XML.Types as XT import Data.Conduit import Data.Text (Text) import qualified Data.Text as T import qualified Data.Conduit.List as CL import Control.Arrow ((***), second) import Data.Text.Encoding (decodeUtf8With) import Data.Text.Encoding.Error (lenientDecode) import qualified Data.Set as Set import qualified Text.XML as X import Text.XML.Stream.Parse (decodeHtmlEntities) import qualified Filesystem.Path.CurrentOS as F import Data.Conduit.Filesystem (sourceFile) import qualified Data.ByteString.Lazy as L import Control.Monad.Trans.Resource (runExceptionT_) import Data.Functor.Identity (runIdentity) import Data.Maybe (mapMaybe) -- | Converts a stream of bytes to a stream of properly balanced @Event@s. -- -- Note that there may be multiple (or not) root elements. @sinkDoc@ addresses -- that case. eventConduit :: Monad m => Conduit S.ByteString m XT.Event eventConduit = TS.tokenStream =$= go [] where go stack = do mx <- await case fmap (entities . fmap' (decodeUtf8With lenientDecode)) mx of Nothing -> closeStack stack Just (TS.TagOpen local attrs isClosed) -> do let name = toName local attrs' = map (toName *** return . XT.ContentText) attrs yield $ XT.EventBeginElement name attrs' if isClosed || isVoid local then yield (XT.EventEndElement name) >> go stack else go $ name : stack Just (TS.TagClose name) | toName name `elem` stack -> let loop [] = go [] loop (n:ns) = do yield $ XT.EventEndElement n if n == toName name then go ns else loop ns in loop stack | otherwise -> go stack Just (TS.Text t) -> do yield $ XT.EventContent $ XT.ContentText t go stack Just (TS.Comment t) -> do yield $ XT.EventComment t go stack Just TS.Special{} -> go stack Just TS.Incomplete{} -> go stack toName l = XT.Name l Nothing Nothing closeStack = mapM_ (yield . XT.EventEndElement) fmap' :: (a -> b) -> TS.Token' a -> TS.Token' b fmap' f (TS.TagOpen x pairs b) = TS.TagOpen (f x) (map (f *** f) pairs) b fmap' f (TS.TagClose x) = TS.TagClose (f x) fmap' f (TS.Text x) = TS.Text (f x) fmap' f (TS.Comment x) = TS.Comment (f x) fmap' f (TS.Special x y) = TS.Special (f x) (f y) fmap' f (TS.Incomplete x) = TS.Incomplete (f x) entities :: TS.Token' Text -> TS.Token' Text entities (TS.TagOpen x pairs b) = TS.TagOpen x (map (second entities') pairs) b entities (TS.Text x) = TS.Text $ entities' x entities ts = ts entities' :: Text -> Text entities' t = case T.break (== '&') t of (_, "") -> t (before, t') -> case T.break (== ';') $ T.drop 1 t' of (_, "") -> t (entity, rest') -> let rest = T.drop 1 rest' in case decodeHtmlEntities entity of XT.ContentText entity' -> T.concat [before, entity', entities' rest] XT.ContentEntity _ -> T.concat [before, "&", entity, entities' rest'] isVoid = flip Set.member $ Set.fromList [ "area" , "base" , "br" , "col" , "command" , "embed" , "hr" , "img" , "input" , "keygen" , "link" , "meta" , "param" , "source" , "track" , "wbr" ] sinkDoc :: MonadThrow m => Sink S.ByteString m X.Document sinkDoc = fmap stripDummy $ mapOutput ((,) Nothing) eventConduit =$ addDummyWrapper =$ X.fromEvents where addDummyWrapper = do yield (Nothing, XT.EventBeginElement "html" []) awaitForever yield yield (Nothing, XT.EventEndElement "html") stripDummy doc@(X.Document pro (X.Element _ _ nodes) epi) = case mapMaybe toElement nodes of [root] -> X.Document pro root epi _ -> doc toElement (X.NodeElement e) = Just e toElement _ = Nothing readFile :: F.FilePath -> IO X.Document readFile fp = runResourceT $ sourceFile fp $$ sinkDoc parseLBS :: L.ByteString -> X.Document parseLBS lbs = runIdentity $ runExceptionT_ $ CL.sourceList (L.toChunks lbs) $$ sinkDoc html-conduit-1.1.0/test/0000755000000000000000000000000012110322757013265 5ustar0000000000000000html-conduit-1.1.0/test/main.hs0000644000000000000000000000632412110322757014552 0ustar0000000000000000{-# LANGUAGE OverloadedStrings #-} import Test.HUnit hiding (Test) import Test.Hspec import Data.ByteString.Lazy.Char8 () import qualified Text.HTML.DOM as H import qualified Text.XML as X import qualified Data.Map as Map main :: IO () main = hspec $ do describe "parses" $ do it "well-formed document" $ X.parseLBS_ X.def "baz" @=? H.parseLBS "baz" it "adds missing close tags" $ X.parseLBS_ X.def "baz" @=? H.parseLBS "baz" it "void tags" $ X.parseLBS_ X.def "foo" @=? H.parseLBS "foo" it "xml entities" $ X.parseLBS_ X.def "baz>" @=? H.parseLBS "baz>" it "html entities" $ X.parseLBS_ X.def "baz " @=? H.parseLBS "baz " it "decimal entities" $ X.parseLBS_ X.def "baz " @=? H.parseLBS "baz " it "hex entities" $ X.parseLBS_ X.def "bazŠ" @=? H.parseLBS "bazŠ" it "invalid entities" $ X.parseLBS_ X.def "baz&foobar;" @=? H.parseLBS "baz&foobar;" it "multiple root elements" $ X.parseLBS_ X.def "baz&foobar;" @=? H.parseLBS "baz&foobar;" describe "HTML parsing" $ do it "XHTML" $ let html = "foo

Hello World

" doc = X.Document (X.Prologue [] Nothing []) root [] root = X.Element "html" Map.empty [ X.NodeElement $ X.Element "head" Map.empty [ X.NodeElement $ X.Element "title" Map.empty [X.NodeContent "foo"] ] , X.NodeElement $ X.Element "body" Map.empty [ X.NodeElement $ X.Element "p" Map.empty [X.NodeContent "Hello World"] ] ] in H.parseLBS html @?= doc it "HTML" $ let html = "foo

Hello World

" doc = X.Document (X.Prologue [] Nothing []) root [] root = X.Element "html" Map.empty [ X.NodeElement $ X.Element "head" Map.empty [ X.NodeElement $ X.Element "title" Map.empty [X.NodeContent "foo"] ] , X.NodeElement $ X.Element "body" Map.empty [ X.NodeElement $ X.Element "br" Map.empty [] , X.NodeElement $ X.Element "p" Map.empty [X.NodeContent "Hello World"] ] ] in H.parseLBS html @?= doc