html-conduit-1.2.1/ChangeLog.md

## 1.2.1

* Add strict and lazy text parsing [#66](https://github.com/snoyberg/xml/pull/66)

## 1.2.0

* Drop system-filepath

## 1.1.1.2

* Fix a bug with double-unescaping of entities

html-conduit-1.2.1/html-conduit.cabal

Name:                html-conduit
Version:             1.2.1
Synopsis:            Parse HTML documents using xml-conduit datatypes.
Description:         This package uses tagstream-conduit for its parser. It
                     automatically balances mismatched tags, so that there
                     shouldn't be any parse failures. It does not handle full
                     HTML document rendering, such as adding missing html and
                     head tags.
Homepage:            https://github.com/snoyberg/xml
License:             MIT
License-file:        LICENSE
Author:              Michael Snoyman
Maintainer:          michael@snoyman.com
Category:            Web, Text, Conduit
Build-type:          Simple
Extra-source-files:  test/main.hs
                     README.md
                     ChangeLog.md
Cabal-version:       >=1.8

Library
  Exposed-modules:   Text.HTML.DOM
  Build-depends:     base              >= 4       && < 5
                   , transformers
                   , bytestring
                   , containers
                   , text
                   , resourcet         >= 0.3     && < 1.2
                   , conduit           >= 1.0     && < 1.3
                   , conduit-extra     >= 1.1.1
                   , xml-conduit       >= 1.3     && < 1.4
                   , tagstream-conduit >= 0.5.5.3 && < 0.6
                   , xml-types         >= 0.3     && < 0.4

test-suite test
  type:              exitcode-stdio-1.0
  main-is:           main.hs
  hs-source-dirs:    test
  build-depends:     base
                   , hspec >= 1.3
                   , HUnit
                   , xml-conduit
                   , html-conduit
                   , bytestring
                   , containers

source-repository head
  type:              git
  location:          git://github.com/snoyberg/xml.conduit

html-conduit-1.2.1/LICENSE

Copyright (c) 2012 Michael Snoyman, http://www.yesodweb.com/

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

html-conduit-1.2.1/README.md

This package uses tagstream-conduit for its parser. It automatically balances
mismatched tags, so that there shouldn't be any parse failures. It does not
handle full HTML document rendering, such as adding missing html and head
tags.
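A minimal usage sketch (not taken from the package documentation; the sample
markup and the render-back step are just for illustration): `parseLBS` from
`Text.HTML.DOM` produces an xml-conduit `Document` even for sloppy input, so
the repaired tree can be queried or re-rendered with the usual `Text.XML`
machinery.

```haskell
{-# LANGUAGE OverloadedStrings #-}

import           Data.ByteString.Lazy.Char8 ()   -- IsString instance for lazy ByteString
import qualified Data.Text.Lazy.IO          as TLIO
import qualified Text.HTML.DOM              as H
import qualified Text.XML                   as X

main :: IO ()
main = do
    -- parseLBS never throws a parse error: the unclosed <li> and <p> tags
    -- below are balanced automatically.
    let doc = H.parseLBS "<ul><li>one<li>two</ul><p>done"
    -- Render the balanced tree back out with xml-conduit.
    TLIO.putStrLn (X.renderText X.def doc)
```

html-conduit-1.2.1/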
html-conduit-1.2.1/Setup.hs

import Distribution.Simple
main = defaultMain

html-conduit-1.2.1/test/main.hs

{-# LANGUAGE OverloadedStrings #-}
import Test.HUnit hiding (Test)
import Test.Hspec
import Data.ByteString.Lazy.Char8 ()
import qualified Text.HTML.DOM as H
import qualified Text.XML as X
import qualified Data.Map as Map

main :: IO ()
main = hspec $ do
    describe "parses" $ do
        it "well-formed document" $
            X.parseLBS_ X.def "baz" @=? H.parseLBS "baz"
        it "adds missing close tags" $
            X.parseLBS_ X.def "baz" @=? H.parseLBS "baz"
        it "void tags" $
            X.parseLBS_ X.def "foo" @=? H.parseLBS "foo"
        it "xml entities" $
            X.parseLBS_ X.def "baz>" @=? H.parseLBS "baz>"
        it "html entities" $
            X.parseLBS_ X.def "baz " @=? H.parseLBS "baz "
        it "decimal entities" $
            X.parseLBS_ X.def "baz " @=? H.parseLBS "baz "
        it "hex entities" $
            X.parseLBS_ X.def "bazŠ" @=? H.parseLBS "bazŠ"
        it "invalid entities" $
            X.parseLBS_ X.def "baz&foobar;" @=? H.parseLBS "baz&foobar;"
        it "multiple root elements" $
            X.parseLBS_ X.def "baz&foobar;" @=? H.parseLBS "baz&foobar;"
        it "doesn't strip whitespace" $
            X.parseLBS_ X.def " hello" @=? H.parseLBS " hello"
        it "split code-points" $
            X.parseLBS_ X.def " " @=? H.parseBSChunks ["\xc2", "\xa0"]
        it "latin1 codes" $
            X.parseText_ X.def "\232" @=? H.parseSTChunks ["\232"]
        it "latin1 codes strict vs lazy" $
            H.parseLT "\232" @=? H.parseSTChunks ["\232"]
    describe "HTML parsing" $ do
        it "XHTML" $
            let html = "<html><head><title>foo</title></head><body><p>Hello World</p></body></html>"
                doc = X.Document (X.Prologue [] Nothing []) root []
                root = X.Element "html" Map.empty
                    [ X.NodeElement $ X.Element "head" Map.empty
                        [ X.NodeElement $ X.Element "title" Map.empty
                            [X.NodeContent "foo"]
                        ]
                    , X.NodeElement $ X.Element "body" Map.empty
                        [ X.NodeElement $ X.Element "p" Map.empty
                            [X.NodeContent "Hello World"]
                        ]
                    ]
             in H.parseLBS html @?= doc
        it "XHTML with doctype and \n\nfoo

Hello World

" doc = X.Document (X.Prologue [] Nothing []) root [] root = X.Element "html" Map.empty [ X.NodeElement $ X.Element "head" Map.empty [ X.NodeElement $ X.Element "title" Map.empty [X.NodeContent "foo"] ] , X.NodeElement $ X.Element "body" Map.empty [ X.NodeElement $ X.Element "p" Map.empty [X.NodeContent "Hello World"] ] ] in H.parseLBS html @?= doc it "HTML" $ let html = "foo

Hello World

" doc = X.Document (X.Prologue [] Nothing []) root [] root = X.Element "html" Map.empty [ X.NodeElement $ X.Element "head" Map.empty [ X.NodeElement $ X.Element "title" Map.empty [X.NodeContent "foo"] ] , X.NodeElement $ X.Element "body" Map.empty [ X.NodeElement $ X.Element "br" Map.empty [] , X.NodeElement $ X.Element "p" Map.empty [X.NodeContent "Hello World"] ] ] in H.parseLBS html @?= doc it "doesn't double unescape" $ let html = "

Hello &gt; World

" doc = X.Document (X.Prologue [] Nothing []) root [] root = X.Element "p" Map.empty [ X.NodeContent "Hello > World" ] in H.parseLBS html @?= doc html-conduit-1.2.1/Text/0000755000000000000000000000000012621147426013241 5ustar0000000000000000html-conduit-1.2.1/Text/HTML/0000755000000000000000000000000012621147426014005 5ustar0000000000000000html-conduit-1.2.1/Text/HTML/DOM.hs0000644000000000000000000001070512621147426014763 0ustar0000000000000000{-# LANGUAGE OverloadedStrings #-} module Text.HTML.DOM ( eventConduit , sinkDoc , readFile , parseLBS , parseBSChunks , eventConduitText , sinkDocText , parseLT , parseSTChunks ) where import Control.Monad.Trans.Resource import Prelude hiding (readFile) import qualified Data.ByteString as S import qualified Text.HTML.TagStream.Text as TS import qualified Text.HTML.TagStream as TS import qualified Data.XML.Types as XT import Data.Conduit import Data.Conduit.Text (decodeUtf8Lenient) import qualified Data.Text as T import qualified Data.Text.Lazy as TL import qualified Data.Conduit.List as CL import Control.Arrow ((***)) import qualified Data.Set as Set import qualified Text.XML as X import Text.XML.Stream.Parse (decodeHtmlEntities) import Data.Conduit.Binary (sourceFile) import qualified Data.ByteString.Lazy as L import Control.Monad.Trans.Resource (runExceptionT_) import Data.Functor.Identity (runIdentity) import Data.Maybe (mapMaybe) -- | Converts a stream of bytes to a stream of properly balanced @Event@s. -- -- Note that there may be multiple (or not) root elements. @sinkDoc@ addresses -- that case. eventConduit :: Monad m => Conduit S.ByteString m XT.Event eventConduit = decodeUtf8Lenient =$= eventConduit' eventConduitText :: Monad m => Conduit T.Text m XT.Event eventConduitText = eventConduit' eventConduit' :: Monad m => Conduit T.Text m XT.Event eventConduit' = TS.tokenStream =$= go [] where go stack = do mx <- await case mx of Nothing -> closeStack stack -- Ignore processing instructions (or pseudo-instructions) Just (TS.TagOpen local _ _) | "?" `T.isPrefixOf` local -> go stack Just (TS.TagOpen local attrs isClosed) -> do let name = toName local attrs' = map (toName *** return . XT.ContentText) attrs yield $ XT.EventBeginElement name attrs' if isClosed || isVoid local then yield (XT.EventEndElement name) >> go stack else go $ name : stack Just (TS.TagClose name) | toName name `elem` stack -> let loop [] = go [] loop (n:ns) = do yield $ XT.EventEndElement n if n == toName name then go ns else loop ns in loop stack | otherwise -> go stack Just (TS.Text t) -> do yield $ XT.EventContent $ XT.ContentText t go stack Just (TS.Comment t) -> do yield $ XT.EventComment t go stack Just TS.Special{} -> go stack Just TS.Incomplete{} -> go stack toName l = XT.Name l Nothing Nothing closeStack = mapM_ (yield . 
XT.EventEndElement) isVoid = flip Set.member $ Set.fromList [ "area" , "base" , "br" , "col" , "command" , "embed" , "hr" , "img" , "input" , "keygen" , "link" , "meta" , "param" , "source" , "track" , "wbr" ] sinkDoc :: MonadThrow m => Sink S.ByteString m X.Document sinkDoc = sinkDoc' eventConduit sinkDocText :: MonadThrow m => Sink T.Text m X.Document sinkDocText = sinkDoc' eventConduitText sinkDoc' :: (Monad m, MonadThrow m) => Conduit a m XT.Event -> Sink a m X.Document sinkDoc' f = fmap stripDummy $ mapOutput ((,) Nothing) f =$ addDummyWrapper =$ X.fromEvents where addDummyWrapper = do yield (Nothing, XT.EventBeginElement "html" []) awaitForever yield yield (Nothing, XT.EventEndElement "html") stripDummy doc@(X.Document pro (X.Element _ _ nodes) epi) = case mapMaybe toElement nodes of [root] -> X.Document pro root epi _ -> doc toElement (X.NodeElement e) = Just e toElement _ = Nothing readFile :: FilePath -> IO X.Document readFile fp = runResourceT $ sourceFile fp $$ sinkDoc parseLBS :: L.ByteString -> X.Document parseLBS = parseBSChunks . L.toChunks parseBSChunks :: [S.ByteString] -> X.Document parseBSChunks tss = runIdentity $ runExceptionT_ $ CL.sourceList tss $$ sinkDoc parseLT :: TL.Text -> X.Document parseLT = parseSTChunks . TL.toChunks parseSTChunks :: [T.Text] -> X.Document parseSTChunks tss = runIdentity $ runExceptionT_ $ CL.sourceList tss $$ sinkDocText
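
-- A usage sketch for the streaming interface (not part of the published API
-- docs).  The file name below is hypothetical, and the snippet assumes the
-- conduit, conduit-extra, and resourcet versions listed in the cabal file.
--
-- > import           Control.Monad.Trans.Resource (runResourceT)
-- > import           Data.Conduit                 (($$), (=$=))
-- > import           Data.Conduit.Binary          (sourceFile)
-- > import qualified Data.Conduit.List            as CL
-- > import qualified Text.HTML.DOM                as H
-- >
-- > main :: IO ()
-- > main = do
-- >     -- Whole-document parsing from a file on disk:
-- >     doc <- H.readFile "page.html"
-- >     print doc
-- >     -- Or stream the bytes yourself and inspect the balanced events:
-- >     events <- runResourceT $
-- >         sourceFile "page.html" $$ H.eventConduit =$= CL.consume
-- >     mapM_ print events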