html-conduit-1.2.1.2/Text/ 0000755 0000000 0000000 00000000000 12621142441 013371 5 ustar 00 0000000 0000000 html-conduit-1.2.1.2/Text/HTML/ 0000755 0000000 0000000 00000000000 12713700167 014144 5 ustar 00 0000000 0000000 html-conduit-1.2.1.2/test/ 0000755 0000000 0000000 00000000000 12657024067 013440 5 ustar 00 0000000 0000000 html-conduit-1.2.1.2/Text/HTML/DOM.hs 0000644 0000000 0000000 00000010705 12657024067 015127 0 ustar 00 0000000 0000000 {-# LANGUAGE OverloadedStrings #-}
module Text.HTML.DOM
( eventConduit
, sinkDoc
, readFile
, parseLBS
, parseBSChunks
, eventConduitText
, sinkDocText
, parseLT
, parseSTChunks
) where
import Control.Monad.Trans.Resource
import Prelude hiding (readFile)
import qualified Data.ByteString as S
import qualified Text.HTML.TagStream.Text as TS
import qualified Text.HTML.TagStream as TS
import qualified Data.XML.Types as XT
import Data.Conduit
import Data.Conduit.Text (decodeUtf8Lenient)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Conduit.List as CL
import Control.Arrow ((***))
import qualified Data.Set as Set
import qualified Text.XML as X
import Text.XML.Stream.Parse (decodeHtmlEntities)
import Data.Conduit.Binary (sourceFile)
import qualified Data.ByteString.Lazy as L
import Control.Monad.Trans.Resource (runExceptionT_)
import Data.Functor.Identity (runIdentity)
import Data.Maybe (mapMaybe)
-- | Converts a stream of bytes to a stream of properly balanced @Event@s.
--
-- Note that there may be multiple (or not) root elements. @sinkDoc@ addresses
-- that case.
eventConduit :: Monad m => Conduit S.ByteString m XT.Event
eventConduit = decodeUtf8Lenient =$= eventConduit'
eventConduitText :: Monad m => Conduit T.Text m XT.Event
eventConduitText = eventConduit'
eventConduit' :: Monad m => Conduit T.Text m XT.Event
eventConduit' =
TS.tokenStream =$= go []
where
go stack = do
mx <- await
case mx of
Nothing -> closeStack stack
-- Ignore processing instructions (or pseudo-instructions)
Just (TS.TagOpen local _ _) | "?" `T.isPrefixOf` local -> go stack
Just (TS.TagOpen local attrs isClosed) -> do
let name = toName local
attrs' = map (toName *** return . XT.ContentText) attrs
yield $ XT.EventBeginElement name attrs'
if isClosed || isVoid local
then yield (XT.EventEndElement name) >> go stack
else go $ name : stack
Just (TS.TagClose name)
| toName name `elem` stack ->
let loop [] = go []
loop (n:ns) = do
yield $ XT.EventEndElement n
if n == toName name
then go ns
else loop ns
in loop stack
| otherwise -> go stack
Just (TS.Text t) -> do
yield $ XT.EventContent $ XT.ContentText t
go stack
Just (TS.Comment t) -> do
yield $ XT.EventComment t
go stack
Just TS.Special{} -> go stack
Just TS.Incomplete{} -> go stack
toName l = XT.Name l Nothing Nothing
closeStack = mapM_ (yield . XT.EventEndElement)
isVoid = flip Set.member $ Set.fromList
[ "area"
, "base"
, "br"
, "col"
, "command"
, "embed"
, "hr"
, "img"
, "input"
, "keygen"
, "link"
, "meta"
, "param"
, "source"
, "track"
, "wbr"
]
sinkDoc :: MonadThrow m => Sink S.ByteString m X.Document
sinkDoc = sinkDoc' eventConduit
sinkDocText :: MonadThrow m => Sink T.Text m X.Document
sinkDocText = sinkDoc' eventConduitText
sinkDoc' :: (Monad m, MonadThrow m) => Conduit a m XT.Event -> Sink a m X.Document
sinkDoc' f =
fmap stripDummy $ mapOutput ((,) Nothing) f =$ addDummyWrapper =$ X.fromEvents
where
addDummyWrapper = do
yield (Nothing, XT.EventBeginElement "html" [])
awaitForever yield
yield (Nothing, XT.EventEndElement "html")
stripDummy doc@(X.Document pro (X.Element _ _ nodes) epi) =
case mapMaybe toElement nodes of
[root] -> X.Document pro root epi
_ -> doc
toElement (X.NodeElement e) = Just e
toElement _ = Nothing
readFile :: FilePath -> IO X.Document
readFile fp = runResourceT $ sourceFile fp $$ sinkDoc
parseLBS :: L.ByteString -> X.Document
parseLBS = parseBSChunks . L.toChunks
parseBSChunks :: [S.ByteString] -> X.Document
parseBSChunks tss = runIdentity $ runExceptionT_ $ CL.sourceList tss $$ sinkDoc
parseLT :: TL.Text -> X.Document
parseLT = parseSTChunks . TL.toChunks
parseSTChunks :: [T.Text] -> X.Document
parseSTChunks tss = runIdentity $ runExceptionT_ $ CL.sourceList tss $$ sinkDocText
html-conduit-1.2.1.2/test/main.hs 0000644 0000000 0000000 00000011673 12657024067 014730 0 ustar 00 0000000 0000000 {-# LANGUAGE OverloadedStrings #-}
import Test.HUnit hiding (Test)
import Test.Hspec
import Data.ByteString.Lazy.Char8 ()
import qualified Text.HTML.DOM as H
import qualified Text.XML as X
import qualified Data.Map as Map
main :: IO ()
main = hspec $ do
describe "parses" $ do
it "well-formed document" $
X.parseLBS_ X.def "baz" @=?
H.parseLBS "baz"
it "adds missing close tags" $
X.parseLBS_ X.def "baz" @=?
H.parseLBS "baz"
it "void tags" $
X.parseLBS_ X.def "
foo" @=?
H.parseLBS "
foo"
it "xml entities" $
X.parseLBS_ X.def "baz>" @=?
H.parseLBS "baz>"
it "html entities" $
X.parseLBS_ X.def "baz " @=?
H.parseLBS "baz "
it "decimal entities" $
X.parseLBS_ X.def "baz " @=?
H.parseLBS "baz "
it "hex entities" $
X.parseLBS_ X.def "bazŠ" @=?
H.parseLBS "bazŠ"
it "invalid entities" $
X.parseLBS_ X.def "baz&foobar;" @=?
H.parseLBS "baz&foobar;"
it "multiple root elements" $
X.parseLBS_ X.def "baz&foobar;" @=?
H.parseLBS "baz&foobar;"
it "doesn't strip whitespace" $
X.parseLBS_ X.def " hello" @=?
H.parseLBS " hello"
it "split code-points" $
X.parseLBS_ X.def " " @=?
H.parseBSChunks ["\xc2", "\xa0"]
it "latin1 codes" $
X.parseText_ X.def "\232" @=?
H.parseSTChunks ["\232"]
it "latin1 codes strict vs lazy" $
H.parseLT "\232" @=?
H.parseSTChunks ["\232"]
describe "HTML parsing" $ do
it "XHTML" $
let html = "fooHello World
"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "html" Map.empty
[ X.NodeElement $ X.Element "head" Map.empty
[ X.NodeElement $ X.Element "title" Map.empty
[X.NodeContent "foo"]
]
, X.NodeElement $ X.Element "body" Map.empty
[ X.NodeElement $ X.Element "p" Map.empty
[X.NodeContent "Hello World"]
]
]
in H.parseLBS html @?= doc
it "XHTML with doctype and \n\nfooHello World
"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "html" Map.empty
[ X.NodeElement $ X.Element "head" Map.empty
[ X.NodeElement $ X.Element "title" Map.empty
[X.NodeContent "foo"]
]
, X.NodeElement $ X.Element "body" Map.empty
[ X.NodeElement $ X.Element "p" Map.empty
[X.NodeContent "Hello World"]
]
]
in H.parseLBS html @?= doc
it "HTML" $
let html = "foo
Hello World
"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "html" Map.empty
[ X.NodeElement $ X.Element "head" Map.empty
[ X.NodeElement $ X.Element "title" Map.empty
[X.NodeContent "foo"]
]
, X.NodeElement $ X.Element "body" Map.empty
[ X.NodeElement $ X.Element "br" Map.empty []
, X.NodeElement $ X.Element "p" Map.empty
[X.NodeContent "Hello World"]
]
]
in H.parseLBS html @?= doc
it "doesn't double unescape" $
let html = "Hello > World
"
doc = X.Document (X.Prologue [] Nothing []) root []
root = X.Element "p" Map.empty
[ X.NodeContent "Hello > World"
]
in H.parseLBS html @?= doc
html-conduit-1.2.1.2/LICENSE 0000644 0000000 0000000 00000002075 12621142441 013456 0 ustar 00 0000000 0000000 Copyright (c) 2012 Michael Snoyman, http://www.yesodweb.com/
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
html-conduit-1.2.1.2/Setup.hs 0000644 0000000 0000000 00000000056 12621142441 014102 0 ustar 00 0000000 0000000 import Distribution.Simple
main = defaultMain
html-conduit-1.2.1.2/html-conduit.cabal 0000644 0000000 0000000 00000003544 13162672470 016061 0 ustar 00 0000000 0000000 Name: html-conduit
Version: 1.2.1.2
Synopsis: Parse HTML documents using xml-conduit datatypes.
Description: This package uses tagstream-conduit for its parser. It automatically balances mismatched tags, so that there shouldn't be any parse failures. It does not handle a full HTML document rendering, such as adding missing html and head tags.
Homepage: https://github.com/snoyberg/xml
License: MIT
License-file: LICENSE
Author: Michael Snoyman
Maintainer: michael@snoyman.com
Category: Web, Text, Conduit
Build-type: Simple
Extra-source-files: test/main.hs README.md ChangeLog.md
Cabal-version: >=1.8
Library
Exposed-modules: Text.HTML.DOM
Build-depends: base >= 4 && < 5
, transformers
, bytestring
, containers
, text
, resourcet >= 0.3 && < 1.2
, conduit >= 1.0 && < 1.3
, conduit-extra >= 1.1.1
, xml-conduit >= 1.3
, tagstream-conduit >= 0.5.5.3 && < 0.6
, xml-types >= 0.3 && < 0.4
test-suite test
type: exitcode-stdio-1.0
main-is: main.hs
hs-source-dirs: test
build-depends: base
, hspec >= 1.3
, HUnit
, xml-conduit
, html-conduit
, bytestring
, containers
source-repository head
type: git
location: git://github.com/snoyberg/xml.conduit
html-conduit-1.2.1.2/README.md 0000644 0000000 0000000 00000002002 13012347521 013717 0 ustar 00 0000000 0000000 This package uses tagstream-conduit for its parser. It automatically balances
mismatched tags, so that there shouldn't be any parse failures. It does not
handle a full HTML document rendering, such as adding missing html and head
tags.
Simple usage example:
```haskell
#!/usr/bin/env stack
{- stack --install-ghc --resolver lts-6.23 runghc
--package http-conduit --package html-conduit
-}
{-# LANGUAGE OverloadedStrings #-}
import qualified Data.Text.IO as T
import Network.HTTP.Simple (httpSink)
import Text.HTML.DOM (sinkDoc)
import Text.XML.Cursor (attributeIs, content, element,
fromDocument, ($//), (&/), (&//))
main :: IO ()
main = do
doc <- httpSink "http://www.yesodweb.com/book" $ const sinkDoc
let cursor = fromDocument doc
T.putStrLn "Chapters in the Yesod book:\n"
mapM_ T.putStrLn
$ cursor
$// attributeIs "class" "main-listing"
&// element "li"
&/ element "a"
&/ content
```
html-conduit-1.2.1.2/ChangeLog.md 0000644 0000000 0000000 00000000426 13162672504 014631 0 ustar 00 0000000 0000000 ## 1.2.1.2
* Remove an upper bound
* Doc improvement
## 1.2.1.1
* Allow xml-conduit 1.4
## 1.2.1
* Add strict and lazy text parsing [#66](https://github.com/snoyberg/xml/pull/66)
## 1.2.0
* Drop system-filepath
## 1.1.1.2
* Fix a bug with double-unescaping of entities