pax_global_header 0000666 0000000 0000000 00000000064 14557005770 0014524 g ustar 00root root 0000000 0000000 52 comment=ce552259c070454e45ecaf63ff90fd807bc78d90
golang-github-antchfx-htmlquery-1.3.0/ 0000775 0000000 0000000 00000000000 14557005770 0017735 5 ustar 00root root 0000000 0000000 golang-github-antchfx-htmlquery-1.3.0/.gitignore 0000664 0000000 0000000 00000000462 14557005770 0021727 0 ustar 00root root 0000000 0000000 # vscode
.vscode
debug
*.test
./build
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so
# Folders
_obj
_test
# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out
*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*
_testmain.go
*.exe
*.test
*.prof golang-github-antchfx-htmlquery-1.3.0/.travis.yml 0000664 0000000 0000000 00000000462 14557005770 0022050 0 ustar 00root root 0000000 0000000 language: go
go:
- 1.9.x
- 1.12.x
- 1.13.x
install:
- go get golang.org/x/net/html/charset
- go get golang.org/x/net/html
- go get github.com/antchfx/xpath
- go get github.com/mattn/goveralls
- go get github.com/golang/groupcache
script:
- $HOME/gopath/bin/goveralls -service=travis-ci golang-github-antchfx-htmlquery-1.3.0/LICENSE 0000664 0000000 0000000 00000001776 14557005770 0020755 0 ustar 00root root 0000000 0000000 Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. golang-github-antchfx-htmlquery-1.3.0/README.md 0000664 0000000 0000000 00000010561 14557005770 0021217 0 ustar 00root root 0000000 0000000 htmlquery
====
[](https://travis-ci.org/antchfx/htmlquery)
[](https://coveralls.io/github/antchfx/htmlquery?branch=master)
[](https://godoc.org/github.com/antchfx/htmlquery)
[](https://goreportcard.com/report/github.com/antchfx/htmlquery)
Overview
====
`htmlquery` is an XPath query package for HTML, lets you extract data or evaluate from HTML documents by an XPath expression.
`htmlquery` built-in the query object caching feature based on [LRU](https://godoc.org/github.com/golang/groupcache/lru), this feature will caching the recently used XPATH query string. Enable query caching can avoid re-compile XPath expression each query.
You can visit this page to learn about the supported XPath(1.0/2.0) syntax. https://github.com/antchfx/xpath
XPath query packages for Go
===
| Name | Description |
| ------------------------------------------------- | ----------------------------------------- |
| [htmlquery](https://github.com/antchfx/htmlquery) | XPath query package for the HTML document |
| [xmlquery](https://github.com/antchfx/xmlquery) | XPath query package for the XML document |
| [jsonquery](https://github.com/antchfx/jsonquery) | XPath query package for the JSON document |
Installation
====
```
go get github.com/antchfx/htmlquery
```
Getting Started
====
#### Query, returns matched elements or error.
```go
nodes, err := htmlquery.QueryAll(doc, "//a")
if err != nil {
panic(`not a valid XPath expression.`)
}
```
#### Load HTML document from URL.
```go
doc, err := htmlquery.LoadURL("http://example.com/")
```
#### Load HTML from document.
```go
filePath := "/home/user/sample.html"
doc, err := htmlquery.LoadDoc(filePath)
```
#### Load HTML document from string.
```go
s := `....`
doc, err := htmlquery.Parse(strings.NewReader(s))
```
#### Find all A elements.
```go
list := htmlquery.Find(doc, "//a")
```
#### Find all A elements that have `href` attribute.
```go
list := htmlquery.Find(doc, "//a[@href]")
```
#### Find all A elements with `href` attribute and only return `href` value.
```go
list := htmlquery.Find(doc, "//a/@href")
for _ , n := range list{
fmt.Println(htmlquery.SelectAttr(n, "href")) // output @href value
}
```
### Find the third A element.
```go
a := htmlquery.FindOne(doc, "//a[3]")
```
### Find children element (img) under A `href` and print the source
```go
a := htmlquery.FindOne(doc, "//a")
img := htmlquery.FindOne(a, "//img")
fmt.Prinln(htmlquery.SelectAttr(img, "src")) // output @src value
```
#### Evaluate the number of all IMG element.
```go
expr, _ := xpath.Compile("count(//img)")
v := expr.Evaluate(htmlquery.CreateXPathNavigator(doc)).(float64)
fmt.Printf("total count is %f", v)
```
Quick Starts
===
```go
func main() {
doc, err := htmlquery.LoadURL("https://www.bing.com/search?q=golang")
if err != nil {
panic(err)
}
// Find all news item.
list, err := htmlquery.QueryAll(doc, "//ol/li")
if err != nil {
panic(err)
}
for i, n := range list {
a := htmlquery.FindOne(n, "//a")
if a != nil {
fmt.Printf("%d %s(%s)\n", i, htmlquery.InnerText(a), htmlquery.SelectAttr(a, "href"))
}
}
}
```
FAQ
====
#### `Find()` vs `QueryAll()`, which is better?
`Find` and `QueryAll` both do the same things, searches all of matched html nodes.
The `Find` will panics if you give an error XPath query, but `QueryAll` will return an error for you.
#### Can I save my query expression object for the next query?
Yes, you can. We offer the `QuerySelector` and `QuerySelectorAll` methods, It will accept your query expression object.
Cache a query expression object(or reused) will avoid re-compile XPath query expression, improve your query performance.
#### XPath query object cache performance
```
goos: windows
goarch: amd64
pkg: github.com/antchfx/htmlquery
BenchmarkSelectorCache-4 20000000 55.2 ns/op
BenchmarkDisableSelectorCache-4 500000 3162 ns/op
```
#### How to disable caching?
```
htmlquery.DisableSelectorCache = true
```
Questions
===
Please let me know if you have any questions.
golang-github-antchfx-htmlquery-1.3.0/cache.go 0000664 0000000 0000000 00000001622 14557005770 0021330 0 ustar 00root root 0000000 0000000 package htmlquery
import (
"sync"
"github.com/antchfx/xpath"
"github.com/golang/groupcache/lru"
)
// DisableSelectorCache will disable caching for the query selector if value is true.
var DisableSelectorCache = false
// SelectorCacheMaxEntries allows how many selector object can be caching. Default is 50.
// Will disable caching if SelectorCacheMaxEntries <= 0.
var SelectorCacheMaxEntries = 50
var (
cacheOnce sync.Once
cache *lru.Cache
cacheMutex sync.Mutex
)
func getQuery(expr string) (*xpath.Expr, error) {
if DisableSelectorCache || SelectorCacheMaxEntries <= 0 {
return xpath.Compile(expr)
}
cacheOnce.Do(func() {
cache = lru.New(SelectorCacheMaxEntries)
})
cacheMutex.Lock()
defer cacheMutex.Unlock()
if v, ok := cache.Get(expr); ok {
return v.(*xpath.Expr), nil
}
v, err := xpath.Compile(expr)
if err != nil {
return nil, err
}
cache.Add(expr, v)
return v, nil
}
golang-github-antchfx-htmlquery-1.3.0/go.mod 0000664 0000000 0000000 00000000265 14557005770 0021046 0 ustar 00root root 0000000 0000000 module github.com/antchfx/htmlquery
go 1.14
require (
github.com/antchfx/xpath v1.2.3
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da
golang.org/x/net v0.5.0
)
golang-github-antchfx-htmlquery-1.3.0/go.sum 0000664 0000000 0000000 00000006225 14557005770 0021075 0 ustar 00root root 0000000 0000000 github.com/antchfx/xpath v1.2.3 h1:CCZWOzv5bAqjVv0offZ2LVgVYFbeldKQVuLNbViZdes=
github.com/antchfx/xpath v1.2.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE=
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.5.0 h1:GyT4nK/YDHSqa1c4753ouYCDajOYKTja9Xb/OHtgvSw=
golang.org/x/net v0.5.0/go.mod h1:DivGGAXEgPSlEBzxGzZI+ZLohi+xUj054jfeKui00ws=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.4.0/go.mod h1:9P2UbLfCdcvo3p/nzKvsmas4TnlujnuoV9hGgYzW1lQ=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.6.0 h1:3XmdazWV+ubf7QgHSTWeykHOci5oeekaGJBLkrkaw4k=
golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang-github-antchfx-htmlquery-1.3.0/query.go 0000664 0000000 0000000 00000016264 14557005770 0021442 0 ustar 00root root 0000000 0000000 /*
Package htmlquery provides extract data from HTML documents using XPath expression.
*/
package htmlquery
import (
"bufio"
"fmt"
"io"
"net/http"
"os"
"strings"
"github.com/antchfx/xpath"
"golang.org/x/net/html"
"golang.org/x/net/html/charset"
)
var _ xpath.NodeNavigator = &NodeNavigator{}
// CreateXPathNavigator creates a new xpath.NodeNavigator for the specified html.Node.
func CreateXPathNavigator(top *html.Node) *NodeNavigator {
return &NodeNavigator{curr: top, root: top, attr: -1}
}
// Find is like QueryAll but Will panics if the expression `expr` cannot be parsed.
//
// See `QueryAll()` function.
func Find(top *html.Node, expr string) []*html.Node {
nodes, err := QueryAll(top, expr)
if err != nil {
panic(err)
}
return nodes
}
// FindOne is like Query but will panics if the expression `expr` cannot be parsed.
// See `Query()` function.
func FindOne(top *html.Node, expr string) *html.Node {
node, err := Query(top, expr)
if err != nil {
panic(err)
}
return node
}
// QueryAll searches the html.Node that matches by the specified XPath expr.
// Return an error if the expression `expr` cannot be parsed.
func QueryAll(top *html.Node, expr string) ([]*html.Node, error) {
exp, err := getQuery(expr)
if err != nil {
return nil, err
}
nodes := QuerySelectorAll(top, exp)
return nodes, nil
}
// Query runs the given XPath expression against the given html.Node and
// returns the first matching html.Node, or nil if no matches are found.
//
// Returns an error if the expression `expr` cannot be parsed.
func Query(top *html.Node, expr string) (*html.Node, error) {
exp, err := getQuery(expr)
if err != nil {
return nil, err
}
return QuerySelector(top, exp), nil
}
// QuerySelector returns the first matched html.Node by the specified XPath selector.
func QuerySelector(top *html.Node, selector *xpath.Expr) *html.Node {
t := selector.Select(CreateXPathNavigator(top))
if t.MoveNext() {
return getCurrentNode(t.Current().(*NodeNavigator))
}
return nil
}
// QuerySelectorAll searches all of the html.Node that matches the specified XPath selectors.
func QuerySelectorAll(top *html.Node, selector *xpath.Expr) []*html.Node {
var elems []*html.Node
t := selector.Select(CreateXPathNavigator(top))
for t.MoveNext() {
nav := t.Current().(*NodeNavigator)
n := getCurrentNode(nav)
elems = append(elems, n)
}
return elems
}
// LoadURL loads the HTML document from the specified URL.
func LoadURL(url string) (*html.Node, error) {
resp, err := http.Get(url)
if err != nil {
return nil, err
}
defer resp.Body.Close()
r, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
if err != nil {
return nil, err
}
return html.Parse(r)
}
// LoadDoc loads the HTML document from the specified file path.
func LoadDoc(path string) (*html.Node, error) {
f, err := os.Open(path)
if err != nil {
return nil, err
}
defer f.Close()
return html.Parse(bufio.NewReader(f))
}
func getCurrentNode(n *NodeNavigator) *html.Node {
if n.NodeType() == xpath.AttributeNode {
childNode := &html.Node{
Type: html.TextNode,
Data: n.Value(),
}
return &html.Node{
Type: html.ElementNode,
Data: n.LocalName(),
FirstChild: childNode,
LastChild: childNode,
}
}
return n.curr
}
// Parse returns the parse tree for the HTML from the given Reader.
func Parse(r io.Reader) (*html.Node, error) {
return html.Parse(r)
}
// InnerText returns the text between the start and end tags of the object.
func InnerText(n *html.Node) string {
var output func(*strings.Builder, *html.Node)
output = func(b *strings.Builder, n *html.Node) {
switch n.Type {
case html.TextNode:
b.WriteString(n.Data)
return
case html.CommentNode:
return
}
for child := n.FirstChild; child != nil; child = child.NextSibling {
output(b, child)
}
}
var b strings.Builder
output(&b, n)
return b.String()
}
// SelectAttr returns the attribute value with the specified name.
func SelectAttr(n *html.Node, name string) (val string) {
if n == nil {
return
}
if n.Type == html.ElementNode && n.Parent == nil && name == n.Data {
return InnerText(n)
}
for _, attr := range n.Attr {
if attr.Key == name {
val = attr.Val
break
}
}
return
}
// ExistsAttr returns whether attribute with specified name exists.
func ExistsAttr(n *html.Node, name string) bool {
if n == nil {
return false
}
for _, attr := range n.Attr {
if attr.Key == name {
return true
}
}
return false
}
// OutputHTML returns the text including tags name.
func OutputHTML(n *html.Node, self bool) string {
var b strings.Builder
if self {
html.Render(&b, n)
} else {
for n := n.FirstChild; n != nil; n = n.NextSibling {
html.Render(&b, n)
}
}
return b.String()
}
type NodeNavigator struct {
root, curr *html.Node
attr int
}
func (h *NodeNavigator) Current() *html.Node {
return h.curr
}
func (h *NodeNavigator) NodeType() xpath.NodeType {
switch h.curr.Type {
case html.CommentNode:
return xpath.CommentNode
case html.TextNode:
return xpath.TextNode
case html.DocumentNode:
return xpath.RootNode
case html.ElementNode:
if h.attr != -1 {
return xpath.AttributeNode
}
return xpath.ElementNode
case html.DoctypeNode:
// ignored declare and as Root-Node type.
return xpath.RootNode
}
panic(fmt.Sprintf("unknown HTML node type: %v", h.curr.Type))
}
func (h *NodeNavigator) LocalName() string {
if h.attr != -1 {
return h.curr.Attr[h.attr].Key
}
return h.curr.Data
}
func (*NodeNavigator) Prefix() string {
return ""
}
func (h *NodeNavigator) Value() string {
switch h.curr.Type {
case html.CommentNode:
return h.curr.Data
case html.ElementNode:
if h.attr != -1 {
return h.curr.Attr[h.attr].Val
}
return InnerText(h.curr)
case html.TextNode:
return h.curr.Data
}
return ""
}
func (h *NodeNavigator) Copy() xpath.NodeNavigator {
n := *h
return &n
}
func (h *NodeNavigator) MoveToRoot() {
h.curr = h.root
}
func (h *NodeNavigator) MoveToParent() bool {
if h.attr != -1 {
h.attr = -1
return true
} else if node := h.curr.Parent; node != nil {
h.curr = node
return true
}
return false
}
func (h *NodeNavigator) MoveToNextAttribute() bool {
if h.attr >= len(h.curr.Attr)-1 {
return false
}
h.attr++
return true
}
func (h *NodeNavigator) MoveToChild() bool {
if h.attr != -1 {
return false
}
if node := h.curr.FirstChild; node != nil {
h.curr = node
return true
}
return false
}
func (h *NodeNavigator) MoveToFirst() bool {
if h.attr != -1 || h.curr.PrevSibling == nil {
return false
}
for {
node := h.curr.PrevSibling
if node == nil {
break
}
h.curr = node
}
return true
}
func (h *NodeNavigator) String() string {
return h.Value()
}
func (h *NodeNavigator) MoveToNext() bool {
if h.attr != -1 {
return false
}
if node := h.curr.NextSibling; node != nil {
h.curr = node
return true
}
return false
}
func (h *NodeNavigator) MoveToPrevious() bool {
if h.attr != -1 {
return false
}
if node := h.curr.PrevSibling; node != nil {
h.curr = node
return true
}
return false
}
func (h *NodeNavigator) MoveTo(other xpath.NodeNavigator) bool {
node, ok := other.(*NodeNavigator)
if !ok || node.root != h.root {
return false
}
h.curr = node.curr
h.attr = node.attr
return true
}
golang-github-antchfx-htmlquery-1.3.0/query_test.go 0000664 0000000 0000000 00000010360 14557005770 0022470 0 ustar 00root root 0000000 0000000 package htmlquery
import (
"fmt"
"io/ioutil"
"net/http"
"net/http/httptest"
"os"
"strings"
"sync"
"testing"
"github.com/antchfx/xpath"
"golang.org/x/net/html"
)
const htmlSample = `
Hello,World!
City Gallery
London
London is the capital city of England. It is the most populous city in the United Kingdom, with a metropolitan area of over 13 million inhabitants.
Standing on the River Thames, London has been a major settlement for two millennia, its history going back to its founding by the Romans, who named it Londinium.
`
var testDoc = loadHTML(htmlSample)
func BenchmarkSelectorCache(b *testing.B) {
DisableSelectorCache = false
for i := 0; i < b.N; i++ {
getQuery("/AAA/BBB/DDD/CCC/EEE/ancestor::*")
}
}
func BenchmarkDisableSelectorCache(b *testing.B) {
DisableSelectorCache = true
for i := 0; i < b.N; i++ {
getQuery("/AAA/BBB/DDD/CCC/EEE/ancestor::*")
}
}
func TestSelectorCache(t *testing.T) {
SelectorCacheMaxEntries = 2
for i := 1; i <= 3; i++ {
getQuery(fmt.Sprintf("//a[position()=%d]", i))
}
getQuery("//a[position()=3]")
}
func TestLoadURL(t *testing.T) {
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
fmt.Fprint(w, htmlSample)
}))
defer ts.Close()
_, err := LoadURL(ts.URL)
if err != nil {
t.Fatal(err)
}
}
func TestLoadDoc(t *testing.T) {
tempHTMLdoc, err := ioutil.TempFile("", "sample_*.html")
if err != nil {
t.Fatal(err)
}
tempHTMLFilename := tempHTMLdoc.Name()
defer func(tempHTMLdoc *os.File, filename string) {
tempHTMLdoc.Close()
os.Remove(filename)
}(tempHTMLdoc, tempHTMLFilename)
tempHTMLdoc.Write([]byte(htmlSample))
if _, err := LoadDoc(tempHTMLFilename); err != nil {
t.Fatal(err)
}
}
func TestNavigator(t *testing.T) {
top := FindOne(testDoc, "//html")
nav := &NodeNavigator{curr: top, root: top, attr: -1}
nav.MoveToChild() // HEAD
nav.MoveToNext()
if nav.NodeType() != xpath.TextNode {
t.Fatalf("expectd node type is TextNode,but got %vs", nav.NodeType())
}
nav.MoveToNext() //
if nav.Value() != InnerText(FindOne(testDoc, "//body")) {
t.Fatal("body not equal")
}
nav.MoveToPrevious() //
nav.MoveToParent() //
if nav.curr != top {
t.Fatal("current node is not html node")
}
nav.MoveToNextAttribute()
if nav.LocalName() != "lang" {
t.Fatal("node not move to lang attribute")
}
nav.MoveToParent()
nav.MoveToFirst() //
if nav.curr.Type != html.DoctypeNode {
t.Fatalf("expected node type is DoctypeNode,but got %d", nav.curr.Type)
}
}
func TestXPath(t *testing.T) {
node := FindOne(testDoc, "//html")
if SelectAttr(node, "lang") != "en-US" {
t.Fatal("//html[@lang] != en-Us")
}
node = FindOne(testDoc, "//header")
if strings.Index(InnerText(node), "Logo") > 0 {
t.Fatal("InnerText() have comment node text")
}
if !strings.Contains(OutputHTML(node, true), "Logo") {
t.Fatal("OutputHTML() shoud have comment node text")
}
link := FindOne(testDoc, "//a[1]/@href")
if link == nil {
t.Fatal("link is nil")
}
if v := InnerText(link); v != "/London" {
t.Fatalf("expect value is /London, but got %s", v)
}
}
func TestXPathCdUp(t *testing.T) {
doc := loadHTML(``)
node := FindOne(doc, "//b/@attr/..")
t.Logf("node = %#v", node)
if node == nil || node.Data != "b" {
t.Fatal("//b/@id/.. != ")
}
}
func TestConcurrentQuery(t *testing.T) {
var wg sync.WaitGroup
for i := 0; i < 10; i++ {
wg.Add(1)
go func(i int) {
defer wg.Done()
s := `
a
`
doc := loadHTML(s)
if n := FindOne(doc, `//div`); n == nil {
t.Fatalf("should find one but got nil [%d]", i)
}
}(i)
}
wg.Done()
}
func loadHTML(str string) *html.Node {
node, err := Parse(strings.NewReader(str))
if err != nil {
panic(err)
}
return node
}