pax_global_header 0000666 0000000 0000000 00000000064 14740514154 0014517 g ustar 00root root 0000000 0000000 52 comment=18985ac3e95a7f4790ab2389ff2aed36868d2a44
golang-github-antchfx-htmlquery-1.3.4/ 0000775 0000000 0000000 00000000000 14740514154 0017734 5 ustar 00root root 0000000 0000000 golang-github-antchfx-htmlquery-1.3.4/.github/ 0000775 0000000 0000000 00000000000 14740514154 0021274 5 ustar 00root root 0000000 0000000 golang-github-antchfx-htmlquery-1.3.4/.github/workflows/ 0000775 0000000 0000000 00000000000 14740514154 0023331 5 ustar 00root root 0000000 0000000 golang-github-antchfx-htmlquery-1.3.4/.github/workflows/testing.yml 0000664 0000000 0000000 00000000756 14740514154 0025541 0 ustar 00root root 0000000 0000000 name: Testing
on: [push, pull_request]
jobs:
test:
strategy:
matrix:
go-version: ["1.20", 1.21.x, 1.22.x]
os: [ubuntu-latest, macos-latest, windows-latest]
runs-on: ${{ matrix.os }}
steps:
- name: Setup Go
uses: actions/setup-go@v5
with:
go-version: ${{ matrix.go-version }}
- name: Checkout code
uses: actions/checkout@v4
- name: Test
run: |
go version
go test . -v -cover
golang-github-antchfx-htmlquery-1.3.4/.gitignore 0000664 0000000 0000000 00000000462 14740514154 0021726 0 ustar 00root root 0000000 0000000 # vscode
.vscode
debug
*.test
./build
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so
# Folders
_obj
_test
# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out
*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*
_testmain.go
*.exe
*.test
*.prof golang-github-antchfx-htmlquery-1.3.4/LICENSE 0000664 0000000 0000000 00000001776 14740514154 0020754 0 ustar 00root root 0000000 0000000 Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. golang-github-antchfx-htmlquery-1.3.4/README.md 0000664 0000000 0000000 00000010341 14740514154 0021212 0 ustar 00root root 0000000 0000000 # htmlquery
[Build Status](https://github.com/antchfx/htmlquery/actions/workflows/testing.yml)
[GoDoc](https://godoc.org/github.com/antchfx/htmlquery)
[Go Report Card](https://goreportcard.com/report/github.com/antchfx/htmlquery)
# Overview
`htmlquery` is an XPath query package for HTML that lets you extract data from, or evaluate expressions against, HTML documents using XPath expressions.
`htmlquery` has a built-in query object cache based on [LRU](https://godoc.org/github.com/golang/groupcache/lru) that caches recently used XPath query strings. Enabling query caching avoids recompiling the XPath expression on every query.
You can visit this page to learn about the supported XPath(1.0/2.0) syntax. https://github.com/antchfx/xpath
# XPath query packages for Go
| Name | Description |
| ------------------------------------------------- | ----------------------------------------- |
| [htmlquery](https://github.com/antchfx/htmlquery) | XPath query package for the HTML document |
| [xmlquery](https://github.com/antchfx/xmlquery) | XPath query package for the XML document |
| [jsonquery](https://github.com/antchfx/jsonquery) | XPath query package for the JSON document |
# Installation
```
go get github.com/antchfx/htmlquery
```
# Getting Started
#### Query, returns matched elements or error.
```go
nodes, err := htmlquery.QueryAll(doc, "//a")
if err != nil {
panic(`not a valid XPath expression.`)
}
```
#### Load HTML document from URL.
```go
doc, err := htmlquery.LoadURL("http://example.com/")
```
#### Load HTML document from file.
```go
filePath := "/home/user/sample.html"
doc, err := htmlquery.LoadDoc(filePath)
```
#### Load HTML document from string.
```go
s := `....`
doc, err := htmlquery.Parse(strings.NewReader(s))
```
#### Find all A elements.
```go
list := htmlquery.Find(doc, "//a")
```
#### Find all A elements that have `href` attribute.
```go
list := htmlquery.Find(doc, "//a[@href]")
```
#### Find all A elements with `href` attribute and only return `href` value.
```go
list := htmlquery.Find(doc, "//a/@href")
for _ , n := range list{
fmt.Println(htmlquery.InnerText(n)) // output @href value
}
```
### Find the third A element.
```go
a := htmlquery.FindOne(doc, "//a[3]")
```
### Find the child element (img) under an A element and print its `src` attribute.
```go
a := htmlquery.FindOne(doc, "//a")
img := htmlquery.FindOne(a, "//img")
fmt.Println(htmlquery.SelectAttr(img, "src")) // output @src value
```
#### Evaluate the number of all IMG element.
```go
expr, _ := xpath.Compile("count(//img)")
v := expr.Evaluate(htmlquery.CreateXPathNavigator(doc)).(float64)
fmt.Printf("total count is %f", v)
```
# Quick Starts
```go
func main() {
doc, err := htmlquery.LoadURL("https://www.bing.com/search?q=golang")
if err != nil {
panic(err)
}
// Find all news item.
list, err := htmlquery.QueryAll(doc, "//ol/li")
if err != nil {
panic(err)
}
for i, n := range list {
a := htmlquery.FindOne(n, "//a")
if a != nil {
fmt.Printf("%d %s(%s)\n", i, htmlquery.InnerText(a), htmlquery.SelectAttr(a, "href"))
}
}
}
```
# FAQ
#### `Find()` vs `QueryAll()`, which is better?
`Find` and `QueryAll` both do the same thing: they search for all matching HTML nodes.
`Find` panics if you give it an invalid XPath query, whereas `QueryAll` returns an error instead.
#### Can I save my query expression object for the next query?
Yes, you can. We offer the `QuerySelector` and `QuerySelectorAll` functions, which accept your query expression object.
Caching (or reusing) a query expression object avoids recompiling the XPath query expression and improves query performance.
#### XPath query object cache performance
```
goos: windows
goarch: amd64
pkg: github.com/antchfx/htmlquery
BenchmarkSelectorCache-4 20000000 55.2 ns/op
BenchmarkDisableSelectorCache-4 500000 3162 ns/op
```
#### How to disable caching?
```
htmlquery.DisableSelectorCache = true
```
# Questions
Please let me know if you have any questions.
golang-github-antchfx-htmlquery-1.3.4/cache.go 0000664 0000000 0000000 00000001622 14740514154 0021327 0 ustar 00root root 0000000 0000000 package htmlquery
import (
"sync"
"github.com/antchfx/xpath"
"github.com/golang/groupcache/lru"
)
// DisableSelectorCache will disable caching for the query selector if value is true.
var DisableSelectorCache = false
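// SelectorCacheMaxEntries is the maximum number of compiled selector objects kept in the cache. Default is 50.
// Caching is disabled if SelectorCacheMaxEntries <= 0. The cache is created once, on the first cached
// query, so its capacity is taken from the value of SelectorCacheMaxEntries at that moment.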
var SelectorCacheMaxEntries = 50
// Package-level state backing the selector cache used by getQuery.
var (
// cacheOnce ensures cache is constructed only once.
cacheOnce sync.Once
// cache is the LRU cache keyed by XPath expression string; getQuery stores
// the compiled *xpath.Expr as the value.
cache *lru.Cache
// cacheMutex is held by getQuery around every lookup in and insertion into cache.
cacheMutex sync.Mutex
)
// getQuery returns the compiled form of the XPath expression expr.
//
// When caching is disabled (DisableSelectorCache is true or
// SelectorCacheMaxEntries <= 0) the expression is compiled on every call.
// Otherwise the compiled expression is looked up in, and on a miss added to,
// the package-level LRU cache. Compilation errors are returned unchanged and
// nothing is cached for them.
func getQuery(expr string) (*xpath.Expr, error) {
	cachingOff := DisableSelectorCache || SelectorCacheMaxEntries <= 0
	if cachingOff {
		return xpath.Compile(expr)
	}

	// The cache is built lazily, exactly once, on the first cached lookup.
	cacheOnce.Do(func() { cache = lru.New(SelectorCacheMaxEntries) })

	cacheMutex.Lock()
	defer cacheMutex.Unlock()

	cached, hit := cache.Get(expr)
	if hit {
		return cached.(*xpath.Expr), nil
	}

	compiled, err := xpath.Compile(expr)
	if err != nil {
		return nil, err
	}
	cache.Add(expr, compiled)
	return compiled, nil
}
golang-github-antchfx-htmlquery-1.3.4/go.mod 0000664 0000000 0000000 00000000266 14740514154 0021046 0 ustar 00root root 0000000 0000000 module github.com/antchfx/htmlquery
go 1.14
require (
github.com/antchfx/xpath v1.3.3
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da
golang.org/x/net v0.33.0
)
golang-github-antchfx-htmlquery-1.3.4/go.sum 0000664 0000000 0000000 00000014325 14740514154 0021074 0 ustar 00root root 0000000 0000000 github.com/antchfx/xpath v1.3.3 h1:tmuPQa1Uye0Ym1Zn65vxPgfltWb/Lxu2jeqIGteJSRs=
github.com/antchfx/xpath v1.3.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE=
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I=
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo=
golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang-github-antchfx-htmlquery-1.3.4/query.go 0000664 0000000 0000000 00000017540 14740514154 0021437 0 ustar 00root root 0000000 0000000 /*
Package htmlquery provides functions for extracting data from HTML documents using XPath expressions.
*/
package htmlquery
import (
"bufio"
"compress/gzip"
"compress/zlib"
"fmt"
"io"
"net/http"
"os"
"strings"
"github.com/antchfx/xpath"
"golang.org/x/net/html"
"golang.org/x/net/html/charset"
)
var _ xpath.NodeNavigator = &NodeNavigator{}
// CreateXPathNavigator returns a new NodeNavigator positioned on top. The
// same node is also recorded as the navigator's root.
func CreateXPathNavigator(top *html.Node) *NodeNavigator {
	nav := new(NodeNavigator)
	nav.root = top
	nav.curr = top
	nav.attr = -1 // -1: not positioned on an attribute
	return nav
}
// Find is like QueryAll but panics if the expression `expr` cannot be parsed.
//
// See the `QueryAll()` function.
func Find(top *html.Node, expr string) []*html.Node {
	matched, err := QueryAll(top, expr)
	if err == nil {
		return matched
	}
	panic(err)
}
// FindOne is like Query but panics if the expression `expr` cannot be parsed.
//
// See the `Query()` function.
func FindOne(top *html.Node, expr string) *html.Node {
	first, err := Query(top, expr)
	if err == nil {
		return first
	}
	panic(err)
}
// QueryAll returns every html.Node under top that matches the XPath
// expression expr. It returns an error if expr cannot be parsed.
func QueryAll(top *html.Node, expr string) ([]*html.Node, error) {
	selector, err := getQuery(expr)
	if err != nil {
		return nil, err
	}
	return QuerySelectorAll(top, selector), nil
}
// Query evaluates the XPath expression expr against top and returns the
// first matching html.Node, or nil when nothing matches.
//
// An error is returned if expr cannot be parsed.
func Query(top *html.Node, expr string) (*html.Node, error) {
	selector, err := getQuery(expr)
	if err == nil {
		return QuerySelector(top, selector), nil
	}
	return nil, err
}
// QuerySelector returns the first html.Node matched by the compiled XPath
// selector, or nil if the selector matches nothing.
func QuerySelector(top *html.Node, selector *xpath.Expr) *html.Node {
	iter := selector.Select(CreateXPathNavigator(top))
	if !iter.MoveNext() {
		return nil
	}
	nav := iter.Current().(*NodeNavigator)
	return getCurrentNode(nav)
}
// QuerySelectorAll returns all html.Node values matched by the compiled
// XPath selector. The result is a nil slice when nothing matches.
func QuerySelectorAll(top *html.Node, selector *xpath.Expr) []*html.Node {
	var matched []*html.Node
	for iter := selector.Select(CreateXPathNavigator(top)); iter.MoveNext(); {
		matched = append(matched, getCurrentNode(iter.Current().(*NodeNavigator)))
	}
	return matched
}
// LoadURL loads the HTML document from the specified URL. The request
// advertises gzip support, and responses whose Content-Encoding is "gzip" or
// "deflate" are decompressed before parsing.
//
// An error is returned if the request cannot be created or sent, if the
// response uses any other Content-Encoding, or if the decompressor or the
// charset reader cannot be set up. The HTTP status code is not inspected.
func LoadURL(url string) (*html.Node, error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	// Enable gzip compression.
	req.Header.Add("Accept-Encoding", "gzip")
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}
	// Always release the response body. The gzip and zlib readers do not close
	// the reader they wrap, and the early-return paths below never touch it.
	defer resp.Body.Close()

	var reader io.Reader = resp.Body
	switch encoding := resp.Header.Get("Content-Encoding"); encoding {
	case "gzip":
		// Keep the concrete *gzip.Reader in its own variable: on failure it is
		// nil, and a nil pointer stored in an interface would still compare
		// non-nil and be closed by mistake.
		gz, err := gzip.NewReader(resp.Body)
		if err != nil {
			return nil, err
		}
		defer gz.Close()
		reader = gz
	case "deflate":
		zr, err := zlib.NewReader(resp.Body)
		if err != nil {
			return nil, err
		}
		defer zr.Close()
		reader = zr
	case "":
		// No content encoding: read the body as-is.
	default:
		return nil, fmt.Errorf("%s compression is not support", encoding)
	}

	r, err := charset.NewReader(reader, resp.Header.Get("Content-Type"))
	if err != nil {
		return nil, err
	}
	return html.Parse(r)
}
// LoadDoc opens the file at path and parses its contents as an HTML
// document. The file is closed before LoadDoc returns.
func LoadDoc(path string) (*html.Node, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	buffered := bufio.NewReader(file)
	return html.Parse(buffered)
}
// getCurrentNode converts the navigator's position into an *html.Node.
//
// For an attribute position a detached element node is synthesised: its Data
// is the attribute name and its only child is a text node holding the
// attribute value. For every other position the current node is returned.
func getCurrentNode(n *NodeNavigator) *html.Node {
	if n.NodeType() != xpath.AttributeNode {
		return n.curr
	}
	text := &html.Node{Type: html.TextNode, Data: n.Value()}
	attr := &html.Node{Type: html.ElementNode, Data: n.LocalName()}
	attr.FirstChild = text
	attr.LastChild = text
	return attr
}
// Parse reads HTML from r and returns the root of the resulting parse tree.
// It is a thin wrapper around html.Parse.
func Parse(r io.Reader) (*html.Node, error) {
	doc, err := html.Parse(r)
	return doc, err
}
// InnerText returns the text between the start and end tags of the object:
// the Data of n if it is a text node, otherwise the concatenated text of all
// its descendants in document order. Comment nodes contribute nothing.
// A nil node yields the empty string.
func InnerText(n *html.Node) string {
	// Accept nil so the result of FindOne/Query can be passed straight in,
	// matching the nil handling of SelectAttr and ExistsAttr.
	if n == nil {
		return ""
	}
	var collect func(*strings.Builder, *html.Node)
	collect = func(b *strings.Builder, node *html.Node) {
		switch node.Type {
		case html.TextNode:
			b.WriteString(node.Data)
			return
		case html.CommentNode:
			return
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			collect(b, child)
		}
	}
	var b strings.Builder
	collect(&b, n)
	return b.String()
}
// SelectAttr returns the value of the attribute called name on n, or the
// empty string if n is nil or has no such attribute.
func SelectAttr(n *html.Node, name string) (val string) {
	if n == nil {
		return ""
	}
	// A parentless element whose tag equals name has the shape of the node
	// getCurrentNode builds for an attribute result; its value is its text.
	attrResult := n.Type == html.ElementNode && n.Parent == nil && n.Data == name
	if attrResult {
		return InnerText(n)
	}
	for i := range n.Attr {
		if n.Attr[i].Key == name {
			return n.Attr[i].Val
		}
	}
	return ""
}
// ExistsAttr reports whether n carries an attribute with the given name.
// It returns false for a nil node.
func ExistsAttr(n *html.Node, name string) bool {
	if n != nil {
		for i := range n.Attr {
			if n.Attr[i].Key == name {
				return true
			}
		}
	}
	return false
}
// OutputHTML renders n as HTML text, tags included. When self is true the
// node itself is rendered; otherwise only its children are rendered, one
// after another.
func OutputHTML(n *html.Node, self bool) string {
	var buf strings.Builder
	if self {
		html.Render(&buf, n)
		return buf.String()
	}
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		html.Render(&buf, child)
	}
	return buf.String()
}
// NodeNavigator walks an html.Node tree and is used as the
// xpath.NodeNavigator implementation for HTML documents.
type NodeNavigator struct {
	// root is the node the navigator was created on.
	root *html.Node
	// curr is the node the navigator is currently positioned on.
	curr *html.Node
	// attr is the index into curr.Attr of the current attribute, or -1 when
	// the navigator is positioned on the node itself.
	attr int
}
// Current returns the html.Node the navigator is positioned on. The attribute
// index is not taken into account, so while the navigator is on an attribute
// this is the node whose Attr slice is being indexed.
func (h *NodeNavigator) Current() *html.Node {
return h.curr
}
// NodeType maps the current position to an xpath.NodeType. An element is
// reported as an attribute node while the navigator is on one of its
// attributes. Document and doctype nodes are both reported as root nodes.
// It panics for any html node type not listed here.
func (h *NodeNavigator) NodeType() xpath.NodeType {
	switch kind := h.curr.Type; kind {
	case html.ElementNode:
		if h.attr == -1 {
			return xpath.ElementNode
		}
		return xpath.AttributeNode
	case html.TextNode:
		return xpath.TextNode
	case html.CommentNode:
		return xpath.CommentNode
	case html.DocumentNode, html.DoctypeNode:
		// The doctype declaration is ignored and treated as a root node.
		return xpath.RootNode
	default:
		panic(fmt.Sprintf("unknown HTML node type: %v", kind))
	}
}
// LocalName returns the key of the current attribute when the navigator is
// on an attribute, and the Data field of the current node otherwise.
func (h *NodeNavigator) LocalName() string {
	if h.attr == -1 {
		return h.curr.Data
	}
	return h.curr.Attr[h.attr].Key
}
// Prefix always returns the empty string.
func (*NodeNavigator) Prefix() string {
return ""
}
// Value returns the string value of the current position: the Data of a
// comment or text node, the attribute value when on an attribute, or the
// inner text of an element. Any other node type yields the empty string.
func (h *NodeNavigator) Value() string {
	node := h.curr
	switch node.Type {
	case html.CommentNode, html.TextNode:
		return node.Data
	case html.ElementNode:
		if h.attr == -1 {
			return InnerText(node)
		}
		return node.Attr[h.attr].Val
	default:
		return ""
	}
}
// Copy returns a new navigator with the same root, current node and
// attribute index as h. Moving the copy does not move h.
func (h *NodeNavigator) Copy() xpath.NodeNavigator {
	clone := new(NodeNavigator)
	*clone = *h
	return clone
}
// MoveToRoot repositions the navigator on its root node.
//
// The attribute index is cleared as well. Otherwise a navigator that was on
// an attribute would stay in "attribute" state on the root node, where
// MoveToChild and MoveToNext refuse to move and LocalName/Value index
// root.Attr with a stale, possibly out-of-range, position.
func (h *NodeNavigator) MoveToRoot() {
	h.curr = h.root
	h.attr = -1
}
// MoveToParent moves one level up and reports whether it moved. From an
// attribute, the "parent" is the node that owns it, so only the attribute
// index is cleared. From a node, the navigator moves to the node's Parent
// if it has one.
func (h *NodeNavigator) MoveToParent() bool {
	if h.attr != -1 {
		h.attr = -1
		return true
	}
	parent := h.curr.Parent
	if parent == nil {
		return false
	}
	h.curr = parent
	return true
}
// MoveToNextAttribute advances to the next attribute of the current node and
// reports whether there was one. Starting from the node itself (index -1),
// the first call selects the first attribute.
func (h *NodeNavigator) MoveToNextAttribute() bool {
	last := len(h.curr.Attr) - 1
	if h.attr < last {
		h.attr++
		return true
	}
	return false
}
// MoveToChild moves to the first child of the current node and reports
// whether it moved. It does nothing when the navigator is on an attribute or
// the node has no children.
func (h *NodeNavigator) MoveToChild() bool {
	if h.attr != -1 || h.curr.FirstChild == nil {
		return false
	}
	h.curr = h.curr.FirstChild
	return true
}
// MoveToFirst moves to the first sibling of the current node and reports
// whether it moved. It returns false when the navigator is on an attribute
// or is already on the first sibling.
func (h *NodeNavigator) MoveToFirst() bool {
	if h.attr != -1 {
		return false
	}
	if h.curr.PrevSibling == nil {
		return false
	}
	// Walk backwards until there is no earlier sibling left.
	for prev := h.curr.PrevSibling; prev != nil; prev = h.curr.PrevSibling {
		h.curr = prev
	}
	return true
}
// String returns the same text as Value.
func (h *NodeNavigator) String() string {
return h.Value()
}
// MoveToNext moves to the next sibling of the current node and reports
// whether it moved. It returns false when on an attribute or when there is
// no next sibling.
func (h *NodeNavigator) MoveToNext() bool {
	if h.attr != -1 || h.curr.NextSibling == nil {
		return false
	}
	h.curr = h.curr.NextSibling
	return true
}
// MoveToPrevious moves to the previous sibling of the current node and
// reports whether it moved. It returns false when on an attribute or when
// there is no previous sibling.
func (h *NodeNavigator) MoveToPrevious() bool {
	if h.attr != -1 {
		return false
	}
	prev := h.curr.PrevSibling
	if prev == nil {
		return false
	}
	h.curr = prev
	return true
}
// MoveTo repositions h onto the same node and attribute as other and reports
// whether it did. It refuses, returning false, when other is not a
// *NodeNavigator or was created on a different root.
func (h *NodeNavigator) MoveTo(other xpath.NodeNavigator) bool {
	target, isHTMLNav := other.(*NodeNavigator)
	if isHTMLNav && target.root == h.root {
		h.curr, h.attr = target.curr, target.attr
		return true
	}
	return false
}
golang-github-antchfx-htmlquery-1.3.4/query_test.go 0000664 0000000 0000000 00000011134 14740514154 0022467 0 ustar 00root root 0000000 0000000 package htmlquery
import (
"compress/gzip"
"fmt"
"io/ioutil"
"net/http"
"net/http/httptest"
"os"
"strings"
"sync"
"testing"
"github.com/antchfx/xpath"
"golang.org/x/net/html"
)
const htmlSample = `
Hello,World!
City Gallery
London
London is the capital city of England. It is the most populous city in the United Kingdom, with a metropolitan area of over 13 million inhabitants.
Standing on the River Thames, London has been a major settlement for two millennia, its history going back to its founding by the Romans, who named it Londinium.
`
var testDoc = loadHTML(htmlSample)
// BenchmarkSelectorCache measures getQuery with the selector cache enabled.
func BenchmarkSelectorCache(b *testing.B) {
	const query = "/AAA/BBB/DDD/CCC/EEE/ancestor::*"
	DisableSelectorCache = false
	for n := 0; n < b.N; n++ {
		getQuery(query)
	}
}
// BenchmarkDisableSelectorCache measures getQuery with the selector cache
// switched off, so every iteration goes through xpath.Compile.
func BenchmarkDisableSelectorCache(b *testing.B) {
	const query = "/AAA/BBB/DDD/CCC/EEE/ancestor::*"
	DisableSelectorCache = true
	for n := 0; n < b.N; n++ {
		getQuery(query)
	}
}
// TestSelectorCache requests three distinct expressions and then repeats the
// last one, with SelectorCacheMaxEntries set to 2. It makes no assertions and
// only checks that the calls complete.
func TestSelectorCache(t *testing.T) {
	SelectorCacheMaxEntries = 2
	for _, pos := range []int{1, 2, 3, 3} {
		getQuery(fmt.Sprintf("//a[position()=%d]", pos))
	}
}
// TestLoadURL serves htmlSample from a local test server and checks that
// LoadURL fetches and parses it without error.
func TestLoadURL(t *testing.T) {
	handler := func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprint(w, htmlSample)
	}
	srv := httptest.NewServer(http.HandlerFunc(handler))
	defer srv.Close()

	if _, err := LoadURL(srv.URL); err != nil {
		t.Fatal(err)
	}
}
// TestLoadURLWithGzipResponse serves htmlSample gzip-compressed, with a
// matching Content-Encoding header, and checks that LoadURL handles it
// without error.
func TestLoadURLWithGzipResponse(t *testing.T) {
	gzipHandler := func(w http.ResponseWriter, r *http.Request) {
		w.Header().Add("Content-Encoding", "gzip")
		zw := gzip.NewWriter(w)
		defer zw.Close()
		fmt.Fprint(zw, htmlSample)
	}
	srv := httptest.NewServer(http.HandlerFunc(gzipHandler))
	defer srv.Close()

	if _, err := LoadURL(srv.URL); err != nil {
		t.Fatal(err)
	}
}
// TestLoadDoc writes htmlSample to a temporary file and checks that LoadDoc
// can open and parse it. The temporary file is closed and removed when the
// test finishes.
func TestLoadDoc(t *testing.T) {
	tempHTMLdoc, err := ioutil.TempFile("", "sample_*.html")
	if err != nil {
		t.Fatal(err)
	}
	tempHTMLFilename := tempHTMLdoc.Name()
	defer func(tempHTMLdoc *os.File, filename string) {
		tempHTMLdoc.Close()
		os.Remove(filename)
	}(tempHTMLdoc, tempHTMLFilename)
	// Fail here if the fixture could not be written, rather than letting
	// LoadDoc parse an empty or truncated file.
	if _, err := tempHTMLdoc.Write([]byte(htmlSample)); err != nil {
		t.Fatal(err)
	}
	if _, err := LoadDoc(tempHTMLFilename); err != nil {
		t.Fatal(err)
	}
}
// TestNavigator drives a NodeNavigator by hand, starting on the element
// matched by //html, and checks the position after each group of moves. The
// steps depend on one another, so their order must not change.
func TestNavigator(t *testing.T) {
top := FindOne(testDoc, "//html")
nav := &NodeNavigator{curr: top, root: top, attr: -1}
nav.MoveToChild() // first child of the html element (HEAD)
nav.MoveToNext() // next sibling; the check below requires it to be a text node
if nav.NodeType() != xpath.TextNode {
t.Fatalf("expectd node type is TextNode,but got %vs", nav.NodeType())
}
nav.MoveToNext() // next sibling; its value must equal the inner text of //body
if nav.Value() != InnerText(FindOne(testDoc, "//body")) {
t.Fatal("body not equal")
}
nav.MoveToPrevious() // back to the previous sibling
nav.MoveToParent() // up one level; must be the html element again
if nav.curr != top {
t.Fatal("current node is not html node")
}
// From the element itself, the first attribute is selected.
nav.MoveToNextAttribute()
if nav.LocalName() != "lang" {
t.Fatal("node not move to lang attribute")
}
// Leaving an attribute only clears the attribute index; curr is unchanged.
nav.MoveToParent()
nav.MoveToFirst() // first sibling at this level; expected to be the doctype node
if nav.curr.Type != html.DoctypeNode {
t.Fatalf("expected node type is DoctypeNode,but got %d", nav.curr.Type)
}
}
// TestXPath checks attribute lookup, comment handling in InnerText versus
// OutputHTML, and that an attribute query result exposes the attribute value
// through InnerText.
func TestXPath(t *testing.T) {
	node := FindOne(testDoc, "//html")
	if SelectAttr(node, "lang") != "en-US" {
		t.Fatal("//html[@lang] != en-Us")
	}
	node = FindOne(testDoc, "//header")
	// strings.Contains also catches a match at offset 0, which the earlier
	// `strings.Index(...) > 0` comparison let through.
	if strings.Contains(InnerText(node), "Logo") {
		t.Fatal("InnerText() have comment node text")
	}
	if !strings.Contains(OutputHTML(node, true), "Logo") {
		t.Fatal("OutputHTML() shoud have comment node text")
	}
	link := FindOne(testDoc, "//a[1]/@href")
	if link == nil {
		t.Fatal("link is nil")
	}
	if v := InnerText(link); v != "/London" {
		t.Fatalf("expect value is /London, but got %s", v)
	}
}
// TestXPathCdUp checks that stepping from an attribute to its parent ("..")
// yields the element that owns the attribute.
//
// NOTE(review): the fixture passed to loadHTML is an empty string here,
// although the query needs a <b> element with an `attr` attribute. The markup
// looks like it was lost somewhere; confirm against the upstream test. The
// failure message also mentions @id while the query uses @attr.
func TestXPathCdUp(t *testing.T) {
doc := loadHTML(``)
node := FindOne(doc, "//b/@attr/..")
t.Logf("node = %#v", node)
if node == nil || node.Data != "b" {
t.Fatal("//b/@id/.. != ")
}
}
// TestConcurrentQuery runs the same query from ten goroutines at once to
// exercise the shared selector cache under concurrency.
func TestConcurrentQuery(t *testing.T) {
	var wg sync.WaitGroup
	for i := 0; i < 10; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			s := `
a
`
			doc := loadHTML(s)
			if n := FindOne(doc, `//div`); n == nil {
				// Errorf, not Fatalf: FailNow may only be called from the
				// goroutine running the test function.
				t.Errorf("should find one but got nil [%d]", i)
			}
		}(i)
	}
	wg.Wait()
}
// loadHTML parses str as an HTML document for use in tests and panics if
// parsing fails.
func loadHTML(str string) *html.Node {
	doc, err := Parse(strings.NewReader(str))
	if err == nil {
		return doc
	}
	panic(err)
}