replace html parser with golang.org/x/net/html
parent
5e88f404e0
commit
2433e40d1e
39
README.md
39
README.md
|
@ -38,37 +38,38 @@ Example of parsing a string containing HTML:
|
||||||
}
|
}
|
||||||
|
|
||||||
println("Name: ", data.Items[0].Properties["name"][0].(string))
|
println("Name: ", data.Items[0].Properties["name"][0].(string))
|
||||||
}
|
}
|
||||||
|
|
||||||
Extract microdata from a webpage and print the result as JSON
|
Extract microdata from a webpage and print the result as JSON
|
||||||
|
|
||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"github.com/iand/microdata"
|
"io/ioutil"
|
||||||
"io/ioutil"
|
"net/http"
|
||||||
"net/http"
|
"net/url"
|
||||||
"net/url"
|
"os"
|
||||||
"os"
|
|
||||||
|
"github.com/iand/microdata"
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
|
||||||
baseUrl, _ := url.Parse("http://tagger.steve.museum/steve/object/44863?offset=6")
|
baseUrl, _ := url.Parse("http://www.designhive.com/blog/using-schemaorg-microdata")
|
||||||
|
|
||||||
resp, _ := http.Get(baseUrl.String())
|
resp, _ := http.Get(baseUrl.String())
|
||||||
defer resp.Body.Close()
|
defer resp.Body.Close()
|
||||||
|
|
||||||
html, _ := ioutil.ReadAll(resp.Body)
|
html, _ := ioutil.ReadAll(resp.Body)
|
||||||
|
|
||||||
p := microdata.NewParser(bytes.NewReader(html), baseUrl)
|
p := microdata.NewParser(bytes.NewReader(html), baseUrl)
|
||||||
|
|
||||||
data, _ := p.Parse()
|
data, _ := p.Parse()
|
||||||
|
|
||||||
json, _ := data.Json()
|
json, _ := data.Json()
|
||||||
os.Stdout.Write(json)
|
os.Stdout.Write(json)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
## Authors
|
## Authors
|
||||||
|
@ -83,11 +84,11 @@ Extract microdata from a webpage and print the result as JSON
|
||||||
|
|
||||||
* Do submit your changes as a pull request
|
* Do submit your changes as a pull request
|
||||||
* Do your best to adhere to the existing coding conventions and idioms.
|
* Do your best to adhere to the existing coding conventions and idioms.
|
||||||
* Do run `go fmt` on the code before committing
|
* Do run `go fmt` on the code before committing
|
||||||
* Do feel free to add yourself to the [`CREDITS`](CREDITS) file and the
|
* Do feel free to add yourself to the [`CREDITS`](CREDITS) file and the
|
||||||
corresponding Contributors list in the [`README.md`](README.md).
|
corresponding Contributors list in the [`README.md`](README.md).
|
||||||
Alphabetical order applies.
|
Alphabetical order applies.
|
||||||
* Don't touch the [`AUTHORS`](AUTHORS) file. An existing author will add you if
|
* Don't touch the [`AUTHORS`](AUTHORS) file. An existing author will add you if
|
||||||
your contributions are significant enough.
|
your contributions are significant enough.
|
||||||
* Do note that in order for any non-trivial changes to be merged (as a rule
|
* Do note that in order for any non-trivial changes to be merged (as a rule
|
||||||
of thumb, additions larger than about 15 lines of code), an explicit
|
of thumb, additions larger than about 15 lines of code), an explicit
|
||||||
|
|
184
microdata.go
184
microdata.go
|
@ -9,13 +9,13 @@ package microdata
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"code.google.com/p/go-html-transform/h5"
|
|
||||||
"code.google.com/p/go.net/html"
|
|
||||||
"code.google.com/p/go.net/html/atom"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"io"
|
"io"
|
||||||
"net/url"
|
"net/url"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"golang.org/x/net/html"
|
||||||
|
"golang.org/x/net/html/atom"
|
||||||
)
|
)
|
||||||
|
|
||||||
type ValueList []interface{}
|
type ValueList []interface{}
|
||||||
|
@ -37,18 +37,18 @@ func NewItem() *Item {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add a string type item property value
|
// Add a string type item property value
|
||||||
func (self *Item) AddString(property string, value string) {
|
func (i *Item) AddString(property string, value string) {
|
||||||
self.Properties[property] = append(self.Properties[property], value)
|
i.Properties[property] = append(i.Properties[property], value)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add an Item type item property value
|
// Add an Item type item property value
|
||||||
func (self *Item) AddItem(property string, value *Item) {
|
func (i *Item) AddItem(property string, value *Item) {
|
||||||
self.Properties[property] = append(self.Properties[property], value)
|
i.Properties[property] = append(i.Properties[property], value)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add a type to the item
|
// Add a type to the item
|
||||||
func (self *Item) AddType(value string) {
|
func (i *Item) AddType(value string) {
|
||||||
self.Types = append(self.Types, value)
|
i.Types = append(i.Types, value)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Represents a set of microdata items
|
// Represents a set of microdata items
|
||||||
|
@ -64,13 +64,13 @@ func NewMicrodata() *Microdata {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add an item to the microdata set
|
// Add an item to the microdata set
|
||||||
func (self *Microdata) AddItem(value *Item) {
|
func (m *Microdata) AddItem(value *Item) {
|
||||||
self.Items = append(self.Items, value)
|
m.Items = append(m.Items, value)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Convert the microdata set to JSON
|
// Convert the microdata set to JSON
|
||||||
func (self *Microdata) Json() ([]byte, error) {
|
func (m *Microdata) Json() ([]byte, error) {
|
||||||
b, err := json.Marshal(self)
|
b, err := json.Marshal(m)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
@ -79,7 +79,7 @@ func (self *Microdata) Json() ([]byte, error) {
|
||||||
|
|
||||||
// An HTML parser that extracts microdata
|
// An HTML parser that extracts microdata
|
||||||
type Parser struct {
|
type Parser struct {
|
||||||
p *h5.Tree
|
r io.Reader
|
||||||
data *Microdata
|
data *Microdata
|
||||||
base *url.URL
|
base *url.URL
|
||||||
identifiedNodes map[string]*html.Node
|
identifiedNodes map[string]*html.Node
|
||||||
|
@ -89,37 +89,40 @@ type Parser struct {
|
||||||
// r is a reader over an HTML document
|
// r is a reader over an HTML document
|
||||||
// base is the base URL for resolving relative URLs
|
// base is the base URL for resolving relative URLs
|
||||||
func NewParser(r io.Reader, base *url.URL) *Parser {
|
func NewParser(r io.Reader, base *url.URL) *Parser {
|
||||||
p, _ := h5.New(r)
|
|
||||||
|
|
||||||
return &Parser{
|
return &Parser{
|
||||||
p: p,
|
r: r,
|
||||||
data: NewMicrodata(),
|
data: NewMicrodata(),
|
||||||
base: base,
|
base: base,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parse the document and return a Microdata set
|
// Parse the document and return a Microdata set
|
||||||
func (self *Parser) Parse() (*Microdata, error) {
|
func (p *Parser) Parse() (*Microdata, error) {
|
||||||
tree := self.p
|
tree, err := html.Parse(p.r)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
topLevelItemNodes := make([]*html.Node, 0)
|
topLevelItemNodes := make([]*html.Node, 0)
|
||||||
self.identifiedNodes = make(map[string]*html.Node, 0)
|
p.identifiedNodes = make(map[string]*html.Node, 0)
|
||||||
|
|
||||||
tree.Walk(func(n *html.Node) {
|
walk(tree, func(n *html.Node) {
|
||||||
if _, exists := getAttr("itemscope", n); exists {
|
if n.Type == html.ElementNode {
|
||||||
if _, exists := getAttr("itemprop", n); !exists {
|
if _, exists := getAttr("itemscope", n); exists {
|
||||||
topLevelItemNodes = append(topLevelItemNodes, n)
|
if _, exists := getAttr("itemprop", n); !exists {
|
||||||
|
topLevelItemNodes = append(topLevelItemNodes, n)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if id, exists := getAttr("id", n); exists {
|
if id, exists := getAttr("id", n); exists {
|
||||||
self.identifiedNodes[id] = n
|
p.identifiedNodes[id] = n
|
||||||
|
}
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
for _, node := range topLevelItemNodes {
|
for _, node := range topLevelItemNodes {
|
||||||
item := NewItem()
|
item := NewItem()
|
||||||
self.data.Items = append(self.data.Items, item)
|
p.data.Items = append(p.data.Items, item)
|
||||||
if itemtypes, exists := getAttr("itemtype", node); exists {
|
if itemtypes, exists := getAttr("itemtype", node); exists {
|
||||||
for _, itemtype := range strings.Split(strings.TrimSpace(itemtypes), " ") {
|
for _, itemtype := range strings.Split(strings.TrimSpace(itemtypes), " ") {
|
||||||
itemtype = strings.TrimSpace(itemtype)
|
itemtype = strings.TrimSpace(itemtype)
|
||||||
|
@ -129,7 +132,7 @@ func (self *Parser) Parse() (*Microdata, error) {
|
||||||
}
|
}
|
||||||
// itemid only valid when itemscope and itemtype are both present
|
// itemid only valid when itemscope and itemtype are both present
|
||||||
if itemid, exists := getAttr("itemid", node); exists {
|
if itemid, exists := getAttr("itemid", node); exists {
|
||||||
if parsedUrl, err := self.base.Parse(itemid); err == nil {
|
if parsedUrl, err := p.base.Parse(itemid); err == nil {
|
||||||
item.ID = parsedUrl.String()
|
item.ID = parsedUrl.String()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -140,22 +143,22 @@ func (self *Parser) Parse() (*Microdata, error) {
|
||||||
for _, itemref := range strings.Split(strings.TrimSpace(itemrefs), " ") {
|
for _, itemref := range strings.Split(strings.TrimSpace(itemrefs), " ") {
|
||||||
itemref = strings.TrimSpace(itemref)
|
itemref = strings.TrimSpace(itemref)
|
||||||
|
|
||||||
if refnode, exists := self.identifiedNodes[itemref]; exists {
|
if refnode, exists := p.identifiedNodes[itemref]; exists {
|
||||||
self.readItem(item, refnode)
|
p.readItem(item, refnode)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for child := node.FirstChild; child != nil; {
|
for child := node.FirstChild; child != nil; {
|
||||||
self.readItem(item, child)
|
p.readItem(item, child)
|
||||||
child = child.NextSibling
|
child = child.NextSibling
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return self.data, nil
|
return p.data, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (self *Parser) readItem(item *Item, node *html.Node) {
|
func (p *Parser) readItem(item *Item, node *html.Node) {
|
||||||
if itemprop, exists := getAttr("itemprop", node); exists {
|
if itemprop, exists := getAttr("itemprop", node); exists {
|
||||||
if _, exists := getAttr("itemscope", node); exists {
|
if _, exists := getAttr("itemscope", node); exists {
|
||||||
subitem := NewItem()
|
subitem := NewItem()
|
||||||
|
@ -164,14 +167,14 @@ func (self *Parser) readItem(item *Item, node *html.Node) {
|
||||||
for _, itemref := range strings.Split(strings.TrimSpace(itemrefs), " ") {
|
for _, itemref := range strings.Split(strings.TrimSpace(itemrefs), " ") {
|
||||||
itemref = strings.TrimSpace(itemref)
|
itemref = strings.TrimSpace(itemref)
|
||||||
|
|
||||||
if refnode, exists := self.identifiedNodes[itemref]; exists {
|
if refnode, exists := p.identifiedNodes[itemref]; exists {
|
||||||
self.readItem(subitem, refnode)
|
p.readItem(subitem, refnode)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for child := node.FirstChild; child != nil; {
|
for child := node.FirstChild; child != nil; {
|
||||||
self.readItem(subitem, child)
|
p.readItem(subitem, child)
|
||||||
child = child.NextSibling
|
child = child.NextSibling
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -184,66 +187,65 @@ func (self *Parser) readItem(item *Item, node *html.Node) {
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
} else {
|
}
|
||||||
var propertyValue string
|
|
||||||
|
|
||||||
switch node.DataAtom {
|
var propertyValue string
|
||||||
case atom.Meta:
|
|
||||||
if val, exists := getAttr("content", node); exists {
|
|
||||||
propertyValue = val
|
|
||||||
}
|
|
||||||
case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video:
|
|
||||||
if urlValue, exists := getAttr("src", node); exists {
|
|
||||||
if parsedUrl, err := self.base.Parse(urlValue); err == nil {
|
|
||||||
propertyValue = parsedUrl.String()
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
switch node.DataAtom {
|
||||||
case atom.A, atom.Area, atom.Link:
|
case atom.Meta:
|
||||||
if urlValue, exists := getAttr("href", node); exists {
|
if val, exists := getAttr("content", node); exists {
|
||||||
if parsedUrl, err := self.base.Parse(urlValue); err == nil {
|
propertyValue = val
|
||||||
propertyValue = parsedUrl.String()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
case atom.Object:
|
|
||||||
if urlValue, exists := getAttr("data", node); exists {
|
|
||||||
propertyValue = urlValue
|
|
||||||
}
|
|
||||||
case atom.Data, atom.Meter:
|
|
||||||
if urlValue, exists := getAttr("value", node); exists {
|
|
||||||
propertyValue = urlValue
|
|
||||||
}
|
|
||||||
case atom.Time:
|
|
||||||
if urlValue, exists := getAttr("datetime", node); exists {
|
|
||||||
propertyValue = urlValue
|
|
||||||
}
|
|
||||||
|
|
||||||
default:
|
|
||||||
var text bytes.Buffer
|
|
||||||
h5.WalkNodes(node, func(n *html.Node) {
|
|
||||||
if n.Type == html.TextNode {
|
|
||||||
text.WriteString(n.Data)
|
|
||||||
}
|
|
||||||
|
|
||||||
})
|
|
||||||
propertyValue = text.String()
|
|
||||||
}
|
}
|
||||||
|
case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video:
|
||||||
|
if urlValue, exists := getAttr("src", node); exists {
|
||||||
|
if parsedUrl, err := p.base.Parse(urlValue); err == nil {
|
||||||
|
propertyValue = parsedUrl.String()
|
||||||
|
}
|
||||||
|
|
||||||
if len(propertyValue) > 0 {
|
}
|
||||||
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
|
case atom.A, atom.Area, atom.Link:
|
||||||
propertyName = strings.TrimSpace(propertyName)
|
if urlValue, exists := getAttr("href", node); exists {
|
||||||
if propertyName != "" {
|
if parsedUrl, err := p.base.Parse(urlValue); err == nil {
|
||||||
item.AddString(propertyName, propertyValue)
|
propertyValue = parsedUrl.String()
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
case atom.Object:
|
||||||
|
if urlValue, exists := getAttr("data", node); exists {
|
||||||
|
propertyValue = urlValue
|
||||||
|
}
|
||||||
|
case atom.Data, atom.Meter:
|
||||||
|
if urlValue, exists := getAttr("value", node); exists {
|
||||||
|
propertyValue = urlValue
|
||||||
|
}
|
||||||
|
case atom.Time:
|
||||||
|
if urlValue, exists := getAttr("datetime", node); exists {
|
||||||
|
propertyValue = urlValue
|
||||||
|
}
|
||||||
|
|
||||||
|
default:
|
||||||
|
var text bytes.Buffer
|
||||||
|
walk(node, func(n *html.Node) {
|
||||||
|
if n.Type == html.TextNode {
|
||||||
|
text.WriteString(n.Data)
|
||||||
|
}
|
||||||
|
|
||||||
|
})
|
||||||
|
propertyValue = text.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(propertyValue) > 0 {
|
||||||
|
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
|
||||||
|
propertyName = strings.TrimSpace(propertyName)
|
||||||
|
if propertyName != "" {
|
||||||
|
item.AddString(propertyName, propertyValue)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for child := node.FirstChild; child != nil; {
|
for child := node.FirstChild; child != nil; {
|
||||||
self.readItem(item, child)
|
p.readItem(item, child)
|
||||||
child = child.NextSibling
|
child = child.NextSibling
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -257,3 +259,15 @@ func getAttr(name string, node *html.Node) (string, bool) {
|
||||||
}
|
}
|
||||||
return "", false
|
return "", false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func walk(parent *html.Node, fn func(n *html.Node)) {
|
||||||
|
if parent == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
fn(parent)
|
||||||
|
|
||||||
|
for child := parent.FirstChild; child != nil; {
|
||||||
|
walk(child, fn)
|
||||||
|
child = child.NextSibling
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -55,7 +55,7 @@ func TestParseActuallyParses(t *testing.T) {
|
||||||
item := ParseOneItem(html, t)
|
item := ParseOneItem(html, t)
|
||||||
|
|
||||||
if item.Properties["name"][0].(string) != "Daniel" {
|
if item.Properties["name"][0].(string) != "Daniel" {
|
||||||
t.Errorf("Property value not found")
|
t.Errorf("got %v, wanted %s", item.Properties["name"][0], "Daniel")
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -113,7 +113,7 @@ func TestParseAreaHref(t *testing.T) {
|
||||||
html := `
|
html := `
|
||||||
<div itemscope><map name="shapes">
|
<div itemscope><map name="shapes">
|
||||||
<area itemprop="foo" href="http://example.com/foo" shape=rect coords="50,50,100,100">
|
<area itemprop="foo" href="http://example.com/foo" shape=rect coords="50,50,100,100">
|
||||||
|
|
||||||
</map></div>`
|
</map></div>`
|
||||||
|
|
||||||
item := ParseOneItem(html, t)
|
item := ParseOneItem(html, t)
|
||||||
|
|
Loading…
Reference in New Issue