replace html parser with golang.org/x/net/html

master
Ian Davis 2016-10-20 16:51:21 +01:00
parent 5e88f404e0
commit 2433e40d1e
3 changed files with 121 additions and 106 deletions

View File

@ -45,29 +45,30 @@ Extract microdata from a webpage and print the result as JSON
package main package main
import ( import (
"bytes" "bytes"
"github.com/iand/microdata" "io/ioutil"
"io/ioutil" "net/http"
"net/http" "net/url"
"net/url" "os"
"os"
"github.com/iand/microdata"
) )
func main() { func main() {
baseUrl, _ := url.Parse("http://tagger.steve.museum/steve/object/44863?offset=6") baseUrl, _ := url.Parse("http://www.designhive.com/blog/using-schemaorg-microdata")
resp, _ := http.Get(baseUrl.String()) resp, _ := http.Get(baseUrl.String())
defer resp.Body.Close() defer resp.Body.Close()
html, _ := ioutil.ReadAll(resp.Body) html, _ := ioutil.ReadAll(resp.Body)
p := microdata.NewParser(bytes.NewReader(html), baseUrl) p := microdata.NewParser(bytes.NewReader(html), baseUrl)
data, _ := p.Parse() data, _ := p.Parse()
json, _ := data.Json() json, _ := data.Json()
os.Stdout.Write(json) os.Stdout.Write(json)
} }

View File

@ -9,13 +9,13 @@ package microdata
import ( import (
"bytes" "bytes"
"code.google.com/p/go-html-transform/h5"
"code.google.com/p/go.net/html"
"code.google.com/p/go.net/html/atom"
"encoding/json" "encoding/json"
"io" "io"
"net/url" "net/url"
"strings" "strings"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
) )
type ValueList []interface{} type ValueList []interface{}
@ -37,18 +37,18 @@ func NewItem() *Item {
} }
// Add a string type item property value // Add a string type item property value
func (self *Item) AddString(property string, value string) { func (i *Item) AddString(property string, value string) {
self.Properties[property] = append(self.Properties[property], value) i.Properties[property] = append(i.Properties[property], value)
} }
// Add an Item type item property value // Add an Item type item property value
func (self *Item) AddItem(property string, value *Item) { func (i *Item) AddItem(property string, value *Item) {
self.Properties[property] = append(self.Properties[property], value) i.Properties[property] = append(i.Properties[property], value)
} }
// Add a type to the item // Add a type to the item
func (self *Item) AddType(value string) { func (i *Item) AddType(value string) {
self.Types = append(self.Types, value) i.Types = append(i.Types, value)
} }
// Represents a set of microdata items // Represents a set of microdata items
@ -64,13 +64,13 @@ func NewMicrodata() *Microdata {
} }
// Add an item to the microdata set // Add an item to the microdata set
func (self *Microdata) AddItem(value *Item) { func (m *Microdata) AddItem(value *Item) {
self.Items = append(self.Items, value) m.Items = append(m.Items, value)
} }
// Convert the microdata set to JSON // Convert the microdata set to JSON
func (self *Microdata) Json() ([]byte, error) { func (m *Microdata) Json() ([]byte, error) {
b, err := json.Marshal(self) b, err := json.Marshal(m)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -79,7 +79,7 @@ func (self *Microdata) Json() ([]byte, error) {
// An HTML parser that extracts microdata // An HTML parser that extracts microdata
type Parser struct { type Parser struct {
p *h5.Tree r io.Reader
data *Microdata data *Microdata
base *url.URL base *url.URL
identifiedNodes map[string]*html.Node identifiedNodes map[string]*html.Node
@ -89,37 +89,40 @@ type Parser struct {
// r is a reader over an HTML document // r is a reader over an HTML document
// base is the base URL for resolving relative URLs // base is the base URL for resolving relative URLs
func NewParser(r io.Reader, base *url.URL) *Parser { func NewParser(r io.Reader, base *url.URL) *Parser {
p, _ := h5.New(r)
return &Parser{ return &Parser{
p: p, r: r,
data: NewMicrodata(), data: NewMicrodata(),
base: base, base: base,
} }
} }
// Parse the document and return a Microdata set // Parse the document and return a Microdata set
func (self *Parser) Parse() (*Microdata, error) { func (p *Parser) Parse() (*Microdata, error) {
tree := self.p tree, err := html.Parse(p.r)
if err != nil {
return nil, err
}
topLevelItemNodes := make([]*html.Node, 0) topLevelItemNodes := make([]*html.Node, 0)
self.identifiedNodes = make(map[string]*html.Node, 0) p.identifiedNodes = make(map[string]*html.Node, 0)
tree.Walk(func(n *html.Node) { walk(tree, func(n *html.Node) {
if _, exists := getAttr("itemscope", n); exists { if n.Type == html.ElementNode {
if _, exists := getAttr("itemprop", n); !exists { if _, exists := getAttr("itemscope", n); exists {
topLevelItemNodes = append(topLevelItemNodes, n) if _, exists := getAttr("itemprop", n); !exists {
topLevelItemNodes = append(topLevelItemNodes, n)
}
} }
}
if id, exists := getAttr("id", n); exists { if id, exists := getAttr("id", n); exists {
self.identifiedNodes[id] = n p.identifiedNodes[id] = n
}
} }
}) })
for _, node := range topLevelItemNodes { for _, node := range topLevelItemNodes {
item := NewItem() item := NewItem()
self.data.Items = append(self.data.Items, item) p.data.Items = append(p.data.Items, item)
if itemtypes, exists := getAttr("itemtype", node); exists { if itemtypes, exists := getAttr("itemtype", node); exists {
for _, itemtype := range strings.Split(strings.TrimSpace(itemtypes), " ") { for _, itemtype := range strings.Split(strings.TrimSpace(itemtypes), " ") {
itemtype = strings.TrimSpace(itemtype) itemtype = strings.TrimSpace(itemtype)
@ -129,7 +132,7 @@ func (self *Parser) Parse() (*Microdata, error) {
} }
// itemid only valid when itemscope and itemtype are both present // itemid only valid when itemscope and itemtype are both present
if itemid, exists := getAttr("itemid", node); exists { if itemid, exists := getAttr("itemid", node); exists {
if parsedUrl, err := self.base.Parse(itemid); err == nil { if parsedUrl, err := p.base.Parse(itemid); err == nil {
item.ID = parsedUrl.String() item.ID = parsedUrl.String()
} }
} }
@ -140,22 +143,22 @@ func (self *Parser) Parse() (*Microdata, error) {
for _, itemref := range strings.Split(strings.TrimSpace(itemrefs), " ") { for _, itemref := range strings.Split(strings.TrimSpace(itemrefs), " ") {
itemref = strings.TrimSpace(itemref) itemref = strings.TrimSpace(itemref)
if refnode, exists := self.identifiedNodes[itemref]; exists { if refnode, exists := p.identifiedNodes[itemref]; exists {
self.readItem(item, refnode) p.readItem(item, refnode)
} }
} }
} }
for child := node.FirstChild; child != nil; { for child := node.FirstChild; child != nil; {
self.readItem(item, child) p.readItem(item, child)
child = child.NextSibling child = child.NextSibling
} }
} }
return self.data, nil return p.data, nil
} }
func (self *Parser) readItem(item *Item, node *html.Node) { func (p *Parser) readItem(item *Item, node *html.Node) {
if itemprop, exists := getAttr("itemprop", node); exists { if itemprop, exists := getAttr("itemprop", node); exists {
if _, exists := getAttr("itemscope", node); exists { if _, exists := getAttr("itemscope", node); exists {
subitem := NewItem() subitem := NewItem()
@ -164,14 +167,14 @@ func (self *Parser) readItem(item *Item, node *html.Node) {
for _, itemref := range strings.Split(strings.TrimSpace(itemrefs), " ") { for _, itemref := range strings.Split(strings.TrimSpace(itemrefs), " ") {
itemref = strings.TrimSpace(itemref) itemref = strings.TrimSpace(itemref)
if refnode, exists := self.identifiedNodes[itemref]; exists { if refnode, exists := p.identifiedNodes[itemref]; exists {
self.readItem(subitem, refnode) p.readItem(subitem, refnode)
} }
} }
} }
for child := node.FirstChild; child != nil; { for child := node.FirstChild; child != nil; {
self.readItem(subitem, child) p.readItem(subitem, child)
child = child.NextSibling child = child.NextSibling
} }
@ -184,66 +187,65 @@ func (self *Parser) readItem(item *Item, node *html.Node) {
return return
} else { }
var propertyValue string
switch node.DataAtom { var propertyValue string
case atom.Meta:
if val, exists := getAttr("content", node); exists {
propertyValue = val
}
case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video:
if urlValue, exists := getAttr("src", node); exists {
if parsedUrl, err := self.base.Parse(urlValue); err == nil {
propertyValue = parsedUrl.String()
}
} switch node.DataAtom {
case atom.A, atom.Area, atom.Link: case atom.Meta:
if urlValue, exists := getAttr("href", node); exists { if val, exists := getAttr("content", node); exists {
if parsedUrl, err := self.base.Parse(urlValue); err == nil { propertyValue = val
propertyValue = parsedUrl.String()
}
}
case atom.Object:
if urlValue, exists := getAttr("data", node); exists {
propertyValue = urlValue
}
case atom.Data, atom.Meter:
if urlValue, exists := getAttr("value", node); exists {
propertyValue = urlValue
}
case atom.Time:
if urlValue, exists := getAttr("datetime", node); exists {
propertyValue = urlValue
}
default:
var text bytes.Buffer
h5.WalkNodes(node, func(n *html.Node) {
if n.Type == html.TextNode {
text.WriteString(n.Data)
}
})
propertyValue = text.String()
} }
case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video:
if urlValue, exists := getAttr("src", node); exists {
if parsedUrl, err := p.base.Parse(urlValue); err == nil {
propertyValue = parsedUrl.String()
}
if len(propertyValue) > 0 { }
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") { case atom.A, atom.Area, atom.Link:
propertyName = strings.TrimSpace(propertyName) if urlValue, exists := getAttr("href", node); exists {
if propertyName != "" { if parsedUrl, err := p.base.Parse(urlValue); err == nil {
item.AddString(propertyName, propertyValue) propertyValue = parsedUrl.String()
}
} }
} }
case atom.Object:
if urlValue, exists := getAttr("data", node); exists {
propertyValue = urlValue
}
case atom.Data, atom.Meter:
if urlValue, exists := getAttr("value", node); exists {
propertyValue = urlValue
}
case atom.Time:
if urlValue, exists := getAttr("datetime", node); exists {
propertyValue = urlValue
}
default:
var text bytes.Buffer
walk(node, func(n *html.Node) {
if n.Type == html.TextNode {
text.WriteString(n.Data)
}
})
propertyValue = text.String()
}
if len(propertyValue) > 0 {
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
propertyName = strings.TrimSpace(propertyName)
if propertyName != "" {
item.AddString(propertyName, propertyValue)
}
}
} }
} }
for child := node.FirstChild; child != nil; { for child := node.FirstChild; child != nil; {
self.readItem(item, child) p.readItem(item, child)
child = child.NextSibling child = child.NextSibling
} }
@ -257,3 +259,15 @@ func getAttr(name string, node *html.Node) (string, bool) {
} }
return "", false return "", false
} }
// walk performs a depth-first, pre-order traversal starting at parent:
// fn is called on a node first, and then on each of that node's children
// in sibling order. A nil parent is ignored.
func walk(parent *html.Node, fn func(n *html.Node)) {
	if parent != nil {
		fn(parent)
		// The next sibling is read only after the recursive call returns.
		for c := parent.FirstChild; c != nil; c = c.NextSibling {
			walk(c, fn)
		}
	}
}

View File

@ -55,7 +55,7 @@ func TestParseActuallyParses(t *testing.T) {
item := ParseOneItem(html, t) item := ParseOneItem(html, t)
if item.Properties["name"][0].(string) != "Daniel" { if item.Properties["name"][0].(string) != "Daniel" {
t.Errorf("Property value not found") t.Errorf("got %v, wanted %s", item.Properties["name"][0], "Daniel")
} }
} }