diff --git a/README.md b/README.md index 1037ec4..26be83c 100644 --- a/README.md +++ b/README.md @@ -38,37 +38,38 @@ Example of parsing a string containing HTML: } println("Name: ", data.Items[0].Properties["name"][0].(string)) - } + } Extract microdata from a webpage and print the result as JSON package main import ( - "bytes" - "github.com/iand/microdata" - "io/ioutil" - "net/http" - "net/url" - "os" + "bytes" + "io/ioutil" + "net/http" + "net/url" + "os" + + "github.com/iand/microdata" ) func main() { - baseUrl, _ := url.Parse("http://tagger.steve.museum/steve/object/44863?offset=6") + baseUrl, _ := url.Parse("http://www.designhive.com/blog/using-schemaorg-microdata") - resp, _ := http.Get(baseUrl.String()) - defer resp.Body.Close() + resp, _ := http.Get(baseUrl.String()) + defer resp.Body.Close() - html, _ := ioutil.ReadAll(resp.Body) + html, _ := ioutil.ReadAll(resp.Body) - p := microdata.NewParser(bytes.NewReader(html), baseUrl) + p := microdata.NewParser(bytes.NewReader(html), baseUrl) - data, _ := p.Parse() + data, _ := p.Parse() - json, _ := data.Json() - os.Stdout.Write(json) - } + json, _ := data.Json() + os.Stdout.Write(json) + } ## Authors @@ -83,11 +84,11 @@ Extract microdata from a webpage and print the result as JSON * Do submit your changes as a pull request * Do your best to adhere to the existing coding conventions and idioms. -* Do run `go fmt` on the code before committing +* Do run `go fmt` on the code before committing * Do feel free to add yourself to the [`CREDITS`](CREDITS) file and the - corresponding Contributors list in the the [`README.md`](README.md). + corresponding Contributors list in the the [`README.md`](README.md). Alphabetical order applies. -* Don't touch the [`AUTHORS`](AUTHORS) file. An existing author will add you if +* Don't touch the [`AUTHORS`](AUTHORS) file. An existing author will add you if your contributions are significant enough. * Do note that in order for any non-trivial changes to be merged (as a rule of thumb, additions larger than about 15 lines of code), an explicit diff --git a/microdata.go b/microdata.go index 2f8f573..1dfef0b 100644 --- a/microdata.go +++ b/microdata.go @@ -9,13 +9,13 @@ package microdata import ( "bytes" - "code.google.com/p/go-html-transform/h5" - "code.google.com/p/go.net/html" - "code.google.com/p/go.net/html/atom" "encoding/json" "io" "net/url" "strings" + + "golang.org/x/net/html" + "golang.org/x/net/html/atom" ) type ValueList []interface{} @@ -37,18 +37,18 @@ func NewItem() *Item { } // Add a string type item property value -func (self *Item) AddString(property string, value string) { - self.Properties[property] = append(self.Properties[property], value) +func (i *Item) AddString(property string, value string) { + i.Properties[property] = append(i.Properties[property], value) } // Add an Item type item property value -func (self *Item) AddItem(property string, value *Item) { - self.Properties[property] = append(self.Properties[property], value) +func (i *Item) AddItem(property string, value *Item) { + i.Properties[property] = append(i.Properties[property], value) } // Add a type to the item -func (self *Item) AddType(value string) { - self.Types = append(self.Types, value) +func (i *Item) AddType(value string) { + i.Types = append(i.Types, value) } // Represents a set of microdata items @@ -64,13 +64,13 @@ func NewMicrodata() *Microdata { } // Add an item to the microdata set -func (self *Microdata) AddItem(value *Item) { - self.Items = append(self.Items, value) +func (m *Microdata) AddItem(value *Item) { + m.Items = append(m.Items, value) } // Convert the microdata set to JSON -func (self *Microdata) Json() ([]byte, error) { - b, err := json.Marshal(self) +func (m *Microdata) Json() ([]byte, error) { + b, err := json.Marshal(m) if err != nil { return nil, err } @@ -79,7 +79,7 @@ func (self *Microdata) Json() ([]byte, error) { // An HTML parser that extracts microdata type Parser struct { - p *h5.Tree + r io.Reader data *Microdata base *url.URL identifiedNodes map[string]*html.Node @@ -89,37 +89,40 @@ type Parser struct { // r is a reader over an HTML document // base is the base URL for resolving relative URLs func NewParser(r io.Reader, base *url.URL) *Parser { - p, _ := h5.New(r) - return &Parser{ - p: p, + r: r, data: NewMicrodata(), base: base, } } // Parse the document and return a Microdata set -func (self *Parser) Parse() (*Microdata, error) { - tree := self.p +func (p *Parser) Parse() (*Microdata, error) { + tree, err := html.Parse(p.r) + if err != nil { + return nil, err + } topLevelItemNodes := make([]*html.Node, 0) - self.identifiedNodes = make(map[string]*html.Node, 0) + p.identifiedNodes = make(map[string]*html.Node, 0) - tree.Walk(func(n *html.Node) { - if _, exists := getAttr("itemscope", n); exists { - if _, exists := getAttr("itemprop", n); !exists { - topLevelItemNodes = append(topLevelItemNodes, n) + walk(tree, func(n *html.Node) { + if n.Type == html.ElementNode { + if _, exists := getAttr("itemscope", n); exists { + if _, exists := getAttr("itemprop", n); !exists { + topLevelItemNodes = append(topLevelItemNodes, n) + } } - } - if id, exists := getAttr("id", n); exists { - self.identifiedNodes[id] = n + if id, exists := getAttr("id", n); exists { + p.identifiedNodes[id] = n + } } }) for _, node := range topLevelItemNodes { item := NewItem() - self.data.Items = append(self.data.Items, item) + p.data.Items = append(p.data.Items, item) if itemtypes, exists := getAttr("itemtype", node); exists { for _, itemtype := range strings.Split(strings.TrimSpace(itemtypes), " ") { itemtype = strings.TrimSpace(itemtype) @@ -129,7 +132,7 @@ func (self *Parser) Parse() (*Microdata, error) { } // itemid only valid when itemscope and itemtype are both present if itemid, exists := getAttr("itemid", node); exists { - if parsedUrl, err := self.base.Parse(itemid); err == nil { + if parsedUrl, err := p.base.Parse(itemid); err == nil { item.ID = parsedUrl.String() } } @@ -140,22 +143,22 @@ func (self *Parser) Parse() (*Microdata, error) { for _, itemref := range strings.Split(strings.TrimSpace(itemrefs), " ") { itemref = strings.TrimSpace(itemref) - if refnode, exists := self.identifiedNodes[itemref]; exists { - self.readItem(item, refnode) + if refnode, exists := p.identifiedNodes[itemref]; exists { + p.readItem(item, refnode) } } } for child := node.FirstChild; child != nil; { - self.readItem(item, child) + p.readItem(item, child) child = child.NextSibling } } - return self.data, nil + return p.data, nil } -func (self *Parser) readItem(item *Item, node *html.Node) { +func (p *Parser) readItem(item *Item, node *html.Node) { if itemprop, exists := getAttr("itemprop", node); exists { if _, exists := getAttr("itemscope", node); exists { subitem := NewItem() @@ -164,14 +167,14 @@ func (self *Parser) readItem(item *Item, node *html.Node) { for _, itemref := range strings.Split(strings.TrimSpace(itemrefs), " ") { itemref = strings.TrimSpace(itemref) - if refnode, exists := self.identifiedNodes[itemref]; exists { - self.readItem(subitem, refnode) + if refnode, exists := p.identifiedNodes[itemref]; exists { + p.readItem(subitem, refnode) } } } for child := node.FirstChild; child != nil; { - self.readItem(subitem, child) + p.readItem(subitem, child) child = child.NextSibling } @@ -184,66 +187,65 @@ func (self *Parser) readItem(item *Item, node *html.Node) { return - } else { - var propertyValue string + } - switch node.DataAtom { - case atom.Meta: - if val, exists := getAttr("content", node); exists { - propertyValue = val - } - case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video: - if urlValue, exists := getAttr("src", node); exists { - if parsedUrl, err := self.base.Parse(urlValue); err == nil { - propertyValue = parsedUrl.String() - } + var propertyValue string - } - case atom.A, atom.Area, atom.Link: - if urlValue, exists := getAttr("href", node); exists { - if parsedUrl, err := self.base.Parse(urlValue); err == nil { - propertyValue = parsedUrl.String() - } - } - case atom.Object: - if urlValue, exists := getAttr("data", node); exists { - propertyValue = urlValue - } - case atom.Data, atom.Meter: - if urlValue, exists := getAttr("value", node); exists { - propertyValue = urlValue - } - case atom.Time: - if urlValue, exists := getAttr("datetime", node); exists { - propertyValue = urlValue - } - - default: - var text bytes.Buffer - h5.WalkNodes(node, func(n *html.Node) { - if n.Type == html.TextNode { - text.WriteString(n.Data) - } - - }) - propertyValue = text.String() + switch node.DataAtom { + case atom.Meta: + if val, exists := getAttr("content", node); exists { + propertyValue = val } + case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video: + if urlValue, exists := getAttr("src", node); exists { + if parsedUrl, err := p.base.Parse(urlValue); err == nil { + propertyValue = parsedUrl.String() + } - if len(propertyValue) > 0 { - for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") { - propertyName = strings.TrimSpace(propertyName) - if propertyName != "" { - item.AddString(propertyName, propertyValue) - } + } + case atom.A, atom.Area, atom.Link: + if urlValue, exists := getAttr("href", node); exists { + if parsedUrl, err := p.base.Parse(urlValue); err == nil { + propertyValue = parsedUrl.String() } } + case atom.Object: + if urlValue, exists := getAttr("data", node); exists { + propertyValue = urlValue + } + case atom.Data, atom.Meter: + if urlValue, exists := getAttr("value", node); exists { + propertyValue = urlValue + } + case atom.Time: + if urlValue, exists := getAttr("datetime", node); exists { + propertyValue = urlValue + } + default: + var text bytes.Buffer + walk(node, func(n *html.Node) { + if n.Type == html.TextNode { + text.WriteString(n.Data) + } + + }) + propertyValue = text.String() + } + + if len(propertyValue) > 0 { + for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") { + propertyName = strings.TrimSpace(propertyName) + if propertyName != "" { + item.AddString(propertyName, propertyValue) + } + } } } for child := node.FirstChild; child != nil; { - self.readItem(item, child) + p.readItem(item, child) child = child.NextSibling } @@ -257,3 +259,15 @@ func getAttr(name string, node *html.Node) (string, bool) { } return "", false } + +func walk(parent *html.Node, fn func(n *html.Node)) { + if parent == nil { + return + } + fn(parent) + + for child := parent.FirstChild; child != nil; { + walk(child, fn) + child = child.NextSibling + } +} diff --git a/microdata_test.go b/microdata_test.go index f24572f..b87fb76 100644 --- a/microdata_test.go +++ b/microdata_test.go @@ -55,7 +55,7 @@ func TestParseActuallyParses(t *testing.T) { item := ParseOneItem(html, t) if item.Properties["name"][0].(string) != "Daniel" { - t.Errorf("Property value not found") + t.Errorf("got %v, wanted %s", item.Properties["name"][0], "Daniel") } } @@ -113,7 +113,7 @@ func TestParseAreaHref(t *testing.T) { html := `
- +
` item := ParseOneItem(html, t)