/*
To the extent possible under law, Ian Davis has waived all copyright
and related or neighboring rights to this Source Code file.
This work is published from the United Kingdom.
*/

// Package microdata provides a parser for HTML microdata.
package microdata
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bytes"
|
|
|
|
"code.google.com/p/go-html-transform/h5"
|
2012-06-10 22:49:15 +04:00
|
|
|
"encoding/json"
|
2012-06-07 03:49:06 +04:00
|
|
|
"io"
|
2012-06-10 19:26:53 +04:00
|
|
|
"net/url"
|
2012-06-07 18:31:43 +04:00
|
|
|
"strings"
|
2012-06-07 03:49:06 +04:00
|
|
|
)
|
|
|
|
|
|
|
|
type (
	// ValueList is the ordered list of values recorded for one property.
	// Elements are stored as empty-interface values.
	ValueList []interface{}

	// PropertyMap associates a property name with its list of values.
	PropertyMap map[string]ValueList
)
|
|
|
|
|
2012-06-10 22:55:12 +04:00
|
|
|
// Represents a microdata item
|
2012-06-07 03:49:06 +04:00
|
|
|
type Item struct {
|
2012-06-10 22:49:15 +04:00
|
|
|
Properties PropertyMap `json:"properties"`
|
|
|
|
Types []string `json:"type,omitempty"`
|
|
|
|
ID string `json:"id,omitempty"`
|
2012-06-07 03:49:06 +04:00
|
|
|
}
|
|
|
|
|
2012-06-10 22:55:12 +04:00
|
|
|
// Create a new microdata item
|
2012-06-07 03:49:06 +04:00
|
|
|
func NewItem() *Item {
|
|
|
|
return &Item{
|
2012-06-10 21:57:35 +04:00
|
|
|
Properties: make(PropertyMap, 0),
|
|
|
|
Types: make([]string, 0),
|
2012-06-07 03:49:06 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-06-10 22:55:12 +04:00
|
|
|
// Add a string type item property value
|
|
|
|
func (self *Item) AddString(property string, value string) {
|
2012-06-10 21:57:35 +04:00
|
|
|
self.Properties[property] = append(self.Properties[property], value)
|
2012-06-07 03:49:06 +04:00
|
|
|
}
|
|
|
|
|
2012-06-10 22:55:12 +04:00
|
|
|
// Add an Item type item property value
|
|
|
|
func (self *Item) AddItem(property string, value *Item) {
|
2012-06-10 21:57:35 +04:00
|
|
|
self.Properties[property] = append(self.Properties[property], value)
|
2012-06-10 19:22:51 +04:00
|
|
|
}
|
|
|
|
|
2012-06-10 22:55:12 +04:00
|
|
|
// Add a type to the item
|
2012-06-10 22:49:15 +04:00
|
|
|
func (self *Item) AddType(value string) {
|
|
|
|
self.Types = append(self.Types, value)
|
|
|
|
}
|
2012-06-10 19:22:51 +04:00
|
|
|
|
2012-06-10 22:55:12 +04:00
|
|
|
// Represents a set of microdata items
|
2012-06-07 03:49:06 +04:00
|
|
|
type Microdata struct {
|
2012-06-10 22:49:15 +04:00
|
|
|
Items []*Item `json:"items"`
|
2012-06-07 03:49:06 +04:00
|
|
|
}
|
|
|
|
|
2012-06-10 22:55:12 +04:00
|
|
|
// Create a new microdata set
|
2012-06-07 03:49:06 +04:00
|
|
|
func NewMicrodata() *Microdata {
|
|
|
|
return &Microdata{
|
2012-06-10 21:57:35 +04:00
|
|
|
Items: make([]*Item, 0),
|
2012-06-07 03:49:06 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-06-10 22:55:12 +04:00
|
|
|
// Add an item to the microdata set
|
2012-06-10 22:49:15 +04:00
|
|
|
func (self *Microdata) AddItem(value *Item) {
|
|
|
|
self.Items = append(self.Items, value)
|
|
|
|
}
|
|
|
|
|
2012-06-10 22:55:12 +04:00
|
|
|
// Convert the microdata set to JSON
|
2012-06-10 22:49:15 +04:00
|
|
|
func (self *Microdata) Json() ([]byte, error) {
|
|
|
|
b, err := json.Marshal(self)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
return b, nil
|
|
|
|
}
|
|
|
|
|
2012-06-10 22:55:12 +04:00
|
|
|
// An HTML parser that extracts microdata
|
2012-06-07 03:49:06 +04:00
|
|
|
type Parser struct {
|
2012-06-10 17:59:30 +04:00
|
|
|
p *h5.Parser
|
|
|
|
data *Microdata
|
2012-06-10 22:49:15 +04:00
|
|
|
base *url.URL
|
2012-06-10 17:18:33 +04:00
|
|
|
identifiedNodes map[string]*h5.Node
|
2012-06-07 03:49:06 +04:00
|
|
|
}
|
|
|
|
|
2012-06-10 22:55:12 +04:00
|
|
|
// Create a new parser for extracting microdata
|
|
|
|
// r is a reader over an HTML document
|
|
|
|
// base is the base URL for resolving relative URLs
|
2012-06-10 21:57:35 +04:00
|
|
|
func NewParser(r io.Reader, base *url.URL) *Parser {
|
2012-06-10 17:59:30 +04:00
|
|
|
return &Parser{
|
|
|
|
p: h5.NewParser(r),
|
2012-06-07 03:49:06 +04:00
|
|
|
data: NewMicrodata(),
|
2012-06-10 21:57:35 +04:00
|
|
|
base: base,
|
2012-06-07 03:49:06 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-06-10 22:55:12 +04:00
|
|
|
// Parse the document and return a Microdata set
|
2012-06-07 03:49:06 +04:00
|
|
|
func (self *Parser) Parse() (*Microdata, error) {
|
|
|
|
err := self.p.Parse()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
tree := self.p.Tree()
|
|
|
|
|
2012-06-10 17:18:33 +04:00
|
|
|
topLevelItemNodes := make([]*h5.Node, 0)
|
|
|
|
self.identifiedNodes = make(map[string]*h5.Node, 0)
|
2012-06-07 03:49:06 +04:00
|
|
|
|
2012-06-10 17:59:30 +04:00
|
|
|
tree.Walk(func(n *h5.Node) {
|
2012-06-10 17:18:33 +04:00
|
|
|
if _, exists := getAttr("itemscope", n); exists {
|
|
|
|
if _, exists := getAttr("itemprop", n); !exists {
|
|
|
|
topLevelItemNodes = append(topLevelItemNodes, n)
|
|
|
|
}
|
|
|
|
}
|
2012-06-07 03:49:06 +04:00
|
|
|
|
2012-06-10 17:18:33 +04:00
|
|
|
if id, exists := getAttr("id", n); exists {
|
|
|
|
self.identifiedNodes[id] = n
|
|
|
|
}
|
2012-06-10 17:59:30 +04:00
|
|
|
})
|
2012-06-10 17:18:33 +04:00
|
|
|
|
|
|
|
for _, node := range topLevelItemNodes {
|
2012-06-07 03:49:06 +04:00
|
|
|
item := NewItem()
|
2012-06-10 21:57:35 +04:00
|
|
|
self.data.Items = append(self.data.Items, item)
|
2012-06-07 18:31:43 +04:00
|
|
|
if itemtypes, exists := getAttr("itemtype", node); exists {
|
|
|
|
for _, itemtype := range strings.Split(strings.TrimSpace(itemtypes), " ") {
|
|
|
|
itemtype = strings.TrimSpace(itemtype)
|
|
|
|
if itemtype != "" {
|
2012-06-10 21:57:35 +04:00
|
|
|
item.Types = append(item.Types, itemtype)
|
2012-06-07 18:31:43 +04:00
|
|
|
}
|
|
|
|
}
|
2012-06-07 18:41:50 +04:00
|
|
|
// itemid only valid when itemscope and itemtype are both present
|
2012-06-10 17:59:30 +04:00
|
|
|
if itemid, exists := getAttr("itemid", node); exists {
|
2012-06-10 21:57:35 +04:00
|
|
|
if parsedUrl, err := self.base.Parse(itemid); err == nil {
|
|
|
|
item.ID = parsedUrl.String()
|
|
|
|
}
|
2012-06-07 18:41:50 +04:00
|
|
|
}
|
2012-06-07 03:49:06 +04:00
|
|
|
|
2012-06-10 17:59:30 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
if itemrefs, exists := getAttr("itemref", node); exists {
|
|
|
|
for _, itemref := range strings.Split(strings.TrimSpace(itemrefs), " ") {
|
|
|
|
itemref = strings.TrimSpace(itemref)
|
|
|
|
|
|
|
|
if refnode, exists := self.identifiedNodes[itemref]; exists {
|
|
|
|
self.readItem(item, refnode)
|
|
|
|
}
|
2012-06-10 17:18:33 +04:00
|
|
|
}
|
|
|
|
}
|
2012-06-07 03:49:06 +04:00
|
|
|
|
|
|
|
if len(node.Children) > 0 {
|
2012-06-10 17:59:30 +04:00
|
|
|
for _, child := range node.Children {
|
|
|
|
self.readItem(item, child)
|
|
|
|
}
|
|
|
|
}
|
2012-06-07 03:49:06 +04:00
|
|
|
}
|
|
|
|
|
2012-06-10 17:18:33 +04:00
|
|
|
return self.data, nil
|
2012-06-07 03:49:06 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
func (self *Parser) readItem(item *Item, node *h5.Node) {
|
2012-06-07 18:31:43 +04:00
|
|
|
if itemprop, exists := getAttr("itemprop", node); exists {
|
2012-06-10 19:22:51 +04:00
|
|
|
if _, exists := getAttr("itemscope", node); exists {
|
|
|
|
subitem := NewItem()
|
2012-06-10 17:59:30 +04:00
|
|
|
|
2012-06-10 19:22:51 +04:00
|
|
|
if itemrefs, exists := getAttr("itemref", node); exists {
|
|
|
|
for _, itemref := range strings.Split(strings.TrimSpace(itemrefs), " ") {
|
|
|
|
itemref = strings.TrimSpace(itemref)
|
2012-06-07 03:49:06 +04:00
|
|
|
|
2012-06-10 19:22:51 +04:00
|
|
|
if refnode, exists := self.identifiedNodes[itemref]; exists {
|
|
|
|
self.readItem(subitem, refnode)
|
|
|
|
}
|
|
|
|
}
|
2012-06-10 17:59:30 +04:00
|
|
|
}
|
2012-06-10 19:22:51 +04:00
|
|
|
|
|
|
|
if len(node.Children) > 0 {
|
|
|
|
for _, child := range node.Children {
|
|
|
|
self.readItem(subitem, child)
|
|
|
|
}
|
2012-06-10 17:59:30 +04:00
|
|
|
}
|
2012-06-10 19:22:51 +04:00
|
|
|
|
|
|
|
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
|
|
|
|
propertyName = strings.TrimSpace(propertyName)
|
|
|
|
if propertyName != "" {
|
2012-06-10 22:55:12 +04:00
|
|
|
item.AddItem(propertyName, subitem)
|
2012-06-10 19:22:51 +04:00
|
|
|
}
|
2012-06-10 17:59:30 +04:00
|
|
|
}
|
2012-06-07 03:49:06 +04:00
|
|
|
|
2012-06-10 19:22:51 +04:00
|
|
|
return
|
|
|
|
|
|
|
|
} else {
|
|
|
|
var propertyValue string
|
|
|
|
|
|
|
|
switch node.Data() {
|
|
|
|
|
|
|
|
case "img", "audio", "source", "video", "embed", "iframe", "track":
|
|
|
|
if urlValue, exists := getAttr("src", node); exists {
|
2012-06-10 21:57:35 +04:00
|
|
|
if parsedUrl, err := self.base.Parse(urlValue); err == nil {
|
|
|
|
propertyValue = parsedUrl.String()
|
|
|
|
}
|
|
|
|
|
2012-06-10 19:22:51 +04:00
|
|
|
}
|
|
|
|
case "a", "area", "link":
|
|
|
|
if urlValue, exists := getAttr("href", node); exists {
|
2012-06-10 21:57:35 +04:00
|
|
|
if parsedUrl, err := self.base.Parse(urlValue); err == nil {
|
|
|
|
propertyValue = parsedUrl.String()
|
|
|
|
}
|
2012-06-10 19:22:51 +04:00
|
|
|
}
|
|
|
|
case "data":
|
|
|
|
if urlValue, exists := getAttr("value", node); exists {
|
|
|
|
propertyValue = urlValue
|
|
|
|
}
|
|
|
|
case "time":
|
|
|
|
if urlValue, exists := getAttr("datetime", node); exists {
|
|
|
|
propertyValue = urlValue
|
2012-06-10 17:59:30 +04:00
|
|
|
}
|
2012-06-07 03:49:06 +04:00
|
|
|
|
2012-06-10 19:22:51 +04:00
|
|
|
default:
|
|
|
|
var text bytes.Buffer
|
|
|
|
node.Walk(func(n *h5.Node) {
|
|
|
|
if n.Type == h5.TextNode {
|
|
|
|
text.WriteString(n.Data())
|
|
|
|
}
|
|
|
|
|
|
|
|
})
|
|
|
|
propertyValue = text.String()
|
|
|
|
}
|
2012-06-07 03:49:06 +04:00
|
|
|
|
2012-06-10 21:57:35 +04:00
|
|
|
if len(propertyValue) > 0 {
|
|
|
|
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
|
|
|
|
propertyName = strings.TrimSpace(propertyName)
|
|
|
|
if propertyName != "" {
|
2012-06-10 22:55:12 +04:00
|
|
|
item.AddString(propertyName, propertyValue)
|
2012-06-10 21:57:35 +04:00
|
|
|
}
|
2012-06-10 19:22:51 +04:00
|
|
|
}
|
2012-06-07 18:31:43 +04:00
|
|
|
}
|
2012-06-10 19:22:51 +04:00
|
|
|
|
2012-06-07 18:31:43 +04:00
|
|
|
}
|
2012-06-10 19:22:51 +04:00
|
|
|
|
2012-06-07 03:49:06 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
if len(node.Children) > 0 {
|
2012-06-10 17:59:30 +04:00
|
|
|
for _, child := range node.Children {
|
|
|
|
self.readItem(item, child)
|
|
|
|
}
|
|
|
|
}
|
2012-06-10 19:22:51 +04:00
|
|
|
|
2012-06-07 03:49:06 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
func getAttr(name string, node *h5.Node) (string, bool) {
|
|
|
|
for _, a := range node.Attr {
|
|
|
|
if a.Name == name {
|
|
|
|
return a.Value, true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return "", false
|
|
|
|
}
|