microdata_mirror/microdata.go

209 lines
4.5 KiB
Go
Raw Normal View History

2012-06-07 03:49:06 +04:00
package microdata
import (
"bytes"
"code.google.com/p/go-html-transform/h5"
"io"
2012-06-07 18:31:43 +04:00
"strings"
2012-06-07 03:49:06 +04:00
)
// ValueList is the ordered list of values recorded for one property name.
// Each element is either a string or a *Item, as stored by Item.SetString
// and Item.SetItem.
type ValueList []interface{}

// PropertyMap maps a property name to every value recorded under it.
type PropertyMap map[string]ValueList

// Item is a single microdata item: its properties plus the item types and
// the item id recorded for it.
type Item struct {
	properties PropertyMap
	types      []string
	id         string
}

// NewItem returns an Item with an empty property map and an empty type list.
func NewItem() *Item {
	return &Item{
		properties: make(PropertyMap),
		types:      make([]string, 0),
	}
}

// SetString appends the string value to the values stored under property.
// Despite the name it never replaces earlier values.
func (self *Item) SetString(property string, value string) {
	self.addValue(property, value)
}

// SetItem appends the nested item value to the values stored under property.
// Despite the name it never replaces earlier values.
func (self *Item) SetItem(property string, value *Item) {
	self.addValue(property, value)
}

// addValue appends value under property, allocating the property map on
// first use so that a zero-value Item (one not built by NewItem) does not
// panic on assignment to a nil map.
func (self *Item) addValue(property string, value interface{}) {
	if self.properties == nil {
		self.properties = make(PropertyMap)
	}
	self.properties[property] = append(self.properties[property], value)
}
2012-06-07 03:49:06 +04:00
// Microdata is the result container filled by Parser.Parse: the list of
// items that Parse appended while processing a document.
type Microdata struct {
	items []*Item
}

// NewMicrodata returns a Microdata whose item list is empty but non-nil.
func NewMicrodata() *Microdata {
	data := new(Microdata)
	data.items = []*Item{}
	return data
}
type Parser struct {
2012-06-10 17:59:30 +04:00
p *h5.Parser
data *Microdata
2012-06-10 17:18:33 +04:00
identifiedNodes map[string]*h5.Node
2012-06-07 03:49:06 +04:00
}
func NewParser(r io.Reader) *Parser {
2012-06-10 17:59:30 +04:00
return &Parser{
p: h5.NewParser(r),
2012-06-07 03:49:06 +04:00
data: NewMicrodata(),
}
}
func (self *Parser) Parse() (*Microdata, error) {
err := self.p.Parse()
if err != nil {
return nil, err
}
tree := self.p.Tree()
2012-06-10 17:18:33 +04:00
topLevelItemNodes := make([]*h5.Node, 0)
self.identifiedNodes = make(map[string]*h5.Node, 0)
2012-06-07 03:49:06 +04:00
2012-06-10 17:59:30 +04:00
tree.Walk(func(n *h5.Node) {
2012-06-10 17:18:33 +04:00
if _, exists := getAttr("itemscope", n); exists {
if _, exists := getAttr("itemprop", n); !exists {
topLevelItemNodes = append(topLevelItemNodes, n)
}
}
2012-06-07 03:49:06 +04:00
2012-06-10 17:18:33 +04:00
if id, exists := getAttr("id", n); exists {
self.identifiedNodes[id] = n
}
2012-06-10 17:59:30 +04:00
})
2012-06-10 17:18:33 +04:00
for _, node := range topLevelItemNodes {
2012-06-07 03:49:06 +04:00
item := NewItem()
self.data.items = append(self.data.items, item)
2012-06-07 18:31:43 +04:00
if itemtypes, exists := getAttr("itemtype", node); exists {
for _, itemtype := range strings.Split(strings.TrimSpace(itemtypes), " ") {
itemtype = strings.TrimSpace(itemtype)
if itemtype != "" {
item.types = append(item.types, itemtype)
}
}
2012-06-07 18:41:50 +04:00
// itemid only valid when itemscope and itemtype are both present
2012-06-10 17:59:30 +04:00
if itemid, exists := getAttr("itemid", node); exists {
2012-06-07 18:41:50 +04:00
item.id = strings.TrimSpace(itemid)
}
2012-06-07 03:49:06 +04:00
2012-06-10 17:59:30 +04:00
}
if itemrefs, exists := getAttr("itemref", node); exists {
for _, itemref := range strings.Split(strings.TrimSpace(itemrefs), " ") {
itemref = strings.TrimSpace(itemref)
if refnode, exists := self.identifiedNodes[itemref]; exists {
self.readItem(item, refnode)
}
2012-06-10 17:18:33 +04:00
}
}
2012-06-07 03:49:06 +04:00
if len(node.Children) > 0 {
2012-06-10 17:59:30 +04:00
for _, child := range node.Children {
self.readItem(item, child)
}
}
2012-06-07 03:49:06 +04:00
}
2012-06-10 17:18:33 +04:00
return self.data, nil
2012-06-07 03:49:06 +04:00
}
func (self *Parser) readItem(item *Item, node *h5.Node) {
2012-06-07 18:31:43 +04:00
if itemprop, exists := getAttr("itemprop", node); exists {
2012-06-10 19:22:51 +04:00
if _, exists := getAttr("itemscope", node); exists {
subitem := NewItem()
2012-06-10 17:59:30 +04:00
2012-06-10 19:22:51 +04:00
if itemrefs, exists := getAttr("itemref", node); exists {
for _, itemref := range strings.Split(strings.TrimSpace(itemrefs), " ") {
itemref = strings.TrimSpace(itemref)
2012-06-07 03:49:06 +04:00
2012-06-10 19:22:51 +04:00
if refnode, exists := self.identifiedNodes[itemref]; exists {
self.readItem(subitem, refnode)
}
}
2012-06-10 17:59:30 +04:00
}
2012-06-10 19:22:51 +04:00
if len(node.Children) > 0 {
for _, child := range node.Children {
self.readItem(subitem, child)
}
2012-06-10 17:59:30 +04:00
}
2012-06-10 19:22:51 +04:00
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
propertyName = strings.TrimSpace(propertyName)
if propertyName != "" {
item.SetItem(propertyName, subitem)
}
2012-06-10 17:59:30 +04:00
}
2012-06-07 03:49:06 +04:00
2012-06-10 19:22:51 +04:00
return
} else {
var propertyValue string
switch node.Data() {
case "img", "audio", "source", "video", "embed", "iframe", "track":
if urlValue, exists := getAttr("src", node); exists {
propertyValue = urlValue
}
case "a", "area", "link":
if urlValue, exists := getAttr("href", node); exists {
propertyValue = urlValue
}
case "data":
if urlValue, exists := getAttr("value", node); exists {
propertyValue = urlValue
}
case "time":
if urlValue, exists := getAttr("datetime", node); exists {
propertyValue = urlValue
2012-06-10 17:59:30 +04:00
}
2012-06-07 03:49:06 +04:00
2012-06-10 19:22:51 +04:00
default:
var text bytes.Buffer
node.Walk(func(n *h5.Node) {
if n.Type == h5.TextNode {
text.WriteString(n.Data())
}
})
propertyValue = text.String()
}
2012-06-07 03:49:06 +04:00
2012-06-10 19:22:51 +04:00
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
propertyName = strings.TrimSpace(propertyName)
if propertyName != "" {
item.SetString(propertyName, propertyValue)
}
2012-06-07 18:31:43 +04:00
}
2012-06-10 19:22:51 +04:00
2012-06-07 18:31:43 +04:00
}
2012-06-10 19:22:51 +04:00
2012-06-07 03:49:06 +04:00
}
if len(node.Children) > 0 {
2012-06-10 17:59:30 +04:00
for _, child := range node.Children {
self.readItem(item, child)
}
}
2012-06-10 19:22:51 +04:00
2012-06-07 03:49:06 +04:00
}
// getAttr looks up the attribute called name on node. It returns the value
// of the first attribute whose name matches and true, or "" and false when
// the node has no such attribute.
func getAttr(name string, node *h5.Node) (string, bool) {
	for _, attr := range node.Attr {
		if attr.Name != name {
			continue
		}
		return attr.Value, true
	}
	return "", false
}