package microdata import ( "bytes" "code.google.com/p/go-html-transform/h5" "io" ) type ValueList []interface{} type PropertyMap map[string]ValueList type Item struct { properties PropertyMap } func NewItem() *Item { return &Item{ properties: make(PropertyMap, 10), } } func (self *Item) SetString(property string, value string) { self.properties[property] = append(self.properties[property], value) } type Microdata struct { items []*Item } func NewMicrodata() *Microdata { return &Microdata{ items: make([]*Item, 0), } } type Parser struct { p *h5.Parser data *Microdata } func NewParser(r io.Reader) *Parser { return &Parser { p : h5.NewParser(r), data: NewMicrodata(), } } func (self *Parser) Parse() (*Microdata, error) { err := self.p.Parse() if err != nil { return nil, err } tree := self.p.Tree() self.scanForItem(tree) return self.data, nil } func (self *Parser) scanForItem(node *h5.Node) { if node == nil { return } hasItemscope := false for _, a := range node.Attr { if a.Name == "itemscope" { hasItemscope = true break } } if hasItemscope { item := NewItem() self.data.items = append(self.data.items, item) if len(node.Children) > 0 { for _, child := range node.Children { self.readItem(item, child) } } } else { if len(node.Children) > 0 { for _, child := range node.Children { self.scanForItem(child) } } } } func (self *Parser) readItem(item *Item, node *h5.Node) { if propertyName, exists := getAttr("itemprop", node); exists { var propertyValue string switch node.Data() { case "img","audio", "source", "video", "embed", "iframe", "track": if urlValue, exists := getAttr("src", node); exists { propertyValue = urlValue } case "a", "area", "link": if urlValue, exists := getAttr("href", node); exists { propertyValue = urlValue } case "data": if urlValue, exists := getAttr("value", node); exists { propertyValue = urlValue } case "time": if urlValue, exists := getAttr("datetime", node); exists { propertyValue = urlValue } default: var text bytes.Buffer node.Walk( func(n *h5.Node) { if n.Type == h5.TextNode { text.WriteString(n.Data()) } }) propertyValue = text.String() } item.SetString(propertyName, propertyValue) } if len(node.Children) > 0 { for _, child := range node.Children { self.readItem(item, child) } } } func getAttr(name string, node *h5.Node) (string, bool) { for _, a := range node.Attr { if a.Name == name { return a.Value, true } } return "", false }