diff --git a/microdata.go b/microdata.go index 3d7d706..1bafbf8 100644 --- a/microdata.go +++ b/microdata.go @@ -27,6 +27,11 @@ func (self *Item) SetString(property string, value string) { self.properties[property] = append(self.properties[property], value) } +func (self *Item) SetItem(property string, value *Item) { + self.properties[property] = append(self.properties[property], value) +} + + type Microdata struct { items []*Item } @@ -111,44 +116,78 @@ func (self *Parser) Parse() (*Microdata, error) { func (self *Parser) readItem(item *Item, node *h5.Node) { if itemprop, exists := getAttr("itemprop", node); exists { - var propertyValue string + if _, exists := getAttr("itemscope", node); exists { + subitem := NewItem() - switch node.Data() { + if itemrefs, exists := getAttr("itemref", node); exists { + for _, itemref := range strings.Split(strings.TrimSpace(itemrefs), " ") { + itemref = strings.TrimSpace(itemref) - case "img", "audio", "source", "video", "embed", "iframe", "track": - if urlValue, exists := getAttr("src", node); exists { - propertyValue = urlValue - } - case "a", "area", "link": - if urlValue, exists := getAttr("href", node); exists { - propertyValue = urlValue - } - case "data": - if urlValue, exists := getAttr("value", node); exists { - propertyValue = urlValue - } - case "time": - if urlValue, exists := getAttr("datetime", node); exists { - propertyValue = urlValue + if refnode, exists := self.identifiedNodes[itemref]; exists { + self.readItem(subitem, refnode) + } + } } - default: - var text bytes.Buffer - node.Walk(func(n *h5.Node) { - if n.Type == h5.TextNode { - text.WriteString(n.Data()) + if len(node.Children) > 0 { + for _, child := range node.Children { + self.readItem(subitem, child) + } + } + + for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") { + propertyName = strings.TrimSpace(propertyName) + if propertyName != "" { + item.SetItem(propertyName, subitem) + } + } + + return + + } else { + var propertyValue string + + switch node.Data() { + + case "img", "audio", "source", "video", "embed", "iframe", "track": + if urlValue, exists := getAttr("src", node); exists { + propertyValue = urlValue + } + case "a", "area", "link": + if urlValue, exists := getAttr("href", node); exists { + propertyValue = urlValue + } + case "data": + if urlValue, exists := getAttr("value", node); exists { + propertyValue = urlValue + } + case "time": + if urlValue, exists := getAttr("datetime", node); exists { + propertyValue = urlValue } - }) - propertyValue = text.String() + default: + var text bytes.Buffer + node.Walk(func(n *h5.Node) { + if n.Type == h5.TextNode { + text.WriteString(n.Data()) + } + + }) + propertyValue = text.String() + } + + for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") { + propertyName = strings.TrimSpace(propertyName) + if propertyName != "" { + item.SetString(propertyName, propertyValue) + } + } + + } - for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") { - propertyName = strings.TrimSpace(propertyName) - if propertyName != "" { - item.SetString(propertyName, propertyValue) - } - } + } if len(node.Children) > 0 { @@ -156,6 +195,7 @@ func (self *Parser) readItem(item *Item, node *h5.Node) { self.readItem(item, child) } } + } func getAttr(name string, node *h5.Node) (string, bool) { diff --git a/microdata_test.go b/microdata_test.go index 4c85916..67fb8aa 100644 --- a/microdata_test.go +++ b/microdata_test.go @@ -465,7 +465,35 @@ func TestParseEmbeddedItem(t *testing.T) { t.Errorf("Property value 'Amanda' not found for 'name'") } - subitem := data.items[0].properties["band"][0].(Item) + subitem := data.items[0].properties["band"][0].(*Item) + + if subitem.properties["name"][0].(string) != "Jazz Band" { + t.Errorf("Property value 'Jazz Band' not found for 'name'") + } +} + +func TestParseEmbeddedItemWithItemRef(t *testing.T) { + html := ` +
+

Name: Amanda

+
+
+

Band: Jazz Band

+

Size: 12 players

+
` + + data := ParseData(html, t) + + if len(data.items) != 1 { + t.Errorf("Expecting 1 item but got %d", len(data.items)) + } + + + if data.items[0].properties["name"][0].(string) != "Amanda" { + t.Errorf("Property value 'Amanda' not found for 'name'") + } + + subitem := data.items[0].properties["band"][0].(*Item) if subitem.properties["name"][0].(string) != "Jazz Band" { t.Errorf("Property value 'Jazz Band' not found for 'name'")