diff --git a/microdata.go b/microdata.go index a6b9a7d..a42d024 100644 --- a/microdata.go +++ b/microdata.go @@ -42,6 +42,7 @@ func NewMicrodata() *Microdata { type Parser struct { p *h5.Parser data *Microdata + identifiedNodes map[string]*h5.Node } func NewParser(r io.Reader) *Parser { @@ -58,17 +59,23 @@ func (self *Parser) Parse() (*Microdata, error) { } tree := self.p.Tree() - self.scanForItem(tree) + topLevelItemNodes := make([]*h5.Node, 0) + self.identifiedNodes = make(map[string]*h5.Node, 0) - return self.data, nil -} -func (self *Parser) scanForItem(node *h5.Node) { - if node == nil { - return - } + tree.Walk( func(n *h5.Node) { + if _, exists := getAttr("itemscope", n); exists { + if _, exists := getAttr("itemprop", n); !exists { + topLevelItemNodes = append(topLevelItemNodes, n) + } + } - if _, exists := getAttr("itemscope", node); exists { + if id, exists := getAttr("id", n); exists { + self.identifiedNodes[id] = n + } + }) + + for _, node := range topLevelItemNodes { item := NewItem() self.data.items = append(self.data.items, item) if itemtypes, exists := getAttr("itemtype", node); exists { @@ -79,29 +86,30 @@ func (self *Parser) scanForItem(node *h5.Node) { } } // itemid only valid when itemscope and itemtype are both present - if itemid, exists := getAttr("itemid", node); exists { + if itemid, exists := getAttr("itemid", node); exists { item.id = strings.TrimSpace(itemid) } } + if itemref, exists := getAttr("itemref", node); exists { + if refnode, exists := self.identifiedNodes[itemref]; exists { + self.readItem(item, refnode) + } + } if len(node.Children) > 0 { for _, child := range node.Children { self.readItem(item, child) } } - - } else { - if len(node.Children) > 0 { - for _, child := range node.Children { - self.scanForItem(child) - } - } } + return self.data, nil } + + func (self *Parser) readItem(item *Item, node *h5.Node) { if itemprop, exists := getAttr("itemprop", node); exists { var propertyValue string @@ -133,7 +141,7 @@ func (self *Parser) readItem(item *Item, node *h5.Node) { } }) - propertyValue = text.String() + propertyValue = text.String() } for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") { @@ -144,6 +152,8 @@ func (self *Parser) readItem(item *Item, node *h5.Node) { } } + + if len(node.Children) > 0 { for _, child := range node.Children { self.readItem(item, child) diff --git a/microdata_test.go b/microdata_test.go index 3ead2fa..e0502f3 100644 --- a/microdata_test.go +++ b/microdata_test.go @@ -366,3 +366,72 @@ func TestParseItemId(t *testing.T) { } } + + +func TestParseItemRef(t *testing.T) { + html := `

+ A white house, boarded up, sits in a forest. +
The house I found.
+

+

All images licensed under the MIT + license.

` + + item := ParseOneItem(html, t) + + + if len(item.properties) != 3 { + t.Errorf("Expecting 3 properties but got %d",len(item.properties) ) + } + + if item.properties["license"][0].(string) != "http://www.opensource.org/licenses/mit-license.php" { + t.Errorf("Property value 'http://www.opensource.org/licenses/mit-license.php' not found for 'license'") + } + +} + +func TestParseSharedItemRef(t *testing.T) { + html := ` + + + Photo gallery + + +

My photos

+
+ A white house, boarded up, sits in a forest. +
The house I found.
+
+
+ Outside the house is a mailbox. It has a leaflet inside. +
The mailbox.
+
+ + + ` + + data := ParseData(html, t) + + if len(data.items) != 2 { + t.Errorf("Expecting 2 items but got %d",len(data.items) ) + } + if len(data.items[0].properties) != 3 { + t.Errorf("Expecting 3 properties but got %d",len(data.items[0].properties) ) + } + if len(data.items[1].properties) != 3 { + t.Errorf("Expecting 3 properties but got %d",len(data.items[1].properties) ) + } + + if data.items[0].properties["license"][0].(string) != "http://www.opensource.org/licenses/mit-license.php" { + t.Errorf("Property value 'http://www.opensource.org/licenses/mit-license.php' not found for 'license'") + } + + if data.items[1].properties["license"][0].(string) != "http://www.opensource.org/licenses/mit-license.php" { + t.Errorf("Property value 'http://www.opensource.org/licenses/mit-license.php' not found for 'license'") + } + +} \ No newline at end of file