Added parsing of itemref

master
Ian Davis 2012-06-10 14:18:33 +01:00
parent a58ae8f1fe
commit aec670c37a
2 changed files with 96 additions and 17 deletions

View File

@ -42,6 +42,7 @@ func NewMicrodata() *Microdata {
type Parser struct { type Parser struct {
p *h5.Parser p *h5.Parser
data *Microdata data *Microdata
identifiedNodes map[string]*h5.Node
} }
func NewParser(r io.Reader) *Parser { func NewParser(r io.Reader) *Parser {
@ -58,17 +59,23 @@ func (self *Parser) Parse() (*Microdata, error) {
} }
tree := self.p.Tree() tree := self.p.Tree()
self.scanForItem(tree) topLevelItemNodes := make([]*h5.Node, 0)
self.identifiedNodes = make(map[string]*h5.Node, 0)
return self.data, nil
tree.Walk( func(n *h5.Node) {
if _, exists := getAttr("itemscope", n); exists {
if _, exists := getAttr("itemprop", n); !exists {
topLevelItemNodes = append(topLevelItemNodes, n)
}
} }
func (self *Parser) scanForItem(node *h5.Node) { if id, exists := getAttr("id", n); exists {
if node == nil { self.identifiedNodes[id] = n
return
} }
})
if _, exists := getAttr("itemscope", node); exists { for _, node := range topLevelItemNodes {
item := NewItem() item := NewItem()
self.data.items = append(self.data.items, item) self.data.items = append(self.data.items, item)
if itemtypes, exists := getAttr("itemtype", node); exists { if itemtypes, exists := getAttr("itemtype", node); exists {
@ -85,23 +92,24 @@ func (self *Parser) scanForItem(node *h5.Node) {
} }
if itemref, exists := getAttr("itemref", node); exists {
if refnode, exists := self.identifiedNodes[itemref]; exists {
self.readItem(item, refnode)
}
}
if len(node.Children) > 0 { if len(node.Children) > 0 {
for _, child := range node.Children { for _, child := range node.Children {
self.readItem(item, child) self.readItem(item, child)
} }
} }
} else {
if len(node.Children) > 0 {
for _, child := range node.Children {
self.scanForItem(child)
}
}
} }
return self.data, nil
} }
func (self *Parser) readItem(item *Item, node *h5.Node) { func (self *Parser) readItem(item *Item, node *h5.Node) {
if itemprop, exists := getAttr("itemprop", node); exists { if itemprop, exists := getAttr("itemprop", node); exists {
var propertyValue string var propertyValue string
@ -144,6 +152,8 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
} }
} }
if len(node.Children) > 0 { if len(node.Children) > 0 {
for _, child := range node.Children { for _, child := range node.Children {
self.readItem(item, child) self.readItem(item, child)

View File

@ -366,3 +366,72 @@ func TestParseItemId(t *testing.T) {
} }
} }
// TestParseItemRef verifies that an item carrying itemref="licenses" also
// collects the itemprop found under the element whose id is "licenses", in
// addition to the two properties nested inside the item itself.
func TestParseItemRef(t *testing.T) {
	html := `<body><p><figure itemscope itemtype="http://n.whatwg.org/work" itemref="licenses">
<img itemprop="work" src="images/house.jpeg" alt="A white house, boarded up, sits in a forest.">
<figcaption itemprop="title">The house I found.</figcaption>
</figure></p>
<p id="licenses">All images licensed under the <a itemprop="license"
href="http://www.opensource.org/licenses/mit-license.php">MIT
license</a>.</p></body>`

	const wantLicense = "http://www.opensource.org/licenses/mit-license.php"

	item := ParseOneItem(html, t)
	if len(item.properties) != 3 {
		t.Errorf("Expecting 3 properties but got %d", len(item.properties))
	}

	// Guard the index and the type assertion: if the referenced property was
	// not picked up, report a test failure instead of panicking.
	values := item.properties["license"]
	if len(values) == 0 {
		t.Fatalf("Property value 'http://www.opensource.org/licenses/mit-license.php' not found for 'license'")
	}
	if got, ok := values[0].(string); !ok || got != wantLicense {
		t.Errorf("Property value 'http://www.opensource.org/licenses/mit-license.php' not found for 'license'")
	}
}
// TestParseSharedItemRef verifies that two separate items may both reference
// the same element through itemref="licenses", and that each of them ends up
// with the shared "license" property alongside its own two properties.
func TestParseSharedItemRef(t *testing.T) {
	html := `<!DOCTYPE HTML>
<html>
<head>
<title>Photo gallery</title>
</head>
<body>
<h1>My photos</h1>
<figure itemscope itemtype="http://n.whatwg.org/work" itemref="licenses">
<img itemprop="work" src="images/house.jpeg" alt="A white house, boarded up, sits in a forest.">
<figcaption itemprop="title">The house I found.</figcaption>
</figure>
<figure itemscope itemtype="http://n.whatwg.org/work" itemref="licenses">
<img itemprop="work" src="images/mailbox.jpeg" alt="Outside the house is a mailbox. It has a leaflet inside.">
<figcaption itemprop="title">The mailbox.</figcaption>
</figure>
<footer>
<p id="licenses">All images licensed under the <a itemprop="license"
href="http://www.opensource.org/licenses/mit-license.php">MIT
license</a>.</p>
</footer>
</body>
</html>`

	const wantLicense = "http://www.opensource.org/licenses/mit-license.php"

	data := ParseData(html, t)
	if len(data.items) != 2 {
		// Fatal rather than Errorf: the per-item checks below are only
		// meaningful when exactly the two expected items were found.
		t.Fatalf("Expecting 2 items but got %d", len(data.items))
	}

	for i := range data.items {
		item := data.items[i]
		if len(item.properties) != 3 {
			t.Errorf("item %d: Expecting 3 properties but got %d", i, len(item.properties))
		}

		// Guard the index and the type assertion so that a missing shared
		// property is reported as a failure instead of a panic.
		values := item.properties["license"]
		if len(values) == 0 {
			t.Errorf("item %d: Property value 'http://www.opensource.org/licenses/mit-license.php' not found for 'license'", i)
			continue
		}
		if got, ok := values[0].(string); !ok || got != wantLicense {
			t.Errorf("item %d: Property value 'http://www.opensource.org/licenses/mit-license.php' not found for 'license'", i)
		}
	}
}