Added parsing of itemref
parent
a58ae8f1fe
commit
aec670c37a
44
microdata.go
44
microdata.go
|
@ -42,6 +42,7 @@ func NewMicrodata() *Microdata {
|
|||
// Parser extracts microdata from an HTML document parsed with h5.
type Parser struct {
|
||||
// p is the underlying h5 parser; Parse obtains the node tree from it.
p *h5.Parser
|
||||
// data accumulates the items found so far and is what Parse returns.
data *Microdata
|
||||
// identifiedNodes maps an element's id attribute value to its node so
// that itemref attributes can be resolved to the referenced element.
identifiedNodes map[string]*h5.Node
|
||||
}
|
||||
|
||||
func NewParser(r io.Reader) *Parser {
|
||||
|
@ -58,17 +59,23 @@ func (self *Parser) Parse() (*Microdata, error) {
|
|||
}
|
||||
tree := self.p.Tree()
|
||||
|
||||
self.scanForItem(tree)
|
||||
topLevelItemNodes := make([]*h5.Node, 0)
|
||||
self.identifiedNodes = make(map[string]*h5.Node, 0)
|
||||
|
||||
return self.data, nil
|
||||
}
|
||||
|
||||
func (self *Parser) scanForItem(node *h5.Node) {
|
||||
if node == nil {
|
||||
return
|
||||
}
|
||||
tree.Walk( func(n *h5.Node) {
|
||||
if _, exists := getAttr("itemscope", n); exists {
|
||||
if _, exists := getAttr("itemprop", n); !exists {
|
||||
topLevelItemNodes = append(topLevelItemNodes, n)
|
||||
}
|
||||
}
|
||||
|
||||
if _, exists := getAttr("itemscope", node); exists {
|
||||
if id, exists := getAttr("id", n); exists {
|
||||
self.identifiedNodes[id] = n
|
||||
}
|
||||
})
|
||||
|
||||
for _, node := range topLevelItemNodes {
|
||||
item := NewItem()
|
||||
self.data.items = append(self.data.items, item)
|
||||
if itemtypes, exists := getAttr("itemtype", node); exists {
|
||||
|
@ -79,29 +86,30 @@ func (self *Parser) scanForItem(node *h5.Node) {
|
|||
}
|
||||
}
|
||||
// itemid only valid when itemscope and itemtype are both present
|
||||
if itemid, exists := getAttr("itemid", node); exists {
|
||||
if itemid, exists := getAttr("itemid", node); exists {
|
||||
item.id = strings.TrimSpace(itemid)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if itemref, exists := getAttr("itemref", node); exists {
|
||||
if refnode, exists := self.identifiedNodes[itemref]; exists {
|
||||
self.readItem(item, refnode)
|
||||
}
|
||||
}
|
||||
|
||||
if len(node.Children) > 0 {
|
||||
for _, child := range node.Children {
|
||||
self.readItem(item, child)
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
if len(node.Children) > 0 {
|
||||
for _, child := range node.Children {
|
||||
self.scanForItem(child)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return self.data, nil
|
||||
}
|
||||
|
||||
|
||||
|
||||
func (self *Parser) readItem(item *Item, node *h5.Node) {
|
||||
if itemprop, exists := getAttr("itemprop", node); exists {
|
||||
var propertyValue string
|
||||
|
@ -133,7 +141,7 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
|
|||
}
|
||||
|
||||
})
|
||||
propertyValue = text.String()
|
||||
propertyValue = text.String()
|
||||
}
|
||||
|
||||
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
|
||||
|
@ -144,6 +152,8 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
if len(node.Children) > 0 {
|
||||
for _, child := range node.Children {
|
||||
self.readItem(item, child)
|
||||
|
|
|
@ -366,3 +366,72 @@ func TestParseItemId(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
func TestParseItemRef(t *testing.T) {
|
||||
html := `<body><p><figure itemscope itemtype="http://n.whatwg.org/work" itemref="licenses">
|
||||
<img itemprop="work" src="images/house.jpeg" alt="A white house, boarded up, sits in a forest.">
|
||||
<figcaption itemprop="title">The house I found.</figcaption>
|
||||
</figure></p>
|
||||
<p id="licenses">All images licensed under the <a itemprop="license"
|
||||
href="http://www.opensource.org/licenses/mit-license.php">MIT
|
||||
license</a>.</p></body>`
|
||||
|
||||
item := ParseOneItem(html, t)
|
||||
|
||||
|
||||
if len(item.properties) != 3 {
|
||||
t.Errorf("Expecting 3 properties but got %d",len(item.properties) )
|
||||
}
|
||||
|
||||
if item.properties["license"][0].(string) != "http://www.opensource.org/licenses/mit-license.php" {
|
||||
t.Errorf("Property value 'http://www.opensource.org/licenses/mit-license.php' not found for 'license'")
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func TestParseSharedItemRef(t *testing.T) {
|
||||
html := `<!DOCTYPE HTML>
|
||||
<html>
|
||||
<head>
|
||||
<title>Photo gallery</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>My photos</h1>
|
||||
<figure itemscope itemtype="http://n.whatwg.org/work" itemref="licenses">
|
||||
<img itemprop="work" src="images/house.jpeg" alt="A white house, boarded up, sits in a forest.">
|
||||
<figcaption itemprop="title">The house I found.</figcaption>
|
||||
</figure>
|
||||
<figure itemscope itemtype="http://n.whatwg.org/work" itemref="licenses">
|
||||
<img itemprop="work" src="images/mailbox.jpeg" alt="Outside the house is a mailbox. It has a leaflet inside.">
|
||||
<figcaption itemprop="title">The mailbox.</figcaption>
|
||||
</figure>
|
||||
<footer>
|
||||
<p id="licenses">All images licensed under the <a itemprop="license"
|
||||
href="http://www.opensource.org/licenses/mit-license.php">MIT
|
||||
license</a>.</p>
|
||||
</footer>
|
||||
</body>
|
||||
</html>`
|
||||
|
||||
data := ParseData(html, t)
|
||||
|
||||
if len(data.items) != 2 {
|
||||
t.Errorf("Expecting 2 items but got %d",len(data.items) )
|
||||
}
|
||||
if len(data.items[0].properties) != 3 {
|
||||
t.Errorf("Expecting 3 properties but got %d",len(data.items[0].properties) )
|
||||
}
|
||||
if len(data.items[1].properties) != 3 {
|
||||
t.Errorf("Expecting 3 properties but got %d",len(data.items[1].properties) )
|
||||
}
|
||||
|
||||
if data.items[0].properties["license"][0].(string) != "http://www.opensource.org/licenses/mit-license.php" {
|
||||
t.Errorf("Property value 'http://www.opensource.org/licenses/mit-license.php' not found for 'license'")
|
||||
}
|
||||
|
||||
if data.items[1].properties["license"][0].(string) != "http://www.opensource.org/licenses/mit-license.php" {
|
||||
t.Errorf("Property value 'http://www.opensource.org/licenses/mit-license.php' not found for 'license'")
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue