forked from ukamnya/microdata_mirror
Added parsing of itemref
parent
a58ae8f1fe
commit
aec670c37a
44
microdata.go
44
microdata.go
|
@ -42,6 +42,7 @@ func NewMicrodata() *Microdata {
|
||||||
type Parser struct {
|
type Parser struct {
|
||||||
p *h5.Parser
|
p *h5.Parser
|
||||||
data *Microdata
|
data *Microdata
|
||||||
|
identifiedNodes map[string]*h5.Node
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewParser(r io.Reader) *Parser {
|
func NewParser(r io.Reader) *Parser {
|
||||||
|
@ -58,17 +59,23 @@ func (self *Parser) Parse() (*Microdata, error) {
|
||||||
}
|
}
|
||||||
tree := self.p.Tree()
|
tree := self.p.Tree()
|
||||||
|
|
||||||
self.scanForItem(tree)
|
topLevelItemNodes := make([]*h5.Node, 0)
|
||||||
|
self.identifiedNodes = make(map[string]*h5.Node, 0)
|
||||||
|
|
||||||
return self.data, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (self *Parser) scanForItem(node *h5.Node) {
|
tree.Walk( func(n *h5.Node) {
|
||||||
if node == nil {
|
if _, exists := getAttr("itemscope", n); exists {
|
||||||
return
|
if _, exists := getAttr("itemprop", n); !exists {
|
||||||
}
|
topLevelItemNodes = append(topLevelItemNodes, n)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if _, exists := getAttr("itemscope", node); exists {
|
if id, exists := getAttr("id", n); exists {
|
||||||
|
self.identifiedNodes[id] = n
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
for _, node := range topLevelItemNodes {
|
||||||
item := NewItem()
|
item := NewItem()
|
||||||
self.data.items = append(self.data.items, item)
|
self.data.items = append(self.data.items, item)
|
||||||
if itemtypes, exists := getAttr("itemtype", node); exists {
|
if itemtypes, exists := getAttr("itemtype", node); exists {
|
||||||
|
@ -79,29 +86,30 @@ func (self *Parser) scanForItem(node *h5.Node) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// itemid only valid when itemscope and itemtype are both present
|
// itemid only valid when itemscope and itemtype are both present
|
||||||
if itemid, exists := getAttr("itemid", node); exists {
|
if itemid, exists := getAttr("itemid", node); exists {
|
||||||
item.id = strings.TrimSpace(itemid)
|
item.id = strings.TrimSpace(itemid)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if itemref, exists := getAttr("itemref", node); exists {
|
||||||
|
if refnode, exists := self.identifiedNodes[itemref]; exists {
|
||||||
|
self.readItem(item, refnode)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if len(node.Children) > 0 {
|
if len(node.Children) > 0 {
|
||||||
for _, child := range node.Children {
|
for _, child := range node.Children {
|
||||||
self.readItem(item, child)
|
self.readItem(item, child)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
|
||||||
if len(node.Children) > 0 {
|
|
||||||
for _, child := range node.Children {
|
|
||||||
self.scanForItem(child)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return self.data, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
func (self *Parser) readItem(item *Item, node *h5.Node) {
|
func (self *Parser) readItem(item *Item, node *h5.Node) {
|
||||||
if itemprop, exists := getAttr("itemprop", node); exists {
|
if itemprop, exists := getAttr("itemprop", node); exists {
|
||||||
var propertyValue string
|
var propertyValue string
|
||||||
|
@ -133,7 +141,7 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
|
||||||
}
|
}
|
||||||
|
|
||||||
})
|
})
|
||||||
propertyValue = text.String()
|
propertyValue = text.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
|
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
|
||||||
|
@ -144,6 +152,8 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if len(node.Children) > 0 {
|
if len(node.Children) > 0 {
|
||||||
for _, child := range node.Children {
|
for _, child := range node.Children {
|
||||||
self.readItem(item, child)
|
self.readItem(item, child)
|
||||||
|
|
|
@ -366,3 +366,72 @@ func TestParseItemId(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
func TestParseItemRef(t *testing.T) {
|
||||||
|
html := `<body><p><figure itemscope itemtype="http://n.whatwg.org/work" itemref="licenses">
|
||||||
|
<img itemprop="work" src="images/house.jpeg" alt="A white house, boarded up, sits in a forest.">
|
||||||
|
<figcaption itemprop="title">The house I found.</figcaption>
|
||||||
|
</figure></p>
|
||||||
|
<p id="licenses">All images licensed under the <a itemprop="license"
|
||||||
|
href="http://www.opensource.org/licenses/mit-license.php">MIT
|
||||||
|
license</a>.</p></body>`
|
||||||
|
|
||||||
|
item := ParseOneItem(html, t)
|
||||||
|
|
||||||
|
|
||||||
|
if len(item.properties) != 3 {
|
||||||
|
t.Errorf("Expecting 3 properties but got %d",len(item.properties) )
|
||||||
|
}
|
||||||
|
|
||||||
|
if item.properties["license"][0].(string) != "http://www.opensource.org/licenses/mit-license.php" {
|
||||||
|
t.Errorf("Property value 'http://www.opensource.org/licenses/mit-license.php' not found for 'license'")
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseSharedItemRef(t *testing.T) {
|
||||||
|
html := `<!DOCTYPE HTML>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Photo gallery</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1>My photos</h1>
|
||||||
|
<figure itemscope itemtype="http://n.whatwg.org/work" itemref="licenses">
|
||||||
|
<img itemprop="work" src="images/house.jpeg" alt="A white house, boarded up, sits in a forest.">
|
||||||
|
<figcaption itemprop="title">The house I found.</figcaption>
|
||||||
|
</figure>
|
||||||
|
<figure itemscope itemtype="http://n.whatwg.org/work" itemref="licenses">
|
||||||
|
<img itemprop="work" src="images/mailbox.jpeg" alt="Outside the house is a mailbox. It has a leaflet inside.">
|
||||||
|
<figcaption itemprop="title">The mailbox.</figcaption>
|
||||||
|
</figure>
|
||||||
|
<footer>
|
||||||
|
<p id="licenses">All images licensed under the <a itemprop="license"
|
||||||
|
href="http://www.opensource.org/licenses/mit-license.php">MIT
|
||||||
|
license</a>.</p>
|
||||||
|
</footer>
|
||||||
|
</body>
|
||||||
|
</html>`
|
||||||
|
|
||||||
|
data := ParseData(html, t)
|
||||||
|
|
||||||
|
if len(data.items) != 2 {
|
||||||
|
t.Errorf("Expecting 2 items but got %d",len(data.items) )
|
||||||
|
}
|
||||||
|
if len(data.items[0].properties) != 3 {
|
||||||
|
t.Errorf("Expecting 3 properties but got %d",len(data.items[0].properties) )
|
||||||
|
}
|
||||||
|
if len(data.items[1].properties) != 3 {
|
||||||
|
t.Errorf("Expecting 3 properties but got %d",len(data.items[1].properties) )
|
||||||
|
}
|
||||||
|
|
||||||
|
if data.items[0].properties["license"][0].(string) != "http://www.opensource.org/licenses/mit-license.php" {
|
||||||
|
t.Errorf("Property value 'http://www.opensource.org/licenses/mit-license.php' not found for 'license'")
|
||||||
|
}
|
||||||
|
|
||||||
|
if data.items[1].properties["license"][0].(string) != "http://www.opensource.org/licenses/mit-license.php" {
|
||||||
|
t.Errorf("Property value 'http://www.opensource.org/licenses/mit-license.php' not found for 'license'")
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue