forked from ukamnya/microdata_mirror
Added multi-valued itemrefs
parent
aec670c37a
commit
5a35df7849
82
microdata.go
82
microdata.go
|
@ -7,21 +7,19 @@ import (
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
type (
	// ValueList is the list of values recorded for one property name.
	// Elements are stored as interface{}.
	ValueList []interface{}

	// PropertyMap associates a property name with its ValueList.
	PropertyMap map[string]ValueList
)

// Item is a single microdata item: its properties, its types and its id.
type Item struct {
	// properties holds the item's values, keyed by property name.
	properties PropertyMap
	// types lists the item's types.
	types []string
	// id is the item's identifier; it stays empty until the parser sets it.
	id string
}

// NewItem returns a pointer to a fresh Item whose property map and type
// list are both allocated and empty, and whose id is the empty string.
func NewItem() *Item {
	item := new(Item)
	item.properties = PropertyMap{}
	item.types = []string{}
	return item
}
|
||||||
|
|
||||||
|
@ -40,14 +38,14 @@ func NewMicrodata() *Microdata {
|
||||||
}
|
}
|
||||||
|
|
||||||
type Parser struct {
|
type Parser struct {
|
||||||
p *h5.Parser
|
p *h5.Parser
|
||||||
data *Microdata
|
data *Microdata
|
||||||
identifiedNodes map[string]*h5.Node
|
identifiedNodes map[string]*h5.Node
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewParser(r io.Reader) *Parser {
|
func NewParser(r io.Reader) *Parser {
|
||||||
return &Parser {
|
return &Parser{
|
||||||
p : h5.NewParser(r),
|
p: h5.NewParser(r),
|
||||||
data: NewMicrodata(),
|
data: NewMicrodata(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -62,8 +60,7 @@ func (self *Parser) Parse() (*Microdata, error) {
|
||||||
topLevelItemNodes := make([]*h5.Node, 0)
|
topLevelItemNodes := make([]*h5.Node, 0)
|
||||||
self.identifiedNodes = make(map[string]*h5.Node, 0)
|
self.identifiedNodes = make(map[string]*h5.Node, 0)
|
||||||
|
|
||||||
|
tree.Walk(func(n *h5.Node) {
|
||||||
tree.Walk( func(n *h5.Node) {
|
|
||||||
if _, exists := getAttr("itemscope", n); exists {
|
if _, exists := getAttr("itemscope", n); exists {
|
||||||
if _, exists := getAttr("itemprop", n); !exists {
|
if _, exists := getAttr("itemprop", n); !exists {
|
||||||
topLevelItemNodes = append(topLevelItemNodes, n)
|
topLevelItemNodes = append(topLevelItemNodes, n)
|
||||||
|
@ -73,7 +70,7 @@ func (self *Parser) Parse() (*Microdata, error) {
|
||||||
if id, exists := getAttr("id", n); exists {
|
if id, exists := getAttr("id", n); exists {
|
||||||
self.identifiedNodes[id] = n
|
self.identifiedNodes[id] = n
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
for _, node := range topLevelItemNodes {
|
for _, node := range topLevelItemNodes {
|
||||||
item := NewItem()
|
item := NewItem()
|
||||||
|
@ -86,62 +83,64 @@ func (self *Parser) Parse() (*Microdata, error) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// itemid only valid when itemscope and itemtype are both present
|
// itemid only valid when itemscope and itemtype are both present
|
||||||
if itemid, exists := getAttr("itemid", node); exists {
|
if itemid, exists := getAttr("itemid", node); exists {
|
||||||
item.id = strings.TrimSpace(itemid)
|
item.id = strings.TrimSpace(itemid)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
if itemref, exists := getAttr("itemref", node); exists {
|
}
|
||||||
if refnode, exists := self.identifiedNodes[itemref]; exists {
|
|
||||||
self.readItem(item, refnode)
|
if itemrefs, exists := getAttr("itemref", node); exists {
|
||||||
|
for _, itemref := range strings.Split(strings.TrimSpace(itemrefs), " ") {
|
||||||
|
itemref = strings.TrimSpace(itemref)
|
||||||
|
|
||||||
|
if refnode, exists := self.identifiedNodes[itemref]; exists {
|
||||||
|
self.readItem(item, refnode)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(node.Children) > 0 {
|
if len(node.Children) > 0 {
|
||||||
for _, child := range node.Children {
|
for _, child := range node.Children {
|
||||||
self.readItem(item, child)
|
self.readItem(item, child)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return self.data, nil
|
return self.data, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
func (self *Parser) readItem(item *Item, node *h5.Node) {
|
func (self *Parser) readItem(item *Item, node *h5.Node) {
|
||||||
if itemprop, exists := getAttr("itemprop", node); exists {
|
if itemprop, exists := getAttr("itemprop", node); exists {
|
||||||
var propertyValue string
|
var propertyValue string
|
||||||
|
|
||||||
switch node.Data() {
|
switch node.Data() {
|
||||||
|
|
||||||
case "img","audio", "source", "video", "embed", "iframe", "track":
|
case "img", "audio", "source", "video", "embed", "iframe", "track":
|
||||||
if urlValue, exists := getAttr("src", node); exists {
|
if urlValue, exists := getAttr("src", node); exists {
|
||||||
propertyValue = urlValue
|
propertyValue = urlValue
|
||||||
}
|
}
|
||||||
case "a", "area", "link":
|
case "a", "area", "link":
|
||||||
if urlValue, exists := getAttr("href", node); exists {
|
if urlValue, exists := getAttr("href", node); exists {
|
||||||
propertyValue = urlValue
|
propertyValue = urlValue
|
||||||
}
|
}
|
||||||
case "data":
|
case "data":
|
||||||
if urlValue, exists := getAttr("value", node); exists {
|
if urlValue, exists := getAttr("value", node); exists {
|
||||||
propertyValue = urlValue
|
propertyValue = urlValue
|
||||||
}
|
}
|
||||||
case "time":
|
case "time":
|
||||||
if urlValue, exists := getAttr("datetime", node); exists {
|
if urlValue, exists := getAttr("datetime", node); exists {
|
||||||
propertyValue = urlValue
|
propertyValue = urlValue
|
||||||
}
|
}
|
||||||
|
|
||||||
default:
|
default:
|
||||||
var text bytes.Buffer
|
var text bytes.Buffer
|
||||||
node.Walk( func(n *h5.Node) {
|
node.Walk(func(n *h5.Node) {
|
||||||
if n.Type == h5.TextNode {
|
if n.Type == h5.TextNode {
|
||||||
text.WriteString(n.Data())
|
text.WriteString(n.Data())
|
||||||
}
|
}
|
||||||
|
|
||||||
})
|
})
|
||||||
propertyValue = text.String()
|
propertyValue = text.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
|
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
|
||||||
|
@ -152,13 +151,11 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if len(node.Children) > 0 {
|
if len(node.Children) > 0 {
|
||||||
for _, child := range node.Children {
|
for _, child := range node.Children {
|
||||||
self.readItem(item, child)
|
self.readItem(item, child)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func getAttr(name string, node *h5.Node) (string, bool) {
|
func getAttr(name string, node *h5.Node) (string, bool) {
|
||||||
|
@ -169,4 +166,3 @@ func getAttr(name string, node *h5.Node) (string, bool) {
|
||||||
}
|
}
|
||||||
return "", false
|
return "", false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,6 @@ import (
|
||||||
"testing"
|
"testing"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
func ParseData(html string, t *testing.T) *Microdata {
|
func ParseData(html string, t *testing.T) *Microdata {
|
||||||
p := NewParser(strings.NewReader(html))
|
p := NewParser(strings.NewReader(html))
|
||||||
|
|
||||||
|
@ -40,7 +39,6 @@ func TestParse(t *testing.T) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
func TestParseActuallyParses(t *testing.T) {
|
func TestParseActuallyParses(t *testing.T) {
|
||||||
html := `
|
html := `
|
||||||
<div itemscope>
|
<div itemscope>
|
||||||
|
@ -54,7 +52,6 @@ func TestParseActuallyParses(t *testing.T) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
func TestParseThreeProps(t *testing.T) {
|
func TestParseThreeProps(t *testing.T) {
|
||||||
html := `
|
html := `
|
||||||
<div itemscope>
|
<div itemscope>
|
||||||
|
@ -78,7 +75,6 @@ func TestParseThreeProps(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
func TestParseImgSrc(t *testing.T) {
|
func TestParseImgSrc(t *testing.T) {
|
||||||
html := `
|
html := `
|
||||||
<div itemscope>
|
<div itemscope>
|
||||||
|
@ -158,7 +154,6 @@ func TestParseSourceSrc(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
func TestParseVideoSrc(t *testing.T) {
|
func TestParseVideoSrc(t *testing.T) {
|
||||||
html := `
|
html := `
|
||||||
<div itemscope>
|
<div itemscope>
|
||||||
|
@ -237,8 +232,6 @@ func TestParseTimeDatetime(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
func TestParseTwoValues(t *testing.T) {
|
func TestParseTwoValues(t *testing.T) {
|
||||||
html := `
|
html := `
|
||||||
<div itemscope>
|
<div itemscope>
|
||||||
|
@ -251,7 +244,7 @@ func TestParseTwoValues(t *testing.T) {
|
||||||
|
|
||||||
item := ParseOneItem(html, t)
|
item := ParseOneItem(html, t)
|
||||||
if len(item.properties["flavor"]) != 2 {
|
if len(item.properties["flavor"]) != 2 {
|
||||||
t.Errorf("Expecting 2 values but got %d",len(item.properties["flavor"]) )
|
t.Errorf("Expecting 2 values but got %d", len(item.properties["flavor"]))
|
||||||
}
|
}
|
||||||
if item.properties["flavor"][0].(string) != "Lemon sorbet" {
|
if item.properties["flavor"][0].(string) != "Lemon sorbet" {
|
||||||
t.Errorf("Property value 'Lemon sorbet' not found")
|
t.Errorf("Property value 'Lemon sorbet' not found")
|
||||||
|
@ -260,7 +253,6 @@ func TestParseTwoValues(t *testing.T) {
|
||||||
t.Errorf("Property value 'Apricot sorbet' not found")
|
t.Errorf("Property value 'Apricot sorbet' not found")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestParseTwoPropertiesOneValue(t *testing.T) {
|
func TestParseTwoPropertiesOneValue(t *testing.T) {
|
||||||
|
@ -271,13 +263,13 @@ func TestParseTwoPropertiesOneValue(t *testing.T) {
|
||||||
|
|
||||||
item := ParseOneItem(html, t)
|
item := ParseOneItem(html, t)
|
||||||
if len(item.properties) != 2 {
|
if len(item.properties) != 2 {
|
||||||
t.Errorf("Expecting 2 properties but got %d",len(item.properties) )
|
t.Errorf("Expecting 2 properties but got %d", len(item.properties))
|
||||||
}
|
}
|
||||||
if len(item.properties["favorite-color"]) != 1 {
|
if len(item.properties["favorite-color"]) != 1 {
|
||||||
t.Errorf("Expecting 1 value but got %d",len(item.properties["favorite-color"]) )
|
t.Errorf("Expecting 1 value but got %d", len(item.properties["favorite-color"]))
|
||||||
}
|
}
|
||||||
if len(item.properties["favorite-fruit"]) != 1 {
|
if len(item.properties["favorite-fruit"]) != 1 {
|
||||||
t.Errorf("Expecting 1 value but got %d",len(item.properties["favorite-fruit"]) )
|
t.Errorf("Expecting 1 value but got %d", len(item.properties["favorite-fruit"]))
|
||||||
}
|
}
|
||||||
if item.properties["favorite-color"][0].(string) != "orange" {
|
if item.properties["favorite-color"][0].(string) != "orange" {
|
||||||
t.Errorf("Property value 'orange' not found for 'favorite-color'")
|
t.Errorf("Property value 'orange' not found for 'favorite-color'")
|
||||||
|
@ -295,14 +287,14 @@ func TestParseTwoPropertiesOneValueMultispaced(t *testing.T) {
|
||||||
|
|
||||||
item := ParseOneItem(html, t)
|
item := ParseOneItem(html, t)
|
||||||
if len(item.properties) != 2 {
|
if len(item.properties) != 2 {
|
||||||
t.Errorf("Expecting 2 properties but got %d",len(item.properties) )
|
t.Errorf("Expecting 2 properties but got %d", len(item.properties))
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(item.properties["favorite-color"]) != 1 {
|
if len(item.properties["favorite-color"]) != 1 {
|
||||||
t.Errorf("Expecting 1 value but got %d",len(item.properties["favorite-color"]) )
|
t.Errorf("Expecting 1 value but got %d", len(item.properties["favorite-color"]))
|
||||||
}
|
}
|
||||||
if len(item.properties["favorite-fruit"]) != 1 {
|
if len(item.properties["favorite-fruit"]) != 1 {
|
||||||
t.Errorf("Expecting 1 value but got %d",len(item.properties["favorite-fruit"]) )
|
t.Errorf("Expecting 1 value but got %d", len(item.properties["favorite-fruit"]))
|
||||||
}
|
}
|
||||||
if item.properties["favorite-color"][0].(string) != "orange" {
|
if item.properties["favorite-color"][0].(string) != "orange" {
|
||||||
t.Errorf("Property value 'orange' not found for 'favorite-color'")
|
t.Errorf("Property value 'orange' not found for 'favorite-color'")
|
||||||
|
@ -320,11 +312,11 @@ func TestParseItemType(t *testing.T) {
|
||||||
|
|
||||||
item := ParseOneItem(html, t)
|
item := ParseOneItem(html, t)
|
||||||
if len(item.types) != 1 {
|
if len(item.types) != 1 {
|
||||||
t.Errorf("Expecting 1 type but got %d",len(item.types) )
|
t.Errorf("Expecting 1 type but got %d", len(item.types))
|
||||||
}
|
}
|
||||||
|
|
||||||
if item.types[0] != "http://example.org/animals#cat" {
|
if item.types[0] != "http://example.org/animals#cat" {
|
||||||
t.Errorf("Expecting type of 'http://example.org/animals#cat' but got %d",item.types[0])
|
t.Errorf("Expecting type of 'http://example.org/animals#cat' but got %d", item.types[0])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -336,14 +328,14 @@ func TestParseMultipleItemTypes(t *testing.T) {
|
||||||
|
|
||||||
item := ParseOneItem(html, t)
|
item := ParseOneItem(html, t)
|
||||||
if len(item.types) != 2 {
|
if len(item.types) != 2 {
|
||||||
t.Errorf("Expecting 2 types but got %d",len(item.types) )
|
t.Errorf("Expecting 2 types but got %d", len(item.types))
|
||||||
}
|
}
|
||||||
|
|
||||||
if item.types[0] != "http://example.org/animals#mammal" {
|
if item.types[0] != "http://example.org/animals#mammal" {
|
||||||
t.Errorf("Expecting type of 'http://example.org/animals#mammal' but got %d",item.types[0])
|
t.Errorf("Expecting type of 'http://example.org/animals#mammal' but got %d", item.types[0])
|
||||||
}
|
}
|
||||||
if item.types[1] != "http://example.org/animals#cat" {
|
if item.types[1] != "http://example.org/animals#cat" {
|
||||||
t.Errorf("Expecting type of 'http://example.org/animals#cat' but got %d",item.types[1])
|
t.Errorf("Expecting type of 'http://example.org/animals#cat' but got %d", item.types[1])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -362,12 +354,10 @@ func TestParseItemId(t *testing.T) {
|
||||||
item := ParseOneItem(html, t)
|
item := ParseOneItem(html, t)
|
||||||
|
|
||||||
if item.id != "urn:isbn:0-330-34032-8" {
|
if item.id != "urn:isbn:0-330-34032-8" {
|
||||||
t.Errorf("Expecting id of 'urn:isbn:0-330-34032-8' but got %d",item.id)
|
t.Errorf("Expecting id of 'urn:isbn:0-330-34032-8' but got %d", item.id)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
func TestParseItemRef(t *testing.T) {
|
func TestParseItemRef(t *testing.T) {
|
||||||
html := `<body><p><figure itemscope itemtype="http://n.whatwg.org/work" itemref="licenses">
|
html := `<body><p><figure itemscope itemtype="http://n.whatwg.org/work" itemref="licenses">
|
||||||
<img itemprop="work" src="images/house.jpeg" alt="A white house, boarded up, sits in a forest.">
|
<img itemprop="work" src="images/house.jpeg" alt="A white house, boarded up, sits in a forest.">
|
||||||
|
@ -379,9 +369,8 @@ func TestParseItemRef(t *testing.T) {
|
||||||
|
|
||||||
item := ParseOneItem(html, t)
|
item := ParseOneItem(html, t)
|
||||||
|
|
||||||
|
|
||||||
if len(item.properties) != 3 {
|
if len(item.properties) != 3 {
|
||||||
t.Errorf("Expecting 3 properties but got %d",len(item.properties) )
|
t.Errorf("Expecting 3 properties but got %d", len(item.properties))
|
||||||
}
|
}
|
||||||
|
|
||||||
if item.properties["license"][0].(string) != "http://www.opensource.org/licenses/mit-license.php" {
|
if item.properties["license"][0].(string) != "http://www.opensource.org/licenses/mit-license.php" {
|
||||||
|
@ -417,13 +406,13 @@ func TestParseSharedItemRef(t *testing.T) {
|
||||||
data := ParseData(html, t)
|
data := ParseData(html, t)
|
||||||
|
|
||||||
if len(data.items) != 2 {
|
if len(data.items) != 2 {
|
||||||
t.Errorf("Expecting 2 items but got %d",len(data.items) )
|
t.Errorf("Expecting 2 items but got %d", len(data.items))
|
||||||
}
|
}
|
||||||
if len(data.items[0].properties) != 3 {
|
if len(data.items[0].properties) != 3 {
|
||||||
t.Errorf("Expecting 3 properties but got %d",len(data.items[0].properties) )
|
t.Errorf("Expecting 3 properties but got %d", len(data.items[0].properties))
|
||||||
}
|
}
|
||||||
if len(data.items[1].properties) != 3 {
|
if len(data.items[1].properties) != 3 {
|
||||||
t.Errorf("Expecting 3 properties but got %d",len(data.items[1].properties) )
|
t.Errorf("Expecting 3 properties but got %d", len(data.items[1].properties))
|
||||||
}
|
}
|
||||||
|
|
||||||
if data.items[0].properties["license"][0].(string) != "http://www.opensource.org/licenses/mit-license.php" {
|
if data.items[0].properties["license"][0].(string) != "http://www.opensource.org/licenses/mit-license.php" {
|
||||||
|
@ -434,4 +423,26 @@ func TestParseSharedItemRef(t *testing.T) {
|
||||||
t.Errorf("Property value 'http://www.opensource.org/licenses/mit-license.php' not found for 'license'")
|
t.Errorf("Property value 'http://www.opensource.org/licenses/mit-license.php' not found for 'license'")
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestParseMultiValuedItemRef checks that an itemref attribute listing
// several space-separated ids ("a b") contributes the properties found
// under each referenced element to the referencing item.
//
// Lengths are checked before indexing so that a parsing regression is
// reported as a test failure instead of an index-out-of-range panic.
func TestParseMultiValuedItemRef(t *testing.T) {
	html := `<!DOCTYPE HTML>
<html>
<body>
<div itemscope id="amanda" itemref="a b"></div>
<p id="a">Name: <span itemprop="name">Amanda</span></p>
<p id="b">Age: <span itemprop="age">26</span></p>

</body>
</html>`

	data := ParseData(html, t)
	if data == nil {
		t.Fatalf("Expecting parsed microdata but got nil")
	}
	if len(data.items) == 0 {
		t.Fatalf("Expecting at least 1 item but got %d", len(data.items))
	}

	properties := data.items[0].properties
	expectations := []struct {
		name  string
		value string
	}{
		{"name", "Amanda"},
		{"age", "26"},
	}

	for _, want := range expectations {
		values := properties[want.name]
		// A missing property yields an empty list; report it rather than
		// indexing into it.
		if len(values) == 0 {
			t.Errorf("Property value '%s' not found for '%s'", want.value, want.name)
			continue
		}
		if got, ok := values[0].(string); !ok || got != want.value {
			t.Errorf("Property value '%s' not found for '%s'", want.value, want.name)
		}
	}
}
|
||||||
|
|
Loading…
Reference in New Issue