From 5a35df7849d1a93f30677b86525993376ac493e9 Mon Sep 17 00:00:00 2001 From: Ian Davis Date: Sun, 10 Jun 2012 14:59:30 +0100 Subject: [PATCH] Added multi-valued itemrefs --- microdata.go | 82 ++++++++++++++++++++++------------------------- microdata_test.go | 69 ++++++++++++++++++++++----------------- 2 files changed, 79 insertions(+), 72 deletions(-) diff --git a/microdata.go b/microdata.go index a42d024..3d7d706 100644 --- a/microdata.go +++ b/microdata.go @@ -7,21 +7,19 @@ import ( "strings" ) - - type ValueList []interface{} type PropertyMap map[string]ValueList type Item struct { properties PropertyMap - types []string - id string + types []string + id string } func NewItem() *Item { return &Item{ properties: make(PropertyMap, 0), - types: make([]string, 0), + types: make([]string, 0), } } @@ -40,14 +38,14 @@ func NewMicrodata() *Microdata { } type Parser struct { - p *h5.Parser - data *Microdata + p *h5.Parser + data *Microdata identifiedNodes map[string]*h5.Node } func NewParser(r io.Reader) *Parser { - return &Parser { - p : h5.NewParser(r), + return &Parser{ + p: h5.NewParser(r), data: NewMicrodata(), } } @@ -62,8 +60,7 @@ func (self *Parser) Parse() (*Microdata, error) { topLevelItemNodes := make([]*h5.Node, 0) self.identifiedNodes = make(map[string]*h5.Node, 0) - - tree.Walk( func(n *h5.Node) { + tree.Walk(func(n *h5.Node) { if _, exists := getAttr("itemscope", n); exists { if _, exists := getAttr("itemprop", n); !exists { topLevelItemNodes = append(topLevelItemNodes, n) @@ -73,7 +70,7 @@ func (self *Parser) Parse() (*Microdata, error) { if id, exists := getAttr("id", n); exists { self.identifiedNodes[id] = n } - }) + }) for _, node := range topLevelItemNodes { item := NewItem() @@ -86,62 +83,64 @@ func (self *Parser) Parse() (*Microdata, error) { } } // itemid only valid when itemscope and itemtype are both present - if itemid, exists := getAttr("itemid", node); exists { + if itemid, exists := getAttr("itemid", node); exists { item.id = strings.TrimSpace(itemid) } - - } - if itemref, exists := getAttr("itemref", node); exists { - if refnode, exists := self.identifiedNodes[itemref]; exists { - self.readItem(item, refnode) + } + + if itemrefs, exists := getAttr("itemref", node); exists { + for _, itemref := range strings.Split(strings.TrimSpace(itemrefs), " ") { + itemref = strings.TrimSpace(itemref) + + if refnode, exists := self.identifiedNodes[itemref]; exists { + self.readItem(item, refnode) + } } } if len(node.Children) > 0 { - for _, child := range node.Children { - self.readItem(item, child) - } - } + for _, child := range node.Children { + self.readItem(item, child) + } + } } return self.data, nil } - - func (self *Parser) readItem(item *Item, node *h5.Node) { if itemprop, exists := getAttr("itemprop", node); exists { var propertyValue string - + switch node.Data() { - case "img","audio", "source", "video", "embed", "iframe", "track": + case "img", "audio", "source", "video", "embed", "iframe", "track": if urlValue, exists := getAttr("src", node); exists { propertyValue = urlValue - } + } case "a", "area", "link": if urlValue, exists := getAttr("href", node); exists { propertyValue = urlValue - } + } case "data": if urlValue, exists := getAttr("value", node); exists { propertyValue = urlValue - } + } case "time": if urlValue, exists := getAttr("datetime", node); exists { propertyValue = urlValue - } + } default: var text bytes.Buffer - node.Walk( func(n *h5.Node) { - if n.Type == h5.TextNode { - text.WriteString(n.Data()) - } + node.Walk(func(n *h5.Node) { + if n.Type == h5.TextNode { + text.WriteString(n.Data()) + } - }) - propertyValue = text.String() + }) + propertyValue = text.String() } for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") { @@ -152,13 +151,11 @@ func (self *Parser) readItem(item *Item, node *h5.Node) { } } - - if len(node.Children) > 0 { - for _, child := range node.Children { - self.readItem(item, child) - } - } + for _, child := range node.Children { + self.readItem(item, child) + } + } } func getAttr(name string, node *h5.Node) (string, bool) { @@ -169,4 +166,3 @@ func getAttr(name string, node *h5.Node) (string, bool) { } return "", false } - diff --git a/microdata_test.go b/microdata_test.go index e0502f3..380e6b1 100644 --- a/microdata_test.go +++ b/microdata_test.go @@ -5,7 +5,6 @@ import ( "testing" ) - func ParseData(html string, t *testing.T) *Microdata { p := NewParser(strings.NewReader(html)) @@ -40,7 +39,6 @@ func TestParse(t *testing.T) { } - func TestParseActuallyParses(t *testing.T) { html := `
@@ -54,7 +52,6 @@ func TestParseActuallyParses(t *testing.T) { } - func TestParseThreeProps(t *testing.T) { html := `
@@ -78,7 +75,6 @@ func TestParseThreeProps(t *testing.T) { } } - func TestParseImgSrc(t *testing.T) { html := `
@@ -158,7 +154,6 @@ func TestParseSourceSrc(t *testing.T) { } } - func TestParseVideoSrc(t *testing.T) { html := `
@@ -237,8 +232,6 @@ func TestParseTimeDatetime(t *testing.T) { } } - - func TestParseTwoValues(t *testing.T) { html := `
@@ -251,7 +244,7 @@ func TestParseTwoValues(t *testing.T) { item := ParseOneItem(html, t) if len(item.properties["flavor"]) != 2 { - t.Errorf("Expecting 2 values but got %d",len(item.properties["flavor"]) ) + t.Errorf("Expecting 2 values but got %d", len(item.properties["flavor"])) } if item.properties["flavor"][0].(string) != "Lemon sorbet" { t.Errorf("Property value 'Lemon sorbet' not found") @@ -260,7 +253,6 @@ func TestParseTwoValues(t *testing.T) { t.Errorf("Property value 'Apricot sorbet' not found") } - } func TestParseTwoPropertiesOneValue(t *testing.T) { @@ -271,13 +263,13 @@ func TestParseTwoPropertiesOneValue(t *testing.T) { item := ParseOneItem(html, t) if len(item.properties) != 2 { - t.Errorf("Expecting 2 properties but got %d",len(item.properties) ) + t.Errorf("Expecting 2 properties but got %d", len(item.properties)) } if len(item.properties["favorite-color"]) != 1 { - t.Errorf("Expecting 1 value but got %d",len(item.properties["favorite-color"]) ) + t.Errorf("Expecting 1 value but got %d", len(item.properties["favorite-color"])) } if len(item.properties["favorite-fruit"]) != 1 { - t.Errorf("Expecting 1 value but got %d",len(item.properties["favorite-fruit"]) ) + t.Errorf("Expecting 1 value but got %d", len(item.properties["favorite-fruit"])) } if item.properties["favorite-color"][0].(string) != "orange" { t.Errorf("Property value 'orange' not found for 'favorite-color'") @@ -295,14 +287,14 @@ func TestParseTwoPropertiesOneValueMultispaced(t *testing.T) { item := ParseOneItem(html, t) if len(item.properties) != 2 { - t.Errorf("Expecting 2 properties but got %d",len(item.properties) ) + t.Errorf("Expecting 2 properties but got %d", len(item.properties)) } if len(item.properties["favorite-color"]) != 1 { - t.Errorf("Expecting 1 value but got %d",len(item.properties["favorite-color"]) ) + t.Errorf("Expecting 1 value but got %d", len(item.properties["favorite-color"])) } if len(item.properties["favorite-fruit"]) != 1 { - t.Errorf("Expecting 1 value but got %d",len(item.properties["favorite-fruit"]) ) + t.Errorf("Expecting 1 value but got %d", len(item.properties["favorite-fruit"])) } if item.properties["favorite-color"][0].(string) != "orange" { t.Errorf("Property value 'orange' not found for 'favorite-color'") @@ -320,11 +312,11 @@ func TestParseItemType(t *testing.T) { item := ParseOneItem(html, t) if len(item.types) != 1 { - t.Errorf("Expecting 1 type but got %d",len(item.types) ) + t.Errorf("Expecting 1 type but got %d", len(item.types)) } if item.types[0] != "http://example.org/animals#cat" { - t.Errorf("Expecting type of 'http://example.org/animals#cat' but got %d",item.types[0]) + t.Errorf("Expecting type of 'http://example.org/animals#cat' but got %d", item.types[0]) } } @@ -336,14 +328,14 @@ func TestParseMultipleItemTypes(t *testing.T) { item := ParseOneItem(html, t) if len(item.types) != 2 { - t.Errorf("Expecting 2 types but got %d",len(item.types) ) + t.Errorf("Expecting 2 types but got %d", len(item.types)) } if item.types[0] != "http://example.org/animals#mammal" { - t.Errorf("Expecting type of 'http://example.org/animals#mammal' but got %d",item.types[0]) + t.Errorf("Expecting type of 'http://example.org/animals#mammal' but got %d", item.types[0]) } if item.types[1] != "http://example.org/animals#cat" { - t.Errorf("Expecting type of 'http://example.org/animals#cat' but got %d",item.types[1]) + t.Errorf("Expecting type of 'http://example.org/animals#cat' but got %d", item.types[1]) } } @@ -362,12 +354,10 @@ func TestParseItemId(t *testing.T) { item := ParseOneItem(html, t) if item.id != "urn:isbn:0-330-34032-8" { - t.Errorf("Expecting id of 'urn:isbn:0-330-34032-8' but got %d",item.id) + t.Errorf("Expecting id of 'urn:isbn:0-330-34032-8' but got %d", item.id) } } - - func TestParseItemRef(t *testing.T) { html := `

A white house, boarded up, sits in a forest. @@ -379,9 +369,8 @@ func TestParseItemRef(t *testing.T) { item := ParseOneItem(html, t) - if len(item.properties) != 3 { - t.Errorf("Expecting 3 properties but got %d",len(item.properties) ) + t.Errorf("Expecting 3 properties but got %d", len(item.properties)) } if item.properties["license"][0].(string) != "http://www.opensource.org/licenses/mit-license.php" { @@ -417,13 +406,13 @@ func TestParseSharedItemRef(t *testing.T) { data := ParseData(html, t) if len(data.items) != 2 { - t.Errorf("Expecting 2 items but got %d",len(data.items) ) + t.Errorf("Expecting 2 items but got %d", len(data.items)) } if len(data.items[0].properties) != 3 { - t.Errorf("Expecting 3 properties but got %d",len(data.items[0].properties) ) + t.Errorf("Expecting 3 properties but got %d", len(data.items[0].properties)) } if len(data.items[1].properties) != 3 { - t.Errorf("Expecting 3 properties but got %d",len(data.items[1].properties) ) + t.Errorf("Expecting 3 properties but got %d", len(data.items[1].properties)) } if data.items[0].properties["license"][0].(string) != "http://www.opensource.org/licenses/mit-license.php" { @@ -434,4 +423,26 @@ func TestParseSharedItemRef(t *testing.T) { t.Errorf("Property value 'http://www.opensource.org/licenses/mit-license.php' not found for 'license'") } -} \ No newline at end of file +} + +func TestParseMultiValuedItemRef(t *testing.T) { + html := ` + + +
+

Name: Amanda

+

Age: 26

+ + + ` + + data := ParseData(html, t) + + if data.items[0].properties["name"][0].(string) != "Amanda" { + t.Errorf("Property value 'Amanda' not found for 'name'") + } + + if data.items[0].properties["age"][0].(string) != "26" { + t.Errorf("Property value '26' not found for 'age'") + } +}