forked from ukamnya/microdata_mirror
Added parsing of itemtype
parent
a650a2c9e9
commit
d607667c90
34
microdata.go
34
microdata.go
|
@ -4,6 +4,7 @@ import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"code.google.com/p/go-html-transform/h5"
|
"code.google.com/p/go-html-transform/h5"
|
||||||
"io"
|
"io"
|
||||||
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -13,11 +14,13 @@ type PropertyMap map[string]ValueList
|
||||||
|
|
||||||
type Item struct {
|
type Item struct {
|
||||||
properties PropertyMap
|
properties PropertyMap
|
||||||
|
types []string
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewItem() *Item {
|
func NewItem() *Item {
|
||||||
return &Item{
|
return &Item{
|
||||||
properties: make(PropertyMap, 10),
|
properties: make(PropertyMap, 0),
|
||||||
|
types: make([]string, 0),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -64,17 +67,19 @@ func (self *Parser) scanForItem(node *h5.Node) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
hasItemscope := false
|
if _, exists := getAttr("itemscope", node); exists {
|
||||||
|
|
||||||
for _, a := range node.Attr {
|
|
||||||
if a.Name == "itemscope" {
|
|
||||||
hasItemscope = true
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if hasItemscope {
|
|
||||||
item := NewItem()
|
item := NewItem()
|
||||||
self.data.items = append(self.data.items, item)
|
self.data.items = append(self.data.items, item)
|
||||||
|
if itemtypes, exists := getAttr("itemtype", node); exists {
|
||||||
|
for _, itemtype := range strings.Split(strings.TrimSpace(itemtypes), " ") {
|
||||||
|
itemtype = strings.TrimSpace(itemtype)
|
||||||
|
if itemtype != "" {
|
||||||
|
item.types = append(item.types, itemtype)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
if len(node.Children) > 0 {
|
if len(node.Children) > 0 {
|
||||||
|
@ -94,7 +99,7 @@ func (self *Parser) scanForItem(node *h5.Node) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (self *Parser) readItem(item *Item, node *h5.Node) {
|
func (self *Parser) readItem(item *Item, node *h5.Node) {
|
||||||
if propertyName, exists := getAttr("itemprop", node); exists {
|
if itemprop, exists := getAttr("itemprop", node); exists {
|
||||||
var propertyValue string
|
var propertyValue string
|
||||||
|
|
||||||
switch node.Data() {
|
switch node.Data() {
|
||||||
|
@ -127,7 +132,12 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
|
||||||
propertyValue = text.String()
|
propertyValue = text.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
item.SetString(propertyName, propertyValue)
|
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
|
||||||
|
propertyName = strings.TrimSpace(propertyName)
|
||||||
|
if propertyName != "" {
|
||||||
|
item.SetString(propertyName, propertyValue)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(node.Children) > 0 {
|
if len(node.Children) > 0 {
|
||||||
|
|
|
@ -270,10 +270,13 @@ func TestReadTwoPropertiesOneValue(t *testing.T) {
|
||||||
</div>`
|
</div>`
|
||||||
|
|
||||||
item := ReadOneItem(html, t)
|
item := ReadOneItem(html, t)
|
||||||
if len(item.properties["favorite-color"]) != 2 {
|
if len(item.properties) != 2 {
|
||||||
|
t.Errorf("Expecting 2 properties but got %d",len(item.properties) )
|
||||||
|
}
|
||||||
|
if len(item.properties["favorite-color"]) != 1 {
|
||||||
t.Errorf("Expecting 1 value but got %d",len(item.properties["favorite-color"]) )
|
t.Errorf("Expecting 1 value but got %d",len(item.properties["favorite-color"]) )
|
||||||
}
|
}
|
||||||
if len(item.properties["favorite-fruit"]) != 2 {
|
if len(item.properties["favorite-fruit"]) != 1 {
|
||||||
t.Errorf("Expecting 1 value but got %d",len(item.properties["favorite-fruit"]) )
|
t.Errorf("Expecting 1 value but got %d",len(item.properties["favorite-fruit"]) )
|
||||||
}
|
}
|
||||||
if item.properties["favorite-color"][0].(string) != "orange" {
|
if item.properties["favorite-color"][0].(string) != "orange" {
|
||||||
|
@ -282,6 +285,64 @@ func TestReadTwoPropertiesOneValue(t *testing.T) {
|
||||||
if item.properties["favorite-fruit"][0].(string) != "orange" {
|
if item.properties["favorite-fruit"][0].(string) != "orange" {
|
||||||
t.Errorf("Property value 'orange' not found for 'favorite-fruit'")
|
t.Errorf("Property value 'orange' not found for 'favorite-fruit'")
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestReadTwoPropertiesOneValueMultispaced(t *testing.T) {
|
||||||
|
html := `
|
||||||
|
<div itemscope>
|
||||||
|
<span itemprop=" favorite-color favorite-fruit ">orange</span>
|
||||||
|
</div>`
|
||||||
|
|
||||||
|
item := ReadOneItem(html, t)
|
||||||
|
if len(item.properties) != 2 {
|
||||||
|
t.Errorf("Expecting 2 properties but got %d",len(item.properties) )
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(item.properties["favorite-color"]) != 1 {
|
||||||
|
t.Errorf("Expecting 1 value but got %d",len(item.properties["favorite-color"]) )
|
||||||
|
}
|
||||||
|
if len(item.properties["favorite-fruit"]) != 1 {
|
||||||
|
t.Errorf("Expecting 1 value but got %d",len(item.properties["favorite-fruit"]) )
|
||||||
|
}
|
||||||
|
if item.properties["favorite-color"][0].(string) != "orange" {
|
||||||
|
t.Errorf("Property value 'orange' not found for 'favorite-color'")
|
||||||
|
}
|
||||||
|
if item.properties["favorite-fruit"][0].(string) != "orange" {
|
||||||
|
t.Errorf("Property value 'orange' not found for 'favorite-fruit'")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestReadItemType(t *testing.T) {
|
||||||
|
html := `
|
||||||
|
<div itemscope itemtype="http://example.org/animals#cat">
|
||||||
|
<h1 itemprop="name">Hedral</h1>
|
||||||
|
</div>`
|
||||||
|
|
||||||
|
item := ReadOneItem(html, t)
|
||||||
|
if len(item.types) != 1 {
|
||||||
|
t.Errorf("Expecting 1 type but got %d",len(item.types) )
|
||||||
|
}
|
||||||
|
|
||||||
|
if item.types[0] != "http://example.org/animals#cat" {
|
||||||
|
t.Errorf("Expecting type of 'http://example.org/animals#cat' but got %q",item.types[0])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestReadMultipleItemTypes(t *testing.T) {
|
||||||
|
html := `
|
||||||
|
<div itemscope itemtype=" http://example.org/animals#mammal http://example.org/animals#cat ">
|
||||||
|
<h1 itemprop="name">Hedral</h1>
|
||||||
|
</div>`
|
||||||
|
|
||||||
|
item := ReadOneItem(html, t)
|
||||||
|
if len(item.types) != 2 {
|
||||||
|
t.Errorf("Expecting 2 types but got %d",len(item.types) )
|
||||||
|
}
|
||||||
|
|
||||||
|
if item.types[0] != "http://example.org/animals#mammal" {
|
||||||
|
t.Errorf("Expecting type of 'http://example.org/animals#mammal' but got %q",item.types[0])
|
||||||
|
}
|
||||||
|
if item.types[1] != "http://example.org/animals#cat" {
|
||||||
|
t.Errorf("Expecting type of 'http://example.org/animals#cat' but got %q",item.types[1])
|
||||||
|
}
|
||||||
}
|
}
|
Loading…
Reference in New Issue