forked from ukamnya/microdata_mirror
		
	Added multi-valued itemrefs
This commit is contained in:
		
							parent
							
								
									aec670c37a
								
							
						
					
					
						commit
						5a35df7849
					
				
							
								
								
									
										82
									
								
								microdata.go
									
									
									
									
									
								
							
							
						
						
									
										82
									
								
								microdata.go
									
									
									
									
									
								
							@ -7,21 +7,19 @@ import (
 | 
			
		||||
	"strings"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
type ValueList []interface{}
 | 
			
		||||
type PropertyMap map[string]ValueList
 | 
			
		||||
 | 
			
		||||
type Item struct {
 | 
			
		||||
	properties PropertyMap
 | 
			
		||||
	types []string
 | 
			
		||||
	id string
 | 
			
		||||
	types      []string
 | 
			
		||||
	id         string
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func NewItem() *Item {
 | 
			
		||||
	return &Item{
 | 
			
		||||
		properties: make(PropertyMap, 0),
 | 
			
		||||
		types: make([]string, 0),
 | 
			
		||||
		types:      make([]string, 0),
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@ -40,14 +38,14 @@ func NewMicrodata() *Microdata {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
type Parser struct {
 | 
			
		||||
	p *h5.Parser
 | 
			
		||||
	data *Microdata
 | 
			
		||||
	p               *h5.Parser
 | 
			
		||||
	data            *Microdata
 | 
			
		||||
	identifiedNodes map[string]*h5.Node
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func NewParser(r io.Reader) *Parser {
 | 
			
		||||
	return &Parser {
 | 
			
		||||
		p : h5.NewParser(r),
 | 
			
		||||
	return &Parser{
 | 
			
		||||
		p:    h5.NewParser(r),
 | 
			
		||||
		data: NewMicrodata(),
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
@ -62,8 +60,7 @@ func (self *Parser) Parse() (*Microdata, error) {
 | 
			
		||||
	topLevelItemNodes := make([]*h5.Node, 0)
 | 
			
		||||
	self.identifiedNodes = make(map[string]*h5.Node, 0)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	tree.Walk( func(n *h5.Node) {
 | 
			
		||||
	tree.Walk(func(n *h5.Node) {
 | 
			
		||||
		if _, exists := getAttr("itemscope", n); exists {
 | 
			
		||||
			if _, exists := getAttr("itemprop", n); !exists {
 | 
			
		||||
				topLevelItemNodes = append(topLevelItemNodes, n)
 | 
			
		||||
@ -73,7 +70,7 @@ func (self *Parser) Parse() (*Microdata, error) {
 | 
			
		||||
		if id, exists := getAttr("id", n); exists {
 | 
			
		||||
			self.identifiedNodes[id] = n
 | 
			
		||||
		}
 | 
			
		||||
		})
 | 
			
		||||
	})
 | 
			
		||||
 | 
			
		||||
	for _, node := range topLevelItemNodes {
 | 
			
		||||
		item := NewItem()
 | 
			
		||||
@ -86,62 +83,64 @@ func (self *Parser) Parse() (*Microdata, error) {
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
			// itemid only valid when itemscope and itemtype are both present
 | 
			
		||||
			if itemid, exists := getAttr("itemid", 	node); exists {
 | 
			
		||||
			if itemid, exists := getAttr("itemid", node); exists {
 | 
			
		||||
				item.id = strings.TrimSpace(itemid)
 | 
			
		||||
			}
 | 
			
		||||
			
 | 
			
		||||
		} 
 | 
			
		||||
 | 
			
		||||
		if itemref, exists := getAttr("itemref", node); exists {
 | 
			
		||||
			if refnode, exists := self.identifiedNodes[itemref]; exists {
 | 
			
		||||
	        	self.readItem(item, refnode)
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		if itemrefs, exists := getAttr("itemref", node); exists {
 | 
			
		||||
			for _, itemref := range strings.Split(strings.TrimSpace(itemrefs), " ") {
 | 
			
		||||
				itemref = strings.TrimSpace(itemref)
 | 
			
		||||
 | 
			
		||||
				if refnode, exists := self.identifiedNodes[itemref]; exists {
 | 
			
		||||
					self.readItem(item, refnode)
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		if len(node.Children) > 0 {
 | 
			
		||||
	    	for _, child := range node.Children {
 | 
			
		||||
	        	self.readItem(item, child)
 | 
			
		||||
	        }
 | 
			
		||||
	    }
 | 
			
		||||
			for _, child := range node.Children {
 | 
			
		||||
				self.readItem(item, child)
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return self.data, nil
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
func (self *Parser) readItem(item *Item, node *h5.Node) {
 | 
			
		||||
	if itemprop, exists := getAttr("itemprop", node); exists {
 | 
			
		||||
		var propertyValue string
 | 
			
		||||
		
 | 
			
		||||
 | 
			
		||||
		switch node.Data() {
 | 
			
		||||
 | 
			
		||||
		case "img","audio", "source", "video", "embed", "iframe", "track":
 | 
			
		||||
		case "img", "audio", "source", "video", "embed", "iframe", "track":
 | 
			
		||||
			if urlValue, exists := getAttr("src", node); exists {
 | 
			
		||||
				propertyValue = urlValue
 | 
			
		||||
			} 
 | 
			
		||||
			}
 | 
			
		||||
		case "a", "area", "link":
 | 
			
		||||
			if urlValue, exists := getAttr("href", node); exists {
 | 
			
		||||
				propertyValue = urlValue
 | 
			
		||||
			} 
 | 
			
		||||
			}
 | 
			
		||||
		case "data":
 | 
			
		||||
			if urlValue, exists := getAttr("value", node); exists {
 | 
			
		||||
				propertyValue = urlValue
 | 
			
		||||
			} 
 | 
			
		||||
			}
 | 
			
		||||
		case "time":
 | 
			
		||||
			if urlValue, exists := getAttr("datetime", node); exists {
 | 
			
		||||
				propertyValue = urlValue
 | 
			
		||||
			} 
 | 
			
		||||
			}
 | 
			
		||||
 | 
			
		||||
		default:
 | 
			
		||||
			var text bytes.Buffer
 | 
			
		||||
			node.Walk( func(n *h5.Node) {
 | 
			
		||||
					if n.Type == h5.TextNode {
 | 
			
		||||
						text.WriteString(n.Data())
 | 
			
		||||
					}
 | 
			
		||||
			node.Walk(func(n *h5.Node) {
 | 
			
		||||
				if n.Type == h5.TextNode {
 | 
			
		||||
					text.WriteString(n.Data())
 | 
			
		||||
				}
 | 
			
		||||
 | 
			
		||||
				})
 | 
			
		||||
				propertyValue = text.String()
 | 
			
		||||
			})
 | 
			
		||||
			propertyValue = text.String()
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
 | 
			
		||||
@ -152,13 +151,11 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	if len(node.Children) > 0 {
 | 
			
		||||
    	for _, child := range node.Children {
 | 
			
		||||
        	self.readItem(item, child)
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
		for _, child := range node.Children {
 | 
			
		||||
			self.readItem(item, child)
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func getAttr(name string, node *h5.Node) (string, bool) {
 | 
			
		||||
@ -169,4 +166,3 @@ func getAttr(name string, node *h5.Node) (string, bool) {
 | 
			
		||||
	}
 | 
			
		||||
	return "", false
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -5,7 +5,6 @@ import (
 | 
			
		||||
	"testing"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
func ParseData(html string, t *testing.T) *Microdata {
 | 
			
		||||
	p := NewParser(strings.NewReader(html))
 | 
			
		||||
 | 
			
		||||
@ -40,7 +39,6 @@ func TestParse(t *testing.T) {
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
func TestParseActuallyParses(t *testing.T) {
 | 
			
		||||
	html := `
 | 
			
		||||
	<div itemscope>
 | 
			
		||||
@ -54,7 +52,6 @@ func TestParseActuallyParses(t *testing.T) {
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
func TestParseThreeProps(t *testing.T) {
 | 
			
		||||
	html := `
 | 
			
		||||
	<div itemscope>
 | 
			
		||||
@ -78,7 +75,6 @@ func TestParseThreeProps(t *testing.T) {
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
func TestParseImgSrc(t *testing.T) {
 | 
			
		||||
	html := `
 | 
			
		||||
	<div itemscope>
 | 
			
		||||
@ -158,7 +154,6 @@ func TestParseSourceSrc(t *testing.T) {
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
func TestParseVideoSrc(t *testing.T) {
 | 
			
		||||
	html := `
 | 
			
		||||
	<div itemscope>
 | 
			
		||||
@ -237,8 +232,6 @@ func TestParseTimeDatetime(t *testing.T) {
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
func TestParseTwoValues(t *testing.T) {
 | 
			
		||||
	html := `
 | 
			
		||||
	<div itemscope>
 | 
			
		||||
@ -251,7 +244,7 @@ func TestParseTwoValues(t *testing.T) {
 | 
			
		||||
 | 
			
		||||
	item := ParseOneItem(html, t)
 | 
			
		||||
	if len(item.properties["flavor"]) != 2 {
 | 
			
		||||
		t.Errorf("Expecting 2 values but got %d",len(item.properties["flavor"]) )
 | 
			
		||||
		t.Errorf("Expecting 2 values but got %d", len(item.properties["flavor"]))
 | 
			
		||||
	}
 | 
			
		||||
	if item.properties["flavor"][0].(string) != "Lemon sorbet" {
 | 
			
		||||
		t.Errorf("Property value 'Lemon sorbet' not found")
 | 
			
		||||
@ -260,7 +253,6 @@ func TestParseTwoValues(t *testing.T) {
 | 
			
		||||
		t.Errorf("Property value 'Apricot sorbet' not found")
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func TestParseTwoPropertiesOneValue(t *testing.T) {
 | 
			
		||||
@ -271,13 +263,13 @@ func TestParseTwoPropertiesOneValue(t *testing.T) {
 | 
			
		||||
 | 
			
		||||
	item := ParseOneItem(html, t)
 | 
			
		||||
	if len(item.properties) != 2 {
 | 
			
		||||
		t.Errorf("Expecting 2 properties but got %d",len(item.properties) )
 | 
			
		||||
		t.Errorf("Expecting 2 properties but got %d", len(item.properties))
 | 
			
		||||
	}
 | 
			
		||||
	if len(item.properties["favorite-color"]) != 1 {
 | 
			
		||||
		t.Errorf("Expecting 1 value but got %d",len(item.properties["favorite-color"]) )
 | 
			
		||||
		t.Errorf("Expecting 1 value but got %d", len(item.properties["favorite-color"]))
 | 
			
		||||
	}
 | 
			
		||||
	if len(item.properties["favorite-fruit"]) != 1 {
 | 
			
		||||
		t.Errorf("Expecting 1 value but got %d",len(item.properties["favorite-fruit"]) )
 | 
			
		||||
		t.Errorf("Expecting 1 value but got %d", len(item.properties["favorite-fruit"]))
 | 
			
		||||
	}
 | 
			
		||||
	if item.properties["favorite-color"][0].(string) != "orange" {
 | 
			
		||||
		t.Errorf("Property value 'orange' not found for 'favorite-color'")
 | 
			
		||||
@ -295,14 +287,14 @@ func TestParseTwoPropertiesOneValueMultispaced(t *testing.T) {
 | 
			
		||||
 | 
			
		||||
	item := ParseOneItem(html, t)
 | 
			
		||||
	if len(item.properties) != 2 {
 | 
			
		||||
		t.Errorf("Expecting 2 properties but got %d",len(item.properties) )
 | 
			
		||||
		t.Errorf("Expecting 2 properties but got %d", len(item.properties))
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	if len(item.properties["favorite-color"]) != 1 {
 | 
			
		||||
		t.Errorf("Expecting 1 value but got %d",len(item.properties["favorite-color"]) )
 | 
			
		||||
		t.Errorf("Expecting 1 value but got %d", len(item.properties["favorite-color"]))
 | 
			
		||||
	}
 | 
			
		||||
	if len(item.properties["favorite-fruit"]) != 1 {
 | 
			
		||||
		t.Errorf("Expecting 1 value but got %d",len(item.properties["favorite-fruit"]) )
 | 
			
		||||
		t.Errorf("Expecting 1 value but got %d", len(item.properties["favorite-fruit"]))
 | 
			
		||||
	}
 | 
			
		||||
	if item.properties["favorite-color"][0].(string) != "orange" {
 | 
			
		||||
		t.Errorf("Property value 'orange' not found for 'favorite-color'")
 | 
			
		||||
@ -320,11 +312,11 @@ func TestParseItemType(t *testing.T) {
 | 
			
		||||
 | 
			
		||||
	item := ParseOneItem(html, t)
 | 
			
		||||
	if len(item.types) != 1 {
 | 
			
		||||
		t.Errorf("Expecting 1 type but got %d",len(item.types) )	
 | 
			
		||||
		t.Errorf("Expecting 1 type but got %d", len(item.types))
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	if item.types[0] != "http://example.org/animals#cat" {
 | 
			
		||||
		t.Errorf("Expecting type of 'http://example.org/animals#cat' but got %d",item.types[0]) 
 | 
			
		||||
		t.Errorf("Expecting type of 'http://example.org/animals#cat' but got %d", item.types[0])
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@ -336,14 +328,14 @@ func TestParseMultipleItemTypes(t *testing.T) {
 | 
			
		||||
 | 
			
		||||
	item := ParseOneItem(html, t)
 | 
			
		||||
	if len(item.types) != 2 {
 | 
			
		||||
		t.Errorf("Expecting 2 types but got %d",len(item.types) )	
 | 
			
		||||
		t.Errorf("Expecting 2 types but got %d", len(item.types))
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	if item.types[0] != "http://example.org/animals#mammal" {
 | 
			
		||||
		t.Errorf("Expecting type of 'http://example.org/animals#mammal' but got %d",item.types[0]) 
 | 
			
		||||
		t.Errorf("Expecting type of 'http://example.org/animals#mammal' but got %d", item.types[0])
 | 
			
		||||
	}
 | 
			
		||||
	if item.types[1] != "http://example.org/animals#cat" {
 | 
			
		||||
		t.Errorf("Expecting type of 'http://example.org/animals#cat' but got %d",item.types[1]) 
 | 
			
		||||
		t.Errorf("Expecting type of 'http://example.org/animals#cat' but got %d", item.types[1])
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@ -362,12 +354,10 @@ func TestParseItemId(t *testing.T) {
 | 
			
		||||
	item := ParseOneItem(html, t)
 | 
			
		||||
 | 
			
		||||
	if item.id != "urn:isbn:0-330-34032-8" {
 | 
			
		||||
		t.Errorf("Expecting id of 'urn:isbn:0-330-34032-8' but got %d",item.id) 
 | 
			
		||||
		t.Errorf("Expecting id of 'urn:isbn:0-330-34032-8' but got %d", item.id)
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
func TestParseItemRef(t *testing.T) {
 | 
			
		||||
	html := `<body><p><figure itemscope itemtype="http://n.whatwg.org/work" itemref="licenses">
 | 
			
		||||
   <img itemprop="work" src="images/house.jpeg" alt="A white house, boarded up, sits in a forest.">
 | 
			
		||||
@ -379,9 +369,8 @@ func TestParseItemRef(t *testing.T) {
 | 
			
		||||
 | 
			
		||||
	item := ParseOneItem(html, t)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	if len(item.properties) != 3 {
 | 
			
		||||
		t.Errorf("Expecting 3 properties but got %d",len(item.properties) )
 | 
			
		||||
		t.Errorf("Expecting 3 properties but got %d", len(item.properties))
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	if item.properties["license"][0].(string) != "http://www.opensource.org/licenses/mit-license.php" {
 | 
			
		||||
@ -417,13 +406,13 @@ func TestParseSharedItemRef(t *testing.T) {
 | 
			
		||||
	data := ParseData(html, t)
 | 
			
		||||
 | 
			
		||||
	if len(data.items) != 2 {
 | 
			
		||||
		t.Errorf("Expecting 2 items but got %d",len(data.items) )
 | 
			
		||||
		t.Errorf("Expecting 2 items but got %d", len(data.items))
 | 
			
		||||
	}
 | 
			
		||||
	if len(data.items[0].properties) != 3 {
 | 
			
		||||
		t.Errorf("Expecting 3 properties but got %d",len(data.items[0].properties) )
 | 
			
		||||
		t.Errorf("Expecting 3 properties but got %d", len(data.items[0].properties))
 | 
			
		||||
	}
 | 
			
		||||
	if len(data.items[1].properties) != 3 {
 | 
			
		||||
		t.Errorf("Expecting 3 properties but got %d",len(data.items[1].properties) )
 | 
			
		||||
		t.Errorf("Expecting 3 properties but got %d", len(data.items[1].properties))
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	if data.items[0].properties["license"][0].(string) != "http://www.opensource.org/licenses/mit-license.php" {
 | 
			
		||||
@ -434,4 +423,26 @@ func TestParseSharedItemRef(t *testing.T) {
 | 
			
		||||
		t.Errorf("Property value 'http://www.opensource.org/licenses/mit-license.php' not found for 'license'")
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func TestParseMultiValuedItemRef(t *testing.T) {
 | 
			
		||||
	html := `<!DOCTYPE HTML>
 | 
			
		||||
		<html>
 | 
			
		||||
		 <body>
 | 
			
		||||
		 	<div itemscope id="amanda" itemref="a b"></div>
 | 
			
		||||
			<p id="a">Name: <span itemprop="name">Amanda</span></p>
 | 
			
		||||
			<p id="b">Age: <span itemprop="age">26</span></p>
 | 
			
		||||
 | 
			
		||||
		 </body>
 | 
			
		||||
		</html>`
 | 
			
		||||
 | 
			
		||||
	data := ParseData(html, t)
 | 
			
		||||
 | 
			
		||||
	if data.items[0].properties["name"][0].(string) != "Amanda" {
 | 
			
		||||
		t.Errorf("Property value 'Amanda' not found for 'name'")
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	if data.items[0].properties["age"][0].(string) != "26" {
 | 
			
		||||
		t.Errorf("Property value '26' not found for 'age'")
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user