forked from ukamnya/microdata_mirror
		
	Added parsing of itemref
This commit is contained in:
		
							parent
							
								
									a58ae8f1fe
								
							
						
					
					
						commit
						aec670c37a
					
				
							
								
								
									
										44
									
								
								microdata.go
									
									
									
									
									
								
							
							
						
						
									
										44
									
								
								microdata.go
									
									
									
									
									
								
							@ -42,6 +42,7 @@ func NewMicrodata() *Microdata {
 | 
			
		||||
type Parser struct {
 | 
			
		||||
	p *h5.Parser
 | 
			
		||||
	data *Microdata
 | 
			
		||||
	identifiedNodes map[string]*h5.Node
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func NewParser(r io.Reader) *Parser {
 | 
			
		||||
@ -58,17 +59,23 @@ func (self *Parser) Parse() (*Microdata, error) {
 | 
			
		||||
	}
 | 
			
		||||
	tree := self.p.Tree()
 | 
			
		||||
 | 
			
		||||
	self.scanForItem(tree)
 | 
			
		||||
	topLevelItemNodes := make([]*h5.Node, 0)
 | 
			
		||||
	self.identifiedNodes = make(map[string]*h5.Node, 0)
 | 
			
		||||
 | 
			
		||||
	return self.data, nil
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func (self *Parser) scanForItem(node *h5.Node) {
 | 
			
		||||
	if node == nil {
 | 
			
		||||
		return
 | 
			
		||||
	}
 | 
			
		||||
	tree.Walk( func(n *h5.Node) {
 | 
			
		||||
		if _, exists := getAttr("itemscope", n); exists {
 | 
			
		||||
			if _, exists := getAttr("itemprop", n); !exists {
 | 
			
		||||
				topLevelItemNodes = append(topLevelItemNodes, n)
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
	if _, exists := getAttr("itemscope", node); exists {
 | 
			
		||||
		if id, exists := getAttr("id", n); exists {
 | 
			
		||||
			self.identifiedNodes[id] = n
 | 
			
		||||
		}
 | 
			
		||||
		})
 | 
			
		||||
 | 
			
		||||
	for _, node := range topLevelItemNodes {
 | 
			
		||||
		item := NewItem()
 | 
			
		||||
		self.data.items = append(self.data.items, item)
 | 
			
		||||
		if itemtypes, exists := getAttr("itemtype", node); exists {
 | 
			
		||||
@ -79,29 +86,30 @@ func (self *Parser) scanForItem(node *h5.Node) {
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
			// itemid only valid when itemscope and itemtype are both present
 | 
			
		||||
			if itemid, exists := getAttr("itemid", node); exists {
 | 
			
		||||
			if itemid, exists := getAttr("itemid", 	node); exists {
 | 
			
		||||
				item.id = strings.TrimSpace(itemid)
 | 
			
		||||
			}
 | 
			
		||||
			
 | 
			
		||||
		} 
 | 
			
		||||
 | 
			
		||||
		if itemref, exists := getAttr("itemref", node); exists {
 | 
			
		||||
			if refnode, exists := self.identifiedNodes[itemref]; exists {
 | 
			
		||||
	        	self.readItem(item, refnode)
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		if len(node.Children) > 0 {
 | 
			
		||||
	    	for _, child := range node.Children {
 | 
			
		||||
	        	self.readItem(item, child)
 | 
			
		||||
	        }
 | 
			
		||||
	    }
 | 
			
		||||
 | 
			
		||||
	} else {
 | 
			
		||||
		if len(node.Children) > 0 {
 | 
			
		||||
	    	for _, child := range node.Children {
 | 
			
		||||
	        	self.scanForItem(child)
 | 
			
		||||
	        }
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return self.data, nil
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
func (self *Parser) readItem(item *Item, node *h5.Node) {
 | 
			
		||||
	if itemprop, exists := getAttr("itemprop", node); exists {
 | 
			
		||||
		var propertyValue string
 | 
			
		||||
@ -133,7 +141,7 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
 | 
			
		||||
					}
 | 
			
		||||
 | 
			
		||||
				})
 | 
			
		||||
			propertyValue = text.String()
 | 
			
		||||
				propertyValue = text.String()
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
 | 
			
		||||
@ -144,6 +152,8 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	if len(node.Children) > 0 {
 | 
			
		||||
    	for _, child := range node.Children {
 | 
			
		||||
        	self.readItem(item, child)
 | 
			
		||||
 | 
			
		||||
@ -366,3 +366,72 @@ func TestParseItemId(t *testing.T) {
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
func TestParseItemRef(t *testing.T) {
 | 
			
		||||
	html := `<body><p><figure itemscope itemtype="http://n.whatwg.org/work" itemref="licenses">
 | 
			
		||||
   <img itemprop="work" src="images/house.jpeg" alt="A white house, boarded up, sits in a forest.">
 | 
			
		||||
   <figcaption itemprop="title">The house I found.</figcaption>
 | 
			
		||||
  </figure></p>
 | 
			
		||||
   <p id="licenses">All images licensed under the <a itemprop="license"
 | 
			
		||||
   href="http://www.opensource.org/licenses/mit-license.php">MIT
 | 
			
		||||
   license</a>.</p></body>`
 | 
			
		||||
 | 
			
		||||
	item := ParseOneItem(html, t)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	if len(item.properties) != 3 {
 | 
			
		||||
		t.Errorf("Expecting 3 properties but got %d",len(item.properties) )
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	if item.properties["license"][0].(string) != "http://www.opensource.org/licenses/mit-license.php" {
 | 
			
		||||
		t.Errorf("Property value 'http://www.opensource.org/licenses/mit-license.php' not found for 'license'")
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func TestParseSharedItemRef(t *testing.T) {
 | 
			
		||||
	html := `<!DOCTYPE HTML>
 | 
			
		||||
		<html>
 | 
			
		||||
		 <head>
 | 
			
		||||
		  <title>Photo gallery</title>
 | 
			
		||||
		 </head>
 | 
			
		||||
		 <body>
 | 
			
		||||
		  <h1>My photos</h1>
 | 
			
		||||
		  <figure itemscope itemtype="http://n.whatwg.org/work" itemref="licenses">
 | 
			
		||||
		   <img itemprop="work" src="images/house.jpeg" alt="A white house, boarded up, sits in a forest.">
 | 
			
		||||
		   <figcaption itemprop="title">The house I found.</figcaption>
 | 
			
		||||
		  </figure>
 | 
			
		||||
		  <figure itemscope itemtype="http://n.whatwg.org/work" itemref="licenses">
 | 
			
		||||
		   <img itemprop="work" src="images/mailbox.jpeg" alt="Outside the house is a mailbox. It has a leaflet inside.">
 | 
			
		||||
		   <figcaption itemprop="title">The mailbox.</figcaption>
 | 
			
		||||
		  </figure>
 | 
			
		||||
		  <footer>
 | 
			
		||||
		   <p id="licenses">All images licensed under the <a itemprop="license"
 | 
			
		||||
		   href="http://www.opensource.org/licenses/mit-license.php">MIT
 | 
			
		||||
		   license</a>.</p>
 | 
			
		||||
		  </footer>
 | 
			
		||||
		 </body>
 | 
			
		||||
		</html>`
 | 
			
		||||
 | 
			
		||||
	data := ParseData(html, t)
 | 
			
		||||
 | 
			
		||||
	if len(data.items) != 2 {
 | 
			
		||||
		t.Errorf("Expecting 2 items but got %d",len(data.items) )
 | 
			
		||||
	}
 | 
			
		||||
	if len(data.items[0].properties) != 3 {
 | 
			
		||||
		t.Errorf("Expecting 3 properties but got %d",len(data.items[0].properties) )
 | 
			
		||||
	}
 | 
			
		||||
	if len(data.items[1].properties) != 3 {
 | 
			
		||||
		t.Errorf("Expecting 3 properties but got %d",len(data.items[1].properties) )
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	if data.items[0].properties["license"][0].(string) != "http://www.opensource.org/licenses/mit-license.php" {
 | 
			
		||||
		t.Errorf("Property value 'http://www.opensource.org/licenses/mit-license.php' not found for 'license'")
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	if data.items[1].properties["license"][0].(string) != "http://www.opensource.org/licenses/mit-license.php" {
 | 
			
		||||
		t.Errorf("Property value 'http://www.opensource.org/licenses/mit-license.php' not found for 'license'")
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user