forked from ukamnya/microdata_mirror
		
	Fixed breakage caused by h5 changes
This commit is contained in:
		
							parent
							
								
									c667405b6d
								
							
						
					
					
						commit
						5e88f404e0
					
				
							
								
								
									
										43
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										43
									
								
								README.md
									
									
									
									
									
								
							@ -1,19 +1,18 @@
 | 
			
		||||
microdata - a microdata parser in Go
 | 
			
		||||
# microdata
 | 
			
		||||
A microdata parser in Go
 | 
			
		||||
 | 
			
		||||
See [http://www.w3.org/TR/microdata/](http://www.w3.org/TR/microdata/) for more information about Microdata
 | 
			
		||||
 | 
			
		||||
INSTALLATION
 | 
			
		||||
============
 | 
			
		||||
## Installation
 | 
			
		||||
 | 
			
		||||
Simply run
 | 
			
		||||
 | 
			
		||||
	go get github.com/iand/microdata
 | 
			
		||||
 | 
			
		||||
Documentation is at [http://go.pkgdoc.org/github.com/iand/microdata](http://go.pkgdoc.org/github.com/iand/microdata)
 | 
			
		||||
Documentation is at [http://godoc.org/github.com/iand/microdata](http://godoc.org/github.com/iand/microdata)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
USAGE
 | 
			
		||||
=====
 | 
			
		||||
## Usage
 | 
			
		||||
 | 
			
		||||
Example of parsing a string containing HTML:
 | 
			
		||||
 | 
			
		||||
@ -72,10 +71,30 @@ Extract microdata from a webpage and print the result as JSON
 | 
			
		||||
	}		
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
LICENSE
 | 
			
		||||
=======
 | 
			
		||||
This code and associated documentation is in the public domain.
 | 
			
		||||
## Authors
 | 
			
		||||
 | 
			
		||||
To the extent possible under law, Ian Davis has waived all copyright
 | 
			
		||||
and related or neighboring rights to this file. This work is published 
 | 
			
		||||
from the United Kingdom. 
 | 
			
		||||
* [Ian Davis](http://github.com/iand) - <http://iandavis.com/>
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## Contributors
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## Contributing
 | 
			
		||||
 | 
			
		||||
* Do submit your changes as a pull request
 | 
			
		||||
* Do your best to adhere to the existing coding conventions and idioms.
 | 
			
		||||
* Do run `go fmt` on the code before committing 
 | 
			
		||||
* Do feel free to add yourself to the [`CREDITS`](CREDITS) file and the
 | 
			
		||||
  corresponding Contributors list in the the [`README.md`](README.md). 
 | 
			
		||||
  Alphabetical order applies.
 | 
			
		||||
* Don't touch the [`AUTHORS`](AUTHORS) file. An existing author will add you if 
 | 
			
		||||
  your contributions are significant enough.
 | 
			
		||||
* Do note that in order for any non-trivial changes to be merged (as a rule
 | 
			
		||||
  of thumb, additions larger than about 15 lines of code), an explicit
 | 
			
		||||
  Public Domain Dedication needs to be on record from you. Please include
 | 
			
		||||
  a copy of the statement found in the [`WAIVER`](WAIVER) file with your pull request
 | 
			
		||||
 | 
			
		||||
## License
 | 
			
		||||
 | 
			
		||||
This is free and unencumbered software released into the public domain. For more
 | 
			
		||||
information, see <http://unlicense.org/> or the accompanying [`UNLICENSE`](UNLICENSE) file.
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										24
									
								
								UNLICENSE
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								UNLICENSE
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,24 @@
 | 
			
		||||
This is free and unencumbered software released into the public domain.
 | 
			
		||||
 | 
			
		||||
Anyone is free to copy, modify, publish, use, compile, sell, or
 | 
			
		||||
distribute this software, either in source code form or as a compiled
 | 
			
		||||
binary, for any purpose, commercial or non-commercial, and by any
 | 
			
		||||
means.
 | 
			
		||||
 | 
			
		||||
In jurisdictions that recognize copyright laws, the author or authors
 | 
			
		||||
of this software dedicate any and all copyright interest in the
 | 
			
		||||
software to the public domain. We make this dedication for the benefit
 | 
			
		||||
of the public at large and to the detriment of our heirs and
 | 
			
		||||
successors. We intend this dedication to be an overt act of
 | 
			
		||||
relinquishment in perpetuity of all present and future rights to this
 | 
			
		||||
software under copyright law.
 | 
			
		||||
 | 
			
		||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 | 
			
		||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 | 
			
		||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | 
			
		||||
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 | 
			
		||||
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 | 
			
		||||
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 | 
			
		||||
OTHER DEALINGS IN THE SOFTWARE.
 | 
			
		||||
 | 
			
		||||
For more information, please refer to <http://unlicense.org/>
 | 
			
		||||
							
								
								
									
										5
									
								
								WAIVER
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								WAIVER
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,5 @@
 | 
			
		||||
I dedicate any and all copyright interest in this software to the
 | 
			
		||||
public domain. I make this dedication for the benefit of the public at
 | 
			
		||||
large and to the detriment of my heirs and successors. I intend this
 | 
			
		||||
dedication to be an overt act of relinquishment in perpetuity of all
 | 
			
		||||
present and future rights to this software under copyright law.
 | 
			
		||||
							
								
								
									
										81
									
								
								microdata.go
									
									
									
									
									
								
							
							
						
						
									
										81
									
								
								microdata.go
									
									
									
									
									
								
							@ -1,7 +1,6 @@
 | 
			
		||||
/*
 | 
			
		||||
  To the extent possible under law, Ian Davis has waived all copyright
 | 
			
		||||
  and related or neighboring rights to this Source Code file.
 | 
			
		||||
  This work is published from the United Kingdom. 
 | 
			
		||||
  This is free and unencumbered software released into the public domain. For more
 | 
			
		||||
  information, see <http://unlicense.org/> or the accompanying UNLICENSE file.
 | 
			
		||||
*/
 | 
			
		||||
 | 
			
		||||
// A package for parsing microdata
 | 
			
		||||
@ -11,6 +10,8 @@ package microdata
 | 
			
		||||
import (
 | 
			
		||||
	"bytes"
 | 
			
		||||
	"code.google.com/p/go-html-transform/h5"
 | 
			
		||||
	"code.google.com/p/go.net/html"
 | 
			
		||||
	"code.google.com/p/go.net/html/atom"
 | 
			
		||||
	"encoding/json"
 | 
			
		||||
	"io"
 | 
			
		||||
	"net/url"
 | 
			
		||||
@ -78,18 +79,20 @@ func (self *Microdata) Json() ([]byte, error) {
 | 
			
		||||
 | 
			
		||||
// An HTML parser that extracts microdata
 | 
			
		||||
type Parser struct {
 | 
			
		||||
	p               *h5.Parser
 | 
			
		||||
	p               *h5.Tree
 | 
			
		||||
	data            *Microdata
 | 
			
		||||
	base            *url.URL
 | 
			
		||||
	identifiedNodes map[string]*h5.Node
 | 
			
		||||
	identifiedNodes map[string]*html.Node
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Create a new parser for extracting microdata
 | 
			
		||||
// r is a reader over an HTML document
 | 
			
		||||
// base is the base URL for resolving relative URLs
 | 
			
		||||
func NewParser(r io.Reader, base *url.URL) *Parser {
 | 
			
		||||
	p, _ := h5.New(r)
 | 
			
		||||
 | 
			
		||||
	return &Parser{
 | 
			
		||||
		p:    h5.NewParser(r),
 | 
			
		||||
		p:    p,
 | 
			
		||||
		data: NewMicrodata(),
 | 
			
		||||
		base: base,
 | 
			
		||||
	}
 | 
			
		||||
@ -97,16 +100,12 @@ func NewParser(r io.Reader, base *url.URL) *Parser {
 | 
			
		||||
 | 
			
		||||
// Parse the document and return a Microdata set
 | 
			
		||||
func (self *Parser) Parse() (*Microdata, error) {
 | 
			
		||||
	err := self.p.Parse()
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		return nil, err
 | 
			
		||||
	}
 | 
			
		||||
	tree := self.p.Tree()
 | 
			
		||||
	tree := self.p
 | 
			
		||||
 | 
			
		||||
	topLevelItemNodes := make([]*h5.Node, 0)
 | 
			
		||||
	self.identifiedNodes = make(map[string]*h5.Node, 0)
 | 
			
		||||
	topLevelItemNodes := make([]*html.Node, 0)
 | 
			
		||||
	self.identifiedNodes = make(map[string]*html.Node, 0)
 | 
			
		||||
 | 
			
		||||
	tree.Walk(func(n *h5.Node) {
 | 
			
		||||
	tree.Walk(func(n *html.Node) {
 | 
			
		||||
		if _, exists := getAttr("itemscope", n); exists {
 | 
			
		||||
			if _, exists := getAttr("itemprop", n); !exists {
 | 
			
		||||
				topLevelItemNodes = append(topLevelItemNodes, n)
 | 
			
		||||
@ -147,17 +146,16 @@ func (self *Parser) Parse() (*Microdata, error) {
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		if len(node.Children) > 0 {
 | 
			
		||||
			for _, child := range node.Children {
 | 
			
		||||
				self.readItem(item, child)
 | 
			
		||||
			}
 | 
			
		||||
		for child := node.FirstChild; child != nil; {
 | 
			
		||||
			self.readItem(item, child)
 | 
			
		||||
			child = child.NextSibling
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return self.data, nil
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func (self *Parser) readItem(item *Item, node *h5.Node) {
 | 
			
		||||
func (self *Parser) readItem(item *Item, node *html.Node) {
 | 
			
		||||
	if itemprop, exists := getAttr("itemprop", node); exists {
 | 
			
		||||
		if _, exists := getAttr("itemscope", node); exists {
 | 
			
		||||
			subitem := NewItem()
 | 
			
		||||
@ -172,10 +170,9 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
 | 
			
		||||
			if len(node.Children) > 0 {
 | 
			
		||||
				for _, child := range node.Children {
 | 
			
		||||
					self.readItem(subitem, child)
 | 
			
		||||
				}
 | 
			
		||||
			for child := node.FirstChild; child != nil; {
 | 
			
		||||
				self.readItem(subitem, child)
 | 
			
		||||
				child = child.NextSibling
 | 
			
		||||
			}
 | 
			
		||||
 | 
			
		||||
			for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
 | 
			
		||||
@ -190,35 +187,42 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
 | 
			
		||||
		} else {
 | 
			
		||||
			var propertyValue string
 | 
			
		||||
 | 
			
		||||
			switch node.Data() {
 | 
			
		||||
 | 
			
		||||
			case "img", "audio", "source", "video", "embed", "iframe", "track":
 | 
			
		||||
			switch node.DataAtom {
 | 
			
		||||
			case atom.Meta:
 | 
			
		||||
				if val, exists := getAttr("content", node); exists {
 | 
			
		||||
					propertyValue = val
 | 
			
		||||
				}
 | 
			
		||||
			case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video:
 | 
			
		||||
				if urlValue, exists := getAttr("src", node); exists {
 | 
			
		||||
					if parsedUrl, err := self.base.Parse(urlValue); err == nil {
 | 
			
		||||
						propertyValue = parsedUrl.String()
 | 
			
		||||
					}
 | 
			
		||||
 | 
			
		||||
				}
 | 
			
		||||
			case "a", "area", "link":
 | 
			
		||||
			case atom.A, atom.Area, atom.Link:
 | 
			
		||||
				if urlValue, exists := getAttr("href", node); exists {
 | 
			
		||||
					if parsedUrl, err := self.base.Parse(urlValue); err == nil {
 | 
			
		||||
						propertyValue = parsedUrl.String()
 | 
			
		||||
					}
 | 
			
		||||
				}
 | 
			
		||||
			case "data":
 | 
			
		||||
			case atom.Object:
 | 
			
		||||
				if urlValue, exists := getAttr("data", node); exists {
 | 
			
		||||
					propertyValue = urlValue
 | 
			
		||||
				}
 | 
			
		||||
			case atom.Data, atom.Meter:
 | 
			
		||||
				if urlValue, exists := getAttr("value", node); exists {
 | 
			
		||||
					propertyValue = urlValue
 | 
			
		||||
				}
 | 
			
		||||
			case "time":
 | 
			
		||||
			case atom.Time:
 | 
			
		||||
				if urlValue, exists := getAttr("datetime", node); exists {
 | 
			
		||||
					propertyValue = urlValue
 | 
			
		||||
				}
 | 
			
		||||
 | 
			
		||||
			default:
 | 
			
		||||
				var text bytes.Buffer
 | 
			
		||||
				node.Walk(func(n *h5.Node) {
 | 
			
		||||
					if n.Type == h5.TextNode {
 | 
			
		||||
						text.WriteString(n.Data())
 | 
			
		||||
				h5.WalkNodes(node, func(n *html.Node) {
 | 
			
		||||
					if n.Type == html.TextNode {
 | 
			
		||||
						text.WriteString(n.Data)
 | 
			
		||||
					}
 | 
			
		||||
 | 
			
		||||
				})
 | 
			
		||||
@ -238,18 +242,17 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
 | 
			
		||||
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	if len(node.Children) > 0 {
 | 
			
		||||
		for _, child := range node.Children {
 | 
			
		||||
			self.readItem(item, child)
 | 
			
		||||
		}
 | 
			
		||||
	for child := node.FirstChild; child != nil; {
 | 
			
		||||
		self.readItem(item, child)
 | 
			
		||||
		child = child.NextSibling
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func getAttr(name string, node *h5.Node) (string, bool) {
 | 
			
		||||
func getAttr(name string, node *html.Node) (string, bool) {
 | 
			
		||||
	for _, a := range node.Attr {
 | 
			
		||||
		if a.Name == name {
 | 
			
		||||
			return a.Value, true
 | 
			
		||||
		if a.Key == name {
 | 
			
		||||
			return a.Val, true
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
	return "", false
 | 
			
		||||
 | 
			
		||||
@ -1,7 +1,6 @@
 | 
			
		||||
/*
 | 
			
		||||
  To the extent possible under law, Ian Davis has waived all copyright
 | 
			
		||||
  and related or neighboring rights to this Source Code file.
 | 
			
		||||
  This work is published from the United Kingdom. 
 | 
			
		||||
  This is free and unencumbered software released into the public domain. For more
 | 
			
		||||
  information, see <http://unlicense.org/> or the accompanying UNLICENSE file.
 | 
			
		||||
*/
 | 
			
		||||
 | 
			
		||||
package microdata
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user