Fixed breakage caused by h5 changes
This commit is contained in:
		
							parent
							
								
									c667405b6d
								
							
						
					
					
						commit
						5e88f404e0
					
				
							
								
								
									
										43
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										43
									
								
								README.md
									
									
									
									
									
								
							| @ -1,19 +1,18 @@ | |||||||
| microdata - a microdata parser in Go | # microdata | ||||||
|  | A microdata parser in Go | ||||||
| 
 | 
 | ||||||
| See [http://www.w3.org/TR/microdata/](http://www.w3.org/TR/microdata/) for more information about Microdata | See [http://www.w3.org/TR/microdata/](http://www.w3.org/TR/microdata/) for more information about Microdata | ||||||
| 
 | 
 | ||||||
| INSTALLATION | ## Installation | ||||||
| ============ |  | ||||||
| 
 | 
 | ||||||
| Simply run | Simply run | ||||||
| 
 | 
 | ||||||
| 	go get github.com/iand/microdata | 	go get github.com/iand/microdata | ||||||
| 
 | 
 | ||||||
| Documentation is at [http://go.pkgdoc.org/github.com/iand/microdata](http://go.pkgdoc.org/github.com/iand/microdata) | Documentation is at [http://godoc.org/github.com/iand/microdata](http://godoc.org/github.com/iand/microdata) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| USAGE | ## Usage | ||||||
| ===== |  | ||||||
| 
 | 
 | ||||||
| Example of parsing a string containing HTML: | Example of parsing a string containing HTML: | ||||||
| 
 | 
 | ||||||
| @ -72,10 +71,30 @@ Extract microdata from a webpage and print the result as JSON | |||||||
| 	}		 | 	}		 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| LICENSE | ## Authors | ||||||
| ======= |  | ||||||
| This code and associated documentation is in the public domain. |  | ||||||
| 
 | 
 | ||||||
| To the extent possible under law, Ian Davis has waived all copyright | * [Ian Davis](http://github.com/iand) - <http://iandavis.com/> | ||||||
| and related or neighboring rights to this file. This work is published  | 
 | ||||||
| from the United Kingdom.  | 
 | ||||||
|  | ## Contributors | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | ## Contributing | ||||||
|  | 
 | ||||||
|  | * Do submit your changes as a pull request | ||||||
|  | * Do your best to adhere to the existing coding conventions and idioms. | ||||||
|  | * Do run `go fmt` on the code before committing  | ||||||
|  | * Do feel free to add yourself to the [`CREDITS`](CREDITS) file and the | ||||||
|  |   corresponding Contributors list in the [`README.md`](README.md).  | ||||||
|  |   Alphabetical order applies. | ||||||
|  | * Don't touch the [`AUTHORS`](AUTHORS) file. An existing author will add you if  | ||||||
|  |   your contributions are significant enough. | ||||||
|  | * Do note that in order for any non-trivial changes to be merged (as a rule | ||||||
|  |   of thumb, additions larger than about 15 lines of code), an explicit | ||||||
|  |   Public Domain Dedication needs to be on record from you. Please include | ||||||
|  |   a copy of the statement found in the [`WAIVER`](WAIVER) file with your pull request | ||||||
|  | 
 | ||||||
|  | ## License | ||||||
|  | 
 | ||||||
|  | This is free and unencumbered software released into the public domain. For more | ||||||
|  | information, see <http://unlicense.org/> or the accompanying [`UNLICENSE`](UNLICENSE) file. | ||||||
|  | |||||||
							
								
								
									
										24
									
								
								UNLICENSE
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										24
									
								
								UNLICENSE
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,24 @@ | |||||||
|  | This is free and unencumbered software released into the public domain. | ||||||
|  | 
 | ||||||
|  | Anyone is free to copy, modify, publish, use, compile, sell, or | ||||||
|  | distribute this software, either in source code form or as a compiled | ||||||
|  | binary, for any purpose, commercial or non-commercial, and by any | ||||||
|  | means. | ||||||
|  | 
 | ||||||
|  | In jurisdictions that recognize copyright laws, the author or authors | ||||||
|  | of this software dedicate any and all copyright interest in the | ||||||
|  | software to the public domain. We make this dedication for the benefit | ||||||
|  | of the public at large and to the detriment of our heirs and | ||||||
|  | successors. We intend this dedication to be an overt act of | ||||||
|  | relinquishment in perpetuity of all present and future rights to this | ||||||
|  | software under copyright law. | ||||||
|  | 
 | ||||||
|  | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||||||
|  | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||||||
|  | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | ||||||
|  | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR | ||||||
|  | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, | ||||||
|  | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | ||||||
|  | OTHER DEALINGS IN THE SOFTWARE. | ||||||
|  | 
 | ||||||
|  | For more information, please refer to <http://unlicense.org/> | ||||||
							
								
								
									
										5
									
								
								WAIVER
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								WAIVER
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,5 @@ | |||||||
|  | I dedicate any and all copyright interest in this software to the | ||||||
|  | public domain. I make this dedication for the benefit of the public at | ||||||
|  | large and to the detriment of my heirs and successors. I intend this | ||||||
|  | dedication to be an overt act of relinquishment in perpetuity of all | ||||||
|  | present and future rights to this software under copyright law. | ||||||
							
								
								
									
										75
									
								
								microdata.go
									
									
									
									
									
								
							
							
						
						
									
										75
									
								
								microdata.go
									
									
									
									
									
								
							| @ -1,7 +1,6 @@ | |||||||
| /* | /* | ||||||
|   To the extent possible under law, Ian Davis has waived all copyright |   This is free and unencumbered software released into the public domain. For more | ||||||
|   and related or neighboring rights to this Source Code file. |   information, see <http://unlicense.org/> or the accompanying UNLICENSE file. | ||||||
|   This work is published from the United Kingdom.  |  | ||||||
| */ | */ | ||||||
| 
 | 
 | ||||||
| // A package for parsing microdata | // A package for parsing microdata | ||||||
| @ -11,6 +10,8 @@ package microdata | |||||||
| import ( | import ( | ||||||
| 	"bytes" | 	"bytes" | ||||||
| 	"code.google.com/p/go-html-transform/h5" | 	"code.google.com/p/go-html-transform/h5" | ||||||
|  | 	"code.google.com/p/go.net/html" | ||||||
|  | 	"code.google.com/p/go.net/html/atom" | ||||||
| 	"encoding/json" | 	"encoding/json" | ||||||
| 	"io" | 	"io" | ||||||
| 	"net/url" | 	"net/url" | ||||||
| @ -78,18 +79,20 @@ func (self *Microdata) Json() ([]byte, error) { | |||||||
| 
 | 
 | ||||||
| // An HTML parser that extracts microdata | // An HTML parser that extracts microdata | ||||||
| type Parser struct { | type Parser struct { | ||||||
| 	p               *h5.Parser | 	p               *h5.Tree | ||||||
| 	data            *Microdata | 	data            *Microdata | ||||||
| 	base            *url.URL | 	base            *url.URL | ||||||
| 	identifiedNodes map[string]*h5.Node | 	identifiedNodes map[string]*html.Node | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| // Create a new parser for extracting microdata | // Create a new parser for extracting microdata | ||||||
| // r is a reader over an HTML document | // r is a reader over an HTML document | ||||||
| // base is the base URL for resolving relative URLs | // base is the base URL for resolving relative URLs | ||||||
| func NewParser(r io.Reader, base *url.URL) *Parser { | func NewParser(r io.Reader, base *url.URL) *Parser { | ||||||
|  | 	p, _ := h5.New(r) | ||||||
|  | 
 | ||||||
| 	return &Parser{ | 	return &Parser{ | ||||||
| 		p:    h5.NewParser(r), | 		p:    p, | ||||||
| 		data: NewMicrodata(), | 		data: NewMicrodata(), | ||||||
| 		base: base, | 		base: base, | ||||||
| 	} | 	} | ||||||
| @ -97,16 +100,12 @@ func NewParser(r io.Reader, base *url.URL) *Parser { | |||||||
| 
 | 
 | ||||||
| // Parse the document and return a Microdata set | // Parse the document and return a Microdata set | ||||||
| func (self *Parser) Parse() (*Microdata, error) { | func (self *Parser) Parse() (*Microdata, error) { | ||||||
| 	err := self.p.Parse() | 	tree := self.p | ||||||
| 	if err != nil { |  | ||||||
| 		return nil, err |  | ||||||
| 	} |  | ||||||
| 	tree := self.p.Tree() |  | ||||||
| 
 | 
 | ||||||
| 	topLevelItemNodes := make([]*h5.Node, 0) | 	topLevelItemNodes := make([]*html.Node, 0) | ||||||
| 	self.identifiedNodes = make(map[string]*h5.Node, 0) | 	self.identifiedNodes = make(map[string]*html.Node, 0) | ||||||
| 
 | 
 | ||||||
| 	tree.Walk(func(n *h5.Node) { | 	tree.Walk(func(n *html.Node) { | ||||||
| 		if _, exists := getAttr("itemscope", n); exists { | 		if _, exists := getAttr("itemscope", n); exists { | ||||||
| 			if _, exists := getAttr("itemprop", n); !exists { | 			if _, exists := getAttr("itemprop", n); !exists { | ||||||
| 				topLevelItemNodes = append(topLevelItemNodes, n) | 				topLevelItemNodes = append(topLevelItemNodes, n) | ||||||
| @ -147,17 +146,16 @@ func (self *Parser) Parse() (*Microdata, error) { | |||||||
| 			} | 			} | ||||||
| 		} | 		} | ||||||
| 
 | 
 | ||||||
| 		if len(node.Children) > 0 { | 		for child := node.FirstChild; child != nil; { | ||||||
| 			for _, child := range node.Children { |  | ||||||
| 			self.readItem(item, child) | 			self.readItem(item, child) | ||||||
| 			} | 			child = child.NextSibling | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	return self.data, nil | 	return self.data, nil | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| func (self *Parser) readItem(item *Item, node *h5.Node) { | func (self *Parser) readItem(item *Item, node *html.Node) { | ||||||
| 	if itemprop, exists := getAttr("itemprop", node); exists { | 	if itemprop, exists := getAttr("itemprop", node); exists { | ||||||
| 		if _, exists := getAttr("itemscope", node); exists { | 		if _, exists := getAttr("itemscope", node); exists { | ||||||
| 			subitem := NewItem() | 			subitem := NewItem() | ||||||
| @ -172,10 +170,9 @@ func (self *Parser) readItem(item *Item, node *h5.Node) { | |||||||
| 				} | 				} | ||||||
| 			} | 			} | ||||||
| 
 | 
 | ||||||
| 			if len(node.Children) > 0 { | 			for child := node.FirstChild; child != nil; { | ||||||
| 				for _, child := range node.Children { |  | ||||||
| 				self.readItem(subitem, child) | 				self.readItem(subitem, child) | ||||||
| 				} | 				child = child.NextSibling | ||||||
| 			} | 			} | ||||||
| 
 | 
 | ||||||
| 			for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") { | 			for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") { | ||||||
| @ -190,35 +187,42 @@ func (self *Parser) readItem(item *Item, node *h5.Node) { | |||||||
| 		} else { | 		} else { | ||||||
| 			var propertyValue string | 			var propertyValue string | ||||||
| 
 | 
 | ||||||
| 			switch node.Data() { | 			switch node.DataAtom { | ||||||
| 
 | 			case atom.Meta: | ||||||
| 			case "img", "audio", "source", "video", "embed", "iframe", "track": | 				if val, exists := getAttr("content", node); exists { | ||||||
|  | 					propertyValue = val | ||||||
|  | 				} | ||||||
|  | 			case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video: | ||||||
| 				if urlValue, exists := getAttr("src", node); exists { | 				if urlValue, exists := getAttr("src", node); exists { | ||||||
| 					if parsedUrl, err := self.base.Parse(urlValue); err == nil { | 					if parsedUrl, err := self.base.Parse(urlValue); err == nil { | ||||||
| 						propertyValue = parsedUrl.String() | 						propertyValue = parsedUrl.String() | ||||||
| 					} | 					} | ||||||
| 
 | 
 | ||||||
| 				} | 				} | ||||||
| 			case "a", "area", "link": | 			case atom.A, atom.Area, atom.Link: | ||||||
| 				if urlValue, exists := getAttr("href", node); exists { | 				if urlValue, exists := getAttr("href", node); exists { | ||||||
| 					if parsedUrl, err := self.base.Parse(urlValue); err == nil { | 					if parsedUrl, err := self.base.Parse(urlValue); err == nil { | ||||||
| 						propertyValue = parsedUrl.String() | 						propertyValue = parsedUrl.String() | ||||||
| 					} | 					} | ||||||
| 				} | 				} | ||||||
| 			case "data": | 			case atom.Object: | ||||||
|  | 				if urlValue, exists := getAttr("data", node); exists { | ||||||
|  | 					propertyValue = urlValue | ||||||
|  | 				} | ||||||
|  | 			case atom.Data, atom.Meter: | ||||||
| 				if urlValue, exists := getAttr("value", node); exists { | 				if urlValue, exists := getAttr("value", node); exists { | ||||||
| 					propertyValue = urlValue | 					propertyValue = urlValue | ||||||
| 				} | 				} | ||||||
| 			case "time": | 			case atom.Time: | ||||||
| 				if urlValue, exists := getAttr("datetime", node); exists { | 				if urlValue, exists := getAttr("datetime", node); exists { | ||||||
| 					propertyValue = urlValue | 					propertyValue = urlValue | ||||||
| 				} | 				} | ||||||
| 
 | 
 | ||||||
| 			default: | 			default: | ||||||
| 				var text bytes.Buffer | 				var text bytes.Buffer | ||||||
| 				node.Walk(func(n *h5.Node) { | 				h5.WalkNodes(node, func(n *html.Node) { | ||||||
| 					if n.Type == h5.TextNode { | 					if n.Type == html.TextNode { | ||||||
| 						text.WriteString(n.Data()) | 						text.WriteString(n.Data) | ||||||
| 					} | 					} | ||||||
| 
 | 
 | ||||||
| 				}) | 				}) | ||||||
| @ -238,18 +242,17 @@ func (self *Parser) readItem(item *Item, node *h5.Node) { | |||||||
| 
 | 
 | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	if len(node.Children) > 0 { | 	for child := node.FirstChild; child != nil; { | ||||||
| 		for _, child := range node.Children { |  | ||||||
| 		self.readItem(item, child) | 		self.readItem(item, child) | ||||||
| 		} | 		child = child.NextSibling | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| func getAttr(name string, node *h5.Node) (string, bool) { | func getAttr(name string, node *html.Node) (string, bool) { | ||||||
| 	for _, a := range node.Attr { | 	for _, a := range node.Attr { | ||||||
| 		if a.Name == name { | 		if a.Key == name { | ||||||
| 			return a.Value, true | 			return a.Val, true | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| 	return "", false | 	return "", false | ||||||
|  | |||||||
| @ -1,7 +1,6 @@ | |||||||
| /* | /* | ||||||
|   To the extent possible under law, Ian Davis has waived all copyright |   This is free and unencumbered software released into the public domain. For more | ||||||
|   and related or neighboring rights to this Source Code file. |   information, see <http://unlicense.org/> or the accompanying UNLICENSE file. | ||||||
|   This work is published from the United Kingdom.  |  | ||||||
| */ | */ | ||||||
| 
 | 
 | ||||||
| package microdata | package microdata | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user