Fixed breakage caused by h5 changes

pull/2/head
Ian Davis 2013-07-10 16:59:28 +01:00
parent c667405b6d
commit 5e88f404e0
7 changed files with 105 additions and 54 deletions

1
AUTHORS 100644
View File

@ -0,0 +1 @@
* Ian Davis <nospam@iandavis.com>

0
CREDITS 100644
View File

View File

@ -1,19 +1,18 @@
microdata - a microdata parser in Go
# microdata
A microdata parser in Go
See [http://www.w3.org/TR/microdata/](http://www.w3.org/TR/microdata/) for more information about Microdata
INSTALLATION
============
## Installation
Simply run
go get github.com/iand/microdata
Documentation is at [http://go.pkgdoc.org/github.com/iand/microdata](http://go.pkgdoc.org/github.com/iand/microdata)
Documentation is at [http://godoc.org/github.com/iand/microdata](http://godoc.org/github.com/iand/microdata)
USAGE
=====
## Usage
Example of parsing a string containing HTML:
@ -72,10 +71,30 @@ Extract microdata from a webpage and print the result as JSON
}
LICENSE
=======
This code and associated documentation is in the public domain.
## Authors
To the extent possible under law, Ian Davis has waived all copyright
and related or neighboring rights to this file. This work is published
from the United Kingdom.
* [Ian Davis](http://github.com/iand) - <http://iandavis.com/>
## Contributors
## Contributing
* Do submit your changes as a pull request.
* Do your best to adhere to the existing coding conventions and idioms.
* Do run `go fmt` on the code before committing
* Do feel free to add yourself to the [`CREDITS`](CREDITS) file and the
corresponding Contributors list in the [`README.md`](README.md).
Alphabetical order applies.
* Don't touch the [`AUTHORS`](AUTHORS) file. An existing author will add you if
your contributions are significant enough.
* Do note that in order for any non-trivial changes to be merged (as a rule
of thumb, additions larger than about 15 lines of code), an explicit
Public Domain Dedication needs to be on record from you. Please include
a copy of the statement found in the [`WAIVER`](WAIVER) file with your pull request.
## License
This is free and unencumbered software released into the public domain. For more
information, see <http://unlicense.org/> or the accompanying [`UNLICENSE`](UNLICENSE) file.

24
UNLICENSE 100644
View File

@ -0,0 +1,24 @@
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.
In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <http://unlicense.org/>

5
WAIVER 100644
View File

@ -0,0 +1,5 @@
I dedicate any and all copyright interest in this software to the
public domain. I make this dedication for the benefit of the public at
large and to the detriment of my heirs and successors. I intend this
dedication to be an overt act of relinquishment in perpetuity of all
present and future rights to this software under copyright law.

View File

@ -1,7 +1,6 @@
/*
To the extent possible under law, Ian Davis has waived all copyright
and related or neighboring rights to this Source Code file.
This work is published from the United Kingdom.
This is free and unencumbered software released into the public domain. For more
information, see <http://unlicense.org/> or the accompanying UNLICENSE file.
*/
// Package microdata provides a parser for HTML microdata.
@ -11,6 +10,8 @@ package microdata
import (
"bytes"
"code.google.com/p/go-html-transform/h5"
"code.google.com/p/go.net/html"
"code.google.com/p/go.net/html/atom"
"encoding/json"
"io"
"net/url"
@ -78,18 +79,20 @@ func (self *Microdata) Json() ([]byte, error) {
// An HTML parser that extracts microdata
type Parser struct {
p *h5.Parser
p *h5.Tree
data *Microdata
base *url.URL
identifiedNodes map[string]*h5.Node
identifiedNodes map[string]*html.Node
}
// Create a new parser for extracting microdata
// r is a reader over an HTML document
// base is the base URL for resolving relative URLs
func NewParser(r io.Reader, base *url.URL) *Parser {
p, _ := h5.New(r)
return &Parser{
p: h5.NewParser(r),
p: p,
data: NewMicrodata(),
base: base,
}
@ -97,16 +100,12 @@ func NewParser(r io.Reader, base *url.URL) *Parser {
// Parse the document and return a Microdata set
func (self *Parser) Parse() (*Microdata, error) {
err := self.p.Parse()
if err != nil {
return nil, err
}
tree := self.p.Tree()
tree := self.p
topLevelItemNodes := make([]*h5.Node, 0)
self.identifiedNodes = make(map[string]*h5.Node, 0)
topLevelItemNodes := make([]*html.Node, 0)
self.identifiedNodes = make(map[string]*html.Node, 0)
tree.Walk(func(n *h5.Node) {
tree.Walk(func(n *html.Node) {
if _, exists := getAttr("itemscope", n); exists {
if _, exists := getAttr("itemprop", n); !exists {
topLevelItemNodes = append(topLevelItemNodes, n)
@ -147,17 +146,16 @@ func (self *Parser) Parse() (*Microdata, error) {
}
}
if len(node.Children) > 0 {
for _, child := range node.Children {
for child := node.FirstChild; child != nil; {
self.readItem(item, child)
}
child = child.NextSibling
}
}
return self.data, nil
}
func (self *Parser) readItem(item *Item, node *h5.Node) {
func (self *Parser) readItem(item *Item, node *html.Node) {
if itemprop, exists := getAttr("itemprop", node); exists {
if _, exists := getAttr("itemscope", node); exists {
subitem := NewItem()
@ -172,10 +170,9 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
}
}
if len(node.Children) > 0 {
for _, child := range node.Children {
for child := node.FirstChild; child != nil; {
self.readItem(subitem, child)
}
child = child.NextSibling
}
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
@ -190,35 +187,42 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
} else {
var propertyValue string
switch node.Data() {
case "img", "audio", "source", "video", "embed", "iframe", "track":
switch node.DataAtom {
case atom.Meta:
if val, exists := getAttr("content", node); exists {
propertyValue = val
}
case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video:
if urlValue, exists := getAttr("src", node); exists {
if parsedUrl, err := self.base.Parse(urlValue); err == nil {
propertyValue = parsedUrl.String()
}
}
case "a", "area", "link":
case atom.A, atom.Area, atom.Link:
if urlValue, exists := getAttr("href", node); exists {
if parsedUrl, err := self.base.Parse(urlValue); err == nil {
propertyValue = parsedUrl.String()
}
}
case "data":
case atom.Object:
if urlValue, exists := getAttr("data", node); exists {
propertyValue = urlValue
}
case atom.Data, atom.Meter:
if urlValue, exists := getAttr("value", node); exists {
propertyValue = urlValue
}
case "time":
case atom.Time:
if urlValue, exists := getAttr("datetime", node); exists {
propertyValue = urlValue
}
default:
var text bytes.Buffer
node.Walk(func(n *h5.Node) {
if n.Type == h5.TextNode {
text.WriteString(n.Data())
h5.WalkNodes(node, func(n *html.Node) {
if n.Type == html.TextNode {
text.WriteString(n.Data)
}
})
@ -238,18 +242,17 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
}
if len(node.Children) > 0 {
for _, child := range node.Children {
for child := node.FirstChild; child != nil; {
self.readItem(item, child)
}
child = child.NextSibling
}
}
func getAttr(name string, node *h5.Node) (string, bool) {
func getAttr(name string, node *html.Node) (string, bool) {
for _, a := range node.Attr {
if a.Name == name {
return a.Value, true
if a.Key == name {
return a.Val, true
}
}
return "", false

View File

@ -1,7 +1,6 @@
/*
To the extent possible under law, Ian Davis has waived all copyright
and related or neighboring rights to this Source Code file.
This work is published from the United Kingdom.
This is free and unencumbered software released into the public domain. For more
information, see <http://unlicense.org/> or the accompanying UNLICENSE file.
*/
package microdata