Fixed breakage caused by h5 changes

pull/2/head
Ian Davis 2013-07-10 16:59:28 +01:00
parent c667405b6d
commit 5e88f404e0
7 changed files with 105 additions and 54 deletions

1
AUTHORS 100644
View File

@ -0,0 +1 @@
* Ian Davis <nospam@iandavis.com>

0
CREDITS 100644
View File

View File

@ -1,19 +1,18 @@
microdata - a microdata parser in Go # microdata
A microdata parser in Go
See [http://www.w3.org/TR/microdata/](http://www.w3.org/TR/microdata/) for more information about Microdata See [http://www.w3.org/TR/microdata/](http://www.w3.org/TR/microdata/) for more information about Microdata
INSTALLATION ## Installation
============
Simply run Simply run
go get github.com/iand/microdata go get github.com/iand/microdata
Documentation is at [http://go.pkgdoc.org/github.com/iand/microdata](http://go.pkgdoc.org/github.com/iand/microdata) Documentation is at [http://godoc.org/github.com/iand/microdata](http://godoc.org/github.com/iand/microdata)
USAGE ## Usage
=====
Example of parsing a string containing HTML: Example of parsing a string containing HTML:
@ -72,10 +71,30 @@ Extract microdata from a webpage and print the result as JSON
} }
LICENSE ## Authors
=======
This code and associated documentation is in the public domain.
To the extent possible under law, Ian Davis has waived all copyright * [Ian Davis](http://github.com/iand) - <http://iandavis.com/>
and related or neighboring rights to this file. This work is published
from the United Kingdom.
## Contributors
## Contributing
* Do submit your changes as a pull request
* Do your best to adhere to the existing coding conventions and idioms.
* Do run `go fmt` on the code before committing
* Do feel free to add yourself to the [`CREDITS`](CREDITS) file and the
corresponding Contributors list in the [`README.md`](README.md).
Alphabetical order applies.
* Don't touch the [`AUTHORS`](AUTHORS) file. An existing author will add you if
your contributions are significant enough.
* Do note that in order for any non-trivial changes to be merged (as a rule
of thumb, additions larger than about 15 lines of code), an explicit
Public Domain Dedication needs to be on record from you. Please include
a copy of the statement found in the [`WAIVER`](WAIVER) file with your pull request.
## License
This is free and unencumbered software released into the public domain. For more
information, see <http://unlicense.org/> or the accompanying [`UNLICENSE`](UNLICENSE) file.

24
UNLICENSE 100644
View File

@ -0,0 +1,24 @@
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.
In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <http://unlicense.org/>

5
WAIVER 100644
View File

@ -0,0 +1,5 @@
I dedicate any and all copyright interest in this software to the
public domain. I make this dedication for the benefit of the public at
large and to the detriment of my heirs and successors. I intend this
dedication to be an overt act of relinquishment in perpetuity of all
present and future rights to this software under copyright law.

View File

@ -1,7 +1,6 @@
/* /*
To the extent possible under law, Ian Davis has waived all copyright This is free and unencumbered software released into the public domain. For more
and related or neighboring rights to this Source Code file. information, see <http://unlicense.org/> or the accompanying UNLICENSE file.
This work is published from the United Kingdom.
*/ */
// A package for parsing microdata // A package for parsing microdata
@ -11,6 +10,8 @@ package microdata
import ( import (
"bytes" "bytes"
"code.google.com/p/go-html-transform/h5" "code.google.com/p/go-html-transform/h5"
"code.google.com/p/go.net/html"
"code.google.com/p/go.net/html/atom"
"encoding/json" "encoding/json"
"io" "io"
"net/url" "net/url"
@ -78,18 +79,20 @@ func (self *Microdata) Json() ([]byte, error) {
// An HTML parser that extracts microdata // An HTML parser that extracts microdata
type Parser struct { type Parser struct {
p *h5.Parser p *h5.Tree
data *Microdata data *Microdata
base *url.URL base *url.URL
identifiedNodes map[string]*h5.Node identifiedNodes map[string]*html.Node
} }
// Create a new parser for extracting microdata // Create a new parser for extracting microdata
// r is a reader over an HTML document // r is a reader over an HTML document
// base is the base URL for resolving relative URLs // base is the base URL for resolving relative URLs
func NewParser(r io.Reader, base *url.URL) *Parser { func NewParser(r io.Reader, base *url.URL) *Parser {
p, _ := h5.New(r)
return &Parser{ return &Parser{
p: h5.NewParser(r), p: p,
data: NewMicrodata(), data: NewMicrodata(),
base: base, base: base,
} }
@ -97,16 +100,12 @@ func NewParser(r io.Reader, base *url.URL) *Parser {
// Parse the document and return a Microdata set // Parse the document and return a Microdata set
func (self *Parser) Parse() (*Microdata, error) { func (self *Parser) Parse() (*Microdata, error) {
err := self.p.Parse() tree := self.p
if err != nil {
return nil, err
}
tree := self.p.Tree()
topLevelItemNodes := make([]*h5.Node, 0) topLevelItemNodes := make([]*html.Node, 0)
self.identifiedNodes = make(map[string]*h5.Node, 0) self.identifiedNodes = make(map[string]*html.Node, 0)
tree.Walk(func(n *h5.Node) { tree.Walk(func(n *html.Node) {
if _, exists := getAttr("itemscope", n); exists { if _, exists := getAttr("itemscope", n); exists {
if _, exists := getAttr("itemprop", n); !exists { if _, exists := getAttr("itemprop", n); !exists {
topLevelItemNodes = append(topLevelItemNodes, n) topLevelItemNodes = append(topLevelItemNodes, n)
@ -147,17 +146,16 @@ func (self *Parser) Parse() (*Microdata, error) {
} }
} }
if len(node.Children) > 0 { for child := node.FirstChild; child != nil; {
for _, child := range node.Children {
self.readItem(item, child) self.readItem(item, child)
} child = child.NextSibling
} }
} }
return self.data, nil return self.data, nil
} }
func (self *Parser) readItem(item *Item, node *h5.Node) { func (self *Parser) readItem(item *Item, node *html.Node) {
if itemprop, exists := getAttr("itemprop", node); exists { if itemprop, exists := getAttr("itemprop", node); exists {
if _, exists := getAttr("itemscope", node); exists { if _, exists := getAttr("itemscope", node); exists {
subitem := NewItem() subitem := NewItem()
@ -172,10 +170,9 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
} }
} }
if len(node.Children) > 0 { for child := node.FirstChild; child != nil; {
for _, child := range node.Children {
self.readItem(subitem, child) self.readItem(subitem, child)
} child = child.NextSibling
} }
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") { for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
@ -190,35 +187,42 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
} else { } else {
var propertyValue string var propertyValue string
switch node.Data() { switch node.DataAtom {
case atom.Meta:
case "img", "audio", "source", "video", "embed", "iframe", "track": if val, exists := getAttr("content", node); exists {
propertyValue = val
}
case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video:
if urlValue, exists := getAttr("src", node); exists { if urlValue, exists := getAttr("src", node); exists {
if parsedUrl, err := self.base.Parse(urlValue); err == nil { if parsedUrl, err := self.base.Parse(urlValue); err == nil {
propertyValue = parsedUrl.String() propertyValue = parsedUrl.String()
} }
} }
case "a", "area", "link": case atom.A, atom.Area, atom.Link:
if urlValue, exists := getAttr("href", node); exists { if urlValue, exists := getAttr("href", node); exists {
if parsedUrl, err := self.base.Parse(urlValue); err == nil { if parsedUrl, err := self.base.Parse(urlValue); err == nil {
propertyValue = parsedUrl.String() propertyValue = parsedUrl.String()
} }
} }
case "data": case atom.Object:
if urlValue, exists := getAttr("data", node); exists {
propertyValue = urlValue
}
case atom.Data, atom.Meter:
if urlValue, exists := getAttr("value", node); exists { if urlValue, exists := getAttr("value", node); exists {
propertyValue = urlValue propertyValue = urlValue
} }
case "time": case atom.Time:
if urlValue, exists := getAttr("datetime", node); exists { if urlValue, exists := getAttr("datetime", node); exists {
propertyValue = urlValue propertyValue = urlValue
} }
default: default:
var text bytes.Buffer var text bytes.Buffer
node.Walk(func(n *h5.Node) { h5.WalkNodes(node, func(n *html.Node) {
if n.Type == h5.TextNode { if n.Type == html.TextNode {
text.WriteString(n.Data()) text.WriteString(n.Data)
} }
}) })
@ -238,18 +242,17 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
} }
if len(node.Children) > 0 { for child := node.FirstChild; child != nil; {
for _, child := range node.Children {
self.readItem(item, child) self.readItem(item, child)
} child = child.NextSibling
} }
} }
func getAttr(name string, node *h5.Node) (string, bool) { func getAttr(name string, node *html.Node) (string, bool) {
for _, a := range node.Attr { for _, a := range node.Attr {
if a.Name == name { if a.Key == name {
return a.Value, true return a.Val, true
} }
} }
return "", false return "", false

View File

@ -1,7 +1,6 @@
/* /*
To the extent possible under law, Ian Davis has waived all copyright This is free and unencumbered software released into the public domain. For more
and related or neighboring rights to this Source Code file. information, see <http://unlicense.org/> or the accompanying UNLICENSE file.
This work is published from the United Kingdom.
*/ */
package microdata package microdata