forked from ukamnya/microdata_mirror
Fixed breakage caused by h5 changes
parent
c667405b6d
commit
5e88f404e0
43
README.md
43
README.md
|
@ -1,19 +1,18 @@
|
|||
microdata - a microdata parser in Go
|
||||
# microdata
|
||||
A microdata parser in Go
|
||||
|
||||
See [http://www.w3.org/TR/microdata/](http://www.w3.org/TR/microdata/) for more information about Microdata
|
||||
|
||||
INSTALLATION
|
||||
============
|
||||
## Installation
|
||||
|
||||
Simply run
|
||||
|
||||
go get github.com/iand/microdata
|
||||
|
||||
Documentation is at [http://go.pkgdoc.org/github.com/iand/microdata](http://go.pkgdoc.org/github.com/iand/microdata)
|
||||
Documentation is at [http://godoc.org/github.com/iand/microdata](http://godoc.org/github.com/iand/microdata)
|
||||
|
||||
|
||||
USAGE
|
||||
=====
|
||||
## Usage
|
||||
|
||||
Example of parsing a string containing HTML:
|
||||
|
||||
|
@ -72,10 +71,30 @@ Extract microdata from a webpage and print the result as JSON
|
|||
}
|
||||
|
||||
|
||||
LICENSE
|
||||
=======
|
||||
This code and associated documentation is in the public domain.
|
||||
## Authors
|
||||
|
||||
To the extent possible under law, Ian Davis has waived all copyright
|
||||
and related or neighboring rights to this file. This work is published
|
||||
from the United Kingdom.
|
||||
* [Ian Davis](http://github.com/iand) - <http://iandavis.com/>
|
||||
|
||||
|
||||
## Contributors
|
||||
|
||||
|
||||
## Contributing
|
||||
|
||||
* Do submit your changes as a pull request
|
||||
* Do your best to adhere to the existing coding conventions and idioms.
|
||||
* Do run `go fmt` on the code before committing
|
||||
* Do feel free to add yourself to the [`CREDITS`](CREDITS) file and the
|
||||
corresponding Contributors list in the [`README.md`](README.md).
|
||||
Alphabetical order applies.
|
||||
* Don't touch the [`AUTHORS`](AUTHORS) file. An existing author will add you if
|
||||
your contributions are significant enough.
|
||||
* Do note that in order for any non-trivial changes to be merged (as a rule
|
||||
of thumb, additions larger than about 15 lines of code), an explicit
|
||||
Public Domain Dedication needs to be on record from you. Please include
|
||||
a copy of the statement found in the [`WAIVER`](WAIVER) file with your pull request.
|
||||
|
||||
## License
|
||||
|
||||
This is free and unencumbered software released into the public domain. For more
|
||||
information, see <http://unlicense.org/> or the accompanying [`UNLICENSE`](UNLICENSE) file.
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
This is free and unencumbered software released into the public domain.
|
||||
|
||||
Anyone is free to copy, modify, publish, use, compile, sell, or
|
||||
distribute this software, either in source code form or as a compiled
|
||||
binary, for any purpose, commercial or non-commercial, and by any
|
||||
means.
|
||||
|
||||
In jurisdictions that recognize copyright laws, the author or authors
|
||||
of this software dedicate any and all copyright interest in the
|
||||
software to the public domain. We make this dedication for the benefit
|
||||
of the public at large and to the detriment of our heirs and
|
||||
successors. We intend this dedication to be an overt act of
|
||||
relinquishment in perpetuity of all present and future rights to this
|
||||
software under copyright law.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
For more information, please refer to <http://unlicense.org/>
|
|
@ -0,0 +1,5 @@
|
|||
I dedicate any and all copyright interest in this software to the
|
||||
public domain. I make this dedication for the benefit of the public at
|
||||
large and to the detriment of my heirs and successors. I intend this
|
||||
dedication to be an overt act of relinquishment in perpetuity of all
|
||||
present and future rights to this software under copyright law.
|
81
microdata.go
81
microdata.go
|
@ -1,7 +1,6 @@
|
|||
/*
|
||||
To the extent possible under law, Ian Davis has waived all copyright
|
||||
and related or neighboring rights to this Source Code file.
|
||||
This work is published from the United Kingdom.
|
||||
This is free and unencumbered software released into the public domain. For more
|
||||
information, see <http://unlicense.org/> or the accompanying UNLICENSE file.
|
||||
*/
|
||||
|
||||
// A package for parsing microdata
|
||||
|
@ -11,6 +10,8 @@ package microdata
|
|||
import (
|
||||
"bytes"
|
||||
"code.google.com/p/go-html-transform/h5"
|
||||
"code.google.com/p/go.net/html"
|
||||
"code.google.com/p/go.net/html/atom"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"net/url"
|
||||
|
@ -78,18 +79,20 @@ func (self *Microdata) Json() ([]byte, error) {
|
|||
|
||||
// An HTML parser that extracts microdata
|
||||
type Parser struct {
|
||||
p *h5.Parser
|
||||
p *h5.Tree
|
||||
data *Microdata
|
||||
base *url.URL
|
||||
identifiedNodes map[string]*h5.Node
|
||||
identifiedNodes map[string]*html.Node
|
||||
}
|
||||
|
||||
// Create a new parser for extracting microdata
|
||||
// r is a reader over an HTML document
|
||||
// base is the base URL for resolving relative URLs
|
||||
func NewParser(r io.Reader, base *url.URL) *Parser {
|
||||
p, _ := h5.New(r)
|
||||
|
||||
return &Parser{
|
||||
p: h5.NewParser(r),
|
||||
p: p,
|
||||
data: NewMicrodata(),
|
||||
base: base,
|
||||
}
|
||||
|
@ -97,16 +100,12 @@ func NewParser(r io.Reader, base *url.URL) *Parser {
|
|||
|
||||
// Parse the document and return a Microdata set
|
||||
func (self *Parser) Parse() (*Microdata, error) {
|
||||
err := self.p.Parse()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
tree := self.p.Tree()
|
||||
tree := self.p
|
||||
|
||||
topLevelItemNodes := make([]*h5.Node, 0)
|
||||
self.identifiedNodes = make(map[string]*h5.Node, 0)
|
||||
topLevelItemNodes := make([]*html.Node, 0)
|
||||
self.identifiedNodes = make(map[string]*html.Node, 0)
|
||||
|
||||
tree.Walk(func(n *h5.Node) {
|
||||
tree.Walk(func(n *html.Node) {
|
||||
if _, exists := getAttr("itemscope", n); exists {
|
||||
if _, exists := getAttr("itemprop", n); !exists {
|
||||
topLevelItemNodes = append(topLevelItemNodes, n)
|
||||
|
@ -147,17 +146,16 @@ func (self *Parser) Parse() (*Microdata, error) {
|
|||
}
|
||||
}
|
||||
|
||||
if len(node.Children) > 0 {
|
||||
for _, child := range node.Children {
|
||||
self.readItem(item, child)
|
||||
}
|
||||
for child := node.FirstChild; child != nil; {
|
||||
self.readItem(item, child)
|
||||
child = child.NextSibling
|
||||
}
|
||||
}
|
||||
|
||||
return self.data, nil
|
||||
}
|
||||
|
||||
func (self *Parser) readItem(item *Item, node *h5.Node) {
|
||||
func (self *Parser) readItem(item *Item, node *html.Node) {
|
||||
if itemprop, exists := getAttr("itemprop", node); exists {
|
||||
if _, exists := getAttr("itemscope", node); exists {
|
||||
subitem := NewItem()
|
||||
|
@ -172,10 +170,9 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
|
|||
}
|
||||
}
|
||||
|
||||
if len(node.Children) > 0 {
|
||||
for _, child := range node.Children {
|
||||
self.readItem(subitem, child)
|
||||
}
|
||||
for child := node.FirstChild; child != nil; {
|
||||
self.readItem(subitem, child)
|
||||
child = child.NextSibling
|
||||
}
|
||||
|
||||
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
|
||||
|
@ -190,35 +187,42 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
|
|||
} else {
|
||||
var propertyValue string
|
||||
|
||||
switch node.Data() {
|
||||
|
||||
case "img", "audio", "source", "video", "embed", "iframe", "track":
|
||||
switch node.DataAtom {
|
||||
case atom.Meta:
|
||||
if val, exists := getAttr("content", node); exists {
|
||||
propertyValue = val
|
||||
}
|
||||
case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video:
|
||||
if urlValue, exists := getAttr("src", node); exists {
|
||||
if parsedUrl, err := self.base.Parse(urlValue); err == nil {
|
||||
propertyValue = parsedUrl.String()
|
||||
}
|
||||
|
||||
}
|
||||
case "a", "area", "link":
|
||||
case atom.A, atom.Area, atom.Link:
|
||||
if urlValue, exists := getAttr("href", node); exists {
|
||||
if parsedUrl, err := self.base.Parse(urlValue); err == nil {
|
||||
propertyValue = parsedUrl.String()
|
||||
}
|
||||
}
|
||||
case "data":
|
||||
case atom.Object:
|
||||
if urlValue, exists := getAttr("data", node); exists {
|
||||
propertyValue = urlValue
|
||||
}
|
||||
case atom.Data, atom.Meter:
|
||||
if urlValue, exists := getAttr("value", node); exists {
|
||||
propertyValue = urlValue
|
||||
}
|
||||
case "time":
|
||||
case atom.Time:
|
||||
if urlValue, exists := getAttr("datetime", node); exists {
|
||||
propertyValue = urlValue
|
||||
}
|
||||
|
||||
default:
|
||||
var text bytes.Buffer
|
||||
node.Walk(func(n *h5.Node) {
|
||||
if n.Type == h5.TextNode {
|
||||
text.WriteString(n.Data())
|
||||
h5.WalkNodes(node, func(n *html.Node) {
|
||||
if n.Type == html.TextNode {
|
||||
text.WriteString(n.Data)
|
||||
}
|
||||
|
||||
})
|
||||
|
@ -238,18 +242,17 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
|
|||
|
||||
}
|
||||
|
||||
if len(node.Children) > 0 {
|
||||
for _, child := range node.Children {
|
||||
self.readItem(item, child)
|
||||
}
|
||||
for child := node.FirstChild; child != nil; {
|
||||
self.readItem(item, child)
|
||||
child = child.NextSibling
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func getAttr(name string, node *h5.Node) (string, bool) {
|
||||
func getAttr(name string, node *html.Node) (string, bool) {
|
||||
for _, a := range node.Attr {
|
||||
if a.Name == name {
|
||||
return a.Value, true
|
||||
if a.Key == name {
|
||||
return a.Val, true
|
||||
}
|
||||
}
|
||||
return "", false
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
/*
|
||||
To the extent possible under law, Ian Davis has waived all copyright
|
||||
and related or neighboring rights to this Source Code file.
|
||||
This work is published from the United Kingdom.
|
||||
This is free and unencumbered software released into the public domain. For more
|
||||
information, see <http://unlicense.org/> or the accompanying UNLICENSE file.
|
||||
*/
|
||||
|
||||
package microdata
|
||||
|
|
Loading…
Reference in New Issue