forked from ukamnya/microdata_mirror
Fixed breakage caused by h5 changes
parent
c667405b6d
commit
5e88f404e0
43
README.md
43
README.md
|
@ -1,19 +1,18 @@
|
||||||
microdata - a microdata parser in Go
|
# microdata
|
||||||
|
A microdata parser in Go
|
||||||
|
|
||||||
See [http://www.w3.org/TR/microdata/](http://www.w3.org/TR/microdata/) for more information about Microdata
|
See [http://www.w3.org/TR/microdata/](http://www.w3.org/TR/microdata/) for more information about Microdata
|
||||||
|
|
||||||
INSTALLATION
|
## Installation
|
||||||
============
|
|
||||||
|
|
||||||
Simply run
|
Simply run
|
||||||
|
|
||||||
go get github.com/iand/microdata
|
go get github.com/iand/microdata
|
||||||
|
|
||||||
Documentation is at [http://go.pkgdoc.org/github.com/iand/microdata](http://go.pkgdoc.org/github.com/iand/microdata)
|
Documentation is at [http://godoc.org/github.com/iand/microdata](http://godoc.org/github.com/iand/microdata)
|
||||||
|
|
||||||
|
|
||||||
USAGE
|
## Usage
|
||||||
=====
|
|
||||||
|
|
||||||
Example of parsing a string containing HTML:
|
Example of parsing a string containing HTML:
|
||||||
|
|
||||||
|
@ -72,10 +71,30 @@ Extract microdata from a webpage and print the result as JSON
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
LICENSE
|
## Authors
|
||||||
=======
|
|
||||||
This code and associated documentation is in the public domain.
|
|
||||||
|
|
||||||
To the extent possible under law, Ian Davis has waived all copyright
|
* [Ian Davis](http://github.com/iand) - <http://iandavis.com/>
|
||||||
and related or neighboring rights to this file. This work is published
|
|
||||||
from the United Kingdom.
|
|
||||||
|
## Contributors
|
||||||
|
|
||||||
|
|
||||||
|
## Contributing
|
||||||
|
|
||||||
|
* Do submit your changes as a pull request
|
||||||
|
* Do your best to adhere to the existing coding conventions and idioms.
|
||||||
|
* Do run `go fmt` on the code before committing
|
||||||
|
* Do feel free to add yourself to the [`CREDITS`](CREDITS) file and the
|
||||||
|
corresponding Contributors list in the [`README.md`](README.md).
|
||||||
|
Alphabetical order applies.
|
||||||
|
* Don't touch the [`AUTHORS`](AUTHORS) file. An existing author will add you if
|
||||||
|
your contributions are significant enough.
|
||||||
|
* Do note that in order for any non-trivial changes to be merged (as a rule
|
||||||
|
of thumb, additions larger than about 15 lines of code), an explicit
|
||||||
|
Public Domain Dedication needs to be on record from you. Please include
|
||||||
|
a copy of the statement found in the [`WAIVER`](WAIVER) file with your pull request
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
This is free and unencumbered software released into the public domain. For more
|
||||||
|
information, see <http://unlicense.org/> or the accompanying [`UNLICENSE`](UNLICENSE) file.
|
||||||
|
|
|
@ -0,0 +1,24 @@
|
||||||
|
This is free and unencumbered software released into the public domain.
|
||||||
|
|
||||||
|
Anyone is free to copy, modify, publish, use, compile, sell, or
|
||||||
|
distribute this software, either in source code form or as a compiled
|
||||||
|
binary, for any purpose, commercial or non-commercial, and by any
|
||||||
|
means.
|
||||||
|
|
||||||
|
In jurisdictions that recognize copyright laws, the author or authors
|
||||||
|
of this software dedicate any and all copyright interest in the
|
||||||
|
software to the public domain. We make this dedication for the benefit
|
||||||
|
of the public at large and to the detriment of our heirs and
|
||||||
|
successors. We intend this dedication to be an overt act of
|
||||||
|
relinquishment in perpetuity of all present and future rights to this
|
||||||
|
software under copyright law.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||||
|
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||||
|
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||||
|
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
For more information, please refer to <http://unlicense.org/>
|
|
@ -0,0 +1,5 @@
|
||||||
|
I dedicate any and all copyright interest in this software to the
|
||||||
|
public domain. I make this dedication for the benefit of the public at
|
||||||
|
large and to the detriment of my heirs and successors. I intend this
|
||||||
|
dedication to be an overt act of relinquishment in perpetuity of all
|
||||||
|
present and future rights to this software under copyright law.
|
81
microdata.go
81
microdata.go
|
@ -1,7 +1,6 @@
|
||||||
/*
|
/*
|
||||||
To the extent possible under law, Ian Davis has waived all copyright
|
This is free and unencumbered software released into the public domain. For more
|
||||||
and related or neighboring rights to this Source Code file.
|
information, see <http://unlicense.org/> or the accompanying UNLICENSE file.
|
||||||
This work is published from the United Kingdom.
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
// A package for parsing microdata
|
// A package for parsing microdata
|
||||||
|
@ -11,6 +10,8 @@ package microdata
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"code.google.com/p/go-html-transform/h5"
|
"code.google.com/p/go-html-transform/h5"
|
||||||
|
"code.google.com/p/go.net/html"
|
||||||
|
"code.google.com/p/go.net/html/atom"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"io"
|
"io"
|
||||||
"net/url"
|
"net/url"
|
||||||
|
@ -78,18 +79,20 @@ func (self *Microdata) Json() ([]byte, error) {
|
||||||
|
|
||||||
// An HTML parser that extracts microdata
|
// An HTML parser that extracts microdata
|
||||||
type Parser struct {
|
type Parser struct {
|
||||||
p *h5.Parser
|
p *h5.Tree
|
||||||
data *Microdata
|
data *Microdata
|
||||||
base *url.URL
|
base *url.URL
|
||||||
identifiedNodes map[string]*h5.Node
|
identifiedNodes map[string]*html.Node
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create a new parser for extracting microdata
|
// Create a new parser for extracting microdata
|
||||||
// r is a reader over an HTML document
|
// r is a reader over an HTML document
|
||||||
// base is the base URL for resolving relative URLs
|
// base is the base URL for resolving relative URLs
|
||||||
func NewParser(r io.Reader, base *url.URL) *Parser {
|
func NewParser(r io.Reader, base *url.URL) *Parser {
|
||||||
|
p, _ := h5.New(r)
|
||||||
|
|
||||||
return &Parser{
|
return &Parser{
|
||||||
p: h5.NewParser(r),
|
p: p,
|
||||||
data: NewMicrodata(),
|
data: NewMicrodata(),
|
||||||
base: base,
|
base: base,
|
||||||
}
|
}
|
||||||
|
@ -97,16 +100,12 @@ func NewParser(r io.Reader, base *url.URL) *Parser {
|
||||||
|
|
||||||
// Parse the document and return a Microdata set
|
// Parse the document and return a Microdata set
|
||||||
func (self *Parser) Parse() (*Microdata, error) {
|
func (self *Parser) Parse() (*Microdata, error) {
|
||||||
err := self.p.Parse()
|
tree := self.p
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
tree := self.p.Tree()
|
|
||||||
|
|
||||||
topLevelItemNodes := make([]*h5.Node, 0)
|
topLevelItemNodes := make([]*html.Node, 0)
|
||||||
self.identifiedNodes = make(map[string]*h5.Node, 0)
|
self.identifiedNodes = make(map[string]*html.Node, 0)
|
||||||
|
|
||||||
tree.Walk(func(n *h5.Node) {
|
tree.Walk(func(n *html.Node) {
|
||||||
if _, exists := getAttr("itemscope", n); exists {
|
if _, exists := getAttr("itemscope", n); exists {
|
||||||
if _, exists := getAttr("itemprop", n); !exists {
|
if _, exists := getAttr("itemprop", n); !exists {
|
||||||
topLevelItemNodes = append(topLevelItemNodes, n)
|
topLevelItemNodes = append(topLevelItemNodes, n)
|
||||||
|
@ -147,17 +146,16 @@ func (self *Parser) Parse() (*Microdata, error) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(node.Children) > 0 {
|
for child := node.FirstChild; child != nil; {
|
||||||
for _, child := range node.Children {
|
self.readItem(item, child)
|
||||||
self.readItem(item, child)
|
child = child.NextSibling
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return self.data, nil
|
return self.data, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (self *Parser) readItem(item *Item, node *h5.Node) {
|
func (self *Parser) readItem(item *Item, node *html.Node) {
|
||||||
if itemprop, exists := getAttr("itemprop", node); exists {
|
if itemprop, exists := getAttr("itemprop", node); exists {
|
||||||
if _, exists := getAttr("itemscope", node); exists {
|
if _, exists := getAttr("itemscope", node); exists {
|
||||||
subitem := NewItem()
|
subitem := NewItem()
|
||||||
|
@ -172,10 +170,9 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(node.Children) > 0 {
|
for child := node.FirstChild; child != nil; {
|
||||||
for _, child := range node.Children {
|
self.readItem(subitem, child)
|
||||||
self.readItem(subitem, child)
|
child = child.NextSibling
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
|
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
|
||||||
|
@ -190,35 +187,42 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
|
||||||
} else {
|
} else {
|
||||||
var propertyValue string
|
var propertyValue string
|
||||||
|
|
||||||
switch node.Data() {
|
switch node.DataAtom {
|
||||||
|
case atom.Meta:
|
||||||
case "img", "audio", "source", "video", "embed", "iframe", "track":
|
if val, exists := getAttr("content", node); exists {
|
||||||
|
propertyValue = val
|
||||||
|
}
|
||||||
|
case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video:
|
||||||
if urlValue, exists := getAttr("src", node); exists {
|
if urlValue, exists := getAttr("src", node); exists {
|
||||||
if parsedUrl, err := self.base.Parse(urlValue); err == nil {
|
if parsedUrl, err := self.base.Parse(urlValue); err == nil {
|
||||||
propertyValue = parsedUrl.String()
|
propertyValue = parsedUrl.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
case "a", "area", "link":
|
case atom.A, atom.Area, atom.Link:
|
||||||
if urlValue, exists := getAttr("href", node); exists {
|
if urlValue, exists := getAttr("href", node); exists {
|
||||||
if parsedUrl, err := self.base.Parse(urlValue); err == nil {
|
if parsedUrl, err := self.base.Parse(urlValue); err == nil {
|
||||||
propertyValue = parsedUrl.String()
|
propertyValue = parsedUrl.String()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
case "data":
|
case atom.Object:
|
||||||
|
if urlValue, exists := getAttr("data", node); exists {
|
||||||
|
propertyValue = urlValue
|
||||||
|
}
|
||||||
|
case atom.Data, atom.Meter:
|
||||||
if urlValue, exists := getAttr("value", node); exists {
|
if urlValue, exists := getAttr("value", node); exists {
|
||||||
propertyValue = urlValue
|
propertyValue = urlValue
|
||||||
}
|
}
|
||||||
case "time":
|
case atom.Time:
|
||||||
if urlValue, exists := getAttr("datetime", node); exists {
|
if urlValue, exists := getAttr("datetime", node); exists {
|
||||||
propertyValue = urlValue
|
propertyValue = urlValue
|
||||||
}
|
}
|
||||||
|
|
||||||
default:
|
default:
|
||||||
var text bytes.Buffer
|
var text bytes.Buffer
|
||||||
node.Walk(func(n *h5.Node) {
|
h5.WalkNodes(node, func(n *html.Node) {
|
||||||
if n.Type == h5.TextNode {
|
if n.Type == html.TextNode {
|
||||||
text.WriteString(n.Data())
|
text.WriteString(n.Data)
|
||||||
}
|
}
|
||||||
|
|
||||||
})
|
})
|
||||||
|
@ -238,18 +242,17 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(node.Children) > 0 {
|
for child := node.FirstChild; child != nil; {
|
||||||
for _, child := range node.Children {
|
self.readItem(item, child)
|
||||||
self.readItem(item, child)
|
child = child.NextSibling
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func getAttr(name string, node *h5.Node) (string, bool) {
|
func getAttr(name string, node *html.Node) (string, bool) {
|
||||||
for _, a := range node.Attr {
|
for _, a := range node.Attr {
|
||||||
if a.Name == name {
|
if a.Key == name {
|
||||||
return a.Value, true
|
return a.Val, true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return "", false
|
return "", false
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
/*
|
/*
|
||||||
To the extent possible under law, Ian Davis has waived all copyright
|
This is free and unencumbered software released into the public domain. For more
|
||||||
and related or neighboring rights to this Source Code file.
|
information, see <http://unlicense.org/> or the accompanying UNLICENSE file.
|
||||||
This work is published from the United Kingdom.
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package microdata
|
package microdata
|
||||||
|
|
Loading…
Reference in New Issue