commit
f416fa49b8
85
README.md
85
README.md
|
@ -1,4 +1,5 @@
|
||||||
# microdata
|
# microdata
|
||||||
|
|
||||||
A microdata parser in Go
|
A microdata parser in Go
|
||||||
|
|
||||||
See [http://www.w3.org/TR/microdata/](http://www.w3.org/TR/microdata/) for more information about Microdata
|
See [http://www.w3.org/TR/microdata/](http://www.w3.org/TR/microdata/) for more information about Microdata
|
||||||
|
@ -9,79 +10,79 @@ See [http://www.w3.org/TR/microdata/](http://www.w3.org/TR/microdata/) for more
|
||||||
|
|
||||||
Simply run
|
Simply run
|
||||||
|
|
||||||
go get github.com/iand/microdata
|
go get github.com/iand/microdata
|
||||||
|
|
||||||
Documentation is at [http://godoc.org/github.com/iand/microdata](http://godoc.org/github.com/iand/microdata)
|
Documentation is at [http://godoc.org/github.com/iand/microdata](http://godoc.org/github.com/iand/microdata)
|
||||||
|
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
Example of parsing a string containing HTML:
|
Example of parsing a string containing HTML:
|
||||||
|
|
||||||
package main
|
```go
|
||||||
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"github.com/iand/microdata"
|
"github.com/iand/microdata"
|
||||||
"net/url"
|
"net/url"
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
html := `<div itemscope>
|
html := `<div itemscope>
|
||||||
<p>My name is <span itemprop="name">Elizabeth</span>.</p>
|
<p>My name is <span itemprop="name">Elizabeth</span>.</p>
|
||||||
</div>`
|
</div>`
|
||||||
|
|
||||||
baseUrl, _ := url.Parse("http://example.com/")
|
baseUrl, _ := url.Parse("http://example.com/")
|
||||||
p := microdata.NewParser(strings.NewReader(html), baseUrl)
|
p := microdata.NewParser(strings.NewReader(html), baseUrl)
|
||||||
|
|
||||||
data, err := p.Parse()
|
data, err := p.Parse()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
println("Name: ", data.Items[0].Properties["name"][0].(string))
|
println("Name: ", data.Items[0].Properties["name"][0].(string))
|
||||||
}
|
}
|
||||||
|
```
|
||||||
|
|
||||||
Extract microdata from a webpage and print the result as JSON
|
Extract microdata from a webpage and print the result as JSON
|
||||||
|
|
||||||
package main
|
```go
|
||||||
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/url"
|
"net/url"
|
||||||
"os"
|
"os"
|
||||||
|
|
||||||
"github.com/iand/microdata"
|
"github.com/iand/microdata"
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
|
||||||
baseUrl, _ := url.Parse("http://www.designhive.com/blog/using-schemaorg-microdata")
|
baseUrl, _ := url.Parse("http://www.designhive.com/blog/using-schemaorg-microdata")
|
||||||
|
|
||||||
resp, _ := http.Get(baseUrl.String())
|
resp, _ := http.Get(baseUrl.String())
|
||||||
defer resp.Body.Close()
|
defer resp.Body.Close()
|
||||||
|
|
||||||
html, _ := ioutil.ReadAll(resp.Body)
|
html, _ := ioutil.ReadAll(resp.Body)
|
||||||
|
|
||||||
p := microdata.NewParser(bytes.NewReader(html), baseUrl)
|
p := microdata.NewParser(bytes.NewReader(html), baseUrl)
|
||||||
|
|
||||||
data, _ := p.Parse()
|
data, _ := p.Parse()
|
||||||
|
|
||||||
json, _ := data.JSON()
|
|
||||||
os.Stdout.Write(json)
|
|
||||||
}
|
|
||||||
|
|
||||||
|
json, _ := data.JSON()
|
||||||
|
os.Stdout.Write(json)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
## Authors
|
## Authors
|
||||||
|
|
||||||
* [Ian Davis](http://github.com/iand) - <http://iandavis.com/>
|
* [Ian Davis](http://github.com/iand) - <http://iandavis.com/>
|
||||||
|
|
||||||
|
|
||||||
## Contributors
|
## Contributors
|
||||||
|
|
||||||
|
|
||||||
## Contributing
|
## Contributing
|
||||||
|
|
||||||
* Do submit your changes as a pull request
|
* Do submit your changes as a pull request
|
||||||
|
|
26
microdata.go
26
microdata.go
|
@ -18,12 +18,12 @@ import (
|
||||||
"golang.org/x/net/html/atom"
|
"golang.org/x/net/html/atom"
|
||||||
)
|
)
|
||||||
|
|
||||||
type ValueList []interface{}
|
type valueList []interface{}
|
||||||
type PropertyMap map[string]ValueList
|
type propertyMap map[string]valueList
|
||||||
|
|
||||||
// Item represents a microdata item
|
// Item represents a microdata item
|
||||||
type Item struct {
|
type Item struct {
|
||||||
Properties PropertyMap `json:"properties"`
|
Properties propertyMap `json:"properties"`
|
||||||
Types []string `json:"type,omitempty"`
|
Types []string `json:"type,omitempty"`
|
||||||
ID string `json:"id,omitempty"`
|
ID string `json:"id,omitempty"`
|
||||||
}
|
}
|
||||||
|
@ -31,7 +31,7 @@ type Item struct {
|
||||||
// NewItem creates a new microdata item
|
// NewItem creates a new microdata item
|
||||||
func NewItem() *Item {
|
func NewItem() *Item {
|
||||||
return &Item{
|
return &Item{
|
||||||
Properties: make(PropertyMap, 0),
|
Properties: make(propertyMap, 0),
|
||||||
Types: make([]string, 0),
|
Types: make([]string, 0),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -132,11 +132,10 @@ func (p *Parser) Parse() (*Microdata, error) {
|
||||||
}
|
}
|
||||||
// itemid only valid when itemscope and itemtype are both present
|
// itemid only valid when itemscope and itemtype are both present
|
||||||
if itemid, exists := getAttr("itemid", node); exists {
|
if itemid, exists := getAttr("itemid", node); exists {
|
||||||
if parsedUrl, err := p.base.Parse(itemid); err == nil {
|
if parsedURL, err := p.base.Parse(itemid); err == nil {
|
||||||
item.ID = parsedUrl.String()
|
item.ID = parsedURL.String()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if itemrefs, exists := getAttr("itemref", node); exists {
|
if itemrefs, exists := getAttr("itemref", node); exists {
|
||||||
|
@ -168,7 +167,9 @@ func (p *Parser) readItem(item *Item, node *html.Node) {
|
||||||
itemref = strings.TrimSpace(itemref)
|
itemref = strings.TrimSpace(itemref)
|
||||||
|
|
||||||
if refnode, exists := p.identifiedNodes[itemref]; exists {
|
if refnode, exists := p.identifiedNodes[itemref]; exists {
|
||||||
p.readItem(subitem, refnode)
|
if refnode != node {
|
||||||
|
p.readItem(subitem, refnode)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -198,15 +199,14 @@ func (p *Parser) readItem(item *Item, node *html.Node) {
|
||||||
}
|
}
|
||||||
case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video:
|
case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video:
|
||||||
if urlValue, exists := getAttr("src", node); exists {
|
if urlValue, exists := getAttr("src", node); exists {
|
||||||
if parsedUrl, err := p.base.Parse(urlValue); err == nil {
|
if parsedURL, err := p.base.Parse(urlValue); err == nil {
|
||||||
propertyValue = parsedUrl.String()
|
propertyValue = parsedURL.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
case atom.A, atom.Area, atom.Link:
|
case atom.A, atom.Area, atom.Link:
|
||||||
if urlValue, exists := getAttr("href", node); exists {
|
if urlValue, exists := getAttr("href", node); exists {
|
||||||
if parsedUrl, err := p.base.Parse(urlValue); err == nil {
|
if parsedURL, err := p.base.Parse(urlValue); err == nil {
|
||||||
propertyValue = parsedUrl.String()
|
propertyValue = parsedURL.String()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
case atom.Object:
|
case atom.Object:
|
||||||
|
|
|
@ -8,6 +8,7 @@ package microdata
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"net/url"
|
"net/url"
|
||||||
|
"reflect"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
)
|
)
|
||||||
|
@ -569,3 +570,30 @@ func TestJsonWithType(t *testing.T) {
|
||||||
t.Errorf("Expecting %s but got %s", expected, actual)
|
t.Errorf("Expecting %s but got %s", expected, actual)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// This test checks stack overflow doesn't happen as mentioned in
|
||||||
|
// https://github.com/iand/microdata/issues/3
|
||||||
|
func TestSkipSelfReferencingItemref(t *testing.T) {
|
||||||
|
html := `<body itemscope itemtype="http://schema.org/WebPage">
|
||||||
|
<span id="1" itemscope itemtype="http://data-vocabulary.org/Breadcrumb" itemprop="child" itemref="1">
|
||||||
|
<a title="Foo" itemprop="url" href="/foo/bar"><span itemprop="title">Foo</span></a>
|
||||||
|
</span>
|
||||||
|
</body>`
|
||||||
|
|
||||||
|
actual := ParseData(html, t)
|
||||||
|
|
||||||
|
child := NewItem()
|
||||||
|
child.AddString("title", "Foo")
|
||||||
|
child.AddString("url", "http://example.com/foo/bar")
|
||||||
|
|
||||||
|
item := NewItem()
|
||||||
|
item.AddType("http://schema.org/WebPage")
|
||||||
|
item.AddItem("child", child)
|
||||||
|
|
||||||
|
expected := NewMicrodata()
|
||||||
|
expected.AddItem(item)
|
||||||
|
|
||||||
|
if !reflect.DeepEqual(expected, actual) {
|
||||||
|
t.Errorf("Expecting %s but got %s", expected, actual)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue