Merge pull request #4 from mcnijman/master

Fix #3: fatal stack overflow
pull/5/head
Ian Davis 2018-09-14 13:06:53 +01:00 committed by GitHub
commit f416fa49b8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 84 additions and 55 deletions

View File

@ -1,4 +1,5 @@
# microdata
A microdata parser in Go
See [http://www.w3.org/TR/microdata/](http://www.w3.org/TR/microdata/) for more information about Microdata
@ -9,79 +10,79 @@ See [http://www.w3.org/TR/microdata/](http://www.w3.org/TR/microdata/) for more
Simply run
go get github.com/iand/microdata
go get github.com/iand/microdata
Documentation is at [http://godoc.org/github.com/iand/microdata](http://godoc.org/github.com/iand/microdata)
## Usage
Example of parsing a string containing HTML:
package main
```go
package main
import (
"github.com/iand/microdata"
"net/url"
"strings"
)
import (
"github.com/iand/microdata"
"net/url"
"strings"
)
func main() {
html := `<div itemscope>
<p>My name is <span itemprop="name">Elizabeth</span>.</p>
</div>`
func main() {
html := `<div itemscope>
<p>My name is <span itemprop="name">Elizabeth</span>.</p>
</div>`
baseUrl, _ := url.Parse("http://example.com/")
p := microdata.NewParser(strings.NewReader(html), baseUrl)
baseUrl, _ := url.Parse("http://example.com/")
p := microdata.NewParser(strings.NewReader(html), baseUrl)
data, err := p.Parse()
if err != nil {
panic(err)
}
data, err := p.Parse()
if err != nil {
panic(err)
}
println("Name: ", data.Items[0].Properties["name"][0].(string))
}
println("Name: ", data.Items[0].Properties["name"][0].(string))
}
```
Extract microdata from a webpage and print the result as JSON:
package main
```go
package main
import (
"bytes"
"io/ioutil"
"net/http"
"net/url"
"os"
import (
"bytes"
"io/ioutil"
"net/http"
"net/url"
"os"
"github.com/iand/microdata"
)
"github.com/iand/microdata"
)
func main() {
func main() {
baseUrl, _ := url.Parse("http://www.designhive.com/blog/using-schemaorg-microdata")
baseUrl, _ := url.Parse("http://www.designhive.com/blog/using-schemaorg-microdata")
resp, _ := http.Get(baseUrl.String())
defer resp.Body.Close()
resp, _ := http.Get(baseUrl.String())
defer resp.Body.Close()
html, _ := ioutil.ReadAll(resp.Body)
html, _ := ioutil.ReadAll(resp.Body)
p := microdata.NewParser(bytes.NewReader(html), baseUrl)
p := microdata.NewParser(bytes.NewReader(html), baseUrl)
data, _ := p.Parse()
json, _ := data.JSON()
os.Stdout.Write(json)
}
data, _ := p.Parse()
json, _ := data.JSON()
os.Stdout.Write(json)
}
```
## Authors
* [Ian Davis](http://github.com/iand) - <http://iandavis.com/>
## Contributors
## Contributing
* Do submit your changes as a pull request

View File

@ -18,12 +18,12 @@ import (
"golang.org/x/net/html/atom"
)
// ValueList is a list of values; elements may be of any type.
type ValueList []interface{}
// PropertyMap maps a string key to a ValueList.
type PropertyMap map[string]ValueList
// valueList is a list of values; elements may be of any type.
type valueList []interface{}
// propertyMap maps a string key to a valueList. It is the type of Item.Properties.
type propertyMap map[string]valueList
// Item represents a microdata item
type Item struct {
// Properties maps each property name to the list of values recorded for it.
// It is serialized under the JSON key "properties".
Properties PropertyMap `json:"properties"`
Properties propertyMap `json:"properties"`
// Types lists the item's types. It is serialized under the JSON key "type"
// and omitted when empty.
Types []string `json:"type,omitempty"`
// ID is the item's identifier. It is serialized under the JSON key "id"
// and omitted when empty.
ID string `json:"id,omitempty"`
}
@ -31,7 +31,7 @@ type Item struct {
// NewItem creates a new microdata item. The returned item's Properties map
// and Types slice are allocated and empty, so both are non-nil.
func NewItem() *Item {
return &Item{
Properties: make(PropertyMap, 0),
Properties: make(propertyMap, 0),
Types: make([]string, 0),
}
}
@ -132,11 +132,10 @@ func (p *Parser) Parse() (*Microdata, error) {
}
// itemid only valid when itemscope and itemtype are both present
if itemid, exists := getAttr("itemid", node); exists {
if parsedUrl, err := p.base.Parse(itemid); err == nil {
item.ID = parsedUrl.String()
if parsedURL, err := p.base.Parse(itemid); err == nil {
item.ID = parsedURL.String()
}
}
}
if itemrefs, exists := getAttr("itemref", node); exists {
@ -168,7 +167,9 @@ func (p *Parser) readItem(item *Item, node *html.Node) {
itemref = strings.TrimSpace(itemref)
if refnode, exists := p.identifiedNodes[itemref]; exists {
p.readItem(subitem, refnode)
if refnode != node {
p.readItem(subitem, refnode)
}
}
}
}
@ -198,15 +199,14 @@ func (p *Parser) readItem(item *Item, node *html.Node) {
}
case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video:
if urlValue, exists := getAttr("src", node); exists {
if parsedUrl, err := p.base.Parse(urlValue); err == nil {
propertyValue = parsedUrl.String()
if parsedURL, err := p.base.Parse(urlValue); err == nil {
propertyValue = parsedURL.String()
}
}
case atom.A, atom.Area, atom.Link:
if urlValue, exists := getAttr("href", node); exists {
if parsedUrl, err := p.base.Parse(urlValue); err == nil {
propertyValue = parsedUrl.String()
if parsedURL, err := p.base.Parse(urlValue); err == nil {
propertyValue = parsedURL.String()
}
}
case atom.Object:

View File

@ -8,6 +8,7 @@ package microdata
import (
"bytes"
"net/url"
"reflect"
"strings"
"testing"
)
@ -569,3 +570,30 @@ func TestJsonWithType(t *testing.T) {
t.Errorf("Expecting %s but got %s", expected, actual)
}
}
// TestSkipSelfReferencingItemref checks that an element whose itemref names
// its own id is parsed without recursing into itself, which previously caused
// the fatal stack overflow reported in
// https://github.com/iand/microdata/issues/3
func TestSkipSelfReferencingItemref(t *testing.T) {
	html := `<body itemscope itemtype="http://schema.org/WebPage">
<span id="1" itemscope itemtype="http://data-vocabulary.org/Breadcrumb" itemprop="child" itemref="1">
<a title="Foo" itemprop="url" href="/foo/bar"><span itemprop="title">Foo</span></a>
</span>
</body>`

	actual := ParseData(html, t)

	// The nested item must contain its own properties exactly once; the
	// self-reference contributes nothing extra.
	child := NewItem()
	child.AddString("title", "Foo")
	child.AddString("url", "http://example.com/foo/bar")

	item := NewItem()
	item.AddType("http://schema.org/WebPage")
	item.AddItem("child", child)

	expected := NewMicrodata()
	expected.AddItem(item)

	if !reflect.DeepEqual(expected, actual) {
		// Report both sides as JSON. Formatting the structs directly with %s
		// prints bad-verb markers and pointer addresses for the nested items,
		// which makes a failure impossible to diagnose.
		expectedJSON, err := expected.JSON()
		if err != nil {
			t.Fatalf("serializing expected microdata: %v", err)
		}
		actualJSON, err := actual.JSON()
		if err != nil {
			t.Fatalf("serializing actual microdata: %v", err)
		}
		t.Errorf("Expecting %s but got %s", expectedJSON, actualJSON)
	}
}