Merge pull request #4 from mcnijman/master

Fix #3: fatal stack overflow
pull/5/head
Ian Davis 2018-09-14 13:06:53 +01:00 committed by GitHub
commit f416fa49b8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 84 additions and 55 deletions

View File

@ -1,4 +1,5 @@
# microdata # microdata
A microdata parser in Go A microdata parser in Go
See [http://www.w3.org/TR/microdata/](http://www.w3.org/TR/microdata/) for more information about Microdata See [http://www.w3.org/TR/microdata/](http://www.w3.org/TR/microdata/) for more information about Microdata
@ -13,20 +14,20 @@ Simply run
Documentation is at [http://godoc.org/github.com/iand/microdata](http://godoc.org/github.com/iand/microdata) Documentation is at [http://godoc.org/github.com/iand/microdata](http://godoc.org/github.com/iand/microdata)
## Usage ## Usage
Example of parsing a string containing HTML: Example of parsing a string containing HTML:
package main ```go
package main
import ( import (
"github.com/iand/microdata" "github.com/iand/microdata"
"net/url" "net/url"
"strings" "strings"
) )
func main() { func main() {
html := `<div itemscope> html := `<div itemscope>
<p>My name is <span itemprop="name">Elizabeth</span>.</p> <p>My name is <span itemprop="name">Elizabeth</span>.</p>
</div>` </div>`
@ -40,13 +41,15 @@ Example of parsing a string containing HTML:
} }
println("Name: ", data.Items[0].Properties["name"][0].(string)) println("Name: ", data.Items[0].Properties["name"][0].(string))
} }
```
Extract microdata from a webpage and print the result as JSON Extract microdata from a webpage and print the result as JSON
package main ```go
package main
import ( import (
"bytes" "bytes"
"io/ioutil" "io/ioutil"
"net/http" "net/http"
@ -54,9 +57,9 @@ Extract microdata from a webpage and print the result as JSON
"os" "os"
"github.com/iand/microdata" "github.com/iand/microdata"
) )
func main() { func main() {
baseUrl, _ := url.Parse("http://www.designhive.com/blog/using-schemaorg-microdata") baseUrl, _ := url.Parse("http://www.designhive.com/blog/using-schemaorg-microdata")
@ -71,17 +74,15 @@ Extract microdata from a webpage and print the result as JSON
json, _ := data.JSON() json, _ := data.JSON()
os.Stdout.Write(json) os.Stdout.Write(json)
} }
```
## Authors ## Authors
* [Ian Davis](http://github.com/iand) - <http://iandavis.com/> * [Ian Davis](http://github.com/iand) - <http://iandavis.com/>
## Contributors ## Contributors
## Contributing ## Contributing
* Do submit your changes as a pull request * Do submit your changes as a pull request

View File

@ -18,12 +18,12 @@ import (
"golang.org/x/net/html/atom" "golang.org/x/net/html/atom"
) )
type ValueList []interface{} type valueList []interface{}
type PropertyMap map[string]ValueList type propertyMap map[string]valueList
// Item represents a microdata item // Item represents a microdata item
type Item struct { type Item struct {
Properties PropertyMap `json:"properties"` Properties propertyMap `json:"properties"`
Types []string `json:"type,omitempty"` Types []string `json:"type,omitempty"`
ID string `json:"id,omitempty"` ID string `json:"id,omitempty"`
} }
@ -31,7 +31,7 @@ type Item struct {
// NewItem creates a new microdata item // NewItem creates a new microdata item
func NewItem() *Item { func NewItem() *Item {
return &Item{ return &Item{
Properties: make(PropertyMap, 0), Properties: make(propertyMap, 0),
Types: make([]string, 0), Types: make([]string, 0),
} }
} }
@ -132,11 +132,10 @@ func (p *Parser) Parse() (*Microdata, error) {
} }
// itemid only valid when itemscope and itemtype are both present // itemid only valid when itemscope and itemtype are both present
if itemid, exists := getAttr("itemid", node); exists { if itemid, exists := getAttr("itemid", node); exists {
if parsedUrl, err := p.base.Parse(itemid); err == nil { if parsedURL, err := p.base.Parse(itemid); err == nil {
item.ID = parsedUrl.String() item.ID = parsedURL.String()
} }
} }
} }
if itemrefs, exists := getAttr("itemref", node); exists { if itemrefs, exists := getAttr("itemref", node); exists {
@ -168,10 +167,12 @@ func (p *Parser) readItem(item *Item, node *html.Node) {
itemref = strings.TrimSpace(itemref) itemref = strings.TrimSpace(itemref)
if refnode, exists := p.identifiedNodes[itemref]; exists { if refnode, exists := p.identifiedNodes[itemref]; exists {
if refnode != node {
p.readItem(subitem, refnode) p.readItem(subitem, refnode)
} }
} }
} }
}
for child := node.FirstChild; child != nil; { for child := node.FirstChild; child != nil; {
p.readItem(subitem, child) p.readItem(subitem, child)
@ -198,15 +199,14 @@ func (p *Parser) readItem(item *Item, node *html.Node) {
} }
case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video: case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video:
if urlValue, exists := getAttr("src", node); exists { if urlValue, exists := getAttr("src", node); exists {
if parsedUrl, err := p.base.Parse(urlValue); err == nil { if parsedURL, err := p.base.Parse(urlValue); err == nil {
propertyValue = parsedUrl.String() propertyValue = parsedURL.String()
} }
} }
case atom.A, atom.Area, atom.Link: case atom.A, atom.Area, atom.Link:
if urlValue, exists := getAttr("href", node); exists { if urlValue, exists := getAttr("href", node); exists {
if parsedUrl, err := p.base.Parse(urlValue); err == nil { if parsedURL, err := p.base.Parse(urlValue); err == nil {
propertyValue = parsedUrl.String() propertyValue = parsedURL.String()
} }
} }
case atom.Object: case atom.Object:

View File

@ -8,6 +8,7 @@ package microdata
import ( import (
"bytes" "bytes"
"net/url" "net/url"
"reflect"
"strings" "strings"
"testing" "testing"
) )
@ -569,3 +570,30 @@ func TestJsonWithType(t *testing.T) {
t.Errorf("Expecting %s but got %s", expected, actual) t.Errorf("Expecting %s but got %s", expected, actual)
} }
} }
// TestSkipSelfReferencingItemref checks that an element whose itemref names
// its own id is parsed without overflowing the stack, as reported in
// https://github.com/iand/microdata/issues/3
//
// The self reference is expected to be ignored: the child item still carries
// the properties found in its own subtree, each exactly once.
func TestSkipSelfReferencingItemref(t *testing.T) {
	html := `<body itemscope itemtype="http://schema.org/WebPage">
<span id="1" itemscope itemtype="http://data-vocabulary.org/Breadcrumb" itemprop="child" itemref="1">
<a title="Foo" itemprop="url" href="/foo/bar"><span itemprop="title">Foo</span></a>
</span>
</body>`

	actual := ParseData(html, t)

	child := NewItem()
	child.AddString("title", "Foo")
	child.AddString("url", "http://example.com/foo/bar")

	item := NewItem()
	item.AddType("http://schema.org/WebPage")
	item.AddItem("child", child)

	expected := NewMicrodata()
	expected.AddItem(item)

	if !reflect.DeepEqual(expected, actual) {
		// expected and actual are structured values, not strings, so passing
		// them straight to %s produces unreadable output. Render both as JSON
		// so a failure shows the actual difference.
		expectedJSON, err := expected.JSON()
		if err != nil {
			t.Fatalf("encoding expected microdata: %v", err)
		}
		actualJSON, err := actual.JSON()
		if err != nil {
			t.Fatalf("encoding actual microdata: %v", err)
		}
		t.Errorf("Expecting %s but got %s", expectedJSON, actualJSON)
	}
}