Merge pull request #4 from mcnijman/master

Fix #3: fatal stack overflow
master
Ian Davis 2018-09-14 13:06:53 +01:00 committed by GitHub
commit f416fa49b8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 84 additions and 55 deletions

View File

@ -1,4 +1,5 @@
# microdata # microdata
A microdata parser in Go A microdata parser in Go
See [http://www.w3.org/TR/microdata/](http://www.w3.org/TR/microdata/) for more information about Microdata See [http://www.w3.org/TR/microdata/](http://www.w3.org/TR/microdata/) for more information about Microdata
@ -9,79 +10,79 @@ See [http://www.w3.org/TR/microdata/](http://www.w3.org/TR/microdata/) for more
Simply run Simply run
go get github.com/iand/microdata go get github.com/iand/microdata
Documentation is at [http://godoc.org/github.com/iand/microdata](http://godoc.org/github.com/iand/microdata) Documentation is at [http://godoc.org/github.com/iand/microdata](http://godoc.org/github.com/iand/microdata)
## Usage ## Usage
Example of parsing a string containing HTML: Example of parsing a string containing HTML:
package main ```go
package main
import ( import (
"github.com/iand/microdata" "github.com/iand/microdata"
"net/url" "net/url"
"strings" "strings"
) )
func main() { func main() {
html := `<div itemscope> html := `<div itemscope>
<p>My name is <span itemprop="name">Elizabeth</span>.</p> <p>My name is <span itemprop="name">Elizabeth</span>.</p>
</div>` </div>`
baseUrl, _ := url.Parse("http://example.com/") baseUrl, _ := url.Parse("http://example.com/")
p := microdata.NewParser(strings.NewReader(html), baseUrl) p := microdata.NewParser(strings.NewReader(html), baseUrl)
data, err := p.Parse() data, err := p.Parse()
if err != nil { if err != nil {
panic(err) panic(err)
} }
println("Name: ", data.Items[0].Properties["name"][0].(string)) println("Name: ", data.Items[0].Properties["name"][0].(string))
} }
```
Extract microdata from a webpage and print the result as JSON Extract microdata from a webpage and print the result as JSON
package main ```go
package main
import ( import (
"bytes" "bytes"
"io/ioutil" "io/ioutil"
"net/http" "net/http"
"net/url" "net/url"
"os" "os"
"github.com/iand/microdata" "github.com/iand/microdata"
) )
func main() { func main() {
baseUrl, _ := url.Parse("http://www.designhive.com/blog/using-schemaorg-microdata") baseUrl, _ := url.Parse("http://www.designhive.com/blog/using-schemaorg-microdata")
resp, _ := http.Get(baseUrl.String()) resp, _ := http.Get(baseUrl.String())
defer resp.Body.Close() defer resp.Body.Close()
html, _ := ioutil.ReadAll(resp.Body) html, _ := ioutil.ReadAll(resp.Body)
p := microdata.NewParser(bytes.NewReader(html), baseUrl) p := microdata.NewParser(bytes.NewReader(html), baseUrl)
data, _ := p.Parse() data, _ := p.Parse()
json, _ := data.JSON()
os.Stdout.Write(json)
}
json, _ := data.JSON()
os.Stdout.Write(json)
}
```
## Authors ## Authors
* [Ian Davis](http://github.com/iand) - <http://iandavis.com/> * [Ian Davis](http://github.com/iand) - <http://iandavis.com/>
## Contributors ## Contributors
## Contributing ## Contributing
* Do submit your changes as a pull request * Do submit your changes as a pull request

View File

@ -18,12 +18,12 @@ import (
"golang.org/x/net/html/atom" "golang.org/x/net/html/atom"
) )
type ValueList []interface{} type valueList []interface{}
type PropertyMap map[string]ValueList type propertyMap map[string]valueList
// Item represents a microdata item // Item represents a microdata item
type Item struct { type Item struct {
Properties PropertyMap `json:"properties"` Properties propertyMap `json:"properties"`
Types []string `json:"type,omitempty"` Types []string `json:"type,omitempty"`
ID string `json:"id,omitempty"` ID string `json:"id,omitempty"`
} }
@ -31,7 +31,7 @@ type Item struct {
// NewItem creates a new microdata item // NewItem creates a new microdata item
func NewItem() *Item { func NewItem() *Item {
return &Item{ return &Item{
Properties: make(PropertyMap, 0), Properties: make(propertyMap, 0),
Types: make([]string, 0), Types: make([]string, 0),
} }
} }
@ -132,11 +132,10 @@ func (p *Parser) Parse() (*Microdata, error) {
} }
// itemid only valid when itemscope and itemtype are both present // itemid only valid when itemscope and itemtype are both present
if itemid, exists := getAttr("itemid", node); exists { if itemid, exists := getAttr("itemid", node); exists {
if parsedUrl, err := p.base.Parse(itemid); err == nil { if parsedURL, err := p.base.Parse(itemid); err == nil {
item.ID = parsedUrl.String() item.ID = parsedURL.String()
} }
} }
} }
if itemrefs, exists := getAttr("itemref", node); exists { if itemrefs, exists := getAttr("itemref", node); exists {
@ -168,7 +167,9 @@ func (p *Parser) readItem(item *Item, node *html.Node) {
itemref = strings.TrimSpace(itemref) itemref = strings.TrimSpace(itemref)
if refnode, exists := p.identifiedNodes[itemref]; exists { if refnode, exists := p.identifiedNodes[itemref]; exists {
p.readItem(subitem, refnode) if refnode != node {
p.readItem(subitem, refnode)
}
} }
} }
} }
@ -198,15 +199,14 @@ func (p *Parser) readItem(item *Item, node *html.Node) {
} }
case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video: case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video:
if urlValue, exists := getAttr("src", node); exists { if urlValue, exists := getAttr("src", node); exists {
if parsedUrl, err := p.base.Parse(urlValue); err == nil { if parsedURL, err := p.base.Parse(urlValue); err == nil {
propertyValue = parsedUrl.String() propertyValue = parsedURL.String()
} }
} }
case atom.A, atom.Area, atom.Link: case atom.A, atom.Area, atom.Link:
if urlValue, exists := getAttr("href", node); exists { if urlValue, exists := getAttr("href", node); exists {
if parsedUrl, err := p.base.Parse(urlValue); err == nil { if parsedURL, err := p.base.Parse(urlValue); err == nil {
propertyValue = parsedUrl.String() propertyValue = parsedURL.String()
} }
} }
case atom.Object: case atom.Object:

View File

@ -8,6 +8,7 @@ package microdata
import ( import (
"bytes" "bytes"
"net/url" "net/url"
"reflect"
"strings" "strings"
"testing" "testing"
) )
@ -569,3 +570,30 @@ func TestJsonWithType(t *testing.T) {
t.Errorf("Expecting %s but got %s", expected, actual) t.Errorf("Expecting %s but got %s", expected, actual)
} }
} }
// TestSkipSelfReferencingItemref checks that an element whose itemref names
// its own id is parsed without the stack overflow reported in
// https://github.com/iand/microdata/issues/3. The self-reference must be
// skipped while the element's own properties are still collected.
func TestSkipSelfReferencingItemref(t *testing.T) {
	// The inner span carries both id="1" and itemref="1", i.e. it refers to
	// itself.
	html := `<body itemscope itemtype="http://schema.org/WebPage">
<span id="1" itemscope itemtype="http://data-vocabulary.org/Breadcrumb" itemprop="child" itemref="1">
<a title="Foo" itemprop="url" href="/foo/bar"><span itemprop="title">Foo</span></a>
</span>
</body>`

	actual := ParseData(html, t)

	// NOTE(review): the absolute URL below presumably reflects ParseData
	// resolving the relative href against http://example.com/ — confirm
	// against the helper.
	child := NewItem()
	child.AddString("title", "Foo")
	child.AddString("url", "http://example.com/foo/bar")

	item := NewItem()
	item.AddType("http://schema.org/WebPage")
	item.AddItem("child", child)

	expected := NewMicrodata()
	expected.AddItem(item)

	if !reflect.DeepEqual(expected, actual) {
		// Both values are pointers to structs that hold further item
		// pointers, so formatting them directly with %s yields only
		// "%!s(...)" noise and addresses. Render them as JSON so the failure
		// shows the data that differs. Marshalling errors are deliberately
		// ignored: the test is already failing and an empty rendering is
		// still reported.
		expectedJSON, _ := expected.JSON()
		actualJSON, _ := actual.JSON()
		t.Errorf("Expecting %s but got %s", expectedJSON, actualJSON)
	}
}