diff --git a/README.md b/README.md
index ec2a26f..527bf8d 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,34 @@
-microdata - a microdata parser in Go
\ No newline at end of file
+microdata - a microdata parser in Go
+
+INSTALLATION
+============
+
+Simply run
+
+ go get github.com/iand/microdata
+
+Documentation is at [http://go.pkgdoc.org/github.com/iand/microdata](http://go.pkgdoc.org/github.com/iand/microdata)
+
+
+USAGE
+=====
+
+Example of parsing a string containing HTML:
+
+ import (
+ "net/url"
+ "strings"
+ )
+ html := `<div itemscope>
+ <p>My name is <span itemprop="name">Elizabeth</span>.</p>
+ </div>`
+
+ baseUrl, _ := url.Parse("http://example.com/")
+ p := NewParser(strings.NewReader(html), baseUrl)
+
+ data, err := p.Parse()
+ if err != nil {
+ t.Errorf("Expected no error but got %v", err)
+ }
+
+ println("Name: ", data.Items[0].Properties["name"][0].(string))
\ No newline at end of file
diff --git a/microdata.go b/microdata.go
index 7ac1237..b98ab7b 100644
--- a/microdata.go
+++ b/microdata.go
@@ -12,47 +12,49 @@ type ValueList []interface{}
type PropertyMap map[string]ValueList
type Item struct {
- properties PropertyMap
- types []string
- id string
+ Properties PropertyMap
+ Types []string
+ ID string
}
func NewItem() *Item {
return &Item{
- properties: make(PropertyMap, 0),
- types: make([]string, 0),
+ Properties: make(PropertyMap, 0),
+ Types: make([]string, 0),
}
}
func (self *Item) SetString(property string, value string) {
- self.properties[property] = append(self.properties[property], value)
+ self.Properties[property] = append(self.Properties[property], value)
}
func (self *Item) SetItem(property string, value *Item) {
- self.properties[property] = append(self.properties[property], value)
+ self.Properties[property] = append(self.Properties[property], value)
}
type Microdata struct {
- items []*Item
+ Items []*Item
}
func NewMicrodata() *Microdata {
return &Microdata{
- items: make([]*Item, 0),
+ Items: make([]*Item, 0),
}
}
type Parser struct {
p *h5.Parser
data *Microdata
+ base *url.URL
identifiedNodes map[string]*h5.Node
}
-func NewParser(r io.Reader, url.URL) *Parser {
+func NewParser(r io.Reader, base *url.URL) *Parser {
return &Parser{
p: h5.NewParser(r),
data: NewMicrodata(),
+ base: base,
}
}
@@ -80,17 +82,19 @@ func (self *Parser) Parse() (*Microdata, error) {
for _, node := range topLevelItemNodes {
item := NewItem()
- self.data.items = append(self.data.items, item)
+ self.data.Items = append(self.data.Items, item)
if itemtypes, exists := getAttr("itemtype", node); exists {
for _, itemtype := range strings.Split(strings.TrimSpace(itemtypes), " ") {
itemtype = strings.TrimSpace(itemtype)
if itemtype != "" {
- item.types = append(item.types, itemtype)
+ item.Types = append(item.Types, itemtype)
}
}
// itemid only valid when itemscope and itemtype are both present
if itemid, exists := getAttr("itemid", node); exists {
- item.id = strings.TrimSpace(itemid)
+ if parsedUrl, err := self.base.Parse(itemid); err == nil {
+ item.ID = parsedUrl.String()
+ }
}
}
@@ -152,11 +156,16 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
case "img", "audio", "source", "video", "embed", "iframe", "track":
if urlValue, exists := getAttr("src", node); exists {
- propertyValue = urlValue
+ if parsedUrl, err := self.base.Parse(urlValue); err == nil {
+ propertyValue = parsedUrl.String()
+ }
+
}
case "a", "area", "link":
if urlValue, exists := getAttr("href", node); exists {
- propertyValue = urlValue
+ if parsedUrl, err := self.base.Parse(urlValue); err == nil {
+ propertyValue = parsedUrl.String()
+ }
}
case "data":
if urlValue, exists := getAttr("value", node); exists {
@@ -178,14 +187,15 @@ func (self *Parser) readItem(item *Item, node *h5.Node) {
propertyValue = text.String()
}
- for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
- propertyName = strings.TrimSpace(propertyName)
- if propertyName != "" {
- item.SetString(propertyName, propertyValue)
+ if len(propertyValue) > 0 {
+ for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
+ propertyName = strings.TrimSpace(propertyName)
+ if propertyName != "" {
+ item.SetString(propertyName, propertyValue)
+ }
}
}
-
}
diff --git a/microdata_test.go b/microdata_test.go
index 42b34a3..172d3af 100644
--- a/microdata_test.go
+++ b/microdata_test.go
@@ -1,12 +1,14 @@
package microdata
import (
+ "net/url"
"strings"
"testing"
)
func ParseData(html string, t *testing.T) *Microdata {
- p := NewParser(strings.NewReader(html), url.URL("http://example.com/")
+ u, _ := url.Parse("http://example.com/")
+ p := NewParser(strings.NewReader(html), u)
data, err := p.Parse()
if err != nil {
@@ -22,7 +24,7 @@ func ParseData(html string, t *testing.T) *Microdata {
func ParseOneItem(html string, t *testing.T) *Item {
data := ParseData(html, t)
- return data.items[0]
+ return data.Items[0]
}
func TestParse(t *testing.T) {
@@ -33,7 +35,7 @@ func TestParse(t *testing.T) {
item := ParseOneItem(html, t)
- if item.properties["name"][0].(string) != "Elizabeth" {
+ if item.Properties["name"][0].(string) != "Elizabeth" {
t.Errorf("Property value not found")
}
@@ -46,7 +48,7 @@ func TestParseActuallyParses(t *testing.T) {
`
item := ParseOneItem(html, t)
- if item.properties["name"][0].(string) != "Daniel" {
+ if item.Properties["name"][0].(string) != "Daniel" {
t.Errorf("Property value not found")
}
@@ -62,15 +64,15 @@ func TestParseThreeProps(t *testing.T) {
item := ParseOneItem(html, t)
- if item.properties["name"][0].(string) != "Neil" {
+ if item.Properties["name"][0].(string) != "Neil" {
t.Errorf("Property value not found")
}
- if item.properties["band"][0].(string) != "Four Parts Water" {
+ if item.Properties["band"][0].(string) != "Four Parts Water" {
t.Errorf("Property value not found")
}
- if item.properties["nationality"][0].(string) != "British" {
+ if item.Properties["nationality"][0].(string) != "British" {
t.Errorf("Property value not found")
}
}
@@ -78,12 +80,12 @@ func TestParseThreeProps(t *testing.T) {
func TestParseImgSrc(t *testing.T) {
html := `
-
+
`
item := ParseOneItem(html, t)
- if item.properties["image"][0].(string) != "google-logo.png" {
+ if item.Properties["image"][0].(string) != "http://example.com/foo" {
t.Errorf("Property value not found")
}
}
@@ -91,12 +93,12 @@ func TestParseImgSrc(t *testing.T) {
func TestParseAHref(t *testing.T) {
html := `