commit 121d88747a1a64f0ff11b3ed619dfa421e60a68c Author: Ian Davis Date: Thu Jun 7 00:49:06 2012 +0100 Initial commit diff --git a/README.md b/README.md new file mode 100644 index 0000000..ec2a26f --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +microdata - a microdata parser in Go \ No newline at end of file diff --git a/microdata.go b/microdata.go new file mode 100644 index 0000000..f79c88c --- /dev/null +++ b/microdata.go @@ -0,0 +1,148 @@ +package microdata + +import ( + "bytes" + "code.google.com/p/go-html-transform/h5" + "io" +) + + + +type ValueList []interface{} +type PropertyMap map[string]ValueList + +type Item struct { + properties PropertyMap +} + +func NewItem() *Item { + return &Item{ + properties: make(PropertyMap, 10), + } +} + +func (self *Item) SetString(property string, value string) { + self.properties[property] = append(self.properties[property], value) +} + +type Microdata struct { + items []*Item +} + +func NewMicrodata() *Microdata { + return &Microdata{ + items: make([]*Item, 0), + } +} + +type Parser struct { + p *h5.Parser + data *Microdata +} + +func NewParser(r io.Reader) *Parser { + return &Parser { + p : h5.NewParser(r), + data: NewMicrodata(), + } +} + +func (self *Parser) Parse() (*Microdata, error) { + err := self.p.Parse() + if err != nil { + return nil, err + } + tree := self.p.Tree() + + self.scanForItem(tree) + + return self.data, nil +} + +func (self *Parser) scanForItem(node *h5.Node) { + if node == nil { + return + } + + hasItemscope := false + + for _, a := range node.Attr { + if a.Name == "itemscope" { + hasItemscope = true + break + } + } + if hasItemscope { + item := NewItem() + self.data.items = append(self.data.items, item) + + + if len(node.Children) > 0 { + for _, child := range node.Children { + self.readItem(item, child) + } + } + + } else { + if len(node.Children) > 0 { + for _, child := range node.Children { + self.scanForItem(child) + } + } + } + +} + +func (self *Parser) readItem(item *Item, node *h5.Node) { + if propertyName, exists := getAttr("itemprop", node); exists { + var propertyValue string + + switch node.Data() { + + case "img","audio", "source", "video", "embed", "iframe", "track": + if urlValue, exists := getAttr("src", node); exists { + propertyValue = urlValue + } + case "a", "area", "link": + if urlValue, exists := getAttr("href", node); exists { + propertyValue = urlValue + } + case "data": + if urlValue, exists := getAttr("value", node); exists { + propertyValue = urlValue + } + case "time": + if urlValue, exists := getAttr("datetime", node); exists { + propertyValue = urlValue + } + + default: + var text bytes.Buffer + node.Walk( func(n *h5.Node) { + if n.Type == h5.TextNode { + text.WriteString(n.Data()) + } + + }) + propertyValue = text.String() + } + + item.SetString(propertyName, propertyValue) + } + + if len(node.Children) > 0 { + for _, child := range node.Children { + self.readItem(item, child) + } + } +} + +func getAttr(name string, node *h5.Node) (string, bool) { + for _, a := range node.Attr { + if a.Name == name { + return a.Value, true + } + } + return "", false +} + diff --git a/microdata_test.go b/microdata_test.go new file mode 100644 index 0000000..f7eda9b --- /dev/null +++ b/microdata_test.go @@ -0,0 +1,236 @@ +package microdata + +import ( + "strings" + "testing" +) + +func ReadOneItem(html string, t *testing.T) *Item { + p := NewParser(strings.NewReader(html)) + + data, err := p.Parse() + if err != nil { + t.Errorf("Expected no error but got %d", err) + } + + if data == nil { + t.Errorf("Expected non-nil data") + } + + return data.items[0] +} + + +func TestRead(t *testing.T) { + html := ` +
+

My name is Elizabeth.

+
` + + item := ReadOneItem(html, t) + + if item.properties["name"][0].(string) != "Elizabeth" { + t.Errorf("Property value not found") + } + +} + + +func TestReadActuallyParses(t *testing.T) { + html := ` +
+

My name is Daniel.

+
` + item := ReadOneItem(html, t) + + if item.properties["name"][0].(string) != "Daniel" { + t.Errorf("Property value not found") + } + +} + + +func TestReadThreeProps(t *testing.T) { + html := ` +
+

My name is Neil.

+

My band is called Four Parts Water.

+

I am British.

+
` + + item := ReadOneItem(html, t) + + if item.properties["name"][0].(string) != "Neil" { + t.Errorf("Property value not found") + } + + if item.properties["band"][0].(string) != "Four Parts Water" { + t.Errorf("Property value not found") + } + + if item.properties["nationality"][0].(string) != "British" { + t.Errorf("Property value not found") + } +} + + +func TestReadImgSrc(t *testing.T) { + html := ` +
+ Google +
` + + item := ReadOneItem(html, t) + + if item.properties["image"][0].(string) != "google-logo.png" { + t.Errorf("Property value not found") + } +} + +func TestReadAHref(t *testing.T) { + html := ` +
+ foo +
` + + item := ReadOneItem(html, t) + + if item.properties["image"][0].(string) != "google-logo.png" { + t.Errorf("Property value not found") + } +} + +func TestReadAreaHref(t *testing.T) { + html := ` +
+ + +
` + + item := ReadOneItem(html, t) + + if item.properties["foo"][0].(string) != "target.html" { + t.Errorf("Property value not found") + } +} + +func TestReadLinkHref(t *testing.T) { + html := ` +
+ +
` + + item := ReadOneItem(html, t) + + if item.properties["foo"][0].(string) != "target.html" { + t.Errorf("Property value not found") + } +} + +func TestReadAudioSrc(t *testing.T) { + html := ` +
+ +
` + + item := ReadOneItem(html, t) + + if item.properties["foo"][0].(string) != "target" { + t.Errorf("Property value not found") + } +} + +func TestReadSourceSrc(t *testing.T) { + html := ` +
+ +
` + + item := ReadOneItem(html, t) + + if item.properties["foo"][0].(string) != "target" { + t.Errorf("Property value not found") + } +} + + +func TestReadVideoSrc(t *testing.T) { + html := ` +
+ +
` + + item := ReadOneItem(html, t) + + if item.properties["foo"][0].(string) != "target" { + t.Errorf("Property value not found") + } +} + +func TestReadEmbedSrc(t *testing.T) { + html := ` +
+ +
` + + item := ReadOneItem(html, t) + + if item.properties["foo"][0].(string) != "target" { + t.Errorf("Property value not found") + } +} + +func TestReadTrackSrc(t *testing.T) { + html := ` +
+ +
` + + item := ReadOneItem(html, t) + + if item.properties["foo"][0].(string) != "target" { + t.Errorf("Property value not found") + } +} + +func TestReadIFrameSrc(t *testing.T) { + html := ` +
+ +
` + + item := ReadOneItem(html, t) + + if item.properties["foo"][0].(string) != "target" { + t.Errorf("Property value not found") + } +} + +func TestReadDataValue(t *testing.T) { + html := ` +

+ The Instigator 2000 +

` + + item := ReadOneItem(html, t) + + if item.properties["product-id"][0].(string) != "9678AOU879" { + t.Errorf("Property value not found") + } +} + +func TestReadTimeDatetime(t *testing.T) { + html := ` +

+ I was born on . +

` + + item := ReadOneItem(html, t) + + if item.properties["birthday"][0].(string) != "2009-05-10" { + t.Errorf("Property value not found") + } +} + + +