Initial commit

master
Ian Davis 2012-06-07 00:49:06 +01:00
commit 121d88747a
3 changed files with 385 additions and 0 deletions

1
README.md 100644
View File

@@ -0,0 +1 @@
microdata - a microdata parser in Go

148
microdata.go 100644
View File

@@ -0,0 +1,148 @@
package microdata
import (
"bytes"
"code.google.com/p/go-html-transform/h5"
"io"
)
// ValueList is the ordered list of values recorded for one property.
// Elements are stored as interface{}; SetString appends plain strings.
type ValueList []interface{}

// PropertyMap maps a property name to the values recorded for it.
type PropertyMap map[string]ValueList

// Item is a single microdata item: a set of named properties, each of which
// may hold several values.
type Item struct {
	properties PropertyMap
}

// NewItem returns an empty Item whose property map is already allocated.
func NewItem() *Item {
	return &Item{
		properties: make(PropertyMap, 10),
	}
}

// SetString appends value to the list of values held for property. Despite
// the name it never replaces earlier values: calling it repeatedly with the
// same property accumulates every value in call order.
func (it *Item) SetString(property string, value string) {
	// An Item that was not built with NewItem has a nil map, and writing to a
	// nil map panics, so allocate it on first use.
	if it.properties == nil {
		it.properties = make(PropertyMap)
	}
	it.properties[property] = append(it.properties[property], value)
}
// Microdata is the result of a parse: the list of items that were collected,
// kept in the order they were appended.
type Microdata struct {
	items []*Item
}

// NewMicrodata returns a Microdata whose item list is empty but non-nil.
func NewMicrodata() *Microdata {
	md := new(Microdata)
	md.items = []*Item{}
	return md
}
// Parser extracts microdata items from an HTML document. p is the underlying
// h5 HTML parser and data accumulates the items found while walking its tree.
type Parser struct {
	p    *h5.Parser
	data *Microdata
}

// NewParser returns a Parser that reads HTML from r and starts with an empty
// Microdata result.
func NewParser(r io.Reader) *Parser {
	parser := new(Parser)
	parser.p = h5.NewParser(r)
	parser.data = NewMicrodata()
	return parser
}
// Parse runs the underlying HTML parser, walks the resulting tree collecting
// microdata items, and returns the accumulated Microdata. If the HTML parser
// reports an error, that error is returned together with a nil result.
func (ps *Parser) Parse() (*Microdata, error) {
	if err := ps.p.Parse(); err != nil {
		return nil, err
	}
	ps.scanForItem(ps.p.Tree())
	return ps.data, nil
}
// scanForItem walks the tree rooted at node looking for elements that carry an
// itemscope attribute. Each such element starts a new Item, which is appended
// to the parser's result and populated from the element's children via
// readItem. Elements without itemscope are descended into recursively.
// A nil node is ignored.
func (ps *Parser) scanForItem(node *h5.Node) {
	if node == nil {
		return
	}

	if _, scoped := getAttr("itemscope", node); !scoped {
		// Not an item itself: keep searching below it.
		for _, child := range node.Children {
			ps.scanForItem(child)
		}
		return
	}

	item := NewItem()
	ps.data.items = append(ps.data.items, item)
	// Everything below an itemscope element is read as that item's content;
	// the subtree is not scanned again for further items.
	for _, child := range node.Children {
		ps.readItem(item, child)
	}
}
// readItem records the property carried by node, if any, on item and then
// does the same for every descendant of node. A node carries a property when
// it has an itemprop attribute; the attribute's value is used as the property
// name and the property value is chosen by propertyValue. A nil node is
// ignored.
func (ps *Parser) readItem(item *Item, node *h5.Node) {
	if node == nil {
		return
	}
	if propertyName, exists := getAttr("itemprop", node); exists {
		item.SetString(propertyName, propertyValue(node))
	}
	for _, child := range node.Children {
		ps.readItem(item, child)
	}
}

// propertyValue returns the microdata property value of node, following the
// per-element rules of the HTML microdata specification: certain elements take
// their value from a specific attribute (the empty string when that attribute
// is absent) and everything else uses its text content.
func propertyValue(node *h5.Node) string {
	var attr string
	switch node.Data() {
	case "meta":
		attr = "content"
	case "img", "audio", "source", "video", "embed", "iframe", "track":
		attr = "src"
	case "a", "area", "link":
		attr = "href"
	case "object":
		attr = "data"
	case "data", "meter":
		attr = "value"
	case "time":
		// A time element only uses datetime when the attribute is present;
		// otherwise its value is its text content, like any other element.
		if value, ok := getAttr("datetime", node); ok {
			return value
		}
		return textContent(node)
	default:
		return textContent(node)
	}
	// getAttr yields "" for a missing attribute, which is the value wanted.
	value, _ := getAttr(attr, node)
	return value
}

// textContent concatenates the data of every text node visited by walking
// node, in walk order.
func textContent(node *h5.Node) string {
	var text bytes.Buffer
	node.Walk(func(n *h5.Node) {
		if n.Type == h5.TextNode {
			text.WriteString(n.Data())
		}
	})
	return text.String()
}
// getAttr looks up the attribute called name on node. It returns the value of
// the first matching attribute and true, or the empty string and false when
// node has no attribute with that name.
func getAttr(name string, node *h5.Node) (string, bool) {
	for i := range node.Attr {
		if attr := node.Attr[i]; attr.Name == name {
			return attr.Value, true
		}
	}
	return "", false
}

236
microdata_test.go 100644
View File

@@ -0,0 +1,236 @@
package microdata
import (
"strings"
"testing"
)
// ReadOneItem parses html and returns the first microdata item found in it.
// The calling test is stopped immediately if parsing fails, if no result is
// returned, or if the document contains no items, so callers can use the
// returned item without further checks.
func ReadOneItem(html string, t *testing.T) *Item {
	p := NewParser(strings.NewReader(html))
	data, err := p.Parse()
	if err != nil {
		// %v, not %d: err is an error value, not an integer.
		t.Fatalf("Expected no error but got %v", err)
	}
	if data == nil {
		t.Fatal("Expected non-nil data")
	}
	if len(data.items) == 0 {
		t.Fatal("Expected at least one item but found none")
	}
	return data.items[0]
}
// TestRead checks that the text content of an element with itemprop is
// recorded as the value of that property.
func TestRead(t *testing.T) {
	html := `
<div itemscope>
<p>My name is <span itemprop="name">Elizabeth</span>.</p>
</div>`

	item := ReadOneItem(html, t)

	const prop, want = "name", "Elizabeth"
	values := item.properties[prop]
	// Check the length first so a missing property fails instead of panicking.
	if len(values) == 0 {
		t.Fatalf("property %q not found", prop)
	}
	if got, ok := values[0].(string); !ok || got != want {
		t.Errorf("property %q = %v, want %q", prop, values[0], want)
	}
}
// TestReadActuallyParses repeats the basic text-content check with a
// different name, guarding against a parser that returns a fixed value.
func TestReadActuallyParses(t *testing.T) {
	html := `
<div itemscope>
<p>My name is <span itemprop="name">Daniel</span>.</p>
</div>`

	item := ReadOneItem(html, t)

	const prop, want = "name", "Daniel"
	values := item.properties[prop]
	// Check the length first so a missing property fails instead of panicking.
	if len(values) == 0 {
		t.Fatalf("property %q not found", prop)
	}
	if got, ok := values[0].(string); !ok || got != want {
		t.Errorf("property %q = %v, want %q", prop, values[0], want)
	}
}
// TestReadThreeProps checks that several differently named properties inside
// one item are all recorded with their own text content.
func TestReadThreeProps(t *testing.T) {
	html := `
<div itemscope>
<p>My name is <span itemprop="name">Neil</span>.</p>
<p>My band is called <span itemprop="band">Four Parts Water</span>.</p>
<p>I am <span itemprop="nationality">British</span>.</p>
</div>`

	item := ReadOneItem(html, t)

	expected := []struct{ prop, want string }{
		{"name", "Neil"},
		{"band", "Four Parts Water"},
		{"nationality", "British"},
	}
	for _, e := range expected {
		values := item.properties[e.prop]
		// Report a missing property and carry on with the rest instead of
		// panicking on the index below.
		if len(values) == 0 {
			t.Errorf("property %q not found", e.prop)
			continue
		}
		if got, ok := values[0].(string); !ok || got != e.want {
			t.Errorf("property %q = %v, want %q", e.prop, values[0], e.want)
		}
	}
}
// TestReadImgSrc checks that an img element's src attribute is used as the
// property value.
func TestReadImgSrc(t *testing.T) {
	html := `
<div itemscope>
<img itemprop="image" src="google-logo.png" alt="Google">
</div>`

	item := ReadOneItem(html, t)

	const prop, want = "image", "google-logo.png"
	values := item.properties[prop]
	// Check the length first so a missing property fails instead of panicking.
	if len(values) == 0 {
		t.Fatalf("property %q not found", prop)
	}
	if got, ok := values[0].(string); !ok || got != want {
		t.Errorf("property %q = %v, want %q", prop, values[0], want)
	}
}
// TestReadAHref checks that an a element's href attribute, not its text, is
// used as the property value.
func TestReadAHref(t *testing.T) {
	html := `
<div itemscope>
<a itemprop="image" href="google-logo.png">foo</a>
</div>`

	item := ReadOneItem(html, t)

	const prop, want = "image", "google-logo.png"
	values := item.properties[prop]
	// Check the length first so a missing property fails instead of panicking.
	if len(values) == 0 {
		t.Fatalf("property %q not found", prop)
	}
	if got, ok := values[0].(string); !ok || got != want {
		t.Errorf("property %q = %v, want %q", prop, values[0], want)
	}
}
// TestReadAreaHref checks that an area element's href attribute is used as the
// property value.
func TestReadAreaHref(t *testing.T) {
	html := `
<div itemscope><map name="shapes">
<area itemprop="foo" href="target.html" shape=rect coords="50,50,100,100">
</map></div>`

	item := ReadOneItem(html, t)

	const prop, want = "foo", "target.html"
	values := item.properties[prop]
	// Check the length first so a missing property fails instead of panicking.
	if len(values) == 0 {
		t.Fatalf("property %q not found", prop)
	}
	if got, ok := values[0].(string); !ok || got != want {
		t.Errorf("property %q = %v, want %q", prop, values[0], want)
	}
}
// TestReadLinkHref checks that a link element's href attribute is used as the
// property value.
func TestReadLinkHref(t *testing.T) {
	html := `
<div itemscope>
<link itemprop="foo" rel="author" href="target.html">
</div>`

	item := ReadOneItem(html, t)

	const prop, want = "foo", "target.html"
	values := item.properties[prop]
	// Check the length first so a missing property fails instead of panicking.
	if len(values) == 0 {
		t.Fatalf("property %q not found", prop)
	}
	if got, ok := values[0].(string); !ok || got != want {
		t.Errorf("property %q = %v, want %q", prop, values[0], want)
	}
}
// TestReadAudioSrc checks that an audio element's src attribute is used as the
// property value.
func TestReadAudioSrc(t *testing.T) {
	html := `
<div itemscope>
<audio itemprop="foo" src="target"></audio>
</div>`

	item := ReadOneItem(html, t)

	const prop, want = "foo", "target"
	values := item.properties[prop]
	// Check the length first so a missing property fails instead of panicking.
	if len(values) == 0 {
		t.Fatalf("property %q not found", prop)
	}
	if got, ok := values[0].(string); !ok || got != want {
		t.Errorf("property %q = %v, want %q", prop, values[0], want)
	}
}
// TestReadSourceSrc checks that a source element's src attribute is used as
// the property value.
func TestReadSourceSrc(t *testing.T) {
	html := `
<div itemscope>
<source itemprop="foo" src="target"></source>
</div>`

	item := ReadOneItem(html, t)

	const prop, want = "foo", "target"
	values := item.properties[prop]
	// Check the length first so a missing property fails instead of panicking.
	if len(values) == 0 {
		t.Fatalf("property %q not found", prop)
	}
	if got, ok := values[0].(string); !ok || got != want {
		t.Errorf("property %q = %v, want %q", prop, values[0], want)
	}
}
// TestReadVideoSrc checks that a video element's src attribute is used as the
// property value.
func TestReadVideoSrc(t *testing.T) {
	html := `
<div itemscope>
<video itemprop="foo" src="target"></video>
</div>`

	item := ReadOneItem(html, t)

	const prop, want = "foo", "target"
	values := item.properties[prop]
	// Check the length first so a missing property fails instead of panicking.
	if len(values) == 0 {
		t.Fatalf("property %q not found", prop)
	}
	if got, ok := values[0].(string); !ok || got != want {
		t.Errorf("property %q = %v, want %q", prop, values[0], want)
	}
}
// TestReadEmbedSrc checks that an embed element's src attribute is used as the
// property value.
func TestReadEmbedSrc(t *testing.T) {
	html := `
<div itemscope>
<embed itemprop="foo" src="target"></embed>
</div>`

	item := ReadOneItem(html, t)

	const prop, want = "foo", "target"
	values := item.properties[prop]
	// Check the length first so a missing property fails instead of panicking.
	if len(values) == 0 {
		t.Fatalf("property %q not found", prop)
	}
	if got, ok := values[0].(string); !ok || got != want {
		t.Errorf("property %q = %v, want %q", prop, values[0], want)
	}
}
// TestReadTrackSrc checks that a track element's src attribute is used as the
// property value.
func TestReadTrackSrc(t *testing.T) {
	html := `
<div itemscope>
<track itemprop="foo" src="target"></track>
</div>`

	item := ReadOneItem(html, t)

	const prop, want = "foo", "target"
	values := item.properties[prop]
	// Check the length first so a missing property fails instead of panicking.
	if len(values) == 0 {
		t.Fatalf("property %q not found", prop)
	}
	if got, ok := values[0].(string); !ok || got != want {
		t.Errorf("property %q = %v, want %q", prop, values[0], want)
	}
}
// TestReadIFrameSrc checks that an iframe element's src attribute is used as
// the property value.
func TestReadIFrameSrc(t *testing.T) {
	html := `
<div itemscope>
<iframe itemprop="foo" src="target"></iframe>
</div>`

	item := ReadOneItem(html, t)

	const prop, want = "foo", "target"
	values := item.properties[prop]
	// Check the length first so a missing property fails instead of panicking.
	if len(values) == 0 {
		t.Fatalf("property %q not found", prop)
	}
	if got, ok := values[0].(string); !ok || got != want {
		t.Errorf("property %q = %v, want %q", prop, values[0], want)
	}
}
// TestReadDataValue checks that a data element's value attribute, not its
// text, is used as the property value.
func TestReadDataValue(t *testing.T) {
	html := `
<h1 itemscope>
<data itemprop="product-id" value="9678AOU879">The Instigator 2000</data>
</h1>`

	item := ReadOneItem(html, t)

	const prop, want = "product-id", "9678AOU879"
	values := item.properties[prop]
	// Check the length first so a missing property fails instead of panicking.
	if len(values) == 0 {
		t.Fatalf("property %q not found", prop)
	}
	if got, ok := values[0].(string); !ok || got != want {
		t.Errorf("property %q = %v, want %q", prop, values[0], want)
	}
}
// TestReadTimeDatetime checks that a time element's datetime attribute, not
// its text, is used as the property value.
func TestReadTimeDatetime(t *testing.T) {
	html := `
<h1 itemscope>
I was born on <time itemprop="birthday" datetime="2009-05-10">May 10th 2009</time>.
</h1>`

	item := ReadOneItem(html, t)

	const prop, want = "birthday", "2009-05-10"
	values := item.properties[prop]
	// Check the length first so a missing property fails instead of panicking.
	if len(values) == 0 {
		t.Fatalf("property %q not found", prop)
	}
	if got, ok := values[0].(string); !ok || got != want {
		t.Errorf("property %q = %v, want %q", prop, values[0], want)
	}
}