microdata_mirror/microdata.go

256 lines
6.3 KiB
Go
Raw Normal View History

/*
This is free and unencumbered software released into the public domain. For more
information, see <http://unlicense.org/> or the accompanying UNLICENSE file.
*/
// Package microdata provides types and functions for parsing microdata from web pages.
// See http://www.w3.org/TR/microdata/ for more information about Microdata.
package microdata
import (
"bytes"
2012-06-10 22:49:15 +04:00
"encoding/json"
2012-06-07 03:49:06 +04:00
"io"
2012-06-10 19:26:53 +04:00
"net/url"
2012-06-07 18:31:43 +04:00
"strings"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
2012-06-07 03:49:06 +04:00
)
2018-09-14 14:48:17 +03:00
type (
	// valueList is the ordered list of values recorded for one property
	// name. Entries are added by Item.AddString (string values) and
	// Item.AddItem (*Item values).
	valueList []interface{}

	// propertyMap maps a property name to the values recorded for it.
	propertyMap map[string]valueList
)
2012-06-07 03:49:06 +04:00
// Item represents a microdata item
2012-06-07 03:49:06 +04:00
type Item struct {
2018-09-14 14:48:17 +03:00
Properties propertyMap `json:"properties"`
2012-06-10 22:49:15 +04:00
Types []string `json:"type,omitempty"`
ID string `json:"id,omitempty"`
2012-06-07 03:49:06 +04:00
}
// NewItem creates a new microdata item
2012-06-07 03:49:06 +04:00
func NewItem() *Item {
return &Item{
2018-09-14 14:48:17 +03:00
Properties: make(propertyMap, 0),
2012-06-10 21:57:35 +04:00
Types: make([]string, 0),
2012-06-07 03:49:06 +04:00
}
}
// AddString adds a string type item property value
func (i *Item) AddString(property string, value string) {
i.Properties[property] = append(i.Properties[property], value)
2012-06-07 03:49:06 +04:00
}
// AddItem adds an Item type item property value
func (i *Item) AddItem(property string, value *Item) {
i.Properties[property] = append(i.Properties[property], value)
2012-06-10 19:22:51 +04:00
}
// AddType adds a type to the item
func (i *Item) AddType(value string) {
i.Types = append(i.Types, value)
2012-06-10 22:49:15 +04:00
}
2012-06-10 19:22:51 +04:00
// Microdata represents a set of microdata items
2012-06-07 03:49:06 +04:00
type Microdata struct {
2012-06-10 22:49:15 +04:00
Items []*Item `json:"items"`
2012-06-07 03:49:06 +04:00
}
// NewMicrodata creates a new microdata set
2012-06-07 03:49:06 +04:00
func NewMicrodata() *Microdata {
return &Microdata{
2012-06-10 21:57:35 +04:00
Items: make([]*Item, 0),
2012-06-07 03:49:06 +04:00
}
}
// AddItem adds an item to the microdata set
func (m *Microdata) AddItem(value *Item) {
m.Items = append(m.Items, value)
2012-06-10 22:49:15 +04:00
}
// JSON converts the microdata set to JSON
func (m *Microdata) JSON() ([]byte, error) {
b, err := json.Marshal(m)
2012-06-10 22:49:15 +04:00
if err != nil {
return nil, err
}
return b, nil
}
// Parser is an HTML parser that extracts microdata
2012-06-07 03:49:06 +04:00
type Parser struct {
r io.Reader
2012-06-10 17:59:30 +04:00
data *Microdata
2012-06-10 22:49:15 +04:00
base *url.URL
2013-07-10 19:59:28 +04:00
identifiedNodes map[string]*html.Node
2012-06-07 03:49:06 +04:00
}
// NewParser creates a new parser for extracting microdata
// r is a reader over an HTML document
// base is the base URL for resolving relative URLs
2012-06-10 21:57:35 +04:00
func NewParser(r io.Reader, base *url.URL) *Parser {
2012-06-10 17:59:30 +04:00
return &Parser{
r: r,
2012-06-07 03:49:06 +04:00
data: NewMicrodata(),
2012-06-10 21:57:35 +04:00
base: base,
2012-06-07 03:49:06 +04:00
}
}
// Parse the document and return a Microdata set
func (p *Parser) Parse() (*Microdata, error) {
tree, err := html.Parse(p.r)
if err != nil {
return nil, err
}
2012-06-07 03:49:06 +04:00
2013-07-10 19:59:28 +04:00
topLevelItemNodes := make([]*html.Node, 0)
p.identifiedNodes = make(map[string]*html.Node, 0)
2012-06-07 03:49:06 +04:00
walk(tree, func(n *html.Node) {
if n.Type == html.ElementNode {
if _, exists := getAttr("itemscope", n); exists {
if _, exists := getAttr("itemprop", n); !exists {
topLevelItemNodes = append(topLevelItemNodes, n)
}
2012-06-10 17:18:33 +04:00
}
2012-06-07 03:49:06 +04:00
if id, exists := getAttr("id", n); exists {
p.identifiedNodes[id] = n
}
2012-06-10 17:18:33 +04:00
}
2012-06-10 17:59:30 +04:00
})
2012-06-10 17:18:33 +04:00
for _, node := range topLevelItemNodes {
p.data.Items = append(p.data.Items, p.readItem(nil, node))
}
return p.data, nil
}
func (p *Parser) readItem(item *Item, node *html.Node) *Item {
var parent *Item
if _, exists := getAttr("itemscope", node); exists {
parent, item = item, NewItem()
2012-06-07 18:31:43 +04:00
if itemtypes, exists := getAttr("itemtype", node); exists {
for _, itemtype := range strings.Split(strings.TrimSpace(itemtypes), " ") {
itemtype = strings.TrimSpace(itemtype)
if itemtype != "" {
2012-06-10 21:57:35 +04:00
item.Types = append(item.Types, itemtype)
2012-06-07 18:31:43 +04:00
}
}
2012-06-07 18:41:50 +04:00
// itemid only valid when itemscope and itemtype are both present
2012-06-10 17:59:30 +04:00
if itemid, exists := getAttr("itemid", node); exists {
2018-09-14 14:48:17 +03:00
if parsedURL, err := p.base.Parse(itemid); err == nil {
item.ID = parsedURL.String()
2012-06-10 21:57:35 +04:00
}
2012-06-07 18:41:50 +04:00
}
2012-06-10 17:59:30 +04:00
}
if itemrefs, exists := getAttr("itemref", node); exists {
for _, itemref := range strings.Split(strings.TrimSpace(itemrefs), " ") {
itemref = strings.TrimSpace(itemref)
if refnode, exists := p.identifiedNodes[itemref]; exists {
if refnode != node {
p.readItem(item, refnode)
}
2012-06-10 17:59:30 +04:00
}
2012-06-10 17:18:33 +04:00
}
}
2012-06-07 03:49:06 +04:00
}
2012-06-07 18:31:43 +04:00
if itemprop, exists := getAttr("itemprop", node); exists {
if parent != nil {
// an itemprop on an itemscope has value of the item created by the itemscope
2012-06-10 19:22:51 +04:00
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
propertyName = strings.TrimSpace(propertyName)
if propertyName != "" {
parent.AddItem(propertyName, item)
2012-06-10 19:22:51 +04:00
}
2012-06-10 17:59:30 +04:00
}
} else {
var propertyValue string
2012-06-07 03:49:06 +04:00
switch node.DataAtom {
case atom.Meta:
if val, exists := getAttr("content", node); exists {
propertyValue = val
2012-06-10 19:22:51 +04:00
}
case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video:
if urlValue, exists := getAttr("src", node); exists {
if parsedURL, err := p.base.Parse(urlValue); err == nil {
propertyValue = parsedURL.String()
}
2012-06-10 17:59:30 +04:00
}
case atom.A, atom.Area, atom.Link:
if urlValue, exists := getAttr("href", node); exists {
if parsedURL, err := p.base.Parse(urlValue); err == nil {
propertyValue = parsedURL.String()
}
}
case atom.Object:
if urlValue, exists := getAttr("data", node); exists {
propertyValue = urlValue
}
case atom.Data, atom.Meter:
if urlValue, exists := getAttr("value", node); exists {
propertyValue = urlValue
}
case atom.Time:
if urlValue, exists := getAttr("datetime", node); exists {
propertyValue = urlValue
}
2012-06-10 19:22:51 +04:00
default:
var text bytes.Buffer
walk(node, func(n *html.Node) {
if n.Type == html.TextNode {
text.WriteString(n.Data)
}
2012-06-07 03:49:06 +04:00
})
propertyValue = text.String()
}
if len(propertyValue) > 0 {
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
propertyName = strings.TrimSpace(propertyName)
if propertyName != "" {
item.AddString(propertyName, propertyValue)
}
2012-06-10 19:22:51 +04:00
}
2012-06-07 18:31:43 +04:00
}
}
2012-06-07 03:49:06 +04:00
}
2013-07-10 19:59:28 +04:00
for child := node.FirstChild; child != nil; {
p.readItem(item, child)
2013-07-10 19:59:28 +04:00
child = child.NextSibling
2012-06-10 17:59:30 +04:00
}
2012-06-10 19:22:51 +04:00
return item
2012-06-07 03:49:06 +04:00
}
2013-07-10 19:59:28 +04:00
func getAttr(name string, node *html.Node) (string, bool) {
2012-06-07 03:49:06 +04:00
for _, a := range node.Attr {
2013-07-10 19:59:28 +04:00
if a.Key == name {
return a.Val, true
2012-06-07 03:49:06 +04:00
}
}
return "", false
}
// walk calls fn on parent and then, recursively, on each of its children in
// sibling order, so every node is visited before its descendants. A nil
// parent is ignored.
func walk(parent *html.Node, fn func(n *html.Node)) {
	if parent != nil {
		fn(parent)
		for child := parent.FirstChild; child != nil; child = child.NextSibling {
			walk(child, fn)
		}
	}
}