forked from ukamnya/microdata_mirror
ensure itemscope always starts a new item
parent
a5d3d8ae37
commit
ca93c08d53
148
microdata.go
148
microdata.go
|
@ -121,8 +121,18 @@ func (p *Parser) Parse() (*Microdata, error) {
|
||||||
})
|
})
|
||||||
|
|
||||||
for _, node := range topLevelItemNodes {
|
for _, node := range topLevelItemNodes {
|
||||||
item := NewItem()
|
p.data.Items = append(p.data.Items, p.readItem(nil, node))
|
||||||
p.data.Items = append(p.data.Items, item)
|
}
|
||||||
|
|
||||||
|
return p.data, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *Parser) readItem(item *Item, node *html.Node) *Item {
|
||||||
|
var parent *Item
|
||||||
|
|
||||||
|
if _, exists := getAttr("itemscope", node); exists {
|
||||||
|
parent, item = item, NewItem()
|
||||||
|
|
||||||
if itemtypes, exists := getAttr("itemtype", node); exists {
|
if itemtypes, exists := getAttr("itemtype", node); exists {
|
||||||
for _, itemtype := range strings.Split(strings.TrimSpace(itemtypes), " ") {
|
for _, itemtype := range strings.Split(strings.TrimSpace(itemtypes), " ") {
|
||||||
itemtype = strings.TrimSpace(itemtype)
|
itemtype = strings.TrimSpace(itemtype)
|
||||||
|
@ -143,105 +153,76 @@ func (p *Parser) Parse() (*Microdata, error) {
|
||||||
itemref = strings.TrimSpace(itemref)
|
itemref = strings.TrimSpace(itemref)
|
||||||
|
|
||||||
if refnode, exists := p.identifiedNodes[itemref]; exists {
|
if refnode, exists := p.identifiedNodes[itemref]; exists {
|
||||||
p.readItem(item, refnode)
|
if refnode != node {
|
||||||
}
|
p.readItem(item, refnode)
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for child := node.FirstChild; child != nil; {
|
|
||||||
p.readItem(item, child)
|
|
||||||
child = child.NextSibling
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return p.data, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (p *Parser) readItem(item *Item, node *html.Node) {
|
|
||||||
if itemprop, exists := getAttr("itemprop", node); exists {
|
|
||||||
if _, exists := getAttr("itemscope", node); exists {
|
|
||||||
subitem := NewItem()
|
|
||||||
|
|
||||||
if itemrefs, exists := getAttr("itemref", node); exists {
|
|
||||||
for _, itemref := range strings.Split(strings.TrimSpace(itemrefs), " ") {
|
|
||||||
itemref = strings.TrimSpace(itemref)
|
|
||||||
|
|
||||||
if refnode, exists := p.identifiedNodes[itemref]; exists {
|
|
||||||
if refnode != node {
|
|
||||||
p.readItem(subitem, refnode)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for child := node.FirstChild; child != nil; {
|
if itemprop, exists := getAttr("itemprop", node); exists {
|
||||||
p.readItem(subitem, child)
|
if parent != nil {
|
||||||
child = child.NextSibling
|
// an itemprop on an itemscope has value of the item created by the itemscope
|
||||||
}
|
|
||||||
|
|
||||||
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
|
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
|
||||||
propertyName = strings.TrimSpace(propertyName)
|
propertyName = strings.TrimSpace(propertyName)
|
||||||
if propertyName != "" {
|
if propertyName != "" {
|
||||||
item.AddItem(propertyName, subitem)
|
parent.AddItem(propertyName, item)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
var propertyValue string
|
||||||
|
|
||||||
return
|
switch node.DataAtom {
|
||||||
|
case atom.Meta:
|
||||||
}
|
if val, exists := getAttr("content", node); exists {
|
||||||
|
propertyValue = val
|
||||||
var propertyValue string
|
|
||||||
|
|
||||||
switch node.DataAtom {
|
|
||||||
case atom.Meta:
|
|
||||||
if val, exists := getAttr("content", node); exists {
|
|
||||||
propertyValue = val
|
|
||||||
}
|
|
||||||
case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video:
|
|
||||||
if urlValue, exists := getAttr("src", node); exists {
|
|
||||||
if parsedURL, err := p.base.Parse(urlValue); err == nil {
|
|
||||||
propertyValue = parsedURL.String()
|
|
||||||
}
|
}
|
||||||
}
|
case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video:
|
||||||
case atom.A, atom.Area, atom.Link:
|
if urlValue, exists := getAttr("src", node); exists {
|
||||||
if urlValue, exists := getAttr("href", node); exists {
|
if parsedURL, err := p.base.Parse(urlValue); err == nil {
|
||||||
if parsedURL, err := p.base.Parse(urlValue); err == nil {
|
propertyValue = parsedURL.String()
|
||||||
propertyValue = parsedURL.String()
|
}
|
||||||
}
|
}
|
||||||
}
|
case atom.A, atom.Area, atom.Link:
|
||||||
case atom.Object:
|
if urlValue, exists := getAttr("href", node); exists {
|
||||||
if urlValue, exists := getAttr("data", node); exists {
|
if parsedURL, err := p.base.Parse(urlValue); err == nil {
|
||||||
propertyValue = urlValue
|
propertyValue = parsedURL.String()
|
||||||
}
|
}
|
||||||
case atom.Data, atom.Meter:
|
}
|
||||||
if urlValue, exists := getAttr("value", node); exists {
|
case atom.Object:
|
||||||
propertyValue = urlValue
|
if urlValue, exists := getAttr("data", node); exists {
|
||||||
}
|
propertyValue = urlValue
|
||||||
case atom.Time:
|
}
|
||||||
if urlValue, exists := getAttr("datetime", node); exists {
|
case atom.Data, atom.Meter:
|
||||||
propertyValue = urlValue
|
if urlValue, exists := getAttr("value", node); exists {
|
||||||
}
|
propertyValue = urlValue
|
||||||
|
}
|
||||||
default:
|
case atom.Time:
|
||||||
var text bytes.Buffer
|
if urlValue, exists := getAttr("datetime", node); exists {
|
||||||
walk(node, func(n *html.Node) {
|
propertyValue = urlValue
|
||||||
if n.Type == html.TextNode {
|
|
||||||
text.WriteString(n.Data)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
})
|
default:
|
||||||
propertyValue = text.String()
|
var text bytes.Buffer
|
||||||
}
|
walk(node, func(n *html.Node) {
|
||||||
|
if n.Type == html.TextNode {
|
||||||
|
text.WriteString(n.Data)
|
||||||
|
}
|
||||||
|
|
||||||
if len(propertyValue) > 0 {
|
})
|
||||||
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
|
propertyValue = text.String()
|
||||||
propertyName = strings.TrimSpace(propertyName)
|
}
|
||||||
if propertyName != "" {
|
|
||||||
item.AddString(propertyName, propertyValue)
|
if len(propertyValue) > 0 {
|
||||||
|
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
|
||||||
|
propertyName = strings.TrimSpace(propertyName)
|
||||||
|
if propertyName != "" {
|
||||||
|
item.AddString(propertyName, propertyValue)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for child := node.FirstChild; child != nil; {
|
for child := node.FirstChild; child != nil; {
|
||||||
|
@ -249,6 +230,7 @@ func (p *Parser) readItem(item *Item, node *html.Node) {
|
||||||
child = child.NextSibling
|
child = child.NextSibling
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return item
|
||||||
}
|
}
|
||||||
|
|
||||||
func getAttr(name string, node *html.Node) (string, bool) {
|
func getAttr(name string, node *html.Node) (string, bool) {
|
||||||
|
|
|
@ -583,8 +583,9 @@ func TestSkipSelfReferencingItemref(t *testing.T) {
|
||||||
actual := ParseData(html, t)
|
actual := ParseData(html, t)
|
||||||
|
|
||||||
child := NewItem()
|
child := NewItem()
|
||||||
child.AddString("title", "Foo")
|
child.AddType("http://data-vocabulary.org/Breadcrumb")
|
||||||
child.AddString("url", "http://example.com/foo/bar")
|
child.AddString("url", "http://example.com/foo/bar")
|
||||||
|
child.AddString("title", "Foo")
|
||||||
|
|
||||||
item := NewItem()
|
item := NewItem()
|
||||||
item.AddType("http://schema.org/WebPage")
|
item.AddType("http://schema.org/WebPage")
|
||||||
|
@ -603,18 +604,16 @@ func TestSkipSelfReferencingItemref(t *testing.T) {
|
||||||
// of its container item.
|
// of its container item.
|
||||||
func TestPropertiesInContainedItem(t *testing.T) {
|
func TestPropertiesInContainedItem(t *testing.T) {
|
||||||
html := `
|
html := `
|
||||||
<body itemscope itemtype="http://schema.org/WebPage">
|
<body itemscope itemtype="http://schema.org/WebPage">
|
||||||
<meta itemprop="foo" content="foo value">
|
<meta itemprop="foo" content="foo value">
|
||||||
|
|
||||||
<div itemscope itemtype="http://schema.org/Person">
|
<div itemscope itemtype="http://schema.org/Person">
|
||||||
<meta itemprop="bar" content="bar value">
|
<meta itemprop="bar" content="bar value">
|
||||||
</div>
|
</div>
|
||||||
|
<div itemscope itemtype="http://schema.org/Person" itemprop="author">
|
||||||
<div itemscope itemtype="http://schema.org/Person" itemprop="author">
|
<meta itemprop="baz" content="baz value">
|
||||||
<meta itemprop="baz" content="baz value">
|
</div>
|
||||||
</div>
|
</body>`
|
||||||
|
|
||||||
</body>`
|
|
||||||
|
|
||||||
actual := ParseData(html, t)
|
actual := ParseData(html, t)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue