ensure itemscope always starts a new item

pull/6/head
David M. Nesting 2020-10-07 16:30:44 -04:00
parent a5d3d8ae37
commit ca93c08d53
2 changed files with 76 additions and 95 deletions

View File

@ -121,8 +121,18 @@ func (p *Parser) Parse() (*Microdata, error) {
}) })
for _, node := range topLevelItemNodes { for _, node := range topLevelItemNodes {
item := NewItem() p.data.Items = append(p.data.Items, p.readItem(nil, node))
p.data.Items = append(p.data.Items, item) }
return p.data, nil
}
func (p *Parser) readItem(item *Item, node *html.Node) *Item {
var parent *Item
if _, exists := getAttr("itemscope", node); exists {
parent, item = item, NewItem()
if itemtypes, exists := getAttr("itemtype", node); exists { if itemtypes, exists := getAttr("itemtype", node); exists {
for _, itemtype := range strings.Split(strings.TrimSpace(itemtypes), " ") { for _, itemtype := range strings.Split(strings.TrimSpace(itemtypes), " ") {
itemtype = strings.TrimSpace(itemtype) itemtype = strings.TrimSpace(itemtype)
@ -143,105 +153,76 @@ func (p *Parser) Parse() (*Microdata, error) {
itemref = strings.TrimSpace(itemref) itemref = strings.TrimSpace(itemref)
if refnode, exists := p.identifiedNodes[itemref]; exists { if refnode, exists := p.identifiedNodes[itemref]; exists {
p.readItem(item, refnode) if refnode != node {
} p.readItem(item, refnode)
}
}
for child := node.FirstChild; child != nil; {
p.readItem(item, child)
child = child.NextSibling
}
}
return p.data, nil
}
func (p *Parser) readItem(item *Item, node *html.Node) {
if itemprop, exists := getAttr("itemprop", node); exists {
if _, exists := getAttr("itemscope", node); exists {
subitem := NewItem()
if itemrefs, exists := getAttr("itemref", node); exists {
for _, itemref := range strings.Split(strings.TrimSpace(itemrefs), " ") {
itemref = strings.TrimSpace(itemref)
if refnode, exists := p.identifiedNodes[itemref]; exists {
if refnode != node {
p.readItem(subitem, refnode)
}
} }
} }
} }
}
}
for child := node.FirstChild; child != nil; { if itemprop, exists := getAttr("itemprop", node); exists {
p.readItem(subitem, child) if parent != nil {
child = child.NextSibling // an itemprop on an itemscope has value of the item created by the itemscope
}
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") { for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
propertyName = strings.TrimSpace(propertyName) propertyName = strings.TrimSpace(propertyName)
if propertyName != "" { if propertyName != "" {
item.AddItem(propertyName, subitem) parent.AddItem(propertyName, item)
} }
} }
} else {
var propertyValue string
return switch node.DataAtom {
case atom.Meta:
} if val, exists := getAttr("content", node); exists {
propertyValue = val
var propertyValue string
switch node.DataAtom {
case atom.Meta:
if val, exists := getAttr("content", node); exists {
propertyValue = val
}
case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video:
if urlValue, exists := getAttr("src", node); exists {
if parsedURL, err := p.base.Parse(urlValue); err == nil {
propertyValue = parsedURL.String()
} }
} case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video:
case atom.A, atom.Area, atom.Link: if urlValue, exists := getAttr("src", node); exists {
if urlValue, exists := getAttr("href", node); exists { if parsedURL, err := p.base.Parse(urlValue); err == nil {
if parsedURL, err := p.base.Parse(urlValue); err == nil { propertyValue = parsedURL.String()
propertyValue = parsedURL.String() }
} }
} case atom.A, atom.Area, atom.Link:
case atom.Object: if urlValue, exists := getAttr("href", node); exists {
if urlValue, exists := getAttr("data", node); exists { if parsedURL, err := p.base.Parse(urlValue); err == nil {
propertyValue = urlValue propertyValue = parsedURL.String()
} }
case atom.Data, atom.Meter: }
if urlValue, exists := getAttr("value", node); exists { case atom.Object:
propertyValue = urlValue if urlValue, exists := getAttr("data", node); exists {
} propertyValue = urlValue
case atom.Time: }
if urlValue, exists := getAttr("datetime", node); exists { case atom.Data, atom.Meter:
propertyValue = urlValue if urlValue, exists := getAttr("value", node); exists {
} propertyValue = urlValue
}
default: case atom.Time:
var text bytes.Buffer if urlValue, exists := getAttr("datetime", node); exists {
walk(node, func(n *html.Node) { propertyValue = urlValue
if n.Type == html.TextNode {
text.WriteString(n.Data)
} }
}) default:
propertyValue = text.String() var text bytes.Buffer
} walk(node, func(n *html.Node) {
if n.Type == html.TextNode {
text.WriteString(n.Data)
}
if len(propertyValue) > 0 { })
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") { propertyValue = text.String()
propertyName = strings.TrimSpace(propertyName) }
if propertyName != "" {
item.AddString(propertyName, propertyValue) if len(propertyValue) > 0 {
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
propertyName = strings.TrimSpace(propertyName)
if propertyName != "" {
item.AddString(propertyName, propertyValue)
}
} }
} }
} }
} }
for child := node.FirstChild; child != nil; { for child := node.FirstChild; child != nil; {
@ -249,6 +230,7 @@ func (p *Parser) readItem(item *Item, node *html.Node) {
child = child.NextSibling child = child.NextSibling
} }
return item
} }
func getAttr(name string, node *html.Node) (string, bool) { func getAttr(name string, node *html.Node) (string, bool) {

View File

@ -583,8 +583,9 @@ func TestSkipSelfReferencingItemref(t *testing.T) {
actual := ParseData(html, t) actual := ParseData(html, t)
child := NewItem() child := NewItem()
child.AddString("title", "Foo") child.AddType("http://data-vocabulary.org/Breadcrumb")
child.AddString("url", "http://example.com/foo/bar") child.AddString("url", "http://example.com/foo/bar")
child.AddString("title", "Foo")
item := NewItem() item := NewItem()
item.AddType("http://schema.org/WebPage") item.AddType("http://schema.org/WebPage")
@ -603,18 +604,16 @@ func TestSkipSelfReferencingItemref(t *testing.T) {
// of its container item. // of its container item.
func TestPropertiesInContainedItem(t *testing.T) { func TestPropertiesInContainedItem(t *testing.T) {
html := ` html := `
<body itemscope itemtype="http://schema.org/WebPage"> <body itemscope itemtype="http://schema.org/WebPage">
<meta itemprop="foo" content="foo value"> <meta itemprop="foo" content="foo value">
<div itemscope itemtype="http://schema.org/Person"> <div itemscope itemtype="http://schema.org/Person">
<meta itemprop="bar" content="bar value"> <meta itemprop="bar" content="bar value">
</div> </div>
<div itemscope itemtype="http://schema.org/Person" itemprop="author">
<div itemscope itemtype="http://schema.org/Person" itemprop="author"> <meta itemprop="baz" content="baz value">
<meta itemprop="baz" content="baz value"> </div>
</div> </body>`
</body>`
actual := ParseData(html, t) actual := ParseData(html, t)