Merge pull request #6 from dnesting/master

Ensure itemscope always starts a new item
pull/7/head
Ian Davis 2020-10-11 17:59:04 +01:00 committed by GitHub
commit f9a084323d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 133 additions and 84 deletions

View File

@ -121,8 +121,18 @@ func (p *Parser) Parse() (*Microdata, error) {
})
for _, node := range topLevelItemNodes {
item := NewItem()
p.data.Items = append(p.data.Items, item)
p.data.Items = append(p.data.Items, p.readItem(nil, node))
}
return p.data, nil
}
func (p *Parser) readItem(item *Item, node *html.Node) *Item {
var parent *Item
if _, exists := getAttr("itemscope", node); exists {
parent, item = item, NewItem()
if itemtypes, exists := getAttr("itemtype", node); exists {
for _, itemtype := range strings.Split(strings.TrimSpace(itemtypes), " ") {
itemtype = strings.TrimSpace(itemtype)
@ -143,105 +153,76 @@ func (p *Parser) Parse() (*Microdata, error) {
itemref = strings.TrimSpace(itemref)
if refnode, exists := p.identifiedNodes[itemref]; exists {
p.readItem(item, refnode)
}
}
}
for child := node.FirstChild; child != nil; {
p.readItem(item, child)
child = child.NextSibling
}
}
return p.data, nil
}
func (p *Parser) readItem(item *Item, node *html.Node) {
if itemprop, exists := getAttr("itemprop", node); exists {
if _, exists := getAttr("itemscope", node); exists {
subitem := NewItem()
if itemrefs, exists := getAttr("itemref", node); exists {
for _, itemref := range strings.Split(strings.TrimSpace(itemrefs), " ") {
itemref = strings.TrimSpace(itemref)
if refnode, exists := p.identifiedNodes[itemref]; exists {
if refnode != node {
p.readItem(subitem, refnode)
}
if refnode != node {
p.readItem(item, refnode)
}
}
}
}
}
for child := node.FirstChild; child != nil; {
p.readItem(subitem, child)
child = child.NextSibling
}
if itemprop, exists := getAttr("itemprop", node); exists {
if parent != nil {
// an itemprop on an itemscope has value of the item created by the itemscope
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
propertyName = strings.TrimSpace(propertyName)
if propertyName != "" {
item.AddItem(propertyName, subitem)
parent.AddItem(propertyName, item)
}
}
} else {
var propertyValue string
return
}
var propertyValue string
switch node.DataAtom {
case atom.Meta:
if val, exists := getAttr("content", node); exists {
propertyValue = val
}
case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video:
if urlValue, exists := getAttr("src", node); exists {
if parsedURL, err := p.base.Parse(urlValue); err == nil {
propertyValue = parsedURL.String()
switch node.DataAtom {
case atom.Meta:
if val, exists := getAttr("content", node); exists {
propertyValue = val
}
}
case atom.A, atom.Area, atom.Link:
if urlValue, exists := getAttr("href", node); exists {
if parsedURL, err := p.base.Parse(urlValue); err == nil {
propertyValue = parsedURL.String()
case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video:
if urlValue, exists := getAttr("src", node); exists {
if parsedURL, err := p.base.Parse(urlValue); err == nil {
propertyValue = parsedURL.String()
}
}
}
case atom.Object:
if urlValue, exists := getAttr("data", node); exists {
propertyValue = urlValue
}
case atom.Data, atom.Meter:
if urlValue, exists := getAttr("value", node); exists {
propertyValue = urlValue
}
case atom.Time:
if urlValue, exists := getAttr("datetime", node); exists {
propertyValue = urlValue
}
default:
var text bytes.Buffer
walk(node, func(n *html.Node) {
if n.Type == html.TextNode {
text.WriteString(n.Data)
case atom.A, atom.Area, atom.Link:
if urlValue, exists := getAttr("href", node); exists {
if parsedURL, err := p.base.Parse(urlValue); err == nil {
propertyValue = parsedURL.String()
}
}
case atom.Object:
if urlValue, exists := getAttr("data", node); exists {
propertyValue = urlValue
}
case atom.Data, atom.Meter:
if urlValue, exists := getAttr("value", node); exists {
propertyValue = urlValue
}
case atom.Time:
if urlValue, exists := getAttr("datetime", node); exists {
propertyValue = urlValue
}
})
propertyValue = text.String()
}
default:
var text bytes.Buffer
walk(node, func(n *html.Node) {
if n.Type == html.TextNode {
text.WriteString(n.Data)
}
if len(propertyValue) > 0 {
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
propertyName = strings.TrimSpace(propertyName)
if propertyName != "" {
item.AddString(propertyName, propertyValue)
})
propertyValue = text.String()
}
if len(propertyValue) > 0 {
for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") {
propertyName = strings.TrimSpace(propertyName)
if propertyName != "" {
item.AddString(propertyName, propertyValue)
}
}
}
}
}
for child := node.FirstChild; child != nil; {
@ -249,6 +230,7 @@ func (p *Parser) readItem(item *Item, node *html.Node) {
child = child.NextSibling
}
return item
}
func getAttr(name string, node *html.Node) (string, bool) {

View File

@ -583,8 +583,9 @@ func TestSkipSelfReferencingItemref(t *testing.T) {
actual := ParseData(html, t)
child := NewItem()
child.AddString("title", "Foo")
child.AddType("http://data-vocabulary.org/Breadcrumb")
child.AddString("url", "http://example.com/foo/bar")
child.AddString("title", "Foo")
item := NewItem()
item.AddType("http://schema.org/WebPage")
@ -597,3 +598,69 @@ func TestSkipSelfReferencingItemref(t *testing.T) {
t.Errorf("Expecting %v but got %v", expected, actual)
}
}
// This test validates that properties within an itemscope'd element remain
// with that contained item even if it is not, via itemprop, made an explicit child
// of its container item.
func TestPropertiesInContainedItem(t *testing.T) {
html := `
<body itemscope itemtype="http://schema.org/WebPage">
<meta itemprop="foo" content="foo value">
<div itemscope itemtype="http://schema.org/Person">
<meta itemprop="bar" content="bar value">
</div>
<div itemscope itemtype="http://schema.org/Person" itemprop="author">
<meta itemprop="baz" content="baz value">
</div>
</body>`
actual := ParseData(html, t)
// There should be two top-level items: WebPage and the Person that isn't an author.
if len(actual.Items) != 2 {
t.Fatalf("expected 2 top-level items, got %d", len(actual.Items))
}
outer := actual.Items[0]
inner := actual.Items[1]
// The first item should be a WebPage. The properties in its contained items should
// not be properties of the containing item.
if len(outer.Types) != 1 || outer.Types[0] != "http://schema.org/WebPage" {
t.Fatalf("expected outer to be http://schema.org/WebPage, got %v", outer.Types)
}
if _, present := outer.Properties["bar"]; present {
t.Errorf("outer should not have a 'bar' property, got %v", outer.Properties["bar"])
}
if _, present := outer.Properties["baz"]; present {
t.Errorf("outer should not have a 'baz' property, got %v", outer.Properties["baz"])
}
// The second item should be the non-author child element of outer. Since there is
// no itemprop attribute, it's not a child *item* of outer but it should stand as its
// own item.
if len(inner.Types) != 1 || inner.Types[0] != "http://schema.org/Person" {
t.Fatalf("expected inner to be http://schema.org/Person, got %v", inner.Types)
}
if _, present := inner.Properties["bar"]; !present {
t.Errorf("inner should have a 'bar' property")
}
// The third item is the author, which should be a child item (via the 'author' itemprop)
// of outer. It, too, should have its own discrete type and property.
if list := outer.Properties["author"]; len(list) == 1 {
if author, ok := list[0].(*Item); ok {
if len(author.Types) != 1 || author.Types[0] != "http://schema.org/Person" {
t.Fatalf("expected author to be http://schema.org/Person, got %v", author.Types)
}
if _, present := author.Properties["baz"]; !present {
t.Errorf("inner should have a 'baz' property")
}
} else {
t.Errorf("expected author item, got %v", list)
}
} else {
t.Errorf("expected outer to have a child of author, got %v", outer.Properties["author"])
}
}