diff --git a/microdata.go b/microdata.go index 3130fb3..329f019 100644 --- a/microdata.go +++ b/microdata.go @@ -121,8 +121,18 @@ func (p *Parser) Parse() (*Microdata, error) { }) for _, node := range topLevelItemNodes { - item := NewItem() - p.data.Items = append(p.data.Items, item) + p.data.Items = append(p.data.Items, p.readItem(nil, node)) + } + + return p.data, nil +} + +func (p *Parser) readItem(item *Item, node *html.Node) *Item { + var parent *Item + + if _, exists := getAttr("itemscope", node); exists { + parent, item = item, NewItem() + if itemtypes, exists := getAttr("itemtype", node); exists { for _, itemtype := range strings.Split(strings.TrimSpace(itemtypes), " ") { itemtype = strings.TrimSpace(itemtype) @@ -143,105 +153,76 @@ func (p *Parser) Parse() (*Microdata, error) { itemref = strings.TrimSpace(itemref) if refnode, exists := p.identifiedNodes[itemref]; exists { - p.readItem(item, refnode) - } - } - } - - for child := node.FirstChild; child != nil; { - p.readItem(item, child) - child = child.NextSibling - } - } - - return p.data, nil -} - -func (p *Parser) readItem(item *Item, node *html.Node) { - if itemprop, exists := getAttr("itemprop", node); exists { - if _, exists := getAttr("itemscope", node); exists { - subitem := NewItem() - - if itemrefs, exists := getAttr("itemref", node); exists { - for _, itemref := range strings.Split(strings.TrimSpace(itemrefs), " ") { - itemref = strings.TrimSpace(itemref) - - if refnode, exists := p.identifiedNodes[itemref]; exists { - if refnode != node { - p.readItem(subitem, refnode) - } + if refnode != node { + p.readItem(item, refnode) } } } + } + } - for child := node.FirstChild; child != nil; { - p.readItem(subitem, child) - child = child.NextSibling - } - + if itemprop, exists := getAttr("itemprop", node); exists { + if parent != nil { + // an itemprop on an itemscope has value of the item created by the itemscope for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") { propertyName = strings.TrimSpace(propertyName) if propertyName != "" { - item.AddItem(propertyName, subitem) + parent.AddItem(propertyName, item) } } + } else { + var propertyValue string - return - - } - - var propertyValue string - - switch node.DataAtom { - case atom.Meta: - if val, exists := getAttr("content", node); exists { - propertyValue = val - } - case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video: - if urlValue, exists := getAttr("src", node); exists { - if parsedURL, err := p.base.Parse(urlValue); err == nil { - propertyValue = parsedURL.String() + switch node.DataAtom { + case atom.Meta: + if val, exists := getAttr("content", node); exists { + propertyValue = val } - } - case atom.A, atom.Area, atom.Link: - if urlValue, exists := getAttr("href", node); exists { - if parsedURL, err := p.base.Parse(urlValue); err == nil { - propertyValue = parsedURL.String() + case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video: + if urlValue, exists := getAttr("src", node); exists { + if parsedURL, err := p.base.Parse(urlValue); err == nil { + propertyValue = parsedURL.String() + } } - } - case atom.Object: - if urlValue, exists := getAttr("data", node); exists { - propertyValue = urlValue - } - case atom.Data, atom.Meter: - if urlValue, exists := getAttr("value", node); exists { - propertyValue = urlValue - } - case atom.Time: - if urlValue, exists := getAttr("datetime", node); exists { - propertyValue = urlValue - } - - default: - var text bytes.Buffer - walk(node, func(n *html.Node) { - if n.Type == html.TextNode { - text.WriteString(n.Data) + case atom.A, atom.Area, atom.Link: + if urlValue, exists := getAttr("href", node); exists { + if parsedURL, err := p.base.Parse(urlValue); err == nil { + propertyValue = parsedURL.String() + } + } + case atom.Object: + if urlValue, exists := getAttr("data", node); exists { + propertyValue = urlValue + } + case atom.Data, atom.Meter: + if urlValue, exists := getAttr("value", node); exists { + propertyValue = urlValue + } + case atom.Time: + if urlValue, exists := getAttr("datetime", node); exists { + propertyValue = urlValue } - }) - propertyValue = text.String() - } + default: + var text bytes.Buffer + walk(node, func(n *html.Node) { + if n.Type == html.TextNode { + text.WriteString(n.Data) + } - if len(propertyValue) > 0 { - for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") { - propertyName = strings.TrimSpace(propertyName) - if propertyName != "" { - item.AddString(propertyName, propertyValue) + }) + propertyValue = text.String() + } + + if len(propertyValue) > 0 { + for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") { + propertyName = strings.TrimSpace(propertyName) + if propertyName != "" { + item.AddString(propertyName, propertyValue) + } } } } - } for child := node.FirstChild; child != nil; { @@ -249,6 +230,7 @@ func (p *Parser) readItem(item *Item, node *html.Node) { child = child.NextSibling } + return item } func getAttr(name string, node *html.Node) (string, bool) { diff --git a/microdata_test.go b/microdata_test.go index 63b167e..4a2e288 100644 --- a/microdata_test.go +++ b/microdata_test.go @@ -583,8 +583,9 @@ func TestSkipSelfReferencingItemref(t *testing.T) { actual := ParseData(html, t) child := NewItem() - child.AddString("title", "Foo") + child.AddType("http://data-vocabulary.org/Breadcrumb") child.AddString("url", "http://example.com/foo/bar") + child.AddString("title", "Foo") item := NewItem() item.AddType("http://schema.org/WebPage") @@ -597,3 +598,69 @@ func TestSkipSelfReferencingItemref(t *testing.T) { t.Errorf("Expecting %v but got %v", expected, actual) } } + +// This test validates that properties within an itemscope'd element remain +// with that contained item even if it is not, via itemprop, made an explicit child +// of its container item. +func TestPropertiesInContainedItem(t *testing.T) { + html := ` + + + +
+ +
+ + ` + + actual := ParseData(html, t) + + // There should be two top-level items: WebPage and the Person that isn't an author. + if len(actual.Items) != 2 { + t.Fatalf("expected 2 top-level items, got %d", len(actual.Items)) + } + + outer := actual.Items[0] + inner := actual.Items[1] + + // The first item should be a WebPage. The properties in its contained items should + // not be properties of the containing item. + if len(outer.Types) != 1 || outer.Types[0] != "http://schema.org/WebPage" { + t.Fatalf("expected outer to be http://schema.org/WebPage, got %v", outer.Types) + } + if _, present := outer.Properties["bar"]; present { + t.Errorf("outer should not have a 'bar' property, got %v", outer.Properties["bar"]) + } + if _, present := outer.Properties["baz"]; present { + t.Errorf("outer should not have a 'baz' property, got %v", outer.Properties["baz"]) + } + + // The second item should be the non-author child element of outer. Since there is + // no itemprop attribute, it's not a child *item* of outer but it should stand as its + // own item. + if len(inner.Types) != 1 || inner.Types[0] != "http://schema.org/Person" { + t.Fatalf("expected inner to be http://schema.org/Person, got %v", inner.Types) + } + if _, present := inner.Properties["bar"]; !present { + t.Errorf("inner should have a 'bar' property") + } + + // The third item is the author, which should be a child item (via the 'author' itemprop) + // of outer. It, too, should have its own discrete type and property. + if list := outer.Properties["author"]; len(list) == 1 { + if author, ok := list[0].(*Item); ok { + if len(author.Types) != 1 || author.Types[0] != "http://schema.org/Person" { + t.Fatalf("expected author to be http://schema.org/Person, got %v", author.Types) + } + if _, present := author.Properties["baz"]; !present { + t.Errorf("inner should have a 'baz' property") + } + } else { + t.Errorf("expected author item, got %v", list) + } + } else { + t.Errorf("expected outer to have a child of author, got %v", outer.Properties["author"]) + } +}