diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..775e28a --- /dev/null +++ b/AUTHORS @@ -0,0 +1 @@ +* Ian Davis \ No newline at end of file diff --git a/CREDITS b/CREDITS new file mode 100644 index 0000000..e69de29 diff --git a/README.md b/README.md index 685ce05..1037ec4 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,18 @@ -microdata - a microdata parser in Go +# microdata +A microdata parser in Go See [http://www.w3.org/TR/microdata/](http://www.w3.org/TR/microdata/) for more information about Microdata -INSTALLATION -============ +## Installation Simply run go get github.com/iand/microdata -Documentation is at [http://go.pkgdoc.org/github.com/iand/microdata](http://go.pkgdoc.org/github.com/iand/microdata) +Documentation is at [http://godoc.org/github.com/iand/microdata](http://godoc.org/github.com/iand/microdata) -USAGE -===== +## Usage Example of parsing a string containing HTML: @@ -72,10 +71,30 @@ Extract microdata from a webpage and print the result as JSON } -LICENSE -======= -This code and associated documentation is in the public domain. +## Authors -To the extent possible under law, Ian Davis has waived all copyright -and related or neighboring rights to this file. This work is published -from the United Kingdom. +* [Ian Davis](http://github.com/iand) - + + +## Contributors + + +## Contributing + +* Do submit your changes as a pull request +* Do your best to adhere to the existing coding conventions and idioms. +* Do run `go fmt` on the code before committing +* Do feel free to add yourself to the [`CREDITS`](CREDITS) file and the + corresponding Contributors list in the [`README.md`](README.md). + Alphabetical order applies. +* Don't touch the [`AUTHORS`](AUTHORS) file. An existing author will add you if + your contributions are significant enough. 
+* Do note that in order for any non-trivial changes to be merged (as a rule + of thumb, additions larger than about 15 lines of code), an explicit + Public Domain Dedication needs to be on record from you. Please include + a copy of the statement found in the [`WAIVER`](WAIVER) file with your pull request. + +## License + +This is free and unencumbered software released into the public domain. For more +information, see <http://unlicense.org/> or the accompanying [`UNLICENSE`](UNLICENSE) file. diff --git a/UNLICENSE b/UNLICENSE new file mode 100644 index 0000000..00d2e13 --- /dev/null +++ b/UNLICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. 
+ +For more information, please refer to <http://unlicense.org/> \ No newline at end of file diff --git a/WAIVER b/WAIVER new file mode 100644 index 0000000..d6e90f1 --- /dev/null +++ b/WAIVER @@ -0,0 +1,5 @@ +I dedicate any and all copyright interest in this software to the +public domain. I make this dedication for the benefit of the public at +large and to the detriment of my heirs and successors. I intend this +dedication to be an overt act of relinquishment in perpetuity of all +present and future rights to this software under copyright law. \ No newline at end of file diff --git a/microdata.go b/microdata.go index 02acf03..2f8f573 100644 --- a/microdata.go +++ b/microdata.go @@ -1,7 +1,6 @@ /* - To the extent possible under law, Ian Davis has waived all copyright - and related or neighboring rights to this Source Code file. - This work is published from the United Kingdom. + This is free and unencumbered software released into the public domain. For more + information, see <http://unlicense.org/> or the accompanying UNLICENSE file. 
*/ // A package for parsing microdata @@ -11,6 +10,8 @@ package microdata import ( "bytes" "code.google.com/p/go-html-transform/h5" + "code.google.com/p/go.net/html" + "code.google.com/p/go.net/html/atom" "encoding/json" "io" "net/url" @@ -78,18 +79,20 @@ func (self *Microdata) Json() ([]byte, error) { // An HTML parser that extracts microdata type Parser struct { - p *h5.Parser + p *h5.Tree data *Microdata base *url.URL - identifiedNodes map[string]*h5.Node + identifiedNodes map[string]*html.Node } // Create a new parser for extracting microdata // r is a reader over an HTML document // base is the base URL for resolving relative URLs func NewParser(r io.Reader, base *url.URL) *Parser { + p, _ := h5.New(r) + return &Parser{ - p: h5.NewParser(r), + p: p, data: NewMicrodata(), base: base, } @@ -97,16 +100,12 @@ func NewParser(r io.Reader, base *url.URL) *Parser { // Parse the document and return a Microdata set func (self *Parser) Parse() (*Microdata, error) { - err := self.p.Parse() - if err != nil { - return nil, err - } - tree := self.p.Tree() + tree := self.p - topLevelItemNodes := make([]*h5.Node, 0) - self.identifiedNodes = make(map[string]*h5.Node, 0) + topLevelItemNodes := make([]*html.Node, 0) + self.identifiedNodes = make(map[string]*html.Node, 0) - tree.Walk(func(n *h5.Node) { + tree.Walk(func(n *html.Node) { if _, exists := getAttr("itemscope", n); exists { if _, exists := getAttr("itemprop", n); !exists { topLevelItemNodes = append(topLevelItemNodes, n) @@ -147,17 +146,16 @@ func (self *Parser) Parse() (*Microdata, error) { } } - if len(node.Children) > 0 { - for _, child := range node.Children { - self.readItem(item, child) - } + for child := node.FirstChild; child != nil; { + self.readItem(item, child) + child = child.NextSibling } } return self.data, nil } -func (self *Parser) readItem(item *Item, node *h5.Node) { +func (self *Parser) readItem(item *Item, node *html.Node) { if itemprop, exists := getAttr("itemprop", node); exists { if _, exists := 
getAttr("itemscope", node); exists { subitem := NewItem() @@ -172,10 +170,9 @@ func (self *Parser) readItem(item *Item, node *h5.Node) { } } - if len(node.Children) > 0 { - for _, child := range node.Children { - self.readItem(subitem, child) - } + for child := node.FirstChild; child != nil; { + self.readItem(subitem, child) + child = child.NextSibling } for _, propertyName := range strings.Split(strings.TrimSpace(itemprop), " ") { @@ -190,35 +187,42 @@ func (self *Parser) readItem(item *Item, node *h5.Node) { } else { var propertyValue string - switch node.Data() { - - case "img", "audio", "source", "video", "embed", "iframe", "track": + switch node.DataAtom { + case atom.Meta: + if val, exists := getAttr("content", node); exists { + propertyValue = val + } + case atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Source, atom.Track, atom.Video: if urlValue, exists := getAttr("src", node); exists { if parsedUrl, err := self.base.Parse(urlValue); err == nil { propertyValue = parsedUrl.String() } } - case "a", "area", "link": + case atom.A, atom.Area, atom.Link: if urlValue, exists := getAttr("href", node); exists { if parsedUrl, err := self.base.Parse(urlValue); err == nil { propertyValue = parsedUrl.String() } } - case "data": + case atom.Object: + if urlValue, exists := getAttr("data", node); exists { + propertyValue = urlValue + } + case atom.Data, atom.Meter: if urlValue, exists := getAttr("value", node); exists { propertyValue = urlValue } - case "time": + case atom.Time: if urlValue, exists := getAttr("datetime", node); exists { propertyValue = urlValue } default: var text bytes.Buffer - node.Walk(func(n *h5.Node) { - if n.Type == h5.TextNode { - text.WriteString(n.Data()) + h5.WalkNodes(node, func(n *html.Node) { + if n.Type == html.TextNode { + text.WriteString(n.Data) } }) @@ -238,18 +242,17 @@ func (self *Parser) readItem(item *Item, node *h5.Node) { } - if len(node.Children) > 0 { - for _, child := range node.Children { - self.readItem(item, child) - } 
+ for child := node.FirstChild; child != nil; { + self.readItem(item, child) + child = child.NextSibling } } -func getAttr(name string, node *h5.Node) (string, bool) { +func getAttr(name string, node *html.Node) (string, bool) { for _, a := range node.Attr { - if a.Name == name { - return a.Value, true + if a.Key == name { + return a.Val, true } } return "", false diff --git a/microdata_test.go b/microdata_test.go index 0f90b88..f24572f 100644 --- a/microdata_test.go +++ b/microdata_test.go @@ -1,7 +1,6 @@ /* - To the extent possible under law, Ian Davis has waived all copyright - and related or neighboring rights to this Source Code file. - This work is published from the United Kingdom. + This is free and unencumbered software released into the public domain. For more + information, see <http://unlicense.org/> or the accompanying UNLICENSE file. */ package microdata