Skip to content

Commit

Permalink
Atom: implement xml:base relative URI resolution
Browse files Browse the repository at this point in the history
What it does:

Resolve relative URIs in feed element attributes, feed elements which contain
URIs (like author:uri), and HTML element attributes in atom elements of
type "html" or "xhtml" according to the xml:base specification
(https://www.w3.org/TR/xmlbase/)

What it is:

The XMLBase type and its functions live in the internal/shared package
(internal/shared/xmlbase.go), accompanied by a minimal patch to
atom/parser.go.

Tests live in testdata/parser/atom/ and are adapted from the Python feedparser project:

https://github.com/kurtmckee/feedparser/tree/master/feedparser/tests/wellformed/base

How it works:

As each Atom element is parsed, a new xml:base is pushed onto the stack;
the top xml:base URI is used to resolve attributes (golang.org/x/net/html
is used to parse the content of any "html" or "xhtml" element);
the base is then popped from the stack.

The shared.FindRoot() and shared.NextTag() functions have been moved to
methods of XMLBase so that they can manage the xml:base URL stack.
  • Loading branch information
cristoper authored and mmcdole committed Sep 29, 2018
1 parent c9d2a40 commit 9665eb3
Show file tree
Hide file tree
Showing 59 changed files with 752 additions and 76 deletions.
85 changes: 63 additions & 22 deletions atom/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,38 @@ import (
"github.com/mmcdole/goxpp"
)

var (
// uriElements is the set of Atom element names whose text content is a URI.
// The parser looks up the lowercased element name here and, on a match,
// resolves the element text with the parser's XMLBase (ResolveURL).
// https://tools.ietf.org/html/rfc4287
uriElements = map[string]bool{
"icon": true,
"id": true,
"logo": true,
"uri": true,
"url": true, // atom 0.3
}

// atomURIAttrs is the set of Atom attribute names whose values are URIs.
// Parse passes it to shared.XMLBase as its URIAttrs field.
// https://tools.ietf.org/html/rfc4287
atomURIAttrs = map[string]bool{
"href": true,
"scheme": true,
"src": true,
"uri": true,
}
)

// Parser is an Atom Parser
type Parser struct{}
type Parser struct {
base *shared.XMLBase
}

// Parse parses an xml feed into an atom.Feed
func (ap *Parser) Parse(feed io.Reader) (*Feed, error) {
p := xpp.NewXMLPullParser(feed, false, shared.NewReaderLabel)
ap.base = &shared.XMLBase{URIAttrs: atomURIAttrs}

_, err := shared.FindRoot(p)
_, err := ap.base.FindRoot(p)
if err != nil {
return nil, err
}
Expand All @@ -43,7 +67,7 @@ func (ap *Parser) parseRoot(p *xpp.XMLPullParser) (*Feed, error) {
extensions := ext.Extensions{}

for {
tok, err := shared.NextTag(p)
tok, err := ap.base.NextTag(p)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -197,7 +221,7 @@ func (ap *Parser) parseEntry(p *xpp.XMLPullParser) (*Entry, error) {
extensions := ext.Extensions{}

for {
tok, err := shared.NextTag(p)
tok, err := ap.base.NextTag(p)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -352,7 +376,7 @@ func (ap *Parser) parseSource(p *xpp.XMLPullParser) (*Source, error) {
extensions := ext.Extensions{}

for {
tok, err := shared.NextTag(p)
tok, err := ap.base.NextTag(p)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -510,7 +534,7 @@ func (ap *Parser) parsePerson(name string, p *xpp.XMLPullParser) (*Person, error
person := &Person{}

for {
tok, err := shared.NextTag(p)
tok, err := ap.base.NextTag(p)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -654,29 +678,46 @@ func (ap *Parser) parseAtomText(p *xpp.XMLPullParser) (string, error) {
result := text.InnerXML
result = strings.TrimSpace(result)

lowerType := strings.ToLower(text.Type)
lowerMode := strings.ToLower(text.Mode)

if strings.HasPrefix(result, "<![CDATA[") &&
strings.HasSuffix(result, "]]>") {
result = strings.TrimPrefix(result, "<![CDATA[")
result = strings.TrimSuffix(result, "]]>")
return result, nil
if lowerType == "html" || strings.Contains(lowerType, "xhtml") {
result, _ = ap.base.ResolveHTML(result)
}
} else {
// decode non-CDATA contents depending on type

if lowerType == "text" ||
strings.HasPrefix(lowerType, "text/") ||
(lowerType == "" && lowerMode == "") {
result, err = shared.DecodeEntities(result)
} else if strings.Contains(lowerType, "xhtml") {
result = ap.stripWrappingDiv(result)
result, _ = ap.base.ResolveHTML(result)
} else if lowerType == "html" {
result = ap.stripWrappingDiv(result)
result, err = shared.DecodeEntities(result)
if err == nil {
result, _ = ap.base.ResolveHTML(result)
}
} else {
decodedStr, err := base64.StdEncoding.DecodeString(result)
if err == nil {
result = string(decodedStr)
}
}
}

lowerType := strings.ToLower(text.Type)
lowerMode := strings.ToLower(text.Mode)

if lowerType == "text" ||
strings.HasPrefix(lowerType, "text/") ||
(lowerType == "" && lowerMode == "") {
result, err = shared.DecodeEntities(result)
} else if strings.Contains(lowerType, "xhtml") {
result = ap.stripWrappingDiv(result)
} else if lowerType == "html" {
result = ap.stripWrappingDiv(result)
result, err = shared.DecodeEntities(result)
} else {
decodedStr, err := base64.StdEncoding.DecodeString(result)
// resolve relative URIs in URI-containing elements according to xml:base
name := strings.ToLower(p.Name)
if uriElements[name] {
resolved, err := ap.base.ResolveURL(result)
if err == nil {
result = string(decodedStr)
result = resolved
}
}

Expand Down
3 changes: 2 additions & 1 deletion detector.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ const (
func DetectFeedType(feed io.Reader) FeedType {
p := xpp.NewXMLPullParser(feed, false, shared.NewReaderLabel)

_, err := shared.FindRoot(p)
xmlBase := shared.XMLBase{}
_, err := xmlBase.FindRoot(p)
if err != nil {
return FeedTypeUnknown
}
Expand Down
43 changes: 0 additions & 43 deletions internal/shared/parseutils.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,49 +21,6 @@ var (
InvalidNumericReference = errors.New("invalid numeric reference")
)

// FindRoot advances the pull parser token by token until the first
// StartTag event is seen and returns that event. Any error reported by
// the parser is returned as-is together with the event that accompanied
// it. If EndDocument is reached before a StartTag, EndDocument is
// returned along with an error.
func FindRoot(p *xpp.XMLPullParser) (event xpp.XMLEventType, err error) {
	for {
		if event, err = p.Next(); err != nil {
			return event, err
		}

		switch event {
		case xpp.StartTag:
			// Root element located; stop consuming tokens.
			return event, nil
		case xpp.EndDocument:
			return event, fmt.Errorf("Failed to find root node before document end.")
		}
	}
}

// NextTag advances the pull parser until it lands on a StartTag or an
// EndTag and returns that event. Unlike goxpp's own NextTag method it
// does not fail when the very next token is something else; it keeps
// consuming tokens instead. Any error reported by the parser is returned
// as-is, and reaching EndDocument first yields EndDocument plus an error.
func NextTag(p *xpp.XMLPullParser) (event xpp.XMLEventType, err error) {
	for {
		if event, err = p.Next(); err != nil {
			return event, err
		}

		switch event {
		case xpp.StartTag, xpp.EndTag:
			// Tag boundary found; hand it back to the caller.
			return event, nil
		case xpp.EndDocument:
			return event, fmt.Errorf("Failed to find NextTag before reaching the end of the document.")
		}
	}
}

// ParseText is a helper function for parsing the text
// from the current element of the XMLPullParser.
// This function can handle parsing naked XML text from
Expand Down
Loading

0 comments on commit 9665eb3

Please sign in to comment.