Atom: implement xml:base relative URI resolution

What it does: Resolves relative URIs in feed element attributes, in feed elements which contain URIs (like author:uri), and in HTML element attributes inside atom elements of type "html" or "xhtml", according to the xml:base specification (https://www.w3.org/TR/xmlbase/).
What it is: The XMLBase type and functions live in the internal/shared package (internal/shared/xmlbase.go), with a fairly minimal patch against atom/parser.go. Tests live in testdata/parser/atom/ and are adapted from the python feedparser project: https://github.com/kurtmckee/feedparser/tree/master/feedparser/tests/wellformed/base
How it works: As each atom element is parsed, a new xml:base is pushed onto the stack; the top xml:base URI is used to resolve attributes (golang.org/x/net/html is used to parse any "html" or "xhtml" element content); then the base is popped from the stack. The shared.FindRoot() and shared.NextTag() functions have been moved to methods of XMLBase so that they can manage the xml:base URL stack.
ToasterKTN · Sep 29, 2018 · 9665eb3 · 9665eb3
1 parent c9d2a40
commit 9665eb3
Show file tree

Hide file tree

Showing 59 changed files with 752 additions and 76 deletions.
diff --git a/atom/parser.go b/atom/parser.go
@@ -11,14 +11,38 @@ import (
 	"github.com/mmcdole/goxpp"
 )
 
+var (
+	// Atom elements which contain URIs
+	// https://tools.ietf.org/html/rfc4287
+	uriElements = map[string]bool{
+		"icon": true,
+		"id":   true,
+		"logo": true,
+		"uri":  true,
+		"url":  true, // atom 0.3
+	}
+
+	// Atom attributes which contain URIs
+	// https://tools.ietf.org/html/rfc4287
+	atomURIAttrs = map[string]bool{
+		"href":   true,
+		"scheme": true,
+		"src":    true,
+		"uri":    true,
+	}
+)
+
 // Parser is an Atom Parser
-type Parser struct{}
+type Parser struct {
+	base *shared.XMLBase
+}
 
 // Parse parses an xml feed into an atom.Feed
 func (ap *Parser) Parse(feed io.Reader) (*Feed, error) {
 	p := xpp.NewXMLPullParser(feed, false, shared.NewReaderLabel)
+	ap.base = &shared.XMLBase{URIAttrs: atomURIAttrs}
 
-	_, err := shared.FindRoot(p)
+	_, err := ap.base.FindRoot(p)
 	if err != nil {
 		return nil, err
 	}
@@ -43,7 +67,7 @@ func (ap *Parser) parseRoot(p *xpp.XMLPullParser) (*Feed, error) {
 	extensions := ext.Extensions{}
 
 	for {
-		tok, err := shared.NextTag(p)
+		tok, err := ap.base.NextTag(p)
 		if err != nil {
 			return nil, err
 		}
@@ -197,7 +221,7 @@ func (ap *Parser) parseEntry(p *xpp.XMLPullParser) (*Entry, error) {
 	extensions := ext.Extensions{}
 
 	for {
-		tok, err := shared.NextTag(p)
+		tok, err := ap.base.NextTag(p)
 		if err != nil {
 			return nil, err
 		}
@@ -352,7 +376,7 @@ func (ap *Parser) parseSource(p *xpp.XMLPullParser) (*Source, error) {
 	extensions := ext.Extensions{}
 
 	for {
-		tok, err := shared.NextTag(p)
+		tok, err := ap.base.NextTag(p)
 		if err != nil {
 			return nil, err
 		}
@@ -510,7 +534,7 @@ func (ap *Parser) parsePerson(name string, p *xpp.XMLPullParser) (*Person, error
 	person := &Person{}
 
 	for {
-		tok, err := shared.NextTag(p)
+		tok, err := ap.base.NextTag(p)
 		if err != nil {
 			return nil, err
 		}
@@ -654,29 +678,46 @@ func (ap *Parser) parseAtomText(p *xpp.XMLPullParser) (string, error) {
 	result := text.InnerXML
 	result = strings.TrimSpace(result)
 
+	lowerType := strings.ToLower(text.Type)
+	lowerMode := strings.ToLower(text.Mode)
+
 	if strings.HasPrefix(result, "<![CDATA[") &&
 		strings.HasSuffix(result, "]]>") {
 		result = strings.TrimPrefix(result, "<![CDATA[")
 		result = strings.TrimSuffix(result, "]]>")
-		return result, nil
+		if lowerType == "html" || strings.Contains(lowerType, "xhtml") {
+			result, _ = ap.base.ResolveHTML(result)
+		}
+	} else {
+		// decode non-CDATA contents depending on type
+
+		if lowerType == "text" ||
+			strings.HasPrefix(lowerType, "text/") ||
+			(lowerType == "" && lowerMode == "") {
+			result, err = shared.DecodeEntities(result)
+		} else if strings.Contains(lowerType, "xhtml") {
+			result = ap.stripWrappingDiv(result)
+			result, _ = ap.base.ResolveHTML(result)
+		} else if lowerType == "html" {
+			result = ap.stripWrappingDiv(result)
+			result, err = shared.DecodeEntities(result)
+			if err == nil {
+				result, _ = ap.base.ResolveHTML(result)
+			}
+		} else {
+			decodedStr, err := base64.StdEncoding.DecodeString(result)
+			if err == nil {
+				result = string(decodedStr)
+			}
+		}
 	}
 
-	lowerType := strings.ToLower(text.Type)
-	lowerMode := strings.ToLower(text.Mode)
-
-	if lowerType == "text" ||
-		strings.HasPrefix(lowerType, "text/") ||
-		(lowerType == "" && lowerMode == "") {
-		result, err = shared.DecodeEntities(result)
-	} else if strings.Contains(lowerType, "xhtml") {
-		result = ap.stripWrappingDiv(result)
-	} else if lowerType == "html" {
-		result = ap.stripWrappingDiv(result)
-		result, err = shared.DecodeEntities(result)
-	} else {
-		decodedStr, err := base64.StdEncoding.DecodeString(result)
+	// resolve relative URIs in URI-containing elements according to xml:base
+	name := strings.ToLower(p.Name)
+	if uriElements[name] {
+		resolved, err := ap.base.ResolveURL(result)
 		if err == nil {
-			result = string(decodedStr)
+			result = resolved
 		}
 	}
 

diff --git a/detector.go b/detector.go
@@ -28,7 +28,8 @@ const (
 func DetectFeedType(feed io.Reader) FeedType {
 	p := xpp.NewXMLPullParser(feed, false, shared.NewReaderLabel)
 
-	_, err := shared.FindRoot(p)
+	xmlBase := shared.XMLBase{}
+	_, err := xmlBase.FindRoot(p)
 	if err != nil {
 		return FeedTypeUnknown
 	}

diff --git a/internal/shared/parseutils.go b/internal/shared/parseutils.go
@@ -21,49 +21,6 @@ var (
 	InvalidNumericReference = errors.New("invalid numeric reference")
 )
 
-// FindRoot iterates through the tokens of an xml document until
-// it encounters its first StartTag event.  It returns an error
-// if it reaches EndDocument before finding a tag.
-func FindRoot(p *xpp.XMLPullParser) (event xpp.XMLEventType, err error) {
-	for {
-		event, err = p.Next()
-		if err != nil {
-			return event, err
-		}
-		if event == xpp.StartTag {
-			break
-		}
-
-		if event == xpp.EndDocument {
-			return event, fmt.Errorf("Failed to find root node before document end.")
-		}
-	}
-	return
-}
-
-// NextTag iterates through the tokens until it reaches a StartTag or EndTag
-// It is similar to goxpp's NextTag method except it wont throw an error if
-// the next immediate token isnt a Start/EndTag.  Instead, it will continue to
-// consume tokens until it hits a Start/EndTag or EndDocument.
-func NextTag(p *xpp.XMLPullParser) (event xpp.XMLEventType, err error) {
-	for {
-		event, err = p.Next()
-		if err != nil {
-			return event, err
-		}
-
-		if event == xpp.StartTag || event == xpp.EndTag {
-			break
-		}
-
-		if event == xpp.EndDocument {
-			return event, fmt.Errorf("Failed to find NextTag before reaching the end of the document.")
-		}
-
-	}
-	return
-}
-
 // ParseText is a helper function for parsing the text
 // from the current element of the XMLPullParser.
 // This function can handle parsing naked XML text from