Skip to content

Commit

Permalink
Auto-fix nested CDATA tags in the note content
Browse files Browse the repository at this point in the history
  • Loading branch information
wormi4ok committed May 21, 2023
1 parent 9fc2167 commit 7959cdb
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 1 deletion.
32 changes: 31 additions & 1 deletion encoding/enex/enex.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"fmt"
"io"
"regexp"
"strings"
)

type (
Expand Down Expand Up @@ -123,7 +124,13 @@ type StreamDecoder struct {
}

func NewStreamDecoder(r io.Reader) (*StreamDecoder, error) {
d := xml.NewDecoder(r)
buf := bytes.Buffer{}
if _, err := buf.ReadFrom(r); err != nil {
return nil, err
}
clean := removeNestedCDATA(buf.String())

d := xml.NewDecoder(strings.NewReader(clean))
d.Strict = false

for {
Expand Down Expand Up @@ -202,3 +209,26 @@ func decodeRecognition(n *Note) error {

return nil
}

// reCDATA matches a CDATA section whose opening and closing markers are on the
// same line: the pattern has no `s` flag, so `.` never crosses a newline.
// The single capture group holds the text between the markers, matched
// non-greedily so the section ends at the first "]]>".
var reCDATA = regexp.MustCompile(`<!\[CDATA\[(.*?)\]\]>`)

// removeNestedCDATA strips CDATA markers that appear inside an already open
// CDATA section and keeps the text they wrap.
//
// Nested CDATA tags are not allowed by the XML specification, but Evernote
// emits them anyway, causing "Unexpected EOF" errors during decoding.
//
// The input is scanned once while tracking how many CDATA sections are open.
// Only markers found at depth > 0 are dropped, so well-formed top-level
// sections (including those written on a single line) are left intact, and
// nested sections are removed even when their text spans several lines.
// Unbalanced markers are copied through unchanged.
func removeNestedCDATA(input string) string {
	const (
		openTag  = "<![CDATA["
		closeTag = "]]>"
	)

	// Nesting needs at least two opening markers; skip the copy otherwise.
	if strings.Count(input, openTag) < 2 {
		return input
	}

	var out strings.Builder
	out.Grow(len(input))

	depth := 0  // number of CDATA sections currently open
	copied := 0 // start of the input that has not been written to out yet
	for i := 0; i < len(input); {
		var (
			marker string
			keep   bool
		)
		switch {
		case strings.HasPrefix(input[i:], openTag):
			// Only the outermost opening marker is legal XML.
			marker, keep = openTag, depth == 0
			depth++
		case strings.HasPrefix(input[i:], closeTag):
			// The closing marker of the outermost section (or a stray one
			// outside any section) is kept; inner ones are dropped.
			marker, keep = closeTag, depth <= 1
			if depth > 0 {
				depth--
			}
		default:
			i++
			continue
		}

		if !keep {
			// Flush everything up to the marker, then skip over it.
			out.WriteString(input[copied:i])
			copied = i + len(marker)
		}
		i += len(marker)
	}
	out.WriteString(input[copied:])

	return out.String()
}
16 changes: 16 additions & 0 deletions encoding/enex/enex_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,22 @@ func TestStreamDecodeWrongFile(t *testing.T) {
t.Errorf("Expected error, got = %v", err)
}
}
// TestStreamDecodeAutofixCDATA checks that an export whose note content holds
// nested CDATA markers can be opened by NewStreamDecoder and that its first
// note decodes without error.
func TestStreamDecodeAutofixCDATA(t *testing.T) {
	enexContent, err := os.Open("testdata/cdata.issue.enex")
	if err != nil {
		t.Fatal(err)
	}
	defer enexContent.Close()

	d, err := enex.NewStreamDecoder(enexContent)
	if err != nil {
		// Stop here: the decoder is nil when an error is returned, so it is
		// not safe to go on and call Next on it.
		t.Fatalf("Error while Decoding = %v", err)
	}

	var got enex.Note
	if err := d.Next(&got); err != nil {
		t.Error(err)
	}
}

func readFile(filename string) []byte {
file, err := os.ReadFile(filename)
Expand Down
30 changes: 30 additions & 0 deletions encoding/enex/testdata/cdata.issue.enex
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE en-export SYSTEM "http://xml.evernote.com/pub/evernote-export4.dtd">
<en-export export-date="20230121T235625Z" application="Evernote" version="10.19.2">
<note>
<title>Test note with nested CDATA tags</title>
<created>20230121T235548Z</created>
<updated>20230121T235625Z</updated>
<tag>test data</tag>
<tag>data</tag>
<note-attributes>
<author>me</author>
</note-attributes>
<content>
<![CDATA[<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd">
<en-note>
<div>email</div>
<![CDATA[>]]>
<div><b>Title</b><span><br/></span></div>
<div>Date<br/></div>
<div>text</div>]]
<![CDATA[>]]>
<div><br></div>
<div lang="EN-US">
</div>
<div><br/></div>
</en-note> ]]>
</content>
</note>
</en-export>

0 comments on commit 7959cdb

Please sign in to comment.