Skip to content

Commit

Permalink
convert: exclude parens as a special case
Browse files Browse the repository at this point in the history
Parens are valid characters in a URL, so we need a special case.

fixes Debian#121
  • Loading branch information
stapelberg committed Aug 8, 2020
1 parent fc82521 commit 503568d
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 8 deletions.
32 changes: 24 additions & 8 deletions internal/convert/convert.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import (
"net/url"
"sort"
"strings"
"unicode"

"golang.org/x/net/html"
)
Expand Down Expand Up @@ -107,7 +106,18 @@ func findUrls(txt string) [][]int {
lastSlash := -1
inUrl := false

// maybeStripParens returns the end offset to use for the URL candidate that
// starts at lastWordBoundary+1 and ends (exclusively) at end. When the byte
// directly before the candidate is '(' and the candidate's last byte is ')',
// the closing parenthesis is treated as surrounding punctuation rather than
// part of the URL, and end-1 is returned. Otherwise end is returned unchanged.
maybeStripParens := func(end int) int {
	// Any non-negative lastWordBoundary is a valid index to inspect. The
	// bound must be 0, not 1, so that an opening parenthesis at byte offset
	// 0 or 1 (e.g. the text "(http://debian.org/)") is recognized as well.
	if lastWordBoundary >= 0 &&
		txt[lastWordBoundary] == '(' &&
		txt[end-1] == ')' {
		end--
	}
	return end
}

Outer:
for i, r := range txt {
// As per https://stackoverflow.com/a/1547940/712014:
switch {
case 'a' <= r && r <= 'z' ||
'A' <= r && r <= 'Z' ||
Expand All @@ -120,12 +130,17 @@ func findUrls(txt string) [][]int {
}
lastSlash = i
default:
if inUrl && !unicode.IsSpace(r) && r != '>' {
continue
}
if inUrl && (unicode.IsSpace(r) || r == '>') {
results = append(results, []int{lastWordBoundary + 1, i})
inUrl = false
if inUrl {
switch r {
case '-', '.', '_', '~', '?', '#', '[', ']', '@', '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=':
// Valid URL character, continue processing:
continue Outer
default:
// Invalid URL character, i.e. end of URL:
end := maybeStripParens(i)
results = append(results, []int{lastWordBoundary + 1, end})
inUrl = false
}
}

lastWordBoundary = i
Expand All @@ -134,7 +149,8 @@ func findUrls(txt string) [][]int {
}
}
if inUrl {
results = append(results, []int{lastWordBoundary + 1, len(txt)})
end := maybeStripParens(len(txt))
results = append(results, []int{lastWordBoundary + 1, end})
}
return results
}
Expand Down
35 changes: 35 additions & 0 deletions internal/convert/convert_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -415,3 +415,38 @@ func BenchmarkXref(b *testing.B) {
xref(data, func(ref string) string { return ref })
}
}

// TestXrefHrefExclude checks xref on a text node whose URL is wrapped in
// parentheses: the expected result is a text node ending in "(", an <a>
// element whose href and link text are the bare URL, and a text node starting
// with ")".
func TestXrefHrefExclude(t *testing.T) {
	// text builds a plain text node holding s.
	text := func(s string) *html.Node {
		return &html.Node{Type: html.TextNode, Data: s}
	}
	// identity leaves every reference untouched when handed to xref.
	identity := func(ref string) string { return ref }

	input := text("the upstream website (http://debian.org/) goes into more detail")

	// The anchor carries the URL both as href attribute and as link text.
	anchor := &html.Node{
		Type: html.ElementNode,
		Data: "a",
		Attr: []html.Attribute{
			{Key: "href", Val: "http://debian.org/"},
		},
	}
	anchor.AppendChild(text("http://debian.org/"))

	want := []*html.Node{
		text("the upstream website ("),
		anchor,
		text(") goes into more detail"),
	}

	got := xref(input.Data, identity)
	err := cmpElems(input, got, want)
	if err != nil {
		t.Fatalf("Unexpected xref() HTML result: %v", err)
	}
}

0 comments on commit 503568d

Please sign in to comment.