Skip to content

Commit

Permalink
synced with master
Browse files Browse the repository at this point in the history
  • Loading branch information
ganigeorgiev committed Oct 27, 2023
2 parents 1d67a35 + 34fed67 commit f889a3f
Show file tree
Hide file tree
Showing 5 changed files with 173 additions and 89 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,15 @@
```


## v0.19.1

- Fixed `tokenizer.Scan()/ScanAll()` to ignore the separators from the default trim cutset.
An option to also return empty tokens was added via `Tokenizer.KeepEmptyTokens(true)`.
_This should fix the parsing of whitespace characters around view query column names when no quotes are used ([#3616](https://github.com/pocketbase/pocketbase/discussions/3616#discussioncomment-7398564))._

- Fixed the `:excerpt(max, withEllipsis?)` `field` query param modifier to properly add space to the generated text fragment after block tags.


## v0.19.0

- Added Patreon OAuth2 provider ([#3323](https://github.com/pocketbase/pocketbase/pull/3323); thanks @ghostdevv).
Expand Down
43 changes: 26 additions & 17 deletions tools/rest/excerpt_modifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,46 +78,55 @@ func (m *excerptModifier) Modify(value any) (any, error) {
return "", err
}

var isNotEmpty bool
var needSpace bool
var hasPrevSpace bool

// for all node types and more details check
// https://pkg.go.dev/golang.org/x/net/html#Parse
var stripTags func(*html.Node)
stripTags = func(n *html.Node) {
switch n.Type {
case html.TextNode:
if txt := strings.TrimSpace(whitespaceRegex.ReplaceAllString(n.Data, " ")); txt != "" {
if isNotEmpty && needSpace {
needSpace = false
builder.WriteString(" ")
}

builder.WriteString(txt)
// collapse multiple spaces into one
txt := whitespaceRegex.ReplaceAllString(n.Data, " ")

if !isNotEmpty {
isNotEmpty = true
}
if hasPrevSpace {
txt = strings.TrimLeft(txt, " ")
}
case html.ElementNode:
if !needSpace && !list.ExistInSlice(n.Data, inlineTags) {
needSpace = true

if txt != "" {
hasPrevSpace = strings.HasSuffix(txt, " ")

builder.WriteString(txt)
}
}

if builder.Len() > m.max {
// excerpt max has been reached => no need to further iterate
// (+2 for the extra whitespace suffix/prefix that will be trimmed later)
if builder.Len() > m.max+2 {
return
}

for c := n.FirstChild; c != nil; c = c.NextSibling {
if c.Type != html.ElementNode || !list.ExistInSlice(c.Data, excludeTags) {
isBlock := c.Type == html.ElementNode && !list.ExistInSlice(c.Data, inlineTags)

if isBlock && !hasPrevSpace {
builder.WriteString(" ")
hasPrevSpace = true
}

stripTags(c)

if isBlock && !hasPrevSpace {
builder.WriteString(" ")
hasPrevSpace = true
}
}
}
}
stripTags(doc)

result := builder.String()
result := strings.TrimSpace(builder.String())

if len(result) > m.max {
result = strings.TrimSpace(result[:m.max])
Expand Down
5 changes: 2 additions & 3 deletions tools/rest/excerpt_modifier_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,10 @@ func TestNewExcerptModifier(t *testing.T) {
}

func TestExcerptModifierModify(t *testing.T) {
// plain text value: "Hello t est12 3 word"
html := ` <script>var a = 123;</script> <p>Hello</p><div id="test_id">t est<b>12
3</b></div> <h1>word </h1> `
3</b><span>456</span></div><span>word <b>7</b> 89<span>!<b>?</b><b> a </b><b>b </b>c</span>#<h1>title</h1>`

plainText := "Hello t est12 3 word"
plainText := "Hello t est12 3456 word 7 89!? a b c# title"

scenarios := []struct {
name string
Expand Down
77 changes: 52 additions & 25 deletions tools/tokenizer/tokenizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ const eof = rune(0)
// DefaultSeparators is a list with the default token separator characters.
var DefaultSeparators = []rune{','}

// whitespaceChars lists the runes that the tokenizer treats as whitespace:
// tab, new line, vertical tab, form feed, carriage return, space,
// U+0085 (NEL) and U+00A0 (NBSP) - the same Latin-1 set reported by unicode.IsSpace.
var whitespaceChars = []rune{'\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0}

// NewFromString creates new Tokenizer from the provided string.
func NewFromString(str string) *Tokenizer {
return New(strings.NewReader(str))
Expand All @@ -33,27 +35,30 @@ func NewFromBytes(b []byte) *Tokenizer {

// New creates new Tokenizer from the provided reader with DefaultSeparators.
func New(r io.Reader) *Tokenizer {
return &Tokenizer{
r: bufio.NewReader(r),
separators: DefaultSeparators,
keepSeparator: false,
ignoreParenthesis: false,
}
t := &Tokenizer{r: bufio.NewReader(r)}

t.Separators(DefaultSeparators...)

return t
}

// Tokenizer defines a struct that parses a reader into tokens while
// respecting quotes and parenthesis boundaries.
type Tokenizer struct {
// r is the buffered reader the tokens are read from.
r *bufio.Reader

// trimCutset holds the characters stripped from both ends of every token
// (the whitespace characters that are not configured as separators).
trimCutset string
// separators are the runes that split the input into tokens.
separators []rune
// keepSeparator controls whether the separator rune is kept (set via KeepSeparator).
keepSeparator bool
// keepEmptyTokens controls whether Scan() returns empty tokens instead of skipping them.
keepEmptyTokens bool
// ignoreParenthesis makes '(' and ')' be treated as regular characters.
ignoreParenthesis bool
}

// Separators defines the provided separators of the current Tokenizer
// and rebuilds the trim cutset so that it stays in sync with them.
//
// The provided runes are copied, so later changes to the caller's slice
// do not affect the Tokenizer.
func (t *Tokenizer) Separators(separators ...rune) {
	// A spread call such as t.Separators(DefaultSeparators...) passes the
	// caller's slice unchanged. Storing it directly would let later writes
	// to that slice alter the separators without rebuilding the cutset.
	t.separators = append([]rune(nil), separators...)

	t.rebuildTrimCutset()
}

// KeepSeparator defines whether to keep the separator rune as part
Expand All @@ -62,35 +67,37 @@ func (t *Tokenizer) KeepSeparator(state bool) {
t.keepSeparator = state
}

// KeepEmptyTokens defines whether Scan() should return empty tokens
// instead of skipping them (defaults to false).
func (t *Tokenizer) KeepEmptyTokens(state bool) {
t.keepEmptyTokens = state
}

// IgnoreParenthesis defines whether to ignore the parenthesis boundaries
// and to treat the '(' and ')' as regular characters
// (when false, opening and closing parentheses outside of quotes are tracked while reading a token).
func (t *Tokenizer) IgnoreParenthesis(state bool) {
t.ignoreParenthesis = state
}

// Scan reads and returns the next available token from the Tokenizer's buffer (trimmed).
// Scan reads and returns the next available token from the Tokenizer's buffer (trimmed!).
//
// Empty tokens are skipped if t.keepEmptyTokens is not set (which is the default).
//
// Returns [io.EOF] error when there are no more tokens to scan.
func (t *Tokenizer) Scan() (string, error) {
ch := t.read()

if ch == eof {
return "", io.EOF
}

if isWhitespaceRune(ch) {
t.readWhiteSpaces()
} else {
t.unread()
}
t.unread()

token, err := t.readToken()
if err != nil {
return "", err
}

// read all remaining whitespaces
t.readWhiteSpaces()
if !t.keepEmptyTokens && token == "" {
return t.Scan()
}

return token, err
}
Expand Down Expand Up @@ -129,12 +136,12 @@ func (t *Tokenizer) readToken() (string, error) {
break
}

if !isEscapeRune(prevCh) {
if !t.isEscapeRune(prevCh) {
if !t.ignoreParenthesis && ch == '(' && quoteCh == eof {
parenthesis++ // opening parenthesis
} else if !t.ignoreParenthesis && ch == ')' && parenthesis > 0 && quoteCh == eof {
parenthesis-- // closing parenthesis
} else if isQuoteRune(ch) {
} else if t.isQuoteRune(ch) {
if quoteCh == ch {
quoteCh = eof // closing quote
} else if quoteCh == eof {
Expand All @@ -158,7 +165,7 @@ func (t *Tokenizer) readToken() (string, error) {
return "", fmt.Errorf("unbalanced parenthesis or quoted expression: %q", buf.String())
}

return buf.String(), nil
return strings.Trim(buf.String(), t.trimCutset), nil
}

// readWhiteSpaces consumes all contiguous whitespace runes.
Expand All @@ -170,7 +177,7 @@ func (t *Tokenizer) readWhiteSpaces() {
break
}

if !t.isSeperatorRune(ch) {
if !t.isWhitespaceRune(ch) {
t.unread()
break
}
Expand All @@ -193,6 +200,20 @@ func (t *Tokenizer) unread() error {
return t.r.UnreadRune()
}

// rebuildTrimCutset recomputes t.trimCutset as the whitespace characters
// that are not currently registered as token separators.
func (t *Tokenizer) rebuildTrimCutset() {
	var sb strings.Builder

	for i := 0; i < len(whitespaceChars); i++ {
		// a whitespace rune used as separator must not be trimmed away
		if ch := whitespaceChars[i]; !t.isSeperatorRune(ch) {
			sb.WriteRune(ch)
		}
	}

	t.trimCutset = sb.String()
}

// isSeperatorRune checks if a rune is a token part separator.
func (t *Tokenizer) isSeperatorRune(ch rune) bool {
for _, r := range t.separators {
Expand All @@ -204,17 +225,23 @@ func (t *Tokenizer) isSeperatorRune(ch rune) bool {
return false
}

// isWhitespaceRune checks if a rune is a space, tab, or newline.
func isWhitespaceRune(ch rune) bool {
return ch == ' ' || ch == '\t' || ch == '\n'
// isWhitespaceRune checks if a rune is a space character (eg. space, tab, new line).
func (t *Tokenizer) isWhitespaceRune(ch rune) bool {
for _, c := range whitespaceChars {
if c == ch {
return true
}
}

return false
}

// isQuoteRune checks if a rune is a quote.
func isQuoteRune(ch rune) bool {
func (t *Tokenizer) isQuoteRune(ch rune) bool {
return ch == '\'' || ch == '"' || ch == '`'
}

// isEscapeRune checks if a rune is an escape character.
func isEscapeRune(ch rune) bool {
func (t *Tokenizer) isEscapeRune(ch rune) bool {
return ch == '\\'
}
Loading

0 comments on commit f889a3f

Please sign in to comment.