Skip to content

Commit

Permalink
[pocketbase#3616] fixed tokenizer whitespace characters trimming
Browse files Browse the repository at this point in the history
  • Loading branch information
ganigeorgiev committed Oct 27, 2023
1 parent 94253f0 commit 01e8c0f
Show file tree
Hide file tree
Showing 34 changed files with 182 additions and 108 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
## v0.19.1

- Fixed `tokenizer.Scan()/ScanAll()` to ignore the separators from the default trim cutset.
An option to also return the found empty tokens was added via `Tokenizer.KeepEmptyTokens(true)`.
_This should fix the parsing of whitespace characters around view query column names when no quotes are used ([#3616](https://github.com/pocketbase/pocketbase/discussions/3616#discussioncomment-7398564))._


## v0.19.0

- Added Patreon OAuth2 provider ([#3323](https://github.com/pocketbase/pocketbase/pull/3323); thanks @ghostdevv).
Expand Down
77 changes: 52 additions & 25 deletions tools/tokenizer/tokenizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ const eof = rune(0)
// DefaultSeparators is a list with the default token separator characters.
var DefaultSeparators = []rune{','}

// whitespaceChars lists the runes treated as whitespace by the tokenizer
// (used by isWhitespaceRune and as the base of the trim cutset in
// rebuildTrimCutset); 0x85 is NEL and 0xA0 is NBSP.
var whitespaceChars = []rune{'\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0}

// NewFromString creates new Tokenizer from the provided string.
func NewFromString(str string) *Tokenizer {
return New(strings.NewReader(str))
Expand All @@ -33,27 +35,30 @@ func NewFromBytes(b []byte) *Tokenizer {

// New creates new Tokenizer from the provided reader with DefaultSeparators.
func New(r io.Reader) *Tokenizer {
	t := &Tokenizer{r: bufio.NewReader(r)}

	// registering the separators through the setter also initializes
	// the trim cutset (see rebuildTrimCutset)
	t.Separators(DefaultSeparators...)

	return t
}

// Tokenizer defines a struct that parses a reader into tokens while
// respecting quotes and parenthesis boundaries.
type Tokenizer struct {
	r *bufio.Reader // the source the tokens are scanned from

	trimCutset        string // runes trimmed from each scanned token (whitespace minus the separators; see rebuildTrimCutset)
	separators        []rune // runes that terminate a token
	keepSeparator     bool   // whether to keep the separator rune as part of the token
	keepEmptyTokens   bool   // whether Scan() returns empty tokens instead of skipping them
	ignoreParenthesis bool   // whether '(' and ')' are treated as regular characters
}

// Separators defines the provided separators of the current Tokenizer.
//
// Changing the separators also rebuilds the trim cutset so that a rune
// registered as a separator is never trimmed from the token edges.
func (t *Tokenizer) Separators(separators ...rune) {
	t.separators = separators

	t.rebuildTrimCutset()
}

// KeepSeparator defines whether to keep the separator rune as part
Expand All @@ -62,35 +67,37 @@ func (t *Tokenizer) KeepSeparator(state bool) {
t.keepSeparator = state
}

// KeepEmptyTokens defines whether to keep empty tokens on Scan() (defaults to false).
func (t *Tokenizer) KeepEmptyTokens(state bool) {
	t.keepEmptyTokens = state
}

// IgnoreParenthesis defines whether to ignore the parenthesis boundaries
// and to treat the '(' and ')' as regular characters.
//
// When enabled, a separator inside an unquoted (...) group is no longer
// kept as part of a single token.
func (t *Tokenizer) IgnoreParenthesis(state bool) {
	t.ignoreParenthesis = state
}

// Scan reads and returns the next available token from the Tokenizer's buffer (trimmed!).
//
// Empty tokens are skipped if t.keepEmptyTokens is not set (which is the default).
//
// Returns [io.EOF] error when there are no more tokens to scan.
func (t *Tokenizer) Scan() (string, error) {
	// peek a single rune only to detect the end of the input
	ch := t.read()
	if ch == eof {
		return "", io.EOF
	}
	t.unread()

	token, err := t.readToken()
	if err != nil {
		return "", err
	}

	// skip the empty token and retry with the remainder of the buffer
	if !t.keepEmptyTokens && token == "" {
		return t.Scan()
	}

	// err is guaranteed nil at this point (checked above)
	return token, nil
}
Expand Down Expand Up @@ -129,12 +136,12 @@ func (t *Tokenizer) readToken() (string, error) {
break
}

if !isEscapeRune(prevCh) {
if !t.isEscapeRune(prevCh) {
if !t.ignoreParenthesis && ch == '(' && quoteCh == eof {
parenthesis++ // opening parenthesis
} else if !t.ignoreParenthesis && ch == ')' && parenthesis > 0 && quoteCh == eof {
parenthesis-- // closing parenthesis
} else if isQuoteRune(ch) {
} else if t.isQuoteRune(ch) {
if quoteCh == ch {
quoteCh = eof // closing quote
} else if quoteCh == eof {
Expand All @@ -158,7 +165,7 @@ func (t *Tokenizer) readToken() (string, error) {
return "", fmt.Errorf("unbalanced parenthesis or quoted expression: %q", buf.String())
}

return buf.String(), nil
return strings.Trim(buf.String(), t.trimCutset), nil
}

// readWhiteSpaces consumes all contiguous whitespace runes.
Expand All @@ -170,7 +177,7 @@ func (t *Tokenizer) readWhiteSpaces() {
break
}

if !t.isSeperatorRune(ch) {
if !t.isWhitespaceRune(ch) {
t.unread()
break
}
Expand All @@ -193,6 +200,20 @@ func (t *Tokenizer) unread() error {
return t.r.UnreadRune()
}

// rebuildTrimCutset recomputes the cutset used for trimming scanned tokens:
// all known whitespace runes except those currently registered as separators.
func (t *Tokenizer) rebuildTrimCutset() {
	trimChars := make([]rune, 0, len(whitespaceChars))

	for _, ch := range whitespaceChars {
		if !t.isSeperatorRune(ch) {
			trimChars = append(trimChars, ch)
		}
	}

	t.trimCutset = string(trimChars)
}

// isSeperatorRune checks if a rune is a token part separator.
func (t *Tokenizer) isSeperatorRune(ch rune) bool {
for _, r := range t.separators {
Expand All @@ -204,17 +225,23 @@ func (t *Tokenizer) isSeperatorRune(ch rune) bool {
return false
}

// isWhitespaceRune checks if a rune is a space, tab, or newline.
func isWhitespaceRune(ch rune) bool {
return ch == ' ' || ch == '\t' || ch == '\n'
// isWhitespaceRune checks if a rune is a space character (eg. space, tab, new line).
func (t *Tokenizer) isWhitespaceRune(ch rune) bool {
for _, c := range whitespaceChars {
if c == ch {
return true
}
}

return false
}

// isQuoteRune checks if a rune is a quote.
func isQuoteRune(ch rune) bool {
func (t *Tokenizer) isQuoteRune(ch rune) bool {
return ch == '\'' || ch == '"' || ch == '`'
}

// isEscapeRune checks if a rune is an escape character.
func isEscapeRune(ch rune) bool {
func (t *Tokenizer) isEscapeRune(ch rune) bool {
return ch == '\\'
}
Loading

0 comments on commit 01e8c0f

Please sign in to comment.