fix parsing error caused by single quote transformation (pingcap#1274)

fix parsing error caused by single quote transformation
stanxii · Jun 1, 2016 · 97df813 · 97df813
1 parent 61532d7
commit 97df813
Show file tree

Hide file tree

Showing 4 changed files with 312 additions and 10 deletions.
diff --git a/parser/parser_test.go b/parser/parser_test.go
@@ -948,6 +948,20 @@ func (s *testParserSuite) TestIndexHint(c *C) {
 	s.RunTest(c, table)
 }
 
+// TestEscape feeds the parser string literals that mix quote characters,
+// multi-byte text and backslash escapes. Each case pairs a SQL statement with
+// the boolean handed to RunTest (presumably whether parsing should succeed —
+// confirm against RunTest).
+func (s *testParserSuite) TestEscape(c *C) {
+	defer testleak.AfterTest(c)()
+	cases := []testCase{
+		// Quote handling.
+		{`select """;`, false},
+		{`select """";`, true},
+		{`select "汉字";`, true},
+		{`select 'abc"def';`, true},
+		// Backslash escapes.
+		{`select 'a\r\n';`, true},
+		{`select "\a\r\n"`, true},
+		{`select "\xFF"`, true},
+	}
+	s.RunTest(c, cases)
+}
+
 func (s *testParserSuite) TestInsertStatementMemoryAllocation(c *C) {
 	sql := "insert t values (1)" + strings.Repeat(",(1)", 1000)
 	var oldStats, newStats runtime.MemStats

diff --git a/parser/scanner.l b/parser/scanner.l
@@ -1235,13 +1235,8 @@ func (l *lexer) str(lval *yySymType, pref string) int {
 	// TODO: performance issue.
 	s := string(l.stringLit)
 	l.stringLit = l.stringLit[0:0]
-	if pref == "'" {
-		s = strings.Replace(s, "\\'", "'", -1)    
-		s = strings.TrimSuffix(s, "'") + "\""
-		pref = "\""
-	} 
 	v := stringutil.RemoveUselessBackslash(pref+s)
-	v, err := strconv.Unquote(v)
+	v, err := stringutil.Unquote(v)
 	if err != nil {
 		v = strings.TrimSuffix(s, pref)
 	}

diff --git a/util/stringutil/string_util.go b/util/stringutil/string_util.go
@@ -16,18 +16,24 @@ package stringutil
 import (
 	"bytes"
 	"strings"
+	"unicode/utf8"
+
+	"github.com/juju/errors"
 )
 
+// ErrSyntax indicates that a value does not have the right syntax for the target type.
+var ErrSyntax = errors.New("invalid syntax")
+
 // See: https://dev.mysql.com/doc/refman/5.7/en/string-literals.html#character-escape-sequences
 const validEscapeChars = `0'"bntrz\\%_`
 
 // RemoveUselessBackslash removes backslashes which could be ignored in the string literal.
 // See: https://dev.mysql.com/doc/refman/5.7/en/string-literals.html
-// " Each of these sequences begins with a backslash (“\”), known as the escape character.
-// MySQL recognizes the escape sequences shown in Table 9.1, “Special Character Escape Sequences”.
+// " Each of these sequences begins with a backslash ("\"), known as the escape character.
+// MySQL recognizes the escape sequences shown in Table 9.1, "Special Character Escape Sequences".
 // For all other escape sequences, backslash is ignored. That is, the escaped character is
-// interpreted as if it was not escaped. For example, “\x” is just “x”. These sequences are case sensitive.
-// For example, “\b” is interpreted as a backspace, but “\B” is interpreted as “B”."
+// interpreted as if it was not escaped. For example, "\x" is just "x". These sequences are case sensitive.
+// For example, "\b" is interpreted as a backspace, but "\B" is interpreted as "B"."
 func RemoveUselessBackslash(s string) string {
 	var (
 		buf bytes.Buffer
@@ -60,3 +66,185 @@ func Reverse(s string) string {
 	}
 	return string(r)
 }
+
+// UnquoteChar decodes the first character or byte in the escaped string
+// or character literal represented by the string s.
+// It returns four values:
+//
+//  1. value, the decoded Unicode code point or byte value;
+//  2. multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
+//  3. tail, the remainder of the string after the character; and
+//  4. an error that will be nil if the character is syntactically valid.
+//
+// The second argument, quote, specifies the type of literal being parsed.
+// If it is a single or a double quote, an unescaped occurrence of that same
+// quote character at the start of s is rejected. Any other value (such as
+// zero) allows both quote characters to appear unescaped.
+//
+// The recognized escapes are \a \b \f \n \r \t \v, \\, \xHH, \uHHHH,
+// \UHHHHHHHH and three-digit octal \ooo (at most 255). Different from
+// strconv.UnquoteChar, the escaped quotes \' and \" are both accepted
+// whatever the value of quote is.
+//
+// An empty s, a trailing lone backslash, a truncated or malformed numeric
+// escape, a \u or \U value outside the Unicode range, and any other escape
+// character are all reported as ErrSyntax.
+func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) {
+	// There is no first character to decode in an empty string; report it
+	// instead of panicking on s[0].
+	if len(s) == 0 {
+		err = errors.Trace(ErrSyntax)
+		return
+	}
+	// easy cases
+	switch c := s[0]; {
+	case c == quote && (quote == '\'' || quote == '"'):
+		err = errors.Trace(ErrSyntax)
+		return
+	case c >= utf8.RuneSelf:
+		r, size := utf8.DecodeRuneInString(s)
+		return r, true, s[size:], nil
+	case c != '\\':
+		return rune(s[0]), false, s[1:], nil
+	}
+	// hard case: c is backslash
+	if len(s) <= 1 {
+		err = errors.Trace(ErrSyntax)
+		return
+	}
+	c := s[1]
+	s = s[2:]
+	switch c {
+	case 'a':
+		value = '\a'
+	case 'b':
+		value = '\b'
+	case 'f':
+		value = '\f'
+	case 'n':
+		value = '\n'
+	case 'r':
+		value = '\r'
+	case 't':
+		value = '\t'
+	case 'v':
+		value = '\v'
+	case 'x', 'u', 'U':
+		// Number of hex digits that must follow the escape character.
+		n := 0
+		switch c {
+		case 'x':
+			n = 2
+		case 'u':
+			n = 4
+		case 'U':
+			n = 8
+		}
+		var v rune
+		if len(s) < n {
+			err = errors.Trace(ErrSyntax)
+			return
+		}
+		for j := 0; j < n; j++ {
+			x, ok := unhex(s[j])
+			if !ok {
+				err = errors.Trace(ErrSyntax)
+				return
+			}
+			v = v<<4 | x
+		}
+		s = s[n:]
+		if c == 'x' {
+			// single-byte string, possibly not UTF-8
+			value = v
+			break
+		}
+		// rune is a signed 32-bit value, so eight hex digits with the top
+		// bit set wrap around to a negative number; reject those together
+		// with values above the Unicode maximum.
+		if v < 0 || v > utf8.MaxRune {
+			err = errors.Trace(ErrSyntax)
+			return
+		}
+		value = v
+		multibyte = true
+	case '0', '1', '2', '3', '4', '5', '6', '7':
+		v := rune(c) - '0'
+		if len(s) < 2 {
+			err = errors.Trace(ErrSyntax)
+			return
+		}
+		for j := 0; j < 2; j++ { // one digit already; two more
+			x := rune(s[j]) - '0'
+			if x < 0 || x > 7 {
+				err = errors.Trace(ErrSyntax)
+				return
+			}
+			v = (v << 3) | x
+		}
+		s = s[2:]
+		if v > 255 {
+			err = errors.Trace(ErrSyntax)
+			return
+		}
+		value = v
+	case '\\':
+		value = '\\'
+	case '\'', '"':
+		// Either quote may be escaped, independent of the quote argument.
+		value = rune(c)
+	default:
+		err = errors.Trace(ErrSyntax)
+		return
+	}
+	tail = s
+	return
+}
+
+// Unquote interprets s as a single-quoted, double-quoted, or backquoted
+// string literal and returns the string value that s quotes.
+// For example: test=`"\"\n"` (hex: 22 5c 22 5c 6e 22)
+// should be converted to `"\n` (hex: 22 0a).
+//
+// s must be at least two bytes long and start and end with the same quote
+// character. A backquoted literal is returned verbatim and must not contain
+// a backquote. For the other two quote kinds every character is decoded with
+// UnquoteChar, so an unescaped occurrence of the enclosing quote or an
+// escape that UnquoteChar rejects makes the whole literal invalid.
+// On any failure the result is the empty string and a traced ErrSyntax.
+func Unquote(s string) (t string, err error) {
+	n := len(s)
+	if n < 2 {
+		return "", errors.Trace(ErrSyntax)
+	}
+	quote := s[0]
+	if quote != s[n-1] {
+		return "", errors.Trace(ErrSyntax)
+	}
+	s = s[1 : n-1]
+	switch quote {
+	case '`':
+		if strings.IndexByte(s, '`') != -1 {
+			return "", errors.Trace(ErrSyntax)
+		}
+		return s, nil
+	case '"', '\'':
+		// Handled below.
+	default:
+		return "", errors.Trace(ErrSyntax)
+	}
+	// Avoid allocation. No need to convert if there is no '\' and no quote.
+	// A double-quoted body is returned as is. A single-quoted body is only
+	// returned as is when it is valid UTF-8, because the decoding loop below
+	// would rewrite invalid bytes and that result must stay the same.
+	if strings.IndexByte(s, '\\') == -1 && strings.IndexByte(s, quote) == -1 {
+		if quote == '"' || utf8.ValidString(s) {
+			return s, nil
+		}
+	}
+	var runeTmp [utf8.UTFMax]byte
+	buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations.
+	for len(s) > 0 {
+		c, multibyte, rest, cerr := UnquoteChar(s, quote)
+		if cerr != nil {
+			return "", errors.Trace(cerr)
+		}
+		s = rest
+		if c < utf8.RuneSelf || !multibyte {
+			// Plain byte (ASCII or a \x / octal escape): keep it as one byte.
+			buf = append(buf, byte(c))
+			continue
+		}
+		size := utf8.EncodeRune(runeTmp[:], c)
+		buf = append(buf, runeTmp[:size]...)
+	}
+	return string(buf), nil
+}
+
+// unhex converts one ASCII hexadecimal digit (0-9, a-f, A-F) to its numeric
+// value. ok is false, and v is zero, when b is not a hexadecimal digit.
+func unhex(b byte) (v rune, ok bool) {
+	switch digit := rune(b); {
+	case digit >= '0' && digit <= '9':
+		v, ok = digit-'0', true
+	case digit >= 'a' && digit <= 'f':
+		v, ok = digit-'a'+10, true
+	case digit >= 'A' && digit <= 'F':
+		v, ok = digit-'A'+10, true
+	}
+	return v, ok
+}
diff --git a/util/stringutil/string_util_test.go b/util/stringutil/string_util_test.go
@@ -65,3 +65,108 @@ func (s *testStringUtilSuite) TestReverse(c *C) {
 		c.Assert(x, Equals, t.expect)
 	}
 }
+
+// TestUnquote runs Unquote over a table of literals. For every row the
+// returned string must equal expect, and the returned error must be nil
+// exactly when ok is true.
+func (s *testStringUtilSuite) TestUnquote(c *C) {
+	defer testleak.AfterTest(c)()
+	cases := []struct {
+		str    string
+		expect string
+		ok     bool
+	}{
+		// Malformed literals: too short, mismatched or missing quotes,
+		// or an unescaped quote inside the body.
+		{"", "", false},
+		{"'", "", false},
+		{`'abc"`, "", false},
+		{`abcdef`, "", false},
+		{`abcdea`, "", false},
+		{"```", "", false},
+		{"'abc'def'", "", false},
+
+		// Double-quoted literals.
+		{`"abcdef"`, `abcdef`, true},
+		{`"abc'def"`, `abc'def`, true},
+		{`"汉字测试"`, `汉字测试`, true},
+		{`"☺"`, "☺", true},
+		{`"\xFF"`, "\xFF", true},
+		{`"\U00010111"`, "\U00010111", true},
+		{`"\U0001011111"`, "\U0001011111", true},
+		{`"\a\b\f\n\r\t\v\\\""`, "\a\b\f\n\r\t\v\\\"", true},
+
+		// Single-quoted literals.
+		{`'abcdef'`, `abcdef`, true},
+		{`'"'`, "\"", true},
+		{`'\a\b\f\n\r\t\v\\\''`, "\a\b\f\n\r\t\v\\'", true},
+		{`' '`, " ", true},
+
+		// Backquoted literals are taken verbatim.
+		{"``", ``, true},
+		{"`a`", `a`, true},
+		{"`abc`", `abc`, true},
+		{"`☺`", `☺`, true},
+		{"`hello world`", `hello world`, true},
+		{"`\\xFF`", `\xFF`, true},
+	}
+
+	for _, tc := range cases {
+		got, err := Unquote(tc.str)
+		c.Assert(got, Equals, tc.expect)
+		comment := Commentf("source %v", tc.str)
+		if !tc.ok {
+			c.Assert(err, NotNil, comment)
+			continue
+		}
+		c.Assert(err, IsNil, comment)
+	}
+}
+
+// TestUnquoteChar covers escape decoding with a table that adds truncated,
+// out-of-range and unknown escapes to the basic quoting cases.
+// NOTE(review): every row is driven through Unquote; UnquoteChar itself is
+// never called directly here, so it is only exercised by the rows that reach
+// Unquote's per-character decoding loop.
+func (s *testStringUtilSuite) TestUnquoteChar(c *C) {
+	defer testleak.AfterTest(c)()
+	cases := []struct {
+		str    string
+		expect string
+		ok     bool
+	}{
+		// Malformed literals and rejected escapes.
+		{"", "", false},
+		{"'", "", false},
+		{`'abc"`, "", false},
+		{`abcdef`, "", false},
+		{`abcdea`, "", false},
+		{"```", "", false},
+		{"'abc'def'", "", false},
+		{`'abc\n\'`, "", false},
+		{`"abc\0"`, "", false},
+		{`"\098"`, "", false},
+		{`"\777"`, "", false},
+		{`"\汉字"`, "", false},
+
+		// Double-quoted literals.
+		{`"abcdef"`, `abcdef`, true},
+		{`"abc'def"`, `abc'def`, true},
+		{`"汉字测试"`, `汉字测试`, true},
+		{`"☺"`, "☺", true},
+		{`"\u0011"`, "\u0011", true},
+		{`"\xFF"`, "\xFF", true},
+		{`"\U00010111"`, "\U00010111", true},
+		{`"\U0001011111"`, "\U0001011111", true},
+		{`"\a\b\f\n\r\t\v\\\""`, "\a\b\f\n\r\t\v\\\"", true},
+		{`"\066"`, "\066", true},
+
+		// Single-quoted literals.
+		{`'abcdef'`, `abcdef`, true},
+		{`'"'`, "\"", true},
+		{`'\a\b\f\n\r\t\v\\\''`, "\a\b\f\n\r\t\v\\'", true},
+		{`' '`, " ", true},
+
+		// Backquoted literals are taken verbatim.
+		{"``", ``, true},
+		{"`a`", `a`, true},
+		{"`abc`", `abc`, true},
+		{"`☺`", `☺`, true},
+		{"`hello world`", `hello world`, true},
+		{"`\\xFF`", `\xFF`, true},
+	}
+
+	for _, tc := range cases {
+		got, err := Unquote(tc.str)
+		c.Assert(got, Equals, tc.expect)
+		comment := Commentf("source %v", tc.str)
+		if !tc.ok {
+			c.Assert(err, NotNil, comment)
+			continue
+		}
+		c.Assert(err, IsNil, comment)
+	}
+}