Skip to content

Commit

Permalink
charset: support builtin function convert (pingcap#28943)
Browse files Browse the repository at this point in the history
  • Loading branch information
xiongjiwei authored Nov 1, 2021
1 parent 8d9647d commit ad85341
Show file tree
Hide file tree
Showing 7 changed files with 204 additions and 6 deletions.
15 changes: 15 additions & 0 deletions cmd/explaintest/r/collation.result
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
drop table if exists t;
create table t(a char(10) collate utf8mb4_unicode_ci, b char(10) collate utf8mb4_general_ci);
insert into t values ('啊', '撒旦');
select coercibility(concat(a, b)) from t;
coercibility(concat(a, b))
1
select coercibility(convert(concat(a, b) using utf8mb4) collate utf8mb4_general_ci) from t;
coercibility(convert(concat(a, b) using utf8mb4) collate utf8mb4_general_ci)
0
select coercibility(convert('a' using utf8mb4));
coercibility(convert('a' using utf8mb4))
2
select coercibility(convert('a' using utf8mb4) collate utf8mb4_general_ci);
coercibility(convert('a' using utf8mb4) collate utf8mb4_general_ci)
0
75 changes: 75 additions & 0 deletions cmd/explaintest/r/new_character_set_builtin.result
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,78 @@ select to_base64(a), to_base64(b), to_base64(c) from t;
to_base64(a) to_base64(b) to_base64(c)
5LiA5LqM5LiJ 0ru2/sj9 5LiA5LqM5LiJAAAAAAAAAAAAAAA=
set @@tidb_enable_vectorized_expression = false;
drop table if exists t;
create table t(a char(10));
insert into t values ('中文'), ('啊'), ('a'), ('1'), ('ㅂ');
set @@tidb_enable_vectorized_expression = true;
select hex(convert(a using gbk)), convert(a using gbk) from t;
hex(convert(a using gbk)) convert(a using gbk)
D6D0CEC4 中文
B0A1 啊
61 a
31 1
3F ?
select hex(convert('中文' using gbk)), convert('中文' using gbk);
hex(convert('中文' using gbk)) convert('中文' using gbk)
D6D0CEC4 中文
select hex(convert('啊' using gbk)), convert('啊' using gbk);
hex(convert('啊' using gbk)) convert('啊' using gbk)
B0A1 啊
select hex(convert('a' using gbk)), convert('a' using gbk);
hex(convert('a' using gbk)) convert('a' using gbk)
61 a
select hex(convert('1' using gbk)), convert('1' using gbk);
hex(convert('1' using gbk)) convert('1' using gbk)
31 1
select hex(convert('ㅂ' using gbk)), convert('ㅂ' using gbk);
hex(convert('ㅂ' using gbk)) convert('ㅂ' using gbk)
3F ?
select convert(a using binary), convert(convert(a using gbk) using binary) from t;
convert(a using binary) convert(convert(a using gbk) using binary)
中文 ����
啊 ��
a a
1 1
ㅂ ?
select convert(convert('中文' using gbk) using binary), convert('中文' using binary);
convert(convert('中文' using gbk) using binary) convert('中文' using binary)
���� 中文
select convert(convert('ㅂ' using gbk) using binary), convert('ㅂ' using binary);
convert(convert('ㅂ' using gbk) using binary) convert('ㅂ' using binary)
? ㅂ
set @@tidb_enable_vectorized_expression = false;
select hex(convert(a using gbk)), convert(a using gbk) from t;
hex(convert(a using gbk)) convert(a using gbk)
D6D0CEC4 中文
B0A1 啊
61 a
31 1
3F ?
select hex(convert('中文' using gbk)), convert('中文' using gbk);
hex(convert('中文' using gbk)) convert('中文' using gbk)
D6D0CEC4 中文
select hex(convert('啊' using gbk)), convert('啊' using gbk);
hex(convert('啊' using gbk)) convert('啊' using gbk)
B0A1 啊
select hex(convert('a' using gbk)), convert('a' using gbk);
hex(convert('a' using gbk)) convert('a' using gbk)
61 a
select hex(convert('1' using gbk)), convert('1' using gbk);
hex(convert('1' using gbk)) convert('1' using gbk)
31 1
select hex(convert('ㅂ' using gbk)), convert('ㅂ' using gbk);
hex(convert('ㅂ' using gbk)) convert('ㅂ' using gbk)
3F ?
select convert(a using binary) from t;
convert(a using binary)
中文
a
1
select convert(convert('中文' using gbk) using binary), convert('中文' using binary);
convert(convert('中文' using gbk) using binary) convert('中文' using binary)
���� 中文
select convert(convert('ㅂ' using gbk) using binary), convert('ㅂ' using binary);
convert(convert('ㅂ' using gbk) using binary) convert('ㅂ' using binary)
? ㅂ
9 changes: 9 additions & 0 deletions cmd/explaintest/t/collation.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
--disable_warnings
drop table if exists t;
--enable_warnings
create table t(a char(10) collate utf8mb4_unicode_ci, b char(10) collate utf8mb4_general_ci);
insert into t values ('啊', '撒旦');
select coercibility(concat(a, b)) from t;
select coercibility(convert(concat(a, b) using utf8mb4) collate utf8mb4_general_ci) from t;
select coercibility(convert('a' using utf8mb4));
select coercibility(convert('a' using utf8mb4) collate utf8mb4_general_ci);
24 changes: 24 additions & 0 deletions cmd/explaintest/t/new_character_set_builtin.test
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,27 @@ set @@tidb_enable_vectorized_expression = true;
select to_base64(a), to_base64(b), to_base64(c) from t;
set @@tidb_enable_vectorized_expression = false;

-- test for builtin function convert()
drop table if exists t;
create table t(a char(10));
insert into t values ('中文'), ('啊'), ('a'), ('1'), ('ㅂ');
set @@tidb_enable_vectorized_expression = true;
select hex(convert(a using gbk)), convert(a using gbk) from t;
select hex(convert('中文' using gbk)), convert('中文' using gbk);
select hex(convert('啊' using gbk)), convert('啊' using gbk);
select hex(convert('a' using gbk)), convert('a' using gbk);
select hex(convert('1' using gbk)), convert('1' using gbk);
select hex(convert('ㅂ' using gbk)), convert('ㅂ' using gbk);
select convert(a using binary), convert(convert(a using gbk) using binary) from t;
select convert(convert('中文' using gbk) using binary), convert('中文' using binary);
select convert(convert('ㅂ' using gbk) using binary), convert('ㅂ' using binary);
set @@tidb_enable_vectorized_expression = false;
select hex(convert(a using gbk)), convert(a using gbk) from t;
select hex(convert('中文' using gbk)), convert('中文' using gbk);
select hex(convert('啊' using gbk)), convert('啊' using gbk);
select hex(convert('a' using gbk)), convert('a' using gbk);
select hex(convert('1' using gbk)), convert('1' using gbk);
select hex(convert('ㅂ' using gbk)), convert('ㅂ' using gbk);
select convert(a using binary) from t;
select convert(convert('中文' using gbk) using binary), convert('中文' using binary);
select convert(convert('ㅂ' using gbk) using binary), convert('ㅂ' using binary);
26 changes: 24 additions & 2 deletions expression/builtin_string.go
Original file line number Diff line number Diff line change
Expand Up @@ -1118,6 +1118,13 @@ func (c *convertFunctionClass) getFunction(ctx sessionctx.Context, args []Expres
if err != nil {
return nil, errUnknownCharacterSet.GenWithStackByArgs(transcodingName)
}
// convert function should always derive to CoercibilityImplicit
bf.SetCoercibility(CoercibilityImplicit)
if bf.tp.Charset == charset.CharsetASCII {
bf.SetRepertoire(ASCII)
} else {
bf.SetRepertoire(UNICODE)
}
// Result will be a binary string if converts charset to BINARY.
// See https://dev.mysql.com/doc/refman/5.7/en/charset-binary-set.html
if types.IsBinaryStr(bf.tp) {
Expand Down Expand Up @@ -1159,9 +1166,24 @@ func (b *builtinConvertSig) evalString(row chunk.Row) (string, bool, error) {
if encoding == nil {
return "", true, errUnknownCharacterSet.GenWithStackByArgs(b.tp.Charset)
}
// if expr is binary string and convert meet error, we should return NULL.
if types.IsBinaryStr(b.args[0].GetType()) {
target, _, err := transform.String(encoding.NewEncoder(), expr)
if err != nil {
return "", true, err
}

target, _, err := transform.String(encoding.NewDecoder(), expr)
return target, err != nil, err
// we should convert target into utf8 internal.
exprInternal, _, _ := transform.String(encoding.NewDecoder(), target)
return exprInternal, false, nil
}
if types.IsBinaryStr(b.tp) {
enc := charset.NewEncoding(b.args[0].GetType().Charset)
expr, err = enc.EncodeString(expr)
return expr, false, err
}
enc := charset.NewEncoding(b.tp.Charset)
return string(enc.EncodeInternal(nil, []byte(expr))), false, nil
}

type substringFunctionClass struct {
Expand Down
31 changes: 27 additions & 4 deletions expression/builtin_string_vec.go
Original file line number Diff line number Diff line change
Expand Up @@ -692,18 +692,41 @@ func (b *builtinConvertSig) vecEvalString(input *chunk.Chunk, result *chunk.Colu
if encoding == nil {
return errUnknownCharacterSet.GenWithStackByArgs(b.tp.Charset)
}
encoder := encoding.NewEncoder()
decoder := encoding.NewDecoder()
isBinaryStr := types.IsBinaryStr(b.args[0].GetType())
isRetBinary := types.IsBinaryStr(b.tp)
enc := charset.NewEncoding(b.tp.Charset)
if isRetBinary {
enc = charset.NewEncoding(b.args[0].GetType().Charset)
}

result.ReserveString(n)
for i := 0; i < n; i++ {
if expr.IsNull(i) {
result.AppendNull()
continue
}
exprI := expr.GetString(i)
target, _, err := transform.String(encoding.NewDecoder(), exprI)
if err != nil {
return err
if isBinaryStr {
target, _, err := transform.String(encoder, exprI)
if err != nil {
return err
}
// we should convert target into utf8 internal.
exprInternal, _, _ := transform.String(decoder, target)
result.AppendString(exprInternal)
} else {
if isRetBinary {
str, err := enc.EncodeString(exprI)
if err != nil {
return err
}
result.AppendString(str)
continue
}
result.AppendString(string(enc.EncodeInternal(nil, []byte(exprI))))
}
result.AppendString(target)
}
return nil
}
Expand Down
30 changes: 30 additions & 0 deletions parser/charset/encoding.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,36 @@ func (e *Encoding) EncodeString(src string) (string, error) {
return string(bs), err
}

// EncodeInternal maps UTF-8 bytes onto the repertoire of the target charset
// while keeping the result in UTF-8: no real transcoding is performed. Every
// character the target charset cannot encode is replaced by '?', and every
// other character is copied through unchanged.
//
// The result is equivalent to
//
//	expr, _ := e.Encode(dest, src)
//	ret, _ := e.Decode(nil, expr)
//	return ret
//
// When the encoding is not enabled, src is returned as is. Otherwise the
// output is appended to dest (a fresh buffer is allocated when dest is nil)
// and the extended slice is returned.
func (e *Encoding) EncodeInternal(dest, src []byte) []byte {
	if !e.enabled() {
		return src
	}
	if dest == nil {
		dest = make([]byte, 0, len(src))
	}

	// Scratch space for one encoded character. Only the error result of
	// Transform is consulted; the encoded bytes themselves are discarded.
	var buf [4]byte
	transformer := e.enc.NewEncoder()
	for srcOffset := 0; srcOffset < len(src); {
		length := characterLengthUTF8(src[srcOffset:])
		// NOTE(review): characterLengthUTF8 presumably derives the width from
		// the lead byte alone, so a truncated trailing sequence could report
		// more bytes than remain. Clamp to avoid slicing past the end of src;
		// the truncated tail is then handed to the encoder like any other
		// character and replaced by '?' if it is rejected.
		if remaining := len(src) - srcOffset; length > remaining {
			length = remaining
		}
		char := src[srcOffset : srcOffset+length]
		if _, _, err := transformer.Transform(buf[:], char, true); err != nil {
			dest = append(dest, '?')
		} else {
			dest = append(dest, char...)
		}
		srcOffset += length
	}

	return dest
}

// Decode convert bytes from a specific charset to utf-8 charset.
func (e *Encoding) Decode(dest, src []byte) ([]byte, error) {
if !e.enabled() {
Expand Down

0 comments on commit ad85341

Please sign in to comment.