Skip to content

Commit

Permalink
lightning: support latin1 source file encoding (pingcap#44435)
Browse files Browse the repository at this point in the history
  • Loading branch information
lance6716 authored Jun 6, 2023
1 parent 03aff5c commit dd54ff2
Show file tree
Hide file tree
Showing 9 changed files with 51 additions and 0 deletions.
1 change: 1 addition & 0 deletions br/pkg/lightning/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,7 @@ type MydumperRuntime struct {
// - utf8mb4
// - GB18030
// - GBK: an extension of the GB2312 character set and is also known as Code Page 936.
// - latin1: MySQL's latin1, which is decoded as Windows-1252 (cp1252) rather than ISO 8859-1.
// - binary: no attempt to convert the encoding.
// Leaving DataCharacterSet empty makes it use `binary` by default.
DataCharacterSet string `toml:"data-character-set" json:"data-character-set"`
Expand Down
14 changes: 14 additions & 0 deletions br/pkg/lightning/mydump/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"github.com/pingcap/tidb/br/pkg/storage"
"github.com/spkg/bom"
"go.uber.org/zap"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/simplifiedchinese"
)

Expand Down Expand Up @@ -63,6 +64,19 @@ func decodeCharacterSet(data []byte, characterSet string) ([]byte, error) {
return nil, errInvalidSchemaEncoding
}
data = decoded
case "latin1":
// use Windows1252 (not ISO 8859-1) to decode Latin1
// https://dev.mysql.com/doc/refman/8.0/en/charset-we-sets.html
decoded, err := charmap.Windows1252.NewDecoder().Bytes(data)
if err != nil {
return nil, errors.Trace(err)
}
// > Each byte that cannot be transcoded will be represented in the
// > output by the UTF-8 encoding of '\uFFFD'
if bytes.ContainsRune(decoded, '\ufffd') {
return nil, errInvalidSchemaEncoding
}
data = decoded
default:
return nil, errors.Errorf("Unsupported encoding %s", characterSet)
}
Expand Down
8 changes: 8 additions & 0 deletions br/tests/lightning_character_sets/latin1-only-schema.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
[lightning]
table-concurrency = 1

[tikv-importer]
backend = "local"

[mydumper]
character-set = "latin1"
9 changes: 9 additions & 0 deletions br/tests/lightning_character_sets/latin1.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[lightning]
table-concurrency = 1

[tikv-importer]
backend = "local"

[mydumper]
character-set = "latin1"
data-character-set = "latin1"
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
create database charsets;
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
create table `latin1` (`ÏÐ` int primary key comment 'ÏÐ', `data` varchar(20));
3 changes: 3 additions & 0 deletions br/tests/lightning_character_sets/latin1/charsets.latin1.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"ÏÐ","data"
1,"‘’“”"
2,"¡¢£¤"
13 changes: 13 additions & 0 deletions br/tests/lightning_character_sets/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,16 @@ run_sql 'TRUNCATE TABLE charsets.greek;'
run_lightning --config "tests/$TEST_NAME/greek.toml" -d "tests/$TEST_NAME/greek" --backend tidb
run_sql "SELECT count(*) FROM charsets.greek WHERE c = 'α';"
check_contains 'count(*): 1'

# latin1
# Importing the latin1-encoded fixtures with the wrong encoding settings is
# expected to yield a wrong column name and wrong data, so each misconfigured
# run below is checked for a specific error message in Lightning's output.

# binary.toml (presumably no encoding conversion -- confirm against the file):
# the CSV header is expected not to match the table's columns.
run_lightning --config "tests/$TEST_NAME/binary.toml" -d "tests/$TEST_NAME/latin1" 2>&1 | grep -q "unknown columns in header"
# NOTE(review): presumably the run above already created charsets.latin1 with a
# mis-decoded column name; drop it so the following runs recreate the table.
run_sql 'DROP TABLE charsets.latin1;'
# Treating the latin1 files as utf8mb4 is expected to be rejected as an invalid
# schema encoding.
run_lightning --config "tests/$TEST_NAME/utf8mb4.toml" -d "tests/$TEST_NAME/latin1" 2>&1 | grep -q "invalid schema encoding"
# latin1-only-schema.toml sets character-set but not data-character-set, so the
# CSV header is still expected not to match the schema's column name.
run_lightning --config "tests/$TEST_NAME/latin1-only-schema.toml" -d "tests/$TEST_NAME/latin1" 2>&1 | grep -q "unknown columns in header"
# latin1.toml sets both character-set and data-character-set to latin1; this
# run is expected to succeed.
run_lightning --config "tests/$TEST_NAME/latin1.toml" -d "tests/$TEST_NAME/latin1"
# Verify the column name and both rows were decoded correctly. The curly quotes
# in row 1 exist in Windows-1252 but not in ISO 8859-1.
run_sql 'SELECT * FROM charsets.latin1'
check_contains 'ÏÐ: 1'
check_contains 'data: ‘’“”'
check_contains 'ÏÐ: 2'
check_contains 'data: ¡¢£¤'
1 change: 1 addition & 0 deletions br/tidb-lightning.toml
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ no-schema = false
# - utf8mb4: Indicates that the source data file uses UTF-8 encoding.
# - GB18030: Indicates that the source data file uses the GB-18030 encoding.
# - GBK: The source data file uses GBK encoding (GBK encoding is an extension of the GB-2312 character set, also known as Code Page 936).
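# - latin1: Indicates that the source data file uses MySQL's latin1 encoding, which is decoded as Windows-1252 (cp1252) rather than ISO 8859-1.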
# - binary: Indicates that Lightning does not convert the encoding (by default).
# If left blank, the default value "binary" is used, that is to say, Lightning does not convert the encoding.
# Note that Lightning does not detect the character set of the source data file; it only converts the source file and imports the data based on this configuration.
Expand Down

0 comments on commit dd54ff2

Please sign in to comment.