forked from pingcap/tidb
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathencoding.go
149 lines (134 loc) · 4.27 KB
/
encoding.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
// Copyright 2021 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.
package charset
import "bytes"
// Compile-time assertions: the build fails if any of the encoding
// implementations below stops satisfying the Encoding interface.
var (
	_ Encoding = (*encodingUTF8)(nil)
	_ Encoding = (*encodingUTF8MB3Strict)(nil)
	_ Encoding = (*encodingASCII)(nil)
	_ Encoding = (*encodingLatin1)(nil)
	_ Encoding = (*encodingBin)(nil)
	_ Encoding = (*encodingGBK)(nil)
)
// IsSupportedEncoding reports whether charset is fully supported, i.e.
// whether it has an entry in encodingMap.
func IsSupportedEncoding(charset string) bool {
	if _, found := encodingMap[charset]; found {
		return true
	}
	return false
}
// FindEncodingTakeUTF8AsNoop resolves charset with FindEncoding, but swaps
// the result for EncodingBinImpl whenever the resolved encoding's type is
// EncodingTpUTF8. Treating utf-8 as a no-operation encoding this way is used
// to reduce the overhead of utf-8 validation in some cases.
func FindEncodingTakeUTF8AsNoop(charset string) Encoding {
	switch found := FindEncoding(charset); found.Tp() {
	case EncodingTpUTF8:
		return EncodingBinImpl
	default:
		return found
	}
}
// FindEncoding returns the Encoding registered for charset in encodingMap.
// An empty or unregistered charset falls back to EncodingBinImpl.
func FindEncoding(charset string) Encoding {
	if charset == "" {
		return EncodingBinImpl
	}
	enc, found := encodingMap[charset]
	if !found {
		return EncodingBinImpl
	}
	return enc
}
// encodingMap associates each supported charset key with its Encoding
// implementation. It backs both IsSupportedEncoding and FindEncoding.
var encodingMap = map[string]Encoding{
	// Both utf8 charset keys share the same implementation.
	CharsetUTF8:    EncodingUTF8Impl,
	CharsetUTF8MB4: EncodingUTF8Impl,

	CharsetASCII:  EncodingASCIIImpl,
	CharsetBin:    EncodingBinImpl,
	CharsetGBK:    EncodingGBKImpl,
	CharsetLatin1: EncodingLatin1Impl,
}
// Encoding provides encode/decode functions for a string with a specific charset.
type Encoding interface {
	// Name returns the name of the encoding.
	Name() string
	// Tp returns the type of the encoding.
	Tp() EncodingTp
	// Peek returns the next character.
	Peek(src []byte) []byte
	// MbLen returns the multi-byte length of the next character, or 0 if the
	// next character is a single byte.
	MbLen(s string) int
	// IsValid reports whether the utf-8 bytes in src can be converted to a
	// valid string in the current encoding.
	IsValid(src []byte) bool
	// Foreach iterates over the characters in the current encoding.
	Foreach(src []byte, op Op, fn func(from, to []byte, ok bool) bool)
	// Transform maps the bytes in src to dest according to op.
	//
	// The caller should initialize dest if it wants to avoid a memory
	// allocation on every call; otherwise a new buffer is always made.
	//
	// The returned slice may alias src; modify it at your own risk.
	Transform(dest *bytes.Buffer, src []byte, op Op) ([]byte, error)
	// ToUpper converts a string to uppercase.
	ToUpper(src string) string
	// ToLower converts a string to lowercase.
	ToLower(src string) string
}
// EncodingTp is the type tag of an encoding, as returned by Encoding.Tp.
type EncodingTp int8

// Known encoding types. Values are assigned sequentially, starting at zero
// with EncodingTpNone.
const (
	EncodingTpNone = EncodingTp(iota)
	EncodingTpUTF8
	EncodingTpUTF8MB3Strict
	EncodingTpASCII
	EncodingTpLatin1
	EncodingTpBin
	EncodingTpGBK
)
// Op is a set of flag bits that selects the behavior of Encoding.Transform
// and Encoding.Foreach.
type Op int16

// Individual flag bits. Each one occupies its own bit so that they can be
// combined with bitwise OR.
const (
	opFromUTF8 = Op(1) << iota
	opToUTF8
	opTruncateTrim
	opTruncateReplace
	opCollectFrom
	opCollectTo
	opSkipError
)

// Predefined flag combinations. Every *NoErr variant is its base combination
// with opSkipError added.
const (
	OpReplace       = opFromUTF8 | opTruncateReplace | opCollectFrom
	OpReplaceNoErr  = OpReplace | opSkipError
	OpEncode        = opFromUTF8 | opTruncateTrim | opCollectTo
	OpEncodeNoErr   = OpEncode | opSkipError
	OpEncodeReplace = opFromUTF8 | opTruncateReplace | opCollectTo
	OpDecode        = opToUTF8 | opTruncateTrim | opCollectTo
	OpDecodeNoErr   = OpDecode | opSkipError
	OpDecodeReplace = opToUTF8 | opTruncateReplace | opCollectTo
)
// CountValidBytes counts the leading bytes of src that can be encoded to the
// encoding e.
//
// It walks src with e.Foreach using opFromUTF8 and sums the length of every
// character reported as ok. The callback returns false at the first character
// that is not ok, which is presumed to end the iteration (Foreach's contract
// is defined by the implementation).
func CountValidBytes(e Encoding, src []byte) int {
	var validLen int
	e.Foreach(src, opFromUTF8, func(from, _ []byte, ok bool) bool {
		if !ok {
			return false
		}
		validLen += len(from)
		return true
	})
	return validLen
}
// CountValidBytesDecode counts the leading bytes of src that can be decoded
// to utf-8 from the encoding e.
//
// It walks src with e.Foreach using opToUTF8 and sums the length of every
// character reported as ok. The callback returns false at the first character
// that is not ok, which is presumed to end the iteration (Foreach's contract
// is defined by the implementation).
func CountValidBytesDecode(e Encoding, src []byte) int {
	var validLen int
	e.Foreach(src, opToUTF8, func(from, _ []byte, ok bool) bool {
		if !ok {
			return false
		}
		validLen += len(from)
		return true
	})
	return validLen
}