parser/charset/encoding.go

// Copyright 2021 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package charset

import "bytes"

// Make sure all of them implement Encoding interface.
var (
	_ Encoding = &encodingUTF8{}
	_ Encoding = &encodingUTF8MB3Strict{}
	_ Encoding = &encodingASCII{}
	_ Encoding = &encodingLatin1{}
	_ Encoding = &encodingBin{}
	_ Encoding = &encodingGBK{}
)

// IsSupportedEncoding checks if the charset is fully supported.
func IsSupportedEncoding(charset string) bool {
	_, ok := encodingMap[charset]
	return ok
}

// FindEncodingTakeUTF8AsNoop finds the encoding according to the charset
// except that utf-8 is treated as no-operation encoding. This is used to
// reduce the overhead of utf-8 validation in some cases.
func FindEncodingTakeUTF8AsNoop(charset string) Encoding {
	enc := FindEncoding(charset)
	if enc.Tp() == EncodingTpUTF8 {
		return EncodingBinImpl
	}
	return enc
}

// FindEncoding finds the encoding according to charset.
func FindEncoding(charset string) Encoding {
	if len(charset) == 0 {
		return EncodingBinImpl
	}
	if e, exist := encodingMap[charset]; exist {
		return e
	}
	return EncodingBinImpl
}

var encodingMap = map[string]Encoding{
	CharsetUTF8MB4: EncodingUTF8Impl,
	CharsetUTF8:    EncodingUTF8Impl,
	CharsetGBK:     EncodingGBKImpl,
	CharsetLatin1:  EncodingLatin1Impl,
	CharsetBin:     EncodingBinImpl,
	CharsetASCII:   EncodingASCIIImpl,
}

// Encoding provide encode/decode functions for a string with a specific charset.
type Encoding interface {
	// Name is the name of the encoding.
	Name() string
	// Tp is the type of the encoding.
	Tp() EncodingTp
	// Peek returns the next char.
	Peek(src []byte) []byte
	// MbLen returns multiple byte length, if the next character is single byte, return 0.
	MbLen(string) int
	// IsValid checks whether the utf-8 bytes can be convert to valid string in current encoding.
	IsValid(src []byte) bool
	// Foreach iterates the characters in in current encoding.
	Foreach(src []byte, op Op, fn func(from, to []byte, ok bool) bool)
	// Transform map the bytes in src to dest according to Op.
	// **the caller should initialize the dest if it wants to avoid memory alloc every time, or else it will always make a new one**
	// **the returned array may be the alias of `src`, edit the returned array on your own risk**
	Transform(dest *bytes.Buffer, src []byte, op Op) ([]byte, error)
	// ToUpper change a string to uppercase.
	ToUpper(src string) string
	// ToLower change a string to lowercase.
	ToLower(src string) string
}

type EncodingTp int8

const (
	EncodingTpNone EncodingTp = iota
	EncodingTpUTF8
	EncodingTpUTF8MB3Strict
	EncodingTpASCII
	EncodingTpLatin1
	EncodingTpBin
	EncodingTpGBK
)

// Op is used by Encoding.Transform.
type Op int16

const (
	opFromUTF8 Op = 1 << iota
	opToUTF8
	opTruncateTrim
	opTruncateReplace
	opCollectFrom
	opCollectTo
	opSkipError
)

const (
	OpReplaceNoErr  = opFromUTF8 | opTruncateReplace | opCollectFrom | opSkipError
	OpReplace       = opFromUTF8 | opTruncateReplace | opCollectFrom
	OpEncode        = opFromUTF8 | opTruncateTrim | opCollectTo
	OpEncodeNoErr   = OpEncode | opSkipError
	OpEncodeReplace = opFromUTF8 | opTruncateReplace | opCollectTo
	OpDecode        = opToUTF8 | opTruncateTrim | opCollectTo
	OpDecodeNoErr   = OpDecode | opSkipError
	OpDecodeReplace = opToUTF8 | opTruncateReplace | opCollectTo
)

// CountValidBytes counts the first valid bytes in src that
// can be encoded to the current encoding.
func CountValidBytes(e Encoding, src []byte) int {
	nSrc := 0
	e.Foreach(src, opFromUTF8, func(from, to []byte, ok bool) bool {
		if ok {
			nSrc += len(from)
		}
		return ok
	})
	return nSrc
}

// CountValidBytesDecode counts the first valid bytes in src that
// can be decoded to utf-8.
func CountValidBytesDecode(e Encoding, src []byte) int {
	nSrc := 0
	e.Foreach(src, opToUTF8, func(from, to []byte, ok bool) bool {
		if ok {
			nSrc += len(from)
		}
		return ok
	})
	return nSrc
}