Skip to content

Commit

Permalink
feat: support qwen token
Browse files Browse the repository at this point in the history
Signed-off-by: Chen Bojun <[email protected]>
  • Loading branch information
0xd219b committed May 14, 2024
1 parent 475cdcd commit d77c5fb
Show file tree
Hide file tree
Showing 3 changed files with 151,680 additions and 0 deletions.
25 changes: 25 additions & 0 deletions encoding.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,16 @@ const FIM_SUFFIX string = "<|fim_suffix|>"
const ENDOFPROMPT string = "<|endofprompt|>"

// Encoding names understood by initEncoding. Each value is the string
// identifier of one BPE encoding and is also used as the Encoding's Name.
const (
	// MODEL_QWEN_BASE names the encoding built by qwen_base.
	MODEL_QWEN_BASE string = "qwen_base"
	// MODEL_CL100K_BASE names the encoding built by cl100k_base.
	MODEL_CL100K_BASE string = "cl100k_base"
	// MODEL_P50K_BASE is the identifier of the p50k_base encoding.
	MODEL_P50K_BASE string = "p50k_base"
	// MODEL_P50K_EDIT is the identifier of the p50k_edit encoding.
	MODEL_P50K_EDIT string = "p50k_edit"
	// MODEL_R50K_BASE is the identifier of the r50k_base encoding.
	MODEL_R50K_BASE string = "r50k_base"
)

var MODEL_TO_ENCODING = map[string]string{
// qwen
"qwen": MODEL_QWEN_BASE,
// chat
"gpt-4": MODEL_CL100K_BASE,
"gpt-3.5-turbo": MODEL_CL100K_BASE,
Expand Down Expand Up @@ -98,6 +101,8 @@ func getEncoding(encodingName string) (*Encoding, error) {

func initEncoding(encodingName string) (*Encoding, error) {
switch encodingName {
case MODEL_QWEN_BASE:
return qwen_base()
case MODEL_CL100K_BASE:
return cl100k_base()
case MODEL_P50K_BASE:
Expand All @@ -111,6 +116,26 @@ func initEncoding(encodingName string) (*Encoding, error) {
}
}

// qwen_base builds the Encoding registered under MODEL_QWEN_BASE.
//
// The mergeable BPE ranks are loaded through bpeLoader from
// "tiktoken/qwen.tiktoken". If loading fails, the loader's error is returned
// unchanged together with a nil Encoding.
func qwen_base() (*Encoding, error) {
	// NOTE(review): this is a relative path; how it is resolved depends on
	// bpeLoader (presumably against the process working directory) — confirm
	// the file is reachable from where the library is used.
	ranks, err := bpeLoader.LoadTiktokenBpe("tiktoken/qwen.tiktoken")
	if err != nil {
		return nil, err
	}
	// NOTE(review): the upstream Qwen tokenizer is believed to place its
	// special tokens after the regular vocabulary (starting around 151643).
	// If qwen.tiktoken contains more than 100257 ranks, the IDs below collide
	// with regular token IDs — verify against the vocabulary file.
	special_tokens := map[string]int{
		ENDOFTEXT:   100257,
		FIM_PREFIX:  100258,
		FIM_MIDDLE:  100259,
		FIM_SUFFIX:  100260,
		ENDOFPROMPT: 100276,
	}
	return &Encoding{
		Name: MODEL_QWEN_BASE,
		// This is a raw string literal, so backslashes are passed to the regex
		// engine verbatim. Each regex escape (\r, \n, \s, \S, \p{L}, \p{N})
		// must therefore use a single backslash; a doubled backslash would be
		// read as a literal '\' followed by a plain letter.
		PatStr:         `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
		MergeableRanks: ranks,
		SpecialTokens:  special_tokens,
	}, nil
}

func cl100k_base() (*Encoding, error) {
ranks, err := bpeLoader.LoadTiktokenBpe("https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken")
if err != nil {
Expand Down
Loading

0 comments on commit d77c5fb

Please sign in to comment.