forked from ShannonAI/ChineseBert
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchinese_bert_dataset.py
86 lines (77 loc) · 3.09 KB
/
chinese_bert_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@file : chinese_bert_dataset.py
@author: zijun
@contact : [email protected]
@date : 2021/6/29 17:35
@version: 1.0
@desc : Base Class for dataset
"""
import json
import os
from typing import List
import tokenizers
from pypinyin import pinyin, Style
from tokenizers import BertWordPieceTokenizer
from torch.utils.data import Dataset
class ChineseBertDataset(Dataset):
    """Base dataset for ChineseBERT.

    Loads the tokenizer and the pinyin lookup tables from a pretrained model
    directory, and provides :meth:`convert_sentence_to_pinyin_ids` to build
    the per-token pinyin-id features ChineseBERT consumes.

    Subclasses must override :meth:`get_lines` to supply the raw data lines.
    """

    def __init__(self, data_path, chinese_bert_path, max_length: int = 512):
        """
        Dataset Base class

        Args:
            data_path: dataset file path
            chinese_bert_path: pretrain model path (must contain ``vocab.txt``
                and a ``config`` directory with the pinyin JSON maps)
            max_length: max sentence length
        """
        super().__init__()
        self.vocab_file = os.path.join(chinese_bert_path, 'vocab.txt')
        self.config_path = os.path.join(chinese_bert_path, 'config')
        self.data_path = data_path
        self.max_length = max_length
        self.tokenizer = BertWordPieceTokenizer(self.vocab_file)
        # pinyin-letter/tone -> index map (contains the "char2idx" sub-dict)
        self.pinyin_dict = self._load_config_json('pinyin_map.json')
        # char id -> pinyin ids map
        self.id2pinyin = self._load_config_json('id2pinyin.json')
        # full pinyin string (TONE3 style, e.g. "zhong1") -> 8-slot id list
        self.pinyin2tensor = self._load_config_json('pinyin2tensor.json')
        # NOTE(fix): get_lines was decorated @property but invoked as a call
        # here; accessing the base property raised NotImplementedError before
        # the call and would have tried to call a subclass's returned list.
        # It is now a plain overridable method, consistent with this call.
        self.lines = self.get_lines()

    def _load_config_json(self, name):
        """Load and return one JSON map from the model's config directory."""
        with open(os.path.join(self.config_path, name), encoding='utf8') as fin:
            return json.load(fin)

    def get_lines(self):
        """Read data lines. Subclasses must override this method."""
        raise NotImplementedError

    def convert_sentence_to_pinyin_ids(self, sentence: str, tokenizer_output: tokenizers.Encoding) -> List[List[int]]:
        """Build one 8-slot pinyin-id list per token of *tokenizer_output*.

        Non-Chinese characters and multi-character tokens get all-zero slots.
        """
        # Pinyin (TONE3 style, heteronyms on) per character; the errors
        # callback expands every non-Chinese char to a "not chinese" marker so
        # list indices stay aligned with character positions in the sentence.
        pinyin_list = pinyin(sentence, style=Style.TONE3, heteronym=True,
                             errors=lambda x: [['not chinese'] for _ in x])

        # Map character position -> 8-slot pinyin ids (Chinese chars only).
        pinyin_locs = {}
        for index, item in enumerate(pinyin_list):
            pinyin_string = item[0]
            if pinyin_string == "not chinese":
                # not a Chinese character, skip
                continue
            if pinyin_string in self.pinyin2tensor:
                # fast path: precomputed tensor for this full pinyin string
                pinyin_locs[index] = self.pinyin2tensor[pinyin_string]
            else:
                # fall back to per-letter encoding into the 8 slots
                ids = [0] * 8
                for i, p in enumerate(pinyin_string):
                    if p not in self.pinyin_dict["char2idx"]:
                        # unknown letter -> whole string maps to zeros
                        ids = [0] * 8
                        break
                    ids[i] = self.pinyin_dict["char2idx"][p]
                pinyin_locs[index] = ids

        # Align pinyin ids with tokens via character offsets: only tokens that
        # span exactly one character (i.e. single Chinese chars) get pinyin.
        pinyin_ids = []
        for token, offset in zip(tokenizer_output.tokens, tokenizer_output.offsets):
            if offset[1] - offset[0] != 1:
                pinyin_ids.append([0] * 8)
                continue
            pinyin_ids.append(pinyin_locs.get(offset[0], [0] * 8))

        return pinyin_ids