dataset.py (forked from illiterate/BertClassifier)
43 lines (37 loc) · 1.57 KB
# coding: utf-8
# @File: dataset.py
# @Author: HE D.H.
# @Email: [email protected]
# @Time: 2021/12/09 11:01:32
# @Description: PyTorch Dataset for the CNews text classification corpus
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset


class CNewsDataset(Dataset):
    def __init__(self, filename, tokenizer):
        # Initialize the dataset: the ten CNews category labels and their numeric ids
        self.labels = ['体育', '娱乐', '家居', '房产', '教育', '时尚', '时政', '游戏', '科技', '财经']
        self.labels_id = list(range(len(self.labels)))
        self.tokenizer = tokenizer
        self.input_ids = []
        self.token_type_ids = []
        self.attention_mask = []
        self.label_id = []
        self.load_data(filename)

    def load_data(self, filename):
        # Load the data: each line holds one "label\ttext" pair
        print('loading data from:', filename)
        with open(filename, 'r', encoding='utf-8') as rf:
            lines = rf.readlines()
            for line in tqdm(lines, ncols=100):
                label, text = line.strip().split('\t')
                label_id = self.labels.index(label)
                # Tokenize, padding/truncating every sample to a fixed length of 512 tokens
                token = self.tokenizer(text, add_special_tokens=True, padding='max_length', truncation=True, max_length=512)
                self.input_ids.append(np.array(token['input_ids']))
                self.token_type_ids.append(np.array(token['token_type_ids']))
                self.attention_mask.append(np.array(token['attention_mask']))
                self.label_id.append(label_id)

    def __getitem__(self, index):
        return self.input_ids[index], self.token_type_ids[index], self.attention_mask[index], self.label_id[index]

    def __len__(self):
        return len(self.input_ids)
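

# --- Usage sketch (not part of the original file) ---
# A minimal example of how this dataset might be wired to a DataLoader.
# It assumes the Hugging Face `transformers` package, the `bert-base-chinese`
# tokenizer, and a local file 'data/cnews.train.txt' with one "label\ttext"
# pair per line; the model name and file path are illustrative assumptions.
if __name__ == '__main__':
    from torch.utils.data import DataLoader
    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    train_dataset = CNewsDataset('data/cnews.train.txt', tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

    # Each batch yields (input_ids, token_type_ids, attention_mask, label_id),
    # matching the tuple returned by __getitem__; the default collate function
    # stacks the per-sample numpy arrays into tensors of shape (batch, 512).
    input_ids, token_type_ids, attention_mask, label_id = next(iter(train_loader))
    print(input_ids.shape, label_id.shape)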