# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

"""Tasks data utility."""

import re
import numpy as np


def clean_text(text):
    """Remove new lines and multiple spaces and adjust end of sentence dot."""

    text = text.replace("\n", " ")
    text = re.sub(r'\s+', ' ', text)
    # str.replace skips overlapping matches, so repeat a few times to also
    # collapse runs of floating dots such as ' . . '.
    for _ in range(3):
        text = text.replace(' . ', '. ')

    return text
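

# A quick illustration of clean_text (the example string is invented here,
# not part of the original file):
#
#   clean_text("Hello .\n\n world .")  ->  "Hello. world ."
#
# Newlines collapse to single spaces and the floating dot re-attaches to its
# sentence; the trailing " ." is untouched because the pattern needs a space
# on both sides.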


def build_sample(ids, types, paddings, label, unique_id):
    """Convert to numpy and return a sample consumed by the batch producer."""

    ids_np = np.array(ids, dtype=np.int64)
    types_np = np.array(types, dtype=np.int64)
    paddings_np = np.array(paddings, dtype=np.int64)
    sample = {'text': ids_np,
              'types': types_np,
              'padding_mask': paddings_np,
              'label': int(label),
              'uid': int(unique_id)}

    return sample
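

# Hedged sketch of what a built sample looks like (all values invented here
# for illustration):
#
#   build_sample([101, 7592, 102], [0, 0, 0], [1, 1, 1], label=1, unique_id=7)
#   -> {'text': array([ 101, 7592,  102]),    # int64
#       'types': array([0, 0, 0]),            # int64
#       'padding_mask': array([1, 1, 1]),     # int64
#       'label': 1, 'uid': 7}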


def build_tokens_types_paddings_from_text(text_a, text_b,
                                          tokenizer, max_seq_length):
    """Build token types and paddings, trim if needed, and pad if needed."""

    text_a_ids = tokenizer.tokenize(text_a)
    text_b_ids = None
    if text_b is not None:
        text_b_ids = tokenizer.tokenize(text_b)

    return build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids,
                                                max_seq_length, tokenizer.cls,
                                                tokenizer.sep, tokenizer.pad)


def build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, max_seq_length,
                                         cls_id, sep_id, pad_id):
    """Build token types and paddings, trim if needed, and pad if needed."""

    ids = []
    types = []
    paddings = []

    # [CLS].
    ids.append(cls_id)
    types.append(0)
    paddings.append(1)

    # A.
    len_text_a = len(text_a_ids)
    ids.extend(text_a_ids)
    types.extend([0] * len_text_a)
    paddings.extend([1] * len_text_a)

    # [SEP].
    ids.append(sep_id)
    types.append(0)
    paddings.append(1)

    # B.
    if text_b_ids is not None:
        len_text_b = len(text_b_ids)
        ids.extend(text_b_ids)
        types.extend([1] * len_text_b)
        paddings.extend([1] * len_text_b)

    # Cap the size, leaving one slot free for the closing [SEP] below.
    trimmed = False
    if len(ids) >= max_seq_length:
        max_seq_length_m1 = max_seq_length - 1
        ids = ids[0:max_seq_length_m1]
        types = types[0:max_seq_length_m1]
        paddings = paddings[0:max_seq_length_m1]
        trimmed = True

    # [SEP]: close segment B, or restore the [SEP] that trimming may have cut.
    if (text_b_ids is not None) or trimmed:
        ids.append(sep_id)
        if text_b_ids is None:
            types.append(0)
        else:
            types.append(1)
        paddings.append(1)

    # Padding. Padded positions are flagged with 0 in `paddings`; `types`
    # reuses pad_id for them.
    padding_length = max_seq_length - len(ids)
    if padding_length > 0:
        ids.extend([pad_id] * padding_length)
        types.extend([pad_id] * padding_length)
        paddings.extend([0] * padding_length)

    return ids, types, paddings
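

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original file). `_StubTokenizer` is a
# hypothetical stand-in for the real Megatron tokenizer; the builders above
# only need .tokenize() plus .cls/.sep/.pad attributes, and the id values
# below are invented for illustration.
# ---------------------------------------------------------------------------
if __name__ == '__main__':

    class _StubTokenizer:
        """Maps every whitespace-separated token to a fake id."""
        cls = 101
        sep = 102
        pad = 0

        def tokenize(self, text):
            return [1000 + i for i, _ in enumerate(text.split())]

    tokenizer = _StubTokenizer()
    text_a = clean_text("First sentence .\nSplit across lines .")
    text_b = clean_text("Second segment.")
    ids, types, paddings = build_tokens_types_paddings_from_text(
        text_a, text_b, tokenizer, max_seq_length=16)
    sample = build_sample(ids, types, paddings, label=1, unique_id=0)

    # All three arrays come back padded (or trimmed) to max_seq_length.
    assert sample['text'].shape == (16,)
    assert sample['types'].shape == (16,)
    assert sample['padding_mask'].shape == (16,)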