Merge pull request namisan#205 from namisan/xiaodl/tf-upgrade

Xiaodl/tf upgrade
namisan authored Feb 10, 2021
2 parents 89092de + 956ebb7 commit d0188c9
Showing 38 changed files with 78 additions and 116 deletions.
110 changes: 36 additions & 74 deletions prepro_std.py
@@ -9,9 +9,9 @@
from data_utils import load_data
from data_utils.task_def import TaskType, DataFormat
from data_utils.log_wrapper import create_logger
from experiments.exp_def import TaskDefs, EncoderModelType
from experiments.exp_def import TaskDefs
from experiments.squad import squad_utils
from pretrained_models import *
from transformers import AutoTokenizer


DEBUG_MODE = False
@@ -25,69 +25,49 @@
to_disk=True,
log_file='mt_dnn_data_proc_{}.log'.format(MAX_SEQ_LEN))

def feature_extractor(tokenizer, text_a, text_b=None, max_length=512, model_type=None, enable_padding=False, pad_on_left=False,
pad_token=0,
pad_token_segment_id=0,
mask_padding_with_zero=False): # set mask_padding_with_zero default value as False to keep consistent with original setting
inputs = tokenizer.encode_plus(
def feature_extractor(tokenizer, text_a, text_b=None, max_length=512, do_padding=False):
inputs = tokenizer(
text_a,
text_b,
add_special_tokens=True,
max_length=max_length,
truncation=True,
padding=do_padding
)
input_ids = inputs["input_ids"]
token_type_ids = inputs["token_type_ids"] if "token_type_ids" in inputs else [0] * len(input_ids)

# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

# Zero-pad up to the sequence length.
padding_length = max_length - len(input_ids)

if enable_padding:
if pad_on_left:
input_ids = ([pad_token] * padding_length) + input_ids
attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
else:
input_ids = input_ids + ([pad_token] * padding_length)
attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

attention_mask = inputs["attention_mask"]
if do_padding:
assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

if model_type.lower() in ['bert', 'roberta']:
attention_mask = None

if model_type.lower() not in ['distilbert','bert', 'xlnet'] :
token_type_ids = [0] * len(token_type_ids)

return input_ids,attention_mask, token_type_ids # input_ids, input_mask, segment_id
return input_ids, attention_mask, token_type_ids

def build_data(data, dump_path, tokenizer, data_format=DataFormat.PremiseOnly,
max_seq_len=MAX_SEQ_LEN, encoderModelType=EncoderModelType.BERT, lab_dict=None):
max_seq_len=MAX_SEQ_LEN, lab_dict=None, do_padding=False, truncation=True):
def build_data_premise_only(
data, dump_path, max_seq_len=MAX_SEQ_LEN, tokenizer=None, encoderModelType=EncoderModelType.BERT):
data, dump_path, max_seq_len=MAX_SEQ_LEN, tokenizer=None):
"""Build data of single sentence tasks
"""
with open(dump_path, 'w', encoding='utf-8') as writer:
for idx, sample in enumerate(data):
ids = sample['uid']
premise = sample['premise']
label = sample['label']
input_ids, input_mask, type_ids = feature_extractor(tokenizer, premise, max_length=max_seq_len, model_type=encoderModelType.name)
input_ids, input_mask, type_ids = feature_extractor(tokenizer, premise, max_length=max_seq_len)
features = {
'uid': ids,
'label': label,
'token_id': input_ids,
'type_id': type_ids}
'type_id': type_ids,
'attention_mask': input_mask}
writer.write('{}\n'.format(json.dumps(features)))

def build_data_premise_and_one_hypo(
data, dump_path, max_seq_len=MAX_SEQ_LEN, tokenizer=None, encoderModelType=EncoderModelType.BERT):
data, dump_path, max_seq_len=MAX_SEQ_LEN, tokenizer=None):
"""Build data of sentence pair tasks
"""
with open(dump_path, 'w', encoding='utf-8') as writer:
Expand All @@ -96,17 +76,17 @@ def build_data_premise_and_one_hypo(
premise = sample['premise']
hypothesis = sample['hypothesis']
label = sample['label']
input_ids, input_mask, type_ids = feature_extractor(tokenizer, premise, text_b=hypothesis, max_length=max_seq_len,
model_type=encoderModelType.name)
input_ids, input_mask, type_ids = feature_extractor(tokenizer, premise, text_b=hypothesis, max_length=max_seq_len)
features = {
'uid': ids,
'label': label,
'token_id': input_ids,
'type_id': type_ids}
'type_id': type_ids,
'attention_mask': input_mask}
writer.write('{}\n'.format(json.dumps(features)))

def build_data_premise_and_multi_hypo(
data, dump_path, max_seq_len=MAX_SEQ_LEN, tokenizer=None, encoderModelType=EncoderModelType.BERT):
data, dump_path, max_seq_len=MAX_SEQ_LEN, tokenizer=None):
"""Build QNLI as a pair-wise ranking task
"""
with open(dump_path, 'w', encoding='utf-8') as writer:
@@ -117,22 +97,24 @@ def build_data_premise_and_multi_hypo(
label = sample['label']
input_ids_list = []
type_ids_list = []
attention_mask_list = []
for hypothesis in hypothesis_list:
input_ids, mask, type_ids = feature_extractor(tokenizer,
premise, hypothesis, max_length=max_seq_len,
model_type=encoderModelType.name)
input_ids, input_mask, type_ids = feature_extractor(tokenizer,
premise, hypothesis, max_length=max_seq_len)
input_ids_list.append(input_ids)
type_ids_list.append(type_ids)
attention_mask_list.append(input_mask)
features = {
'uid': ids,
'label': label,
'token_id': input_ids_list,
'type_id': type_ids_list,
'ruid': sample['ruid'],
'olabel': sample['olabel']}
'olabel': sample['olabel'],
'attention_mask': attention_mask_list}
writer.write('{}\n'.format(json.dumps(features)))

def build_data_sequence(data, dump_path, max_seq_len=MAX_SEQ_LEN, tokenizer=None, encoderModelType=EncoderModelType.BERT, label_mapper=None):
def build_data_sequence(data, dump_path, max_seq_len=MAX_SEQ_LEN, tokenizer=None, label_mapper=None):
with open(dump_path, 'w', encoding='utf-8') as writer:
for idx, sample in enumerate(data):
ids = sample['uid']
@@ -212,18 +194,17 @@ def build_data_mrc(data, dump_path, max_seq_len=MRC_MAX_SEQ_LEN, tokenizer=None,
data,
dump_path,
max_seq_len,
tokenizer,
encoderModelType)
tokenizer)
elif data_format == DataFormat.PremiseAndOneHypothesis:
build_data_premise_and_one_hypo(
data, dump_path, max_seq_len, tokenizer, encoderModelType)
data, dump_path, max_seq_len, tokenizer)
elif data_format == DataFormat.PremiseAndMultiHypothesis:
build_data_premise_and_multi_hypo(
data, dump_path, max_seq_len, tokenizer, encoderModelType)
data, dump_path, max_seq_len, tokenizer)
elif data_format == DataFormat.Seqence:
build_data_sequence(data, dump_path, max_seq_len, tokenizer, encoderModelType, lab_dict)
build_data_sequence(data, dump_path, max_seq_len, tokenizer, lab_dict)
elif data_format == DataFormat.MRC:
build_data_mrc(data, dump_path, max_seq_len, tokenizer, encoderModelType)
build_data_mrc(data, dump_path, max_seq_len, tokenizer)
else:
raise ValueError(data_format)

@@ -232,8 +213,9 @@ def parse_args():
parser = argparse.ArgumentParser(
description='Preprocessing GLUE/SNLI/SciTail dataset.')
parser.add_argument('--model', type=str, default='bert-base-uncased',
help='support all BERT, XLNET and ROBERTA family supported by HuggingFace Transformers')
help='support all BERT and ROBERTA family supported by HuggingFace Transformers')
parser.add_argument('--do_lower_case', action='store_true')
parser.add_argument('--do_padding', action='store_true')
parser.add_argument('--root_dir', type=str, default='data/canonical_data')
parser.add_argument('--task_def', type=str, default="experiments/glue/glue_task_def.yml")

@@ -243,33 +225,14 @@ def parse_args():

def main(args):
# hyper param
do_lower_case = args.do_lower_case
root = args.root_dir
assert os.path.exists(root)

literal_model_type = args.model.split('-')[0].upper()
encoder_model = EncoderModelType[literal_model_type]
literal_model_type = literal_model_type.lower()
mt_dnn_suffix = literal_model_type
if 'base' in args.model:
mt_dnn_suffix += "_base"
elif 'large' in args.model:
mt_dnn_suffix += "_large"

config_class, model_class, tokenizer_class = MODEL_CLASSES[literal_model_type]
tokenizer = tokenizer_class.from_pretrained(args.model, do_lower_case=do_lower_case)

if 'uncased' in args.model:
mt_dnn_suffix = '{}_uncased'.format(mt_dnn_suffix)
else:
mt_dnn_suffix = '{}_cased'.format(mt_dnn_suffix)

if do_lower_case:
mt_dnn_suffix = '{}_lower'.format(mt_dnn_suffix)
tokenizer = AutoTokenizer.from_pretrained(args.model)

mt_dnn_root = os.path.join(root, mt_dnn_suffix)
mt_dnn_root = os.path.join(root, args.model)
if not os.path.isdir(mt_dnn_root):
os.mkdir(mt_dnn_root)
os.makedirs(mt_dnn_root)

task_defs = TaskDefs(args.task_def)

@@ -289,7 +252,6 @@ def main(args):
dump_path,
tokenizer,
task_def.data_type,
encoderModelType=encoder_model,
lab_dict=task_def.label_vocab)


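Taken together, the changes above mean prepro_std.py no longer needs MODEL_CLASSES or EncoderModelType: any checkpoint name is resolved through AutoTokenizer, one tokenizer call yields input_ids, attention_mask, and token_type_ids, and the output directory is simply named after args.model. The sketch below mirrors that new call pattern under transformers 4.x; it is an illustration rather than the committed function, and the checkpoint name, example sentences, and the padding="max_length" choice are assumptions made for the example.

# Minimal sketch of the new tokenization path (assumes transformers >= 4.x).
# "bert-base-uncased" is only an example checkpoint; any name that
# AutoTokenizer can resolve works the same way.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def extract_features(text_a, text_b=None, max_length=512, do_padding=False):
    # A single tokenizer call replaces encode_plus, the manual zero-padding,
    # and the per-model (BERT/RoBERTa/XLNet) branching of the old code.
    inputs = tokenizer(
        text_a,
        text_b,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding="max_length" if do_padding else False,
    )
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    # Models without segment embeddings (e.g. RoBERTa) return no token_type_ids.
    token_type_ids = inputs.get("token_type_ids", [0] * len(input_ids))
    return input_ids, attention_mask, token_type_ids

ids, mask, segments = extract_features("the new rights are nice enough .",
                                        "everyone really likes them .")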
2 changes: 1 addition & 1 deletion requirements.txt
@@ -15,4 +15,4 @@ future
apex
fairseq==0.8.0
seqeval==0.0.12
transformers==2.3.0
transformers==4.2.2
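Because the pin jumps two major versions, an environment that still has transformers 2.3.0 installed will fail on the new tokenizer call. A small guard like the one below, a suggestion rather than part of this commit, surfaces the mismatch before preprocessing starts; it assumes the packaging module (already a transformers dependency) is available.

# Suggested environment check before re-running prepro_std.py (not in the PR).
import transformers
from packaging import version

if version.parse(transformers.__version__) < version.parse("4.2.2"):
    raise RuntimeError(
        "Found transformers %s; prepro_std.py now expects the 4.x tokenizer API"
        % transformers.__version__)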
4 changes: 2 additions & 2 deletions sample_data/output/cola_dev.json
@@ -1,2 +1,2 @@
{"uid": "0", "label": 1, "token_id": [101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
{"uid": "1", "label": 1, "token_id": [101, 1996, 15871, 2081, 1996, 8164, 7683, 2058, 1996, 4139, 3240, 1012, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
{"uid": "0", "label": 1, "token_id": [101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{"uid": "1", "label": 1, "token_id": [101, 1996, 15871, 2081, 1996, 8164, 7683, 2058, 1996, 4139, 3240, 1012, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
2 changes: 1 addition & 1 deletion sample_data/output/cola_test.json
@@ -1 +1 @@
{"uid": "0", "label": 0, "token_id": [101, 3021, 26265, 2627, 1996, 2160, 1012, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0]}
{"uid": "0", "label": 0, "token_id": [101, 3021, 26265, 2627, 1996, 2160, 1012, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1]}
4 changes: 2 additions & 2 deletions sample_data/output/cola_train.json
@@ -1,2 +1,2 @@
{"uid": "0", "label": 1, "token_id": [101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
{"uid": "1", "label": 1, "token_id": [101, 2028, 2062, 18404, 2236, 3989, 1998, 1045, 1005, 1049, 3228, 2039, 1012, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
{"uid": "0", "label": 1, "token_id": [101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{"uid": "1", "label": 1, "token_id": [101, 2028, 2062, 18404, 2236, 3989, 1998, 1045, 1005, 1049, 3228, 2039, 1012, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
2 changes: 1 addition & 1 deletion sample_data/output/mnli_matched_dev.json
@@ -1 +1 @@
{"uid": "0", "label": 1, "token_id": [101, 1996, 2047, 2916, 2024, 3835, 2438, 102, 3071, 2428, 7777, 1996, 14751, 6666, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]}
{"uid": "0", "label": 1, "token_id": [101, 1996, 2047, 2916, 2024, 3835, 2438, 102, 3071, 2428, 7777, 1996, 14751, 6666, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
2 changes: 1 addition & 1 deletion sample_data/output/mnli_matched_test.json
@@ -1 +1 @@
{"uid": "0", "label": 0, "token_id": [101, 7632, 2121, 22083, 1010, 2019, 2015, 10819, 2080, 1010, 2019, 2015, 4241, 23314, 1010, 1998, 10424, 14031, 2721, 2024, 2074, 1037, 2261, 3415, 4276, 4363, 1037, 2298, 1011, 2041, 2005, 1012, 102, 7632, 2121, 22083, 2003, 1037, 2171, 4276, 2559, 2041, 2005, 1012, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{"uid": "0", "label": 0, "token_id": [101, 7632, 2121, 22083, 1010, 2019, 2015, 10819, 2080, 1010, 2019, 2015, 4241, 23314, 1010, 1998, 10424, 14031, 2721, 2024, 2074, 1037, 2261, 3415, 4276, 4363, 1037, 2298, 1011, 2041, 2005, 1012, 102, 7632, 2121, 22083, 2003, 1037, 2171, 4276, 2559, 2041, 2005, 1012, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
2 changes: 1 addition & 1 deletion sample_data/output/mnli_mismatched_dev.json
@@ -1 +1 @@
{"uid": "0", "label": 0, "token_id": [101, 2115, 6691, 3271, 2191, 2009, 2825, 2005, 2149, 2000, 3073, 2256, 2493, 2007, 1037, 3737, 2495, 1012, 102, 2115, 5857, 2020, 1997, 2053, 2393, 2007, 2256, 2493, 1005, 2495, 1012, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{"uid": "0", "label": 0, "token_id": [101, 2115, 6691, 3271, 2191, 2009, 2825, 2005, 2149, 2000, 3073, 2256, 2493, 2007, 1037, 3737, 2495, 1012, 102, 2115, 5857, 2020, 1997, 2053, 2393, 2007, 2256, 2493, 1005, 2495, 1012, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
2 changes: 1 addition & 1 deletion sample_data/output/mnli_mismatched_test.json
@@ -1 +1 @@
{"uid": "0", "label": 0, "token_id": [101, 2054, 2031, 2017, 2787, 1010, 2054, 2024, 2017, 2183, 2000, 2079, 1029, 102, 2061, 2054, 1005, 1055, 2115, 3247, 1029, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]}
{"uid": "0", "label": 0, "token_id": [101, 2054, 2031, 2017, 2787, 1010, 2054, 2024, 2017, 2183, 2000, 2079, 1029, 102, 2061, 2054, 1005, 1055, 2115, 3247, 1029, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
2 changes: 1 addition & 1 deletion sample_data/output/mnli_train.json
@@ -1 +1 @@
{"uid": "0", "label": 1, "token_id": [101, 17158, 2135, 6949, 8301, 25057, 2038, 2048, 3937, 9646, 1011, 4031, 1998, 10505, 1012, 102, 4031, 1998, 10505, 2024, 2054, 2191, 6949, 8301, 25057, 2147, 1012, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{"uid": "0", "label": 1, "token_id": [101, 17158, 2135, 6949, 8301, 25057, 2038, 2048, 3937, 9646, 1011, 4031, 1998, 10505, 1012, 102, 4031, 1998, 10505, 2024, 2054, 2191, 6949, 8301, 25057, 2147, 1012, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
2 changes: 1 addition & 1 deletion sample_data/output/mrpc_dev.json
@@ -1 +1 @@
{"uid": "0", "label": 1, "token_id": [101, 2002, 2056, 1996, 9440, 2121, 7903, 2063, 11345, 2449, 2987, 1005, 1056, 4906, 1996, 2194, 1005, 1055, 2146, 1011, 2744, 3930, 5656, 1012, 102, 1000, 1996, 9440, 2121, 7903, 2063, 11345, 2449, 2515, 2025, 4906, 2256, 2146, 1011, 2744, 3930, 5656, 1012, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{"uid": "0", "label": 1, "token_id": [101, 2002, 2056, 1996, 9440, 2121, 7903, 2063, 11345, 2449, 2987, 1005, 1056, 4906, 1996, 2194, 1005, 1055, 2146, 1011, 2744, 3930, 5656, 1012, 102, 1000, 1996, 9440, 2121, 7903, 2063, 11345, 2449, 2515, 2025, 4906, 2256, 2146, 1011, 2744, 3930, 5656, 1012, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
2 changes: 1 addition & 1 deletion sample_data/output/mrpc_test.json
@@ -1 +1 @@
{"uid": "0", "label": 0, "token_id": [101, 7473, 2278, 2860, 1005, 1055, 2708, 4082, 2961, 1010, 3505, 14998, 1010, 1998, 4074, 5196, 1010, 1996, 2708, 3361, 2961, 1010, 2097, 3189, 3495, 2000, 2720, 2061, 1012, 102, 2783, 2708, 4082, 2961, 3505, 14998, 1998, 2177, 2708, 3361, 2961, 4074, 5196, 2097, 3189, 2000, 2061, 1012, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{"uid": "0", "label": 0, "token_id": [101, 7473, 2278, 2860, 1005, 1055, 2708, 4082, 2961, 1010, 3505, 14998, 1010, 1998, 4074, 5196, 1010, 1996, 2708, 3361, 2961, 1010, 2097, 3189, 3495, 2000, 2720, 2061, 1012, 102, 2783, 2708, 4082, 2961, 3505, 14998, 1998, 2177, 2708, 3361, 2961, 4074, 5196, 2097, 3189, 2000, 2061, 1012, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
2 changes: 1 addition & 1 deletion sample_data/output/mrpc_train.json
@@ -1 +1 @@
{"uid": "0", "label": 1, "token_id": [101, 2572, 3217, 5831, 5496, 2010, 2567, 1010, 3183, 2002, 2170, 1000, 1996, 7409, 1000, 1010, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102, 7727, 2000, 2032, 2004, 2069, 1000, 1996, 7409, 1000, 1010, 2572, 3217, 5831, 5496, 2010, 2567, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{"uid": "0", "label": 1, "token_id": [101, 2572, 3217, 5831, 5496, 2010, 2567, 1010, 3183, 2002, 2170, 1000, 1996, 7409, 1000, 1010, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102, 7727, 2000, 2032, 2004, 2069, 1000, 1996, 7409, 1000, 1010, 2572, 3217, 5831, 5496, 2010, 2567, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102], "type_id": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}