1.6: DataLoader, ernie_encoder + sanity check
Meiyim committed Jan 16, 2020
1 parent a3e96ed commit e5d95cb
Showing 8 changed files with 78 additions and 95 deletions.
29 changes: 16 additions & 13 deletions ernie/ernie_encoder.py
@@ -52,18 +52,16 @@


def create_model(args, pyreader_name, ernie_config):
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1], [-1]],
dtypes=['int64', 'int64', 'int64', 'int64', 'float', 'int64'],
lod_levels=[0, 0, 0, 0, 0, 0],
name=pyreader_name,
use_double_buffer=True)

(src_ids, sent_ids, pos_ids, task_ids, input_mask,
seq_lens) = fluid.layers.read_file(pyreader)
src_ids = fluid.layers.data(name='1', shape=[-1, args.max_seq_len, 1], dtype='int64')
sent_ids = fluid.layers.data(name='2', shape=[-1, args.max_seq_len, 1], dtype='int64')
pos_ids = fluid.layers.data(name='3', shape=[-1, args.max_seq_len, 1], dtype='int64')
task_ids = fluid.layers.data(name='4', shape=[-1, args.max_seq_len, 1], dtype='int64')
input_mask = fluid.layers.data(name='5', shape=[-1, args.max_seq_len, 1], dtype='float32')
seq_lens = fluid.layers.data(name='8', shape=[-1], dtype='int64')

pyreader = fluid.io.DataLoader.from_generator(feed_list=[src_ids, sent_ids, pos_ids, task_ids, input_mask, seq_lens],
capacity=70,
iterable=False)

ernie = ErnieModel(
src_ids=src_ids,
@@ -143,7 +141,7 @@ def main(args):
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_threads = dev_count

pyreader.decorate_tensor_provider(data_generator)
pyreader.set_batch_generator(data_generator)
pyreader.start()

total_cls_emb = []
@@ -167,6 +165,11 @@ def main(args):
total_cls_emb = np.concatenate(total_cls_emb)
total_top_layer_emb = np.concatenate(total_top_layer_emb)

if not os.path.exists(args.output_dir):
os.mkdir(args.output_dir)
else:
raise RuntimeError('output dir exists: %s' % args.output_dir)

with open(os.path.join(args.output_dir, "cls_emb.npy"),
"wb") as cls_emb_file:
np.save(cls_emb_file, total_cls_emb)
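
The non-iterable DataLoader introduced here is driven the same way the old py_reader was: attach a batch generator, call start(), run the executor until it raises EOFException, then reset(). A minimal sketch of that loop, assuming a generic program, executor and fetch list rather than the actual ernie_encoder code:

```python
import paddle.fluid as fluid

def run_one_pass(exe, program, loader, fetch_vars):
    """Drain one pass of a DataLoader built with iterable=False."""
    loader.start()                      # start pulling batches from the generator
    outputs = []
    try:
        while True:
            # No feed= argument: the loader pushes data into the variables
            # named in its feed_list.
            outputs.append(exe.run(program, fetch_list=fetch_vars))
    except fluid.core.EOFException:
        loader.reset()                  # generator exhausted; rewind for reuse
    return outputs
```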
40 changes: 14 additions & 26 deletions ernie/finetune/classifier.py
@@ -39,34 +39,22 @@ def create_model(args,
is_classify=False,
is_regression=False,
ernie_version="1.0"):

src_ids = fluid.layers.data(name='1', shape=[-1, args.max_seq_len, 1], dtype='int64')
sent_ids = fluid.layers.data(name='2', shape=[-1, args.max_seq_len, 1], dtype='int64')
pos_ids = fluid.layers.data(name='3', shape=[-1, args.max_seq_len, 1], dtype='int64')
task_ids = fluid.layers.data(name='4', shape=[-1, args.max_seq_len, 1], dtype='int64')
input_mask = fluid.layers.data(name='5', shape=[-1, args.max_seq_len, 1], dtype='float32')
qids = fluid.layers.data(name='7', shape=[-1, 1], dtype='int64')

if is_classify:
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1], [-1, 1], [-1, 1]],
dtypes=[
'int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64'
],
lod_levels=[0, 0, 0, 0, 0, 0, 0],
name=task_name + "_" + pyreader_name,
use_double_buffer=True)
labels = fluid.layers.data(name='6', shape=[-1, 1], dtype='int64')
elif is_regression:
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1], [-1, 1], [-1, 1]],
dtypes=[
'int64', 'int64', 'int64', 'int64', 'float32', 'float32',
'int64'
],
lod_levels=[0, 0, 0, 0, 0, 0, 0],
name=task_name + "_" + pyreader_name,
use_double_buffer=True)

(src_ids, sent_ids, pos_ids, task_ids, input_mask, labels,
qids) = fluid.layers.read_file(pyreader)
labels = fluid.layers.data(name='6', shape=[-1, 1], dtype='float32')

pyreader = fluid.io.DataLoader.from_generator(feed_list=[src_ids, sent_ids, pos_ids, task_ids, input_mask, qids],
capacity=70,
iterable=False)

ernie = ErnieModel(
src_ids=src_ids,
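
Whatever is passed to set_batch_generator must, when called, yield one batch at a time as a list of numpy arrays in feed_list order, with shapes and dtypes matching the fluid.layers.data declarations above. A toy generator for illustration only (the real batches come from the task reader; batch size and sequence length here are made up):

```python
import numpy as np

def toy_batch_generator(batch_size=4, max_seq_len=128, n_batches=3):
    """Yields fields in feed_list order: src_ids, sent_ids, pos_ids, task_ids,
    input_mask, qids (labels are fed the same way when they are in the list)."""
    for _ in range(n_batches):
        int_field = np.zeros((batch_size, max_seq_len, 1), dtype='int64')
        yield [int_field, int_field, int_field, int_field,
               np.ones((batch_size, max_seq_len, 1), dtype='float32'),
               np.zeros((batch_size, 1), dtype='int64')]

# Attached as a callable, the same way decorate_tensor_provider was used:
# pyreader.set_batch_generator(toy_batch_generator)
```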
26 changes: 12 additions & 14 deletions ernie/finetune/mrc.py
@@ -40,20 +40,18 @@
log = logging.getLogger(__name__)

def create_model(args, pyreader_name, ernie_config, is_training):
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1], [-1, 1], [-1, 1], [-1, 1]],
dtypes=[
'int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64',
'int64'
],
lod_levels=[0, 0, 0, 0, 0, 0, 0, 0],
name=pyreader_name,
use_double_buffer=True)
(src_ids, sent_ids, pos_ids, task_ids, input_mask, start_positions,
end_positions, unique_id) = fluid.layers.read_file(pyreader)
src_ids = fluid.layers.data(name='1', shape=[-1, args.max_seq_len, 1], dtype='int64')
pos_ids = fluid.layers.data(name='2', shape=[-1, args.max_seq_len, 1], dtype='int64')
sent_ids= fluid.layers.data(name='3', shape=[-1, args.max_seq_len, 1], dtype='int64')
task_ids= fluid.layers.data(name='4', shape=[-1, args.max_seq_len, 1], dtype='int64')
input_mask = fluid.layers.data(name='5', shape=[-1, 1], dtype='float32')
start_positions = fluid.layers.data(name='6', shape=[-1, 1], dtype='int64')
end_positions = fluid.layers.data(name='7', shape=[-1, 1], dtype='int64')
unique_id = fluid.layers.data(name='8', shape=[-1, 1], dtype='int64')

pyreader = fluid.io.DataLoader.from_generator(feed_list=[
src_ids, sent_ids, pos_ids, task_ids, input_mask, start_positions,
end_positions, unique_id], capacity=50, iterable=False)

ernie = ErnieModel(
src_ids=src_ids,
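
All of these readers keep iterable=False, which preserves the old start()/EOFException control flow. For contrast, an iterable DataLoader is consumed with a plain for-loop and an explicit feed; a sketch of that alternative wiring, assuming a single device and taking the feed variables and generator as parameters:

```python
import paddle.fluid as fluid

def build_iterable_loader(feed_vars, batch_generator, place):
    """Alternative to the pattern above: iterable=True."""
    loader = fluid.io.DataLoader.from_generator(
        feed_list=feed_vars, capacity=50, iterable=True)
    # places must be given when iterable=True (single place assumed here)
    loader.set_batch_generator(batch_generator, places=place)
    return loader

def run_iterable(exe, program, loader, fetch_vars):
    for data in loader():               # each item can be passed straight to feed=
        exe.run(program, feed=data, fetch_list=fetch_vars)
```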
25 changes: 11 additions & 14 deletions ernie/finetune/sequence_label.py
@@ -36,20 +36,17 @@
log = logging.getLogger(__name__)

def create_model(args, pyreader_name, ernie_config, is_prediction=False):
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1]],
dtypes=[
'int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64'
],
lod_levels=[0, 0, 0, 0, 0, 0, 0],
name=pyreader_name,
use_double_buffer=True)

(src_ids, sent_ids, pos_ids, task_ids, input_mask, labels,
seq_lens) = fluid.layers.read_file(pyreader)
src_ids = fluid.layers.data(name='1', shape=[-1, args.max_seq_len, 1], dtype='int64')
sent_ids = fluid.layers.data(name='2', shape=[-1, args.max_seq_len, 1], dtype='int64')
pos_ids = fluid.layers.data(name='3', shape=[-1, args.max_seq_len, 1], dtype='int64')
task_ids = fluid.layers.data(name='4', shape=[-1, args.max_seq_len, 1], dtype='int64')
input_mask = fluid.layers.data(name='5', shape=[-1, args.max_seq_len, 1], dtype='float32')
labels = fluid.layers.data(name='7', shape=[-1, args.max_seq_len, 1], dtype='int64')
seq_lens = fluid.layers.data(name='8', shape=[-1], dtype='int64')

pyreader = fluid.io.DataLoader.from_generator(feed_list=[src_ids, sent_ids, pos_ids, task_ids, input_mask, labels, seq_lens],
capacity=70,
iterable=False)

ernie = ErnieModel(
src_ids=src_ids,
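
Because the loader feeds positionally, a reader that yields fields in a different order than feed_list only fails later with a confusing shape or dtype error. A small hypothetical check, not part of this commit, that compares one sample batch against the dtypes declared above for the sequence-labeling feed list:

```python
import numpy as np

# dtypes in feed_list order: src_ids, sent_ids, pos_ids, task_ids,
# input_mask, labels, seq_lens
EXPECTED_DTYPES = ('int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64')

def check_first_batch(generator_fn, expected_dtypes=EXPECTED_DTYPES):
    batch = next(iter(generator_fn()))
    assert len(batch) == len(expected_dtypes), (
        'reader yields %d fields, feed_list declares %d'
        % (len(batch), len(expected_dtypes)))
    for i, (field, dtype) in enumerate(zip(batch, expected_dtypes)):
        field = np.asarray(field)
        assert field.dtype == np.dtype(dtype), (
            'field %d: got %s, expected %s' % (i, field.dtype, dtype))
```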
8 changes: 4 additions & 4 deletions ernie/run_classifier.py
@@ -228,7 +228,7 @@ def main(args):
num_trainers=nccl2_num_trainers,
trainer_id=nccl2_trainer_id)

train_pyreader.decorate_tensor_provider(train_data_generator)
train_pyreader.set_batch_generator(train_data_generator)
else:
train_exe = None

@@ -349,7 +349,7 @@ def main(args):

# final eval on diagnostic, hack for glue-ax
if args.diagnostic:
test_pyreader.decorate_tensor_provider(
test_pyreader.set_batch_generator(
reader.data_generator(
args.diagnostic,
batch_size=args.batch_size,
@@ -380,7 +380,7 @@ def evaluate_wrapper(args, reader, exe, test_prog, test_pyreader, graph_vars,
# evaluate dev set
batch_size = args.batch_size if args.predict_batch_size is None else args.predict_batch_size
for ds in args.dev_set.split(','):
test_pyreader.decorate_tensor_provider(
test_pyreader.set_batch_generator(
reader.data_generator(
ds,
batch_size=batch_size,
@@ -409,7 +409,7 @@ def predict_wrapper(args, reader, exe, test_prog, test_pyreader, graph_vars,
batch_size = args.batch_size if args.predict_batch_size is None else args.predict_batch_size

for test_f, save_f in zip(test_sets, save_dirs):
test_pyreader.decorate_tensor_provider(
test_pyreader.set_batch_generator(
reader.data_generator(
test_f,
batch_size=batch_size,
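
The rename from decorate_tensor_provider to set_batch_generator is mechanical: the argument is the same generator as before, and with iterable=False it can be re-attached before each start(), which is why evaluation re-binds it once per dev set. Schematically (make_generator and run_one_pass are placeholders for the truncated reader.data_generator call and the fetch loop):

```python
def evaluate_all(test_pyreader, dev_sets, make_generator, run_one_pass):
    for ds in dev_sets.split(','):
        test_pyreader.set_batch_generator(make_generator(ds))
        run_one_pass()   # start(), run until EOFException, then reset()
```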
10 changes: 5 additions & 5 deletions ernie/run_mrc.py
@@ -228,7 +228,7 @@ def main(args):
num_trainers=nccl2_num_trainers,
trainer_id=nccl2_trainer_id)

train_pyreader.decorate_tensor_provider(train_data_generator)
train_pyreader.set_batch_generator(train_data_generator)
else:
train_exe = None

@@ -272,7 +272,7 @@ def main(args):

if steps % args.validation_steps == 0:
if args.do_val:
test_pyreader.decorate_tensor_provider(
test_pyreader.set_batch_generator(
reader.data_generator(
args.dev_set,
batch_size=args.batch_size,
@@ -291,7 +291,7 @@ def main(args):
args=args)

if args.do_test:
test_pyreader.decorate_tensor_provider(
test_pyreader.set_batch_generator(
reader.data_generator(
args.test_set,
batch_size=args.batch_size,
@@ -318,7 +318,7 @@ def main(args):
# final eval on dev set
if args.do_val:
log.info("Final validation result:")
test_pyreader.decorate_tensor_provider(
test_pyreader.set_batch_generator(
reader.data_generator(
args.dev_set,
batch_size=args.batch_size,
@@ -339,7 +339,7 @@ def main(args):
# final eval on test set
if args.do_test:
log.info("Final test result:")
test_pyreader.decorate_tensor_provider(
test_pyreader.set_batch_generator(
reader.data_generator(
args.test_set,
batch_size=args.batch_size,
6 changes: 3 additions & 3 deletions ernie/run_sequence_labeling.py
@@ -217,7 +217,7 @@ def main(args):
num_trainers=nccl2_num_trainers,
trainer_id=nccl2_trainer_id)

train_pyreader.decorate_tensor_provider(train_data_generator)
train_pyreader.set_batch_generator(train_data_generator)
else:
train_exe = None

@@ -302,7 +302,7 @@ def evaluate_wrapper(reader, exe, test_prog, test_pyreader, graph_vars,
# evaluate dev set
batch_size = args.batch_size if args.predict_batch_size is None else args.predict_batch_size
for ds in args.dev_set.split(','): #single card eval
test_pyreader.decorate_tensor_provider(
test_pyreader.set_batch_generator(
reader.data_generator(
ds,
batch_size=batch_size,
@@ -324,7 +324,7 @@ def predict_wrapper(reader, exe, test_prog, test_pyreader, graph_vars,

batch_size = args.batch_size if args.predict_batch_size is None else args.predict_batch_size
for test_f, save_f in zip(test_sets, save_dirs):
test_pyreader.decorate_tensor_provider(reader.data_generator(
test_pyreader.set_batch_generator(reader.data_generator(
test_f,
batch_size=batch_size,
epoch=1,
29 changes: 13 additions & 16 deletions ernie/train.py
@@ -41,20 +41,17 @@


def create_model(pyreader_name, ernie_config):
pyreader = fluid.layers.py_reader(
capacity=70,
shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, 1],
[-1, 1], [-1, 1]],
dtypes=[
'int64', 'int64', 'int64', 'float32', 'int64', 'int64', 'int64'
],
lod_levels=[0, 0, 0, 0, 0, 0, 0],
name=pyreader_name,
use_double_buffer=True)

(src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos,
labels) = fluid.layers.read_file(pyreader)
src_ids = fluid.layers.data(name='1', shape=[-1, args.max_seq_len, 1], dtype='int64')
pos_ids = fluid.layers.data(name='2', shape=[-1, args.max_seq_len, 1], dtype='int64')
sent_ids= fluid.layers.data(name='3', shape=[-1, args.max_seq_len, 1], dtype='int64')
input_mask = fluid.layers.data(name='4', shape=[-1, args.max_seq_len, 1], dtype='float32')
mask_label = fluid.layers.data(name='5', shape=[-1, 1], dtype='int64')
mask_pos = fluid.layers.data(name='6', shape=[-1, 1], dtype='int64')
labels = fluid.layers.data(name='r', shape=[-1, 1], dtype='int64')

pyreader = fluid.io.DataLoader.from_generator(feed_list=[
src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels
], capacity=70, iterable=False)

ernie = ErnieModel(
src_ids=src_ids,
@@ -97,7 +94,7 @@ def predict_wrapper(args,

def predict(exe=exe, pyreader=pyreader):

pyreader.decorate_tensor_provider(data_reader.data_generator())
pyreader.set_batch_generator(data_reader.data_generator())
pyreader.start()

cost = 0
@@ -285,7 +282,7 @@ def train(args):
next_sent_acc.name, mask_lm_loss.name, total_loss.name
])

train_pyreader.decorate_tensor_provider(data_reader.data_generator())
train_pyreader.set_batch_generator(data_reader.data_generator())
train_pyreader.start()
steps = 0
cost = []
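
With the DataLoader in place, the pretraining loop itself is unchanged apart from set_batch_generator and the reset on EOF. A simplified sketch of one pass, with the logging, learning-rate and step-limit handling of the real train.py left out:

```python
import paddle.fluid as fluid

def train_one_pass(exe, compiled_program, train_pyreader, data_reader,
                   next_sent_acc, mask_lm_loss, total_loss):
    train_pyreader.set_batch_generator(data_reader.data_generator())
    train_pyreader.start()
    steps, costs = 0, []
    while True:
        try:
            acc, lm_loss, loss = exe.run(
                compiled_program,
                fetch_list=[next_sent_acc.name, mask_lm_loss.name,
                            total_loss.name])
            costs.append(loss)
            steps += 1
        except fluid.core.EOFException:
            train_pyreader.reset()      # the generator is exhausted
            break
    return steps, costs
```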
