#!/usr/bin/env python3
"""
Train an agent using an intelligent expert.
The procedure starts with a small set of training demonstrations, and
iteratively grows the training set by some percentage. At every step, the new
demos used to grow the training set are demos the agent is currently failing
on. A new model is trained from scratch at every step.
Sample usage:
scripts/train_intelligent_expert.py --env BabyAI-GoToObj-v0 --demos GoToObj-bot-100k --validation-interval 5
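
A DAgger variant is also supported. An illustrative (untested) invocation,
combining the flags defined below with the demos file from the example above:
scripts/train_intelligent_expert.py --env BabyAI-GoToObj-v0 --demos GoToObj-bot-100k \
    --dagger --dagger-start-with-bot-demos --start-demos 1000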
Vanilla imitation learning:
GoToObj, 1000 demos for 100 percent success rate
GoToLocal, over 60K demos needed
"""
import os
import csv
import copy
import gym
import time
import datetime
import numpy as np
import sys
import logging
import babyai.utils as utils
from babyai.arguments import ArgumentParser
from babyai.imitation import ImitationLearning
from babyai.evaluate import batch_evaluate, evaluate
from babyai.utils.agent import BotAgent
import torch
import blosc
from babyai.utils.agent import DemoAgent
# Parse arguments
parser = ArgumentParser()
parser.add_argument("--demos", default=None,
help="demos filename (REQUIRED or demos-origin required)")
parser.add_argument("--demos-origin", required=False,
help="origin of the demonstrations: human | agent (REQUIRED or demos required)")
parser.add_argument("--episodes", type=int, default=0,
help="number of episodes of demonstrations to use"
"(default: 0, meaning all demos)")
parser.add_argument("--start-demos", type=int, default=5000,
help="the starting number of demonstrations")
parser.add_argument("--demo-grow-factor", type=float, default=1.2,
help="number of demos to add to the training set")
parser.add_argument("--num-eval-demos", type=int, default=1000,
help="number of demos used for evaluation while growing the training set")
parser.add_argument("--finetune", action="store_true", default=False,
help="fine-tune the model at every phase instead of retraining")
parser.add_argument("--phases", type=int, default=1000,
help="maximum number of phases to train for")
parser.add_argument("--dagger", action="store_true", default=False,
help="Use DaGGER to add demos")
parser.add_argument("--continue-dagger", action="store_true", default=False,
help='Complete DaGGER trajectories to target')
parser.add_argument("--dagger-trim-coef", type=float, default=2.,
help="Trim agent's trajectories at this number multiplied by the bot mean number of steps")
parser.add_argument("--episodes-to-evaluate-mean", type=int, default=100,
help="Number of episodes to use to evaluate the mean number of steps it takes to solve the task")
parser.add_argument("--dagger-start-with-bot-demos", action="store_true", default=False,
help="If not specified, no full bot demos are considered (args.start_demos is useless)")
parser.add_argument("--additive-train-set-size", type=int, default=None,
help="If set, then train site size is additive, and this overrides args.demo_grow_factor")
logger = logging.getLogger(__name__)
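# Reuse DemoAgent's observation-equality check; generate_dagger_demos uses it
# to verify that replayed environment states match the recorded ones.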
check_obss_equality = DemoAgent.check_obss_equality


def evaluate_agent(il_learn, eval_seed, num_eval_demos, return_obss_actions=False):
    """
    Evaluate the agent on some number of episodes and return the seeds for the
    episodes the agent performed the worst on.
    """
    logger.info("Evaluating agent on {} using {} demos".format(il_learn.args.env, num_eval_demos))

    agent = utils.load_agent(il_learn.env, il_learn.args.model)
    agent.model.eval()
    logs = batch_evaluate(
        agent,
        il_learn.args.env,
        episodes=num_eval_demos,
        seed=eval_seed,
        return_obss_actions=return_obss_actions
    )
    agent.model.train()

    success_rate = np.mean([1 if r > 0 else 0 for r in logs['return_per_episode']])
    logger.info("success rate: {:.2f}".format(success_rate))

    # Find the seeds for all the failing demos
    fail_seeds = []
    fail_obss = []
    fail_actions = []
    for idx, ret in enumerate(logs["return_per_episode"]):
        if ret <= 0:
            fail_seeds.append(logs["seed_per_episode"][idx])
            if return_obss_actions:
                fail_obss.append(logs["observations_per_episode"][idx])
                fail_actions.append(logs["actions_per_episode"][idx])
    logger.info("{} fails".format(len(fail_seeds)))

    if not return_obss_actions:
        return success_rate, fail_seeds
    else:
        return success_rate, fail_seeds, fail_obss, fail_actions
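

# DAgger-style relabeling: replay each failed trajectory of the agent in the
# environment, and at every visited state record the action the bot expert
# would have taken instead. The resulting demos cover the states the agent
# actually reaches, rather than the states the expert reaches on its own.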
def generate_dagger_demos(env_name, seeds, fail_obss, fail_actions, mean_steps):
    env = gym.make(env_name)
    agent = BotAgent(env)
    demos = []
    for i in range(len(fail_obss)):
        # Run the expert for one episode
        env.seed(int(seeds[i]))
        new_obs = env.reset()
        agent.on_reset()
        env0_str = env.__str__()

        # Initialize here so that a zero-length replay can't leave these
        # names undefined further down
        done = False
        mission = new_obs['mission']

        actions = []
        images = []
        directions = []
        debug_info = {'seed': [int(seeds[i])], 'actions': []}
        try:
            # Replay the agent's failed actions, trimmed to a multiple of the
            # bot's mean episode length, querying the bot for its action at
            # each step without advancing the bot's internal state
            for j in range(min(int(args.dagger_trim_coef * mean_steps), len(fail_obss[i]) - 1)):
                obs = fail_obss[i][j]
                assert check_obss_equality(obs, new_obs), "Observations {} of seed {} don't match".format(j, seeds[i])
                action = agent.act(update_internal_state=False)['action']
                _ = agent.bot.replan(fail_actions[i][j])
                debug_info['actions'].append(fail_actions[i][j])
                new_obs, reward, done, _ = env.step(fail_actions[i][j])
                if done and reward > 0:
                    raise ValueError(
                        "The baby's actions shouldn't solve the task. Env0 {}, Env1 {}, Seed {}, actions {}.".format(
                            env0_str, env.__str__(), int(seeds[i]), fail_actions[i]
                        ))
                actions.append(action)
                images.append(obs['image'])
                directions.append(obs['direction'])

            if args.continue_dagger:
                # Let the bot finish the episode from where the replay stopped
                obs = new_obs
                while not done:
                    action = agent.act(obs)['action']
                    debug_info['actions'].append(action)
                    new_obs, reward, done, _ = env.step(action)
                    agent.analyze_feedback(reward, done)
                    actions.append(action)
                    images.append(obs['image'])
                    directions.append(obs['direction'])

            print(debug_info, actions)
            demos.append((mission, blosc.pack_array(np.array(images)), directions, actions))
        except Exception as e:
            logger.exception("error while generating demo #{}: {}. Env0 {}, Env1 {}, Seed {}, actions {}.".format(
                len(demos), e, env0_str, env.__str__(), int(seeds[i]), fail_actions[i]))
            continue
    return demos
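

# Plain expert demos: run the bot from reset on each seed and keep the full
# trajectory whenever it solves the mission.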
def generate_demos(env_name, seeds):
    env = gym.make(env_name)
    agent = BotAgent(env)
    demos = []
    for seed in seeds:
        # Run the expert for one episode
        done = False
        env.seed(int(seed))
        obs = env.reset()
        agent.on_reset()

        actions = []
        mission = obs["mission"]
        images = []
        directions = []
        try:
            while not done:
                action = agent.act(obs)['action']
                new_obs, reward, done, _ = env.step(action)
                agent.analyze_feedback(reward, done)
                actions.append(action)
                images.append(obs['image'])
                directions.append(obs['direction'])
                obs = new_obs
            if reward > 0:
                demos.append((mission, blosc.pack_array(np.array(images)), directions, actions))
            if reward == 0:
                logger.info("failed to accomplish the mission")
        except Exception:
            logger.exception("error while generating demo #{}".format(len(demos)))
            continue
        # logger.info("demo #{}".format(len(demos)))
    return demos
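

# Each call advances eval_seed past the episodes it consumed, so every round
# of evaluation uses fresh seeds; the failing seeds are then turned into new
# training demos.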
def grow_training_set(il_learn, train_demos, eval_seed, grow_factor, num_eval_demos, dagger=False, mean_steps=None):
    """
    Grow the training set of demonstrations by some factor
    """
    new_train_set_size = int(len(train_demos) * grow_factor)
    if args.additive_train_set_size is not None:
        new_train_set_size = len(train_demos) + args.additive_train_set_size
    num_new_demos = new_train_set_size - len(train_demos)
    logger.info("Generating {} new demos for {}".format(num_new_demos, il_learn.args.env))

    # Add new demos until we reach the new target size
    while len(train_demos) < new_train_set_size:
        num_new_demos = new_train_set_size - len(train_demos)

        # Evaluate the success rate of the model
        if not dagger:
            success_rate, fail_seeds = evaluate_agent(il_learn, eval_seed, num_eval_demos)
        else:
            success_rate, fail_seeds, fail_obss, fail_actions = evaluate_agent(
                il_learn, eval_seed, num_eval_demos, True)
        eval_seed += num_eval_demos

        if len(fail_seeds) > num_new_demos:
            fail_seeds = fail_seeds[:num_new_demos]
            if dagger:
                fail_obss = fail_obss[:num_new_demos]
                fail_actions = fail_actions[:num_new_demos]

        # Generate demos for the worst performing seeds
        if not dagger:
            new_demos = generate_demos(il_learn.args.env, fail_seeds)
        else:
            new_demos = generate_dagger_demos(il_learn.args.env, fail_seeds, fail_obss, fail_actions, mean_steps)
        train_demos.extend(new_demos)

    return eval_seed


def get_bot_mean(env_name, episodes_to_evaluate_mean, seed):
    logger.info("Evaluating the average number of steps using {} episodes".format(episodes_to_evaluate_mean))
    env = gym.make(env_name)
    env.seed(seed)
    agent = BotAgent(env)
    logs = evaluate(agent, env, episodes_to_evaluate_mean, model_agent=False)
    average_number_of_steps = np.mean(logs["num_frames_per_episode"])
    logger.info("Average number of steps: {}".format(average_number_of_steps))
    return average_number_of_steps
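

# Training driver: one imitation-learning phase per iteration of the outer
# loop, growing the demo set between phases until the validation success rate
# reaches 99%.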
def main(args):
    args.model = args.model or ImitationLearning.default_model_name(args)
    utils.configure_logging(args.model)
    il_learn = ImitationLearning(args)

    # Define logger and Tensorboard writer
    header = (["update", "frames", "FPS", "duration", "entropy", "policy_loss", "train_accuracy"]
              + ["validation_accuracy", "validation_return", "validation_success_rate"])
    writer = None
    if args.tb:
        from tensorboardX import SummaryWriter
        writer = SummaryWriter(utils.get_log_dir(args.model))

    # Define csv writer
    csv_path = os.path.join(utils.get_log_dir(args.model), 'log.csv')
    first_created = not os.path.exists(csv_path)
    # we don't buffer data going into the csv log, because we assume
    # that one update will take much longer than one write to the log
    csv_writer = csv.writer(open(csv_path, 'a', 1))
    if first_created:
        csv_writer.writerow(header)

    # Get the status path
    status_path = os.path.join(utils.get_log_dir(args.model), 'status.json')

    # Log command, availability of CUDA, and model
    logger.info(args)
    logger.info("CUDA available: {}".format(torch.cuda.is_available()))
    logger.info(il_learn.acmodel)

    train_demos = []

    # Generate the initial set of training demos
    if not args.dagger or args.dagger_start_with_bot_demos:
        train_demos += generate_demos(args.env, range(args.seed, args.seed + args.start_demos))

    # Seed at which evaluation will begin
    eval_seed = args.seed + args.start_demos

    model_name = args.model

    if args.dagger:
        mean_steps = get_bot_mean(args.env, args.episodes_to_evaluate_mean, args.seed)
    else:
        mean_steps = None

    for phase_no in range(args.phases):
        logger.info("Starting phase {} with {} demos".format(phase_no, len(train_demos)))

        if not args.finetune:
            # Create a new model to be trained from scratch
            logger.info("Creating new model to be trained from scratch")
            args.model = model_name + ('_phase_%d' % phase_no)
            il_learn = ImitationLearning(args)

        # Train the imitation learning agent
        if len(train_demos) > 0:
            il_learn.train(train_demos, writer, csv_writer, status_path, header, reset_status=True)

        # Stopping criterion
        valid_log = il_learn.validate(args.val_episodes)
        success_rate = np.mean([1 if r > 0 else 0 for r in valid_log[0]['return_per_episode']])
        if success_rate >= 0.99:
            logger.info("Reached target success rate with {} demos, stopping".format(len(train_demos)))
            break

        eval_seed = grow_training_set(il_learn, train_demos, eval_seed, args.demo_grow_factor, args.num_eval_demos,
                                      args.dagger, mean_steps)


if __name__ == "__main__":
    args = parser.parse_args()
    main(args)