variable length rollouts and wide start state distribution
- horizon length for NPG updates is now exposed to the learning code
- the initial state distribution used for updates is a mix of the MDP start state
  distribution and states sampled uniformly at random from the replay buffer (sketched below)
aravindr93 committed Aug 1, 2020
1 parent 5858c36 commit eb4110e
Showing 3 changed files with 24 additions and 6 deletions.
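
For orientation, a minimal sketch of the mixed start-state construction the commit message describes (half MDP resets, half replay-buffer states). The helper name mixed_init_states and its standalone packaging are illustrative; the logic mirrors the run script changes further down.

# Illustrative sketch (not part of the commit) of the mixed start-state distribution.
import numpy as np

def mixed_init_states(env, buffer_states, num_paths):
    """Half of the start states come from env.reset() (the MDP start state
    distribution); the other half are drawn uniformly at random from the
    replay buffer states, an array of shape (num_samples, state_dim)."""
    n_mdp = num_paths // 2
    n_buf = num_paths - n_mdp
    from_mdp = [env.reset() for _ in range(n_mdp)]
    idx = np.random.choice(buffer_states.shape[0], size=n_buf, replace=True)
    from_buf = list(buffer_states[idx])
    return from_mdp + from_buf   # a plain list, as train_step expects
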
11 changes: 7 additions & 4 deletions mjrl/algos/model_accel/model_accel_npg.py
@@ -61,6 +61,8 @@ def train_step(self, N,
                    gae_lambda=0.97,
                    num_cpu='max',
                    env_kwargs=None,
+                   init_states=None,
+                   **kwargs,
                    ):

         ts = timer.time()
@@ -82,10 +84,11 @@
         # we want to use the same task instances (e.g. goal locations) for each model in ensemble
         paths = []

-        # NOTE: When running on hardware, we need to load the set of initial states from a pickle file
-        # init_states = pickle.load(open(<some_file>.pickle, 'rb'))
-        # init_states = init_states[:N]
-        init_states = np.array([env.reset() for _ in range(N)])
+        # NOTE: We can optionally specify a set of initial states to perform the rollouts from.
+        # This is useful for starting rollouts from states stored in the replay buffer.
+        init_states = [env.reset() for _ in range(N)] if init_states is None else init_states
+        assert type(init_states) == list
+        assert len(init_states) == N

         for model in self.fitted_model:
             # don't set the seed explicitly -- this will make rollouts follow the global seed
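
A hedged usage sketch of the updated train_step interface. The names agent, e (the environment), and s (the concatenated buffer states) follow the run script shown below; mixed_init_states is the illustrative helper sketched above, and the specific values are illustrative.

# Usage sketch: explicit start states and a short rollout horizon.
init_states = mixed_init_states(e, s, num_paths=250)
agent.train_step(N=len(init_states),
                 init_states=init_states,   # must be a list of state vectors
                 horizon=10)                # rollout length for this update
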
3 changes: 2 additions & 1 deletion
@@ -36,7 +36,8 @@
     'policy_size' : (32, 32),
     'inner_steps' : 10,
     'step_size' : 0.05,
-    'update_paths' : 100,
+    'update_paths' : 250,
+    'horizon' : 10,
     'hvp_frac' : None,

 }
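
A hedged reading of the knobs this config hunk touches; how 'horizon' is consumed inside train_step is not shown in this diff, so the comments below are assumptions based on the commit message and the run script that follows.

# Relevant subset of the job config after this commit (values from the diff above)
job_data = {
    'inner_steps'  : 10,    # train_step calls per outer iteration
    'update_paths' : 250,   # rollouts (and hence start states) per update, up from 100
    'horizon'      : 10,    # new: rollout length handed to train_step
}
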
16 changes: 15 additions & 1 deletion mjrl/algos/model_accel/run_experiments/run_model_accel_npg.py
@@ -147,7 +147,21 @@
     # ======================
     agent.fitted_model = models
     for inner_step in range(job_data['inner_steps']):
-        agent.train_step(N=job_data['update_paths'])
+        # Healthy mix of initial states: half come from the MDP initial state
+        # distribution and the remaining half come from the replay buffer,
+        # chosen uniformly at random. The buffer has already been concatenated
+        # into numpy arrays (s, a, sp, r) for model learning.
+        if job_data['device_path'] is None:
+            # can only do this for non-hardware tasks
+            num_states_1, num_states_2 = job_data['update_paths'] // 2, job_data['update_paths'] // 2
+            buffer_rand_idx = np.random.choice(s.shape[0], size=num_states_2, replace=True)
+            init_states_1 = [e.reset() for _ in range(num_states_1)]
+            init_states_2 = list(s[buffer_rand_idx])
+            init_states = init_states_1 + init_states_2
+        else:
+            buffer_rand_idx = np.random.choice(s.shape[0], size=job_data['update_paths'], replace=True)
+            init_states = list(s[buffer_rand_idx])
+        agent.train_step(N=len(init_states), init_states=init_states, horizon=job_data['horizon'])
     print_data = sorted(filter(lambda v: np.asarray(v[1]).size == 1,
                                agent.logger.get_current_log().items()))
     print(tabulate(print_data))
