variable length rollouts and wide start state distribution
- horizon length for NPG updates is now exposed to the learning code
- the initial state distribution used for updates is a mix of the MDP start state
  distribution and states sampled uniformly at random from the replay buffer (sketched below)
aravindr93 committed Aug 1, 2020
1 parent 5858c36 commit eb4110e
Showing 3 changed files with 24 additions and 6 deletions.
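
For orientation, a minimal sketch of the mixed start-state construction the commit message describes (half MDP resets, half replay-buffer states). The helper name mixed_init_states and its standalone packaging are illustrative; the logic mirrors the run script changes further down.

# Illustrative sketch (not part of the commit) of the mixed start-state distribution.
import numpy as np

def mixed_init_states(env, buffer_states, num_paths):
    """Half of the start states come from env.reset() (the MDP start state
    distribution); the other half are drawn uniformly at random from the
    replay buffer states, an array of shape (num_samples, state_dim)."""
    n_mdp = num_paths // 2
    n_buf = num_paths - n_mdp
    from_mdp = [env.reset() for _ in range(n_mdp)]
    idx = np.random.choice(buffer_states.shape[0], size=n_buf, replace=True)
    from_buf = list(buffer_states[idx])
    return from_mdp + from_buf   # a plain list, as train_step expects
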
11 changes: 7 additions & 4 deletions mjrl/algos/model_accel/model_accel_npg.py
@@ -61,6 +61,8 @@ def train_step(self, N,
                    gae_lambda=0.97,
                    num_cpu='max',
                    env_kwargs=None,
+                   init_states=None,
+                   **kwargs,
                    ):

         ts = timer.time()
@@ -82,10 +84,11 @@
         # we want to use the same task instances (e.g. goal locations) for each model in ensemble
         paths = []

-        # NOTE: When running on hardware, we need to load the set of initial states from a pickle file
-        # init_states = pickle.load(open(<some_file>.pickle, 'rb'))
-        # init_states = init_states[:N]
-        init_states = np.array([env.reset() for _ in range(N)])
+        # NOTE: We can optionally specify a set of initial states to perform the rollouts from.
+        # This is useful for starting rollouts from states stored in the replay buffer.
+        init_states = [env.reset() for _ in range(N)] if init_states is None else init_states
+        assert type(init_states) == list
+        assert len(init_states) == N

         for model in self.fitted_model:
             # don't set the seed explicitly -- this will make rollouts follow the global seed
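
A hedged usage sketch of the updated train_step interface. The names agent, e (the environment), and s (the concatenated buffer states) follow the run script shown below; mixed_init_states is the illustrative helper sketched above, and the specific values are illustrative.

# Usage sketch: explicit start states and a short rollout horizon.
init_states = mixed_init_states(e, s, num_paths=250)
agent.train_step(N=len(init_states),
                 init_states=init_states,   # must be a list of state vectors
                 horizon=10)                # rollout length for this update
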
3 changes: 2 additions & 1 deletion
@@ -36,7 +36,8 @@
     'policy_size' : (32, 32),
     'inner_steps' : 10,
     'step_size' : 0.05,
-    'update_paths' : 100,
+    'update_paths' : 250,
+    'horizon' : 10,
     'hvp_frac' : None,

 }
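
A hedged reading of the knobs this config hunk touches; how 'horizon' is consumed inside train_step is not shown in this diff, so the comments below are assumptions based on the commit message and the run script that follows.

# Relevant subset of the job config after this commit (values from the diff above)
job_data = {
    'inner_steps'  : 10,    # train_step calls per outer iteration
    'update_paths' : 250,   # rollouts (and hence start states) per update, up from 100
    'horizon'      : 10,    # new: rollout length handed to train_step
}
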
16 changes: 15 additions & 1 deletion mjrl/algos/model_accel/run_experiments/run_model_accel_npg.py
@@ -147,7 +147,21 @@
     # ======================
     agent.fitted_model = models
     for inner_step in range(job_data['inner_steps']):
-        agent.train_step(N=job_data['update_paths'])
+        # Healthy mix of initial states: half come from the MDP initial state
+        # distribution and the remaining half come from the replay buffer,
+        # chosen uniformly at random. The buffer has already been concatenated
+        # into numpy arrays (s, a, sp, r) for model learning.
+        if job_data['device_path'] is None:
+            # can only do this for non-hardware tasks
+            num_states_1, num_states_2 = job_data['update_paths'] // 2, job_data['update_paths'] // 2
+            buffer_rand_idx = np.random.choice(s.shape[0], size=num_states_2, replace=True)
+            init_states_1 = [e.reset() for _ in range(num_states_1)]
+            init_states_2 = list(s[buffer_rand_idx])
+            init_states = init_states_1 + init_states_2
+        else:
+            buffer_rand_idx = np.random.choice(s.shape[0], size=job_data['update_paths'], replace=True)
+            init_states = list(s[buffer_rand_idx])
+        agent.train_step(N=len(init_states), init_states=init_states, horizon=job_data['horizon'])
     print_data = sorted(filter(lambda v: np.asarray(v[1]).size == 1,
                                agent.logger.get_current_log().items()))
     print(tabulate(print_data))
