
Commit 5ca53b4

Chapter 10: Machine Teaching
1 parent 3e4d6be commit 5ca53b4

File tree

4 files changed: +426 −0 lines changed

Chapter10/custom_mcar.py (new file, +100 lines)
import gym
from gym.spaces import Box, Dict
import numpy as np


class MountainCar(gym.Env):
    """MountainCar-v0 wrapped with curriculum lessons, reward shaping, and optional action masking."""

    def __init__(self, env_config={}):
        self.wrapped = gym.make("MountainCar-v0")
        self.action_space = self.wrapped.action_space
        self.t = 0
        self.reward_fun = env_config.get("reward_fun")
        self.lesson = env_config.get("lesson")
        self.use_action_masking = env_config.get("use_action_masking", False)
        self.action_mask = None
        self.reset()
        if self.use_action_masking:
            # Expose a dict observation so the model also receives the action mask.
            self.observation_space = Dict(
                {
                    "action_mask": Box(0, 1, shape=(self.action_space.n,)),
                    "actual_obs": self.wrapped.observation_space,
                }
            )
        else:
            self.observation_space = self.wrapped.observation_space

    def _get_obs(self):
        raw_obs = np.array(self.wrapped.unwrapped.state)
        if self.use_action_masking:
            self.update_avail_actions()
            obs = {
                "action_mask": self.action_mask,
                "actual_obs": raw_obs,
            }
        else:
            obs = raw_obs
        return obs

    def reset(self):
        self.wrapped.reset()
        self.t = 0
        self.wrapped.unwrapped.state = self._get_init_conditions()
        obs = self._get_obs()
        return obs

    def _get_init_conditions(self):
        # Curriculum: lower lesson numbers start the car closer to the goal;
        # lesson 4 (or None) reproduces the original start distribution.
        if self.lesson == 0:
            low = 0.1
            high = 0.4
            velocity = self.wrapped.np_random.uniform(
                low=0, high=self.wrapped.max_speed
            )
        elif self.lesson == 1:
            low = -0.4
            high = 0.1
            velocity = self.wrapped.np_random.uniform(
                low=0, high=self.wrapped.max_speed
            )
        elif self.lesson == 2:
            low = -0.6
            high = -0.4
            velocity = self.wrapped.np_random.uniform(
                low=-self.wrapped.max_speed, high=self.wrapped.max_speed
            )
        elif self.lesson == 3:
            low = -0.6
            high = -0.1
            velocity = self.wrapped.np_random.uniform(
                low=-self.wrapped.max_speed, high=self.wrapped.max_speed
            )
        elif self.lesson == 4 or self.lesson is None:
            low = -0.6
            high = -0.4
            velocity = 0
        else:
            raise ValueError("Invalid lesson: {}".format(self.lesson))
        obs = (self.wrapped.np_random.uniform(low=low, high=high), velocity)
        return obs

    def set_lesson(self, lesson):
        self.lesson = lesson

    def step(self, action):
        self.t += 1
        state, reward, done, info = self.wrapped.step(action)
        if self.reward_fun == "custom_reward":
            position, velocity = state
            # Shaped bonus for progress to the right of the valley bottom (x = -0.5).
            reward += (abs(position + 0.5) ** 2) * (position > -0.5)
        obs = self._get_obs()
        if self.t >= 200:
            done = True
        return obs, reward, done, info

    def update_avail_actions(self):
        self.action_mask = np.array([1.0] * self.action_space.n)
        pos, vel = self.wrapped.unwrapped.state
        # Actions: 0 = push left, 1 = no push, 2 = push right.
        if (pos < -0.3) and (pos > -0.8) and (vel < 0) and (vel > -0.05):
            # Rolling back down slowly: forbid coasting and pushing right,
            # forcing a left push to build momentum.
            self.action_mask[1] = 0
            self.action_mask[2] = 0
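
Not part of the commit, but a minimal usage sketch of the wrapper above: it assumes the file is importable as custom_mcar and that a random policy over the unmasked actions is enough for a smoke test of the lessons and the action mask.

import numpy as np

from custom_mcar import MountainCar

# Lesson 0 starts the car near the goal; action masking switches obs to a dict.
env = MountainCar(
    {"reward_fun": "custom_reward", "lesson": 0, "use_action_masking": True}
)
obs = env.reset()
done = False
while not done:
    # Sample uniformly among actions whose mask entry is 1.
    valid_actions = [a for a in range(env.action_space.n) if obs["action_mask"][a] > 0]
    action = int(np.random.choice(valid_actions))
    obs, reward, done, info = env.step(action)

# Promote to a harder lesson; the next reset() samples from the new start range.
env.set_lesson(1)
obs = env.reset()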

Chapter10/masking_model.py (new file, +39 lines)
from gym.spaces import Box
from ray.rllib.agents.dqn.distributional_q_tf_model import DistributionalQTFModel
from ray.rllib.models.tf.fcnet import FullyConnectedNetwork
from ray.rllib.utils.framework import try_import_tf

tf1, tf, tfv = try_import_tf()


class ParametricActionsModel(DistributionalQTFModel):
    """DQN model that masks out the actions flagged as unavailable by the environment."""

    def __init__(
        self,
        obs_space,
        action_space,
        num_outputs,
        model_config,
        name,
        true_obs_shape=(2,),
        **kw
    ):
        super(ParametricActionsModel, self).__init__(
            obs_space, action_space, num_outputs, model_config, name, **kw
        )
        # The Q-network only sees the actual observation, not the mask.
        self.action_value_model = FullyConnectedNetwork(
            Box(-1, 1, shape=true_obs_shape),
            action_space,
            num_outputs,
            model_config,
            name + "_action_values",
        )
        self.register_variables(self.action_value_model.variables())

    def forward(self, input_dict, state, seq_lens):
        action_mask = input_dict["obs"]["action_mask"]
        action_values, _ = self.action_value_model(
            {"obs": input_dict["obs"]["actual_obs"]}
        )
        # log(0) yields -inf for masked actions, clipped to the smallest float32,
        # so masked actions can never win the argmax.
        inf_mask = tf.maximum(tf.math.log(action_mask), tf.float32.min)
        return action_values + inf_mask, state
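
As a sketch of how this model might be wired up (not shown in this commit): it is registered with RLlib's ModelCatalog and referenced from a DQN config that disables the post-model Q-value layers, following RLlib's parametric-actions examples for this (ray 1.x era) API; the registration key "masking_model" and the stop criterion below are illustrative assumptions.

import ray
from ray import tune
from ray.rllib.models import ModelCatalog

from custom_mcar import MountainCar
from masking_model import ParametricActionsModel

# "masking_model" is an illustrative registration key.
ModelCatalog.register_custom_model("masking_model", ParametricActionsModel)

ray.init()
tune.run(
    "DQN",
    config={
        "env": MountainCar,
        "env_config": {"use_action_masking": True},
        "model": {"custom_model": "masking_model"},
        # Let the masked logits coming out of the model act directly as Q-values.
        "hiddens": [],
        "dueling": False,
    },
    stop={"timesteps_total": 200000},
)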

Chapter10/mcar_demo.py (new file, +133 lines)
#!/usr/bin/env python
import os

import numpy as np
import sys, gym, time

import ray.utils

from ray.rllib.models.preprocessors import get_preprocessor
from ray.rllib.evaluation.sample_batch_builder import SampleBatchBuilder
from ray.rllib.offline.json_writer import JsonWriter

from custom_mcar import MountainCar

DEMO_DATA_DIR = "mcar-out"


def key_press(key, mod):
    global human_agent_action, human_wants_restart, human_sets_pause
    if key == 0xFF0D:  # Enter: restart the episode
        human_wants_restart = True
    if key == 32:  # Space: toggle pause
        human_sets_pause = not human_sets_pause
    a = int(key - ord("0"))
    if a <= 0 or a >= ACTIONS:
        return
    human_agent_action = a


def key_release(key, mod):
    global human_agent_action
    a = int(key - ord("0"))
    if a <= 0 or a >= ACTIONS:
        return
    if human_agent_action == a:
        human_agent_action = 0


def rollout(env, eps_id):
    global human_agent_action, human_wants_restart, human_sets_pause
    human_wants_restart = False
    obs = env.reset()
    prev_action = np.zeros_like(env.action_space.sample())
    prev_reward = 0
    t = 0
    skip = 0
    total_reward = 0
    total_timesteps = 0
    while 1:
        if not skip:
            print("taking action {}".format(human_agent_action))
            a = human_agent_action
            total_timesteps += 1
            skip = SKIP_CONTROL
        else:
            skip -= 1

        new_obs, r, done, info = env.step(a)
        # Record the transition in RLlib's offline data format.
        batch_builder.add_values(
            t=t,
            eps_id=eps_id,
            agent_index=0,
            obs=prep.transform(obs),
            actions=a,
            action_prob=1.0,  # put the true action probability here
            action_logp=0,
            action_dist_inputs=None,
            rewards=r,
            prev_actions=prev_action,
            prev_rewards=prev_reward,
            dones=done,
            infos=info,
            new_obs=prep.transform(new_obs),
        )
        obs = new_obs
        prev_action = a
        prev_reward = r
        t += 1  # advance the per-episode timestep recorded in the batch

        if r != 0:
            print("reward %0.3f" % r)
        total_reward += r
        window_still_open = env.wrapped.render()
        if window_still_open == False:
            return False
        if done:
            break
        if human_wants_restart:
            break
        while human_sets_pause:
            env.wrapped.render()
            time.sleep(0.1)
        time.sleep(0.1)
    print("timesteps %i reward %0.2f" % (total_timesteps, total_reward))
    writer.write(batch_builder.build_and_reset())


if __name__ == "__main__":
    batch_builder = SampleBatchBuilder()  # or MultiAgentSampleBatchBuilder
    writer = JsonWriter(DEMO_DATA_DIR)

    env = MountainCar()

    # RLlib uses preprocessors to implement transforms such as one-hot encoding
    # and flattening of tuple and dict observations. For a plain Box observation
    # like this one a no-op preprocessor is used, but this matters for more
    # complex envs.
    prep = get_preprocessor(env.observation_space)(env.observation_space)
    print("The preprocessor is", prep)

    if not hasattr(env.action_space, "n"):
        raise Exception("Keyboard agent only supports discrete action spaces")
    ACTIONS = env.action_space.n
    # Repeat the previous control decision SKIP_CONTROL extra steps; useful to
    # test how much frame-skip is still playable.
    SKIP_CONTROL = 0

    human_agent_action = 0
    human_wants_restart = False
    human_sets_pause = False

    env.reset()
    env.wrapped.render()
    env.wrapped.unwrapped.viewer.window.on_key_press = key_press
    env.wrapped.unwrapped.viewer.window.on_key_release = key_release

    print("ACTIONS={}".format(ACTIONS))
    print("Press keys 1 2 3 ... to take actions 1 2 3 ...")
    print("No keys pressed is taking action 0")

    for i in range(20):
        window_still_open = rollout(env, i)
        if window_still_open == False:
            break
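
The episodes recorded by this script land as JSON sample batches under mcar-out. One way (not shown in the commit) they could be replayed is through RLlib's offline-data input, e.g. to warm up DQN from the human demonstrations; the config below is an illustrative sketch for the same ray 1.x-era API, and the stop criterion is an assumption.

import ray
from ray import tune

from custom_mcar import MountainCar

ray.init()
tune.run(
    "DQN",
    config={
        "env": MountainCar,
        # Read experiences from the recorded demonstration batches instead of sampling.
        "input": "mcar-out",
        "input_evaluation": [],  # no off-policy estimation over the demo data
    },
    stop={"timesteps_total": 50000},
)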
