[RLlib] Tf2x preparation; part 2 (upgrading try_import_tf()). (ray-project#9136)

* WIP.

* Fixes.

* LINT.

* WIP.

* WIP.

* Fixes.

* Fixes.

* Fixes.

* Fixes.

* WIP.

* Fixes.

* Test

* Fix.

* Fixes and LINT.

* Fixes and LINT.

* LINT.
sven1977 authored Jun 30, 2020
1 parent fb074da commit 43043ee
Showing 125 changed files with 617 additions and 584 deletions.
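
For orientation: the recurring change in the diffs below is that try_import_tf() now returns a triple rather than a single module. A minimal usage sketch follows (assumptions, not shown on this page: tf1 exposes the TF1.x-style API such as placeholders, variable scopes and sessions; tf is the installed TensorFlow module; tfv is its major version; the actual semantics live in ray/rllib/utils/framework.py):

    from ray.rllib.utils.framework import try_import_tf

    # Sketch only; mirrors the assignment pattern used throughout this commit.
    tf1, tf, tfv = try_import_tf()

    if tf is not None:
        # TF1.x-style calls go through tf1, e.g. tf1.placeholder(...),
        # tf1.variable_scope(...), tf1.global_variables_initializer(),
        # while version-agnostic ops stay on tf, e.g. tf.clip_by_value(...)
        # and tf.math.floormod(...).
        print("TensorFlow major version:", tfv)
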
66 changes: 43 additions & 23 deletions python/ray/experimental/tf_utils.py
@@ -4,7 +4,7 @@
from ray.rllib.utils import force_list
from ray.rllib.utils.framework import try_import_tf

tf = try_import_tf()
tf1, tf, tfv = try_import_tf()


def unflatten(vector, shapes):
@@ -79,24 +79,29 @@ def __init__(self, output, sess=None, input_variables=None):
variable_names.append(tf_obj.node_def.name)
self.variables = OrderedDict()
variable_list = [
v for v in tf.global_variables()
v for v in tf1.global_variables()
if v.op.node_def.name in variable_names
]
if input_variables is not None:
variable_list += input_variables
for v in variable_list:
self.variables[v.op.node_def.name] = v

self.placeholders = {}
self.assignment_nodes = {}
if not tf1.executing_eagerly():
for v in variable_list:
self.variables[v.op.node_def.name] = v

# Create new placeholders to put in custom weights.
for k, var in self.variables.items():
self.placeholders[k] = tf.placeholder(
var.value().dtype,
var.get_shape().as_list(),
name="Placeholder_" + k)
self.assignment_nodes[k] = var.assign(self.placeholders[k])
self.placeholders = {}
self.assignment_nodes = {}

# Create new placeholders to put in custom weights.
for k, var in self.variables.items():
self.placeholders[k] = tf1.placeholder(
var.value().dtype,
var.get_shape().as_list(),
name="Placeholder_" + k)
self.assignment_nodes[k] = var.assign(self.placeholders[k])
else:
for v in variable_list:
self.variables[v.name] = v

def set_session(self, sess):
"""Sets the current session used by the class.
@@ -117,10 +122,12 @@ def get_flat_size(self):

def _check_sess(self):
"""Checks if the session is set, and if not throw an error message."""
assert self.sess is not None, ("The session is not set. Set the "
"session either by passing it into the "
"TensorFlowVariables constructor or by "
"calling set_session(sess).")
if tf1.executing_eagerly():
return
assert self.sess is not None, \
"The session is not set. Set the session either by passing it " \
"into the TensorFlowVariables constructor or by calling " \
"set_session(sess)."

def get_flat(self):
"""Gets the weights and returns them as a flat array.
@@ -129,6 +136,11 @@ def get_flat(self):
1D Array containing the flattened weights.
"""
self._check_sess()
# Eager mode.
if not self.sess:
return np.concatenate(
[v.numpy().flatten() for v in self.variables.values()])
# Graph mode.
return np.concatenate([
v.eval(session=self.sess).flatten()
for v in self.variables.values()
@@ -147,12 +159,16 @@ def set_flat(self, new_weights):
self._check_sess()
shapes = [v.get_shape().as_list() for v in self.variables.values()]
arrays = unflatten(new_weights, shapes)
placeholders = [
self.placeholders[k] for k, v in self.variables.items()
]
self.sess.run(
list(self.assignment_nodes.values()),
feed_dict=dict(zip(placeholders, arrays)))
if not self.sess:
for v, a in zip(self.variables.values(), arrays):
v.assign(a)
else:
placeholders = [
self.placeholders[k] for k, v in self.variables.items()
]
self.sess.run(
list(self.assignment_nodes.values()),
feed_dict=dict(zip(placeholders, arrays)))

def get_weights(self):
"""Returns a dictionary containing the weights of the network.
@@ -161,6 +177,10 @@ def get_weights(self):
Dictionary mapping variable names to their weights.
"""
self._check_sess()
# Eager mode.
if not self.sess:
return self.variables
# Graph mode.
return self.sess.run(self.variables)

def set_weights(self, new_weights):
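
A standalone sketch (hypothetical helpers, not the RLlib class itself) of the eager-vs-graph dispatch that the TensorFlowVariables changes above introduce: in eager mode values are read and assigned directly, while graph mode still goes through placeholders, assign ops, and a session.

    import numpy as np
    import tensorflow as tf

    def get_flat(variables):
        # Eager mode: read values directly; the graph-mode branch in the diff
        # above uses v.eval(session=sess) instead.
        return np.concatenate([v.numpy().flatten() for v in variables])

    def set_flat(variables, new_weights):
        # Split the flat vector back into per-variable shapes and assign
        # directly; graph mode instead feeds placeholders into assign ops.
        shapes = [v.get_shape().as_list() for v in variables]
        sizes = [int(np.prod(s)) for s in shapes]
        arrays = np.split(new_weights, np.cumsum(sizes)[:-1])
        for v, a, s in zip(variables, arrays, shapes):
            v.assign(a.reshape(s))

    # Round-trip example with two variables.
    vs = [tf.Variable(tf.zeros([2, 3])), tf.Variable(tf.ones([4]))]
    set_flat(vs, get_flat(vs) + 1.0)
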
8 changes: 8 additions & 0 deletions rllib/BUILD
@@ -344,6 +344,7 @@ py_test(
args = ["--yaml-dir=tuned_examples/sac", "--torch"]
)


# TD3
py_test(
name = "run_regression_tests_pendulum_td3_tf",
@@ -1013,6 +1014,13 @@ py_test(
srcs = ["models/tests/test_distributions.py"]
)

py_test(
name = "test_attention_nets",
tags = ["models"],
size = "small",
srcs = ["models/tests/test_attention_nets.py"]
)

# --------------------------------------------------------------------
# Optimizers and Memories
# rllib/execution/
2 changes: 1 addition & 1 deletion rllib/agents/a3c/a3c_tf_policy.py
@@ -9,7 +9,7 @@
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.tf_ops import explained_variance, make_tf_callable

tf = try_import_tf()
tf1, tf, tfv = try_import_tf()


class A3CLoss:
8 changes: 4 additions & 4 deletions rllib/agents/ars/ars_tf_policy.py
@@ -13,7 +13,7 @@
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.spaces.space_utils import unbatch

tf = try_import_tf()
tf1, tf, tfv = try_import_tf()


class ARSTFPolicy:
@@ -29,8 +29,8 @@ def __init__(self, obs_space, action_space, config):
self.single_threaded = config.get("single_threaded", False)
self.sess = make_session(single_threaded=self.single_threaded)

self.inputs = tf.placeholder(tf.float32,
[None] + list(self.preprocessor.shape))
self.inputs = tf1.placeholder(tf.float32,
[None] + list(self.preprocessor.shape))

# Policy network.
dist_class, dist_dim = ModelCatalog.get_action_dist(
@@ -52,7 +52,7 @@ def __init__(self, obs_space, action_space, config):
self.num_params = sum(
np.prod(variable.shape.as_list())
for _, variable in self.variables.variables.items())
self.sess.run(tf.global_variables_initializer())
self.sess.run(tf1.global_variables_initializer())

def compute_actions(self,
observation,
2 changes: 1 addition & 1 deletion rllib/agents/ddpg/ddpg_tf_model.py
@@ -3,7 +3,7 @@
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.utils.framework import try_import_tf

tf = try_import_tf()
tf1, tf, tfv = try_import_tf()


class DDPGTFModel(TFModelV2):
40 changes: 20 additions & 20 deletions rllib/agents/ddpg/ddpg_tf_policy.py
@@ -22,7 +22,7 @@
from ray.rllib.utils.tf_ops import huber_loss, minimize_and_clip, \
make_tf_callable

tf = try_import_tf()
tf1, tf, tfv = try_import_tf()

logger = logging.getLogger(__name__)

@@ -126,18 +126,18 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
target_model_out_tp1, _ = policy.target_model(input_dict_next, [], None)

# Policy network evaluation.
with tf.variable_scope(POLICY_SCOPE, reuse=True):
# prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
with tf1.variable_scope(POLICY_SCOPE, reuse=True):
# prev_update_ops = set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS))
policy_t = model.get_policy_output(model_out_t)
# policy_batchnorm_update_ops = list(
# set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)
# set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)

with tf.variable_scope(POLICY_TARGET_SCOPE):
with tf1.variable_scope(POLICY_TARGET_SCOPE):
policy_tp1 = \
policy.target_model.get_policy_output(target_model_out_tp1)

# Action outputs.
with tf.variable_scope(ACTION_SCOPE, reuse=True):
with tf1.variable_scope(ACTION_SCOPE, reuse=True):
if policy.config["smooth_target_policy"]:
target_noise_clip = policy.config["target_noise_clip"]
clipped_normal_sample = tf.clip_by_value(
@@ -154,29 +154,29 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
policy_tp1_smoothed = policy_tp1

# Q-net(s) evaluation.
# prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
with tf.variable_scope(Q_SCOPE):
# prev_update_ops = set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS))
with tf1.variable_scope(Q_SCOPE):
# Q-values for given actions & observations in given current
q_t = model.get_q_values(model_out_t, train_batch[SampleBatch.ACTIONS])

with tf.variable_scope(Q_SCOPE, reuse=True):
with tf1.variable_scope(Q_SCOPE, reuse=True):
# Q-values for current policy (no noise) in given current state
q_t_det_policy = model.get_q_values(model_out_t, policy_t)

if twin_q:
with tf.variable_scope(TWIN_Q_SCOPE):
with tf1.variable_scope(TWIN_Q_SCOPE):
twin_q_t = model.get_twin_q_values(
model_out_t, train_batch[SampleBatch.ACTIONS])
# q_batchnorm_update_ops = list(
# set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)
# set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)

# Target q-net(s) evaluation.
with tf.variable_scope(Q_TARGET_SCOPE):
with tf1.variable_scope(Q_TARGET_SCOPE):
q_tp1 = policy.target_model.get_q_values(target_model_out_tp1,
policy_tp1_smoothed)

if twin_q:
with tf.variable_scope(TWIN_Q_TARGET_SCOPE):
with tf1.variable_scope(TWIN_Q_TARGET_SCOPE):
twin_q_tp1 = policy.target_model.get_twin_q_values(
target_model_out_tp1, policy_tp1_smoothed)

@@ -220,10 +220,10 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
if l2_reg is not None:
for var in policy.model.policy_variables():
if "bias" not in var.name:
actor_loss += (l2_reg * tf.nn.l2_loss(var))
actor_loss += (l2_reg * tf1.nn.l2_loss(var))
for var in policy.model.q_variables():
if "bias" not in var.name:
critic_loss += (l2_reg * tf.nn.l2_loss(var))
critic_loss += (l2_reg * tf1.nn.l2_loss(var))

# Model self-supervised losses.
if policy.config["use_state_preprocessor"]:
@@ -259,9 +259,9 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):

def make_ddpg_optimizers(policy, config):
# Create separate optimizers for actor & critic losses.
policy._actor_optimizer = tf.train.AdamOptimizer(
policy._actor_optimizer = tf1.train.AdamOptimizer(
learning_rate=config["actor_lr"])
policy._critic_optimizer = tf.train.AdamOptimizer(
policy._critic_optimizer = tf1.train.AdamOptimizer(
learning_rate=config["critic_lr"])
return None

@@ -286,7 +286,7 @@ def build_apply_op(policy, optimizer, grads_and_vars):
# For policy gradient, update policy net one time v.s.
# update critic net `policy_delay` time(s).
should_apply_actor_opt = tf.equal(
tf.mod(policy.global_step, policy.config["policy_delay"]), 0)
tf.math.floormod(policy.global_step, policy.config["policy_delay"]), 0)

def make_apply_op():
return policy._actor_optimizer.apply_gradients(
@@ -299,7 +299,7 @@ def make_apply_op():
critic_op = policy._critic_optimizer.apply_gradients(
policy._critic_grads_and_vars)
# Increment global step & apply ops.
with tf.control_dependencies([tf.assign_add(policy.global_step, 1)]):
with tf1.control_dependencies([tf1.assign_add(policy.global_step, 1)]):
return tf.group(actor_op, critic_op)


@@ -341,7 +341,7 @@ def build_ddpg_stats(policy, batch):

def before_init_fn(policy, obs_space, action_space, config):
# Create global step for counting the number of update operations.
policy.global_step = tf.train.get_or_create_global_step()
policy.global_step = tf1.train.get_or_create_global_step()


class ComputeTDErrorMixin:
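
A small standalone illustration (stand-in names, not RLlib code) of the tf.math.floormod gate used in build_apply_op above: actor gradients are applied only on every policy_delay-th global step, while the critic updates on every step.

    import tensorflow as tf

    policy_delay = 2
    global_step = tf.Variable(0, dtype=tf.int64)

    for _ in range(5):
        should_apply_actor_opt = tf.equal(
            tf.math.floormod(global_step, policy_delay), 0)
        # Prints True on steps 0, 2, 4, ... and False otherwise.
        tf.print(global_step, should_apply_actor_opt)
        global_step.assign_add(1)
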
8 changes: 4 additions & 4 deletions rllib/agents/ddpg/ddpg_torch_policy.py
@@ -49,10 +49,10 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
target_model_out_tp1, _ = policy.target_model(input_dict_next, [], None)

# Policy network evaluation.
# prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
# prev_update_ops = set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS))
policy_t = model.get_policy_output(model_out_t)
# policy_batchnorm_update_ops = list(
# set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)
# set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)

policy_tp1 = \
policy.target_model.get_policy_output(target_model_out_tp1)
@@ -73,7 +73,7 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
policy_tp1_smoothed = policy_tp1

# Q-net(s) evaluation.
# prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
# prev_update_ops = set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS))
# Q-values for given actions & observations in given current
q_t = model.get_q_values(model_out_t, train_batch[SampleBatch.ACTIONS])

@@ -86,7 +86,7 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
twin_q_t = model.get_twin_q_values(model_out_t,
train_batch[SampleBatch.ACTIONS])
# q_batchnorm_update_ops = list(
# set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)
# set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)

# Target q-net(s) evaluation.
q_tp1 = policy.target_model.get_q_values(target_model_out_tp1,
2 changes: 1 addition & 1 deletion rllib/agents/ddpg/noop_model.py
@@ -4,7 +4,7 @@
from ray.rllib.utils.annotations import override
from ray.rllib.utils.framework import try_import_tf

tf = try_import_tf()
_, tf, _ = try_import_tf()


class NoopModel(TFModelV2):
11 changes: 7 additions & 4 deletions rllib/agents/ddpg/tests/test_td3.py
@@ -6,7 +6,7 @@
from ray.rllib.utils.test_utils import check, check_compute_single_action, \
framework_iterator

tf = try_import_tf()
tf1, tf, tfv = try_import_tf()


class TestTD3(unittest.TestCase):
@@ -32,8 +32,9 @@ def test_td3_exploration_and_with_random_prerun(self):

# Test against all frameworks.
for _ in framework_iterator(config, frameworks="tf"):
lcl_config = config.copy()
# Default GaussianNoise setup.
trainer = td3.TD3Trainer(config=config, env="Pendulum-v0")
trainer = td3.TD3Trainer(config=lcl_config, env="Pendulum-v0")
# Setting explore=False should always return the same action.
a_ = trainer.compute_action(obs, explore=False)
for _ in range(50):
@@ -44,17 +45,18 @@ def test_td3_exploration_and_with_random_prerun(self):
for _ in range(50):
actions.append(trainer.compute_action(obs))
check(np.std(actions), 0.0, false=True)
trainer.stop()

# Check randomness at beginning.
config["exploration_config"] = {
lcl_config["exploration_config"] = {
# Act randomly at beginning ...
"random_timesteps": 30,
# Then act very closely to deterministic actions thereafter.
"stddev": 0.001,
"initial_scale": 0.001,
"final_scale": 0.001,
}
trainer = td3.TD3Trainer(config=config, env="Pendulum-v0")
trainer = td3.TD3Trainer(config=lcl_config, env="Pendulum-v0")
# ts=1 (get a deterministic action as per explore=False).
deterministic_action = trainer.compute_action(obs, explore=False)
# ts=2-5 (in random window).
@@ -73,6 +75,7 @@ def test_td3_exploration_and_with_random_prerun(self):
for _ in range(50):
a = trainer.compute_action(obs, explore=False)
check(a, deterministic_action)
trainer.stop()


if __name__ == "__main__":
