[RLlib] Tf2x preparation; part 2 (upgrading try_import_tf()). (ray-project#9136)

* WIP.

* Fixes.

* LINT.

* WIP.

* WIP.

* Fixes.

* Fixes.

* Fixes.

* Fixes.

* WIP.

* Fixes.

* Test

* Fix.

* Fixes and LINT.

* Fixes and LINT.

* LINT.
sven1977 authored Jun 30, 2020
1 parent fb074da commit 43043ee
Showing 125 changed files with 617 additions and 584 deletions.
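
For orientation: the recurring change in the diffs below is that try_import_tf() now returns a triple rather than a single module. A minimal usage sketch follows (assumptions, not shown on this page: tf1 exposes the TF1.x-style API such as placeholders, variable scopes and sessions; tf is the installed TensorFlow module; tfv is its major version; the actual semantics live in ray/rllib/utils/framework.py):

    from ray.rllib.utils.framework import try_import_tf

    # Sketch only; mirrors the assignment pattern used throughout this commit.
    tf1, tf, tfv = try_import_tf()

    if tf is not None:
        # TF1.x-style calls go through tf1, e.g. tf1.placeholder(...),
        # tf1.variable_scope(...), tf1.global_variables_initializer(),
        # while version-agnostic ops stay on tf, e.g. tf.clip_by_value(...)
        # and tf.math.floormod(...).
        print("TensorFlow major version:", tfv)
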
66 changes: 43 additions & 23 deletions python/ray/experimental/tf_utils.py
@@ -4,7 +4,7 @@
from ray.rllib.utils import force_list
from ray.rllib.utils.framework import try_import_tf

tf = try_import_tf()
tf1, tf, tfv = try_import_tf()


def unflatten(vector, shapes):
@@ -79,24 +79,29 @@ def __init__(self, output, sess=None, input_variables=None):
variable_names.append(tf_obj.node_def.name)
self.variables = OrderedDict()
variable_list = [
v for v in tf.global_variables()
v for v in tf1.global_variables()
if v.op.node_def.name in variable_names
]
if input_variables is not None:
variable_list += input_variables
for v in variable_list:
self.variables[v.op.node_def.name] = v

self.placeholders = {}
self.assignment_nodes = {}
if not tf1.executing_eagerly():
for v in variable_list:
self.variables[v.op.node_def.name] = v

# Create new placeholders to put in custom weights.
for k, var in self.variables.items():
self.placeholders[k] = tf.placeholder(
var.value().dtype,
var.get_shape().as_list(),
name="Placeholder_" + k)
self.assignment_nodes[k] = var.assign(self.placeholders[k])
self.placeholders = {}
self.assignment_nodes = {}

# Create new placeholders to put in custom weights.
for k, var in self.variables.items():
self.placeholders[k] = tf1.placeholder(
var.value().dtype,
var.get_shape().as_list(),
name="Placeholder_" + k)
self.assignment_nodes[k] = var.assign(self.placeholders[k])
else:
for v in variable_list:
self.variables[v.name] = v

def set_session(self, sess):
"""Sets the current session used by the class.
@@ -117,10 +122,12 @@ def get_flat_size(self):

def _check_sess(self):
"""Checks if the session is set, and if not throw an error message."""
assert self.sess is not None, ("The session is not set. Set the "
"session either by passing it into the "
"TensorFlowVariables constructor or by "
"calling set_session(sess).")
if tf1.executing_eagerly():
return
assert self.sess is not None, \
"The session is not set. Set the session either by passing it " \
"into the TensorFlowVariables constructor or by calling " \
"set_session(sess)."

def get_flat(self):
"""Gets the weights and returns them as a flat array.
@@ -129,6 +136,11 @@ def get_flat(self):
1D Array containing the flattened weights.
"""
self._check_sess()
# Eager mode.
if not self.sess:
return np.concatenate(
[v.numpy().flatten() for v in self.variables.values()])
# Graph mode.
return np.concatenate([
v.eval(session=self.sess).flatten()
for v in self.variables.values()
@@ -147,12 +159,16 @@ def set_flat(self, new_weights):
self._check_sess()
shapes = [v.get_shape().as_list() for v in self.variables.values()]
arrays = unflatten(new_weights, shapes)
placeholders = [
self.placeholders[k] for k, v in self.variables.items()
]
self.sess.run(
list(self.assignment_nodes.values()),
feed_dict=dict(zip(placeholders, arrays)))
if not self.sess:
for v, a in zip(self.variables.values(), arrays):
v.assign(a)
else:
placeholders = [
self.placeholders[k] for k, v in self.variables.items()
]
self.sess.run(
list(self.assignment_nodes.values()),
feed_dict=dict(zip(placeholders, arrays)))

def get_weights(self):
"""Returns a dictionary containing the weights of the network.
@@ -161,6 +177,10 @@ def get_weights(self):
Dictionary mapping variable names to their weights.
"""
self._check_sess()
# Eager mode.
if not self.sess:
return self.variables
# Graph mode.
return self.sess.run(self.variables)

def set_weights(self, new_weights):
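
A standalone sketch (hypothetical helpers, not the RLlib class itself) of the eager-vs-graph dispatch that the TensorFlowVariables changes above introduce: in eager mode values are read and assigned directly, while graph mode still goes through placeholders, assign ops, and a session.

    import numpy as np
    import tensorflow as tf

    def get_flat(variables):
        # Eager mode: read values directly; the graph-mode branch in the diff
        # above uses v.eval(session=sess) instead.
        return np.concatenate([v.numpy().flatten() for v in variables])

    def set_flat(variables, new_weights):
        # Split the flat vector back into per-variable shapes and assign
        # directly; graph mode instead feeds placeholders into assign ops.
        shapes = [v.get_shape().as_list() for v in variables]
        sizes = [int(np.prod(s)) for s in shapes]
        arrays = np.split(new_weights, np.cumsum(sizes)[:-1])
        for v, a, s in zip(variables, arrays, shapes):
            v.assign(a.reshape(s))

    # Round-trip example with two variables.
    vs = [tf.Variable(tf.zeros([2, 3])), tf.Variable(tf.ones([4]))]
    set_flat(vs, get_flat(vs) + 1.0)
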
8 changes: 8 additions & 0 deletions rllib/BUILD
@@ -344,6 +344,7 @@ py_test(
args = ["--yaml-dir=tuned_examples/sac", "--torch"]
)


# TD3
py_test(
name = "run_regression_tests_pendulum_td3_tf",
@@ -1013,6 +1014,13 @@ py_test(
srcs = ["models/tests/test_distributions.py"]
)

py_test(
name = "test_attention_nets",
tags = ["models"],
size = "small",
srcs = ["models/tests/test_attention_nets.py"]
)

# --------------------------------------------------------------------
# Optimizers and Memories
# rllib/execution/
2 changes: 1 addition & 1 deletion rllib/agents/a3c/a3c_tf_policy.py
@@ -9,7 +9,7 @@
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.tf_ops import explained_variance, make_tf_callable

tf = try_import_tf()
tf1, tf, tfv = try_import_tf()


class A3CLoss:
8 changes: 4 additions & 4 deletions rllib/agents/ars/ars_tf_policy.py
@@ -13,7 +13,7 @@
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.spaces.space_utils import unbatch

tf = try_import_tf()
tf1, tf, tfv = try_import_tf()


class ARSTFPolicy:
@@ -29,8 +29,8 @@ def __init__(self, obs_space, action_space, config):
self.single_threaded = config.get("single_threaded", False)
self.sess = make_session(single_threaded=self.single_threaded)

self.inputs = tf.placeholder(tf.float32,
[None] + list(self.preprocessor.shape))
self.inputs = tf1.placeholder(tf.float32,
[None] + list(self.preprocessor.shape))

# Policy network.
dist_class, dist_dim = ModelCatalog.get_action_dist(
@@ -52,7 +52,7 @@ def __init__(self, obs_space, action_space, config):
self.num_params = sum(
np.prod(variable.shape.as_list())
for _, variable in self.variables.variables.items())
self.sess.run(tf.global_variables_initializer())
self.sess.run(tf1.global_variables_initializer())

def compute_actions(self,
observation,
2 changes: 1 addition & 1 deletion rllib/agents/ddpg/ddpg_tf_model.py
@@ -3,7 +3,7 @@
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.utils.framework import try_import_tf

tf = try_import_tf()
tf1, tf, tfv = try_import_tf()


class DDPGTFModel(TFModelV2):
40 changes: 20 additions & 20 deletions rllib/agents/ddpg/ddpg_tf_policy.py
@@ -22,7 +22,7 @@
from ray.rllib.utils.tf_ops import huber_loss, minimize_and_clip, \
make_tf_callable

tf = try_import_tf()
tf1, tf, tfv = try_import_tf()

logger = logging.getLogger(__name__)

@@ -126,18 +126,18 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
target_model_out_tp1, _ = policy.target_model(input_dict_next, [], None)

# Policy network evaluation.
with tf.variable_scope(POLICY_SCOPE, reuse=True):
# prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
with tf1.variable_scope(POLICY_SCOPE, reuse=True):
# prev_update_ops = set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS))
policy_t = model.get_policy_output(model_out_t)
# policy_batchnorm_update_ops = list(
# set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)
# set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)

with tf.variable_scope(POLICY_TARGET_SCOPE):
with tf1.variable_scope(POLICY_TARGET_SCOPE):
policy_tp1 = \
policy.target_model.get_policy_output(target_model_out_tp1)

# Action outputs.
with tf.variable_scope(ACTION_SCOPE, reuse=True):
with tf1.variable_scope(ACTION_SCOPE, reuse=True):
if policy.config["smooth_target_policy"]:
target_noise_clip = policy.config["target_noise_clip"]
clipped_normal_sample = tf.clip_by_value(
@@ -154,29 +154,29 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
policy_tp1_smoothed = policy_tp1

# Q-net(s) evaluation.
# prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
with tf.variable_scope(Q_SCOPE):
# prev_update_ops = set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS))
with tf1.variable_scope(Q_SCOPE):
# Q-values for given actions & observations in given current
q_t = model.get_q_values(model_out_t, train_batch[SampleBatch.ACTIONS])

with tf.variable_scope(Q_SCOPE, reuse=True):
with tf1.variable_scope(Q_SCOPE, reuse=True):
# Q-values for current policy (no noise) in given current state
q_t_det_policy = model.get_q_values(model_out_t, policy_t)

if twin_q:
with tf.variable_scope(TWIN_Q_SCOPE):
with tf1.variable_scope(TWIN_Q_SCOPE):
twin_q_t = model.get_twin_q_values(
model_out_t, train_batch[SampleBatch.ACTIONS])
# q_batchnorm_update_ops = list(
# set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)
# set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)

# Target q-net(s) evaluation.
with tf.variable_scope(Q_TARGET_SCOPE):
with tf1.variable_scope(Q_TARGET_SCOPE):
q_tp1 = policy.target_model.get_q_values(target_model_out_tp1,
policy_tp1_smoothed)

if twin_q:
with tf.variable_scope(TWIN_Q_TARGET_SCOPE):
with tf1.variable_scope(TWIN_Q_TARGET_SCOPE):
twin_q_tp1 = policy.target_model.get_twin_q_values(
target_model_out_tp1, policy_tp1_smoothed)

@@ -220,10 +220,10 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
if l2_reg is not None:
for var in policy.model.policy_variables():
if "bias" not in var.name:
actor_loss += (l2_reg * tf.nn.l2_loss(var))
actor_loss += (l2_reg * tf1.nn.l2_loss(var))
for var in policy.model.q_variables():
if "bias" not in var.name:
critic_loss += (l2_reg * tf.nn.l2_loss(var))
critic_loss += (l2_reg * tf1.nn.l2_loss(var))

# Model self-supervised losses.
if policy.config["use_state_preprocessor"]:
@@ -259,9 +259,9 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):

def make_ddpg_optimizers(policy, config):
# Create separate optimizers for actor & critic losses.
policy._actor_optimizer = tf.train.AdamOptimizer(
policy._actor_optimizer = tf1.train.AdamOptimizer(
learning_rate=config["actor_lr"])
policy._critic_optimizer = tf.train.AdamOptimizer(
policy._critic_optimizer = tf1.train.AdamOptimizer(
learning_rate=config["critic_lr"])
return None

@@ -286,7 +286,7 @@ def build_apply_op(policy, optimizer, grads_and_vars):
# For policy gradient, update policy net one time v.s.
# update critic net `policy_delay` time(s).
should_apply_actor_opt = tf.equal(
tf.mod(policy.global_step, policy.config["policy_delay"]), 0)
tf.math.floormod(policy.global_step, policy.config["policy_delay"]), 0)

def make_apply_op():
return policy._actor_optimizer.apply_gradients(
@@ -299,7 +299,7 @@ def make_apply_op():
critic_op = policy._critic_optimizer.apply_gradients(
policy._critic_grads_and_vars)
# Increment global step & apply ops.
with tf.control_dependencies([tf.assign_add(policy.global_step, 1)]):
with tf1.control_dependencies([tf1.assign_add(policy.global_step, 1)]):
return tf.group(actor_op, critic_op)


@@ -341,7 +341,7 @@ def build_ddpg_stats(policy, batch):

def before_init_fn(policy, obs_space, action_space, config):
# Create global step for counting the number of update operations.
policy.global_step = tf.train.get_or_create_global_step()
policy.global_step = tf1.train.get_or_create_global_step()


class ComputeTDErrorMixin:
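
A small standalone illustration (stand-in names, not RLlib code) of the tf.math.floormod gate used in build_apply_op above: actor gradients are applied only on every policy_delay-th global step, while the critic updates on every step.

    import tensorflow as tf

    policy_delay = 2
    global_step = tf.Variable(0, dtype=tf.int64)

    for _ in range(5):
        should_apply_actor_opt = tf.equal(
            tf.math.floormod(global_step, policy_delay), 0)
        # Prints True on steps 0, 2, 4, ... and False otherwise.
        tf.print(global_step, should_apply_actor_opt)
        global_step.assign_add(1)
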
8 changes: 4 additions & 4 deletions rllib/agents/ddpg/ddpg_torch_policy.py
@@ -49,10 +49,10 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
target_model_out_tp1, _ = policy.target_model(input_dict_next, [], None)

# Policy network evaluation.
# prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
# prev_update_ops = set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS))
policy_t = model.get_policy_output(model_out_t)
# policy_batchnorm_update_ops = list(
# set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)
# set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)

policy_tp1 = \
policy.target_model.get_policy_output(target_model_out_tp1)
@@ -73,7 +73,7 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
policy_tp1_smoothed = policy_tp1

# Q-net(s) evaluation.
# prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
# prev_update_ops = set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS))
# Q-values for given actions & observations in given current
q_t = model.get_q_values(model_out_t, train_batch[SampleBatch.ACTIONS])

@@ -86,7 +86,7 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
twin_q_t = model.get_twin_q_values(model_out_t,
train_batch[SampleBatch.ACTIONS])
# q_batchnorm_update_ops = list(
# set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)
# set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)

# Target q-net(s) evaluation.
q_tp1 = policy.target_model.get_q_values(target_model_out_tp1,
2 changes: 1 addition & 1 deletion rllib/agents/ddpg/noop_model.py
@@ -4,7 +4,7 @@
from ray.rllib.utils.annotations import override
from ray.rllib.utils.framework import try_import_tf

tf = try_import_tf()
_, tf, _ = try_import_tf()


class NoopModel(TFModelV2):
11 changes: 7 additions & 4 deletions rllib/agents/ddpg/tests/test_td3.py
@@ -6,7 +6,7 @@
from ray.rllib.utils.test_utils import check, check_compute_single_action, \
framework_iterator

tf = try_import_tf()
tf1, tf, tfv = try_import_tf()


class TestTD3(unittest.TestCase):
@@ -32,8 +32,9 @@ def test_td3_exploration_and_with_random_prerun(self):

# Test against all frameworks.
for _ in framework_iterator(config, frameworks="tf"):
lcl_config = config.copy()
# Default GaussianNoise setup.
trainer = td3.TD3Trainer(config=config, env="Pendulum-v0")
trainer = td3.TD3Trainer(config=lcl_config, env="Pendulum-v0")
# Setting explore=False should always return the same action.
a_ = trainer.compute_action(obs, explore=False)
for _ in range(50):
@@ -44,17 +45,18 @@ def test_td3_exploration_and_with_random_prerun(self):
for _ in range(50):
actions.append(trainer.compute_action(obs))
check(np.std(actions), 0.0, false=True)
trainer.stop()

# Check randomness at beginning.
config["exploration_config"] = {
lcl_config["exploration_config"] = {
# Act randomly at beginning ...
"random_timesteps": 30,
# Then act very closely to deterministic actions thereafter.
"stddev": 0.001,
"initial_scale": 0.001,
"final_scale": 0.001,
}
trainer = td3.TD3Trainer(config=config, env="Pendulum-v0")
trainer = td3.TD3Trainer(config=lcl_config, env="Pendulum-v0")
# ts=1 (get a deterministic action as per explore=False).
deterministic_action = trainer.compute_action(obs, explore=False)
# ts=2-5 (in random window).
@@ -73,6 +75,7 @@ def test_td3_exploration_and_with_random_prerun(self):
for _ in range(50):
a = trainer.compute_action(obs, explore=False)
check(a, deterministic_action)
trainer.stop()


if __name__ == "__main__":
