add MDP_Env and test DQNAgent
GongXudong committed Aug 22, 2018
1 parent 28f5416 commit c2309b9
Showing 4 changed files with 179 additions and 40 deletions.
132 changes: 97 additions & 35 deletions Hierarchical_DQN/DQNAgent.py
@@ -5,68 +5,103 @@
import random
from common.schedules import LinearSchedule
from Hierarchical_DQN.replay_buffer import ReplayBuffer

class DQNAgent(object):
    """
    refs: https://github.com/skumar9876/Hierarchical-DQN/blob/master/dqn.py
    """
    def __init__(self, states_n, actions_n, hidden_layers, scope_name, sess=None, learning_rate=0.001,
                 discount=0.98, replay_memory_size=100000, batch_size=32, begin_train=1000,
                 targetnet_update_freq=1000, epsilon_start=1.0, epsilon_end=0.1, epsilon_decay_step=50000,
                 seed=1, logdir='logs'):
"""
:param states_n: tuple
:param actions_n: int
:param hidden_layers: list
:param scope_name: str
:param sess: tf.Session
:param learning_rate: float
:param discount: float
:param replay_memory_size: int
:param batch_size: int
:param begin_train: int
:param targetnet_update_freq: int
:param epsilon_start: float
:param epsilon_end: float
:param epsilon_decay_step: int
:param seed: int
:param logdir: str
"""
self.states_n = states_n
self.actions_n = actions_n
self._hidden_layers = hidden_layers
self.lr = learning_rate

self._target_net_update_freq = targetnet_update_freq
self._current_time_step = 0
self._epsilon_schedule = LinearSchedule(epsilon_decay_step, epsilon_end, epsilon_start)
self._train_batch_size = batch_size
self._begin_train = begin_train
self._gamma = discount

self.qnet_optimizer = tf.train.AdamOptimizer(self.lr)

self._replay_buffer = ReplayBuffer(replay_memory_size)

self._seed(seed)

with tf.Graph().as_default():
self._build_graph()
self._merged_summary = tf.summary.merge_all()
self._saver = tf.train.Saver()
if sess is None:
self.sess = tf.Session()
else:
self.sess = sess
self.sess.run(tf.global_variables_initializer())

            self._summary_writer = tf.summary.FileWriter(logdir=logdir)
            self._summary_writer.add_graph(tf.get_default_graph())

    def _q_network(self, state, hidden_layers, outputs, scope_name, trainable):

with tf.variable_scope(scope_name):
out = state
            for ly in hidden_layers:
                out = layers.fully_connected(out, ly, activation_fn=tf.nn.relu, trainable=trainable)
            out = layers.fully_connected(out, outputs, activation_fn=None, trainable=trainable)
return out

def _build_graph(self):
        self._state = tf.placeholder(dtype=tf.float32, shape=(None, ) + self.states_n, name='state_input')

        self._q_values = self._q_network(self._state, self._hidden_layers, self.actions_n, 'q_network', True)
        self._target_q_values = self._q_network(self._state, self._hidden_layers, self.actions_n, 'target_q_network', False)

with tf.variable_scope('q_network_update'):
            self._actions_onehot = tf.placeholder(dtype=tf.float32, shape=(None, self.actions_n), name='actions_onehot_input')
            self._td_targets = tf.placeholder(dtype=tf.float32, shape=(None, ), name='td_targets')
            self._q_values_pred = tf.reduce_sum(self._q_values * self._actions_onehot, axis=1)

            self._error = tf.abs(self._q_values_pred - self._td_targets)
            quadratic_part = tf.clip_by_value(self._error, 0.0, 1.0)
            linear_part = self._error - quadratic_part
            self._loss = tf.reduce_mean(0.5 * tf.square(quadratic_part) + linear_part)

            qnet_gradients = self.qnet_optimizer.compute_gradients(self._loss, tf.trainable_variables())
            for i, (grad, var) in enumerate(qnet_gradients):
                if grad is not None:
                    qnet_gradients[i] = (tf.clip_by_norm(grad, 10), var)
            self.train_op = self.qnet_optimizer.apply_gradients(qnet_gradients)

            tf.summary.scalar('loss', self._loss)

with tf.name_scope('target_network_update'):
            q_network_params = [t for t in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_network')
                                if t.name.startswith('q_network/')]
target_q_network_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_network')

self.target_update_ops = []
@@ -76,28 +76,55 @@ def _build_graph(self):
self.target_update_ops = tf.group(*self.target_update_ops)

def choose_action(self, state):
self._current_time_step += 1

if np.random.random() < self._epsilon_schedule.value(self._current_time_step):
return np.random.randint(0, self.actions_n)
else:
            q_values = self.sess.run(self._q_values, feed_dict={self._state: state[None]})

            return np.argmax(q_values[0])

def store(self, state, action, reward, next_state, terminate):
self._replay_buffer.add(state, action, reward, next_state, terminate)

    def train(self):
        self._current_time_step += 1

if self._current_time_step == 1:
print('Training starts.')
self.sess.run(self.target_update_ops)

if self._current_time_step > self._begin_train:
states, actions, rewards, next_states, terminates = self._replay_buffer.sample(batch_size=self._train_batch_size)

actions_onehot = np.zeros((self._train_batch_size, self.actions_n))
for i in range(self._train_batch_size):
actions_onehot[i, actions[i]] = 1.

next_state_q_values = self.sess.run(self._q_values, feed_dict={self._state: next_states})
next_state_target_q_values = self.sess.run(self._target_q_values, feed_dict={self._state: next_states})

next_select_actions = np.argmax(next_state_q_values, axis=1)
next_select_actions_onehot = np.zeros((self._train_batch_size, self.actions_n))
for i in range(self._train_batch_size):
next_select_actions_onehot[i, next_select_actions[i]] = 1.

next_state_max_q_values = np.sum(next_state_target_q_values * next_select_actions_onehot, axis=1)

td_targets = rewards + self._gamma * next_state_max_q_values * (1 - terminates)

_, str_ = self.sess.run([self.train_op, self._merged_summary], feed_dict={self._state: states,
self._actions_onehot: actions_onehot,
self._td_targets: td_targets})

self._summary_writer.add_summary(str_, self._current_time_step)

if self._current_time_step % self._target_net_update_freq == 0:
self.sess.run(self.target_update_ops)

def _seed(self, lucky_number):
tf.set_random_seed(lucky_number)
np.random.seed(lucky_number)
random.seed(lucky_number)
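
Note on the new training update, as a sketch rather than part of the diff: train() forms Double DQN targets (the online network picks the greedy next action, the target network evaluates it), and the quadratic/linear split in _build_graph is the Huber loss with delta = 1. A minimal NumPy illustration with hypothetical placeholder values:

import numpy as np

batch_size, actions_n, gamma = 32, 2, 0.98                 # hypothetical values
rewards = np.zeros(batch_size)
terminates = np.zeros(batch_size)
q_next_online = np.random.randn(batch_size, actions_n)     # q_values run on next_states (online net)
q_next_target = np.random.randn(batch_size, actions_n)     # target_q_values run on next_states

# Double DQN: the online net selects the action, the target net evaluates it.
greedy_actions = np.argmax(q_next_online, axis=1)
bootstrap = q_next_target[np.arange(batch_size), greedy_actions]
td_targets = rewards + gamma * bootstrap * (1.0 - terminates)

# Huber loss (delta = 1), equivalent to the quadratic/linear split in _build_graph.
def huber(error, delta=1.0):
    abs_err = np.abs(error)
    quadratic = np.minimum(abs_err, delta)
    linear = abs_err - quadratic
    return 0.5 * quadratic ** 2 + linear

loss = huber(np.random.randn(batch_size) - td_targets).mean()   # stand-in for q_values_pred - td_targets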




15 changes: 10 additions & 5 deletions Hierarchical_DQN/Env/Stochastic_MDP.py
@@ -1,15 +1,20 @@
import gym
import numpy as np
import random

class StochasticMDPEnv(gym.Env):

def __init__(self):
self.visited_six = False
self.current_state = 2

self.observation_space = gym.spaces.Box(low=np.array([1]), high=np.array([6]))
self.action_space = gym.spaces.Discrete(2)

def reset(self):
self.visited_six = False
self.current_state = 2
        return np.array([self.current_state])

def step(self, action):
if self.current_state != 1:
@@ -27,8 +32,8 @@ def step(self, action):
self.visited_six = True
if self.current_state == 1:
            if self.visited_six:
                return np.array([self.current_state]), 1.00, True, {}
            else:
                return np.array([self.current_state]), 1.00/100.00, True, {}
        else:
            return np.array([self.current_state]), 0.0, False, {}
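
Because StochasticMDPEnv now subclasses gym.Env and returns Gym-style (observation, reward, done, info) tuples, it can be driven like any other environment. A minimal random-policy rollout, as a sketch (the import path is the one used by t_MDP.py below):

from Hierarchical_DQN.Env.Stochastic_MDP import StochasticMDPEnv

env = StochasticMDPEnv()
state = env.reset()                      # np.array([2]): the fixed start state
done, episode_reward = False, 0.0
while not done:
    action = env.action_space.sample()   # Discrete(2): 0 or 1
    state, reward, done, info = env.step(action)
    episode_reward += reward
print('episode reward:', episode_reward)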
38 changes: 38 additions & 0 deletions Hierarchical_DQN/t_CartPole.py
@@ -0,0 +1,38 @@
import gym
import time
import numpy as np
import tensorflow as tf
from Hierarchical_DQN.DQNAgent import DQNAgent

DEBUG = False

if __name__ == '__main__':

env = gym.make('CartPole-v0')
print((None,) + env.observation_space.shape, env.action_space.n)
agent = DQNAgent(env.observation_space.shape, env.action_space.n, [64], 'cartpole',
epsilon_decay_step=10000, epsilon_end=0.02, replay_memory_size=50000,
learning_rate=5e-4)

for i in range(10000):
state = env.reset()
episode_len = 0
episode_reward = 0
while True:
if DEBUG:
env.render()
action = agent.choose_action(state=state)
next_state, reward, done, _ = env.step(action)
agent.store(state, action, reward, next_state, float(done))
agent.train()

episode_len += 1
episode_reward += reward

state = next_state

if done:
print('episode_{}: len {} reward {}'.format(i, episode_len, episode_reward))
break

env.close()
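
A possible follow-up, not part of this commit: once the loop above has trained for a while, the greedy policy can be checked by querying the online network directly. A sketch that would sit before env.close(), using the agent's internal session and tensors since DQNAgent does not expose a deterministic act method:

# Hypothetical greedy rollout for evaluation (placed before env.close()).
state = env.reset()
eval_reward, done = 0.0, False
while not done:
    q_values = agent.sess.run(agent._q_values, feed_dict={agent._state: state[None]})
    action = int(np.argmax(q_values[0]))
    state, reward, done, _ = env.step(action)
    eval_reward += reward
print('greedy evaluation reward:', eval_reward)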
34 changes: 34 additions & 0 deletions Hierarchical_DQN/t_MDP.py
@@ -0,0 +1,34 @@
import gym
import time
import numpy as np
import tensorflow as tf
from Hierarchical_DQN.DQNAgent import DQNAgent
from Hierarchical_DQN.Env.Stochastic_MDP import StochasticMDPEnv

if __name__ == '__main__':

env = StochasticMDPEnv()
agent = DQNAgent(env.observation_space.shape, env.action_space.n, [32, 32, 32], 'smdp',
epsilon_decay_step=10000, epsilon_end=0.02, replay_memory_size=50000,
learning_rate=5e-4)

for i in range(10000):
state = env.reset()
episode_len = 0
episode_reward = 0
while True:
action = agent.choose_action(state=state)
next_state, reward, done, _ = env.step(action)
agent.store(state, action, reward, next_state, float(done))
agent.train()

episode_len += 1
episode_reward += reward

state = next_state

if done:
print('episode_{}: len {} reward {}'.format(i, episode_len, episode_reward))
break

env.close()
