
Commit 43aa9ab
feat: implement double dqn
committed Jan 2, 2018
1 parent bcafdf7

5 files changed: +95 −47 lines
 

‎agent.py

+53 −13

@@ -1,26 +1,44 @@
+import sys
 import numpy as np
 import warnings
 import utils
+from enum import Enum
 from time import time, sleep
 import matplotlib.pyplot as plt
 from policy import EpsGreedyPolicy
 from memory import ExperienceReplay
+from keras.models import Sequential
+from keras.layers import *
+from keras.optimizers import *
+from keras.models import load_model
 
-class Agent:
-    def __init__(self, game, model, nb_epoch=10000, memory_size=1000, batch_size=50, nb_frames=4, epsilon=1., discount=.9, learning_rate=.1):
-
-        channels = model.input_shape[1]
+TEST = 0
+SIMPLE = 1
+DOUBLE = 2
 
-        if nb_frames != channels:
-            warnings.warn("Dimension mismatch: Using number of channels for number of frames")
-            nb_frames = channels
+class Agent:
+    def __init__(self, game, mode=SIMPLE, nb_epoch=10000, memory_size=1000, batch_size=50, nb_frames=4, epsilon=1., discount=.9, learning_rate=.1, model=None):
 
         self.game = game
-        self.model = model
+        self.mode = mode
+        self.target_model = None
+        self.rows, self.columns = game.field_shape()
         self.nb_epoch = nb_epoch
         self.nb_frames = nb_frames
         self.nb_actions = game.nb_actions()
 
+        if mode == TEST:
+            print('Test Mode: Loading model...')
+            self.model = load_model(model)
+        elif mode == SIMPLE:
+            print('Using Plain DQN: Building model...')
+            self.model = self.build_model()
+        elif mode == DOUBLE:
+            print('Using Double DQN: Building primary and target model...')
+            self.model = self.build_model()
+            self.target_model = self.build_model()
+            self.update_target_model()
+
         # Trades off the importance of sooner versus later rewards.
         # A factor of 0 means it rather prefers immediate rewards
         # and it will mostly consider current rewards. A factor of 1
@@ -39,18 +57,34 @@ def __init__(self, game, model, nb_epoch=10000, memory_size=1000, batch_size=50,
         # a random action by the probability 'eps'. Without this policy the network
         # is greedy and it settles with the first effective strategy it finds.
         # Hence, we introduce certain randomness.
-        # Epsilon reaches its minimum at 2/3 of the games
+        # Epsilon reaches its minimum at 1/2 of the games
         epsilon_end = self.nb_epoch - (self.nb_epoch / 2)
         self.policy = EpsGreedyPolicy(self.model, epsilon_end, self.nb_actions, epsilon, .1)
 
         # Create new experience replay memory. Without this optimization
         # the training takes extremely long even on a GPU and most
         # importantly the approximation of Q-values using non-linear
         # functions, that is used for our NN, is not very stable.
-        self.memory = ExperienceReplay(self.model, self.nb_actions, memory_size, batch_size, self.discount, self.learning_rate)
+        self.memory = ExperienceReplay(self.model, self.target_model, self.nb_actions, memory_size, batch_size, self.discount, self.learning_rate)
 
         self.frames = None
 
+    def build_model(self):
+        model = Sequential()
+        model.add(Conv2D(32, (2, 2), activation='relu', input_shape=(self.nb_frames, self.rows, self.columns), data_format="channels_first"))
+        model.add(Conv2D(64, (2, 2), activation='relu'))
+        model.add(Conv2D(64, (3, 3), activation='relu'))
+        model.add(Flatten())
+        model.add(Dropout(0.1))
+        model.add(Dense(512, activation='relu'))
+        model.add(Dense(self.nb_actions))
+        model.compile(Adam(), 'MSE')
+
+        return model
+
+    def update_target_model(self):
+        self.target_model.set_weights(self.model.get_weights())
+
     def get_frames(self):
         frame = self.game.get_state()
         if self.frames is None:
@@ -85,7 +119,8 @@ def print_stats(self, data, y_label, x_label='Epoch', marker='-'):
         path = './plots/{name}_{size}x{size}_{timestamp}'
         fig.savefig(path.format(size=self.game.grid_size, name=file_name, timestamp=int(time())))
 
-    def train(self, visualize=True):
+    def train(self, update_freq=10):
+        total_steps = 0
         max_steps = self.game.grid_size**2 * 3
         loops = 0
         nb_wins = 0
@@ -119,6 +154,7 @@ def train(self, visualize=True):
 
                 cumulative_reward += reward
                 steps += 1
+                total_steps += 1
 
                 if steps == max_steps and not done:
                     loops += 1
@@ -145,6 +181,9 @@ def train(self, visualize=True):
                 if done:
                     duration = utils.get_time_difference(start_time, time())
 
+                if self.mode == DOUBLE and self.target_model is not None and total_steps % (update_freq) == 0:
+                    self.update_target_model()
+
             current_epoch = epoch + 1
             reward_buffer.append([current_epoch, cumulative_reward])
             duration_buffer.append([current_epoch, duration])
@@ -160,8 +199,9 @@ def train(self, visualize=True):
         self.print_stats(steps_buffer, 'Steps per Game')
         self.print_stats(wins_buffer, 'Wins')
 
-        path = './models/model_{size}x{size}_{epochs}_{timestamp}.h5'
-        self.model.save(path.format(size=self.game.grid_size, epochs=self.nb_epoch, timestamp=int(time())))
+        path = './models/model_{mode}_{size}x{size}_{epochs}_{timestamp}.h5'
+        mode = 'dqn' if self.mode == SIMPLE else 'ddqn'
+        self.model.save(path.format(mode=mode, size=self.game.grid_size, epochs=self.nb_epoch, timestamp=int(time())))
 
     def play(self, nb_games=5, interval=.7):
         nb_wins = 0
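
As a quick orientation, the sketch below shows how the reworked Agent could be driven directly from Python. The imports, the DOUBLE mode constant, and the method signatures come from this diff; the concrete constructor values, and the Snake arguments mirroring main.py, are illustrative assumptions only.

from games import Snake
from agent import Agent, DOUBLE

# Illustrative values; defaults mirror the argparse setup in main.py (assumed).
game = Snake(grid_size=10, walls=False)

# DOUBLE builds a primary and a target network and syncs them via update_target_model().
agent = Agent(game, mode=DOUBLE, nb_epoch=10000, memory_size=1000, batch_size=50,
              nb_frames=4, epsilon=1., discount=.9, learning_rate=.1)

# update_freq controls how often the target network is refreshed from the primary one.
agent.train(update_freq=10)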

‎main.py

+9 −25

@@ -1,17 +1,15 @@
-from keras.models import Sequential
-from keras.layers import *
-from keras.optimizers import *
 from games import Snake
-from keras.models import load_model
-from agent import Agent
+from agent import Agent, TEST
 import argparse
 
 boolean = lambda x: (str(x).lower() == 'true')
 
-# Command line arguments
+# Command line argumentss
 parser = argparse.ArgumentParser()
 parser.add_argument("--train", nargs='?', type=boolean, const=True, default=True)
 parser.add_argument("--model", nargs='?', const=True)
+parser.add_argument("--mode", nargs='?', type=int, const=True, default=1, choices=[0,1,2])
+parser.add_argument("--update-freq", nargs='?', type=int, const=True, default=10)
 parser.add_argument("--grid-size", nargs='?', type=int, const=True, default=10)
 parser.add_argument("--frames", nargs='?', type=int, const=True, default=4)
 parser.add_argument("--epochs", nargs='?', type=int, const=True, default=10000)
@@ -27,14 +25,13 @@
 args = parser.parse_args()
 
 if not args.train and args.model is None:
-    parser.error("Non-training mode requires a model")
+    parser.error("Non-training mode requires a model")
 
 print(args)
 
 game = Snake(grid_size=args.grid_size, walls=args.walls)
 
 # Hyper parameter for the neural net and the agent
-rows, columns = game.field_shape()
 nb_frames = args.frames
 nb_epoch = args.epochs
 memory_size = args.memory_size
@@ -43,25 +40,12 @@
 discount = args.discount
 learning_rate = args.learning_rate
 nb_actions = game.nb_actions()
+mode = args.mode if args.train else TEST
+update_freq = args.update_freq
 
-model = None
+agent = Agent(game, mode, nb_epoch, memory_size, batch_size, nb_frames, epsilon, discount, learning_rate, model=args.model)
 
 if args.train:
-    model = Sequential()
-    model.add(Conv2D(32, (2, 2), activation='relu', input_shape=(nb_frames, rows, columns), data_format="channels_first"))
-    model.add(Conv2D(64, (2, 2), activation='relu'))
-    model.add(Conv2D(64, (3, 3), activation='relu'))
-    model.add(Flatten())
-    model.add(Dropout(0.1))
-    model.add(Dense(512, activation='relu'))
-    model.add(Dense(nb_actions))
-    model.compile(Adam(), 'MSE')
-else:
-    model = load_model(args.model)
-
-agent = Agent(game, model, nb_epoch, memory_size, batch_size, nb_frames, epsilon, discount, learning_rate)
-
-if args.train:
-    agent.train()
+    agent.train(update_freq=update_freq)
 else:
     agent.play(nb_games=args.games, interval=args.interval)
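
For reference, two example invocations of the new flags (the flag names come straight from the argparse calls above; the values and the model filename are hypothetical):

# Train a Double DQN (mode 2) with a target-network update frequency of 10
python main.py --mode 2 --update-freq 10 --grid-size 10 --epochs 10000

# Play with a previously saved model (non-training mode requires --model)
python main.py --train false --model ./models/model_ddqn_10x10_10000_<timestamp>.h5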

‎memory.py

+33 −9

@@ -2,9 +2,10 @@
 from random import sample
 
 class ExperienceReplay():
-    def __init__(self, model, nb_actions, memory_size=100, batch_size=50, discount=.9, learning_rate=.1):
+    def __init__(self, model, target_model, nb_actions, memory_size=100, batch_size=50, discount=.9, learning_rate=.1):
         self.memory = []
         self.model = model
+        self.target_model = target_model
         self.nb_actions = nb_actions
         self.memory_size = memory_size
         self.batch_size = batch_size
@@ -57,7 +58,7 @@ def get_batch(self):
         q_t = self.model.predict(batch)
 
         # q-values for the next states (states_tn)
-        q_tn = self.get_q_next(q_t, batch_size)
+        q_tn = self.get_q_next(q_t, states_tn, batch_size)
 
         # Delta (learning rate). Determines how aggressively
         # the q-values should be updated. 1 means very a
@@ -68,18 +69,41 @@ def get_batch(self):
 
         inputs = states_t
 
-        # Update q-values for states_t given the reward and the max q-value for states_tn
+        # Update q-values based on the next states (states_tn)
        # q_t[:batch_size] = q-values for the current states (states_t)
        targets = (1 - delta) * q_t[:batch_size] + delta * (rewards + self.discount * (1 - done) * q_tn)
 
        return inputs, targets
 
-    def get_q_next(self, q_t, batch_size):
-        # Take max q-value from each next state (state_tn) and reshape into
-        # [[ .5 .5 .5 .5 .5 ] | max q for state_tn[0]
-        #  [ .2 .2 .2 .2 .2 ] | max q for state_tn[1]
-        #  ... #state_tn ]
-        return np.max(q_t[batch_size:], axis=1).repeat(self.nb_actions).reshape((batch_size, self.nb_actions))
+    def get_q_next(self, q_t, states_tn, batch_size):
+        if not self.target_model:
+            # Plain DQN
+            # A single network for action selection and generation of target q-values
+            # Take max q-value from each next state (state_tn) and reshape into
+            # [[ .5 .5 .5 .5 .5 ] | max q for state_tn[0]
+            #  [ .2 .2 .2 .2 .2 ] | max q for state_tn[1]
+            #  ... #state_tn ]
+            q_next = np.max(q_t[batch_size:], axis=1)
+        else:
+            # Double DQN
+            # The problem with plain DQN is that it tends to overestimate the q-values due to the
+            # 'max' used in the formula to update the targets. The 'max' leads to a positive bias
+            # because the highest q-value is propagated to previous states.
+            # The solution is to have two separate networks, one primary network for determining the
+            # action and a second (target) network to generate the target q-values for that action.
+            # By decoupling the action choice from the target Q-value generation, we are able to
+            # substantially reduce the overestimation, and train faster and more reliably.
+
+            # Select max action from primary network (from states_tn)
+            next_actions = np.argmax(q_t[batch_size:], axis=1)
+
+            # Generate target q-values with secondary (target) network
+            target_q_values = self.target_model.predict(states_tn)
+
+            # Take the highest q-values
+            q_next = target_q_values[range(batch_size), next_actions]
+
+        return q_next.repeat(self.nb_actions).reshape((batch_size, self.nb_actions))
 
     def extract_transition(self, experience, batch_size):
         input_dim = self.input_dim
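
To make the comment block in get_q_next() concrete, here is a small, self-contained numpy sketch of the Double DQN target computation. The toy arrays and names are placeholders and do not depend on this repository's classes; they only mirror the roles of q_t, states_tn, and the target model above.

import numpy as np

batch_size, nb_actions = 3, 5
discount, delta = .9, .1

q_t = np.random.rand(batch_size, nb_actions)            # Q(s, .) from the primary network
q_tn_primary = np.random.rand(batch_size, nb_actions)   # Q(s', .) from the primary network
q_tn_target = np.random.rand(batch_size, nb_actions)    # Q(s', .) from the target network
rewards = np.random.rand(batch_size, 1)
done = np.zeros((batch_size, 1))                         # 1 where the episode ended

# Double DQN: the primary network picks the next action, the target network scores it
next_actions = np.argmax(q_tn_primary, axis=1)
q_next = q_tn_target[np.arange(batch_size), next_actions].reshape(batch_size, 1)

# Same soft update as get_batch(): blend the old estimate with the bootstrapped target
targets = (1 - delta) * q_t + delta * (rewards + discount * (1 - done) * q_next)

The repeat/reshape at the end of get_q_next() spreads q_next across all nb_actions columns before blending, which is equivalent to the column broadcasting used in this sketch.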
2 binary files changed (contents not shown; one adds 11.4 MB).
