Small bug fixes, historical policy gradient competition
DanielSlater committed Sep 25, 2016
1 parent 69d7305 commit 164fb93
Showing 8 changed files with 269 additions and 13 deletions.
16 changes: 15 additions & 1 deletion base_game_spec.py
@@ -30,6 +30,20 @@ def board_dimensions(self):
def board_squares(self):
return reduce(operator.mul, self.board_dimensions(), 1)

def outputs(self):
"""The number of moves that could be made in this kind of game, weather or not they are legal. For most games
this will be every single square on the board, but for connect 4 this is different. If we wanted to do chess in
the future this method may need to get a bit more complicated.
Returns:
int
"""
return self.board_squares()

def flat_move_to_tuple(self, move_index):
board_x = self.board_dimensions()[0]
        return move_index // board_x, move_index % board_x

def play_game(self, plus_player_func, minus_player_func, log=False):
"""Run a single game of until the end, using the provided function args to determine the moves for each
player.
@@ -86,4 +100,4 @@ def get_random_player_func(self):
Returns:
board_state, side (int) -> move : function that plays this game by making random moves
"""
        return lambda board_state, side: random.choice(list(self.available_moves(board_state)))
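
The new outputs() and flat_move_to_tuple() helpers let a network emit one value per candidate move and then turn the chosen flat index back into a board coordinate. A minimal sketch of that mapping, assuming a 3x3 board for illustration (not part of the committed files):

# Illustrative only: map a flat move index back to a coordinate tuple,
# mirroring flat_move_to_tuple above (3x3 board assumed for this example).
board_x = 3  # board width

for move_index in (0, 4, 8):
    print(move_index, '->', (move_index // board_x, move_index % board_x))
# prints: 0 -> (0, 0)   4 -> (1, 1)   8 -> (2, 2)
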
14 changes: 8 additions & 6 deletions games/connect_4.py
@@ -114,13 +114,13 @@ def has_winner(board_state, winning_length=4):
# check diagonals
diagonals_start = -(board_width - winning_length)
diagonals_end = (board_width - winning_length)
for d in range(diagonals_start, diagonals_end):
for d in range(diagonals_start, diagonals_end+1):
winner = _has_winning_line(
(board_state[i][i + d] for i in range(max(-d, 0), min(board_width, board_height - d))),
winning_length)
if winner != 0:
return winner
for d in range(diagonals_start, diagonals_end):
for d in range(diagonals_start, diagonals_end+1):
winner = _has_winning_line(
(board_state[i][board_height - i - d - 1] for i in range(max(-d, 0), min(board_width, board_height - d))),
winning_length)
@@ -205,18 +205,20 @@ def __init__(self, board_width, board_height, winning_length):
self.available_moves = available_moves
self.apply_move = apply_move

self.new_board.__doc__ = _new_board.__doc__
self.has_winner.__doc__ = has_winner.__doc__

def new_board(self):
return _new_board(self._board_width, self._board_height)

    def has_winner(self, board_state):
return has_winner(self._board_size, self._winning_length)
        return has_winner(board_state, self._winning_length)

def board_dimensions(self):
return self._board_width, self._board_height

def flat_move_to_tuple(self, move_index):
return move_index

def outputs(self):
return self._board_width

if __name__ == '__main__':
# example of playing a game
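
The diagonals_end+1 change matters because range excludes its upper bound: with the old code, a board whose width equals the winning length has diagonals_start == diagonals_end == 0, so range(0, 0) is empty and the main diagonal is never checked. A small sketch of the corrected enumeration (assumed 4-wide board, not part of the commit):

# Illustrative sketch of the off-by-one: on an assumed 4-wide board with a
# winning length of 4, the old range checked no diagonal offsets at all.
board_width, winning_length = 4, 4
diagonals_start = -(board_width - winning_length)
diagonals_end = board_width - winning_length

print(list(range(diagonals_start, diagonals_end)))      # [] - main diagonal skipped
print(list(range(diagonals_start, diagonals_end + 1)))  # [0] - main diagonal included
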
5 changes: 1 addition & 4 deletions games/tic_tac_toe_x.py
@@ -200,14 +200,11 @@ def __init__(self, board_size, winning_length):
self.available_moves = available_moves
self.apply_move = apply_move

self.new_board.__doc__ = _new_board.__doc__
self.has_winner.__doc__ = has_winner.__doc__

def new_board(self):
return _new_board(self._board_size)

    def has_winner(self, board_state):
return has_winner(self._board_size, self._winning_length)
        return has_winner(board_state, self._winning_length)

def board_dimensions(self):
return self._board_size, self._board_size
84 changes: 84 additions & 0 deletions network_helpers.py
@@ -0,0 +1,84 @@
import tensorflow as tf
import numpy as np
import pickle


def create_network(input_nodes, hidden_nodes, output_nodes=None):
"""Create a network with relu activations at each layer
Args:
output_nodes: (int): Number of output nodes, if None then number of input nodes is used
input_nodes (int): The size of the board this network will work on. The output layer will also be this size
hidden_nodes ([int]): The number of hidden nodes in each hidden layer
Returns:
(input_layer, output_layer, [variables]) : The final item in the tuple is a list containing all the parameters,
wieghts and biases used in this network
"""
output_nodes = output_nodes or input_nodes
variables = []

with tf.name_scope('network'):
input_layer = tf.placeholder("float", (None, input_nodes))

current_layer = input_layer

        for hidden_layer_nodes in hidden_nodes:
            last_layer_nodes = int(current_layer.get_shape()[-1])
            hidden_weights = tf.Variable(
                tf.truncated_normal((last_layer_nodes, hidden_layer_nodes), stddev=1. / np.sqrt(last_layer_nodes)),
                name='weights')
            hidden_bias = tf.Variable(tf.constant(0.01, shape=(hidden_layer_nodes,)), name='biases')

            variables.append(hidden_weights)
            variables.append(hidden_bias)

            current_layer = tf.nn.relu(
                tf.matmul(current_layer, hidden_weights) + hidden_bias)

        last_hidden_nodes = int(current_layer.get_shape()[-1])
        output_weights = tf.Variable(
            tf.truncated_normal((last_hidden_nodes, output_nodes), stddev=1. / np.sqrt(last_hidden_nodes)),
            name="output_weights")
        output_bias = tf.Variable(tf.constant(0.01, shape=(output_nodes,)), name="output_bias")

variables.append(output_weights)
variables.append(output_bias)

output_layer = tf.nn.softmax(
tf.matmul(current_layer, output_weights) + output_bias)

return input_layer, output_layer, variables


def save_network(session, variables, file_path):
variable_values = session.run(variables)
    with open(file_path, mode='wb') as f:
pickle.dump(variable_values, f)


def load_network(session, tf_variables, file_path):
    with open(file_path, mode='rb') as f:
variable_values = pickle.load(f)
for value, tf_variable in zip(variable_values, tf_variables):
session.run(tf_variable.assign(value))


def invert_board_state(board_state):
    return tuple(tuple(-board_state[j][i] for i in range(len(board_state[0]))) for j in range(len(board_state)))


def get_stochastic_network_move(session, input_layer, output_layer, board_state, side):
board_state_flat = np.ravel(board_state)
if side == -1:
board_state_flat = -board_state_flat

    probability_of_actions = session.run(output_layer,
                                         feed_dict={input_layer: [board_state_flat]})[0]

try:
move = np.random.multinomial(1, probability_of_actions)
except ValueError:
        # sometimes, because of rounding errors, probability_of_actions sums to slightly more than 1,
        # so renormalise it to make it a valid probability distribution
move = np.random.multinomial(1, probability_of_actions / (sum(probability_of_actions) + 1e-7))

return move
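
As a hedged usage sketch (not part of the commit), the helpers above can be wired together as below, using the old TensorFlow API already used elsewhere in this commit (tf.initialize_all_variables); the board size and file name are assumptions for the example:

# Sketch only: build a small network, sample a stochastic move for an empty
# 3x3 board, and save the weights (file name is made up for illustration).
import numpy as np
import tensorflow as tf
from network_helpers import create_network, save_network, get_stochastic_network_move

input_layer, output_layer, variables = create_network(9, (100, 100))

with tf.Session() as session:
    session.run(tf.initialize_all_variables())

    empty_board = np.zeros((3, 3))
    move = get_stochastic_network_move(session, input_layer, output_layer, empty_board, 1)
    print(move.argmax())  # flat index of the sampled move; a real game would still need to check legality

    save_network(session, variables, 'example_network.p')
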
123 changes: 123 additions & 0 deletions policy_gradient_historical_competition.py
@@ -0,0 +1,123 @@
import collections
import functools
import os
import random

import numpy as np
import tensorflow as tf

from games.tic_tac_toe_x import TicTacToeXGameSpec
from network_helpers import create_network, load_network, get_stochastic_network_move, \
save_network

NUMBER_OF_HISTORICAL_COPIES_TO_KEEP = 8
NUMBER_OF_GAMES_TO_PLAY = 1000000
MINI_BATCH_SIZE = 100
SAVE_HISTORICAL_NETWORK_EVERY = 100000
STARTING_NETWORK_WEIGHTS = 'current_network.p'
BASE_HISTORICAL_NETWORK_PATH = 'historical_network_'
HIDDEN_NODES = (100, 80, 60, 40)
PRINT_RESULTS_EVERY_X = 500
LEARN_RATE = 1e-4
game_spec = TicTacToeXGameSpec(5, 4)

input_layer, output_layer, variables = create_network(game_spec.board_squares(), HIDDEN_NODES,
output_nodes=game_spec.outputs())

reward_placeholder = tf.placeholder("float", shape=(None,))
actual_move_placeholder = tf.placeholder("float", shape=(None, game_spec.board_squares()))
policy_gradient = tf.reduce_sum(tf.reshape(reward_placeholder, (-1, 1)) * actual_move_placeholder * output_layer)
train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(-policy_gradient)

current_historical_index = 0
historical_networks = []

mini_batch_moves = []
mini_batch_board_states = []
mini_batch_rewards = []
results = collections.deque(maxlen=PRINT_RESULTS_EVERY_X)

for _ in range(NUMBER_OF_HISTORICAL_COPIES_TO_KEEP):
historical_input_layer, historical_output_layer, historical_variables = create_network(game_spec.board_squares(),
HIDDEN_NODES)
historical_networks.append((historical_input_layer, historical_output_layer, historical_variables))

with tf.Session() as session:
session.run(tf.initialize_all_variables())


    def make_move_historical(historical_network_index, board_state, side):
        net = historical_networks[historical_network_index]
move = get_stochastic_network_move(session, net[0], net[1], board_state, side)
return game_spec.flat_move_to_tuple(move.argmax())


def make_training_move(board_state, side):
mini_batch_board_states.append(np.ravel(board_state) * side)
move = get_stochastic_network_move(session, input_layer, output_layer, board_state, side)
mini_batch_moves.append(move)
return game_spec.flat_move_to_tuple(move.argmax())


if os.path.isfile(STARTING_NETWORK_WEIGHTS):
print("loading pre existing weights")
load_network(session, variables, STARTING_NETWORK_WEIGHTS)
else:
print("could not find previous weights so initialising randomly")

    for i in range(NUMBER_OF_HISTORICAL_COPIES_TO_KEEP):
        if os.path.isfile(BASE_HISTORICAL_NETWORK_PATH + str(i) + '.p'):
            load_network(session, historical_networks[i][2], BASE_HISTORICAL_NETWORK_PATH + str(i) + '.p')

for episode_number in range(1, NUMBER_OF_GAMES_TO_PLAY):
opponent_index = random.randint(0, NUMBER_OF_HISTORICAL_COPIES_TO_KEEP-1)
make_move_historical_for_index = functools.partial(make_move_historical, opponent_index)

# randomize if going first or second
if bool(random.getrandbits(1)):
reward = game_spec.play_game(make_training_move, make_move_historical_for_index)
else:
reward = game_spec.play_game(make_move_historical_for_index, make_training_move)

results.append(reward)
if len(results) > PRINT_RESULTS_EVERY_X:
results.popleft()

last_game_length = len(mini_batch_board_states) - len(mini_batch_rewards)

        # we scale the reward by the game length so winning quickly is better than winning slowly
        # and losing slowly is better than losing quickly
reward /= float(last_game_length)

mini_batch_rewards += ([reward] * last_game_length)

episode_number += 1

if episode_number % MINI_BATCH_SIZE == 0:
normalized_rewards = mini_batch_rewards - np.mean(mini_batch_rewards)
normalized_rewards /= np.std(normalized_rewards)

session.run(train_step, feed_dict={input_layer: mini_batch_board_states,
reward_placeholder: normalized_rewards,
actual_move_placeholder: mini_batch_moves})

# clear batches
del mini_batch_board_states[:]
del mini_batch_moves[:]
del mini_batch_rewards[:]

if episode_number % PRINT_RESULTS_EVERY_X == 0:
print("episode: %s average result: %s" % (episode_number, np.mean(results)))

if episode_number % SAVE_HISTORICAL_NETWORK_EVERY == 0:
print("saving historical network %s", current_historical_index)
save_network(session, variables, BASE_HISTORICAL_NETWORK_PATH + str(current_historical_index) + '.p')
load_network(session, historical_networks[current_historical_index][2],
BASE_HISTORICAL_NETWORK_PATH + str(current_historical_index) + '.p')

current_historical_index += 1
current_historical_index %= NUMBER_OF_HISTORICAL_COPIES_TO_KEEP

# save our final weights
save_network(session, variables, STARTING_NETWORK_WEIGHTS)

print("completed")
2 changes: 1 addition & 1 deletion scripts/policy_gradient.py
@@ -60,7 +60,7 @@ def make_move(board_state, side):
actual_moves.append(move)

move_index = move.argmax()
return move_index / 3, move_index % 3
return game_spec.flat_move_to_tuple(move_index)


while True:
1 change: 0 additions & 1 deletion scripts/supervised_training.py
@@ -1,4 +1,3 @@
import collections
import numpy as np
import tensorflow as tf

37 changes: 37 additions & 0 deletions tests/test_network_helpers.py
@@ -0,0 +1,37 @@
import os
from unittest import TestCase
import tensorflow as tf
import numpy as np
from network_helpers import create_network, save_network, load_network


class TestNetworkHelpers(TestCase):
def test_create_network(self):
input_nodes = 20
hidden_nodes = (50, 40, 30)
input_layer, output_layer, variables = create_network(input_nodes, hidden_nodes)
self.assertSequenceEqual(input_layer.get_shape().as_list(), [None, input_nodes])
self.assertSequenceEqual(output_layer.get_shape().as_list(), [None, input_nodes])
self.assertEqual(len(variables), (len(hidden_nodes) + 1) * 2)

def test_save_and_load_network(self):
try:
file_name = 'test.p'
input_nodes = 20
hidden_nodes = (50, 40, 30)
_, _, variables1 = create_network(input_nodes, hidden_nodes)
_, _, variables2 = create_network(input_nodes, hidden_nodes)

with tf.Session() as session:
session.run(tf.initialize_all_variables())

save_network(session, variables1, file_name)
load_network(session, variables2, file_name)

for var1, var2 in zip(variables1, variables2):
np.testing.assert_array_almost_equal(session.run(var1), session.run(var2))
finally:
try:
os.remove(file_name)
except OSError:
pass
