Small bug fixes, historical policy gradient competition
DanielSlater committed Sep 25, 2016
1 parent 69d7305 commit 164fb93
Showing 8 changed files with 269 additions and 13 deletions.
16 changes: 15 additions & 1 deletion base_game_spec.py
@@ -30,6 +30,20 @@ def board_dimensions(self):
def board_squares(self):
return reduce(operator.mul, self.board_dimensions(), 1)

def outputs(self):
"""The number of moves that could be made in this kind of game, weather or not they are legal. For most games
this will be every single square on the board, but for connect 4 this is different. If we wanted to do chess in
the future this method may need to get a bit more complicated.
Returns:
int
"""
return self.board_squares()

def flat_move_to_tuple(self, move_index):
board_x = self.board_dimensions()[0]
        return move_index // board_x, move_index % board_x

def play_game(self, plus_player_func, minus_player_func, log=False):
"""Run a single game of until the end, using the provided function args to determine the moves for each
player.
@@ -86,4 +100,4 @@ def get_random_player_func(self):
Returns:
board_state, side (int) -> move : function that plays this game by making random moves
"""
        return lambda board_state, side: random.choice(list(self.available_moves(board_state)))
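
The new outputs() and flat_move_to_tuple() helpers let a network emit one value per candidate move and then turn the chosen flat index back into a board coordinate. A minimal sketch of that mapping, assuming a 3x3 board for illustration (not part of the committed files):

# Illustrative only: map a flat move index back to a coordinate tuple,
# mirroring flat_move_to_tuple above (3x3 board assumed for this example).
board_x = 3  # board width

for move_index in (0, 4, 8):
    print(move_index, '->', (move_index // board_x, move_index % board_x))
# prints: 0 -> (0, 0)   4 -> (1, 1)   8 -> (2, 2)
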
14 changes: 8 additions & 6 deletions games/connect_4.py
@@ -114,13 +114,13 @@ def has_winner(board_state, winning_length=4):
# check diagonals
diagonals_start = -(board_width - winning_length)
diagonals_end = (board_width - winning_length)
for d in range(diagonals_start, diagonals_end):
for d in range(diagonals_start, diagonals_end+1):
winner = _has_winning_line(
(board_state[i][i + d] for i in range(max(-d, 0), min(board_width, board_height - d))),
winning_length)
if winner != 0:
return winner
for d in range(diagonals_start, diagonals_end):
for d in range(diagonals_start, diagonals_end+1):
winner = _has_winning_line(
(board_state[i][board_height - i - d - 1] for i in range(max(-d, 0), min(board_width, board_height - d))),
winning_length)
@@ -205,18 +205,20 @@ def __init__(self, board_width, board_height, winning_length):
self.available_moves = available_moves
self.apply_move = apply_move

self.new_board.__doc__ = _new_board.__doc__
self.has_winner.__doc__ = has_winner.__doc__

def new_board(self):
return _new_board(self._board_width, self._board_height)

    def has_winner(self, board_state):
return has_winner(self._board_size, self._winning_length)
        return has_winner(board_state, self._winning_length)

def board_dimensions(self):
return self._board_width, self._board_height

def flat_move_to_tuple(self, move_index):
return move_index

def outputs(self):
return self._board_width

if __name__ == '__main__':
# example of playing a game
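
The diagonals_end+1 change matters because range excludes its upper bound: with the old code, a board whose width equals the winning length has diagonals_start == diagonals_end == 0, so range(0, 0) is empty and the main diagonal is never checked. A small sketch of the corrected enumeration (assumed 4-wide board, not part of the commit):

# Illustrative sketch of the off-by-one: on an assumed 4-wide board with a
# winning length of 4, the old range checked no diagonal offsets at all.
board_width, winning_length = 4, 4
diagonals_start = -(board_width - winning_length)
diagonals_end = board_width - winning_length

print(list(range(diagonals_start, diagonals_end)))      # [] - main diagonal skipped
print(list(range(diagonals_start, diagonals_end + 1)))  # [0] - main diagonal included
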
5 changes: 1 addition & 4 deletions games/tic_tac_toe_x.py
@@ -200,14 +200,11 @@ def __init__(self, board_size, winning_length):
self.available_moves = available_moves
self.apply_move = apply_move

self.new_board.__doc__ = _new_board.__doc__
self.has_winner.__doc__ = has_winner.__doc__

def new_board(self):
return _new_board(self._board_size)

    def has_winner(self, board_state):
return has_winner(self._board_size, self._winning_length)
        return has_winner(board_state, self._winning_length)

def board_dimensions(self):
return self._board_size, self._board_size
84 changes: 84 additions & 0 deletions network_helpers.py
@@ -0,0 +1,84 @@
import tensorflow as tf
import numpy as np
import pickle


def create_network(input_nodes, hidden_nodes, output_nodes=None):
"""Create a network with relu activations at each layer
Args:
output_nodes: (int): Number of output nodes, if None then number of input nodes is used
input_nodes (int): The size of the board this network will work on. The output layer will also be this size
hidden_nodes ([int]): The number of hidden nodes in each hidden layer
Returns:
(input_layer, output_layer, [variables]) : The final item in the tuple is a list containing all the parameters,
wieghts and biases used in this network
"""
output_nodes = output_nodes or input_nodes
variables = []

with tf.name_scope('network'):
input_layer = tf.placeholder("float", (None, input_nodes))

current_layer = input_layer

        for hidden_layer_nodes in hidden_nodes:
            last_layer_nodes = int(current_layer.get_shape()[-1])
            hidden_weights = tf.Variable(
                tf.truncated_normal((last_layer_nodes, hidden_layer_nodes), stddev=1. / np.sqrt(last_layer_nodes)),
                name='weights')
            hidden_bias = tf.Variable(tf.constant(0.01, shape=(hidden_layer_nodes,)), name='biases')

            variables.append(hidden_weights)
            variables.append(hidden_bias)

            current_layer = tf.nn.relu(
                tf.matmul(current_layer, hidden_weights) + hidden_bias)

        last_hidden_nodes = int(current_layer.get_shape()[-1])
        output_weights = tf.Variable(
            tf.truncated_normal((last_hidden_nodes, output_nodes), stddev=1. / np.sqrt(last_hidden_nodes)),
            name="output_weights")
        output_bias = tf.Variable(tf.constant(0.01, shape=(output_nodes,)), name="output_bias")

variables.append(output_weights)
variables.append(output_bias)

output_layer = tf.nn.softmax(
tf.matmul(current_layer, output_weights) + output_bias)

return input_layer, output_layer, variables


def save_network(session, variables, file_path):
variable_values = session.run(variables)
    with open(file_path, mode='wb') as f:
pickle.dump(variable_values, f)


def load_network(session, tf_variables, file_path):
    with open(file_path, mode='rb') as f:
variable_values = pickle.load(f)
for value, tf_variable in zip(variable_values, tf_variables):
session.run(tf_variable.assign(value))


def invert_board_state(board_state):
    return tuple(tuple(-board_state[j][i] for i in range(len(board_state[0]))) for j in range(len(board_state)))


def get_stochastic_network_move(session, input_layer, output_layer, board_state, side):
board_state_flat = np.ravel(board_state)
if side == -1:
board_state_flat = -board_state_flat

    probability_of_actions = session.run(output_layer,
                                         feed_dict={input_layer: [board_state_flat]})[0]

try:
move = np.random.multinomial(1, probability_of_actions)
except ValueError:
        # sometimes, because of rounding errors, probability_of_actions sums to slightly more than 1,
        # so renormalise it to make it a valid probability distribution
move = np.random.multinomial(1, probability_of_actions / (sum(probability_of_actions) + 1e-7))

return move
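
As a hedged usage sketch (not part of the commit), the helpers above can be wired together as below, using the old TensorFlow API already used elsewhere in this commit (tf.initialize_all_variables); the board size and file name are assumptions for the example:

# Sketch only: build a small network, sample a stochastic move for an empty
# 3x3 board, and save the weights (file name is made up for illustration).
import numpy as np
import tensorflow as tf
from network_helpers import create_network, save_network, get_stochastic_network_move

input_layer, output_layer, variables = create_network(9, (100, 100))

with tf.Session() as session:
    session.run(tf.initialize_all_variables())

    empty_board = np.zeros((3, 3))
    move = get_stochastic_network_move(session, input_layer, output_layer, empty_board, 1)
    print(move.argmax())  # flat index of the sampled move; a real game would still need to check legality

    save_network(session, variables, 'example_network.p')
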
123 changes: 123 additions & 0 deletions policy_gradient_historical_competition.py
@@ -0,0 +1,123 @@
import collections
import functools
import os
import random

import numpy as np
import tensorflow as tf

from games.tic_tac_toe_x import TicTacToeXGameSpec
from network_helpers import create_network, load_network, get_stochastic_network_move, \
save_network

NUMBER_OF_HISTORICAL_COPIES_TO_KEEP = 8
NUMBER_OF_GAMES_TO_PLAY = 1000000
MINI_BATCH_SIZE = 100
SAVE_HISTORICAL_NETWORK_EVERY = 100000
STARTING_NETWORK_WEIGHTS = 'current_network.p'
BASE_HISTORICAL_NETWORK_PATH = 'historical_network_'
HIDDEN_NODES = (100, 80, 60, 40)
PRINT_RESULTS_EVERY_X = 500
LEARN_RATE = 1e-4
game_spec = TicTacToeXGameSpec(5, 4)

input_layer, output_layer, variables = create_network(game_spec.board_squares(), HIDDEN_NODES,
output_nodes=game_spec.outputs())

reward_placeholder = tf.placeholder("float", shape=(None,))
actual_move_placeholder = tf.placeholder("float", shape=(None, game_spec.board_squares()))
policy_gradient = tf.reduce_sum(tf.reshape(reward_placeholder, (-1, 1)) * actual_move_placeholder * output_layer)
train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(-policy_gradient)

current_historical_index = 0
historical_networks = []

mini_batch_moves = []
mini_batch_board_states = []
mini_batch_rewards = []
results = collections.deque(maxlen=PRINT_RESULTS_EVERY_X)

for _ in range(NUMBER_OF_HISTORICAL_COPIES_TO_KEEP):
historical_input_layer, historical_output_layer, historical_variables = create_network(game_spec.board_squares(),
HIDDEN_NODES)
historical_networks.append((historical_input_layer, historical_output_layer, historical_variables))

with tf.Session() as session:
session.run(tf.initialize_all_variables())


    def make_move_historical(historical_network_index, board_state, side):
        net = historical_networks[historical_network_index]
move = get_stochastic_network_move(session, net[0], net[1], board_state, side)
return game_spec.flat_move_to_tuple(move.argmax())


def make_training_move(board_state, side):
mini_batch_board_states.append(np.ravel(board_state) * side)
move = get_stochastic_network_move(session, input_layer, output_layer, board_state, side)
mini_batch_moves.append(move)
return game_spec.flat_move_to_tuple(move.argmax())


if os.path.isfile(STARTING_NETWORK_WEIGHTS):
print("loading pre existing weights")
load_network(session, variables, STARTING_NETWORK_WEIGHTS)
else:
print("could not find previous weights so initialising randomly")

    for i in range(NUMBER_OF_HISTORICAL_COPIES_TO_KEEP):
        if os.path.isfile(BASE_HISTORICAL_NETWORK_PATH + str(i) + '.p'):
            load_network(session, historical_networks[i][2], BASE_HISTORICAL_NETWORK_PATH + str(i) + '.p')

for episode_number in range(1, NUMBER_OF_GAMES_TO_PLAY):
opponent_index = random.randint(0, NUMBER_OF_HISTORICAL_COPIES_TO_KEEP-1)
make_move_historical_for_index = functools.partial(make_move_historical, opponent_index)

# randomize if going first or second
if bool(random.getrandbits(1)):
reward = game_spec.play_game(make_training_move, make_move_historical_for_index)
else:
reward = game_spec.play_game(make_move_historical_for_index, make_training_move)

results.append(reward)
if len(results) > PRINT_RESULTS_EVERY_X:
results.popleft()

last_game_length = len(mini_batch_board_states) - len(mini_batch_rewards)

        # we scale the reward by the game length so winning quickly is better than winning slowly
        # and losing slowly is better than losing quickly
reward /= float(last_game_length)

mini_batch_rewards += ([reward] * last_game_length)

episode_number += 1

if episode_number % MINI_BATCH_SIZE == 0:
normalized_rewards = mini_batch_rewards - np.mean(mini_batch_rewards)
normalized_rewards /= np.std(normalized_rewards)

session.run(train_step, feed_dict={input_layer: mini_batch_board_states,
reward_placeholder: normalized_rewards,
actual_move_placeholder: mini_batch_moves})

# clear batches
del mini_batch_board_states[:]
del mini_batch_moves[:]
del mini_batch_rewards[:]

if episode_number % PRINT_RESULTS_EVERY_X == 0:
print("episode: %s average result: %s" % (episode_number, np.mean(results)))

if episode_number % SAVE_HISTORICAL_NETWORK_EVERY == 0:
print("saving historical network %s", current_historical_index)
save_network(session, variables, BASE_HISTORICAL_NETWORK_PATH + str(current_historical_index) + '.p')
load_network(session, historical_networks[current_historical_index][2],
BASE_HISTORICAL_NETWORK_PATH + str(current_historical_index) + '.p')

current_historical_index += 1
current_historical_index %= NUMBER_OF_HISTORICAL_COPIES_TO_KEEP

# save our final weights
save_network(session, variables, STARTING_NETWORK_WEIGHTS)

print("completed")
2 changes: 1 addition & 1 deletion scripts/policy_gradient.py
@@ -60,7 +60,7 @@ def make_move(board_state, side):
actual_moves.append(move)

move_index = move.argmax()
return move_index / 3, move_index % 3
return game_spec.flat_move_to_tuple(move_index)


while True:
1 change: 0 additions & 1 deletion scripts/supervised_training.py
@@ -1,4 +1,3 @@
import collections
import numpy as np
import tensorflow as tf

37 changes: 37 additions & 0 deletions tests/test_network_helpers.py
@@ -0,0 +1,37 @@
import os
from unittest import TestCase
import tensorflow as tf
import numpy as np
from network_helpers import create_network, save_network, load_network


class TestNetworkHelpers(TestCase):
def test_create_network(self):
input_nodes = 20
hidden_nodes = (50, 40, 30)
input_layer, output_layer, variables = create_network(input_nodes, hidden_nodes)
self.assertSequenceEqual(input_layer.get_shape().as_list(), [None, input_nodes])
self.assertSequenceEqual(output_layer.get_shape().as_list(), [None, input_nodes])
self.assertEqual(len(variables), (len(hidden_nodes) + 1) * 2)

def test_save_and_load_network(self):
try:
file_name = 'test.p'
input_nodes = 20
hidden_nodes = (50, 40, 30)
_, _, variables1 = create_network(input_nodes, hidden_nodes)
_, _, variables2 = create_network(input_nodes, hidden_nodes)

with tf.Session() as session:
session.run(tf.initialize_all_variables())

save_network(session, variables1, file_name)
load_network(session, variables2, file_name)

for var1, var2 in zip(variables1, variables2):
np.testing.assert_array_almost_equal(session.run(var1), session.run(var2))
finally:
try:
os.remove(file_name)
except OSError:
pass
