forked from DanielSlater/AlphaToe
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Small bug fixes, historical policy gradient competition
- Loading branch information
1 parent
69d7305
commit 164fb93
Showing
8 changed files
with
269 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
import tensorflow as tf | ||
import numpy as np | ||
import pickle | ||
|
||
|
||
def create_network(input_nodes, hidden_nodes, output_nodes=None):
    """Create a fully connected network with relu hidden layers and a softmax output layer

    Args:
        input_nodes (int): Width of the input placeholder (the flattened board size)
        hidden_nodes ([int]): The number of hidden nodes in each hidden layer, in order. May be empty, in which
            case the input is connected directly to the output layer
        output_nodes (int): Number of output nodes, if None (or 0) then the number of input nodes is used

    Returns:
        (input_layer, output_layer, [variables]) : The final item in the tuple is a list containing all the parameters,
            weights and biases used in this network, ordered layer by layer as (weights, biases) pairs
    """
    output_nodes = output_nodes or input_nodes
    variables = []

    with tf.name_scope('network'):
        input_layer = tf.placeholder("float", (None, input_nodes))

        current_layer = input_layer
        # Width of the layer feeding the next set of weights. Tracked explicitly rather than by re-binding the
        # `hidden_nodes` argument, so the output layer is sized correctly even when there are no hidden layers.
        last_layer_nodes = input_nodes

        for layer_nodes in hidden_nodes:
            hidden_weights = tf.Variable(
                tf.truncated_normal((last_layer_nodes, layer_nodes), stddev=1. / np.sqrt(last_layer_nodes)),
                name='weights')
            hidden_bias = tf.Variable(tf.constant(0.01, shape=(layer_nodes,)), name='biases')

            variables.append(hidden_weights)
            variables.append(hidden_bias)

            current_layer = tf.nn.relu(
                tf.matmul(current_layer, hidden_weights) + hidden_bias)
            last_layer_nodes = layer_nodes

        output_weights = tf.Variable(
            tf.truncated_normal((last_layer_nodes, output_nodes), stddev=1. / np.sqrt(last_layer_nodes)),
            name="output_weights")
        output_bias = tf.Variable(tf.constant(0.01, shape=(output_nodes,)), name="output_bias")

        variables.append(output_weights)
        variables.append(output_bias)

        output_layer = tf.nn.softmax(
            tf.matmul(current_layer, output_weights) + output_bias)

    return input_layer, output_layer, variables
|
||
|
||
def save_network(session, variables, file_path):
    """Evaluate the given variables and pickle their values to a file

    Args:
        session: Object whose run(variables) returns the current value of each variable
        variables: The variables to save, in the order they should later be restored
        file_path (str): Path of the file to write, any existing file is overwritten
    """
    variable_values = session.run(variables)
    # pickle produces bytes, so the file must be opened in binary mode
    with open(file_path, mode='wb') as f:
        pickle.dump(variable_values, f)
|
||
|
||
def load_network(session, tf_variables, file_path):
    """Restore variable values previously written by save_network

    Values are paired with variables by position, so tf_variables must be in the same order as the list that
    was saved. If the two lengths differ, the extra items are ignored.

    Args:
        session: Object whose run(op) executes an assignment op
        tf_variables: The variables to assign to, each must provide assign(value)
        file_path (str): Path of the pickle file to read
    """
    # pickle data is bytes, so the file must be opened in binary mode.
    # NOTE: unpickling can execute arbitrary code, only load weight files from a trusted source.
    with open(file_path, mode='rb') as f:
        variable_values = pickle.load(f)
    for value, tf_variable in zip(variable_values, tf_variables):
        session.run(tf_variable.assign(value))
|
||
|
||
def invert_board_state(board_state):
    """Return a copy of the board with the sign of every square flipped

    Args:
        board_state: Sequence of rows, each a sequence of numeric square values

    Returns:
        tuple of tuples: Same layout as board_state with every value negated
    """
    # index into the board; it is a nested sequence, not a callable
    return tuple(tuple(-square for square in row) for row in board_state)
|
||
|
||
def get_stochastic_network_move(session, input_layer, output_layer, board_state, side):
    """Sample a move from the action probabilities the network gives for a board

    Args:
        session: Object whose run(fetch, feed_dict=...) evaluates the network
        input_layer: Key under which the flattened board is fed to the network
        output_layer: Fetch that evaluates to one row of action probabilities per fed board
        board_state: The board to evaluate, it is flattened with np.ravel before being fed
        side (int): When -1 the flattened board is negated before being fed

    Returns:
        np.array: Result of a single multinomial draw, a vector of counts that sums to 1
    """
    flat_board = np.ravel(board_state)
    if side == -1:
        flat_board = -flat_board

    batch_probabilities = session.run(output_layer,
                                      feed_dict={input_layer: [flat_board.ravel()]})
    probability_of_actions = batch_probabilities[0]

    try:
        return np.random.multinomial(1, probability_of_actions)
    except ValueError:
        # rounding errors can leave the probabilities summing to slightly more than 1, which multinomial
        # rejects, so rescale them by a total that is nudged up a little and draw again
        nudged_total = sum(probability_of_actions) + 1e-7
        return np.random.multinomial(1, probability_of_actions / nudged_total)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
import collections | ||
import functools | ||
import os | ||
import random | ||
|
||
import numpy as np | ||
import tensorflow as tf | ||
|
||
from games.tic_tac_toe_x import TicTacToeXGameSpec | ||
from network_helpers import create_network, load_network, get_stochastic_network_move, \ | ||
save_network | ||
|
||
# Policy gradient training where the network being trained plays against a pool of its own historical snapshots.
NUMBER_OF_HISTORICAL_COPIES_TO_KEEP = 8
NUMBER_OF_GAMES_TO_PLAY = 1000000
MINI_BATCH_SIZE = 100  # number of games between training steps
SAVE_HISTORICAL_NETWORK_EVERY = 100000  # number of games between snapshots of the current network
STARTING_NETWORK_WEIGHTS = 'current_network.p'
BASE_HISTORICAL_NETWORK_PATH = 'historical_network_'
HIDDEN_NODES = (100, 80, 60, 40)
PRINT_RESULTS_EVERY_X = 500
LEARN_RATE = 1e-4
# NOTE(review): presumably a 5x5 board needing 4 in a row to win - confirm against TicTacToeXGameSpec
game_spec = TicTacToeXGameSpec(5, 4)

input_layer, output_layer, variables = create_network(game_spec.board_squares(), HIDDEN_NODES,
                                                      output_nodes=game_spec.outputs())

reward_placeholder = tf.placeholder("float", shape=(None,))
# moves are sampled from output_layer, so they have one entry per network output
actual_move_placeholder = tf.placeholder("float", shape=(None, game_spec.outputs()))
policy_gradient = tf.reduce_sum(tf.reshape(reward_placeholder, (-1, 1)) * actual_move_placeholder * output_layer)
train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(-policy_gradient)

current_historical_index = 0
historical_networks = []

mini_batch_moves = []
mini_batch_board_states = []
mini_batch_rewards = []
# maxlen makes the deque drop its oldest result automatically once it is full
results = collections.deque(maxlen=PRINT_RESULTS_EVERY_X)

for _ in range(NUMBER_OF_HISTORICAL_COPIES_TO_KEEP):
    # historical networks must have the same shape as the training network, its weights get copied into them
    historical_networks.append(create_network(game_spec.board_squares(), HIDDEN_NODES,
                                              output_nodes=game_spec.outputs()))

with tf.Session() as session:
    session.run(tf.initialize_all_variables())


    def make_move_historical(historical_network_index, board_state, side):
        """Pick a move for the given side using one of the historical networks"""
        net = historical_networks[historical_network_index]
        move = get_stochastic_network_move(session, net[0], net[1], board_state, side)
        return game_spec.flat_move_to_tuple(move.argmax())


    def make_training_move(board_state, side):
        """Pick a move with the network being trained, recording the board and move for the next mini batch"""
        mini_batch_board_states.append(np.ravel(board_state) * side)
        move = get_stochastic_network_move(session, input_layer, output_layer, board_state, side)
        mini_batch_moves.append(move)
        return game_spec.flat_move_to_tuple(move.argmax())


    if os.path.isfile(STARTING_NETWORK_WEIGHTS):
        print("loading pre existing weights")
        load_network(session, variables, STARTING_NETWORK_WEIGHTS)
    else:
        print("could not find previous weights so initialising randomly")

    # restore any previously saved historical opponents into their own networks
    for i in range(NUMBER_OF_HISTORICAL_COPIES_TO_KEEP):
        historical_network_path = BASE_HISTORICAL_NETWORK_PATH + str(i) + '.p'
        if os.path.isfile(historical_network_path):
            load_network(session, historical_networks[i][2], historical_network_path)

    # episode_number counts completed games, so the periodic steps below fire after exact multiples of games
    for episode_number in range(1, NUMBER_OF_GAMES_TO_PLAY + 1):
        opponent_index = random.randint(0, NUMBER_OF_HISTORICAL_COPIES_TO_KEEP - 1)
        make_move_historical_for_index = functools.partial(make_move_historical, opponent_index)

        # randomize if going first or second
        if bool(random.getrandbits(1)):
            reward = game_spec.play_game(make_training_move, make_move_historical_for_index)
        else:
            reward = game_spec.play_game(make_move_historical_for_index, make_training_move)

        results.append(reward)

        # board states are recorded per move, rewards per finished game, so the difference is this game's moves
        last_game_length = len(mini_batch_board_states) - len(mini_batch_rewards)

        # we scale here so winning quickly is better than winning slowly and losing slowly better than losing quickly
        reward /= float(last_game_length)

        mini_batch_rewards += ([reward] * last_game_length)

        if episode_number % MINI_BATCH_SIZE == 0:
            normalized_rewards = mini_batch_rewards - np.mean(mini_batch_rewards)
            rewards_std = np.std(normalized_rewards)
            # if every reward in the batch is identical the std is 0, dividing would feed NaNs to the optimizer
            if rewards_std > 0:
                normalized_rewards /= rewards_std

            session.run(train_step, feed_dict={input_layer: mini_batch_board_states,
                                               reward_placeholder: normalized_rewards,
                                               actual_move_placeholder: mini_batch_moves})

            # clear batches
            del mini_batch_board_states[:]
            del mini_batch_moves[:]
            del mini_batch_rewards[:]

        if episode_number % PRINT_RESULTS_EVERY_X == 0:
            print("episode: %s average result: %s" % (episode_number, np.mean(results)))

        if episode_number % SAVE_HISTORICAL_NETWORK_EVERY == 0:
            print("saving historical network %s" % current_historical_index)
            historical_network_path = BASE_HISTORICAL_NETWORK_PATH + str(current_historical_index) + '.p'
            # snapshot the current weights to disk, then load them into the historical slot being replaced
            save_network(session, variables, historical_network_path)
            load_network(session, historical_networks[current_historical_index][2], historical_network_path)

            # slots are reused round robin, so the oldest snapshot is overwritten next
            current_historical_index += 1
            current_historical_index %= NUMBER_OF_HISTORICAL_COPIES_TO_KEEP

    # save our final weights
    save_network(session, variables, STARTING_NETWORK_WEIGHTS)

print("completed")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,3 @@ | ||
import collections | ||
import numpy as np | ||
import tensorflow as tf | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
import os | ||
from unittest import TestCase | ||
import tensorflow as tf | ||
import numpy as np | ||
from network_helpers import create_network, save_network, load_network | ||
|
||
|
||
class TestNetworkHelpers(TestCase):
    """Tests for network creation and for saving/loading network variables."""

    def test_create_network(self):
        """Input and output are both board sized and every layer contributes a weights and a biases variable."""
        board_size = 20
        layer_sizes = (50, 40, 30)
        expected_shape = [None, board_size]

        net_input, net_output, net_variables = create_network(board_size, layer_sizes)

        self.assertSequenceEqual(net_input.get_shape().as_list(), expected_shape)
        self.assertSequenceEqual(net_output.get_shape().as_list(), expected_shape)
        # hidden layers plus the output layer, two variables each
        self.assertEqual(len(net_variables), (len(layer_sizes) + 1) * 2)

    def test_save_and_load_network(self):
        """Values saved from one network can be loaded into a second network of the same shape."""
        try:
            file_name = 'test.p'
            board_size = 20
            layer_sizes = (50, 40, 30)
            source_variables = create_network(board_size, layer_sizes)[2]
            target_variables = create_network(board_size, layer_sizes)[2]

            with tf.Session() as session:
                session.run(tf.initialize_all_variables())

                save_network(session, source_variables, file_name)
                load_network(session, target_variables, file_name)

                for source, target in zip(source_variables, target_variables):
                    np.testing.assert_array_almost_equal(session.run(source), session.run(target))
        finally:
            # best effort clean up, the file will not exist if saving failed
            try:
                os.remove(file_name)
            except OSError:
                pass