treestrap.py

#!/usr/bin/env python3
"""
TreeStrap learning algorithm: train a value network for noughts and crosses
by self-play.
"""
from tDLambda import *
from engine import *
from node import *
from play import *
from value_network import *
from noughts_crosses import *
import matplotlib.pyplot as plt
import csv
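

# A search-tree node is assumed to expose .board, .pv (the child along the
# principal variation), .other (the remaining children) and .reward (the
# terminal reward, or None when the position is not terminal).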
def get_trace(node, trace):
    'return the list of boards along the principal variation and its reward'
    trace.append(node.board)
    if node.pv is not None:
        return get_trace(node.pv, trace)
    else:
        return trace, node.reward
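

# Train on every branch of a search tree, not just the moves actually played.
# If a branch does not reach a terminal board, the network's own value of the
# last board is used as a bootstrapped target.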
def train_games(network, node, discount):
    'train on every line of play in the search tree rooted at node'
    boards, reward = get_trace(node, [])
    # if the game is unfinished, bootstrap from the network's value of the
    # final board and drop that board from the update
    if reward is None:
        reward = network(boards[-1])
        boards = boards[:-1]
    if boards != []:
        network.temporal_difference(boards, reward, discount)
    for child in node.other:
        train_games(network, child, discount)
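

# Play one full game: each ply, the engine to move builds a search tree from
# the current position, the network is trained on that whole tree, and play
# then follows the first move of the principal variation.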
def TreeStrap(engines, network, discount):
    'play one game between the two engines, training on every search tree'
    board = initialBoard
    player = players[0]
    index = 0
    while evaluate(board) is None:
        node = engines[index].create_search_tree(board, player)
        train_games(network, node, discount)
        board = node.pv.board
        player = next_player(player)
        index = int(not index)
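

# Self-play training: the same engine (and hence the same value network) plays
# both sides, so engine.policy is updated in place as the games are played.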
def train(engine, games):
    'train the engine by self-play for the given number of games'
    for _ in range(games):
        TreeStrap([engine, engine], engine.policy, engine.discount)
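

# Script entry point: alternate between evaluating against a baseline engine
# using the random policy and training on a batch of self-play games, logging
# win/loss/draw rates to treestrap.csv and plotting them live.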
if __name__ == "__main__":
    with open("treestrap.csv", "w", newline="") as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        plt.ion()
        batch = 20
        learningRate = 0.01
        discount = 0.7
        directory = "treestrap"
        valueNetwork = ValueNet(learningRate, 0.7)
        e = Engine(valueNetwork, 3, discount)
        r = Engine(random, 1, discount)
        win, lose, draw = [], [], []
        testGamesNum = 10
        count = 0
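
        # Each evaluation plays testGamesNum games with the learner moving
        # first and testGamesNum with it moving second, so both player orders
        # are covered.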
        while True:
            # evaluate before training so the first point is the untrained net
            w, l, d = 0, 0, 0
            for _ in range(testGamesNum):
                score = self_play([e, r])
                if score == 1:
                    w += 1
                elif score == -1:
                    l += 1
                else:
                    d += 1
                score = self_play([r, e])
                if score == -1:
                    w += 1
                elif score == 1:
                    l += 1
                else:
                    d += 1
            w = w / (2.0 * testGamesNum)
            l = l / (2.0 * testGamesNum)
            d = d / (2.0 * testGamesNum)
            writer.writerow([w, l, d])
            print("Wins, Losses, Draws:", w, l, d, e.policy(initialBoard))
            win.append(w)
            lose.append(l)
            draw.append(d)
            x = range(0, batch * (count + 1), batch)
            plt.plot(x, win, label="P(win)")
            plt.plot(x, draw, label="P(draw)")
            plt.plot(x, lose, label="P(lose)")
            plt.legend()
            plt.title("Training vs Time")
            plt.xlabel('Self-Play Games Played')
            plt.ylabel('Probability')
            plt.pause(0.001)
            plt.clf()
            # train on the next batch of self-play games
            train(e, batch)
            if (count % 100) == 99:
                e.policy.save_weights(directory)
            count += 1