+ import sys
import numpy as np
import warnings
import utils
+ from enum import Enum
from time import time, sleep
import matplotlib.pyplot as plt
from policy import EpsGreedyPolicy
from memory import ExperienceReplay
+ from keras.models import Sequential
+ from keras.layers import *
+ from keras.optimizers import *
+ from keras.models import load_model

- class Agent:
-     def __init__(self, game, model, nb_epoch=10000, memory_size=1000, batch_size=50, nb_frames=4, epsilon=1., discount=.9, learning_rate=.1):
-
-         channels = model.input_shape[1]
+ TEST = 0
+ SIMPLE = 1
+ DOUBLE = 2
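+ # Operating modes: TEST loads a previously trained model from disk and skips
+ # building a new network, SIMPLE trains a plain DQN, and DOUBLE additionally
+ # keeps a target network that is periodically synced to the primary one.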

-         if nb_frames != channels:
-             warnings.warn("Dimension mismatch: Using number of channels for number of frames")
-             nb_frames = channels
+ class Agent:
+     def __init__(self, game, mode=SIMPLE, nb_epoch=10000, memory_size=1000, batch_size=50, nb_frames=4, epsilon=1., discount=.9, learning_rate=.1, model=None):

        self.game = game
-         self.model = model
+         self.mode = mode
+         self.target_model = None
+         self.rows, self.columns = game.field_shape()
        self.nb_epoch = nb_epoch
        self.nb_frames = nb_frames
        self.nb_actions = game.nb_actions()

+         if mode == TEST:
+             print('Test Mode: Loading model...')
+             self.model = load_model(model)
+         elif mode == SIMPLE:
+             print('Using Plain DQN: Building model...')
+             self.model = self.build_model()
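+         # In DOUBLE mode a second, identically built target network is kept
+         # next to the primary one and synced to it immediately; the replay
+         # memory receives it below, presumably to compute the bootstrapped
+         # Q-targets while the primary network keeps selecting actions.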
+         elif mode == DOUBLE:
+             print('Using Double DQN: Building primary and target model...')
+             self.model = self.build_model()
+             self.target_model = self.build_model()
+             self.update_target_model()
+
        # Trades off the importance of sooner versus later rewards.
        # A factor of 0 means it prefers immediate rewards and will
        # mostly consider current rewards. A factor of 1
@@ -39,18 +57,34 @@ def __init__(self, game, model, nb_epoch=10000, memory_size=1000, batch_size=50,
        # a random action with the probability 'eps'. Without this policy the network
        # is greedy and it settles for the first effective strategy it finds.
        # Hence, we introduce certain randomness.
-         # Epsilon reaches its minimum at 2/3 of the games
+         # Epsilon reaches its minimum at 1/2 of the games
        epsilon_end = self.nb_epoch - (self.nb_epoch / 2)
        self.policy = EpsGreedyPolicy(self.model, epsilon_end, self.nb_actions, epsilon, .1)
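        # With the defaults (nb_epoch=10000) epsilon_end = 10000 - 5000 = 5000,
        # so exploration decays from the initial epsilon of 1. towards the final
        # value of .1 (the last argument, presumably the epsilon floor) over the
        # first half of all games and is then held there.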

        # Create new experience replay memory. Without this optimization
        # the training takes extremely long even on a GPU and, most
        # importantly, the approximation of Q-values using the non-linear
        # function of our NN is not very stable.
-         self.memory = ExperienceReplay(self.model, self.nb_actions, memory_size, batch_size, self.discount, self.learning_rate)
+         self.memory = ExperienceReplay(self.model, self.target_model, self.nb_actions, memory_size, batch_size, self.discount, self.learning_rate)
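+         # Assumption about memory.py: given a target network, the replay
+         # targets become r + discount * Q_target(s', argmax_a Q_online(s', a))
+         # rather than the plain DQN target r + discount * max_a Q_online(s', a).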

        self.frames = None

+     def build_model(self):
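+         # The net consumes a stack of `nb_frames` game grids in channels_first
+         # layout; the last Dense layer has no activation because it outputs one
+         # raw Q-value per action, trained against a mean squared error loss.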
+         model = Sequential()
+         model.add(Conv2D(32, (2, 2), activation='relu', input_shape=(self.nb_frames, self.rows, self.columns), data_format="channels_first"))
+         model.add(Conv2D(64, (2, 2), activation='relu', data_format="channels_first"))
+         model.add(Conv2D(64, (3, 3), activation='relu', data_format="channels_first"))
+         model.add(Flatten())
+         model.add(Dropout(0.1))
+         model.add(Dense(512, activation='relu'))
+         model.add(Dense(self.nb_actions))
+         model.compile(Adam(), 'MSE')
+
+         return model
+
+     def update_target_model(self):
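+         # Copy the online network's weights into the target network; called
+         # once on construction and again every `update_freq` steps in train().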
+         self.target_model.set_weights(self.model.get_weights())
+
    def get_frames(self):
        frame = self.game.get_state()
        if self.frames is None:
@@ -85,7 +119,8 @@ def print_stats(self, data, y_label, x_label='Epoch', marker='-'):
        path = './plots/{name}_{size}x{size}_{timestamp}'
        fig.savefig(path.format(size=self.game.grid_size, name=file_name, timestamp=int(time())))

-     def train(self, visualize=True):
+     def train(self, update_freq=10):
+         total_steps = 0
        max_steps = self.game.grid_size ** 2 * 3
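        # Caps a single game at three moves per grid cell (e.g. 300 steps on a
        # 10x10 grid) so an agent that never reaches a terminal state cannot
        # stall an epoch.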
        loops = 0
        nb_wins = 0
@@ -119,6 +154,7 @@ def train(self, visualize=True):

            cumulative_reward += reward
            steps += 1
+             total_steps += 1

            if steps == max_steps and not done:
                loops += 1
@@ -145,6 +181,9 @@ def train(self, visualize=True):
            if done:
                duration = utils.get_time_difference(start_time, time())

+             if self.mode == DOUBLE and self.target_model is not None and total_steps % update_freq == 0:
+                 self.update_target_model()
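+                 # Hard update: the target network is refreshed from the online
+                 # network whenever total_steps hits a multiple of `update_freq`
+                 # (default 10), keeping the bootstrap targets fixed between syncs.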
+
            current_epoch = epoch + 1
            reward_buffer.append([current_epoch, cumulative_reward])
            duration_buffer.append([current_epoch, duration])
@@ -160,8 +199,9 @@ def train(self, visualize=True):
        self.print_stats(steps_buffer, 'Steps per Game')
        self.print_stats(wins_buffer, 'Wins')

-         path = './models/model_{size}x{size}_{epochs}_{timestamp}.h5'
-         self.model.save(path.format(size=self.game.grid_size, epochs=self.nb_epoch, timestamp=int(time())))
+         path = './models/model_{mode}_{size}x{size}_{epochs}_{timestamp}.h5'
+         mode = 'dqn' if self.mode == SIMPLE else 'ddqn'
+         self.model.save(path.format(mode=mode, size=self.game.grid_size, epochs=self.nb_epoch, timestamp=int(time())))

    def play(self, nb_games=5, interval=.7):
        nb_wins = 0