tried to use RL (failed)
GreenWizard2015 committed Nov 24, 2020
1 parent 4389744 commit 3a2d4c4
Showing 5 changed files with 384 additions and 19 deletions.
67 changes: 67 additions & 0 deletions CMazeExperience.py
@@ -0,0 +1,67 @@
import random
import numpy as np
import math

class CMazeExperience:
  def __init__(self, maxSize):
    self.maxSize = maxSize
    self.sizeLimit = (maxSize * 1.1)
    self.episodes = []
    self.gamma = 0.5
    self.minScore = -math.inf

  def addEpisode(self, replay):
    score = sum(x[2] for x in replay)
    if score < self.minScore: return

    # for i in range(len(replay)):
    #   state, act, score, nextState = replay[i]
    #   gamma = self.gamma
    #   for j in range(i + 1, len(replay)):
    #     score += gamma * replay[j][2]
    #     gamma *= self.gamma
    self.episodes.append((replay, score))

    if self.sizeLimit < len(self.episodes):
      self.update()
    return

  def update(self):
    self.episodes = list(
      sorted(self.episodes, key=lambda x: x[1], reverse=True)
    )[:self.maxSize]
    self.minScore = self.episodes[-1][1]
    print('Min score: %.6f' % self.minScore)

  def __len__(self):
    return len(self.episodes)

  def take_batch(self, batch_size):
    batch = []
    weights = [x[1] for x in self.episodes]
    while len(batch) < batch_size:
      episode, _ = random.choices(
        self.episodes,
        weights=weights,
        k=1
      )[0]

      minibatchIndexes = set(random.choices(
        np.arange(len(episode)),
        weights=[abs(x[2]) for x in episode],
        k=min((5, batch_size - len(batch), len(episode)))
      ))

      for ind in minibatchIndexes:
        state, act, score, nextState = episode[ind]
        nextStateWeight = 1 if ind < len(episode) - 1 else 0
        batch.append((state, act, score, nextState, nextStateWeight))

    return (
      np.array([x[0] for x in batch]),
      np.array([x[1] for x in batch]),
      np.array([x[2] for x in batch]),
      np.array([x[3] for x in batch]),
      np.array([x[4] for x in batch]),
    )
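
Not part of the commit: a minimal sketch of exercising this replay buffer with dummy transitions. The (state, action, reward, nextState) tuple layout matches what emulate() in train.py records; the shapes and reward values here are placeholders. Note that take_batch samples episodes weighted by their total score, so the dummy scores are kept positive.

import numpy as np
from CMazeExperience import CMazeExperience

memory = CMazeExperience(maxSize=10)
for _ in range(12):
  # fake 5-step episode of (state, action, reward, nextState) tuples
  replay = [
    (np.zeros((14, 14, 1)), i % 4, [1, -.1, 1, 1, -.1][i], np.zeros((14, 14, 1)))
    for i in range(5)
  ]
  memory.addEpisode(replay)

states, actions, rewards, nextStates, nextStateWeights = memory.take_batch(8)
print(states.shape, actions.shape, nextStateWeights)  # (8, 14, 14, 1) (8,) [1/0 per step]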
70 changes: 57 additions & 13 deletions Core/CMazeEnviroment.py
@@ -7,10 +7,14 @@ class MazeActions(Enum):
  RIGHT = (1, 0)
  UP = (0, -1)
  DOWN = (0, 1)

MAZE_ACTIONS_AS_INT = { x: i for i, x in enumerate(MazeActions) }
MAZE_ACTIONS = [x for x in MazeActions]

class CMazeEnviroment:
-  def __init__(self, maze, pos, FOV):
+  def __init__(self, maze, pos, FOV, minimapSize):
    self.maze = np.pad(np.array(maze), FOV, constant_values=(1,))
    self.minimapSize = minimapSize
    self._fov = FOV

    x, y = np.array(pos) + FOV
@@ -20,7 +24,8 @@ def __init__(self, maze, pos, FOV):
  def spawnAt(self, x, y):
    self.pos = np.array([y, x])
    self.fog = np.zeros_like(self.maze)
-    self._updateFog()
+    self.moves = np.zeros_like(self.maze)
+    self._update()
    return

  def respawn(self):
@@ -33,17 +38,17 @@ def respawn(self):
      break
    return

-  def _updateFog(self):
+  def _update(self):
    y, x = self.pos
-    self.fog[
-      x - self._fov:x + self._fov + 1,
-      y - self._fov:y + self._fov + 1
-    ] = 1
+    d = self._fov
+    self.fog[x - d:x + d + 1, y - d:y + d + 1] = 1
+    self.moves[x, y] = 1
    return

  def apply(self, action):
    self.pos += action.value
-    self._updateFog()
+    self.lastAction = MAZE_ACTIONS_AS_INT[action]
+    self._update()
    return

  def vision(self):
@@ -52,15 +57,38 @@ def vision(self):
      x - self._fov:x + self._fov + 1,
      y - self._fov:y + self._fov + 1
    ]

  def _takeShot(self):
    maze, fog, moves = self.maze, self.fog, self.moves
    y, x = self.pos
    h, w = self.maze.shape

    isXAxisOk = (self.minimapSize < x) and (x < (w - self.minimapSize))
    isYAxisOk = (self.minimapSize < y) and (y < (h - self.minimapSize))
    if not (isXAxisOk and isYAxisOk):
      x += self.minimapSize
      y += self.minimapSize
      maze = np.pad(maze, self.minimapSize, constant_values=(1,))
      fog, moves = (
        np.pad(data, self.minimapSize, constant_values=(0,)) for data in (fog, moves)
      )

    d = self.minimapSize
    return (data[x - d:x + d + 1, y - d:y + d + 1] for data in (maze, fog, moves))

  def minimap(self):
    # maze, fog, moves = self._takeShot()
    maze, fog, moves = self.maze, self.fog, self.moves
    return (maze * fog, moves)

  @property
  def state(self):
-    return ((self.vision(), self.fog, ), self.score, self.done)
+    return ((self.minimap(), ), self.score, self.done)

  @property
  def done(self):
-    y, x = self._pos
-    return 1 < self.maze[x, y]
+    y, x = self.pos
+    return 0 < self.maze[x, y]

  @property
  def score(self):
@@ -70,15 +98,31 @@ def score(self):

  def copy(self):
    # dirty copy
-    res = CMazeEnviroment(self.maze, self.pos, self._fov)
+    res = CMazeEnviroment(self.maze, self.pos, self._fov, self.minimapSize)
    res.maze = self.maze.copy()
    res.fog = self.fog.copy()
    res.pos = self.pos.copy()
    res.moves = self.moves.copy()
    return res

  def isPossible(self, action):
    y, x = self.pos + action.value
    return self.maze[x, y] <= 0

  def validActions(self):
    return [ act for act in MazeActions if self.isPossible(act) ]

  def validActionsIndex(self):
    return [ i for i, act in enumerate(MazeActions) if self.isPossible(act) ]

  def invalidActions(self):
    return [ i for i, act in enumerate(MazeActions) if not self.isPossible(act) ]

  def state2input(self):
    maze, moves = self.minimap()
    state = np.dstack((maze, ))
    return state

  @property
  def input_size(self):
    return self.state2input().shape
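
Not part of the commit: a short sketch of driving CMazeEnviroment directly. The all-free 8x8 grid and start position are placeholders; cells > 0 are walls, and the FOV padding supplies the outer border, matching how train.py generates its maze. It assumes respawn() places the agent on a free cell, as train.py relies on.

import random
import numpy as np
from Core.CMazeEnviroment import CMazeEnviroment

env = CMazeEnviroment(
  maze=np.zeros((8, 8), np.float32),  # all-free cells; walls would be 1
  pos=(1, 1),
  FOV=3,
  minimapSize=8
)
env.respawn()
for _ in range(10):
  valid = env.validActions()       # walls (including the padded border) are filtered out
  if not valid: break
  env.apply(random.choice(valid))
print('input shape:', env.state2input().shape)  # (14, 14, 1) for this maze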
41 changes: 41 additions & 0 deletions model.py
@@ -0,0 +1,41 @@
import tensorflow.keras as keras
import tensorflow.keras.layers as layers

def convBlock(prev, sz, filters):
  conv_1 = layers.Convolution2D(filters, (sz, sz), padding="same", activation="relu")(prev)
  conv_1 = layers.Dropout(0.1)(conv_1)
  conv_1 = layers.BatchNormalization()(conv_1)
  return conv_1

def createModel(shape):
  inputs = res = layers.Input(shape=shape)
  res = convBlock(res, 3, filters=32)
  res = convBlock(res, 3, filters=32)
  res = convBlock(res, 3, filters=32)

  res = layers.Flatten()(res)

  res = layers.Dense(16 ** 2, activation='relu')(res)
  res = layers.Dropout(.2)(res)
  res = layers.Dense(16 ** 2, activation='relu')(res)
  res = layers.Dropout(.2)(res)
  res = layers.Dense(16 ** 2, activation='relu')(res)
  res = layers.Dropout(.2)(res)
  res = layers.Dense(8 ** 2, activation='relu')(res)
  res = layers.Dropout(.2)(res)
  res = layers.Dense(8 ** 2, activation='relu')(res)
  res = layers.Dropout(.2)(res)
  res = layers.Dense(8 ** 2, activation='relu')(res)
  res = layers.Dropout(.2)(res)
  res = layers.Dense(4 ** 2, activation='relu')(res)
  res = layers.Dropout(.2)(res)
  res = layers.Dense(4 ** 2, activation='relu')(res)
  res = layers.Dropout(.2)(res)
  res = layers.Dense(4 ** 2, activation='relu')(res)
  res = layers.Dropout(.2)(res)

  res = layers.Dense(4, activation='linear')(res)
  return keras.Model(
    inputs=inputs,
    outputs=res
  )
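
Not part of the commit: instantiating the network as train.py does. The input shape is what env.input_size yields for the 32x32 maze with FOV=3 (38x38x1), and the four linear outputs are Q-values, one per MazeActions entry.

from model import createModel

model = createModel(shape=(38, 38, 1))
model.summary()  # 3 conv blocks -> Flatten -> stacked Dense/Dropout head -> Dense(4, linear)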
146 changes: 146 additions & 0 deletions train.py
@@ -0,0 +1,146 @@
# -*- coding: utf-8 -*-
import sys
import os
import tensorflow as tf
from CMazeExperience import CMazeExperience

if 'COLAB_GPU' in os.environ:
  # fix module resolution on Colab
  from os.path import dirname
  sys.path.append(dirname(dirname(dirname(__file__))))
else: # local GPU
  gpus = tf.config.experimental.list_physical_devices('GPU')
  tf.config.experimental.set_virtual_device_configuration(
    gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1 * 1024)]
  )

import random
import numpy as np

from tensorflow.keras.optimizers import Adam # tf.keras optimizer, matching the tf.keras model in model.py

from Core.CMazeEnviroment import CMazeEnviroment, MAZE_ACTIONS
from model import createModel

def emulate(env, model, exploreRate, exploreDecay, steps, stopOnInvalid=False):
  episodeReplay = []
  done = False
  N = 0
  while (N < steps) and not done:
    N += 1
    act = None
    valid = env.validActionsIndex()
    if not valid: break

    state = env.state2input()
    if random.random() < exploreRate:
      act = random.choice(valid) # explore: random valid action
    else:
      probe = model.predict(np.array([state]))[0]
      if not stopOnInvalid:
        for i in env.invalidActions(): # mask invalid actions so argmax picks a legal move
          probe[i] = -1
      act = np.argmax(probe)

    if stopOnInvalid and not (act in valid):
      episodeReplay.append([state, act, -1, env.state2input()])
      break

    prevScore = env.score
    env.apply(MAZE_ACTIONS[act])
    normedScore = 1 if 0 < (env.score - prevScore) else -.1
    episodeReplay.append([state, act, normedScore, env.state2input()])

    done = env.done
    exploreRate = max((.01, exploreRate * exploreDecay))
  return episodeReplay

if __name__ == "__main__":
  sz = 32
  env = CMazeEnviroment(
    maze=(0.8 < np.random.rand(sz, sz)).astype(np.float32),
    pos=(0, 0),
    FOV=3,
    minimapSize=8
  )
  memory = CMazeExperience(maxSize=100)
  done = False
  batch_size = 64
  playSteps = 64

  bestModelScore = 0
  model = createModel(shape=env.input_size)
  model.compile(
    optimizer=Adam(lr=1e-3),
    loss='mean_squared_error'
  )
  # model.load_weights('model.h5')

  targetModel = createModel(shape=env.input_size)
  np.set_printoptions(precision=3)
  # collect data
  while len(memory) < 50:
    env.respawn()
    episodeReplay = emulate(
      env, model,
      exploreRate=0.9,
      exploreDecay=0.9,
      steps=playSteps,
      stopOnInvalid=False
    )
    #################
    if 1 < len(episodeReplay):
      memory.addEpisode(episodeReplay)
      print(len(memory), env.score)
  memory.update()

  train_episodes = 500
  test_episodes = 10
  exploreRate = 1
  exploreDecayPerEpoch = .9
  exploreDecay = .9
  for epoch in range(5000):
    print('Epoch %d' % epoch)
    # train
    targetModel.set_weights(model.get_weights()) # freeze a copy of the online network for this epoch
    lossSum = 0
    for n in range(train_episodes):
      states, actions, rewards, nextStates, nextReward = memory.take_batch(batch_size)
      targets = targetModel.predict(nextStates)
      # one-step Q-learning target; nextReward zeroes the bootstrap term on terminal steps
      targets[np.arange(len(targets)), actions] = rewards + np.max(targets, axis=1) * .9 * nextReward

      lossSum += model.fit(
        states, targets,
        epochs=1,
        verbose=0
      ).history['loss'][0]
    print('Avg. train loss: %.4f' % (lossSum / train_episodes))
    print(targets[0])

    # test
    print('Epoch %d testing' % epoch)
    bestScore = scoreSum = movesSum = 0
    n = 0
    while n < test_episodes:
      env.respawn()
      episodeReplay = emulate(
        env, model,
        exploreRate=exploreRate,
        exploreDecay=exploreDecay,
        steps=playSteps*2,
        stopOnInvalid=True
      )
      if 1 < len(episodeReplay):
        memory.addEpisode(episodeReplay)
        n += 1
        bestScore = max((bestScore, env.score))
        scoreSum += env.score
        movesSum += len(episodeReplay)
      #################
    print('Best score: %.3f, avg. score: %.3f, avg. moves: %.1f' % (bestScore, scoreSum / n, movesSum / n))
    if bestModelScore < scoreSum:
      bestModelScore = scoreSum
      print('save best model')
      model.save_weights('model.h5')
    model.save_weights('latest.h5')
    exploreRate *= exploreDecayPerEpoch
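
Not part of the commit: the single line building `targets` above is the one-step Q-learning update Q(s,a) <- r + 0.9 * max_a' Q_target(s', a'), where nextReward is the nextStateWeight produced by take_batch and masks the bootstrap term on an episode's last step. A tiny numeric sketch with made-up values:

import numpy as np

targets = np.array([[.1, .5, .2, .0],   # Q(s', .) rows from the frozen targetModel
                    [.3, .1, .4, .2]])
actions = np.array([1, 2])
rewards = np.array([1., -.1])
nextReward = np.array([1, 0])           # 0 masks the bootstrap on terminal steps

targets[np.arange(len(targets)), actions] = rewards + np.max(targets, axis=1) * .9 * nextReward
print(targets)  # row 0: Q[1] = 1 + .9 * .5 = 1.45; row 1: Q[2] = -.1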