diff --git a/.gitignore b/.gitignore index 23815ea..14d57c5 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ __pycache__ /.pydevproject /.project /.settings +/charts +/chart.jpg diff --git a/Agent/DQNAgent.py b/Agent/DQNAgent.py new file mode 100644 index 0000000..37cc92c --- /dev/null +++ b/Agent/DQNAgent.py @@ -0,0 +1,31 @@ +import numpy as np +import math + +class DQNAgent: + def __init__(self, model, exploreRate=0, noise=None): + self._model = model + self._exploreRate = exploreRate + self._noise = noise + return + + def reset(self): + return + + def process(self, state, actionsMask = [1, 1, 1, 1]): + return self.processBatch([state], [actionsMask])[0] + + def processBatch(self, states, actionsMask): + actions = self._model.predict(np.array(states)) + if 0 < self._exploreRate: + rndIndexes = np.where(np.random.random_sample(actions.shape[0]) < self._exploreRate) + actions[rndIndexes] = np.random.random_sample(actions.shape)[rndIndexes] + + if not (self._noise is None): + # softmax + e_x = np.exp(actions - actions.max(axis=-1, keepdims=True)) + normed = e_x / e_x.sum(axis=-1, keepdims=True) + # add noise + actions = normed + (np.random.random_sample(actions.shape) * self._noise) + + actions[np.where(~(1 == np.array(actionsMask)))] = -math.inf + return actions.argmax(axis=-1) \ No newline at end of file diff --git a/Agent/DQNEnsembleAgent.py b/Agent/DQNEnsembleAgent.py new file mode 100644 index 0000000..b7fb0ae --- /dev/null +++ b/Agent/DQNEnsembleAgent.py @@ -0,0 +1,67 @@ +import numpy as np +import math +import tensorflow.keras as keras +import tensorflow.keras.layers as layers +import tensorflow as tf + +def combineModels(models, combiner): + shape = models[0].layers[0].input_shape[0][1:] + inputs = layers.Input(shape=shape) + actionsMask = layers.Input(shape=(4, )) + res = layers.Lambda(combiner)([actionsMask] + [ x(inputs) for x in models ]) + return keras.Model(inputs=[inputs, actionsMask], outputs=res) + +def maskedSoftmax(mask, inputs): + mask = tf.where(tf.equal(mask, 1)) + return [ + tf.sparse.to_dense( + tf.sparse.softmax( + tf.sparse.SparseTensor( + indices=mask, + values=tf.gather_nd(x, mask), + dense_shape=tf.shape(x, out_type=tf.int64) + ) + ) + ) for x in inputs + ] + +def multiplyOutputs(inputs): + outputs = maskedSoftmax(inputs[0], inputs[1:]) + + res = 1 + outputs[0] + for x in outputs[1:]: + res = tf.math.multiply(res, 1 + x) + return res + +ENSEMBLE_MODE = { + 'multiply': multiplyOutputs +} + +class DQNEnsembleAgent: + def __init__(self, models, mode='multiply', exploreRate=0, noise=None): + self._model = combineModels(models, ENSEMBLE_MODE.get(mode, mode)) + self._exploreRate = exploreRate + self._noise = noise + return + + def reset(self): + return + + def process(self, state, actionsMask = [1, 1, 1, 1]): + return self.processBatch([state], [actionsMask])[0] + + def processBatch(self, states, actionsMask): + actions = self._model.predict([np.array(states), np.array(actionsMask)]) + if 0 < self._exploreRate: + rndIndexes = np.where(np.random.random_sample(actions.shape[0]) < self._exploreRate) + actions[rndIndexes] = np.random.random_sample(actions.shape)[rndIndexes] + + if not (self._noise is None): + # softmax + e_x = np.exp(actions - actions.max(axis=-1, keepdims=True)) + normed = e_x / e_x.sum(axis=-1, keepdims=True) + # add noise + actions = normed + (np.random.random_sample(actions.shape) * self._noise) + + actions[np.where(~(1 == np.array(actionsMask)))] = -math.inf + return actions.argmax(axis=-1) \ No newline at end of file diff --git 
a/Agent/RandomAgent.py b/Agent/RandomAgent.py new file mode 100644 index 0000000..3f969f3 --- /dev/null +++ b/Agent/RandomAgent.py @@ -0,0 +1,17 @@ +import numpy as np +import math + +class RandomAgent: + def __init__(self): + return + + def reset(self): + pass + + def process(self, state, actionsMask = [1, 1, 1, 1]): + return self.processBatch([state], [actionsMask]) + + def processBatch(self, states, actionsMask): + actions = np.random.random_sample((np.array(states).shape[0], 4)) + actions[np.where(~(1 == np.array(actionsMask)))] = -math.inf + return actions.argmax(axis=-1) \ No newline at end of file diff --git a/tests/__init__.py b/Agent/__init__.py similarity index 100% rename from tests/__init__.py rename to Agent/__init__.py diff --git a/CMazeExperience.py b/CMazeExperience.py deleted file mode 100644 index 5083588..0000000 --- a/CMazeExperience.py +++ /dev/null @@ -1,59 +0,0 @@ -import random -import numpy as np -import math - -class CMazeExperience: - def __init__(self, maxSize): - self.maxSize = maxSize - self.sizeLimit = (maxSize * 1.1) - self.episodes = [] - self.gamma = 0.5 - self.minScore = -math.inf - - def addEpisode(self, replay): - score = sum(x[2] for x in replay) - if score < self.minScore: return - self.episodes.append((replay, score)) - - if self.sizeLimit < len(self.episodes): - self.update() - return - - def update(self): - self.episodes = list( - sorted(self.episodes, key=lambda x: x[1], reverse=True) - )[:self.maxSize] - self.minScore = self.episodes[-1][1] - print('Min score: %.6f' % self.minScore) - - def __len__(self): - return len(self.episodes) - - def take_batch(self, batch_size): - batch = [] - weights = [x[1] for x in self.episodes] - while len(batch) < batch_size: - episode, _ = random.choices( - self.episodes, - weights=weights, - k=1 - )[0] - - minibatchIndexes = set(random.choices( - np.arange(len(episode)), - weights=[abs(x[2]) for x in episode], - k=min((5, batch_size - len(batch), len(episode))) - )) - - for ind in minibatchIndexes: - state, act, score, nextState = episode[ind] - nextStateWeight = 1 if ind < len(episode) - 1 else 0 - batch.append((state, act, score, nextState, nextStateWeight)) - - return ( - np.array([x[0] for x in batch]), - np.array([x[1] for x in batch]), - np.array([x[2] for x in batch]), - np.array([x[3] for x in batch]), - np.array([x[4] for x in batch]), - ) \ No newline at end of file diff --git a/Core/CMazeEnviroment.py b/Core/CMazeEnvironment.py similarity index 79% rename from Core/CMazeEnviroment.py rename to Core/CMazeEnvironment.py index 16a8636..de2b4a6 100644 --- a/Core/CMazeEnviroment.py +++ b/Core/CMazeEnvironment.py @@ -11,17 +11,18 @@ class MazeActions(Enum): MAZE_ACTIONS_AS_INT = { x: i for i, x in enumerate(MazeActions) } MAZE_ACTIONS = [x for x in MazeActions] -class CMazeEnviroment: +class CMazeEnvironment: def __init__(self, maze, pos, FOV, minimapSize): self.maze = np.pad(np.array(maze), FOV, constant_values=(1,)) self.minimapSize = minimapSize - self._fov = FOV + self._fov = self.FOV = FOV x, y = np.array(pos) + FOV self.spawnAt(x, y) return def spawnAt(self, x, y): + self._steps = 0 self.pos = np.array([y, x]) self.fog = np.zeros_like(self.maze) self.moves = np.zeros_like(self.maze) @@ -42,14 +43,19 @@ def _update(self): y, x = self.pos d = self._fov self.fog[x - d:x + d + 1, y - d:y + d + 1] = 1 + self.moves *= .98 self.moves[x, y] = 1 return def apply(self, action): + self._steps += 1 self.pos += action.value - self.lastAction = MAZE_ACTIONS_AS_INT[action] self._update() return + + def 
isMovingToVisited(self, action): + y, x = self.pos + action.value + return 1 == self.moves[x, y] def vision(self): y, x = self.pos @@ -81,23 +87,21 @@ def minimap(self): return ((maze * fog) - (1 - fog), moves) @property - def state(self): - return ((self.minimap(), ), self.score, self.done) - - @property - def done(self): + def dead(self): y, x = self.pos return 0 < self.maze[x, y] @property def score(self): - h, w = self.fog.shape - total = h * w - return np.count_nonzero(self.fog) / total + return np.count_nonzero(self.fog) * self.minScoreDelta + + @property + def steps(self): + return self._steps def copy(self): # dirty copy - res = CMazeEnviroment(self.maze, self.pos, self._fov, self.minimapSize) + res = CMazeEnvironment(self.maze, self.pos, self._fov, self.minimapSize) res.maze = self.maze.copy() res.fog = self.fog.copy() res.pos = self.pos.copy() @@ -114,6 +118,9 @@ def validActions(self): def validActionsIndex(self): return [ i for i, act in enumerate(MazeActions) if self.isPossible(act) ] + def actionsMask(self): + return [ (1 if self.isPossible(act) else 0) for act in MazeActions ] + def invalidActions(self): return [ i for i, act in enumerate(MazeActions) if not self.isPossible(act) ] @@ -124,4 +131,9 @@ def state2input(self): @property def input_size(self): - return self.state2input().shape \ No newline at end of file + return self.state2input().shape + + @property + def minScoreDelta(self): + h, w = self.fog.shape + return 1.0 / (h * w) \ No newline at end of file diff --git a/Core/MazeRLWrapper.py b/Core/MazeRLWrapper.py new file mode 100644 index 0000000..3d2cb4e --- /dev/null +++ b/Core/MazeRLWrapper.py @@ -0,0 +1,97 @@ +from Core.CMazeEnvironment import CMazeEnvironment, MAZE_ACTIONS +import numpy as np +import math + +class MazeRLWrapper: + def __init__(self, params): + maze = ( + params.get('obstacles rate', 0.8) < np.random.rand(params['size'], params['size']) + ).astype(np.float32) + + env = CMazeEnvironment( + maze=maze, + pos=(0, 0), + FOV=params['FOV'], + minimapSize=params['minimapSize'] + ) + env.respawn() + self._env = env + + self._stepsLimit = params['loop limit'] + self._minUniqSteps = params.get('min unique positions rate', 0.3) + self._stopIfLoop = params.get('stop if loop', True) + self._onlyNewCells = params.get('only new cells reward', False) + return + + def reset(self): + self._stopInLoop = False + self._done = False + self._env.respawn() + self._moves = [] + return + + def apply(self, actionIndex): + act = MAZE_ACTIONS[actionIndex] + prevState = self.state + prevScore = self.score + isNewCell = not self._env.isMovingToVisited(act) + self._env.apply(act) + nextState = self.state + + self._done = True + if self._env.dead: # unreachable due to actions masking + return nextState, -10, True, prevState + + if 0.95 <= self._env.score: + return nextState, 0, True, prevState + + if self._movingLoop(): + return nextState, -5, True, prevState + + self._done = False + reward = 0.3 if isNewCell else 0 # small reward for visiting new cell + + if not self._onlyNewCells: + discovered = (self._env.score - prevScore) / self._env.minScoreDelta + reward += 1 + math.log(discovered, 10) if 0 < discovered else -1 + return nextState, reward, False, prevState + + def actionsMask(self): + return self._env.actionsMask() + + @property + def state(self): + return self._env.state2input() + + @property + def done(self): + return self._done + + @property + def hitTheLoop(self): + return self._stopInLoop + + @property + def score(self): + return self._env.score + + @property + def 
input_size(self): return self._env.input_size + + @property + def uniqueMoves(self): + if self._stepsLimit <= len(self._moves): + return len(set(self._moves)) / len(self._moves) + return 1 + + def _movingLoop(self): + self._moves.append(str(self._env.pos)) + self._moves = self._moves[1:] if self._stepsLimit < len(self._moves) else self._moves + self._stopInLoop = self._stopIfLoop and (self.uniqueMoves < self._minUniqSteps) + return self._stopInLoop + + def Continue(self): + self._done = False + self._moves = [] + return \ No newline at end of file diff --git a/README.md b/README.md index 83801d3..90d9903 100644 --- a/README.md +++ b/README.md @@ -1 +1,34 @@ -# deep-maze \ No newline at end of file +# Deep Maze + +This project simulates the exploration of a simple grid world by a bot with a limited field of view. Points are awarded solely for uncovering new areas of the world, which requires, at a minimum, planning and navigation. + +Typical world settings: + +``` +Size - 64x64 +Obstacle generation probability - 80% +Area revealed around the agent - 7x7 +Visible area - 17x17 +``` + +At the moment plain Q-learning without any memory is used, so a layer with data about the agent's past movements was added to the world state. This way the agent knows whether the surrounding cells are passable, whether it has already visited a cell, and how long ago (avoiding the need for an LSTM). + +Agents very often got stuck in difficult areas, so detection of this behaviour was added: the agent is stopped and then relaunched in exploration mode. The data collected this way goes into a separate memory buffer, which is later used to train the agent to act in such situations. Empirically the effect is noticeable, but it is hard to claim the approach is clearly beneficial. + +Initially a CNN was used (which is more natural for maps), but a simple Dense network gave comparable results. It is possible, of course, that the other improvements could have led to a more noticeable gain in the CNN's predictions. Various reward schemes, initial conditions, preprocessing options, etc. were also tried. + +Long training of a single agent did not yield tangible progress, so in the end 4 versions of the same network were trained and their decisions are combined (see [DQNEnsembleAgent.py](Agent/DQNEnsembleAgent.py)). The ensemble of agents produces more stable results in difficult situations: for example, if the agent ends up in a tricky part of the space, the chance that it manages to find a way out is substantially higher than when it relies on the prediction of a single network. The ensemble does not improve the upper bound, though.
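To make the combination rule concrete, here is a minimal NumPy sketch of the `multiply` mode from [DQNEnsembleAgent.py](Agent/DQNEnsembleAgent.py): each model's Q-values go through a softmax restricted to the currently valid actions, and the resulting distributions are multiplied together (illustrative only; the actual implementation does this inside a Keras `Lambda` layer with sparse tensors):

```
import numpy as np

def ensemble_action(q_values_per_model, actions_mask):
  # q_values_per_model: list of (batch, 4) Q-value arrays, one per model
  # actions_mask: (batch, 4) array with 1 for valid actions, 0 otherwise
  mask = np.asarray(actions_mask, dtype=bool)
  combined = np.ones(q_values_per_model[0].shape)
  for q in q_values_per_model:
    z = np.where(mask, q, -np.inf)            # drop invalid actions
    z = z - z.max(axis=-1, keepdims=True)     # stabilised softmax
    p = np.exp(z)
    p = p / p.sum(axis=-1, keepdims=True)
    combined *= 1.0 + np.where(mask, p, 0.0)  # multiply (1 + p) across models
  combined = np.where(mask, combined, -np.inf)
  return combined.argmax(axis=-1)
```

Actions favoured by several models end up with the largest product, so the ensemble tends to follow the consensus of the networks rather than any single prediction.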
+ +Below is a comparison of the upper bound (the amount of uncovered area in 10 out of 100 simulations, over 5 runs): + +![](img/20201231-high.jpg) + +As can be seen, the ensemble behaves more stably, but not much better than its individual members. + +And this is the lower bound (the amount of uncovered area in 90 out of 100 simulations, over 5 runs): + +![](img/20201231-low.jpg) + + + + diff --git a/Utils/ExperienceBuffers/CebLinear.py b/Utils/ExperienceBuffers/CebLinear.py new file mode 100644 index 0000000..805b92c --- /dev/null +++ b/Utils/ExperienceBuffers/CebLinear.py @@ -0,0 +1,83 @@ +import random +import numpy as np +import math +import itertools + +_WEIGHTS_MODES = { + 'abs': math.fabs, + 'reward': lambda x: x, + 'same': lambda _: 1 +} + +class CebLinear: + def __init__(self, maxSize, sampleWeight='same'): + self.maxSize = maxSize + self._sizeLimit = math.floor(maxSize * 1.1) + self._samples = [] + self._sampleWeight = _WEIGHTS_MODES.get(sampleWeight, sampleWeight) + + def addEpisode(self, replay, terminated): + if 1 < len(replay): + for step in replay[:-1]: + self._samples.append((*step, 1)) + self._samples.append((*replay[-1], -1 if terminated else 0)) + + self.update() + return + + def update(self): + if self._sizeLimit < len(self._samples): + self._samples = self._samples[-self.maxSize:] + return + + def __len__(self): + return len(self._samples) + + def _fixRewardMultiplier(self, x): + if np.isscalar(x): + return abs(x) + + if isinstance(x, (np.ndarray, np.generic)): + return np.abs(x) + + raise Exception('Unknown reward type. (%s)' % type(x)) + + def _createBatch(self, batch_size, sampler): + samplesLeft = batch_size + cumweights = list(itertools.accumulate(self._sampleWeight(x[2]) for x in self._samples)) + indexRange = np.arange(len(self._samples)) + res = [] + while 0 < samplesLeft: + indexes = set(random.choices( + indexRange, cum_weights=cumweights, + k=min((samplesLeft, len(self._samples))) + )) + + for i in indexes: + sample = sampler(i) + if sample: + while len(res) < len(sample): res.append([]) + for i, value in enumerate(sample[:-1]): + res[i].append(value) + res[-1].append(self._fixRewardMultiplier(sample[-1])) + samplesLeft -= 1 + + return [np.array(values) for values in res] + + def sampleBatch(self, batch_size): + return self._createBatch(batch_size, lambda i: self._samples[i]) + + def sampleSequenceBatch(self, batch_size, sequenceLen, **kwargs): + def sampler(ind): + sample = self._samples[ind:ind+sequenceLen] + if not (sequenceLen == len(sample)): return None + if 1 < sequenceLen: + if any(x[-1] < 1 for x in sample[:-1]): + return None + + transposed = [ + np.array([x[col] for x in sample]) for col in range(len(sample[0])) + ] + return transposed + + return self._createBatch(batch_size, sampler) \ No newline at end of file diff --git a/Utils/ExperienceBuffers/CebPrioritized.py b/Utils/ExperienceBuffers/CebPrioritized.py new file mode 100644 index 0000000..be57d08 --- /dev/null +++ b/Utils/ExperienceBuffers/CebPrioritized.py @@ -0,0 +1,104 @@ +import random +import numpy as np +import math +import itertools + +_WEIGHTS_MODES = { + 'abs': math.fabs, + 'reward': lambda x: x, + 'same': lambda _: 1 +} + +class CebPrioritized: + def __init__(self, maxSize, sampleWeight='abs'): + self.maxSize = maxSize + self.sizeLimit = math.floor(maxSize * 1.1) + self.episodes = [] + self.minScore = -math.inf + self._sampleWeight = _WEIGHTS_MODES.get(sampleWeight, sampleWeight) + + def addEpisode(self, replay, terminated): + score = sum(x[2] for x in replay) # state, action, 2 - reward + if score < self.minScore: return + self.episodes.append((replay, score, terminated)) + + if self.sizeLimit < len(self.episodes): + self.update() + return + + def update(self): + self.episodes = list( + sorted(self.episodes, key=lambda x: x[1],
reverse=True) + )[:self.maxSize] + self.minScore = self.episodes[-1][1] + return + + def __len__(self): + return len(self.episodes) + + def _sampleIndexes(self, episode, maxSamples): + return set(random.choices( + np.arange(len(episode)), + weights=[self._sampleWeight(x[2]) for x in episode], + k=min((maxSamples, len(episode))) + )) + + def _createBatch(self, batch_size, sampler): + batchSize = 0 + cumweights = list(itertools.accumulate(x[1] for x in self.episodes)) + res = [] + while batchSize < batch_size: + Episode = random.choices(self.episodes, cum_weights=cumweights, k=1)[0] + for sample, rewardMultiplier in sampler(Episode, batch_size - batchSize): + while len(res) <= len(sample): res.append([]) + for i, value in enumerate(sample): + res[i].append(value) + res[-1].append(rewardMultiplier) + batchSize += 1 + + return [np.array(values) for values in res] + + def sampleBatch(self, batch_size, maxSamplesFromEpisode=5): + def sampler(Episode, limit): + limit = min((maxSamplesFromEpisode, limit)) + episode, _, wasTerminated = Episode + lastActionScore = 1 if wasTerminated else 0 + minibatchIndexes = self._sampleIndexes(episode, limit) + for ind in minibatchIndexes: + yield (( + episode[ind], + lastActionScore if ind == len(episode) - 1 else 1 # last action in replay? + )) + return + + return self._createBatch(batch_size, sampler) + + def _sampleEpisodeMultipleSteps(self, Episode, maxSamplesFromEpisode, steps): + episode, _, wasTerminated = Episode + lastActionScore = 1 if wasTerminated else 0 + minibatchIndexes = self._sampleIndexes( + episode[:-(steps - 1)] if 1 < steps else episode, + maxSamplesFromEpisode + ) + for ind in minibatchIndexes: + sample = episode[ind:ind+steps] + transposed = [ + np.array([x[col] for x in sample]) for col in range(len(sample[0])) + ] + yield(( + transposed, + np.array([ # last action in replay? 
+ (lastActionScore if (ind + i) == len(episode) - 1 else 1) for i in range(steps) + ]) + )) + return + + def sampleSequenceBatch(self, batch_size, sequenceLen, maxSamplesFromEpisode=5): + def sampler(Episode, limit): + return self._sampleEpisodeMultipleSteps( + Episode, + maxSamplesFromEpisode=min((maxSamplesFromEpisode, limit)), + steps=sequenceLen + ) + + return self._createBatch(batch_size, sampler) \ No newline at end of file diff --git a/Utils/ExperienceBuffers/__init__.py b/Utils/ExperienceBuffers/__init__.py new file mode 100644 index 0000000..15951f2 --- /dev/null +++ b/Utils/ExperienceBuffers/__init__.py @@ -0,0 +1,2 @@ +from .CebPrioritized import CebPrioritized +from .CebLinear import CebLinear \ No newline at end of file diff --git a/Utils/__init__.py b/Utils/__init__.py new file mode 100644 index 0000000..fe40b19 --- /dev/null +++ b/Utils/__init__.py @@ -0,0 +1,80 @@ +import pylab as plt +import numpy as np +import math + +def emulateBatch(testEnvs, agent, maxSteps): + replays = [[] for _ in testEnvs] + steps = 0 + while (steps < maxSteps) and not all(e.done for e in testEnvs): + steps += 1 + + activeEnvs = [(i, e) for i, e in enumerate(testEnvs) if not e.done] + + states = [e.state for _, e in activeEnvs] + actionsMasks = [e.actionsMask() for _, e in activeEnvs] + actions = agent.processBatch(states, actionsMasks) + + for (i, e), action, actionsMask in zip(activeEnvs, actions, actionsMasks): + state, reward, done, prevState = e.apply(action) + replays[i].append((prevState, action, reward, actionsMask)) + if done: # save last state with dummy data + replays[i].append((state, action, 0, actionsMask)) + + return [(replay, e.done) for replay, e in zip(replays, testEnvs)] + +def normalizeRewards(replay): + prevStates, actions, rewards, actionsMasks = zip(*replay) + rewards = np.array(rewards) + + std = rewards.std() + std = 1 if 0 == std else std + rewards = (rewards - rewards.mean()) / std + return list(zip(prevStates, actions, rewards, actionsMasks)) + +def clipReplay(replay, loopLimit): + if loopLimit < len(replay): + # find and cutoff loops + tail = replay[-loopLimit:] + lastState = replay[-1][0] + ind = next((i for i, step in enumerate(tail) if np.array_equal(step[0], lastState)), len(tail)) - len(tail) + return replay[ind:] + return replay + +def trackScores(scores, metrics, levels=[.1, .3, .5, .75, .9], metricName='scores'): + if metricName not in metrics: + metrics[metricName] = {} + + def series(name): + if name not in metrics[metricName]: + metrics[metricName][name] = [] + return metrics[metricName][name] + ######## + N = len(scores) + orderedScores = list(sorted(scores, reverse=True)) + totalScores = sum(scores) / N + series('avg.').append(totalScores) + + for level in levels: + series('top %.0f%%' % (level * 100)).append(orderedScores[int(N * level)]) + return + +def plotData2file(data, filename, maxCols=3): + plt.clf() + N = len(data) + rows = (N + maxCols - 1) // maxCols + cols = min((N, maxCols)) + + figSize = plt.rcParams['figure.figsize'] + fig = plt.figure(figsize=(figSize[0] * cols, figSize[1] * rows)) + + axes = fig.subplots(ncols=cols, nrows=rows) + axes = axes.reshape((-1,)) if 1 < len(data) else [axes] + for (chartname, series), axe in zip(data.items(), axes): + for name, dataset in series.items(): + axe.plot(dataset, label=name) + axe.title.set_text(chartname) + axe.legend() + + fig.savefig(filename) + plt.close(fig) + return \ No newline at end of file diff --git a/fit_stage.py b/fit_stage.py new file mode 100644 index 0000000..5011a8b --- /dev/null +++ 
b/fit_stage.py @@ -0,0 +1,36 @@ +import tensorflow as tf +import numpy as np + +def train(model, memory, params): + if len(memory) < params['batchSize']: return np.Inf + + modelClone = tf.keras.models.clone_model(model) + modelClone.set_weights(model.get_weights()) # use clone model for stability + + BOOTSTRAPPED_STEPS = params['steps'] + GAMMA = params['gamma'] + ALPHA = params.get('alpha', 1.0) + rows = np.arange(params['batchSize']) + lossSum = 0 + for _ in range(params['episodes']): + allStates, actions, rewards, _, nextStateScoreMultiplier = memory.sampleSequenceBatch( + batch_size=params['batchSize'], + maxSamplesFromEpisode=params.get('maxSamplesFromEpisode', 16), + sequenceLen=BOOTSTRAPPED_STEPS + 1 + ) + + states = allStates[:, :-1] + rewards = rewards[:, :-1] + actions = actions[:, 0] + + futureScores = modelClone.predict(allStates[:, -1]).max(axis=-1) * nextStateScoreMultiplier[:, -1] + totalRewards = (rewards * (GAMMA ** np.arange(BOOTSTRAPPED_STEPS))).sum(axis=-1) + targets = modelClone.predict(states[:, 0]) + targets[rows, actions] += ALPHA * ( + totalRewards + futureScores * (GAMMA ** BOOTSTRAPPED_STEPS) - targets[rows, actions] + ) + + lossSum += model.fit(states[:, 0], targets, epochs=1, verbose=0).history['loss'][0] + ### + + return lossSum / params['episodes'] diff --git a/img/20201231-high.jpg b/img/20201231-high.jpg new file mode 100644 index 0000000..528a246 Binary files /dev/null and b/img/20201231-high.jpg differ diff --git a/img/20201231-low.jpg b/img/20201231-low.jpg new file mode 100644 index 0000000..04b7153 Binary files /dev/null and b/img/20201231-low.jpg differ diff --git a/learn_environment.py b/learn_environment.py new file mode 100644 index 0000000..669816b --- /dev/null +++ b/learn_environment.py @@ -0,0 +1,118 @@ +from Core.MazeRLWrapper import MazeRLWrapper +from Utils.ExperienceBuffers.CebPrioritized import CebPrioritized +from Agent.DQNAgent import DQNAgent +import time +import Utils +import fit_stage +import os +from Utils.ExperienceBuffers.CebLinear import CebLinear + +def learn_environment(model, params): + NAME = params['name'] + BATCH_SIZE = params['batch size'] + GAMMA = params['gamma'] + BOOTSTRAPPED_STEPS = params['bootstrapped steps'] + LOOP_LIMIT = params['maze']['loop limit'] + metrics = {} + + environments = [ + MazeRLWrapper(params['maze']) for _ in range(params['test episodes']) + ] + + memory = CebPrioritized(maxSize=5000, sampleWeight='abs') + doomMemory = CebLinear( + maxSize=params.get('max steps after loop', 16) * 1000, + sampleWeight='abs' + ) + + ###################################################### + def testModel(EXPLORE_RATE): + for e in environments: e.reset() + replays = Utils.emulateBatch( + environments, + DQNAgent(model, exploreRate=EXPLORE_RATE, noise=params.get('agent noise', 0)), + maxSteps=params.get('max test steps') + ) + for replay, _ in replays: + if params.get('clip replay', False): + replay = Utils.clipReplay(replay, loopLimit=LOOP_LIMIT) + if BOOTSTRAPPED_STEPS < len(replay): + memory.addEpisode(replay, terminated=True) + + scores = [x.score for x in environments] + ################ + # collect bad experience + envs = [e for e in environments if e.hitTheLoop] + if envs: + for e in envs: e.Continue() + replays = Utils.emulateBatch( + envs, + DQNAgent( + model, + exploreRate=params.get('explore rate after loop', 1), + noise=params.get('agent noise after loop', 0) + ), + maxSteps=params.get('max steps after loop', 16) + ) + for replay, _ in replays: + if BOOTSTRAPPED_STEPS < len(replay): + 
doomMemory.addEpisode(replay, terminated=True) + ################ + return scores + ###################################################### + # collect some experience + for _ in range(2): + testModel(EXPLORE_RATE=0) + ####################### + bestModelScore = -float('inf') + for epoch in range(params['epochs']): + T = time.time() + + EXPLORE_RATE = params['explore rate'](epoch) + alpha = params.get('alpha', lambda _: 1)(epoch) + print( + '[%s] %d/%d epoch. Explore rate: %.3f. Alpha: %.5f.' % (NAME, epoch, params['epochs'], EXPLORE_RATE, alpha) + ) + ################## + # Training + trainLoss = fit_stage.train( + model, memory, + { + 'gamma': GAMMA, + 'batchSize': BATCH_SIZE, + 'steps': BOOTSTRAPPED_STEPS, + 'episodes': params['train episodes'](epoch), + 'alpha': alpha + } + ) + print('Avg. train loss: %.4f' % trainLoss) + + trainLoss = fit_stage.train( + model, doomMemory, + { + 'gamma': GAMMA, + 'batchSize': BATCH_SIZE, + 'steps': BOOTSTRAPPED_STEPS, + 'episodes': params['train doom episodes'](epoch), + 'alpha': params.get('doom alpha', lambda _: alpha)(epoch) + } + ) + print('Avg. train doom loss: %.4f' % trainLoss) + ################## + # test + print('Testing...') + scores = testModel(EXPLORE_RATE) + Utils.trackScores(scores, metrics) + ################## + + scoreSum = sum(scores) + print('Scores sum: %.5f' % scoreSum) + if (bestModelScore < scoreSum) and (params['warm up epochs'] < epoch): + print('save best model (%.2f => %.2f)' % (bestModelScore, scoreSum)) + bestModelScore = scoreSum + model.save_weights('weights/%s.h5' % NAME) + ################## + os.makedirs('charts', exist_ok=True) + Utils.plotData2file(metrics, 'charts/%s.jpg' % NAME) + print('Epoch %d finished in %.1f sec.' % (epoch, time.time() - T)) + print('------------------') \ No newline at end of file diff --git a/main.py b/main.py deleted file mode 100644 index ab3419c..0000000 --- a/main.py +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -def main(): - pass - -if __name__ == '__main__': - main() diff --git a/model.py b/model.py index d94c1b2..23c6834 100644 --- a/model.py +++ b/model.py @@ -2,28 +2,23 @@ import tensorflow.keras.layers as layers import tensorflow as tf -def convBlock(prev, sz, filters): - conv_1 = layers.Convolution2D(filters, (sz, sz), padding="same", activation="relu")(prev) - conv_1 = layers.Dropout(0.1)(conv_1) - conv_1 = layers.BatchNormalization()(conv_1) - return conv_1 - def createModel(shape): inputs = res = layers.Input(shape=shape) - res = convBlock(res, 3, filters=32) - res = convBlock(res, 3, filters=32) - res = convBlock(res, 3, filters=32) - - res = layers.Flatten()(res) + raw = res = layers.Flatten()(res) + res = layers.Dense(256, activation='relu')(res) + res = layers.Dense(256, activation='relu')(res) + res = layers.Dense(128, activation='relu')(res) + res = layers.Concatenate()([raw, res]) # dueling dqn - valueBranch = layers.Dense(32, activation='relu')(res) - valueBranch = layers.Dense(32, activation='relu')(valueBranch) + valueBranch = layers.Dense(128, activation='relu')(res) + valueBranch = layers.Dense(64, activation='relu')(valueBranch) valueBranch = layers.Dense(32, activation='relu')(valueBranch) valueBranch = layers.Dense(1, activation='linear')(valueBranch) actionsBranch = layers.Dense(128, activation='relu')(res) actionsBranch = layers.Dense(64, activation='relu')(actionsBranch) + actionsBranch = layers.Concatenate()([raw, actionsBranch]) actionsBranch = layers.Dense(64, activation='relu')(actionsBranch) actionsBranch = 
layers.Dense(64, activation='relu')(actionsBranch) actionsBranch = layers.Dense(4, activation='linear')(actionsBranch) diff --git a/test.py b/test.py new file mode 100644 index 0000000..3b6fac9 --- /dev/null +++ b/test.py @@ -0,0 +1,99 @@ +# -*- coding: utf-8 -*- +import sys +import os +import tensorflow as tf +import Utils + +if 'COLAB_GPU' in os.environ: + # fix resolve modules + from os.path import dirname + sys.path.append(dirname(dirname(dirname(__file__)))) +else: # local GPU + gpus = tf.config.experimental.list_physical_devices('GPU') + tf.config.experimental.set_virtual_device_configuration( + gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=3 * 1024)] + ) + +from model import createModel +from Core.MazeRLWrapper import MazeRLWrapper +import glob +from Agent.DQNAgent import DQNAgent +from Agent.DQNEnsembleAgent import DQNEnsembleAgent +import pylab as plt +import numpy as np + +####################################### +MAZE_FOV = 3 +MAZE_MINIMAP_SIZE = 8 +####################################### + +def plot2file(data, filename, chartname): + plt.clf() + + figSize = plt.rcParams['figure.figsize'] + fig = plt.figure(figsize=(figSize[0] * 2, figSize[1])) + + axe = fig.subplots() + series = data[chartname] + for name, dataset in series.items(): + axe.plot(dataset, label=name) + axe.title.set_text(chartname) + + fig.legend() + fig.savefig(filename) + plt.close(fig) + return + +def testAgent(environments, agent, name, metrics, N=5): + print('Agent: %s' % name) + + scoreTop90 = metrics['scores']['%s worst 10%%' % name] = [] + + for i in range(N): + print('Round %d/%d...' % (i, N)) + scores = [] + + for e in environments: e.reset() + replays = Utils.emulateBatch(environments, agent, maxSteps=1000) + for (replay, _), env in zip(replays, environments): + scores.append(env.score) + + scores = list(sorted(scores, reverse=True)) + scoreTop90.append(scores[int(0.9 * len(scores))]) + + plot2file(metrics, 'chart.jpg', 'scores') + return + +if __name__ == "__main__": + MAZE_PARAMS = { + 'size': 64, + 'FOV': MAZE_FOV, + 'minimapSize': MAZE_MINIMAP_SIZE, + 'loop limit': 64, + } + environments = [MazeRLWrapper(MAZE_PARAMS) for _ in range(100)] + MODEL_INPUT_SHAPE = environments[0].input_size + + metrics = { + 'scores': {} + } + models = [] + for i, x in enumerate(glob.iglob('weights/*.h5')): + filename = os.path.abspath(x) + model = createModel(shape=MODEL_INPUT_SHAPE) + model.load_weights(filename) + models.append(model) + + testAgent( + environments, + DQNAgent(model), + name=os.path.basename(filename)[:-3], + metrics=metrics + ) + + testAgent( + environments, + DQNEnsembleAgent(models), + name='ensemble', + metrics=metrics + ) \ No newline at end of file diff --git a/tests/Test_CMazeEnviroment.py b/tests/Test_CMazeEnviroment.py deleted file mode 100644 index 527ca1b..0000000 --- a/tests/Test_CMazeEnviroment.py +++ /dev/null @@ -1,87 +0,0 @@ -from verify import expect -from Core.CMazeEnviroment import CMazeEnviroment, MazeActions -import numpy as np - -class Test_CMazeEnviroment: - def test_vision(self): - env = CMazeEnviroment( - maze=[ - [0, 0, 1, 0], - [0, 0, 0, 0], - [0, 1, 0, 0], - [0, 1, 0, 1], - [0, 0, 2, 0], - ], - pos=(0, 0), - FOV=1 - ) - - valid = np.array([ - [1, 1, 1], - [1, 0, 0], - [1, 0, 0], - ]) - expect(str(env.vision())).is_equal(str(valid)) - - def test_apply(self): - env = CMazeEnviroment( - maze=[ - [0, 0, 1, 0], - [0, 0, 0, 0], - [0, 1, 0, 0], - [0, 1, 0, 1], - [0, 0, 2, 0], - ], - pos=(0, 0), - FOV=1 - ) - env.apply(MazeActions.RIGHT) - 
env.apply(MazeActions.DOWN) - env.apply(MazeActions.RIGHT) - env.apply(MazeActions.DOWN) - env.apply(MazeActions.RIGHT) - - valid = np.array([ - [0, 0, 1], - [0, 0, 1], - [0, 1, 1], - ]) - expect(str(env.vision())).is_equal(str(valid)) - - def test_increasingScoreWhileExploring(self): - env = CMazeEnviroment( - maze=[ - [0, 0, 1, 0], - [0, 0, 0, 0], - [0, 1, 0, 0], - [0, 1, 0, 1], - [0, 0, 2, 0], - ], - pos=(0, 0), - FOV=1 - ) - - oldScore = env.score - env.apply(MazeActions.RIGHT) - newScore = env.score - expect(oldScore).is_less(newScore) - - def test_scoreNotChanged(self): - env = CMazeEnviroment( - maze=[ - [0, 0, 1, 0], - [0, 0, 0, 0], - [0, 1, 0, 0], - [0, 1, 0, 1], - [0, 0, 2, 0], - ], - pos=(0, 0), - FOV=1 - ) - - env.apply(MazeActions.RIGHT) - oldScore = env.score - env.apply(MazeActions.LEFT) - newScore = env.score - expect(oldScore).is_equal(newScore) - \ No newline at end of file diff --git a/train.py b/train.py index 86aa7f8..3678541 100644 --- a/train.py +++ b/train.py @@ -2,7 +2,6 @@ import sys import os import tensorflow as tf -from CMazeExperience import CMazeExperience if 'COLAB_GPU' in os.environ: # fix resolve modules @@ -11,135 +10,66 @@ else: # local GPU gpus = tf.config.experimental.list_physical_devices('GPU') tf.config.experimental.set_virtual_device_configuration( - gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1 * 1024)] + gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=3 * 1024)] ) -import random -import numpy as np +from learn_environment import learn_environment +from tensorflow.keras.optimizers import Adam +from tensorflow.keras.losses import Huber -from keras.optimizers import Adam - -from Core.CMazeEnviroment import CMazeEnviroment, MAZE_ACTIONS from model import createModel +from Core.MazeRLWrapper import MazeRLWrapper -def emulate(env, model, exploreRate, exploreDecay, steps, stopOnInvalid=False): - episodeReplay = [] - done = False - N = 0 - while (N < steps) and not done: - N += 1 - act = None - valid = env.validActionsIndex() - if not valid: break +####################################### +MAZE_FOV = 3 +MAZE_MINIMAP_SIZE = 8 +MAZE_LOOPLIMIT = 32 +####################################### - state = env.state2input() - if random.random() < exploreRate: - act = random.choice(valid) - else: - probe = model.predict(np.array([state]))[0] - if not stopOnInvalid: - for i in env.invalidActions(): - probe[i] = -float('inf') - act = np.argmax(probe) - - if stopOnInvalid and not (act in valid): - episodeReplay.append([state, act, -10, env.state2input()]) - break - - prevScore = env.score - env.apply(MAZE_ACTIONS[act]) - normedScore = 1 if 0 < (env.score - prevScore) else -0.1 - episodeReplay.append([state, act, normedScore, env.state2input()]) - - done = env.done - exploreRate = max((.001, exploreRate * exploreDecay)) - return episodeReplay +def getModel(shape): + model = createModel(shape=MODEL_INPUT_SHAPE) + model.compile(optimizer=Adam(lr=1e-3), loss=Huber(delta=1)) + return model if __name__ == "__main__": - sz = 64 - env = CMazeEnviroment( - maze=(0.8 < np.random.rand(sz, sz)).astype(np.float32), - pos=(0, 0), - FOV=3, - minimapSize=8 - ) - memory = CMazeExperience(maxSize=1000) - done = False - batch_size = 256 - playSteps = 96 + DEFAULT_MAZE_PARAMS = { + 'size': 40, + 'FOV': MAZE_FOV, + 'minimapSize': MAZE_MINIMAP_SIZE, + 'loop limit': MAZE_LOOPLIMIT, + } - bestModelScore = -float('inf') - model = createModel(shape=env.input_size) - model.compile( - optimizer=Adam(lr=1e-3), - loss='mean_squared_error' - ) - 
#model.load_weights('weights/best.h5') + MODEL_INPUT_SHAPE = MazeRLWrapper(DEFAULT_MAZE_PARAMS).input_size - targetModel = createModel(shape=env.input_size) - # collect data - while len(memory) < 100: - env.respawn() - episodeReplay = emulate( - env, model, - exploreRate=1, - exploreDecay=1, - steps=playSteps, - stopOnInvalid=False - ) - ################# - if 1 < len(episodeReplay): - memory.addEpisode(episodeReplay) - print(len(memory), env.score) - - train_episodes = 100 - test_episodes = 20 - exploreRate = .5 - exploreDecayPerEpoch = .95 - exploreDecay = .95 - for epoch in range(5000): - print('Epoch %d' % epoch) - # train - targetModel.set_weights(model.get_weights()) - lossSum = 0 - for n in range(train_episodes): - states, actions, rewards, nextStates, nextReward = memory.take_batch(batch_size) - nextScores = targetModel.predict(nextStates) - targets = targetModel.predict(states) - targets[np.arange(len(targets)), actions] = rewards + np.max(nextScores, axis=1) * .95 * nextReward - - lossSum += model.fit( - states, targets, - epochs=1, - verbose=0 - ).history['loss'][0] + ####################### + DEFAULT_LEARNING_PARAMS = { + 'maze': DEFAULT_MAZE_PARAMS, + 'batch size': 256, + 'gamma': 0.95, + 'bootstrapped steps': 3, - print('Avg. train loss: %.4f' % (lossSum / train_episodes)) + 'epochs': 100, + 'warm up epochs': 0, + 'test episodes': 128, + 'train episodes': lambda _: 128, + 'train doom episodes': lambda _: 32, - # test - print('Epoch %d testing' % epoch) - bestScore = scoreSum = movesSum = 0 - n = 0 - while n < test_episodes: - env.respawn() - episodeReplay = emulate( - env, model, - exploreRate=exploreRate, - exploreDecay=exploreDecay, - steps=playSteps*2, - stopOnInvalid=True - ) - if 1 < len(episodeReplay): - memory.addEpisode(episodeReplay) - n += 1 - bestScore = max((bestScore, env.score)) - scoreSum += env.score - movesSum += len(episodeReplay) - ################# - print('Best score: %.3f, avg. score: %.3f, avg. 
moves: %.1f' % (bestScore, scoreSum / n, movesSum / n)) - if bestModelScore < scoreSum: - bestModelScore = scoreSum - print('save best model') - model.save_weights('weights/best.h5') - model.save_weights('weights/latest.h5') - exploreRate *= exploreDecayPerEpoch \ No newline at end of file + 'alpha': lambda _: 1, + 'explore rate': lambda _: 0, + + 'agent noise': 0.01, + 'clip replay': True, + + 'explore rate after loop': 0.2, + 'agent noise after loop': 0.1 + } + ####################### + for i in range(4): + learn_environment( + getModel(MODEL_INPUT_SHAPE), + { + **DEFAULT_LEARNING_PARAMS, + 'name': 'agent-%d' % i, + 'max test steps': 1000 + } + ) \ No newline at end of file diff --git a/view_maze.py b/view_maze.py index 2fd8fa1..363bd6d 100644 --- a/view_maze.py +++ b/view_maze.py @@ -2,24 +2,27 @@ # -*- coding: utf-8 -*- import tensorflow as tf import os - +from Agent.DQNEnsembleAgent import DQNEnsembleAgent # limit GPU usage gpus = tf.config.experimental.list_physical_devices('GPU') tf.config.experimental.set_virtual_device_configuration( gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1 * 1024)] ) -from Core.CMazeEnviroment import CMazeEnviroment, MazeActions +from Core.CMazeEnvironment import CMazeEnvironment, MazeActions import numpy as np import pygame import pygame.locals as G import random +from Agent.DQNAgent import DQNAgent +import glob +from collections import namedtuple from model import createModel def createMaze(): - sz = 64 + sz = 16 * 4 maze = (0.8 < np.random.rand(sz, sz)).astype(np.float32) - res = CMazeEnviroment( + res = CMazeEnvironment( maze=maze, pos=(0, 0), FOV=3, @@ -37,41 +40,70 @@ class Colors: RED = (255, 0, 0) PURPLE = (255, 0, 255) +RLAgent = namedtuple('RLAgent', 'name agent environment') + class App: MODES = ['manual', 'random', 'agent'] - NETWORKS = ['best', 'latest'] def __init__(self): self._running = True self._display_surf = None - self._createMaze() self._mode = 'manual' self._paused = True - self._speed = 20 - self._usedNetwork = self.NETWORKS[0] + self._speed = 1 + self._agents = [] + self._activeAgent = 0 + self._createMaze() return def _createMaze(self): self._maze = createMaze() self._initMaze = self._maze.copy() + if 'agent' == self._mode: + self._assignMaze2Agents() return def on_init(self): pygame.init() - self._display_surf = pygame.display.set_mode((800, 650), pygame.HWSURFACE) + self._display_surf = pygame.display.set_mode((850, 650), pygame.HWSURFACE) pygame.display.set_caption('Deep maze') self._font = pygame.font.Font(pygame.font.get_default_font(), 16) self._running = True + def _assignMaze2Agents(self): + agents = [] + for agent in self._agents: + agents.append(RLAgent( + agent.name, agent.agent, + self._initMaze.copy() + )) + + self._agents = agents + return + def _createNewAgent(self): - filename = 'weights/%s.h5' % self._usedNetwork - if not os.path.exists(filename): - self._usedNetwork = self.NETWORKS[0] - filename = 'weights/%s.h5' % self._usedNetwork - - self._agent = createModel(shape=self._maze.input_size) - self._agent.load_weights(filename) + self._agents = [] + models = [] + for i, x in enumerate(glob.iglob('weights/*.h5')): + filename = os.path.abspath(x) + model = createModel(shape=self._maze.input_size) + model.load_weights(filename) + models.append(model) + agent = DQNAgent(model) + name = os.path.basename(filename) + + self._agents.append(RLAgent( + name[:-3], agent, self._initMaze.copy() + )) + + self._agents.insert(0, RLAgent( + 'ensemble', + DQNEnsembleAgent(models), + self._initMaze.copy() 
+ )) + + self._activeAgent = 0 self._paused = True return @@ -80,53 +112,55 @@ def on_event(self, event): self._running = False if event.type == G.KEYDOWN: + if G.K_ESCAPE == event.key: + self._running = False + + if G.K_r == event.key: + self._createMaze() + # Switch mode if G.K_m == event.key: mode = next((i for i, x in enumerate(self.MODES) if x == self._mode)) self._mode = self.MODES[(mode + 1) % len(self.MODES)] self._paused = True + self._agents = [] if 'agent' == self._mode: self._createNewAgent() - + ##### if G.K_SPACE == event.key: self._paused = not self._paused - + ##### if 'agent' == self._mode: - if G.K_r == event.key: - self._createMaze() if G.K_n == event.key: self._createNewAgent() - if G.K_t == event.key: - network = next((i for i, x in enumerate(self.NETWORKS) if x == self._usedNetwork)) - self._usedNetwork = self.NETWORKS[(network + 1) % len(self.NETWORKS)] - self._createNewAgent() - - if G.K_ESCAPE == event.key: - self._running = False - + + if G.K_a == event.key: + self._activeAgent = (self._activeAgent + 1) % len(self._agents) + if 'manual' == self._mode: - if G.K_r == event.key: - self._createMaze() - - if G.K_i == event.key: - self._maze = self._initMaze.copy() - - if G.K_y == event.key: - self._maze.respawn() + self._manualEvent(event) + + if not ('manual' == self._mode): + if G.K_KP_PLUS == event.key: + self._speed = min((32, 2 * self._speed)) + if G.K_KP_MINUS == event.key: + self._speed = max((1, self._speed // 2)) - actMapping = { - G.K_LEFT: MazeActions.LEFT, - G.K_RIGHT: MazeActions.RIGHT, - G.K_UP: MazeActions.UP, - G.K_DOWN: MazeActions.DOWN - } - - act = actMapping.get(event.key, False) - if act and self._maze.isPossible(act): - self._maze.apply(act) - ##### return - + + def _manualEvent(self, event): + actMapping = { + G.K_LEFT: MazeActions.LEFT, + G.K_RIGHT: MazeActions.RIGHT, + G.K_UP: MazeActions.UP, + G.K_DOWN: MazeActions.DOWN + } + + act = actMapping.get(event.key, False) + if act and self._maze.isPossible(act): + self._maze.apply(act) + return + def on_loop(self): if self._paused: return @@ -137,20 +171,19 @@ def on_loop(self): self._maze.apply(random.choice(actions)) if 'agent' == self._mode: - probe = self._agent.predict(np.array([self._maze.state2input()]))[0] - for i in self._maze.invalidActions(): - probe[i] = -float('inf') - pred = np.argmax(probe) - - act = list(MazeActions)[pred] - if self._maze.isPossible(act): - self._maze.apply(act) + for _ in range(self._speed): + for agent in self._agents: + maze = agent.environment + pred = agent.agent.process(maze.state2input(), maze.actionsMask()) + act = list(MazeActions)[pred] + if maze.isPossible(act): + maze.apply(act) pass - def _renderMaze(self): - fog = self._maze.fog - maze = self._maze.maze - moves = self._maze.moves + def _renderMaze(self, env): + fog = env.fog + moves = env.moves + maze = env.maze h, w = maze.shape dx, dy = delta = np.array([640, 640]) / np.array([w, h]) @@ -169,65 +202,53 @@ def _renderMaze(self): clr = np.array(clr) * .3 pygame.draw.rect(self._display_surf, clr, [x, y, dx - 1, dy - 1], 0) # current pos - x, y = delta * self._maze.pos + x, y = delta * env.pos pygame.draw.rect(self._display_surf, Colors.RED, [x, y, dx - 1, dy - 1], 0) return - def _renderMazeMinimap(self): - anchor = np.array((450, 650)) - maze, moves = self._maze.minimap() - h, w = maze.shape - dx, dy = delta = 2 * np.array([64, 64]) / np.array([w, h]) - for ix in range(w): - for iy in range(h): - isWall = 0 < maze[ix, iy] - isWasHere = 0 < moves[ix, iy] - isUnknownArea = maze[ix, iy] < 0 - - clr = 
Colors.WHITE - if isWasHere: clr = Colors.GREEN - if isWall: clr = Colors.PURPLE - if isUnknownArea: clr = Colors.BLACK + def _renderAgentsMaze(self): + self._renderMaze(self._agents[self._activeAgent].environment) + return - y, x = (delta * np.array([ix, iy])) + anchor - pygame.draw.rect(self._display_surf, clr, [x, y, dx - 1, dy - 1], 0) - + def _drawText(self, text, pos, color): self._display_surf.blit( - self._font.render( - 'Observed state:', - False, Colors.BLUE - ), (anchor[1], anchor[0] - 25) + self._font.render(text, False, color), + pos ) return def _renderInfo(self): - self._display_surf.blit( - self._font.render( - 'Score: %.2f' % (self._maze.score), - False, Colors.BLUE - ), (655, 15) - ) - - self._display_surf.blit( - self._font.render( - 'Mode: %s' % (self._mode), - False, Colors.BLUE - ), (655, 35) - ) + line = lambda i: (655, 15 + i * 20) - if 'agent' == self._mode: - self._display_surf.blit( - self._font.render( - 'Network: %s' % (self._usedNetwork), - False, Colors.BLUE - ), (655, 55) + self._drawText('Mode: %s' % (self._mode), line(0), Colors.BLUE) + if not ('agent' == self._mode): + self._drawText( + 'Score: %.1f (%d)' % (self._maze.score * 100.0, self._maze.steps), + line(1), Colors.BLUE ) + + if 'random' == self._mode: + self._drawText('Speed: x%.0f' % (self._speed), line(2), Colors.BLUE) + + if 'agent' == self._mode: + self._drawText('Speed: x%.0f' % (self._speed), line(1), Colors.BLUE) + for i, agent in enumerate(self._agents): + self._drawText( + '%s%s | %.1f (%d)' % ( + '>> ' if i == self._activeAgent else '', + agent.name, agent.environment.score * 100.0, agent.environment.steps + ), + line(2 + i), Colors.BLUE + ) return def on_render(self): self._display_surf.fill(Colors.SILVER) - self._renderMaze() - self._renderMazeMinimap() + if 'agent' == self._mode: + self._renderAgentsMaze() + else: + self._renderMaze(self._maze) + self._renderInfo() pygame.display.flip() @@ -243,7 +264,7 @@ def run(self): self.on_render() pygame.quit() - + def main(): app = App() app.run() diff --git a/weights/agent-0.h5 b/weights/agent-0.h5 new file mode 100644 index 0000000..2e02b44 Binary files /dev/null and b/weights/agent-0.h5 differ diff --git a/weights/agent-1.h5 b/weights/agent-1.h5 new file mode 100644 index 0000000..4f75ebd Binary files /dev/null and b/weights/agent-1.h5 differ diff --git a/weights/agent-2.h5 b/weights/agent-2.h5 new file mode 100644 index 0000000..fbf3649 Binary files /dev/null and b/weights/agent-2.h5 differ diff --git a/weights/agent-3.h5 b/weights/agent-3.h5 new file mode 100644 index 0000000..f139bcd Binary files /dev/null and b/weights/agent-3.h5 differ diff --git a/weights/best.h5 b/weights/best.h5 deleted file mode 100644 index 51110e6..0000000 Binary files a/weights/best.h5 and /dev/null differ diff --git a/weights/latest.h5 b/weights/latest.h5 deleted file mode 100644 index 83541f5..0000000 Binary files a/weights/latest.h5 and /dev/null differ
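For completeness, a minimal sketch of how the pieces above fit together at inference time: load one of the shipped weight files and let a single DQN agent explore a maze. This is a condensed variant of what `test.py` does; the maze parameters mirror its constants, and the weights path assumes the `weights/agent-*.h5` files added in this change are present:

```
import glob
import Utils
from model import createModel
from Core.MazeRLWrapper import MazeRLWrapper
from Agent.DQNAgent import DQNAgent

# maze settings matching test.py's MAZE_PARAMS
env = MazeRLWrapper({'size': 64, 'FOV': 3, 'minimapSize': 8, 'loop limit': 64})

model = createModel(shape=env.input_size)
model.load_weights(glob.glob('weights/*.h5')[0])  # e.g. weights/agent-0.h5

env.reset()
Utils.emulateBatch([env], DQNAgent(model), maxSteps=1000)
print('Uncovered area: %.1f%%' % (env.score * 100.0))
```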