tried to use RL (failed)
GreenWizard2015 committed Nov 24, 2020
1 parent 4389744 commit 3a2d4c4
Showing 5 changed files with 384 additions and 19 deletions.
67 changes: 67 additions & 0 deletions CMazeExperience.py
@@ -0,0 +1,67 @@
import random
import numpy as np
import math

class CMazeExperience:
  def __init__(self, maxSize):
    self.maxSize = maxSize
    self.sizeLimit = (maxSize * 1.1)
    self.episodes = []
    self.gamma = 0.5
    self.minScore = -math.inf

  def addEpisode(self, replay):
    score = sum(x[2] for x in replay)
    if score < self.minScore: return

    # for i in range(len(replay)):
    #   state, act, score, nextState = replay[i]
    #   gamma = self.gamma
    #   for j in range(i + 1, len(replay)):
    #     score += gamma * replay[j][2]
    #     gamma *= self.gamma
    self.episodes.append((replay, score))

    if self.sizeLimit < len(self.episodes):
      self.update()
    return

  def update(self):
    self.episodes = list(
      sorted(self.episodes, key=lambda x: x[1], reverse=True)
    )[:self.maxSize]
    self.minScore = self.episodes[-1][1]
    print('Min score: %.6f' % self.minScore)

  def __len__(self):
    return len(self.episodes)

  def take_batch(self, batch_size):
    batch = []
    weights = [x[1] for x in self.episodes]
    while len(batch) < batch_size:
      episode, _ = random.choices(
        self.episodes,
        weights=weights,
        k=1
      )[0]

      minibatchIndexes = set(random.choices(
        np.arange(len(episode)),
        weights=[abs(x[2]) for x in episode],
        k=min((5, batch_size - len(batch), len(episode)))
      ))

      for ind in minibatchIndexes:
        state, act, score, nextState = episode[ind]
        nextStateWeight = 1 if ind < len(episode) - 1 else 0
        batch.append((state, act, score, nextState, nextStateWeight))

    return (
      np.array([x[0] for x in batch]),
      np.array([x[1] for x in batch]),
      np.array([x[2] for x in batch]),
      np.array([x[3] for x in batch]),
      np.array([x[4] for x in batch]),
    )
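
Not part of the commit: a minimal sketch of exercising this replay buffer with dummy transitions. The (state, action, reward, nextState) tuple layout matches what emulate() in train.py records; the shapes and reward values here are placeholders. Note that take_batch samples episodes weighted by their total score, so the dummy scores are kept positive.

import numpy as np
from CMazeExperience import CMazeExperience

memory = CMazeExperience(maxSize=10)
for _ in range(12):
  # fake 5-step episode of (state, action, reward, nextState) tuples
  replay = [
    (np.zeros((14, 14, 1)), i % 4, [1, -.1, 1, 1, -.1][i], np.zeros((14, 14, 1)))
    for i in range(5)
  ]
  memory.addEpisode(replay)

states, actions, rewards, nextStates, nextStateWeights = memory.take_batch(8)
print(states.shape, actions.shape, nextStateWeights)  # (8, 14, 14, 1) (8,) [1/0 per step]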
70 changes: 57 additions & 13 deletions Core/CMazeEnviroment.py
@@ -7,10 +7,14 @@ class MazeActions(Enum):
  RIGHT = (1, 0)
  UP = (0, -1)
  DOWN = (0, 1)

MAZE_ACTIONS_AS_INT = { x: i for i, x in enumerate(MazeActions) }
MAZE_ACTIONS = [x for x in MazeActions]

class CMazeEnviroment:
-  def __init__(self, maze, pos, FOV):
+  def __init__(self, maze, pos, FOV, minimapSize):
    self.maze = np.pad(np.array(maze), FOV, constant_values=(1,))
    self.minimapSize = minimapSize
    self._fov = FOV

    x, y = np.array(pos) + FOV
@@ -20,7 +24,8 @@ def __init__(self, maze, pos, FOV):
  def spawnAt(self, x, y):
    self.pos = np.array([y, x])
    self.fog = np.zeros_like(self.maze)
-    self._updateFog()
+    self.moves = np.zeros_like(self.maze)
+    self._update()
    return

  def respawn(self):
@@ -33,17 +38,17 @@ def respawn(self):
      break
    return

-  def _updateFog(self):
+  def _update(self):
    y, x = self.pos
-    self.fog[
-      x - self._fov:x + self._fov + 1,
-      y - self._fov:y + self._fov + 1
-    ] = 1
+    d = self._fov
+    self.fog[x - d:x + d + 1, y - d:y + d + 1] = 1
+    self.moves[x, y] = 1
    return

  def apply(self, action):
    self.pos += action.value
-    self._updateFog()
+    self.lastAction = MAZE_ACTIONS_AS_INT[action]
+    self._update()
    return

  def vision(self):
@@ -52,15 +57,38 @@ def vision(self):
      x - self._fov:x + self._fov + 1,
      y - self._fov:y + self._fov + 1
    ]

  def _takeShot(self):
    maze, fog, moves = self.maze, self.fog, self.moves
    y, x = self.pos
    h, w = self.maze.shape

    isXAxisOk = (self.minimapSize < x) and (x < (w - self.minimapSize))
    isYAxisOk = (self.minimapSize < y) and (y < (h - self.minimapSize))
    if not (isXAxisOk and isYAxisOk):
      x += self.minimapSize
      y += self.minimapSize
      maze = np.pad(maze, self.minimapSize, constant_values=(1,))
      fog, moves = (
        np.pad(data, self.minimapSize, constant_values=(0,)) for data in (fog, moves)
      )

    d = self.minimapSize
    return (data[x - d:x + d + 1, y - d:y + d + 1] for data in (maze, fog, moves))

  def minimap(self):
    # maze, fog, moves = self._takeShot()
    maze, fog, moves = self.maze, self.fog, self.moves
    return (maze * fog, moves)

  @property
  def state(self):
-    return ((self.vision(), self.fog, ), self.score, self.done)
+    return ((self.minimap(), ), self.score, self.done)

  @property
  def done(self):
-    y, x = self._pos
-    return 1 < self.maze[x, y]
+    y, x = self.pos
+    return 0 < self.maze[x, y]

  @property
  def score(self):
@@ -70,15 +98,31 @@ def score(self):

  def copy(self):
    # dirty copy
-    res = CMazeEnviroment(self.maze, self.pos, self._fov)
+    res = CMazeEnviroment(self.maze, self.pos, self._fov, self.minimapSize)
    res.maze = self.maze.copy()
    res.fog = self.fog.copy()
    res.pos = self.pos.copy()
    res.moves = self.moves.copy()
    return res

  def isPossible(self, action):
    y, x = self.pos + action.value
    return self.maze[x, y] <= 0

  def validActions(self):
    return [ act for act in MazeActions if self.isPossible(act) ]

  def validActionsIndex(self):
    return [ i for i, act in enumerate(MazeActions) if self.isPossible(act) ]

  def invalidActions(self):
    return [ i for i, act in enumerate(MazeActions) if not self.isPossible(act) ]

  def state2input(self):
    maze, moves = self.minimap()
    state = np.dstack((maze, ))
    return state

  @property
  def input_size(self):
    return self.state2input().shape
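
Not part of the commit: a short sketch of driving CMazeEnviroment directly. The all-free 8x8 grid and start position are placeholders; cells > 0 are walls, and the FOV padding supplies the outer border, matching how train.py generates its maze. It assumes respawn() places the agent on a free cell, as train.py relies on.

import random
import numpy as np
from Core.CMazeEnviroment import CMazeEnviroment

env = CMazeEnviroment(
  maze=np.zeros((8, 8), np.float32),  # all-free cells; walls would be 1
  pos=(1, 1),
  FOV=3,
  minimapSize=8
)
env.respawn()
for _ in range(10):
  valid = env.validActions()       # walls (including the padded border) are filtered out
  if not valid: break
  env.apply(random.choice(valid))
print('input shape:', env.state2input().shape)  # (14, 14, 1) for this maze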
41 changes: 41 additions & 0 deletions model.py
@@ -0,0 +1,41 @@
import tensorflow.keras as keras
import tensorflow.keras.layers as layers

def convBlock(prev, sz, filters):
  conv_1 = layers.Convolution2D(filters, (sz, sz), padding="same", activation="relu")(prev)
  conv_1 = layers.Dropout(0.1)(conv_1)
  conv_1 = layers.BatchNormalization()(conv_1)
  return conv_1

def createModel(shape):
  inputs = res = layers.Input(shape=shape)
  res = convBlock(res, 3, filters=32)
  res = convBlock(res, 3, filters=32)
  res = convBlock(res, 3, filters=32)

  res = layers.Flatten()(res)

  res = layers.Dense(16 ** 2, activation='relu')(res)
  res = layers.Dropout(.2)(res)
  res = layers.Dense(16 ** 2, activation='relu')(res)
  res = layers.Dropout(.2)(res)
  res = layers.Dense(16 ** 2, activation='relu')(res)
  res = layers.Dropout(.2)(res)
  res = layers.Dense(8 ** 2, activation='relu')(res)
  res = layers.Dropout(.2)(res)
  res = layers.Dense(8 ** 2, activation='relu')(res)
  res = layers.Dropout(.2)(res)
  res = layers.Dense(8 ** 2, activation='relu')(res)
  res = layers.Dropout(.2)(res)
  res = layers.Dense(4 ** 2, activation='relu')(res)
  res = layers.Dropout(.2)(res)
  res = layers.Dense(4 ** 2, activation='relu')(res)
  res = layers.Dropout(.2)(res)
  res = layers.Dense(4 ** 2, activation='relu')(res)
  res = layers.Dropout(.2)(res)

  res = layers.Dense(4, activation='linear')(res)
  return keras.Model(
    inputs=inputs,
    outputs=res
  )
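
Not part of the commit: instantiating the network as train.py does. The input shape is what env.input_size yields for the 32x32 maze with FOV=3 (38x38x1), and the four linear outputs are Q-values, one per MazeActions entry.

from model import createModel

model = createModel(shape=(38, 38, 1))
model.summary()  # 3 conv blocks -> Flatten -> stacked Dense/Dropout head -> Dense(4, linear)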
146 changes: 146 additions & 0 deletions train.py
@@ -0,0 +1,146 @@
# -*- coding: utf-8 -*-
import sys
import os
import tensorflow as tf
from CMazeExperience import CMazeExperience

if 'COLAB_GPU' in os.environ:
  # fix module resolution on Colab
  from os.path import dirname
  sys.path.append(dirname(dirname(dirname(__file__))))
else: # local GPU
  gpus = tf.config.experimental.list_physical_devices('GPU')
  tf.config.experimental.set_virtual_device_configuration(
    gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1 * 1024)]
  )

import random
import numpy as np

from tensorflow.keras.optimizers import Adam # tf.keras optimizer, matching the tf.keras model in model.py

from Core.CMazeEnviroment import CMazeEnviroment, MAZE_ACTIONS
from model import createModel

def emulate(env, model, exploreRate, exploreDecay, steps, stopOnInvalid=False):
  episodeReplay = []
  done = False
  N = 0
  while (N < steps) and not done:
    N += 1
    act = None
    valid = env.validActionsIndex()
    if not valid: break

    state = env.state2input()
    if random.random() < exploreRate:
      act = random.choice(valid) # explore: random valid action
    else:
      probe = model.predict(np.array([state]))[0]
      if not stopOnInvalid:
        for i in env.invalidActions(): # mask invalid actions so argmax picks a legal move
          probe[i] = -1
      act = np.argmax(probe)

    if stopOnInvalid and not (act in valid):
      episodeReplay.append([state, act, -1, env.state2input()])
      break

    prevScore = env.score
    env.apply(MAZE_ACTIONS[act])
    normedScore = 1 if 0 < (env.score - prevScore) else -.1
    episodeReplay.append([state, act, normedScore, env.state2input()])

    done = env.done
    exploreRate = max((.01, exploreRate * exploreDecay))
  return episodeReplay

if __name__ == "__main__":
  sz = 32
  env = CMazeEnviroment(
    maze=(0.8 < np.random.rand(sz, sz)).astype(np.float32),
    pos=(0, 0),
    FOV=3,
    minimapSize=8
  )
  memory = CMazeExperience(maxSize=100)
  done = False
  batch_size = 64
  playSteps = 64

  bestModelScore = 0
  model = createModel(shape=env.input_size)
  model.compile(
    optimizer=Adam(lr=1e-3),
    loss='mean_squared_error'
  )
  # model.load_weights('model.h5')

  targetModel = createModel(shape=env.input_size)
  np.set_printoptions(precision=3)
  # collect data
  while len(memory) < 50:
    env.respawn()
    episodeReplay = emulate(
      env, model,
      exploreRate=0.9,
      exploreDecay=0.9,
      steps=playSteps,
      stopOnInvalid=False
    )
    #################
    if 1 < len(episodeReplay):
      memory.addEpisode(episodeReplay)
      print(len(memory), env.score)
  memory.update()

  train_episodes = 500
  test_episodes = 10
  exploreRate = 1
  exploreDecayPerEpoch = .9
  exploreDecay = .9
  for epoch in range(5000):
    print('Epoch %d' % epoch)
    # train
    targetModel.set_weights(model.get_weights()) # freeze a copy of the online network for this epoch
    lossSum = 0
    for n in range(train_episodes):
      states, actions, rewards, nextStates, nextReward = memory.take_batch(batch_size)
      targets = targetModel.predict(nextStates)
      # one-step Q-learning target; nextReward zeroes the bootstrap term on terminal steps
      targets[np.arange(len(targets)), actions] = rewards + np.max(targets, axis=1) * .9 * nextReward

      lossSum += model.fit(
        states, targets,
        epochs=1,
        verbose=0
      ).history['loss'][0]
    print('Avg. train loss: %.4f' % (lossSum / train_episodes))
    print(targets[0])

    # test
    print('Epoch %d testing' % epoch)
    bestScore = scoreSum = movesSum = 0
    n = 0
    while n < test_episodes:
      env.respawn()
      episodeReplay = emulate(
        env, model,
        exploreRate=exploreRate,
        exploreDecay=exploreDecay,
        steps=playSteps*2,
        stopOnInvalid=True
      )
      if 1 < len(episodeReplay):
        memory.addEpisode(episodeReplay)
        n += 1
        bestScore = max((bestScore, env.score))
        scoreSum += env.score
        movesSum += len(episodeReplay)
      #################
    print('Best score: %.3f, avg. score: %.3f, avg. moves: %.1f' % (bestScore, scoreSum / n, movesSum / n))
    if bestModelScore < scoreSum:
      bestModelScore = scoreSum
      print('save best model')
      model.save_weights('model.h5')
    model.save_weights('latest.h5')
    exploreRate *= exploreDecayPerEpoch
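
Not part of the commit: the single line building `targets` above is the one-step Q-learning update Q(s,a) <- r + 0.9 * max_a' Q_target(s', a'), where nextReward is the nextStateWeight produced by take_batch and masks the bootstrap term on an episode's last step. A tiny numeric sketch with made-up values:

import numpy as np

targets = np.array([[.1, .5, .2, .0],   # Q(s', .) rows from the frozen targetModel
                    [.3, .1, .4, .2]])
actions = np.array([1, 2])
rewards = np.array([1., -.1])
nextReward = np.array([1, 0])           # 0 masks the bootstrap on terminal steps

targets[np.arange(len(targets)), actions] = rewards + np.max(targets, axis=1) * .9 * nextReward
print(targets)  # row 0: Q[1] = 1 + .9 * .5 = 1.45; row 1: Q[2] = -.1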