Commit

huge update
GreenWizard2015 committed Dec 31, 2020
1 parent 18e9dde commit 1cedbc4
Showing 29 changed files with 980 additions and 407 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -3,3 +3,5 @@ __pycache__
/.pydevproject
/.project
/.settings
/charts
/chart.jpg
31 changes: 31 additions & 0 deletions Agent/DQNAgent.py
@@ -0,0 +1,31 @@
import numpy as np
import math

class DQNAgent:
  def __init__(self, model, exploreRate=0, noise=None):
    self._model = model
    self._exploreRate = exploreRate
    self._noise = noise
    return

  def reset(self):
    return

  def process(self, state, actionsMask = [1, 1, 1, 1]):
    return self.processBatch([state], [actionsMask])[0]

  def processBatch(self, states, actionsMask):
    actions = self._model.predict(np.array(states))
    if 0 < self._exploreRate:
      rndIndexes = np.where(np.random.random_sample(actions.shape[0]) < self._exploreRate)
      actions[rndIndexes] = np.random.random_sample(actions.shape)[rndIndexes]

    if not (self._noise is None):
      # softmax
      e_x = np.exp(actions - actions.max(axis=-1, keepdims=True))
      normed = e_x / e_x.sum(axis=-1, keepdims=True)
      # add noise
      actions = normed + (np.random.random_sample(actions.shape) * self._noise)

    actions[np.where(~(1 == np.array(actionsMask)))] = -math.inf
    return actions.argmax(axis=-1)
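
A minimal usage sketch for DQNAgent (not part of the commit): the dummy model below only stands in for a trained Keras network whose predict() returns one Q-value per action for each state in the batch, and the observation shape is made up.

  import numpy as np
  from Agent.DQNAgent import DQNAgent

  class _DummyModel:
    # stand-in for a trained network: batch of states -> batch of 4 Q-values
    def predict(self, states):
      return np.random.random_sample((len(states), 4))

  agent = DQNAgent(_DummyModel(), exploreRate=0.1)
  state = np.zeros((11, 11, 3))                             # hypothetical observation shape
  action = agent.process(state, actionsMask=[1, 0, 1, 1])   # index of the best allowed action
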
67 changes: 67 additions & 0 deletions Agent/DQNEnsembleAgent.py
@@ -0,0 +1,67 @@
import numpy as np
import math
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
import tensorflow as tf

def combineModels(models, combiner):
  shape = models[0].layers[0].input_shape[0][1:]
  inputs = layers.Input(shape=shape)
  actionsMask = layers.Input(shape=(4, ))
  res = layers.Lambda(combiner)([actionsMask] + [ x(inputs) for x in models ])
  return keras.Model(inputs=[inputs, actionsMask], outputs=res)

def maskedSoftmax(mask, inputs):
  mask = tf.where(tf.equal(mask, 1))
  return [
    tf.sparse.to_dense(
      tf.sparse.softmax(
        tf.sparse.SparseTensor(
          indices=mask,
          values=tf.gather_nd(x, mask),
          dense_shape=tf.shape(x, out_type=tf.int64)
        )
      )
    ) for x in inputs
  ]

def multiplyOutputs(inputs):
  outputs = maskedSoftmax(inputs[0], inputs[1:])

  res = 1 + outputs[0]
  for x in outputs[1:]:
    res = tf.math.multiply(res, 1 + x)
  return res

ENSEMBLE_MODE = {
  'multiply': multiplyOutputs
}

class DQNEnsembleAgent:
  def __init__(self, models, mode='multiply', exploreRate=0, noise=None):
    self._model = combineModels(models, ENSEMBLE_MODE.get(mode, mode))
    self._exploreRate = exploreRate
    self._noise = noise
    return

  def reset(self):
    return

  def process(self, state, actionsMask = [1, 1, 1, 1]):
    return self.processBatch([state], [actionsMask])[0]

  def processBatch(self, states, actionsMask):
    actions = self._model.predict([np.array(states), np.array(actionsMask)])
    if 0 < self._exploreRate:
      rndIndexes = np.where(np.random.random_sample(actions.shape[0]) < self._exploreRate)
      actions[rndIndexes] = np.random.random_sample(actions.shape)[rndIndexes]

    if not (self._noise is None):
      # softmax
      e_x = np.exp(actions - actions.max(axis=-1, keepdims=True))
      normed = e_x / e_x.sum(axis=-1, keepdims=True)
      # add noise
      actions = normed + (np.random.random_sample(actions.shape) * self._noise)

    actions[np.where(~(1 == np.array(actionsMask)))] = -math.inf
    return actions.argmax(axis=-1)
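
Roughly what the 'multiply' ensemble mode computes, sketched in plain NumPy with made-up per-model action distributions (in the real agent the masked softmax runs inside the Keras graph via maskedSoftmax):

  import numpy as np

  # masked-softmax outputs of two ensemble members (illustrative numbers);
  # the last action is masked, so both members give it zero mass
  model_a = np.array([0.50, 0.10, 0.40, 0.00])
  model_b = np.array([0.45, 0.30, 0.25, 0.00])

  # multiplyOutputs combines members as a product of (1 + p),
  # so actions that several members agree on get the largest value
  combined = (1 + model_a) * (1 + model_b)
  print(combined.argmax())  # -> 0
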
17 changes: 17 additions & 0 deletions Agent/RandomAgent.py
@@ -0,0 +1,17 @@
import numpy as np
import math

class RandomAgent:
  def __init__(self):
    return

  def reset(self):
    pass

  def process(self, state, actionsMask = [1, 1, 1, 1]):
    return self.processBatch([state], [actionsMask])

  def processBatch(self, states, actionsMask):
    actions = np.random.random_sample((np.array(states).shape[0], 4))
    actions[np.where(~(1 == np.array(actionsMask)))] = -math.inf
    return actions.argmax(axis=-1)
File renamed without changes.
59 changes: 0 additions & 59 deletions CMazeExperience.py

This file was deleted.

38 changes: 25 additions & 13 deletions Core/CMazeEnviroment.py → Core/CMazeEnvironment.py
@@ -11,17 +11,18 @@ class MazeActions(Enum):
MAZE_ACTIONS_AS_INT = { x: i for i, x in enumerate(MazeActions) }
MAZE_ACTIONS = [x for x in MazeActions]

class CMazeEnviroment:
class CMazeEnvironment:
  def __init__(self, maze, pos, FOV, minimapSize):
    self.maze = np.pad(np.array(maze), FOV, constant_values=(1,))
    self.minimapSize = minimapSize
    self._fov = FOV
    self._fov = self.FOV = FOV

    x, y = np.array(pos) + FOV
    self.spawnAt(x, y)
    return

  def spawnAt(self, x, y):
    self._steps = 0
    self.pos = np.array([y, x])
    self.fog = np.zeros_like(self.maze)
    self.moves = np.zeros_like(self.maze)
@@ -42,14 +43,19 @@ def _update(self):
    y, x = self.pos
    d = self._fov
    self.fog[x - d:x + d + 1, y - d:y + d + 1] = 1
    self.moves *= .98
    self.moves[x, y] = 1
    return

  def apply(self, action):
    self._steps += 1
    self.pos += action.value
    self.lastAction = MAZE_ACTIONS_AS_INT[action]
    self._update()
    return

  def isMovingToVisited(self, action):
    y, x = self.pos + action.value
    return 1 == self.moves[x, y]

  def vision(self):
    y, x = self.pos
@@ -81,23 +87,21 @@ def minimap(self):
    return ((maze * fog) - (1 - fog), moves)

  @property
  def state(self):
    return ((self.minimap(), ), self.score, self.done)

  @property
  def done(self):
  def dead(self):
    y, x = self.pos
    return 0 < self.maze[x, y]

  @property
  def score(self):
    h, w = self.fog.shape
    total = h * w
    return np.count_nonzero(self.fog) / total
    return np.count_nonzero(self.fog) * self.minScoreDelta

  @property
  def steps(self):
    return self._steps

  def copy(self):
    # dirty copy
    res = CMazeEnviroment(self.maze, self.pos, self._fov, self.minimapSize)
    res = CMazeEnvironment(self.maze, self.pos, self._fov, self.minimapSize)
    res.maze = self.maze.copy()
    res.fog = self.fog.copy()
    res.pos = self.pos.copy()
@@ -114,6 +118,9 @@ def validActionsIndex(self):
  def validActionsIndex(self):
    return [ i for i, act in enumerate(MazeActions) if self.isPossible(act) ]

  def actionsMask(self):
    return [ (1 if self.isPossible(act) else 0) for act in MazeActions ]

  def invalidActions(self):
    return [ i for i, act in enumerate(MazeActions) if not self.isPossible(act) ]

@@ -124,4 +131,9 @@ def state2input(self):

  @property
  def input_size(self):
    return self.state2input().shape
    return self.state2input().shape

  @property
  def minScoreDelta(self):
    h, w = self.fog.shape
    return 1.0 / (h * w)
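
A quick arithmetic check of the reworked score (numbers are assumed, not from the commit): minScoreDelta is the score value of one revealed cell, so count_nonzero(fog) * minScoreDelta equals the old revealed/total ratio, and a score difference divided by minScoreDelta recovers the number of newly revealed cells — which is what MazeRLWrapper calls discovered.

  h, w = 20, 20                                  # padded maze size (illustrative)
  minScoreDelta = 1.0 / (h * w)                  # 0.0025, score value of one cell
  score = 120 * minScoreDelta                    # 0.3, same as 120 / (h * w)
  discovered = (0.305 - score) / minScoreDelta   # 2.0 newly revealed cells
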
97 changes: 97 additions & 0 deletions Core/MazeRLWrapper.py
@@ -0,0 +1,97 @@
from Core.CMazeEnvironment import CMazeEnvironment, MAZE_ACTIONS
import numpy as np
import math

class MazeRLWrapper:
  def __init__(self, params):
    maze = (
      params.get('obstacles rate', 0.8) < np.random.rand(params['size'], params['size'])
    ).astype(np.float32)

    env = CMazeEnvironment(
      maze=maze,
      pos=(0, 0),
      FOV=params['FOV'],
      minimapSize=params['minimapSize']
    )
    env.respawn()
    self._env = env

    self._stepsLimit = params['loop limit']
    self._minUniqSteps = params.get('min unique positions rate', 0.3)
    self._stopIfLoop = params.get('stop if loop', True)
    self._onlyNewCells = params.get('only new cells reward', False)
    return

  def reset(self):
    self._stopInLoop = False
    self._done = False
    self._env.respawn()
    self._moves = []
    return

  def apply(self, actionIndex):
    act = MAZE_ACTIONS[actionIndex]
    prevState = self.state
    prevScore = self.score
    isNewCell = not self._env.isMovingToVisited(act)
    self._env.apply(act)
    nextState = self.state

    self._done = True
    if self._env.dead: # unreachable due to actions masking
      return nextState, -10, True, prevState

    if 0.95 <= self._env.score:
      return nextState, 0, True, prevState

    if self._movingLoop():
      return nextState, -5, True, prevState

    self._done = False
    reward = 0.3 if isNewCell else 0 # small reward for visiting new cell

    if not self._onlyNewCells:
      discovered = (self._env.score - prevScore) / self._env.minScoreDelta
      reward += 1 + math.log(discovered, 10) if 0 < discovered else -1
    return nextState, reward, False, prevState

  def actionsMask(self):
    return self._env.actionsMask()

  @property
  def state(self):
    return self._env.state2input()

  @property
  def done(self):
    return self._done

  @property
  def hitTheLoop(self):
    return self._stopInLoop

  @property
  def score(self):
    return self._env.score

  @property
  def input_size(self):
    return self._env.input_size

  @property
  def uniqueMoves(self):
    if self._stepsLimit <= len(self._moves):
      return len(set(self._moves)) / len(self._moves)
    return 1

  def _movingLoop(self):
    self._moves.append(str(self._env.pos))
    self._moves = self._moves[1:] if self._stepsLimit < len(self._moves) else self._moves
    self._stopInLoop = self._stopIfLoop and (self.uniqueMoves < self._minUniqSteps)
    return self._stopInLoop

  def Continue(self):
    self._done = False
    self._moves = []
    return
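
Putting the new pieces together, a rough episode loop over this commit's classes (parameter values are made up; note that RandomAgent.process returns a length-1 array rather than a scalar, so the sketch calls processBatch directly):

  from Core.MazeRLWrapper import MazeRLWrapper
  from Agent.RandomAgent import RandomAgent

  env = MazeRLWrapper({
    'size': 40, 'FOV': 3, 'minimapSize': 8,   # illustrative values
    'loop limit': 32,
  })
  agent = RandomAgent()

  env.reset()
  agent.reset()
  while not env.done:
    # mask out moves into walls, then take the agent's chosen action
    action = agent.processBatch([env.state], [env.actionsMask()])[0]
    nextState, reward, done, prevState = env.apply(action)
  print('coverage: %.1f%%, looped: %s' % (100 * env.score, env.hitTheLoop))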