diff --git a/Core/MazeRLWrapper.py b/Core/MazeRLWrapper.py
index 3d2cb4e..42f4d58 100644
--- a/Core/MazeRLWrapper.py
+++ b/Core/MazeRLWrapper.py
@@ -42,11 +42,11 @@ def apply(self, actionIndex):
     if self._env.dead: # unreachable due to actions masking
       return nextState, -10, True, prevState
 
-    if 0.95 <= self._env.score:
+    if 0.99 <= self._env.score:
       return nextState, 0, True, prevState
 
     if self._movingLoop():
-      return nextState, -5, True, prevState
+      return nextState, 0, True, prevState
 
     self._done = False
     reward = 0.3 if isNewCell else 0 # small reward for visiting new cell
diff --git a/README.md b/README.md
index 344aba5..6664342 100644
--- a/README.md
+++ b/README.md
@@ -64,7 +64,6 @@
 
 # Ideas and experiments
 
-- [ ] Replace the split of memory/episodes into a main set and a set collected after hitting a loop.
 - [ ] Implement distillation of several policies, using extra rewards or other methods.
 - [ ] Compare an agent trained without a teacher to one trained with a teacher. (500 epochs)
 - [ ] Train an agent that receives no information about its own movements (only data about its surroundings).
diff --git a/distillation.py b/distillation.py
index 7d97a82..80f4643 100644
--- a/distillation.py
+++ b/distillation.py
@@ -106,10 +106,6 @@ def learn_environment(teacher, model, params):
   ]
   memory = CebPrioritized(maxSize=5000, sampleWeight='abs')
 
-  doomMemory = CebLinear(
-    maxSize=params.get('max steps after loop', 16) * 10000,
-    sampleWeight='abs'
-  )
   trainableModel, teacherPower = wrapStudentModel(model)
   ######################################################
   def withTeacherPredictions(replay):
@@ -119,24 +115,20 @@ def withTeacherPredictions(replay):
 
   def testModel(EXPLORE_RATE):
     for e in environments: e.reset()
-    replays = Utils.emulateBatch(
-      environments,
-      DQNAgent(model, exploreRate=EXPLORE_RATE, noise=params.get('agent noise', 0)),
-      maxSteps=params.get('max test steps')
-    )
-    for replay, _ in replays:
-      if params.get('clip replay', False):
-        replay = Utils.clipReplay(replay, loopLimit=LOOP_LIMIT)
-      if BOOTSTRAPPED_STEPS < len(replay):
-        memory.addEpisode(withTeacherPredictions(replay), terminated=True)
+    replays = [replay for replay, _ in Utils.emulateBatch(
+      environments,
+      DQNAgent(model, exploreRate=EXPLORE_RATE, noise=params.get('agent noise', 0)),
+      maxSteps=params.get('max test steps')
+    )
+    ]
 
-    scores = [x.score for x in environments]
     ################
-    # collect bad experience
-    envs = [e for e in environments if e.hitTheLoop]
-    if envs:
+    # explore if hit the loop
+    envsIndexes = [i for i, e in enumerate(environments) if e.hitTheLoop]
+    if envsIndexes:
+      envs = [environments[i] for i in envsIndexes]
       for e in envs: e.Continue()
-      replays = Utils.emulateBatch(
+      exploreReplays = Utils.emulateBatch(
         envs,
         DQNAgent(
           model,
@@ -145,11 +137,13 @@ def testModel(EXPLORE_RATE):
         ),
         maxSteps=params.get('max steps after loop', 16)
       )
-      for replay, _ in replays:
-        if BOOTSTRAPPED_STEPS < len(replay):
-          doomMemory.addEpisode(withTeacherPredictions(replay), terminated=True)
+      for ind, (replay, _) in zip(envsIndexes, exploreReplays):
+        replays[ind] += replay[1:]
     ################
-    return scores
+    for replay in replays:
+      if BOOTSTRAPPED_STEPS < len(replay):
+        memory.addEpisode(withTeacherPredictions(replay), terminated=True)
+    return [x.score for x in environments]
   ######################################################
   # collect some experience
   for _ in range(2):
@@ -181,19 +175,6 @@ def testModel(EXPLORE_RATE):
       }
     )
     print('Avg. train loss: %.4f' % trainLoss)
-
-    if BATCH_SIZE < len(doomMemory):
-      trainLoss = train(
-        model, trainableModel, doomMemory,
-        {
-          'gamma': GAMMA,
-          'batchSize': BATCH_SIZE,
-          'steps': BOOTSTRAPPED_STEPS,
-          'episodes': params['train doom episodes'](epoch),
-          'alpha': params.get('doom alpha', lambda _: alpha)(epoch)
-        }
-      )
-      print('Avg. train doom loss: %.4f' % trainLoss)
     ##################
     # test
     print('Testing...')
@@ -249,13 +230,11 @@ def testModel(EXPLORE_RATE):
 
   'warm up epochs': 0,
   'test episodes': 128,
   'train episodes': lambda _: 128,
-  'train doom episodes': lambda _: 32,
 
   'alpha': lambda _: 1,
   'explore rate': lambda _: 0,
   'agent noise': 0.01,
-  'clip replay': True,
 
   'explore rate after loop': 0.2,
   'agent noise after loop': 0.1,
diff --git a/learn_environment.py b/learn_environment.py
index 03a82d5..4d07d46 100644
--- a/learn_environment.py
+++ b/learn_environment.py
@@ -5,14 +5,12 @@
 import Utils
 import fit_stage
 import os
-from Utils.ExperienceBuffers.CebLinear import CebLinear
 
 
 def learn_environment(model, params):
   NAME = params['name']
   BATCH_SIZE = params['batch size']
   GAMMA = params['gamma']
   BOOTSTRAPPED_STEPS = params['bootstrapped steps']
-  LOOP_LIMIT = params['maze']['loop limit']
   metrics = {}
@@ -20,32 +18,23 @@ def learn_environment(model, params):
   ]
   memory = CebPrioritized(maxSize=5000, sampleWeight='abs')
 
-  doomMemory = CebLinear(
-    maxSize=params.get('max steps after loop', 16) * 1000,
-    sampleWeight='abs'
-  )
-
   ######################################################
   def testModel(EXPLORE_RATE):
     for e in environments: e.reset()
-    replays = Utils.emulateBatch(
-      environments,
-      DQNAgent(model, exploreRate=EXPLORE_RATE, noise=params.get('agent noise', 0)),
-      maxSteps=params.get('max test steps')
-    )
-    for replay, _ in replays:
-      if params.get('clip replay', False):
-        replay = Utils.clipReplay(replay, loopLimit=LOOP_LIMIT)
-      if BOOTSTRAPPED_STEPS < len(replay):
-        memory.addEpisode(replay, terminated=True)
+    replays = [replay for replay, _ in Utils.emulateBatch(
+      environments,
+      DQNAgent(model, exploreRate=EXPLORE_RATE, noise=params.get('agent noise', 0)),
+      maxSteps=params.get('max test steps')
+    )
+    ]
 
-    scores = [x.score for x in environments]
     ################
-    # collect bad experience
-    envs = [e for e in environments if e.hitTheLoop]
-    if envs:
+    # explore if hit the loop
+    envsIndexes = [i for i, e in enumerate(environments) if e.hitTheLoop]
+    if envsIndexes:
+      envs = [environments[i] for i in envsIndexes]
       for e in envs: e.Continue()
-      replays = Utils.emulateBatch(
+      exploreReplays = Utils.emulateBatch(
         envs,
         DQNAgent(
           model,
@@ -54,11 +43,13 @@ def testModel(EXPLORE_RATE):
         ),
         maxSteps=params.get('max steps after loop', 16)
       )
-      for replay, _ in replays:
-        if BOOTSTRAPPED_STEPS < len(replay):
-          doomMemory.addEpisode(replay, terminated=True)
+      for ind, (replay, _) in zip(envsIndexes, exploreReplays):
+        replays[ind] += replay[1:]
     ################
-    return scores
+    for replay in replays:
+      if BOOTSTRAPPED_STEPS < len(replay):
+        memory.addEpisode(replay, terminated=True)
+    return [x.score for x in environments]
   ######################################################
   # collect some experience
   for _ in range(2):
@@ -86,19 +77,6 @@ def testModel(EXPLORE_RATE):
       }
     )
     print('Avg. train loss: %.4f' % trainLoss)
-
-    if BATCH_SIZE < len(doomMemory):
-      trainLoss = fit_stage.train(
-        model, doomMemory,
-        {
-          'gamma': GAMMA,
-          'batchSize': BATCH_SIZE,
-          'steps': BOOTSTRAPPED_STEPS,
-          'episodes': params['train doom episodes'](epoch),
-          'alpha': params.get('doom alpha', lambda _: alpha)(epoch)
-        }
-      )
-      print('Avg. train doom loss: %.4f' % trainLoss)
     ##################
     # test
     print('Testing...')
diff --git a/train.py b/train.py
index 3678541..338c881 100644
--- a/train.py
+++ b/train.py
@@ -52,13 +52,11 @@ def getModel(shape):
 
   'warm up epochs': 0,
   'test episodes': 128,
   'train episodes': lambda _: 128,
-  'train doom episodes': lambda _: 32,
 
   'alpha': lambda _: 1,
   'explore rate': lambda _: 0,
   'agent noise': 0.01,
-  'clip replay': True,
 
   'explore rate after loop': 0.2,
   'agent noise after loop': 0.1
diff --git a/view_maze.py b/view_maze.py
index 87c5b0f..661b712 100644
--- a/view_maze.py
+++ b/view_maze.py
@@ -20,7 +20,7 @@ from model import createModel
 
 
 def createMaze():
-  sz = 16 * 4
+  sz = 64
   maze = (0.8 < np.random.rand(sz, sz)).astype(np.float32)
   res = CMazeEnvironment(
     maze=maze,