Commit

ensemble distillation
GreenWizard2015 committed Jan 5, 2021
1 parent ac8c9f3 commit 9977f7f
Showing 10 changed files with 323 additions and 11 deletions.
10 changes: 5 additions & 5 deletions Agent/DQNEnsembleAgent.py
@@ -15,6 +15,7 @@ def combineModels(models, combiner):
  ) for x in models ]

  res = layers.Lambda(combiner)( layers.Concatenate(axis=1)(predictions) )
  res = MaskedSoftmax()( res, actionsMask )
  return keras.Model(inputs=[inputs, actionsMask], outputs=res)

@tf.function
@@ -45,11 +46,10 @@ def processBatch(self, states, actionsMask):
      actions[rndIndexes] = np.random.random_sample(actions.shape)[rndIndexes]

    if not (self._noise is None):
      # softmax
      e_x = np.exp(actions - actions.max(axis=-1, keepdims=True))
      normed = e_x / e_x.sum(axis=-1, keepdims=True)
      # add noise
      actions = normed + (np.random.random_sample(actions.shape) * self._noise)

    actions[np.where(~(1 == np.array(actionsMask)))] = -math.inf
    return actions.argmax(axis=-1)

  def predict(self, states, actionsMask):
    return self._model.predict([states, actionsMask])
30 changes: 29 additions & 1 deletion README.md
@@ -41,7 +41,35 @@

Again, there is no clearly noticeable improvement, but the ensemble uncovers 20-25% of the map a little more consistently.

The next step is to distill the ensemble into a single network, and also to use a full-fledged network to combine the predictions of the sub-networks. There is a good chance this will capture deeper correlations, since the network being trained will already have a notion of how the Q-values relate to each other (the values themselves are specific to each sub-network).
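
A rough sketch of the second idea, purely for illustration: a small trainable network could replace the fixed `Lambda` combiner used in `combineModels`. The hidden-layer size, the reshaping and the final `MaskedSoftmax` here are assumptions, not code from this commit:

```python
# Hypothetical trainable combiner over the stacked sub-network Q-values (not the repo's actual code).
from tensorflow.keras import layers, Model
from Agent.MaskedSoftmax import MaskedSoftmax  # custom layer from this repository

def trainableCombiner(models, actionsN=4):
  inputs = layers.Input(shape=models[0].layers[0].input_shape[0][1:])
  actionsMask = layers.Input(shape=(actionsN,))
  # stack per-model predictions into (batch, N_models, actionsN)
  stacked = layers.Concatenate(axis=1)([
    layers.Reshape((1, actionsN))(m(inputs)) for m in models
  ])
  x = layers.Dense(32, activation='relu')(layers.Flatten()(stacked))
  combined = layers.Dense(actionsN)(x)          # learned combination instead of a fixed rule
  res = MaskedSoftmax()(combined, actionsMask)  # same masking as the ensemble model above
  return Model(inputs=[inputs, actionsMask], outputs=res)
```
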
# Ensemble distillation

The new network was trained with an additional loss term that measures how closely the Q-value distribution of the student network matches the ensemble's prediction.
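
In code, the combined objective is roughly the following (a simplified sketch; the committed implementation is `complexLoss` in `distillation.py` below, the argument names are only renamed for readability):

```python
import tensorflow.keras as keras

def studentLoss(qTargets, qStudent, teacherDist, studentDist, actionsMask, teacherPower=1.0):
  # Huber term keeps the usual bootstrapped Q-value targets (invalid actions masked out)
  valueLoss = keras.losses.Huber(delta=1)(qTargets * actionsMask, qStudent * actionsMask)
  # KL term pulls the student's masked action distribution towards the ensemble's
  distributionLoss = keras.losses.kl_divergence(teacherDist * actionsMask, studentDist * actionsMask)
  return valueLoss + teacherPower * distributionLoss
```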

![](img/20210106-distilled.jpg)

The network trained with a teacher reaches higher results almost immediately compared to the one trained without a teacher. Moreover, in full tests it performs slightly better than the ensemble itself:

![](img/20210106-high.jpg)

![](img/20210106-low.jpg)

Some observations:

- For a while (10-30 epochs) the new network can show good results even if it only "copies" the distribution without controlling the values themselves. This is quite expected, but still interesting.
- The network merely copies the distribution, so it cannot improve on the results. It may well need longer training to fully adapt its Q-values to the distribution dictated by the ensemble, after which it could continue learning on its own (see the sketch after this list). Is that worthwhile? Wouldn't it be better to train a network from scratch instead?
- The ensemble averages the agents' behaviour, extracting what they have in common. The new network copies this averaged behaviour, smoothing out the nuances of individual strategies as well. As a result, it loses the peculiarities that let the agents perform better in specific situations. How, then, can the agents' "knowledge" be combined effectively? Useful material on this topic:
  - [Distill and transfer learning for robust multitask RL (YouTube)](https://www.youtube.com/watch?v=scf7Przmh7c)
  - [Teacher-Student Framework: A Reinforcement Learning Approach](https://www.researchgate.net/publication/280255927_Teacher-Student_Framework_A_Reinforcement_Learning_Approach)
  - [Progressive Reinforcement Learning with Distillation for Multi-Skilled Motion Control](https://arxiv.org/abs/1802.04765)
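
One way to act on these observations would be to anneal the weight of the distillation term, so the student first copies the teacher and then keeps refining its own Q-values. This is only a hypothetical schedule; the run committed here uses a constant `'teacher power': lambda epoch: 1`:

```python
# Hypothetical annealing of the 'teacher power' parameter of learn_environment():
# full imitation during a warm-up phase, then a linear decay to pure DQN training.
def teacherPowerSchedule(epoch, warmUp=20, decayEpochs=30):
  if epoch < warmUp:
    return 1.0
  return max(0.0, 1.0 - (epoch - warmUp) / decayEpochs)

# usage: {**DEFAULT_LEARNING_PARAMS, 'name': 'distilled-annealed', 'teacher power': teacherPowerSchedule}
```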

# Ideas and experiments

- [ ] Rework the split of memory/episodes into the main ones and those collected after hitting a loop.
- [ ] Implement distillation of several policies, using auxiliary rewards or other methods.
- [ ] Compare an agent trained without a teacher against one trained with a teacher (500 epochs).
- [ ] Train an agent that receives no information about its own movements (only data about the environment).
- [ ] Implement a full-fledged agent with memory.
- [ ] Use A2C or other methods that differ fundamentally from DQN.

# Application areas

7 changes: 7 additions & 0 deletions Utils/__init__.py
@@ -2,6 +2,13 @@
import numpy as np
import math

def softmax(x, mask=None):
  e_x = np.exp(x - x.max(axis=-1, keepdims=True))
  if not (mask is None):
    e_x *= mask
  e_sum = e_x.sum(axis=-1, keepdims=True)
  return np.divide(e_x, e_sum, out=np.zeros_like(e_x), where=(e_sum != 0))
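# Example (illustrative): softmax(np.array([[1., 2., 3.]]), mask=np.array([[1., 1., 0.]]))
# returns approximately [[0.269, 0.731, 0.0]]; masked-out actions get zero probability.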

def emulateBatch(testEnvs, agent, maxSteps):
  replays = [[] for _ in testEnvs]
  steps = 0
275 changes: 275 additions & 0 deletions distillation.py
@@ -0,0 +1,275 @@
# -*- coding: utf-8 -*-
import sys
import os
import tensorflow as tf
from Agent.MaskedSoftmax import MaskedSoftmax

if 'COLAB_GPU' in os.environ:
  # fix resolve modules
  from os.path import dirname
  sys.path.append(dirname(dirname(dirname(__file__))))
else: # local GPU
  gpus = tf.config.experimental.list_physical_devices('GPU')
  tf.config.experimental.set_virtual_device_configuration(
    gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=3 * 1024)]
  )

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import Huber
import tensorflow.keras as keras

from model import createModel
from Core.MazeRLWrapper import MazeRLWrapper
from Utils.ExperienceBuffers.CebPrioritized import CebPrioritized
from Agent.DQNAgent import DQNAgent
from Agent.DQNEnsembleAgent import DQNEnsembleAgent
import time
import Utils
from Utils.ExperienceBuffers.CebLinear import CebLinear
import glob
import numpy as np

#######################################
def train(model, trainableModel, memory, params):
  modelClone = tf.keras.models.clone_model(model)
  modelClone.set_weights(model.get_weights()) # use clone model for stability

  BOOTSTRAPPED_STEPS = params['steps']
  GAMMA = params['gamma']
  ALPHA = params.get('alpha', 1.0)
  rows = np.arange(params['batchSize'])
  lossSum = 0
  for _ in range(params['episodes']):
    allStates, actions, rewards, actionsMask, teacherPredictions, nextStateScoreMultiplier = memory.sampleSequenceBatch(
      batch_size=params['batchSize'],
      maxSamplesFromEpisode=params.get('maxSamplesFromEpisode', 16),
      sequenceLen=BOOTSTRAPPED_STEPS + 1
    )

    states = allStates[:, :-1]
    rewards = rewards[:, :-1]
    actions = actions[:, 0]

    # n-step (bootstrapped) Q-value targets computed with the frozen clone
    futureScores = modelClone.predict(allStates[:, -1]).max(axis=-1) * nextStateScoreMultiplier[:, -1]
    totalRewards = (rewards * (GAMMA ** np.arange(BOOTSTRAPPED_STEPS))).sum(axis=-1)
    targets = modelClone.predict(states[:, 0])

    targets[rows, actions] += ALPHA * (
      totalRewards + futureScores * (GAMMA ** BOOTSTRAPPED_STEPS) - targets[rows, actions]
    )

    # one training step on the combined value + distillation loss (see complexLoss)
    lossSum += trainableModel.fit(
      [states[:, 0], teacherPredictions[:, 0], actionsMask[:, 0], targets],
      epochs=1, verbose=0
    ).history['loss'][0]
    ###

  return lossSum / params['episodes']

def complexLoss(valueLoss, teacherPower, distributions, actionsMasks, y_true, y_pred, y_pred_softmax):
  # mask out invalid actions
  lossValues = valueLoss(y_true * actionsMasks, y_pred * actionsMasks)

  # distillation term: match the teacher's action distribution
  lossDistribution = keras.losses.kl_divergence(distributions * actionsMasks, y_pred_softmax * actionsMasks)
  return lossValues + (lossDistribution * teacherPower)

def wrapStudentModel(student):
  inputA = keras.layers.Input(shape=student.layers[0].input_shape[0][1:])
  inputDistributions = keras.layers.Input(shape=(4, ))
  inputMasks = keras.layers.Input(shape=(4, ))
  inputTargets = keras.layers.Input(shape=(4, ))
  teacherPower = tf.Variable(1.0, dtype=tf.float32) # weight of the distillation term, updated per epoch

  res = student(inputA)
  resSoftmax = MaskedSoftmax()(res, inputMasks)

  model = keras.Model(inputs=[inputA, inputDistributions, inputMasks, inputTargets], outputs=[res, resSoftmax])
  model.add_loss(complexLoss(
    Huber(delta=1),
    teacherPower,
    inputDistributions, inputMasks, inputTargets,
    res, resSoftmax
  ))
  model.compile(optimizer=Adam(lr=1e-3), loss=None)
  return model, teacherPower

def learn_environment(teacher, model, params):
  NAME = params['name']
  BATCH_SIZE = params['batch size']
  GAMMA = params['gamma']
  BOOTSTRAPPED_STEPS = params['bootstrapped steps']
  LOOP_LIMIT = params['maze']['loop limit']
  metrics = {}

  environments = [
    MazeRLWrapper(params['maze']) for _ in range(params['test episodes'])
  ]

  memory = CebPrioritized(maxSize=5000, sampleWeight='abs')
  doomMemory = CebLinear(
    maxSize=params.get('max steps after loop', 16) * 10000,
    sampleWeight='abs'
  )
  trainableModel, teacherPower = wrapStudentModel(model)
  ######################################################
  def withTeacherPredictions(replay):
    prevStates, actions, rewards, actionsMasks = zip(*replay)
    teacherPredictions = teacher.predict(np.array(prevStates), np.array(actionsMasks))
    return list(zip(prevStates, actions, rewards, actionsMasks, teacherPredictions))

  def testModel(EXPLORE_RATE):
    for e in environments: e.reset()
    replays = Utils.emulateBatch(
      environments,
      DQNAgent(model, exploreRate=EXPLORE_RATE, noise=params.get('agent noise', 0)),
      maxSteps=params.get('max test steps')
    )
    for replay, _ in replays:
      if params.get('clip replay', False):
        replay = Utils.clipReplay(replay, loopLimit=LOOP_LIMIT)
      if BOOTSTRAPPED_STEPS < len(replay):
        memory.addEpisode(withTeacherPredictions(replay), terminated=True)

    scores = [x.score for x in environments]
    ################
    # collect bad experience
    envs = [e for e in environments if e.hitTheLoop]
    if envs:
      for e in envs: e.Continue()
      replays = Utils.emulateBatch(
        envs,
        DQNAgent(
          model,
          exploreRate=params.get('explore rate after loop', 1),
          noise=params.get('agent noise after loop', 0)
        ),
        maxSteps=params.get('max steps after loop', 16)
      )
      for replay, _ in replays:
        if BOOTSTRAPPED_STEPS < len(replay):
          doomMemory.addEpisode(withTeacherPredictions(replay), terminated=True)
    ################
    return scores
  ######################################################
  # collect some experience
  for _ in range(2):
    testModel(EXPLORE_RATE=0)
  #######################
  bestModelScore = -float('inf')
  for epoch in range(params['epochs']):
    T = time.time()

    EXPLORE_RATE = params['explore rate'](epoch)
    alpha = params.get('alpha', lambda _: 1)(epoch)
    teacherP = max((0, params.get('teacher power', lambda _: 1)(epoch) ))
    teacherPower.assign(teacherP)
    print(
      '[%s] %d/%d epoch. Explore rate: %.3f. Alpha: %.5f. Teacher power: %.3f' % (
        NAME, epoch, params['epochs'], EXPLORE_RATE, alpha, teacherP
      )
    )
    ##################
    # Training
    trainLoss = train(
      model, trainableModel, memory,
      {
        'gamma': GAMMA,
        'batchSize': BATCH_SIZE,
        'steps': BOOTSTRAPPED_STEPS,
        'episodes': params['train episodes'](epoch),
        'alpha': alpha
      }
    )
    print('Avg. train loss: %.4f' % trainLoss)

    if BATCH_SIZE < len(doomMemory):
      trainLoss = train(
        model, trainableModel, doomMemory,
        {
          'gamma': GAMMA,
          'batchSize': BATCH_SIZE,
          'steps': BOOTSTRAPPED_STEPS,
          'episodes': params['train doom episodes'](epoch),
          'alpha': params.get('doom alpha', lambda _: alpha)(epoch)
        }
      )
      print('Avg. train doom loss: %.4f' % trainLoss)
    ##################
    # test
    print('Testing...')
    scores = testModel(EXPLORE_RATE)
    Utils.trackScores(scores, metrics)
    ##################

    scoreSum = sum(scores)
    print('Scores sum: %.5f' % scoreSum)
    if (bestModelScore < scoreSum) and (params['warm up epochs'] < epoch):
      print('save best model (%.2f => %.2f)' % (bestModelScore, scoreSum))
      bestModelScore = scoreSum
      model.save_weights('weights/%s.h5' % NAME)
    ##################
    os.makedirs('charts', exist_ok=True)
    Utils.plotData2file(metrics, 'charts/%s.jpg' % NAME)
    print('Epoch %d finished in %.1f sec.' % (epoch, time.time() - T))
    print('------------------')

#######################################
MAZE_FOV = 3
MAZE_MINIMAP_SIZE = 8
MAZE_LOOPLIMIT = 32
#######################################

if __name__ == "__main__":
  DEFAULT_MAZE_PARAMS = {
    'size': 40,
    'FOV': MAZE_FOV,
    'minimapSize': MAZE_MINIMAP_SIZE,
    'loop limit': MAZE_LOOPLIMIT,
  }

  MODEL_INPUT_SHAPE = MazeRLWrapper(DEFAULT_MAZE_PARAMS).input_size

  models = []
  for x in glob.iglob('weights/agent-*.h5'):
    filename = os.path.abspath(x)
    model = createModel(shape=MODEL_INPUT_SHAPE)
    model.load_weights(filename)
    models.append(model)

  teacher = DQNEnsembleAgent(models)
  #######################
  DEFAULT_LEARNING_PARAMS = {
    'maze': DEFAULT_MAZE_PARAMS,

    'batch size': 256,
    'gamma': 0.95,
    'bootstrapped steps': 3,

    'epochs': 100,
    'warm up epochs': 0,
    'test episodes': 128,
    'train episodes': lambda _: 128,
    'train doom episodes': lambda _: 32,

    'alpha': lambda _: 1,
    'explore rate': lambda _: 0,

    'agent noise': 0.01,
    'clip replay': True,

    'explore rate after loop': 0.2,
    'agent noise after loop': 0.1,

    'max test steps': 1000
  }
  #######################
  # just transfer distributions from teacher
  learn_environment(
    teacher,
    createModel(shape=MODEL_INPUT_SHAPE),
    {
      **DEFAULT_LEARNING_PARAMS,
      'name': 'distilled',
      'teacher power': lambda epoch: 1,
    }
  )
Binary file added img/20210106-distilled.jpg
Binary file added img/20210106-high.jpg
Binary file added img/20210106-low.jpg
7 changes: 4 additions & 3 deletions test.py
@@ -79,12 +79,13 @@ def testAgent(environments, agent, name, metrics, N=20):
  'Worst scores (top 90%)': {},
  'Best scores (top 10%)': {}
}
models = []
agents = []
for i, x in enumerate(glob.iglob('weights/*.h5')):
  filename = os.path.abspath(x)
  model = createModel(shape=MODEL_INPUT_SHAPE)
  model.load_weights(filename)
  models.append(model)
  if os.path.basename(filename).startswith('agent-'):
    agents.append(model)

testAgent(
  environments,
@@ -95,7 +96,7 @@ def testAgent(environments, agent, name, metrics, N=20):

testAgent(
  environments,
  DQNEnsembleAgent(models),
  DQNEnsembleAgent(agents),
  name='ensemble',
  metrics=metrics
)
5 changes: 3 additions & 2 deletions view_maze.py
@@ -104,9 +104,10 @@ def _createNewAgent(self):
      filename = os.path.abspath(x)
      model = createModel(shape=self._maze.input_size)
      model.load_weights(filename)
      models.append(model)
      agent = DQNAgent(model)
      name = os.path.basename(filename)
      if name.startswith('agent-'):
        models.append(model)
        agent = DQNAgent(model)

      self._agents.append(RLAgent(
        name[:-3], agent, None, None
Binary file added weights/distilled.h5
Binary file not shown.
