tutorial5: relevant updates
Include tests with different frameworks, libraries and
algorithms.

Tutorial 5 is still a work in progress.
vmayoral committed Jun 18, 2016
1 parent e92291a commit 8c07b97
Showing 29 changed files with 3,415 additions and 188 deletions.
23 changes: 18 additions & 5 deletions tutorial5/README.md
@@ -5,25 +5,38 @@ Up until this tutorial, the Q-learning algorithm has been storing state-action pairs

That's where neural networks come in. Or any other type of function approximator, even a simple linear model. We can use a neural network, instead of a lookup table, as our Q(s, a) function. Just like before, it will accept a state and an action and spit out the value of that state-action pair.
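One common arrangement (the one DQN uses) feeds the network the state only and lets it output one Q-value per action, so a single forward pass yields Q(s, a) for every a. A minimal sketch with Keras for CartPole (layer sizes and optimizer are illustrative choices, not the tutorial's exact settings):

```python
import numpy as np
from keras.models import Sequential
from keras.layers import Dense

# Q-network: maps a CartPole state (4 floats) to one Q-value per action (2 actions).
model = Sequential()
model.add(Dense(16, input_dim=4, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(2, activation='linear'))   # Q(s, a) for a in {left, right}
model.compile(loss='mse', optimizer='rmsprop')

# Querying the approximator instead of doing a table lookup:
state = np.array([[0.0, 0.1, -0.02, 0.3]])  # batch containing a single state
q_values = model.predict(state)             # shape (1, 2)
greedy_action = int(np.argmax(q_values[0]))
```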

### A bit of theoretical background

Discuss DeepMind's papers and the original DQN algorithm.

From the second paper (Mnih et al., 2015, *Nature*):

> Reinforcement learning is known to be unstable or even to diverge when a nonlinear function approximator such as a neural network is used to represent the action-value (also known as Q) function.

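The DQN papers counter this instability mainly with experience replay (training on randomly sampled past transitions rather than on consecutive ones) and a periodically refreshed target network. A rough sketch of one training step, assuming `model` and `target_model` are Keras networks like the one above and `memory` is a list of `(state, action, reward, next_state, done)` tuples; `GAMMA` and `BATCH_SIZE` are placeholder hyperparameters:

```python
import random
import numpy as np

GAMMA = 0.99       # discount factor (placeholder value)
BATCH_SIZE = 32    # assumes len(memory) >= BATCH_SIZE

def train_step(model, target_model, memory):
    """One DQN update on a random minibatch of (s, a, r, s', done) transitions."""
    batch = random.sample(memory, BATCH_SIZE)
    states      = np.array([t[0] for t in batch])
    next_states = np.array([t[3] for t in batch])

    q_current = model.predict(states)              # Q(s, .) from the online network
    q_next    = target_model.predict(next_states)  # Q(s', .) from the frozen target network

    for i, (s, a, r, s_next, done) in enumerate(batch):
        target = r if done else r + GAMMA * np.max(q_next[i])
        q_current[i][a] = target                   # only the taken action's value is changed

    model.train_on_batch(states, q_current)

# Every C steps the target network is refreshed from the online one:
# target_model.set_weights(model.get_weights())
```
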
### Comparing different techniques in `CartPole`


| Algorithm | `epochs:` 100 | `epochs:` 500 | `epochs:` 1000 |
|-----------|----------------|-----------------|-----------------|
| Q-learning | 104.87 (86.37) | 181.22 (145.78) | 191.35 (141.31) |
| DQN | | | |
| DQN (default params) | 24 (17) | 200 (199) | |

*Each cell shows the best 100 scores obtained for the given number of epochs, with the average score over all epochs in parentheses.*
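
Numbers like these could be gathered with a loop along the following lines: run the agent for the given number of epochs (episodes) in `CartPole-v0`, store each episode's score, and report the mean of the best 100 scores together with the mean over all episodes. This is one reading of the caption above; `choose_action` is a hypothetical stand-in for whichever policy is being evaluated, and the snippet assumes the gym API of this tutorial's era:

```python
import gym
import numpy as np

def evaluate(choose_action, epochs=500):
    """Run `epochs` CartPole episodes; return (mean of best 100 scores, overall mean)."""
    env = gym.make('CartPole-v0')
    scores = []
    for _ in range(epochs):
        state = env.reset()
        done, score = False, 0.0
        while not done:
            state, reward, done, _ = env.step(choose_action(state))
            score += reward
        scores.append(score)
    best_100 = sorted(scores)[-100:]
    return np.mean(best_100), np.mean(scores)

# Example with a random policy as the stand-in agent:
if __name__ == "__main__":
    best, avg = evaluate(lambda s: np.random.randint(2), epochs=100)
    print("best-100 mean: %.2f, overall mean: %.2f" % (best, avg))
```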


- [x] Intro to Theano
- [x] Intro to MNIST in Theano
- [x] Intro to MNIST in Keras
- [x] DQN with Keras in OpenAI gym
- [x] deer implementation for CartPole (theano and keras)


- [ ] review q-learning4k
- [ ] catch example https://edersantana.github.io/articles/keras_rl/

### References:
- http://deeplearning.net/tutorial/mlp.html#mlp
- http://outlace.com/Reinforcement-Learning-Part-3/
- http://keras.io/
- http://ufldl.stanford.edu/tutorial/supervised/MultiLayerNeuralNetworks/
- https://github.com/sherjilozair/dqn
- https://github.com/VinF/deer

131 changes: 0 additions & 131 deletions tutorial5/aux/dqn.py

This file was deleted.

33 changes: 0 additions & 33 deletions tutorial5/aux/gym_dqn.py

This file was deleted.

88 changes: 88 additions & 0 deletions tutorial5/deer_examples/mountain_car_env.py
@@ -0,0 +1,88 @@
import numpy as np
import copy
import math
from deer.base_classes import Environment
import gym

class MyEnv(Environment):
    def __init__(self, rng):
        """ Initialize environment.

        Arguments:
            rng - the numpy random number generator
        """
        # print(gym.envs.registry.all())
        self.env = gym.make('MountainCar-v0')
        self.rng = rng
        self._last_observation = self.env.reset()
        print(self._last_observation)
        self.is_terminal = False
        self._input_dim = [(1,), (1,)]  # self.env.observation_space.shape is equal to (2,)
                                        # and we use only the current value in the belief state

    def act(self, action):
        """ Simulate one time step in the environment.
        """
        reward = 0
        for _ in range(5):  # repeat the action over 5 frames (frame skip)
            self._last_observation, r, self.is_terminal, info = self.env.step(action)
            reward += r
            if self.is_terminal:
                break

        if self.mode == 0:  # Show the policy only at test time
            self.env.render()

        s = copy.deepcopy(self._last_observation)

        ## Possibility to add a reward shaping for faster convergence
        # s[0] += math.pi/6
        # if s[0] > 0:
        #     reward += pow(s[0], 2)  # np.linalg.norm(s[0])

        return reward

    def reset(self, mode=0):
        """ Reset environment for a new episode.

        Arguments:
            mode : int
                -1 corresponds to training and 0 to test
        """
        self.mode = mode

        self._last_observation = self.env.reset()
        if self.mode == -1:  # Reset to a random value when in training mode (that allows to increase exploration)
            high = self.env.observation_space.high
            low = self.env.observation_space.low
            self._last_observation = low + self.rng.rand(2) * (high - low)
            self.env.state = self._last_observation

        self.is_terminal = False

        return self._last_observation

    def inTerminalState(self):
        """ Tell whether the environment reached a terminal state after the last transition (i.e. the last
        transition that occurred was terminal).
        """
        return self.is_terminal

    def inputDimensions(self):
        return self._input_dim

    def nActions(self):
        return 3  # gym exposes this as self.env.action_space.n

    def observe(self):
        return copy.deepcopy(self._last_observation)

def main():
    # This function can be used for debug purposes
    rng = np.random.RandomState(123456)
    myenv = MyEnv(rng)

    print(myenv.observe())

if __name__ == "__main__":
    main()
67 changes: 67 additions & 0 deletions tutorial5/deer_examples/pendulum_env.py
@@ -0,0 +1,67 @@
import numpy as np
import copy

from deer.base_classes import Environment
import gym

class MyEnv(Environment):
    def __init__(self, rng):
        """ Initialize environment.

        Arguments:
            rng - the numpy random number generator
        """
        # Defining the type of environment
        self.env = gym.make('CartPole-v0')
        self._last_observation = self.env.reset()
        self.is_terminal = False
        self._input_dim = [(1,), (1,), (1,), (1,)]  # self.env.observation_space.shape is equal to (4,)
                                                    # and we use only the current value in the belief state

    def act(self, action):
        """ Simulate one time step in the environment.
        """
        self._last_observation, reward, self.is_terminal, info = self.env.step(action)
        if self.mode == 0:  # Show the policy only at test time
            self.env.render()

        return reward

    def reset(self, mode=0):
        """ Reset environment for a new episode.

        Arguments:
            mode : int
                -1 corresponds to training and 0 to test
        """
        # Reset initial observation to a random x and theta
        self._last_observation = self.env.reset()
        self.is_terminal = False
        self.mode = mode

        return self._last_observation

    def inTerminalState(self):
        """ Tell whether the environment reached a terminal state after the last transition (i.e. the last
        transition that occurred was terminal).
        """
        return self.is_terminal

    def inputDimensions(self):
        return self._input_dim

    def nActions(self):
        return 2  # gym exposes this as self.env.action_space.n

    def observe(self):
        return copy.deepcopy(self._last_observation)

def main():
    # This function can be used for debug purposes
    rng = np.random.RandomState(123456)
    myenv = MyEnv(rng)

    print(myenv.observe())

if __name__ == "__main__":
    main()
Binary file added tutorial5/deer_examples/pendulum_env.pyc
Binary file not shown.