"""
A simple example of reinforcement learning using the table-lookup Q-learning method.
An agent "o" starts at the left end of a one-dimensional world; the treasure is at the rightmost location.
Run this program to see how the agent improves its strategy for finding the treasure.
"""
import numpy as np
import pandas as pd
import time

np.random.seed(2)  # reproducible


N_STATES = 6    # the length of the one-dimensional world
ACTIONS = ['left', 'right']     # available actions
EPSILON = 0.9   # greedy policy (probability of exploiting the best known action)
ALPHA = 0.1     # learning rate
LAMBDA = 0.9    # discount factor
MAX_EPISODES = 13   # maximum number of episodes
FRESH_TIME = 0.3    # refresh interval for one move
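
# Note: rl() below applies the standard tabular Q-learning update
#     Q(S, A) <- Q(S, A) + ALPHA * (R + LAMBDA * max_a Q(S_, a) - Q(S, A))
# where ALPHA is the learning rate, LAMBDA is the discount factor, and EPSILON
# sets how often the currently best-valued action is exploited.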


def build_q_table(n_states, actions):
    table = pd.DataFrame(
        np.zeros((n_states, len(actions))),     # q_table initial values
        columns=actions,    # actions' names
    )
    # print(table)    # show table
    return table
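
# For N_STATES = 6 and ACTIONS = ['left', 'right'], build_q_table returns a
# 6 x 2 DataFrame of zeros, one row per state and one column per action:
#        left  right
#    0    0.0    0.0
#   ..     ..     ..
#    5    0.0    0.0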


def choose_action(state, q_table):
    # This is how to choose an action.
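    # Epsilon-greedy selection: with probability EPSILON take the action with the
    # highest Q-value for this state; otherwise (or while all of this state's
    # Q-values are still zero) pick an action uniformly at random to explore.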
    state_actions = q_table.iloc[state, :]
    if (np.random.rand() > EPSILON) or ((state_actions == 0).all()):  # act non-greedily, or the state's actions have no value yet
        action_name = np.random.choice(ACTIONS)
    else:   # act greedily
        action_name = state_actions.idxmax()  # idxmax() returns the column label (action name)
    return action_name


def get_env_feedback(S, A):
    # This is how the agent interacts with the environment.
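    # Reward structure: stepping right from state N_STATES - 2 reaches the treasure
    # ('terminal') and yields R = 1; every other transition yields R = 0, and moving
    # left from state 0 just keeps the agent at the wall.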
    if A == 'right':    # move right
        if S == N_STATES - 2:   # terminate
            S_ = 'terminal'
            R = 1
        else:
            S_ = S + 1
            R = 0
    else:   # move left
        R = 0
        if S == 0:
            S_ = S  # reach the wall
        else:
            S_ = S - 1
    return S_, R


def update_env(S, episode, step_counter):
    # This is how the environment gets updated (rendered in the console).
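    # With N_STATES = 6 the world is drawn as a 6-character string; for example,
    # the agent at S = 2 is rendered as '--o--T'.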
    env_list = ['-'] * (N_STATES - 1) + ['T']   # '-----T' is our environment
    if S == 'terminal':
        interaction = 'Episode %s: total_steps = %s' % (episode + 1, step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(2)
    else:
        env_list[S] = 'o'
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)


def rl():
    # main part of the RL loop
    q_table = build_q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0
        is_terminated = False
        update_env(S, episode, step_counter)
        while not is_terminated:

            A = choose_action(S, q_table)
            S_, R = get_env_feedback(S, A)  # take action & get next state and reward
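            # q_predict is the current estimate Q(S, A); q_target is the bootstrapped
            # target R + LAMBDA * max_a Q(S_, a). Their difference is the temporal-
            # difference error, scaled by ALPHA in the update below.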
            q_predict = q_table.loc[S, A]  # .loc replaces the removed DataFrame.ix indexer
            if S_ != 'terminal':
                q_target = R + LAMBDA * q_table.iloc[S_, :].max()   # next state is not terminal
            else:
                q_target = R     # next state is terminal
                is_terminated = True    # terminate this episode

            q_table.loc[S, A] += ALPHA * (q_target - q_predict)  # update
            S = S_  # move to next state

            update_env(S, episode, step_counter + 1)
            step_counter += 1


if __name__ == "__main__":
    rl()
    print('\r\nfinish')