
Commit 9fe156f

create RL 1
1 parent 018caa0 commit 9fe156f

File tree

2 files changed: +102 -130 lines


RL/example1/hunter_prey.py

-130
This file was deleted.

RL/example1/treasure_on_right.py

+102
@@ -0,0 +1,102 @@
"""
A simple example of Reinforcement Learning using the table-lookup Q-learning method.
An agent "o" starts on the left of a one-dimensional world; the treasure is at the rightmost location.
Run this program to see how the agent improves its strategy for finding the treasure.
"""

import numpy as np
import pandas as pd
import time

np.random.seed(2)  # reproducible


N_STATES = 6                 # the length of the one-dimensional world
ACTIONS = ['left', 'right']  # available actions
EPSILON = 0.9                # greedy policy (probability of choosing the best known action)
ALPHA = 0.1                  # learning rate
LAMBDA = 0.9                 # discount factor
MAX_EPISODES = 13            # maximum episodes
FRESH_TIME = 0.3             # refresh time for one move


def build_q_table(n_states, actions):
    table = pd.DataFrame(
        np.zeros((n_states, len(actions))),  # q_table initial values
        columns=actions,                     # actions' names
    )
    # print(table)  # show table
    return table


def choose_action(state, q_table):
    # This is how to choose an action
    state_actions = q_table.iloc[state, :]
    if (np.random.rand() > EPSILON) or (state_actions == 0).all():  # act non-greedily, or explore when this state has no values yet
        action_name = np.random.choice(ACTIONS)
    else:  # act greedily
        action_name = state_actions.idxmax()
    return action_name


def get_env_feedback(S, A):
    # This is how the agent interacts with the environment
    if A == 'right':  # move right
        if S == N_STATES - 2:  # terminate
            S_ = 'terminal'
            R = 1
        else:
            S_ = S + 1
            R = 0
    else:  # move left
        R = 0
        if S == 0:
            S_ = S  # reached the wall
        else:
            S_ = S - 1
    return S_, R


def update_env(S, episode, step_counter):
    # This is how the environment is updated
    env_list = ['-'] * (N_STATES - 1) + ['T']  # '-----T' is our environment
    if S == 'terminal':
        interaction = 'Episode %s: total_steps = %s' % (episode + 1, step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(2)
    else:
        env_list[S] = 'o'
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)


def rl():
    # main part of the RL loop
    q_table = build_q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0
        is_terminated = False
        update_env(S, episode, step_counter)
        while not is_terminated:
            A = choose_action(S, q_table)
            S_, R = get_env_feedback(S, A)  # take action & get next state and reward
            q_predict = q_table.loc[S, A]
            if S_ != 'terminal':
                q_target = R + LAMBDA * q_table.iloc[S_, :].max()  # next state is not terminal
            else:
                q_target = R  # next state is terminal
                is_terminated = True  # terminate this episode

            q_table.loc[S, A] += ALPHA * (q_target - q_predict)  # update
            S = S_  # move to next state

            update_env(S, episode, step_counter + 1)
            step_counter += 1


if __name__ == "__main__":
    rl()
    print('\r\nfinish')
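
As a quick sanity check on the update line in rl(), here is a minimal sketch (not part of the commit) that traces one tabular Q-learning update by hand, reusing the script's ALPHA and LAMBDA values. The numbers are illustrative, for the case where the agent stands one step left of the treasure and moves right:

# Minimal sketch, not part of the commit: one tabular Q-learning update,
# mirroring q_table.loc[S, A] += ALPHA * (q_target - q_predict) above.
ALPHA, LAMBDA = 0.1, 0.9  # learning rate and discount factor, as in the script

q_predict = 0.0           # current estimate Q(S, 'right'), zero-initialised
R = 1                     # reward for stepping onto the treasure
q_target = R              # next state is terminal, so no discounted lookahead term
q_new = q_predict + ALPHA * (q_target - q_predict)
print(q_new)              # 0.1 -- later episodes propagate this value leftward

This is why the agent improves episode by episode: each time it reaches the treasure, a fraction ALPHA of the reward flows into the Q-table, and the LAMBDA-discounted max over the next state's values then carries it one cell further left on subsequent runs.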
