
Commit 1b58c4d

Committed Dec 10, 2020
Chapter 12: Meta RL
1 parent 9ea4961 commit 1b58c4d

File tree

4 files changed: +354 −0 lines changed


Chapter12/ml-test.py

+50 lines (new file)
from copy import deepcopy

import ray
from ray import tune

#from ray.rllib.examples.env.pendulum_mass import PendulumMassEnv
from ray.rllib.agents.maml.maml import MAMLTrainer, DEFAULT_CONFIG

import numpy as np
import gym
from gym.envs.classic_control.pendulum import PendulumEnv
from ray.rllib.env.meta_env import MetaEnv

from penenv3 import PenEnv

config = deepcopy(DEFAULT_CONFIG)


ray.init()

# Meta-train MAML on the task-settable pendulum: each task is a different
# pendulum mass, which the trainer switches via PenEnv.sample_tasks()/set_task().
tune.run(
    "MAML",
    stop={"training_iteration": 500},
    config=dict(
        DEFAULT_CONFIG,
        **{
            "env": PenEnv,
            "horizon": 200,
            "rollout_fragment_length": 200,
            "num_envs_per_worker": 10,
            "inner_adaptation_steps": 1,  # inner-loop gradient steps per sampled task
            "maml_optimizer_steps": 5,    # meta-optimizer steps per training iteration
            "gamma": 0.99,
            "lambda": 1.0,
            "lr": 0.001,                  # outer (meta) learning rate
            "vf_loss_coeff": 0.5,
            "clip_param": 0.3,
            "kl_target": 0.01,
            "kl_coeff": 0.001,
            "num_workers": 60,
            "num_gpus": 1,
            "inner_lr": 0.03,             # learning rate of the inner adaptation step
            "explore": True,
            "clip_actions": False,
            "model": {
                "fcnet_hiddens": [64, 64],
                "free_log_std": True,
            },
        },
    ),
    checkpoint_freq=10,
)
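
Not part of the commit, but as a usage note: once tune.run has written checkpoints, the meta-policy can be restored with MAMLTrainer and rolled out on a freshly sampled task. The sketch below is untested and assumes the same Ray/RLlib version as the training script; the checkpoint path and worker counts are placeholders. Here "inner_lr" is the step size of the inner adaptation loop and "lr" the outer (meta) learning rate.

# Hypothetical evaluation sketch -- checkpoint path and worker counts are placeholders.
import ray
from ray.rllib.agents.maml.maml import MAMLTrainer, DEFAULT_CONFIG
from penenv3 import PenEnv

ray.init()
eval_config = dict(DEFAULT_CONFIG, **{"env": PenEnv, "num_workers": 1, "num_gpus": 0})
trainer = MAMLTrainer(config=eval_config)
trainer.restore("checkpoints/checkpoint_500/checkpoint-500")  # placeholder path

env = PenEnv()
env.set_task(env.sample_tasks(1)[0])      # pick one pendulum mass to evaluate on
obs, episode_return = env.reset(), 0.0
for _ in range(200):
    action = trainer.compute_action(obs)  # meta-policy action, before inner adaptation
    obs, reward, done, _ = env.step(action)
    episode_return += reward
print("episode return:", episode_return)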

Chapter12/ml-test_lstm.py

+49 lines (new file)
from copy import deepcopy

import ray
from ray import tune

from ray.rllib.agents.ppo.ppo import PPOTrainer, DEFAULT_CONFIG

import numpy as np
import gym
from gym.envs.classic_control.pendulum import PendulumEnv
from ray.rllib.env.meta_env import MetaEnv

from penenv2 import PenEnv2

config = deepcopy(DEFAULT_CONFIG)


ray.init()

# Recurrent PPO baseline: instead of explicit meta-gradients, an LSTM policy
# (fed the previous action and reward) has to infer the randomly sampled
# pendulum mass from its interaction history within each episode.
tune.run(
    PPOTrainer,
    #stop={"training_iteration": 500},
    config=dict(
        DEFAULT_CONFIG,
        **{
            "env": PenEnv2,
            "horizon": 200,
            "rollout_fragment_length": 200,
            #"num_envs_per_worker": 10,
            "gamma": 0.99,
            "lambda": 1.0,
            "lr": 0.001,
            "vf_loss_coeff": 0.5,
            "clip_param": 0.3,
            "kl_target": 0.01,
            "kl_coeff": 0.001,
            "num_workers": 60,
            "num_gpus": 1,
            "clip_actions": False,
            "model": {
                #"fcnet_hiddens": [64, 64],
                "use_lstm": True,
                "lstm_cell_size": 128,
                "lstm_use_prev_action_reward": True,  # feed a_{t-1}, r_{t-1} into the LSTM
                "max_seq_len": 10,
            },
        },
    ),
    checkpoint_freq=10,
)
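
Again as a usage note rather than part of the commit: when querying the trained recurrent policy directly, the LSTM state has to be passed back in on every call, along with the previous action and reward because "lstm_use_prev_action_reward" is enabled. A minimal, untested sketch under the same version assumptions, with a placeholder checkpoint path:

import numpy as np
import ray
from ray.rllib.agents.ppo.ppo import PPOTrainer, DEFAULT_CONFIG
from penenv2 import PenEnv2

ray.init()
eval_config = dict(DEFAULT_CONFIG, **{
    "env": PenEnv2,
    "num_workers": 1,
    "num_gpus": 0,
    "model": {"use_lstm": True, "lstm_cell_size": 128,
              "lstm_use_prev_action_reward": True, "max_seq_len": 10},
})
trainer = PPOTrainer(config=eval_config)
trainer.restore("checkpoints/checkpoint_xxx/checkpoint-xxx")  # placeholder path

env = PenEnv2()
obs = env.reset()
state = trainer.get_policy().get_initial_state()  # zeroed LSTM state
prev_action, prev_reward = np.zeros(1, dtype=np.float32), 0.0
for _ in range(200):
    # With a recurrent policy, compute_action returns (action, new_state, info).
    action, state, _ = trainer.compute_action(
        obs, state=state, prev_action=prev_action, prev_reward=prev_reward)
    obs, reward, done, _ = env.step(action)
    prev_action, prev_reward = action, reward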

Chapter12/penenv2.py

+128 lines (new file)
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np
from os import path


class PenEnv2(gym.Env):
    """Pendulum with a hidden, randomly sampled mass.

    The mass is redrawn from [0.5, 2.0] at every reset() and is not part of
    the observation, so a memory-based (LSTM) policy has to identify the
    dynamics from interaction.
    """

    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 30
    }

    def __init__(self, env_config={}):
        self.max_speed = 8
        self.max_torque = 2.
        self.dt = .05
        self.g = env_config.get("g", 10)
        self.m = 1.
        self.l = 1.
        self.viewer = None

        high = np.array([1., 1., self.max_speed], dtype=np.float32)
        self.action_space = spaces.Box(
            low=-self.max_torque,
            high=self.max_torque, shape=(1,),
            dtype=np.float32
        )
        self.observation_space = spaces.Box(
            low=-high,
            high=high,
            dtype=np.float32
        )

        self.seed()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self, u):
        th, thdot = self.state  # th := theta

        g = self.g
        m = self.m
        l = self.l
        dt = self.dt

        u = np.clip(u, -self.max_torque, self.max_torque)[0]
        self.last_u = u  # for rendering
        costs = angle_normalize(th) ** 2 + .1 * thdot ** 2 + .001 * (u ** 2)

        newthdot = thdot + (-3 * g / (2 * l) * np.sin(th + np.pi) + 3. / (m * l ** 2) * u) * dt
        newth = th + newthdot * dt
        newthdot = np.clip(newthdot, -self.max_speed, self.max_speed)

        self.state = np.array([newth, newthdot])
        return self._get_obs(), -costs, False, {}

    def reset(self):
        high = np.array([np.pi, 1])
        self.state = self.np_random.uniform(low=-high, high=high)
        self.last_u = None
        # Resample the pendulum mass for every episode.
        self.m = np.random.uniform(low=0.5, high=2.0)
        return self._get_obs()

    def _get_obs(self):
        theta, thetadot = self.state
        return np.array([np.cos(theta), np.sin(theta), thetadot])

    def render(self, mode='human'):
        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(500, 500)
            self.viewer.set_bounds(-2.2, 2.2, -2.2, 2.2)
            rod = rendering.make_capsule(1, .2)
            rod.set_color(.8, .3, .3)
            self.pole_transform = rendering.Transform()
            rod.add_attr(self.pole_transform)
            self.viewer.add_geom(rod)
            axle = rendering.make_circle(.05)
            axle.set_color(0, 0, 0)
            self.viewer.add_geom(axle)
            fname = path.join(path.dirname(__file__), "assets/clockwise.png")
            self.img = rendering.Image(fname, 1., 1.)
            self.imgtrans = rendering.Transform()
            self.img.add_attr(self.imgtrans)

        self.viewer.add_onetime(self.img)
        self.pole_transform.set_rotation(self.state[0] + np.pi / 2)
        if self.last_u:
            self.imgtrans.scale = (-self.last_u / 2, np.abs(self.last_u) / 2)

        return self.viewer.render(return_rgb_array=mode == 'rgb_array')

    def close(self):
        if self.viewer:
            self.viewer.close()
            self.viewer = None

    def sample_tasks(self, n_tasks):
        # Mass is a random float between 0.5 and 2
        return np.random.uniform(low=0.5, high=2.0, size=(n_tasks, ))

    def set_task(self, task):
        """
        Args:
            task: task of the meta-learning environment (the pendulum mass)
        """
        self.m = task

    def get_task(self):
        """
        Returns:
            task: task of the meta-learning environment (the pendulum mass)
        """
        return self.m


def angle_normalize(x):
    return (((x + np.pi) % (2 * np.pi)) - np.pi)
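
A quick, illustrative sanity check (not part of the commit): every reset() of PenEnv2 draws a new mass that never appears in the observation; get_task() only exposes it for inspection.

from penenv2 import PenEnv2

env = PenEnv2()
for episode in range(3):
    obs = env.reset()                      # a new mass is sampled here
    episode_return = 0.0
    for _ in range(200):
        obs, reward, done, _ = env.step(env.action_space.sample())
        episode_return += reward
    print("mass", round(float(env.get_task()), 3),
          "random-policy return", round(float(episode_return), 1))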

Chapter12/penenv3.py

+127 lines (new file)
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np
from os import path


class PenEnv(gym.Env):
    """Task-settable pendulum for MAML.

    Each task is a pendulum mass in [0.5, 2.0]. The trainer switches tasks
    through sample_tasks()/set_task() (the task interface RLlib's MAML
    expects), and reset() keeps whatever mass is currently active.
    """

    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 30
    }

    def __init__(self, env_config={}):
        self.max_speed = 8
        self.max_torque = 2.
        self.dt = .05
        self.g = env_config.get("g", 10)
        self.m = 1.
        self.l = 1.
        self.viewer = None

        high = np.array([1., 1., self.max_speed], dtype=np.float32)
        self.action_space = spaces.Box(
            low=-self.max_torque,
            high=self.max_torque, shape=(1,),
            dtype=np.float32
        )
        self.observation_space = spaces.Box(
            low=-high,
            high=high,
            dtype=np.float32
        )

        self.seed()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self, u):
        th, thdot = self.state  # th := theta

        g = self.g
        m = self.m
        l = self.l
        dt = self.dt

        u = np.clip(u, -self.max_torque, self.max_torque)[0]
        self.last_u = u  # for rendering
        costs = angle_normalize(th) ** 2 + .1 * thdot ** 2 + .001 * (u ** 2)

        newthdot = thdot + (-3 * g / (2 * l) * np.sin(th + np.pi) + 3. / (m * l ** 2) * u) * dt
        newth = th + newthdot * dt
        newthdot = np.clip(newthdot, -self.max_speed, self.max_speed)

        self.state = np.array([newth, newthdot])
        return self._get_obs(), -costs, False, {}

    def reset(self):
        high = np.array([np.pi, 1])
        self.state = self.np_random.uniform(low=-high, high=high)
        self.last_u = None
        # Note: unlike PenEnv2, the mass is NOT resampled here; it is set
        # externally via set_task().
        return self._get_obs()

    def _get_obs(self):
        theta, thetadot = self.state
        return np.array([np.cos(theta), np.sin(theta), thetadot])

    def render(self, mode='human'):
        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(500, 500)
            self.viewer.set_bounds(-2.2, 2.2, -2.2, 2.2)
            rod = rendering.make_capsule(1, .2)
            rod.set_color(.8, .3, .3)
            self.pole_transform = rendering.Transform()
            rod.add_attr(self.pole_transform)
            self.viewer.add_geom(rod)
            axle = rendering.make_circle(.05)
            axle.set_color(0, 0, 0)
            self.viewer.add_geom(axle)
            fname = path.join(path.dirname(__file__), "assets/clockwise.png")
            self.img = rendering.Image(fname, 1., 1.)
            self.imgtrans = rendering.Transform()
            self.img.add_attr(self.imgtrans)

        self.viewer.add_onetime(self.img)
        self.pole_transform.set_rotation(self.state[0] + np.pi / 2)
        if self.last_u:
            self.imgtrans.scale = (-self.last_u / 2, np.abs(self.last_u) / 2)

        return self.viewer.render(return_rgb_array=mode == 'rgb_array')

    def close(self):
        if self.viewer:
            self.viewer.close()
            self.viewer = None

    def sample_tasks(self, n_tasks):
        # Mass is a random float between 0.5 and 2
        return np.random.uniform(low=0.5, high=2.0, size=(n_tasks, ))

    def set_task(self, task):
        """
        Args:
            task: task of the meta-learning environment (the pendulum mass)
        """
        self.m = task

    def get_task(self):
        """
        Returns:
            task: task of the meta-learning environment (the pendulum mass)
        """
        return self.m


def angle_normalize(x):
    return (((x + np.pi) % (2 * np.pi)) - np.pi)
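
For completeness, an illustrative look at the task interface MAML relies on (not part of the commit): sample_tasks() draws pendulum masses, set_task() activates one, and get_task() reports the active mass; unlike PenEnv2, reset() here leaves the mass unchanged.

from penenv3 import PenEnv

env = PenEnv()
for task in env.sample_tasks(5):        # five masses drawn uniformly from [0.5, 2.0]
    env.set_task(task)                  # activate this task
    obs = env.reset()                   # the mass stays fixed across resets
    print("active mass:", round(float(env.get_task()), 3))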
