Skip to content

Commit 5ad71ad

Browse files
committed Nov 13, 2020
Chapter 15
1 parent 3e4f379 commit 5ad71ad

5 files changed

+481
-0
lines changed
 

‎Chapter15/Newsvendor plots.ipynb

+152
Large diffs are not rendered by default.

‎Chapter15/eval_inv_policy.py

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
import numpy as np
2+
import ray
3+
from ray.rllib.agents.ppo.ppo import DEFAULT_CONFIG
4+
from ray.rllib.agents.ppo.ppo import PPOTrainer
5+
6+
from inventory_env import InventoryEnv
7+
8+
# Evaluation uses RLlib's default PPO settings; only the environment is set.
config = DEFAULT_CONFIG.copy()
config["env"] = InventoryEnv

ray.init()
trainer = PPOTrainer(config=config, env=InventoryEnv)

# Load the trained policy weights.
# Replace this with your checkpoint path.
trainer.restore(
    "/home/enes/ray_results/PPO_InventoryEnv_2020-10-06_04-58-04t8r36o9o/checkpoint_781/checkpoint-781"
)

if __name__ == "__main__":
    # Fixed seed so that the sampled test episodes are reproducible.
    np.random.seed(0)
    env = InventoryEnv()
    episode_reward_avgs, episode_total_rewards = [], []
    for i in range(2000):
        print(f"Episode: {i+1}")
        ep_rewards = []
        state = env.reset()
        # Roll the restored policy forward until the environment reports
        # the end of the episode.
        while True:
            chosen_action = trainer.compute_action(state)
            state, reward, done, _ = env.step(chosen_action)
            ep_rewards.append(reward)
            if done:
                break
        total_reward = np.sum(ep_rewards)
        reward_per_day = np.mean(ep_rewards)
        print(f"Total reward: {total_reward}")
        print(f"Reward per time step: {reward_per_day}")
        episode_reward_avgs.append(reward_per_day)
        episode_total_rewards.append(total_reward)

    # Summary over all test episodes.
    print(
        f"Average daily reward over {len(episode_reward_avgs)} "
        f"test episodes: {np.mean(episode_reward_avgs)}. "
        f"Average total epsisode reward: {np.mean(episode_total_rewards)}"
    )

‎Chapter15/inventory_env.py

+191
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
"""
2+
This code is modified from:
3+
https://github.com/awslabs/or-rl-benchmarks/blob/master/News%20Vendor/src/news_vendor_environment.py
4+
"""
5+
6+
import gym
7+
import numpy as np
8+
from gym import spaces
9+
from scipy.stats import poisson
10+
11+
12+
class InventoryEnv(gym.Env):
    """Lost-sales inventory environment with a configurable order lead time.

    At every ``reset`` the product economics (price, unit cost, holding cost,
    loss-of-goodwill penalty and mean Poisson demand) are re-sampled, and the
    episode then runs for ``max_steps`` steps.

    Internal state layout (``inv_dim + 5`` entries, raw units):
      * ``state[0]``            -- on-hand inventory
      * ``state[1:inv_dim]``    -- in-transit orders, nearest arrival first
      * ``state[inv_dim]``      -- price
      * ``state[inv_dim + 1]``  -- unit purchase cost
      * ``state[inv_dim + 2]``  -- holding cost per leftover unit
      * ``state[inv_dim + 3]``  -- loss-of-goodwill penalty per unmet unit
      * ``state[inv_dim + 4]``  -- mean of the Poisson demand

    The action is a single number in [0, 1] that is scaled by
    ``order_limit`` to obtain the order quantity.

    NOTE: ``reset`` and ``step`` return the *normalized* observation (see
    ``_normalize_obs``), while ``observation_space`` is declared with the raw
    upper bounds.

    Args:
        config: Optional dict. The key ``"lead time"`` (default 5) sets the
            number of steps between placing an order and being able to sell
            it; 0 means the order is available immediately.
    """

    def __init__(self, config=None):
        # Avoid a shared mutable default argument.
        config = {} if config is None else config
        self.l = config.get("lead time", 5)
        self.storage_capacity = 4000
        self.order_limit = 1000
        self.step_count = 0
        self.max_steps = 40

        # Upper bounds used both for sampling in reset() and normalization.
        self.max_value = 100.0
        self.max_holding_cost = 5.0
        self.max_loss_goodwill = 10.0
        self.max_mean = 200

        # One slot for on-hand inventory even when the lead time is zero.
        self.inv_dim = max(1, self.l)
        space_low = self.inv_dim * [0]
        space_high = self.inv_dim * [self.storage_capacity]
        space_low += 5 * [0]
        space_high += [
            self.max_value,
            self.max_value,
            self.max_holding_cost,
            self.max_loss_goodwill,
            self.max_mean,
        ]
        self.observation_space = spaces.Box(
            low=np.array(space_low),
            high=np.array(space_high),
            dtype=np.float32
        )

        # Action is between 0 and 1, representing order quantity from
        # 0 up to the order limit.
        self.action_space = spaces.Box(
            low=np.array([0]),
            high=np.array([1]),
            dtype=np.float32
        )
        self.state = None
        self.reset()

    def _normalize_obs(self):
        """Return a scaled copy of ``self.state``.

        Inventory entries are divided by ``order_limit`` (so they can exceed
        1 when more than ``order_limit`` units are held); each economic
        parameter is divided by its own upper bound.
        """
        obs = np.array(self.state)
        obs[:self.inv_dim] = obs[:self.inv_dim] / self.order_limit
        obs[self.inv_dim] = obs[self.inv_dim] / self.max_value
        obs[self.inv_dim + 1] = obs[self.inv_dim + 1] / self.max_value
        obs[self.inv_dim + 2] = obs[self.inv_dim + 2] / self.max_holding_cost
        obs[self.inv_dim + 3] = obs[self.inv_dim + 3] / self.max_loss_goodwill
        obs[self.inv_dim + 4] = obs[self.inv_dim + 4] / self.max_mean
        return obs

    def reset(self):
        """Start a new episode with empty inventory and freshly sampled
        economics, and return the normalized observation."""
        self.step_count = 0

        # The unit cost never exceeds the price, and the holding cost never
        # exceeds the unit cost or max_holding_cost.
        price = np.random.rand() * self.max_value
        cost = np.random.rand() * price
        holding_cost = np.random.rand() * min(cost, self.max_holding_cost)
        loss_goodwill = np.random.rand() * self.max_loss_goodwill
        mean_demand = np.random.rand() * self.max_mean

        self.state = np.zeros(self.inv_dim + 5)
        self.state[self.inv_dim] = price
        self.state[self.inv_dim + 1] = cost
        self.state[self.inv_dim + 2] = holding_cost
        self.state[self.inv_dim + 3] = loss_goodwill
        self.state[self.inv_dim + 4] = mean_demand

        return self._normalize_obs()

    def break_state(self):
        """Split the raw state into
        ``(inventory, price, cost, holding cost, goodwill penalty, mean demand)``.

        The inventory part is a view into ``self.state``, not a copy.
        """
        inv_state = self.state[: self.inv_dim]
        p = self.state[self.inv_dim]
        c = self.state[self.inv_dim + 1]
        h = self.state[self.inv_dim + 2]
        k = self.state[self.inv_dim + 3]
        mu = self.state[self.inv_dim + 4]
        return inv_state, p, c, h, k, mu

    def step(self, action):
        """Place an order, realize one period of demand and advance a step.

        Args:
            action: Sequence whose first element is the order size as a
                fraction of ``order_limit``; it is clipped to [0, 1].

        Returns:
            ``(normalized observation, reward, done, info)`` where the reward
            is revenue minus purchase, holding and lost-sale costs, divided
            by 10000, and ``info`` reports demand, sales, underage and
            overage for the period.
        """
        beginning_inv_state, p, c, h, k, mu = \
            self.break_state()
        action = np.clip(action[0], 0, 1)
        action = int(action * self.order_limit)
        done = False

        # Orders are capped so that on-hand plus in-transit inventory never
        # exceeds the storage capacity.
        available_capacity = self.storage_capacity \
            - np.sum(beginning_inv_state)
        assert available_capacity >= 0
        buys = min(action, available_capacity)
        # If lead time is zero, immediately
        # increase the inventory
        if self.l == 0:
            self.state[0] += buys
        on_hand = self.state[0]
        demand_realization = np.random.poisson(mu)

        # Compute Reward
        sales = min(on_hand,
                    demand_realization)
        sales_revenue = p * sales
        overage = on_hand - sales
        underage = max(0, demand_realization
                       - on_hand)
        purchase_cost = c * buys
        holding = overage * h
        penalty_lost_sale = k * underage
        reward = sales_revenue \
            - purchase_cost \
            - holding \
            - penalty_lost_sale

        # Day is over. Update the inventory
        # levels for the beginning of the next day
        # In-transit inventory levels shift to left
        if self.inv_dim > 1:
            self.state[: self.inv_dim - 1] \
                = self.state[1: self.inv_dim]
        # Clear the last pipeline slot (this is slot 0 when inv_dim == 1).
        self.state[self.inv_dim - 1] = 0
        # Leftover stock is carried over on top of whatever just arrived.
        self.state[0] += overage
        # Add the recently bought inventory
        # if the lead time is positive.
        # Accumulate rather than assign: with a lead time of 1 the last slot
        # is slot 0, and assigning would discard the carried-over stock.
        if self.l > 0:
            self.state[self.l - 1] += buys
        self.step_count += 1
        if self.step_count >= self.max_steps:
            done = True

        # Normalize the reward
        reward = reward / 10000
        info = {
            "demand realization": demand_realization,
            "sales": sales,
            "underage": underage,
            "overage": overage,
        }
        return self._normalize_obs(), reward, done, info
147+
148+
149+
def get_action_from_benchmark_policy(env):
    """Return the action of a newsvendor-style order-up-to benchmark policy.

    The policy computes the critical ratio ``cu / (cu + co)`` with
    ``cu = p - c + k`` (cost of underage) and ``co = h`` (cost of overage),
    takes that quantile of a Poisson distribution with mean
    ``(len(inv_state) + 1) * mu``, and orders the shortfall between that
    target and the total inventory (on hand plus in transit), capped at
    ``env.order_limit``.

    Args:
        env: Object exposing ``break_state()`` (returning
            ``(inv_state, p, c, h, k, mu)``) and ``order_limit``.

    Returns:
        A one-element list with the order quantity as a fraction of
        ``env.order_limit``.
    """
    inv_state, p, c, h, k, mu = env.break_state()
    cost_of_overage = h
    cost_of_underage = p - c + k
    # np.clip takes (value, lower, upper): the ratio is the value to bound.
    critical_ratio = np.clip(
        cost_of_underage / (cost_of_underage + cost_of_overage),
        0,
        1,
    )
    horizon_target = poisson.ppf(critical_ratio,
                                 (len(inv_state) + 1) * mu)
    # A critical ratio of 1 yields an infinite quantile, which int() cannot
    # convert; leave it as-is so the order is simply capped at order_limit.
    if np.isfinite(horizon_target):
        horizon_target = int(horizon_target)
    deficit = max(0, horizon_target - np.sum(inv_state))
    buy_action = min(deficit, env.order_limit)
    return [buy_action / env.order_limit]
162+
163+
164+
if __name__ == "__main__":
    # Evaluate the benchmark policy on 2000 randomly generated episodes.
    np.random.seed(100)
    env = InventoryEnv()
    episode_reward_avgs, episode_total_rewards = [], []
    for i in range(2000):
        print(f"Episode: {i+1}")
        env.reset()
        ep_rewards = []
        # The benchmark policy reads the raw state from the environment
        # itself, so the returned observation is not needed here.
        # (Use env.action_space.sample() instead for a random policy.)
        while True:
            benchmark_action = get_action_from_benchmark_policy(env)
            _, reward, done, _ = env.step(benchmark_action)
            ep_rewards.append(reward)
            if done:
                break
        total_reward = np.sum(ep_rewards)
        reward_per_day = np.mean(ep_rewards)
        episode_reward_avgs.append(reward_per_day)
        episode_total_rewards.append(total_reward)

    # Summary over all test episodes.
    print(
        f"Average daily reward over {len(episode_reward_avgs)} "
        f"test episodes: {np.mean(episode_reward_avgs)}. "
        f"Average total epsisode reward: {np.mean(episode_total_rewards)}"
    )

‎Chapter15/train_inv_policy.py

+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import numpy as np
2+
3+
import ray
4+
from ray.tune.logger import pretty_print
5+
from ray.rllib.agents.ppo.ppo import DEFAULT_CONFIG
6+
from ray.rllib.agents.ppo.ppo import PPOTrainer
7+
8+
from inventory_env import InventoryEnv
9+
10+
# Start from RLlib's default PPO configuration and override selected fields.
config = DEFAULT_CONFIG.copy()
config["env"] = InventoryEnv
config["num_gpus"] = 1  # Set this to 0 if you don't have a GPU.
config["num_workers"] = 50  # Set this based on the number of CPUs on your machine

# Combination 1
# config["clip_param"] = 0.3
# config["entropy_coeff"] = 0
# config["grad_clip"] = 0.01
# config["kl_target"] = 0.05
# config["lr"] = 0.0001
# config["num_sgd_iter"] = 10
# config["sgd_minibatch_size"] = 128
# config["train_batch_size"] = 10000
# config["use_gae"] = True
# config["vf_clip_param"] = 10
# config["vf_loss_coeff"] = 1
# config["vf_share_layers"] = True

# Combination 2
config["clip_param"] = 0.3
config["entropy_coeff"] = 0
config["grad_clip"] = None
config["kl_target"] = 0.005
config["lr"] = 0.001
config["num_sgd_iter"] = 5
config["sgd_minibatch_size"] = 8192
config["train_batch_size"] = 20000
config["use_gae"] = True
config["vf_clip_param"] = 10
config["vf_loss_coeff"] = 1
config["vf_share_layers"] = False

# For better gradient estimates in the later stages
# of the training, increase the batch sizes.
# config["sgd_minibatch_size"] = 8192 * 4
# config["train_batch_size"] = 20000 * 10

ray.init()
trainer = PPOTrainer(config=config, env=InventoryEnv)

# Use this when you want to continue from a checkpoint.
# trainer.restore(
#   "/home/enes/ray_results/PPO_InventoryEnv_2020-10-06_04-31-2945lwn1wg/checkpoint_737/checkpoint-737"
# )

# Train indefinitely (stop the script manually), saving a checkpoint every
# time the mean episode reward improves on the best value seen so far.
# -np.inf is used instead of the np.NINF alias, which NumPy 2.0 removed.
best_mean_reward = -np.inf
while True:
    result = trainer.train()
    print(pretty_print(result))
    mean_reward = result.get("episode_reward_mean", -np.inf)
    if mean_reward > best_mean_reward:
        checkpoint = trainer.save()
        print("checkpoint saved at", checkpoint)
        best_mean_reward = mean_reward

‎Chapter15/tune_inv_policy.py

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import ray
2+
from ray import tune
3+
from inventory_env import InventoryEnv
4+
5+
ray.init()

# Each trial stops once it has sampled one million environment timesteps.
stop_criteria = {"timesteps_total": 1e6}

# PPO hyperparameter search space. The learning rate is swept exhaustively
# with grid_search; every other tuned entry is drawn with tune.choice.
search_space = {
    "env": InventoryEnv,
    "rollout_fragment_length": 40,
    "num_gpus": 1,
    "num_workers": 50,
    "lr": tune.grid_search([0.01, 0.001, 0.0001, 0.00001]),
    "use_gae": tune.choice([True, False]),
    "train_batch_size": tune.choice([5000, 10000, 20000, 40000]),
    "sgd_minibatch_size": tune.choice([128, 1024, 4096, 8192]),
    "num_sgd_iter": tune.choice([5, 10, 30]),
    "vf_loss_coeff": tune.choice([0.1, 1, 10]),
    "vf_share_layers": tune.choice([True, False]),
    "entropy_coeff": tune.choice([0, 0.1, 1]),
    "clip_param": tune.choice([0.05, 0.1, 0.3, 0.5]),
    "vf_clip_param": tune.choice([1, 5, 10]),
    "grad_clip": tune.choice([None, 0.01, 0.1, 1]),
    "kl_target": tune.choice([0.005, 0.01, 0.05]),
    "eager": False,
}

tune.run(
    "PPO",
    stop=stop_criteria,
    num_samples=5,
    config=search_space,
)

0 commit comments

Comments
 (0)
Please sign in to comment.