import csv
from datetime import datetime
import numpy as np
import ray
from ray.tune.logger import pretty_print
from ray.rllib.agents.dqn.apex import ApexTrainer
from ray.rllib.agents.dqn.apex import APEX_DEFAULT_CONFIG
from ray.rllib.models import ModelCatalog

from custom_mcar import MountainCar
from masking_model import ParametricActionsModel
from mcar_demo import DEMO_DATA_DIR

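# Training strategies available for the MountainCar experiments; STRATEGY picks the active one.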
ALL_STRATEGIES = [
    "default",
    "with_dueling",
    "custom_reward",
    "custom_reward_n_dueling",
    "demonstration",
    "curriculum",
    "curriculum_n_dueling",
    "action_masking",
]
STRATEGY = "demonstration"
CURRICULUM_MAX_LESSON = 4
CURRICULUM_TRANS = 150
MAX_STEPS = 2e6
MAX_STEPS_OFFLINE = 4e5
NUM_TRIALS = 5
NUM_FINAL_EVAL_EPS = 20


def get_apex_trainer(strategy):
    config = APEX_DEFAULT_CONFIG.copy()
    config["env"] = MountainCar
    config["buffer_size"] = 1000000
    config["learning_starts"] = 10000
    config["target_network_update_freq"] = 50000
    config["rollout_fragment_length"] = 200
    config["timesteps_per_iteration"] = 10000
    config["num_gpus"] = 1
    config["num_workers"] = 20
    config["evaluation_num_workers"] = 10
    config["evaluation_interval"] = 1
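    # Use a plain Q-head (no dueling streams) unless the strategy explicitly uses dueling.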
    if strategy not in [
        "with_dueling",
        "custom_reward_n_dueling",
        "curriculum_n_dueling",
    ]:
        config["hiddens"] = []
        config["dueling"] = False

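    # Strategy-specific environment and model settings.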
    if strategy == "action_masking":
        ModelCatalog.register_custom_model("pa_model", ParametricActionsModel)
        config["env_config"] = {"use_action_masking": True}
        config["model"] = {
            "custom_model": "pa_model",
        }
    elif strategy == "custom_reward" or strategy == "custom_reward_n_dueling":
        config["env_config"] = {"reward_fun": "custom_reward"}
    elif strategy in ["curriculum", "curriculum_n_dueling"]:
        config["env_config"] = {"lesson": 0}
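    # Learning from demonstrations: read experiences from the saved demo data,
    # disable exploration and off-policy estimation, and use 1-step targets.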
    elif strategy == "demonstration":
        config["input"] = DEMO_DATA_DIR
        # config["input"] = {"sampler": 0.7, DEMO_DATA_DIR: 0.3}
        config["explore"] = False
        config["input_evaluation"] = []
        config["n_step"] = 1

    trainer = ApexTrainer(config=config)
    return trainer, config["env_config"]


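# Push the given curriculum lesson to every rollout and evaluation worker's env.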
def set_trainer_lesson(trainer, lesson):
    trainer.evaluation_workers.foreach_worker(
        lambda ev: ev.foreach_env(lambda env: env.set_lesson(lesson))
    )
    trainer.workers.foreach_worker(
        lambda ev: ev.foreach_env(lambda env: env.set_lesson(lesson))
    )


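# Advance to the next lesson, capped at CURRICULUM_MAX_LESSON.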
def increase_lesson(lesson):
    if lesson < CURRICULUM_MAX_LESSON:
        lesson += 1
    return lesson


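# Roll out the trained policy in a standalone environment and report the mean episode length.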
def final_evaluation(trainer, n_final_eval, env_config=None):
    env = MountainCar(env_config or {})
    eps_lengths = []
    for i_episode in range(n_final_eval):
        observation = env.reset()
        done = False
        t = 0
        while not done:
            t += 1
            action = trainer.compute_action(observation)
            observation, reward, done, info = env.step(action)
            if done:
                eps_lengths.append(t)
                print(f"Episode finished after {t} time steps")
    print(
        f"Avg. episode length {np.mean(eps_lengths)} out of {len(eps_lengths)} episodes."
    )
    return np.mean(eps_lengths)


### START TRAINING ###
ray.init()
avg_eps_lens = []
for i in range(NUM_TRIALS):
    trainer, env_config = get_apex_trainer(STRATEGY)
    if STRATEGY in ["curriculum", "curriculum_n_dueling"]:
        lesson = 0
        set_trainer_lesson(trainer, lesson)
    # Training
    offline_training_done = False
    while True:
        results = trainer.train()
        print(pretty_print(results))
        if STRATEGY == "demonstration" and not offline_training_done:
            # Pre-train on the saved demonstration data first. Once the
            # offline step budget is spent, switch to a regular online
            # dueling trainer, whose timestep counter restarts from zero.
            if results["timesteps_total"] >= MAX_STEPS_OFFLINE:
                trainer, _ = get_apex_trainer("with_dueling")
                offline_training_done = True
        elif results["timesteps_total"] >= MAX_STEPS:
            break
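        # Once evaluation episodes get short enough, advance the curriculum lesson.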
        if "evaluation" in results and STRATEGY in ["curriculum", "curriculum_n_dueling"]:
            if results["evaluation"]["episode_len_mean"] < CURRICULUM_TRANS:
                lesson = increase_lesson(lesson)
                set_trainer_lesson(trainer, lesson)
                print(f"Lesson: {lesson}")

    # Final evaluation
    checkpoint = trainer.save()
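    # Curriculum agents are evaluated on the final (hardest) lesson.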
    if STRATEGY in ["curriculum", "curriculum_n_dueling"]:
        env_config["lesson"] = CURRICULUM_MAX_LESSON
    if STRATEGY == "action_masking":
        # Action masking runs into errors in Ray 1.0.1 during compute_action,
        # so we fall back to the evaluation episode lengths reported during training.
        avg_eps_len = results["evaluation"]["episode_len_mean"]
    else:
        avg_eps_len = final_evaluation(trainer, NUM_FINAL_EVAL_EPS, env_config)
    date_time = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
    result = [date_time, STRATEGY, str(i), avg_eps_len, checkpoint]
    avg_eps_lens.append(avg_eps_len)
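    # Append this trial's result to the CSV log.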
    with open("results.csv", "a", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(result)
print(f"Average episode length: {np.mean(avg_eps_lens)}")