Skip to content

Commit

Permalink
Parallelize CI tests
Browse files Browse the repository at this point in the history
Summary: Pull Request resolved: facebookresearch#174

Reviewed By: czxttkl

Differential Revision: D17956417

fbshipit-source-id: 9ba6059fcec6b463aa96b914bded3e8cbec87b27
  • Loading branch information
kittipatv authored and facebook-github-bot committed Oct 17, 2019
1 parent 6dd81da commit adfa6ab
Show file tree
Hide file tree
Showing 13 changed files with 45 additions and 255 deletions.
11 changes: 8 additions & 3 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ binary_common: &binary_common
resource_class_str:
description: "Resource class to use"
type: string
+ test_script:
+ description: "Path to test script"
+ type: string
environment:
DOCKER_IMAGE: << parameters.docker_image >>

Expand All @@ -39,16 +42,18 @@ jobs:
resource_class: << parameters.resource_class_str >>
steps:
  - checkout_merge
- - run: .jenkins/build.sh
+ - run: << parameters.test_script >>

workflows:
build:
jobs:
  - binary_linux_conda:
- name: binary_linux_conda_py3.7_cpu
+ name: linux_conda_py3.7_end_to_end
  docker_image: kittipatv/reagent:cpu
+ test_script: scripts/ci/run_end_to_end_test.sh
  resource_class_str: large
  - binary_linux_conda:
- name: binary_linux_conda_py3.7_cu92
+ name: linux_conda_py3.7_pytest
  docker_image: kittipatv/reagent:cuda
+ test_script: scripts/ci/run_python_unittest.sh
  resource_class_str: large
27 changes: 0 additions & 27 deletions .jenkins/README.md

This file was deleted.

93 changes: 0 additions & 93 deletions docker/jenkins.Dockerfile

This file was deleted.

53 changes: 0 additions & 53 deletions docker/jenkins/README.md

This file was deleted.

24 changes: 0 additions & 24 deletions docker/jenkins/add_jenkins_user.sh

This file was deleted.

36 changes: 0 additions & 36 deletions docker/jenkins/build.sh

This file was deleted.

3 changes: 2 additions & 1 deletion ml/rl/parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ class CEMParameters(BaseDataClass):
@dataclass
class OpenAiRunDetails(BaseDataClass):
solved_reward_threshold: Optional[int] = None
- max_episodes_to_run_after_solved: int = 0
+ max_episodes_to_run_after_solved: Optional[int] = None
stop_training_after_solved: bool = False
num_episodes: int = 301
max_steps: Optional[int] = None
Expand Down Expand Up @@ -327,6 +327,7 @@ class OpenAiGymParameters(BaseDataClass):
actor_training: Optional[FeedForwardParameters] = None
cem: Optional[CEMParameters] = None
mdnrnn: Optional[MDNRNNParameters] = None
+ evaluation: EvaluationParameters = EvaluationParameters()


#################################################
Expand Down
20 changes: 12 additions & 8 deletions ml/rl/test/gym/discrete_dqn_cartpole_small_v0.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
"rl": {
"gamma": 0.99,
"target_update_rate": 0.1,

"maxq_learning": true,
"epsilon": 0.05,
"temperature": 0.35,
Expand All @@ -28,20 +27,25 @@
"relu",
"linear"
  ],
- "minibatch_size": 1024,
- "learning_rate": 0.001,
+ "minibatch_size": 256,
+ "learning_rate": 0.005,
  "optimizer": "ADAM",
  "lr_decay": 0.999
  },
+ "evaluation": {
+ "calc_cpe_in_training": false
+ },
  "run_details": {
- "num_episodes": 150,
+ "num_episodes": 70,
  "max_steps": 200,
- "train_every_ts": 1,
+ "stop_training_after_solved": true,
+ "solved_reward_threshold": 200,
+ "train_every_ts": 2,
  "train_after_ts": 1,
- "test_every_ts": 2000,
+ "test_every_ts": 400,
  "test_after_ts": 1,
  "num_train_batches": 1,
- "avg_over_num_episodes": 100,
- "offline_train_epochs": 30
+ "avg_over_num_episodes": 25,
+ "offline_train_epochs": 10
  }
}
}
12 changes: 5 additions & 7 deletions ml/rl/test/gym/run_gym.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,9 +147,6 @@ def train(
run_details: OpenAiRunDetails,
save_timesteps_to_dataset=None,
start_saving_from_score=None,
- solved_reward_threshold=None,
- max_episodes_to_run_after_solved=None,
- stop_training_after_solved=False,
bcq_imitator_hyperparams=None,
reward_shape_func=None,
):
Expand Down Expand Up @@ -193,9 +190,9 @@ def train(
run_details.render,
save_timesteps_to_dataset,
start_saving_from_score,
- solved_reward_threshold,
- max_episodes_to_run_after_solved,
- stop_training_after_solved,
+ run_details.solved_reward_threshold,
+ run_details.max_episodes_to_run_after_solved,
+ run_details.stop_training_after_solved,
reward_shape_func,
)

Expand Down Expand Up @@ -515,7 +512,7 @@ def train_gym_online_rl(

if (
solved_reward_threshold is not None
- and best_episode_score_seen > solved_reward_threshold
+ and best_episode_score_seen >= solved_reward_threshold
):
solved = True

Expand Down Expand Up @@ -782,6 +779,7 @@ def create_trainer(params: OpenAiGymParameters, env: OpenAIGymEnvironment):
rl=rl_parameters,
training=training_parameters,
rainbow=params.rainbow,
+ evaluation=params.evaluation,
)
trainer = create_dqn_trainer_from_params(
discrete_trainer_params, env.normalization, use_gpu
Expand Down
4 changes: 4 additions & 0 deletions ml/rl/workflow/dqn_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,10 @@ def main(params):
"Horizon is configured to use all GPUs but your platform doesn't support torch.distributed & torch.cuda!"
)
params["use_all_avail_gpus"] = False
+ if params["use_gpu"] and not torch.cuda.is_available():
+ logger.info("GPU requested but not available")
+ params["use_gpu"] = False
+
if params["use_all_avail_gpus"]:
params["num_processes_per_node"] = max(1, torch.cuda.device_count())
multiprocessing.spawn(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
"rl": {
"gamma": 0.99,
"target_update_rate": 0.2,

"maxq_learning": true,
"epsilon": 0.2,
"temperature": 0.35,
Expand Down
Loading

0 comments on commit adfa6ab

Please sign in to comment.