update

Eagalon · Jan 4, 2024 · cf7a22d · cf7a22d
1 parent 4508fd0
commit cf7a22d
Show file tree

Hide file tree

Showing 6 changed files with 937 additions and 1 deletion.
diff --git a/WizardCoder/README.md b/WizardCoder/README.md
@@ -255,6 +255,167 @@ python process_humaneval.py --path ${output_path} --out_path ${output_path}.json
 evaluate_functional_correctness ${output_path}.jsonl
 ```
 
+### How to Reproduce the HumanEval(Plus)/MBPP(Plus) Performance of WizardCoder-33B-v1.1?
+
+❗❗❗**This performance is 100% reproducible!**
+
+```
+transformers==4.36.2
+vllm==0.2.5
+```
+
+(1) HumanEval and HumanEval-Plus
+
+- Step 1
+
+Code Generation (w/o accelerate)
+```bash
+model="WizardLM/WizardCoder-33B-V1.1"
+temp=0.0
+max_len=2048
+pred_num=1
+num_seqs_per_iter=1
+
+output_path=preds/T${temp}_N${pred_num}_WizardCoder-33B-V1.1_Greedy_Decode
+
+mkdir -p ${output_path}
+echo 'Output path: '$output_path
+echo 'Model to eval: '$model
+
+# 164 problems, 21 per GPU if GPU=8
+index=0
+gpu_num=8
+for ((i = 0; i < $gpu_num; i++)); do
+  start_index=$((i * 21))
+  end_index=$(((i + 1) * 21))
+
+  gpu=$((i))
+  echo 'Running process #' ${i} 'from' $start_index 'to' $end_index 'on GPU' ${gpu}
+  ((index++))
+  (
+    CUDA_VISIBLE_DEVICES=$gpu python humaneval_gen.py --model ${model} \
+      --start_index ${start_index} --end_index ${end_index} --temperature ${temp} \
+      --num_seqs_per_iter ${num_seqs_per_iter} --N ${pred_num} --max_len ${max_len} --output_path ${output_path} --greedy_decode
+  ) &
+  if (($index % $gpu_num == 0)); then wait; fi
+done
+```
+
+Code Generation (w/ vllm accelerate)
+```bash
+model="WizardLM/WizardCoder-33B-V1.1"
+temp=0.0
+max_len=2048
+pred_num=1
+num_seqs_per_iter=1
+
+output_path=preds/T${temp}_N${pred_num}_WizardCoder-33B-V1.1_Greedy_Decode_vllm
+
+mkdir -p ${output_path}
+echo 'Output path: '$output_path
+echo 'Model to eval: '$model
+
+CUDA_VISIBLE_DEVICES=0,1,2,3 python humaneval_gen_vllm.py --model ${model} \
+    --start_index 0 --end_index 164 --temperature ${temp} \
+    --num_seqs_per_iter ${num_seqs_per_iter} --N ${pred_num} --max_len ${max_len} --output_path ${output_path} --num_gpus 4 --overwrite
+```
+
+- Step 2: Get the score
+
+Install the [EvalPlus](https://github.com/evalplus/evalplus) benchmark.
+```bash
+git clone https://github.com/evalplus/evalplus.git
+cd evalplus
+export PYTHONPATH=$PYTHONPATH:$(pwd)
+pip install -r requirements.txt
+```
+Get HumanEval and HumanEval-Plus scores.
+```bash
+output_path=preds/T0.0_N1_WizardCoder-33B-V1.1_Greedy_Decode
+
+echo 'Output path: '$output_path
+python process_humaneval.py --path ${output_path} --out_path ${output_path}.jsonl --add_prompt
+
+evalplus.evaluate --dataset humaneval --samples ${output_path}.jsonl
+```
+
+(2) MBPP and MBPP-Plus
+
+The preprocessed questions are provided in `WizardLM/WizardCoder/data/mbppplus.json`
+
+- Step 1
+
+Code Generation (w/o accelerate)
+```bash
+model="WizardLM/WizardCoder-33B-V1.1"
+temp=0.0
+max_len=2048
+pred_num=1
+num_seqs_per_iter=1
+
+output_path=preds/MBPP_T${temp}_N${pred_num}_WizardCoder-33B-V1.1_Greedy_Decode
+
+mkdir -p ${output_path}
+echo 'Output path: '$output_path
+echo 'Model to eval: '$model
+
+# 399 problems, 50 per GPU if GPU=8
+index=0
+gpu_num=8
+for ((i = 0; i < $gpu_num; i++)); do
+  start_index=$((i * 50))
+  end_index=$(((i + 1) * 50))
+
+  gpu=$((i))
+  echo 'Running process #' ${i} 'from' $start_index 'to' $end_index 'on GPU' ${gpu}
+  ((index++))
+  (
+    CUDA_VISIBLE_DEVICES=$gpu python mbppplus_gen.py --model ${model} \
+      --start_index ${start_index} --end_index ${end_index} --temperature ${temp} \
+      --num_seqs_per_iter ${num_seqs_per_iter} --N ${pred_num} --max_len ${max_len} --output_path ${output_path} --mbpp_path "mbppplus.json" --greedy_decode
+  ) &
+  if (($index % $gpu_num == 0)); then wait; fi
+done
+```
+
+Code Generation (w/ vllm accelerate)
+```bash
+model="WizardLM/WizardCoder-33B-V1.1"
+temp=0.0
+max_len=2048
+pred_num=1
+num_seqs_per_iter=1
+
+output_path=preds/MBPP_T${temp}_N${pred_num}_WizardCoder-33B-V1.1_Greedy_Decode_vllm
+
+mkdir -p ${output_path}
+echo 'Output path: '$output_path
+echo 'Model to eval: '$model
+
+CUDA_VISIBLE_DEVICES=0,1,2,3 python mbppplus_gen_vllm.py --model ${model} \
+    --start_index 0 --end_index 399 --temperature ${temp} \
+    --num_seqs_per_iter ${num_seqs_per_iter} --N ${pred_num} --max_len ${max_len} --output_path ${output_path} --mbpp_path "mbppplus.json" --num_gpus 4
+```
+
+- Step 2: Get the score
+
+Install the [EvalPlus](https://github.com/evalplus/evalplus) benchmark.
+```bash
+git clone https://github.com/evalplus/evalplus.git
+cd evalplus
+export PYTHONPATH=$PYTHONPATH:$(pwd)
+pip install -r requirements.txt
+```
+Get MBPP and MBPP-Plus scores.
+```bash
+output_path=preds/MBPP_T0.0_N1_WizardCoder-33B-V1.1_Greedy_Decode
+
+echo 'Output path: '$output_path
+python mbppplus_process_preds.py --path ${output_path} --out_path ${output_path}.jsonl --add_prompt
+
+evalplus.evaluate --dataset mbpp --samples ${output_path}.jsonl
+```
+
 ### How to Reproduce the 73.2 Pass@1 on HumanEval with Greedy Decoding?
 
 ❗❗❗**This performance is 100% reproducible!**