From a90b99686a3b89ad36c31a01b766ecd14c302dbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabriel=20Mart=C3=ADn=20Bl=C3=A1zquez?= Date: Sat, 25 Jan 2025 15:01:15 +0100 Subject: [PATCH] Fix passing `vLLM` server URL (#21) * Use head node ip as vLLM server url * Pass correct server url * Add num_generations argument * Fix style * Remove `select` --------- Co-authored-by: plaguss --- slurm/generate.slurm | 56 ++++++++++++++++++++++++++++++++--------- src/open_r1/generate.py | 36 ++++++++++++++++++-------- 2 files changed, 69 insertions(+), 23 deletions(-) diff --git a/slurm/generate.slurm b/slurm/generate.slurm index 2f72819e..26b0e7e5 100644 --- a/slurm/generate.slurm +++ b/slurm/generate.slurm @@ -2,7 +2,7 @@ #SBATCH --job-name=deepseek-r1-generation #SBATCH --partition=hopper-prod #SBATCH --qos=normal -#SBATCH --nodes=2 +#SBATCH --nodes=4 #SBATCH --exclusive #SBATCH --gpus-per-node=8 #SBATCH --output=./logs/%x-%j.out @@ -44,6 +44,10 @@ while [[ $# -gt 0 ]]; do MAX_NEW_TOKENS="$2" shift 2 ;; + --num-generations) + NUM_GENERATIONS="$2" + shift 2 + ;; --hf-output-dataset) HF_OUTPUT_DATASET="$2" shift 2 @@ -64,15 +68,32 @@ if [ -z "$MODEL" ] || [ -z "$HF_DATASET" ]; then exit 1 fi +# Set default values for optional parameters HF_DATASET_SPLIT=${HF_DATASET_SPLIT:-"train"} PROMPT_COLUMN=${PROMPT_COLUMN:-"prompt"} -TEMPERATURE=${TEMPERATURE:-0.7} -TOP_P=${TOP_P:-0.9} MAX_NEW_TOKENS=${MAX_NEW_TOKENS:-8192} +NUM_GENERATIONS=${NUM_GENERATIONS:-1} PRIVATE=${PRIVATE:-"false"} +# Print all input arguments +echo "Input arguments:" +echo "MODEL: $MODEL" +echo "HF_DATASET: $HF_DATASET" +echo "HF_DATASET_CONFIG: $HF_DATASET_CONFIG" +echo "HF_DATASET_SPLIT: $HF_DATASET_SPLIT" +echo "PROMPT_COLUMN: $PROMPT_COLUMN" +echo "TEMPERATURE: $TEMPERATURE" +echo "TOP_P: $TOP_P" +echo "MAX_NEW_TOKENS: $MAX_NEW_TOKENS" +echo "NUM_GENERATIONS: $NUM_GENERATIONS" +echo "HF_OUTPUT_DATASET: $HF_OUTPUT_DATASET" +echo "PRIVATE: $PRIVATE" +echo "-------------------" + set -ex +module load cuda/12.1 + export LD_LIBRARY_PATH=.venv/lib/python3.11/site-packages/nvidia/nvjitlink/lib echo "SLURM_JOB_ID: $SLURM_JOB_ID" @@ -127,19 +148,19 @@ RAY_ADDRESS="http://$head_node_ip:8265" ray job submit \ --no-wait \ -- vllm serve $MODEL \ --tensor-parallel-size 8 \ - --pipeline-parallel-size 2 \ - --max-model-len 32768 \ + --pipeline-parallel-size 4 \ + --max-model-len 16384 \ --enable-chunked-prefill \ --trust-remote-code \ --distributed-executor-backend ray # wait for vllm to load the model -echo "Waiting for vLLM (http://localhost:8000) server to be up..." +echo "Waiting for vLLM (http://$head_node_ip:8000) server to be up..." # wait for vllm to load and serve the model while true; do - if curl -s -o /dev/null -w "%{http_code}" http://localhost:8000 >/dev/null 2>&1; then - echo "Received response from http://localhost:8000" + if curl -s -o /dev/null -w "%{http_code}" http://$head_node_ip:8000 >/dev/null 2>&1; then + echo "Received response from http://$head_node_ip:8000" break else echo "Still waiting... (Press Ctrl+C to cancel)" @@ -148,21 +169,32 @@ while true; do done echo "Checking available models..." -curl http://localhost:8000/v1/models +curl http://$head_node_ip:8000/v1/models + +echo "Executing sanity check..." 
+curl http://$head_node_ip:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d "{ + \"model\": \"$MODEL\", + \"prompt\": \"<|begin▁of▁sentence|><|User|>hi, how are you?<|Assistant|>\", + \"max_tokens\": 2048, + \"temperature\": 0.6 + }" # Finally submit the job to the cluster echo "Submitting job to ray cluster..." RAY_ADDRESS="http://$head_node_ip:8265" ray job submit \ - --working-dir pipeline \ + --working-dir src/open_r1 \ -- python -u generate.py \ --model "$MODEL" \ --hf-dataset "$HF_DATASET" \ ${HF_DATASET_CONFIG:+--hf-dataset-config "$HF_DATASET_CONFIG"} \ --hf-dataset-split "$HF_DATASET_SPLIT" \ --prompt-column "$PROMPT_COLUMN" \ - --temperature "$TEMPERATURE" \ - --top-p "$TOP_P" \ + ${TEMPERATURE:+--temperature "$TEMPERATURE"} \ + ${TOP_P:+--top-p "$TOP_P"} \ --max-new-tokens "$MAX_NEW_TOKENS" \ + --num-generations "$NUM_GENERATIONS" \ ${HF_OUTPUT_DATASET:+--hf-output-dataset "$HF_OUTPUT_DATASET"} \ ${PRIVATE:+--private} \ --vllm-server-url "http://$head_node_ip:8000/v1" \ No newline at end of file diff --git a/src/open_r1/generate.py b/src/open_r1/generate.py index b2a0d7d1..a6bb1d03 100644 --- a/src/open_r1/generate.py +++ b/src/open_r1/generate.py @@ -23,23 +23,32 @@ def build_distilabel_pipeline( model: str, base_url: str = "http://localhost:8000/v1", prompt_column: Optional[str] = None, - temperature: float = 0.7, - top_p: float = 0.9, + temperature: Optional[float] = None, + top_p: Optional[float] = None, max_new_tokens: int = 8192, + num_generations: int = 1, ) -> Pipeline: + generation_kwargs = {"max_new_tokens": max_new_tokens} + + if temperature is not None: + generation_kwargs["temperature"] = temperature + + if top_p is not None: + generation_kwargs["top_p"] = top_p + with Pipeline().ray() as pipeline: TextGeneration( llm=OpenAILLM( base_url=base_url, api_key="something", model=model, - generation_kwargs={ - "temperature": temperature, - "top_p": top_p, - "max_new_tokens": max_new_tokens, - }, + # thinking can take some time... 
+ timeout=10 * 60, + generation_kwargs=generation_kwargs, ), input_mappings={"instruction": prompt_column} if prompt_column is not None else {}, + input_batch_size=10, + num_generations=num_generations, ) return pipeline @@ -85,13 +94,11 @@ def build_distilabel_pipeline( parser.add_argument( "--temperature", type=float, - default=0.7, help="Temperature for generation", ) parser.add_argument( "--top-p", type=float, - default=0.9, help="Top-p value for generation", ) parser.add_argument( @@ -100,6 +107,12 @@ def build_distilabel_pipeline( default=8192, help="Maximum number of new tokens to generate", ) + parser.add_argument( + "--num-generations", + type=int, + default=1, + help="Number of generations per problem", + ) parser.add_argument( "--hf-output-dataset", type=str, @@ -120,7 +133,7 @@ def build_distilabel_pipeline( print() print(f"Loading '{args.hf_dataset}' (config: {args.hf_dataset_config}, split: {args.hf_dataset_split}) dataset...") - dataset = load_dataset(args.hf_dataset, split=args.hf_dataset_split).select(range(50)) + dataset = load_dataset(args.hf_dataset, split=args.hf_dataset_split) print("Dataset loaded!") pipeline = build_distilabel_pipeline( @@ -130,10 +143,11 @@ def build_distilabel_pipeline( temperature=args.temperature, top_p=args.top_p, max_new_tokens=args.max_new_tokens, + num_generations=args.num_generations, ) print("Running generation pipeline...") - distiset = pipeline.run(dataset=dataset, dataset_batch_size=5000) + distiset = pipeline.run(dataset=dataset, use_cache=False) print("Generation pipeline finished!") if args.hf_output_dataset:
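
For reference, a minimal sketch of driving the updated build_distilabel_pipeline entry point from src/open_r1/generate.py after this patch, assuming the vLLM server started by the Slurm script is already reachable on the head node. The model name, dataset, prompt column and IP address below are illustrative placeholders (not values taken from the PR), and the import assumes src/open_r1 is on the Python path.

# Sketch only: mirrors the flow of src/open_r1/generate.py after this patch.
# Dataset, model name, prompt column and head-node IP are placeholders.
from datasets import load_dataset

from open_r1.generate import build_distilabel_pipeline  # assumes src/open_r1 is importable

# The full split is processed now that the temporary .select(range(50)) debug slice is gone.
dataset = load_dataset("AI-MO/NuminaMath-TIR", split="train")  # placeholder dataset

pipeline = build_distilabel_pipeline(
    model="deepseek-ai/DeepSeek-R1",      # must match the model passed to `vllm serve`
    base_url="http://10.0.0.1:8000/v1",   # i.e. http://$head_node_ip:8000/v1 from the Slurm script
    prompt_column="problem",              # placeholder prompt column name
    max_new_tokens=8192,
    num_generations=4,                    # new argument introduced by this patch
    # temperature / top_p are now optional and only forwarded to the server when set
)

distiset = pipeline.run(dataset=dataset, use_cache=False)

Leaving temperature and top_p unset keeps them out of generation_kwargs entirely, so the server's own sampling defaults apply; this matches the new conditional handling in build_distilabel_pipeline and the ${TEMPERATURE:+...} / ${TOP_P:+...} expansions in the Slurm script.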