Improve sft (huggingface#5)
* first commit

* working training

* change model_id

* Update scripts/training/sft.py

Co-authored-by: Quentin Gallouédec <[email protected]>

---------

Co-authored-by: Quentin Gallouédec <[email protected]>
eliebak and qgallouedec authored Jan 24, 2025
1 parent 52aefc2 commit c421bc8
Showing 4 changed files with 48 additions and 29 deletions.
@@ -2,7 +2,7 @@
model_name_or_path: Qwen/Qwen2.5-1.5B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
attn_implementation: sdpa

# Data training arguments
dataset_mixer:
@@ -20,8 +20,9 @@ gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: False
hub_model_id: HuggingFaceH4/Qwen2.5-1.5B-R1-v00.00
hub_model_id: HuggingFaceH4/qwen-test
hub_strategy: every_save
hub_private_repo: true
learning_rate: 2.0e-05
log_level: info
logging_steps: 5
@@ -33,7 +34,7 @@ num_train_epochs: 1
output_dir: data/Qwen2.5-1.5B-Distill-R1-v00.00
overwrite_output_dir: true
per_device_eval_batch_size: 8
per_device_train_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: true
remove_unused_columns: true
report_to:
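For reference, the attn_implementation value in this recipe is forwarded to transformers when the model is instantiated, so switching from flash_attention_2 to sdpa drops the separate flash-attn dependency in favor of PyTorch's built-in scaled_dot_product_attention kernel. A minimal sketch of what these config fields amount to at load time, assuming a recent transformers release (the actual plumbing through model_kwargs lives in the training script, not in this file):

import torch
from transformers import AutoModelForCausalLM

# Sketch only: mirrors the model_name_or_path / torch_dtype / attn_implementation fields above.
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-1.5B-Instruct",
    revision="main",
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",  # previously flash_attention_2
)

With gradient_accumulation_steps left at 1, halving per_device_train_batch_size from 16 to 8 also halves the effective batch size.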
recipes/launch.slurm (30 changes: 19 additions, 11 deletions)
@@ -1,33 +1,41 @@
#!/bin/bash
#SBATCH --job-name=default
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --exclusive
#SBATCH --gres=gpu:8
#SBATCH --partition=hopper-prod # Adjust this for your cluster
#SBATCH --output=/fsx/h4/logs/%x-%j.out # Adjust this for your cluster
#SBATCH --err=/fsx/h4/logs/%x-%j.err # Adjust this for your cluster
#SBATCH --partition=hopper-prod
#SBATCH --qos=high
#SBATCH --time=01:59:00
#SBATCH --output=/fsx/elie_bakouch/open-r1/logs/%x-%j.out
#SBATCH --err=/fsx/elie_bakouch/open-r1/logs/%x-%j.err

set -x -e

# DON'T FORGET TO MODIFY THE HEADER WITH YOUR PATH
# @todo:eliebak maybe use submitit at some point

source ~/.bashrc
conda activate openr1
module load cuda/12.1
echo "START TIME: $(date)"
echo "PYTHON ENV: $(which python)"

MODEL=$1
TASK=$2
PRECISION=$3
ACCELERATOR=$4
OPTIONAL_ARGS=$5
MODEL=Qwen2.5-1.5B-Instruct
TASK=sft
PRECISION=v00.00
ACCELERATOR=deepspeed_zero3

# Training setup
NUM_NODES=$SLURM_NNODES
GPUS_PER_NODE=8
WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE))
# Due to conflicts between Accelerate's DeepSpeed configs and Transformers' TrainingArguments, we need to parse the gradient accumulation steps from the config file to ensure they match
CONFIG_FILE=recipes/$MODEL/$TASK/config_$PRECISION.yaml

echo "CONFIG_FILE: $CONFIG_FILE"
GRAD_ACC_STEPS=$(grep 'gradient_accumulation_steps' $CONFIG_FILE | awk '{print $2}')

# Split the string into individual arguments
IFS=' ' read -ra ARGS <<< "$OPTIONAL_ARGS"

# Loop through the arguments and find the one with "--gradient_accumulation_steps"
for arg in "${ARGS[@]}"; do
@@ -44,7 +52,7 @@ MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000

export CMD=" \
scripts/run_$TASK.py $CONFIG_FILE $OPTIONAL_ARGS
scripts/training/$TASK.py $CONFIG_FILE $OPTIONAL_ARGS
"

export LAUNCHER="HF_HUB_ENABLE_HF_TRANSFER=1 ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch \
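The launcher now hard-codes MODEL, TASK, PRECISION and ACCELERATOR in place of the positional arguments, and the GRAD_ACC_STEPS line still extracts gradient_accumulation_steps from the recipe YAML with grep/awk so the value handed to accelerate launch matches the one in TrainingArguments. A rough Python equivalent of that parsing step, shown only to make the intent explicit (hypothetical helper, not part of the repository):

import yaml  # PyYAML

def read_grad_acc_steps(config_file: str) -> int:
    # Equivalent of: grep 'gradient_accumulation_steps' $CONFIG_FILE | awk '{print $2}'
    with open(config_file) as f:
        config = yaml.safe_load(f)
    return int(config.get("gradient_accumulation_steps", 1))

# With the hard-coded values above, CONFIG_FILE resolves to
# recipes/Qwen2.5-1.5B-Instruct/sft/config_v00.00.yaml
print(read_grad_acc_steps("recipes/Qwen2.5-1.5B-Instruct/sft/config_v00.00.yaml"))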
scripts/training/sft.py (36 changes: 23 additions, 13 deletions)
@@ -40,7 +40,7 @@
get_quantization_config,
get_tokenizer,
)
from trl import SFTTrainer, setup_chat_format
from trl import SFTTrainer, SFTConfig, setup_chat_format


logger = logging.getLogger(__name__)
@@ -91,11 +91,13 @@ def main():
configs=data_args.dataset_configs,
columns_to_keep=["messages", "chosen", "rejected", "prompt", "completion", "label"],
)


logger.info(
f"Training on the following datasets and their proportions: {[split + ' : ' + str(dset.num_rows) for split, dset in raw_datasets.items()]}"
)
column_names = list(raw_datasets["train"].features)

print(f"column_names: {column_names}")
################
# Load tokenizer
################
@@ -122,10 +124,10 @@

model = model_args.model_name_or_path
# For ChatML we need to add special tokens and resize the embedding layer
if "<|im_start|>" in tokenizer.chat_template and "gemma-tokenizer-chatml" not in tokenizer.name_or_path:
model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, **model_kwargs)
model, tokenizer = setup_chat_format(model, tokenizer)
model_kwargs = None
# if "<|im_start|>" in tokenizer.chat_template and "gemma-tokenizer-chatml" not in tokenizer.name_or_path:
# model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, **model_kwargs)
# model, tokenizer = setup_chat_format(model, tokenizer)
# model_kwargs = None

#####################
# Apply chat template
@@ -142,8 +144,10 @@
desc="Applying chat template",
)

print(f"raw_datasets['train'].features: {raw_datasets['train'].features}")

##########################
# Decontaminate benchmarks
# Decontaminate benchmarks (change this with math)
##########################
num_raw_train_samples = len(raw_datasets["train"])
raw_datasets = raw_datasets.filter(decontaminate_humaneval, batched=True, batch_size=10_000, num_proc=1)
@@ -159,21 +163,26 @@
for index in random.sample(range(len(raw_datasets["train"])), 3):
logger.info(f"Sample {index} of the processed training set:\n\n{raw_datasets['train'][index]['text']}")


########################
# Initialize the Trainer
########################

# Adding packing and dataset_text_field to the config
setattr(training_args, "model_init_kwargs", model_kwargs)

trainer = SFTTrainer(
model=model,
model_init_kwargs=model_kwargs,
# model_init_kwargs=model_kwargs,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
dataset_text_field="text",
max_seq_length=training_args.max_seq_length,
# dataset_text_field="text",
# max_seq_length=training_args.max_seq_length,
tokenizer=tokenizer,
packing=True,
# packing=True,
peft_config=get_peft_config(model_args),
dataset_kwargs=training_args.dataset_kwargs,
# dataset_kwargs=training_args.dataset_kwargs,
)

###############
@@ -212,8 +221,9 @@ def main():
trainer.model.config.use_cache = True
trainer.model.config.save_pretrained(training_args.output_dir)


##########
# Evaluate
# Evaluate (to change or suppress?)
##########
if training_args.do_eval:
logger.info("*** Evaluate ***")
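Taken together, the sft.py changes shift configuration that newer TRL releases expect on SFTConfig out of the SFTTrainer call: model_init_kwargs is attached to training_args via setattr, while dataset_text_field, max_seq_length, packing and dataset_kwargs are commented out of the constructor. A minimal, self-contained sketch of that call pattern, assuming a TRL version whose SFTConfig carries these fields (the dataset and hyperparameter values below are illustrative, not taken from the recipe):

from datasets import load_dataset
from transformers import AutoTokenizer
from trl import SFTConfig, SFTTrainer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
dataset = load_dataset("stanfordnlp/imdb", split="train")  # any dataset with a "text" column

training_args = SFTConfig(
    output_dir="data/sft-sketch",
    dataset_text_field="text",
    max_seq_length=2048,
    packing=True,
    # What the setattr(training_args, "model_init_kwargs", model_kwargs) line feeds in:
    model_init_kwargs={"torch_dtype": "bfloat16", "attn_implementation": "sdpa"},
)
trainer = SFTTrainer(
    model="Qwen/Qwen2.5-1.5B-Instruct",  # a model id string is loaded lazily via model_init_kwargs
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
)
trainer.train()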
src/open_r1/model_utils.py (4 changes: 2 additions, 2 deletions)
@@ -22,8 +22,8 @@

from accelerate import Accelerator
from huggingface_hub import list_repo_files
from huggingface_hub.utils._errors import RepositoryNotFoundError
from huggingface_hub.utils._validators import HFValidationError
from huggingface_hub.utils import RepositoryNotFoundError
from huggingface_hub.utils import HFValidationError
from peft import LoraConfig, PeftConfig

from .configs import DataArguments, DPOConfig, ModelArguments, SFTConfig
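The model_utils.py change only touches import paths: RepositoryNotFoundError and HFValidationError now come from the public huggingface_hub.utils namespace instead of the private _errors and _validators submodules, private paths that newer huggingface_hub releases do not guarantee. A short sketch of how these exceptions are typically used around list_repo_files (hypothetical helper for illustration; the repository's own function may differ):

from huggingface_hub import list_repo_files
from huggingface_hub.utils import HFValidationError, RepositoryNotFoundError

def repo_has_adapter_config(model_name_or_path: str, revision: str = "main") -> bool:
    """Return True if the Hub repo contains a PEFT adapter_config.json."""
    try:
        repo_files = list_repo_files(model_name_or_path, revision=revision)
    except (HFValidationError, RepositoryNotFoundError):
        # Not a valid Hub repo id (e.g. a local path) or the repo does not exist.
        return False
    return "adapter_config.json" in repo_files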
