#!/bin/bash

# -------------------------------
# Working Directory and Environment # TODO: change to your own!
# -------------------------------

# conda activate bleuberi

WORK_DIR="/mnt/sharedfs/yapei/BLEUBERI/training"
cd "$WORK_DIR" || exit 1
source ~/miniconda3/etc/profile.d/conda.sh
conda activate /mnt/sharedfs/yapei/venvs/rlit

# -------------------------------
# Creating training data # TODO: change to your own!
# -------------------------------

grpo_dataset_name="BLEUBERI-Tulu3-50k_bleu_Qwen2.5-7B_1ref-gold_hard_5000"

# the data will be saved to ../data/data_sft/${grpo_dataset_name}_SFT
python create_training_data.py sft \
    --input_data_path "../data/data_grpo/$grpo_dataset_name"
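
# Sanity check: create_training_data.py is expected to write the SFT split to
# the path below (per the comment above); fail fast if it did not.
if [ ! -d "../data/data_sft/${grpo_dataset_name}_SFT" ]; then
    echo "Expected SFT data at ../data/data_sft/${grpo_dataset_name}_SFT but found none" >&2
    exit 1
fi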

# -------------------------------
# Training parameters # TODO: change to your own!
# -------------------------------

out_dir="../ckpts"
cache_dir="/mnt/sharedfs/cache/hub"  # shared Hugging Face hub cache
run_prefix=""

wandb_project="BLEUBERI"
run_name="qwen7b_SFT_1ref-gold_5k"

model="Qwen/Qwen2.5-7B"
data_path="../data/data_sft/${grpo_dataset_name}_SFT"
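
# wandb_project is otherwise unused in this script; exporting it lets the
# Hugging Face Trainer's W&B callback (which reads WANDB_PROJECT from the
# environment) pick it up, assuming sft.py reports to W&B.
export WANDB_PROJECT="$wandb_project"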

per_device_train_batch_size=4
gradient_accumulation_steps=8

lr=5e-6
lr_scheduler_type="constant_with_warmup"

num_epochs=1
max_steps=-1 # if not -1, will override num_epochs
save_strategy="steps"
save_steps=250

warmup_ratio=0.05
max_grad_norm=0.2

train_split="train"
model_max_length=128000

packing=false
dataset_text_field="text"

save_total_limit=20
seed=42
bf16=true
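
# With lr_scheduler_type="constant_with_warmup", the learning rate ramps
# linearly from 0 to 5e-6 over the first 5% of training steps
# (warmup_ratio=0.05) and then stays constant for the rest of the run.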

NUM_GPUS=$(nvidia-smi -L | wc -l)
global_bsz=$((per_device_train_batch_size * gradient_accumulation_steps * NUM_GPUS))
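
# Worked example: on an 8-GPU node this gives 4 * 8 * 8 = 256 sequences per
# optimizer step. DeepSpeed checks that train_batch_size ==
# train_micro_batch_size_per_gpu * gradient_accumulation_steps * world_size,
# which is why all three fields in ds_config.json below are derived from the
# same shell variables.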

data_base_name=$(basename "$data_path")
model_base_name=$(basename "$model")

if [ -z "$run_name" ]; then
    run_name="${run_prefix}_${model_base_name}_${data_base_name}_lr${lr}_bsz${global_bsz}_epochs${num_epochs}"
fi

ckpt_dir=${out_dir}/${run_name}
mkdir -p "$ckpt_dir"

# Snapshot the launch script and training code alongside the checkpoints
# so every run is reproducible from its own directory.
cp "$0" "$ckpt_dir/$(basename "$0")"
rsync -av --exclude="wandb" --exclude="wandb_tables" --exclude=".git" --exclude="__pycache__" --exclude="*.pyc" --exclude="*.csv" "$WORK_DIR"/ "$ckpt_dir"/training/

# -------------------------------
# DeepSpeed configuration
# -------------------------------

cat << EOF > ${ckpt_dir}/ds_config.json
{
    "bf16": {
        "enabled": "auto"
    },
    "activation_checkpointing": {
        "partition_activations": true,
        "cpu_checkpointing": true,
        "contiguous_memory_optimization": true,
        "checkpoint_in_cpu": true,
        "profile": false
    },
    "zero_optimization": {
        "stage": 3,
        "stage3_gather_16bit_weights_on_model_save": true,
        "offload_optimizer": {
            "device": "cpu"
        },
        "offload_param": {
            "device": "cpu"
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 5e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 5e8,
        "contiguous_gradients": true
    },
    "gradient_accumulation_steps": ${gradient_accumulation_steps},
    "gradient_clipping": ${max_grad_norm},
    "train_batch_size": ${global_bsz},
    "train_micro_batch_size_per_gpu": ${per_device_train_batch_size},
    "wall_clock_breakdown": false
}
EOF
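
# Sanity check: fail fast if variable substitution produced malformed JSON
# (json.tool is in the Python standard library, so this adds no dependency).
python -m json.tool "${ckpt_dir}/ds_config.json" > /dev/null || {
    echo "Invalid DeepSpeed config at ${ckpt_dir}/ds_config.json" >&2
    exit 1
}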

# -------------------------------
# Logging and setup
# -------------------------------

echo "=== System Information ===" | tee -a "$ckpt_dir/setup.log"
echo "Starting job at $(date)" | tee -a "$ckpt_dir/setup.log"
echo "Working directory: $WORK_DIR" | tee -a "$ckpt_dir/setup.log"
echo "Host: $(hostname)" | tee -a "$ckpt_dir/setup.log"
nvidia-smi | tee -a "$ckpt_dir/setup.log"

arguments=(
    --run_name $run_name
    --model_path $model
    --cache_dir $cache_dir
    --ckpt_dir $ckpt_dir
    --data_path $data_path
    --train_split $train_split
    --model_max_length $model_max_length
    --num_epochs $num_epochs
    --max_steps $max_steps
    --save_strategy $save_strategy
    --save_steps $save_steps
    --save_total_limit $save_total_limit
    --seed $seed
    --learning_rate $lr
    --lr_scheduler_type $lr_scheduler_type
    --warmup_ratio $warmup_ratio
    --max_grad_norm $max_grad_norm
    --per_device_train_batch_size $per_device_train_batch_size
    --gradient_accumulation_steps $gradient_accumulation_steps
    --log_level info
    --logging_steps 1
    --deepspeed ${ckpt_dir}/ds_config.json
)

# Only add boolean flags if they are true
[[ "$packing" == true ]] && arguments+=(--packing)
[[ "$bf16" == true ]] && arguments+=(--bf16)
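
# These are assumed to be store_true-style switches in sft.py: passing the
# bare flag turns the behavior on, so each one is appended only when its
# shell variable is set to true.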
echo "Arguments: ${arguments[*]}" | tee -a $ckpt_dir/setup.log
|
|
|
|

# -------------------------------
# Launch DeepSpeed on current node
# -------------------------------

echo "Launching DeepSpeed on $(hostname)" | tee -a "$ckpt_dir/setup.log"

deepspeed_cmd="deepspeed \
    --num_gpus ${NUM_GPUS} \
    ${WORK_DIR}/sft.py ${arguments[*]}"

echo "Command: $deepspeed_cmd" | tee -a "$ckpt_dir/setup.log"
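
# Expanding $deepspeed_cmd unquoted relies on word splitting; that is safe
# here because no argument contains whitespace. If any did, invoking the
# array directly, e.g.
#   deepspeed --num_gpus "$NUM_GPUS" "$WORK_DIR/sft.py" "${arguments[@]}"
# would be the more robust pattern.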

$deepspeed_cmd 2>&1 | tee -a "$ckpt_dir/node_$(hostname).log"
echo "===========================================" | tee -a $ckpt_dir/setup.log
|
|
echo "Training process completed!" | tee -a $ckpt_dir/setup.log
|
|
echo "===========================================" | tee -a $ckpt_dir/setup.log
|
|
echo "Checkpoint directory: $ckpt_dir"
|
|
echo "Log file: $ckpt_dir/node_$(hostname).log"
|
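
# Checkpoint note: with ZeRO stage 3 and
# "stage3_gather_16bit_weights_on_model_save": true, DeepSpeed gathers the
# full 16-bit weights at save time. If that flag were off, checkpoints would
# contain sharded ZeRO states instead, and the zero_to_fp32.py helper that
# DeepSpeed drops into each checkpoint directory could consolidate them,
# e.g. (hypothetical checkpoint-250 path):
#   python "$ckpt_dir/checkpoint-250/zero_to_fp32.py" "$ckpt_dir/checkpoint-250" "$ckpt_dir/checkpoint-250/pytorch_model.bin"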