diff --git a/examples/verifiers/README.md b/examples/verifiers/README.md
new file mode 100644
index 00000000..19f966e9
--- /dev/null
+++ b/examples/verifiers/README.md
@@ -0,0 +1,38 @@
+## Setup
+
+Prepare virtual environment, e.g.
+
+```bash
+python -m venv venv
+source venv/bin/activate
+```
+
+Install dependencies
+
+```bash
+pip install -r requirements.txt
+pip install flash-attn --no-build-isolation
+```
+
+Login to W&B and HuggingFace if desired
+
+```bash
+wandb login
+huggingface-cli login
+```
+
+## Training
+
+Here we assume two GPUs, with one used for inference (vLLM) and the other for training (accelerate). You may need to adjust some settings for different GPU configs.
+
+Run the vLLM server for inference:
+
+```bash
+CUDA_VISIBLE_DEVICES=0 vf-vllm --model Qwen/Qwen2.5-1.5B-Instruct --tensor-parallel-size 1
+```
+
+Run the training script using accelerate:
+
+```bash
+CUDA_VISIBLE_DEVICES=1 accelerate launch --config-file zero3.yaml --num-processes 1 vf_rg.py
+```
diff --git a/examples/verifiers/requirements.txt b/examples/verifiers/requirements.txt
new file mode 100644
index 00000000..cc2594fc
--- /dev/null
+++ b/examples/verifiers/requirements.txt
@@ -0,0 +1 @@
+verifiers[all]
diff --git a/examples/verifiers/vf_rg.py b/examples/verifiers/vf_rg.py
new file mode 100644
index 00000000..7eebce3a
--- /dev/null
+++ b/examples/verifiers/vf_rg.py
@@ -0,0 +1,36 @@
+"""Example training script for using the Reasoning Gym environment in verifiers."""
+
+import verifiers as vf
+from verifiers.envs.reasoninggym_env import ReasoningGymEnv
+
+model_name = "Qwen/Qwen2.5-1.5B-Instruct"
+model, tokenizer = vf.get_model_and_tokenizer(model_name)
+
+vf_env = ReasoningGymEnv(
+    gym=[
+        "basic_arithmetic",
+        "bitwise_arithmetic",
+        "decimal_arithmetic",
+    ],
+    num_samples=100,
+    num_eval_samples=50,
+    max_concurrent=100,
+)
+
+training_args = vf.grpo_defaults(run_name="reasoning-gym-test")
+training_args.num_iterations = 1
+training_args.per_device_train_batch_size = 4
+training_args.num_generations = 8
+training_args.gradient_accumulation_steps = 4
+training_args.max_prompt_length = 1024
+training_args.max_completion_length = 4096
+training_args.max_steps = 100
+
+trainer = vf.GRPOTrainer(
+    model=model,
+    processing_class=tokenizer,
+    env=vf_env,
+    args=training_args,
+)
+
+trainer.train()
diff --git a/examples/verifiers/zero3.yaml b/examples/verifiers/zero3.yaml
new file mode 100644
index 00000000..c7dbe8fe
--- /dev/null
+++ b/examples/verifiers/zero3.yaml
@@ -0,0 +1,22 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  deepspeed_multinode_launcher: standard
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: true
+  zero3_save_16bit_model: true
+  zero_stage: 3
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 1
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false