mirror of
https://github.com/open-thought/reasoning-gym.git
synced 2026-04-28 17:29:39 +00:00
add minimal verifiers example (#472)
This commit is contained in:
parent
9e79fc84b6
commit
49f3821098
4 changed files with 97 additions and 0 deletions
38
examples/verifiers/README.md
Normal file
38
examples/verifiers/README.md
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
## Setup
|
||||
|
||||
Prepare a virtual environment, e.g.:
|
||||
|
||||
```bash
|
||||
python -m venv venv
|
||||
source venv/bin/activate
|
||||
```
|
||||
|
||||
Install the dependencies:
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
pip install flash-attn --no-build-isolation
|
||||
```
|
||||
|
||||
Log in to W&B and Hugging Face if desired:
|
||||
|
||||
```bash
|
||||
wandb login
|
||||
huggingface-cli login
|
||||
```
|
||||
|
||||
## Training
|
||||
|
||||
Here we assume two GPUs, with one used for inference (vLLM) and the other for training (accelerate). You may need to adjust some settings for different GPU configs.
|
||||
|
||||
Run the vLLM server for inference:
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 vf-vllm --model Qwen/Qwen2.5-1.5B-Instruct --tensor-parallel-size 1
|
||||
```
|
||||
|
||||
Run the training script using accelerate:
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=1 accelerate launch --config-file zero3.yaml --num-processes 1 vf_rg.py
|
||||
```
|
||||
1
examples/verifiers/requirements.txt
Normal file
1
examples/verifiers/requirements.txt
Normal file
|
|
@ -0,0 +1 @@
|
|||
verifiers[all]
|
||||
36
examples/verifiers/vf_rg.py
Normal file
36
examples/verifiers/vf_rg.py
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
"""Example training script for using the Reasoning Gym environment in verifiers.

Builds a ``ReasoningGymEnv`` over three arithmetic datasets and runs GRPO
training on it with ``vf.GRPOTrainer``. The accompanying README launches
this script with ``accelerate launch`` next to a separately started
``vf-vllm`` inference server.
"""

import verifiers as vf
from verifiers.envs.reasoninggym_env import ReasoningGymEnv

# Same model name the README passes to `vf-vllm --model`.
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
model, tokenizer = vf.get_model_and_tokenizer(model_name)

# Environment built from three Reasoning Gym arithmetic datasets.
vf_env = ReasoningGymEnv(
    gym=[
        "basic_arithmetic",
        "bitwise_arithmetic",
        "decimal_arithmetic",
    ],
    num_samples=100,
    num_eval_samples=50,
    max_concurrent=100,
)

# Start from the library's GRPO defaults, then override a handful of fields
# for this small example run.
# NOTE(review): the README assumes a single training GPU; batch size,
# accumulation steps and sequence lengths presumably need adjusting for
# other hardware -- confirm against the verifiers documentation.
training_args = vf.grpo_defaults(run_name="reasoning-gym-test")
training_args.num_iterations = 1
training_args.per_device_train_batch_size = 4
training_args.num_generations = 8
training_args.gradient_accumulation_steps = 4
training_args.max_prompt_length = 1024
training_args.max_completion_length = 4096
training_args.max_steps = 100

trainer = vf.GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    env=vf_env,
    args=training_args,
)

trainer.train()
|
||||
22
examples/verifiers/zero3.yaml
Normal file
22
examples/verifiers/zero3.yaml
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
compute_environment: LOCAL_MACHINE
|
||||
debug: false
|
||||
deepspeed_config:
  deepspeed_multinode_launcher: standard
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: true
  zero3_save_16bit_model: true
  zero_stage: 3
|
||||
distributed_type: DEEPSPEED
|
||||
downcast_bf16: 'no'
|
||||
machine_rank: 0
|
||||
main_training_function: main
|
||||
mixed_precision: bf16
|
||||
num_machines: 1
|
||||
num_processes: 1
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
Loading…
Add table
Add a link
Reference in a new issue