From 73f7cc7a66d16778d3f0bcb4ff23e9377075e74b Mon Sep 17 00:00:00 2001 From: joesharratt1229 Date: Tue, 25 Mar 2025 06:17:54 +0000 Subject: [PATCH] added spell backward --- reasoning_gym/algorithmic/spell_backward.py | 2 +- training/configs/qwen2.5_3b_grpo.yaml | 54 ++++++++++----------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/reasoning_gym/algorithmic/spell_backward.py b/reasoning_gym/algorithmic/spell_backward.py index e5627677..f7f4d843 100644 --- a/reasoning_gym/algorithmic/spell_backward.py +++ b/reasoning_gym/algorithmic/spell_backward.py @@ -34,7 +34,7 @@ class SpellBackwardDataset(ProceduralDataset): super().__init__(config=config, seed=config.seed, size=config.size) # Load and preprocess text - text = read_data_file("in_the_year_2889.txt") + text = read_data_file("words3to10.txt") self.words = [ word.strip() for word in text.splitlines() diff --git a/training/configs/qwen2.5_3b_grpo.yaml b/training/configs/qwen2.5_3b_grpo.yaml index a6271688..a279e85d 100644 --- a/training/configs/qwen2.5_3b_grpo.yaml +++ b/training/configs/qwen2.5_3b_grpo.yaml @@ -1,12 +1,12 @@ reasoning_gym: - dataset_size: 10000 + dataset_size: 20000 developer_prompt: DeepSeekZero datasets: - figlet_font: - weight: 1 - config: - min_word_len: 3 - max_word_len: 10 + spell_backward: + weight: 1 + config: + min_word_len: 3 + max_word_len: 10 curriculum: enabled: False schedule: @@ -20,20 +20,20 @@ curriculum: attribute_levels: word_len: 0 reward: - use_accuracy: true + use_accuracy: True secondary_rewards: - name: format - scaling_factor: 0.5 + scaling_factor: 0.2 data: tokenizer: null train_files: train.parquet val_files: test.parquet prompt_key: prompt - max_prompt_length: 2048 - max_response_length: 4096 - train_batch_size: 128 - val_batch_size: 128 + max_prompt_length: 512 + max_response_length: 1024 + train_batch_size: 32 + val_batch_size: 32 return_raw_chat: True return_raw_input_ids: True @@ -62,10 +62,10 @@ actor_rollout_ref: shuffle: False ulysses_sequence_parallel_size: 1 # sp size optim: - lr: 3e-4 - lr_warmup_steps_ratio: 0.1 # the total steps will be injected during runtime - min_lr_ratio: 0.1 # only useful for warmup with cosine - warmup_style: cosine # select from constant/cosine + lr: 1e-6 + lr_warmup_steps_ratio: 0 # the total steps will be injected during runtime + min_lr_ratio: null # only useful for warmup with cosine + warmup_style: constant # select from constant/cosine total_training_steps: -1 # must be override by program fsdp_config: wrap_policy: @@ -81,7 +81,7 @@ actor_rollout_ref: # transformer_layer_cls_to_wrap: None min_num_params: 0 log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu - log_prob_micro_batch_size_per_gpu: 16 + log_prob_micro_batch_size_per_gpu: 160 log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size @@ -90,7 +90,7 @@ actor_rollout_ref: temperature: 1.0 top_k: -1 # 0 for hf rollout, -1 for vllm rollout top_p: 1 - prompt_length: ${data.max_prompt_length} # not use for opensource + prompt_length: ${data.max_prompt_length} # not use for opensource response_length: ${data.max_response_length} # for vllm rollout dtype: bfloat16 # should align with FSDP @@ -99,9 +99,8 @@ actor_rollout_ref: enforce_eager: True free_cache_engine: True load_format: dummy_dtensor - tensor_model_parallel_size: 2 - max_model_len: 32768 - max_num_batched_tokens: 32768 + tensor_model_parallel_size: 4 + max_num_batched_tokens: 16384 max_num_seqs: 1024 log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu log_prob_micro_batch_size_per_gpu: 160 @@ -112,6 +111,7 @@ actor_rollout_ref: # for hf rollout do_sample: True use_fire_sampling: False + max_model_len: 16384 # number of responses (i.e. num sample times) n: 8 # > 1 for grpo val_kwargs: @@ -131,16 +131,16 @@ trainer: total_epochs: 5 total_training_steps: null project_name: rg-test - experiment_name: verl_grpo_qwen_figlet + experiment_name: verl_grpo_qwen_composite logger: [ 'console', 'wandb' ] val_generations_to_log_to_wandb: 0 nnodes: 1 - n_gpus_per_node: 2 - save_freq: 50 + n_gpus_per_node: 4 + save_freq: 100 # auto: find the last ckpt to resume. If can't find, start from scratch resume_mode: auto # or auto or resume_path if resume_from_path: False - test_freq: 300 + test_freq: 100 critic_warmup: 0 default_hdfs_dir: null remove_previous_ckpt_in_save: False @@ -151,7 +151,7 @@ critic: strategy: fsdp optim: lr: 1e-6 - lr_warmup_steps_ratio: 0.1 # the total steps will be injected during runtime + lr_warmup_steps_ratio: 0 # the total steps will be injected during runtime min_lr_ratio: null # only useful for warmup with cosine warmup_style: cosine # select from constant/cosine total_training_steps: -1 # must be override by program @@ -171,7 +171,7 @@ critic: fsdp_size: -1 ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu - ppo_micro_batch_size_per_gpu: null + ppo_micro_batch_size_per_gpu: 8 forward_micro_batch_size: ${critic.ppo_micro_batch_size} forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}