diff --git a/reasoning_gym/algorithmic/spell_backward.py b/reasoning_gym/algorithmic/spell_backward.py
index 0de8d5f2..ab26f843 100644
--- a/reasoning_gym/algorithmic/spell_backward.py
+++ b/reasoning_gym/algorithmic/spell_backward.py
@@ -98,7 +98,7 @@ class SpellBackwardCurriculum(BaseCurriculum):
         self._define_attributes(
             RangeAttributeDefinition(
                 name="word_len",
-                levels=list(range(3, 11, 1)),
+                levels=list(range(3, 13, 2)),
                 description="Word length",
                 lower_field_name="min_word_len",
                 upper_field_name="max_word_len",
diff --git a/reasoning_gym/coaching/base_curriculum.py b/reasoning_gym/coaching/base_curriculum.py
index 2a6553c5..c4755141 100644
--- a/reasoning_gym/coaching/base_curriculum.py
+++ b/reasoning_gym/coaching/base_curriculum.py
@@ -19,11 +19,22 @@ class RangeAttributeMode(StrEnum):
     UPPER_BOUND = "upper_bound"  # only use the highest range segment
     INCLUSIVE = "inclusive"  # include all previous levels
+    LAST_K = "last_k"  # use only the last *k* difficulty levels


 class DefaultCurriculumContext(CurriculumContext):
-    def __init__(self, mode: RangeAttributeMode = RangeAttributeMode.INCLUSIVE):
+    def __init__(self, mode: RangeAttributeMode = RangeAttributeMode.INCLUSIVE, k: int | None = None):
+        """Create a CurriculumContext used when turning a curriculum into a concrete configuration.
+
+        Args:
+            mode: Strategy for translating a RangeAttributeDefinition level into a concrete range.
+            k: When ``mode`` is ``RangeAttributeMode.LAST_K``, this parameter indicates how many of
+                the most recent difficulty levels (counting backwards from the current one) should
+                be kept. If ``k`` is ``None``, the behaviour falls back to ``INCLUSIVE`` (i.e. keep
+                everything). The parameter is ignored for other modes.
+        """
         self.mode = mode
+        self.k = k  # window size used for LAST_K mode

     def get_range_attr_value(self, curriculum, attr: RangeAttributeDefinition) -> Any:
         level = curriculum.get_attr_level(attr.name)
@@ -39,6 +50,12 @@ class DefaultCurriculumContext(CurriculumContext):
             elif self.mode == RangeAttributeMode.INCLUSIVE:
                 lo_index = 0
                 hi_index = min(level + 1, len(attr.levels) - 1)
+
+            elif self.mode == RangeAttributeMode.LAST_K:
+                hi_index = min(level, len(attr.levels) - 1)
+                window = self.k if self.k is not None else hi_index + 1
+                lo_index = max(0, hi_index - window + 1)
+
         else:
             if self.mode == RangeAttributeMode.UPPER_BOUND:
                 hi_index = min(level, len(attr.levels) - 1)
@@ -48,6 +65,17 @@ class DefaultCurriculumContext(CurriculumContext):
                 lo_index = 0
                 hi_index = min(level, len(attr.levels) - 1)
+            elif self.mode == RangeAttributeMode.LAST_K:
+                hi_index = min(level, len(attr.levels) - 1)
+                window = self.k if self.k is not None else hi_index + 1
+                lo_index = max(0, hi_index - window + 1)
+
+        # Additional handling for LAST_K when attr.ensure_interval is True
+        if attr.ensure_interval and self.mode == RangeAttributeMode.LAST_K:
+            # Re-compute lo_index so that we always return at least a two-value interval
+            if hi_index == lo_index:
+                lo_index = max(0, hi_index - 1)
+
         lo = attr.get_level_value(lo_index)
         hi = attr.get_level_value(hi_index)
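
A minimal standalone sketch of the LAST_K window arithmetic added above (plain Python, no reasoning_gym imports; the helper name last_k_window is illustrative, and the sample levels are the new spell_backward levels):

    def last_k_window(levels: list[int], level: int, k: int | None = None) -> tuple[int, int]:
        # Mirrors the LAST_K branch of DefaultCurriculumContext.get_range_attr_value:
        # clamp the current level, then keep a window of the k most recent levels.
        hi_index = min(level, len(levels) - 1)
        window = k if k is not None else hi_index + 1  # k=None degrades to INCLUSIVE
        lo_index = max(0, hi_index - window + 1)
        return levels[lo_index], levels[hi_index]

    assert last_k_window([3, 5, 7, 9, 11], level=3, k=2) == (7, 9)     # last two levels only
    assert last_k_window([3, 5, 7, 9, 11], level=3, k=None) == (3, 9)  # INCLUSIVE fallback
    assert last_k_window([3, 5, 7, 9, 11], level=0, k=3) == (3, 3)     # window clamped at level 0
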
diff --git a/reasoning_gym/data/holdout_words.txt b/reasoning_gym/data/holdout_words.txt
new file mode 100644
index 00000000..1ccdea73
--- /dev/null
+++ b/reasoning_gym/data/holdout_words.txt
@@ -0,0 +1,400 @@
+jib
+fam
+Jos
+rel
+gun
+Abu
+jet
+bis
+poi
+led
+Fin
+dim
+hei
+sha
+mau
+nep
+nob
+joe
+oft
+kou
+pow
+yea
+mum
+twa
+now
+Mwa
+wae
+Pia
+Bud
+hie
+rea
+bes
+Ree
+bog
+duo
+fey
+zac
+Jef
+Tho
+pet
+yez
+tji
+bot
+nap
+Ben
+mal
+Mon
+Huk
+aid
+jot
+pimp
+adda
+duel
+lees
+oven
+dean
+bhoy
+tret
+Etta
+tolu
+mesh
+punt
+Beck
+mule
+buff
+brae
+gowl
+does
+bego
+tave
+Vote
+Ahir
+Mahi
+Boer
+soft
+Kuba
+bort
+pavy
+even
+unci
+laur
+hatt
+pipi
+rada
+Dane
+thin
+keno
+yerd
+lori
+Coos
+Leto
+Diau
+nife
+hath
+fury
+thus
+date
+nast
+cush
+spot
+clung
+Mbuba
+Gippy
+spent
+crowl
+waugh
+nibby
+urine
+snaky
+pyche
+filet
+lohan
+kedge
+atman
+other
+dooms
+ariel
+adlay
+Brule
+covid
+booty
+Chiot
+blend
+chewy
+shune
+stema
+renky
+twirk
+guyer
+lewis
+palmy
+xylem
+helve
+semen
+scobs
+salep
+mosey
+Sabra
+taqua
+thats
+seave
+Nambe
+flume
+antal
+tubig
+Aimee
+wanle
+unmet
+undog
+acold
+heppen
+lovely
+Ixodes
+manbot
+simlin
+unduly
+whilst
+tremor
+seraph
+streng
+richen
+brutal
+solute
+unique
+motory
+denier
+Andrea
+pinery
+eatery
+Turkic
+vennel
+Elohim
+tampon
+uracil
+untold
+pigdan
+nonene
+router
+japing
+calved
+Selago
+digram
+utinam
+fidate
+Arundo
+hubbub
+rerobe
+Alfirk
+Argive
+buzzle
+papist
+omagra
+arrest
+lucken
+crotyl
+rantan
+greund
+cipher
+maraca
+blenny
+hyoidal
+Regulus
+sphenic
+werefox
+Dagomba
+unsonsy
+reslide
+Himawan
+almadie
+doarium
+Barbara
+gunyang
+ecology
+unvoted
+dropout
+shedded
+neotype
+wriggly
+Zuludom
+ruffled
+runtish
+cantlet
+vitreal
+distome
+modulus
+curlike
+eveweed
+waddler
+akmudar
+dibhole
+lignose
+copyist
+addable
+torques
+acridyl
+deraign
+setline
+preform
+rarebit
+lyncine
+tarnish
+pentace
+lastage
+gleaner
+spiller
+aplasia
+trommel
+goldish
+stadium
+unplied
+grizzler
+seabeard
+slipcase
+cobbling
+guruship
+antipope
+hydremic
+Seleucid
+otosteon
+islander
+lacunose
+nasiform
+chloasma
+indicium
+Seidlitz
+Bisharin
+scission
+moulinet
+frampold
+Macropus
+overwake
+stannate
+gallbush
+bakeoven
+Cytherea
+unrising
+voltzite
+unspared
+Mongolic
+Coccyzus
+systolic
+toilinet
+everyone
+alangine
+perioeci
+diapalma
+parillin
+binodose
+unevaded
+shillety
+Andorran
+apodosis
+goodyism
+capitoul
+peaceman
+anticous
+obeisant
+pulmonar
+emeritus
+apolysis
+mismanage
+hopscotch
+anodynous
+tetarcone
+demilance
+acuminose
+unimbibed
+typophile
+rhagionid
+bloodwort
+splenulus
+Argentine
+resurface
+kingdomed
+outsnatch
+octometer
+morphemic
+praepubis
+unexcised
+maliceful
+waganging
+monosperm
+nailsmith
+Volutidae
+phenolate
+delapsion
+cabureiba
+coxcombic
+mesically
+focimetry
+spearwood
+multirate
+unteeming
+forehatch
+synedrial
+commingle
+grassweed
+pelecypod
+lodgerdom
+phacocele
+orthopedy
+reticulum
+recushion
+pyromucyl
+monkeynut
+Carduelis
+brotherly
+luminesce
+plumiform
+orrisroot
+anthochlor
+rememberer
+unslipping
+militation
+dextrorsal
+mesomorphy
+unsmutched
+Hopkinsian
+neuterness
+termlessly
+cryptogram
+pinipicrin
+overdrench
+otherworld
+multilobed
+iconolatry
+survigrous
+semiuncial
+chromatoid
+precedence
+gillhooter
+antiplague
+Girellidae
+nestiatria
+enthraldom
+elasticize
+claudetite
+cryptopine
+postmeatal
+habitually
+breathable
+nonshedder
+beneficial
+undersweep
+billposter
+extraovate
+rouvillite
+anagenesis
+hydrologic
+lifesaving
+shadowland
+laboratory
+permeative
+copatentee
+schizocyte
+perihelial
+approacher
+cancrizans
+prosthetic
+barramundi
diff --git a/tests/test_spell_backward.py b/tests/test_spell_backward.py
index 022b8228..4495c383 100644
--- a/tests/test_spell_backward.py
+++ b/tests/test_spell_backward.py
@@ -76,7 +76,7 @@ def test_spell_backward_curriculum():
     # test incrementing attribute levels
     curriculum.increment_attr_level("word_len")
     increased_cfg = curriculum.generate_configuration(base_value)
-    assert increased_cfg.min_word_len == 3 and increased_cfg.max_word_len == 4
+    assert increased_cfg.min_word_len == 3 and increased_cfg.max_word_len == 5
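+    # word_len levels are now [3, 5, 7, 9, 11] (previously range(3, 11)); under
+    # the default INCLUSIVE mapping, one increment keeps the minimum at
+    # levels[0] == 3 and moves the maximum to levels[1] == 5.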
 
     # test decrementing attribute levels
     curriculum.decrement_attr_level("word_len")
diff --git a/training/configs/curriculum/spell_backward.yaml b/training/configs/curriculum/spell_backward.yaml
new file mode 100644
index 00000000..2c295ba0
--- /dev/null
+++ b/training/configs/curriculum/spell_backward.yaml
@@ -0,0 +1,211 @@
+hydra:
+  searchpath:
+    - file:///home/ubuntu/verl/verl/trainer/config
+
+defaults:
+  - ppo_trainer
+  - _self_
+
+reasoning_gym:
+  dataset_size: 20000
+  developer_prompt: DeepSeekZero
+  datasets:
+
+curriculum:
+  enabled: True
+  schedule:
+    automatic: False
+    update_steps: 30  # automatic curriculum update every 30 steps
+  last_k: 20
+  success_threshold: 0.70
+  failure_threshold: 0.10
+  curricula:
+    spell_backward:
+      attribute_levels:
+        word_len: 0
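+
+# Schedule semantics (see training/trainers/ray_grpo_trainer.py): update_steps
+# only gates the automatic mode; with automatic: False, a dataset's difficulty
+# is incremented once its mean grouped score exceeds success_threshold and at
+# least last_k samples have been scored.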
+
+reward:
+  use_accuracy: True
+  secondary_rewards:
+    - name: cosine
+      scaling_factor: 0.3
+    - name: format
+      scaling_factor: 0.2
+      kwargs:
+        preappend_thinking_token: False
+
+data:
+  tokenizer: null
+  train_files: train.parquet
+  val_files: test.parquet
+  prompt_key: prompt
+  max_prompt_length: 512
+  max_response_length: 1024
+  train_batch_size: 32
+  val_batch_size: 64
+  return_raw_chat: True
+  return_raw_input_ids: True
+
+actor_rollout_ref:
+  hybrid_engine: True
+  model:
+    path: Qwen/Qwen2.5-3B-Instruct
+    external_lib: null
+    override_config: { }
+    enable_gradient_checkpointing: True
+    use_remove_padding: True
+  actor:
+    strategy: fsdp  # This is for backward-compatibility
+    ppo_mini_batch_size: 16
+    ppo_micro_batch_size: null  # will be deprecated, use ppo_micro_batch_size_per_gpu
+    ppo_micro_batch_size_per_gpu: 4
+    use_dynamic_bsz: False
+    ppo_max_token_len_per_gpu: 12288  # n * ${data.max_prompt_length} + ${data.max_response_length}
+    grad_clip: 1.0
+    clip_ratio: 0.2
+    entropy_coeff: 0.001
+    use_kl_loss: True  # True for GRPO
+    kl_loss_coef: 0.001  # for grpo
+    kl_loss_type: low_var_kl  # for grpo
+    ppo_epochs: 1
+    shuffle: False
+    ulysses_sequence_parallel_size: 1  # sp size
+    optim:
+      lr: 1e-6
+      lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
+      min_lr_ratio: null  # only useful for warmup with cosine
+      warmup_style: constant  # select from constant/cosine
+      total_training_steps: 400  # must be overridden by the program
+    fsdp_config:
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      param_offload: False
+      optimizer_offload: False
+      fsdp_size: -1
+  ref:
+    fsdp_config:
+      param_offload: True
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+    log_prob_micro_batch_size: null  # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: 160
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size}  # sp size
+  rollout:
+    name: vllm
+    temperature: 1.0
+    top_k: -1  # 0 for hf rollout, -1 for vllm rollout
+    top_p: 1
+    prompt_length: ${data.max_prompt_length}  # not used for opensource
+    response_length: ${data.max_response_length}
+    # for vllm rollout
+    dtype: bfloat16  # should align with FSDP
+    gpu_memory_utilization: 0.7
+    ignore_eos: False
+    enforce_eager: True
+    free_cache_engine: True
+    load_format: dummy_dtensor
+    tensor_model_parallel_size: 4
+    max_num_batched_tokens: 12288
+    max_num_seqs: 1024
+    log_prob_micro_batch_size: null  # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: 160
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    disable_log_stats: True
+    enable_chunked_prefill: True  # could get higher throughput
+    # for hf rollout
+    do_sample: True
+    use_fire_sampling: False
+    max_model_len: 12288
+    # number of responses (i.e. num sample times)
+    n: 8  # > 1 for grpo
+    val_kwargs:
+      do_sample: True
+
+algorithm:
+  gamma: 1.0
+  lam: 1.0
+  adv_estimator: grpo
+  kl_penalty: kl  # how to estimate kl divergence
+  kl_ctrl:
+    type: fixed
+    kl_coef: 0.001
+
+verbose: True
+
+trainer:
+  balance_batch: True
+  total_epochs: 1
+  total_training_steps: 400
+  project_name: rg-test
+  experiment_name: intra_reasoning_algebra_qwen_3b_composite
+  logger: [ 'console', 'wandb' ]
+  val_generations_to_log_to_wandb: 0
+  nnodes: 1
+  n_gpus_per_node: 4
+  save_freq: 100
+  # auto: find the last ckpt to resume; if none is found, start from scratch
+  resume_mode: auto  # or disable, or resume_path if resume_from_path is set
+  resume_from_path: False
+  test_freq: 100
+  critic_warmup: 0
+  default_hdfs_dir: null
+  remove_previous_ckpt_in_save: False
+  del_local_ckpt_after_load: False
+  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
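+
+# adv_estimator is grpo, which derives advantages from the group of n rollout
+# samples rather than from a learned value function; the critic section below
+# is presumably inherited from the ppo_trainer defaults and unused during
+# GRPO training.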
+critic:
+  strategy: fsdp
+  optim:
+    lr: 1e-5
+    lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
+    min_lr_ratio: null  # only useful for warmup with cosine
+    warmup_style: constant  # select from constant/cosine
+    total_training_steps: -1  # must be overridden by the program
+  model:
+    path: ~/models/deepseek-llm-7b-chat
+    tokenizer_path: ${actor_rollout_ref.model.path}
+    override_config: { }
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    enable_gradient_checkpointing: True
+    use_remove_padding: False
+    fsdp_config:
+      param_offload: False
+      optimizer_offload: False
+      wrap_policy:
+        # transformer_layer_cls_to_wrap: None
+        min_num_params: 0
+      fsdp_size: -1
+  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+  ppo_micro_batch_size: null  # will be deprecated, use ppo_micro_batch_size_per_gpu
+  ppo_micro_batch_size_per_gpu: null
+  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
+  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+  ppo_max_token_len_per_gpu: 32768  # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
+  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+  ulysses_sequence_parallel_size: 1  # sp size
+  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+  shuffle: ${actor_rollout_ref.actor.shuffle}
+  grad_clip: 1.0
+  cliprange_value: 0.5
+
+# Reward model not used for GRPO
+reward_model:
+  enable: False
+  strategy: fsdp
+  model:
+    input_tokenizer: ${actor_rollout_ref.model.path}
+    path: ~/models/FsfairX-LLaMA3-RM-v0.1
+    external_lib: ${actor_rollout_ref.model.external_lib}
+    use_remove_padding: False
+    fsdp_config:
+      min_num_params: 0
+      param_offload: False
+      fsdp_size: -1
+  micro_batch_size: null
+  micro_batch_size_per_gpu: null
+  max_length: null
+  ulysses_sequence_parallel_size: 1
+  use_dynamic_bsz: ${critic.use_dynamic_bsz}
+  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
diff --git a/training/trainers/ray_grpo_trainer.py b/training/trainers/ray_grpo_trainer.py
index 7c414ccb..acbaa740 100644
--- a/training/trainers/ray_grpo_trainer.py
+++ b/training/trainers/ray_grpo_trainer.py
@@ -367,12 +367,11 @@ class RayGRPOTrainer(RayPPOTrainer):
                 if self.global_steps % self.config.curriculum.schedule.update_steps == 0:
                     self.train_dataset.experiment.update_difficulty(dataset_name, method="increment")
             else:
-                print(grouped_scores)
                 for dataset_name in grouped_scores.keys():
                     if (
                         grouped_scores[dataset_name]["results"] > self.config.curriculum.success_threshold
                     ) and (grouped_scores[dataset_name]["total_samples"] >= self.config.curriculum.last_k):
-                        self.train_dataset.update_experiment_difficulty(dataset_name, method="increment")
+                        self.train_dataset.experiment.update_difficulty(dataset_name, method="increment")
 
             metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic))
             metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw))
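
Taken together with the curriculum block in the YAML above, the corrected manual-mode update reduces to a simple predicate; a sketch only, where the function name should_increment is hypothetical and mean_score / total_samples stand in for grouped_scores[dataset_name]["results"] and ["total_samples"]:

    def should_increment(mean_score: float, total_samples: int,
                         success_threshold: float = 0.70, last_k: int = 20) -> bool:
        # Difficulty increases only when the recent mean reward clears the
        # success threshold and enough samples back the estimate.
        return mean_score > success_threshold and total_samples >= last_k

    assert should_increment(0.82, 24)       # threshold cleared with enough samples
    assert not should_increment(0.82, 10)   # fewer than last_k samples scored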