diff --git a/reasoning_gym/algorithmic/spell_backward.py b/reasoning_gym/algorithmic/spell_backward.py
index f26f1d19..b0249fd8 100644
--- a/reasoning_gym/algorithmic/spell_backward.py
+++ b/reasoning_gym/algorithmic/spell_backward.py
@@ -19,6 +19,7 @@ class SpellBackwardConfig:
     min_word_len: int = 3 # Minimum word length
     max_word_len: int = 10 # Maximum word length
     seed: Optional[int] = None
+    data_file: str = "words3to10.txt"
     size: int = 500 # Virtual dataset size
 
     def validate(self) -> None:
@@ -34,7 +35,7 @@ class SpellBackwardDataset(ProceduralDataset):
         super().__init__(config=config, seed=config.seed, size=config.size)
 
         # Load and preprocess text
-        text = read_data_file("words3to10.txt")
+        text = read_data_file(self.config.data_file)
         self.words = [
             word.strip()
             for word in text.splitlines()
@@ -73,9 +74,9 @@ class SpellBackwardDataset(ProceduralDataset):
             if expected_answer == answer:
                 reward = 1.0
             else:
-                answer_len = len(answer)
+                answer_len = len(expected_answer)
                 for i in range(len(expected_answer)):
-                    if (i < len(expected_answer) and i < len(answer)) and expected_answer[i] == answer[i]:
+                    if i < len(expected_answer) and i < len(answer):
+                        if expected_answer[i] == answer[i]:
                             reward += 1 / answer_len
                         else:
@@ -96,7 +97,7 @@ class SpellBackwardCurriculum(BaseCurriculum):
         self._define_attributes(
             RangeAttributeDefinition(
                 name="word_len",
-                levels=list(range(3, 11)),
+                levels=list(range(3, 10, 1)),
                 description="Word length",
                 lower_field_name="min_word_len",
                 upper_field_name="max_word_len",
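Note: the scoring fix above normalizes partial credit by the length of the expected answer rather than the model's answer, which keeps the maximum attainable partial credit at 1.0 and stops answer length from skewing the denominator. A minimal standalone sketch of the fixed logic (the `partial_credit` helper is illustrative, not part of the patch):

```python
def partial_credit(expected: str, answer: str) -> float:
    """Fraction of positions in `expected` that `answer` matches exactly."""
    if expected == answer:
        return 1.0
    reward = 0.0
    for i in range(min(len(expected), len(answer))):
        if expected[i] == answer[i]:
            reward += 1 / len(expected)  # normalize by expected length, not answer length
    return reward


assert partial_credit("olleh", "olleh") == 1.0
assert abs(partial_credit("olleh", "ollez") - 0.8) < 1e-9  # 4 of 5 positions match
```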
diff --git a/reasoning_gym/coaching/curriculum_config.py b/reasoning_gym/coaching/curriculum_config.py
index 7b431204..452e29a3 100644
--- a/reasoning_gym/coaching/curriculum_config.py
+++ b/reasoning_gym/coaching/curriculum_config.py
@@ -54,7 +54,6 @@ class CurriculumExperimentConfig:
 
         if not isinstance(data, dict):
             raise ValueError("YAML data must contain a dictionary")
-
         if "curricula" not in data:
             raise ValueError("YAML data must contain a 'curricula' key")
 
diff --git a/reasoning_gym/coaching/experiment.py b/reasoning_gym/coaching/experiment.py
index bb6b74d0..53384e62 100644
--- a/reasoning_gym/coaching/experiment.py
+++ b/reasoning_gym/coaching/experiment.py
@@ -1,6 +1,6 @@
 """Experiment class combining dataset, scoreboard and curriculum."""
 
-from typing import Any, Optional
+from typing import Any, Literal, Optional
 
 from reasoning_gym.coaching.base_curriculum import CurriculumContext
 
@@ -27,7 +27,8 @@ class Experiment:
         entry = dataset[index]
         score = dataset.score_answer(answer, entry)
         metadata = entry["metadata"]
-        self.score_board.add_score(score, metadata, conversation)
+        score_board_metadata = {"difficulty": metadata["difficulty"], "source_dataset": metadata["source_dataset"]}
+        self.score_board.add_score(dataset_name, score, score_board_metadata, conversation)
         return score
 
     @classmethod
@@ -97,7 +98,15 @@ class CurriculumExperiment(Experiment):
         self.curriculum_config = config
         self.context = context
 
-    def update_difficulty(self):
+    def update_difficulty(self, dataset_name: str, method: Literal["increment", "decrement"]):
         """Update difficulty levels based on performance metrics"""
-        # TODO: Implement difficulty adjustment logic
-        pass
+        if method not in ["increment", "decrement"]:
+            raise ValueError(f"Invalid method: {method}")
+
+        if method == "increment":
+            self.curricula[dataset_name].increment_global_level()
+        elif method == "decrement":
+            self.curricula[dataset_name].decrement_global_level()
+
+        config = self.curricula[dataset_name].get_global_level()
+        self.composite.update_dataset_config(dataset_name, config)
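To illustrate how the new hook is meant to be driven: a training loop can periodically aggregate recent per-dataset scores and step the global level against the success/failure thresholds that appear in the YAML configs further down. This is a hypothetical driver sketch, not code from this patch; `experiment`, `last_n`, and the threshold values are assumptions, and it relies on the per-dataset `aggregate` introduced in score_board.py below:

```python
# Hypothetical curriculum driver; `experiment` is a CurriculumExperiment instance.
grouped = experiment.score_board.aggregate(last_n=20)  # dataset name -> GroupedScores
for dataset_name, group in grouped.items():
    scores = [s for score_list in group.scores.values() for s in score_list]
    if not scores:
        continue
    mean_score = sum(scores) / len(scores)
    if mean_score >= 0.70:    # success_threshold from the YAML configs
        experiment.update_difficulty(dataset_name, method="increment")
    elif mean_score <= 0.10:  # failure_threshold
        experiment.update_difficulty(dataset_name, method="decrement")
```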
""" # Start with empty list - key_items = [("source", metadata["source_dataset"]), ("idx", metadata["source_index"])] + key_items = [("source", metadata["source_dataset"])] # Add difficulty parameters or other metadata if "difficulty" in metadata: @@ -155,39 +161,52 @@ class ScoreBoard: items = metadata["difficulty"].items() else: # Use all metadata except source info - items = ((k, v) for k, v in metadata.items() if k not in ("source_dataset", "source_index")) + items = ((k, v) for k, v in metadata.items() if k not in ("source_dataset")) # Add remaining items in sorted order key_items.extend(sorted((str(k), v) for k, v in items)) return tuple(key_items) - def aggregate(self, last_n: Optional[int] = None) -> GroupedScores: - """Aggregate scores by difficulty parameters or full metadata if no difficulty present + def aggregate(self, last_n: Optional[int] = None) -> dict[str, GroupedScores]: + """Aggregate scores by dataset name and then by difficulty parameters Args: last_n: Optional number of most recent entries to consider - If None, use all entries + If None, use all entries Returns: - OrderedDict mapping difficulty parameter combinations to lists of scores - Keys are tuples of (param_name, value) pairs, sorted by param_name + Dictionary mapping dataset names to their respective GroupedScores objects + Each GroupedScores contains scores grouped by difficulty parameters for that dataset """ if not self.scores: - return GroupedScores(scores=OrderedDict(), total_scores=0) + return {} - # Determine start index for iteration - start_idx = max(0, len(self.scores) - last_n) if last_n is not None else 0 + # Create a nested structure: dataset -> parameter groups -> scores + result = {} - # Group scores by difficulty parameters without creating intermediate lists - result = OrderedDict() - for i in range(start_idx, len(self.scores)): - key = self._metadata_to_key(self.metadata[i]) - if key not in result: - result[key] = [] - result[key].append(self.scores[i]) + # Process each dataset + for dataset_name, dataset_scores in self.scores.items(): + # Determine start index for this dataset + dataset_len = len(dataset_scores) + start_idx = max(0, dataset_len - last_n) if last_n is not None else 0 - # Count total scores - total_scores = sum(len(scores) for scores in result.values()) + # Create OrderedDict for this dataset's parameter groupings + dataset_groups = OrderedDict() - return GroupedScores(scores=result, total_scores=total_scores) + # Process scores for this dataset + for i in range(start_idx, dataset_len): + # Get metadata for this score + metadata = self.metadata[dataset_name][i] + params = self._metadata_to_key(metadata) + + if params not in dataset_groups: + dataset_groups[params] = [] + + dataset_groups[params].append(dataset_scores[i]) + + # Create a GroupedScores object for this dataset + total_scores = sum(len(scores) for scores in dataset_groups.values()) + result[dataset_name] = GroupedScores(scores=dataset_groups, total_scores=total_scores) + + return result diff --git a/training/configs/qwen2.5_3b_grpo.yaml b/training/configs/qwen2.5_1.5b_grpo.yaml similarity index 94% rename from training/configs/qwen2.5_3b_grpo.yaml rename to training/configs/qwen2.5_1.5b_grpo.yaml index 644653df..e2536bb3 100644 --- a/training/configs/qwen2.5_3b_grpo.yaml +++ b/training/configs/qwen2.5_1.5b_grpo.yaml @@ -1,19 +1,12 @@ reasoning_gym: - dataset_size: 20000 + dataset_size: 10000 developer_prompt: DeepSeekZero datasets: - mini_sudoku: - weight: 0.33 - config: - min_empty: 6 - futoshiki: - weight: 0.33 
diff --git a/training/configs/qwen2.5_3b_grpo.yaml b/training/configs/qwen2.5_1.5b_grpo.yaml
similarity index 94%
rename from training/configs/qwen2.5_3b_grpo.yaml
rename to training/configs/qwen2.5_1.5b_grpo.yaml
index 644653df..e2536bb3 100644
--- a/training/configs/qwen2.5_3b_grpo.yaml
+++ b/training/configs/qwen2.5_1.5b_grpo.yaml
@@ -1,19 +1,12 @@
 reasoning_gym:
-  dataset_size: 20000
+  dataset_size: 10000
   developer_prompt: DeepSeekZero
   datasets:
-    mini_sudoku:
-      weight: 0.33
-      config:
-        min_empty: 6
-    futoshiki:
-      weight: 0.33
-      config:
-        max_board_size: 5
-    sudoku:
-      weight: 0.34
-      config:
-        min_empty: 20
+    spell_backward:
+      weight: 1
+      config:
+        min_word_len: 3
+        max_word_len: 10
 curriculum:
   enabled: False
   schedule:
@@ -29,8 +22,10 @@ curriculum:
 reward:
   use_accuracy: True
   secondary_rewards:
-    - name: length
-      scaling_factor: 1.0
+    - name: cosine
+      scaling_factor: 0.3
+    - name: format
+      scaling_factor: 0.2
 
 data:
   tokenizer: null
@@ -46,7 +41,7 @@ data:
 actor_rollout_ref:
   hybrid_engine: True
   model:
-    path: Qwen/Qwen2.5-3B-Instruct
+    path: Qwen/Qwen2.5-1.5B-Instruct
     external_lib: null
     override_config: { }
     enable_gradient_checkpointing: True
@@ -72,7 +67,7 @@ actor_rollout_ref:
     lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
     min_lr_ratio: null   # only useful for warmup with cosine
     warmup_style: constant  # select from constant/cosine
-    total_training_steps: -1  # must be override by program
+    total_training_steps: 200  # must be override by program
     fsdp_config:
       wrap_policy:
         # transformer_layer_cls_to_wrap: None
@@ -142,7 +137,7 @@ trainer:
   val_generations_to_log_to_wandb: 0
   nnodes: 1
   n_gpus_per_node: 2
-  save_freq: 100
+  save_freq: 50
   # auto: find the last ckpt to resume. If can't find, start from scratch
   resume_mode: auto # or auto or resume_path if
   resume_from_path: False
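The reward block now mixes accuracy with two scaled secondary signals. How these are composed lives in the training code, not in this patch; a minimal sketch, assuming each secondary reward is computed in [0, 1] and added to the accuracy score after scaling:

```python
def total_reward(accuracy: float, cosine: float, fmt: float) -> float:
    # scaling factors from the YAML above: cosine 0.3, format 0.2 (additive combination assumed)
    return accuracy + 0.3 * cosine + 0.2 * fmt

print(total_reward(accuracy=1.0, cosine=0.5, fmt=1.0))  # 1.35
```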
diff --git a/training/configs/qwen2.5_3b_grpo_composite.yaml b/training/configs/qwen2.5_1.5b_grpo_composite.yaml
similarity index 82%
rename from training/configs/qwen2.5_3b_grpo_composite.yaml
rename to training/configs/qwen2.5_1.5b_grpo_composite.yaml
index ae7fae4b..6f0121ff 100644
--- a/training/configs/qwen2.5_3b_grpo_composite.yaml
+++ b/training/configs/qwen2.5_1.5b_grpo_composite.yaml
@@ -2,26 +2,34 @@ reasoning_gym:
   dataset_size: 20000
   developer_prompt: DeepSeekZero
   datasets:
-    mini_sudoku:
-      weight: 0.33
-      config:
-        min_empty: 6
-    futoshiki:
-      weight: 0.33
-      config:
-        max_board_size: 5
-    sudoku:
-      weight: 0.34
-      config:
-        min_empty: 20
+    spell_backward:
+      weight: 0.33
+      config:
+        min_word_len: 3
+        max_word_len: 10
+    letter_counting:
+      weight: 0.34
+      config:
+        min_words: 5
+        max_words: 15
+    number_sorting:
+      weight: 0.33
+      config:
+        min_numbers: 3
+        max_numbers: 10
+        min_decimals: 0
+        max_decimals: 2
+        min_value: -100
+        max_value: 100
+
 curriculum:
   enabled: False
   schedule:
     automatic: True
     update_steps: 30 # automatic curriculum updating after 50 steps
     last_k: 20
-    success_threshold: 0.7
-    failure_threshold: 0.1
+    success_threshold: 0.70
+    failure_threshold: 0.10
   curricula:
     spell_backward:
       attribute_levels:
@@ -29,6 +37,8 @@ curriculum:
 reward:
   use_accuracy: True
   secondary_rewards:
+    - name: cosine
+      scaling_factor: 0.3
     - name: format
       scaling_factor: 0.2
 
@@ -40,21 +50,20 @@ data:
   max_prompt_length: 512
   max_response_length: 1024
   train_batch_size: 32
-  val_batch_size: 32
+  val_batch_size: 64
   return_raw_chat: True
   return_raw_input_ids: True
-
 actor_rollout_ref:
   hybrid_engine: True
   model:
-    path: Qwen/Qwen2.5-3B-Instruct
+    path: Qwen/Qwen2.5-1.5B-Instruct
     external_lib: null
     override_config: { }
     enable_gradient_checkpointing: True
     use_remove_padding: True
   actor:
     strategy: fsdp  # This is for backward-compatibility
-    ppo_mini_batch_size: 32
+    ppo_mini_batch_size: 16
     ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
     ppo_micro_batch_size_per_gpu: 8
     use_dynamic_bsz: False
@@ -70,10 +79,10 @@
     ulysses_sequence_parallel_size: 1 # sp size
     optim:
       lr: 1e-6
-      lr_warmup_steps_ratio: 0 # the total steps will be injected during runtime
+      lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
      min_lr_ratio: null   # only useful for warmup with cosine
-      warmup_style: constant  # select from constant/cosine
-      total_training_steps: -1  # must be override by program
+      warmup_style: constant # select from constant/cosine
+      total_training_steps: 200 # must be override by program
     fsdp_config:
       wrap_policy:
         # transformer_layer_cls_to_wrap: None
@@ -101,13 +110,13 @@ actor_rollout_ref:
     response_length: ${data.max_response_length}
     # for vllm rollout
     dtype: bfloat16 # should align with FSDP
-    gpu_memory_utilization: 0.6
+    gpu_memory_utilization: 0.7
     ignore_eos: False
     enforce_eager: True
     free_cache_engine: True
     load_format: dummy_dtensor
-    tensor_model_parallel_size: 4
-    max_num_batched_tokens: 16384
+    tensor_model_parallel_size: 2
+    max_num_batched_tokens: 12288
     max_num_seqs: 1024
     log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
     log_prob_micro_batch_size_per_gpu: 160
@@ -118,7 +127,7 @@ actor_rollout_ref:
     # for hf rollout
     do_sample: True
     use_fire_sampling: False
-    max_model_len: 16384
+    max_model_len: 12288
     # number of responses (i.e. num sample times)
     n: 8 # > 1 for grpo
     val_kwargs:
@@ -135,15 +144,15 @@ algorithm:
 verbose: True
 trainer:
   balance_batch: True
-  total_epochs: 5
+  total_epochs: 1
   total_training_steps: null
   project_name: rg-test
-  experiment_name: verl_grpo_qwen_composite
+  experiment_name: verl_grpo_qwen_3b_composite
   logger: [ 'console', 'wandb' ]
   val_generations_to_log_to_wandb: 0
   nnodes: 1
-  n_gpus_per_node: 4
-  save_freq: 100
+  n_gpus_per_node: 2
+  save_freq: 50
   # auto: find the last ckpt to resume. If can't find, start from scratch
   resume_mode: auto # or auto or resume_path if
   resume_from_path: False
@@ -154,13 +163,14 @@ trainer:
   del_local_ckpt_after_load: False
   default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
 
+
 critic:
   strategy: fsdp
   optim:
-    lr: 1e-6
-    lr_warmup_steps_ratio: 0 # the total steps will be injected during runtime
+    lr: 1e-5
+    lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
     min_lr_ratio: null   # only useful for warmup with cosine
-    warmup_style: cosine # select from constant/cosine
+    warmup_style: constant # select from constant/cosine
     total_training_steps: -1 # must be override by program
   model:
     path: ~/models/deepseek-llm-7b-chat
@@ -178,7 +188,7 @@ critic:
     fsdp_size: -1
   ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
   ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
-  ppo_micro_batch_size_per_gpu: 8
+  ppo_micro_batch_size_per_gpu: null
   forward_micro_batch_size: ${critic.ppo_micro_batch_size}
   forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
   use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
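A quick sanity check on the composite weights above, assuming (as in the rest of this patch) that the weights act as sampling proportions over the virtual dataset:

```python
weights = {"spell_backward": 0.33, "letter_counting": 0.34, "number_sorting": 0.33}
dataset_size = 20000
assert abs(sum(weights.values()) - 1.0) < 1e-9  # weights form a distribution
expected_counts = {name: round(w * dataset_size) for name, w in weights.items()}
print(expected_counts)  # {'spell_backward': 6600, 'letter_counting': 6800, 'number_sorting': 6600}
```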
diff --git a/training/configs/qwen2.5_1.5b_grpo_curr.yaml b/training/configs/qwen2.5_1.5b_grpo_curr.yaml
index 31c70423..f0ced4bd 100644
--- a/training/configs/qwen2.5_1.5b_grpo_curr.yaml
+++ b/training/configs/qwen2.5_1.5b_grpo_curr.yaml
@@ -1,20 +1,31 @@
 reasoning_gym:
-  dataset_size: 10000
-  enable_curriculum_learning: True
+  dataset_size: 20000
   developer_prompt: DeepSeekZero
-reward:
-  secondary_rewards:
-    - name: format
-      scaling_factor: 0.5
+  datasets:
+    spell_backward:
+      weight: 1
+      config:
+        min_word_len: 3
+        max_word_len: 10
 curriculum:
   enabled: True
-  last_k: 30
-  success_threshold: 0.7
-  failure_threshold: 0.1
+  schedule:
+    automatic: False
+    update_steps: 30 # automatic curriculum update every 30 steps
+    last_k: 5120 # Minimum number of samples needed for the model to exceed a threshold - 20*num_generations*batch_size
+    success_threshold: 0.70
+    failure_threshold: 0.10
   curricula:
     spell_backward:
       attribute_levels:
         word_len: 0
+reward:
+  use_accuracy: True
+  secondary_rewards:
+    - name: cosine
+      scaling_factor: 0.3
+    - name: format
+      scaling_factor: 0.2
 
 data:
   tokenizer: null
@@ -23,11 +34,10 @@ data:
   prompt_key: prompt
   max_prompt_length: 512
   max_response_length: 1024
-  train_batch_size: 64
+  train_batch_size: 32
   val_batch_size: 64
-  return_raw_input_ids: True  # This should be set to true when the tokenizer between policy and rm differs
   return_raw_chat: True
-
+  return_raw_input_ids: True
 actor_rollout_ref:
   hybrid_engine: True
   model:
@@ -38,9 +48,9 @@ actor_rollout_ref:
     use_remove_padding: True
   actor:
     strategy: fsdp  # This is for backward-compatibility
-    ppo_mini_batch_size: 32
+    ppo_mini_batch_size: 16
     ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
-    ppo_micro_batch_size_per_gpu: 16
+    ppo_micro_batch_size_per_gpu: 8
     use_dynamic_bsz: False
     ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
     grad_clip: 1.0
@@ -57,7 +67,7 @@
       lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
       min_lr_ratio: null   # only useful for warmup with cosine
      warmup_style: constant  # select from constant/cosine
-      total_training_steps: -1  # must be override by program
+      total_training_steps: 200  # must be override by program
     fsdp_config:
       wrap_policy:
         # transformer_layer_cls_to_wrap: None
@@ -78,7 +88,6 @@ actor_rollout_ref:
     ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
   rollout:
     name: vllm
-    max_model_len: 512
     temperature: 1.0
     top_k: -1 # 0 for hf rollout, -1 for vllm rollout
     top_p: 1
@@ -86,13 +95,13 @@
     response_length: ${data.max_response_length}
     # for vllm rollout
     dtype: bfloat16 # should align with FSDP
-    gpu_memory_utilization: 0.6
+    gpu_memory_utilization: 0.7
     ignore_eos: False
     enforce_eager: True
     free_cache_engine: True
     load_format: dummy_dtensor
     tensor_model_parallel_size: 2
-    max_num_batched_tokens: 8192
+    max_num_batched_tokens: 12288
     max_num_seqs: 1024
     log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
     log_prob_micro_batch_size_per_gpu: 160
@@ -103,6 +112,7 @@
     # for hf rollout
     do_sample: True
     use_fire_sampling: False
+    max_model_len: 12288
     # number of responses (i.e. num sample times)
     n: 8 # > 1 for grpo
     val_kwargs:
@@ -119,15 +129,15 @@
 verbose: True
 trainer:
   balance_batch: True
-  total_epochs: 10
+  total_epochs: 1
   total_training_steps: null
   project_name: rg-test
-  experiment_name: verl_grpo_qwen_curr
+  experiment_name: verl_grpo_qwen_3b_curr
   logger: [ 'console', 'wandb' ]
   val_generations_to_log_to_wandb: 0
   nnodes: 1
   n_gpus_per_node: 2
-  save_freq: 100
+  save_freq: 50
   # auto: find the last ckpt to resume. If can't find, start from scratch
   resume_mode: auto # or auto or resume_path if
   resume_from_path: False
@@ -136,7 +146,8 @@ trainer:
   default_hdfs_dir: null
   remove_previous_ckpt_in_save: False
   del_local_ckpt_after_load: False
-  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
+  default_local_dir: /workspace/joe/checkpoints/checkpoints/${trainer.project_name}/${trainer.experiment_name}
+
 critic:
   strategy: fsdp
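The `last_k: 5120` comment above encodes an arithmetic invariant worth making explicit: with `n: 8` rollout generations per prompt and `train_batch_size: 32`, twenty training steps produce exactly 5120 scored samples:

```python
num_generations = 8    # actor_rollout_ref.rollout.n
train_batch_size = 32  # data.train_batch_size
steps = 20
assert steps * num_generations * train_batch_size == 5120  # matches last_k
```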
diff --git a/training/configs/qwen2.5_3b_grpo_curr.yaml b/training/configs/qwen2.5_3b_grpo_curr.yaml
deleted file mode 100644
index 405b9170..00000000
--- a/training/configs/qwen2.5_3b_grpo_curr.yaml
+++ /dev/null
@@ -1,201 +0,0 @@
-reasoning_gym:
-  dataset_size: 10000
-  enable_curriculum_learning: True
-  developer_prompt: DeepSeekZero
-curriculum:
-  enabled: True
-  schedule:
-    automatic: True
-    update_steps: 30 # automatic curriculum updating after 50 steps
-    last_k: 20
-    success_threshold: 0.7
-    failure_threshold: 0.1
-  curricula:
-    spell_backward:
-      attribute_levels:
-        word_len: 0
-reward:
-  use_accuracy: false
-  secondary_rewards:
-    - name: cosine
-      scaling_factor: 2
-    - name: format
-      scaling_factor: 0.5
-
-data:
-  tokenizer: null
-  train_files: train.parquet
-  val_files: test.parquet
-  prompt_key: prompt
-  max_prompt_length: 512
-  max_response_length: 1024
-  train_batch_size: 128
-  val_batch_size: 128
-  return_raw_chat: True
-  return_raw_input_ids: True
-
-actor_rollout_ref:
-  hybrid_engine: True
-  model:
-    path: Qwen/Qwen2.5-3B-Instruct
-    external_lib: null
-    override_config: { }
-    enable_gradient_checkpointing: True
-    use_remove_padding: True
-  actor:
-    strategy: fsdp  # This is for backward-compatibility
-    ppo_mini_batch_size: 32
-    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
-    ppo_micro_batch_size_per_gpu: 8
-    use_dynamic_bsz: False
-    ppo_max_token_len_per_gpu: 12288 # n * ${data.max_prompt_length} + ${data.max_response_length}
-    grad_clip: 1.0
-    clip_ratio: 0.2
-    entropy_coeff: 0.001
-    use_kl_loss: True # True for GRPO
-    kl_loss_coef: 0.001 # for grpo
-    kl_loss_type: low_var_kl # for grpo
-    ppo_epochs: 1
-    shuffle: False
-    ulysses_sequence_parallel_size: 1 # sp size
-    optim:
-      lr: 1e-6
-      lr_warmup_steps_ratio: 0.1  # the total steps will be injected during runtime
-      min_lr_ratio: 0.1   # only useful for warmup with cosine
-      warmup_style: cosine  # select from constant/cosine
-      total_training_steps: -1  # must be override by program
-    fsdp_config:
-      wrap_policy:
-        # transformer_layer_cls_to_wrap: None
-        min_num_params: 0
-      param_offload: False
-      optimizer_offload: False
-      fsdp_size: -1
-  ref:
-    fsdp_config:
-      param_offload: True
-      wrap_policy:
-        # transformer_layer_cls_to_wrap: None
-        min_num_params: 0
-    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
-    log_prob_micro_batch_size_per_gpu: 16
-    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
-    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
-    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
-  rollout:
-    name: vllm
-    max_model_len: 1024
-    temperature: 0.7
-    top_k: -1 # 0 for hf rollout, -1 for vllm rollout
-    top_p: 1
-    prompt_length: ${data.max_prompt_length} # not use for opensource
-    response_length: ${data.max_response_length}
-    # for vllm rollout
-    dtype: bfloat16 # should align with FSDP
-    gpu_memory_utilization: 0.6
-    ignore_eos: False
-    enforce_eager: True
-    free_cache_engine: True
-    load_format: dummy_dtensor
-    tensor_model_parallel_size: 4
-    max_num_batched_tokens: 8192
-    max_num_seqs: 1024
-    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
-    log_prob_micro_batch_size_per_gpu: 16
-    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
-    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
-    disable_log_stats: True
-    enable_chunked_prefill: True # could get higher throughput
-    # for hf rollout
-    do_sample: True
-    use_fire_sampling: False
-    # number of responses (i.e. num sample times)
-    n: 8 # > 1 for grpo
-    val_kwargs:
-      do_sample: True
-
-algorithm:
-  gamma: 1.0
-  lam: 1.0
-  adv_estimator: grpo
-  kl_penalty: kl # how to estimate kl divergence
-  kl_ctrl:
-    type: fixed
-    kl_coef: 0.001
-verbose: True
-trainer:
-  balance_batch: True
-  total_epochs: 5
-  total_training_steps: null
-  project_name: rg-test
-  experiment_name: verl_grpo_qwen_curr
-  logger: [ 'console', 'wandb' ]
-  val_generations_to_log_to_wandb: 0
-  nnodes: 1
-  n_gpus_per_node: 4
-  save_freq: 50
-  # auto: find the last ckpt to resume. If can't find, start from scratch
-  resume_mode: auto # or auto or resume_path if
-  resume_from_path: False
-  test_freq: 300
-  critic_warmup: 0
-  default_hdfs_dir: null
-  remove_previous_ckpt_in_save: False
-  del_local_ckpt_after_load: False
-  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
-
-critic:
-  strategy: fsdp
-  optim:
-    lr: 1e-6
-    lr_warmup_steps_ratio: 0.1  # the total steps will be injected during runtime
-    min_lr_ratio: null   # only useful for warmup with cosine
-    warmup_style: cosine  # select from constant/cosine
-    total_training_steps: -1  # must be override by program
-  model:
-    path: ~/models/deepseek-llm-7b-chat
-    tokenizer_path: ${actor_rollout_ref.model.path}
-    override_config: { }
-    external_lib: ${actor_rollout_ref.model.external_lib}
-    enable_gradient_checkpointing: True
-    use_remove_padding: False
-    fsdp_config:
-      param_offload: False
-      optimizer_offload: False
-      wrap_policy:
-        # transformer_layer_cls_to_wrap: None
-        min_num_params: 0
-      fsdp_size: -1
-  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
-  ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
-  ppo_micro_batch_size_per_gpu: null
-  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
-  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
-  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
-  ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
-  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
-  ulysses_sequence_parallel_size: 1 # sp size
-  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
-  shuffle: ${actor_rollout_ref.actor.shuffle}
-  grad_clip: 1.0
-  cliprange_value: 0.5
-
-# Reward model not used for GRPO
-reward_model:
-  enable: False
-  strategy: fsdp
-  model:
-    input_tokenizer: ${actor_rollout_ref.model.path}
-    path: ~/models/FsfairX-LLaMA3-RM-v0.1
-    external_lib: ${actor_rollout_ref.model.external_lib}
-    use_remove_padding: False
-    fsdp_config:
-      min_num_params: 0
-      param_offload: False
-      fsdp_size: -1
-  micro_batch_size: null
-  micro_batch_size_per_gpu: null
-  max_length: null
-  ulysses_sequence_parallel_size: 1
-  use_dynamic_bsz: ${critic.use_dynamic_bsz}
-  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}