diff --git a/example_trainer/data.py b/example_trainer/data.py index bb3ebbdb..16a38564 100644 --- a/example_trainer/data.py +++ b/example_trainer/data.py @@ -5,7 +5,7 @@ Handles data retrieval from Atropos API, padding, batching, and advantage normalization. Also extracts inference logprobs for proper GRPO loss computation: -- Inference logprobs serve as π_old (reference policy) for importance sampling +- Inference logprobs are used in importance-ratio computation - They are batched and padded to align token-by-token with training labels """ diff --git a/example_trainer/training.py b/example_trainer/training.py index 92e18f2e..035d45c7 100644 --- a/example_trainer/training.py +++ b/example_trainer/training.py @@ -287,7 +287,7 @@ def run_training_step( temperature_batches: List of temperature tensors config: Training configuration (includes clip_eps, warmup_steps) step_idx: Current global training step (0-based) - inference_logprob_batches: Batched logprobs from inference (π_old), aligned with labels + inference_logprob_batches: Rollout logprobs from inference, aligned with labels Returns: Dict of training metrics for this step