diff --git a/training/trainers/ray_grpo_trainer.py b/training/trainers/ray_grpo_trainer.py index fecca6c1..6e793ccc 100644 --- a/training/trainers/ray_grpo_trainer.py +++ b/training/trainers/ray_grpo_trainer.py @@ -330,7 +330,7 @@ class RayGRPOTrainer(RayPPOTrainer): batch.batch["token_level_scores"] = reward_tensor # compute rewards. apply_kl_penalty if available - if config.algorithm.use_kl_in_reward: + if self.config.algorithm.use_kl_in_reward: batch, kl_metrics = apply_kl_penalty( batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty )