From f02c24204d12d989600c1591ba583dd2c48b994b Mon Sep 17 00:00:00 2001
From: Partho Das <parthodas6176@gmail.com>
Date: Mon, 23 Feb 2026 15:03:17 +0530
Subject: [PATCH 1/7] rm hardcoded same score check

---
 .../answer_format_environment.py                          | 8 --------
 environments/community/cat_behavior_env/catbot_arena.py   | 2 --
 environments/community/dynastai/dynastai_server.py        | 5 -----
 .../community/mcp_tool_calling/tool_calling_server.py     | 4 ----
 .../options_iv_prediction/options_iv_prediction.py        | 4 ----
 environments/community/pay_to_play/pay_to_play_env.py     | 4 ----
 environments/community/physical_space_stl/physical_env.py | 3 ---
 environments/community/regex_generation/regex_env.py      | 4 ----
 .../solitaire_winning_probability/solitaire_server.py     | 2 --
 environments/community/sql_query_env/sql_query_env.py     | 3 ---
 .../community/wikipedia_research/tool_calling_server.py   | 4 ----
 .../eval_environments/pairwise_judgement_environment.py   | 4 ----
 environments/fundamental_prediction_environment.py        | 4 ----
 environments/gsm8k_server.py                              | 2 --
 environments/gsm8k_server_axolotl.py                      | 2 --
 .../letter_counting_environment.py                        | 7 -------
 environments/math_server.py                               | 4 ----
 environments/math_server_zero.py                          | 4 ----
 environments/mcp_env.py                                   | 3 ---
 environments/mcqa_thinking_env.py                         | 4 ----
 environments/text_reversal_environment.py                 | 4 ----
 environments/tool_calling_server.py                       | 4 ----
 22 files changed, 85 deletions(-)

diff --git a/environments/answer_format_environment/answer_format_environment.py b/environments/answer_format_environment/answer_format_environment.py
index 949f8f9e..c84d53c7 100644
--- a/environments/answer_format_environment/answer_format_environment.py
+++ b/environments/answer_format_environment/answer_format_environment.py
@@ -3653,14 +3653,6 @@ class AnswerFormatEnv(BaseEnv):
                     )
                 await self._save_failed_rollouts_to_jsonl()
 
-        # Check if all scores are the same (no learning signal)
-        if all(group_scores[0] == score for score in group_scores):
-            if self.debug_logging:
-                self.logger.debug(
-                    "All scores are identical, returning None for learning signal"
-                )
-            return None
-
         # Track successful groups for equivalent ratio enforcement
         if self.ensure_equivalent_ratios:
             # Count this as a successful group if we have any successful examples
diff --git a/environments/community/cat_behavior_env/catbot_arena.py b/environments/community/cat_behavior_env/catbot_arena.py
index 1b2d69fd..8de5a843 100644
--- a/environments/community/cat_behavior_env/catbot_arena.py
+++ b/environments/community/cat_behavior_env/catbot_arena.py
@@ -285,8 +285,6 @@ class GSM8kEnv(BaseEnv):
                         percentage_of_range = min(percentage_of_range, 1.0)
                         # Apply linear penalty scaling from 1.0 down to 0.0
                         scores["scores"].append(1.0 - percentage_of_range)
-            if all([scores["scores"][0] == score for score in scores["scores"]]):
-                return None  # If all the same, we return None
             return scores
         else:
             # If the gold solution is not parseable, we return None
diff --git a/environments/community/dynastai/dynastai_server.py b/environments/community/dynastai/dynastai_server.py
index 1ea3af05..0678e96e 100755
--- a/environments/community/dynastai/dynastai_server.py
+++ b/environments/community/dynastai/dynastai_server.py
@@ -503,11 +503,6 @@ class DynastAIEnv(BaseEnv):
         for score in scores["scores"]:
             self.percent_correct_buffer.append(max(score, 0))
 
-        # Check if all the same
-        if all([score == scores["scores"][0] for score in scores["scores"]]):
-            print("[DYNASTAI] All scores identical, returning None")
-            return None  # If all the same, we return None
-
         return scores
 
     async def get_next_item(self) -> DynastAIRow:
diff --git a/environments/community/mcp_tool_calling/tool_calling_server.py b/environments/community/mcp_tool_calling/tool_calling_server.py
index 3c765c72..57d2cd56 100644
--- a/environments/community/mcp_tool_calling/tool_calling_server.py
+++ b/environments/community/mcp_tool_calling/tool_calling_server.py
@@ -404,10 +404,6 @@ class SingleToolCallingEnv(BaseEnv):
                     # Apply linear penalty scaling from 1.0 down to 0.0
                     scores["scores"].append(1.0 - percentage_of_range)
 
-        # Check if all scores are the same (no learning signal)
-        if all(scores["scores"][0] == score for score in scores["scores"]):
-            return None
-
         return scores
 
     async def get_next_item(self):
diff --git a/environments/community/options_iv_prediction/options_iv_prediction.py b/environments/community/options_iv_prediction/options_iv_prediction.py
index 16d7b9fe..d771516e 100644
--- a/environments/community/options_iv_prediction/options_iv_prediction.py
+++ b/environments/community/options_iv_prediction/options_iv_prediction.py
@@ -433,10 +433,6 @@ class OptionsIVPrediction(BaseEnv):
             if len(scores["tokens"]) >= self.config.group_size:
                 break
 
-        # Return None if all scores are the same (no learning signal)
-        if all(scores["scores"][0] == score for score in scores["scores"]):
-            return None
-
         return scores
 
     async def rollout_and_score_eval(self, test_item):
diff --git a/environments/community/pay_to_play/pay_to_play_env.py b/environments/community/pay_to_play/pay_to_play_env.py
index 1eacdb8b..d9760d41 100644
--- a/environments/community/pay_to_play/pay_to_play_env.py
+++ b/environments/community/pay_to_play/pay_to_play_env.py
@@ -777,10 +777,6 @@ End your evaluation with \\boxed{{score}} where score is your numerical rating.
         if hasattr(self, "last_agent_card_feedback"):
             self.last_agent_card_feedback = agent_card_feedback
 
-        # Ensure we have different scores for training signal
-        if len(set(scores["scores"])) == 1:
-            return None
-
         return scores
 
     def _extract_score_from_agent_card(self, agent_card_response: str) -> float:
diff --git a/environments/community/physical_space_stl/physical_env.py b/environments/community/physical_space_stl/physical_env.py
index 7cedb2ea..91c56672 100644
--- a/environments/community/physical_space_stl/physical_env.py
+++ b/environments/community/physical_space_stl/physical_env.py
@@ -409,9 +409,6 @@ class PhysicalEnv(BaseEnv):
                         percentage_of_range = min(percentage_of_range, 1.0)
                         scores["scores"].append(1.0 - percentage_of_range)
 
-        if all([scores["scores"][0] == score for score in scores["scores"]]):
-            return None  # If all the same, we return None
-
         return scores
 
     async def get_next_item(self) -> PhysicalRow:
diff --git a/environments/community/regex_generation/regex_env.py b/environments/community/regex_generation/regex_env.py
index baa16e59..30fe311b 100644
--- a/environments/community/regex_generation/regex_env.py
+++ b/environments/community/regex_generation/regex_env.py
@@ -248,10 +248,6 @@ class RegexEnv(BaseEnv):
                 1.0 if s >= self.config.score_threshold else 0.0
             )
 
-        # If all scores identical, no learning signal
-        if len(set(scores["scores"])) == 1:
-            return None
-
         return scores
 
     async def rollout_and_score_eval(self, problem: dict) -> dict:
diff --git a/environments/community/solitaire_winning_probability/solitaire_server.py b/environments/community/solitaire_winning_probability/solitaire_server.py
index e38f02f8..c106d067 100644
--- a/environments/community/solitaire_winning_probability/solitaire_server.py
+++ b/environments/community/solitaire_winning_probability/solitaire_server.py
@@ -377,8 +377,6 @@ class SolitaireEnv(BaseEnv):
                         percentage_of_range = min(percentage_of_range, 1.0)
                         # Apply linear penalty scaling from 1.0 down to 0.0
                         scores["scores"].append(1.0 - percentage_of_range)
-            if all([scores["scores"][0] == score for score in scores["scores"]]):
-                return None  # If all the same, we return None
             return scores
         else:
             # If the gold solution is not parseable, we return None
diff --git a/environments/community/sql_query_env/sql_query_env.py b/environments/community/sql_query_env/sql_query_env.py
index 1e2a75f2..8f86e4c8 100644
--- a/environments/community/sql_query_env/sql_query_env.py
+++ b/environments/community/sql_query_env/sql_query_env.py
@@ -408,9 +408,6 @@ class SQLQueryEnv(BaseEnv):
                     percentage_of_range = min(percentage_of_range, 1.0)
                     scores["scores"].append(1.0 - percentage_of_range)
 
-        if all([scores["scores"][0] == score for score in scores["scores"]]):
-            return None  # If all the same, return None
-
         return scores
 
     async def get_next_item(self) -> WikiSQLRow:
diff --git a/environments/community/wikipedia_research/tool_calling_server.py b/environments/community/wikipedia_research/tool_calling_server.py
index e14bc717..e51cba19 100644
--- a/environments/community/wikipedia_research/tool_calling_server.py
+++ b/environments/community/wikipedia_research/tool_calling_server.py
@@ -404,10 +404,6 @@ class SingleToolCallingEnv(BaseEnv):
                     # Apply linear penalty scaling from 1.0 down to 0.0
                     scores["scores"].append(1.0 - percentage_of_range)
 
-        # Check if all scores are the same (no learning signal)
-        if all(scores["scores"][0] == score for score in scores["scores"]):
-            return None
-
         return scores
 
     async def get_next_item(self):
diff --git a/environments/eval_environments/pairwise_judgement_environment.py b/environments/eval_environments/pairwise_judgement_environment.py
index 06afae54..6fc894b3 100644
--- a/environments/eval_environments/pairwise_judgement_environment.py
+++ b/environments/eval_environments/pairwise_judgement_environment.py
@@ -1112,10 +1112,6 @@ class PairwiseJudgementEnv(BaseEnv):
             for score in scores["scores"]:
                 self.percent_correct_buffer.append(max(score, 0))
 
-            # Return None if all scores are the same (no learning signal)
-            if len(set(scores["scores"])) == 1:
-                return None
-
             return scores
 
         except Exception as e:
diff --git a/environments/fundamental_prediction_environment.py b/environments/fundamental_prediction_environment.py
index 9c359306..eeaaed98 100644
--- a/environments/fundamental_prediction_environment.py
+++ b/environments/fundamental_prediction_environment.py
@@ -402,10 +402,6 @@ class FundamentalPredictionEnv(BaseEnv):
             if len(scores["tokens"]) >= self.config.group_size:
                 break
 
-        # Return None if all scores are the same (no learning signal)
-        if all(scores["scores"][0] == score for score in scores["scores"]):
-            return None
-
         return scores
 
     async def rollout_and_score_eval(self, test_item):
diff --git a/environments/gsm8k_server.py b/environments/gsm8k_server.py
index 6ae5285b..8b3589da 100644
--- a/environments/gsm8k_server.py
+++ b/environments/gsm8k_server.py
@@ -352,8 +352,6 @@ class GSM8kEnv(BaseEnv):
                         percentage_of_range = min(percentage_of_range, 1.0)
                         # Apply linear penalty scaling from 1.0 down to 0.0
                         scores["scores"].append(1.0 - percentage_of_range)
-            if all([scores["scores"][0] == score for score in scores["scores"]]):
-                return None  # If all the same, we return None
             return scores
         else:
             # If the gold solution is not parseable, we return None
diff --git a/environments/gsm8k_server_axolotl.py b/environments/gsm8k_server_axolotl.py
index 244861f8..571eb609 100644
--- a/environments/gsm8k_server_axolotl.py
+++ b/environments/gsm8k_server_axolotl.py
@@ -283,8 +283,6 @@ class GSM8kEnv(BaseEnv):
                         percentage_of_range = min(percentage_of_range, 1.0)
                         # Apply linear penalty scaling from 1.0 down to 0.0
                         scores["scores"].append(1.0 - percentage_of_range)
-            if all([scores["scores"][0] == score for score in scores["scores"]]):
-                return None  # If all the same, we return None
             return scores
         else:
             # If the gold solution is not parseable, we return None
diff --git a/environments/letter_counting_environment/letter_counting_environment.py b/environments/letter_counting_environment/letter_counting_environment.py
index 2398373b..6f4f9cf1 100644
--- a/environments/letter_counting_environment/letter_counting_environment.py
+++ b/environments/letter_counting_environment/letter_counting_environment.py
@@ -1248,13 +1248,6 @@ class LetterCountingEnv(BaseEnv):
             )
             return None
 
-        # Skip if all scores are identical (no learning signal)
-        if all(scores["scores"][0] == score for score in scores["scores"]):
-            self.logger.debug(
-                f"All scores identical ({scores['scores'][0]:.2f}) - skipping group"
-            )
-            return None
-
         return scores
 
     async def rollout_and_score_eval(self, eval_item: Dict) -> Tuple[int, int]:
diff --git a/environments/math_server.py b/environments/math_server.py
index 5b343b9f..b0dafbfd 100644
--- a/environments/math_server.py
+++ b/environments/math_server.py
@@ -534,8 +534,6 @@ class MathEnv(BaseEnv):
                         return None, to_backlog
                 else:
                     return None, to_backlog
-            else:
-                return None, to_backlog
         else:
             self.normal_rollouts.append(
                 (
@@ -1167,8 +1165,6 @@ class MathEnv(BaseEnv):
                     "Max message delta is less than 0.1 * shortest message, no length penalty"
                 )
                 return None, []
-        elif all([score == scores["scores"][0] for score in scores["scores"]]):
-            return None, []
         if len(for_table) > 0:
             self.judge_rollouts.append(for_table)
             if len(self.judge_rollouts) >= self.config.num_rollouts_to_keep:
diff --git a/environments/math_server_zero.py b/environments/math_server_zero.py
index 1432ab4d..43cd6a09 100644
--- a/environments/math_server_zero.py
+++ b/environments/math_server_zero.py
@@ -383,10 +383,6 @@ class MathEnv(BaseEnv):
         to_postprocess = await self.score(to_score)
         if to_postprocess is None:
             return None, to_backlog
-        if all(
-            [to_postprocess["scores"][0] == score for score in to_postprocess["scores"]]
-        ):
-            return None, to_backlog
         self.normal_rollouts.append(
             (
                 prompt_format.format(prompt=problem_format.format(problem=item[0])),
diff --git a/environments/mcp_env.py b/environments/mcp_env.py
index a8d621bc..f8892cb1 100644
--- a/environments/mcp_env.py
+++ b/environments/mcp_env.py
@@ -347,9 +347,6 @@ class McpEnv(BaseEnv):
         for score in scores["scores"]:
             self.percent_correct_buffer.append(max(score, 0))
 
-        if all(scores["scores"][0] == score for score in scores["scores"]):
-            return None
-
         return scores
 
     async def get_next_item(self):
diff --git a/environments/mcqa_thinking_env.py b/environments/mcqa_thinking_env.py
index 417822f6..9738a85e 100644
--- a/environments/mcqa_thinking_env.py
+++ b/environments/mcqa_thinking_env.py
@@ -373,10 +373,6 @@ class MCQAThinkingEnv(BaseEnv):
         for score in scores["scores"]:
             self.percent_correct_buffer.append(max(score, 0))
 
-        # Return None if all scores are the same (no learning signal)
-        if all(scores["scores"][0] == score for score in scores["scores"]):
-            return None
-
         return scores
 
     async def rollout_and_score_eval(self, test_item):
diff --git a/environments/text_reversal_environment.py b/environments/text_reversal_environment.py
index 0b41e2ec..738cb663 100644
--- a/environments/text_reversal_environment.py
+++ b/environments/text_reversal_environment.py
@@ -770,10 +770,6 @@ class TextReversalEnv(BaseEnv):
             for score in scores["scores"]:
                 self.percent_correct_buffer.append(max(score, 0))
 
-            # Return None if all scores are the same (no learning signal)
-            if len(set(scores["scores"])) == 1:
-                return None
-
             return scores
 
         except Exception as e:
diff --git a/environments/tool_calling_server.py b/environments/tool_calling_server.py
index 10409417..e6824118 100644
--- a/environments/tool_calling_server.py
+++ b/environments/tool_calling_server.py
@@ -413,10 +413,6 @@ class SingleToolCallingEnv(BaseEnv):
                     # Apply linear penalty scaling from 1.0 down to 0.0
                     scores["scores"].append(1.0 - percentage_of_range)
 
-        # Check if all scores are the same (no learning signal)
-        if all(scores["scores"][0] == score for score in scores["scores"]):
-            return None
-
         return scores
 
     async def get_next_item(self):

From adf075112cfd964ef92cf58e59ef1a26ebf67edb Mon Sep 17 00:00:00 2001
From: Partho Das <parthodas6176@gmail.com>
Date: Mon, 23 Feb 2026 22:50:38 +0530
Subject: [PATCH 2/7] re-append stop in math training path

---
 environments/math_server_zero.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/environments/math_server_zero.py b/environments/math_server_zero.py
index 43cd6a09..d543b7e7 100644
--- a/environments/math_server_zero.py
+++ b/environments/math_server_zero.py
@@ -421,6 +421,10 @@ class MathEnv(BaseEnv):
                 if self.config.mask_too_long_completions:
                     scores["overrides"][-1]["set_advantage_to_zero"] = True
             else:
+                # re-append </answer> if stripped by vLLM stop string handling
+                # (mirrors the eval path in rollout_and_score_eval)
+                if ("<answer>" in resp) and ("</answer>" not in resp):
+                    resp = resp + "</answer>"
                 task = loop.run_in_executor(self.mp_executor, score_answer, gold, resp)
                 reward = await task
                 if reward is None:

From bd98a82bbc6c2983196bab35dd6bff9629c5493a Mon Sep 17 00:00:00 2001
From: Partho Das <parthodas6176@gmail.com>
Date: Tue, 24 Feb 2026 12:21:27 +0530
Subject: [PATCH 3/7] allow serve openai overrides

---
 atroposlib/envs/base.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/atroposlib/envs/base.py b/atroposlib/envs/base.py
index e9b672a6..ab9244ae 100644
--- a/atroposlib/envs/base.py
+++ b/atroposlib/envs/base.py
@@ -1427,9 +1427,11 @@ class BaseEnv(ABC):
                 if isinstance(default_server_configs, ServerBaseline) and (
                     oai_cli_passed_args or yaml_oai_config
                 ):
-                    raise ValueError(
-                        "ServerBaseline is not compatible with OpenAI-namespaced CLI arguments. Please edit `config_init` directly or use APIServerConfig."  # noqa: E501
-                    )
+                    # If config_init provided ServerBaseline, but CLI/YAML provides OpenAI specifics,
+                    # it implies an override intent for a single server.
+                    # We use the default_openai_config_instance_for_cli (which would be a default APIServerConfig)
+                    # as the base for merging, allowing it to be fully specified by YAML/CLI.
+                    pass  # Base is already set correctly for this case
                 if (
                     isinstance(default_server_configs, list)
                     and len(default_server_configs) == 1

From 5f52befd384f100919fc5a1d7e2cd3120b79a194 Mon Sep 17 00:00:00 2001
From: Partho Das <parthodas6176@gmail.com>
Date: Tue, 3 Mar 2026 18:03:04 +0530
Subject: [PATCH 4/7] eval max_token_length consistent with training config

Use self.config.max_token_length for the eval completion instead of the
hard-coded 32765, following the pattern used by other environments.
---
 environments/math_server_zero.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environments/math_server_zero.py b/environments/math_server_zero.py
index d543b7e7..b067728e 100644
--- a/environments/math_server_zero.py
+++ b/environments/math_server_zero.py
@@ -259,7 +259,7 @@ class MathEnv(BaseEnv):
             completion = await managed.completion(
                 prompt=question,
                 n=1,
-                max_tokens=32765,
+                max_tokens=self.config.max_token_length,
                 temperature=0.0,
                 split="eval",
                 stop=stop_list,

From cd3a9163c7c447d4eb5903cabc602d0a4bdef279 Mon Sep 17 00:00:00 2001
From: Partho Das <parthodas6176@gmail.com>
Date: Sun, 8 Mar 2026 04:42:02 +0530
Subject: [PATCH 5/7] Revert "eval max_token_length consistent with training
 config"

This reverts commit 5f52befd384f100919fc5a1d7e2cd3120b79a194.
---
 environments/math_server_zero.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environments/math_server_zero.py b/environments/math_server_zero.py
index b067728e..d543b7e7 100644
--- a/environments/math_server_zero.py
+++ b/environments/math_server_zero.py
@@ -259,7 +259,7 @@ class MathEnv(BaseEnv):
             completion = await managed.completion(
                 prompt=question,
                 n=1,
-                max_tokens=self.config.max_token_length,
+                max_tokens=32765,
                 temperature=0.0,
                 split="eval",
                 stop=stop_list,

From cdc23ba5dc30e215fa4a26c3f017e695f6ff6209 Mon Sep 17 00:00:00 2001
From: Partho Das <parthodas6176@gmail.com>
Date: Sun, 8 Mar 2026 04:42:09 +0530
Subject: [PATCH 6/7] Revert "allow serve openai overrides"

This reverts commit bd98a82bbc6c2983196bab35dd6bff9629c5493a.
---
 atroposlib/envs/base.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/atroposlib/envs/base.py b/atroposlib/envs/base.py
index ab9244ae..e9b672a6 100644
--- a/atroposlib/envs/base.py
+++ b/atroposlib/envs/base.py
@@ -1427,11 +1427,9 @@ class BaseEnv(ABC):
                 if isinstance(default_server_configs, ServerBaseline) and (
                     oai_cli_passed_args or yaml_oai_config
                 ):
-                    # If config_init provided ServerBaseline, but CLI/YAML provides OpenAI specifics,
-                    # it implies an override intent for a single server.
-                    # We use the default_openai_config_instance_for_cli (which would be a default APIServerConfig)
-                    # as the base for merging, allowing it to be fully specified by YAML/CLI.
-                    pass  # Base is already set correctly for this case
+                    raise ValueError(
+                        "ServerBaseline is not compatible with OpenAI-namespaced CLI arguments. Please edit `config_init` directly or use APIServerConfig."  # noqa: E501
+                    )
                 if (
                     isinstance(default_server_configs, list)
                     and len(default_server_configs) == 1

From 632ab0161ce8039608707c56c21bb321962f7d82 Mon Sep 17 00:00:00 2001
From: Partho Das <parthodas6176@gmail.com>
Date: Tue, 10 Mar 2026 01:42:44 +0530
Subject: [PATCH 7/7] Revert "rm hardcoded same score check"

This reverts commit f02c24204d12d989600c1591ba583dd2c48b994b.
---
 .../answer_format_environment.py                          | 8 ++++++++
 environments/community/cat_behavior_env/catbot_arena.py   | 2 ++
 environments/community/dynastai/dynastai_server.py        | 5 +++++
 .../community/mcp_tool_calling/tool_calling_server.py     | 4 ++++
 .../options_iv_prediction/options_iv_prediction.py        | 4 ++++
 environments/community/pay_to_play/pay_to_play_env.py     | 4 ++++
 environments/community/physical_space_stl/physical_env.py | 3 +++
 environments/community/regex_generation/regex_env.py      | 4 ++++
 .../solitaire_winning_probability/solitaire_server.py     | 2 ++
 environments/community/sql_query_env/sql_query_env.py     | 3 +++
 .../community/wikipedia_research/tool_calling_server.py   | 4 ++++
 .../eval_environments/pairwise_judgement_environment.py   | 4 ++++
 environments/fundamental_prediction_environment.py        | 4 ++++
 environments/gsm8k_server.py                              | 2 ++
 environments/gsm8k_server_axolotl.py                      | 2 ++
 .../letter_counting_environment.py                        | 7 +++++++
 environments/math_server.py                               | 4 ++++
 environments/math_server_zero.py                          | 4 ++++
 environments/mcp_env.py                                   | 3 +++
 environments/mcqa_thinking_env.py                         | 4 ++++
 environments/text_reversal_environment.py                 | 4 ++++
 environments/tool_calling_server.py                       | 4 ++++
 22 files changed, 85 insertions(+)

diff --git a/environments/answer_format_environment/answer_format_environment.py b/environments/answer_format_environment/answer_format_environment.py
index c84d53c7..949f8f9e 100644
--- a/environments/answer_format_environment/answer_format_environment.py
+++ b/environments/answer_format_environment/answer_format_environment.py
@@ -3653,6 +3653,14 @@ class AnswerFormatEnv(BaseEnv):
                     )
                 await self._save_failed_rollouts_to_jsonl()
 
+        # Check if all scores are the same (no learning signal)
+        if all(group_scores[0] == score for score in group_scores):
+            if self.debug_logging:
+                self.logger.debug(
+                    "All scores are identical, returning None for learning signal"
+                )
+            return None
+
         # Track successful groups for equivalent ratio enforcement
         if self.ensure_equivalent_ratios:
             # Count this as a successful group if we have any successful examples
diff --git a/environments/community/cat_behavior_env/catbot_arena.py b/environments/community/cat_behavior_env/catbot_arena.py
index 8de5a843..1b2d69fd 100644
--- a/environments/community/cat_behavior_env/catbot_arena.py
+++ b/environments/community/cat_behavior_env/catbot_arena.py
@@ -285,6 +285,8 @@ class GSM8kEnv(BaseEnv):
                         percentage_of_range = min(percentage_of_range, 1.0)
                         # Apply linear penalty scaling from 1.0 down to 0.0
                         scores["scores"].append(1.0 - percentage_of_range)
+            if all([scores["scores"][0] == score for score in scores["scores"]]):
+                return None  # If all the same, we return None
             return scores
         else:
             # If the gold solution is not parseable, we return None
diff --git a/environments/community/dynastai/dynastai_server.py b/environments/community/dynastai/dynastai_server.py
index 0678e96e..1ea3af05 100755
--- a/environments/community/dynastai/dynastai_server.py
+++ b/environments/community/dynastai/dynastai_server.py
@@ -503,6 +503,11 @@ class DynastAIEnv(BaseEnv):
         for score in scores["scores"]:
             self.percent_correct_buffer.append(max(score, 0))
 
+        # Check if all the same
+        if all([score == scores["scores"][0] for score in scores["scores"]]):
+            print("[DYNASTAI] All scores identical, returning None")
+            return None  # If all the same, we return None
+
         return scores
 
     async def get_next_item(self) -> DynastAIRow:
diff --git a/environments/community/mcp_tool_calling/tool_calling_server.py b/environments/community/mcp_tool_calling/tool_calling_server.py
index 57d2cd56..3c765c72 100644
--- a/environments/community/mcp_tool_calling/tool_calling_server.py
+++ b/environments/community/mcp_tool_calling/tool_calling_server.py
@@ -404,6 +404,10 @@ class SingleToolCallingEnv(BaseEnv):
                     # Apply linear penalty scaling from 1.0 down to 0.0
                     scores["scores"].append(1.0 - percentage_of_range)
 
+        # Check if all scores are the same (no learning signal)
+        if all(scores["scores"][0] == score for score in scores["scores"]):
+            return None
+
         return scores
 
     async def get_next_item(self):
diff --git a/environments/community/options_iv_prediction/options_iv_prediction.py b/environments/community/options_iv_prediction/options_iv_prediction.py
index d771516e..16d7b9fe 100644
--- a/environments/community/options_iv_prediction/options_iv_prediction.py
+++ b/environments/community/options_iv_prediction/options_iv_prediction.py
@@ -433,6 +433,10 @@ class OptionsIVPrediction(BaseEnv):
             if len(scores["tokens"]) >= self.config.group_size:
                 break
 
+        # Return None if all scores are the same (no learning signal)
+        if all(scores["scores"][0] == score for score in scores["scores"]):
+            return None
+
         return scores
 
     async def rollout_and_score_eval(self, test_item):
diff --git a/environments/community/pay_to_play/pay_to_play_env.py b/environments/community/pay_to_play/pay_to_play_env.py
index d9760d41..1eacdb8b 100644
--- a/environments/community/pay_to_play/pay_to_play_env.py
+++ b/environments/community/pay_to_play/pay_to_play_env.py
@@ -777,6 +777,10 @@ End your evaluation with \\boxed{{score}} where score is your numerical rating.
         if hasattr(self, "last_agent_card_feedback"):
             self.last_agent_card_feedback = agent_card_feedback
 
+        # Ensure we have different scores for training signal
+        if len(set(scores["scores"])) == 1:
+            return None
+
         return scores
 
     def _extract_score_from_agent_card(self, agent_card_response: str) -> float:
diff --git a/environments/community/physical_space_stl/physical_env.py b/environments/community/physical_space_stl/physical_env.py
index 91c56672..7cedb2ea 100644
--- a/environments/community/physical_space_stl/physical_env.py
+++ b/environments/community/physical_space_stl/physical_env.py
@@ -409,6 +409,9 @@ class PhysicalEnv(BaseEnv):
                         percentage_of_range = min(percentage_of_range, 1.0)
                         scores["scores"].append(1.0 - percentage_of_range)
 
+        if all([scores["scores"][0] == score for score in scores["scores"]]):
+            return None  # If all the same, we return None
+
         return scores
 
     async def get_next_item(self) -> PhysicalRow:
diff --git a/environments/community/regex_generation/regex_env.py b/environments/community/regex_generation/regex_env.py
index 30fe311b..baa16e59 100644
--- a/environments/community/regex_generation/regex_env.py
+++ b/environments/community/regex_generation/regex_env.py
@@ -248,6 +248,10 @@ class RegexEnv(BaseEnv):
                 1.0 if s >= self.config.score_threshold else 0.0
             )
 
+        # If all scores identical, no learning signal
+        if len(set(scores["scores"])) == 1:
+            return None
+
         return scores
 
     async def rollout_and_score_eval(self, problem: dict) -> dict:
diff --git a/environments/community/solitaire_winning_probability/solitaire_server.py b/environments/community/solitaire_winning_probability/solitaire_server.py
index c106d067..e38f02f8 100644
--- a/environments/community/solitaire_winning_probability/solitaire_server.py
+++ b/environments/community/solitaire_winning_probability/solitaire_server.py
@@ -377,6 +377,8 @@ class SolitaireEnv(BaseEnv):
                         percentage_of_range = min(percentage_of_range, 1.0)
                         # Apply linear penalty scaling from 1.0 down to 0.0
                         scores["scores"].append(1.0 - percentage_of_range)
+            if all([scores["scores"][0] == score for score in scores["scores"]]):
+                return None  # If all the same, we return None
             return scores
         else:
             # If the gold solution is not parseable, we return None
diff --git a/environments/community/sql_query_env/sql_query_env.py b/environments/community/sql_query_env/sql_query_env.py
index 8f86e4c8..1e2a75f2 100644
--- a/environments/community/sql_query_env/sql_query_env.py
+++ b/environments/community/sql_query_env/sql_query_env.py
@@ -408,6 +408,9 @@ class SQLQueryEnv(BaseEnv):
                     percentage_of_range = min(percentage_of_range, 1.0)
                     scores["scores"].append(1.0 - percentage_of_range)
 
+        if all([scores["scores"][0] == score for score in scores["scores"]]):
+            return None  # If all the same, return None
+
         return scores
 
     async def get_next_item(self) -> WikiSQLRow:
diff --git a/environments/community/wikipedia_research/tool_calling_server.py b/environments/community/wikipedia_research/tool_calling_server.py
index e51cba19..e14bc717 100644
--- a/environments/community/wikipedia_research/tool_calling_server.py
+++ b/environments/community/wikipedia_research/tool_calling_server.py
@@ -404,6 +404,10 @@ class SingleToolCallingEnv(BaseEnv):
                     # Apply linear penalty scaling from 1.0 down to 0.0
                     scores["scores"].append(1.0 - percentage_of_range)
 
+        # Check if all scores are the same (no learning signal)
+        if all(scores["scores"][0] == score for score in scores["scores"]):
+            return None
+
         return scores
 
     async def get_next_item(self):
diff --git a/environments/eval_environments/pairwise_judgement_environment.py b/environments/eval_environments/pairwise_judgement_environment.py
index 6fc894b3..06afae54 100644
--- a/environments/eval_environments/pairwise_judgement_environment.py
+++ b/environments/eval_environments/pairwise_judgement_environment.py
@@ -1112,6 +1112,10 @@ class PairwiseJudgementEnv(BaseEnv):
             for score in scores["scores"]:
                 self.percent_correct_buffer.append(max(score, 0))
 
+            # Return None if all scores are the same (no learning signal)
+            if len(set(scores["scores"])) == 1:
+                return None
+
             return scores
 
         except Exception as e:
diff --git a/environments/fundamental_prediction_environment.py b/environments/fundamental_prediction_environment.py
index eeaaed98..9c359306 100644
--- a/environments/fundamental_prediction_environment.py
+++ b/environments/fundamental_prediction_environment.py
@@ -402,6 +402,10 @@ class FundamentalPredictionEnv(BaseEnv):
             if len(scores["tokens"]) >= self.config.group_size:
                 break
 
+        # Return None if all scores are the same (no learning signal)
+        if all(scores["scores"][0] == score for score in scores["scores"]):
+            return None
+
         return scores
 
     async def rollout_and_score_eval(self, test_item):
diff --git a/environments/gsm8k_server.py b/environments/gsm8k_server.py
index 8b3589da..6ae5285b 100644
--- a/environments/gsm8k_server.py
+++ b/environments/gsm8k_server.py
@@ -352,6 +352,8 @@ class GSM8kEnv(BaseEnv):
                         percentage_of_range = min(percentage_of_range, 1.0)
                         # Apply linear penalty scaling from 1.0 down to 0.0
                         scores["scores"].append(1.0 - percentage_of_range)
+            if all([scores["scores"][0] == score for score in scores["scores"]]):
+                return None  # If all the same, we return None
             return scores
         else:
             # If the gold solution is not parseable, we return None
diff --git a/environments/gsm8k_server_axolotl.py b/environments/gsm8k_server_axolotl.py
index 571eb609..244861f8 100644
--- a/environments/gsm8k_server_axolotl.py
+++ b/environments/gsm8k_server_axolotl.py
@@ -283,6 +283,8 @@ class GSM8kEnv(BaseEnv):
                         percentage_of_range = min(percentage_of_range, 1.0)
                         # Apply linear penalty scaling from 1.0 down to 0.0
                         scores["scores"].append(1.0 - percentage_of_range)
+            if all([scores["scores"][0] == score for score in scores["scores"]]):
+                return None  # If all the same, we return None
             return scores
         else:
             # If the gold solution is not parseable, we return None
diff --git a/environments/letter_counting_environment/letter_counting_environment.py b/environments/letter_counting_environment/letter_counting_environment.py
index 6f4f9cf1..2398373b 100644
--- a/environments/letter_counting_environment/letter_counting_environment.py
+++ b/environments/letter_counting_environment/letter_counting_environment.py
@@ -1248,6 +1248,13 @@ class LetterCountingEnv(BaseEnv):
             )
             return None
 
+        # Skip if all scores are identical (no learning signal)
+        if all(scores["scores"][0] == score for score in scores["scores"]):
+            self.logger.debug(
+                f"All scores identical ({scores['scores'][0]:.2f}) - skipping group"
+            )
+            return None
+
         return scores
 
     async def rollout_and_score_eval(self, eval_item: Dict) -> Tuple[int, int]:
diff --git a/environments/math_server.py b/environments/math_server.py
index b0dafbfd..5b343b9f 100644
--- a/environments/math_server.py
+++ b/environments/math_server.py
@@ -534,6 +534,8 @@ class MathEnv(BaseEnv):
                         return None, to_backlog
                 else:
                     return None, to_backlog
+            else:
+                return None, to_backlog
         else:
             self.normal_rollouts.append(
                 (
@@ -1165,6 +1167,8 @@ class MathEnv(BaseEnv):
                     "Max message delta is less than 0.1 * shortest message, no length penalty"
                 )
                 return None, []
+        elif all([score == scores["scores"][0] for score in scores["scores"]]):
+            return None, []
         if len(for_table) > 0:
             self.judge_rollouts.append(for_table)
             if len(self.judge_rollouts) >= self.config.num_rollouts_to_keep:
diff --git a/environments/math_server_zero.py b/environments/math_server_zero.py
index d543b7e7..2133cebb 100644
--- a/environments/math_server_zero.py
+++ b/environments/math_server_zero.py
@@ -383,6 +383,10 @@ class MathEnv(BaseEnv):
         to_postprocess = await self.score(to_score)
         if to_postprocess is None:
             return None, to_backlog
+        if all(
+            [to_postprocess["scores"][0] == score for score in to_postprocess["scores"]]
+        ):
+            return None, to_backlog
         self.normal_rollouts.append(
             (
                 prompt_format.format(prompt=problem_format.format(problem=item[0])),
diff --git a/environments/mcp_env.py b/environments/mcp_env.py
index f8892cb1..a8d621bc 100644
--- a/environments/mcp_env.py
+++ b/environments/mcp_env.py
@@ -347,6 +347,9 @@ class McpEnv(BaseEnv):
         for score in scores["scores"]:
             self.percent_correct_buffer.append(max(score, 0))
 
+        if all(scores["scores"][0] == score for score in scores["scores"]):
+            return None
+
         return scores
 
     async def get_next_item(self):
diff --git a/environments/mcqa_thinking_env.py b/environments/mcqa_thinking_env.py
index 9738a85e..417822f6 100644
--- a/environments/mcqa_thinking_env.py
+++ b/environments/mcqa_thinking_env.py
@@ -373,6 +373,10 @@ class MCQAThinkingEnv(BaseEnv):
         for score in scores["scores"]:
             self.percent_correct_buffer.append(max(score, 0))
 
+        # Return None if all scores are the same (no learning signal)
+        if all(scores["scores"][0] == score for score in scores["scores"]):
+            return None
+
         return scores
 
     async def rollout_and_score_eval(self, test_item):
diff --git a/environments/text_reversal_environment.py b/environments/text_reversal_environment.py
index 738cb663..0b41e2ec 100644
--- a/environments/text_reversal_environment.py
+++ b/environments/text_reversal_environment.py
@@ -770,6 +770,10 @@ class TextReversalEnv(BaseEnv):
             for score in scores["scores"]:
                 self.percent_correct_buffer.append(max(score, 0))
 
+            # Return None if all scores are the same (no learning signal)
+            if len(set(scores["scores"])) == 1:
+                return None
+
             return scores
 
         except Exception as e:
diff --git a/environments/tool_calling_server.py b/environments/tool_calling_server.py
index e6824118..10409417 100644
--- a/environments/tool_calling_server.py
+++ b/environments/tool_calling_server.py
@@ -413,6 +413,10 @@ class SingleToolCallingEnv(BaseEnv):
                     # Apply linear penalty scaling from 1.0 down to 0.0
                     scores["scores"].append(1.0 - percentage_of_range)
 
+        # Check if all scores are the same (no learning signal)
+        if all(scores["scores"][0] == score for score in scores["scores"]):
+            return None
+
         return scores
 
     async def get_next_item(self):