mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-19 12:57:58 +00:00
parent
cdc23ba5dc
commit
632ab0161c
22 changed files with 85 additions and 0 deletions
|
|
@@ -3653,6 +3653,14 @@ class AnswerFormatEnv(BaseEnv):
|
|||
)
|
||||
await self._save_failed_rollouts_to_jsonl()
|
||||
|
||||
# Check if all scores are the same (no learning signal)
|
||||
if all(group_scores[0] == score for score in group_scores):
|
||||
if self.debug_logging:
|
||||
self.logger.debug(
|
||||
"All scores are identical, returning None for learning signal"
|
||||
)
|
||||
return None
|
||||
|
||||
# Track successful groups for equivalent ratio enforcement
|
||||
if self.ensure_equivalent_ratios:
|
||||
# Count this as a successful group if we have any successful examples
|
||||
|
|
|
|||
|
|
@@ -285,6 +285,8 @@ class GSM8kEnv(BaseEnv):
|
|||
percentage_of_range = min(percentage_of_range, 1.0)
|
||||
# Apply linear penalty scaling from 1.0 down to 0.0
|
||||
scores["scores"].append(1.0 - percentage_of_range)
|
||||
if all([scores["scores"][0] == score for score in scores["scores"]]):
|
||||
return None # If all the same, we return None
|
||||
return scores
|
||||
else:
|
||||
# If the gold solution is not parseable, we return None
|
||||
|
|
|
|||
|
|
@@ -503,6 +503,11 @@ class DynastAIEnv(BaseEnv):
|
|||
for score in scores["scores"]:
|
||||
self.percent_correct_buffer.append(max(score, 0))
|
||||
|
||||
# Check if all the same
|
||||
if all([score == scores["scores"][0] for score in scores["scores"]]):
|
||||
print("[DYNASTAI] All scores identical, returning None")
|
||||
return None # If all the same, we return None
|
||||
|
||||
return scores
|
||||
|
||||
async def get_next_item(self) -> DynastAIRow:
|
||||
|
|
|
|||
|
|
@@ -404,6 +404,10 @@ class SingleToolCallingEnv(BaseEnv):
|
|||
# Apply linear penalty scaling from 1.0 down to 0.0
|
||||
scores["scores"].append(1.0 - percentage_of_range)
|
||||
|
||||
# Check if all scores are the same (no learning signal)
|
||||
if all(scores["scores"][0] == score for score in scores["scores"]):
|
||||
return None
|
||||
|
||||
return scores
|
||||
|
||||
async def get_next_item(self):
|
||||
|
|
|
|||
|
|
@@ -433,6 +433,10 @@ class OptionsIVPrediction(BaseEnv):
|
|||
if len(scores["tokens"]) >= self.config.group_size:
|
||||
break
|
||||
|
||||
# Return None if all scores are the same (no learning signal)
|
||||
if all(scores["scores"][0] == score for score in scores["scores"]):
|
||||
return None
|
||||
|
||||
return scores
|
||||
|
||||
async def rollout_and_score_eval(self, test_item):
|
||||
|
|
|
|||
|
|
@@ -777,6 +777,10 @@ End your evaluation with \\boxed{{score}} where score is your numerical rating.
|
|||
if hasattr(self, "last_agent_card_feedback"):
|
||||
self.last_agent_card_feedback = agent_card_feedback
|
||||
|
||||
# Ensure we have different scores for training signal
|
||||
if len(set(scores["scores"])) == 1:
|
||||
return None
|
||||
|
||||
return scores
|
||||
|
||||
def _extract_score_from_agent_card(self, agent_card_response: str) -> float:
|
||||
|
|
|
|||
|
|
@@ -409,6 +409,9 @@ class PhysicalEnv(BaseEnv):
|
|||
percentage_of_range = min(percentage_of_range, 1.0)
|
||||
scores["scores"].append(1.0 - percentage_of_range)
|
||||
|
||||
if all([scores["scores"][0] == score for score in scores["scores"]]):
|
||||
return None # If all the same, we return None
|
||||
|
||||
return scores
|
||||
|
||||
async def get_next_item(self) -> PhysicalRow:
|
||||
|
|
|
|||
|
|
@@ -248,6 +248,10 @@ class RegexEnv(BaseEnv):
|
|||
1.0 if s >= self.config.score_threshold else 0.0
|
||||
)
|
||||
|
||||
# If all scores identical, no learning signal
|
||||
if len(set(scores["scores"])) == 1:
|
||||
return None
|
||||
|
||||
return scores
|
||||
|
||||
async def rollout_and_score_eval(self, problem: dict) -> dict:
|
||||
|
|
|
|||
|
|
@@ -377,6 +377,8 @@ class SolitaireEnv(BaseEnv):
|
|||
percentage_of_range = min(percentage_of_range, 1.0)
|
||||
# Apply linear penalty scaling from 1.0 down to 0.0
|
||||
scores["scores"].append(1.0 - percentage_of_range)
|
||||
if all([scores["scores"][0] == score for score in scores["scores"]]):
|
||||
return None # If all the same, we return None
|
||||
return scores
|
||||
else:
|
||||
# If the gold solution is not parseable, we return None
|
||||
|
|
|
|||
|
|
@@ -408,6 +408,9 @@ class SQLQueryEnv(BaseEnv):
|
|||
percentage_of_range = min(percentage_of_range, 1.0)
|
||||
scores["scores"].append(1.0 - percentage_of_range)
|
||||
|
||||
if all([scores["scores"][0] == score for score in scores["scores"]]):
|
||||
return None # If all the same, return None
|
||||
|
||||
return scores
|
||||
|
||||
async def get_next_item(self) -> WikiSQLRow:
|
||||
|
|
|
|||
|
|
@@ -404,6 +404,10 @@ class SingleToolCallingEnv(BaseEnv):
|
|||
# Apply linear penalty scaling from 1.0 down to 0.0
|
||||
scores["scores"].append(1.0 - percentage_of_range)
|
||||
|
||||
# Check if all scores are the same (no learning signal)
|
||||
if all(scores["scores"][0] == score for score in scores["scores"]):
|
||||
return None
|
||||
|
||||
return scores
|
||||
|
||||
async def get_next_item(self):
|
||||
|
|
|
|||
|
|
@@ -1112,6 +1112,10 @@ class PairwiseJudgementEnv(BaseEnv):
|
|||
for score in scores["scores"]:
|
||||
self.percent_correct_buffer.append(max(score, 0))
|
||||
|
||||
# Return None if all scores are the same (no learning signal)
|
||||
if len(set(scores["scores"])) == 1:
|
||||
return None
|
||||
|
||||
return scores
|
||||
|
||||
except Exception as e:
|
||||
|
|
|
|||
|
|
@@ -402,6 +402,10 @@ class FundamentalPredictionEnv(BaseEnv):
|
|||
if len(scores["tokens"]) >= self.config.group_size:
|
||||
break
|
||||
|
||||
# Return None if all scores are the same (no learning signal)
|
||||
if all(scores["scores"][0] == score for score in scores["scores"]):
|
||||
return None
|
||||
|
||||
return scores
|
||||
|
||||
async def rollout_and_score_eval(self, test_item):
|
||||
|
|
|
|||
|
|
@@ -352,6 +352,8 @@ class GSM8kEnv(BaseEnv):
|
|||
percentage_of_range = min(percentage_of_range, 1.0)
|
||||
# Apply linear penalty scaling from 1.0 down to 0.0
|
||||
scores["scores"].append(1.0 - percentage_of_range)
|
||||
if all([scores["scores"][0] == score for score in scores["scores"]]):
|
||||
return None # If all the same, we return None
|
||||
return scores
|
||||
else:
|
||||
# If the gold solution is not parseable, we return None
|
||||
|
|
|
|||
|
|
@@ -283,6 +283,8 @@ class GSM8kEnv(BaseEnv):
|
|||
percentage_of_range = min(percentage_of_range, 1.0)
|
||||
# Apply linear penalty scaling from 1.0 down to 0.0
|
||||
scores["scores"].append(1.0 - percentage_of_range)
|
||||
if all([scores["scores"][0] == score for score in scores["scores"]]):
|
||||
return None # If all the same, we return None
|
||||
return scores
|
||||
else:
|
||||
# If the gold solution is not parseable, we return None
|
||||
|
|
|
|||
|
|
@@ -1248,6 +1248,13 @@ class LetterCountingEnv(BaseEnv):
|
|||
)
|
||||
return None
|
||||
|
||||
# Skip if all scores are identical (no learning signal)
|
||||
if all(scores["scores"][0] == score for score in scores["scores"]):
|
||||
self.logger.debug(
|
||||
f"All scores identical ({scores['scores'][0]:.2f}) - skipping group"
|
||||
)
|
||||
return None
|
||||
|
||||
return scores
|
||||
|
||||
async def rollout_and_score_eval(self, eval_item: Dict) -> Tuple[int, int]:
|
||||
|
|
|
|||
|
|
@@ -534,6 +534,8 @@ class MathEnv(BaseEnv):
|
|||
return None, to_backlog
|
||||
else:
|
||||
return None, to_backlog
|
||||
else:
|
||||
return None, to_backlog
|
||||
else:
|
||||
self.normal_rollouts.append(
|
||||
(
|
||||
|
|
@@ -1165,6 +1167,8 @@ class MathEnv(BaseEnv):
|
|||
"Max message delta is less than 0.1 * shortest message, no length penalty"
|
||||
)
|
||||
return None, []
|
||||
elif all([score == scores["scores"][0] for score in scores["scores"]]):
|
||||
return None, []
|
||||
if len(for_table) > 0:
|
||||
self.judge_rollouts.append(for_table)
|
||||
if len(self.judge_rollouts) >= self.config.num_rollouts_to_keep:
|
||||
|
|
|
|||
|
|
@@ -383,6 +383,10 @@ class MathEnv(BaseEnv):
|
|||
to_postprocess = await self.score(to_score)
|
||||
if to_postprocess is None:
|
||||
return None, to_backlog
|
||||
if all(
|
||||
[to_postprocess["scores"][0] == score for score in to_postprocess["scores"]]
|
||||
):
|
||||
return None, to_backlog
|
||||
self.normal_rollouts.append(
|
||||
(
|
||||
prompt_format.format(prompt=problem_format.format(problem=item[0])),
|
||||
|
|
|
|||
|
|
@@ -347,6 +347,9 @@ class McpEnv(BaseEnv):
|
|||
for score in scores["scores"]:
|
||||
self.percent_correct_buffer.append(max(score, 0))
|
||||
|
||||
if all(scores["scores"][0] == score for score in scores["scores"]):
|
||||
return None
|
||||
|
||||
return scores
|
||||
|
||||
async def get_next_item(self):
|
||||
|
|
|
|||
|
|
@@ -373,6 +373,10 @@ class MCQAThinkingEnv(BaseEnv):
|
|||
for score in scores["scores"]:
|
||||
self.percent_correct_buffer.append(max(score, 0))
|
||||
|
||||
# Return None if all scores are the same (no learning signal)
|
||||
if all(scores["scores"][0] == score for score in scores["scores"]):
|
||||
return None
|
||||
|
||||
return scores
|
||||
|
||||
async def rollout_and_score_eval(self, test_item):
|
||||
|
|
|
|||
|
|
@@ -770,6 +770,10 @@ class TextReversalEnv(BaseEnv):
|
|||
for score in scores["scores"]:
|
||||
self.percent_correct_buffer.append(max(score, 0))
|
||||
|
||||
# Return None if all scores are the same (no learning signal)
|
||||
if len(set(scores["scores"])) == 1:
|
||||
return None
|
||||
|
||||
return scores
|
||||
|
||||
except Exception as e:
|
||||
|
|
|
|||
|
|
@@ -413,6 +413,10 @@ class SingleToolCallingEnv(BaseEnv):
|
|||
# Apply linear penalty scaling from 1.0 down to 0.0
|
||||
scores["scores"].append(1.0 - percentage_of_range)
|
||||
|
||||
# Check if all scores are the same (no learning signal)
|
||||
if all(scores["scores"][0] == score for score in scores["scores"]):
|
||||
return None
|
||||
|
||||
return scores
|
||||
|
||||
async def get_next_item(self):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue