[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
This commit is contained in:
pre-commit-ci[bot] 2025-12-24 10:48:20 +00:00
parent ef9c0c3699
commit afab28dfa9
37 changed files with 4868 additions and 4052 deletions

View file

@ -41,16 +41,15 @@ from typing import Any, Dict, List, Optional, Tuple
import numpy as np
import openai
import scipy.stats
from pydantic import Field
from tqdm.asyncio import tqdm_asyncio
from atroposlib.envs.base import BaseEnv, BaseEnvConfig, APIServerConfig
from eval_helpers import (
create_system_content,
get_default_thinking_prompt,
save_eval_results,
)
from pydantic import Field
from tqdm.asyncio import tqdm_asyncio
from atroposlib.envs.base import APIServerConfig, BaseEnv, BaseEnvConfig
# Directory holding the JudgeMark data files, resolved relative to this file:
# <three path levels above this file>/Judgemark-v2/data
JUDGEMARK_DATA_DIR = Path(__file__).parent.parent.parent / "Judgemark-v2" / "data"
@ -80,9 +79,15 @@ REFERENCE_MODEL_SCORES = {
# Criteria for which a high score signals a flaw (higher = worse writing).
# Scores for these criteria get inverted during aggregation; entries are
# compared against lower-cased, stripped criterion names.
NEGATIVE_MARKERS = [
    "meandering",
    "weak dialogue",
    "tell-don't-show",
    "unsurprising or uncreative",
    "amateurish",
    "purple prose",
    "overwrought",
    "incongruent ending positivity",
    "unearned transformations",
]
@ -90,50 +95,49 @@ NEGATIVE_MARKERS = [
# Scoring Functions (from Judgemark-v2/core/scoring.py)
# =============================================================================
def parse_scores(judge_response: str) -> Dict[str, float]:
    """
    Extract ``metric: value`` score lines from a judge's reply.

    The pattern is anchored to line starts and ends (``re.MULTILINE``). The
    metric name may be wrapped in ``**`` and the number may be wrapped in
    ``[...]`` and/or ``**``, so all of these are recognised:
        **Quality:** 7.5
        Quality: 7.5
        **Quality:** [7.5]

    Returns a mapping from the stripped metric name to its score as a float.
    When a metric name occurs more than once, the last occurrence wins.
    Text that does not fit the pattern is ignored.
    """
    score_line = re.compile(
        r"^\s*"
        r"(?:\*\*)?([^\n:\[\]]{2,100}):(?:\*\*)?"  # metric name, optional bold
        r"\s*"
        r"(?:\[)?(?:\*\*)?"  # optional opening bracket / bold
        r"(-?\d+(?:\.\d+)?)"  # signed integer or decimal score
        r"(?:\*\*)?(?:\])?"  # optional closing bold / bracket
        r"\s*$",
        re.MULTILINE,
    )

    parsed: Dict[str, float] = {}
    for hit in score_line.finditer(judge_response):
        name, number = hit.groups()
        parsed[name.strip()] = float(number)
    return parsed
def compute_raw_score(
scores: Dict[str, float],
scoring_min: float = 0,
scoring_max: float = 10
scores: Dict[str, float], scoring_min: float = 0, scoring_max: float = 10
) -> Optional[float]:
"""
Compute aggregated raw score from parsed criterion scores.
- Filters to valid range [min, max]
- Inverts negative criteria (e.g., "purple prose" where high = bad)
- Averages and scales to 1-10 range
"""
valid_scores = {k: v for k, v in scores.items() if scoring_min <= v <= scoring_max}
if len(valid_scores) < 5:
return None
total = 0.0
count = 0
for criteria, val in valid_scores.items():
crit_lower = criteria.lower().strip()
if crit_lower in NEGATIVE_MARKERS:
@ -143,15 +147,15 @@ def compute_raw_score(
new_val = val
total += new_val
count += 1
avg = total / count
# Scale to 1-10 range
if scoring_max == scoring_min:
scaled = 1
else:
scaled = 1 + (avg - scoring_min) * (9 / (scoring_max - scoring_min))
return round(scaled, 2)
@ -183,8 +187,7 @@ def compute_detailed_distribution(scores: List[float]) -> Dict:
def build_landmark_calibration_config(
scores: List[float],
desired_points: List[float] = None
scores: List[float], desired_points: List[float] = None
) -> Dict:
"""
Build piecewise-linear calibration from raw distribution landmarks.
@ -192,16 +195,16 @@ def build_landmark_calibration_config(
"""
if not scores or len(scores) < 2:
return {"method": "piecewise_landmark", "in_landmarks": [], "out_landmarks": []}
if desired_points is None:
desired_points = [0, 3, 5, 7, 10]
in_min = min(scores)
in_q1 = float(np.percentile(scores, 25))
in_med = float(statistics.median(scores))
in_q3 = float(np.percentile(scores, 75))
in_max = max(scores)
return {
"method": "piecewise_landmark",
"in_landmarks": [in_min, in_q1, in_med, in_q3, in_max],
@ -213,19 +216,19 @@ def apply_landmark_calibration(x: float, config: Dict) -> float:
"""Apply piecewise-linear calibration transform."""
inL = config.get("in_landmarks", [])
outL = config.get("out_landmarks", [])
if len(inL) != 5 or len(outL) != 5:
return x
in_min, in_q1, in_med, in_q3, in_max = inL
out_min, out_q1, out_med, out_q3, out_max = outL
def linear_map(val, old_lo, old_hi, new_lo, new_hi):
if abs(old_hi - old_lo) < 1e-12:
return new_lo
frac = (val - old_lo) / (old_hi - old_lo)
return new_lo + frac * (new_hi - new_lo)
if x <= in_q1:
return linear_map(x, in_min, in_q1, out_min, out_q1)
elif x <= in_med:
@ -236,17 +239,19 @@ def apply_landmark_calibration(x: float, config: Dict) -> float:
return linear_map(x, in_q3, in_max, out_q3, out_max)
def normalize(
    val: float, min_val: float, max_val: float, ascending: bool = True
) -> float:
    """Map ``val`` from ``[min_val, max_val]`` onto ``[0, 1]``, clamped.

    A degenerate range (``max_val == min_val``) yields 0.5. With
    ``ascending=False`` the result is flipped, so ``min_val`` maps to 1.0
    and ``max_val`` maps to 0.0.
    """
    # Degenerate range: there is no scale to place the value on.
    if max_val == min_val:
        return 0.5

    fraction = (val - min_val) / (max_val - min_val)
    # Clamp to the unit interval: cap at 1.0 first, then floor at 0.0.
    capped = min(1.0, fraction)
    clamped = max(0.0, capped)

    return clamped if ascending else 1.0 - clamped
@ -254,72 +259,64 @@ def normalize(val: float, min_val: float, max_val: float, ascending: bool = True
# JudgeMark Configuration
# =============================================================================
class JudgeMarkEvalConfig(BaseEnvConfig):
    """JudgeMark v2 evaluation configuration.

    Extends ``BaseEnvConfig`` with the judge-specific settings used by this
    module: input data locations, the valid score range, generation
    parameters, optional thinking/system prompts, retry behaviour, debug
    output, and an optional cap on the number of samples.
    """

    # Data files (defaults to bundled Judgemark-v2 data)
    samples_file: str = Field(
        default="", description="Path to samples JSON (uses bundled if empty)"
    )
    prompts_file: str = Field(
        default="", description="Path to prompts JSON (uses bundled if empty)"
    )

    # Scoring settings
    scoring_min: float = Field(default=0, description="Minimum score value")
    scoring_max: float = Field(default=10, description="Maximum score value")

    # Generation settings
    eval_max_tokens: int = Field(
        default=0, description="Max tokens for judge response (0 = model default)"
    )
    eval_temperature: float = Field(
        default=0.0, description="Temperature for judge model"
    )

    # Thinking mode (optional - can help reasoning about scores)
    thinking_mode: bool = Field(
        default=False, description="Enable thinking mode for judge"
    )
    # Both prompts are forwarded to create_system_content() when the judge's
    # system message is built; they now carry descriptions like every other
    # field so they are documented in the generated schema.
    custom_thinking_prompt: Optional[str] = Field(
        default=None,
        description="Optional custom thinking prompt for the judge system message",
    )
    custom_system_prompt: Optional[str] = Field(
        default=None,
        description="Optional custom system prompt for the judge system message",
    )

    # Retry settings
    max_retries: int = Field(default=3, description="Max retries on API failure")
    retry_delay: float = Field(default=1.0, description="Delay between retries")

    # Debug
    full_debug: bool = Field(default=False, description="Save full judge responses")

    # Subset filtering (optional)
    max_samples: Optional[int] = Field(
        default=None, description="Limit number of samples to evaluate (None = all)"
    )
class JudgeMarkEvalEnv(BaseEnv):
"""JudgeMark v2 evaluation environment."""
name = "judgemark_eval"
def __init__(
self,
config: JudgeMarkEvalConfig,
server_configs: List[APIServerConfig],
slurm_config=None
slurm_config=None,
):
super().__init__(config, server_configs, slurm_config)
self.config: JudgeMarkEvalConfig = config
# Initialize OpenAI client
server_config = server_configs[0]
self.client = openai.AsyncOpenAI(
@ -327,13 +324,13 @@ class JudgeMarkEvalEnv(BaseEnv):
base_url=server_config.base_url,
)
self.model_name = server_config.model_name
# Storage for results
self.samples_data = {}
self.judge_prompts = {}
self.rubric_criteria = ""
self.score_anchoring = ""
@classmethod
def config_init(cls) -> Tuple[JudgeMarkEvalConfig, List[APIServerConfig]]:
"""Initialize default configuration."""
@ -343,17 +340,19 @@ class JudgeMarkEvalEnv(BaseEnv):
use_wandb=True,
wandb_name="judgemark_eval",
),
[APIServerConfig(
model_name="gpt-4o",
base_url="https://api.openai.com/v1",
api_key=os.environ.get("OPENAI_API_KEY", ""),
)]
[
APIServerConfig(
model_name="gpt-4o",
base_url="https://api.openai.com/v1",
api_key=os.environ.get("OPENAI_API_KEY", ""),
)
],
)
async def setup(self):
"""Load JudgeMark data files."""
print(f"\nLoading JudgeMark v2 data...")
# Determine data directory
data_dir = JUDGEMARK_DATA_DIR
if not data_dir.exists():
@ -361,69 +360,71 @@ class JudgeMarkEvalEnv(BaseEnv):
f"JudgeMark data not found at {data_dir}. "
"Please clone Judgemark-v2 into the atropos root directory."
)
# Load samples
samples_path = (
Path(self.config.samples_file) if self.config.samples_file
Path(self.config.samples_file)
if self.config.samples_file
else data_dir / "judgemark_v3_samples_3_iter.json"
)
with open(samples_path) as f:
self.samples_data = json.load(f)
# Load prompts
prompts_path = (
Path(self.config.prompts_file) if self.config.prompts_file
Path(self.config.prompts_file)
if self.config.prompts_file
else data_dir / "judge_prompts_v3_noref_nocot_noanchor_x96.json"
)
with open(prompts_path) as f:
self.judge_prompts = json.load(f)
# Load rubric files
with open(data_dir / "rubric_criteria.txt") as f:
self.rubric_criteria = f.read()
with open(data_dir / "rubric_score_anchoring.txt") as f:
self.score_anchoring = f.read()
# Inject rubric into prompts
for key, prompt in self.judge_prompts.items():
if isinstance(prompt, str):
prompt = prompt.replace("<RUBRIC_CRITERIA>", self.rubric_criteria)
prompt = prompt.replace("<SCORE_ANCHORING>", self.score_anchoring)
self.judge_prompts[key] = prompt
# Count total samples
total_samples = sum(
len(items)
len(items)
for model_info in self.samples_data.values()
for items in model_info.get("samples", {}).values()
)
print(f" Writer models: {len(self.samples_data)}")
print(f" Total samples: {total_samples}")
print(f" Judge prompts: {len(self.judge_prompts)}")
if self.config.max_samples:
print(f" Limiting to: {self.config.max_samples} samples")
def _create_system_content(self) -> Optional[str]:
    """Build the judge's system message from the configured prompt settings.

    Delegates to ``create_system_content`` with the thinking-mode flag and
    the two optional custom prompts, and returns its result unchanged.
    """
    cfg = self.config
    return create_system_content(
        cfg.thinking_mode,
        cfg.custom_thinking_prompt,
        cfg.custom_system_prompt,
    )
async def _send_to_judge(self, prompt: str) -> str:
"""Send prompt to judge model and get response."""
system_content = self._create_system_content()
messages = []
if system_content:
messages.append({"role": "system", "content": system_content})
messages.append({"role": "user", "content": prompt})
for attempt in range(self.config.max_retries):
try:
kwargs = {
@ -433,25 +434,25 @@ class JudgeMarkEvalEnv(BaseEnv):
}
if self.config.eval_max_tokens > 0:
kwargs["max_tokens"] = self.config.eval_max_tokens
response = await self.client.chat.completions.create(**kwargs)
return response.choices[0].message.content or ""
except Exception as e:
if attempt < self.config.max_retries - 1:
await asyncio.sleep(self.config.retry_delay * (attempt + 1))
else:
raise e
return ""
async def _evaluate_single_sample(
self,
model_name: str,
iteration_key: str,
item_id: str,
item_text: str,
prompt_template: str
prompt_template: str,
) -> Dict:
"""Evaluate a single writing sample."""
result = {
@ -463,78 +464,81 @@ class JudgeMarkEvalEnv(BaseEnv):
"aggregated_score_raw": None,
"error": None,
}
try:
# Build the full prompt
final_prompt = prompt_template.replace(
"[TEST MODEL RESPONSE]",
"[TEST MODEL RESPONSE]\n" + item_text
"[TEST MODEL RESPONSE]", "[TEST MODEL RESPONSE]\n" + item_text
)
# Get judge response
judge_response = await self._send_to_judge(final_prompt)
if self.config.full_debug:
result["judge_response"] = judge_response
# Parse scores
extracted_scores = parse_scores(judge_response)
result["parsed_scores"] = extracted_scores
# Compute raw score
raw_score = compute_raw_score(
extracted_scores,
self.config.scoring_min,
self.config.scoring_max
extracted_scores, self.config.scoring_min, self.config.scoring_max
)
result["aggregated_score_raw"] = raw_score
if raw_score is None:
result["error"] = f"Only {len(extracted_scores)} valid scores parsed (need 5+)"
result["error"] = (
f"Only {len(extracted_scores)} valid scores parsed (need 5+)"
)
except Exception as e:
result["error"] = f"{type(e).__name__}: {str(e)}"
if self.config.full_debug:
result["traceback"] = traceback.format_exc()
return result
def _compute_cross_model_stats(
self,
scores_by_model: Dict[str, List[float]]
self, scores_by_model: Dict[str, List[float]]
) -> Dict:
"""Compute cross-model statistics including correlation with reference."""
arrays = list(scores_by_model.values())
if len(arrays) < 2:
return {
"anova_f": 0, "anova_p": 1,
"kw_stat": 0, "kw_p": 1,
"anova_f": 0,
"anova_p": 1,
"kw_stat": 0,
"kw_p": 1,
"std_dev_across_models": 0,
"pearson_r": 0, "kendall_tau": 0,
"pearson_r": 0,
"kendall_tau": 0,
}
# ANOVA and Kruskal-Wallis
f_stat, f_p = scipy.stats.f_oneway(*arrays)
kw_stat, kw_p = scipy.stats.kruskal(*arrays)
# Std across model means
model_means = [statistics.mean(scores) for scores in arrays]
std_across = statistics.pstdev(model_means)
# Correlation with reference rankings
ref_pairs = []
for model, scores in scores_by_model.items():
if model in REFERENCE_MODEL_SCORES:
ref_pairs.append((statistics.mean(scores), REFERENCE_MODEL_SCORES[model]))
ref_pairs.append(
(statistics.mean(scores), REFERENCE_MODEL_SCORES[model])
)
if len(ref_pairs) >= 2:
means, refs = zip(*ref_pairs)
pearson_r, _ = scipy.stats.pearsonr(means, refs)
kendall_tau, _ = scipy.stats.kendalltau(means, refs)
else:
pearson_r, kendall_tau = 0.0, 0.0
return {
"anova_f": f_stat,
"anova_p": f_p,
@ -545,7 +549,7 @@ class JudgeMarkEvalEnv(BaseEnv):
"kendall_tau": kendall_tau,
"num_models_with_reference": len(ref_pairs),
}
async def evaluate(self):
"""Run the full JudgeMark evaluation."""
print(f"\n{'='*60}")
@ -554,7 +558,7 @@ class JudgeMarkEvalEnv(BaseEnv):
print(f" Judge model: {self.model_name}")
print(f" Thinking mode: {self.config.thinking_mode}")
print(f"{'='*60}\n")
# Build list of items to process
items_to_process = []
for model_name, model_info in self.samples_data.items():
@ -563,21 +567,23 @@ class JudgeMarkEvalEnv(BaseEnv):
for item_id, item_text in iteration_items.items():
if item_id not in self.judge_prompts:
continue
items_to_process.append({
"model_name": model_name,
"iteration_key": iteration_key,
"item_id": item_id,
"item_text": item_text,
"prompt_template": self.judge_prompts[item_id],
})
items_to_process.append(
{
"model_name": model_name,
"iteration_key": iteration_key,
"item_id": item_id,
"item_text": item_text,
"prompt_template": self.judge_prompts[item_id],
}
)
# Apply sample limit if specified
if self.config.max_samples:
items_to_process = items_to_process[:self.config.max_samples]
items_to_process = items_to_process[: self.config.max_samples]
print(f"Processing {len(items_to_process)} samples...\n")
# Process all samples
tasks = [
self._evaluate_single_sample(
@ -589,44 +595,42 @@ class JudgeMarkEvalEnv(BaseEnv):
)
for item in items_to_process
]
results = await tqdm_asyncio.gather(
*tasks,
desc="Judging samples"
)
results = await tqdm_asyncio.gather(*tasks, desc="Judging samples")
# Aggregate results by writer model
raw_scores_by_model = defaultdict(list)
calibrated_scores_by_model = defaultdict(list)
valid_results = [r for r in results if r["aggregated_score_raw"] is not None]
failed_results = [r for r in results if r["aggregated_score_raw"] is None]
for r in valid_results:
raw_scores_by_model[r["writer_model"]].append(r["aggregated_score_raw"])
# Compute calibration
all_raw_scores = [r["aggregated_score_raw"] for r in valid_results]
calibration_config = build_landmark_calibration_config(all_raw_scores)
# Apply calibration
for r in valid_results:
calibrated = apply_landmark_calibration(
r["aggregated_score_raw"],
calibration_config
r["aggregated_score_raw"], calibration_config
)
r["aggregated_score_calibrated"] = calibrated
calibrated_scores_by_model[r["writer_model"]].append(calibrated)
# Compute statistics
raw_distribution = compute_detailed_distribution(all_raw_scores)
calibrated_distribution = compute_detailed_distribution(
[r["aggregated_score_calibrated"] for r in valid_results]
)
raw_cross_stats = self._compute_cross_model_stats(dict(raw_scores_by_model))
calibrated_cross_stats = self._compute_cross_model_stats(dict(calibrated_scores_by_model))
calibrated_cross_stats = self._compute_cross_model_stats(
dict(calibrated_scores_by_model)
)
# Per-model stats
model_stats = {}
for model, scores in raw_scores_by_model.items():
@ -634,31 +638,37 @@ class JudgeMarkEvalEnv(BaseEnv):
model_stats[model] = {
"count": len(scores),
"mean_raw": statistics.mean(scores),
"mean_calibrated": statistics.mean(calibrated_scores_by_model[model]),
"mean_calibrated": statistics.mean(
calibrated_scores_by_model[model]
),
"stdev": statistics.stdev(scores) if len(scores) > 1 else 0,
"ci95": confidence_interval_95(scores),
}
# Compute final Judgemark score
# Normalize components
kendall_norm = normalize(calibrated_cross_stats["kendall_tau"], 0.15, 0.75)
kw_norm = normalize(calibrated_cross_stats["kw_stat"], 100.0, 1300.0)
std_norm = normalize(calibrated_cross_stats["std_dev_across_models"], 0.0, 2.6)
# Separability score (simplified)
separability = (kw_norm + std_norm) / 2.0
# Final score: correlation + separability (simplified version)
final_judgemark_score = (kendall_norm + separability) / 2.0
# Build metrics summary
metrics = {
"final_judgemark_score": round(final_judgemark_score, 4),
"kendall_tau_calibrated": round(calibrated_cross_stats["kendall_tau"], 4),
"kendall_tau_raw": round(raw_cross_stats["kendall_tau"], 4),
"kruskal_wallis_stat": round(calibrated_cross_stats["kw_stat"], 2),
"std_dev_across_models": round(calibrated_cross_stats["std_dev_across_models"], 4),
"num_models_with_reference": calibrated_cross_stats["num_models_with_reference"],
"std_dev_across_models": round(
calibrated_cross_stats["std_dev_across_models"], 4
),
"num_models_with_reference": calibrated_cross_stats[
"num_models_with_reference"
],
"total_samples": len(results),
"valid_samples": len(valid_results),
"failed_samples": len(failed_results),
@ -673,56 +683,57 @@ class JudgeMarkEvalEnv(BaseEnv):
"separability": round(separability, 4),
},
}
# Print summary
print(f"\n{'='*60}")
print("JudgeMark v2 Evaluation Results")
print(f"{'='*60}")
print(f" Final Judgemark Score: {final_judgemark_score:.4f}")
print(f" Kendall's τ (calibrated): {calibrated_cross_stats['kendall_tau']:.4f}")
print(
f" Kendall's τ (calibrated): {calibrated_cross_stats['kendall_tau']:.4f}"
)
print(f" Kruskal-Wallis stat: {calibrated_cross_stats['kw_stat']:.2f}")
print(f" Std dev across models: {calibrated_cross_stats['std_dev_across_models']:.4f}")
print(
f" Std dev across models: {calibrated_cross_stats['std_dev_across_models']:.4f}"
)
print(f"\n Valid samples: {len(valid_results)}/{len(results)}")
print(f" Models with reference: {calibrated_cross_stats['num_models_with_reference']}")
print(
f" Models with reference: {calibrated_cross_stats['num_models_with_reference']}"
)
print(f"\n Per-model averages (calibrated):")
sorted_models = sorted(
model_stats.items(),
key=lambda x: x[1]["mean_calibrated"],
reverse=True
model_stats.items(), key=lambda x: x[1]["mean_calibrated"], reverse=True
)
for model, stats in sorted_models[:10]:
print(f" {model:.<40} {stats['mean_calibrated']:.3f} ±{stats['ci95']:.3f}")
print(
f" {model:.<40} {stats['mean_calibrated']:.3f} ±{stats['ci95']:.3f}"
)
if len(sorted_models) > 10:
print(f" ... and {len(sorted_models) - 10} more models")
print(f"{'='*60}\n")
# Save results
if self.config.data_dir_to_save_evals:
save_eval_results(
self.config.data_dir_to_save_evals,
metrics,
results
)
save_eval_results(self.config.data_dir_to_save_evals, metrics, results)
return metrics, results
# Required BaseEnv methods (not used for CLI evaluation)
async def get_next_item(self):
    """No-op: required BaseEnv method, not used for CLI evaluation."""
    return None
async def collect_trajectories(self, item):
    """No-op: required BaseEnv method, not used for CLI evaluation."""
    return None
async def score(self, rollout_group_data):
    """No-op: required BaseEnv method, not used for CLI evaluation."""
    return None
async def wandb_log(self, *args, **kwargs):
    """No-op: required BaseEnv method, not used for CLI evaluation.

    Accepts and ignores any positional or keyword arguments.
    """
    return None
# Script entry point: when this module is run directly, call `cli()` on the
# environment class. NOTE(review): `cli` is not defined in the visible code —
# presumably inherited from BaseEnv; confirm.
if __name__ == "__main__":
JudgeMarkEvalEnv.cli()