atropos/environments/eval_environments/judgemark_eval.py
2025-12-24 23:36:36 +00:00

737 lines
24 KiB
Python

"""
JudgeMark v2 Evaluation Environment
This environment evaluates how well a language model can judge creative writing.
It measures the model's ability to:
- Assign consistent, discriminative scores
- Correlate with human preferences (LMSYS Arena rankings)
- Separate good writing from bad writing
Based on: https://github.com/EQ-bench/Judgemark-v2
Paper/Leaderboard: https://eqbench.com/judgemark-v2.html
The benchmark presents pre-generated creative writing samples to the judge model,
asks for 0-10 scores on 17 literary criteria, then computes:
- Raw and calibrated score distributions
- Kendall's tau correlation with reference rankings
- Score stability across repeated runs
- Inter-model separability metrics
- Final composite Judgemark score
Usage:
    python judgemark_eval.py evaluate \
        --openai.base_url https://api.openai.com/v1 \
        --openai.api_key $OPENAI_API_KEY \
        --openai.model_name gpt-4o \
        --env.data_dir_to_save_evals ../evals/judgemark/gpt-4o
"""
import asyncio
import json
import math
import os
import re
import statistics
import traceback
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import numpy as np
import openai
import scipy.stats
from eval_helpers import (
create_system_content,
save_eval_results,
)
from pydantic import Field
from tqdm.asyncio import tqdm_asyncio
from atroposlib.envs.base import APIServerConfig, BaseEnv, BaseEnvConfig
# Path to JudgeMark data files (relative to this file)
# Built by climbing three directory levels up from this module and descending
# into "Judgemark-v2/data". JudgeMarkEvalEnv.setup() raises FileNotFoundError
# when this directory does not exist, and always reads the rubric text files
# from it (even when custom sample/prompt paths are configured).
JUDGEMARK_DATA_DIR = Path(__file__).parent.parent.parent / "Judgemark-v2" / "data"
# =============================================================================
# Constants (from Judgemark-v2/config/constants.py)
# =============================================================================
# Reference model scores for correlation (LMSYS Arena ELO-like scores)
# Keys are writer-model names as they appear in the samples file; values are the
# reference scores. JudgeMarkEvalEnv._compute_cross_model_stats pairs each
# writer model's mean judge score with the value here (models absent from this
# table are skipped) and computes Pearson r / Kendall tau over those pairs.
REFERENCE_MODEL_SCORES = {
    "kimi-k2": 1387,
    "claude-opus-4": 1417,
    "claude-sonnet-4": 1380,
    "chatgpt-4o-latest": 1425,
    "gpt-4.1": 1399,
    "qwen3-235b-a22b": 1366,
    "gemma-3-27b-it": 1355,
    "mistral-small-3.2-24b": 1334,
    "reka-flash-3": 1250,
    "grok-3-beta": 1401,
    "gpt-4.1-mini": 1349,
    "gemma-3-12b-it": 1333,
    "gemma-3-4b-it": 1282,
    "gpt-4.1-nano": 1309,
}
# Negative criteria markers - these get inverted (higher = worse writing)
# compute_raw_score lower-cases and strips each parsed criterion name and tests
# it for exact membership in this list, so every entry must be lowercase and
# must match the full criterion label (not just a fragment of it).
NEGATIVE_MARKERS = [
    "meandering",
    "weak dialogue",
    "tell-don't-show",
    "unsurprising or uncreative",
    "amateurish",
    "purple prose",
    "overwrought",
    "incongruent ending positivity",
    "unearned transformations",
]
# =============================================================================
# Scoring Functions (from Judgemark-v2/core/scoring.py)
# =============================================================================
def parse_scores(judge_response: str) -> Dict[str, float]:
    """
    Extract ``criterion: score`` pairs from a judge's free-form reply.

    A line is recognised when it consists solely of a label, a colon and a
    number. Markdown bold markers around the label or the number, and square
    brackets around the number, are tolerated, e.g.:
        **Quality:** 7.5
        Quality: 7.5
        **Quality:** [7.5]

    Returns a dict mapping each whitespace-stripped label to its score as a
    float. If a label occurs more than once, the last occurrence wins. Lines
    that do not fit the shape above are ignored.
    """
    score_line = re.compile(
        r"^\s*"  # optional leading whitespace
        r"(?:\*\*)?([^\n:\[\]]{2,100}):(?:\*\*)?"  # label, optionally bold
        r"\s*"
        r"(?:\[)?(?:\*\*)?"  # optional opening bracket / bold
        r"(-?\d+(?:\.\d+)?)"  # signed integer or decimal score
        r"(?:\*\*)?(?:\])?"  # optional closing bold / bracket
        r"\s*$",  # nothing else may follow on the line
        re.MULTILINE,
    )
    parsed: Dict[str, float] = {}
    for hit in score_line.finditer(judge_response):
        label, number = hit.groups()
        parsed[label.strip()] = float(number)
    return parsed
def compute_raw_score(
    scores: Dict[str, float], scoring_min: float = 0, scoring_max: float = 10
) -> Optional[float]:
    """
    Collapse per-criterion scores into a single raw score.

    Steps:
    - Discard any score outside ``[scoring_min, scoring_max]``.
    - Give up (return ``None``) when fewer than 5 scores survive.
    - Mirror scores of negative criteria (names that, lower-cased and
      stripped, appear in ``NEGATIVE_MARKERS``) around the range midpoint so
      that higher always means better.
    - Average, then rescale linearly so the range maps onto 1-10.

    Returns the rescaled average rounded to two decimals, or ``None``.
    """
    usable = [
        (name, value)
        for name, value in scores.items()
        if scoring_min <= value <= scoring_max
    ]
    if len(usable) < 5:
        return None

    # Reflecting a value v inside the range gives (min + max) - v.
    mirror = scoring_min + scoring_max
    running_sum = 0.0
    for name, value in usable:
        is_negative = name.lower().strip() in NEGATIVE_MARKERS
        running_sum += (mirror - value) if is_negative else value
    avg = running_sum / len(usable)

    # A zero-width range cannot be rescaled; pin it to the bottom of 1-10.
    if scoring_max == scoring_min:
        scaled = 1
    else:
        scaled = 1 + (avg - scoring_min) * (9 / (scoring_max - scoring_min))
    return round(scaled, 2)
def confidence_interval_95(data: List[float]) -> float:
    """
    Half-width of the 95% confidence interval for the mean of ``data``.

    Uses the normal approximation ``1.96 * s / sqrt(n)`` with the sample
    standard deviation ``s``. Returns 0.0 when there are fewer than two
    observations, since the sample standard deviation is undefined there.
    """
    sample_size = len(data)
    if sample_size >= 2:
        standard_error = statistics.stdev(data) / math.sqrt(sample_size)
        return 1.96 * standard_error
    return 0.0
def compute_detailed_distribution(scores: List[float]) -> Dict:
    """
    Summarise a list of scores.

    Returns an empty dict for empty input. Otherwise returns count, min, max,
    mean, median, sample standard deviation (0.0 for a single value) and the
    10th/25th/75th/90th percentiles, each rounded to three decimals.
    """
    if not scores:
        return {}

    def percentile(q):
        # numpy returns its own scalar type; convert before rounding.
        return round(float(np.percentile(scores, q)), 3)

    spread = statistics.stdev(scores) if len(scores) > 1 else 0.0
    summary = {
        "count": len(scores),
        "min": round(min(scores), 3),
        "max": round(max(scores), 3),
        "mean": round(statistics.mean(scores), 3),
        "median": round(statistics.median(scores), 3),
        "stdev": round(spread, 3),
        "p10": percentile(10),
        "p25": percentile(25),
        "p75": percentile(75),
        "p90": percentile(90),
    }
    return summary
def build_landmark_calibration_config(
    scores: List[float], desired_points: List[float] = None
) -> Dict:
    """
    Derive a five-landmark piecewise-linear calibration from raw scores.

    The input landmarks are the minimum, first quartile, median, third
    quartile and maximum of ``scores``; they are paired with
    ``desired_points`` (default ``[0, 3, 5, 7, 10]``) as output landmarks.

    With fewer than two scores no calibration can be fitted, and a config with
    empty landmark lists is returned instead.
    """
    config = {"method": "piecewise_landmark"}

    if not scores or len(scores) < 2:
        config["in_landmarks"] = []
        config["out_landmarks"] = []
        return config

    targets = [0, 3, 5, 7, 10] if desired_points is None else desired_points

    lowest = min(scores)
    first_quartile = float(np.percentile(scores, 25))
    middle = float(statistics.median(scores))
    third_quartile = float(np.percentile(scores, 75))
    highest = max(scores)

    config["in_landmarks"] = [
        lowest,
        first_quartile,
        middle,
        third_quartile,
        highest,
    ]
    config["out_landmarks"] = targets
    return config
def apply_landmark_calibration(x: float, config: Dict) -> float:
    """
    Map ``x`` through the piecewise-linear calibration described by ``config``.

    ``config`` must carry five ``in_landmarks`` and five ``out_landmarks``;
    otherwise ``x`` is returned unchanged. The four segments between
    consecutive input landmarks are each mapped linearly onto the matching
    output segment. Values outside the outer landmarks are extrapolated along
    the first or last segment (no clamping). A zero-width input segment maps
    to the lower end of its output segment.
    """
    sources = config.get("in_landmarks", [])
    targets = config.get("out_landmarks", [])
    if len(sources) != 5 or len(targets) != 5:
        return x

    # Pick the segment: the first interior landmark that x does not exceed,
    # falling back to the last segment when x lies above the third quartile.
    segment = 3
    for candidate in range(3):
        if x <= sources[candidate + 1]:
            segment = candidate
            break

    old_lo, old_hi = sources[segment], sources[segment + 1]
    new_lo, new_hi = targets[segment], targets[segment + 1]

    width = old_hi - old_lo
    if abs(width) < 1e-12:
        return new_lo
    fraction = (x - old_lo) / width
    return new_lo + fraction * (new_hi - new_lo)
def normalize(
    val: float, min_val: float, max_val: float, ascending: bool = True
) -> float:
    """
    Normalize a value to the 0-1 range.

    Args:
        val: The value to normalize.
        min_val: Value that maps to 0.0 (or 1.0 when ``ascending`` is False).
        max_val: Value that maps to 1.0 (or 0.0 when ``ascending`` is False).
        ascending: When False the result is flipped (``1 - normalized``).

    Returns:
        The position of ``val`` between ``min_val`` and ``max_val``, clamped
        to ``[0.0, 1.0]``. Returns 0.0 when ``val`` is NaN (an undefined
        metric earns no credit) and 0.5 when the range has zero width.
    """
    # A NaN would otherwise slip through the clamp below: min(1.0, nan) keeps
    # 1.0 because every comparison against NaN is False, turning an undefined
    # statistic into a perfect score.
    if math.isnan(val):
        return 0.0
    if max_val == min_val:
        return 0.5
    normalized = (val - min_val) / (max_val - min_val)
    normalized = max(0.0, min(1.0, normalized))
    if not ascending:
        normalized = 1.0 - normalized
    return normalized
# =============================================================================
# JudgeMark Configuration
# =============================================================================
class JudgeMarkEvalConfig(BaseEnvConfig):
    """Settings for a JudgeMark v2 evaluation run, extending ``BaseEnvConfig``."""

    # --- Input data: an empty string selects the bundled Judgemark-v2 file ---
    samples_file: str = Field(
        description="Path to samples JSON (uses bundled if empty)",
        default="",
    )
    prompts_file: str = Field(
        description="Path to prompts JSON (uses bundled if empty)",
        default="",
    )

    # --- Accepted score range for each parsed criterion ---
    scoring_min: float = Field(description="Minimum score value", default=0)
    scoring_max: float = Field(description="Maximum score value", default=10)

    # --- Judge generation parameters ---
    eval_max_tokens: int = Field(
        description="Max tokens for judge response (0 = model default)",
        default=0,
    )
    eval_temperature: float = Field(
        description="Temperature for judge model",
        default=0.0,
    )

    # --- Optional thinking mode and prompt overrides for the system message ---
    thinking_mode: bool = Field(
        description="Enable thinking mode for judge",
        default=False,
    )
    custom_thinking_prompt: Optional[str] = Field(default=None)
    custom_system_prompt: Optional[str] = Field(default=None)

    # --- API retry behaviour ---
    max_retries: int = Field(description="Max retries on API failure", default=3)
    retry_delay: float = Field(description="Delay between retries", default=1.0)

    # --- Debugging ---
    full_debug: bool = Field(description="Save full judge responses", default=False)

    # --- Optional cap on how many samples are judged ---
    max_samples: Optional[int] = Field(
        description="Limit number of samples to evaluate (None = all)",
        default=None,
    )
class JudgeMarkEvalEnv(BaseEnv):
    """
    JudgeMark v2 evaluation environment.

    Loads pre-generated writing samples and judge prompts, asks the configured
    judge model to score every sample, then aggregates the scores into
    per-writer-model statistics, cross-model separability/correlation metrics
    and a final composite Judgemark score.
    """

    name = "judgemark_eval"

    def __init__(
        self,
        config: JudgeMarkEvalConfig,
        server_configs: List[APIServerConfig],
        slurm_config=None,
    ):
        """
        Create the environment and its API client.

        Args:
            config: Evaluation settings.
            server_configs: API server definitions; only the first entry is
                used to build the OpenAI-compatible async client.
            slurm_config: Passed through unchanged to ``BaseEnv``.
        """
        super().__init__(config, server_configs, slurm_config)
        self.config: JudgeMarkEvalConfig = config
        # Initialize OpenAI client
        server_config = server_configs[0]
        self.client = openai.AsyncOpenAI(
            api_key=server_config.api_key,
            base_url=server_config.base_url,
        )
        self.model_name = server_config.model_name
        # Storage for data loaded in setup()
        self.samples_data = {}
        self.judge_prompts = {}
        self.rubric_criteria = ""
        self.score_anchoring = ""

    @classmethod
    def config_init(cls) -> Tuple[JudgeMarkEvalConfig, List[APIServerConfig]]:
        """Return the default environment config and a single default server config."""
        return (
            JudgeMarkEvalConfig(
                eval_max_tokens=0,
                use_wandb=True,
                wandb_name="judgemark_eval",
            ),
            [
                APIServerConfig(
                    model_name="gpt-4o",
                    base_url="https://api.openai.com/v1",
                    api_key=os.environ.get("OPENAI_API_KEY", ""),
                )
            ],
        )

    async def setup(self):
        """
        Load samples, judge prompts and rubric text, then inject the rubric.

        Raises:
            FileNotFoundError: If the bundled data directory is missing. The
                rubric files are always read from it, even when custom
                sample/prompt paths are configured.
        """
        print("\nLoading JudgeMark v2 data...")
        # Determine data directory
        data_dir = JUDGEMARK_DATA_DIR
        if not data_dir.exists():
            raise FileNotFoundError(
                f"JudgeMark data not found at {data_dir}. "
                "Please clone Judgemark-v2 into the atropos root directory."
            )

        # All data files are read as UTF-8 explicitly: the platform default
        # encoding is locale-dependent and can fail on non-ASCII text.

        # Load samples
        samples_path = (
            Path(self.config.samples_file)
            if self.config.samples_file
            else data_dir / "judgemark_v3_samples_3_iter.json"
        )
        with open(samples_path, encoding="utf-8") as f:
            self.samples_data = json.load(f)

        # Load prompts
        prompts_path = (
            Path(self.config.prompts_file)
            if self.config.prompts_file
            else data_dir / "judge_prompts_v3_noref_nocot_noanchor_x96.json"
        )
        with open(prompts_path, encoding="utf-8") as f:
            self.judge_prompts = json.load(f)

        # Load rubric files
        with open(data_dir / "rubric_criteria.txt", encoding="utf-8") as f:
            self.rubric_criteria = f.read()
        with open(data_dir / "rubric_score_anchoring.txt", encoding="utf-8") as f:
            self.score_anchoring = f.read()

        # Inject rubric into prompts (only values are replaced, so mutating
        # the dict while iterating over it is safe)
        for key, prompt in self.judge_prompts.items():
            if isinstance(prompt, str):
                prompt = prompt.replace("<RUBRIC_CRITERIA>", self.rubric_criteria)
                prompt = prompt.replace("<SCORE_ANCHORING>", self.score_anchoring)
                self.judge_prompts[key] = prompt

        # Count total samples
        total_samples = sum(
            len(items)
            for model_info in self.samples_data.values()
            for items in model_info.get("samples", {}).values()
        )
        print(f" Writer models: {len(self.samples_data)}")
        print(f" Total samples: {total_samples}")
        print(f" Judge prompts: {len(self.judge_prompts)}")
        if self.config.max_samples:
            print(f" Limiting to: {self.config.max_samples} samples")

    def _create_system_content(self) -> Optional[str]:
        """Build the judge's system message from the thinking/prompt settings."""
        base_content = create_system_content(
            self.config.thinking_mode,
            self.config.custom_thinking_prompt,
            self.config.custom_system_prompt,
        )
        return base_content

    async def _send_to_judge(self, prompt: str) -> str:
        """
        Send ``prompt`` to the judge model and return its reply text.

        The request is tried up to ``max_retries`` times (at least once), with
        a linearly growing pause of ``retry_delay * attempt_number`` between
        tries. An empty or missing completion is returned as ``""``.

        Raises:
            Exception: Whatever the final attempt raised, re-raised unchanged.
        """
        system_content = self._create_system_content()
        messages = []
        if system_content:
            messages.append({"role": "system", "content": system_content})
        messages.append({"role": "user", "content": prompt})

        kwargs = {
            "model": self.model_name,
            "messages": messages,
            "temperature": self.config.eval_temperature,
        }
        if self.config.eval_max_tokens > 0:
            kwargs["max_tokens"] = self.config.eval_max_tokens

        # Always make at least one attempt; a non-positive max_retries would
        # otherwise skip the API call entirely and silently yield "".
        attempts = max(1, self.config.max_retries)
        for attempt in range(attempts):
            try:
                response = await self.client.chat.completions.create(**kwargs)
                return response.choices[0].message.content or ""
            except Exception:
                if attempt >= attempts - 1:
                    # Out of attempts: propagate with the original traceback.
                    raise
                await asyncio.sleep(self.config.retry_delay * (attempt + 1))
        return ""

    async def _evaluate_single_sample(
        self,
        model_name: str,
        iteration_key: str,
        item_id: str,
        item_text: str,
        prompt_template: str,
    ) -> Dict:
        """
        Judge one writing sample and return a result record.

        The sample text is inserted after the ``[TEST MODEL RESPONSE]`` marker
        of ``prompt_template``, the judge reply is parsed into criterion
        scores, and those are aggregated into ``aggregated_score_raw``.

        Never raises: any exception is captured in the record's ``error``
        field (plus ``traceback`` when ``full_debug`` is on), leaving
        ``aggregated_score_raw`` as ``None``.
        """
        result = {
            "writer_model": model_name,
            "iteration": iteration_key,
            "item_id": item_id,
            "text_length": len(item_text),
            "parsed_scores": {},
            "aggregated_score_raw": None,
            "error": None,
        }
        try:
            # Build the full prompt
            final_prompt = prompt_template.replace(
                "[TEST MODEL RESPONSE]", "[TEST MODEL RESPONSE]\n" + item_text
            )
            # Get judge response
            judge_response = await self._send_to_judge(final_prompt)
            if self.config.full_debug:
                result["judge_response"] = judge_response
            # Parse scores
            extracted_scores = parse_scores(judge_response)
            result["parsed_scores"] = extracted_scores
            # Compute raw score
            raw_score = compute_raw_score(
                extracted_scores, self.config.scoring_min, self.config.scoring_max
            )
            result["aggregated_score_raw"] = raw_score
            if raw_score is None:
                # Report the number of in-range scores, since that is the
                # count the 5-score threshold is applied to.
                valid_count = sum(
                    1
                    for value in extracted_scores.values()
                    if self.config.scoring_min <= value <= self.config.scoring_max
                )
                result["error"] = (
                    f"Only {valid_count} valid scores parsed (need 5+)"
                )
        except Exception as e:
            result["error"] = f"{type(e).__name__}: {str(e)}"
            if self.config.full_debug:
                result["traceback"] = traceback.format_exc()
        return result

    def _compute_cross_model_stats(
        self, scores_by_model: Dict[str, List[float]]
    ) -> Dict:
        """
        Compute cross-model statistics including correlation with reference.

        Args:
            scores_by_model: Writer-model name -> list of scores for it.

        Returns:
            A dict with one-way ANOVA (``anova_f``/``anova_p``),
            Kruskal-Wallis (``kw_stat``/``kw_p``), the population standard
            deviation of per-model means, Pearson r and Kendall tau against
            ``REFERENCE_MODEL_SCORES``, and ``num_models_with_reference``.
            Every key is always present. Statistics that cannot be computed
            (fewer than two models, fewer than two reference matches, or a
            non-finite result) fall back to neutral values: 0 for statistics
            and correlations, 1 for p-values.
        """

        def finite_or(value, fallback):
            # SciPy yields NaN for degenerate input (e.g. constant scores);
            # substitute the neutral value so metrics stay finite.
            value = float(value)
            return value if math.isfinite(value) else fallback

        arrays = list(scores_by_model.values())
        if len(arrays) < 2:
            return {
                "anova_f": 0,
                "anova_p": 1,
                "kw_stat": 0,
                "kw_p": 1,
                "std_dev_across_models": 0,
                "pearson_r": 0,
                "kendall_tau": 0,
                # Needed by evaluate(); omitting it raised KeyError whenever
                # fewer than two writer models had valid scores.
                "num_models_with_reference": sum(
                    1 for model in scores_by_model if model in REFERENCE_MODEL_SCORES
                ),
            }

        # ANOVA and Kruskal-Wallis
        f_stat, f_p = scipy.stats.f_oneway(*arrays)
        kw_stat, kw_p = scipy.stats.kruskal(*arrays)

        # Std across model means
        model_means = [statistics.mean(scores) for scores in arrays]
        std_across = statistics.pstdev(model_means)

        # Correlation with reference rankings (only models present in the table)
        ref_pairs = [
            (statistics.mean(scores), REFERENCE_MODEL_SCORES[model])
            for model, scores in scores_by_model.items()
            if model in REFERENCE_MODEL_SCORES
        ]
        if len(ref_pairs) >= 2:
            means, refs = zip(*ref_pairs)
            pearson_r, _ = scipy.stats.pearsonr(means, refs)
            kendall_tau, _ = scipy.stats.kendalltau(means, refs)
        else:
            pearson_r, kendall_tau = 0.0, 0.0

        return {
            "anova_f": finite_or(f_stat, 0),
            "anova_p": finite_or(f_p, 1),
            "kw_stat": finite_or(kw_stat, 0),
            "kw_p": finite_or(kw_p, 1),
            "std_dev_across_models": std_across,
            "pearson_r": finite_or(pearson_r, 0.0),
            "kendall_tau": finite_or(kendall_tau, 0.0),
            "num_models_with_reference": len(ref_pairs),
        }

    async def evaluate(self):
        """
        Run the full JudgeMark evaluation.

        Judges every sample that has a matching judge prompt (optionally
        truncated to the first ``max_samples`` items), calibrates the raw
        scores, computes distribution, per-model and cross-model statistics,
        prints a summary and, when ``data_dir_to_save_evals`` is set, saves
        the results.

        The final score averages the normalized Kendall tau (range
        0.15-0.75) with a separability term, itself the average of the
        normalized Kruskal-Wallis statistic (100-1300) and the normalized
        standard deviation across model means (0-2.6), all on calibrated
        scores.

        Returns:
            A ``(metrics, results)`` tuple: the summary dict and the list of
            per-sample result records.
        """
        print(f"\n{'='*60}")
        print("Starting JudgeMark v2 Evaluation")
        print(f"{'='*60}")
        print(f" Judge model: {self.model_name}")
        print(f" Thinking mode: {self.config.thinking_mode}")
        print(f"{'='*60}\n")

        # Build list of items to process
        items_to_process = []
        for model_name, model_info in self.samples_data.items():
            samples_dict = model_info.get("samples", {})
            for iteration_key, iteration_items in samples_dict.items():
                for item_id, item_text in iteration_items.items():
                    # Samples without a judge prompt cannot be scored
                    if item_id not in self.judge_prompts:
                        continue
                    items_to_process.append(
                        {
                            "model_name": model_name,
                            "iteration_key": iteration_key,
                            "item_id": item_id,
                            "item_text": item_text,
                            "prompt_template": self.judge_prompts[item_id],
                        }
                    )

        # Apply sample limit if specified (keeps the first N in load order)
        if self.config.max_samples:
            items_to_process = items_to_process[: self.config.max_samples]
        print(f"Processing {len(items_to_process)} samples...\n")

        # Process all samples
        # NOTE: every request is scheduled at once; there is no throttling here.
        tasks = [
            self._evaluate_single_sample(
                item["model_name"],
                item["iteration_key"],
                item["item_id"],
                item["item_text"],
                item["prompt_template"],
            )
            for item in items_to_process
        ]
        results = await tqdm_asyncio.gather(*tasks, desc="Judging samples")

        # Aggregate results by writer model
        raw_scores_by_model = defaultdict(list)
        calibrated_scores_by_model = defaultdict(list)
        valid_results = [r for r in results if r["aggregated_score_raw"] is not None]
        failed_results = [r for r in results if r["aggregated_score_raw"] is None]
        for r in valid_results:
            raw_scores_by_model[r["writer_model"]].append(r["aggregated_score_raw"])

        # Compute calibration
        all_raw_scores = [r["aggregated_score_raw"] for r in valid_results]
        calibration_config = build_landmark_calibration_config(all_raw_scores)

        # Apply calibration
        for r in valid_results:
            calibrated = apply_landmark_calibration(
                r["aggregated_score_raw"], calibration_config
            )
            r["aggregated_score_calibrated"] = calibrated
            calibrated_scores_by_model[r["writer_model"]].append(calibrated)

        # Compute statistics
        raw_distribution = compute_detailed_distribution(all_raw_scores)
        calibrated_distribution = compute_detailed_distribution(
            [r["aggregated_score_calibrated"] for r in valid_results]
        )
        raw_cross_stats = self._compute_cross_model_stats(dict(raw_scores_by_model))
        calibrated_cross_stats = self._compute_cross_model_stats(
            dict(calibrated_scores_by_model)
        )

        # Per-model stats
        model_stats = {}
        for model, scores in raw_scores_by_model.items():
            if len(scores) > 0:
                model_stats[model] = {
                    "count": len(scores),
                    "mean_raw": statistics.mean(scores),
                    "mean_calibrated": statistics.mean(
                        calibrated_scores_by_model[model]
                    ),
                    "stdev": statistics.stdev(scores) if len(scores) > 1 else 0,
                    "ci95": confidence_interval_95(scores),
                }

        # Compute final Judgemark score
        # Normalize components
        kendall_norm = normalize(calibrated_cross_stats["kendall_tau"], 0.15, 0.75)
        kw_norm = normalize(calibrated_cross_stats["kw_stat"], 100.0, 1300.0)
        std_norm = normalize(calibrated_cross_stats["std_dev_across_models"], 0.0, 2.6)
        # Separability score (simplified)
        separability = (kw_norm + std_norm) / 2.0
        # Final score: correlation + separability (simplified version)
        final_judgemark_score = (kendall_norm + separability) / 2.0

        # Build metrics summary
        metrics = {
            "final_judgemark_score": round(final_judgemark_score, 4),
            "kendall_tau_calibrated": round(calibrated_cross_stats["kendall_tau"], 4),
            "kendall_tau_raw": round(raw_cross_stats["kendall_tau"], 4),
            "kruskal_wallis_stat": round(calibrated_cross_stats["kw_stat"], 2),
            "std_dev_across_models": round(
                calibrated_cross_stats["std_dev_across_models"], 4
            ),
            "num_models_with_reference": calibrated_cross_stats[
                "num_models_with_reference"
            ],
            "total_samples": len(results),
            "valid_samples": len(valid_results),
            "failed_samples": len(failed_results),
            "raw_score_distribution": raw_distribution,
            "calibrated_score_distribution": calibrated_distribution,
            "calibration_config": calibration_config,
            "model_stats": model_stats,
            "normalized_components": {
                "kendall_tau_norm": round(kendall_norm, 4),
                "kw_stat_norm": round(kw_norm, 4),
                "std_dev_norm": round(std_norm, 4),
                "separability": round(separability, 4),
            },
        }

        # Print summary
        print(f"\n{'='*60}")
        print("JudgeMark v2 Evaluation Results")
        print(f"{'='*60}")
        print(f" Final Judgemark Score: {final_judgemark_score:.4f}")
        print(
            f" Kendall's τ (calibrated): {calibrated_cross_stats['kendall_tau']:.4f}"
        )
        print(f" Kruskal-Wallis stat: {calibrated_cross_stats['kw_stat']:.2f}")
        print(
            f" Std dev across models: {calibrated_cross_stats['std_dev_across_models']:.4f}"
        )
        print(f"\n Valid samples: {len(valid_results)}/{len(results)}")
        print(
            f" Models with reference: {calibrated_cross_stats['num_models_with_reference']}"
        )
        print("\n Per-model averages (calibrated):")
        sorted_models = sorted(
            model_stats.items(), key=lambda x: x[1]["mean_calibrated"], reverse=True
        )
        # Only the ten best-scoring writer models are listed
        for model, stats in sorted_models[:10]:
            print(
                f" {model:.<40} {stats['mean_calibrated']:.3f} ±{stats['ci95']:.3f}"
            )
        if len(sorted_models) > 10:
            print(f" ... and {len(sorted_models) - 10} more models")
        print(f"{'='*60}\n")

        # Save results
        if self.config.data_dir_to_save_evals:
            save_eval_results(self.config.data_dir_to_save_evals, metrics, results)
        return metrics, results

    # Required BaseEnv methods (not used for CLI evaluation)
    async def get_next_item(self):
        """No-op: not used for CLI evaluation."""
        pass

    async def collect_trajectories(self, item):
        """No-op: not used for CLI evaluation."""
        pass

    async def score(self, rollout_group_data):
        """No-op: not used for CLI evaluation."""
        pass

    async def wandb_log(self, *args, **kwargs):
        """No-op: not used for CLI evaluation."""
        pass
# Script entry point: hand control to the environment's command-line interface
# (the module docstring shows the `python judgemark_eval.py evaluate ...` form).
# NOTE(review): cli() is not defined in this file — presumably inherited from
# BaseEnv; confirm against atroposlib.
if __name__ == "__main__":
    JudgeMarkEvalEnv.cli()