mirror of
https://github.com/collinear-ai/yc-bench.git
synced 2026-04-19 12:58:03 +00:00
Bug fixes: - CLI --horizon-years defaulted to 3, silently overriding config presets. Now defaults to None so config value (1yr for medium/hard/nightmare) is used. - Runtime passed a single api_key kwarg regardless of provider, breaking Gemini. Now lets LiteLLM resolve keys from provider-specific env vars. - Removed temperature+top_p from LLM calls (Anthropic rejects both together). - DB and result filenames now include config name to prevent cross-config collisions. Benchmark results (1yr horizon, 3 seeds each): Sonnet 4.6: medium 2/3, hard 0/3, nightmare 1/3 Gemini Flash: medium 3/3, hard 1/3, nightmare 1/3 Gemini has higher win rates (93-98% vs 40-83% on medium). Sonnet's ceiling is higher when it survives (nightmare $10.1M vs $478K). New scripts: plot_comparison.py, plot_sonnet_results.py, notepad_gif.py Updated README with detailed comparison tables and failure analysis. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
179 lines
6.6 KiB
Python
179 lines
6.6 KiB
Python
"""Generate a GIF showing scratchpad/notepad evolution over turns."""
|
|
import json
|
|
import re
|
|
import textwrap
|
|
from pathlib import Path
|
|
|
|
from PIL import Image, ImageDraw, ImageFont
|
|
|
|
ROOT = Path(__file__).parent.parent
|
|
|
|
|
|
def extract_scratchpad_versions(result_path):
|
|
"""Extract all scratchpad write commands from a result JSON transcript."""
|
|
with open(result_path) as f:
|
|
d = json.load(f)
|
|
|
|
versions = []
|
|
for t in d["transcript"]:
|
|
for cmd in t.get("commands_executed", []):
|
|
if "scratchpad write" not in cmd.lower():
|
|
continue
|
|
idx = cmd.find("--content ")
|
|
if idx < 0:
|
|
continue
|
|
content_start = idx + len("--content ")
|
|
if cmd[content_start] == '"':
|
|
content_start += 1
|
|
arrow = cmd.find(' -> {')
|
|
if arrow > 0:
|
|
content = cmd[content_start:arrow].rstrip('"')
|
|
else:
|
|
content = cmd[content_start:].rstrip('"')
|
|
# Unescape
|
|
content = content.replace("\\n", "\n").replace('\\"', '"')
|
|
versions.append({
|
|
"turn": t["turn"],
|
|
"content": content,
|
|
})
|
|
return versions, d
|
|
|
|
|
|
def render_frame(content, turn, total_turns, meta, frame_size=(1200, 800)):
|
|
"""Render a single scratchpad frame as a PIL Image."""
|
|
w, h = frame_size
|
|
img = Image.new("RGB", (w, h), "#ffffff")
|
|
draw = ImageDraw.Draw(img)
|
|
|
|
# Try to use a monospace font
|
|
try:
|
|
body_font = ImageFont.truetype("/System/Library/Fonts/Menlo.ttc", 13)
|
|
title_font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 16)
|
|
small_font = ImageFont.truetype("/System/Library/Fonts/Menlo.ttc", 11)
|
|
except (OSError, IOError):
|
|
body_font = ImageFont.load_default()
|
|
title_font = body_font
|
|
small_font = body_font
|
|
|
|
# Header bar
|
|
draw.rectangle([(0, 0), (w, 50)], fill="#1a1a2e")
|
|
model_label = meta.get("model", "unknown")
|
|
config = meta.get("config", "")
|
|
seed = meta.get("seed", "")
|
|
outcome = meta.get("outcome", "")
|
|
outcome_color = "#4ade80" if "survived" in outcome.lower() else "#f87171"
|
|
|
|
draw.text((16, 8), f"SCRATCHPAD", fill="#e2e8f0", font=title_font)
|
|
draw.text((180, 8), f"{model_label}", fill="#94a3b8", font=small_font)
|
|
draw.text((180, 26), f"{config} · seed {seed}", fill="#64748b", font=small_font)
|
|
|
|
# Turn indicator + progress bar
|
|
draw.text((w - 280, 8), f"Turn {turn}/{total_turns}", fill="#e2e8f0", font=title_font)
|
|
draw.text((w - 280, 30), outcome, fill=outcome_color, font=small_font)
|
|
|
|
bar_x, bar_y, bar_w, bar_h = w - 130, 15, 110, 20
|
|
draw.rectangle([(bar_x, bar_y), (bar_x + bar_w, bar_y + bar_h)], outline="#334155", width=1)
|
|
progress = min(turn / max(total_turns, 1), 1.0)
|
|
fill_color = "#3b82f6" if "survived" not in outcome.lower() else "#22c55e"
|
|
draw.rectangle([(bar_x + 1, bar_y + 1), (bar_x + 1 + int((bar_w - 2) * progress), bar_y + bar_h - 1)], fill=fill_color)
|
|
|
|
# Content area
|
|
margin = 20
|
|
y = 60
|
|
max_width = 115 # characters per line
|
|
|
|
lines = []
|
|
for raw_line in content.split("\n"):
|
|
if len(raw_line) <= max_width:
|
|
lines.append(raw_line)
|
|
else:
|
|
wrapped = textwrap.wrap(raw_line, width=max_width, break_long_words=True, break_on_hyphens=False)
|
|
lines.extend(wrapped if wrapped else [""])
|
|
|
|
max_lines = (h - y - 20) // 16
|
|
|
|
for i, line in enumerate(lines[:max_lines]):
|
|
text_y = y + i * 16
|
|
|
|
# Color coding
|
|
if line.startswith("##") or line.startswith("==="):
|
|
color = "#1e40af"
|
|
draw.text((margin, text_y), line, fill=color, font=body_font)
|
|
elif "CRISIS" in line or "LOCKED" in line or "LATE" in line or "FAIL" in line or "bankrupt" in line.lower():
|
|
color = "#dc2626"
|
|
draw.text((margin, text_y), line, fill=color, font=body_font)
|
|
elif "LESSON" in line or "KEY" in line or "RULE" in line or "STRATEGY" in line:
|
|
color = "#7c3aed"
|
|
draw.text((margin, text_y), line, fill=color, font=body_font)
|
|
elif line.startswith("- ") or line.startswith(" -"):
|
|
draw.text((margin, text_y), line, fill="#374151", font=body_font)
|
|
elif "✅" in line or "SUCCESS" in line or "survived" in line.lower():
|
|
draw.text((margin, text_y), line, fill="#16a34a", font=body_font)
|
|
else:
|
|
draw.text((margin, text_y), line, fill="#1f2937", font=body_font)
|
|
|
|
if len(lines) > max_lines:
|
|
draw.text((margin, y + max_lines * 16), f" ... ({len(lines) - max_lines} more lines)", fill="#9ca3af", font=small_font)
|
|
|
|
# Bottom border
|
|
draw.line([(0, h - 2), (w, h - 2)], fill="#e5e7eb", width=1)
|
|
|
|
return img
|
|
|
|
|
|
def make_gif(result_path, output_path=None):
|
|
versions, data = extract_scratchpad_versions(result_path)
|
|
if not versions:
|
|
print(f"No scratchpad writes found in {result_path}")
|
|
return
|
|
|
|
total_turns = data.get("turns_completed", versions[-1]["turn"])
|
|
model = data.get("model", "unknown").split("/")[-1]
|
|
reason = data.get("terminal_reason", "unknown")
|
|
outcome = "SURVIVED" if reason == "horizon_end" else reason.upper()
|
|
|
|
# Infer config from filename
|
|
fname = Path(result_path).stem
|
|
config_match = re.search(r"result_(\w+)_\d+_", fname)
|
|
config = config_match.group(1) if config_match else "unknown"
|
|
seed_match = re.search(r"_(\d+)_anthropic", fname) or re.search(r"_(\d+)_gemini", fname)
|
|
seed = seed_match.group(1) if seed_match else "?"
|
|
|
|
meta = {"model": model, "config": config, "seed": seed, "outcome": outcome}
|
|
|
|
print(f"Generating GIF: {len(versions)} frames, {model}, {config} seed={seed}, {outcome}")
|
|
|
|
frames = []
|
|
for v in versions:
|
|
frame = render_frame(v["content"], v["turn"], total_turns, meta)
|
|
frames.append(frame)
|
|
|
|
if not output_path:
|
|
output_path = ROOT / "plots" / f"notepad_{config}_{seed}_{model}.gif"
|
|
|
|
output_path = Path(output_path)
|
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Each frame shown for 3 seconds, last frame for 6 seconds
|
|
durations = [3000] * len(frames)
|
|
if durations:
|
|
durations[-1] = 6000
|
|
|
|
frames[0].save(
|
|
output_path,
|
|
save_all=True,
|
|
append_images=frames[1:],
|
|
duration=durations,
|
|
loop=0,
|
|
)
|
|
print(f"Saved: {output_path} ({len(frames)} frames)")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
import sys
|
|
if len(sys.argv) > 1:
|
|
make_gif(sys.argv[1], sys.argv[2] if len(sys.argv) > 2 else None)
|
|
else:
|
|
# Generate for all available result files
|
|
for p in sorted(ROOT.glob("results/yc_bench_result_*.json")):
|
|
make_gif(p)
|