yc-bench/scripts/plot_sonnet_results.py
adit jain 5d2962073d Fix horizon bug, multi-provider support, add Sonnet vs Gemini benchmark results
Bug fixes:
- CLI --horizon-years defaulted to 3, silently overriding config presets.
  Now defaults to None so config value (1yr for medium/hard/nightmare) is used.
- Runtime passed a single api_key kwarg regardless of provider, breaking
  Gemini. Now lets LiteLLM resolve keys from provider-specific env vars.
- Removed temperature+top_p from LLM calls (Anthropic rejects both together).
- DB and result filenames now include config name to prevent cross-config collisions.

Benchmark results (1yr horizon, 3 seeds each):
  Sonnet 4.6: medium 2/3, hard 0/3, nightmare 1/3
  Gemini Flash: medium 3/3, hard 1/3, nightmare 1/3
  Gemini has higher win rates (93-98% vs 40-83% on medium).
  Sonnet's ceiling is higher when it survives (nightmare $10.1M vs $478K).

New scripts: plot_comparison.py, plot_sonnet_results.py, notepad_gif.py
Updated README with detailed comparison tables and failure analysis.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-26 00:31:00 -08:00

167 lines
5.9 KiB
Python

"""Plot Sonnet 4.6 results across configs and seeds — clean white style."""
import sqlite3
from pathlib import Path
from datetime import datetime
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as mticker
# Repository root: this script lives one directory below it.
ROOT = Path(__file__).parent.parent

# Starting balance that every run's ledger is accumulated on top of, in cents.
INITIAL_FUNDS_CENTS = 250_000 * 100

# One entry per benchmark config: its name (used in DB filenames), the line
# colour for its panel, and the seeds that were run for it.
CONFIGS = [
    {"name": config_name, "color": line_color, "seeds": list(range(1, 4))}
    for config_name, line_color in (
        ("medium", "#2563eb"),
        ("hard", "#dc2626"),
        ("nightmare", "#7c3aed"),
    )
]

# Model identifier as it appears in the per-run database filenames.
MODEL_SLUG = "anthropic_claude-sonnet-4-6"
def load_funds_curve(db_path, initial_funds_cents=None):
    """Rebuild the running company balance from a run's ledger.

    Reads every row of ``ledger_entries`` ordered by ``occurred_at`` and
    accumulates ``amount_cents`` on top of the starting balance.

    Args:
        db_path: Path to the SQLite database of a single run.
        initial_funds_cents: Starting balance in cents. When omitted, the
            module-level ``INITIAL_FUNDS_CENTS`` is used.

    Returns:
        A ``(times, balances)`` pair of parallel lists. ``times`` holds
        ``datetime`` objects and ``balances`` holds the running balance
        divided by 100. The first point is the starting balance, anchored at
        09:00 on January 1st of the first ledger entry's year. Both lists are
        empty when the ledger has no rows.

    Raises:
        sqlite3.Error: If the database cannot be queried (for example the
            ``ledger_entries`` table is missing).
    """
    if initial_funds_cents is None:
        initial_funds_cents = INITIAL_FUNDS_CENTS

    con = sqlite3.connect(str(db_path))
    try:
        rows = con.execute(
            "SELECT occurred_at, amount_cents FROM ledger_entries ORDER BY occurred_at ASC"
        ).fetchall()
    finally:
        # Always release the connection, even when the query fails.
        con.close()

    if not rows:
        return [], []

    running = initial_funds_cents
    # Synthetic first point so the curve starts at the initial funds rather
    # than at the balance after the first transaction.
    start = datetime.fromisoformat(rows[0][0]).replace(
        month=1, day=1, hour=9, minute=0, second=0, microsecond=0
    )
    times = [start]
    balances = [running / 100]

    for occurred_at, amount_cents in rows:
        running += int(amount_cents)
        times.append(datetime.fromisoformat(occurred_at))
        balances.append(running / 100)

    return times, balances
def load_all_runs():
    """Load the funds curve of every configured (config, seed) run.

    For each entry in ``CONFIGS`` and each of its seeds, looks for
    ``ROOT/db/<config>_<seed>_<MODEL_SLUG>.db``. Missing databases are
    reported and skipped. A one-line status is printed for each loaded run.

    Returns:
        A list of dicts, one per database found, with keys ``config``,
        ``seed``, ``color``, ``times``, ``balances``, ``bankrupt``,
        ``final_balance`` and ``final_time``. A run whose ledger is empty has
        empty ``times``/``balances``, ``final_balance`` 0 and ``final_time``
        ``None``.
    """
    runs = []
    for cfg in CONFIGS:
        for seed in cfg["seeds"]:
            db_path = ROOT / "db" / f"{cfg['name']}_{seed}_{MODEL_SLUG}.db"
            if not db_path.exists():
                print(f" Skip: {db_path.name}")
                continue

            times, balances = load_funds_curve(db_path)
            final_balance = balances[-1] if balances else 0
            # A run counts as bankrupt only when it has data and ended at or
            # below zero; an empty ledger is neither bankrupt nor survived.
            bankrupt = bool(balances) and final_balance <= 0

            runs.append({
                "config": cfg["name"],
                "seed": seed,
                "color": cfg["color"],
                "times": times,
                "balances": balances,
                "bankrupt": bankrupt,
                "final_balance": final_balance,
                "final_time": times[-1] if times else None,
            })

            # Guard the empty-ledger case: indexing balances[-1] here would
            # raise IndexError when the database has no ledger rows.
            if bankrupt:
                status = "BANKRUPT"
            elif balances:
                status = f"${final_balance:,.0f}"
            else:
                status = "no ledger entries"
            print(f" Loaded {cfg['name']} seed={seed}: {status}")
    return runs
def _format_funds_tick(value, _pos):
    """Format a dollar amount as a compact tick label ($, $K or $M)."""
    if abs(value) >= 1e6:
        return f"${value/1e6:.0f}M"
    if abs(value) >= 1e3:
        return f"${value/1e3:.0f}K"
    return f"${value:.0f}"


def make_plot(runs):
    """Draw one funds-over-time panel per config and save the figure as a PNG.

    Args:
        runs: Run dicts as returned by ``load_all_runs``. The keys read here
            are ``config``, ``seed``, ``color``, ``times``, ``balances``,
            ``bankrupt`` and ``final_balance``.

    Side effects:
        Writes ``ROOT/plots/sonnet_results.png`` (creating the directory if
        needed) and prints the output path. The figure is closed afterwards
        so repeated calls do not accumulate open figures.
    """
    config_names = ["medium", "hard", "nightmare"]
    config_labels = ["Medium", "Hard", "Nightmare"]
    # Keep the guide line in sync with the starting balance the curves use.
    starting_funds = INITIAL_FUNDS_CENTS / 100

    fig, axes = plt.subplots(1, 3, figsize=(18, 6), facecolor="white", sharey=False)
    try:
        fig.suptitle(
            "Sonnet 4.6 · YC-Bench · 3 Seeds per Config · 1-Year Horizon",
            fontsize=15, fontweight="600", y=0.98, color="#1a1a1a",
        )

        for idx, (ax, cname, clabel) in enumerate(zip(axes, config_names, config_labels)):
            ax.set_facecolor("white")
            for spine in ax.spines.values():
                spine.set_edgecolor("#d0d0d0")
                spine.set_linewidth(0.8)

            cfg_runs = [r for r in runs if r["config"] == cname]
            color = cfg_runs[0]["color"] if cfg_runs else "#333"

            # Bankruptcy line
            ax.axhline(0, color="#ef4444", linewidth=1, linestyle="--", alpha=0.5, zorder=1)
            # Starting funds line
            ax.axhline(starting_funds, color="#9ca3af", linewidth=0.6, linestyle=":",
                       alpha=0.5, zorder=1)

            survived = 0
            for r in cfg_runs:
                if not r["times"]:
                    continue
                seed = r["seed"]
                # Bankrupt runs are drawn thinner and fainter so that the
                # surviving runs stand out.
                if r["bankrupt"]:
                    alpha, lw = 0.4, 1.2
                    label = f"Seed {seed} — bankrupt"
                else:
                    alpha, lw = 1.0, 2.2
                    label = f"Seed {seed} — ${r['final_balance']/1e6:.1f}M"
                    survived += 1
                ax.plot(r["times"], r["balances"], color=color,
                        linewidth=lw, alpha=alpha, linestyle="-", label=label, zorder=3)

                # Terminal marker
                if r["bankrupt"]:
                    ax.scatter([r["times"][-1]], [r["balances"][-1]],
                               color=color, marker="x", s=60, linewidths=2, alpha=0.6, zorder=5)
                else:
                    ax.scatter([r["times"][-1]], [r["balances"][-1]],
                               color=color, marker="*", s=120, zorder=5)

            # Title with survival rate. The denominator is the number of runs
            # actually loaded for this config, so a skipped seed database is
            # not silently counted as a failure.
            survival_text = f"{survived}/{len(cfg_runs)} survived"
            if survived >= 2:
                title_color = "#16a34a"
            elif survived == 0:
                title_color = "#dc2626"
            else:
                title_color = "#d97706"
            ax.set_title(f"{clabel}\n", fontsize=13, fontweight="600", color="#1a1a1a", pad=12)
            ax.text(0.5, 1.01, survival_text, transform=ax.transAxes,
                    fontsize=10, color=title_color, ha="center", va="bottom", fontweight="500")

            # Formatting
            ax.xaxis.set_major_formatter(mdates.DateFormatter("%b"))
            ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
            ax.tick_params(colors="#555", labelsize=8)
            ax.grid(axis="y", color="#e5e7eb", linewidth=0.5, linestyle="-")
            ax.grid(axis="x", color="#f3f4f6", linewidth=0.3, linestyle="-")
            ax.yaxis.set_major_formatter(mticker.FuncFormatter(_format_funds_tick))

            # Only build a legend when something was plotted; an empty legend
            # makes matplotlib warn about missing labelled artists.
            handles, _labels = ax.get_legend_handles_labels()
            if handles:
                legend = ax.legend(fontsize=8, loc="upper left", frameon=True,
                                   facecolor="white", edgecolor="#e5e7eb", framealpha=0.95)
                for text in legend.get_texts():
                    text.set_color("#374151")

            if idx == 0:
                ax.set_ylabel("Company Funds", fontsize=10, color="#374151")

        fig.tight_layout(rect=[0, 0, 1, 0.94])
        out_path = ROOT / "plots" / "sonnet_results.png"
        out_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(out_path, dpi=180, bbox_inches="tight", facecolor="white")
    finally:
        # Release the figure whether or not saving succeeded.
        plt.close(fig)
    print(f"\nSaved: {out_path}")
# Script entry point: load every available run, then plot if anything was found.
if __name__ == "__main__":
    print("Loading Sonnet 4.6 runs...")
    runs = load_all_runs()
    if runs:
        make_plot(runs)
    else:
        print("No data found.")