mirror of
https://github.com/collinear-ai/yc-bench.git
synced 2026-04-19 12:58:03 +00:00
Bug fixes:
- CLI --horizon-years defaulted to 3, silently overriding config presets. Now defaults to None so the config value (1yr for medium/hard/nightmare) is used.
- Runtime passed a single api_key kwarg regardless of provider, breaking Gemini. Now lets LiteLLM resolve keys from provider-specific env vars.
- Removed temperature+top_p from LLM calls (Anthropic rejects both together).
- DB and result filenames now include the config name to prevent cross-config collisions.

Benchmark results (1yr horizon, 3 seeds each):
Sonnet 4.6: medium 2/3, hard 0/3, nightmare 1/3
Gemini Flash: medium 3/3, hard 1/3, nightmare 1/3
Gemini has higher win rates (93-98% vs 40-83% on medium). Sonnet's ceiling is higher when it survives (nightmare $10.1M vs $478K).

New scripts: plot_comparison.py, plot_sonnet_results.py, notepad_gif.py
Updated README with detailed comparison tables and failure analysis.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
167 lines
5.9 KiB
Python
167 lines
5.9 KiB
Python
"""Plot Sonnet 4.6 results across configs and seeds — clean white style."""
|
|
import sqlite3
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
import matplotlib
|
|
matplotlib.use("Agg")
|
|
import matplotlib.pyplot as plt
|
|
import matplotlib.dates as mdates
|
|
import matplotlib.ticker as mticker
|
|
|
|
# Repository root: this script sits one directory below it.
ROOT = Path(__file__).parent.parent

# Opening balance of every run, in cents (the ledger stores cents as well).
INITIAL_FUNDS_CENTS = 25_000_000

# One entry per difficulty preset: its name (used in DB filenames), the line
# colour of its panel, and the seeds that were run for it.
CONFIGS = [
    {"name": preset, "color": hex_color, "seeds": [1, 2, 3]}
    for preset, hex_color in (
        ("medium", "#2563eb"),
        ("hard", "#dc2626"),
        ("nightmare", "#7c3aed"),
    )
]

# Model identifier as it appears in the DB filenames.
MODEL_SLUG = "anthropic_claude-sonnet-4-6"
|
|
|
|
|
|
def load_funds_curve(db_path, initial_funds_cents=None):
    """Rebuild the running funds balance of one run from its ledger.

    Every row of the ``ledger_entries`` table is read in ``occurred_at``
    order and its ``amount_cents`` is added to a running total that starts
    at the initial funds.

    Args:
        db_path: Location of the SQLite database (``str`` or ``Path``).
        initial_funds_cents: Opening balance in cents. ``None`` (the
            default) falls back to the module-level ``INITIAL_FUNDS_CENTS``.

    Returns:
        A ``(times, balances)`` tuple of parallel lists. ``times`` holds
        ``datetime`` objects and ``balances`` holds the running total
        divided by 100. The first point is synthetic: the opening balance
        placed at 09:00 on 1 January of the first entry's year. Both lists
        are empty when the ledger has no rows.

    Raises:
        sqlite3.Error: If the query fails (for example the table is missing).
        ValueError: If an ``occurred_at`` value is not an ISO-8601 string.
    """
    con = sqlite3.connect(str(db_path))
    try:
        rows = con.execute(
            "SELECT occurred_at, amount_cents FROM ledger_entries ORDER BY occurred_at ASC"
        ).fetchall()
    finally:
        # Always release the handle, including when the query itself raises.
        con.close()

    if not rows:
        return [], []

    running = INITIAL_FUNDS_CENTS if initial_funds_cents is None else initial_funds_cents

    # Anchor the curve at the start of the first entry's year so the opening
    # balance is visible before any ledger movement.
    start = datetime.fromisoformat(rows[0][0]).replace(
        month=1, day=1, hour=9, minute=0, second=0, microsecond=0
    )
    times = [start]
    balances = [running / 100]

    for occurred_at, amount_cents in rows:
        running += int(amount_cents)
        times.append(datetime.fromisoformat(occurred_at))
        balances.append(running / 100)

    return times, balances
|
|
|
|
|
|
def load_all_runs():
    """Load the funds curve of every configured run whose database exists.

    Iterates over each config in ``CONFIGS`` and each of its seeds, looks for
    ``ROOT / "db" / "<config>_<seed>_<MODEL_SLUG>.db"`` and, when the file is
    present, reads its curve with ``load_funds_curve``. Progress is printed
    for every run, including the ones that are skipped.

    Returns:
        A list of dicts, one per database found, with the keys ``config``,
        ``seed``, ``color``, ``times``, ``balances``, ``bankrupt``,
        ``final_balance`` and ``final_time``. A database with an empty
        ledger yields empty ``times``/``balances``, ``final_balance`` 0,
        ``final_time`` ``None`` and ``bankrupt`` False.
    """
    runs = []
    for cfg in CONFIGS:
        for seed in cfg["seeds"]:
            db_path = ROOT / "db" / f"{cfg['name']}_{seed}_{MODEL_SLUG}.db"
            if not db_path.exists():
                print(f" Skip: {db_path.name}")
                continue

            times, balances = load_funds_curve(db_path)
            has_data = bool(balances)
            final_balance = balances[-1] if has_data else 0
            # A run counts as bankrupt only when it has data and ended at or
            # below zero; an empty ledger is "no data", not a bankruptcy.
            bankrupt = has_data and final_balance <= 0

            runs.append({
                "config": cfg["name"],
                "seed": seed,
                "color": cfg["color"],
                "times": times,
                "balances": balances,
                "bankrupt": bankrupt,
                "final_balance": final_balance,
                "final_time": times[-1] if times else None,
            })

            # Never index ``balances`` directly here: it is empty when the
            # ledger has no rows.
            if not has_data:
                status = "NO DATA"
            elif bankrupt:
                status = "BANKRUPT"
            else:
                status = f"${final_balance:,.0f}"
            print(f" Loaded {cfg['name']} seed={seed}: {status}")
    return runs
|
|
|
|
|
|
def _format_dollars(value, _pos=None):
    """Format an axis value as a compact dollar label ($3M, $250K, $40)."""
    magnitude = abs(value)
    if magnitude >= 1e6:
        return f"${value/1e6:.0f}M"
    if magnitude >= 1e3:
        return f"${value/1e3:.0f}K"
    return f"${value:.0f}"


def _survival_color(survived, total):
    """Pick the subtitle colour: red for none, green for >= 2/3, else amber."""
    if survived == 0:
        return "#dc2626"
    if survived * 3 >= total * 2:
        return "#16a34a"
    return "#d97706"


def make_plot(runs):
    """Draw one funds-over-time panel per config and save the figure as PNG.

    The figure has three side-by-side panels (medium, hard, nightmare). Each
    panel plots every run of that config that has ledger data: surviving runs
    as a solid line ending in a star, bankrupt runs as a fainter, thinner
    line ending in a cross. A subtitle shows how many of the plotted runs
    survived.

    Args:
        runs: Run dicts as produced by ``load_all_runs`` (keys ``config``,
            ``seed``, ``color``, ``times``, ``balances``, ``bankrupt``,
            ``final_balance``).

    Side effects:
        Writes ``ROOT / "plots" / "sonnet_results.png"`` (creating the
        directory if needed), prints the output path, and closes the figure.
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 6), facecolor="white", sharey=False)
    fig.suptitle(
        "Sonnet 4.6 · YC-Bench · 3 Seeds per Config · 1-Year Horizon",
        fontsize=15, fontweight="600", y=0.98, color="#1a1a1a",
    )

    config_names = ["medium", "hard", "nightmare"]
    config_labels = ["Medium", "Hard", "Nightmare"]

    for idx, (ax, cname, clabel) in enumerate(zip(axes, config_names, config_labels)):
        ax.set_facecolor("white")
        for spine in ax.spines.values():
            spine.set_edgecolor("#d0d0d0")
            spine.set_linewidth(0.8)

        # Runs with an empty ledger have nothing to draw, so they are neither
        # plotted nor counted in the survival ratio.
        cfg_runs = [r for r in runs if r["config"] == cname and r["times"]]
        color = cfg_runs[0]["color"] if cfg_runs else "#333"

        # Bankruptcy line
        ax.axhline(0, color="#ef4444", linewidth=1, linestyle="--", alpha=0.5, zorder=1)
        # Starting funds line (same opening balance the curves start from)
        ax.axhline(INITIAL_FUNDS_CENTS / 100, color="#9ca3af", linewidth=0.6,
                   linestyle=":", alpha=0.5, zorder=1)

        survived = 0
        for r in cfg_runs:
            seed = r["seed"]
            if r["bankrupt"]:
                label = f"Seed {seed} — bankrupt"
                line_style = {"linewidth": 1.2, "alpha": 0.4}
                marker_style = {"marker": "x", "s": 60, "linewidths": 2, "alpha": 0.6}
            else:
                label = f"Seed {seed} — ${r['final_balance']/1e6:.1f}M"
                line_style = {"linewidth": 2.2, "alpha": 1.0}
                marker_style = {"marker": "*", "s": 120}
                survived += 1

            ax.plot(r["times"], r["balances"], color=color,
                    linestyle="-", label=label, zorder=3, **line_style)

            # Terminal marker
            ax.scatter([r["times"][-1]], [r["balances"][-1]],
                       color=color, zorder=5, **marker_style)

        # Title with survival rate; the denominator is the number of runs
        # actually plotted, so missing databases are not shown as failures.
        survival_text = f"{survived}/{len(cfg_runs)} survived"
        title_color = _survival_color(survived, len(cfg_runs))
        ax.set_title(f"{clabel}\n", fontsize=13, fontweight="600", color="#1a1a1a", pad=12)
        ax.text(0.5, 1.01, survival_text, transform=ax.transAxes,
                fontsize=10, color=title_color, ha="center", va="bottom", fontweight="500")

        # Formatting
        ax.xaxis.set_major_formatter(mdates.DateFormatter("%b"))
        ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
        ax.tick_params(colors="#555", labelsize=8)
        ax.grid(axis="y", color="#e5e7eb", linewidth=0.5, linestyle="-")
        ax.grid(axis="x", color="#f3f4f6", linewidth=0.3, linestyle="-")
        ax.yaxis.set_major_formatter(mticker.FuncFormatter(_format_dollars))

        # Only request a legend when there is something labelled to show.
        if cfg_runs:
            legend = ax.legend(fontsize=8, loc="upper left", frameon=True,
                               facecolor="white", edgecolor="#e5e7eb", framealpha=0.95)
            for text in legend.get_texts():
                text.set_color("#374151")

        if idx == 0:
            ax.set_ylabel("Company Funds", fontsize=10, color="#374151")

    # Leave the top 6% of the canvas for the suptitle.
    fig.tight_layout(rect=[0, 0, 1, 0.94])
    out_path = ROOT / "plots" / "sonnet_results.png"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    try:
        fig.savefig(out_path, dpi=180, bbox_inches="tight", facecolor="white")
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
    print(f"\nSaved: {out_path}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: gather every run available on disk and plot it;
    # when nothing could be loaded, report that instead.
    print("Loading Sonnet 4.6 runs...")
    runs = load_all_runs()
    if runs:
        make_plot(runs)
    else:
        print("No data found.")
|