Fix horizon bug, multi-provider support, add Sonnet vs Gemini benchmark results

Bug fixes: - CLI --horizon-years defaulted to 3, silently overriding config presets. Now defaults to None so config value (1yr for medium/hard/nightmare) is used. - Runtime passed a single api_key kwarg regardless of provider, breaking Gemini. Now lets LiteLLM resolve keys from provider-specific env vars. - Removed temperature+top_p from LLM calls (Anthropic rejects both together). - DB and result filenames now include config name to prevent cross-config collisions. Benchmark results (1yr horizon, 3 seeds each): Sonnet 4.6: medium 2/3, hard 0/3, nightmare 1/3 Gemini Flash: medium 3/3, hard 1/3, nightmare 1/3 Gemini has higher win rates (93-98% vs 40-83% on medium). Sonnet's ceiling is higher when it survives (nightmare $10.1M vs $478K). New scripts: plot_comparison.py, plot_sonnet_results.py, notepad_gif.py Updated README with detailed comparison tables and failure analysis. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-19 12:58:03 +00:00 · 2026-02-26 00:31:00 -08:00 · 2026-02-26 00:31:00 -08:00 · 5d2962073d
commit 5d2962073d
parent d1d7bc97b5
38 changed files with 16654 additions and 34 deletions
--- a/scripts/plot_comparison.py
+++ b/scripts/plot_comparison.py
@ -0,0 +1,169 @@
+"""Sonnet 4.6 vs Gemini 3 Flash — apples-to-apples comparison plot."""
+import sqlite3
+from pathlib import Path
+from datetime import datetime
+
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import matplotlib.dates as mdates
+import matplotlib.ticker as mticker
+
+ROOT = Path(__file__).parent.parent
+INITIAL_FUNDS_CENTS = 25_000_000
+
+MODELS = {
+    "sonnet": {
+        "slug": "anthropic_claude-sonnet-4-6",
+        "label": "Sonnet 4.6",
+        "color": "#2563eb",
+        "dash": "-",
+    },
+    "gemini": {
+        "slug": "gemini_gemini-3-flash-preview",
+        "label": "Gemini 3 Flash",
+        "color": "#f97316",
+        "dash": "-",
+    },
+}
+
+CONFIGS = ["medium", "hard", "nightmare"]
+SEEDS = [1, 2, 3]
+
+
+def load_funds_curve(db_path):
+    con = sqlite3.connect(str(db_path))
+    rows = con.execute(
+        "SELECT occurred_at, amount_cents FROM ledger_entries ORDER BY occurred_at ASC"
+    ).fetchall()
+    con.close()
+    if not rows:
+        return [], []
+
+    times, balances = [], []
+    running = INITIAL_FUNDS_CENTS
+    start = datetime.fromisoformat(rows[0][0]).replace(
+        month=1, day=1, hour=9, minute=0, second=0, microsecond=0
+    )
+    times.append(start)
+    balances.append(running / 100)
+
+    for occurred_at, amount_cents in rows:
+        running += int(amount_cents)
+        t = datetime.fromisoformat(occurred_at)
+        # Cap at end of year 1 for apples-to-apples
+        if t.year > 2025:
+            break
+        times.append(t)
+        balances.append(running / 100)
+
+    return times, balances
+
+
+def load_all():
+    runs = []
+    for config in CONFIGS:
+        for seed in SEEDS:
+            for key, model in MODELS.items():
+                db_path = ROOT / "db" / f"{config}_{seed}_{model['slug']}.db"
+                if not db_path.exists():
+                    continue
+                times, balances = load_funds_curve(db_path)
+                bankrupt = len(balances) > 1 and balances[-1] <= 0
+                runs.append({
+                    "config": config,
+                    "seed": seed,
+                    "model_key": key,
+                    "label": model["label"],
+                    "color": model["color"],
+                    "times": times,
+                    "balances": balances,
+                    "bankrupt": bankrupt,
+                    "final": balances[-1] if balances else 0,
+                })
+                tag = "BANKRUPT" if bankrupt else f"${balances[-1]:,.0f}"
+                print(f"  {config} seed={seed} {model['label']}: {tag}")
+    return runs
+
+
+def make_plot(runs):
+    fig, axes = plt.subplots(3, 3, figsize=(18, 14), facecolor="white")
+    fig.suptitle(
+        "Sonnet 4.6  vs  Gemini 3 Flash  ·  YC-Bench  ·  1-Year Horizon",
+        fontsize=16, fontweight="600", y=0.98, color="#1a1a1a",
+    )
+
+    for row, config in enumerate(CONFIGS):
+        for col, seed in enumerate(SEEDS):
+            ax = axes[row][col]
+            ax.set_facecolor("white")
+            for spine in ax.spines.values():
+                spine.set_edgecolor("#d0d0d0")
+                spine.set_linewidth(0.7)
+
+            # Bankruptcy line
+            ax.axhline(0, color="#ef4444", linewidth=0.8, linestyle="--", alpha=0.4)
+            ax.axhline(250_000, color="#9ca3af", linewidth=0.5, linestyle=":", alpha=0.4)
+
+            cell_runs = [r for r in runs if r["config"] == config and r["seed"] == seed]
+
+            for r in cell_runs:
+                if not r["times"]:
+                    continue
+                alpha = 0.35 if r["bankrupt"] else 1.0
+                lw = 1.0 if r["bankrupt"] else 2.0
+
+                if r["bankrupt"]:
+                    lbl = f"{r['label']} — bankrupt"
+                else:
+                    val = r["final"]
+                    lbl = f"{r['label']} — ${val/1e6:.1f}M" if val >= 1e6 else f"{r['label']} — ${val/1e3:.0f}K"
+
+                ax.plot(r["times"], r["balances"], color=r["color"],
+                        linewidth=lw, alpha=alpha, label=lbl, zorder=3)
+
+                if r["bankrupt"]:
+                    ax.scatter([r["times"][-1]], [r["balances"][-1]],
+                               color=r["color"], marker="x", s=50, linewidths=1.5, alpha=0.5, zorder=5)
+                else:
+                    ax.scatter([r["times"][-1]], [r["balances"][-1]],
+                               color=r["color"], marker="*", s=100, zorder=5)
+
+            # Title
+            if row == 0:
+                ax.set_title(f"Seed {seed}", fontsize=11, fontweight="500", color="#374151", pad=8)
+
+            # Row label
+            if col == 0:
+                ax.set_ylabel(f"{config.upper()}\n\nFunds", fontsize=10, color="#374151", fontweight="600")
+
+            # Formatting
+            ax.xaxis.set_major_formatter(mdates.DateFormatter("%b"))
+            ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
+            ax.tick_params(colors="#666", labelsize=7)
+            ax.grid(axis="y", color="#f0f0f0", linewidth=0.5)
+
+            ax.yaxis.set_major_formatter(
+                mticker.FuncFormatter(
+                    lambda x, _: f"${x/1e6:.0f}M" if abs(x) >= 1e6
+                    else f"${x/1e3:.0f}K" if abs(x) >= 1e3
+                    else f"${x:.0f}"
+                )
+            )
+
+            legend = ax.legend(fontsize=7, loc="upper left", frameon=True,
+                              facecolor="white", edgecolor="#e5e7eb", framealpha=0.9)
+            for text in legend.get_texts():
+                text.set_color("#374151")
+
+    plt.tight_layout(rect=[0, 0, 1, 0.95])
+    out = ROOT / "plots" / "sonnet_vs_gemini.png"
+    out.parent.mkdir(parents=True, exist_ok=True)
+    plt.savefig(out, dpi=180, bbox_inches="tight", facecolor="white")
+    print(f"\nSaved: {out}")
+
+
+if __name__ == "__main__":
+    print("Loading runs...")
+    runs = load_all()
+    make_plot(runs)