mirror of
https://github.com/collinear-ai/yc-bench.git
synced 2026-04-19 12:58:03 +00:00
394 lines
12 KiB
Python
394 lines
12 KiB
Python
"""YC-Bench comparison plot — Collinear AI branding."""
|
|
|
|
import sqlite3
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
import matplotlib
|
|
|
|
matplotlib.use("Agg")
|
|
import matplotlib.pyplot as plt
|
|
import matplotlib.dates as mdates
|
|
import matplotlib.ticker as mticker
|
|
import numpy as np
|
|
|
|
# Repository root: two directory levels above this file.
ROOT = Path(__file__).parent.parent

# Starting balance of every run, in cents ($250,000).
INITIAL_FUNDS_CENTS = 250_000 * 100

# ── Collinear brand palette ──────────────────────────────────────────────────
NAVY, ORANGE, BLUE = "#13234D", "#F26125", "#4D65FF"
BG_COLOR, CARD_BG = "#FAFBFD", "#FFFFFF"
GRID_CLR, TEXT_CLR, MUTED = "#E8ECF2", "#2A2F3D", "#6B7694"

# One row per plotted series: (key, DB filename slug, legend label, line colour).
_MODEL_SPECS = (
    ("sonnet", "anthropic_claude-sonnet-4-6", "Sonnet 4.6", BLUE),
    ("gemini", "gemini_gemini-3-flash-preview", "Gemini 3 Flash", ORANGE),
    ("gpt52", "openai_gpt-5.2", "GPT-5.2", "#22C55E"),
    ("greedy", "greedy_bot", "Human Devised Rule", NAVY),
)
MODELS = {
    key: {"slug": slug, "label": label, "color": color}
    for key, slug, label, color in _MODEL_SPECS
}

# Keys in MODELS that are scripted baselines rather than LLM agents.
BOT_KEYS = {"greedy"}

# Grid layout: one row per difficulty config, one column per seed.
CONFIGS = ["medium", "hard", "nightmare"]
SEEDS = list(range(1, 4))

# Colour of the row label for each difficulty, in CONFIGS order.
DIFF_COLORS = dict(zip(CONFIGS, (BLUE, ORANGE, "#DC2626")))
|
|
|
|
|
|
def load_logo_image(height_px=80):
    """Render the wordmark SVG to a high-res RGBA PIL image.

    The logo is optional decoration, so every "cannot render" situation
    yields ``None`` instead of an exception.

    Args:
        height_px: Height, in pixels, at which the SVG is rasterised.

    Returns:
        A ``PIL.Image.Image`` in RGBA mode, or ``None`` when cairosvg/Pillow
        (or the native cairo library behind cairosvg) cannot be loaded, or
        when ``plots/collinear_wordmark.svg`` does not exist under ``ROOT``.
    """
    try:
        import ctypes.util
        import io
        import os

        # If cairo is not discoverable, point the loader at the Homebrew
        # library directory (when present) before importing cairosvg.
        if ctypes.util.find_library("cairo") is None:
            brew_lib = "/opt/homebrew/lib"
            if Path(brew_lib).exists():
                os.environ.setdefault("DYLD_LIBRARY_PATH", brew_lib)
        import cairosvg
        from PIL import Image
    except (ImportError, OSError):
        # ImportError: the Python package is not installed.
        # OSError: the package is installed but its native cairo library
        # could not be loaded at import time.
        return None

    svg_path = ROOT / "plots" / "collinear_wordmark.svg"
    if not svg_path.exists():
        return None

    png_data = cairosvg.svg2png(url=str(svg_path), output_height=height_px)
    return Image.open(io.BytesIO(png_data)).convert("RGBA")
|
|
|
|
|
|
def load_funds_curve(db_path, *, initial_funds_cents=None, cutoff_year=2025):
    """Build the running-balance time series stored in one run database.

    Reads every row of ``ledger_entries`` ordered by ``occurred_at`` and
    accumulates ``amount_cents`` on top of the starting balance.

    Args:
        db_path: Path to the SQLite database of a single run.
        initial_funds_cents: Starting balance in cents. ``None`` (default)
            uses the module-level ``INITIAL_FUNDS_CENTS``.
        cutoff_year: Entries whose year is greater than this value end the
            series; they and all later rows are ignored.

    Returns:
        ``(times, balances)``: two equal-length lists of ``datetime`` objects
        and balances in dollars. The first point is synthetic: the starting
        balance at 09:00 on 1 January of the first entry's year. Both lists
        are empty when the ledger has no rows.
    """
    if initial_funds_cents is None:
        initial_funds_cents = INITIAL_FUNDS_CENTS

    con = sqlite3.connect(str(db_path))
    try:
        rows = con.execute(
            "SELECT occurred_at, amount_cents FROM ledger_entries ORDER BY occurred_at ASC"
        ).fetchall()
    finally:
        # Close even when the query fails (e.g. missing table) so the file
        # handle is not leaked.
        con.close()

    if not rows:
        return [], []

    running = initial_funds_cents
    # Anchor the curve at the start of the first entry's year.
    start = datetime.fromisoformat(rows[0][0]).replace(
        month=1, day=1, hour=9, minute=0, second=0, microsecond=0
    )
    times = [start]
    balances = [running / 100]

    for occurred_at, amount_cents in rows:
        when = datetime.fromisoformat(occurred_at)
        if when.year > cutoff_year:
            # Rows are time-ordered, so everything after this is out of range too.
            break
        running += int(amount_cents)
        times.append(when)
        balances.append(running / 100)

    return times, balances
|
|
|
|
|
|
def load_all():
    """Load the funds curve of every (config, seed, model) run found on disk.

    Looks for ``db/{config}_{seed}_{slug}.db`` under ``ROOT`` for each
    combination of ``CONFIGS``, ``SEEDS`` and ``MODELS``; combinations whose
    database file does not exist are skipped. One status line is printed per
    loaded run.

    Returns:
        A list of dicts with keys ``config``, ``seed``, ``model_key``,
        ``label``, ``color``, ``times``, ``balances``, ``bankrupt`` and
        ``final`` (last balance in dollars, or 0 when the ledger is empty).
    """
    runs = []
    for config in CONFIGS:
        for seed in SEEDS:
            for key, model in MODELS.items():
                db_path = ROOT / "db" / f"{config}_{seed}_{model['slug']}.db"
                if not db_path.exists():
                    continue

                times, balances = load_funds_curve(db_path)
                final = balances[-1] if balances else 0
                # A lone starting point is never counted as bankruptcy.
                bankrupt = len(balances) > 1 and final <= 0
                runs.append(
                    {
                        "config": config,
                        "seed": seed,
                        "model_key": key,
                        "label": model["label"],
                        "color": model["color"],
                        "times": times,
                        "balances": balances,
                        "bankrupt": bankrupt,
                        "final": final,
                    }
                )

                if bankrupt:
                    tag = "BANKRUPT"
                elif balances:
                    tag = f"${final:,.0f}"
                else:
                    # An existing DB with an empty ledger has no last balance
                    # to report; indexing balances[-1] here would raise.
                    tag = "NO DATA"
                print(f" {config} seed={seed} {model['label']}: {tag}")
    return runs
|
|
|
|
|
|
# Balances are floored at this many dollars so they stay drawable on a log axis.
_LOG_FLOOR_DOLLARS = 1_000


def _draw_header(fig):
    """Draw the navy title band, orange accent strip, title and shared legend."""
    bands = (
        # (bottom edge, height, fill colour, zorder) in figure-fraction units
        (0.90, 0.10, NAVY, 0),
        (0.895, 0.006, ORANGE, 1),
    )
    for bottom, height, colour, zorder in bands:
        fig.add_artist(
            plt.Rectangle(
                (0, bottom),
                1,
                height,
                transform=fig.transFigure,
                facecolor=colour,
                edgecolor="none",
                zorder=zorder,
            )
        )

    fig.text(
        0.5,
        0.955,
        "YC-Bench | 1-Year Horizon",
        ha="center",
        va="center",
        fontsize=50,
        fontweight="700",
        color="white",
        fontfamily="Helvetica Neue",
        zorder=2,
    )

    # One legend entry per model, taken from MODELS so labels and colours
    # cannot drift from what is actually plotted. Bots are drawn dashed.
    handles, labels = [], []
    for key, model in MODELS.items():
        is_bot = key in BOT_KEYS
        handles.append(
            plt.Line2D(
                [0],
                [0],
                color=model["color"],
                linewidth=3.5 if is_bot else 4.0,
                linestyle="--" if is_bot else "-",
                alpha=0.75 if is_bot else 0.95,
            )
        )
        labels.append(model["label"])
    fig.legend(
        handles,
        labels,
        loc="center",
        bbox_to_anchor=(0.53, 0.855),
        ncol=len(handles) or 1,
        fontsize=22,
        frameon=False,
        labelcolor=TEXT_CLR,
        handlelength=3.5,
        handletextpad=1.0,
        columnspacing=3.0,
    )


def _run_sort_key(run):
    """Order runs for drawing: bots first, then survivors by final desc, then bankrupt."""
    if run["model_key"] in BOT_KEYS:
        return (0, 0)
    if not run["bankrupt"]:
        return (1, -run["final"])
    return (2, 0)


def _run_label(run):
    """Return the per-line label, e.g. 'Sonnet 4.6 — $1.2M' or '… — bankrupt'."""
    if run["bankrupt"]:
        return f"{run['label']} — bankrupt"
    final = run["final"]
    if final >= 1e6:
        return f"{run['label']} — ${final/1e6:.1f}M"
    return f"{run['label']} — ${final/1e3:.0f}K"


def _format_money_tick(value, _pos):
    """Format a y-axis tick value as $N, $NK or $NM."""
    if value >= 1e6:
        return f"${value/1e6:.0f}M"
    if value >= 1e3:
        return f"${value/1e3:.0f}K"
    return f"${value:.0f}"


def _prepare_axes(ax):
    """Apply card styling, log scale and the $250K reference line before plotting."""
    ax.set_facecolor(CARD_BG)
    for spine in ax.spines.values():
        spine.set_edgecolor(GRID_CLR)
        spine.set_linewidth(1.2)
    ax.set_yscale("log")
    ax.axhline(250_000, color=MUTED, linewidth=0.8, linestyle=":", alpha=0.3, zorder=1)


def _plot_run(ax, run):
    """Draw one run's funds curve and its end marker on ``ax``."""
    is_bot = run["model_key"] in BOT_KEYS
    bankrupt = run["bankrupt"]

    if bankrupt:
        alpha, lw, ls = 0.4, 2.0, "--" if is_bot else "-"
    elif is_bot:
        alpha, lw, ls = 0.75, 3.5, "--"
    else:
        alpha, lw, ls = 0.95, 3.0, "-"

    # Non-positive values cannot be drawn on a log axis, so floor them.
    floored = [max(b, _LOG_FLOOR_DOLLARS) for b in run["balances"]]

    ax.plot(
        run["times"],
        floored,
        color=run["color"],
        linewidth=lw,
        alpha=alpha,
        label=_run_label(run),
        linestyle=ls,
        zorder=2 if is_bot else 3,
    )

    # End markers use the floored value so they always sit on the line's end.
    end_point = ([run["times"][-1]], [floored[-1]])
    if bankrupt:
        ax.scatter(
            *end_point,
            color=run["color"],
            marker="X",
            s=120,
            linewidths=2,
            alpha=0.6,
            zorder=5,
            edgecolors="white",
        )
    elif not is_bot:
        ax.scatter(
            *end_point,
            color=run["color"],
            marker="o",
            s=100,
            zorder=5,
            edgecolors="white",
            linewidths=2.5,
        )


def _format_axes(ax):
    """Set tick locators/formatters and grid styling once the data is plotted."""
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%b"))
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
    ax.tick_params(colors=MUTED, labelsize=18, length=5, width=0.8, pad=6)
    ax.grid(axis="y", color=GRID_CLR, linewidth=0.7, alpha=0.8)
    ax.grid(axis="x", color=GRID_CLR, linewidth=0.4, alpha=0.4)
    ax.yaxis.set_major_formatter(mticker.FuncFormatter(_format_money_tick))
    ax.yaxis.set_minor_formatter(mticker.NullFormatter())


def _composite_logo(png_path, logo_img):
    """Paste ``logo_img`` onto the header band of the PNG at ``png_path``, in place."""
    from PIL import Image

    # Load fully and release the file handle before overwriting the same path.
    with Image.open(png_path) as src:
        plot_img = src.convert("RGBA")

    # The header band is the top 10% of the image; this holds because the
    # figure is saved without tight-bbox cropping.
    header_h = int(plot_img.size[1] * 0.10)
    # Scale the logo to ~65% of the header height, keeping its aspect ratio.
    target_h = int(header_h * 0.65)
    scale = target_h / logo_img.size[1]
    logo = logo_img.resize((int(logo_img.size[0] * scale), target_h), Image.LANCZOS)

    # Vertically centred in the band, fixed left margin; logo alpha is the mask.
    y_offset = (header_h - target_h) // 2
    x_offset = 70
    plot_img.paste(logo, (x_offset, y_offset), logo)
    plot_img.save(png_path)


def make_plot(runs):
    """Render the config × seed grid of funds curves to plots/sonnet_vs_gemini.png.

    Rows follow ``CONFIGS`` and columns follow ``SEEDS``. Each cell shows, on
    a log y-axis, every run whose ``config``/``seed`` match that cell. If the
    logo can be rendered it is composited onto the header band of the saved
    PNG afterwards.

    Args:
        runs: List of run dicts as produced by ``load_all``. Keys read here:
            ``config``, ``seed``, ``model_key``, ``label``, ``color``,
            ``times``, ``balances``, ``bankrupt`` and ``final``.

    Side effects:
        Writes ``plots/sonnet_vs_gemini.png`` under ``ROOT`` (creating the
        directory if needed) and prints the output path.
    """
    fig, axes = plt.subplots(
        len(CONFIGS),
        len(SEEDS),
        figsize=(30, 22),
        facecolor=BG_COLOR,
        squeeze=False,  # always a 2-D array, whatever the grid shape
    )

    _draw_header(fig)

    # Pre-render logo from SVG at high res (composited after savefig).
    logo_img = load_logo_image(height_px=120)

    # Group runs by grid cell once instead of rescanning the list per cell.
    runs_by_cell = {}
    for run in runs:
        runs_by_cell.setdefault((run["config"], run["seed"]), []).append(run)

    for row, config in enumerate(CONFIGS):
        for col, seed in enumerate(SEEDS):
            ax = axes[row][col]
            _prepare_axes(ax)

            cell_runs = sorted(runs_by_cell.get((config, seed), []), key=_run_sort_key)
            for run in cell_runs:
                if run["times"]:  # skip runs with an empty ledger
                    _plot_run(ax, run)

            # Row label on the left-most column only.
            if col == 0:
                ax.set_ylabel(
                    "Funds ($)", fontsize=20, color=MUTED, fontweight="400", labelpad=10
                )
                ax.annotate(
                    config.upper(),
                    xy=(-0.22, 0.5),
                    xycoords="axes fraction",
                    fontsize=23,
                    fontweight="800",
                    color=DIFF_COLORS[config],
                    ha="center",
                    va="center",
                    rotation=90,
                )

            _format_axes(ax)

    fig.subplots_adjust(
        left=0.08,
        right=0.98,
        top=0.79,
        bottom=0.05,
        hspace=0.30,
        wspace=0.22,
    )

    # Seed column headers just above the grid, centred on the real axes
    # positions (equal thirds of the figure would ignore the wspace gaps).
    for ax, seed in zip(axes[0], SEEDS):
        box = ax.get_position()
        fig.text(
            (box.x0 + box.x1) / 2,
            0.80,
            f"Seed {seed}",
            ha="center",
            va="bottom",
            fontsize=26,
            fontweight="600",
            color=TEXT_CLR,
        )

    # Footer
    fig.text(
        0.5,
        0.01,
        "collinear.ai | YC-Bench: Long-Horizon Deterministic Benchmark for LLM Agents",
        ha="center",
        va="bottom",
        fontsize=18,
        fontweight="400",
        color=MUTED,
        fontstyle="italic",
    )

    out = ROOT / "plots" / "sonnet_vs_gemini.png"
    out.parent.mkdir(parents=True, exist_ok=True)
    dpi = 150
    fig.savefig(out, dpi=dpi, facecolor=BG_COLOR, pad_inches=0)
    plt.close(fig)  # release the figure now that it is on disk

    # Composite SVG logo onto the navy header band.
    if logo_img is not None:
        _composite_logo(out, logo_img)

    print(f"\nSaved: {out}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: announce, load every run found on disk, then render.
    print("Loading runs...")
    make_plot(load_all())
|