Add Collinear branding, bot runners, and clean up stale plots

- Restyle plot_comparison.py with Collinear brand palette and logo
- Add collinear_logo.svg and collinear_wordmark.svg
- Add bot_runner.py (greedy/random/throughput/prestige strategies)
- Add greedy_bot.py shim
- Remove old unused plots (funds_curves, notepad gifs, sonnet_results)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
adit jain 2026-02-26 21:12:05 -08:00
parent 75a53de25c
commit 5f31969865
11 changed files with 723 additions and 52 deletions

11
plots/collinear_logo.svg Normal file
View file

@ -0,0 +1,11 @@
<svg width="39" height="40" viewBox="0 0 39 40" fill="none" xmlns="http://www.w3.org/2000/svg">
<g clip-path="url(#clip0_369_133)">
<path d="M25.6184 26.5238H36.7879C34.2158 33.8016 27.3592 39.0125 19.316 39.0125C9.06315 39.0125 0.75 30.5566 0.75 20.1276C0.75 9.69867 9.06315 1.25 19.316 1.25C27.3592 1.25 34.2087 6.45363 36.7879 13.7315H25.6184C23.9558 12.0258 21.7176 11.0646 19.316 11.0646C14.4063 11.0646 10.406 15.1336 10.406 20.1348C10.406 25.1361 14.4063 29.1978 19.316 29.1978C21.7176 29.1978 23.9558 28.2438 25.6184 26.531V26.5238Z" fill="#13234D"/>
<path d="M37.8818 20.125C37.8818 20.7249 37.8534 21.3103 37.7966 21.8957H23.4653L22.7831 22.8064C21.9447 23.9194 20.68 24.5626 19.3157 24.5626C16.9071 24.5626 14.9531 22.5751 14.9531 20.125C14.9531 17.675 16.9071 15.6875 19.3157 15.6875C20.6871 15.6875 21.9518 16.3307 22.7831 17.4437L23.4653 18.3471H37.7966C37.8534 18.9325 37.8818 19.5252 37.8818 20.125Z" fill="#F26125"/>
</g>
<defs>
<clipPath id="clip0_369_133">
<rect width="39" height="39" fill="white" transform="translate(0 0.5)"/>
</clipPath>
</defs>
</svg>

After

Width:  |  Height:  |  Size: 1.1 KiB

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 7.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 150 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 39 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 20 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 178 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 407 KiB

After

Width:  |  Height:  |  Size: 721 KiB

Before After
Before After

451
scripts/bot_runner.py Normal file
View file

@ -0,0 +1,451 @@
"""Bot runner: plays YC-Bench using direct DB access with pluggable strategies.

Strategies:
    greedy      pick highest reward among completable tasks
    random      pick randomly among completable tasks (deterministic via RngStreams)
    throughput  pick highest reward/hour among completable tasks
    prestige    phase 1: climb prestige fast, phase 2: throughput

Usage:
    uv run python scripts/bot_runner.py                 # all bots, all configs, all seeds
    uv run python scripts/bot_runner.py --bot greedy    # just greedy
    uv run python scripts/bot_runner.py --bot random --seed 1 --config medium
"""
from __future__ import annotations
import argparse
import os
import sys
from dataclasses import dataclass
from datetime import datetime, timezone
from decimal import Decimal
from pathlib import Path
from typing import Callable, Optional
from uuid import uuid4
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from yc_bench.config import load_config
from yc_bench.core.business_time import add_business_hours
from yc_bench.core.engine import advance_time
from yc_bench.core.eta import recalculate_etas
from yc_bench.core.events import fetch_next_event, insert_event
from yc_bench.db.models.company import Company, CompanyPrestige
from yc_bench.db.models.employee import Employee, EmployeeSkillRate
from yc_bench.db.models.event import EventType
from yc_bench.db.models.sim_state import SimState
from yc_bench.db.models.task import Task, TaskAssignment, TaskRequirement, TaskStatus
from yc_bench.db.session import build_engine, build_session_factory, init_db, session_scope
from yc_bench.services.generate_tasks import generate_replacement_task
from yc_bench.services.rng import RngStreams
from yc_bench.services.seed_world import SeedWorldRequest, seed_world_transactional
# Config names (each is passed to load_config) and run seeds that are swept
# when the CLI does not narrow the run to a single value.
CONFIGS = ["medium", "hard", "nightmare"]
SEEDS = [1, 2, 3]
# Cap task cycles to match LLM throughput. An LLM gets 500 turns and needs
# ~5 turns per task cycle (browse + accept + 5× assign + dispatch + resume),
# so it can complete at most ~100 tasks. The sim still runs to horizon —
# once the budget is exhausted the bot just advances time (paying salaries,
# bleeding cash) exactly like an LLM that hit max_turns.
# NOTE(review): 500 turns / ~5 turns per cycle gives the 100 below, but the
# parenthetical enumerates more than five actions — confirm the estimate.
MAX_TASK_CYCLES = 100
@dataclass
class CandidateTask:
    """A market task bundled with the figures the strategy functions rank on."""

    task: object  # ORM Task row
    reward_cents: int  # copied from task.reward_funds_cents
    prestige_delta: float  # float(task.reward_prestige_delta)
    # Estimated hours to finish with every employee assigned; _build_candidates
    # stores Decimal("999999") here when no estimate could be computed.
    completion_hours: Decimal
    # True when the estimated finish time is on or before the computed deadline.
    is_completable: bool
def estimate_completion_hours(task_reqs, employee_skills, n_concurrent_tasks=1):
    """Estimate the hours needed to finish a task when every employee works on it.

    Args:
        task_reqs: iterable of dicts with ``"domain"`` and ``"required_qty"`` keys.
        employee_skills: one ``{domain: Decimal rate per hour}`` mapping per employee.
        n_concurrent_tasks: each employee's rate is divided by this number.

    Returns:
        The largest per-domain ``required_qty / combined_rate`` as a Decimal
        (``Decimal("0")`` when there are no requirements), or ``None`` if any
        required domain has a combined rate of zero or less.
    """
    zero = Decimal("0")

    # Combined hourly rate per required domain, summed employee by employee.
    rate_by_domain = {}
    for requirement in task_reqs:
        name = requirement["domain"]
        rate_by_domain[name] = sum(
            (skills.get(name, zero) / Decimal(n_concurrent_tasks)
             for skills in employee_skills),
            zero,
        )

    # The slowest domain determines when the whole task is done.
    longest = zero
    for requirement in task_reqs:
        name = requirement["domain"]
        quantity = Decimal(str(requirement["required_qty"]))
        combined = rate_by_domain.get(name, zero)
        if combined <= 0:
            return None
        longest = max(longest, quantity / combined)
    return longest
def _compute_deadline(accepted_at, total_required_qty, cfg):
    """Return the deadline for a task accepted at ``accepted_at``.

    The allowance in business days is the larger of ``cfg.deadline_min_biz_days``
    and ``total_required_qty / cfg.deadline_qty_per_day`` truncated to an int;
    it is converted to hours using the configured workday length and added to
    ``accepted_at`` with ``add_business_hours``.
    """
    hours_per_day = cfg.workday_end_hour - cfg.workday_start_hour
    minimum_days = cfg.deadline_min_biz_days
    qty_based_days = int(total_required_qty / cfg.deadline_qty_per_day)
    allowed_days = max(minimum_days, qty_based_days)
    allowance = Decimal(str(allowed_days)) * Decimal(str(hours_per_day))
    return add_business_hours(accepted_at, allowance)
def _build_candidates(db, company_id, sim_state, world_cfg, emp_skills):
    """Build a CandidateTask for every market task the company's prestige unlocks.

    Args:
        db: open database session used for all queries.
        company_id: company whose prestige rows set the visibility threshold.
        sim_state: supplies ``sim_time``, used as the hypothetical acceptance time.
        world_cfg: passed to ``_compute_deadline``.
        emp_skills: list of ``{"id": ..., "skills": {domain: rate}}`` dicts.

    Returns:
        ``(candidates, max_prestige)``. Candidates follow the query order
        (reward descending); ``max_prestige`` is the highest prestige level
        found, or 1.0 when the company has no prestige rows.
    """
    prestige_levels = [
        float(row.prestige_level)
        for row in db.query(CompanyPrestige).filter(
            CompanyPrestige.company_id == company_id
        ).all()
    ]
    max_prestige = max(prestige_levels, default=1.0)

    visible_tasks = (
        db.query(Task)
        .filter(
            Task.status == TaskStatus.MARKET,
            Task.required_prestige <= int(max_prestige),
        )
        .order_by(Task.reward_funds_cents.desc())
        .all()
    )

    # Plain per-employee copies of the skill maps, as the estimator expects.
    skill_maps = [dict(entry["skills"]) for entry in emp_skills]

    candidates = []
    for task in visible_tasks:
        requirement_rows = db.query(TaskRequirement).filter(
            TaskRequirement.task_id == task.id
        ).all()
        total_qty = sum(float(row.required_qty) for row in requirement_rows)
        task_reqs = [
            {"domain": row.domain, "required_qty": float(row.required_qty)}
            for row in requirement_rows
        ]
        hours = estimate_completion_hours(task_reqs, skill_maps, n_concurrent_tasks=1)

        if hours is None:
            # No estimate possible: record a sentinel and mark as not completable.
            feasible = False
            recorded_hours = Decimal("999999")
        else:
            deadline = _compute_deadline(sim_state.sim_time, total_qty, world_cfg)
            finish_at = add_business_hours(sim_state.sim_time, hours)
            feasible = finish_at <= deadline
            recorded_hours = hours

        candidates.append(CandidateTask(
            task=task,
            reward_cents=task.reward_funds_cents,
            prestige_delta=float(task.reward_prestige_delta),
            completion_hours=recorded_hours,
            is_completable=feasible,
        ))
    return candidates, max_prestige
# ── Strategy functions ──────────────────────────────────────────────────────
StrategyFn = Callable # (completable: list[CandidateTask], context: dict) -> Optional[CandidateTask]
def strategy_greedy(completable: list[CandidateTask], context: dict) -> Optional[CandidateTask]:
    """Return the candidate with the largest ``reward_cents``.

    ``context`` is unused. Returns None for an empty list; on ties the
    earliest candidate in the list wins.
    """
    if not completable:
        return None
    richest = None
    for candidate in completable:
        # Strict comparison keeps the first of several equally paid tasks.
        if richest is None or candidate.reward_cents > richest.reward_cents:
            richest = candidate
    return richest
def strategy_random(completable: list[CandidateTask], context: dict) -> Optional[CandidateTask]:
    """Pick one completable task at random using a seeded RNG stream.

    The stream is derived from ``context["seed"]`` and named after
    ``context["turn"]``, so the same (seed, turn) selects from the same
    stream. Returns None when ``completable`` is empty.
    """
    if not completable:
        return None
    run_seed, turn_no = context["seed"], context["turn"]
    picker = RngStreams(run_seed).stream(f"bot_random_select:{turn_no}")
    return picker.choice(completable)
def strategy_throughput(completable: list[CandidateTask], context: dict) -> Optional[CandidateTask]:
    """Pick the task with the highest reward per estimated hour.

    Args:
        completable: candidates to choose from; each needs ``reward_cents``
            and a Decimal ``completion_hours``.
        context: unused; accepted so every strategy shares one signature.

    Returns:
        The best candidate (the earliest one on ties), or None when
        ``completable`` is empty.
    """
    if not completable:
        return None

    def rank(candidate):
        reward = Decimal(candidate.reward_cents)
        hours = candidate.completion_hours
        # A zero-hour estimate (a task with no outstanding quantity) would make
        # the division raise decimal.DivisionByZero. Such a task has unbounded
        # throughput, so rank it above every timed task, ordered by reward.
        if hours <= 0:
            return (1, reward)
        return (0, reward / hours)

    return max(completable, key=rank)
def strategy_prestige(completable: list[CandidateTask], context: dict) -> Optional[CandidateTask]:
    """Phase 1 (prestige < 5): climb prestige fastest. Phase 2: throughput.

    Args:
        completable: candidates to choose from; each needs ``reward_cents``,
            ``prestige_delta`` and a Decimal ``completion_hours``.
        context: must contain ``"max_prestige"``, the company's current
            highest prestige level.

    Returns:
        While prestige is below 5 and at least one candidate grants prestige:
        the candidate with the highest prestige gain per hour. Otherwise the
        candidate with the highest reward per hour. None when ``completable``
        is empty. Ties go to the earliest candidate.
    """
    if not completable:
        return None

    def per_hour(amount, hours):
        # A zero-hour estimate would make the division raise
        # decimal.DivisionByZero; rank such tasks above every timed task
        # and order them among themselves by the raw amount.
        if hours <= 0:
            return (1, amount)
        return (0, amount / hours)

    current_prestige = context["max_prestige"]
    if current_prestige < 5:
        # Prefer tasks that give prestige delta per hour of work
        prestige_tasks = [c for c in completable if c.prestige_delta > 0]
        if prestige_tasks:
            return max(
                prestige_tasks,
                key=lambda c: per_hour(Decimal(str(c.prestige_delta)), c.completion_hours),
            )
    # Fall back to throughput
    return max(
        completable,
        key=lambda c: per_hour(Decimal(c.reward_cents), c.completion_hours),
    )
# Registry of available bots: CLI name -> (slug, strategy function).
# run_bot uses the slug in the database filename, derives the company name
# from it, and reports it under "bot" in the result dict.
STRATEGIES = {
    "greedy": ("greedy_bot", strategy_greedy),
    "random": ("random_bot", strategy_random),
    "throughput": ("throughput_bot", strategy_throughput),
    "prestige": ("prestige_bot", strategy_prestige),
}
# ── Shared simulation runner ───────────────────────────────────────────────
def run_bot(config_name: str, seed: int, bot_slug: str, strategy_fn: StrategyFn) -> dict:
    """Run a bot strategy on one (config, seed) pair. Returns result dict.

    A fresh SQLite database is created under ``db/`` (any existing file for
    the same config/seed/bot is deleted), the world is seeded, and the
    simulation is stepped until the company's funds go negative, the horizon
    is reached, or no further event is available. The bot works on one task
    at a time with every employee assigned, and accepts at most
    ``MAX_TASK_CYCLES`` tasks; after that it only advances time.

    Args:
        config_name: name passed to ``load_config``.
        seed: run seed for world seeding and for the strategy context.
        bot_slug: used in the DB filename, the company name and the result.
        strategy_fn: called as ``strategy_fn(completable, context)`` with
            ``context`` holding ``seed``, ``turn`` and ``max_prestige``;
            returns a CandidateTask or None.

    Returns:
        Dict with keys ``config``, ``seed``, ``bot``, ``turns``,
        ``final_balance_cents``, ``bankrupt``, ``tasks_completed``,
        ``tasks_failed`` and ``max_prestige``.
    """
    cfg = load_config(config_name)
    world_cfg = cfg.world
    # Start every run from a clean database file for this (config, seed, bot).
    db_dir = Path("db")
    db_dir.mkdir(exist_ok=True)
    db_path = db_dir / f"{config_name}_{seed}_{bot_slug}.db"
    if db_path.exists():
        db_path.unlink()
    db_url = f"sqlite:///{db_path}"
    # NOTE(review): nothing in this function reads these variables back;
    # presumably yc_bench internals consult them — confirm.
    os.environ["DATABASE_URL"] = db_url
    os.environ["YC_BENCH_EXPERIMENT"] = config_name
    engine = build_engine(db_url)
    init_db(engine)
    factory = build_session_factory(engine)
    # Setup session: seed the world, schedule the horizon-end event and
    # create the SimState row.
    with session_scope(factory) as db:
        start_dt = datetime(2025, 1, 1, 9, 0, 0, tzinfo=timezone.utc)
        horizon_end = start_dt.replace(year=start_dt.year + cfg.sim.horizon_years)
        req = SeedWorldRequest(
            run_seed=seed,
            company_name=bot_slug.replace("_", " ").title(),
            horizon_years=cfg.sim.horizon_years,
            employee_count=world_cfg.num_employees,
            market_task_count=world_cfg.num_market_tasks,
            start_date=start_dt,
        )
        result = seed_world_transactional(db, req)
        company_id = result.company_id
        insert_event(
            db=db,
            company_id=company_id,
            event_type=EventType.HORIZON_END,
            scheduled_at=horizon_end,
            payload={"reason": "horizon_end"},
            dedupe_key="horizon_end",
        )
        sim_state = SimState(
            company_id=company_id,
            sim_time=start_dt,
            run_seed=seed,
            horizon_end=horizon_end,
            replenish_counter=0,
        )
        db.add(sim_state)
        db.flush()
    tasks_completed = 0
    tasks_failed = 0
    task_cycles_used = 0
    # `turn` counts every loop iteration, including pure time-advance steps
    # and the iteration that terminates the loop.
    turn = 0
    while True:
        turn += 1
        # Each iteration runs in its own session scope.
        with session_scope(factory) as db:
            sim_state = db.query(SimState).first()
            company = db.query(Company).filter(Company.id == company_id).one()
            # Stop once bankrupt or at/after the horizon.
            if company.funds_cents < 0:
                break
            if sim_state.sim_time >= sim_state.horizon_end:
                break
            active_tasks = db.query(Task).filter(
                Task.company_id == company_id,
                Task.status == TaskStatus.ACTIVE,
            ).all()
            if active_tasks:
                # A task is in flight: jump to the next scheduled event and
                # tally any completions reported in the wake events.
                next_event = fetch_next_event(db, company_id, sim_state.horizon_end)
                if next_event is None:
                    break
                adv = advance_time(db, company_id, next_event.scheduled_at)
                for we in adv.wake_events:
                    if we.get("type") == "task_completed":
                        if we.get("success"):
                            tasks_completed += 1
                        else:
                            tasks_failed += 1
                if adv.bankrupt or adv.horizon_reached:
                    break
                continue
            # No active task — if we've used up our task budget, just
            # advance time (pay salaries, bleed cash) like an LLM that
            # hit max_turns would.
            if task_cycles_used >= MAX_TASK_CYCLES:
                next_event = fetch_next_event(db, company_id, sim_state.horizon_end)
                if next_event is None:
                    # Nothing left to process: run the clock to the horizon and stop.
                    adv = advance_time(db, company_id, sim_state.horizon_end)
                    break
                adv = advance_time(db, company_id, next_event.scheduled_at)
                if adv.bankrupt or adv.horizon_reached:
                    break
                continue
            # Get employees and build candidates
            employees = db.query(Employee).filter(Employee.company_id == company_id).all()
            emp_skills = []
            for emp in employees:
                skills = db.query(EmployeeSkillRate).filter(
                    EmployeeSkillRate.employee_id == emp.id
                ).all()
                skill_map = {s.domain: Decimal(s.rate_domain_per_hour) for s in skills}
                emp_skills.append({"id": emp.id, "skills": skill_map})
            candidates, max_prestige = _build_candidates(db, company_id, sim_state, world_cfg, emp_skills)
            completable = [c for c in candidates if c.is_completable]
            context = {
                "seed": seed,
                "turn": turn,
                "max_prestige": max_prestige,
            }
            chosen = strategy_fn(completable, context)
            if chosen is None:
                # The strategy picked nothing: advance to the next event and
                # try again on a later iteration.
                next_event = fetch_next_event(db, company_id, sim_state.horizon_end)
                if next_event is None:
                    adv = advance_time(db, company_id, sim_state.horizon_end)
                    break
                adv = advance_time(db, company_id, next_event.scheduled_at)
                if adv.bankrupt or adv.horizon_reached:
                    break
                continue
            best_task = chosen.task
            # Accept the task
            reqs = db.query(TaskRequirement).filter(
                TaskRequirement.task_id == best_task.id
            ).all()
            total_qty = sum(float(r.required_qty) for r in reqs)
            best_task.status = TaskStatus.PLANNED
            best_task.company_id = company_id
            best_task.accepted_at = sim_state.sim_time
            best_task.deadline = _compute_deadline(sim_state.sim_time, total_qty, world_cfg)
            # Generate replacement
            # (adds one new MARKET row for the task that was just taken; the
            # counter value used is the one from before the increment)
            counter = sim_state.replenish_counter
            sim_state.replenish_counter = counter + 1
            replacement = generate_replacement_task(
                run_seed=sim_state.run_seed,
                replenish_counter=counter,
                cfg=world_cfg,
            )
            replacement_row = Task(
                id=uuid4(),
                company_id=None,
                status=TaskStatus.MARKET,
                title=replacement.title,
                description=replacement.description,
                required_prestige=replacement.required_prestige,
                reward_funds_cents=replacement.reward_funds_cents,
                reward_prestige_delta=replacement.reward_prestige_delta,
                skill_boost_pct=replacement.skill_boost_pct,
                accepted_at=None, deadline=None, completed_at=None,
                success=None, halfway_event_emitted=False,
            )
            db.add(replacement_row)
            for domain, qty in replacement.requirements.items():
                db.add(TaskRequirement(
                    task_id=replacement_row.id,
                    domain=domain,
                    required_qty=qty,
                    completed_qty=0,
                ))
            # Assign ALL employees
            for e in emp_skills:
                db.add(TaskAssignment(
                    task_id=best_task.id,
                    employee_id=e["id"],
                    assigned_at=sim_state.sim_time,
                ))
            # NOTE(review): assignments are flushed before the task is flipped
            # to ACTIVE and ETAs are recalculated; presumably the ordering
            # matters to recalculate_etas — confirm before reordering.
            db.flush()
            best_task.status = TaskStatus.ACTIVE
            db.flush()
            recalculate_etas(db, company_id, sim_state.sim_time,
                             impacted_task_ids={best_task.id},
                             half_threshold=world_cfg.task_half_threshold)
            task_cycles_used += 1
    # Final state
    with session_scope(factory) as db:
        company = db.query(Company).filter(Company.id == company_id).one()
        sim_state = db.query(SimState).first()
        final_balance = company.funds_cents
        bankrupt = final_balance < 0
        prestige_rows = db.query(CompanyPrestige).filter(
            CompanyPrestige.company_id == company_id
        ).all()
        max_p = max((float(p.prestige_level) for p in prestige_rows), default=1.0)
        return {
            "config": config_name,
            "seed": seed,
            "bot": bot_slug,
            "turns": turn,
            "final_balance_cents": final_balance,
            "bankrupt": bankrupt,
            "tasks_completed": tasks_completed,
            "tasks_failed": tasks_failed,
            "max_prestige": max_p,
        }
def main():
    """CLI entry point: run the selected bots and print per-run and summary output.

    ``--bot``, ``--config`` and ``--seed`` each narrow the sweep to a single
    value; any option left out means "all" (every entry of STRATEGIES,
    CONFIGS or SEEDS respectively).
    """
    parser = argparse.ArgumentParser(description="Run YC-Bench bot strategies")
    parser.add_argument("--bot", choices=list(STRATEGIES), default=None,
                        help="Run only this bot (default: all)")
    parser.add_argument("--config", choices=CONFIGS, default=None,
                        help="Run only this config (default: all)")
    parser.add_argument("--seed", type=int, default=None,
                        help="Run only this seed (default: all)")
    args = parser.parse_args()

    # Test against None rather than truthiness so that an explicit
    # ``--seed 0`` runs seed 0 instead of silently falling back to all seeds.
    bots = [args.bot] if args.bot is not None else list(STRATEGIES)
    configs = [args.config] if args.config is not None else CONFIGS
    seeds = [args.seed] if args.seed is not None else SEEDS

    def balance_label(result):
        # Shared by the progress line and the summary table.
        if result["bankrupt"]:
            return "BANKRUPT"
        return f"${result['final_balance_cents']/100:,.0f}"

    results = []
    total = len(bots) * len(configs) * len(seeds)
    print(f"Running {total} bot simulations...\n")
    for bot_name in bots:
        slug, strategy_fn = STRATEGIES[bot_name]
        for config_name in configs:
            for seed in seeds:
                print(f" {slug} | {config_name} seed={seed} ...", end=" ", flush=True)
                r = run_bot(config_name, seed, slug, strategy_fn)
                results.append(r)
                tag = balance_label(r)
                print(f"{tag} | {r['tasks_completed']} OK, {r['tasks_failed']} fail | prestige {r['max_prestige']:.1f} | {r['turns']} turns")

    print(f"\n{'Bot':<16} {'Config':<12} {'Seed':<5} {'Final Balance':>14} {'OK':>4} {'Fail':>5} {'Prestige':>9}")
    print("-" * 70)
    for r in results:
        fb = balance_label(r)
        print(f"{r['bot']:<16} {r['config']:<12} {r['seed']:<5} {fb:>14} {r['tasks_completed']:>4} {r['tasks_failed']:>5} {r['max_prestige']:>8.1f}")
    bankrupt_count = sum(1 for r in results if r["bankrupt"])
    print(f"\nBankruptcies: {bankrupt_count}/{len(results)}")
if __name__ == "__main__":
main()

48
scripts/greedy_bot.py Normal file
View file

@ -0,0 +1,48 @@
"""Greedy bot shim — delegates to bot_runner.py.
Usage:
uv run python scripts/greedy_bot.py
"""
from __future__ import annotations
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
sys.path.insert(0, str(Path(__file__).parent))
from bot_runner import CONFIGS, SEEDS, STRATEGIES, run_bot
def main():
    """Run the greedy strategy over every (config, seed) pair and print a summary.

    Prints one progress line per run, then a table of final balances, task
    counts and prestige, then the number of bankrupt runs.
    """
    slug, strategy_fn = STRATEGIES["greedy"]
    print("Running greedy bot across all configs and seeds...\n")

    def balance_label(result):
        # The original had separate branches for balances above and below
        # $1M that produced the same text; one formatter covers both.
        if result["bankrupt"]:
            return "BANKRUPT"
        return f"${result['final_balance_cents']/100:,.0f}"

    results = []
    for config_name in CONFIGS:
        for seed in SEEDS:
            print(f" {config_name} seed={seed} ...", end=" ", flush=True)
            r = run_bot(config_name, seed, slug, strategy_fn)
            results.append(r)
            tag = balance_label(r)
            print(f"{tag} | {r['tasks_completed']} OK, {r['tasks_failed']} fail | prestige {r['max_prestige']:.1f} | {r['turns']} turns")

    print(f"\n{'Config':<12} {'Seed':<5} {'Final Balance':>14} {'OK':>4} {'Fail':>5} {'Prestige':>9}")
    print("-" * 55)
    for r in results:
        fb = balance_label(r)
        print(f"{r['config']:<12} {r['seed']:<5} {fb:>14} {r['tasks_completed']:>4} {r['tasks_failed']:>5} {r['max_prestige']:>8.1f}")
    bankrupt_count = sum(1 for r in results if r["bankrupt"])
    print(f"\nBankruptcies: {bankrupt_count}/{len(results)}")
if __name__ == "__main__":
main()

View file

@ -1,4 +1,4 @@
"""Sonnet 4.6 vs Gemini 3 Flash vs GPT-5.2 — apples-to-apples comparison plot."""
"""YC-Bench comparison plot — Collinear AI branding."""
import sqlite3
from pathlib import Path
from datetime import datetime
@ -8,31 +8,69 @@ matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as mticker
import numpy as np
ROOT = Path(__file__).parent.parent
INITIAL_FUNDS_CENTS = 25_000_000
# ── Collinear brand palette ──────────────────────────────────────────────────
NAVY = "#13234D"
ORANGE = "#F26125"
BLUE = "#4D65FF"
BG_COLOR = "#FAFBFD"
GRID_CLR = "#E8ECF2"
TEXT_CLR = "#2A2F3D"
MUTED = "#6B7694"
CARD_BG = "#FFFFFF"
MODELS = {
"sonnet": {
"slug": "anthropic_claude-sonnet-4-6",
"label": "Sonnet 4.6",
"color": "#2563eb",
"color": BLUE,
},
"gemini": {
"slug": "gemini_gemini-3-flash-preview",
"label": "Gemini 3 Flash",
"color": "#f97316",
"color": ORANGE,
},
"gpt52": {
"slug": "openai_gpt-5.2",
"label": "GPT-5.2",
"color": "#16a34a",
"color": "#22C55E",
},
"greedy": {
"slug": "greedy_bot",
"label": "Greedy Bot",
"color": NAVY,
},
}
BOT_KEYS = {"greedy"}
CONFIGS = ["medium", "hard", "nightmare"]
SEEDS = [1, 2, 3]
DIFF_COLORS = {"medium": BLUE, "hard": ORANGE, "nightmare": "#DC2626"}
def load_logo_image(height_px=80):
    """Render the wordmark SVG to a high-res RGBA PIL image.

    Args:
        height_px: output height in pixels passed to the SVG renderer.

    Returns:
        An RGBA ``PIL.Image.Image``, or None when
        ``plots/collinear_wordmark.svg`` does not exist under ROOT.
    """
    svg_path = ROOT / "plots" / "collinear_wordmark.svg"
    # Check for the file first so the "no logo" path needs neither cairosvg
    # nor Pillow to be installed.
    if not svg_path.exists():
        return None

    import ctypes.util
    import io
    import os

    # Ensure homebrew cairo is findable
    if ctypes.util.find_library("cairo") is None:
        brew_lib = "/opt/homebrew/lib"
        if Path(brew_lib).exists():
            os.environ.setdefault("DYLD_LIBRARY_PATH", brew_lib)

    # Imported only after the library path tweak above.
    import cairosvg
    from PIL import Image

    png_data = cairosvg.svg2png(url=str(svg_path), output_height=height_px)
    return Image.open(io.BytesIO(png_data)).convert("RGBA")
def load_funds_curve(db_path):
con = sqlite3.connect(str(db_path))
@ -42,7 +80,6 @@ def load_funds_curve(db_path):
con.close()
if not rows:
return [], []
times, balances = [], []
running = INITIAL_FUNDS_CENTS
start = datetime.fromisoformat(rows[0][0]).replace(
@ -50,16 +87,13 @@ def load_funds_curve(db_path):
)
times.append(start)
balances.append(running / 100)
for occurred_at, amount_cents in rows:
running += int(amount_cents)
t = datetime.fromisoformat(occurred_at)
# Cap at end of year 1 for apples-to-apples
if t.year > 2025:
break
times.append(t)
balances.append(running / 100)
return times, balances
@ -74,13 +108,10 @@ def load_all():
times, balances = load_funds_curve(db_path)
bankrupt = len(balances) > 1 and balances[-1] <= 0
runs.append({
"config": config,
"seed": seed,
"model_key": key,
"label": model["label"],
"config": config, "seed": seed,
"model_key": key, "label": model["label"],
"color": model["color"],
"times": times,
"balances": balances,
"times": times, "balances": balances,
"bankrupt": bankrupt,
"final": balances[-1] if balances else 0,
})
@ -90,79 +121,197 @@ def load_all():
def make_plot(runs):
fig, axes = plt.subplots(3, 3, figsize=(18, 14), facecolor="white")
fig.suptitle(
"Sonnet 4.6 vs Gemini 3 Flash vs GPT-5.2 · YC-Bench · 1-Year Horizon",
fontsize=16, fontweight="600", y=0.98, color="#1a1a1a",
fig, axes = plt.subplots(3, 3, figsize=(30, 22), facecolor=BG_COLOR)
# ── Header band (drawn as a filled Rectangle patch on the figure) ────
from matplotlib.patches import FancyBboxPatch
header_rect = plt.Rectangle((0, 0.90), 1, 0.10,
transform=fig.transFigure, facecolor=NAVY,
edgecolor="none", zorder=0)
fig.patches.append(header_rect)
# Orange accent line under header
accent_rect = plt.Rectangle((0, 0.895), 1, 0.006,
transform=fig.transFigure, facecolor=ORANGE,
edgecolor="none", zorder=1)
fig.patches.append(accent_rect)
fig.text(
0.5, 0.955,
"YC-Bench | 1-Year Horizon",
ha="center", va="center",
fontsize=50, fontweight="700", color="white",
fontfamily="Helvetica Neue", zorder=2,
)
# ── Common legend in header ─────────────────────────────────────────
legend_items = [
("Sonnet 4.6", BLUE, "-", 4.0, 0.95),
("Gemini 3 Flash", ORANGE, "-", 4.0, 0.95),
("GPT-5.2", "#22C55E", "-", 4.0, 0.95),
("Greedy Bot", NAVY, "--", 3.5, 0.75),
]
legend_handles = []
for lbl, clr, ls, lw, alpha in legend_items:
line = plt.Line2D([0], [0], color=clr, linewidth=lw, linestyle=ls,
alpha=alpha)
legend_handles.append(line)
legend_labels = [item[0] for item in legend_items]
fig.legend(
legend_handles, legend_labels,
loc="center", bbox_to_anchor=(0.53, 0.855),
ncol=4, fontsize=22, frameon=False,
labelcolor=TEXT_CLR, handlelength=3.5, handletextpad=1.0,
columnspacing=3.0,
)
# Pre-render logo from SVG at high res (will composite after savefig)
logo_img = load_logo_image(height_px=120)
for row, config in enumerate(CONFIGS):
for col, seed in enumerate(SEEDS):
ax = axes[row][col]
ax.set_facecolor("white")
for spine in ax.spines.values():
spine.set_edgecolor("#d0d0d0")
spine.set_linewidth(0.7)
ax.set_facecolor(CARD_BG)
# Bankruptcy line
ax.axhline(0, color="#ef4444", linewidth=0.8, linestyle="--", alpha=0.4)
ax.axhline(250_000, color="#9ca3af", linewidth=0.5, linestyle=":", alpha=0.4)
for spine in ax.spines.values():
spine.set_edgecolor(GRID_CLR)
spine.set_linewidth(1.2)
# Log scale on y-axis
ax.set_yscale("log")
# Reference lines
ax.axhline(250_000, color=MUTED, linewidth=0.8, linestyle=":", alpha=0.3, zorder=1)
cell_runs = [r for r in runs if r["config"] == config and r["seed"] == seed]
# Sort: bots first (background), then survivors desc, then bankrupt
def sort_key(r):
if r["model_key"] in BOT_KEYS: return (0, 0)
if not r["bankrupt"]: return (1, -r["final"])
return (2, 0)
cell_runs.sort(key=sort_key)
for r in cell_runs:
if not r["times"]:
continue
alpha = 0.35 if r["bankrupt"] else 1.0
lw = 1.0 if r["bankrupt"] else 2.0
is_bot = r["model_key"] in BOT_KEYS
if r["bankrupt"]:
alpha, lw, ls = 0.4, 2.0, "-" if not is_bot else "--"
elif is_bot:
alpha, lw, ls = 0.75, 3.5, "--"
else:
alpha, lw, ls = 0.95, 3.0, "-"
val = r["final"]
if r["bankrupt"]:
lbl = f"{r['label']} — bankrupt"
elif val >= 1e6:
lbl = f"{r['label']} — ${val/1e6:.1f}M"
else:
val = r["final"]
lbl = f"{r['label']} — ${val/1e6:.1f}M" if val >= 1e6 else f"{r['label']} — ${val/1e3:.0f}K"
lbl = f"{r['label']} — ${val/1e3:.0f}K"
ax.plot(r["times"], r["balances"], color=r["color"],
linewidth=lw, alpha=alpha, label=lbl, zorder=3)
# Clamp balances for log scale (floor at $1K)
plot_bals = [max(b, 1_000) for b in r["balances"]]
ax.plot(
r["times"], plot_bals,
color=r["color"], linewidth=lw, alpha=alpha,
label=lbl, linestyle=ls,
zorder=2 if is_bot else 3,
)
if r["bankrupt"]:
ax.scatter([r["times"][-1]], [r["balances"][-1]],
color=r["color"], marker="x", s=50, linewidths=1.5, alpha=0.5, zorder=5)
else:
ax.scatter([r["times"][-1]], [r["balances"][-1]],
color=r["color"], marker="*", s=100, zorder=5)
ax.scatter(
[r["times"][-1]], [max(r["balances"][-1], 1_000)],
color=r["color"], marker="X", s=120,
linewidths=2, alpha=0.6, zorder=5,
edgecolors="white",
)
elif not is_bot:
ax.scatter(
[r["times"][-1]], [r["balances"][-1]],
color=r["color"], marker="o", s=100, zorder=5,
edgecolors="white", linewidths=2.5,
)
# Title
if row == 0:
ax.set_title(f"Seed {seed}", fontsize=11, fontweight="500", color="#374151", pad=8)
# No per-axis column title (seed labels placed via fig.text below)
# Row label
if col == 0:
ax.set_ylabel(f"{config.upper()}\n\nFunds", fontsize=10, color="#374151", fontweight="600")
ax.set_ylabel("Funds ($)", fontsize=20, color=MUTED, fontweight="400", labelpad=10)
ax.annotate(
config.upper(),
xy=(-0.22, 0.5), xycoords="axes fraction",
fontsize=23, fontweight="800",
color=DIFF_COLORS[config],
ha="center", va="center", rotation=90,
)
# Formatting
# Axes formatting
ax.xaxis.set_major_formatter(mdates.DateFormatter("%b"))
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
ax.tick_params(colors="#666", labelsize=7)
ax.grid(axis="y", color="#f0f0f0", linewidth=0.5)
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
ax.tick_params(colors=MUTED, labelsize=18, length=5, width=0.8, pad=6)
ax.grid(axis="y", color=GRID_CLR, linewidth=0.7, alpha=0.8)
ax.grid(axis="x", color=GRID_CLR, linewidth=0.4, alpha=0.4)
ax.yaxis.set_major_formatter(
mticker.FuncFormatter(
lambda x, _: f"${x/1e6:.0f}M" if abs(x) >= 1e6
else f"${x/1e3:.0f}K" if abs(x) >= 1e3
lambda x, _: f"${x/1e6:.0f}M" if x >= 1e6
else f"${x/1e3:.0f}K" if x >= 1e3
else f"${x:.0f}"
)
)
ax.yaxis.set_minor_formatter(mticker.NullFormatter())
legend = ax.legend(fontsize=7, loc="upper left", frameon=True,
facecolor="white", edgecolor="#e5e7eb", framealpha=0.9)
for text in legend.get_texts():
text.set_color("#374151")
# No per-cell legend (common legend in header)
plt.subplots_adjust(
left=0.08, right=0.98, top=0.79, bottom=0.05,
hspace=0.30, wspace=0.22,
)
# Seed column headers just above the plot grid
col_centers = [0.08 + (0.98 - 0.08) * (i + 0.5) / 3 for i in range(3)]
for i, seed in enumerate(SEEDS):
fig.text(
col_centers[i], 0.80,
f"Seed {seed}",
ha="center", va="bottom",
fontsize=26, fontweight="600", color=TEXT_CLR,
)
# Footer
fig.text(
0.5, 0.01,
"collinear.ai | YC-Bench: Long-Horizon Deterministic Benchmark for LLM Agents",
ha="center", va="bottom",
fontsize=18, fontweight="400", color=MUTED,
fontstyle="italic",
)
plt.tight_layout(rect=[0, 0, 1, 0.95])
out = ROOT / "plots" / "sonnet_vs_gemini.png"
out.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(out, dpi=180, bbox_inches="tight", facecolor="white")
dpi = 150
plt.savefig(out, dpi=dpi, facecolor=BG_COLOR, pad_inches=0)
# Composite SVG logo onto the navy header band
if logo_img is not None:
from PIL import Image
plot_img = Image.open(out).convert("RGBA")
img_w, img_h = plot_img.size
# Header band is top 10% of image (no pad_inches)
header_top = 0
header_h = int(img_h * 0.10)
# Scale logo to ~65% of header height
target_h = int(header_h * 0.65)
scale = target_h / logo_img.size[1]
logo = logo_img.resize((int(logo_img.size[0] * scale), target_h), Image.LANCZOS)
# Center vertically in the navy header band
y_offset = header_top + (header_h - target_h) // 2
x_offset = 70
plot_img.paste(logo, (x_offset, y_offset), logo)
plot_img.save(out)
print(f"\nSaved: {out}")