diff --git a/plots/collinear_logo.svg b/plots/collinear_logo.svg
new file mode 100644
index 0000000..76cd7d0
--- /dev/null
+++ b/plots/collinear_logo.svg
@@ -0,0 +1,11 @@
+
diff --git a/plots/collinear_wordmark.svg b/plots/collinear_wordmark.svg
new file mode 100644
index 0000000..951fb1e
--- /dev/null
+++ b/plots/collinear_wordmark.svg
@@ -0,0 +1,12 @@
+
diff --git a/plots/funds_curves.png b/plots/funds_curves.png
deleted file mode 100644
index a85b93e..0000000
Binary files a/plots/funds_curves.png and /dev/null differ
diff --git a/plots/notepad_unknown_?_gemini-3-flash-preview.gif b/plots/notepad_unknown_?_gemini-3-flash-preview.gif
deleted file mode 100644
index b51f50f..0000000
Binary files a/plots/notepad_unknown_?_gemini-3-flash-preview.gif and /dev/null differ
diff --git a/plots/notepad_unknown_?_glm-5.gif b/plots/notepad_unknown_?_glm-5.gif
deleted file mode 100644
index 11bd60d..0000000
Binary files a/plots/notepad_unknown_?_glm-5.gif and /dev/null differ
diff --git a/plots/notepad_unknown_?_grok-4.1-fast.gif b/plots/notepad_unknown_?_grok-4.1-fast.gif
deleted file mode 100644
index e5f9c60..0000000
Binary files a/plots/notepad_unknown_?_grok-4.1-fast.gif and /dev/null differ
diff --git a/plots/sonnet_results.png b/plots/sonnet_results.png
deleted file mode 100644
index eb59e8f..0000000
Binary files a/plots/sonnet_results.png and /dev/null differ
diff --git a/plots/sonnet_vs_gemini.png b/plots/sonnet_vs_gemini.png
index f14b749..c87f717 100644
Binary files a/plots/sonnet_vs_gemini.png and b/plots/sonnet_vs_gemini.png differ
diff --git a/scripts/bot_runner.py b/scripts/bot_runner.py
new file mode 100644
index 0000000..959b645
--- /dev/null
+++ b/scripts/bot_runner.py
@@ -0,0 +1,451 @@
+"""Bot runner: plays YC-Bench using direct DB access with pluggable strategies.
+
+Strategies:
+ greedy — pick highest reward among completable tasks
+ random — pick randomly among completable tasks (deterministic via RngStreams)
+ throughput — pick highest reward/hour among completable tasks
+ prestige — phase 1: climb prestige fast, phase 2: throughput
+
+Usage:
+ uv run python scripts/bot_runner.py # all bots, all configs, all seeds
+ uv run python scripts/bot_runner.py --bot greedy # just greedy
+ uv run python scripts/bot_runner.py --bot random --seed 1 --config medium
+"""
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from decimal import Decimal
+from pathlib import Path
+from typing import Callable, Optional
+from uuid import uuid4
+
+sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
+
+from yc_bench.config import load_config
+from yc_bench.core.business_time import add_business_hours
+from yc_bench.core.engine import advance_time
+from yc_bench.core.eta import recalculate_etas
+from yc_bench.core.events import fetch_next_event, insert_event
+from yc_bench.db.models.company import Company, CompanyPrestige
+from yc_bench.db.models.employee import Employee, EmployeeSkillRate
+from yc_bench.db.models.event import EventType
+from yc_bench.db.models.sim_state import SimState
+from yc_bench.db.models.task import Task, TaskAssignment, TaskRequirement, TaskStatus
+from yc_bench.db.session import build_engine, build_session_factory, init_db, session_scope
+from yc_bench.services.generate_tasks import generate_replacement_task
+from yc_bench.services.rng import RngStreams
+from yc_bench.services.seed_world import SeedWorldRequest, seed_world_transactional
+
+CONFIGS = ["medium", "hard", "nightmare"]  # config names passed to load_config(); also the --config choices
+SEEDS = [1, 2, 3]  # run seeds used when --seed is not given
+
+# Cap task cycles to match LLM throughput. An LLM gets 500 turns and needs
+# ~5 turns per task cycle (browse + accept + 5× assign + dispatch + resume),
+# so it can complete at most ~100 tasks. The sim still runs to horizon —
+# once the budget is exhausted the bot just advances time (paying salaries,
+# bleeding cash) exactly like an LLM that hit max_turns.
+MAX_TASK_CYCLES = 100  # NOTE(review): the steps listed above sum to ~9 turns, not ~5 — confirm the cap
+
+
+@dataclass
+class CandidateTask:  # one visible market task bundled with the bot's estimates for it
+    task: object  # ORM Task row
+    reward_cents: int  # copied from task.reward_funds_cents
+    prestige_delta: float  # float(task.reward_prestige_delta)
+    completion_hours: Decimal  # all-hands estimate; Decimal("999999") when no estimate exists
+    is_completable: bool  # True when the estimated finish time is on or before the computed deadline
+
+
+def estimate_completion_hours(task_reqs, employee_skills, n_concurrent_tasks=1):
+    """Estimate hours to complete a task with every employee assigned to it.
+
+    Args:
+        task_reqs: dicts with "domain" and "required_qty" keys.
+        employee_skills: one {domain: Decimal rate} mapping per employee.
+        n_concurrent_tasks: divisor applied to every employee rate.
+
+    Returns:
+        The largest per-domain ``required_qty / combined rate`` as a Decimal, or
+        None as soon as a required domain has no positive combined rate.
+    """
+    share = Decimal(n_concurrent_tasks)  # hoisted: same divisor for every rate
+    slowest = Decimal("0")
+    for req in task_reqs:
+        domain = req["domain"]
+        # Employees without the domain contribute a zero rate.
+        rate = sum((emp.get(domain, Decimal("0")) / share for emp in employee_skills), Decimal("0"))
+        if rate <= 0:
+            return None
+        slowest = max(slowest, Decimal(str(req["required_qty"])) / rate)
+    return slowest
+
+
+def _compute_deadline(accepted_at, total_required_qty, cfg):
+    """Deadline = accepted_at + N workdays of business hours, N = max(min biz days, int(qty / qty-per-day))."""
+    n_days = max(cfg.deadline_min_biz_days, int(total_required_qty / cfg.deadline_qty_per_day))
+    return add_business_hours(accepted_at, Decimal(str(n_days)) * Decimal(str(cfg.workday_end_hour - cfg.workday_start_hour)))
+
+
+def _build_candidates(db, company_id, sim_state, world_cfg, emp_skills):
+    """Wrap every market task the company's prestige unlocks in a CandidateTask.
+
+    Returns a ``(candidates, max_prestige)`` tuple. Candidates keep the query
+    order (reward descending); ``max_prestige`` is the highest prestige level
+    among the company's CompanyPrestige rows, or 1.0 when it has none.
+    """
+    levels = [
+        float(row.prestige_level)
+        for row in db.query(CompanyPrestige).filter(CompanyPrestige.company_id == company_id).all()
+    ]
+    max_prestige = max(levels, default=1.0)
+
+    visible = (
+        db.query(Task)
+        .filter(Task.status == TaskStatus.MARKET, Task.required_prestige <= int(max_prestige))
+        .order_by(Task.reward_funds_cents.desc())
+        .all()
+    )
+    skill_maps = [dict(entry["skills"]) for entry in emp_skills]
+    now = sim_state.sim_time
+
+    candidates = []
+    for task in visible:
+        rows = db.query(TaskRequirement).filter(TaskRequirement.task_id == task.id).all()
+        quantities = [float(row.required_qty) for row in rows]
+        spec = [{"domain": row.domain, "required_qty": qty} for row, qty in zip(rows, quantities)]
+        hours = estimate_completion_hours(spec, skill_maps, n_concurrent_tasks=1)
+        # Completable means the all-hands finish time is on or before the deadline.
+        feasible = hours is not None and _compute_deadline(now, sum(quantities), world_cfg) >= add_business_hours(now, hours)
+        candidates.append(CandidateTask(
+            task=task,
+            reward_cents=task.reward_funds_cents,
+            prestige_delta=float(task.reward_prestige_delta),
+            completion_hours=Decimal("999999") if hours is None else hours,
+            is_completable=feasible,
+        ))
+
+    return candidates, max_prestige
+
+
+# ── Strategy functions ──────────────────────────────────────────────────────
+
+StrategyFn = Callable[[list, dict], Optional[CandidateTask]]  # (completable: list[CandidateTask], context: dict) -> chosen candidate or None
+
+
+def strategy_greedy(completable: list[CandidateTask], context: dict) -> Optional[CandidateTask]:
+    """Return the candidate with the largest reward_cents, or None when nothing is completable."""
+    def payout(candidate):
+        return candidate.reward_cents
+    return max(completable, key=payout) if completable else None
+
+
+def strategy_random(completable: list[CandidateTask], context: dict) -> Optional[CandidateTask]:
+    """Choose a completable task with an RNG stream derived from the run seed and the turn."""
+    if completable:
+        # A stream named after the turn makes each pick repeatable for a given seed.
+        seed, turn = context["seed"], context["turn"]
+        picker = RngStreams(seed).stream(f"bot_random_select:{turn}")
+        return picker.choice(completable)
+    return None
+
+
+def strategy_throughput(completable: list[CandidateTask], context: dict) -> Optional[CandidateTask]:
+    """Pick the task with the highest reward per hour; zero-hour tasks rank first instead of dividing by zero."""
+    def rate(c):
+        return Decimal(c.reward_cents) / c.completion_hours if c.completion_hours else Decimal("Infinity")
+    return max(completable, key=rate) if completable else None
+
+
+def strategy_prestige(completable: list[CandidateTask], context: dict) -> Optional[CandidateTask]:
+    """Phase 1 (max prestige < 5): best prestige gain per hour. Otherwise, or with no gainers: best reward per hour."""
+    if not completable:
+        return None
+
+    def per_hour(amount, c):
+        # Zero-hour tasks cost no time: rank them first instead of dividing by zero.
+        return amount / c.completion_hours if c.completion_hours else Decimal("Infinity")
+    climbers = [c for c in completable if c.prestige_delta > 0] if context["max_prestige"] < 5 else []
+    if climbers:
+        return max(climbers, key=lambda c: per_hour(Decimal(str(c.prestige_delta)), c))
+    return max(completable, key=lambda c: per_hour(Decimal(c.reward_cents), c))
+
+
+STRATEGIES = {  # --bot choice -> (slug used in the DB filename and result dict, strategy function)
+    "greedy": ("greedy_bot", strategy_greedy),
+    "random": ("random_bot", strategy_random),
+    "throughput": ("throughput_bot", strategy_throughput),
+    "prestige": ("prestige_bot", strategy_prestige),
+}
+
+
+# ── Shared simulation runner ───────────────────────────────────────────────
+
+def run_bot(config_name: str, seed: int, bot_slug: str, strategy_fn: StrategyFn):
+    """Run a bot strategy on one (config, seed) pair. Returns result dict.
+
+    Seeds a fresh SQLite DB at db/<config>_<seed>_<slug>.db, then loops: advance
+    time while a task is active, else let ``strategy_fn`` pick a completable task
+    and assign every employee to it. Sets DATABASE_URL / YC_BENCH_EXPERIMENT.
+    """
+    cfg = load_config(config_name)
+    world_cfg = cfg.world
+
+    db_dir = Path("db")
+    db_dir.mkdir(exist_ok=True)
+    db_path = db_dir / f"{config_name}_{seed}_{bot_slug}.db"
+
+    if db_path.exists():  # start every run from an empty database
+        db_path.unlink()
+
+    db_url = f"sqlite:///{db_path}"
+    os.environ["DATABASE_URL"] = db_url
+    os.environ["YC_BENCH_EXPERIMENT"] = config_name
+
+    engine = build_engine(db_url)
+    init_db(engine)
+    factory = build_session_factory(engine)
+
+    with session_scope(factory) as db:
+        start_dt = datetime(2025, 1, 1, 9, 0, 0, tzinfo=timezone.utc)
+        horizon_end = start_dt.replace(year=start_dt.year + cfg.sim.horizon_years)
+
+        req = SeedWorldRequest(
+            run_seed=seed,
+            company_name=bot_slug.replace("_", " ").title(),
+            horizon_years=cfg.sim.horizon_years,
+            employee_count=world_cfg.num_employees,
+            market_task_count=world_cfg.num_market_tasks,
+            start_date=start_dt,
+        )
+        result = seed_world_transactional(db, req)
+        company_id = result.company_id
+
+        insert_event(
+            db=db,
+            company_id=company_id,
+            event_type=EventType.HORIZON_END,
+            scheduled_at=horizon_end,
+            payload={"reason": "horizon_end"},
+            dedupe_key="horizon_end",
+        )
+
+        sim_state = SimState(
+            company_id=company_id,
+            sim_time=start_dt,
+            run_seed=seed,
+            horizon_end=horizon_end,
+            replenish_counter=0,
+        )
+        db.add(sim_state)
+        db.flush()
+
+    tasks_completed = 0
+    tasks_failed = 0
+    task_cycles_used = 0
+    turn = 0
+
+    while True:
+        turn += 1
+
+        with session_scope(factory) as db:
+            sim_state = db.query(SimState).first()  # the only SimState row added to this run's DB
+            company = db.query(Company).filter(Company.id == company_id).one()
+
+            if company.funds_cents < 0:
+                break
+            if sim_state.sim_time >= sim_state.horizon_end:
+                break
+
+            active_tasks = db.query(Task).filter(
+                Task.company_id == company_id,
+                Task.status == TaskStatus.ACTIVE,
+            ).all()
+
+            if active_tasks:
+                next_event = fetch_next_event(db, company_id, sim_state.horizon_end)
+                if next_event is None:
+                    break
+                adv = advance_time(db, company_id, next_event.scheduled_at)
+                for we in adv.wake_events:
+                    if we.get("type") == "task_completed":
+                        if we.get("success"):
+                            tasks_completed += 1
+                        else:
+                            tasks_failed += 1
+                if adv.bankrupt or adv.horizon_reached:
+                    break
+                continue
+
+            # No active task — if we've used up our task budget, just
+            # advance time (pay salaries, bleed cash) like an LLM that
+            # hit max_turns would.
+            if task_cycles_used >= MAX_TASK_CYCLES:
+                if _advance_when_idle(db, company_id, sim_state):
+                    break
+                continue
+
+            # Get employees and build candidates
+            employees = db.query(Employee).filter(Employee.company_id == company_id).all()
+            emp_skills = []
+            for emp in employees:
+                skills = db.query(EmployeeSkillRate).filter(
+                    EmployeeSkillRate.employee_id == emp.id
+                ).all()
+                skill_map = {s.domain: Decimal(s.rate_domain_per_hour) for s in skills}
+                emp_skills.append({"id": emp.id, "skills": skill_map})
+
+            candidates, max_prestige = _build_candidates(db, company_id, sim_state, world_cfg, emp_skills)
+            completable = [c for c in candidates if c.is_completable]
+
+            context = {"seed": seed, "turn": turn, "max_prestige": max_prestige}
+            chosen = strategy_fn(completable, context)
+
+            if chosen is None:  # the strategy picked nothing: wait for the next event
+                if _advance_when_idle(db, company_id, sim_state):
+                    break
+                continue
+
+            best_task = chosen.task
+
+            # Accept the task
+            reqs = db.query(TaskRequirement).filter(
+                TaskRequirement.task_id == best_task.id
+            ).all()
+            total_qty = sum(float(r.required_qty) for r in reqs)
+
+            best_task.status = TaskStatus.PLANNED
+            best_task.company_id = company_id
+            best_task.accepted_at = sim_state.sim_time
+            best_task.deadline = _compute_deadline(sim_state.sim_time, total_qty, world_cfg)
+
+            # Generate replacement
+            counter = sim_state.replenish_counter
+            sim_state.replenish_counter = counter + 1
+            replacement = generate_replacement_task(
+                run_seed=sim_state.run_seed,
+                replenish_counter=counter,
+                cfg=world_cfg,
+            )
+            replacement_row = Task(
+                id=uuid4(),
+                company_id=None,
+                status=TaskStatus.MARKET,
+                title=replacement.title,
+                description=replacement.description,
+                required_prestige=replacement.required_prestige,
+                reward_funds_cents=replacement.reward_funds_cents,
+                reward_prestige_delta=replacement.reward_prestige_delta,
+                skill_boost_pct=replacement.skill_boost_pct,
+                accepted_at=None, deadline=None, completed_at=None,
+                success=None, halfway_event_emitted=False,
+            )
+            db.add(replacement_row)
+            for domain, qty in replacement.requirements.items():
+                db.add(TaskRequirement(
+                    task_id=replacement_row.id,
+                    domain=domain,
+                    required_qty=qty,
+                    completed_qty=0,
+                ))
+
+            # Assign ALL employees
+            for e in emp_skills:
+                db.add(TaskAssignment(
+                    task_id=best_task.id,
+                    employee_id=e["id"],
+                    assigned_at=sim_state.sim_time,
+                ))
+            db.flush()
+
+            best_task.status = TaskStatus.ACTIVE
+            db.flush()
+
+            recalculate_etas(db, company_id, sim_state.sim_time,
+                             impacted_task_ids={best_task.id},
+                             half_threshold=world_cfg.task_half_threshold)
+
+            task_cycles_used += 1
+
+    # Final state
+    with session_scope(factory) as db:
+        company = db.query(Company).filter(Company.id == company_id).one()
+
+        final_balance = company.funds_cents
+        bankrupt = final_balance < 0
+
+        prestige_rows = db.query(CompanyPrestige).filter(
+            CompanyPrestige.company_id == company_id
+        ).all()
+        max_p = max((float(p.prestige_level) for p in prestige_rows), default=1.0)
+
+    return {
+        "config": config_name,
+        "seed": seed,
+        "bot": bot_slug,
+        "turns": turn,
+        "final_balance_cents": final_balance,
+        "bankrupt": bankrupt,
+        "tasks_completed": tasks_completed,
+        "tasks_failed": tasks_failed,
+        "max_prestige": max_p,
+    }
+
+
+def _advance_when_idle(db, company_id, sim_state):
+    """Advance to the next event while no task runs; return True when the run should stop."""
+    next_event = fetch_next_event(db, company_id, sim_state.horizon_end)
+    if next_event is None:
+        advance_time(db, company_id, sim_state.horizon_end)  # nothing scheduled: jump to the horizon
+        return True
+    adv = advance_time(db, company_id, next_event.scheduled_at)
+    return bool(adv.bankrupt or adv.horizon_reached)
+
+
+def main():
+    """Parse the CLI filters, run every selected (bot, config, seed) combination, print a summary."""
+    parser = argparse.ArgumentParser(description="Run YC-Bench bot strategies")
+    parser.add_argument("--bot", choices=list(STRATEGIES.keys()), default=None,
+                        help="Run only this bot (default: all)")
+    parser.add_argument("--config", choices=CONFIGS, default=None,
+                        help="Run only this config (default: all)")
+    parser.add_argument("--seed", type=int, default=None,
+                        help="Run only this seed (default: all)")
+    args = parser.parse_args()
+
+    bots = [args.bot] if args.bot else list(STRATEGIES.keys())
+    configs = [args.config] if args.config else CONFIGS
+    # Compare against None: a plain truthiness test would treat `--seed 0`
+    # as "not given" and silently run all default seeds instead.
+    seeds = SEEDS if args.seed is None else [args.seed]
+
+    results = []
+    total = len(bots) * len(configs) * len(seeds)
+    print(f"Running {total} bot simulations...\n")
+
+    for bot_name in bots:
+        slug, strategy_fn = STRATEGIES[bot_name]
+        for config_name in configs:
+            for seed in seeds:
+                print(f" {slug} | {config_name} seed={seed} ...", end=" ", flush=True)
+                r = run_bot(config_name, seed, slug, strategy_fn)
+                results.append(r)
+
+                tag = "BANKRUPT" if r["bankrupt"] else f"${r['final_balance_cents']/100:,.0f}"
+                print(f"{tag} | {r['tasks_completed']} OK, {r['tasks_failed']} fail | prestige {r['max_prestige']:.1f} | {r['turns']} turns")
+
+    print(f"\n{'Bot':<16} {'Config':<12} {'Seed':<5} {'Final Balance':>14} {'OK':>4} {'Fail':>5} {'Prestige':>9}")
+    print("-" * 70)
+    for r in results:
+        fb = "BANKRUPT" if r["bankrupt"] else f"${r['final_balance_cents']/100:,.0f}"
+        print(f"{r['bot']:<16} {r['config']:<12} {r['seed']:<5} {fb:>14} {r['tasks_completed']:>4} {r['tasks_failed']:>5} {r['max_prestige']:>8.1f}")
+
+    bankrupt_count = sum(1 for r in results if r["bankrupt"])
+    print(f"\nBankruptcies: {bankrupt_count}/{len(results)}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/greedy_bot.py b/scripts/greedy_bot.py
new file mode 100644
index 0000000..cff343e
--- /dev/null
+++ b/scripts/greedy_bot.py
@@ -0,0 +1,48 @@
+"""Greedy bot shim — delegates to bot_runner.py.
+
+Usage:
+ uv run python scripts/greedy_bot.py
+"""
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
+sys.path.insert(0, str(Path(__file__).parent))
+
+from bot_runner import CONFIGS, SEEDS, STRATEGIES, run_bot
+
+
+def main():
+    """Run the greedy strategy on every (config, seed) pair and print a results table."""
+    slug, strategy_fn = STRATEGIES["greedy"]
+    print("Running greedy bot across all configs and seeds...\n")
+    results = []
+
+    def balance_tag(r):
+        # One formatter for the progress line and the table keeps them in sync;
+        # balances are formatted the same at every magnitude (no >= $1M special case).
+        return "BANKRUPT" if r["bankrupt"] else f"${r['final_balance_cents']/100:,.0f}"
+
+    for config_name in CONFIGS:
+        for seed in SEEDS:
+            print(f" {config_name} seed={seed} ...", end=" ", flush=True)
+            r = run_bot(config_name, seed, slug, strategy_fn)
+            results.append(r)
+
+            tag = balance_tag(r)
+            print(f"{tag} | {r['tasks_completed']} OK, {r['tasks_failed']} fail | prestige {r['max_prestige']:.1f} | {r['turns']} turns")
+
+    print(f"\n{'Config':<12} {'Seed':<5} {'Final Balance':>14} {'OK':>4} {'Fail':>5} {'Prestige':>9}")
+    print("-" * 55)
+    for r in results:
+        fb = balance_tag(r)
+        print(f"{r['config']:<12} {r['seed']:<5} {fb:>14} {r['tasks_completed']:>4} {r['tasks_failed']:>5} {r['max_prestige']:>8.1f}")
+
+    bankrupt_count = sum(1 for r in results if r["bankrupt"])
+    print(f"\nBankruptcies: {bankrupt_count}/{len(results)}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/plot_comparison.py b/scripts/plot_comparison.py
index e0825a1..79f0c46 100644
--- a/scripts/plot_comparison.py
+++ b/scripts/plot_comparison.py
@@ -1,4 +1,4 @@
-"""Sonnet 4.6 vs Gemini 3 Flash vs GPT-5.2 — apples-to-apples comparison plot."""
+"""YC-Bench comparison plot — Collinear AI branding."""
import sqlite3
from pathlib import Path
from datetime import datetime
@@ -8,31 +8,69 @@ matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as mticker
+import numpy as np
ROOT = Path(__file__).parent.parent
INITIAL_FUNDS_CENTS = 25_000_000
+# ── Collinear brand palette ──────────────────────────────────────────────────
+NAVY = "#13234D"
+ORANGE = "#F26125"
+BLUE = "#4D65FF"
+BG_COLOR = "#FAFBFD"
+GRID_CLR = "#E8ECF2"
+TEXT_CLR = "#2A2F3D"
+MUTED = "#6B7694"
+CARD_BG = "#FFFFFF"
+
MODELS = {
"sonnet": {
"slug": "anthropic_claude-sonnet-4-6",
"label": "Sonnet 4.6",
- "color": "#2563eb",
+ "color": BLUE,
},
"gemini": {
"slug": "gemini_gemini-3-flash-preview",
"label": "Gemini 3 Flash",
- "color": "#f97316",
+ "color": ORANGE,
},
"gpt52": {
"slug": "openai_gpt-5.2",
"label": "GPT-5.2",
- "color": "#16a34a",
+ "color": "#22C55E",
+ },
+ "greedy": {
+ "slug": "greedy_bot",
+ "label": "Greedy Bot",
+ "color": NAVY,
},
}
+BOT_KEYS = {"greedy"}
+
CONFIGS = ["medium", "hard", "nightmare"]
SEEDS = [1, 2, 3]
+DIFF_COLORS = {"medium": BLUE, "hard": ORANGE, "nightmare": "#DC2626"}
+
+
+def load_logo_image(height_px=80):
+    """Render the wordmark SVG to an RGBA PIL image (output_height=height_px), or None if the SVG is missing."""
+    svg_path = ROOT / "plots" / "collinear_wordmark.svg"
+    if not svg_path.exists():
+        return None  # checked first so a missing logo never needs cairosvg/Pillow
+    import ctypes.util
+    import io
+    import os
+    # Ensure homebrew cairo is findable
+    if ctypes.util.find_library("cairo") is None and Path("/opt/homebrew/lib").exists():
+        os.environ.setdefault("DYLD_LIBRARY_PATH", "/opt/homebrew/lib")
+    import cairosvg  # imported only after the library-path tweak above
+    from PIL import Image
+
+    png_data = cairosvg.svg2png(url=str(svg_path), output_height=height_px)
+    return Image.open(io.BytesIO(png_data)).convert("RGBA")
+
def load_funds_curve(db_path):
con = sqlite3.connect(str(db_path))
@@ -42,7 +80,6 @@ def load_funds_curve(db_path):
con.close()
if not rows:
return [], []
-
times, balances = [], []
running = INITIAL_FUNDS_CENTS
start = datetime.fromisoformat(rows[0][0]).replace(
@@ -50,16 +87,13 @@ def load_funds_curve(db_path):
)
times.append(start)
balances.append(running / 100)
-
for occurred_at, amount_cents in rows:
running += int(amount_cents)
t = datetime.fromisoformat(occurred_at)
- # Cap at end of year 1 for apples-to-apples
if t.year > 2025:
break
times.append(t)
balances.append(running / 100)
-
return times, balances
@@ -74,13 +108,10 @@ def load_all():
times, balances = load_funds_curve(db_path)
bankrupt = len(balances) > 1 and balances[-1] <= 0
runs.append({
- "config": config,
- "seed": seed,
- "model_key": key,
- "label": model["label"],
+ "config": config, "seed": seed,
+ "model_key": key, "label": model["label"],
"color": model["color"],
- "times": times,
- "balances": balances,
+ "times": times, "balances": balances,
"bankrupt": bankrupt,
"final": balances[-1] if balances else 0,
})
@@ -90,79 +121,197 @@ def load_all():
def make_plot(runs):
- fig, axes = plt.subplots(3, 3, figsize=(18, 14), facecolor="white")
- fig.suptitle(
- "Sonnet 4.6 vs Gemini 3 Flash vs GPT-5.2 · YC-Bench · 1-Year Horizon",
- fontsize=16, fontweight="600", y=0.98, color="#1a1a1a",
+ fig, axes = plt.subplots(3, 3, figsize=(30, 22), facecolor=BG_COLOR)
+
+ # ── Header band (drawn as a filled Rectangle patch on the figure) ────
+ from matplotlib.patches import FancyBboxPatch
+ header_rect = plt.Rectangle((0, 0.90), 1, 0.10,
+ transform=fig.transFigure, facecolor=NAVY,
+ edgecolor="none", zorder=0)
+ fig.patches.append(header_rect)
+ # Orange accent line under header
+ accent_rect = plt.Rectangle((0, 0.895), 1, 0.006,
+ transform=fig.transFigure, facecolor=ORANGE,
+ edgecolor="none", zorder=1)
+ fig.patches.append(accent_rect)
+
+ fig.text(
+ 0.5, 0.955,
+ "YC-Bench | 1-Year Horizon",
+ ha="center", va="center",
+ fontsize=50, fontweight="700", color="white",
+ fontfamily="Helvetica Neue", zorder=2,
)
+ # ── Common legend in header ─────────────────────────────────────────
+ legend_items = [
+ ("Sonnet 4.6", BLUE, "-", 4.0, 0.95),
+ ("Gemini 3 Flash", ORANGE, "-", 4.0, 0.95),
+ ("GPT-5.2", "#22C55E", "-", 4.0, 0.95),
+ ("Greedy Bot", NAVY, "--", 3.5, 0.75),
+ ]
+ legend_handles = []
+ for lbl, clr, ls, lw, alpha in legend_items:
+ line = plt.Line2D([0], [0], color=clr, linewidth=lw, linestyle=ls,
+ alpha=alpha)
+ legend_handles.append(line)
+ legend_labels = [item[0] for item in legend_items]
+ fig.legend(
+ legend_handles, legend_labels,
+ loc="center", bbox_to_anchor=(0.53, 0.855),
+ ncol=4, fontsize=22, frameon=False,
+ labelcolor=TEXT_CLR, handlelength=3.5, handletextpad=1.0,
+ columnspacing=3.0,
+ )
+
+ # Pre-render logo from SVG at high res (will composite after savefig)
+ logo_img = load_logo_image(height_px=120)
for row, config in enumerate(CONFIGS):
for col, seed in enumerate(SEEDS):
ax = axes[row][col]
- ax.set_facecolor("white")
- for spine in ax.spines.values():
- spine.set_edgecolor("#d0d0d0")
- spine.set_linewidth(0.7)
+ ax.set_facecolor(CARD_BG)
- # Bankruptcy line
- ax.axhline(0, color="#ef4444", linewidth=0.8, linestyle="--", alpha=0.4)
- ax.axhline(250_000, color="#9ca3af", linewidth=0.5, linestyle=":", alpha=0.4)
+ for spine in ax.spines.values():
+ spine.set_edgecolor(GRID_CLR)
+ spine.set_linewidth(1.2)
+
+ # Log scale on y-axis
+ ax.set_yscale("log")
+
+ # Reference lines
+ ax.axhline(250_000, color=MUTED, linewidth=0.8, linestyle=":", alpha=0.3, zorder=1)
cell_runs = [r for r in runs if r["config"] == config and r["seed"] == seed]
+ # Sort: bots first (background), then survivors desc, then bankrupt
+ def sort_key(r):
+ if r["model_key"] in BOT_KEYS: return (0, 0)
+ if not r["bankrupt"]: return (1, -r["final"])
+ return (2, 0)
+ cell_runs.sort(key=sort_key)
+
for r in cell_runs:
if not r["times"]:
continue
- alpha = 0.35 if r["bankrupt"] else 1.0
- lw = 1.0 if r["bankrupt"] else 2.0
+ is_bot = r["model_key"] in BOT_KEYS
+ if r["bankrupt"]:
+ alpha, lw, ls = 0.4, 2.0, "-" if not is_bot else "--"
+ elif is_bot:
+ alpha, lw, ls = 0.75, 3.5, "--"
+ else:
+ alpha, lw, ls = 0.95, 3.0, "-"
+
+ val = r["final"]
if r["bankrupt"]:
lbl = f"{r['label']} — bankrupt"
+ elif val >= 1e6:
+ lbl = f"{r['label']} — ${val/1e6:.1f}M"
else:
- val = r["final"]
- lbl = f"{r['label']} — ${val/1e6:.1f}M" if val >= 1e6 else f"{r['label']} — ${val/1e3:.0f}K"
+ lbl = f"{r['label']} — ${val/1e3:.0f}K"
- ax.plot(r["times"], r["balances"], color=r["color"],
- linewidth=lw, alpha=alpha, label=lbl, zorder=3)
+ # Clamp balances for log scale (floor at $1K)
+ plot_bals = [max(b, 1_000) for b in r["balances"]]
+
+ ax.plot(
+ r["times"], plot_bals,
+ color=r["color"], linewidth=lw, alpha=alpha,
+ label=lbl, linestyle=ls,
+ zorder=2 if is_bot else 3,
+ )
if r["bankrupt"]:
- ax.scatter([r["times"][-1]], [r["balances"][-1]],
- color=r["color"], marker="x", s=50, linewidths=1.5, alpha=0.5, zorder=5)
- else:
- ax.scatter([r["times"][-1]], [r["balances"][-1]],
- color=r["color"], marker="*", s=100, zorder=5)
+ ax.scatter(
+ [r["times"][-1]], [max(r["balances"][-1], 1_000)],
+ color=r["color"], marker="X", s=120,
+ linewidths=2, alpha=0.6, zorder=5,
+ edgecolors="white",
+ )
+ elif not is_bot:
+ ax.scatter(
+ [r["times"][-1]], [r["balances"][-1]],
+ color=r["color"], marker="o", s=100, zorder=5,
+ edgecolors="white", linewidths=2.5,
+ )
- # Title
- if row == 0:
- ax.set_title(f"Seed {seed}", fontsize=11, fontweight="500", color="#374151", pad=8)
+ # No per-axis column title (seed labels placed via fig.text below)
# Row label
if col == 0:
- ax.set_ylabel(f"{config.upper()}\n\nFunds", fontsize=10, color="#374151", fontweight="600")
+ ax.set_ylabel("Funds ($)", fontsize=20, color=MUTED, fontweight="400", labelpad=10)
+ ax.annotate(
+ config.upper(),
+ xy=(-0.22, 0.5), xycoords="axes fraction",
+ fontsize=23, fontweight="800",
+ color=DIFF_COLORS[config],
+ ha="center", va="center", rotation=90,
+ )
- # Formatting
+ # Axes formatting
ax.xaxis.set_major_formatter(mdates.DateFormatter("%b"))
- ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
- ax.tick_params(colors="#666", labelsize=7)
- ax.grid(axis="y", color="#f0f0f0", linewidth=0.5)
+ ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
+ ax.tick_params(colors=MUTED, labelsize=18, length=5, width=0.8, pad=6)
+ ax.grid(axis="y", color=GRID_CLR, linewidth=0.7, alpha=0.8)
+ ax.grid(axis="x", color=GRID_CLR, linewidth=0.4, alpha=0.4)
ax.yaxis.set_major_formatter(
mticker.FuncFormatter(
- lambda x, _: f"${x/1e6:.0f}M" if abs(x) >= 1e6
- else f"${x/1e3:.0f}K" if abs(x) >= 1e3
+ lambda x, _: f"${x/1e6:.0f}M" if x >= 1e6
+ else f"${x/1e3:.0f}K" if x >= 1e3
else f"${x:.0f}"
)
)
+ ax.yaxis.set_minor_formatter(mticker.NullFormatter())
- legend = ax.legend(fontsize=7, loc="upper left", frameon=True,
- facecolor="white", edgecolor="#e5e7eb", framealpha=0.9)
- for text in legend.get_texts():
- text.set_color("#374151")
+ # No per-cell legend (common legend in header)
+
+ plt.subplots_adjust(
+ left=0.08, right=0.98, top=0.79, bottom=0.05,
+ hspace=0.30, wspace=0.22,
+ )
+
+ # Seed column headers just above the plot grid
+ col_centers = [0.08 + (0.98 - 0.08) * (i + 0.5) / 3 for i in range(3)]
+ for i, seed in enumerate(SEEDS):
+ fig.text(
+ col_centers[i], 0.80,
+ f"Seed {seed}",
+ ha="center", va="bottom",
+ fontsize=26, fontweight="600", color=TEXT_CLR,
+ )
+
+ # Footer
+ fig.text(
+ 0.5, 0.01,
+ "collinear.ai | YC-Bench: Long-Horizon Deterministic Benchmark for LLM Agents",
+ ha="center", va="bottom",
+ fontsize=18, fontweight="400", color=MUTED,
+ fontstyle="italic",
+ )
- plt.tight_layout(rect=[0, 0, 1, 0.95])
out = ROOT / "plots" / "sonnet_vs_gemini.png"
out.parent.mkdir(parents=True, exist_ok=True)
- plt.savefig(out, dpi=180, bbox_inches="tight", facecolor="white")
+ dpi = 150
+ plt.savefig(out, dpi=dpi, facecolor=BG_COLOR, pad_inches=0)
+
+ # Composite SVG logo onto the navy header band
+ if logo_img is not None:
+ from PIL import Image
+ plot_img = Image.open(out).convert("RGBA")
+ img_w, img_h = plot_img.size
+ # Header band is top 10% of image (no pad_inches)
+ header_top = 0
+ header_h = int(img_h * 0.10)
+ # Scale logo to ~65% of header height
+ target_h = int(header_h * 0.65)
+ scale = target_h / logo_img.size[1]
+ logo = logo_img.resize((int(logo_img.size[0] * scale), target_h), Image.LANCZOS)
+ # Center vertically in the navy header band
+ y_offset = header_top + (header_h - target_h) // 2
+ x_offset = 70
+ plot_img.paste(logo, (x_offset, y_offset), logo)
+ plot_img.save(out)
+
print(f"\nSaved: {out}")