diff --git a/plots/collinear_logo.svg b/plots/collinear_logo.svg new file mode 100644 index 0000000..76cd7d0 --- /dev/null +++ b/plots/collinear_logo.svg @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/plots/collinear_wordmark.svg b/plots/collinear_wordmark.svg new file mode 100644 index 0000000..951fb1e --- /dev/null +++ b/plots/collinear_wordmark.svg @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/plots/funds_curves.png b/plots/funds_curves.png deleted file mode 100644 index a85b93e..0000000 Binary files a/plots/funds_curves.png and /dev/null differ diff --git a/plots/notepad_unknown_?_gemini-3-flash-preview.gif b/plots/notepad_unknown_?_gemini-3-flash-preview.gif deleted file mode 100644 index b51f50f..0000000 Binary files a/plots/notepad_unknown_?_gemini-3-flash-preview.gif and /dev/null differ diff --git a/plots/notepad_unknown_?_glm-5.gif b/plots/notepad_unknown_?_glm-5.gif deleted file mode 100644 index 11bd60d..0000000 Binary files a/plots/notepad_unknown_?_glm-5.gif and /dev/null differ diff --git a/plots/notepad_unknown_?_grok-4.1-fast.gif b/plots/notepad_unknown_?_grok-4.1-fast.gif deleted file mode 100644 index e5f9c60..0000000 Binary files a/plots/notepad_unknown_?_grok-4.1-fast.gif and /dev/null differ diff --git a/plots/sonnet_results.png b/plots/sonnet_results.png deleted file mode 100644 index eb59e8f..0000000 Binary files a/plots/sonnet_results.png and /dev/null differ diff --git a/plots/sonnet_vs_gemini.png b/plots/sonnet_vs_gemini.png index f14b749..c87f717 100644 Binary files a/plots/sonnet_vs_gemini.png and b/plots/sonnet_vs_gemini.png differ diff --git a/scripts/bot_runner.py b/scripts/bot_runner.py new file mode 100644 index 0000000..959b645 --- /dev/null +++ b/scripts/bot_runner.py @@ -0,0 +1,451 @@ +"""Bot runner: plays YC-Bench using direct DB access with pluggable strategies. 
+ +Strategies: + greedy — pick highest reward among completable tasks + random — pick randomly among completable tasks (deterministic via RngStreams) + throughput — pick highest reward/hour among completable tasks + prestige — phase 1: climb prestige fast, phase 2: throughput + +Usage: + uv run python scripts/bot_runner.py # all bots, all configs, all seeds + uv run python scripts/bot_runner.py --bot greedy # just greedy + uv run python scripts/bot_runner.py --bot random --seed 1 --config medium +""" +from __future__ import annotations + +import argparse +import os +import sys +from dataclasses import dataclass +from datetime import datetime, timezone +from decimal import Decimal +from pathlib import Path +from typing import Callable, Optional +from uuid import uuid4 + +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + +from yc_bench.config import load_config +from yc_bench.core.business_time import add_business_hours +from yc_bench.core.engine import advance_time +from yc_bench.core.eta import recalculate_etas +from yc_bench.core.events import fetch_next_event, insert_event +from yc_bench.db.models.company import Company, CompanyPrestige +from yc_bench.db.models.employee import Employee, EmployeeSkillRate +from yc_bench.db.models.event import EventType +from yc_bench.db.models.sim_state import SimState +from yc_bench.db.models.task import Task, TaskAssignment, TaskRequirement, TaskStatus +from yc_bench.db.session import build_engine, build_session_factory, init_db, session_scope +from yc_bench.services.generate_tasks import generate_replacement_task +from yc_bench.services.rng import RngStreams +from yc_bench.services.seed_world import SeedWorldRequest, seed_world_transactional + +CONFIGS = ["medium", "hard", "nightmare"] +SEEDS = [1, 2, 3] + +# Cap task cycles to match LLM throughput. An LLM gets 500 turns and needs +# ~5 turns per task cycle (browse + accept + 5× assign + dispatch + resume), +# so it can complete at most ~100 tasks. 
# The sim still runs to horizon — once the budget is exhausted the bot just
# advances time (paying salaries, bleeding cash) exactly like an LLM that hit
# max_turns.
MAX_TASK_CYCLES = 100


@dataclass
class CandidateTask:
    """A market task annotated with the figures the strategies rank on."""

    task: object  # ORM Task row
    reward_cents: int  # copied from Task.reward_funds_cents
    prestige_delta: float  # float(Task.reward_prestige_delta)
    # Estimated hours with every employee assigned. Decimal("999999") is the
    # placeholder stored when no estimate exists (a required domain has no
    # capacity); such candidates always have is_completable == False.
    completion_hours: Decimal
    is_completable: bool  # estimate finishes on or before the computed deadline


def estimate_completion_hours(task_reqs, employee_skills, n_concurrent_tasks=1):
    """Estimate hours to complete a task with all employees assigned.

    Args:
        task_reqs: sequence of dicts with "domain" and "required_qty" keys.
        employee_skills: one ``{domain: Decimal rate per hour}`` mapping per
            employee; a domain missing from a mapping counts as rate zero.
        n_concurrent_tasks: every employee's rate is divided by this number.

    Returns:
        The largest ``required_qty / combined_rate`` over the requirements
        (``Decimal("0")`` when there are none), or ``None`` as soon as a
        required domain has no positive combined rate.
    """
    zero = Decimal("0")
    share = Decimal(n_concurrent_tasks)  # loop-invariant, so built once
    rate_by_domain = {}
    longest = zero
    for req in task_reqs:
        domain = req["domain"]
        if domain not in rate_by_domain:
            # Each term is divided before summing, so the Decimal rounding is
            # the same as accumulating employee by employee.
            rate_by_domain[domain] = sum(
                (emp.get(domain, zero) / share for emp in employee_skills),
                zero,
            )
        rate = rate_by_domain[domain]
        if rate <= 0:
            return None
        # The slowest domain decides when the whole task is finished.
        longest = max(longest, Decimal(str(req["required_qty"])) / rate)
    return longest


def _compute_deadline(accepted_at, total_required_qty, cfg):
    """Return the deadline for a task accepted at ``accepted_at``.

    The allowance is ``max(cfg.deadline_min_biz_days,
    int(total_required_qty / cfg.deadline_qty_per_day))`` business days, each
    ``cfg.workday_end_hour - cfg.workday_start_hour`` hours long, added to
    ``accepted_at`` with ``add_business_hours``.
    """
    hours_per_day = cfg.workday_end_hour - cfg.workday_start_hour
    biz_days = max(
        cfg.deadline_min_biz_days,
        int(total_required_qty / cfg.deadline_qty_per_day),
    )
    return add_business_hours(
        accepted_at, Decimal(str(biz_days)) * Decimal(str(hours_per_day))
    )


def _build_candidates(db, company_id, sim_state, world_cfg, emp_skills):
    """Build the CandidateTask list for all market tasks the company can see.

    Args:
        db: open ORM session.
        company_id: company whose prestige decides which tasks are visible.
        sim_state: row providing ``sim_time``, used as the acceptance time.
        world_cfg: world configuration passed on to ``_compute_deadline``.
        emp_skills: list of ``{"id": ..., "skills": {domain: rate}}`` dicts.

    Returns:
        ``(candidates, max_prestige)`` where ``max_prestige`` is the highest
        ``prestige_level`` among the company's rows (1.0 when there are none).
    """
    prestige_rows = db.query(CompanyPrestige).filter(
        CompanyPrestige.company_id == company_id
    ).all()
    max_prestige = max((float(p.prestige_level) for p in prestige_rows), default=1.0)

    market_tasks = db.query(Task).filter(
        Task.status == TaskStatus.MARKET,
        Task.required_prestige <= int(max_prestige),
    ).order_by(Task.reward_funds_cents.desc()).all()

    all_skills = [dict(e["skills"]) for e in emp_skills]
    now = sim_state.sim_time

    candidates = []
    for task in market_tasks:
        reqs = db.query(TaskRequirement).filter(
            TaskRequirement.task_id == task.id
        ).all()
        total_qty = sum(float(r.required_qty) for r in reqs)
        task_reqs = [
            {"domain": r.domain, "required_qty": float(r.required_qty)} for r in reqs
        ]

        completion_hours = estimate_completion_hours(
            task_reqs, all_skills, n_concurrent_tasks=1
        )
        if completion_hours is None:
            # No capacity in some domain: keep the row but mark it unusable.
            is_completable = False
            completion_hours = Decimal("999999")
        else:
            deadline = _compute_deadline(now, total_qty, world_cfg)
            is_completable = add_business_hours(now, completion_hours) <= deadline

        candidates.append(CandidateTask(
            task=task,
            reward_cents=task.reward_funds_cents,
            prestige_delta=float(task.reward_prestige_delta),
            completion_hours=completion_hours,
            is_completable=is_completable,
        ))

    return candidates, max_prestige


# ── Strategy functions ──────────────────────────────────────────────────────

StrategyFn = Callable  # (completable: list[CandidateTask], context: dict) -> Optional[CandidateTask]


def _reward_per_hour(candidate):
    """Return reward cents per estimated hour, tolerating a zero-hour estimate."""
    reward = Decimal(candidate.reward_cents)
    hours = candidate.completion_hours
    if hours <= 0:
        # estimate_completion_hours returns Decimal("0") for a task with no
        # outstanding quantity; dividing by it raises instead of ranking.
        return Decimal("Infinity") if reward > 0 else Decimal("0")
    return reward / hours


def strategy_greedy(completable: list[CandidateTask], context: dict) -> Optional[CandidateTask]:
    """Pick the task with the highest reward, or None when nothing is completable."""
    if not completable:
        return None
    return max(completable, key=lambda c: c.reward_cents)


def strategy_random(completable: list[CandidateTask], context: dict) -> Optional[CandidateTask]:
    """Pick a random completable task (deterministic via seeded RNG).

    Reads ``context["seed"]`` and ``context["turn"]``; a stream named after
    the turn is drawn from ``RngStreams(seed)`` for each call.
    """
    if not completable:
        return None
    seed = context["seed"]
    turn = context["turn"]
    rng = RngStreams(seed).stream(f"bot_random_select:{turn}")
    return rng.choice(completable)


def strategy_throughput(completable: list[CandidateTask], context: dict) -> Optional[CandidateTask]:
    """Pick the task with the highest reward per estimated hour."""
    if not completable:
        return None
    return max(completable, key=_reward_per_hour)
def _per_hour(value, hours):
    """Return ``value / hours``, ranking a zero-hour estimate instead of raising."""
    if hours <= 0:
        # A task with no outstanding quantity is estimated at Decimal("0")
        # hours; a plain division would raise a decimal exception.
        return Decimal("Infinity") if value > 0 else Decimal("0")
    return value / hours


def strategy_prestige(completable: list[CandidateTask], context: dict) -> Optional[CandidateTask]:
    """Phase 1 (prestige < 5): climb prestige fastest. Phase 2: throughput.

    Reads ``context["max_prestige"]``. While it is below 5, the task with the
    best prestige delta per estimated hour wins, provided at least one task
    offers a positive delta; otherwise the best reward per hour wins.
    """
    if not completable:
        return None
    if context["max_prestige"] < 5:
        prestige_tasks = [c for c in completable if c.prestige_delta > 0]
        if prestige_tasks:
            return max(
                prestige_tasks,
                key=lambda c: _per_hour(Decimal(str(c.prestige_delta)), c.completion_hours),
            )
    # Fall back to throughput
    return max(
        completable,
        key=lambda c: _per_hour(Decimal(c.reward_cents), c.completion_hours),
    )


# CLI name -> (slug used for the company name and DB file, strategy function)
STRATEGIES = {
    "greedy": ("greedy_bot", strategy_greedy),
    "random": ("random_bot", strategy_random),
    "throughput": ("throughput_bot", strategy_throughput),
    "prestige": ("prestige_bot", strategy_prestige),
}


# ── Shared simulation runner ───────────────────────────────────────────────

def _load_employee_skills(db, company_id):
    """Return ``[{"id": employee_id, "skills": {domain: Decimal rate}}]`` for the company."""
    employees = db.query(Employee).filter(Employee.company_id == company_id).all()
    emp_skills = []
    for emp in employees:
        skills = db.query(EmployeeSkillRate).filter(
            EmployeeSkillRate.employee_id == emp.id
        ).all()
        emp_skills.append({
            "id": emp.id,
            "skills": {s.domain: Decimal(s.rate_domain_per_hour) for s in skills},
        })
    return emp_skills


def _idle_until_next_event(db, company_id, horizon_end):
    """Advance time while no task is running; return True when the run is over.

    With no pending event the clock is moved straight to ``horizon_end``.
    Otherwise it is moved to the next event, and the run is over if that
    advance reports bankruptcy or the horizon.
    """
    next_event = fetch_next_event(db, company_id, horizon_end)
    if next_event is None:
        advance_time(db, company_id, horizon_end)
        return True
    adv = advance_time(db, company_id, next_event.scheduled_at)
    return bool(adv.bankrupt or adv.horizon_reached)


def _accept_and_dispatch(db, company_id, sim_state, world_cfg, task, employee_ids):
    """Accept ``task``, add its market replacement, assign everyone and start it."""
    reqs = db.query(TaskRequirement).filter(
        TaskRequirement.task_id == task.id
    ).all()
    total_qty = sum(float(r.required_qty) for r in reqs)

    # Accept the task
    task.status = TaskStatus.PLANNED
    task.company_id = company_id
    task.accepted_at = sim_state.sim_time
    task.deadline = _compute_deadline(sim_state.sim_time, total_qty, world_cfg)

    # Generate replacement: the counter value in use before the increment is
    # the one handed to the generator.
    counter = sim_state.replenish_counter
    sim_state.replenish_counter = counter + 1
    replacement = generate_replacement_task(
        run_seed=sim_state.run_seed,
        replenish_counter=counter,
        cfg=world_cfg,
    )
    replacement_row = Task(
        id=uuid4(),
        company_id=None,
        status=TaskStatus.MARKET,
        title=replacement.title,
        description=replacement.description,
        required_prestige=replacement.required_prestige,
        reward_funds_cents=replacement.reward_funds_cents,
        reward_prestige_delta=replacement.reward_prestige_delta,
        skill_boost_pct=replacement.skill_boost_pct,
        accepted_at=None, deadline=None, completed_at=None,
        success=None, halfway_event_emitted=False,
    )
    db.add(replacement_row)
    for domain, qty in replacement.requirements.items():
        db.add(TaskRequirement(
            task_id=replacement_row.id,
            domain=domain,
            required_qty=qty,
            completed_qty=0,
        ))

    # Assign ALL employees
    for employee_id in employee_ids:
        db.add(TaskAssignment(
            task_id=task.id,
            employee_id=employee_id,
            assigned_at=sim_state.sim_time,
        ))
    db.flush()

    task.status = TaskStatus.ACTIVE
    db.flush()

    recalculate_etas(db, company_id, sim_state.sim_time,
                     impacted_task_ids={task.id},
                     half_threshold=world_cfg.task_half_threshold)


def run_bot(config_name: str, seed: int, bot_slug: str, strategy_fn: StrategyFn):
    """Run a bot strategy on one (config, seed) pair. Returns result dict.

    A fresh SQLite file ``db/{config_name}_{seed}_{bot_slug}.db`` is created
    (any existing file is deleted first) and the ``DATABASE_URL`` and
    ``YC_BENCH_EXPERIMENT`` environment variables are overwritten. The world
    is seeded, then the loop runs one task at a time: while a task is active
    it only advances to the next event; otherwise it asks ``strategy_fn`` for
    a task, until ``MAX_TASK_CYCLES`` tasks have been dispatched, after which
    it only advances time.

    Args:
        config_name: configuration name passed to ``load_config``.
        seed: run seed for world generation and the strategy context.
        bot_slug: identifier used for the DB file name and the company name.
        strategy_fn: ``(completable, context) -> CandidateTask | None`` where
            ``context`` has the keys "seed", "turn" and "max_prestige".

    Returns:
        Dict with keys "config", "seed", "bot", "turns",
        "final_balance_cents", "bankrupt", "tasks_completed", "tasks_failed"
        and "max_prestige".
    """
    cfg = load_config(config_name)
    world_cfg = cfg.world

    db_dir = Path("db")
    db_dir.mkdir(exist_ok=True)
    db_path = db_dir / f"{config_name}_{seed}_{bot_slug}.db"

    if db_path.exists():
        db_path.unlink()

    db_url = f"sqlite:///{db_path}"
    os.environ["DATABASE_URL"] = db_url
    os.environ["YC_BENCH_EXPERIMENT"] = config_name

    engine = build_engine(db_url)
    init_db(engine)
    factory = build_session_factory(engine)

    with session_scope(factory) as db:
        start_dt = datetime(2025, 1, 1, 9, 0, 0, tzinfo=timezone.utc)
        horizon_end = start_dt.replace(year=start_dt.year + cfg.sim.horizon_years)

        req = SeedWorldRequest(
            run_seed=seed,
            company_name=bot_slug.replace("_", " ").title(),
            horizon_years=cfg.sim.horizon_years,
            employee_count=world_cfg.num_employees,
            market_task_count=world_cfg.num_market_tasks,
            start_date=start_dt,
        )
        result = seed_world_transactional(db, req)
        company_id = result.company_id

        insert_event(
            db=db,
            company_id=company_id,
            event_type=EventType.HORIZON_END,
            scheduled_at=horizon_end,
            payload={"reason": "horizon_end"},
            dedupe_key="horizon_end",
        )

        db.add(SimState(
            company_id=company_id,
            sim_time=start_dt,
            run_seed=seed,
            horizon_end=horizon_end,
            replenish_counter=0,
        ))
        db.flush()

    tasks_completed = 0
    tasks_failed = 0
    task_cycles_used = 0
    turn = 0

    while True:
        turn += 1

        with session_scope(factory) as db:
            sim_state = db.query(SimState).first()
            company = db.query(Company).filter(Company.id == company_id).one()

            if company.funds_cents < 0:
                break
            if sim_state.sim_time >= sim_state.horizon_end:
                break

            active_tasks = db.query(Task).filter(
                Task.company_id == company_id,
                Task.status == TaskStatus.ACTIVE,
            ).all()

            if active_tasks:
                next_event = fetch_next_event(db, company_id, sim_state.horizon_end)
                if next_event is None:
                    break
                adv = advance_time(db, company_id, next_event.scheduled_at)
                for we in adv.wake_events:
                    if we.get("type") == "task_completed":
                        if we.get("success"):
                            tasks_completed += 1
                        else:
                            tasks_failed += 1
                if adv.bankrupt or adv.horizon_reached:
                    break
                continue

            # No active task — if we've used up our task budget, just
            # advance time (pay salaries, bleed cash) like an LLM that
            # hit max_turns would.
            if task_cycles_used >= MAX_TASK_CYCLES:
                if _idle_until_next_event(db, company_id, sim_state.horizon_end):
                    break
                continue

            # Get employees and build candidates
            emp_skills = _load_employee_skills(db, company_id)
            candidates, max_prestige = _build_candidates(
                db, company_id, sim_state, world_cfg, emp_skills
            )
            completable = [c for c in candidates if c.is_completable]

            context = {
                "seed": seed,
                "turn": turn,
                "max_prestige": max_prestige,
            }
            chosen = strategy_fn(completable, context)

            if chosen is None:
                # Nothing worth taking right now: wait for the next event.
                if _idle_until_next_event(db, company_id, sim_state.horizon_end):
                    break
                continue

            _accept_and_dispatch(
                db, company_id, sim_state, world_cfg, chosen.task,
                [e["id"] for e in emp_skills],
            )
            task_cycles_used += 1

    # Final state
    with session_scope(factory) as db:
        company = db.query(Company).filter(Company.id == company_id).one()

        final_balance = company.funds_cents
        bankrupt = final_balance < 0

        prestige_rows = db.query(CompanyPrestige).filter(
            CompanyPrestige.company_id == company_id
        ).all()
        max_p = max((float(p.prestige_level) for p in prestige_rows), default=1.0)

    return {
        "config": config_name,
        "seed": seed,
        "bot": bot_slug,
        "turns": turn,
        "final_balance_cents": final_balance,
        "bankrupt": bankrupt,
        "tasks_completed": tasks_completed,
        "tasks_failed": tasks_failed,
        "max_prestige": max_p,
    }


def main():
    """Run the selected bots/configs/seeds and print per-run lines plus a summary table."""
    parser = argparse.ArgumentParser(description="Run YC-Bench bot strategies")
    parser.add_argument("--bot", choices=list(STRATEGIES.keys()), default=None,
                        help="Run only this bot (default: all)")
    parser.add_argument("--config", choices=CONFIGS, default=None,
                        help="Run only this config (default: all)")
    parser.add_argument("--seed", type=int, default=None,
                        help="Run only this seed (default: all)")
    args = parser.parse_args()

    # Compare against None rather than truthiness: "--seed 0" is a valid
    # request for one seed and must not fall back to the full seed list.
    bots = [args.bot] if args.bot is not None else list(STRATEGIES.keys())
    configs = [args.config] if args.config is not None else CONFIGS
    seeds = [args.seed] if args.seed is not None else SEEDS

    results = []
    total = len(bots) * len(configs) * len(seeds)
    print(f"Running {total} bot simulations...\n")

    for bot_name in bots:
        slug, strategy_fn = STRATEGIES[bot_name]
        for config_name in configs:
            for seed in seeds:
                print(f" {slug} | {config_name} seed={seed} ...", end=" ", flush=True)
                r = run_bot(config_name, seed, slug, strategy_fn)
                results.append(r)

                if r["bankrupt"]:
                    tag = "BANKRUPT"
                else:
                    tag = f"${r['final_balance_cents']/100:,.0f}"
                print(f"{tag} | {r['tasks_completed']} OK, {r['tasks_failed']} fail | prestige {r['max_prestige']:.1f} | {r['turns']} turns")

    print(f"\n{'Bot':<16} {'Config':<12} {'Seed':<5} {'Final Balance':>14} {'OK':>4} {'Fail':>5} {'Prestige':>9}")
    print("-" * 70)
    for r in results:
        fb = "BANKRUPT" if r["bankrupt"] else f"${r['final_balance_cents']/100:,.0f}"
        print(f"{r['bot']:<16} {r['config']:<12} {r['seed']:<5} {fb:>14} {r['tasks_completed']:>4} {r['tasks_failed']:>5} {r['max_prestige']:>8.1f}")

    bankrupt_count = sum(1 for r in results if r["bankrupt"])
    print(f"\nBankruptcies: {bankrupt_count}/{len(results)}")


if __name__ == "__main__":
    main()
"""Greedy bot shim — delegates to bot_runner.py.

Usage:
    uv run python scripts/greedy_bot.py
"""
from __future__ import annotations

import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
sys.path.insert(0, str(Path(__file__).parent))

from bot_runner import CONFIGS, SEEDS, STRATEGIES, run_bot


def main():
    """Run the greedy strategy on every (config, seed) pair and print a summary.

    One progress line is printed per run, then a table of all results and
    the number of bankrupt runs.
    """
    slug, strategy_fn = STRATEGIES["greedy"]
    print("Running greedy bot across all configs and seeds...\n")
    results = []

    for config_name in CONFIGS:
        for seed in SEEDS:
            print(f" {config_name} seed={seed} ...", end=" ", flush=True)
            r = run_bot(config_name, seed, slug, strategy_fn)
            results.append(r)

            # The former ">= 1_000_000_00" branch produced exactly the same
            # text as the default branch, so a single two-way choice suffices.
            if r["bankrupt"]:
                tag = "BANKRUPT"
            else:
                tag = f"${r['final_balance_cents']/100:,.0f}"

            print(f"{tag} | {r['tasks_completed']} OK, {r['tasks_failed']} fail | prestige {r['max_prestige']:.1f} | {r['turns']} turns")

    print(f"\n{'Config':<12} {'Seed':<5} {'Final Balance':>14} {'OK':>4} {'Fail':>5} {'Prestige':>9}")
    print("-" * 55)
    for r in results:
        fb = "BANKRUPT" if r["bankrupt"] else f"${r['final_balance_cents']/100:,.0f}"
        print(f"{r['config']:<12} {r['seed']:<5} {fb:>14} {r['tasks_completed']:>4} {r['tasks_failed']:>5} {r['max_prestige']:>8.1f}")

    bankrupt_count = sum(1 for r in results if r["bankrupt"])
    print(f"\nBankruptcies: {bankrupt_count}/{len(results)}")


if __name__ == "__main__":
    main()
mdates import matplotlib.ticker as mticker +import numpy as np ROOT = Path(__file__).parent.parent INITIAL_FUNDS_CENTS = 25_000_000 +# ── Collinear brand palette ────────────────────────────────────────────────── +NAVY = "#13234D" +ORANGE = "#F26125" +BLUE = "#4D65FF" +BG_COLOR = "#FAFBFD" +GRID_CLR = "#E8ECF2" +TEXT_CLR = "#2A2F3D" +MUTED = "#6B7694" +CARD_BG = "#FFFFFF" + MODELS = { "sonnet": { "slug": "anthropic_claude-sonnet-4-6", "label": "Sonnet 4.6", - "color": "#2563eb", + "color": BLUE, }, "gemini": { "slug": "gemini_gemini-3-flash-preview", "label": "Gemini 3 Flash", - "color": "#f97316", + "color": ORANGE, }, "gpt52": { "slug": "openai_gpt-5.2", "label": "GPT-5.2", - "color": "#16a34a", + "color": "#22C55E", + }, + "greedy": { + "slug": "greedy_bot", + "label": "Greedy Bot", + "color": NAVY, }, } +BOT_KEYS = {"greedy"} + CONFIGS = ["medium", "hard", "nightmare"] SEEDS = [1, 2, 3] +DIFF_COLORS = {"medium": BLUE, "hard": ORANGE, "nightmare": "#DC2626"} + + +def load_logo_image(height_px=80): + """Render the wordmark SVG to a high-res RGBA PIL image.""" + import os, ctypes.util + # Ensure homebrew cairo is findable + if ctypes.util.find_library("cairo") is None: + brew_lib = "/opt/homebrew/lib" + if Path(brew_lib).exists(): + os.environ.setdefault("DYLD_LIBRARY_PATH", brew_lib) + import cairosvg + from PIL import Image + import io + p = ROOT / "plots" / "collinear_wordmark.svg" + if not p.exists(): + return None + png_data = cairosvg.svg2png(url=str(p), output_height=height_px) + return Image.open(io.BytesIO(png_data)).convert("RGBA") + def load_funds_curve(db_path): con = sqlite3.connect(str(db_path)) @@ -42,7 +80,6 @@ def load_funds_curve(db_path): con.close() if not rows: return [], [] - times, balances = [], [] running = INITIAL_FUNDS_CENTS start = datetime.fromisoformat(rows[0][0]).replace( @@ -50,16 +87,13 @@ def load_funds_curve(db_path): ) times.append(start) balances.append(running / 100) - for occurred_at, amount_cents in rows: running += 
int(amount_cents) t = datetime.fromisoformat(occurred_at) - # Cap at end of year 1 for apples-to-apples if t.year > 2025: break times.append(t) balances.append(running / 100) - return times, balances @@ -74,13 +108,10 @@ def load_all(): times, balances = load_funds_curve(db_path) bankrupt = len(balances) > 1 and balances[-1] <= 0 runs.append({ - "config": config, - "seed": seed, - "model_key": key, - "label": model["label"], + "config": config, "seed": seed, + "model_key": key, "label": model["label"], "color": model["color"], - "times": times, - "balances": balances, + "times": times, "balances": balances, "bankrupt": bankrupt, "final": balances[-1] if balances else 0, }) @@ -90,79 +121,197 @@ def load_all(): def make_plot(runs): - fig, axes = plt.subplots(3, 3, figsize=(18, 14), facecolor="white") - fig.suptitle( - "Sonnet 4.6 vs Gemini 3 Flash vs GPT-5.2 · YC-Bench · 1-Year Horizon", - fontsize=16, fontweight="600", y=0.98, color="#1a1a1a", + fig, axes = plt.subplots(3, 3, figsize=(30, 22), facecolor=BG_COLOR) + + # ── Header band (drawn as a filled Rectangle patch on the figure) ──── + from matplotlib.patches import FancyBboxPatch + header_rect = plt.Rectangle((0, 0.90), 1, 0.10, + transform=fig.transFigure, facecolor=NAVY, + edgecolor="none", zorder=0) + fig.patches.append(header_rect) + # Orange accent line under header + accent_rect = plt.Rectangle((0, 0.895), 1, 0.006, + transform=fig.transFigure, facecolor=ORANGE, + edgecolor="none", zorder=1) + fig.patches.append(accent_rect) + + fig.text( + 0.5, 0.955, + "YC-Bench | 1-Year Horizon", + ha="center", va="center", + fontsize=50, fontweight="700", color="white", + fontfamily="Helvetica Neue", zorder=2, ) + # ── Common legend in header ───────────────────────────────────────── + legend_items = [ + ("Sonnet 4.6", BLUE, "-", 4.0, 0.95), + ("Gemini 3 Flash", ORANGE, "-", 4.0, 0.95), + ("GPT-5.2", "#22C55E", "-", 4.0, 0.95), + ("Greedy Bot", NAVY, "--", 3.5, 0.75), + ] + legend_handles = [] + for lbl, clr, ls, lw, 
alpha in legend_items: + line = plt.Line2D([0], [0], color=clr, linewidth=lw, linestyle=ls, + alpha=alpha) + legend_handles.append(line) + legend_labels = [item[0] for item in legend_items] + fig.legend( + legend_handles, legend_labels, + loc="center", bbox_to_anchor=(0.53, 0.855), + ncol=4, fontsize=22, frameon=False, + labelcolor=TEXT_CLR, handlelength=3.5, handletextpad=1.0, + columnspacing=3.0, + ) + + # Pre-render logo from SVG at high res (will composite after savefig) + logo_img = load_logo_image(height_px=120) for row, config in enumerate(CONFIGS): for col, seed in enumerate(SEEDS): ax = axes[row][col] - ax.set_facecolor("white") - for spine in ax.spines.values(): - spine.set_edgecolor("#d0d0d0") - spine.set_linewidth(0.7) + ax.set_facecolor(CARD_BG) - # Bankruptcy line - ax.axhline(0, color="#ef4444", linewidth=0.8, linestyle="--", alpha=0.4) - ax.axhline(250_000, color="#9ca3af", linewidth=0.5, linestyle=":", alpha=0.4) + for spine in ax.spines.values(): + spine.set_edgecolor(GRID_CLR) + spine.set_linewidth(1.2) + + # Log scale on y-axis + ax.set_yscale("log") + + # Reference lines + ax.axhline(250_000, color=MUTED, linewidth=0.8, linestyle=":", alpha=0.3, zorder=1) cell_runs = [r for r in runs if r["config"] == config and r["seed"] == seed] + # Sort: bots first (background), then survivors desc, then bankrupt + def sort_key(r): + if r["model_key"] in BOT_KEYS: return (0, 0) + if not r["bankrupt"]: return (1, -r["final"]) + return (2, 0) + cell_runs.sort(key=sort_key) + for r in cell_runs: if not r["times"]: continue - alpha = 0.35 if r["bankrupt"] else 1.0 - lw = 1.0 if r["bankrupt"] else 2.0 + is_bot = r["model_key"] in BOT_KEYS + if r["bankrupt"]: + alpha, lw, ls = 0.4, 2.0, "-" if not is_bot else "--" + elif is_bot: + alpha, lw, ls = 0.75, 3.5, "--" + else: + alpha, lw, ls = 0.95, 3.0, "-" + + val = r["final"] if r["bankrupt"]: lbl = f"{r['label']} — bankrupt" + elif val >= 1e6: + lbl = f"{r['label']} — ${val/1e6:.1f}M" else: - val = r["final"] - lbl 
= f"{r['label']} — ${val/1e6:.1f}M" if val >= 1e6 else f"{r['label']} — ${val/1e3:.0f}K" + lbl = f"{r['label']} — ${val/1e3:.0f}K" - ax.plot(r["times"], r["balances"], color=r["color"], - linewidth=lw, alpha=alpha, label=lbl, zorder=3) + # Clamp balances for log scale (floor at $1K) + plot_bals = [max(b, 1_000) for b in r["balances"]] + + ax.plot( + r["times"], plot_bals, + color=r["color"], linewidth=lw, alpha=alpha, + label=lbl, linestyle=ls, + zorder=2 if is_bot else 3, + ) if r["bankrupt"]: - ax.scatter([r["times"][-1]], [r["balances"][-1]], - color=r["color"], marker="x", s=50, linewidths=1.5, alpha=0.5, zorder=5) - else: - ax.scatter([r["times"][-1]], [r["balances"][-1]], - color=r["color"], marker="*", s=100, zorder=5) + ax.scatter( + [r["times"][-1]], [max(r["balances"][-1], 1_000)], + color=r["color"], marker="X", s=120, + linewidths=2, alpha=0.6, zorder=5, + edgecolors="white", + ) + elif not is_bot: + ax.scatter( + [r["times"][-1]], [r["balances"][-1]], + color=r["color"], marker="o", s=100, zorder=5, + edgecolors="white", linewidths=2.5, + ) - # Title - if row == 0: - ax.set_title(f"Seed {seed}", fontsize=11, fontweight="500", color="#374151", pad=8) + # No per-axis column title (seed labels placed via fig.text below) # Row label if col == 0: - ax.set_ylabel(f"{config.upper()}\n\nFunds", fontsize=10, color="#374151", fontweight="600") + ax.set_ylabel("Funds ($)", fontsize=20, color=MUTED, fontweight="400", labelpad=10) + ax.annotate( + config.upper(), + xy=(-0.22, 0.5), xycoords="axes fraction", + fontsize=23, fontweight="800", + color=DIFF_COLORS[config], + ha="center", va="center", rotation=90, + ) - # Formatting + # Axes formatting ax.xaxis.set_major_formatter(mdates.DateFormatter("%b")) - ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3)) - ax.tick_params(colors="#666", labelsize=7) - ax.grid(axis="y", color="#f0f0f0", linewidth=0.5) + ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2)) + ax.tick_params(colors=MUTED, labelsize=18, 
length=5, width=0.8, pad=6) + ax.grid(axis="y", color=GRID_CLR, linewidth=0.7, alpha=0.8) + ax.grid(axis="x", color=GRID_CLR, linewidth=0.4, alpha=0.4) ax.yaxis.set_major_formatter( mticker.FuncFormatter( - lambda x, _: f"${x/1e6:.0f}M" if abs(x) >= 1e6 - else f"${x/1e3:.0f}K" if abs(x) >= 1e3 + lambda x, _: f"${x/1e6:.0f}M" if x >= 1e6 + else f"${x/1e3:.0f}K" if x >= 1e3 else f"${x:.0f}" ) ) + ax.yaxis.set_minor_formatter(mticker.NullFormatter()) - legend = ax.legend(fontsize=7, loc="upper left", frameon=True, - facecolor="white", edgecolor="#e5e7eb", framealpha=0.9) - for text in legend.get_texts(): - text.set_color("#374151") + # No per-cell legend (common legend in header) + + plt.subplots_adjust( + left=0.08, right=0.98, top=0.79, bottom=0.05, + hspace=0.30, wspace=0.22, + ) + + # Seed column headers just above the plot grid + col_centers = [0.08 + (0.98 - 0.08) * (i + 0.5) / 3 for i in range(3)] + for i, seed in enumerate(SEEDS): + fig.text( + col_centers[i], 0.80, + f"Seed {seed}", + ha="center", va="bottom", + fontsize=26, fontweight="600", color=TEXT_CLR, + ) + + # Footer + fig.text( + 0.5, 0.01, + "collinear.ai | YC-Bench: Long-Horizon Deterministic Benchmark for LLM Agents", + ha="center", va="bottom", + fontsize=18, fontweight="400", color=MUTED, + fontstyle="italic", + ) - plt.tight_layout(rect=[0, 0, 1, 0.95]) out = ROOT / "plots" / "sonnet_vs_gemini.png" out.parent.mkdir(parents=True, exist_ok=True) - plt.savefig(out, dpi=180, bbox_inches="tight", facecolor="white") + dpi = 150 + plt.savefig(out, dpi=dpi, facecolor=BG_COLOR, pad_inches=0) + + # Composite SVG logo onto the navy header band + if logo_img is not None: + from PIL import Image + plot_img = Image.open(out).convert("RGBA") + img_w, img_h = plot_img.size + # Header band is top 10% of image (no pad_inches) + header_top = 0 + header_h = int(img_h * 0.10) + # Scale logo to ~65% of header height + target_h = int(header_h * 0.65) + scale = target_h / logo_img.size[1] + logo = 
logo_img.resize((int(logo_img.size[0] * scale), target_h), Image.LANCZOS) + # Center vertically in the navy header band + y_offset = header_top + (header_h - target_h) // 2 + x_offset = 70 + plot_img.paste(logo, (x_offset, y_offset), logo) + plot_img.save(out) + print(f"\nSaved: {out}")