Add Collinear branding, bot runners, and clean up stale plots
- Restyle plot_comparison.py with Collinear brand palette and logo
- Add collinear_logo.svg and collinear_wordmark.svg
- Add bot_runner.py (greedy/random/throughput/prestige strategies)
- Add greedy_bot.py shim
- Remove old unused plots (funds_curves, notepad gifs, sonnet_results)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
11
plots/collinear_logo.svg
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
<svg width="39" height="40" viewBox="0 0 39 40" fill="none" xmlns="http://www.w3.org/2000/svg">
|
||||
<g clip-path="url(#clip0_369_133)">
|
||||
<path d="M25.6184 26.5238H36.7879C34.2158 33.8016 27.3592 39.0125 19.316 39.0125C9.06315 39.0125 0.75 30.5566 0.75 20.1276C0.75 9.69867 9.06315 1.25 19.316 1.25C27.3592 1.25 34.2087 6.45363 36.7879 13.7315H25.6184C23.9558 12.0258 21.7176 11.0646 19.316 11.0646C14.4063 11.0646 10.406 15.1336 10.406 20.1348C10.406 25.1361 14.4063 29.1978 19.316 29.1978C21.7176 29.1978 23.9558 28.2438 25.6184 26.531V26.5238Z" fill="#13234D"/>
|
||||
<path d="M37.8818 20.125C37.8818 20.7249 37.8534 21.3103 37.7966 21.8957H23.4653L22.7831 22.8064C21.9447 23.9194 20.68 24.5626 19.3157 24.5626C16.9071 24.5626 14.9531 22.5751 14.9531 20.125C14.9531 17.675 16.9071 15.6875 19.3157 15.6875C20.6871 15.6875 21.9518 16.3307 22.7831 17.4437L23.4653 18.3471H37.7966C37.8534 18.9325 37.8818 19.5252 37.8818 20.125Z" fill="#F26125"/>
|
||||
</g>
|
||||
<defs>
|
||||
<clipPath id="clip0_369_133">
|
||||
<rect width="39" height="39" fill="white" transform="translate(0 0.5)"/>
|
||||
</clipPath>
|
||||
</defs>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 1.1 KiB |
12
plots/collinear_wordmark.svg
Normal file
|
After Width: | Height: | Size: 7.4 KiB |
|
Before Width: | Height: | Size: 150 KiB |
|
Before Width: | Height: | Size: 17 KiB |
|
Before Width: | Height: | Size: 39 KiB |
|
Before Width: | Height: | Size: 20 KiB |
|
Before Width: | Height: | Size: 178 KiB |
|
Before Width: | Height: | Size: 407 KiB After Width: | Height: | Size: 721 KiB |
451
scripts/bot_runner.py
Normal file
|
|
@ -0,0 +1,451 @@
|
|||
"""Bot runner: plays YC-Bench using direct DB access with pluggable strategies.
|
||||
|
||||
Strategies:
|
||||
greedy — pick highest reward among completable tasks
|
||||
random — pick randomly among completable tasks (deterministic via RngStreams)
|
||||
throughput — pick highest reward/hour among completable tasks
|
||||
prestige — phase 1: climb prestige fast, phase 2: throughput
|
||||
|
||||
Usage:
|
||||
uv run python scripts/bot_runner.py # all bots, all configs, all seeds
|
||||
uv run python scripts/bot_runner.py --bot greedy # just greedy
|
||||
uv run python scripts/bot_runner.py --bot random --seed 1 --config medium
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from decimal import Decimal
|
||||
from pathlib import Path
|
||||
from typing import Callable, Optional
|
||||
from uuid import uuid4
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from yc_bench.config import load_config
|
||||
from yc_bench.core.business_time import add_business_hours
|
||||
from yc_bench.core.engine import advance_time
|
||||
from yc_bench.core.eta import recalculate_etas
|
||||
from yc_bench.core.events import fetch_next_event, insert_event
|
||||
from yc_bench.db.models.company import Company, CompanyPrestige
|
||||
from yc_bench.db.models.employee import Employee, EmployeeSkillRate
|
||||
from yc_bench.db.models.event import EventType
|
||||
from yc_bench.db.models.sim_state import SimState
|
||||
from yc_bench.db.models.task import Task, TaskAssignment, TaskRequirement, TaskStatus
|
||||
from yc_bench.db.session import build_engine, build_session_factory, init_db, session_scope
|
||||
from yc_bench.services.generate_tasks import generate_replacement_task
|
||||
from yc_bench.services.rng import RngStreams
|
||||
from yc_bench.services.seed_world import SeedWorldRequest, seed_world_transactional
|
||||
|
||||
# Difficulty presets and seeds that main() sweeps when no --config / --seed
# filter is given on the command line.
CONFIGS = ["medium", "hard", "nightmare"]
SEEDS = [1, 2, 3]

# Cap task cycles to match LLM throughput. An LLM gets 500 turns and needs
# ~5 turns per task cycle (browse + accept + 5× assign + dispatch + resume),
# so it can complete at most ~100 tasks. The sim still runs to horizon —
# once the budget is exhausted the bot just advances time (paying salaries,
# bleeding cash) exactly like an LLM that hit max_turns.
# NOTE(review): the steps listed above add up to ~9 turns per cycle, not ~5 —
# confirm which figure the 100-cycle cap is meant to reflect.
MAX_TASK_CYCLES = 100
|
||||
|
||||
|
||||
@dataclass
class CandidateTask:
    """A market task paired with the figures the strategy functions rank it by."""

    task: object  # ORM Task row
    reward_cents: int  # copied from task.reward_funds_cents
    prestige_delta: float  # float(task.reward_prestige_delta)
    # Estimated hours with every employee assigned; Decimal("999999") is stored
    # when no estimate exists (the team has no rate in a required domain).
    completion_hours: Decimal
    # True when the estimated finish time is on or before the computed deadline.
    is_completable: bool
|
||||
|
||||
|
||||
def estimate_completion_hours(task_reqs, employee_skills, n_concurrent_tasks=1):
    """Estimate how long a task takes when the whole team works on it.

    Args:
        task_reqs: iterable of dicts with "domain" and "required_qty" keys.
        employee_skills: iterable of dicts mapping domain -> hourly rate
            (Decimal); a missing domain counts as a rate of zero.
        n_concurrent_tasks: each employee's rate is divided by this number.

    Returns:
        The largest per-domain ``required_qty / team_rate`` as a Decimal
        (``Decimal("0")`` when there are no requirements), or ``None`` when
        the team's combined rate in some required domain is not positive.
    """
    zero = Decimal("0")

    # Combined team rate for every domain the task asks for.
    team_rate = {}
    for requirement in task_reqs:
        wanted = requirement["domain"]
        combined = zero
        for skills in employee_skills:
            combined += skills.get(wanted, zero) / Decimal(n_concurrent_tasks)
        team_rate[wanted] = combined

    # Domains progress in parallel, so the slowest one sets the finish time.
    longest = zero
    for requirement in task_reqs:
        quantity = Decimal(str(requirement["required_qty"]))
        rate = team_rate.get(requirement["domain"], zero)
        if rate <= 0:
            return None
        longest = max(longest, quantity / rate)
    return longest
|
||||
|
||||
|
||||
def _compute_deadline(accepted_at, total_required_qty, cfg):
    """Return the deadline for a task accepted at ``accepted_at``.

    The allowance in business days is the larger of ``cfg.deadline_min_biz_days``
    and ``total_required_qty / cfg.deadline_qty_per_day`` truncated to an int;
    it is converted to hours using the configured workday length and added to
    ``accepted_at`` with ``add_business_hours``.
    """
    hours_per_workday = cfg.workday_end_hour - cfg.workday_start_hour
    allowed_days = max(
        cfg.deadline_min_biz_days,
        int(total_required_qty / cfg.deadline_qty_per_day),
    )
    hour_budget = Decimal(str(allowed_days)) * Decimal(str(hours_per_workday))
    return add_business_hours(accepted_at, hour_budget)
|
||||
|
||||
|
||||
def _build_candidates(db, company_id, sim_state, world_cfg, emp_skills):
    """Assemble a CandidateTask for every market task the company may take.

    A market task is visible when its ``required_prestige`` does not exceed the
    integer part of the company's highest prestige level (1.0 when the company
    has no prestige rows). Tasks are queried in descending reward order.

    Returns:
        ``(candidates, max_prestige)`` where ``candidates`` is a list of
        CandidateTask and ``max_prestige`` is the float described above.
    """
    prestige_levels = [
        float(row.prestige_level)
        for row in db.query(CompanyPrestige)
        .filter(CompanyPrestige.company_id == company_id)
        .all()
    ]
    max_prestige = max(prestige_levels, default=1.0)

    visible_tasks = (
        db.query(Task)
        .filter(
            Task.status == TaskStatus.MARKET,
            Task.required_prestige <= int(max_prestige),
        )
        .order_by(Task.reward_funds_cents.desc())
        .all()
    )

    # Plain domain -> rate mappings, one per employee.
    skill_maps = [dict(entry["skills"]) for entry in emp_skills]

    candidates = []
    for market_task in visible_tasks:
        requirement_rows = (
            db.query(TaskRequirement)
            .filter(TaskRequirement.task_id == market_task.id)
            .all()
        )
        total_qty = sum(float(row.required_qty) for row in requirement_rows)
        task_reqs = [
            {"domain": row.domain, "required_qty": float(row.required_qty)}
            for row in requirement_rows
        ]

        hours = estimate_completion_hours(task_reqs, skill_maps, n_concurrent_tasks=1)

        if hours is None:
            # No estimate: the task cannot be finished; store the sentinel duration.
            fits_deadline = False
            recorded_hours = Decimal("999999")
        else:
            deadline = _compute_deadline(sim_state.sim_time, total_qty, world_cfg)
            finish_time = add_business_hours(sim_state.sim_time, hours)
            fits_deadline = finish_time <= deadline
            recorded_hours = hours

        candidates.append(
            CandidateTask(
                task=market_task,
                reward_cents=market_task.reward_funds_cents,
                prestige_delta=float(market_task.reward_prestige_delta),
                completion_hours=recorded_hours,
                is_completable=fits_deadline,
            )
        )

    return candidates, max_prestige
|
||||
|
||||
|
||||
# ── Strategy functions ──────────────────────────────────────────────────────
|
||||
|
||||
# Strategy signature:
#   (completable: list[CandidateTask], context: dict) -> Optional[CandidateTask]
# run_bot() passes a context with "seed", "turn" and "max_prestige"; a strategy
# returns None when it declines to pick a task.
StrategyFn = Callable
|
||||
|
||||
|
||||
def strategy_greedy(completable: list[CandidateTask], context: dict) -> Optional[CandidateTask]:
    """Return the candidate with the largest ``reward_cents``.

    Returns None when ``completable`` is empty. On a tie the candidate that
    appears first wins. ``context`` is not consulted.
    """
    best = None
    for candidate in completable or ():
        # Strict comparison keeps the earliest candidate on equal rewards.
        if best is None or candidate.reward_cents > best.reward_cents:
            best = candidate
    return best
|
||||
|
||||
|
||||
def strategy_random(completable: list[CandidateTask], context: dict) -> Optional[CandidateTask]:
    """Choose one completable task using a seeded random stream.

    The stream is obtained from ``RngStreams(context["seed"])`` under a name
    that embeds ``context["turn"]``. Returns None when ``completable`` is empty.
    """
    if completable:
        seed, turn = context["seed"], context["turn"]
        picker = RngStreams(seed).stream(f"bot_random_select:{turn}")
        return picker.choice(completable)
    return None
|
||||
|
||||
|
||||
def strategy_throughput(completable: list[CandidateTask], context: dict) -> Optional[CandidateTask]:
    """Pick the task with the highest reward per hour.

    Args:
        completable: candidates exposing ``reward_cents`` and a Decimal
            ``completion_hours``.
        context: unused; accepted so all strategies share one signature.

    Returns:
        The best candidate, or None when ``completable`` is empty. On a tie the
        candidate that appears first wins.
    """
    if not completable:
        return None

    def reward_rate(candidate):
        hours = candidate.completion_hours
        if hours <= 0:
            # A task estimated at zero hours (e.g. no outstanding requirements)
            # would divide by zero. Treat it as unbounded throughput: rank it
            # above every timed task, ordering such tasks by raw reward.
            return (1, Decimal(candidate.reward_cents))
        return (0, Decimal(candidate.reward_cents) / hours)

    return max(completable, key=reward_rate)
|
||||
|
||||
|
||||
def strategy_prestige(completable: list[CandidateTask], context: dict) -> Optional[CandidateTask]:
    """Phase 1 (prestige < 5): climb prestige fastest. Phase 2: throughput.

    Args:
        completable: candidates exposing ``reward_cents``, ``prestige_delta``
            and a Decimal ``completion_hours``.
        context: must contain "max_prestige" when ``completable`` is non-empty.

    Returns:
        While ``context["max_prestige"]`` is below 5 and at least one candidate
        has a positive ``prestige_delta``: the candidate with the best prestige
        gain per hour. Otherwise the candidate with the best reward per hour.
        None when ``completable`` is empty.
    """
    if not completable:
        return None

    def per_hour(amount, candidate):
        hours = candidate.completion_hours
        if hours <= 0:
            # Zero-hour estimates would divide by zero; rank them above every
            # timed task and order them by the raw amount instead.
            return (1, amount)
        return (0, amount / hours)

    current_prestige = context["max_prestige"]
    if current_prestige < 5:
        # Prefer tasks that give prestige delta per hour of work
        prestige_tasks = [c for c in completable if c.prestige_delta > 0]
        if prestige_tasks:
            return max(
                prestige_tasks,
                key=lambda c: per_hour(Decimal(str(c.prestige_delta)), c),
            )

    # Fall back to throughput
    return max(completable, key=lambda c: per_hour(Decimal(c.reward_cents), c))
|
||||
|
||||
|
||||
# CLI bot name -> (slug, strategy function). run_bot() uses the slug in the
# SQLite filename, to derive the company name, and as the "bot" field of the
# result dict.
STRATEGIES = {
    "greedy": ("greedy_bot", strategy_greedy),
    "random": ("random_bot", strategy_random),
    "throughput": ("throughput_bot", strategy_throughput),
    "prestige": ("prestige_bot", strategy_prestige),
}
|
||||
|
||||
|
||||
# ── Shared simulation runner ───────────────────────────────────────────────
|
||||
|
||||
def run_bot(config_name: str, seed: int, bot_slug: str, strategy_fn: StrategyFn):
    """Run a bot strategy on one (config, seed) pair. Returns result dict.

    Side effects: creates ``db/`` if needed, deletes any existing
    ``db/{config_name}_{seed}_{bot_slug}.db`` and recreates it, and sets the
    ``DATABASE_URL`` and ``YC_BENCH_EXPERIMENT`` environment variables.

    Args:
        config_name: name passed to ``load_config``.
        seed: run seed used for world seeding and handed to the strategy.
        bot_slug: identifier used in the DB filename and the company name.
        strategy_fn: called as ``strategy_fn(completable, context)``; returns a
            CandidateTask to accept or None to wait for the next event.

    Returns:
        dict with keys "config", "seed", "bot", "turns", "final_balance_cents",
        "bankrupt", "tasks_completed", "tasks_failed" and "max_prestige".
    """
    cfg = load_config(config_name)
    world_cfg = cfg.world

    db_dir = Path("db")
    db_dir.mkdir(exist_ok=True)
    db_path = db_dir / f"{config_name}_{seed}_{bot_slug}.db"

    # Start every run from a fresh database file.
    if db_path.exists():
        db_path.unlink()

    db_url = f"sqlite:///{db_path}"
    os.environ["DATABASE_URL"] = db_url
    os.environ["YC_BENCH_EXPERIMENT"] = config_name

    engine = build_engine(db_url)
    init_db(engine)
    factory = build_session_factory(engine)

    # ── Seed the world, the horizon-end event and the sim state ──
    with session_scope(factory) as db:
        start_dt = datetime(2025, 1, 1, 9, 0, 0, tzinfo=timezone.utc)
        horizon_end = start_dt.replace(year=start_dt.year + cfg.sim.horizon_years)

        req = SeedWorldRequest(
            run_seed=seed,
            company_name=bot_slug.replace("_", " ").title(),
            horizon_years=cfg.sim.horizon_years,
            employee_count=world_cfg.num_employees,
            market_task_count=world_cfg.num_market_tasks,
            start_date=start_dt,
        )
        result = seed_world_transactional(db, req)
        company_id = result.company_id

        insert_event(
            db=db,
            company_id=company_id,
            event_type=EventType.HORIZON_END,
            scheduled_at=horizon_end,
            payload={"reason": "horizon_end"},
            dedupe_key="horizon_end",
        )

        sim_state = SimState(
            company_id=company_id,
            sim_time=start_dt,
            run_seed=seed,
            horizon_end=horizon_end,
            replenish_counter=0,
        )
        db.add(sim_state)
        db.flush()

    tasks_completed = 0
    tasks_failed = 0
    task_cycles_used = 0  # number of tasks accepted so far, capped by MAX_TASK_CYCLES
    turn = 0  # loop iterations, including the one that ends the run

    while True:
        turn += 1

        # Each turn runs in its own session scope.
        with session_scope(factory) as db:
            sim_state = db.query(SimState).first()
            company = db.query(Company).filter(Company.id == company_id).one()

            # Stop on negative funds or once the horizon is reached.
            if company.funds_cents < 0:
                break
            if sim_state.sim_time >= sim_state.horizon_end:
                break

            active_tasks = db.query(Task).filter(
                Task.company_id == company_id,
                Task.status == TaskStatus.ACTIVE,
            ).all()

            # A task is in flight: advance to the next event and tally any
            # task_completed wake events as successes or failures.
            if active_tasks:
                next_event = fetch_next_event(db, company_id, sim_state.horizon_end)
                if next_event is None:
                    break
                adv = advance_time(db, company_id, next_event.scheduled_at)
                for we in adv.wake_events:
                    if we.get("type") == "task_completed":
                        if we.get("success"):
                            tasks_completed += 1
                        else:
                            tasks_failed += 1
                if adv.bankrupt or adv.horizon_reached:
                    break
                continue

            # No active task — if we've used up our task budget, just
            # advance time (pay salaries, bleed cash) like an LLM that
            # hit max_turns would.
            if task_cycles_used >= MAX_TASK_CYCLES:
                next_event = fetch_next_event(db, company_id, sim_state.horizon_end)
                if next_event is None:
                    adv = advance_time(db, company_id, sim_state.horizon_end)
                    break
                adv = advance_time(db, company_id, next_event.scheduled_at)
                if adv.bankrupt or adv.horizon_reached:
                    break
                continue

            # Get employees and build candidates
            employees = db.query(Employee).filter(Employee.company_id == company_id).all()
            emp_skills = []
            for emp in employees:
                skills = db.query(EmployeeSkillRate).filter(
                    EmployeeSkillRate.employee_id == emp.id
                ).all()
                skill_map = {s.domain: Decimal(s.rate_domain_per_hour) for s in skills}
                emp_skills.append({"id": emp.id, "skills": skill_map})

            candidates, max_prestige = _build_candidates(db, company_id, sim_state, world_cfg, emp_skills)
            completable = [c for c in candidates if c.is_completable]

            context = {
                "seed": seed,
                "turn": turn,
                "max_prestige": max_prestige,
            }
            chosen = strategy_fn(completable, context)

            # Nothing chosen: advance to the next event (or straight to the
            # horizon when no event is pending) and try again.
            if chosen is None:
                next_event = fetch_next_event(db, company_id, sim_state.horizon_end)
                if next_event is None:
                    adv = advance_time(db, company_id, sim_state.horizon_end)
                    break
                adv = advance_time(db, company_id, next_event.scheduled_at)
                if adv.bankrupt or adv.horizon_reached:
                    break
                continue

            best_task = chosen.task

            # Accept the task
            reqs = db.query(TaskRequirement).filter(
                TaskRequirement.task_id == best_task.id
            ).all()
            total_qty = sum(float(r.required_qty) for r in reqs)

            best_task.status = TaskStatus.PLANNED
            best_task.company_id = company_id
            best_task.accepted_at = sim_state.sim_time
            best_task.deadline = _compute_deadline(sim_state.sim_time, total_qty, world_cfg)

            # Generate replacement
            # The pre-increment counter value is the one passed to the generator.
            counter = sim_state.replenish_counter
            sim_state.replenish_counter = counter + 1
            replacement = generate_replacement_task(
                run_seed=sim_state.run_seed,
                replenish_counter=counter,
                cfg=world_cfg,
            )
            replacement_row = Task(
                id=uuid4(),
                company_id=None,
                status=TaskStatus.MARKET,
                title=replacement.title,
                description=replacement.description,
                required_prestige=replacement.required_prestige,
                reward_funds_cents=replacement.reward_funds_cents,
                reward_prestige_delta=replacement.reward_prestige_delta,
                skill_boost_pct=replacement.skill_boost_pct,
                accepted_at=None, deadline=None, completed_at=None,
                success=None, halfway_event_emitted=False,
            )
            db.add(replacement_row)
            for domain, qty in replacement.requirements.items():
                db.add(TaskRequirement(
                    task_id=replacement_row.id,
                    domain=domain,
                    required_qty=qty,
                    completed_qty=0,
                ))

            # Assign ALL employees
            for e in emp_skills:
                db.add(TaskAssignment(
                    task_id=best_task.id,
                    employee_id=e["id"],
                    assigned_at=sim_state.sim_time,
                ))
            db.flush()

            # Activate only after the assignments have been flushed.
            best_task.status = TaskStatus.ACTIVE
            db.flush()

            recalculate_etas(db, company_id, sim_state.sim_time,
                             impacted_task_ids={best_task.id},
                             half_threshold=world_cfg.task_half_threshold)

            task_cycles_used += 1

    # Final state
    with session_scope(factory) as db:
        company = db.query(Company).filter(Company.id == company_id).one()
        sim_state = db.query(SimState).first()

        final_balance = company.funds_cents
        bankrupt = final_balance < 0

        prestige_rows = db.query(CompanyPrestige).filter(
            CompanyPrestige.company_id == company_id
        ).all()
        max_p = max((float(p.prestige_level) for p in prestige_rows), default=1.0)

    return {
        "config": config_name,
        "seed": seed,
        "bot": bot_slug,
        "turns": turn,
        "final_balance_cents": final_balance,
        "bankrupt": bankrupt,
        "tasks_completed": tasks_completed,
        "tasks_failed": tasks_failed,
        "max_prestige": max_p,
    }
|
||||
|
||||
|
||||
def main():
    """Parse CLI filters, run every selected (bot, config, seed) combination and print a report.

    ``--bot``, ``--config`` and ``--seed`` each narrow the sweep to a single
    value; an omitted option means "all" (STRATEGIES, CONFIGS, SEEDS).
    A progress line is printed per run, followed by a summary table and the
    bankruptcy count.
    """
    parser = argparse.ArgumentParser(description="Run YC-Bench bot strategies")
    parser.add_argument("--bot", choices=list(STRATEGIES.keys()), default=None,
                        help="Run only this bot (default: all)")
    parser.add_argument("--config", choices=CONFIGS, default=None,
                        help="Run only this config (default: all)")
    parser.add_argument("--seed", type=int, default=None,
                        help="Run only this seed (default: all)")
    args = parser.parse_args()

    bots = [args.bot] if args.bot else list(STRATEGIES.keys())
    configs = [args.config] if args.config else CONFIGS
    # Compare against None rather than truthiness so an explicit `--seed 0`
    # is honoured instead of silently falling back to every default seed.
    seeds = [args.seed] if args.seed is not None else SEEDS

    results = []
    total = len(bots) * len(configs) * len(seeds)
    print(f"Running {total} bot simulations...\n")

    for bot_name in bots:
        slug, strategy_fn = STRATEGIES[bot_name]
        for config_name in configs:
            for seed in seeds:
                print(f" {slug} | {config_name} seed={seed} ...", end=" ", flush=True)
                r = run_bot(config_name, seed, slug, strategy_fn)
                results.append(r)

                if r["bankrupt"]:
                    tag = "BANKRUPT"
                else:
                    tag = f"${r['final_balance_cents']/100:,.0f}"
                print(f"{tag} | {r['tasks_completed']} OK, {r['tasks_failed']} fail | prestige {r['max_prestige']:.1f} | {r['turns']} turns")

    print(f"\n{'Bot':<16} {'Config':<12} {'Seed':<5} {'Final Balance':>14} {'OK':>4} {'Fail':>5} {'Prestige':>9}")
    print("-" * 70)
    for r in results:
        fb = "BANKRUPT" if r["bankrupt"] else f"${r['final_balance_cents']/100:,.0f}"
        print(f"{r['bot']:<16} {r['config']:<12} {r['seed']:<5} {fb:>14} {r['tasks_completed']:>4} {r['tasks_failed']:>5} {r['max_prestige']:>8.1f}")

    bankrupt_count = sum(1 for r in results if r["bankrupt"])
    print(f"\nBankruptcies: {bankrupt_count}/{len(results)}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
48
scripts/greedy_bot.py
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
"""Greedy bot shim — delegates to bot_runner.py.
|
||||
|
||||
Usage:
|
||||
uv run python scripts/greedy_bot.py
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from bot_runner import CONFIGS, SEEDS, STRATEGIES, run_bot
|
||||
|
||||
|
||||
def main():
    """Run the greedy strategy over every config in CONFIGS and seed in SEEDS and print a report.

    Each run is delegated to ``bot_runner.run_bot``; a progress line is printed
    per run, followed by a summary table and the bankruptcy count.
    """
    slug, strategy_fn = STRATEGIES["greedy"]
    print("Running greedy bot across all configs and seeds...\n")
    results = []

    for config_name in CONFIGS:
        for seed in SEEDS:
            print(f" {config_name} seed={seed} ...", end=" ", flush=True)
            r = run_bot(config_name, seed, slug, strategy_fn)
            results.append(r)

            # The former ">= 1_000_000_00" branch formatted the balance exactly
            # like the fallback branch, so a single two-way choice suffices.
            if r["bankrupt"]:
                tag = "BANKRUPT"
            else:
                tag = f"${r['final_balance_cents']/100:,.0f}"

            print(f"{tag} | {r['tasks_completed']} OK, {r['tasks_failed']} fail | prestige {r['max_prestige']:.1f} | {r['turns']} turns")

    print(f"\n{'Config':<12} {'Seed':<5} {'Final Balance':>14} {'OK':>4} {'Fail':>5} {'Prestige':>9}")
    print("-" * 55)
    for r in results:
        fb = "BANKRUPT" if r["bankrupt"] else f"${r['final_balance_cents']/100:,.0f}"
        print(f"{r['config']:<12} {r['seed']:<5} {fb:>14} {r['tasks_completed']:>4} {r['tasks_failed']:>5} {r['max_prestige']:>8.1f}")

    bankrupt_count = sum(1 for r in results if r["bankrupt"])
    print(f"\nBankruptcies: {bankrupt_count}/{len(results)}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
"""Sonnet 4.6 vs Gemini 3 Flash vs GPT-5.2 — apples-to-apples comparison plot."""
|
||||
"""YC-Bench comparison plot — Collinear AI branding."""
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
|
@ -8,31 +8,69 @@ matplotlib.use("Agg")
|
|||
import matplotlib.pyplot as plt
|
||||
import matplotlib.dates as mdates
|
||||
import matplotlib.ticker as mticker
|
||||
import numpy as np
|
||||
|
||||
ROOT = Path(__file__).parent.parent
|
||||
INITIAL_FUNDS_CENTS = 25_000_000
|
||||
|
||||
# ── Collinear brand palette ──────────────────────────────────────────────────
|
||||
NAVY = "#13234D"
|
||||
ORANGE = "#F26125"
|
||||
BLUE = "#4D65FF"
|
||||
BG_COLOR = "#FAFBFD"
|
||||
GRID_CLR = "#E8ECF2"
|
||||
TEXT_CLR = "#2A2F3D"
|
||||
MUTED = "#6B7694"
|
||||
CARD_BG = "#FFFFFF"
|
||||
|
||||
MODELS = {
|
||||
"sonnet": {
|
||||
"slug": "anthropic_claude-sonnet-4-6",
|
||||
"label": "Sonnet 4.6",
|
||||
"color": "#2563eb",
|
||||
"color": BLUE,
|
||||
},
|
||||
"gemini": {
|
||||
"slug": "gemini_gemini-3-flash-preview",
|
||||
"label": "Gemini 3 Flash",
|
||||
"color": "#f97316",
|
||||
"color": ORANGE,
|
||||
},
|
||||
"gpt52": {
|
||||
"slug": "openai_gpt-5.2",
|
||||
"label": "GPT-5.2",
|
||||
"color": "#16a34a",
|
||||
"color": "#22C55E",
|
||||
},
|
||||
"greedy": {
|
||||
"slug": "greedy_bot",
|
||||
"label": "Greedy Bot",
|
||||
"color": NAVY,
|
||||
},
|
||||
}
|
||||
|
||||
BOT_KEYS = {"greedy"}
|
||||
|
||||
CONFIGS = ["medium", "hard", "nightmare"]
|
||||
SEEDS = [1, 2, 3]
|
||||
|
||||
DIFF_COLORS = {"medium": BLUE, "hard": ORANGE, "nightmare": "#DC2626"}
|
||||
|
||||
|
||||
def load_logo_image(height_px=80):
    """Render the wordmark SVG to a high-res RGBA PIL image.

    Args:
        height_px: output height in pixels passed to ``cairosvg.svg2png``.

    Returns:
        An RGBA ``PIL.Image``, or None when ``plots/collinear_wordmark.svg``
        does not exist under ROOT.
    """
    import ctypes.util
    import io
    import os

    p = ROOT / "plots" / "collinear_wordmark.svg"
    # Check for the asset first so a missing logo neither touches the
    # environment nor requires the optional cairosvg / Pillow packages.
    if not p.exists():
        return None

    # Ensure homebrew cairo is findable
    if ctypes.util.find_library("cairo") is None:
        brew_lib = "/opt/homebrew/lib"
        if Path(brew_lib).exists():
            os.environ.setdefault("DYLD_LIBRARY_PATH", brew_lib)

    # Imported only after the library path tweak above.
    import cairosvg
    from PIL import Image

    png_data = cairosvg.svg2png(url=str(p), output_height=height_px)
    return Image.open(io.BytesIO(png_data)).convert("RGBA")
|
||||
|
||||
|
||||
def load_funds_curve(db_path):
|
||||
con = sqlite3.connect(str(db_path))
|
||||
|
|
@ -42,7 +80,6 @@ def load_funds_curve(db_path):
|
|||
con.close()
|
||||
if not rows:
|
||||
return [], []
|
||||
|
||||
times, balances = [], []
|
||||
running = INITIAL_FUNDS_CENTS
|
||||
start = datetime.fromisoformat(rows[0][0]).replace(
|
||||
|
|
@ -50,16 +87,13 @@ def load_funds_curve(db_path):
|
|||
)
|
||||
times.append(start)
|
||||
balances.append(running / 100)
|
||||
|
||||
for occurred_at, amount_cents in rows:
|
||||
running += int(amount_cents)
|
||||
t = datetime.fromisoformat(occurred_at)
|
||||
# Cap at end of year 1 for apples-to-apples
|
||||
if t.year > 2025:
|
||||
break
|
||||
times.append(t)
|
||||
balances.append(running / 100)
|
||||
|
||||
return times, balances
|
||||
|
||||
|
||||
|
|
@ -74,13 +108,10 @@ def load_all():
|
|||
times, balances = load_funds_curve(db_path)
|
||||
bankrupt = len(balances) > 1 and balances[-1] <= 0
|
||||
runs.append({
|
||||
"config": config,
|
||||
"seed": seed,
|
||||
"model_key": key,
|
||||
"label": model["label"],
|
||||
"config": config, "seed": seed,
|
||||
"model_key": key, "label": model["label"],
|
||||
"color": model["color"],
|
||||
"times": times,
|
||||
"balances": balances,
|
||||
"times": times, "balances": balances,
|
||||
"bankrupt": bankrupt,
|
||||
"final": balances[-1] if balances else 0,
|
||||
})
|
||||
|
|
@ -90,79 +121,197 @@ def load_all():
|
|||
|
||||
|
||||
def make_plot(runs):
|
||||
fig, axes = plt.subplots(3, 3, figsize=(18, 14), facecolor="white")
|
||||
fig.suptitle(
|
||||
"Sonnet 4.6 vs Gemini 3 Flash vs GPT-5.2 · YC-Bench · 1-Year Horizon",
|
||||
fontsize=16, fontweight="600", y=0.98, color="#1a1a1a",
|
||||
fig, axes = plt.subplots(3, 3, figsize=(30, 22), facecolor=BG_COLOR)
|
||||
|
||||
# ── Header band (drawn as a filled Rectangle patch on the figure) ────
|
||||
from matplotlib.patches import FancyBboxPatch
|
||||
header_rect = plt.Rectangle((0, 0.90), 1, 0.10,
|
||||
transform=fig.transFigure, facecolor=NAVY,
|
||||
edgecolor="none", zorder=0)
|
||||
fig.patches.append(header_rect)
|
||||
# Orange accent line under header
|
||||
accent_rect = plt.Rectangle((0, 0.895), 1, 0.006,
|
||||
transform=fig.transFigure, facecolor=ORANGE,
|
||||
edgecolor="none", zorder=1)
|
||||
fig.patches.append(accent_rect)
|
||||
|
||||
fig.text(
|
||||
0.5, 0.955,
|
||||
"YC-Bench | 1-Year Horizon",
|
||||
ha="center", va="center",
|
||||
fontsize=50, fontweight="700", color="white",
|
||||
fontfamily="Helvetica Neue", zorder=2,
|
||||
)
|
||||
# ── Common legend in header ─────────────────────────────────────────
|
||||
legend_items = [
|
||||
("Sonnet 4.6", BLUE, "-", 4.0, 0.95),
|
||||
("Gemini 3 Flash", ORANGE, "-", 4.0, 0.95),
|
||||
("GPT-5.2", "#22C55E", "-", 4.0, 0.95),
|
||||
("Greedy Bot", NAVY, "--", 3.5, 0.75),
|
||||
]
|
||||
legend_handles = []
|
||||
for lbl, clr, ls, lw, alpha in legend_items:
|
||||
line = plt.Line2D([0], [0], color=clr, linewidth=lw, linestyle=ls,
|
||||
alpha=alpha)
|
||||
legend_handles.append(line)
|
||||
legend_labels = [item[0] for item in legend_items]
|
||||
fig.legend(
|
||||
legend_handles, legend_labels,
|
||||
loc="center", bbox_to_anchor=(0.53, 0.855),
|
||||
ncol=4, fontsize=22, frameon=False,
|
||||
labelcolor=TEXT_CLR, handlelength=3.5, handletextpad=1.0,
|
||||
columnspacing=3.0,
|
||||
)
|
||||
|
||||
# Pre-render logo from SVG at high res (will composite after savefig)
|
||||
logo_img = load_logo_image(height_px=120)
|
||||
|
||||
for row, config in enumerate(CONFIGS):
|
||||
for col, seed in enumerate(SEEDS):
|
||||
ax = axes[row][col]
|
||||
ax.set_facecolor("white")
|
||||
for spine in ax.spines.values():
|
||||
spine.set_edgecolor("#d0d0d0")
|
||||
spine.set_linewidth(0.7)
|
||||
ax.set_facecolor(CARD_BG)
|
||||
|
||||
# Bankruptcy line
|
||||
ax.axhline(0, color="#ef4444", linewidth=0.8, linestyle="--", alpha=0.4)
|
||||
ax.axhline(250_000, color="#9ca3af", linewidth=0.5, linestyle=":", alpha=0.4)
|
||||
for spine in ax.spines.values():
|
||||
spine.set_edgecolor(GRID_CLR)
|
||||
spine.set_linewidth(1.2)
|
||||
|
||||
# Log scale on y-axis
|
||||
ax.set_yscale("log")
|
||||
|
||||
# Reference lines
|
||||
ax.axhline(250_000, color=MUTED, linewidth=0.8, linestyle=":", alpha=0.3, zorder=1)
|
||||
|
||||
cell_runs = [r for r in runs if r["config"] == config and r["seed"] == seed]
|
||||
|
||||
# Sort: bots first (background), then survivors desc, then bankrupt
|
||||
def sort_key(r):
|
||||
if r["model_key"] in BOT_KEYS: return (0, 0)
|
||||
if not r["bankrupt"]: return (1, -r["final"])
|
||||
return (2, 0)
|
||||
cell_runs.sort(key=sort_key)
|
||||
|
||||
for r in cell_runs:
|
||||
if not r["times"]:
|
||||
continue
|
||||
alpha = 0.35 if r["bankrupt"] else 1.0
|
||||
lw = 1.0 if r["bankrupt"] else 2.0
|
||||
is_bot = r["model_key"] in BOT_KEYS
|
||||
|
||||
if r["bankrupt"]:
|
||||
alpha, lw, ls = 0.4, 2.0, "-" if not is_bot else "--"
|
||||
elif is_bot:
|
||||
alpha, lw, ls = 0.75, 3.5, "--"
|
||||
else:
|
||||
alpha, lw, ls = 0.95, 3.0, "-"
|
||||
|
||||
val = r["final"]
|
||||
if r["bankrupt"]:
|
||||
lbl = f"{r['label']} — bankrupt"
|
||||
elif val >= 1e6:
|
||||
lbl = f"{r['label']} — ${val/1e6:.1f}M"
|
||||
else:
|
||||
val = r["final"]
|
||||
lbl = f"{r['label']} — ${val/1e6:.1f}M" if val >= 1e6 else f"{r['label']} — ${val/1e3:.0f}K"
|
||||
lbl = f"{r['label']} — ${val/1e3:.0f}K"
|
||||
|
||||
ax.plot(r["times"], r["balances"], color=r["color"],
|
||||
linewidth=lw, alpha=alpha, label=lbl, zorder=3)
|
||||
# Clamp balances for log scale (floor at $1K)
|
||||
plot_bals = [max(b, 1_000) for b in r["balances"]]
|
||||
|
||||
ax.plot(
|
||||
r["times"], plot_bals,
|
||||
color=r["color"], linewidth=lw, alpha=alpha,
|
||||
label=lbl, linestyle=ls,
|
||||
zorder=2 if is_bot else 3,
|
||||
)
|
||||
|
||||
if r["bankrupt"]:
|
||||
ax.scatter([r["times"][-1]], [r["balances"][-1]],
|
||||
color=r["color"], marker="x", s=50, linewidths=1.5, alpha=0.5, zorder=5)
|
||||
else:
|
||||
ax.scatter([r["times"][-1]], [r["balances"][-1]],
|
||||
color=r["color"], marker="*", s=100, zorder=5)
|
||||
ax.scatter(
|
||||
[r["times"][-1]], [max(r["balances"][-1], 1_000)],
|
||||
color=r["color"], marker="X", s=120,
|
||||
linewidths=2, alpha=0.6, zorder=5,
|
||||
edgecolors="white",
|
||||
)
|
||||
elif not is_bot:
|
||||
ax.scatter(
|
||||
[r["times"][-1]], [r["balances"][-1]],
|
||||
color=r["color"], marker="o", s=100, zorder=5,
|
||||
edgecolors="white", linewidths=2.5,
|
||||
)
|
||||
|
||||
# Title
|
||||
if row == 0:
|
||||
ax.set_title(f"Seed {seed}", fontsize=11, fontweight="500", color="#374151", pad=8)
|
||||
# No per-axis column title (seed labels placed via fig.text below)
|
||||
|
||||
# Row label
|
||||
if col == 0:
|
||||
ax.set_ylabel(f"{config.upper()}\n\nFunds", fontsize=10, color="#374151", fontweight="600")
|
||||
ax.set_ylabel("Funds ($)", fontsize=20, color=MUTED, fontweight="400", labelpad=10)
|
||||
ax.annotate(
|
||||
config.upper(),
|
||||
xy=(-0.22, 0.5), xycoords="axes fraction",
|
||||
fontsize=23, fontweight="800",
|
||||
color=DIFF_COLORS[config],
|
||||
ha="center", va="center", rotation=90,
|
||||
)
|
||||
|
||||
# Formatting
|
||||
# Axes formatting
|
||||
ax.xaxis.set_major_formatter(mdates.DateFormatter("%b"))
|
||||
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
|
||||
ax.tick_params(colors="#666", labelsize=7)
|
||||
ax.grid(axis="y", color="#f0f0f0", linewidth=0.5)
|
||||
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
|
||||
ax.tick_params(colors=MUTED, labelsize=18, length=5, width=0.8, pad=6)
|
||||
ax.grid(axis="y", color=GRID_CLR, linewidth=0.7, alpha=0.8)
|
||||
ax.grid(axis="x", color=GRID_CLR, linewidth=0.4, alpha=0.4)
|
||||
|
||||
ax.yaxis.set_major_formatter(
|
||||
mticker.FuncFormatter(
|
||||
lambda x, _: f"${x/1e6:.0f}M" if abs(x) >= 1e6
|
||||
else f"${x/1e3:.0f}K" if abs(x) >= 1e3
|
||||
lambda x, _: f"${x/1e6:.0f}M" if x >= 1e6
|
||||
else f"${x/1e3:.0f}K" if x >= 1e3
|
||||
else f"${x:.0f}"
|
||||
)
|
||||
)
|
||||
ax.yaxis.set_minor_formatter(mticker.NullFormatter())
|
||||
|
||||
legend = ax.legend(fontsize=7, loc="upper left", frameon=True,
|
||||
facecolor="white", edgecolor="#e5e7eb", framealpha=0.9)
|
||||
for text in legend.get_texts():
|
||||
text.set_color("#374151")
|
||||
# No per-cell legend (common legend in header)
|
||||
|
||||
plt.subplots_adjust(
|
||||
left=0.08, right=0.98, top=0.79, bottom=0.05,
|
||||
hspace=0.30, wspace=0.22,
|
||||
)
|
||||
|
||||
# Seed column headers just above the plot grid
|
||||
col_centers = [0.08 + (0.98 - 0.08) * (i + 0.5) / 3 for i in range(3)]
|
||||
for i, seed in enumerate(SEEDS):
|
||||
fig.text(
|
||||
col_centers[i], 0.80,
|
||||
f"Seed {seed}",
|
||||
ha="center", va="bottom",
|
||||
fontsize=26, fontweight="600", color=TEXT_CLR,
|
||||
)
|
||||
|
||||
# Footer
|
||||
fig.text(
|
||||
0.5, 0.01,
|
||||
"collinear.ai | YC-Bench: Long-Horizon Deterministic Benchmark for LLM Agents",
|
||||
ha="center", va="bottom",
|
||||
fontsize=18, fontweight="400", color=MUTED,
|
||||
fontstyle="italic",
|
||||
)
|
||||
|
||||
plt.tight_layout(rect=[0, 0, 1, 0.95])
|
||||
out = ROOT / "plots" / "sonnet_vs_gemini.png"
|
||||
out.parent.mkdir(parents=True, exist_ok=True)
|
||||
plt.savefig(out, dpi=180, bbox_inches="tight", facecolor="white")
|
||||
dpi = 150
|
||||
plt.savefig(out, dpi=dpi, facecolor=BG_COLOR, pad_inches=0)
|
||||
|
||||
# Composite SVG logo onto the navy header band
|
||||
if logo_img is not None:
|
||||
from PIL import Image
|
||||
plot_img = Image.open(out).convert("RGBA")
|
||||
img_w, img_h = plot_img.size
|
||||
# Header band is top 10% of image (no pad_inches)
|
||||
header_top = 0
|
||||
header_h = int(img_h * 0.10)
|
||||
# Scale logo to ~65% of header height
|
||||
target_h = int(header_h * 0.65)
|
||||
scale = target_h / logo_img.size[1]
|
||||
logo = logo_img.resize((int(logo_img.size[0] * scale), target_h), Image.LANCZOS)
|
||||
# Center vertically in the navy header band
|
||||
y_offset = header_top + (header_h - target_h) // 2
|
||||
x_offset = 70
|
||||
plot_img.paste(logo, (x_offset, y_offset), logo)
|
||||
plot_img.save(out)
|
||||
|
||||
print(f"\nSaved: {out}")
|
||||
|
||||
|
||||
|
|
|
|||