"""Plot all statistics from YC-Bench result JSON files. Usage: uv run python scripts/plot_run.py results/yc_bench_result_medium_1_*.json uv run python scripts/plot_run.py results/some_result.json # single run """ from __future__ import annotations import json import sys from datetime import datetime from pathlib import Path import matplotlib matplotlib.use("Agg") import matplotlib.pyplot as plt import matplotlib.dates as mdates COLORS = ['#00d4aa', '#ff6b6b', '#4ecdc4', '#ffe66d', '#a29bfe', '#fd79a8', '#6c5ce7', '#00b894'] DOMAIN_COLORS = { "research": "#3498db", "inference": "#9b59b6", "data_environment": "#1abc9c", "training": "#e67e22", } def load(path: str) -> dict: with open(path) as f: return json.load(f) def dt(iso: str) -> datetime: return datetime.fromisoformat(iso) def short_name(data: dict, path: str) -> str: model = data.get("model", "") if "/" in model: return model.split("/")[-1] return Path(path).stem.split("_", 4)[-1] # --------------------------------------------------------------------------- # Individual plot functions # --------------------------------------------------------------------------- def plot_funds(ax, runs): for i, (path, data) in enumerate(runs): funds = data["time_series"]["funds"] if not funds: continue times = [dt(f["time"]) for f in funds] vals = [f["funds_cents"] / 100 for f in funds] ax.plot(times, vals, color=COLORS[i % len(COLORS)], linewidth=2, label=short_name(data, path)) ax.axhline(y=200000, color='gray', linestyle='--', alpha=0.3) ax.set_ylabel("Funds ($)") ax.set_title("Funds Over Time") def plot_tasks_cumulative(ax, runs): for i, (path, data) in enumerate(runs): tasks = data["time_series"].get("tasks", []) ok = sorted([t for t in tasks if t.get("success") is True and t.get("completed_at")], key=lambda t: t["completed_at"]) fail = sorted([t for t in tasks if t.get("success") is False and t.get("completed_at")], key=lambda t: t["completed_at"]) color = COLORS[i % len(COLORS)] name = short_name(data, path) if ok: ax.step([dt(t["completed_at"]) for t in ok], range(1, len(ok)+1), color=color, linewidth=2, label=f"{name} OK", where='post') if fail: ax.step([dt(t["completed_at"]) for t in fail], range(1, len(fail)+1), color=color, linewidth=1.5, linestyle='--', label=f"{name} fail", where='post', alpha=0.6) ax.set_ylabel("Cumulative Tasks") ax.set_title("Task Completions (OK vs Fail)") def plot_prestige(ax, runs): # Only plot first run's prestige to avoid clutter if not runs: return path, data = runs[0] prestige = data["time_series"].get("prestige", []) if not prestige: return domains = sorted(set(p["domain"] for p in prestige)) for domain in domains: pts = [p for p in prestige if p["domain"] == domain] times = [dt(p["time"]) for p in pts] levels = [p["level"] for p in pts] ax.plot(times, levels, color=DOMAIN_COLORS.get(domain, 'gray'), linewidth=1.5, label=domain) ax.set_ylabel("Prestige Level") ax.set_title(f"Prestige by Domain ({short_name(data, path)})") def plot_trust(ax, runs): if not runs: return path, data = runs[0] trust = data["time_series"].get("client_trust", []) if not trust: return clients = sorted(set(t["client_name"] for t in trust)) for client in clients: pts = [t for t in trust if t["client_name"] == client] times = [dt(t["time"]) for t in pts] levels = [t["trust_level"] for t in pts] is_rat = pts[0].get("loyalty", 0) < -0.3 ax.plot(times, levels, linewidth=1.5, linestyle='--' if is_rat else '-', label=f"{client}{'*' if is_rat else ''}") ax.set_ylabel("Trust Level") ax.set_title(f"Client Trust ({short_name(data, path)}) (* = RAT)") def plot_payroll(ax, runs): for i, (path, data) in enumerate(runs): ledger = data["time_series"].get("ledger", []) payrolls = [e for e in ledger if e["category"] == "monthly_payroll"] if not payrolls: continue # Group by month monthly = {} for p in payrolls: m = p["time"][:7] monthly[m] = monthly.get(m, 0) + abs(p["amount_cents"]) months = sorted(monthly.keys()) times = [datetime.strptime(m, "%Y-%m") for m in months] amounts = [monthly[m] / 100 for m in months] ax.plot(times, amounts, color=COLORS[i % len(COLORS)], linewidth=2, marker='o', markersize=3, label=short_name(data, path)) ax.set_ylabel("Monthly Payroll ($)") ax.set_title("Payroll Growth") def plot_assignments(ax, runs): for i, (path, data) in enumerate(runs): assignments = data["time_series"].get("assignments", []) completed = [a for a in assignments if a.get("completed_at")] if not completed: continue times = [dt(a["completed_at"]) for a in completed] counts = [a["num_assigned"] for a in completed] ax.scatter(times, counts, color=COLORS[i % len(COLORS)], alpha=0.5, s=15, label=short_name(data, path)) ax.axhline(y=4, color='green', linestyle='--', alpha=0.3, label='efficient (4)') ax.set_ylabel("Employees Assigned") ax.set_title("Assignment Pattern Per Task") def plot_tokens(ax, runs): for i, (path, data) in enumerate(runs): transcript = data.get("transcript", []) if not transcript or not transcript[0].get("prompt_tokens"): continue turns = [t["turn"] for t in transcript] prompt = [t.get("prompt_tokens", 0) for t in transcript] color = COLORS[i % len(COLORS)] ax.plot(turns, prompt, color=color, linewidth=1, alpha=0.7, label=f"{short_name(data, path)} prompt") ax.set_ylabel("Tokens") ax.set_title("Prompt Tokens Per Turn") ax.set_xlabel("Turn") def plot_cost(ax, runs): for i, (path, data) in enumerate(runs): transcript = data.get("transcript", []) if not transcript: continue costs = [t.get("cost_usd", 0) for t in transcript] cumulative = [] running = 0 for c in costs: running += c cumulative.append(running) turns = [t["turn"] for t in transcript] ax.plot(turns, cumulative, color=COLORS[i % len(COLORS)], linewidth=2, label=short_name(data, path)) ax.set_ylabel("Cumulative Cost ($)") ax.set_title("API Cost") ax.set_xlabel("Turn") # --------------------------------------------------------------------------- # Summary # --------------------------------------------------------------------------- def print_summary(data, path): ts = data["time_series"] ledger = ts.get("ledger", []) cats = {} for e in ledger: cats[e["category"]] = cats.get(e["category"], 0) + e["amount_cents"] revenue = cats.get("task_reward", 0) payroll = abs(cats.get("monthly_payroll", 0)) tasks = ts.get("tasks", []) ok = sum(1 for t in tasks if t.get("success") is True) fail = sum(1 for t in tasks if t.get("success") is False) gated = sum(1 for t in tasks if t.get("success") is True and t.get("required_trust", 0) > 0) assignments = ts.get("assignments", []) avg_emp = sum(a["num_assigned"] for a in assignments) / len(assignments) if assignments else 0 employees = ts.get("employees", []) final_payroll = sum(e["salary_cents"] for e in employees) / 100 if employees else 0 clients = ts.get("clients", []) rats = [c for c in clients if c.get("is_rat")] transcript = data.get("transcript", []) total_prompt = sum(t.get("prompt_tokens", 0) for t in transcript) total_completion = sum(t.get("completion_tokens", 0) for t in transcript) final_funds = (200000 * 100 + revenue - payroll) / 100 print(f"\n{'='*60}") print(f" {short_name(data, path)}") print(f"{'='*60}") print(f" Model: {data.get('model', '?')}") print(f" Seed: {data.get('seed', '?')}") print(f" Terminal: {data.get('terminal_reason', '?')} at turn {data.get('turns_completed', '?')}") print(f" Final: ${final_funds:,.0f}") print(f" Revenue: ${revenue/100:,.0f} | Payroll: ${payroll/100:,.0f}") print(f" Tasks: {ok} OK, {fail} fail ({gated} trust-gated)") print(f" Avg emp: {avg_emp:.1f} per task") print(f" Payroll: ${final_payroll:,.0f}/mo (final)") print(f" RATs: {len(rats)} — {', '.join(c['name'] for c in rats) if rats else 'none'}") print(f" Scratchpad: {'yes' if ts.get('scratchpad') else 'no'}") total_tokens = total_prompt + total_completion print(f" Tokens: {total_prompt:,} prompt + {total_completion:,} completion = {total_tokens:,} total") print(f" Cost: ${data.get('total_cost_usd', 0):.2f}") started = data.get('started_at', '') ended = data.get('ended_at', '') if started and ended: try: t0 = datetime.fromisoformat(started) t1 = datetime.fromisoformat(ended) duration = t1 - t0 mins = duration.total_seconds() / 60 print(f" Time: {started[:19]} → {ended[:19]} ({mins:.1f} min)") except Exception: print(f" Time: {started[:19]} → {ended[:19]}") else: print(f" Time: N/A") config = ts.get("config", {}) if config: print(f" Config: salary_bump={config.get('salary_bump_pct')}, " f"trust_build={config.get('trust_build_rate')}, " f"rat_fraction={config.get('loyalty_rat_fraction')}, " f"fail_penalty={config.get('penalty_fail_funds_pct')}") # --------------------------------------------------------------------------- # Main # --------------------------------------------------------------------------- def main(): if len(sys.argv) < 2: print("Usage: uv run python scripts/plot_run.py results/*.json") sys.exit(1) paths = sys.argv[1:] runs = [(p, load(p)) for p in paths] for path, data in runs: print_summary(data, path) fig, axes = plt.subplots(4, 2, figsize=(16, 20)) fig.suptitle(f"YC-Bench — {len(runs)} run(s)", fontsize=14, fontweight='bold') plot_funds(axes[0, 0], runs) plot_tasks_cumulative(axes[0, 1], runs) plot_prestige(axes[1, 0], runs) plot_trust(axes[1, 1], runs) plot_payroll(axes[2, 0], runs) plot_assignments(axes[2, 1], runs) plot_tokens(axes[3, 0], runs) plot_cost(axes[3, 1], runs) for ax in axes.flat: ax.legend(fontsize=7, loc='best') ax.grid(True, alpha=0.2) ax.tick_params(labelsize=8) if ax.get_xlabel() != "Turn": try: ax.xaxis.set_major_formatter(mdates.DateFormatter('%b')) ax.xaxis.set_major_locator(mdates.MonthLocator()) except Exception: pass plt.tight_layout() Path("plots").mkdir(exist_ok=True) out = "plots/run_analysis.png" plt.savefig(out, dpi=150, bbox_inches='tight') print(f"\nPlot saved to {out}") if __name__ == "__main__": main()