# yc-bench/scripts/plot_run.py

"""Plot all statistics from YC-Bench result JSON files.

Usage:
    uv run python scripts/plot_run.py results/yc_bench_result_medium_1_*.json
    uv run python scripts/plot_run.py results/some_result.json  # single run
"""

from __future__ import annotations

import json
import sys
from datetime import datetime
from pathlib import Path

import matplotlib

matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Per-run line/marker colours (hex). The plot functions index this list with
# ``i % len(COLORS)``, so the palette repeats when there are more runs than
# entries.
COLORS = [
    "#00d4aa",
    "#ff6b6b",
    "#4ecdc4",
    "#ffe66d",
    "#a29bfe",
    "#fd79a8",
    "#6c5ce7",
    "#00b894",
]
# Fixed colour per prestige domain; plot_prestige falls back to "gray" for
# any domain name not listed here.
DOMAIN_COLORS = {
    "research": "#3498db",
    "inference": "#9b59b6",
    "data_environment": "#1abc9c",
    "training": "#e67e22",
}


def load(path: str) -> dict:
    """Read one result file and return its parsed JSON content.

    Args:
        path: Location of a JSON result file.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # JSON text is UTF-8; be explicit instead of inheriting the platform's
    # locale encoding, which would mis-decode non-ASCII content on some hosts.
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def dt(iso: str) -> datetime:
    """Parse an ISO-8601 timestamp string into a ``datetime``.

    Args:
        iso: Timestamp text such as ``"2025-01-01T09:00:00"``; a trailing
            ``Z`` (UTC designator) is accepted as well.

    Returns:
        The parsed ``datetime``; it is timezone-aware only when the input
        carries an offset (or ``Z``).

    Raises:
        ValueError: If the text is not a valid ISO-8601 timestamp.
    """
    # datetime.fromisoformat only understands the "Z" suffix from Python 3.11
    # on; rewriting it to an explicit offset parses the same on older versions.
    if iso.endswith("Z"):
        iso = iso[:-1] + "+00:00"
    return datetime.fromisoformat(iso)


def short_name(data: dict, path: str) -> str:
    """Return a compact label for a run, used in legends and headings.

    Args:
        data: Parsed result document; only its ``"model"`` entry is read.
        path: Path of the result file, used when the model entry is unusable.

    Returns:
        The text after the last ``/`` of the model identifier when the
        identifier contains a ``/``. Otherwise the file stem with its first
        four ``_``-separated fields dropped (or the whole stem when it has
        fewer fields).
    """
    model = data.get("model")
    # A JSON ``null`` (or any non-string) model must not crash the ``in``
    # test below; treat it like a missing model and label by file name.
    if isinstance(model, str) and "/" in model:
        return model.rsplit("/", 1)[-1]
    return Path(path).stem.split("_", 4)[-1]


# ---------------------------------------------------------------------------
# Individual plot functions
# ---------------------------------------------------------------------------


def plot_funds(ax, runs):
    """Plot each run's funds balance over time on ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        runs: Sequence of ``(path, data)`` pairs, ``data`` being a parsed
            result document with a ``"time_series"`` mapping.

    Runs without funds samples are skipped. A dashed gray reference line is
    drawn at $200,000 (presumably the starting balance; print_summary uses
    the same figure as the initial funds in its fallback estimate).
    """
    for i, (path, data) in enumerate(runs):
        # Use .get like the other plotters so a result file that lacks the
        # funds series is skipped instead of aborting the whole figure.
        funds = data["time_series"].get("funds", [])
        if not funds:
            continue
        times = [dt(f["time"]) for f in funds]
        # Values are stored in cents; the axis is labelled in dollars.
        vals = [f["funds_cents"] / 100 for f in funds]
        ax.plot(
            times,
            vals,
            color=COLORS[i % len(COLORS)],
            linewidth=2,
            label=short_name(data, path),
        )
    ax.axhline(y=200000, color="gray", linestyle="--", alpha=0.3)
    ax.set_ylabel("Funds ($)")
    ax.set_title("Funds Over Time")


def plot_tasks_cumulative(ax, runs):
    """Draw cumulative step curves of finished tasks, split by outcome.

    Args:
        ax: Matplotlib axes to draw on.
        runs: Sequence of ``(path, data)`` pairs of parsed result documents.

    For every run, tasks that have a completion timestamp are ordered by
    that timestamp; successes get a solid curve and failures a dashed,
    fainter curve in the same run colour.
    """

    def finished(tasks, outcome):
        # Tasks whose "success" flag is exactly `outcome` and that carry a
        # completion time, ordered by that time.
        selected = [
            task
            for task in tasks
            if task.get("success") is outcome and task.get("completed_at")
        ]
        selected.sort(key=lambda task: task["completed_at"])
        return selected

    for idx, (path, data) in enumerate(runs):
        all_tasks = data["time_series"].get("tasks", [])
        succeeded = finished(all_tasks, True)
        failed = finished(all_tasks, False)
        run_color = COLORS[idx % len(COLORS)]
        run_label = short_name(data, path)
        curves = (
            (succeeded, "OK", {"linewidth": 2}),
            (failed, "fail", {"linewidth": 1.5, "linestyle": "--", "alpha": 0.6}),
        )
        for done, suffix, look in curves:
            if not done:
                continue
            ax.step(
                [dt(task["completed_at"]) for task in done],
                range(1, len(done) + 1),
                color=run_color,
                label=f"{run_label} {suffix}",
                where="post",
                **look,
            )
    ax.set_ylabel("Cumulative Tasks")
    ax.set_title("Task Completions (OK vs Fail)")


def plot_prestige(ax, runs):
    """Plot prestige level over time, one line per domain, for the first run.

    Args:
        ax: Matplotlib axes to draw on.
        runs: Sequence of ``(path, data)`` pairs of parsed result documents.

    Only ``runs[0]`` is drawn, to keep the panel readable. Nothing is drawn
    (and no labels are set) when there are no runs or no prestige samples.
    """
    if runs:
        path, data = runs[0]
    else:
        return
    samples = data["time_series"].get("prestige", [])
    if not samples:
        return

    # Bucket the samples per domain, keeping their original order.
    per_domain = {}
    for sample in samples:
        per_domain.setdefault(sample["domain"], []).append(sample)

    for domain in sorted(per_domain):
        points = per_domain[domain]
        ax.plot(
            [dt(point["time"]) for point in points],
            [point["level"] for point in points],
            color=DOMAIN_COLORS.get(domain, "gray"),
            linewidth=1.5,
            label=domain,
        )
    ax.set_ylabel("Prestige Level")
    ax.set_title(f"Prestige by Domain ({short_name(data, path)})")


def plot_trust(ax, runs):
    """Plot client trust level over time, one line per client, for the first run.

    Args:
        ax: Matplotlib axes to draw on.
        runs: Sequence of ``(path, data)`` pairs of parsed result documents.

    A client whose first sample has ``loyalty`` below -0.3 is drawn dashed
    and gets a ``*`` appended to its label (the title explains ``* = RAT``).
    Nothing is drawn when there are no runs or no trust samples.
    """
    if runs:
        path, data = runs[0]
    else:
        return
    samples = data["time_series"].get("client_trust", [])
    if not samples:
        return

    # Bucket the samples per client, keeping their original order.
    per_client = {}
    for sample in samples:
        per_client.setdefault(sample["client_name"], []).append(sample)

    for client in sorted(per_client):
        points = per_client[client]
        when = [dt(point["time"]) for point in points]
        trust_levels = [point["trust_level"] for point in points]
        flagged = points[0].get("loyalty", 0) < -0.3
        if flagged:
            style, marker = "--", "*"
        else:
            style, marker = "-", ""
        ax.plot(
            when,
            trust_levels,
            linewidth=1.5,
            linestyle=style,
            label=f"{client}{marker}",
        )
    ax.set_ylabel("Trust Level")
    ax.set_title(f"Client Trust ({short_name(data, path)}) (* = RAT)")


def plot_payroll(ax, runs):
    """Plot the total payroll paid per calendar month for every run.

    Args:
        ax: Matplotlib axes to draw on.
        runs: Sequence of ``(path, data)`` pairs of parsed result documents.

    Ledger entries of category ``"monthly_payroll"`` are summed (as absolute
    amounts) per ``YYYY-MM`` prefix of their timestamp and shown in dollars.
    Runs without payroll entries are skipped.
    """
    for idx, (path, data) in enumerate(runs):
        entries = data["time_series"].get("ledger", [])
        payroll_entries = [
            entry for entry in entries if entry["category"] == "monthly_payroll"
        ]
        if payroll_entries:
            # Month key is the first seven characters of the timestamp.
            per_month = {}
            for entry in payroll_entries:
                key = entry["time"][:7]
                per_month[key] = per_month.get(key, 0) + abs(entry["amount_cents"])
            ordered = sorted(per_month)
            ax.plot(
                [datetime.strptime(key, "%Y-%m") for key in ordered],
                [per_month[key] / 100 for key in ordered],
                color=COLORS[idx % len(COLORS)],
                linewidth=2,
                marker="o",
                markersize=3,
                label=short_name(data, path),
            )
    ax.set_ylabel("Monthly Payroll ($)")
    ax.set_title("Payroll Growth")


def plot_assignments(ax, runs):
    """Scatter the number of employees assigned to each completed task.

    Args:
        ax: Matplotlib axes to draw on.
        runs: Sequence of ``(path, data)`` pairs of parsed result documents.

    Only assignments with a completion timestamp are shown, positioned at
    that timestamp. A dashed green line at 4, labelled "efficient (4)", is
    always added.
    """
    for idx, (path, data) in enumerate(runs):
        done = [
            item
            for item in data["time_series"].get("assignments", [])
            if item.get("completed_at")
        ]
        if done:
            when = [dt(item["completed_at"]) for item in done]
            headcount = [item["num_assigned"] for item in done]
            ax.scatter(
                when,
                headcount,
                color=COLORS[idx % len(COLORS)],
                alpha=0.5,
                s=15,
                label=short_name(data, path),
            )
    ax.axhline(y=4, color="green", linestyle="--", alpha=0.3, label="efficient (4)")
    ax.set_ylabel("Employees Assigned")
    ax.set_title("Assignment Pattern Per Task")


def plot_tokens(ax, runs):
    """Plot prompt-token count per turn for every run.

    Args:
        ax: Matplotlib axes to draw on.
        runs: Sequence of ``(path, data)`` pairs of parsed result documents.

    A run is skipped when it has no transcript or when the first transcript
    entry has no (or a zero) ``prompt_tokens`` value. Missing values in later
    entries are plotted as 0.
    """
    for idx, (path, data) in enumerate(runs):
        entries = data.get("transcript", [])
        if entries and entries[0].get("prompt_tokens"):
            turn_numbers = [entry["turn"] for entry in entries]
            prompt_counts = [entry.get("prompt_tokens", 0) for entry in entries]
            ax.plot(
                turn_numbers,
                prompt_counts,
                color=COLORS[idx % len(COLORS)],
                linewidth=1,
                alpha=0.7,
                label=f"{short_name(data, path)} prompt",
            )
    ax.set_ylabel("Tokens")
    ax.set_title("Prompt Tokens Per Turn")
    ax.set_xlabel("Turn")


def plot_cost(ax, runs):
    """Plot the running total of API cost against turn number for every run.

    Args:
        ax: Matplotlib axes to draw on.
        runs: Sequence of ``(path, data)`` pairs of parsed result documents.

    Each transcript entry contributes its ``cost_usd`` (0 when absent).
    Runs with an empty or missing transcript are skipped.
    """
    for idx, (path, data) in enumerate(runs):
        entries = data.get("transcript", [])
        if entries:
            spent = 0
            running_totals = []
            for entry in entries:
                spent += entry.get("cost_usd", 0)
                running_totals.append(spent)
            turn_numbers = [entry["turn"] for entry in entries]
            ax.plot(
                turn_numbers,
                running_totals,
                color=COLORS[idx % len(COLORS)],
                linewidth=2,
                label=short_name(data, path),
            )
    ax.set_ylabel("Cumulative Cost ($)")
    ax.set_title("API Cost")
    ax.set_xlabel("Turn")


# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------


def print_summary(data, path):
    """Print a human-readable summary of one run to stdout.

    Args:
        data: Parsed result document; must contain a ``"time_series"``
            mapping. All other sections are optional.
        path: Path of the result file, used only to derive the heading via
            ``short_name``.

    The summary covers model/seed/termination info, final funds, revenue and
    payroll totals from the ledger, task outcome counts, average employees
    per assignment, final monthly payroll, flagged clients, token and cost
    totals, wall-clock duration and selected config values.
    """
    ts = data["time_series"]

    # Net signed amount per ledger category, in cents.
    ledger = ts.get("ledger", [])
    cats = {}
    for e in ledger:
        cats[e["category"]] = cats.get(e["category"], 0) + e["amount_cents"]

    revenue = cats.get("task_reward", 0)
    payroll = abs(cats.get("monthly_payroll", 0))

    tasks = ts.get("tasks", [])
    ok = sum(1 for t in tasks if t.get("success") is True)
    fail = sum(1 for t in tasks if t.get("success") is False)
    gated = sum(
        1 for t in tasks if t.get("success") is True and t.get("required_trust", 0) > 0
    )

    assignments = ts.get("assignments", [])
    avg_emp = (
        sum(a["num_assigned"] for a in assignments) / len(assignments)
        if assignments
        else 0
    )

    employees = ts.get("employees", [])
    final_payroll = sum(e["salary_cents"] for e in employees) / 100 if employees else 0

    clients = ts.get("clients", [])
    rats = [c for c in clients if c.get("is_rat")]

    transcript = data.get("transcript", [])
    total_prompt = sum(t.get("prompt_tokens", 0) for t in transcript)
    total_completion = sum(t.get("completion_tokens", 0) for t in transcript)

    # Prefer the recorded balance: the last funds sample reflects every ledger
    # effect, whereas "start + rewards - payroll" ignores any other category.
    # NOTE(review): assumes the funds series is chronological, as plot_funds
    # also does when drawing it as a line.
    funds = ts.get("funds", [])
    if funds:
        final_funds = funds[-1]["funds_cents"] / 100
    else:
        # Fallback estimate from an assumed $200,000 starting balance.
        final_funds = (200000 * 100 + revenue - payroll) / 100

    print(f"\n{'='*60}")
    print(f"  {short_name(data, path)}")
    print(f"{'='*60}")
    print(f"  Model:     {data.get('model', '?')}")
    print(f"  Seed:      {data.get('seed', '?')}")
    print(
        f"  Terminal:  {data.get('terminal_reason', '?')} at turn {data.get('turns_completed', '?')}"
    )
    print(f"  Final:     ${final_funds:,.0f}")
    print(f"  Revenue:   ${revenue/100:,.0f} | Payroll: ${payroll/100:,.0f}")
    print(f"  Tasks:     {ok} OK, {fail} fail ({gated} trust-gated)")
    print(f"  Avg emp:   {avg_emp:.1f} per task")
    print(f"  Payroll:   ${final_payroll:,.0f}/mo (final)")
    print(
        f"  RATs:      {len(rats)} — {', '.join(c['name'] for c in rats) if rats else 'none'}"
    )
    print(f"  Scratchpad: {'yes' if ts.get('scratchpad') else 'no'}")
    total_tokens = total_prompt + total_completion
    print(
        f"  Tokens:    {total_prompt:,} prompt + {total_completion:,} completion = {total_tokens:,} total"
    )
    print(f"  Cost:      ${data.get('total_cost_usd', 0):.2f}")

    started = data.get("started_at", "")
    ended = data.get("ended_at", "")
    if started and ended:
        try:
            t0 = datetime.fromisoformat(started)
            t1 = datetime.fromisoformat(ended)
            mins = (t1 - t0).total_seconds() / 60
        except (ValueError, TypeError):
            # Unparseable timestamps, or a naive/aware mix that cannot be
            # subtracted: still show the raw values, just without a duration.
            print(f"  Time:      {started[:19]} → {ended[:19]}")
        else:
            print(f"  Time:      {started[:19]} → {ended[:19]} ({mins:.1f} min)")
    else:
        print("  Time:      N/A")

    config = ts.get("config", {})
    if config:
        print(
            f"  Config:    salary_bump={config.get('salary_bump_pct')}, "
            f"trust_build={config.get('trust_build_rate')}, "
            f"rat_fraction={config.get('loyalty_rat_fraction')}, "
            f"fail_penalty={config.get('penalty_fail_funds_pct')}"
        )


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------


def main():
    """Command-line entry point: summarise runs and save the overview figure.

    Every command-line argument is treated as a result JSON path. A text
    summary is printed per run, then a 4x2 grid of panels is rendered and
    written to ``plots/run_analysis.png`` (the ``plots`` directory is created
    if needed). Exits with status 1 when no paths are given.
    """
    if len(sys.argv) < 2:
        print("Usage: uv run python scripts/plot_run.py results/*.json")
        sys.exit(1)

    paths = sys.argv[1:]
    runs = [(p, load(p)) for p in paths]

    for path, data in runs:
        print_summary(data, path)

    fig, axes = plt.subplots(4, 2, figsize=(16, 20))
    fig.suptitle(f"YC-Bench — {len(runs)} run(s)", fontsize=14, fontweight="bold")

    plot_funds(axes[0, 0], runs)
    plot_tasks_cumulative(axes[0, 1], runs)
    plot_prestige(axes[1, 0], runs)
    plot_trust(axes[1, 1], runs)
    plot_payroll(axes[2, 0], runs)
    plot_assignments(axes[2, 1], runs)
    plot_tokens(axes[3, 0], runs)
    plot_cost(axes[3, 1], runs)

    for ax in axes.flat:
        # Some panels can end up with no labelled artists (e.g. a run without
        # prestige or trust data); legend() would only emit a warning there.
        handles, _labels = ax.get_legend_handles_labels()
        if handles:
            ax.legend(fontsize=7, loc="best")
        ax.grid(True, alpha=0.2)
        ax.tick_params(labelsize=8)
        # Panels whose x axis is the turn number are labelled "Turn"; all
        # others are time-based and get month ticks.
        if ax.get_xlabel() != "Turn":
            ax.xaxis.set_major_formatter(mdates.DateFormatter("%b"))
            ax.xaxis.set_major_locator(mdates.MonthLocator())

    plt.tight_layout()

    Path("plots").mkdir(exist_ok=True)
    out = "plots/run_analysis.png"
    plt.savefig(out, dpi=150, bbox_inches="tight")
    # Release the figure's memory once it has been written.
    plt.close(fig)
    print(f"\nPlot saved to {out}")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()