yc-bench/scripts/plot_run.py
2026-03-20 05:19:56 -07:00

294 lines
11 KiB
Python

"""Plot all statistics from YC-Bench result JSON files.
Usage:
uv run python scripts/plot_run.py results/yc_bench_result_medium_1_*.json
uv run python scripts/plot_run.py results/some_result.json # single run
"""
from __future__ import annotations
import json
import sys
from datetime import datetime
from pathlib import Path
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
# Per-run line colours; the plot functions index this as COLORS[i % len(COLORS)]
# so the palette wraps around when there are more runs than colours.
COLORS = [
    '#00d4aa',
    '#ff6b6b',
    '#4ecdc4',
    '#ffe66d',
    '#a29bfe',
    '#fd79a8',
    '#6c5ce7',
    '#00b894',
]

# Line colour for each prestige domain (domains not listed here are drawn
# in gray by plot_prestige).
DOMAIN_COLORS = dict(
    research="#3498db",
    inference="#9b59b6",
    data_environment="#1abc9c",
    training="#e67e22",
)
def load(path: str) -> dict:
    """Read one result file and return its parsed JSON content.

    Args:
        path: Location of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (the rest of this
        script treats it as a dict).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 rather than with the platform default
    # encoding, so non-ASCII text in a result file loads the same everywhere.
    with open(path, encoding="utf-8") as f:
        return json.load(f)
def dt(iso: str) -> datetime:
    """Parse an ISO 8601 timestamp string into a ``datetime``.

    Args:
        iso: Timestamp text such as ``"2025-01-31T09:00:00"``. A trailing
            ``"Z"`` (UTC designator) is accepted.

    Returns:
        The parsed ``datetime``; timezone-aware only if the text carried an
        offset or ``"Z"``.

    Raises:
        ValueError: If the text is not a valid ISO 8601 timestamp.
    """
    # datetime.fromisoformat() only understands a trailing "Z" from
    # Python 3.11 on; rewriting it as an explicit offset gives the same
    # result on every supported interpreter.
    if iso.endswith("Z"):
        iso = iso[:-1] + "+00:00"
    return datetime.fromisoformat(iso)
def short_name(data: dict, path: str) -> str:
    """Return a compact label for a run, used in legends and headings.

    Args:
        data: Loaded result document; its optional ``"model"`` entry is read.
        path: Path the document was loaded from.

    Returns:
        The part of the model identifier after its last ``"/"`` when the
        model is a string containing a slash; otherwise the file stem with
        its first four underscore-separated fields dropped (or the whole
        stem when it has fewer fields).
    """
    model = data.get("model")
    # A missing or null model (or any non-string value) falls through to the
    # file-name label instead of failing on the "/" membership test.
    if isinstance(model, str) and "/" in model:
        return model.split("/")[-1]
    return Path(path).stem.split("_", 4)[-1]
# ---------------------------------------------------------------------------
# Individual plot functions
# ---------------------------------------------------------------------------
def plot_funds(ax, runs):
    """Draw one funds-over-time line per run on ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        runs: Sequence of ``(path, data)`` pairs, ``data`` being a loaded
            result document with a ``"time_series"`` entry.
    """
    for i, (path, data) in enumerate(runs):
        # Treat an absent "funds" series like an empty one, matching how the
        # other plot functions read their series, instead of raising KeyError.
        funds = data["time_series"].get("funds", [])
        if not funds:
            continue
        times = [dt(f["time"]) for f in funds]
        vals = [f["funds_cents"] / 100 for f in funds]  # cents -> dollars
        ax.plot(
            times,
            vals,
            color=COLORS[i % len(COLORS)],
            linewidth=2,
            label=short_name(data, path),
        )
    # Faint dashed reference line at 200,000 dollars, drawn even when no run
    # contributed a line.
    ax.axhline(y=200000, color='gray', linestyle='--', alpha=0.3)
    ax.set_ylabel("Funds ($)")
    ax.set_title("Funds Over Time")
def plot_tasks_cumulative(ax, runs):
    """Draw cumulative step curves of successful and failed tasks per run.

    Only tasks with a truthy ``"completed_at"`` are counted. Successes are
    drawn as a solid line, failures as a thinner, semi-transparent dashed
    line in the same colour.

    Args:
        ax: Matplotlib axes to draw on.
        runs: Sequence of ``(path, data)`` pairs of loaded result documents.
    """

    def finished(tasks, outcome):
        # Completed tasks whose "success" is exactly `outcome`, oldest first
        # (ordered by the raw "completed_at" value).
        picked = [
            t for t in tasks
            if t.get("success") is outcome and t.get("completed_at")
        ]
        picked.sort(key=lambda t: t["completed_at"])
        return picked

    for idx, (path, data) in enumerate(runs):
        all_tasks = data["time_series"].get("tasks", [])
        succeeded = finished(all_tasks, True)
        failed = finished(all_tasks, False)
        colour = COLORS[idx % len(COLORS)]
        run_label = short_name(data, path)
        if succeeded:
            ax.step(
                [dt(t["completed_at"]) for t in succeeded],
                range(1, len(succeeded) + 1),
                color=colour,
                linewidth=2,
                label=f"{run_label} OK",
                where='post',
            )
        if failed:
            ax.step(
                [dt(t["completed_at"]) for t in failed],
                range(1, len(failed) + 1),
                color=colour,
                linewidth=1.5,
                linestyle='--',
                label=f"{run_label} fail",
                where='post',
                alpha=0.6,
            )
    ax.set_ylabel("Cumulative Tasks")
    ax.set_title("Task Completions (OK vs Fail)")
def plot_prestige(ax, runs):
    """Draw prestige level over time, one line per domain, for the first run.

    Only ``runs[0]`` is plotted to keep the panel readable. Nothing is drawn
    (and no labels are set) when there are no runs or the first run has no
    prestige samples.

    Args:
        ax: Matplotlib axes to draw on.
        runs: Sequence of ``(path, data)`` pairs of loaded result documents.
    """
    if not runs:
        return
    path, data = runs[0]
    samples = data["time_series"].get("prestige", [])
    if not samples:
        return

    # Bucket the samples by domain, keeping their original order in each bucket.
    by_domain = {}
    for sample in samples:
        by_domain.setdefault(sample["domain"], []).append(sample)

    for domain in sorted(by_domain):
        series = by_domain[domain]
        xs = [dt(s["time"]) for s in series]
        ys = [s["level"] for s in series]
        ax.plot(
            xs,
            ys,
            color=DOMAIN_COLORS.get(domain, 'gray'),
            linewidth=1.5,
            label=domain,
        )
    ax.set_ylabel("Prestige Level")
    ax.set_title(f"Prestige by Domain ({short_name(data, path)})")
def plot_trust(ax, runs):
    """Draw client trust level over time, one line per client, for the first run.

    A client whose first sample has ``"loyalty"`` below -0.3 (0 is assumed
    when the entry is absent) is drawn dashed and gets a ``*`` appended to
    its legend label; the title explains the star as "RAT". Nothing is drawn
    when there are no runs or the first run has no trust samples.

    Args:
        ax: Matplotlib axes to draw on.
        runs: Sequence of ``(path, data)`` pairs of loaded result documents.
    """
    if not runs:
        return
    path, data = runs[0]
    samples = data["time_series"].get("client_trust", [])
    if not samples:
        return

    # Bucket the samples by client, keeping their original order in each bucket.
    by_client = {}
    for sample in samples:
        by_client.setdefault(sample["client_name"], []).append(sample)

    for client in sorted(by_client):
        series = by_client[client]
        xs = [dt(s["time"]) for s in series]
        ys = [s["trust_level"] for s in series]
        if series[0].get("loyalty", 0) < -0.3:
            dash, marker = '--', '*'
        else:
            dash, marker = '-', ''
        ax.plot(xs, ys, linewidth=1.5, linestyle=dash, label=f"{client}{marker}")
    ax.set_ylabel("Trust Level")
    ax.set_title(f"Client Trust ({short_name(data, path)}) (* = RAT)")
def plot_payroll(ax, runs):
    """Draw total payroll per calendar month, one line per run.

    Ledger entries whose category is ``"monthly_payroll"`` are summed by the
    first seven characters of their ``"time"`` string (``YYYY-MM``), using
    the absolute value of each amount. Runs without payroll entries are
    skipped.

    Args:
        ax: Matplotlib axes to draw on.
        runs: Sequence of ``(path, data)`` pairs of loaded result documents.
    """
    for idx, (path, data) in enumerate(runs):
        payroll_entries = [
            entry
            for entry in data["time_series"].get("ledger", [])
            if entry["category"] == "monthly_payroll"
        ]
        if not payroll_entries:
            continue

        # Total cents paid per "YYYY-MM" key.
        per_month = {}
        for entry in payroll_entries:
            key = entry["time"][:7]
            per_month[key] = per_month.get(key, 0) + abs(entry["amount_cents"])

        # Keys are unique, so sorting the items orders them by month key.
        ordered = sorted(per_month.items())
        xs = [datetime.strptime(month, "%Y-%m") for month, _ in ordered]
        ys = [cents / 100 for _, cents in ordered]  # cents -> dollars
        ax.plot(
            xs,
            ys,
            color=COLORS[idx % len(COLORS)],
            linewidth=2,
            marker='o',
            markersize=3,
            label=short_name(data, path),
        )
    ax.set_ylabel("Monthly Payroll ($)")
    ax.set_title("Payroll Growth")
def plot_assignments(ax, runs):
    """Scatter the number of employees assigned to each completed task.

    One point per assignment record with a truthy ``"completed_at"``,
    positioned at its completion time. A dashed green reference line marks
    4 employees (labelled "efficient (4)").

    Args:
        ax: Matplotlib axes to draw on.
        runs: Sequence of ``(path, data)`` pairs of loaded result documents.
    """
    for idx, (path, data) in enumerate(runs):
        done = [
            record
            for record in data["time_series"].get("assignments", [])
            if record.get("completed_at")
        ]
        if done:
            ax.scatter(
                [dt(record["completed_at"]) for record in done],
                [record["num_assigned"] for record in done],
                color=COLORS[idx % len(COLORS)],
                alpha=0.5,
                s=15,
                label=short_name(data, path),
            )
    ax.axhline(y=4, color='green', linestyle='--', alpha=0.3, label='efficient (4)')
    ax.set_ylabel("Employees Assigned")
    ax.set_title("Assignment Pattern Per Task")
def plot_tokens(ax, runs):
    """Draw prompt-token count per turn, one line per run.

    A run is skipped when it has no transcript or when the first transcript
    entry has no (or a zero) ``"prompt_tokens"`` value. Within a plotted run,
    entries lacking the value count as 0.

    Args:
        ax: Matplotlib axes to draw on.
        runs: Sequence of ``(path, data)`` pairs of loaded result documents.
    """
    for idx, (path, data) in enumerate(runs):
        entries = data.get("transcript", [])
        if entries and entries[0].get("prompt_tokens"):
            xs = [entry["turn"] for entry in entries]
            ys = [entry.get("prompt_tokens", 0) for entry in entries]
            colour = COLORS[idx % len(COLORS)]
            ax.plot(
                xs,
                ys,
                color=colour,
                linewidth=1,
                alpha=0.7,
                label=f"{short_name(data, path)} prompt",
            )
    ax.set_ylabel("Tokens")
    ax.set_title("Prompt Tokens Per Turn")
    ax.set_xlabel("Turn")
def plot_cost(ax, runs):
    """Draw cumulative API cost against turn number, one line per run.

    Each transcript entry contributes its ``"cost_usd"`` (0 when absent) to
    a running total. Runs with an empty or missing transcript are skipped.

    Args:
        ax: Matplotlib axes to draw on.
        runs: Sequence of ``(path, data)`` pairs of loaded result documents.
    """
    for idx, (path, data) in enumerate(runs):
        entries = data.get("transcript", [])
        if not entries:
            continue

        # Running total of cost after each turn.
        totals = []
        spent = 0
        for entry in entries:
            spent += entry.get("cost_usd", 0)
            totals.append(spent)

        xs = [entry["turn"] for entry in entries]
        ax.plot(
            xs,
            totals,
            color=COLORS[idx % len(COLORS)],
            linewidth=2,
            label=short_name(data, path),
        )
    ax.set_ylabel("Cumulative Cost ($)")
    ax.set_title("API Cost")
    ax.set_xlabel("Turn")
# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
def print_summary(data, path):
    """Print a text summary of one run to stdout.

    Covers model/seed/termination, funds, revenue and payroll totals, task
    outcomes, average team size, flagged clients, token and cost totals,
    wall-clock duration and selected config values.

    Args:
        data: Loaded result document. ``data["time_series"]`` must exist;
            the series inside it and the top-level metadata are optional.
        path: Path the document was loaded from (used for the heading).

    Raises:
        KeyError: If ``"time_series"`` is missing, or a ledger, assignment,
            employee or flagged-client record lacks a field read here.
    """
    ts = data["time_series"]

    # Net amount per ledger category, in cents.
    cats = {}
    for e in ts.get("ledger", []):
        cats[e["category"]] = cats.get(e["category"], 0) + e["amount_cents"]
    revenue = cats.get("task_reward", 0)
    payroll = abs(cats.get("monthly_payroll", 0))
    # NOTE(review): this figure assumes a 200,000-dollar starting balance and
    # counts only task rewards and payroll; any other ledger category is
    # ignored. Confirm that matches the simulator's actual final funds.
    final_funds = (200000 * 100 + revenue - payroll) / 100

    tasks = ts.get("tasks", [])
    ok = sum(1 for t in tasks if t.get("success") is True)
    fail = sum(1 for t in tasks if t.get("success") is False)
    gated = sum(
        1 for t in tasks
        if t.get("success") is True and t.get("required_trust", 0) > 0
    )

    assignments = ts.get("assignments", [])
    avg_emp = (
        sum(a["num_assigned"] for a in assignments) / len(assignments)
        if assignments else 0
    )

    employees = ts.get("employees", [])
    final_payroll = sum(e["salary_cents"] for e in employees) / 100 if employees else 0

    rats = [c for c in ts.get("clients", []) if c.get("is_rat")]
    rat_names = ', '.join(c['name'] for c in rats) if rats else 'none'

    # "or 0" lets a null token count or cost in the JSON count as zero
    # instead of breaking the sum or the number formatting.
    transcript = data.get("transcript", [])
    total_prompt = sum(t.get("prompt_tokens") or 0 for t in transcript)
    total_completion = sum(t.get("completion_tokens") or 0 for t in transcript)
    total_tokens = total_prompt + total_completion
    total_cost = data.get('total_cost_usd') or 0

    print(f"\n{'='*60}")
    print(f" {short_name(data, path)}")
    print(f"{'='*60}")
    print(f" Model: {data.get('model', '?')}")
    print(f" Seed: {data.get('seed', '?')}")
    print(f" Terminal: {data.get('terminal_reason', '?')} at turn {data.get('turns_completed', '?')}")
    print(f" Final: ${final_funds:,.0f}")
    print(f" Revenue: ${revenue/100:,.0f} | Payroll: ${payroll/100:,.0f}")
    print(f" Tasks: {ok} OK, {fail} fail ({gated} trust-gated)")
    print(f" Avg emp: {avg_emp:.1f} per task")
    print(f" Payroll: ${final_payroll:,.0f}/mo (final)")
    # Count and names are separated so the line reads "2 (A, B)" / "0 (none)"
    # rather than running the number straight into the first name.
    print(f" RATs: {len(rats)} ({rat_names})")
    print(f" Scratchpad: {'yes' if ts.get('scratchpad') else 'no'}")
    print(f" Tokens: {total_prompt:,} prompt + {total_completion:,} completion = {total_tokens:,} total")
    print(f" Cost: ${total_cost:.2f}")

    started = data.get('started_at', '')
    ended = data.get('ended_at', '')
    if started and ended:
        # Timestamps are cut to 19 characters (YYYY-MM-DDTHH:MM:SS) for display.
        span = f"{started[:19]} -> {ended[:19]}"
        try:
            t0 = datetime.fromisoformat(started)
            t1 = datetime.fromisoformat(ended)
            mins = (t1 - t0).total_seconds() / 60
        except (ValueError, TypeError):
            # Unparseable text, or one naive and one timezone-aware value:
            # still show the raw timestamps, just without a duration.
            print(f" Time: {span}")
        else:
            print(f" Time: {span} ({mins:.1f} min)")
    else:
        print(" Time: N/A")

    config = ts.get("config", {})
    if config:
        print(f" Config: salary_bump={config.get('salary_bump_pct')}, "
              f"trust_build={config.get('trust_build_rate')}, "
              f"rat_fraction={config.get('loyalty_rat_fraction')}, "
              f"fail_penalty={config.get('penalty_fail_funds_pct')}")
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Summarise and plot every result file named on the command line.

    Prints a text summary per run, draws a 4x2 grid of panels into one
    figure and writes it to ``plots/run_analysis.png`` (the ``plots``
    directory is created if needed). Exits with status 1 and a usage message
    when no file is given.
    """
    if len(sys.argv) < 2:
        print("Usage: uv run python scripts/plot_run.py results/*.json")
        sys.exit(1)

    runs = [(p, load(p)) for p in sys.argv[1:]]
    for path, data in runs:
        print_summary(data, path)

    fig, axes = plt.subplots(4, 2, figsize=(16, 20))
    fig.suptitle(f"YC-Bench — {len(runs)} run(s)", fontsize=14, fontweight='bold')
    plot_funds(axes[0, 0], runs)
    plot_tasks_cumulative(axes[0, 1], runs)
    plot_prestige(axes[1, 0], runs)
    plot_trust(axes[1, 1], runs)
    plot_payroll(axes[2, 0], runs)
    plot_assignments(axes[2, 1], runs)
    plot_tokens(axes[3, 0], runs)
    plot_cost(axes[3, 1], runs)

    for ax in axes.flat:
        # A panel may have drawn nothing labelled (e.g. a run without that
        # series); asking for a legend then only produces a matplotlib warning.
        handles, _ = ax.get_legend_handles_labels()
        if handles:
            ax.legend(fontsize=7, loc='best')
        ax.grid(True, alpha=0.2)
        ax.tick_params(labelsize=8)
        # Panels whose x label is "Turn" have a numeric x axis; all others
        # are time-based and get month-name ticks.
        if ax.get_xlabel() != "Turn":
            ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
            ax.xaxis.set_major_locator(mdates.MonthLocator())

    plt.tight_layout()
    Path("plots").mkdir(exist_ok=True)
    out = "plots/run_analysis.png"
    fig.savefig(out, dpi=150, bbox_inches='tight')
    # Release the figure once it is on disk.
    plt.close(fig)
    print(f"\nPlot saved to {out}")


if __name__ == "__main__":
    main()