yc-bench/scripts/plot_run.py

"""Plot a benchmark run: funds over time, prestige evolution, task outcomes."""
import os
import sys
from decimal import Decimal
from pathlib import Path

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.dates as mdates
import numpy as np

os.environ.setdefault("DATABASE_URL", "postgresql://postgres:postgres@localhost:5432/yc_bench")
sys.path.insert(0, str(Path(__file__).parent))

from src.bench.db.session import build_engine, build_session_factory, session_scope
from src.bench.db.models.ledger import LedgerEntry, LedgerCategory
from src.bench.db.models.task import Task, TaskRequirement, TaskStatus
from src.bench.db.models.company import CompanyPrestige

engine = build_engine()
factory = build_session_factory(engine)

DOMAIN_COLORS = {
    "research":         "#3498db",
    "inference":        "#9b59b6",
    "data_environment": "#1abc9c",
    "training":         "#e67e22",
}

with session_scope(factory) as db:
    # --- Ledger: reconstruct running balance ---
    entries = (
        db.query(LedgerEntry)
        .order_by(LedgerEntry.occurred_at)
        .all()
    )
    initial_funds = 25_000_000
    times, balances, categories = [], [], []
    running = initial_funds
    for e in entries:
        running += int(e.amount_cents)
        times.append(e.occurred_at)
        balances.append(running / 100)
        categories.append(e.category)

    # --- Tasks ---
    tasks = (
        db.query(Task)
        .filter(Task.completed_at.isnot(None))
        .order_by(Task.completed_at)
        .all()
    )
    task_times, task_rewards, task_success, task_prestige = [], [], [], []
    for t in tasks:
        task_times.append(t.completed_at)
        task_rewards.append(int(t.reward_funds_cents) / 100)
        task_success.append(t.status == TaskStatus.COMPLETED_SUCCESS)
        task_prestige.append(t.required_prestige)

    # --- Prestige per domain (sampled from task completions) ---
    # Build prestige history by replaying prestige deltas
    from src.bench.db.models.task import TaskRequirement
    from src.bench.db.models.company import Domain
    prestige_history = {d.value: [(times[0] if times else None, 1.0)] for d in Domain}

    completed = [t for t in tasks if t.completed_at]
    completed.sort(key=lambda t: t.completed_at)

    current_prestige = {d.value: 1.0 for d in Domain}
    for t in completed:
        reqs = db.query(TaskRequirement).filter(TaskRequirement.task_id == t.id).all()
        for req in reqs:
            d = req.domain.value
            if t.status == TaskStatus.COMPLETED_SUCCESS:
                current_prestige[d] = min(10.0, current_prestige[d] + float(t.reward_prestige_delta))
            else:
                penalty = 1.4 * float(t.reward_prestige_delta)
                current_prestige[d] = max(1.0, current_prestige[d] - penalty)
            prestige_history[d].append((t.completed_at, current_prestige[d]))

    # Final prestige from DB
    final_prestige = {
        row.domain.value: float(row.prestige_level)
        for row in db.query(CompanyPrestige).all()
    }

# ── Plot ────────────────────────────────────────────────────────────────
fig = plt.figure(figsize=(16, 10), facecolor="#0f1117")
gs = gridspec.GridSpec(2, 2, figure=fig, hspace=0.45, wspace=0.35)

ax_funds    = fig.add_subplot(gs[0, :])   # full width top
ax_prestige = fig.add_subplot(gs[1, 0])
ax_tasks    = fig.add_subplot(gs[1, 1])

for ax in [ax_funds, ax_prestige, ax_tasks]:
    ax.set_facecolor("#1a1d27")
    ax.tick_params(colors="#aaaaaa", labelsize=9)
    for spine in ax.spines.values():
        spine.set_edgecolor("#333344")

# ── Funds over time ──────────────────────────────────────────────────────
payroll_times = [t for t, c in zip(times, categories) if c == LedgerCategory.MONTHLY_PAYROLL]
payroll_vals  = [b for b, c in zip(balances, categories) if c == LedgerCategory.MONTHLY_PAYROLL]
reward_times  = [t for t, c in zip(times, categories) if c == LedgerCategory.TASK_REWARD]
reward_vals   = [b for b, c in zip(balances, categories) if c == LedgerCategory.TASK_REWARD]

ax_funds.plot(times, balances, color="#4fc3f7", linewidth=1.8, zorder=3, label="Balance")
ax_funds.fill_between(times, [b / max(balances) * min(balances) * 0.5 for b in balances],
                      balances, alpha=0.08, color="#4fc3f7", zorder=2)
ax_funds.scatter(reward_times, reward_vals, color="#2ecc71", s=30, zorder=5,
                 label="Task reward", marker="^")
ax_funds.scatter(payroll_times, payroll_vals, color="#e74c3c", s=20, zorder=5,
                 label="Payroll", marker="v", alpha=0.7)

ax_funds.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f"${x/1000:.0f}K" if x < 1_000_000 else f"${x/1_000_000:.1f}M"))
ax_funds.xaxis.set_major_formatter(mdates.DateFormatter("%b '%y"))
ax_funds.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
plt.setp(ax_funds.xaxis.get_majorticklabels(), rotation=30, ha="right")
ax_funds.set_title("Company Funds Over Time", color="white", fontsize=12, pad=8)
ax_funds.set_ylabel("Balance", color="#aaaaaa", fontsize=9)
ax_funds.legend(fontsize=8, facecolor="#1a1d27", edgecolor="#333344",
                labelcolor="white", loc="upper left")
ax_funds.grid(axis="y", color="#333344", linewidth=0.5, linestyle="--")

# ── Prestige evolution ───────────────────────────────────────────────────
for domain, history in prestige_history.items():
    hist_times = [h[0] for h in history if h[0] is not None]
    hist_vals  = [h[1] for h in history if h[0] is not None]
    if len(hist_times) < 2:
        continue
    color = DOMAIN_COLORS.get(domain, "#aaaaaa")
    ax_prestige.step(hist_times, hist_vals, where="post",
                     color=color, linewidth=1.6, label=domain)

ax_prestige.axhline(y=1.0, color="#555566", linewidth=0.8, linestyle=":")
ax_prestige.set_ylim(0.8, 10.5)
ax_prestige.xaxis.set_major_formatter(mdates.DateFormatter("%b '%y"))
ax_prestige.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
plt.setp(ax_prestige.xaxis.get_majorticklabels(), rotation=30, ha="right")
ax_prestige.set_title("Prestige by Domain", color="white", fontsize=12, pad=8)
ax_prestige.set_ylabel("Prestige Level", color="#aaaaaa", fontsize=9)
ax_prestige.legend(fontsize=7.5, facecolor="#1a1d27", edgecolor="#333344",
                   labelcolor="white", ncol=2, loc="upper left")
ax_prestige.grid(axis="y", color="#333344", linewidth=0.5, linestyle="--")

# ── Task outcomes scatter ────────────────────────────────────────────────
if task_times:
    colors = ["#2ecc71" if s else "#e74c3c" for s in task_success]
    scatter = ax_tasks.scatter(
        task_times, task_rewards,
        c=colors, s=[40 + p * 12 for p in task_prestige],
        alpha=0.85, zorder=4, edgecolors="none"
    )
    # Annotate prestige on each dot
    for t, r, p, s in zip(task_times, task_rewards, task_prestige, task_success):
        ax_tasks.annotate(f"p{p}", (t, r), fontsize=6, color="#cccccc",
                          xytext=(3, 3), textcoords="offset points")

ax_tasks.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f"${x/1000:.0f}K" if x < 1_000_000 else f"${x/1_000_000:.1f}M"))
ax_tasks.xaxis.set_major_formatter(mdates.DateFormatter("%b '%y"))
ax_tasks.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
plt.setp(ax_tasks.xaxis.get_majorticklabels(), rotation=30, ha="right")
ax_tasks.set_title("Task Outcomes  (▲ success  ● fail,  size = prestige req)", color="white", fontsize=10, pad=8)
ax_tasks.set_ylabel("Reward Value", color="#aaaaaa", fontsize=9)
ax_tasks.grid(axis="y", color="#333344", linewidth=0.5, linestyle="--")

# Legend patches
from matplotlib.patches import Patch
ax_tasks.legend(handles=[
    Patch(color="#2ecc71", label=f"Success ({sum(task_success)})"),
    Patch(color="#e74c3c", label=f"Fail ({sum(not s for s in task_success)})"),
], fontsize=8, facecolor="#1a1d27", edgecolor="#333344", labelcolor="white")

# ── Summary annotation ───────────────────────────────────────────────────
final_bal = balances[-1] if balances else 0
fig.text(0.5, 0.97,
         f"minimax-m2.5  |  seed=42  |  harder config  |  "
         f"150 turns  |  Aug 2025 sim time  |  "
         f"final balance ${final_bal/1_000_000:.2f}M  |  "
         f"{sum(task_success)}/{len(task_success)} tasks succeeded",
         ha="center", va="top", color="#aaaaaa", fontsize=9)

out = Path("plot_run_hard.png")
plt.savefig(out, dpi=150, bbox_inches="tight", facecolor=fig.get_facecolor())
print(f"Saved: {out}")