Add live terminal dashboard with Rich

Replace scrolling LiteLLM debug logs with an in-place Rich Live dashboard
that shows key metrics after each turn: funds sparkline, task progress bars
with colored domain labels, team skill bars, runway urgency, and more.

- New: src/yc_bench/runner/dashboard.py (BenchmarkDashboard, DashboardState)
- Add on_turn/on_turn_start callbacks to agent loop
- Auto-detect TTY, redirect all logging to logs/debug.log when live
- Add --no-live flag to disable dashboard and get old log output
- Use alternate screen buffer (screen=True) for clean rendering
- Fix start.sh: clean up stale temp files before mktemp

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
adit jain 2026-02-26 22:11:55 -08:00
parent d4ce0a1e5a
commit f25a2be1e4
6 changed files with 570 additions and 17 deletions

View file

@ -12,6 +12,7 @@ class RunArgs:
company_name: str
start_date: str
config_name: str = "default"
no_live: bool = False
def build_parser():
parser = argparse.ArgumentParser(
@ -27,6 +28,10 @@ def build_parser():
"--config", dest="config_name", default="default",
help="Preset name ('default', 'fast_test', 'high_reward') or path to a .toml file",
)
parser.add_argument(
"--no-live", action="store_true", default=False,
help="Disable the live terminal dashboard (show raw log output instead)",
)
return parser
def parse_run_args(argv):
@ -40,6 +45,7 @@ def parse_run_args(argv):
company_name=ns.company_name,
start_date=ns.start_date,
config_name=ns.config_name,
no_live=ns.no_live,
)
def _validate(ns, parser):

View file

@ -0,0 +1,454 @@
"""Live terminal dashboard for YC-Bench using Rich."""
from __future__ import annotations
import os
import time
from dataclasses import dataclass, field
from decimal import Decimal
from typing import Any
from rich.console import Console, Group
from rich.live import Live
from rich.panel import Panel
from rich.table import Table
from rich.text import Text
SPARK_CHARS = "▁▂▃▄▅▆▇█"
# Domain → (display name, color) for styled inline display
DOMAIN_STYLE = {
"system": ("System", "bright_cyan"),
"research": ("Research", "bright_magenta"),
"data": ("Data", "bright_blue"),
"frontend": ("Frontend", "bright_yellow"),
"backend": ("Backend", "bright_green"),
"training": ("Training", "red"),
"hardware": ("Hardware", "white"),
}
def _sparkline(values: list[float], width: int = 20) -> str:
"""Return a Unicode sparkline string from a list of values."""
if not values:
return ""
vals = values[-width:]
lo, hi = min(vals), max(vals)
span = hi - lo if hi != lo else 1.0
return "".join(SPARK_CHARS[min(int((v - lo) / span * (len(SPARK_CHARS) - 1)), len(SPARK_CHARS) - 1)] for v in vals)
def _fmt_dollars(cents: int) -> str:
return f"${cents / 100:,.2f}"
def _fmt_delta(cents: int) -> str:
sign = "+" if cents >= 0 else "-"
return f"{sign}${abs(cents) / 100:,.0f}"
def _domain_tag(domain_str: str) -> str:
"""Colored domain tag like [bright_cyan]SYS[/bright_cyan]."""
label, color = DOMAIN_STYLE.get(domain_str, (domain_str[:3].upper(), "white"))
return f"[{color}]{label}[/{color}]"
def _mini_bar(pct: float, width: int = 8) -> str:
"""Colored progress bar: green when done, yellow partial, dim empty."""
filled = int(pct * width)
if pct >= 1.0:
return f"[bold green]{'=' * width}[/bold green]"
elif pct >= 0.5:
return f"[yellow]{'=' * filled}[/yellow][dim]{'.' * (width - filled)}[/dim]"
else:
return f"[red]{'=' * filled}[/red][dim]{'.' * (width - filled)}[/dim]"
@dataclass
class TaskInfo:
title: str
status: str
prestige: int
reward_dollars: float
deadline: str
domains: list[str]
progress: list[tuple[str, float, float]] # [(domain, completed, required)]
@dataclass
class EmployeeInfo:
name: str
salary_dollars: float
skills: list[tuple[str, float]] # [(domain, rate)]
@dataclass
class DashboardState:
model: str = ""
seed: int = 0
config_name: str = ""
turn: int = 0
sim_date: str = ""
horizon_end: str = ""
funds_cents: int = 0
funds_delta_cents: int = 0
funds_history: list[float] = field(default_factory=list)
runway_months: float = 0.0
active_tasks: int = 0
planned_tasks: int = 0
employee_count: int = 0
monthly_payroll_cents: int = 0
api_cost_usd: float = 0.0
turn_time_sec: float = 0.0
last_action: str = ""
status: str = ""
elapsed_sec: float = 0.0
tasks_detail: list[TaskInfo] = field(default_factory=list)
employees_detail: list[EmployeeInfo] = field(default_factory=list)
completed_count: int = 0
failed_count: int = 0
def _query_detailed_snapshot(db_factory, company_id) -> dict[str, Any]:
"""Query rich task/employee details from the DB for dashboard display."""
from ..db.models.task import Task, TaskStatus, TaskRequirement
from ..db.models.employee import Employee, EmployeeSkillRate
with db_factory() as db:
tasks_detail = []
for status in (TaskStatus.ACTIVE, TaskStatus.PLANNED):
tasks = db.query(Task).filter(
Task.company_id == company_id,
Task.status == status,
).all()
for t in tasks:
reqs = db.query(TaskRequirement).filter(
TaskRequirement.task_id == t.id,
).all()
domains = [r.domain.value for r in reqs]
progress = [
(r.domain.value, float(r.completed_qty), float(r.required_qty))
for r in reqs
]
deadline_str = t.deadline.strftime("%Y-%m-%d") if t.deadline else "-"
tasks_detail.append(TaskInfo(
title=t.title,
status=status.value,
prestige=t.required_prestige,
reward_dollars=t.reward_funds_cents / 100.0,
deadline=deadline_str,
domains=domains,
progress=progress,
))
from sqlalchemy import func
completed_count = db.query(func.count(Task.id)).filter(
Task.company_id == company_id,
Task.status == TaskStatus.COMPLETED_SUCCESS,
).scalar() or 0
failed_count = db.query(func.count(Task.id)).filter(
Task.company_id == company_id,
Task.status == TaskStatus.COMPLETED_FAIL,
).scalar() or 0
employees_detail = []
employees = db.query(Employee).filter(
Employee.company_id == company_id,
).all()
for emp in employees:
skills = db.query(EmployeeSkillRate).filter(
EmployeeSkillRate.employee_id == emp.id,
).all()
skill_list = [
(s.domain.value, float(s.rate_domain_per_hour))
for s in sorted(skills, key=lambda s: float(s.rate_domain_per_hour), reverse=True)
]
employees_detail.append(EmployeeInfo(
name=emp.name,
salary_dollars=emp.salary_cents / 100.0,
skills=skill_list,
))
return {
"tasks_detail": tasks_detail,
"employees_detail": employees_detail,
"completed_count": completed_count,
"failed_count": failed_count,
}
class BenchmarkDashboard:
"""Rich Live dashboard for benchmark progress."""
def __init__(self, model: str, seed: int, config_name: str,
db_factory=None, company_id=None):
self._console = Console()
self._live: Live | None = None
self._state = DashboardState(model=model, seed=seed, config_name=config_name)
self._start_time = time.monotonic()
self._turn_start_time = 0.0
self._prev_funds_cents = 0
self._db_factory = db_factory
self._company_id = company_id
self._stderr_backup = None
self._devnull = None
def start(self) -> None:
import sys
self._start_time = time.monotonic()
self._state.status = "[dim]Starting...[/dim]"
self._stderr_backup = sys.stderr
self._devnull = open(os.devnull, "w")
sys.stderr = self._devnull
self._live = Live(
self._render(),
console=self._console,
refresh_per_second=2,
screen=True,
)
self._live.start()
def stop(self) -> None:
import sys
if self._live is not None:
self._live.stop()
self._live = None
if self._stderr_backup is not None:
sys.stderr = self._stderr_backup
self._stderr_backup = None
if self._devnull is not None:
self._devnull.close()
self._devnull = None
def mark_turn_start(self, turn_num: int) -> None:
self._turn_start_time = time.monotonic()
self._state.turn = turn_num
self._state.status = f"[yellow]>> Turn {turn_num}: waiting for LLM...[/yellow]"
self._state.elapsed_sec = time.monotonic() - self._start_time
self._refresh()
def update(self, snapshot: dict[str, Any], run_state: Any, commands: list[str] | None = None) -> None:
now = time.monotonic()
s = self._state
s.turn = run_state.turn_count
s.sim_date = snapshot.get("sim_time", "")[:10]
s.horizon_end = snapshot.get("horizon_end", "")[:10]
s.funds_cents = snapshot.get("funds_cents", 0)
s.funds_delta_cents = s.funds_cents - self._prev_funds_cents
self._prev_funds_cents = s.funds_cents
s.funds_history.append(s.funds_cents / 100.0)
s.active_tasks = snapshot.get("active_tasks", 0)
s.planned_tasks = snapshot.get("planned_tasks", 0)
s.employee_count = snapshot.get("employee_count", 0)
s.monthly_payroll_cents = snapshot.get("monthly_payroll_cents", 0)
s.api_cost_usd = run_state.total_cost_usd
s.turn_time_sec = now - self._turn_start_time if self._turn_start_time else 0.0
s.elapsed_sec = now - self._start_time
if s.monthly_payroll_cents > 0:
s.runway_months = s.funds_cents / s.monthly_payroll_cents
else:
s.runway_months = float("inf")
if commands:
first = commands[0].split(" -> ")[0] if " -> " in commands[0] else commands[0]
if len(commands) > 1:
s.last_action = f"{first} (+{len(commands)-1} more)"
else:
s.last_action = first
else:
s.last_action = "(no commands)"
if run_state.terminal:
reason = run_state.terminal_reason.value if run_state.terminal_reason else "unknown"
s.status = f"[bold green]DONE: {reason}[/bold green]"
else:
s.status = f"[green]Turn {s.turn} complete[/green]"
if self._db_factory is not None and self._company_id is not None:
try:
detail = _query_detailed_snapshot(self._db_factory, self._company_id)
s.tasks_detail = detail["tasks_detail"]
s.employees_detail = detail["employees_detail"]
s.completed_count = detail["completed_count"]
s.failed_count = detail["failed_count"]
except Exception:
pass
self._refresh()
def print_final_summary(self, run_state: Any) -> None:
s = self._state
elapsed_m, elapsed_s = divmod(int(s.elapsed_sec), 60)
elapsed_h, elapsed_m = divmod(elapsed_m, 60)
table = Table(show_header=False, box=None, padding=(0, 2))
table.add_column(style="bold cyan", width=14)
table.add_column()
table.add_row("Turns", str(s.turn))
table.add_row("Final Funds", _fmt_dollars(s.funds_cents))
table.add_row("Tasks", f"[green]{s.completed_count} done[/green] / [red]{s.failed_count} failed[/red]")
table.add_row("API Cost", f"${s.api_cost_usd:.4f}")
table.add_row("Elapsed", f"{elapsed_h}h {elapsed_m:02d}m {elapsed_s:02d}s")
reason = run_state.terminal_reason.value if run_state.terminal_reason else "max_turns"
table.add_row("Outcome", reason)
panel = Panel(
table,
title="[bold]YC-Bench Complete[/bold]",
border_style="green" if reason == "horizon_end" else "red" if reason == "bankruptcy" else "yellow",
)
self._console.print(panel)
def _refresh(self) -> None:
if self._live is not None:
self._live.update(self._render())
# ------------------------------------------------------------------
# Render helpers
# ------------------------------------------------------------------
def _render_stats_panel(self) -> Panel:
s = self._state
elapsed_m, elapsed_s = divmod(int(s.elapsed_sec), 60)
elapsed_h, elapsed_m = divmod(elapsed_m, 60)
short_model = s.model.rsplit("/", 1)[-1]
table = Table(show_header=False, box=None, padding=(0, 1))
table.add_column(style="bold cyan", width=12)
table.add_column(overflow="ellipsis", no_wrap=True)
table.add_row("Model", f"[bold]{short_model}[/bold] seed={s.seed} {s.config_name}")
table.add_row("Turn", f"[bold white]{s.turn}[/bold white]")
table.add_row("Sim Date", f"{s.sim_date} [dim]->[/dim] {s.horizon_end}" if s.sim_date else "[dim]--[/dim]")
table.add_row("Elapsed", f"{elapsed_h}h {elapsed_m:02d}m {elapsed_s:02d}s")
# Funds with colored sparkline
spark = _sparkline(s.funds_history)
delta_color = "green" if s.funds_delta_cents >= 0 else "red"
if s.turn > 0:
funds_str = f"[bold]{_fmt_dollars(s.funds_cents)}[/bold] [{delta_color}]{_fmt_delta(s.funds_delta_cents)}[/{delta_color}] [{delta_color}]{spark}[/{delta_color}]"
else:
funds_str = "[dim]--[/dim]"
table.add_row("Funds", funds_str)
# Runway with urgency coloring
if s.runway_months == float("inf"):
runway_str = "[green]unlimited[/green]"
elif s.runway_months < 2:
runway_str = f"[bold red blink]{s.runway_months:.1f}mo CRITICAL[/bold red blink]"
elif s.runway_months < 4:
runway_str = f"[bold yellow]{s.runway_months:.1f}mo LOW[/bold yellow]"
else:
runway_str = f"[green]{s.runway_months:.1f}mo[/green]"
table.add_row("Runway", runway_str)
# Task scoreboard
task_parts = f"{s.active_tasks} active / {s.planned_tasks} queued"
if s.completed_count or s.failed_count:
task_parts += f" [green]{s.completed_count} done[/green] [red]{s.failed_count} fail[/red]"
table.add_row("Tasks", task_parts)
table.add_row("Team", f"{s.employee_count} people {_fmt_dollars(s.monthly_payroll_cents)}/mo" if s.monthly_payroll_cents else str(s.employee_count))
table.add_row("Cost", f"${s.api_cost_usd:.4f} ({s.turn_time_sec:.1f}s/turn)" if s.turn_time_sec else f"${s.api_cost_usd:.4f}")
table.add_row("Action", s.last_action or "[dim]--[/dim]")
table.add_row("Status", s.status)
return Panel(table, title="[bold]YC-Bench[/bold]", border_style="blue")
def _render_tasks_panel(self) -> Panel:
s = self._state
if not s.tasks_detail:
return Panel(
"[dim]No active or planned tasks yet...[/dim]",
title="[bold]Tasks[/bold]",
border_style="yellow",
)
table = Table(box=None, padding=(0, 1), show_edge=False)
table.add_column("", width=2) # status marker
table.add_column("Task", style="bold white", no_wrap=True, max_width=20)
table.add_column("$$$", width=8, justify="right", no_wrap=True) # reward
table.add_column("Due", width=10, no_wrap=True) # deadline
table.add_column("Progress", no_wrap=True, overflow="ellipsis", ratio=1)
for t in s.tasks_detail[:6]:
if t.status == "active":
marker = "[bold green]>>[/bold green]"
else:
marker = "[dim]..[/dim]"
# Prestige stars in yellow
stars = f"[yellow]{'*' * min(t.prestige, 5)}[/yellow]"
# Reward colored by size
if t.reward_dollars >= 50000:
reward = f"[bold green]${t.reward_dollars:,.0f}[/bold green]"
elif t.reward_dollars >= 20000:
reward = f"[green]${t.reward_dollars:,.0f}[/green]"
else:
reward = f"${t.reward_dollars:,.0f}"
# Domain progress with colored bars
prog_parts = []
for domain, completed, required in t.progress:
pct = completed / required if required > 0 else 0
bar = _mini_bar(pct, width=6)
tag = _domain_tag(domain)
prog_parts.append(f"{tag} {bar}")
progress_str = " ".join(prog_parts)
table.add_row(marker, t.title[:20], reward, t.deadline, progress_str)
remaining = len(s.tasks_detail) - 6
if remaining > 0:
table.add_row("", f"[dim]+{remaining} more[/dim]", "", "", "")
return Panel(table, title="[bold]Tasks[/bold]", border_style="yellow")
def _render_team_panel(self) -> Panel:
s = self._state
if not s.employees_detail:
return Panel("[dim]No employees hired yet...[/dim]", title="[bold]Team[/bold]", border_style="magenta")
table = Table(box=None, padding=(0, 1), show_edge=False)
table.add_column("Name", style="bold white", width=14, no_wrap=True)
table.add_column("Pay", width=8, justify="right", no_wrap=True)
table.add_column("Skills", no_wrap=True, overflow="ellipsis", ratio=1)
for emp in s.employees_detail:
# Salary colored by cost
if emp.salary_dollars >= 10000:
pay = f"[bold red]${emp.salary_dollars:,.0f}[/bold red]"
elif emp.salary_dollars >= 5000:
pay = f"[yellow]${emp.salary_dollars:,.0f}[/yellow]"
else:
pay = f"[green]${emp.salary_dollars:,.0f}[/green]"
# Skill bars — top 3
skill_parts = []
for d, r in emp.skills[:3]:
tag = _domain_tag(d)
# Rate bar: scale 0-15 to a mini bar
bar_pct = min(r / 15.0, 1.0)
bar = _mini_bar(bar_pct, width=4)
skill_parts.append(f"{tag}{bar}")
skills_str = " ".join(skill_parts)
table.add_row(emp.name[:14], pay, skills_str)
return Panel(table, title="[bold]Team[/bold]", border_style="magenta")
def _render(self) -> Group:
return Group(
self._render_stats_panel(),
self._render_tasks_panel(),
self._render_team_panel(),
)
__all__ = ["BenchmarkDashboard", "DashboardState"]

View file

@ -4,6 +4,7 @@ from __future__ import annotations
import json
import logging
import os
import sys
from contextlib import contextmanager
from datetime import datetime, timezone
from pathlib import Path
@ -18,6 +19,9 @@ from .args import parse_run_args
logger = logging.getLogger(__name__)
# Loggers that produce noisy debug output during LLM calls
_NOISY_LOGGERS = ("litellm", "httpx", "httpcore", "openai", "LiteLLM")
def _parse_date(date_str: str) -> datetime:
"""Accept ISO (2025-01-01) or legacy MM/DD/YYYY format."""
@ -119,6 +123,33 @@ def _init_simulation(db_factory, args, experiment_cfg, horizon_years):
# Main
# ---------------------------------------------------------------------------
def _redirect_all_logging_to_file(log_file: Path) -> None:
"""Redirect ALL logging from the console to a file.
When the Rich Live dashboard is active, any output to stdout/stderr
breaks the in-place rendering, causing stacked panels. This removes
the root logger's console handlers and replaces them with a file handler.
"""
log_file.parent.mkdir(exist_ok=True)
file_handler = logging.FileHandler(str(log_file), mode="a")
file_handler.setFormatter(logging.Formatter(
"[%(asctime)s] %(name)s %(levelname)s: %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
))
# Replace all console handlers on root logger with the file handler
root = logging.getLogger()
root.handlers.clear()
root.addHandler(file_handler)
# Also ensure noisy loggers don't propagate (belt and suspenders)
for name in _NOISY_LOGGERS:
lg = logging.getLogger(name)
lg.propagate = False
lg.handlers.clear()
lg.addHandler(file_handler)
def run_benchmark(args):
"""Run a full benchmark: migrate, seed, loop until terminal."""
logging.basicConfig(
@ -139,6 +170,15 @@ def run_benchmark(args):
# --horizon-years CLI flag overrides config; fall back to sim.horizon_years from config
horizon_years = args.horizon_years if args.horizon_years is not None else experiment_cfg.sim.horizon_years
# Decide whether to use the live dashboard
use_live = sys.stdout.isatty() and not getattr(args, "no_live", False)
# When using the live dashboard, redirect all logging to file immediately
# so no console output interferes with Rich Live rendering.
if use_live:
log_file = Path("logs") / "debug.log"
_redirect_all_logging_to_file(log_file)
logger.info(
"YC-Bench starting: experiment=%s model=%s seed=%d horizon=%dy",
experiment_cfg.name, args.model, args.seed, horizon_years,
@ -187,19 +227,54 @@ def run_benchmark(args):
horizon_years=horizon_years,
)
# 6. Run agent loop
loop_cfg = experiment_cfg.loop
final_state = run_agent_loop(
runtime=runtime,
db_factory=db_factory,
company_id=company_id,
run_state=run_state,
command_executor=run_command,
auto_advance_after_turns=loop_cfg.auto_advance_after_turns,
max_turns=loop_cfg.max_turns,
)
# 6. Set up live dashboard (or not)
dashboard = None
on_turn_start = None
on_turn = None
# 7. Save full rollout (with transcript) and print summary
if use_live:
from .dashboard import BenchmarkDashboard
dashboard = BenchmarkDashboard(
model=args.model,
seed=args.seed,
config_name=args.config_name,
db_factory=db_factory,
company_id=company_id,
)
def on_turn_start(turn_num):
dashboard.mark_turn_start(turn_num)
def on_turn(snapshot, rs, commands):
dashboard.update(snapshot, rs, commands)
# 7. Run agent loop
loop_cfg = experiment_cfg.loop
try:
if dashboard is not None:
dashboard.start()
final_state = run_agent_loop(
runtime=runtime,
db_factory=db_factory,
company_id=company_id,
run_state=run_state,
command_executor=run_command,
auto_advance_after_turns=loop_cfg.auto_advance_after_turns,
max_turns=loop_cfg.max_turns,
on_turn_start=on_turn_start,
on_turn=on_turn,
)
finally:
if dashboard is not None:
dashboard.stop()
# 8. Print final summary
if dashboard is not None:
dashboard.print_final_summary(final_state)
# 9. Save full rollout (with transcript) and print summary
rollout = final_state.full_rollout()
summary = final_state.summary()
logger.info("Run complete: %s", json.dumps(summary, indent=2))