diff --git a/scripts/watch_dashboard.py b/scripts/watch_dashboard.py
index 6dabbf9..ab17451 100644
--- a/scripts/watch_dashboard.py
+++ b/scripts/watch_dashboard.py
@@ -1,7 +1,8 @@
-"""Streamlit dashboard — live-monitor a YC-Bench run.
+"""Streamlit dashboard — live-monitor YC-Bench runs.
Usage:
- uv run streamlit run scripts/watch_dashboard.py -- db/medium_1_gemini_gemini-3-flash-preview.db
+ uv run streamlit run scripts/watch_dashboard.py # multi-model overview (auto-discovers db/)
+ uv run streamlit run scripts/watch_dashboard.py -- db/medium_1_model.db # single-run detail
Automatically overlays the greedy bot baseline if a matching *_greedy_bot.db exists
in the same directory (e.g. db/medium_1_greedy_bot.db).
@@ -16,6 +17,11 @@ from datetime import datetime, timedelta
from decimal import Decimal
from pathlib import Path
+import re
+import sqlite3
+from collections import defaultdict
+
+import numpy as np
import streamlit as st
import pandas as pd
import plotly.graph_objects as go
@@ -114,14 +120,21 @@ for a in args:
db_path = a
break
-if db_path is None:
- st.error("Pass a .db path: `uv run streamlit run scripts/watch_dashboard.py -- path/to.db`")
- st.stop()
+MULTI_MODEL_MODE = db_path is None
-db_file = Path(db_path)
-if not db_file.exists():
- st.error(f"DB not found: {db_file}")
- st.stop()
+if MULTI_MODEL_MODE:
+ # Auto-discover all DBs — show multi-model overview
+ _all_dbs = sorted(Path("db").glob("*.db"))
+ if not _all_dbs:
+ st.error("No DB files found in db/. Run some experiments first.")
+ st.stop()
+ # Use first DB to bootstrap config detection
+ db_file = _all_dbs[0]
+else:
+ db_file = Path(db_path)
+ if not db_file.exists():
+ st.error(f"DB not found: {db_file}")
+ st.stop()
# Auto-detect config name from DB filename (e.g. "medium_1_model.db" -> "medium")
_db_stem_parts = db_file.stem.split("_")
@@ -464,6 +477,170 @@ def _load_transcript(primary_db: Path) -> list[dict]:
return []
+# ---------------------------------------------------------------------------
+# Multi-model overview (when no DB arg given)
+# ---------------------------------------------------------------------------
+
+MODEL_COLORS = {
+ "gpt-5.4": "#4da6ff",
+ "gpt-5.4-nano": "#ff8c42",
+ "gpt-5.4-mini": "#ffd43b",
+ "gemini-3.1-pro-preview": "#00d4aa",
+ "gemini-3-flash-preview": "#b197fc",
+ "claude-sonnet-4-6": "#e599f7",
+ "greedy_bot": "#ff4b6e",
+}
+
+
+def _model_color(label: str) -> str:
+ for key, color in MODEL_COLORS.items():
+ if key in label:
+ return color
+ return "#8b8d93"
+
+
+def _parse_db_stem(stem: str) -> tuple[str, int, str]:
+ """Parse 'medium_1_openai_gpt-5.4' -> (config, seed, model_label)."""
+ parts = stem.split("_", 2)
+ if len(parts) < 3:
+ return stem, 0, stem
+ config = parts[0]
+ try:
+ seed = int(parts[1])
+ except ValueError:
+ return stem, 0, stem
+ raw = parts[2]
+ label = re.sub(r"^(openai|gemini|anthropic)_", "", raw)
+ return config, seed, label
+
+
+def _read_db_summary(db_path: Path) -> dict | None:
+ """Read key metrics from a DB via raw sqlite3 (no ORM)."""
+ conn = sqlite3.connect(str(db_path))
+ try:
+ row = conn.execute("SELECT sim_time, horizon_end FROM sim_state LIMIT 1").fetchone()
+ co = conn.execute("SELECT funds_cents FROM companies LIMIT 1").fetchone()
+ if not row or not co:
+ return None
+ ok = conn.execute("SELECT COUNT(*) FROM tasks WHERE status='completed_success'").fetchone()[0]
+ fail = conn.execute("SELECT COUNT(*) FROM tasks WHERE status='completed_fail'").fetchone()[0]
+ ledger = conn.execute(
+ "SELECT occurred_at, amount_cents FROM ledger_entries ORDER BY occurred_at"
+ ).fetchall()
+ running = 20_000_000 # default starting funds
+ funds_by_day = {}
+ for occ, amt in ledger:
+ running += amt
+ funds_by_day[occ[:10]] = running
+ sim_time, horizon_end = row
+ funds = co[0]
+ return {
+ "sim_time": sim_time[:10],
+ "funds": funds / 100,
+ "ok": ok, "fail": fail,
+ "done": sim_time >= horizon_end or funds < 0,
+ "bankrupt": funds < 0,
+ "funds_by_day": {k: v / 100 for k, v in funds_by_day.items()},
+ }
+ except Exception:
+ return None
+ finally:
+ conn.close()
+
+
+def _render_multi_model():
+ """Render the multi-model overview dashboard."""
+ all_dbs = sorted(Path("db").glob("*.db"))
+ if not all_dbs:
+ st.warning("No DB files in db/")
+ return
+
+ model_runs = defaultdict(list)
+ for p in all_dbs:
+ config, seed, label = _parse_db_stem(p.stem)
+ data = _read_db_summary(p)
+ if data:
+ model_runs[label].append((seed, p, data))
+
+ # --- Per-seed funds curves ---
+ st.markdown('
', unsafe_allow_html=True)
+ fig1 = go.Figure()
+ for label in sorted(model_runs.keys()):
+ color = _model_color(label)
+ for seed, _, data in sorted(model_runs[label], key=lambda x: x[0]):
+ fbd = data.get("funds_by_day", {})
+ if not fbd:
+ continue
+ days = sorted(fbd.keys())
+ vals = [fbd[d] for d in days]
+ fig1.add_trace(go.Scatter(
+ x=days, y=vals, mode="lines",
+ name=f"{label} (s{seed})",
+ line=dict(color=color, width=1.5, dash="dot" if seed > 1 else "solid"),
+ opacity=0.7,
+ ))
+ fig1.add_hline(y=200_000, line_dash="dash", line_color="#555",
+ annotation_text="Starting $200K")
+ fig1.update_layout(**_chart_layout(yaxis_title="Funds ($)", height=500))
+ fig1.update_yaxes(tickprefix="$", tickformat=",")
+ st.plotly_chart(fig1, use_container_width=True, config={"displayModeBar": False})
+
+ # --- Averaged funds curves ---
+ st.markdown('', unsafe_allow_html=True)
+ fig2 = go.Figure()
+ for label in sorted(model_runs.keys()):
+ color = _model_color(label)
+ runs = model_runs[label]
+ all_days = set()
+ series = []
+ for seed, _, data in runs:
+ fbd = data.get("funds_by_day", {})
+ if fbd:
+ all_days.update(fbd.keys())
+ series.append(fbd)
+ if not all_days or not series:
+ continue
+ common_days = sorted(all_days)
+ aligned = []
+ for s in series:
+ s_days = sorted(s.keys())
+ if not s_days:
+ continue
+ vals, last_val, si = [], 200_000, 0
+ for d in common_days:
+ while si < len(s_days) and s_days[si] <= d:
+ last_val = s[s_days[si]]
+ si += 1
+ vals.append(last_val)
+ aligned.append(vals)
+ if not aligned:
+ continue
+ arr = np.array(aligned)
+ mean = arr.mean(axis=0)
+ fig2.add_trace(go.Scatter(
+ x=common_days, y=mean, mode="lines",
+ name=f"{label} (n={len(aligned)})",
+ line=dict(color=color, width=3),
+ ))
+ if len(aligned) > 1:
+ lo, hi = arr.min(axis=0), arr.max(axis=0)
+ # Convert hex color to rgba for fill
+ _r, _g, _b = int(color[1:3], 16), int(color[3:5], 16), int(color[5:7], 16)
+ fig2.add_trace(go.Scatter(
+ x=list(common_days) + list(common_days)[::-1],
+ y=list(hi) + list(lo[::-1]),
+ fill="toself", fillcolor=f"rgba({_r},{_g},{_b},0.1)",
+ line=dict(color="rgba(0,0,0,0)"),
+ showlegend=False, hoverinfo="skip",
+ ))
+ fig2.add_hline(y=200_000, line_dash="dash", line_color="#555",
+ annotation_text="Starting $200K")
+ fig2.update_layout(**_chart_layout(yaxis_title="Funds ($) — averaged", height=500))
+ fig2.update_yaxes(tickprefix="$", tickformat=",")
+ st.plotly_chart(fig2, use_container_width=True, config={"displayModeBar": False})
+
+
+
# ---------------------------------------------------------------------------
# Header + metrics (always visible)
# ---------------------------------------------------------------------------
@@ -475,6 +652,34 @@ st.markdown("""
font-size: 0.75rem; font-weight: 600;">LIVE
""", unsafe_allow_html=True)
+# ---------------------------------------------------------------------------
+# Multi-model overview (when no DB arg given)
+# ---------------------------------------------------------------------------
+if MULTI_MODEL_MODE:
+ _render_multi_model()
+ st.markdown("---")
+
+ # Let user pick a run for detail view below
+ all_dbs = sorted(Path("db").glob("*.db"))
+ db_options = {p.stem: p for p in all_dbs}
+ if db_options:
+ selected = st.selectbox("Select run for detail view", list(db_options.keys()), index=0)
+ db_file = db_options[selected]
+ # Re-initialize factory and peers for selected DB
+ factory = get_factory(str(db_file))
+ peer_dbs = _find_all_peer_dbs(db_file)
+ peer_factories = [(label, get_factory(str(p))) for label, p in peer_dbs]
+ baseline_db = None
+ baseline_factory = None
+ for label, p in peer_dbs:
+ if "greedy" in p.stem.lower():
+ baseline_db = p
+ baseline_factory = get_factory(str(p))
+ break
+ else:
+ st.warning("No DB files found.")
+ st.stop()
+
baseline_label = f' | baseline: {baseline_db.name}' if baseline_db else ""
st.markdown(f'{db_file}{baseline_label}
', unsafe_allow_html=True)
diff --git a/src/yc_bench/cli/task_commands.py b/src/yc_bench/cli/task_commands.py
index d241b3a..5d7cb10 100644
--- a/src/yc_bench/cli/task_commands.py
+++ b/src/yc_bench/cli/task_commands.py
@@ -114,7 +114,7 @@ def task_accept(
if is_rat:
intensity = abs(client_row.loyalty)
inflation = _cfg.scope_creep_max * intensity
- inflation = max(2.0, inflation)
+ inflation = max(3.0, inflation)
for r in reqs:
inflated = float(r.required_qty) * (1 + inflation)
r.required_qty = int(min(25000, max(200, inflated)))
@@ -125,9 +125,14 @@ def task_accept(
task.accepted_at = accepted_at
task.deadline = deadline
- # Generate replacement task (inherits same client for stable market distribution)
- counter = sim_state.replenish_counter
- sim_state.replenish_counter = counter + 1
+ # Generate replacement task — keyed on market_slot so every model
+ # sees the same replacement for the same task, regardless of accept order.
+ slot = task.market_slot if task.market_slot is not None else 0
+ # Generation = how many times this slot has been replaced before
+ generation = db.query(Task).filter(
+ Task.market_slot == slot,
+ Task.company_id.isnot(None), # accepted tasks
+ ).count()
# Find the client index for the accepted task
replaced_client_index = 0
@@ -147,7 +152,7 @@ def task_accept(
replacement = generate_replacement_task(
run_seed=sim_state.run_seed,
- replenish_counter=counter,
+ replenish_counter=slot * 1000 + generation, # deterministic per slot+generation
replaced_prestige=task.required_prestige,
replaced_client_index=replaced_client_index,
cfg=_get_world_cfg(),
@@ -175,6 +180,7 @@ def task_accept(
success=None,
progress_milestone_pct=0,
required_trust=replacement.required_trust,
+ market_slot=slot,
)
db.add(replacement_row)
diff --git a/src/yc_bench/db/models/task.py b/src/yc_bench/db/models/task.py
index a5ab150..b171cd7 100644
--- a/src/yc_bench/db/models/task.py
+++ b/src/yc_bench/db/models/task.py
@@ -99,6 +99,10 @@ class Task(Base):
BigInteger,
nullable=True,
)
+ market_slot = mapped_column(
+ Integer,
+ nullable=True,
+ )
class TaskRequirement(Base):
__tablename__ = "task_requirements"
diff --git a/src/yc_bench/services/generate_clients.py b/src/yc_bench/services/generate_clients.py
index 41c3052..fa76370 100644
--- a/src/yc_bench/services/generate_clients.py
+++ b/src/yc_bench/services/generate_clients.py
@@ -72,6 +72,8 @@ def generate_clients(*, run_seed: int, count: int, cfg: WorldConfig) -> list[Gen
if i < n_rats:
# RAT: loyalty in [-1.0, -0.3]
loyalty = round(rng.uniform(-1.0, -0.3), 3)
+ # RATs offer competitive rewards (top 30% range) to attract greedy agents
+ mult = max(mult, cfg.client_reward_mult_high * 0.75)
else:
# Non-RAT: loyalty in [-0.3, 1.0]
loyalty = round(rng.triangular(-0.3, 1.0, cfg.loyalty_mode), 3)
diff --git a/src/yc_bench/services/seed_world.py b/src/yc_bench/services/seed_world.py
index 51d88de..b8d5fb7 100644
--- a/src/yc_bench/services/seed_world.py
+++ b/src/yc_bench/services/seed_world.py
@@ -110,7 +110,7 @@ def _seed_market_tasks(db, company, req, clients):
generated = generate_tasks(run_seed=req.run_seed, count=req.market_task_count, cfg=req.cfg,
client_specialties=client_specialties,
client_reward_mults=client_reward_mults)
- for task in generated:
+ for slot_idx, task in enumerate(generated):
client = clients[task.client_index % len(clients)] if clients else None
task_row = Task(
id=uuid4(),
@@ -128,6 +128,7 @@ def _seed_market_tasks(db, company, req, clients):
success=None,
progress_milestone_pct=0,
required_trust=task.required_trust,
+ market_slot=slot_idx,
)
db.add(task_row)