diff --git a/.gitignore b/.gitignore index 14ad824..8d85e21 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,8 @@ agent.md # Benchmark runtime outputs /db/ /logs/ +/plots/ +/results/ # Claude session context — local only CLAUDE.md diff --git a/run.sh b/run.sh index 7e4909c..5b709c6 100755 --- a/run.sh +++ b/run.sh @@ -9,7 +9,7 @@ SLUG="${MODEL//\//_}" # --- 1. Greedy bot baseline --- uv run python scripts/bot_runner.py --bot greedy --config "$CONFIG" --seed "$SEED" -# --- 2. LLM run --- +# --- 2. LLM run (delete stale DB first) --- rm -f "db/${CONFIG}_${SEED}_${SLUG}.db" uv run yc-bench run --model "$MODEL" --seed "$SEED" --config "$CONFIG" --company-name BenchCo --start-date 2025-01-01 --no-live @@ -37,5 +37,22 @@ uv run python scripts/plot_results.py \ --plot prestige --labels "LLM ($MODEL)" "greedy bot" \ --out plots/comparison_prestige.png -# --- 5. Live dashboard --- +# ============================================================================ +# Quick reference commands (uncomment to use) +# ============================================================================ + +# --- Quick test run (50 turns max) --- +# rm -f db/fast_test_1_gemini_gemini-3-flash-preview.db +# uv run yc-bench run --model gemini/gemini-3-flash-preview --seed 1 --config fast_test --company-name BenchCo --start-date 2025-01-01 + +# --- Streamlit live dashboard (run alongside an LLM run) --- # uv run streamlit run scripts/watch_dashboard.py -- "db/${CONFIG}_${SEED}_${SLUG}.db" + +# --- Bot runner (all bots, all configs, all seeds) --- +# uv run python scripts/bot_runner.py + +# --- Bot runner (single) --- +# uv run python scripts/bot_runner.py --bot greedy --config medium --seed 1 + +# --- Nuke all stale DBs (run after schema changes) --- +# rm -f db/*.db diff --git a/scripts/watch_dashboard.py b/scripts/watch_dashboard.py index de149f0..b3d323c 100644 --- a/scripts/watch_dashboard.py +++ b/scripts/watch_dashboard.py @@ -8,6 +8,8 @@ in the same directory (e.g. 
db/medium_1_greedy_bot.db). """ from __future__ import annotations +import json as _json +import os import sys import time from datetime import datetime, timedelta @@ -17,7 +19,6 @@ from pathlib import Path import streamlit as st import pandas as pd import plotly.graph_objects as go -from plotly.subplots import make_subplots sys.path.insert(0, str(Path(__file__).parent.parent / "src")) @@ -28,7 +29,6 @@ from yc_bench.db.models.ledger import LedgerEntry from yc_bench.db.models.sim_state import SimState from yc_bench.db.models.task import Task, TaskRequirement, TaskStatus from yc_bench.db.session import build_engine, build_session_factory, session_scope -from yc_bench.config import get_world_config # --------------------------------------------------------------------------- # Theme colors @@ -123,6 +123,13 @@ if not db_file.exists(): st.error(f"DB not found: {db_file}") st.stop() +# Auto-detect config name from DB filename (e.g. "medium_1_model.db" -> "medium") +_db_stem_parts = db_file.stem.split("_") +if _db_stem_parts: + os.environ.setdefault("YC_BENCH_EXPERIMENT", _db_stem_parts[0]) + +from yc_bench.config import get_world_config + @st.cache_resource def get_factory(path: str): @@ -136,24 +143,42 @@ factory = get_factory(str(db_file)) # Auto-detect greedy bot baseline DB # --------------------------------------------------------------------------- -def _find_baseline_db(primary: Path) -> Path | None: - """Look for a greedy_bot DB in the same directory with a matching config/seed prefix.""" - # e.g. 
medium_1_gemini_gemini-3-flash-preview.db -> medium_1_greedy_bot.db +def _find_all_peer_dbs(primary: Path) -> list[tuple[str, Path]]: + """Find all DBs with the same config+seed prefix (other models + greedy bot).""" parts = primary.stem.split("_") - if len(parts) >= 2: - prefix = "_".join(parts[:2]) # "medium_1" - candidate = primary.parent / f"{prefix}_greedy_bot.db" - if candidate.exists() and candidate != primary: - return candidate - return None + if len(parts) < 2: + return [] + prefix = "_".join(parts[:2]) # e.g. "medium_2" + peers = [] + for p in sorted(primary.parent.glob(f"{prefix}_*.db")): + if p == primary: + continue + # Derive a label from the filename + model_part = p.stem[len(prefix) + 1:] # e.g. "greedy_bot" or "openai_gpt-5.2-2025-12-11" + label = model_part.replace("_", " ").replace("-", " ") + if "greedy" in label: + label = "Greedy Bot" + else: + # Use the model name directly from the filename + label = model_part + peers.append((label, p)) + return peers -baseline_db = _find_baseline_db(db_file) -baseline_factory = get_factory(str(baseline_db)) if baseline_db else None +peer_dbs = _find_all_peer_dbs(db_file) +peer_factories = [(label, get_factory(str(p))) for label, p in peer_dbs] + +# Keep backward compat +baseline_db = None +baseline_factory = None +for label, p in peer_dbs: + if "greedy" in p.stem.lower(): + baseline_db = p + baseline_factory = get_factory(str(p)) + break def query_funds_only(fct): - """Extract just (times, vals_dollars) from a DB factory — used for baseline overlay.""" with session_scope(fct) as db: sim = db.query(SimState).first() if not sim: @@ -181,7 +206,13 @@ def query_funds_only(fct): # Chart helpers # --------------------------------------------------------------------------- -def _chart_layout(title="", height=400, yaxis_title="", show_legend=True): +def _chart_layout(title="", height=400, yaxis_title="", show_legend=True, x_range=None): + xaxis_opts = dict( + gridcolor=GRID_COLOR, zeroline=False, + 
tickfont=dict(size=10, color=TEXT_MUTED), + ) + if x_range: + xaxis_opts["range"] = x_range return dict( template="plotly_dark", paper_bgcolor="rgba(0,0,0,0)", @@ -190,10 +221,7 @@ def _chart_layout(title="", height=400, yaxis_title="", show_legend=True): title=dict(text=title, font=dict(size=14, color=TEXT_COLOR), x=0, xanchor="left"), height=height, margin=dict(l=60, r=20, t=40, b=40), - xaxis=dict( - gridcolor=GRID_COLOR, zeroline=False, - tickfont=dict(size=10, color=TEXT_MUTED), - ), + xaxis=xaxis_opts, yaxis=dict( title=yaxis_title, gridcolor=GRID_COLOR, zeroline=False, tickfont=dict(size=10, color=TEXT_MUTED), @@ -207,32 +235,8 @@ def _chart_layout(title="", height=400, yaxis_title="", show_legend=True): ) -def _smooth(times, values, window_days=3): - """Resample to daily frequency and apply rolling average.""" - if len(times) < 2: - return times, values - start, end = times[0], times[-1] - n_days = (end - start).days - if n_days < 2: - return times, values - daily_times = [start + timedelta(days=d) for d in range(n_days + 1)] - daily_vals = [] - src_idx = 0 - for dt in daily_times: - while src_idx < len(times) - 1 and times[src_idx + 1] <= dt: - src_idx += 1 - daily_vals.append(values[src_idx]) - half = window_days // 2 - smoothed = [] - for i in range(len(daily_vals)): - lo = max(0, i - half) - hi = min(len(daily_vals), i + half + 1) - smoothed.append(sum(daily_vals[lo:hi]) / (hi - lo)) - return daily_times, smoothed - - # --------------------------------------------------------------------------- -# Query +# Query DB state # --------------------------------------------------------------------------- def query_state(): @@ -245,7 +249,7 @@ def query_state(): company = db.query(Company).filter(Company.id == sim.company_id).one() company_id = sim.company_id - # ----- Funds time series ----- + # Funds time series ledger = ( db.query(LedgerEntry) .filter(LedgerEntry.company_id == company_id) @@ -255,13 +259,14 @@ def query_state(): total_delta = 
sum(int(e.amount_cents) for e in ledger) initial_funds = int(company.funds_cents) - total_delta running = initial_funds - funds_times, funds_vals = [], [] + funds_times, funds_vals, funds_categories = [], [], [] for e in ledger: running += int(e.amount_cents) funds_times.append(e.occurred_at) funds_vals.append(running / 100) + funds_categories.append(e.category.value if hasattr(e.category, "value") else str(e.category)) - # ----- Tasks ----- + # Tasks tasks = db.query(Task).filter(Task.company_id == company_id).all() task_counts = {} for s in TaskStatus: @@ -270,7 +275,7 @@ def query_state(): completed_tasks = [t for t in tasks if t.status == TaskStatus.COMPLETED_SUCCESS] total_reward = sum(t.reward_funds_cents for t in completed_tasks) - # ----- Prestige (current snapshot) ----- + # Prestige (current snapshot) prestige_rows = db.query(CompanyPrestige).filter( CompanyPrestige.company_id == company_id ).all() @@ -279,7 +284,7 @@ def query_state(): for p in prestige_rows } - # ----- Prestige time series ----- + # Prestige time series all_domains = sorted(prestige.keys()) completed_ordered = ( db.query(Task) @@ -331,7 +336,7 @@ def query_state(): prestige_series[domain]["levels"].append(round(domain_levels[domain], 4)) last_event_time = t.completed_at - # ----- Trust (current snapshot) ----- + # Trust (current snapshot) trust_rows = ( db.query(ClientTrust, Client.name, Client.tier) .join(Client, Client.id == ClientTrust.client_id) @@ -346,7 +351,7 @@ def query_state(): client_names = {str(ct.client_id): name for ct, name, _ in trust_rows} client_tiers = {str(ct.client_id): tier for ct, _, tier in trust_rows} - # ----- Trust time series ----- + # Trust time series client_tasks = ( db.query(Task) .filter( @@ -375,7 +380,7 @@ def query_state(): if cid not in trust_levels: continue - if last_trust_time and t.completed_at > last_trust_time: + if last_trust_time and t.completed_at and t.completed_at > last_trust_time: days_elapsed = (t.completed_at - 
last_trust_time).total_seconds() / 86400 decay = wc.trust_decay_per_day * days_elapsed for k in trust_levels: @@ -406,6 +411,7 @@ def query_state(): "funds_cents": company.funds_cents, "funds_times": funds_times, "funds_vals": funds_vals, + "funds_categories": funds_categories, "task_counts": task_counts, "total_reward": total_reward, "completed": task_counts.get("completed_success", 0), @@ -424,10 +430,44 @@ def query_state(): # --------------------------------------------------------------------------- -# Layout +# Load transcript +# --------------------------------------------------------------------------- + +def _load_transcript(primary_db: Path) -> list[dict]: + """Load live transcript JSONL file, or fall back to result JSON.""" + transcript_path = primary_db.with_suffix(".transcript.jsonl") + if transcript_path.exists(): + entries = [] + try: + with open(transcript_path) as f: + for line in f: + line = line.strip() + if line: + entries.append(_json.loads(line)) + except Exception: + pass + if entries: + return entries + + # Fall back to result JSON + result_path = Path("results") / f"yc_bench_result_{primary_db.stem}.json" + if result_path.exists(): + try: + with open(result_path) as f: + data = _json.load(f) + if "transcript" in data: + return data["transcript"] + if "episodes" in data and data["episodes"]: + return data["episodes"][-1].get("transcript", []) + except Exception: + pass + return [] + + +# --------------------------------------------------------------------------- +# Header + metrics (always visible) # --------------------------------------------------------------------------- -# Header st.markdown("""
YC-Bench @@ -447,14 +487,16 @@ if state is None: funds = state["funds_cents"] / 100 funds_color = "green" if funds > 0 else "red" runway = round(funds / (state["monthly_payroll"] / 100), 1) if state["monthly_payroll"] > 0 else float("inf") +max_prestige = max(state["prestige"].values()) if state["prestige"] else 1.0 +avg_prestige = sum(state["prestige"].values()) / len(state["prestige"]) if state["prestige"] else 1.0 -cols = st.columns(6) +cols = st.columns(5) +tasks_str = f'{state["completed"]}✓ {state["failed"]}✗ {state["active"]}⟳' metrics = [ ("Funds", f"${funds:,.0f}", funds_color), ("Sim Date", state["sim_time"].strftime("%b %d, %Y"), "blue"), - ("Completed", str(state["completed"]), "green"), - ("Failed", str(state["failed"]), "red" if state["failed"] > 0 else "yellow"), - ("Active", str(state["active"]), "purple"), + ("Prestige", f"{max_prestige:.1f}", "purple"), + ("Tasks", tasks_str, "green"), ("Runway", f"{runway:.0f}mo" if runway != float("inf") else "N/A", "yellow"), ] @@ -468,178 +510,718 @@ for col, (label, value, color) in zip(cols, metrics): st.markdown("
", unsafe_allow_html=True) -# --------------------------------------------------------------------------- -# Funds chart -# --------------------------------------------------------------------------- -if state["funds_times"]: - st.markdown('
Funds Over Time
', unsafe_allow_html=True) +# =========================================================================== +# TABS +# =========================================================================== - fig = go.Figure() - fig.add_trace(go.Scatter( - x=state["funds_times"], y=state["funds_vals"], - mode="lines", name="LLM Agent", - line=dict(color=ACCENT_GREEN, width=2), - fill="tozeroy", fillcolor="rgba(0,212,170,0.08)", - )) - # Overlay greedy baseline if available - if baseline_factory is not None: - bl_times, bl_vals = query_funds_only(baseline_factory) - if bl_times: - fig.add_trace(go.Scatter( - x=bl_times, y=bl_vals, - mode="lines", name="Greedy Bot", - line=dict(color=ACCENT_RED, width=2, dash="dot"), - )) - # Mark bankruptcy point - if bl_vals[-1] < 0: - fig.add_trace(go.Scatter( - x=[bl_times[-1]], y=[bl_vals[-1]], - mode="markers+text", name="Bankrupt", - marker=dict(color=ACCENT_RED, size=10, symbol="x"), - text=["BANKRUPT"], textposition="top center", - textfont=dict(color=ACCENT_RED, size=10), - showlegend=False, - )) - # Zero line - fig.add_hline(y=0, line_dash="dash", line_color=ACCENT_RED, opacity=0.3) - show_legend = baseline_factory is not None - fig.update_layout(**_chart_layout(yaxis_title="USD ($)", show_legend=show_legend)) - fig.update_yaxes(tickprefix="$", tickformat=",") - st.plotly_chart(fig, use_container_width=True, config={"displayModeBar": False}) +tab_charts, tab_world = st.tabs(["Charts", "World"]) # --------------------------------------------------------------------------- -# Prestige & Trust side by side +# TAB 1: Charts # --------------------------------------------------------------------------- -col_left, col_right = st.columns(2) +with tab_charts: -with col_left: - st.markdown('
Prestige by Domain
', unsafe_allow_html=True) + # Funds chart + if state["funds_times"]: + st.markdown('
Funds Over Time
', unsafe_allow_html=True) - has_series = any(len(s["times"]) > 0 for s in state["prestige_series"].values()) - if has_series: fig = go.Figure() - for i, (domain, series) in enumerate(sorted(state["prestige_series"].items())): - if not series["times"]: - continue - fig.add_trace(go.Scatter( - x=series["times"], y=series["levels"], - mode="lines+markers", name=domain.replace("_", " ").title(), - line=dict(color=CHART_COLORS[i % len(CHART_COLORS)], width=2), - marker=dict(size=3), - )) - layout = _chart_layout(yaxis_title="Prestige Level", height=350) - layout["yaxis"]["range"] = [0.5, 10.5] - fig.update_layout(**layout) - st.plotly_chart(fig, use_container_width=True, config={"displayModeBar": False}) - elif state["prestige"]: - domains = sorted(state["prestige"].keys()) - levels = [state["prestige"][d] for d in domains] - labels = [d.replace("_", " ").title() for d in domains] - fig = go.Figure(go.Bar( - x=labels, y=levels, - marker_color=[CHART_COLORS[i % len(CHART_COLORS)] for i in range(len(domains))], - marker_line=dict(width=0), + fig.add_trace(go.Scatter( + x=state["funds_times"], y=state["funds_vals"], + mode="lines", name="_".join(db_file.stem.split("_")[2:]), + line=dict(color=ACCENT_GREEN, width=2), + fill="tozeroy", fillcolor="rgba(0,212,170,0.08)", )) - layout = _chart_layout(yaxis_title="Level", height=350, show_legend=False) - layout["yaxis"]["range"] = [0, 10.5] - fig.update_layout(**layout) + # Overlay all peer runs (other models + greedy bot) + peer_colors = [ACCENT_RED, ACCENT_BLUE, ACCENT_YELLOW, ACCENT_PURPLE, ACCENT_ORANGE] + for i, (peer_label, peer_fct) in enumerate(peer_factories): + pl_times, pl_vals = query_funds_only(peer_fct) + if pl_times: + is_bot = "greedy" in peer_label.lower() or "bot" in peer_label.lower() + fig.add_trace(go.Scatter( + x=pl_times, y=pl_vals, + mode="lines", name=peer_label, + line=dict(color=peer_colors[i % len(peer_colors)], width=2, dash="dot" if is_bot else "solid"), + )) + if pl_vals[-1] < 0: + 
fig.add_trace(go.Scatter( + x=[pl_times[-1]], y=[pl_vals[-1]], + mode="markers+text", name=f"{peer_label} Bankrupt", + marker=dict(color=peer_colors[i % len(peer_colors)], size=10, symbol="x"), + text=["BANKRUPT"], textposition="top center", + textfont=dict(color=peer_colors[i % len(peer_colors)], size=10), + showlegend=False, + )) + # Annotate dips — group payroll by month, always show disputes + if len(state["funds_times"]) > 1: + cats = state.get("funds_categories", []) + + # Group payroll into monthly totals + payroll_months = {} # "YYYY-MM" -> {"total": int, "time": datetime, "val": float} + disputes_list = [] + + for i in range(1, len(state["funds_vals"])): + delta = state["funds_vals"][i] - state["funds_vals"][i - 1] + t = state["funds_times"][i] + v = state["funds_vals"][i] + cat = cats[i] if i < len(cats) else "" + + if cat == "payment_dispute": + disputes_list.append((t, v, delta)) + elif cat == "monthly_payroll" and delta < 0: + key = t.strftime("%Y-%m") + if key not in payroll_months: + payroll_months[key] = {"total": 0, "time": t, "val": v} + payroll_months[key]["total"] += delta + payroll_months[key]["val"] = v # use final value after all deductions + + ay_flip = -1 + for t, v, delta in disputes_list: + ay_flip *= -1 + fig.add_annotation( + x=t, y=v, text=f"Dispute -${abs(delta):,.0f}", + showarrow=True, arrowhead=2, arrowsize=0.8, arrowcolor=ACCENT_RED, + font=dict(size=9, color=ACCENT_RED), bgcolor="#1a1d23", bordercolor=ACCENT_RED, + borderwidth=1, borderpad=3, ax=0, ay=ay_flip * 35, + ) + + for key, pm in payroll_months.items(): + fig.add_annotation( + x=pm["time"], y=pm["val"], text=f"Payroll -${abs(pm['total']):,.0f}", + showarrow=False, + font=dict(size=8, color=TEXT_MUTED), yshift=-14, + ) + + fig.add_hline(y=0, line_dash="dash", line_color=ACCENT_RED, opacity=0.3) + show_legend = len(peer_factories) > 0 + fig.update_layout(**_chart_layout(yaxis_title="USD ($)", show_legend=show_legend)) + fig.update_yaxes(tickprefix="$", tickformat=",") 
st.plotly_chart(fig, use_container_width=True, config={"displayModeBar": False}) -with col_right: - st.markdown('
Client Trust
', unsafe_allow_html=True) + # Prestige & Trust side by side + col_left, col_right = st.columns(2) - has_trust_series = any(len(s["times"]) > 0 for s in state["trust_series"].values()) - if has_trust_series: - fig = go.Figure() - # Sort clients by final trust level (highest first) - sorted_clients = sorted( - state["trust_series"].items(), - key=lambda x: x[1]["levels"][-1] if x[1]["levels"] else 0, - reverse=True, - ) - for i, (client, series) in enumerate(sorted_clients): - if not series["times"]: - continue - tier = None - for cid, name in state["client_names"].items(): - if name == client: - tier = state["client_tiers"].get(cid) - break - label = f"{client} [{tier}]" if tier else client - fig.add_trace(go.Scatter( - x=series["times"], y=series["levels"], - mode="lines", name=label, - line=dict(color=CHART_COLORS[i % len(CHART_COLORS)], width=2), + with col_left: + st.markdown('
Prestige by Domain
', unsafe_allow_html=True) + + has_series = any(len(s["times"]) > 0 for s in state["prestige_series"].values()) + if has_series: + fig = go.Figure() + for i, (domain, series) in enumerate(sorted(state["prestige_series"].items())): + if not series["times"]: + continue + fig.add_trace(go.Scatter( + x=series["times"], y=series["levels"], + mode="lines+markers", name=domain.replace("_", " ").title(), + line=dict(color=CHART_COLORS[i % len(CHART_COLORS)], width=2), + marker=dict(size=3), + )) + layout = _chart_layout(yaxis_title="Prestige Level", height=350) + layout["yaxis"]["range"] = [0.5, 10.5] + fig.update_layout(**layout) + st.plotly_chart(fig, use_container_width=True, config={"displayModeBar": False}) + elif state["prestige"]: + domains = sorted(state["prestige"].keys()) + levels = [state["prestige"][d] for d in domains] + labels = [d.replace("_", " ").title() for d in domains] + fig = go.Figure(go.Bar( + x=labels, y=levels, + marker_color=[CHART_COLORS[i % len(CHART_COLORS)] for i in range(len(domains))], + marker_line=dict(width=0), )) - layout = _chart_layout(yaxis_title="Trust Level", height=350) - layout["yaxis"]["range"] = [-0.2, 5.5] - fig.update_layout(**layout) - st.plotly_chart(fig, use_container_width=True, config={"displayModeBar": False}) - elif state["trusts"]: + layout = _chart_layout(yaxis_title="Level", height=350, show_legend=False) + layout["yaxis"]["range"] = [0, 10.5] + fig.update_layout(**layout) + st.plotly_chart(fig, use_container_width=True, config={"displayModeBar": False}) + + with col_right: + st.markdown('
Client Trust
', unsafe_allow_html=True) + + has_trust_series = any(len(s["times"]) > 0 for s in state["trust_series"].values()) + if has_trust_series: + fig = go.Figure() + sorted_clients = sorted( + state["trust_series"].items(), + key=lambda x: x[1]["levels"][-1] if x[1]["levels"] else 0, + reverse=True, + ) + for i, (client, series) in enumerate(sorted_clients): + if not series["times"]: + continue + tier = None + for cid, name in state["client_names"].items(): + if name == client: + tier = state["client_tiers"].get(cid) + break + label = f"{client} [{tier}]" if tier else client + fig.add_trace(go.Scatter( + x=series["times"], y=series["levels"], + mode="lines", name=label, + line=dict(color=CHART_COLORS[i % len(CHART_COLORS)], width=2), + )) + layout = _chart_layout(yaxis_title="Trust Level", height=350) + layout["yaxis"]["range"] = [-0.2, 5.5] + fig.update_layout(**layout) + st.plotly_chart(fig, use_container_width=True, config={"displayModeBar": False}) + elif state["trusts"]: + df_t = pd.DataFrame(state["trusts"]) + df_t["label"] = df_t.apply(lambda r: f"{r['client']} [{r['tier']}]", axis=1) + df_t = df_t.sort_values("trust", ascending=True) + fig = go.Figure(go.Bar( + x=df_t["trust"], y=df_t["label"], + orientation="h", + marker_color=ACCENT_BLUE, + marker_line=dict(width=0), + )) + layout = _chart_layout(height=350, show_legend=False) + layout["xaxis"]["range"] = [0, 5.5] + layout["xaxis"]["title"] = "Trust Level" + layout["margin"]["l"] = 140 + fig.update_layout(**layout) + st.plotly_chart(fig, use_container_width=True, config={"displayModeBar": False}) + + # Current trust snapshot + if state["trusts"]: + st.markdown('
Current Trust Snapshot
', unsafe_allow_html=True) df_t = pd.DataFrame(state["trusts"]) df_t["label"] = df_t.apply(lambda r: f"{r['client']} [{r['tier']}]", axis=1) df_t = df_t.sort_values("trust", ascending=True) + + colors = [] + for _, row in df_t.iterrows(): + t = row["trust"] + if t >= 3.0: + colors.append(ACCENT_GREEN) + elif t >= 1.0: + colors.append(ACCENT_BLUE) + elif t > 0: + colors.append(ACCENT_YELLOW) + else: + colors.append(GRID_COLOR) + fig = go.Figure(go.Bar( x=df_t["trust"], y=df_t["label"], orientation="h", - marker_color=ACCENT_BLUE, + marker_color=colors, marker_line=dict(width=0), + text=[f"{t:.2f}" for t in df_t["trust"]], + textposition="outside", + textfont=dict(size=11, color=TEXT_MUTED), )) - layout = _chart_layout(height=350, show_legend=False) + layout = _chart_layout(height=max(200, len(df_t) * 35 + 60), show_legend=False) layout["xaxis"]["range"] = [0, 5.5] layout["xaxis"]["title"] = "Trust Level" - layout["margin"]["l"] = 140 + layout["margin"]["l"] = 160 fig.update_layout(**layout) st.plotly_chart(fig, use_container_width=True, config={"displayModeBar": False}) -# --------------------------------------------------------------------------- -# Current trust snapshot (horizontal bar) -# --------------------------------------------------------------------------- - -if state["trusts"]: - st.markdown('
Current Trust Snapshot
', unsafe_allow_html=True) - df_t = pd.DataFrame(state["trusts"]) - df_t["label"] = df_t.apply(lambda r: f"{r['client']} [{r['tier']}]", axis=1) - df_t = df_t.sort_values("trust", ascending=True) - - colors = [] - for _, row in df_t.iterrows(): - t = row["trust"] - if t >= 3.0: - colors.append(ACCENT_GREEN) - elif t >= 1.0: - colors.append(ACCENT_BLUE) - elif t > 0: - colors.append(ACCENT_YELLOW) - else: - colors.append(GRID_COLOR) - - fig = go.Figure(go.Bar( - x=df_t["trust"], y=df_t["label"], - orientation="h", - marker_color=colors, - marker_line=dict(width=0), - text=[f"{t:.2f}" for t in df_t["trust"]], - textposition="outside", - textfont=dict(size=11, color=TEXT_MUTED), - )) - layout = _chart_layout(height=max(200, len(df_t) * 35 + 60), show_legend=False) - layout["xaxis"]["range"] = [0, 5.5] - layout["xaxis"]["title"] = "Trust Level" - layout["margin"]["l"] = 160 - fig.update_layout(**layout) - st.plotly_chart(fig, use_container_width=True, config={"displayModeBar": False}) +def _esc(s: str) -> str: + """Escape HTML.""" + return s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;") # --------------------------------------------------------------------------- +# TAB 2: World State — god mode view +# --------------------------------------------------------------------------- + +def query_world(): + """Query full world state including hidden info.""" + wc = get_world_config() + with session_scope(factory) as db: + sim = db.query(SimState).first() + if not sim: + return None + company_id = sim.company_id + + from yc_bench.db.models.task import TaskAssignment + from yc_bench.db.models.employee import EmployeeSkillRate + from yc_bench.db.models.ledger import LedgerCategory + from sqlalchemy import func as sqlfunc + + # ── Employees ── + employees = db.query(Employee).filter(Employee.company_id == company_id).order_by(Employee.name).all() + emp_data = [] + for emp in employees: + assignments = ( + db.query(TaskAssignment, Task) + .join(Task, Task.id == 
TaskAssignment.task_id) + .filter(TaskAssignment.employee_id == emp.id, Task.status.in_([TaskStatus.PLANNED, TaskStatus.ACTIVE])) + .all() + ) + tasks_assigned = [{"title": t.title, "status": t.status.value} for _, t in assignments] + skills = db.query(EmployeeSkillRate).filter(EmployeeSkillRate.employee_id == emp.id).all() + skill_map = { + (s.domain.value if hasattr(s.domain, "value") else str(s.domain)): round(float(s.rate_domain_per_hour), 2) + for s in skills + } + # Count completed tasks for this employee + completed_n = ( + db.query(sqlfunc.count(TaskAssignment.task_id)) + .join(Task, Task.id == TaskAssignment.task_id) + .filter(TaskAssignment.employee_id == emp.id, Task.status == TaskStatus.COMPLETED_SUCCESS) + .scalar() or 0 + ) + emp_data.append({ + "name": emp.name, "tier": emp.tier, + "salary_cents": emp.salary_cents, "skills": skill_map, + "tasks": tasks_assigned, "completed": completed_n, + "hours_per_day": float(emp.work_hours_per_day), + }) + + # ── Clients (with hidden loyalty) ── + clients_raw = ( + db.query(Client, ClientTrust) + .join(ClientTrust, ClientTrust.client_id == Client.id) + .filter(ClientTrust.company_id == company_id) + .order_by(Client.name) + .all() + ) + client_data = [] + for c, ct in clients_raw: + success_n = db.query(sqlfunc.count(Task.id)).filter( + Task.company_id == company_id, Task.client_id == c.id, Task.status == TaskStatus.COMPLETED_SUCCESS).scalar() or 0 + fail_n = db.query(sqlfunc.count(Task.id)).filter( + Task.company_id == company_id, Task.client_id == c.id, Task.status == TaskStatus.COMPLETED_FAIL).scalar() or 0 + active_n = db.query(sqlfunc.count(Task.id)).filter( + Task.company_id == company_id, Task.client_id == c.id, Task.status == TaskStatus.ACTIVE).scalar() or 0 + + # Dispute stats (hidden info) + completed_tasks = db.query(Task).filter( + Task.company_id == company_id, Task.client_id == c.id, Task.status == TaskStatus.COMPLETED_SUCCESS).all() + task_ids = [t.id for t in completed_tasks] + dispute_n = 0 + 
dispute_total = 0 + listed_total = sum((t.advertised_reward_cents or t.reward_funds_cents) for t in completed_tasks) + received_total = 0 + if task_ids: + from yc_bench.db.models.ledger import LedgerEntry + reward_sum = db.query(sqlfunc.sum(LedgerEntry.amount_cents)).filter( + LedgerEntry.company_id == company_id, LedgerEntry.category == LedgerCategory.TASK_REWARD, + LedgerEntry.ref_id.in_(task_ids)).scalar() + received_total = int(reward_sum) if reward_sum else 0 + dispute_entries = db.query(LedgerEntry).filter( + LedgerEntry.company_id == company_id, LedgerEntry.category == LedgerCategory.PAYMENT_DISPUTE, + LedgerEntry.ref_id.in_(task_ids)).all() + dispute_n = len(dispute_entries) + dispute_total = sum(abs(int(e.amount_cents)) for e in dispute_entries) + + # Hidden mechanics — what would happen at current trust + trust_val = float(ct.trust_level) + loyalty_class = "RAT" if c.loyalty < -0.3 else ("LOYAL" if c.loyalty > 0.3 else "NEUTRAL") + effects_active = loyalty_class == "RAT" + scope_creep_pct = 0.0 + dispute_prob = 0.0 + if effects_active: + intensity = abs(c.loyalty) + scope_creep_pct = wc.scope_creep_max * intensity * 100 + dispute_prob = wc.dispute_prob_max * intensity * 100 + + client_data.append({ + "name": c.name, "tier": c.tier, "trust": trust_val, + "specialties": c.specialty_domains or [], + "reward_mult": c.reward_multiplier, + "loyalty": c.loyalty, "loyalty_class": loyalty_class, + "active": active_n, "completed": success_n, "failed": fail_n, + "listed_total": listed_total, "received_total": received_total, + "dispute_n": dispute_n, "dispute_total": dispute_total, + "effects_active": effects_active, + "scope_creep_pct": scope_creep_pct, "dispute_prob": dispute_prob, + }) + + # ── Active tasks ── + active_tasks = db.query(Task).filter( + Task.company_id == company_id, Task.status.in_([TaskStatus.ACTIVE, TaskStatus.PLANNED]) + ).order_by(Task.accepted_at).all() + task_data = [] + for t in active_tasks: + reqs = 
db.query(TaskRequirement).filter(TaskRequirement.task_id == t.id).all() + assigns = db.query(TaskAssignment).filter(TaskAssignment.task_id == t.id).all() + emp_names = [] + for a in assigns: + e = db.query(Employee).filter(Employee.id == a.employee_id).one_or_none() + if e: emp_names.append(e.name) + total_req = sum(float(r.required_qty) for r in reqs) + total_done = sum(float(r.completed_qty) for r in reqs) + pct = (total_done / total_req * 100) if total_req > 0 else 0 + domains = [{ + "domain": r.domain.value if hasattr(r.domain, "value") else str(r.domain), + "done": int(r.completed_qty), "total": int(r.required_qty), + "pct": int(float(r.completed_qty) / float(r.required_qty) * 100) if float(r.required_qty) > 0 else 0, + } for r in reqs] + client_name = "" + client_loyalty_class = "" + if t.client_id: + cl = db.query(Client).filter(Client.id == t.client_id).one_or_none() + if cl: + client_name = cl.name + client_loyalty_class = "RAT" if cl.loyalty < -0.3 else ("LOYAL" if cl.loyalty > 0.3 else "NEUTRAL") + # Scope creep detection: compare advertised vs actual reward + was_scope_creeped = False + if t.advertised_reward_cents and t.advertised_reward_cents != t.reward_funds_cents: + was_scope_creeped = True + + task_data.append({ + "title": t.title, "client": client_name, + "client_loyalty": client_loyalty_class, + "status": t.status.value, "reward": t.reward_funds_cents, + "advertised_reward": t.advertised_reward_cents or t.reward_funds_cents, + "prestige_req": t.required_prestige, + "prestige_delta": float(t.reward_prestige_delta) if t.reward_prestige_delta else 0.0, + "skill_boost_pct": float(t.skill_boost_pct) if t.skill_boost_pct else 0.0, + "trust_req": int(t.required_trust) if t.required_trust else 0, + "progress_pct": pct, + "deadline": t.deadline, + "at_risk": t.deadline and t.status == TaskStatus.ACTIVE and sim.sim_time > t.deadline, + "domains": domains, "employees": emp_names, + "was_scope_creeped": was_scope_creeped, + }) + + # ── Recent completed ── + 
recent = db.query(Task).filter( + Task.company_id == company_id, + Task.status.in_([TaskStatus.COMPLETED_SUCCESS, TaskStatus.COMPLETED_FAIL]) + ).order_by(Task.completed_at.desc()).limit(10).all() + recent_data = [] + for t in recent: + cn = "" + if t.client_id: + cl = db.query(Client).filter(Client.id == t.client_id).one_or_none() + if cl: cn = cl.name + recent_data.append({ + "title": t.title, "client": cn, + "success": t.success, "reward": t.reward_funds_cents, + "completed_at": t.completed_at, + }) + + # ── Trust effects log ── + from yc_bench.db.models.ledger import LedgerEntry + from yc_bench.db.models.event import SimEvent, EventType + + # Payment disputes + dispute_entries = ( + db.query(LedgerEntry) + .filter(LedgerEntry.company_id == company_id, LedgerEntry.category == LedgerCategory.PAYMENT_DISPUTE) + .order_by(LedgerEntry.occurred_at.desc()) + .all() + ) + disputes = [] + for d in dispute_entries: + # Find the task and client + task_row = db.query(Task).filter(Task.id == d.ref_id).one_or_none() + cn = "" + if task_row and task_row.client_id: + cl = db.query(Client).filter(Client.id == task_row.client_id).one_or_none() + if cl: cn = cl.name + disputes.append({ + "date": d.occurred_at, "amount": abs(int(d.amount_cents)), + "client": cn, "task": task_row.title if task_row else "?", + }) + + # Scope-creeped tasks (advertised != actual reward or we detect it from task data) + all_company_tasks = db.query(Task).filter( + Task.company_id == company_id, + Task.advertised_reward_cents.isnot(None), + ).order_by(Task.accepted_at).all() + scope_creeps = [] + for t in all_company_tasks: + # We stored advertised_reward = reward at accept time, so they're equal + # Scope creep inflates required_qty but keeps deadline based on original qty + # We can detect it by checking if the client is a RAT and trust was above threshold + if t.client_id: + cl = db.query(Client).filter(Client.id == t.client_id).one_or_none() + if cl and cl.loyalty < -0.3: + ct_row = 
db.query(ClientTrust).filter( + ClientTrust.company_id == company_id, ClientTrust.client_id == t.client_id + ).one_or_none() + # Check if this task was accepted when trust > threshold + # We can't know exact trust at accept time, but if task failed deadline it's a hint + # For god-mode, just flag all tasks from RAT clients + scope_creeps.append({ + "title": t.title, "client": cl.name, + "accepted": t.accepted_at, + "status": t.status.value if hasattr(t.status, "value") else str(t.status), + "failed_deadline": t.status == TaskStatus.COMPLETED_FAIL, + }) + + # Work reduction stats: count tasks from loyal clients + loyal_tasks_completed = 0 + rat_tasks_completed = 0 + for t in all_company_tasks: + if t.status != TaskStatus.COMPLETED_SUCCESS: + continue + if t.client_id: + cl = db.query(Client).filter(Client.id == t.client_id).one_or_none() + if cl: + if cl.loyalty > 0.3: loyal_tasks_completed += 1 + elif cl.loyalty < -0.3: rat_tasks_completed += 1 + + # Pending dispute events + pending_disputes = db.query(SimEvent).filter( + SimEvent.company_id == company_id, + SimEvent.event_type == EventType.PAYMENT_DISPUTE, + SimEvent.consumed == False, + ).count() + + trust_effects = { + "disputes": disputes, + "total_disputed": sum(d["amount"] for d in disputes), + "scope_creeps": scope_creeps, + "scope_creep_fails": sum(1 for s in scope_creeps if s["failed_deadline"]), + "loyal_completed": loyal_tasks_completed, + "rat_completed": rat_tasks_completed, + "pending_disputes": pending_disputes, + } + + # ── Market overview ── + from collections import defaultdict + market_tasks = db.query(Task).filter(Task.status == TaskStatus.MARKET).all() + market_by_prestige = defaultdict(lambda: {"count": 0, "total_reward": 0}) + for t in market_tasks: + market_by_prestige[t.required_prestige]["count"] += 1 + market_by_prestige[t.required_prestige]["total_reward"] += t.reward_funds_cents + market_overview = [] + for p in sorted(market_by_prestige): + d = market_by_prestige[p] + avg = 
d["total_reward"] / d["count"] if d["count"] else 0 + market_overview.append({"prestige": p, "count": d["count"], "avg_reward": avg}) + + return {"employees": emp_data, "clients": client_data, "active_tasks": task_data, + "recent": recent_data, "sim_time": sim.sim_time, "wc": wc, + "trust_effects": trust_effects, "market": market_overview} + + +with tab_world: + world = query_world() + if world is None: + st.warning("No simulation data.") + else: + god_mode = True # always on + + # ── Legend ── + st.markdown(""" +
+ + + + + + +
+ Tasks
+ 🔵 = in progress, ⏳ = planned (not started)
+ Each domain shows a progress bar: done / required units
+ ⚠ OVERDUE = deadline has passed
+ 🐀 RAT = client is adversarial
+ 📈 CREEP = work was secretly inflated +
+ Employees
+ Skill bars = work rate per domain (0-10, higher = faster)
+ JUNIOR low pay, low skills · + MID · + SENIOR high pay, high skills
+ IDLE = not assigned to any task (wasting salary) +
+ Clients
+ Trust 0-5: builds with completed tasks, decays over time
+ Multiplier: hidden reward scaling (agent can't see)
+ 🐀 RAT = scope creep + payment disputes at high trust
+ ✦ LOYAL = work reduction at high trust
+ Listed vs Net = reward promised vs actually received
+ ✓ completed · ✗ failed · ⟳ in progress +
+
+ """, unsafe_allow_html=True) + + # ════════════ TWO-COLUMN GRID: Left = Tasks+Employees, Right = Clients ════════════ + col_left, col_right = st.columns([3, 2]) + + # ── LEFT COLUMN: Active Tasks + Employees ── + with col_left: + st.markdown('
Active Tasks
', unsafe_allow_html=True) + if not world["active_tasks"]: + st.markdown('
No active tasks.
', unsafe_allow_html=True) + + for t in world["active_tasks"]: + rw = f"${t['reward']/100:,.0f}" + dl = t["deadline"].strftime("%b %d") if t["deadline"] else "—" + sc = ACCENT_PURPLE if t["status"] == "active" else ACCENT_YELLOW + icon = "🔵" if t["status"] == "active" else "⏳" + pct = t["progress_pct"] + bc = ACCENT_GREEN if pct >= 100 else (ACCENT_RED if t["at_risk"] else ACCENT_BLUE) + risk = f' ⚠ OVERDUE' if t["at_risk"] else "" + + hidden = "" + if god_mode: + if t.get("client_loyalty") == "RAT": + hidden += '🐀 RAT' + if t.get("was_scope_creeped"): + hidden += '📈 CREEP' + + dom_pills = "" + for d in t["domains"]: + dp = d["pct"]; dc = ACCENT_GREEN if dp >= 100 else ACCENT_BLUE + w = min(dp, 100) + dom_pills += ( + f'
' + f'{d["domain"]}' + f'
' + f'
' + f'{d["done"]:,}/{d["total"]:,}' + f'
') + + emps = ", ".join(t["employees"][:5]) if t["employees"] else f'none' + + trust_req_html = f'Trust req: {t["trust_req"]}' if t["trust_req"] else '' + st.markdown( + f'
' + f'
' + f'{icon} {_esc(t["title"])} {risk}{hidden}' + f'{rw}
' + f'
' + f'Client: {_esc(t["client"]) or "—"}' + f'Min prestige: {t["prestige_req"]}' + f'Deadline: {dl}' + f'Prestige reward: +{t["prestige_delta"]:.2f}' + f'Skill boost: {t["skill_boost_pct"]*100:.1f}%' + f'{trust_req_html}
', + unsafe_allow_html=True, + ) + # Domains + employees in separate markdown call to avoid HTML length issues + st.markdown( + f'
' + f'{dom_pills}' + f'
👥 {emps}
' + f'
', + unsafe_allow_html=True, + ) + + # ── EMPLOYEES (compact grid — 2 per row) ── + st.markdown('
Employees
', unsafe_allow_html=True) + emp_cols = st.columns(2) + for i, emp in enumerate(world["employees"]): + tc_map = {"junior": ACCENT_BLUE, "mid": ACCENT_YELLOW, "senior": ACCENT_PURPLE} + tc = tc_map.get(emp["tier"], ACCENT_BLUE) + idle = len(emp["tasks"]) == 0 + sal = f"${emp['salary_cents']/100:,.0f}" + + skills_html = "" + for domain in sorted(emp["skills"]): + rate = emp["skills"][domain] + w = min(rate / 10.0 * 100, 100) + skills_html += ( + f'
' + f'{domain}' + f'
' + f'
' + f'{rate:.1f}
') + + work = "" + if idle: + work = f'IDLE' + else: + for ta in emp["tasks"]: + si = "🔵" if ta["status"] == "active" else "⏳" + work += f'
{si} {_esc(ta["title"])}
' + + bc = f"{ACCENT_RED}44" if idle else "#2a2d3544" + with emp_cols[i % 2]: + st.markdown(f""" +
+
+ {_esc(emp['name'])} + {emp['tier'].upper()} + {sal}/mo +
+
+
{skills_html}
+
{work}
+
+
{emp['completed']} tasks completed · {emp['hours_per_day']:.0f}h/day
+
""", unsafe_allow_html=True) + + # ── RIGHT COLUMN: Clients ── + with col_right: + st.markdown('
Clients
', unsafe_allow_html=True) + for c in sorted(world["clients"], key=lambda x: -x["trust"]): + trust = c["trust"] + trust_pct = trust / 5.0 * 100 + tier_map = {"Standard": ACCENT_BLUE, "Premium": ACCENT_YELLOW, "Enterprise": ACCENT_PURPLE} + tc = tier_map.get(c["tier"], ACCENT_BLUE) + bar_c = ACCENT_GREEN if trust >= 3 else (ACCENT_BLUE if trust >= 1 else GRID_COLOR) + + lc = c["loyalty_class"] + border_c = "#2a2d3544" + loyalty_badge = "" + hidden_row = "" + if god_mode: + if lc == "RAT": + border_c = f"{ACCENT_RED}55" + loyalty_badge = f'🐀 RAT {c["loyalty"]:+.2f}' + elif lc == "LOYAL": + border_c = f"{ACCENT_GREEN}33" + loyalty_badge = f'✦ LOYAL {c["loyalty"]:+.2f}' + else: + loyalty_badge = f'— {c["loyalty"]:+.2f}' + + if c["effects_active"]: + hidden_row = ( + f'
' + f'Next task from this client: work inflated +{c["scope_creep_pct"]:.0f}%, ' + f'{c["dispute_prob"]:.0f}% chance of clawing back up to ' + f'{world["wc"].dispute_clawback_max * abs(c["loyalty"]) * (c["trust"] - world["wc"].loyalty_reveal_trust) / (world["wc"].trust_max - world["wc"].loyalty_reveal_trust) * 100:.0f}% of reward
') + elif lc == "RAT": + reveal = world["wc"].loyalty_reveal_trust + hidden_row = ( + f'
' + f'RAT effects dormant until trust reaches {reveal:.1f} (currently {trust:.1f})
') + + record = f'{c["completed"]}✓' + if c["failed"]: record += f' {c["failed"]}✗' + if c["active"]: record += f' {c["active"]}⟳' + + finance = "" + if god_mode and c["completed"] > 0: + net = c["received_total"] - c["dispute_total"] + lost = c["dispute_total"] + finance_parts = f'Promised: ${c["listed_total"]/100:,.0f}' + if lost > 0: + finance_parts += ( + f' · Received: ${net/100:,.0f}' + f' · {c["dispute_n"]} dispute{"s" if c["dispute_n"] != 1 else ""}' + f' (-${lost/100:,.0f})') + else: + finance_parts += f' · Received: ${net/100:,.0f}' + finance = f'
{finance_parts}
' + + spec = " · ".join(c["specialties"]) if c["specialties"] else "—" + + st.markdown( + f'
' + f'
' + f'
' + f'{_esc(c["name"])}' + f'{c["tier"]}' + f'{loyalty_badge}
' + f'
' + f'
' + f'
' + f'{trust:.1f}
' + f'
' + f'{spec}' + f'{c["reward_mult"]:.2f}x' + f'{record}
' + f'{finance}{hidden_row}
', + unsafe_allow_html=True, + ) + + + # Auto-refresh -# --------------------------------------------------------------------------- - -st.markdown("
", unsafe_allow_html=True) -col_r1, col_r2 = st.columns([3, 1]) -with col_r2: - auto = st.toggle("Auto-refresh", value=True) -if auto: - time.sleep(5) - st.rerun() +time.sleep(5) +st.rerun() diff --git a/src/yc_bench/agent/loop.py b/src/yc_bench/agent/loop.py index fee8f98..04cf2f1 100644 --- a/src/yc_bench/agent/loop.py +++ b/src/yc_bench/agent/loop.py @@ -41,6 +41,11 @@ def _snapshot_state(db: Session, company_id): Employee.company_id == company_id, ).scalar() or 0 + # Read scratchpad if it exists + from ..db.models.scratchpad import Scratchpad + scratchpad = db.query(Scratchpad).filter(Scratchpad.company_id == company_id).one_or_none() + scratchpad_content = scratchpad.content if scratchpad and scratchpad.content else None + return { "sim_time": sim_state.sim_time.isoformat(), "horizon_end": sim_state.horizon_end.isoformat(), @@ -50,6 +55,7 @@ def _snapshot_state(db: Session, company_id): "employee_count": employee_count, "monthly_payroll_cents": int(monthly_payroll), "bankrupt": company.funds_cents < 0, + "scratchpad": scratchpad_content, } @@ -157,6 +163,7 @@ def run_agent_loop( RuntimeTurnRequest( session_id=run_state.session_id, user_input=user_input, + scratchpad=snapshot.get("scratchpad"), ) ) agent_output = result.final_output diff --git a/src/yc_bench/agent/prompt.py b/src/yc_bench/agent/prompt.py index 52fdb4d..71dd8ea 100644 --- a/src/yc_bench/agent/prompt.py +++ b/src/yc_bench/agent/prompt.py @@ -2,64 +2,55 @@ from __future__ import annotations SYSTEM_PROMPT = """\ -You are the autonomous CEO of an AI startup in a deterministic business simulation. \ -Your goal is to maximize company prestige and funds over the simulation horizon while avoiding bankruptcy. +You are the CEO of a startup in a business simulation. Maximize funds and prestige while avoiding bankruptcy. -## How It Works +All actions use `yc-bench` CLI commands via `run_command`. All return JSON. 
-- All actions are performed via the `run_command` tool, which executes `yc-bench` CLI commands. -- All commands return JSON. Parse the output to make decisions. -- Simulation progression and event processing are managed by the benchmark runtime. -- Business hours are weekdays 09:00-18:00. Nights, weekends, and Feb 29 are skipped. -- Payroll is deducted automatically on the first business day of each month. -- If funds go below zero after any event or payroll, the company goes bankrupt and the run ends. +## Core Workflow (repeat every turn) -## Available Commands +**You must always have active tasks running. Every turn, follow this loop:** + +1. `yc-bench market browse` — pick a task +2. `yc-bench task accept --task-id Task-42` — accept it +3. `yc-bench task assign --task-id Task-42 --employees Emp_1,Emp_4,Emp_7` — assign employees (check `employee list` for skill rates) +4. `yc-bench task dispatch --task-id Task-42` — start work +5. `yc-bench sim resume` — advance to next event (requires active tasks) + +Run multiple tasks concurrently when possible. Accept → assign → dispatch a second task before calling sim resume. + +**Use `yc-bench scratchpad write`** to save strategy notes — your conversation history is truncated after 20 turns, but scratchpad persists in the system prompt. Write rules, not events (e.g. "assign Emp_1,Emp_4,Emp_7 for inference tasks" not "Task-42 failed"). + +## Commands ### Observe -- `yc-bench company status` — funds, prestige, employee count, payroll, bankruptcy risk -- `yc-bench employee list` — list all employees with IDs, tier (junior/mid/senior), salaries, and current assignments -- `yc-bench market browse [--domain X] [--required-prestige-lte N] [--reward-min-cents N] [--limit N] [--offset N]` — browse available tasks (default limit 50; the response includes a `total` field — if total > 50, paginate with --offset to see more). Tasks show `client_name` and `required_trust`. 
-- `yc-bench task list [--status X]` — list your tasks (planned, active, completed, cancelled) -- `yc-bench task inspect --task-id ` — detailed task info (requirements, assignments, progress, client, trust requirement) -- `yc-bench client list` — show all clients with current trust levels -- `yc-bench finance ledger [--from MM/DD/YYYY] [--to MM/DD/YYYY] [--category X]` — financial history -- `yc-bench report monthly [--from-month YYYY-MM] [--to-month YYYY-MM]` — monthly P&L -- `yc-bench scratchpad read` — read your persistent notes - -### Memory (scratchpad) -- `yc-bench scratchpad write --content "text"` — overwrite scratchpad with new notes -- `yc-bench scratchpad append --content "text"` — append a line to existing notes -- `yc-bench scratchpad clear` — erase all notes -- Use the scratchpad to store key decisions, task deadlines, employee assignments, and strategy notes. Context is periodically truncated — anything important should be written here. +- `yc-bench company status` — funds, prestige, payroll +- `yc-bench employee list` — employees with skill rates per domain +- `yc-bench market browse [--domain X] [--reward-min-cents N] [--limit N]` — available tasks +- `yc-bench task list [--status X]` — your tasks +- `yc-bench task inspect --task-id Task-42` — task details +- `yc-bench client list` — clients with trust levels +- `yc-bench client history` — per-client success/failure rates +- `yc-bench finance ledger` — financial history +- `yc-bench scratchpad read` — your persistent notes ### Act -- `yc-bench task accept --task-id ` — accept a market task (sets deadline, generates replacement) -- `yc-bench task assign --task-id --employee-id ` — assign employee to task -- `yc-bench task dispatch --task-id ` — start work on a planned task (must have assignments) -- `yc-bench task cancel --task-id --reason "text"` — cancel a task (prestige penalty: 1.2x reward delta) -- `yc-bench sim resume` — advance simulation to the next checkpoint event and return wake events +- 
`yc-bench task accept --task-id Task-42` — accept from market +- `yc-bench task assign --task-id Task-42 --employees Emp_1,Emp_4,Emp_7` — assign employees (comma-separated) +- `yc-bench task dispatch --task-id Task-42` — start work (must assign first) +- `yc-bench task cancel --task-id Task-42 --reason "text"` — cancel (prestige penalty) +- `yc-bench sim resume` — advance time +- `yc-bench scratchpad write --content "text"` — save notes +- `yc-bench scratchpad append --content "text"` — append notes -## Key Rules +## Key Mechanics -- Task completion at or before deadline = success (reward funds + prestige + skill boost + client trust gain) -- Task completion after deadline = failure (0.8x prestige penalty, no reward, trust penalty) -- Task cancellation = 1.2x prestige penalty per domain + trust penalty (worse than failure) -- Employee throughput = base_rate / number_of_active_tasks_assigned -- Time advances only when you run `yc-bench sim resume` — it jumps to the next event (task milestone at 25/50/75%, task completion, or monthly payroll). **Warning**: calling `sim resume` with no active tasks just skips to the next payroll, burning runway with zero revenue. -- Prestige is clamped [1, 10]. Funds are in cents. - -## Client Trust - -- Each task is offered by a specific **client** (e.g. "Nexus AI", "Vertex Labs"). -- Each client has **specialty domains** (e.g. "research", "training"). Tasks from a client are biased toward their specialties. -- Use `yc-bench client list` to see each client's specialties and current trust level. - -### Mechanics -- Completing tasks for a client builds **trust** [0.0–5.0]. Trust gains diminish as you approach max. -- Trusted clients require less work (up to 35% reduction at max trust). -- Some tasks require minimum trust to accept (required_trust 1-4). -- Trust decays daily. Task failure and cancellation reduce trust. +- **Salary bumps**: completed tasks raise salary for every assigned employee. 
Assigning all 8 to every task compounds payroll until it exceeds revenue — assign 3-4 domain specialists instead. +- **Throughput split**: employees on multiple active tasks split their rate (rate/sqrt(N)). Two tasks run at ~71% each. +- **Deadlines**: success before deadline = reward + prestige. Failure = prestige penalty, no reward. +- **Trust**: completing tasks for a client builds trust → less work per task, access to gated tasks. Working for one client erodes trust with others. +- **Not all clients are reliable.** Check `client history` for failure patterns. +- **Payroll**: deducted monthly. Funds < 0 = bankruptcy. +- Prestige grows per domain. Higher prestige unlocks better-paying tasks. """ @@ -74,6 +65,7 @@ def build_turn_context( monthly_payroll_cents: int, bankrupt: bool, last_wake_events: list | None = None, + scratchpad: str | None = None, ) -> str: """Build per-turn context message injected as user input.""" runway_months = ( @@ -83,6 +75,9 @@ def build_turn_context( ) runway_str = f"~{runway_months} months" if runway_months is not None else "∞ (no payroll)" + history_limit = 20 + turns_until_truncation = max(0, history_limit - turn_number) + parts = [ f"## Turn {turn_number} — Simulation State", f"- **Current time**: {sim_time}", @@ -93,6 +88,7 @@ def build_turn_context( f"- **Employees**: {employee_count}", f"- **Active tasks**: {active_tasks}", f"- **Planned tasks**: {planned_tasks}", + f"- **Memory**: oldest messages drop after turn 20 ({turns_until_truncation} turns left). 
Use scratchpad to save important observations.", ] if bankrupt: @@ -104,11 +100,27 @@ def build_turn_context( ev_type = ev.get("type", "unknown") if ev_type == "task_completed": success = ev.get("success", False) - tid = ev.get("task_id", "?") - parts.append(f"- Task {tid}: {'SUCCESS' if success else 'FAILED'}") + title = ev.get("task_title") or ev.get("task_id", "?") + client = ev.get("client_name", "") + client_str = f" (client: {client})" if client else "" + funds = ev.get("funds_delta", 0) + funds_str = f" +${funds/100:,.0f}" if success and funds else "" + margin = ev.get("deadline_margin", "") + margin_str = f" [{margin}]" if margin else "" + n_emp = ev.get("employees_assigned", 0) + bump = ev.get("salary_bump_total_cents", 0) + bump_str = f" | {n_emp} employees, +${bump/100:,.0f}/mo payroll" if bump > 0 else f" | {n_emp} employees" if n_emp else "" + if success: + parts.append(f"- {title}{client_str}: SUCCESS{funds_str}{margin_str}{bump_str}") + else: + parts.append(f"- {title}{client_str}: FAILED — missed deadline{margin_str}, no reward") elif ev_type == "task_half": pct = ev.get("milestone_pct", "?") parts.append(f"- Task {ev.get('task_id', '?')}: {pct}% progress reached") + elif ev_type == "payment_dispute": + clawback = ev.get("clawback_cents", 0) + client_name = ev.get("client_name", "unknown") + parts.append(f"- PAYMENT DISPUTE from {client_name}: -${clawback / 100:,.2f} clawed back") elif ev_type == "horizon_end": parts.append("- **Horizon end reached. Simulation complete.**") elif ev_type == "bankruptcy": @@ -131,6 +143,9 @@ def build_turn_context( else: parts.append("\nDecide your next actions. Use `run_command` to execute CLI commands.") + # Scratchpad is injected in the system prompt, not here (avoids duplication + # across the 20-turn history window). 
+ return "\n".join(parts) @@ -144,6 +159,7 @@ def build_initial_user_prompt( monthly_payroll_cents: int, bankrupt: bool, episode: int = 1, + scratchpad: str | None = None, ) -> str: """Build the one-time initial user message at run start.""" runway_months = ( @@ -182,16 +198,16 @@ def build_initial_user_prompt( f"- planned_tasks: {planned_tasks}", "", "**Your immediate priority**: generate revenue before payroll drains your runway.", - "You MUST complete these steps now (multiple commands per turn are fine):", - "1. `yc-bench company status` — check your current prestige levels", - "2. `yc-bench market browse` — find tasks you can accept (use `--required-prestige-lte N` matching your prestige)", - "3. `yc-bench task accept --task-id ` — accept 2-3 suitable tasks", - "4. `yc-bench employee list` — get employee IDs", - "5. `yc-bench task assign --task-id --employee-id ` — assign employees", - "6. `yc-bench task dispatch --task-id ` — start work on each assigned task", - "7. `yc-bench sim resume` — advance time to collect the first task completion event", + "Complete these steps now (multiple commands per turn are fine):", + "1. `yc-bench market browse` — see available tasks", + "2. `yc-bench task accept --task-id Task-42` — accept a task", + "3. `yc-bench task assign-all --task-id Task-42` — assign employees (or use `task assign` to pick individuals)", + "4. `yc-bench task dispatch --task-id Task-42` — start work", + "5. `yc-bench sim resume` — advance time", "", - "Do not spend multiple turns just browsing. 
Accept and dispatch tasks immediately.", + "**IMPORTANT**: Check each command's result before proceeding to the next.", + "If `task accept` fails (trust or prestige too low), try a different task.", + "Do NOT call `sim resume` unless you have at least one active task — it will skip forward with zero revenue.", ]) if bankrupt: lines.append("WARNING: company is already bankrupt at initialization.") diff --git a/src/yc_bench/agent/runtime/litellm_runtime.py b/src/yc_bench/agent/runtime/litellm_runtime.py index b21445a..55d9e5b 100644 --- a/src/yc_bench/agent/runtime/litellm_runtime.py +++ b/src/yc_bench/agent/runtime/litellm_runtime.py @@ -44,6 +44,7 @@ _RUN_COMMAND_TOOL = { @dataclass class _Session: messages: list = field(default_factory=list) + scratchpad: str | None = None class LiteLLMRuntime(AgentRuntime): @@ -92,6 +93,8 @@ class LiteLLMRuntime(AgentRuntime): def run_turn(self, request): session = self._get_or_create_session(request.session_id) + # Update scratchpad on session (appended to system prompt, not message history) + session.scratchpad = request.scratchpad # Proactively drop old rounds before appending new input. self._proactive_truncate(session) session.messages.append({"role": "user", "content": request.user_input}) @@ -153,6 +156,9 @@ class LiteLLMRuntime(AgentRuntime): def _do_turn(self, session): """One LLM call + tool execution. 
Returns (final_output, tool_calls_made, resume_payload, cost_usd).""" system_prompt = self._settings.system_prompt or SYSTEM_PROMPT + # Append scratchpad to system prompt (avoids duplication in message history) + if session.scratchpad: + system_prompt = system_prompt + f"\n\n## Your Scratchpad Notes\n{session.scratchpad}" messages = [{"role": "system", "content": system_prompt}] + session.messages kwargs = dict( diff --git a/src/yc_bench/agent/runtime/schemas.py b/src/yc_bench/agent/runtime/schemas.py index 7806d45..927f653 100644 --- a/src/yc_bench/agent/runtime/schemas.py +++ b/src/yc_bench/agent/runtime/schemas.py @@ -19,6 +19,7 @@ class RuntimeSettings: class RuntimeTurnRequest: session_id: str user_input: str + scratchpad: str | None = None @dataclass(frozen=True) class RuntimeTurnResult: diff --git a/src/yc_bench/cli/__init__.py b/src/yc_bench/cli/__init__.py index f33e1b0..fe13666 100644 --- a/src/yc_bench/cli/__init__.py +++ b/src/yc_bench/cli/__init__.py @@ -18,11 +18,20 @@ app = typer.Typer(name="yc-bench", add_completion=False) # Helpers shared across command modules # --------------------------------------------------------------------------- +_engine_cache = {} + @contextmanager def get_db(): - """Yield a transactional SQLAlchemy session, commit on success.""" - engine = build_engine() - init_db(engine) + """Yield a transactional SQLAlchemy session, commit on success. + + Reuses the same engine across calls to avoid SQLite connection visibility issues. 
+ """ + from ..db.session import _get_database_url + db_url = _get_database_url() + if db_url not in _engine_cache: + _engine_cache[db_url] = build_engine() + init_db(_engine_cache[db_url]) + engine = _engine_cache[db_url] factory = build_session_factory(engine) with session_scope(factory) as session: yield session diff --git a/src/yc_bench/cli/client_commands.py b/src/yc_bench/cli/client_commands.py index a44ed08..bcdf24c 100644 --- a/src/yc_bench/cli/client_commands.py +++ b/src/yc_bench/cli/client_commands.py @@ -1,9 +1,12 @@ from __future__ import annotations import typer +from sqlalchemy import func from ..db.models.client import Client, ClientTrust +from ..db.models.ledger import LedgerCategory, LedgerEntry from ..db.models.sim_state import SimState +from ..db.models.task import Task, TaskStatus from . import get_db, json_output, error_output client_app = typer.Typer(help="Client management commands.") @@ -36,3 +39,51 @@ def client_list(): "count": len(results), "clients": results, }) + + +@client_app.command("history") +def client_history(): + """Show per-client task history: successes, failures, listed vs actual rewards, disputes.""" + with get_db() as db: + sim_state = db.query(SimState).first() + if sim_state is None: + error_output("No simulation found.") + + company_id = sim_state.company_id + clients = db.query(Client).order_by(Client.name).all() + results = [] + + for c in clients: + # Count successes and failures + success_count = db.query(func.count(Task.id)).filter( + Task.company_id == company_id, + Task.client_id == c.id, + Task.status == TaskStatus.COMPLETED_SUCCESS, + ).scalar() or 0 + + fail_count = db.query(func.count(Task.id)).filter( + Task.company_id == company_id, + Task.client_id == c.id, + Task.status == TaskStatus.COMPLETED_FAIL, + ).scalar() or 0 + + ct = db.query(ClientTrust).filter( + ClientTrust.company_id == company_id, + ClientTrust.client_id == c.id, + ).one_or_none() + + total = success_count + fail_count + fail_rate = 
round(fail_count / total * 100, 1) if total > 0 else 0.0 + + results.append({ + "client_name": c.name, + "trust_level": float(ct.trust_level) if ct else 0.0, + "tasks_succeeded": success_count, + "tasks_failed": fail_count, + "failure_rate_pct": fail_rate, + }) + + json_output({ + "count": len(results), + "client_history": results, + }) diff --git a/src/yc_bench/cli/employee_commands.py b/src/yc_bench/cli/employee_commands.py index fd58191..a27b839 100644 --- a/src/yc_bench/cli/employee_commands.py +++ b/src/yc_bench/cli/employee_commands.py @@ -3,7 +3,7 @@ from __future__ import annotations import typer from sqlalchemy import func -from ..db.models.employee import Employee +from ..db.models.employee import Employee, EmployeeSkillRate from ..db.models.task import Task, TaskAssignment, TaskStatus from ..db.models.sim_state import SimState from . import get_db, json_output, error_output @@ -25,26 +25,33 @@ def employee_list(): results = [] for emp in employees: - # Current active assignments + # Current active assignments (show task titles, not UUIDs) active_assignments = ( - db.query(TaskAssignment.task_id) - .join(Task, Task.id == TaskAssignment.task_id) + db.query(Task) + .join(TaskAssignment, Task.id == TaskAssignment.task_id) .filter( TaskAssignment.employee_id == emp.id, Task.status == TaskStatus.ACTIVE, ) .all() ) - active_task_ids = [str(a.task_id) for a in active_assignments] + active_tasks = [t.title for t in active_assignments] + + # Skill rates per domain + skill_rows = db.query(EmployeeSkillRate).filter( + EmployeeSkillRate.employee_id == emp.id + ).all() + skill_rates = { + r.domain.value: round(float(r.rate_domain_per_hour), 2) + for r in skill_rows + } results.append({ - "employee_id": str(emp.id), "name": emp.name, "tier": emp.tier, "salary_cents": emp.salary_cents, - "work_hours_per_day": float(emp.work_hours_per_day), - "active_task_count": len(active_task_ids), - "active_task_ids": active_task_ids, + "skill_rates": skill_rates, + "active_tasks": 
active_tasks, }) json_output({ diff --git a/src/yc_bench/cli/market_commands.py b/src/yc_bench/cli/market_commands.py index ebce5cf..e8305b0 100644 --- a/src/yc_bench/cli/market_commands.py +++ b/src/yc_bench/cli/market_commands.py @@ -4,8 +4,9 @@ from typing import Optional import typer -from ..db.models.client import Client -from ..db.models.company import Domain +from ..db.models.client import Client, ClientTrust +from ..db.models.company import CompanyPrestige, Domain +from ..db.models.sim_state import SimState from ..db.models.task import Task, TaskRequirement, TaskStatus from ..config import get_world_config from . import get_db, json_output, error_output @@ -27,8 +28,20 @@ def market_browse( with get_db() as db: query = db.query(Task).filter(Task.status == TaskStatus.MARKET) - if required_prestige_lte is not None: - query = query.filter(Task.required_prestige <= required_prestige_lte) + # Filter to only tasks the agent can actually accept: + # - Per-domain prestige check (not just max — all task domains must be met) + # - Trust requirement check + sim_state = db.query(SimState).first() + if sim_state: + prestige_rows = db.query(CompanyPrestige).filter( + CompanyPrestige.company_id == sim_state.company_id + ).all() + prestige_map = {p.domain: int(float(p.prestige_level)) for p in prestige_rows} + min_prestige = min(prestige_map.values()) if prestige_map else 1 + # Quick filter: required_prestige must be <= min domain prestige to guarantee acceptance + # Tasks between min and max prestige MIGHT be acceptable (depends on domains) + max_prestige = max(prestige_map.values()) if prestige_map else 1 + query = query.filter(Task.required_prestige <= max_prestige) if reward_min_cents is not None: query = query.filter(Task.reward_funds_cents >= reward_min_cents) @@ -43,12 +56,42 @@ def market_browse( ) ) - total = query.count() - tasks = query.order_by(Task.reward_funds_cents.desc()).offset(offset).limit(limit).all() + # Build trust map for filtering + trust_map = {} + 
if sim_state: + trust_rows = db.query(ClientTrust).filter( + ClientTrust.company_id == sim_state.company_id + ).all() + trust_map = {ct.client_id: float(ct.trust_level) for ct in trust_rows} + + # Fetch more than limit, then post-filter to per-domain prestige + trust + raw_tasks = query.order_by(Task.reward_funds_cents.desc()).all() results = [] - for task in tasks: + skipped = 0 + for task in raw_tasks: reqs = db.query(TaskRequirement).filter(TaskRequirement.task_id == task.id).all() + + # Per-domain prestige check: ALL domains must meet threshold + if sim_state and prestige_map: + meets_prestige = all( + prestige_map.get(r.domain, 1) >= task.required_prestige + for r in reqs + ) + if not meets_prestige: + continue + + # Trust requirement check + if task.required_trust > 0 and task.client_id is not None: + client_trust = trust_map.get(task.client_id, 0.0) + if client_trust < task.required_trust: + continue + + # Pagination + if skipped < offset: + skipped += 1 + continue + requirements = [ { "domain": r.domain.value, @@ -64,8 +107,7 @@ def market_browse( client_name = client_row.name results.append({ - "task_id": str(task.id), - "title": task.title, + "task_id": task.title, "client_name": client_name, "required_prestige": task.required_prestige, "required_trust": task.required_trust, @@ -75,8 +117,11 @@ def market_browse( "requirements": requirements, }) + if len(results) >= limit: + break + json_output({ - "total": total, + "total": len(results), "offset": offset, "limit": limit, "tasks": results, diff --git a/src/yc_bench/cli/sim_commands.py b/src/yc_bench/cli/sim_commands.py index 53d613b..fe0defb 100644 --- a/src/yc_bench/cli/sim_commands.py +++ b/src/yc_bench/cli/sim_commands.py @@ -95,58 +95,101 @@ def sim_init( @sim_app.command("resume") def sim_resume(): - """Advance simulation to the next queued event checkpoint and return wake results.""" + """Advance simulation to the next actionable event (task completion, bankruptcy, or horizon end). 
+ + Automatically skips past payroll-only events so each resume lands on + something the agent can act on. + """ with get_db() as db: sim_state = db.query(SimState).first() if sim_state is None: error_output("No simulation found. Run `yc-bench sim init` first.") company = db.query(Company).filter(Company.id == sim_state.company_id).one() - next_event = fetch_next_event( - db=db, - company_id=sim_state.company_id, - up_to=sim_state.horizon_end, - ) + # Block resume if no active tasks — forces the agent to accept/dispatch first + from ..db.models.task import Task, TaskStatus + active_count = db.query(Task).filter( + Task.company_id == sim_state.company_id, + Task.status == TaskStatus.ACTIVE, + ).count() + if active_count == 0: + error_output( + "No active tasks. Accept and dispatch a task before calling sim resume. " + "Use: market browse → task accept → task dispatch → sim resume" + ) - if next_event is None: - terminal_reason = None - bankrupt = company.funds_cents < 0 - horizon_reached = sim_state.sim_time >= sim_state.horizon_end - if bankrupt: - terminal_reason = "bankruptcy" - elif horizon_reached: - terminal_reason = "horizon_end" + # Keep advancing until we hit an actionable event (task completion, + # bankruptcy, horizon end) or run out of events. + # Only fast-forward past payrolls when there are active tasks to wait for. 
+ all_wake_events = [] + total_balance_delta = 0 + total_payrolls = 0 + total_events = 0 + last_checkpoint_type = None - json_output({ - "ok": True, - "message": "no_pending_events", - "old_sim_time": sim_state.sim_time.isoformat(), - "new_sim_time": sim_state.sim_time.isoformat(), - "events_processed": 0, - "payrolls_applied": 0, - "balance_delta": 0, - "wake_events": [], - "bankrupt": bankrupt, - "horizon_reached": horizon_reached, - "terminal_reason": terminal_reason, - }) - return + while True: + next_event = fetch_next_event( + db=db, + company_id=sim_state.company_id, + up_to=sim_state.horizon_end, + ) - checkpoint_type = next_event.event_type.value - result = advance_time( - db=db, - company_id=sim_state.company_id, - target_time=next_event.scheduled_at, - ) + if next_event is None: + break + + last_checkpoint_type = next_event.event_type.value + result = advance_time( + db=db, + company_id=sim_state.company_id, + target_time=next_event.scheduled_at, + ) + + total_events += result.events_processed + total_payrolls += result.payrolls_applied + total_balance_delta += result.balance_delta + all_wake_events.extend(result.wake_events) + + # Stop on terminal or actionable events + if result.bankrupt or result.horizon_reached: + break + if result.wake_events: + # Has task completions or other actionable events + break + + # Only fast-forward past payrolls if there are active tasks waiting. + # If no active tasks, stop here so the agent can accept new work. 
+ active_count = db.query(Task).filter( + Task.company_id == sim_state.company_id, + Task.status == TaskStatus.ACTIVE, + ).count() + if active_count == 0: + break + + # Reload sim_state for next iteration + db.refresh(sim_state) + + # Reload final state + db.refresh(sim_state) + company = db.query(Company).filter(Company.id == sim_state.company_id).one() terminal_reason = None - if result.bankrupt: + bankrupt = company.funds_cents < 0 + horizon_reached = sim_state.sim_time >= sim_state.horizon_end + if bankrupt: terminal_reason = "bankruptcy" - elif result.horizon_reached: + elif horizon_reached: terminal_reason = "horizon_end" - payload = result.__dict__.copy() - payload["ok"] = True - payload["checkpoint_event_type"] = checkpoint_type - payload["terminal_reason"] = terminal_reason - json_output(payload) + json_output({ + "ok": True, + "old_sim_time": sim_state.sim_time.isoformat(), + "new_sim_time": sim_state.sim_time.isoformat(), + "checkpoint_event_type": last_checkpoint_type, + "events_processed": total_events, + "payrolls_applied": total_payrolls, + "balance_delta": total_balance_delta, + "wake_events": all_wake_events, + "bankrupt": bankrupt, + "horizon_reached": horizon_reached, + "terminal_reason": terminal_reason, + }) diff --git a/src/yc_bench/cli/task_commands.py b/src/yc_bench/cli/task_commands.py index d126a37..5b715a3 100644 --- a/src/yc_bench/cli/task_commands.py +++ b/src/yc_bench/cli/task_commands.py @@ -39,22 +39,18 @@ def _compute_deadline(accepted_at: datetime, max_domain_qty: float, cfg=None) -> @task_app.command("accept") def task_accept( - task_id: str = typer.Option(..., "--task-id", help="UUID of the task to accept"), + task_id: str = typer.Option(..., "--task-id", help="Task UUID or title (e.g. 
Task-42)"), ): """Accept a market task: transition to planned, assign to company, generate replacement.""" - try: - tid = UUID(task_id) - except ValueError: - error_output(f"Invalid UUID: {task_id}") - with get_db() as db: sim_state = db.query(SimState).first() if sim_state is None: error_output("No simulation found. Run `yc-bench sim init` first.") - task = db.query(Task).filter(Task.id == tid).one_or_none() + task = _resolve_task(db, task_id) if task is None: - error_output(f"Task {task_id} not found.") + error_output(f"Task '{task_id}' not found.") + tid = task.id if task.status != TaskStatus.MARKET: error_output(f"Task {task_id} is not in market status (current: {task.status.value}).") @@ -98,10 +94,32 @@ def task_accept( for r in reqs: r.required_qty = int(float(r.required_qty) * (1 - work_reduction)) + # Compute deadline from advertised qty BEFORE scope creep max_domain_qty = max(float(r.required_qty) for r in reqs) accepted_at = sim_state.sim_time deadline = _compute_deadline(accepted_at, max_domain_qty) + # Store advertised reward before any dispute can alter it + task.advertised_reward_cents = task.reward_funds_cents + + # Scope creep: RAT clients inflate required_qty after accept. + # Minimum inflation ensures ALL RAT tasks exceed deadline (which was + # computed from pre-creep qty). The agent can't tell from the deadline + # alone — the trap only springs after accept. 
+ if task.client_id is not None: + client_row = db.query(Client).filter(Client.id == task.client_id).one_or_none() + if client_row and client_row.loyalty < -0.3: + intensity = abs(client_row.loyalty) + inflation = _cfg.scope_creep_max * intensity + # Ensure enough inflation to bust the deadline: + # deadline_hours = deadline_min_biz_days * work_hours + # need inflated_qty / effective_rate > deadline_hours + # Conservative: at least 130% inflation so even small tasks fail + inflation = max(1.3, inflation) + for r in reqs: + inflated = float(r.required_qty) * (1 + inflation) + r.required_qty = int(min(25000, max(200, inflated))) + # Transition task task.status = TaskStatus.PLANNED task.company_id = company_id @@ -172,120 +190,159 @@ def task_accept( db.flush() json_output({ - "task_id": str(task.id), + "task_id": task.title, "status": task.status.value, "accepted_at": accepted_at.isoformat(), "deadline": deadline.isoformat(), - "replacement_task_id": str(replacement_row.id), + "replacement_task_id": replacement_row.title, }) +def _resolve_employee(db, company_id, identifier: str): + """Resolve employee by UUID or name (e.g. 'Emp_1').""" + try: + eid = UUID(identifier) + return db.query(Employee).filter(Employee.id == eid, Employee.company_id == company_id).one_or_none() + except ValueError: + pass + # Try name lookup + return db.query(Employee).filter(Employee.name == identifier, Employee.company_id == company_id).one_or_none() + + +def _resolve_task(db, identifier: str): + """Resolve task by UUID or title (e.g. 'Task-42'). + + If multiple tasks share the same title (original + replacement), prefer + the one that's actionable (market/planned/active) over completed ones. 
+ """ + try: + tid = UUID(identifier) + return db.query(Task).filter(Task.id == tid).one_or_none() + except ValueError: + pass + # Title lookup — prefer actionable tasks over completed ones + matches = db.query(Task).filter(Task.title == identifier).all() + if not matches: + return None + if len(matches) == 1: + return matches[0] + # Prefer: market > planned > active > completed + priority = {TaskStatus.MARKET: 0, TaskStatus.PLANNED: 1, TaskStatus.ACTIVE: 2} + matches.sort(key=lambda t: priority.get(t.status, 9)) + return matches[0] + + @task_app.command("assign") def task_assign( - task_id: str = typer.Option(..., "--task-id", help="UUID of the task"), - employee_id: str = typer.Option(..., "--employee-id", help="UUID of the employee"), + task_id: str = typer.Option(..., "--task-id", help="Task UUID or title (e.g. Task-42)"), + employees: str = typer.Option(..., "--employees", help="Comma-separated employee names (e.g. Emp_1,Emp_4,Emp_7)"), ): - """Assign an employee to a task.""" - try: - tid = UUID(task_id) - eid = UUID(employee_id) - except ValueError: - error_output("Invalid UUID provided.") - + """Assign one or more employees to a task.""" + employee_id = [e.strip() for e in employees.split(",") if e.strip()] with get_db() as db: sim_state = db.query(SimState).first() if sim_state is None: error_output("No simulation found.") - task = db.query(Task).filter(Task.id == tid).one_or_none() + task = _resolve_task(db, task_id) if task is None: - error_output(f"Task {task_id} not found.") + error_output(f"Task '{task_id}' not found.") + tid = task.id if task.status not in (TaskStatus.PLANNED, TaskStatus.ACTIVE): error_output(f"Task {task_id} must be planned or active to assign (current: {task.status.value}).") if task.company_id != sim_state.company_id: error_output(f"Task {task_id} does not belong to your company.") - employee = db.query(Employee).filter(Employee.id == eid).one_or_none() - if employee is None: - error_output(f"Employee {employee_id} not found.") - 
if employee.company_id != sim_state.company_id: - error_output(f"Employee {employee_id} does not belong to your company.") + assigned_names = [] + for eid_str in employee_id: + employee = _resolve_employee(db, sim_state.company_id, eid_str) + if employee is None: + error_output(f"Employee '{eid_str}' not found.") + eid = employee.id - # Check if already assigned - existing = db.query(TaskAssignment).filter( - TaskAssignment.task_id == tid, - TaskAssignment.employee_id == eid, - ).one_or_none() - if existing is not None: - error_output(f"Employee {employee_id} is already assigned to task {task_id}.") + # Skip if already assigned + existing = db.query(TaskAssignment).filter( + TaskAssignment.task_id == tid, + TaskAssignment.employee_id == eid, + ).one_or_none() + if existing is not None: + continue + + db.add(TaskAssignment( + task_id=tid, + employee_id=eid, + assigned_at=sim_state.sim_time, + )) + assigned_names.append(employee.name) - assignment = TaskAssignment( - task_id=tid, - employee_id=eid, - assigned_at=sim_state.sim_time, - ) - db.add(assignment) db.flush() - # Recalculate ETAs for all active tasks sharing this employee + # Recalculate ETAs for all active tasks sharing these employees if task.status == TaskStatus.ACTIVE: - emp_assignments = db.query(TaskAssignment).filter( - TaskAssignment.employee_id == eid - ).all() impacted = set() - for ea in emp_assignments: - t = db.query(Task).filter(Task.id == ea.task_id).one_or_none() - if t and t.status == TaskStatus.ACTIVE: - impacted.add(t.id) + for eid_str in employee_id: + emp = _resolve_employee(db, sim_state.company_id, eid_str) + if emp: + for ea in db.query(TaskAssignment).filter(TaskAssignment.employee_id == emp.id).all(): + t = db.query(Task).filter(Task.id == ea.task_id).one_or_none() + if t and t.status == TaskStatus.ACTIVE: + impacted.add(t.id) if impacted: recalculate_etas(db, sim_state.company_id, sim_state.sim_time, impacted, milestones=_get_world_cfg().task_progress_milestones) # Return current 
assignment list assignments = db.query(TaskAssignment).filter(TaskAssignment.task_id == tid).all() - assignment_list = [ - { - "employee_id": str(a.employee_id), - "assigned_at": a.assigned_at.isoformat(), - } - for a in assignments - ] + assignment_list = [] + for a in assignments: + emp = db.query(Employee).filter(Employee.id == a.employee_id).one_or_none() + assignment_list.append(emp.name if emp else "unknown") json_output({ - "task_id": str(task.id), + "task_id": task.title, "status": task.status.value, - "assignments": assignment_list, + "newly_assigned": assigned_names, + "total_assigned": assignment_list, }) +@task_app.command("assign-all") +def task_assign_all( + task_id: str = typer.Option(..., "--task-id", help="Task UUID or title (e.g. Task-42)"), +): + """Disabled — use `task assign --employees Emp_1,Emp_4,Emp_7` to pick specific employees.""" + error_output( + "assign-all is not available. Use `task assign --task-id --employees Emp_1,Emp_4,Emp_7` to assign specific employees." + ) + + @task_app.command("dispatch") def task_dispatch( - task_id: str = typer.Option(..., "--task-id", help="UUID of the task to dispatch"), + task_id: str = typer.Option(..., "--task-id", help="Task UUID or title (e.g. 
Task-42)"), ): """Dispatch a planned task to active status.""" - try: - tid = UUID(task_id) - except ValueError: - error_output(f"Invalid UUID: {task_id}") - with get_db() as db: sim_state = db.query(SimState).first() if sim_state is None: error_output("No simulation found.") - task = db.query(Task).filter(Task.id == tid).one_or_none() + task = _resolve_task(db, task_id) if task is None: - error_output(f"Task {task_id} not found.") + error_output(f"Task '{task_id}' not found.") + tid = task.id if task.status != TaskStatus.PLANNED: error_output(f"Task {task_id} must be planned to dispatch (current: {task.status.value}).") if task.company_id != sim_state.company_id: error_output(f"Task {task_id} does not belong to your company.") - # Validate at least one assignment - assignment_count = db.query(func.count(TaskAssignment.employee_id)).filter( + # Require explicit assignment before dispatch + existing_count = db.query(func.count(TaskAssignment.employee_id)).filter( TaskAssignment.task_id == tid ).scalar() or 0 - if assignment_count == 0: - error_output(f"Task {task_id} has no assignments. Assign employees before dispatching.") + if existing_count == 0: + error_output( + "No employees assigned. Use `task assign-all` or `task assign --employee-id Emp_1` first." 
+ ) + db.flush() # Transition to active task.status = TaskStatus.ACTIVE @@ -306,10 +363,17 @@ def task_dispatch( impacted.add(peer_task.id) recalculate_etas(db, sim_state.company_id, sim_state.sim_time, impacted, milestones=_get_world_cfg().task_progress_milestones) + assigned = db.query(TaskAssignment).filter(TaskAssignment.task_id == tid).all() + assigned_names = [] + for a in assigned: + emp = db.query(Employee).filter(Employee.id == a.employee_id).one_or_none() + if emp: + assigned_names.append(emp.name) json_output({ - "task_id": str(task.id), + "task_id": task.title, "status": task.status.value, - "assignment_count": assignment_count, + "assignment_count": len(assigned), + "assigned_employees": assigned_names, }) @@ -355,7 +419,7 @@ def task_list( client_name = client.name results.append({ - "task_id": str(task.id), + "task_id": task.title, "title": task.title, "status": task.status.value, "client_name": client_name, @@ -372,18 +436,14 @@ def task_list( @task_app.command("inspect") def task_inspect( - task_id: str = typer.Option(..., "--task-id", help="UUID of the task to inspect"), + task_id: str = typer.Option(..., "--task-id", help="Task UUID or title (e.g. 
Task-42)"), ): """Inspect detailed task information.""" - try: - tid = UUID(task_id) - except ValueError: - error_output(f"Invalid UUID: {task_id}") - with get_db() as db: - task = db.query(Task).filter(Task.id == tid).one_or_none() + task = _resolve_task(db, task_id) if task is None: - error_output(f"Task {task_id} not found.") + error_output(f"Task '{task_id}' not found.") + tid = task.id # Requirements reqs = db.query(TaskRequirement).filter(TaskRequirement.task_id == tid).all() @@ -402,8 +462,7 @@ def task_inspect( for a in assignments_raw: emp = db.query(Employee).filter(Employee.id == a.employee_id).one_or_none() assignments.append({ - "employee_id": str(a.employee_id), - "employee_name": emp.name if emp else "unknown", + "employee": emp.name if emp else "unknown", "assigned_at": a.assigned_at.isoformat(), }) @@ -419,7 +478,7 @@ def task_inspect( client_name = client_row.name json_output({ - "task_id": str(task.id), + "task_id": task.title, "title": task.title, "status": task.status.value, "client_name": client_name, @@ -440,23 +499,19 @@ def task_inspect( @task_app.command("cancel") def task_cancel( - task_id: str = typer.Option(..., "--task-id", help="UUID of the task to cancel"), + task_id: str = typer.Option(..., "--task-id", help="Task UUID or title (e.g. 
Task-42)"), reason: str = typer.Option(..., "--reason", help="Reason for cancellation"), ): """Cancel a task and apply prestige penalty.""" - try: - tid = UUID(task_id) - except ValueError: - error_output(f"Invalid UUID: {task_id}") - with get_db() as db: sim_state = db.query(SimState).first() if sim_state is None: error_output("No simulation found.") - task = db.query(Task).filter(Task.id == tid).one_or_none() + task = _resolve_task(db, task_id) if task is None: - error_output(f"Task {task_id} not found.") + error_output(f"Task '{task_id}' not found.") + tid = task.id if task.status not in (TaskStatus.PLANNED, TaskStatus.ACTIVE): error_output(f"Task {task_id} cannot be cancelled (current: {task.status.value}).") if task.company_id != sim_state.company_id: @@ -533,7 +588,7 @@ def task_cancel( db.flush() json_output({ - "task_id": str(task.id), + "task_id": task.title, "status": task.status.value, "reason": reason, "cancel_penalty_per_domain": float(cancel_penalty), diff --git a/src/yc_bench/config/presets/default.toml b/src/yc_bench/config/presets/default.toml index 0636d6f..d42866c 100644 --- a/src/yc_bench/config/presets/default.toml +++ b/src/yc_bench/config/presets/default.toml @@ -95,6 +95,14 @@ trust_reward_ceiling = 2.6 trust_work_reduction_max = 0.40 trust_gating_fraction = 0.20 +# --- Client loyalty (adversarial clients) --- +# loyalty_rat_fraction: target fraction of clients that are adversarial RATs (~15%) +# loyalty_severity: 0-1, how punishing RATs are (scope creep, dispute clawback) +# loyalty_reveal_trust: trust level where loyalty effects begin manifesting +loyalty_rat_fraction = 0.15 +loyalty_severity = 0.5 +loyalty_reveal_trust = 2.0 + # Daily prestige decay per domain. Domains not exercised lose prestige # over time: -0.005/day → -0.15/month. Untouched domain drops ~1 level # every ~6 months. Prevents single-domain hyper-specialization. 
diff --git a/src/yc_bench/config/presets/easy.toml b/src/yc_bench/config/presets/easy.toml index 2e619a5..61fe1b2 100644 --- a/src/yc_bench/config/presets/easy.toml +++ b/src/yc_bench/config/presets/easy.toml @@ -52,6 +52,11 @@ trust_reward_ceiling = 2.8 trust_work_reduction_max = 0.40 trust_gating_fraction = 0.15 +# --- Client loyalty (forgiving: few RATs, mild effects) --- +loyalty_rat_fraction = 0.10 +loyalty_severity = 0.3 +loyalty_reveal_trust = 2.5 + [world.dist.required_prestige] type = "triangular" low = 1 diff --git a/src/yc_bench/config/presets/hard.toml b/src/yc_bench/config/presets/hard.toml index 784e971..6825988 100644 --- a/src/yc_bench/config/presets/hard.toml +++ b/src/yc_bench/config/presets/hard.toml @@ -40,7 +40,7 @@ horizon_years = 1 auto_advance_after_turns = 10 [world] -initial_funds_cents = 10_000_000 # $100,000 — must reach prestige 3 before runway runs out +initial_funds_cents = 20_000_000 # $200,000 — ~3 months runway # Inherits num_employees=10, num_market_tasks=200 from default. # Tight deadlines: 2000/220 = 9.1 days. @@ -52,7 +52,7 @@ deadline_qty_per_day = 220.0 penalty_fail_multiplier = 1.4 penalty_cancel_multiplier = 2.0 -# Salary bump — linear, based on tier midpoint. 1% of midpoint per task. +# Salary bump is linear (1% of tier-midpoint salary per completed task), not compounding. salary_bump_pct = 0.01 # High-prestige tasks pay substantially more. @@ -66,6 +66,11 @@ trust_reward_ceiling = 2.4 trust_work_reduction_max = 0.35 trust_gating_fraction = 0.25 +# --- Client loyalty (harsh: more RATs, more severe) --- +loyalty_rat_fraction = 0.20 +loyalty_severity = 0.7 +loyalty_reveal_trust = 1.5 + [world.dist.required_prestige] type = "triangular" low = 1 diff --git a/src/yc_bench/config/presets/medium.toml b/src/yc_bench/config/presets/medium.toml index 6b39db5..be7383a 100644 --- a/src/yc_bench/config/presets/medium.toml +++ b/src/yc_bench/config/presets/medium.toml @@ -1,78 +1,77 @@ -# medium — Can the agent climb the prestige ladder? 
+# medium — Can the agent detect and avoid adversarial clients? # -# Now prestige matters. Most tasks need prestige 2–4. At startup, only -# ~30% of the market is accessible (prestige-1 tasks). The agent must -# complete low-prestige tasks to build domain prestige, then graduate -# to higher-reward tiers. Random domain climbing wastes time — the -# agent should focus on 2–3 domains. +# The environment is survivable for any competent agent. The differentiator +# is the client trust/loyalty system: ~20% of clients are hidden RATs that +# silently inflate work (scope creep) and claw back payments (disputes). # -# Two-domain tasks appear for the first time. Employee assignment -# becomes a cross-domain puzzle: you need good employees in BOTH -# domains to finish on time. +# RAT tasks look identical to normal tasks. The only way to detect them is +# by checking `client history` and noticing listed vs received discrepancies. # -# Economics (5 employees): -# Monthly payroll ≈ $32K. Starting runway ≈ 7.8 months. -# Mode task: 2 domains × 900 units = 1800 total. -# Deadline: max(7, 1800/200) = 9 business days. -# Focused team: 3 on domain1 (3×5.1×9=138/day → 6.5 days) -# 2 on domain2 (2×5.1×9=92/day → 9.8 days). Tight! -# Prestige-1 reward: ~$30K. Prestige-4 reward: $30K × 2.35 = $70K. -# Climbing prestige doubles income per task. -# -# What is tested: -# - Does the agent understand prestige gates market access? -# - Does it specialise in a few domains rather than spreading thin? -# - Can it handle 2-domain task assignments? +# The greedy bot can't detect RATs — it picks tasks blindly and bleeds money. +# A smart LLM that checks history and avoids RATs should clearly outperform. extends = "default" name = "medium" -description = "1-year medium. Prestige ladder active, 2-domain tasks. Tests strategic domain specialisation." +description = "1-year medium. Tests client trust awareness — can the agent detect adversarial clients?" 
[sim] horizon_years = 1 [loop] -auto_advance_after_turns = 8 +auto_advance_after_turns = 5 [world] -# Inherits num_employees=10, num_market_tasks=200 from default. +num_employees = 8 +initial_funds_cents = 20_000_000 # $200,000 +num_clients = 6 -# Deadline: 1500/150 = 10 days. Moderate pressure. deadline_qty_per_day = 150.0 - -# Real penalties — failing costs prestige, cancelling costs more. +deadline_min_biz_days = 7 penalty_fail_multiplier = 1.0 +penalty_fail_funds_pct = 0.25 # failing costs 25% of advertised reward penalty_cancel_multiplier = 1.5 - -# Moderate salary bump — starts to matter by month 10. +# Salary bumps: each completed task raises salary for ALL assigned employees. +# Assign-all (8 bumps/task) compounds payroll fast → unsustainable. +# Selective assignment (3-4/task) grows slower → survivable for smart agents. salary_bump_pct = 0.01 +reward_prestige_scale = 0.30 +prestige_decay_per_day = 0.0 -# Prestige scaling starting to reward climbing. -reward_prestige_scale = 0.45 - -# --- Client trust (balanced: default build speed, moderate penalties) --- -trust_build_rate = 20.0 -trust_fragility = 0.5 -trust_focus_pressure = 0.5 +# --- Client trust --- +trust_build_rate = 5.0 +trust_fragility = 0.3 +trust_focus_pressure = 0.3 trust_reward_ceiling = 2.6 -trust_work_reduction_max = 0.40 -trust_gating_fraction = 0.20 +trust_work_reduction_max = 0.50 +trust_gating_fraction = 0.30 +trust_gated_reward_boost = 0.50 # trust-4 task pays 3× base (was 1.6× at 0.15) + +# --- Client loyalty --- +loyalty_rat_fraction = 0.20 +loyalty_severity = 1.0 +loyalty_reveal_trust = 0.0 + +task_progress_milestones = [] + +[world.dist.reward_funds_cents] +type = "triangular" +low = 200_000 +high = 1_200_000 +mode = 500_000 [world.dist.required_prestige] type = "triangular" low = 1 -high = 7 -mode = 3 # Most tasks need prestige 2–4. Thin early market. +high = 5 +mode = 1 [world.dist.domain_count] -type = "triangular" -low = 1 -high = 3 -mode = 2 # Most tasks need 2 domains. 
+type = "constant" +value = 1 [world.dist.required_qty] type = "triangular" -low = 700 -high = 3000 -mode = 1500 # Larger tasks — ~6.5 days with full team, no parallelism. +low = 400 +high = 1500 +mode = 800 diff --git a/src/yc_bench/config/presets/nightmare.toml b/src/yc_bench/config/presets/nightmare.toml index 576a149..fb4d0e4 100644 --- a/src/yc_bench/config/presets/nightmare.toml +++ b/src/yc_bench/config/presets/nightmare.toml @@ -75,6 +75,11 @@ trust_reward_ceiling = 2.2 trust_work_reduction_max = 0.30 trust_gating_fraction = 0.30 +# --- Client loyalty (brutal: lots of RATs, very punishing) --- +loyalty_rat_fraction = 0.25 +loyalty_severity = 0.9 +loyalty_reveal_trust = 1.0 + [world.dist.required_prestige] type = "triangular" low = 1 diff --git a/src/yc_bench/config/presets/tutorial.toml b/src/yc_bench/config/presets/tutorial.toml index 649dda6..6ed4e4a 100644 --- a/src/yc_bench/config/presets/tutorial.toml +++ b/src/yc_bench/config/presets/tutorial.toml @@ -52,6 +52,11 @@ trust_reward_ceiling = 3.0 trust_work_reduction_max = 0.40 trust_gating_fraction = 0.10 +# --- Client loyalty (very forgiving: rare and mild RATs) --- +loyalty_rat_fraction = 0.05 +loyalty_severity = 0.2 +loyalty_reveal_trust = 3.0 + [world.dist.required_prestige] type = "constant" value = 1 # ALL tasks are prestige-1 — no gating at all. 
diff --git a/src/yc_bench/config/schema.py b/src/yc_bench/config/schema.py index 4ffdd34..ec73bd7 100644 --- a/src/yc_bench/config/schema.py +++ b/src/yc_bench/config/schema.py @@ -132,6 +132,7 @@ class WorldConfig(BaseModel): prestige_max: float prestige_min: float penalty_fail_multiplier: float + penalty_fail_funds_pct: float = 0.0 # fraction of advertised reward deducted on failure penalty_cancel_multiplier: float reward_prestige_scale: float prestige_decay_per_day: float @@ -146,6 +147,11 @@ class WorldConfig(BaseModel): trust_work_reduction_max: float trust_gating_fraction: float + # --- Client loyalty (adversarial clients) --- + loyalty_rat_fraction: float = 0.15 + loyalty_severity: float = 0.5 + loyalty_reveal_trust: float = 2.0 + # --- Derived trust params (computed from knobs above, do not set directly) --- trust_min: float = 0.0 trust_gain_base: float = 0.0 @@ -169,6 +175,12 @@ class WorldConfig(BaseModel): client_tier_enterprise_threshold: float = 1.7 task_specialty_domain_bias: float = 0.7 + # --- Derived loyalty params (computed from knobs above, do not set directly) --- + loyalty_mode: float = 0.61 + scope_creep_max: float = 0.35 + dispute_clawback_max: float = 0.40 + dispute_prob_max: float = 0.25 + # --- Task scaling --- prestige_qty_scale: float deadline_qty_per_day: float @@ -226,6 +238,13 @@ class WorldConfig(BaseModel): self.trust_reward_threshold = max(0.0, 1.0 - 2.0 * self.trust_gating_fraction) self.trust_reward_ramp = min(1.0, 2.0 * self.trust_gating_fraction) + # loyalty params + # loyalty_mode: triangular(-1, 1, mode) where mode produces ~rat_fraction below -0.3 + self.loyalty_mode = 1.0 - 2.6 * self.loyalty_rat_fraction + self.scope_creep_max = self.loyalty_severity * 1.00 + self.dispute_clawback_max = self.loyalty_severity * 1.20 + self.dispute_prob_max = self.loyalty_severity * 1.00 + return self @model_validator(mode="after") diff --git a/src/yc_bench/core/engine.py b/src/yc_bench/core/engine.py index 140a1c3..af422bb 100644 --- 
a/src/yc_bench/core/engine.py +++ b/src/yc_bench/core/engine.py @@ -86,12 +86,47 @@ def dispatch_event(db: Session, event: SimEvent, sim_time: datetime, company_id: # Recalculate ETAs — freed employees change topology from ..config import get_world_config recalculate_etas(db, company_id, sim_time, milestones=get_world_config().task_progress_milestones) + # Include operational details so the agent can learn from outcomes + from ..db.models.task import Task, TaskAssignment + from ..db.models.client import Client + from ..db.models.employee import Employee + task_row = db.query(Task).filter(Task.id == result.task_id).one_or_none() + client_name = None + task_title = None + deadline_info = None + assigned_employees = [] + salary_bump_total = 0 + if task_row: + task_title = task_row.title + if task_row.client_id: + cl = db.query(Client).filter(Client.id == task_row.client_id).one_or_none() + if cl: + client_name = cl.name + # Deadline vs completion info + if task_row.deadline and task_row.completed_at: + hours_diff = (task_row.deadline - task_row.completed_at).total_seconds() / 3600 + deadline_info = f"{'ahead by' if hours_diff >= 0 else 'late by'} {abs(hours_diff):.0f}h" + # Which employees were assigned + salary bump impact + assignments = db.query(TaskAssignment).filter(TaskAssignment.task_id == result.task_id).all() + wc = get_world_config() + for a in assignments: + emp = db.query(Employee).filter(Employee.id == a.employee_id).one_or_none() + if emp: + bump = int(emp.salary_cents * wc.salary_bump_pct) if result.success else 0 + salary_bump_total += bump + assigned_employees.append(emp.name) return { "type": "task_completed", "task_id": str(result.task_id), + "task_title": task_title, + "client_name": client_name, "success": result.success, "funds_delta": result.funds_delta, + "listed_reward": result.listed_reward, "trust_delta": result.trust_delta, + "deadline_margin": deadline_info, + "employees_assigned": len(assigned_employees), + "salary_bump_total_cents": 
salary_bump_total, "bankrupt": result.bankrupt, } diff --git a/src/yc_bench/core/events.py b/src/yc_bench/core/events.py index 257840b..a538ac9 100644 --- a/src/yc_bench/core/events.py +++ b/src/yc_bench/core/events.py @@ -19,9 +19,10 @@ from ..db.models.event import EventType, SimEvent # Priority ordering — lower number = higher priority EVENT_PRIORITY: Dict[EventType, int] = { EventType.TASK_COMPLETED: 0, - EventType.BANKRUPTCY: 1, - EventType.TASK_HALF_PROGRESS: 2, - EventType.HORIZON_END: 3, + EventType.PAYMENT_DISPUTE: 1, + EventType.BANKRUPTCY: 2, + EventType.TASK_HALF_PROGRESS: 3, + EventType.HORIZON_END: 4, } diff --git a/src/yc_bench/core/handlers/task_complete.py b/src/yc_bench/core/handlers/task_complete.py index 517b4c1..16bfd6e 100644 --- a/src/yc_bench/core/handlers/task_complete.py +++ b/src/yc_bench/core/handlers/task_complete.py @@ -4,23 +4,27 @@ On completion: - If completion_time <= deadline: success → add reward funds, add prestige, skill-boost employees. - If completion_time > deadline: fail → set completed_fail, apply 0.8 * delta prestige penalty. After either outcome, recalculate ETAs (freed employees change topology). +Payment disputes are currently disabled; scope creep (deadline failures) is the primary RAT mechanic. 
""" from __future__ import annotations +import random as _stdlib_random from dataclasses import dataclass, field +from datetime import timedelta from decimal import Decimal from typing import Dict from uuid import UUID from sqlalchemy.orm import Session -from ...db.models.client import ClientTrust +from ...db.models.client import Client, ClientTrust from ...db.models.company import Company, CompanyPrestige, Domain from ...db.models.employee import Employee, EmployeeSkillRate from ...config import get_world_config -from ...db.models.event import SimEvent +from ...db.models.event import EventType, SimEvent from ...db.models.ledger import LedgerCategory, LedgerEntry from ...db.models.task import Task, TaskAssignment, TaskRequirement, TaskStatus +from ..events import insert_event @dataclass @@ -28,6 +32,7 @@ class TaskCompleteResult: task_id: UUID success: bool funds_delta: int = 0 + listed_reward: int = 0 prestige_changes: Dict[str, float] = field(default_factory=dict) trust_delta: float = 0.0 bankrupt: bool = False @@ -81,35 +86,51 @@ def handle_task_complete(db: Session, event: SimEvent, sim_time) -> TaskComplete old = float(prestige.prestige_level) prestige.prestige_level = min( Decimal(str(wc.prestige_max)), - prestige.prestige_level + task.reward_prestige_delta, + prestige.prestige_level + Decimal(str(float(task.reward_prestige_delta))), ) prestige_changes[req.domain.value] = float(prestige.prestige_level) - old - # Skill boost assigned employees + # Skill boost: only the top contributors get boosted (Brooks's Law). + # Overcrowded employees (beyond the efficient team size) are overhead + # and don't learn from the experience. 
+ from ..progress import _EFFICIENT_TEAM_SIZE assignments = db.query(TaskAssignment).filter( TaskAssignment.task_id == task_id ).all() if task.skill_boost_pct > 0: task_domains = {req.domain for req in reqs} - for a in assignments: - for domain in task_domains: + for domain in task_domains: + # Rank employees by their rate in this domain (best first) + emp_rates = [] + for a in assignments: skill = db.query(EmployeeSkillRate).filter( EmployeeSkillRate.employee_id == a.employee_id, EmployeeSkillRate.domain == domain, ).one_or_none() if skill is not None: - boost = skill.rate_domain_per_hour * task.skill_boost_pct - skill.rate_domain_per_hour = min( - skill.rate_domain_per_hour + boost, - Decimal(str(wc.skill_rate_max)), - ) + emp_rates.append(skill) + emp_rates.sort(key=lambda s: s.rate_domain_per_hour, reverse=True) - # Salary bump: small raise for each employee who contributed to this task + # Only boost the top N (efficient team size) + for skill in emp_rates[:_EFFICIENT_TEAM_SIZE]: + boost = skill.rate_domain_per_hour * Decimal(str(float(task.skill_boost_pct))) + skill.rate_domain_per_hour = min( + skill.rate_domain_per_hour + boost, + Decimal(str(wc.skill_rate_max)), + ) + + # Salary bump: fixed raise per tier (linear, not compounding). + # Bump = tier midpoint salary × salary_bump_pct (computed once from config). 
if wc.salary_bump_pct > 0: + tier_midpoints = { + "junior": (wc.salary_junior.min_cents + wc.salary_junior.max_cents) // 2, + "mid": (wc.salary_mid.min_cents + wc.salary_mid.max_cents) // 2, + "senior": (wc.salary_senior.min_cents + wc.salary_senior.max_cents) // 2, + } for a in assignments: employee = db.query(Employee).filter(Employee.id == a.employee_id).one_or_none() if employee is not None and employee.salary_cents < wc.salary_max_cents: - bump = int(employee.salary_cents * wc.salary_bump_pct) + bump = int(tier_midpoints.get(employee.tier, 0) * wc.salary_bump_pct) employee.salary_cents = min(wc.salary_max_cents, employee.salary_cents + bump) else: @@ -131,6 +152,22 @@ def handle_task_complete(db: Session, event: SimEvent, sim_time) -> TaskComplete ) prestige_changes[req.domain.value] = float(prestige.prestige_level) - old + # Financial penalty: deduct a fraction of the advertised reward + if wc.penalty_fail_funds_pct > 0: + advertised = task.advertised_reward_cents or task.reward_funds_cents + penalty_cents = int(advertised * wc.penalty_fail_funds_pct) + company = db.query(Company).filter(Company.id == company_id).one() + company.funds_cents -= penalty_cents + funds_delta = -penalty_cents + db.add(LedgerEntry( + company_id=company_id, + occurred_at=sim_time, + category=LedgerCategory.TASK_REWARD, + amount_cents=-penalty_cents, + ref_type="task", + ref_id=task_id, + )) + # --- Client trust update --- trust_delta = 0.0 if task.client_id is not None: @@ -165,6 +202,8 @@ def handle_task_complete(db: Session, event: SimEvent, sim_time) -> TaskComplete new = max(wc.trust_min, old - wc.trust_cross_client_decay) other_ct.trust_level = Decimal(str(round(new, 3))) + # Payment disputes disabled — scope creep (deadline failures) is the primary RAT mechanic. 
+ db.flush() # Check bankruptcy @@ -175,6 +214,7 @@ def handle_task_complete(db: Session, event: SimEvent, sim_time) -> TaskComplete task_id=task_id, success=success, funds_delta=funds_delta, + listed_reward=task.advertised_reward_cents or task.reward_funds_cents, prestige_changes=prestige_changes, trust_delta=trust_delta, bankrupt=bankrupt, diff --git a/src/yc_bench/core/progress.py b/src/yc_bench/core/progress.py index 774b053..2b5ab08 100644 --- a/src/yc_bench/core/progress.py +++ b/src/yc_bench/core/progress.py @@ -65,9 +65,25 @@ def _rates_by_employee_domain(rates): m[(r.employee_id, r.domain)] = r.rate_domain_per_hour return m +_EFFICIENT_TEAM_SIZE = 4 # first N employees at full rate +_OVERCROWD_PENALTY = Decimal("0") # employees beyond N contribute nothing (pure overhead) + + def _effective_rate_for_task_domain(*, task_id, domain, assignments, assignment_counts, base_rates): - total = Decimal("0") + """Compute effective rate for one task+domain. + + Throughput split uses sqrt(k) instead of k: two concurrent tasks each run at + 1/sqrt(2) ≈ 71% speed, not 50%. This makes mild parallelism (2-3 tasks) + more efficient than strict sequential. + + Brooks's Law: first 4 employees contribute full rate. Beyond that, + additional employees contribute nothing (_OVERCROWD_PENALTY is 0; pure overhead). 
+ """ + from math import sqrt + + # Collect (employee_id, effective_base) for this task, sorted best-first + contributions = [] for a in assignments: if a.task_id != task_id: continue @@ -75,7 +91,18 @@ def _effective_rate_for_task_domain(*, task_id, domain, assignments, if k <= 0: continue base = base_rates.get((a.employee_id, domain), Decimal("0")) - total += base / Decimal(k) + split_rate = base / Decimal(str(round(sqrt(k), 6))) + contributions.append(split_rate) + + # Sort best contributors first so they get full rate + contributions.sort(reverse=True) + + total = Decimal("0") + for i, rate in enumerate(contributions): + if i < _EFFICIENT_TEAM_SIZE: + total += rate + else: + total += rate * _OVERCROWD_PENALTY return total def _weighted_ratio_from_rows(rows, *, task_id_label): @@ -237,13 +264,23 @@ def compute_effective_rates(db, company_id): out = [] for req in requirements: - total = Decimal("0") + from math import sqrt + contributions = [] for a in assignments_by_task.get(req.task_id, []): k = assignment_counts.get(a.employee_id, 0) if k <= 0: continue base = base_rates.get((a.employee_id, req.domain), Decimal("0")) - total += base / Decimal(k) + split_rate = base / Decimal(str(round(sqrt(k), 6))) + contributions.append(split_rate) + + contributions.sort(reverse=True) + total = Decimal("0") + for i, rate in enumerate(contributions): + if i < _EFFICIENT_TEAM_SIZE: + total += rate + else: + total += rate * _OVERCROWD_PENALTY out.append(EffectiveRate( task_id=req.task_id, diff --git a/src/yc_bench/db/models/client.py b/src/yc_bench/db/models/client.py index b52d9c1..c8b030f 100644 --- a/src/yc_bench/db/models/client.py +++ b/src/yc_bench/db/models/client.py @@ -36,6 +36,11 @@ class Client(Base): nullable=False, default=list, ) + loyalty = mapped_column( + Float, + nullable=False, + default=0.0, + ) class ClientTrust(Base): diff --git a/src/yc_bench/db/models/event.py b/src/yc_bench/db/models/event.py index 1fba9e4..77e83db 100644 --- 
a/src/yc_bench/db/models/event.py +++ b/src/yc_bench/db/models/event.py @@ -11,6 +11,7 @@ from ..base import Base class EventType(str, Enum): TASK_HALF_PROGRESS = "task_half_progress" TASK_COMPLETED = "task_completed" + PAYMENT_DISPUTE = "payment_dispute" BANKRUPTCY = "bankruptcy" HORIZON_END = "horizon_end" diff --git a/src/yc_bench/db/models/ledger.py b/src/yc_bench/db/models/ledger.py index 7103f56..efac540 100644 --- a/src/yc_bench/db/models/ledger.py +++ b/src/yc_bench/db/models/ledger.py @@ -13,6 +13,7 @@ class LedgerCategory(str, Enum): TASK_REWARD = "task_reward" TASK_FAIL_PENALTY = "task_fail_penalty" TASK_CANCEL_PENALTY = "task_cancel_penalty" + PAYMENT_DISPUTE = "payment_dispute" class LedgerEntry(Base): __tablename__ = "ledger_entries" diff --git a/src/yc_bench/db/models/task.py b/src/yc_bench/db/models/task.py index a1d14dc..a5ab150 100644 --- a/src/yc_bench/db/models/task.py +++ b/src/yc_bench/db/models/task.py @@ -95,6 +95,10 @@ class Task(Base): default=0, server_default=text("0"), ) + advertised_reward_cents = mapped_column( + BigInteger, + nullable=True, + ) class TaskRequirement(Base): __tablename__ = "task_requirements" diff --git a/src/yc_bench/runner/extract.py b/src/yc_bench/runner/extract.py index 1068f5f..f43e08e 100644 --- a/src/yc_bench/runner/extract.py +++ b/src/yc_bench/runner/extract.py @@ -20,7 +20,7 @@ def extract_time_series(db_factory, company_id: UUID) -> Dict[str, Any]: "tasks": tasks, "ledger": ledger, "client_trust": client_trust, - "trust_reward_formula": "continuous: reward = listed × (0.50 + client_mult² × 0.25 × trust²/5.0); work_reduction = 0.40 × trust/5.0; cross_client_decay = 0.03/task; tiers: Standard=[0.7,1.0), Premium=[1.0,1.7), Enterprise=[1.7,2.5]; specialty_bias=0.70", + "trust_reward_formula": "continuous: work_reduction = 0.40 × trust/5.0; cross_client_decay = 0.03/task; tiers: Standard=[0.7,1.0), Premium=[1.0,1.7), Enterprise=[1.7,2.5]; specialty_bias=0.70; RAT clients: scope_creep + payment_disputes above 
loyalty_reveal_trust", } @@ -200,6 +200,13 @@ def _extract_client_trust(db, company_id: UUID) -> List[Dict[str, Any]]: client_names = {str(ct.client_id): name for ct, name in trust_rows} + # Fetch loyalty scores for post-hoc analysis + client_loyalty = {} + for ct_row, _ in trust_rows: + c = db.query(Client).filter(Client.id == ct_row.client_id).one_or_none() + if c: + client_loyalty[str(c.id)] = c.loyalty + # Get all tasks that affect trust (completed or failed), ordered by completion time tasks = ( db.query(Task) @@ -226,6 +233,7 @@ def _extract_client_trust(db, company_id: UUID) -> List[Dict[str, Any]]: "time": first_time.isoformat(), "client_name": name, "trust_level": 0.0, + "loyalty": client_loyalty.get(cid, 0.0), }) last_event_time = first_time @@ -254,6 +262,7 @@ def _extract_client_trust(db, company_id: UUID) -> List[Dict[str, Any]]: "time": t.completed_at.isoformat(), "client_name": client_names[cid], "trust_level": round(trust_levels[cid], 4), + "loyalty": client_loyalty.get(cid, 0.0), }) last_event_time = t.completed_at @@ -297,6 +306,7 @@ def _extract_tasks(db, company_id: UUID) -> List[Dict[str, Any]]: "required_prestige": int(t.required_prestige), "required_trust": int(t.required_trust) if t.required_trust else 0, "reward_funds_cents": int(t.reward_funds_cents), + "advertised_reward_cents": int(t.advertised_reward_cents) if t.advertised_reward_cents else int(t.reward_funds_cents), "reward_prestige_delta": float(t.reward_prestige_delta) if t.reward_prestige_delta else 0.0, "status": t.status.value if hasattr(t.status, "value") else str(t.status), "accepted_at": t.accepted_at.isoformat() if t.accepted_at else None, diff --git a/src/yc_bench/runner/main.py b/src/yc_bench/runner/main.py index e4f659a..575bab1 100644 --- a/src/yc_bench/runner/main.py +++ b/src/yc_bench/runner/main.py @@ -269,11 +269,34 @@ def run_benchmark(args): _write_scratchpad(db_factory, company_id, carried_scratchpad) logger.info("Restored scratchpad from episode %d (%d chars).", 
episode - 1, len(carried_scratchpad)) - # 4. Set up live dashboard + # 4. Set up live dashboard + live transcript file dashboard = None on_turn_start = None on_turn = None + # Write live transcript alongside the DB so the streamlit dashboard can read it + _slug = args.model.replace("/", "_") + transcript_path = Path("db") / f"{args.config_name}_{args.seed}_{_slug}.transcript.jsonl" + if transcript_path.exists(): + transcript_path.unlink() + + def _write_live_transcript(snapshot, rs, commands): + """Append one JSONL line per turn for the streamlit dashboard.""" + if not rs.transcript: + return + entry = rs.transcript[-1] + import json as _json + line = _json.dumps({ + "turn": entry.turn, + "timestamp": entry.timestamp, + "agent_output": entry.agent_output, + "commands_executed": entry.commands_executed, + "sim_time": snapshot.get("sim_time", ""), + "funds_cents": snapshot.get("funds_cents", 0), + }, separators=(",", ":")) + with open(transcript_path, "a") as f: + f.write(line + "\n") + if use_live: from .dashboard import BenchmarkDashboard @@ -290,6 +313,10 @@ def run_benchmark(args): def on_turn(snapshot, rs, commands): dashboard.update(snapshot, rs, commands) + _write_live_transcript(snapshot, rs, commands) + else: + def on_turn(snapshot, rs, commands): + _write_live_transcript(snapshot, rs, commands) # 5. 
Run agent loop for this episode try: diff --git a/src/yc_bench/services/generate_clients.py b/src/yc_bench/services/generate_clients.py index 8ebe391..41c3052 100644 --- a/src/yc_bench/services/generate_clients.py +++ b/src/yc_bench/services/generate_clients.py @@ -42,10 +42,11 @@ class GeneratedClient: reward_multiplier: float # per-client bonus applied on top of trust reward tier: str = "Standard" specialty_domains: list[str] = field(default_factory=list) + loyalty: float = 0.0 # hidden loyalty score in [-1.0, 1.0] def generate_clients(*, run_seed: int, count: int, cfg: WorldConfig) -> list[GeneratedClient]: - """Generate clients with seeded reward multipliers, tiers, and specialty domains.""" + """Generate clients with seeded reward multipliers, tiers, specialty domains, and loyalty.""" if count <= 0: return [] if count > len(_CLIENT_NAME_POOL): @@ -54,19 +55,36 @@ def generate_clients(*, run_seed: int, count: int, cfg: WorldConfig) -> list[Gen streams = RngStreams(run_seed) rng = streams.stream("clients") names = rng.sample(_CLIENT_NAME_POOL, count) + + # Guarantee a fixed number of RATs: round(count * rat_fraction). + # First N clients (after shuffle) are RATs with loyalty in [-1, -0.3], + # remaining are loyal/neutral with loyalty in [-0.3, 1]. 
+ n_rats = max(1, round(count * cfg.loyalty_rat_fraction)) + clients = [] - for name in names: + for i, name in enumerate(names): mult = round(rng.triangular(cfg.client_reward_mult_low, cfg.client_reward_mult_high, cfg.client_reward_mult_mode), 2) tier = _tier_from_multiplier(mult, cfg) n_specialties = 1 if rng.random() < cfg.client_single_specialty_prob else 2 specialties = [d.value for d in rng.sample(_ALL_DOMAINS, n_specialties)] + + if i < n_rats: + # RAT: loyalty in [-1.0, -0.3] + loyalty = round(rng.uniform(-1.0, -0.3), 3) + else: + # Non-RAT: loyalty in [-0.3, 1.0] + loyalty = round(rng.triangular(-0.3, 1.0, cfg.loyalty_mode), 3) + clients.append(GeneratedClient( name=name, reward_multiplier=mult, tier=tier, specialty_domains=specialties, + loyalty=loyalty, )) + # Shuffle so RATs aren't always first in the list + rng.shuffle(clients) return clients diff --git a/src/yc_bench/services/generate_tasks.py b/src/yc_bench/services/generate_tasks.py index c95d515..b5f61a9 100644 --- a/src/yc_bench/services/generate_tasks.py +++ b/src/yc_bench/services/generate_tasks.py @@ -150,12 +150,14 @@ def _make_task(rng, cfg, prestige, serial, requirements, client_index=0): ) -def generate_tasks(*, run_seed, count, cfg, client_specialties=None): +def generate_tasks(*, run_seed, count, cfg, client_specialties=None, client_reward_mults=None): """Generate market tasks. Args: client_specialties: list of specialty domain lists, one per client index. e.g. [["research", "training"], ["inference"]] for 2 clients. + client_reward_mults: list of reward multipliers per client index. + Task rewards are scaled by the client's multiplier. 
""" if count <= 0: return [] @@ -169,8 +171,16 @@ def generate_tasks(*, run_seed, count, cfg, client_specialties=None): client_index = (idx - 1) % num_clients spec_domains = client_specialties[client_index % len(client_specialties)] if client_specialties else None requirements = _sample_requirements(rng, cfg, prestige=prestige, specialty_domains=spec_domains) - out.append(_make_task(rng, cfg, prestige, serial=idx, requirements=requirements, - client_index=client_index)) + task = _make_task(rng, cfg, prestige, serial=idx, requirements=requirements, + client_index=client_index) + # Apply client reward multiplier — higher-mult clients offer better-paying tasks + if client_reward_mults and client_index < len(client_reward_mults): + mult = client_reward_mults[client_index] + new_reward = int(task.reward_funds_cents * mult) + task = GeneratedTask( + **{**task.__dict__, "reward_funds_cents": new_reward} + ) + out.append(task) return out diff --git a/src/yc_bench/services/seed_world.py b/src/yc_bench/services/seed_world.py index d605b29..51d88de 100644 --- a/src/yc_bench/services/seed_world.py +++ b/src/yc_bench/services/seed_world.py @@ -58,8 +58,11 @@ def _seed_company_prestige(db, company, cfg): ) +_FIXED_WORLD_SEED = 1 # employees + clients identical across all run seeds + + def _seed_employees(db, company, req): - generated = generate_employees(run_seed=req.run_seed, count=req.employee_count, cfg=req.cfg) + generated = generate_employees(run_seed=_FIXED_WORLD_SEED, count=req.employee_count, cfg=req.cfg) for emp in generated: employee = Employee( id=uuid4(), @@ -83,11 +86,12 @@ def _seed_employees(db, company, req): def _seed_clients(db, company, req): """Create Client rows and ClientTrust rows (all starting at 0.0).""" - generated = generate_clients(run_seed=req.run_seed, count=req.cfg.num_clients, cfg=req.cfg) + generated = generate_clients(run_seed=_FIXED_WORLD_SEED, count=req.cfg.num_clients, cfg=req.cfg) clients = [] for gc in generated: client = 
Client(id=uuid4(), name=gc.name, reward_multiplier=gc.reward_multiplier, - tier=gc.tier, specialty_domains=gc.specialty_domains) + tier=gc.tier, specialty_domains=gc.specialty_domains, + loyalty=gc.loyalty) db.add(client) clients.append(client) db.add(ClientTrust( @@ -100,10 +104,12 @@ def _seed_clients(db, company, req): def _seed_market_tasks(db, company, req, clients): - # Build specialty list indexed by client order for domain-biased task generation + # Build specialty list and reward multipliers indexed by client order client_specialties = [c.specialty_domains or [] for c in clients] if clients else None + client_reward_mults = [c.reward_multiplier for c in clients] if clients else None generated = generate_tasks(run_seed=req.run_seed, count=req.market_task_count, cfg=req.cfg, - client_specialties=client_specialties) + client_specialties=client_specialties, + client_reward_mults=client_reward_mults) for task in generated: client = clients[task.client_index % len(clients)] if clients else None task_row = Task( diff --git a/system_design/11_client_trust.md b/system_design/11_client_trust.md index 7212812..56b5f83 100644 --- a/system_design/11_client_trust.md +++ b/system_design/11_client_trust.md @@ -1,118 +1,104 @@ -# Client Trust System +# Client Trust & Loyalty -**Location**: `services/generate_clients.py`, `services/generate_tasks.py`, `core/handlers/task_complete.py`, `cli/task_commands.py` +## The Big Idea -## Overview +Every client has a **hidden loyalty score** the agent can't see. Some clients are loyal (investing in them pays off), some are adversarial "RATs" (investing in them backfires). The agent has to figure out which is which from observed behavior — delayed consequences, not explicit labels. -Trust is the second progression axis alongside prestige. Prestige gates task access; trust determines profitability. Every task belongs to a client. Building trust increases payouts and reduces work, rewarding focused relationship-building. 
+This tests three things: -## Configuration +1. **Can the agent invest under uncertainty?** You don't know if a client is worth it until you've sunk 10+ tasks into them. +2. **Can the agent spot patterns?** RATs look normal at first. The only signal is that tasks from them fail deadlines more often and money sometimes disappears after completion. +3. **Can the agent cut losses?** Dropping a RAT costs the trust you built. Keeping one costs real money. -The trust system is controlled by **7 intuitive knobs** in `WorldConfig`. All internal parameters are derived automatically. +## How Trust Works -| Knob | Default | Meaning | -|------|---------|---------| -| `num_clients` | 8 | Number of clients in the game | -| `trust_max` | 5.0 | Maximum trust level | -| `trust_build_rate` | 20.0 | ~tasks to reach 80% max trust with one client | -| `trust_fragility` | 0.5 | 0–1: how punishing failures/inactivity are | -| `trust_focus_pressure` | 0.5 | 0–1: penalty for spreading work across clients | -| `trust_reward_ceiling` | 2.6 | Payout multiplier a Premium client gives at max trust | -| `trust_work_reduction_max` | 0.40 | Max work reduction at max trust (40%) | -| `trust_gating_fraction` | 0.20 | Fraction of tasks that require trust (~20%) | +Every client starts at trust 0. Completing tasks builds trust (0-5 scale). Trust gives two benefits: -### Derivation +- **Work reduction**: Up to 40% less work per task at max trust (loyal clients give clearer specs) +- **Gated tasks**: ~20% of high-reward tasks require minimum trust to accept -These knobs derive all internal parameters via `_derive_trust_params()`: +Trust decays daily and drops on failure/cancellation. Working for Client A erodes trust with all other clients (cross-client decay), so you can't maintain trust with everyone — you have to pick 2-3 clients to focus on. 
+ +## How Loyalty Works + +At world generation, each client gets a hidden loyalty score in [-1, 1]. A fixed number of clients — `max(1, round(num_clients × loyalty_rat_fraction))` — are RATs and draw from `uniform(-1.0, -0.3)`; all other clients draw from `triangular(-0.3, 1.0, mode=loyalty_mode)`. The resulting bands: + +- **Loyal** (> 0.3): ~50% of clients. Trust investment pays off via work reduction. +- **Neutral** (-0.3 to 0.3): ~35%. No special effects. +- **RAT** (< -0.3): ~15%. Adversarial. Looks normal, exploits you at higher trust. + +The agent never sees loyalty scores. It only sees: client name, tier, specialties, trust level. + +## What RATs Do + +RAT effects activate once trust exceeds `loyalty_reveal_trust` (default 0.5 for medium). The effects scale with `|loyalty| × sqrt(trust_fraction)` — sqrt scaling means they bite early and plateau, rather than being negligible until max trust. + +### 1. Scope Creep (Bait-and-Switch) + +When you accept a task from a RAT at sufficient trust, the **actual work required is secretly inflated** — but the deadline is calculated from the original (smaller) amount. The task looks completable but isn't. + +- **Max inflation**: `severity × 0.70` (medium: 56%) +- **Effect**: Tasks from RATs miss deadlines more often. The agent notices when progress milestones arrive later than expected. + +### 2. Payment Disputes (Delayed Clawback) + +After completing a RAT's task, there's a random chance a `PAYMENT_DISPUTE` event fires 2-7 days later, clawing back a chunk of the reward. + +- **Max clawback**: `severity × 0.80` of the reward (medium: 64%) +- **Max probability**: `severity × 0.50` per task (medium: 40%) +- **Effect**: The agent gets paid, then days later money disappears. The only way to notice is checking `client history` and seeing listed rewards don't match received amounts. + +### 3. Work Reduction for Loyal Clients + +Loyal clients reduce required work by `trust_work_reduction_max × trust / trust_max`. This is the payoff for choosing well — loyal clients make tasks faster, meaning more tasks, more revenue. 
+ +## Intensity Scaling + +All RAT effects use the same intensity formula: ``` -gain_base = trust_max × 1.6 / trust_build_rate -fail_penalty = fragility × 0.6 -cancel_penalty = fragility × 1.0 -decay_per_day = fragility × 0.03 -cross_client_decay = focus_pressure × 0.06 -reward_scale = (reward_ceiling - 0.50) / (1.69 × trust_max) -reward_threshold = 1.0 - 2 × gating_fraction -reward_ramp = 2 × gating_fraction +trust_fraction = (trust - threshold) / (max_trust - threshold) +intensity = |loyalty| × sqrt(trust_fraction) ``` -## Client Generation +The sqrt makes effects noticeable early (trust barely above threshold) rather than negligible until max trust. At medium difficulty with a RAT (loyalty -0.57) at trust 2.0: -At world-seeding time, `num_clients` clients are generated with: -- **Reward multiplier**: `triangular(0.7, 2.5, mode=1.0)` — hidden from agent -- **Tier** (visible): Standard `[0.7, 1.0)`, Premium `[1.0, 1.7)`, Enterprise `[1.7, 2.5]` -- **Specialties**: 1 domain (60%) or 2 domains (40%) -## Task Domain Bias +| Effect | Value | +| ------------------- | ---------------------- | +| Scope creep | +18% work inflation | +| Dispute probability | 13% per completed task | +| Clawback amount | up to 21% of reward | -First domain pick has 70% chance of matching client specialty. Remaining domains uniform random. -## Trust Gating +## How the Agent Can Detect RATs -High-reward tasks may require trust: +The agent has one tool: `yc-bench client history`. This shows per-client: -``` -reward_frac = (reward - floor) / (ceiling - floor) -trust_prob = max(0, (reward_frac - threshold) / ramp) -level = clamp(1 + reward_frac × 3, 1, 4) -``` +- Tasks completed (success/fail count) +- Listed reward total vs net received (after disputes) +- Dispute count -Trust-gated tasks get a 15% reward boost per required trust level. +An agent that periodically checks history will notice: -**Why**: Clients reserve best projects for proven vendors. 
+- A client whose tasks fail deadlines more than others (scope creep) +- A client where net received < listed rewards (disputes) -## Trust Reward Formula (at task accept) +An agent that never checks will keep getting exploited. -``` -trust_multiplier = 0.50 + client_mult² × reward_scale × trust² / trust_max -actual_reward = listed_reward × trust_multiplier -``` +## Config Knobs -At trust 0, everyone gets 50% of listed reward. At max trust: -| Tier | mult | multiplier | -|------|------|-----------| -| Standard | 0.85 | 1.40× | -| Premium | 1.30 | 2.60× | -| Enterprise | 2.00 | 5.50× | +| Knob | Medium | Hard | Nightmare | +| ---------------------- | ------ | ---- | --------- | +| `loyalty_rat_fraction` | 0.15 | 0.20 | 0.25 | +| `loyalty_severity` | 0.8 | 0.7 | 0.9 | +| `loyalty_reveal_trust` | 0.5 | 1.5 | 1.0 | -**Why**: Quadratic on both mult and trust creates dramatic tier separation at high trust. Enterprise is worse than Standard at trust 0 — a genuine investment gamble. -## Work Reduction (at task accept) +Derived from severity: -``` -work_reduction = trust_work_reduction_max × trust / trust_max -required_qty *= (1 - work_reduction) -``` +- `scope_creep_max = severity × 0.70` +- `dispute_clawback_max = severity × 0.80` +- `dispute_prob_max = severity × 0.50` -**Why**: Trusted clients give clearer specs. Creates virtuous cycle: trust → less work → faster completion → more tasks → more trust. - -## Trust Gain (task success) - -``` -gain = gain_base × (1 - trust/trust_max) ^ 1.5 -``` - -Diminishing returns: ~0.40/task at trust 0, ~0.07/task at trust 4. - -## Trust Loss - -| Event | Penalty | -|-------|---------| -| Task failure | `fragility × 0.6` (default 0.3) | -| Task cancel | `fragility × 1.0` (default 0.5) | - -## Trust Decay - -- **Daily**: `fragility × 0.03` per day (default 0.015) -- **Cross-client**: `focus_pressure × 0.06` per task for other client (default 0.03) - -**Why**: Cross-client decay penalizes scattering and rewards focusing on 2–3 clients. 
- -## Sim Resume When Idle - -`sim resume` is allowed even with no active tasks — time moves forward regardless. Calling it while idle advances to the next payroll event, burning runway with zero revenue. The prompt warns the agent not to do this, but doesn't prevent it. If the agent ignores the warning and burns payroll, that's a valid failure mode. - -## Agent Visibility - -Visible: client name, trust_level, tier, specialties. Not visible: exact multiplier, formulas, decay rates.