Calibrated domain prestige bump

2026-04-19 12:58:03 +00:00 · 2026-03-06 14:40:45 -08:00 · 2026-03-06 14:40:45 -08:00 · 99e69190ec
commit 99e69190ec
parent 5671e0102f
20 changed files with 6829 additions and 133 deletions
--- a/README.md
+++ b/README.md
@ -45,7 +45,7 @@ Outputs a SQLite DB in `db/` and a JSON rollout in `results/`.
 ### Run multiple models in parallel

 ```bash
-bash scripts/run_benchmark.sh --seed 1 --config challenge
+bash scripts/run_benchmark.sh --seed 1 --config hard
 ```

 ---
--- a/results/yc_bench_result_hard_1_openai_gpt-5.4.json
+++ b/results/yc_bench_result_hard_1_openai_gpt-5.4.json
--- a/scripts/bot_runner.py
+++ b/scripts/bot_runner.py
@ -91,15 +91,15 @@ def _compute_deadline(accepted_at, max_domain_qty, cfg):


 def _build_candidates(db, company_id, sim_state, world_cfg, emp_skills):
-    """Build CandidateTask list for all market tasks the company can see."""
+    """Build CandidateTask list for all market tasks the company can accept (per-domain prestige gating)."""
    prestige_rows = db.query(CompanyPrestige).filter(
        CompanyPrestige.company_id == company_id
    ).all()
-    max_prestige = max((float(p.prestige_level) for p in prestige_rows), default=1.0)
+    prestige_map = {p.domain: float(p.prestige_level) for p in prestige_rows}
+    min_prestige = min(prestige_map.values()) if prestige_map else 1.0

    market_tasks = db.query(Task).filter(
        Task.status == TaskStatus.MARKET,
-        Task.required_prestige <= int(max_prestige),
    ).order_by(Task.reward_funds_cents.desc()).all()

    all_skills = [{d: r for d, r in e["skills"].items()} for e in emp_skills]
@ -109,6 +109,15 @@ def _build_candidates(db, company_id, sim_state, world_cfg, emp_skills):
        reqs = db.query(TaskRequirement).filter(
            TaskRequirement.task_id == task.id
        ).all()
+
+        # Per-domain prestige check: all required domains must meet threshold
+        meets_prestige = all(
+            prestige_map.get(r.domain, 1.0) >= task.required_prestige
+            for r in reqs
+        )
+        if not meets_prestige:
+            continue
+
        max_domain_qty = max(float(r.required_qty) for r in reqs)
        task_reqs = [{"domain": r.domain, "required_qty": float(r.required_qty)} for r in reqs]

@ -128,7 +137,7 @@ def _build_candidates(db, company_id, sim_state, world_cfg, emp_skills):
            is_completable=is_completable,
        ))

-    return candidates, max_prestige
+    return candidates, min_prestige


 # ── Strategy functions ──────────────────────────────────────────────────────
--- a/scripts/plot_multi_model.py
+++ b/scripts/plot_multi_model.py
@ -1,7 +1,7 @@
 """Multi-model comparison plot: funds curves + cost vs budget.

 Run from the repo root:
-    uv run python scripts/plot_multi_model.py [--seed 1] [--config challenge] [--budget 30]
+    uv run python scripts/plot_multi_model.py [--seed 1] [--config hard] [--budget 30]
 """
 import argparse
 import json
@ -32,7 +32,7 @@ INITIAL_FUNDS_CENTS = 25_000_000  # $250K
 def parse_args():
    p = argparse.ArgumentParser()
    p.add_argument("--seed", type=int, default=1)
-    p.add_argument("--config", default="challenge")
+    p.add_argument("--config", default="hard")
    p.add_argument("--budget", type=float, default=30.0)
    p.add_argument("--out", default=None, help="Output PNG path (default: plots/funds_curves.png)")
    return p.parse_args()
--- a/scripts/run_benchmark.sh
+++ b/scripts/run_benchmark.sh
@ -9,7 +9,7 @@
 set -euo pipefail

 SEEDS="1 2 3"
-CONFIG=challenge
+CONFIG=hard

 while [[ $# -gt 0 ]]; do
  case "$1" in
--- a/src/yc_bench/cli/start_command.py
+++ b/src/yc_bench/cli/start_command.py
@ -69,12 +69,11 @@ def detect_key(api_key: str) -> tuple[str, str]:
 # ── Config presets ───────────────────────────────────────────────────────

 PRESETS = [
-    ("tutorial",  "Tutorial",   "1 yr", "3 emp",  "50 tasks",  "Learn the basics"),
-    ("easy",      "Easy",       "1 yr", "5 emp",  "100 tasks", "Gentle intro"),
-    ("medium",    "Medium",     "1 yr", "5 emp",  "150 tasks", "Prestige + specialization"),
-    ("hard",      "Hard",       "1 yr", "7 emp",  "200 tasks", "Deadline pressure"),
-    ("challenge", "Challenge",  "3 yr", "5 emp",  "200 tasks", "Long-horizon endurance"),
-    ("nightmare", "Nightmare",  "1 yr", "8 emp",  "300 tasks", "Sustained perfection"),
+    ("tutorial",  "Tutorial",   "1 yr", "10 emp", "200 tasks", "Learn the basics"),
+    ("easy",      "Easy",       "1 yr", "10 emp", "200 tasks", "Gentle intro"),
+    ("medium",    "Medium",     "1 yr", "10 emp", "200 tasks", "Prestige + specialization"),
+    ("hard",      "Hard",       "1 yr", "10 emp", "200 tasks", "Deadline pressure"),
+    ("nightmare", "Nightmare",  "1 yr", "10 emp", "200 tasks", "Sustained perfection"),
 ]


--- a/src/yc_bench/cli/task_commands.py
+++ b/src/yc_bench/cli/task_commands.py
@ -57,20 +57,21 @@ def task_accept(
        if task.status != TaskStatus.MARKET:
            error_output(f"Task {task_id} is not in market status (current: {task.status.value}).")

-        # Validate prestige requirement
+        # Validate per-domain prestige requirement
        company_id = sim_state.company_id
+        reqs = db.query(TaskRequirement).filter(TaskRequirement.task_id == tid).all()
        prestige_rows = db.query(CompanyPrestige).filter(
            CompanyPrestige.company_id == company_id
        ).all()
-        max_prestige = max((float(p.prestige_level) for p in prestige_rows), default=1.0)
+        prestige_map = {p.domain: float(p.prestige_level) for p in prestige_rows}

-        if task.required_prestige > max_prestige:
-            error_output(
-                f"Company max prestige ({max_prestige}) does not meet task requirement ({task.required_prestige})."
-            )
-
-        # Compute deadline
-        reqs = db.query(TaskRequirement).filter(TaskRequirement.task_id == tid).all()
+        for req in reqs:
+            domain_prestige = prestige_map.get(req.domain, 1.0)
+            if task.required_prestige > domain_prestige:
+                error_output(
+                    f"Company prestige in {req.domain.value} ({domain_prestige:.1f}) "
+                    f"does not meet task requirement ({task.required_prestige})."
+                )
        max_domain_qty = max(float(r.required_qty) for r in reqs)
        accepted_at = sim_state.sim_time
        deadline = _compute_deadline(accepted_at, max_domain_qty)
--- a/src/yc_bench/config/presets/challenge.toml
+++ b/src/yc_bench/config/presets/challenge.toml
@ -1,76 +0,0 @@
-# challenge — 3-year benchmark designed for differentiated model behavior
-#
-# Design goals:
-#   - Models that focus employees on tasks complete on time, build prestige,
-#     and can reach the 3-year horizon.
-#   - Models that spread employees or forget to dispatch go bankrupt.
-#   - Wide outcome variance across models: some die early, some survive.
-#
-# Economics (seed=1, 5 employees):
-#   Monthly payroll ≈ $32K.  Starting runway ≈ 7.8 months.
-#   Mode task: 2 domains × 700 units = 1400 total.
-#   Deadline: max(7, 1400/200) = 7 business days (minimum).
-#   All 5 focused on one task, split ~2+3: domain1 3×5.1×9=137.7 units/day →
-#     700/137.7 = 5.1 days; domain2 2×5.1×9=91.8 → 700/91.8 = 7.6 days > 7 FAIL.
-#   → Need ≥3 employees per domain: put 3 on domain1 and 3 on domain2 (count ×2)
-#     or accept single-domain tasks where all 5 focus.
-#   When done right: complete 3 tasks/month × avg $30K = $90K > $32K payroll ✓
-#   Late game: prestige-5 tasks pay $30K × 3.2× = $96K each → strong growth.
-
-extends = "default"
-
-name        = "challenge"
-description = "3-year benchmark calibrated for differentiated model behavior. ~7.8 months starting runway. Strategy and focus determine survival."
-
-# ---------------------------------------------------------------------------
-# Simulation
-# ---------------------------------------------------------------------------
-
-[sim]
-horizon_years = 3
-
-# ---------------------------------------------------------------------------
-# Agent loop
-# ---------------------------------------------------------------------------
-
-[loop]
-# Force a sim advance if the agent goes 5 consecutive turns without one.
-auto_advance_after_turns = 5
-
-# 500 turns covers 3 years for an efficient agent:
-#   advancing every 3 turns → 167 advances × ~14 avg sim days = 2338 days = 6.4 yrs.
-#   forced advances only → 100 advances × ~14 days = 1400 days = 3.8 yrs.
-max_turns = 500
-
-# ---------------------------------------------------------------------------
-# World — 5 employees keeps payroll manageable (~$32K/month).
-# 200 market tasks gives a deep enough pool to find tasks in any prestige tier.
-# ---------------------------------------------------------------------------
-
-[world]
-num_employees    = 5
-num_market_tasks = 200
-deadline_qty_per_day = 100.0
-
-# ---------------------------------------------------------------------------
-# Task size: mode=700 (smaller than hardened default 1400) so tasks complete
-# in ~5-10 sim days when employees are focused. This creates frequent reward
-# events visible in the funds curve, and makes payroll crises recoverable.
-# ---------------------------------------------------------------------------
-
-[world.dist.required_qty]
-type = "triangular"
-low  = 200
-high = 2000
-mode = 700
-
-# ---------------------------------------------------------------------------
-# Prestige: mode=3 ensures a healthy supply of prestige-1 and prestige-2 tasks
-# at the start. Agents that specialise unlock 4–8 tasks paying 2–5× more.
-# ---------------------------------------------------------------------------
-
-[world.dist.required_prestige]
-type = "triangular"
-low  = 1
-high = 8
-mode = 3
--- a/src/yc_bench/config/presets/default.toml
+++ b/src/yc_bench/config/presets/default.toml
@ -57,7 +57,7 @@ initial_funds_cents      = 25_000_000    # $250,000
 initial_prestige_level   = 1.0
 work_hours_per_day       = 9.0

-num_market_tasks             = 500
+num_market_tasks             = 200
 market_browse_default_limit  = 50

 # Salary bump per completed task — each assigned employee gets this raise.
@ -77,6 +77,16 @@ penalty_cancel_multiplier  = 2.0    # hardened: was 1.2
 # At 0.55: a prestige-8 task pays ~4.85x more than a prestige-1 task.
 reward_prestige_scale = 0.55    # hardened: was 0.3

+# Daily prestige decay, applied to every domain. With no offsetting gains a
+# domain loses -0.01/day → -0.3/month, so an untouched domain drops ~1 level
+# every ~3 months. Discourages single-domain hyper-specialization.
+prestige_decay_per_day = 0.01
+
+# Required qty scaling by prestige: qty *= 1 + scale * (prestige - 1).
+# At 0.3: prestige-5 tasks need 2.2x the work of prestige-1 tasks.
+# High prestige pays more but demands proportionally more capacity.
+prestige_qty_scale = 0.3
+
 # --- Deadline ---
 # Deadline = max(deadline_min_biz_days, max_domain_qty / deadline_qty_per_day).
 # Domains are worked in parallel, so deadline scales with heaviest domain, not sum.
@ -133,15 +143,15 @@ low  = 500          # hardened: base default is 200
 high = 3000
 mode = 1400         # hardened: base default is 800

-# Prestige delta awarded per required domain on task success.
-# Left-skewed beta: most completions give small gains; occasional large jumps.
+# Prestige delta awarded per domain on task success.
+# Mean ~0.1: climbing prestige 1→5 takes ~40 tasks in that domain (before decay).
 [world.dist.reward_prestige_delta]
 type  = "beta"
 alpha = 1.2
 beta  = 2.8
-scale = 2.0
+scale = 0.35
 low   = 0.0
-high  = 2.0
+high  = 0.35

 # Skill rate boost applied to each assigned employee on task success.
 # Expressed as a fraction of current rate: 0.12 → average 12% boost.
--- a/src/yc_bench/config/presets/easy.toml
+++ b/src/yc_bench/config/presets/easy.toml
@ -28,8 +28,7 @@ horizon_years = 1
 auto_advance_after_turns = 8

 [world]
-num_employees    = 5
-num_market_tasks = 100
+# Inherits num_employees=10, num_market_tasks=200 from default.

 # Moderate deadlines: 60 qty/day → ~12 day deadline. Comfortable with 3–4 tasks.
 deadline_qty_per_day = 60.0
--- a/src/yc_bench/config/presets/fast_test.toml
+++ b/src/yc_bench/config/presets/fast_test.toml
@ -19,8 +19,7 @@ auto_advance_after_turns = 5
 max_turns                = 50

 [world]
-num_employees    = 5
-num_market_tasks = 100
+# Inherits num_employees=10, num_market_tasks=200 from default.

 # Deadline based on max per-domain qty (parallel domain work).
 deadline_qty_per_day = 150
--- a/src/yc_bench/config/presets/hard.toml
+++ b/src/yc_bench/config/presets/hard.toml
@ -40,8 +40,7 @@ horizon_years = 1
 auto_advance_after_turns = 10

 [world]
-num_employees    = 7
-num_market_tasks = 200
+# Inherits num_employees=10, num_market_tasks=200 from default.

 # Tight deadlines: 1200/150 = 8 days.
 # 1 task with 5 per domain → 5.8 days. OK.
--- a/src/yc_bench/config/presets/high_reward.toml
+++ b/src/yc_bench/config/presets/high_reward.toml
@ -23,6 +23,6 @@ mode = 6_000_000    # $60,000
 type  = "beta"
 alpha = 1.2
 beta  = 2.8
-scale = 2.0
+scale = 0.5
 low   = 0.0
-high  = 3.0         # raised ceiling: up to 3.0 instead of 2.0
+high  = 0.5         # raised ceiling vs default 0.35
--- a/src/yc_bench/config/presets/medium.toml
+++ b/src/yc_bench/config/presets/medium.toml
@ -36,8 +36,7 @@ horizon_years = 1
 auto_advance_after_turns = 8

 [world]
-num_employees    = 5
-num_market_tasks = 150
+# Inherits num_employees=10, num_market_tasks=200 from default.

 # Deadline uses max per-domain qty. 900/100 = 9 days.
 # 2 concurrent tasks: 5 per task → 4.3 days each. Manageable.
--- a/src/yc_bench/config/presets/nightmare.toml
+++ b/src/yc_bench/config/presets/nightmare.toml
@ -49,8 +49,7 @@ horizon_years = 1
 auto_advance_after_turns = 10

 [world]
-num_employees    = 8
-num_market_tasks = 300
+# Inherits num_employees=10, num_market_tasks=200 from default.

 # Razor deadlines: 1600/200 = 8 days.
 # 1 task with 5 per domain → 7.7 days. Barely makes it.
@ -86,13 +85,13 @@ low  = 600
 high = 3000
 mode = 1600     # Large work volumes — no quick wins.

-# Larger prestige jumps on success — makes climbing feasible if you
-# never fail. But the fail penalty is so high that one blown task
-# wipes out 2 successes worth of prestige.
+# Slightly larger prestige gains than default (~0.17 avg) to make
+# climbing feasible despite the steep penalty. But one blown task
+# still wipes out multiple successes worth of prestige.
 [world.dist.reward_prestige_delta]
 type  = "beta"
 alpha = 1.5
 beta  = 2.5
-scale = 2.5
+scale = 0.45
 low   = 0.0
-high  = 2.5
+high  = 0.45
--- a/src/yc_bench/config/presets/tutorial.toml
+++ b/src/yc_bench/config/presets/tutorial.toml
@ -28,8 +28,7 @@ horizon_years = 1
 auto_advance_after_turns = 5

 [world]
-num_employees    = 3
-num_market_tasks = 50
+# Inherits num_employees=10, num_market_tasks=200 from default.

 # Very generous deadlines: 30 qty/day → most tasks get 13+ day deadline.
 deadline_qty_per_day = 30.0
--- a/src/yc_bench/config/schema.py
+++ b/src/yc_bench/config/schema.py
@ -49,9 +49,10 @@ class WorldDists(BaseModel):
    required_qty: DistSpec = Field(
        default_factory=lambda: TriangularDist(low=200, high=3000, mode=800)
    )
-    # Prestige delta awarded on task success.
+    # Prestige delta awarded per domain on task success.
+    # Mean ~0.1: climbing from prestige 1→5 takes ~40 tasks in that domain (before decay).
    reward_prestige_delta: DistSpec = Field(
-        default_factory=lambda: BetaDist(alpha=1.2, beta=2.8, scale=2.0, low=0.0, high=2.0)
+        default_factory=lambda: BetaDist(alpha=1.2, beta=2.8, scale=0.35, low=0.0, high=0.35)
    )
    # Skill rate boost fraction applied to each assigned employee on task success.
    skill_boost: DistSpec = Field(
@ -124,6 +125,15 @@ class WorldConfig(BaseModel):
    # At 0.55: prestige-8 tasks pay ~4.85x more than prestige-1.
    reward_prestige_scale: float = 0.3

+    # Daily prestige decay, applied to every domain. With no offsetting gains
+    # a domain loses -0.01/day → -0.3/month, so an untouched domain drops
+    # ~1 level every ~3 months. Floored at prestige_min.
+    prestige_decay_per_day: float = 0.01
+
+    # Required qty scaling by prestige: qty *= 1 + prestige_qty_scale * (prestige - 1).
+    # At 0.3: prestige-5 tasks need 2.2× the work of prestige-1 tasks.
+    prestige_qty_scale: float = 0.3
+
    # --- Deadline computation ---
    deadline_qty_per_day: float = 150.0  # max per-domain qty / this = deadline days
    deadline_min_biz_days: int = 7
--- a/src/yc_bench/core/engine.py
+++ b/src/yc_bench/core/engine.py
@ -19,11 +19,12 @@ from uuid import UUID

 from sqlalchemy.orm import Session

-from ..db.models.company import Company
+from ..db.models.company import Company, CompanyPrestige
 from ..db.models.employee import Employee
 from ..db.models.event import EventType, SimEvent
 from ..db.models.ledger import LedgerCategory, LedgerEntry
 from ..db.models.sim_state import SimState
+from ..config import get_world_config
 from .business_time import iter_monthly_payroll_boundaries
 from .eta import recalculate_etas
 from .events import consume_event, fetch_next_event, insert_event
@ -103,6 +104,19 @@ def dispatch_event(db: Session, event: SimEvent, sim_time: datetime, company_id:
    return {"type": "unknown", "event_type": event.event_type.value}


+def apply_prestige_decay(db: Session, company_id: UUID, days_elapsed: float) -> None:
+    """Reduce prestige in all domains by decay_rate × days. Floors at prestige_min."""
+    wc = get_world_config()
+    if wc.prestige_decay_per_day <= 0 or days_elapsed <= 0:
+        return
+    decay = Decimal(str(wc.prestige_decay_per_day * days_elapsed))
+    floor = Decimal(str(wc.prestige_min))
+    rows = db.query(CompanyPrestige).filter(CompanyPrestige.company_id == company_id).all()
+    for row in rows:
+        row.prestige_level = max(floor, row.prestige_level - decay)
+    db.flush()
+
+
 def advance_time(
    db: Session,
    company_id: UUID,
@ -148,9 +162,11 @@ def advance_time(

        action_type, action_time = candidates[0]

-        # Flush progress from current_time to action_time
+        # Flush progress and apply prestige decay from current_time to action_time
        if action_time > current_time:
+            days_elapsed = (action_time - current_time).total_seconds() / 86400.0
            flush_progress(db, company_id, current_time, action_time)
+            apply_prestige_decay(db, company_id, days_elapsed)
            current_time = action_time

        if action_type == "target":
--- a/src/yc_bench/runner/dashboard.py
+++ b/src/yc_bench/runner/dashboard.py
@ -129,7 +129,7 @@ def _query_detailed_snapshot(db_factory, company_id) -> dict[str, Any]:
                ]
                deadline_str = t.deadline.strftime("%Y-%m-%d") if t.deadline else "-"
                tasks_detail.append(TaskInfo(
-                    title=t.title[:20],
+                    title=t.title,
                    status=status.value,
                    prestige=t.required_prestige,
                    reward_dollars=t.reward_funds_cents / 100.0,
@ -398,7 +398,7 @@ class BenchmarkDashboard:
                prog_parts.append(f"{tag} {bar}")
            progress_str = " ".join(prog_parts)

-            table.add_row(marker, t.title[:20], reward, t.deadline, progress_str)
+            table.add_row(marker, t.title, reward, t.deadline, progress_str)

        remaining = len(s.tasks_detail) - 6
        if remaining > 0:
--- a/src/yc_bench/services/generate_tasks.py
+++ b/src/yc_bench/services/generate_tasks.py
@ -63,10 +63,11 @@ def _sample_required_qty(rng, cfg):
    return int(sample_from_spec(rng, cfg.dist.required_qty))


-def _sample_requirements(rng, cfg):
+def _sample_requirements(rng, cfg, prestige=1):
    k = _sample_domain_count(rng, cfg)
    picked_domains = sample_without_replacement(rng, _ALL_DOMAINS, k)
-    return {domain: _sample_required_qty(rng, cfg) for domain in picked_domains}
+    scale = 1 + cfg.prestige_qty_scale * (prestige - 1)
+    return {domain: int(_sample_required_qty(rng, cfg) * scale) for domain in picked_domains}


 def _make_task(rng, cfg, prestige, serial, requirements):
@ -97,8 +98,8 @@ def generate_tasks(*, run_seed, count, cfg=None):
    out = []
    for idx in range(1, count + 1):
        rng = streams.stream(f"task_{idx}")
-        requirements = _sample_requirements(rng, cfg)
        prestige = _sample_required_prestige(rng, cfg, index=idx - 1)
+        requirements = _sample_requirements(rng, cfg, prestige=prestige)
        out.append(_make_task(rng, cfg, prestige, serial=idx, requirements=requirements))
    return out

@ -138,8 +139,8 @@ def generate_replacement_task(*, run_seed, replenish_counter, cfg=None):
        cfg = WorldConfig()
    streams = RngStreams(run_seed)
    rng = streams.stream(f"replenish_{replenish_counter}")
-    requirements = _sample_requirements(rng, cfg)
    prestige = _sample_required_prestige(rng, cfg)
+    requirements = _sample_requirements(rng, cfg, prestige=prestige)
    return _make_task(rng, cfg, prestige, serial=replenish_counter, requirements=requirements)