Calibrated domain prestige bump

This commit is contained in:
Muyu He 2026-03-06 14:40:45 -08:00
parent 5671e0102f
commit 99e69190ec
20 changed files with 6829 additions and 133 deletions

View file

@ -45,7 +45,7 @@ Outputs a SQLite DB in `db/` and a JSON rollout in `results/`.
### Run multiple models in parallel
```bash
bash scripts/run_benchmark.sh --seed 1 --config challenge
bash scripts/run_benchmark.sh --seed 1 --config hard
```
---

File diff suppressed because it is too large Load diff

View file

@ -91,15 +91,15 @@ def _compute_deadline(accepted_at, max_domain_qty, cfg):
def _build_candidates(db, company_id, sim_state, world_cfg, emp_skills):
"""Build CandidateTask list for all market tasks the company can see."""
"""Build CandidateTask list for all market tasks the company can accept (per-domain prestige gating)."""
prestige_rows = db.query(CompanyPrestige).filter(
CompanyPrestige.company_id == company_id
).all()
max_prestige = max((float(p.prestige_level) for p in prestige_rows), default=1.0)
prestige_map = {p.domain: float(p.prestige_level) for p in prestige_rows}
min_prestige = min(prestige_map.values()) if prestige_map else 1.0
market_tasks = db.query(Task).filter(
Task.status == TaskStatus.MARKET,
Task.required_prestige <= int(max_prestige),
).order_by(Task.reward_funds_cents.desc()).all()
all_skills = [{d: r for d, r in e["skills"].items()} for e in emp_skills]
@ -109,6 +109,15 @@ def _build_candidates(db, company_id, sim_state, world_cfg, emp_skills):
reqs = db.query(TaskRequirement).filter(
TaskRequirement.task_id == task.id
).all()
# Per-domain prestige check: all required domains must meet threshold
meets_prestige = all(
prestige_map.get(r.domain, 1.0) >= task.required_prestige
for r in reqs
)
if not meets_prestige:
continue
max_domain_qty = max(float(r.required_qty) for r in reqs)
task_reqs = [{"domain": r.domain, "required_qty": float(r.required_qty)} for r in reqs]
@ -128,7 +137,7 @@ def _build_candidates(db, company_id, sim_state, world_cfg, emp_skills):
is_completable=is_completable,
))
return candidates, max_prestige
return candidates, min_prestige
# ── Strategy functions ──────────────────────────────────────────────────────

View file

@ -1,7 +1,7 @@
"""Multi-model comparison plot: funds curves + cost vs budget.
Run from the repo root:
uv run python scripts/plot_multi_model.py [--seed 1] [--config challenge] [--budget 30]
uv run python scripts/plot_multi_model.py [--seed 1] [--config hard] [--budget 30]
"""
import argparse
import json
@ -32,7 +32,7 @@ INITIAL_FUNDS_CENTS = 25_000_000 # $250K
def parse_args():
p = argparse.ArgumentParser()
p.add_argument("--seed", type=int, default=1)
p.add_argument("--config", default="challenge")
p.add_argument("--config", default="hard")
p.add_argument("--budget", type=float, default=30.0)
p.add_argument("--out", default=None, help="Output PNG path (default: plots/funds_curves.png)")
return p.parse_args()

View file

@ -9,7 +9,7 @@
set -euo pipefail
SEEDS="1 2 3"
CONFIG=challenge
CONFIG=hard
while [[ $# -gt 0 ]]; do
case "$1" in

View file

@ -69,12 +69,11 @@ def detect_key(api_key: str) -> tuple[str, str]:
# ── Config presets ───────────────────────────────────────────────────────
PRESETS = [
("tutorial", "Tutorial", "1 yr", "3 emp", "50 tasks", "Learn the basics"),
("easy", "Easy", "1 yr", "5 emp", "100 tasks", "Gentle intro"),
("medium", "Medium", "1 yr", "5 emp", "150 tasks", "Prestige + specialization"),
("hard", "Hard", "1 yr", "7 emp", "200 tasks", "Deadline pressure"),
("challenge", "Challenge", "3 yr", "5 emp", "200 tasks", "Long-horizon endurance"),
("nightmare", "Nightmare", "1 yr", "8 emp", "300 tasks", "Sustained perfection"),
("tutorial", "Tutorial", "1 yr", "10 emp", "200 tasks", "Learn the basics"),
("easy", "Easy", "1 yr", "10 emp", "200 tasks", "Gentle intro"),
("medium", "Medium", "1 yr", "10 emp", "200 tasks", "Prestige + specialization"),
("hard", "Hard", "1 yr", "10 emp", "200 tasks", "Deadline pressure"),
("nightmare", "Nightmare", "1 yr", "10 emp", "200 tasks", "Sustained perfection"),
]

View file

@ -57,20 +57,21 @@ def task_accept(
if task.status != TaskStatus.MARKET:
error_output(f"Task {task_id} is not in market status (current: {task.status.value}).")
# Validate prestige requirement
# Validate per-domain prestige requirement
company_id = sim_state.company_id
reqs = db.query(TaskRequirement).filter(TaskRequirement.task_id == tid).all()
prestige_rows = db.query(CompanyPrestige).filter(
CompanyPrestige.company_id == company_id
).all()
max_prestige = max((float(p.prestige_level) for p in prestige_rows), default=1.0)
prestige_map = {p.domain: float(p.prestige_level) for p in prestige_rows}
if task.required_prestige > max_prestige:
error_output(
f"Company max prestige ({max_prestige}) does not meet task requirement ({task.required_prestige})."
)
# Compute deadline
reqs = db.query(TaskRequirement).filter(TaskRequirement.task_id == tid).all()
for req in reqs:
domain_prestige = prestige_map.get(req.domain, 1.0)
if task.required_prestige > domain_prestige:
error_output(
f"Company prestige in {req.domain.value} ({domain_prestige:.1f}) "
f"does not meet task requirement ({task.required_prestige})."
)
max_domain_qty = max(float(r.required_qty) for r in reqs)
accepted_at = sim_state.sim_time
deadline = _compute_deadline(accepted_at, max_domain_qty)

View file

@ -1,76 +0,0 @@
# challenge — 3-year benchmark designed for differentiated model behavior
#
# Design goals:
# - Models that focus employees on tasks complete on time, build prestige,
# and can reach the 3-year horizon.
# - Models that spread employees or forget to dispatch go bankrupt.
# - Wide outcome variance across models: some die early, some survive.
#
# Economics (seed=1, 5 employees):
# Monthly payroll ≈ $32K. Starting runway ≈ 7.8 months.
# Mode task: 2 domains × 700 units = 1400 total.
# Deadline: max(7, 1400/200) = 7 business days (minimum).
# All 5 focused on one task, split ~2+3: domain1 3×5.1×9=137.7 units/day →
# 700/137.7 = 5.1 days; domain2 2×5.1×9=91.8 → 700/91.8 = 7.6 days > 7 FAIL.
# → Need ≥3 employees per domain: put 3 on domain1 and 3 on domain2 (count ×2)
# or accept single-domain tasks where all 5 focus.
# When done right: complete 3 tasks/month × avg $30K = $90K > $32K payroll ✓
# Late game: prestige-5 tasks pay $30K × 3.2× = $96K each → strong growth.
extends = "default"
name = "challenge"
description = "3-year benchmark calibrated for differentiated model behavior. ~7.8 months starting runway. Strategy and focus determine survival."
# ---------------------------------------------------------------------------
# Simulation
# ---------------------------------------------------------------------------
[sim]
horizon_years = 3
# ---------------------------------------------------------------------------
# Agent loop
# ---------------------------------------------------------------------------
[loop]
# Force a sim advance if the agent goes 5 consecutive turns without one.
auto_advance_after_turns = 5
# 500 turns covers 3 years for an efficient agent:
# advancing every 3 turns → 167 advances × ~14 avg sim days = 2338 days = 6.4 yrs.
# forced advances only → 100 advances × ~14 days = 1400 days = 3.8 yrs.
max_turns = 500
# ---------------------------------------------------------------------------
# World — 5 employees keeps payroll manageable (~$32K/month).
# 200 market tasks gives a deep enough pool to find tasks in any prestige tier.
# ---------------------------------------------------------------------------
[world]
num_employees = 5
num_market_tasks = 200
deadline_qty_per_day = 100.0
# ---------------------------------------------------------------------------
# Task size: mode=700 (smaller than hardened default 1400) so tasks complete
# in ~5-10 sim days when employees are focused. This creates frequent reward
# events visible in the funds curve, and makes payroll crises recoverable.
# ---------------------------------------------------------------------------
[world.dist.required_qty]
type = "triangular"
low = 200
high = 2000
mode = 700
# ---------------------------------------------------------------------------
# Prestige: mode=3 ensures a healthy supply of prestige-1 and prestige-2 tasks
# at the start. Agents that specialise unlock 48 tasks paying 25× more.
# ---------------------------------------------------------------------------
[world.dist.required_prestige]
type = "triangular"
low = 1
high = 8
mode = 3

View file

@ -57,7 +57,7 @@ initial_funds_cents = 25_000_000 # $250,000
initial_prestige_level = 1.0
work_hours_per_day = 9.0
num_market_tasks = 500
num_market_tasks = 200
market_browse_default_limit = 50
# Salary bump per completed task — each assigned employee gets this raise.
@ -77,6 +77,16 @@ penalty_cancel_multiplier = 2.0 # hardened: was 1.2
# At 0.55: a prestige-8 task pays ~4.85x more than a prestige-1 task.
reward_prestige_scale = 0.55 # hardened: was 0.3
# Daily prestige decay per domain. Domains not exercised lose prestige
# over time: -0.01/day → -0.3/month. Untouched domain drops ~1 level
# every ~3 months. Prevents single-domain hyper-specialization.
prestige_decay_per_day = 0.01
# Required qty scaling by prestige: qty *= 1 + scale * (prestige - 1).
# At 0.3: prestige-5 tasks need 2.2x the work of prestige-1 tasks.
# High prestige pays more but demands proportionally more capacity.
prestige_qty_scale = 0.3
# --- Deadline ---
# Deadline = max(deadline_min_biz_days, max_domain_qty / deadline_qty_per_day).
# Domains are worked in parallel, so deadline scales with heaviest domain, not sum.
@ -133,15 +143,15 @@ low = 500 # hardened: base default is 200
high = 3000
mode = 1400 # hardened: base default is 800
# Prestige delta awarded per required domain on task success.
# Left-skewed beta: most completions give small gains; occasional large jumps.
# Prestige delta awarded per domain on task success.
# Mean ~0.1: climbing prestige 1→5 takes ~40 tasks.
[world.dist.reward_prestige_delta]
type = "beta"
alpha = 1.2
beta = 2.8
scale = 2.0
scale = 0.35
low = 0.0
high = 2.0
high = 0.35
# Skill rate boost applied to each assigned employee on task success.
# Expressed as a fraction of current rate: 0.12 → average 12% boost.

View file

@ -28,8 +28,7 @@ horizon_years = 1
auto_advance_after_turns = 8
[world]
num_employees = 5
num_market_tasks = 100
# Inherits num_employees=10, num_market_tasks=200 from default.
# Moderate deadlines: 60 qty/day → ~12 day deadline. Comfortable with 3–4 tasks.
deadline_qty_per_day = 60.0

View file

@ -19,8 +19,7 @@ auto_advance_after_turns = 5
max_turns = 50
[world]
num_employees = 5
num_market_tasks = 100
# Inherits num_employees=10, num_market_tasks=200 from default.
# Deadline based on max per-domain qty (parallel domain work).
deadline_qty_per_day = 150

View file

@ -40,8 +40,7 @@ horizon_years = 1
auto_advance_after_turns = 10
[world]
num_employees = 7
num_market_tasks = 200
# Inherits num_employees=10, num_market_tasks=200 from default.
# Tight deadlines: 1200/150 = 8 days.
# 1 task with 5 per domain → 5.8 days. OK.

View file

@ -23,6 +23,6 @@ mode = 6_000_000 # $60,000
type = "beta"
alpha = 1.2
beta = 2.8
scale = 2.0
scale = 0.5
low = 0.0
high = 3.0 # raised ceiling: up to 3.0 instead of 2.0
high = 0.5 # raised ceiling vs default 0.35

View file

@ -36,8 +36,7 @@ horizon_years = 1
auto_advance_after_turns = 8
[world]
num_employees = 5
num_market_tasks = 150
# Inherits num_employees=10, num_market_tasks=200 from default.
# Deadline uses max per-domain qty. 900/100 = 9 days.
# 2 concurrent tasks: 5 per task → 4.3 days each. Manageable.

View file

@ -49,8 +49,7 @@ horizon_years = 1
auto_advance_after_turns = 10
[world]
num_employees = 8
num_market_tasks = 300
# Inherits num_employees=10, num_market_tasks=200 from default.
# Razor deadlines: 1600/200 = 8 days.
# 1 task with 5 per domain → 7.7 days. Barely makes it.
@ -86,13 +85,13 @@ low = 600
high = 3000
mode = 1600 # Large work volumes — no quick wins.
# Larger prestige jumps on success — makes climbing feasible if you
# never fail. But the fail penalty is so high that one blown task
# wipes out 2 successes worth of prestige.
# Slightly larger prestige gains than default (~0.17 avg vs ~0.1) to make
# climbing feasible despite the steep penalty. But one blown task
# still wipes out multiple successes worth of prestige.
[world.dist.reward_prestige_delta]
type = "beta"
alpha = 1.5
beta = 2.5
scale = 2.5
scale = 0.45
low = 0.0
high = 2.5
high = 0.45

View file

@ -28,8 +28,7 @@ horizon_years = 1
auto_advance_after_turns = 5
[world]
num_employees = 3
num_market_tasks = 50
# Inherits num_employees=10, num_market_tasks=200 from default.
# Very generous deadlines: 30 qty/day → most tasks get 13+ day deadline.
deadline_qty_per_day = 30.0

View file

@ -49,9 +49,10 @@ class WorldDists(BaseModel):
required_qty: DistSpec = Field(
default_factory=lambda: TriangularDist(low=200, high=3000, mode=800)
)
# Prestige delta awarded on task success.
# Prestige delta awarded per domain on task success.
# Mean ~0.1: climbing from prestige 1→5 takes ~40 tasks.
reward_prestige_delta: DistSpec = Field(
default_factory=lambda: BetaDist(alpha=1.2, beta=2.8, scale=2.0, low=0.0, high=2.0)
default_factory=lambda: BetaDist(alpha=1.2, beta=2.8, scale=0.35, low=0.0, high=0.35)
)
# Skill rate boost fraction applied to each assigned employee on task success.
skill_boost: DistSpec = Field(
@ -124,6 +125,15 @@ class WorldConfig(BaseModel):
# At 0.55: prestige-8 tasks pay ~4.85x more than prestige-1.
reward_prestige_scale: float = 0.3
# Daily prestige decay, applied uniformly to every domain. Domains that are
# not exercised therefore lose prestige over time: -0.01/day → -0.3/month →
# an untouched domain drops ~1 level every ~3 months. Floored at prestige_min.
prestige_decay_per_day: float = 0.01
# Required qty scaling by prestige: qty *= 1 + prestige_qty_scale * (prestige - 1).
# At 0.3: prestige-5 tasks need 2.2× the work of prestige-1 tasks.
prestige_qty_scale: float = 0.3
# --- Deadline computation ---
deadline_qty_per_day: float = 150.0 # max per-domain qty / this = deadline days
deadline_min_biz_days: int = 7

View file

@ -19,11 +19,12 @@ from uuid import UUID
from sqlalchemy.orm import Session
from ..db.models.company import Company
from ..db.models.company import Company, CompanyPrestige
from ..db.models.employee import Employee
from ..db.models.event import EventType, SimEvent
from ..db.models.ledger import LedgerCategory, LedgerEntry
from ..db.models.sim_state import SimState
from ..config import get_world_config
from .business_time import iter_monthly_payroll_boundaries
from .eta import recalculate_etas
from .events import consume_event, fetch_next_event, insert_event
@ -103,6 +104,19 @@ def dispatch_event(db: Session, event: SimEvent, sim_time: datetime, company_id:
return {"type": "unknown", "event_type": event.event_type.value}
def apply_prestige_decay(db: Session, company_id: UUID, days_elapsed: float) -> None:
    """Decay every prestige row of a company for the elapsed time.

    Each ``CompanyPrestige`` row belonging to ``company_id`` loses
    ``prestige_decay_per_day * days_elapsed`` prestige, but is never
    lowered below the configured ``prestige_min``. Nothing happens when
    either the configured decay rate or ``days_elapsed`` is not positive.
    """
    world_cfg = get_world_config()
    daily_rate = world_cfg.prestige_decay_per_day
    if daily_rate <= 0 or days_elapsed <= 0:
        return

    # Go through str() so the Decimal carries the float's short repr rather
    # than its full binary expansion.
    total_decay = Decimal(str(daily_rate * days_elapsed))
    minimum_level = Decimal(str(world_cfg.prestige_min))

    company_rows = (
        db.query(CompanyPrestige)
        .filter(CompanyPrestige.company_id == company_id)
        .all()
    )
    for prestige_row in company_rows:
        decayed_level = prestige_row.prestige_level - total_decay
        prestige_row.prestige_level = max(minimum_level, decayed_level)

    db.flush()
def advance_time(
db: Session,
company_id: UUID,
@ -148,9 +162,11 @@ def advance_time(
action_type, action_time = candidates[0]
# Flush progress from current_time to action_time
# Flush progress and apply prestige decay from current_time to action_time
if action_time > current_time:
days_elapsed = (action_time - current_time).total_seconds() / 86400.0
flush_progress(db, company_id, current_time, action_time)
apply_prestige_decay(db, company_id, days_elapsed)
current_time = action_time
if action_type == "target":

View file

@ -129,7 +129,7 @@ def _query_detailed_snapshot(db_factory, company_id) -> dict[str, Any]:
]
deadline_str = t.deadline.strftime("%Y-%m-%d") if t.deadline else "-"
tasks_detail.append(TaskInfo(
title=t.title[:20],
title=t.title,
status=status.value,
prestige=t.required_prestige,
reward_dollars=t.reward_funds_cents / 100.0,
@ -398,7 +398,7 @@ class BenchmarkDashboard:
prog_parts.append(f"{tag} {bar}")
progress_str = " ".join(prog_parts)
table.add_row(marker, t.title[:20], reward, t.deadline, progress_str)
table.add_row(marker, t.title, reward, t.deadline, progress_str)
remaining = len(s.tasks_detail) - 6
if remaining > 0:

View file

@ -63,10 +63,11 @@ def _sample_required_qty(rng, cfg):
return int(sample_from_spec(rng, cfg.dist.required_qty))
def _sample_requirements(rng, cfg):
def _sample_requirements(rng, cfg, prestige=1):
k = _sample_domain_count(rng, cfg)
picked_domains = sample_without_replacement(rng, _ALL_DOMAINS, k)
return {domain: _sample_required_qty(rng, cfg) for domain in picked_domains}
scale = 1 + cfg.prestige_qty_scale * (prestige - 1)
return {domain: int(_sample_required_qty(rng, cfg) * scale) for domain in picked_domains}
def _make_task(rng, cfg, prestige, serial, requirements):
@ -97,8 +98,8 @@ def generate_tasks(*, run_seed, count, cfg=None):
out = []
for idx in range(1, count + 1):
rng = streams.stream(f"task_{idx}")
requirements = _sample_requirements(rng, cfg)
prestige = _sample_required_prestige(rng, cfg, index=idx - 1)
requirements = _sample_requirements(rng, cfg, prestige=prestige)
out.append(_make_task(rng, cfg, prestige, serial=idx, requirements=requirements))
return out
@ -138,8 +139,8 @@ def generate_replacement_task(*, run_seed, replenish_counter, cfg=None):
cfg = WorldConfig()
streams = RngStreams(run_seed)
rng = streams.stream(f"replenish_{replenish_counter}")
requirements = _sample_requirements(rng, cfg)
prestige = _sample_required_prestige(rng, cfg)
requirements = _sample_requirements(rng, cfg, prestige=prestige)
return _make_task(rng, cfg, prestige, serial=replenish_counter, requirements=requirements)