calibrating + bug fix tool_choice="auto" for 5.4 mini/nano

This commit is contained in:
alckasoc 2026-03-20 16:27:30 -07:00
parent d829b07e60
commit f76f5be652
4 changed files with 86 additions and 70 deletions

View file

@@ -1,11 +1,12 @@
#!/bin/bash
# Run all models on medium config across seeds 1-3.
# Usage: bash run_all_models.sh [--seed 1] [--config medium]
set -e
# Run models on medium config across seeds — PARALLEL across models.
# Usage:
# bash run_all_models.sh # all models, seeds 1-2
# bash run_all_models.sh --seed "1" # single seed
# bash run_all_models.sh --config medium # custom config
CONFIG="${CONFIG:-medium}"
SEEDS="${SEEDS:-1 2 3}"
SEEDS="1 2"
# Parse optional args
while [[ $# -gt 0 ]]; do
@@ -16,59 +17,71 @@ while [[ $# -gt 0 ]]; do
esac
done
# Direct API models
DIRECT_MODELS=(
# Models to run (direct API)
MODELS=(
"openai/gpt-5.4"
"openai/gpt-5.4-mini"
"openai/gpt-5.4-nano"
"gemini/gemini-3.1-pro-preview"
"gemini/gemini-3-flash-preview"
"anthropic/claude-opus-4-6"
"anthropic/claude-sonnet-4-6"
)
# OpenRouter models
OPENROUTER_MODELS=(
"openrouter/qwen/qwen3.5-397b-a17b"
"openrouter/minimax/minimax-m2.7"
"openrouter/deepseek/deepseek-v3.2"
"openrouter/z-ai/glm-5"
"openrouter/moonshotai/kimi-k2.5"
"openrouter/x-ai/grok-4.20-beta"
)
ALL_MODELS=("${DIRECT_MODELS[@]}" "${OPENROUTER_MODELS[@]}")
echo "=== YC-Bench Full Run ==="
echo "=== YC-Bench Experiment Run (PARALLEL) ==="
echo "Config: $CONFIG"
echo "Seeds: $SEEDS"
echo "Models: ${#ALL_MODELS[@]}"
echo "Models: ${#MODELS[@]}"
echo ""
# Run all LLM models
for model in "${ALL_MODELS[@]}"; do
for seed in $SEEDS; do
# Derive DB name from model string (replace / with _)
db_name=$(echo "$model" | tr '/' '_')
db_path="db/${CONFIG}_${seed}_${db_name}.db"
mkdir -p db results plots
# Skip if result already exists
PIDS=()
LABELS=()
for model in "${MODELS[@]}"; do
for seed in $SEEDS; do
db_name=$(echo "$model" | tr '/' '_')
result_file="results/yc_bench_result_${CONFIG}_${seed}_${db_name}.json"
if [[ -f "$result_file" ]]; then
echo " SKIP $model seed=$seed (result exists)"
continue
fi
echo " RUN $model | $CONFIG seed=$seed"
echo " LAUNCH $model | $CONFIG seed=$seed"
db_path="db/${CONFIG}_${seed}_${db_name}.db"
rm -f "$db_path"
uv run yc-bench run \
--model "$model" \
--seed "$seed" \
--config "$CONFIG" \
--no-live \
2>&1 | tail -3
echo ""
> "logs/${db_name}_seed${seed}.log" 2>&1 &
PIDS+=($!)
LABELS+=("$model seed=$seed")
done
done
echo "=== All runs complete ==="
echo ""
echo "Launched ${#PIDS[@]} runs in parallel. Waiting..."
echo ""
# Wait for all and report
FAILED=0
for i in "${!PIDS[@]}"; do
wait "${PIDS[$i]}"
EXIT_CODE=$?
if [[ $EXIT_CODE -eq 0 ]]; then
echo " DONE ${LABELS[$i]}"
else
echo " FAIL ${LABELS[$i]} (exit $EXIT_CODE)"
FAILED=$((FAILED + 1))
fi
done
echo ""
echo "=== Complete: $((${#PIDS[@]} - FAILED)) succeeded, $FAILED failed ==="
echo ""
echo "Plot results with:"
echo " uv run python scripts/plot_run.py results/yc_bench_result_${CONFIG}_*.json"

View file

@@ -17,6 +17,7 @@ from ..tools.run_command_schema import normalize_result
logger = logging.getLogger(__name__)
litellm.suppress_debug_info = True
litellm.drop_params = True # silently drop unsupported params (e.g. tool_choice for mini/nano models)
# Tool schema passed to the LLM on every call
_RUN_COMMAND_TOOL = {

View file

@@ -86,15 +86,23 @@ def task_accept(
f"does not meet task requirement ({task.required_trust})."
)
# Apply trust work reduction at accept time (no reward multiplier —
# faster completion from trust already increases revenue via throughput).
# Check if this client is a RAT (loyalty < -0.3)
_cfg = _get_world_cfg()
is_rat = False
if task.client_id is not None:
client_row = db.query(Client).filter(Client.id == task.client_id).one_or_none()
if client_row and client_row.loyalty < -0.3:
is_rat = True
# Apply trust work reduction (only for non-RAT clients —
# RATs don't honor trust, they scope-creep regardless).
if not is_rat and task.client_id is not None:
work_reduction = _cfg.trust_work_reduction_max * (trust_level / _cfg.trust_max)
for r in reqs:
r.required_qty = int(float(r.required_qty) * (1 - work_reduction))
reduced = int(float(r.required_qty) * (1 - work_reduction))
r.required_qty = max(200, reduced) # respect DB constraint
# Compute deadline from advertised qty BEFORE scope creep
# Compute deadline from current qty (after trust reduction, before scope creep)
max_domain_qty = max(float(r.required_qty) for r in reqs)
accepted_at = sim_state.sim_time
deadline = _compute_deadline(accepted_at, max_domain_qty)
@@ -103,22 +111,13 @@ def task_accept(
task.advertised_reward_cents = task.reward_funds_cents
# Scope creep: RAT clients inflate required_qty after accept.
# Minimum inflation ensures ALL RAT tasks exceed deadline (which was
# computed from pre-creep qty). The agent can't tell from the deadline
# alone — the trap only springs after accept.
if task.client_id is not None:
client_row = db.query(Client).filter(Client.id == task.client_id).one_or_none()
if client_row and client_row.loyalty < -0.3:
intensity = abs(client_row.loyalty)
inflation = _cfg.scope_creep_max * intensity
# Ensure enough inflation to bust the deadline:
# deadline_hours = deadline_min_biz_days * work_hours
# need inflated_qty / effective_rate > deadline_hours
# Conservative: at least 130% inflation so even small tasks fail
inflation = max(1.3, inflation)
for r in reqs:
inflated = float(r.required_qty) * (1 + inflation)
r.required_qty = int(min(25000, max(200, inflated)))
if is_rat:
intensity = abs(client_row.loyalty)
inflation = _cfg.scope_creep_max * intensity
inflation = max(2.0, inflation)
for r in reqs:
inflated = float(r.required_qty) * (1 + inflation)
r.required_qty = int(min(25000, max(200, inflated)))
# Transition task
task.status = TaskStatus.PLANNED

View file

@@ -1,19 +1,22 @@
# medium — Can the agent detect and avoid adversarial clients?
# medium — Can the agent manage employees efficiently and navigate adversarial clients?
#
# The environment is survivable for any competent agent. The differentiator
# is the client trust/loyalty system: ~30% of clients are hidden RATs that
# silently inflate work (scope creep) and claw back payments (disputes).
# Primary differentiator: PAYROLL GROWTH from employee assignment.
# Assign-all (8 emp) grows payroll ~2.7× faster than selective (3 emp).
# Over 12 months this creates a $100K+ gap in profitability.
#
# RAT tasks look identical to normal tasks. The only way to detect them is
# by checking `client history` and noticing listed vs received discrepancies.
# Secondary differentiator: RAT CLIENTS (~35% of clients).
# RAT tasks have scope creep that causes deadline failures.
# RATs offer top-tier rewards to attract the greedy bot.
# Smart agents can detect RATs via `client history` after initial failures.
#
# The greedy bot can't detect RATs — it picks tasks blindly and bleeds money.
# A smart LLM that checks history and avoids RATs should clearly outperform.
# Tertiary differentiator: TRUST BUILDING.
# Focusing on fewer clients builds trust → work reduction → faster tasks.
# Spreading across many clients erodes trust via cross-client decay.
extends = "default"
name = "medium"
description = "1-year medium. Tests client trust awareness — can the agent detect adversarial clients?"
description = "1-year medium. Tests employee efficiency, client trust, and adversarial client detection."
[sim]
horizon_years = 1
@@ -29,26 +32,26 @@ num_clients = 6
deadline_qty_per_day = 150.0
deadline_min_biz_days = 7
penalty_fail_multiplier = 1.0
penalty_fail_funds_pct = 0.25 # failing costs 25% of advertised reward
penalty_fail_funds_pct = 0.35 # failing costs 35% of advertised reward
penalty_cancel_multiplier = 1.5
# Salary bumps: each completed task raises salary for ALL assigned employees.
# Assign-all (8 bumps/task) compounds payroll fast → unsustainable.
# Selective assignment (3-4/task) grows slower → survivable for smart agents.
# Assign-all (8 bumps/task) grows payroll ~2.7× faster than selective (3/task).
# This is the PRIMARY differentiator between bot and smart agents.
salary_bump_pct = 0.01
reward_prestige_scale = 0.30
prestige_decay_per_day = 0.0
# --- Client trust ---
trust_build_rate = 5.0
trust_build_rate = 5.0 # ~5 tasks to reach significant trust
trust_fragility = 0.3
trust_focus_pressure = 0.3
trust_reward_ceiling = 2.6
trust_work_reduction_max = 0.50
trust_work_reduction_max = 0.50 # 50% less work at max trust
trust_gating_fraction = 0.30
trust_gated_reward_boost = 0.50 # trust-4 task pays 3× base (was 1.6× at 0.15)
trust_gated_reward_boost = 0.0 # no reward distortion
# --- Client loyalty ---
loyalty_rat_fraction = 0.20
loyalty_rat_fraction = 0.35
loyalty_severity = 1.0
loyalty_reveal_trust = 0.0