calibrating + bug fix tool_choice="auto" for 5.4 mini/nano

This commit is contained in:
alckasoc 2026-03-20 16:27:30 -07:00
parent d829b07e60
commit f76f5be652
4 changed files with 86 additions and 70 deletions

View file

@@ -1,11 +1,12 @@
#!/bin/bash
# Run all models on medium config across seeds 1-3.
# Usage: bash run_all_models.sh [--seed 1] [--config medium]
set -e
# Run models on medium config across seeds — PARALLEL across models.
# Usage:
# bash run_all_models.sh # all models, seeds 1-2
# bash run_all_models.sh --seed "1" # single seed
# bash run_all_models.sh --config medium # custom config
CONFIG="${CONFIG:-medium}"
SEEDS="${SEEDS:-1 2 3}"
SEEDS="1 2"
# Parse optional args
while [[ $# -gt 0 ]]; do
@@ -16,59 +17,71 @@ while [[ $# -gt 0 ]]; do
esac
done
# Direct API models
DIRECT_MODELS=(
# Models to run (direct API)
MODELS=(
"openai/gpt-5.4"
"openai/gpt-5.4-mini"
"openai/gpt-5.4-nano"
"gemini/gemini-3.1-pro-preview"
"gemini/gemini-3-flash-preview"
"anthropic/claude-opus-4-6"
"anthropic/claude-sonnet-4-6"
)
# OpenRouter models
OPENROUTER_MODELS=(
"openrouter/qwen/qwen3.5-397b-a17b"
"openrouter/minimax/minimax-m2.7"
"openrouter/deepseek/deepseek-v3.2"
"openrouter/z-ai/glm-5"
"openrouter/moonshotai/kimi-k2.5"
"openrouter/x-ai/grok-4.20-beta"
)
ALL_MODELS=("${DIRECT_MODELS[@]}" "${OPENROUTER_MODELS[@]}")
echo "=== YC-Bench Full Run ==="
echo "=== YC-Bench Experiment Run (PARALLEL) ==="
echo "Config: $CONFIG"
echo "Seeds: $SEEDS"
echo "Models: ${#ALL_MODELS[@]}"
echo "Models: ${#MODELS[@]}"
echo ""
# Run all LLM models
for model in "${ALL_MODELS[@]}"; do
for seed in $SEEDS; do
# Derive DB name from model string (replace / with _)
db_name=$(echo "$model" | tr '/' '_')
db_path="db/${CONFIG}_${seed}_${db_name}.db"
mkdir -p db results plots
# Skip if result already exists
PIDS=()
LABELS=()
for model in "${MODELS[@]}"; do
for seed in $SEEDS; do
db_name=$(echo "$model" | tr '/' '_')
result_file="results/yc_bench_result_${CONFIG}_${seed}_${db_name}.json"
if [[ -f "$result_file" ]]; then
echo " SKIP $model seed=$seed (result exists)"
continue
fi
echo " RUN $model | $CONFIG seed=$seed"
echo " LAUNCH $model | $CONFIG seed=$seed"
db_path="db/${CONFIG}_${seed}_${db_name}.db"
rm -f "$db_path"
uv run yc-bench run \
--model "$model" \
--seed "$seed" \
--config "$CONFIG" \
--no-live \
2>&1 | tail -3
echo ""
> "logs/${db_name}_seed${seed}.log" 2>&1 &
PIDS+=($!)
LABELS+=("$model seed=$seed")
done
done
echo "=== All runs complete ==="
echo ""
echo "Launched ${#PIDS[@]} runs in parallel. Waiting..."
echo ""
# Wait for all and report
FAILED=0
for i in "${!PIDS[@]}"; do
wait "${PIDS[$i]}"
EXIT_CODE=$?
if [[ $EXIT_CODE -eq 0 ]]; then
echo " DONE ${LABELS[$i]}"
else
echo " FAIL ${LABELS[$i]} (exit $EXIT_CODE)"
FAILED=$((FAILED + 1))
fi
done
echo ""
echo "=== Complete: $((${#PIDS[@]} - FAILED)) succeeded, $FAILED failed ==="
echo ""
echo "Plot results with:"
echo " uv run python scripts/plot_run.py results/yc_bench_result_${CONFIG}_*.json"

View file

@@ -17,6 +17,7 @@ from ..tools.run_command_schema import normalize_result
logger = logging.getLogger(__name__)
litellm.suppress_debug_info = True
litellm.drop_params = True # silently drop unsupported params (e.g. tool_choice for mini/nano models)
# Tool schema passed to the LLM on every call
_RUN_COMMAND_TOOL = {

View file

@@ -86,15 +86,23 @@ def task_accept(
f"does not meet task requirement ({task.required_trust})."
)
# Apply trust work reduction at accept time (no reward multiplier —
# faster completion from trust already increases revenue via throughput).
# Check if this client is a RAT (loyalty < -0.3)
_cfg = _get_world_cfg()
is_rat = False
if task.client_id is not None:
client_row = db.query(Client).filter(Client.id == task.client_id).one_or_none()
if client_row and client_row.loyalty < -0.3:
is_rat = True
# Apply trust work reduction (only for non-RAT clients —
# RATs don't honor trust, they scope-creep regardless).
if not is_rat and task.client_id is not None:
work_reduction = _cfg.trust_work_reduction_max * (trust_level / _cfg.trust_max)
for r in reqs:
r.required_qty = int(float(r.required_qty) * (1 - work_reduction))
reduced = int(float(r.required_qty) * (1 - work_reduction))
r.required_qty = max(200, reduced) # respect DB constraint
# Compute deadline from advertised qty BEFORE scope creep
# Compute deadline from current qty (after trust reduction, before scope creep)
max_domain_qty = max(float(r.required_qty) for r in reqs)
accepted_at = sim_state.sim_time
deadline = _compute_deadline(accepted_at, max_domain_qty)
@@ -103,22 +111,13 @@ def task_accept(
task.advertised_reward_cents = task.reward_funds_cents
# Scope creep: RAT clients inflate required_qty after accept.
# Minimum inflation ensures ALL RAT tasks exceed deadline (which was
# computed from pre-creep qty). The agent can't tell from the deadline
# alone — the trap only springs after accept.
if task.client_id is not None:
client_row = db.query(Client).filter(Client.id == task.client_id).one_or_none()
if client_row and client_row.loyalty < -0.3:
intensity = abs(client_row.loyalty)
inflation = _cfg.scope_creep_max * intensity
# Ensure enough inflation to bust the deadline:
# deadline_hours = deadline_min_biz_days * work_hours
# need inflated_qty / effective_rate > deadline_hours
# Conservative: at least 130% inflation so even small tasks fail
inflation = max(1.3, inflation)
for r in reqs:
inflated = float(r.required_qty) * (1 + inflation)
r.required_qty = int(min(25000, max(200, inflated)))
if is_rat:
intensity = abs(client_row.loyalty)
inflation = _cfg.scope_creep_max * intensity
inflation = max(2.0, inflation)
for r in reqs:
inflated = float(r.required_qty) * (1 + inflation)
r.required_qty = int(min(25000, max(200, inflated)))
# Transition task
task.status = TaskStatus.PLANNED

View file

@@ -1,19 +1,22 @@
# medium — Can the agent detect and avoid adversarial clients?
# medium — Can the agent manage employees efficiently and navigate adversarial clients?
#
# The environment is survivable for any competent agent. The differentiator
# is the client trust/loyalty system: ~30% of clients are hidden RATs that
# silently inflate work (scope creep) and claw back payments (disputes).
# Primary differentiator: PAYROLL GROWTH from employee assignment.
# Assign-all (8 emp) grows payroll ~2.7× faster than selective (3 emp).
# Over 12 months this creates a $100K+ gap in profitability.
#
# RAT tasks look identical to normal tasks. The only way to detect them is
# by checking `client history` and noticing listed vs received discrepancies.
# Secondary differentiator: RAT CLIENTS (~35% of clients).
# RAT tasks have scope creep that causes deadline failures.
# RATs offer top-tier rewards to attract the greedy bot.
# Smart agents can detect RATs via `client history` after initial failures.
#
# The greedy bot can't detect RATs — it picks tasks blindly and bleeds money.
# A smart LLM that checks history and avoids RATs should clearly outperform.
# Tertiary differentiator: TRUST BUILDING.
# Focusing on fewer clients builds trust → work reduction → faster tasks.
# Spreading across many clients erodes trust via cross-client decay.
extends = "default"
name = "medium"
description = "1-year medium. Tests client trust awareness — can the agent detect adversarial clients?"
description = "1-year medium. Tests employee efficiency, client trust, and adversarial client detection."
[sim]
horizon_years = 1
@@ -29,26 +32,26 @@ num_clients = 6
deadline_qty_per_day = 150.0
deadline_min_biz_days = 7
penalty_fail_multiplier = 1.0
penalty_fail_funds_pct = 0.25 # failing costs 25% of advertised reward
penalty_fail_funds_pct = 0.35 # failing costs 35% of advertised reward
penalty_cancel_multiplier = 1.5
# Salary bumps: each completed task raises salary for ALL assigned employees.
# Assign-all (8 bumps/task) compounds payroll fast → unsustainable.
# Selective assignment (3-4/task) grows slower → survivable for smart agents.
# Assign-all (8 bumps/task) grows payroll ~2.7× faster than selective (3/task).
# This is the PRIMARY differentiator between bot and smart agents.
salary_bump_pct = 0.01
reward_prestige_scale = 0.30
prestige_decay_per_day = 0.0
# --- Client trust ---
trust_build_rate = 5.0
trust_build_rate = 5.0 # ~5 tasks to reach significant trust
trust_fragility = 0.3
trust_focus_pressure = 0.3
trust_reward_ceiling = 2.6
trust_work_reduction_max = 0.50
trust_work_reduction_max = 0.50 # 50% less work at max trust
trust_gating_fraction = 0.30
trust_gated_reward_boost = 0.50 # trust-4 task pays 3× base (was 1.6× at 0.15)
trust_gated_reward_boost = 0.0 # no reward distortion
# --- Client loyalty ---
loyalty_rat_fraction = 0.20
loyalty_rat_fraction = 0.35
loyalty_severity = 1.0
loyalty_reveal_trust = 0.0