diff --git a/run_all_models.sh b/run_all_models.sh index 1668549..90357f6 100755 --- a/run_all_models.sh +++ b/run_all_models.sh @@ -1,11 +1,12 @@ #!/bin/bash -# Run all models on medium config across seeds 1-3. -# Usage: bash run_all_models.sh [--seed 1] [--config medium] - -set -e +# Run models on medium config across seeds — PARALLEL across models. +# Usage: +# bash run_all_models.sh # all models, seeds 1-2 +# bash run_all_models.sh --seed "1" # single seed +# bash run_all_models.sh --config medium # custom config CONFIG="${CONFIG:-medium}" -SEEDS="${SEEDS:-1 2 3}" +SEEDS="1 2" # Parse optional args while [[ $# -gt 0 ]]; do @@ -16,59 +17,71 @@ while [[ $# -gt 0 ]]; do esac done -# Direct API models -DIRECT_MODELS=( +# Models to run (direct API) +MODELS=( "openai/gpt-5.4" "openai/gpt-5.4-mini" "openai/gpt-5.4-nano" "gemini/gemini-3.1-pro-preview" "gemini/gemini-3-flash-preview" - "anthropic/claude-opus-4-6" - "anthropic/claude-sonnet-4-6" ) -# OpenRouter models -OPENROUTER_MODELS=( - "openrouter/qwen/qwen3.5-397b-a17b" - "openrouter/minimax/minimax-m2.7" - "openrouter/deepseek/deepseek-v3.2" - "openrouter/z-ai/glm-5" - "openrouter/moonshotai/kimi-k2.5" - "openrouter/x-ai/grok-4.20-beta" -) - -ALL_MODELS=("${DIRECT_MODELS[@]}" "${OPENROUTER_MODELS[@]}") - -echo "=== YC-Bench Full Run ===" +echo "=== YC-Bench Experiment Run (PARALLEL) ===" echo "Config: $CONFIG" echo "Seeds: $SEEDS" -echo "Models: ${#ALL_MODELS[@]}" +echo "Models: ${#MODELS[@]}" echo "" -# Run all LLM models -for model in "${ALL_MODELS[@]}"; do - for seed in $SEEDS; do - # Derive DB name from model string (replace / with _) - db_name=$(echo "$model" | tr '/' '_') - db_path="db/${CONFIG}_${seed}_${db_name}.db" +mkdir -p db results plots logs - # Skip if result already exists +PIDS=() +LABELS=() + +for model in "${MODELS[@]}"; do + for seed in $SEEDS; do + db_name=$(echo "$model" | tr '/' '_') result_file="results/yc_bench_result_${CONFIG}_${seed}_${db_name}.json" + if [[ -f "$result_file" ]]; then echo " 
SKIP $model seed=$seed (result exists)" continue fi - echo " RUN $model | $CONFIG seed=$seed" + echo " LAUNCH $model | $CONFIG seed=$seed" + db_path="db/${CONFIG}_${seed}_${db_name}.db" rm -f "$db_path" + uv run yc-bench run \ --model "$model" \ --seed "$seed" \ --config "$CONFIG" \ --no-live \ - 2>&1 | tail -3 - echo "" + > "logs/${db_name}_seed${seed}.log" 2>&1 & + + PIDS+=($!) + LABELS+=("$model seed=$seed") done done -echo "=== All runs complete ===" +echo "" +echo "Launched ${#PIDS[@]} runs in parallel. Waiting..." +echo "" + +# Wait for all and report +FAILED=0 +for i in "${!PIDS[@]}"; do + wait "${PIDS[$i]}" + EXIT_CODE=$? + if [[ $EXIT_CODE -eq 0 ]]; then + echo " DONE ${LABELS[$i]}" + else + echo " FAIL ${LABELS[$i]} (exit $EXIT_CODE)" + FAILED=$((FAILED + 1)) + fi +done + +echo "" +echo "=== Complete: $((${#PIDS[@]} - FAILED)) succeeded, $FAILED failed ===" +echo "" +echo "Plot results with:" +echo " uv run python scripts/plot_run.py results/yc_bench_result_${CONFIG}_*.json" diff --git a/src/yc_bench/agent/runtime/litellm_runtime.py b/src/yc_bench/agent/runtime/litellm_runtime.py index 72e43d7..dced6ed 100644 --- a/src/yc_bench/agent/runtime/litellm_runtime.py +++ b/src/yc_bench/agent/runtime/litellm_runtime.py @@ -17,6 +17,7 @@ from ..tools.run_command_schema import normalize_result logger = logging.getLogger(__name__) litellm.suppress_debug_info = True +litellm.drop_params = True # silently drop unsupported params (e.g. tool_choice for mini/nano models) # Tool schema passed to the LLM on every call _RUN_COMMAND_TOOL = { diff --git a/src/yc_bench/cli/task_commands.py b/src/yc_bench/cli/task_commands.py index 5b715a3..d241b3a 100644 --- a/src/yc_bench/cli/task_commands.py +++ b/src/yc_bench/cli/task_commands.py @@ -86,15 +86,23 @@ def task_accept( f"does not meet task requirement ({task.required_trust})." ) - # Apply trust work reduction at accept time (no reward multiplier — - # faster completion from trust already increases revenue via throughput). 
+ # Check if this client is a RAT (loyalty < -0.3) _cfg = _get_world_cfg() + is_rat = False if task.client_id is not None: + client_row = db.query(Client).filter(Client.id == task.client_id).one_or_none() + if client_row and client_row.loyalty < -0.3: + is_rat = True + + # Apply trust work reduction (only for non-RAT clients — + # RATs don't honor trust, they scope-creep regardless). + if not is_rat and task.client_id is not None: work_reduction = _cfg.trust_work_reduction_max * (trust_level / _cfg.trust_max) for r in reqs: - r.required_qty = int(float(r.required_qty) * (1 - work_reduction)) + reduced = int(float(r.required_qty) * (1 - work_reduction)) + r.required_qty = max(200, reduced) # respect DB constraint - # Compute deadline from advertised qty BEFORE scope creep + # Compute deadline from current qty (after trust reduction, before scope creep) max_domain_qty = max(float(r.required_qty) for r in reqs) accepted_at = sim_state.sim_time deadline = _compute_deadline(accepted_at, max_domain_qty) @@ -103,22 +111,13 @@ def task_accept( task.advertised_reward_cents = task.reward_funds_cents # Scope creep: RAT clients inflate required_qty after accept. - # Minimum inflation ensures ALL RAT tasks exceed deadline (which was - # computed from pre-creep qty). The agent can't tell from the deadline - # alone — the trap only springs after accept. 
- if task.client_id is not None: - client_row = db.query(Client).filter(Client.id == task.client_id).one_or_none() - if client_row and client_row.loyalty < -0.3: - intensity = abs(client_row.loyalty) - inflation = _cfg.scope_creep_max * intensity - # Ensure enough inflation to bust the deadline: - # deadline_hours = deadline_min_biz_days * work_hours - # need inflated_qty / effective_rate > deadline_hours - # Conservative: at least 130% inflation so even small tasks fail - inflation = max(1.3, inflation) - for r in reqs: - inflated = float(r.required_qty) * (1 + inflation) - r.required_qty = int(min(25000, max(200, inflated))) + if is_rat: + intensity = abs(client_row.loyalty) + inflation = _cfg.scope_creep_max * intensity + inflation = max(2.0, inflation) + for r in reqs: + inflated = float(r.required_qty) * (1 + inflation) + r.required_qty = int(min(25000, max(200, inflated))) # Transition task task.status = TaskStatus.PLANNED diff --git a/src/yc_bench/config/presets/medium.toml b/src/yc_bench/config/presets/medium.toml index be7383a..f018a71 100644 --- a/src/yc_bench/config/presets/medium.toml +++ b/src/yc_bench/config/presets/medium.toml @@ -1,19 +1,22 @@ -# medium — Can the agent detect and avoid adversarial clients? +# medium — Can the agent manage employees efficiently and navigate adversarial clients? # -# The environment is survivable for any competent agent. The differentiator -# is the client trust/loyalty system: ~30% of clients are hidden RATs that -# silently inflate work (scope creep) and claw back payments (disputes). +# Primary differentiator: PAYROLL GROWTH from employee assignment. +# Assign-all (8 emp) grows payroll ~2.7× faster than selective (3 emp). +# Over 12 months this creates a $100K+ gap in profitability. # -# RAT tasks look identical to normal tasks. The only way to detect them is -# by checking `client history` and noticing listed vs received discrepancies. +# Secondary differentiator: RAT CLIENTS (~35% of clients). 
+# RAT tasks have scope creep that causes deadline failures. +# RATs offer top-tier rewards to attract the greedy bot. +# Smart agents can detect RATs via `client history` after initial failures. # -# The greedy bot can't detect RATs — it picks tasks blindly and bleeds money. -# A smart LLM that checks history and avoids RATs should clearly outperform. +# Tertiary differentiator: TRUST BUILDING. +# Focusing on fewer clients builds trust → work reduction → faster tasks. +# Spreading across many clients erodes trust via cross-client decay. extends = "default" name = "medium" -description = "1-year medium. Tests client trust awareness — can the agent detect adversarial clients?" +description = "1-year medium. Tests employee efficiency, client trust, and adversarial client detection." [sim] horizon_years = 1 @@ -29,26 +32,26 @@ num_clients = 6 deadline_qty_per_day = 150.0 deadline_min_biz_days = 7 penalty_fail_multiplier = 1.0 -penalty_fail_funds_pct = 0.25 # failing costs 25% of advertised reward +penalty_fail_funds_pct = 0.35 # failing costs 35% of advertised reward penalty_cancel_multiplier = 1.5 # Salary bumps: each completed task raises salary for ALL assigned employees. -# Assign-all (8 bumps/task) compounds payroll fast → unsustainable. -# Selective assignment (3-4/task) grows slower → survivable for smart agents. +# Assign-all (8 bumps/task) grows payroll ~2.7× faster than selective (3/task). +# This is the PRIMARY differentiator between bot and smart agents. 
salary_bump_pct = 0.01 reward_prestige_scale = 0.30 prestige_decay_per_day = 0.0 # --- Client trust --- -trust_build_rate = 5.0 +trust_build_rate = 5.0 # ~5 tasks to reach significant trust trust_fragility = 0.3 trust_focus_pressure = 0.3 trust_reward_ceiling = 2.6 -trust_work_reduction_max = 0.50 +trust_work_reduction_max = 0.50 # 50% less work at max trust trust_gating_fraction = 0.30 -trust_gated_reward_boost = 0.50 # trust-4 task pays 3× base (was 1.6× at 0.15) +trust_gated_reward_boost = 0.0 # no reward distortion # --- Client loyalty --- -loyalty_rat_fraction = 0.20 +loyalty_rat_fraction = 0.35 loyalty_severity = 1.0 loyalty_reveal_trust = 0.0