mirror of
https://github.com/collinear-ai/yc-bench.git
synced 2026-04-19 12:58:03 +00:00
Calibrate medium config; fix tool_choice="auto" bug for gpt-5.4 mini/nano models
This commit is contained in:
parent
d829b07e60
commit
f76f5be652
4 changed files with 86 additions and 70 deletions
|
|
@ -1,11 +1,12 @@
|
|||
#!/bin/bash
|
||||
# Run all models on medium config across seeds 1-3.
|
||||
# Usage: bash run_all_models.sh [--seed 1] [--config medium]
|
||||
|
||||
set -e
|
||||
# Run models on medium config across seeds — PARALLEL across models.
|
||||
# Usage:
|
||||
# bash run_all_models.sh # all models, seeds 1-2
|
||||
# bash run_all_models.sh --seed "1" # single seed
|
||||
# bash run_all_models.sh --config medium # custom config
|
||||
|
||||
CONFIG="${CONFIG:-medium}"
|
||||
SEEDS="${SEEDS:-1 2 3}"
|
||||
SEEDS="1 2"
|
||||
|
||||
# Parse optional args
|
||||
while [[ $# -gt 0 ]]; do
|
||||
|
|
@ -16,59 +17,71 @@ while [[ $# -gt 0 ]]; do
|
|||
esac
|
||||
done
|
||||
|
||||
# Direct API models
|
||||
DIRECT_MODELS=(
|
||||
# Models to run (direct API)
|
||||
MODELS=(
|
||||
"openai/gpt-5.4"
|
||||
"openai/gpt-5.4-mini"
|
||||
"openai/gpt-5.4-nano"
|
||||
"gemini/gemini-3.1-pro-preview"
|
||||
"gemini/gemini-3-flash-preview"
|
||||
"anthropic/claude-opus-4-6"
|
||||
"anthropic/claude-sonnet-4-6"
|
||||
)
|
||||
|
||||
# OpenRouter models
|
||||
OPENROUTER_MODELS=(
|
||||
"openrouter/qwen/qwen3.5-397b-a17b"
|
||||
"openrouter/minimax/minimax-m2.7"
|
||||
"openrouter/deepseek/deepseek-v3.2"
|
||||
"openrouter/z-ai/glm-5"
|
||||
"openrouter/moonshotai/kimi-k2.5"
|
||||
"openrouter/x-ai/grok-4.20-beta"
|
||||
)
|
||||
|
||||
ALL_MODELS=("${DIRECT_MODELS[@]}" "${OPENROUTER_MODELS[@]}")
|
||||
|
||||
echo "=== YC-Bench Full Run ==="
|
||||
echo "=== YC-Bench Experiment Run (PARALLEL) ==="
|
||||
echo "Config: $CONFIG"
|
||||
echo "Seeds: $SEEDS"
|
||||
echo "Models: ${#ALL_MODELS[@]}"
|
||||
echo "Models: ${#MODELS[@]}"
|
||||
echo ""
|
||||
|
||||
# Run all LLM models
|
||||
for model in "${ALL_MODELS[@]}"; do
|
||||
for seed in $SEEDS; do
|
||||
# Derive DB name from model string (replace / with _)
|
||||
db_name=$(echo "$model" | tr '/' '_')
|
||||
db_path="db/${CONFIG}_${seed}_${db_name}.db"
|
||||
mkdir -p db results plots
|
||||
|
||||
# Skip if result already exists
|
||||
PIDS=()
|
||||
LABELS=()
|
||||
|
||||
for model in "${MODELS[@]}"; do
|
||||
for seed in $SEEDS; do
|
||||
db_name=$(echo "$model" | tr '/' '_')
|
||||
result_file="results/yc_bench_result_${CONFIG}_${seed}_${db_name}.json"
|
||||
|
||||
if [[ -f "$result_file" ]]; then
|
||||
echo " SKIP $model seed=$seed (result exists)"
|
||||
continue
|
||||
fi
|
||||
|
||||
echo " RUN $model | $CONFIG seed=$seed"
|
||||
echo " LAUNCH $model | $CONFIG seed=$seed"
|
||||
db_path="db/${CONFIG}_${seed}_${db_name}.db"
|
||||
rm -f "$db_path"
|
||||
|
||||
uv run yc-bench run \
|
||||
--model "$model" \
|
||||
--seed "$seed" \
|
||||
--config "$CONFIG" \
|
||||
--no-live \
|
||||
2>&1 | tail -3
|
||||
echo ""
|
||||
> "logs/${db_name}_seed${seed}.log" 2>&1 &
|
||||
|
||||
PIDS+=($!)
|
||||
LABELS+=("$model seed=$seed")
|
||||
done
|
||||
done
|
||||
|
||||
echo "=== All runs complete ==="
|
||||
echo ""
|
||||
echo "Launched ${#PIDS[@]} runs in parallel. Waiting..."
|
||||
echo ""
|
||||
|
||||
# Wait for all and report
|
||||
FAILED=0
|
||||
for i in "${!PIDS[@]}"; do
|
||||
wait "${PIDS[$i]}"
|
||||
EXIT_CODE=$?
|
||||
if [[ $EXIT_CODE -eq 0 ]]; then
|
||||
echo " DONE ${LABELS[$i]}"
|
||||
else
|
||||
echo " FAIL ${LABELS[$i]} (exit $EXIT_CODE)"
|
||||
FAILED=$((FAILED + 1))
|
||||
fi
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "=== Complete: $((${#PIDS[@]} - FAILED)) succeeded, $FAILED failed ==="
|
||||
echo ""
|
||||
echo "Plot results with:"
|
||||
echo " uv run python scripts/plot_run.py results/yc_bench_result_${CONFIG}_*.json"
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ from ..tools.run_command_schema import normalize_result
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
litellm.suppress_debug_info = True
|
||||
litellm.drop_params = True # silently drop unsupported params (e.g. tool_choice for mini/nano models)
|
||||
|
||||
# Tool schema passed to the LLM on every call
|
||||
_RUN_COMMAND_TOOL = {
|
||||
|
|
|
|||
|
|
@ -86,15 +86,23 @@ def task_accept(
|
|||
f"does not meet task requirement ({task.required_trust})."
|
||||
)
|
||||
|
||||
# Apply trust work reduction at accept time (no reward multiplier —
|
||||
# faster completion from trust already increases revenue via throughput).
|
||||
# Check if this client is a RAT (loyalty < -0.3)
|
||||
_cfg = _get_world_cfg()
|
||||
is_rat = False
|
||||
if task.client_id is not None:
|
||||
client_row = db.query(Client).filter(Client.id == task.client_id).one_or_none()
|
||||
if client_row and client_row.loyalty < -0.3:
|
||||
is_rat = True
|
||||
|
||||
# Apply trust work reduction (only for non-RAT clients —
|
||||
# RATs don't honor trust, they scope-creep regardless).
|
||||
if not is_rat and task.client_id is not None:
|
||||
work_reduction = _cfg.trust_work_reduction_max * (trust_level / _cfg.trust_max)
|
||||
for r in reqs:
|
||||
r.required_qty = int(float(r.required_qty) * (1 - work_reduction))
|
||||
reduced = int(float(r.required_qty) * (1 - work_reduction))
|
||||
r.required_qty = max(200, reduced) # respect DB constraint
|
||||
|
||||
# Compute deadline from advertised qty BEFORE scope creep
|
||||
# Compute deadline from current qty (after trust reduction, before scope creep)
|
||||
max_domain_qty = max(float(r.required_qty) for r in reqs)
|
||||
accepted_at = sim_state.sim_time
|
||||
deadline = _compute_deadline(accepted_at, max_domain_qty)
|
||||
|
|
@ -103,22 +111,13 @@ def task_accept(
|
|||
task.advertised_reward_cents = task.reward_funds_cents
|
||||
|
||||
# Scope creep: RAT clients inflate required_qty after accept.
|
||||
# Minimum inflation ensures ALL RAT tasks exceed deadline (which was
|
||||
# computed from pre-creep qty). The agent can't tell from the deadline
|
||||
# alone — the trap only springs after accept.
|
||||
if task.client_id is not None:
|
||||
client_row = db.query(Client).filter(Client.id == task.client_id).one_or_none()
|
||||
if client_row and client_row.loyalty < -0.3:
|
||||
intensity = abs(client_row.loyalty)
|
||||
inflation = _cfg.scope_creep_max * intensity
|
||||
# Ensure enough inflation to bust the deadline:
|
||||
# deadline_hours = deadline_min_biz_days * work_hours
|
||||
# need inflated_qty / effective_rate > deadline_hours
|
||||
# Conservative: at least 130% inflation so even small tasks fail
|
||||
inflation = max(1.3, inflation)
|
||||
for r in reqs:
|
||||
inflated = float(r.required_qty) * (1 + inflation)
|
||||
r.required_qty = int(min(25000, max(200, inflated)))
|
||||
if is_rat:
|
||||
intensity = abs(client_row.loyalty)
|
||||
inflation = _cfg.scope_creep_max * intensity
|
||||
inflation = max(2.0, inflation)
|
||||
for r in reqs:
|
||||
inflated = float(r.required_qty) * (1 + inflation)
|
||||
r.required_qty = int(min(25000, max(200, inflated)))
|
||||
|
||||
# Transition task
|
||||
task.status = TaskStatus.PLANNED
|
||||
|
|
|
|||
|
|
@ -1,19 +1,22 @@
|
|||
# medium — Can the agent detect and avoid adversarial clients?
|
||||
# medium — Can the agent manage employees efficiently and navigate adversarial clients?
|
||||
#
|
||||
# The environment is survivable for any competent agent. The differentiator
|
||||
# is the client trust/loyalty system: ~30% of clients are hidden RATs that
|
||||
# silently inflate work (scope creep) and claw back payments (disputes).
|
||||
# Primary differentiator: PAYROLL GROWTH from employee assignment.
|
||||
# Assign-all (8 emp) grows payroll ~2.7× faster than selective (3 emp).
|
||||
# Over 12 months this creates a $100K+ gap in profitability.
|
||||
#
|
||||
# RAT tasks look identical to normal tasks. The only way to detect them is
|
||||
# by checking `client history` and noticing listed vs received discrepancies.
|
||||
# Secondary differentiator: RAT CLIENTS (~35% of clients).
|
||||
# RAT tasks have scope creep that causes deadline failures.
|
||||
# RATs offer top-tier rewards to attract the greedy bot.
|
||||
# Smart agents can detect RATs via `client history` after initial failures.
|
||||
#
|
||||
# The greedy bot can't detect RATs — it picks tasks blindly and bleeds money.
|
||||
# A smart LLM that checks history and avoids RATs should clearly outperform.
|
||||
# Tertiary differentiator: TRUST BUILDING.
|
||||
# Focusing on fewer clients builds trust → work reduction → faster tasks.
|
||||
# Spreading across many clients erodes trust via cross-client decay.
|
||||
|
||||
extends = "default"
|
||||
|
||||
name = "medium"
|
||||
description = "1-year medium. Tests client trust awareness — can the agent detect adversarial clients?"
|
||||
description = "1-year medium. Tests employee efficiency, client trust, and adversarial client detection."
|
||||
|
||||
[sim]
|
||||
horizon_years = 1
|
||||
|
|
@ -29,26 +32,26 @@ num_clients = 6
|
|||
deadline_qty_per_day = 150.0
|
||||
deadline_min_biz_days = 7
|
||||
penalty_fail_multiplier = 1.0
|
||||
penalty_fail_funds_pct = 0.25 # failing costs 25% of advertised reward
|
||||
penalty_fail_funds_pct = 0.35 # failing costs 35% of advertised reward
|
||||
penalty_cancel_multiplier = 1.5
|
||||
# Salary bumps: each completed task raises salary for ALL assigned employees.
|
||||
# Assign-all (8 bumps/task) compounds payroll fast → unsustainable.
|
||||
# Selective assignment (3-4/task) grows slower → survivable for smart agents.
|
||||
# Assign-all (8 bumps/task) grows payroll ~2.7× faster than selective (3/task).
|
||||
# This is the PRIMARY differentiator between bot and smart agents.
|
||||
salary_bump_pct = 0.01
|
||||
reward_prestige_scale = 0.30
|
||||
prestige_decay_per_day = 0.0
|
||||
|
||||
# --- Client trust ---
|
||||
trust_build_rate = 5.0
|
||||
trust_build_rate = 5.0 # ~5 tasks to reach significant trust
|
||||
trust_fragility = 0.3
|
||||
trust_focus_pressure = 0.3
|
||||
trust_reward_ceiling = 2.6
|
||||
trust_work_reduction_max = 0.50
|
||||
trust_work_reduction_max = 0.50 # 50% less work at max trust
|
||||
trust_gating_fraction = 0.30
|
||||
trust_gated_reward_boost = 0.50 # trust-4 task pays 3× base (was 1.6× at 0.15)
|
||||
trust_gated_reward_boost = 0.0 # no reward distortion
|
||||
|
||||
# --- Client loyalty ---
|
||||
loyalty_rat_fraction = 0.20
|
||||
loyalty_rat_fraction = 0.35
|
||||
loyalty_severity = 1.0
|
||||
loyalty_reveal_trust = 0.0
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue