yc-bench/run_all_models.sh

87 lines
2.1 KiB
Bash
Executable file

#!/bin/bash
# Run models on medium config across seeds — PARALLEL across models.
# Usage:
# bash run_all_models.sh # all models, seeds 1-2
# bash run_all_models.sh --seed "1" # single seed
# bash run_all_models.sh --config medium # custom config
CONFIG="${CONFIG:-medium}"
SEEDS="1 2"
# Parse optional args
while [[ $# -gt 0 ]]; do
case $1 in
--seed) SEEDS="$2"; shift 2;;
--config) CONFIG="$2"; shift 2;;
*) echo "Unknown arg: $1"; exit 1;;
esac
done
# Models to run (direct API)
MODELS=(
"openai/gpt-5.4"
"openai/gpt-5.4-mini"
"openai/gpt-5.4-nano"
"gemini/gemini-3.1-pro-preview"
"gemini/gemini-3-flash-preview"
)
echo "=== YC-Bench Experiment Run (PARALLEL) ==="
echo "Config: $CONFIG"
echo "Seeds: $SEEDS"
echo "Models: ${#MODELS[@]}"
echo ""
mkdir -p db results plots
PIDS=()
LABELS=()
for model in "${MODELS[@]}"; do
for seed in $SEEDS; do
db_name=$(echo "$model" | tr '/' '_')
result_file="results/yc_bench_result_${CONFIG}_${seed}_${db_name}.json"
if [[ -f "$result_file" ]]; then
echo " SKIP $model seed=$seed (result exists)"
continue
fi
echo " LAUNCH $model | $CONFIG seed=$seed"
db_path="db/${CONFIG}_${seed}_${db_name}.db"
rm -f "$db_path"
uv run yc-bench run \
--model "$model" \
--seed "$seed" \
--config "$CONFIG" \
--no-live \
> "logs/${db_name}_seed${seed}.log" 2>&1 &
PIDS+=($!)
LABELS+=("$model seed=$seed")
done
done
echo ""
echo "Launched ${#PIDS[@]} runs in parallel. Waiting..."
echo ""
# Wait for all and report
FAILED=0
for i in "${!PIDS[@]}"; do
wait "${PIDS[$i]}"
EXIT_CODE=$?
if [[ $EXIT_CODE -eq 0 ]]; then
echo " DONE ${LABELS[$i]}"
else
echo " FAIL ${LABELS[$i]} (exit $EXIT_CODE)"
FAILED=$((FAILED + 1))
fi
done
echo ""
echo "=== Complete: $((${#PIDS[@]} - FAILED)) succeeded, $FAILED failed ==="
echo ""
echo "Plot results with:"
echo " uv run python scripts/plot_run.py results/yc_bench_result_${CONFIG}_*.json"