[eval-basic] initial scripts for evaluating models on reasoning gym

2026-04-19 12:58:07 +00:00 · 2025-02-09 22:36:27 -08:00 · 2025-02-09 22:36:27 -08:00 · 75cfd31ec2
commit 75cfd31ec2
parent 8c4400b18a
11 changed files with 1306 additions and 0 deletions
--- a/eval/eval_basic.sh
+++ b/eval/eval_basic.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Evaluate each model in MODELS with eval_basic.py; requires OPENROUTER_API_KEY.
+set -euo pipefail  # abort on the first failing command instead of reporting success
+
+if [[ -z "${OPENROUTER_API_KEY:-}" ]]; then  # ":-" keeps "set -u" from aborting first
+    echo "Error: OPENROUTER_API_KEY environment variable is not set" >&2
+    echo "Please set it using: export OPENROUTER_API_KEY=your-api-key" >&2
+    exit 1
+fi
+
+# Configuration: relative paths below resolve against the current directory
+OUTPUT_DIR="results"
+
+# List of models to evaluate
+MODELS=(
+    "google/gemini-2.0-flash-001"
+)
+
+mkdir -p "$OUTPUT_DIR"  # no-op when the directory already exists
+
+# Run evaluations; "set -e" stops the loop at the first model whose run fails
+for model in "${MODELS[@]}"; do
+    echo "Evaluating $model..."
+    python eval_basic.py \
+        --model "$model" \
+        --config "eval_basic.json" \
+        --output-dir "$OUTPUT_DIR"
+done
+
+echo "All evaluations completed!"