BLEUBERI/eval/show_eval_results.sh
2025-06-04 20:36:43 +00:00

18 lines
496 B
Bash

#!/bin/bash
#
# Runs the result-display command of each benchmark under eval/, in order:
# arena-hard, arena-hard-v2.0, FastChat llm_judge, and WildBench.
#
# All paths are relative to the current working directory, so invoke this
# script from the directory that contains eval/.
#
# Every step is attempted even if an earlier one fails; the script's exit
# status is non-zero when at least one step failed.

#######################################
# Run a command with the given directory as its working directory.
# The directory change happens in a subshell, so the caller's working
# directory is never modified and a failed cd cannot shift later steps.
# Arguments: $1   - directory to run in
#            $2.. - command and its arguments
# Outputs:   whatever the command writes; an error on stderr if the
#            directory cannot be entered
# Returns:   the command's exit status, or non-zero if cd failed (in which
#            case the command is not run)
#######################################
run_in_dir() {
  local dir=$1
  shift
  (
    # CDPATH is cleared so a relative path only resolves against $PWD.
    CDPATH= cd -- "$dir" || {
      printf 'error: cannot enter directory: %s\n' "$dir" >&2
      exit 1
    }
    "$@"
  )
}

failures=0

run_in_dir eval/arena-hard \
  python3 show_result.py --style-control --load-bootstrap --load-battles ||
  failures=$((failures + 1))

# for arena-hard-v2.0, there can be small floating point changes in the
# results compared to what's reported in the paper
run_in_dir eval/arena-hard-v2.0 \
  python3 show_result.py --judge-names gpt-4.1-mini --control-features markdown length ||
  failures=$((failures + 1))

run_in_dir eval/FastChat/fastchat/llm_judge \
  python show_result.py ||
  failures=$((failures + 1))

run_in_dir eval/WildBench \
  bash leaderboard/show_eval.sh score_only ||
  failures=$((failures + 1))

if [[ "$failures" -ne 0 ]]; then
  printf 'warning: %d result step(s) failed\n' "$failures" >&2
fi

# Final command: its status becomes the script's exit status.
[[ "$failures" -eq 0 ]]