mirror of
https://github.com/lilakk/BLEUBERI.git
synced 2026-04-19 12:58:12 +00:00
86 lines
No EOL
3.9 KiB
Bash
86 lines
No EOL
3.9 KiB
Bash
# Runs src/eval.py in pairwise mode to compare one model against a reference
# model using an evaluator model, either as a single process or split into
# parallel shards whose outputs are then combined by src/merge_results.py.
#
# Usage: bash <this script> MODEL [REF_MODEL] [EVAL_MODEL] [USE_CHECKLIST] [NUM_SHARDS]
#   MODEL          model to test (required)
#   REF_MODEL      model to compare against (default: gpt-3.5-turbo-0125)
#   EVAL_MODEL     evaluator model          (default: gpt-4.1-mini)
#   USE_CHECKLIST  accepted so existing positional invocations keep working;
#                  it is not forwarded to src/eval.py (default: True)
#   NUM_SHARDS     number of parallel shards (default: 8)
#
# Exit status: 2 on invalid arguments, 1 if the output folder cannot be
# created or any shard fails, otherwise the status of the last python command.

# Total number of examples that the shards have to cover.
readonly TOTAL_EX=1024
readonly EVAL_TEMPLATE="evaluation/eval_template.pairwise.v2.md"

#######################################
# Print the index range of one shard as "START END".
# Consecutive shards share a boundary (one shard's END is the next shard's
# START), matching the values handed to --start_idx/--end_idx.
# The last shard absorbs the remainder so that every index below TOTAL is
# covered even when TOTAL is not a multiple of SHARDS.
# Arguments: $1 - total number of examples
#            $2 - number of shards (positive integer, at most $1)
#            $3 - 0-based shard index
# Outputs:   "START END" on stdout
#######################################
shard_range() {
  local total=$1 shards=$2 index=$3
  local size=$(( total / shards ))
  local first=$(( index * size ))
  local last=$(( first + size ))
  if (( index == shards - 1 )); then
    last=$total
  fi
  printf '%s %s\n' "$first" "$last"
}

#######################################
# Parse the positional arguments and run the evaluation.
# Arguments: see the usage text at the top of this script.
# Returns:   see "Exit status" at the top of this script.
#######################################
main() {
  if (( $# < 1 )) || [[ -z "$1" ]]; then
    printf 'Usage: %s MODEL [REF_MODEL] [EVAL_MODEL] [USE_CHECKLIST] [NUM_SHARDS]\n' \
      "${0##*/}" >&2
    return 2
  fi

  local model_name=$1                       # model to test
  local ref_name=${2:-"gpt-3.5-turbo-0125"} # model to compare
  local gpt_eval_name=${3:-"gpt-4.1-mini"}  # evaluator name
  local num_shards=${5:-8}                  # shards

  # Reject 0, negatives and non-numbers up front: they would otherwise cause
  # a division by zero or a broken numeric comparison further down.
  if ! [[ "$num_shards" =~ ^[1-9][0-9]*$ ]]; then
    printf 'num_shards must be a positive integer, got: %s\n' "$num_shards" >&2
    return 2
  fi
  # More shards than examples would produce zero-width shards.
  if (( num_shards > TOTAL_EX )); then
    printf 'num_shards (%s) exceeds %s examples; using %s shards\n' \
      "$num_shards" "$TOTAL_EX" "$TOTAL_EX" >&2
    num_shards=$TOTAL_EX
  fi

  # The trailing slash is kept because this exact string is handed to
  # src/merge_results.py and is also the prefix of every output path.
  local eval_folder="eval_results/v2.0522/pairwise.v2/eval=${gpt_eval_name}/ref=${ref_name}/"

  echo "Evaluating $model_name vs $ref_name using $gpt_eval_name with $EVAL_TEMPLATE"

  mkdir -p -- "$eval_folder" || return 1

  # Arguments shared by the single-process run and by every shard.
  local -a eval_cmd=(
    python src/eval.py
    --action eval
    --model "$gpt_eval_name"
    --max_words_to_eval 1000
    --mode pairwise
    --eval_template "$EVAL_TEMPLATE"
    --target_model_name "$model_name"
    --ref_model_name "$ref_name"
  )

  if (( num_shards == 1 )); then
    "${eval_cmd[@]}" --eval_output_file "${eval_folder}/${model_name}.json"
    return
  fi

  echo "Using $num_shards shards"
  echo "Shard size: $(( TOTAL_EX / num_shards ))"

  local -a pids=()
  local shard start end
  for (( shard = 0; shard < num_shards; shard++ )); do
    read -r start end <<<"$(shard_range "$TOTAL_EX" "$num_shards" "$shard")"
    echo "Evaluating $model_name vs $ref_name from $start to $end"
    "${eval_cmd[@]}" \
      --eval_output_file "${eval_folder}/${model_name}.${start}-${end}.json" \
      --start_idx "$start" --end_idx "$end" &
    pids+=("$!")
  done

  # Wait for every shard individually so that a failing shard is noticed;
  # a bare `wait` would discard the exit statuses.
  local failed=0 pid
  for pid in "${pids[@]}"; do
    wait "$pid" || failed=$(( failed + 1 ))
  done
  if (( failed > 0 )); then
    printf '%s of %s shard(s) failed; skipping merge\n' "$failed" "$num_shards" >&2
    return 1
  fi

  # Run the merge results script after all evaluation shards have completed.
  python src/merge_results.py "$eval_folder" "$model_name"
}

main "$@"

## V2
# bash evaluation/run_eval_v2_internal.sh gpt-3.5-turbo-0125 gpt-4-turbo-2024-04-09
# bash evaluation/run_eval_v2_internal.sh claude-3-haiku-20240307 gpt-4-turbo-2024-04-09
# bash evaluation/run_eval_v2_internal.sh claude-3-sonnet-20240229 gpt-4-turbo-2024-04-09
# bash evaluation/run_eval_v2_internal.sh claude-3-opus-20240229 gpt-4-turbo-2024-04-09
# bash evaluation/run_eval_v2_internal.sh Meta-Llama-3-8B-Instruct gpt-4-turbo-2024-04-09
# bash evaluation/run_eval_v2_internal.sh Meta-Llama-3-70B-Instruct gpt-4-turbo-2024-04-09
# bash evaluation/run_eval_v2_internal.sh gpt-3.5-turbo-0125 gpt-4o-2024-05-13
# bash evaluation/run_eval_v2_internal.sh claude-3-haiku-20240307 claude-3-opus-20240229
# bash evaluation/run_eval_v2_internal.sh claude-3-sonnet-20240229 claude-3-opus-20240229
# bash evaluation/run_eval_v2_internal.sh claude-3-haiku-20240307 claude-3-sonnet-20240229
# bash evaluation/run_eval_v2_internal.sh Meta-Llama-3-8B-Instruct Meta-Llama-3-70B-Instruct
# bash evaluation/run_eval_v2_internal.sh Qwen1.5-72B-Chat Qwen1.5-7B-Chat@together


## V2.0522

# test_model, ref_model, eval_model

# bash evaluation/run_eval_v2_internal.sh claude-3-haiku-20240307 gpt-4-turbo-2024-04-09 gpt-4-turbo-2024-04-09
# bash evaluation/run_eval_v2_internal.sh claude-3-sonnet-20240229 gpt-4-turbo-2024-04-09 gpt-4-turbo-2024-04-09
# bash evaluation/run_eval_v2_internal.sh claude-3-opus-20240229 gpt-4-turbo-2024-04-09 gpt-4-turbo-2024-04-09
# bash evaluation/run_eval_v2_internal.sh Meta-Llama-3-8B-Instruct gpt-4-turbo-2024-04-09 gpt-4-turbo-2024-04-09
# bash evaluation/run_eval_v2_internal.sh Meta-Llama-3-70B-Instruct gpt-4-turbo-2024-04-09 gpt-4-turbo-2024-04-09

# bash evaluation/run_eval_v2_internal.sh claude-3-opus-20240229 gpt-4-turbo-2024-04-09 claude-3-opus-20240229