#!/usr/bin/env bash
#
# Run WildBench (v2) inference for one model through src/unified_infer.py.
#
# Usage: <this script> MODEL_NAME MODEL_PRETTY_NAME N_SHARDS
#   MODEL_NAME         forwarded as --model_name
#   MODEL_PRETTY_NAME  forwarded as --model_pretty_name; also used to name the
#                      shard directory and the merged <name>.json
#   N_SHARDS           1  -> a single process, tensor-parallel across every
#                            GPU reported by nvidia-smi
#                      >1 -> N_SHARDS background processes, one per GPU id
#                            (0 .. N_SHARDS-1), each handling a contiguous
#                            [start, end) slice of the examples; the shard
#                            outputs are then merged by src/merge_results.py
# Environment:
#   HF_HOME  forwarded as --download_dir (the literal "default" if unset/empty)
#
# Exit status: 0 on success, 2 on bad usage, 1 on any runtime failure.

set -euo pipefail

# Print a message to stderr and exit (status defaults to 1).
die() {
  printf '%s\n' "$1" >&2
  exit "${2:-1}"
}

model_name=${1:-}
model_pretty_name=${2:-}
n_shards=${3:-}

if [[ -z "$model_name" || -z "$model_pretty_name" || -z "$n_shards" ]]; then
  die "Usage: ${0##*/} MODEL_NAME MODEL_PRETTY_NAME N_SHARDS" 2
fi
# Reject anything that is not a positive integer up front, so the branches
# below never see an empty or non-numeric value.
[[ "$n_shards" =~ ^[0-9]+$ ]] || die "Invalid n_shards" 2
n_shards=$((10#$n_shards)) # force base 10 so "08" is not parsed as octal
((n_shards >= 1)) || die "Invalid n_shards" 2

TEMP=0
TOP_P=1.0
MAX_TOKENS=4096
batch_size=1
CACHE_DIR=${HF_HOME:-"default"}
output_dir="result_dirs/wild_bench_v2/"

# Flags shared by every unified_infer.py invocation.
common_args=(
  --data_name wild_bench
  --model_name "$model_name"
  --use_hf_conv_template --use_imend_stop
  --download_dir "$CACHE_DIR"
  --dtype bfloat16
  --model_pretty_name "$model_pretty_name"
  --top_p "$TOP_P" --temperature "$TEMP"
  --batch_size "$batch_size" --max_tokens "$MAX_TOKENS"
)

if ((n_shards == 1)); then
  # Single process: discover the GPU count and expose all of them.
  # nvidia-smi prints the count once per GPU; only the first line is needed.
  gpu_query=$(nvidia-smi --query-gpu=count --format=csv,noheader) ||
    die "nvidia-smi failed; cannot determine the number of GPUs"
  num_gpus=${gpu_query%%$'\n'*}
  [[ "$num_gpus" =~ ^[1-9][0-9]*$ ]] ||
    die "Unexpected GPU count from nvidia-smi: '$num_gpus'"

  # Comma-separated ids 0..num_gpus-1 (built by hand: `seq -s,` output
  # differs between GNU and BSD).
  gpu=0
  for ((i = 1; i < num_gpus; i++)); do
    gpu+=",$i"
  done
  echo "n_shards = 1; num_gpus = $num_gpus; gpu = $gpu"

  CUDA_VISIBLE_DEVICES="$gpu" \
    python src/unified_infer.py \
    "${common_args[@]}" \
    --tensor_parallel_size "$num_gpus" \
    --output_folder "$output_dir/"
else
  # Data parallelism: one single-GPU process per shard.
  TOTAL_EXAMPLE=1024
  ((n_shards <= TOTAL_EXAMPLE)) ||
    die "Invalid n_shards: must not exceed $TOTAL_EXAMPLE"
  echo "Using Data-parallelism"
  start_gpu=0
  num_gpus=1
  shard_size=$((TOTAL_EXAMPLE / n_shards))
  shards_dir="${output_dir}/tmp_${model_pretty_name}"

  pids=()
  for ((shard = 0; shard < n_shards; shard++)); do
    start=$((shard * shard_size))
    if ((shard == n_shards - 1)); then
      # The division above truncates; the last shard absorbs the remainder
      # so that no example is silently skipped.
      end=$TOTAL_EXAMPLE
    else
      end=$((start + shard_size))
    fi
    gpu=$((start_gpu + shard))

    CUDA_VISIBLE_DEVICES="$gpu" \
      python src/unified_infer.py \
      --start_index "$start" --end_index "$end" \
      "${common_args[@]}" \
      --tensor_parallel_size "$num_gpus" \
      --output_folder "$shards_dir/" &
    pids+=("$!")
  done

  # Wait on each shard individually: a bare `wait` discards exit statuses,
  # which would let a crashed shard go unnoticed and a partial merge be
  # published as the final result.
  failed=0
  for pid in "${pids[@]}"; do
    wait "$pid" || failed=$((failed + 1))
  done
  ((failed == 0)) || die "$failed of $n_shards shard(s) failed; skipping merge"

  python src/merge_results.py "$shards_dir/" "$model_pretty_name"
  cp -- "$shards_dir/${model_pretty_name}.json" "$output_dir/${model_pretty_name}.json"
fi