BLEUBERI/eval/arena-hard-v2.0/gen_judgment.py
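
"""Generate judge verdicts for Arena-Hard-v2.0.

Each evaluated model's answers are compared pairwise against a per-category
baseline by a judge model. Every question is judged twice with the answer
order swapped, and results are appended to per-model JSONL files under
data/<bench_name>/model_judgment/<judge_model>/.
"""
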
import json
import argparse
import os
import re
import concurrent.futures

from tqdm import tqdm

from utils.completion import (
    load_questions,
    load_model_answers,
    registered_api_completion,
    get_endpoint,
    make_config,
)
from utils.judge_utils import JUDGE_SETTINGS


def get_score(judgment, patterns):
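    """Return the final verdict label found in a judgment string.

    Each regex in `patterns` is tried against the uppercased judgment; the
    last non-empty match of the first matching pattern is returned, or None
    if nothing matches.
    """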
    for pattern in patterns:
        pattern = re.compile(pattern)
        matches = pattern.findall(judgment.upper())
        matches = [m for m in matches if m != ""]
        if len(set(matches)) > 0:
            return matches[-1].strip("\n")
    return None


def pairwise_judgment(question, baseline, answer, reference, configs, settings):
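    """Ask the judge to compare two answers to one question.

    `baseline` is presented as answer A and `answer` as answer B. Returns a
    dict with the parsed score, the raw judge output, and the prompt that
    was sent, or None if the API call returned nothing.
    """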
    prompt_args = {
        "QUESTION": question["prompt"],
        "ANSWER_A": baseline["messages"][-1]["content"]["answer"],
        "ANSWER_B": answer["messages"][-1]["content"]["answer"],
    }
    if reference:
        # `reference` arrives as a list of answer dicts (one per reference
        # model); the prompt template has a single {REFERENCE} slot, so the
        # first reference is used.
        prompt_args["REFERENCE"] = reference[0]["messages"][-1]["content"]["answer"]
    user_prompt = configs["prompt_template"].format(**prompt_args)

    messages = [
        {
            "role": "system",
            "content": JUDGE_SETTINGS[question["category"]]["system_prompt"],
        },
        {
            "role": "user",
            "content": user_prompt,
        },
    ]

    # build arguments for api completions
    kwargs = settings | {
        "api_dict": get_endpoint(settings["endpoints"]),
        "messages": messages,
    }
    kwargs["temperature"] = configs["temperature"]
    kwargs["max_tokens"] = configs["max_tokens"]

    api_completion_func = registered_api_completion[settings["api_type"]]
    output = api_completion_func(**kwargs)

    if output is None:
        return None

    score = get_score(output["answer"], configs["regex_patterns"])
    result = {
        "score": score,
        "judgment": output,
        "prompt": messages,
    }
    return result


def judgment(args):
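    """Judge one model answer against its baseline in both answer orders.

    Two games are played with the answer positions swapped (a standard
    control for judge position bias), and the combined record is appended
    to the model's output JSONL file.
    """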
    answer = args["answer"]
    baseline = args["baseline"]

    output = {
        "uid": args["question"]["uid"],
        "category": args["question"]["category"],
        "judge": args["configs"]["judge_model"],
        "model": answer["model"],
        "baseline": baseline["model"],
        "games": [],
    }

    # round 1: baseline as answer A, model as answer B
    result = pairwise_judgment(
        question=args["question"],
        baseline=baseline,
        answer=answer,
        reference=args["reference"],
        configs=args["configs"],
        settings=args["settings"],
    )
    output["games"].append(result)

    # round 2: positions swapped
    result = pairwise_judgment(
        question=args["question"],
        baseline=answer,
        answer=baseline,
        reference=args["reference"],
        configs=args["configs"],
        settings=args["settings"],
    )
    output["games"].append(result)

    with open(args["output_file"], "a", encoding="utf-8") as f:
        f.write(json.dumps(output, ensure_ascii=False) + "\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--setting-file", type=str, default="config/arena-hard-v2.0.yaml")
    parser.add_argument("--endpoint-file", type=str, default="config/api_config.yaml")
    parser.add_argument("--model", type=str, default=None)
    args = parser.parse_args()
    print(args)

    configs = make_config(args.setting_file)
    endpoint_list = make_config(args.endpoint_file)

    print(
        f'judge model: {configs["judge_model"]}, reference: {configs["reference"]}, '
        f'temperature: {configs["temperature"]}, max tokens: {configs["max_tokens"]}'
    )

    question_file = os.path.join("data", configs["bench_name"], "question.jsonl")
    answer_dir = os.path.join("data", configs["bench_name"], "model_answer")

    questions = load_questions(question_file)
    model_answers = load_model_answers(answer_dir)

    # if the user chose a single model, only judge that model
    if args.model:
        models = [args.model]
    else:
        models = list(configs["model_list"])

    if configs["reference"]:
        assert all(
            ref not in models for ref in configs["reference"]
        ), "ERROR: one of the models being evaluated is used as reference."
        ref_answers = [model_answers[ref] for ref in configs["reference"]]
    else:
        ref_answers = None

    output_files = {}
    output_dir = f"data/{configs['bench_name']}/model_judgment/{configs['judge_model']}"
    for model in models:
        output_files[model] = os.path.join(output_dir, f"{model}.jsonl")

    for output_file in output_files.values():
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # previously written judgments are reloaded so finished questions can be skipped
    existing_judgments = load_model_answers(output_dir)

    endpoint_settings = endpoint_list[configs["judge_model"]]

    with concurrent.futures.ThreadPoolExecutor(
        max_workers=endpoint_settings["parallel"]
    ) as executor:
        futures = []
        for model in models:
            count = 0
            for question in questions:
                uid = question["uid"]
                kwargs = {}
                kwargs["question"] = question
                if model not in model_answers or uid not in model_answers[model]:
                    print(f"Warning: {model} answer to {uid} cannot be found.")
                    continue
                # skip questions that already have a judgment for this model
                if model in existing_judgments and uid in existing_judgments[model]:
                    count += 1
                    continue
                kwargs["answer"] = model_answers[model][uid]
                # the baseline model is fixed per question category
                kwargs["baseline"] = model_answers[
                    JUDGE_SETTINGS[question["category"]]["baseline"]
                ][uid]
                if ref_answers:
                    kwargs["reference"] = [ref_answer[uid] for ref_answer in ref_answers]
                else:
                    kwargs["reference"] = None
                kwargs["configs"] = configs
                kwargs["settings"] = endpoint_settings
                kwargs["output_file"] = output_files[model]
                future = executor.submit(judgment, kwargs)
                futures.append(future)
            if count > 0:
                print(f"{model}: skipped {count} existing judgments")

        for future in tqdm(
            concurrent.futures.as_completed(futures), total=len(futures)
        ):
            future.result()
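
# Example invocation (a sketch: the config paths shown are the argparse
# defaults above, and <model_name> is a placeholder for a model with answers
# under data/<bench_name>/model_answer):
#   python gen_judgment.py --setting-file config/arena-hard-v2.0.yaml \
#       --endpoint-file config/api_config.yaml --model <model_name>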