InternBootcamp/examples/unittests/run_eval.py
2025-06-16 10:33:07 +08:00

357 lines
No EOL
18 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import asyncio
import openai
import pandas as pd
import argparse
import json
import os
import random
import jsonlines
import logging
from copy import deepcopy
from internbootcamp.bootcamp import *
from internbootcamp.bootcamp_utils.formatted_time import formatted_time
from datetime import datetime, timedelta
# Disable logging
logging.disable(logging.CRITICAL)
TEMPLATE_MAP = {
"r1": {"chat_template":"<begin▁of▁sentence><User>{input}<Assistant><think>\n","stop_words":["<end▁of▁sentence>"]}, # r1 new chat template
"qwen": {"chat_template":"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n","stop_words":["<|im_end|>", "<|endoftext|>"]}, # default qwen template
"internthinker":{"chat_template":"<|im_start|>system\nYou are an expert reasoner with extensive experience in mathematical and code competitions. You approach problems through systematic thinking and rigorous reasoning. Your response should reflect deep understanding and precise logical thinking, making your solution path and reasoning clear to others. Please put your thinking process within <think>...</think> tags.<|im_end|>\n<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n","stop_words":["<|im_end|>", "<|endoftext|>"]},
"internbootcamp":{"chat_template":"<|im_start|>system\nYou are an expert reasoner with extensive experience in mathematical and code competitions. You approach problems through systematic thinking and rigorous reasoning. Your response should reflect deep understanding and precise logical thinking, making your solution path and reasoning clear to others. Please put your thinking process within <think>...</think> tags. After careful thought, present your final solution or answer clearly.<|im_end|>\n<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n","stop_words":["<|im_end|>", "<|endoftext|>"]},
"internbootcamp_v2":{"chat_template":"<|im_start|>system\nYou are a helpful assistant, skilled at solving various complex reasoning problems. When faced with any user questions, please first conduct a detailed thinking process, similar to drafting, where you can freely analyze problem-solving strategies and verify the correctness of your thought process. Please put your thinking process within <think> and </think> tags. After completing the thinking process, provide the user with a detailed response. Please note that the response accessible to the user will start after \"</think>\", so ensure that detailed chain-of-thought solution steps should be provided after the </think> tag.<|im_end|>\n<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n","stop_words":["<|im_end|>", "<|endoftext|>"]},
"chatml":{"chat_template":"<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n","stop_words":["<|im_end|>", "<|endoftext|>"]}, # No sys prompt chatml
}
# Global file paths, locks, and progress status
progress_file_path = None
progress_file_lock = asyncio.Lock()
progress_status = {} # Stores the current progress for each file
from datetime import timedelta
def format_progress_bar(current, total, start_time, update_time, bar_length=50):
"""
Format the progress bar with time statistics.
"""
# Validate inputs
if total <= 0:
print("Total must be greater than 0.")
return ""
if current < 0 or current > total:
print("Current must be between 0 and total.")
return ""
# Calculate progress percentage
percent = current / total
filled_length = int(bar_length * percent)
bar = '' * filled_length + '-' * (bar_length - filled_length)
# Calculate time statistics
elapsed_time = update_time - start_time
elapsed_time_seconds = int(elapsed_time.total_seconds()) # Extract total seconds from timedelta
elapsed_time_str = str(timedelta(seconds=elapsed_time_seconds))
if percent > 0:
remaining_time = elapsed_time_seconds * (1 - percent) / percent
remaining_time_str = str(timedelta(seconds=int(remaining_time)))
else:
remaining_time_str = "N/A" # Handle division by zero when percent is 0
# Format the output string
if current < total:
return (
f"{current}/{total} [{bar}] {percent:.1%} "
f"Elapsed: {elapsed_time_str} Remaining: {remaining_time_str} "
)
else:
return (
f"{current}/{total} [{bar}] {percent:.1%} "
f"Elapsed: {elapsed_time_str} Remaining: {remaining_time_str} "
f"Completed✔"
)
async def update_progress(position, description, total, init_model=False):
"""Update the progress bar with time statistics"""
update_time = datetime.now()
global progress_file_path, progress_status
async with progress_file_lock:
# Initialize progress for this position if not already done
if position not in progress_status and not init_model:
progress_status[position] = {"current": 0, "total": total, "start_time": update_time}
if init_model:
current = 0
start_time = update_time
else:
# Increment current progress
progress_status[position]["current"] += 1
current = progress_status[position]["current"]
total = progress_status[position]["total"]
start_time = progress_status[position]["start_time"]
# Open the progress file and update the corresponding line
with open(progress_file_path, 'r+') as f:
lines = f.readlines()
# Add empty lines if the file has fewer lines than the position
while len(lines) < position:
lines.append("\n")
# Update the corresponding line content
lines[position - 1] = f"{description}: {format_progress_bar(current, total, start_time=start_time,update_time=update_time)}\n"
# Write back to the file
f.seek(0)
f.writelines(lines)
f.truncate()
async def check_model_url_alive(url, api_key, model_name, max_attempts=60, interval=60):
"""
检查模型 URL 是否存活,并验证指定的 model_name 是否已注册。
"""
attempt = 0
print("Checking model URL availability and model registration...")
while attempt < max_attempts:
try:
# 创建 OpenAI 客户端
async with openai.AsyncOpenAI(base_url=url, api_key=api_key) as client:
# 获取模型列表
models = await client.models.list()
model_ids = [model.id for model in models.data] # 提取所有模型的 ID
if model_name in model_ids:
print(f"Model '{model_name}' is registered and available after {attempt * interval} seconds.")
return True
else:
print(f"Attempt {attempt + 1}: Model '{model_name}' is not registered yet. Retrying in {interval} seconds...")
except Exception as e:
print(f"Attempt {attempt + 1}: Model URL not available yet. Error: {str(e)}. Retrying in {interval} seconds...")
# 等待指定的间隔时间后重试
await asyncio.sleep(interval)
attempt += 1
# 如果超过最大尝试次数仍未成功,则抛出异常
raise RuntimeError(f"Model URL or model '{model_name}' did not become available within the maximum allowed time.")
async def process_item(client, item, bootcamp, template, output_dir, semaphore, api_mode, sys_prompt, max_tokens, temperature, timeout, model_name, max_retries, max_retrying_delay, position, total_items):
async with semaphore:
chat_template = template["chat_template"]
stop_words = template["stop_words"]
for attempt in range(max_retries):
try:
if api_mode == "chat_completion":
messages = [{"role": "user", "content": item["prompt"]}]
if sys_prompt:
messages.insert(0, {"role": "system", "content": sys_prompt})
response = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=max_tokens,
temperature=temperature,
timeout=timeout,
)
output = response.choices[0].message.content
elif api_mode == "completion":
response = await client.completions.create(
model=model_name,
prompt=chat_template.format(input=item["prompt"]), # Use templated prompt
max_tokens=max_tokens,
temperature=temperature,
timeout=timeout,
stop=stop_words if stop_words else None,
)
output = response.choices[0].text
else:
raise ValueError("Invalid API mode")
break
except Exception as e:
print(f"Retries remaining: {max_retries - attempt - 1}. Error occurred while processing {bootcamp.__name__}:{item['id']}. {e}")
if attempt == max_retries - 1:
await update_progress(position, os.path.basename(item['file_path']).replace(".jsonl", ""), total_items)
raise RuntimeError(f"Failed to process {item['id']} after {max_retries} attempts.")
await asyncio.sleep(min((attempt + 1) ^ 2, max_retrying_delay))
score = bootcamp.verify_score(output, item["ground_truth"], short_penalty=False, format_penalty=False)
try:
extracted = bootcamp.extract_output(output)
if type(extracted) is not str:
# Convert non-string extracted output to string, only in this way we can ensure that the output is JSON serializable
extracted = str(extracted)
except:
extracted = None
output_len = response.usage.completion_tokens if 'usage' in response else len(output.split())
result = {
"id": item["id"],
"prompt": item["prompt"],
"output_len": output_len,
"score": score,
"extracted_output": extracted,
"ground_truth": item["ground_truth"],
"output": output,
}
# Save results immediately
detail_file = os.path.join(output_dir, "details", os.path.basename(item['file_path']))
os.makedirs(os.path.dirname(detail_file), exist_ok=True)
async with asyncio.Lock():
with open(detail_file, 'a') as f:
try:
json.dump(result, f, ensure_ascii=False)
except Exception as e:
print(f"Error in saving details for {bootcamp.__name__} with result: {result}, which is {e}")
f.write('\n')
# Update progress bar with time statistics
await update_progress(position, os.path.basename(item['file_path']).replace(".jsonl", ""), total_items)
return result
async def evaluate_dataset(file_path, bootcamp, output_dir, template, semaphore, api_mode, sys_prompt, max_tokens, temperature, timeout, position, url, model_name, max_retries, max_retrying_delay,total_file_num,api_key):
global progress_file_path
# Load data
with jsonlines.open(file_path) as reader:
data = list(reader)
async with openai.AsyncOpenAI(base_url=url, api_key=api_key) as client:
tasks = []
for idx, row in enumerate(data):
item = {
"id": idx,
"file_path": file_path,
"prompt": row["prompt"],
"ground_truth": row["ground_truth"],
"data_source": row["data_source"]
}
task = process_item(client, item, bootcamp, template, output_dir, semaphore, api_mode, sys_prompt, max_tokens, temperature, timeout, model_name, max_retries, max_retrying_delay, position, len(data))
tasks.append(task)
results = await asyncio.gather(*tasks)
avg_score = sum(r['score'] for r in results) / len(results)
avg_len = sum(r['output_len'] for r in results) / len(results)
meta_info = {
"bootcamp": bootcamp.__name__,
"avg_score": avg_score,
"avg_len": avg_len
}
# update main progress bar
await update_progress(position=1, description="Main Progress",total=total_file_num, init_model=False)
# save meta info to file
meta_info_output_file = os.path.join(output_dir, "meta.jsonl")
os.makedirs(os.path.dirname(meta_info_output_file), exist_ok=True)
with jsonlines.open(meta_info_output_file, mode='a') as writer:
writer.write(meta_info)
return bootcamp.__name__, avg_score, avg_len
async def main():
parser = argparse.ArgumentParser()
parser.add_argument('--url', default='http://10.130.133.35:8000/v1',
help='Base URL of the OpenAI API compatible service. Default format is http://{ip}:{port}/v1.')
parser.add_argument('--api_key', default='EMPTY',
help='API key for accessing the model service. Set to "EMPTY" if no key is required.')
parser.add_argument('--model_name', default='Qwen2.5-32B-Instruct',
help='Name of the model to be evaluated, e.g., r1_32B or other custom model name.')
parser.add_argument('--test_dir', default='/cpfs01/shared/llm_ddd/lipeiji/InternBootcamp/examples/bootcamp_generator_outputs/2025-06-12-14:29:13/test',
help='Path to the directory containing test JSONL files for evaluation.')
parser.add_argument('--max_concurrent_requests', type=int, default=144,
help='Maximum number of concurrent requests allowed globally.')
parser.add_argument('--template', default='internbootcamp_v2',choices=['r1', 'qwen', 'internthinker', 'chatml','internbootcamp'],
help='Predefined conversation template used to format prompts. Only valid when api_mode is completion.')
parser.add_argument('--max_tokens', type=int, default=8192,
help='Maximum number of tokens the model can generate.')
parser.add_argument('--temperature', type=float, default=0,
help='Controls randomness in text generation. Lower values produce more deterministic outputs.')
parser.add_argument('--timeout', type=int, default=6000,
help='Request timeout in milliseconds.')
parser.add_argument('--api_mode', default='completion',choices=['completion', 'chat_completion'],
help='API mode to use: "completion" for raw text generation or "chat_completion" for chat-style APIs.')
parser.add_argument('--sys_prompt', type=str,
help='System prompt content used in chat_completion mode. If not provided, uses the default from the template (if any).')
parser.add_argument('--max_retries', type=int, default=8,
help='Maximum number of retries for failed requests.')
parser.add_argument('--max_retrying_delay', type=int, default=60,
help='Maximum delay between retries in seconds (using exponential backoff).')
args = parser.parse_args()
# Check if the model URL is alive before starting evaluation
await check_model_url_alive(args.url,api_key=args.api_key,model_name=args.model_name, max_attempts=60, interval=60)
cur_file_dir = os.path.dirname(os.path.abspath(__file__))
output_dir = f"{cur_file_dir}/output/{args.model_name}_{os.path.basename(args.test_dir)}_{formatted_time()}"
os.makedirs(os.path.join(output_dir, 'details'), exist_ok=True)
# save args to output_dir
args_dict = vars(args)
args_output_file = os.path.join(output_dir, "eval_args.json")
with open(args_output_file, 'w') as f:
json.dump(args_dict, f, indent=4)
# Set progress log file path
global progress_file_path
progress_file_path = os.path.join(output_dir, "progress.log")
open(progress_file_path, 'w').close() # Clear file content
# Notify the user to check detailed outputs and progress
print(f"\nEvaluating model {args.model_name}. Please check the progress at {progress_file_path}. \nDetailed outputs will be saved in {os.path.join(output_dir, 'details')}.")
# Collect all tasks
tasks = []
global_semaphore = asyncio.Semaphore(args.max_concurrent_requests)
position = 2
test_files = os.listdir(args.test_dir)
total_file_num = len(test_files)
for file_name in test_files:
if not file_name.endswith('.jsonl'):
print(f"Skipping non-JSONL file: {file_name}")
continue
file_path = os.path.join(args.test_dir, file_name)
with jsonlines.open(file_path) as f:
list_f = list(f)
cur_file_num = len(list_f)
data_source = list_f[0]['data_source']
bootcamp_class = globals().get(f"{data_source}bootcamp")
if not bootcamp_class:
print(f"bootcamp class not found: {data_source}bootcamp")
continue
# Assign a fixed position for each dataset
task = evaluate_dataset(
file_path=file_path,
bootcamp=bootcamp_class,
output_dir=output_dir,
template=TEMPLATE_MAP[args.template],
semaphore=global_semaphore,
api_mode=args.api_mode,
sys_prompt=args.sys_prompt,
max_tokens=args.max_tokens,
temperature=args.temperature,
timeout=args.timeout,
position=position,
url=args.url,
api_key=args.api_key,
model_name=args.model_name,
max_retries=args.max_retries,
max_retrying_delay=args.max_retrying_delay,
total_file_num=total_file_num
)
# Init progress bar
await update_progress(position=position, description=os.path.basename(file_path).replace(".jsonl", ""), total=cur_file_num, init_model=True)
tasks.append(task)
position += 1
# Init total progress bar
await update_progress(position=1, description="Main Progress",total=total_file_num, init_model=True)
# Execute all tasks
results = await asyncio.gather(*tasks)
results = sorted(results, key=lambda x: x[0])
# Save results
df = pd.DataFrame(results, columns=["bootcamp", "Average Score", "Average Output Length"])
df.loc[len(df)] = ["Total Average", df["Average Score"].mean(), df["Average Output Length"].mean()]
df.to_excel(os.path.join(output_dir, f"{args.model_name}_scores.xlsx"), index=False)
if __name__ == "__main__":
asyncio.run(main())