Add eval configs, small fixes to eval script & rush-hour score_answer

This commit is contained in:
Andreas Köpf 2025-03-16 09:18:05 +01:00 committed by GitHub
parent fa950d0189
commit 677a2af03e
6 changed files with 283 additions and 22 deletions

View file

@@ -399,17 +399,16 @@ class AsyncModelEvaluator:
Dict with processing results
"""
responses = None
completion_results = []
best_score = 0.0
total_score = 0.0
best_answer = None
best_response = None
try:
# Get multiple model responses
responses = await self.get_model_response(entry["question"])
# Process each response
completion_results = []
best_score = 0.0
total_score = 0.0
best_answer = None
best_response = None
# Count total completions for mean score calculation
total_completions = len(responses)