Eval N completions per prompt (#374)

* feat: Add support for generating multiple completions per prompt
* feat: Track best and mean scores for multiple completions per prompt
* feat: Add checkpoint and resume functionality to evaluation script
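
The second bullet implies a per-prompt aggregation step over the N sampled completions. A minimal sketch of what that could look like (the helper name and return shape are assumptions, not code from this commit):

from statistics import mean

def aggregate_prompt_scores(scores: list[float]) -> dict[str, float]:
    # Hypothetical helper: given the scores of the N completions sampled
    # for one prompt, report both the best and the mean score.
    return {"best_score": max(scores), "mean_score": mean(scores)}

# Example: 4 completions for one prompt, two of which solved the task
print(aggregate_prompt_scores([0.0, 1.0, 1.0, 0.5]))  # {'best_score': 1.0, 'mean_score': 0.625}
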
Andreas Köpf 2025-03-15 16:39:36 +01:00 committed by GitHub
parent bd13b1b92a
commit bfa5f8078b
12 changed files with 426 additions and 126 deletions

@@ -55,6 +55,7 @@ class EvalConfig:
default_seed: Optional[int] = None # Default random seed if not specified for a dataset
save_metadata: bool = False # Whether to include dataset entry metadata in results
save_full_results: bool = False # Whether to save the full results file
+completions_per_prompt: int = 1 # Number of completions to generate per prompt
# Sampling parameters
max_tokens: Optional[int] = 32768 # Maximum number of tokens to generate
temperature: Optional[float] = 0.6 # Sampling temperature (higher = more random)
@@ -175,5 +176,6 @@ class EvalConfig:
max_tokens=config_data.get("max_tokens", 32768),
temperature=config_data.get("temperature", 0.6),
top_p=config_data.get("top_p", 0.95),
+completions_per_prompt=config_data.get("completions_per_prompt", 1),
categories=categories,
)
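
The hunks above only add the config field and its loader default; the generation side lives elsewhere in the changeset. A rough sketch of how an eval loop might consume completions_per_prompt, assuming a generic llm_generate callable (everything here except the EvalConfig field names is hypothetical):

def generate_completions(prompts, llm_generate, config):
    # Hypothetical driver: request config.completions_per_prompt samples for
    # each prompt, reusing the sampling parameters from EvalConfig.
    results = {}
    for prompt in prompts:
        results[prompt] = [
            llm_generate(
                prompt,
                max_tokens=config.max_tokens,
                temperature=config.temperature,
                top_p=config.top_p,
            )
            for _ in range(config.completions_per_prompt)
        ]
    return results
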