diff --git a/eval/eval.py b/eval/eval.py index e31bf25f..ce117d4d 100644 --- a/eval/eval.py +++ b/eval/eval.py @@ -123,7 +123,7 @@ async def main_async(): eval_start_time = time.time() all_results = await evaluator.evaluate_datasets(dataset_configs) - print(f'Time taken to collect evaluation data: {time.time() - eval_start_time}') + print(f'Time taken to collect evaluation data: {time.time() - eval_start_time:.2f} seconds') # Save results output_file = os.path.join( args.output_dir, diff --git a/eval/results/summary_google_gemini-2.0-flash-001_20250210_214631.json b/eval/results/summary_google_gemini-2.0-flash-001_20250210_214631.json new file mode 100644 index 00000000..587afe36 --- /dev/null +++ b/eval/results/summary_google_gemini-2.0-flash-001_20250210_214631.json @@ -0,0 +1,39 @@ +[ + { + "dataset_name": "letter_counting", + "model": "google/gemini-2.0-flash-001", + "average_score": 0.059, + "total_examples": 10, + "timestamp": "2025-02-10T21:46:27.185026", + "config": { + "min_words": 5, + "max_words": 15, + "size": 10, + "seed": 42 + } + }, + { + "dataset_name": "propositional_logic", + "model": "google/gemini-2.0-flash-001", + "average_score": 0.059, + "total_examples": 10, + "timestamp": "2025-02-10T21:46:31.805110", + "config": { + "size": 10, + "seed": 42 + } + }, + { + "dataset_name": "leg_counting", + "model": "google/gemini-2.0-flash-001", + "average_score": 0.40199999999999997, + "total_examples": 10, + "timestamp": "2025-02-10T21:46:31.805665", + "config": { + "min_animals": 3, + "max_animals": 8, + "size": 10, + "seed": 42 + } + } +] \ No newline at end of file