diff --git a/eval/eval_basic.json b/eval/eval_basic.json index 6240ce9d..7a739f99 100644 --- a/eval/eval_basic.json +++ b/eval/eval_basic.json @@ -3,19 +3,29 @@ "name": "letter_counting", "min_words": 5, "max_words": 15, - "size": 10, + "size": 50, "seed": 42 }, { "name": "propositional_logic", - "size": 10, + "size": 50, "seed": 42 }, { "name": "leg_counting", "min_animals": 3, "max_animals": 8, - "size": 10, + "size": 50, "seed": 42 + }, + { + "name": "group_anagrams", + "size": 50, + "seed": 42 + }, + { + "name": "spell_backward", + "size": 50, + "seed": 42 } ] diff --git a/eval/results/summary_google_gemini-2.0-flash-001_20250210_220158.json b/eval/results/summary_google_gemini-2.0-flash-001_20250210_220158.json new file mode 100644 index 00000000..13b1ff77 --- /dev/null +++ b/eval/results/summary_google_gemini-2.0-flash-001_20250210_220158.json @@ -0,0 +1,61 @@ +[ + { + "dataset_name": "letter_counting", + "model": "google/gemini-2.0-flash-001", + "average_score": 0.1766, + "total_examples": 50, + "timestamp": "2025-02-10T22:00:57.977510", + "config": { + "min_words": 5, + "max_words": 15, + "size": 50, + "seed": 42 + } + }, + { + "dataset_name": "propositional_logic", + "model": "google/gemini-2.0-flash-001", + "average_score": 0.059000000000000004, + "total_examples": 50, + "timestamp": "2025-02-10T22:01:17.805230", + "config": { + "size": 50, + "seed": 42 + } + }, + { + "dataset_name": "leg_counting", + "model": "google/gemini-2.0-flash-001", + "average_score": 0.402, + "total_examples": 50, + "timestamp": "2025-02-10T22:01:22.652618", + "config": { + "min_animals": 3, + "max_animals": 8, + "size": 50, + "seed": 42 + } + }, + { + "dataset_name": "group_anagrams", + "model": "google/gemini-2.0-flash-001", + "average_score": 0.0, + "total_examples": 50, + "timestamp": "2025-02-10T22:01:57.094468", + "config": { + "size": 50, + "seed": 42 + } + }, + { + "dataset_name": "spell_backward", + "model": "google/gemini-2.0-flash-001", + "average_score": 0.4512, + "total_examples": 50, + "timestamp": "2025-02-10T22:01:58.325957", + "config": { + "size": 50, + "seed": 42 + } + } +] \ No newline at end of file