reasoning-gym/eval/results/summary_google_gemini-2.0-flash-001_20250210_220158.json
2025-02-10 22:05:45 -08:00

61 lines
1.3 KiB
JSON

[
{
"dataset_name": "letter_counting",
"model": "google/gemini-2.0-flash-001",
"average_score": 0.1766,
"total_examples": 50,
"timestamp": "2025-02-10T22:00:57.977510",
"config": {
"min_words": 5,
"max_words": 15,
"size": 50,
"seed": 42
}
},
{
"dataset_name": "propositional_logic",
"model": "google/gemini-2.0-flash-001",
"average_score": 0.059000000000000004,
"total_examples": 50,
"timestamp": "2025-02-10T22:01:17.805230",
"config": {
"size": 50,
"seed": 42
}
},
{
"dataset_name": "leg_counting",
"model": "google/gemini-2.0-flash-001",
"average_score": 0.402,
"total_examples": 50,
"timestamp": "2025-02-10T22:01:22.652618",
"config": {
"min_animals": 3,
"max_animals": 8,
"size": 50,
"seed": 42
}
},
{
"dataset_name": "group_anagrams",
"model": "google/gemini-2.0-flash-001",
"average_score": 0.0,
"total_examples": 50,
"timestamp": "2025-02-10T22:01:57.094468",
"config": {
"size": 50,
"seed": 42
}
},
{
"dataset_name": "spell_backward",
"model": "google/gemini-2.0-flash-001",
"average_score": 0.4512,
"total_examples": 50,
"timestamp": "2025-02-10T22:01:58.325957",
"config": {
"size": 50,
"seed": 42
}
}
]