[eval-basic] initial scripts for evaluating models on reasoning gym

This commit is contained in:
rishabhranawat 2025-02-09 22:36:27 -08:00
parent 8c4400b18a
commit 75cfd31ec2
11 changed files with 1306 additions and 0 deletions

View file

@ -0,0 +1,39 @@
[
{
"dataset_name": "letter_counting",
"model": "google/gemini-2.0-flash-001",
"average_score": 0.20600000000000002,
"total_examples": 10,
"timestamp": "2025-02-10T06:34:37.091554",
"config": {
"min_words": 5,
"max_words": 15,
"size": 10,
"seed": 42
}
},
{
"dataset_name": "propositional_logic",
"model": "google/gemini-2.0-flash-001",
"average_score": 0.059,
"total_examples": 10,
"timestamp": "2025-02-10T06:35:11.432275",
"config": {
"size": 10,
"seed": 42
}
},
{
"dataset_name": "leg_counting",
"model": "google/gemini-2.0-flash-001",
"average_score": 0.40199999999999997,
"total_examples": 10,
"timestamp": "2025-02-10T06:35:27.087469",
"config": {
"min_animals": 3,
"max_animals": 8,
"size": 10,
"seed": 42
}
}
]