diff --git a/eval/eval.py b/eval/eval.py old mode 100644 new mode 100755 diff --git a/eval/requirements-eval.txt b/eval/requirements-eval.txt index cfd4254e..91567b6a 100644 --- a/eval/requirements-eval.txt +++ b/eval/requirements-eval.txt @@ -1,3 +1,2 @@ -openai>=1.64.0 aiohttp>=3.11.13 tenacity>=9.0.0 diff --git a/eval/yaml/r1/algebra.yaml b/eval/yaml/r1/algebra.yaml index b95c1e31..cdad5982 100644 --- a/eval/yaml/r1/algebra.yaml +++ b/eval/yaml/r1/algebra.yaml @@ -7,7 +7,7 @@ datasets: - simple_equations - simple_integration - complex_arithmetic -eval_dir: eval/r1 +eval_dir: results/r1 dataset_size: 50 dataset_seed: 42 developer_role: system diff --git a/eval/yaml/r1/cognition.yaml b/eval/yaml/r1/cognition.yaml index 911a92e5..c15d7087 100644 --- a/eval/yaml/r1/cognition.yaml +++ b/eval/yaml/r1/cognition.yaml @@ -5,7 +5,7 @@ datasets: - figlet_font - number_sequence - rubiks_cube -eval_dir: eval/r1 +eval_dir: results/r1 dataset_size: 50 dataset_seed: 42 developer_role: system diff --git a/eval/yaml/r1/logic.yaml b/eval/yaml/r1/logic.yaml index 400c4ff3..57cd05d6 100644 --- a/eval/yaml/r1/logic.yaml +++ b/eval/yaml/r1/logic.yaml @@ -5,7 +5,7 @@ datasets: - self_reference - syllogism - zebra_puzzles -eval_dir: eval/r1 +eval_dir: results/r1 dataset_size: 50 dataset_seed: 42 developer_role: system diff --git a/eval/yaml/r1/test.yaml b/eval/yaml/r1/test.yaml new file mode 100644 index 00000000..1660fd82 --- /dev/null +++ b/eval/yaml/r1/test.yaml @@ -0,0 +1,8 @@ +model: deepseek/deepseek-r1 +category: test +datasets: + - YOUR_DATASET_NAME +eval_dir: results/r1 +dataset_size: 10 +dataset_seed: 42 +developer_role: system