diff --git a/eval/.gitignore b/eval/.gitignore new file mode 100644 index 00000000..fbca2253 --- /dev/null +++ b/eval/.gitignore @@ -0,0 +1 @@ +results/ diff --git a/eval/eval.py b/eval/eval.py index bca4c3e5..94ecfe38 100755 --- a/eval/eval.py +++ b/eval/eval.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python import argparse import asyncio import json diff --git a/eval/yaml/example.yaml b/eval/yaml/example.yaml index 44dd4154..0722bd8e 100644 --- a/eval/yaml/example.yaml +++ b/eval/yaml/example.yaml @@ -1,4 +1,5 @@ model: anthropic/claude-3.7-sonnet # find model id: https://openrouter.ai/models +provider: Anthropic category: test datasets: - YOUR_DATASET_NAME diff --git a/eval/yaml/llama-3.3-70b-instruct/algebra.yaml b/eval/yaml/llama-3.3-70b-instruct/algebra.yaml new file mode 100644 index 00000000..13da48ea --- /dev/null +++ b/eval/yaml/llama-3.3-70b-instruct/algebra.yaml @@ -0,0 +1,14 @@ +model: meta-llama/llama-3.3-70b-instruct +provider: Hyperbolic +category: algebra +datasets: + - intermediate_integration + - polynomial_equations + - polynomial_multiplication + - simple_equations + - simple_integration + - complex_arithmetic +eval_dir: results/llama-3.3-70b-instruct +dataset_size: 50 +dataset_seed: 42 +developer_role: system diff --git a/eval/yaml/llama-3.3-70b-instruct/algorithmic.yaml b/eval/yaml/llama-3.3-70b-instruct/algorithmic.yaml new file mode 100644 index 00000000..5291bc7d --- /dev/null +++ b/eval/yaml/llama-3.3-70b-instruct/algorithmic.yaml @@ -0,0 +1,41 @@ +model: meta-llama/llama-3.3-70b-instruct +provider: Hyperbolic +category: algorithmic +datasets: + - ab + - base_conversion + - binary_alternation + - binary_matrix + - caesar_cipher + - count_primes + - cryptarithm + - game_of_life + - graph_color + - group_anagrams + - isomorphic_strings + - jugs + - letter_counting + - letter_jumble + - manipulate_matrix + - number_filtering + - number_sorting + - palindrome + - palindrome_partitioning + - pool_matrix + - ransom_note + - rotate_matrix + - rotten_oranges + - sentence_reordering + - spell_backward + - spiral_matrix + - string_insertion + - string_manipulation + - string_splitting + - string_synthesis + - word_ladder + - word_sequence_reversal + - word_sorting +eval_dir: results/llama-3.3-70b-instruct +dataset_size: 50 +dataset_seed: 45 +developer_role: system diff --git a/eval/yaml/r1/algebra.yaml b/eval/yaml/r1/algebra.yaml index cdad5982..a4939285 100644 --- a/eval/yaml/r1/algebra.yaml +++ b/eval/yaml/r1/algebra.yaml @@ -1,4 +1,5 @@ model: deepseek/deepseek-r1 +provider: Nebius category: algebra datasets: - intermediate_integration diff --git a/eval/yaml/r1/algorithmic.yaml b/eval/yaml/r1/algorithmic.yaml index 9db9c370..55374af6 100644 --- a/eval/yaml/r1/algorithmic.yaml +++ b/eval/yaml/r1/algorithmic.yaml @@ -1,4 +1,5 @@ model: deepseek/deepseek-r1 +provider: Nebius category: algorithmic datasets: - ab diff --git a/eval/yaml/r1/cognition.yaml b/eval/yaml/r1/cognition.yaml index c15d7087..dec09f72 100644 --- a/eval/yaml/r1/cognition.yaml +++ b/eval/yaml/r1/cognition.yaml @@ -1,4 +1,5 @@ model: deepseek/deepseek-r1 +provider: Nebius category: cognition datasets: - color_cube_rotation diff --git a/eval/yaml/r1/logic.yaml b/eval/yaml/r1/logic.yaml index 57cd05d6..45eef787 100644 --- a/eval/yaml/r1/logic.yaml +++ b/eval/yaml/r1/logic.yaml @@ -1,4 +1,5 @@ model: deepseek/deepseek-r1 +provider: Nebius category: logic datasets: - propositional_logic