diff --git a/eval/.gitignore b/eval/.gitignore
new file mode 100644
index 00000000..fbca2253
--- /dev/null
+++ b/eval/.gitignore
@@ -0,0 +1 @@
+results/
diff --git a/eval/eval.py b/eval/eval.py
index bca4c3e5..94ecfe38 100755
--- a/eval/eval.py
+++ b/eval/eval.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 import argparse
 import asyncio
 import json
diff --git a/eval/yaml/example.yaml b/eval/yaml/example.yaml
index 44dd4154..0722bd8e 100644
--- a/eval/yaml/example.yaml
+++ b/eval/yaml/example.yaml
@@ -1,4 +1,5 @@
 model: anthropic/claude-3.7-sonnet  # find model id: https://openrouter.ai/models
+provider: Anthropic
 category: test
 datasets:
   - YOUR_DATASET_NAME
diff --git a/eval/yaml/llama-3.3-70b-instruct/algebra.yaml b/eval/yaml/llama-3.3-70b-instruct/algebra.yaml
new file mode 100644
index 00000000..13da48ea
--- /dev/null
+++ b/eval/yaml/llama-3.3-70b-instruct/algebra.yaml
@@ -0,0 +1,14 @@
+model: meta-llama/llama-3.3-70b-instruct
+provider: Hyperbolic
+category: algebra
+datasets:
+  - intermediate_integration
+  - polynomial_equations
+  - polynomial_multiplication
+  - simple_equations
+  - simple_integration
+  - complex_arithmetic
+eval_dir: results/llama-3.3-70b-instruct
+dataset_size: 50
+dataset_seed: 42
+developer_role: system
diff --git a/eval/yaml/llama-3.3-70b-instruct/algorithmic.yaml b/eval/yaml/llama-3.3-70b-instruct/algorithmic.yaml
new file mode 100644
index 00000000..5291bc7d
--- /dev/null
+++ b/eval/yaml/llama-3.3-70b-instruct/algorithmic.yaml
@@ -0,0 +1,41 @@
+model: meta-llama/llama-3.3-70b-instruct
+provider: Hyperbolic
+category: algorithmic
+datasets:
+  - ab
+  - base_conversion
+  - binary_alternation
+  - binary_matrix
+  - caesar_cipher
+  - count_primes
+  - cryptarithm
+  - game_of_life
+  - graph_color
+  - group_anagrams
+  - isomorphic_strings
+  - jugs
+  - letter_counting
+  - letter_jumble
+  - manipulate_matrix
+  - number_filtering
+  - number_sorting
+  - palindrome
+  - palindrome_partitioning
+  - pool_matrix
+  - ransom_note
+  - rotate_matrix
+  - rotten_oranges
+  - sentence_reordering
+  - spell_backward
+  - spiral_matrix
+  - string_insertion
+  - string_manipulation
+  - string_splitting
+  - string_synthesis
+  - word_ladder
+  - word_sequence_reversal
+  - word_sorting
+eval_dir: results/llama-3.3-70b-instruct
+dataset_size: 50
+dataset_seed: 45
+developer_role: system
diff --git a/eval/yaml/r1/algebra.yaml b/eval/yaml/r1/algebra.yaml
index cdad5982..a4939285 100644
--- a/eval/yaml/r1/algebra.yaml
+++ b/eval/yaml/r1/algebra.yaml
@@ -1,4 +1,5 @@
 model: deepseek/deepseek-r1
+provider: Nebius
 category: algebra
 datasets:
   - intermediate_integration
diff --git a/eval/yaml/r1/algorithmic.yaml b/eval/yaml/r1/algorithmic.yaml
index 9db9c370..55374af6 100644
--- a/eval/yaml/r1/algorithmic.yaml
+++ b/eval/yaml/r1/algorithmic.yaml
@@ -1,4 +1,5 @@
 model: deepseek/deepseek-r1
+provider: Nebius
 category: algorithmic
 datasets:
   - ab
diff --git a/eval/yaml/r1/cognition.yaml b/eval/yaml/r1/cognition.yaml
index c15d7087..dec09f72 100644
--- a/eval/yaml/r1/cognition.yaml
+++ b/eval/yaml/r1/cognition.yaml
@@ -1,4 +1,5 @@
 model: deepseek/deepseek-r1
+provider: Nebius
 category: cognition
 datasets:
   -  color_cube_rotation
diff --git a/eval/yaml/r1/logic.yaml b/eval/yaml/r1/logic.yaml
index 57cd05d6..45eef787 100644
--- a/eval/yaml/r1/logic.yaml
+++ b/eval/yaml/r1/logic.yaml
@@ -1,4 +1,5 @@
 model: deepseek/deepseek-r1
+provider: Nebius
 category: logic
 datasets:
   - propositional_logic