Add llama-3.3-70b-instruct eval configs for the algebra and algorithmic categories

2026-04-27 17:23:19 +00:00 · 2025-02-25 23:07:46 +01:00 · 2025-02-25 23:07:46 +01:00 · 6d5168d1e5
commit 6d5168d1e5
parent 92c8be1699
9 changed files with 62 additions and 0 deletions
--- a/eval/.gitignore
+++ b/eval/.gitignore
@ -0,0 +1 @@
 results/
--- a/eval/eval.py
+++ b/eval/eval.py
@ -1,3 +1,4 @@
 #!/usr/bin/env python
 import argparse
 import asyncio
 import json
--- a/eval/yaml/example.yaml
+++ b/eval/yaml/example.yaml
@ -1,4 +1,5 @@
 model: anthropic/claude-3.7-sonnet  # find model id: https://openrouter.ai/models
 provider: Anthropic
 category: test
 datasets:
  - YOUR_DATASET_NAME
--- a/eval/yaml/llama-3.3-70b-instruct/algebra.yaml
+++ b/eval/yaml/llama-3.3-70b-instruct/algebra.yaml
@ -0,0 +1,14 @@
 model: meta-llama/llama-3.3-70b-instruct
 provider: Hyperbolic
 category: algebra
 datasets:
  - intermediate_integration
  - polynomial_equations
  - polynomial_multiplication
  - simple_equations
  - simple_integration
  - complex_arithmetic
 eval_dir: results/llama-3.3-70b-instruct
 dataset_size: 50
 dataset_seed: 42
 developer_role: system
--- a/eval/yaml/llama-3.3-70b-instruct/algorithmic.yaml
+++ b/eval/yaml/llama-3.3-70b-instruct/algorithmic.yaml
@ -0,0 +1,41 @@
 model: meta-llama/llama-3.3-70b-instruct
 provider: Hyperbolic
 category: algorithmic
 datasets:
  - ab
  - base_conversion
  - binary_alternation
  - binary_matrix
  - caesar_cipher
  - count_primes
  - cryptarithm
  - game_of_life
  - graph_color
  - group_anagrams
  - isomorphic_strings
  - jugs
  - letter_counting
  - letter_jumble
  - manipulate_matrix
  - number_filtering
  - number_sorting
  - palindrome
  - palindrome_partitioning
  - pool_matrix
  - ransom_note
  - rotate_matrix
  - rotten_oranges
  - sentence_reordering
  - spell_backward
  - spiral_matrix
  - string_insertion
  - string_manipulation
  - string_splitting
  - string_synthesis
  - word_ladder
  - word_sequence_reversal
  - word_sorting
 eval_dir: results/llama-3.3-70b-instruct
 dataset_size: 50
 dataset_seed: 45
 developer_role: system
--- a/eval/yaml/r1/algebra.yaml
+++ b/eval/yaml/r1/algebra.yaml
@ -1,4 +1,5 @@
 model: deepseek/deepseek-r1
 provider: Nebius
 category: algebra
 datasets:
  - intermediate_integration
--- a/eval/yaml/r1/algorithmic.yaml
+++ b/eval/yaml/r1/algorithmic.yaml
@ -1,4 +1,5 @@
 model: deepseek/deepseek-r1
 provider: Nebius
 category: algorithmic
 datasets:
  - ab
--- a/eval/yaml/r1/cognition.yaml
+++ b/eval/yaml/r1/cognition.yaml
@ -1,4 +1,5 @@
 model: deepseek/deepseek-r1
 provider: Nebius
 category: cognition
 datasets:
  -  color_cube_rotation
--- a/eval/yaml/r1/logic.yaml
+++ b/eval/yaml/r1/logic.yaml
@ -1,4 +1,5 @@
 model: deepseek/deepseek-r1
 provider: Nebius
 category: logic
 datasets:
  - propositional_logic