diff --git a/eval/eval.py b/eval/eval.py
index df24ca53..72bf1140 100755
--- a/eval/eval.py
+++ b/eval/eval.py
@@ -24,13 +24,13 @@ logging.basicConfig(
 
 
 class OpenRouterEvaluator:
-    def __init__(self, model: str, config: EvalConfig):
+    def __init__(self, model: str, config: EvalConfig, api_key: str):
         self.logger = logging.getLogger(f"OpenRouterEvaluator.{model}")
         self.config = config
         self.output_dir = f"{config.eval_dir}/{config.category}"
         os.makedirs(self.output_dir, exist_ok=True)
         self.base_url = "https://openrouter.ai/api/v1/chat/completions"
-        self.api_key = os.getenv("OPENROUTER_API_KEY")
+        self.api_key = api_key
         self.model = model
         self.headers = {
             "Authorization": f"Bearer {self.api_key}",
@@ -98,6 +98,8 @@ class OpenRouterEvaluator:
             model_answer = extract_answer(response)
             score = dataset.score_answer(answer=model_answer, entry=entry)
 
+            print(f"answer: {model_answer}, score: {score}")
+
             return {
                 "question": entry["question"],
                 "expected_answer": str(entry["answer"]),
@@ -120,18 +122,23 @@ class OpenRouterEvaluator:
 
     async def evaluate_datasets(self) -> list[dict[str, Any]]:
         """Main async evaluation entry point."""
-        all_results = []
         async with aiohttp.ClientSession(headers=self.headers) as session:
             return await asyncio.gather(*(self.evaluate_dataset(session, name) for name in self.config.datasets))
 
 
 async def async_main():
+    api_key = os.getenv("OPENROUTER_API_KEY")
+    if not api_key:
+        print("Error: OPENROUTER_API_KEY environment variable is not set")
+        print("Please set it using: export OPENROUTER_API_KEY=your-api-key")
+        exit(1)
+
     parser = argparse.ArgumentParser(description="Evaluate models on reasoning datasets")
     parser.add_argument("--yaml", required=True, help="Path to YAML configuration file")
     args = parser.parse_args()
 
     config = EvalConfig.from_yaml(args.yaml)
-    evaluator = OpenRouterEvaluator(model=config.model, config=config)
+    evaluator = OpenRouterEvaluator(model=config.model, config=config, api_key=api_key)
     results = await evaluator.evaluate_datasets()
 
     output_dir = f"{config.eval_dir}/{config.category}"
diff --git a/eval/scripts/run_llama-3.3-70-instruct_all.sh b/eval/scripts/run_llama-3.3-70-instruct_all.sh
index 083bb86b..08b355ec 100755
--- a/eval/scripts/run_llama-3.3-70-instruct_all.sh
+++ b/eval/scripts/run_llama-3.3-70-instruct_all.sh
@@ -1,12 +1,12 @@
 #!/bin/bash
 # run this script from the parent directory
-./eval.py --yaml algebra.yaml
-./eval.py --yaml algorithmic.yaml
-./eval.py --yaml arc.yaml
-./eval.py --yaml arithmetic.yaml
-./eval.py --yaml code.yaml
-./eval.py --yaml cognition.yaml
-./eval.py --yaml games.yaml
-./eval.py --yaml geometry.yaml
-./eval.py --yaml graphs.yaml
-./eval.py --yaml logic.yaml
+./eval.py --yaml yaml/llama-3.3-70b-instruct/algebra.yaml
+./eval.py --yaml yaml/llama-3.3-70b-instruct/algorithmic.yaml
+./eval.py --yaml yaml/llama-3.3-70b-instruct/arc.yaml
+./eval.py --yaml yaml/llama-3.3-70b-instruct/arithmetic.yaml
+./eval.py --yaml yaml/llama-3.3-70b-instruct/code.yaml
+./eval.py --yaml yaml/llama-3.3-70b-instruct/cognition.yaml
+./eval.py --yaml yaml/llama-3.3-70b-instruct/games.yaml
+./eval.py --yaml yaml/llama-3.3-70b-instruct/geometry.yaml
+./eval.py --yaml yaml/llama-3.3-70b-instruct/graphs.yaml
+./eval.py --yaml yaml/llama-3.3-70b-instruct/logic.yaml