Integrate BLEUBERI as a submodule with direct import of reference-based reward functions.

2026-04-19 12:57:58 +00:00 · 2025-06-08 19:21:14 -05:00 · 2025-06-08 19:21:14 -05:00 · a520f5f663
commit a520f5f663
parent 5bb5bd2c3d
4 changed files with 270 additions and 70 deletions
--- a/environments/bleuberi/configs/default.yaml
+++ b/environments/bleuberi/configs/default.yaml
@@ -6,15 +6,32 @@ env:
  steps_per_eval: 100
  total_steps: 1000
  include_messages: true
+
+  # Dataset configuration
  dataset_name: "allenai/tulu-3-sft-mixture"
  dataset_split: "train"
  selection_mode: "hard"
  num_examples: 5000
+  cache_dir: null
+  streaming: false
+  shuffle: true
+
+  # Reward configuration
  reward_funcs:
    - "bleu"
  ref_models:
    - "gold"  # Use ground truth as reference

+  # System prompt configuration
+  system_prompt: "You are a deep thinking AI, you may use extremely long chains of thought to deeply consider the problem and deliberate with yourself via systematic reasoning processes to help come to a correct solution prior to answering. You should enclose your thoughts and internal monologue inside <think> </think> tags, and then provide your solution or response to the problem. After your thinking, make sure to clearly provide your final answer inside <answer></answer> tags."
+
+  # Seeds and evaluation
+  seed: 42
+  eval_seed: 123
+  num_eval_samples_per_task: 5
+  eval_limit_ratio: 0.1
+  reasoning: true
+
 openai:
  base_url: "http://localhost:8000/v1"
  model: "Llama-3.1-8B-Instruct"