---
# Environment / training configuration.
# NOTE(review): the keys below were previously at column 0, so the bare `env:`
# key parsed as null and all settings landed at the document root; they are
# now nested under `env:` as the section header implies.
env:
  wandb_name: bleuberi
  group_size: 4
  max_token_length: 2048
  max_num_workers_per_node: 8
  steps_per_eval: 100
  total_steps: 1000
  include_messages: true

  # Dataset configuration
  dataset_name: "allenai/tulu-3-sft-mixture"
  dataset_split: "train"
  selection_mode: "hard"
  num_examples: 5000
  cache_dir: null
  streaming: false
  shuffle: true

  # Reward configuration
  reward_funcs:
    - "bleu"
  ref_models:
    - "gold"  # Use ground truth as reference

  # System prompt configuration
  # NOTE(review): the prompt read "inside tags" with no tag named — the
  # <think>/<answer> markers were apparently stripped by markup-unaware
  # processing; restored here. Confirm against the original prompt source.
  # Folded block scalar (>-): newlines fold to spaces, trailing newline
  # stripped, so the runtime value is a single line.
  system_prompt: >-
    You are a deep thinking AI, you may use extremely long chains of thought
    to deeply consider the problem and deliberate with yourself via
    systematic reasoning processes to help come to a correct solution prior
    to answering. You should enclose your thoughts and internal monologue
    inside <think> </think> tags, and then provide your solution or response
    to the problem. After your thinking, make sure to clearly provide your
    final answer inside <answer> </answer> tags.

  # Seeds and evaluation
  seed: 42
  eval_seed: 123
  num_eval_samples_per_task: 5
  eval_limit_ratio: 0.1
  reasoning: true
# OpenAI-compatible inference endpoint (e.g. a local vLLM server).
# NOTE(review): these keys were previously at column 0, so `openai:` parsed
# as null; they are now nested under the section header as intended.
openai:
  base_url: "http://localhost:8000/v1"
  model: "Llama-3.1-8B-Instruct"
  api_key: "PLACEHOLDER"  # replace at deploy time; never commit a real key
  temperature: 0.7
  max_tokens: 1024
  top_p: 0.95