# Settings under the `env` key.
# NOTE(review): the grouping comments below are inferred from key names only;
# the exact meaning of each key is defined by the consuming code — confirm there.
env:
  # Run naming and sizing limits.
  wandb_name: 'bleuberi'
  group_size: 4
  max_token_length: 2048
  max_num_workers_per_node: 8

  # Step counts and message handling.
  steps_per_eval: 100
  total_steps: 1000
  include_messages: true

  # Dataset selection.
  dataset_name: 'allenai/tulu-3-sft-mixture'
  dataset_split: 'train'
  selection_mode: 'hard'
  num_examples: 5000

  # Reward functions and reference models (both are lists).
  reward_funcs:
    - 'bleu'
  ref_models:
    # Use ground truth as reference
    - 'gold'

# Settings under the `openai` key: endpoint, model name, credential, and
# generation parameters.
# NOTE(review): how each value is consumed is not visible here — confirm
# against the code that loads this section.
openai:
  # Endpoint on localhost, port 8000.
  base_url: 'http://localhost:8000/v1'
  model: 'Llama-3.1-8B-Instruct'
  # Placeholder string, not a real credential; keep real keys out of this file.
  api_key: 'PLACEHOLDER'
  # Numeric generation parameters (left unquoted so they stay numbers).
  temperature: 0.7
  max_tokens: 1024
  top_p: 0.95