atropos/environments/letter_counting_environment/config.yaml
2026-01-02 14:10:02 +00:00

72 lines
2.4 KiB
YAML

# Tinker-Atropos Configuration - Letter Counting Environment
# This environment uses adaptive difficulty (curriculum learning) with 10 tiers
# Evaluation uses static dataset from HuggingFace: NousResearch/Letter-Counting-Eval
# Environment configuration
env:
  # Base environment config
  group_size: 8
  batch_size: 256
  max_batches_offpolicy: 3
  tokenizer_name: "Qwen/Qwen3-8B"
  use_wandb: true
  rollout_server_url: "http://localhost:8000"
  wandb_name: "letter-counting-env"
  ensure_scores_are_not_same: true
  max_token_length: 8192
  max_num_workers: 24
  worker_timeout: 3600  # 1 hour - needed for high difficulty levels
  total_steps: 5000
  steps_per_eval: 5
  inference_weight: 1.0
  data_path_to_save_groups: null
  eval_limit_ratio: 0.1
  # Generation configuration
  generation_temperature: 1.0
  eval_temperature: 0.6
  max_generation_tokens: 15360
  # Training filtering (CRITICAL for stable training):
  # - Groups with >80% success rate are SKIPPED (too easy, no learning signal)
  # - Groups with <20% success rate are SKIPPED (too hard, no learning signal)
  # - Groups with all identical scores are SKIPPED (no variance)
  difficulty_window_size: 150  # Number of recent groups to track (larger = more stable)
  difficulty_increase_threshold: 0.8  # Increase difficulty if success rate > this (also skip group)
  difficulty_decrease_threshold: 0.2  # Decrease difficulty if success rate < this (also skip group)
  min_difficulty_level: 1  # Minimum difficulty (1 = easiest)
  max_difficulty_level: 10  # Maximum difficulty (10 = 500 chars, 50 letters)
  starting_difficulty_level: 4  # Start at medium difficulty
  # Logging configuration
  debug_logging: true
  suppress_base_env_logs: true
  # Data dumping configuration (for creating offline training datasets)
  dump_rollouts: false
  dump_batch_size: 100
# OpenAI-compatible server configuration
openai:
  - model_name: "Qwen/Qwen3-8B"
    base_url: "http://localhost:8001/v1"
    api_key: "x"
    weight: 1.0
    num_requests_for_eval: 256
# Tinker-specific example configuration
tinker:
  lora_rank: 32
  learning_rate: 0.00004
  max_token_trainer_length: 16864
  checkpoint_dir: "./temp/"
  save_checkpoint_interval: 0  # NOTE(review): 0 presumably disables interval checkpointing — confirm against trainer docs
  # Wandb configuration for trainer
  wandb_project: "tinker-letter-counting"
  wandb_group: null
  wandb_run_name: "tinker-letter-counting-run"
# Standard Atropos flags
slurm: false  # presumably toggles Slurm cluster integration — confirm against Atropos docs
testing: false  # presumably enables a test/dry-run mode — confirm against Atropos docs