first commit

2026-04-22 16:48:57 +00:00 · 2025-04-29 12:10:10 -07:00 · 2025-04-29 12:10:10 -07:00 · 621d00dd80
commit 621d00dd80
89 changed files with 15315 additions and 0 deletions
--- a/environments/dataset_environment/configs/dataset_local.yaml
+++ b/environments/dataset_environment/configs/dataset_local.yaml
@@ -0,0 +1,52 @@
+# Local test configuration for the dataset environment.
+
+# Environment-level settings (top-level keys).
+rollout_server_url: 'http://localhost:8000'
+tokenizer_name: 'NousResearch/DeepHermes-3-Llama-3-8B-Preview'
+max_token_length: 4096
+total_steps: 1
+steps_per_eval: 5
+batch_size: 1
+group_size: 1
+max_num_workers: 1
+ensure_scores_are_not_same: false
+use_wandb: false
+wandb_name: 'dataset_test_local'
+
+# Settings nested under the `dataset` key
+dataset:
+  # Dataset source and field names
+  dataset_name: 'gsm8k'  # example dataset
+  dataset_config: 'main'
+  split: 'train'
+  answer_field: 'answer'
+  prompt_field: 'question'
+
+  # Prompting and generation
+  prefill: "<think>\n"
+  system_prompt: "You are an expert mathematician. You need to solve the given math problem step-by-step, showing your reasoning clearly. You should enclose your thoughts and internal monologue inside <think> </think> tags, and then provide your final answer.\n\nFollow these steps:\n1. Understand the problem carefully\n2. Plan your approach\n3. Execute the calculations step-by-step\n4. Verify your solution\n\nYou may use extremely long chains of thought to deeply consider the problem and deliberate with yourself via systematic reasoning processes to help come to a correct solution."
+  max_generations_per_prompt: 1
+  shuffle_dataset: true
+
+  # Completion parameters (sampling)
+  top_p: 0.9
+  temperature: 0.7
+
+  # Generation length
+  min_tokens: 0
+  max_tokens: 4096
+  length_warmup_steps: 0
+
+  # Reward functions and their weights
+  reward_functions:
+    - 'accuracy'
+    - 'format'
+  format_reward_weight: 0.2
+  accuracy_reward_weight: 1.0
+
+
+# Server configuration (the API key is a placeholder, not a literal secret)
+server_configs:
+  - model_name: "gpt-4.1-nano"
+    api_key: ${OPENAI_API_KEY}  # NOTE(review): presumably expanded from the environment by the config loader — confirm
+    timeout: 600
--- a/environments/dataset_environment/configs/gsm8k.yaml
+++ b/environments/dataset_environment/configs/gsm8k.yaml
@@ -0,0 +1,73 @@
+rollout_server_url: 'http://localhost:8000'
+tokenizer_name: 'NousResearch/DeepHermes-3-Llama-3-1B-Preview'
+use_local_agents: true
+use_wandb: true
+# Step, batch, group, and worker settings
+total_steps: 1000
+steps_per_eval: 100
+batch_size: 1024
+max_batches_offpolicy: 3
+group_size: 8
+max_num_workers: 256
+max_eval_workers: 16
+
+dataset:
+  # Dataset source and field names
+  dataset_name: 'gsm8k'
+  dataset_config: 'main'
+  split: 'train'
+  answer_field: 'answer'
+  prompt_field: 'question'
+  # Prompting and generation options
+  system_prompt: 'You are a deep thinking AI, you may use extremely long chains of thought to deeply consider the problem and deliberate with yourself via systematic reasoning processes to help come to a correct solution prior to answering. You should enclose your thoughts and internal monologue inside <think> </think> tags, and then provide your solution or response to the problem.'
+  include_messages_in_scoring: false
+  max_generations_per_prompt: 1
+  shuffle_dataset: true
+
+  # Reward functions in the structured format: each entry has a type, a weight, and per-type params
+  reward_functions:
+    - type: "r1"
+      weight: 1.5  # NOTE(review): presumably the multiplier applied when rewards are combined — confirm
+      params:
+        format_weight: 0.5
+        accuracy_weight: 1.0
+    - type: "cosine_scaled"
+      weight: 0.8
+      params:
+        scale_factor: 1.2
+        min_reward: -1.0
+        max_reward: 1.0
+    - type: "accuracy"
+      weight: 2.0
+      params:
+        split_on_think_tag: true
+    - type: "format"
+      weight: 0.7
+      params:
+        preferred_tags: ["think", "reasoning"]
+        require_all_tags: false
+    - type: "reasoning_steps"
+      weight: 1.0
+      params:
+        min_steps: 3
+    - type: "repetition_penalty"
+      weight: 0.5
+      params:
+        threshold: 0.1
+
+  # Legacy format (a plain list of names under `reward_funcs`), still supported for backward compatibility
+  # reward_funcs:
+  #   - "r1_reward"
+  #   - "cosine_scaled_reward"
+  #   - "accuracy_reward"
+  #   - "format_reward"
+  #   - "reasoning_steps_reward"
+  #   - "repetition_penalty_reward"
+
+  max_tokens: 16000
+  length_warmup_steps: 100
+  min_tokens: 2048
+  # Evaluation dataset settings
+  eval_dataset_name: "gsm8k"
+  eval_dataset_config: "main"
+  eval_split: "test"
--- a/environments/dataset_environment/configs/gsm8k_debug.yaml
+++ b/environments/dataset_environment/configs/gsm8k_debug.yaml
@@ -0,0 +1,30 @@
+rollout_server_url: 'http://localhost:8000'
+tokenizer_name: 'NousResearch/DeepHermes-3-Llama-3-8B-Preview'
+use_wandb: false
+total_steps: 100
+batch_size: 1
+group_size: 1
+max_num_workers: 1
+max_eval_workers: 0
+
+dataset:
+  # Dataset source and field names
+  dataset_name: 'gsm8k'
+  dataset_config: 'main'
+  split: 'train'
+  answer_field: 'answer'
+  prompt_field: 'question'
+  # Prompting and generation options
+  system_prompt: 'You are a deep thinking AI, you may use extremely long chains of thought to deeply consider the problem and deliberate with yourself via systematic reasoning processes to help come to a correct solution prior to answering. You should enclose your thoughts and internal monologue inside <think> </think> tags, and then provide your solution or response to the problem.'
+  include_messages_in_scoring: true
+  max_generations_per_prompt: 1
+  shuffle_dataset: true
+
+  # Two reward functions are enabled for testing (legacy `reward_funcs` list)
+  reward_funcs:
+    - 'accuracy_reward'
+    - 'format_reward'
+
+  min_tokens: 200
+  max_tokens: 4096
+  length_warmup_steps: 0