Adding thinking reward

2026-04-24 17:04:55 +00:00 · 2025-05-10 19:50:30 +10:00 · 2025-05-10 19:50:30 +10:00 · a049dde6b1
commit a049dde6b1
parent 840ff20921
1 changed files with 20 additions and 21 deletions
--- a/environments/game_environments/gymnasium/blackjack_env.py
+++ b/environments/game_environments/gymnasium/blackjack_env.py
@ -11,6 +11,7 @@ Uses Monte Carlo sampling to estimate the value of the current state, similar to
 import json
 import logging
 import random
+import re
 from typing import Any, Dict, List, Optional, Tuple

 import gymnasium
@ -143,29 +144,27 @@ class BlackjackEnv(BaseEnv):
        Calculates a score for a single agent response based purely on environment reward
        and a penalty for invalid action format.
        """
-        current_env_reward = env_reward
-
+        current_env_reward = env_reward * 1.0
+        # Penalize an invalid action format (parsed_action == -1); otherwise reward a valid action
        if parsed_action == -1:
-            current_env_reward -= 0.5
-            logger.debug(
-                f"[_score_response Seed: {episode_seed}] Penalty applied to env_reward for "
-                f"invalid action format (-0.5). Current env_reward: {current_env_reward:.4f}"
-            )
-
-        final_score = current_env_reward
-
-        logger.debug(
-            f"[_score_response Seed: {episode_seed}] Score Calculation: "
-            f"EnvReward(raw): {env_reward:.4f}, EnvReward(adj for invalid): {current_env_reward:.4f} "
-            f"==> Final Score (from env): {final_score:.4f}"
-        )
-        # Try to get a valid tool call from the response
-        tool_call = self._parse_tool_call(response_text)
-        if tool_call == -1:
-            final_score -= 0.5
+            current_env_reward -= 0.2
        else:
-            final_score += 0.5
-        return final_score
+            current_env_reward += 0.2
+
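+        # Check that a <think>...</think> pair exists and has non-whitespace content.
+        # NOTE: re.search only finds the first pair (it does not enforce exactly one), and "." does not match newlines without re.DOTALL.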
+        match = re.search(r"<think>(.*?)</think>", response_text)
+        if match:
+            thinking_content = match.group(1)
+            if thinking_content:
+                current_env_reward += 0.2
+            # Check there's actually valid content (not just whitespace)
+            if not thinking_content.strip():
+                current_env_reward -= 0.2
+        else:
+            current_env_reward -= 0.2
+
+        return current_env_reward

    def _parse_tool_call(self, response: str) -> int:
        if not response: