diff --git a/environments/game_environments/gymnasium/blackjack_env.py b/environments/game_environments/gymnasium/blackjack_env.py index 5aae4206..0c761d2f 100644 --- a/environments/game_environments/gymnasium/blackjack_env.py +++ b/environments/game_environments/gymnasium/blackjack_env.py @@ -11,6 +11,7 @@ Uses Monte Carlo sampling to estimate the value of the current state, similar to import json import logging import random +import re from typing import Any, Dict, List, Optional, Tuple import gymnasium @@ -143,29 +144,27 @@ class BlackjackEnv(BaseEnv): Calculates a score for a single agent response based purely on environment reward and a penalty for invalid action format. """ - current_env_reward = env_reward - + current_env_reward = env_reward * 1.0 + # Action is good? if parsed_action == -1: - current_env_reward -= 0.5 - logger.debug( - f"[_score_response Seed: {episode_seed}] Penalty applied to env_reward for " - f"invalid action format (-0.5). Current env_reward: {current_env_reward:.4f}" - ) - - final_score = current_env_reward - - logger.debug( - f"[_score_response Seed: {episode_seed}] Score Calculation: " - f"EnvReward(raw): {env_reward:.4f}, EnvReward(adj for invalid): {current_env_reward:.4f} " - f"==> Final Score (from env): {final_score:.4f}" - ) - # Try to get a valid tool call from the response - tool_call = self._parse_tool_call(response_text) - if tool_call == -1: - final_score -= 0.5 + current_env_reward -= 0.2 else: - final_score += 0.5 - return final_score + current_env_reward += 0.2 + + # Check the thinking tags exist, with valid content + # 1 and only 1 thinking tag + match = re.search(r"(.*?)", response_text) + if match: + thinking_content = match.group(1) + if thinking_content: + current_env_reward += 0.2 + # Check there's actually valid content (not just whitespace) + if not thinking_content.strip(): + current_env_reward -= 0.2 + else: + current_env_reward -= 0.2 + + return current_env_reward def _parse_tool_call(self, response: str) -> int: if not response: