Adding thinking reward

This commit is contained in:
Shannon Sands 2025-05-10 19:50:30 +10:00
parent 840ff20921
commit a049dde6b1

View file

@ -11,6 +11,7 @@ Uses Monte Carlo sampling to estimate the value of the current state, similar to
import json
import logging
import random
import re
from typing import Any, Dict, List, Optional, Tuple
import gymnasium
@ -143,29 +144,27 @@ class BlackjackEnv(BaseEnv):
Calculates a score for a single agent response based purely on environment reward
and a penalty for invalid action format.
"""
current_env_reward = env_reward
current_env_reward = env_reward * 1.0
# Action is good?
if parsed_action == -1:
current_env_reward -= 0.5
logger.debug(
f"[_score_response Seed: {episode_seed}] Penalty applied to env_reward for "
f"invalid action format (-0.5). Current env_reward: {current_env_reward:.4f}"
)
final_score = current_env_reward
logger.debug(
f"[_score_response Seed: {episode_seed}] Score Calculation: "
f"EnvReward(raw): {env_reward:.4f}, EnvReward(adj for invalid): {current_env_reward:.4f} "
f"==> Final Score (from env): {final_score:.4f}"
)
# Try to get a valid tool call from the response
tool_call = self._parse_tool_call(response_text)
if tool_call == -1:
final_score -= 0.5
current_env_reward -= 0.2
else:
final_score += 0.5
return final_score
current_env_reward += 0.2
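# Check that a <think>...</think> block exists and has non-whitespace content.
# NOTE(review): re.search only finds the first block, so "exactly one thinking tag" is not
# enforced here; without re.DOTALL, a thinking block that spans multiple lines will not match.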
match = re.search(r"<think>(.*?)</think>", response_text)
if match:
thinking_content = match.group(1)
if thinking_content:
current_env_reward += 0.2
# Check there's actually valid content (not just whitespace)
if not thinking_content.strip():
current_env_reward -= 0.2
else:
current_env_reward -= 0.2
return current_env_reward
def _parse_tool_call(self, response: str) -> int:
if not response: