diff --git a/atroposlib/envs/server_handling/openai_server.py b/atroposlib/envs/server_handling/openai_server.py index f99c14e2..54f03fb4 100644 --- a/atroposlib/envs/server_handling/openai_server.py +++ b/atroposlib/envs/server_handling/openai_server.py @@ -234,4 +234,16 @@ def resolve_openai_configs( server_configs = [final_openai_config] else: server_configs = [final_openai_config] + + if isinstance(server_configs, list): + logger.info( + "resolve_openai_configs returning %s config(s) with URLs: %s", + len(server_configs), + [getattr(c, "base_url", None) for c in server_configs], + ) + else: + logger.info( + "resolve_openai_configs returning %s", + type(server_configs).__name__, + ) return server_configs diff --git a/atroposlib/envs/server_handling/vllm_server.py b/atroposlib/envs/server_handling/vllm_server.py index cc5bf9a5..1c9cb24d 100644 --- a/atroposlib/envs/server_handling/vllm_server.py +++ b/atroposlib/envs/server_handling/vllm_server.py @@ -424,6 +424,10 @@ def resolve_openai_configs( elif isinstance(default_server_configs, list): server_configs = [final_openai_config] else: + logger.warning( + f"Unexpected type for default_server_configs: {type(default_server_configs)}. " + "Proceeding with single OpenAI server configuration based on merged settings." + ) server_configs = [final_openai_config] return server_configs diff --git a/environments/gsm8k_server.py b/environments/gsm8k_server.py index f8437f7b..2cb74795 100644 --- a/environments/gsm8k_server.py +++ b/environments/gsm8k_server.py @@ -1,4 +1,3 @@ -import logging import random import time from typing import Dict, List, Optional, Tuple, TypedDict, Union @@ -32,9 +31,6 @@ It is important that you provide your answer in the correct format. If you do not, you will not receive credit for your answer. So please end your answer with \\boxed{your answer here}""" -logger = logging.getLogger(__name__) - - class GSM8kRow(TypedDict): question: str answer: str @@ -353,9 +349,8 @@ class GSM8kEnv(BaseEnv): percentage_of_range = min(percentage_of_range, 1.0) # Apply linear penalty scaling from 1.0 down to 0.0 scores["scores"].append(1.0 - percentage_of_range) - # NOTE: identical-score filter disabled for testing. - # if all([scores["scores"][0] == score for score in scores["scores"]]): - # return None + if all([scores["scores"][0] == score for score in scores["scores"]]): + return None return scores else: # If the gold solution is not parseable, we return None