diff --git a/environments/pairwise_judgement_environment.py b/environments/pairwise_judgement_environment.py
index 8d573dcd..526cdbb8 100644
--- a/environments/pairwise_judgement_environment.py
+++ b/environments/pairwise_judgement_environment.py
@@ -225,45 +225,60 @@ class PairwiseJudgementEnv(BaseEnv):
         """Format text for debug output (first 100 + last 100 chars)."""
         if not text:
             return f"{label}: "
-        
+
         text_clean = text.strip()
         if len(text_clean) <= 200:
             return f"{label}: '{text_clean}'"
-        
+
         first_100 = text_clean[:100]
         last_100 = text_clean[-100:]
         return f"{label}: '{first_100}...{last_100}' (total {len(text_clean)} chars)"
 
-    def _log_full_debug_request(self, messages: List[Dict], params: Dict, category: str = "unknown", item_id: str = "unknown", context: str = ""):
+    def _log_full_debug_request(
+        self,
+        messages: List[Dict],
+        params: Dict,
+        category: str = "unknown",
+        item_id: str = "unknown",
+        context: str = "",
+    ):
         """Log full debug information for API requests."""
         if not self.config.full_debug:
             return
-        
+
         print(f"\nšŸ” FULL DEBUG - API REQUEST [{context}]")
         print(f"  Category: {category}")
         print(f"  Item ID: {item_id}")
         print(f"  Parameters: {params}")
-        
+
         for i, message in enumerate(messages):
            role = message.get("role", "unknown")
            content = message.get("content", "")
-            print(f"  Message {i+1} ({role}): {self._format_debug_text(content, 'Content')}")
+            print(
+                f"  Message {i+1} ({role}): {self._format_debug_text(content, 'Content')}"
+            )
 
     def _log_full_debug_response(self, completion, context: str = ""):
         """Log full debug information for API responses."""
         if not self.config.full_debug:
             return
-        
+
         print(f"\nšŸ” FULL DEBUG - API RESPONSE [{context}]")
-        
-        if hasattr(completion, 'usage'):
+
+        if hasattr(completion, "usage"):
             print(f"  Usage: {completion.usage}")
-        
-        if hasattr(completion, 'choices') and completion.choices:
+
+        if hasattr(completion, "choices") and completion.choices:
             for i, choice in enumerate(completion.choices):
-                content = choice.message.content if hasattr(choice, 'message') else ""
-                finish_reason = choice.finish_reason if hasattr(choice, 'finish_reason') else "unknown"
-                print(f"  Choice {i+1}: {self._format_debug_text(content, 'Response')}")
+                content = choice.message.content if hasattr(choice, "message") else ""
+                finish_reason = (
+                    choice.finish_reason
+                    if hasattr(choice, "finish_reason")
+                    else "unknown"
+                )
+                print(
+                    f"  Choice {i+1}: {self._format_debug_text(content, 'Response')}"
+                )
                 print(f"  Finish reason: {finish_reason}")
         else:
             print(f"  No choices in response")
@@ -408,12 +423,20 @@ class PairwiseJudgementEnv(BaseEnv):
 
         # Show debug mode status
         if self.config.full_debug:
-            print(f"\nšŸ” FULL DEBUG MODE ENABLED - Will log all API requests and responses")
-            print(f"  šŸ“Š Will show: category, item ID, first/last 100 chars of prompts and responses")
-            print(f"  āš™ļø Retry settings: max_retries={self.config.max_retries}, retry_delay={self.config.retry_delay}s")
+            print(
+                f"\nšŸ” FULL DEBUG MODE ENABLED - Will log all API requests and responses"
+            )
+            print(
+                f"  šŸ“Š Will show: category, item ID, first/last 100 chars of prompts and responses"
+            )
+            print(
+                f"  āš™ļø Retry settings: max_retries={self.config.max_retries}, retry_delay={self.config.retry_delay}s"
+            )
             print(f"  šŸ“ Min response length: {self.config.min_response_length} chars")
         else:
-            print(f"\nšŸ” Full debug mode disabled - Use full_debug=True to enable detailed logging")
+            print(
+                f"\nšŸ” Full debug mode disabled - Use full_debug=True to enable detailed logging"
+            )
 
         # Debug: Show sample evaluation item structure
         if len(self.test) > 0:
@@ -688,73 +711,98 @@ class PairwiseJudgementEnv(BaseEnv):
         # Retry logic for training trajectories
         max_retries = self.config.max_retries
         retry_delay = self.config.retry_delay
-        
+
         # Get category info for debug logging (this is synthetic training data)
         category = "synthetic_training"
         item_id = f"train_{self.iter if hasattr(self, 'iter') else 'unknown'}"
-        
+
         for attempt in range(max_retries):
             try:
                 # Log full debug request
                 self._log_full_debug_request(
-                    messages, completion_params, category, item_id,
-                    f"TRAINING attempt {attempt + 1}/{max_retries}"
+                    messages,
+                    completion_params,
+                    category,
+                    item_id,
+                    f"TRAINING attempt {attempt + 1}/{max_retries}",
                 )
-                
+
                 completions = await self.server.chat_completion(
                     messages=messages, **completion_params
                 )
-                
+
                 # Log full debug response
-                self._log_full_debug_response(completions, f"TRAINING attempt {attempt + 1}/{max_retries}")
+                self._log_full_debug_response(
+                    completions, f"TRAINING attempt {attempt + 1}/{max_retries}"
+                )
 
                 # Check if we got valid completions
                 if not completions.choices:
                     if attempt < max_retries - 1:
-                        print(f"DEBUG: No choices in collect_trajectories (attempt {attempt + 1}/{max_retries})")
+                        print(
+                            f"DEBUG: No choices in collect_trajectories (attempt {attempt + 1}/{max_retries})"
+                        )
                         await asyncio.sleep(retry_delay)
                         continue
                     else:
-                        print(f"DEBUG: No choices in collect_trajectories after {max_retries} attempts")
+                        print(
+                            f"DEBUG: No choices in collect_trajectories after {max_retries} attempts"
+                        )
                         return None, []
-                
+
                 # Check if any completion has None content
                 valid_completions = []
                 for completion_choice in completions.choices:
-                    if (completion_choice.message.content is not None
+                    if (
+                        completion_choice.message.content is not None
                         and isinstance(completion_choice.message.content, str)
-                        and len(completion_choice.message.content.strip()) >= self.config.min_response_length):
+                        and len(completion_choice.message.content.strip())
+                        >= self.config.min_response_length
+                    ):
                         valid_completions.append(completion_choice)
-                
+
                 # If we don't have enough valid completions, retry
-                if len(valid_completions) < len(completions.choices) // 2:  # If less than half are valid
+                if (
+                    len(valid_completions) < len(completions.choices) // 2
+                ):  # If less than half are valid
                     if attempt < max_retries - 1:
-                        print(f"DEBUG: Only {len(valid_completions)}/{len(completions.choices)} valid completions (attempt {attempt + 1}/{max_retries})")
+                        print(
+                            f"DEBUG: Only {len(valid_completions)}/{len(completions.choices)} valid completions (attempt {attempt + 1}/{max_retries})"
+                        )
                         await asyncio.sleep(retry_delay)
                         continue
                     else:
-                        print(f"DEBUG: Only {len(valid_completions)}/{len(completions.choices)} valid completions after {max_retries} attempts")
+                        print(
+                            f"DEBUG: Only {len(valid_completions)}/{len(completions.choices)} valid completions after {max_retries} attempts"
+                        )
                         # Continue with what we have
-                
+
                 # Build trajectories using valid completions
                 to_score = []
                 for completion_choice in valid_completions:
                     # Add assistant response to existing messages
                     trajectory_messages = messages + [
-                        {"role": "assistant", "content": completion_choice.message.content}
+                        {
+                            "role": "assistant",
+                            "content": completion_choice.message.content,
+                        }
                     ]
                     to_score.append((tuple(trajectory_messages), item[1]))
-                
+
                 # Success - we got at least some valid trajectories
                 break
-                
+
             except Exception as e:
                 if attempt < max_retries - 1:
-                    print(f"DEBUG: collect_trajectories API call failed (attempt {attempt + 1}/{max_retries}): {e}")
+                    print(
+                        f"DEBUG: collect_trajectories API call failed (attempt {attempt + 1}/{max_retries}): {e}"
+                    )
                     await asyncio.sleep(retry_delay)
                     continue
                 else:
-                    print(f"DEBUG: collect_trajectories API call failed after {max_retries} attempts: {e}")
+                    print(
+                        f"DEBUG: collect_trajectories API call failed after {max_retries} attempts: {e}"
+                    )
                     return None, []
 
         scored_data = await self.score(to_score)
@@ -871,29 +919,36 @@ class PairwiseJudgementEnv(BaseEnv):
         # Retry logic for failed API calls
         max_retries = self.config.max_retries
         retry_delay = self.config.retry_delay
-        
+
         # Get category and item info for debug logging
         category = test_item.get("subset", "unknown")
         item_id = test_item.get("id", "unknown")
-        
+
         for attempt in range(max_retries):
             try:
                 # Log full debug request
                 self._log_full_debug_request(
-                    messages, completion_params, category, item_id,
-                    f"CHOICE_EVAL attempt {attempt + 1}/{max_retries}"
+                    messages,
+                    completion_params,
+                    category,
+                    item_id,
+                    f"CHOICE_EVAL attempt {attempt + 1}/{max_retries}",
                 )
-                
+
                 completion = await self.server.chat_completion(
                     messages=messages, **completion_params
                 )
-                
+
                 # Log full debug response
-                self._log_full_debug_response(completion, f"CHOICE_EVAL attempt {attempt + 1}/{max_retries}")
+                self._log_full_debug_response(
+                    completion, f"CHOICE_EVAL attempt {attempt + 1}/{max_retries}"
+                )
 
                 if not completion.choices:
                     if attempt < max_retries - 1:
-                        print(f"DEBUG: No choices in completion (attempt {attempt + 1}/{max_retries})")
+                        print(
+                            f"DEBUG: No choices in completion (attempt {attempt + 1}/{max_retries})"
+                        )
                         await asyncio.sleep(retry_delay)
                         continue
                     else:
@@ -901,51 +956,69 @@ class PairwiseJudgementEnv(BaseEnv):
                         return {"score": 0.0, "sample": None}
 
                 model_response = completion.choices[0].message.content
-                
+
                 # Check for None content or very short responses (likely just EOS token)
                 if model_response is None:
                     if attempt < max_retries - 1:
-                        print(f"DEBUG: model_response is None (attempt {attempt + 1}/{max_retries})")
+                        print(
+                            f"DEBUG: model_response is None (attempt {attempt + 1}/{max_retries})"
+                        )
                         print(f"DEBUG: Completion: {completion}")
                         await asyncio.sleep(retry_delay)
                         continue
                     else:
-                        print(f"DEBUG: model_response is None after {max_retries} attempts")
+                        print(
+                            f"DEBUG: model_response is None after {max_retries} attempts"
+                        )
                         print(f"DEBUG: Final completion: {completion}")
                         return {"score": 0.0, "sample": None}
-                
+
                 if not isinstance(model_response, str):
                     if attempt < max_retries - 1:
-                        print(f"DEBUG: model_response is not a string. Type: {type(model_response)}, Value: {model_response} (attempt {attempt + 1}/{max_retries})")
+                        print(
+                            f"DEBUG: model_response is not a string. Type: {type(model_response)}, Value: {model_response} (attempt {attempt + 1}/{max_retries})"
+                        )
                         await asyncio.sleep(retry_delay)
                         continue
                     else:
-                        print(f"DEBUG: model_response is not a string after {max_retries} attempts. Type: {type(model_response)}, Value: {model_response}")
+                        print(
+                            f"DEBUG: model_response is not a string after {max_retries} attempts. Type: {type(model_response)}, Value: {model_response}"
+                        )
                         return {"score": 0.0, "sample": None}
-                
+
                 # Check for very short responses (likely just EOS token)
                 if len(model_response.strip()) < self.config.min_response_length:
                     if attempt < max_retries - 1:
-                        print(f"DEBUG: Very short response (likely EOS token only): '{model_response}' (attempt {attempt + 1}/{max_retries})")
-                        print(f"DEBUG: Completion tokens: {completion.usage.completion_tokens if hasattr(completion, 'usage') else 'unknown'}")
+                        print(
+                            f"DEBUG: Very short response (likely EOS token only): '{model_response}' (attempt {attempt + 1}/{max_retries})"
+                        )
+                        print(
+                            f"DEBUG: Completion tokens: {completion.usage.completion_tokens if hasattr(completion, 'usage') else 'unknown'}"
+                        )
                         await asyncio.sleep(retry_delay)
                         continue
                     else:
-                        print(f"DEBUG: Very short response after {max_retries} attempts: '{model_response}'")
+                        print(
+                            f"DEBUG: Very short response after {max_retries} attempts: '{model_response}'"
+                        )
                         return {"score": 0.0, "sample": None}
-                
+
                 # Success - we got a valid response
                 break
-                
+
             except Exception as e:
                 if attempt < max_retries - 1:
-                    print(f"DEBUG: API call failed (attempt {attempt + 1}/{max_retries}): {e}")
+                    print(
+                        f"DEBUG: API call failed (attempt {attempt + 1}/{max_retries}): {e}"
+                    )
                    await asyncio.sleep(retry_delay)
                     continue
                 else:
-                    print(f"DEBUG: API call failed after {max_retries} attempts: {e}")
+                    print(
+                        f"DEBUG: API call failed after {max_retries} attempts: {e}"
+                    )
                     raise
-        
+
         predicted_answer = self.process_judgement(
             model_response, track_metrics=False
         )
@@ -1069,7 +1142,7 @@ class PairwiseJudgementEnv(BaseEnv):
         # Get category and item info for debug logging
         category = test_item.get("subset", "unknown")
         item_id = test_item.get("id", "unknown")
-        
+
         for prompt, response_text, is_correct in prompts_and_responses:
             messages = self._prepare_completion_input(prompt)
             completion_params = self._get_eval_completion_params()
@@ -1078,52 +1151,63 @@ class PairwiseJudgementEnv(BaseEnv):
             max_retries = self.config.max_retries
             retry_delay = self.config.retry_delay
             success = False
-            
+
             for attempt in range(max_retries):
                 try:
                     # Log full debug request
                     self._log_full_debug_request(
-                        messages, completion_params, category, item_id,
-                        f"TIES_EVAL attempt {attempt + 1}/{max_retries}"
+                        messages,
+                        completion_params,
+                        category,
+                        item_id,
+                        f"TIES_EVAL attempt {attempt + 1}/{max_retries}",
                     )
-                    
+
                     completion = await self.server.chat_completion(
                         messages=messages, **completion_params
                     )
-                    
+
                     # Log full debug response
-                    self._log_full_debug_response(completion, f"TIES_EVAL attempt {attempt + 1}/{max_retries}")
+                    self._log_full_debug_response(
+                        completion, f"TIES_EVAL attempt {attempt + 1}/{max_retries}"
+                    )
 
                     if not completion.choices:
                         if attempt < max_retries - 1:
-                            print(f"DEBUG: No choices in ties completion (attempt {attempt + 1}/{max_retries})")
+                            print(
+                                f"DEBUG: No choices in ties completion (attempt {attempt + 1}/{max_retries})"
+                            )
                             await asyncio.sleep(retry_delay)
                             continue
                         else:
                             break  # Failed after all retries
-                    
+
                     model_response = completion.choices[0].message.content
-                    
+
                     # Check for None content or very short responses
                     if model_response is None:
                         if attempt < max_retries - 1:
-                            print(f"DEBUG: ties model_response is None (attempt {attempt + 1}/{max_retries})")
+                            print(
+                                f"DEBUG: ties model_response is None (attempt {attempt + 1}/{max_retries})"
+                            )
                             await asyncio.sleep(retry_delay)
                             continue
                         else:
                             break  # Failed after all retries
-                    
+
                     if not isinstance(model_response, str):
                         if attempt < max_retries - 1:
-                            print(f"DEBUG: ties model_response is not a string. Type: {type(model_response)} (attempt {attempt + 1}/{max_retries})")
+                            print(
+                                f"DEBUG: ties model_response is not a string. Type: {type(model_response)} (attempt {attempt + 1}/{max_retries})"
+                            )
                             await asyncio.sleep(retry_delay)
                             continue
                         else:
                             break  # Failed after all retries
-                    
+
                     # For ties evaluation, don't check response format - invalid ratings are part of normal evaluation
                     # Only retry for technical failures (None content, API errors, etc.)
-                    
+
                     # Success - process the rating
                     rating = self._process_rating_judgment(model_response)
                     ratings.append(rating)
@@ -1138,16 +1222,20 @@ class PairwiseJudgementEnv(BaseEnv):
                     )
                     success = True
                     break
-                    
+
                 except Exception as e:
                     if attempt < max_retries - 1:
-                        print(f"DEBUG: ties API call failed (attempt {attempt + 1}/{max_retries}): {e}")
+                        print(
+                            f"DEBUG: ties API call failed (attempt {attempt + 1}/{max_retries}): {e}"
+                        )
                         await asyncio.sleep(retry_delay)
                         continue
                     else:
-                        print(f"DEBUG: ties API call failed after {max_retries} attempts: {e}")
+                        print(
+                            f"DEBUG: ties API call failed after {max_retries} attempts: {e}"
+                        )
                         break
-                    
+
             # If we failed after all retries, add error rating
             if not success:
                 ratings.append(-1)  # Error rating