diff --git a/environments/eval_environments/refusalbench_environment.py b/environments/eval_environments/refusalbench_environment.py index 4d2e74ae..143893ef 100644 --- a/environments/eval_environments/refusalbench_environment.py +++ b/environments/eval_environments/refusalbench_environment.py @@ -405,14 +405,6 @@ class RefusalBenchEnv(BaseEnv): random.shuffle(train_list) self.train = train_list except Exception as e: - print(f"āŒ CRITICAL ERROR: Failed to load training dataset '{self.config.train_dataset}': {e}") - print(f" This would result in corrupted training with unrepresentative data.") - print(f" Please fix the training dataset configuration before proceeding.") - print(f" Available options:") - print(f" 1. Check if the dataset path/name is correct") - print(f" 2. Verify dataset access permissions") - print(f" 3. Ensure required dependencies are installed") - print(f" 4. Check network connectivity for HuggingFace datasets") raise ValueError( f"Training dataset '{self.config.train_dataset}' failed to load. " f"Cannot proceed with training as fallback data would corrupt the training process. " @@ -511,9 +503,6 @@ class RefusalBenchEnv(BaseEnv): if not self.config.full_debug: return - print(f"\nšŸ” FULL DEBUG - API REQUEST [{context}]") - print(f" Parameters: {params}") - for i, message in enumerate(messages): role = message.get("role", "unknown") content = message.get("content", "") @@ -526,8 +515,6 @@ class RefusalBenchEnv(BaseEnv): if not self.config.full_debug: return - print(f"\nšŸ” FULL DEBUG - API RESPONSE [{context}]") - if hasattr(completion, "usage"): print(f" Usage: {completion.usage}") @@ -539,10 +526,6 @@ class RefusalBenchEnv(BaseEnv): if hasattr(choice, "finish_reason") else "unknown" ) - print( - f" Choice {i+1}: {self._format_debug_text(content, 'Response')}" - ) - print(f" Finish reason: {finish_reason}") else: print(" No choices in response") @@ -638,7 +621,6 @@ class RefusalBenchEnv(BaseEnv): async def _inner_judge_call(): async with self.judge_semaphore: # Simple rate limiting: wait before each call - print(f" ā±ļø Rate limiting: waiting {self.config.judge_rate_limit_delay}s before judge call") await asyncio.sleep(self.config.judge_rate_limit_delay) return await self._judge_api_call_raw(messages) @@ -649,7 +631,6 @@ class RefusalBenchEnv(BaseEnv): async def _judge_api_call_raw(self, messages: List[Dict]): """Make a raw API call to the judge model without retry logic.""" try: - print(f" šŸ¤– Making judge API call to {self.config.judge_model_name}") result = await self.judge_client.chat.completions.create( model=self.config.judge_model_name, messages=messages, @@ -659,7 +640,6 @@ class RefusalBenchEnv(BaseEnv): print(f" āœ… Judge API call successful") return result except Exception as e: - print(f" āŒ Judge API call failed: {type(e).__name__}: {str(e)}") if hasattr(e, 'response') and hasattr(e.response, 'status_code'): print(f" HTTP Status: {e.response.status_code}") if hasattr(e, 'response') and hasattr(e.response, 'text'): @@ -773,7 +753,6 @@ Your answer:""" await self._update_judge_error_metrics() if self.config.use_fallback_scoring: - print(" šŸ”„ Using fallback refusal detection") await self._update_fallback_metrics() return self._fallback_refusal_detection(response) else: @@ -1074,7 +1053,6 @@ Your answer:""" for attempt in range(max_retries): try: - print(f"\nšŸ”„ EVAL Attempt {attempt + 1}/{max_retries}") print(f" Prompt: {prompt_text[:100]}...") print(f" Category: {overarching_category}") print(f" Danger: {danger}") @@ -1089,7 +1067,6 @@ Your answer:""" print(f" Server sem value: {server.sem._value}") print(f" Server eval_sem value: {server.eval_sem._value}") - print(f"šŸš€ Calling self.server.chat_completion...") completion = await self.server.chat_completion( messages=messages, **completion_params ) @@ -1133,7 +1110,6 @@ Your answer:""" break except Exception as e: - print(f"\nāŒ EVAL Attempt {attempt + 1} FAILED!") print(f" Error type: {type(e).__name__}") print(f" Error message: {str(e)}") @@ -1151,11 +1127,10 @@ Your answer:""" traceback.print_exc() if attempt < max_retries - 1: - print(f" ā³ Retrying in {retry_delay} seconds...") await asyncio.sleep(retry_delay) continue else: - print(f" šŸ’€ All retry attempts failed, giving up") + print(f"All retry attempts failed, giving up") raise # Validate thinking format if enabled @@ -1321,18 +1296,15 @@ Your answer:""" # Build evaluation metrics dict - ordered with most important metrics first eval_metrics = {} - # šŸ” TOP PRIORITY: Overall and aggregated refusal rates eval_metrics["eval/overall_refusal_rate"] = overall_refusal_rate if normal_refusal_rates: eval_metrics["eval/normal_categories_refusal_rate"] = avg_normal_refusal_rate if inverted_refusal_rates: eval_metrics["eval/inverted_categories_refusal_rate"] = avg_inverted_refusal_rate - # šŸ“Š SUMMARY METRICS: Basic counts and compliance eval_metrics["eval/total_samples"] = len(samples) eval_metrics["eval/format_compliance_rate"] = format_compliance_rate - # šŸ“‹ CATEGORY-SPECIFIC METRICS: Sorted alphabetically for consistency category_metrics = {} for category, rates in category_refusal_rates.items(): if rates: @@ -1363,7 +1335,6 @@ Your answer:""" """Evaluate the model on the test dataset with optional averaging over multiple runs.""" if self.config.eval_runs_for_average == 1: # Single run - use the original behavior - print("šŸ”„ Running single evaluation...") result = await self._evaluate_single_run(1, *args, **kwargs) try: @@ -1378,12 +1349,10 @@ Your answer:""" print(f"Error logging evaluation results: {e}") else: # Multiple runs - perform averaging - print(f"šŸ”„ Running evaluation {self.config.eval_runs_for_average} times for averaging...") all_run_results = [] overall_start_time = time.time() for run_num in range(1, self.config.eval_runs_for_average + 1): - print(f"\nšŸ“Š Starting evaluation run {run_num}/{self.config.eval_runs_for_average}") try: result = await self._evaluate_single_run(run_num, *args, **kwargs) @@ -1399,12 +1368,11 @@ Your answer:""" generation_parameters=result["generation_parameters"], task_name=f"{self.name}_eval_run_{run_num}" if self.name else f"RefusalBench_eval_run_{run_num}", ) - print(f"āœ… Run {run_num} completed and logged") except Exception as e: - print(f"āš ļø Error logging run {run_num} results: {e}") + print(f"Error logging run {run_num} results: {e}") except Exception as e: - print(f"āŒ Error in evaluation run {run_num}: {e}") + print(f"Error in evaluation run {run_num}: {e}") continue overall_end_time = time.time() @@ -1414,7 +1382,6 @@ Your answer:""" return # Calculate averages across all runs - print(f"\n🧮 Computing averages across {len(all_run_results)} successful runs...") averaged_results = await self._compute_averaged_results(all_run_results, overall_start_time, overall_end_time) # Log averaged results @@ -1427,7 +1394,6 @@ Your answer:""" generation_parameters=averaged_results["generation_parameters"], task_name=f"{self.name}_eval_averaged" if self.name else "RefusalBench_eval_averaged", ) - print(f"āœ… Averaged results across {len(all_run_results)} runs logged successfully") except Exception as e: print(f"āŒ Error logging averaged results: {e}")