diff --git a/atroposlib/utils/display.py b/atroposlib/utils/display.py index f663a1a9..b99d9835 100644 --- a/atroposlib/utils/display.py +++ b/atroposlib/utils/display.py @@ -15,9 +15,7 @@ def display_metrics_table( start_time: Start time of evaluation (unix timestamp) end_time: End time of evaluation (unix timestamp) """ - print("\n" + "=" * 84) - print(f"Evaluation Results: {task_name}") - print("=" * 84) + print(f"\nEvaluation Results: {task_name}") # Column widths col_groups = 20 @@ -49,6 +47,4 @@ def display_metrics_table( f"|{task_name:<{col_groups}}|{1:<{col_version}}|{'none':<{col_filter}}|{'':<{col_nshot}}|{clean_metric_name:<{col_metric}}|{direction:<{col_dir}}|{metric_value:>{col_value}.4f}|{'±':<{col_pm}}|{'0.0000':>{col_stderr}}|" # noqa: E501 ) - print("=" * 84) - print(f"Evaluation completed in {end_time - start_time:.2f} seconds") - print("=" * 84 + "\n") + print(f"Evaluation completed in {end_time - start_time:.2f} seconds\n") diff --git a/environments/gsm8k_server.py b/environments/gsm8k_server.py index cfe5d5b2..65418863 100644 --- a/environments/gsm8k_server.py +++ b/environments/gsm8k_server.py @@ -211,8 +211,6 @@ class GSM8kEnv(BaseEnv): # Log evaluation results eval_metrics = { "eval/percent_correct": percent_correct, - "eval/total_samples": len(scores), - "eval/correct_samples": sum(scores), } await self.evaluate_log( @@ -223,7 +221,6 @@ class GSM8kEnv(BaseEnv): generation_parameters={ "temperature": 0.0, "max_tokens": self.config.max_token_length, - "split": "eval", }, )