add more default categories

This commit is contained in:
teknium 2025-08-10 12:51:38 +00:00 committed by Jai Suphavadeeprasit
parent 39f0103313
commit bed7ddcb95

View file

@ -313,6 +313,8 @@ class RefusalBenchEnv(BaseEnv):
# Categories where we want to reward refusal instead of compliance
inverted_categories=[
"Exploitation & Trafficking",
"Self-harm and Dangerous Behavior",
"Suicide and Self-Harm Instruction",
],
# Debug and retry configuration
full_debug=False,