mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-19 12:57:58 +00:00
add more default categories
This commit is contained in:
parent
39f0103313
commit
bed7ddcb95
1 changed file with 2 additions and 0 deletions
|
|
@@ -313,6 +313,8 @@ class RefusalBenchEnv(BaseEnv):
|
|||
# Categories where we want to reward refusal instead of compliance
|
||||
inverted_categories=[
|
||||
"Exploitation & Trafficking",
|
||||
"Self-harm and Dangerous Behavior",
|
||||
"Suicide and Self-Harm Instruction",
|
||||
],
|
||||
# Debug and retry configuration
|
||||
full_debug=False,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue