diff --git a/example_trainer/run_gsm8k_teacher_distill_single_terminal.sh b/example_trainer/run_gsm8k_teacher_distill_single_terminal.sh
index 5680f679..6b22d767 100755
--- a/example_trainer/run_gsm8k_teacher_distill_single_terminal.sh
+++ b/example_trainer/run_gsm8k_teacher_distill_single_terminal.sh
@@ -56,7 +56,7 @@ LR="${LR:-1e-5}"
 WARMUP_STEPS="${WARMUP_STEPS:-0}"
 CLIP_EPS="${CLIP_EPS:-0.2}"
 MAX_MODEL_LEN="${MAX_MODEL_LEN:-16384}"
-ENV_MAX_TOKEN_LENGTH="${ENV_MAX_TOKEN_LENGTH:-8192}"
+ENV_MAX_TOKEN_LENGTH="${ENV_MAX_TOKEN_LENGTH:-4096}"
 DISTILL_COEF="${DISTILL_COEF:-0.2}"
 DISTILL_TEMPERATURE="${DISTILL_TEMPERATURE:-1.0}"
 TEACHER_TOP_K="${TEACHER_TOP_K:-8}"