From 8a348beccd9dac4c9a9d9566144149adfb92ed03 Mon Sep 17 00:00:00 2001
From: Jai Suphavadeeprasit
Date: Wed, 11 Mar 2026 19:37:17 -0400
Subject: [PATCH] tokenizer bug

Pass a dedicated TRAINER_SEQ_LEN to the trainer's --seq-len (dry-run and
real launch) instead of reusing ENV_MAX_TOKEN_LENGTH, so sequences keep
headroom for chat-template tokens. The default is derived from
ENV_MAX_TOKEN_LENGTH so that overriding the env limit keeps the headroom.
---
Review notes (not part of the commit message):
- Line breaks were restored from a whitespace-collapsed copy of this patch.
  The indentation of the continuation lines in the last two hunks is
  reconstructed; use `git apply --ignore-whitespace` if it does not match.
- The post-image blob id on the index line predates the derived default.

 .../run_gsm8k_teacher_distill_single_terminal.sh | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/example_trainer/run_gsm8k_teacher_distill_single_terminal.sh b/example_trainer/run_gsm8k_teacher_distill_single_terminal.sh
index 1b58b738..311f668d 100755
--- a/example_trainer/run_gsm8k_teacher_distill_single_terminal.sh
+++ b/example_trainer/run_gsm8k_teacher_distill_single_terminal.sh
@@ -57,6 +57,9 @@ WARMUP_STEPS="${WARMUP_STEPS:-0}"
 CLIP_EPS="${CLIP_EPS:-0.2}"
 MAX_MODEL_LEN="${MAX_MODEL_LEN:-16384}"
 TEACHER_MAX_MODEL_LEN="${TEACHER_MAX_MODEL_LEN:-32768}"
 ENV_MAX_TOKEN_LENGTH="${ENV_MAX_TOKEN_LENGTH:-16384}"
+# Trainer seq_len must exceed ENV_MAX_TOKEN_LENGTH to fit chat template overhead
+# (~400-800 tokens for Qwen3 thinking format); default adds 4096 tokens of headroom.
+TRAINER_SEQ_LEN="${TRAINER_SEQ_LEN:-$((ENV_MAX_TOKEN_LENGTH + 4096))}"
 DISTILL_COEF="${DISTILL_COEF:-0.2}"
 DISTILL_TEMPERATURE="${DISTILL_TEMPERATURE:-1.0}"
@@ -263,7 +266,7 @@ if [[ "$DRY_RUN" == "1" ]]; then
     --warmup-steps "$WARMUP_STEPS" \
     --lr "$LR" \
     --clip-eps "$CLIP_EPS" \
-    --seq-len "$ENV_MAX_TOKEN_LENGTH" \
+    --seq-len "$TRAINER_SEQ_LEN" \
     --distill-enabled \
     --distill-coef "$DISTILL_COEF" \
     --distill-temperature "$DISTILL_TEMPERATURE"
@@ -287,7 +290,7 @@ start_process "trainer" "${LOG_DIR}/trainer.log" \
     --warmup-steps "$WARMUP_STEPS" \
     --lr "$LR" \
     --clip-eps "$CLIP_EPS" \
-    --seq-len "$ENV_MAX_TOKEN_LENGTH" \
+    --seq-len "$TRAINER_SEQ_LEN" \
     --distill-enabled \
     --distill-coef "$DISTILL_COEF" \
     --distill-temperature "$DISTILL_TEMPERATURE"