diff --git a/training/configs/external_generalisation/math_qwen_3b.yaml b/training/configs/external_generalisation/math_qwen_3b.yaml
deleted file mode 100644
index 7916584c..00000000
--- a/training/configs/external_generalisation/math_qwen_3b.yaml
+++ /dev/null
@@ -1,242 +0,0 @@
-reasoning_gym:
-  dataset_size: 50000
-  developer_prompt: DeepSeekZero
-  datasets:
-    complex_arithmetic:
-      weight: 1
-    intermediate_integration:
-      weight: 1
-    polynomial_equations:
-      weight: 1
-    polynomial_multiplication:
-      weight: 1
-    simple_equations:
-      weight: 1
-    simple_integration:
-      weight: 1
-    propositional_logic:
-      weight: 1
-    advanced_geometry:
-      weight: 1
-    simple_geometry:
-      weight: 1
-    basic_arithmetic:
-      weight: 1
-    bitwise_arithmetic:
-      weight: 1
-    chain_sum:
-      weight: 1
-    decimal_arithmetic:
-      weight: 1
-    decimal_chain_sum:
-      weight: 1
-    fraction_simplification:
-      weight: 1
-    gcd:
-      weight: 1
-    lcm:
-      weight: 1
-    prime_factorization:
-      weight: 1
-
-curriculum:
-  enabled: False
-  schedule:
-    automatic: True
-    update_steps: 30 # automatic curriculum update every 30 steps
-  last_k: 20
-  success_threshold: 0.70
-  failure_threshold: 0.10
-  curricula:
-    spell_backward:
-      attribute_levels:
-        word_len: 0
-reward:
-  use_accuracy: True
-  conditional_reward: False
-  secondary_rewards:
-    - name: format
-      scaling_factor: 0.2
-      kwargs:
-        preappend_thinking_token: False
-    - name: length
-      scaling_factor: 0.2
-
-data:
-  tokenizer: null
-  train_files: train.parquet
-  val_files: test.parquet
-  prompt_key: prompt
-  max_prompt_length: 4096
-  max_response_length: 2048
-  train_batch_size: 32
-  val_batch_size: 64
-  return_raw_chat: True
-  return_raw_input_ids: True
-
-actor_rollout_ref:
-  hybrid_engine: True
-  model:
-    path: Qwen/Qwen2.5-3B-Instruct
-    external_lib: null
-    override_config: { }
-    enable_gradient_checkpointing: True
-    use_remove_padding: True
-  actor:
-    strategy: fsdp # This is for backward-compatibility
-    ppo_mini_batch_size: 32
-    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
-    ppo_micro_batch_size_per_gpu: 8
-    use_dynamic_bsz: False
-    ppo_max_token_len_per_gpu: 49152 # n * (${data.max_prompt_length} + ${data.max_response_length})
-    grad_clip: 1.0
-    clip_ratio: 0.2
-    entropy_coeff: 0.001
-    use_kl_loss: True # True for GRPO
-    kl_loss_coef: 0.001 # for grpo
-    kl_loss_type: low_var_kl # for grpo
-    ppo_epochs: 1
-    shuffle: False
-    ulysses_sequence_parallel_size: 1 # sp size
-    optim:
-      lr: 1e-6
-      lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
-      min_lr_ratio: null # only useful for warmup with cosine
-      warmup_style: constant # select from constant/cosine
-      total_training_steps: 500 # must be overridden by the program
-    fsdp_config:
-      wrap_policy:
-        # transformer_layer_cls_to_wrap: None
-        min_num_params: 0
-      param_offload: False
-      optimizer_offload: False
-      fsdp_size: -1
-  ref:
-    fsdp_config:
-      param_offload: True
-      wrap_policy:
-        # transformer_layer_cls_to_wrap: None
-        min_num_params: 0
-    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
-    log_prob_micro_batch_size_per_gpu: 160
-    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
-    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
-    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
-  rollout:
-    name: vllm
-    temperature: 1.0
-    top_k: -1 # 0 for hf rollout, -1 for vllm rollout
-    top_p: 1
-    prompt_length: ${data.max_prompt_length} # not used for opensource
-    response_length: ${data.max_response_length}
-    # for vllm rollout
-    dtype: bfloat16 # should align with FSDP
-    gpu_memory_utilization: 0.7
-    ignore_eos: False
-    enforce_eager: True
-    free_cache_engine: True
-    load_format: dummy_dtensor
-    tensor_model_parallel_size: 4
-    max_num_batched_tokens: 12288
-    max_num_seqs: 1024
-    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
-    log_prob_micro_batch_size_per_gpu: 160
-    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
-    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
-    disable_log_stats: True
-    enable_chunked_prefill: True # could get higher throughput
-    # for hf rollout
-    do_sample: True
-    use_fire_sampling: False
-    max_model_len: 12288
-    # number of responses (i.e. num sample times)
-    n: 8 # > 1 for grpo
-    val_kwargs:
-      do_sample: True
-
-algorithm:
-  gamma: 1.0
-  lam: 1.0
-  adv_estimator: grpo
-  kl_penalty: kl # how to estimate kl divergence
-  kl_ctrl:
-    type: fixed
-    kl_coef: 0.001
-verbose: True
-trainer:
-  balance_batch: True
-  total_epochs: 1
-  total_training_steps: 1500
-  project_name: external-generalisation
-  experiment_name: math_qwen_3b
-  logger: [ 'console', 'wandb' ]
-  val_generations_to_log_to_wandb: 0
-  nnodes: 1
-  n_gpus_per_node: 4
-  save_freq: 100
-  # auto: find the last ckpt to resume. If can't find, start from scratch
-  resume_mode: auto # or disable, or resume_path if resume_from_path is set
-  resume_from_path: False
-  test_freq: 100
-  critic_warmup: 0
-  default_hdfs_dir: null
-  remove_previous_ckpt_in_save: False
-  del_local_ckpt_after_load: False
-  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
-
-
-critic:
-  strategy: fsdp
-  optim:
-    lr: 1e-5
-    lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
-    min_lr_ratio: null # only useful for warmup with cosine
-    warmup_style: constant # select from constant/cosine
-    total_training_steps: -1 # must be overridden by the program
-  model:
-    path: ~/models/deepseek-llm-7b-chat
-    tokenizer_path: ${actor_rollout_ref.model.path}
-    override_config: { }
-    external_lib: ${actor_rollout_ref.model.external_lib}
-    enable_gradient_checkpointing: True
-    use_remove_padding: False
-    fsdp_config:
-      param_offload: False
-      optimizer_offload: False
-      wrap_policy:
-        # transformer_layer_cls_to_wrap: None
-        min_num_params: 0
-      fsdp_size: -1
-  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
-  ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
-  ppo_micro_batch_size_per_gpu: null
-  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
-  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
-  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
-  ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
-  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
-  ulysses_sequence_parallel_size: 1 # sp size
-  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
-  shuffle: ${actor_rollout_ref.actor.shuffle}
-  grad_clip: 1.0
-  cliprange_value: 0.5
-
-# Reward model not used for GRPO
-reward_model:
-  enable: False
-  strategy: fsdp
-  model:
-    input_tokenizer: ${actor_rollout_ref.model.path}
-    path: ~/models/FsfairX-LLaMA3-RM-v0.1
-    external_lib: ${actor_rollout_ref.model.external_lib}
-    use_remove_padding: False
-    fsdp_config:
-      min_num_params: 0
-      param_offload: False
-      fsdp_size: -1
-  micro_batch_size: null
-  micro_batch_size_per_gpu: null
-  max_length: null
-  ulysses_sequence_parallel_size: 1
-  use_dynamic_bsz: ${critic.use_dynamic_bsz}
-  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}