fix bugs for symbolic regression bootcamp

2026-04-26 17:13:14 +00:00 · 2025-06-16 17:31:52 +08:00 · 2025-06-16 17:31:52 +08:00 · 5eb513f014
commit 5eb513f014
parent 33101ef068
11 changed files with 148 additions and 64 deletions
--- a/examples/pipelines/run_pipeline.sh
+++ b/examples/pipelines/run_pipeline.sh
@@ -10,15 +10,17 @@ fi
 # 时间戳
 timestamp=$(date +"%Y-%m-%d-%H:%M:%S")
 # cipher输入集
-cipher_input_file='internbootcamp/libs/data/words_alpha_370000.txt'
+

 tokenizer="/cpfs01/shared/llm_ddd/lipeiji/hf_hub_1/models--Qwen--Qwen2.5-32B-Instruct/snapshots/afb2829595f63efa3548e9d6b13aa66e61aa0f38" # tokenizer is used to calculate the sequence length of the prompt
 max_prompt_len=4096
-max_jobs=60  # 设置最大并发进程数
+max_jobs=64  # 设置最大并发进程数
 jobs=()     # 用于存储后台进程的PID


-# initialize, do not modify this
+
+# initialize, do not modify the part below
+cipher_input_file='internbootcamp/libs/data/words_alpha_370000.txt'
 cipher_test_nums_for_single_cipher=0
 cipher_train_nums_for_single_cipher=0

@@ -64,7 +66,8 @@ while IFS= read -r line || [ -n "$line" ]; do

    pid=$!  # 获取后台进程的PID
    jobs+=("$pid")  # 将PID加入数组
-
+    # 打印当前进程总数
+    # echo "Current running jobs: ${#jobs[@]}"
    # 控制并发数量
    while [ ${#jobs[@]} -ge $max_jobs ]; do
        wait -n  # 等待任意一个子进程结束
@@ -79,6 +82,9 @@ while IFS= read -r line || [ -n "$line" ]; do
    done
 done < examples/pipelines/data_configs/data_config_train.jsonl

+wait
+
+echo "train set generation finished, start test generation."

 while IFS= read -r line || [ -n "$line" ]; do
    # 跳过空行
@@ -125,9 +131,10 @@ while IFS= read -r line || [ -n "$line" ]; do
    done
 done < examples/pipelines/data_configs/data_config_test.jsonl

-# 等待所有后台任务完成
 wait

+echo "test set generation finished"
+
 # cipher test-set gen 
 python examples/pipelines/cipher_data_generator.py \
    --nums $cipher_test_nums_for_single_cipher \