fix bugs for symbolic regression bootcamp

This commit is contained in:
lipeiji 2025-06-16 17:31:52 +08:00
parent 33101ef068
commit 5eb513f014
11 changed files with 148 additions and 64 deletions

View file

@@ -10,15 +10,17 @@ fi
# 时间戳
timestamp=$(date +"%Y-%m-%d-%H:%M:%S")
# cipher输入集
cipher_input_file='internbootcamp/libs/data/words_alpha_370000.txt'
tokenizer="/cpfs01/shared/llm_ddd/lipeiji/hf_hub_1/models--Qwen--Qwen2.5-32B-Instruct/snapshots/afb2829595f63efa3548e9d6b13aa66e61aa0f38" # tokenizer is used to calculate the sequence length of the prompt
max_prompt_len=4096
max_jobs=60 # 设置最大并发进程数
max_jobs=64 # 设置最大并发进程数
jobs=() # 用于存储后台进程的PID
# initialize, do not modify this
# initialize, do not modify below part
cipher_input_file='internbootcamp/libs/data/words_alpha_370000.txt'
cipher_test_nums_for_single_cipher=0
cipher_train_nums_for_single_cipher=0
@@ -64,7 +66,8 @@ while IFS= read -r line || [ -n "$line" ]; do
pid=$! # 获取后台进程的PID
jobs+=("$pid") # 将PID加入数组
# 打印当前进程总数
# echo "Current running jobs: ${#jobs[@]}"
# 控制并发数量
while [ ${#jobs[@]} -ge $max_jobs ]; do
wait -n # 等待任意一个子进程结束
@@ -79,6 +82,9 @@ while IFS= read -r line || [ -n "$line" ]; do
done
done < examples/pipelines/data_configs/data_config_train.jsonl
wait
echo "train set generation finished, start test generation."
while IFS= read -r line || [ -n "$line" ]; do
# 跳过空行
@@ -125,9 +131,10 @@ while IFS= read -r line || [ -n "$line" ]; do
done
done < examples/pipelines/data_configs/data_config_test.jsonl
# 等待所有后台任务完成
wait
echo "test set generation finished"
# cipher test-set gen
python examples/pipelines/cipher_data_generator.py \
--nums $cipher_test_nums_for_single_cipher \