InternBootcamp/examples/pipelines/run_pipeline.sh
2025-06-16 10:33:07 +08:00

145 lines
4.9 KiB
Bash
Executable file

#!/bin/bash
# Make sure jq is available before any config line is parsed.
# The probe uses `command -v` instead of grepping `dpkg -l`: the dpkg listing
# pads its status column ("ii  jq ..."), so the previous single-space pattern
# never matched and the install branch ran on every invocation. Looking jq up
# on PATH also recognises copies that were not installed through dpkg.
if ! command -v jq >/dev/null 2>&1; then
  echo "jq 未安装,正在安装..."
  sudo apt-get update
  sudo apt-get install jq -y
fi
# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------

# Timestamp of this run; it is embedded in the generator output paths and
# handed to the cipher generator.
timestamp="$(date '+%Y-%m-%d-%H:%M:%S')"

# Input word list for the cipher generator.
cipher_input_file="internbootcamp/libs/data/words_alpha_370000.txt"

# Tokenizer used to calculate the sequence length of the prompt, together with
# the maximum prompt length passed to the data generator.
tokenizer="/cpfs01/shared/llm_ddd/lipeiji/hf_hub_1/models--Qwen--Qwen2.5-32B-Instruct/snapshots/afb2829595f63efa3548e9d6b13aa66e61aa0f38"
max_prompt_len=4096

# Maximum number of concurrent background processes, and the array that
# stores the PIDs of the background processes.
max_jobs=60
declare -a jobs=()

# Per-cipher sample counts. Initialised to 0 here -- do not modify; they are
# meant to be filled in from the "cipher" entries of the data config files.
cipher_test_nums_for_single_cipher=0
cipher_train_nums_for_single_cipher=0
# Training split: read data_config_train.jsonl line by line. Every non-empty
# line is a JSON object from which bootcamp_name, sample_number, config_file
# and bootcamp_cls_name are extracted; one background data_generator.py
# process is launched per line, with at most $max_jobs tracked at a time.
# The `|| [[ -n "$line" ]]` part keeps a final line that lacks a newline.
while IFS= read -r line || [[ -n "$line" ]]; do
  # Skip blank lines.
  if [[ -z "$line" ]]; then
    continue
  fi

  # Parse the JSON line and extract the fields.
  bootcamp_name=$(jq -r '.bootcamp_name' <<<"$line")
  # Declared separately from the assignment so the command substitution is a
  # plain assignment; the integer attribute still applies to the value.
  declare -i sample_number
  sample_number=$(jq -r '.sample_number' <<<"$line")
  config_file=$(jq -r '.config_file' <<<"$line")
  bootcamp_cls_name=$(jq -r '.bootcamp_cls_name' <<<"$line")

  # A "cipher" entry is not generated here; it only supplies the sample count
  # that the cipher generator receives for the train split later on.
  # The assignment previously targeted the misspelled name
  # "cipehr_train_nums_for_single_cipher", leaving the real variable at 0.
  if [[ "$config_file" == "cipher" ]]; then
    cipher_train_nums_for_single_cipher=$sample_number
    continue
  fi

  # Serial variant, handy for checking a single generator run first:
  # python examples/pipelines/data_generator.py \
  #   --bootcamp_name "$bootcamp_name" \
  #   --n "$sample_number" \
  #   --save_file "examples/bootcamp_generator_outputs/$timestamp/train/${bootcamp_name}.jsonl" \
  #   --config_file "examples/pipelines/puzzle_configs/${config_file}_train.json" \
  #   --bootcamp_cls_name "$bootcamp_cls_name" \
  #   --tokenizer "$tokenizer" \
  #   --max_prompt_len "$max_prompt_len" \
  #   --shuffle
  # Once the command above works, the background form below runs the
  # generators in multiple processes instead.
  python examples/pipelines/data_generator.py \
    --bootcamp_name "$bootcamp_name" \
    --n "$sample_number" \
    --save_file "examples/bootcamp_generator_outputs/$timestamp/train/${bootcamp_name}.jsonl" \
    --config_file "examples/pipelines/puzzle_configs/${config_file}_train.json" \
    --bootcamp_cls_name "$bootcamp_cls_name" \
    --tokenizer "$tokenizer" \
    --max_prompt_len "$max_prompt_len" \
    --shuffle &
  pid=$!          # PID of the background process just started
  jobs+=("$pid")  # remember it for the concurrency limit

  # Limit concurrency: while the tracked PID list is full, wait for any one
  # child to finish and then drop every PID that no longer exists.
  while [[ ${#jobs[@]} -ge $max_jobs ]]; do
    wait -n
    new_jobs=()
    for job_pid in "${jobs[@]}"; do
      # kill -0 sends no signal; it only probes whether the PID is still there.
      if kill -0 "$job_pid" 2>/dev/null; then
        new_jobs+=("$job_pid")
      fi
    done
    jobs=("${new_jobs[@]}")
  done
done < examples/pipelines/data_configs/data_config_train.jsonl
# Test split: walk data_config_test.jsonl one line at a time (a last line
# without a trailing newline is still processed) and start one background
# data_generator.py process per non-empty, non-cipher entry.
while IFS= read -r line || [[ -n "$line" ]]; do
  # Nothing to do for blank lines.
  [[ -n "$line" ]] || continue

  # Pull the four fields out of the JSON object on this line.
  bootcamp_name="$(jq -r '.bootcamp_name' <<<"$line")"
  declare -i sample_number
  sample_number="$(jq -r '.sample_number' <<<"$line")"
  config_file="$(jq -r '.config_file' <<<"$line")"
  bootcamp_cls_name="$(jq -r '.bootcamp_cls_name' <<<"$line")"

  # The "cipher" entry only records the sample count for the cipher generator.
  if [[ "$config_file" == "cipher" ]]; then
    cipher_test_nums_for_single_cipher=$sample_number
    continue
  fi

  # Assemble the generator arguments, then launch the generator asynchronously.
  generator_args=(
    --bootcamp_name "$bootcamp_name"
    --n "$sample_number"
    --save_file "examples/bootcamp_generator_outputs/$timestamp/test/${bootcamp_name}.jsonl"
    --config_file "examples/pipelines/puzzle_configs/${config_file}_test.json"
    --tokenizer "$tokenizer"
    --bootcamp_cls_name "$bootcamp_cls_name"
    --max_prompt_len "$max_prompt_len"
    --shuffle
  )
  python examples/pipelines/data_generator.py "${generator_args[@]}" &
  jobs+=("$!")  # track the PID of the process just started

  # Throttle: once the PID list reaches max_jobs, wait for any child to exit
  # and rebuild the list from the PIDs that still respond to a signal-0 probe.
  while (( ${#jobs[@]} >= max_jobs )); do
    wait -n
    still_running=()
    for tracked_pid in "${jobs[@]}"; do
      kill -0 "$tracked_pid" 2>/dev/null && still_running+=("$tracked_pid")
    done
    jobs=("${still_running[@]}")
  done
done < examples/pipelines/data_configs/data_config_test.jsonl
# Wait for all background tasks to finish.
wait

# Cipher data generation: the test split is produced first, then the train
# split. Each run receives the matching per-cipher sample count, which is
# looked up by variable name through indirect expansion.
for split in test train; do
  nums_var="cipher_${split}_nums_for_single_cipher"
  python examples/pipelines/cipher_data_generator.py \
    --nums "${!nums_var}" \
    --split "$split" \
    --timestamp "$timestamp" \
    --filepath "$cipher_input_file"
done

# Final wait, kept from the original flow; both cipher runs above are
# foreground commands, so no background job is started in this section.
wait