mirror of
https://github.com/InternLM/InternBootcamp.git
synced 2026-04-19 12:58:04 +00:00
145 lines
4.9 KiB
Bash
Executable file
145 lines
4.9 KiB
Bash
Executable file
#!/bin/bash
|
|
|
|
|
|
# 检查 jq 是否已安装
|
|
if ! dpkg -l | grep -q '^ii jq '; then
|
|
echo "jq 未安装,正在安装..."
|
|
sudo apt-get update
|
|
sudo apt-get install jq -y
|
|
fi
|
|
# 时间戳
|
|
timestamp=$(date +"%Y-%m-%d-%H:%M:%S")
|
|
# cipher输入集
|
|
cipher_input_file='internbootcamp/libs/data/words_alpha_370000.txt'
|
|
|
|
tokenizer="/cpfs01/shared/llm_ddd/lipeiji/hf_hub_1/models--Qwen--Qwen2.5-32B-Instruct/snapshots/afb2829595f63efa3548e9d6b13aa66e61aa0f38" # tokenizer is used to calculate the sequence length of the prompt
|
|
max_prompt_len=4096
|
|
max_jobs=60 # 设置最大并发进程数
|
|
jobs=() # 用于存储后台进程的PID
|
|
|
|
|
|
# initialize, do not modify this
|
|
cipher_test_nums_for_single_cipher=0
|
|
cipher_train_nums_for_single_cipher=0
|
|
|
|
while IFS= read -r line || [ -n "$line" ]; do
|
|
# 跳过空行
|
|
if [ -z "$line" ]; then
|
|
continue
|
|
fi
|
|
|
|
# 解析JSON行并提取变量
|
|
bootcamp_name=$(echo "$line" | jq -r '.bootcamp_name')
|
|
declare -i sample_number=$(echo "$line" | jq -r '.sample_number')
|
|
config_file=$(echo "$line" | jq -r '.config_file')
|
|
bootcamp_cls_name=$(echo "$line" | jq -r '.bootcamp_cls_name')
|
|
|
|
# 如果 config_file 为 "cipher",保存 sample_number
|
|
if [[ "$config_file" == "cipher" ]]; then
|
|
cipehr_train_nums_for_single_cipher=$sample_number
|
|
continue
|
|
fi
|
|
|
|
# 异步运行Python脚本
|
|
# python examples/pipelines/data_generator.py \
|
|
# --bootcamp_name "$bootcamp_name" \
|
|
# --n $sample_number \
|
|
# --save_file "examples/bootcamp_generator_outputs/$timestamp/train/${bootcamp_name}.jsonl" \
|
|
# --config_file "examples/pipelines/puzzle_configs/${config_file}_train.json" \
|
|
# --bootcamp_cls_name "$bootcamp_cls_name" \
|
|
# --tokenizer "$tokenizer" \
|
|
# --max_prompt_len $max_prompt_len \
|
|
# --shuffle
|
|
|
|
# If there is no problem with the above command, you can use the following line to run it in multiple processes, replacing the above command
|
|
python examples/pipelines/data_generator.py \
|
|
--bootcamp_name "$bootcamp_name" \
|
|
--n $sample_number \
|
|
--save_file "examples/bootcamp_generator_outputs/$timestamp/train/${bootcamp_name}.jsonl" \
|
|
--config_file "examples/pipelines/puzzle_configs/${config_file}_train.json" \
|
|
--bootcamp_cls_name "$bootcamp_cls_name" \
|
|
--tokenizer "$tokenizer" \
|
|
--max_prompt_len $max_prompt_len \
|
|
--shuffle &
|
|
|
|
pid=$! # 获取后台进程的PID
|
|
jobs+=("$pid") # 将PID加入数组
|
|
|
|
# 控制并发数量
|
|
while [ ${#jobs[@]} -ge $max_jobs ]; do
|
|
wait -n # 等待任意一个子进程结束
|
|
# 清理已结束的进程的PID
|
|
new_jobs=()
|
|
for job_pid in "${jobs[@]}"; do
|
|
if kill -0 "$job_pid" 2>/dev/null; then
|
|
new_jobs+=("$job_pid")
|
|
fi
|
|
done
|
|
jobs=("${new_jobs[@]}")
|
|
done
|
|
done < examples/pipelines/data_configs/data_config_train.jsonl
|
|
|
|
|
|
while IFS= read -r line || [ -n "$line" ]; do
|
|
# 跳过空行
|
|
if [ -z "$line" ]; then
|
|
continue
|
|
fi
|
|
|
|
# 解析JSON行并提取变量
|
|
bootcamp_name=$(echo "$line" | jq -r '.bootcamp_name')
|
|
declare -i sample_number=$(echo "$line" | jq -r '.sample_number')
|
|
config_file=$(echo "$line" | jq -r '.config_file')
|
|
bootcamp_cls_name=$(echo "$line" | jq -r '.bootcamp_cls_name')
|
|
|
|
# 如果 config_file 为 "cipher",保存 sample_number
|
|
if [[ "$config_file" == "cipher" ]]; then
|
|
cipher_test_nums_for_single_cipher=$sample_number
|
|
continue
|
|
fi
|
|
|
|
# 异步运行Python脚本
|
|
python examples/pipelines/data_generator.py \
|
|
--bootcamp_name "$bootcamp_name" \
|
|
--n $sample_number \
|
|
--save_file "examples/bootcamp_generator_outputs/$timestamp/test/${bootcamp_name}.jsonl" \
|
|
--config_file "examples/pipelines/puzzle_configs/${config_file}_test.json" \
|
|
--tokenizer "$tokenizer" \
|
|
--bootcamp_cls_name "$bootcamp_cls_name" \
|
|
--max_prompt_len $max_prompt_len \
|
|
--shuffle &
|
|
pid=$! # 获取后台进程的PID
|
|
jobs+=("$pid") # 将PID加入数组
|
|
|
|
# 控制并发数量
|
|
while [ ${#jobs[@]} -ge $max_jobs ]; do
|
|
wait -n # 等待任意一个子进程结束
|
|
# 清理已结束的进程的PID
|
|
new_jobs=()
|
|
for job_pid in "${jobs[@]}"; do
|
|
if kill -0 "$job_pid" 2>/dev/null; then
|
|
new_jobs+=("$job_pid")
|
|
fi
|
|
done
|
|
jobs=("${new_jobs[@]}")
|
|
done
|
|
done < examples/pipelines/data_configs/data_config_test.jsonl
|
|
|
|
# 等待所有后台任务完成
|
|
wait
|
|
|
|
# cipher test-set gen
|
|
python examples/pipelines/cipher_data_generator.py \
|
|
--nums $cipher_test_nums_for_single_cipher \
|
|
--split test \
|
|
--timestamp $timestamp \
|
|
--filepath $cipher_input_file
|
|
|
|
# cipher train——set gen
|
|
python examples/pipelines/cipher_data_generator.py \
|
|
--nums $cipher_train_nums_for_single_cipher \
|
|
--split train \
|
|
--timestamp $timestamp \
|
|
--filepath $cipher_input_file
|
|
|
|
wait
|