mirror of
https://github.com/InternLM/InternBootcamp.git
synced 2026-04-22 16:49:04 +00:00
* feat(run_eval): add checkpoint resume functionality and update example documentation; - update new bootcamp benchmark dataset * refactor(data_pipeline): optimize data generation pipeline; add multiple preset configurations for data generation * docs: update bootcamp list and add new scripts - Update Fulllist_InternBootcamp.md with new bootcamps and categories - Add new scripts to .gitignore: - examples/pipelines/filter_autogen_configs.py - examples/pipelines/quickgen_data_configs_from_eval_meta.py - Update dependencies in setup.py: - Add scipy and scikit-learn * refactor(internbootcamp): update bootcamp modules and improve error handling - Update import statements in __init__.py files - Add timestamp to target directory name in verl_data_preprocess.py - Improve error handling and scoring logic in bootcamp_judger.py - Remove unnecessary comments and update puzzle descriptions in multiple files
154 lines
5.2 KiB
Bash
Executable file
#!/bin/bash

# Ensure jq is available; it is used below to parse the JSONL config files.
# `command -v` is the portable way to test for a command, unlike parsing
# `dpkg -l` output, which only works on Debian-family systems and depends
# on exact column formatting.
if ! command -v jq >/dev/null 2>&1; then
    echo "jq 未安装,正在安装..."
    sudo apt-get update
    sudo apt-get install jq -y
fi

# Timestamp namespacing this run's output directories
# (examples/bootcamp_generator_outputs/$timestamp/...).
timestamp=$(date +"%Y-%m-%d-%H:%M:%S")
# ---- user-editable settings -------------------------------------------------

# JSONL files describing which bootcamps to generate (one JSON object per
# line). Set test_config_file to a real path to also produce a test split.
train_config_file=examples/pipelines/data_configs/data_config_train_verified.jsonl
test_config_file=None

# Tokenizer used to measure prompt sequence length; point this at a
# HuggingFace model path such as "Qwen2.5-7B-Instruct".
tokenizer="hf_model_path"
max_prompt_len=4096

max_jobs=32               # cap on concurrent generator processes
jobs=()                   # PIDs of in-flight background jobs
config_type=all_configs   # config subdirectory: puzzle_configs or autogen_configs

# ---- initialization — do not modify below this line -------------------------

# Word list consumed by the dedicated cipher generator.
cipher_input_file='internbootcamp/libs/data/words_alpha_370000.txt'
cipher_test_nums_for_single_cipher=0
cipher_train_nums_for_single_cipher=0
# ---- train-set generation ---------------------------------------------------
# Read the train config JSONL line by line; the `|| [ -n "$line" ]` clause
# also processes a final line that lacks a trailing newline.
while IFS= read -r line || [ -n "$line" ]; do
    # Skip blank lines.
    if [ -z "$line" ]; then
        continue
    fi

    # Parse the JSON object and extract the per-bootcamp fields.
    # Declare the integer attribute separately so a jq failure is not
    # masked by `declare`'s own (always-zero) exit status.
    bootcamp_name=$(echo "$line" | jq -r '.bootcamp_name')
    declare -i sample_number
    sample_number=$(echo "$line" | jq -r '.sample_number')
    config_file=$(echo "$line" | jq -r '.config_file')
    bootcamp_cls_name=$(echo "$line" | jq -r '.bootcamp_cls_name')

    # "cipher" entries are handled by a dedicated generator at the end of
    # this script; just remember the requested per-cipher sample count.
    if [[ "$config_file" == "cipher" ]]; then
        cipher_train_nums_for_single_cipher=$sample_number
        continue
    fi

    # Launch one generator per bootcamp in the background.
    # (Remove the trailing '&' to debug a single run synchronously.)
    python examples/pipelines/data_generator.py \
        --bootcamp_name "$bootcamp_name" \
        --n "$sample_number" \
        --save_file "examples/bootcamp_generator_outputs/$timestamp/train/${bootcamp_name}.jsonl" \
        --config_file "examples/pipelines/$config_type/${config_file}_train.json" \
        --bootcamp_cls_name "$bootcamp_cls_name" \
        --tokenizer "$tokenizer" \
        --max_prompt_len "$max_prompt_len" \
        --shuffle &

    jobs+=("$!")  # remember the background PID

    # Throttle: block while the number of tracked jobs is at the cap.
    while [ ${#jobs[@]} -ge "$max_jobs" ]; do
        wait -n  # reap any one finished child (bash 4.3+)
        # Drop PIDs that are no longer running.
        new_jobs=()
        for job_pid in "${jobs[@]}"; do
            if kill -0 "$job_pid" 2>/dev/null; then
                new_jobs+=("$job_pid")
            fi
        done
        jobs=("${new_jobs[@]}")
    done
done < "$train_config_file"

# Barrier: wait for every remaining train generator to finish.
wait

echo "train set generation finished, start test generation."
# ---- test-set generation ----------------------------------------------------
# Skip gracefully when no test config is provided (the "None" placeholder)
# or the file is missing; the bare `done < $test_config_file` redirect would
# otherwise fail with an opaque "No such file or directory" error.
if [[ "$test_config_file" == "None" || ! -f "$test_config_file" ]]; then
    echo "WARN: test config '$test_config_file' not found; skipping test set generation." >&2
else
    # Discard stale PIDs carried over from the train phase so the
    # concurrency accounting starts clean.
    jobs=()
    while IFS= read -r line || [ -n "$line" ]; do
        # Skip blank lines.
        if [ -z "$line" ]; then
            continue
        fi

        # Parse the JSON object and extract the per-bootcamp fields;
        # split declaration from assignment so jq failures are visible.
        bootcamp_name=$(echo "$line" | jq -r '.bootcamp_name')
        declare -i sample_number
        sample_number=$(echo "$line" | jq -r '.sample_number')
        config_file=$(echo "$line" | jq -r '.config_file')
        bootcamp_cls_name=$(echo "$line" | jq -r '.bootcamp_cls_name')

        # "cipher" entries are handled by a dedicated generator later;
        # just remember the requested per-cipher sample count.
        if [[ "$config_file" == "cipher" ]]; then
            cipher_test_nums_for_single_cipher=$sample_number
            continue
        fi

        # Launch one generator per bootcamp in the background.
        python examples/pipelines/data_generator.py \
            --bootcamp_name "$bootcamp_name" \
            --n "$sample_number" \
            --save_file "examples/bootcamp_generator_outputs/$timestamp/test/${bootcamp_name}.jsonl" \
            --config_file "examples/pipelines/$config_type/${config_file}_test.json" \
            --bootcamp_cls_name "$bootcamp_cls_name" \
            --tokenizer "$tokenizer" \
            --max_prompt_len "$max_prompt_len" \
            --shuffle &

        jobs+=("$!")  # remember the background PID

        # Throttle: block while the number of tracked jobs is at the cap.
        while [ ${#jobs[@]} -ge "$max_jobs" ]; do
            wait -n  # reap any one finished child
            # Drop PIDs that are no longer running.
            new_jobs=()
            for job_pid in "${jobs[@]}"; do
                if kill -0 "$job_pid" 2>/dev/null; then
                    new_jobs+=("$job_pid")
                fi
            done
            jobs=("${new_jobs[@]}")
        done
    done < "$test_config_file"
fi

# Barrier: wait for every remaining test generator to finish.
wait
echo "test set generation finished"

# ---- cipher test-set generation ---------------------------------------------
# Ciphers use a dedicated generator fed by the word list; the per-cipher
# counts were collected from the "cipher" config entries above.
python examples/pipelines/cipher_data_generator.py \
    --nums "$cipher_test_nums_for_single_cipher" \
    --split test \
    --timestamp "$timestamp" \
    --filepath "$cipher_input_file"

# ---- cipher train-set generation --------------------------------------------
python examples/pipelines/cipher_data_generator.py \
    --nums "$cipher_train_nums_for_single_cipher" \
    --split train \
    --timestamp "$timestamp" \
    --filepath "$cipher_input_file"

# No background jobs remain at this point; kept as a final safety barrier.
wait