InternBootcamp/examples/pipelines/run_pipeline.sh
Yongkang Chen a8249acc18
update to tech report version (#10)
* feat(run_eval): add checkpoint resume functionality and update example documentation;
- update new bootcamp benchmark dataset

* refactor(data_pipeline): optimize data generation pipeline; add multiple preset configurations for data generation

* docs: update bootcamp list and add new scripts

- Update Fulllist_InternBootcamp.md with new bootcamps and categories
- Add new scripts to .gitignore:
  - examples/pipelines/filter_autogen_configs.py
  - examples/pipelines/quickgen_data_configs_from_eval_meta.py
- Update dependencies in setup.py:
  - Add scipy and scikit-learn

* refactor(internbootcamp): update bootcamp modules and improve error handling

- Update import statements in __init__.py files
- Add timestamp to target directory name in verl_data_preprocess.py
- Improve error handling and scoring logic in bootcamp_judger.py
- Remove unnecessary comments and update puzzle descriptions in multiple files
2025-08-28 12:39:47 +08:00

154 lines
5.2 KiB
Bash
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Data-generation pipeline: read bootcamp configs (JSONL), fan out
# data_generator.py jobs for the train/test splits with a bounded number of
# concurrent processes, then generate the cipher bootcamp data separately.

# Ensure jq is available (used to parse the JSONL config lines).
# NOTE(fix): the previous check grepped `dpkg -l` for '^ii jq ', which never
# matches because dpkg pads the status column ('ii  jq ...'), and is
# Debian-only besides. Probe the command directly instead.
if ! command -v jq >/dev/null 2>&1; then
  echo "jq 未安装,正在安装..."
  sudo apt-get update
  sudo apt-get install jq -y
fi

# Timestamp used to namespace this run's output directories.
timestamp=$(date +"%Y-%m-%d-%H:%M:%S")

# Input config files (JSONL, one bootcamp record per line).
train_config_file=examples/pipelines/data_configs/data_config_train_verified.jsonl
test_config_file=None # set to a JSONL path to also generate a test split

# Tokenizer used to measure prompt sequence length; use any HuggingFace
# model name/path, e.g. "Qwen2.5-7B-Instruct".
tokenizer="hf_model_path"
max_prompt_len=4096
max_jobs=32             # maximum number of concurrent generator processes
jobs=()                 # PIDs of live background jobs
config_type=all_configs # config dir: puzzle_configs or autogen_configs

# Initialization — do not modify below this line.
cipher_input_file='internbootcamp/libs/data/words_alpha_370000.txt'
cipher_test_nums_for_single_cipher=0
cipher_train_nums_for_single_cipher=0
# --- Train-set generation ---------------------------------------------------
# Each line of $train_config_file is a JSON record with bootcamp_name,
# sample_number, config_file and bootcamp_cls_name. Launch one background
# data_generator.py per record, capped at $max_jobs concurrent processes.
# ('|| [ -n "$line" ]' keeps a final line without a trailing newline.)
while IFS= read -r line || [ -n "$line" ]; do
  # Skip blank lines.
  if [ -z "$line" ]; then
    continue
  fi
  # Extract the fields from the JSON record.
  bootcamp_name=$(jq -r '.bootcamp_name' <<<"$line")
  declare -i sample_number
  sample_number=$(jq -r '.sample_number' <<<"$line")
  config_file=$(jq -r '.config_file' <<<"$line")
  bootcamp_cls_name=$(jq -r '.bootcamp_cls_name' <<<"$line")
  # "cipher" records are produced by a dedicated generator at the end of the
  # script; just remember the requested per-cipher sample count here.
  if [[ "$config_file" == "cipher" ]]; then
    cipher_train_nums_for_single_cipher=$sample_number
    continue
  fi
  # Run the generator asynchronously (for debugging a single config, drop
  # the trailing '&' to run it sequentially).
  python examples/pipelines/data_generator.py \
    --bootcamp_name "$bootcamp_name" \
    --n "$sample_number" \
    --save_file "examples/bootcamp_generator_outputs/$timestamp/train/${bootcamp_name}.jsonl" \
    --config_file "examples/pipelines/$config_type/${config_file}_train.json" \
    --bootcamp_cls_name "$bootcamp_cls_name" \
    --tokenizer "$tokenizer" \
    --max_prompt_len "$max_prompt_len" \
    --shuffle &
  jobs+=("$!") # remember the background PID
  # Throttle: block while $max_jobs or more jobs are still alive.
  while [ "${#jobs[@]}" -ge "$max_jobs" ]; do
    wait -n # reap any one finished child (bash >= 4.3)
    # Drop PIDs of processes that have already exited.
    new_jobs=()
    for job_pid in "${jobs[@]}"; do
      if kill -0 "$job_pid" 2>/dev/null; then
        new_jobs+=("$job_pid")
      fi
    done
    jobs=("${new_jobs[@]}")
  done
done < "$train_config_file"
wait # barrier: let every remaining train job finish
echo "train set generation finished, start test generation."
# --- Test-set generation ----------------------------------------------------
# Same scheme as the train loop, writing to .../test/ and reading the
# *_test.json configs. $test_config_file defaults to the literal "None";
# previously the redirect below then failed with a shell error — skip
# cleanly in that case instead.
if [[ "$test_config_file" == "None" || ! -f "$test_config_file" ]]; then
  echo "test config file '$test_config_file' not found, skipping test set generation." >&2
else
  while IFS= read -r line || [ -n "$line" ]; do
    # Skip blank lines.
    if [ -z "$line" ]; then
      continue
    fi
    # Extract the fields from the JSON record.
    bootcamp_name=$(jq -r '.bootcamp_name' <<<"$line")
    declare -i sample_number
    sample_number=$(jq -r '.sample_number' <<<"$line")
    config_file=$(jq -r '.config_file' <<<"$line")
    bootcamp_cls_name=$(jq -r '.bootcamp_cls_name' <<<"$line")
    # "cipher" records are produced by a dedicated generator at the end of
    # the script; just remember the requested per-cipher sample count.
    if [[ "$config_file" == "cipher" ]]; then
      cipher_test_nums_for_single_cipher=$sample_number
      continue
    fi
    # Run the generator asynchronously.
    python examples/pipelines/data_generator.py \
      --bootcamp_name "$bootcamp_name" \
      --n "$sample_number" \
      --save_file "examples/bootcamp_generator_outputs/$timestamp/test/${bootcamp_name}.jsonl" \
      --config_file "examples/pipelines/$config_type/${config_file}_test.json" \
      --tokenizer "$tokenizer" \
      --bootcamp_cls_name "$bootcamp_cls_name" \
      --max_prompt_len "$max_prompt_len" \
      --shuffle &
    jobs+=("$!") # remember the background PID
    # Throttle: block while $max_jobs or more jobs are still alive.
    while [ "${#jobs[@]}" -ge "$max_jobs" ]; do
      wait -n # reap any one finished child (bash >= 4.3)
      # Drop PIDs of processes that have already exited.
      new_jobs=()
      for job_pid in "${jobs[@]}"; do
        if kill -0 "$job_pid" 2>/dev/null; then
          new_jobs+=("$job_pid")
        fi
      done
      jobs=("${new_jobs[@]}")
    done
  done < "$test_config_file"
  wait # barrier: let every remaining test job finish
fi
echo "test set generation finished"
# --- Cipher bootcamp generation ---------------------------------------------
# Cipher bootcamps share one word list; generate the test and train splits
# with the per-cipher sample counts recorded while scanning the configs
# above (0 if no "cipher" record was seen).
python examples/pipelines/cipher_data_generator.py \
  --nums "$cipher_test_nums_for_single_cipher" \
  --split test \
  --timestamp "$timestamp" \
  --filepath "$cipher_input_file"
# cipher train-set gen
python examples/pipelines/cipher_data_generator.py \
  --nums "$cipher_train_nums_for_single_cipher" \
  --split train \
  --timestamp "$timestamp" \
  --filepath "$cipher_input_file"
wait