mirror of
https://github.com/InternLM/InternBootcamp.git
synced 2026-04-22 16:49:04 +00:00
* feat(run_eval): add checkpoint resume functionality and update example documentation; - update new bootcamp benchmark dataset * refactor(data_pipeline): optimize data generation pipeline; add multiple preset configurations for data generation * docs: update bootcamp list and add new scripts - Update Fulllist_InternBootcamp.md with new bootcamps and categories - Add new scripts to .gitignore: - examples/pipelines/filter_autogen_configs.py - examples/pipelines/quickgen_data_configs_from_eval_meta.py - Update dependencies in setup.py: - Add scipy and scikit-learn * refactor(internbootcamp): update bootcamp modules and improve error handling - Update import statements in __init__.py files - Add timestamp to target directory name in verl_data_preprocess.py - Improve error handling and scoring logic in bootcamp_judger.py - Remove unnecessary comments and update puzzle descriptions in multiple files
154 lines
5.2 KiB
Bash
Executable file
#!/bin/bash

# Ensure jq is available; it is used below to parse the JSONL config files.
# `command -v` is the portable way to test for a command, unlike parsing
# `dpkg -l` output, which only works on Debian-family systems and depends
# on exact column formatting.
if ! command -v jq >/dev/null 2>&1; then
    echo "jq 未安装,正在安装..."
    sudo apt-get update
    sudo apt-get install jq -y
fi

# Timestamp namespacing this run's output directories
# (examples/bootcamp_generator_outputs/$timestamp/...).
timestamp=$(date +"%Y-%m-%d-%H:%M:%S")
# ---- user-editable settings -------------------------------------------------

# JSONL files describing which bootcamps to generate (one JSON object per
# line). Set test_config_file to a real path to also produce a test split.
train_config_file=examples/pipelines/data_configs/data_config_train_verified.jsonl
test_config_file=None

# Tokenizer used to measure prompt sequence length; point this at a
# HuggingFace model path such as "Qwen2.5-7B-Instruct".
tokenizer="hf_model_path"
max_prompt_len=4096

max_jobs=32               # cap on concurrent generator processes
jobs=()                   # PIDs of in-flight background jobs
config_type=all_configs   # config subdirectory: puzzle_configs or autogen_configs

# ---- initialization — do not modify below this line -------------------------

# Word list consumed by the dedicated cipher generator.
cipher_input_file='internbootcamp/libs/data/words_alpha_370000.txt'
cipher_test_nums_for_single_cipher=0
cipher_train_nums_for_single_cipher=0
# ---- train-set generation ---------------------------------------------------
# Read the train config JSONL line by line; the `|| [ -n "$line" ]` clause
# also processes a final line that lacks a trailing newline.
while IFS= read -r line || [ -n "$line" ]; do
    # Skip blank lines.
    if [ -z "$line" ]; then
        continue
    fi

    # Parse the JSON object and extract the per-bootcamp fields.
    # Declare the integer attribute separately so a jq failure is not
    # masked by `declare`'s own (always-zero) exit status.
    bootcamp_name=$(echo "$line" | jq -r '.bootcamp_name')
    declare -i sample_number
    sample_number=$(echo "$line" | jq -r '.sample_number')
    config_file=$(echo "$line" | jq -r '.config_file')
    bootcamp_cls_name=$(echo "$line" | jq -r '.bootcamp_cls_name')

    # "cipher" entries are handled by a dedicated generator at the end of
    # this script; just remember the requested per-cipher sample count.
    if [[ "$config_file" == "cipher" ]]; then
        cipher_train_nums_for_single_cipher=$sample_number
        continue
    fi

    # Launch one generator per bootcamp in the background.
    # (Remove the trailing '&' to debug a single run synchronously.)
    python examples/pipelines/data_generator.py \
        --bootcamp_name "$bootcamp_name" \
        --n "$sample_number" \
        --save_file "examples/bootcamp_generator_outputs/$timestamp/train/${bootcamp_name}.jsonl" \
        --config_file "examples/pipelines/$config_type/${config_file}_train.json" \
        --bootcamp_cls_name "$bootcamp_cls_name" \
        --tokenizer "$tokenizer" \
        --max_prompt_len "$max_prompt_len" \
        --shuffle &

    jobs+=("$!")  # remember the background PID

    # Throttle: block while the number of tracked jobs is at the cap.
    while [ ${#jobs[@]} -ge "$max_jobs" ]; do
        wait -n  # reap any one finished child (bash 4.3+)
        # Drop PIDs that are no longer running.
        new_jobs=()
        for job_pid in "${jobs[@]}"; do
            if kill -0 "$job_pid" 2>/dev/null; then
                new_jobs+=("$job_pid")
            fi
        done
        jobs=("${new_jobs[@]}")
    done
done < "$train_config_file"

# Barrier: wait for every remaining train generator to finish.
wait

echo "train set generation finished, start test generation."
# ---- test-set generation ----------------------------------------------------
# Skip gracefully when no test config is provided (the "None" placeholder)
# or the file is missing; the bare `done < $test_config_file` redirect would
# otherwise fail with an opaque "No such file or directory" error.
if [[ "$test_config_file" == "None" || ! -f "$test_config_file" ]]; then
    echo "WARN: test config '$test_config_file' not found; skipping test set generation." >&2
else
    # Discard stale PIDs carried over from the train phase so the
    # concurrency accounting starts clean.
    jobs=()
    while IFS= read -r line || [ -n "$line" ]; do
        # Skip blank lines.
        if [ -z "$line" ]; then
            continue
        fi

        # Parse the JSON object and extract the per-bootcamp fields;
        # split declaration from assignment so jq failures are visible.
        bootcamp_name=$(echo "$line" | jq -r '.bootcamp_name')
        declare -i sample_number
        sample_number=$(echo "$line" | jq -r '.sample_number')
        config_file=$(echo "$line" | jq -r '.config_file')
        bootcamp_cls_name=$(echo "$line" | jq -r '.bootcamp_cls_name')

        # "cipher" entries are handled by a dedicated generator later;
        # just remember the requested per-cipher sample count.
        if [[ "$config_file" == "cipher" ]]; then
            cipher_test_nums_for_single_cipher=$sample_number
            continue
        fi

        # Launch one generator per bootcamp in the background.
        python examples/pipelines/data_generator.py \
            --bootcamp_name "$bootcamp_name" \
            --n "$sample_number" \
            --save_file "examples/bootcamp_generator_outputs/$timestamp/test/${bootcamp_name}.jsonl" \
            --config_file "examples/pipelines/$config_type/${config_file}_test.json" \
            --bootcamp_cls_name "$bootcamp_cls_name" \
            --tokenizer "$tokenizer" \
            --max_prompt_len "$max_prompt_len" \
            --shuffle &

        jobs+=("$!")  # remember the background PID

        # Throttle: block while the number of tracked jobs is at the cap.
        while [ ${#jobs[@]} -ge "$max_jobs" ]; do
            wait -n  # reap any one finished child
            # Drop PIDs that are no longer running.
            new_jobs=()
            for job_pid in "${jobs[@]}"; do
                if kill -0 "$job_pid" 2>/dev/null; then
                    new_jobs+=("$job_pid")
                fi
            done
            jobs=("${new_jobs[@]}")
        done
    done < "$test_config_file"
fi

# Barrier: wait for every remaining test generator to finish.
wait
echo "test set generation finished"

# ---- cipher test-set generation ---------------------------------------------
# Ciphers use a dedicated generator fed by the word list; the per-cipher
# counts were collected from the "cipher" config entries above.
python examples/pipelines/cipher_data_generator.py \
    --nums "$cipher_test_nums_for_single_cipher" \
    --split test \
    --timestamp "$timestamp" \
    --filepath "$cipher_input_file"

# ---- cipher train-set generation --------------------------------------------
python examples/pipelines/cipher_data_generator.py \
    --nums "$cipher_train_nums_for_single_cipher" \
    --split train \
    --timestamp "$timestamp" \
    --filepath "$cipher_input_file"

# No background jobs remain at this point; kept as a final safety barrier.
wait