InternBootcamp/examples/pipelines/quickgen_data_configs.py
Yongkang Chen a8249acc18
update to tech report version (#10)
* feat(run_eval): add checkpoint resume functionality and update example documentation;
- update new bootcamp benchmark dataset

* refactor(data_pipeline): optimize data generation pipeline; add multiple preset configurations for data generation

* docs: update bootcamp list and add new scripts

- Update Fulllist_InternBootcamp.md with new bootcamps and categories
- Add new scripts to .gitignore:
  - examples/pipelines/filter_autogen_configs.py
  - examples/pipelines/quickgen_data_configs_from_eval_meta.py
- Update dependencies in setup.py:
  - Add scipy and scikit-learn

* refactor(internbootcamp): update bootcamp modules and improve error handling

- Update import statements in __init__.py files
- Add timestamp to target directory name in verl_data_preprocess.py
- Improve error handling and scoring logic in bootcamp_judger.py
- Remove unnecessary comments and update puzzle descriptions in multiple files
2025-08-28 12:39:47 +08:00

94 lines
No EOL
3.4 KiB
Python

import json
import jsonlines
import os
import argparse
import os
import json
import glob
import re
def checkpath(target_dir):
    """Make sure ``target_dir`` exists, creating it (and parents) if needed.

    This is a best-effort helper: failures are reported on stdout and are
    not re-raised, so the function always returns ``None``.

    Args:
        target_dir: Path of the directory to create.
    """
    # Nothing to do when the path is already present.
    if os.path.exists(target_dir):
        return

    try:
        os.makedirs(target_dir)
        print(f"目录 {target_dir} 创建成功。")
    except FileExistsError:
        # Something else created the path between the check above and makedirs.
        print(f"目录 {target_dir} 已存在。")
    except PermissionError:
        print(f"没有权限创建目录 {target_dir}")
    except Exception as err:
        # Any other failure is reported but deliberately not propagated.
        print(f"创建目录 {target_dir} 时出现错误: {err}")
def _split_config_name(file_name):
    """Return ``(config_stem, mode)`` for a config file name, or ``None``.

    Only names that END in ``_test.json`` or ``_train.json`` are accepted;
    ``config_stem`` is the file name with that suffix removed and ``mode`` is
    ``'test'`` or ``'train'``.
    """
    for mode in ('test', 'train'):
        suffix = f'_{mode}.json'
        if file_name.endswith(suffix):
            return file_name[:-len(suffix)], mode
    return None


def _write_jsonl(path, entries):
    """Write ``entries`` to ``path`` as UTF-8 text, one JSON object per line."""
    with open(path, 'w', encoding='utf-8') as f_out:
        for entry in entries:
            f_out.write(json.dumps(entry) + '\n')


def process_data_config(config_dir, train_sample_number, test_sample_number,
                        save_dir='examples/pipelines/data_configs'):
    """Generate the train/test data-config JSONL files from a config directory.

    Each file in ``config_dir`` named ``<stem>_train.json`` or
    ``<stem>_test.json`` produces one entry with the keys ``bootcamp_name``
    (the stem with underscores removed), ``sample_number``, ``config_file``
    (the stem) and ``bootcamp_cls_name`` (``bootcamp_name`` with its first
    character upper-cased, followed by ``"bootcamp"``). Other files are
    ignored. Entries are sorted case-insensitively by ``bootcamp_cls_name``
    and written to ``data_config_train.jsonl`` and ``data_config_test.jsonl``
    inside ``save_dir``; both files are always (re)written, even when empty.

    Args:
        config_dir: Directory whose file names are scanned.
        train_sample_number: ``sample_number`` stored in every train entry.
        test_sample_number: ``sample_number`` stored in every test entry.
        save_dir: Directory that receives the two JSONL files. It is created
            through ``checkpath`` when it does not exist.

    Raises:
        OSError: If ``config_dir`` cannot be listed or an output file cannot
            be opened for writing.
    """
    sample_numbers = {'train': train_sample_number, 'test': test_sample_number}
    entries = {'train': [], 'test': []}

    # os.listdir returns names in arbitrary order. Pre-sorting keeps the
    # relative order of entries whose class names tie in the final (stable)
    # sort reproducible from run to run.
    for file_name in sorted(os.listdir(config_dir), key=str.capitalize):
        parsed = _split_config_name(file_name)
        if parsed is None:
            continue
        config_file, mode = parsed

        bootcamp_name = config_file.replace("_", "")
        # Slicing instead of indexing keeps an empty stem from raising.
        bootcamp_cls_name = bootcamp_name[:1].upper() + bootcamp_name[1:]

        entries[mode].append({
            "bootcamp_name": bootcamp_name,
            "sample_number": sample_numbers[mode],
            "config_file": config_file,
            "bootcamp_cls_name": f"{bootcamp_cls_name}bootcamp",
        })

    # Case-insensitive ordering of the final lists.
    for mode_entries in entries.values():
        mode_entries.sort(key=lambda item: item['bootcamp_cls_name'].lower())

    # Make sure the output directory exists before writing.
    checkpath(save_dir)

    _write_jsonl(os.path.join(save_dir, 'data_config_train.jsonl'), entries['train'])
    _write_jsonl(os.path.join(save_dir, 'data_config_test.jsonl'), entries['test'])
if __name__ == '__main__':
    # Command-line entry point: read the three options and run the generator.
    cli = argparse.ArgumentParser(description='Process data config files.')
    cli.add_argument(
        '--config_dir',
        type=str,
        default='examples/pipelines/puzzle_configs',
        help='Directory containing config files',
    )
    cli.add_argument(
        '--train_sample_number',
        type=int,
        default=1000,
        help='Number of training samples per task',
    )
    cli.add_argument(
        '--test_sample_number',
        type=int,
        default=0,
        help='Number of test samples per task',
    )
    options = cli.parse_args()

    process_data_config(
        options.config_dir,
        options.train_sample_number,
        options.test_sample_number,
    )