mirror of
https://github.com/InternLM/InternBootcamp.git
synced 2026-04-19 12:58:04 +00:00
init-commit
This commit is contained in:
commit
18a552597a
3461 changed files with 1150579 additions and 0 deletions
86
examples/pipelines/quickgen_data_configs.py
Normal file
86
examples/pipelines/quickgen_data_configs.py
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
import json
|
||||
import jsonlines
|
||||
import os
|
||||
|
||||
import os
|
||||
import json
|
||||
import glob
|
||||
import re
|
||||
|
||||
# Number of samples to generate for each puzzle.
train_sample_number: int = 1000  # used for entries built from *_train.json configs
test_sample_number: int = 64  # used for entries built from *_test.json configs
|
||||
|
||||
def checkpath(target_dir):
    """Make sure ``target_dir`` exists, creating it (and parents) if missing.

    Creation failures are reported on stdout and never raised, so the caller
    keeps running either way. Returns ``None``.
    """
    # Guard clause: nothing to do when the path is already there.
    if os.path.exists(target_dir):
        return

    try:
        os.makedirs(target_dir)
        print(f"目录 {target_dir} 创建成功。")
    except FileExistsError:
        # Should not normally happen after the check above, but another
        # thread/process may have created the directory in the meantime.
        print(f"目录 {target_dir} 已存在。")
    except PermissionError:
        print(f"没有权限创建目录 {target_dir}。")
    except Exception as err:
        print(f"创建目录 {target_dir} 时出现错误: {err}")
|
||||
|
||||
def process_data_config(data_dir='examples/pipelines/puzzle_configs',
                        save_dir='examples/pipelines/data_configs'):
    """Build the train/test data-config JSONL files from the puzzle configs.

    Every file in ``data_dir`` whose name ends with ``_train.json`` or
    ``_test.json`` contributes one entry; all other files are ignored.
    Entries are written one JSON object per line to
    ``data_config_train.jsonl`` and ``data_config_test.jsonl`` in ``save_dir``.

    Args:
        data_dir: Directory containing the ``<name>_train.json`` /
            ``<name>_test.json`` puzzle config files.
        save_dir: Directory that receives the two JSONL files; it is passed
            to ``checkpath`` so it gets created when missing.

    Raises:
        OSError: If ``data_dir`` cannot be listed or an output file cannot be
            opened for writing.
    """
    train_data = []
    test_data = []

    # Keep the original ordering: sort on the capitalized file name.
    for file_name in sorted(os.listdir(data_dir), key=str.capitalize):
        # Match on the full suffix. A bare substring test would also accept
        # names such as "contest.json" or "foo_test.json.bak" and derive a
        # bogus config name from them.
        if file_name.endswith('_test.json'):
            suffix, sample_number, bucket = '_test.json', test_sample_number, test_data
        elif file_name.endswith('_train.json'):
            suffix, sample_number, bucket = '_train.json', train_sample_number, train_data
        else:
            continue

        config_file = file_name[:-len(suffix)]
        # The bootcamp name is the config name without underscores; the class
        # name is that with only its first character upper-cased. Slicing
        # keeps this safe for an empty name.
        bootcamp_name = config_file.replace("_", "")
        bootcamp_cls_name = bootcamp_name[:1].upper() + bootcamp_name[1:]

        bucket.append({
            "bootcamp_name": bootcamp_name,
            "sample_number": sample_number,
            "config_file": config_file,
            "bootcamp_cls_name": f"{bootcamp_cls_name}bootcamp"
        })

    # Make sure the output directory exists before writing.
    checkpath(save_dir)

    # Both outputs are derived from save_dir so they always land together.
    outputs = (
        ('data_config_train.jsonl', train_data),
        ('data_config_test.jsonl', test_data),
    )
    for output_name, entries in outputs:
        output_file = os.path.join(save_dir, output_name)
        with open(output_file, 'w', encoding='utf-8') as f_out:
            for entry in entries:
                f_out.write(json.dumps(entry) + '\n')
|
||||
|
||||
|
||||
# Run the generation only when executed as a script, not on import.
if __name__ == '__main__':
    process_data_config()
|
||||
Loading…
Add table
Add a link
Reference in a new issue