InternBootcamp/internbootcamp/bootcamp_utils/jsonlines.py
2025-05-23 15:27:15 +08:00

83 lines
No EOL
3.2 KiB
Python
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
def append_json_lines(source_file, target_file):
    """Append every JSON Lines record of ``source_file`` to ``target_file``.

    Lines are copied verbatim (they are not parsed), except that record
    boundaries are kept intact:

    * if ``target_file`` already has content that does not end with a
      newline, a newline is written before the first appended record so it
      is not glued onto the target's last record;
    * if the last line of ``source_file`` has no trailing newline, one is
      added so that later appends start on a fresh line.

    Args:
        source_file: Path of the JSON Lines file to read records from.
        target_file: Path of the JSON Lines file the records are appended
            to. It is created if it does not exist.

    Raises:
        OSError: If either file cannot be opened.
    """
    # Look at the last byte of an existing target to find out whether its
    # final record is already newline-terminated.
    needs_separator = False
    try:
        with open(target_file, 'rb') as existing:
            existing.seek(0, 2)  # 2 == os.SEEK_END
            if existing.tell() > 0:
                existing.seek(-1, 2)
                needs_separator = existing.read(1) != b'\n'
    except FileNotFoundError:
        # No target yet: append mode below creates it, nothing to separate.
        pass

    # Open the target in append mode, then stream the source line by line.
    with open(target_file, 'a', encoding='utf-8') as outfile:
        with open(source_file, 'r', encoding='utf-8') as infile:
            for line in infile:
                if needs_separator:
                    # Written lazily so an empty source leaves the target
                    # untouched.
                    outfile.write('\n')
                    needs_separator = False
                outfile.write(line if line.endswith('\n') else line + '\n')
def read_jsonl(path, encoding='utf-8'):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line is parsed with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing empty line) are skipped
    instead of raising a decode error.

    Args:
        path: Path of the JSON Lines file to read.
        encoding: Text encoding used to open the file.

    Returns:
        A list with one parsed value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    with open(path, 'r', encoding=encoding) as f:
        # Iterate the file object directly so lines are streamed rather than
        # materialised twice (once by readlines, once by the result list).
        return [json.loads(line) for line in f if line.strip()]
def add_jsonl(data, path, encoding='utf-8'):
    """Append a single record to a JSON Lines file.

    ``data`` is serialised with ``json.dumps`` (non-ASCII characters are kept
    as-is) and written as one newline-terminated line at the end of ``path``.
    The file is opened in append mode, so it is created when missing.
    """
    with open(path, mode='a', encoding=encoding) as sink:
        serialized = json.dumps(data, ensure_ascii=False)
        sink.write(f'{serialized}\n')
def write_jsonl(data, path, encoding='utf-8'):
    """Write an iterable of records to a JSON Lines file.

    The file at ``path`` is opened in write mode, so any previous content is
    replaced. Every item of ``data`` becomes one newline-terminated line,
    serialised with ``json.dumps`` without escaping non-ASCII characters.
    """
    with open(path, mode='w', encoding=encoding) as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in data
        )
def extend_jsonl(datas, path, encoding='utf-8'):
    """Append several records to a JSON Lines file.

    The file at ``path`` is opened in append mode and each item of ``datas``
    is written after the existing content as one newline-terminated line,
    serialised with ``json.dumps`` without escaping non-ASCII characters.
    """
    with open(path, mode='a', encoding=encoding) as sink:
        for record in datas:
            # print() supplies the line terminator after the serialised record.
            print(json.dumps(record, ensure_ascii=False), file=sink)
import os
from itertools import count
def merge_jsonlines(input_directory, output_file):
    """Merge every ``.jsonl`` file under a directory tree into one file.

    The tree rooted at ``input_directory`` is walked recursively. Each JSON
    object found is written to ``output_file`` as one line, prefixed with two
    extra fields:

    * ``id``: a counter starting at 1 and shared across all input files;
    * ``source_filename``: the input file's path relative to
      ``input_directory``.

    The record's own keys are copied afterwards, so a record that already
    carries ``id`` or ``source_filename`` keeps its own value for that key.

    Directories and files are visited in sorted order so the assigned ids are
    reproducible between runs. The output file itself is skipped if it lives
    inside ``input_directory``. Blank lines are ignored; lines that are not
    valid JSON, or that are valid JSON but not an object, are reported with a
    warning on stdout and skipped.

    Args:
        input_directory: Root directory searched for ``.jsonl`` files.
        output_file: Path of the merged file; it is overwritten.

    Raises:
        OSError: If the output file or an input file cannot be opened.
    """
    id_counter = count(start=1)
    # Resolved once so the output can be recognised if the walk reaches it.
    output_real = os.path.realpath(output_file)
    with open(output_file, 'w', encoding='utf-8') as outfile:
        # os.walk traverses the directory and all of its subdirectories.
        for root, dirs, files in os.walk(input_directory):
            # os.walk yields names in arbitrary order; sorting ``dirs`` in
            # place also fixes the order in which subdirectories are visited.
            dirs.sort()
            for filename in sorted(files):
                if not filename.endswith('.jsonl'):
                    continue
                file_path = os.path.join(root, filename)
                if os.path.realpath(file_path) == output_real:
                    # Never feed the file being written back into itself.
                    continue
                source_name = os.path.relpath(file_path, input_directory)
                with open(file_path, 'r', encoding='utf-8') as infile:
                    for line in infile:
                        if not line.strip():
                            continue
                        try:
                            record = json.loads(line)
                        except json.JSONDecodeError:
                            print(f"Warning: Could not decode line in {filename}")
                            continue
                        if not isinstance(record, dict):
                            # Only JSON objects can take the extra fields.
                            print(f"Warning: Skipping non-object record in {filename}")
                            continue
                        # New fields first, then the record's own keys.
                        new_record = {
                            'id': next(id_counter),
                            'source_filename': source_name,
                            **record,
                        }
                        outfile.write(json.dumps(new_record, ensure_ascii=False) + '\n')
if __name__ == '__main__':
    # Script entry point: merge one hard-coded evaluation output directory
    # into a single JSON Lines file next to it.
    directory, output_file = (
        'data_generator_outputs/_Eval/20250114_153504/easy',
        'data_generator_outputs/_Eval/20250114_153504/easy_test.jsonl',
    )
    merge_jsonlines(input_directory=directory, output_file=output_file)