mirror of
https://github.com/InternLM/InternBootcamp.git
synced 2026-04-19 12:58:04 +00:00
73 lines
No EOL
2.6 KiB
Python
Executable file
73 lines
No EOL
2.6 KiB
Python
Executable file
import json
|
||
import os
|
||
def deduplicate_jsonl_by_field(input_file_path, output_file_path=None, field_name='id'):
    """Remove duplicate records from a JSON Lines file, keyed on one field.

    The first record seen for each distinct value of ``field_name`` is kept,
    in input order. Records whose ``field_name`` is missing or ``None`` are
    dropped. Blank lines are ignored; lines that are not valid JSON are
    reported on stdout and skipped.

    Args:
        input_file_path: Path of the JSON Lines file to read.
        output_file_path: Path to write the deduplicated records to. When
            falsy, the input file itself is overwritten.
        field_name: Name of the field whose value identifies a record.

    Returns:
        None. A summary line is printed to stdout once the file is written.
    """
    seen_ids = set()
    unique_entries = []

    # The whole input is read (and closed) before anything is written, so
    # overwriting the input file in place is safe.
    with open(input_file_path, 'r', encoding='utf-8') as infile:
        for line in infile:
            # Blank lines are not records; skip them instead of reporting
            # them as decode errors.
            if not line.strip():
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
                continue
            entry_id = entry.get(field_name)
            if entry_id is not None and entry_id not in seen_ids:
                seen_ids.add(entry_id)
                unique_entries.append(entry)

    if not output_file_path:
        output_file_path = input_file_path
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        for entry in unique_entries:
            outfile.write(json.dumps(entry, ensure_ascii=False) + '\n')

    print(f"Deduplication complete. {len(unique_entries)} unique entries written to {output_file_path}")
|
||
|
||
|
||
def get_difference_optimized(file1, file2, key_field, output_file):
    """Write the records of ``file1`` whose key does not appear in ``file2``.

    Both inputs are JSON Lines files. A record from ``file1`` is kept when
    its ``key_field`` value is absent from the set of ``key_field`` values
    found in ``file2``. Kept records are written to ``output_file`` in their
    original order (duplicates within ``file1`` are preserved). Blank lines
    in either input are ignored.

    If ``file2`` does not exist, the function returns immediately without
    reading ``file1`` or writing ``output_file``.

    Args:
        file1: Path of the base JSON Lines file.
        file2: Path of the JSON Lines file whose keys are subtracted.
        key_field: Name of the field used to compare records.
        output_file: Path the difference is written to.

    Returns:
        ``output_file``.

    Raises:
        KeyError: If a record in either file lacks ``key_field``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not os.path.exists(file2):
        return output_file

    # Load the base records; blank lines (e.g. a trailing empty line) are
    # skipped rather than handed to json.loads, which would raise on them.
    records1 = []
    with open(file1, 'r', encoding='utf-8') as f1:
        for line in f1:
            if not line.strip():
                continue
            record = json.loads(line)
            record[key_field]  # fail early if the key is missing
            records1.append(record)

    # Collect every key present in the file being subtracted.
    keys_in_file2 = set()
    with open(file2, 'r', encoding='utf-8') as f2:
        for line in f2:
            if not line.strip():
                continue
            keys_in_file2.add(json.loads(line)[key_field])

    # Keep the base records whose key never occurs in file2.
    difference_records = [
        record for record in records1 if record[key_field] not in keys_in_file2
    ]

    with open(output_file, 'w', encoding='utf-8') as out:
        for record in difference_records:
            out.write(json.dumps(record, ensure_ascii=False) + '\n')

    return output_file