init-commit

This commit is contained in:
lilinyang 2025-05-23 15:27:15 +08:00
commit 18a552597a
3461 changed files with 1150579 additions and 0 deletions

View file

@ -0,0 +1,73 @@
import json
import os
def deduplicate_jsonl_by_field(input_file_path, output_file_path=None, field_name='id'):
    """Remove duplicate records from a JSON Lines file, keyed on one field.

    The first record seen for each distinct value of ``field_name`` is kept,
    in input order. Records whose ``field_name`` is missing or ``None`` are
    dropped. Blank lines are ignored. Lines that are not valid JSON, or that
    decode to something other than a JSON object, are reported on stdout and
    skipped.

    Args:
        input_file_path: Path of the JSON Lines file to read.
        output_file_path: Path to write the deduplicated records to. When
            falsy, the input file is overwritten in place.
        field_name: Name of the field whose value identifies a record.

    Returns:
        None. The result is written to disk and a summary is printed.

    Raises:
        TypeError: If a record's ``field_name`` value is unhashable
            (for example a list or a dict).
    """
    seen_ids = set()
    unique_entries = []

    # The whole input is read before anything is written, because the output
    # path may be the very same file.
    with open(input_file_path, 'r', encoding='utf-8') as infile:
        for line in infile:
            # A trailing or separating blank line is not a malformed record.
            if not line.strip():
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
                continue
            # Valid JSON that is not an object (number, string, list, ...)
            # has no fields to deduplicate on; skip it instead of crashing.
            if not isinstance(entry, dict):
                print(f"Skipping non-object JSON entry of type {type(entry).__name__}")
                continue
            entry_id = entry.get(field_name)
            if entry_id is not None and entry_id not in seen_ids:
                seen_ids.add(entry_id)
                unique_entries.append(entry)

    if not output_file_path:
        output_file_path = input_file_path

    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        for entry in unique_entries:
            outfile.write(json.dumps(entry, ensure_ascii=False) + '\n')
    print(f"Deduplication complete. {len(unique_entries)} unique entries written to {output_file_path}")
def get_difference_optimized(file1, file2, key_field, output_file):
    """Write the records of ``file1`` whose key does not appear in ``file2``.

    Both inputs are JSON Lines files. A record of ``file1`` is kept when its
    ``key_field`` value is not the ``key_field`` value of any record in
    ``file2``. Kept records preserve their order in ``file1`` (duplicates
    included). Blank lines in either file are ignored.

    A missing ``file2`` is treated as an empty file, so every record of
    ``file1`` is written to ``output_file`` and the returned path always
    refers to a freshly written file.

    Args:
        file1: Path of the baseline JSON Lines file.
        file2: Path of the JSON Lines file whose keys are subtracted.
        key_field: Name of the field used to compare records.
        output_file: Path the difference is written to.

    Returns:
        ``output_file``, the path of the file holding the difference.

    Raises:
        FileNotFoundError: If ``file1`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``key_field``.
    """
    # Load the baseline fully before writing: output_file may be file1 itself.
    with open(file1, 'r', encoding='utf-8') as f1:
        records1 = [json.loads(line) for line in f1 if line.strip()]

    # Collect the keys to subtract; no file2 means there is nothing to subtract.
    keys_in_file2 = set()
    if os.path.exists(file2):
        with open(file2, 'r', encoding='utf-8') as f2:
            for line in f2:
                if line.strip():
                    keys_in_file2.add(json.loads(line)[key_field])

    # Keep the baseline records whose key is absent from file2.
    difference_records = [
        record for record in records1 if record[key_field] not in keys_in_file2
    ]

    # Write the difference as JSON Lines.
    with open(output_file, 'w', encoding='utf-8') as out:
        for record in difference_records:
            out.write(json.dumps(record, ensure_ascii=False) + '\n')
    return output_file