# InternBootcamp/internbootcamp/bootcamp_utils/deduplicator.py

import json
import os
def deduplicate_jsonl_by_field(input_file_path, output_file_path=None, field_name='id'):
    """Remove duplicate records from a JSON Lines file, keyed on one field.

    The first record seen for each distinct value of ``field_name`` is kept,
    in input order. The following lines are not carried over to the output:

    * blank lines (skipped silently);
    * lines that are not valid JSON (an error message is printed);
    * valid JSON that is not an object, e.g. a list or a number (a message
      is printed);
    * records where ``field_name`` is missing or ``null``.

    Args:
        input_file_path: Path of the JSON Lines file to read.
        output_file_path: Path to write the deduplicated records to. When
            omitted (or otherwise falsy), the input file is overwritten.
        field_name: Name of the field whose value identifies a record.

    Returns:
        None. The result is written to disk and a summary line is printed.
    """
    seen_keys = set()
    unique_entries = []

    # The whole input is read before anything is written, so overwriting the
    # input file in place is safe.
    with open(input_file_path, 'r', encoding='utf-8') as infile:
        for line in infile:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) are not records.
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
                continue
            if not isinstance(entry, dict):
                # Only JSON objects can carry the key field.
                print(f"Skipping non-object JSON line: {line.strip()!r}")
                continue

            entry_id = entry.get(field_name)
            if entry_id is None:
                continue

            try:
                hash(entry_id)
                key = entry_id
            except TypeError:
                # Lists/dicts are unhashable; compare them by canonical JSON
                # text. Wrapping in a tuple keeps the key distinct from any
                # plain string value, since JSON never decodes to a tuple.
                key = ('json', json.dumps(entry_id, sort_keys=True))

            if key not in seen_keys:
                seen_keys.add(key)
                unique_entries.append(entry)

    # Write unique entries to the output file (defaults to the input file).
    if not output_file_path:
        output_file_path = input_file_path
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        for entry in unique_entries:
            outfile.write(json.dumps(entry, ensure_ascii=False) + '\n')

    print(f"Deduplication complete. {len(unique_entries)} unique entries written to {output_file_path}")


def get_difference_optimized(file1, file2, key_field, output_file):
    """
    Write the records of ``file1`` whose ``key_field`` value is absent from ``file2``.

    Both inputs are JSON Lines files; blank lines in them are ignored. The
    surviving records are written to ``output_file`` in their original order.

    If ``file2`` does not exist there is nothing to subtract, so the result
    is every record of ``file1``. In that case nothing is written when
    ``output_file`` is ``file1`` itself, or when ``file1`` does not exist
    either.

    Args:
        file1: Path of the base JSON Lines file.
        file2: Path of the JSON Lines file whose keys are subtracted.
        key_field: Name of the field used to compare records.
        output_file: Path the difference is written to.

    Returns:
        ``output_file``, unchanged.

    Raises:
        KeyError: If a record in either file lacks ``key_field``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    subtrahend_exists = os.path.exists(file2)
    if not subtrahend_exists:
        # With nothing to subtract, file1 already is the answer when the
        # output points at it; a missing file1 leaves nothing to copy.
        in_place = os.path.abspath(file1) == os.path.abspath(output_file)
        if in_place or not os.path.exists(file1):
            return output_file

    def read_json_lines(file_path):
        """Parse every non-blank line of a JSON Lines file into a list."""
        with open(file_path, 'r', encoding='utf-8') as file:
            return [json.loads(line) for line in file if line.strip()]

    # file1 is loaded completely before writing, because output_file may be
    # the same path as file1.
    base_records = read_json_lines(file1)

    excluded_keys = set()
    if subtrahend_exists:
        excluded_keys = {record[key_field] for record in read_json_lines(file2)}

    # Keep the records of file1 whose key never appears in file2.
    difference_records = [
        record for record in base_records
        if record[key_field] not in excluded_keys
    ]

    with open(output_file, 'w', encoding='utf-8') as file:
        for record in difference_records:
            file.write(json.dumps(record, ensure_ascii=False) + '\n')

    return output_file