import json
import os
import argparse
from collections import defaultdict
import pandas as pd # For easier display of grouped averages
import traceback # For detailed error logging

def _emit(outfile, message):
    """Write *message* to the summary file and echo it (stripped) to stdout."""
    outfile.write(message)
    print(message.strip())


def _mean(values):
    """Return the arithmetic mean of *values*, or 0 for an empty sequence."""
    return sum(values) / len(values) if values else 0


def analyze_json_files(json_directory, output_file_path):
    """Summarize every ``*_rl.json`` file in a directory into a text report.

    Each matching file is expected to contain a JSON list of objects. For
    every object the ``prompt`` and ``llm_response`` lengths are grouped by
    ``response_type`` (``"UNKNOWN_RESPONSE_TYPE"`` when the key is absent),
    and per-type averages are written to the report as a table. Progress and
    warnings are mirrored to stdout.

    Args:
        json_directory: Directory that is scanned (non-recursively) for files
            whose name ends in ``_rl.json``.
        output_file_path: Path of the text report; it is overwritten.

    Returns:
        None. Results are written to ``output_file_path``.

    Notes:
        Unexpected errors are not propagated: they are printed with a
        traceback and additionally written to
        ``analyze_rl_json_CRITICAL_ERROR.log`` in the current directory.
    """
    print(f"DEBUG: analyze_json_files called with json_directory='{json_directory}', output_file_path='{output_file_path}'")

    total_rows = 0
    total_characters = 0
    response_type_stats = defaultdict(lambda: {'prompt_chars': [], 'response_chars': []})

    try:
        # Explicit UTF-8 so the report does not depend on the platform locale
        # (paths and response types may contain non-ASCII characters).
        with open(output_file_path, 'w', encoding='utf-8') as outfile:
            print(f"DEBUG: Output file '{output_file_path}' opened successfully for writing.")
            outfile.write(f"Analysis script started. Outputting to: {output_file_path}\n")
            outfile.write(f"Analyzing JSON files from directory: {json_directory}\n")
            outfile.flush()

            if not os.path.isdir(json_directory):
                _emit(outfile, f"Error: Directory not found: {json_directory}\n")
                return

            json_files_processed = 0
            # Sorted so the report is reproducible; os.listdir order is arbitrary.
            for filename in sorted(os.listdir(json_directory)):
                if not filename.endswith("_rl.json"):
                    continue

                file_path = os.path.join(json_directory, filename)
                _emit(outfile, f"Processing file: {file_path}...\n")
                # Flush so progress is visible even if a later file crashes the run.
                outfile.flush()
                try:
                    # NOTE: this is the on-disk size in bytes, which the report
                    # labels as "characters"; the two differ for non-ASCII data.
                    total_characters += os.path.getsize(file_path)
                    json_files_processed += 1

                    # Binary mode lets the json module detect UTF-8/16/32 (and a
                    # BOM) itself instead of relying on the locale's encoding.
                    with open(file_path, 'rb') as f:
                        data = json.load(f)

                    if not isinstance(data, list):
                        _emit(outfile, f"  Warning: Expected a list of objects in {filename}, got {type(data)}. Skipping file.\n")
                        continue

                    total_rows += len(data)

                    for entry in data:
                        # A stray non-object element must not abort the rest of
                        # the file; skip just that element.
                        if not isinstance(entry, dict):
                            _emit(outfile, f"  Warning: Skipping non-object entry of type {type(entry).__name__} in {filename}.\n")
                            continue

                        response_type = entry.get('response_type', "UNKNOWN_RESPONSE_TYPE")
                        # Lists/dicts are the only unhashable JSON values; use
                        # their JSON text so they can serve as a grouping key.
                        if isinstance(response_type, (list, dict)):
                            response_type = json.dumps(response_type, sort_keys=True)
                        bucket = response_type_stats[response_type]

                        prompt_content = entry.get('prompt')
                        if isinstance(prompt_content, str):
                            bucket['prompt_chars'].append(len(prompt_content))

                        llm_response_content = entry.get('llm_response')
                        if llm_response_content is None:
                            continue
                        if isinstance(llm_response_content, str):
                            bucket['response_chars'].append(len(llm_response_content))
                            continue
                        # Structured responses are measured by their JSON text.
                        try:
                            bucket['response_chars'].append(len(json.dumps(llm_response_content)))
                        except TypeError:
                            _emit(outfile, f"  Warning: Could not serialize llm_response in {filename}.\n")
                except json.JSONDecodeError:
                    _emit(outfile, f"  Warning: Could not decode JSON from {filename}. Skipping file.\n")
                except Exception as e:
                    # Best-effort: one unreadable file must not stop the analysis.
                    _emit(outfile, f"  Warning: An error occurred processing {filename}: {e}. Skipping file.\n")

            if json_files_processed == 0:
                _emit(outfile, f"No '*_rl.json' files found in {json_directory}.\n")
                return

            outfile.write("\n--- Overall Statistics ---\n")
            outfile.write(f"Total JSON files processed: {json_files_processed}\n")
            outfile.write(f"Total JSON objects (rows) generated: {total_rows}\n")
            outfile.write(f"Total characters of JSON generated (sum of file sizes): {total_characters:,}\n")

            outfile.write("\n--- Average Lengths by Response Type (in characters) ---\n")
            _emit(outfile, f"Found {len(response_type_stats)} unique response_type categories.\n")

            avg_data = []
            for rt, stats_item in response_type_stats.items():
                prompt_lengths = stats_item['prompt_chars']
                response_lengths = stats_item['response_chars']
                avg_data.append({
                    'Response Type': rt,
                    # Entries may lack a prompt or a response, so the larger of
                    # the two sample counts is reported as the group size.
                    'Count': max(len(prompt_lengths), len(response_lengths)),
                    'Avg Prompt Length': f"{_mean(prompt_lengths):.2f}",
                    'Avg LLM Response Length': f"{_mean(response_lengths):.2f}"
                })

            if avg_data:
                df_avg = pd.DataFrame(avg_data)
                outfile.write(df_avg.to_string(index=False) + "\n")
                print("DataFrame successfully written.")
            else:
                _emit(outfile, "No data available for response type analysis.\n")

            outfile.write("\nAnalysis script finished successfully.\n")
            print(f"\nAnalysis complete. Summary saved to: {output_file_path}")

    except Exception as e:
        # Top-level boundary: report the failure instead of propagating it.
        print(f"FATAL SCRIPT ERROR: An exception occurred: {e}")
        traceback.print_exc()
        # Attempt to write to a fallback error log
        try:
            with open('analyze_rl_json_CRITICAL_ERROR.log', 'w', encoding='utf-8') as err_log:
                err_log.write(f"Timestamp: {pd.Timestamp.now()}\n")
                err_log.write(f"Failed during script execution.\nError: {e}\n")
                err_log.write(f"Traceback:\n{traceback.format_exc()}\n")
        except Exception as e_fallback:
            print(f"CRITICAL FALLBACK LOGGING FAILED: {e_fallback}")

if __name__ == '__main__':
    # Command-line entry point: parse the input directory and the optional
    # report path, then run the analysis.
    cli = argparse.ArgumentParser(description='Analyze generated RL JSON files.')
    cli.add_argument(
        'json_dir',
        type=str,
        help='Directory containing the *_rl.json files to analyze.',
    )
    cli.add_argument(
        '--output_file',
        type=str,
        default='analysis_summary.txt',
        help='Path to save the analysis summary (default: analysis_summary.txt in the CWD).',
    )
    parsed = cli.parse_args()

    # Both paths are resolved against the current working directory so the
    # report shows unambiguous absolute locations.
    analyze_json_files(
        os.path.abspath(parsed.json_dir),
        os.path.abspath(parsed.output_file),
    )