"""Convert CSV logs of LLM responses into JSON files for RL fine-tuning.

Two command-line modes are supported: converting a single CSV file, or
scanning a directory for sub-directories whose name ends in ``FULL_GAME`` and
converting the ``llm_responses.csv`` found in each of them.
"""

import argparse
import json
import os
import sys
import traceback

import pandas as pd


def parse_success_value(value):
    """Converts success string to boolean or keeps as is if not clearly boolean.

    Args:
        value: Raw cell value taken from the ``success`` column.

    Returns:
        ``True`` / ``False`` for the strings ``"true"`` / ``"false"`` (any
        letter case), ``None`` for a non-string value that ``pd.isna``
        reports as missing, and ``value`` unchanged in every other case.
    """
    if isinstance(value, str):
        lowered = value.lower()
        if lowered == 'true':
            return True
        if lowered == 'false':
            return False
        return value
    if pd.isna(value):
        return None
    return value


def _none_if_missing(value):
    """Return None for a missing scalar cell (NaN/None/NA), else the value as is."""
    # pd.isna returns an array for list-likes, so only test scalars.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    return value


def _parse_llm_response(raw_response):
    """Decode a raw response cell as JSON when it looks like a JSON object/array.

    Anything that is not a string, does not look like JSON, or fails to
    decode is returned unchanged (missing cells become None).
    """
    raw_response = _none_if_missing(raw_response)
    if not isinstance(raw_response, str):
        return raw_response

    stripped = raw_response.strip()
    looks_like_json = (
        stripped.startswith(('{', '[')) and stripped.endswith(('}', ']'))
    )
    if not looks_like_json:
        return raw_response

    try:
        return json.loads(raw_response)
    except json.JSONDecodeError:
        # Best effort: keep the original text when it is not valid JSON.
        return raw_response


def _row_to_entry(row, game_id):
    """Build one output record from a CSV row (a pandas Series)."""
    return {
        "game_id": game_id,
        "model": _none_if_missing(row.get('model')),
        "power": _none_if_missing(row.get('power')),
        "phase": _none_if_missing(row.get('phase')),
        "response_type": _none_if_missing(row.get('response_type')),
        "prompt": _none_if_missing(row.get('raw_input')),
        "llm_response": _parse_llm_response(row.get('raw_response')),
        "success": parse_success_value(row.get('success')),
    }


def convert_csv_to_rl_json(csv_file_path: str, output_json_path: str, game_id: str) -> bool:
    """
    Converts a CSV file of LLM responses into a JSON format suitable for RL fine-tuning.

    Each CSV row becomes one JSON object with the keys ``game_id``, ``model``,
    ``power``, ``phase``, ``response_type``, ``prompt`` (from ``raw_input``),
    ``llm_response`` (from ``raw_response``, JSON-decoded when possible) and
    ``success``. Empty cells and absent columns are written as ``null`` so
    that the output never contains the non-standard ``NaN`` token.

    Args:
        csv_file_path (str): Path to the input CSV file.
        output_json_path (str): Path for the output JSON file. Missing parent
            directories are created; a bare file name (no directory part) is
            written to the current working directory.
        game_id (str): The game identifier for this conversion.

    Returns:
        bool: True if conversion was successful, False otherwise. Errors are
        reported on stdout and never propagate to the caller.
    """
    try:
        print(f" Attempting to read CSV: {csv_file_path}")
        if not os.path.exists(csv_file_path):
            print(f" Error: CSV file not found at {csv_file_path}")
            return False

        df = pd.read_csv(csv_file_path)
        rl_data = [_row_to_entry(row, game_id) for _, row in df.iterrows()]

        # os.makedirs('') raises FileNotFoundError, so skip it when the
        # output path has no directory component.
        output_dir = os.path.dirname(output_json_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        print(f" Writing JSON output to: {output_json_path}")
        with open(output_json_path, 'w', encoding='utf-8') as f:
            json.dump(rl_data, f, indent=4)
        print(f" Successfully converted CSV to JSON for game_id '{game_id}': {output_json_path}")
        return True

    except FileNotFoundError:
        print(f" Error: The file {csv_file_path} was not found during conversion for game_id '{game_id}'.")
        return False
    except pd.errors.EmptyDataError:
        print(f" Error: The file {csv_file_path} is empty for game_id '{game_id}'.")
        return False
    except Exception as e:
        # Top-level boundary for one conversion: report and signal failure so
        # that a batch run can continue with the next game.
        print(f" An unexpected error occurred during conversion for game_id '{game_id}': {e}")
        traceback.print_exc()
        return False


def _build_arg_parser():
    """Create the command-line parser with the two mutually exclusive modes."""
    parser = argparse.ArgumentParser(
        description='Convert LLM responses CSV to RL-ready JSON. '
                    'Operates in one of two modes: single CSV file conversion or batch directory scan.'
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        '--input_csv',
        type=str,
        help='Path to a single input CSV file. Output JSON is saved in the same directory.',
    )
    group.add_argument(
        '--scan_dir',
        type=str,
        help="Path to a root directory (e.g., results/) to scan for subdirectories ending in 'FULL_GAME'. "
             "Output JSONs are saved in a '/json/' subdirectory.",
    )
    return parser


def _run_single_file(input_csv):
    """Convert one CSV file next to itself; return the process exit code."""
    input_csv_path_arg = os.path.abspath(input_csv)
    if not os.path.exists(input_csv_path_arg):
        print(f"Error: Input CSV file does not exist at {input_csv_path_arg}")
        return 1

    csv_dir = os.path.dirname(input_csv_path_arg)
    # The game id is the name of the folder that contains the CSV.
    game_id_derived = os.path.basename(csv_dir)

    # The output JSON goes next to the input CSV, with an "_rl.json" suffix.
    csv_stem = os.path.splitext(os.path.basename(input_csv_path_arg))[0]
    output_json_file_path = os.path.join(csv_dir, csv_stem + "_rl.json")

    print(f"Starting single file conversion for: {input_csv_path_arg}")
    print(f" Game ID (derived from parent folder): {game_id_derived}")
    print(f" Outputting to: {output_json_file_path}")

    if convert_csv_to_rl_json(input_csv_path_arg, output_json_file_path, game_id_derived):
        print("Single file conversion successful.")
        return 0

    print("Single file conversion failed.")
    # A failed conversion is reported through the exit status as well.
    return 1


def _run_batch(scan_dir):
    """Convert every '*FULL_GAME' sub-directory of scan_dir; return the exit code."""
    scan_directory_arg = os.path.abspath(scan_dir)
    if not os.path.isdir(scan_directory_arg):
        print(f"Error: Scan directory does not exist or is not a directory: {scan_directory_arg}")
        return 1

    output_base_dir = os.path.join(scan_directory_arg, "json")
    os.makedirs(output_base_dir, exist_ok=True)
    print(f"Starting batch conversion. Scanning directory: {scan_directory_arg}")
    print(f"Outputting all JSON files to: {output_base_dir}")

    processed_games_count = 0
    found_target_dirs = 0

    for item_name in os.listdir(scan_directory_arg):
        item_path = os.path.join(scan_directory_arg, item_name)
        if not (os.path.isdir(item_path) and item_name.endswith("FULL_GAME")):
            continue

        found_target_dirs += 1
        # In batch mode the directory name doubles as the game id.
        current_game_id = item_name
        csv_to_process = os.path.join(item_path, "llm_responses.csv")
        output_json_for_game = os.path.join(output_base_dir, f"{current_game_id}_rl.json")

        if not os.path.exists(csv_to_process):
            print(f"Warning: 'llm_responses.csv' not found in directory {item_path}. Skipping this directory.")
            continue

        print(f"Processing game directory: {item_name}")
        print(f" Input CSV: {csv_to_process}")
        if convert_csv_to_rl_json(csv_to_process, output_json_for_game, current_game_id):
            processed_games_count += 1

    if found_target_dirs == 0:
        print(f"No subdirectories ending with 'FULL_GAME' found in {scan_directory_arg}.")
    elif processed_games_count == 0 and found_target_dirs > 0:
        print(f"Found {found_target_dirs} director(y/ies) ending with 'FULL_GAME', but none contained 'llm_responses.csv' or failed processing.")
    else:
        print(f"Batch conversion completed. Successfully processed {processed_games_count} out of {found_target_dirs} found 'FULL_GAME' director(y/ies).")
    return 0


def main(argv=None):
    """Run the command-line interface.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        int: Process exit code. 0 on success; 1 when the input path is
        invalid or a single-file conversion fails. Batch mode is best effort
        and returns 0 once the scan directory has been processed.
    """
    args = _build_arg_parser().parse_args(argv)
    if args.input_csv:
        return _run_single_file(args.input_csv)
    if args.scan_dir:
        return _run_batch(args.scan_dir)
    return 0


if __name__ == '__main__':
    sys.exit(main())