diff --git a/diplomacy/engine/game.py b/diplomacy/engine/game.py index 8e72b71..db1d105 100644 --- a/diplomacy/engine/game.py +++ b/diplomacy/engine/game.py @@ -4695,14 +4695,42 @@ class Game(Jsonable): f"RESULTS:\n{results_block}\n\n" f"CURRENT BOARD STATE:\n{current_state_block}\n\n" f"CHANGES FROM PREVIOUS PHASE:\n{differences_block}\n\n" - "Please write a concise but detailed summary of what happened this turn, including " - "important captures, retreats, or changes in board position, using a helpful, neutral tone." + "Below is the final board state after the latest phase, along with the moves each power submitted and the engine’s adjudication results. Please create a summary in JSON, explaining:" + "- Each successful move," + "- Each bounce or voided order, with reasons (e.g. equal force, no valid route, contradictory support)," + "- Key changes in supply centers," + "- Potential strategic ramifications if relevant." + + "Return ONLY JSON:" + + "PARSABLE OUTPUT:" + "{{" + "'summary': ... your text ..." + "}}" ) # We might also have a system prompt to guide the AI, e.g.: system_prompt = ( - "You are an AI summarizing a Diplomacy game turn based on the provided data. " - "Focus on describing important changes, successes/failures of orders, and new unit positions." + """ + You are a Diplomacy expert, summarizing the results of the latest phase. + Your tasks: + 1) Provide a concise summary of how the board changed. + 2) Specifically list each voided or bounced order, and *why* it occurred. + 3) If possible, describe which moves or supports succeeded and how that affected centers. + + Format: + - Must return a JSON with the top-level key "summary" or "orders" or similar. + - Possibly: + + PARSABLE OUTPUT: + { + "summary": "...(your textual summary)..." + } + + Ensure the summary clarifies reasons for bounces, e.g., "F TRI -> VEN bounced because Italy also moved A VEN -> TRI with equal force." + + No extra text outside the JSON block. 
def _check_orders(game, power_name, orders):
    """Normalize and validate one batch of LLM-produced orders.

    Returns a ``(normalized, problems)`` pair: ``normalized`` holds the orders
    that validated, rewritten with single spaces between tokens; ``problems``
    holds one human-readable message per rejected order.
    """
    normalized = []
    problems = []
    # ``orders or ()`` lets a client that returned None behave like "no orders".
    for move in orders or ():
        if not isinstance(move, str):
            problems.append(
                f"Order {move!r} is not a string; expected 'A PAR H' style."
            )
            continue

        # Split on any run of whitespace so stray leading, trailing or doubled
        # spaces in model output do not corrupt the unit / order split.
        parts = move.split()
        if len(parts) < 3:
            problems.append(
                f"Order '{move}' is malformed; expected 'A PAR H' style."
            )
            continue
        unit = " ".join(parts[:2])         # e.g. "A PAR"
        order_part = " ".join(parts[2:])   # e.g. "H" or "S A MAR"

        # NOTE(review): power_name is forwarded unchanged and report=1 is kept
        # from the original call; confirm the engine's private _valid_order
        # accepts a power *name* and that reporting is safe from worker threads.
        validity = game._valid_order(power_name, unit, order_part, report=1)
        if validity != 1:
            problems.append(
                f"Order '{move}' returned validity={validity}. "
                "(None/-1=invalid, 0=partial, 1=valid)"
            )
            continue
        normalized.append(f"{unit} {order_part}")
    return normalized, problems


def get_valid_orders_with_retry(game,
                                client,
                                board_state,
                                power_name,
                                possible_orders,
                                conversation_text_for_orders,
                                phase_summaries,
                                model_error_stats,
                                max_retries=3):
    """Request orders from the LLM client, re-prompting with validation feedback.

    Up to ``max_retries`` times, ``client.get_orders`` is called and every
    returned order is checked with ``game._valid_order``. When any order is
    rejected, the rejection messages are appended to the conversation text
    (under an ``[ORDER VALIDATION FEEDBACK]`` header) for the next attempt.

    Args:
        game: Object exposing ``_valid_order(power, unit, order, report=...)``.
        client: Object exposing ``get_orders(...)`` and ``fallback_orders(...)``.
        board_state: Forwarded untouched to ``client.get_orders``.
        power_name: Power the orders are for; forwarded to client and validator.
        possible_orders: Forwarded to ``client.get_orders`` and, on exhaustion,
            to ``client.fallback_orders``.
        conversation_text_for_orders: Base conversation text. It is never
            modified; feedback is added to a per-attempt copy.
        phase_summaries: Forwarded untouched to ``client.get_orders``.
        model_error_stats: Forwarded untouched to ``client.get_orders``.
        max_retries: Maximum number of attempts before falling back.

    Returns:
        The whitespace-normalized orders of the first attempt in which every
        order validated (an empty list if the client returned no orders),
        otherwise ``client.fallback_orders(possible_orders)``.
    """
    error_feedback = ""
    for attempt in range(max_retries):
        if error_feedback:
            # Only the most recent attempt's feedback is sent, as before.
            conversation_text = (
                (conversation_text_for_orders or "")
                + "\n\n[ORDER VALIDATION FEEDBACK]\n"
                + error_feedback
            )
        else:
            conversation_text = conversation_text_for_orders

        orders = client.get_orders(
            board_state=board_state,
            power_name=power_name,
            possible_orders=possible_orders,
            conversation_text=conversation_text,
            phase_summaries=phase_summaries,
            model_error_stats=model_error_stats,
        )

        valid_orders, problems = _check_orders(game, power_name, orders)
        if not problems:
            return valid_orders

        error_feedback = (
            f"Attempt {attempt + 1}/{max_retries} had invalid orders:\n"
            + "\n".join(problems)
        )

    # Every attempt contained at least one rejected order.
    logger.warning(
        "[%s] Exhausted %s attempts for valid orders, using fallback.",
        power_name,
        max_retries,
    )
    return client.fallback_orders(possible_orders)
b/lm_service_versus.py index 22cf784..736fbbe 100644 --- a/lm_service_versus.py +++ b/lm_service_versus.py @@ -144,8 +144,6 @@ class BaseModelClient: except Exception as e: logger.error(f"[{self.model_name}] LLM error for {power_name}: {e}") - if model_error_stats is not None: - model_error_stats[self.model_name]["order_decoding_errors"] += 1 return self.fallback_orders(possible_orders) def _extract_moves(self, raw_response: str, power_name: str) -> Optional[List[str]]: diff --git a/prompts/few_shot_example.txt b/prompts/few_shot_example.txt index a6867ab..9ef2056 100644 --- a/prompts/few_shot_example.txt +++ b/prompts/few_shot_example.txt @@ -5,6 +5,13 @@ Your Units: ['A PAR','F BRE'] Possible Orders: PAR: ['A PAR H','A PAR - BUR','A PAR - GAS'] BRE: ['F BRE H','F BRE - MAO'] + +Convoy Paths Possible: + [("A NAP", {"F ION","F TYS"}, "TUN")] # Example route + +Past Phase Summaries: +- Your move A BUD -> SER bounced last time because Turkey also moved A SMY -> SER with support. +- Your support F TRI S A BUD -> SER was wasted because F TRI was needed to block Ionian invasion. Chain-of-thought: [Be consistent with your secret chain-of-thought here, but do not reveal it. diff --git a/prompts/system_prompt_response.txt b/prompts/system_prompt_response.txt index 1df8fba..250b9c6 100644 --- a/prompts/system_prompt_response.txt +++ b/prompts/system_prompt_response.txt @@ -1,22 +1,44 @@ -You are a Diplomacy expert, tasked with deciding movement orders for a single power. +You are a Diplomacy expert responsible for deciding final orders for your power. You will be given: • Which power you are controlling. • The current phase (e.g. S1901M). +• Your units and the possible orders for each. +• Summaries of past phases (including bounces, voids, and the reasons). • A summary of recent negotiations (which might include conflicting or deceptive statements). -• A list of your units and the possible orders they can each make. • A list of enemy units and centers. 
+• A “convoy_paths_possible” listing, if relevant, describing possible convoy routes (e.g. [("A NAP", {F ION, F TYS}, "TUN"), ...]). +• Your previously stated goals or alliances from negotiations. -Your goals: -1) **Strategize** to increase your supply centers, defend your existing centers, and expand influence. -2) **Coordinate** your orders so they do not produce internal contradictions. For example, do not support a move you are not actually making. -3) **Anticipate** that other powers may lie or might try to bounce you. If you suspect a bounce, consider using support or a safer move. -4) **Avoid guaranteed bounces** unless there's a diplomatic reason. If your negotiations strongly suggest you’ll get support, you can rely on it—but remain cautious if you suspect deception. -5) **Return a valid, consistent set of final orders** in the required JSON format. + +**Your tasks**: +1) Reflect on your strategic goals and the current board situation. +2) **Strategize** to increase your supply centers, defend your existing centers, and expand influence. +3) **Coordinate** your orders so they do not produce internal contradictions. For example, do not support a move you are not actually making. +4) Check if any of your previous moves were blocked or voided. Learn from that: + - If you bounced due to equal force, consider using support or picking a different target. + - If you had an invalid adjacency or a mismatch in support, fix it this turn. +5) Evaluate if you can use a convoy; consult “convoy_paths_possible” to see if a valid route exists. +6) Propose a set of final orders in a JSON block exactly like the “PARSABLE OUTPUT” example at the end of this prompt. +7) **Anticipate** that other powers may lie or might try to bounce you. If you suspect a bounce, consider using support or a safer move. +8) **Avoid guaranteed bounces** unless there is a diplomatic reason to accept one. +9) **Return a valid, consistent set of final orders** in the required JSON format. CRUCIAL: - If you use a support order (e.g. 
“A BUD S F TRI - VEN”), then you must actually order “F TRI - VEN” in your moves. - If you are uncertain or the move list is complicated, you can choose a defensive hold or a safer approach. + +Remember that while your private chain-of-thought can consider your in-depth reasoning about possible outcomes, **only** the “PARSABLE OUTPUT” (your final orders array) will be used by the game engine. + +- If you use a support order, ensure you actually have a matching move that it supports. +- If you do a convoy, ensure the fleets and adjacency match the “convoy_paths_possible” data. +- Attempt to avoid guaranteed bounces unless you see a diplomatic reason to do so. +- If you suspect an enemy might also move to your target, consider using support or picking a safer approach. +- Provide only the JSON block, no extra text or disclaimers. + +Finally, you may internally do a Q&A with yourself; it's important to think through the options and make the best decision. +Who is lying? Who is telling the truth? Who is trying to deceive you? When would my goal be thwarted, and should I do something else instead? + - Provide the final answer as: PARSABLE OUTPUT: {{ @@ -25,5 +47,3 @@ PARSABLE OUTPUT: - **No extra text** inside that JSON block and YOU MUST return the JSON block. -Remember that while your private chain-of-thought can consider your in-depth reasoning about possible outcomes, **only** the “PARSABLE OUTPUT” (your final orders array) will be used by the game engine. - diff --git a/results/20250210_192004/game_manifesto.txt b/results/20250210_192004/game_manifesto.txt new file mode 100644 index 0000000..bd41e86 --- /dev/null +++ b/results/20250210_192004/game_manifesto.txt @@ -0,0 +1,6 @@ +=== S1901M === +PARSABLE OUTPUT: +{ + "summary": "In phase S1901M all powers largely held their positions. Austria, France, Italy, Russia, and Turkey successfully executed hold orders, resulting in no changes to their unit placements or centers. 
England attempted to move its fleet from Edinburgh to the North Sea (with support from its fleet in London), but the move bounced, likely due to an equal contest of force or a miscalculation in strength, so F EDI remained in Edinburgh. Similarly, Germany’s attempt to move its unit from Munich to Ruhr (A MUN - RUH) failed after bouncing, probably due to insufficient support or issues with valid adjacencies, leaving the unit in Munich. There were no changes to supply centers, and the status quo persists, setting up the stage for any future strategic adjustments." +} +