Add debugging + more context. Too many orders failing to be validated. Prompting fixes?

This commit is contained in:
AlxAI 2025-02-10 19:28:02 -08:00
parent 5fa1b82bd7
commit 80a883b75f
6 changed files with 159 additions and 25 deletions

View file

@ -4695,14 +4695,42 @@ class Game(Jsonable):
f"RESULTS:\n{results_block}\n\n" f"RESULTS:\n{results_block}\n\n"
f"CURRENT BOARD STATE:\n{current_state_block}\n\n" f"CURRENT BOARD STATE:\n{current_state_block}\n\n"
f"CHANGES FROM PREVIOUS PHASE:\n{differences_block}\n\n" f"CHANGES FROM PREVIOUS PHASE:\n{differences_block}\n\n"
"Please write a concise but detailed summary of what happened this turn, including " "Below is the final board state after the latest phase, along with the moves each power submitted and the engines adjudication results. Please create a summary in JSON, explaining:"
"important captures, retreats, or changes in board position, using a helpful, neutral tone." "- Each successful move,"
"- Each bounce or voided order, with reasons (e.g. equal force, no valid route, contradictory support),"
"- Key changes in supply centers,"
"- Potential strategic ramifications if relevant."
"Return ONLY JSON:"
"PARSABLE OUTPUT:"
"{{"
"'summary': ... your text ..."
"}}"
) )
# We might also have a system prompt to guide the AI, e.g.: # We might also have a system prompt to guide the AI, e.g.:
system_prompt = ( system_prompt = (
"You are an AI summarizing a Diplomacy game turn based on the provided data. " """
"Focus on describing important changes, successes/failures of orders, and new unit positions." You are a Diplomacy expert, summarizing the results of the latest phase.
Your tasks:
1) Provide a concise summary of how the board changed.
2) Specifically list each voided or bounced order, and *why* it occurred.
3) If possible, describe which moves or supports succeeded and how that affected centers.
Format:
- Must return a JSON with the top-level key "summary" or "orders" or similar.
- Possibly:
PARSABLE OUTPUT:
{
"summary": "...(your textual summary)..."
}
Ensure the summary clarifies reasons for bounces, e.g., "F TRI -> VEN bounced because Italy also moved A VEN -> TRI with equal force."
No extra text outside the JSON block.
"""
) )
if summary_callback: if summary_callback:

View file

@ -160,6 +160,76 @@ def my_summary_callback(system_prompt, user_prompt):
# Pseudo-code for generating a response: # Pseudo-code for generating a response:
return client.generate_response(combined_prompt) return client.generate_response(combined_prompt)
def get_valid_orders_with_retry(game,
                                client,
                                board_state,
                                power_name,
                                possible_orders,
                                conversation_text_for_orders,
                                phase_summaries,
                                model_error_stats,
                                max_retries=3):
    """
    Ask the client for orders up to ``max_retries`` times, validating each set.

    Every attempt calls ``client.get_orders`` and then checks each returned
    order with ``game._valid_order``. If any order is rejected, a description
    of the problems is appended to the conversation text handed to the next
    attempt so the model can correct itself. When all attempts are used up,
    ``client.fallback_orders(possible_orders)`` is returned instead.

    Args:
        game: Game object exposing ``_valid_order(power, unit, order, report=1)``.
        client: Model client exposing ``get_orders(...)`` and
            ``fallback_orders(possible_orders)``.
        board_state: Board state forwarded unchanged to ``client.get_orders``.
        power_name: Name of the power the orders are for.
        possible_orders: Possible orders forwarded to the client and used for
            the fallback.
        conversation_text_for_orders: Base conversation text; validation
            feedback is appended to a copy of it on retries.
        phase_summaries: Forwarded unchanged to ``client.get_orders``.
        model_error_stats: Forwarded unchanged to ``client.get_orders``.
        max_retries: Maximum number of attempts before falling back.

    Returns:
        The first order list in which every order validated with result ``1``
        (an empty list passes trivially), otherwise the client's fallback
        orders.
    """
    error_feedback = ""
    for attempt in range(max_retries):
        # Incorporate any error feedback into the conversation text
        augmented_conversation_text = conversation_text_for_orders
        if error_feedback:
            augmented_conversation_text += (
                "\n\n[ORDER VALIDATION FEEDBACK]\n" + error_feedback
            )

        # Ask the LLM for orders
        orders = client.get_orders(
            board_state=board_state,
            power_name=power_name,
            possible_orders=possible_orders,
            conversation_text=augmented_conversation_text,
            phase_summaries=phase_summaries,
            model_error_stats=model_error_stats,
        )

        invalid_info = []
        if orders is None:
            # A missing result must count as a failed attempt; iterating over
            # None would raise and bypass both the retries and the fallback.
            invalid_info.append(
                "No orders were returned; expected a list of order strings."
            )
        else:
            # Validate each order
            for move in orders:
                if not isinstance(move, str):
                    # Non-string entries cannot be split into unit/order parts.
                    invalid_info.append(
                        f"Order {move!r} is malformed; expected 'A PAR H' style."
                    )
                    continue

                # Example move: "A PAR H" -> unit="A PAR", order_part="H"
                tokens = move.split(" ", 2)
                if len(tokens) < 3:
                    invalid_info.append(
                        f"Order '{move}' is malformed; expected 'A PAR H' style."
                    )
                    continue

                unit = " ".join(tokens[:2])  # e.g. "A PAR"
                order_part = tokens[2]       # e.g. "H" or "S A MAR"

                # Use the internal game validation method.
                # NOTE(review): this is a private method and main() invokes
                # this function from several worker threads on one shared game
                # object -- confirm that calling it with report=1 is safe to
                # do concurrently.
                validity = game._valid_order(power_name, unit, order_part, report=1)
                if validity != 1:
                    invalid_info.append(
                        f"Order '{move}' returned validity={validity}. (None/-1=invalid, 0=partial, 1=valid)"
                    )

        if not invalid_info:
            # All orders are fully valid
            return orders

        # Build feedback for the next retry
        error_feedback = (
            f"Attempt {attempt+1}/{max_retries} had invalid orders:\n"
            + "\n".join(invalid_info)
        )

    # If we finish the loop without returning, fallback
    logger.warning(
        "[%s] Exhausted %s attempts for valid orders, using fallback.",
        power_name,
        max_retries,
    )
    return client.fallback_orders(possible_orders)
def main(): def main():
logger.info("Starting a new Diplomacy game for testing with multiple LLMs, now concurrent!") logger.info("Starting a new Diplomacy game for testing with multiple LLMs, now concurrent!")
start_whole = time.time() start_whole = time.time()
@ -232,28 +302,33 @@ def main():
logger.info(f"No orderable locations for {power_name}; skipping.") logger.info(f"No orderable locations for {power_name}; skipping.")
continue continue
board_state = game.get_state() board_state = game.get_state()
# Submit a task that includes up to 3 attempts at valid orders
future = executor.submit( future = executor.submit(
client.get_orders, get_valid_orders_with_retry,
board_state, game,
power_name, client,
possible_orders, board_state,
conversation_text_for_orders, power_name,
possible_orders,
conversation_text_for_orders, # existing conversation text
game.phase_summaries, game.phase_summaries,
model_error_stats # pass our stats model_error_stats,
3 # max_retries
) )
futures[future] = power_name futures[future] = power_name
logger.debug(f"Submitted get_orders task for power {power_name}.") logger.debug(f"Submitted get_valid_orders_with_retry task for {power_name}.")
for future in concurrent.futures.as_completed(futures): for future in concurrent.futures.as_completed(futures):
p_name = futures[future] p_name = futures[future]
try: try:
orders = future.result() orders = future.result()
logger.debug(f"Orders for {p_name}: {orders}") logger.debug(f"Validated orders for {p_name}: {orders}")
if orders: if orders:
game.set_orders(p_name, orders) game.set_orders(p_name, orders)
logger.debug(f"Set orders for {p_name} in {game.current_short_phase}: {orders}") logger.debug(f"Set orders for {p_name} in {game.current_short_phase}: {orders}")
else: else:
logger.debug(f"No orders returned for {p_name}.") logger.debug(f"No valid orders returned for {p_name}.")
except Exception as exc: except Exception as exc:
logger.error(f"LLM request failed for {p_name}: {exc}") logger.error(f"LLM request failed for {p_name}: {exc}")

View file

@ -144,8 +144,6 @@ class BaseModelClient:
except Exception as e: except Exception as e:
logger.error(f"[{self.model_name}] LLM error for {power_name}: {e}") logger.error(f"[{self.model_name}] LLM error for {power_name}: {e}")
if model_error_stats is not None:
model_error_stats[self.model_name]["order_decoding_errors"] += 1
return self.fallback_orders(possible_orders) return self.fallback_orders(possible_orders)
def _extract_moves(self, raw_response: str, power_name: str) -> Optional[List[str]]: def _extract_moves(self, raw_response: str, power_name: str) -> Optional[List[str]]:

View file

@ -5,6 +5,13 @@ Your Units: ['A PAR','F BRE']
Possible Orders: Possible Orders:
PAR: ['A PAR H','A PAR - BUR','A PAR - GAS'] PAR: ['A PAR H','A PAR - BUR','A PAR - GAS']
BRE: ['F BRE H','F BRE - MAO'] BRE: ['F BRE H','F BRE - MAO']
Convoy Paths Possible:
[("A NAP", {"F ION","F TYS"}, "TUN")] # Example route
Past Phase Summaries:
- Your move A BUD -> SER bounced last time because Turkey also moved A SMY -> SER with support.
- Your support F TRI S A BUD -> SER was wasted because F TRI was needed to block Ionian invasion.
Chain-of-thought: Chain-of-thought:
[Be consistent with your secret chain-of-thought here, but do not reveal it. [Be consistent with your secret chain-of-thought here, but do not reveal it.

View file

@ -1,22 +1,44 @@
You are a Diplomacy expert, tasked with deciding movement orders for a single power. You are a Diplomacy expert responsible for deciding final orders for your power.
You will be given: You will be given:
• Which power you are controlling. • Which power you are controlling.
• The current phase (e.g. S1901M). • The current phase (e.g. S1901M).
• Your units and the possible orders for each.
• Summaries of past phases (including bounces, voids, and the reasons).
• A summary of recent negotiations (which might include conflicting or deceptive statements). • A summary of recent negotiations (which might include conflicting or deceptive statements).
• A list of your units and the possible orders they can each make.
• A list of enemy units and centers. • A list of enemy units and centers.
• A “convoy_paths_possible” listing, if relevant, describing possible convoy routes (e.g. [("A NAP", {F ION, F TYS}, "TUN"), ...]).
• Your previously stated goals or alliances from negotiations.
Your goals:
1) **Strategize** to increase your supply centers, defend your existing centers, and expand influence. **Your tasks**:
2) **Coordinate** your orders so they do not produce internal contradictions. For example, do not support a move you are not actually making. 1) Reflect on your strategic goals and the current board situation.
3) **Anticipate** that other powers may lie or might try to bounce you. If you suspect a bounce, consider using support or a safer move. 2) **Strategize** to increase your supply centers, defend your existing centers, and expand influence.
4) **Avoid guaranteed bounces** unless there's a diplomatic reason. If your negotiations strongly suggest you'll get support, you can rely on it—but remain cautious if you suspect deception. 3) **Coordinate** your orders so they do not produce internal contradictions. For example, do not support a move you are not actually making.
5) **Return a valid, consistent set of final orders** in the required JSON format. 4) Check if any of your previous moves were blocked or voided. Learn from that:
- If you bounced due to equal force, consider using support or picking a different target.
- If you had an invalid adjacency or a mismatch in support, fix it this turn.
5) Evaluate if you can use a convoy; consult “convoy_paths_possible” to see if a valid route exists.
6) Propose a set of final orders in a JSON block exactly like:
7) **Anticipate** that other powers may lie or might try to bounce you. If you suspect a bounce, consider using support or a safer move.
8) **Avoid guaranteed bounces**
9) **Return a valid, consistent set of final orders** in the required JSON format.
CRUCIAL: CRUCIAL:
- If you use a support order (e.g. “A BUD S F TRI - VEN”), then you must actually order “F TRI - VEN” in your moves. - If you use a support order (e.g. “A BUD S F TRI - VEN”), then you must actually order “F TRI - VEN” in your moves.
- If you are uncertain or the move list is complicated, you can choose a defensive hold or a safer approach. - If you are uncertain or the move list is complicated, you can choose a defensive hold or a safer approach.
Remember that while your private chain-of-thought can consider your in-depth reasoning about possible outcomes, **only** the “PARSABLE OUTPUT” (your final orders array) will be used by the game engine.
- If you use a support order, ensure you actually have a matching move that it supports.
- If you do a convoy, ensure the fleets and adjacency match the “convoy_paths_possible” data.
- Attempt to avoid guaranteed bounces unless you see a diplomatic reason to do so.
- If you suspect an enemy might also move to your target, consider using support or picking a safer approach.
- Provide only the JSON block, no extra text or disclaimers.
Finally, internally you may do a Q&A with yourself; it's important to think through the options and make the best decision.
Who is lying? Who is telling the truth? Who is trying to deceive you? When would my goal be thwarted, and should I do something else instead?
- Provide the final answer as: - Provide the final answer as:
PARSABLE OUTPUT: PARSABLE OUTPUT:
{{ {{
@ -25,5 +47,3 @@ PARSABLE OUTPUT:
- **No extra text** inside that JSON block and YOU MUST return the JSON block. - **No extra text** inside that JSON block and YOU MUST return the JSON block.
Remember that while your private chain-of-thought can consider your in-depth reasoning about possible outcomes, **only** the “PARSABLE OUTPUT” (your final orders array) will be used by the game engine.

View file

@ -0,0 +1,6 @@
=== S1901M ===
PARSABLE OUTPUT:
{
"summary": "In phase S1901M all powers largely held their positions. Austria, France, Italy, Russia, and Turkey successfully executed hold orders, resulting in no changes to their unit placements or centers. England attempted to move its fleet from Edinburgh to the North Sea (with support from its fleet in London), but the move bounced, likely due to an equal contest of force or a miscalculation in strength, so F EDI remained in Edinburgh. Similarly, Germany's attempt to move its unit from Munich to Ruhr (A MUN - RUH) failed after bouncing, probably due to insufficient support or issues with valid adjacencies, leaving the unit in Munich. There were no changes to supply centers, and the status quo persists, setting up the stage for any future strategic adjustments."
}