# Mirror of https://github.com/NousResearch/atropos.git
# Synced 2026-04-24 17:04:55 +00:00
# 159 lines, 7.4 KiB, Python
import logging
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# System prompt for the protein-design workflow. It lists the pipeline stages
# and the output rules the model is asked to follow. Built from adjacent string
# literals so each sentence stays on its own source line; the resulting runtime
# string is a single str with explicit "\n" line breaks.
SYSTEM_PROMPT = (
    "You are a specialized AI system for de novo protein design via a staged simulation loop. "
    "Your objective is to generate binder sequences that are structurally and functionally "
    "optimized to bind a given target protein.\n\n"
    "You will be guided through a multi-step pipeline:\n\n"
    "1. Structure prediction (AlphaFold)\n"
    "2. Binder backbone generation (RFdiffusion)\n"
    "3. Sequence design (ProteinMPNN)\n"
    "4. Structure evaluation (AlphaFold-Multimer)\n"
    "5. Feedback loop\n\n"
    "You must always:\n"
    "- Respect the required file format for each tool (e.g., FASTA, PDB).\n"
    "- Structure your outputs cleanly so they can be parsed and executed programmatically.\n"
    "- Be explicit in all configuration steps (e.g., contigs, hotspots).\n"
    "- Minimize ambiguity or verbosity; prefer concise and functional outputs.\n"
    "- Reason step-by-step when appropriate."
)
def _format_chain_details(chain_details: dict) -> str:
    """Render per-chain residue information as a "- "-prefixed bullet list.

    Args:
        chain_details: Mapping of chain ID to a dict that may carry
            'min_residue', 'max_residue' and 'length'; any missing key is
            rendered as "N/A".

    Returns:
        One line per chain, joined with newlines. An empty mapping yields an
        empty string, so callers should test the mapping before calling.
    """
    return "\n".join(
        f"- Chain {chain_id} "
        f"(Residues: {info.get('min_residue', 'N/A')}-{info.get('max_residue', 'N/A')}, "
        f"Length: {info.get('length', 'N/A')} amino acids)"
        for chain_id, info in chain_details.items()
    )


def construct_user_prompt(state: dict) -> str:
    """
    Constructs the appropriate user prompt for the current internal step.

    The step is read from state["current_internal_step"] (default 0):
    0 asks for target structure prediction, 1 for binder backbone design,
    2 for binder sequence design, 3 for complex evaluation; any other value
    yields a "workflow complete" message. When
    state["retry_count_this_internal_step"] is positive and the step is below
    4, a retry notice (including state["previous_tool_error_message"] if set)
    is prepended.

    Args:
        state: The current workflow state dictionary (from episodes_state)

    Returns:
        A formatted user prompt string for the current step
    """
    internal_step = state.get("current_internal_step", 0)
    target_sequence = state.get("target_sequence")

    if internal_step == 0:
        user_prompt_str = (
            f"The target protein sequence is: {target_sequence}. "
            "Your first task is to predict its 3D structure using the 'predict_target_structure_alphafold2' tool. "
            "You must provide the 'sequence' argument."
        )
    elif internal_step == 1:
        chain_details = state.get("target_chain_details", {})
        if chain_details:
            chain_info_str = _format_chain_details(chain_details)
        else:
            chain_info_str = "Chain information not available or PDB not yet processed."

        user_prompt_str = (
            f"The 3D structure of the target protein has been predicted.\n"
            f"Target Protein Chain Details:\n{chain_info_str}\n\n"
            "Your task is to design a binder backbone using the 'design_binder_backbone_rfdiffusion' tool. "
            "You MUST specify 'contigs' for this tool. The 'contigs' string defines segments from the target PDB "
            "and segments for the new binder. "
            "Examples:\n"
            " - To use residues 10 through 100 of target chain A, and then diffuse a 60-residue binder: "
            "'A10-100/0 60'\n"
            " - To use chain B from residue 5 to 50, then diffuse a 30-residue binder, then use chain B "
            "from residue 60 to 100: 'B5-50/0 30 B60-100'\n"
            "You MUST use the chain IDs and residue ranges exactly as provided in the "
            "'Target Protein Chain Details' above. "
            "Do not invent chains or residue numbers outside these specified ranges for the target segments. "
            "For binder segments (e.g., '/0 60'), specify the desired length (e.g., 60).\n"
            "Optionally, provide 'hotspot_residues' (e.g., ['A50', 'A52']), ensuring these residues exist "
            "on the target as per the details above."
        )
    elif internal_step == 2:
        binder_pdb_preview = state.get(
            "binder_pdb_preview", "Binder PDB preview not available."
        )

        # Chain details are only reported when a backbone PDB is present in
        # the state; without one the generic fallback text is used.
        binder_chain_info_str = "Binder chain information not available."
        if state.get("binder_backbone_pdb_content"):
            binder_chain_details = state.get("binder_chain_details", {})
            if binder_chain_details:
                binder_chain_info_str = _format_chain_details(binder_chain_details)
            else:
                binder_chain_info_str = "Binder chain details not found in state (expected from RFDiffusion)."

        user_prompt_str = (
            f"A binder backbone has been generated. Binder PDB preview:\n{binder_pdb_preview}\n"
            f"Binder chain information:\n{binder_chain_info_str}.\n"
            "Now, design an optimal amino acid sequence for this binder backbone using the "
            "'design_binder_sequence_proteinmpnn' tool. "
            "You can optionally specify 'sampling_temp' (e.g., [0.1, 0.2])."
        )
    elif internal_step == 3:
        designed_binder_seq_data = state.get("designed_binder_sequence")

        # The designed sequence may be a single string or a list of per-chain
        # strings; multi-chain lists are summarised with a 20-character
        # preview per chain.
        binder_display_str = "Not available"
        if isinstance(designed_binder_seq_data, list) and designed_binder_seq_data:
            if len(designed_binder_seq_data) == 1:
                binder_display_str = designed_binder_seq_data[0]
            else:
                chain_previews = ", ".join(
                    f"Chain {index} ({len(seq)} aa): {seq[:20]}..."
                    for index, seq in enumerate(designed_binder_seq_data, start=1)
                )
                binder_display_str = (
                    f"{len(designed_binder_seq_data)} chains: " + chain_previews
                )
        elif isinstance(designed_binder_seq_data, str):
            binder_display_str = designed_binder_seq_data

        # state.get() returns None when the target sequence is missing;
        # slicing None would raise TypeError, so fall back to a placeholder.
        if target_sequence is None:
            target_preview = "Not available"
        else:
            target_preview = f"{target_sequence[:60]}..."

        user_prompt_str = (
            f"A binder has been designed. Designed binder sequence(s): {binder_display_str}. "
            f"The original target sequence was: {target_preview}\n"
            "Finally, evaluate the binding complex of the original target protein and ALL chains of this "
            "designed binder using the 'evaluate_binder_complex_alphafold2_multimer' tool. "
            "You can optionally specify 'relax_prediction' (default is True)."
        )
    else:
        user_prompt_str = (
            "The protein design workflow is complete. No further actions required by you for this item. "
            "If successful, the key metric was the pLDDT of the complex."
        )

    # On a retry of an active step (0-3), lead with what went wrong before
    # repeating the step instructions.
    if state.get("retry_count_this_internal_step", 0) > 0 and internal_step < 4:
        retry_prefix = "Your previous attempt at this step was not successful. "
        if state.get("previous_tool_error_message"):
            retry_prefix += f"Details: {state['previous_tool_error_message']}. "
        retry_prefix += (
            "Please review the requirements and PDB details carefully and try again to correctly use "
            "the expected tool.\n\n"
        )
        user_prompt_str = retry_prefix + user_prompt_str

    return user_prompt_str