refactor, full run

2026-04-22 16:48:57 +00:00 · 2025-05-20 20:11:52 -07:00 · 2025-05-20 20:11:52 -07:00 · 1ee67de035
commit 1ee67de035
parent de9dfff221
12 changed files with 1039 additions and 1127 deletions
--- a/environments/hack0/protein_design_env/utils/pdb_utils.py
+++ b/environments/hack0/protein_design_env/utils/pdb_utils.py
@ -0,0 +1,65 @@
+import logging
+from typing import Dict, Tuple, List, Set, Union
+
+logger = logging.getLogger(__name__)
+
def get_pdb_chain_details(
    pdb_content: str, preview_lines: int = 10
) -> Tuple[Dict[str, Dict[str, int]], str]:
    """
    Parse PDB text and summarise every chain found in its ATOM records.

    Only lines starting with "ATOM" contribute to the chain statistics
    (HETATM records are ignored). Fields are read from the fixed PDB columns:
    atom name [12:16], chain ID [21], residue number [22:26] and insertion
    code [26]. ATOM lines whose residue number cannot be parsed are logged
    and skipped for statistics, but may still appear in the preview.

    Args:
        pdb_content: Full text of a PDB file.
        preview_lines: Maximum number of lines in the preview. At most half
            of them are HEADER/TITLE/COMPND lines; the rest are ATOM lines.
            Values below zero are treated as zero.

    Returns:
        A tuple containing:
        - chain_details (Dict[str, Dict[str, int]]):
            A dictionary where keys are chain IDs (e.g., "A"; a blank chain
            ID is stored as " "). Each value is another dictionary:
            {
                "min_residue": int,  # Smallest residue number found for this chain
                "max_residue": int,  # Largest residue number found for this chain
                "length": int        # Number of distinct residues (residue number +
                                     # insertion code) that have a C-alpha atom; if the
                                     # chain has no CA atoms, the number of distinct
                                     # residue numbers instead
            }
        - pdb_preview (str): A string preview of the PDB content, followed by
          a "..." line when the content has more lines than the preview budget.
    """
    all_lines = pdb_content.splitlines()

    # Residue numbers per chain (drives min/max and the no-CA fallback length).
    residues_by_chain: Dict[str, Set[int]] = {}
    # Distinct (residue number, insertion code) pairs that own a CA atom.
    # Using a set keeps alternate locations and repeated MODEL blocks from
    # counting the same residue more than once.
    ca_residues_by_chain: Dict[str, Set[Tuple[int, str]]] = {}
    atom_lines: List[str] = []
    header_lines: List[str] = []

    for line in all_lines:
        if line.startswith("ATOM"):
            atom_lines.append(line)
            chain_id = line[21:22].strip() or " "
            try:
                residue_num = int(line[22:26].strip())
            except ValueError:
                logger.warning("Could not parse residue number from PDB line: %s", line)
                continue
            residues_by_chain.setdefault(chain_id, set()).add(residue_num)
            if line[12:16].strip() == "CA":
                # The insertion code distinguishes e.g. residue 100 from 100A.
                ca_key = (residue_num, line[26:27].strip())
                ca_residues_by_chain.setdefault(chain_id, set()).add(ca_key)
        elif line.startswith(("HEADER", "TITLE", "COMPND")):
            header_lines.append(line)

    # A chain is only registered together with its first residue number, so
    # every entry here is guaranteed to be non-empty.
    chain_details: Dict[str, Dict[str, int]] = {}
    for chain_id, residue_numbers in residues_by_chain.items():
        ca_residues = ca_residues_by_chain.get(chain_id)
        chain_details[chain_id] = {
            "min_residue": min(residue_numbers),
            "max_residue": max(residue_numbers),
            "length": len(ca_residues) if ca_residues else len(residue_numbers),
        }

    # Clamp so a negative budget cannot turn into a negative slice bound
    # (which would select lines from the end instead of none).
    line_budget = max(preview_lines, 0)
    preview_parts = header_lines[: line_budget // 2]
    preview_parts.extend(atom_lines[: line_budget - len(preview_parts)])
    pdb_preview = "\n".join(preview_parts)
    if len(all_lines) > line_budget:
        pdb_preview += "\n..."
    return chain_details, pdb_preview