# atropos/environments/community/protein_design/utils/pdb_utils.py

import logging
from typing import Dict, Set, Tuple, Union

logger = logging.getLogger(__name__)


def get_pdb_chain_details(
    pdb_content: str, preview_lines: int = 10
) -> Tuple[Dict[str, Dict[str, int]], str]:
    """
    Parses PDB content to extract detailed information for each chain.

    Only lines starting with "ATOM" contribute to the chain statistics. The
    chain ID is read from column 22, the residue number from columns 23-26
    and the insertion code from column 27 (1-based, fixed-width PDB columns).
    A blank chain ID is reported under the key " ".

    Args:
        pdb_content: Full text of a PDB file.
        preview_lines: Maximum number of lines to include in the preview.
            Negative values are treated as 0.

    Returns:
        A tuple containing:
        - chain_details (Dict[str, Dict[str, int]]):
            A dictionary where keys are chain IDs (e.g., "A").
            Each value is another dictionary:
            {
                "min_residue": int,  # Smallest residue number found for this chain
                "max_residue": int,  # Largest residue number found for this chain
                "length": int       # Count of unique residues with a C-alpha atom
            }
            A residue is identified by (residue number, insertion code), so
            alternate-location duplicates and repeated MODEL blocks are not
            counted twice. If a chain has no "CA" atoms at all, "length"
            falls back to the number of unique residues seen in the chain.
            Chains for which no residue number could be parsed are omitted
            (a warning is logged).
        - pdb_preview (str): A string preview of the PDB content: up to
            preview_lines // 2 HEADER/TITLE/COMPND lines followed by ATOM
            lines, with "\\n..." appended when the content has more lines
            than preview_lines.
    """
    all_lines = pdb_content.splitlines()

    # Per-chain accumulators:
    #   "numbers"     - residue numbers, used for min/max
    #   "residues"    - (residue number, insertion code) pairs seen on any atom
    #   "ca_residues" - the subset of "residues" that has a CA atom
    chain_info_temp: Dict[str, Dict[str, set]] = {}
    atom_lines = []
    header_lines = []

    for line in all_lines:
        if line.startswith("ATOM"):
            atom_lines.append(line)
            chain_id = line[21:22].strip() or " "

            # Register the chain before parsing so that a chain whose ATOM
            # lines are all malformed is still reported below.
            info = chain_info_temp.get(chain_id)
            if info is None:
                info = {"numbers": set(), "residues": set(), "ca_residues": set()}
                chain_info_temp[chain_id] = info

            try:
                residue_num = int(line[22:26].strip())
            except ValueError:
                logger.warning(
                    "Could not parse residue number from PDB line: %s", line
                )
                continue

            # The insertion code distinguishes e.g. residue 100 from 100A.
            residue_key = (residue_num, line[26:27].strip())
            info["numbers"].add(residue_num)
            info["residues"].add(residue_key)
            if line[12:16].strip() == "CA":
                info["ca_residues"].add(residue_key)
        elif line.startswith(("HEADER", "TITLE", "COMPND")):
            header_lines.append(line)

    chain_details: Dict[str, Dict[str, int]] = {}
    for chain_id, info in chain_info_temp.items():
        if not info["numbers"]:
            logger.warning(
                "Chain %s had no parseable ATOM residue numbers.", chain_id
            )
            continue
        # Prefer the C-alpha based count; fall back to all residues seen when
        # the chain has no CA atoms.
        counted = info["ca_residues"] or info["residues"]
        chain_details[chain_id] = {
            "min_residue": min(info["numbers"]),
            "max_residue": max(info["numbers"]),
            "length": len(counted),
        }

    # Clamp so a negative budget cannot turn into negative slice bounds.
    line_budget = max(preview_lines, 0)
    preview_str_parts = header_lines[: line_budget // 2]
    preview_str_parts.extend(atom_lines[: line_budget - len(preview_str_parts)])
    pdb_preview = "\n".join(preview_str_parts)
    if len(all_lines) > line_budget:
        pdb_preview += "\n..."
    return chain_details, pdb_preview