atropos/environments/community/protein_design/utils/pdb_utils.py
Shannon Sands 54967ecae9 linting
2025-05-27 12:15:15 +10:00

75 lines
2.9 KiB
Python

import logging
from typing import Dict, Set, Tuple, Union
logger = logging.getLogger(__name__)
def get_pdb_chain_details(
    pdb_content: str, preview_lines: int = 10
) -> Tuple[Dict[str, Dict[str, int]], str]:
    """
    Parses PDB content to extract detailed information for each chain.

    Only lines starting with ``ATOM`` contribute to the chain statistics.
    Fields are read from fixed columns: atom name ``[12:16]``, chain ID
    ``[21:22]``, residue number ``[22:26]`` and insertion code ``[26:27]``.
    A blank chain ID is reported under the key ``" "``. ATOM lines whose
    residue number is not an integer are logged and excluded from the
    statistics (they may still appear in the preview).

    Args:
        pdb_content: Full text of a PDB file.
        preview_lines: Maximum number of lines in the preview. Negative
            values are treated as 0.

    Returns:
        A tuple containing:
        - chain_details (Dict[str, Dict[str, int]]):
            A dictionary where keys are chain IDs (e.g., "A").
            Each value is another dictionary:
            {
                "min_residue": int,  # Smallest residue number found for this chain
                "max_residue": int,  # Largest residue number found for this chain
                "length": int        # Number of distinct residues that have a
                                     # C-alpha atom; falls back to the number of
                                     # distinct residue numbers if no CA is present
            }
            Chains without any parseable residue number are omitted.
        - pdb_preview (str): Up to ``preview_lines // 2`` HEADER/TITLE/COMPND
            lines followed by leading ATOM lines, at most ``preview_lines``
            lines in total. "\\n..." is appended whenever the input has more
            lines than ``preview_lines``.
    """
    chain_info_temp: Dict[str, Dict[str, Union[Set[int], int]]] = {}
    # Residues whose C-alpha has already been counted, keyed by
    # (chain ID, residue number, insertion code). Alternate locations and
    # repeated MODEL blocks list the same CA more than once; it must only
    # contribute a single residue to "length".
    counted_ca: Set[Tuple[str, int, str]] = set()
    atom_lines = []
    header_lines = []
    all_lines = pdb_content.splitlines()

    for line in all_lines:
        if line.startswith("ATOM"):
            atom_lines.append(line)
            chain_id = line[21:22].strip() or " "

            # Register the chain before parsing so that a chain made up solely
            # of malformed lines is still reported by the warning further down.
            chain_entry = chain_info_temp.get(chain_id)
            if chain_entry is None:
                chain_entry = {"residues": set(), "ca_count": 0}
                chain_info_temp[chain_id] = chain_entry

            try:
                residue_num = int(line[22:26].strip())
            except ValueError:
                logger.warning(
                    "Could not parse residue number from PDB line: %s", line
                )
                continue

            chain_entry["residues"].add(residue_num)
            if line[12:16].strip() == "CA":
                # The insertion code distinguishes e.g. residue 100 from 100A.
                ca_key = (chain_id, residue_num, line[26:27].strip())
                if ca_key not in counted_ca:
                    counted_ca.add(ca_key)
                    chain_entry["ca_count"] += 1
        elif line.startswith(("HEADER", "TITLE", "COMPND")):
            header_lines.append(line)

    chain_details: Dict[str, Dict[str, int]] = {}
    for chain_id, data in chain_info_temp.items():
        residues = data["residues"]
        if not residues:
            logger.warning(
                "Chain %s had no parseable ATOM residue numbers.", chain_id
            )
            continue
        ca_count = data["ca_count"]
        chain_details[chain_id] = {
            "min_residue": min(residues),
            "max_residue": max(residues),
            "length": ca_count if ca_count > 0 else len(residues),
        }

    # Header lines may use at most half of the preview budget; ATOM lines fill
    # whatever is left. Slicing already clamps to the list length.
    line_budget = max(preview_lines, 0)
    preview_str_parts = header_lines[: line_budget // 2]
    preview_str_parts.extend(atom_lines[: line_budget - len(preview_str_parts)])

    pdb_preview = "\n".join(preview_str_parts)
    if len(all_lines) > line_budget:
        pdb_preview += "\n..."
    return chain_details, pdb_preview