adding MR and LogP Prediction tasks

2026-04-19 12:58:04 +00:00 · 2025-06-16 15:10:05 +08:00 · 2025-06-16 15:10:05 +08:00 · 91972f43ea
commit 91972f43ea
parent c2dad02fe4
7 changed files with 490 additions and 0 deletions
--- a/internbootcamp/bootcamp/ChemStructure2Property/InChI2MRBootCamp.py
+++ b/internbootcamp/bootcamp/ChemStructure2Property/InChI2MRBootCamp.py
@ -0,0 +1,28 @@
+from rdkit import Chem
+from rdkit.Chem import Crippen
+from .InChI2logPBootCamp import InChI2logPbootcamp
+
+class InChI2MRBootCamp(InChI2logPbootcamp):
+
+    def prompt_func(self, InChI) -> str:
+
+        instruction = f"Given the InChI, determine the Molar Refractivity (MR) value of the material. The InChI is: {InChI}"
+        instruction_following = """Let's think step by step and output the final answer within \\boxed{}.The final answer should be one float number. For example "Final Answer: \\boxed{afloat}"."""
+        
+        prompt = instruction + '\n' + instruction_following
+        return prompt
+    
+        
+    @classmethod 
+    def _verify_correction(cls, solution, InChI)->bool:
+        """
+        Verify the correction of the solution.
+        """ 
+        mol = Chem.MolFromInchi(InChI)
+        true_MR = Crippen.MolMR(mol)
+        print(f"Comparing pred: {solution}, ground_truth: {true_MR}")
+        return abs(true_MR - float(solution)) <= 0.01  # maybe mse or mae better?
+        
+
+
+
--- a/internbootcamp/bootcamp/ChemStructure2Property/InChI2logPBootCamp.py
+++ b/internbootcamp/bootcamp/ChemStructure2Property/InChI2logPBootCamp.py
@ -0,0 +1,56 @@
+from internbootcamp.bootcamp.base import Basebootcamp
+from internbootcamp.libs.chemStructure2Property.ChemStructureGenerator import InChIGenerator
+from .utils import last_boxed_only_string, remove_boxed
+from rdkit import Chem
+from rdkit.Chem import Crippen
+
+
+class InChI2logPbootcamp(Basebootcamp):
+    def __init__(self, num_numbers=4, max_atoms=15, min_atoms=3, elements=None, seed=None):
+        # super.__init__()
+        self.num_numbers = num_numbers
+        self.InChIGenerator = InChIGenerator(max_atoms=max_atoms, min_atoms=min_atoms, elements=elements, seed=seed)
+        
+    def case_generator(self) -> str:
+        """
+        生成一组数字和目标值。
+        """
+        return self.InChIGenerator.generate_n_valid_inchi(1)[0]
+
+    def prompt_func(self, InChI) -> str:
+
+        instruction = f"Given the InChI, determine the lipophilicity (logP) value of the material. The InChI is: {InChI}"
+        instruction_following = """Let's think step by step and output the final answer within \\boxed{}.The final answer should be one float number. For example "Final Answer: \\boxed{afloat}"."""
+        
+        prompt = instruction + '\n' + instruction_following
+        return prompt
+    
+    @staticmethod
+    def extract_output(output):
+        """
+        Extract the output from the solution.
+        
+        Args:
+            output: Model output to be processed.
+        
+        Returns:
+            The processed output.
+        """
+        output = last_boxed_only_string(output)
+        if output is None:
+            return None
+        return remove_boxed(output)
+        
+    @classmethod 
+    def _verify_correction(cls, solution, InChI)->bool:
+        """
+        Verify the correction of the solution.
+        """ 
+        mol = Chem.MolFromInchi(InChI)
+        true_logp = Crippen.MolLogP(mol)
+        print(f"Comparing pred: {solution}, ground_truth: {true_logp}")
+        return abs(true_logp - float(solution)) <= 0.01  # maybe mse or mae better?
+        
+
+
+
--- a/internbootcamp/bootcamp/ChemStructure2Property/SMILES2MRBootCamp.py
+++ b/internbootcamp/bootcamp/ChemStructure2Property/SMILES2MRBootCamp.py
@ -0,0 +1,34 @@
+from internbootcamp.bootcamp.base import Basebootcamp
+from internbootcamp.libs.chemStructure2Property.ChemStructureGenerator import SMILESGenerator
+from .utils import last_boxed_only_string, remove_boxed
+from rdkit import Chem
+from rdkit.Chem import Crippen
+
+from .SMILES2logPBootCamp import SMILES2logPBootCamp
+
+class SMILES2MRBootCamp(SMILES2logPBootCamp):
+
+
+
+    def prompt_func(self,  SMILES) -> str:
+
+        instruction = f"Given the  SMILES, determine the Molar Refractivity (MR) value of the material. The  SMILES is: {SMILES}"
+        instruction_following = """Let's think step by step and output the final answer within \\boxed{}.The final answer should be one float number. For example "Final Answer: \\boxed{afloat}"."""
+        
+        prompt = instruction + '\n' + instruction_following
+        return prompt
+
+        
+    @classmethod 
+    def _verify_correction(cls, solution, SMILES)->bool:
+        """
+        Verify the correction of the solution.
+        """ 
+        mol = Chem.MolFromSmiles(SMILES)
+        true_MR = Crippen.MolMR(mol)
+        print(f"Comparing pred: {solution}, ground_truth: {true_MR}")
+        return abs(true_MR - float(solution)) <= 0.01  # maybe mse or mae better?
+        
+
+
+
--- a/internbootcamp/bootcamp/ChemStructure2Property/SMILES2logPBootCamp.py
+++ b/internbootcamp/bootcamp/ChemStructure2Property/SMILES2logPBootCamp.py
@ -0,0 +1,43 @@
+from internbootcamp.bootcamp.base import Basebootcamp
+from internbootcamp.libs.chemStructure2Property.ChemStructureGenerator import SMILESGenerator
+from .utils import last_boxed_only_string, remove_boxed
+from rdkit import Chem
+from rdkit.Chem import Crippen
+
+from .InChI2logPBootCamp import InChI2logPbootcamp
+
+class SMILES2logPBootCamp(InChI2logPbootcamp):
+    def __init__(self, num_numbers=4, min_len=5, max_len=25, 
+                 seed=None):
+        # super.__init__()
+        self.num_numbers = num_numbers
+        self.SMILESGenerator = SMILESGenerator(min_len=5, max_len=25, seed=None)
+        
+    def case_generator(self) -> str:
+        """
+        生成一组数字和目标值。
+        """
+        return self.SMILESGenerator.generate_n_valid_smiles(1)[0]
+
+    def prompt_func(self,  SMILES) -> str:
+
+        instruction = f"Given the  SMILES, determine the lipophilicity (logP) value of the material. The  SMILES is: {SMILES}"
+        instruction_following = """Let's think step by step and output the final answer within \\boxed{}.The final answer should be one float number. For example "Final Answer: \\boxed{afloat}"."""
+        
+        prompt = instruction + '\n' + instruction_following
+        return prompt
+
+        
+    @classmethod 
+    def _verify_correction(cls, solution, SMILES)->bool:
+        """
+        Verify the correction of the solution.
+        """ 
+        mol = Chem.MolFromSmiles(SMILES)
+        true_logp = Crippen.MolLogP(mol)
+        print(f"Comparing pred: {solution}, ground_truth: {true_logp}")
+        return abs(true_logp - float(solution)) <= 0.01  # maybe mse or mae better?
+        
+
+
+
--- a/internbootcamp/bootcamp/ChemStructure2Property/utils.py
+++ b/internbootcamp/bootcamp/ChemStructure2Property/utils.py
@ -0,0 +1,43 @@
+
+def remove_boxed(s):
+    if "\\boxed " in s:
+        left = "\\boxed "
+        assert s[:len(left)] == left
+        return s[len(left):]
+
+    left = "\\boxed{"
+
+    assert s[:len(left)] == left
+    assert s[-1] == "}"
+
+    return s[len(left):-1]
+
+
+def last_boxed_only_string(string):
+    idx = string.rfind("\\boxed")
+    if "\\boxed " in string:
+        return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
+    if idx < 0:
+        idx = string.rfind("\\fbox")
+        if idx < 0:
+            return None
+
+    i = idx
+    right_brace_idx = None
+    num_left_braces_open = 0
+    while i < len(string):
+        if string[i] == "{":
+            num_left_braces_open += 1
+        if string[i] == "}":
+            num_left_braces_open -= 1
+            if num_left_braces_open == 0:
+                right_brace_idx = i
+                break
+        i += 1
+
+    if right_brace_idx is None:
+        retval = None
+    else:
+        retval = string[idx:right_brace_idx + 1]
+
+    return retval
--- a/internbootcamp/bootcamp/init.py
+++ b/internbootcamp/bootcamp/init.py
@ -30,6 +30,13 @@ from .bbeh_hyperbaton.hyperbaton_default import BBEHHyperbatonbootcamp
 from .bbeh_boardgame_qa.bbeh_boardgame_qa import Bbehboardgameqabootcamp
 from .bbeh_boolean_expressions.bbeh_boolean_expressions import Bbehbooleanexpressionsbootcamp

+
+from .ChemStructure2Property.InChI2logPBootCamp import InChI2logPbootcamp
+from .ChemStructure2Property.InChI2MRBootCamp import InChI2MRBootCamp
+from .ChemStructure2Property.SMILES2logPBootCamp import SMILES2logPBootCamp
+from .ChemStructure2Property.SMILES2MRBootCamp import SMILES2MRBootCamp
+
+
 from .kakurasu.kakurasu import Kakurasubootcamp
 from .nonograms.nonograms import Nonogramsbootcamp
 from .hitori.hitori import Hitoribootcamp
--- a/internbootcamp/libs/chemStructure2Property/ChemStructureGenerator.py
+++ b/internbootcamp/libs/chemStructure2Property/ChemStructureGenerator.py
@ -0,0 +1,279 @@
+import random
+from rdkit import Chem
+from rdkit import RDLogger
+from rdkit.Chem.inchi import MolToInchi
+from rdkit.Chem import Crippen
+
+
+class InChIGenerator:
+    def __init__(self, max_atoms=15, min_atoms=3, elements=None, 
+                 seed=None):
+        RDLogger.DisableLog('rdApp.*')
+        random.seed(42) if seed is None else random.seed(seed)
+        self.max_atoms = max_atoms
+        self.min_atoms = min_atoms
+        if elements is None:
+            self.elements = ['C', 'N', 'O', 'S', 'P', 'F', 'Cl', 'Br', 'I']
+        else:
+            self.elements = elements
+
+    def generate_random_molecule_rdkit(self):
+        """
+        Generates a random molecule using RDKit's RWMol.
+        Can optionally try to assign random stereochemistry.
+        """
+
+        rw_mol = Chem.RWMol()  # Editable molecule
+        num_atoms_to_add = random.randint(self.min_atoms, self.max_atoms)
+
+        if num_atoms_to_add == 0:
+            return None
+
+        # Add the first atom
+        atom_symbol = random.choice(self.elements)
+        rw_mol.AddAtom(Chem.Atom(atom_symbol))
+
+        # Add subsequent atoms and connect them
+        for i in range(1, num_atoms_to_add):
+            if not rw_mol.GetNumAtoms(): break
+
+            existing_atom_idx = random.randrange(rw_mol.GetNumAtoms())
+            new_atom_symbol = random.choice(self.elements)
+            new_atom_idx = rw_mol.AddAtom(Chem.Atom(new_atom_symbol))
+
+            bond_type = random.choice([Chem.BondType.SINGLE, Chem.BondType.SINGLE, Chem.BondType.DOUBLE, Chem.BondType.TRIPLE])
+            
+            rw_mol.AddBond(existing_atom_idx, new_atom_idx, bond_type)
+
+        # Attempt to form rings
+        if rw_mol.GetNumAtoms() > 2:
+            num_rings_to_try = random.randint(0, rw_mol.GetNumAtoms() // 3)
+            for _ in range(num_rings_to_try):
+                if rw_mol.GetNumAtoms() < 2: break
+                
+                atom_indices = list(range(rw_mol.GetNumAtoms()))
+                if len(atom_indices) < 2: break
+                
+                idx1, idx2 = random.sample(atom_indices, 2)
+                
+                if rw_mol.GetBondBetweenAtoms(idx1, idx2) is None:
+                    rw_mol.AddBond(idx1, idx2, Chem.BondType.SINGLE) # Usually single for new rings
+
+        try:
+            mol = rw_mol.GetMol()
+            Chem.SanitizeMol(mol)  # Crucial: checks valency, aromaticity, etc.
+
+            if mol.GetNumAtoms() > 0:
+                # It might create non-physical or conflicting assignments.
+                # InChI will represent whatever stereo is defined.
+                Chem.AssignStereochemistryFrom3D(mol) # If 3D coords were present (not here)
+                # Or, more directly, find potential chiral centers and assign randomly
+                chiral_centers = Chem.FindMolChiralCenters(mol, includeUnassigned=True)
+                for center_idx, stereo_val in chiral_centers:
+                    if stereo_val == '?': # Unassigned
+                        atom = mol.GetAtomWithIdx(center_idx)
+                        if random.choice([True, False]):
+                            atom.SetChiralTag(Chem.ChiralType.CHI_TETRAHEDRAL_CW)
+                        else:
+                            atom.SetChiralTag(Chem.ChiralType.CHI_TETRAHEDRAL_CCW)
+                # Re-sanitize after modifying chiral tags might be good practice
+                Chem.SanitizeMol(mol)
+            return mol
+        except Exception as e:
+            # print(f"Debug: RDKit molecule construction/sanitization failed: {e}")
+            return None
+               
+    def generate_n_valid_inchi(self, n):
+        """
+        Generates N valid, unique InChI strings.
+        kwargs_for_mol_gen are passed to generate_random_molecule_rdkit.
+        """
+        valid_inchi_set = set()
+        total_attempts_overall = 0
+
+        while len(valid_inchi_set) < n:
+            attempts_for_current_inchi = 0
+            generated_this_round = False
+            while not generated_this_round:
+                total_attempts_overall += 1
+                attempts_for_current_inchi += 1
+                
+                mol = self.generate_random_molecule_rdkit()
+                
+                if mol:
+                    try:
+                        inchi_string = MolToInchi(mol)
+                        mol = Chem.MolFromInchi(inchi_string)
+                        logp = Crippen.MolLogP(mol)
+                        if inchi_string and inchi_string not in valid_inchi_set:
+
+                            valid_inchi_set.add(inchi_string)
+                            generated_this_round = True
+                            break  # Found one
+                    except Exception as e:
+                        # This can happen if the molecule is somehow malformed even after sanitization,
+                        # or if InChI generation itself encounters an issue (rare).
+                        print(f"Debug: MolToInchi failed: {e} for SMILES: {Chem.MolToSmiles(mol)}")
+                        pass
+
+        return list(valid_inchi_set)
+
+
+class SMILESGenerator:
+    def __init__(self, min_len=5, max_len=25, 
+                 seed=None):
+        RDLogger.DisableLog('rdApp.*')
+        random.seed(42) if seed is None else random.seed(seed)
+        self.min_len = min_len
+        self.max_len = max_len
+
+
+    def is_valid_smiles(self, smi):
+        """Checks if a SMILES string is valid using RDKit."""
+        if not smi:
+            return False
+        mol = Chem.MolFromSmiles(smi, sanitize=False)  # Parse without sanitization first
+        if mol is None:
+            return False
+        try:
+            Chem.SanitizeMol(mol)
+            return True
+        except Exception as e:
+            return False     
+
+    def generate_random_smiles_candidate(self):
+        """
+        Generates a random string that might be a SMILES string.
+        This is a VERY naive generator and will produce many invalid SMILES.
+        """
+        # A basic set of SMILES characters
+        # More comprehensive: C,c,N,n,O,o,S,s,P,p,F,Cl,Br,I,B,Si,Se,*,[nH],[nH+],[cH-],...
+        # Also: -,=,#,$,:,.,(,),[,],%,0-9 (for ring closures and isotopes/charges)
+        atom_chars = ['C', 'N', 'O', 'S', 'F', 'Cl', 'Br', 'I', 'P']
+        aromatic_chars = ['c', 'n', 'o', 's']
+        bond_chars = ['-', '=', '#']
+        branch_chars = ['(', ')']
+        ring_digits = [str(i) for i in range(1, 10)] # 1-9
+        # More complex elements like charges, isotopes, chiral centers are harder to randomize simply
+        # For simplicity, we'll stick to a subset.
+
+        all_chars = atom_chars + aromatic_chars + bond_chars + branch_chars + ring_digits
+
+        length = random.randint(self.min_len, self.max_len)
+        candidate = ""
+
+        candidate += random.choice(atom_chars + aromatic_chars)
+
+        open_parentheses = 0
+        open_rings = {} 
+
+        for _ in range(length - 1):
+            choices = []
+            weights = []
+
+            choices.extend(atom_chars + aromatic_chars)
+            weights.extend([10] * (len(atom_chars) + len(aromatic_chars)))
+
+            if candidate and (candidate[-1].isalpha() or candidate[-1] == ')' or candidate[-1].isdigit()):
+                choices.extend(bond_chars)
+                weights.extend([5] * len(bond_chars))
+
+
+            if candidate and (candidate[-1].isalpha() or candidate[-1] == ')' or candidate[-1].isdigit()):
+                choices.append('(')
+                weights.append(3)
+
+            if open_parentheses > 0:
+                choices.append(')')
+                weights.append(3)
+
+            # Ring closures
+            if candidate and (candidate[-1].isalpha() or candidate[-1] == ')'):
+                # Try to close an open ring
+                open_ring_digits = [d for d, status in open_rings.items() if status == 'open']
+                if open_ring_digits and random.random() < 0.5: # 50% chance to close an open ring
+                    digit_to_close = random.choice(open_ring_digits)
+                    choices.append(digit_to_close)
+                    weights.append(5)
+                else: # Try to open a new ring
+                    available_digits = [d for d in ring_digits if d not in open_rings or open_rings[d] == 'closed']
+                    if available_digits:
+                        choices.extend(available_digits)
+                        weights.extend([2] * len(available_digits))
+            
+            if not choices: # Fallback if no valid options (e.g., after certain bonds)
+                chosen_char = random.choice(atom_chars)
+            else:
+                chosen_char = random.choices(choices, weights=weights, k=1)[0]
+            
+            # Update state
+            if chosen_char == '(':
+                open_parentheses += 1
+            elif chosen_char == ')':
+                if open_parentheses > 0:
+                    open_parentheses -= 1
+                else:
+                    continue # Don't add a closing parenthesis if none are open
+            elif chosen_char in ring_digits:
+                if chosen_char not in open_rings or open_rings[chosen_char] == 'closed':
+                    open_rings[chosen_char] = 'open'
+                elif open_rings[chosen_char] == 'open':
+                    open_rings[chosen_char] = 'closed'
+            
+            candidate += chosen_char
+
+        # Attempt to close any remaining open parentheses
+        candidate += ')' * open_parentheses
+        
+        # Attempt to close any remaining open rings (very crudely)
+        for digit, status in open_rings.items():
+            if status == 'open':
+                # Find a suitable place to close it - this is hard without graph info
+                # For now, just append another atom and the digit if possible
+                if candidate and (candidate[-1].isalpha() or candidate[-1] == ')'):
+                    if random.random() < 0.7 and len(candidate) < self.max_len -1 : # Add another atom then close
+                        candidate += random.choice(atom_chars) + digit
+                    else: # Just append the digit (might be invalid)
+                        candidate += digit
+
+
+        return candidate
+
+    def generate_n_valid_smiles(self, n):
+        """Generates N valid, unique (canonical) SMILES strings."""
+        valid_smiles_set = set()
+        total_attempts_overall = 0
+
+        print(f"Attempting to generate {n} valid SMILES (min_len={self.min_len}, max_len={self.max_len})...")
+        while len(valid_smiles_set) < n:
+            attempts_for_current_smiles = 0
+            generated_this_round = False
+            while not generated_this_round:
+                total_attempts_overall += 1
+                attempts_for_current_smiles += 1
+                candidate = self.generate_random_smiles_candidate()
+                
+                if self.is_valid_smiles(candidate):
+                    mol = Chem.MolFromSmiles(candidate) # Re-parse to be sure and for canonicalization
+                    if mol: # Should be true if is_valid_smiles passed
+                        canonical_smi = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)
+                        try:
+                            mol = Chem.MolFromSmiles(canonical_smi)
+                            logp = Crippen.MolLogP(mol)
+                            if canonical_smi not in valid_smiles_set:
+                                valid_smiles_set.add(canonical_smi)
+                                # print(f"Generated ({len(valid_smiles_set)}/{n}): {canonical_smi} (after {attempts_for_current_smiles} attempts for this one, {total_attempts_overall} total)")
+                                generated_this_round = True
+                                break # Found one, move to the next
+                        except Exception as e:
+                            pass
+           
+        return list(valid_smiles_set)
+
+if __name__ == "__main__":
+    aInChIGenerator = InChIGenerator(max_atoms=15, min_atoms=3)
+    inchi_10 = aInChIGenerator.generate_n_valid_inchi(10)
+    print(inchi_10)
+    aSMILESGenerator = SMILESGenerator(min_len=5, max_len=25)
+    smiles_10 = aSMILESGenerator.generate_n_valid_smiles(10)
+    print(smiles_10)