# File: internbootcamp/bootcamp/ChemStructure2Property/InChI2MRBootCamp.py (new file, mode 100755)
from rdkit import Chem
from rdkit.Chem import Crippen
from .InChI2logPBootCamp import InChI2logPbootcamp


class InChI2MRBootCamp(InChI2logPbootcamp):
    """InChI -> molar refractivity (MR) task.

    Only the prompt text and the answer check are overridden here; everything
    else comes from ``InChI2logPbootcamp``.
    """

    def prompt_func(self, InChI) -> str:
        """Return the question text for one InChI string.

        Args:
            InChI: The InChI string interpolated into the question.

        Returns:
            The task sentence and the answer-format instructions, separated
            by a newline.
        """
        task = f"Given the InChI, determine the Molar Refractivity (MR) value of the material. The InChI is: {InChI}"
        answer_format = """Let's think step by step and output the final answer within \\boxed{}.The final answer should be one float number. For example "Final Answer: \\boxed{afloat}"."""
        return f"{task}\n{answer_format}"

    @classmethod
    def _verify_correction(cls, solution, InChI) -> bool:
        """Check a predicted MR value against RDKit's Crippen estimate.

        Args:
            solution: The extracted answer; it is converted with ``float()``.
            InChI: The InChI string the question was asked about.

        Returns:
            True when the prediction is within 0.01 (absolute) of
            ``Crippen.MolMR`` for the parsed molecule.
        """
        reference_mr = Crippen.MolMR(Chem.MolFromInchi(InChI))
        print(f"Comparing pred: {solution}, ground_truth: {reference_mr}")
        deviation = abs(reference_mr - float(solution))
        return deviation <= 0.01  # maybe mse or mae better?
# File: internbootcamp/bootcamp/ChemStructure2Property/InChI2logPBootCamp.py (new file, mode 100755)
from internbootcamp.bootcamp.base import Basebootcamp
from internbootcamp.libs.chemStructure2Property.ChemStructureGenerator import InChIGenerator
from .utils import last_boxed_only_string, remove_boxed
from rdkit import Chem
from rdkit.Chem import Crippen


class InChI2logPbootcamp(Basebootcamp):
    """InChI -> lipophilicity (logP) task backed by a random InChI generator."""

    def __init__(self, num_numbers=4, max_atoms=15, min_atoms=3, elements=None, seed=None):
        """Store the settings and create the InChI generator.

        Args:
            num_numbers: Stored on the instance; not read anywhere in this class.
            max_atoms: Forwarded to ``InChIGenerator``.
            min_atoms: Forwarded to ``InChIGenerator``.
            elements: Forwarded to ``InChIGenerator``.
            seed: Forwarded to ``InChIGenerator``.
        """
        # NOTE: the base-class initializer is not invoked here (the original
        # code carries the call only as a commented-out line).
        self.num_numbers = num_numbers
        self.InChIGenerator = InChIGenerator(
            max_atoms=max_atoms,
            min_atoms=min_atoms,
            elements=elements,
            seed=seed,
        )

    def case_generator(self) -> str:
        """Return one freshly generated InChI string."""
        single_batch = self.InChIGenerator.generate_n_valid_inchi(1)
        return single_batch[0]

    def prompt_func(self, InChI) -> str:
        """Return the question text for one InChI string.

        Args:
            InChI: The InChI string interpolated into the question.

        Returns:
            The task sentence and the answer-format instructions, separated
            by a newline.
        """
        task = f"Given the InChI, determine the lipophilicity (logP) value of the material. The InChI is: {InChI}"
        answer_format = """Let's think step by step and output the final answer within \\boxed{}.The final answer should be one float number. For example "Final Answer: \\boxed{afloat}"."""
        return f"{task}\n{answer_format}"

    @staticmethod
    def extract_output(output):
        """Pull the content of the last boxed answer out of a model response.

        Args:
            output: Model output to be processed.

        Returns:
            The text inside the last boxed expression, or None when
            ``last_boxed_only_string`` finds nothing.
        """
        boxed = last_boxed_only_string(output)
        return None if boxed is None else remove_boxed(boxed)

    @classmethod
    def _verify_correction(cls, solution, InChI) -> bool:
        """Check a predicted logP value against RDKit's Crippen estimate.

        Args:
            solution: The extracted answer; it is converted with ``float()``.
            InChI: The InChI string the question was asked about.

        Returns:
            True when the prediction is within 0.01 (absolute) of
            ``Crippen.MolLogP`` for the parsed molecule.
        """
        reference_logp = Crippen.MolLogP(Chem.MolFromInchi(InChI))
        print(f"Comparing pred: {solution}, ground_truth: {reference_logp}")
        deviation = abs(reference_logp - float(solution))
        return deviation <= 0.01  # maybe mse or mae better?


# File: internbootcamp/bootcamp/ChemStructure2Property/SMILES2MRBootCamp.py (new file, mode 100755)
from internbootcamp.bootcamp.base import Basebootcamp
from internbootcamp.libs.chemStructure2Property.ChemStructureGenerator import SMILESGenerator
from .utils import last_boxed_only_string, remove_boxed
from rdkit import Chem
from rdkit.Chem import Crippen

from .SMILES2logPBootCamp import SMILES2logPBootCamp


class SMILES2MRBootCamp(SMILES2logPBootCamp):
    """SMILES -> molar refractivity (MR) task.

    Only the prompt text and the answer check are overridden here; everything
    else comes from ``SMILES2logPBootCamp``.
    """

    def prompt_func(self, SMILES) -> str:
        """Return the question text for one SMILES string.

        Args:
            SMILES: The SMILES string interpolated into the question.

        Returns:
            The task sentence and the answer-format instructions, separated
            by a newline.
        """
        task = f"Given the SMILES, determine the Molar Refractivity (MR) value of the material. The SMILES is: {SMILES}"
        answer_format = """Let's think step by step and output the final answer within \\boxed{}.The final answer should be one float number. For example "Final Answer: \\boxed{afloat}"."""
        return f"{task}\n{answer_format}"

    @classmethod
    def _verify_correction(cls, solution, SMILES) -> bool:
        """Check a predicted MR value against RDKit's Crippen estimate.

        Args:
            solution: The extracted answer; it is converted with ``float()``.
            SMILES: The SMILES string the question was asked about.

        Returns:
            True when the prediction is within 0.01 (absolute) of
            ``Crippen.MolMR`` for the parsed molecule.
        """
        reference_mr = Crippen.MolMR(Chem.MolFromSmiles(SMILES))
        print(f"Comparing pred: {solution}, ground_truth: {reference_mr}")
        deviation = abs(reference_mr - float(solution))
        return deviation <= 0.01  # maybe mse or mae better?
# File: internbootcamp/bootcamp/ChemStructure2Property/SMILES2logPBootCamp.py (new file, mode 100755)
from internbootcamp.bootcamp.base import Basebootcamp
from internbootcamp.libs.chemStructure2Property.ChemStructureGenerator import SMILESGenerator
from .utils import last_boxed_only_string, remove_boxed
from rdkit import Chem
from rdkit.Chem import Crippen

from .InChI2logPBootCamp import InChI2logPbootcamp


class SMILES2logPBootCamp(InChI2logPbootcamp):
    """SMILES -> lipophilicity (logP) task backed by a random SMILES generator.

    ``extract_output`` is inherited from ``InChI2logPbootcamp``; the
    constructor, case generation, prompt and answer check are overridden.
    """

    def __init__(self, num_numbers=4, min_len=5, max_len=25,
                 seed=None):
        """Store the settings and create the SMILES generator.

        Args:
            num_numbers: Stored on the instance; not read anywhere in this class.
            min_len: Minimum candidate length, forwarded to ``SMILESGenerator``.
            max_len: Maximum candidate length, forwarded to ``SMILESGenerator``.
            seed: Random seed, forwarded to ``SMILESGenerator``.
        """
        # NOTE: neither parent initializer is invoked (the original code
        # carries the call only as a commented-out line), so no
        # ``InChIGenerator`` attribute is created on this instance.
        self.num_numbers = num_numbers
        # Forward the caller's values. The previous code passed the literals
        # 5 / 25 / None, which silently ignored all three arguments; the
        # defaults are the same literals, so default construction is unchanged.
        self.SMILESGenerator = SMILESGenerator(min_len=min_len, max_len=max_len, seed=seed)

    def case_generator(self) -> str:
        """Return one freshly generated SMILES string."""
        return self.SMILESGenerator.generate_n_valid_smiles(1)[0]

    def prompt_func(self, SMILES) -> str:
        """Return the question text for one SMILES string.

        Args:
            SMILES: The SMILES string interpolated into the question.

        Returns:
            The task sentence and the answer-format instructions, separated
            by a newline.
        """
        instruction = f"Given the SMILES, determine the lipophilicity (logP) value of the material. The SMILES is: {SMILES}"
        instruction_following = """Let's think step by step and output the final answer within \\boxed{}.The final answer should be one float number. For example "Final Answer: \\boxed{afloat}"."""

        return instruction + '\n' + instruction_following

    @classmethod
    def _verify_correction(cls, solution, SMILES) -> bool:
        """Check a predicted logP value against RDKit's Crippen estimate.

        Args:
            solution: The extracted answer; it is converted with ``float()``.
            SMILES: The SMILES string the question was asked about.

        Returns:
            True when the prediction is within 0.01 (absolute) of
            ``Crippen.MolLogP`` for the parsed molecule.
        """
        mol = Chem.MolFromSmiles(SMILES)
        true_logp = Crippen.MolLogP(mol)
        print(f"Comparing pred: {solution}, ground_truth: {true_logp}")
        return abs(true_logp - float(solution)) <= 0.01  # maybe mse or mae better?
# File: internbootcamp/bootcamp/ChemStructure2Property/utils.py (new file, mode 100644)


def remove_boxed(s):
    """Strip the ``\\boxed`` wrapper from a boxed answer string.

    Two forms are accepted: ``\\boxed <content>`` (space-delimited) and
    ``\\boxed{<content>}`` (brace-delimited).

    Args:
        s: A string that starts with one of the two wrapper forms.

    Returns:
        The wrapped content without the ``\\boxed`` prefix (and without the
        closing brace for the brace-delimited form).

    Raises:
        AssertionError: If ``s`` does not have the expected prefix/suffix.
            The checks are explicit ``raise`` statements rather than
            ``assert`` so they still run under ``python -O``.
    """
    spaced_prefix = "\\boxed "
    if spaced_prefix in s:
        if not s.startswith(spaced_prefix):
            raise AssertionError(f"expected string to start with {spaced_prefix!r}")
        return s[len(spaced_prefix):]

    braced_prefix = "\\boxed{"
    if not s.startswith(braced_prefix):
        raise AssertionError(f"expected string to start with {braced_prefix!r}")
    if not s.endswith("}"):
        raise AssertionError("expected string to end with '}'")

    return s[len(braced_prefix):-1]


def last_boxed_only_string(string):
    """Return the last ``\\boxed...`` (or ``\\fbox...``) expression in a string.

    Args:
        string: Text to search.

    Returns:
        * If the text contains ``\\boxed `` (with a trailing space): the
          prefix ``\\boxed `` followed by whatever comes after its last
          occurrence, cut at the first ``$``.
        * Otherwise: the substring from the last ``\\boxed`` (falling back to
          the last ``\\fbox``) through the brace that balances its first
          opening brace.
        * None when no marker is found or the braces never balance.
    """
    start = string.rfind("\\boxed")
    if "\\boxed " in string:
        return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
    if start < 0:
        start = string.rfind("\\fbox")
        if start < 0:
            return None

    # Scan forward from the marker, tracking brace depth; the expression ends
    # at the first closing brace that brings the depth back to zero.
    depth = 0
    for pos in range(start, len(string)):
        char = string[pos]
        if char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth == 0:
                return string[start:pos + 1]

    return None


# ---------------------------------------------------------------------------
# Patch hunk for internbootcamp/bootcamp/__init__.py (mode 100755), kept in
# unified-diff form ("+" marks added lines, " " marks context lines):
#
# @@ -30,6 +30,13 @@ from .bbeh_hyperbaton.hyperbaton_default import BBEHHyperbatonbootcamp
#  from .bbeh_boardgame_qa.bbeh_boardgame_qa import Bbehboardgameqabootcamp
#  from .bbeh_boolean_expressions.bbeh_boolean_expressions import Bbehbooleanexpressionsbootcamp
# +
# +from .ChemStructure2Property.InChI2logPBootCamp import InChI2logPbootcamp
# +from .ChemStructure2Property.InChI2MRBootCamp import InChI2MRBootCamp
# +from .ChemStructure2Property.SMILES2logPBootCamp import SMILES2logPBootCamp
# +from .ChemStructure2Property.SMILES2MRBootCamp import SMILES2MRBootCamp
# +
# +
#  from .kakurasu.kakurasu import Kakurasubootcamp
#  from .nonograms.nonograms import Nonogramsbootcamp
# (Trailing context line of the internbootcamp/bootcamp/__init__.py patch hunk:)
#  from .hitori.hitori import Hitoribootcamp
#
# File: internbootcamp/libs/chemStructure2Property/ChemStructureGenerator.py (new file, mode 100755)
import random
from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem.inchi import MolToInchi
from rdkit.Chem import Crippen


def _describe_mol(mol):
    """Return a SMILES string for a debug message without ever raising."""
    try:
        return Chem.MolToSmiles(mol)
    except Exception:
        return "<unavailable>"


class InChIGenerator:
    """Random generator of InChI strings built from small random molecules."""

    def __init__(self, max_atoms=15, min_atoms=3, elements=None,
                 seed=None):
        """Configure the generator.

        Args:
            max_atoms: Upper bound (inclusive) for the number of atoms drawn
                per molecule.
            min_atoms: Lower bound (inclusive) for the number of atoms drawn
                per molecule.
            elements: Element symbols to draw atoms from; defaults to
                C, N, O, S, P, F, Cl, Br, I.
            seed: Seed for Python's module-level ``random`` generator; 42 is
                used when None. Note that this reseeds the global generator
                every time an instance is constructed.
        """
        RDLogger.DisableLog('rdApp.*')
        random.seed(42 if seed is None else seed)
        self.max_atoms = max_atoms
        self.min_atoms = min_atoms
        if elements is None:
            self.elements = ['C', 'N', 'O', 'S', 'P', 'F', 'Cl', 'Br', 'I']
        else:
            self.elements = elements

    def generate_random_molecule_rdkit(self):
        """Build one random molecule with RDKit's ``RWMol``.

        A random tree of atoms is grown first (each new atom is bonded to a
        randomly chosen existing atom), then extra single bonds between random
        atom pairs are added to try to close rings, and finally unassigned
        chiral centers get a random chiral tag.

        Returns:
            The sanitized molecule, or None when the drawn atom count is 0 or
            when RDKit rejects the random structure.
        """
        num_atoms_to_add = random.randint(self.min_atoms, self.max_atoms)
        if num_atoms_to_add == 0:
            return None

        rw_mol = Chem.RWMol()  # Editable molecule
        rw_mol.AddAtom(Chem.Atom(random.choice(self.elements)))

        # SINGLE appears twice so single bonds are drawn twice as often as
        # double or triple bonds.
        bond_types = [
            Chem.BondType.SINGLE,
            Chem.BondType.SINGLE,
            Chem.BondType.DOUBLE,
            Chem.BondType.TRIPLE,
        ]

        # Grow the molecule one atom at a time. The order of the random draws
        # (anchor atom, element, bond type) is kept fixed so that seeded runs
        # stay reproducible.
        for _ in range(1, num_atoms_to_add):
            existing_atom_idx = random.randrange(rw_mol.GetNumAtoms())
            new_atom_symbol = random.choice(self.elements)
            new_atom_idx = rw_mol.AddAtom(Chem.Atom(new_atom_symbol))
            bond_type = random.choice(bond_types)
            rw_mol.AddBond(existing_atom_idx, new_atom_idx, bond_type)

        # Attempt to form rings by bonding random pairs that are not yet bonded.
        num_atoms = rw_mol.GetNumAtoms()
        if num_atoms > 2:
            atom_indices = list(range(num_atoms))
            for _ in range(random.randint(0, num_atoms // 3)):
                idx1, idx2 = random.sample(atom_indices, 2)
                if rw_mol.GetBondBetweenAtoms(idx1, idx2) is None:
                    rw_mol.AddBond(idx1, idx2, Chem.BondType.SINGLE)

        # Random graphs frequently violate valence/aromaticity rules; RDKit
        # signals that with a variety of exception types, so any failure here
        # simply means "discard this candidate".
        try:
            mol = rw_mol.GetMol()
            Chem.SanitizeMol(mol)  # Checks valency, aromaticity, etc.

            if mol.GetNumAtoms() > 0:
                # NOTE(review): no 3D conformer is ever added in this method,
                # so this call presumably has nothing to work from; it is kept
                # for parity with the original code.
                Chem.AssignStereochemistryFrom3D(mol)
                chiral_centers = Chem.FindMolChiralCenters(mol, includeUnassigned=True)
                for center_idx, stereo_val in chiral_centers:
                    if stereo_val != '?':  # Already assigned
                        continue
                    atom = mol.GetAtomWithIdx(center_idx)
                    if random.choice([True, False]):
                        atom.SetChiralTag(Chem.ChiralType.CHI_TETRAHEDRAL_CW)
                    else:
                        atom.SetChiralTag(Chem.ChiralType.CHI_TETRAHEDRAL_CCW)
                # Re-sanitize after modifying chiral tags.
                Chem.SanitizeMol(mol)
            return mol
        except Exception:
            return None

    def generate_n_valid_inchi(self, n):
        """Generate ``n`` distinct InChI strings.

        Each candidate molecule is converted to InChI, parsed back, and run
        through ``Crippen.MolLogP``; only candidates that survive all three
        steps are kept. The loop has no attempt cap: it runs until ``n``
        distinct strings have been collected.

        Args:
            n: Number of distinct InChI strings to produce.

        Returns:
            A list of ``n`` distinct InChI strings (empty when ``n <= 0``).
            The list is built from a set, so its order is not meaningful.
        """
        valid_inchi_set = set()

        while len(valid_inchi_set) < n:
            mol = self.generate_random_molecule_rdkit()
            if mol is None:
                continue

            try:
                inchi_string = MolToInchi(mol)
                # Keep the round-tripped molecule in its own name: the parse
                # can return None, and the original molecule is still needed
                # for the debug message below.
                round_tripped = Chem.MolFromInchi(inchi_string) if inchi_string else None
                if round_tripped is None:
                    continue
                # Called only to confirm the descriptor can be computed.
                Crippen.MolLogP(round_tripped)
            except Exception as e:
                # This can happen if the molecule is somehow malformed even
                # after sanitization, or if InChI generation itself fails.
                print(f"Debug: MolToInchi failed: {e} for SMILES: {_describe_mol(mol)}")
                continue

            valid_inchi_set.add(inchi_string)

        return list(valid_inchi_set)


class SMILESGenerator:
    """Random generator of valid, canonical SMILES strings."""

    def __init__(self, min_len=5, max_len=25,
                 seed=None):
        """Configure the generator.

        Args:
            min_len: Lower bound (inclusive) for the drawn candidate length.
            max_len: Upper bound (inclusive) for the drawn candidate length.
            seed: Seed for Python's module-level ``random`` generator; 42 is
                used when None. Note that this reseeds the global generator
                every time an instance is constructed.
        """
        RDLogger.DisableLog('rdApp.*')
        random.seed(42 if seed is None else seed)
        self.min_len = min_len
        self.max_len = max_len

    def is_valid_smiles(self, smi):
        """Return True when RDKit can parse and sanitize the SMILES string.

        Args:
            smi: Candidate SMILES string; empty/None is treated as invalid.
        """
        if not smi:
            return False
        mol = Chem.MolFromSmiles(smi, sanitize=False)  # Parse without sanitization first
        if mol is None:
            return False
        try:
            Chem.SanitizeMol(mol)
        except Exception:
            return False
        return True

    def generate_random_smiles_candidate(self):
        """Generate a random string that might be a SMILES string.

        This is a very naive generator and produces many invalid SMILES; the
        caller is expected to validate the result. Only a subset of SMILES
        syntax is used: a few organic-subset atoms, aromatic atoms, the bond
        symbols ``-``, ``=``, ``#``, parentheses, and ring digits 1-9.

        Returns:
            The candidate string.
        """
        atom_chars = ['C', 'N', 'O', 'S', 'F', 'Cl', 'Br', 'I', 'P']
        aromatic_chars = ['c', 'n', 'o', 's']
        bond_chars = ['-', '=', '#']
        ring_digits = [str(i) for i in range(1, 10)]  # 1-9
        any_atom = atom_chars + aromatic_chars

        length = random.randint(self.min_len, self.max_len)
        candidate = random.choice(any_atom)

        open_parentheses = 0
        open_rings = {}  # ring digit -> 'open' / 'closed'

        for _ in range(length - 1):
            last = candidate[-1]
            after_atom = last.isalpha() or last == ')'
            after_operand = after_atom or last.isdigit()

            # An atom is always allowed.
            choices = list(any_atom)
            weights = [10] * len(any_atom)

            # Bonds and branch openings only make sense after an operand.
            if after_operand:
                choices.extend(bond_chars)
                weights.extend([5] * len(bond_chars))
                choices.append('(')
                weights.append(3)

            if open_parentheses > 0:
                choices.append(')')
                weights.append(3)

            # Ring closures
            if after_atom:
                open_ring_digits = [d for d, status in open_rings.items() if status == 'open']
                # 50% chance to offer closing an open ring; otherwise offer
                # opening a new one. The random draw only happens when a ring
                # is actually open.
                if open_ring_digits and random.random() < 0.5:
                    choices.append(random.choice(open_ring_digits))
                    weights.append(5)
                else:
                    available_digits = [
                        d for d in ring_digits
                        if d not in open_rings or open_rings[d] == 'closed'
                    ]
                    if available_digits:
                        choices.extend(available_digits)
                        weights.extend([2] * len(available_digits))

            chosen_char = random.choices(choices, weights=weights, k=1)[0]

            # Update state. ')' is only ever offered while a parenthesis is
            # open, so the counter cannot go negative.
            if chosen_char == '(':
                open_parentheses += 1
            elif chosen_char == ')':
                open_parentheses -= 1
            elif chosen_char in ring_digits:
                if open_rings.get(chosen_char) == 'open':
                    open_rings[chosen_char] = 'closed'
                else:
                    open_rings[chosen_char] = 'open'

            candidate += chosen_char

        # Close any remaining open parentheses.
        candidate += ')' * open_parentheses

        # Attempt to close any remaining open rings (very crudely): without
        # graph information there is no good place to close them, so the
        # digit is appended at the end, optionally after one more atom.
        for digit, status in open_rings.items():
            if status != 'open':
                continue
            if candidate[-1].isalpha() or candidate[-1] == ')':
                if random.random() < 0.7 and len(candidate) < self.max_len - 1:
                    candidate += random.choice(atom_chars) + digit
                else:  # Just append the digit (might be invalid)
                    candidate += digit

        return candidate

    def generate_n_valid_smiles(self, n):
        """Generate ``n`` distinct canonical SMILES strings.

        Random candidates are drawn until ``n`` distinct canonical SMILES
        have been collected; a candidate is kept only if it is valid, can be
        re-parsed from its canonical form, and ``Crippen.MolLogP`` can be
        computed for it. The loop has no attempt cap.

        Args:
            n: Number of distinct SMILES strings to produce.

        Returns:
            A list of ``n`` distinct canonical SMILES strings (empty when
            ``n <= 0``). The list is built from a set, so its order is not
            meaningful.
        """
        valid_smiles_set = set()

        print(f"Attempting to generate {n} valid SMILES (min_len={self.min_len}, max_len={self.max_len})...")
        while len(valid_smiles_set) < n:
            candidate = self.generate_random_smiles_candidate()
            if not self.is_valid_smiles(candidate):
                continue

            mol = Chem.MolFromSmiles(candidate)  # Re-parse for canonicalization
            if mol is None:  # Should not happen once is_valid_smiles passed
                continue

            canonical_smi = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)
            # Best-effort check that the canonical form round-trips and that
            # the descriptor can be computed; failures just drop the candidate.
            try:
                Crippen.MolLogP(Chem.MolFromSmiles(canonical_smi))
            except Exception:
                continue

            valid_smiles_set.add(canonical_smi)

        return list(valid_smiles_set)


if __name__ == "__main__":
    # Small demo: print ten generated InChI strings and ten SMILES strings.
    aInChIGenerator = InChIGenerator(max_atoms=15, min_atoms=3)
    inchi_10 = aInChIGenerator.generate_n_valid_inchi(10)
    print(inchi_10)
    aSMILESGenerator = SMILESGenerator(min_len=5, max_len=25)
    smiles_10 = aSMILESGenerator.generate_n_valid_smiles(10)
    print(smiles_10)