adding MR and LogP Prediction tasks

This commit is contained in:
Jucheng Hu 2025-06-16 15:10:05 +08:00
parent c2dad02fe4
commit 91972f43ea
7 changed files with 490 additions and 0 deletions

View file

@ -0,0 +1,28 @@
from rdkit import Chem
from rdkit.Chem import Crippen
from .InChI2logPBootCamp import InChI2logPbootcamp
class InChI2MRBootCamp(InChI2logPbootcamp):
    """Bootcamp task: predict the Crippen molar refractivity (MR) from an InChI.

    Only the prompt and the verifier are overridden here; everything else
    comes from ``InChI2logPbootcamp``.
    """

    def prompt_func(self, InChI) -> str:
        """Return the prompt asking for the MR value of the given InChI."""
        instruction = f"Given the InChI, determine the Molar Refractivity (MR) value of the material. The InChI is: {InChI}"
        instruction_following = """Let's think step by step and output the final answer within \\boxed{}.The final answer should be one float number. For example "Final Answer: \\boxed{afloat}"."""
        return instruction + '\n' + instruction_following

    @classmethod
    def _verify_correction(cls, solution, InChI) -> bool:
        """Check a predicted MR value against RDKit's Crippen estimate.

        Args:
            solution: The extracted answer; anything ``float()`` accepts.
            InChI: The InChI string the prompt was built from.

        Returns:
            True when the prediction is within 0.01 of ``Crippen.MolMR``.
            False when it is further away, when the InChI cannot be parsed,
            or when ``solution`` is not convertible to a float.
        """
        mol = Chem.MolFromInchi(InChI)
        if mol is None:
            # RDKit signals an unparsable InChI with None; there is no
            # reference value to compare against.
            return False
        true_MR = Crippen.MolMR(mol)
        print(f"Comparing pred: {solution}, ground_truth: {true_MR}")
        try:
            predicted = float(solution)
        except (TypeError, ValueError):
            # Non-numeric model output is a wrong answer, not a crash.
            return False
        # Fixed absolute tolerance; an MAE/MSE-style score may be preferable.
        return abs(true_MR - predicted) <= 0.01

View file

@ -0,0 +1,56 @@
from internbootcamp.bootcamp.base import Basebootcamp
from internbootcamp.libs.chemStructure2Property.ChemStructureGenerator import InChIGenerator
from .utils import last_boxed_only_string, remove_boxed
from rdkit import Chem
from rdkit.Chem import Crippen
class InChI2logPbootcamp(Basebootcamp):
    """Bootcamp task: predict the Crippen logP (lipophilicity) from an InChI."""

    def __init__(self, num_numbers=4, max_atoms=15, min_atoms=3, elements=None, seed=None):
        """Store the configuration and create the InChI generator.

        Args:
            num_numbers: Stored on the instance; not used by this class.
            max_atoms: Forwarded to ``InChIGenerator``.
            min_atoms: Forwarded to ``InChIGenerator``.
            elements: Forwarded to ``InChIGenerator``.
            seed: Forwarded to ``InChIGenerator``.
        """
        # NOTE(review): the base-class initialiser is not invoked (the call
        # was commented out originally) -- confirm Basebootcamp needs none.
        self.num_numbers = num_numbers
        self.InChIGenerator = InChIGenerator(max_atoms=max_atoms, min_atoms=min_atoms, elements=elements, seed=seed)

    def case_generator(self) -> str:
        """Return one InChI string produced by the generator."""
        return self.InChIGenerator.generate_n_valid_inchi(1)[0]

    def prompt_func(self, InChI) -> str:
        """Return the prompt asking for the logP value of the given InChI."""
        instruction = f"Given the InChI, determine the lipophilicity (logP) value of the material. The InChI is: {InChI}"
        instruction_following = """Let's think step by step and output the final answer within \\boxed{}.The final answer should be one float number. For example "Final Answer: \\boxed{afloat}"."""
        return instruction + '\n' + instruction_following

    @staticmethod
    def extract_output(output):
        """Extract the content of the last boxed answer from a model output.

        Args:
            output: Model output to be processed.

        Returns:
            The result of ``remove_boxed`` applied to the last boxed
            expression, or None when no boxed expression is found.
        """
        output = last_boxed_only_string(output)
        if output is None:
            return None
        return remove_boxed(output)

    @classmethod
    def _verify_correction(cls, solution, InChI) -> bool:
        """Check a predicted logP value against RDKit's Crippen estimate.

        Args:
            solution: The extracted answer; anything ``float()`` accepts.
            InChI: The InChI string the prompt was built from.

        Returns:
            True when the prediction is within 0.01 of ``Crippen.MolLogP``.
            False when it is further away, when the InChI cannot be parsed,
            or when ``solution`` is not convertible to a float.
        """
        mol = Chem.MolFromInchi(InChI)
        if mol is None:
            # RDKit signals an unparsable InChI with None; there is no
            # reference value to compare against.
            return False
        true_logp = Crippen.MolLogP(mol)
        print(f"Comparing pred: {solution}, ground_truth: {true_logp}")
        try:
            predicted = float(solution)
        except (TypeError, ValueError):
            # Non-numeric model output is a wrong answer, not a crash.
            return False
        # Fixed absolute tolerance; an MAE/MSE-style score may be preferable.
        return abs(true_logp - predicted) <= 0.01

View file

@ -0,0 +1,34 @@
from internbootcamp.bootcamp.base import Basebootcamp
from internbootcamp.libs.chemStructure2Property.ChemStructureGenerator import SMILESGenerator
from .utils import last_boxed_only_string, remove_boxed
from rdkit import Chem
from rdkit.Chem import Crippen
from .SMILES2logPBootCamp import SMILES2logPBootCamp
class SMILES2MRBootCamp(SMILES2logPBootCamp):
    """Bootcamp task: predict the Crippen molar refractivity (MR) from SMILES.

    Only the prompt and the verifier are overridden here; everything else
    comes from ``SMILES2logPBootCamp``.
    """

    def prompt_func(self, SMILES) -> str:
        """Return the prompt asking for the MR value of the given SMILES."""
        instruction = f"Given the SMILES, determine the Molar Refractivity (MR) value of the material. The SMILES is: {SMILES}"
        instruction_following = """Let's think step by step and output the final answer within \\boxed{}.The final answer should be one float number. For example "Final Answer: \\boxed{afloat}"."""
        return instruction + '\n' + instruction_following

    @classmethod
    def _verify_correction(cls, solution, SMILES) -> bool:
        """Check a predicted MR value against RDKit's Crippen estimate.

        Args:
            solution: The extracted answer; anything ``float()`` accepts.
            SMILES: The SMILES string the prompt was built from.

        Returns:
            True when the prediction is within 0.01 of ``Crippen.MolMR``.
            False when it is further away, when the SMILES cannot be parsed,
            or when ``solution`` is not convertible to a float.
        """
        mol = Chem.MolFromSmiles(SMILES)
        if mol is None:
            # RDKit signals an unparsable SMILES with None; there is no
            # reference value to compare against.
            return False
        true_MR = Crippen.MolMR(mol)
        print(f"Comparing pred: {solution}, ground_truth: {true_MR}")
        try:
            predicted = float(solution)
        except (TypeError, ValueError):
            # Non-numeric model output is a wrong answer, not a crash.
            return False
        # Fixed absolute tolerance; an MAE/MSE-style score may be preferable.
        return abs(true_MR - predicted) <= 0.01

View file

@ -0,0 +1,43 @@
from internbootcamp.bootcamp.base import Basebootcamp
from internbootcamp.libs.chemStructure2Property.ChemStructureGenerator import SMILESGenerator
from .utils import last_boxed_only_string, remove_boxed
from rdkit import Chem
from rdkit.Chem import Crippen
from .InChI2logPBootCamp import InChI2logPbootcamp
class SMILES2logPBootCamp(InChI2logPbootcamp):
    """Bootcamp task: predict the Crippen logP (lipophilicity) from SMILES."""

    def __init__(self, num_numbers=4, min_len=5, max_len=25,
                 seed=None):
        """Store the configuration and create the SMILES generator.

        Args:
            num_numbers: Stored on the instance; not used by this class.
            min_len: Forwarded to ``SMILESGenerator``.
            max_len: Forwarded to ``SMILESGenerator``.
            seed: Forwarded to ``SMILESGenerator``.
        """
        # NOTE(review): the parent initialiser is deliberately not invoked
        # (the call was commented out originally); this class builds its own
        # generator instead of the InChI one.
        self.num_numbers = num_numbers
        # Forward the caller's arguments. The generator was previously built
        # with the literals 5, 25 and None, so min_len, max_len and seed
        # were silently ignored.
        self.SMILESGenerator = SMILESGenerator(min_len=min_len, max_len=max_len, seed=seed)

    def case_generator(self) -> str:
        """Return one SMILES string produced by the generator."""
        return self.SMILESGenerator.generate_n_valid_smiles(1)[0]

    def prompt_func(self, SMILES) -> str:
        """Return the prompt asking for the logP value of the given SMILES."""
        instruction = f"Given the SMILES, determine the lipophilicity (logP) value of the material. The SMILES is: {SMILES}"
        instruction_following = """Let's think step by step and output the final answer within \\boxed{}.The final answer should be one float number. For example "Final Answer: \\boxed{afloat}"."""
        return instruction + '\n' + instruction_following

    @classmethod
    def _verify_correction(cls, solution, SMILES) -> bool:
        """Check a predicted logP value against RDKit's Crippen estimate.

        Args:
            solution: The extracted answer; anything ``float()`` accepts.
            SMILES: The SMILES string the prompt was built from.

        Returns:
            True when the prediction is within 0.01 of ``Crippen.MolLogP``.
            False when it is further away, when the SMILES cannot be parsed,
            or when ``solution`` is not convertible to a float.
        """
        mol = Chem.MolFromSmiles(SMILES)
        if mol is None:
            # RDKit signals an unparsable SMILES with None; there is no
            # reference value to compare against.
            return False
        true_logp = Crippen.MolLogP(mol)
        print(f"Comparing pred: {solution}, ground_truth: {true_logp}")
        try:
            predicted = float(solution)
        except (TypeError, ValueError):
            # Non-numeric model output is a wrong answer, not a crash.
            return False
        # Fixed absolute tolerance; an MAE/MSE-style score may be preferable.
        return abs(true_logp - predicted) <= 0.01

View file

@ -0,0 +1,43 @@
def remove_boxed(s):
    """Strip the LaTeX ``\\boxed`` wrapper from *s* and return the payload.

    Two spellings are recognised:

    * ``\\boxed <payload>`` -- everything after the prefix is returned;
    * ``\\boxed{<payload>}`` -- the text between the outer braces is returned.

    An ``AssertionError`` is raised when *s* does not start with the expected
    prefix or, for the braced form, does not end with ``}``.
    """
    spaced_prefix = "\\boxed "
    if spaced_prefix in s:
        assert s.startswith(spaced_prefix)
        return s[len(spaced_prefix):]

    braced_prefix = "\\boxed{"
    assert s.startswith(braced_prefix)
    assert s.endswith("}")
    return s[len(braced_prefix):-1]
def last_boxed_only_string(string):
    """Return the last ``\\boxed``/``\\fbox`` expression in *string*, or None.

    If the space-delimited form ``\\boxed <payload>`` occurs anywhere, the
    text after its last occurrence, cut at the first ``$``, is returned with
    the ``\\boxed `` prefix re-attached.

    Otherwise the last ``\\boxed`` is located (``\\fbox`` is tried only when
    no ``\\boxed`` exists) and braces are balanced from that position; the
    substring up to and including the closing brace is returned. None is
    returned when neither marker is present or the braces never balance.
    """
    spaced_marker = "\\boxed "
    if spaced_marker in string:
        tail = string.split(spaced_marker)[-1]
        return spaced_marker + tail.split("$")[0]

    start = string.rfind("\\boxed")
    if start < 0:
        start = string.rfind("\\fbox")
    if start < 0:
        return None

    depth = 0
    for pos in range(start, len(string)):
        ch = string[pos]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                # Closing brace that matches the first opening brace.
                return string[start:pos + 1]
    return None