"""LLM-judge pipelines for scoring and validating optimizer code."""

import logging

from dotenv import load_dotenv
from verdict import Layer, Pipeline
from verdict.common.judge import CategoricalJudgeUnit, JudgeUnit
from verdict.scale import ContinuousScale, DiscreteScale
from verdict.schema import Schema
from verdict.transform import MaxPoolUnit

# Runs at import time, exactly as in the original module.
# NOTE(review): presumably this supplies the model provider's credentials
# from a local .env file -- confirm against the deployment setup.
load_dotenv()

logger = logging.getLogger(__name__)

# Prompt for the novelty judge. The `{source.*}` placeholders are left as
# plain text (this is NOT an f-string); they are filled in by verdict.
_SCORE_PROMPT = (
    "You are a judge that is an expert at evaluating optimizers for their novelty "
    "as they will be accepted to a prestigious research conference. Given the following "
    "optimizer code and its architecture/use-case, you must rate it on a scale of 1 to 10 "
    "based on how novel it is and its impactfulness in speeding up model training. "
    "Here is the code: {source.optimizer_code}\n"
    "Here is the architecture: {source.architecture}"
)

# Prompt for the yes/no validity judge.
# NOTE(review): the text is reproduced character-for-character from the
# file as received, where it appears whitespace-collapsed (the numbered
# criteria run together on one line). If the original literal was
# multi-line, restore its line breaks here.
_VALIDITY_PROMPT = (
    " You are an expert code validator specializing in PyTorch optimizers."
    " Your task is to determine if the provided optimizer code is completely"
    " valid and error-free."
    " A valid optimizer MUST satisfy ALL of these criteria:"
    " 1. Has zero syntax or runtime errors:"
    " - No undefined variables"
    " - No type mismatches"
    " - No memory issues"
    " - No CUDA/CPU compatibility problems"
    " 2. Can be imported and instantiated without blocking errors"
    " 3. Can run a complete optimization step without exceptions"
    " Optimizer Code: {source.optimizer_code}"
    " Stdout: {source.stdout}"
    " Stderr: {source.stderr}"
    " Respond with:"
    ' - "yes" if ALL criteria are met and the code is completely error-free'
    ' - "no" if ANY criterion fails or there are ANY potential issues'
    " Be extremely strict in your evaluation. \n"
)


class OptimizerEvaluator:
    """Bundle two verdict pipelines: a novelty score and a validity check."""

    # Model identifier passed to `.via(...)` when none is given explicitly.
    DEFAULT_MODEL = "xai/grok-3-latest"

    # Keys read from the pipeline response mappings.
    # NOTE(review): these depend on verdict's internal naming of pipeline
    # nodes -- re-check them whenever a pipeline's shape or unit names change.
    _SCORE_KEY = "Pipeline_root.block.block.unit[Map MaxPool]_score"
    _VALIDITY_KEY = (
        "Pipeline_root.block.layer[0].unit[CategoricalJudge Judge]_choice"
    )

    def __init__(self, model: str = DEFAULT_MODEL):
        """Build both pipelines.

        Args:
            model: Model identifier handed to `.via(...)` for every judge.
                Defaults to the identifier the original code hard-coded.
        """
        # Three judge units on a 1-10 continuous scale, followed by a
        # MaxPoolUnit (presumably keeping the highest score -- confirm
        # against the verdict documentation).
        self.score_pipeline = (
            Pipeline()
            >> Layer(
                JudgeUnit(scale=ContinuousScale(1, 10)).prompt(_SCORE_PROMPT),
                repeat=3,
            ).via(model)
            >> MaxPoolUnit()
        )

        # A single categorical judge answering 'yes' or 'no', with no
        # explanation requested and `retries=2` passed to `.via(...)`.
        self.validity_pipeline = (
            Pipeline()
            >> Layer(
                CategoricalJudgeUnit(
                    name='Judge',
                    categories=DiscreteScale(['yes', 'no']),
                    explanation=False,
                )
                .prompt(_VALIDITY_PROMPT)
                .via(model, retries=2)
            )
        )

    def score(self, optimizer_code: str, architecture: str) -> float:
        """Run the score pipeline and return the pooled score.

        Args:
            optimizer_code: Source code of the optimizer to judge.
            architecture: Description of the architecture / use-case.

        Returns:
            The value stored under the max-pool score key of the pipeline
            response, or ``0.0`` when that key is absent.
        """
        schema = Schema.of(
            optimizer_code=optimizer_code,
            architecture=architecture,
        )
        response, _ = self.score_pipeline.run(schema)
        return response.get(self._SCORE_KEY, 0.0)

    def check_validity(self, optimizer_code: str, stdout: str, stderr: str) -> bool:
        """Run the validity pipeline and report whether the judge said "yes".

        Args:
            optimizer_code: Source code of the optimizer to validate.
            stdout: Captured standard output from running the code.
            stderr: Captured standard error from running the code.

        Returns:
            ``True`` only when the judge's choice equals ``"yes"``; a
            missing choice counts as ``False``.
        """
        schema = Schema.of(
            optimizer_code=optimizer_code,
            stdout=stdout,
            stderr=stderr,
        )
        response, _ = self.validity_pipeline.run(schema)
        # Debug-level logging instead of an unconditional print, so callers'
        # stdout stays clean; enable DEBUG to inspect the raw response.
        logger.debug("validity pipeline response: %s", response)
        choice = response.get(self._VALIDITY_KEY, None)
        return choice == "yes"


def main() -> None:
    """Run one validity check on hard-coded sample data and print the result."""
    evaluator = OptimizerEvaluator()

    # NOTE(review): the three samples below are reproduced
    # character-for-character from the file as received, where they appear
    # whitespace-collapsed. On a single line everything after the `#` in
    # `optimizer_code` is a comment, so restore the line breaks if the
    # original sample was multi-line.
    optimizer_code = (
        " import torch"
        " # Define parameter (requires_grad=True)"
        " x = torch.tensor([0.0], requires_grad=True)"
        " optimizer = torch.optim.SGD([x], lr=0.1)"
        " for step in range(20):"
        " optimizer.zero_grad()"
        " loss = (x - 3) ** 2"
        " loss.backward()"
        " optimizer.step()"
        ' print(f"Step {step + 1}: x = {x.item():.4f}, loss = {loss.item():.4f}")'
        ' print(f"\nOptimal x: {x.item():.4f}")'
        " "
    )
    stdout = (
        " Step 1: x = 0.0000, loss = 9.0000"
        " Step 2: x = 0.0900, loss = 8.1000"
        " Step 3: x = 0.1620, loss = 7.2900"
        " Step 4: x = 0.2187, loss = 6.5610"
        " Step 5: x = 0.2624, loss = 5.9049"
        " Step 6: x = 0.2962, loss = 5.3144"
        " Step 7: x = 0.3225, loss = 4.7830"
        " "
    )
    stderr = " Traceback (most recent call last): "

    is_valid = evaluator.check_validity(
        optimizer_code=optimizer_code,
        stdout=stdout,
        stderr=stderr,
    )
    print(is_valid)


if __name__ == "__main__":
    main()