from dotenv import load_dotenv from verdict import Layer, Pipeline from verdict.common.judge import CategoricalJudgeUnit, JudgeUnit from verdict.scale import ContinuousScale, DiscreteScale from verdict.schema import Schema from verdict.transform import MaxPoolUnit load_dotenv() class OptimizerEvaluator: def __init__(self): self.score_pipeline = ( Pipeline() >> Layer( JudgeUnit(scale=ContinuousScale(1, 10)).prompt( ( "You are a judge that is an expert at evaluating optimizers for their novelty " "as they will be accepted to a prestigious research conference. Given the following " "optimizer code and its architecture/use-case, you must rate it on a scale of 1 to 10 " "based on how novel it is and its impactfulness in speeding up model training. " "Here is the code: {source.optimizer_code}\n" "Here is the architecture: {source.architecture}" ) ), repeat=3, ).via("xai/grok-3-latest") >> MaxPoolUnit() ) self.validity_pipeline = Pipeline() >> Layer( CategoricalJudgeUnit( name="Judge", categories=DiscreteScale(["yes", "no"]), explanation=False, ) .prompt(""" You are an expert code validator specializing in PyTorch optimizers. Your task is to determine if the provided optimizer code is completely valid and error-free. A valid optimizer MUST satisfy ALL of these criteria: 1. Has zero syntax or runtime errors: - No undefined variables - No type mismatches - No memory issues - No CUDA/CPU compatibility problems 2. Can be imported and instantiated without blocking errors 3. Can run a complete optimization step without exceptions Optimizer Code: {source.optimizer_code} Stdout: {source.stdout} Stderr: {source.stderr} Respond with: - "yes" if ALL criteria are met and the code is completely error-free - "no" if ANY criterion fails or there are ANY potential issues Be extremely strict in your evaluation. """) .via("xai/grok-3-latest", retries=2) ) def score(self, optimizer_code: str, architecture: str) -> int: schema = Schema.of( optimizer_code=optimizer_code, architecture=architecture, ) response, _ = self.score_pipeline.run(schema) return response.get("Pipeline_root.block.block.unit[Map MaxPool]_score", 0.0) def check_validity(self, optimizer_code: str, stdout: str, stderr: str) -> bool: schema = Schema.of( optimizer_code=optimizer_code, stdout=stdout, stderr=stderr, ) response, _ = self.validity_pipeline.run(schema) choice = response.get( "Pipeline_root.block.layer[0].unit[CategoricalJudge Judge]_choice", None ) return choice == "yes" if __name__ == "__main__": evaluator = OptimizerEvaluator() optimizer_code = """ import torch # Define parameter (requires_grad=True) x = torch.tensor([0.0], requires_grad=True) optimizer = torch.optim.SGD([x], lr=0.1) for step in range(20): optimizer.zero_grad() loss = (x - 3) ** 2 loss.backward() optimizer.step() print(f"Step {step + 1}: x = {x.item():.4f}, loss = {loss.item():.4f}") print(f"\nOptimal x: {x.item():.4f}") """ stdout = """ Step 1: x = 0.0000, loss = 9.0000 Step 2: x = 0.0900, loss = 8.1000 Step 3: x = 0.1620, loss = 7.2900 Step 4: x = 0.2187, loss = 6.5610 Step 5: x = 0.2624, loss = 5.9049 Step 6: x = 0.2962, loss = 5.3144 Step 7: x = 0.3225, loss = 4.7830 """ stderr = """ Traceback (most recent call last): """ score = evaluator.check_validity( optimizer_code=optimizer_code, stdout=stdout, stderr=stderr ) print(score)