lcb coding rl environment

JoeLi12345 2025-12-24 20:57:49 +00:00
parent 5e962e682e
commit 2fd888cdb0
2 changed files with 1210 additions and 136 deletions

@@ -1,117 +1,254 @@
import asyncio
from datetime import datetime
import os
import random
from typing import List, Optional, Tuple
import docker
import httpx
from typing import List, Optional, Tuple, Dict
import numpy as np
import time
import modal
import math
from pydantic import Field
from collections import deque
import json
import wandb
import regex as re
from datasets import load_dataset
from atroposlib.envs.base import BaseEnv, ScoredDataGroup
from atroposlib.envs.base import (
APIServerConfig,
BaseEnv,
BaseEnvConfig,
ScoredDataGroup,
EvalHandlingEnum,
)
from atroposlib.type_definitions import GameHistory, Item
from atroposlib.utils.tokenize_for_trainer import tokenize_for_trainer
from rich import print as rprint
system_prompt = (
"You are a deep thinking AI, you may use extremely long chains of thought "
"to deeply consider the problem and deliberate with yourself via systematic "
"reasoning processes to help come to a correct solution prior to answering. "
"You should enclose your thoughts and internal monologue inside <think> </think> "
"tags, and then provide your solution or response to the problem.\n\n"
)
system_prompt = (
    "You are an expert Python programmer. You will be given a question "
    "(problem specification) and will generate a correct Python program that "
    "matches the specification and passes all tests."
)
async def submit_code(client, code, test_input, language="python"):
url = "http://localhost:5002/execute"
payload = {"code": code, "input": test_input, "language": language}
response = await client.post(url, json=payload)
response_json = response.json()
return response_json["output"]
FORMATTING_MESSAGE_WITH_STARTER_CODE = "You will use the following starter code to write the solution to the problem and enclose your code within delimiters."
FORMATTING_WITHOUT_STARTER_CODE = "Read the inputs from stdin, solve the problem, and write the answer to stdout (do not directly test on the sample inputs). Enclose your code within delimiters as follows. Ensure that when the Python program runs, it reads the inputs, runs the algorithm, and writes the output to STDOUT."
async_semaphore = asyncio.Semaphore(100)
async def get_results(code, answer):
async with httpx.AsyncClient() as client:
tasks = []
for i in range(len(answer)):
tasks.append(submit_code(client, code, answer[i]))
def get_prompt(question, problem_type, starter_code=None):
prompt = ""
prompt += f"Question: {question}\n\n"
if problem_type == "func" and starter_code:
prompt += f"{FORMATTING_MESSAGE_WITH_STARTER_CODE}\n"
prompt += f"```python\n{starter_code}\n```\n\n"
elif problem_type == "func" and not starter_code:
pass
else:
prompt += f"{FORMATTING_WITHOUT_STARTER_CODE}\n"
prompt += "```python\n# YOUR CODE HERE\n```\n\n"
prompt += "### Answer: (use the provided format with backticks)\n\n"
return prompt
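# Illustrative output (sketch): for a stdin/stdout problem the assembled prompt is
#   "Question: <question>" + FORMATTING_WITHOUT_STARTER_CODE + a python code-block
#   placeholder, followed by "### Answer: (use the provided format with backticks)".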
results = await asyncio.gather(*tasks)
return [result for result in results]
def init_docker():
client = docker.from_env()
def build_docker_image():
try:
# Build the Docker image
print("Building Docker image...")
current_dir = os.path.dirname(
os.path.abspath(__file__)
) # Get the current directory of the script
image, logs = client.images.build(path=current_dir, tag="code-executor")
# Print the build logs
for log in logs:
print(log.get("stream", "").strip())
print("Docker image built successfully.")
return image
except docker.errors.BuildError as e:
print(f"Error during Docker image build: {e}")
def run_docker_container():
try:
# Run the Docker container
print("Running Docker container...")
container = client.containers.run(
"code-executor", ports={"5002/tcp": 5002}, detach=True
) # Runs in detached mode (in the background)
print(f"Docker container is running with ID: {container.id}")
return container
except docker.errors.ContainerError as e:
print(f"Error during Docker container run: {e}")
build_docker_image()
container = run_docker_container()
return container
run_test = modal.Function.from_name("joeli-lcb", "run_test")
class CodeConfig(BaseEnvConfig):
dataset_name: str = Field("normal", description="dataset name, should be normal or deepmind")
temperature: float = Field(0.6, description="model temperature")
eval_temperature: float = Field(0.6, description="model temperature during evaluation")
top_p: float = Field(0.95, description="top p")
eval_top_p: float = Field(0.95, description="eval top p")
start_idx: int = Field(0, description="start index in training set")
max_eval_token_length: int = Field(40960, description="max sequence length during evaluation")
class CodingEnv(BaseEnv):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.id = 0
self.complete = []
self.total = []
self.complement = []
self.eval_metrics = []
self.train_metrics = {"rewards": [], "overlong_ratio": [], "pass_gs": [], "completion_lengths": [], "correct_completion_lengths": [], "incorrect_completion_lengths": []}
self.cur_time = datetime.now().strftime("%Y-%m-%d %H.%M.%S")
self.temp_metrics = {"indices": [], "num_correct": [], }
self.blacklist = set()
self.deq = deque()
@classmethod
def config_init(cls) -> Tuple[CodeConfig, List[APIServerConfig]]:
env_config = CodeConfig(
tokenizer_name="Qwen/Qwen3-14B",
group_size=8,
use_wandb=True,
rollout_server_url="http://localhost:8000",
total_steps=600,
batch_size=-1,
steps_per_eval=10,
max_batches_offpolicy=2,
max_eval_workers=128, # max workers per node * num gpus
eval_handling=EvalHandlingEnum.STOP_TRAIN,
eval_limit_ratio=0.25,
max_token_length=32768,
min_items_sent_before_logging=0,
wandb_name="qwen_gspo_32k_mod",
dataset_name="normal",
worker_timeout=7200,
temperature=1.0,
eval_temperature=0.6,
top_p=1.0,
eval_top_p=0.95,
start_idx=0,
max_eval_token_length=40960,
)
server_configs = [
APIServerConfig(
model_name="Qwen/Qwen3-14B",
base_url="http://localhost:9001/v1",
api_key="x",
num_requests_for_eval=256,
timeout=2400,
),
]
return env_config, server_configs
# item looks like {problem, tests, problem_type, idx}
async def collect_trajectories(
self, item: Item
) -> Tuple[GameHistory | None, List[Item]]:
chat_completions = await self.server.chat_completion(
messages=[
{
"role": "system",
"content": "You must submit your answer with ```python\n{code}```",
},
dict(item[0][0]),
],
n=self.config.group_size,
max_tokens=1024 * 4,
)
to_score = list()
to_backlog = list()
for i, chat_completion in enumerate(chat_completions.choices):
messages = (
dict(item[0][0]),
{"role": "assistant", "content": chat_completion.message.content},
)
to_score.append(
(
messages,
item[1],
)
)
rprint("COLLECTING TRAJECTORIES")
train_or_valid = item["split"]
system_msg = {"role": "system", "content": system_prompt}
user_msg = {"role": "user", "content": get_prompt(item["problem"], item["problem_type"], item.get("starter_code", None))}
to_postprocess = await self.score(to_score)
return to_postprocess, to_backlog
prompt_tokens = tokenize_for_trainer(self.tokenizer, chat=[system_msg, user_msg])
buffer = 32
async def generate_and_score(index: int) -> Tuple[List[int], List[int], float]:
# Step 1: Generate single completion
max_tokens = self.config.max_token_length - len(prompt_tokens["tokens"]) - buffer
temp = self.config.temperature
top_p = self.config.top_p
if train_or_valid == "test":
max_tokens = self.config.max_eval_token_length - len(prompt_tokens["tokens"]) - buffer
top_p = self.config.eval_top_p
temp = self.config.eval_temperature
chat_completion = await self.server.chat_completion(
messages=[system_msg, user_msg],
n=1, ### CHANGE
max_tokens=max_tokens,
temperature=temp,
top_p=top_p,
)
content = chat_completion.choices[0].message.content
assistant_msg = {"role": "assistant", "content": content}
messages = [system_msg, user_msg, assistant_msg]
if "fn_name" not in item:
fn_name = "none"
else:
fn_name = item["fn_name"]
tests = item["tests"]
if isinstance(tests, str):
tests = json.loads(tests)
tests["fn_name"] = fn_name
score_input = [(messages, tests, item["idx"], chat_completion.choices[0].finish_reason)]
scored_group, tup = await self.score(score_input)
return (
scored_group["tokens"][0],
scored_group["masks"][0],
scored_group["scores"][0],
scored_group["overrides"][0],
tup[0],
tup[1],
tup[2],
assistant_msg,
)
start_time = time.time()
if train_or_valid == "train":
self.total.append(item["idx"]) # LOCK
self.complement = sorted(list(set(self.total) - set(self.complete)))
print("MISSING: ", self.complement)
if train_or_valid == "train":
tasks = [generate_and_score(i) for i in range(self.config.group_size)]
else:
tasks = [generate_and_score(i) for i in range(16)]
results = await asyncio.gather(*tasks)
end_time = time.time()
dt = end_time - start_time
scored_data = ScoredDataGroup()
scored_data["tokens"] = [r[0] for r in results]
scored_data["masks"] = [r[1] for r in results]
scored_data["scores"] = [r[2] for r in results]
scored_data["overrides"] = [r[3] for r in results]
codes = [r[4] for r in results]
errors = [r[5] for r in results]
lengths = [r[6] for r in results]
asst_messages = [r[7] for r in results]
cur_id = item["idx"]
if train_or_valid == "train":
self.complete.append(cur_id) # LOCK
self.complement = sorted(list(set(self.total) - set(self.complete)))
rprint(train_or_valid)
rprint("CURRENT ID: ", cur_id, item["problem_type"])
print("GEN LENGTHS: ", lengths)
print("SCORES ", scored_data["scores"])
print("MISSING ", self.complement)
print("CURRENT TIME", time.time())
#for error, code in zip(errors, codes):
# print("ERROR:", error)
#if 'output' in error and error['output'] == '':
# print(code)
print(f"Elapsed time: {dt:.2f} seconds")
async with self.lock:
heap_sum = 0
for x in scored_data["scores"]:
if math.isclose(1.0, x):
heap_sum += 1
num_override = sum([x["set_advantage_to_zero"] for x in scored_data["overrides"]])
if train_or_valid == "train":
self.train_metrics["rewards"].append(1.0 * heap_sum / self.config.group_size)
self.train_metrics["overlong_ratio"].append(1.0 * num_override / self.config.group_size)
self.train_metrics["pass_gs"].append(1.0 * (heap_sum > 0))
self.train_metrics["completion_lengths"].append(sum([len(x) for x in scored_data["tokens"]]) / len(scored_data["tokens"]))
self.train_metrics["correct_completion_lengths"].append(sum([len(x) for x, y in zip(scored_data["tokens"], scored_data["scores"]) if math.isclose(y, 1.0)]))
self.train_metrics["incorrect_completion_lengths"].append(sum([len(x) for x, y in zip(scored_data["tokens"], scored_data["scores"]) if math.isclose(y, -1.0)]))
self.temp_metrics["indices"].append(cur_id)
self.temp_metrics["num_correct"].append(heap_sum)
script_dir = os.path.dirname(os.path.abspath(__file__))
"""
if heap_sum > self.config.group_size - 2 and train_or_valid == "train":
self.blacklist.add(cur_id)
file_path = os.path.join(script_dir, "blacklist_" + self.cur_time + ".txt")
with open(file_path, "a") as f:
f.write(json.dumps(cur_id) + "\n")
"""
if train_or_valid == "train":
script_dir = os.path.join(script_dir, "train_logs")
else:
script_dir = os.path.join(script_dir, "eval_logs")
file_path = os.path.join(script_dir, "qwen_data_dump_"+self.cur_time + ".txt")
file_path_long = os.path.join(script_dir, "qwen_data_dump_long"+self.cur_time+".txt")
with open(file_path, "a") as f:
mp = {"cur_id": cur_id, "num_correct": heap_sum, "total": self.config.group_size, "scores": scored_data["scores"], "lengths": lengths}
f.write(json.dumps(mp) + "\n")
with open(file_path_long, "a") as f:
mp = {"cur_id": cur_id, "num_correct": heap_sum, "total": self.config.group_size, "scores": scored_data["scores"], "lengths": lengths, "errors": errors, "codes": codes, "gen": asst_messages[0]}
f.write(json.dumps(mp) + "\n")
return scored_data, []
async def evaluate(self, *args, **kwargs):
"""
@@ -125,30 +262,289 @@ class CodingEnv(BaseEnv):
:param kwargs:
:return: None.
"""
rprint("EVALUATION")
test_data = self.test
sema = asyncio.Semaphore(self.config.max_eval_workers)
all_total, all_correct = [], []
easy_total, easy_correct = [], []
medium_total, medium_correct = [], []
hard_total, hard_correct = [], []
temp_completion_lengths = []
correct_completion_lengths = []
incorrect_completion_lengths = []
overlong_ratio = []
pass_gs = []
async def eval(curr_step):
async with sema:
item = test_data[curr_step]
scored_data, _ = await self.collect_trajectories(item)
scores = scored_data["scores"]
num_correct = sum(1 for x in scores if math.isclose(x, 1.0))
num_overlong = sum(x["set_advantage_to_zero"] for x in scored_data["overrides"])
async with self.lock2:
overlong_ratio.append(num_overlong / len(scores))
pass_gs.append(num_correct > 0)
temp_completion_lengths.append(sum([len(x) for x in scored_data["tokens"]]) / len(scored_data["tokens"]))
correct_completion_lengths.extend([len(x) for x, y in zip(scored_data["tokens"], scored_data["scores"]) if math.isclose(y, 1.0)])
incorrect_completion_lengths.extend([len(x) for x, y in zip(scored_data["tokens"], scored_data["scores"]) if math.isclose(y, -1.0)])
all_total.append(len(scores))
all_correct.append(num_correct)
if item["difficulty"] == "easy":
easy_total.append(len(scores))
easy_correct.append(num_correct)
elif item["difficulty"] == "medium":
medium_total.append(len(scores))
medium_correct.append(num_correct)
elif item["difficulty"] == "hard":
hard_total.append(len(scores))
hard_correct.append(num_correct)
tasks = [asyncio.create_task(eval(i)) for i in range(len(test_data))]
await asyncio.gather(*tasks)
def estimator(n: int, c: int, k: int) -> float:
"""Calculates 1 - comb(n - c, k) / comb(n, k)."""
if n - c < k:
return 1.0
return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
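# Worked example: with n=16 samples and c=4 correct, pass@1 reduces to c/n:
# estimator(16, 4, 1) = 1 - (12/13)*(13/14)*(14/15)*(15/16) = 1 - 12/16 = 0.25.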
all_pass_1 = sum([estimator(n, c, 1) for n, c in zip(all_total, all_correct)]) / len(all_total)
easy_pass_1 = sum([estimator(n, c, 1) for n, c in zip(easy_total, easy_correct)]) / (len(easy_total) + 1e-6)
medium_pass_1 = sum([estimator(n, c, 1) for n, c in zip(medium_total, medium_correct)]) / (len(medium_total) + 1e-6)
hard_pass_1 = sum([estimator(n, c, 1) for n, c in zip(hard_total, hard_correct)]) / (len(hard_total) + 1e-6)
self.eval_metrics.append(("eval/overlong_ratio", 1.0 * sum(overlong_ratio) / len(overlong_ratio)))
self.eval_metrics.append(("eval/pass@group_size", 1.0 * sum(pass_gs) / len(pass_gs)))
avg_comp_len = sum(temp_completion_lengths) / len(temp_completion_lengths)
self.eval_metrics.append(("eval/pass_1", all_pass_1))
self.eval_metrics.append(("eval/easy_pass_1", easy_pass_1))
self.eval_metrics.append(("eval/medium_pass_1", medium_pass_1))
self.eval_metrics.append(("eval/hard_pass_1", hard_pass_1))
self.eval_metrics.append(("eval/completion_length", avg_comp_len))
self.eval_metrics.append(("eval/correct_completion_length", sum(correct_completion_lengths) / len(correct_completion_lengths)))
self.eval_metrics.append(("eval/incorrect_completion_length", sum(incorrect_completion_lengths) / len(incorrect_completion_lengths)))
print("STATS", self.eval_metrics)
return
async def offline_filter(self):
rprint("OFFLINE FILTERING")
train_data = self.train
sema = asyncio.Semaphore(256)
async def filter_temp(curr_step):
async with sema:
item = train_data[curr_step]
item["split"] = "train"
item["idx"] = curr_step
rprint("OFFLINE FILTERING INDEX", curr_step)
scored_data, _ = await self.collect_trajectories(item)
scores = scored_data["scores"]
num_correct = sum(1 for x in scores if math.isclose(x, 1.0))
async with self.lock2:
if num_correct == self.config.group_size:
script_dir = os.path.dirname(os.path.abspath(__file__))
file_path = os.path.join(script_dir, "perfect_indices.txt")
with open(file_path, "a") as f:
f.write(json.dumps(curr_step) + "\n")
tasks = [asyncio.create_task(filter_temp(i)) for i in range(len(train_data))]
await asyncio.gather(*tasks)
print("DONE")
async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
if wandb_metrics is None:
wandb_metrics = {}
for item in self.eval_metrics:
wandb_metrics[item[0]] = item[1]
async with self.lock:
if self.wandb_step == 1 and self.curr_step > 5:
self.wandb_step = self.curr_step
cnt = sum((i != 0 and i != self.config.group_size) for i in self.temp_metrics["num_correct"])
print("TEMP METRICS INDICES: ", self.temp_metrics["indices"], "\n", self.temp_metrics["num_correct"], cnt, len(self.temp_metrics["num_correct"]))
for k, v in wandb_metrics.items():
print(k, v)
old_idx = 0
idx = 0
good_updates = 0
old_completion_idx = 0
new_completion_idx = 0
#assert len(self.temp_metrics["num_correct"]) == len(self.completion_lengths), f"TEMP METRICS {len(self.temp_metrics['num_correct'])} and COMPLETION LENGTHS {len(self.completion_lengths)} MUST BE SAME LENGTH"
print("TEMP METRICS LENGTHS", len(self.temp_metrics["num_correct"]), "COMPLETION LENGTHS", len(self.completion_lengths))
print("BATCH SZ", self.config.batch_size)
while idx < len(self.temp_metrics["num_correct"]):
if self.temp_metrics["num_correct"][idx] != 0 and self.temp_metrics["num_correct"][idx] != self.config.group_size:
good_updates += 1
idx += 1
if good_updates == self.config.batch_size // self.config.group_size:
wandb_metrics["train_rewards/rewards"] = sum(self.train_metrics["rewards"][old_idx:idx]) / len(self.train_metrics["rewards"][old_idx:idx])
wandb_metrics["train_rewards/overlong_ratio"] = sum(self.train_metrics["overlong_ratio"][old_idx:idx]) / len(self.train_metrics["overlong_ratio"][old_idx:idx])
wandb_metrics["train_rewards/pass@group_size"] = sum(self.train_metrics["pass_gs"][old_idx:idx]) / len(self.train_metrics["pass_gs"][old_idx:idx])
wandb_metrics["train_rewards/completion_lengths"] = sum(self.train_metrics["completion_lengths"][old_idx:idx]) / len(self.train_metrics["completion_lengths"][old_idx:idx])
wandb_metrics["train_rewards/correct_completion_lengths"] = sum(self.train_metrics["correct_completion_lengths"][old_idx:idx]) / sum(self.temp_metrics["num_correct"][old_idx:idx])
wandb_metrics["train_rewards/incorrect_completion_lengths"] = sum(self.train_metrics["incorrect_completion_lengths"][old_idx:idx]) / (self.config.group_size * (idx - old_idx) - sum(self.temp_metrics["num_correct"][old_idx:idx]))
new_completion_idx += self.config.batch_size
assert old_completion_idx <= len(self.completion_lengths), f"OLD COMPLETION IDX {old_completion_idx} and COMPLETION LENGTHS {len(self.completion_lengths)} MUST BE SMALLER"
wandb_metrics["train/completion_lengths"] = sum(
self.completion_lengths[old_completion_idx:new_completion_idx]
) / len(self.completion_lengths[old_completion_idx:new_completion_idx])
wandb_metrics["train/completion_lengths_std"] = np.std(
self.completion_lengths[old_completion_idx:new_completion_idx]
)
wandb_metrics["train/completion_lengths_max"] = np.max(
self.completion_lengths[old_completion_idx:new_completion_idx]
)
wandb_metrics["train/completion_lengths_min"] = np.min(
self.completion_lengths[old_completion_idx:new_completion_idx]
)
wandb_metrics["train/completion_lengths_p95"] = (
np.array(self.completion_lengths[old_completion_idx:new_completion_idx]) > (0.95 * self.max_token_len)
).mean()
if self.wandb_prepend is not None:
wandb_metrics = {
f"{self.wandb_prepend}_{k}": v for k, v in wandb_metrics.items()
}
print("WANDB LOG")
wandb.log(wandb_metrics, step=self.wandb_step, commit=True)
wandb_metrics = {}
good_updates = 0
self.wandb_step += 1
old_idx = idx
old_completion_idx = new_completion_idx
self.completion_lengths = self.completion_lengths[old_completion_idx:]
self.temp_metrics["indices"] = self.temp_metrics["indices"][old_idx:]
self.temp_metrics["num_correct"] = self.temp_metrics["num_correct"][old_idx:]
self.train_metrics["rewards"] = self.train_metrics["rewards"][old_idx:]
self.train_metrics["overlong_ratio"] = self.train_metrics["overlong_ratio"][old_idx:]
self.train_metrics["pass_gs"] = self.train_metrics["pass_gs"][old_idx:]
self.train_metrics["completion_lengths"] = self.train_metrics["completion_lengths"][old_idx:]
self.train_metrics["correct_completion_lengths"] = self.train_metrics["correct_completion_lengths"][old_idx:]
self.train_metrics["incorrect_completion_lengths"] = self.train_metrics["incorrect_completion_lengths"][old_idx:]
print("WANDB STEPS vs STATUS STEP", self.wandb_step, self.curr_step)
self.eval_metrics = list()
for i, server in enumerate(self.server.servers):
server_wandb_metrics = await server.wandb_metrics({}, f"server_{i}")
wandb_metrics = await self.create_rollout_table(wandb_metrics)
wandb_metrics = self.perf_stats(wandb_metrics)
self.rollouts_for_wandb = []
if self.config.use_wandb:
if self.wandb_prepend is not None:
wandb_metrics = {
f"{self.wandb_prepend}_{k}": v for k, v in wandb_metrics.items()
}
# add server metrics to wandb without prepend to collate them all
wandb_metrics.update(server_wandb_metrics)
wandb.log(wandb_metrics, step=self.curr_step, commit=True)
async def setup(self):
"""Setup the environment"""
self.container = init_docker()
self.train = load_dataset("deepmind/code_contests", split="train")
self.iter = 0
if self.config.dataset_name == "deepmind":
self.train = load_dataset("deepmind/code_contests", split="train") ### CHANGE
else:
self.train = load_dataset("NousResearch/RLVR_Coding_Problems", split="train")
test = load_dataset("NousResearch/lcb_test",split="test")
self.test = []
for problem in test:
self.test.append(problem)
self.test[-1]["idx"] = len(self.test) - 1
self.test[-1]["split"] = "test"
self.iter = 0 ### CHANGE
self.lock = asyncio.Lock()
self.lock2 = asyncio.Lock()
self.wandb_step = 1
script_dir = os.path.dirname(os.path.abspath(__file__))
file_path = os.path.join(script_dir, "perfect_indices.txt")
with open(file_path, "r") as f:
for line in f:
a = int(line.strip())
self.blacklist.add(a)
self.good_indices = []
if self.config.dataset_name != "deepmind":
arr_short = []
for i in range(len(self.train)):
if i not in self.blacklist:
arr_short.append(i)
print("ARR SHORT LENGTH", len(arr_short))
temp_arr = arr_short * 100
self.deq = deque(temp_arr[self.config.start_idx:])
else:
self.good_indices = [i for i in range(10000)]
for i in range(10000):
self.deq.append(i)
rprint("NUM FILTERED EXAMPLES:", len(self.deq))
rprint(self.config.batch_size)
"""
rprint("BEFORE SETUP, do blacklisting")
await self.offline_filter()
rprint("FINISH blacklisting")
"""
"""
st = time.time()
await self.evaluate() ### CHANGE
ed = time.time()
rprint("ELAPSED TIME FOR EVALUATION: ", ed-st)
"""
async def get_next_item(self) -> Item:
"""
Get the next items to be rolled out
"""
next_item = self.train[self.iter % len(self.train)]
self.iter += 1
prompt = tuple(
[frozenset({"role": "user", "content": next_item["description"]}.items())]
)
answer = (
tuple(next_item["private_tests"]["input"]),
tuple(next_item["private_tests"]["output"]),
tuple(next_item["generated_tests"]["input"]),
tuple(next_item["generated_tests"]["output"]),
)
return (prompt, answer)
async with self.lock:
cur_idx = self.deq.popleft()
while cur_idx in self.blacklist:
print("IN BLACKLIST, SKIPPING", cur_idx)
cur_idx = self.deq.popleft()
next_item = self.train[cur_idx]
next_item["idx"] = cur_idx
next_item["split"] = "train"
if self.config.dataset_name == "deepmind":
next_item["problem"] = next_item["description"]
next_item["tests"] = {"input": next_item["private_tests"]["input"] + next_item["generated_tests"]["input"], "output": next_item["private_tests"]["output"] + next_item["generated_tests"]["output"]}
next_item["problem_type"] = "stdin_stdout"
return next_item
def extract_python_code_blocks(self, text):
# Regex specifically looks for ```python\n...code...\n```
@@ -157,44 +553,89 @@ class CodingEnv(BaseEnv):
python_blocks = [r for r in result]
return python_blocks
async def score(self, rollout_group_data) -> Optional[ScoredDataGroup]:
# print("Rollout group data", rollout_group_data)
async def score(self, rollout_group_data):
assert len(rollout_group_data) == 1
cur_id = rollout_group_data[0][2]
scores = ScoredDataGroup()
scores["tokens"] = list()
scores["masks"] = list()
scores["scores"] = list()
random.shuffle(rollout_group_data)
scores["overrides"] = list()
results = []
codes = []
lengths = []
errors = []
for item in rollout_group_data:
out_dict = tokenize_for_trainer(self.tokenizer, item[0])
scores["overrides"].append(dict())
scores["overrides"][-1]["set_advantage_to_zero"] = False
if item[3] == "length":
scores["overrides"][-1]["set_advantage_to_zero"] = True
else:
if item[3] != "stop":
rprint("FINISH REASON", item[3])
#assert item[3] == "stop"
out_dict = tokenize_for_trainer(self.tokenizer, item[0], item[3])
tokens = out_dict["tokens"]
masks = out_dict["masks"]
"""
CALCULATE REWARD NOW
"""
code = self.extract_python_code_blocks(item[0][-1]["content"])[0]
test_cases = list(item[1][0]) + list(item[1][2])
x = await get_results(code, test_cases)
output_cases = list(item[1][1]) + list(item[1][3])
assert len(x) == len(output_cases)
reward = True
for k in range(len(x)):
if x[k] != output_cases[k]:
reward = False
break
# remove obviously bad examples
if len([1 for i in masks if i != -100]) < 10:
continue
scores["tokens"].append(tokens)
scores["masks"].append(masks)
scores["scores"].append(1.0 if reward else -1.0)
if len(scores["tokens"]) >= self.config.group_size:
break
# check if all the same
# print(scores['scores'])
# if all([scores["scores"][0] == score for score in scores["scores"]]):
# return None # If all the same, we return None
return scores
lengths.append(len(tokens))
code = self.extract_python_code_blocks(item[0][-1]["content"])
if len(code) > 0:
code = code[-1]
else:
code = None
test_cases = {"tests": item[1]}
codes.append(code)
if code is None:
scores["scores"].append(-1.0)
errors.append({"error_message": "No code"})
continue
else:
try:
async with async_semaphore:
res, metadata = await run_test.remote.aio(test_cases, code)
except modal.exception.RemoteError as e:
res = [False]
rprint("index", cur_id)
rprint(test_cases["tests"]["fn_name"])
metadata = {"error": "segmentation fault"}
rprint("FAULT")
rprint("code:\n", code)
rprint(metadata)
except Exception as e:
rprint("index", cur_id)
rprint(test_cases["tests"]["fn_name"])
rprint("except:", code)
res = [False]
metadata = {"error": repr(e)}
rprint("NON SEGFAULT")
rprint("FAULT", e)
for x in res:
if not isinstance(x, (int, bool)):
rprint("WARNING")
rprint(res)
if set(res) == {True}:
scores["scores"].append(1.0)
else:
scores["scores"].append(-1.0)
rprint("index", cur_id)
rprint(metadata)
errors.append(metadata)
return scores, (codes[0], errors[0], lengths[0])
if __name__ == "__main__":
CodingEnv.cli()
print(system_prompt)
CodingEnv.cli()

@@ -0,0 +1,633 @@
import ast
import json
import sys
import faulthandler
import platform
import modal
# used for debugging to time steps
from datetime import datetime
# to run the solution files we're using a timing based approach
import signal
import numpy as np
from io import StringIO
# used for testing the code that reads from input
from unittest.mock import patch, mock_open
# from pyext import RuntimeModule
from types import ModuleType
from enum import Enum
from decimal import Decimal
import time
import_string = "from string import *\nfrom re import *\nfrom datetime import *\nfrom collections import *\nfrom heapq import *\nfrom bisect import *\nfrom copy import *\nfrom math import *\nfrom random import *\nfrom statistics import *\nfrom itertools import *\nfrom functools import *\nfrom operator import *\nfrom io import *\nfrom sys import *\nfrom json import *\nfrom builtins import *\nfrom typing import *\nimport string\nimport re\nimport datetime\nimport collections\nimport heapq\nimport bisect\nimport copy\nimport math\nimport random\nimport statistics\nimport itertools\nimport functools\nimport operator\nimport io\nimport sys\nimport json\nsys.setrecursionlimit(50000)\n"
image = modal.Image.debian_slim(python_version="3.10").pip_install("numpy")
app = modal.App("joeli-lcb", image=image)
def truncatefn(s, length=300):
if not isinstance(s, str):
    s = str(s)
if len(s) <= length:
return s
return s[: length // 2] + "...(truncated) ..." + s[-length // 2 :]
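# Illustrative behavior: truncatefn("a" * 1000) keeps the first and last 150
# characters with "...(truncated) ..." spliced in between, so error payloads
# stay small when they are written to the data-dump logs.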
class CODE_TYPE(Enum):
call_based = 0
standard_input = 1
# stuff for setting up signal timer
class TimeoutException(Exception):
pass
def timeout_handler(signum, frame):
print("timeout occurred: alarm went off")
raise TimeoutException
# used to capture stdout as a list
# from https://stackoverflow.com/a/16571630/6416660
# alternative use redirect_stdout() from contextlib
class Capturing(list):
def __enter__(self):
self._stdout = sys.stdout
sys.stdout = self._stringio = StringIO()
# Make closing the StringIO a no-op
self._stringio.close = lambda x: 1
return self
def __exit__(self, *args):
self.append(self._stringio.getvalue())
del self._stringio # free up some memory
sys.stdout = self._stdout
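# Illustrative use: `with Capturing() as out: print("hi")` leaves out == ["hi\n"];
# grade_stdio relies on this to collect whatever the wrapped solution writes to stdout.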
# Custom mock for sys.stdin that supports buffer attribute
class MockStdinWithBuffer:
def __init__(self, inputs: str):
self.inputs = inputs
self._stringio = StringIO(inputs)
self.buffer = MockBuffer(inputs)
def read(self, *args):
return self.inputs
def __iter__(self): # <- required for "for line in sys.stdin"
return iter(self._stringio)
def readline(self, *args):
return self._stringio.readline(*args)
def readlines(self, *args):
return self.inputs.split("\n")
def __getattr__(self, name):
# Delegate other attributes to StringIO
return getattr(self._stringio, name)
class MockBuffer:
def __init__(self, inputs: str):
self.inputs = inputs.encode("utf-8") # Convert to bytes
def read(self, *args):
# Return as byte strings that can be split
return self.inputs
def readline(self, *args):
return self.inputs.split(b"\n")[0] + b"\n"
def clean_if_name(code: str) -> str:
try:
astree = ast.parse(code)
last_block = astree.body[-1]
if isinstance(last_block, ast.If):
condition = last_block.test
if ast.unparse(condition).strip() in ("__name__ == '__main__'", '__name__ == "__main__"'):
code = (
ast.unparse(astree.body[:-1]) + "\n" + ast.unparse(last_block.body) # type: ignore
)
except:
pass
return code
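# Illustrative transformation: a trailing `if __name__ == "__main__": main()` guard is
# replaced by its body, so the entry-point call still runs after make_function wraps
# the script (where __name__ would no longer be "__main__").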
def make_function(code: str) -> str:
try:
import_stmts = []
all_other_stmts = []
astree = ast.parse(code)
for stmt in astree.body:
if isinstance(stmt, (ast.Import, ast.ImportFrom)):
import_stmts.append(stmt)
else:
all_other_stmts.append(stmt)
function_ast = ast.FunctionDef(
name="wrapped_function",
args=ast.arguments(
posonlyargs=[], args=[], kwonlyargs=[], kw_defaults=[], defaults=[]
),
body=all_other_stmts,
decorator_list=[],
lineno=-1,
)
main_code = (
import_string
+ "\n"
+ ast.unparse(import_stmts) # type: ignore
+ "\n"
+ ast.unparse(function_ast) # type: ignore
)
return main_code
except Exception as e:
return code
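# Illustrative transformation (sketch): a top-level script such as
#   n = int(input())
#   print(n * 2)
# becomes import_string + the script's own imports + a `def wrapped_function():`
# containing the remaining statements, so grade_stdio can call the solution once
# per test case.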
def call_method(method, inputs):
if isinstance(inputs, list):
inputs = "\n".join(inputs)
inputs_line_iterator = iter(inputs.split("\n"))
# Create custom stdin mock with buffer support
mock_stdin = MockStdinWithBuffer(inputs)
# CHANGE
# sys.setrecursionlimit(10000)
# @patch('builtins.input', side_effect=inputs.split("\n"))
@patch("builtins.open", mock_open(read_data=inputs))
@patch("sys.stdin", mock_stdin) # Use our custom mock instead of StringIO
@patch("sys.stdin.readline", lambda *args: next(inputs_line_iterator))
@patch("sys.stdin.readlines", lambda *args: inputs.split("\n"))
@patch("sys.stdin.read", lambda *args: inputs)
# @patch('sys.stdout.write', print)
def _inner_call_method(_method):
if 'stdin' in _method.__globals__:
_method.__globals__['stdin'] = mock_stdin
if 'stdout' in _method.__globals__:
_method.__globals__['stdout'] = sys.stdout
try:
return _method()
except SystemExit as e:
pass
finally:
pass
return _inner_call_method(method)
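# Note: the test input is exposed through several routes at once (builtins.open,
# a patched sys.stdin object with a .buffer, and patched readline/readlines/read),
# so solutions reading via input(), sys.stdin.read(), or sys.stdin.buffer should
# all see the same data.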
def get_function(compiled_sol, fn_name: str): # type: ignore
try:
assert hasattr(compiled_sol, fn_name)
return getattr(compiled_sol, fn_name)
except Exception as e:
return
def compile_code(code: str, timeout: int):
signal.alarm(timeout)
try:
tmp_sol = ModuleType("tmp_sol", "")
exec(code, tmp_sol.__dict__)
if "class Solution" in code:
# leetcode wraps solutions in `Solution`
# this is a hack to check if it is leetcode solution or not
# currently livecodebench only supports LeetCode but
# else condition allows future extensibility to other platforms
compiled_sol = tmp_sol.Solution()
else:
# do nothing in the other case since the function is accessible
compiled_sol = tmp_sol
assert compiled_sol is not None
finally:
signal.alarm(0)
return compiled_sol
def convert_line_to_decimals(line: str) -> tuple[bool, list[Decimal]]:
try:
decimal_line = [Decimal(elem) for elem in line.split()]
except:
return False, []
return True, decimal_line
def get_stripped_lines(val: str):
## strip first so trailing blank lines don't produce empty entries after splitting
val = val.strip()
return [val_line.strip() for val_line in val.split("\n")]
def grade_call_based(
code: str, all_inputs: list, all_outputs: list, fn_name: str, timeout: int
):
# call-based clean up logic
# need to wrap in try-catch logic after to catch the correct errors, but for now this is fine.
code = import_string + "\n\n" + code
compiled_sol = compile_code(code, timeout)
print("code:", code)
if compiled_sol is None:
return
method = get_function(compiled_sol, fn_name)
if method is None:
return
all_inputs = [[json.loads(line) for line in inputs.split("\n")] if isinstance(inputs, str) else inputs for inputs in all_inputs]
all_outputs = [json.loads(output) if isinstance(output, str) else output[0] for output in all_outputs]
#all_inputs = [[json.loads(line) for line in inputs.split("\n")] for inputs in all_inputs]
#all_outputs = [json.loads(output) for output in all_outputs]
total_execution = 0
all_results = []
for idx, (gt_inp, gt_out) in enumerate(zip(all_inputs, all_outputs)):
signal.alarm(timeout)
faulthandler.enable()
print("INPUT", gt_inp)
print("Expected Out", gt_out)
try:
# can lock here so time is useful
start = time.time()
prediction = method(*gt_inp)
total_execution += time.time() - start
signal.alarm(0)
# don't penalize model if it produces tuples instead of lists
# ground truth sequences are not tuples
if isinstance(prediction, tuple):
prediction = list(prediction)
print("predicted out", prediction)
tmp_result = prediction == gt_out
# handle floating point comparisons
all_results.append(tmp_result)
if not tmp_result:
return all_results, {
"output": truncatefn(prediction),
"inputs": truncatefn(gt_inp),
"expected": truncatefn(gt_out),
"error_code": -2,
"error_message": "Wrong Answer",
}
except Exception as e:
signal.alarm(0)
if "timeoutexception" in repr(e).lower():
all_results.append(-3)
return all_results, {
"error": repr(e),
"error_code": -3,
"error_message": "Time Limit Exceeded",
"inputs": truncatefn(gt_inp),
"expected": truncatefn(gt_out),
}
else:
all_results.append(-4)
return all_results, {
"error": repr(e),
"error_code": -4,
"error_message": "Runtime Error",
"inputs": truncatefn(gt_inp),
"expected": truncatefn(gt_out),
}
finally:
signal.alarm(0)
faulthandler.disable()
return all_results, {"execution time": total_execution}
def decimals_are_close(a: Decimal, b: Decimal, tol: Decimal = Decimal("1e-4")) -> bool:
return abs(a - b) <= tol
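# Illustrative check: decimals_are_close(Decimal("0.30005"), Decimal("0.3")) is True
# (|diff| = 5e-5 <= 1e-4) while decimals_are_close(Decimal("0.301"), Decimal("0.3"))
# is False; this is the per-token fallback used in grade_stdio below.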
def grade_stdio(
code: str,
all_inputs: list,
all_outputs: list,
timeout: int,
):
print("ORIGINAL CODE\n", code)
## runtime doesn't interact well with __name__ == '__main__'
code = clean_if_name(code)
## we wrap the given code inside another function
code = make_function(code)
compiled_sol = compile_code(code, timeout)
if compiled_sol is None:
return
method = get_function(compiled_sol, "wrapped_function")
if method is None:
return
all_results = []
total_execution_time = 0
print("LCB CODE\n", code)
for idx, (gt_inp, gt_out) in enumerate(zip(all_inputs, all_outputs)):
print("INPUT\n", gt_inp)
print("EXPECTED OUTPUT\n", gt_out)
signal.alarm(timeout)
faulthandler.enable()
signal.alarm(timeout)
with Capturing() as captured_output:
try:
start = time.time()
call_method(method, gt_inp)
total_execution_time += time.time() - start
# reset the alarm
signal.alarm(0)
except Exception as e:
signal.alarm(0)
if "timeoutexception" in repr(e).lower():
all_results.append(-3)
return all_results, {
"error": repr(e),
"error_code": -3,
"error_message": "Time Limit Exceeded",
"inputs": truncatefn(gt_inp),
"expected": truncatefn(gt_out),
}
else:
all_results.append(-4)
return all_results, {
"error": repr(e),
"error_code": -4,
"error_message": "Runtime Error",
"inputs": truncatefn(gt_inp),
"expected": truncatefn(gt_out),
}
finally:
signal.alarm(0)
faulthandler.disable()
prediction = captured_output[0]
print("OUTPUT\n", prediction)
stripped_prediction_lines = get_stripped_lines(prediction)
if isinstance(gt_out, str):
stripped_gt_out_lines = get_stripped_lines(gt_out)
else:
stripped_gt_out_lines = gt_out
## WA happens in multiple circumstances
## so cache the return to make it clean!
WA_send_args = {
"output": truncatefn(prediction),
"inputs": truncatefn(gt_inp),
"expected": truncatefn(gt_out),
"error_code": -2,
}
if len(stripped_prediction_lines) != len(stripped_gt_out_lines):
all_results.append(-2)
WA_send_args["error_message"] = "Wrong answer: mismatched output length"
return all_results, WA_send_args
for output_line_idx, (
stripped_prediction_line,
stripped_gt_out_line,
) in enumerate(zip(stripped_prediction_lines, stripped_gt_out_lines)):
WA_send_args["error_message"] = (
f"Wrong answer at {output_line_idx=}: {truncatefn(stripped_prediction_line)} != {truncatefn(stripped_gt_out_line)}"
)
## CASE 1: exact match
if stripped_prediction_line == stripped_gt_out_line:
continue
## CASE 2: element-wise comparison
## if there are floating point elements,
## use the `decimal` library for good floating point comparison
## otherwise gotcha: np.isclose(50000000000000000, 50000000000000001) = True
## note that we should always be able to convert to decimals
success, decimal_prediction_line = convert_line_to_decimals(
stripped_prediction_line
)
if not success:
all_results.append(-2)
return all_results, WA_send_args
success, decimal_gtout_line = convert_line_to_decimals(stripped_gt_out_line)
if not success:
all_results.append(-2)
return all_results, WA_send_args
if decimal_prediction_line == decimal_gtout_line:
continue
if len(decimal_prediction_line) == len(decimal_gtout_line) and all(decimals_are_close(a, b) for a, b in zip(decimal_prediction_line, decimal_gtout_line)):
continue
all_results.append(-2)
return all_results, WA_send_args
all_results.append(True)
return all_results, {"execution time": total_execution_time}
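# Result convention (as used by run_test and the env's score()): each entry in
# all_results is True for a passing test, -2 for a wrong answer, -3 for a time
# limit exceeded, and -4 for a runtime error; score() only awards 1.0 when every
# entry is True.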
@app.function(max_containers=100, cpu=1.0, restrict_modal_access=False, max_inputs=1, timeout=600, block_network=False, retries=3)
def run_test(sample, test=None, debug=False, timeout=15):
"""
If test (the generated code) is not None, it will try to run the code;
otherwise it will just return an input/output pair.
"""
print("VALID, went into function")
#test = json.loads(test)
signal.signal(signal.SIGALRM, timeout_handler)
# Disable functionalities that can make destructive changes to the test.
# max memory is set to 5 GiB
reliability_guard(5 * 1024 * 1024 * 1024)
if debug:
print(f"start = {datetime.now().time()}")
try:
#in_outs = json.loads(sample["input_output"])
in_outs = sample["tests"]
except ValueError as e:
raise e
in_outs = None
if in_outs:
if in_outs.get("fn_name") == "none":
which_type = CODE_TYPE.standard_input # Standard input
method_name = None
else:
which_type = CODE_TYPE.call_based # Call-based
method_name = in_outs["fn_name"]
print("type, method name: ", which_type, method_name)
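# Illustrative payloads (hypothetical values): a call-based sample carries tests like
#   {"fn_name": "twoSum", "input": ["[2,7,11,15]\n9"], "output": ["[0,1]"]}
# whereas a stdin/stdout sample uses {"fn_name": "none", "input": ["3 4\n"], "output": ["7\n"]}.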
if debug:
print(f"loaded input_output = {datetime.now().time()}")
if test is None:
assert False, "should not happen: test code is none"
return in_outs, {"error": "no test code provided"}
elif test is not None:
results = []
sol = import_string
if debug:
print(f"loading test code = {datetime.now().time()}")
if which_type == CODE_TYPE.call_based:
signal.alarm(timeout)
try:
results, metadata = grade_call_based(
code=test,
all_inputs=in_outs["input"],
all_outputs=in_outs["output"],
fn_name=method_name,
timeout=timeout,
)
return results, metadata
except Exception as e:
return [-4], {
"error_code": -4,
"error_message": f"Error during testing: {e}",
}
finally:
signal.alarm(0)
elif which_type == CODE_TYPE.standard_input:
# sol
# if code has if __name__ == "__main__": then remove it
signal.alarm(timeout)
try:
results, metadata = grade_stdio(
code=test,
all_inputs=in_outs["input"],
all_outputs=in_outs["output"],
timeout=timeout,
)
return results, metadata
except Exception as e:
return [-4], {
"error_code": -4,
"error_message": f"Error during testing: {e}",
}
finally:
signal.alarm(0)
def reliability_guard(maximum_memory_bytes=None):
"""
This disables various destructive functions and prevents the generated code
from interfering with the test (e.g. fork bomb, killing other processes,
removing filesystem files, etc.)
WARNING
This function is NOT a security sandbox. Untrusted code, including model-
generated code, should not be blindly executed outside of one. See the
Codex paper for more information about OpenAI's code sandbox, and proceed
with caution.
"""
if maximum_memory_bytes is not None:
import resource
resource.setrlimit(
resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)
)
resource.setrlimit(
resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)
)
if not platform.uname().system == "Darwin":
resource.setrlimit(
resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)
)
faulthandler.disable()
import builtins
# builtins.exit = None
builtins.quit = None
import os
os.environ["OMP_NUM_THREADS"] = "1"
os.kill = None
os.system = None
os.putenv = None
os.remove = None
os.removedirs = None
os.rmdir = None
os.fchdir = None
os.setuid = None
os.fork = None
os.forkpty = None
os.killpg = None
os.rename = None
os.renames = None
os.truncate = None
os.replace = None
os.unlink = None
os.fchmod = None
os.fchown = None
os.chmod = None
os.chown = None
os.chroot = None
os.fchdir = None
os.lchflags = None
os.lchmod = None
os.lchown = None
os.getcwd = None
os.chdir = None
import shutil
shutil.rmtree = None
shutil.move = None
shutil.chown = None
import subprocess
subprocess.Popen = None # type: ignore
__builtins__["help"] = None
import sys
sys.modules["ipdb"] = None
sys.modules["joblib"] = None
sys.modules["resource"] = None
sys.modules["psutil"] = None
sys.modules["tkinter"] = None