mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-24 17:04:55 +00:00
feat: add code_debug community environment
This commit is contained in:
parent
c421582b6f
commit
590e8a1ef2
4 changed files with 930 additions and 0 deletions
213
environments/community/code_debug_env/code_executor.py
Normal file
213
environments/community/code_debug_env/code_executor.py
Normal file
|
|
@ -0,0 +1,213 @@
|
|||
"""
|
||||
Safe code execution utilities for the Code Debug environment.
|
||||
|
||||
Runs generated code in isolated subprocess with timeout and resource limits.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from typing import Optional, Tuple
|
||||
|
||||
|
||||
def execute_code_with_tests(
|
||||
code: str,
|
||||
test_code: str,
|
||||
entry_point: str,
|
||||
timeout: int = 10,
|
||||
) -> Tuple[bool, str]:
|
||||
"""
|
||||
Execute code with test cases in an isolated subprocess.
|
||||
|
||||
Args:
|
||||
code: The function implementation to test.
|
||||
test_code: Test code containing a `check(candidate)` function.
|
||||
entry_point: The function name to pass to `check()`.
|
||||
timeout: Maximum execution time in seconds.
|
||||
|
||||
Returns:
|
||||
Tuple of (all_tests_passed, error_message).
|
||||
"""
|
||||
full_code = code + "\n\n" + test_code + f"\n\ncheck({entry_point})\n"
|
||||
|
||||
tmp_path = None
|
||||
try:
|
||||
with tempfile.NamedTemporaryFile(
|
||||
mode="w", suffix=".py", delete=False, prefix="code_debug_"
|
||||
) as f:
|
||||
f.write(full_code)
|
||||
tmp_path = f.name
|
||||
|
||||
result = subprocess.run(
|
||||
["python3", tmp_path],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=timeout,
|
||||
)
|
||||
if result.returncode == 0:
|
||||
return True, ""
|
||||
else:
|
||||
error = result.stderr.strip()
|
||||
# Truncate long tracebacks
|
||||
if len(error) > 500:
|
||||
error = error[-500:]
|
||||
return False, error
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
return False, "Execution timed out"
|
||||
except Exception as e:
|
||||
return False, f"{type(e).__name__}: {str(e)[:200]}"
|
||||
finally:
|
||||
if tmp_path and os.path.exists(tmp_path):
|
||||
os.unlink(tmp_path)
|
||||
|
||||
|
||||
def extract_boxed_code(text: str) -> Optional[str]:
|
||||
"""
|
||||
Extract code from \\boxed{...} format, handling nested braces.
|
||||
|
||||
Finds the LAST occurrence of \\boxed{} to handle cases where
|
||||
the model discusses code before providing the final answer.
|
||||
|
||||
Args:
|
||||
text: The model's response text.
|
||||
|
||||
Returns:
|
||||
The extracted code string, or None if no \\boxed{} found.
|
||||
"""
|
||||
idx = text.rfind("\\boxed{")
|
||||
if idx == -1:
|
||||
return None
|
||||
|
||||
start = idx + len("\\boxed{")
|
||||
brace_count = 1
|
||||
i = start
|
||||
|
||||
while i < len(text) and brace_count > 0:
|
||||
if text[i] == "{":
|
||||
brace_count += 1
|
||||
elif text[i] == "}":
|
||||
brace_count -= 1
|
||||
i += 1
|
||||
|
||||
if brace_count == 0:
|
||||
return text[start : i - 1].strip()
|
||||
return None
|
||||
|
||||
|
||||
def count_test_results(
|
||||
code: str,
|
||||
test_code: str,
|
||||
entry_point: str,
|
||||
timeout: int = 10,
|
||||
) -> Tuple[int, int]:
|
||||
"""
|
||||
Count how many individual assertions pass vs fail.
|
||||
|
||||
Wraps each assertion in a try/except to count partial success.
|
||||
|
||||
Args:
|
||||
code: The function implementation to test.
|
||||
test_code: Test code containing a `check(candidate)` function.
|
||||
entry_point: The function name to pass.
|
||||
timeout: Maximum execution time in seconds.
|
||||
|
||||
Returns:
|
||||
Tuple of (passed_count, total_count).
|
||||
"""
|
||||
# Build a counting test harness
|
||||
counter_code = f"""
|
||||
import sys
|
||||
|
||||
{code}
|
||||
|
||||
_passed = 0
|
||||
_total = 0
|
||||
|
||||
def _counting_check(candidate):
|
||||
global _passed, _total
|
||||
import types
|
||||
|
||||
# Get the original check function's code
|
||||
_orig_check_src = '''{test_code}'''
|
||||
_ns = {{'__builtins__': __builtins__}}
|
||||
exec(_orig_check_src, _ns)
|
||||
_orig_check = _ns.get('check')
|
||||
|
||||
if _orig_check is None:
|
||||
print("0/0")
|
||||
return
|
||||
|
||||
# Get the source of check and count assertions
|
||||
import inspect
|
||||
try:
|
||||
src = inspect.getsource(_orig_check)
|
||||
except (TypeError, OSError):
|
||||
# Can't inspect — just run it
|
||||
try:
|
||||
_orig_check(candidate)
|
||||
print("1/1")
|
||||
except Exception:
|
||||
print("0/1")
|
||||
return
|
||||
|
||||
# Count 'assert' lines
|
||||
assert_lines = [l.strip() for l in src.split('\\n') if l.strip().startswith('assert')]
|
||||
_total = max(len(assert_lines), 1)
|
||||
|
||||
# Run the full check — if it passes, all assertions passed
|
||||
try:
|
||||
_orig_check(candidate)
|
||||
_passed = _total
|
||||
except AssertionError:
|
||||
# Some failed — try to count
|
||||
_passed = 0
|
||||
for line in assert_lines:
|
||||
try:
|
||||
exec(line, {{'__builtins__': __builtins__, '{entry_point}': candidate, 'candidate': candidate}})
|
||||
_passed += 1
|
||||
except Exception:
|
||||
pass
|
||||
except Exception:
|
||||
_passed = 0
|
||||
|
||||
print(f"{{_passed}}/{{_total}}")
|
||||
|
||||
_counting_check({entry_point})
|
||||
"""
|
||||
|
||||
tmp_path = None
|
||||
try:
|
||||
with tempfile.NamedTemporaryFile(
|
||||
mode="w", suffix=".py", delete=False, prefix="code_debug_count_"
|
||||
) as f:
|
||||
f.write(counter_code)
|
||||
tmp_path = f.name
|
||||
|
||||
result = subprocess.run(
|
||||
["python3", tmp_path],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=timeout,
|
||||
)
|
||||
|
||||
stdout = result.stdout.strip()
|
||||
if "/" in stdout:
|
||||
parts = stdout.split("/")
|
||||
try:
|
||||
return int(parts[0]), int(parts[1])
|
||||
except (ValueError, IndexError):
|
||||
pass
|
||||
|
||||
# Fallback: if execution succeeded, assume all passed
|
||||
if result.returncode == 0:
|
||||
return 1, 1
|
||||
return 0, 1
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
return 0, 1
|
||||
except Exception:
|
||||
return 0, 1
|
||||
finally:
|
||||
if tmp_path and os.path.exists(tmp_path):
|
||||
os.unlink(tmp_path)
|
||||
Loading…
Add table
Add a link
Reference in a new issue