mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-24 17:04:55 +00:00
Add regex generation environment for community
This commit is contained in:
parent
81b2d4daab
commit
86d5163316
4 changed files with 822 additions and 0 deletions
61
environments/community/regex_generation/README.md
Normal file
61
environments/community/regex_generation/README.md
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
# Regex Generation Environment
|
||||
|
||||
An RL environment that trains language models to generate correct Python-compatible regular expressions from natural language descriptions and example test cases.
|
||||
|
||||
## How it works
|
||||
|
||||
Each problem gives the model:
|
||||
- A natural language description of the pattern to match
|
||||
- A set of strings that **should** match
|
||||
- A set of strings that **should not** match
|
||||
|
||||
The model must produce a regex pattern inside `<answer>` tags. The pattern is tested using `re.fullmatch()` against all provided examples.
|
||||
|
||||
## Reward signal
|
||||
|
||||
The reward is the fraction of test cases passed (both positive and negative). A score of 1.0 means the regex correctly matches all positive examples and rejects all negative ones. Groups where all rollouts score identically are discarded (no learning signal).
|
||||
|
||||
## Problem set
|
||||
|
||||
The environment ships with 27 hand-crafted regex problems across three difficulty levels:
|
||||
|
||||
- **Easy**: Basic patterns (digits only, starts with X, exact match)
|
||||
- **Medium**: Emails, dates, phone numbers, hex colors, zip codes
|
||||
- **Hard**: IPv4 addresses, semantic versioning, URLs, repeated words
|
||||
|
||||
Problems are split 80/20 into train/test sets.
|
||||
|
||||
## Running
|
||||
|
||||
```bash
|
||||
# Basic training
|
||||
python regex_env.py serve \
|
||||
--env.tokenizer_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview" \
|
||||
--openai.base_url http://localhost:9001/v1
|
||||
|
||||
# Only easy/medium problems
|
||||
python regex_env.py serve \
|
||||
--env.difficulties='["easy", "medium"]'
|
||||
```
|
||||
|
||||
## Config options
|
||||
|
||||
| Option | Type | Default | Description |
|
||||
|--------|------|---------|-------------|
|
||||
| `difficulties` | list[str] | `["easy", "medium", "hard"]` | Difficulty levels to include |
|
||||
| `score_threshold` | float | `1.0` | Min score to count as "correct" in metrics |
|
||||
|
||||
Standard `BaseEnvConfig` options (`group_size`, `max_token_length`, etc.) also apply.
|
||||
|
||||
## Eval metrics
|
||||
|
||||
| Metric | Description |
|
||||
|--------|-------------|
|
||||
| `eval/avg_score` | Average fraction of test cases passed |
|
||||
| `eval/percent_perfect` | Fraction of problems with all tests passing |
|
||||
| `eval/percent_valid_regex` | Fraction of responses with syntactically valid regex |
|
||||
| `train/percent_correct` | Training accuracy (fraction of rollouts scoring at or above `score_threshold`) |
|
||||
|
||||
## Dependencies
|
||||
|
||||
No extra dependencies beyond what Atropos already provides. Uses only Python's built-in `re` module for regex validation.
|
||||
10
environments/community/regex_generation/__init__.py
Normal file
10
environments/community/regex_generation/__init__.py
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
"""
|
||||
Regex Generation Environment
|
||||
|
||||
An RL environment for training LLMs to generate correct regular expressions
|
||||
from natural language descriptions and test cases.
|
||||
"""
|
||||
|
||||
__all__ = ["RegexEnv"]
|
||||
|
||||
from regex_env import RegexEnv # noqa
|
||||
352
environments/community/regex_generation/regex_env.py
Normal file
352
environments/community/regex_generation/regex_env.py
Normal file
|
|
@ -0,0 +1,352 @@
|
|||
import logging
|
||||
import random
|
||||
import re
|
||||
import time
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
|
||||
from pydantic import Field
|
||||
from regex_problems import PROBLEMS
|
||||
from tqdm.asyncio import tqdm_asyncio
|
||||
|
||||
from atroposlib.envs.base import (
|
||||
APIServerConfig,
|
||||
BaseEnv,
|
||||
BaseEnvConfig,
|
||||
ScoredDataGroup,
|
||||
)
|
||||
from atroposlib.type_definitions import Item
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# System prompt sent with every rollout and eval request. It is assembled from
# three paragraphs separated by a blank line: how to reason, what the task is,
# and how the final answer must be formatted.
SYSTEM_PROMPT = "\n\n".join(
    (
        # Paragraph 1: reasoning instructions (<think> tags).
        "You are a deep thinking AI, you may use extremely long chains of "
        "thought to deeply consider the problem and deliberate with yourself "
        "via systematic reasoning processes to help come to a correct solution "
        "prior to answering. You should enclose your thoughts and internal "
        "monologue inside <think> </think> tags, and then provide your solution "
        "or response to the problem.",
        # Paragraph 2: task statement; patterns are checked with re.fullmatch.
        "You will be given a description of a pattern to match, along with "
        "examples of strings that should and should not match. Write a "
        "Python-compatible regular expression that matches the full string "
        "(the regex will be tested with re.fullmatch).",
        # Paragraph 3: answer format, with a one-line example.
        "Provide your answer inside <answer> </answer> tags, containing only "
        "the regex pattern with no delimiters, flags, or extra text. For "
        "example:\n<answer>^[a-z]+$</answer>",
    )
)
|
||||
|
||||
|
||||
def build_user_prompt(problem: dict) -> str:
    """Render a regex problem as the user-turn prompt text.

    Args:
        problem: Mapping with a ``"description"`` string and ``"positive"`` /
            ``"negative"`` iterables of example strings.

    Returns:
        The description followed by two bulleted sections (examples that
        should match, then examples that should not). Each example is shown
        via ``repr`` so empty strings and whitespace stay visible.
    """

    def as_bullets(examples):
        return [f" - {example!r}" for example in examples]

    sections = [
        f"Description: {problem['description']}",
        "",
        "Strings that SHOULD match:",
        *as_bullets(problem["positive"]),
        "",
        "Strings that should NOT match:",
        *as_bullets(problem["negative"]),
    ]
    return "\n".join(sections)
|
||||
|
||||
|
||||
def extract_answer(text: str) -> Optional[str]:
    """Pull the regex pattern out of ``<answer>...</answer>`` tags.

    Args:
        text: Raw model response. ``None`` or an empty string is tolerated
            (a completion may carry no text content) and yields ``None``.

    Returns:
        The content of the *last* ``<answer>`` block with surrounding
        whitespace stripped, or ``None`` when no complete block is present.
        The last block is used because the response is expected to reason
        first and answer afterwards, so an earlier block is more likely a
        draft than the final submission.
    """
    if not text:
        return None

    # With a single capture group, findall returns the captured strings in
    # left-to-right order.
    answers = re.findall(r"<answer>\s*(.*?)\s*</answer>", text, re.DOTALL)
    if not answers:
        return None
    return answers[-1].strip()
|
||||
|
||||
|
||||
def test_regex(pattern: str, positive: list, negative: list) -> dict:
    """Score a regex pattern against positive and negative examples.

    Args:
        pattern: Candidate regular expression source.
        positive: Strings the pattern must fully match.
        negative: Strings the pattern must not fully match.

    Returns:
        A dict with keys ``"score"`` (fraction of all examples handled
        correctly, ``0.0`` when there are no examples), ``"valid"`` (whether
        the pattern compiled), ``"pos_pass"`` and ``"neg_pass"`` (counts of
        correctly handled positive / negative examples). A pattern that does
        not compile scores ``0.0`` with ``"valid": False``.

    NOTE(review): matching runs without a timeout, so a pattern with
    pathological backtracking can take a long time on some inputs.
    """
    try:
        compiled = re.compile(pattern)
    except (re.error, OverflowError, RecursionError):
        # Besides re.error, the parser raises OverflowError for oversized
        # repetition counts (e.g. "a{99999999999999999999}") and may hit
        # RecursionError on very deeply nested patterns. All of these mean
        # "unusable pattern", not "crash the scorer".
        return {"score": 0.0, "valid": False, "pos_pass": 0, "neg_pass": 0}

    pos_pass = sum(1 for s in positive if compiled.fullmatch(s) is not None)
    neg_pass = sum(1 for s in negative if compiled.fullmatch(s) is None)

    total = len(positive) + len(negative)
    score = (pos_pass + neg_pass) / total if total > 0 else 0.0

    return {
        "score": score,
        "valid": True,
        "pos_pass": pos_pass,
        "neg_pass": neg_pass,
    }


# Despite the ``test_`` prefix this is a scoring helper, not a test case; opt
# out of pytest collection in case it is imported into a test module.
test_regex.__test__ = False
|
||||
|
||||
|
||||
class RegexEnvConfig(BaseEnvConfig):
    """Config for the regex generation environment.

    Extends ``BaseEnvConfig`` with the two options specific to this
    environment: which difficulty buckets to draw problems from, and the
    score a rollout needs to be counted as correct in training metrics.
    """

    # Problems whose "difficulty" value is not listed here are filtered out
    # during environment setup.
    difficulties: List[str] = Field(
        default=["easy", "medium", "hard"],
        description="Which difficulty levels to include",
    )
    # Compared with ``>=`` against each rollout's score when filling the
    # train/percent_correct buffer; eval's "perfect" metric always uses 1.0.
    score_threshold: float = Field(
        default=1.0,
        description=(
            "Minimum test pass rate for a rollout to count as correct in the "
            "train/percent_correct metric"
        ),
    )
|
||||
|
||||
|
||||
class RegexEnv(BaseEnv):
    """Environment that asks a model for a regex and rewards test-case accuracy.

    Each item is a problem dict with a description plus positive and negative
    example strings. Rollouts are scored by the fraction of examples the
    submitted pattern classifies correctly under ``re.fullmatch``.
    """

    name = "regex_generation"
    env_config_cls = RegexEnvConfig

    # Sampling temperature for eval rollouts; also reported in the eval log so
    # the two can never drift apart.
    _EVAL_TEMPERATURE = 0.6

    def __init__(
        self,
        config: RegexEnvConfig,
        server_configs: List[APIServerConfig],
        slurm=True,
        testing=False,
    ):
        super().__init__(config, server_configs, slurm, testing)
        # 1.0 / 0.0 flag per scored rollout; drained by wandb_log().
        self.percent_correct_buffer = list()
        # (metric name, value) pairs produced by evaluate(); drained by wandb_log().
        self.eval_metrics = list()

    @classmethod
    def config_init(cls) -> Tuple[RegexEnvConfig, List[APIServerConfig]]:
        """Return the default environment config and inference server config."""
        env_config = RegexEnvConfig(
            tokenizer_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview",
            group_size=8,
            use_wandb=True,
            rollout_server_url="http://localhost:8000",
            total_steps=2000,
            batch_size=12,
            steps_per_eval=200,
            max_token_length=2048,
            wandb_name="regex_generation",
        )
        server_configs = [
            APIServerConfig(
                model_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview",
                base_url="http://localhost:9001/v1",
                api_key="x",
                num_requests_for_eval=256,
            ),
        ]
        return env_config, server_configs

    async def setup(self):
        """Filter problems by difficulty and build the train/test split.

        Raises:
            ValueError: If no problem matches ``config.difficulties``; without
                this check the failure would only show up later as a
                division by zero when picking the next training item.
        """
        all_problems = [
            p for p in PROBLEMS if p["difficulty"] in self.config.difficulties
        ]
        if not all_problems:
            raise ValueError(
                "No regex problems match the configured difficulties: "
                f"{self.config.difficulties!r}"
            )

        # Deterministic shuffle with a private generator: same order as seeding
        # the module-level RNG with 42, but without resetting global random
        # state for the rest of the process.
        random.Random(42).shuffle(all_problems)

        # 80/20 train/test split, keeping at least one training problem.
        split_idx = max(1, int(len(all_problems) * 0.8))
        self.train = all_problems[:split_idx]
        self.test = all_problems[split_idx:]

        if not self.test:
            # Too few problems for a held-out set: evaluate on the tail of the
            # training problems instead.
            self.test = self.train[-2:]

        self.iter = 0
        logger.info(
            f"Loaded {len(self.train)} train and {len(self.test)} test problems"
        )

    def save_checkpoint(self, step, data=None):
        """Store the training cursor ``iter`` in the checkpoint payload."""
        if data is None:
            data = {}
        data["iter"] = self.iter
        super().save_checkpoint(step, data)

    async def get_next_item(self) -> Item:
        """Return the next training problem, cycling through the train set."""
        problem = self.train[self.iter % len(self.train)]
        self.iter += 1
        return problem

    async def collect_trajectories(
        self, item: dict
    ) -> Tuple[ScoredDataGroup, list[Item]]:
        """Sample ``group_size`` completions for one problem and score them.

        Args:
            item: Problem dict as returned by ``get_next_item``.

        Returns:
            ``(scored_group, [])`` where ``scored_group`` is whatever
            ``score`` returns (possibly ``None``); no backlog items are
            produced.
        """
        user_content = build_user_prompt(item)
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
        ]

        async with self.server.managed_server(tokenizer=self.tokenizer) as managed:
            chat_completions = await managed.chat_completion(
                messages=messages,
                n=self.config.group_size,
                max_tokens=self.config.max_token_length,
                temperature=1.0,
            )
            # Token-level data for each completion is read from the managed
            # server state; nodes[i] is paired with choices[i] below.
            state = managed.get_state()
            nodes = state["nodes"]

        to_score = []
        for i, choice in enumerate(chat_completions.choices):
            to_score.append(
                {
                    "response": choice.message.content,
                    "finish_reason": choice.finish_reason,
                    "tokens": nodes[i].tokens,
                    "masks": nodes[i].masked_tokens,
                    "logprobs": nodes[i].logprobs,
                    "positive": item["positive"],
                    "negative": item["negative"],
                }
            )

        scored = await self.score(to_score)
        return scored, []

    async def score(
        self, rollout_group_data: list
    ) -> Union[Optional[ScoredDataGroup], List[Optional[ScoredDataGroup]]]:
        """Turn raw rollouts into a scored group.

        Args:
            rollout_group_data: Rollout dicts built by ``collect_trajectories``.
                The list is shuffled in place.

        Returns:
            A ``ScoredDataGroup`` with tokens, masks, scores and inference
            logprobs, or ``None`` when no rollout survives filtering or when
            every surviving rollout received the same score.
        """
        scores = ScoredDataGroup()
        scores["tokens"] = []
        scores["masks"] = []
        scores["scores"] = []
        scores["inference_logprobs"] = []

        random.shuffle(rollout_group_data)

        for item in rollout_group_data:
            # Skip truncated responses.
            if item["finish_reason"] == "length":
                continue

            masks = item["masks"]
            # Skip very short completions: fewer than 10 mask entries differ
            # from -100 (presumably the masked-position marker - confirm).
            if sum(1 for t in masks if t != -100) < 10:
                continue

            # A completion may carry no text content; treat that as no answer
            # instead of passing None to the extractor.
            response = item["response"]
            pattern = extract_answer(response) if response else None
            if pattern is None:
                reward = 0.0
            else:
                result = test_regex(pattern, item["positive"], item["negative"])
                reward = result["score"]

            scores["tokens"].append(item["tokens"])
            scores["masks"].append(masks)
            scores["inference_logprobs"].append(item["logprobs"])
            scores["scores"].append(reward)

            if len(scores["tokens"]) >= self.config.group_size:
                break

        if not scores["tokens"]:
            return None

        # Training accuracy is tracked even for groups discarded below.
        threshold = self.config.score_threshold
        self.percent_correct_buffer.extend(
            1.0 if s >= threshold else 0.0 for s in scores["scores"]
        )

        # If all scores are identical there is no learning signal.
        if len(set(scores["scores"])) == 1:
            return None

        return scores

    async def rollout_and_score_eval(self, problem: dict) -> dict:
        """Run a single eval rollout and score it.

        Args:
            problem: Problem dict from the test split.

        Returns:
            Dict with ``"score"``, ``"perfect"`` (score equals 1.0),
            ``"valid_regex"``, the extracted ``"pattern"`` (or ``None``) and a
            ``"sample"`` record containing the conversation and metadata.
        """
        user_content = build_user_prompt(problem)
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
        ]

        async with self.server.managed_server(tokenizer=self.tokenizer) as managed:
            completion = await managed.chat_completion(
                messages=messages,
                n=1,
                max_tokens=self.config.max_token_length,
                temperature=self._EVAL_TEMPERATURE,
            )
            response_content = completion.choices[0].message.content

        # Missing content counts as "no answer" rather than crashing the eval.
        pattern = extract_answer(response_content) if response_content else None
        if pattern is None:
            test_result = {"score": 0.0, "valid": False, "pos_pass": 0, "neg_pass": 0}
        else:
            test_result = test_regex(pattern, problem["positive"], problem["negative"])

        score = test_result["score"]
        is_perfect = score == 1.0

        return {
            "score": score,
            "perfect": is_perfect,
            "valid_regex": test_result.get("valid", False),
            "pattern": pattern,
            "sample": {
                "messages": [
                    *messages,
                    {"role": "assistant", "content": response_content},
                ],
                "description": problem["description"],
                "difficulty": problem["difficulty"],
                "submitted_pattern": pattern,
                "score": score,
                "correct": is_perfect,
            },
        }

    async def evaluate(self, *args, **kwargs):
        """Evaluate on the test split and log aggregate metrics and samples."""
        start_time = time.time()

        eval_tasks = [self.rollout_and_score_eval(p) for p in self.test]
        results = await tqdm_asyncio.gather(*eval_tasks)

        scores = [r["score"] for r in results]
        samples = [r["sample"] for r in results]
        perfect_count = sum(1 for r in results if r["perfect"])
        valid_count = sum(1 for r in results if r["valid_regex"])

        avg_score = sum(scores) / len(scores) if scores else 0.0
        percent_perfect = perfect_count / len(results) if results else 0.0
        percent_valid = valid_count / len(results) if results else 0.0

        end_time = time.time()

        eval_metrics = {
            "eval/avg_score": avg_score,
            "eval/percent_perfect": percent_perfect,
            "eval/percent_valid_regex": percent_valid,
        }
        # Queue the same values for the next wandb_log() call.
        self.eval_metrics.extend(eval_metrics.items())

        await self.evaluate_log(
            metrics=eval_metrics,
            samples=samples,
            start_time=start_time,
            end_time=end_time,
            generation_parameters={
                "temperature": self._EVAL_TEMPERATURE,
                "max_tokens": self.config.max_token_length,
            },
        )

    async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
        """Add buffered train/eval metrics to ``wandb_metrics`` and log them.

        Both buffers are cleared after being folded into the metrics dict.
        """
        if wandb_metrics is None:
            wandb_metrics = {}

        if self.percent_correct_buffer:
            wandb_metrics["train/percent_correct"] = sum(
                self.percent_correct_buffer
            ) / len(self.percent_correct_buffer)
            self.percent_correct_buffer = list()

        for key, value in self.eval_metrics:
            wandb_metrics[key] = value
        self.eval_metrics = list()

        await super().wandb_log(wandb_metrics)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: hand control to the environment's command-line
    # interface (the README invokes it as ``python regex_env.py serve ...``).
    RegexEnv.cli()
|
||||
399
environments/community/regex_generation/regex_problems.py
Normal file
399
environments/community/regex_generation/regex_problems.py
Normal file
|
|
@ -0,0 +1,399 @@
|
|||
"""
|
||||
Hand-crafted regex problems with natural language descriptions,
|
||||
positive/negative test cases, and difficulty ratings.
|
||||
"""
|
||||
|
||||
# Each problem is a dict with:
#   "description": natural-language statement of the pattern,
#   "positive":    strings the regex must fully match,
#   "negative":    strings the regex must not fully match,
#   "difficulty":  "easy" | "medium" | "hard".
# The section comments below are only a rough grouping; the "difficulty"
# field is what the environment filters on. Entry order matters because the
# environment shuffles this list with a fixed seed to derive the train/test
# split, so reordering entries changes the split.
PROBLEMS = [
    # --- Easy ---
    {
        "description": (
            "Match a string that contains only digits (0-9),"
            " one or more characters long."
        ),
        "positive": ["123", "0", "999999", "42", "007"],
        "negative": ["abc", "12a3", "", " 123", "12.3", "12 34"],
        "difficulty": "easy",
    },
    {
        "description": (
            "Match a string that starts with 'hello' (case-sensitive)."
        ),
        "positive": ["hello", "hello world", "hellooo", "hello123"],
        "negative": ["Hello", "hi hello", "HELLO", "hell"],
        "difficulty": "easy",
    },
    {
        "description": "Match a string that ends with '.txt'.",
        "positive": ["file.txt", "my_doc.txt", ".txt", "a.txt"],
        "negative": ["file.csv", "txt", "file.txt.bak", "file.txts"],
        "difficulty": "easy",
    },
    {
        "description": (
            "Match a string consisting of exactly three lowercase letters."
        ),
        "positive": ["abc", "xyz", "foo", "bar"],
        "negative": ["ab", "abcd", "ABC", "a1c", "ab ", " ab"],
        "difficulty": "easy",
    },
    {
        "description": (
            "Match a string that is either 'yes' or 'no'"
            " (exact match, case-sensitive)."
        ),
        "positive": ["yes", "no"],
        "negative": ["Yes", "NO", "maybe", "yes ", " no", "yesno"],
        "difficulty": "easy",
    },
    {
        "description": (
            "Match a string that contains at least one uppercase letter."
        ),
        "positive": ["Hello", "ABC", "aB", "123A456"],
        "negative": ["hello", "123", "abc!", ""],
        "difficulty": "easy",
    },
    {
        "description": (
            "Match a non-empty string consisting only of"
            " whitespace characters (spaces, tabs)."
        ),
        # NOTE(review): the first two entries are identical as written; a
        # multi-space string may have been intended - confirm.
        "positive": [" ", " ", "\t", " \t "],
        "negative": ["", "a", " a ", "hello"],
        "difficulty": "easy",
    },
    {
        "description": "Match a string that starts with a digit.",
        "positive": ["1abc", "0", "9test", "3 things"],
        "negative": ["abc", " 1", "a1", ""],
        "difficulty": "easy",
    },
    # --- Medium ---
    {
        "description": (
            "Match a valid email address: one or more"
            " alphanumeric/dot/underscore/hyphen characters,"
            " then '@', then one or more alphanumeric/dot/hyphen"
            " characters, then '.', then two to four letters."
        ),
        "positive": [
            "user@example.com",
            "first.last@domain.org",
            "name_123@test.co",
            "a@b.io",
        ],
        "negative": [
            "@example.com",
            "user@.com",
            "user@com",
            "user@domain.toolongext",
            "user@@domain.com",
        ],
        "difficulty": "medium",
    },
    {
        "description": (
            "Match a time string in 24-hour format HH:MM"
            " where HH is 00-23 and MM is 00-59."
        ),
        "positive": ["00:00", "12:30", "23:59", "09:05"],
        "negative": [
            "24:00",
            "12:60",
            "1:30",
            "12:5",
            "12-30",
            "ab:cd",
        ],
        "difficulty": "medium",
    },
    {
        "description": (
            "Match a US zip code: exactly 5 digits, optionally"
            " followed by a dash and exactly 4 more digits."
        ),
        "positive": ["12345", "00000", "12345-6789", "99999-0000"],
        "negative": [
            "1234",
            "123456",
            "12345-678",
            "12345-67890",
            "abcde",
        ],
        "difficulty": "medium",
    },
    {
        "description": (
            "Match a hex color code: a '#' followed by exactly"
            " 6 hexadecimal characters (0-9, a-f, A-F)."
        ),
        "positive": ["#aabbcc", "#123456", "#ABCDEF", "#a1B2c3"],
        "negative": [
            "#abc",
            "#1234567",
            "aabbcc",
            "#GHIJKL",
            "# aabbcc",
        ],
        "difficulty": "medium",
    },
    {
        "description": (
            "Match a valid IPv4 address. Each octet is 0-255,"
            " separated by dots. No leading zeros allowed"
            " except for the number 0 itself."
        ),
        "positive": [
            "192.168.1.1",
            "0.0.0.0",
            "255.255.255.255",
            "10.0.0.1",
        ],
        "negative": [
            "256.1.1.1",
            "1.2.3.256",
            "01.02.03.04",
            "1.2.3",
            "1.2.3.4.5",
            "abc.def.ghi.jkl",
        ],
        "difficulty": "hard",
    },
    {
        "description": (
            "Match a date in the format YYYY-MM-DD where YYYY"
            " is four digits, MM is 01-12, and DD is 01-31."
        ),
        "positive": ["2024-01-15", "1999-12-31", "2000-06-01"],
        "negative": [
            "2024-13-01",
            "2024-00-15",
            "2024-01-32",
            "2024-01-00",
            "24-01-15",
            "2024/01/15",
        ],
        "difficulty": "medium",
    },
    {
        "description": (
            "Match a string of only alphanumeric characters and"
            " underscores. Must start with a letter or underscore"
            " and be 1-30 characters long. Like a variable name."
        ),
        "positive": ["my_var", "_private", "x", "CamelCase", "var_123"],
        "negative": ["123abc", "my-var", "my var", "", "a" * 31],
        "difficulty": "medium",
    },
    {
        "description": (
            "Match a string enclosed in double quotes. Inside the"
            " quotes, any characters are allowed except unescaped"
            ' double quotes. Escaped quotes (\\\") are allowed.'
        ),
        "positive": [
            '"hello"',
            '"hello world"',
            '""',
            '"she said \\"hi\\""',
        ],
        "negative": [
            "hello",
            '"missing end',
            'no "quotes" here',
            "'single'",
        ],
        "difficulty": "medium",
    },
    {
        "description": (
            "Match a phone number in the format (XXX) XXX-XXXX"
            " where X is a digit."
        ),
        "positive": [
            "(123) 456-7890",
            "(000) 000-0000",
            "(999) 999-9999",
        ],
        "negative": [
            "123-456-7890",
            "(123)456-7890",
            "(123) 456 7890",
            "(12) 456-7890",
            "(1234) 456-7890",
        ],
        "difficulty": "medium",
    },
    # --- Hard ---
    {
        "description": (
            "Match a valid CSS class selector: starts with a dot,"
            " followed by a letter, hyphen, or underscore, then"
            " zero or more letters, digits, hyphens, or underscores."
        ),
        "positive": [".my-class", ".a", "._private", ".btn-primary-2"],
        "negative": [
            "my-class",
            ".123",
            ". space",
            ".my class",
            ".",
        ],
        "difficulty": "hard",
    },
    {
        "description": (
            "Match a valid semantic version: MAJOR.MINOR.PATCH"
            " where each is a non-negative integer without leading"
            " zeros (except 0 itself). Optionally followed by a"
            " hyphen and a pre-release label (alphanumeric, dots)."
        ),
        "positive": [
            "1.0.0",
            "0.1.0",
            "12.34.56",
            "1.0.0-alpha",
            "1.0.0-beta.1",
        ],
        "negative": [
            "1.0",
            "1.0.0.0",
            "01.0.0",
            "1.02.0",
            "v1.0.0",
            "1.0.0-",
        ],
        "difficulty": "hard",
    },
    {
        "description": (
            "Match a positive or negative integer or decimal"
            " number. May optionally start with + or -, must"
            " have digits before or after the decimal point."
        ),
        "positive": ["42", "-3.14", "+0.5", "100", "0.001", "-7"],
        "negative": [".", "+-3", "12.34.5", "abc", "1.2.3", "3e10", ""],
        "difficulty": "hard",
    },
    {
        "description": (
            "Match a URL starting with http:// or https://,"
            " followed by a domain (letters, digits, dots,"
            " hyphens), then optionally a path of slashes"
            " and URL-safe characters."
        ),
        "positive": [
            "http://example.com",
            "https://www.google.com/search",
            "https://a.b.c/path/to/page",
            "http://test.io/",
        ],
        "negative": [
            "ftp://example.com",
            "example.com",
            "http://",
            "https:///path",
        ],
        "difficulty": "hard",
    },
    {
        "description": (
            "Match a valid MAC address: six groups of two"
            " hexadecimal digits separated by colons."
        ),
        "positive": [
            "00:1A:2B:3C:4D:5E",
            "ff:ff:ff:ff:ff:ff",
            "AA:BB:CC:DD:EE:FF",
            "01:23:45:67:89:ab",
        ],
        "negative": [
            "00:1A:2B:3C:4D",
            "00:1A:2B:3C:4D:5E:6F",
            "001A2B3C4D5E",
            "GG:HH:II:JJ:KK:LL",
            "00-1A-2B-3C-4D-5E",
        ],
        "difficulty": "hard",
    },
    {
        "description": (
            "Match a valid Markdown heading: one to six '#'"
            " characters at the start, followed by a space,"
            " then at least one non-whitespace character."
        ),
        "positive": [
            "# Title",
            "## Section",
            "###### Deep",
            "### My Heading 3",
        ],
        "negative": [
            "####### Too deep",
            "#NoSpace",
            "# ",
            "Not a heading",
            "",
        ],
        "difficulty": "medium",
    },
    {
        # The description requires a non-whitespace character so that it
        # agrees with the "{ }" negative example below.
        "description": (
            "Match a Python f-string placeholder: starts with"
            " '{', ends with '}', and contains one or more"
            " characters in between, none of which is a brace"
            " and at least one of which is not whitespace."
        ),
        "positive": ["{x}", "{name!r}", "{value:.2f}", "{obj.attr}"],
        "negative": ["{}", "{ }", "no braces", "{", "}", "{{escaped}}"],
        "difficulty": "medium",
    },
    {
        "description": (
            "Match a valid HTML opening tag (not self-closing)."
            " Starts with '<', then a tag name (letters),"
            " optionally attributes, then '>'. No '/' before '>'."
        ),
        "positive": ["<div>", "<span>", '<a href="link">', "<p>"],
        "negative": ["<div/>", "</div>", "div", "< div>", "<>", "<123>"],
        "difficulty": "hard",
    },
    {
        "description": (
            "Match a string containing a repeated word (same word"
            " appearing consecutively, separated by a space)."
            " For example 'the the' or 'is is'."
        ),
        "positive": [
            "the the cat",
            "I said said it",
            "go go go",
            "yes yes",
        ],
        "negative": [
            "no repeats here",
            "the cat the dog",
            "hello world",
        ],
        "difficulty": "hard",
    },
    {
        "description": (
            "Match a credit card-like number: exactly 16 digits,"
            " optionally separated into groups of 4 by dashes"
            " or spaces (but not mixed)."
        ),
        "positive": [
            "1234567890123456",
            "1234-5678-9012-3456",
            "1234 5678 9012 3456",
        ],
        "negative": [
            "1234-5678-90123456",
            "123456789012345",
            "12345678901234567",
            "1234 5678 9012-3456",
            "abcd-efgh-ijkl-mnop",
        ],
        "difficulty": "hard",
    },
]
|
||||
Loading…
Add table
Add a link
Reference in a new issue