diff --git a/eval/README.md b/eval/README.md index f7f89c7c..a52c4789 100644 --- a/eval/README.md +++ b/eval/README.md @@ -126,11 +126,24 @@ Options: - `--size`: Default dataset size (default: 100) - `--seed`: Default dataset seed (default: 42) - `--include-params`: Include all configuration parameters (default: False) +- `--category`: Only include datasets from this category (default: None) + +#### Generating Config for a Specific Category + +To generate a configuration file containing only datasets from a specific category: + +```bash +python generate_config.py --category algorithmic --output algorithmic_datasets.yaml --model "anthropic/claude-3.5-sonnet" +``` + +This will create a configuration file that includes only datasets in the "algorithmic" category. This is useful when you want to focus your evaluation on a specific type of reasoning task. + +Example categories include math, arithmetic, reasoning, and algorithmic. The category is automatically extracted from the dataset's module name (e.g., from `reasoning_gym.math.dataset_name`, it extracts "math"). + +You can see all available categories by running the script without the `--category` option, as it will print all categories at the end of execution. 
### Running Evaluations -To run evaluations: - ```bash python eval.py --config configs/your_config.yaml ``` diff --git a/eval/generate_config.py b/eval/generate_config.py index 4ab31eb1..43cff904 100644 --- a/eval/generate_config.py +++ b/eval/generate_config.py @@ -15,6 +15,7 @@ Options: --size SIZE Default dataset size (default: 100) --seed SEED Default dataset seed (default: 42) --include-params Include all configuration parameters (default: False) + --category CATEGORY Only include datasets from this category (default: None) """ import argparse @@ -35,14 +36,27 @@ def extract_category(module_name): return "other" -def generate_config(model, provider, size, seed, include_params): - """Generate configuration with all registered datasets.""" +def generate_config(model, provider, size, seed, include_params, category=None): + """Generate configuration with all registered datasets. + + Args: + model: Model name + provider: Provider name + size: Default dataset size + seed: Default dataset seed + include_params: Whether to include all configuration parameters + category: If specified, only include datasets from this category + """ # Group datasets by category categories = defaultdict(list) for dataset_name, (dataset_cls, config_cls) in DATASETS.items(): # Extract category from module name - category = extract_category(dataset_cls.__module__) + dataset_category = extract_category(dataset_cls.__module__) + + # Skip if a specific category was requested and this doesn't match + if category and dataset_category != category: + continue # Create dataset entry dataset_entry = {"dataset": dataset_name} @@ -62,7 +76,7 @@ def generate_config(model, provider, size, seed, include_params): dataset_entry["params"] = params # Add to appropriate category - categories[category].append(dataset_entry) + categories[dataset_category].append(dataset_entry) # Create configuration structure config = { @@ -90,12 +104,18 @@ def main(): parser.add_argument("--size", type=int, default=100, 
help="Default dataset size") parser.add_argument("--seed", type=int, default=42, help="Default dataset seed") parser.add_argument("--include-params", action="store_true", help="Include all configuration parameters") + parser.add_argument("--category", help="Only include datasets from this category") args = parser.parse_args() # Generate configuration config = generate_config( - model=args.model, provider=args.provider, size=args.size, seed=args.seed, include_params=args.include_params + model=args.model, + provider=args.provider, + size=args.size, + seed=args.seed, + include_params=args.include_params, + category=args.category, ) # Write to file