# atropos/environments/hack0/MCP_datasets.py

from datasets import load_dataset
import pandas as pd
import random

# Fetch the "DeepNLP/mcp-servers" dataset and flatten its "train" split into a
# pandas DataFrame. Any failure along the way is reported on stdout and
# replaced by a tiny hard-coded sample so the rest of the script still runs.
try:
    ds = load_dataset("DeepNLP/mcp-servers")
    train_ds = ds["train"]
    df = train_ds.to_pandas()
except Exception as exc:
    print(f"Error loading dataset: {exc}")
    # Three-row stand-in carrying the columns this script reads later on.
    df = pd.DataFrame(
        {
            "content_name": ["AgentRPC", "Git", "Actors MCP Server"],
            "description": [
                "Toggle menu Node.js Go Python",
                "Tools to read, search, and manipulate code",
                "Use 3,000+ pre-built cloud tools",
            ],
            "subfield": ["MCP SERVER"] * 3,
            "field": ["AI AGENT"] * 3,
        }
    )

# Sentence skeletons shared by every server. `{action}` receives the task
# phrase; `{object}` and `{tool}` both receive the server name.
general_templates = [
    "I need to {action} {object}. Can you help me?",
    "How can I {action} using {tool}?",
    "I'm trying to {action}. What's the best way to do this?",
    "Can you assist me with {action}?",
    "What's the process for {action} with {tool}?",
]

# Hand-written task phrases, keyed by the exact server name.
server_specific_actions = {
    "AgentRPC": [
        "call a remote procedure",
        "establish a connection with a remote server",
        "execute a function on another machine",
        "implement RPC in my application",
        "set up agent communication",
    ],
    "Git": [
        "merge my branch",
        "resolve a merge conflict",
        "check the commit history",
        "revert to a previous commit",
        "create a new branch",
    ],
    "AWS KB Retrieval": [
        "find information about AWS services",
        "query the AWS knowledge base",
        "lookup AWS documentation",
        "get help with AWS configuration",
        "understand AWS pricing",
    ],
    "Anki": [
        "create flashcards for studying",
        "improve my spaced repetition system",
        "organize my study notes",
        "set up a memory training system",
        "track my learning progress",
    ],
    "ArangoDB": [
        "query a graph database",
        "store connected data",
        "implement a multi-model database",
        "perform graph traversals",
        "optimize my database queries",
    ],
}

# Generic task phrases for any server that has no entry of its own.
default_actions = [
    "connect to a server",
    "use an API",
    "access external data",
    "integrate with a tool",
    "automate a process",
]

def generate_prompt_for_server(server_name, description):
    """Build one randomized user-style prompt for an MCP server.

    Action phrases are first mined from ``description``: for each known verb
    that appears as a whitespace-separated, lower-cased token, the verb plus
    the token following its first occurrence forms a candidate such as
    ``"toggle menu"``. Tokens keep their punctuation, so ``"read,"`` does not
    count as ``"read"``. When nothing is mined, the phrases listed for the
    server in ``server_specific_actions`` are used, or ``default_actions``
    if the server has no entry there.

    Args:
        server_name: Server name; used as the lookup key for the predefined
            actions and substituted for both ``{object}`` and ``{tool}``.
        description: Free-text description. A falsy or non-``str`` value is
            ignored.

    Returns:
        A randomly chosen template from ``general_templates`` filled in with
        a randomly chosen action and the server name.
    """
    known_verbs = (
        "use", "toggle", "enable", "explore", "search",
        "read", "process", "connect", "build",
    )

    # Only real, non-empty strings are tokenized; everything else mines nothing.
    tokens = []
    if description and isinstance(description, str):
        tokens = description.lower().split()

    candidates = []
    for keyword in known_verbs:
        try:
            position = tokens.index(keyword)
        except ValueError:
            continue
        follower = position + 1
        # A verb sitting at the very end has no object to pair with.
        if follower < len(tokens):
            candidates.append(f"{keyword} {tokens[follower]}")

    if not candidates:
        candidates = server_specific_actions.get(server_name, default_actions)

    # Draw order (action, then template) is kept fixed so seeded runs repeat.
    picked_action = random.choice(candidates)
    picked_template = random.choice(general_templates)
    return picked_template.format(
        action=picked_action,
        object=server_name,
        tool=server_name,
    )

# Produce one prompt per dataset row, visiting rows in frame order. A row
# without a 'description' entry is treated as having an empty description.
prompts = [
    generate_prompt_for_server(record['content_name'], record.get('description', ''))
    for _, record in df.iterrows()
]

# Attach the generated prompts to the frame as the 'prompt' column.
df['prompt'] = prompts

# Show the first few server names next to their generated prompts.
print("\nDataset with Added Prompts:")
preview = df[['content_name', 'prompt']].head()
print(preview)

# Convert the augmented frame back into a Dataset and write it to disk.
from datasets import Dataset
modified_ds = Dataset.from_pandas(df)
modified_ds.save_to_disk("./modified_mcp_dataset")

# Tell the user where the data went and how to load it again.
for line in (
    "\nModified dataset saved to ./modified_mcp_dataset",
    "You can load it in your RL environment with:",
    "from datasets import load_from_disk",
    "custom_dataset = load_from_disk('./modified_mcp_dataset')",
):
    print(line)

# Print the first three rows as sample (query, tool, tool type) records.
print("\nExample usage in RL environment:")
print("=" * 60)
for _, sample in df.head(3).iterrows():
    print(f"User Query: {sample['prompt']}")
    print(f"Available Tool: {sample['content_name']}")
    print(f"Tool Type: {sample['subfield']}")
    print("-" * 40)