added prompt for each MCP

2026-04-23 16:54:56 +00:00 · 2025-05-18 17:02:45 -07:00 · 2025-05-18 17:02:45 -07:00 · e78d2c5bc9
commit e78d2c5bc9
parent 366f82d5bc
1 changed files with 147 additions and 0 deletions
--- a/environments/hack0/MCP_datasets.py
+++ b/environments/hack0/MCP_datasets.py
@ -0,0 +1,147 @@
+from datasets import load_dataset
+import pandas as pd
+import random
+
+# Fetch the MCP servers dataset and convert its "train" split to a pandas
+# DataFrame. The rest of the script only works with `df`.
+try:
+    ds = load_dataset("DeepNLP/mcp-servers")
+    train_ds = ds["train"]
+    df = train_ds.to_pandas()
+except Exception as load_error:
+    print(f"Error loading dataset: {load_error}")
+    # Any failure above lands here: carry on with a three-row stand-in that
+    # has the columns the rest of the script reads, for demonstration only.
+    df = pd.DataFrame(
+        {
+            "content_name": ["AgentRPC", "Git", "Actors MCP Server"],
+            "description": [
+                "Toggle menu Node.js Go Python",
+                "Tools to read, search, and manipulate code",
+                "Use 3,000+ pre-built cloud tools",
+            ],
+            "subfield": ["MCP SERVER"] * 3,
+            "field": ["AI AGENT"] * 3,
+        }
+    )
+
+# Sentence templates shared by every server (they are not keyed by server type).
+# generate_prompt_for_server() picks one at random and fills it with
+# str.format(): {action} is the chosen action phrase, while {object} and {tool}
+# both receive the server name. A template may omit any of the placeholders.
+general_templates = [
+    "I need to {action} {object}. Can you help me?",
+    "How can I {action} using {tool}?",
+    "I'm trying to {action}. What's the best way to do this?",
+    "Can you assist me with {action}?",
+    "What's the process for {action} with {tool}?"
+]
+
+# Hand-written action phrases, keyed by server name. The key must equal the
+# row's `content_name` exactly (plain dict membership test, so case-sensitive).
+# generate_prompt_for_server() only consults this table when it could not
+# derive any action from the server's description.
+server_specific_actions = {
+    "AgentRPC": [
+        "call a remote procedure",
+        "establish a connection with a remote server",
+        "execute a function on another machine",
+        "implement RPC in my application",
+        "set up agent communication"
+    ],
+    "Git": [
+        "merge my branch",
+        "resolve a merge conflict",
+        "check the commit history",
+        "revert to a previous commit",
+        "create a new branch"
+    ],
+    "AWS KB Retrieval": [
+        "find information about AWS services",
+        "query the AWS knowledge base",
+        "lookup AWS documentation",
+        "get help with AWS configuration",
+        "understand AWS pricing"
+    ],
+    "Anki": [
+        "create flashcards for studying",
+        "improve my spaced repetition system",
+        "organize my study notes",
+        "set up a memory training system",
+        "track my learning progress"
+    ],
+    "ArangoDB": [
+        "query a graph database",
+        "store connected data",
+        "implement a multi-model database",
+        "perform graph traversals",
+        "optimize my database queries"
+    ]
+}
+
+# Generic fallback action phrases: used for a server when no action could be
+# derived from its description and it has no entry in server_specific_actions.
+default_actions = [
+    "connect to a server",
+    "use an API",
+    "access external data",
+    "integrate with a tool",
+    "automate a process"
+]
+
+def generate_prompt_for_server(server_name, description) -> str:
+    """Generate a contextually appropriate user prompt for one MCP server.
+
+    Candidate action phrases are taken from the first source that yields any:
+
+    1. "<verb> <next word>" pairs mined from ``description`` for a fixed
+       list of verbs (first occurrence of each verb only),
+    2. the ``server_specific_actions`` entry for ``server_name``,
+    3. the generic ``default_actions``.
+
+    One action and one entry of ``general_templates`` are then chosen with
+    ``random.choice`` and combined via ``str.format``.
+
+    Args:
+        server_name: Name of the server. Used as the lookup key into
+            ``server_specific_actions`` and substituted for both the
+            ``{object}`` and ``{tool}`` template placeholders.
+        description: Free-text description of the server. Anything that is
+            not a non-empty ``str`` (``None``, NaN, ``pd.NA``, ...) is ignored.
+
+    Returns:
+        The filled-in prompt string.
+    """
+    verbs = ["use", "toggle", "enable", "explore", "search", "read", "process", "connect", "build"]
+    # Punctuation that may wrap or trail the word following a verb; stripped
+    # so it does not leak into the prompt ("build agents." -> "build agents").
+    edge_punctuation = ".,;:!?()[]{}\"'"
+
+    # Extract potential actions from the description if one is available.
+    actions = []
+    # Type-check before truth-testing: bool(pd.NA) raises TypeError, so the
+    # isinstance() test has to come first to skip missing values safely.
+    if isinstance(description, str) and description:
+        words = description.lower().split()
+
+        # Index of the first occurrence of every token, built in one pass.
+        first_index = {}
+        for position, word in enumerate(words):
+            first_index.setdefault(word, position)
+
+        # Iterate the verbs (not the words) so actions keep the verb-list order.
+        for verb in verbs:
+            position = first_index.get(verb)
+            # Skip verbs that are absent or have no following word.
+            if position is None or position >= len(words) - 1:
+                continue
+            target = words[position + 1].strip(edge_punctuation)
+            # A follower made up solely of punctuation carries no meaning.
+            if target:
+                actions.append(f"{verb} {target}")
+
+    # If nothing usable was extracted, fall back to the predefined phrases.
+    if not actions:
+        actions = server_specific_actions.get(server_name, default_actions)
+
+    # Pick a random action and template (action first, then template).
+    action = random.choice(actions)
+    template = random.choice(general_templates)
+
+    # Fill in the template; placeholders a template does not use are ignored.
+    return template.format(
+        action=action,
+        object=server_name,
+        tool=server_name,
+    )
+
+# Build one prompt per dataset row, in row order. A missing description
+# column falls back to an empty string.
+prompts = [
+    generate_prompt_for_server(record['content_name'], record.get('description', ''))
+    for _, record in df.iterrows()
+]
+
+# Attach the prompts to the frame as a new "prompt" column
+df['prompt'] = prompts
+
+# Show the first few name/prompt pairs
+print("\nDataset with Added Prompts:")
+print(df[['content_name', 'prompt']].head())
+
+# Convert back to a Hugging Face Dataset and write it to disk
+from datasets import Dataset
+modified_ds = Dataset.from_pandas(df)
+modified_ds.save_to_disk("./modified_mcp_dataset")
+
+# Tell the user where the data went and how to load it again
+for message in (
+    "\nModified dataset saved to ./modified_mcp_dataset",
+    "You can load it in your RL environment with:",
+    "from datasets import load_from_disk",
+    "custom_dataset = load_from_disk('./modified_mcp_dataset')",
+):
+    print(message)
+
+# Print the first three rows the way an RL environment might present them
+print("\nExample usage in RL environment:")
+print("=" * 60)
+for _, sample in df.head(3).iterrows():
+    print(f"User Query: {sample['prompt']}")
+    print(f"Available Tool: {sample['content_name']}")
+    print(f"Tool Type: {sample['subfield']}")
+    print("-" * 40)