mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-23 16:54:56 +00:00
147 lines
No EOL
4.6 KiB
Python
147 lines
No EOL
4.6 KiB
Python
from datasets import load_dataset
|
|
import pandas as pd
|
|
import random
|
|
|
|
# Load the MCP servers dataset
|
|
try:
|
|
ds = load_dataset("DeepNLP/mcp-servers")
|
|
train_ds = ds["train"]
|
|
df = train_ds.to_pandas()
|
|
except Exception as e:
|
|
print(f"Error loading dataset: {e}")
|
|
# Create dummy data for demonstration
|
|
df = pd.DataFrame({
|
|
'content_name': ['AgentRPC', 'Git', 'Actors MCP Server'],
|
|
'description': [
|
|
'Toggle menu Node.js Go Python',
|
|
'Tools to read, search, and manipulate code',
|
|
'Use 3,000+ pre-built cloud tools'
|
|
],
|
|
'subfield': ['MCP SERVER', 'MCP SERVER', 'MCP SERVER'],
|
|
'field': ['AI AGENT', 'AI AGENT', 'AI AGENT']
|
|
})
|
|
|
|
# Define template prompts based on server types
|
|
general_templates = [
|
|
"I need to {action} {object}. Can you help me?",
|
|
"How can I {action} using {tool}?",
|
|
"I'm trying to {action}. What's the best way to do this?",
|
|
"Can you assist me with {action}?",
|
|
"What's the process for {action} with {tool}?"
|
|
]
|
|
|
|
# Define specific actions based on server type
|
|
server_specific_actions = {
|
|
"AgentRPC": [
|
|
"call a remote procedure",
|
|
"establish a connection with a remote server",
|
|
"execute a function on another machine",
|
|
"implement RPC in my application",
|
|
"set up agent communication"
|
|
],
|
|
"Git": [
|
|
"merge my branch",
|
|
"resolve a merge conflict",
|
|
"check the commit history",
|
|
"revert to a previous commit",
|
|
"create a new branch"
|
|
],
|
|
"AWS KB Retrieval": [
|
|
"find information about AWS services",
|
|
"query the AWS knowledge base",
|
|
"lookup AWS documentation",
|
|
"get help with AWS configuration",
|
|
"understand AWS pricing"
|
|
],
|
|
"Anki": [
|
|
"create flashcards for studying",
|
|
"improve my spaced repetition system",
|
|
"organize my study notes",
|
|
"set up a memory training system",
|
|
"track my learning progress"
|
|
],
|
|
"ArangoDB": [
|
|
"query a graph database",
|
|
"store connected data",
|
|
"implement a multi-model database",
|
|
"perform graph traversals",
|
|
"optimize my database queries"
|
|
]
|
|
}
|
|
|
|
# Default actions for any server not specifically defined
|
|
default_actions = [
|
|
"connect to a server",
|
|
"use an API",
|
|
"access external data",
|
|
"integrate with a tool",
|
|
"automate a process"
|
|
]
|
|
|
|
def generate_prompt_for_server(server_name, description):
|
|
"""Generate a contextually appropriate prompt for a given server"""
|
|
|
|
# Extract potential actions from description if available
|
|
actions = []
|
|
if description and isinstance(description, str):
|
|
words = description.lower().split()
|
|
verbs = ["use", "toggle", "enable", "explore", "search", "read", "process", "connect", "build"]
|
|
for verb in verbs:
|
|
if verb in words:
|
|
idx = words.index(verb)
|
|
if idx < len(words) - 1:
|
|
actions.append(f"{verb} {words[idx+1]}")
|
|
|
|
# If we couldn't extract meaningful actions, use predefined ones
|
|
if not actions:
|
|
if server_name in server_specific_actions:
|
|
actions = server_specific_actions[server_name]
|
|
else:
|
|
actions = default_actions
|
|
|
|
# Get a random action and template
|
|
action = random.choice(actions)
|
|
template = random.choice(general_templates)
|
|
|
|
# Fill in the template
|
|
prompt = template.format(
|
|
action=action,
|
|
object=server_name,
|
|
tool=server_name
|
|
)
|
|
|
|
return prompt
|
|
|
|
# Generate prompts for each entry in the dataset
|
|
prompts = []
|
|
for idx, row in df.iterrows():
|
|
server_name = row['content_name']
|
|
description = row.get('description', '')
|
|
prompt = generate_prompt_for_server(server_name, description)
|
|
prompts.append(prompt)
|
|
|
|
# Add the prompts as a new column
|
|
df['prompt'] = prompts
|
|
|
|
# Preview the results
|
|
print("\nDataset with Added Prompts:")
|
|
print(df[['content_name', 'prompt']].head())
|
|
|
|
# Save the modified dataset
|
|
from datasets import Dataset
|
|
modified_ds = Dataset.from_pandas(df)
|
|
modified_ds.save_to_disk("./modified_mcp_dataset")
|
|
|
|
print("\nModified dataset saved to ./modified_mcp_dataset")
|
|
print("You can load it in your RL environment with:")
|
|
print("from datasets import load_from_disk")
|
|
print("custom_dataset = load_from_disk('./modified_mcp_dataset')")
|
|
|
|
# To demonstrate how this would look in your RL environment
|
|
print("\nExample usage in RL environment:")
|
|
print("="*60)
|
|
for idx, row in df.head(3).iterrows():
|
|
print(f"User Query: {row['prompt']}")
|
|
print(f"Available Tool: {row['content_name']}")
|
|
print(f"Tool Type: {row['subfield']}")
|
|
print("-"*40) |