mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-28 17:29:30 +00:00
Integrate aniemerg wikipedia (#143)
* initial commit * initial draft of wikipedia article creation environment * add openai for rollouts, update requirements, create script to run, etc. * add configuration, add debugging, fix tool calls, prevent wikipedia access * now creates html file * fix output for html page * check in Claude plan * fixed formatting and other issues * add zip file * update README * linting, moved to community folder * linting * linting * linting * linting --------- Co-authored-by: Allan Niemerg <niemerg@gmail.com>
This commit is contained in:
parent
b774e97215
commit
f21154ff49
14 changed files with 4480 additions and 0 deletions
203
environments/community/wikipedia_research/get_examples.py
Normal file
203
environments/community/wikipedia_research/get_examples.py
Normal file
|
|
@ -0,0 +1,203 @@
|
|||
#!/usr/bin/env python3
|
||||
import json
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
def collect_wikipedia_articles(
    num_articles=50, min_size=2000, days_ago=90, save_path="wikipedia_articles.json"
):
    """
    Collect recently created Wikipedia articles and save them to a JSON file.
    Includes HTML, plain text, and wikitext (source) formats.

    Also writes one ``<Title>.wiki`` file per selected article (raw wikitext)
    into the current working directory.

    Parameters:
    - num_articles: Number of articles to keep after size filtering/sorting
    - min_size: Minimum article size in bytes
    - days_ago: How far back to look for articles
    - save_path: Where to save the JSON file

    Returns:
    - The list of collected article dictionaries (same data saved to save_path).

    Raises:
    - requests.HTTPError: if the Wikipedia API returns an HTTP error status.
    - requests.Timeout: if an API call exceeds the request timeout.
    """
    # API endpoint and headers
    API_ENDPOINT = "https://en.wikipedia.org/w/api.php"
    HEADERS = {"User-Agent": "WikipediaArticleCollector/0.1 (your-email@example.com)"}
    # requests has NO default timeout — without this, a stalled connection
    # would hang the script forever.
    REQUEST_TIMEOUT = 30

    # Calculate the date range
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_ago)

    # Format dates for API (rcdir="older" walks from rcstart back to rcend)
    rcstart = end_date.strftime("%Y%m%d%H%M%S")
    rcend = start_date.strftime("%Y%m%d%H%M%S")

    # Initialize variables
    all_articles = []
    continue_param = None
    batch_size = 50  # API max is usually 50

    print(f"Starting article collection from {start_date.date()} to {end_date.date()}")

    # Collect newly created, non-redirect, main-namespace pages with pagination
    while len(all_articles) < 500:  # Cap at 500 to avoid too many requests
        # Set up parameters
        params = {
            "action": "query",
            "format": "json",
            "list": "recentchanges",
            "rctype": "new",
            "rcnamespace": "0",  # Main article namespace
            "rclimit": batch_size,
            "rcprop": "title|timestamp|ids|sizes|user",
            "rcshow": "!redirect",
            "rcstart": rcstart,
            "rcend": rcend,
            "rcdir": "older",
        }

        # Add continue parameter if we have one
        if continue_param:
            params.update(continue_param)

        # Make the request; fail loudly on HTTP errors instead of trying to
        # JSON-decode an error page.
        response = requests.get(
            API_ENDPOINT, headers=HEADERS, params=params, timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
        data = response.json()

        # Extract articles
        if "query" in data and "recentchanges" in data["query"]:
            batch = data["query"]["recentchanges"]
            all_articles.extend(batch)
            print(f"Retrieved {len(batch)} articles, total: {len(all_articles)}")

            # Check if we need to continue
            if "continue" in data:
                continue_param = data["continue"]
            else:
                break
        else:
            print("No more results or error in API response.")
            break

        # Be nice to the API
        time.sleep(1)

    # Filter articles by size ("newlen" is the page size after creation)
    filtered_articles = [
        article for article in all_articles if article.get("newlen", 0) >= min_size
    ]
    print(f"Articles after size filtering: {len(filtered_articles)}")

    def get_article_wikitext(title):
        """Fetch the raw wikitext (source markup) of *title* via the revisions API."""
        params = {
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "revisions",
            "rvprop": "content",
            "rvslots": "main",
            "formatversion": "2",
        }

        response = requests.get(
            API_ENDPOINT, headers=HEADERS, params=params, timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
        return response.json()

    def get_article_html(title):
        """Fetch rendered HTML plus section/category/link metadata for *title*."""
        params = {
            "action": "parse",
            "format": "json",
            "page": title,
            "prop": "text|sections|categories|links|templates|externallinks",
            "formatversion": "2",
        }

        response = requests.get(
            API_ENDPOINT, headers=HEADERS, params=params, timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
        return response.json()

    # Sort by size (largest first) and take the top articles
    filtered_articles.sort(key=lambda x: x.get("newlen", 0), reverse=True)
    selected_articles = filtered_articles[:num_articles]

    # Collect detailed information for each article
    detailed_articles = []

    for index, article in enumerate(selected_articles, 1):
        title = article["title"]
        print(f"Processing article {index}/{len(selected_articles)}: {title}")

        # Get wikitext (source markup)
        wikitext_data = get_article_wikitext(title)

        # Get HTML and metadata
        html_data = get_article_html(title)

        # Skip if we couldn't get the article
        if "query" not in wikitext_data or "parse" not in html_data:
            print(f"Could not retrieve content for {title}, skipping...")
            continue

        # Extract wikitext (formatversion=2 returns "pages" as a list)
        pages = wikitext_data["query"]["pages"]
        if len(pages) > 0 and "revisions" in pages[0]:
            wikitext = pages[0]["revisions"][0]["slots"]["main"]["content"]
        else:
            wikitext = "Error: Could not retrieve wikitext"

        # Extract HTML and metadata
        html_content = html_data["parse"]["text"]
        sections = html_data["parse"].get("sections", [])
        categories = html_data["parse"].get("categories", [])
        external_links = html_data["parse"].get("externallinks", [])

        # Parse HTML to get plain text
        soup = BeautifulSoup(html_content, "html.parser")
        plain_text = soup.get_text(separator="\n")

        # Create article dictionary
        article_info = {
            "title": title,
            "page_id": html_data["parse"].get("pageid"),
            "timestamp": article["timestamp"],
            "size": article.get("newlen", 0),
            "num_sections": len(sections),
            "section_titles": [section.get("line", "") for section in sections],
            "num_external_links": len(external_links),
            "external_links": external_links,
            "categories": [cat.get("*", "") for cat in categories],
            "plain_text": plain_text,
            "html_content": html_content,
            "wikitext": wikitext,
        }

        detailed_articles.append(article_info)

        # Save individual wikitext file (sanitize path separators in the title)
        wikitext_filename = f"{title.replace(' ', '_').replace('/', '_')}.wiki"
        with open(wikitext_filename, "w", encoding="utf-8") as wiki_file:
            wiki_file.write(wikitext)
        print(f"Saved wikitext file: {wikitext_filename}")

        # Be nice to the API
        time.sleep(1)

    # Save to JSON file
    with open(save_path, "w", encoding="utf-8") as json_file:
        json.dump(detailed_articles, json_file, indent=2, ensure_ascii=False)

    print(
        f"\nCollection complete! {len(detailed_articles)} articles saved to {save_path}"
    )
    return detailed_articles
|
||||
|
||||
|
||||
# Example usage
|
||||
# Example usage
if __name__ == "__main__":
    # Demo run: 10 sizeable articles (>= 5 KB) created within the last month,
    # written to the default 'wikipedia_articles.json'.
    articles = collect_wikipedia_articles(num_articles=10, min_size=5000, days_ago=30)

    # Summarize what was collected.
    print("\nCollected articles:")
    for i, article in enumerate(articles, start=1):
        print(f"{i}. {article['title']} ({article['size']} bytes)")
|
||||
Loading…
Add table
Add a link
Reference in a new issue