atropos/environments/community/wikipedia_research/get_examples.py

#!/usr/bin/env python3
import json
import time
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup


def collect_wikipedia_articles(
    num_articles=50, min_size=2000, days_ago=90, save_path="wikipedia_articles.json"
):
    """
    Collect recently created Wikipedia articles and save them to a JSON file.
    Includes HTML, plain text, and wikitext (source) formats.

    Parameters:
    - num_articles: Number of articles to collect
    - min_size: Minimum article size in bytes
    - days_ago: How far back to look for articles
    - save_path: Where to save the JSON file
    """
    # API endpoint and headers
    API_ENDPOINT = "https://en.wikipedia.org/w/api.php"
    HEADERS = {"User-Agent": "WikipediaArticleCollector/0.1 (your-email@example.com)"}

    # Calculate the date range
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_ago)

    # Format dates for API
    rcstart = end_date.strftime("%Y%m%d%H%M%S")
    rcend = start_date.strftime("%Y%m%d%H%M%S")

    # Initialize variables
    all_articles = []
    continue_param = None
    batch_size = 50  # API max is usually 50

    print(f"Starting article collection from {start_date.date()} to {end_date.date()}")

    # Collect articles with pagination
    while len(all_articles) < 500:  # Cap at 500 to avoid too many requests
        # Set up parameters
        params = {
            "action": "query",
            "format": "json",
            "list": "recentchanges",
            "rctype": "new",
            "rcnamespace": "0",  # Main article namespace
            "rclimit": batch_size,
            "rcprop": "title|timestamp|ids|sizes|user",
            "rcshow": "!redirect",
            "rcstart": rcstart,
            "rcend": rcend,
            "rcdir": "older",
        }

        # Add continue parameter if we have one
        if continue_param:
            params.update(continue_param)

        # Make the request
        response = requests.get(API_ENDPOINT, headers=HEADERS, params=params)
        data = response.json()

        # Extract articles
        if "query" in data and "recentchanges" in data["query"]:
            batch = data["query"]["recentchanges"]
            all_articles.extend(batch)
            print(f"Retrieved {len(batch)} articles, total: {len(all_articles)}")

            # Check if we need to continue
            if "continue" in data:
                continue_param = data["continue"]
            else:
                break
        else:
            print("No more results or error in API response.")
            break

        # Be nice to the API
        time.sleep(1)

    # Filter articles by size
    filtered_articles = [
        article for article in all_articles if article.get("newlen", 0) >= min_size
    ]
    print(f"Articles after size filtering: {len(filtered_articles)}")

    # Function to get article wikitext (source)
    def get_article_wikitext(title):
        params = {
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "revisions",
            "rvprop": "content",
            "rvslots": "main",
            "formatversion": "2",
        }
        response = requests.get(API_ENDPOINT, headers=HEADERS, params=params)
        return response.json()

    # Function to get article HTML and metadata
    def get_article_html(title):
        params = {
            "action": "parse",
            "format": "json",
            "page": title,
            "prop": "text|sections|categories|links|templates|externallinks",
            "formatversion": "2",
        }
        response = requests.get(API_ENDPOINT, headers=HEADERS, params=params)
        return response.json()

    # Sort by size and take the top articles
    filtered_articles.sort(key=lambda x: x.get("newlen", 0), reverse=True)
    selected_articles = filtered_articles[:num_articles]

    # Collect detailed information for each article
    detailed_articles = []
    for index, article in enumerate(selected_articles, 1):
        title = article["title"]
        print(f"Processing article {index}/{len(selected_articles)}: {title}")

        # Get wikitext (source markup)
        wikitext_data = get_article_wikitext(title)

        # Get HTML and metadata
        html_data = get_article_html(title)

        # Skip if we couldn't get the article
        if "query" not in wikitext_data or "parse" not in html_data:
            print(f"Could not retrieve content for {title}, skipping...")
            continue

        # Extract wikitext
        pages = wikitext_data["query"]["pages"]
        if len(pages) > 0 and "revisions" in pages[0]:
            wikitext = pages[0]["revisions"][0]["slots"]["main"]["content"]
        else:
            wikitext = "Error: Could not retrieve wikitext"

        # Extract HTML and metadata
        html_content = html_data["parse"]["text"]
        sections = html_data["parse"].get("sections", [])
        categories = html_data["parse"].get("categories", [])
        external_links = html_data["parse"].get("externallinks", [])

        # Parse HTML to get plain text
        soup = BeautifulSoup(html_content, "html.parser")
        plain_text = soup.get_text(separator="\n")

        # Create article dictionary
        article_info = {
            "title": title,
            "page_id": html_data["parse"].get("pageid"),
            "timestamp": article["timestamp"],
            "size": article.get("newlen", 0),
            "num_sections": len(sections),
            "section_titles": [section.get("line", "") for section in sections],
            "num_external_links": len(external_links),
            "external_links": external_links,
            # With formatversion=2 the name is under "category"; keep the
            # legacy "*" key as a fallback
            "categories": [
                cat.get("category", cat.get("*", "")) for cat in categories
            ],
            "plain_text": plain_text,
            "html_content": html_content,
            "wikitext": wikitext,
        }
        detailed_articles.append(article_info)

        # Save individual wikitext file
        wikitext_filename = f"{title.replace(' ', '_').replace('/', '_')}.wiki"
        with open(wikitext_filename, "w", encoding="utf-8") as wiki_file:
            wiki_file.write(wikitext)
        print(f"Saved wikitext file: {wikitext_filename}")

        # Be nice to the API
        time.sleep(1)

    # Save to JSON file
    with open(save_path, "w", encoding="utf-8") as json_file:
        json.dump(detailed_articles, json_file, indent=2, ensure_ascii=False)

    print(
        f"\nCollection complete! {len(detailed_articles)} articles saved to {save_path}"
    )

    return detailed_articles


# Example usage
if __name__ == "__main__":
    # Collect 10 articles and save to 'wikipedia_articles.json'
    articles = collect_wikipedia_articles(num_articles=10, min_size=5000, days_ago=30)

    # Print titles of collected articles
    print("\nCollected articles:")
    for i, article in enumerate(articles, 1):
        print(f"{i}. {article['title']} ({article['size']} bytes)")
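
    # Minimal sanity-check sketch (assumes the default save_path used above):
    # reload the saved JSON and confirm how many articles round-tripped.
    with open("wikipedia_articles.json", "r", encoding="utf-8") as f:
        reloaded = json.load(f)
    print(f"\nReloaded {len(reloaded)} articles from wikipedia_articles.json")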