#!/usr/bin/env python3

import json
import time
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup


def collect_wikipedia_articles(
    num_articles=50, min_size=2000, days_ago=90, save_path="wikipedia_articles.json"
):
    """
    Collect recently created Wikipedia articles and save them to a JSON file.
    Includes HTML, plain text, and wikitext (source) formats.

    Parameters:
    - num_articles: Number of articles to collect
    - min_size: Minimum article size in bytes
    - days_ago: How far back to look for articles
    - save_path: Where to save the JSON file
    """
    # API endpoint and headers
    API_ENDPOINT = "https://en.wikipedia.org/w/api.php"
    HEADERS = {"User-Agent": "WikipediaArticleCollector/0.1 (your-email@example.com)"}

    # Calculate the date range
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_ago)

    # Format dates for API
    rcstart = end_date.strftime("%Y%m%d%H%M%S")
    rcend = start_date.strftime("%Y%m%d%H%M%S")
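    # With rcdir="older" the API walks backwards in time, so rcstart is the
    # newer timestamp (now) and rcend is the older cutoff (days_ago in the past).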

    # Initialize variables
    all_articles = []
    continue_param = None
    batch_size = 50  # API max is usually 50

    print(f"Starting article collection from {start_date.date()} to {end_date.date()}")

    # Collect articles with pagination
    while len(all_articles) < 500:  # Cap at 500 to avoid too many requests
        # Set up parameters
        params = {
            "action": "query",
            "format": "json",
            "list": "recentchanges",
            "rctype": "new",
            "rcnamespace": "0",  # Main article namespace
            "rclimit": batch_size,
            "rcprop": "title|timestamp|ids|sizes|user",
            "rcshow": "!redirect",
            "rcstart": rcstart,
            "rcend": rcend,
            "rcdir": "older",
        }
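        # "rctype": "new" limits results to page creations and "rcshow": "!redirect"
        # drops redirects, so every hit should be a genuinely new article.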

        # Add continue parameter if we have one
        if continue_param:
            params.update(continue_param)

        # Make the request
        response = requests.get(API_ENDPOINT, headers=HEADERS, params=params)
        data = response.json()

        # Extract articles
        if "query" in data and "recentchanges" in data["query"]:
            batch = data["query"]["recentchanges"]
            all_articles.extend(batch)
            print(f"Retrieved {len(batch)} articles, total: {len(all_articles)}")

            # Check if we need to continue
            if "continue" in data:
                continue_param = data["continue"]
            else:
                break
        else:
            print("No more results or error in API response.")
            break

        # Be nice to the API
        time.sleep(1)

    # Filter articles by size
    filtered_articles = [
        article for article in all_articles if article.get("newlen", 0) >= min_size
    ]
    print(f"Articles after size filtering: {len(filtered_articles)}")

    # Function to get article wikitext (source)
    def get_article_wikitext(title):
        params = {
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "revisions",
            "rvprop": "content",
            "rvslots": "main",
            "formatversion": "2",
        }

        response = requests.get(API_ENDPOINT, headers=HEADERS, params=params)
        return response.json()

    # Function to get article HTML and metadata
    def get_article_html(title):
        params = {
            "action": "parse",
            "format": "json",
            "page": title,
            "prop": "text|sections|categories|links|templates|externallinks",
            "formatversion": "2",
        }

        response = requests.get(API_ENDPOINT, headers=HEADERS, params=params)
        return response.json()

    # Sort by size and take the top articles
    filtered_articles.sort(key=lambda x: x.get("newlen", 0), reverse=True)
    selected_articles = filtered_articles[:num_articles]

    # Collect detailed information for each article
    detailed_articles = []

    for index, article in enumerate(selected_articles, 1):
        title = article["title"]
        print(f"Processing article {index}/{len(selected_articles)}: {title}")

        # Get wikitext (source markup)
        wikitext_data = get_article_wikitext(title)

        # Get HTML and metadata
        html_data = get_article_html(title)

        # Skip if we couldn't get the article
        if "query" not in wikitext_data or "parse" not in html_data:
            print(f"Could not retrieve content for {title}, skipping...")
            continue

        # Extract wikitext
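        # With formatversion=2 the pages come back as a list rather than a dict
        # keyed by page id, so pages[0] below is the requested article.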
        pages = wikitext_data["query"]["pages"]
        if len(pages) > 0 and "revisions" in pages[0]:
            wikitext = pages[0]["revisions"][0]["slots"]["main"]["content"]
        else:
            wikitext = "Error: Could not retrieve wikitext"

        # Extract HTML and metadata
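        # Under formatversion=2, "text" in the parse result is the rendered HTML
        # as a plain string (no nested "*" key).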
        html_content = html_data["parse"]["text"]
        sections = html_data["parse"].get("sections", [])
        categories = html_data["parse"].get("categories", [])
        external_links = html_data["parse"].get("externallinks", [])

        # Parse HTML to get plain text
        soup = BeautifulSoup(html_content, "html.parser")
        plain_text = soup.get_text(separator="\n")
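        # get_text(separator="\n") keeps a rough line structure instead of running
        # all of the article text together.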

        # Create article dictionary
        article_info = {
            "title": title,
            "page_id": html_data["parse"].get("pageid"),
            "timestamp": article["timestamp"],
            "size": article.get("newlen", 0),
            "num_sections": len(sections),
            "section_titles": [section.get("line", "") for section in sections],
            "num_external_links": len(external_links),
            "external_links": external_links,
            # formatversion=2 names the category key "category"; fall back to the
            # legacy "*" key just in case.
            "categories": [
                cat.get("category", cat.get("*", "")) for cat in categories
            ],
            "plain_text": plain_text,
            "html_content": html_content,
            "wikitext": wikitext,
        }

        detailed_articles.append(article_info)

        # Save individual wikitext file
        wikitext_filename = f"{title.replace(' ', '_').replace('/', '_')}.wiki"
        with open(wikitext_filename, "w", encoding="utf-8") as wiki_file:
            wiki_file.write(wikitext)
        print(f"Saved wikitext file: {wikitext_filename}")

        # Be nice to the API
        time.sleep(1)

    # Save to JSON file
    with open(save_path, "w", encoding="utf-8") as json_file:
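        # ensure_ascii=False keeps non-Latin titles and text human-readable in the file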
        json.dump(detailed_articles, json_file, indent=2, ensure_ascii=False)

    print(
        f"\nCollection complete! {len(detailed_articles)} articles saved to {save_path}"
    )
    return detailed_articles


# Example usage
if __name__ == "__main__":
    # Collect 10 articles and save to 'wikipedia_articles.json'
    articles = collect_wikipedia_articles(num_articles=10, min_size=5000, days_ago=30)

    # Print titles of collected articles
    print("\nCollected articles:")
    for i, article in enumerate(articles, 1):
        print(f"{i}. {article['title']} ({article['size']} bytes)")