mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-19 12:57:58 +00:00
init commit
This commit is contained in:
parent
c189fc3351
commit
0e660a7429
19 changed files with 11250 additions and 0 deletions
113
environments/hack0/ufc_env/get_images.py
Normal file
113
environments/hack0/ufc_env/get_images.py
Normal file
|
|
@ -0,0 +1,113 @@
|
|||
import os
|
||||
import re
|
||||
import pandas as pd
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import time
|
||||
import random
|
||||
|
||||
# 1. Load your CSV
|
||||
df = pd.read_csv("fighter_stats.csv")
|
||||
|
||||
# List of different user agents to rotate between
|
||||
USER_AGENTS = [
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.59',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
||||
]
|
||||
|
||||
# List of different accept languages
|
||||
ACCEPT_LANGUAGES = [
|
||||
'en-US,en;q=0.9',
|
||||
'en-GB,en;q=0.9',
|
||||
'en-CA,en;q=0.9',
|
||||
'en-AU,en;q=0.9',
|
||||
'en-NZ,en;q=0.9',
|
||||
]
|
||||
|
||||
def get_random_headers():
|
||||
"""Generate a random set of headers for each request."""
|
||||
return {
|
||||
'User-Agent': random.choice(USER_AGENTS),
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||
'Accept-Language': random.choice(ACCEPT_LANGUAGES),
|
||||
'Connection': 'keep-alive',
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
'Cache-Control': 'max-age=0',
|
||||
'Sec-Fetch-Dest': 'document',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-Site': 'none',
|
||||
'Sec-Fetch-User': '?1',
|
||||
'DNT': '1',
|
||||
}
|
||||
|
||||
# 2. Generate UFC profile URLs
|
||||
def make_url(name):
|
||||
"""Turn a fighter name into a UFC athlete URL slug."""
|
||||
if not isinstance(name, str) or not name.strip():
|
||||
return None
|
||||
slug = name.lower().strip()
|
||||
slug = re.sub(r"[''']", "", slug) # Remove apostrophes
|
||||
slug = re.sub(r"[^a-z\s-]", "", slug) # Keep letters, spaces, hyphens
|
||||
slug = re.sub(r"\s+", "-", slug) # Spaces → hyphens
|
||||
return f"https://www.ufc.com/athlete/{slug}"
|
||||
|
||||
df["UFC_Profile_URL"] = df["name"].apply(make_url)
|
||||
|
||||
# 3. Prepare output folder
|
||||
output_folder = "fighter_images"
|
||||
os.makedirs(output_folder, exist_ok=True)
|
||||
|
||||
# 4. Scrape & download each image
|
||||
def download_ufc_image(url, counter):
|
||||
"""Fetch the UFC athlete page, parse out the main profile image, and save it."""
|
||||
try:
|
||||
# Derive filename from URL slug
|
||||
slug = url.rstrip("/").split("/")[-1]
|
||||
filename = f"{slug}.jpg"
|
||||
path = os.path.join(output_folder, filename)
|
||||
|
||||
# Check if image already exists
|
||||
if os.path.exists(path):
|
||||
print(f"[i] Image already exists for {filename}")
|
||||
return
|
||||
|
||||
# Get fresh random headers for each request
|
||||
headers = get_random_headers()
|
||||
|
||||
resp = requests.get(url, headers=headers)
|
||||
resp.raise_for_status()
|
||||
soup = BeautifulSoup(resp.text, "html.parser")
|
||||
img_tag = soup.select_one("div.hero-profile__image-wrap img.hero-profile__image")
|
||||
if not img_tag or not img_tag.get("src"):
|
||||
print(f"[!] No image found at {url}")
|
||||
return
|
||||
img_url = img_tag["src"]
|
||||
|
||||
# Get fresh random headers for image download
|
||||
headers = get_random_headers()
|
||||
img_data = requests.get(img_url, headers=headers).content
|
||||
with open(path, "wb") as f:
|
||||
f.write(img_data)
|
||||
print(f"[✓] Saved {filename}")
|
||||
|
||||
# Add random delay between 1-3 seconds
|
||||
time.sleep(random.uniform(1, 3))
|
||||
|
||||
# After every 100 downloads, take a longer break
|
||||
if counter % 100 == 0:
|
||||
print(f"[i] Taking a 30-second break after {counter} downloads...")
|
||||
time.sleep(30)
|
||||
|
||||
except Exception as e:
|
||||
print(f"[✗] Error at {url}: {e}")
|
||||
|
||||
# 5. Iterate and download
|
||||
counter = 0
|
||||
for url in df["UFC_Profile_URL"].dropna().unique():
|
||||
counter += 1
|
||||
download_ufc_image(url, counter)
|
||||
Loading…
Add table
Add a link
Reference in a new issue