init commit

2026-04-19 12:57:58 +00:00 · 2025-05-18 16:58:42 -07:00 · 2025-05-18 16:58:42 -07:00 · 0e660a7429
commit 0e660a7429
parent c189fc3351
19 changed files with 11250 additions and 0 deletions
--- a/environments/hack0/ufc_env/get_images.py
+++ b/environments/hack0/ufc_env/get_images.py
@ -0,0 +1,113 @@
+import os
+import re
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+import time
+import random
+
+# 1. Load your CSV
+df = pd.read_csv("fighter_stats.csv")
+
+# List of different user agents to rotate between
+USER_AGENTS = [
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.59',
+    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+]
+
+# List of different accept languages
+ACCEPT_LANGUAGES = [
+    'en-US,en;q=0.9',
+    'en-GB,en;q=0.9',
+    'en-CA,en;q=0.9',
+    'en-AU,en;q=0.9',
+    'en-NZ,en;q=0.9',
+]
+
+def get_random_headers():
+    """Generate a random set of headers for each request."""
+    return {
+        'User-Agent': random.choice(USER_AGENTS),
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+        'Accept-Language': random.choice(ACCEPT_LANGUAGES),
+        'Connection': 'keep-alive',
+        'Upgrade-Insecure-Requests': '1',
+        'Cache-Control': 'max-age=0',
+        'Sec-Fetch-Dest': 'document',
+        'Sec-Fetch-Mode': 'navigate',
+        'Sec-Fetch-Site': 'none',
+        'Sec-Fetch-User': '?1',
+        'DNT': '1',
+    }
+
+# 2. Generate UFC profile URLs
+def make_url(name):
+    """Turn a fighter name into a UFC athlete URL slug."""
+    if not isinstance(name, str) or not name.strip():
+        return None
+    slug = name.lower().strip()
+    slug = re.sub(r"[''']", "", slug)        # Remove apostrophes
+    slug = re.sub(r"[^a-z\s-]", "", slug)   # Keep letters, spaces, hyphens
+    slug = re.sub(r"\s+", "-", slug)        # Spaces → hyphens
+    return f"https://www.ufc.com/athlete/{slug}"
+
+df["UFC_Profile_URL"] = df["name"].apply(make_url)
+
+# 3. Prepare output folder
+output_folder = "fighter_images"
+os.makedirs(output_folder, exist_ok=True)
+
+# 4. Scrape & download each image
+def download_ufc_image(url, counter):
+    """Fetch the UFC athlete page, parse out the main profile image, and save it."""
+    try:
+        # Derive filename from URL slug
+        slug = url.rstrip("/").split("/")[-1]
+        filename = f"{slug}.jpg"
+        path = os.path.join(output_folder, filename)
+        
+        # Check if image already exists
+        if os.path.exists(path):
+            print(f"[i] Image already exists for {filename}")
+            return
+        
+        # Get fresh random headers for each request
+        headers = get_random_headers()
+        
+        resp = requests.get(url, headers=headers)
+        resp.raise_for_status()
+        soup = BeautifulSoup(resp.text, "html.parser")
+        img_tag = soup.select_one("div.hero-profile__image-wrap img.hero-profile__image")
+        if not img_tag or not img_tag.get("src"):
+            print(f"[!] No image found at {url}")
+            return
+        img_url = img_tag["src"]
+        
+        # Get fresh random headers for image download
+        headers = get_random_headers()
+        img_data = requests.get(img_url, headers=headers).content
+        with open(path, "wb") as f:
+            f.write(img_data)
+        print(f"[✓] Saved {filename}")
+        
+        # Add random delay between 1-3 seconds
+        time.sleep(random.uniform(1, 3))
+        
+        # After every 100 downloads, take a longer break
+        if counter % 100 == 0:
+            print(f"[i] Taking a 30-second break after {counter} downloads...")
+            time.sleep(30)
+            
+    except Exception as e:
+        print(f"[✗] Error at {url}: {e}")
+
+# 5. Iterate and download
+counter = 0
+for url in df["UFC_Profile_URL"].dropna().unique():
+    counter += 1
+    download_ufc_image(url, counter)