mirror of
https://github.com/NousResearch/atropos.git
synced 2026-04-19 12:57:58 +00:00
Data generation scripts to create a Hugging Face-compatible dataset
This commit is contained in:
parent
3fde5cbda8
commit
67a49f27b9
4 changed files with 251 additions and 0 deletions
69
environments/hack0/prepare_push_hf_dataset.py
Normal file
69
environments/hack0/prepare_push_hf_dataset.py
Normal file
|
|
@ -0,0 +1,69 @@
|
|||
import os
|
||||
import json
|
||||
import numpy as np
|
||||
import trimesh
|
||||
from datasets import Dataset, Features, Value, Image
|
||||
from huggingface_hub import login
|
||||
|
||||
# Build an image/label/STL-feature dataset from local files and push it to the
# Hugging Face Hub.
#
# Expected inputs (all relative to the current working directory):
#   dataset/images/<id>_<anything>.png  - images; the id is the text before "_"
#   dataset/stls/<id>.stl               - optional mesh for that id
#   dataset/labels.json                 - JSON object mapping id -> label

# Log in to the HF Hub only when a token is supplied through the environment.
# Passing token=None to login() falls back to an interactive prompt, which
# blocks unattended runs; without HF_TOKEN we rely on credentials cached by an
# earlier `huggingface-cli login`.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)

# Paths
image_dir = "dataset/images"
stl_dir = "dataset/stls"
labels_path = "dataset/labels.json"

# Length of the per-mesh feature vector: centroid (3) + bounding-box extents
# (3) + volume (1). The placeholder used for missing/unreadable meshes has the
# same length so every row carries a vector of identical size.
STL_FEATURE_DIM = 7

# Load labels
with open(labels_path, "r", encoding="utf-8") as f:
    labels = json.load(f)


def _load_stl_features(stl_path):
    """Return [cx, cy, cz, ex, ey, ez, volume] for the mesh at *stl_path*.

    Falls back to a zero vector of the same length when the file does not
    exist or cannot be processed; failures are reported on stdout and never
    abort the run (best-effort by design).
    """
    placeholder = [0.0] * STL_FEATURE_DIM
    if not os.path.exists(stl_path):
        return placeholder
    try:
        mesh = trimesh.load(stl_path, force="mesh")
        bbox = mesh.bounding_box.extents
        centroid = mesh.centroid
        volume = mesh.volume
        # Convert to plain Python floats so the vector serializes cleanly.
        return [float(v) for v in (*centroid, *bbox, volume)]
    except Exception as e:
        # Any loader/geometry failure degrades to the placeholder vector.
        print(f"⚠️ Failed to process {stl_path}: {e}")
        return placeholder


# Build data entries
data = []
# Sort the listing: os.listdir() returns entries in arbitrary order, and a
# stable order makes the pushed dataset reproducible between runs.
for image_filename in sorted(os.listdir(image_dir)):
    if not image_filename.endswith(".png"):
        continue

    # Extract the base ID. Drop the extension first so that a name without an
    # underscore ("abc.png") yields "abc" rather than "abc.png".
    base_id = os.path.splitext(image_filename)[0].split("_")[0]

    stl_path = os.path.join(stl_dir, f"{base_id}.stl")

    data.append({
        "image": os.path.join(image_dir, image_filename),
        "label": labels.get(base_id, "unknown"),
        # Stored as a JSON string to match the Value("string") column below.
        "stl_features": json.dumps(_load_stl_features(stl_path)),
        "id": base_id,
    })

if not data:
    # Without rows there are no columns to cast and nothing worth uploading.
    raise SystemExit(f"No .png images found in {image_dir}; nothing to push.")

# Define dataset schema
features = Features({
    "id": Value("string"),
    "image": Image(),  # Load images from file paths
    "label": Value("string"),
    "stl_features": Value("string"),  # JSON-encoded list of floats
})

# Create Dataset
dataset = Dataset.from_list(data).cast(features)

# Push to Hub
dataset.push_to_hub("venkatacrc/stl-image-dataset", private=True)
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue