# atropos/environments/community/physical_space_stl/judgement_model.py
# 2025-05-27 08:53:06 +10:00
# 53 lines, 2 KiB, Python
import numpy as np
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor
class CLIPScorer:
    """Scores rendered images against a target text description using CLIP.

    Wraps a Hugging Face CLIP model/processor pair and exposes a single
    batched scoring method that returns one similarity score per image.
    """

    def __init__(self, model_name="openai/clip-vit-base-patch32"):
        """Load the CLIP model and processor onto GPU if available.

        Args:
            model_name: Hugging Face hub identifier of the CLIP checkpoint.

        Raises:
            Exception: re-raised from `from_pretrained` if loading fails
                (bad model name, no network, etc.).
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        try:
            self.model = CLIPModel.from_pretrained(model_name).to(self.device)
            self.processor = CLIPProcessor.from_pretrained(model_name)
            print(f"CLIPScorer initialized on {self.device} with {model_name}")
        except Exception as e:
            print(
                f"Error initializing CLIPModel: {e}. Ensure model name is correct and you have internet."
            )
            # Leave the instance in an explicit "unusable" state before
            # propagating, so score_images can fail soft if called anyway.
            self.model = None
            self.processor = None
            raise

    @torch.no_grad()  # Ensure no gradients are computed during inference
    def score_images(self, images_np_list: list, target_text_description: str):
        """Return one CLIP image-text similarity score per input image.

        Args:
            images_np_list: list of HxWxC numpy arrays (cast to uint8).
            target_text_description: the single text prompt to score against.

        Returns:
            list[float]: `logits_per_image` similarity scores, one per input
            image; a list of 0.0 (one per image) on any failure so callers
            always receive a score for every image.
        """
        if not self.model or not self.processor:
            print("CLIPScorer not properly initialized.")
            return [0.0] * len(images_np_list)  # Low score on error
        if not images_np_list:
            # Nothing to score; avoids handing an empty batch to the processor.
            return []
        try:
            pil_images = [
                Image.fromarray(img_arr.astype(np.uint8)) for img_arr in images_np_list
            ]
            inputs = self.processor(
                text=[target_text_description],  # Single text prompt
                images=pil_images,
                return_tensors="pt",
                padding=True,
                truncation=True,
            ).to(self.device)
            outputs = self.model(**inputs)
            # logits_per_image has shape (num_images, 1) for one prompt.
            # squeeze(-1) removes only the text dim, so a single image still
            # yields a one-element list (a bare squeeze() would collapse
            # (1, 1) to a scalar and require an isinstance workaround).
            return outputs.logits_per_image.squeeze(-1).tolist()
        except Exception as e:
            print(f"Error in CLIP scoring: {e}")
            return [0.0] * len(images_np_list)  # Low score on error