change OPD style

2026-04-22 16:48:57 +00:00 · 2026-02-19 17:08:27 -05:00 · 2026-02-19 17:08:27 -05:00 · 527433b5bc
commit 527433b5bc
parent 33f5696171
10 changed files with 452 additions and 90 deletions
--- a/atroposlib/api/server.py
+++ b/atroposlib/api/server.py
@ -145,9 +145,10 @@ class ScoredData(BaseModel):
    group_overrides: Optional[dict] = None
    images: Optional[Any] = None
    env_id: Optional[int] = None  # ID of the environment that generated this data
-    # On-policy distillation: top-K logprobs from teacher model
-    # Structure: [sequence][position][top_k] = [token_id, logprob]
-    onpolicydistill_logprobs: Optional[List[List[List[List]]]] = None
+    # On-policy distillation (new format): parallel token ids + logprobs.
+    # Shape for both: [sequence][position][top_k]
+    distill_token_ids: Optional[List[List[List[int]]]] = None
+    distill_logprobs: Optional[List[List[List[float]]]] = None

    @field_validator("messages", mode="before")
    @classmethod
@ -185,7 +186,8 @@ def _scored_data_to_dict(scored_data: ScoredData) -> Dict[str, Any]:
        "group_overrides": scored_data.group_overrides,
        "images": scored_data.images,
        "env_id": scored_data.env_id,
-        "onpolicydistill_logprobs": scored_data.onpolicydistill_logprobs,
+        "distill_token_ids": scored_data.distill_token_ids,
+        "distill_logprobs": scored_data.distill_logprobs,
    }