atropos/example_trainer/data.py

"""
Data processing utilities for GRPO trainer.

Handles data retrieval from Atropos API, padding, batching,
and advantage normalization.

Also extracts inference logprobs for proper GRPO loss computation:
- Inference logprobs serve as π_old (reference policy) for importance sampling
- They are batched and padded to align token-by-token with training labels
"""

import json
import math
import time
from typing import List, Optional, Tuple

import numpy as np
import torch

from .api import get_batch


def pad_data_to_good_offset(
    data: dict,
    batch_size: int,
    extract_inference_logprobs: bool = True,
) -> Tuple[
    List[torch.Tensor],  # token_batches
    List[torch.Tensor],  # label_batches
    List[torch.Tensor],  # advantage_batches
    List[torch.Tensor],  # temperature_batches
    Optional[List[torch.Tensor]],  # inference_logprob_batches (aligned with labels)
]:
    """
    Pad and batch data from the Atropos API.

    Processes raw batch data into properly padded tensors suitable for training:
    - Pads token sequences to nearest multiple of 64
    - Normalizes advantage scores
    - Extracts temperature values
    - Extracts and pads inference logprobs for proper GRPO loss computation

    Args:
        data: Raw batch data from Atropos API
        batch_size: Size of each training batch
        extract_inference_logprobs: Whether to extract inference logprobs

    Returns:
        Tuple of (token_batches, label_batches, advantage_batches, temperature_batches, inference_logprob_batches)
        inference_logprob_batches is None if extract_inference_logprobs=False or no logprobs in data

    Note:
        inference_logprob_batches are padded with 0.0 at positions where labels == -100.
        This allows token-by-token alignment during GRPO loss computation.
    """
    max_token_len = max(
        [max([len(x) for x in item["tokens"]]) for item in data["batch"]]
    )

    # Pad to nearest multiple of 64 for GPU efficiency
    good_multiple = 64
    if (max_token_len - 1) % (good_multiple) != 0:
        max_token_len = math.ceil((max_token_len - 1) / (good_multiple)) * good_multiple
        token_setup_len = max_token_len + 1  # +1 for causal shift
    else:
        token_setup_len = max_token_len
        max_token_len = max_token_len - 1  # -1 for causal shift

    # Process all items
    input_ids = []
    labels = []
    advantages = []
    lengths = []
    temperatures = []
    inference_logprobs_padded: List[np.ndarray] = []  # Padded to match labels shape
    has_any_logprobs = False

    for item in data["batch"]:
        # Normalize advantage scores
        scores = np.array(item["scores"])
        if len(scores) > 1:
            scores = scores - scores.mean()
            scores = scores / max(scores.std(), 1e-8)
        item["scores"] = scores

        # Handle score overrides
        if item["overrides"] is not None:
            for i in range(len(item["overrides"])):
                if item["overrides"][i].get("set_advantage_to_zero", False):
                    item["scores"][i] = 0

        # Process each sample in the item
        for i in range(len(item["tokens"])):
            seq_len = len(item["tokens"][i])
            lengths.append(
                math.ceil((seq_len - 1) / good_multiple) * good_multiple
            )

            # Create labels with padding (-100 for masked positions)
            label_item = np.concatenate([
                np.array(item["masks"][i]),
                np.full(
                    max(0, token_setup_len - seq_len),
                    -100,
                    dtype=np.int32,
                ),
            ])

            # Pad tokens
            item["tokens"][i] = np.concatenate([
                np.array(item["tokens"][i]),
                np.zeros(
                    max(0, token_setup_len - seq_len),
                    dtype=np.int32,
                ),
            ])

            input_ids.append(item["tokens"][i][:-1])  # Remove last for causal
            labels.append(label_item[1:])  # Shift by 1 for causal
            advantages.append(item["scores"][i])

            # Extract and pad inference logprobs to match labels shape
            # Inference logprobs are ONLY for generated tokens (where labels != -100)
            # We need to create a padded array that aligns position-by-position
            if extract_inference_logprobs and "inference_logprobs" in item:
                if i < len(item["inference_logprobs"]):
                    raw_logprobs = np.array(item["inference_logprobs"][i], dtype=np.float32)
                    has_any_logprobs = True

                    # Create padded logprobs array matching label_item shape
                    # Fill with 0.0 (will be masked out during loss computation)
                    padded_logprobs = np.zeros(token_setup_len, dtype=np.float32)

                    # The inference logprobs correspond to generated tokens
                    # Find positions where labels != -100 (generated positions)
                    mask_arr = np.array(item["masks"][i])
                    generated_positions = np.where(mask_arr != -100)[0]

                    # Fill in inference logprobs at generated positions
                    n_to_fill = min(len(raw_logprobs), len(generated_positions))
                    if n_to_fill > 0:
                        padded_logprobs[generated_positions[:n_to_fill]] = raw_logprobs[:n_to_fill]

                    # Shift by 1 to match causal label shift
                    inference_logprobs_padded.append(padded_logprobs[1:])
                else:
                    # No logprobs for this sample, use zeros
                    inference_logprobs_padded.append(np.zeros(token_setup_len - 1, dtype=np.float32))
            elif extract_inference_logprobs:
                # No inference_logprobs in item, use zeros
                inference_logprobs_padded.append(np.zeros(token_setup_len - 1, dtype=np.float32))

            # Extract temperature (priority: override > generation_params > group_overrides > 1.0)
            t = 1.0
            if (
                item.get("overrides")
                and i < len(item["overrides"])
                and isinstance(item["overrides"][i], dict)
                and ("temperature" in item["overrides"][i])
            ):
                t = float(item["overrides"][i]["temperature"])
            elif item.get("generation_params") and ("temperature" in item["generation_params"]):
                t = float(item["generation_params"]["temperature"])
            elif item.get("group_overrides") and ("temperature" in item["group_overrides"]):
                t = float(item["group_overrides"]["temperature"])
            temperatures.append(t)

    # Batch the data
    token_batches = []
    label_batches = []
    advantage_batches = []
    temperature_batches = []
    inference_logprob_batches = []

    for i in range(len(input_ids) // batch_size):
        start = i * batch_size
        end = (i + 1) * batch_size

        token_batches.append(
            torch.tensor(np.stack(input_ids[start:end], axis=0))
        )
        label_batches.append(
            torch.tensor(np.stack(labels[start:end], axis=0))
        )
        advantage_batches.append(
            torch.tensor(np.stack(advantages[start:end], axis=0)).view(-1, 1)
        )
        temperature_batches.append(
            torch.tensor(
                np.array(temperatures[start:end], dtype=np.float32)
            ).view(-1, 1, 1)
        )

        # Batch inference logprobs (same shape as labels)
        if extract_inference_logprobs and inference_logprobs_padded:
            inference_logprob_batches.append(
                torch.tensor(np.stack(inference_logprobs_padded[start:end], axis=0))
            )

    # Return inference logprob batches if we have any real logprobs
    final_logprob_batches = inference_logprob_batches if (has_any_logprobs and inference_logprob_batches) else None

    return token_batches, label_batches, advantage_batches, temperature_batches, final_logprob_batches


def get_data(
    batch_size: int,
    seq_len: int,
    atropos_url: str = "http://localhost:8000",
    extract_inference_logprobs: bool = True,
) -> Tuple[
    List[Tuple[
        List[torch.Tensor],  # token_batches
        List[torch.Tensor],  # label_batches
        List[torch.Tensor],  # advantage_batches
        List[torch.Tensor],  # temperature_batches
        Optional[List[torch.Tensor]],  # inference_logprob_batches
    ]],
    None,  # Legacy return (no longer used)
]:
    """
    Fetch and process training data from the Atropos API.

    Continuously polls the API until data is available, then processes
    all available batches.

    Args:
        batch_size: Size of each training batch
        seq_len: Maximum sequence length (for reference, not used directly)
        atropos_url: URL of the Atropos API server
        extract_inference_logprobs: Whether to extract inference logprobs for GRPO loss

    Returns:
        Tuple of (batches, None)
        - batches: List of processed batch tuples, each containing:
          (token_batches, label_batches, advantage_batches, temperature_batches, inference_logprob_batches)
        - inference_logprob_batches are aligned with labels for proper GRPO loss computation
    """
    batches = []

    while True:
        data = get_batch(url=atropos_url)

        if data["batch"] is not None:
            # Save batch for debugging
            with open("temp.json", "w", encoding="utf-8") as f:
                json.dump(data, f)

            # Process and accumulate batches (now includes batched inference logprobs)
            token_batches, label_batches, adv_batches, temp_batches, inf_logprob_batches = \
                pad_data_to_good_offset(data, batch_size, extract_inference_logprobs)

            # Include inference logprob batches in the tuple
            batches.append((token_batches, label_batches, adv_batches, temp_batches, inf_logprob_batches))

        elif len(batches) > 0:
            # Return accumulated batches when no more data
            return batches, None
        else:
            # Wait for data
            time.sleep(1)