# mirror of https://github.com/lilakk/BLEUBERI.git, synced 2026-04-27 17:23:23 +00:00
import pandas as pd
import numpy as np
import math
import inspect

from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from collections import defaultdict

np.random.seed(42)

STYLE_CONTROL_ELEMENTS = [
    "sum_assistant_a_tokens",
    "header_count_a",
    "list_count_a",
    "bold_count_a",
    "sum_assistant_b_tokens",
    "header_count_b",
    "list_count_b",
    "bold_count_b",
]

LENGTH_CONTROL_ELEMENTS = [
    "sum_assistant_a_tokens",
    "sum_assistant_b_tokens",
]

MARKDOWN_CONTROL_ELEMENTS = [
    "header_count_a",
    "list_count_a",
    "bold_count_a",
    "header_count_b",
    "list_count_b",
    "bold_count_b",
]


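# Fit a Bradley-Terry model by maximum likelihood: each battle becomes one
# logistic-regression row with +log(BASE) in model_a's column and -log(BASE)
# in model_b's, so the fitted coefficients are strengths on a log-odds scale.
# Battles are duplicated so a tie can be split into one A win and one B win.
# Coefficients map to Elo-style ratings via SCALE * coef + INIT_RATING, under
# which P(a beats b) = 1 / (1 + BASE ** ((elo_b - elo_a) / SCALE)).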
def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000, baseline_model="gpt-4-0314"):
    models = pd.concat([df["model_a"], df["model_b"]]).unique()
    models = pd.Series(np.arange(len(models)), index=models)

    # duplicate battles
    df = pd.concat([df, df], ignore_index=True)
    p = len(models.index)
    n = df.shape[0]

    X = np.zeros([n, p])
    X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
    X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)

    # one A win => two A wins
    Y = np.zeros(n)
    Y[df["winner"] == "model_a"] = 1.0

    # one tie => one A win + one B win
    # find tie + tie (both bad) indices
    tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
    tie_idx[len(tie_idx) // 2:] = False
    Y[tie_idx] = 1.0

    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)
    lr.fit(X, Y)

    elo_scores = SCALE * lr.coef_[0] + INIT_RATING

    # anchor the baseline model (gpt-4-0314 by default) at a rating of 1000
    if baseline_model in models.index:
        elo_scores += 1000 - elo_scores[models[baseline_model]]
    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)


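# Bootstrap confidence intervals: resample the battles with replacement and
# refit the ratings num_round times; columns come back sorted by median rating.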
def get_bootstrap_result(battles, func_compute_elo, num_round, baseline_model="gpt-4-0314"):
    rows = []
    kwargs = {}
    # pass the baseline through only if the rating function accepts it
    if "baseline_model" in inspect.signature(func_compute_elo).parameters:
        kwargs["baseline_model"] = baseline_model
    for _ in tqdm(range(num_round), desc="bootstrap"):
        rows.append(func_compute_elo(battles.sample(frac=1.0, replace=True), **kwargs))
    df = pd.DataFrame(rows)
    return df[df.median().sort_values(ascending=False).index]


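# Display helper: align two rating Series side by side, rounded to integers.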
def preety_print_two_ratings(ratings_1, ratings_2, column_names):
    df = pd.DataFrame(
        [[n, ratings_1[n], ratings_2[n]] for n in ratings_1.keys()],
        columns=["Model", column_names[0], column_names[1]],
    ).sort_values(column_names[0], ascending=False).reset_index(drop=True)
    # round ratings to the nearest integer for display
    df[column_names[0]] = (df[column_names[0]] + 0.5).astype(int)
    df[column_names[1]] = (df[column_names[1]] + 0.5).astype(int)
    df.index = df.index + 1
    return df


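# Invert the rating scale back to probabilities: entry (a, b) of the returned
# table is the predicted chance that model a beats model b under the Elo formula.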
def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
    names = sorted(list(elo_ratings.keys()))
    wins = defaultdict(lambda: defaultdict(lambda: 0))
    for a in names:
        for b in names:
            ea = 1 / (1 + BASE ** ((elo_ratings[b] - elo_ratings[a]) / SCALE))
            wins[a][b] = ea
            wins[b][a] = 1 - ea

    data = {
        a: [wins[a][b] if a != b else np.nan for b in names]
        for a in names
    }

    df = pd.DataFrame(data, index=names)
    df.index.name = "model_a"
    df.columns.name = "model_b"
    return df.T


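# Predicted win rate (in %) of every model against the baseline, given a
# DataFrame with a "model" column and a ratings column.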
def get_win_rate_column(df, column, baseline="gpt-4-0314"):
    to_dict = df[["model", column]].set_index("model").to_dict()[column]
    win_rate_table = predict_win_rate(to_dict)
    return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x * 100, 2))


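# Like compute_mle_elo, but fits on a precomputed design matrix whose trailing
# columns hold style features; returns the model ratings plus the raw style
# coefficients.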
def fit_bt(X, Y, models, SCALE=400, INIT_RATING=1000, baseline_model="gpt-4-0314"):
    p = len(models.index)

    lr = LogisticRegression(fit_intercept=False)
    lr.fit(X, Y)

    elo_scores = SCALE * lr.coef_[0] + INIT_RATING

    # anchor the baseline model (gpt-4-0314 by default) at a rating of 1114
    assert baseline_model in models.index
    elo_scores += 1114 - elo_scores[models[baseline_model]]
    return (
        pd.Series(elo_scores[:p], index=models.index).sort_values(ascending=False),
        lr.coef_[0][p:],
    )


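# Build the style-control design matrix: alongside the usual per-model columns,
# append one standardized feature per style element (response length, header /
# list / bold counts) computed as the A-minus-B difference, optionally divided
# by the A-plus-B sum (per apply_ratio). Fitting with these extra features lets
# the style coefficients absorb wins explained by style rather than model quality.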
def construct_style_matrices(
    df,
    BASE=10,
    apply_ratio=[1, 1, 1, 1],
    style_elements=STYLE_CONTROL_ELEMENTS,
    add_one=True,
):
    models = pd.concat([df["model_a"], df["model_b"]]).unique()
    models = pd.Series(np.arange(len(models)), index=models)

    # duplicate battles
    df = pd.concat([df, df], ignore_index=True)
    p = len(models.index)
    n = df.shape[0]
    assert len(style_elements) % 2 == 0
    k = int(len(style_elements) / 2)

    X = np.zeros([n, p + k])
    X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
    X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)

    # turn each of the specified columns in "conv_metadata" into a vector
    style_vector = np.array(
        [
            df.conv_metadata.map(
                lambda x: x[element]
                if type(x[element]) is int
                else sum(x[element].values())
            ).tolist()
            for element in style_elements
        ]
    )

    style_diff = (style_vector[:k] - style_vector[k:]).astype(float)
    style_sum = (style_vector[:k] + style_vector[k:]).astype(float)

    if add_one:
        style_sum = style_sum + np.ones(style_diff.shape)

    apply_ratio = np.flatnonzero(apply_ratio)

    # apply ratio where necessary (length, etc.)
    style_diff[apply_ratio] /= style_sum[apply_ratio]

    style_mean = np.mean(style_diff, axis=1)
    style_std = np.std(style_diff, axis=1)

    # standardize the style features and append them to the design matrix
    X[:, -k:] = ((style_diff - style_mean[:, np.newaxis]) / style_std[:, np.newaxis]).T

    # one A win => two A wins
    Y = np.zeros(n)
    Y[df["winner"] == "model_a"] = 1.0

    # one tie => one A win + one B win
    # find tie + tie (both bad) indices
    tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
    tie_idx[len(tie_idx) // 2:] = False
    Y[tie_idx] = 1.0

    return X, Y, models


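# Bootstrap for the style-controlled fit. Rows of X/Y come in duplicated pairs,
# so resample battle indices only once: a sampled non-tie row is included twice,
# while a sampled tie contributes both halves of its pair (one win, one loss).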
def get_bootstrap_result_style_control(X, Y, battles, models, func_compute_elo, num_round=1000, baseline_model="gpt-4-0314"):
    elos = []
    coefs = []
    assert X.shape[0] % 2 == 0 and X.shape[0] == Y.shape[0]
    # battles are duplicated when constructing X and Y, so only sample from
    # the first half to avoid resampling the duplicates
    k = int(X.shape[0] / 2)

    battles_tie_idx = (battles["winner"] == "tie") | (battles["winner"] == "tie (bothbad)")
    for _ in tqdm(range(num_round), desc="bootstrap"):
        indices = np.random.choice(list(range(k)), size=(k), replace=True)

        index2tie = np.zeros(k, dtype=bool)
        index2tie[battles_tie_idx] = True

        nontie_indices = indices[~index2tie[indices]]
        # a sampled tie contributes both of its duplicated rows (one win, one loss)
        tie_indices = np.concatenate([indices[index2tie[indices]], indices[index2tie[indices]] + k])

        _X = np.concatenate([X[nontie_indices], X[nontie_indices], X[tie_indices]])
        _Y = np.concatenate([Y[nontie_indices], Y[nontie_indices], Y[tie_indices]])

        assert _X.shape == X.shape and _Y.shape == Y.shape

        # flag models that never appear in this bootstrap sample
        states = ~_X[:, : len(models)].any(axis=0)

        elo, coef = func_compute_elo(_X, _Y, models[~states], baseline_model=baseline_model)
        elos.append(elo)
        coefs.append(coef)

    df = pd.DataFrame(elos)
    return df[df.median().sort_values(ascending=False).index], coefs
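

if __name__ == "__main__":
    # Minimal smoke test (illustrative, not part of the original analysis):
    # a tiny synthetic battles table with the column layout the functions
    # above expect. Model names other than the gpt-4-0314 baseline are made
    # up, and with data this small a degenerate resample could fail to fit.
    demo = pd.DataFrame(
        {
            "model_a": ["gpt-4-0314"] * 8 + ["model-x"] * 4,
            "model_b": ["model-x"] * 8 + ["model-y"] * 4,
            "winner": ["model_a"] * 4
            + ["model_b"] * 2
            + ["tie"] * 2
            + ["model_a"] * 2
            + ["tie (bothbad)"] * 2,
        }
    )
    print(compute_mle_elo(demo))
    print(get_bootstrap_result(demo, compute_mle_elo, num_round=10))
    # The style-controlled variant additionally needs a "conv_metadata" column
    # holding the STYLE_CONTROL_ELEMENTS counts per battle, e.g.:
    #   X, Y, ms = construct_style_matrices(battles)
    #   elo, coefs = fit_bt(X, Y, ms)
    #   boot, coefs = get_bootstrap_result_style_control(X, Y, battles, ms, fit_bt)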