# mirror of https://github.com/lilakk/BLEUBERI.git
# synced 2026-04-19 12:58:12 +00:00
# 385 lines, 13 KiB, Python
import os
|
|
import math
|
|
import multiprocessing as mp
|
|
from functools import partial
|
|
import numpy as np
|
|
from scipy.special import expit
|
|
from scipy.optimize import minimize
|
|
import pandas as pd
|
|
from tqdm import tqdm
|
|
|
|
|
|
# Per-response style features used for style control.  The first half are the
# model_a ("_a") columns and the second half the matching model_b ("_b")
# columns — preprocess_for_style relies on this half/half layout.
STYLE_CONTROL_ELEMENTS_V1 = [
    "sum_assistant_a_tokens",
    "header_count_a",
    "list_count_a",
    "bold_count_a",
    "sum_assistant_b_tokens",
    "header_count_b",
    "list_count_b",
    "bold_count_b",
]
|
|
|
|
|
|
def get_matchups_models(df):
    """Map model names in a battles frame to integer ids.

    Returns an (N, 2) int array of (model_a_id, model_b_id) per battle and
    the list of model names indexed by those ids.
    """
    num_battles = len(df)
    # factorize both columns together so a/b share one id space
    ids, unique_models = pd.factorize(pd.concat([df["model_a"], df["model_b"]]))
    pairs = np.column_stack([ids[:num_battles], ids[num_battles:]])
    return pairs, unique_models.to_list()
|
|
|
|
|
|
def preprocess_for_elo(df):
    """Build the numpy inputs that Elo fitting needs.

    matchups: int (N, 2) — model ids for the two competitors in each match
    outcomes: float64 (N,) — 1.0 / 0.5 / 0.0 for a model_a win / tie / loss
    """
    matchups, models = get_matchups_models(df)
    winners = df["winner"]
    # everything that is neither a model_a nor a model_b win counts as a tie
    outcomes = np.where(
        winners == "model_a", 1.0, np.where(winners == "model_b", 0.0, 0.5)
    )
    return matchups, outcomes, models
|
|
|
|
|
|
def preprocess_for_bt(df):
    """Collapse battles into unique (matchup, outcome) rows plus occurrence weights."""
    n_rows = len(df)
    # schedule columns: model_a id, model_b id, outcome id
    # prefill everything with 1 so ties need no explicit assignment below
    schedule = np.full((n_rows, 3), fill_value=1, dtype=np.int32)
    # map model names to their int ids for the first two columns
    schedule[:, [0, 1]], models = get_matchups_models(df)
    # outcome ids share the int dtype so they fit in the same array:
    # model_a win -> 2, tie -> 1 (prefilled), model_b win -> 0
    schedule[df["winner"] == "model_a", 2] = 2
    schedule[df["winner"] == "model_b", 2] = 0
    # deduplicate rows and count how often each observed result occurred
    unique_rows, counts = np.unique(schedule, return_counts=True, axis=0)
    matchups = unique_rows[:, [0, 1]]
    # outcome id -> optimization label: 2 -> 1.0, 1 -> 0.5, 0 -> 0.0
    outcomes = unique_rows[:, 2].astype(np.float64) / 2.0
    # each unique result is weighted by its number of occurrences in the dataset
    weights = counts.astype(np.float64)
    return matchups, outcomes, models, weights
|
|
|
|
|
|
def preprocess_for_style(
    df,
    apply_ratio=(1, 1, 1, 1),  # tuple default: a mutable list default is a shared-state hazard
    style_elements=STYLE_CONTROL_ELEMENTS_V1,
    add_one=True,
):
    """Build (matchups, features, outcomes, models) for style-controlled BT.

    df.conv_metadata is expected to hold, per battle, the style counts named in
    style_elements (first half "_a" columns, second half the matching "_b"
    columns) — each either a plain int or a dict of sub-counts to be summed.
    apply_ratio flags (by position within the first half) which features are
    normalized to a relative difference; add_one guards the divisor against
    zero counts.  Returned features are z-scored per column.
    """
    matchups, outcomes, models = preprocess_for_elo(
        df
    )  # this can use the same preprocessing as Elo

    n = matchups.shape[0]
    k = len(style_elements) // 2  # features per side ("_a" half / "_b" half)

    def extract_style_feature(x, feature):
        # counts are stored either as a plain int or as a dict of sub-counts
        val = x[feature]
        if isinstance(val, int):
            return val
        else:
            return sum(val.values())

    style_vector = np.zeros(shape=(2 * k, n), dtype=np.int32)
    for idx, element in enumerate(style_elements):
        style_vector[idx, :] = df.conv_metadata.map(
            partial(extract_style_feature, feature=element)
        ).values
    style_vector = np.ascontiguousarray(style_vector)

    style_diff = (style_vector[:k] - style_vector[k:]).astype(float)
    style_sum = (style_vector[:k] + style_vector[k:]).astype(float)

    if add_one:
        # avoid divide-by-zero when both sides have a zero count
        style_sum = style_sum + np.ones(style_diff.shape)

    apply_ratio = np.flatnonzero(apply_ratio)

    # Apply ratio where necessary (length, etc)
    style_diff[apply_ratio] /= style_sum[apply_ratio]

    # z-score each feature so the regularized BT fit treats them comparably
    style_mean = np.mean(style_diff, axis=1)
    style_std = np.std(style_diff, axis=1)
    features = ((style_diff - style_mean[:, np.newaxis]) / style_std[:, np.newaxis]).T

    return matchups, features, outcomes, models
|
|
|
|
|
|
def fit_vectorized_elo(
    matchups,
    outcomes,
    sample_indices,
    num_models,
    k=4.0,
    base=10.0,
    init_rating=1000.0,
    scale=400.0,
):
    """Fit multiple sets of Elo ratings on different samples of the data at once.

    sample_indices has one column per bootstrap sample; each row holds, for
    every sample, the index of the match played at that step.
    """
    alpha = math.log(base) / scale
    num_samples = sample_indices.shape[1]
    ratings = np.zeros(shape=(num_samples, num_models), dtype=np.float64)
    cols = np.arange(num_samples)
    # each row of sample_indices is one simultaneous update step across samples
    for step in sample_indices:
        a_idx = matchups[step, 0]
        b_idx = matchups[step, 1]
        # expected score of model_a under the logistic Elo model, per sample
        expected = expit(alpha * (ratings[cols, a_idx] - ratings[cols, b_idx]))
        delta = k * (outcomes[step] - expected)
        ratings[cols, a_idx] += delta
        ratings[cols, b_idx] -= delta
    # ratings accumulate around zero; shift onto the conventional scale at the end
    return ratings + init_rating
|
|
|
|
|
|
def compute_elo(df, k=4.0, base=10.0, init_rating=1000.0, scale=400.0):
    """Sequential (online) Elo over the battles in df, in row order.

    Returns a dict mapping model name -> final rating.
    """
    matchups, outcomes, models = preprocess_for_elo(df)
    alpha = math.log(base) / scale
    ratings = np.full(shape=(len(models),), fill_value=init_rating)
    for (a, b), outcome in zip(matchups, outcomes):
        # expected score for model_a under the logistic Elo model
        expected = 1.0 / (1.0 + math.exp(alpha * (ratings[b] - ratings[a])))
        delta = k * (outcome - expected)
        ratings[a] += delta
        ratings[b] -= delta
    return {model: ratings[idx] for idx, model in enumerate(models)}
|
|
|
|
|
|
def compute_bootstrap_elo(
    df, num_round=100, k=4.0, base=10.0, init_rating=1000.0, scale=400.0
):
    """Bootstrap Elo: num_round resampled fits, columns sorted by median rating."""
    matchups, outcomes, models = preprocess_for_elo(df)
    n = len(df)
    # one column of resampled match indices per bootstrap round
    sample_indices = np.random.randint(low=0, high=n, size=(n, num_round))
    ratings = fit_vectorized_elo(
        matchups, outcomes, sample_indices, len(models), k, base, init_rating, scale
    )
    frame = pd.DataFrame(data=ratings, columns=models)
    return frame[frame.median().sort_values(ascending=False).index]
|
|
|
|
|
|
def bt_loss_and_grad(ratings, matchups, outcomes, weights, alpha=1.0):
    """Weighted Bradley-Terry negative log-likelihood and its gradient."""
    rating_pairs = ratings[matchups]
    probs = expit(alpha * (rating_pairs[:, 0] - rating_pairs[:, 1]))
    # this cross-entropy form naturally counts a draw (0.5) as half win + half loss
    loss = -(
        weights * (outcomes * np.log(probs) + (1.0 - outcomes) * np.log(1.0 - probs))
    ).sum()
    per_match_grad = -alpha * (outcomes - probs) * weights
    model_grad = np.zeros_like(ratings)
    # scatter-add each match gradient to its two models with opposite signs
    np.add.at(
        model_grad,
        matchups[:, [0, 1]],
        per_match_grad[:, None] * np.array([1.0, -1.0], dtype=np.float64),
    )
    return loss, model_grad
|
|
|
|
|
|
def fit_bt(matchups, outcomes, weights, n_models, alpha, tol=1e-6):
    """Fit Bradley-Terry ratings by minimizing the weighted log loss with L-BFGS-B."""
    result = minimize(
        fun=bt_loss_and_grad,
        x0=np.zeros(n_models, dtype=np.float64),
        args=(matchups, outcomes, weights, alpha),
        jac=True,  # bt_loss_and_grad returns (loss, grad) together
        method="L-BFGS-B",
        options={"disp": False, "maxiter": 100, "gtol": tol},
    )
    return result["x"]
|
|
|
|
|
|
def scale_and_offset(
    ratings,
    models,
    scale=400,
    init_rating=1000,
    baseline_model="mixtral-8x7b-instruct-v0.1",
    baseline_rating=1114,
):
    """Convert ratings from the natural scale to the Elo scale, anchoring a baseline.

    If baseline_model is present in models, all ratings are shifted so that
    its rating equals baseline_rating; otherwise only scale/offset are applied.
    """
    elo_ratings = (ratings * scale) + init_rating
    if baseline_model in models:
        anchor = models.index(baseline_model)
        # keep the [..] index so broadcasting works for both 1-D and 2-D ratings
        elo_ratings = elo_ratings + (baseline_rating - elo_ratings[..., [anchor]])
    return elo_ratings
|
|
|
|
|
|
def compute_bt(df, base=10.0, scale=400.0, init_rating=1000, tol=1e-6):
    """Bradley-Terry ratings on the Elo scale, sorted descending as a Series."""
    matchups, outcomes, models, weights = preprocess_for_bt(df)
    natural = fit_bt(matchups, outcomes, weights, len(models), math.log(base), tol)
    on_elo_scale = scale_and_offset(natural, models, scale, init_rating=init_rating)
    return pd.Series(on_elo_scale, index=models).sort_values(ascending=False)
|
|
|
|
|
|
def compute_bootstrap_bt(
    battles,
    num_round,
    base=10.0,
    scale=400.0,
    init_rating=1000.0,
    tol=1e-6,
    num_cpu=None,
):
    """Bootstrap Bradley-Terry by resampling the unique-outcome weights.

    Rather than resampling battle rows, draws new occurrence counts for the
    unique (matchup, outcome) combinations from a multinomial; each round
    differs only in its weight vector, which one worker fits per round.
    """
    matchups, outcomes, models, weights = preprocess_for_bt(battles)
    # bootstrap sample the unique outcomes' counts directly via the multinomial
    rng = np.random.default_rng(seed=0)
    counts = rng.multinomial(
        n=len(battles), pvals=weights / weights.sum(), size=(num_round)
    )
    # only the count distribution changes between samples (entries can be 0)
    boot_weights = counts.astype(np.float64) / len(battles)

    # everything except the weights is shared across all bootstrap rounds
    fit_one = partial(
        fit_bt, matchups, outcomes, n_models=len(models), alpha=np.log(base), tol=tol
    )
    with mp.Pool(num_cpu if num_cpu else os.cpu_count()) as pool:
        fitted = list(tqdm(pool.imap_unordered(fit_one, boot_weights), total=num_round))

    scaled = scale_and_offset(np.array(fitted), models, scale, init_rating)
    frame = pd.DataFrame(scaled, columns=models)
    return frame[frame.median().sort_values(ascending=False).index]
|
|
|
|
|
|
# (+1, -1) sign mask pairing each per-match gradient with (model_a, model_b);
# create globally to not incur the instantiation cost in each call
DIFF_MASK = np.array([1.0, -1.0], dtype=np.float64)
|
|
|
|
|
|
def contextual_bt_loss_and_grad(
    params,
    n_competitors,
    matchups,
    features,
    outcomes,
    alpha=1.0,
    reg=1.0,
    half_reg=0.5,
):
    """L2-regularized loss and gradient for Bradley-Terry with context features.

    params packs [model ratings (first n_competitors), feature coefficients];
    half_reg is passed in precomputed so the loss term matches reg's gradient.
    """
    reg_loss = half_reg * np.inner(params, params)

    # split the flat parameter vector into ratings and feature coefficients
    ratings = params[:n_competitors]
    feature_params = params[n_competitors:]

    pair_ratings = ratings[matchups]
    bt_logits = alpha * (pair_ratings[:, 0] - pair_ratings[:, 1])
    context_logits = np.dot(features, feature_params)
    probs = expit(bt_logits + context_logits)
    loss = (
        -((np.log(probs) * outcomes + np.log(1.0 - probs) * (1.0 - outcomes))).sum()
        + reg_loss
    )

    error = outcomes - probs
    # start from the regularization gradient, then accumulate the data terms
    grad = reg * params
    np.add.at(
        grad[:n_competitors], matchups[:, [0, 1]], (-alpha * error)[:, None] * DIFF_MASK
    )
    grad[n_competitors:] -= np.dot(features.T, error)
    return loss, grad
|
|
|
|
|
|
# note on regularization:
# default reg is 0.5 since the LogisticRegression default is 1.0
# in the original implementation, matchups were duplicated
# that made the ratio of log loss to reg loss "twice as high"
# in this non-duplicated version for parity we also reduce the reg by one half to match
def fit_contextual_bt(
    matchups,
    features,
    outcomes,
    models,
    idxs=None,
    alpha=math.log(10.0),
    reg=0.5,
    tol=1e-6,
):
    """Fit contextual BT (ratings + feature coefficients) with L-BFGS-B.

    Returns the flat parameter vector: len(models) ratings followed by one
    coefficient per feature column.
    """
    n_models = len(models)
    n_features = features.shape[1]
    half_reg = reg / 2.0

    # idxs optionally selects a bootstrap sample of the dataset to fit on
    if idxs is not None:
        matchups, features, outcomes = matchups[idxs], features[idxs], outcomes[idxs]

    result = minimize(
        fun=contextual_bt_loss_and_grad,
        x0=np.zeros(n_models + n_features, dtype=np.float64),
        args=(n_models, matchups, features, outcomes, alpha, reg, half_reg),
        jac=True,  # the loss function returns (loss, grad) together
        method="L-BFGS-B",
        options={"disp": False, "maxiter": 100, "gtol": tol},
    )
    return result["x"]
|
|
|
|
|
|
def compute_style_control(
    df, alpha=math.log(10.0), reg=0.5, init_rating=1000.0, scale=400.0, tol=1e-6
):
    """Style-controlled BT ratings (Elo scale, sorted) plus the style coefficients."""
    matchups, features, outcomes, models = preprocess_for_style(df)
    fitted = fit_contextual_bt(
        matchups,
        features,
        outcomes,
        models=models,
        alpha=alpha,
        reg=reg,
        tol=tol,
    )
    # the flat parameter vector packs ratings first, then feature coefficients
    n = len(models)
    ratings, style_coefs = fitted[:n], fitted[n:]
    scaled = scale_and_offset(ratings, models, scale, init_rating)
    scaled = pd.Series(scaled, index=models).sort_values(ascending=False)
    return scaled, style_coefs
|
|
|
|
|
|
def compute_bootstrap_style_control(
    df,
    num_round,
    alpha=math.log(10.0),
    reg=0.5,
    init_rating=1000.0,
    scale=400.0,
    tol=1e-6,
    num_cpu=None,
):
    """Bootstrap the style-controlled BT fit over num_round row resamples.

    Returns a DataFrame of per-round ratings (columns sorted by median) and
    the per-round style coefficient rows.
    """
    matchups, features, outcomes, models = preprocess_for_style(df)

    # everything but the bootstrap row indices is shared across rounds
    fit_one = partial(
        fit_contextual_bt,
        matchups,
        features,
        outcomes,
        models,
        alpha=alpha,
        reg=reg,
        tol=tol,
    )

    n_matches = matchups.shape[0]
    boot_idxs = np.random.randint(low=0, high=n_matches, size=(num_round, n_matches))

    with mp.Pool(num_cpu if num_cpu else os.cpu_count()) as pool:
        fitted = list(tqdm(pool.imap_unordered(fit_one, boot_idxs), total=num_round))

    fitted = np.array(fitted)
    # each row packs ratings first, then style coefficients
    n = len(models)
    ratings, style_coefs = fitted[:, :n], fitted[:, n:]
    scaled = scale_and_offset(ratings, models, scale, init_rating)
    frame = pd.DataFrame(scaled, columns=models)
    return frame[frame.median().sort_values(ascending=False).index], style_coefs
|