# mirror of https://github.com/lilakk/BLEUBERI.git, synced 2026-04-27 17:23:23 +00:00
import pandas as pd
import numpy as np
import math
import inspect

from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from collections import defaultdict

np.random.seed(42)

STYLE_CONTROL_ELEMENTS = [
    "sum_assistant_a_tokens",
    "header_count_a",
    "list_count_a",
    "bold_count_a",
    "sum_assistant_b_tokens",
    "header_count_b",
    "list_count_b",
    "bold_count_b",
]

LENGTH_CONTROL_ELEMENTS = [
    "sum_assistant_a_tokens",
    "sum_assistant_b_tokens",
]

MARKDOWN_CONTROL_ELEMENTS = [
    "header_count_a",
    "list_count_a",
    "bold_count_a",
    "header_count_b",
    "list_count_b",
    "bold_count_b",
]


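# Fit a Bradley-Terry model by maximum likelihood: each battle becomes one
# logistic-regression row with +log(BASE) in model_a's column and -log(BASE)
# in model_b's, so the fitted coefficients are strengths on a log-odds scale.
# Battles are duplicated so a tie can be split into one A win and one B win.
# Coefficients map to Elo-style ratings via SCALE * coef + INIT_RATING, under
# which P(a beats b) = 1 / (1 + BASE ** ((elo_b - elo_a) / SCALE)).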
def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000, baseline_model="gpt-4-0314"):
    models = pd.concat([df["model_a"], df["model_b"]]).unique()
    models = pd.Series(np.arange(len(models)), index=models)

    # duplicate battles
    df = pd.concat([df, df], ignore_index=True)
    p = len(models.index)
    n = df.shape[0]

    X = np.zeros([n, p])
    X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
    X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)

    # one A win => two A wins
    Y = np.zeros(n)
    Y[df["winner"] == "model_a"] = 1.0

    # one tie => one A win + one B win
    # find tie + tie (both bad) indices
    tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
    tie_idx[len(tie_idx) // 2:] = False
    Y[tie_idx] = 1.0

    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)
    lr.fit(X, Y)

    elo_scores = SCALE * lr.coef_[0] + INIT_RATING

    # anchor the baseline model (gpt-4-0314 by default) at a rating of 1000
    if baseline_model in models.index:
        elo_scores += 1000 - elo_scores[models[baseline_model]]
    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)


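# Bootstrap confidence intervals: resample the battles with replacement and
# refit the ratings num_round times; columns come back sorted by median rating.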
def get_bootstrap_result(battles, func_compute_elo, num_round, baseline_model="gpt-4-0314"):
    rows = []
    kwargs = {}
    # pass the baseline through only if the rating function accepts it
    if "baseline_model" in inspect.signature(func_compute_elo).parameters:
        kwargs["baseline_model"] = baseline_model
    for _ in tqdm(range(num_round), desc="bootstrap"):
        rows.append(func_compute_elo(battles.sample(frac=1.0, replace=True), **kwargs))
    df = pd.DataFrame(rows)
    return df[df.median().sort_values(ascending=False).index]


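# Display helper: align two rating Series side by side, rounded to integers.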
def preety_print_two_ratings(ratings_1, ratings_2, column_names):
    df = pd.DataFrame(
        [[n, ratings_1[n], ratings_2[n]] for n in ratings_1.keys()],
        columns=["Model", column_names[0], column_names[1]],
    ).sort_values(column_names[0], ascending=False).reset_index(drop=True)
    # round ratings to the nearest integer for display
    df[column_names[0]] = (df[column_names[0]] + 0.5).astype(int)
    df[column_names[1]] = (df[column_names[1]] + 0.5).astype(int)
    df.index = df.index + 1
    return df


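# Invert the rating scale back to probabilities: entry (a, b) of the returned
# table is the predicted chance that model a beats model b under the Elo formula.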
def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
    names = sorted(list(elo_ratings.keys()))
    wins = defaultdict(lambda: defaultdict(lambda: 0))
    for a in names:
        for b in names:
            ea = 1 / (1 + BASE ** ((elo_ratings[b] - elo_ratings[a]) / SCALE))
            wins[a][b] = ea
            wins[b][a] = 1 - ea

    data = {
        a: [wins[a][b] if a != b else np.nan for b in names]
        for a in names
    }

    df = pd.DataFrame(data, index=names)
    df.index.name = "model_a"
    df.columns.name = "model_b"
    return df.T


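# Predicted win rate (in %) of every model against the baseline, given a
# DataFrame with a "model" column and a ratings column.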
def get_win_rate_column(df, column, baseline="gpt-4-0314"):
    to_dict = df[["model", column]].set_index("model").to_dict()[column]
    win_rate_table = predict_win_rate(to_dict)
    return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x * 100, 2))


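# Like compute_mle_elo, but fits on a precomputed design matrix whose trailing
# columns hold style features; returns the model ratings plus the raw style
# coefficients.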
def fit_bt(X, Y, models, SCALE=400, INIT_RATING=1000, baseline_model="gpt-4-0314"):
    p = len(models.index)

    lr = LogisticRegression(fit_intercept=False)
    lr.fit(X, Y)

    elo_scores = SCALE * lr.coef_[0] + INIT_RATING

    # anchor the baseline model (gpt-4-0314 by default) at a rating of 1114
    assert baseline_model in models.index
    elo_scores += 1114 - elo_scores[models[baseline_model]]
    return (
        pd.Series(elo_scores[:p], index=models.index).sort_values(ascending=False),
        lr.coef_[0][p:],
    )


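# Build the style-control design matrix: alongside the usual per-model columns,
# append one standardized feature per style element (response length, header /
# list / bold counts) computed as the A-minus-B difference, optionally divided
# by the A-plus-B sum (per apply_ratio). Fitting with these extra features lets
# the style coefficients absorb wins explained by style rather than model quality.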
def construct_style_matrices(
    df,
    BASE=10,
    apply_ratio=[1, 1, 1, 1],
    style_elements=STYLE_CONTROL_ELEMENTS,
    add_one=True,
):
    models = pd.concat([df["model_a"], df["model_b"]]).unique()
    models = pd.Series(np.arange(len(models)), index=models)

    # duplicate battles
    df = pd.concat([df, df], ignore_index=True)
    p = len(models.index)
    n = df.shape[0]
    assert len(style_elements) % 2 == 0
    k = int(len(style_elements) / 2)

    X = np.zeros([n, p + k])
    X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
    X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)

    # turn each of the specified columns in "conv_metadata" into a vector
    style_vector = np.array(
        [
            df.conv_metadata.map(
                lambda x: x[element]
                if type(x[element]) is int
                else sum(x[element].values())
            ).tolist()
            for element in style_elements
        ]
    )

    style_diff = (style_vector[:k] - style_vector[k:]).astype(float)
    style_sum = (style_vector[:k] + style_vector[k:]).astype(float)

    if add_one:
        style_sum = style_sum + np.ones(style_diff.shape)

    apply_ratio = np.flatnonzero(apply_ratio)

    # apply ratio where necessary (length, etc.)
    style_diff[apply_ratio] /= style_sum[apply_ratio]

    style_mean = np.mean(style_diff, axis=1)
    style_std = np.std(style_diff, axis=1)

    # standardize the style features and append them to the design matrix
    X[:, -k:] = ((style_diff - style_mean[:, np.newaxis]) / style_std[:, np.newaxis]).T

    # one A win => two A wins
    Y = np.zeros(n)
    Y[df["winner"] == "model_a"] = 1.0

    # one tie => one A win + one B win
    # find tie + tie (both bad) indices
    tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
    tie_idx[len(tie_idx) // 2:] = False
    Y[tie_idx] = 1.0

    return X, Y, models


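# Bootstrap for the style-controlled fit. Rows of X/Y come in duplicated pairs,
# so resample battle indices only once: a sampled non-tie row is included twice,
# while a sampled tie contributes both halves of its pair (one win, one loss).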
def get_bootstrap_result_style_control(X, Y, battles, models, func_compute_elo, num_round=1000, baseline_model="gpt-4-0314"):
    elos = []
    coefs = []
    assert X.shape[0] % 2 == 0 and X.shape[0] == Y.shape[0]
    # battles are duplicated when constructing X and Y, so only sample from
    # the first half to avoid resampling the duplicates
    k = int(X.shape[0] / 2)

    battles_tie_idx = (battles["winner"] == "tie") | (battles["winner"] == "tie (bothbad)")
    for _ in tqdm(range(num_round), desc="bootstrap"):
        indices = np.random.choice(list(range(k)), size=(k), replace=True)

        index2tie = np.zeros(k, dtype=bool)
        index2tie[battles_tie_idx] = True

        nontie_indices = indices[~index2tie[indices]]
        # a sampled tie contributes both of its duplicated rows (one win, one loss)
        tie_indices = np.concatenate([indices[index2tie[indices]], indices[index2tie[indices]] + k])

        _X = np.concatenate([X[nontie_indices], X[nontie_indices], X[tie_indices]])
        _Y = np.concatenate([Y[nontie_indices], Y[nontie_indices], Y[tie_indices]])

        assert _X.shape == X.shape and _Y.shape == Y.shape

        # flag models that never appear in this bootstrap sample
        states = ~_X[:, : len(models)].any(axis=0)

        elo, coef = func_compute_elo(_X, _Y, models[~states], baseline_model=baseline_model)
        elos.append(elo)
        coefs.append(coef)

    df = pd.DataFrame(elos)
    return df[df.median().sort_values(ascending=False).index], coefs
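

if __name__ == "__main__":
    # Minimal smoke test (illustrative, not part of the original analysis):
    # a tiny synthetic battles table with the column layout the functions
    # above expect. Model names other than the gpt-4-0314 baseline are made
    # up, and with data this small a degenerate resample could fail to fit.
    demo = pd.DataFrame(
        {
            "model_a": ["gpt-4-0314"] * 8 + ["model-x"] * 4,
            "model_b": ["model-x"] * 8 + ["model-y"] * 4,
            "winner": ["model_a"] * 4
            + ["model_b"] * 2
            + ["tie"] * 2
            + ["model_a"] * 2
            + ["tie (bothbad)"] * 2,
        }
    )
    print(compute_mle_elo(demo))
    print(get_bootstrap_result(demo, compute_mle_elo, num_round=10))
    # The style-controlled variant additionally needs a "conv_metadata" column
    # holding the STYLE_CONTROL_ELEMENTS counts per battle, e.g.:
    #   X, Y, ms = construct_style_matrices(battles)
    #   elo, coefs = fit_bt(X, Y, ms)
    #   boot, coefs = get_bootstrap_result_style_control(X, Y, battles, ms, fit_bt)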