mirror of
https://github.com/lilakk/BLEUBERI.git
synced 2026-04-22 16:49:17 +00:00
497 lines
12 KiB
Text
497 lines
12 KiB
Text
>>> Config: WB Elo with K=4 and num_rounds=100; margin=3; loo=-1; seed=42; init_elo=37 models; tie_margin=1; dynamic=True;
|
|
>>> Found 1628324 votes
|
|
>>> Found 260970 non-tie votes
|
|
>>> Found 865974 votes that are not useful for WB Elo
|
|
>>> WB Elo with K=4 and num_rounds=100
|
|
{
|
|
"gpt-4o-2024-05-13": {
|
|
"avg": 1280.202814809682,
|
|
"std": 1.7727854602931103,
|
|
"median": 1280.0755628336087,
|
|
"ci": [
|
|
1277.2566788414688,
|
|
1284.097658389827
|
|
],
|
|
"init_elo": 1283.0
|
|
},
|
|
"gemini-1.5-pro": {
|
|
"avg": 1251.1880383868893,
|
|
"std": 1.5780723382654294,
|
|
"median": 1251.1898141300676,
|
|
"ci": [
|
|
1248.5834579958444,
|
|
1254.2900137732495
|
|
],
|
|
"init_elo": 1254.0
|
|
},
|
|
"gpt-4-turbo-2024-04-09": {
|
|
"avg": 1247.1721701450072,
|
|
"std": 1.6723361362495952,
|
|
"median": 1247.2617382419635,
|
|
"ci": [
|
|
1244.337626367497,
|
|
1250.2210299024766
|
|
],
|
|
"init_elo": 1249.0
|
|
},
|
|
"gpt-4-0125-preview": {
|
|
"avg": 1236.942691199828,
|
|
"std": 1.7024496219370988,
|
|
"median": 1236.9771116554643,
|
|
"ci": [
|
|
1233.8772166478332,
|
|
1240.3872849986412
|
|
],
|
|
"init_elo": 1239.0
|
|
},
|
|
"yi-large": {
|
|
"avg": 1231.4585490048169,
|
|
"std": 1.3514998766614368,
|
|
"median": 1231.3649785589523,
|
|
"ci": [
|
|
1229.4670985950543,
|
|
1234.2823546823827
|
|
],
|
|
"init_elo": 1234.0
|
|
},
|
|
"claude-3-opus-20240229": {
|
|
"avg": 1229.0800447966017,
|
|
"std": 1.3981478109920624,
|
|
"median": 1228.9921167841103,
|
|
"ci": [
|
|
1226.4119506530621,
|
|
1231.6278206433735
|
|
],
|
|
"init_elo": 1231.0
|
|
},
|
|
"Meta-Llama-3-70B-Instruct": {
|
|
"avg": 1212.4152123911208,
|
|
"std": 1.498030954624743,
|
|
"median": 1212.2115418892133,
|
|
"ci": [
|
|
1209.8207040543568,
|
|
1215.170069023085
|
|
],
|
|
"init_elo": 1214.0
|
|
},
|
|
"gemini-1.5-flash": {
|
|
"avg": 1212.4147496696253,
|
|
"std": 1.4061413697456984,
|
|
"median": 1212.4859867016958,
|
|
"ci": [
|
|
1209.6602222201316,
|
|
1214.824624789511
|
|
],
|
|
"init_elo": 1214.0
|
|
},
|
|
"claude-3-sonnet-20240229": {
|
|
"avg": 1186.6779864127143,
|
|
"std": 1.7125727788850162,
|
|
"median": 1186.6586816675858,
|
|
"ci": [
|
|
1183.3566920284509,
|
|
1189.5527764037165
|
|
],
|
|
"init_elo": 1188.0
|
|
},
|
|
"Qwen2-72B-Instruct": {
|
|
"avg": 1183.2083620205487,
|
|
"std": 1.1208589506286806,
|
|
"median": 1183.0564127076527,
|
|
"ci": [
|
|
1181.4458076352332,
|
|
1185.6620065263976
|
|
],
|
|
"init_elo": 1184.0
|
|
},
|
|
"deepseekv2-chat": {
|
|
"avg": 1181.3512373275005,
|
|
"std": 6.990536897986214,
|
|
"median": 1181.0381413379328,
|
|
"ci": [
|
|
1169.1323070723188,
|
|
1193.868039650277
|
|
],
|
|
"init_elo": "-"
|
|
},
|
|
"reka-core-20240501": {
|
|
"avg": 1175.8069269678774,
|
|
"std": 1.2519401647883974,
|
|
"median": 1175.62151163109,
|
|
"ci": [
|
|
1173.9365625674245,
|
|
1178.9063929215179
|
|
],
|
|
"init_elo": 1176.0
|
|
},
|
|
"claude-3-haiku-20240307": {
|
|
"avg": 1168.8974081263798,
|
|
"std": 1.1215428334600845,
|
|
"median": 1168.8709887818168,
|
|
"ci": [
|
|
1166.8330313790777,
|
|
1171.483155644723
|
|
],
|
|
"init_elo": 1170.0
|
|
},
|
|
"mistral-large-2402": {
|
|
"avg": 1157.9363805153466,
|
|
"std": 1.3762884725537763,
|
|
"median": 1157.8021190566649,
|
|
"ci": [
|
|
1155.29204975145,
|
|
1161.523039054541
|
|
],
|
|
"init_elo": 1158.0
|
|
},
|
|
"Yi-1.5-34B-Chat": {
|
|
"avg": 1155.5476484709307,
|
|
"std": 1.2039660997158363,
|
|
"median": 1155.4512231952376,
|
|
"ci": [
|
|
1153.5930771618048,
|
|
1158.5545179070257
|
|
],
|
|
"init_elo": 1155.0
|
|
},
|
|
"command-r-plus": {
|
|
"avg": 1153.5589462723992,
|
|
"std": 1.5088896391740247,
|
|
"median": 1153.5839084845304,
|
|
"ci": [
|
|
1150.5968000200196,
|
|
1156.0507478467523
|
|
],
|
|
"init_elo": 1154.0
|
|
},
|
|
"Yi-1.5-9B-Chat": {
|
|
"avg": 1151.4595024847079,
|
|
"std": 6.316247015184975,
|
|
"median": 1151.4942343976631,
|
|
"ci": [
|
|
1140.0044308304396,
|
|
1163.0035301779624
|
|
],
|
|
"init_elo": "-"
|
|
},
|
|
"Llama-3-Instruct-8B-SimPO": {
|
|
"avg": 1148.6033692624667,
|
|
"std": 6.735312621989667,
|
|
"median": 1148.3269165874326,
|
|
"ci": [
|
|
1134.695921881885,
|
|
1160.2115093380712
|
|
],
|
|
"init_elo": "-"
|
|
},
|
|
"Meta-Llama-3-8B-Instruct": {
|
|
"avg": 1145.2071346178084,
|
|
"std": 1.3713227805585726,
|
|
"median": 1145.2685919458645,
|
|
"ci": [
|
|
1142.6050825433992,
|
|
1147.6512317181941
|
|
],
|
|
"init_elo": 1146.0
|
|
},
|
|
"Qwen1.5-72B-Chat": {
|
|
"avg": 1143.7048806748173,
|
|
"std": 1.1704245420978383,
|
|
"median": 1143.5103633309882,
|
|
"ci": [
|
|
1141.7746209400004,
|
|
1146.3324255056766
|
|
],
|
|
"init_elo": 1143.0
|
|
},
|
|
"Qwen1.5-72B-Chat-greedy": {
|
|
"avg": 1143.3529159940354,
|
|
"std": 1.1316456522489309,
|
|
"median": 1143.2994181791155,
|
|
"ci": [
|
|
1140.9802827758472,
|
|
1145.5866156943823
|
|
],
|
|
"init_elo": 1143.0
|
|
},
|
|
"Llama-3-Instruct-8B-SimPO-ExPO": {
|
|
"avg": 1142.9923103610195,
|
|
"std": 6.297797340329856,
|
|
"median": 1143.2226408878514,
|
|
"ci": [
|
|
1130.9288591718341,
|
|
1155.5172709155747
|
|
],
|
|
"init_elo": "-"
|
|
},
|
|
"SELM-Llama-3-8B-Instruct-iter-3": {
|
|
"avg": 1142.8165606937189,
|
|
"std": 6.425097916284559,
|
|
"median": 1143.5103007914759,
|
|
"ci": [
|
|
1131.4575109966902,
|
|
1154.7025059164976
|
|
],
|
|
"init_elo": "-"
|
|
},
|
|
"Hermes-2-Theta-Llama-3-8B": {
|
|
"avg": 1131.9741021571772,
|
|
"std": 6.2988095078131545,
|
|
"median": 1131.715201462121,
|
|
"ci": [
|
|
1120.5709015440887,
|
|
1143.3376723620443
|
|
],
|
|
"init_elo": "-"
|
|
},
|
|
"Starling-LM-7B-beta-ExPO": {
|
|
"avg": 1131.2460888608446,
|
|
"std": 6.358825034755664,
|
|
"median": 1131.1432365429093,
|
|
"ci": [
|
|
1117.5140726911777,
|
|
1142.0076870139271
|
|
],
|
|
"init_elo": "-"
|
|
},
|
|
"reka-flash-20240226": {
|
|
"avg": 1129.196720138471,
|
|
"std": 1.041637195139017,
|
|
"median": 1129.1932509177548,
|
|
"ci": [
|
|
1127.2602690306512,
|
|
1131.1155846603706
|
|
],
|
|
"init_elo": 1129.0
|
|
},
|
|
"Phi-3-medium-128k-instruct": {
|
|
"avg": 1125.861584071278,
|
|
"std": 7.531870410762536,
|
|
"median": 1126.2057589987335,
|
|
"ci": [
|
|
1111.257900015648,
|
|
1138.069924237086
|
|
],
|
|
"init_elo": "-"
|
|
},
|
|
"SELM-Zephyr-7B-iter-3": {
|
|
"avg": 1117.3991085294415,
|
|
"std": 7.556022600416634,
|
|
"median": 1117.494878636929,
|
|
"ci": [
|
|
1104.2880740625908,
|
|
1133.9885703285445
|
|
],
|
|
"init_elo": "-"
|
|
},
|
|
"neo_7b_instruct_v0.1": {
|
|
"avg": 1115.9868598461735,
|
|
"std": 5.989615296826751,
|
|
"median": 1115.7656764450057,
|
|
"ci": [
|
|
1105.174330047876,
|
|
1127.1462241971553
|
|
],
|
|
"init_elo": "-"
|
|
},
|
|
"Starling-LM-7B-beta": {
|
|
"avg": 1114.8142468073672,
|
|
"std": 1.0626591977215807,
|
|
"median": 1114.6590274437126,
|
|
"ci": [
|
|
1112.9699028952473,
|
|
1117.5517193992573
|
|
],
|
|
"init_elo": 1114.0
|
|
},
|
|
"Mixtral-8x7B-Instruct-v0.1": {
|
|
"avg": 1114.7317828029732,
|
|
"std": 1.2598820838171276,
|
|
"median": 1114.812919494576,
|
|
"ci": [
|
|
1111.7583194653573,
|
|
1117.291252811192
|
|
],
|
|
"init_elo": 1114.0
|
|
},
|
|
"dbrx-instruct": {
|
|
"avg": 1111.9748460530966,
|
|
"std": 1.4079338611700414,
|
|
"median": 1111.8820763633144,
|
|
"ci": [
|
|
1109.7253389092896,
|
|
1114.6928834993737
|
|
],
|
|
"init_elo": 1111.0
|
|
},
|
|
"command-r": {
|
|
"avg": 1108.0842633743368,
|
|
"std": 1.3803943571569888,
|
|
"median": 1108.096877411685,
|
|
"ci": [
|
|
1105.263211047986,
|
|
1110.6159002657687
|
|
],
|
|
"init_elo": 1107.0
|
|
},
|
|
"gpt-3.5-turbo-0125": {
|
|
"avg": 1107.6671892927654,
|
|
"std": 1.2739448254203414,
|
|
"median": 1107.7546805436284,
|
|
"ci": [
|
|
1104.8486781256638,
|
|
1109.8658208350357
|
|
],
|
|
"init_elo": 1107.0
|
|
},
|
|
"Yi-1.5-6B-Chat": {
|
|
"avg": 1106.2412598048347,
|
|
"std": 7.704752826865893,
|
|
"median": 1106.2092954130294,
|
|
"ci": [
|
|
1092.4319306702778,
|
|
1122.825105702003
|
|
],
|
|
"init_elo": "-"
|
|
},
|
|
"reka-edge": {
|
|
"avg": 1103.179045951335,
|
|
"std": 7.916878177358469,
|
|
"median": 1102.3981901320476,
|
|
"ci": [
|
|
1088.8697547806873,
|
|
1117.8751638108683
|
|
],
|
|
"init_elo": "-"
|
|
},
|
|
"tulu-2-dpo-70b": {
|
|
"avg": 1102.1998887842033,
|
|
"std": 1.1951855881011406,
|
|
"median": 1102.090905366526,
|
|
"ci": [
|
|
1099.7863269951001,
|
|
1104.637425048166
|
|
],
|
|
"init_elo": 1101.0
|
|
},
|
|
"Yi-34B-Chat": {
|
|
"avg": 1099.0,
|
|
"std": 0.0,
|
|
"median": 1099.0,
|
|
"ci": [
|
|
1099.0,
|
|
1099.0
|
|
],
|
|
"init_elo": 1099.0
|
|
},
|
|
"Mistral-7B-Instruct-v0.2": {
|
|
"avg": 1074.9416187154252,
|
|
"std": 1.383147835558675,
|
|
"median": 1074.984061457739,
|
|
"ci": [
|
|
1072.21388892316,
|
|
1077.5698899802662
|
|
],
|
|
"init_elo": 1073.0
|
|
},
|
|
"Llama-2-70b-chat-hf": {
|
|
"avg": 1073.6560538701947,
|
|
"std": 1.265180871566298,
|
|
"median": 1073.5805077385712,
|
|
"ci": [
|
|
1071.0672456040338,
|
|
1076.1395722264592
|
|
],
|
|
"init_elo": 1072.0
|
|
},
|
|
"Qwen1.5-7B-Chat": {
|
|
"avg": 1060.0678287427136,
|
|
"std": 1.1779176675011547,
|
|
"median": 1059.9658591708385,
|
|
"ci": [
|
|
1058.0325190950812,
|
|
1062.886766466951
|
|
],
|
|
"init_elo": 1058.0
|
|
},
|
|
"Llama-2-13b-chat-hf": {
|
|
"avg": 1050.0,
|
|
"std": 0.0,
|
|
"median": 1050.0,
|
|
"ci": [
|
|
1050.0,
|
|
1050.0
|
|
],
|
|
"init_elo": 1050.0
|
|
},
|
|
"Nous-Hermes-2-Mixtral-8x7B-DPO": {
|
|
"avg": 1049.9721781840638,
|
|
"std": 1.4857108885629293,
|
|
"median": 1049.838604107716,
|
|
"ci": [
|
|
1047.1044745763118,
|
|
1052.627249921872
|
|
],
|
|
"init_elo": 1047.0
|
|
},
|
|
"gemma-7b-it": {
|
|
"avg": 1046.9967531381226,
|
|
"std": 1.7967818694635527,
|
|
"median": 1046.831484384929,
|
|
"ci": [
|
|
1044.0855512072167,
|
|
1050.4260592923983
|
|
],
|
|
"init_elo": 1047.0
|
|
},
|
|
"Phi-3-mini-128k-instruct": {
|
|
"avg": 1040.5487365931187,
|
|
"std": 1.350444832487063,
|
|
"median": 1040.4882586375547,
|
|
"ci": [
|
|
1037.9477092691454,
|
|
1043.6571974994822
|
|
],
|
|
"init_elo": 1038.0
|
|
},
|
|
"zephyr-7b-beta": {
|
|
"avg": 1029.0,
|
|
"std": 0.0,
|
|
"median": 1029.0,
|
|
"ci": [
|
|
1029.0,
|
|
1029.0
|
|
],
|
|
"init_elo": 1029.0
|
|
},
|
|
"Llama-2-7b-chat-hf": {
|
|
"avg": 1014.945546075615,
|
|
"std": 1.9235970777315843,
|
|
"median": 1015.079486364647,
|
|
"ci": [
|
|
1010.7689037212001,
|
|
1018.1981078220101
|
|
],
|
|
"init_elo": 1013.0
|
|
},
|
|
"Mistral-7B-Instruct-v0.1": {
|
|
"avg": 1006.0,
|
|
"std": 0.0,
|
|
"median": 1006.0,
|
|
"ci": [
|
|
1006.0,
|
|
1006.0
|
|
],
|
|
"init_elo": 1006.0
|
|
},
|
|
"gemma-2b-it": {
|
|
"avg": 977.3639908445978,
|
|
"std": 1.9727565465298296,
|
|
"median": 977.4262867404013,
|
|
"ci": [
|
|
973.2194970508418,
|
|
980.9750973833403
|
|
],
|
|
"init_elo": 978.0
|
|
}
|
|
}
|