BLEUBERI/eval/WildBench/leaderboard/data_dir/wb_elo.txt
2025-06-04 20:36:43 +00:00

497 lines
12 KiB
Text

>>> Config: WB Elo with K=4 and num_rounds=100; margin=3; loo=-1; seed=42; init_elo=37 models; tie_margin=1; dynamic=True;
>>> Found 1628324 votes
>>> Found 260970 non-tie votes
>>> Found 865974 votes that are not useful for WB Elo
>>> WB Elo with K=4 and num_rounds=100
{
"gpt-4o-2024-05-13": {
"avg": 1280.202814809682,
"std": 1.7727854602931103,
"median": 1280.0755628336087,
"ci": [
1277.2566788414688,
1284.097658389827
],
"init_elo": 1283.0
},
"gemini-1.5-pro": {
"avg": 1251.1880383868893,
"std": 1.5780723382654294,
"median": 1251.1898141300676,
"ci": [
1248.5834579958444,
1254.2900137732495
],
"init_elo": 1254.0
},
"gpt-4-turbo-2024-04-09": {
"avg": 1247.1721701450072,
"std": 1.6723361362495952,
"median": 1247.2617382419635,
"ci": [
1244.337626367497,
1250.2210299024766
],
"init_elo": 1249.0
},
"gpt-4-0125-preview": {
"avg": 1236.942691199828,
"std": 1.7024496219370988,
"median": 1236.9771116554643,
"ci": [
1233.8772166478332,
1240.3872849986412
],
"init_elo": 1239.0
},
"yi-large": {
"avg": 1231.4585490048169,
"std": 1.3514998766614368,
"median": 1231.3649785589523,
"ci": [
1229.4670985950543,
1234.2823546823827
],
"init_elo": 1234.0
},
"claude-3-opus-20240229": {
"avg": 1229.0800447966017,
"std": 1.3981478109920624,
"median": 1228.9921167841103,
"ci": [
1226.4119506530621,
1231.6278206433735
],
"init_elo": 1231.0
},
"Meta-Llama-3-70B-Instruct": {
"avg": 1212.4152123911208,
"std": 1.498030954624743,
"median": 1212.2115418892133,
"ci": [
1209.8207040543568,
1215.170069023085
],
"init_elo": 1214.0
},
"gemini-1.5-flash": {
"avg": 1212.4147496696253,
"std": 1.4061413697456984,
"median": 1212.4859867016958,
"ci": [
1209.6602222201316,
1214.824624789511
],
"init_elo": 1214.0
},
"claude-3-sonnet-20240229": {
"avg": 1186.6779864127143,
"std": 1.7125727788850162,
"median": 1186.6586816675858,
"ci": [
1183.3566920284509,
1189.5527764037165
],
"init_elo": 1188.0
},
"Qwen2-72B-Instruct": {
"avg": 1183.2083620205487,
"std": 1.1208589506286806,
"median": 1183.0564127076527,
"ci": [
1181.4458076352332,
1185.6620065263976
],
"init_elo": 1184.0
},
"deepseekv2-chat": {
"avg": 1181.3512373275005,
"std": 6.990536897986214,
"median": 1181.0381413379328,
"ci": [
1169.1323070723188,
1193.868039650277
],
"init_elo": "-"
},
"reka-core-20240501": {
"avg": 1175.8069269678774,
"std": 1.2519401647883974,
"median": 1175.62151163109,
"ci": [
1173.9365625674245,
1178.9063929215179
],
"init_elo": 1176.0
},
"claude-3-haiku-20240307": {
"avg": 1168.8974081263798,
"std": 1.1215428334600845,
"median": 1168.8709887818168,
"ci": [
1166.8330313790777,
1171.483155644723
],
"init_elo": 1170.0
},
"mistral-large-2402": {
"avg": 1157.9363805153466,
"std": 1.3762884725537763,
"median": 1157.8021190566649,
"ci": [
1155.29204975145,
1161.523039054541
],
"init_elo": 1158.0
},
"Yi-1.5-34B-Chat": {
"avg": 1155.5476484709307,
"std": 1.2039660997158363,
"median": 1155.4512231952376,
"ci": [
1153.5930771618048,
1158.5545179070257
],
"init_elo": 1155.0
},
"command-r-plus": {
"avg": 1153.5589462723992,
"std": 1.5088896391740247,
"median": 1153.5839084845304,
"ci": [
1150.5968000200196,
1156.0507478467523
],
"init_elo": 1154.0
},
"Yi-1.5-9B-Chat": {
"avg": 1151.4595024847079,
"std": 6.316247015184975,
"median": 1151.4942343976631,
"ci": [
1140.0044308304396,
1163.0035301779624
],
"init_elo": "-"
},
"Llama-3-Instruct-8B-SimPO": {
"avg": 1148.6033692624667,
"std": 6.735312621989667,
"median": 1148.3269165874326,
"ci": [
1134.695921881885,
1160.2115093380712
],
"init_elo": "-"
},
"Meta-Llama-3-8B-Instruct": {
"avg": 1145.2071346178084,
"std": 1.3713227805585726,
"median": 1145.2685919458645,
"ci": [
1142.6050825433992,
1147.6512317181941
],
"init_elo": 1146.0
},
"Qwen1.5-72B-Chat": {
"avg": 1143.7048806748173,
"std": 1.1704245420978383,
"median": 1143.5103633309882,
"ci": [
1141.7746209400004,
1146.3324255056766
],
"init_elo": 1143.0
},
"Qwen1.5-72B-Chat-greedy": {
"avg": 1143.3529159940354,
"std": 1.1316456522489309,
"median": 1143.2994181791155,
"ci": [
1140.9802827758472,
1145.5866156943823
],
"init_elo": 1143.0
},
"Llama-3-Instruct-8B-SimPO-ExPO": {
"avg": 1142.9923103610195,
"std": 6.297797340329856,
"median": 1143.2226408878514,
"ci": [
1130.9288591718341,
1155.5172709155747
],
"init_elo": "-"
},
"SELM-Llama-3-8B-Instruct-iter-3": {
"avg": 1142.8165606937189,
"std": 6.425097916284559,
"median": 1143.5103007914759,
"ci": [
1131.4575109966902,
1154.7025059164976
],
"init_elo": "-"
},
"Hermes-2-Theta-Llama-3-8B": {
"avg": 1131.9741021571772,
"std": 6.2988095078131545,
"median": 1131.715201462121,
"ci": [
1120.5709015440887,
1143.3376723620443
],
"init_elo": "-"
},
"Starling-LM-7B-beta-ExPO": {
"avg": 1131.2460888608446,
"std": 6.358825034755664,
"median": 1131.1432365429093,
"ci": [
1117.5140726911777,
1142.0076870139271
],
"init_elo": "-"
},
"reka-flash-20240226": {
"avg": 1129.196720138471,
"std": 1.041637195139017,
"median": 1129.1932509177548,
"ci": [
1127.2602690306512,
1131.1155846603706
],
"init_elo": 1129.0
},
"Phi-3-medium-128k-instruct": {
"avg": 1125.861584071278,
"std": 7.531870410762536,
"median": 1126.2057589987335,
"ci": [
1111.257900015648,
1138.069924237086
],
"init_elo": "-"
},
"SELM-Zephyr-7B-iter-3": {
"avg": 1117.3991085294415,
"std": 7.556022600416634,
"median": 1117.494878636929,
"ci": [
1104.2880740625908,
1133.9885703285445
],
"init_elo": "-"
},
"neo_7b_instruct_v0.1": {
"avg": 1115.9868598461735,
"std": 5.989615296826751,
"median": 1115.7656764450057,
"ci": [
1105.174330047876,
1127.1462241971553
],
"init_elo": "-"
},
"Starling-LM-7B-beta": {
"avg": 1114.8142468073672,
"std": 1.0626591977215807,
"median": 1114.6590274437126,
"ci": [
1112.9699028952473,
1117.5517193992573
],
"init_elo": 1114.0
},
"Mixtral-8x7B-Instruct-v0.1": {
"avg": 1114.7317828029732,
"std": 1.2598820838171276,
"median": 1114.812919494576,
"ci": [
1111.7583194653573,
1117.291252811192
],
"init_elo": 1114.0
},
"dbrx-instruct": {
"avg": 1111.9748460530966,
"std": 1.4079338611700414,
"median": 1111.8820763633144,
"ci": [
1109.7253389092896,
1114.6928834993737
],
"init_elo": 1111.0
},
"command-r": {
"avg": 1108.0842633743368,
"std": 1.3803943571569888,
"median": 1108.096877411685,
"ci": [
1105.263211047986,
1110.6159002657687
],
"init_elo": 1107.0
},
"gpt-3.5-turbo-0125": {
"avg": 1107.6671892927654,
"std": 1.2739448254203414,
"median": 1107.7546805436284,
"ci": [
1104.8486781256638,
1109.8658208350357
],
"init_elo": 1107.0
},
"Yi-1.5-6B-Chat": {
"avg": 1106.2412598048347,
"std": 7.704752826865893,
"median": 1106.2092954130294,
"ci": [
1092.4319306702778,
1122.825105702003
],
"init_elo": "-"
},
"reka-edge": {
"avg": 1103.179045951335,
"std": 7.916878177358469,
"median": 1102.3981901320476,
"ci": [
1088.8697547806873,
1117.8751638108683
],
"init_elo": "-"
},
"tulu-2-dpo-70b": {
"avg": 1102.1998887842033,
"std": 1.1951855881011406,
"median": 1102.090905366526,
"ci": [
1099.7863269951001,
1104.637425048166
],
"init_elo": 1101.0
},
"Yi-34B-Chat": {
"avg": 1099.0,
"std": 0.0,
"median": 1099.0,
"ci": [
1099.0,
1099.0
],
"init_elo": 1099.0
},
"Mistral-7B-Instruct-v0.2": {
"avg": 1074.9416187154252,
"std": 1.383147835558675,
"median": 1074.984061457739,
"ci": [
1072.21388892316,
1077.5698899802662
],
"init_elo": 1073.0
},
"Llama-2-70b-chat-hf": {
"avg": 1073.6560538701947,
"std": 1.265180871566298,
"median": 1073.5805077385712,
"ci": [
1071.0672456040338,
1076.1395722264592
],
"init_elo": 1072.0
},
"Qwen1.5-7B-Chat": {
"avg": 1060.0678287427136,
"std": 1.1779176675011547,
"median": 1059.9658591708385,
"ci": [
1058.0325190950812,
1062.886766466951
],
"init_elo": 1058.0
},
"Llama-2-13b-chat-hf": {
"avg": 1050.0,
"std": 0.0,
"median": 1050.0,
"ci": [
1050.0,
1050.0
],
"init_elo": 1050.0
},
"Nous-Hermes-2-Mixtral-8x7B-DPO": {
"avg": 1049.9721781840638,
"std": 1.4857108885629293,
"median": 1049.838604107716,
"ci": [
1047.1044745763118,
1052.627249921872
],
"init_elo": 1047.0
},
"gemma-7b-it": {
"avg": 1046.9967531381226,
"std": 1.7967818694635527,
"median": 1046.831484384929,
"ci": [
1044.0855512072167,
1050.4260592923983
],
"init_elo": 1047.0
},
"Phi-3-mini-128k-instruct": {
"avg": 1040.5487365931187,
"std": 1.350444832487063,
"median": 1040.4882586375547,
"ci": [
1037.9477092691454,
1043.6571974994822
],
"init_elo": 1038.0
},
"zephyr-7b-beta": {
"avg": 1029.0,
"std": 0.0,
"median": 1029.0,
"ci": [
1029.0,
1029.0
],
"init_elo": 1029.0
},
"Llama-2-7b-chat-hf": {
"avg": 1014.945546075615,
"std": 1.9235970777315843,
"median": 1015.079486364647,
"ci": [
1010.7689037212001,
1018.1981078220101
],
"init_elo": 1013.0
},
"Mistral-7B-Instruct-v0.1": {
"avg": 1006.0,
"std": 0.0,
"median": 1006.0,
"ci": [
1006.0,
1006.0
],
"init_elo": 1006.0
},
"gemma-2b-it": {
"avg": 977.3639908445978,
"std": 1.9727565465298296,
"median": 977.4262867404013,
"ci": [
973.2194970508418,
980.9750973833403
],
"init_elo": 978.0
}
}