>>> Config: WB Elo with K=4 and num_rounds=100; margin=3; loo=-1; seed=42; init_elo=37 models; tie_margin=1; dynamic=True; >>> Found 1628324 votes >>> Found 260970 non-tie votes >>> Found 865974 votes that are not useful for WB Elo >>> WB Elo with K=4 and num_rounds=100 { "gpt-4o-2024-05-13": { "avg": 1280.202814809682, "std": 1.7727854602931103, "median": 1280.0755628336087, "ci": [ 1277.2566788414688, 1284.097658389827 ], "init_elo": 1283.0 }, "gemini-1.5-pro": { "avg": 1251.1880383868893, "std": 1.5780723382654294, "median": 1251.1898141300676, "ci": [ 1248.5834579958444, 1254.2900137732495 ], "init_elo": 1254.0 }, "gpt-4-turbo-2024-04-09": { "avg": 1247.1721701450072, "std": 1.6723361362495952, "median": 1247.2617382419635, "ci": [ 1244.337626367497, 1250.2210299024766 ], "init_elo": 1249.0 }, "gpt-4-0125-preview": { "avg": 1236.942691199828, "std": 1.7024496219370988, "median": 1236.9771116554643, "ci": [ 1233.8772166478332, 1240.3872849986412 ], "init_elo": 1239.0 }, "yi-large": { "avg": 1231.4585490048169, "std": 1.3514998766614368, "median": 1231.3649785589523, "ci": [ 1229.4670985950543, 1234.2823546823827 ], "init_elo": 1234.0 }, "claude-3-opus-20240229": { "avg": 1229.0800447966017, "std": 1.3981478109920624, "median": 1228.9921167841103, "ci": [ 1226.4119506530621, 1231.6278206433735 ], "init_elo": 1231.0 }, "Meta-Llama-3-70B-Instruct": { "avg": 1212.4152123911208, "std": 1.498030954624743, "median": 1212.2115418892133, "ci": [ 1209.8207040543568, 1215.170069023085 ], "init_elo": 1214.0 }, "gemini-1.5-flash": { "avg": 1212.4147496696253, "std": 1.4061413697456984, "median": 1212.4859867016958, "ci": [ 1209.6602222201316, 1214.824624789511 ], "init_elo": 1214.0 }, "claude-3-sonnet-20240229": { "avg": 1186.6779864127143, "std": 1.7125727788850162, "median": 1186.6586816675858, "ci": [ 1183.3566920284509, 1189.5527764037165 ], "init_elo": 1188.0 }, "Qwen2-72B-Instruct": { "avg": 1183.2083620205487, "std": 1.1208589506286806, "median": 1183.0564127076527, "ci": [ 1181.4458076352332, 1185.6620065263976 ], "init_elo": 1184.0 }, "deepseekv2-chat": { "avg": 1181.3512373275005, "std": 6.990536897986214, "median": 1181.0381413379328, "ci": [ 1169.1323070723188, 1193.868039650277 ], "init_elo": "-" }, "reka-core-20240501": { "avg": 1175.8069269678774, "std": 1.2519401647883974, "median": 1175.62151163109, "ci": [ 1173.9365625674245, 1178.9063929215179 ], "init_elo": 1176.0 }, "claude-3-haiku-20240307": { "avg": 1168.8974081263798, "std": 1.1215428334600845, "median": 1168.8709887818168, "ci": [ 1166.8330313790777, 1171.483155644723 ], "init_elo": 1170.0 }, "mistral-large-2402": { "avg": 1157.9363805153466, "std": 1.3762884725537763, "median": 1157.8021190566649, "ci": [ 1155.29204975145, 1161.523039054541 ], "init_elo": 1158.0 }, "Yi-1.5-34B-Chat": { "avg": 1155.5476484709307, "std": 1.2039660997158363, "median": 1155.4512231952376, "ci": [ 1153.5930771618048, 1158.5545179070257 ], "init_elo": 1155.0 }, "command-r-plus": { "avg": 1153.5589462723992, "std": 1.5088896391740247, "median": 1153.5839084845304, "ci": [ 1150.5968000200196, 1156.0507478467523 ], "init_elo": 1154.0 }, "Yi-1.5-9B-Chat": { "avg": 1151.4595024847079, "std": 6.316247015184975, "median": 1151.4942343976631, "ci": [ 1140.0044308304396, 1163.0035301779624 ], "init_elo": "-" }, "Llama-3-Instruct-8B-SimPO": { "avg": 1148.6033692624667, "std": 6.735312621989667, "median": 1148.3269165874326, "ci": [ 1134.695921881885, 1160.2115093380712 ], "init_elo": "-" }, "Meta-Llama-3-8B-Instruct": { "avg": 1145.2071346178084, "std": 1.3713227805585726, "median": 1145.2685919458645, "ci": [ 1142.6050825433992, 1147.6512317181941 ], "init_elo": 1146.0 }, "Qwen1.5-72B-Chat": { "avg": 1143.7048806748173, "std": 1.1704245420978383, "median": 1143.5103633309882, "ci": [ 1141.7746209400004, 1146.3324255056766 ], "init_elo": 1143.0 }, "Qwen1.5-72B-Chat-greedy": { "avg": 1143.3529159940354, "std": 1.1316456522489309, "median": 1143.2994181791155, "ci": [ 1140.9802827758472, 1145.5866156943823 ], "init_elo": 1143.0 }, "Llama-3-Instruct-8B-SimPO-ExPO": { "avg": 1142.9923103610195, "std": 6.297797340329856, "median": 1143.2226408878514, "ci": [ 1130.9288591718341, 1155.5172709155747 ], "init_elo": "-" }, "SELM-Llama-3-8B-Instruct-iter-3": { "avg": 1142.8165606937189, "std": 6.425097916284559, "median": 1143.5103007914759, "ci": [ 1131.4575109966902, 1154.7025059164976 ], "init_elo": "-" }, "Hermes-2-Theta-Llama-3-8B": { "avg": 1131.9741021571772, "std": 6.2988095078131545, "median": 1131.715201462121, "ci": [ 1120.5709015440887, 1143.3376723620443 ], "init_elo": "-" }, "Starling-LM-7B-beta-ExPO": { "avg": 1131.2460888608446, "std": 6.358825034755664, "median": 1131.1432365429093, "ci": [ 1117.5140726911777, 1142.0076870139271 ], "init_elo": "-" }, "reka-flash-20240226": { "avg": 1129.196720138471, "std": 1.041637195139017, "median": 1129.1932509177548, "ci": [ 1127.2602690306512, 1131.1155846603706 ], "init_elo": 1129.0 }, "Phi-3-medium-128k-instruct": { "avg": 1125.861584071278, "std": 7.531870410762536, "median": 1126.2057589987335, "ci": [ 1111.257900015648, 1138.069924237086 ], "init_elo": "-" }, "SELM-Zephyr-7B-iter-3": { "avg": 1117.3991085294415, "std": 7.556022600416634, "median": 1117.494878636929, "ci": [ 1104.2880740625908, 1133.9885703285445 ], "init_elo": "-" }, "neo_7b_instruct_v0.1": { "avg": 1115.9868598461735, "std": 5.989615296826751, "median": 1115.7656764450057, "ci": [ 1105.174330047876, 1127.1462241971553 ], "init_elo": "-" }, "Starling-LM-7B-beta": { "avg": 1114.8142468073672, "std": 1.0626591977215807, "median": 1114.6590274437126, "ci": [ 1112.9699028952473, 1117.5517193992573 ], "init_elo": 1114.0 }, "Mixtral-8x7B-Instruct-v0.1": { "avg": 1114.7317828029732, "std": 1.2598820838171276, "median": 1114.812919494576, "ci": [ 1111.7583194653573, 1117.291252811192 ], "init_elo": 1114.0 }, "dbrx-instruct": { "avg": 1111.9748460530966, "std": 1.4079338611700414, "median": 1111.8820763633144, "ci": [ 1109.7253389092896, 1114.6928834993737 ], "init_elo": 1111.0 }, "command-r": { "avg": 1108.0842633743368, "std": 1.3803943571569888, "median": 1108.096877411685, "ci": [ 1105.263211047986, 1110.6159002657687 ], "init_elo": 1107.0 }, "gpt-3.5-turbo-0125": { "avg": 1107.6671892927654, "std": 1.2739448254203414, "median": 1107.7546805436284, "ci": [ 1104.8486781256638, 1109.8658208350357 ], "init_elo": 1107.0 }, "Yi-1.5-6B-Chat": { "avg": 1106.2412598048347, "std": 7.704752826865893, "median": 1106.2092954130294, "ci": [ 1092.4319306702778, 1122.825105702003 ], "init_elo": "-" }, "reka-edge": { "avg": 1103.179045951335, "std": 7.916878177358469, "median": 1102.3981901320476, "ci": [ 1088.8697547806873, 1117.8751638108683 ], "init_elo": "-" }, "tulu-2-dpo-70b": { "avg": 1102.1998887842033, "std": 1.1951855881011406, "median": 1102.090905366526, "ci": [ 1099.7863269951001, 1104.637425048166 ], "init_elo": 1101.0 }, "Yi-34B-Chat": { "avg": 1099.0, "std": 0.0, "median": 1099.0, "ci": [ 1099.0, 1099.0 ], "init_elo": 1099.0 }, "Mistral-7B-Instruct-v0.2": { "avg": 1074.9416187154252, "std": 1.383147835558675, "median": 1074.984061457739, "ci": [ 1072.21388892316, 1077.5698899802662 ], "init_elo": 1073.0 }, "Llama-2-70b-chat-hf": { "avg": 1073.6560538701947, "std": 1.265180871566298, "median": 1073.5805077385712, "ci": [ 1071.0672456040338, 1076.1395722264592 ], "init_elo": 1072.0 }, "Qwen1.5-7B-Chat": { "avg": 1060.0678287427136, "std": 1.1779176675011547, "median": 1059.9658591708385, "ci": [ 1058.0325190950812, 1062.886766466951 ], "init_elo": 1058.0 }, "Llama-2-13b-chat-hf": { "avg": 1050.0, "std": 0.0, "median": 1050.0, "ci": [ 1050.0, 1050.0 ], "init_elo": 1050.0 }, "Nous-Hermes-2-Mixtral-8x7B-DPO": { "avg": 1049.9721781840638, "std": 1.4857108885629293, "median": 1049.838604107716, "ci": [ 1047.1044745763118, 1052.627249921872 ], "init_elo": 1047.0 }, "gemma-7b-it": { "avg": 1046.9967531381226, "std": 1.7967818694635527, "median": 1046.831484384929, "ci": [ 1044.0855512072167, 1050.4260592923983 ], "init_elo": 1047.0 }, "Phi-3-mini-128k-instruct": { "avg": 1040.5487365931187, "std": 1.350444832487063, "median": 1040.4882586375547, "ci": [ 1037.9477092691454, 1043.6571974994822 ], "init_elo": 1038.0 }, "zephyr-7b-beta": { "avg": 1029.0, "std": 0.0, "median": 1029.0, "ci": [ 1029.0, 1029.0 ], "init_elo": 1029.0 }, "Llama-2-7b-chat-hf": { "avg": 1014.945546075615, "std": 1.9235970777315843, "median": 1015.079486364647, "ci": [ 1010.7689037212001, 1018.1981078220101 ], "init_elo": 1013.0 }, "Mistral-7B-Instruct-v0.1": { "avg": 1006.0, "std": 0.0, "median": 1006.0, "ci": [ 1006.0, 1006.0 ], "init_elo": 1006.0 }, "gemma-2b-it": { "avg": 977.3639908445978, "std": 1.9727565465298296, "median": 977.4262867404013, "ci": [ 973.2194970508418, 980.9750973833403 ], "init_elo": 978.0 } }