col_rank col_model col_org col_score col_elo col_tokens
🥇 o3-high OpenAI 88.07(-1.98, +1.16) 1461(-30.54, +20.17) 1453
🥈 o3-low OpenAI 87.51(-1.22, +1.43) 1452(-18.75, +23.83) 1306
🥉 o3-medium OpenAI 85.25(-1.91, +1.42) 1419(-25.22, +20.38) 1432
4 o4-mini-high OpenAI 81.41(-1.49, +2.11) 1371(-16.58, +25.43) 1084
5 o4-mini-medium OpenAI 79.65(-1.70, +1.75) 1351(-17.66, +19.37) 1053
6 o4-mini-low OpenAI 76.38(-2.18, +2.17) 1318(-20.35, +21.63) 1020
7 deepseek-chat-v3-0324 (judge) Deepseek 59.5(-2.61, +2.50) 1181(-18.61, +18.25) 1085
8 gpt-4.1-2025-04-14 OpenAI 57.83(-2.45, +2.81) 1169(-17.30, +20.25) 949
9 quasar-alpha OpenAI 56.82(-2.52, +2.45) 1162(-17.73, +17.44) 909
10 claude-3.7-sonnet(thinking) Anthropic 55.62(-2.32, +2.81) 1153(-16.27, +19.93) 1248
11 gemini-2.5-pro-exp-03-25 Google 52.03(-3.46, +4.97) 1128(-24.06, +34.87) 2577
12 gemini-2.0-pro-exp-02-05 Google 51.76(-3.39, +3.78) 1126(-23.56, +26.45) 2372
13 chatgpt-4o-latest(2025-03-26) OpenAI 51.56(-2.91, +2.56) 1125(-20.26, +17.81) 1111
14 deepseek-r1 Deepseek 51.08(-2.74, +3.38) 1121(-19.01, +23.59) 1218
15 claude-3.7-sonnet (baseline) Anthropic 50.0(-0.00, +0.00) 1114(-0.00, +0.00) 1094
16 o3-mini-high OpenAI 49.71(-2.40, +2.49) 1112(-16.71, +17.33) 1265
17 o1-high OpenAI 49.4(-2.47, +2.49) 1110(-17.20, +17.26) 1512
18 gpt-4.5-preview OpenAI 48.67(-2.77, +3.27) 1105(-19.32, +22.75) 1040
19 o1-medium OpenAI 47.27(-2.68, +2.77) 1095(-18.73, +19.28) 1490
20 gpt-4.1-mini-2025-04-14 OpenAI 47.26(-2.15, +1.99) 1095(-15.02, +13.86) 1080
21 o1-low OpenAI 41.66(-2.28, +2.50) 1056(-16.42, +17.72) 1513
22 o3-mini-medium OpenAI 40.43(-1.95, +3.17) 1047(-14.20, +22.60) 1221
23 claude-3.5-sonnet Anthropic 38.98(-4.10, +2.27) 1036(-30.60, +16.40) 682
24 gemini-2.5-flash-preview(thinking) Google 38.81(-3.05, +3.37) 1035(-22.64, +24.33) 2521
25 grok-3-beta xAI 38.28(-2.55, +2.52) 1031(-19.02, +18.28) 1824
26 command-a Cohere 35.01(-2.70, +1.77) 1007(-20.97, +13.41) 1083
27 o3-mini-low OpenAI 34.46(-2.13, +2.30) 1002(-16.62, +17.49) 1205
28 qwen-max Alibaba 32.56(-1.53, +1.83) 987(-12.26, +14.29) 1420
29 gpt-4o-2024-11-20 OpenAI 31.47(-2.25, +1.77) 979(-18.47, +14.10) 1142
30 qwq-32b Alibaba 30.11(-2.26, +2.60) 968(-19.08, +20.96) 1517
31 gemini-2.0-flash-thinking-exp-01-21 Google 29.53(-1.90, +2.11) 963(-16.15, +17.27) 2497
32 grok-3-mini-beta xAI 28.31(-2.31, +2.15) 953(-20.30, +18.02) 2010
33 claude-3.5-haiku Anthropic 27.14(-3.03, +3.17) 942(-27.62, +26.91) 601
34 qwen-plus Alibaba 27.05(-1.89, +1.75) 942(-17.09, +15.09) 1346
35 claude-3.5-sonnet-20240620 Anthropic 25.38(-2.80, +3.17) 927(-26.66, +27.98) 628
36 gemini-2.5-flash-preview Google 24.71(-2.01, +1.86) 920(-19.29, +16.97) 2299
37 gemini-2.0-flash-001 (judge) Google 24.22(-2.07, +2.05) 916(-20.21, +18.86) 1901
38 deepseek-v3 Deepseek 23.49(-1.86, +1.80) 909(-18.52, +17.00) 1007
39 gemma-3-27b-it Google 20.9(-1.68, +1.97) 883(-18.25, +20.04) 1655
40 gpt-4.1-nano-2025-04-14 OpenAI 20.63(-1.88, +2.25) 880(-20.69, +22.94) 938
41 gemini-2.0-flash-lite-001 Google 18.66(-1.80, +1.78) 858(-21.51, +19.68) 2196
42 grok-2-1212 xAI 14.09(-1.66, +1.77) 800(-25.16, +24.11) 898
43 jamba-1.6-large AI21 12.99(-1.77, +1.18) 784(-28.99, +17.40) 863
44 nova-pro-v1 Amazon 12.36(-1.09, +1.47) 774(-18.15, +22.49) 905
45 gpt-4o-mini (judge) OpenAI 11.98(-1.33, +1.44) 768(-23.18, +22.51) 830
46 qwen-2.5-72b-instruct Alibaba 11.53(-1.45, +1.64) 760(-26.07, +26.33) 1097
47 llama-4-maverick Meta 11.52(-1.16, +1.48) 760(-20.69, +24.01) 920
48 mistral-large-2411 Mistral 10.65(-1.30, +1.08) 745(-25.14, +18.87) 906
49 minimax-01 MiniMax 10.55(-1.13, +1.03) 743(-21.80, +18.30) 1071
50 gpt-4-1106-preview OpenAI 9.76(-1.28, +1.38) 728(-26.79, +25.63) 840
51 gemma-2-27b-it Google 9.72(-1.13, +1.39) 727(-23.62, +25.94) 794
52 qwen2.5-32b-instruct Alibaba 7.02(-1.06, +1.37) 665(-30.60, +33.46) 795
53 command-r-plus-08-2024 Cohere 6.73(-0.88, +1.02) 657(-25.90, +26.54) 969
54 hermes-3-llama-3.1-405b NousResearch 6.53(-1.06, +1.22) 652(-32.77, +32.00) 771
55 mistral-small-3.1-24b-instruct-2503 Mistral 6.32(-0.79, +0.75) 646(-24.73, +20.67) 961
56 qwen-turbo Alibaba 6.19(-1.15, +1.22) 642(-37.88, +33.58) 623
57 llama-4-scout Meta 6.13(-0.84, +0.86) 640(-27.22, +24.58) 844
58 nova-lite-v1 Amazon 5.78(-0.88, +1.20) 629(-30.37, +34.97) 994
59 lfm-7b Liquid AI 5.53(-0.57, +1.01) 621(-20.06, +30.97) 1011
60 wizardlm-2-8x22b Microsoft 5.37(-0.69, +0.79) 616(-25.25, +25.45) 1028
61 hermes-3-llama-3.1-70b NousResearch 5.16(-0.82, +0.99) 608(-31.64, +32.20) 771
62 mistral-small-24b-instruct-2501 Mistral 4.94(-0.62, +0.76) 600(-24.30, +26.43) 998
63 gemma-2-9b-it Google 4.85(-0.86, +0.76) 597(-35.46, +26.72) 751
64 command-r-08-2024 Cohere 3.82(-0.82, +0.78) 554(-43.66, +33.84) 796
65 llama-3.3-70b-instruct Meta 3.75(-0.80, +1.03) 550(-43.09, +44.22) 809
66 nova-micro-v1 Amazon 3.74(-0.59, +0.79) 550(-30.98, +34.77) 927
67 lfm-40b Liquid AI 3.33(-0.75, +0.69) 529(-45.68, +33.98) 863
68 command-r7b-12-2024 Cohere 2.84(-0.53, +0.52) 500(-37.24, +29.86) 882
69 lfm-3b Liquid AI 2.19(-0.47, +0.41) 454(-42.91, +30.36) 776
70 qwen-2.5-7b-instruct Alibaba 1.75(-0.41, +0.44) 414(-46.96, +39.63) 844
71 olmo-2-0325-32b-instruct Allen AI 1.72(-0.39, +0.55) 411(-45.62, +49.18) 890
72 jamba-1.6-mini AI21 1.51(-0.35, +0.51) 388(-45.78, +52.05) 920
73 llama-3.1-nemotron-70b-instruct Meta 1.18(-0.23, +0.26) 344(-37.68, +35.10) 1854
74 llama-3.1-405b-instruct Meta 0.53(-0.16, +0.18) 205(-63.82, +51.62) 1735
75 llama-3.1-70b-instruct Meta 0.25(-0.09, +0.07) 72(-72.08, +45.43) 1923
76 llama-3.1-8b-instruct Meta 0.04(-0.02, +0.01) -268(-95.65, +52.71) 5081