🥇 |
o3-high |
OpenAI |
88.07(-1.98, +1.16) |
1461(-30.54, +20.17) |
1453 |
🥈 |
o3-low |
OpenAI |
87.51(-1.22, +1.43) |
1452(-18.75, +23.83) |
1306 |
🥉 |
o3-medium |
OpenAI |
85.25(-1.91, +1.42) |
1419(-25.22, +20.38) |
1432 |
4 |
o4-mini-high |
OpenAI |
81.41(-1.49, +2.11) |
1371(-16.58, +25.43) |
1084 |
5 |
o4-mini-medium |
OpenAI |
79.65(-1.70, +1.75) |
1351(-17.66, +19.37) |
1053 |
6 |
o4-mini-low |
OpenAI |
76.38(-2.18, +2.17) |
1318(-20.35, +21.63) |
1020 |
7 |
deepseek-chat-v3-0324 (judge) |
Deepseek |
59.5(-2.61, +2.50) |
1181(-18.61, +18.25) |
1085 |
8 |
gpt-4.1-2025-04-14 |
OpenAI |
57.83(-2.45, +2.81) |
1169(-17.30, +20.25) |
949 |
9 |
quasar-alpha |
OpenAI |
56.82(-2.52, +2.45) |
1162(-17.73, +17.44) |
909 |
10 |
claude-3.7-sonnet(thinking) |
Anthropic |
55.62(-2.32, +2.81) |
1153(-16.27, +19.93) |
1248 |
11 |
gemini-2.5-pro-exp-03-25 |
Google |
52.03(-3.46, +4.97) |
1128(-24.06, +34.87) |
2577 |
12 |
gemini-2.0-pro-exp-02-05 |
Google |
51.76(-3.39, +3.78) |
1126(-23.56, +26.45) |
2372 |
13 |
chatgpt-4o-latest(2025-03-26) |
OpenAI |
51.56(-2.91, +2.56) |
1125(-20.26, +17.81) |
1111 |
14 |
deepseek-r1 |
Deepseek |
51.08(-2.74, +3.38) |
1121(-19.01, +23.59) |
1218 |
15 |
claude-3.7-sonnet (baseline) |
Anthropic |
50.0(-0.00, +0.00) |
1114(-0.00, +0.00) |
1094 |
16 |
o3-mini-high |
OpenAI |
49.71(-2.40, +2.49) |
1112(-16.71, +17.33) |
1265 |
17 |
o1-high |
OpenAI |
49.4(-2.47, +2.49) |
1110(-17.20, +17.26) |
1512 |
18 |
gpt-4.5-preview |
OpenAI |
48.67(-2.77, +3.27) |
1105(-19.32, +22.75) |
1040 |
19 |
o1-medium |
OpenAI |
47.27(-2.68, +2.77) |
1095(-18.73, +19.28) |
1490 |
20 |
gpt-4.1-mini-2025-04-14 |
OpenAI |
47.26(-2.15, +1.99) |
1095(-15.02, +13.86) |
1080 |
21 |
o1-low |
OpenAI |
41.66(-2.28, +2.50) |
1056(-16.42, +17.72) |
1513 |
22 |
o3-mini-medium |
OpenAI |
40.43(-1.95, +3.17) |
1047(-14.20, +22.60) |
1221 |
23 |
claude-3.5-sonnet |
Anthropic |
38.98(-4.10, +2.27) |
1036(-30.60, +16.40) |
682 |
24 |
gemini-2.5-flash-preview(thinking) |
Google |
38.81(-3.05, +3.37) |
1035(-22.64, +24.33) |
2521 |
25 |
grok-3-beta |
xAI |
38.28(-2.55, +2.52) |
1031(-19.02, +18.28) |
1824 |
26 |
command-a |
Cohere |
35.01(-2.70, +1.77) |
1007(-20.97, +13.41) |
1083 |
27 |
o3-mini-low |
OpenAI |
34.46(-2.13, +2.30) |
1002(-16.62, +17.49) |
1205 |
28 |
qwen-max |
Alibaba |
32.56(-1.53, +1.83) |
987(-12.26, +14.29) |
1420 |
29 |
gpt-4o-2024-11-20 |
OpenAI |
31.47(-2.25, +1.77) |
979(-18.47, +14.10) |
1142 |
30 |
qwq-32b |
Alibaba |
30.11(-2.26, +2.60) |
968(-19.08, +20.96) |
1517 |
31 |
gemini-2.0-flash-thinking-exp-01-21 |
Google |
29.53(-1.90, +2.11) |
963(-16.15, +17.27) |
2497 |
32 |
grok-3-mini-beta |
xAI |
28.31(-2.31, +2.15) |
953(-20.30, +18.02) |
2010 |
33 |
claude-3.5-haiku |
Anthropic |
27.14(-3.03, +3.17) |
942(-27.62, +26.91) |
601 |
34 |
qwen-plus |
Alibaba |
27.05(-1.89, +1.75) |
942(-17.09, +15.09) |
1346 |
35 |
claude-3.5-sonnet-20240620 |
Anthropic |
25.38(-2.80, +3.17) |
927(-26.66, +27.98) |
628 |
36 |
gemini-2.5-flash-preview |
Google |
24.71(-2.01, +1.86) |
920(-19.29, +16.97) |
2299 |
37 |
gemini-2.0-flash-001 (judge) |
Google |
24.22(-2.07, +2.05) |
916(-20.21, +18.86) |
1901 |
38 |
deepseek-v3 |
Deepseek |
23.49(-1.86, +1.80) |
909(-18.52, +17.00) |
1007 |
39 |
gemma-3-27b-it |
Google |
20.9(-1.68, +1.97) |
883(-18.25, +20.04) |
1655 |
40 |
gpt-4.1-nano-2025-04-14 |
OpenAI |
20.63(-1.88, +2.25) |
880(-20.69, +22.94) |
938 |
41 |
gemini-2.0-flash-lite-001 |
Google |
18.66(-1.80, +1.78) |
858(-21.51, +19.68) |
2196 |
42 |
grok-2-1212 |
xAI |
14.09(-1.66, +1.77) |
800(-25.16, +24.11) |
898 |
43 |
jamba-1.6-large |
AI21 |
12.99(-1.77, +1.18) |
784(-28.99, +17.40) |
863 |
44 |
nova-pro-v1 |
Amazon |
12.36(-1.09, +1.47) |
774(-18.15, +22.49) |
905 |
45 |
gpt-4o-mini (judge) |
OpenAI |
11.98(-1.33, +1.44) |
768(-23.18, +22.51) |
830 |
46 |
qwen-2.5-72b-instruct |
Alibaba |
11.53(-1.45, +1.64) |
760(-26.07, +26.33) |
1097 |
47 |
llama-4-maverick |
Meta |
11.52(-1.16, +1.48) |
760(-20.69, +24.01) |
920 |
48 |
mistral-large-2411 |
Mistral |
10.65(-1.30, +1.08) |
745(-25.14, +18.87) |
906 |
49 |
minimax-01 |
MiniMax |
10.55(-1.13, +1.03) |
743(-21.80, +18.30) |
1071 |
50 |
gpt-4-1106-preview |
OpenAI |
9.76(-1.28, +1.38) |
728(-26.79, +25.63) |
840 |
51 |
gemma-2-27b-it |
Google |
9.72(-1.13, +1.39) |
727(-23.62, +25.94) |
794 |
52 |
qwen2.5-32b-instruct |
Alibaba |
7.02(-1.06, +1.37) |
665(-30.60, +33.46) |
795 |
53 |
command-r-plus-08-2024 |
Cohere |
6.73(-0.88, +1.02) |
657(-25.90, +26.54) |
969 |
54 |
hermes-3-llama-3.1-405b |
NousResearch |
6.53(-1.06, +1.22) |
652(-32.77, +32.00) |
771 |
55 |
mistral-small-3.1-24b-instruct-2503 |
Mistral |
6.32(-0.79, +0.75) |
646(-24.73, +20.67) |
961 |
56 |
qwen-turbo |
Alibaba |
6.19(-1.15, +1.22) |
642(-37.88, +33.58) |
623 |
57 |
llama-4-scout |
Meta |
6.13(-0.84, +0.86) |
640(-27.22, +24.58) |
844 |
58 |
nova-lite-v1 |
Amazon |
5.78(-0.88, +1.20) |
629(-30.37, +34.97) |
994 |
59 |
lfm-7b |
Liquid AI |
5.53(-0.57, +1.01) |
621(-20.06, +30.97) |
1011 |
60 |
wizardlm-2-8x22b |
Microsoft |
5.37(-0.69, +0.79) |
616(-25.25, +25.45) |
1028 |
61 |
hermes-3-llama-3.1-70b |
NousResearch |
5.16(-0.82, +0.99) |
608(-31.64, +32.20) |
771 |
62 |
mistral-small-24b-instruct-2501 |
Mistral |
4.94(-0.62, +0.76) |
600(-24.30, +26.43) |
998 |
63 |
gemma-2-9b-it |
Google |
4.85(-0.86, +0.76) |
597(-35.46, +26.72) |
751 |
64 |
command-r-08-2024 |
Cohere |
3.82(-0.82, +0.78) |
554(-43.66, +33.84) |
796 |
65 |
llama-3.3-70b-instruct |
Meta |
3.75(-0.80, +1.03) |
550(-43.09, +44.22) |
809 |
66 |
nova-micro-v1 |
Amazon |
3.74(-0.59, +0.79) |
550(-30.98, +34.77) |
927 |
67 |
lfm-40b |
Liquid AI |
3.33(-0.75, +0.69) |
529(-45.68, +33.98) |
863 |
68 |
command-r7b-12-2024 |
Cohere |
2.84(-0.53, +0.52) |
500(-37.24, +29.86) |
882 |
69 |
lfm-3b |
Liquid AI |
2.19(-0.47, +0.41) |
454(-42.91, +30.36) |
776 |
70 |
qwen-2.5-7b-instruct |
Alibaba |
1.75(-0.41, +0.44) |
414(-46.96, +39.63) |
844 |
71 |
olmo-2-0325-32b-instruct |
Allen AI |
1.72(-0.39, +0.55) |
411(-45.62, +49.18) |
890 |
72 |
jamba-1.6-mini |
AI21 |
1.51(-0.35, +0.51) |
388(-45.78, +52.05) |
920 |
73 |
llama-3.1-nemotron-70b-instruct |
Meta |
1.18(-0.23, +0.26) |
344(-37.68, +35.10) |
1854 |
74 |
llama-3.1-405b-instruct |
Meta |
0.53(-0.16, +0.18) |
205(-63.82, +51.62) |
1735 |
75 |
llama-3.1-70b-instruct |
Meta |
0.25(-0.09, +0.07) |
72(-72.08, +45.43) |
1923 |
76 |
llama-3.1-8b-instruct |
Meta |
0.04(-0.02, +0.01) |
-268(-95.65, +52.71) |
5081 |