🥇 |
claude-3.7-sonnet (baseline) |
Anthropic |
50.0(-0.00, +0.00) |
1114(-0.00, +0.00) |
1094 |
🥈 |
deepseek-chat-v3-0324 |
Deepseek |
49.73(-2.04, +2.04) |
1112(-14.22, +14.19) |
1085 |
🥉 |
gemini-2.0-pro-exp-02-05 |
Google |
47.14(-2.65, +3.45) |
1094(-18.52, +23.98) |
2372 |
4 |
gemini-2.5-pro-exp-03-25 |
Google |
47.06(-2.84, +2.83) |
1094(-19.83, +19.74) |
2577 |
5 |
chatgpt-4o-latest(2025-03-26) |
OpenAI |
46.37(-1.83, +2.19) |
1089(-12.81, +15.21) |
1111 |
6 |
deepseek-r1 |
Deepseek |
44.26(-2.70, +2.14) |
1074(-19.21, +14.95) |
1211 |
7 |
o1-high |
OpenAI |
42.54(-2.26, +1.96) |
1062(-16.20, +13.83) |
1503 |
8 |
o3-mini-high |
OpenAI |
41.64(-2.20, +2.17) |
1055(-15.84, +15.43) |
1257 |
9 |
o1-medium |
OpenAI |
40.7(-2.08, +2.18) |
1049(-15.10, +15.57) |
1487 |
10 |
gpt-4.5-preview |
OpenAI |
40.64(-2.00, +2.43) |
1048(-14.50, +17.39) |
1040 |
11 |
claude-3.5-sonnet |
Anthropic |
37.06(-2.83, +2.60) |
1022(-21.42, +19.11) |
682 |
12 |
o1-low |
OpenAI |
36.9(-1.84, +1.92) |
1021(-13.89, +14.19) |
1513 |
13 |
o3-mini-medium |
OpenAI |
34.86(-2.30, +1.86) |
1005(-17.94, +14.04) |
1221 |
14 |
command-a |
Cohere |
34.53(-2.39, +1.73) |
1003(-18.67, +13.14) |
1083 |
15 |
gemini-2.0-flash-thinking-exp-01-21 |
Google |
32.09(-2.33, +2.59) |
984(-18.88, +20.26) |
2488 |
16 |
o3-mini-low |
OpenAI |
31.98(-2.01, +1.68) |
983(-16.32, +13.21) |
1205 |
17 |
gpt-4o-2024-11-20 |
OpenAI |
30.74(-1.51, +2.02) |
973(-12.45, +16.25) |
1142 |
18 |
claude-3.5-haiku |
Anthropic |
28.54(-2.68, +2.84) |
955(-23.47, +23.54) |
601 |
19 |
gemini-2.0-flash-001 (judge) |
Google |
27.4(-1.79, +2.23) |
945(-15.97, +19.00) |
1901 |
20 |
gemma-3-27b-it |
Google |
26.33(-2.03, +1.77) |
935(-18.68, +15.51) |
1654 |
21 |
gemini-2.0-flash-lite-001 |
Google |
24.7(-2.11, +1.41) |
920(-20.32, +12.92) |
2196 |
22 |
deepseek-v3 |
Deepseek |
23.75(-2.00, +1.75) |
911(-19.80, +16.38) |
1007 |
23 |
claude-3.5-sonnet-20240620 |
Anthropic |
23.38(-1.95, +2.27) |
908(-19.49, +21.37) |
628 |
24 |
jamba-1.6-large |
AI21 |
17.63(-1.54, +1.84) |
846(-19.10, +21.16) |
859 |
25 |
grok-2-1212 |
xAI |
17.08(-1.78, +1.79) |
840(-22.85, +21.15) |
898 |
26 |
minimax-01 |
MiniMax |
16.76(-2.08, +2.50) |
836(-27.33, +29.42) |
370 |
27 |
gpt-4o-mini (judge) |
OpenAI |
16.68(-1.44, +1.55) |
835(-18.75, +18.65) |
830 |
28 |
nova-pro-v1 |
Amazon |
16.34(-1.26, +1.32) |
830(-16.58, +16.32) |
905 |
29 |
qwen-2.5-72b-instruct |
Alibaba |
16.13(-1.77, +1.40) |
828(-23.80, +17.47) |
1097 |
30 |
mistral-large-2411 |
Mistral |
14.6(-1.34, +1.25) |
807(-19.40, +16.85) |
906 |
31 |
gemma-2-27b-it |
Google |
12.91(-1.20, +1.45) |
782(-19.30, +21.38) |
794 |
32 |
gpt-4-1106-preview |
OpenAI |
12.54(-1.06, +1.36) |
777(-17.48, +20.62) |
840 |
33 |
mistral-small-3.1-24b-instruct-2503 |
Mistral |
10.74(-1.04, +1.31) |
746(-19.64, +22.54) |
961 |
34 |
command-r-plus-08-2024 |
Cohere |
10.11(-0.96, +1.04) |
735(-19.31, +19.00) |
969 |
35 |
nova-lite-v1 |
Amazon |
9.72(-1.08, +1.02) |
727(-22.57, +19.22) |
994 |
36 |
wizardlm-2-8x22b |
Microsoft |
9.49(-1.11, +0.97) |
722(-23.82, +18.60) |
1028 |
37 |
lfm-7b |
Liquid AI |
9.27(-1.07, +1.05) |
718(-23.20, +20.70) |
1011 |
38 |
qwen2.5-32b-instruct |
Alibaba |
8.85(-1.08, +1.50) |
709(-24.68, +30.05) |
795 |
39 |
mistral-small-24b-instruct-2501 |
Mistral |
8.59(-1.16, +1.14) |
703(-27.51, +23.73) |
998 |
40 |
hermes-3-llama-3.1-405b |
NousResearch |
8.26(-1.25, +1.38) |
696(-30.94, +29.40) |
771 |
41 |
hermes-3-llama-3.1-70b |
NousResearch |
7.73(-1.08, +1.13) |
683(-28.11, +25.90) |
771 |
42 |
gemma-2-9b-it |
Google |
7.67(-1.16, +1.03) |
682(-30.54, +23.86) |
751 |
43 |
nova-micro-v1 |
Amazon |
6.62(-0.96, +0.88) |
654(-29.20, +23.30) |
927 |
44 |
command-r-08-2024 |
Cohere |
6.52(-0.90, +1.13) |
651(-27.34, +30.02) |
796 |
45 |
lfm-40b |
Liquid AI |
5.76(-0.87, +0.91) |
628(-29.75, +27.41) |
863 |
46 |
llama-3.3-70b-instruct |
Meta |
5.34(-1.02, +0.98) |
615(-38.51, +31.04) |
809 |
47 |
command-r7b-12-2024 |
Cohere |
4.95(-0.76, +0.89) |
601(-30.30, +30.26) |
882 |
48 |
lfm-3b |
Liquid AI |
3.57(-0.55, +0.62) |
542(-30.03, +28.64) |
776 |
49 |
qwen-2.5-7b-instruct |
Alibaba |
3.57(-0.68, +0.86) |
542(-38.11, +38.91) |
844 |
50 |
olmo-2-0325-32b-instruct |
Allen AI |
3.31(-0.49, +0.62) |
528(-28.93, +30.56) |
890 |
51 |
jamba-1.6-mini |
AI21 |
2.56(-0.51, +0.62) |
482(-39.77, +38.96) |
920 |
52 |
llama-3.1-nemotron-70b-instruct |
Meta |
2.29(-0.31, +0.42) |
462(-25.83, +30.06) |
1854 |
53 |
llama-3.1-405b-instruct |
Meta |
0.86(-0.19, +0.24) |
289(-44.54, +42.37) |
1735 |
54 |
llama-3.1-70b-instruct |
Meta |
0.52(-0.12, +0.13) |
200(-46.44, +40.36) |
1923 |
55 |
llama-3.1-8b-instruct |
Meta |
0.07(-0.02, +0.02) |
-141(-54.75, +41.44) |
5081 |