🏆 LLM4SE Leaderboard

Community-Driven Evaluation of Top Large Language Models (LLMs) in Software Engineering (SE) Tasks

The SWE-Chatbot-Arena is an open-source platform designed to evaluate LLMs through human preference, fostering transparency and collaboration. This platform aims to empower the SE community to assess and compare the performance of leading LLMs in related tasks. For technical details, check out our paper.

{
  "headers": [
    "Rank",
    "Model",
    "Organization",
    "Elo Score",
    "Win Rate",
    "Conversation Efficiency Index",
    "Consistency Score",
    "Bradley-Terry Coefficient",
    "Eigenvector Centrality Value",
    "Newman Modularity Score",
    "PageRank Score"
  ],
  "data": [
    [1, "Claude 3.5 Sonnet", "Anthropic", 1005.97, 1, 1, null, 0, 1, null, 0.05],
    [2, "ERNIE 4.5 VL 28B A3B", "Baidu", 1003.98, 1, 1, null, 0, 0, null, 0.04],
    [3, "Llama 3.1 405B Instruct", "Meta", 1003.95, 1, 1, null, 0, 0, null, 0.04],
    [4, "o3 Deep Research", "OpenAI", 1002, 1, 1, null, 0, 0, null, 0.03],
    [4, "Grok 3 Mini", "xAI", 1002, 1, 1, null, 0, 0, null, 0.03],
    [4, "Hermes 2 Pro - Llama-3 8B", "NousResearch", 1002, 1, 1, null, 0, 0, null, 0.03],
    [4, "Mistral 7B Instruct", "Mistral", 1002, 1, 1, null, 0, 0, null, 0.03],
    [4, "GLM 4.5 Air", "Z.AI", 1002, 1, 1, null, 0, 0, null, 0.03],
    [4, "Ministral 3 3B 2512", "Mistral", 1002, 1, 1, null, 0, 0, null, 0.03],
    [4, "Gemini 2.0 Flash Lite", "Google", 1002, 1, 1, null, 0, 0, null, 0.03],
    [4, "DeepSeek V3.1", "DeepSeek", 1002, 1, 1, null, 0, 0, null, 0.03],
    [12, "o3 Mini", "OpenAI", 1001.99, 1, 1, null, 0, 0, null, 0.03],
    [13, "Mistral Large", "Mistral", 1001.98, 1, 0.72, null, 0, 0, null, 0.03],
    [14, "GPT-4 Turbo", "OpenAI", 999.98, 1, -0.33, null, 0, 0, null, 0.03],
    [14, "gpt-oss-120b", "OpenAI", 999.98, 0.5, 1, 1, 0, 0, 0, 0.02],
    [16, "Gemma 3 27B", "Google", 998.02, 0.33, 0.3, null, 0, 0, 0, 0.02],
    [16, "GPT-4o-mini", "OpenAI", 998.02, 0.33, -1, null, 0, 0, 0, 0.02],
    [18, "Gemini 1.5 Flash", "Google", 998.01, 0.25, -0.6, null, 0, 0, 0, 0.02],
    [18, "o4 Mini", "OpenAI", 998.01, 0, 0.3, null, 0, 0, 0, 0.02],
    [18, "GPT-4o", "OpenAI", 998.01, 0, -0.2, null, 0, 0, 0, 0.02],
    [21, "o1-mini", "OpenAI", 998, 0, 0.3, null, 0, 0, 0, 0.02],
    [21, "DeepSeek R1 Zero", "DeepSeek", 998, 0, -1, null, 0, 0, 0, 0.02],
    [21, "Gemini 2.5 Pro Preview 06-05", "Google", 998, 0, -1, null, 0, 0, 0, 0.02],
    [21, "Qwen3 30B A3B", "Qwen", 998, 0, -1, null, 0, 0, 0, 0.02],
    [21, "o1", "OpenAI", 998, 0, -1, null, 0, 0, 0, 0.02],
    [21, "GPT-5 Mini", "OpenAI", 998, 0, -1, null, 0, 0, 0, 0.02],
    [21, "Qwen3 235B A22B", "Qwen", 998, 0, -1, null, 0, 0, 0, 0.02],
    [21, "Gemma 2 27B", "Google", 998, 0, -1, null, 0, 0, 0, 0.02],
    [29, "Grok Beta", "xAI", 997.99, 0, 0.3, null, 0, 0, 0, 0.02],
    [30, "Llama 3 Euryale 70B v2.1", "Sao10k", 996.02, 0, -1, null, 0, 0, 0, 0.02],
    [31, "GPT-4.1 Mini", "OpenAI", 996.01, 0, -1, null, 0, 0, 0, 0.02],
    [31, "Claude 3.7 Sonnet", "Anthropic", 996.01, 0, -0.13, null, 0, 0, 0, 0.02],
    [33, "Claude 3.7 Sonnet (thinking)", "Anthropic", 996, 0, -0.43, null, 0, 0, 0, 0.02],
    [33, "Gemini 1.5 Pro", "Google", 996, 0, null, 1, 0, 0, 0, 0.02],
    [35, "GPT-3.5 Turbo", "OpenAI", 988.14, 0, -0.63, 0.33, 0, 0, 0, 0.02]
  ],
  "metadata": null
}

Made with ❤️ for SWE-Chatbot-Arena. If this work is useful to you, please consider citing our vision paper:

@inproceedings{zhao2025se,
title={SE Arena: An Interactive Platform for Evaluating Foundation Models in Software Engineering},
author={Zhao, Zhimin},
booktitle={2025 IEEE/ACM Second International Conference on AI Foundation Models and Software Engineering (Forge)},
pages={78--81},
year={2025},
organization={IEEE}
}