{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"mt-bench","slug":"mt-bench","name":"MT-Bench","type":"benchmark","url":"https://github.com/lm-sys/FastChat","page_url":"https://unfragile.ai/mt-bench","categories":["testing-quality"],"tags":[],"pricing":{"model":"free","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"mt-bench__cap_0","uri":"capability://data.processing.analysis.multi.turn.conversation.quality.evaluation.with.gpt.4.judging","name":"multi-turn conversation quality evaluation with gpt-4 judging","description":"MT-Bench evaluates LLM responses across 80 curated multi-turn questions using GPT-4 as an automated judge. The system submits model responses to GPT-4 with structured prompts that assess instruction following, reasoning coherence, and conversation consistency across turns. Responses are scored on a numeric scale, enabling quantitative comparison of model capabilities without human annotation overhead.","intents":["Compare performance of different LLM models on complex multi-turn reasoning tasks","Identify which models best handle instruction following across conversation context","Generate reproducible benchmark scores for LLM leaderboard rankings","Evaluate how well models maintain coherence and context in extended conversations"],"best_for":["LLM researchers benchmarking model families (Llama, Mistral, GPT variants)","Teams building Chatbot Arena or similar competitive evaluation platforms","Organizations selecting production LLMs based on multi-turn capability metrics"],"limitations":["GPT-4 judge introduces cost (~$0.03-0.06 per evaluation) and dependency on OpenAI API availability","Judge bias: GPT-4 may favor models with similar reasoning patterns to its own training","No human validation layer — automated scoring can miss nuanced quality differences","Fixed question set limits evaluation to 8 predefined categories; custom domains require new question curation"],"requires":["OpenAI API key with GPT-4 access","Python 3.8+","FastChat framework installed (pip install fschat)","Model inference endpoint (local or remote) for candidate models"],"input_types":["text (multi-turn conversation history)","structured JSON (question + model responses)"],"output_types":["numeric scores (typically 1-10 scale)","structured JSON with per-turn judgments","aggregated leaderboard rankings"],"categories":["data-processing-analysis","evaluation-benchmarking"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"mt-bench__cap_1","uri":"capability://data.processing.analysis.question.answer.pair.dataset.curation.and.versioning","name":"question-answer pair dataset curation and versioning","description":"MT-Bench maintains a curated set of 80 high-quality multi-turn questions across 8 semantic categories (writing, roleplay, extraction, reasoning, math, coding, knowledge, common-sense). Questions are stored as structured JSON with turn-by-turn prompts, enabling reproducible evaluation. The dataset is version-controlled in the FastChat repository, allowing tracking of changes and ensuring consistent benchmark definitions across research papers.","intents":["Access a standardized, reproducible set of multi-turn questions for fair model comparison","Understand the semantic distribution of evaluation questions across different capability domains","Extend MT-Bench with new questions while maintaining backward compatibility","Audit which question categories expose model weaknesses"],"best_for":["Researchers publishing LLM evaluation papers requiring standardized benchmarks","Teams building internal LLM leaderboards that need consistent question sets","Model developers analyzing performance breakdown by question category"],"limitations":["Fixed 80-question set may not cover domain-specific tasks (medical, legal, scientific)","English-only questions; multilingual evaluation requires separate benchmark","Question difficulty is not uniformly distributed — some categories have harder questions than others","No automatic detection of question drift or contamination in model training data"],"requires":["FastChat repository cloned locally or accessed via GitHub API","Python 3.8+ for parsing JSON question files","No external API required for question access"],"input_types":["JSON files (question definitions)","text (question category labels)"],"output_types":["structured JSON (question + turn structure)","text (question text for display)","category labels (for filtering/analysis)"],"categories":["data-processing-analysis","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"mt-bench__cap_2","uri":"capability://automation.workflow.batch.evaluation.orchestration.with.distributed.model.inference","name":"batch evaluation orchestration with distributed model inference","description":"MT-Bench integrates with FastChat's distributed serving infrastructure to evaluate multiple models in parallel. The evaluation pipeline submits each question to candidate models via the FastChat controller (which routes to model workers), collects responses, and batches them for GPT-4 judging. This architecture enables evaluating 70+ models without sequential bottlenecks, leveraging the controller-worker pattern for load distribution.","intents":["Evaluate 10+ LLM models simultaneously without sequential inference delays","Scale benchmark evaluation to hundreds of model variants (different quantizations, LoRA adapters)","Monitor inference latency and error rates across models during evaluation","Reuse existing FastChat serving infrastructure for benchmark runs"],"best_for":["Teams running Chatbot Arena with 70+ models requiring daily/weekly evaluations","Organizations with distributed GPU clusters wanting to parallelize benchmark runs","Researchers comparing model families (Llama 7B/13B/70B) efficiently"],"limitations":["Requires FastChat controller and worker infrastructure setup — not a standalone tool","GPU memory constraints limit concurrent model inference; typically 2-4 models per GPU","No built-in fault tolerance — worker crashes require manual restart and re-evaluation of failed questions","Evaluation time scales linearly with number of models (e.g., 70 models × 80 questions × 2 turns ≈ 11,200 API calls to GPT-4)"],"requires":["FastChat framework with controller and model workers running","Python 3.8+","OpenAI API key for GPT-4 judging","GPU(s) for model inference (CPU inference possible but slow)","Network connectivity between controller and workers"],"input_types":["model names/endpoints (registered with FastChat controller)","question JSON files","evaluation configuration (timeout, batch size)"],"output_types":["structured JSON (model responses per question)","numeric scores (GPT-4 judgments)","CSV/JSON leaderboard rankings"],"categories":["automation-workflow","tool-use-integration"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"mt-bench__cap_3","uri":"capability://data.processing.analysis.leaderboard.ranking.and.elo.rating.calculation","name":"leaderboard ranking and elo rating calculation","description":"MT-Bench scores feed into LMSYS's Elo rating system, which computes relative model strength based on pairwise comparison results. The Elo algorithm treats benchmark scores as implicit pairwise wins/losses, updating model ratings iteratively. Leaderboard rankings are published on lmarena.ai and updated weekly, providing a public-facing metric for model comparison that accounts for both absolute performance and relative positioning.","intents":["Rank LLM models on a single numeric scale (Elo rating) for easy comparison","Track how model performance changes over time as new versions are released","Identify which models are statistically equivalent (overlapping Elo confidence intervals)","Provide transparent, reproducible rankings for model selection decisions"],"best_for":["Model developers tracking their model's competitive position","Organizations selecting production LLMs based on published benchmarks","Researchers analyzing trends in LLM capability evolution"],"limitations":["Elo ratings are relative, not absolute — a model's rating depends on which other models are evaluated","MT-Bench scores alone don't capture domain-specific performance (medical, legal, code-heavy tasks)","Elo assumes transitivity (if A > B and B > C, then A > C), which may not hold for LLMs","Weekly updates introduce lag; real-time ranking changes are not reflected immediately"],"requires":["MT-Bench evaluation results (numeric scores per model)","Elo rating calculation library (e.g., chess-elo or custom implementation)","Historical leaderboard data for trend analysis"],"input_types":["numeric scores (per-model MT-Bench results)","model metadata (name, release date, organization)"],"output_types":["Elo ratings (numeric, typically 1000-2000 range)","confidence intervals (uncertainty bounds)","leaderboard rankings (ordered list with metadata)"],"categories":["data-processing-analysis","planning-reasoning"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"mt-bench__cap_4","uri":"capability://data.processing.analysis.conversation.template.application.for.model.specific.prompt.formatting","name":"conversation template application for model-specific prompt formatting","description":"MT-Bench questions are formatted according to model-specific conversation templates (defined in FastChat's conversation.py) before submission to each model. Templates handle differences in prompt structure, special tokens, and role markers (e.g., Llama uses [INST], ChatGLM uses different role tags). This ensures that each model receives questions in its native format, preventing unfair evaluation due to prompt formatting mismatches.","intents":["Ensure fair evaluation by formatting questions consistently with each model's training format","Avoid penalizing models for prompt formatting differences rather than capability differences","Support evaluation of models with diverse architectures (Llama, GPT, ChatGLM, Falcon) without manual prompt engineering","Maintain reproducibility by documenting exact prompt formats used for each model"],"best_for":["Researchers comparing models from different families (Llama vs Mistral vs Qwen)","Teams building multi-model evaluation pipelines requiring consistent formatting","Model developers validating that their model's prompt format is correctly handled"],"limitations":["Template mismatch: if a model's prompt format changes (e.g., new version), templates must be updated manually","No automatic detection of optimal prompt format — templates are hand-coded by LMSYS","Custom models without defined templates require manual template creation","Template differences can introduce subtle biases (e.g., some formats may be more verbose, affecting token count)"],"requires":["FastChat framework with conversation templates defined","Model name/identifier that maps to a template in fastchat/conversation.py","Python 3.8+"],"input_types":["raw question text","model identifier (e.g., 'llama-2-7b', 'gpt-4')","conversation history (for multi-turn)"],"output_types":["formatted prompt string (ready for model inference)","token count (for monitoring prompt length)","template metadata (role markers, special tokens used)"],"categories":["data-processing-analysis","tool-use-integration"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"mt-bench__cap_5","uri":"capability://data.processing.analysis.response.collection.and.storage.with.turn.level.granularity","name":"response collection and storage with turn-level granularity","description":"MT-Bench collects model responses at the turn level (not just final responses) and stores them in structured JSON format. Each turn's response is timestamped, includes metadata (model name, inference time, token count), and is linked to the corresponding question turn. This enables post-hoc analysis of how models handle multi-turn context and allows re-judging with different judges without re-running inference.","intents":["Analyze how model responses change across turns in a conversation","Re-evaluate responses with different judges (e.g., Claude instead of GPT-4) without re-running inference","Debug model failures by examining exact responses and inference metadata","Compute per-turn metrics (e.g., average response length, latency by turn)"],"best_for":["Researchers analyzing multi-turn reasoning patterns","Teams validating benchmark results with alternative judges","Model developers debugging inference issues during evaluation"],"limitations":["Storage overhead: 70 models × 80 questions × 2-3 turns × ~500 tokens per response ≈ 8-12 GB of JSON","No built-in deduplication — identical responses from different models are stored separately","Response storage is immutable; corrections require re-running evaluation","No compression; raw JSON can be slow to query for large-scale analysis"],"requires":["Disk storage (10+ GB for full evaluation of 70+ models)","Python 3.8+ for JSON parsing","Optional: database (SQLite, PostgreSQL) for efficient querying"],"input_types":["model responses (text)","inference metadata (latency, token count, model name)"],"output_types":["JSON files (per-model response collections)","structured records (turn_id, model_id, response_text, timestamp, metrics)"],"categories":["data-processing-analysis","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"mt-bench__cap_6","uri":"capability://safety.moderation.gpt.4.judge.prompt.engineering.and.consistency.validation","name":"gpt-4 judge prompt engineering and consistency validation","description":"MT-Bench uses carefully engineered prompts to instruct GPT-4 to evaluate responses on dimensions like instruction following, reasoning, and coherence. The judge prompt includes examples of good/bad responses and explicit scoring rubrics to reduce variance. Consistency is validated by re-judging a subset of responses and computing inter-judge agreement (e.g., Spearman correlation between first and second judgments).","intents":["Ensure GPT-4 judge produces consistent, reproducible scores across evaluation runs","Identify and mitigate judge bias toward certain models or response styles","Validate that judge scores correlate with human preferences (via Chatbot Arena human votes)","Improve judge reliability by iterating on prompt engineering"],"best_for":["Teams building automated evaluation systems requiring judge validation","Researchers analyzing bias in LLM-based evaluation","Organizations needing to justify benchmark scores to stakeholders"],"limitations":["Judge consistency is not guaranteed; GPT-4 can produce different scores for identical inputs due to temperature/randomness","Prompt engineering is manual and requires domain expertise; no automatic optimization","Judge bias: GPT-4 may favor models with similar reasoning patterns or writing style","Validation requires re-judging, which doubles evaluation cost (~$0.06-0.12 per response)"],"requires":["OpenAI API key with GPT-4 access","Carefully crafted judge prompts (provided by LMSYS)","Subset of responses for consistency validation (typically 10-20% of total)"],"input_types":["model response text","original question","judge prompt template"],"output_types":["numeric score (1-10 or similar scale)","judge explanation (optional, for transparency)","consistency metrics (correlation between re-judgments)"],"categories":["safety-moderation","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"mt-bench__cap_7","uri":"capability://data.processing.analysis.correlation.analysis.between.benchmark.scores.and.human.preferences","name":"correlation analysis between benchmark scores and human preferences","description":"MT-Bench scores are validated against human preferences collected via Chatbot Arena (side-by-side model battles). The system computes correlation metrics (Spearman, Kendall) between MT-Bench rankings and Chatbot Arena Elo ratings, validating that the automated benchmark aligns with human judgment. This validation is critical for establishing benchmark credibility and identifying cases where the benchmark may be misaligned with real-world preferences.","intents":["Validate that MT-Bench scores correlate with human preferences (via Chatbot Arena)","Identify models where benchmark scores diverge from human judgment (potential benchmark bias)","Establish confidence in MT-Bench as a proxy for model quality","Detect when benchmark saturation occurs (all models score similarly, losing discriminative power)"],"best_for":["Researchers publishing benchmark papers requiring validation against human judgment","Teams building leaderboards needing to justify automated scores","Organizations evaluating whether benchmarks are predictive of real-world performance"],"limitations":["Requires large-scale human evaluation data (Chatbot Arena has 1.5M+ votes); not available for new benchmarks","Correlation is not causation — high correlation doesn't prove benchmark validity, only consistency","Human preferences may be biased (e.g., toward verbose responses, specific writing styles)","Correlation can degrade over time as model capabilities converge"],"requires":["MT-Bench scores for 20+ models","Chatbot Arena Elo ratings for the same models","Statistical analysis tools (scipy, numpy for correlation computation)"],"input_types":["MT-Bench rankings (numeric scores or Elo ratings)","Chatbot Arena Elo ratings","model identifiers (for matching)"],"output_types":["correlation coefficient (Spearman, Kendall, Pearson)","p-value (statistical significance)","scatter plots (visualization of correlation)","outlier analysis (models with divergent scores)"],"categories":["data-processing-analysis","planning-reasoning"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"mt-bench__cap_8","uri":"capability://data.processing.analysis.category.level.performance.breakdown.and.capability.analysis","name":"category-level performance breakdown and capability analysis","description":"MT-Bench questions are organized into 8 semantic categories (writing, roleplay, extraction, reasoning, math, coding, knowledge, common-sense), enabling per-category performance analysis. The evaluation pipeline computes separate scores for each category, revealing which models excel at specific capabilities and which have gaps. This breakdown is more informative than aggregate scores and helps identify model strengths/weaknesses.","intents":["Identify which models are best for specific tasks (e.g., coding vs. writing)","Detect capability gaps (e.g., a model strong in reasoning but weak in math)","Analyze how model performance varies across domains","Select models for specific use cases based on category-level performance"],"best_for":["Teams building multi-model systems and needing to route tasks to best-suited models","Researchers analyzing model capability profiles","Model developers understanding which capability areas need improvement"],"limitations":["Only 8 categories; fine-grained capability analysis (e.g., calculus vs. algebra) requires more granular categorization","Category definitions are subjective; some questions could fit multiple categories","Small sample size per category (10 questions each); results may be noisy","No weighting by difficulty; a model could score high by excelling at easy questions in a category"],"requires":["MT-Bench evaluation results with per-question category labels","Python 3.8+ for aggregation and analysis"],"input_types":["per-question scores","question category labels"],"output_types":["per-category average scores","per-category ranking (which models are best in each category)","heatmaps (model × category performance matrix)","capability profiles (radar charts showing model strengths/weaknesses)"],"categories":["data-processing-analysis","planning-reasoning"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"mt-bench__cap_9","uri":"capability://data.processing.analysis.benchmark.reproducibility.through.fixed.question.sets.and.seed.management","name":"benchmark reproducibility through fixed question sets and seed management","description":"MT-Bench ensures reproducibility by using a fixed, versioned set of 80 questions and managing random seeds for model inference (temperature, sampling parameters). The system records evaluation metadata (model version, inference parameters, GPT-4 model version, timestamp) enabling exact reproduction of results. Questions are publicly available, allowing external researchers to verify results or run independent evaluations.","intents":["Enable independent verification of benchmark results by external researchers","Reproduce exact evaluation results for model versions and GPT-4 versions","Track how evaluation methodology changes affect results","Publish benchmark results with sufficient detail for peer review"],"best_for":["Academic researchers publishing benchmark results","Organizations needing auditable evaluation records","Teams comparing results across different evaluation runs or environments"],"limitations":["Fixed question set becomes stale over time; new capabilities emerge that aren't tested","Reproducibility requires controlling many variables (model version, inference parameters, GPT-4 version)","GPT-4 behavior may change between API versions, affecting historical comparisons","Public question set enables overfitting during model training; questions lose discriminative power over time"],"requires":["Versioned question dataset (stored in repository)","Inference parameter recording (temperature, top_p, max_tokens, seed)","Model version tracking (model name, version, commit hash)","Evaluation metadata storage (timestamp, GPT-4 model version, evaluator identity)"],"input_types":["question dataset version (string)","model version (string or commit hash)","inference parameters (dict)"],"output_types":["evaluation results with full metadata (JSON)","reproducibility report (text with all parameters)"],"categories":["data-processing-analysis","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"mt-bench__headline","uri":"capability://testing.quality.multi.turn.conversation.benchmarking.tool","name":"multi-turn conversation benchmarking tool","description":"MT-Bench is a comprehensive benchmarking tool designed to evaluate multi-turn conversations in chatbots, focusing on reasoning, instruction following, and coherence using high-quality questions across various categories.","intents":["best multi-turn conversation benchmark","multi-turn conversation evaluation tool for chatbots","how to benchmark chatbot conversations","top tools for evaluating conversational AI","multi-turn reasoning assessment framework"],"best_for":["developers evaluating chatbot performance","researchers in conversational AI"],"limitations":[],"requires":[],"input_types":[],"output_types":[],"categories":["testing-quality"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":63,"verified":false,"data_access_risk":"high","permissions":["OpenAI API key with GPT-4 access","Python 3.8+","FastChat framework installed (pip install fschat)","Model inference endpoint (local or remote) for candidate models","FastChat repository cloned locally or accessed via GitHub API","Python 3.8+ for parsing JSON question files","No external API required for question access","FastChat framework with controller and model workers running","OpenAI API key for GPT-4 judging","GPU(s) for model inference (CPU inference possible but slow)"],"failure_modes":["GPT-4 judge introduces cost (~$0.03-0.06 per evaluation) and dependency on OpenAI API availability","Judge bias: GPT-4 may favor models with similar reasoning patterns to its own training","No human validation layer — automated scoring can miss nuanced quality differences","Fixed question set limits evaluation to 8 predefined categories; custom domains require new question curation","Fixed 80-question set may not cover domain-specific tasks (medical, legal, scientific)","English-only questions; multilingual evaluation requires separate benchmark","Question difficulty is not uniformly distributed — some categories have harder questions than others","No automatic detection of question drift or contamination in model training data","Requires FastChat controller and worker infrastructure setup — not a standalone tool","GPU memory constraints limit concurrent model inference; typically 2-4 models per GPU","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.7,"quality":0.9,"ecosystem":0.39999999999999997,"match_graph":0.25,"freshness":0.52,"weights":{"adoption":0.25,"quality":0.35,"ecosystem":0.15,"match_graph":0.2,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-06-17T09:51:04.693Z","last_scraped_at":null,"last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":null,"model_likes":null}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=mt-bench","compare_url":"https://unfragile.ai/compare?artifact=mt-bench"}},"signature":"Nu6HWlzEwi1EGq2Wlye7+HLqTtAee+2wTrBXkwpqmMG54JnELnlPn843WVJuQsgYTxcHGtsphPadGUCeWvqyBQ==","signedAt":"2026-06-21T07:33:08.534Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/mt-bench","artifact":"https://unfragile.ai/mt-bench","verify":"https://unfragile.ai/api/v1/verify?slug=mt-bench","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}