{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"lmsys-chatbot-arena","slug":"lmsys-chatbot-arena","name":"LMSYS Chatbot Arena","type":"benchmark","url":"https://chat.lmsys.org","page_url":"https://unfragile.ai/lmsys-chatbot-arena","categories":["testing-quality"],"tags":[],"pricing":{"model":"free","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"lmsys-chatbot-arena__cap_0","uri":"capability://safety.moderation.side.by.side.anonymous.model.comparison.interface","name":"side-by-side anonymous model comparison interface","description":"Presents two LLM responses to identical prompts in a split-screen UI without revealing model identities, enabling unbiased human preference judgments. Users interact with both models sequentially or simultaneously, then submit preference votes that feed into the rating system. The anonymization prevents brand bias and ensures evaluations reflect actual response quality rather than model reputation.","intents":["I want to compare how different LLMs handle the same prompt without knowing which is which","I need to evaluate model quality based purely on response merit, not brand recognition","I want to contribute to crowdsourced LLM benchmarking by voting on response quality"],"best_for":["LLM researchers validating model performance claims","AI practitioners comparing models before deployment","Community contributors interested in transparent model evaluation"],"limitations":["No control over prompt selection — users vote on whatever prompts the system serves","Voting is subjective and may reflect individual preference rather than objective quality","No mechanism to weight votes by evaluator expertise or domain knowledge","Latency depends on both model response times; slower model delays comparison"],"requires":["Web browser with JavaScript enabled","Internet connectivity to reach chat.lmsys.org","Two LLM endpoints available and responsive"],"input_types":["text prompts (user-generated or system-provided)"],"output_types":["text responses from two models","binary preference vote (model A vs model B)"],"categories":["safety-moderation","benchmark-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"lmsys-chatbot-arena__cap_1","uri":"capability://data.processing.analysis.elo.rating.system.for.dynamic.model.ranking","name":"elo rating system for dynamic model ranking","description":"Implements a modified Elo rating algorithm that updates model scores based on pairwise comparison outcomes from crowdsourced votes. Each vote is treated as a game result; when a model receives more votes than expected (based on current Elo), its rating increases proportionally. The system handles variable match counts, new models entering the arena, and convergence toward stable rankings as vote volume increases.","intents":["I need a statistically principled way to rank models based on crowdsourced preference votes","I want to see how model rankings change over time as new votes accumulate","I need to understand confidence intervals around model ratings given limited vote samples"],"best_for":["Benchmark maintainers tracking model performance trends","Researchers analyzing how community preferences evolve","Model developers monitoring their model's competitive standing"],"limitations":["Elo assumes transitive preferences (if A > B and B > C, then A > C), which may not hold for subjective quality judgments","Early-stage models with few votes have high rating volatility; confidence intervals widen with sparse data","Vote distribution may be skewed toward popular prompts or categories, biasing ratings","No explicit handling of ties or 'both equally good' votes — requires binary preference","Rating convergence is slow for models with similar strength; requires thousands of votes for stable rankings"],"requires":["Vote history with outcome labels (winner, loser, or tie)","Initial Elo baseline (typically 1000 for new models)","K-factor parameter controlling rating volatility (typically 32-64)"],"input_types":["structured vote records: {model_a, model_b, winner, timestamp}"],"output_types":["Elo ratings per model (numeric score)","ranking leaderboard (sorted by Elo)","confidence intervals or standard error estimates"],"categories":["data-processing-analysis","benchmark-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"lmsys-chatbot-arena__cap_10","uri":"capability://data.processing.analysis.cross.model.response.comparison.and.diff.visualization","name":"cross-model response comparison and diff visualization","description":"Generates side-by-side diffs or structured comparisons of responses from two models to highlight differences in content, structure, tone, and correctness. The system may use heuristics (length, keyword presence, code block detection) or more sophisticated analysis (semantic similarity, factual accuracy checking) to identify and highlight key differences. This helps evaluators quickly understand why one response might be better without reading both in full.","intents":["I want to quickly see the key differences between two model responses without reading both in full","I want to identify specific errors or omissions in one model's response compared to the other","I want to understand whether one model is more concise, more detailed, or more accurate"],"best_for":["Evaluators making quick judgments on response quality","Researchers analyzing systematic differences between models","Users with limited time to spend on each evaluation"],"limitations":["Diff generation is heuristic-based and may miss subtle differences or highlight irrelevant ones","Automated comparison (e.g., semantic similarity) may not align with human judgment of quality","Factual accuracy checking requires external knowledge bases or APIs, adding latency and cost","Diff visualization can introduce bias by highlighting certain differences over others","No ground truth for what constitutes a 'meaningful' difference; different users may prioritize different aspects"],"requires":["Two model responses to compare","Diff algorithm (text diff, semantic diff, or custom comparison logic)","Optional: fact-checking API or knowledge base"],"input_types":["two text responses from different models"],"output_types":["highlighted diffs (insertions, deletions, modifications)","comparison metrics (length, code blocks, citations, etc.)","factual accuracy indicators (if available)"],"categories":["data-processing-analysis","text-generation-language"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"lmsys-chatbot-arena__cap_11","uri":"capability://data.processing.analysis.user.preference.pattern.analysis.and.bias.detection","name":"user preference pattern analysis and bias detection","description":"Analyzes voting patterns to detect systematic biases in user preferences (e.g., preference for longer responses, certain writing styles, or specific model families). Uses statistical methods (e.g., logistic regression, clustering) to identify confounding factors that influence votes beyond actual response quality. Flags potential biases and adjusts rankings if necessary.","intents":["Understand what factors drive user preferences beyond response quality","Detect and mitigate systematic biases in crowdsourced evaluation","Improve ranking reliability by accounting for voter behavior patterns"],"best_for":["Benchmark maintainers ensuring ranking integrity","Researchers studying crowdsourced preference aggregation","Organizations understanding voter behavior"],"limitations":["Bias detection is correlational and does not prove causation","Adjusting rankings based on detected biases introduces subjective choices about which biases to correct","Some apparent biases may reflect genuine quality differences (e.g., longer responses may be higher quality)"],"requires":["Statistical analysis tools (e.g., logistic regression, clustering)","Response metadata (length, style, model family)"],"input_types":["votes with response metadata","voter behavior patterns"],"output_types":["bias analysis reports","bias-adjusted rankings","visualization of preference patterns"],"categories":["data-processing-analysis","safety-moderation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"lmsys-chatbot-arena__cap_2","uri":"capability://data.processing.analysis.category.specific.leaderboard.segmentation","name":"category-specific leaderboard segmentation","description":"Partitions the full vote dataset into domain-specific subsets (coding, math, writing, hard prompts, etc.) and computes separate Elo rankings for each category. This allows models to be ranked differently depending on task type — a model strong in coding may rank lower on creative writing. The system tracks which prompts belong to which categories (via tagging or keyword heuristics) and filters votes accordingly before computing category-specific ratings.","intents":["I want to know which model is best for coding tasks specifically, not overall","I need to compare models on math reasoning separately from general chat ability","I want to see if a model's strengths vary by domain (e.g., strong on code, weak on writing)"],"best_for":["Developers choosing models for specific use cases (e.g., code generation vs. content writing)","Researchers analyzing model capability profiles across domains","Model builders understanding where their model excels or needs improvement"],"limitations":["Category assignment is manual or heuristic-based; misclassified prompts skew category rankings","Vote volume per category is lower than overall, increasing rating volatility and confidence intervals","Overlap between categories (e.g., a prompt requiring both coding and math) creates ambiguity","New categories require manual definition and prompt tagging; no automatic category discovery","Models may be ranked in fewer categories if they receive fewer votes in those domains"],"requires":["Prompt-to-category mapping (manual labels or automated classification)","Minimum vote threshold per category to compute meaningful rankings","Category definitions and scope documentation"],"input_types":["vote records with associated prompt category labels"],"output_types":["per-category Elo leaderboards","category-specific model rankings","category coverage metrics (votes per category)"],"categories":["data-processing-analysis","benchmark-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"lmsys-chatbot-arena__cap_3","uri":"capability://data.processing.analysis.crowdsourced.prompt.collection.and.curation","name":"crowdsourced prompt collection and curation","description":"Accepts user-submitted prompts and stores them in a pool for serving to future evaluators. The system may apply basic filtering (spam, profanity, length constraints) and optionally curates high-quality prompts based on engagement metrics (votes received, prompt diversity). Prompts are sampled uniformly or weighted by category to ensure balanced evaluation across domains. This creates a continuously evolving benchmark dataset driven by community interest.","intents":["I want to submit a prompt I think is interesting for evaluating LLMs","I want to ensure the benchmark covers diverse prompt types, not just common queries","I want to see what kinds of prompts the community finds most valuable for model comparison"],"best_for":["Community members contributing to benchmark diversity","Researchers ensuring prompt coverage across domains","Benchmark maintainers monitoring prompt quality and relevance"],"limitations":["No expert review of submitted prompts; low-quality or adversarial prompts may enter the pool","Prompt distribution reflects user interest, not systematic coverage of capability space","Duplicate or near-duplicate prompts may accumulate without deduplication","No mechanism to retire outdated or biased prompts from the active pool","Prompt sampling may be biased toward recent submissions or popular categories"],"requires":["User account or anonymous submission capability","Prompt submission form with basic validation","Content moderation pipeline (automated or manual)"],"input_types":["text prompts (user-submitted, variable length and format)"],"output_types":["curated prompt pool","prompt metadata (category, submission date, vote count)","prompt sampling distribution"],"categories":["data-processing-analysis","benchmark-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"lmsys-chatbot-arena__cap_4","uri":"capability://text.generation.language.real.time.model.response.streaming.and.rendering","name":"real-time model response streaming and rendering","description":"Fetches responses from two LLM endpoints in parallel and streams tokens to the UI as they arrive, displaying them incrementally rather than waiting for full completion. This provides immediate feedback to users and reduces perceived latency. The system handles variable response speeds (one model may be faster than the other) and renders markdown, code blocks, and formatted text appropriately. Streaming is interrupted if the user submits a vote before both models finish.","intents":["I want to see model responses appear in real-time as they're generated, not wait for full completion","I want to compare response quality while both models are still generating, not after they finish","I want to vote early if one model is clearly better, without waiting for the slower model"],"best_for":["Evaluators who want responsive, interactive comparison experience","Benchmark operators minimizing perceived latency and user friction","Users with limited patience for slow model responses"],"limitations":["Streaming adds complexity to vote recording — must capture partial responses if user votes mid-generation","Network latency and buffering can cause uneven token arrival, making one model appear slower than it is","Markdown/code rendering must be incremental, which can cause visual jitter as formatting is applied","Streaming requires compatible model APIs (OpenAI, Anthropic, etc.); some models only support batch responses","Early voting on partial responses may introduce bias toward faster-responding models"],"requires":["Model APIs supporting streaming/SSE (Server-Sent Events) or WebSocket connections","Frontend capable of incremental DOM updates (React, Vue, etc.)","Robust error handling for interrupted streams"],"input_types":["prompt text","streaming token events from two model endpoints"],"output_types":["rendered text with markdown/code formatting","partial response snapshots at vote time"],"categories":["text-generation-language","tool-use-integration"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"lmsys-chatbot-arena__cap_5","uri":"capability://data.processing.analysis.vote.aggregation.and.statistical.confidence.estimation","name":"vote aggregation and statistical confidence estimation","description":"Collects individual preference votes and aggregates them to compute model rankings with confidence intervals or uncertainty estimates. The system tracks vote count per model pair, computes win rates, and estimates statistical significance of ranking differences. This allows distinguishing between 'model A is clearly better' (high confidence) vs. 'models are roughly equivalent' (low confidence). Confidence estimates inform which rankings are stable vs. provisional.","intents":["I want to know not just which model ranks higher, but how confident we should be in that ranking","I want to see if two models are statistically significantly different or just noise","I want to understand how many more votes are needed to stabilize a model's ranking"],"best_for":["Researchers interpreting benchmark results with appropriate uncertainty","Model developers understanding statistical significance of ranking changes","Benchmark users avoiding over-interpretation of small ranking differences"],"limitations":["Confidence intervals assume votes are independent and identically distributed, which may not hold if evaluators have correlated preferences","Vote distribution across model pairs is uneven; some pairs receive many votes, others few, leading to asymmetric confidence","No adjustment for multiple comparisons; ranking many models increases false positive rate","Confidence estimates don't account for evaluator expertise variation or potential bias","Statistical significance doesn't imply practical significance — a statistically significant difference may be negligible"],"requires":["Vote history with outcome labels","Minimum vote threshold to compute meaningful confidence intervals","Statistical method selection (binomial confidence intervals, Bayesian credible intervals, etc.)"],"input_types":["structured vote records: {model_a, model_b, winner, timestamp}"],"output_types":["win rates per model pair","confidence intervals or credible intervals","statistical significance tests (p-values)","ranking stability metrics"],"categories":["data-processing-analysis","benchmark-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"lmsys-chatbot-arena__cap_6","uri":"capability://text.generation.language.multi.turn.conversation.history.tracking","name":"multi-turn conversation history tracking","description":"Maintains conversation context across multiple exchanges within a single evaluation session. Users can ask follow-up questions to both models, and the system tracks the full conversation history for each model independently. This allows evaluating models on their ability to maintain context, handle clarifications, and build on previous responses. Vote submissions can reference specific turns or the overall conversation quality.","intents":["I want to evaluate how well models handle follow-up questions and maintain context","I want to see if a model can correct itself or improve when given clarification","I want to compare models on multi-turn reasoning tasks, not just single-shot responses"],"best_for":["Evaluators testing models on complex, multi-step reasoning","Researchers studying model behavior across conversation turns","Users comparing models on tasks requiring context maintenance"],"limitations":["Multi-turn conversations increase latency and cost (more tokens processed per evaluation)","Vote attribution becomes ambiguous — is the vote based on turn 1, turn 3, or overall conversation?","Conversation history grows unbounded; long conversations may exceed model context windows","No mechanism to compare models on identical follow-up sequences; users may ask different questions","Elo rating becomes harder to interpret when votes reflect multi-turn performance vs. single-shot capability"],"requires":["Stateful conversation management (session storage, context window tracking)","Model APIs supporting multi-turn chat format (messages array with role/content)","Conversation history persistence (database or session storage)"],"input_types":["initial prompt","follow-up user messages","model responses (streamed or batch)"],"output_types":["conversation history (full message transcript)","per-turn model responses","overall conversation quality vote"],"categories":["text-generation-language","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"lmsys-chatbot-arena__cap_7","uri":"capability://data.processing.analysis.user.behavior.analytics.and.engagement.tracking","name":"user behavior analytics and engagement tracking","description":"Logs user interactions (votes submitted, prompts evaluated, time spent, category preferences) and analyzes patterns to understand evaluator behavior and benchmark coverage. The system tracks metrics like vote consistency (do the same evaluators vote similarly on similar prompts?), category participation (which domains receive most votes?), and evaluator demographics (if available). This data informs prompt curation and identifies potential biases in the evaluation process.","intents":["I want to understand which categories of prompts are under-evaluated and need more votes","I want to identify if certain evaluators have systematic biases in their voting patterns","I want to see how user engagement varies by model, category, or time period"],"best_for":["Benchmark maintainers optimizing prompt coverage and vote distribution","Researchers studying crowdsourced evaluation quality and potential biases","Platform operators monitoring user engagement and retention"],"limitations":["Privacy concerns — tracking user behavior requires careful data handling and anonymization","Behavioral data may reveal evaluator identity or preferences, compromising anonymity","Engagement metrics don't directly measure evaluation quality; high engagement ≠ high-quality votes","Feedback loops — if the system preferentially serves popular categories, it reinforces existing biases","No ground truth to validate whether identified patterns reflect real biases or natural variation"],"requires":["User session tracking and logging infrastructure","Privacy-preserving analytics (anonymization, aggregation)","Consent and transparency about data collection"],"input_types":["user interaction logs: {user_id, action, timestamp, prompt_id, vote, category}"],"output_types":["engagement metrics (votes per user, category participation)","vote consistency scores","category coverage heatmaps","evaluator bias indicators"],"categories":["data-processing-analysis","benchmark-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"lmsys-chatbot-arena__cap_8","uri":"capability://data.processing.analysis.model.metadata.and.capability.tagging.system","name":"model metadata and capability tagging system","description":"Maintains structured metadata for each model in the arena (model name, organization, release date, parameter count, training data, known capabilities/limitations). Tags models with capability labels (e.g., 'multilingual', 'code-trained', 'instruction-tuned') to enable filtering and analysis. This metadata is displayed on leaderboards and used to contextualize rankings (e.g., comparing only open-source models, or models released in the same year).","intents":["I want to compare only open-source models, not proprietary ones","I want to see how model rankings correlate with parameter count or training data size","I want to filter leaderboards to show only models from a specific organization or release period"],"best_for":["Researchers analyzing how model properties (size, training approach) correlate with performance","Practitioners filtering models by licensing or deployment constraints","Benchmark users understanding context around model rankings"],"limitations":["Metadata is manually curated and may be incomplete or outdated as models are updated","Capability tags are subjective; different taggers may label the same model differently","Metadata doesn't capture all relevant properties (e.g., inference cost, latency, hardware requirements)","No standardized schema for model metadata; different models may have inconsistent attribute coverage","Filtering by metadata reduces vote volume per filtered subset, increasing ranking volatility"],"requires":["Model metadata database with schema definition","Manual curation process or automated metadata extraction","UI for filtering/sorting by metadata attributes"],"input_types":["model metadata: {name, organization, release_date, parameters, training_data, tags, capabilities}"],"output_types":["filtered leaderboards","metadata-based model groupings","correlation analysis (metadata vs. ranking)"],"categories":["data-processing-analysis","benchmark-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"lmsys-chatbot-arena__cap_9","uri":"capability://data.processing.analysis.temporal.ranking.evolution.and.trend.analysis","name":"temporal ranking evolution and trend analysis","description":"Tracks how model rankings change over time as new votes accumulate and new models enter the arena. The system stores historical snapshots of Elo ratings and generates trend visualizations showing ranking trajectories. This enables analysis of whether a model's performance is improving, declining, or stable, and how new model releases affect the competitive landscape. Trends are computed per category and overall.","intents":["I want to see if a model's ranking has improved or declined over the past month","I want to understand how a new model release affected the rankings of existing models","I want to identify models with improving vs. declining performance trends"],"best_for":["Model developers tracking their model's competitive position over time","Researchers analyzing how the LLM landscape evolves","Benchmark users understanding ranking stability and momentum"],"limitations":["Early trends are noisy due to low vote volume; ranking changes may reflect noise rather than real performance shifts","Trend analysis assumes consistent evaluation criteria, but prompt distribution and evaluator pool may change over time","New models entering the arena can cause ranking shifts for existing models (Elo inflation/deflation)","Seasonal or temporal biases in voting (e.g., more votes on weekends) can create spurious trends","No causal analysis — can't determine whether ranking changes are due to model updates, evaluator bias, or external factors"],"requires":["Historical Elo snapshots (daily or weekly)","Timestamp metadata for all votes and model releases","Time-series analysis and visualization tools"],"input_types":["historical Elo ratings with timestamps","model release dates and update logs"],"output_types":["ranking trajectory plots","trend indicators (improving, declining, stable)","ranking volatility metrics"],"categories":["data-processing-analysis","benchmark-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"lmsys-chatbot-arena__headline","uri":"capability://testing.quality.crowdsourced.llm.evaluation.platform","name":"crowdsourced llm evaluation platform","description":"A platform for evaluating large language models by allowing users to chat with and compare two models side-by-side, providing a trusted benchmark for LLM performance across various categories.","intents":["best LLM evaluation platform","LLM benchmarking for real-world applications","top crowdsourced LLM comparison tools","how to evaluate language models effectively","best platforms for LLM performance testing"],"best_for":["researchers","developers","AI enthusiasts"],"limitations":["requires internet access","limited to available models"],"requires":["user participation","internet connection"],"input_types":["text prompts"],"output_types":["model rankings","user feedback"],"categories":["testing-quality"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":62,"verified":false,"data_access_risk":"high","permissions":["Web browser with JavaScript enabled","Internet connectivity to reach chat.lmsys.org","Two LLM endpoints available and responsive","Vote history with outcome labels (winner, loser, or tie)","Initial Elo baseline (typically 1000 for new models)","K-factor parameter controlling rating volatility (typically 32-64)","Two model responses to compare","Diff algorithm (text diff, semantic diff, or custom comparison logic)","Optional: fact-checking API or knowledge base","Statistical analysis tools (e.g., logistic regression, clustering)"],"failure_modes":["No control over prompt selection — users vote on whatever prompts the system serves","Voting is subjective and may reflect individual preference rather than objective quality","No mechanism to weight votes by evaluator expertise or domain knowledge","Latency depends on both model response times; slower model delays comparison","Elo assumes transitive preferences (if A > B and B > C, then A > C), which may not hold for subjective quality judgments","Early-stage models with few votes have high rating volatility; confidence intervals widen with sparse data","Vote distribution may be skewed toward popular prompts or categories, biasing ratings","No explicit handling of ties or 'both equally good' votes — requires binary preference","Rating convergence is slow for models with similar strength; requires thousands of votes for stable rankings","Diff generation is heuristic-based and may miss subtle differences or highlight irrelevant ones","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.7,"quality":0.9,"ecosystem":0.3,"match_graph":0.25,"freshness":0.75,"weights":{"adoption":0.25,"quality":0.35,"ecosystem":0.15,"match_graph":0.2,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:23.327Z","last_scraped_at":null,"last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":null,"model_likes":null}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=lmsys-chatbot-arena","compare_url":"https://unfragile.ai/compare?artifact=lmsys-chatbot-arena"}},"signature":"Ep4gMse8m2Y9ZpMTURhJ93PzOQOm6hqpfXCmy5VafKwjUTkx0h40v4ZzRBLIPTTe36PtF6yzVMGMncAb36TMAQ==","signedAt":"2026-06-21T01:42:01.495Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/lmsys-chatbot-arena","artifact":"https://unfragile.ai/lmsys-chatbot-arena","verify":"https://unfragile.ai/api/v1/verify?slug=lmsys-chatbot-arena","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}