{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"chatbot-arena","slug":"chatbot-arena","name":"Chatbot Arena","type":"benchmark","url":"https://lmarena.ai","page_url":"https://unfragile.ai/chatbot-arena","categories":["testing-quality"],"tags":[],"pricing":{"model":"free","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"chatbot-arena__cap_0","uri":"capability://safety.moderation.pairwise.preference.collection.via.crowdsourced.battles","name":"pairwise-preference-collection-via-crowdsourced-battles","description":"Collects human preference judgments through a web-based Battle Mode interface where users submit identical prompts to two anonymous models and select which response is superior. The platform aggregates these pairwise comparisons across millions of user interactions to build a preference dataset that reflects real-world conversational quality expectations. This crowdsourced approach captures diverse user preferences across multiple languages and task types without requiring predefined evaluation rubrics or expert annotators.","intents":["Gather large-scale human preference data for LLM evaluation without expert annotation costs","Understand how real users compare model outputs across diverse conversational tasks","Build a continuously growing preference dataset that reflects evolving user expectations","Evaluate models on tasks users actually care about rather than synthetic benchmarks"],"best_for":["LLM researchers building preference datasets for RLHF training","Model developers seeking real-world performance validation across diverse use cases","Organizations evaluating multiple LLMs against actual user preferences"],"limitations":["Sampling bias — only users who visit Arena and engage in battles contribute data, not representative of all use cases or user populations","Preference bias — human preference may favor verbose, confident-sounding, or stylistically appealing responses over factually correct but terse ones","No control over inference parameters — models are called as black boxes, so response quality depends on provider's default settings","Stochastic evaluation — pairwise preference is inherently variable; no test-retest reliability metrics provided","Language distribution unknown — 'diverse languages' mentioned but no breakdown of which languages are represented or their relative weights"],"requires":["Web browser with JavaScript enabled","User account (login required for battle participation)","Access to https://lmarena.ai","Ability to articulate preferences between two text responses"],"input_types":["text prompts (user-submitted queries)","optional file uploads (scope and supported formats unknown)"],"output_types":["pairwise preference labels (win/loss/tie)","aggregated preference data (used internally for Elo computation)"],"categories":["safety-moderation","crowdsourced-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"chatbot-arena__cap_1","uri":"capability://data.processing.analysis.elo.rating.computation.for.model.ranking","name":"elo-rating-computation-for-model-ranking","description":"Converts pairwise battle outcomes (win/loss/tie) into Elo ratings using a chess-style rating system that produces relative model rankings. The system processes individual battle results and aggregates them to compute dynamic Elo scores that reflect each model's expected performance against others. This approach enables continuous ranking updates as new battles are collected and provides a single comparable metric across all evaluated models.","intents":["Generate comparable rankings across diverse LLMs using a single metric","Track how model performance changes over time as new battles are collected","Determine which model is likely to perform better on a random conversational task","Identify performance tiers among competing models"],"best_for":["Model developers comparing their system against competitors","Researchers analyzing relative LLM performance trends","Organizations selecting between multiple LLM providers based on empirical rankings"],"limitations":["Elo formula and parameters not publicly documented — specific rating computation methodology unknown","No confidence intervals or significance testing provided — uncertainty quantification absent","Relative ranking only — Elo provides no absolute performance metric or interpretation of what a score means in real-world terms","Potential ceiling effects — pairwise preference may saturate when models are very similar in quality, making fine-grained distinctions unreliable","Sample size per model comparison unknown — some models may have insufficient battles for stable ratings","Tie-breaking rules for leaderboard ranking unknown"],"requires":["Minimum number of battles per model (threshold unknown)","Continuous battle data collection","Computational infrastructure to update ratings as new battles arrive"],"input_types":["pairwise battle outcomes (win/loss/tie)","model identifiers","timestamp metadata"],"output_types":["Elo rating (numeric score)","leaderboard ranking (ordinal position)","rating history (time-series data)"],"categories":["data-processing-analysis","ranking-algorithm"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"chatbot-arena__cap_2","uri":"capability://safety.moderation.anonymous.model.comparison.interface","name":"anonymous-model-comparison-interface","description":"Provides a web-based Battle Mode interface where users submit prompts and receive responses from two anonymous models side-by-side without knowing which model is which. The anonymization prevents bias from brand recognition or prior expectations about model quality. Users compare the responses and select which is better, with their preference recorded and used for ranking computation.","intents":["Compare two LLM outputs on identical prompts without brand bias influencing judgment","Evaluate models on tasks I care about rather than predefined benchmarks","Contribute to a large-scale preference dataset by sharing my judgments","Discover which model performs better on my specific use case"],"best_for":["Individual users evaluating LLMs for personal or organizational use","Researchers collecting unbiased preference judgments","Model developers seeking real-world performance feedback"],"limitations":["Anonymization prevents learning which specific models are being compared — useful for unbiased preference collection but limits diagnostic insights","Single-shot evaluation — no consistency testing or robustness evaluation across multiple runs","No control over model parameters — responses depend on provider defaults, making it unclear whether differences reflect model capability or inference settings","User expertise varies — casual users may make uninformed preference judgments compared to domain experts","No structured evaluation criteria — preferences are subjective and may not align with objective correctness"],"requires":["Web browser with JavaScript enabled","User account and login to https://lmarena.ai","Ability to articulate and submit text prompts","Time to wait for model responses (latency unknown)"],"input_types":["text prompts (free-form user queries)","optional file uploads (format and scope unknown)"],"output_types":["two text responses (from anonymous models)","preference selection (binary or ternary: better/worse/tie)"],"categories":["safety-moderation","user-interface"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"chatbot-arena__cap_3","uri":"capability://data.processing.analysis.multi.language.conversational.evaluation","name":"multi-language-conversational-evaluation","description":"Evaluates LLM performance across diverse languages by accepting user prompts in multiple languages and collecting preference judgments on multilingual responses. The platform aggregates language-specific preference data to produce Elo ratings that reflect model quality across linguistic diversity. This approach captures how well models handle non-English tasks and whether performance varies significantly across languages.","intents":["Evaluate LLM quality across multiple languages without requiring separate benchmarks per language","Identify whether models perform consistently across languages or have language-specific weaknesses","Understand real-world multilingual user preferences","Compare models on the languages my users actually speak"],"best_for":["Organizations serving multilingual user bases","Researchers studying cross-lingual LLM performance","Model developers optimizing for global markets"],"limitations":["Language distribution unknown — no breakdown of which languages are represented, their relative weights, or whether distribution reflects global language usage","Language-specific performance analysis not provided — cannot determine whether a model's overall Elo rating masks poor performance in specific languages","Annotator expertise varies by language — some languages may have fewer expert users contributing judgments","No language-specific confidence metrics — unclear which language rankings are reliable vs. based on small sample sizes","Potential language bias in model training — models may perform better on high-resource languages (English, Chinese, Spanish) than low-resource languages"],"requires":["User ability to write prompts in supported languages","Models that support multilingual inference","Sufficient user participation in each language to generate stable rankings"],"input_types":["text prompts in multiple languages","optional file uploads (language support unknown)"],"output_types":["multilingual responses from models","language-specific preference judgments","aggregated Elo ratings (language-agnostic)"],"categories":["data-processing-analysis","multilingual-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"chatbot-arena__cap_4","uri":"capability://safety.moderation.public.conversation.disclosure.for.research","name":"public-conversation-disclosure-for-research","description":"Automatically discloses user conversations and metadata to AI model providers and makes them publicly available for research purposes. The platform explicitly states in its terms that 'Your conversations and certain other personal information will be disclosed to the relevant AI providers and may otherwise be disclosed publicly.' This enables researchers to analyze real-world conversational patterns and model responses at scale while creating a potential data contamination vector for future model training.","intents":["Enable researchers to study real-world LLM usage patterns and failure modes","Provide model providers with feedback on how their models are used in practice","Build a public dataset of conversational AI interactions for research","Identify common user intents and model weaknesses from production data"],"best_for":["Researchers studying LLM behavior and user interactions","Model providers seeking production usage insights","Organizations building datasets for LLM research"],"limitations":["High data contamination risk — public disclosure of conversations creates vector for future model training on Arena data, potentially biasing future model evaluations","No decontamination procedures mentioned — no evidence that Arena data is excluded from model training sets","Privacy implications — users may not fully understand that conversations are publicly disclosed and may contain sensitive information","Feedback loop — models evaluated on Arena may be trained on Arena data, creating circular evaluation where models optimize for Arena preferences","Irreversible disclosure — conversations cannot be deleted once submitted; permanent public record","No opt-out mechanism mentioned — all users subject to disclosure by default"],"requires":["User acceptance of terms disclosing conversations publicly","No expectation of privacy for submitted prompts and responses","Acceptance that conversations may be used for model training"],"input_types":["user prompts","model responses","preference judgments","user metadata (account information)"],"output_types":["public conversation dataset","disclosed data to model providers","research-accessible conversation logs"],"categories":["safety-moderation","data-governance"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"chatbot-arena__cap_5","uri":"capability://search.retrieval.live.leaderboard.with.continuous.ranking.updates","name":"live-leaderboard-with-continuous-ranking-updates","description":"Maintains a publicly accessible leaderboard at https://lmarena.ai that ranks models by Elo rating and updates continuously as new battles are collected. The leaderboard provides real-time visibility into model performance rankings without requiring static benchmark re-runs. Users can search and filter models, and rankings change dynamically as preference data accumulates, enabling tracking of performance trends over time.","intents":["Check current model rankings without waiting for benchmark re-runs","Track how a specific model's performance changes over time","Compare multiple models on a single leaderboard","Identify top-performing models for a given use case"],"best_for":["Model developers monitoring competitive positioning","Organizations selecting LLMs based on current performance rankings","Researchers tracking model performance trends"],"limitations":["Leaderboard content not provided in documentation — cannot extract top 5 models or specific scores","Update frequency unknown — unclear how often rankings are refreshed or whether updates are real-time","Ranking criteria partially unknown — Elo rating confirmed but tie-breaking rules not documented","Search functionality scope unknown — unclear what filtering options are available","No historical snapshots mentioned — cannot compare leaderboard state at different points in time","Submission process unknown — unclear how new models are added or what requirements exist","No confidence intervals displayed — rankings appear definitive but underlying uncertainty is unknown"],"requires":["Web browser access to https://lmarena.ai","No authentication required for viewing leaderboard (login only for battle participation)","JavaScript enabled for dynamic content"],"input_types":["model identifiers","Elo ratings","battle outcome data"],"output_types":["ranked model list","Elo scores","performance trends (if historical data available)","search results (if filtering applied)"],"categories":["search-retrieval","leaderboard-system"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"chatbot-arena__cap_6","uri":"capability://tool.use.integration.third.party.model.execution.and.response.generation","name":"third-party-model-execution-and-response-generation","description":"Executes user prompts against third-party LLM APIs (OpenAI, Anthropic, etc.) and returns responses without controlling inference parameters or model versions. The platform acts as a black-box orchestrator that sends prompts to model providers' APIs and collects responses for comparison. Users have no visibility into which model versions are being used, what temperature or sampling parameters are applied, or how responses are generated.","intents":["Compare responses from multiple LLM providers on identical prompts","Evaluate models without needing to host or manage inference infrastructure","Test how different models respond to my specific use cases","Avoid infrastructure costs of running multiple models locally"],"best_for":["Users evaluating commercial LLM APIs without infrastructure investment","Researchers comparing black-box model behavior","Organizations avoiding model hosting and inference costs"],"limitations":["No control over inference parameters — response quality depends on provider defaults (temperature, top-p, max-tokens, etc.), making it unclear whether differences reflect model capability or settings","Model version unknown — no visibility into which specific model versions are being called, making it impossible to reproduce results or track version-specific performance","Provider-side changes invisible — model updates or API changes affect rankings without notification or control","Latency dependent on provider — response times vary based on provider load and infrastructure, not model quality","Cost opaque — no visibility into API costs or which party bears the expense","No reproducibility — cannot re-run identical evaluations with same model versions or parameters","API rate limits and availability — evaluation depends on provider API stability and may be affected by outages"],"requires":["Active API keys or accounts with model providers (OpenAI, Anthropic, etc.)","Sufficient API quota to handle battle volume","Provider API availability and uptime","Network connectivity to provider APIs"],"input_types":["text prompts","optional file uploads (format unknown)"],"output_types":["text responses from models","response metadata (latency, tokens used, etc. — unknown if captured)"],"categories":["tool-use-integration","api-orchestration"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"chatbot-arena__cap_7","uri":"capability://data.processing.analysis.real.world.task.distribution.evaluation","name":"real-world-task-distribution-evaluation","description":"Evaluates models on conversational tasks submitted by real users rather than predefined synthetic benchmarks, capturing task distribution that reflects actual use cases. The platform accepts free-form user prompts across diverse domains and use cases, enabling evaluation on tasks users genuinely care about. This approach produces rankings that reflect performance on real-world conversational quality rather than artificial benchmark tasks.","intents":["Evaluate models on tasks my users actually ask about","Understand model performance on real-world use cases rather than synthetic benchmarks","Identify which models excel at the specific tasks I care about","Validate that benchmark rankings correlate with production performance"],"best_for":["Organizations validating LLM performance on production use cases","Researchers studying real-world LLM usage patterns","Model developers understanding how models perform on diverse tasks"],"limitations":["Task distribution unknown — no breakdown of task categories, domains, or relative weights; unclear whether distribution reflects real-world usage","Sampling bias — only users who visit Arena and engage in battles contribute tasks; not representative of all use cases or user populations","No task-specific analysis — cannot determine whether a model's overall ranking masks poor performance on specific task categories","Task contamination risk — real-world tasks may overlap with model training data, biasing evaluations","No task difficulty analysis — unclear whether tasks are uniformly difficult or whether some models benefit from easier task distribution","Preference bias — user preferences may not align with objective correctness or task-specific success criteria"],"requires":["User ability to articulate tasks as text prompts","Diverse user participation across multiple domains","Sufficient task volume to establish stable rankings"],"input_types":["free-form text prompts across any domain","optional file uploads (scope unknown)"],"output_types":["model responses","preference judgments","task-specific performance data (if available)"],"categories":["data-processing-analysis","task-distribution"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"chatbot-arena__cap_8","uri":"capability://data.processing.analysis.file.upload.support.for.extended.context.evaluation","name":"file-upload-support-for-extended-context-evaluation","description":"Supports file uploads in the Battle Mode interface, enabling evaluation of models on tasks that require extended context or document analysis. Users can upload files (format and scope unknown) alongside text prompts, allowing models to process documents, code, or other file-based inputs. This extends evaluation beyond pure text prompts to include document understanding and file-based reasoning tasks.","intents":["Evaluate models on document understanding and analysis tasks","Test how models handle code review, document summarization, or file-based reasoning","Compare models on tasks requiring extended context from uploaded files","Validate model performance on document-centric use cases"],"best_for":["Users evaluating models on document understanding tasks","Organizations testing code review or document analysis capabilities","Researchers studying how models handle file-based context"],"limitations":["Supported file formats unknown — no documentation of which file types are accepted (PDF, TXT, code files, etc.)","File size limits unknown — unclear whether large documents are supported or truncated","File processing method unknown — unclear whether files are converted to text, parsed structurally, or handled as binary","Model support unknown — unclear whether all models support file uploads or only specific providers","Context window limits — extended files may exceed model context windows, causing truncation or failure","Privacy implications — uploaded files are subject to same public disclosure as text prompts","No file-specific analysis — cannot determine whether models perform differently on file-based vs. text-only tasks"],"requires":["File upload capability in web interface","Supported file format (unknown)","File size within limits (unknown)","Models that support file input (unknown which models support this)"],"input_types":["text prompts","uploaded files (format and scope unknown)"],"output_types":["model responses incorporating file context","preference judgments on file-based tasks"],"categories":["data-processing-analysis","file-handling"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"chatbot-arena__cap_9","uri":"capability://tool.use.integration.user.authentication.and.battle.participation.gating","name":"user-authentication-and-battle-participation-gating","description":"Requires user login to participate in battles and contribute preference judgments, while keeping the leaderboard publicly viewable without authentication. The platform maintains user accounts that track battle history, preferences, and contribution metrics. Authentication gates battle participation to prevent spam and enable user-specific analytics while maintaining public leaderboard visibility.","intents":["Participate in model evaluation battles and contribute preference judgments","Track my battle history and contribution metrics","Maintain a persistent identity across multiple evaluation sessions","Access user-specific features or analytics"],"best_for":["Regular users contributing to Arena evaluation","Researchers tracking individual annotator behavior","Organizations managing team participation in evaluation"],"limitations":["Login requirement creates friction — casual users may not participate if authentication is burdensome","Account creation process unknown — unclear what information is required or how accounts are managed","User-specific analytics unknown — no documentation of what metrics are tracked or available to users","No team or organizational accounts mentioned — unclear whether organizations can manage multiple evaluators","Privacy implications — user accounts linked to conversations subject to public disclosure","No documented API for programmatic participation — unclear whether evaluation can be automated or integrated into workflows"],"requires":["User account creation (process unknown)","Login credentials (authentication method unknown)","Email or other account identifier (requirements unknown)"],"input_types":["login credentials","account creation information"],"output_types":["authenticated session","user profile","battle history (if available)"],"categories":["tool-use-integration","authentication"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"chatbot-arena__headline","uri":"capability://testing.quality.crowdsourced.llm.evaluation.platform","name":"crowdsourced llm evaluation platform","description":"Chatbot Arena is a crowdsourced platform that allows users to evaluate and compare large language models (LLMs) through side-by-side comparisons, generating Elo ratings based on real human preferences across various conversational tasks and languages.","intents":["best LLM evaluation platform","LLM benchmarking for conversational AI","how to compare language models","crowdsourced model evaluation tools","Elo rating systems for AI models"],"best_for":["developers evaluating LLMs","researchers comparing AI models"],"limitations":["may not cover all model types","depends on user participation"],"requires":[],"input_types":[],"output_types":[],"categories":["testing-quality"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":62,"verified":false,"data_access_risk":"high","permissions":["Web browser with JavaScript enabled","User account (login required for battle participation)","Access to https://lmarena.ai","Ability to articulate preferences between two text responses","Minimum number of battles per model (threshold unknown)","Continuous battle data collection","Computational infrastructure to update ratings as new battles arrive","User account and login to https://lmarena.ai","Ability to articulate and submit text prompts","Time to wait for model responses (latency unknown)"],"failure_modes":["Sampling bias — only users who visit Arena and engage in battles contribute data, not representative of all use cases or user populations","Preference bias — human preference may favor verbose, confident-sounding, or stylistically appealing responses over factually correct but terse ones","No control over inference parameters — models are called as black boxes, so response quality depends on provider's default settings","Stochastic evaluation — pairwise preference is inherently variable; no test-retest reliability metrics provided","Language distribution unknown — 'diverse languages' mentioned but no breakdown of which languages are represented or their relative weights","Elo formula and parameters not publicly documented — specific rating computation methodology unknown","No confidence intervals or significance testing provided — uncertainty quantification absent","Relative ranking only — Elo provides no absolute performance metric or interpretation of what a score means in real-world terms","Potential ceiling effects — pairwise preference may saturate when models are very similar in quality, making fine-grained distinctions unreliable","Sample size per model comparison unknown — some models may have insufficient battles for stable ratings","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.7,"quality":0.9,"ecosystem":0.3,"match_graph":0.25,"freshness":0.75,"weights":{"adoption":0.25,"quality":0.35,"ecosystem":0.15,"match_graph":0.2,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:21.547Z","last_scraped_at":null,"last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":null,"model_likes":null}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=chatbot-arena","compare_url":"https://unfragile.ai/compare?artifact=chatbot-arena"}},"signature":"Kiioic7VWIht9kdN0SaRDfdiTOZzaLZJI+60V4lL1tsjPDus1AAKkfAa3dQGVigecfHNeIL1MVjq2UcjQc73DA==","signedAt":"2026-06-22T01:19:10.694Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/chatbot-arena","artifact":"https://unfragile.ai/chatbot-arena","verify":"https://unfragile.ai/api/v1/verify?slug=chatbot-arena","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}