{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"mteb","slug":"mteb","name":"MTEB","type":"benchmark","url":"https://github.com/embeddings-benchmark/mteb","page_url":"https://unfragile.ai/mteb","categories":["testing-quality","rag-knowledge"],"tags":[],"pricing":{"model":"free","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"mteb__cap_0","uri":"capability://data.processing.analysis.multi.task.embedding.model.evaluation.across.8.task.types","name":"multi-task embedding model evaluation across 8+ task types","description":"Evaluates embedding models against a standardized task hierarchy (AbsTask) that implements Classification, Clustering, PairClassification, Reranking, Retrieval, and STS tasks. Each task defines its own dataset, evaluation metrics, and task-specific logic, enabling consistent benchmarking across heterogeneous evaluation scenarios. The evaluation pipeline orchestrates model inference, metric computation, and result aggregation in a reproducible manner.","intents":["Compare embedding model performance across diverse downstream tasks","Validate that a new embedding model generalizes beyond single-task performance","Identify which task types a model excels or struggles with","Establish baseline performance for a custom embedding model"],"best_for":["Embedding model developers validating generalization","ML researchers comparing embedding approaches","Teams selecting production embedding models"],"limitations":["Task execution is sequential by default — no built-in parallelization across tasks","Memory overhead scales with dataset size; large retrieval tasks may require batching","Task-specific evaluators are tightly coupled to metric implementations — extending metrics requires subclassing"],"requires":["Python 3.8+","Model implementing the encoder protocol (SentenceTransformer wrapper or custom)","Sufficient disk space for dataset caching (varies by task selection, ~10-50GB for full benchmark)"],"input_types":["Text (sentences, documents, queries)","Model identifiers (HuggingFace model IDs or local paths)","Task configuration (task names, language codes)"],"output_types":["Structured results (JSON with per-task metrics)","Leaderboard-compatible result format","Per-task score breakdowns (precision, recall, NDCG, etc.)"],"categories":["data-processing-analysis","testing-quality"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"mteb__cap_1","uri":"capability://data.processing.analysis.multilingual.and.cross.lingual.evaluation.across.112.languages","name":"multilingual and cross-lingual evaluation across 112+ languages","description":"Supports evaluation of embedding models across 112+ languages through language-aware task metadata and multilingual dataset variants. The task system stores language codes and domain information, enabling filtering of tasks by language and cross-lingual evaluation scenarios. Dataset loading automatically handles language-specific variants, and the evaluation pipeline preserves language context through metadata propagation.","intents":["Evaluate how well an embedding model generalizes to non-English languages","Benchmark cross-lingual retrieval (e.g., query in English, documents in German)","Identify language-specific performance gaps in a multilingual model","Compare monolingual vs. multilingual embedding models on the same task set"],"best_for":["Teams building multilingual search or RAG systems","Researchers studying cross-lingual transfer in embeddings","Global companies evaluating models for non-English markets"],"limitations":["Not all 1000+ tasks have multilingual variants — coverage varies by language (English has most, low-resource languages have fewer)","Cross-lingual evaluation requires paired datasets (query language ≠ document language), which are limited to specific task types","Language-specific metrics (e.g., BERTScore for morphologically rich languages) are not task-specific — uses generic metrics"],"requires":["Python 3.8+","Model supporting the target language(s) (e.g., multilingual BERT, XLM-R)","Language code (ISO 639-1 or 639-3) for task filtering"],"input_types":["Language codes (e.g., 'en', 'de', 'zh', 'ar')","Task names with language variants","Multilingual model identifiers"],"output_types":["Per-language performance metrics","Cross-lingual performance matrices","Language-grouped leaderboard results"],"categories":["data-processing-analysis","testing-quality"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"mteb__cap_10","uri":"capability://data.processing.analysis.results.storage.loading.and.format.standardization","name":"results storage, loading, and format standardization","description":"Implements a standardized results format (JSON with per-task metrics, model metadata, and evaluation metadata) that enables reproducible result storage and leaderboard integration. Results are stored locally or in a centralized repository (HuggingFace Hub). The results system handles versioning, caching, and format validation. Results can be loaded and compared programmatically, enabling post-hoc analysis and leaderboard generation.","intents":["Store benchmark results in a standardized format for reproducibility","Load and compare results across multiple models and benchmarks","Submit results to the official leaderboard with proper formatting","Analyze results programmatically (e.g., compute correlations, identify outliers)"],"best_for":["Researchers publishing benchmark results","Teams tracking model performance over time","Leaderboard maintainers managing result submissions"],"limitations":["Results format is fixed — custom metrics require schema extension","Results are immutable once submitted — corrections require resubmission","Centralized repository (HuggingFace Hub) may have latency for large result sets","No built-in versioning — tracking result evolution requires external tools"],"requires":["Python 3.8+","Benchmark results (dict with per-task metrics)","Model metadata (dict with architecture, training data, etc.)","Optional: HuggingFace Hub credentials for submission"],"input_types":["Benchmark results (dict or JSON)","Model metadata (dict or JSON)","Evaluation metadata (timestamp, hardware, etc.)"],"output_types":["Standardized results (JSON)","Results file (local or remote)","Leaderboard-compatible format"],"categories":["data-processing-analysis","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"mteb__cap_11","uri":"capability://automation.workflow.contribution.system.with.point.based.incentives.for.task.and.model.additions","name":"contribution system with point-based incentives for task and model additions","description":"Implements a contribution tracking system that awards points for adding new tasks, models, and datasets to MTEB. Contributors earn points based on the scope and quality of their contribution (e.g., new task type, multilingual task, large dataset). The system tracks contributions and displays them on contributor profiles. Points are used to recognize and incentivize community contributions, enabling MTEB to scale beyond core maintainers.","intents":["Contribute new tasks or models to MTEB and receive recognition","Track contribution history and impact","Identify high-value contribution opportunities","Build community engagement around MTEB development"],"best_for":["Community contributors extending MTEB","Researchers publishing new evaluation tasks","Model developers adding models to the benchmark"],"limitations":["Point system is subjective — contribution value is determined by maintainers","No automated contribution validation — all contributions require manual review","Points have no tangible reward — purely recognition-based","Contribution tracking is manual — no automated detection of contributions"],"requires":["GitHub account","Contribution meeting MTEB quality standards","Pull request to MTEB repository"],"input_types":["Contribution type (new task, new model, new dataset)","Contribution scope (single-language, multilingual, new task type)","Contribution quality (test coverage, documentation, etc.)"],"output_types":["Contribution points (integer)","Contributor profile (GitHub username, contribution history)","Recognition (leaderboard, contributor list)"],"categories":["automation-workflow","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"mteb__cap_2","uri":"capability://automation.workflow.standardized.benchmark.suite.composition.and.execution","name":"standardized benchmark suite composition and execution","description":"Provides pre-defined benchmark suites (e.g., MTEB, RTEB) that group related tasks into coherent evaluation scenarios. The Benchmark class orchestrates task selection, model evaluation, and result aggregation. Benchmarks are composable — users can select specific task subsets, languages, or domains. The execution pipeline handles model loading, caching, and result serialization in a standardized format compatible with the leaderboard.","intents":["Run a standard benchmark suite (MTEB, RTEB) to compare models on a fixed task set","Create a custom benchmark by selecting specific tasks, languages, or domains","Reproduce published results by running the exact same benchmark configuration","Submit results to the official leaderboard with standardized formatting"],"best_for":["Researchers publishing embedding model papers","Teams benchmarking models for production selection","Leaderboard maintainers ensuring consistent evaluation"],"limitations":["Benchmark definitions are static — changing task composition requires code modification or custom benchmark creation","Result submission requires manual formatting and leaderboard integration — no automated CI/CD pipeline for results","Benchmark execution is deterministic but not bit-reproducible across hardware (floating-point variance)"],"requires":["Python 3.8+","MTEB library installed","Model compatible with MTEB encoder protocol","Sufficient compute for full benchmark (varies by model size and task count, typically 1-48 hours)"],"input_types":["Benchmark name (string identifier)","Model identifier (HuggingFace ID or local path)","Optional: task subset, language filter, domain filter"],"output_types":["Benchmark results (JSON with per-task and aggregate metrics)","Leaderboard submission format","Model card metadata"],"categories":["automation-workflow","testing-quality"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"mteb__cap_3","uri":"capability://tool.use.integration.encoder.protocol.abstraction.with.multi.framework.support","name":"encoder protocol abstraction with multi-framework support","description":"Defines a unified encoder protocol that abstracts over different embedding model implementations (SentenceTransformers, instruction-based models, custom implementations). The protocol specifies encode() method signatures and handles batching, device management, and output normalization. Wrappers for SentenceTransformer and instruction-based models implement the protocol, enabling seamless integration of diverse model architectures without modifying evaluation code.","intents":["Integrate a custom embedding model into MTEB without rewriting evaluation code","Use SentenceTransformer models directly without wrapper boilerplate","Evaluate instruction-tuned embedding models (e.g., with task-specific prompts)","Support emerging embedding model architectures (e.g., multimodal, sparse)"],"best_for":["Model developers adding new embedding architectures to MTEB","Teams using custom or proprietary embedding models","Researchers experimenting with novel encoder designs"],"limitations":["Protocol assumes batch encoding with optional instruction/prompt support — streaming or online learning not supported","Device management is implicit (auto-detect or explicit specification) — no fine-grained control over multi-GPU distribution","Output normalization is optional but not enforced — models returning unnormalized embeddings may produce incorrect metrics"],"requires":["Python 3.8+","Model implementing the encoder protocol (encode method with specific signature)","For SentenceTransformer: sentence-transformers library","For instruction-based: model supporting prompt/instruction parameters"],"input_types":["Text (sentences, documents, queries)","Optional: instructions/prompts for instruction-based models","Batch size (integer)"],"output_types":["Embeddings (numpy array or torch tensor, shape [batch_size, embedding_dim])","Optional: normalized embeddings (L2 norm)"],"categories":["tool-use-integration","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"mteb__cap_4","uri":"capability://data.processing.analysis.task.specific.metric.computation.and.result.aggregation","name":"task-specific metric computation and result aggregation","description":"Implements task-specific evaluators that compute metrics appropriate to each task type (e.g., NDCG for retrieval, F1 for classification, silhouette score for clustering). Metrics are computed per-task and aggregated into benchmark-level scores. The evaluation system supports custom metrics and handles edge cases (e.g., missing labels, ties in ranking). Results are serialized in a standardized format with per-task breakdowns and aggregate scores.","intents":["Compute standard metrics (NDCG, MRR, MAP) for retrieval tasks","Evaluate classification performance (accuracy, F1, precision, recall)","Measure clustering quality (silhouette score, V-measure)","Aggregate per-task metrics into a single benchmark score"],"best_for":["Researchers comparing embedding models using standard metrics","Teams validating model performance against published baselines","Leaderboard maintainers ensuring consistent metric computation"],"limitations":["Metrics are task-specific and not customizable per-task — extending metrics requires subclassing evaluators","Aggregation strategy (e.g., macro vs. micro averaging) is fixed — no per-benchmark aggregation configuration","Some metrics assume specific data distributions (e.g., NDCG assumes ranked lists) — may not apply to all task variants"],"requires":["Python 3.8+","Task-specific evaluator implementation","Ground truth labels and predictions"],"input_types":["Model predictions (embeddings or scores)","Ground truth labels (task-specific format)","Task metadata (task type, metric configuration)"],"output_types":["Per-task metrics (dict with metric names and values)","Aggregate benchmark score (float)","Detailed result breakdown (per-query or per-document metrics)"],"categories":["data-processing-analysis","testing-quality"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"mteb__cap_5","uri":"capability://automation.workflow.caching.and.performance.optimization.for.large.scale.evaluation","name":"caching and performance optimization for large-scale evaluation","description":"Implements multi-level caching to reduce redundant computation: dataset caching (avoid re-downloading), embedding caching (avoid re-encoding), and result caching (avoid re-evaluating). The caching system uses local disk storage (configurable path) and checks cache validity based on model/task/dataset versions. Batching and device management optimize memory usage and inference speed. Progress tracking and logging enable monitoring of long-running evaluations.","intents":["Speed up repeated evaluations of the same model on the same tasks","Reduce bandwidth usage by caching downloaded datasets","Avoid re-encoding documents when evaluating multiple models","Monitor progress and estimate time-to-completion for large benchmarks"],"best_for":["Teams running frequent model evaluations (e.g., during hyperparameter tuning)","Resource-constrained environments (limited bandwidth, compute)","Researchers iterating on model improvements"],"limitations":["Cache invalidation is version-based — changes to evaluation code may not invalidate stale cache entries","Embedding caching assumes deterministic model behavior — non-deterministic models (e.g., with dropout) may produce incorrect cached results","Cache storage is local disk — no distributed caching for multi-machine setups","Batching is fixed per-task — no dynamic batch size tuning based on available memory"],"requires":["Python 3.8+","Writable disk space for cache (configurable, default ~/.cache/mteb)","Sufficient RAM for batch processing (varies by model size and batch size)"],"input_types":["Model identifier","Task names","Batch size (integer, default 32)"],"output_types":["Cached embeddings (numpy arrays)","Cached results (JSON)","Cache statistics (hit rate, size)"],"categories":["automation-workflow","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"mteb__cap_6","uri":"capability://search.retrieval.interactive.leaderboard.with.dynamic.table.generation.and.filtering","name":"interactive leaderboard with dynamic table generation and filtering","description":"Provides a web-based leaderboard (Streamlit app in mteb/leaderboard/app.py) that visualizes benchmark results with interactive filtering and sorting. The leaderboard loads results from a centralized repository, generates dynamic tables with configurable columns (metrics, languages, domains), and supports filtering by model, task, language, and benchmark. Table styling and figure generation enable publication-quality visualizations. The leaderboard is automatically updated when new results are submitted.","intents":["Browse and compare embedding models across tasks and languages","Filter models by performance on specific tasks or languages","Visualize performance trends (e.g., model size vs. performance)","Submit new model results and see them reflected in the leaderboard"],"best_for":["Researchers selecting models for their use case","Model developers showcasing their results","Teams tracking model performance over time"],"limitations":["Leaderboard is read-only for most users — result submission requires manual review and approval","Table generation is static (generated at query time) — no caching for frequently accessed views","Filtering is client-side (Streamlit) — performance degrades with large result sets (1000+ models)","No historical tracking — leaderboard shows current results only, not performance evolution"],"requires":["Python 3.8+","Streamlit library","Access to results repository (local or remote)","Internet connection for web access"],"input_types":["Filter parameters (model name, task, language, benchmark)","Sort parameters (metric, ascending/descending)","Display parameters (columns to show, table format)"],"output_types":["HTML table with results","Figures (scatter plots, bar charts)","Model cards with metadata"],"categories":["search-retrieval","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"mteb__cap_7","uri":"capability://data.processing.analysis.model.metadata.and.model.card.generation","name":"model metadata and model card generation","description":"Maintains standardized metadata for each model (architecture, training data, languages supported, license) and generates model cards compatible with Hugging Face. Model metadata is stored alongside results and used for filtering and comparison. The system supports custom metadata fields and enables model developers to provide context (e.g., training approach, known limitations). Model cards are generated from metadata and results, providing a comprehensive overview of model capabilities.","intents":["Document model architecture, training data, and capabilities","Generate model cards for Hugging Face Hub integration","Filter models by metadata (e.g., license, training data)","Provide context for interpreting benchmark results"],"best_for":["Model developers publishing models to Hugging Face","Teams documenting model capabilities for internal use","Researchers providing reproducibility information"],"limitations":["Metadata schema is fixed — custom fields require schema extension","Model card generation is template-based — limited customization","Metadata is optional — many models lack complete metadata, limiting filtering effectiveness"],"requires":["Python 3.8+","Model metadata in standardized format (JSON or YAML)","Benchmark results for the model"],"input_types":["Model metadata (dict with architecture, training data, languages, etc.)","Benchmark results (JSON)","Optional: custom metadata fields"],"output_types":["Model card (Markdown)","Metadata (JSON)","Hugging Face Hub-compatible format"],"categories":["data-processing-analysis","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"mteb__cap_8","uri":"capability://automation.workflow.command.line.interface.for.benchmark.execution.and.result.submission","name":"command-line interface for benchmark execution and result submission","description":"Provides a CLI (mteb/cli.py) for running benchmarks, evaluating models, and submitting results without writing Python code. The CLI supports task selection, language filtering, model specification, and result output formatting. Commands include 'run' (execute benchmark), 'eval' (evaluate single task), 'submit' (submit results to leaderboard), and 'list' (show available tasks/models). The CLI integrates with the evaluation pipeline and handles result serialization.","intents":["Run a benchmark from the command line without Python scripting","Evaluate a single task or custom task subset","Submit results to the leaderboard with proper formatting","List available tasks, models, and benchmarks"],"best_for":["Users unfamiliar with Python API","CI/CD pipelines automating model evaluation","Quick benchmarking without custom scripts"],"limitations":["CLI is less flexible than Python API — advanced customization requires Python code","Result submission requires manual approval — no automated leaderboard updates","CLI output is text-based — no interactive visualization"],"requires":["Python 3.8+","MTEB library installed","Command-line shell (bash, zsh, Windows PowerShell)"],"input_types":["Command name (run, eval, submit, list)","Arguments (model ID, task names, benchmark name, output path)","Flags (--language, --batch-size, --cache, --output-format)"],"output_types":["Benchmark results (JSON, CSV, or text)","Submission confirmation (text)","Task/model listings (text table)"],"categories":["automation-workflow","tool-use-integration"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"mteb__cap_9","uri":"capability://tool.use.integration.extensible.task.system.for.adding.new.evaluation.scenarios","name":"extensible task system for adding new evaluation scenarios","description":"Provides a task framework (AbsTask base class) that enables developers to add new evaluation tasks without modifying core evaluation code. Tasks define dataset loading, metric computation, and task-specific logic through method overrides. The task registry enables dynamic task discovery and selection. Task metadata (language, domain, license) is standardized, enabling filtering and cross-cutting analysis. Documentation and examples guide task creation.","intents":["Add a new evaluation task (e.g., domain-specific retrieval) to MTEB","Create a custom benchmark by combining existing and new tasks","Contribute new tasks to the MTEB repository","Extend MTEB for specialized evaluation scenarios (e.g., multimodal, code)"],"best_for":["Researchers adding domain-specific evaluation tasks","Teams creating custom benchmarks for internal use","Contributors extending MTEB with new task types"],"limitations":["Task creation requires Python coding — no low-code task definition","Task interface is tightly coupled to evaluation pipeline — significant refactoring needed for novel task types","Dataset loading is task-specific — no generic dataset abstraction","Task metadata schema is fixed — custom fields require schema extension"],"requires":["Python 3.8+","Understanding of MTEB task interface (AbsTask)","Dataset in a supported format (HuggingFace datasets, local files)"],"input_types":["Task class definition (Python code)","Dataset (HuggingFace dataset or local files)","Task metadata (language, domain, license, etc.)"],"output_types":["Task implementation (Python class)","Task registration (added to task registry)","Results (compatible with leaderboard format)"],"categories":["tool-use-integration","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"mteb__headline","uri":"capability://data.processing.analysis.massive.text.embedding.benchmark.for.evaluating.embedding.models","name":"massive text embedding benchmark for evaluating embedding models","description":"MTEB is a comprehensive framework designed to evaluate text and multimodal embedding models across various tasks and languages, providing standardized benchmarks for performance comparison.","intents":["best text embedding benchmark","embedding model evaluation framework","MTEB for comparing embedding models","top benchmarks for text embeddings","how to evaluate embedding models effectively"],"best_for":["researchers","model developers","practitioners"],"limitations":[],"requires":[],"input_types":[],"output_types":[],"categories":["data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":64,"verified":false,"data_access_risk":"high","permissions":["Python 3.8+","Model implementing the encoder protocol (SentenceTransformer wrapper or custom)","Sufficient disk space for dataset caching (varies by task selection, ~10-50GB for full benchmark)","Model supporting the target language(s) (e.g., multilingual BERT, XLM-R)","Language code (ISO 639-1 or 639-3) for task filtering","Benchmark results (dict with per-task metrics)","Model metadata (dict with architecture, training data, etc.)","Optional: HuggingFace Hub credentials for submission","GitHub account","Contribution meeting MTEB quality standards"],"failure_modes":["Task execution is sequential by default — no built-in parallelization across tasks","Memory overhead scales with dataset size; large retrieval tasks may require batching","Task-specific evaluators are tightly coupled to metric implementations — extending metrics requires subclassing","Not all 1000+ tasks have multilingual variants — coverage varies by language (English has most, low-resource languages have fewer)","Cross-lingual evaluation requires paired datasets (query language ≠ document language), which are limited to specific task types","Language-specific metrics (e.g., BERTScore for morphologically rich languages) are not task-specific — uses generic metrics","Results format is fixed — custom metrics require schema extension","Results are immutable once submitted — corrections require resubmission","Centralized repository (HuggingFace Hub) may have latency for large result sets","No built-in versioning — tracking result evolution requires external tools","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.7,"quality":0.9,"ecosystem":0.49999999999999994,"match_graph":0.25,"freshness":0.52,"weights":{"adoption":0.25,"quality":0.35,"ecosystem":0.15,"match_graph":0.2,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-06-17T09:51:04.693Z","last_scraped_at":null,"last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":null,"model_likes":null}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=mteb","compare_url":"https://unfragile.ai/compare?artifact=mteb"}},"signature":"+8FaieDaVhKl3Z5zbs2sDWf//Jet8Du/m1KrIWMZVyyRJXxz+qXeyreDJYKcQ0dLBbc+OrlpGlmHkFZLgcVFCQ==","signedAt":"2026-06-22T17:24:23.423Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/mteb","artifact":"https://unfragile.ai/mteb","verify":"https://unfragile.ai/api/v1/verify?slug=mteb","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}