{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"hf-dataset-cais--mmlu","slug":"cais--mmlu","name":"mmlu","type":"dataset","url":"https://huggingface.co/datasets/cais/mmlu","page_url":"https://unfragile.ai/cais--mmlu","categories":["model-training"],"tags":["task_categories:question-answering","task_ids:multiple-choice-qa","annotations_creators:no-annotation","language_creators:expert-generated","multilinguality:monolingual","source_datasets:original","language:en","license:mit","size_categories:100K<n<1M","format:parquet","modality:text","library:datasets","library:pandas","library:polars","library:mlcroissant","arxiv:2009.03300","arxiv:2005.00700","arxiv:2005.14165","arxiv:2008.02275","region:us"],"pricing":{"model":"open_source","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"hf-dataset-cais--mmlu__cap_0","uri":"capability://data.processing.analysis.expert.curated.multiple.choice.question.answer.dataset.loading","name":"expert-curated multiple-choice question-answer dataset loading","description":"Loads a structured dataset of 439,045 multiple-choice questions across 57 academic subjects (STEM, humanities, social sciences) created by expert annotators. The dataset is distributed via HuggingFace's datasets library in Parquet format with standardized schema (question, choices A-D, correct answer, subject category), enabling direct integration into model evaluation pipelines without custom parsing or normalization logic.","intents":["benchmark language models against expert-curated academic knowledge across diverse domains","evaluate model performance on reasoning tasks requiring subject-matter expertise","train or fine-tune models on question-answering tasks with ground-truth labels","analyze model weaknesses across specific academic subjects or difficulty tiers"],"best_for":["ML researchers evaluating LLM capabilities on standardized benchmarks","model developers building question-answering systems requiring domain-specific evaluation","teams conducting comparative analysis of model performance across subjects"],"limitations":["English-only dataset — no multilingual coverage limits evaluation of non-English language models","Static snapshot from 2020 — does not reflect evolving knowledge or curriculum changes","Multiple-choice format only — does not evaluate free-form reasoning or explanation generation","No temporal versioning — cannot track model improvements over time on identical test sets","Subject distribution is imbalanced — STEM subjects overrepresented relative to humanities"],"requires":["HuggingFace datasets library (pip install datasets)","Python 3.7+","~2GB disk space for full dataset download","Internet connection for initial dataset fetch from HuggingFace Hub"],"input_types":["dataset identifier string (cais/mmlu)","optional subject filter (e.g., 'abstract_algebra', 'anatomy')","optional split selector (train/validation/test)"],"output_types":["structured records with fields: question (str), choices (list[str]), answer (str), subject (str)","Parquet files (native format)","Pandas DataFrames","PyArrow Tables"],"categories":["data-processing-analysis","benchmark-dataset"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-cais--mmlu__cap_1","uri":"capability://data.processing.analysis.subject.stratified.evaluation.split.generation","name":"subject-stratified evaluation split generation","description":"Provides pre-split train/validation/test partitions stratified by academic subject, ensuring each subject is represented proportionally across splits. This prevents data leakage where models might memorize subject-specific patterns in training data and enables fair cross-subject generalization testing. The splits are deterministic and reproducible across runs via fixed random seeds.","intents":["evaluate whether models generalize across subjects or overfit to training subject distributions","conduct subject-specific performance analysis by isolating test sets for individual domains","ensure balanced evaluation when fine-tuning models on subsets of subjects","reproduce benchmark results consistently across different research teams"],"best_for":["researchers conducting rigorous model evaluation with proper train/test separation","teams analyzing subject-specific model weaknesses or strengths","benchmark maintainers ensuring reproducibility across publications"],"limitations":["Fixed splits cannot be customized per research need — no dynamic stratification API","No cross-validation support — single train/val/test split limits statistical robustness","Subject imbalance persists across splits — some subjects have <100 test examples","No difficulty-based stratification — easy and hard questions mixed within splits"],"requires":["HuggingFace datasets library with split awareness","Python 3.7+"],"input_types":["split identifier ('train', 'validation', 'test')","optional subject filter"],"output_types":["stratified dataset subset with preserved subject proportions","Pandas DataFrame with split column"],"categories":["data-processing-analysis","evaluation-methodology"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-cais--mmlu__cap_2","uri":"capability://data.processing.analysis.zero.shot.and.few.shot.prompt.evaluation.framework","name":"zero-shot and few-shot prompt evaluation framework","description":"Enables systematic evaluation of language models under zero-shot (no examples) and few-shot (1-5 examples per subject) settings by providing standardized question formatting and answer extraction patterns. The dataset structure supports templating different prompt formats (chain-of-thought, direct answer, explanation-first) while maintaining consistent answer key matching for automated scoring.","intents":["measure model performance without fine-tuning to assess pre-trained knowledge","evaluate how quickly models adapt to new subjects with minimal in-context examples","compare prompt engineering strategies (e.g., CoT vs direct) on identical test sets","identify subjects where few-shot learning provides largest performance gains"],"best_for":["researchers studying in-context learning and prompt sensitivity","model developers optimizing prompt templates for production QA systems","teams evaluating foundation models before fine-tuning decisions"],"limitations":["No built-in prompt templating — users must implement their own formatting logic","Answer extraction assumes multiple-choice format — cannot evaluate free-form responses","No automatic prompt optimization — requires manual engineering or external tools","Few-shot examples are not pre-selected — users must implement sampling strategy"],"requires":["Language model API (OpenAI, Anthropic, local LLM)","Custom evaluation harness to format prompts and extract answers","Python 3.7+"],"input_types":["question text","multiple-choice options (A-D)","optional few-shot examples (1-5 per subject)","prompt template string"],"output_types":["model prediction (A/B/C/D)","accuracy score per subject","aggregate accuracy across all subjects","confusion matrix (predicted vs actual)"],"categories":["data-processing-analysis","evaluation-methodology"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-cais--mmlu__cap_3","uri":"capability://data.processing.analysis.cross.subject.generalization.analysis","name":"cross-subject generalization analysis","description":"Enables measurement of how well models trained or evaluated on one set of subjects transfer to held-out subjects, by providing explicit subject labels for every question. This supports leave-one-subject-out evaluation, subject-pair transfer analysis, and domain adaptation studies. The 57-subject taxonomy allows fine-grained analysis of which subject pairs have high transfer (e.g., physics→engineering) versus low transfer (e.g., law→medicine).","intents":["measure model robustness by testing on subjects completely absent from training","identify which subjects are prerequisites for learning other subjects","evaluate domain adaptation techniques by measuring transfer between subject pairs","analyze whether models develop general reasoning or memorize subject-specific patterns"],"best_for":["researchers studying transfer learning and domain generalization","teams building multi-domain QA systems requiring subject-specific adaptation","model developers optimizing for broad knowledge coverage"],"limitations":["Subject taxonomy is fixed — cannot add custom subject groupings or hierarchies","No semantic similarity between subjects — requires manual mapping to identify related domains","Subject imbalance makes some transfer analyses statistically underpowered (<100 test examples)","No temporal dimension — cannot measure knowledge decay or curriculum learning effects"],"requires":["Subject label extraction from dataset","Custom analysis code to compute transfer metrics","Python 3.7+"],"input_types":["model predictions on full dataset","subject labels for each question"],"output_types":["per-subject accuracy scores","subject-pair transfer matrix (source→target accuracy)","leave-one-subject-out evaluation results","transfer learning curves"],"categories":["data-processing-analysis","evaluation-methodology"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-cais--mmlu__cap_4","uri":"capability://data.processing.analysis.multi.format.dataset.consumption.via.standardized.library.interfaces","name":"multi-format dataset consumption via standardized library interfaces","description":"Provides access to the same dataset through multiple Python libraries (HuggingFace datasets, Pandas, Polars, MLCroissant) and serialization formats (Parquet, CSV, JSON), enabling integration into diverse ML workflows without format conversion. Each library interface exposes the same underlying schema (question, choices, answer, subject) but with library-specific optimizations (e.g., Polars for lazy evaluation, Pandas for exploratory analysis).","intents":["load dataset into preferred analysis tool without manual ETL or format conversion","integrate MMLU into existing ML pipelines using library-specific APIs","enable reproducible research by supporting multiple library versions and formats","reduce data loading latency for large-scale evaluation by choosing optimized format"],"best_for":["data scientists using Pandas for exploratory analysis","ML engineers building production pipelines with Polars for performance","researchers requiring reproducibility across different tool ecosystems","teams with heterogeneous tech stacks needing common dataset access"],"limitations":["Library-specific bugs or version incompatibilities not guaranteed to be fixed","Parquet format requires PyArrow — adds dependency for some workflows","No streaming API — full dataset must fit in memory for Pandas/Polars","Format conversions may introduce subtle schema differences (e.g., string encoding)"],"requires":["HuggingFace datasets library (pip install datasets) OR","Pandas (pip install pandas) OR","Polars (pip install polars) OR","MLCroissant support (requires additional setup)","Python 3.7+"],"input_types":["library identifier (datasets, pandas, polars, mlcroissant)","optional format specification (parquet, csv, json)"],"output_types":["HuggingFace Dataset object","Pandas DataFrame","Polars DataFrame","MLCroissant metadata + data files"],"categories":["data-processing-analysis","tool-use-integration"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-cais--mmlu__cap_5","uri":"capability://data.processing.analysis.academic.subject.taxonomy.and.hierarchical.filtering","name":"academic subject taxonomy and hierarchical filtering","description":"Provides explicit categorization of all 439K questions into 57 academic subjects (e.g., abstract_algebra, anatomy, astronomy, business_ethics, clinical_knowledge, etc.) with consistent labeling. This enables filtering, stratification, and analysis at subject level without requiring external knowledge graphs or manual categorization. Subjects span STEM (physics, chemistry, biology), humanities (history, philosophy, literature), social sciences (economics, psychology, sociology), and professional domains (law, medicine, business).","intents":["evaluate model performance on specific subjects or subject groups","train subject-specific models or adapters by filtering to relevant questions","analyze which subjects are hardest for models and why","create subject-balanced evaluation sets for fair comparison"],"best_for":["researchers studying subject-specific model capabilities","teams building specialized QA systems for particular domains","educators assessing model knowledge in specific academic areas"],"limitations":["Subject taxonomy is flat — no hierarchical grouping (e.g., STEM vs humanities)","Subject boundaries are fixed — cannot merge or split subjects for custom analysis","No subject difficulty ratings — cannot distinguish easy vs hard questions within subject","Subject imbalance — some subjects have 1000+ questions, others <100"],"requires":["Subject label access from dataset schema","Python 3.7+"],"input_types":["subject name (string) or list of subject names","optional filtering logic"],"output_types":["filtered dataset containing only questions from specified subjects","subject distribution statistics","per-subject accuracy metrics"],"categories":["data-processing-analysis","knowledge-organization"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":23,"verified":false,"data_access_risk":"low","permissions":["HuggingFace datasets library (pip install datasets)","Python 3.7+","~2GB disk space for full dataset download","Internet connection for initial dataset fetch from HuggingFace Hub","HuggingFace datasets library with split awareness","Language model API (OpenAI, Anthropic, local LLM)","Custom evaluation harness to format prompts and extract answers","Subject label extraction from dataset","Custom analysis code to compute transfer metrics","HuggingFace datasets library (pip install datasets) OR"],"failure_modes":["English-only dataset — no multilingual coverage limits evaluation of non-English language models","Static snapshot from 2020 — does not reflect evolving knowledge or curriculum changes","Multiple-choice format only — does not evaluate free-form reasoning or explanation generation","No temporal versioning — cannot track model improvements over time on identical test sets","Subject distribution is imbalanced — STEM subjects overrepresented relative to humanities","Fixed splits cannot be customized per research need — no dynamic stratification API","No cross-validation support — single train/val/test split limits statistical robustness","Subject imbalance persists across splits — some subjects have <100 test examples","No difficulty-based stratification — easy and hard questions mixed within splits","No built-in prompt templating — users must implement their own formatting logic","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.05,"quality":0.22,"ecosystem":0.5000000000000001,"match_graph":0.25,"freshness":0.75,"weights":{"adoption":0.3,"quality":0.25,"ecosystem":0.1,"match_graph":0.3,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:22.764Z","last_scraped_at":"2026-05-03T14:22:48.064Z","last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":null,"model_likes":null}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=cais--mmlu","compare_url":"https://unfragile.ai/compare?artifact=cais--mmlu"}},"signature":"jpX08WnMz4HDYmi2Og8DmG3BahcxthKZPkzHzTQXU3NAERksFp5NQELAJ1A6zff5eZfueY8EjgNegrWRKML1Bg==","signedAt":"2026-06-21T15:41:59.120Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/cais--mmlu","artifact":"https://unfragile.ai/cais--mmlu","verify":"https://unfragile.ai/api/v1/verify?slug=cais--mmlu","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}