{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"math-benchmark","slug":"math-benchmark","name":"MATH Benchmark","type":"benchmark","url":"https://github.com/hendrycks/math","page_url":"https://unfragile.ai/math-benchmark","categories":["testing-quality"],"tags":[],"pricing":{"model":"free","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"math-benchmark__cap_0","uri":"capability://data.processing.analysis.competition.mathematics.problem.dataset.loading.with.multi.subject.stratification","name":"competition-mathematics problem dataset loading with multi-subject stratification","description":"Loads and preprocesses 12,500 curated competition mathematics problems from AMC 10/12, AIME, and Math Olympiads using the MATHDataset class in MATH.py. The loader supports multiple tokenization strategies and can selectively include or exclude solution steps during preprocessing, enabling researchers to evaluate models on problem-solving without solution hints. Problems are stratified across 7 mathematical subjects (Prealgebra, Algebra, Number Theory, Counting/Probability, Geometry, Intermediate Algebra, Precalculus) with structured JSON metadata including problem statements, solutions, and difficulty levels.","intents":["Load a standardized mathematics benchmark dataset for evaluating LLM reasoning capabilities","Preprocess competition math problems with configurable tokenization for different model architectures","Access stratified subsets of problems by mathematical subject for targeted evaluation","Retrieve problems with or without solution steps depending on evaluation methodology"],"best_for":["AI researchers benchmarking language model mathematical reasoning","Teams evaluating LLM performance on competition-level mathematics","Developers building math-focused AI systems requiring standardized evaluation"],"limitations":["Dataset is static and fixed at 12,500 problems — no dynamic problem generation or augmentation","Requires manual download from Berkeley server; not automatically provisioned via package manager","Problems are English-language only; no multilingual variants","Subject distribution may not reflect real-world problem frequencies in mathematical competitions"],"requires":["Python 3.6+","Downloaded MATH dataset files in JSON format from Berkeley server","Tokenizer compatible with model architecture (BERT, GPT, etc.)"],"input_types":["JSON problem files with structure: {problem, solution, level, type, subject}"],"output_types":["Preprocessed problem-solution pairs with tokenized representations","Stratified subsets indexed by mathematical subject"],"categories":["data-processing-analysis","benchmark-dataset"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"math-benchmark__cap_1","uri":"capability://data.processing.analysis.mathematical.equivalence.checking.with.latex.normalization.and.algebraic.simplification","name":"mathematical equivalence checking with latex normalization and algebraic simplification","description":"Implements the is_equiv() function in math_equivalence.py that determines semantic equivalence between two mathematical expressions regardless of syntactic representation. The system applies a multi-stage normalization pipeline that handles LaTeX formatting, fraction representations, algebraic simplification, and numerical precision issues before performing string-based comparison. This enables accurate answer verification without requiring exact string matching, accommodating equivalent forms like '1/2', '0.5', and '\\frac{1}{2}'.","intents":["Verify whether a model-generated mathematical answer is correct without requiring exact string matching","Compare answers in different mathematical notations (decimal, fraction, LaTeX) as equivalent","Handle numerical precision issues when comparing floating-point results","Normalize algebraic expressions to canonical form for comparison"],"best_for":["Researchers evaluating LLM mathematical reasoning on competition problems","Systems requiring robust answer verification for math problems across multiple notations","Teams building automated grading systems for mathematical content"],"limitations":["Normalization pipeline may not handle all edge cases in advanced mathematics (e.g., complex numbers, symbolic expressions with multiple variables)","Numerical precision comparison uses fixed epsilon thresholds that may not adapt to problem-specific requirements","LaTeX parsing is regex-based rather than AST-based, limiting robustness to malformed or non-standard LaTeX","No support for symbolic equivalence checking (e.g., verifying that two polynomial expressions are identical)"],"requires":["Python 3.6+","math_equivalence.py module from hendrycks/math repository","Standard library dependencies (re, sympy for algebraic simplification)"],"input_types":["String representations of mathematical expressions (plain text, LaTeX, decimal, fraction formats)"],"output_types":["Boolean equivalence result (True/False)","Normalized canonical forms of both expressions"],"categories":["data-processing-analysis","safety-moderation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"math-benchmark__cap_10","uri":"capability://planning.reasoning.solution.step.extraction.and.intermediate.reasoning.evaluation","name":"solution step extraction and intermediate reasoning evaluation","description":"Extracts and preserves solution steps from MATH problems, enabling evaluation of intermediate reasoning and chain-of-thought capabilities. The system can optionally include or exclude solution steps during dataset loading, supporting different evaluation methodologies: evaluating final answers only (without hints) or evaluating intermediate reasoning steps. This enables researchers to assess whether models can generate correct reasoning chains or merely guess final answers.","intents":["Evaluate model ability to generate correct intermediate reasoning steps","Compare models on final answer accuracy vs reasoning quality","Assess chain-of-thought prompting effectiveness on competition mathematics","Analyze whether models memorize answers or derive them through reasoning"],"best_for":["Researchers studying chain-of-thought reasoning in language models","Teams evaluating reasoning quality beyond final answer correctness","Developers optimizing models for interpretable mathematical reasoning"],"limitations":["Solution step evaluation requires manual annotation of correct reasoning paths — no automatic verification","Multiple valid solution paths may exist for same problem; system cannot verify all valid approaches","Intermediate step correctness is subjective and difficult to evaluate automatically","No built-in support for partial credit — steps are either correct or incorrect"],"requires":["Python 3.6+","MATH dataset with solution steps included in problem metadata","MATHDataset class configured to include solution steps"],"input_types":["Problem metadata including solution steps","Model-generated reasoning chains"],"output_types":["Intermediate step correctness labels","Reasoning quality metrics","Chain-of-thought evaluation results"],"categories":["planning-reasoning","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"math-benchmark__cap_2","uri":"capability://planning.reasoning.local.gpt.style.model.evaluation.with.configurable.beam.search.and.sampling","name":"local gpt-style model evaluation with configurable beam search and sampling","description":"Provides evaluation infrastructure in eval_math_gpt.py that runs local language models (GPT-style architectures) on MATH dataset problems with configurable inference parameters including beam search width, sampling temperature, and top-k/top-p filtering. The run_eval() function orchestrates the evaluation pipeline: loads problems from MATHDataset, generates model responses with specified decoding strategy, extracts final answers from model outputs, and compares against ground truth using mathematical equivalence checking. Supports both greedy decoding and stochastic sampling for exploring model behavior under different inference regimes.","intents":["Evaluate a locally-hosted language model on competition mathematics problems with controlled inference parameters","Compare model performance across different decoding strategies (greedy vs beam search vs sampling)","Generate multiple candidate answers per problem using beam search to assess model uncertainty","Measure accuracy metrics (exact match, partial credit) with mathematical equivalence verification"],"best_for":["Researchers evaluating custom or fine-tuned language models on mathematical reasoning","Teams running local model evaluation without API dependencies","Developers optimizing inference parameters for math problem-solving tasks"],"limitations":["Requires local GPU/compute resources to run inference — no cloud offloading option","Beam search and sampling parameters must be manually tuned; no automatic hyperparameter optimization","Answer extraction from model outputs uses heuristic parsing (e.g., regex for 'Answer: X') which may fail on non-standard output formats","No built-in support for multi-turn reasoning or chain-of-thought prompting — evaluates single-pass generation only"],"requires":["Python 3.6+","Local language model checkpoint (GPT-2, GPT-3, or compatible architecture)","PyTorch or TensorFlow for model inference","MATH dataset loaded via MATHDataset class","GPU with sufficient VRAM for model inference (varies by model size)"],"input_types":["Problem text from MATH dataset","Model checkpoint and tokenizer","Inference configuration (beam_width, temperature, top_k, top_p)"],"output_types":["Model-generated answer strings","Accuracy metrics (correct/incorrect per problem)","Aggregated performance statistics by subject"],"categories":["planning-reasoning","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"math-benchmark__cap_3","uri":"capability://tool.use.integration.openai.gpt.3.api.based.model.evaluation.with.remote.inference","name":"openai gpt-3 api-based model evaluation with remote inference","description":"Provides evaluation infrastructure in evaluate_gpt3.py that interfaces with OpenAI's GPT-3 API for remote model evaluation on MATH problems. The system handles API authentication, batches problem submissions to the GPT-3 API, parses structured responses, and aggregates accuracy metrics. This enables evaluation of closed-source models without local compute resources, though with latency and cost considerations inherent to API-based inference.","intents":["Evaluate OpenAI GPT-3 or other API-accessible models on MATH dataset without local infrastructure","Benchmark closed-source language models on competition mathematics","Compare API-based model performance against local model baselines","Measure mathematical reasoning capabilities of production language models"],"best_for":["Researchers evaluating closed-source models (GPT-3, GPT-4) on mathematical reasoning","Teams without access to local GPU compute resources","Quick benchmarking workflows prioritizing simplicity over cost"],"limitations":["Requires valid OpenAI API key and incurs per-token costs for all evaluations","API rate limits may require evaluation to be distributed across multiple runs","Response latency adds significant overhead compared to local inference (100-500ms per problem)","No control over model inference parameters (temperature, top_k) — limited to API-exposed settings","Evaluation results depend on API availability and OpenAI's model updates"],"requires":["Python 3.6+","Valid OpenAI API key with GPT-3 access","Network connectivity to OpenAI API endpoints","MATH dataset loaded via MATHDataset class","Sufficient API credits for 12,500 problem evaluations"],"input_types":["Problem text from MATH dataset","OpenAI API credentials"],"output_types":["API responses containing model-generated answers","Accuracy metrics aggregated across problem set","Cost tracking (tokens used, estimated API charges)"],"categories":["tool-use-integration","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"math-benchmark__cap_4","uri":"capability://data.processing.analysis.subject.stratified.accuracy.metrics.aggregation.and.reporting","name":"subject-stratified accuracy metrics aggregation and reporting","description":"Aggregates evaluation results across the 12,500 problems and computes accuracy metrics stratified by mathematical subject (Prealgebra, Algebra, Number Theory, Counting/Probability, Geometry, Intermediate Algebra, Precalculus). The reporting system generates per-subject accuracy percentages, overall accuracy, and optional per-difficulty breakdowns. This enables fine-grained analysis of model strengths and weaknesses across mathematical domains, revealing whether models struggle with specific subject areas.","intents":["Compute overall accuracy on MATH dataset with mathematical equivalence verification","Analyze model performance breakdown by mathematical subject to identify domain-specific weaknesses","Generate comparative reports across multiple models or inference configurations","Track accuracy trends across problem difficulty levels (easy, medium, hard)"],"best_for":["Researchers publishing benchmarking results on MATH dataset","Teams analyzing model capabilities across mathematical domains","Developers optimizing models for specific mathematical subject areas"],"limitations":["Metrics are computed post-hoc after all evaluations complete — no streaming or incremental reporting","Subject distribution in MATH dataset may not reflect real-world problem frequencies, biasing comparative analysis","No statistical significance testing or confidence intervals — raw accuracy percentages only","Difficulty levels are subjective and may not correlate with actual model performance"],"requires":["Python 3.6+","Completed evaluation results with per-problem correctness labels","Problem metadata including subject and difficulty annotations"],"input_types":["Evaluation results: list of (problem_id, is_correct, subject, difficulty)","Problem metadata from MATH dataset"],"output_types":["Accuracy metrics: overall %, per-subject %, per-difficulty %","Aggregated statistics (mean, std dev across subjects)","Formatted reports (JSON, CSV, or human-readable tables)"],"categories":["data-processing-analysis","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"math-benchmark__cap_5","uri":"capability://data.processing.analysis.problem.metadata.extraction.and.structured.indexing","name":"problem metadata extraction and structured indexing","description":"Extracts and indexes structured metadata from MATH dataset JSON files including problem statement, solution steps, final answer, difficulty level, and mathematical subject. The indexing system enables efficient retrieval of problems by subject, difficulty, or other attributes, and provides structured access to problem components (problem text vs solution vs answer) for different evaluation workflows. Metadata is preserved throughout the evaluation pipeline to enable stratified analysis and filtering.","intents":["Access problem metadata (subject, difficulty, answer format) for filtering and analysis","Retrieve problems by mathematical subject for targeted evaluation","Extract solution steps for chain-of-thought or intermediate reasoning evaluation","Index problems for efficient lookup during evaluation runs"],"best_for":["Researchers analyzing problem characteristics and their correlation with model performance","Teams building custom evaluation workflows with problem filtering","Developers creating problem-aware model evaluation systems"],"limitations":["Metadata is static and fixed to original MATH dataset annotations — no dynamic enrichment","Subject and difficulty labels are subjective and may not align with model-specific difficulty","No full-text search or semantic indexing — filtering is limited to predefined metadata fields","Metadata schema is tightly coupled to MATH dataset format; adapting to other datasets requires code changes"],"requires":["Python 3.6+","MATH dataset JSON files with complete metadata","MATHDataset class from MATH.py"],"input_types":["MATH dataset JSON files with structure: {problem, solution, level, type, subject}"],"output_types":["Indexed problem objects with structured metadata","Filtered problem subsets by subject/difficulty","Metadata statistics (problem count per subject, difficulty distribution)"],"categories":["data-processing-analysis","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"math-benchmark__cap_6","uri":"capability://data.processing.analysis.answer.extraction.from.model.outputs.with.heuristic.parsing","name":"answer extraction from model outputs with heuristic parsing","description":"Extracts final numerical or symbolic answers from model-generated text using heuristic pattern matching (e.g., regex patterns for 'Answer: X', 'Final Answer:', or boxed notation). The extraction system handles common answer formats including integers, fractions, decimals, and algebraic expressions. This enables automatic answer verification without requiring models to output structured JSON or follow strict formatting conventions, accommodating natural language model outputs.","intents":["Extract final answers from free-form model outputs for automatic verification","Handle multiple answer format conventions (Answer:, Final Answer:, boxed notation)","Parse answers in various mathematical notations (fractions, decimals, expressions)","Enable evaluation of models not fine-tuned for structured output"],"best_for":["Evaluating base language models without answer format fine-tuning","Systems requiring flexible answer extraction from natural language outputs","Researchers benchmarking models across different output conventions"],"limitations":["Heuristic parsing is brittle and fails on non-standard output formats or ambiguous answer placement","No semantic understanding of answer context — may extract incorrect values if multiple numbers appear in output","Regex-based extraction cannot handle complex mathematical expressions or symbolic answers reliably","False negatives when models output correct answers in unexpected formats (e.g., 'The answer is 42' vs 'Answer: 42')","No confidence scoring — extraction succeeds or fails without indicating extraction reliability"],"requires":["Python 3.6+","Model output text (string)","Predefined regex patterns for answer extraction"],"input_types":["Model-generated text output (free-form natural language)"],"output_types":["Extracted answer string","Extraction success/failure flag","Normalized answer representation for equivalence checking"],"categories":["data-processing-analysis","text-generation-language"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"math-benchmark__cap_7","uri":"capability://data.processing.analysis.dataset.download.and.curation.from.competition.sources","name":"dataset download and curation from competition sources","description":"Curates and distributes the MATH dataset of 12,500 problems sourced from official mathematical competitions (AMC 10, AMC 12, AIME, and Math Olympiads). Problems are manually collected, verified for correctness, and formatted into standardized JSON structure with problem statement, solution, and metadata. The dataset is hosted on Berkeley servers and distributed via GitHub repository, enabling researchers to access high-quality competition mathematics problems for benchmarking.","intents":["Access a curated collection of competition-level mathematics problems for model evaluation","Download standardized MATH dataset in JSON format","Obtain problems verified for correctness from official mathematical competitions","Use problems spanning 7 mathematical subjects with consistent formatting"],"best_for":["Researchers benchmarking language models on mathematical reasoning","Teams evaluating AI systems on competition-level mathematics","Developers building math-focused AI applications requiring standardized evaluation"],"limitations":["Dataset is static and fixed at 12,500 problems — no dynamic updates or new problems","Download requires manual setup from Berkeley server; not automatically provisioned","Problems are English-language only; no multilingual variants","Dataset curation is one-time effort; no ongoing problem collection or quality assurance","Subject distribution may not reflect real-world problem frequencies in competitions"],"requires":["Network connectivity to Berkeley server or GitHub repository","Disk space for ~500MB dataset files","Python 3.6+ for loading and processing JSON files"],"input_types":["None (dataset is provided)"],"output_types":["JSON files containing 12,500 problems with structure: {problem, solution, level, type, subject}","Metadata files with problem statistics and subject distribution"],"categories":["data-processing-analysis","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"math-benchmark__cap_8","uri":"capability://data.processing.analysis.problem.difficulty.level.annotation.and.stratification","name":"problem difficulty level annotation and stratification","description":"Annotates each MATH problem with a difficulty level (easy, medium, hard) based on competition source and problem characteristics. The stratification system enables evaluation of model performance across difficulty tiers, revealing whether models struggle more with harder problems or show consistent performance. Difficulty annotations are preserved in problem metadata and used for stratified accuracy reporting.","intents":["Analyze model performance across problem difficulty levels","Identify whether models struggle more with hard vs easy problems","Compare model capabilities at different difficulty tiers","Filter problems by difficulty for targeted evaluation"],"best_for":["Researchers analyzing model scaling with problem difficulty","Teams evaluating whether models have consistent reasoning capabilities","Developers optimizing models for specific difficulty ranges"],"limitations":["Difficulty levels are subjective and based on competition source rather than objective metrics","Difficulty may not correlate with actual model performance — some 'hard' problems may be easy for models","No fine-grained difficulty scoring (e.g., 1-10 scale) — only coarse 3-level categorization","Difficulty annotations are static and cannot be updated based on empirical model performance"],"requires":["MATH dataset with difficulty annotations in problem metadata","Problem metadata from MATHDataset class"],"input_types":["Problem metadata including difficulty level annotation"],"output_types":["Stratified accuracy metrics by difficulty level","Problem subsets filtered by difficulty","Difficulty distribution statistics"],"categories":["data-processing-analysis","planning-reasoning"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"math-benchmark__cap_9","uri":"capability://data.processing.analysis.amps.pretraining.dataset.integration.for.model.training","name":"amps pretraining dataset integration for model training","description":"Provides access to the AMPS (Algebraic Mathematical Problem Solving) pretraining dataset, a large-scale collection of mathematical problems designed for pretraining language models on mathematical reasoning. The integration enables researchers to use AMPS for model pretraining and then evaluate the pretrained models on MATH benchmark, creating a complete pipeline from pretraining to evaluation. AMPS dataset is hosted on Google servers and can be downloaded separately from MATH.","intents":["Access large-scale mathematical problem dataset for pretraining language models","Pretrain models on diverse mathematical problems before fine-tuning on MATH","Evaluate models pretrained on AMPS against MATH benchmark","Compare models trained with vs without mathematical pretraining"],"best_for":["Researchers pretraining language models on mathematical reasoning","Teams building math-specialized language models","Developers studying impact of mathematical pretraining on downstream reasoning"],"limitations":["AMPS dataset is separate from MATH and requires independent download","No built-in integration between AMPS pretraining and MATH evaluation — requires custom pipeline","AMPS problem distribution and difficulty may differ from MATH, affecting transfer learning effectiveness","Pretraining on AMPS may introduce data leakage if AMPS contains problems similar to MATH test set"],"requires":["Python 3.6+","Network connectivity to Google servers for AMPS download","Disk space for large AMPS dataset (~several GB)","Training infrastructure (GPU, distributed training framework) for pretraining"],"input_types":["AMPS dataset files from Google servers"],"output_types":["Pretrained language model checkpoints","Training logs and loss curves","Evaluation results on MATH benchmark"],"categories":["data-processing-analysis","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"math-benchmark__headline","uri":"capability://testing.quality.mathematical.problem.solving.benchmark","name":"mathematical problem-solving benchmark","description":"A benchmark for evaluating mathematical problem-solving capabilities of language models, featuring 12,500 competition-level math problems across various subjects like algebra and number theory.","intents":["best math benchmarking tool","math evaluation framework for AI models","competition math problems for model training","benchmark for mathematical reasoning in AI","evaluate language models on math problems"],"best_for":["AI researchers","educators","data scientists"],"limitations":["requires familiarity with model evaluation","focused on math problems only"],"requires":["language model","computational resources"],"input_types":["math problems"],"output_types":["evaluation metrics","model performance data"],"categories":["testing-quality"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":63,"verified":false,"data_access_risk":"high","permissions":["Python 3.6+","Downloaded MATH dataset files in JSON format from Berkeley server","Tokenizer compatible with model architecture (BERT, GPT, etc.)","math_equivalence.py module from hendrycks/math repository","Standard library dependencies (re, sympy for algebraic simplification)","MATH dataset with solution steps included in problem metadata","MATHDataset class configured to include solution steps","Local language model checkpoint (GPT-2, GPT-3, or compatible architecture)","PyTorch or TensorFlow for model inference","MATH dataset loaded via MATHDataset class"],"failure_modes":["Dataset is static and fixed at 12,500 problems — no dynamic problem generation or augmentation","Requires manual download from Berkeley server; not automatically provisioned via package manager","Problems are English-language only; no multilingual variants","Subject distribution may not reflect real-world problem frequencies in mathematical competitions","Normalization pipeline may not handle all edge cases in advanced mathematics (e.g., complex numbers, symbolic expressions with multiple variables)","Numerical precision comparison uses fixed epsilon thresholds that may not adapt to problem-specific requirements","LaTeX parsing is regex-based rather than AST-based, limiting robustness to malformed or non-standard LaTeX","No support for symbolic equivalence checking (e.g., verifying that two polynomial expressions are identical)","Solution step evaluation requires manual annotation of correct reasoning paths — no automatic verification","Multiple valid solution paths may exist for same problem; system cannot verify all valid approaches","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.7,"quality":0.9,"ecosystem":0.39999999999999997,"match_graph":0.25,"freshness":0.52,"weights":{"adoption":0.25,"quality":0.35,"ecosystem":0.15,"match_graph":0.2,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-06-17T09:51:04.693Z","last_scraped_at":null,"last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":null,"model_likes":null}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=math-benchmark","compare_url":"https://unfragile.ai/compare?artifact=math-benchmark"}},"signature":"VWgaGIXFmgZLI2J6CWTHOeFLqvaWjOP7pN/+rDd3hImWIwovsqt2ScSsVA4EW0tppkPwpArvcV6zyAK8Z5C5Aw==","signedAt":"2026-06-20T11:42:02.893Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/math-benchmark","artifact":"https://unfragile.ai/math-benchmark","verify":"https://unfragile.ai/api/v1/verify?slug=math-benchmark","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}