{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"finqa","slug":"finqa","name":"FinQA","type":"dataset","url":"https://huggingface.co/datasets/ibm/finqa","page_url":"https://unfragile.ai/finqa","categories":["model-training","testing-quality"],"tags":[],"pricing":{"model":"free","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"finqa__cap_0","uri":"capability://data.processing.analysis.multi.step.numerical.reasoning.over.financial.documents","name":"multi-step numerical reasoning over financial documents","description":"Enables evaluation of AI systems' ability to perform chained mathematical operations (addition, subtraction, multiplication, division, comparisons) across both structured tables and unstructured text extracted from SEC filings. The dataset provides ground-truth question-answer pairs where answers require synthesizing data from multiple locations within earnings reports and applying sequential arithmetic operations, testing whether models can decompose complex financial queries into discrete computational steps.","intents":["Benchmark whether my LLM can correctly perform multi-step math over financial documents without hallucinating intermediate values","Evaluate if my financial AI system understands when to apply division vs multiplication for ratio calculations","Test whether my model can locate relevant financial figures across 10+ page earnings reports and chain them correctly"],"best_for":["ML researchers evaluating financial reasoning capabilities of LLMs and smaller language models","FinTech teams building automated financial analysis systems that need quantitative accuracy benchmarks","AI safety researchers studying numerical hallucination patterns in domain-specific contexts"],"limitations":["Dataset contains only S&P 500 companies — may not generalize to private company financials or non-US regulatory filings","Questions are synthetically generated by crowdworkers, not naturally occurring analyst queries — may miss real-world ambiguities","No temporal reasoning required — all questions reference single fiscal periods, not year-over-year trend analysis","Limited to English-language documents — no multilingual financial reasoning evaluation"],"requires":["Hugging Face datasets library (transformers>=4.0)","Python 3.7+","Sufficient GPU/CPU memory to load 8,281 question-answer pairs with full document context (~2-3GB)"],"input_types":["Natural language questions (English)","Structured financial tables (HTML/text format)","Unstructured earnings report text (SEC 10-K/10-Q filings)"],"output_types":["Numerical answers (integers, decimals, percentages)","Structured reasoning traces showing intermediate calculation steps","Boolean answers for comparison questions"],"categories":["data-processing-analysis","financial-reasoning"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"finqa__cap_1","uri":"capability://data.processing.analysis.financial.domain.knowledge.evaluation.through.earnings.report.comprehension","name":"financial domain knowledge evaluation through earnings report comprehension","description":"Assesses whether AI systems understand financial terminology, accounting concepts, and domain-specific metrics by requiring them to answer questions about real earnings reports from S&P 500 companies. The dataset tests recognition of financial line items (revenue, COGS, operating expenses, net income), ability to distinguish between different financial statements (income statement vs balance sheet), and understanding of financial ratios and metrics without explicit instruction on their definitions.","intents":["Measure whether my model understands what 'operating margin' means in the context of actual financial statements","Evaluate if my financial AI can distinguish between gross profit and net income without explicit definitions","Test whether my system recognizes which financial metrics are relevant to specific business questions"],"best_for":["Financial services companies building AI assistants for investor relations or earnings analysis","Academic researchers studying domain adaptation and transfer learning in specialized fields","FinTech startups evaluating whether general-purpose LLMs have sufficient financial literacy for production use"],"limitations":["Only covers large-cap US companies (S&P 500) — no small-cap, international, or sector-specific financial patterns","Questions focus on historical financial data interpretation, not forward-looking analysis or guidance interpretation","No accounting policy variations tested — assumes standard GAAP reporting without exploring IFRS or alternative accounting methods","Dataset does not include questions about financial risk, credit ratings, or qualitative MD&A analysis"],"requires":["Domain knowledge of basic financial statements (income statement, balance sheet, cash flow statement)","Hugging Face datasets library","Python 3.7+"],"input_types":["Natural language questions about financial metrics and concepts","Real earnings report text from SEC filings","Financial tables with line items and values"],"output_types":["Numerical financial metrics (revenue, profit, ratios)","Categorical answers identifying financial statement types","Comparative answers (e.g., 'Company A has higher revenue than Company B')"],"categories":["data-processing-analysis","domain-knowledge-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"finqa__cap_2","uri":"capability://data.processing.analysis.structured.table.extraction.and.reasoning.from.mixed.format.documents","name":"structured table extraction and reasoning from mixed-format documents","description":"Enables evaluation of AI systems' ability to extract numerical data from both structured HTML/text tables and unstructured prose within the same document, then reason over the extracted values. The dataset contains questions where relevant data appears in different formats — some figures are in formatted tables with clear row/column headers, while others are embedded in narrative text or footnotes — requiring robust parsing and entity linking before computation can occur.","intents":["Test whether my document parsing pipeline correctly extracts table values and matches them to narrative references","Evaluate if my model can handle cases where the same metric appears in both a table and narrative text with slightly different values","Benchmark my system's ability to resolve ambiguous references when multiple tables contain similar financial figures"],"best_for":["Document AI teams building table extraction and understanding systems","Enterprise search/RAG teams evaluating mixed-format document comprehension","ML engineers optimizing end-to-end document processing pipelines for financial data"],"limitations":["Tables are in text/HTML format only — no image-based table extraction required (no scanned PDFs or images)","No cross-document reasoning — all questions reference data within a single earnings report","Limited table complexity — most tables are 2D matrices without complex merged cells or hierarchical headers","No handling of footnote references or accounting policy disclosures that modify reported figures"],"requires":["Table parsing library (e.g., pandas, BeautifulSoup) for preprocessing","Python 3.7+","Hugging Face datasets library"],"input_types":["Structured tables (HTML/text format with headers and rows)","Unstructured narrative text from earnings reports","Mixed documents containing both table and prose financial data"],"output_types":["Extracted numerical values from tables","Linked entity references between tables and narrative","Computed results from extracted values"],"categories":["data-processing-analysis","document-understanding"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"finqa__cap_3","uri":"capability://data.processing.analysis.benchmark.dataset.curation.and.annotation.for.financial.ai.evaluation","name":"benchmark dataset curation and annotation for financial ai evaluation","description":"Provides a curated, crowdsourced-annotated dataset of 8,281 question-answer pairs with multi-step reasoning requirements, enabling systematic evaluation of AI systems on financial numerical reasoning. The dataset includes quality control mechanisms through crowdworker annotation, answer validation against ground truth, and coverage across diverse financial metrics and company types within the S&P 500, creating a reproducible evaluation standard for the financial AI community.","intents":["Use this dataset as a standard benchmark to compare my financial AI system against published baselines and other models","Evaluate whether my model's performance on FinQA correlates with real-world financial analysis accuracy","Track improvements in my financial reasoning system by running periodic evaluations against this fixed benchmark"],"best_for":["Researchers publishing financial AI papers who need a standard evaluation metric","ML teams establishing internal financial AI benchmarks and tracking model improvements","Open-source project maintainers building financial reasoning tools and needing reproducible evaluation"],"limitations":["Benchmark is static — does not evolve with new financial reporting standards or emerging company types","Crowdworker annotations may contain systematic biases or errors not caught by validation","No inter-annotator agreement scores provided — unclear how much ambiguity exists in question interpretation","Dataset is imbalanced across question types — some operation types (e.g., division) may be underrepresented","No official train/test split provided — researchers must define their own splits, reducing comparability"],"requires":["Hugging Face datasets library","Python 3.7+","Understanding of evaluation metrics (accuracy, F1, BLEU for numerical answers)"],"input_types":["Crowdsourced question-answer pairs","Annotated financial documents","Ground truth numerical answers"],"output_types":["Evaluation metrics (accuracy, exact match, numerical error rates)","Per-question analysis and error breakdowns","Leaderboard-compatible results for model comparison"],"categories":["data-processing-analysis","model-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"finqa__cap_4","uri":"capability://planning.reasoning.multi.hop.reasoning.evaluation.across.document.sections","name":"multi-hop reasoning evaluation across document sections","description":"Assesses AI systems' ability to perform multi-hop reasoning by requiring them to locate and combine information from different sections of earnings reports. Questions may require finding a figure in the income statement, then locating a related metric in the balance sheet, then performing arithmetic across both — testing whether models can maintain context across document boundaries and understand relationships between different financial statement sections.","intents":["Evaluate whether my model can correctly chain reasoning across multiple document sections without losing context","Test if my system understands relationships between income statement and balance sheet items (e.g., retained earnings)","Measure whether my financial AI can handle questions requiring data from footnotes, MD&A, and main financial statements"],"best_for":["AI researchers studying multi-hop reasoning and context management in long documents","Financial AI teams building systems that need to synthesize information across multiple report sections","LLM evaluation teams assessing whether models maintain coherent reasoning over 10+ page documents"],"limitations":["Questions are limited to single earnings reports — no cross-period or cross-company reasoning required","Multi-hop depth is typically 2-3 steps — does not test extreme reasoning chains of 5+ hops","No explicit reasoning traces provided — must infer reasoning path from question and answer alone","Document structure is standard SEC format — does not test reasoning over non-standard or poorly formatted documents"],"requires":["Hugging Face datasets library","Python 3.7+","Long-context language model (minimum 4K token window recommended)"],"input_types":["Multi-section earnings reports (10+ pages)","Questions requiring cross-section reasoning","Financial data from multiple statement types"],"output_types":["Numerical answers requiring multi-step derivation","Reasoning traces showing intermediate steps","Error analysis identifying where reasoning chains break"],"categories":["planning-reasoning","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"finqa__cap_5","uri":"capability://planning.reasoning.arithmetic.operation.type.classification.and.execution","name":"arithmetic operation type classification and execution","description":"Enables evaluation of whether AI systems can identify which arithmetic operations (addition, subtraction, multiplication, division, comparison) are required to answer financial questions, then execute them correctly. The dataset implicitly tests operation selection — a question asking 'what is the profit margin' requires division (net income / revenue), while 'what is total assets' requires addition — forcing models to understand financial semantics before applying math.","intents":["Test whether my model correctly identifies when to use division vs multiplication for financial ratios","Evaluate if my system can distinguish between questions requiring aggregation (addition) vs comparison (subtraction)","Measure whether my financial AI understands the semantic difference between 'increase' (subtraction) and 'growth rate' (division)"],"best_for":["ML researchers studying semantic understanding of mathematical operations in domain contexts","Financial AI teams optimizing operation selection in automated calculation systems","LLM evaluation teams assessing whether models understand when to apply which arithmetic operations"],"limitations":["Operations are limited to basic arithmetic — no advanced financial calculations (NPV, IRR, option pricing)","No explicit operation labels provided — must infer from question-answer pairs","Operations are deterministic given correct data extraction — does not test probabilistic reasoning or uncertainty","No handling of edge cases like division by zero or negative values in ratio calculations"],"requires":["Hugging Face datasets library","Python 3.7+","Ability to parse and analyze question-answer pairs to infer operation types"],"input_types":["Natural language financial questions","Numerical financial data","Financial metrics and ratios"],"output_types":["Identified arithmetic operations (add, subtract, multiply, divide, compare)","Executed calculations with correct results","Error analysis showing operation selection mistakes"],"categories":["planning-reasoning","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"finqa__cap_6","uri":"capability://data.processing.analysis.cross.document.financial.comparison.and.aggregation","name":"cross-document financial comparison and aggregation","description":"Provides evaluation capability for AI systems to compare financial metrics across multiple S&P 500 companies or aggregate metrics across different time periods within the same company's earnings reports. While individual questions reference single documents, the dataset structure enables evaluation of systems that can retrieve and compare relevant companies, requiring understanding of which metrics are comparable across entities and how to normalize for company size or accounting differences.","intents":["Evaluate whether my financial AI can correctly compare revenue growth rates across different companies","Test if my system understands which metrics require normalization (e.g., absolute revenue vs revenue per employee) for fair comparison","Measure whether my model can identify comparable companies and extract relevant metrics for peer analysis"],"best_for":["Financial analysis AI teams building peer comparison and benchmarking systems","Equity research automation platforms evaluating relative company performance","ML teams building financial data aggregation and normalization pipelines"],"limitations":["Dataset does not explicitly include cross-company comparison questions — requires external system to retrieve and compare","No normalization guidance provided — systems must independently determine which metrics require adjustment","Limited to S&P 500 companies — no cross-sector or cross-market comparisons","No temporal alignment — comparing companies with different fiscal year ends requires external handling"],"requires":["Hugging Face datasets library","Python 3.7+","External document retrieval system to fetch multiple company earnings reports","Financial data normalization logic (e.g., per-share metrics, industry-adjusted ratios)"],"input_types":["Multiple earnings reports from different S&P 500 companies","Financial metrics from comparable companies","Comparison queries (e.g., 'which company has higher margin')"],"output_types":["Comparative rankings or metrics","Aggregated financial statistics","Normalized metrics for fair comparison"],"categories":["data-processing-analysis","search-retrieval"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"finqa__headline","uri":"capability://data.processing.analysis.financial.question.answering.dataset","name":"financial question answering dataset","description":"A comprehensive dataset designed for financial question answering that requires numerical reasoning over real earnings reports, making it ideal for training AI systems in financial analysis and automated reporting.","intents":["best financial question answering dataset","financial dataset for numerical reasoning","top datasets for financial AI training","datasets for evaluating financial AI systems","financial data analysis benchmark dataset"],"best_for":["AI training in finance","evaluating financial reasoning models"],"limitations":[],"requires":[],"input_types":[],"output_types":[],"categories":["data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":57,"verified":false,"data_access_risk":"high","permissions":["Hugging Face datasets library (transformers>=4.0)","Python 3.7+","Sufficient GPU/CPU memory to load 8,281 question-answer pairs with full document context (~2-3GB)","Domain knowledge of basic financial statements (income statement, balance sheet, cash flow statement)","Hugging Face datasets library","Table parsing library (e.g., pandas, BeautifulSoup) for preprocessing","Understanding of evaluation metrics (accuracy, F1, BLEU for numerical answers)","Long-context language model (minimum 4K token window recommended)","Ability to parse and analyze question-answer pairs to infer operation types","External document retrieval system to fetch multiple company earnings reports"],"failure_modes":["Dataset contains only S&P 500 companies — may not generalize to private company financials or non-US regulatory filings","Questions are synthetically generated by crowdworkers, not naturally occurring analyst queries — may miss real-world ambiguities","No temporal reasoning required — all questions reference single fiscal periods, not year-over-year trend analysis","Limited to English-language documents — no multilingual financial reasoning evaluation","Only covers large-cap US companies (S&P 500) — no small-cap, international, or sector-specific financial patterns","Questions focus on historical financial data interpretation, not forward-looking analysis or guidance interpretation","No accounting policy variations tested — assumes standard GAAP reporting without exploring IFRS or alternative accounting methods","Dataset does not include questions about financial risk, credit ratings, or qualitative MD&A analysis","Tables are in text/HTML format only — no image-based table extraction required (no scanned PDFs or images)","No cross-document reasoning — all questions reference data within a single earnings report","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.7,"quality":0.8500000000000001,"ecosystem":0.39999999999999997,"match_graph":0.25,"freshness":0.75,"weights":{"adoption":0.3,"quality":0.25,"ecosystem":0.1,"match_graph":0.3,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:21.548Z","last_scraped_at":null,"last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":null,"model_likes":null}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=finqa","compare_url":"https://unfragile.ai/compare?artifact=finqa"}},"signature":"iX0uh9500ZcLp1JCfLMGsAQlgI1sx3dCSitz6lWsjWAiayEa3c0zb6ivDjWYOqcyvBkE/LeYXRboEbFTzb7lAA==","signedAt":"2026-06-22T09:14:23.957Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/finqa","artifact":"https://unfragile.ai/finqa","verify":"https://unfragile.ai/api/v1/verify?slug=finqa","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}