{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"hf-dataset-mlfoundations--mint-1t-pdf-cc-2023-06","slug":"mlfoundations--mint-1t-pdf-cc-2023-06","name":"MINT-1T-PDF-CC-2023-06","type":"dataset","url":"https://huggingface.co/datasets/mlfoundations/MINT-1T-PDF-CC-2023-06","page_url":"https://unfragile.ai/mlfoundations--mint-1t-pdf-cc-2023-06","categories":["model-training"],"tags":["task_categories:image-to-text","task_categories:text-generation","language:en","license:cc-by-4.0","size_categories:100B<n<1T","arxiv:2406.11271","region:us","multimodal"],"pricing":{"model":"open_source","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"hf-dataset-mlfoundations--mint-1t-pdf-cc-2023-06__cap_0","uri":"capability://data.processing.analysis.large.scale.multimodal.document.image.text.dataset.curation.and.indexing","name":"large-scale multimodal document-image-text dataset curation and indexing","description":"Provides a curated dataset of 1 trillion tokens spanning 539,406 PDF documents with aligned image-to-text pairs extracted from Common Crawl 2023-06 snapshot. The dataset uses a hierarchical indexing structure that maps document boundaries, page-level image coordinates, and corresponding OCR/text extractions, enabling efficient retrieval of multimodal training samples at scale without requiring full dataset materialization in memory.","intents":["Train vision-language models on real-world document understanding tasks with paired image and text data","Build document retrieval systems that understand both visual layout and textual content","Evaluate OCR and document parsing models against large-scale real-world PDF corpora","Create datasets for document classification, table extraction, and form understanding tasks"],"best_for":["ML researchers training multimodal foundation models at scale","Teams building document understanding and OCR systems","Organizations developing enterprise document processing pipelines"],"limitations":["1T token size requires distributed storage infrastructure — not suitable for single-machine training without streaming/sharding","PDF extraction quality varies by source document; OCR errors propagate into training data","No built-in deduplication across documents — may contain near-duplicate content from web crawl","Image resolution and quality varies significantly across source PDFs; no normalization applied","English-language dominant; multilingual coverage limited to incidental non-English content in PDFs"],"requires":["HuggingFace Datasets library (>=2.14.0) for streaming/downloading","Minimum 500GB disk space for partial dataset or cloud storage credentials for remote access","Python 3.8+ with PyTorch or TensorFlow for model training integration","PDF processing libraries (PyPDF2, pdfplumber) if custom extraction needed"],"input_types":["PDF documents (from Common Crawl 2023-06)","Document metadata (URLs, crawl timestamps)"],"output_types":["Image tensors (document page images)","Text strings (OCR/extracted text)","Structured metadata (document ID, page number, bounding boxes)"],"categories":["data-processing-analysis","multimodal-training-data"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-mlfoundations--mint-1t-pdf-cc-2023-06__cap_1","uri":"capability://data.processing.analysis.streaming.dataset.access.with.lazy.loading.and.batching","name":"streaming dataset access with lazy loading and batching","description":"Implements HuggingFace Datasets streaming protocol that enables on-demand loading of document samples without downloading the full 1T token dataset upfront. The architecture uses memory-mapped file access and configurable batch sampling strategies, allowing training loops to fetch and cache only the samples needed for each epoch while maintaining deterministic shuffling across distributed workers.","intents":["Train models on the full dataset without requiring petabyte-scale local storage","Parallelize data loading across multiple GPUs/TPUs with consistent sample ordering","Prototype and iterate on model architectures without waiting for full dataset download","Integrate dataset into existing PyTorch DataLoader or TensorFlow tf.data pipelines"],"best_for":["Teams with limited local storage but access to high-bandwidth cloud infrastructure","Researchers iterating rapidly on model architectures and hyperparameters","Distributed training setups requiring deterministic data sharding across nodes"],"limitations":["Streaming introduces network latency — slower than local SSD access by 2-5x depending on connection quality","Requires stable internet connection; network interruptions may corrupt sample batches mid-epoch","Caching behavior is opaque; no explicit control over which samples remain in memory vs. re-fetched","Shuffling across epochs requires maintaining state; distributed training may have subtle synchronization issues"],"requires":["HuggingFace Datasets library with streaming support (>=2.14.0)","Minimum 10 Mbps sustained bandwidth for practical training throughput","HuggingFace account or API token for dataset access","PyTorch (>=1.9) or TensorFlow (>=2.8) for integration"],"input_types":["Dataset configuration (split, streaming mode, batch size)","Worker process IDs (for distributed training)"],"output_types":["Batched samples with image tensors and text strings","Metadata dictionaries with document IDs and page numbers"],"categories":["data-processing-analysis","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-mlfoundations--mint-1t-pdf-cc-2023-06__cap_2","uri":"capability://data.processing.analysis.document.level.metadata.and.provenance.tracking","name":"document-level metadata and provenance tracking","description":"Maintains structured metadata for each document including source URL, Common Crawl snapshot date (2023-06), document hash, page count, and extraction quality scores. This metadata is queryable and filterable within the dataset, allowing users to select subsets based on source domain, quality thresholds, or temporal characteristics without scanning the full corpus.","intents":["Filter training data by quality metrics to improve model performance on high-quality documents","Analyze dataset composition and bias — understand which domains and document types are over/under-represented","Reproduce experiments by selecting specific document subsets or quality tiers","Audit model training data for copyright or licensing concerns by tracing documents to source URLs"],"best_for":["Researchers studying dataset bias and composition effects on model performance","Teams building production document systems that need quality guarantees","Organizations with compliance requirements to audit training data provenance"],"limitations":["Metadata quality depends on Common Crawl extraction — some URLs may be invalid or documents may have moved","Quality scores are heuristic-based (e.g., OCR confidence); no ground-truth validation for all documents","No per-document licensing information — users must verify CC-BY-4.0 compliance independently for derived works","Metadata does not include document language detection; filtering by language requires text analysis"],"requires":["HuggingFace Datasets library with metadata filtering support","Understanding of Common Crawl metadata schema and quality metrics","Python for programmatic filtering and analysis"],"input_types":["Metadata query filters (URL patterns, quality thresholds, date ranges)","Document IDs or hashes"],"output_types":["Filtered dataset splits","Metadata statistics and composition reports","Provenance traces (URL → document mapping)"],"categories":["data-processing-analysis","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-mlfoundations--mint-1t-pdf-cc-2023-06__cap_3","uri":"capability://image.visual.image.text.pair.extraction.with.layout.aware.alignment","name":"image-text pair extraction with layout-aware alignment","description":"Extracts page-level images from PDF documents and aligns them with corresponding OCR/text content using spatial layout information (bounding boxes, reading order). The extraction pipeline preserves document structure (headers, footers, tables, body text) by analyzing PDF internal structure and image coordinates, creating naturally-aligned multimodal pairs suitable for vision-language model training without requiring post-hoc alignment.","intents":["Train vision-language models that understand document layout and spatial relationships between text and images","Build document understanding models that leverage both visual and textual signals","Create training data for table detection, form field extraction, and document segmentation tasks","Evaluate vision-language models on real-world document understanding benchmarks"],"best_for":["Teams building document understanding and layout analysis models","Researchers training vision-language models on structured documents","Organizations developing document digitization and archival systems"],"limitations":["Extraction quality depends on PDF structure — scanned PDFs with poor OCR produce low-quality text pairs","Image resolution varies by source PDF; no normalization to standard DPI or dimensions","Layout alignment assumes well-formed PDF structure; malformed or corrupted PDFs may produce misaligned pairs","No explicit handling of multi-column layouts or complex document structures; reading order may be incorrect","Bounding box information is approximate; pixel-level alignment not guaranteed"],"requires":["PDF processing libraries (PyPDF2, pdfplumber, or similar) for extraction","Image processing library (Pillow) for page rendering","Python 3.8+ for custom extraction scripts"],"input_types":["PDF documents with embedded text and images","PDF metadata (page dimensions, text coordinates)"],"output_types":["Page-level image tensors (RGB or grayscale)","Extracted text strings with layout information","Bounding box coordinates for text regions"],"categories":["image-visual","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-mlfoundations--mint-1t-pdf-cc-2023-06__cap_4","uri":"capability://data.processing.analysis.common.crawl.snapshot.integration.and.temporal.consistency","name":"common crawl snapshot integration and temporal consistency","description":"Dataset is derived from a single Common Crawl snapshot (2023-06), ensuring temporal consistency across all documents — all PDFs were crawled within a specific time window, avoiding temporal distribution shifts that occur when combining data from multiple crawl dates. The integration includes Common Crawl metadata (WARC records, crawl IDs) enabling users to trace documents back to original crawl artifacts for verification or re-extraction.","intents":["Train models on temporally-consistent data to avoid distribution shifts from different crawl periods","Reproduce experiments by accessing the same Common Crawl snapshot used in published research","Analyze how document quality and content evolve over time by comparing against other snapshots","Verify dataset integrity by tracing documents back to original WARC records"],"best_for":["Researchers requiring reproducible, temporally-consistent training data","Teams building models that need to avoid temporal distribution shifts","Organizations auditing dataset integrity and source authenticity"],"limitations":["Single snapshot limits temporal diversity — models may overfit to 2023-06 web content distribution","Common Crawl snapshot is static; cannot be updated with newer documents without creating new dataset version","WARC record access requires Common Crawl infrastructure knowledge; not all users can easily verify provenance","Temporal consistency means dataset does not reflect current web state — may be outdated for some applications"],"requires":["Understanding of Common Crawl architecture and WARC format","Access to Common Crawl S3 buckets (public, no authentication required)","Optional: Common Crawl Index API for document lookup"],"input_types":["Document IDs or URLs","Common Crawl snapshot identifier (CC-MAIN-2023-06)"],"output_types":["WARC record references","Crawl metadata (crawl date, HTTP status, content-type)","Links to original Common Crawl artifacts"],"categories":["data-processing-analysis","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-mlfoundations--mint-1t-pdf-cc-2023-06__cap_5","uri":"capability://data.processing.analysis.cc.by.4.0.licensed.dataset.with.commercial.use.rights","name":"cc-by-4.0 licensed dataset with commercial use rights","description":"Dataset is released under Creative Commons Attribution 4.0 (CC-BY-4.0) license, permitting commercial use, modification, and redistribution with attribution. The license is applied at the dataset level, though individual documents may have different licenses — users are responsible for verifying compliance for derived works, but the dataset itself imposes minimal legal restrictions on model training and deployment.","intents":["Train commercial models without licensing restrictions or royalty obligations","Publish research using the dataset without requiring special permissions","Create derivative datasets and redistribute them with proper attribution","Build products and services based on models trained on this data"],"best_for":["Commercial teams building products without licensing constraints","Researchers publishing open-source models and datasets","Organizations with strict IP policies requiring permissive licenses"],"limitations":["CC-BY-4.0 requires attribution in derivative works — must cite MINT-1T dataset in publications and model cards","Individual documents in dataset may have different licenses (some may be copyrighted); users must verify compliance for sensitive applications","License does not guarantee that all source content is legally available for training — some PDFs may contain copyrighted material","Commercial use is permitted but does not indemnify against copyright claims from original document authors"],"requires":["Understanding of CC-BY-4.0 license terms and attribution requirements","Legal review for commercial applications using copyrighted source material"],"input_types":["Dataset usage context (research, commercial, etc.)"],"output_types":["License compliance checklist","Attribution requirements","Legal guidance (not legal advice)"],"categories":["data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":23,"verified":false,"data_access_risk":"low","permissions":["HuggingFace Datasets library (>=2.14.0) for streaming/downloading","Minimum 500GB disk space for partial dataset or cloud storage credentials for remote access","Python 3.8+ with PyTorch or TensorFlow for model training integration","PDF processing libraries (PyPDF2, pdfplumber) if custom extraction needed","HuggingFace Datasets library with streaming support (>=2.14.0)","Minimum 10 Mbps sustained bandwidth for practical training throughput","HuggingFace account or API token for dataset access","PyTorch (>=1.9) or TensorFlow (>=2.8) for integration","HuggingFace Datasets library with metadata filtering support","Understanding of Common Crawl metadata schema and quality metrics"],"failure_modes":["1T token size requires distributed storage infrastructure — not suitable for single-machine training without streaming/sharding","PDF extraction quality varies by source document; OCR errors propagate into training data","No built-in deduplication across documents — may contain near-duplicate content from web crawl","Image resolution and quality varies significantly across source PDFs; no normalization applied","English-language dominant; multilingual coverage limited to incidental non-English content in PDFs","Streaming introduces network latency — slower than local SSD access by 2-5x depending on connection quality","Requires stable internet connection; network interruptions may corrupt sample batches mid-epoch","Caching behavior is opaque; no explicit control over which samples remain in memory vs. re-fetched","Shuffling across epochs requires maintaining state; distributed training may have subtle synchronization issues","Metadata quality depends on Common Crawl extraction — some URLs may be invalid or documents may have moved","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.05,"quality":0.22,"ecosystem":0.5000000000000001,"match_graph":0.25,"freshness":0.75,"weights":{"adoption":0.3,"quality":0.25,"ecosystem":0.1,"match_graph":0.3,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:22.764Z","last_scraped_at":"2026-04-22T08:08:14.361Z","last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":null,"model_likes":null}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=mlfoundations--mint-1t-pdf-cc-2023-06","compare_url":"https://unfragile.ai/compare?artifact=mlfoundations--mint-1t-pdf-cc-2023-06"}},"signature":"EUKqN/nR3bf+qDOeSHMaQJ5Wn6+g62f2Kgt2TNzzj5CUxtPl1CbUTb6L+oIbNYINvDXM7ggGKaWiJTq/JFw8Bg==","signedAt":"2026-06-20T15:06:51.604Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/mlfoundations--mint-1t-pdf-cc-2023-06","artifact":"https://unfragile.ai/mlfoundations--mint-1t-pdf-cc-2023-06","verify":"https://unfragile.ai/api/v1/verify?slug=mlfoundations--mint-1t-pdf-cc-2023-06","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}